Merge remote-tracking branch 'remotes/mesa/17.2' into merge

Change-Id: I8bfcd1eaefb30e59ef6543c71ef1727478bfade2
diff --git a/.travis.yml b/.travis.yml
index 0ab1869..51061d9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,6 +11,7 @@
   global:
     - XORG_RELEASES=http://xorg.freedesktop.org/releases/individual
     - XCB_RELEASES=http://xcb.freedesktop.org/dist
+    - WAYLAND_RELEASES=http://wayland.freedesktop.org/releases
     - XORGMACROS_VERSION=util-macros-1.19.0
     - GLPROTO_VERSION=glproto-1.4.17
     - DRI2PROTO_VERSION=dri2proto-2.8
@@ -23,7 +24,8 @@
     - LIBVDPAU_VERSION=libvdpau-1.1
     - LIBVA_VERSION=libva-1.6.2
     - LIBWAYLAND_VERSION=wayland-1.11.1
-    - PKG_CONFIG_PATH=$HOME/prefix/lib/pkgconfig
+    - WAYLAND_PROTOCOLS_VERSION=wayland-protocols-1.8
+    - PKG_CONFIG_PATH=$HOME/prefix/lib/pkgconfig:$HOME/prefix/share/pkgconfig
     - LD_LIBRARY_PATH="$HOME/prefix/lib:$LD_LIBRARY_PATH"
 
 matrix:
@@ -38,6 +40,7 @@
         - GALLIUM_ST="--enable-dri --disable-opencl --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx --disable-gallium-osmesa"
         - GALLIUM_DRIVERS=""
         - VULKAN_DRIVERS=""
+        - LIBUNWIND_FLAGS="--disable-libunwind"
       addons:
         apt:
           packages:
@@ -45,6 +48,8 @@
             - x11proto-xf86vidmode-dev
             - libexpat1-dev
             - libx11-xcb-dev
+            - libxdamage-dev
+            - libxfixes-dev
     - env:
         # NOTE: Building SWR is 2x (yes two) times slower than all the other
         # gallium drivers combined.
@@ -55,23 +60,22 @@
         - MAKE_CHECK_COMMAND="true"
         - LLVM_VERSION=3.9
         - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
-        - OVERRIDE_CC="gcc-5"
-        - OVERRIDE_CXX="g++-5"
+        - OVERRIDE_CC="gcc-4.8"
+        - OVERRIDE_CXX="g++-4.8"
         - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
         - DRI_DRIVERS=""
         - GALLIUM_ST="--enable-dri --disable-opencl --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx --disable-gallium-osmesa"
         - GALLIUM_DRIVERS="swr"
         - VULKAN_DRIVERS=""
+        - LIBUNWIND_FLAGS="--enable-libunwind"
       addons:
         apt:
           sources:
-            - ubuntu-toolchain-r-test
             - llvm-toolchain-trusty-3.9
           packages:
             # LLVM packaging is broken and misses these dependencies
             - libedit-dev
             # From sources above
-            - g++-5
             - llvm-3.9-dev
             # Common
             - xz-utils
@@ -79,6 +83,7 @@
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
+            - libunwind8-dev
     - env:
         - LABEL="make Gallium Drivers Other"
         - BUILD=make
@@ -89,8 +94,9 @@
         - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
         - DRI_DRIVERS=""
         - GALLIUM_ST="--enable-dri --disable-opencl --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx --disable-gallium-osmesa"
-        - GALLIUM_DRIVERS="i915,nouveau,r300,r600,radeonsi,freedreno,svga,swrast,vc4,virgl,etnaviv,imx"
+        - GALLIUM_DRIVERS="i915,nouveau,pl111,r300,r600,radeonsi,freedreno,svga,swrast,vc4,virgl,etnaviv,imx"
         - VULKAN_DRIVERS=""
+        - LIBUNWIND_FLAGS="--enable-libunwind"
       addons:
         apt:
           sources:
@@ -106,6 +112,7 @@
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
+            - libunwind8-dev
     - env:
         # NOTE: Analogous to SWR above, building Clover is quite slow.
         - LABEL="make Gallium ST Clover"
@@ -123,6 +130,7 @@
         # Regardless - we're doing a quick build test here.
         - GALLIUM_DRIVERS="i915"
         - VULKAN_DRIVERS=""
+        - LIBUNWIND_FLAGS="--enable-libunwind"
       addons:
         apt:
           sources:
@@ -142,11 +150,14 @@
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
+            - libunwind8-dev
     - env:
         - LABEL="make Gallium ST Other"
         - BUILD=make
         - MAKEFLAGS="-j4"
         - MAKE_CHECK_COMMAND="true"
+        - LLVM_VERSION=3.3
+        - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
         - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
         - DRI_DRIVERS=""
         - GALLIUM_ST="--enable-dri --disable-opencl --enable-xa --enable-nine --enable-xvmc --enable-vdpau --enable-va --enable-omx --enable-gallium-osmesa"
@@ -155,9 +166,12 @@
         # Regardless - we're doing a quick build test here.
         - GALLIUM_DRIVERS="i915,swrast"
         - VULKAN_DRIVERS=""
+        - LIBUNWIND_FLAGS="--enable-libunwind"
       addons:
         apt:
           packages:
+            # We actually want to test against llvm-3.3
+            - llvm-3.3-dev
             # Nine requires gcc 4.6... which is the one we have right ?
             - libxvmc-dev
             # Build locally, for now.
@@ -172,6 +186,7 @@
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
+            - libunwind8-dev
     - env:
         - LABEL="make Vulkan"
         - BUILD=make
@@ -184,6 +199,7 @@
         - GALLIUM_ST="--enable-dri --enable-dri3 --disable-opencl --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx --disable-gallium-osmesa"
         - GALLIUM_DRIVERS=""
         - VULKAN_DRIVERS="intel,radeon"
+        - LIBUNWIND_FLAGS="--disable-libunwind"
       addons:
         apt:
           sources:
@@ -248,19 +264,17 @@
         - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
         # Keep it symmetrical to the make build. There's no actual SWR, yet.
         - SCONS_CHECK_COMMAND="true"
-        - OVERRIDE_CC="gcc-5"
-        - OVERRIDE_CXX="g++-5"
+        - OVERRIDE_CC="gcc-4.8"
+        - OVERRIDE_CXX="g++-4.8"
       addons:
         apt:
           sources:
-            - ubuntu-toolchain-r-test
             - llvm-toolchain-trusty-3.9
           packages:
             - scons
             # LLVM packaging is broken and misses these dependencies
             - libedit-dev
             # From sources above
-            - g++-5
             - llvm-3.9-dev
             # Common
             - xz-utils
@@ -338,10 +352,14 @@
   - tar -jxvf $LIBVA_VERSION.tar.bz2
   - (cd $LIBVA_VERSION && ./configure --prefix=$HOME/prefix --disable-wayland --disable-dummy-driver && make install)
 
-  - wget http://wayland.freedesktop.org/releases/$LIBWAYLAND_VERSION.tar.xz
+  - wget $WAYLAND_RELEASES/$LIBWAYLAND_VERSION.tar.xz
   - tar -axvf $LIBWAYLAND_VERSION.tar.xz
   - (cd $LIBWAYLAND_VERSION && ./configure --prefix=$HOME/prefix --enable-libraries --without-host-scanner --disable-documentation --disable-dtd-validation && make install)
 
+  - wget $WAYLAND_RELEASES/$WAYLAND_PROTOCOLS_VERSION.tar.xz
+  - tar -axvf $WAYLAND_PROTOCOLS_VERSION.tar.xz
+  - (cd $WAYLAND_PROTOCOLS_VERSION && ./configure --prefix=$HOME/prefix && make install)
+
   # Generate the header since one is missing on the Travis instance
   - mkdir -p linux
   - printf "%s\n" \
@@ -363,6 +381,7 @@
       export CC="$CC -isystem`pwd`";
 
       ./autogen.sh --enable-debug
+        $LIBUNWIND_FLAGS
         $DRI_LOADERS
         --with-dri-drivers=$DRI_DRIVERS
         $GALLIUM_ST
diff --git a/Android.common.mk b/Android.common.mk
index e899a31..e5416c4 100644
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -37,11 +37,18 @@
 	-Wno-missing-field-initializers \
 	-Wno-initializer-overrides \
 	-Wno-mismatched-tags \
+	-DVERSION=\"$(MESA_VERSION)\" \
 	-DPACKAGE_VERSION=\"$(MESA_VERSION)\" \
 	-DPACKAGE_BUGREPORT=\"https://bugs.freedesktop.org/enter_bug.cgi?product=Mesa\"
 
+# XXX: The following __STDC_*_MACROS defines should not be needed.
+# It's likely due to a bug elsewhere, but let's temporarily add them
+# here to fix the radeonsi build.
 LOCAL_CFLAGS += \
+	-DANDROID_API_LEVEL=$(PLATFORM_SDK_VERSION) \
 	-DENABLE_SHADER_CACHE \
+	-D__STDC_CONSTANT_MACROS \
+	-D__STDC_LIMIT_MACROS \
 	-DHAVE___BUILTIN_EXPECT \
 	-DHAVE___BUILTIN_FFS \
 	-DHAVE___BUILTIN_FFSLL \
@@ -81,29 +88,15 @@
 
 endif
 endif
-
-ifeq ($(MESA_ENABLE_LLVM),true)
-  ifeq ($(MESA_ANDROID_MAJOR_VERSION),5)
-    LOCAL_CFLAGS += -DHAVE_LLVM=0x0305 -DMESA_LLVM_VERSION_PATCH=2
-    ELF_INCLUDES := external/elfutils/0.153/libelf
-  endif
-  ifeq ($(MESA_ANDROID_MAJOR_VERSION),6)
-    LOCAL_CFLAGS += -DHAVE_LLVM=0x0307 -DMESA_LLVM_VERSION_PATCH=0
-    ELF_INCLUDES := external/elfutils/src/libelf
-  endif
-  ifeq ($(MESA_ANDROID_MAJOR_VERSION),7)
-    LOCAL_CFLAGS += -DHAVE_LLVM=0x0308 -DMESA_LLVM_VERSION_PATCH=0
-    ELF_INCLUDES := external/elfutils/libelf
-  endif
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_CFLAGS_arm += -DUSE_ARM_ASM
 endif
+LOCAL_CFLAGS_arm64 += -DUSE_AARCH64_ASM
 
 ifneq ($(LOCAL_IS_HOST_MODULE),true)
-# add libdrm if there are hardware drivers
-ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
 LOCAL_CFLAGS += -DHAVE_LIBDRM
 LOCAL_SHARED_LIBRARIES += libdrm
 endif
-endif
 
 LOCAL_CFLAGS_32 += -DDEFAULT_DRIVER_DIR=\"/system/lib/$(MESA_DRI_MODULE_REL_PATH)\"
 LOCAL_CFLAGS_64 += -DDEFAULT_DRIVER_DIR=\"/system/lib64/$(MESA_DRI_MODULE_REL_PATH)\"
diff --git a/Android.mk b/Android.mk
index fdbf22f..7087a44 100644
--- a/Android.mk
+++ b/Android.mk
@@ -24,7 +24,7 @@
 # BOARD_GPU_DRIVERS should be defined.  The valid values are
 #
 #   classic drivers: i915 i965
-#   gallium drivers: swrast freedreno i915g nouveau r300g r600g radeonsi vc4 virgl vmwgfx
+#   gallium drivers: swrast freedreno i915g nouveau pl111 r300g r600g radeonsi vc4 virgl vmwgfx etnaviv imx
 #
 # The main target is libGLES_mesa.  For each classic driver enabled, a DRI
 # module will also be built.  DRI modules will be loaded by libGLES_mesa.
@@ -32,6 +32,9 @@
 MESA_TOP := $(call my-dir)
 
 MESA_ANDROID_MAJOR_VERSION := $(word 1, $(subst ., , $(PLATFORM_VERSION)))
+ifneq ($(filter 2 4, $(MESA_ANDROID_MAJOR_VERSION)),)
+$(error "Android 4.4 and earlier not supported")
+endif
 
 MESA_DRI_MODULE_REL_PATH := dri
 MESA_DRI_MODULE_PATH := $(TARGET_OUT_SHARED_LIBRARIES)/$(MESA_DRI_MODULE_REL_PATH)
@@ -40,19 +43,39 @@
 MESA_COMMON_MK := $(MESA_TOP)/Android.common.mk
 MESA_PYTHON2 := python
 
-classic_drivers := i915 i965
-gallium_drivers := swrast freedreno i915g nouveau r300g r600g radeonsi vmwgfx vc4 virgl
+# Lists to convert driver names to boolean variables
+# in form of <driver name>.<boolean make variable>
+classic_drivers := i915.HAVE_I915_DRI i965.HAVE_I965_DRI
+gallium_drivers := \
+	swrast.HAVE_GALLIUM_SOFTPIPE \
+	freedreno.HAVE_GALLIUM_FREEDRENO \
+	i915g.HAVE_GALLIUM_I915 \
+	nouveau.HAVE_GALLIUM_NOUVEAU \
+	pl111.HAVE_GALLIUM_PL111 \
+	r300g.HAVE_GALLIUM_R300 \
+	r600g.HAVE_GALLIUM_R600 \
+	radeonsi.HAVE_GALLIUM_RADEONSI \
+	vmwgfx.HAVE_GALLIUM_VMWGFX \
+	vc4.HAVE_GALLIUM_VC4 \
+	virgl.HAVE_GALLIUM_VIRGL \
+	etnaviv.HAVE_GALLIUM_ETNAVIV \
+	imx.HAVE_GALLIUM_IMX
 
-MESA_GPU_DRIVERS := $(strip $(BOARD_GPU_DRIVERS))
-
-# warn about invalid drivers
-invalid_drivers := $(filter-out \
-	$(classic_drivers) $(gallium_drivers), $(MESA_GPU_DRIVERS))
-ifneq ($(invalid_drivers),)
-$(warning invalid GPU drivers: $(invalid_drivers))
-# tidy up
-MESA_GPU_DRIVERS := $(filter-out $(invalid_drivers), $(MESA_GPU_DRIVERS))
+ifeq ($(BOARD_GPU_DRIVERS),all)
+MESA_BUILD_CLASSIC := $(filter HAVE_%, $(subst ., , $(classic_drivers)))
+MESA_BUILD_GALLIUM := $(filter HAVE_%, $(subst ., , $(gallium_drivers)))
+else
+# Warn about any name that is not exactly a <driver name> from the lists above
+$(foreach d, $(BOARD_GPU_DRIVERS), \
+	$(if $(filter $(d).%,$(classic_drivers) $(gallium_drivers)), \
+		, \
+		$(warning invalid GPU driver: $(d)) \
+	) \
+)
+MESA_BUILD_CLASSIC := $(strip $(foreach d, $(BOARD_GPU_DRIVERS), $(patsubst $(d).%,%, $(filter $(d).%, $(classic_drivers)))))
+MESA_BUILD_GALLIUM := $(strip $(foreach d, $(BOARD_GPU_DRIVERS), $(patsubst $(d).%,%, $(filter $(d).%, $(gallium_drivers)))))
 endif
+$(foreach d, $(MESA_BUILD_CLASSIC) $(MESA_BUILD_GALLIUM), $(eval $(d) := true))
 
 # host and target must be the same arch to generate matypes.h
 ifeq ($(TARGET_ARCH),$(HOST_ARCH))
@@ -61,23 +84,23 @@
 MESA_ENABLE_ASM := false
 endif
 
-ifneq ($(filter $(classic_drivers), $(MESA_GPU_DRIVERS)),)
-MESA_BUILD_CLASSIC := true
-else
-MESA_BUILD_CLASSIC := false
+ifneq ($(filter true, $(HAVE_GALLIUM_RADEONSI)),)
+MESA_ENABLE_LLVM := true
 endif
 
-ifneq ($(filter $(gallium_drivers), $(MESA_GPU_DRIVERS)),)
-MESA_BUILD_GALLIUM := true
-else
-MESA_BUILD_GALLIUM := false
-endif
-
-MESA_ENABLE_LLVM := $(if $(filter radeonsi,$(MESA_GPU_DRIVERS)),true,false)
+define mesa-build-with-llvm
+  $(if $(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5), \
+    $(warning Unsupported LLVM version in Android $(MESA_ANDROID_MAJOR_VERSION)),) \
+  $(if $(filter 6,$(MESA_ANDROID_MAJOR_VERSION)), \
+    $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0307 -DMESA_LLVM_VERSION_PATCH=0)) \
+  $(if $(filter 7,$(MESA_ANDROID_MAJOR_VERSION)), \
+    $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0308 -DMESA_LLVM_VERSION_PATCH=0)) \
+  $(if $(filter O,$(MESA_ANDROID_MAJOR_VERSION)), \
+    $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0309 -DMESA_LLVM_VERSION_PATCH=0)) \
+  $(eval LOCAL_SHARED_LIBRARIES += libLLVM)
+endef
 
 # add subdirectories
-ifneq ($(strip $(MESA_GPU_DRIVERS)),)
-
 SUBDIRS := \
 	src/gbm \
 	src/loader \
@@ -87,16 +110,11 @@
 	src/util \
 	src/egl \
 	src/amd \
+	src/broadcom \
 	src/intel \
 	src/mesa/drivers/dri \
 	src/vulkan
 
 INC_DIRS := $(call all-named-subdir-makefiles,$(SUBDIRS))
-
-ifeq ($(strip $(MESA_BUILD_GALLIUM)),true)
 INC_DIRS += $(call all-named-subdir-makefiles,src/gallium)
-endif
-
 include $(INC_DIRS)
-
-endif
diff --git a/Makefile.am b/Makefile.am
index 787174d..ec395b2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -41,9 +41,10 @@
 	--enable-xa \
 	--enable-xvmc \
 	--enable-llvm-shared-libs \
+	--enable-libunwind \
 	--with-platforms=x11,wayland,drm,surfaceless \
 	--with-dri-drivers=i915,i965,nouveau,radeon,r200,swrast \
-	--with-gallium-drivers=i915,nouveau,r300,r600,radeonsi,freedreno,svga,swrast,vc4,virgl,swr,etnaviv,imx \
+	--with-gallium-drivers=i915,nouveau,r300,pl111,r600,radeonsi,freedreno,svga,swrast,vc4,virgl,swr,etnaviv,imx \
 	--with-vulkan-drivers=intel,radeon
 
 ACLOCAL_AMFLAGS = -I m4
@@ -53,6 +54,7 @@
 	common.py \
 	docs \
 	doxygen \
+	bin/git_sha1_gen.sh \
 	scons \
 	SConstruct
 
@@ -61,6 +63,11 @@
 	include/c99_compat.h \
 	include/c99_math.h \
 	include/c11 \
+	include/drm-uapi/drm.h \
+	include/drm-uapi/drm_fourcc.h \
+	include/drm-uapi/drm_mode.h \
+	include/drm-uapi/i915_drm.h \
+	include/drm-uapi/vc4_drm.h \
 	include/D3D9 \
 	include/GL/wglext.h \
 	include/HaikuGL \
diff --git a/SConstruct b/SConstruct
index 696718c..0215aa8 100644
--- a/SConstruct
+++ b/SConstruct
@@ -50,10 +50,10 @@
     pass
 else:
     targets = targets.split(',')
-    print 'scons: warning: targets option is deprecated; pass the targets on their own such as'
-    print
-    print '  scons %s' % ' '.join(targets)
-    print
+    print('scons: warning: targets option is deprecated; pass the targets on their own such as')
+    print('')  # blank line; a bare print() would emit "()" under Python 2
+    print('  scons %s' % ' '.join(targets))
+    print('')
     COMMAND_LINE_TARGETS.append(targets)
 
 
diff --git a/VERSION b/VERSION
index 6729eea..61d7548 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-17.1.5
+17.2.5
diff --git a/bin/.cherry-ignore b/bin/.cherry-ignore
index 8aea291..7735431 100644
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -1,17 +1,86 @@
-# stable: This commit depends on 9fd9a7d0ba3 and 678d568c7b2, neither
-#         of which is in branch.
-b84b631c6381d9b36bca5d0e7cc67dd23af188c1 radeonsi: load patch_id for TES-as-ES when exporting for PS
-# fixes:  This commit addressed an earlier commit 126d5ad which did not
+# fixes:  The commits are too invasive for stable. Instead the offending patches
+#         causing regressions have been reverted.
+365d34540f331df57780dddf8da87235be0a6bcb mesa: correctly calculate the storage offset for i915
+de0e62e1065e2d9172acf3ab7c70bba0160125c8 st/mesa: correctly calculate the storage offset
+
+# stable: Add loader::getCapability patches. It's rather invasive infra
+#         not suitable as a bugfix.
+1bf703e4ea5c4f742bc7ba55d01e5afc3f4e11f9 dri_interface,egl,gallium: only expose RGBA visuals on Android
+be5773fa8dfe9255d9abaf5c7d5bbbd2d922da08 Android: fix compile error for DRI2 loader getCapability
+31a6750988d7dd431f72ff1ff11bfca83bde5d8c st/dri: NULL check before deref DRI loader .getCapability
+
+# stable: The commit addresses code that did not land in the stable branch
+31bb8517a194af733deefe2d821537d994d39365 radv/gfx9: fix tile swizzle handling for gfx9
+
+# stable: Commit is not applicable when 4fab67a4415 is missing.
+d496780fb2c7f2cf0e32b6a79dc528e5156dfcb3 intel/eu/validate: Look up types on demand in execution_type()
+
+# fixes: Depend on preceding commit which adds new public GBM API
+3a5e3aa5a53cff55a5e31766d713a41ffa5a93d7 egl/drm: Fix misused x and y offsets in swrast_put_image2()
+fe2a6281b3b299998fe7399e7dbcc2077d773824 egl/drm: Fix misused x and y offsets in swrast_get_image()
+
+# fixes: This commit addressed an earlier commit c7e9ebb3ab8 which did not
+#        land in branch
+45c5c444518b7e83d9accd9f44702fa49282a3b8 radeonsi/gfx9: proper workaround for LS/HS VGPR initialization bug
+
+# fixes: This commit addressed earlier commits 61ad2f13 and 6dcc54b4 which did
+#        not land in branch
+979978ee06867a531b8d56cee252f5c83920a339 radv: Check for GFX9 for 1D arrays in image_size intrinsic.
+
+# fixes: This commit addressed earlier commits dcf46e99 and 60878dd0 which did
+#        not land in branch
+8e9e339c530c7b82b5a29d4b3183e8f5a01eae28 radv: copy the number of viewports/scissors at pipeline bind time
+
+# stable: The commit regresses a few dEQP tests. Namely:
+#         dEQP-VK.api.copy_and_blit.core.buffer_to_buffer.partial
+#         dEQP-VK.api.copy_and_blit.dedicated_allocation.buffer_to_buffer.partial
+14555d0b7a51bd3701764fd213c2459410143431 anv: Remove unreachable cases from isl_format_for_size()
+
+# stable: The commit addresses earlier commit a62a9793357 which is not applicable
+#         for the stable branch
+6c7720ed78db754d52f204cbb74897aa9e65ea7e anv/wsi: Allocate enough memory for the entire image
+
+# stable: Commits are too invasive for 17.2.
+98fdff7247b6877d028d33284f9cc63189ee204e configure.ac: factor out detection for old and buggy llvm
+13a53c4f5cdd664fd155c9e78fb46a4387af006c configure.ac: rework llvm libs handling for 3.9+
+a7ecf7b86f4eae59f3ceac2125e5d1725c403c07 Travis: add binutils 2.26 for a few more LLVM 3.9 builds
+36d6d1e931936a80da327889862ba02942ac427b configure.ac: add llvm_add_optional_component helper
+df3a43018020c16c1dfa88a76c9a84c9fb85be38 configure.ac: add missing LLVM components for OpenCL
+
+# stable: Commit is too big for stable at this point.
+4d24a7cb97641cacecd371d1968f6964785822e4 glsl: fix derived cs variables
+
+# stable: 17.3 nomination only.
+fee9d05e2136b2b7c5a1ad2be7180b99f733f539 radv: Update code pointer correctly if a variant is already created
+
+# stable: 17.3 nomination only.
+d8cefaa197f02944812ef535b1b303dd5bf26848 radv: use device name in cache creation like radeonsi.
+
+# fixes:  This commit addressed earlier commit 35ac13ed3 which did not
 #         land in branch.
-9da104593386f6e8ddec8f0d9d288aceb8908fe1 radv: fix regression in descriptor set freeing.
-# stable: This commit addressed an earlier commit 944455217b which did
+11d688d9f0d2ee4d0178d1807c0075e5e8364b1d mesa/bufferobj: don't double negate the range
+
+# extra:  Commit is not applicable when ade416d0236 is missing.
+07bfdb478bf844a0ac9cf3679f51f83c4abea5a1 broadcom/vc5: Propagate vc4 aliasing fix to vc5.
+
+# stable: This commit addressed earlier commit 8d90e28839 which did
 #         not land in branch.
-b28938ffce0580e89e6012826900da2b6013b0df st/glsl_to_tgsi: use correct writemask when converting generic intrinsics
-# stable: This commit depends on 330d0607e and 61d8f3387d, neither of
-#         which is in branch.
-c12f8305a8ae4fd5d78a9ab8bbda790a711d5bed nv50,nvc0: remove IDX from bufctx immediately, to avoid conflicts with clear
-# fixes:  Genuine false positive.
-5d87667fed1bd5ab850abdfb3a10db8c8c21c330 bin/get-fixes-pick-list.sh: better identify multiple "fixes:" tags" has more than one Fixes tag
-# extra: References 6a7c5257cac but because later f8d69beed49
-#        introduced a regression and the latter didn't land
-c35fd58688fd8c0c421c35b28419d20befdcb8b9 i965: Fix anisotropic filtering for mag filter
+446c5726ecb968d06a6607e0df42be1cb74948c4 i965: fix blorp stage_prog_data->param leak
+
+# stable: This commit addressed earlier commit 78ade659569 which did
+#         not land in branch.
+8fbd82f464f26a56167f7962174b2b69756a105a etnaviv: don't do resolve-in-place without valid TS
+
+# stable: This commit addressed earlier commit 8d90e28839 which did
+#         not land in branch.
+7b4387519c382cffef9c62bbbbefcfe71cfde905 intel/fs: Alloc pull constants off mem_ctx
+
+# stable: 17.3 nomination only.
+3f8e3c2bd8f54ae6817f7496be47f4e1a8860d9c radeonsi: add a workaround for weird s_buffer_load_dword behavior on SI
+
+# stable: 17.3 nomination only.
+7dae419aa7c34af820c08896acef3b65d855188e Android: move drivers' symlinks to /vendor (v2)
+
+# fixes:  This commit has more than one Fixes tag but the commit it
+#         addresses didn't land in branch.
+e17e8934f9e4b008bdfb4f9abd8ed4faa604c7d9 automake: include git_sha1.h.in in release tarball
diff --git a/bin/.editorconfig b/bin/.editorconfig
index b9a948f..ef92655 100644
--- a/bin/.editorconfig
+++ b/bin/.editorconfig
@@ -1,3 +1,2 @@
 [*.sh]
-indent_style = space
-indent_size = 2
+indent_style = tab
diff --git a/bin/get-fixes-pick-list.sh b/bin/get-fixes-pick-list.sh
index 81b78ad..fb7ef22 100755
--- a/bin/get-fixes-pick-list.sh
+++ b/bin/get-fixes-pick-list.sh
@@ -36,14 +36,17 @@
 		continue
 	fi
 
+	# Place every "fixes:" tag on its own line and join with the next word
+	# on its line or a later one.
+	fixes=`git show -s $sha | tr -d "\n" | sed -e 's/fixes:[[:space:]]*/\nfixes:/Ig' | grep "fixes:" | sed -e 's/\(fixes:[a-zA-Z0-9]*\).*$/\1/'`
+
 	# For each one try to extract the tag
-	fixes_count=`git show $sha | grep -i "fixes:" | wc -l`
+	fixes_count=`echo "$fixes" | wc -l`
 	warn=`(test $fixes_count -gt 1 && echo $fixes_count) || echo 0`
 	while [ $fixes_count -gt 0 ] ; do
-		fixes=`git show $sha | grep -i "fixes:" | tail -n $fixes_count`
+		# Treat only the current line
+		id=`echo "$fixes" | tail -n $fixes_count | head -n 1 | cut -d : -f 2`
 		fixes_count=$(($fixes_count-1))
-		# The following sed/cut combination is borrowed from GregKH
-		id=`echo ${fixes} | sed -e 's/^[ \t]*//' | cut -f 2 -d ':' | sed -e 's/^[ \t]*//' | cut -f 1 -d ' '`
 
 		# Bail out if we cannot find suitable id.
 		# Any specific validation the $id is valid and not some junk, is
diff --git a/bin/git_sha1_gen.sh b/bin/git_sha1_gen.sh
new file mode 100755
index 0000000..898e590
--- /dev/null
+++ b/bin/git_sha1_gen.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Print a MESA_GIT_SHA1 define for this checkout, or nothing if it is unknown.
+# Run git from the sources directory; give up quietly if we cannot enter it.
+cd "$(dirname "$0")" || exit 0
+
+# don't print anything if git fails
+if ! git_sha1=$(git --git-dir=../.git rev-parse --short=10 HEAD 2>/dev/null)
+then
+  exit 0
+fi
+
+printf '#define MESA_GIT_SHA1 "git-%s"\n' "$git_sha1"
diff --git a/bin/perf-annotate-jit.py b/bin/perf-annotate-jit.py
index 7464340..4f05585 100755
--- a/bin/perf-annotate-jit.py
+++ b/bin/perf-annotate-jit.py
@@ -133,7 +133,7 @@
 
     def __init__(self, infile, symbol):
         LineParser.__init__(self, infile)
-	self.symbol = symbol
+        self.symbol = symbol
 
     def readline(self):
         # Override LineParser.readline to ignore comment lines
@@ -155,7 +155,7 @@
         addresses.sort()
         total_samples = 0
 
-	sys.stdout.write('%s:\n' % self.symbol)
+        sys.stdout.write('%s:\n' % self.symbol)
         for address, instr in asm:
             try:
                 sample = samples.pop(address)
diff --git a/configure.ac b/configure.ac
index f4cc998..49dd002 100644
--- a/configure.ac
+++ b/configure.ac
@@ -74,13 +74,12 @@
 # in the first entry.
 LIBDRM_REQUIRED=2.4.75
 LIBDRM_RADEON_REQUIRED=2.4.71
-LIBDRM_AMDGPU_REQUIRED=2.4.79
+LIBDRM_AMDGPU_REQUIRED=2.4.82
 LIBDRM_INTEL_REQUIRED=2.4.75
 LIBDRM_NVVIEUX_REQUIRED=2.4.66
 LIBDRM_NOUVEAU_REQUIRED=2.4.66
 LIBDRM_FREEDRENO_REQUIRED=2.4.74
-LIBDRM_VC4_REQUIRED=2.4.69
-LIBDRM_ETNAVIV_REQUIRED=2.4.80
+LIBDRM_ETNAVIV_REQUIRED=2.4.82
 
 dnl Versions for external dependencies
 DRI2PROTO_REQUIRED=2.8
@@ -89,6 +88,7 @@
 LIBVA_REQUIRED=0.38.0
 VDPAU_REQUIRED=1.1
 WAYLAND_REQUIRED=1.11
+WAYLAND_PROTOCOLS_REQUIRED=1.8
 XCB_REQUIRED=1.9.3
 XCBDRI2_REQUIRED=1.8
 XCBGLX_REQUIRED=1.8.1
@@ -102,8 +102,8 @@
 dnl LLVM versions
 LLVM_REQUIRED_GALLIUM=3.3.0
 LLVM_REQUIRED_OPENCL=3.6.0
-LLVM_REQUIRED_R600=3.8.0
-LLVM_REQUIRED_RADEONSI=3.8.0
+LLVM_REQUIRED_R600=3.9.0
+LLVM_REQUIRED_RADEONSI=3.9.0
 LLVM_REQUIRED_RADV=3.9.0
 LLVM_REQUIRED_SWR=3.9.0
 
@@ -287,9 +287,9 @@
     CFLAGS="$CFLAGS -Wall"
 
     if test "x$USE_GNU99" = xyes; then
-	CFLAGS="$CFLAGS -std=gnu99"
+        CFLAGS="$CFLAGS -std=gnu99"
     else
-	CFLAGS="$CFLAGS -std=c99"
+        CFLAGS="$CFLAGS -std=c99"
     fi
 
     # Enable -Werror=implicit-function-declaration and
@@ -301,9 +301,9 @@
     CFLAGS="$CFLAGS -Werror=implicit-function-declaration"
     CFLAGS="$CFLAGS -Werror=missing-prototypes"
     AC_LINK_IFELSE([AC_LANG_PROGRAM()],
-		   AC_MSG_RESULT([yes]),
-		   [CFLAGS="$save_CFLAGS -Wmissing-prototypes";
-		    AC_MSG_RESULT([no])])
+                   AC_MSG_RESULT([yes]),
+                   [CFLAGS="$save_CFLAGS -Wmissing-prototypes";
+                    AC_MSG_RESULT([no])])
 
     # Enable -fvisibility=hidden if using a gcc that supports it
     save_CFLAGS="$CFLAGS"
@@ -311,7 +311,7 @@
     VISIBILITY_CFLAGS="-fvisibility=hidden"
     CFLAGS="$CFLAGS $VISIBILITY_CFLAGS"
     AC_LINK_IFELSE([AC_LANG_PROGRAM()], AC_MSG_RESULT([yes]),
-		   [VISIBILITY_CFLAGS=""; AC_MSG_RESULT([no])])
+                   [VISIBILITY_CFLAGS=""; AC_MSG_RESULT([no])])
 
     # Restore CFLAGS; VISIBILITY_CFLAGS are added to it where needed.
     CFLAGS=$save_CFLAGS
@@ -333,10 +333,10 @@
     AC_MSG_CHECKING([whether $CC supports -Werror=vla])
     CFLAGS="$CFLAGS -Werror=vla"
     AC_LINK_IFELSE([AC_LANG_PROGRAM()],
-		   [MSVC2013_COMPAT_CFLAGS="$MSVC2013_COMPAT_CFLAGS -Werror=vla";
-		    MSVC2013_COMPAT_CXXFLAGS="$MSVC2013_COMPAT_CXXFLAGS -Werror=vla";
-		    AC_MSG_RESULT([yes])],
-		    AC_MSG_RESULT([no]))
+                   [MSVC2013_COMPAT_CFLAGS="$MSVC2013_COMPAT_CFLAGS -Werror=vla";
+                    MSVC2013_COMPAT_CXXFLAGS="$MSVC2013_COMPAT_CXXFLAGS -Werror=vla";
+                    AC_MSG_RESULT([yes])],
+                    AC_MSG_RESULT([no]))
     CFLAGS="$save_CFLAGS"
 fi
 if test "x$GXX" = xyes; then
@@ -349,7 +349,7 @@
     CXXFLAGS="$CXXFLAGS $VISIBILITY_CXXFLAGS"
     AC_LANG_PUSH([C++])
     AC_LINK_IFELSE([AC_LANG_PROGRAM()], AC_MSG_RESULT([yes]),
-		   [VISIBILITY_CXXFLAGS="" ; AC_MSG_RESULT([no])])
+                   [VISIBILITY_CXXFLAGS="" ; AC_MSG_RESULT([no])])
     AC_LANG_POP([C++])
 
     # Restore CXXFLAGS; VISIBILITY_CXXFLAGS are added to it where needed.
@@ -410,8 +410,21 @@
 }]])], GCC_ATOMIC_BUILTINS_SUPPORTED=1)
 if test "x$GCC_ATOMIC_BUILTINS_SUPPORTED" = x1; then
     DEFINES="$DEFINES -DUSE_GCC_ATOMIC_BUILTINS"
+    dnl On some platforms, new-style atomics need a helper library
+    AC_MSG_CHECKING(whether -latomic is needed)
+    AC_LINK_IFELSE([AC_LANG_SOURCE([[
+    #include <stdint.h>
+    uint64_t v;
+    int main() {
+        return (int)__atomic_load_n(&v, __ATOMIC_ACQUIRE);
+    }]])], GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC=no, GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC=yes)
+    AC_MSG_RESULT($GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC)
+    if test "x$GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC" = xyes; then
+        LIBATOMIC_LIBS="-latomic"
+    fi
 fi
 AM_CONDITIONAL([GCC_ATOMIC_BUILTINS_SUPPORTED], [test x$GCC_ATOMIC_BUILTINS_SUPPORTED = x1])
+AC_SUBST([LIBATOMIC_LIBS])
 
 dnl Check if host supports 64-bit atomics
 dnl note that lack of support usually results in link (not compile) error
@@ -455,7 +468,7 @@
 CFLAGS=$save_CFLAGS
 
 AC_ARG_ENABLE(pwr8,
-   [AS_HELP_STRING([--disable-pwr8-inst],
+   [AS_HELP_STRING([--disable-pwr8],
                    [disable POWER8-specific instructions])],
    [enable_pwr8=$enableval], [enable_pwr8=auto])
 
@@ -766,6 +779,27 @@
             ;;
         esac
         ;;
+    powerpc64le)
+        case "$host_os" in
+        linux*)
+            asm_arch=ppc64le
+            ;;
+        esac
+        ;;
+    aarch64)
+        case "$host_os" in
+        linux*)
+            asm_arch=aarch64
+            ;;
+        esac
+        ;;
+    arm)
+        case "$host_os" in
+        linux*)
+            asm_arch=arm
+            ;;
+        esac
+        ;;
     esac
 
     case "$asm_arch" in
@@ -781,6 +815,18 @@
         DEFINES="$DEFINES -DUSE_SPARC_ASM"
         AC_MSG_RESULT([yes, sparc])
         ;;
+    ppc64le)
+        DEFINES="$DEFINES -DUSE_PPC64LE_ASM"
+        AC_MSG_RESULT([yes, ppc64le])
+        ;;
+    aarch64)
+        DEFINES="$DEFINES -DUSE_AARCH64_ASM"
+        AC_MSG_RESULT([yes, aarch64])
+        ;;
+    arm)
+        DEFINES="$DEFINES -DUSE_ARM_ASM"
+        AC_MSG_RESULT([yes, arm])
+        ;;
     *)
         AC_MSG_RESULT([no, platform not supported])
         ;;
@@ -793,6 +839,27 @@
 AC_CHECK_FUNC([strtof], [DEFINES="$DEFINES -DHAVE_STRTOF"])
 AC_CHECK_FUNC([mkostemp], [DEFINES="$DEFINES -DHAVE_MKOSTEMP"])
 
+AC_MSG_CHECKING([whether strtod has locale support])
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+    #define _GNU_SOURCE
+    #include <stdlib.h>
+    #include <locale.h>
+    #ifdef HAVE_XLOCALE_H
+    #include <xlocale.h>
+    #endif
+    int main() {
+       locale_t loc = newlocale(LC_CTYPE_MASK, "C", NULL);
+       const char *s = "1.0";
+       char *end; /* strtod_l and strtof_l take char **endptr: pass &end */
+       double d = strtod_l(s, &end, loc);
+       float f = strtof_l(s, &end, loc);
+       freelocale(loc);
+       return 0;
+    }]])],
+  [DEFINES="$DEFINES -DHAVE_STRTOD_L"];
+   AC_MSG_RESULT([yes]),
+   AC_MSG_RESULT([no]))
+
 dnl Check to see if dlopen is in default libraries (like Solaris, which
 dnl has it in libc), or if libdl is needed to get it.
 AC_CHECK_FUNC([dlopen], [DEFINES="$DEFINES -DHAVE_DLOPEN"],
@@ -856,8 +923,6 @@
 
 if test "x$pthread_stubs_possible" = xyes; then
     PKG_CHECK_MODULES(PTHREADSTUBS, pthread-stubs >= 0.4)
-    AC_SUBST(PTHREADSTUBS_CFLAGS)
-    AC_SUBST(PTHREADSTUBS_LIBS)
 fi
 
 dnl SELinux awareness.
@@ -1071,16 +1136,12 @@
 dnl
 dnl libunwind
 dnl
+PKG_CHECK_EXISTS(libunwind, [HAVE_LIBUNWIND=yes], [HAVE_LIBUNWIND=no])
 AC_ARG_ENABLE([libunwind],
     [AS_HELP_STRING([--enable-libunwind],
             [Use libunwind for backtracing (default: auto)])],
         [LIBUNWIND="$enableval"],
-        [LIBUNWIND="auto"])
-
-PKG_CHECK_EXISTS(libunwind, [HAVE_LIBUNWIND=yes], [HAVE_LIBUNWIND=no])
-if test "x$LIBUNWIND" = "xauto"; then
-    LIBUNWIND="$HAVE_LIBUNWIND"
-fi
+        [LIBUNWIND="$HAVE_LIBUNWIND"])
 
 if test "x$LIBUNWIND" = "xyes"; then
     PKG_CHECK_MODULES(LIBUNWIND, libunwind)
@@ -1245,7 +1306,7 @@
 AC_ARG_WITH([gallium-drivers],
     [AS_HELP_STRING([--with-gallium-drivers@<:@=DIRS...@:>@],
         [comma delimited Gallium drivers list, e.g.
-        "i915,nouveau,r300,r600,radeonsi,freedreno,svga,swrast,swr,vc4,virgl,etnaviv,imx"
+        "i915,nouveau,r300,r600,radeonsi,freedreno,pl111,svga,swrast,swr,vc4,virgl,etnaviv,imx"
         @<:@default=r300,r600,svga,swrast@:>@])],
     [with_gallium_drivers="$withval"],
     [with_gallium_drivers="$GALLIUM_DRIVERS_DEFAULT"])
@@ -1287,6 +1348,9 @@
 AM_CONDITIONAL(NEED_OPENGL_COMMON, test "x$enable_opengl" = xyes -o \
                                         "x$enable_gles1" = xyes -o \
                                         "x$enable_gles2" = xyes)
+AM_CONDITIONAL(NEED_KHRPLATFORM, test "x$enable_egl" = xyes -o \
+                                      "x$enable_gles1" = xyes -o \
+                                      "x$enable_gles2" = xyes)
 
 # Validate GLX options
 if test "x$enable_glx" = xyes; then
@@ -1408,7 +1472,7 @@
 PKG_CHECK_MODULES([LIBDRM], [libdrm >= $LIBDRM_REQUIRED],
                   [have_libdrm=yes], [have_libdrm=no])
 if test "x$have_libdrm" = xyes; then
-	DEFINES="$DEFINES -DHAVE_LIBDRM"
+    DEFINES="$DEFINES -DHAVE_LIBDRM"
 fi
 
 require_libdrm() {
@@ -1673,55 +1737,64 @@
     AC_PATH_PROG([WAYLAND_SCANNER], [wayland-scanner], [:])
 fi
 
+PKG_CHECK_EXISTS([wayland-protocols >= $WAYLAND_PROTOCOLS_REQUIRED], [have_wayland_protocols=yes], [have_wayland_protocols=no])
+if test "x$have_wayland_protocols" = xyes; then
+    ac_wayland_protocols_pkgdatadir=`$PKG_CONFIG --variable=pkgdatadir wayland-protocols`
+fi
+AC_SUBST(WAYLAND_PROTOCOLS_DATADIR, $ac_wayland_protocols_pkgdatadir)
+
 # Do per platform setups and checks
 platforms=`IFS=', '; echo $with_platforms`
 for plat in $platforms; do
-	case "$plat" in
-	wayland)
+    case "$plat" in
+    wayland)
 
-		PKG_CHECK_MODULES([WAYLAND], [wayland-client >= $WAYLAND_REQUIRED wayland-server >= $WAYLAND_REQUIRED])
+        PKG_CHECK_MODULES([WAYLAND], [wayland-client >= $WAYLAND_REQUIRED wayland-server >= $WAYLAND_REQUIRED])
 
-		if test "x$WAYLAND_SCANNER" = "x:"; then
-			AC_MSG_ERROR([wayland-scanner is needed to compile the wayland platform])
-		fi
-		DEFINES="$DEFINES -DHAVE_WAYLAND_PLATFORM"
-		;;
+        if test "x$WAYLAND_SCANNER" = "x:"; then
+                AC_MSG_ERROR([wayland-scanner is needed to compile the wayland platform])
+        fi
+        if test "x$have_wayland_protocols" = xno; then
+                AC_MSG_ERROR([wayland-protocols >= $WAYLAND_PROTOCOLS_REQUIRED is needed to compile the wayland platform])
+        fi
+        DEFINES="$DEFINES -DHAVE_WAYLAND_PLATFORM"
+        ;;
 
-	x11)
-		PKG_CHECK_MODULES([XCB_DRI2], [x11-xcb xcb xcb-dri2 >= $XCBDRI2_REQUIRED xcb-xfixes])
-		DEFINES="$DEFINES -DHAVE_X11_PLATFORM"
-		;;
+    x11)
+        PKG_CHECK_MODULES([XCB_DRI2], [x11-xcb xcb xcb-dri2 >= $XCBDRI2_REQUIRED xcb-xfixes])
+        DEFINES="$DEFINES -DHAVE_X11_PLATFORM"
+        ;;
 
-	drm)
-		test "x$enable_gbm" = "xno" &&
-			AC_MSG_ERROR([EGL platform drm needs gbm])
-		DEFINES="$DEFINES -DHAVE_DRM_PLATFORM"
-		;;
+    drm)
+        test "x$enable_gbm" = "xno" &&
+                AC_MSG_ERROR([EGL platform drm needs gbm])
+        DEFINES="$DEFINES -DHAVE_DRM_PLATFORM"
+        ;;
 
-	surfaceless)
-		DEFINES="$DEFINES -DHAVE_SURFACELESS_PLATFORM"
-		;;
+    surfaceless)
+        DEFINES="$DEFINES -DHAVE_SURFACELESS_PLATFORM"
+        ;;
 
-	android)
-		PKG_CHECK_MODULES([ANDROID], [cutils hardware sync])
-		DEFINES="$DEFINES -DHAVE_ANDROID_PLATFORM"
-		;;
+    android)
+        PKG_CHECK_MODULES([ANDROID], [cutils hardware sync])
+        DEFINES="$DEFINES -DHAVE_ANDROID_PLATFORM"
+        ;;
 
-	*)
-		AC_MSG_ERROR([platform '$plat' does not exist])
-		;;
-	esac
+    *)
+        AC_MSG_ERROR([platform '$plat' does not exist])
+        ;;
+    esac
 
-	case "$plat" in
-	wayland|drm|surfaceless)
-		require_libdrm "Platform $plat"
-		;;
-	esac
+    case "$plat" in
+    wayland|drm|surfaceless)
+        require_libdrm "Platform $plat"
+        ;;
+    esac
 done
 
 if test "x$enable_glx" != xno; then
     if ! echo "$platforms" | grep -q 'x11'; then
-        AC_MSG_ERROR([Building without the x11 platform as GLX is enabled, is not supported])
+        AC_MSG_ERROR([Building GLX without the x11 platform is not supported])
     fi
 fi
 
@@ -1836,12 +1909,11 @@
         xi915)
             require_libdrm "i915"
             HAVE_I915_DRI=yes
-            PKG_CHECK_MODULES([INTEL], [libdrm >= $LIBDRM_INTEL_REQUIRED libdrm_intel >= $LIBDRM_INTEL_REQUIRED])
+            PKG_CHECK_MODULES([I915], [libdrm >= $LIBDRM_INTEL_REQUIRED libdrm_intel >= $LIBDRM_INTEL_REQUIRED])
             ;;
         xi965)
             require_libdrm "i965"
             HAVE_I965_DRI=yes
-            PKG_CHECK_MODULES([INTEL], [libdrm >= $LIBDRM_INTEL_REQUIRED libdrm_intel >= $LIBDRM_INTEL_REQUIRED])
             ;;
         xnouveau)
             require_libdrm "nouveau"
@@ -1953,7 +2025,6 @@
         case "x$driver" in
         xintel)
             require_libdrm "ANV"
-            PKG_CHECK_MODULES([INTEL], [libdrm >= $LIBDRM_INTEL_REQUIRED libdrm_intel >= $LIBDRM_INTEL_REQUIRED])
             require_x11_dri3 "ANV"
             HAVE_INTEL_VULKAN=yes
             ;;
@@ -2096,15 +2167,15 @@
 dnl
 if test -n "$with_gallium_drivers" -a "x$with_gallium_drivers" != xswrast; then
     if test "x$enable_xvmc" = xauto -a "x$have_xvmc_platform" = xyes; then
-	PKG_CHECK_EXISTS([xvmc >= $XVMC_REQUIRED], [enable_xvmc=yes], [enable_xvmc=no])
+        PKG_CHECK_EXISTS([xvmc >= $XVMC_REQUIRED], [enable_xvmc=yes], [enable_xvmc=no])
     fi
 
     if test "x$enable_vdpau" = xauto -a "x$have_vdpau_platform" = xyes; then
-	PKG_CHECK_EXISTS([vdpau >= $VDPAU_REQUIRED], [enable_vdpau=yes], [enable_vdpau=no])
+        PKG_CHECK_EXISTS([vdpau >= $VDPAU_REQUIRED], [enable_vdpau=yes], [enable_vdpau=no])
     fi
 
     if test "x$enable_omx" = xauto -a "x$have_omx_platform" = xyes; then
-	PKG_CHECK_EXISTS([libomxil-bellagio >= $LIBOMXIL_BELLAGIO_REQUIRED], [enable_omx=yes], [enable_omx=no])
+        PKG_CHECK_EXISTS([libomxil-bellagio >= $LIBOMXIL_BELLAGIO_REQUIRED], [enable_omx=yes], [enable_omx=no])
     fi
 
     if test "x$enable_va" = xauto -a "x$have_va_platform" = xyes; then
@@ -2344,6 +2415,15 @@
     [D3D_DRIVER_INSTALL_DIR="${libdir}/d3d"])
 AC_SUBST([D3D_DRIVER_INSTALL_DIR])
 
+dnl Architectures to build the SWR library for: a comma delimited list that
+dnl is split and validated where the swr gallium driver is configured.
+AC_ARG_WITH([swr-archs],
+    [AS_HELP_STRING([--with-swr-archs@<:@=ARCHS...@:>@],
+        [comma delimited swr architectures list, e.g.
+        "avx,avx2,knl,skx" @<:@default="avx,avx2"@:>@])],
+    [with_swr_archs="$withval"],
+    [with_swr_archs="avx,avx2"])
+
 dnl
 dnl r300 doesn't strictly require LLVM, but for performance reasons we
 dnl highly recommend LLVM usage. So require it at least on x86 and x86_64
@@ -2424,7 +2504,7 @@
             ;;
         xi915)
             HAVE_GALLIUM_I915=yes
-            PKG_CHECK_MODULES([INTEL], [libdrm >= $LIBDRM_INTEL_REQUIRED libdrm_intel >= $LIBDRM_INTEL_REQUIRED])
+            PKG_CHECK_MODULES([I915], [libdrm >= $LIBDRM_INTEL_REQUIRED libdrm_intel >= $LIBDRM_INTEL_REQUIRED])
             require_libdrm "Gallium i915"
             ;;
         xr300)
@@ -2491,16 +2571,50 @@
                 SWR_AVX_CXXFLAGS
             AC_SUBST([SWR_AVX_CXXFLAGS])
 
-            swr_require_cxx_feature_flags "AVX2" "defined(__AVX2__)" \
-                ",-mavx2 -mfma -mbmi2 -mf16c,-march=core-avx2" \
-                SWR_AVX2_CXXFLAGS
-            AC_SUBST([SWR_AVX2_CXXFLAGS])
+            swr_archs=`IFS=', '; echo $with_swr_archs`
+            for arch in $swr_archs; do
+                case "x$arch" in
+                xavx)
+                    HAVE_SWR_AVX=yes
+                    ;;
+                xavx2)
+                    swr_require_cxx_feature_flags "AVX2" "defined(__AVX2__)" \
+                        ",-mavx2 -mfma -mbmi2 -mf16c,-march=core-avx2" \
+                        SWR_AVX2_CXXFLAGS
+                    AC_SUBST([SWR_AVX2_CXXFLAGS])
+                    HAVE_SWR_AVX2=yes
+                    ;;
+                xknl)
+                    swr_require_cxx_feature_flags "KNL" "defined(__AVX512F__) && defined(__AVX512ER__)" \
+                        ",-march=knl,-xMIC-AVX512" \
+                        SWR_KNL_CXXFLAGS
+                    AC_SUBST([SWR_KNL_CXXFLAGS])
+                    HAVE_SWR_KNL=yes
+                    ;;
+                xskx)
+                    swr_require_cxx_feature_flags "SKX" "defined(__AVX512F__) && defined(__AVX512BW__)" \
+                        ",-march=skylake-avx512,-xCORE-AVX512" \
+                        SWR_SKX_CXXFLAGS
+                    AC_SUBST([SWR_SKX_CXXFLAGS])
+                    HAVE_SWR_SKX=yes
+                    ;;
+                *)
+                    AC_MSG_ERROR([unknown SWR build architecture '$arch'])
+                    ;;
+                esac
+            done
+
+            if test "x$HAVE_SWR_AVX" != xyes -a \
+                    "x$HAVE_SWR_AVX2" != xyes -a \
+                    "x$HAVE_SWR_KNL" != xyes -a \
+                    "x$HAVE_SWR_SKX" != xyes; then
+               AC_MSG_ERROR([swr enabled but no swr architectures selected])
+            fi
 
             HAVE_GALLIUM_SWR=yes
             ;;
         xvc4)
             HAVE_GALLIUM_VC4=yes
-            PKG_CHECK_MODULES([VC4], [libdrm >= $LIBDRM_VC4_REQUIRED libdrm_vc4 >= $LIBDRM_VC4_REQUIRED])
             require_libdrm "vc4"
 
             PKG_CHECK_MODULES([SIMPENROSE], [simpenrose],
@@ -2508,6 +2622,9 @@
                                DEFINES="$DEFINES -DUSE_VC4_SIMULATOR"],
                               [USE_VC4_SIMULATOR=no])
             ;;
+        xpl111)
+            HAVE_GALLIUM_PL111=yes
+            ;;
         xvirgl)
             HAVE_GALLIUM_VIRGL=yes
             require_libdrm "virgl"
@@ -2531,12 +2648,21 @@
     llvm_add_default_components "gallium"
 fi
 
+AM_CONDITIONAL(HAVE_SWR_AVX, test "x$HAVE_SWR_AVX" = xyes)
+AM_CONDITIONAL(HAVE_SWR_AVX2, test "x$HAVE_SWR_AVX2" = xyes)
+AM_CONDITIONAL(HAVE_SWR_KNL, test "x$HAVE_SWR_KNL" = xyes)
+AM_CONDITIONAL(HAVE_SWR_SKX, test "x$HAVE_SWR_SKX" = xyes)
+
 dnl We need to validate some needed dependencies for renderonly drivers.
 
 if test "x$HAVE_GALLIUM_ETNAVIV" != xyes -a "x$HAVE_GALLIUM_IMX" = xyes  ; then
     AC_MSG_ERROR([Building with imx requires etnaviv])
 fi
 
+if test "x$HAVE_GALLIUM_VC4" != xyes -a "x$HAVE_GALLIUM_PL111" = xyes  ; then
+    AC_MSG_ERROR([Building with pl111 requires vc4])
+fi
+
 dnl
 dnl Set defines and buildtime variables only when using LLVM.
 dnl
@@ -2601,6 +2727,7 @@
 
 AM_CONDITIONAL(HAVE_GALLIUM_SVGA, test "x$HAVE_GALLIUM_SVGA" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_I915, test "x$HAVE_GALLIUM_I915" = xyes)
+AM_CONDITIONAL(HAVE_GALLIUM_PL111, test "x$HAVE_GALLIUM_PL111" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_R300, test "x$HAVE_GALLIUM_R300" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_R600, test "x$HAVE_GALLIUM_R600" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_RADEONSI, test "x$HAVE_GALLIUM_RADEONSI" = xyes)
@@ -2640,8 +2767,7 @@
 AM_CONDITIONAL(HAVE_RADEON_VULKAN, test "x$HAVE_RADEON_VULKAN" = xyes)
 AM_CONDITIONAL(HAVE_INTEL_VULKAN, test "x$HAVE_INTEL_VULKAN" = xyes)
 
-AM_CONDITIONAL(HAVE_AMD_DRIVERS, test "x$HAVE_GALLIUM_R600" = xyes -o \
-                                      "x$HAVE_GALLIUM_RADEONSI" = xyes -o \
+AM_CONDITIONAL(HAVE_AMD_DRIVERS, test "x$HAVE_GALLIUM_RADEONSI" = xyes -o \
                                       "x$HAVE_RADEON_VULKAN" = xyes)
 
 AM_CONDITIONAL(HAVE_INTEL_DRIVERS, test "x$HAVE_INTEL_VULKAN" = xyes -o \
@@ -2664,6 +2790,9 @@
 AM_CONDITIONAL(HAVE_X86_ASM, test "x$asm_arch" = xx86 -o "x$asm_arch" = xx86_64)
 AM_CONDITIONAL(HAVE_X86_64_ASM, test "x$asm_arch" = xx86_64)
 AM_CONDITIONAL(HAVE_SPARC_ASM, test "x$asm_arch" = xsparc)
+AM_CONDITIONAL(HAVE_PPC64LE_ASM, test "x$asm_arch" = xppc64le)
+AM_CONDITIONAL(HAVE_AARCH64_ASM, test "x$asm_arch" = xaarch64)
+AM_CONDITIONAL(HAVE_ARM_ASM, test "x$asm_arch" = xarm)
 
 AC_SUBST([NINE_MAJOR], 1)
 AC_SUBST([NINE_MINOR], 0)
@@ -2700,18 +2829,18 @@
                              [Build mesa with valgrind support (default: auto)])],
                              [VALGRIND=$enableval], [VALGRIND=auto])
 if test "x$VALGRIND" != xno; then
-	PKG_CHECK_MODULES(VALGRIND, [valgrind], [have_valgrind=yes], [have_valgrind=no])
+    PKG_CHECK_MODULES(VALGRIND, [valgrind], [have_valgrind=yes], [have_valgrind=no])
 fi
 AC_MSG_CHECKING([whether to enable Valgrind support])
 if test "x$VALGRIND" = xauto; then
-	VALGRIND="$have_valgrind"
+    VALGRIND="$have_valgrind"
 fi
 
 if test "x$VALGRIND" = "xyes"; then
-	if ! test "x$have_valgrind" = xyes; then
-		AC_MSG_ERROR([Valgrind support required but not present])
-	fi
-	AC_DEFINE([HAVE_VALGRIND], 1, [Use valgrind intrinsics to suppress false warnings])
+    if ! test "x$have_valgrind" = xyes; then
+        AC_MSG_ERROR([Valgrind support required but not present])
+    fi
+    AC_DEFINE([HAVE_VALGRIND], 1, [Use valgrind intrinsics to suppress false warnings])
 fi
 
 AC_MSG_RESULT([$VALGRIND])
@@ -2732,113 +2861,116 @@
 
 dnl Substitute the config
 AC_CONFIG_FILES([Makefile
-		src/Makefile
-		src/amd/Makefile
-		src/amd/vulkan/Makefile
-		src/compiler/Makefile
-		src/egl/Makefile
-		src/egl/main/egl.pc
-		src/egl/wayland/wayland-drm/Makefile
-		src/egl/wayland/wayland-egl/Makefile
-		src/egl/wayland/wayland-egl/wayland-egl.pc
-		src/gallium/Makefile
-		src/gallium/auxiliary/Makefile
-		src/gallium/auxiliary/pipe-loader/Makefile
-		src/gallium/drivers/freedreno/Makefile
-		src/gallium/drivers/ddebug/Makefile
-		src/gallium/drivers/i915/Makefile
-		src/gallium/drivers/llvmpipe/Makefile
-		src/gallium/drivers/noop/Makefile
-		src/gallium/drivers/nouveau/Makefile
-		src/gallium/drivers/r300/Makefile
-		src/gallium/drivers/r600/Makefile
-		src/gallium/drivers/radeon/Makefile
-		src/gallium/drivers/radeonsi/Makefile
-		src/gallium/drivers/rbug/Makefile
-		src/gallium/drivers/softpipe/Makefile
-		src/gallium/drivers/svga/Makefile
-		src/gallium/drivers/swr/Makefile
-		src/gallium/drivers/trace/Makefile
-		src/gallium/drivers/etnaviv/Makefile
-		src/gallium/drivers/imx/Makefile
-		src/gallium/drivers/vc4/Makefile
-		src/gallium/drivers/virgl/Makefile
-		src/gallium/state_trackers/clover/Makefile
-		src/gallium/state_trackers/dri/Makefile
-		src/gallium/state_trackers/glx/xlib/Makefile
-		src/gallium/state_trackers/nine/Makefile
-		src/gallium/state_trackers/omx/Makefile
-		src/gallium/state_trackers/osmesa/Makefile
-		src/gallium/state_trackers/va/Makefile
-		src/gallium/state_trackers/vdpau/Makefile
-		src/gallium/state_trackers/xa/Makefile
-		src/gallium/state_trackers/xvmc/Makefile
-		src/gallium/targets/d3dadapter9/Makefile
-		src/gallium/targets/d3dadapter9/d3d.pc
-		src/gallium/targets/dri/Makefile
-		src/gallium/targets/libgl-xlib/Makefile
-		src/gallium/targets/omx/Makefile
-		src/gallium/targets/opencl/Makefile
-		src/gallium/targets/opencl/mesa.icd
-		src/gallium/targets/osmesa/Makefile
-		src/gallium/targets/osmesa/osmesa.pc
-		src/gallium/targets/pipe-loader/Makefile
-		src/gallium/targets/va/Makefile
-		src/gallium/targets/vdpau/Makefile
-		src/gallium/targets/xa/Makefile
-		src/gallium/targets/xa/xatracker.pc
-		src/gallium/targets/xvmc/Makefile
-		src/gallium/tests/trivial/Makefile
-		src/gallium/tests/unit/Makefile
-		src/gallium/winsys/etnaviv/drm/Makefile
-		src/gallium/winsys/imx/drm/Makefile
-		src/gallium/winsys/freedreno/drm/Makefile
-		src/gallium/winsys/i915/drm/Makefile
-		src/gallium/winsys/nouveau/drm/Makefile
-		src/gallium/winsys/radeon/drm/Makefile
-		src/gallium/winsys/amdgpu/drm/Makefile
-		src/gallium/winsys/svga/drm/Makefile
-		src/gallium/winsys/sw/dri/Makefile
-		src/gallium/winsys/sw/kms-dri/Makefile
-		src/gallium/winsys/sw/null/Makefile
-		src/gallium/winsys/sw/wrapper/Makefile
-		src/gallium/winsys/sw/xlib/Makefile
-		src/gallium/winsys/vc4/drm/Makefile
-		src/gallium/winsys/virgl/drm/Makefile
-		src/gallium/winsys/virgl/vtest/Makefile
-		src/gbm/Makefile
-		src/gbm/main/gbm.pc
-		src/glx/Makefile
-		src/glx/apple/Makefile
-		src/glx/tests/Makefile
-		src/glx/windows/Makefile
-		src/glx/windows/windowsdriproto.pc
-		src/gtest/Makefile
-		src/intel/Makefile
-		src/loader/Makefile
-		src/mapi/Makefile
-		src/mapi/es1api/glesv1_cm.pc
-		src/mapi/es2api/glesv2.pc
-		src/mapi/glapi/gen/Makefile
-		src/mesa/Makefile
-		src/mesa/gl.pc
-		src/mesa/drivers/dri/dri.pc
-		src/mesa/drivers/dri/common/Makefile
-		src/mesa/drivers/dri/common/xmlpool/Makefile
-		src/mesa/drivers/dri/i915/Makefile
-		src/mesa/drivers/dri/i965/Makefile
-		src/mesa/drivers/dri/Makefile
-		src/mesa/drivers/dri/nouveau/Makefile
-		src/mesa/drivers/dri/r200/Makefile
-		src/mesa/drivers/dri/radeon/Makefile
-		src/mesa/drivers/dri/swrast/Makefile
-		src/mesa/drivers/osmesa/Makefile
-		src/mesa/drivers/osmesa/osmesa.pc
-		src/mesa/drivers/x11/Makefile
-		src/mesa/main/tests/Makefile
-		src/util/Makefile
-		src/util/tests/hash_table/Makefile
-		src/vulkan/Makefile])
+                 src/Makefile
+                 src/amd/Makefile
+                 src/amd/vulkan/Makefile
+                 src/broadcom/Makefile
+                 src/compiler/Makefile
+                 src/egl/Makefile
+                 src/egl/main/egl.pc
+                 src/egl/wayland/wayland-drm/Makefile
+                 src/egl/wayland/wayland-egl/Makefile
+                 src/egl/wayland/wayland-egl/wayland-egl.pc
+                 src/gallium/Makefile
+                 src/gallium/auxiliary/Makefile
+                 src/gallium/auxiliary/pipe-loader/Makefile
+                 src/gallium/drivers/freedreno/Makefile
+                 src/gallium/drivers/ddebug/Makefile
+                 src/gallium/drivers/i915/Makefile
+                 src/gallium/drivers/llvmpipe/Makefile
+                 src/gallium/drivers/noop/Makefile
+                 src/gallium/drivers/nouveau/Makefile
+                 src/gallium/drivers/pl111/Makefile
+                 src/gallium/drivers/r300/Makefile
+                 src/gallium/drivers/r600/Makefile
+                 src/gallium/drivers/radeon/Makefile
+                 src/gallium/drivers/radeonsi/Makefile
+                 src/gallium/drivers/rbug/Makefile
+                 src/gallium/drivers/softpipe/Makefile
+                 src/gallium/drivers/svga/Makefile
+                 src/gallium/drivers/swr/Makefile
+                 src/gallium/drivers/trace/Makefile
+                 src/gallium/drivers/etnaviv/Makefile
+                 src/gallium/drivers/imx/Makefile
+                 src/gallium/drivers/vc4/Makefile
+                 src/gallium/drivers/virgl/Makefile
+                 src/gallium/state_trackers/clover/Makefile
+                 src/gallium/state_trackers/dri/Makefile
+                 src/gallium/state_trackers/glx/xlib/Makefile
+                 src/gallium/state_trackers/nine/Makefile
+                 src/gallium/state_trackers/omx/Makefile
+                 src/gallium/state_trackers/osmesa/Makefile
+                 src/gallium/state_trackers/va/Makefile
+                 src/gallium/state_trackers/vdpau/Makefile
+                 src/gallium/state_trackers/xa/Makefile
+                 src/gallium/state_trackers/xvmc/Makefile
+                 src/gallium/targets/d3dadapter9/Makefile
+                 src/gallium/targets/d3dadapter9/d3d.pc
+                 src/gallium/targets/dri/Makefile
+                 src/gallium/targets/libgl-xlib/Makefile
+                 src/gallium/targets/omx/Makefile
+                 src/gallium/targets/opencl/Makefile
+                 src/gallium/targets/opencl/mesa.icd
+                 src/gallium/targets/osmesa/Makefile
+                 src/gallium/targets/osmesa/osmesa.pc
+                 src/gallium/targets/pipe-loader/Makefile
+                 src/gallium/targets/va/Makefile
+                 src/gallium/targets/vdpau/Makefile
+                 src/gallium/targets/xa/Makefile
+                 src/gallium/targets/xa/xatracker.pc
+                 src/gallium/targets/xvmc/Makefile
+                 src/gallium/tests/trivial/Makefile
+                 src/gallium/tests/unit/Makefile
+                 src/gallium/winsys/etnaviv/drm/Makefile
+                 src/gallium/winsys/imx/drm/Makefile
+                 src/gallium/winsys/freedreno/drm/Makefile
+                 src/gallium/winsys/i915/drm/Makefile
+                 src/gallium/winsys/nouveau/drm/Makefile
+                 src/gallium/winsys/pl111/drm/Makefile
+                 src/gallium/winsys/radeon/drm/Makefile
+                 src/gallium/winsys/amdgpu/drm/Makefile
+                 src/gallium/winsys/svga/drm/Makefile
+                 src/gallium/winsys/sw/dri/Makefile
+                 src/gallium/winsys/sw/kms-dri/Makefile
+                 src/gallium/winsys/sw/null/Makefile
+                 src/gallium/winsys/sw/wrapper/Makefile
+                 src/gallium/winsys/sw/xlib/Makefile
+                 src/gallium/winsys/vc4/drm/Makefile
+                 src/gallium/winsys/virgl/drm/Makefile
+                 src/gallium/winsys/virgl/vtest/Makefile
+                 src/gbm/Makefile
+                 src/gbm/main/gbm.pc
+                 src/glx/Makefile
+                 src/glx/apple/Makefile
+                 src/glx/tests/Makefile
+                 src/glx/windows/Makefile
+                 src/glx/windows/windowsdriproto.pc
+                 src/gtest/Makefile
+                 src/intel/Makefile
+                 src/loader/Makefile
+                 src/mapi/Makefile
+                 src/mapi/es1api/glesv1_cm.pc
+                 src/mapi/es2api/glesv2.pc
+                 src/mapi/glapi/gen/Makefile
+                 src/mesa/Makefile
+                 src/mesa/gl.pc
+                 src/mesa/drivers/dri/dri.pc
+                 src/mesa/drivers/dri/common/Makefile
+                 src/mesa/drivers/dri/common/xmlpool/Makefile
+                 src/mesa/drivers/dri/i915/Makefile
+                 src/mesa/drivers/dri/i965/Makefile
+                 src/mesa/drivers/dri/Makefile
+                 src/mesa/drivers/dri/nouveau/Makefile
+                 src/mesa/drivers/dri/r200/Makefile
+                 src/mesa/drivers/dri/radeon/Makefile
+                 src/mesa/drivers/dri/swrast/Makefile
+                 src/mesa/drivers/osmesa/Makefile
+                 src/mesa/drivers/osmesa/osmesa.pc
+                 src/mesa/drivers/x11/Makefile
+                 src/mesa/main/tests/Makefile
+                 src/util/Makefile
+                 src/util/tests/hash_table/Makefile
+                 src/vulkan/Makefile])
 
 AC_OUTPUT
 
@@ -2962,6 +3094,11 @@
     echo "        HUD lmsensors:   yes"
 fi
 
+echo ""
+if test "x$HAVE_GALLIUM_SWR" != x; then
+    echo "        SWR archs:       $swr_archs"
+fi
+
 dnl Libraries
 echo ""
 echo "        Shared libs:     $enable_shared"
@@ -2969,15 +3106,17 @@
 echo "        Shared-glapi:    $enable_shared_glapi"
 
 dnl Compiler options
-# cleanup the CFLAGS/CXXFLAGS/DEFINES vars
+# cleanup the CFLAGS/CXXFLAGS/LDFLAGS/DEFINES vars
 cflags=`echo $CFLAGS | \
     $SED 's/^ *//;s/  */ /;s/ *$//'`
 cxxflags=`echo $CXXFLAGS | \
     $SED 's/^ *//;s/  */ /;s/ *$//'`
+ldflags=`echo $LDFLAGS | $SED 's/^ *//;s/  */ /;s/ *$//'`
 defines=`echo $DEFINES | $SED 's/^ *//;s/  */ /;s/ *$//'`
 echo ""
 echo "        CFLAGS:          $cflags"
 echo "        CXXFLAGS:        $cxxflags"
+echo "        LDFLAGS:         $ldflags"
 echo "        Macros:          $defines"
 echo ""
 if test "x$enable_llvm" = xyes; then
diff --git a/docs/bugs.html b/docs/bugs.html
index 445d9ca..44955d3 100644
--- a/docs/bugs.html
+++ b/docs/bugs.html
@@ -37,11 +37,14 @@
 the problem.
 <li>Check if your bug is already reported in the database.
 <li>Monitor your bug report for requests for additional information, etc.
+<li>Attach the output of running glxinfo or wglinfo.
+This will tell us the Mesa version, which device driver you're using, etc.
 <li>If you're reporting a crash, try to use your debugger (gdb) to get a stack
 trace. Also, recompile Mesa in debug mode to get more detailed information.
 <li>Describe in detail how to reproduce the bug, especially with games
 and applications that the Mesa developers might not be familiar with.
-<li>Provide a simple GLUT-based test program if possible
+<li>Provide an <a href="https://github.com/apitrace/apitrace">apitrace</a>
+or a simple GLUT-based test program if possible.
 </ul>
 
 <p>
diff --git a/docs/contents.html b/docs/contents.html
index 90a1a00..d545542 100644
--- a/docs/contents.html
+++ b/docs/contents.html
@@ -84,6 +84,7 @@
 <li><a href="codingstyle.html" target="_parent">Coding Style</a>
 <li><a href="submittingpatches.html" target="_parent">Submitting patches</a>
 <li><a href="releasing.html" target="_parent">Releasing process</a>
+<li><a href="release-calendar.html" target="_parent">Release calendar</a>
 <li><a href="sourcedocs.html" target="_parent">Source Documentation</a>
 <li><a href="dispatch.html" target="_parent">GL Dispatch</a>
 </ul>
diff --git a/docs/download.html b/docs/download.html
index c68f4ba..6b2b60c 100644
--- a/docs/download.html
+++ b/docs/download.html
@@ -20,7 +20,7 @@
 Primary Mesa download site:
 <a href="ftp://ftp.freedesktop.org/pub/mesa/">ftp.freedesktop.org</a> (FTP)
 or <a href="https://mesa.freedesktop.org/archive/">mesa.freedesktop.org</a>
-(HTTP).
+(HTTPS).
 </p>
 
 <p>
diff --git a/docs/egl.html b/docs/egl.html
index f072ce1..e752a70 100644
--- a/docs/egl.html
+++ b/docs/egl.html
@@ -130,27 +130,6 @@
 runtime</p>
 
 <dl>
-<dt><code>EGL_DRIVERS_PATH</code></dt>
-<dd>
-
-<p>By default, the main library will look for drivers in the directory where
-the drivers are installed to.  This variable specifies a list of
-colon-separated directories where the main library will look for drivers, in
-addition to the default directory.  This variable is ignored for setuid/setgid
-binaries.</p>
-
-<p>This variable is usually set to test an uninstalled build.  For example, one
-may set</p>
-
-<pre>
-  $ export LD_LIBRARY_PATH=$mesa/lib
-  $ export EGL_DRIVERS_PATH=$mesa/lib/egl
-</pre>
-
-<p>to test a build without installation</p>
-
-</dd>
-
 <dt><code>EGL_DRIVER</code></dt>
 <dd>
 
diff --git a/docs/envvars.html b/docs/envvars.html
index 6537365..9e2f816 100644
--- a/docs/envvars.html
+++ b/docs/envvars.html
@@ -46,6 +46,9 @@
 <li>MESA_NO_MMX - if set, disables Intel MMX optimizations
 <li>MESA_NO_3DNOW - if set, disables AMD 3DNow! optimizations
 <li>MESA_NO_SSE - if set, disables Intel SSE optimizations
+<li>MESA_NO_ERROR - if set, error checking is disabled as per KHR_no_error.
+   This will result in undefined behaviour for invalid use of the API, but
+   can reduce CPU use for apps that are known to be error free.</li>
 <li>MESA_DEBUG - if set, error messages are printed to stderr.  For example,
    if the application generates a GL_INVALID_ENUM error, a corresponding error
    message indicating where the error occurred, and possibly why, will be
@@ -160,48 +163,47 @@
    This is useful for debugging hangs, etc.</li>
 <li>INTEL_DEBUG - a comma-separated list of named flags, which do various things:
 <ul>
-   <li>color - use color in output</li>
-   <li>tex - emit messages about textures.</li>
-   <li>state - emit messages about state flag tracking</li>
-   <li>blit - emit messages about blit operations</li>
-   <li>miptree - emit messages about miptrees</li>
-   <li>perf - emit messages about performance issues</li>
-   <li>perfmon - emit messages about AMD_performance_monitor</li>
+   <li>ann - annotate IR in assembly dumps</li>
+   <li>aub - dump batches into an AUB trace for use with simulation tools</li>
    <li>bat - emit batch information</li>
-   <li>pix - emit messages about pixel operations</li>
+   <li>blit - emit messages about blit operations</li>
+   <li>blorp - emit messages about the blorp operations (blits &amp; clears)</li>
    <li>buf - emit messages about buffer objects</li>
+   <li>clip - emit messages about the clip unit (for old gens, includes the CLIP program)</li>
+   <li>color - use color in output</li>
+   <li>cs - dump shader assembly for compute shaders</li>
+   <li>do32 - generate compute shader SIMD32 programs even if workgroup size doesn't exceed the SIMD16 limit</li>
+   <li>dri - emit messages about the DRI interface</li>
    <li>fbo - emit messages about framebuffers</li>
    <li>fs - dump shader assembly for fragment shaders</li>
    <li>gs - dump shader assembly for geometry shaders</li>
-   <li>sync - after sending each batch, emit a message and wait for that batch to finish rendering</li>
-   <li>prim - emit messages about drawing primitives</li>
-   <li>vert - emit messages about vertex assembly</li>
-   <li>dri - emit messages about the DRI interface</li>
-   <li>sf - emit messages about the strips &amp; fans unit (for old gens, includes the SF program)</li>
-   <li>stats - enable statistics counters. you probably actually want perfmon or intel_gpu_top instead.</li>
-   <li>urb - emit messages about URB setup</li>
-   <li>vs - dump shader assembly for vertex shaders</li>
-   <li>clip - emit messages about the clip unit (for old gens, includes the CLIP program)</li>
-   <li>aub - dump batches into an AUB trace for use with simulation tools</li>
-   <li>shader_time - record how much GPU time is spent in each shader</li>
-   <li>no16 - suppress generation of 16-wide fragment shaders. useful for debugging broken shaders</li>
-   <li>blorp - emit messages about the blorp operations (blits &amp; clears)</li>
-   <li>nodualobj - suppress generation of dual-object geometry shader code</li>
-   <li>optimizer - dump shader assembly to files at each optimization pass and iteration that make progress</li>
-   <li>ann - annotate IR in assembly dumps</li>
+   <li>hex - print instruction hex dump with the disassembly</li>
+   <li>l3 - emit messages about the new L3 state during transitions</li>
+   <li>miptree - emit messages about miptrees</li>
    <li>no8 - don't generate SIMD8 fragment shader</li>
-   <li>vec4 - force vec4 mode in vertex shader</li>
+   <li>no16 - suppress generation of 16-wide fragment shaders. Useful for debugging broken shaders</li>
+   <li>nocompact - disable instruction compaction</li>
+   <li>nodualobj - suppress generation of dual-object geometry shader code</li>
+   <li>norbc - disable single sampled render buffer compression</li>
+   <li>optimizer - dump shader assembly to files at each optimization pass and iteration that make progress</li>
+   <li>perf - emit messages about performance issues</li>
+   <li>perfmon - emit messages about AMD_performance_monitor</li>
+   <li>pix - emit messages about pixel operations</li>
+   <li>prim - emit messages about drawing primitives</li>
+   <li>sf - emit messages about the strips &amp; fans unit (for old gens, includes the SF program)</li>
+   <li>shader_time - record how much GPU time is spent in each shader</li>
    <li>spill_fs - force spilling of all registers in the scalar backend (useful to debug spilling code)</li>
    <li>spill_vec4 - force spilling of all registers in the vec4 backend (useful to debug spilling code)</li>
-   <li>cs - dump shader assembly for compute shaders</li>
-   <li>hex - print instruction hex dump with the disassembly</li>
-   <li>nocompact - disable instruction compaction</li>
+   <li>state - emit messages about state flag tracking</li>
+   <li>sync - after sending each batch, emit a message and wait for that batch to finish rendering</li>
    <li>tcs - dump shader assembly for tessellation control shaders</li>
    <li>tes - dump shader assembly for tessellation evaluation shaders</li>
-   <li>l3 - emit messages about the new L3 state during transitions</li>
-   <li>do32 - generate compute shader SIMD32 programs even if workgroup size doesn't exceed the SIMD16 limit</li>
-   <li>norbc - disable single sampled render buffer compression</li>
+   <li>tex - emit messages about textures.</li>
+   <li>urb - emit messages about URB setup</li>
+   <li>vert - emit messages about vertex assembly</li>
+   <li>vs - dump shader assembly for vertex shaders</li>
 </ul>
+<li>INTEL_SCALAR_VS (or TCS, TES, GS) - force scalar/vec4 mode for a shader stage (Gen8-9 only)</li>
 <li>INTEL_PRECISE_TRIG - if set to 1, true or yes, then the driver prefers
    accuracy over performance in trig functions.</li>
 </ul>
@@ -302,6 +304,8 @@
 (will often result in incorrect rendering).
 <li>SVGA_DEBUG - for dumping shaders, constant buffers, etc.  See the code
 for details.
+<li>SVGA_EXTRA_LOGGING - if set, enables extra logging to the vmware.log file,
+such as the OpenGL program's name and command line arguments.
 <li>See the driver code for other, lesser-used variables.
 </ul>
 
diff --git a/docs/features.txt b/docs/features.txt
index 5f63632..1f628e1 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -277,7 +277,7 @@
 
 Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES version:
 
-  GL_ARB_bindless_texture                               started (airlied)
+  GL_ARB_bindless_texture                               DONE (radeonsi)
   GL_ARB_cl_event                                       not started
   GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
   GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)
@@ -292,12 +292,12 @@
   GL_ARB_sample_locations                               not started
   GL_ARB_seamless_cubemap_per_texture                   DONE (i965, nvc0, radeonsi, r600, softpipe, swr)
   GL_ARB_shader_atomic_counter_ops                      DONE (i965/gen7+, nvc0, radeonsi, softpipe)
-  GL_ARB_shader_ballot                                  DONE (nvc0, radeonsi)
+  GL_ARB_shader_ballot                                  DONE (i965/gen8+, nvc0, radeonsi)
   GL_ARB_shader_clock                                   DONE (i965/gen7+, nv50, nvc0, radeonsi)
   GL_ARB_shader_draw_parameters                         DONE (i965, nvc0, radeonsi)
-  GL_ARB_shader_group_vote                              DONE (nvc0, radeonsi)
+  GL_ARB_shader_group_vote                              DONE (i965, nvc0, radeonsi)
   GL_ARB_shader_stencil_export                          DONE (i965/gen9+, radeonsi, softpipe, llvmpipe, swr)
-  GL_ARB_shader_viewport_layer_array                    DONE (i965/gen6+, radeonsi)
+  GL_ARB_shader_viewport_layer_array                    DONE (i965/gen6+, nvc0, radeonsi)
   GL_ARB_sparse_buffer                                  DONE (radeonsi/CIK+)
   GL_ARB_sparse_texture                                 not started
   GL_ARB_sparse_texture2                                not started
@@ -305,9 +305,9 @@
   GL_ARB_texture_filter_minmax                          not started
   GL_ARB_transform_feedback_overflow_query              DONE (i965/gen6+)
   GL_KHR_blend_equation_advanced_coherent               DONE (i965/gen9+)
-  GL_KHR_no_error                                       not started
-  GL_KHR_texture_compression_astc_hdr                   DONE (core only)
-  GL_KHR_texture_compression_astc_sliced_3d             not started
+  GL_KHR_no_error                                       started (Timothy Arceri)
+  GL_KHR_texture_compression_astc_hdr                   DONE (i965/bxt)
+  GL_KHR_texture_compression_astc_sliced_3d             DONE (i965/gen9+)
   GL_OES_depth_texture_cube_map                         DONE (all drivers that support GLSL 1.30+)
   GL_OES_EGL_image                                      DONE (all drivers)
   GL_OES_EGL_image_external_essl3                       not started
diff --git a/docs/index.html b/docs/index.html
index 40c9e5a..83071f2 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,71 @@
 
 <h1>News</h1>
 
+<h2>July 14, 2017</h2>
+<p>
+<a href="relnotes/17.1.5.html">Mesa 17.1.5</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>June 30, 2017</h2>
+<p>
+<a href="relnotes/17.1.4.html">Mesa 17.1.4</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>June 19, 2017</h2>
+<p>
+<a href="relnotes/17.1.3.html">Mesa 17.1.3</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>June 5, 2017</h2>
+<p>
+<a href="relnotes/17.1.2.html">Mesa 17.1.2</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>June 1, 2017</h2>
+<p>
+<a href="relnotes/17.0.7.html">Mesa 17.0.7</a> is released.
+This is a bug-fix release.
+<br>
+NOTE: It is anticipated that 17.0.7 will be the final release in the 17.0
+series. Users of 17.0 are encouraged to migrate to the 17.1 series in order
+to obtain future fixes.
+</p>
+
+<h2>May 25, 2017</h2>
+<p>
+<a href="relnotes/17.1.1.html">Mesa 17.1.1</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>May 12, 2017</h2>
+<p>
+<a href="relnotes/17.0.6.html">Mesa 17.0.6</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>May 10, 2017</h2>
+<p>
+<a href="relnotes/17.1.0.html">Mesa 17.1.0</a> is released.  This is a
+new development release.  See the release notes for more information
+about the release.
+</p>
+
+<h2>April 28, 2017</h2>
+<p>
+<a href="relnotes/17.0.5.html">Mesa 17.0.5</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>April 17, 2017</h2>
+<p>
+<a href="relnotes/17.0.4.html">Mesa 17.0.4</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>April 1, 2017</h2>
 <p>
 <a href="relnotes/17.0.3.html">Mesa 17.0.3</a> is released.
diff --git a/docs/release-calendar.html b/docs/release-calendar.html
new file mode 100644
index 0000000..9e1f3cc
--- /dev/null
+++ b/docs/release-calendar.html
@@ -0,0 +1,94 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Release calendar</title>
+  <link rel="stylesheet" type="text/css" href="mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="contents.html"></iframe>
+<div class="content">
+
+<h1>Overview</h1>
+
+<p>
+Mesa provides feature/development and stable releases.
+</p>
+<p>
+The table below lists the expected date of each upcoming release and the
+release manager responsible for it.
+<br>
+Take a look <a href="submittingpatches.html#criteria" target="_parent">here</a>
+if you'd like to nominate a patch for the next stable release.
+</p>
+
+<h1 id="calendar">Calendar</h1>
+
+<table border="1">
+
+<tr>
+<th>Branch</th>
+<th>Expected date</th>
+<th>Release</th>
+<th>Release manager</th>
+<th>Notes</th>
+</tr>
+<tr>
+<td rowspan="3">17.1</td>
+<td>2017-07-28</td>
+<td>17.1.6</td>
+<td>Emil Velikov</td>
+<td></td>
+</tr>
+<tr>
+<td>2017-08-11</td>
+<td>17.1.7</td>
+<td>Juan A. Suarez Romero</td>
+<td></td>
+</tr>
+<tr>
+<td>2017-08-25</td>
+<td>17.1.8</td>
+<td>Andres Gomez</td>
+<td>Final planned release for the 17.1 series</td>
+</tr>
+<tr>
+<td rowspan="5">17.2</td>
+<td>2017-07-21</td>
+<td>17.2.0-rc1</td>
+<td>Emil Velikov</td>
+<td></td>
+</tr>
+<tr>
+<td>2017-07-28</td>
+<td>17.2.0-rc2</td>
+<td>Emil Velikov</td>
+<td></td>
+</tr>
+<tr>
+<td>2017-08-04</td>
+<td>17.2.0-rc3</td>
+<td>Emil Velikov</td>
+<td></td>
+</tr>
+<tr>
+<td>2017-08-11</td>
+<td>17.2.0-rc4</td>
+<td>Emil Velikov</td>
+<td>May be promoted to 17.2.0 final</td>
+</tr>
+<tr>
+<td>2017-08-25</td>
+<td>17.2.1</td>
+<td>Emil Velikov</td>
+<td></td>
+</table>
+
+</div>
+</body>
+</html>
diff --git a/docs/releasing.html b/docs/releasing.html
index 9530ab0..0af05af 100644
--- a/docs/releasing.html
+++ b/docs/releasing.html
@@ -14,6 +14,7 @@
 <iframe src="contents.html"></iframe>
 <div class="content">
 
+
 <h1>Releasing process</h1>
 
 <ul>
@@ -28,6 +29,7 @@
 <li><a href="#bugzilla">Update Bugzilla</a>
 </ul>
 
+
 <h1 id="overview">Overview</h1>
 
 <p>
@@ -48,11 +50,15 @@
 	Mesa 12.0.2 - 12.0 branch, bugfix
 </pre>
 
+
 <h1 id="schedule">Release schedule</h1>
 
 <p>
 Releases should happen on Fridays. Delays can occur although those should be keep
 to a minimum.
+<br>
+See our <a href="release-calendar.html" target="_parent">calendar</a> for the
+date and other details for individual releases.
 </p>
 
 <h2>Feature releases</h2>
@@ -79,15 +85,24 @@
 time (or shortly after) 13.0.1 is out.
 </p>
 
+
 <h1 id="pickntest">Cherry-picking and testing</h1>
 
 <p>
 Commits nominated for the active branch are picked as based on the
 <a href="submittingpatches.html#criteria" target="_parent">criteria</a> as
 described in the same section.
+</p>
 
 <p>
-Maintainer is responsible for testing in various possible permutations of
+Nomination happens on the mesa-stable@ mailing list. However, the
+maintainer is responsible for checking for forgotten candidates in the
+master branch. This is achieved by a combination of ad-hoc scripts and
+a casual search for terms such as regression, fix, broken and similar.
+</p>
+
+<p>
+Maintainer is also responsible for testing in various possible permutations of
 the autoconf and scons build.
 </p>
 
@@ -101,33 +116,57 @@
 and the patch <strong>must</strong> be very well contained. Thus it cannot
 affect more than one driver/subsystem.
 </p>
+
 <p>
 Currently Ilia Mirkin and AMD devs have requested "permanent" exception.
 </p>
 
-
 <ul>
 <li>make distcheck, scons and scons check must pass
 <li>Testing with different version of system components - LLVM and others is also
 performed where possible.
+<li>As a general rule, testing with various combinations of configure
+switches, depending on the specific patchset.
 </ul>
+
 <p>
-Achieved by combination of local ad-hoc scripts and AppVeyor plus Travis-CI,
-the latter as part of their Github integration.
+Achieved by combination of local ad-hoc scripts, mingw-w64 cross
+compilation and AppVeyor plus Travis-CI, the latter as part of their
+Github integration.
 </p>
+
+<p>
+For Windows related changes, the main contact point is Brian
+Paul. Jose Fonseca can also help as a fallback contact.
+</p>
+
+<p>
+For Android related changes, the main contact is Tapani
+P&auml;lli. Mauro Rossi is collaborating with android-x86 and may
+provide feedback about the build status in that project.
+</p>
+
+<p>
+For MacOSX related changes, Jeremy Huddleston Sequoia is currently a
+good contact point.
+</p>
+
 <p>
 <strong>Note:</strong> If a patch in the current queue needs any additional
 fix(es), then they should be squashed together.
 <br>
 The commit messages and the <code>cherry picked from</code> tags must be preserved.
 </p>
+
 <p>
 This should be noted in the <a href="#prerelease">pre-announce</a> email.
+</p>
+
 <pre>
     git show b10859ec41d09c57663a258f43fe57c12332698e
 
     commit b10859ec41d09c57663a258f43fe57c12332698e
-    Author: Jonas Pfeil &ltpfeiljonas@gmx.de&gt
+    Author: Jonas Pfeil &lt;pfeiljonas@gmx.de&gt;
     Date:   Wed Mar 1 18:11:10 2017 +0100
 
         ralloc: Make sure ralloc() allocations match malloc()'s alignment.
@@ -146,7 +185,6 @@
 
         (cherry picked from commit ff494fe999510ea40e3ed5827e7818550b6de126)
 </pre>
-</p>
 
 <h2>Regression/functionality testing</h2>
 
@@ -154,15 +192,23 @@
 Less often (once or twice), shortly before the pre-release announcement.
 Ensure that testing is redone if Intel devs have requested an exception, as per above.
 </p>
+
 <ul>
 <li><em>no regressions should be observed for Piglit/dEQP/CTS/Vulkan on Intel platforms</em>
 <li><em>no regressions should be observed for Piglit using the swrast, softpipe
 and llvmpipe drivers</em>
 </ul>
+
 <p>
 Currently testing is performed courtesy of the Intel OTC team and their Jenkins CI setup. Check with the Intel team over IRC how to get things setup.
 </p>
 
+<p>
+Installing the built driver from the pre-announced RC branch in the
+system and using it day-to-day until the release may be a good
+idea too.
+</p>
+
 
 <h1 id="branch">Making a branchpoint</h1>
 
@@ -202,15 +248,18 @@
 Now go to
 <a href="https://bugs.freedesktop.org/editversions.cgi?action=add&amp;product=Mesa" target="_parent">Bugzilla</a> and add the new Mesa version X.Y.
 </p>
+
 <p>
 Check that there are no distribution breaking changes and revert them if needed.
 For example: files being overwritten on install, etc. Happens extremely rarely -
 we had only one case so far (see commit 2ced8eb136528914e1bf4e000dea06a9d53c7e04).
 </p>
+
 <p>
 Proceed to <a href="#release">release</a> -rc1.
 </p>
 
+
 <h1 id="prerelease">Pre-release announcement</h1>
 
 <p>
@@ -224,18 +273,22 @@
 </p>
 
 <h2>Terminology used</h2>
+
 <ul><li>Nominated</ul>
+
 <p>
 Patch that is nominated but yet to to merged in the patch queue/branch.
 </p>
 
 <ul><li>Queued</ul>
+
 <p>
 Patch is in the queue/branch and will feature in the next release.
 Barring reported regressions or objections from developers.
 </p>
 
 <ul><li>Rejected</ul>
+
 <p>
 Patch does not fit the
 <a href="submittingpatches.html#criteria" target="_parent">criteria</a> and
@@ -341,6 +394,7 @@
 Reason: ...
 </pre>
 
+
 <h1 id="release">Making a new release</h1>
 
 <p>
@@ -348,18 +402,21 @@
 </p>
 
 <h3>Get latest source files</h3>
+
 <p>
 Ensure the latest code is available - both in your local master and the
 relevant branch.
 </p>
 
 <h3>Perform basic testing</h3>
+
 <p>
 Most of the testing should already be done during the
 <a href="#pickntest">cherry-pick</a> and
 <a href="#prerelease">pre-announce</a> stages.
-
 So we do a quick 'touch test'
+</p>
+
 <ul>
 <li>make distcheck (you can omit this if you're not using --dist below)
 <li>scons (from release tarball)
@@ -379,6 +436,8 @@
 	chmod 755 -fR $__build_root; rm -rf $__build_root
 	mkdir -p $__build_root &amp;&amp; cd $__build_root
 
+	# For the distcheck, you may want to specify which LLVM to use:
+	# export LLVM_CONFIG=/usr/lib/llvm-3.9/bin/llvm-config
 	$__mesa_root/autogen.sh &amp;&amp; make -j2 distcheck
 
 	# Build check the tarballs (scons, linux)
@@ -387,18 +446,22 @@
 	cd .. &amp;&amp; rm -rf mesa-$__version
 
 	# Build check the tarballs (scons, windows/mingw)
+	# You may need to unset LLVM_CONFIG if you set it before:
+	# unset LLVM_CONFIG
 	tar -xaf mesa-$__version.tar.xz &amp;&amp; cd mesa-$__version
 	scons platform=windows toolchain=crossmingw
 	cd .. &amp;&amp; rm -rf mesa-$__version
 
 	# Test the automake binaries
 	tar -xaf mesa-$__version.tar.xz &amp;&amp; cd mesa-$__version
+	# You may want to specify which LLVM to use:
 	./configure \
 		--with-dri-drivers=i965,swrast \
 		--with-gallium-drivers=swrast \
 		--with-vulkan-drivers=intel \
 		--enable-llvm-shared-libs \
 		--enable-llvm \
+		--with-llvm-prefix=/usr/lib/llvm-3.9 \
 		--enable-glx-tls \
 		--enable-gbm \
 		--enable-egl \
@@ -408,7 +471,8 @@
 	__glxgears_cmd='glxgears 2>&amp;1 | grep -v "configuration file"'
 	__es2info_cmd='es2_info 2>&amp;1 | egrep "GL_VERSION|GL_RENDERER|.*dri\.so"'
 	__es2gears_cmd='es2gears_x11 2>&amp;1 | grep -v "configuration file"'
-	export LD_LIBRARY_PATH=`pwd`/test/usr/local/lib/
+	test "x$LD_LIBRARY_PATH" != 'x' &amp;&amp; __old_ld="$LD_LIBRARY_PATH"
+	export LD_LIBRARY_PATH=`pwd`/test/usr/local/lib/"${__old_ld:+:$__old_ld}"
 	export LIBGL_DRIVERS_PATH=`pwd`/test/usr/local/lib/dri/
 	export LIBGL_DEBUG=verbose
 	eval $__glxinfo_cmd
@@ -428,6 +492,7 @@
 	eval $__es2gears_cmd
 	# Smoke test DOTA2
 	unset LD_LIBRARY_PATH
+	test "x$__old_ld" != 'x' &amp;&amp; export LD_LIBRARY_PATH="$__old_ld" &amp;&amp; unset __old_ld
 	unset LIBGL_DRIVERS_PATH
 	unset LIBGL_DEBUG
 	unset LIBGL_ALWAYS_SOFTWARE
@@ -452,6 +517,7 @@
 
 <p>
 Two scripts are available to help generate portions of the release notes:
+</p>
 
 <pre>
 	./bin/bugzilla_mesa.sh
@@ -468,6 +534,7 @@
 <p>
 Commit these changes and push the branch.
 </p>
+
 <pre>
 	git push origin HEAD
 </pre>
@@ -478,7 +545,10 @@
 <p>
 Start the release process.
 </p>
+
 <pre>
+	# For the dist/distcheck, you may want to specify which LLVM to use:
+	# export LLVM_CONFIG=/usr/lib/llvm-3.9/bin/llvm-config
 	../relative/path/to/release.sh . # append --dist if you've already done distcheck above
 </pre>
 
@@ -505,17 +575,19 @@
 </pre>
 
 <p>
-Also, edit docs/relnotes.html to add a link to the new release notes, and edit
-docs/index.html to add a news entry. Then commit and push:
+Also, edit docs/relnotes.html to add a link to the new release notes,
+edit docs/index.html to add a news entry, and remove the version from
+docs/release-calendar.html. Then commit and push:
 </p>
 
 <pre>
-	git commit -as -m "docs: add news item and link release notes for X.Y.Z"
+	git commit -as -m "docs: update calendar, add news item and link release notes for X.Y.Z"
 	git push origin master X.Y
 </pre>
 
 
 <h1 id="announce">Announce the release</h1>
+
 <p>
 Use the generated template during the releasing process.
 </p>
@@ -528,6 +600,7 @@
 website. Manually check that it is updated 5-10 minutes after the final <code>git push</code>
 </p>
 
+
 <h1 id="bugzilla">Update Bugzilla</h1>
 
 <p>
diff --git a/docs/relnotes.html b/docs/relnotes.html
index c063223..68550b9 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,16 @@
 </p>
 
 <ul>
+<li><a href="relnotes/17.1.5.html">17.1.5 release notes</a>
+<li><a href="relnotes/17.1.4.html">17.1.4 release notes</a>
+<li><a href="relnotes/17.1.3.html">17.1.3 release notes</a>
+<li><a href="relnotes/17.1.2.html">17.1.2 release notes</a>
+<li><a href="relnotes/17.0.7.html">17.0.7 release notes</a>
+<li><a href="relnotes/17.1.1.html">17.1.1 release notes</a>
+<li><a href="relnotes/17.0.6.html">17.0.6 release notes</a>
+<li><a href="relnotes/17.1.0.html">17.1.0 release notes</a>
+<li><a href="relnotes/17.0.5.html">17.0.5 release notes</a>
+<li><a href="relnotes/17.0.4.html">17.0.4 release notes</a>
 <li><a href="relnotes/17.0.3.html">17.0.3 release notes</a>
 <li><a href="relnotes/17.0.2.html">17.0.2 release notes</a>
 <li><a href="relnotes/13.0.6.html">13.0.6 release notes</a>
diff --git a/docs/relnotes/17.0.4.html b/docs/relnotes/17.0.4.html
index 2e2ca9b..16629d1 100644
--- a/docs/relnotes/17.0.4.html
+++ b/docs/relnotes/17.0.4.html
@@ -36,6 +36,13 @@
 </pre>
 
 
+<h2>Next release</h2>
+<p>
+Mesa 17.0.5 is expected in approximately two weeks. See the release
+<a href="../release-calendar.html#calendar" target="_parent">calendar</a>
+for details.
+</p>
+
 <h2>New features</h2>
 <p>None</p>
 
diff --git a/docs/relnotes/17.0.7.html b/docs/relnotes/17.0.7.html
new file mode 100644
index 0000000..76e5bc6
--- /dev/null
+++ b/docs/relnotes/17.0.7.html
@@ -0,0 +1,145 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 17.0.7 Release Notes / June 1, 2017</h1>
+
+<p>
+Mesa 17.0.7 is a bug fix release which fixes bugs found since the 17.0.6 release.
+</p>
+<p>
+Mesa 17.0.7 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+bc68d13c6b1a053b855ac453ebf7e62bd89511adf44bad6c613e09f7fa13390a  mesa-17.0.7.tar.gz
+f6d75304a229c8d10443e219d6b6c0c342567dbab5a879ebe7cfa3c9139c4492  mesa-17.0.7.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98833">Bug 98833</a> - [REGRESSION, bisected] Wayland revert commit breaks non-Vsync fullscreen frame updates</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100741">Bug 100741</a> - Chromium - Memory leak</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100925">Bug 100925</a> - [HSW/BSW/BDW/SKL] Google Earth is not resolving all the details in the map correctly</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Andres Gomez (1):</p>
+<ul>
+  <li>docs: add sha256 checksums for 17.0.6</li>
+</ul>
+
+<p>Bartosz Tomczyk (1):</p>
+<ul>
+  <li>mesa: Avoid leaking surface in st_renderbuffer_delete</li>
+</ul>
+
+<p>Chad Versace (1):</p>
+<ul>
+  <li>egl: Partially revert 23c86c74, fix eglMakeCurrent</li>
+</ul>
+
+<p>Daniel Stone (7):</p>
+<ul>
+  <li>vulkan: Fix Wayland uninitialised registry</li>
+  <li>vulkan/wsi/wayland: Remove roundtrip when creating image</li>
+  <li>vulkan/wsi/wayland: Use per-display event queue</li>
+  <li>vulkan/wsi/wayland: Use proxy wrappers for swapchain</li>
+  <li>egl/wayland: Don't open-code roundtrip</li>
+  <li>egl/wayland: Use per-surface event queues</li>
+  <li>egl/wayland: Ensure we get a back buffer</li>
+</ul>
+
+<p>Emil Velikov (5):</p>
+<ul>
+  <li>st/va: fix misplaced closing bracket</li>
+  <li>anv: automake: list shared libraries after the static ones</li>
+  <li>radv: automake: list shared libraries after the static ones</li>
+  <li>egl/wayland: select the format based on the interface used</li>
+  <li>Update version to 17.0.7</li>
+</ul>
+
+<p>Eric Anholt (2):</p>
+<ul>
+  <li>renderonly: Initialize fields of struct winsys_handle.</li>
+  <li>vc4: Don't allocate new BOs to avoid synchronization when they're shared.</li>
+</ul>
+
+<p>Hans de Goede (1):</p>
+<ul>
+  <li>glxglvnddispatch: Add missing dispatch for GetDriverConfig</li>
+</ul>
+
+<p>Ilia Mirkin (1):</p>
+<ul>
+  <li>nvc0/ir: SHLADD's middle source must be an immediate</li>
+</ul>
+
+<p>Jason Ekstrand (2):</p>
+<ul>
+  <li>i965/blorp: Do and end-of-pipe sync on both sides of fast-clear ops</li>
+  <li>i965: Round copy size to the nearest block in intel_miptree_copy</li>
+</ul>
+
+<p>Lucas Stach (1):</p>
+<ul>
+  <li>etnaviv: stop oversizing buffer resources</li>
+</ul>
+
+<p>Nanley Chery (2):</p>
+<ul>
+  <li>anv/formats: Update the three-channel BC1 mappings</li>
+  <li>i965/formats: Update the three-channel DXT1 mappings</li>
+</ul>
+
+<p>Pohjolainen, Topi (1):</p>
+<ul>
+  <li>intel/isl/gen7: Use stencil vertical alignment of 8 instead of 4</li>
+</ul>
+
+<p>Samuel Iglesias Gonsálvez (3):</p>
+<ul>
+  <li>i965/vec4/gs: restore the uniform values which was overwritten by failed vec4_gs_visitor execution</li>
+  <li>i965/vec4: fix swizzle and writemask when loading an uniform with constant offset</li>
+  <li>i965/vec4: load dvec3/4 uniforms first in the push constant buffer</li>
+</ul>
+
+<p>Tom Stellard (1):</p>
+<ul>
+  <li>gallivm: Make sure module has the correct data layout when pass manager runs</li>
+</ul>
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/17.1.0.html b/docs/relnotes/17.1.0.html
index 63dfcb5..6e7f849 100644
--- a/docs/relnotes/17.1.0.html
+++ b/docs/relnotes/17.1.0.html
@@ -19,7 +19,8 @@
 <p>
 Mesa 17.1.0 is a new development release.
 People who are concerned with stability and reliability should stick
-with a previous release or wait for Mesa 17.1.1.
+with a previous release or wait for
+<a href="../release-calendar.html#calendar" target="_parent">Mesa 17.1.1</a>.
 </p>
 <p>
 Mesa 17.1.0 implements the OpenGL 4.5 API, but the version reported by
diff --git a/docs/relnotes/17.2.0.html b/docs/relnotes/17.2.0.html
new file mode 100644
index 0000000..1b815d0
--- /dev/null
+++ b/docs/relnotes/17.2.0.html
@@ -0,0 +1,218 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 17.2.0 Release Notes / September 4, 2017</h1>
+
+<p>
+Mesa 17.2.0 is a new development release.
+People who are concerned with stability and reliability should stick
+with a previous release or wait for Mesa 17.2.1.
+</p>
+<p>
+Mesa 17.2.0 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+9484ad96b4bb6cda5bbf1aef52dfa35183dc21aa6258a2991c245996c2fdaf85  mesa-17.2.0.tar.gz
+3123448f770eae58bc73e15480e78909defb892f10ab777e9116c9b218094943  mesa-17.2.0.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+
+<p>
+Note: some of the new features are only available with certain drivers.
+</p>
+
+<ul>
+<li>GL_ARB_bindless_texture on radeonsi</li>
+<li>GL_ARB_post_depth_coverage on nvc0 (GM200+)</li>
+<li>GL_ARB_shader_ballot on i965/gen8+</li>
+<li>GL_ARB_shader_group_vote on i965 (with a no-op vec4 implementation)</li>
+<li>GL_ARB_shader_viewport_layer_array on nvc0 (GM200+)</li>
+<li>GL_AMD_vertex_shader_layer on nvc0 (GM200+)</li>
+<li>GL_AMD_vertex_shader_viewport_index on nvc0 (GM200+)</li>
+</ul>
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=68365">Bug 68365</a> - [SNB Bisected]Piglit spec_ARB_framebuffer_object_fbo-blit-stretch  fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=77240">Bug 77240</a> - khrplatform.h not installed if EGL is disabled</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=95530">Bug 95530</a> - Stellaris - colored overlay of sectors doesn't render on i965</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96449">Bug 96449</a> - Dying Light reports OpenGL version 3.0 with mesa-git</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96958">Bug 96958</a> - [SKL] Improper rendering in Europa Universalis IV</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97524">Bug 97524</a> - Samplers referring to the same texture unit with different types should raise GL_INVALID_OPERATION</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97957">Bug 97957</a> - Awful screen tearing in a separate X server with DRI3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98238">Bug 98238</a> - Witcher 2: objects are black when changing lod on Radeon Pitcairn</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98428">Bug 98428</a> - Undefined non-weak-symbol in dri-drivers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98833">Bug 98833</a> - [REGRESSION, bisected] Wayland revert commit breaks non-Vsync fullscreen frame updates</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=99467">Bug 99467</a> - [radv] DOOM 2016 + wine. Green screen everywhere (but can be started)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100070">Bug 100070</a> - Rocket League: grass gets rendered incorrectly</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100242">Bug 100242</a> - radeon buffer allocation failure during startup of Factorio</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100620">Bug 100620</a> - [SKL] 48-bit addresses break DOOM</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100690">Bug 100690</a> - [Regression, bisected] TotalWar: Warhammer corrupted graphics</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100741">Bug 100741</a> - Chromium - Memory leak</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100785">Bug 100785</a> - [regression, bisected] arb_gpu_shader5 piglit fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100854">Bug 100854</a> - YUV to RGB Color Space Conversion result is not precise</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100871">Bug 100871</a> - gles cts hangs mesa indefinitely</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100877">Bug 100877</a> - vulkan/tests/block_pool_no_free regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100892">Bug 100892</a> - Polaris 12: winsys init bad switch (missing break) initializing addrlib</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100925">Bug 100925</a> - [HSW/BSW/BDW/SKL] Google Earth is not resolving all the details in the map correctly</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100937">Bug 100937</a> - Mesa fails to build with GCC 4.8</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100945">Bug 100945</a> - Build failure in GNOME Continuous</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100988">Bug 100988</a> - glXGetCurrentDisplay() no longer works for FakeGLX contexts?</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101071">Bug 101071</a> - compiling glsl fails with undefined reference to `pthread_create'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101088">Bug 101088</a> - `gallium: remove pipe_index_buffer and set_index_buffer` causes glitches and crash in gallium nine</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101110">Bug 101110</a> - Build failure in GNOME Continuous</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101189">Bug 101189</a> - Latest git fails to compile with radeon</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101252">Bug 101252</a> - eglGetDisplay() is not thread safe</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101254">Bug 101254</a> - VDPAU videos don't start playing with r600 gallium driver</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101283">Bug 101283</a> - skylake: page fault accessing address 0</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101284">Bug 101284</a> - [G45] ES2-CTS.functional.texture.specification.basic_copytexsubimage2d.cube_rgba</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101294">Bug 101294</a> - radeonsi minecraft forge splash freeze since 17.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101306">Bug 101306</a> - [BXT] gles asserts in cts</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101326">Bug 101326</a> - gallium/wgl: Allow context creation without prior SetPixelFormat()</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101334">Bug 101334</a> - AMD SI cards: Some vulkan apps freeze the system</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101336">Bug 101336</a> - glcpp-test.sh regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101340">Bug 101340</a> - i915_surface.c:108:4: error: too few arguments to function ‘util_blitter_default_src_texture’</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101360">Bug 101360</a> - Assertion failure comparing result of ballotARB</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101401">Bug 101401</a> - [REGRESSION][BISECTED] GDM fails to start after 8ec4975cd83365c791a1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101418">Bug 101418</a> - Build failure in GNOME Continuous</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101451">Bug 101451</a> - [G33] ES2-CTS.functional.clipping.polygon regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101464">Bug 101464</a> - PrimitiveRestartNV inside a render list causes a crash</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101471">Bug 101471</a> - Mesa fails to build: unknown typename bool</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101535">Bug 101535</a> - [bisected] [Skylake] Kwin won't start and glxgears coredumps</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101538">Bug 101538</a> - From &quot;Use isl for hiz layouts&quot; commit onwards, everything crashes with Mesa</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101539">Bug 101539</a> - [Regresion] [IVB] Segment fault in recent commit in intel_miptree_level_has_hiz under Ivy bridge</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101558">Bug 101558</a> - [regression][bisected] MPV playing video via opengl &quot;randomly&quot; results in only part of the window / screen being rendered with Mesa GIT.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101596">Bug 101596</a> - Blender renders black UI elements</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101607">Bug 101607</a> - Regression in anisotropic filtering from &quot;i965: Convert fs sampler state to use genxml&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101657">Bug 101657</a> - strtod.c:32:10: fatal error: xlocale.h: No such file or directory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101666">Bug 101666</a> - bitfieldExtract is marked as a built-in function on OpenGL ES 3.0, but was added in OpenGL ES 3.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101683">Bug 101683</a> - Some games hang while loading when compositing is shut off or absent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101703">Bug 101703</a> - No stencil buffer allocated when requested by GLUT</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101704">Bug 101704</a> - [regression][bisected] glReadPixels() from pbuffer failing in Android CTS camera tests</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101766">Bug 101766</a> - Assertion `!&quot;invalid type&quot;' failed when constant expression involves literal of different type</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101774">Bug 101774</a> - gen_clflush.h:37:7: error: implicit declaration of function ‘__builtin_ia32_clflush’</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101775">Bug 101775</a> - Xorg segfault since 147d7fb &quot;st/mesa: add a winsys buffers list in st_context&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101829">Bug 101829</a> - read-after-free in st_framebuffer_validate</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101831">Bug 101831</a> - Build failure in GNOME Continuous</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101851">Bug 101851</a> - [regression] libEGL_common.a undefined reference to '__gxx_personality_v0'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101867">Bug 101867</a> - Launch options window renders black in Feral Games in current Mesa trunk</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101876">Bug 101876</a> - SIGSEGV when launching Steam</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101910">Bug 101910</a> - [BYT] ES31-CTS.functional.copy_image.non_compressed.viewclass_96_bits.rgb32f_rgb32f</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101925">Bug 101925</a> - playstore/webview crash</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101961">Bug 101961</a> - Serious Sam Fusion hangs system completely</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101982">Bug 101982</a> - Weston crashes when running an OpenGL program on i965</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101983">Bug 101983</a> - [G33] ES2-CTS.functional.shaders.struct.uniform.sampler_nested* regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102024">Bug 102024</a> - FORMAT_FEATURE_SAMPLED_IMAGE_BIT not supported for D16_UNORM and D32_SFLOAT</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102148">Bug 102148</a> - Crash when running qopenglwidget example on mesa llvmpipe win32</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102241">Bug 102241</a> - gallium/wgl: SwapBuffers freezing regularly with swap interval enabled</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102308">Bug 102308</a> - segfault in glCompressedTextureSubImage3D</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<ul>
+<li>GL_APPLE_vertex_array_object support removed.</li>
+</ul>
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/17.2.1.html b/docs/relnotes/17.2.1.html
new file mode 100644
index 0000000..37873cd
--- /dev/null
+++ b/docs/relnotes/17.2.1.html
@@ -0,0 +1,200 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 17.2.1 Release Notes / September 17, 2017</h1>
+
+<p>
+Mesa 17.2.1 is a bug fix release which fixes bugs found since the 17.2.0 release.
+</p>
+<p>
+Mesa 17.2.1 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+c902d8dc2540195bc570d88af1a8fd8a1774373660a27bb1d539551f46824bc1  mesa-17.2.1.tar.gz
+77385d17827cff24a3bae134342234f2efe7f7f990e778109682571dbbc9ba1e  mesa-17.2.1.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100613">Bug 100613</a> - Regression in Mesa 17 on s390x (zSystems)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101709">Bug 101709</a> - [llvmpipe] piglit gl-1.0-scissor-offscreen regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102454">Bug 102454</a> - glibc 2.26 doesn't provide anymore xlocale.h</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102467">Bug 102467</a> - src/mesa/state_tracker/st_cb_readpixels.c:178]: (warning) Redundant assignment</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102502">Bug 102502</a> - [bisected] Kodi crashes since commit 707d2e8b - gallium: fold u_trim_pipe_prim call from st/mesa to drivers</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Bas Nieuwenhuizen (4):</p>
+<ul>
+  <li>radv: Actually set the cmd_buffer usage_flags.</li>
+  <li>radv: Fix vkCopyImage with both depth and stencil aspects.</li>
+  <li>radv: Disable multilayer &amp; multilevel DCC.</li>
+  <li>radv: Don't allocate CMASK for linear images.</li>
+</ul>
+
+<p>Ben Crocker (1):</p>
+<ul>
+  <li>llvmpipe: lp_build_gather_elem_vec BE fix for 3x16 load</li>
+</ul>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>llvmpipe: initialize llvmpipe-&gt;dirty with LP_NEW_SCISSOR</li>
+</ul>
+
+<p>Charmaine Lee (1):</p>
+<ul>
+  <li>vbo: fix offset in minmax cache key</li>
+</ul>
+
+<p>Dave Airlie (12):</p>
+<ul>
+  <li>radv: disable 1d/2d linear optimisation on gfx9.</li>
+  <li>radv/gfx9: set descriptor up for base_mip to level range.</li>
+  <li>Revert "radv: disable support for VEGA for now."</li>
+  <li>radv/winsys: use amdgpu_bo_va_op_raw.</li>
+  <li>radv/gfx9: allocate events from uncached VA space</li>
+  <li>radv: use simpler indirect packet 3 if possible.</li>
+  <li>radv: don't use iview for meta image width/height.</li>
+  <li>radv: handle GFX9 1D textures</li>
+  <li>radv/gfx9: set mip0-depth correctly for 2d arrays/3d images</li>
+  <li>radv/ac: bump params array for image atomic comp swap</li>
+  <li>radv/gfx9: fix image resource handling.</li>
+  <li>radv/winsys: fix flags vs va_flags thinko.</li>
+</ul>
+
+<p>Emil Velikov (7):</p>
+<ul>
+  <li>docs: add sha256 checksums for 17.2.0</li>
+  <li>cherry-ignore: add getCapability patches</li>
+  <li>cherry-ignore: ignore gfx9 tile swizzle fix</li>
+  <li>cherry-ignore: add execution_type() fix to the list</li>
+  <li>cherry-ignore: add EGL+gbm swast patches</li>
+  <li>egl/x11/dri3: adding missing __DRI_BACKGROUND_CALLABLE extension</li>
+  <li>Update version to 17.2.1</li>
+</ul>
+
+<p>Eric Engestrom (3):</p>
+<ul>
+  <li>util: improve compiler guard</li>
+  <li>mesa/st: remove unwanted backup file</li>
+  <li>docs/egl: remove reference to EGL_DRIVERS_PATH</li>
+</ul>
+
+<p>Grazvydas Ignotas (1):</p>
+<ul>
+  <li>radv: don't assert on empty hash table</li>
+</ul>
+
+<p>Jason Ekstrand (2):</p>
+<ul>
+  <li>anv/formats: Nicely handle unknown VkFormat enums</li>
+  <li>spirv: Add support for the HelperInvocation builtin</li>
+</ul>
+
+<p>Karol Herbst (1):</p>
+<ul>
+  <li>nvc0: write 0 to pipeline_statistics.cs_invocations</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>i965: Fix crash in fallback GTT mapping.</li>
+  <li>i965: Set "Subslice Hashing Mode" to 16x16 on Apollolake.</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>st/mesa: skip draw calls with pipe_draw_info::count == 0</li>
+</ul>
+
+<p>Michael Olbrich (1):</p>
+<ul>
+  <li>egl/dri2: only destroy created objects</li>
+</ul>
+
+<p>Nicolai Hähnle (1):</p>
+<ul>
+  <li>radeonsi: apply a mask to gl_SampleMaskIn in the PS prolog</li>
+</ul>
+
+<p>Nicolai Hähnle (4):</p>
+<ul>
+  <li>radeonsi/gfx9: always flush DB metadata on framebuffer changes</li>
+  <li>st/glsl_to_tgsi: only the first (inner-most) array reference can be a 2D index</li>
+  <li>ac/surface: match Z and stencil tile config</li>
+  <li>glsl: fix glsl_struct_field size calculations for shader cache</li>
+</ul>
+
+<p>Ray Strode (1):</p>
+<ul>
+  <li>gallivm: correct channel shift logic on big endian</li>
+</ul>
+
+<p>Rob Clark (1):</p>
+<ul>
+  <li>freedreno: skip batch-cache for compute shaders</li>
+</ul>
+
+<p>Roland Scheidegger (1):</p>
+<ul>
+  <li>st/mesa: fix view template initialization in try_pbo_readpixels</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>radeonsi: update dirty_level_mask before dispatching</li>
+</ul>
+
+<p>Timothy Arceri (9):</p>
+<ul>
+  <li>glsl: allow NULL to be passed to encode_type_to_blob()</li>
+  <li>glsl: stop adding pointers from gl_shader_variable to the cache</li>
+  <li>glsl: stop adding pointers from glsl_struct_field to the cache</li>
+  <li>glsl: add has_uniform_storage() helper to shader cache</li>
+  <li>glsl: don't write uniform storage offset if there isn't one</li>
+  <li>glsl: always write a name/label string to the cache</li>
+  <li>compiler: move pointers to the start of shader_info</li>
+  <li>glsl: stop adding pointers from shader_info to the cache</li>
+  <li>glsl: stop adding pointers from bindless structs to the cache</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/17.2.2.html b/docs/relnotes/17.2.2.html
new file mode 100644
index 0000000..cc358f4
--- /dev/null
+++ b/docs/relnotes/17.2.2.html
@@ -0,0 +1,203 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 17.2.2 Release Notes / October 2, 2017</h1>
+
+<p>
+Mesa 17.2.2 is a bug fix release which fixes bugs found since the 17.2.1 release.
+</p>
+<p>
+Mesa 17.2.2 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+8242256f3243ed3f35184ed7bf0a9070439ccdf477a3bd9cfd2437c0b2f9bc7f  mesa-17.2.2.tar.gz
+cf522244d6a5a1ecde3fc00e7c96935253fe22f808f064cab98be6f3faa65782  mesa-17.2.2.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102573">Bug 102573</a> - fails to build on armel</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102844">Bug 102844</a> - memory leak with glDeleteProgram for shader program type GL_COMPUTE_SHADER</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102847">Bug 102847</a> - swr fail to build with llvm-5.0.0</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102904">Bug 102904</a> - piglit and gl45 cts linker tests regressed</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alexandru-Liviu Prodea (1):</p>
+<ul>
+  <li>Scons: Add LLVM 5.0 support</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (1):</p>
+<ul>
+  <li>radv: Check for GFX9 for 1D arrays in image_size intrinsic.</li>
+</ul>
+
+<p>Boris Brezillon (1):</p>
+<ul>
+  <li>broadcom/vc4: Fix infinite retry in vc4_bo_alloc()</li>
+</ul>
+
+<p>Dave Airlie (3):</p>
+<ul>
+  <li>radv/nir: call opt_remove_phis after trivial continues.</li>
+  <li>ac/surface: handle S8 on gfx9</li>
+  <li>st/glsl-&gt;tgsi: fix u64 to bool comparisons.</li>
+</ul>
+
+<p>David Airlie (1):</p>
+<ul>
+  <li>radv: add gfx9 scissor workaround</li>
+</ul>
+
+<p>Emil Velikov (2):</p>
+<ul>
+  <li>docs: add sha256 checksums for 17.2.1</li>
+  <li>automake: enable libunwind in `make distcheck'</li>
+</ul>
+
+<p>Eric Anholt (4):</p>
+<ul>
+  <li>broadcom/vc4: Fix use-after-free for flushing when writing to a texture.</li>
+  <li>broadcom/vc4: Fix use-after-free trying to mix a quad and tile clear.</li>
+  <li>broadcom/vc4: Fix use-after-free when deleting a program.</li>
+  <li>broadcom/vc4: Keep pipe_sampler_view-&gt;texture matching the original texture.</li>
+</ul>
+
+<p>Gert Wollny (2):</p>
+<ul>
+  <li>travis: force llvm-3.3 for "make Gallium ST Other"</li>
+  <li>travis: Add libunwind-dev to gallium/make builds</li>
+</ul>
+
+<p>Grazvydas Ignotas (1):</p>
+<ul>
+  <li>configure: check if -latomic is needed for __atomic_*</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>nv20: Fix GL_CLAMP</li>
+</ul>
+
+<p>Jason Ekstrand (6):</p>
+<ul>
+  <li>i965/blorp: Set r8stencil_needs_update when writing stencil</li>
+  <li>vulkan/wsi/wayland: Stop printing out the DRM device</li>
+  <li>vulkan/wsi/wayland: Refactor wsi_wl_display code</li>
+  <li>vulkan/wsi/wayland: Stop caching Wayland displays</li>
+  <li>vulkan/wsi/wayland: Copy wl_proxy objects from oldSwapchain if available</li>
+  <li>vulkan/wsi/wayland: Return better error messages</li>
+</ul>
+
+<p>Juan A. Suarez Romero (4):</p>
+<ul>
+  <li>cherry-ignore: add "radeonsi/gfx9: proper workaround for LS/HS VGPR initialization bug"</li>
+  <li>cherry-ignore: add "radv: Check for GFX9 for 1D arrays in image_size intrinsic."</li>
+  <li>cherry-ignore: add "radv: copy the number of viewports/scissors at pipeline bind time"</li>
+  <li>Update version to 17.2.2</li>
+</ul>
+
+<p>Józef Kucia (1):</p>
+<ul>
+  <li>anv: Fix descriptors copying</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>i965/vec4: Actually handle atomic op intrinsics.</li>
+  <li>i965/vec4: Fix swizzles on atomic sources.</li>
+</ul>
+
+<p>Leo Liu (1):</p>
+<ul>
+  <li>st/va/postproc: use video original size for postprocessing</li>
+</ul>
+
+<p>Lucas Stach (1):</p>
+<ul>
+  <li>etnaviv: fix 16bpp clears</li>
+</ul>
+
+<p>Matt Turner (2):</p>
+<ul>
+  <li>util: Link libmesautil into u_atomic_test</li>
+  <li>util/u_atomic: Add implementation of __sync_val_compare_and_swap_8</li>
+</ul>
+
+<p>Nicolai Hähnle (9):</p>
+<ul>
+  <li>radeonsi: workaround for gather4 on integer cube maps</li>
+  <li>amd/common: round cube array slice in ac_prepare_cube_coords</li>
+  <li>amd/common: add workaround for cube map array layer clamping</li>
+  <li>glsl/linker: fix output variable overlap check</li>
+  <li>radeonsi: fix array textures layer coordinate</li>
+  <li>radeonsi: set MIP_POINT_PRECLAMP to 0</li>
+  <li>amd/addrlib: fix missing va_end() after va_copy()</li>
+  <li>amd/common: move ac_build_phi from radeonsi</li>
+  <li>radeonsi: fix a regression in integer cube map handling</li>
+</ul>
+
+<p>Samuel Iglesias Gonsálvez (1):</p>
+<ul>
+  <li>anv: fix viewport transformation for z component</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>radv: fix saved compute state when doing statistics/occlusion queries</li>
+</ul>
+
+<p>Tapani Pälli (1):</p>
+<ul>
+  <li>mesa: free current ComputeProgram state in _mesa_free_context_data</li>
+</ul>
+
+<p>Tim Rowley (1):</p>
+<ul>
+  <li>swr/rast: remove llvm fence/atomics from generated files</li>
+</ul>
+
+<p>Tomasz Figa (1):</p>
+<ul>
+  <li>egl/dri2: Implement swapInterval fallback in a conformant way</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/17.2.3.html b/docs/relnotes/17.2.3.html
new file mode 100644
index 0000000..6e2aea6
--- /dev/null
+++ b/docs/relnotes/17.2.3.html
@@ -0,0 +1,181 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 17.2.3 Release Notes / October 19, 2017</h1>
+
+<p>
+Mesa 17.2.3 is a bug fix release which fixes bugs found since the 17.2.2 release.
+</p>
+<p>
+Mesa 17.2.3 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+fb305eecfeec1fd771fdc96fff973c51871f7bd35fd2bd56cacc27b4b8823220  mesa-17.2.3.tar.gz
+a0b0ec8f7b24dd044d7ab30a8c7e6d3767521e245f88d4ed5dd93315dc56f837  mesa-17.2.3.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101832">Bug 101832</a> - [PATCH][regression][bisect] Xorg fails to start after f50aa21456d82c8cb6fbaa565835f1acc1720a5d</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102852">Bug 102852</a> - Scons: Support the new Scons 3.0.0</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102940">Bug 102940</a> - Regression: Vulkan KMS rendering crashes since 17.2</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alex Smith (1):</p>
+<ul>
+  <li>radv: Add R16G16B16A16_SNORM fast clear support</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (2):</p>
+<ul>
+  <li>nir/spirv: Allow loop breaks in a switch body.</li>
+  <li>radv: Only set the MTYPE flags on GFX9+.</li>
+</ul>
+
+<p>Ben Crocker (4):</p>
+<ul>
+  <li>gallivm: fix typo in debug_printf message</li>
+  <li>gallivm: allow additional llc options</li>
+  <li>gallivm/ppc64le: adjust VSX code generation control.</li>
+  <li>gallivm/ppc64le: allow environmental control of Altivec code generation</li>
+</ul>
+
+<p>Daniel Stone (2):</p>
+<ul>
+  <li>egl/wayland: Check queryImage return for wl_buffer</li>
+  <li>egl/wayland: Don't use dmabuf with no modifiers</li>
+</ul>
+
+<p>Dave Airlie (2):</p>
+<ul>
+  <li>radv: emit fmuladd instead of fma to llvm.</li>
+  <li>radv: lower ffma in nir.</li>
+</ul>
+
+<p>Emil Velikov (6):</p>
+<ul>
+  <li>cherry-ignore: add "anv: Remove unreachable cases from isl_format_for_size"</li>
+  <li>cherry-ignore: add "anv/wsi: Allocate enough memory for the entire image"</li>
+  <li>swr/rast: do not crash on NULL strings returned by getenv</li>
+  <li>wayland-drm: use a copy of the wayland_drm_callbacks struct</li>
+  <li>eglmesaext: add forward declaration for struct wl_buffers</li>
+  <li>Update version to 17.2.3</li>
+</ul>
+
+<p>Eric Engestrom (1):</p>
+<ul>
+  <li>scons: use python3-compatible print()</li>
+</ul>
+
+<p>Ilia Mirkin (2):</p>
+<ul>
+  <li>nv50/ir: fix 64-bit integer shifts</li>
+  <li>nv50,nvc0: fix push hint logic in presence of a start offset</li>
+</ul>
+
+<p>Jason Ekstrand (6):</p>
+<ul>
+  <li>intel/compiler: Don't cmod propagate into a saturated operation</li>
+  <li>intel/compiler: Don't propagate cmod into integer multiplies</li>
+  <li>glsl/blob: Return false from ensure_can_read on overrun</li>
+  <li>glsl/blob: Return false from grow_to_fit if we've ever failed</li>
+  <li>nir/opcodes: Fix constant-folding of ufind_msb</li>
+  <li>nir: Get rid of the variable on vote intrinsics</li>
+</ul>
+
+<p>Juan A. Suarez Romero (1):</p>
+<ul>
+  <li>docs: add sha256 checksums for 17.2.2</li>
+</ul>
+
+<p>Józef Kucia (3):</p>
+<ul>
+  <li>anv: Fix vkCmdFillBuffer()</li>
+  <li>spirv: Fix SpvOpAtomicISub</li>
+  <li>anv: Do not assert() on VK_ATTACHMENT_UNUSED</li>
+</ul>
+
+<p>Leo Liu (3):</p>
+<ul>
+  <li>st/va: use pipe transfer_map to map upload buffer</li>
+  <li>st/vdpau: don't re-allocate interlaced buffer with packed YUV format</li>
+  <li>st/va: don't re-allocate interlaced buffer with packed format</li>
+</ul>
+
+<p>Lionel Landwerlin (4):</p>
+<ul>
+  <li>intel: compiler: vec4: add missing default 0 lod</li>
+  <li>anv/cmd_buffer: fix push descriptors with set &gt; 0</li>
+  <li>anv/cmd_buffer: Reset state in cmd_buffer_destroy</li>
+  <li>anv: bo_cache: allow importing a BO larger than needed</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>mesa: fix texture updates for ATI_fragment_shader</li>
+  <li>st/mesa: don't use pipe_surface for passing information about EGLImage</li>
+  <li>glsl_to_tgsi: fix instruction order for bindless textures</li>
+</ul>
+
+<p>Nicolai Hähnle (14):</p>
+<ul>
+  <li>st/glsl_to_tgsi: fix conditional assignments to packed shader outputs</li>
+  <li>amd/common: fix build_cube_select</li>
+  <li>radeonsi/gfx9: fix geometry shaders without output vertices</li>
+  <li>util/queue: fix a race condition in the fence code</li>
+  <li>glsl/lower_instruction: handle denorms and overflow in ldexp correctly</li>
+  <li>radeonsi: move current_rast_prim to r600_common_context</li>
+  <li>radeonsi: don't discard points and lines</li>
+  <li>radeonsi: deduce rast_prim correctly for tessellation point mode</li>
+  <li>radeonsi: fix maximum advertised point size / line width</li>
+  <li>st/mesa: don't clobber glGetInternalformat* buffer for GL_NUM_SAMPLE_COUNTS</li>
+  <li>st/glsl_to_tgsi: fix indirect access to 64-bit integer</li>
+  <li>st/glsl_to_tgsi: fix a use-after-free in merge_two_dsts</li>
+  <li>radeonsi: clamp depth comparison value only for fixed point formats</li>
+  <li>radeonsi: clamp border colors for upgraded depth textures</li>
+</ul>
+
+<p>Rob Clark (2):</p>
+<ul>
+  <li>freedreno/a5xx: align height to GMEM</li>
+  <li>freedreno/a5xx: fix missing restore state</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/17.2.4.html b/docs/relnotes/17.2.4.html
new file mode 100644
index 0000000..218e1a4
--- /dev/null
+++ b/docs/relnotes/17.2.4.html
@@ -0,0 +1,132 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 17.2.4 Release Notes / October 30, 2017</h1>
+
+<p>
+Mesa 17.2.4 is a bug fix release which fixes bugs found since the 17.2.3 release.
+</p>
+<p>
+Mesa 17.2.4 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+cb266edc5cf7226219ebaf556ca2e03dff282e0324d20afd80423a5754d1272c  mesa-17.2.4.tar.gz
+5ba408fecd6e1132e5490eec1a2f04466214e4c65c8b89b331be844768c2e550  mesa-17.2.4.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102774">Bug 102774</a> - [BDW] [Bisected] Absolute constant buffers break VAAPI in mpv</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103388">Bug 103388</a> - Linking libcltgsi.la (llvm/codegen/libclllvm_la-common.lo) fails with &quot;error: no match for 'operator-'&quot; with GCC-7, Mesa from Git and current LLVM revisions</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+<p>Andres Gomez (8):</p>
+<ul>
+  <li>cherry-ignore: configure.ac: rework llvm detection and handling</li>
+  <li>cherry-ignore: glsl: fix derived cs variables</li>
+  <li>cherry-ignore: added 17.3 nominations.</li>
+  <li>cherry-ignore: radv: Don't use vgpr indexing for outputs on GFX9.</li>
+  <li>cherry-ignore: radv: Disallow indirect outputs for GS on GFX9 as well.</li>
+  <li>cherry-ignore: mesa/bufferobj: don't double negate the range</li>
+  <li>cherry-ignore: broadcom/vc5: Propagate vc4 aliasing fix to vc5.</li>
+  <li>Update version to 17.2.4</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (1):</p>
+<ul>
+  <li>ac/nir: Fix nir_texop_lod on GFX for 1D arrays.</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>radv/image: bump all the offset to uint64_t.</li>
+</ul>
+
+<p>Emil Velikov (1):</p>
+<ul>
+  <li>docs: add sha256 checksums for 17.2.3</li>
+</ul>
+
+<p>Henri Verbeet (1):</p>
+<ul>
+  <li>vulkan/wsi: Free the event in x11_manage_fifo_queues().</li>
+</ul>
+
+<p>Jan Vesely (1):</p>
+<ul>
+  <li>clover: Fix compilation after clang r315871</li>
+</ul>
+
+<p>Jason Ekstrand (4):</p>
+<ul>
+  <li>nir/intrinsics: Set the correct num_indices for load_output</li>
+  <li>intel/fs: Handle flag read/write aliasing in needs_src_copy</li>
+  <li>anv/pipeline: Call nir_lower_system_values after brw_preprocess_nir</li>
+  <li>intel/eu: Use EXECUTE_1 for JMPI</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>i965: Revert absolute mode for constant buffer pointers.</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>Revert "mesa: fix texture updates for ATI_fragment_shader"</li>
+</ul>
+
+<p>Matthew Nicholls (1):</p>
+<ul>
+  <li>ac/nir: generate correct instruction for atomic min/max on unsigned images</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>st/mesa: Initialize textures array in st_framebuffer_validate</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>radv: add the draw count buffer to the list of buffers</li>
+</ul>
+
+<p>Stefan Schake (1):</p>
+<ul>
+  <li>broadcom/vc4: Fix aliasing issue</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/17.2.5.html b/docs/relnotes/17.2.5.html
new file mode 100644
index 0000000..9d145e6
--- /dev/null
+++ b/docs/relnotes/17.2.5.html
@@ -0,0 +1,156 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 17.2.5 Release Notes / November 10, 2017</h1>
+
+<p>
+Mesa 17.2.5 is a bug fix release which fixes bugs found since the 17.2.4 release.
+</p>
+<p>
+Mesa 17.2.5 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+25b40e72fad64b096c2d8d6fe9579369954debe7970d4ad53e5033c7eec2918b  mesa-17.2.5.tar.gz
+7f7f914b7b9ea0b15f2d9d01a4375e311b0e90e55683b8e8a67ce8691eb1070f  mesa-17.2.5.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97532">Bug 97532</a> - Regression: GLB 2.7 &amp; Glmark-2 GLES versions segfault due to linker precision error (259fc505) on dead variable</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102680">Bug 102680</a> - [OpenGL CTS] KHR-GL45.shader_ballot_tests.ShaderBallotBitmasks fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102809">Bug 102809</a> - Rust shadows(?) flash random colours</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103142">Bug 103142</a> - R600g+sb: optimizer apparently stuck in an endless loop</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+<p>Andres Gomez (8):</p>
+<ul>
+  <li>docs: add sha256 checksums for 17.2.4</li>
+  <li>cherry-ignore: radv: copy indirect lowering settings from radeonsi</li>
+  <li>cherry-ignore: i965: fix blorp stage_prog_data-&gt;param leak</li>
+  <li>cherry-ignore: etnaviv: don't do resolve-in-place without valid TS</li>
+  <li>cherry-ignore: intel/fs: Alloc pull constants off mem_ctx</li>
+  <li>cherry-ignore: added 17.3 nominations.</li>
+  <li>cherry-ignore: automake: include git_sha1.h.in in release tarball</li>
+  <li>Update version to 17.2.5</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (3):</p>
+<ul>
+  <li>radv: Don't expose heaps with 0 memory.</li>
+  <li>radv: Don't use vgpr indexing for outputs on GFX9.</li>
+  <li>radv: Disallow indirect outputs for GS on GFX9 as well.</li>
+</ul>
+
+<p>Dave Airlie (3):</p>
+<ul>
+  <li>i915g: make gears run again.</li>
+  <li>radv: free attachments on end command buffer.</li>
+  <li>radv: add initial copy descriptor support. (v2)</li>
+</ul>
+
+<p>Eric Engestrom (1):</p>
+<ul>
+  <li>vc4: fix release build</li>
+</ul>
+
+<p>Gert Wollny (1):</p>
+<ul>
+  <li>r600/sb: bail out if prepare_alu_group() doesn't find a proper scheduling</li>
+</ul>
+
+<p>Jason Ekstrand (4):</p>
+<ul>
+  <li>spirv: Claim support for the simple memory model</li>
+  <li>i965/blorp: Use blorp_to_isl_format for src_isl_format in blit_miptrees</li>
+  <li>i965/blorp: Use more temporary isl_format variables</li>
+  <li>i965/miptree: Take an isl_format in render_aux_usage</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>mesa: Accept GL_BACK in get_fb0_attachment with ARB_ES3_1_compatibility.</li>
+</ul>
+
+<p>Leo Liu (1):</p>
+<ul>
+  <li>radeon/video: add gfx9 offsets when rejoin the video surface</li>
+</ul>
+
+<p>Marek Olšák (2):</p>
+<ul>
+  <li>st/dri: don't expose modifiers in EGL if the driver doesn't implement them</li>
+  <li>ac/surface/gfx9: don't allow DCC for the smallest mipmap levels</li>
+</ul>
+
+<p>Nanley Chery (1):</p>
+<ul>
+  <li>i965: Check CCS_E compatibility for texture view rendering</li>
+</ul>
+
+<p>Neil Roberts (1):</p>
+<ul>
+  <li>nir/opt_intrinsics: Fix values for gl_SubGroupG{e,t}MaskARB</li>
+</ul>
+
+<p>Nicolai Hähnle (1):</p>
+<ul>
+  <li>amd/common/gfx9: workaround DCC corruption more conservatively</li>
+</ul>
+
+<p>Tapani Pälli (1):</p>
+<ul>
+  <li>i965: unref push_const_bo in intelDestroyContext</li>
+</ul>
+
+<p>Timothy Arceri (1):</p>
+<ul>
+  <li>radv: copy indirect lowering settings from radeonsi</li>
+</ul>
+
+<p>Tomasz Figa (1):</p>
+<ul>
+  <li>glsl: Allow precision mismatch on dead data with GLSL ES 1.00</li>
+</ul>
+
+<p>Topi Pohjolainen (1):</p>
+<ul>
+  <li>intel/compiler/gen9: Pixel shader header only workaround</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/shading.html b/docs/shading.html
index 7e3d2e4..c789102 100644
--- a/docs/shading.html
+++ b/docs/shading.html
@@ -50,6 +50,8 @@
     The filenames will be "shader_X.vert" or "shader_X.frag" where X
     the shader ID.
 <li><b>cache_info</b> - print debug information about shader cache
+<li><b>cache_fb</b> - force cached shaders to be ignored and do a full
+    recompile via the fallback path</li>
 <li><b>uniform</b> - print message to stdout when glUniform is called
 <li><b>nopvert</b> - force vertex shaders to be a simple shader that just transforms
     the vertex position with ftransform() and passes through the color and
diff --git a/include/EGL/egl.h b/include/EGL/egl.h
index 29f30d9..93a2187 100644
--- a/include/EGL/egl.h
+++ b/include/EGL/egl.h
@@ -31,14 +31,14 @@
 ** This header is generated from the Khronos OpenGL / OpenGL ES XML
 ** API Registry. The current version of the Registry, generator scripts
 ** used to make the header, and the header can be found at
-**   http://www.opengl.org/registry/egl
+**   http://www.khronos.org/registry/egl
 **
-** Khronos $Revision$ on $Date$
+** Khronos $Git commit SHA1: a732b061e7 $ on $Git commit date: 2017-06-17 23:27:53 +0100 $
 */
 
 #include <EGL/eglplatform.h>
 
-/* Generated on date 20161230 */
+/* Generated on date 20170627 */
 
 /* Generated C header for:
  * API: egl
diff --git a/include/EGL/eglext.h b/include/EGL/eglext.h
index bc8f0ba..f7dc668 100644
--- a/include/EGL/eglext.h
+++ b/include/EGL/eglext.h
@@ -31,14 +31,14 @@
 ** This header is generated from the Khronos OpenGL / OpenGL ES XML
 ** API Registry. The current version of the Registry, generator scripts
 ** used to make the header, and the header can be found at
-**   http://www.opengl.org/registry/egl
+**   http://www.khronos.org/registry/egl
 **
-** Khronos $Revision$ on $Date$
+** Khronos $Git commit SHA1: a732b061e7 $ on $Git commit date: 2017-06-17 23:27:53 +0100 $
 */
 
 #include <EGL/eglplatform.h>
 
-#define EGL_EGLEXT_VERSION 20161230
+#define EGL_EGLEXT_VERSION 20170627
 
 /* Generated C header for:
  * API: egl
@@ -133,6 +133,15 @@
 #endif
 #endif /* EGL_KHR_debug */
 
+#ifndef EGL_KHR_display_reference
+#define EGL_KHR_display_reference 1
+#define EGL_TRACK_REFERENCES_KHR          0x3352
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYDISPLAYATTRIBKHRPROC) (EGLDisplay dpy, EGLint name, EGLAttrib *value);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglQueryDisplayAttribKHR (EGLDisplay dpy, EGLint name, EGLAttrib *value);
+#endif
+#endif /* EGL_KHR_display_reference */
+
 #ifndef EGL_KHR_fence_sync
 #define EGL_KHR_fence_sync 1
 typedef khronos_utime_nanoseconds_t EGLTimeKHR;
@@ -555,6 +564,11 @@
 #define EGL_DISCARD_SAMPLES_ARM           0x3286
 #endif /* EGL_ARM_pixmap_multisample_discard */
 
+#ifndef EGL_EXT_bind_to_front
+#define EGL_EXT_bind_to_front 1
+#define EGL_FRONT_BUFFER_EXT              0x3464
+#endif /* EGL_EXT_bind_to_front */
+
 #ifndef EGL_EXT_buffer_age
 #define EGL_EXT_buffer_age 1
 #define EGL_BUFFER_AGE_EXT                0x313D
@@ -564,6 +578,30 @@
 #define EGL_EXT_client_extensions 1
 #endif /* EGL_EXT_client_extensions */
 
+#ifndef EGL_EXT_compositor
+#define EGL_EXT_compositor 1
+#define EGL_PRIMARY_COMPOSITOR_CONTEXT_EXT 0x3460
+#define EGL_EXTERNAL_REF_ID_EXT           0x3461
+#define EGL_COMPOSITOR_DROP_NEWEST_FRAME_EXT 0x3462
+#define EGL_COMPOSITOR_KEEP_NEWEST_FRAME_EXT 0x3463
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLCOMPOSITORSETCONTEXTLISTEXTPROC) (const EGLint *external_ref_ids, EGLint num_entries);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLCOMPOSITORSETCONTEXTATTRIBUTESEXTPROC) (EGLint external_ref_id, const EGLint *context_attributes, EGLint num_entries);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLCOMPOSITORSETWINDOWLISTEXTPROC) (EGLint external_ref_id, const EGLint *external_win_ids, EGLint num_entries);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLCOMPOSITORSETWINDOWATTRIBUTESEXTPROC) (EGLint external_win_id, const EGLint *window_attributes, EGLint num_entries);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLCOMPOSITORBINDTEXWINDOWEXTPROC) (EGLint external_win_id);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLCOMPOSITORSETSIZEEXTPROC) (EGLint external_win_id, EGLint width, EGLint height);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLCOMPOSITORSWAPPOLICYEXTPROC) (EGLint external_win_id, EGLint policy);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglCompositorSetContextListEXT (const EGLint *external_ref_ids, EGLint num_entries);
+EGLAPI EGLBoolean EGLAPIENTRY eglCompositorSetContextAttributesEXT (EGLint external_ref_id, const EGLint *context_attributes, EGLint num_entries);
+EGLAPI EGLBoolean EGLAPIENTRY eglCompositorSetWindowListEXT (EGLint external_ref_id, const EGLint *external_win_ids, EGLint num_entries);
+EGLAPI EGLBoolean EGLAPIENTRY eglCompositorSetWindowAttributesEXT (EGLint external_win_id, const EGLint *window_attributes, EGLint num_entries);
+EGLAPI EGLBoolean EGLAPIENTRY eglCompositorBindTexWindowEXT (EGLint external_win_id);
+EGLAPI EGLBoolean EGLAPIENTRY eglCompositorSetSizeEXT (EGLint external_win_id, EGLint width, EGLint height);
+EGLAPI EGLBoolean EGLAPIENTRY eglCompositorSwapPolicyEXT (EGLint external_win_id, EGLint policy);
+#endif
+#endif /* EGL_EXT_compositor */
+
 #ifndef EGL_EXT_create_context_robustness
 #define EGL_EXT_create_context_robustness 1
 #define EGL_CONTEXT_OPENGL_ROBUST_ACCESS_EXT 0x30BF
@@ -618,6 +656,21 @@
 #define EGL_GL_COLORSPACE_BT2020_PQ_EXT   0x3340
 #endif /* EGL_EXT_gl_colorspace_bt2020_pq */
 
+#ifndef EGL_EXT_gl_colorspace_display_p3
+#define EGL_EXT_gl_colorspace_display_p3 1
+#define EGL_GL_COLORSPACE_DISPLAY_P3_EXT  0x3363
+#endif /* EGL_EXT_gl_colorspace_display_p3 */
+
+#ifndef EGL_EXT_gl_colorspace_display_p3_linear
+#define EGL_EXT_gl_colorspace_display_p3_linear 1
+#define EGL_GL_COLORSPACE_DISPLAY_P3_LINEAR_EXT 0x3362
+#endif /* EGL_EXT_gl_colorspace_display_p3_linear */
+
+#ifndef EGL_EXT_gl_colorspace_scrgb
+#define EGL_EXT_gl_colorspace_scrgb 1
+#define EGL_GL_COLORSPACE_SCRGB_EXT       0x3351
+#endif /* EGL_EXT_gl_colorspace_scrgb */
+
 #ifndef EGL_EXT_gl_colorspace_scrgb_linear
 #define EGL_EXT_gl_colorspace_scrgb_linear 1
 #define EGL_GL_COLORSPACE_SCRGB_LINEAR_EXT 0x3350
@@ -670,6 +723,13 @@
 #endif
 #endif /* EGL_EXT_image_dma_buf_import_modifiers */
 
+#ifndef EGL_EXT_image_implicit_sync_control
+#define EGL_EXT_image_implicit_sync_control 1
+#define EGL_IMPORT_SYNC_TYPE_EXT          0x3470
+#define EGL_IMPORT_IMPLICIT_SYNC_EXT      0x3471
+#define EGL_IMPORT_EXPLICIT_SYNC_EXT      0x3472
+#endif /* EGL_EXT_image_implicit_sync_control */
+
 #ifndef EGL_EXT_multiview_window
 #define EGL_EXT_multiview_window 1
 #define EGL_MULTIVIEW_VIEW_COUNT_EXT      0x3134
@@ -769,6 +829,12 @@
 #endif
 #endif /* EGL_EXT_stream_consumer_egloutput */
 
+#ifndef EGL_EXT_surface_CTA861_3_metadata
+#define EGL_EXT_surface_CTA861_3_metadata 1
+#define EGL_CTA861_3_MAX_CONTENT_LIGHT_LEVEL_EXT 0x3360
+#define EGL_CTA861_3_MAX_FRAME_AVERAGE_LEVEL_EXT 0x3361
+#endif /* EGL_EXT_surface_CTA861_3_metadata */
+
 #ifndef EGL_EXT_surface_SMPTE2086_metadata
 #define EGL_EXT_surface_SMPTE2086_metadata 1
 #define EGL_SMPTE2086_DISPLAY_PRIMARY_RX_EXT 0x3341
@@ -781,6 +847,7 @@
 #define EGL_SMPTE2086_WHITE_POINT_Y_EXT   0x3348
 #define EGL_SMPTE2086_MAX_LUMINANCE_EXT   0x3349
 #define EGL_SMPTE2086_MIN_LUMINANCE_EXT   0x334A
+#define EGL_METADATA_SCALING_EXT          50000
 #endif /* EGL_EXT_surface_SMPTE2086_metadata */
 
 #ifndef EGL_EXT_swap_buffers_with_damage
diff --git a/include/EGL/eglmesaext.h b/include/EGL/eglmesaext.h
index cff1bc6..5bf6306 100644
--- a/include/EGL/eglmesaext.h
+++ b/include/EGL/eglmesaext.h
@@ -70,6 +70,7 @@
 #ifndef EGL_WL_create_wayland_buffer_from_image
 #define EGL_WL_create_wayland_buffer_from_image 1
 
+struct wl_buffer;
 #ifdef EGL_EGLEXT_PROTOTYPES
 EGLAPI struct wl_buffer * EGLAPIENTRY eglCreateWaylandBufferFromImageWL(EGLDisplay dpy, EGLImageKHR image);
 #endif
diff --git a/include/GL/glcorearb.h b/include/GL/glcorearb.h
index be94baf..1f4d64e 100644
--- a/include/GL/glcorearb.h
+++ b/include/GL/glcorearb.h
@@ -6,7 +6,7 @@
 #endif
 
 /*
-** Copyright (c) 2013-2016 The Khronos Group Inc.
+** Copyright (c) 2013-2017 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -31,9 +31,7 @@
 ** This header is generated from the Khronos OpenGL / OpenGL ES XML
 ** API Registry. The current version of the Registry, generator scripts
 ** used to make the header, and the header can be found at
-**   http://www.opengl.org/registry/
-**
-** Khronos $Revision: 32433 $ on $Date: 2016-02-10 02:02:08 -0500 (Wed, 10 Feb 2016) $
+**   https://github.com/KhronosGroup/OpenGL-Registry
 */
 
 #if defined(_WIN32) && !defined(APIENTRY) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__)
@@ -89,6 +87,179 @@
 typedef unsigned int GLuint;
 typedef unsigned char GLboolean;
 typedef unsigned char GLubyte;
+#define GL_DEPTH_BUFFER_BIT               0x00000100
+#define GL_STENCIL_BUFFER_BIT             0x00000400
+#define GL_COLOR_BUFFER_BIT               0x00004000
+#define GL_FALSE                          0
+#define GL_TRUE                           1
+#define GL_POINTS                         0x0000
+#define GL_LINES                          0x0001
+#define GL_LINE_LOOP                      0x0002
+#define GL_LINE_STRIP                     0x0003
+#define GL_TRIANGLES                      0x0004
+#define GL_TRIANGLE_STRIP                 0x0005
+#define GL_TRIANGLE_FAN                   0x0006
+#define GL_QUADS                          0x0007
+#define GL_NEVER                          0x0200
+#define GL_LESS                           0x0201
+#define GL_EQUAL                          0x0202
+#define GL_LEQUAL                         0x0203
+#define GL_GREATER                        0x0204
+#define GL_NOTEQUAL                       0x0205
+#define GL_GEQUAL                         0x0206
+#define GL_ALWAYS                         0x0207
+#define GL_ZERO                           0
+#define GL_ONE                            1
+#define GL_SRC_COLOR                      0x0300
+#define GL_ONE_MINUS_SRC_COLOR            0x0301
+#define GL_SRC_ALPHA                      0x0302
+#define GL_ONE_MINUS_SRC_ALPHA            0x0303
+#define GL_DST_ALPHA                      0x0304
+#define GL_ONE_MINUS_DST_ALPHA            0x0305
+#define GL_DST_COLOR                      0x0306
+#define GL_ONE_MINUS_DST_COLOR            0x0307
+#define GL_SRC_ALPHA_SATURATE             0x0308
+#define GL_NONE                           0
+#define GL_FRONT_LEFT                     0x0400
+#define GL_FRONT_RIGHT                    0x0401
+#define GL_BACK_LEFT                      0x0402
+#define GL_BACK_RIGHT                     0x0403
+#define GL_FRONT                          0x0404
+#define GL_BACK                           0x0405
+#define GL_LEFT                           0x0406
+#define GL_RIGHT                          0x0407
+#define GL_FRONT_AND_BACK                 0x0408
+#define GL_NO_ERROR                       0
+#define GL_INVALID_ENUM                   0x0500
+#define GL_INVALID_VALUE                  0x0501
+#define GL_INVALID_OPERATION              0x0502
+#define GL_OUT_OF_MEMORY                  0x0505
+#define GL_CW                             0x0900
+#define GL_CCW                            0x0901
+#define GL_POINT_SIZE                     0x0B11
+#define GL_POINT_SIZE_RANGE               0x0B12
+#define GL_POINT_SIZE_GRANULARITY         0x0B13
+#define GL_LINE_SMOOTH                    0x0B20
+#define GL_LINE_WIDTH                     0x0B21
+#define GL_LINE_WIDTH_RANGE               0x0B22
+#define GL_LINE_WIDTH_GRANULARITY         0x0B23
+#define GL_POLYGON_MODE                   0x0B40
+#define GL_POLYGON_SMOOTH                 0x0B41
+#define GL_CULL_FACE                      0x0B44
+#define GL_CULL_FACE_MODE                 0x0B45
+#define GL_FRONT_FACE                     0x0B46
+#define GL_DEPTH_RANGE                    0x0B70
+#define GL_DEPTH_TEST                     0x0B71
+#define GL_DEPTH_WRITEMASK                0x0B72
+#define GL_DEPTH_CLEAR_VALUE              0x0B73
+#define GL_DEPTH_FUNC                     0x0B74
+#define GL_STENCIL_TEST                   0x0B90
+#define GL_STENCIL_CLEAR_VALUE            0x0B91
+#define GL_STENCIL_FUNC                   0x0B92
+#define GL_STENCIL_VALUE_MASK             0x0B93
+#define GL_STENCIL_FAIL                   0x0B94
+#define GL_STENCIL_PASS_DEPTH_FAIL        0x0B95
+#define GL_STENCIL_PASS_DEPTH_PASS        0x0B96
+#define GL_STENCIL_REF                    0x0B97
+#define GL_STENCIL_WRITEMASK              0x0B98
+#define GL_VIEWPORT                       0x0BA2
+#define GL_DITHER                         0x0BD0
+#define GL_BLEND_DST                      0x0BE0
+#define GL_BLEND_SRC                      0x0BE1
+#define GL_BLEND                          0x0BE2
+#define GL_LOGIC_OP_MODE                  0x0BF0
+#define GL_DRAW_BUFFER                    0x0C01
+#define GL_READ_BUFFER                    0x0C02
+#define GL_SCISSOR_BOX                    0x0C10
+#define GL_SCISSOR_TEST                   0x0C11
+#define GL_COLOR_CLEAR_VALUE              0x0C22
+#define GL_COLOR_WRITEMASK                0x0C23
+#define GL_DOUBLEBUFFER                   0x0C32
+#define GL_STEREO                         0x0C33
+#define GL_LINE_SMOOTH_HINT               0x0C52
+#define GL_POLYGON_SMOOTH_HINT            0x0C53
+#define GL_UNPACK_SWAP_BYTES              0x0CF0
+#define GL_UNPACK_LSB_FIRST               0x0CF1
+#define GL_UNPACK_ROW_LENGTH              0x0CF2
+#define GL_UNPACK_SKIP_ROWS               0x0CF3
+#define GL_UNPACK_SKIP_PIXELS             0x0CF4
+#define GL_UNPACK_ALIGNMENT               0x0CF5
+#define GL_PACK_SWAP_BYTES                0x0D00
+#define GL_PACK_LSB_FIRST                 0x0D01
+#define GL_PACK_ROW_LENGTH                0x0D02
+#define GL_PACK_SKIP_ROWS                 0x0D03
+#define GL_PACK_SKIP_PIXELS               0x0D04
+#define GL_PACK_ALIGNMENT                 0x0D05
+#define GL_MAX_TEXTURE_SIZE               0x0D33
+#define GL_MAX_VIEWPORT_DIMS              0x0D3A
+#define GL_SUBPIXEL_BITS                  0x0D50
+#define GL_TEXTURE_1D                     0x0DE0
+#define GL_TEXTURE_2D                     0x0DE1
+#define GL_TEXTURE_WIDTH                  0x1000
+#define GL_TEXTURE_HEIGHT                 0x1001
+#define GL_TEXTURE_BORDER_COLOR           0x1004
+#define GL_DONT_CARE                      0x1100
+#define GL_FASTEST                        0x1101
+#define GL_NICEST                         0x1102
+#define GL_BYTE                           0x1400
+#define GL_UNSIGNED_BYTE                  0x1401
+#define GL_SHORT                          0x1402
+#define GL_UNSIGNED_SHORT                 0x1403
+#define GL_INT                            0x1404
+#define GL_UNSIGNED_INT                   0x1405
+#define GL_FLOAT                          0x1406
+#define GL_STACK_OVERFLOW                 0x0503
+#define GL_STACK_UNDERFLOW                0x0504
+#define GL_CLEAR                          0x1500
+#define GL_AND                            0x1501
+#define GL_AND_REVERSE                    0x1502
+#define GL_COPY                           0x1503
+#define GL_AND_INVERTED                   0x1504
+#define GL_NOOP                           0x1505
+#define GL_XOR                            0x1506
+#define GL_OR                             0x1507
+#define GL_NOR                            0x1508
+#define GL_EQUIV                          0x1509
+#define GL_INVERT                         0x150A
+#define GL_OR_REVERSE                     0x150B
+#define GL_COPY_INVERTED                  0x150C
+#define GL_OR_INVERTED                    0x150D
+#define GL_NAND                           0x150E
+#define GL_SET                            0x150F
+#define GL_TEXTURE                        0x1702
+#define GL_COLOR                          0x1800
+#define GL_DEPTH                          0x1801
+#define GL_STENCIL                        0x1802
+#define GL_STENCIL_INDEX                  0x1901
+#define GL_DEPTH_COMPONENT                0x1902
+#define GL_RED                            0x1903
+#define GL_GREEN                          0x1904
+#define GL_BLUE                           0x1905
+#define GL_ALPHA                          0x1906
+#define GL_RGB                            0x1907
+#define GL_RGBA                           0x1908
+#define GL_POINT                          0x1B00
+#define GL_LINE                           0x1B01
+#define GL_FILL                           0x1B02
+#define GL_KEEP                           0x1E00
+#define GL_REPLACE                        0x1E01
+#define GL_INCR                           0x1E02
+#define GL_DECR                           0x1E03
+#define GL_VENDOR                         0x1F00
+#define GL_RENDERER                       0x1F01
+#define GL_VERSION                        0x1F02
+#define GL_EXTENSIONS                     0x1F03
+#define GL_NEAREST                        0x2600
+#define GL_LINEAR                         0x2601
+#define GL_NEAREST_MIPMAP_NEAREST         0x2700
+#define GL_LINEAR_MIPMAP_NEAREST          0x2701
+#define GL_NEAREST_MIPMAP_LINEAR          0x2702
+#define GL_LINEAR_MIPMAP_LINEAR           0x2703
+#define GL_TEXTURE_MAG_FILTER             0x2800
+#define GL_TEXTURE_MIN_FILTER             0x2801
+#define GL_TEXTURE_WRAP_S                 0x2802
+#define GL_TEXTURE_WRAP_T                 0x2803
+#define GL_REPEAT                         0x2901
 typedef void (APIENTRYP PFNGLCULLFACEPROC) (GLenum mode);
 typedef void (APIENTRYP PFNGLFRONTFACEPROC) (GLenum mode);
 typedef void (APIENTRYP PFNGLHINTPROC) (GLenum target, GLenum mode);
@@ -193,115 +364,7 @@
 #define GL_VERSION_1_1 1
 typedef float GLclampf;
 typedef double GLclampd;
-#define GL_DEPTH_BUFFER_BIT               0x00000100
-#define GL_STENCIL_BUFFER_BIT             0x00000400
-#define GL_COLOR_BUFFER_BIT               0x00004000
-#define GL_FALSE                          0
-#define GL_TRUE                           1
-#define GL_POINTS                         0x0000
-#define GL_LINES                          0x0001
-#define GL_LINE_LOOP                      0x0002
-#define GL_LINE_STRIP                     0x0003
-#define GL_TRIANGLES                      0x0004
-#define GL_TRIANGLE_STRIP                 0x0005
-#define GL_TRIANGLE_FAN                   0x0006
-#define GL_QUADS                          0x0007
-#define GL_NEVER                          0x0200
-#define GL_LESS                           0x0201
-#define GL_EQUAL                          0x0202
-#define GL_LEQUAL                         0x0203
-#define GL_GREATER                        0x0204
-#define GL_NOTEQUAL                       0x0205
-#define GL_GEQUAL                         0x0206
-#define GL_ALWAYS                         0x0207
-#define GL_ZERO                           0
-#define GL_ONE                            1
-#define GL_SRC_COLOR                      0x0300
-#define GL_ONE_MINUS_SRC_COLOR            0x0301
-#define GL_SRC_ALPHA                      0x0302
-#define GL_ONE_MINUS_SRC_ALPHA            0x0303
-#define GL_DST_ALPHA                      0x0304
-#define GL_ONE_MINUS_DST_ALPHA            0x0305
-#define GL_DST_COLOR                      0x0306
-#define GL_ONE_MINUS_DST_COLOR            0x0307
-#define GL_SRC_ALPHA_SATURATE             0x0308
-#define GL_NONE                           0
-#define GL_FRONT_LEFT                     0x0400
-#define GL_FRONT_RIGHT                    0x0401
-#define GL_BACK_LEFT                      0x0402
-#define GL_BACK_RIGHT                     0x0403
-#define GL_FRONT                          0x0404
-#define GL_BACK                           0x0405
-#define GL_LEFT                           0x0406
-#define GL_RIGHT                          0x0407
-#define GL_FRONT_AND_BACK                 0x0408
-#define GL_NO_ERROR                       0
-#define GL_INVALID_ENUM                   0x0500
-#define GL_INVALID_VALUE                  0x0501
-#define GL_INVALID_OPERATION              0x0502
-#define GL_OUT_OF_MEMORY                  0x0505
-#define GL_CW                             0x0900
-#define GL_CCW                            0x0901
-#define GL_POINT_SIZE                     0x0B11
-#define GL_POINT_SIZE_RANGE               0x0B12
-#define GL_POINT_SIZE_GRANULARITY         0x0B13
-#define GL_LINE_SMOOTH                    0x0B20
-#define GL_LINE_WIDTH                     0x0B21
-#define GL_LINE_WIDTH_RANGE               0x0B22
-#define GL_LINE_WIDTH_GRANULARITY         0x0B23
-#define GL_POLYGON_MODE                   0x0B40
-#define GL_POLYGON_SMOOTH                 0x0B41
-#define GL_CULL_FACE                      0x0B44
-#define GL_CULL_FACE_MODE                 0x0B45
-#define GL_FRONT_FACE                     0x0B46
-#define GL_DEPTH_RANGE                    0x0B70
-#define GL_DEPTH_TEST                     0x0B71
-#define GL_DEPTH_WRITEMASK                0x0B72
-#define GL_DEPTH_CLEAR_VALUE              0x0B73
-#define GL_DEPTH_FUNC                     0x0B74
-#define GL_STENCIL_TEST                   0x0B90
-#define GL_STENCIL_CLEAR_VALUE            0x0B91
-#define GL_STENCIL_FUNC                   0x0B92
-#define GL_STENCIL_VALUE_MASK             0x0B93
-#define GL_STENCIL_FAIL                   0x0B94
-#define GL_STENCIL_PASS_DEPTH_FAIL        0x0B95
-#define GL_STENCIL_PASS_DEPTH_PASS        0x0B96
-#define GL_STENCIL_REF                    0x0B97
-#define GL_STENCIL_WRITEMASK              0x0B98
-#define GL_VIEWPORT                       0x0BA2
-#define GL_DITHER                         0x0BD0
-#define GL_BLEND_DST                      0x0BE0
-#define GL_BLEND_SRC                      0x0BE1
-#define GL_BLEND                          0x0BE2
-#define GL_LOGIC_OP_MODE                  0x0BF0
 #define GL_COLOR_LOGIC_OP                 0x0BF2
-#define GL_DRAW_BUFFER                    0x0C01
-#define GL_READ_BUFFER                    0x0C02
-#define GL_SCISSOR_BOX                    0x0C10
-#define GL_SCISSOR_TEST                   0x0C11
-#define GL_COLOR_CLEAR_VALUE              0x0C22
-#define GL_COLOR_WRITEMASK                0x0C23
-#define GL_DOUBLEBUFFER                   0x0C32
-#define GL_STEREO                         0x0C33
-#define GL_LINE_SMOOTH_HINT               0x0C52
-#define GL_POLYGON_SMOOTH_HINT            0x0C53
-#define GL_UNPACK_SWAP_BYTES              0x0CF0
-#define GL_UNPACK_LSB_FIRST               0x0CF1
-#define GL_UNPACK_ROW_LENGTH              0x0CF2
-#define GL_UNPACK_SKIP_ROWS               0x0CF3
-#define GL_UNPACK_SKIP_PIXELS             0x0CF4
-#define GL_UNPACK_ALIGNMENT               0x0CF5
-#define GL_PACK_SWAP_BYTES                0x0D00
-#define GL_PACK_LSB_FIRST                 0x0D01
-#define GL_PACK_ROW_LENGTH                0x0D02
-#define GL_PACK_SKIP_ROWS                 0x0D03
-#define GL_PACK_SKIP_PIXELS               0x0D04
-#define GL_PACK_ALIGNMENT                 0x0D05
-#define GL_MAX_TEXTURE_SIZE               0x0D33
-#define GL_MAX_VIEWPORT_DIMS              0x0D3A
-#define GL_SUBPIXEL_BITS                  0x0D50
-#define GL_TEXTURE_1D                     0x0DE0
-#define GL_TEXTURE_2D                     0x0DE1
 #define GL_POLYGON_OFFSET_UNITS           0x2A00
 #define GL_POLYGON_OFFSET_POINT           0x2A01
 #define GL_POLYGON_OFFSET_LINE            0x2A02
@@ -309,79 +372,14 @@
 #define GL_POLYGON_OFFSET_FACTOR          0x8038
 #define GL_TEXTURE_BINDING_1D             0x8068
 #define GL_TEXTURE_BINDING_2D             0x8069
-#define GL_TEXTURE_WIDTH                  0x1000
-#define GL_TEXTURE_HEIGHT                 0x1001
 #define GL_TEXTURE_INTERNAL_FORMAT        0x1003
-#define GL_TEXTURE_BORDER_COLOR           0x1004
 #define GL_TEXTURE_RED_SIZE               0x805C
 #define GL_TEXTURE_GREEN_SIZE             0x805D
 #define GL_TEXTURE_BLUE_SIZE              0x805E
 #define GL_TEXTURE_ALPHA_SIZE             0x805F
-#define GL_DONT_CARE                      0x1100
-#define GL_FASTEST                        0x1101
-#define GL_NICEST                         0x1102
-#define GL_BYTE                           0x1400
-#define GL_UNSIGNED_BYTE                  0x1401
-#define GL_SHORT                          0x1402
-#define GL_UNSIGNED_SHORT                 0x1403
-#define GL_INT                            0x1404
-#define GL_UNSIGNED_INT                   0x1405
-#define GL_FLOAT                          0x1406
 #define GL_DOUBLE                         0x140A
-#define GL_STACK_OVERFLOW                 0x0503
-#define GL_STACK_UNDERFLOW                0x0504
-#define GL_CLEAR                          0x1500
-#define GL_AND                            0x1501
-#define GL_AND_REVERSE                    0x1502
-#define GL_COPY                           0x1503
-#define GL_AND_INVERTED                   0x1504
-#define GL_NOOP                           0x1505
-#define GL_XOR                            0x1506
-#define GL_OR                             0x1507
-#define GL_NOR                            0x1508
-#define GL_EQUIV                          0x1509
-#define GL_INVERT                         0x150A
-#define GL_OR_REVERSE                     0x150B
-#define GL_COPY_INVERTED                  0x150C
-#define GL_OR_INVERTED                    0x150D
-#define GL_NAND                           0x150E
-#define GL_SET                            0x150F
-#define GL_TEXTURE                        0x1702
-#define GL_COLOR                          0x1800
-#define GL_DEPTH                          0x1801
-#define GL_STENCIL                        0x1802
-#define GL_STENCIL_INDEX                  0x1901
-#define GL_DEPTH_COMPONENT                0x1902
-#define GL_RED                            0x1903
-#define GL_GREEN                          0x1904
-#define GL_BLUE                           0x1905
-#define GL_ALPHA                          0x1906
-#define GL_RGB                            0x1907
-#define GL_RGBA                           0x1908
-#define GL_POINT                          0x1B00
-#define GL_LINE                           0x1B01
-#define GL_FILL                           0x1B02
-#define GL_KEEP                           0x1E00
-#define GL_REPLACE                        0x1E01
-#define GL_INCR                           0x1E02
-#define GL_DECR                           0x1E03
-#define GL_VENDOR                         0x1F00
-#define GL_RENDERER                       0x1F01
-#define GL_VERSION                        0x1F02
-#define GL_EXTENSIONS                     0x1F03
-#define GL_NEAREST                        0x2600
-#define GL_LINEAR                         0x2601
-#define GL_NEAREST_MIPMAP_NEAREST         0x2700
-#define GL_LINEAR_MIPMAP_NEAREST          0x2701
-#define GL_NEAREST_MIPMAP_LINEAR          0x2702
-#define GL_LINEAR_MIPMAP_LINEAR           0x2703
-#define GL_TEXTURE_MAG_FILTER             0x2800
-#define GL_TEXTURE_MIN_FILTER             0x2801
-#define GL_TEXTURE_WRAP_S                 0x2802
-#define GL_TEXTURE_WRAP_T                 0x2803
 #define GL_PROXY_TEXTURE_1D               0x8063
 #define GL_PROXY_TEXTURE_2D               0x8064
-#define GL_REPEAT                         0x2901
 #define GL_R3_G3_B2                       0x2A10
 #define GL_RGB4                           0x804F
 #define GL_RGB5                           0x8050
@@ -2903,6 +2901,17 @@
 #define GL_ARB_ES3_1_compatibility 1
 #endif /* GL_ARB_ES3_1_compatibility */
 
+#ifndef GL_ARB_ES3_2_compatibility
+#define GL_ARB_ES3_2_compatibility 1
+#define GL_PRIMITIVE_BOUNDING_BOX_ARB     0x92BE
+#define GL_MULTISAMPLE_LINE_WIDTH_RANGE_ARB 0x9381
+#define GL_MULTISAMPLE_LINE_WIDTH_GRANULARITY_ARB 0x9382
+typedef void (APIENTRYP PFNGLPRIMITIVEBOUNDINGBOXARBPROC) (GLfloat minX, GLfloat minY, GLfloat minZ, GLfloat minW, GLfloat maxX, GLfloat maxY, GLfloat maxZ, GLfloat maxW);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glPrimitiveBoundingBoxARB (GLfloat minX, GLfloat minY, GLfloat minZ, GLfloat minW, GLfloat maxX, GLfloat maxY, GLfloat maxZ, GLfloat maxW);
+#endif
+#endif /* GL_ARB_ES3_2_compatibility */
+
 #ifndef GL_ARB_ES3_compatibility
 #define GL_ARB_ES3_compatibility 1
 #endif /* GL_ARB_ES3_compatibility */
@@ -3102,6 +3111,16 @@
 #define GL_ARB_draw_indirect 1
 #endif /* GL_ARB_draw_indirect */
 
+#ifndef GL_ARB_draw_instanced
+#define GL_ARB_draw_instanced 1
+typedef void (APIENTRYP PFNGLDRAWARRAYSINSTANCEDARBPROC) (GLenum mode, GLint first, GLsizei count, GLsizei primcount);
+typedef void (APIENTRYP PFNGLDRAWELEMENTSINSTANCEDARBPROC) (GLenum mode, GLsizei count, GLenum type, const void *indices, GLsizei primcount);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glDrawArraysInstancedARB (GLenum mode, GLint first, GLsizei count, GLsizei primcount);
+GLAPI void APIENTRY glDrawElementsInstancedARB (GLenum mode, GLsizei count, GLenum type, const void *indices, GLsizei primcount);
+#endif
+#endif /* GL_ARB_draw_instanced */
+
 #ifndef GL_ARB_enhanced_layouts
 #define GL_ARB_enhanced_layouts 1
 #endif /* GL_ARB_enhanced_layouts */
@@ -3122,6 +3141,10 @@
 #define GL_ARB_fragment_layer_viewport 1
 #endif /* GL_ARB_fragment_layer_viewport */
 
+#ifndef GL_ARB_fragment_shader_interlock
+#define GL_ARB_fragment_shader_interlock 1
+#endif /* GL_ARB_fragment_shader_interlock */
+
 #ifndef GL_ARB_framebuffer_no_attachments
 #define GL_ARB_framebuffer_no_attachments 1
 #endif /* GL_ARB_framebuffer_no_attachments */
@@ -3134,6 +3157,38 @@
 #define GL_ARB_framebuffer_sRGB 1
 #endif /* GL_ARB_framebuffer_sRGB */
 
+#ifndef GL_ARB_geometry_shader4
+#define GL_ARB_geometry_shader4 1
+#define GL_LINES_ADJACENCY_ARB            0x000A
+#define GL_LINE_STRIP_ADJACENCY_ARB       0x000B
+#define GL_TRIANGLES_ADJACENCY_ARB        0x000C
+#define GL_TRIANGLE_STRIP_ADJACENCY_ARB   0x000D
+#define GL_PROGRAM_POINT_SIZE_ARB         0x8642
+#define GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS_ARB 0x8C29
+#define GL_FRAMEBUFFER_ATTACHMENT_LAYERED_ARB 0x8DA7
+#define GL_FRAMEBUFFER_INCOMPLETE_LAYER_TARGETS_ARB 0x8DA8
+#define GL_FRAMEBUFFER_INCOMPLETE_LAYER_COUNT_ARB 0x8DA9
+#define GL_GEOMETRY_SHADER_ARB            0x8DD9
+#define GL_GEOMETRY_VERTICES_OUT_ARB      0x8DDA
+#define GL_GEOMETRY_INPUT_TYPE_ARB        0x8DDB
+#define GL_GEOMETRY_OUTPUT_TYPE_ARB       0x8DDC
+#define GL_MAX_GEOMETRY_VARYING_COMPONENTS_ARB 0x8DDD
+#define GL_MAX_VERTEX_VARYING_COMPONENTS_ARB 0x8DDE
+#define GL_MAX_GEOMETRY_UNIFORM_COMPONENTS_ARB 0x8DDF
+#define GL_MAX_GEOMETRY_OUTPUT_VERTICES_ARB 0x8DE0
+#define GL_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS_ARB 0x8DE1
+typedef void (APIENTRYP PFNGLPROGRAMPARAMETERIARBPROC) (GLuint program, GLenum pname, GLint value);
+typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTUREARBPROC) (GLenum target, GLenum attachment, GLuint texture, GLint level);
+typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTURELAYERARBPROC) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
+typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTUREFACEARBPROC) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLenum face);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glProgramParameteriARB (GLuint program, GLenum pname, GLint value);
+GLAPI void APIENTRY glFramebufferTextureARB (GLenum target, GLenum attachment, GLuint texture, GLint level);
+GLAPI void APIENTRY glFramebufferTextureLayerARB (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
+GLAPI void APIENTRY glFramebufferTextureFaceARB (GLenum target, GLenum attachment, GLuint texture, GLint level, GLenum face);
+#endif
+#endif /* GL_ARB_geometry_shader4 */
+
 #ifndef GL_ARB_get_program_binary
 #define GL_ARB_get_program_binary 1
 #endif /* GL_ARB_get_program_binary */
@@ -3142,6 +3197,16 @@
 #define GL_ARB_get_texture_sub_image 1
 #endif /* GL_ARB_get_texture_sub_image */
 
+#ifndef GL_ARB_gl_spirv
+#define GL_ARB_gl_spirv 1
+#define GL_SHADER_BINARY_FORMAT_SPIR_V_ARB 0x9551
+#define GL_SPIR_V_BINARY_ARB              0x9552
+typedef void (APIENTRYP PFNGLSPECIALIZESHADERARBPROC) (GLuint shader, const GLchar *pEntryPoint, GLuint numSpecializationConstants, const GLuint *pConstantIndex, const GLuint *pConstantValue);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glSpecializeShaderARB (GLuint shader, const GLchar *pEntryPoint, GLuint numSpecializationConstants, const GLuint *pConstantIndex, const GLuint *pConstantValue);
+#endif
+#endif /* GL_ARB_gl_spirv */
+
 #ifndef GL_ARB_gpu_shader5
 #define GL_ARB_gpu_shader5 1
 #endif /* GL_ARB_gpu_shader5 */
@@ -3150,6 +3215,91 @@
 #define GL_ARB_gpu_shader_fp64 1
 #endif /* GL_ARB_gpu_shader_fp64 */
 
+#ifndef GL_ARB_gpu_shader_int64
+#define GL_ARB_gpu_shader_int64 1
+#define GL_INT64_ARB                      0x140E
+#define GL_INT64_VEC2_ARB                 0x8FE9
+#define GL_INT64_VEC3_ARB                 0x8FEA
+#define GL_INT64_VEC4_ARB                 0x8FEB
+#define GL_UNSIGNED_INT64_VEC2_ARB        0x8FF5
+#define GL_UNSIGNED_INT64_VEC3_ARB        0x8FF6
+#define GL_UNSIGNED_INT64_VEC4_ARB        0x8FF7
+typedef void (APIENTRYP PFNGLUNIFORM1I64ARBPROC) (GLint location, GLint64 x);
+typedef void (APIENTRYP PFNGLUNIFORM2I64ARBPROC) (GLint location, GLint64 x, GLint64 y);
+typedef void (APIENTRYP PFNGLUNIFORM3I64ARBPROC) (GLint location, GLint64 x, GLint64 y, GLint64 z);
+typedef void (APIENTRYP PFNGLUNIFORM4I64ARBPROC) (GLint location, GLint64 x, GLint64 y, GLint64 z, GLint64 w);
+typedef void (APIENTRYP PFNGLUNIFORM1I64VARBPROC) (GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM2I64VARBPROC) (GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM3I64VARBPROC) (GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM4I64VARBPROC) (GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM1UI64ARBPROC) (GLint location, GLuint64 x);
+typedef void (APIENTRYP PFNGLUNIFORM2UI64ARBPROC) (GLint location, GLuint64 x, GLuint64 y);
+typedef void (APIENTRYP PFNGLUNIFORM3UI64ARBPROC) (GLint location, GLuint64 x, GLuint64 y, GLuint64 z);
+typedef void (APIENTRYP PFNGLUNIFORM4UI64ARBPROC) (GLint location, GLuint64 x, GLuint64 y, GLuint64 z, GLuint64 w);
+typedef void (APIENTRYP PFNGLUNIFORM1UI64VARBPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM2UI64VARBPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM3UI64VARBPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM4UI64VARBPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLGETUNIFORMI64VARBPROC) (GLuint program, GLint location, GLint64 *params);
+typedef void (APIENTRYP PFNGLGETUNIFORMUI64VARBPROC) (GLuint program, GLint location, GLuint64 *params);
+typedef void (APIENTRYP PFNGLGETNUNIFORMI64VARBPROC) (GLuint program, GLint location, GLsizei bufSize, GLint64 *params);
+typedef void (APIENTRYP PFNGLGETNUNIFORMUI64VARBPROC) (GLuint program, GLint location, GLsizei bufSize, GLuint64 *params);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1I64ARBPROC) (GLuint program, GLint location, GLint64 x);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2I64ARBPROC) (GLuint program, GLint location, GLint64 x, GLint64 y);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3I64ARBPROC) (GLuint program, GLint location, GLint64 x, GLint64 y, GLint64 z);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4I64ARBPROC) (GLuint program, GLint location, GLint64 x, GLint64 y, GLint64 z, GLint64 w);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1I64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2I64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3I64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4I64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1UI64ARBPROC) (GLuint program, GLint location, GLuint64 x);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2UI64ARBPROC) (GLuint program, GLint location, GLuint64 x, GLuint64 y);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3UI64ARBPROC) (GLuint program, GLint location, GLuint64 x, GLuint64 y, GLuint64 z);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4UI64ARBPROC) (GLuint program, GLint location, GLuint64 x, GLuint64 y, GLuint64 z, GLuint64 w);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1UI64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2UI64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3UI64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4UI64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glUniform1i64ARB (GLint location, GLint64 x);
+GLAPI void APIENTRY glUniform2i64ARB (GLint location, GLint64 x, GLint64 y);
+GLAPI void APIENTRY glUniform3i64ARB (GLint location, GLint64 x, GLint64 y, GLint64 z);
+GLAPI void APIENTRY glUniform4i64ARB (GLint location, GLint64 x, GLint64 y, GLint64 z, GLint64 w);
+GLAPI void APIENTRY glUniform1i64vARB (GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glUniform2i64vARB (GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glUniform3i64vARB (GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glUniform4i64vARB (GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glUniform1ui64ARB (GLint location, GLuint64 x);
+GLAPI void APIENTRY glUniform2ui64ARB (GLint location, GLuint64 x, GLuint64 y);
+GLAPI void APIENTRY glUniform3ui64ARB (GLint location, GLuint64 x, GLuint64 y, GLuint64 z);
+GLAPI void APIENTRY glUniform4ui64ARB (GLint location, GLuint64 x, GLuint64 y, GLuint64 z, GLuint64 w);
+GLAPI void APIENTRY glUniform1ui64vARB (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glUniform2ui64vARB (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glUniform3ui64vARB (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glUniform4ui64vARB (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glGetUniformi64vARB (GLuint program, GLint location, GLint64 *params);
+GLAPI void APIENTRY glGetUniformui64vARB (GLuint program, GLint location, GLuint64 *params);
+GLAPI void APIENTRY glGetnUniformi64vARB (GLuint program, GLint location, GLsizei bufSize, GLint64 *params);
+GLAPI void APIENTRY glGetnUniformui64vARB (GLuint program, GLint location, GLsizei bufSize, GLuint64 *params);
+GLAPI void APIENTRY glProgramUniform1i64ARB (GLuint program, GLint location, GLint64 x);
+GLAPI void APIENTRY glProgramUniform2i64ARB (GLuint program, GLint location, GLint64 x, GLint64 y);
+GLAPI void APIENTRY glProgramUniform3i64ARB (GLuint program, GLint location, GLint64 x, GLint64 y, GLint64 z);
+GLAPI void APIENTRY glProgramUniform4i64ARB (GLuint program, GLint location, GLint64 x, GLint64 y, GLint64 z, GLint64 w);
+GLAPI void APIENTRY glProgramUniform1i64vARB (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glProgramUniform2i64vARB (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glProgramUniform3i64vARB (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glProgramUniform4i64vARB (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glProgramUniform1ui64ARB (GLuint program, GLint location, GLuint64 x);
+GLAPI void APIENTRY glProgramUniform2ui64ARB (GLuint program, GLint location, GLuint64 x, GLuint64 y);
+GLAPI void APIENTRY glProgramUniform3ui64ARB (GLuint program, GLint location, GLuint64 x, GLuint64 y, GLuint64 z);
+GLAPI void APIENTRY glProgramUniform4ui64ARB (GLuint program, GLint location, GLuint64 x, GLuint64 y, GLuint64 z, GLuint64 w);
+GLAPI void APIENTRY glProgramUniform1ui64vARB (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glProgramUniform2ui64vARB (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glProgramUniform3ui64vARB (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glProgramUniform4ui64vARB (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+#endif
+#endif /* GL_ARB_gpu_shader_int64 */
+
 #ifndef GL_ARB_half_float_vertex
 #define GL_ARB_half_float_vertex 1
 #endif /* GL_ARB_half_float_vertex */
@@ -3172,6 +3322,15 @@
 #endif
 #endif /* GL_ARB_indirect_parameters */
 
+#ifndef GL_ARB_instanced_arrays
+#define GL_ARB_instanced_arrays 1
+#define GL_VERTEX_ATTRIB_ARRAY_DIVISOR_ARB 0x88FE
+typedef void (APIENTRYP PFNGLVERTEXATTRIBDIVISORARBPROC) (GLuint index, GLuint divisor);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glVertexAttribDivisorARB (GLuint index, GLuint divisor);
+#endif
+#endif /* GL_ARB_instanced_arrays */
+
 #ifndef GL_ARB_internalformat_query
 #define GL_ARB_internalformat_query 1
 #endif /* GL_ARB_internalformat_query */
@@ -3205,6 +3364,16 @@
 #define GL_ARB_occlusion_query2 1
 #endif /* GL_ARB_occlusion_query2 */
 
+#ifndef GL_ARB_parallel_shader_compile
+#define GL_ARB_parallel_shader_compile 1
+#define GL_MAX_SHADER_COMPILER_THREADS_ARB 0x91B0
+#define GL_COMPLETION_STATUS_ARB          0x91B1
+typedef void (APIENTRYP PFNGLMAXSHADERCOMPILERTHREADSARBPROC) (GLuint count);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glMaxShaderCompilerThreadsARB (GLuint count);
+#endif
+#endif /* GL_ARB_parallel_shader_compile */
+
 #ifndef GL_ARB_pipeline_statistics_query
 #define GL_ARB_pipeline_statistics_query 1
 #define GL_VERTICES_SUBMITTED_ARB         0x82EE
@@ -3219,6 +3388,18 @@
 #define GL_CLIPPING_OUTPUT_PRIMITIVES_ARB 0x82F7
 #endif /* GL_ARB_pipeline_statistics_query */
 
+#ifndef GL_ARB_pixel_buffer_object
+#define GL_ARB_pixel_buffer_object 1
+#define GL_PIXEL_PACK_BUFFER_ARB          0x88EB
+#define GL_PIXEL_UNPACK_BUFFER_ARB        0x88EC
+#define GL_PIXEL_PACK_BUFFER_BINDING_ARB  0x88ED
+#define GL_PIXEL_UNPACK_BUFFER_BINDING_ARB 0x88EF
+#endif /* GL_ARB_pixel_buffer_object */
+
+#ifndef GL_ARB_post_depth_coverage
+#define GL_ARB_post_depth_coverage 1
+#endif /* GL_ARB_post_depth_coverage */
+
 #ifndef GL_ARB_program_interface_query
 #define GL_ARB_program_interface_query 1
 #endif /* GL_ARB_program_interface_query */
@@ -3268,6 +3449,26 @@
 #define GL_ARB_robustness_isolation 1
 #endif /* GL_ARB_robustness_isolation */
 
+#ifndef GL_ARB_sample_locations
+#define GL_ARB_sample_locations 1
+#define GL_SAMPLE_LOCATION_SUBPIXEL_BITS_ARB 0x933D
+#define GL_SAMPLE_LOCATION_PIXEL_GRID_WIDTH_ARB 0x933E
+#define GL_SAMPLE_LOCATION_PIXEL_GRID_HEIGHT_ARB 0x933F
+#define GL_PROGRAMMABLE_SAMPLE_LOCATION_TABLE_SIZE_ARB 0x9340
+#define GL_SAMPLE_LOCATION_ARB            0x8E50
+#define GL_PROGRAMMABLE_SAMPLE_LOCATION_ARB 0x9341
+#define GL_FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_ARB 0x9342
+#define GL_FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_ARB 0x9343
+typedef void (APIENTRYP PFNGLFRAMEBUFFERSAMPLELOCATIONSFVARBPROC) (GLenum target, GLuint start, GLsizei count, const GLfloat *v);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERSAMPLELOCATIONSFVARBPROC) (GLuint framebuffer, GLuint start, GLsizei count, const GLfloat *v);
+typedef void (APIENTRYP PFNGLEVALUATEDEPTHVALUESARBPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glFramebufferSampleLocationsfvARB (GLenum target, GLuint start, GLsizei count, const GLfloat *v);
+GLAPI void APIENTRY glNamedFramebufferSampleLocationsfvARB (GLuint framebuffer, GLuint start, GLsizei count, const GLfloat *v);
+GLAPI void APIENTRY glEvaluateDepthValuesARB (void);
+#endif
+#endif /* GL_ARB_sample_locations */
+
 #ifndef GL_ARB_sample_shading
 #define GL_ARB_sample_shading 1
 #define GL_SAMPLE_SHADING_ARB             0x8C36
@@ -3294,14 +3495,26 @@
 #define GL_ARB_separate_shader_objects 1
 #endif /* GL_ARB_separate_shader_objects */
 
+#ifndef GL_ARB_shader_atomic_counter_ops
+#define GL_ARB_shader_atomic_counter_ops 1
+#endif /* GL_ARB_shader_atomic_counter_ops */
+
 #ifndef GL_ARB_shader_atomic_counters
 #define GL_ARB_shader_atomic_counters 1
 #endif /* GL_ARB_shader_atomic_counters */
 
+#ifndef GL_ARB_shader_ballot
+#define GL_ARB_shader_ballot 1
+#endif /* GL_ARB_shader_ballot */
+
 #ifndef GL_ARB_shader_bit_encoding
 #define GL_ARB_shader_bit_encoding 1
 #endif /* GL_ARB_shader_bit_encoding */
 
+#ifndef GL_ARB_shader_clock
+#define GL_ARB_shader_clock 1
+#endif /* GL_ARB_shader_clock */
+
 #ifndef GL_ARB_shader_draw_parameters
 #define GL_ARB_shader_draw_parameters 1
 #endif /* GL_ARB_shader_draw_parameters */
@@ -3338,6 +3551,10 @@
 #define GL_ARB_shader_texture_image_samples 1
 #endif /* GL_ARB_shader_texture_image_samples */
 
+#ifndef GL_ARB_shader_viewport_layer_array
+#define GL_ARB_shader_viewport_layer_array 1
+#endif /* GL_ARB_shader_viewport_layer_array */
+
 #ifndef GL_ARB_shading_language_420pack
 #define GL_ARB_shading_language_420pack 1
 #endif /* GL_ARB_shading_language_420pack */
@@ -3400,6 +3617,14 @@
 #endif
 #endif /* GL_ARB_sparse_texture */
 
+#ifndef GL_ARB_sparse_texture2
+#define GL_ARB_sparse_texture2 1
+#endif /* GL_ARB_sparse_texture2 */
+
+#ifndef GL_ARB_sparse_texture_clamp
+#define GL_ARB_sparse_texture_clamp 1
+#endif /* GL_ARB_sparse_texture_clamp */
+
 #ifndef GL_ARB_stencil_texturing
 #define GL_ARB_stencil_texturing 1
 #endif /* GL_ARB_stencil_texturing */
@@ -3416,6 +3641,24 @@
 #define GL_ARB_texture_barrier 1
 #endif /* GL_ARB_texture_barrier */
 
+#ifndef GL_ARB_texture_border_clamp
+#define GL_ARB_texture_border_clamp 1
+#define GL_CLAMP_TO_BORDER_ARB            0x812D
+#endif /* GL_ARB_texture_border_clamp */
+
+#ifndef GL_ARB_texture_buffer_object
+#define GL_ARB_texture_buffer_object 1
+#define GL_TEXTURE_BUFFER_ARB             0x8C2A
+#define GL_MAX_TEXTURE_BUFFER_SIZE_ARB    0x8C2B
+#define GL_TEXTURE_BINDING_BUFFER_ARB     0x8C2C
+#define GL_TEXTURE_BUFFER_DATA_STORE_BINDING_ARB 0x8C2D
+#define GL_TEXTURE_BUFFER_FORMAT_ARB      0x8C2E
+typedef void (APIENTRYP PFNGLTEXBUFFERARBPROC) (GLenum target, GLenum internalformat, GLuint buffer);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glTexBufferARB (GLenum target, GLenum internalformat, GLuint buffer);
+#endif
+#endif /* GL_ARB_texture_buffer_object */
+
 #ifndef GL_ARB_texture_buffer_object_rgb32
 #define GL_ARB_texture_buffer_object_rgb32 1
 #endif /* GL_ARB_texture_buffer_object_rgb32 */
@@ -3447,6 +3690,12 @@
 #define GL_UNSIGNED_INT_SAMPLER_CUBE_MAP_ARRAY_ARB 0x900F
 #endif /* GL_ARB_texture_cube_map_array */
 
+#ifndef GL_ARB_texture_filter_minmax
+#define GL_ARB_texture_filter_minmax 1
+#define GL_TEXTURE_REDUCTION_MODE_ARB     0x9366
+#define GL_WEIGHTED_AVERAGE_ARB           0x9367
+#endif /* GL_ARB_texture_filter_minmax */
+
 #ifndef GL_ARB_texture_gather
 #define GL_ARB_texture_gather 1
 #define GL_MIN_PROGRAM_TEXTURE_GATHER_OFFSET_ARB 0x8E5E
@@ -3458,10 +3707,19 @@
 #define GL_ARB_texture_mirror_clamp_to_edge 1
 #endif /* GL_ARB_texture_mirror_clamp_to_edge */
 
+#ifndef GL_ARB_texture_mirrored_repeat
+#define GL_ARB_texture_mirrored_repeat 1
+#define GL_MIRRORED_REPEAT_ARB            0x8370
+#endif /* GL_ARB_texture_mirrored_repeat */
+
 #ifndef GL_ARB_texture_multisample
 #define GL_ARB_texture_multisample 1
 #endif /* GL_ARB_texture_multisample */
 
+#ifndef GL_ARB_texture_non_power_of_two
+#define GL_ARB_texture_non_power_of_two 1
+#endif /* GL_ARB_texture_non_power_of_two */
+
 #ifndef GL_ARB_texture_query_levels
 #define GL_ARB_texture_query_levels 1
 #endif /* GL_ARB_texture_query_levels */
@@ -3552,6 +3810,34 @@
 #define GL_ARB_viewport_array 1
 #endif /* GL_ARB_viewport_array */
 
+#ifndef GL_KHR_blend_equation_advanced
+#define GL_KHR_blend_equation_advanced 1
+#define GL_MULTIPLY_KHR                   0x9294
+#define GL_SCREEN_KHR                     0x9295
+#define GL_OVERLAY_KHR                    0x9296
+#define GL_DARKEN_KHR                     0x9297
+#define GL_LIGHTEN_KHR                    0x9298
+#define GL_COLORDODGE_KHR                 0x9299
+#define GL_COLORBURN_KHR                  0x929A
+#define GL_HARDLIGHT_KHR                  0x929B
+#define GL_SOFTLIGHT_KHR                  0x929C
+#define GL_DIFFERENCE_KHR                 0x929E
+#define GL_EXCLUSION_KHR                  0x92A0
+#define GL_HSL_HUE_KHR                    0x92AD
+#define GL_HSL_SATURATION_KHR             0x92AE
+#define GL_HSL_COLOR_KHR                  0x92AF
+#define GL_HSL_LUMINOSITY_KHR             0x92B0
+typedef void (APIENTRYP PFNGLBLENDBARRIERKHRPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glBlendBarrierKHR (void);
+#endif
+#endif /* GL_KHR_blend_equation_advanced */
+
+#ifndef GL_KHR_blend_equation_advanced_coherent
+#define GL_KHR_blend_equation_advanced_coherent 1
+#define GL_BLEND_ADVANCED_COHERENT_KHR    0x9285
+#endif /* GL_KHR_blend_equation_advanced_coherent */
+
 #ifndef GL_KHR_context_flush_control
 #define GL_KHR_context_flush_control 1
 #endif /* GL_KHR_context_flush_control */
@@ -3614,6 +3900,1694 @@
 #define GL_KHR_texture_compression_astc_sliced_3d 1
 #endif /* GL_KHR_texture_compression_astc_sliced_3d */
 
+#ifndef GL_AMD_performance_monitor /* section is skipped when this macro is already defined */
+#define GL_AMD_performance_monitor 1
+#define GL_COUNTER_TYPE_AMD               0x8BC0
+#define GL_COUNTER_RANGE_AMD              0x8BC1
+#define GL_UNSIGNED_INT64_AMD             0x8BC2
+#define GL_PERCENTAGE_AMD                 0x8BC3
+#define GL_PERFMON_RESULT_AVAILABLE_AMD   0x8BC4
+#define GL_PERFMON_RESULT_SIZE_AMD        0x8BC5
+#define GL_PERFMON_RESULT_AMD             0x8BC6 /* last token; the PFN...PROC typedefs follow, one per entry point */
+typedef void (APIENTRYP PFNGLGETPERFMONITORGROUPSAMDPROC) (GLint *numGroups, GLsizei groupsSize, GLuint *groups);
+typedef void (APIENTRYP PFNGLGETPERFMONITORCOUNTERSAMDPROC) (GLuint group, GLint *numCounters, GLint *maxActiveCounters, GLsizei counterSize, GLuint *counters);
+typedef void (APIENTRYP PFNGLGETPERFMONITORGROUPSTRINGAMDPROC) (GLuint group, GLsizei bufSize, GLsizei *length, GLchar *groupString);
+typedef void (APIENTRYP PFNGLGETPERFMONITORCOUNTERSTRINGAMDPROC) (GLuint group, GLuint counter, GLsizei bufSize, GLsizei *length, GLchar *counterString);
+typedef void (APIENTRYP PFNGLGETPERFMONITORCOUNTERINFOAMDPROC) (GLuint group, GLuint counter, GLenum pname, void *data);
+typedef void (APIENTRYP PFNGLGENPERFMONITORSAMDPROC) (GLsizei n, GLuint *monitors);
+typedef void (APIENTRYP PFNGLDELETEPERFMONITORSAMDPROC) (GLsizei n, GLuint *monitors);
+typedef void (APIENTRYP PFNGLSELECTPERFMONITORCOUNTERSAMDPROC) (GLuint monitor, GLboolean enable, GLuint group, GLint numCounters, GLuint *counterList);
+typedef void (APIENTRYP PFNGLBEGINPERFMONITORAMDPROC) (GLuint monitor);
+typedef void (APIENTRYP PFNGLENDPERFMONITORAMDPROC) (GLuint monitor);
+typedef void (APIENTRYP PFNGLGETPERFMONITORCOUNTERDATAAMDPROC) (GLuint monitor, GLenum pname, GLsizei dataSize, GLuint *data, GLint *bytesWritten);
+#ifdef GL_GLEXT_PROTOTYPES /* prototypes below repeat the typedef signatures above, in the same order */
+GLAPI void APIENTRY glGetPerfMonitorGroupsAMD (GLint *numGroups, GLsizei groupsSize, GLuint *groups);
+GLAPI void APIENTRY glGetPerfMonitorCountersAMD (GLuint group, GLint *numCounters, GLint *maxActiveCounters, GLsizei counterSize, GLuint *counters);
+GLAPI void APIENTRY glGetPerfMonitorGroupStringAMD (GLuint group, GLsizei bufSize, GLsizei *length, GLchar *groupString);
+GLAPI void APIENTRY glGetPerfMonitorCounterStringAMD (GLuint group, GLuint counter, GLsizei bufSize, GLsizei *length, GLchar *counterString);
+GLAPI void APIENTRY glGetPerfMonitorCounterInfoAMD (GLuint group, GLuint counter, GLenum pname, void *data);
+GLAPI void APIENTRY glGenPerfMonitorsAMD (GLsizei n, GLuint *monitors);
+GLAPI void APIENTRY glDeletePerfMonitorsAMD (GLsizei n, GLuint *monitors);
+GLAPI void APIENTRY glSelectPerfMonitorCountersAMD (GLuint monitor, GLboolean enable, GLuint group, GLint numCounters, GLuint *counterList);
+GLAPI void APIENTRY glBeginPerfMonitorAMD (GLuint monitor);
+GLAPI void APIENTRY glEndPerfMonitorAMD (GLuint monitor);
+GLAPI void APIENTRY glGetPerfMonitorCounterDataAMD (GLuint monitor, GLenum pname, GLsizei dataSize, GLuint *data, GLint *bytesWritten);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_AMD_performance_monitor */
+
+#ifndef GL_APPLE_rgb_422 /* section is skipped when this macro is already defined */
+#define GL_APPLE_rgb_422 1 /* tokens only: this section declares no entry points */
+#define GL_RGB_422_APPLE                  0x8A1F
+#define GL_UNSIGNED_SHORT_8_8_APPLE       0x85BA
+#define GL_UNSIGNED_SHORT_8_8_REV_APPLE   0x85BB
+#define GL_RGB_RAW_422_APPLE              0x8A51
+#endif /* GL_APPLE_rgb_422 */
+
+#ifndef GL_EXT_debug_label /* section is skipped when this macro is already defined */
+#define GL_EXT_debug_label 1
+#define GL_PROGRAM_PIPELINE_OBJECT_EXT    0x8A4F
+#define GL_PROGRAM_OBJECT_EXT             0x8B40
+#define GL_SHADER_OBJECT_EXT              0x8B48
+#define GL_BUFFER_OBJECT_EXT              0x9151
+#define GL_QUERY_OBJECT_EXT               0x9153 /* 0x9152 is not defined in this section */
+#define GL_VERTEX_ARRAY_OBJECT_EXT        0x9154
+typedef void (APIENTRYP PFNGLLABELOBJECTEXTPROC) (GLenum type, GLuint object, GLsizei length, const GLchar *label);
+typedef void (APIENTRYP PFNGLGETOBJECTLABELEXTPROC) (GLenum type, GLuint object, GLsizei bufSize, GLsizei *length, GLchar *label);
+#ifdef GL_GLEXT_PROTOTYPES /* prototypes below repeat the typedef signatures above, in the same order */
+GLAPI void APIENTRY glLabelObjectEXT (GLenum type, GLuint object, GLsizei length, const GLchar *label);
+GLAPI void APIENTRY glGetObjectLabelEXT (GLenum type, GLuint object, GLsizei bufSize, GLsizei *length, GLchar *label);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_EXT_debug_label */
+
+#ifndef GL_EXT_debug_marker /* section is skipped when this macro is already defined */
+#define GL_EXT_debug_marker 1 /* no enum tokens here: only the typedefs and prototypes below */
+typedef void (APIENTRYP PFNGLINSERTEVENTMARKEREXTPROC) (GLsizei length, const GLchar *marker);
+typedef void (APIENTRYP PFNGLPUSHGROUPMARKEREXTPROC) (GLsizei length, const GLchar *marker);
+typedef void (APIENTRYP PFNGLPOPGROUPMARKEREXTPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES /* prototypes below repeat the typedef signatures above, in the same order */
+GLAPI void APIENTRY glInsertEventMarkerEXT (GLsizei length, const GLchar *marker);
+GLAPI void APIENTRY glPushGroupMarkerEXT (GLsizei length, const GLchar *marker);
+GLAPI void APIENTRY glPopGroupMarkerEXT (void);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_EXT_debug_marker */
+
+#ifndef GL_EXT_direct_state_access
+#define GL_EXT_direct_state_access 1
+#define GL_PROGRAM_MATRIX_EXT             0x8E2D
+#define GL_TRANSPOSE_PROGRAM_MATRIX_EXT   0x8E2E
+#define GL_PROGRAM_MATRIX_STACK_DEPTH_EXT 0x8E2F
+typedef void (APIENTRYP PFNGLMATRIXLOADFEXTPROC) (GLenum mode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLMATRIXLOADDEXTPROC) (GLenum mode, const GLdouble *m);
+typedef void (APIENTRYP PFNGLMATRIXMULTFEXTPROC) (GLenum mode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLMATRIXMULTDEXTPROC) (GLenum mode, const GLdouble *m);
+typedef void (APIENTRYP PFNGLMATRIXLOADIDENTITYEXTPROC) (GLenum mode);
+typedef void (APIENTRYP PFNGLMATRIXROTATEFEXTPROC) (GLenum mode, GLfloat angle, GLfloat x, GLfloat y, GLfloat z);
+typedef void (APIENTRYP PFNGLMATRIXROTATEDEXTPROC) (GLenum mode, GLdouble angle, GLdouble x, GLdouble y, GLdouble z);
+typedef void (APIENTRYP PFNGLMATRIXSCALEFEXTPROC) (GLenum mode, GLfloat x, GLfloat y, GLfloat z);
+typedef void (APIENTRYP PFNGLMATRIXSCALEDEXTPROC) (GLenum mode, GLdouble x, GLdouble y, GLdouble z);
+typedef void (APIENTRYP PFNGLMATRIXTRANSLATEFEXTPROC) (GLenum mode, GLfloat x, GLfloat y, GLfloat z);
+typedef void (APIENTRYP PFNGLMATRIXTRANSLATEDEXTPROC) (GLenum mode, GLdouble x, GLdouble y, GLdouble z);
+typedef void (APIENTRYP PFNGLMATRIXFRUSTUMEXTPROC) (GLenum mode, GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar);
+typedef void (APIENTRYP PFNGLMATRIXORTHOEXTPROC) (GLenum mode, GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar);
+typedef void (APIENTRYP PFNGLMATRIXPOPEXTPROC) (GLenum mode);
+typedef void (APIENTRYP PFNGLMATRIXPUSHEXTPROC) (GLenum mode);
+typedef void (APIENTRYP PFNGLCLIENTATTRIBDEFAULTEXTPROC) (GLbitfield mask);
+typedef void (APIENTRYP PFNGLPUSHCLIENTATTRIBDEFAULTEXTPROC) (GLbitfield mask);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERFEXTPROC) (GLuint texture, GLenum target, GLenum pname, GLfloat param);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERFVEXTPROC) (GLuint texture, GLenum target, GLenum pname, const GLfloat *params);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERIEXTPROC) (GLuint texture, GLenum target, GLenum pname, GLint param);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERIVEXTPROC) (GLuint texture, GLenum target, GLenum pname, const GLint *params);
+typedef void (APIENTRYP PFNGLTEXTUREIMAGE1DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint internalformat, GLsizei width, GLint border, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLTEXTUREIMAGE2DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLTEXTURESUBIMAGE1DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLTEXTURESUBIMAGE2DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLCOPYTEXTUREIMAGE1DEXTPROC) (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLint border);
+typedef void (APIENTRYP PFNGLCOPYTEXTUREIMAGE2DEXTPROC) (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border);
+typedef void (APIENTRYP PFNGLCOPYTEXTURESUBIMAGE1DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width);
+typedef void (APIENTRYP PFNGLCOPYTEXTURESUBIMAGE2DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLGETTEXTUREIMAGEEXTPROC) (GLuint texture, GLenum target, GLint level, GLenum format, GLenum type, void *pixels);
+typedef void (APIENTRYP PFNGLGETTEXTUREPARAMETERFVEXTPROC) (GLuint texture, GLenum target, GLenum pname, GLfloat *params);
+typedef void (APIENTRYP PFNGLGETTEXTUREPARAMETERIVEXTPROC) (GLuint texture, GLenum target, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLGETTEXTURELEVELPARAMETERFVEXTPROC) (GLuint texture, GLenum target, GLint level, GLenum pname, GLfloat *params);
+typedef void (APIENTRYP PFNGLGETTEXTURELEVELPARAMETERIVEXTPROC) (GLuint texture, GLenum target, GLint level, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLTEXTUREIMAGE3DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLTEXTURESUBIMAGE3DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLCOPYTEXTURESUBIMAGE3DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLBINDMULTITEXTUREEXTPROC) (GLenum texunit, GLenum target, GLuint texture);
+typedef void (APIENTRYP PFNGLMULTITEXCOORDPOINTEREXTPROC) (GLenum texunit, GLint size, GLenum type, GLsizei stride, const void *pointer);
+typedef void (APIENTRYP PFNGLMULTITEXENVFEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLfloat param);
+typedef void (APIENTRYP PFNGLMULTITEXENVFVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, const GLfloat *params);
+typedef void (APIENTRYP PFNGLMULTITEXENVIEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLint param);
+typedef void (APIENTRYP PFNGLMULTITEXENVIVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, const GLint *params);
+typedef void (APIENTRYP PFNGLMULTITEXGENDEXTPROC) (GLenum texunit, GLenum coord, GLenum pname, GLdouble param);
+typedef void (APIENTRYP PFNGLMULTITEXGENDVEXTPROC) (GLenum texunit, GLenum coord, GLenum pname, const GLdouble *params);
+typedef void (APIENTRYP PFNGLMULTITEXGENFEXTPROC) (GLenum texunit, GLenum coord, GLenum pname, GLfloat param);
+typedef void (APIENTRYP PFNGLMULTITEXGENFVEXTPROC) (GLenum texunit, GLenum coord, GLenum pname, const GLfloat *params);
+typedef void (APIENTRYP PFNGLMULTITEXGENIEXTPROC) (GLenum texunit, GLenum coord, GLenum pname, GLint param);
+typedef void (APIENTRYP PFNGLMULTITEXGENIVEXTPROC) (GLenum texunit, GLenum coord, GLenum pname, const GLint *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXENVFVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLfloat *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXENVIVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXGENDVEXTPROC) (GLenum texunit, GLenum coord, GLenum pname, GLdouble *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXGENFVEXTPROC) (GLenum texunit, GLenum coord, GLenum pname, GLfloat *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXGENIVEXTPROC) (GLenum texunit, GLenum coord, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLMULTITEXPARAMETERIEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLint param);
+typedef void (APIENTRYP PFNGLMULTITEXPARAMETERIVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, const GLint *params);
+typedef void (APIENTRYP PFNGLMULTITEXPARAMETERFEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLfloat param);
+typedef void (APIENTRYP PFNGLMULTITEXPARAMETERFVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, const GLfloat *params);
+typedef void (APIENTRYP PFNGLMULTITEXIMAGE1DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint internalformat, GLsizei width, GLint border, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLMULTITEXIMAGE2DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLMULTITEXSUBIMAGE1DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLMULTITEXSUBIMAGE2DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLCOPYMULTITEXIMAGE1DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLint border);
+typedef void (APIENTRYP PFNGLCOPYMULTITEXIMAGE2DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border);
+typedef void (APIENTRYP PFNGLCOPYMULTITEXSUBIMAGE1DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width);
+typedef void (APIENTRYP PFNGLCOPYMULTITEXSUBIMAGE2DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLGETMULTITEXIMAGEEXTPROC) (GLenum texunit, GLenum target, GLint level, GLenum format, GLenum type, void *pixels);
+typedef void (APIENTRYP PFNGLGETMULTITEXPARAMETERFVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLfloat *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXPARAMETERIVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXLEVELPARAMETERFVEXTPROC) (GLenum texunit, GLenum target, GLint level, GLenum pname, GLfloat *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXLEVELPARAMETERIVEXTPROC) (GLenum texunit, GLenum target, GLint level, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLMULTITEXIMAGE3DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLMULTITEXSUBIMAGE3DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLCOPYMULTITEXSUBIMAGE3DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLENABLECLIENTSTATEINDEXEDEXTPROC) (GLenum array, GLuint index);
+typedef void (APIENTRYP PFNGLDISABLECLIENTSTATEINDEXEDEXTPROC) (GLenum array, GLuint index);
+typedef void (APIENTRYP PFNGLGETFLOATINDEXEDVEXTPROC) (GLenum target, GLuint index, GLfloat *data);
+typedef void (APIENTRYP PFNGLGETDOUBLEINDEXEDVEXTPROC) (GLenum target, GLuint index, GLdouble *data);
+typedef void (APIENTRYP PFNGLGETPOINTERINDEXEDVEXTPROC) (GLenum target, GLuint index, void **data);
+typedef void (APIENTRYP PFNGLENABLEINDEXEDEXTPROC) (GLenum target, GLuint index);
+typedef void (APIENTRYP PFNGLDISABLEINDEXEDEXTPROC) (GLenum target, GLuint index);
+typedef GLboolean (APIENTRYP PFNGLISENABLEDINDEXEDEXTPROC) (GLenum target, GLuint index);
+typedef void (APIENTRYP PFNGLGETINTEGERINDEXEDVEXTPROC) (GLenum target, GLuint index, GLint *data);
+typedef void (APIENTRYP PFNGLGETBOOLEANINDEXEDVEXTPROC) (GLenum target, GLuint index, GLboolean *data);
+typedef void (APIENTRYP PFNGLCOMPRESSEDTEXTUREIMAGE3DEXTPROC) (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDTEXTUREIMAGE2DEXTPROC) (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDTEXTUREIMAGE1DEXTPROC) (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDTEXTURESUBIMAGE3DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDTEXTURESUBIMAGE2DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDTEXTURESUBIMAGE1DEXTPROC) (GLuint texture, GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLGETCOMPRESSEDTEXTUREIMAGEEXTPROC) (GLuint texture, GLenum target, GLint lod, void *img);
+typedef void (APIENTRYP PFNGLCOMPRESSEDMULTITEXIMAGE3DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDMULTITEXIMAGE2DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDMULTITEXIMAGE1DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDMULTITEXSUBIMAGE3DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDMULTITEXSUBIMAGE2DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLCOMPRESSEDMULTITEXSUBIMAGE1DEXTPROC) (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, const void *bits);
+typedef void (APIENTRYP PFNGLGETCOMPRESSEDMULTITEXIMAGEEXTPROC) (GLenum texunit, GLenum target, GLint lod, void *img);
+typedef void (APIENTRYP PFNGLMATRIXLOADTRANSPOSEFEXTPROC) (GLenum mode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLMATRIXLOADTRANSPOSEDEXTPROC) (GLenum mode, const GLdouble *m);
+typedef void (APIENTRYP PFNGLMATRIXMULTTRANSPOSEFEXTPROC) (GLenum mode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLMATRIXMULTTRANSPOSEDEXTPROC) (GLenum mode, const GLdouble *m);
+typedef void (APIENTRYP PFNGLNAMEDBUFFERDATAEXTPROC) (GLuint buffer, GLsizeiptr size, const void *data, GLenum usage);
+typedef void (APIENTRYP PFNGLNAMEDBUFFERSUBDATAEXTPROC) (GLuint buffer, GLintptr offset, GLsizeiptr size, const void *data);
+typedef void *(APIENTRYP PFNGLMAPNAMEDBUFFEREXTPROC) (GLuint buffer, GLenum access);
+typedef GLboolean (APIENTRYP PFNGLUNMAPNAMEDBUFFEREXTPROC) (GLuint buffer);
+typedef void (APIENTRYP PFNGLGETNAMEDBUFFERPARAMETERIVEXTPROC) (GLuint buffer, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLGETNAMEDBUFFERPOINTERVEXTPROC) (GLuint buffer, GLenum pname, void **params);
+typedef void (APIENTRYP PFNGLGETNAMEDBUFFERSUBDATAEXTPROC) (GLuint buffer, GLintptr offset, GLsizeiptr size, void *data);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1FEXTPROC) (GLuint program, GLint location, GLfloat v0);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2FEXTPROC) (GLuint program, GLint location, GLfloat v0, GLfloat v1);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3FEXTPROC) (GLuint program, GLint location, GLfloat v0, GLfloat v1, GLfloat v2);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4FEXTPROC) (GLuint program, GLint location, GLfloat v0, GLfloat v1, GLfloat v2, GLfloat v3);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1IEXTPROC) (GLuint program, GLint location, GLint v0);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2IEXTPROC) (GLuint program, GLint location, GLint v0, GLint v1);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3IEXTPROC) (GLuint program, GLint location, GLint v0, GLint v1, GLint v2);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4IEXTPROC) (GLuint program, GLint location, GLint v0, GLint v1, GLint v2, GLint v3);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1FVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2FVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3FVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4FVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1IVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLint *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2IVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLint *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3IVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLint *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4IVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLint *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX2FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX3FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX4FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX2X3FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX3X2FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX2X4FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX4X2FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX3X4FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX4X3FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (APIENTRYP PFNGLTEXTUREBUFFEREXTPROC) (GLuint texture, GLenum target, GLenum internalformat, GLuint buffer);
+typedef void (APIENTRYP PFNGLMULTITEXBUFFEREXTPROC) (GLenum texunit, GLenum target, GLenum internalformat, GLuint buffer);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERIIVEXTPROC) (GLuint texture, GLenum target, GLenum pname, const GLint *params);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERIUIVEXTPROC) (GLuint texture, GLenum target, GLenum pname, const GLuint *params);
+typedef void (APIENTRYP PFNGLGETTEXTUREPARAMETERIIVEXTPROC) (GLuint texture, GLenum target, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLGETTEXTUREPARAMETERIUIVEXTPROC) (GLuint texture, GLenum target, GLenum pname, GLuint *params);
+typedef void (APIENTRYP PFNGLMULTITEXPARAMETERIIVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, const GLint *params);
+typedef void (APIENTRYP PFNGLMULTITEXPARAMETERIUIVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, const GLuint *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXPARAMETERIIVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLGETMULTITEXPARAMETERIUIVEXTPROC) (GLenum texunit, GLenum target, GLenum pname, GLuint *params);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1UIEXTPROC) (GLuint program, GLint location, GLuint v0);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2UIEXTPROC) (GLuint program, GLint location, GLuint v0, GLuint v1);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3UIEXTPROC) (GLuint program, GLint location, GLuint v0, GLuint v1, GLuint v2);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4UIEXTPROC) (GLuint program, GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1UIVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLuint *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2UIVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLuint *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3UIVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLuint *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4UIVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLuint *value);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETERS4FVEXTPROC) (GLuint program, GLenum target, GLuint index, GLsizei count, const GLfloat *params);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETERI4IEXTPROC) (GLuint program, GLenum target, GLuint index, GLint x, GLint y, GLint z, GLint w);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETERI4IVEXTPROC) (GLuint program, GLenum target, GLuint index, const GLint *params);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETERSI4IVEXTPROC) (GLuint program, GLenum target, GLuint index, GLsizei count, const GLint *params);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETERI4UIEXTPROC) (GLuint program, GLenum target, GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETERI4UIVEXTPROC) (GLuint program, GLenum target, GLuint index, const GLuint *params);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETERSI4UIVEXTPROC) (GLuint program, GLenum target, GLuint index, GLsizei count, const GLuint *params);
+typedef void (APIENTRYP PFNGLGETNAMEDPROGRAMLOCALPARAMETERIIVEXTPROC) (GLuint program, GLenum target, GLuint index, GLint *params);
+typedef void (APIENTRYP PFNGLGETNAMEDPROGRAMLOCALPARAMETERIUIVEXTPROC) (GLuint program, GLenum target, GLuint index, GLuint *params);
+typedef void (APIENTRYP PFNGLENABLECLIENTSTATEIEXTPROC) (GLenum array, GLuint index);
+typedef void (APIENTRYP PFNGLDISABLECLIENTSTATEIEXTPROC) (GLenum array, GLuint index);
+typedef void (APIENTRYP PFNGLGETFLOATI_VEXTPROC) (GLenum pname, GLuint index, GLfloat *params);
+typedef void (APIENTRYP PFNGLGETDOUBLEI_VEXTPROC) (GLenum pname, GLuint index, GLdouble *params);
+typedef void (APIENTRYP PFNGLGETPOINTERI_VEXTPROC) (GLenum pname, GLuint index, void **params);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMSTRINGEXTPROC) (GLuint program, GLenum target, GLenum format, GLsizei len, const void *string);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETER4DEXTPROC) (GLuint program, GLenum target, GLuint index, GLdouble x, GLdouble y, GLdouble z, GLdouble w);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETER4DVEXTPROC) (GLuint program, GLenum target, GLuint index, const GLdouble *params);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETER4FEXTPROC) (GLuint program, GLenum target, GLuint index, GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+typedef void (APIENTRYP PFNGLNAMEDPROGRAMLOCALPARAMETER4FVEXTPROC) (GLuint program, GLenum target, GLuint index, const GLfloat *params);
+typedef void (APIENTRYP PFNGLGETNAMEDPROGRAMLOCALPARAMETERDVEXTPROC) (GLuint program, GLenum target, GLuint index, GLdouble *params);
+typedef void (APIENTRYP PFNGLGETNAMEDPROGRAMLOCALPARAMETERFVEXTPROC) (GLuint program, GLenum target, GLuint index, GLfloat *params);
+typedef void (APIENTRYP PFNGLGETNAMEDPROGRAMIVEXTPROC) (GLuint program, GLenum target, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLGETNAMEDPROGRAMSTRINGEXTPROC) (GLuint program, GLenum target, GLenum pname, void *string);
+typedef void (APIENTRYP PFNGLNAMEDRENDERBUFFERSTORAGEEXTPROC) (GLuint renderbuffer, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLGETNAMEDRENDERBUFFERPARAMETERIVEXTPROC) (GLuint renderbuffer, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLNAMEDRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC) (GLuint renderbuffer, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLNAMEDRENDERBUFFERSTORAGEMULTISAMPLECOVERAGEEXTPROC) (GLuint renderbuffer, GLsizei coverageSamples, GLsizei colorSamples, GLenum internalformat, GLsizei width, GLsizei height);
+typedef GLenum (APIENTRYP PFNGLCHECKNAMEDFRAMEBUFFERSTATUSEXTPROC) (GLuint framebuffer, GLenum target);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERTEXTURE1DEXTPROC) (GLuint framebuffer, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERTEXTURE2DEXTPROC) (GLuint framebuffer, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERTEXTURE3DEXTPROC) (GLuint framebuffer, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERRENDERBUFFEREXTPROC) (GLuint framebuffer, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer);
+typedef void (APIENTRYP PFNGLGETNAMEDFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC) (GLuint framebuffer, GLenum attachment, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLGENERATETEXTUREMIPMAPEXTPROC) (GLuint texture, GLenum target);
+typedef void (APIENTRYP PFNGLGENERATEMULTITEXMIPMAPEXTPROC) (GLenum texunit, GLenum target);
+typedef void (APIENTRYP PFNGLFRAMEBUFFERDRAWBUFFEREXTPROC) (GLuint framebuffer, GLenum mode);
+typedef void (APIENTRYP PFNGLFRAMEBUFFERDRAWBUFFERSEXTPROC) (GLuint framebuffer, GLsizei n, const GLenum *bufs);
+typedef void (APIENTRYP PFNGLFRAMEBUFFERREADBUFFEREXTPROC) (GLuint framebuffer, GLenum mode);
+typedef void (APIENTRYP PFNGLGETFRAMEBUFFERPARAMETERIVEXTPROC) (GLuint framebuffer, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLNAMEDCOPYBUFFERSUBDATAEXTPROC) (GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERTEXTUREEXTPROC) (GLuint framebuffer, GLenum attachment, GLuint texture, GLint level);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERTEXTURELAYEREXTPROC) (GLuint framebuffer, GLenum attachment, GLuint texture, GLint level, GLint layer);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERTEXTUREFACEEXTPROC) (GLuint framebuffer, GLenum attachment, GLuint texture, GLint level, GLenum face);
+typedef void (APIENTRYP PFNGLTEXTURERENDERBUFFEREXTPROC) (GLuint texture, GLenum target, GLuint renderbuffer);
+typedef void (APIENTRYP PFNGLMULTITEXRENDERBUFFEREXTPROC) (GLenum texunit, GLenum target, GLuint renderbuffer);
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLint size, GLenum type, GLsizei stride, GLintptr offset); /* *OFFSETEXT types: (vaobj, buffer, ...) with GLsizei stride and GLintptr offset as the last two arguments */
+typedef void (APIENTRYP PFNGLVERTEXARRAYCOLOROFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYEDGEFLAGOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLsizei stride, GLintptr offset); /* no size or type argument */
+typedef void (APIENTRYP PFNGLVERTEXARRAYINDEXOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLenum type, GLsizei stride, GLintptr offset); /* type but no size argument */
+typedef void (APIENTRYP PFNGLVERTEXARRAYNORMALOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLenum type, GLsizei stride, GLintptr offset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYTEXCOORDOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYMULTITEXCOORDOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLenum texunit, GLint size, GLenum type, GLsizei stride, GLintptr offset); /* adds a GLenum texunit after buffer */
+typedef void (APIENTRYP PFNGLVERTEXARRAYFOGCOORDOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLenum type, GLsizei stride, GLintptr offset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYSECONDARYCOLOROFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLintptr offset); /* takes GLboolean normalized; the ATTRIBIOFFSET type on the next line does not */
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBIOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLuint index, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+typedef void (APIENTRYP PFNGLENABLEVERTEXARRAYEXTPROC) (GLuint vaobj, GLenum array); /* selects by GLenum array */
+typedef void (APIENTRYP PFNGLDISABLEVERTEXARRAYEXTPROC) (GLuint vaobj, GLenum array);
+typedef void (APIENTRYP PFNGLENABLEVERTEXARRAYATTRIBEXTPROC) (GLuint vaobj, GLuint index); /* ATTRIB forms select by GLuint index instead of GLenum array */
+typedef void (APIENTRYP PFNGLDISABLEVERTEXARRAYATTRIBEXTPROC) (GLuint vaobj, GLuint index);
+typedef void (APIENTRYP PFNGLGETVERTEXARRAYINTEGERVEXTPROC) (GLuint vaobj, GLenum pname, GLint *param);
+typedef void (APIENTRYP PFNGLGETVERTEXARRAYPOINTERVEXTPROC) (GLuint vaobj, GLenum pname, void **param); /* param is void ** here, GLint * in the INTEGERV type above */
+typedef void (APIENTRYP PFNGLGETVERTEXARRAYINTEGERI_VEXTPROC) (GLuint vaobj, GLuint index, GLenum pname, GLint *param); /* I_V forms insert a GLuint index before pname */
+typedef void (APIENTRYP PFNGLGETVERTEXARRAYPOINTERI_VEXTPROC) (GLuint vaobj, GLuint index, GLenum pname, void **param);
+typedef void *(APIENTRYP PFNGLMAPNAMEDBUFFERRANGEEXTPROC) (GLuint buffer, GLintptr offset, GLsizeiptr length, GLbitfield access); /* returns void * rather than void */
+typedef void (APIENTRYP PFNGLFLUSHMAPPEDNAMEDBUFFERRANGEEXTPROC) (GLuint buffer, GLintptr offset, GLsizeiptr length);
+typedef void (APIENTRYP PFNGLNAMEDBUFFERSTORAGEEXTPROC) (GLuint buffer, GLsizeiptr size, const void *data, GLbitfield flags);
+typedef void (APIENTRYP PFNGLCLEARNAMEDBUFFERDATAEXTPROC) (GLuint buffer, GLenum internalformat, GLenum format, GLenum type, const void *data);
+typedef void (APIENTRYP PFNGLCLEARNAMEDBUFFERSUBDATAEXTPROC) (GLuint buffer, GLenum internalformat, GLsizeiptr offset, GLsizeiptr size, GLenum format, GLenum type, const void *data); /* NOTE(review): offset is GLsizeiptr here but GLintptr in the FLUSHMAPPEDNAMEDBUFFERRANGE type above; presumably inherited from the upstream header - confirm before changing */
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERPARAMETERIEXTPROC) (GLuint framebuffer, GLenum pname, GLint param); /* takes a single GLint by value */
+typedef void (APIENTRYP PFNGLGETNAMEDFRAMEBUFFERPARAMETERIVEXTPROC) (GLuint framebuffer, GLenum pname, GLint *params); /* takes a non-const GLint pointer */
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1DEXTPROC) (GLuint program, GLint location, GLdouble x); /* 1D-4D: one to four GLdouble components passed by value after (program, location) */
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2DEXTPROC) (GLuint program, GLint location, GLdouble x, GLdouble y);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3DEXTPROC) (GLuint program, GLint location, GLdouble x, GLdouble y, GLdouble z);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4DEXTPROC) (GLuint program, GLint location, GLdouble x, GLdouble y, GLdouble z, GLdouble w);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1DVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLdouble *value); /* 1DV-4DV share one parameter list: count plus a pointer to const GLdouble */
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2DVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3DVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4DVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX2DVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value); /* MATRIX types add a GLboolean transpose before the value pointer; all nine share this parameter list */
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX3DVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX4DVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX2X3DVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX2X4DVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX3X2DVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX3X4DVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX4X2DVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMMATRIX4X3DVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+typedef void (APIENTRYP PFNGLTEXTUREBUFFERRANGEEXTPROC) (GLuint texture, GLenum target, GLenum internalformat, GLuint buffer, GLintptr offset, GLsizeiptr size); /* every type in this group starts with (GLuint texture, GLenum target) */
+typedef void (APIENTRYP PFNGLTEXTURESTORAGE1DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width); /* 1D/2D/3D forms add width, then height, then depth */
+typedef void (APIENTRYP PFNGLTEXTURESTORAGE2DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLTEXTURESTORAGE3DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
+typedef void (APIENTRYP PFNGLTEXTURESTORAGE2DMULTISAMPLEEXTPROC) (GLuint texture, GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLboolean fixedsamplelocations); /* MULTISAMPLE forms take 'samples' where the others take 'levels', plus a trailing GLboolean fixedsamplelocations */
+typedef void (APIENTRYP PFNGLTEXTURESTORAGE3DMULTISAMPLEEXTPROC) (GLuint texture, GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedsamplelocations);
+typedef void (APIENTRYP PFNGLVERTEXARRAYBINDVERTEXBUFFEREXTPROC) (GLuint vaobj, GLuint bindingindex, GLuint buffer, GLintptr offset, GLsizei stride); /* offset precedes stride here; the *OFFSETEXT types put stride first */
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBFORMATEXTPROC) (GLuint vaobj, GLuint attribindex, GLint size, GLenum type, GLboolean normalized, GLuint relativeoffset); /* only FORMAT type with a GLboolean normalized; IFORMAT and LFORMAT omit it */
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBIFORMATEXTPROC) (GLuint vaobj, GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBLFORMATEXTPROC) (GLuint vaobj, GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBBINDINGEXTPROC) (GLuint vaobj, GLuint attribindex, GLuint bindingindex);
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXBINDINGDIVISOREXTPROC) (GLuint vaobj, GLuint bindingindex, GLuint divisor); /* divisor keyed by bindingindex */
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBLOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLuint index, GLint size, GLenum type, GLsizei stride, GLintptr offset); /* no normalized argument */
+typedef void (APIENTRYP PFNGLTEXTUREPAGECOMMITMENTEXTPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean commit); /* takes GLuint texture but no GLenum target */
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBDIVISOREXTPROC) (GLuint vaobj, GLuint index, GLuint divisor); /* divisor keyed by attribute index rather than bindingindex */
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glMatrixLoadfEXT (GLenum mode, const GLfloat *m); /* Matrix*EXT prototypes take GLenum mode first; f/d pairs differ only in GLfloat vs GLdouble arguments */
+GLAPI void APIENTRY glMatrixLoaddEXT (GLenum mode, const GLdouble *m);
+GLAPI void APIENTRY glMatrixMultfEXT (GLenum mode, const GLfloat *m);
+GLAPI void APIENTRY glMatrixMultdEXT (GLenum mode, const GLdouble *m);
+GLAPI void APIENTRY glMatrixLoadIdentityEXT (GLenum mode);
+GLAPI void APIENTRY glMatrixRotatefEXT (GLenum mode, GLfloat angle, GLfloat x, GLfloat y, GLfloat z);
+GLAPI void APIENTRY glMatrixRotatedEXT (GLenum mode, GLdouble angle, GLdouble x, GLdouble y, GLdouble z);
+GLAPI void APIENTRY glMatrixScalefEXT (GLenum mode, GLfloat x, GLfloat y, GLfloat z);
+GLAPI void APIENTRY glMatrixScaledEXT (GLenum mode, GLdouble x, GLdouble y, GLdouble z);
+GLAPI void APIENTRY glMatrixTranslatefEXT (GLenum mode, GLfloat x, GLfloat y, GLfloat z);
+GLAPI void APIENTRY glMatrixTranslatedEXT (GLenum mode, GLdouble x, GLdouble y, GLdouble z);
+GLAPI void APIENTRY glMatrixFrustumEXT (GLenum mode, GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar); /* Frustum and Ortho are declared with GLdouble arguments only */
+GLAPI void APIENTRY glMatrixOrthoEXT (GLenum mode, GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar);
+GLAPI void APIENTRY glMatrixPopEXT (GLenum mode);
+GLAPI void APIENTRY glMatrixPushEXT (GLenum mode);
+GLAPI void APIENTRY glClientAttribDefaultEXT (GLbitfield mask); /* the two ClientAttribDefault prototypes take a GLbitfield mask, not a mode */
+GLAPI void APIENTRY glPushClientAttribDefaultEXT (GLbitfield mask);
+GLAPI void APIENTRY glTextureParameterfEXT (GLuint texture, GLenum target, GLenum pname, GLfloat param); /* every prototype in this group starts with (GLuint texture, GLenum target) */
+GLAPI void APIENTRY glTextureParameterfvEXT (GLuint texture, GLenum target, GLenum pname, const GLfloat *params);
+GLAPI void APIENTRY glTextureParameteriEXT (GLuint texture, GLenum target, GLenum pname, GLint param);
+GLAPI void APIENTRY glTextureParameterivEXT (GLuint texture, GLenum target, GLenum pname, const GLint *params);
+GLAPI void APIENTRY glTextureImage1DEXT (GLuint texture, GLenum target, GLint level, GLint internalformat, GLsizei width, GLint border, GLenum format, GLenum type, const void *pixels); /* internalformat is GLint in the TextureImage prototypes but GLenum in the CopyTextureImage prototypes below */
+GLAPI void APIENTRY glTextureImage2DEXT (GLuint texture, GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const void *pixels);
+GLAPI void APIENTRY glTextureSubImage1DEXT (GLuint texture, GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const void *pixels);
+GLAPI void APIENTRY glTextureSubImage2DEXT (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pixels);
+GLAPI void APIENTRY glCopyTextureImage1DEXT (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLint border); /* Copy* forms take (x, y) and have no pixels pointer */
+GLAPI void APIENTRY glCopyTextureImage2DEXT (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border);
+GLAPI void APIENTRY glCopyTextureSubImage1DEXT (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width);
+GLAPI void APIENTRY glCopyTextureSubImage2DEXT (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height);
+GLAPI void APIENTRY glGetTextureImageEXT (GLuint texture, GLenum target, GLint level, GLenum format, GLenum type, void *pixels); /* Get* forms take non-const pointers */
+GLAPI void APIENTRY glGetTextureParameterfvEXT (GLuint texture, GLenum target, GLenum pname, GLfloat *params);
+GLAPI void APIENTRY glGetTextureParameterivEXT (GLuint texture, GLenum target, GLenum pname, GLint *params);
+GLAPI void APIENTRY glGetTextureLevelParameterfvEXT (GLuint texture, GLenum target, GLint level, GLenum pname, GLfloat *params);
+GLAPI void APIENTRY glGetTextureLevelParameterivEXT (GLuint texture, GLenum target, GLint level, GLenum pname, GLint *params);
+GLAPI void APIENTRY glTextureImage3DEXT (GLuint texture, GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels); /* the 3D forms are listed after the 1D/2D and Get* prototypes */
+GLAPI void APIENTRY glTextureSubImage3DEXT (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *pixels);
+GLAPI void APIENTRY glCopyTextureSubImage3DEXT (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height); /* width and height only; no depth argument */
+GLAPI void APIENTRY glBindMultiTextureEXT (GLenum texunit, GLenum target, GLuint texture); /* every prototype in this group takes GLenum texunit as its first argument */
+GLAPI void APIENTRY glMultiTexCoordPointerEXT (GLenum texunit, GLint size, GLenum type, GLsizei stride, const void *pointer);
+GLAPI void APIENTRY glMultiTexEnvfEXT (GLenum texunit, GLenum target, GLenum pname, GLfloat param);
+GLAPI void APIENTRY glMultiTexEnvfvEXT (GLenum texunit, GLenum target, GLenum pname, const GLfloat *params);
+GLAPI void APIENTRY glMultiTexEnviEXT (GLenum texunit, GLenum target, GLenum pname, GLint param);
+GLAPI void APIENTRY glMultiTexEnvivEXT (GLenum texunit, GLenum target, GLenum pname, const GLint *params);
+GLAPI void APIENTRY glMultiTexGendEXT (GLenum texunit, GLenum coord, GLenum pname, GLdouble param); /* TexGen forms take 'coord' where the TexEnv forms take 'target', and also come in GLdouble variants */
+GLAPI void APIENTRY glMultiTexGendvEXT (GLenum texunit, GLenum coord, GLenum pname, const GLdouble *params);
+GLAPI void APIENTRY glMultiTexGenfEXT (GLenum texunit, GLenum coord, GLenum pname, GLfloat param);
+GLAPI void APIENTRY glMultiTexGenfvEXT (GLenum texunit, GLenum coord, GLenum pname, const GLfloat *params);
+GLAPI void APIENTRY glMultiTexGeniEXT (GLenum texunit, GLenum coord, GLenum pname, GLint param);
+GLAPI void APIENTRY glMultiTexGenivEXT (GLenum texunit, GLenum coord, GLenum pname, const GLint *params);
+GLAPI void APIENTRY glGetMultiTexEnvfvEXT (GLenum texunit, GLenum target, GLenum pname, GLfloat *params); /* Get* forms take non-const params pointers */
+GLAPI void APIENTRY glGetMultiTexEnvivEXT (GLenum texunit, GLenum target, GLenum pname, GLint *params);
+GLAPI void APIENTRY glGetMultiTexGendvEXT (GLenum texunit, GLenum coord, GLenum pname, GLdouble *params);
+GLAPI void APIENTRY glGetMultiTexGenfvEXT (GLenum texunit, GLenum coord, GLenum pname, GLfloat *params);
+GLAPI void APIENTRY glGetMultiTexGenivEXT (GLenum texunit, GLenum coord, GLenum pname, GLint *params);
+GLAPI void APIENTRY glMultiTexParameteriEXT (GLenum texunit, GLenum target, GLenum pname, GLint param);
+GLAPI void APIENTRY glMultiTexParameterivEXT (GLenum texunit, GLenum target, GLenum pname, const GLint *params);
+GLAPI void APIENTRY glMultiTexParameterfEXT (GLenum texunit, GLenum target, GLenum pname, GLfloat param);
+GLAPI void APIENTRY glMultiTexParameterfvEXT (GLenum texunit, GLenum target, GLenum pname, const GLfloat *params);
+GLAPI void APIENTRY glMultiTexImage1DEXT (GLenum texunit, GLenum target, GLint level, GLint internalformat, GLsizei width, GLint border, GLenum format, GLenum type, const void *pixels); /* internalformat is GLint in the MultiTexImage prototypes but GLenum in the CopyMultiTexImage prototypes below */
+GLAPI void APIENTRY glMultiTexImage2DEXT (GLenum texunit, GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const void *pixels);
+GLAPI void APIENTRY glMultiTexSubImage1DEXT (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const void *pixels);
+GLAPI void APIENTRY glMultiTexSubImage2DEXT (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pixels);
+GLAPI void APIENTRY glCopyMultiTexImage1DEXT (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLint border);
+GLAPI void APIENTRY glCopyMultiTexImage2DEXT (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border);
+GLAPI void APIENTRY glCopyMultiTexSubImage1DEXT (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width);
+GLAPI void APIENTRY glCopyMultiTexSubImage2DEXT (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height);
+GLAPI void APIENTRY glGetMultiTexImageEXT (GLenum texunit, GLenum target, GLint level, GLenum format, GLenum type, void *pixels);
+GLAPI void APIENTRY glGetMultiTexParameterfvEXT (GLenum texunit, GLenum target, GLenum pname, GLfloat *params);
+GLAPI void APIENTRY glGetMultiTexParameterivEXT (GLenum texunit, GLenum target, GLenum pname, GLint *params);
+GLAPI void APIENTRY glGetMultiTexLevelParameterfvEXT (GLenum texunit, GLenum target, GLint level, GLenum pname, GLfloat *params);
+GLAPI void APIENTRY glGetMultiTexLevelParameterivEXT (GLenum texunit, GLenum target, GLint level, GLenum pname, GLint *params);
+GLAPI void APIENTRY glMultiTexImage3DEXT (GLenum texunit, GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
+GLAPI void APIENTRY glMultiTexSubImage3DEXT (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *pixels);
+GLAPI void APIENTRY glCopyMultiTexSubImage3DEXT (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height); /* width and height only; no depth argument */
+GLAPI void APIENTRY glEnableClientStateIndexedEXT (GLenum array, GLuint index); /* the ClientState forms name the first argument 'array'; the rest of this group uses 'target' */
+GLAPI void APIENTRY glDisableClientStateIndexedEXT (GLenum array, GLuint index);
+GLAPI void APIENTRY glGetFloatIndexedvEXT (GLenum target, GLuint index, GLfloat *data); /* Get*IndexedvEXT: (target, index, pointer to the result type) */
+GLAPI void APIENTRY glGetDoubleIndexedvEXT (GLenum target, GLuint index, GLdouble *data);
+GLAPI void APIENTRY glGetPointerIndexedvEXT (GLenum target, GLuint index, void **data);
+GLAPI void APIENTRY glEnableIndexedEXT (GLenum target, GLuint index);
+GLAPI void APIENTRY glDisableIndexedEXT (GLenum target, GLuint index);
+GLAPI GLboolean APIENTRY glIsEnabledIndexedEXT (GLenum target, GLuint index); /* only prototype in this group with a non-void return type (GLboolean) */
+GLAPI void APIENTRY glGetIntegerIndexedvEXT (GLenum target, GLuint index, GLint *data);
+GLAPI void APIENTRY glGetBooleanIndexedvEXT (GLenum target, GLuint index, GLboolean *data);
+GLAPI void APIENTRY glCompressedTextureImage3DEXT (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const void *bits); /* compressed forms are listed in 3D, 2D, 1D order and pass data as (GLsizei imageSize, const void *bits) */
+GLAPI void APIENTRY glCompressedTextureImage2DEXT (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const void *bits);
+GLAPI void APIENTRY glCompressedTextureImage1DEXT (GLuint texture, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const void *bits);
+GLAPI void APIENTRY glCompressedTextureSubImage3DEXT (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const void *bits); /* SubImage forms take GLenum format and no internalformat or border */
+GLAPI void APIENTRY glCompressedTextureSubImage2DEXT (GLuint texture, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const void *bits);
+GLAPI void APIENTRY glCompressedTextureSubImage1DEXT (GLuint texture, GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, const void *bits);
+GLAPI void APIENTRY glGetCompressedTextureImageEXT (GLuint texture, GLenum target, GLint lod, void *img); /* third parameter is named 'lod' in the two GetCompressed*ImageEXT prototypes, 'level' in the rest of this group */
+GLAPI void APIENTRY glCompressedMultiTexImage3DEXT (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const void *bits); /* MultiTex forms: same parameter lists with GLenum texunit in place of GLuint texture */
+GLAPI void APIENTRY glCompressedMultiTexImage2DEXT (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const void *bits);
+GLAPI void APIENTRY glCompressedMultiTexImage1DEXT (GLenum texunit, GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const void *bits);
+GLAPI void APIENTRY glCompressedMultiTexSubImage3DEXT (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const void *bits);
+GLAPI void APIENTRY glCompressedMultiTexSubImage2DEXT (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const void *bits);
+GLAPI void APIENTRY glCompressedMultiTexSubImage1DEXT (GLenum texunit, GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, const void *bits);
+GLAPI void APIENTRY glGetCompressedMultiTexImageEXT (GLenum texunit, GLenum target, GLint lod, void *img);
+GLAPI void APIENTRY glMatrixLoadTransposefEXT (GLenum mode, const GLfloat *m); /* Transpose forms: (mode, const pointer), in GLfloat and GLdouble variants */
+GLAPI void APIENTRY glMatrixLoadTransposedEXT (GLenum mode, const GLdouble *m);
+GLAPI void APIENTRY glMatrixMultTransposefEXT (GLenum mode, const GLfloat *m);
+GLAPI void APIENTRY glMatrixMultTransposedEXT (GLenum mode, const GLdouble *m);
+GLAPI void APIENTRY glNamedBufferDataEXT (GLuint buffer, GLsizeiptr size, const void *data, GLenum usage); /* NamedBuffer* prototypes take GLuint buffer first */
+GLAPI void APIENTRY glNamedBufferSubDataEXT (GLuint buffer, GLintptr offset, GLsizeiptr size, const void *data);
+GLAPI void *APIENTRY glMapNamedBufferEXT (GLuint buffer, GLenum access); /* returns void * */
+GLAPI GLboolean APIENTRY glUnmapNamedBufferEXT (GLuint buffer); /* returns GLboolean */
+GLAPI void APIENTRY glGetNamedBufferParameterivEXT (GLuint buffer, GLenum pname, GLint *params);
+GLAPI void APIENTRY glGetNamedBufferPointervEXT (GLuint buffer, GLenum pname, void **params);
+GLAPI void APIENTRY glGetNamedBufferSubDataEXT (GLuint buffer, GLintptr offset, GLsizeiptr size, void *data); /* data is non-const here, const in glNamedBufferSubDataEXT */
+GLAPI void APIENTRY glProgramUniform1fEXT (GLuint program, GLint location, GLfloat v0); /* ProgramUniform{1-4}{f,i}: one to four values passed by value after (program, location) */
+GLAPI void APIENTRY glProgramUniform2fEXT (GLuint program, GLint location, GLfloat v0, GLfloat v1);
+GLAPI void APIENTRY glProgramUniform3fEXT (GLuint program, GLint location, GLfloat v0, GLfloat v1, GLfloat v2);
+GLAPI void APIENTRY glProgramUniform4fEXT (GLuint program, GLint location, GLfloat v0, GLfloat v1, GLfloat v2, GLfloat v3);
+GLAPI void APIENTRY glProgramUniform1iEXT (GLuint program, GLint location, GLint v0);
+GLAPI void APIENTRY glProgramUniform2iEXT (GLuint program, GLint location, GLint v0, GLint v1);
+GLAPI void APIENTRY glProgramUniform3iEXT (GLuint program, GLint location, GLint v0, GLint v1, GLint v2);
+GLAPI void APIENTRY glProgramUniform4iEXT (GLuint program, GLint location, GLint v0, GLint v1, GLint v2, GLint v3);
+GLAPI void APIENTRY glProgramUniform1fvEXT (GLuint program, GLint location, GLsizei count, const GLfloat *value); /* fv/iv forms: (program, location, count, const pointer) */
+GLAPI void APIENTRY glProgramUniform2fvEXT (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+GLAPI void APIENTRY glProgramUniform3fvEXT (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+GLAPI void APIENTRY glProgramUniform4fvEXT (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+GLAPI void APIENTRY glProgramUniform1ivEXT (GLuint program, GLint location, GLsizei count, const GLint *value);
+GLAPI void APIENTRY glProgramUniform2ivEXT (GLuint program, GLint location, GLsizei count, const GLint *value);
+GLAPI void APIENTRY glProgramUniform3ivEXT (GLuint program, GLint location, GLsizei count, const GLint *value);
+GLAPI void APIENTRY glProgramUniform4ivEXT (GLuint program, GLint location, GLsizei count, const GLint *value);
+GLAPI void APIENTRY glProgramUniformMatrix2fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); /* Matrix forms add a GLboolean transpose before the value pointer */
+GLAPI void APIENTRY glProgramUniformMatrix3fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GLAPI void APIENTRY glProgramUniformMatrix4fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GLAPI void APIENTRY glProgramUniformMatrix2x3fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); /* non-square forms are listed 2x3, 3x2, 2x4, 4x2, 3x4, 4x3 */
+GLAPI void APIENTRY glProgramUniformMatrix3x2fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GLAPI void APIENTRY glProgramUniformMatrix2x4fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GLAPI void APIENTRY glProgramUniformMatrix4x2fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GLAPI void APIENTRY glProgramUniformMatrix3x4fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GLAPI void APIENTRY glProgramUniformMatrix4x3fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GLAPI void APIENTRY glTextureBufferEXT (GLuint texture, GLenum target, GLenum internalformat, GLuint buffer); /* Texture/MultiTex pairs below differ only in the first argument (GLuint texture vs GLenum texunit) */
+GLAPI void APIENTRY glMultiTexBufferEXT (GLenum texunit, GLenum target, GLenum internalformat, GLuint buffer);
+GLAPI void APIENTRY glTextureParameterIivEXT (GLuint texture, GLenum target, GLenum pname, const GLint *params); /* Iiv forms take GLint arrays, Iuiv forms take GLuint arrays */
+GLAPI void APIENTRY glTextureParameterIuivEXT (GLuint texture, GLenum target, GLenum pname, const GLuint *params);
+GLAPI void APIENTRY glGetTextureParameterIivEXT (GLuint texture, GLenum target, GLenum pname, GLint *params);
+GLAPI void APIENTRY glGetTextureParameterIuivEXT (GLuint texture, GLenum target, GLenum pname, GLuint *params);
+GLAPI void APIENTRY glMultiTexParameterIivEXT (GLenum texunit, GLenum target, GLenum pname, const GLint *params);
+GLAPI void APIENTRY glMultiTexParameterIuivEXT (GLenum texunit, GLenum target, GLenum pname, const GLuint *params);
+GLAPI void APIENTRY glGetMultiTexParameterIivEXT (GLenum texunit, GLenum target, GLenum pname, GLint *params);
+GLAPI void APIENTRY glGetMultiTexParameterIuivEXT (GLenum texunit, GLenum target, GLenum pname, GLuint *params);
+GLAPI void APIENTRY glProgramUniform1uiEXT (GLuint program, GLint location, GLuint v0); /* ui/uiv forms: same shapes as the i/iv forms above with GLuint values */
+GLAPI void APIENTRY glProgramUniform2uiEXT (GLuint program, GLint location, GLuint v0, GLuint v1);
+GLAPI void APIENTRY glProgramUniform3uiEXT (GLuint program, GLint location, GLuint v0, GLuint v1, GLuint v2);
+GLAPI void APIENTRY glProgramUniform4uiEXT (GLuint program, GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
+GLAPI void APIENTRY glProgramUniform1uivEXT (GLuint program, GLint location, GLsizei count, const GLuint *value);
+GLAPI void APIENTRY glProgramUniform2uivEXT (GLuint program, GLint location, GLsizei count, const GLuint *value);
+GLAPI void APIENTRY glProgramUniform3uivEXT (GLuint program, GLint location, GLsizei count, const GLuint *value);
+GLAPI void APIENTRY glProgramUniform4uivEXT (GLuint program, GLint location, GLsizei count, const GLuint *value);
+GLAPI void APIENTRY glNamedProgramLocalParameters4fvEXT (GLuint program, GLenum target, GLuint index, GLsizei count, const GLfloat *params); /* NamedProgram* prototypes take (GLuint program, GLenum target, ...); the plural 'Parameters' forms add a GLsizei count */
+GLAPI void APIENTRY glNamedProgramLocalParameterI4iEXT (GLuint program, GLenum target, GLuint index, GLint x, GLint y, GLint z, GLint w);
+GLAPI void APIENTRY glNamedProgramLocalParameterI4ivEXT (GLuint program, GLenum target, GLuint index, const GLint *params);
+GLAPI void APIENTRY glNamedProgramLocalParametersI4ivEXT (GLuint program, GLenum target, GLuint index, GLsizei count, const GLint *params);
+GLAPI void APIENTRY glNamedProgramLocalParameterI4uiEXT (GLuint program, GLenum target, GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
+GLAPI void APIENTRY glNamedProgramLocalParameterI4uivEXT (GLuint program, GLenum target, GLuint index, const GLuint *params);
+GLAPI void APIENTRY glNamedProgramLocalParametersI4uivEXT (GLuint program, GLenum target, GLuint index, GLsizei count, const GLuint *params);
+GLAPI void APIENTRY glGetNamedProgramLocalParameterIivEXT (GLuint program, GLenum target, GLuint index, GLint *params);
+GLAPI void APIENTRY glGetNamedProgramLocalParameterIuivEXT (GLuint program, GLenum target, GLuint index, GLuint *params);
+GLAPI void APIENTRY glEnableClientStateiEXT (GLenum array, GLuint index); /* these five i / i_v prototypes take no program argument: (GLenum, GLuint index[, result pointer]) */
+GLAPI void APIENTRY glDisableClientStateiEXT (GLenum array, GLuint index);
+GLAPI void APIENTRY glGetFloati_vEXT (GLenum pname, GLuint index, GLfloat *params);
+GLAPI void APIENTRY glGetDoublei_vEXT (GLenum pname, GLuint index, GLdouble *params);
+GLAPI void APIENTRY glGetPointeri_vEXT (GLenum pname, GLuint index, void **params);
+GLAPI void APIENTRY glNamedProgramStringEXT (GLuint program, GLenum target, GLenum format, GLsizei len, const void *string); /* string is passed as (GLsizei len, const void *) */
+GLAPI void APIENTRY glNamedProgramLocalParameter4dEXT (GLuint program, GLenum target, GLuint index, GLdouble x, GLdouble y, GLdouble z, GLdouble w);
+GLAPI void APIENTRY glNamedProgramLocalParameter4dvEXT (GLuint program, GLenum target, GLuint index, const GLdouble *params);
+GLAPI void APIENTRY glNamedProgramLocalParameter4fEXT (GLuint program, GLenum target, GLuint index, GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+GLAPI void APIENTRY glNamedProgramLocalParameter4fvEXT (GLuint program, GLenum target, GLuint index, const GLfloat *params);
+GLAPI void APIENTRY glGetNamedProgramLocalParameterdvEXT (GLuint program, GLenum target, GLuint index, GLdouble *params);
+GLAPI void APIENTRY glGetNamedProgramLocalParameterfvEXT (GLuint program, GLenum target, GLuint index, GLfloat *params);
+GLAPI void APIENTRY glGetNamedProgramivEXT (GLuint program, GLenum target, GLenum pname, GLint *params);
+GLAPI void APIENTRY glGetNamedProgramStringEXT (GLuint program, GLenum target, GLenum pname, void *string); /* no length argument; string is a non-const void * */
+GLAPI void APIENTRY glNamedRenderbufferStorageEXT (GLuint renderbuffer, GLenum internalformat, GLsizei width, GLsizei height); /* NamedRenderbuffer* prototypes take GLuint renderbuffer first */
+GLAPI void APIENTRY glGetNamedRenderbufferParameterivEXT (GLuint renderbuffer, GLenum pname, GLint *params);
+GLAPI void APIENTRY glNamedRenderbufferStorageMultisampleEXT (GLuint renderbuffer, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
+GLAPI void APIENTRY glNamedRenderbufferStorageMultisampleCoverageEXT (GLuint renderbuffer, GLsizei coverageSamples, GLsizei colorSamples, GLenum internalformat, GLsizei width, GLsizei height); /* two sample counts (coverageSamples, colorSamples) instead of one */
+GLAPI GLenum APIENTRY glCheckNamedFramebufferStatusEXT (GLuint framebuffer, GLenum target); /* only prototype in this group with a non-void return type (GLenum) */
+GLAPI void APIENTRY glNamedFramebufferTexture1DEXT (GLuint framebuffer, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
+GLAPI void APIENTRY glNamedFramebufferTexture2DEXT (GLuint framebuffer, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
+GLAPI void APIENTRY glNamedFramebufferTexture3DEXT (GLuint framebuffer, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset); /* 3D form adds a trailing GLint zoffset */
+GLAPI void APIENTRY glNamedFramebufferRenderbufferEXT (GLuint framebuffer, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer);
+GLAPI void APIENTRY glGetNamedFramebufferAttachmentParameterivEXT (GLuint framebuffer, GLenum attachment, GLenum pname, GLint *params);
+GLAPI void APIENTRY glGenerateTextureMipmapEXT (GLuint texture, GLenum target);
+GLAPI void APIENTRY glGenerateMultiTexMipmapEXT (GLenum texunit, GLenum target); /* MultiTex form: GLenum texunit in place of GLuint texture */
+GLAPI void APIENTRY glFramebufferDrawBufferEXT (GLuint framebuffer, GLenum mode);
+GLAPI void APIENTRY glFramebufferDrawBuffersEXT (GLuint framebuffer, GLsizei n, const GLenum *bufs);
+GLAPI void APIENTRY glFramebufferReadBufferEXT (GLuint framebuffer, GLenum mode);
+GLAPI void APIENTRY glGetFramebufferParameterivEXT (GLuint framebuffer, GLenum pname, GLint *params);
+GLAPI void APIENTRY glNamedCopyBufferSubDataEXT (GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size); /* both offsets are GLintptr, size is GLsizeiptr */
+GLAPI void APIENTRY glNamedFramebufferTextureEXT (GLuint framebuffer, GLenum attachment, GLuint texture, GLint level); /* no textarget argument, unlike the Texture1D/2D/3D forms above */
+GLAPI void APIENTRY glNamedFramebufferTextureLayerEXT (GLuint framebuffer, GLenum attachment, GLuint texture, GLint level, GLint layer);
+GLAPI void APIENTRY glNamedFramebufferTextureFaceEXT (GLuint framebuffer, GLenum attachment, GLuint texture, GLint level, GLenum face);
+GLAPI void APIENTRY glTextureRenderbufferEXT (GLuint texture, GLenum target, GLuint renderbuffer);
+GLAPI void APIENTRY glMultiTexRenderbufferEXT (GLenum texunit, GLenum target, GLuint renderbuffer);
+GLAPI void APIENTRY glVertexArrayVertexOffsetEXT (GLuint vaobj, GLuint buffer, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArrayColorOffsetEXT (GLuint vaobj, GLuint buffer, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArrayEdgeFlagOffsetEXT (GLuint vaobj, GLuint buffer, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArrayIndexOffsetEXT (GLuint vaobj, GLuint buffer, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArrayNormalOffsetEXT (GLuint vaobj, GLuint buffer, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArrayTexCoordOffsetEXT (GLuint vaobj, GLuint buffer, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArrayMultiTexCoordOffsetEXT (GLuint vaobj, GLuint buffer, GLenum texunit, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArrayFogCoordOffsetEXT (GLuint vaobj, GLuint buffer, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArraySecondaryColorOffsetEXT (GLuint vaobj, GLuint buffer, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArrayVertexAttribOffsetEXT (GLuint vaobj, GLuint buffer, GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glVertexArrayVertexAttribIOffsetEXT (GLuint vaobj, GLuint buffer, GLuint index, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glEnableVertexArrayEXT (GLuint vaobj, GLenum array);
+GLAPI void APIENTRY glDisableVertexArrayEXT (GLuint vaobj, GLenum array);
+GLAPI void APIENTRY glEnableVertexArrayAttribEXT (GLuint vaobj, GLuint index);
+GLAPI void APIENTRY glDisableVertexArrayAttribEXT (GLuint vaobj, GLuint index);
+GLAPI void APIENTRY glGetVertexArrayIntegervEXT (GLuint vaobj, GLenum pname, GLint *param);
+GLAPI void APIENTRY glGetVertexArrayPointervEXT (GLuint vaobj, GLenum pname, void **param);
+GLAPI void APIENTRY glGetVertexArrayIntegeri_vEXT (GLuint vaobj, GLuint index, GLenum pname, GLint *param);
+GLAPI void APIENTRY glGetVertexArrayPointeri_vEXT (GLuint vaobj, GLuint index, GLenum pname, void **param);
+GLAPI void *APIENTRY glMapNamedBufferRangeEXT (GLuint buffer, GLintptr offset, GLsizeiptr length, GLbitfield access);
+GLAPI void APIENTRY glFlushMappedNamedBufferRangeEXT (GLuint buffer, GLintptr offset, GLsizeiptr length);
+GLAPI void APIENTRY glNamedBufferStorageEXT (GLuint buffer, GLsizeiptr size, const void *data, GLbitfield flags);
+GLAPI void APIENTRY glClearNamedBufferDataEXT (GLuint buffer, GLenum internalformat, GLenum format, GLenum type, const void *data);
+GLAPI void APIENTRY glClearNamedBufferSubDataEXT (GLuint buffer, GLenum internalformat, GLsizeiptr offset, GLsizeiptr size, GLenum format, GLenum type, const void *data);
+GLAPI void APIENTRY glNamedFramebufferParameteriEXT (GLuint framebuffer, GLenum pname, GLint param);
+GLAPI void APIENTRY glGetNamedFramebufferParameterivEXT (GLuint framebuffer, GLenum pname, GLint *params);
+GLAPI void APIENTRY glProgramUniform1dEXT (GLuint program, GLint location, GLdouble x);
+GLAPI void APIENTRY glProgramUniform2dEXT (GLuint program, GLint location, GLdouble x, GLdouble y);
+GLAPI void APIENTRY glProgramUniform3dEXT (GLuint program, GLint location, GLdouble x, GLdouble y, GLdouble z);
+GLAPI void APIENTRY glProgramUniform4dEXT (GLuint program, GLint location, GLdouble x, GLdouble y, GLdouble z, GLdouble w);
+GLAPI void APIENTRY glProgramUniform1dvEXT (GLuint program, GLint location, GLsizei count, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniform2dvEXT (GLuint program, GLint location, GLsizei count, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniform3dvEXT (GLuint program, GLint location, GLsizei count, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniform4dvEXT (GLuint program, GLint location, GLsizei count, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniformMatrix2dvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniformMatrix3dvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniformMatrix4dvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniformMatrix2x3dvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniformMatrix2x4dvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniformMatrix3x2dvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniformMatrix3x4dvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniformMatrix4x2dvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+GLAPI void APIENTRY glProgramUniformMatrix4x3dvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLdouble *value);
+GLAPI void APIENTRY glTextureBufferRangeEXT (GLuint texture, GLenum target, GLenum internalformat, GLuint buffer, GLintptr offset, GLsizeiptr size);
+GLAPI void APIENTRY glTextureStorage1DEXT (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
+GLAPI void APIENTRY glTextureStorage2DEXT (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
+GLAPI void APIENTRY glTextureStorage3DEXT (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
+GLAPI void APIENTRY glTextureStorage2DMultisampleEXT (GLuint texture, GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLboolean fixedsamplelocations);
+GLAPI void APIENTRY glTextureStorage3DMultisampleEXT (GLuint texture, GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedsamplelocations);
+GLAPI void APIENTRY glVertexArrayBindVertexBufferEXT (GLuint vaobj, GLuint bindingindex, GLuint buffer, GLintptr offset, GLsizei stride);
+GLAPI void APIENTRY glVertexArrayVertexAttribFormatEXT (GLuint vaobj, GLuint attribindex, GLint size, GLenum type, GLboolean normalized, GLuint relativeoffset);
+GLAPI void APIENTRY glVertexArrayVertexAttribIFormatEXT (GLuint vaobj, GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
+GLAPI void APIENTRY glVertexArrayVertexAttribLFormatEXT (GLuint vaobj, GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
+GLAPI void APIENTRY glVertexArrayVertexAttribBindingEXT (GLuint vaobj, GLuint attribindex, GLuint bindingindex);
+GLAPI void APIENTRY glVertexArrayVertexBindingDivisorEXT (GLuint vaobj, GLuint bindingindex, GLuint divisor);
+GLAPI void APIENTRY glVertexArrayVertexAttribLOffsetEXT (GLuint vaobj, GLuint buffer, GLuint index, GLint size, GLenum type, GLsizei stride, GLintptr offset);
+GLAPI void APIENTRY glTexturePageCommitmentEXT (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean commit);
+GLAPI void APIENTRY glVertexArrayVertexAttribDivisorEXT (GLuint vaobj, GLuint index, GLuint divisor);
+#endif
+#endif /* GL_EXT_direct_state_access */
+
+#ifndef GL_EXT_draw_instanced /* Section guard: declares two instanced draw entry points (arrays and elements); no enum tokens. */
+#define GL_EXT_draw_instanced 1 /* Presence macro; also prevents this section from being declared twice. */
+typedef void (APIENTRYP PFNGLDRAWARRAYSINSTANCEDEXTPROC) (GLenum mode, GLint start, GLsizei count, GLsizei primcount);
+typedef void (APIENTRYP PFNGLDRAWELEMENTSINSTANCEDEXTPROC) (GLenum mode, GLsizei count, GLenum type, const void *indices, GLsizei primcount);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the function-pointer typedefs above are declared regardless. */
+GLAPI void APIENTRY glDrawArraysInstancedEXT (GLenum mode, GLint start, GLsizei count, GLsizei primcount);
+GLAPI void APIENTRY glDrawElementsInstancedEXT (GLenum mode, GLsizei count, GLenum type, const void *indices, GLsizei primcount);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_EXT_draw_instanced */
+
+#ifndef GL_EXT_polygon_offset_clamp /* Section guard: one enum token and one entry point taking three GLfloat arguments. */
+#define GL_EXT_polygon_offset_clamp 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_POLYGON_OFFSET_CLAMP_EXT       0x8E1B
+typedef void (APIENTRYP PFNGLPOLYGONOFFSETCLAMPEXTPROC) (GLfloat factor, GLfloat units, GLfloat clamp);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototype is opt-in; the function-pointer typedef above is declared regardless. */
+GLAPI void APIENTRY glPolygonOffsetClampEXT (GLfloat factor, GLfloat units, GLfloat clamp);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_EXT_polygon_offset_clamp */
+
+#ifndef GL_EXT_post_depth_coverage /* Section guard: this section declares only the presence macro below. */
+#define GL_EXT_post_depth_coverage 1 /* No enum tokens or entry points are declared for this extension here. */
+#endif /* GL_EXT_post_depth_coverage */
+
+#ifndef GL_EXT_raster_multisample /* Section guard: six enum tokens (consecutive values 0x9327..0x932C) and one entry point. */
+#define GL_EXT_raster_multisample 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_RASTER_MULTISAMPLE_EXT         0x9327
+#define GL_RASTER_SAMPLES_EXT             0x9328
+#define GL_MAX_RASTER_SAMPLES_EXT         0x9329
+#define GL_RASTER_FIXED_SAMPLE_LOCATIONS_EXT 0x932A
+#define GL_MULTISAMPLE_RASTERIZATION_ALLOWED_EXT 0x932B
+#define GL_EFFECTIVE_RASTER_SAMPLES_EXT   0x932C
+typedef void (APIENTRYP PFNGLRASTERSAMPLESEXTPROC) (GLuint samples, GLboolean fixedsamplelocations);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototype is opt-in; the function-pointer typedef above is declared regardless. */
+GLAPI void APIENTRY glRasterSamplesEXT (GLuint samples, GLboolean fixedsamplelocations);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_EXT_raster_multisample */
+
+#ifndef GL_EXT_separate_shader_objects /* Section guard: one enum token and three entry points. */
+#define GL_EXT_separate_shader_objects 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_ACTIVE_PROGRAM_EXT             0x8B8D
+typedef void (APIENTRYP PFNGLUSESHADERPROGRAMEXTPROC) (GLenum type, GLuint program);
+typedef void (APIENTRYP PFNGLACTIVEPROGRAMEXTPROC) (GLuint program);
+typedef GLuint (APIENTRYP PFNGLCREATESHADERPROGRAMEXTPROC) (GLenum type, const GLchar *string); /* The only entry point here with a return value (GLuint). */
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the function-pointer typedefs above are declared regardless. */
+GLAPI void APIENTRY glUseShaderProgramEXT (GLenum type, GLuint program);
+GLAPI void APIENTRY glActiveProgramEXT (GLuint program);
+GLAPI GLuint APIENTRY glCreateShaderProgramEXT (GLenum type, const GLchar *string);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_EXT_separate_shader_objects */
+
+#ifndef GL_EXT_shader_integer_mix /* Section guard: this section declares only the presence macro below. */
+#define GL_EXT_shader_integer_mix 1 /* No enum tokens or entry points are declared for this extension here. */
+#endif /* GL_EXT_shader_integer_mix */
+
+#ifndef GL_EXT_texture_compression_s3tc /* Section guard: token-only section, four enum values 0x83F0..0x83F3; no entry points. */
+#define GL_EXT_texture_compression_s3tc 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_COMPRESSED_RGB_S3TC_DXT1_EXT   0x83F0
+#define GL_COMPRESSED_RGBA_S3TC_DXT1_EXT  0x83F1
+#define GL_COMPRESSED_RGBA_S3TC_DXT3_EXT  0x83F2
+#define GL_COMPRESSED_RGBA_S3TC_DXT5_EXT  0x83F3
+#endif /* GL_EXT_texture_compression_s3tc */
+
+#ifndef GL_EXT_texture_filter_minmax /* Section guard: this section declares only the presence macro below. */
+#define GL_EXT_texture_filter_minmax 1 /* No enum tokens or entry points are declared for this extension here. */
+#endif /* GL_EXT_texture_filter_minmax */
+
+#ifndef GL_EXT_texture_sRGB_decode /* Section guard: token-only section, three enum values 0x8A48..0x8A4A; no entry points. */
+#define GL_EXT_texture_sRGB_decode 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_TEXTURE_SRGB_DECODE_EXT        0x8A48
+#define GL_DECODE_EXT                     0x8A49
+#define GL_SKIP_DECODE_EXT                0x8A4A
+#endif /* GL_EXT_texture_sRGB_decode */
+
+#ifndef GL_EXT_window_rectangles /* Section guard: six enum tokens (consecutive values 0x8F10..0x8F15) and one entry point. */
+#define GL_EXT_window_rectangles 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_INCLUSIVE_EXT                  0x8F10
+#define GL_EXCLUSIVE_EXT                  0x8F11
+#define GL_WINDOW_RECTANGLE_EXT           0x8F12
+#define GL_WINDOW_RECTANGLE_MODE_EXT      0x8F13
+#define GL_MAX_WINDOW_RECTANGLES_EXT      0x8F14
+#define GL_NUM_WINDOW_RECTANGLES_EXT      0x8F15
+typedef void (APIENTRYP PFNGLWINDOWRECTANGLESEXTPROC) (GLenum mode, GLsizei count, const GLint *box); /* NOTE(review): layout of the box array is not visible here; presumably count entries of several GLint each - confirm against the extension spec. */
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototype is opt-in; the function-pointer typedef above is declared regardless. */
+GLAPI void APIENTRY glWindowRectanglesEXT (GLenum mode, GLsizei count, const GLint *box);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_EXT_window_rectangles */
+
+#ifndef GL_INTEL_conservative_rasterization /* Section guard: token-only section with a single enum value; no entry points. */
+#define GL_INTEL_conservative_rasterization 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_CONSERVATIVE_RASTERIZATION_INTEL 0x83FE
+#endif /* GL_INTEL_conservative_rasterization */
+
+#ifndef GL_INTEL_framebuffer_CMAA /* Section guard: one entry point that takes no arguments and returns nothing; no enum tokens. */
+#define GL_INTEL_framebuffer_CMAA 1 /* Presence macro; also prevents this section from being declared twice. */
+typedef void (APIENTRYP PFNGLAPPLYFRAMEBUFFERATTACHMENTCMAAINTELPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototype is opt-in; the function-pointer typedef above is declared regardless. */
+GLAPI void APIENTRY glApplyFramebufferAttachmentCMAAINTEL (void);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_INTEL_framebuffer_CMAA */
+
+#ifndef GL_INTEL_performance_query /* Section guard: twenty enum tokens and ten entry points, all returning void with results delivered through pointer parameters. */
+#define GL_INTEL_performance_query 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_PERFQUERY_SINGLE_CONTEXT_INTEL 0x00000000 /* Unlike the tokens below, these two are the small values 0 and 1. */
+#define GL_PERFQUERY_GLOBAL_CONTEXT_INTEL 0x00000001
+#define GL_PERFQUERY_WAIT_INTEL           0x83FB /* The three 0x83F9..0x83FB tokens are listed in descending value order. */
+#define GL_PERFQUERY_FLUSH_INTEL          0x83FA
+#define GL_PERFQUERY_DONOT_FLUSH_INTEL    0x83F9
+#define GL_PERFQUERY_COUNTER_EVENT_INTEL  0x94F0
+#define GL_PERFQUERY_COUNTER_DURATION_NORM_INTEL 0x94F1
+#define GL_PERFQUERY_COUNTER_DURATION_RAW_INTEL 0x94F2
+#define GL_PERFQUERY_COUNTER_THROUGHPUT_INTEL 0x94F3
+#define GL_PERFQUERY_COUNTER_RAW_INTEL    0x94F4
+#define GL_PERFQUERY_COUNTER_TIMESTAMP_INTEL 0x94F5
+#define GL_PERFQUERY_COUNTER_DATA_UINT32_INTEL 0x94F8 /* Values 0x94F6 and 0x94F7 are not defined in this section. */
+#define GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL 0x94F9
+#define GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL 0x94FA
+#define GL_PERFQUERY_COUNTER_DATA_DOUBLE_INTEL 0x94FB
+#define GL_PERFQUERY_COUNTER_DATA_BOOL32_INTEL 0x94FC
+#define GL_PERFQUERY_QUERY_NAME_LENGTH_MAX_INTEL 0x94FD
+#define GL_PERFQUERY_COUNTER_NAME_LENGTH_MAX_INTEL 0x94FE
+#define GL_PERFQUERY_COUNTER_DESC_LENGTH_MAX_INTEL 0x94FF
+#define GL_PERFQUERY_GPA_EXTENDED_COUNTERS_INTEL 0x9500
+typedef void (APIENTRYP PFNGLBEGINPERFQUERYINTELPROC) (GLuint queryHandle);
+typedef void (APIENTRYP PFNGLCREATEPERFQUERYINTELPROC) (GLuint queryId, GLuint *queryHandle);
+typedef void (APIENTRYP PFNGLDELETEPERFQUERYINTELPROC) (GLuint queryHandle);
+typedef void (APIENTRYP PFNGLENDPERFQUERYINTELPROC) (GLuint queryHandle);
+typedef void (APIENTRYP PFNGLGETFIRSTPERFQUERYIDINTELPROC) (GLuint *queryId);
+typedef void (APIENTRYP PFNGLGETNEXTPERFQUERYIDINTELPROC) (GLuint queryId, GLuint *nextQueryId);
+typedef void (APIENTRYP PFNGLGETPERFCOUNTERINFOINTELPROC) (GLuint queryId, GLuint counterId, GLuint counterNameLength, GLchar *counterName, GLuint counterDescLength, GLchar *counterDesc, GLuint *counterOffset, GLuint *counterDataSize, GLuint *counterTypeEnum, GLuint *counterDataTypeEnum, GLuint64 *rawCounterMaxValue);
+typedef void (APIENTRYP PFNGLGETPERFQUERYDATAINTELPROC) (GLuint queryHandle, GLuint flags, GLsizei dataSize, GLvoid *data, GLuint *bytesWritten);
+typedef void (APIENTRYP PFNGLGETPERFQUERYIDBYNAMEINTELPROC) (GLchar *queryName, GLuint *queryId); /* NOTE(review): queryName is declared as non-const GLchar pointer; presumably an input string - confirm against the extension spec. */
+typedef void (APIENTRYP PFNGLGETPERFQUERYINFOINTELPROC) (GLuint queryId, GLuint queryNameLength, GLchar *queryName, GLuint *dataSize, GLuint *noCounters, GLuint *noInstances, GLuint *capsMask);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the function-pointer typedefs above are declared regardless. */
+GLAPI void APIENTRY glBeginPerfQueryINTEL (GLuint queryHandle);
+GLAPI void APIENTRY glCreatePerfQueryINTEL (GLuint queryId, GLuint *queryHandle);
+GLAPI void APIENTRY glDeletePerfQueryINTEL (GLuint queryHandle);
+GLAPI void APIENTRY glEndPerfQueryINTEL (GLuint queryHandle);
+GLAPI void APIENTRY glGetFirstPerfQueryIdINTEL (GLuint *queryId);
+GLAPI void APIENTRY glGetNextPerfQueryIdINTEL (GLuint queryId, GLuint *nextQueryId);
+GLAPI void APIENTRY glGetPerfCounterInfoINTEL (GLuint queryId, GLuint counterId, GLuint counterNameLength, GLchar *counterName, GLuint counterDescLength, GLchar *counterDesc, GLuint *counterOffset, GLuint *counterDataSize, GLuint *counterTypeEnum, GLuint *counterDataTypeEnum, GLuint64 *rawCounterMaxValue);
+GLAPI void APIENTRY glGetPerfQueryDataINTEL (GLuint queryHandle, GLuint flags, GLsizei dataSize, GLvoid *data, GLuint *bytesWritten);
+GLAPI void APIENTRY glGetPerfQueryIdByNameINTEL (GLchar *queryName, GLuint *queryId);
+GLAPI void APIENTRY glGetPerfQueryInfoINTEL (GLuint queryId, GLuint queryNameLength, GLchar *queryName, GLuint *dataSize, GLuint *noCounters, GLuint *noInstances, GLuint *capsMask);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_INTEL_performance_query */
+
+#ifndef GL_NV_bindless_multi_draw_indirect /* Section guard: two entry points (arrays and elements variants); no enum tokens. */
+#define GL_NV_bindless_multi_draw_indirect 1 /* Presence macro; also prevents this section from being declared twice. */
+typedef void (APIENTRYP PFNGLMULTIDRAWARRAYSINDIRECTBINDLESSNVPROC) (GLenum mode, const void *indirect, GLsizei drawCount, GLsizei stride, GLint vertexBufferCount);
+typedef void (APIENTRYP PFNGLMULTIDRAWELEMENTSINDIRECTBINDLESSNVPROC) (GLenum mode, GLenum type, const void *indirect, GLsizei drawCount, GLsizei stride, GLint vertexBufferCount); /* Same parameters as the arrays variant plus a leading index type. */
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the function-pointer typedefs above are declared regardless. */
+GLAPI void APIENTRY glMultiDrawArraysIndirectBindlessNV (GLenum mode, const void *indirect, GLsizei drawCount, GLsizei stride, GLint vertexBufferCount);
+GLAPI void APIENTRY glMultiDrawElementsIndirectBindlessNV (GLenum mode, GLenum type, const void *indirect, GLsizei drawCount, GLsizei stride, GLint vertexBufferCount);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_bindless_multi_draw_indirect */
+
+#ifndef GL_NV_bindless_multi_draw_indirect_count /* Section guard: two entry points, each taking both drawCount and maxDrawCount; no enum tokens. */
+#define GL_NV_bindless_multi_draw_indirect_count 1 /* Presence macro; also prevents this section from being declared twice. */
+typedef void (APIENTRYP PFNGLMULTIDRAWARRAYSINDIRECTBINDLESSCOUNTNVPROC) (GLenum mode, const void *indirect, GLsizei drawCount, GLsizei maxDrawCount, GLsizei stride, GLint vertexBufferCount);
+typedef void (APIENTRYP PFNGLMULTIDRAWELEMENTSINDIRECTBINDLESSCOUNTNVPROC) (GLenum mode, GLenum type, const void *indirect, GLsizei drawCount, GLsizei maxDrawCount, GLsizei stride, GLint vertexBufferCount); /* Same parameters as the arrays variant plus a leading index type. */
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the function-pointer typedefs above are declared regardless. */
+GLAPI void APIENTRY glMultiDrawArraysIndirectBindlessCountNV (GLenum mode, const void *indirect, GLsizei drawCount, GLsizei maxDrawCount, GLsizei stride, GLint vertexBufferCount);
+GLAPI void APIENTRY glMultiDrawElementsIndirectBindlessCountNV (GLenum mode, GLenum type, const void *indirect, GLsizei drawCount, GLsizei maxDrawCount, GLsizei stride, GLint vertexBufferCount);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_bindless_multi_draw_indirect_count */
+
+#ifndef GL_NV_bindless_texture /* Section guard: thirteen entry points that pass handles as GLuint64 values; no enum tokens. */
+#define GL_NV_bindless_texture 1 /* Presence macro; also prevents this section from being declared twice. */
+typedef GLuint64 (APIENTRYP PFNGLGETTEXTUREHANDLENVPROC) (GLuint texture); /* The three Get...Handle entry points return the handle as GLuint64. */
+typedef GLuint64 (APIENTRYP PFNGLGETTEXTURESAMPLERHANDLENVPROC) (GLuint texture, GLuint sampler);
+typedef void (APIENTRYP PFNGLMAKETEXTUREHANDLERESIDENTNVPROC) (GLuint64 handle);
+typedef void (APIENTRYP PFNGLMAKETEXTUREHANDLENONRESIDENTNVPROC) (GLuint64 handle);
+typedef GLuint64 (APIENTRYP PFNGLGETIMAGEHANDLENVPROC) (GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum format);
+typedef void (APIENTRYP PFNGLMAKEIMAGEHANDLERESIDENTNVPROC) (GLuint64 handle, GLenum access); /* Unlike the texture-handle variant, this one also takes an access enum. */
+typedef void (APIENTRYP PFNGLMAKEIMAGEHANDLENONRESIDENTNVPROC) (GLuint64 handle);
+typedef void (APIENTRYP PFNGLUNIFORMHANDLEUI64NVPROC) (GLint location, GLuint64 value);
+typedef void (APIENTRYP PFNGLUNIFORMHANDLEUI64VNVPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMHANDLEUI64NVPROC) (GLuint program, GLint location, GLuint64 value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMHANDLEUI64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *values);
+typedef GLboolean (APIENTRYP PFNGLISTEXTUREHANDLERESIDENTNVPROC) (GLuint64 handle);
+typedef GLboolean (APIENTRYP PFNGLISIMAGEHANDLERESIDENTNVPROC) (GLuint64 handle);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the function-pointer typedefs above are declared regardless. */
+GLAPI GLuint64 APIENTRY glGetTextureHandleNV (GLuint texture);
+GLAPI GLuint64 APIENTRY glGetTextureSamplerHandleNV (GLuint texture, GLuint sampler);
+GLAPI void APIENTRY glMakeTextureHandleResidentNV (GLuint64 handle);
+GLAPI void APIENTRY glMakeTextureHandleNonResidentNV (GLuint64 handle);
+GLAPI GLuint64 APIENTRY glGetImageHandleNV (GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum format);
+GLAPI void APIENTRY glMakeImageHandleResidentNV (GLuint64 handle, GLenum access);
+GLAPI void APIENTRY glMakeImageHandleNonResidentNV (GLuint64 handle);
+GLAPI void APIENTRY glUniformHandleui64NV (GLint location, GLuint64 value);
+GLAPI void APIENTRY glUniformHandleui64vNV (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glProgramUniformHandleui64NV (GLuint program, GLint location, GLuint64 value);
+GLAPI void APIENTRY glProgramUniformHandleui64vNV (GLuint program, GLint location, GLsizei count, const GLuint64 *values);
+GLAPI GLboolean APIENTRY glIsTextureHandleResidentNV (GLuint64 handle);
+GLAPI GLboolean APIENTRY glIsImageHandleResidentNV (GLuint64 handle);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_bindless_texture */
+
+#ifndef GL_NV_blend_equation_advanced /* Section guard: forty-nine enum tokens, listed alphabetically by name rather than by value, and two entry points. */
+#define GL_NV_blend_equation_advanced 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_BLEND_OVERLAP_NV               0x9281
+#define GL_BLEND_PREMULTIPLIED_SRC_NV     0x9280
+#define GL_BLUE_NV                        0x1905 /* Outside the 0x92xx range used by most tokens here; presumably shares its value with a core token - confirm against the registry. */
+#define GL_COLORBURN_NV                   0x929A
+#define GL_COLORDODGE_NV                  0x9299
+#define GL_CONJOINT_NV                    0x9284
+#define GL_CONTRAST_NV                    0x92A1
+#define GL_DARKEN_NV                      0x9297
+#define GL_DIFFERENCE_NV                  0x929E
+#define GL_DISJOINT_NV                    0x9283
+#define GL_DST_ATOP_NV                    0x928F
+#define GL_DST_IN_NV                      0x928B
+#define GL_DST_NV                         0x9287
+#define GL_DST_OUT_NV                     0x928D
+#define GL_DST_OVER_NV                    0x9289
+#define GL_EXCLUSION_NV                   0x92A0
+#define GL_GREEN_NV                       0x1904 /* Outside the 0x92xx range; see the note on GL_BLUE_NV. */
+#define GL_HARDLIGHT_NV                   0x929B
+#define GL_HARDMIX_NV                     0x92A9
+#define GL_HSL_COLOR_NV                   0x92AF
+#define GL_HSL_HUE_NV                     0x92AD
+#define GL_HSL_LUMINOSITY_NV              0x92B0
+#define GL_HSL_SATURATION_NV              0x92AE
+#define GL_INVERT_OVG_NV                  0x92B4
+#define GL_INVERT_RGB_NV                  0x92A3
+#define GL_LIGHTEN_NV                     0x9298
+#define GL_LINEARBURN_NV                  0x92A5
+#define GL_LINEARDODGE_NV                 0x92A4
+#define GL_LINEARLIGHT_NV                 0x92A7
+#define GL_MINUS_CLAMPED_NV               0x92B3
+#define GL_MINUS_NV                       0x929F
+#define GL_MULTIPLY_NV                    0x9294
+#define GL_OVERLAY_NV                     0x9296
+#define GL_PINLIGHT_NV                    0x92A8
+#define GL_PLUS_CLAMPED_ALPHA_NV          0x92B2
+#define GL_PLUS_CLAMPED_NV                0x92B1
+#define GL_PLUS_DARKER_NV                 0x9292
+#define GL_PLUS_NV                        0x9291
+#define GL_RED_NV                         0x1903 /* Outside the 0x92xx range; see the note on GL_BLUE_NV. */
+#define GL_SCREEN_NV                      0x9295
+#define GL_SOFTLIGHT_NV                   0x929C
+#define GL_SRC_ATOP_NV                    0x928E
+#define GL_SRC_IN_NV                      0x928A
+#define GL_SRC_NV                         0x9286
+#define GL_SRC_OUT_NV                     0x928C
+#define GL_SRC_OVER_NV                    0x9288
+#define GL_UNCORRELATED_NV                0x9282
+#define GL_VIVIDLIGHT_NV                  0x92A6
+#define GL_XOR_NV                         0x1506 /* Outside the 0x92xx range; see the note on GL_BLUE_NV. */
+typedef void (APIENTRYP PFNGLBLENDPARAMETERINVPROC) (GLenum pname, GLint value);
+typedef void (APIENTRYP PFNGLBLENDBARRIERNVPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the function-pointer typedefs above are declared regardless. */
+GLAPI void APIENTRY glBlendParameteriNV (GLenum pname, GLint value);
+GLAPI void APIENTRY glBlendBarrierNV (void);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_blend_equation_advanced */
+
+#ifndef GL_NV_blend_equation_advanced_coherent /* Section guard: token-only section with a single enum value; no entry points. */
+#define GL_NV_blend_equation_advanced_coherent 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_BLEND_ADVANCED_COHERENT_NV     0x9285
+#endif /* GL_NV_blend_equation_advanced_coherent */
+
+#ifndef GL_NV_clip_space_w_scaling /* Section guard: three enum tokens (consecutive values 0x937C..0x937E) and one entry point. */
+#define GL_NV_clip_space_w_scaling 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_VIEWPORT_POSITION_W_SCALE_NV   0x937C
+#define GL_VIEWPORT_POSITION_W_SCALE_X_COEFF_NV 0x937D
+#define GL_VIEWPORT_POSITION_W_SCALE_Y_COEFF_NV 0x937E
+typedef void (APIENTRYP PFNGLVIEWPORTPOSITIONWSCALENVPROC) (GLuint index, GLfloat xcoeff, GLfloat ycoeff);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototype is opt-in; the function-pointer typedef above is declared regardless. */
+GLAPI void APIENTRY glViewportPositionWScaleNV (GLuint index, GLfloat xcoeff, GLfloat ycoeff);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_clip_space_w_scaling */
+
+#ifndef GL_NV_command_list /* Section guard: nineteen command tokens and seventeen entry points. */
+#define GL_NV_command_list 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_TERMINATE_SEQUENCE_COMMAND_NV  0x0000 /* The command tokens are small sequential values 0x0000..0x0012. */
+#define GL_NOP_COMMAND_NV                 0x0001
+#define GL_DRAW_ELEMENTS_COMMAND_NV       0x0002
+#define GL_DRAW_ARRAYS_COMMAND_NV         0x0003
+#define GL_DRAW_ELEMENTS_STRIP_COMMAND_NV 0x0004
+#define GL_DRAW_ARRAYS_STRIP_COMMAND_NV   0x0005
+#define GL_DRAW_ELEMENTS_INSTANCED_COMMAND_NV 0x0006
+#define GL_DRAW_ARRAYS_INSTANCED_COMMAND_NV 0x0007
+#define GL_ELEMENT_ADDRESS_COMMAND_NV     0x0008
+#define GL_ATTRIBUTE_ADDRESS_COMMAND_NV   0x0009
+#define GL_UNIFORM_ADDRESS_COMMAND_NV     0x000A
+#define GL_BLEND_COLOR_COMMAND_NV         0x000B
+#define GL_STENCIL_REF_COMMAND_NV         0x000C
+#define GL_LINE_WIDTH_COMMAND_NV          0x000D
+#define GL_POLYGON_OFFSET_COMMAND_NV      0x000E
+#define GL_ALPHA_REF_COMMAND_NV           0x000F
+#define GL_VIEWPORT_COMMAND_NV            0x0010
+#define GL_SCISSOR_COMMAND_NV             0x0011
+#define GL_FRONT_FACE_COMMAND_NV          0x0012
+typedef void (APIENTRYP PFNGLCREATESTATESNVPROC) (GLsizei n, GLuint *states);
+typedef void (APIENTRYP PFNGLDELETESTATESNVPROC) (GLsizei n, const GLuint *states);
+typedef GLboolean (APIENTRYP PFNGLISSTATENVPROC) (GLuint state);
+typedef void (APIENTRYP PFNGLSTATECAPTURENVPROC) (GLuint state, GLenum mode);
+typedef GLuint (APIENTRYP PFNGLGETCOMMANDHEADERNVPROC) (GLenum tokenID, GLuint size);
+typedef GLushort (APIENTRYP PFNGLGETSTAGEINDEXNVPROC) (GLenum shadertype);
+typedef void (APIENTRYP PFNGLDRAWCOMMANDSNVPROC) (GLenum primitiveMode, GLuint buffer, const GLintptr *indirects, const GLsizei *sizes, GLuint count); /* Buffer variant: indirects are GLintptr values alongside a buffer name. */
+typedef void (APIENTRYP PFNGLDRAWCOMMANDSADDRESSNVPROC) (GLenum primitiveMode, const GLuint64 *indirects, const GLsizei *sizes, GLuint count); /* Address variant: indirects are GLuint64 values and no buffer name is passed. */
+typedef void (APIENTRYP PFNGLDRAWCOMMANDSSTATESNVPROC) (GLuint buffer, const GLintptr *indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+typedef void (APIENTRYP PFNGLDRAWCOMMANDSSTATESADDRESSNVPROC) (const GLuint64 *indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+typedef void (APIENTRYP PFNGLCREATECOMMANDLISTSNVPROC) (GLsizei n, GLuint *lists);
+typedef void (APIENTRYP PFNGLDELETECOMMANDLISTSNVPROC) (GLsizei n, const GLuint *lists);
+typedef GLboolean (APIENTRYP PFNGLISCOMMANDLISTNVPROC) (GLuint list);
+typedef void (APIENTRYP PFNGLLISTDRAWCOMMANDSSTATESCLIENTNVPROC) (GLuint list, GLuint segment, const void **indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count); /* Client variant: indirects is an array of pointers. */
+typedef void (APIENTRYP PFNGLCOMMANDLISTSEGMENTSNVPROC) (GLuint list, GLuint segments);
+typedef void (APIENTRYP PFNGLCOMPILECOMMANDLISTNVPROC) (GLuint list);
+typedef void (APIENTRYP PFNGLCALLCOMMANDLISTNVPROC) (GLuint list);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the function-pointer typedefs above are declared regardless. */
+GLAPI void APIENTRY glCreateStatesNV (GLsizei n, GLuint *states);
+GLAPI void APIENTRY glDeleteStatesNV (GLsizei n, const GLuint *states);
+GLAPI GLboolean APIENTRY glIsStateNV (GLuint state);
+GLAPI void APIENTRY glStateCaptureNV (GLuint state, GLenum mode);
+GLAPI GLuint APIENTRY glGetCommandHeaderNV (GLenum tokenID, GLuint size);
+GLAPI GLushort APIENTRY glGetStageIndexNV (GLenum shadertype);
+GLAPI void APIENTRY glDrawCommandsNV (GLenum primitiveMode, GLuint buffer, const GLintptr *indirects, const GLsizei *sizes, GLuint count);
+GLAPI void APIENTRY glDrawCommandsAddressNV (GLenum primitiveMode, const GLuint64 *indirects, const GLsizei *sizes, GLuint count);
+GLAPI void APIENTRY glDrawCommandsStatesNV (GLuint buffer, const GLintptr *indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+GLAPI void APIENTRY glDrawCommandsStatesAddressNV (const GLuint64 *indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+GLAPI void APIENTRY glCreateCommandListsNV (GLsizei n, GLuint *lists);
+GLAPI void APIENTRY glDeleteCommandListsNV (GLsizei n, const GLuint *lists);
+GLAPI GLboolean APIENTRY glIsCommandListNV (GLuint list);
+GLAPI void APIENTRY glListDrawCommandsStatesClientNV (GLuint list, GLuint segment, const void **indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+GLAPI void APIENTRY glCommandListSegmentsNV (GLuint list, GLuint segments);
+GLAPI void APIENTRY glCompileCommandListNV (GLuint list);
+GLAPI void APIENTRY glCallCommandListNV (GLuint list);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_command_list */
+
+#ifndef GL_NV_conditional_render /* Section guard: four mode tokens (consecutive values 0x8E13..0x8E16) and a begin/end pair of entry points. */
+#define GL_NV_conditional_render 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_QUERY_WAIT_NV                  0x8E13
+#define GL_QUERY_NO_WAIT_NV               0x8E14
+#define GL_QUERY_BY_REGION_WAIT_NV        0x8E15
+#define GL_QUERY_BY_REGION_NO_WAIT_NV     0x8E16
+typedef void (APIENTRYP PFNGLBEGINCONDITIONALRENDERNVPROC) (GLuint id, GLenum mode);
+typedef void (APIENTRYP PFNGLENDCONDITIONALRENDERNVPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the function-pointer typedefs above are declared regardless. */
+GLAPI void APIENTRY glBeginConditionalRenderNV (GLuint id, GLenum mode);
+GLAPI void APIENTRY glEndConditionalRenderNV (void);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_conditional_render */
+
+#ifndef GL_NV_conservative_raster /* Section guard: four enum tokens (consecutive values 0x9346..0x9349) and one entry point. */
+#define GL_NV_conservative_raster 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_CONSERVATIVE_RASTERIZATION_NV  0x9346
+#define GL_SUBPIXEL_PRECISION_BIAS_X_BITS_NV 0x9347
+#define GL_SUBPIXEL_PRECISION_BIAS_Y_BITS_NV 0x9348
+#define GL_MAX_SUBPIXEL_PRECISION_BIAS_BITS_NV 0x9349
+typedef void (APIENTRYP PFNGLSUBPIXELPRECISIONBIASNVPROC) (GLuint xbits, GLuint ybits);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototype is opt-in; the function-pointer typedef above is declared regardless. */
+GLAPI void APIENTRY glSubpixelPrecisionBiasNV (GLuint xbits, GLuint ybits);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_conservative_raster */
+
+#ifndef GL_NV_conservative_raster_dilate /* Section guard: three enum tokens (consecutive values 0x9379..0x937B) and one entry point taking a GLfloat value. */
+#define GL_NV_conservative_raster_dilate 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_CONSERVATIVE_RASTER_DILATE_NV  0x9379
+#define GL_CONSERVATIVE_RASTER_DILATE_RANGE_NV 0x937A
+#define GL_CONSERVATIVE_RASTER_DILATE_GRANULARITY_NV 0x937B
+typedef void (APIENTRYP PFNGLCONSERVATIVERASTERPARAMETERFNVPROC) (GLenum pname, GLfloat value);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototype is opt-in; the function-pointer typedef above is declared regardless. */
+GLAPI void APIENTRY glConservativeRasterParameterfNV (GLenum pname, GLfloat value);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_conservative_raster_dilate */
+
+#ifndef GL_NV_conservative_raster_pre_snap_triangles /* Section guard: three enum tokens (consecutive values 0x954D..0x954F) and one entry point taking a GLint param. */
+#define GL_NV_conservative_raster_pre_snap_triangles 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_CONSERVATIVE_RASTER_MODE_NV    0x954D
+#define GL_CONSERVATIVE_RASTER_MODE_POST_SNAP_NV 0x954E
+#define GL_CONSERVATIVE_RASTER_MODE_PRE_SNAP_TRIANGLES_NV 0x954F
+typedef void (APIENTRYP PFNGLCONSERVATIVERASTERPARAMETERINVPROC) (GLenum pname, GLint param);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototype is opt-in; the function-pointer typedef above is declared regardless. */
+GLAPI void APIENTRY glConservativeRasterParameteriNV (GLenum pname, GLint param);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_conservative_raster_pre_snap_triangles */
+
+#ifndef GL_NV_draw_vulkan_image /* Section guard: one generic function-pointer type and five entry points; no enum tokens. */
+#define GL_NV_draw_vulkan_image 1 /* Presence macro; also prevents this section from being declared twice. */
+typedef void (APIENTRY  *GLVULKANPROCNV)(void); /* Generic void(void) function pointer; it is the return type of glGetVkProcAddrNV below. */
+typedef void (APIENTRYP PFNGLDRAWVKIMAGENVPROC) (GLuint64 vkImage, GLuint sampler, GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1, GLfloat z, GLfloat s0, GLfloat t0, GLfloat s1, GLfloat t1);
+typedef GLVULKANPROCNV (APIENTRYP PFNGLGETVKPROCADDRNVPROC) (const GLchar *name);
+typedef void (APIENTRYP PFNGLWAITVKSEMAPHORENVPROC) (GLuint64 vkSemaphore); /* Vulkan objects are passed as GLuint64 values throughout this section. */
+typedef void (APIENTRYP PFNGLSIGNALVKSEMAPHORENVPROC) (GLuint64 vkSemaphore);
+typedef void (APIENTRYP PFNGLSIGNALVKFENCENVPROC) (GLuint64 vkFence);
+#ifdef GL_GLEXT_PROTOTYPES /* Direct prototypes are opt-in; the typedefs above are declared regardless. */
+GLAPI void APIENTRY glDrawVkImageNV (GLuint64 vkImage, GLuint sampler, GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1, GLfloat z, GLfloat s0, GLfloat t0, GLfloat s1, GLfloat t1);
+GLAPI GLVULKANPROCNV APIENTRY glGetVkProcAddrNV (const GLchar *name);
+GLAPI void APIENTRY glWaitVkSemaphoreNV (GLuint64 vkSemaphore);
+GLAPI void APIENTRY glSignalVkSemaphoreNV (GLuint64 vkSemaphore);
+GLAPI void APIENTRY glSignalVkFenceNV (GLuint64 vkFence);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_draw_vulkan_image */
+
+#ifndef GL_NV_fill_rectangle /* Section guard: token-only section with a single enum value; no entry points. */
+#define GL_NV_fill_rectangle 1 /* Presence macro; also prevents this section from being declared twice. */
+#define GL_FILL_RECTANGLE_NV              0x933C
+#endif /* GL_NV_fill_rectangle */
+
+#ifndef GL_NV_fragment_coverage_to_color
+#define GL_NV_fragment_coverage_to_color 1 /* set once so this section is skipped if the macro is already defined */
+#define GL_FRAGMENT_COVERAGE_TO_COLOR_NV  0x92DD
+#define GL_FRAGMENT_COVERAGE_COLOR_NV     0x92DE
+typedef void (APIENTRYP PFNGLFRAGMENTCOVERAGECOLORNVPROC) (GLuint color); /* same signature as the prototype below; presumably a function-pointer type (APIENTRYP is defined outside this chunk) */
+#ifdef GL_GLEXT_PROTOTYPES /* the prototype is declared only when this macro is defined */
+GLAPI void APIENTRY glFragmentCoverageColorNV (GLuint color);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_fragment_coverage_to_color */
+
+#ifndef GL_NV_fragment_shader_interlock
+#define GL_NV_fragment_shader_interlock 1 /* marker only: this section declares no tokens and no entry points */
+#endif /* GL_NV_fragment_shader_interlock */
+
+#ifndef GL_NV_framebuffer_mixed_samples
+#define GL_NV_framebuffer_mixed_samples 1 /* set once so this section is skipped if the macro is already defined */
+#define GL_COVERAGE_MODULATION_TABLE_NV   0x9331
+#define GL_COLOR_SAMPLES_NV               0x8E20
+#define GL_DEPTH_SAMPLES_NV               0x932D
+#define GL_STENCIL_SAMPLES_NV             0x932E
+#define GL_MIXED_DEPTH_SAMPLES_SUPPORTED_NV 0x932F
+#define GL_MIXED_STENCIL_SAMPLES_SUPPORTED_NV 0x9330
+#define GL_COVERAGE_MODULATION_NV         0x9332
+#define GL_COVERAGE_MODULATION_TABLE_SIZE_NV 0x9333
+typedef void (APIENTRYP PFNGLCOVERAGEMODULATIONTABLENVPROC) (GLsizei n, const GLfloat *v); /* the three PFN...PROC typedefs mirror, in order, the three prototypes below */
+typedef void (APIENTRYP PFNGLGETCOVERAGEMODULATIONTABLENVPROC) (GLsizei bufsize, GLfloat *v); /* getter variant: v is a non-const pointer here */
+typedef void (APIENTRYP PFNGLCOVERAGEMODULATIONNVPROC) (GLenum components);
+#ifdef GL_GLEXT_PROTOTYPES /* prototypes are declared only when this macro is defined */
+GLAPI void APIENTRY glCoverageModulationTableNV (GLsizei n, const GLfloat *v);
+GLAPI void APIENTRY glGetCoverageModulationTableNV (GLsizei bufsize, GLfloat *v);
+GLAPI void APIENTRY glCoverageModulationNV (GLenum components);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_framebuffer_mixed_samples */
+
+#ifndef GL_NV_framebuffer_multisample_coverage
+#define GL_NV_framebuffer_multisample_coverage 1 /* set once so this section is skipped if the macro is already defined */
+#define GL_RENDERBUFFER_COVERAGE_SAMPLES_NV 0x8CAB
+#define GL_RENDERBUFFER_COLOR_SAMPLES_NV  0x8E10
+#define GL_MAX_MULTISAMPLE_COVERAGE_MODES_NV 0x8E11
+#define GL_MULTISAMPLE_COVERAGE_MODES_NV  0x8E12
+typedef void (APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLECOVERAGENVPROC) (GLenum target, GLsizei coverageSamples, GLsizei colorSamples, GLenum internalformat, GLsizei width, GLsizei height); /* same signature as the prototype below; presumably a function-pointer type (APIENTRYP is defined outside this chunk) */
+#ifdef GL_GLEXT_PROTOTYPES /* the prototype is declared only when this macro is defined */
+GLAPI void APIENTRY glRenderbufferStorageMultisampleCoverageNV (GLenum target, GLsizei coverageSamples, GLsizei colorSamples, GLenum internalformat, GLsizei width, GLsizei height);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_framebuffer_multisample_coverage */
+
+#ifndef GL_NV_geometry_shader_passthrough
+#define GL_NV_geometry_shader_passthrough 1 /* marker only: this section declares no tokens and no entry points */
+#endif /* GL_NV_geometry_shader_passthrough */
+
+#ifndef GL_NV_gpu_shader5
+#define GL_NV_gpu_shader5 1 /* set once so this section is skipped if the macro is already defined */
+typedef int64_t GLint64EXT; /* signed 64-bit type for the *i64* entry points below; GLuint64EXT (used by the *ui64* ones) is not defined in this section and must come from elsewhere in the header */
+#define GL_INT64_NV                       0x140E
+#define GL_UNSIGNED_INT64_NV              0x140F
+#define GL_INT8_NV                        0x8FE0
+#define GL_INT8_VEC2_NV                   0x8FE1
+#define GL_INT8_VEC3_NV                   0x8FE2
+#define GL_INT8_VEC4_NV                   0x8FE3
+#define GL_INT16_NV                       0x8FE4
+#define GL_INT16_VEC2_NV                  0x8FE5
+#define GL_INT16_VEC3_NV                  0x8FE6
+#define GL_INT16_VEC4_NV                  0x8FE7
+#define GL_INT64_VEC2_NV                  0x8FE9
+#define GL_INT64_VEC3_NV                  0x8FEA
+#define GL_INT64_VEC4_NV                  0x8FEB
+#define GL_UNSIGNED_INT8_NV               0x8FEC
+#define GL_UNSIGNED_INT8_VEC2_NV          0x8FED
+#define GL_UNSIGNED_INT8_VEC3_NV          0x8FEE
+#define GL_UNSIGNED_INT8_VEC4_NV          0x8FEF
+#define GL_UNSIGNED_INT16_NV              0x8FF0
+#define GL_UNSIGNED_INT16_VEC2_NV         0x8FF1
+#define GL_UNSIGNED_INT16_VEC3_NV         0x8FF2
+#define GL_UNSIGNED_INT16_VEC4_NV         0x8FF3
+#define GL_UNSIGNED_INT64_VEC2_NV         0x8FF5
+#define GL_UNSIGNED_INT64_VEC3_NV         0x8FF6
+#define GL_UNSIGNED_INT64_VEC4_NV         0x8FF7
+#define GL_FLOAT16_NV                     0x8FF8
+#define GL_FLOAT16_VEC2_NV                0x8FF9
+#define GL_FLOAT16_VEC3_NV                0x8FFA
+#define GL_FLOAT16_VEC4_NV                0x8FFB
+typedef void (APIENTRYP PFNGLUNIFORM1I64NVPROC) (GLint location, GLint64EXT x); /* the 33 PFN...PROC typedefs in this section mirror, in order, the 33 prototypes in the GL_GLEXT_PROTOTYPES part below */
+typedef void (APIENTRYP PFNGLUNIFORM2I64NVPROC) (GLint location, GLint64EXT x, GLint64EXT y);
+typedef void (APIENTRYP PFNGLUNIFORM3I64NVPROC) (GLint location, GLint64EXT x, GLint64EXT y, GLint64EXT z);
+typedef void (APIENTRYP PFNGLUNIFORM4I64NVPROC) (GLint location, GLint64EXT x, GLint64EXT y, GLint64EXT z, GLint64EXT w);
+typedef void (APIENTRYP PFNGLUNIFORM1I64VNVPROC) (GLint location, GLsizei count, const GLint64EXT *value);
+typedef void (APIENTRYP PFNGLUNIFORM2I64VNVPROC) (GLint location, GLsizei count, const GLint64EXT *value);
+typedef void (APIENTRYP PFNGLUNIFORM3I64VNVPROC) (GLint location, GLsizei count, const GLint64EXT *value);
+typedef void (APIENTRYP PFNGLUNIFORM4I64VNVPROC) (GLint location, GLsizei count, const GLint64EXT *value);
+typedef void (APIENTRYP PFNGLUNIFORM1UI64NVPROC) (GLint location, GLuint64EXT x);
+typedef void (APIENTRYP PFNGLUNIFORM2UI64NVPROC) (GLint location, GLuint64EXT x, GLuint64EXT y);
+typedef void (APIENTRYP PFNGLUNIFORM3UI64NVPROC) (GLint location, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z);
+typedef void (APIENTRYP PFNGLUNIFORM4UI64NVPROC) (GLint location, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z, GLuint64EXT w);
+typedef void (APIENTRYP PFNGLUNIFORM1UI64VNVPROC) (GLint location, GLsizei count, const GLuint64EXT *value);
+typedef void (APIENTRYP PFNGLUNIFORM2UI64VNVPROC) (GLint location, GLsizei count, const GLuint64EXT *value);
+typedef void (APIENTRYP PFNGLUNIFORM3UI64VNVPROC) (GLint location, GLsizei count, const GLuint64EXT *value);
+typedef void (APIENTRYP PFNGLUNIFORM4UI64VNVPROC) (GLint location, GLsizei count, const GLuint64EXT *value);
+typedef void (APIENTRYP PFNGLGETUNIFORMI64VNVPROC) (GLuint program, GLint location, GLint64EXT *params); /* the only signature in this section with a non-const pointer parameter */
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1I64NVPROC) (GLuint program, GLint location, GLint64EXT x); /* ProgramUniform* signatures match the Uniform* ones with a leading GLuint program parameter added */
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2I64NVPROC) (GLuint program, GLint location, GLint64EXT x, GLint64EXT y);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3I64NVPROC) (GLuint program, GLint location, GLint64EXT x, GLint64EXT y, GLint64EXT z);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4I64NVPROC) (GLuint program, GLint location, GLint64EXT x, GLint64EXT y, GLint64EXT z, GLint64EXT w);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1I64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLint64EXT *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2I64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLint64EXT *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3I64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLint64EXT *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4I64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLint64EXT *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1UI64NVPROC) (GLuint program, GLint location, GLuint64EXT x);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2UI64NVPROC) (GLuint program, GLint location, GLuint64EXT x, GLuint64EXT y);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3UI64NVPROC) (GLuint program, GLint location, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4UI64NVPROC) (GLuint program, GLint location, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z, GLuint64EXT w);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1UI64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2UI64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3UI64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4UI64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+#ifdef GL_GLEXT_PROTOTYPES /* prototypes are declared only when this macro is defined */
+GLAPI void APIENTRY glUniform1i64NV (GLint location, GLint64EXT x);
+GLAPI void APIENTRY glUniform2i64NV (GLint location, GLint64EXT x, GLint64EXT y);
+GLAPI void APIENTRY glUniform3i64NV (GLint location, GLint64EXT x, GLint64EXT y, GLint64EXT z);
+GLAPI void APIENTRY glUniform4i64NV (GLint location, GLint64EXT x, GLint64EXT y, GLint64EXT z, GLint64EXT w);
+GLAPI void APIENTRY glUniform1i64vNV (GLint location, GLsizei count, const GLint64EXT *value);
+GLAPI void APIENTRY glUniform2i64vNV (GLint location, GLsizei count, const GLint64EXT *value);
+GLAPI void APIENTRY glUniform3i64vNV (GLint location, GLsizei count, const GLint64EXT *value);
+GLAPI void APIENTRY glUniform4i64vNV (GLint location, GLsizei count, const GLint64EXT *value);
+GLAPI void APIENTRY glUniform1ui64NV (GLint location, GLuint64EXT x);
+GLAPI void APIENTRY glUniform2ui64NV (GLint location, GLuint64EXT x, GLuint64EXT y);
+GLAPI void APIENTRY glUniform3ui64NV (GLint location, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z);
+GLAPI void APIENTRY glUniform4ui64NV (GLint location, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z, GLuint64EXT w);
+GLAPI void APIENTRY glUniform1ui64vNV (GLint location, GLsizei count, const GLuint64EXT *value);
+GLAPI void APIENTRY glUniform2ui64vNV (GLint location, GLsizei count, const GLuint64EXT *value);
+GLAPI void APIENTRY glUniform3ui64vNV (GLint location, GLsizei count, const GLuint64EXT *value);
+GLAPI void APIENTRY glUniform4ui64vNV (GLint location, GLsizei count, const GLuint64EXT *value);
+GLAPI void APIENTRY glGetUniformi64vNV (GLuint program, GLint location, GLint64EXT *params);
+GLAPI void APIENTRY glProgramUniform1i64NV (GLuint program, GLint location, GLint64EXT x);
+GLAPI void APIENTRY glProgramUniform2i64NV (GLuint program, GLint location, GLint64EXT x, GLint64EXT y);
+GLAPI void APIENTRY glProgramUniform3i64NV (GLuint program, GLint location, GLint64EXT x, GLint64EXT y, GLint64EXT z);
+GLAPI void APIENTRY glProgramUniform4i64NV (GLuint program, GLint location, GLint64EXT x, GLint64EXT y, GLint64EXT z, GLint64EXT w);
+GLAPI void APIENTRY glProgramUniform1i64vNV (GLuint program, GLint location, GLsizei count, const GLint64EXT *value);
+GLAPI void APIENTRY glProgramUniform2i64vNV (GLuint program, GLint location, GLsizei count, const GLint64EXT *value);
+GLAPI void APIENTRY glProgramUniform3i64vNV (GLuint program, GLint location, GLsizei count, const GLint64EXT *value);
+GLAPI void APIENTRY glProgramUniform4i64vNV (GLuint program, GLint location, GLsizei count, const GLint64EXT *value);
+GLAPI void APIENTRY glProgramUniform1ui64NV (GLuint program, GLint location, GLuint64EXT x);
+GLAPI void APIENTRY glProgramUniform2ui64NV (GLuint program, GLint location, GLuint64EXT x, GLuint64EXT y);
+GLAPI void APIENTRY glProgramUniform3ui64NV (GLuint program, GLint location, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z);
+GLAPI void APIENTRY glProgramUniform4ui64NV (GLuint program, GLint location, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z, GLuint64EXT w);
+GLAPI void APIENTRY glProgramUniform1ui64vNV (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+GLAPI void APIENTRY glProgramUniform2ui64vNV (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+GLAPI void APIENTRY glProgramUniform3ui64vNV (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+GLAPI void APIENTRY glProgramUniform4ui64vNV (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_gpu_shader5 */
+
+#ifndef GL_NV_internalformat_sample_query
+#define GL_NV_internalformat_sample_query 1 /* set once so this section is skipped if the macro is already defined */
+#define GL_MULTISAMPLES_NV                0x9371
+#define GL_SUPERSAMPLE_SCALE_X_NV         0x9372
+#define GL_SUPERSAMPLE_SCALE_Y_NV         0x9373
+#define GL_CONFORMANT_NV                  0x9374
+typedef void (APIENTRYP PFNGLGETINTERNALFORMATSAMPLEIVNVPROC) (GLenum target, GLenum internalformat, GLsizei samples, GLenum pname, GLsizei bufSize, GLint *params); /* same signature as the prototype below; presumably a function-pointer type (APIENTRYP is defined outside this chunk) */
+#ifdef GL_GLEXT_PROTOTYPES /* the prototype is declared only when this macro is defined */
+GLAPI void APIENTRY glGetInternalformatSampleivNV (GLenum target, GLenum internalformat, GLsizei samples, GLenum pname, GLsizei bufSize, GLint *params);
+#endif /* GL_GLEXT_PROTOTYPES */
+#endif /* GL_NV_internalformat_sample_query */
+
+#ifndef GL_NV_path_rendering
+#define GL_NV_path_rendering 1
+#define GL_PATH_FORMAT_SVG_NV             0x9070
+#define GL_PATH_FORMAT_PS_NV              0x9071
+#define GL_STANDARD_FONT_NAME_NV          0x9072
+#define GL_SYSTEM_FONT_NAME_NV            0x9073
+#define GL_FILE_NAME_NV                   0x9074
+#define GL_PATH_STROKE_WIDTH_NV           0x9075
+#define GL_PATH_END_CAPS_NV               0x9076
+#define GL_PATH_INITIAL_END_CAP_NV        0x9077
+#define GL_PATH_TERMINAL_END_CAP_NV       0x9078
+#define GL_PATH_JOIN_STYLE_NV             0x9079
+#define GL_PATH_MITER_LIMIT_NV            0x907A
+#define GL_PATH_DASH_CAPS_NV              0x907B
+#define GL_PATH_INITIAL_DASH_CAP_NV       0x907C
+#define GL_PATH_TERMINAL_DASH_CAP_NV      0x907D
+#define GL_PATH_DASH_OFFSET_NV            0x907E
+#define GL_PATH_CLIENT_LENGTH_NV          0x907F
+#define GL_PATH_FILL_MODE_NV              0x9080
+#define GL_PATH_FILL_MASK_NV              0x9081
+#define GL_PATH_FILL_COVER_MODE_NV        0x9082
+#define GL_PATH_STROKE_COVER_MODE_NV      0x9083
+#define GL_PATH_STROKE_MASK_NV            0x9084
+#define GL_COUNT_UP_NV                    0x9088
+#define GL_COUNT_DOWN_NV                  0x9089
+#define GL_PATH_OBJECT_BOUNDING_BOX_NV    0x908A
+#define GL_CONVEX_HULL_NV                 0x908B
+#define GL_BOUNDING_BOX_NV                0x908D
+#define GL_TRANSLATE_X_NV                 0x908E
+#define GL_TRANSLATE_Y_NV                 0x908F
+#define GL_TRANSLATE_2D_NV                0x9090
+#define GL_TRANSLATE_3D_NV                0x9091
+#define GL_AFFINE_2D_NV                   0x9092
+#define GL_AFFINE_3D_NV                   0x9094
+#define GL_TRANSPOSE_AFFINE_2D_NV         0x9096
+#define GL_TRANSPOSE_AFFINE_3D_NV         0x9098
+#define GL_UTF8_NV                        0x909A
+#define GL_UTF16_NV                       0x909B
+#define GL_BOUNDING_BOX_OF_BOUNDING_BOXES_NV 0x909C
+#define GL_PATH_COMMAND_COUNT_NV          0x909D
+#define GL_PATH_COORD_COUNT_NV            0x909E
+#define GL_PATH_DASH_ARRAY_COUNT_NV       0x909F
+#define GL_PATH_COMPUTED_LENGTH_NV        0x90A0
+#define GL_PATH_FILL_BOUNDING_BOX_NV      0x90A1
+#define GL_PATH_STROKE_BOUNDING_BOX_NV    0x90A2
+#define GL_SQUARE_NV                      0x90A3
+#define GL_ROUND_NV                       0x90A4
+#define GL_TRIANGULAR_NV                  0x90A5
+#define GL_BEVEL_NV                       0x90A6
+#define GL_MITER_REVERT_NV                0x90A7
+#define GL_MITER_TRUNCATE_NV              0x90A8
+#define GL_SKIP_MISSING_GLYPH_NV          0x90A9
+#define GL_USE_MISSING_GLYPH_NV           0x90AA
+#define GL_PATH_ERROR_POSITION_NV         0x90AB
+#define GL_ACCUM_ADJACENT_PAIRS_NV        0x90AD
+#define GL_ADJACENT_PAIRS_NV              0x90AE
+#define GL_FIRST_TO_REST_NV               0x90AF
+#define GL_PATH_GEN_MODE_NV               0x90B0
+#define GL_PATH_GEN_COEFF_NV              0x90B1
+#define GL_PATH_GEN_COMPONENTS_NV         0x90B3
+#define GL_PATH_STENCIL_FUNC_NV           0x90B7
+#define GL_PATH_STENCIL_REF_NV            0x90B8
+#define GL_PATH_STENCIL_VALUE_MASK_NV     0x90B9
+#define GL_PATH_STENCIL_DEPTH_OFFSET_FACTOR_NV 0x90BD
+#define GL_PATH_STENCIL_DEPTH_OFFSET_UNITS_NV 0x90BE
+#define GL_PATH_COVER_DEPTH_FUNC_NV       0x90BF
+#define GL_PATH_DASH_OFFSET_RESET_NV      0x90B4
+#define GL_MOVE_TO_RESETS_NV              0x90B5
+#define GL_MOVE_TO_CONTINUES_NV           0x90B6
+#define GL_CLOSE_PATH_NV                  0x00
+#define GL_MOVE_TO_NV                     0x02
+#define GL_RELATIVE_MOVE_TO_NV            0x03
+#define GL_LINE_TO_NV                     0x04
+#define GL_RELATIVE_LINE_TO_NV            0x05
+#define GL_HORIZONTAL_LINE_TO_NV          0x06
+#define GL_RELATIVE_HORIZONTAL_LINE_TO_NV 0x07
+#define GL_VERTICAL_LINE_TO_NV            0x08
+#define GL_RELATIVE_VERTICAL_LINE_TO_NV   0x09
+#define GL_QUADRATIC_CURVE_TO_NV          0x0A
+#define GL_RELATIVE_QUADRATIC_CURVE_TO_NV 0x0B
+#define GL_CUBIC_CURVE_TO_NV              0x0C
+#define GL_RELATIVE_CUBIC_CURVE_TO_NV     0x0D
+#define GL_SMOOTH_QUADRATIC_CURVE_TO_NV   0x0E
+#define GL_RELATIVE_SMOOTH_QUADRATIC_CURVE_TO_NV 0x0F
+#define GL_SMOOTH_CUBIC_CURVE_TO_NV       0x10
+#define GL_RELATIVE_SMOOTH_CUBIC_CURVE_TO_NV 0x11
+#define GL_SMALL_CCW_ARC_TO_NV            0x12
+#define GL_RELATIVE_SMALL_CCW_ARC_TO_NV   0x13
+#define GL_SMALL_CW_ARC_TO_NV             0x14
+#define GL_RELATIVE_SMALL_CW_ARC_TO_NV    0x15
+#define GL_LARGE_CCW_ARC_TO_NV            0x16
+#define GL_RELATIVE_LARGE_CCW_ARC_TO_NV   0x17
+#define GL_LARGE_CW_ARC_TO_NV             0x18
+#define GL_RELATIVE_LARGE_CW_ARC_TO_NV    0x19
+#define GL_RESTART_PATH_NV                0xF0
+#define GL_DUP_FIRST_CUBIC_CURVE_TO_NV    0xF2
+#define GL_DUP_LAST_CUBIC_CURVE_TO_NV     0xF4
+#define GL_RECT_NV                        0xF6
+#define GL_CIRCULAR_CCW_ARC_TO_NV         0xF8
+#define GL_CIRCULAR_CW_ARC_TO_NV          0xFA
+#define GL_CIRCULAR_TANGENT_ARC_TO_NV     0xFC
+#define GL_ARC_TO_NV                      0xFE
+#define GL_RELATIVE_ARC_TO_NV             0xFF
+#define GL_BOLD_BIT_NV                    0x01
+#define GL_ITALIC_BIT_NV                  0x02
+#define GL_GLYPH_WIDTH_BIT_NV             0x01
+#define GL_GLYPH_HEIGHT_BIT_NV            0x02
+#define GL_GLYPH_HORIZONTAL_BEARING_X_BIT_NV 0x04
+#define GL_GLYPH_HORIZONTAL_BEARING_Y_BIT_NV 0x08
+#define GL_GLYPH_HORIZONTAL_BEARING_ADVANCE_BIT_NV 0x10
+#define GL_GLYPH_VERTICAL_BEARING_X_BIT_NV 0x20
+#define GL_GLYPH_VERTICAL_BEARING_Y_BIT_NV 0x40
+#define GL_GLYPH_VERTICAL_BEARING_ADVANCE_BIT_NV 0x80
+#define GL_GLYPH_HAS_KERNING_BIT_NV       0x100
+#define GL_FONT_X_MIN_BOUNDS_BIT_NV       0x00010000
+#define GL_FONT_Y_MIN_BOUNDS_BIT_NV       0x00020000
+#define GL_FONT_X_MAX_BOUNDS_BIT_NV       0x00040000
+#define GL_FONT_Y_MAX_BOUNDS_BIT_NV       0x00080000
+#define GL_FONT_UNITS_PER_EM_BIT_NV       0x00100000
+#define GL_FONT_ASCENDER_BIT_NV           0x00200000
+#define GL_FONT_DESCENDER_BIT_NV          0x00400000
+#define GL_FONT_HEIGHT_BIT_NV             0x00800000
+#define GL_FONT_MAX_ADVANCE_WIDTH_BIT_NV  0x01000000
+#define GL_FONT_MAX_ADVANCE_HEIGHT_BIT_NV 0x02000000
+#define GL_FONT_UNDERLINE_POSITION_BIT_NV 0x04000000
+#define GL_FONT_UNDERLINE_THICKNESS_BIT_NV 0x08000000
+#define GL_FONT_HAS_KERNING_BIT_NV        0x10000000
+#define GL_ROUNDED_RECT_NV                0xE8
+#define GL_RELATIVE_ROUNDED_RECT_NV       0xE9
+#define GL_ROUNDED_RECT2_NV               0xEA
+#define GL_RELATIVE_ROUNDED_RECT2_NV      0xEB
+#define GL_ROUNDED_RECT4_NV               0xEC
+#define GL_RELATIVE_ROUNDED_RECT4_NV      0xED
+#define GL_ROUNDED_RECT8_NV               0xEE
+#define GL_RELATIVE_ROUNDED_RECT8_NV      0xEF
+#define GL_RELATIVE_RECT_NV               0xF7
+#define GL_FONT_GLYPHS_AVAILABLE_NV       0x9368
+#define GL_FONT_TARGET_UNAVAILABLE_NV     0x9369
+#define GL_FONT_UNAVAILABLE_NV            0x936A
+#define GL_FONT_UNINTELLIGIBLE_NV         0x936B
+#define GL_CONIC_CURVE_TO_NV              0x1A
+#define GL_RELATIVE_CONIC_CURVE_TO_NV     0x1B
+#define GL_FONT_NUM_GLYPH_INDICES_BIT_NV  0x20000000
+#define GL_STANDARD_FONT_FORMAT_NV        0x936C
+#define GL_PATH_PROJECTION_NV             0x1701
+#define GL_PATH_MODELVIEW_NV              0x1700
+#define GL_PATH_MODELVIEW_STACK_DEPTH_NV  0x0BA3
+#define GL_PATH_MODELVIEW_MATRIX_NV       0x0BA6
+#define GL_PATH_MAX_MODELVIEW_STACK_DEPTH_NV 0x0D36
+#define GL_PATH_TRANSPOSE_MODELVIEW_MATRIX_NV 0x84E3
+#define GL_PATH_PROJECTION_STACK_DEPTH_NV 0x0BA4
+#define GL_PATH_PROJECTION_MATRIX_NV      0x0BA7
+#define GL_PATH_MAX_PROJECTION_STACK_DEPTH_NV 0x0D38
+#define GL_PATH_TRANSPOSE_PROJECTION_MATRIX_NV 0x84E4
+#define GL_FRAGMENT_INPUT_NV              0x936D
+typedef GLuint (APIENTRYP PFNGLGENPATHSNVPROC) (GLsizei range);
+typedef void (APIENTRYP PFNGLDELETEPATHSNVPROC) (GLuint path, GLsizei range);
+typedef GLboolean (APIENTRYP PFNGLISPATHNVPROC) (GLuint path);
+typedef void (APIENTRYP PFNGLPATHCOMMANDSNVPROC) (GLuint path, GLsizei numCommands, const GLubyte *commands, GLsizei numCoords, GLenum coordType, const void *coords);
+typedef void (APIENTRYP PFNGLPATHCOORDSNVPROC) (GLuint path, GLsizei numCoords, GLenum coordType, const void *coords);
+typedef void (APIENTRYP PFNGLPATHSUBCOMMANDSNVPROC) (GLuint path, GLsizei commandStart, GLsizei commandsToDelete, GLsizei numCommands, const GLubyte *commands, GLsizei numCoords, GLenum coordType, const void *coords);
+typedef void (APIENTRYP PFNGLPATHSUBCOORDSNVPROC) (GLuint path, GLsizei coordStart, GLsizei numCoords, GLenum coordType, const void *coords);
+typedef void (APIENTRYP PFNGLPATHSTRINGNVPROC) (GLuint path, GLenum format, GLsizei length, const void *pathString);
+typedef void (APIENTRYP PFNGLPATHGLYPHSNVPROC) (GLuint firstPathName, GLenum fontTarget, const void *fontName, GLbitfield fontStyle, GLsizei numGlyphs, GLenum type, const void *charcodes, GLenum handleMissingGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+typedef void (APIENTRYP PFNGLPATHGLYPHRANGENVPROC) (GLuint firstPathName, GLenum fontTarget, const void *fontName, GLbitfield fontStyle, GLuint firstGlyph, GLsizei numGlyphs, GLenum handleMissingGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+typedef void (APIENTRYP PFNGLWEIGHTPATHSNVPROC) (GLuint resultPath, GLsizei numPaths, const GLuint *paths, const GLfloat *weights);
+typedef void (APIENTRYP PFNGLCOPYPATHNVPROC) (GLuint resultPath, GLuint srcPath);
+typedef void (APIENTRYP PFNGLINTERPOLATEPATHSNVPROC) (GLuint resultPath, GLuint pathA, GLuint pathB, GLfloat weight);
+typedef void (APIENTRYP PFNGLTRANSFORMPATHNVPROC) (GLuint resultPath, GLuint srcPath, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLPATHPARAMETERIVNVPROC) (GLuint path, GLenum pname, const GLint *value);
+typedef void (APIENTRYP PFNGLPATHPARAMETERINVPROC) (GLuint path, GLenum pname, GLint value);
+typedef void (APIENTRYP PFNGLPATHPARAMETERFVNVPROC) (GLuint path, GLenum pname, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPATHPARAMETERFNVPROC) (GLuint path, GLenum pname, GLfloat value);
+typedef void (APIENTRYP PFNGLPATHDASHARRAYNVPROC) (GLuint path, GLsizei dashCount, const GLfloat *dashArray);
+typedef void (APIENTRYP PFNGLPATHSTENCILFUNCNVPROC) (GLenum func, GLint ref, GLuint mask);
+typedef void (APIENTRYP PFNGLPATHSTENCILDEPTHOFFSETNVPROC) (GLfloat factor, GLfloat units);
+typedef void (APIENTRYP PFNGLSTENCILFILLPATHNVPROC) (GLuint path, GLenum fillMode, GLuint mask);
+typedef void (APIENTRYP PFNGLSTENCILSTROKEPATHNVPROC) (GLuint path, GLint reference, GLuint mask);
+typedef void (APIENTRYP PFNGLSTENCILFILLPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLenum fillMode, GLuint mask, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLSTENCILSTROKEPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLint reference, GLuint mask, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLPATHCOVERDEPTHFUNCNVPROC) (GLenum func);
+typedef void (APIENTRYP PFNGLCOVERFILLPATHNVPROC) (GLuint path, GLenum coverMode);
+typedef void (APIENTRYP PFNGLCOVERSTROKEPATHNVPROC) (GLuint path, GLenum coverMode);
+typedef void (APIENTRYP PFNGLCOVERFILLPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLCOVERSTROKEPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLGETPATHPARAMETERIVNVPROC) (GLuint path, GLenum pname, GLint *value);
+typedef void (APIENTRYP PFNGLGETPATHPARAMETERFVNVPROC) (GLuint path, GLenum pname, GLfloat *value);
+typedef void (APIENTRYP PFNGLGETPATHCOMMANDSNVPROC) (GLuint path, GLubyte *commands);
+typedef void (APIENTRYP PFNGLGETPATHCOORDSNVPROC) (GLuint path, GLfloat *coords);
+typedef void (APIENTRYP PFNGLGETPATHDASHARRAYNVPROC) (GLuint path, GLfloat *dashArray);
+typedef void (APIENTRYP PFNGLGETPATHMETRICSNVPROC) (GLbitfield metricQueryMask, GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLsizei stride, GLfloat *metrics);
+typedef void (APIENTRYP PFNGLGETPATHMETRICRANGENVPROC) (GLbitfield metricQueryMask, GLuint firstPathName, GLsizei numPaths, GLsizei stride, GLfloat *metrics);
+typedef void (APIENTRYP PFNGLGETPATHSPACINGNVPROC) (GLenum pathListMode, GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLfloat advanceScale, GLfloat kerningScale, GLenum transformType, GLfloat *returnedSpacing);
+typedef GLboolean (APIENTRYP PFNGLISPOINTINFILLPATHNVPROC) (GLuint path, GLuint mask, GLfloat x, GLfloat y);
+typedef GLboolean (APIENTRYP PFNGLISPOINTINSTROKEPATHNVPROC) (GLuint path, GLfloat x, GLfloat y);
+typedef GLfloat (APIENTRYP PFNGLGETPATHLENGTHNVPROC) (GLuint path, GLsizei startSegment, GLsizei numSegments);
+typedef GLboolean (APIENTRYP PFNGLPOINTALONGPATHNVPROC) (GLuint path, GLsizei startSegment, GLsizei numSegments, GLfloat distance, GLfloat *x, GLfloat *y, GLfloat *tangentX, GLfloat *tangentY);
+typedef void (APIENTRYP PFNGLMATRIXLOAD3X2FNVPROC) (GLenum matrixMode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLMATRIXLOAD3X3FNVPROC) (GLenum matrixMode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLMATRIXLOADTRANSPOSE3X3FNVPROC) (GLenum matrixMode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLMATRIXMULT3X2FNVPROC) (GLenum matrixMode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLMATRIXMULT3X3FNVPROC) (GLenum matrixMode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLMATRIXMULTTRANSPOSE3X3FNVPROC) (GLenum matrixMode, const GLfloat *m);
+typedef void (APIENTRYP PFNGLSTENCILTHENCOVERFILLPATHNVPROC) (GLuint path, GLenum fillMode, GLuint mask, GLenum coverMode);
+typedef void (APIENTRYP PFNGLSTENCILTHENCOVERSTROKEPATHNVPROC) (GLuint path, GLint reference, GLuint mask, GLenum coverMode);
+typedef void (APIENTRYP PFNGLSTENCILTHENCOVERFILLPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLenum fillMode, GLuint mask, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLSTENCILTHENCOVERSTROKEPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLint reference, GLuint mask, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+typedef GLenum (APIENTRYP PFNGLPATHGLYPHINDEXRANGENVPROC) (GLenum fontTarget, const void *fontName, GLbitfield fontStyle, GLuint pathParameterTemplate, GLfloat emScale, GLuint baseAndCount[2]);
+typedef GLenum (APIENTRYP PFNGLPATHGLYPHINDEXARRAYNVPROC) (GLuint firstPathName, GLenum fontTarget, const void *fontName, GLbitfield fontStyle, GLuint firstGlyphIndex, GLsizei numGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+typedef GLenum (APIENTRYP PFNGLPATHMEMORYGLYPHINDEXARRAYNVPROC) (GLuint firstPathName, GLenum fontTarget, GLsizeiptr fontSize, const void *fontData, GLsizei faceIndex, GLuint firstGlyphIndex, GLsizei numGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+typedef void (APIENTRYP PFNGLPROGRAMPATHFRAGMENTINPUTGENNVPROC) (GLuint program, GLint location, GLenum genMode, GLint components, const GLfloat *coeffs);
+typedef void (APIENTRYP PFNGLGETPROGRAMRESOURCEFVNVPROC) (GLuint program, GLenum programInterface, GLuint index, GLsizei propCount, const GLenum *props, GLsizei bufSize, GLsizei *length, GLfloat *params);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI GLuint APIENTRY glGenPathsNV (GLsizei range);
+GLAPI void APIENTRY glDeletePathsNV (GLuint path, GLsizei range);
+GLAPI GLboolean APIENTRY glIsPathNV (GLuint path);
+GLAPI void APIENTRY glPathCommandsNV (GLuint path, GLsizei numCommands, const GLubyte *commands, GLsizei numCoords, GLenum coordType, const void *coords);
+GLAPI void APIENTRY glPathCoordsNV (GLuint path, GLsizei numCoords, GLenum coordType, const void *coords);
+GLAPI void APIENTRY glPathSubCommandsNV (GLuint path, GLsizei commandStart, GLsizei commandsToDelete, GLsizei numCommands, const GLubyte *commands, GLsizei numCoords, GLenum coordType, const void *coords);
+GLAPI void APIENTRY glPathSubCoordsNV (GLuint path, GLsizei coordStart, GLsizei numCoords, GLenum coordType, const void *coords);
+GLAPI void APIENTRY glPathStringNV (GLuint path, GLenum format, GLsizei length, const void *pathString);
+GLAPI void APIENTRY glPathGlyphsNV (GLuint firstPathName, GLenum fontTarget, const void *fontName, GLbitfield fontStyle, GLsizei numGlyphs, GLenum type, const void *charcodes, GLenum handleMissingGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+GLAPI void APIENTRY glPathGlyphRangeNV (GLuint firstPathName, GLenum fontTarget, const void *fontName, GLbitfield fontStyle, GLuint firstGlyph, GLsizei numGlyphs, GLenum handleMissingGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+GLAPI void APIENTRY glWeightPathsNV (GLuint resultPath, GLsizei numPaths, const GLuint *paths, const GLfloat *weights);
+GLAPI void APIENTRY glCopyPathNV (GLuint resultPath, GLuint srcPath);
+GLAPI void APIENTRY glInterpolatePathsNV (GLuint resultPath, GLuint pathA, GLuint pathB, GLfloat weight);
+GLAPI void APIENTRY glTransformPathNV (GLuint resultPath, GLuint srcPath, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glPathParameterivNV (GLuint path, GLenum pname, const GLint *value);
+GLAPI void APIENTRY glPathParameteriNV (GLuint path, GLenum pname, GLint value);
+GLAPI void APIENTRY glPathParameterfvNV (GLuint path, GLenum pname, const GLfloat *value);
+GLAPI void APIENTRY glPathParameterfNV (GLuint path, GLenum pname, GLfloat value);
+GLAPI void APIENTRY glPathDashArrayNV (GLuint path, GLsizei dashCount, const GLfloat *dashArray);
+GLAPI void APIENTRY glPathStencilFuncNV (GLenum func, GLint ref, GLuint mask);
+GLAPI void APIENTRY glPathStencilDepthOffsetNV (GLfloat factor, GLfloat units);
+GLAPI void APIENTRY glStencilFillPathNV (GLuint path, GLenum fillMode, GLuint mask);
+GLAPI void APIENTRY glStencilStrokePathNV (GLuint path, GLint reference, GLuint mask);
+GLAPI void APIENTRY glStencilFillPathInstancedNV (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLenum fillMode, GLuint mask, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glStencilStrokePathInstancedNV (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLint reference, GLuint mask, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glPathCoverDepthFuncNV (GLenum func);
+GLAPI void APIENTRY glCoverFillPathNV (GLuint path, GLenum coverMode);
+GLAPI void APIENTRY glCoverStrokePathNV (GLuint path, GLenum coverMode);
+GLAPI void APIENTRY glCoverFillPathInstancedNV (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glCoverStrokePathInstancedNV (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glGetPathParameterivNV (GLuint path, GLenum pname, GLint *value);
+GLAPI void APIENTRY glGetPathParameterfvNV (GLuint path, GLenum pname, GLfloat *value);
+GLAPI void APIENTRY glGetPathCommandsNV (GLuint path, GLubyte *commands);
+GLAPI void APIENTRY glGetPathCoordsNV (GLuint path, GLfloat *coords);
+GLAPI void APIENTRY glGetPathDashArrayNV (GLuint path, GLfloat *dashArray);
+GLAPI void APIENTRY glGetPathMetricsNV (GLbitfield metricQueryMask, GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLsizei stride, GLfloat *metrics);
+GLAPI void APIENTRY glGetPathMetricRangeNV (GLbitfield metricQueryMask, GLuint firstPathName, GLsizei numPaths, GLsizei stride, GLfloat *metrics);
+GLAPI void APIENTRY glGetPathSpacingNV (GLenum pathListMode, GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLfloat advanceScale, GLfloat kerningScale, GLenum transformType, GLfloat *returnedSpacing);
+GLAPI GLboolean APIENTRY glIsPointInFillPathNV (GLuint path, GLuint mask, GLfloat x, GLfloat y);
+GLAPI GLboolean APIENTRY glIsPointInStrokePathNV (GLuint path, GLfloat x, GLfloat y);
+GLAPI GLfloat APIENTRY glGetPathLengthNV (GLuint path, GLsizei startSegment, GLsizei numSegments);
+GLAPI GLboolean APIENTRY glPointAlongPathNV (GLuint path, GLsizei startSegment, GLsizei numSegments, GLfloat distance, GLfloat *x, GLfloat *y, GLfloat *tangentX, GLfloat *tangentY);
+GLAPI void APIENTRY glMatrixLoad3x2fNV (GLenum matrixMode, const GLfloat *m);
+GLAPI void APIENTRY glMatrixLoad3x3fNV (GLenum matrixMode, const GLfloat *m);
+GLAPI void APIENTRY glMatrixLoadTranspose3x3fNV (GLenum matrixMode, const GLfloat *m);
+GLAPI void APIENTRY glMatrixMult3x2fNV (GLenum matrixMode, const GLfloat *m);
+GLAPI void APIENTRY glMatrixMult3x3fNV (GLenum matrixMode, const GLfloat *m);
+GLAPI void APIENTRY glMatrixMultTranspose3x3fNV (GLenum matrixMode, const GLfloat *m);
+GLAPI void APIENTRY glStencilThenCoverFillPathNV (GLuint path, GLenum fillMode, GLuint mask, GLenum coverMode);
+GLAPI void APIENTRY glStencilThenCoverStrokePathNV (GLuint path, GLint reference, GLuint mask, GLenum coverMode);
+GLAPI void APIENTRY glStencilThenCoverFillPathInstancedNV (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLenum fillMode, GLuint mask, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glStencilThenCoverStrokePathInstancedNV (GLsizei numPaths, GLenum pathNameType, const void *paths, GLuint pathBase, GLint reference, GLuint mask, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+GLAPI GLenum APIENTRY glPathGlyphIndexRangeNV (GLenum fontTarget, const void *fontName, GLbitfield fontStyle, GLuint pathParameterTemplate, GLfloat emScale, GLuint baseAndCount[2]);
+GLAPI GLenum APIENTRY glPathGlyphIndexArrayNV (GLuint firstPathName, GLenum fontTarget, const void *fontName, GLbitfield fontStyle, GLuint firstGlyphIndex, GLsizei numGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+GLAPI GLenum APIENTRY glPathMemoryGlyphIndexArrayNV (GLuint firstPathName, GLenum fontTarget, GLsizeiptr fontSize, const void *fontData, GLsizei faceIndex, GLuint firstGlyphIndex, GLsizei numGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+GLAPI void APIENTRY glProgramPathFragmentInputGenNV (GLuint program, GLint location, GLenum genMode, GLint components, const GLfloat *coeffs);
+GLAPI void APIENTRY glGetProgramResourcefvNV (GLuint program, GLenum programInterface, GLuint index, GLsizei propCount, const GLenum *props, GLsizei bufSize, GLsizei *length, GLfloat *params);
+#endif
+#endif /* GL_NV_path_rendering */
+
+#ifndef GL_NV_path_rendering_shared_edge
+#define GL_NV_path_rendering_shared_edge 1
+#define GL_SHARED_EDGE_NV                 0xC0
+#endif /* GL_NV_path_rendering_shared_edge */
+
+#ifndef GL_NV_sample_locations
+#define GL_NV_sample_locations 1
+#define GL_SAMPLE_LOCATION_SUBPIXEL_BITS_NV 0x933D
+#define GL_SAMPLE_LOCATION_PIXEL_GRID_WIDTH_NV 0x933E
+#define GL_SAMPLE_LOCATION_PIXEL_GRID_HEIGHT_NV 0x933F
+#define GL_PROGRAMMABLE_SAMPLE_LOCATION_TABLE_SIZE_NV 0x9340
+#define GL_SAMPLE_LOCATION_NV             0x8E50
+#define GL_PROGRAMMABLE_SAMPLE_LOCATION_NV 0x9341
+#define GL_FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_NV 0x9342
+#define GL_FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_NV 0x9343
+typedef void (APIENTRYP PFNGLFRAMEBUFFERSAMPLELOCATIONSFVNVPROC) (GLenum target, GLuint start, GLsizei count, const GLfloat *v);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERSAMPLELOCATIONSFVNVPROC) (GLuint framebuffer, GLuint start, GLsizei count, const GLfloat *v);
+typedef void (APIENTRYP PFNGLRESOLVEDEPTHVALUESNVPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glFramebufferSampleLocationsfvNV (GLenum target, GLuint start, GLsizei count, const GLfloat *v);
+GLAPI void APIENTRY glNamedFramebufferSampleLocationsfvNV (GLuint framebuffer, GLuint start, GLsizei count, const GLfloat *v);
+GLAPI void APIENTRY glResolveDepthValuesNV (void);
+#endif
+#endif /* GL_NV_sample_locations */
+
+#ifndef GL_NV_sample_mask_override_coverage
+#define GL_NV_sample_mask_override_coverage 1
+#endif /* GL_NV_sample_mask_override_coverage */
+
+#ifndef GL_NV_shader_atomic_counters
+#define GL_NV_shader_atomic_counters 1
+#endif /* GL_NV_shader_atomic_counters */
+
+#ifndef GL_NV_shader_atomic_float
+#define GL_NV_shader_atomic_float 1
+#endif /* GL_NV_shader_atomic_float */
+
+#ifndef GL_NV_shader_atomic_float64
+#define GL_NV_shader_atomic_float64 1
+#endif /* GL_NV_shader_atomic_float64 */
+
+#ifndef GL_NV_shader_atomic_fp16_vector
+#define GL_NV_shader_atomic_fp16_vector 1
+#endif /* GL_NV_shader_atomic_fp16_vector */
+
+#ifndef GL_NV_shader_atomic_int64
+#define GL_NV_shader_atomic_int64 1
+#endif /* GL_NV_shader_atomic_int64 */
+
+#ifndef GL_NV_shader_buffer_load
+#define GL_NV_shader_buffer_load 1
+#define GL_BUFFER_GPU_ADDRESS_NV          0x8F1D
+#define GL_GPU_ADDRESS_NV                 0x8F34
+#define GL_MAX_SHADER_BUFFER_ADDRESS_NV   0x8F35
+typedef void (APIENTRYP PFNGLMAKEBUFFERRESIDENTNVPROC) (GLenum target, GLenum access);
+typedef void (APIENTRYP PFNGLMAKEBUFFERNONRESIDENTNVPROC) (GLenum target);
+typedef GLboolean (APIENTRYP PFNGLISBUFFERRESIDENTNVPROC) (GLenum target);
+typedef void (APIENTRYP PFNGLMAKENAMEDBUFFERRESIDENTNVPROC) (GLuint buffer, GLenum access);
+typedef void (APIENTRYP PFNGLMAKENAMEDBUFFERNONRESIDENTNVPROC) (GLuint buffer);
+typedef GLboolean (APIENTRYP PFNGLISNAMEDBUFFERRESIDENTNVPROC) (GLuint buffer);
+typedef void (APIENTRYP PFNGLGETBUFFERPARAMETERUI64VNVPROC) (GLenum target, GLenum pname, GLuint64EXT *params);
+typedef void (APIENTRYP PFNGLGETNAMEDBUFFERPARAMETERUI64VNVPROC) (GLuint buffer, GLenum pname, GLuint64EXT *params);
+typedef void (APIENTRYP PFNGLGETINTEGERUI64VNVPROC) (GLenum value, GLuint64EXT *result);
+typedef void (APIENTRYP PFNGLUNIFORMUI64NVPROC) (GLint location, GLuint64EXT value);
+typedef void (APIENTRYP PFNGLUNIFORMUI64VNVPROC) (GLint location, GLsizei count, const GLuint64EXT *value);
+typedef void (APIENTRYP PFNGLGETUNIFORMUI64VNVPROC) (GLuint program, GLint location, GLuint64EXT *params);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMUI64NVPROC) (GLuint program, GLint location, GLuint64EXT value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMUI64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glMakeBufferResidentNV (GLenum target, GLenum access);
+GLAPI void APIENTRY glMakeBufferNonResidentNV (GLenum target);
+GLAPI GLboolean APIENTRY glIsBufferResidentNV (GLenum target);
+GLAPI void APIENTRY glMakeNamedBufferResidentNV (GLuint buffer, GLenum access);
+GLAPI void APIENTRY glMakeNamedBufferNonResidentNV (GLuint buffer);
+GLAPI GLboolean APIENTRY glIsNamedBufferResidentNV (GLuint buffer);
+GLAPI void APIENTRY glGetBufferParameterui64vNV (GLenum target, GLenum pname, GLuint64EXT *params);
+GLAPI void APIENTRY glGetNamedBufferParameterui64vNV (GLuint buffer, GLenum pname, GLuint64EXT *params);
+GLAPI void APIENTRY glGetIntegerui64vNV (GLenum value, GLuint64EXT *result);
+GLAPI void APIENTRY glUniformui64NV (GLint location, GLuint64EXT value);
+GLAPI void APIENTRY glUniformui64vNV (GLint location, GLsizei count, const GLuint64EXT *value);
+GLAPI void APIENTRY glGetUniformui64vNV (GLuint program, GLint location, GLuint64EXT *params);
+GLAPI void APIENTRY glProgramUniformui64NV (GLuint program, GLint location, GLuint64EXT value);
+GLAPI void APIENTRY glProgramUniformui64vNV (GLuint program, GLint location, GLsizei count, const GLuint64EXT *value);
+#endif
+#endif /* GL_NV_shader_buffer_load */
+
+#ifndef GL_NV_shader_buffer_store
+#define GL_NV_shader_buffer_store 1
+#define GL_SHADER_GLOBAL_ACCESS_BARRIER_BIT_NV 0x00000010
+#endif /* GL_NV_shader_buffer_store */
+
+#ifndef GL_NV_shader_thread_group
+#define GL_NV_shader_thread_group 1
+#define GL_WARP_SIZE_NV                   0x9339
+#define GL_WARPS_PER_SM_NV                0x933A
+#define GL_SM_COUNT_NV                    0x933B
+#endif /* GL_NV_shader_thread_group */
+
+#ifndef GL_NV_shader_thread_shuffle
+#define GL_NV_shader_thread_shuffle 1
+#endif /* GL_NV_shader_thread_shuffle */
+
+#ifndef GL_NV_stereo_view_rendering
+#define GL_NV_stereo_view_rendering 1
+#endif /* GL_NV_stereo_view_rendering */
+
+#ifndef GL_NV_texture_barrier
+#define GL_NV_texture_barrier 1
+typedef void (APIENTRYP PFNGLTEXTUREBARRIERNVPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glTextureBarrierNV (void);
+#endif
+#endif /* GL_NV_texture_barrier */
+
+#ifndef GL_NV_uniform_buffer_unified_memory
+#define GL_NV_uniform_buffer_unified_memory 1
+#define GL_UNIFORM_BUFFER_UNIFIED_NV      0x936E
+#define GL_UNIFORM_BUFFER_ADDRESS_NV      0x936F
+#define GL_UNIFORM_BUFFER_LENGTH_NV       0x9370
+#endif /* GL_NV_uniform_buffer_unified_memory */
+
+#ifndef GL_NV_vertex_attrib_integer_64bit
+#define GL_NV_vertex_attrib_integer_64bit 1
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL1I64NVPROC) (GLuint index, GLint64EXT x);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL2I64NVPROC) (GLuint index, GLint64EXT x, GLint64EXT y);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL3I64NVPROC) (GLuint index, GLint64EXT x, GLint64EXT y, GLint64EXT z);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL4I64NVPROC) (GLuint index, GLint64EXT x, GLint64EXT y, GLint64EXT z, GLint64EXT w);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL1I64VNVPROC) (GLuint index, const GLint64EXT *v);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL2I64VNVPROC) (GLuint index, const GLint64EXT *v);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL3I64VNVPROC) (GLuint index, const GLint64EXT *v);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL4I64VNVPROC) (GLuint index, const GLint64EXT *v);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL1UI64NVPROC) (GLuint index, GLuint64EXT x);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL2UI64NVPROC) (GLuint index, GLuint64EXT x, GLuint64EXT y);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL3UI64NVPROC) (GLuint index, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL4UI64NVPROC) (GLuint index, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z, GLuint64EXT w);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL1UI64VNVPROC) (GLuint index, const GLuint64EXT *v);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL2UI64VNVPROC) (GLuint index, const GLuint64EXT *v);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL3UI64VNVPROC) (GLuint index, const GLuint64EXT *v);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBL4UI64VNVPROC) (GLuint index, const GLuint64EXT *v);
+typedef void (APIENTRYP PFNGLGETVERTEXATTRIBLI64VNVPROC) (GLuint index, GLenum pname, GLint64EXT *params);
+typedef void (APIENTRYP PFNGLGETVERTEXATTRIBLUI64VNVPROC) (GLuint index, GLenum pname, GLuint64EXT *params);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBLFORMATNVPROC) (GLuint index, GLint size, GLenum type, GLsizei stride);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glVertexAttribL1i64NV (GLuint index, GLint64EXT x);
+GLAPI void APIENTRY glVertexAttribL2i64NV (GLuint index, GLint64EXT x, GLint64EXT y);
+GLAPI void APIENTRY glVertexAttribL3i64NV (GLuint index, GLint64EXT x, GLint64EXT y, GLint64EXT z);
+GLAPI void APIENTRY glVertexAttribL4i64NV (GLuint index, GLint64EXT x, GLint64EXT y, GLint64EXT z, GLint64EXT w);
+GLAPI void APIENTRY glVertexAttribL1i64vNV (GLuint index, const GLint64EXT *v);
+GLAPI void APIENTRY glVertexAttribL2i64vNV (GLuint index, const GLint64EXT *v);
+GLAPI void APIENTRY glVertexAttribL3i64vNV (GLuint index, const GLint64EXT *v);
+GLAPI void APIENTRY glVertexAttribL4i64vNV (GLuint index, const GLint64EXT *v);
+GLAPI void APIENTRY glVertexAttribL1ui64NV (GLuint index, GLuint64EXT x);
+GLAPI void APIENTRY glVertexAttribL2ui64NV (GLuint index, GLuint64EXT x, GLuint64EXT y);
+GLAPI void APIENTRY glVertexAttribL3ui64NV (GLuint index, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z);
+GLAPI void APIENTRY glVertexAttribL4ui64NV (GLuint index, GLuint64EXT x, GLuint64EXT y, GLuint64EXT z, GLuint64EXT w);
+GLAPI void APIENTRY glVertexAttribL1ui64vNV (GLuint index, const GLuint64EXT *v);
+GLAPI void APIENTRY glVertexAttribL2ui64vNV (GLuint index, const GLuint64EXT *v);
+GLAPI void APIENTRY glVertexAttribL3ui64vNV (GLuint index, const GLuint64EXT *v);
+GLAPI void APIENTRY glVertexAttribL4ui64vNV (GLuint index, const GLuint64EXT *v);
+GLAPI void APIENTRY glGetVertexAttribLi64vNV (GLuint index, GLenum pname, GLint64EXT *params);
+GLAPI void APIENTRY glGetVertexAttribLui64vNV (GLuint index, GLenum pname, GLuint64EXT *params);
+GLAPI void APIENTRY glVertexAttribLFormatNV (GLuint index, GLint size, GLenum type, GLsizei stride);
+#endif
+#endif /* GL_NV_vertex_attrib_integer_64bit */
+
+#ifndef GL_NV_vertex_buffer_unified_memory
+#define GL_NV_vertex_buffer_unified_memory 1
+#define GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV 0x8F1E
+#define GL_ELEMENT_ARRAY_UNIFIED_NV       0x8F1F
+#define GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV 0x8F20
+#define GL_VERTEX_ARRAY_ADDRESS_NV        0x8F21
+#define GL_NORMAL_ARRAY_ADDRESS_NV        0x8F22
+#define GL_COLOR_ARRAY_ADDRESS_NV         0x8F23
+#define GL_INDEX_ARRAY_ADDRESS_NV         0x8F24
+#define GL_TEXTURE_COORD_ARRAY_ADDRESS_NV 0x8F25
+#define GL_EDGE_FLAG_ARRAY_ADDRESS_NV     0x8F26
+#define GL_SECONDARY_COLOR_ARRAY_ADDRESS_NV 0x8F27
+#define GL_FOG_COORD_ARRAY_ADDRESS_NV     0x8F28
+#define GL_ELEMENT_ARRAY_ADDRESS_NV       0x8F29
+#define GL_VERTEX_ATTRIB_ARRAY_LENGTH_NV  0x8F2A
+#define GL_VERTEX_ARRAY_LENGTH_NV         0x8F2B
+#define GL_NORMAL_ARRAY_LENGTH_NV         0x8F2C
+#define GL_COLOR_ARRAY_LENGTH_NV          0x8F2D
+#define GL_INDEX_ARRAY_LENGTH_NV          0x8F2E
+#define GL_TEXTURE_COORD_ARRAY_LENGTH_NV  0x8F2F
+#define GL_EDGE_FLAG_ARRAY_LENGTH_NV      0x8F30
+#define GL_SECONDARY_COLOR_ARRAY_LENGTH_NV 0x8F31
+#define GL_FOG_COORD_ARRAY_LENGTH_NV      0x8F32
+#define GL_ELEMENT_ARRAY_LENGTH_NV        0x8F33
+#define GL_DRAW_INDIRECT_UNIFIED_NV       0x8F40
+#define GL_DRAW_INDIRECT_ADDRESS_NV       0x8F41
+#define GL_DRAW_INDIRECT_LENGTH_NV        0x8F42
+typedef void (APIENTRYP PFNGLBUFFERADDRESSRANGENVPROC) (GLenum pname, GLuint index, GLuint64EXT address, GLsizeiptr length);
+typedef void (APIENTRYP PFNGLVERTEXFORMATNVPROC) (GLint size, GLenum type, GLsizei stride);
+typedef void (APIENTRYP PFNGLNORMALFORMATNVPROC) (GLenum type, GLsizei stride);
+typedef void (APIENTRYP PFNGLCOLORFORMATNVPROC) (GLint size, GLenum type, GLsizei stride);
+typedef void (APIENTRYP PFNGLINDEXFORMATNVPROC) (GLenum type, GLsizei stride);
+typedef void (APIENTRYP PFNGLTEXCOORDFORMATNVPROC) (GLint size, GLenum type, GLsizei stride);
+typedef void (APIENTRYP PFNGLEDGEFLAGFORMATNVPROC) (GLsizei stride);
+typedef void (APIENTRYP PFNGLSECONDARYCOLORFORMATNVPROC) (GLint size, GLenum type, GLsizei stride);
+typedef void (APIENTRYP PFNGLFOGCOORDFORMATNVPROC) (GLenum type, GLsizei stride);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBFORMATNVPROC) (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride);
+typedef void (APIENTRYP PFNGLVERTEXATTRIBIFORMATNVPROC) (GLuint index, GLint size, GLenum type, GLsizei stride);
+typedef void (APIENTRYP PFNGLGETINTEGERUI64I_VNVPROC) (GLenum value, GLuint index, GLuint64EXT *result);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glBufferAddressRangeNV (GLenum pname, GLuint index, GLuint64EXT address, GLsizeiptr length);
+GLAPI void APIENTRY glVertexFormatNV (GLint size, GLenum type, GLsizei stride);
+GLAPI void APIENTRY glNormalFormatNV (GLenum type, GLsizei stride);
+GLAPI void APIENTRY glColorFormatNV (GLint size, GLenum type, GLsizei stride);
+GLAPI void APIENTRY glIndexFormatNV (GLenum type, GLsizei stride);
+GLAPI void APIENTRY glTexCoordFormatNV (GLint size, GLenum type, GLsizei stride);
+GLAPI void APIENTRY glEdgeFlagFormatNV (GLsizei stride);
+GLAPI void APIENTRY glSecondaryColorFormatNV (GLint size, GLenum type, GLsizei stride);
+GLAPI void APIENTRY glFogCoordFormatNV (GLenum type, GLsizei stride);
+GLAPI void APIENTRY glVertexAttribFormatNV (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride);
+GLAPI void APIENTRY glVertexAttribIFormatNV (GLuint index, GLint size, GLenum type, GLsizei stride);
+GLAPI void APIENTRY glGetIntegerui64i_vNV (GLenum value, GLuint index, GLuint64EXT *result);
+#endif
+#endif /* GL_NV_vertex_buffer_unified_memory */
+
+#ifndef GL_NV_viewport_array2
+#define GL_NV_viewport_array2 1
+#endif /* GL_NV_viewport_array2 */
+
+#ifndef GL_NV_viewport_swizzle
+#define GL_NV_viewport_swizzle 1
+#define GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV 0x9350
+#define GL_VIEWPORT_SWIZZLE_NEGATIVE_X_NV 0x9351
+#define GL_VIEWPORT_SWIZZLE_POSITIVE_Y_NV 0x9352
+#define GL_VIEWPORT_SWIZZLE_NEGATIVE_Y_NV 0x9353
+#define GL_VIEWPORT_SWIZZLE_POSITIVE_Z_NV 0x9354
+#define GL_VIEWPORT_SWIZZLE_NEGATIVE_Z_NV 0x9355
+#define GL_VIEWPORT_SWIZZLE_POSITIVE_W_NV 0x9356
+#define GL_VIEWPORT_SWIZZLE_NEGATIVE_W_NV 0x9357
+#define GL_VIEWPORT_SWIZZLE_X_NV          0x9358
+#define GL_VIEWPORT_SWIZZLE_Y_NV          0x9359
+#define GL_VIEWPORT_SWIZZLE_Z_NV          0x935A
+#define GL_VIEWPORT_SWIZZLE_W_NV          0x935B
+typedef void (APIENTRYP PFNGLVIEWPORTSWIZZLENVPROC) (GLuint index, GLenum swizzlex, GLenum swizzley, GLenum swizzlez, GLenum swizzlew);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glViewportSwizzleNV (GLuint index, GLenum swizzlex, GLenum swizzley, GLenum swizzlez, GLenum swizzlew);
+#endif
+#endif /* GL_NV_viewport_swizzle */
+
+#ifndef GL_OVR_multiview
+#define GL_OVR_multiview 1
+#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_NUM_VIEWS_OVR 0x9630
+#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_BASE_VIEW_INDEX_OVR 0x9632
+#define GL_MAX_VIEWS_OVR                  0x9631
+#define GL_FRAMEBUFFER_INCOMPLETE_VIEW_TARGETS_OVR 0x9633
+typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glFramebufferTextureMultiviewOVR (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews);
+#endif
+#endif /* GL_OVR_multiview */
+
+#ifndef GL_OVR_multiview2
+#define GL_OVR_multiview2 1
+#endif /* GL_OVR_multiview2 */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/GL/glext.h b/include/GL/glext.h
index 4753575..b7f6119 100644
--- a/include/GL/glext.h
+++ b/include/GL/glext.h
@@ -6,7 +6,7 @@
 #endif
 
 /*
-** Copyright (c) 2013-2016 The Khronos Group Inc.
+** Copyright (c) 2013-2017 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -31,9 +31,7 @@
 ** This header is generated from the Khronos OpenGL / OpenGL ES XML
 ** API Registry. The current version of the Registry, generator scripts
 ** used to make the header, and the header can be found at
-**   http://www.opengl.org/registry/
-**
-** Khronos $Revision: 33061 $ on $Date: 2016-07-14 20:14:13 -0400 (Thu, 14 Jul 2016) $
+**   https://github.com/KhronosGroup/OpenGL-Registry
 */
 
 #if defined(_WIN32) && !defined(APIENTRY) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__)
@@ -53,7 +51,7 @@
 #define GLAPI extern
 #endif
 
-#define GL_GLEXT_VERSION 20160714
+#define GL_GLEXT_VERSION 20170608
 
 /* Generated C header for:
  * API: gl
@@ -3355,6 +3353,16 @@
 #define GL_ARB_get_texture_sub_image 1
 #endif /* GL_ARB_get_texture_sub_image */
 
+#ifndef GL_ARB_gl_spirv
+#define GL_ARB_gl_spirv 1
+#define GL_SHADER_BINARY_FORMAT_SPIR_V_ARB 0x9551
+#define GL_SPIR_V_BINARY_ARB              0x9552
+typedef void (APIENTRYP PFNGLSPECIALIZESHADERARBPROC) (GLuint shader, const GLchar *pEntryPoint, GLuint numSpecializationConstants, const GLuint *pConstantIndex, const GLuint *pConstantValue);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glSpecializeShaderARB (GLuint shader, const GLchar *pEntryPoint, GLuint numSpecializationConstants, const GLuint *pConstantIndex, const GLuint *pConstantValue);
+#endif
+#endif /* GL_ARB_gl_spirv */
+
 #ifndef GL_ARB_gpu_shader5
 #define GL_ARB_gpu_shader5 1
 #endif /* GL_ARB_gpu_shader5 */
@@ -5379,10 +5387,49 @@
 #endif
 #endif /* GL_AMD_draw_buffers_blend */
 
+#ifndef GL_AMD_framebuffer_sample_positions
+#define GL_AMD_framebuffer_sample_positions 1
+#define GL_SUBSAMPLE_DISTANCE_AMD         0x883F
+#define GL_PIXELS_PER_SAMPLE_PATTERN_X_AMD 0x91AE
+#define GL_PIXELS_PER_SAMPLE_PATTERN_Y_AMD 0x91AF
+#define GL_ALL_PIXELS_AMD                 0xFFFFFFFF
+typedef void (APIENTRYP PFNGLFRAMEBUFFERSAMPLEPOSITIONSFVAMDPROC) (GLenum target, GLuint numsamples, GLuint pixelindex, const GLfloat *values);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERSAMPLEPOSITIONSFVAMDPROC) (GLuint framebuffer, GLuint numsamples, GLuint pixelindex, const GLfloat *values);
+typedef void (APIENTRYP PFNGLGETFRAMEBUFFERPARAMETERFVAMDPROC) (GLenum target, GLenum pname, GLuint numsamples, GLuint pixelindex, GLsizei size, GLfloat *values);
+typedef void (APIENTRYP PFNGLGETNAMEDFRAMEBUFFERPARAMETERFVAMDPROC) (GLuint framebuffer, GLenum pname, GLuint numsamples, GLuint pixelindex, GLsizei size, GLfloat *values);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glFramebufferSamplePositionsfvAMD (GLenum target, GLuint numsamples, GLuint pixelindex, const GLfloat *values);
+GLAPI void APIENTRY glNamedFramebufferSamplePositionsfvAMD (GLuint framebuffer, GLuint numsamples, GLuint pixelindex, const GLfloat *values);
+GLAPI void APIENTRY glGetFramebufferParameterfvAMD (GLenum target, GLenum pname, GLuint numsamples, GLuint pixelindex, GLsizei size, GLfloat *values);
+GLAPI void APIENTRY glGetNamedFramebufferParameterfvAMD (GLuint framebuffer, GLenum pname, GLuint numsamples, GLuint pixelindex, GLsizei size, GLfloat *values);
+#endif
+#endif /* GL_AMD_framebuffer_sample_positions */
+
 #ifndef GL_AMD_gcn_shader
 #define GL_AMD_gcn_shader 1
 #endif /* GL_AMD_gcn_shader */
 
+#ifndef GL_AMD_gpu_shader_half_float
+#define GL_AMD_gpu_shader_half_float 1
+#define GL_FLOAT16_NV                     0x8FF8
+#define GL_FLOAT16_VEC2_NV                0x8FF9
+#define GL_FLOAT16_VEC3_NV                0x8FFA
+#define GL_FLOAT16_VEC4_NV                0x8FFB
+#define GL_FLOAT16_MAT2_AMD               0x91C5
+#define GL_FLOAT16_MAT3_AMD               0x91C6
+#define GL_FLOAT16_MAT4_AMD               0x91C7
+#define GL_FLOAT16_MAT2x3_AMD             0x91C8
+#define GL_FLOAT16_MAT2x4_AMD             0x91C9
+#define GL_FLOAT16_MAT3x2_AMD             0x91CA
+#define GL_FLOAT16_MAT3x4_AMD             0x91CB
+#define GL_FLOAT16_MAT4x2_AMD             0x91CC
+#define GL_FLOAT16_MAT4x3_AMD             0x91CD
+#endif /* GL_AMD_gpu_shader_half_float */
+
+#ifndef GL_AMD_gpu_shader_int16
+#define GL_AMD_gpu_shader_int16 1
+#endif /* GL_AMD_gpu_shader_int16 */
+
 #ifndef GL_AMD_gpu_shader_int64
 #define GL_AMD_gpu_shader_int64 1
 typedef int64_t GLint64EXT;
@@ -5410,10 +5457,6 @@
 #define GL_UNSIGNED_INT64_VEC2_NV         0x8FF5
 #define GL_UNSIGNED_INT64_VEC3_NV         0x8FF6
 #define GL_UNSIGNED_INT64_VEC4_NV         0x8FF7
-#define GL_FLOAT16_NV                     0x8FF8
-#define GL_FLOAT16_VEC2_NV                0x8FF9
-#define GL_FLOAT16_VEC3_NV                0x8FFA
-#define GL_FLOAT16_VEC4_NV                0x8FFB
 typedef void (APIENTRYP PFNGLUNIFORM1I64NVPROC) (GLint location, GLint64EXT x);
 typedef void (APIENTRYP PFNGLUNIFORM2I64NVPROC) (GLint location, GLint64EXT x, GLint64EXT y);
 typedef void (APIENTRYP PFNGLUNIFORM3I64NVPROC) (GLint location, GLint64EXT x, GLint64EXT y, GLint64EXT z);
@@ -5586,7 +5629,6 @@
 
 #ifndef GL_AMD_sample_positions
 #define GL_AMD_sample_positions 1
-#define GL_SUBSAMPLE_DISTANCE_AMD         0x883F
 typedef void (APIENTRYP PFNGLSETMULTISAMPLEFVAMDPROC) (GLenum pname, GLuint index, const GLfloat *val);
 #ifdef GL_GLEXT_PROTOTYPES
 GLAPI void APIENTRY glSetMultisamplefvAMD (GLenum pname, GLuint index, const GLfloat *val);
@@ -5601,6 +5643,10 @@
 #define GL_AMD_shader_atomic_counter_ops 1
 #endif /* GL_AMD_shader_atomic_counter_ops */
 
+#ifndef GL_AMD_shader_ballot
+#define GL_AMD_shader_ballot 1
+#endif /* GL_AMD_shader_ballot */
+
 #ifndef GL_AMD_shader_explicit_vertex_parameter
 #define GL_AMD_shader_explicit_vertex_parameter 1
 #endif /* GL_AMD_shader_explicit_vertex_parameter */
@@ -5644,6 +5690,10 @@
 #endif
 #endif /* GL_AMD_stencil_operation_extended */
 
+#ifndef GL_AMD_texture_gather_bias_lod
+#define GL_AMD_texture_gather_bias_lod 1
+#endif /* GL_AMD_texture_gather_bias_lod */
+
 #ifndef GL_AMD_texture_texture4
 #define GL_AMD_texture_texture4 1
 #endif /* GL_AMD_texture_texture4 */
@@ -7538,6 +7588,89 @@
 #endif
 #endif /* GL_EXT_light_texture */
 
+#ifndef GL_EXT_memory_object
+#define GL_EXT_memory_object 1
+#define GL_TEXTURE_TILING_EXT             0x9580
+#define GL_DEDICATED_MEMORY_OBJECT_EXT    0x9581
+#define GL_PROTECTED_MEMORY_OBJECT_EXT    0x959B
+#define GL_NUM_TILING_TYPES_EXT           0x9582
+#define GL_TILING_TYPES_EXT               0x9583
+#define GL_OPTIMAL_TILING_EXT             0x9584
+#define GL_LINEAR_TILING_EXT              0x9585
+#define GL_NUM_DEVICE_UUIDS_EXT           0x9596
+#define GL_DEVICE_UUID_EXT                0x9597
+#define GL_DRIVER_UUID_EXT                0x9598
+#define GL_UUID_SIZE_EXT                  16
+typedef void (APIENTRYP PFNGLGETUNSIGNEDBYTEVEXTPROC) (GLenum pname, GLubyte *data);
+typedef void (APIENTRYP PFNGLGETUNSIGNEDBYTEI_VEXTPROC) (GLenum target, GLuint index, GLubyte *data);
+typedef void (APIENTRYP PFNGLDELETEMEMORYOBJECTSEXTPROC) (GLsizei n, const GLuint *memoryObjects);
+typedef GLboolean (APIENTRYP PFNGLISMEMORYOBJECTEXTPROC) (GLuint memoryObject);
+typedef void (APIENTRYP PFNGLCREATEMEMORYOBJECTSEXTPROC) (GLsizei n, GLuint *memoryObjects);
+typedef void (APIENTRYP PFNGLMEMORYOBJECTPARAMETERIVEXTPROC) (GLuint memoryObject, GLenum pname, const GLint *params);
+typedef void (APIENTRYP PFNGLGETMEMORYOBJECTPARAMETERIVEXTPROC) (GLuint memoryObject, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLTEXSTORAGEMEM2DEXTPROC) (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLTEXSTORAGEMEM2DMULTISAMPLEEXTPROC) (GLenum target, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLTEXSTORAGEMEM3DEXTPROC) (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLTEXSTORAGEMEM3DMULTISAMPLEEXTPROC) (GLenum target, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLBUFFERSTORAGEMEMEXTPROC) (GLenum target, GLsizeiptr size, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLTEXTURESTORAGEMEM2DEXTPROC) (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLTEXTURESTORAGEMEM2DMULTISAMPLEEXTPROC) (GLuint texture, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLTEXTURESTORAGEMEM3DEXTPROC) (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLTEXTURESTORAGEMEM3DMULTISAMPLEEXTPROC) (GLuint texture, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLNAMEDBUFFERSTORAGEMEMEXTPROC) (GLuint buffer, GLsizeiptr size, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLTEXSTORAGEMEM1DEXTPROC) (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLuint memory, GLuint64 offset);
+typedef void (APIENTRYP PFNGLTEXTURESTORAGEMEM1DEXTPROC) (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLuint memory, GLuint64 offset);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glGetUnsignedBytevEXT (GLenum pname, GLubyte *data);
+GLAPI void APIENTRY glGetUnsignedBytei_vEXT (GLenum target, GLuint index, GLubyte *data);
+GLAPI void APIENTRY glDeleteMemoryObjectsEXT (GLsizei n, const GLuint *memoryObjects);
+GLAPI GLboolean APIENTRY glIsMemoryObjectEXT (GLuint memoryObject);
+GLAPI void APIENTRY glCreateMemoryObjectsEXT (GLsizei n, GLuint *memoryObjects);
+GLAPI void APIENTRY glMemoryObjectParameterivEXT (GLuint memoryObject, GLenum pname, const GLint *params);
+GLAPI void APIENTRY glGetMemoryObjectParameterivEXT (GLuint memoryObject, GLenum pname, GLint *params);
+GLAPI void APIENTRY glTexStorageMem2DEXT (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glTexStorageMem2DMultisampleEXT (GLenum target, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glTexStorageMem3DEXT (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glTexStorageMem3DMultisampleEXT (GLenum target, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glBufferStorageMemEXT (GLenum target, GLsizeiptr size, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glTextureStorageMem2DEXT (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glTextureStorageMem2DMultisampleEXT (GLuint texture, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glTextureStorageMem3DEXT (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glTextureStorageMem3DMultisampleEXT (GLuint texture, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glNamedBufferStorageMemEXT (GLuint buffer, GLsizeiptr size, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glTexStorageMem1DEXT (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLuint memory, GLuint64 offset);
+GLAPI void APIENTRY glTextureStorageMem1DEXT (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLuint memory, GLuint64 offset);
+#endif
+#endif /* GL_EXT_memory_object */
+
+#ifndef GL_EXT_memory_object_fd
+#define GL_EXT_memory_object_fd 1
+#define GL_HANDLE_TYPE_OPAQUE_FD_EXT      0x9586
+typedef void (APIENTRYP PFNGLIMPORTMEMORYFDEXTPROC) (GLuint memory, GLuint64 size, GLenum handleType, GLint fd);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glImportMemoryFdEXT (GLuint memory, GLuint64 size, GLenum handleType, GLint fd);
+#endif
+#endif /* GL_EXT_memory_object_fd */
+
+#ifndef GL_EXT_memory_object_win32
+#define GL_EXT_memory_object_win32 1
+#define GL_HANDLE_TYPE_OPAQUE_WIN32_EXT   0x9587
+#define GL_HANDLE_TYPE_OPAQUE_WIN32_KMT_EXT 0x9588
+#define GL_DEVICE_LUID_EXT                0x9599
+#define GL_DEVICE_NODE_MASK_EXT           0x959A
+#define GL_LUID_SIZE_EXT                  8
+#define GL_HANDLE_TYPE_D3D12_TILEPOOL_EXT 0x9589
+#define GL_HANDLE_TYPE_D3D12_RESOURCE_EXT 0x958A
+#define GL_HANDLE_TYPE_D3D11_IMAGE_EXT    0x958B
+#define GL_HANDLE_TYPE_D3D11_IMAGE_KMT_EXT 0x958C
+typedef void (APIENTRYP PFNGLIMPORTMEMORYWIN32HANDLEEXTPROC) (GLuint memory, GLuint64 size, GLenum handleType, void *handle);
+typedef void (APIENTRYP PFNGLIMPORTMEMORYWIN32NAMEEXTPROC) (GLuint memory, GLuint64 size, GLenum handleType, const void *name);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glImportMemoryWin32HandleEXT (GLuint memory, GLuint64 size, GLenum handleType, void *handle);
+GLAPI void APIENTRY glImportMemoryWin32NameEXT (GLuint memory, GLuint64 size, GLenum handleType, const void *name);
+#endif
+#endif /* GL_EXT_memory_object_win32 */
+
 #ifndef GL_EXT_misc_attribute
 #define GL_EXT_misc_attribute 1
 #endif /* GL_EXT_misc_attribute */
@@ -7779,6 +7912,53 @@
 #endif
 #endif /* GL_EXT_secondary_color */
 
+#ifndef GL_EXT_semaphore
+#define GL_EXT_semaphore 1
+#define GL_LAYOUT_GENERAL_EXT             0x958D
+#define GL_LAYOUT_COLOR_ATTACHMENT_EXT    0x958E
+#define GL_LAYOUT_DEPTH_STENCIL_ATTACHMENT_EXT 0x958F
+#define GL_LAYOUT_DEPTH_STENCIL_READ_ONLY_EXT 0x9590
+#define GL_LAYOUT_SHADER_READ_ONLY_EXT    0x9591
+#define GL_LAYOUT_TRANSFER_SRC_EXT        0x9592
+#define GL_LAYOUT_TRANSFER_DST_EXT        0x9593
+typedef void (APIENTRYP PFNGLGENSEMAPHORESEXTPROC) (GLsizei n, GLuint *semaphores);
+typedef void (APIENTRYP PFNGLDELETESEMAPHORESEXTPROC) (GLsizei n, const GLuint *semaphores);
+typedef GLboolean (APIENTRYP PFNGLISSEMAPHOREEXTPROC) (GLuint semaphore);
+typedef void (APIENTRYP PFNGLSEMAPHOREPARAMETERUI64VEXTPROC) (GLuint semaphore, GLenum pname, const GLuint64 *params);
+typedef void (APIENTRYP PFNGLGETSEMAPHOREPARAMETERUI64VEXTPROC) (GLuint semaphore, GLenum pname, GLuint64 *params);
+typedef void (APIENTRYP PFNGLWAITSEMAPHOREEXTPROC) (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *srcLayouts);
+typedef void (APIENTRYP PFNGLSIGNALSEMAPHOREEXTPROC) (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *dstLayouts);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glGenSemaphoresEXT (GLsizei n, GLuint *semaphores);
+GLAPI void APIENTRY glDeleteSemaphoresEXT (GLsizei n, const GLuint *semaphores);
+GLAPI GLboolean APIENTRY glIsSemaphoreEXT (GLuint semaphore);
+GLAPI void APIENTRY glSemaphoreParameterui64vEXT (GLuint semaphore, GLenum pname, const GLuint64 *params);
+GLAPI void APIENTRY glGetSemaphoreParameterui64vEXT (GLuint semaphore, GLenum pname, GLuint64 *params);
+GLAPI void APIENTRY glWaitSemaphoreEXT (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *srcLayouts);
+GLAPI void APIENTRY glSignalSemaphoreEXT (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *dstLayouts);
+#endif
+#endif /* GL_EXT_semaphore */
+
+#ifndef GL_EXT_semaphore_fd
+#define GL_EXT_semaphore_fd 1
+typedef void (APIENTRYP PFNGLIMPORTSEMAPHOREFDEXTPROC) (GLuint semaphore, GLenum handleType, GLint fd);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glImportSemaphoreFdEXT (GLuint semaphore, GLenum handleType, GLint fd);
+#endif
+#endif /* GL_EXT_semaphore_fd */
+
+#ifndef GL_EXT_semaphore_win32
+#define GL_EXT_semaphore_win32 1
+#define GL_HANDLE_TYPE_D3D12_FENCE_EXT    0x9594
+#define GL_D3D12_FENCE_VALUE_EXT          0x9595
+typedef void (APIENTRYP PFNGLIMPORTSEMAPHOREWIN32HANDLEEXTPROC) (GLuint semaphore, GLenum handleType, void *handle);
+typedef void (APIENTRYP PFNGLIMPORTSEMAPHOREWIN32NAMEEXTPROC) (GLuint semaphore, GLenum handleType, const void *name);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glImportSemaphoreWin32HandleEXT (GLuint semaphore, GLenum handleType, void *handle);
+GLAPI void APIENTRY glImportSemaphoreWin32NameEXT (GLuint semaphore, GLenum handleType, const void *name);
+#endif
+#endif /* GL_EXT_semaphore_win32 */
+
 #ifndef GL_EXT_separate_shader_objects
 #define GL_EXT_separate_shader_objects 1
 #define GL_ACTIVE_PROGRAM_EXT             0x8B8D
@@ -8645,6 +8825,16 @@
 #endif
 #endif /* GL_EXT_vertex_weighting */
 
+#ifndef GL_EXT_win32_keyed_mutex
+#define GL_EXT_win32_keyed_mutex 1
+typedef GLboolean (APIENTRYP PFNGLACQUIREKEYEDMUTEXWIN32EXTPROC) (GLuint memory, GLuint64 key, GLuint timeout);
+typedef GLboolean (APIENTRYP PFNGLRELEASEKEYEDMUTEXWIN32EXTPROC) (GLuint memory, GLuint64 key);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI GLboolean APIENTRY glAcquireKeyedMutexWin32EXT (GLuint memory, GLuint64 key, GLuint timeout);
+GLAPI GLboolean APIENTRY glReleaseKeyedMutexWin32EXT (GLuint memory, GLuint64 key);
+#endif
+#endif /* GL_EXT_win32_keyed_mutex */
+
 #ifndef GL_EXT_window_rectangles
 #define GL_EXT_window_rectangles 1
 #define GL_INCLUSIVE_EXT                  0x8F10
@@ -8957,6 +9147,10 @@
 #endif
 #endif /* GL_MESA_resize_buffers */
 
+#ifndef GL_MESA_shader_integer_functions
+#define GL_MESA_shader_integer_functions 1
+#endif /* GL_MESA_shader_integer_functions */
+
 #ifndef GL_MESA_window_pos
 #define GL_MESA_window_pos 1
 typedef void (APIENTRYP PFNGLWINDOWPOS2DMESAPROC) (GLdouble x, GLdouble y);
@@ -9018,6 +9212,10 @@
 #define GL_YCBCR_MESA                     0x8757
 #endif /* GL_MESA_ycbcr_texture */
 
+#ifndef GL_NVX_blend_equation_advanced_multi_draw_buffers
+#define GL_NVX_blend_equation_advanced_multi_draw_buffers 1
+#endif /* GL_NVX_blend_equation_advanced_multi_draw_buffers */
+
 #ifndef GL_NVX_conditional_render
 #define GL_NVX_conditional_render 1
 typedef void (APIENTRYP PFNGLBEGINCONDITIONALRENDERNVXPROC) (GLuint id);
@@ -9037,6 +9235,32 @@
 #define GL_GPU_MEMORY_INFO_EVICTED_MEMORY_NVX 0x904B
 #endif /* GL_NVX_gpu_memory_info */
 
+#ifndef GL_NVX_linked_gpu_multicast
+#define GL_NVX_linked_gpu_multicast 1
+#define GL_LGPU_SEPARATE_STORAGE_BIT_NVX  0x0800
+#define GL_MAX_LGPU_GPUS_NVX              0x92BA
+typedef void (APIENTRYP PFNGLLGPUNAMEDBUFFERSUBDATANVXPROC) (GLbitfield gpuMask, GLuint buffer, GLintptr offset, GLsizeiptr size, const void *data);
+typedef void (APIENTRYP PFNGLLGPUCOPYIMAGESUBDATANVXPROC) (GLuint sourceGpu, GLbitfield destinationGpuMask, GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srxY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei width, GLsizei height, GLsizei depth);
+typedef void (APIENTRYP PFNGLLGPUINTERLOCKNVXPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glLGPUNamedBufferSubDataNVX (GLbitfield gpuMask, GLuint buffer, GLintptr offset, GLsizeiptr size, const void *data);
+GLAPI void APIENTRY glLGPUCopyImageSubDataNVX (GLuint sourceGpu, GLbitfield destinationGpuMask, GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srxY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei width, GLsizei height, GLsizei depth);
+GLAPI void APIENTRY glLGPUInterlockNVX (void);
+#endif
+#endif /* GL_NVX_linked_gpu_multicast */
+
+#ifndef GL_NV_alpha_to_coverage_dither_control
+#define GL_NV_alpha_to_coverage_dither_control 1
+#define GL_ALPHA_TO_COVERAGE_DITHER_DEFAULT_NV 0x934D
+#define GL_ALPHA_TO_COVERAGE_DITHER_ENABLE_NV 0x934E
+#define GL_ALPHA_TO_COVERAGE_DITHER_DISABLE_NV 0x934F
+#define GL_ALPHA_TO_COVERAGE_DITHER_MODE_NV 0x92BF
+typedef void (APIENTRYP PFNGLALPHATOCOVERAGEDITHERCONTROLNVPROC) (GLenum mode);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glAlphaToCoverageDitherControlNV (GLenum mode);
+#endif
+#endif /* GL_NV_alpha_to_coverage_dither_control */
+
 #ifndef GL_NV_bindless_multi_draw_indirect
 #define GL_NV_bindless_multi_draw_indirect 1
 typedef void (APIENTRYP PFNGLMULTIDRAWARRAYSINDIRECTBINDLESSNVPROC) (GLenum mode, const void *indirect, GLsizei drawCount, GLsizei stride, GLint vertexBufferCount);
@@ -9330,6 +9554,23 @@
 #endif
 #endif /* GL_NV_draw_texture */
 
+#ifndef GL_NV_draw_vulkan_image
+#define GL_NV_draw_vulkan_image 1
+typedef void (APIENTRY  *GLVULKANPROCNV)(void);
+typedef void (APIENTRYP PFNGLDRAWVKIMAGENVPROC) (GLuint64 vkImage, GLuint sampler, GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1, GLfloat z, GLfloat s0, GLfloat t0, GLfloat s1, GLfloat t1);
+typedef GLVULKANPROCNV (APIENTRYP PFNGLGETVKPROCADDRNVPROC) (const GLchar *name);
+typedef void (APIENTRYP PFNGLWAITVKSEMAPHORENVPROC) (GLuint64 vkSemaphore);
+typedef void (APIENTRYP PFNGLSIGNALVKSEMAPHORENVPROC) (GLuint64 vkSemaphore);
+typedef void (APIENTRYP PFNGLSIGNALVKFENCENVPROC) (GLuint64 vkFence);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glDrawVkImageNV (GLuint64 vkImage, GLuint sampler, GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1, GLfloat z, GLfloat s0, GLfloat t0, GLfloat s1, GLfloat t1);
+GLAPI GLVULKANPROCNV APIENTRY glGetVkProcAddrNV (const GLchar *name);
+GLAPI void APIENTRY glWaitVkSemaphoreNV (GLuint64 vkSemaphore);
+GLAPI void APIENTRY glSignalVkSemaphoreNV (GLuint64 vkSemaphore);
+GLAPI void APIENTRY glSignalVkFenceNV (GLuint64 vkFence);
+#endif
+#endif /* GL_NV_draw_vulkan_image */
+
 #ifndef GL_NV_evaluators
 #define GL_NV_evaluators 1
 #define GL_EVAL_2D_NV                     0x86C0
@@ -9564,6 +9805,41 @@
 #define GL_NV_geometry_shader_passthrough 1
 #endif /* GL_NV_geometry_shader_passthrough */
 
+#ifndef GL_NV_gpu_multicast
+#define GL_NV_gpu_multicast 1
+#define GL_PER_GPU_STORAGE_BIT_NV         0x0800
+#define GL_MULTICAST_GPUS_NV              0x92BA
+#define GL_RENDER_GPU_MASK_NV             0x9558
+#define GL_PER_GPU_STORAGE_NV             0x9548
+#define GL_MULTICAST_PROGRAMMABLE_SAMPLE_LOCATION_NV 0x9549
+typedef void (APIENTRYP PFNGLRENDERGPUMASKNVPROC) (GLbitfield mask);
+typedef void (APIENTRYP PFNGLMULTICASTBUFFERSUBDATANVPROC) (GLbitfield gpuMask, GLuint buffer, GLintptr offset, GLsizeiptr size, const GLvoid *data);
+typedef void (APIENTRYP PFNGLMULTICASTCOPYBUFFERSUBDATANVPROC) (GLuint readGpu, GLbitfield writeGpuMask, GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
+typedef void (APIENTRYP PFNGLMULTICASTCOPYIMAGESUBDATANVPROC) (GLuint srcGpu, GLbitfield dstGpuMask, GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth);
+typedef void (APIENTRYP PFNGLMULTICASTBLITFRAMEBUFFERNVPROC) (GLuint srcGpu, GLuint dstGpu, GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
+typedef void (APIENTRYP PFNGLMULTICASTFRAMEBUFFERSAMPLELOCATIONSFVNVPROC) (GLuint gpu, GLuint framebuffer, GLuint start, GLsizei count, const GLfloat *v);
+typedef void (APIENTRYP PFNGLMULTICASTBARRIERNVPROC) (void);
+typedef void (APIENTRYP PFNGLMULTICASTWAITSYNCNVPROC) (GLuint signalGpu, GLbitfield waitGpuMask);
+typedef void (APIENTRYP PFNGLMULTICASTGETQUERYOBJECTIVNVPROC) (GLuint gpu, GLuint id, GLenum pname, GLint *params);
+typedef void (APIENTRYP PFNGLMULTICASTGETQUERYOBJECTUIVNVPROC) (GLuint gpu, GLuint id, GLenum pname, GLuint *params);
+typedef void (APIENTRYP PFNGLMULTICASTGETQUERYOBJECTI64VNVPROC) (GLuint gpu, GLuint id, GLenum pname, GLint64 *params);
+typedef void (APIENTRYP PFNGLMULTICASTGETQUERYOBJECTUI64VNVPROC) (GLuint gpu, GLuint id, GLenum pname, GLuint64 *params);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glRenderGpuMaskNV (GLbitfield mask);
+GLAPI void APIENTRY glMulticastBufferSubDataNV (GLbitfield gpuMask, GLuint buffer, GLintptr offset, GLsizeiptr size, const GLvoid *data);
+GLAPI void APIENTRY glMulticastCopyBufferSubDataNV (GLuint readGpu, GLbitfield writeGpuMask, GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
+GLAPI void APIENTRY glMulticastCopyImageSubDataNV (GLuint srcGpu, GLbitfield dstGpuMask, GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth);
+GLAPI void APIENTRY glMulticastBlitFramebufferNV (GLuint srcGpu, GLuint dstGpu, GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
+GLAPI void APIENTRY glMulticastFramebufferSampleLocationsfvNV (GLuint gpu, GLuint framebuffer, GLuint start, GLsizei count, const GLfloat *v);
+GLAPI void APIENTRY glMulticastBarrierNV (void);
+GLAPI void APIENTRY glMulticastWaitSyncNV (GLuint signalGpu, GLbitfield waitGpuMask);
+GLAPI void APIENTRY glMulticastGetQueryObjectivNV (GLuint gpu, GLuint id, GLenum pname, GLint *params);
+GLAPI void APIENTRY glMulticastGetQueryObjectuivNV (GLuint gpu, GLuint id, GLenum pname, GLuint *params);
+GLAPI void APIENTRY glMulticastGetQueryObjecti64vNV (GLuint gpu, GLuint id, GLenum pname, GLint64 *params);
+GLAPI void APIENTRY glMulticastGetQueryObjectui64vNV (GLuint gpu, GLuint id, GLenum pname, GLuint64 *params);
+#endif
+#endif /* GL_NV_gpu_multicast */
+
 #ifndef GL_NV_gpu_program4
 #define GL_NV_gpu_program4 1
 #define GL_MIN_PROGRAM_TEXEL_OFFSET_NV    0x8904
@@ -11198,6 +11474,7 @@
 #define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_NUM_VIEWS_OVR 0x9630
 #define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_BASE_VIEW_INDEX_OVR 0x9632
 #define GL_MAX_VIEWS_OVR                  0x9631
+#define GL_FRAMEBUFFER_INCOMPLETE_VIEW_TARGETS_OVR 0x9633
 typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews);
 #ifdef GL_GLEXT_PROTOTYPES
 GLAPI void APIENTRY glFramebufferTextureMultiviewOVR (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews);
diff --git a/include/GL/glxext.h b/include/GL/glxext.h
index dce0290..2a6291c 100644
--- a/include/GL/glxext.h
+++ b/include/GL/glxext.h
@@ -6,7 +6,7 @@
 #endif
 
 /*
-** Copyright (c) 2013-2016 The Khronos Group Inc.
+** Copyright (c) 2013-2017 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -31,12 +31,10 @@
 ** This header is generated from the Khronos OpenGL / OpenGL ES XML
 ** API Registry. The current version of the Registry, generator scripts
 ** used to make the header, and the header can be found at
-**   http://www.opengl.org/registry/
-**
-** Khronos $Revision: 32889 $ on $Date: 2016-05-31 07:09:51 -0400 (Tue, 31 May 2016) $
+**   https://github.com/KhronosGroup/OpenGL-Registry
 */
 
-#define GLX_GLXEXT_VERSION 20160531
+#define GLX_GLXEXT_VERSION 20170606
 
 /* Generated C header for:
  * API: glx
@@ -178,6 +176,11 @@
 #endif
 #endif /* GLX_ARB_create_context */
 
+#ifndef GLX_ARB_create_context_no_error
+#define GLX_ARB_create_context_no_error 1
+#define GLX_CONTEXT_OPENGL_NO_ERROR_ARB   0x31B3
+#endif /* GLX_ARB_create_context_no_error */
+
 #ifndef GLX_ARB_create_context_profile
 #define GLX_ARB_create_context_profile 1
 #define GLX_CONTEXT_CORE_PROFILE_BIT_ARB  0x00000001
diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index 86efd1b..a8f5af1 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -342,7 +342,7 @@
 #define __DRI2_FENCE "DRI2_Fence"
 #define __DRI2_FENCE_VERSION 2
 
-#define __DRI2_FENCE_TIMEOUT_INFINITE     0xffffffffffffffffllu
+#define __DRI2_FENCE_TIMEOUT_INFINITE     0xffffffffffffffffull
 
 #define __DRI2_FENCE_FLAG_FLUSH_COMMANDS  (1 << 0)
 
@@ -702,6 +702,7 @@
 #define __DRI_ATTRIB_BIND_TO_TEXTURE_TARGETS	46
 #define __DRI_ATTRIB_YINVERTED			47
 #define __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE	48
+#define __DRI_ATTRIB_MAX			(__DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE + 1)
 
 /* __DRI_ATTRIB_RENDER_TYPE */
 #define __DRI_ATTRIB_RGBA_BIT			0x01	
@@ -1049,6 +1050,12 @@
 #define __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS	0x00000004
 
 /**
+ * \requires __DRI2_NO_ERROR.
+ *
+ */
+#define __DRI_CTX_FLAG_NO_ERROR			0x00000008
+
+/**
  * \name Context reset strategies.
  */
 /*@{*/
@@ -1136,7 +1143,7 @@
  * extensions.
  */
 #define __DRI_IMAGE "DRI_IMAGE"
-#define __DRI_IMAGE_VERSION 14
+#define __DRI_IMAGE_VERSION 15
 
 /**
  * These formats correspond to the similarly named MESA_FORMAT_*
@@ -1210,6 +1217,7 @@
 #define __DRI_IMAGE_FOURCC_NV12		0x3231564e
 #define __DRI_IMAGE_FOURCC_NV16		0x3631564e
 #define __DRI_IMAGE_FOURCC_YUYV		0x56595559
+#define __DRI_IMAGE_FOURCC_UYVY		0x59565955
 
 #define __DRI_IMAGE_FOURCC_YVU410	0x39555659
 #define __DRI_IMAGE_FOURCC_YVU411	0x31315659
@@ -1223,7 +1231,7 @@
  * RGB and RGBA are may be usable directly as images but its still
  * recommended to call fromPlanar with plane == 0.
  *
- * Y_U_V, Y_UV and Y_XUXV all requires call to fromPlanar to create
+ * Y_U_V, Y_UV, Y_XUXV and Y_UXVX all require a call to fromPlanar to create
  * usable sub-images, sampling from images return raw YUV data and
  * color conversion needs to be done in the shader.
  *
@@ -1235,6 +1243,7 @@
 #define __DRI_IMAGE_COMPONENTS_Y_U_V	0x3003
 #define __DRI_IMAGE_COMPONENTS_Y_UV	0x3004
 #define __DRI_IMAGE_COMPONENTS_Y_XUXV	0x3005
+#define __DRI_IMAGE_COMPONENTS_Y_UXVX	0x3008
 #define __DRI_IMAGE_COMPONENTS_R	0x3006
 #define __DRI_IMAGE_COMPONENTS_RG	0x3007
 
@@ -1493,6 +1502,67 @@
                                            const uint64_t *modifiers,
                                            const unsigned int modifier_count,
                                            void *loaderPrivate);
+
+   /*
+    * Like createImageFromDmaBufs, but also takes a format modifier.
+    *
+    * For EGL_EXT_image_dma_buf_import_modifiers.
+    *
+    * \since 15
+    */
+   __DRIimage *(*createImageFromDmaBufs2)(__DRIscreen *screen,
+                                          int width, int height, int fourcc,
+                                          uint64_t modifier,
+                                          int *fds, int num_fds,
+                                          int *strides, int *offsets,
+                                          enum __DRIYUVColorSpace color_space,
+                                          enum __DRISampleRange sample_range,
+                                          enum __DRIChromaSiting horiz_siting,
+                                          enum __DRIChromaSiting vert_siting,
+                                          unsigned *error,
+                                          void *loaderPrivate);
+
+   /*
+    * dmabuf format query to support EGL_EXT_image_dma_buf_import_modifiers.
+    *
+    * \param max      Maximum number of formats that can be accommodated into
+    *                 \param formats. If zero, no formats are returned -
+    *                 instead, the driver returns the total number of
+    *                 supported dmabuf formats in \param count.
+    * \param formats  Buffer to fill formats into.
+    * \param count    Count of formats returned, or, total number of
+    *                 supported formats in case \param max is zero.
+    *
+    * Returns true on success.
+    *
+    * \since 15
+    */
+   GLboolean (*queryDmaBufFormats)(__DRIscreen *screen, int max,
+                                   int *formats, int *count);
+
+   /*
+    * dmabuf format modifier query for a given format to support
+    * EGL_EXT_image_dma_buf_import_modifiers.
+    *
+    * \param fourcc    The format to query modifiers for. If this format
+    *                  is not supported by the driver, return false.
+    * \param max       Maximum number of modifiers that can be accommodated in
+    *                  \param modifiers. If zero, no modifiers are returned -
+    *                  instead, the driver returns the total number of
+    *                  modifiers for \param fourcc in \param count.
+    * \param modifiers Buffer to fill modifiers into.
+    * \param count     Count of the modifiers returned, or, total number of
+    *                  supported modifiers for \param fourcc in case
+    *                  \param max is zero.
+    *
+    * Returns true upon success.
+    *
+    * \since 15
+    */
+   GLboolean (*queryDmaBufModifiers)(__DRIscreen *screen, int fourcc,
+                                     int max, uint64_t *modifiers,
+                                     unsigned int *external_only,
+                                     int *count);
 };
 
 
@@ -1548,6 +1618,19 @@
 };
 
 /**
+ * No-error context driver extension.
+ *
+ * Existence of this extension means the driver can accept the
+ * __DRI_CTX_FLAG_NO_ERROR flag.
+ */
+#define __DRI2_NO_ERROR "DRI_NoError"
+#define __DRI2_NO_ERROR_VERSION 1
+
+typedef struct __DRInoErrorExtensionRec {
+   __DRIextension base;
+} __DRInoErrorExtension;
+
+/**
  * DRI config options extension.
  *
  * This extension provides the XML string containing driver options for use by
@@ -1720,6 +1803,19 @@
     * operations (e.g. it should just set a thread-local variable).
     */
    void (*setBackgroundContext)(void *loaderPrivate);
+
+   /**
+    * Indicate that it is multithread safe to use glthread.  For GLX/EGL
+    * platforms using Xlib, that involves calling XInitThreads, before
+    * opening an X display.
+    *
+    * Note: only supported if extension version is at least 2.
+    *
+    * \param loaderPrivate is the value that was passed to the driver when
+    * the context was created.  This can be used by the loader to identify
+    * which context any callbacks are associated with.
+    */
+   GLboolean (*isThreadSafe)(void *loaderPrivate);
 };
 
 #endif
diff --git a/include/GL/wglext.h b/include/GL/wglext.h
index 8c39b96..77ec4a1 100644
--- a/include/GL/wglext.h
+++ b/include/GL/wglext.h
@@ -6,7 +6,7 @@
 #endif
 
 /*
-** Copyright (c) 2013-2016 The Khronos Group Inc.
+** Copyright (c) 2013-2017 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -31,9 +31,7 @@
 ** This header is generated from the Khronos OpenGL / OpenGL ES XML
 ** API Registry. The current version of the Registry, generator scripts
 ** used to make the header, and the header can be found at
-**   http://www.opengl.org/registry/
-**
-** Khronos $Revision: 32686 $ on $Date: 2016-04-19 21:08:44 -0400 (Tue, 19 Apr 2016) $
+**   https://github.com/KhronosGroup/OpenGL-Registry
 */
 
 #if defined(_WIN32) && !defined(APIENTRY) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__)
@@ -41,7 +39,7 @@
 #include <windows.h>
 #endif
 
-#define WGL_WGLEXT_VERSION 20160419
+#define WGL_WGLEXT_VERSION 20170606
 
 /* Generated C header for:
  * API: wgl
@@ -92,6 +90,11 @@
 #endif
 #endif /* WGL_ARB_create_context */
 
+#ifndef WGL_ARB_create_context_no_error
+#define WGL_ARB_create_context_no_error 1
+#define WGL_CONTEXT_OPENGL_NO_ERROR_ARB   0x31B3
+#endif /* WGL_ARB_create_context_no_error */
+
 #ifndef WGL_ARB_create_context_profile
 #define WGL_ARB_create_context_profile 1
 #define WGL_CONTEXT_PROFILE_MASK_ARB      0x9126
@@ -341,6 +344,13 @@
 #define WGL_TYPE_RGBA_FLOAT_ATI           0x21A0
 #endif /* WGL_ATI_pixel_format_float */
 
+#ifndef WGL_EXT_colorspace
+#define WGL_EXT_colorspace 1
+#define WGL_COLORSPACE_EXT                0x3087
+#define WGL_COLORSPACE_SRGB_EXT           0x3089
+#define WGL_COLORSPACE_LINEAR_EXT         0x308A
+#endif /* WGL_EXT_colorspace */
+
 #ifndef WGL_EXT_create_context_es2_profile
 #define WGL_EXT_create_context_es2_profile 1
 #define WGL_CONTEXT_ES2_PROFILE_BIT_EXT   0x00000004
diff --git a/include/GLES/egl.h b/include/GLES/egl.h
index 5778e00..86f644c 100644
--- a/include/GLES/egl.h
+++ b/include/GLES/egl.h
@@ -1,9 +1,23 @@
 /*
+** Copyright (c) 2008-2017 The Khronos Group Inc.
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+/*
  * Skeleton egl.h to provide compatibility for early GLES 1.0
  * applications. Several early implementations included gl.h
  * in egl.h leading applications to include only egl.h
- *
- * $Revision: 6252 $ on $Date:: 2008-08-06 16:35:08 -0700 #$
  */
 
 #ifndef __legacy_egl_h_
diff --git a/include/GLES/gl.h b/include/GLES/gl.h
index 5b8d85a..44dcddc 100644
--- a/include/GLES/gl.h
+++ b/include/GLES/gl.h
@@ -1,58 +1,90 @@
 #ifndef __gl_h_
-#define __gl_h_
-
-/* $Revision: 10601 $ on $Date:: 2010-03-04 22:15:27 -0800 #$ */
-
-#include <GLES/glplatform.h>
+#define __gl_h_ 1
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
- * This document is licensed under the SGI Free Software B License Version
- * 2.0. For details, see http://oss.sgi.com/projects/FreeB/ .
+** Copyright (c) 2013-2017 The Khronos Group Inc.
+**
+** Permission is hereby granted, free of charge, to any person obtaining a
+** copy of this software and/or associated documentation files (the
+** "Materials"), to deal in the Materials without restriction, including
+** without limitation the rights to use, copy, modify, merge, publish,
+** distribute, sublicense, and/or sell copies of the Materials, and to
+** permit persons to whom the Materials are furnished to do so, subject to
+** the following conditions:
+**
+** The above copyright notice and this permission notice shall be included
+** in all copies or substantial portions of the Materials.
+**
+** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+*/
+/*
+** This header is generated from the Khronos OpenGL / OpenGL ES XML
+** API Registry. The current version of the Registry, generator scripts
+** used to make the header, and the header can be found at
+**   https://github.com/KhronosGroup/OpenGL-Registry
+*/
+
+#include <GLES/glplatform.h>
+
+/* Generated on date 20170606 */
+
+/* Generated C header for:
+ * API: gles1
+ * Profile: common
+ * Versions considered: .*
+ * Versions emitted: .*
+ * Default extensions included: None
+ * Additional extensions included: ^(GL_OES_read_format|GL_OES_compressed_paletted_texture|GL_OES_point_size_array|GL_OES_point_sprite)$
+ * Extensions removed: _nomatch_^
  */
 
-typedef void             GLvoid;
-typedef char             GLchar;
-typedef unsigned int     GLenum;
-typedef unsigned char    GLboolean;
-typedef unsigned int     GLbitfield;
-typedef khronos_int8_t   GLbyte;
-typedef short            GLshort;
-typedef int              GLint;
-typedef int              GLsizei;
-typedef khronos_uint8_t  GLubyte;
-typedef unsigned short   GLushort;
-typedef unsigned int     GLuint;
-typedef khronos_float_t  GLfloat;
-typedef khronos_float_t  GLclampf;
-typedef khronos_int32_t  GLfixed;
-typedef khronos_int32_t  GLclampx;
+#ifndef GL_VERSION_ES_CM_1_0
+#define GL_VERSION_ES_CM_1_0 1
 
+/*
+ * XXX: Temporary fix; needs to be reverted as part of the next
+ * header update.
+ * For more details:
+ * https://github.com/KhronosGroup/OpenGL-Registry/pull/76
+ * https://lists.freedesktop.org/archives/mesa-dev/2017-June/161647.html
+ */
+#include <KHR/khrplatform.h>
+typedef khronos_int8_t GLbyte;
+typedef khronos_float_t GLclampf;
+typedef short GLshort;
+typedef unsigned short GLushort;
+
+typedef void GLvoid;
+typedef unsigned int GLenum;
+typedef khronos_float_t GLfloat;
+typedef khronos_int32_t GLfixed;
+typedef unsigned int GLuint;
+typedef khronos_ssize_t GLsizeiptr;
 typedef khronos_intptr_t GLintptr;
-typedef khronos_ssize_t  GLsizeiptr;
-
-
-/*************************************************************/
-
-/* OpenGL ES core versions */
-#define GL_VERSION_ES_CM_1_0          1
-#define GL_VERSION_ES_CL_1_0          1
-#define GL_VERSION_ES_CM_1_1          1
-#define GL_VERSION_ES_CL_1_1          1
-
-/* ClearBufferMask */
+typedef unsigned int GLbitfield;
+typedef int GLint;
+typedef khronos_uint8_t GLubyte;
+typedef unsigned char GLboolean;
+typedef int GLsizei;
+typedef khronos_int32_t GLclampx;
+#define GL_VERSION_ES_CL_1_0              1
+#define GL_VERSION_ES_CM_1_1              1
+#define GL_VERSION_ES_CL_1_1              1
 #define GL_DEPTH_BUFFER_BIT               0x00000100
 #define GL_STENCIL_BUFFER_BIT             0x00000400
 #define GL_COLOR_BUFFER_BIT               0x00004000
-
-/* Boolean */
 #define GL_FALSE                          0
 #define GL_TRUE                           1
-
-/* BeginMode */
 #define GL_POINTS                         0x0000
 #define GL_LINES                          0x0001
 #define GL_LINE_LOOP                      0x0002
@@ -60,8 +92,6 @@
 #define GL_TRIANGLES                      0x0004
 #define GL_TRIANGLE_STRIP                 0x0005
 #define GL_TRIANGLE_FAN                   0x0006
-
-/* AlphaFunction */
 #define GL_NEVER                          0x0200
 #define GL_LESS                           0x0201
 #define GL_EQUAL                          0x0202
@@ -70,8 +100,6 @@
 #define GL_NOTEQUAL                       0x0205
 #define GL_GEQUAL                         0x0206
 #define GL_ALWAYS                         0x0207
-
-/* BlendingFactorDest */
 #define GL_ZERO                           0
 #define GL_ONE                            1
 #define GL_SRC_COLOR                      0x0300
@@ -80,53 +108,18 @@
 #define GL_ONE_MINUS_SRC_ALPHA            0x0303
 #define GL_DST_ALPHA                      0x0304
 #define GL_ONE_MINUS_DST_ALPHA            0x0305
-
-/* BlendingFactorSrc */
-/*      GL_ZERO */
-/*      GL_ONE */
 #define GL_DST_COLOR                      0x0306
 #define GL_ONE_MINUS_DST_COLOR            0x0307
 #define GL_SRC_ALPHA_SATURATE             0x0308
-/*      GL_SRC_ALPHA */
-/*      GL_ONE_MINUS_SRC_ALPHA */
-/*      GL_DST_ALPHA */
-/*      GL_ONE_MINUS_DST_ALPHA */
-
-/* ClipPlaneName */
 #define GL_CLIP_PLANE0                    0x3000
 #define GL_CLIP_PLANE1                    0x3001
 #define GL_CLIP_PLANE2                    0x3002
 #define GL_CLIP_PLANE3                    0x3003
 #define GL_CLIP_PLANE4                    0x3004
 #define GL_CLIP_PLANE5                    0x3005
-
-/* ColorMaterialFace */
-/*      GL_FRONT_AND_BACK */
-
-/* ColorMaterialParameter */
-/*      GL_AMBIENT_AND_DIFFUSE */
-
-/* ColorPointerType */
-/*      GL_UNSIGNED_BYTE */
-/*      GL_FLOAT */
-/*      GL_FIXED */
-
-/* CullFaceMode */
 #define GL_FRONT                          0x0404
 #define GL_BACK                           0x0405
 #define GL_FRONT_AND_BACK                 0x0408
-
-/* DepthFunction */
-/*      GL_NEVER */
-/*      GL_LESS */
-/*      GL_EQUAL */
-/*      GL_LEQUAL */
-/*      GL_GREATER */
-/*      GL_NOTEQUAL */
-/*      GL_GEQUAL */
-/*      GL_ALWAYS */
-
-/* EnableCap */
 #define GL_FOG                            0x0B60
 #define GL_LIGHTING                       0x0B50
 #define GL_TEXTURE_2D                     0x0DE1
@@ -137,21 +130,12 @@
 #define GL_DITHER                         0x0BD0
 #define GL_STENCIL_TEST                   0x0B90
 #define GL_DEPTH_TEST                     0x0B71
-/*      GL_LIGHT0 */
-/*      GL_LIGHT1 */
-/*      GL_LIGHT2 */
-/*      GL_LIGHT3 */
-/*      GL_LIGHT4 */
-/*      GL_LIGHT5 */
-/*      GL_LIGHT6 */
-/*      GL_LIGHT7 */
 #define GL_POINT_SMOOTH                   0x0B10
 #define GL_LINE_SMOOTH                    0x0B20
 #define GL_SCISSOR_TEST                   0x0C11
 #define GL_COLOR_MATERIAL                 0x0B57
 #define GL_NORMALIZE                      0x0BA1
 #define GL_RESCALE_NORMAL                 0x803A
-#define GL_POLYGON_OFFSET_FILL            0x8037
 #define GL_VERTEX_ARRAY                   0x8074
 #define GL_NORMAL_ARRAY                   0x8075
 #define GL_COLOR_ARRAY                    0x8076
@@ -160,8 +144,6 @@
 #define GL_SAMPLE_ALPHA_TO_COVERAGE       0x809E
 #define GL_SAMPLE_ALPHA_TO_ONE            0x809F
 #define GL_SAMPLE_COVERAGE                0x80A0
-
-/* ErrorCode */
 #define GL_NO_ERROR                       0
 #define GL_INVALID_ENUM                   0x0500
 #define GL_INVALID_VALUE                  0x0501
@@ -169,24 +151,15 @@
 #define GL_STACK_OVERFLOW                 0x0503
 #define GL_STACK_UNDERFLOW                0x0504
 #define GL_OUT_OF_MEMORY                  0x0505
-
-/* FogMode */
-/*      GL_LINEAR */
 #define GL_EXP                            0x0800
 #define GL_EXP2                           0x0801
-
-/* FogParameter */
 #define GL_FOG_DENSITY                    0x0B62
 #define GL_FOG_START                      0x0B63
 #define GL_FOG_END                        0x0B64
 #define GL_FOG_MODE                       0x0B65
 #define GL_FOG_COLOR                      0x0B66
-
-/* FrontFaceDirection */
 #define GL_CW                             0x0900
 #define GL_CCW                            0x0901
-
-/* GetPName */
 #define GL_CURRENT_COLOR                  0x0B00
 #define GL_CURRENT_NORMAL                 0x0B02
 #define GL_CURRENT_TEXTURE_COORDS         0x0B03
@@ -229,11 +202,8 @@
 #define GL_BLEND_SRC                      0x0BE1
 #define GL_LOGIC_OP_MODE                  0x0BF0
 #define GL_SCISSOR_BOX                    0x0C10
-#define GL_SCISSOR_TEST                   0x0C11
 #define GL_COLOR_CLEAR_VALUE              0x0C22
 #define GL_COLOR_WRITEMASK                0x0C23
-#define GL_UNPACK_ALIGNMENT               0x0CF5
-#define GL_PACK_ALIGNMENT                 0x0D05
 #define GL_MAX_LIGHTS                     0x0D31
 #define GL_MAX_CLIP_PLANES                0x0D32
 #define GL_MAX_TEXTURE_SIZE               0x0D33
@@ -272,33 +242,18 @@
 #define GL_SAMPLES                        0x80A9
 #define GL_SAMPLE_COVERAGE_VALUE          0x80AA
 #define GL_SAMPLE_COVERAGE_INVERT         0x80AB
-
-/* GetTextureParameter */
-/*      GL_TEXTURE_MAG_FILTER */
-/*      GL_TEXTURE_MIN_FILTER */
-/*      GL_TEXTURE_WRAP_S */
-/*      GL_TEXTURE_WRAP_T */
-
 #define GL_NUM_COMPRESSED_TEXTURE_FORMATS 0x86A2
 #define GL_COMPRESSED_TEXTURE_FORMATS     0x86A3
-
-/* HintMode */
 #define GL_DONT_CARE                      0x1100
 #define GL_FASTEST                        0x1101
 #define GL_NICEST                         0x1102
-
-/* HintTarget */
 #define GL_PERSPECTIVE_CORRECTION_HINT    0x0C50
 #define GL_POINT_SMOOTH_HINT              0x0C51
 #define GL_LINE_SMOOTH_HINT               0x0C52
 #define GL_FOG_HINT                       0x0C54
 #define GL_GENERATE_MIPMAP_HINT           0x8192
-
-/* LightModelParameter */
 #define GL_LIGHT_MODEL_AMBIENT            0x0B53
 #define GL_LIGHT_MODEL_TWO_SIDE           0x0B52
-
-/* LightParameter */
 #define GL_AMBIENT                        0x1200
 #define GL_DIFFUSE                        0x1201
 #define GL_SPECULAR                       0x1202
@@ -309,16 +264,12 @@
 #define GL_CONSTANT_ATTENUATION           0x1207
 #define GL_LINEAR_ATTENUATION             0x1208
 #define GL_QUADRATIC_ATTENUATION          0x1209
-
-/* DataType */
 #define GL_BYTE                           0x1400
 #define GL_UNSIGNED_BYTE                  0x1401
 #define GL_SHORT                          0x1402
 #define GL_UNSIGNED_SHORT                 0x1403
 #define GL_FLOAT                          0x1406
 #define GL_FIXED                          0x140C
-
-/* LogicOp */
 #define GL_CLEAR                          0x1500
 #define GL_AND                            0x1501
 #define GL_AND_REVERSE                    0x1502
@@ -335,117 +286,49 @@
 #define GL_OR_INVERTED                    0x150D
 #define GL_NAND                           0x150E
 #define GL_SET                            0x150F
-
-/* MaterialFace */
-/*      GL_FRONT_AND_BACK */
-
-/* MaterialParameter */
 #define GL_EMISSION                       0x1600
 #define GL_SHININESS                      0x1601
 #define GL_AMBIENT_AND_DIFFUSE            0x1602
-/*      GL_AMBIENT */
-/*      GL_DIFFUSE */
-/*      GL_SPECULAR */
-
-/* MatrixMode */
 #define GL_MODELVIEW                      0x1700
 #define GL_PROJECTION                     0x1701
 #define GL_TEXTURE                        0x1702
-
-/* NormalPointerType */
-/*      GL_BYTE */
-/*      GL_SHORT */
-/*      GL_FLOAT */
-/*      GL_FIXED */
-
-/* PixelFormat */
 #define GL_ALPHA                          0x1906
 #define GL_RGB                            0x1907
 #define GL_RGBA                           0x1908
 #define GL_LUMINANCE                      0x1909
 #define GL_LUMINANCE_ALPHA                0x190A
-
-/* PixelStoreParameter */
 #define GL_UNPACK_ALIGNMENT               0x0CF5
 #define GL_PACK_ALIGNMENT                 0x0D05
-
-/* PixelType */
-/*      GL_UNSIGNED_BYTE */
 #define GL_UNSIGNED_SHORT_4_4_4_4         0x8033
 #define GL_UNSIGNED_SHORT_5_5_5_1         0x8034
 #define GL_UNSIGNED_SHORT_5_6_5           0x8363
-
-/* ShadingModel */
 #define GL_FLAT                           0x1D00
 #define GL_SMOOTH                         0x1D01
-
-/* StencilFunction */
-/*      GL_NEVER */
-/*      GL_LESS */
-/*      GL_EQUAL */
-/*      GL_LEQUAL */
-/*      GL_GREATER */
-/*      GL_NOTEQUAL */
-/*      GL_GEQUAL */
-/*      GL_ALWAYS */
-
-/* StencilOp */
-/*      GL_ZERO */
 #define GL_KEEP                           0x1E00
 #define GL_REPLACE                        0x1E01
 #define GL_INCR                           0x1E02
 #define GL_DECR                           0x1E03
-/*      GL_INVERT */
-
-/* StringName */
 #define GL_VENDOR                         0x1F00
 #define GL_RENDERER                       0x1F01
 #define GL_VERSION                        0x1F02
 #define GL_EXTENSIONS                     0x1F03
-
-/* TexCoordPointerType */
-/*      GL_SHORT */
-/*      GL_FLOAT */
-/*      GL_FIXED */
-/*      GL_BYTE */
-
-/* TextureEnvMode */
 #define GL_MODULATE                       0x2100
 #define GL_DECAL                          0x2101
-/*      GL_BLEND */
 #define GL_ADD                            0x0104
-/*      GL_REPLACE */
-
-/* TextureEnvParameter */
 #define GL_TEXTURE_ENV_MODE               0x2200
 #define GL_TEXTURE_ENV_COLOR              0x2201
-
-/* TextureEnvTarget */
 #define GL_TEXTURE_ENV                    0x2300
-
-/* TextureMagFilter */
 #define GL_NEAREST                        0x2600
 #define GL_LINEAR                         0x2601
-
-/* TextureMinFilter */
-/*      GL_NEAREST */
-/*      GL_LINEAR */
 #define GL_NEAREST_MIPMAP_NEAREST         0x2700
 #define GL_LINEAR_MIPMAP_NEAREST          0x2701
 #define GL_NEAREST_MIPMAP_LINEAR          0x2702
 #define GL_LINEAR_MIPMAP_LINEAR           0x2703
-
-/* TextureParameterName */
 #define GL_TEXTURE_MAG_FILTER             0x2800
 #define GL_TEXTURE_MIN_FILTER             0x2801
 #define GL_TEXTURE_WRAP_S                 0x2802
 #define GL_TEXTURE_WRAP_T                 0x2803
 #define GL_GENERATE_MIPMAP                0x8191
-
-/* TextureTarget */
-/*      GL_TEXTURE_2D */
-
-/* TextureUnit */
 #define GL_TEXTURE0                       0x84C0
 #define GL_TEXTURE1                       0x84C1
 #define GL_TEXTURE2                       0x84C2
@@ -480,18 +363,8 @@
 #define GL_TEXTURE31                      0x84DF
 #define GL_ACTIVE_TEXTURE                 0x84E0
 #define GL_CLIENT_ACTIVE_TEXTURE          0x84E1
-
-/* TextureWrapMode */
 #define GL_REPEAT                         0x2901
 #define GL_CLAMP_TO_EDGE                  0x812F
-
-/* VertexPointerType */
-/*      GL_SHORT */
-/*      GL_FLOAT */
-/*      GL_FIXED */
-/*      GL_BYTE */
-
-/* LightName */
 #define GL_LIGHT0                         0x4000
 #define GL_LIGHT1                         0x4001
 #define GL_LIGHT2                         0x4002
@@ -500,25 +373,18 @@
 #define GL_LIGHT5                         0x4005
 #define GL_LIGHT6                         0x4006
 #define GL_LIGHT7                         0x4007
-
-/* Buffer Objects */
 #define GL_ARRAY_BUFFER                   0x8892
 #define GL_ELEMENT_ARRAY_BUFFER           0x8893
-
-#define GL_ARRAY_BUFFER_BINDING               0x8894
-#define GL_ELEMENT_ARRAY_BUFFER_BINDING       0x8895
-#define GL_VERTEX_ARRAY_BUFFER_BINDING        0x8896
-#define GL_NORMAL_ARRAY_BUFFER_BINDING        0x8897
-#define GL_COLOR_ARRAY_BUFFER_BINDING         0x8898
+#define GL_ARRAY_BUFFER_BINDING           0x8894
+#define GL_ELEMENT_ARRAY_BUFFER_BINDING   0x8895
+#define GL_VERTEX_ARRAY_BUFFER_BINDING    0x8896
+#define GL_NORMAL_ARRAY_BUFFER_BINDING    0x8897
+#define GL_COLOR_ARRAY_BUFFER_BINDING     0x8898
 #define GL_TEXTURE_COORD_ARRAY_BUFFER_BINDING 0x889A
-
 #define GL_STATIC_DRAW                    0x88E4
 #define GL_DYNAMIC_DRAW                   0x88E8
-
 #define GL_BUFFER_SIZE                    0x8764
 #define GL_BUFFER_USAGE                   0x8765
-
-/* Texture combine + dot3 */
 #define GL_SUBTRACT                       0x84E7
 #define GL_COMBINE                        0x8570
 #define GL_COMBINE_RGB                    0x8571
@@ -535,75 +401,29 @@
 #define GL_OPERAND0_ALPHA                 0x8598
 #define GL_OPERAND1_ALPHA                 0x8599
 #define GL_OPERAND2_ALPHA                 0x859A
-
 #define GL_ALPHA_SCALE                    0x0D1C
-
 #define GL_SRC0_RGB                       0x8580
 #define GL_SRC1_RGB                       0x8581
 #define GL_SRC2_RGB                       0x8582
 #define GL_SRC0_ALPHA                     0x8588
 #define GL_SRC1_ALPHA                     0x8589
 #define GL_SRC2_ALPHA                     0x858A
-
 #define GL_DOT3_RGB                       0x86AE
 #define GL_DOT3_RGBA                      0x86AF
-
-/*------------------------------------------------------------------------*
- * required OES extension tokens
- *------------------------------------------------------------------------*/
-
-/* OES_read_format */
-#ifndef GL_OES_read_format
-#define GL_IMPLEMENTATION_COLOR_READ_TYPE_OES                   0x8B9A
-#define GL_IMPLEMENTATION_COLOR_READ_FORMAT_OES                 0x8B9B
-#endif
-
-/* GL_OES_compressed_paletted_texture */
-#ifndef GL_OES_compressed_paletted_texture
-#define GL_PALETTE4_RGB8_OES                                    0x8B90
-#define GL_PALETTE4_RGBA8_OES                                   0x8B91
-#define GL_PALETTE4_R5_G6_B5_OES                                0x8B92
-#define GL_PALETTE4_RGBA4_OES                                   0x8B93
-#define GL_PALETTE4_RGB5_A1_OES                                 0x8B94
-#define GL_PALETTE8_RGB8_OES                                    0x8B95
-#define GL_PALETTE8_RGBA8_OES                                   0x8B96
-#define GL_PALETTE8_R5_G6_B5_OES                                0x8B97
-#define GL_PALETTE8_RGBA4_OES                                   0x8B98
-#define GL_PALETTE8_RGB5_A1_OES                                 0x8B99
-#endif
-
-/* OES_point_size_array */
-#ifndef GL_OES_point_size_array
-#define GL_POINT_SIZE_ARRAY_OES                                 0x8B9C
-#define GL_POINT_SIZE_ARRAY_TYPE_OES                            0x898A
-#define GL_POINT_SIZE_ARRAY_STRIDE_OES                          0x898B
-#define GL_POINT_SIZE_ARRAY_POINTER_OES                         0x898C
-#define GL_POINT_SIZE_ARRAY_BUFFER_BINDING_OES                  0x8B9F
-#endif
-
-/* GL_OES_point_sprite */
-#ifndef GL_OES_point_sprite
-#define GL_POINT_SPRITE_OES                                     0x8861
-#define GL_COORD_REPLACE_OES                                    0x8862
-#endif
-
-/*************************************************************/
-
-/* Available only in Common profile */
-GL_API void GL_APIENTRY glAlphaFunc (GLenum func, GLclampf ref);
-GL_API void GL_APIENTRY glClearColor (GLclampf red, GLclampf green, GLclampf blue, GLclampf alpha);
-GL_API void GL_APIENTRY glClearDepthf (GLclampf depth);
-GL_API void GL_APIENTRY glClipPlanef (GLenum plane, const GLfloat *equation);
+GL_API void GL_APIENTRY glAlphaFunc (GLenum func, GLfloat ref);
+GL_API void GL_APIENTRY glClearColor (GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha);
+GL_API void GL_APIENTRY glClearDepthf (GLfloat d);
+GL_API void GL_APIENTRY glClipPlanef (GLenum p, const GLfloat *eqn);
 GL_API void GL_APIENTRY glColor4f (GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha);
-GL_API void GL_APIENTRY glDepthRangef (GLclampf zNear, GLclampf zFar);
+GL_API void GL_APIENTRY glDepthRangef (GLfloat n, GLfloat f);
 GL_API void GL_APIENTRY glFogf (GLenum pname, GLfloat param);
 GL_API void GL_APIENTRY glFogfv (GLenum pname, const GLfloat *params);
-GL_API void GL_APIENTRY glFrustumf (GLfloat left, GLfloat right, GLfloat bottom, GLfloat top, GLfloat zNear, GLfloat zFar);
-GL_API void GL_APIENTRY glGetClipPlanef (GLenum pname, GLfloat eqn[4]);
-GL_API void GL_APIENTRY glGetFloatv (GLenum pname, GLfloat *params);
+GL_API void GL_APIENTRY glFrustumf (GLfloat l, GLfloat r, GLfloat b, GLfloat t, GLfloat n, GLfloat f);
+GL_API void GL_APIENTRY glGetClipPlanef (GLenum plane, GLfloat *equation);
+GL_API void GL_APIENTRY glGetFloatv (GLenum pname, GLfloat *data);
 GL_API void GL_APIENTRY glGetLightfv (GLenum light, GLenum pname, GLfloat *params);
 GL_API void GL_APIENTRY glGetMaterialfv (GLenum face, GLenum pname, GLfloat *params);
-GL_API void GL_APIENTRY glGetTexEnvfv (GLenum env, GLenum pname, GLfloat *params);
+GL_API void GL_APIENTRY glGetTexEnvfv (GLenum target, GLenum pname, GLfloat *params);
 GL_API void GL_APIENTRY glGetTexParameterfv (GLenum target, GLenum pname, GLfloat *params);
 GL_API void GL_APIENTRY glLightModelf (GLenum pname, GLfloat param);
 GL_API void GL_APIENTRY glLightModelfv (GLenum pname, const GLfloat *params);
@@ -616,7 +436,7 @@
 GL_API void GL_APIENTRY glMultMatrixf (const GLfloat *m);
 GL_API void GL_APIENTRY glMultiTexCoord4f (GLenum target, GLfloat s, GLfloat t, GLfloat r, GLfloat q);
 GL_API void GL_APIENTRY glNormal3f (GLfloat nx, GLfloat ny, GLfloat nz);
-GL_API void GL_APIENTRY glOrthof (GLfloat left, GLfloat right, GLfloat bottom, GLfloat top, GLfloat zNear, GLfloat zFar);
+GL_API void GL_APIENTRY glOrthof (GLfloat l, GLfloat r, GLfloat b, GLfloat t, GLfloat n, GLfloat f);
 GL_API void GL_APIENTRY glPointParameterf (GLenum pname, GLfloat param);
 GL_API void GL_APIENTRY glPointParameterfv (GLenum pname, const GLfloat *params);
 GL_API void GL_APIENTRY glPointSize (GLfloat size);
@@ -628,27 +448,25 @@
 GL_API void GL_APIENTRY glTexParameterf (GLenum target, GLenum pname, GLfloat param);
 GL_API void GL_APIENTRY glTexParameterfv (GLenum target, GLenum pname, const GLfloat *params);
 GL_API void GL_APIENTRY glTranslatef (GLfloat x, GLfloat y, GLfloat z);
-
-/* Available in both Common and Common-Lite profiles */
 GL_API void GL_APIENTRY glActiveTexture (GLenum texture);
-GL_API void GL_APIENTRY glAlphaFuncx (GLenum func, GLclampx ref);
+GL_API void GL_APIENTRY glAlphaFuncx (GLenum func, GLfixed ref);
 GL_API void GL_APIENTRY glBindBuffer (GLenum target, GLuint buffer);
 GL_API void GL_APIENTRY glBindTexture (GLenum target, GLuint texture);
 GL_API void GL_APIENTRY glBlendFunc (GLenum sfactor, GLenum dfactor);
-GL_API void GL_APIENTRY glBufferData (GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage);
-GL_API void GL_APIENTRY glBufferSubData (GLenum target, GLintptr offset, GLsizeiptr size, const GLvoid *data);
+GL_API void GL_APIENTRY glBufferData (GLenum target, GLsizeiptr size, const void *data, GLenum usage);
+GL_API void GL_APIENTRY glBufferSubData (GLenum target, GLintptr offset, GLsizeiptr size, const void *data);
 GL_API void GL_APIENTRY glClear (GLbitfield mask);
-GL_API void GL_APIENTRY glClearColorx (GLclampx red, GLclampx green, GLclampx blue, GLclampx alpha);
-GL_API void GL_APIENTRY glClearDepthx (GLclampx depth);
+GL_API void GL_APIENTRY glClearColorx (GLfixed red, GLfixed green, GLfixed blue, GLfixed alpha);
+GL_API void GL_APIENTRY glClearDepthx (GLfixed depth);
 GL_API void GL_APIENTRY glClearStencil (GLint s);
 GL_API void GL_APIENTRY glClientActiveTexture (GLenum texture);
 GL_API void GL_APIENTRY glClipPlanex (GLenum plane, const GLfixed *equation);
 GL_API void GL_APIENTRY glColor4ub (GLubyte red, GLubyte green, GLubyte blue, GLubyte alpha);
 GL_API void GL_APIENTRY glColor4x (GLfixed red, GLfixed green, GLfixed blue, GLfixed alpha);
 GL_API void GL_APIENTRY glColorMask (GLboolean red, GLboolean green, GLboolean blue, GLboolean alpha);
-GL_API void GL_APIENTRY glColorPointer (GLint size, GLenum type, GLsizei stride, const GLvoid *pointer);
-GL_API void GL_APIENTRY glCompressedTexImage2D (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const GLvoid *data);
-GL_API void GL_APIENTRY glCompressedTexSubImage2D (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const GLvoid *data);
+GL_API void GL_APIENTRY glColorPointer (GLint size, GLenum type, GLsizei stride, const void *pointer);
+GL_API void GL_APIENTRY glCompressedTexImage2D (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const void *data);
+GL_API void GL_APIENTRY glCompressedTexSubImage2D (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const void *data);
 GL_API void GL_APIENTRY glCopyTexImage2D (GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border);
 GL_API void GL_APIENTRY glCopyTexSubImage2D (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height);
 GL_API void GL_APIENTRY glCullFace (GLenum mode);
@@ -656,33 +474,33 @@
 GL_API void GL_APIENTRY glDeleteTextures (GLsizei n, const GLuint *textures);
 GL_API void GL_APIENTRY glDepthFunc (GLenum func);
 GL_API void GL_APIENTRY glDepthMask (GLboolean flag);
-GL_API void GL_APIENTRY glDepthRangex (GLclampx zNear, GLclampx zFar);
+GL_API void GL_APIENTRY glDepthRangex (GLfixed n, GLfixed f);
 GL_API void GL_APIENTRY glDisable (GLenum cap);
 GL_API void GL_APIENTRY glDisableClientState (GLenum array);
 GL_API void GL_APIENTRY glDrawArrays (GLenum mode, GLint first, GLsizei count);
-GL_API void GL_APIENTRY glDrawElements (GLenum mode, GLsizei count, GLenum type, const GLvoid *indices);
+GL_API void GL_APIENTRY glDrawElements (GLenum mode, GLsizei count, GLenum type, const void *indices);
 GL_API void GL_APIENTRY glEnable (GLenum cap);
 GL_API void GL_APIENTRY glEnableClientState (GLenum array);
 GL_API void GL_APIENTRY glFinish (void);
 GL_API void GL_APIENTRY glFlush (void);
 GL_API void GL_APIENTRY glFogx (GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glFogxv (GLenum pname, const GLfixed *params);
+GL_API void GL_APIENTRY glFogxv (GLenum pname, const GLfixed *param);
 GL_API void GL_APIENTRY glFrontFace (GLenum mode);
-GL_API void GL_APIENTRY glFrustumx (GLfixed left, GLfixed right, GLfixed bottom, GLfixed top, GLfixed zNear, GLfixed zFar);
-GL_API void GL_APIENTRY glGetBooleanv (GLenum pname, GLboolean *params);
+GL_API void GL_APIENTRY glFrustumx (GLfixed l, GLfixed r, GLfixed b, GLfixed t, GLfixed n, GLfixed f);
+GL_API void GL_APIENTRY glGetBooleanv (GLenum pname, GLboolean *data);
 GL_API void GL_APIENTRY glGetBufferParameteriv (GLenum target, GLenum pname, GLint *params);
-GL_API void GL_APIENTRY glGetClipPlanex (GLenum pname, GLfixed eqn[4]);
+GL_API void GL_APIENTRY glGetClipPlanex (GLenum plane, GLfixed *equation);
 GL_API void GL_APIENTRY glGenBuffers (GLsizei n, GLuint *buffers);
 GL_API void GL_APIENTRY glGenTextures (GLsizei n, GLuint *textures);
 GL_API GLenum GL_APIENTRY glGetError (void);
 GL_API void GL_APIENTRY glGetFixedv (GLenum pname, GLfixed *params);
-GL_API void GL_APIENTRY glGetIntegerv (GLenum pname, GLint *params);
+GL_API void GL_APIENTRY glGetIntegerv (GLenum pname, GLint *data);
 GL_API void GL_APIENTRY glGetLightxv (GLenum light, GLenum pname, GLfixed *params);
 GL_API void GL_APIENTRY glGetMaterialxv (GLenum face, GLenum pname, GLfixed *params);
-GL_API void GL_APIENTRY glGetPointerv (GLenum pname, GLvoid **params);
-GL_API const GLubyte * GL_APIENTRY glGetString (GLenum name);
-GL_API void GL_APIENTRY glGetTexEnviv (GLenum env, GLenum pname, GLint *params);
-GL_API void GL_APIENTRY glGetTexEnvxv (GLenum env, GLenum pname, GLfixed *params);
+GL_API void GL_APIENTRY glGetPointerv (GLenum pname, void **params);
+GL_API const GLubyte *GL_APIENTRY glGetString (GLenum name);
+GL_API void GL_APIENTRY glGetTexEnviv (GLenum target, GLenum pname, GLint *params);
+GL_API void GL_APIENTRY glGetTexEnvxv (GLenum target, GLenum pname, GLfixed *params);
 GL_API void GL_APIENTRY glGetTexParameteriv (GLenum target, GLenum pname, GLint *params);
 GL_API void GL_APIENTRY glGetTexParameterxv (GLenum target, GLenum pname, GLfixed *params);
 GL_API void GL_APIENTRY glHint (GLenum target, GLenum mode);
@@ -690,7 +508,7 @@
 GL_API GLboolean GL_APIENTRY glIsEnabled (GLenum cap);
 GL_API GLboolean GL_APIENTRY glIsTexture (GLuint texture);
 GL_API void GL_APIENTRY glLightModelx (GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glLightModelxv (GLenum pname, const GLfixed *params);
+GL_API void GL_APIENTRY glLightModelxv (GLenum pname, const GLfixed *param);
 GL_API void GL_APIENTRY glLightx (GLenum light, GLenum pname, GLfixed param);
 GL_API void GL_APIENTRY glLightxv (GLenum light, GLenum pname, const GLfixed *params);
 GL_API void GL_APIENTRY glLineWidthx (GLfixed width);
@@ -698,13 +516,13 @@
 GL_API void GL_APIENTRY glLoadMatrixx (const GLfixed *m);
 GL_API void GL_APIENTRY glLogicOp (GLenum opcode);
 GL_API void GL_APIENTRY glMaterialx (GLenum face, GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glMaterialxv (GLenum face, GLenum pname, const GLfixed *params);
+GL_API void GL_APIENTRY glMaterialxv (GLenum face, GLenum pname, const GLfixed *param);
 GL_API void GL_APIENTRY glMatrixMode (GLenum mode);
 GL_API void GL_APIENTRY glMultMatrixx (const GLfixed *m);
-GL_API void GL_APIENTRY glMultiTexCoord4x (GLenum target, GLfixed s, GLfixed t, GLfixed r, GLfixed q);
+GL_API void GL_APIENTRY glMultiTexCoord4x (GLenum texture, GLfixed s, GLfixed t, GLfixed r, GLfixed q);
 GL_API void GL_APIENTRY glNormal3x (GLfixed nx, GLfixed ny, GLfixed nz);
-GL_API void GL_APIENTRY glNormalPointer (GLenum type, GLsizei stride, const GLvoid *pointer);
-GL_API void GL_APIENTRY glOrthox (GLfixed left, GLfixed right, GLfixed bottom, GLfixed top, GLfixed zNear, GLfixed zFar);
+GL_API void GL_APIENTRY glNormalPointer (GLenum type, GLsizei stride, const void *pointer);
+GL_API void GL_APIENTRY glOrthox (GLfixed l, GLfixed r, GLfixed b, GLfixed t, GLfixed n, GLfixed f);
 GL_API void GL_APIENTRY glPixelStorei (GLenum pname, GLint param);
 GL_API void GL_APIENTRY glPointParameterx (GLenum pname, GLfixed param);
 GL_API void GL_APIENTRY glPointParameterxv (GLenum pname, const GLfixed *params);
@@ -712,9 +530,9 @@
 GL_API void GL_APIENTRY glPolygonOffsetx (GLfixed factor, GLfixed units);
 GL_API void GL_APIENTRY glPopMatrix (void);
 GL_API void GL_APIENTRY glPushMatrix (void);
-GL_API void GL_APIENTRY glReadPixels (GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLvoid *pixels);
+GL_API void GL_APIENTRY glReadPixels (GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, void *pixels);
 GL_API void GL_APIENTRY glRotatex (GLfixed angle, GLfixed x, GLfixed y, GLfixed z);
-GL_API void GL_APIENTRY glSampleCoverage (GLclampf value, GLboolean invert);
+GL_API void GL_APIENTRY glSampleCoverage (GLfloat value, GLboolean invert);
 GL_API void GL_APIENTRY glSampleCoveragex (GLclampx value, GLboolean invert);
 GL_API void GL_APIENTRY glScalex (GLfixed x, GLfixed y, GLfixed z);
 GL_API void GL_APIENTRY glScissor (GLint x, GLint y, GLsizei width, GLsizei height);
@@ -722,49 +540,60 @@
 GL_API void GL_APIENTRY glStencilFunc (GLenum func, GLint ref, GLuint mask);
 GL_API void GL_APIENTRY glStencilMask (GLuint mask);
 GL_API void GL_APIENTRY glStencilOp (GLenum fail, GLenum zfail, GLenum zpass);
-GL_API void GL_APIENTRY glTexCoordPointer (GLint size, GLenum type, GLsizei stride, const GLvoid *pointer);
+GL_API void GL_APIENTRY glTexCoordPointer (GLint size, GLenum type, GLsizei stride, const void *pointer);
 GL_API void GL_APIENTRY glTexEnvi (GLenum target, GLenum pname, GLint param);
 GL_API void GL_APIENTRY glTexEnvx (GLenum target, GLenum pname, GLfixed param);
 GL_API void GL_APIENTRY glTexEnviv (GLenum target, GLenum pname, const GLint *params);
 GL_API void GL_APIENTRY glTexEnvxv (GLenum target, GLenum pname, const GLfixed *params);
-GL_API void GL_APIENTRY glTexImage2D (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels);
+GL_API void GL_APIENTRY glTexImage2D (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const void *pixels);
 GL_API void GL_APIENTRY glTexParameteri (GLenum target, GLenum pname, GLint param);
 GL_API void GL_APIENTRY glTexParameterx (GLenum target, GLenum pname, GLfixed param);
 GL_API void GL_APIENTRY glTexParameteriv (GLenum target, GLenum pname, const GLint *params);
 GL_API void GL_APIENTRY glTexParameterxv (GLenum target, GLenum pname, const GLfixed *params);
-GL_API void GL_APIENTRY glTexSubImage2D (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels);
+GL_API void GL_APIENTRY glTexSubImage2D (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pixels);
 GL_API void GL_APIENTRY glTranslatex (GLfixed x, GLfixed y, GLfixed z);
-GL_API void GL_APIENTRY glVertexPointer (GLint size, GLenum type, GLsizei stride, const GLvoid *pointer);
+GL_API void GL_APIENTRY glVertexPointer (GLint size, GLenum type, GLsizei stride, const void *pointer);
 GL_API void GL_APIENTRY glViewport (GLint x, GLint y, GLsizei width, GLsizei height);
+#endif /* GL_VERSION_ES_CM_1_0 */
 
-/*------------------------------------------------------------------------*
- * Required OES extension functions
- *------------------------------------------------------------------------*/
-
-/* GL_OES_read_format */
-#ifndef GL_OES_read_format
-#define GL_OES_read_format 1
-#endif
-
-/* GL_OES_compressed_paletted_texture */
 #ifndef GL_OES_compressed_paletted_texture
 #define GL_OES_compressed_paletted_texture 1
-#endif
+#define GL_PALETTE4_RGB8_OES              0x8B90
+#define GL_PALETTE4_RGBA8_OES             0x8B91
+#define GL_PALETTE4_R5_G6_B5_OES          0x8B92
+#define GL_PALETTE4_RGBA4_OES             0x8B93
+#define GL_PALETTE4_RGB5_A1_OES           0x8B94
+#define GL_PALETTE8_RGB8_OES              0x8B95
+#define GL_PALETTE8_RGBA8_OES             0x8B96
+#define GL_PALETTE8_R5_G6_B5_OES          0x8B97
+#define GL_PALETTE8_RGBA4_OES             0x8B98
+#define GL_PALETTE8_RGB5_A1_OES           0x8B99
+#endif /* GL_OES_compressed_paletted_texture */
 
-/* GL_OES_point_size_array */
 #ifndef GL_OES_point_size_array
 #define GL_OES_point_size_array 1
-GL_API void GL_APIENTRY glPointSizePointerOES (GLenum type, GLsizei stride, const GLvoid *pointer);
-#endif
+#define GL_POINT_SIZE_ARRAY_OES           0x8B9C
+#define GL_POINT_SIZE_ARRAY_TYPE_OES      0x898A
+#define GL_POINT_SIZE_ARRAY_STRIDE_OES    0x898B
+#define GL_POINT_SIZE_ARRAY_POINTER_OES   0x898C
+#define GL_POINT_SIZE_ARRAY_BUFFER_BINDING_OES 0x8B9F
+GL_API void GL_APIENTRY glPointSizePointerOES (GLenum type, GLsizei stride, const void *pointer);
+#endif /* GL_OES_point_size_array */
 
-/* GL_OES_point_sprite */
 #ifndef GL_OES_point_sprite
 #define GL_OES_point_sprite 1
-#endif
+#define GL_POINT_SPRITE_OES               0x8861
+#define GL_COORD_REPLACE_OES              0x8862
+#endif /* GL_OES_point_sprite */
+
+#ifndef GL_OES_read_format
+#define GL_OES_read_format 1
+#define GL_IMPLEMENTATION_COLOR_READ_TYPE_OES 0x8B9A
+#define GL_IMPLEMENTATION_COLOR_READ_FORMAT_OES 0x8B9B
+#endif /* GL_OES_read_format */
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif /* __gl_h_ */
-
+#endif
diff --git a/include/GLES/glext.h b/include/GLES/glext.h
index 67092fd..6ea91a1 100644
--- a/include/GLES/glext.h
+++ b/include/GLES/glext.h
@@ -1,613 +1,141 @@
 #ifndef __glext_h_
-#define __glext_h_
-
-/* $Revision: 20798 $ on $Date:: 2013-03-07 01:19:34 -0800 #$ */
+#define __glext_h_ 1
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
- * This document is licensed under the SGI Free Software B License Version
- * 2.0. For details, see http://oss.sgi.com/projects/FreeB/ .
- */
+** Copyright (c) 2013-2017 The Khronos Group Inc.
+**
+** Permission is hereby granted, free of charge, to any person obtaining a
+** copy of this software and/or associated documentation files (the
+** "Materials"), to deal in the Materials without restriction, including
+** without limitation the rights to use, copy, modify, merge, publish,
+** distribute, sublicense, and/or sell copies of the Materials, and to
+** permit persons to whom the Materials are furnished to do so, subject to
+** the following conditions:
+**
+** The above copyright notice and this permission notice shall be included
+** in all copies or substantial portions of the Materials.
+**
+** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+*/
+/*
+** This header is generated from the Khronos OpenGL / OpenGL ES XML
+** API Registry. The current version of the Registry, generator scripts
+** used to make the header, and the header can be found at
+**   https://github.com/KhronosGroup/OpenGL-Registry
+*/
 
 #ifndef GL_APIENTRYP
-#   define GL_APIENTRYP GL_APIENTRY*
+#define GL_APIENTRYP GL_APIENTRY*
 #endif
 
-/*------------------------------------------------------------------------*
- * OES extension tokens
- *------------------------------------------------------------------------*/
+/* Generated on date 20170606 */
 
-/* GL_OES_blend_equation_separate */
-#ifndef GL_OES_blend_equation_separate
-/* BLEND_EQUATION_RGB_OES same as BLEND_EQUATION_OES */
-#define GL_BLEND_EQUATION_RGB_OES                               0x8009
-#define GL_BLEND_EQUATION_ALPHA_OES                             0x883D
-#endif
-
-/* GL_OES_blend_func_separate */
-#ifndef GL_OES_blend_func_separate
-#define GL_BLEND_DST_RGB_OES                                    0x80C8
-#define GL_BLEND_SRC_RGB_OES                                    0x80C9
-#define GL_BLEND_DST_ALPHA_OES                                  0x80CA
-#define GL_BLEND_SRC_ALPHA_OES                                  0x80CB
-#endif
-
-/* GL_OES_blend_subtract */
-#ifndef GL_OES_blend_subtract
-#define GL_BLEND_EQUATION_OES                                   0x8009
-#define GL_FUNC_ADD_OES                                         0x8006
-#define GL_FUNC_SUBTRACT_OES                                    0x800A
-#define GL_FUNC_REVERSE_SUBTRACT_OES                            0x800B
-#endif
-
-/* GL_OES_compressed_ETC1_RGB8_texture */
-#ifndef GL_OES_compressed_ETC1_RGB8_texture
-#define GL_ETC1_RGB8_OES                                        0x8D64
-#endif
-
-/* GL_OES_depth24 */
-#ifndef GL_OES_depth24
-#define GL_DEPTH_COMPONENT24_OES                                0x81A6
-#endif
-
-/* GL_OES_depth32 */
-#ifndef GL_OES_depth32
-#define GL_DEPTH_COMPONENT32_OES                                0x81A7
-#endif
-
-/* GL_OES_draw_texture */
-#ifndef GL_OES_draw_texture
-#define GL_TEXTURE_CROP_RECT_OES                                0x8B9D
-#endif
-
-/* GL_OES_EGL_image */
-#ifndef GL_OES_EGL_image
-typedef void* GLeglImageOES;
-#endif
-
-/* GL_OES_EGL_image_external */
-#ifndef GL_OES_EGL_image_external
-/* GLeglImageOES defined in GL_OES_EGL_image already. */
-#define GL_TEXTURE_EXTERNAL_OES                                 0x8D65
-#define GL_TEXTURE_BINDING_EXTERNAL_OES                         0x8D67
-#define GL_REQUIRED_TEXTURE_IMAGE_UNITS_OES                     0x8D68
-#endif
-
-/* GL_OES_element_index_uint */
-#ifndef GL_OES_element_index_uint
-#define GL_UNSIGNED_INT                                         0x1405
-#endif
-
-/* GL_OES_fixed_point */
-#ifndef GL_OES_fixed_point
-#define GL_FIXED_OES                                            0x140C
-#endif
-
-/* GL_OES_framebuffer_object */
-#ifndef GL_OES_framebuffer_object
-#define GL_NONE_OES                                             0
-#define GL_FRAMEBUFFER_OES                                      0x8D40
-#define GL_RENDERBUFFER_OES                                     0x8D41
-#define GL_RGBA4_OES                                            0x8056
-#define GL_RGB5_A1_OES                                          0x8057
-#define GL_RGB565_OES                                           0x8D62
-#define GL_DEPTH_COMPONENT16_OES                                0x81A5
-#define GL_RENDERBUFFER_WIDTH_OES                               0x8D42
-#define GL_RENDERBUFFER_HEIGHT_OES                              0x8D43
-#define GL_RENDERBUFFER_INTERNAL_FORMAT_OES                     0x8D44
-#define GL_RENDERBUFFER_RED_SIZE_OES                            0x8D50
-#define GL_RENDERBUFFER_GREEN_SIZE_OES                          0x8D51
-#define GL_RENDERBUFFER_BLUE_SIZE_OES                           0x8D52
-#define GL_RENDERBUFFER_ALPHA_SIZE_OES                          0x8D53
-#define GL_RENDERBUFFER_DEPTH_SIZE_OES                          0x8D54
-#define GL_RENDERBUFFER_STENCIL_SIZE_OES                        0x8D55
-#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_OES               0x8CD0
-#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_OES               0x8CD1
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_OES             0x8CD2
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE_OES     0x8CD3
-#define GL_COLOR_ATTACHMENT0_OES                                0x8CE0
-#define GL_DEPTH_ATTACHMENT_OES                                 0x8D00
-#define GL_STENCIL_ATTACHMENT_OES                               0x8D20
-#define GL_FRAMEBUFFER_COMPLETE_OES                             0x8CD5
-#define GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT_OES                0x8CD6
-#define GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_OES        0x8CD7
-#define GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_OES                0x8CD9
-#define GL_FRAMEBUFFER_INCOMPLETE_FORMATS_OES                   0x8CDA
-#define GL_FRAMEBUFFER_UNSUPPORTED_OES                          0x8CDD
-#define GL_FRAMEBUFFER_BINDING_OES                              0x8CA6
-#define GL_RENDERBUFFER_BINDING_OES                             0x8CA7
-#define GL_MAX_RENDERBUFFER_SIZE_OES                            0x84E8
-#define GL_INVALID_FRAMEBUFFER_OPERATION_OES                    0x0506
-#endif
-
-/* GL_OES_mapbuffer */
-#ifndef GL_OES_mapbuffer
-#define GL_WRITE_ONLY_OES                                       0x88B9
-#define GL_BUFFER_ACCESS_OES                                    0x88BB
-#define GL_BUFFER_MAPPED_OES                                    0x88BC
-#define GL_BUFFER_MAP_POINTER_OES                               0x88BD
-#endif
-
-/* GL_OES_matrix_get */
-#ifndef GL_OES_matrix_get
-#define GL_MODELVIEW_MATRIX_FLOAT_AS_INT_BITS_OES               0x898D
-#define GL_PROJECTION_MATRIX_FLOAT_AS_INT_BITS_OES              0x898E
-#define GL_TEXTURE_MATRIX_FLOAT_AS_INT_BITS_OES                 0x898F
-#endif
-
-/* GL_OES_matrix_palette */
-#ifndef GL_OES_matrix_palette
-#define GL_MAX_VERTEX_UNITS_OES                                 0x86A4
-#define GL_MAX_PALETTE_MATRICES_OES                             0x8842
-#define GL_MATRIX_PALETTE_OES                                   0x8840
-#define GL_MATRIX_INDEX_ARRAY_OES                               0x8844
-#define GL_WEIGHT_ARRAY_OES                                     0x86AD
-#define GL_CURRENT_PALETTE_MATRIX_OES                           0x8843
-#define GL_MATRIX_INDEX_ARRAY_SIZE_OES                          0x8846
-#define GL_MATRIX_INDEX_ARRAY_TYPE_OES                          0x8847
-#define GL_MATRIX_INDEX_ARRAY_STRIDE_OES                        0x8848
-#define GL_MATRIX_INDEX_ARRAY_POINTER_OES                       0x8849
-#define GL_MATRIX_INDEX_ARRAY_BUFFER_BINDING_OES                0x8B9E
-#define GL_WEIGHT_ARRAY_SIZE_OES                                0x86AB
-#define GL_WEIGHT_ARRAY_TYPE_OES                                0x86A9
-#define GL_WEIGHT_ARRAY_STRIDE_OES                              0x86AA
-#define GL_WEIGHT_ARRAY_POINTER_OES                             0x86AC
-#define GL_WEIGHT_ARRAY_BUFFER_BINDING_OES                      0x889E
-#endif
-
-/* GL_OES_packed_depth_stencil */
-#ifndef GL_OES_packed_depth_stencil
-#define GL_DEPTH_STENCIL_OES                                    0x84F9
-#define GL_UNSIGNED_INT_24_8_OES                                0x84FA
-#define GL_DEPTH24_STENCIL8_OES                                 0x88F0
-#endif
-
-/* GL_OES_required_internalformat */
-/* No new tokens introduced by this extension. */
-
-/* GL_OES_rgb8_rgba8 */
-#ifndef GL_OES_rgb8_rgba8
-#define GL_RGB8_OES                                             0x8051
-#define GL_RGBA8_OES                                            0x8058
-#endif
-
-/* GL_OES_stencil1 */
-#ifndef GL_OES_stencil1
-#define GL_STENCIL_INDEX1_OES                                   0x8D46
-#endif
-
-/* GL_OES_stencil4 */
-#ifndef GL_OES_stencil4
-#define GL_STENCIL_INDEX4_OES                                   0x8D47
-#endif
-
-/* GL_OES_stencil8 */
-#ifndef GL_OES_stencil8
-#define GL_STENCIL_INDEX8_OES                                   0x8D48
-#endif
-
-/* GL_OES_stencil_wrap */
-#ifndef GL_OES_stencil_wrap
-#define GL_INCR_WRAP_OES                                        0x8507
-#define GL_DECR_WRAP_OES                                        0x8508
-#endif
-
-/* GL_OES_texture_cube_map */
-#ifndef GL_OES_texture_cube_map
-#define GL_NORMAL_MAP_OES                                       0x8511
-#define GL_REFLECTION_MAP_OES                                   0x8512
-#define GL_TEXTURE_CUBE_MAP_OES                                 0x8513
-#define GL_TEXTURE_BINDING_CUBE_MAP_OES                         0x8514
-#define GL_TEXTURE_CUBE_MAP_POSITIVE_X_OES                      0x8515
-#define GL_TEXTURE_CUBE_MAP_NEGATIVE_X_OES                      0x8516
-#define GL_TEXTURE_CUBE_MAP_POSITIVE_Y_OES                      0x8517
-#define GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_OES                      0x8518
-#define GL_TEXTURE_CUBE_MAP_POSITIVE_Z_OES                      0x8519
-#define GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_OES                      0x851A
-#define GL_MAX_CUBE_MAP_TEXTURE_SIZE_OES                        0x851C
-#define GL_TEXTURE_GEN_MODE_OES                                 0x2500
-#define GL_TEXTURE_GEN_STR_OES                                  0x8D60
-#endif
-
-/* GL_OES_texture_mirrored_repeat */
-#ifndef GL_OES_texture_mirrored_repeat
-#define GL_MIRRORED_REPEAT_OES                                  0x8370
-#endif
-
-/* GL_OES_vertex_array_object */
-#ifndef GL_OES_vertex_array_object
-#define GL_VERTEX_ARRAY_BINDING_OES                             0x85B5
-#endif
-
-/*------------------------------------------------------------------------*
- * AMD extension tokens
- *------------------------------------------------------------------------*/
-
-/* GL_AMD_compressed_3DC_texture */
-#ifndef GL_AMD_compressed_3DC_texture
-#define GL_3DC_X_AMD                                            0x87F9
-#define GL_3DC_XY_AMD                                           0x87FA
-#endif
-
-/* GL_AMD_compressed_ATC_texture */
-#ifndef GL_AMD_compressed_ATC_texture
-#define GL_ATC_RGB_AMD                                          0x8C92
-#define GL_ATC_RGBA_EXPLICIT_ALPHA_AMD                          0x8C93
-#define GL_ATC_RGBA_INTERPOLATED_ALPHA_AMD                      0x87EE
-#endif
-
-/*------------------------------------------------------------------------*
- * APPLE extension tokens
- *------------------------------------------------------------------------*/
-
-/* GL_APPLE_copy_texture_levels */
-/* No new tokens introduced by this extension. */
-
-/* GL_APPLE_framebuffer_multisample */
-#ifndef GL_APPLE_framebuffer_multisample
-#define GL_RENDERBUFFER_SAMPLES_APPLE                           0x8CAB
-#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_APPLE             0x8D56
-#define GL_MAX_SAMPLES_APPLE                                    0x8D57
-#define GL_READ_FRAMEBUFFER_APPLE                               0x8CA8
-#define GL_DRAW_FRAMEBUFFER_APPLE                               0x8CA9
-#define GL_DRAW_FRAMEBUFFER_BINDING_APPLE                       0x8CA6
-#define GL_READ_FRAMEBUFFER_BINDING_APPLE                       0x8CAA
-#endif
-
-/* GL_APPLE_sync */
-#ifndef GL_APPLE_sync
-
-/* These types are defined with reference to <inttypes.h>
- * in the Apple extension spec, but here we use the Khronos
- * portable types in khrplatform.h, and assume those types
- * are always defined.
- * If any other extensions using these types are defined,
- * the typedefs must move out of this block and be shared.
+/* Generated C header for:
+ * API: gles1
+ * Profile: common
+ * Versions considered: .*
+ * Versions emitted: _nomatch_^
+ * Default extensions included: gles1
+ * Additional extensions included: _nomatch_^
+ * Extensions removed: ^(GL_OES_read_format|GL_OES_compressed_paletted_texture|GL_OES_point_size_array|GL_OES_point_sprite)$
  */
-typedef khronos_int64_t GLint64;
-typedef khronos_uint64_t GLuint64;
-typedef struct __GLsync *GLsync;
 
-#define GL_SYNC_OBJECT_APPLE                                    0x8A53
-#define GL_MAX_SERVER_WAIT_TIMEOUT_APPLE                        0x9111
-#define GL_OBJECT_TYPE_APPLE                                    0x9112
-#define GL_SYNC_CONDITION_APPLE                                 0x9113
-#define GL_SYNC_STATUS_APPLE                                    0x9114
-#define GL_SYNC_FLAGS_APPLE                                     0x9115
-#define GL_SYNC_FENCE_APPLE                                     0x9116
-#define GL_SYNC_GPU_COMMANDS_COMPLETE_APPLE                     0x9117
-#define GL_UNSIGNALED_APPLE                                     0x9118
-#define GL_SIGNALED_APPLE                                       0x9119
-#define GL_ALREADY_SIGNALED_APPLE                               0x911A
-#define GL_TIMEOUT_EXPIRED_APPLE                                0x911B
-#define GL_CONDITION_SATISFIED_APPLE                            0x911C
-#define GL_WAIT_FAILED_APPLE                                    0x911D
-#define GL_SYNC_FLUSH_COMMANDS_BIT_APPLE                        0x00000001
-#define GL_TIMEOUT_IGNORED_APPLE                                0xFFFFFFFFFFFFFFFFull
+#ifndef GL_OES_EGL_image
+#define GL_OES_EGL_image 1
+typedef void *GLeglImageOES;
+typedef void (GL_APIENTRYP PFNGLEGLIMAGETARGETTEXTURE2DOESPROC) (GLenum target, GLeglImageOES image);
+typedef void (GL_APIENTRYP PFNGLEGLIMAGETARGETRENDERBUFFERSTORAGEOESPROC) (GLenum target, GLeglImageOES image);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void GL_APIENTRY glEGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image);
+GL_API void GL_APIENTRY glEGLImageTargetRenderbufferStorageOES (GLenum target, GLeglImageOES image);
 #endif
+#endif /* GL_OES_EGL_image */
 
-/* GL_APPLE_texture_2D_limited_npot */
-/* No new tokens introduced by this extension. */
+#ifndef GL_OES_EGL_image_external
+#define GL_OES_EGL_image_external 1
+#define GL_TEXTURE_EXTERNAL_OES           0x8D65
+#define GL_TEXTURE_BINDING_EXTERNAL_OES   0x8D67
+#define GL_REQUIRED_TEXTURE_IMAGE_UNITS_OES 0x8D68
+#endif /* GL_OES_EGL_image_external */
 
-/* GL_APPLE_texture_format_BGRA8888 */
-#ifndef GL_APPLE_texture_format_BGRA8888
-#define GL_BGRA_EXT                                             0x80E1
-#endif
-
-/* GL_APPLE_texture_max_level */
-#ifndef GL_APPLE_texture_max_level
-#define GL_TEXTURE_MAX_LEVEL_APPLE                              0x813D
-#endif
-
-/*------------------------------------------------------------------------*
- * ARM extension tokens
- *------------------------------------------------------------------------*/
-
-/* GL_ARM_rgba8 */
-/* No new tokens introduced by this extension. */
-
-/*------------------------------------------------------------------------*
- * EXT extension tokens
- *------------------------------------------------------------------------*/
-
-/* GL_EXT_blend_minmax */
-#ifndef GL_EXT_blend_minmax
-#define GL_MIN_EXT                                              0x8007
-#define GL_MAX_EXT                                              0x8008
-#endif
-
-/* GL_EXT_discard_framebuffer */
-#ifndef GL_EXT_discard_framebuffer
-#define GL_COLOR_EXT                                            0x1800
-#define GL_DEPTH_EXT                                            0x1801
-#define GL_STENCIL_EXT                                          0x1802
-#endif
-
-/* GL_EXT_map_buffer_range */
-#ifndef GL_EXT_map_buffer_range
-#define GL_MAP_READ_BIT_EXT                                     0x0001
-#define GL_MAP_WRITE_BIT_EXT                                    0x0002
-#define GL_MAP_INVALIDATE_RANGE_BIT_EXT                         0x0004
-#define GL_MAP_INVALIDATE_BUFFER_BIT_EXT                        0x0008
-#define GL_MAP_FLUSH_EXPLICIT_BIT_EXT                           0x0010
-#define GL_MAP_UNSYNCHRONIZED_BIT_EXT                           0x0020
-#endif
-
-/* GL_EXT_multisampled_render_to_texture */
-#ifndef GL_EXT_multisampled_render_to_texture
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_SAMPLES_EXT           0x8D6C
-/* reuse values from GL_EXT_framebuffer_multisample (desktop extension) */
-#define GL_RENDERBUFFER_SAMPLES_EXT                             0x8CAB
-#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_EXT               0x8D56
-#define GL_MAX_SAMPLES_EXT                                      0x8D57
-#endif
-
-/* GL_EXT_multi_draw_arrays */
-/* No new tokens introduced by this extension. */
-
-/* GL_EXT_read_format_bgra */
-#ifndef GL_EXT_read_format_bgra
-#define GL_BGRA_EXT                                             0x80E1
-#define GL_UNSIGNED_SHORT_4_4_4_4_REV_EXT                       0x8365
-#define GL_UNSIGNED_SHORT_1_5_5_5_REV_EXT                       0x8366
-#endif
-
-/* GL_EXT_robustness */
-#ifndef GL_EXT_robustness
-/* reuse GL_NO_ERROR */
-#define GL_GUILTY_CONTEXT_RESET_EXT                             0x8253
-#define GL_INNOCENT_CONTEXT_RESET_EXT                           0x8254
-#define GL_UNKNOWN_CONTEXT_RESET_EXT                            0x8255
-#define GL_CONTEXT_ROBUST_ACCESS_EXT                            0x90F3
-#define GL_RESET_NOTIFICATION_STRATEGY_EXT                      0x8256
-#define GL_LOSE_CONTEXT_ON_RESET_EXT                            0x8252
-#define GL_NO_RESET_NOTIFICATION_EXT                            0x8261
-#endif
-
-/* GL_EXT_sRGB */
-#ifndef GL_EXT_sRGB
-#define GL_SRGB_EXT                                             0x8C40
-#define GL_SRGB_ALPHA_EXT                                       0x8C42
-#define GL_SRGB8_ALPHA8_EXT                                     0x8C43
-#define GL_FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING_EXT            0x8210
-#endif
-
-/* GL_EXT_texture_compression_dxt1 */
-#ifndef GL_EXT_texture_compression_dxt1
-#define GL_COMPRESSED_RGB_S3TC_DXT1_EXT                         0x83F0
-#define GL_COMPRESSED_RGBA_S3TC_DXT1_EXT                        0x83F1
-#endif
-
-/* GL_EXT_texture_filter_anisotropic */
-#ifndef GL_EXT_texture_filter_anisotropic
-#define GL_TEXTURE_MAX_ANISOTROPY_EXT                           0x84FE
-#define GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT                       0x84FF
-#endif
-
-/* GL_EXT_texture_format_BGRA8888 */
-#ifndef GL_EXT_texture_format_BGRA8888
-#define GL_BGRA_EXT                                             0x80E1
-#endif
-
-/* GL_EXT_texture_lod_bias */
-#ifndef GL_EXT_texture_lod_bias
-#define GL_MAX_TEXTURE_LOD_BIAS_EXT                             0x84FD
-#define GL_TEXTURE_FILTER_CONTROL_EXT                           0x8500
-#define GL_TEXTURE_LOD_BIAS_EXT                                 0x8501
-#endif
-
-/* GL_EXT_texture_storage */
-#ifndef GL_EXT_texture_storage
-#define GL_TEXTURE_IMMUTABLE_FORMAT_EXT                         0x912F
-#define GL_ALPHA8_EXT                                           0x803C
-#define GL_LUMINANCE8_EXT                                       0x8040
-#define GL_LUMINANCE8_ALPHA8_EXT                                0x8045
-#define GL_RGBA32F_EXT                                          0x8814
-#define GL_RGB32F_EXT                                           0x8815
-#define GL_ALPHA32F_EXT                                         0x8816
-#define GL_LUMINANCE32F_EXT                                     0x8818
-#define GL_LUMINANCE_ALPHA32F_EXT                               0x8819
-/* reuse GL_RGBA16F_EXT */
-#define GL_RGB16F_EXT                                           0x881B
-#define GL_ALPHA16F_EXT                                         0x881C
-#define GL_LUMINANCE16F_EXT                                     0x881E
-#define GL_LUMINANCE_ALPHA16F_EXT                               0x881F
-#define GL_RGB10_A2_EXT                                         0x8059
-#define GL_RGB10_EXT                                            0x8052
-#define GL_BGRA8_EXT                                            0x93A1
-#endif
-
-/*------------------------------------------------------------------------*
- * IMG extension tokens
- *------------------------------------------------------------------------*/
-
-/* GL_IMG_read_format */
-#ifndef GL_IMG_read_format
-#define GL_BGRA_IMG                                             0x80E1
-#define GL_UNSIGNED_SHORT_4_4_4_4_REV_IMG                       0x8365
-#endif
-
-/* GL_IMG_texture_compression_pvrtc */
-#ifndef GL_IMG_texture_compression_pvrtc
-#define GL_COMPRESSED_RGB_PVRTC_4BPPV1_IMG                      0x8C00
-#define GL_COMPRESSED_RGB_PVRTC_2BPPV1_IMG                      0x8C01
-#define GL_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG                     0x8C02
-#define GL_COMPRESSED_RGBA_PVRTC_2BPPV1_IMG                     0x8C03
-#endif
-
-/* GL_IMG_texture_env_enhanced_fixed_function */
-#ifndef GL_IMG_texture_env_enhanced_fixed_function
-#define GL_MODULATE_COLOR_IMG                                   0x8C04
-#define GL_RECIP_ADD_SIGNED_ALPHA_IMG                           0x8C05
-#define GL_TEXTURE_ALPHA_MODULATE_IMG                           0x8C06
-#define GL_FACTOR_ALPHA_MODULATE_IMG                            0x8C07
-#define GL_FRAGMENT_ALPHA_MODULATE_IMG                          0x8C08
-#define GL_ADD_BLEND_IMG                                        0x8C09
-#define GL_DOT3_RGBA_IMG                                        0x86AF
-#endif
-
-/* GL_IMG_user_clip_plane */
-#ifndef GL_IMG_user_clip_plane
-#define GL_CLIP_PLANE0_IMG                                      0x3000
-#define GL_CLIP_PLANE1_IMG                                      0x3001
-#define GL_CLIP_PLANE2_IMG                                      0x3002
-#define GL_CLIP_PLANE3_IMG                                      0x3003
-#define GL_CLIP_PLANE4_IMG                                      0x3004
-#define GL_CLIP_PLANE5_IMG                                      0x3005
-#define GL_MAX_CLIP_PLANES_IMG                                  0x0D32
-#endif
-
-/* GL_IMG_multisampled_render_to_texture */
-#ifndef GL_IMG_multisampled_render_to_texture
-#define GL_RENDERBUFFER_SAMPLES_IMG                             0x9133
-#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_IMG               0x9134
-#define GL_MAX_SAMPLES_IMG                                      0x9135
-#define GL_TEXTURE_SAMPLES_IMG                                  0x9136
-#endif
-
-/*------------------------------------------------------------------------*
- * NV extension tokens
- *------------------------------------------------------------------------*/
-
-/* GL_NV_fence */
-#ifndef GL_NV_fence
-#define GL_ALL_COMPLETED_NV                                     0x84F2
-#define GL_FENCE_STATUS_NV                                      0x84F3
-#define GL_FENCE_CONDITION_NV                                   0x84F4
-#endif
-
-/*------------------------------------------------------------------------*
- * QCOM extension tokens
- *------------------------------------------------------------------------*/
-
-/* GL_QCOM_driver_control */
-/* No new tokens introduced by this extension. */
-
-/* GL_QCOM_extended_get */
-#ifndef GL_QCOM_extended_get
-#define GL_TEXTURE_WIDTH_QCOM                                   0x8BD2
-#define GL_TEXTURE_HEIGHT_QCOM                                  0x8BD3
-#define GL_TEXTURE_DEPTH_QCOM                                   0x8BD4
-#define GL_TEXTURE_INTERNAL_FORMAT_QCOM                         0x8BD5
-#define GL_TEXTURE_FORMAT_QCOM                                  0x8BD6
-#define GL_TEXTURE_TYPE_QCOM                                    0x8BD7
-#define GL_TEXTURE_IMAGE_VALID_QCOM                             0x8BD8
-#define GL_TEXTURE_NUM_LEVELS_QCOM                              0x8BD9
-#define GL_TEXTURE_TARGET_QCOM                                  0x8BDA
-#define GL_TEXTURE_OBJECT_VALID_QCOM                            0x8BDB
-#define GL_STATE_RESTORE                                        0x8BDC
-#endif
-
-/* GL_QCOM_extended_get2 */
-/* No new tokens introduced by this extension. */
-
-/* GL_QCOM_perfmon_global_mode */
-#ifndef GL_QCOM_perfmon_global_mode
-#define GL_PERFMON_GLOBAL_MODE_QCOM                             0x8FA0
-#endif
-
-/* GL_QCOM_writeonly_rendering */
-#ifndef GL_QCOM_writeonly_rendering
-#define GL_WRITEONLY_RENDERING_QCOM                             0x8823
-#endif
-
-/* GL_QCOM_tiled_rendering */
-#ifndef GL_QCOM_tiled_rendering
-#define GL_COLOR_BUFFER_BIT0_QCOM                               0x00000001
-#define GL_COLOR_BUFFER_BIT1_QCOM                               0x00000002
-#define GL_COLOR_BUFFER_BIT2_QCOM                               0x00000004
-#define GL_COLOR_BUFFER_BIT3_QCOM                               0x00000008
-#define GL_COLOR_BUFFER_BIT4_QCOM                               0x00000010
-#define GL_COLOR_BUFFER_BIT5_QCOM                               0x00000020
-#define GL_COLOR_BUFFER_BIT6_QCOM                               0x00000040
-#define GL_COLOR_BUFFER_BIT7_QCOM                               0x00000080
-#define GL_DEPTH_BUFFER_BIT0_QCOM                               0x00000100
-#define GL_DEPTH_BUFFER_BIT1_QCOM                               0x00000200
-#define GL_DEPTH_BUFFER_BIT2_QCOM                               0x00000400
-#define GL_DEPTH_BUFFER_BIT3_QCOM                               0x00000800
-#define GL_DEPTH_BUFFER_BIT4_QCOM                               0x00001000
-#define GL_DEPTH_BUFFER_BIT5_QCOM                               0x00002000
-#define GL_DEPTH_BUFFER_BIT6_QCOM                               0x00004000
-#define GL_DEPTH_BUFFER_BIT7_QCOM                               0x00008000
-#define GL_STENCIL_BUFFER_BIT0_QCOM                             0x00010000
-#define GL_STENCIL_BUFFER_BIT1_QCOM                             0x00020000
-#define GL_STENCIL_BUFFER_BIT2_QCOM                             0x00040000
-#define GL_STENCIL_BUFFER_BIT3_QCOM                             0x00080000
-#define GL_STENCIL_BUFFER_BIT4_QCOM                             0x00100000
-#define GL_STENCIL_BUFFER_BIT5_QCOM                             0x00200000
-#define GL_STENCIL_BUFFER_BIT6_QCOM                             0x00400000
-#define GL_STENCIL_BUFFER_BIT7_QCOM                             0x00800000
-#define GL_MULTISAMPLE_BUFFER_BIT0_QCOM                         0x01000000
-#define GL_MULTISAMPLE_BUFFER_BIT1_QCOM                         0x02000000
-#define GL_MULTISAMPLE_BUFFER_BIT2_QCOM                         0x04000000
-#define GL_MULTISAMPLE_BUFFER_BIT3_QCOM                         0x08000000
-#define GL_MULTISAMPLE_BUFFER_BIT4_QCOM                         0x10000000
-#define GL_MULTISAMPLE_BUFFER_BIT5_QCOM                         0x20000000
-#define GL_MULTISAMPLE_BUFFER_BIT6_QCOM                         0x40000000
-#define GL_MULTISAMPLE_BUFFER_BIT7_QCOM                         0x80000000
-#endif
-
-/*------------------------------------------------------------------------*
- * End of extension tokens, start of corresponding extension functions
- *------------------------------------------------------------------------*/
-
-/*------------------------------------------------------------------------*
- * OES extension functions
- *------------------------------------------------------------------------*/
-
-/* GL_OES_blend_equation_separate */
 #ifndef GL_OES_blend_equation_separate
 #define GL_OES_blend_equation_separate 1
+#define GL_BLEND_EQUATION_RGB_OES         0x8009
+#define GL_BLEND_EQUATION_ALPHA_OES       0x883D
+typedef void (GL_APIENTRYP PFNGLBLENDEQUATIONSEPARATEOESPROC) (GLenum modeRGB, GLenum modeAlpha);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glBlendEquationSeparateOES (GLenum modeRGB, GLenum modeAlpha);
 #endif
-typedef void (GL_APIENTRYP PFNGLBLENDEQUATIONSEPARATEOESPROC) (GLenum modeRGB, GLenum modeAlpha);
-#endif
+#endif /* GL_OES_blend_equation_separate */
 
-/* GL_OES_blend_func_separate */
 #ifndef GL_OES_blend_func_separate
 #define GL_OES_blend_func_separate 1
+#define GL_BLEND_DST_RGB_OES              0x80C8
+#define GL_BLEND_SRC_RGB_OES              0x80C9
+#define GL_BLEND_DST_ALPHA_OES            0x80CA
+#define GL_BLEND_SRC_ALPHA_OES            0x80CB
+typedef void (GL_APIENTRYP PFNGLBLENDFUNCSEPARATEOESPROC) (GLenum srcRGB, GLenum dstRGB, GLenum srcAlpha, GLenum dstAlpha);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glBlendFuncSeparateOES (GLenum srcRGB, GLenum dstRGB, GLenum srcAlpha, GLenum dstAlpha);
 #endif
-typedef void (GL_APIENTRYP PFNGLBLENDFUNCSEPARATEOESPROC) (GLenum srcRGB, GLenum dstRGB, GLenum srcAlpha, GLenum dstAlpha);
-#endif
+#endif /* GL_OES_blend_func_separate */
 
-/* GL_OES_blend_subtract */
 #ifndef GL_OES_blend_subtract
 #define GL_OES_blend_subtract 1
+#define GL_BLEND_EQUATION_OES             0x8009
+#define GL_FUNC_ADD_OES                   0x8006
+#define GL_FUNC_SUBTRACT_OES              0x800A
+#define GL_FUNC_REVERSE_SUBTRACT_OES      0x800B
+typedef void (GL_APIENTRYP PFNGLBLENDEQUATIONOESPROC) (GLenum mode);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glBlendEquationOES (GLenum mode);
 #endif
-typedef void (GL_APIENTRYP PFNGLBLENDEQUATIONOESPROC) (GLenum mode);
-#endif
+#endif /* GL_OES_blend_subtract */
 
-/* GL_OES_byte_coordinates */
 #ifndef GL_OES_byte_coordinates
 #define GL_OES_byte_coordinates 1
-#endif
+#endif /* GL_OES_byte_coordinates */
 
-/* GL_OES_compressed_ETC1_RGB8_texture */
+#ifndef GL_OES_compressed_ETC1_RGB8_sub_texture
+#define GL_OES_compressed_ETC1_RGB8_sub_texture 1
+#endif /* GL_OES_compressed_ETC1_RGB8_sub_texture */
+
 #ifndef GL_OES_compressed_ETC1_RGB8_texture
 #define GL_OES_compressed_ETC1_RGB8_texture 1
-#endif
+#define GL_ETC1_RGB8_OES                  0x8D64
+#endif /* GL_OES_compressed_ETC1_RGB8_texture */
 
-/* GL_OES_depth24 */
 #ifndef GL_OES_depth24
 #define GL_OES_depth24 1
-#endif
+#define GL_DEPTH_COMPONENT24_OES          0x81A6
+#endif /* GL_OES_depth24 */
 
-/* GL_OES_depth32 */
 #ifndef GL_OES_depth32
 #define GL_OES_depth32 1
-#endif
+#define GL_DEPTH_COMPONENT32_OES          0x81A7
+#endif /* GL_OES_depth32 */
 
-/* GL_OES_draw_texture */
 #ifndef GL_OES_draw_texture
 #define GL_OES_draw_texture 1
+#define GL_TEXTURE_CROP_RECT_OES          0x8B9D
+typedef void (GL_APIENTRYP PFNGLDRAWTEXSOESPROC) (GLshort x, GLshort y, GLshort z, GLshort width, GLshort height);
+typedef void (GL_APIENTRYP PFNGLDRAWTEXIOESPROC) (GLint x, GLint y, GLint z, GLint width, GLint height);
+typedef void (GL_APIENTRYP PFNGLDRAWTEXXOESPROC) (GLfixed x, GLfixed y, GLfixed z, GLfixed width, GLfixed height);
+typedef void (GL_APIENTRYP PFNGLDRAWTEXSVOESPROC) (const GLshort *coords);
+typedef void (GL_APIENTRYP PFNGLDRAWTEXIVOESPROC) (const GLint *coords);
+typedef void (GL_APIENTRYP PFNGLDRAWTEXXVOESPROC) (const GLfixed *coords);
+typedef void (GL_APIENTRYP PFNGLDRAWTEXFOESPROC) (GLfloat x, GLfloat y, GLfloat z, GLfloat width, GLfloat height);
+typedef void (GL_APIENTRYP PFNGLDRAWTEXFVOESPROC) (const GLfloat *coords);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glDrawTexsOES (GLshort x, GLshort y, GLshort z, GLshort width, GLshort height);
 GL_API void GL_APIENTRY glDrawTexiOES (GLint x, GLint y, GLint z, GLint width, GLint height);
@@ -618,357 +146,425 @@
 GL_API void GL_APIENTRY glDrawTexfOES (GLfloat x, GLfloat y, GLfloat z, GLfloat width, GLfloat height);
 GL_API void GL_APIENTRY glDrawTexfvOES (const GLfloat *coords);
 #endif
-typedef void (GL_APIENTRYP PFNGLDRAWTEXSOESPROC) (GLshort x, GLshort y, GLshort z, GLshort width, GLshort height);
-typedef void (GL_APIENTRYP PFNGLDRAWTEXIOESPROC) (GLint x, GLint y, GLint z, GLint width, GLint height);
-typedef void (GL_APIENTRYP PFNGLDRAWTEXXOESPROC) (GLfixed x, GLfixed y, GLfixed z, GLfixed width, GLfixed height);
-typedef void (GL_APIENTRYP PFNGLDRAWTEXSVOESPROC) (const GLshort *coords);
-typedef void (GL_APIENTRYP PFNGLDRAWTEXIVOESPROC) (const GLint *coords);
-typedef void (GL_APIENTRYP PFNGLDRAWTEXXVOESPROC) (const GLfixed *coords);
-typedef void (GL_APIENTRYP PFNGLDRAWTEXFOESPROC) (GLfloat x, GLfloat y, GLfloat z, GLfloat width, GLfloat height);
-typedef void (GL_APIENTRYP PFNGLDRAWTEXFVOESPROC) (const GLfloat *coords);
-#endif
+#endif /* GL_OES_draw_texture */
 
-/* GL_OES_EGL_image */
-#ifndef GL_OES_EGL_image
-#define GL_OES_EGL_image 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY glEGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image);
-GL_API void GL_APIENTRY glEGLImageTargetRenderbufferStorageOES (GLenum target, GLeglImageOES image);
-#endif
-typedef void (GL_APIENTRYP PFNGLEGLIMAGETARGETTEXTURE2DOESPROC) (GLenum target, GLeglImageOES image);
-typedef void (GL_APIENTRYP PFNGLEGLIMAGETARGETRENDERBUFFERSTORAGEOESPROC) (GLenum target, GLeglImageOES image);
-#endif
-
-/* GL_OES_EGL_image_external */
-#ifndef GL_OES_EGL_image_external
-#define GL_OES_EGL_image_external 1
-/* glEGLImageTargetTexture2DOES defined in GL_OES_EGL_image already. */
-#endif
-
-/* GL_OES_element_index_uint */
 #ifndef GL_OES_element_index_uint
 #define GL_OES_element_index_uint 1
-#endif
+#define GL_UNSIGNED_INT                   0x1405
+#endif /* GL_OES_element_index_uint */
 
-/* GL_OES_extended_matrix_palette */
 #ifndef GL_OES_extended_matrix_palette
 #define GL_OES_extended_matrix_palette 1
-#endif
+#endif /* GL_OES_extended_matrix_palette */
 
-/* GL_OES_fbo_render_mipmap */
 #ifndef GL_OES_fbo_render_mipmap
 #define GL_OES_fbo_render_mipmap 1
-#endif
+#endif /* GL_OES_fbo_render_mipmap */
 
-/* GL_OES_fixed_point */
 #ifndef GL_OES_fixed_point
 #define GL_OES_fixed_point 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY glAlphaFuncxOES (GLenum func, GLclampx ref);
-GL_API void GL_APIENTRY glClearColorxOES (GLclampx red, GLclampx green, GLclampx blue, GLclampx alpha);
-GL_API void GL_APIENTRY glClearDepthxOES (GLclampx depth);
-GL_API void GL_APIENTRY glClipPlanexOES (GLenum plane, const GLfixed *equation);
-GL_API void GL_APIENTRY glColor4xOES (GLfixed red, GLfixed green, GLfixed blue, GLfixed alpha);
-GL_API void GL_APIENTRY glDepthRangexOES (GLclampx zNear, GLclampx zFar);
-GL_API void GL_APIENTRY glFogxOES (GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glFogxvOES (GLenum pname, const GLfixed *params);
-GL_API void GL_APIENTRY glFrustumxOES (GLfixed left, GLfixed right, GLfixed bottom, GLfixed top, GLfixed zNear, GLfixed zFar);
-GL_API void GL_APIENTRY glGetClipPlanexOES (GLenum pname, GLfixed eqn[4]);
-GL_API void GL_APIENTRY glGetFixedvOES (GLenum pname, GLfixed *params);
-GL_API void GL_APIENTRY glGetLightxvOES (GLenum light, GLenum pname, GLfixed *params);
-GL_API void GL_APIENTRY glGetMaterialxvOES (GLenum face, GLenum pname, GLfixed *params);
-GL_API void GL_APIENTRY glGetTexEnvxvOES (GLenum env, GLenum pname, GLfixed *params);
-GL_API void GL_APIENTRY glGetTexParameterxvOES (GLenum target, GLenum pname, GLfixed *params);
-GL_API void GL_APIENTRY glLightModelxOES (GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glLightModelxvOES (GLenum pname, const GLfixed *params);
-GL_API void GL_APIENTRY glLightxOES (GLenum light, GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glLightxvOES (GLenum light, GLenum pname, const GLfixed *params);
-GL_API void GL_APIENTRY glLineWidthxOES (GLfixed width);
-GL_API void GL_APIENTRY glLoadMatrixxOES (const GLfixed *m);
-GL_API void GL_APIENTRY glMaterialxOES (GLenum face, GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glMaterialxvOES (GLenum face, GLenum pname, const GLfixed *params);
-GL_API void GL_APIENTRY glMultMatrixxOES (const GLfixed *m);
-GL_API void GL_APIENTRY glMultiTexCoord4xOES (GLenum target, GLfixed s, GLfixed t, GLfixed r, GLfixed q);
-GL_API void GL_APIENTRY glNormal3xOES (GLfixed nx, GLfixed ny, GLfixed nz);
-GL_API void GL_APIENTRY glOrthoxOES (GLfixed left, GLfixed right, GLfixed bottom, GLfixed top, GLfixed zNear, GLfixed zFar);
-GL_API void GL_APIENTRY glPointParameterxOES (GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glPointParameterxvOES (GLenum pname, const GLfixed *params);
-GL_API void GL_APIENTRY glPointSizexOES (GLfixed size);
-GL_API void GL_APIENTRY glPolygonOffsetxOES (GLfixed factor, GLfixed units);
-GL_API void GL_APIENTRY glRotatexOES (GLfixed angle, GLfixed x, GLfixed y, GLfixed z);
-GL_API void GL_APIENTRY glSampleCoveragexOES (GLclampx value, GLboolean invert);
-GL_API void GL_APIENTRY glScalexOES (GLfixed x, GLfixed y, GLfixed z);
-GL_API void GL_APIENTRY glTexEnvxOES (GLenum target, GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glTexEnvxvOES (GLenum target, GLenum pname, const GLfixed *params);
-GL_API void GL_APIENTRY glTexParameterxOES (GLenum target, GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glTexParameterxvOES (GLenum target, GLenum pname, const GLfixed *params);
-GL_API void GL_APIENTRY glTranslatexOES (GLfixed x, GLfixed y, GLfixed z);
-#endif
-typedef void (GL_APIENTRYP PFNGLALPHAFUNCXOESPROC) (GLenum func, GLclampx ref);
-typedef void (GL_APIENTRYP PFNGLCLEARCOLORXOESPROC) (GLclampx red, GLclampx green, GLclampx blue, GLclampx alpha);
-typedef void (GL_APIENTRYP PFNGLCLEARDEPTHXOESPROC) (GLclampx depth);
+#define GL_FIXED_OES                      0x140C
+typedef void (GL_APIENTRYP PFNGLALPHAFUNCXOESPROC) (GLenum func, GLfixed ref);
+typedef void (GL_APIENTRYP PFNGLCLEARCOLORXOESPROC) (GLfixed red, GLfixed green, GLfixed blue, GLfixed alpha);
+typedef void (GL_APIENTRYP PFNGLCLEARDEPTHXOESPROC) (GLfixed depth);
 typedef void (GL_APIENTRYP PFNGLCLIPPLANEXOESPROC) (GLenum plane, const GLfixed *equation);
 typedef void (GL_APIENTRYP PFNGLCOLOR4XOESPROC) (GLfixed red, GLfixed green, GLfixed blue, GLfixed alpha);
-typedef void (GL_APIENTRYP PFNGLDEPTHRANGEXOESPROC) (GLclampx zNear, GLclampx zFar);
+typedef void (GL_APIENTRYP PFNGLDEPTHRANGEXOESPROC) (GLfixed n, GLfixed f);
 typedef void (GL_APIENTRYP PFNGLFOGXOESPROC) (GLenum pname, GLfixed param);
-typedef void (GL_APIENTRYP PFNGLFOGXVOESPROC) (GLenum pname, const GLfixed *params);
-typedef void (GL_APIENTRYP PFNGLFRUSTUMXOESPROC) (GLfixed left, GLfixed right, GLfixed bottom, GLfixed top, GLfixed zNear, GLfixed zFar);
-typedef void (GL_APIENTRYP PFNGLGETCLIPPLANEXOESPROC) (GLenum pname, GLfixed eqn[4]);
+typedef void (GL_APIENTRYP PFNGLFOGXVOESPROC) (GLenum pname, const GLfixed *param);
+typedef void (GL_APIENTRYP PFNGLFRUSTUMXOESPROC) (GLfixed l, GLfixed r, GLfixed b, GLfixed t, GLfixed n, GLfixed f);
+typedef void (GL_APIENTRYP PFNGLGETCLIPPLANEXOESPROC) (GLenum plane, GLfixed *equation);
 typedef void (GL_APIENTRYP PFNGLGETFIXEDVOESPROC) (GLenum pname, GLfixed *params);
-typedef void (GL_APIENTRYP PFNGLGETLIGHTXVOESPROC) (GLenum light, GLenum pname, GLfixed *params);
-typedef void (GL_APIENTRYP PFNGLGETMATERIALXVOESPROC) (GLenum face, GLenum pname, GLfixed *params);
-typedef void (GL_APIENTRYP PFNGLGETTEXENVXVOESPROC) (GLenum env, GLenum pname, GLfixed *params);
+typedef void (GL_APIENTRYP PFNGLGETTEXENVXVOESPROC) (GLenum target, GLenum pname, GLfixed *params);
 typedef void (GL_APIENTRYP PFNGLGETTEXPARAMETERXVOESPROC) (GLenum target, GLenum pname, GLfixed *params);
 typedef void (GL_APIENTRYP PFNGLLIGHTMODELXOESPROC) (GLenum pname, GLfixed param);
-typedef void (GL_APIENTRYP PFNGLLIGHTMODELXVOESPROC) (GLenum pname, const GLfixed *params);
+typedef void (GL_APIENTRYP PFNGLLIGHTMODELXVOESPROC) (GLenum pname, const GLfixed *param);
 typedef void (GL_APIENTRYP PFNGLLIGHTXOESPROC) (GLenum light, GLenum pname, GLfixed param);
 typedef void (GL_APIENTRYP PFNGLLIGHTXVOESPROC) (GLenum light, GLenum pname, const GLfixed *params);
 typedef void (GL_APIENTRYP PFNGLLINEWIDTHXOESPROC) (GLfixed width);
 typedef void (GL_APIENTRYP PFNGLLOADMATRIXXOESPROC) (const GLfixed *m);
 typedef void (GL_APIENTRYP PFNGLMATERIALXOESPROC) (GLenum face, GLenum pname, GLfixed param);
-typedef void (GL_APIENTRYP PFNGLMATERIALXVOESPROC) (GLenum face, GLenum pname, const GLfixed *params);
+typedef void (GL_APIENTRYP PFNGLMATERIALXVOESPROC) (GLenum face, GLenum pname, const GLfixed *param);
 typedef void (GL_APIENTRYP PFNGLMULTMATRIXXOESPROC) (const GLfixed *m);
-typedef void (GL_APIENTRYP PFNGLMULTITEXCOORD4XOESPROC) (GLenum target, GLfixed s, GLfixed t, GLfixed r, GLfixed q);
+typedef void (GL_APIENTRYP PFNGLMULTITEXCOORD4XOESPROC) (GLenum texture, GLfixed s, GLfixed t, GLfixed r, GLfixed q);
 typedef void (GL_APIENTRYP PFNGLNORMAL3XOESPROC) (GLfixed nx, GLfixed ny, GLfixed nz);
-typedef void (GL_APIENTRYP PFNGLORTHOXOESPROC) (GLfixed left, GLfixed right, GLfixed bottom, GLfixed top, GLfixed zNear, GLfixed zFar);
-typedef void (GL_APIENTRYP PFNGLPOINTPARAMETERXOESPROC) (GLenum pname, GLfixed param);
+typedef void (GL_APIENTRYP PFNGLORTHOXOESPROC) (GLfixed l, GLfixed r, GLfixed b, GLfixed t, GLfixed n, GLfixed f);
 typedef void (GL_APIENTRYP PFNGLPOINTPARAMETERXVOESPROC) (GLenum pname, const GLfixed *params);
 typedef void (GL_APIENTRYP PFNGLPOINTSIZEXOESPROC) (GLfixed size);
 typedef void (GL_APIENTRYP PFNGLPOLYGONOFFSETXOESPROC) (GLfixed factor, GLfixed units);
 typedef void (GL_APIENTRYP PFNGLROTATEXOESPROC) (GLfixed angle, GLfixed x, GLfixed y, GLfixed z);
-typedef void (GL_APIENTRYP PFNGLSAMPLECOVERAGEXOESPROC) (GLclampx value, GLboolean invert);
 typedef void (GL_APIENTRYP PFNGLSCALEXOESPROC) (GLfixed x, GLfixed y, GLfixed z);
 typedef void (GL_APIENTRYP PFNGLTEXENVXOESPROC) (GLenum target, GLenum pname, GLfixed param);
 typedef void (GL_APIENTRYP PFNGLTEXENVXVOESPROC) (GLenum target, GLenum pname, const GLfixed *params);
 typedef void (GL_APIENTRYP PFNGLTEXPARAMETERXOESPROC) (GLenum target, GLenum pname, GLfixed param);
 typedef void (GL_APIENTRYP PFNGLTEXPARAMETERXVOESPROC) (GLenum target, GLenum pname, const GLfixed *params);
 typedef void (GL_APIENTRYP PFNGLTRANSLATEXOESPROC) (GLfixed x, GLfixed y, GLfixed z);
+typedef void (GL_APIENTRYP PFNGLGETLIGHTXVOESPROC) (GLenum light, GLenum pname, GLfixed *params);
+typedef void (GL_APIENTRYP PFNGLGETMATERIALXVOESPROC) (GLenum face, GLenum pname, GLfixed *params);
+typedef void (GL_APIENTRYP PFNGLPOINTPARAMETERXOESPROC) (GLenum pname, GLfixed param);
+typedef void (GL_APIENTRYP PFNGLSAMPLECOVERAGEXOESPROC) (GLclampx value, GLboolean invert);
+typedef void (GL_APIENTRYP PFNGLGETTEXGENXVOESPROC) (GLenum coord, GLenum pname, GLfixed *params);
+typedef void (GL_APIENTRYP PFNGLTEXGENXOESPROC) (GLenum coord, GLenum pname, GLfixed param);
+typedef void (GL_APIENTRYP PFNGLTEXGENXVOESPROC) (GLenum coord, GLenum pname, const GLfixed *params);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void GL_APIENTRY glAlphaFuncxOES (GLenum func, GLfixed ref);
+GL_API void GL_APIENTRY glClearColorxOES (GLfixed red, GLfixed green, GLfixed blue, GLfixed alpha);
+GL_API void GL_APIENTRY glClearDepthxOES (GLfixed depth);
+GL_API void GL_APIENTRY glClipPlanexOES (GLenum plane, const GLfixed *equation);
+GL_API void GL_APIENTRY glColor4xOES (GLfixed red, GLfixed green, GLfixed blue, GLfixed alpha);
+GL_API void GL_APIENTRY glDepthRangexOES (GLfixed n, GLfixed f);
+GL_API void GL_APIENTRY glFogxOES (GLenum pname, GLfixed param);
+GL_API void GL_APIENTRY glFogxvOES (GLenum pname, const GLfixed *param);
+GL_API void GL_APIENTRY glFrustumxOES (GLfixed l, GLfixed r, GLfixed b, GLfixed t, GLfixed n, GLfixed f);
+GL_API void GL_APIENTRY glGetClipPlanexOES (GLenum plane, GLfixed *equation);
+GL_API void GL_APIENTRY glGetFixedvOES (GLenum pname, GLfixed *params);
+GL_API void GL_APIENTRY glGetTexEnvxvOES (GLenum target, GLenum pname, GLfixed *params);
+GL_API void GL_APIENTRY glGetTexParameterxvOES (GLenum target, GLenum pname, GLfixed *params);
+GL_API void GL_APIENTRY glLightModelxOES (GLenum pname, GLfixed param);
+GL_API void GL_APIENTRY glLightModelxvOES (GLenum pname, const GLfixed *param);
+GL_API void GL_APIENTRY glLightxOES (GLenum light, GLenum pname, GLfixed param);
+GL_API void GL_APIENTRY glLightxvOES (GLenum light, GLenum pname, const GLfixed *params);
+GL_API void GL_APIENTRY glLineWidthxOES (GLfixed width);
+GL_API void GL_APIENTRY glLoadMatrixxOES (const GLfixed *m);
+GL_API void GL_APIENTRY glMaterialxOES (GLenum face, GLenum pname, GLfixed param);
+GL_API void GL_APIENTRY glMaterialxvOES (GLenum face, GLenum pname, const GLfixed *param);
+GL_API void GL_APIENTRY glMultMatrixxOES (const GLfixed *m);
+GL_API void GL_APIENTRY glMultiTexCoord4xOES (GLenum texture, GLfixed s, GLfixed t, GLfixed r, GLfixed q);
+GL_API void GL_APIENTRY glNormal3xOES (GLfixed nx, GLfixed ny, GLfixed nz);
+GL_API void GL_APIENTRY glOrthoxOES (GLfixed l, GLfixed r, GLfixed b, GLfixed t, GLfixed n, GLfixed f);
+GL_API void GL_APIENTRY glPointParameterxvOES (GLenum pname, const GLfixed *params);
+GL_API void GL_APIENTRY glPointSizexOES (GLfixed size);
+GL_API void GL_APIENTRY glPolygonOffsetxOES (GLfixed factor, GLfixed units);
+GL_API void GL_APIENTRY glRotatexOES (GLfixed angle, GLfixed x, GLfixed y, GLfixed z);
+GL_API void GL_APIENTRY glScalexOES (GLfixed x, GLfixed y, GLfixed z);
+GL_API void GL_APIENTRY glTexEnvxOES (GLenum target, GLenum pname, GLfixed param);
+GL_API void GL_APIENTRY glTexEnvxvOES (GLenum target, GLenum pname, const GLfixed *params);
+GL_API void GL_APIENTRY glTexParameterxOES (GLenum target, GLenum pname, GLfixed param);
+GL_API void GL_APIENTRY glTexParameterxvOES (GLenum target, GLenum pname, const GLfixed *params);
+GL_API void GL_APIENTRY glTranslatexOES (GLfixed x, GLfixed y, GLfixed z);
+GL_API void GL_APIENTRY glGetLightxvOES (GLenum light, GLenum pname, GLfixed *params);
+GL_API void GL_APIENTRY glGetMaterialxvOES (GLenum face, GLenum pname, GLfixed *params);
+GL_API void GL_APIENTRY glPointParameterxOES (GLenum pname, GLfixed param);
+GL_API void GL_APIENTRY glSampleCoveragexOES (GLclampx value, GLboolean invert);
+GL_API void GL_APIENTRY glGetTexGenxvOES (GLenum coord, GLenum pname, GLfixed *params);
+GL_API void GL_APIENTRY glTexGenxOES (GLenum coord, GLenum pname, GLfixed param);
+GL_API void GL_APIENTRY glTexGenxvOES (GLenum coord, GLenum pname, const GLfixed *params);
 #endif
+#endif /* GL_OES_fixed_point */
 
-/* GL_OES_framebuffer_object */
 #ifndef GL_OES_framebuffer_object
 #define GL_OES_framebuffer_object 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API GLboolean GL_APIENTRY glIsRenderbufferOES (GLuint renderbuffer);
-GL_API void GL_APIENTRY glBindRenderbufferOES (GLenum target, GLuint renderbuffer);
-GL_API void GL_APIENTRY glDeleteRenderbuffersOES (GLsizei n, const GLuint* renderbuffers);
-GL_API void GL_APIENTRY glGenRenderbuffersOES (GLsizei n, GLuint* renderbuffers);
-GL_API void GL_APIENTRY glRenderbufferStorageOES (GLenum target, GLenum internalformat, GLsizei width, GLsizei height);
-GL_API void GL_APIENTRY glGetRenderbufferParameterivOES (GLenum target, GLenum pname, GLint* params);
-GL_API GLboolean GL_APIENTRY glIsFramebufferOES (GLuint framebuffer);
-GL_API void GL_APIENTRY glBindFramebufferOES (GLenum target, GLuint framebuffer);
-GL_API void GL_APIENTRY glDeleteFramebuffersOES (GLsizei n, const GLuint* framebuffers);
-GL_API void GL_APIENTRY glGenFramebuffersOES (GLsizei n, GLuint* framebuffers);
-GL_API GLenum GL_APIENTRY glCheckFramebufferStatusOES (GLenum target);
-GL_API void GL_APIENTRY glFramebufferRenderbufferOES (GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer);
-GL_API void GL_APIENTRY glFramebufferTexture2DOES (GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
-GL_API void GL_APIENTRY glGetFramebufferAttachmentParameterivOES (GLenum target, GLenum attachment, GLenum pname, GLint* params);
-GL_API void GL_APIENTRY glGenerateMipmapOES (GLenum target);
-#endif
+#define GL_NONE_OES                       0
+#define GL_FRAMEBUFFER_OES                0x8D40
+#define GL_RENDERBUFFER_OES               0x8D41
+#define GL_RGBA4_OES                      0x8056
+#define GL_RGB5_A1_OES                    0x8057
+#define GL_RGB565_OES                     0x8D62
+#define GL_DEPTH_COMPONENT16_OES          0x81A5
+#define GL_RENDERBUFFER_WIDTH_OES         0x8D42
+#define GL_RENDERBUFFER_HEIGHT_OES        0x8D43
+#define GL_RENDERBUFFER_INTERNAL_FORMAT_OES 0x8D44
+#define GL_RENDERBUFFER_RED_SIZE_OES      0x8D50
+#define GL_RENDERBUFFER_GREEN_SIZE_OES    0x8D51
+#define GL_RENDERBUFFER_BLUE_SIZE_OES     0x8D52
+#define GL_RENDERBUFFER_ALPHA_SIZE_OES    0x8D53
+#define GL_RENDERBUFFER_DEPTH_SIZE_OES    0x8D54
+#define GL_RENDERBUFFER_STENCIL_SIZE_OES  0x8D55
+#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_OES 0x8CD0
+#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_OES 0x8CD1
+#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_OES 0x8CD2
+#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE_OES 0x8CD3
+#define GL_COLOR_ATTACHMENT0_OES          0x8CE0
+#define GL_DEPTH_ATTACHMENT_OES           0x8D00
+#define GL_STENCIL_ATTACHMENT_OES         0x8D20
+#define GL_FRAMEBUFFER_COMPLETE_OES       0x8CD5
+#define GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT_OES 0x8CD6
+#define GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_OES 0x8CD7
+#define GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_OES 0x8CD9
+#define GL_FRAMEBUFFER_INCOMPLETE_FORMATS_OES 0x8CDA
+#define GL_FRAMEBUFFER_UNSUPPORTED_OES    0x8CDD
+#define GL_FRAMEBUFFER_BINDING_OES        0x8CA6
+#define GL_RENDERBUFFER_BINDING_OES       0x8CA7
+#define GL_MAX_RENDERBUFFER_SIZE_OES      0x84E8
+#define GL_INVALID_FRAMEBUFFER_OPERATION_OES 0x0506
 typedef GLboolean (GL_APIENTRYP PFNGLISRENDERBUFFEROESPROC) (GLuint renderbuffer);
 typedef void (GL_APIENTRYP PFNGLBINDRENDERBUFFEROESPROC) (GLenum target, GLuint renderbuffer);
-typedef void (GL_APIENTRYP PFNGLDELETERENDERBUFFERSOESPROC) (GLsizei n, const GLuint* renderbuffers);
-typedef void (GL_APIENTRYP PFNGLGENRENDERBUFFERSOESPROC) (GLsizei n, GLuint* renderbuffers);
+typedef void (GL_APIENTRYP PFNGLDELETERENDERBUFFERSOESPROC) (GLsizei n, const GLuint *renderbuffers);
+typedef void (GL_APIENTRYP PFNGLGENRENDERBUFFERSOESPROC) (GLsizei n, GLuint *renderbuffers);
 typedef void (GL_APIENTRYP PFNGLRENDERBUFFERSTORAGEOESPROC) (GLenum target, GLenum internalformat, GLsizei width, GLsizei height);
-typedef void (GL_APIENTRYP PFNGLGETRENDERBUFFERPARAMETERIVOESPROC) (GLenum target, GLenum pname, GLint* params);
+typedef void (GL_APIENTRYP PFNGLGETRENDERBUFFERPARAMETERIVOESPROC) (GLenum target, GLenum pname, GLint *params);
 typedef GLboolean (GL_APIENTRYP PFNGLISFRAMEBUFFEROESPROC) (GLuint framebuffer);
 typedef void (GL_APIENTRYP PFNGLBINDFRAMEBUFFEROESPROC) (GLenum target, GLuint framebuffer);
-typedef void (GL_APIENTRYP PFNGLDELETEFRAMEBUFFERSOESPROC) (GLsizei n, const GLuint* framebuffers);
-typedef void (GL_APIENTRYP PFNGLGENFRAMEBUFFERSOESPROC) (GLsizei n, GLuint* framebuffers);
+typedef void (GL_APIENTRYP PFNGLDELETEFRAMEBUFFERSOESPROC) (GLsizei n, const GLuint *framebuffers);
+typedef void (GL_APIENTRYP PFNGLGENFRAMEBUFFERSOESPROC) (GLsizei n, GLuint *framebuffers);
 typedef GLenum (GL_APIENTRYP PFNGLCHECKFRAMEBUFFERSTATUSOESPROC) (GLenum target);
 typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERRENDERBUFFEROESPROC) (GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer);
 typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERTEXTURE2DOESPROC) (GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
-typedef void (GL_APIENTRYP PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVOESPROC) (GLenum target, GLenum attachment, GLenum pname, GLint* params);
+typedef void (GL_APIENTRYP PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVOESPROC) (GLenum target, GLenum attachment, GLenum pname, GLint *params);
 typedef void (GL_APIENTRYP PFNGLGENERATEMIPMAPOESPROC) (GLenum target);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API GLboolean GL_APIENTRY glIsRenderbufferOES (GLuint renderbuffer);
+GL_API void GL_APIENTRY glBindRenderbufferOES (GLenum target, GLuint renderbuffer);
+GL_API void GL_APIENTRY glDeleteRenderbuffersOES (GLsizei n, const GLuint *renderbuffers);
+GL_API void GL_APIENTRY glGenRenderbuffersOES (GLsizei n, GLuint *renderbuffers);
+GL_API void GL_APIENTRY glRenderbufferStorageOES (GLenum target, GLenum internalformat, GLsizei width, GLsizei height);
+GL_API void GL_APIENTRY glGetRenderbufferParameterivOES (GLenum target, GLenum pname, GLint *params);
+GL_API GLboolean GL_APIENTRY glIsFramebufferOES (GLuint framebuffer);
+GL_API void GL_APIENTRY glBindFramebufferOES (GLenum target, GLuint framebuffer);
+GL_API void GL_APIENTRY glDeleteFramebuffersOES (GLsizei n, const GLuint *framebuffers);
+GL_API void GL_APIENTRY glGenFramebuffersOES (GLsizei n, GLuint *framebuffers);
+GL_API GLenum GL_APIENTRY glCheckFramebufferStatusOES (GLenum target);
+GL_API void GL_APIENTRY glFramebufferRenderbufferOES (GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer);
+GL_API void GL_APIENTRY glFramebufferTexture2DOES (GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
+GL_API void GL_APIENTRY glGetFramebufferAttachmentParameterivOES (GLenum target, GLenum attachment, GLenum pname, GLint *params);
+GL_API void GL_APIENTRY glGenerateMipmapOES (GLenum target);
 #endif
+#endif /* GL_OES_framebuffer_object */
 
-/* GL_OES_mapbuffer */
 #ifndef GL_OES_mapbuffer
 #define GL_OES_mapbuffer 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void* GL_APIENTRY glMapBufferOES (GLenum target, GLenum access);
-GL_API GLboolean GL_APIENTRY glUnmapBufferOES (GLenum target);
-GL_API void GL_APIENTRY glGetBufferPointervOES (GLenum target, GLenum pname, GLvoid ** params);
-#endif
-typedef void* (GL_APIENTRYP PFNGLMAPBUFFEROESPROC) (GLenum target, GLenum access);
+#define GL_WRITE_ONLY_OES                 0x88B9
+#define GL_BUFFER_ACCESS_OES              0x88BB
+#define GL_BUFFER_MAPPED_OES              0x88BC
+#define GL_BUFFER_MAP_POINTER_OES         0x88BD
+typedef void *(GL_APIENTRYP PFNGLMAPBUFFEROESPROC) (GLenum target, GLenum access);
 typedef GLboolean (GL_APIENTRYP PFNGLUNMAPBUFFEROESPROC) (GLenum target);
-typedef void (GL_APIENTRYP PFNGLGETBUFFERPOINTERVOESPROC) (GLenum target, GLenum pname, GLvoid ** params);
+typedef void (GL_APIENTRYP PFNGLGETBUFFERPOINTERVOESPROC) (GLenum target, GLenum pname, void **params);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void *GL_APIENTRY glMapBufferOES (GLenum target, GLenum access);
+GL_API GLboolean GL_APIENTRY glUnmapBufferOES (GLenum target);
+GL_API void GL_APIENTRY glGetBufferPointervOES (GLenum target, GLenum pname, void **params);
 #endif
+#endif /* GL_OES_mapbuffer */
 
-/* GL_OES_matrix_get */
 #ifndef GL_OES_matrix_get
 #define GL_OES_matrix_get 1
-#endif
+#define GL_MODELVIEW_MATRIX_FLOAT_AS_INT_BITS_OES 0x898D
+#define GL_PROJECTION_MATRIX_FLOAT_AS_INT_BITS_OES 0x898E
+#define GL_TEXTURE_MATRIX_FLOAT_AS_INT_BITS_OES 0x898F
+#endif /* GL_OES_matrix_get */
 
-/* GL_OES_matrix_palette */
 #ifndef GL_OES_matrix_palette
 #define GL_OES_matrix_palette 1
+#define GL_MAX_VERTEX_UNITS_OES           0x86A4
+#define GL_MAX_PALETTE_MATRICES_OES       0x8842
+#define GL_MATRIX_PALETTE_OES             0x8840
+#define GL_MATRIX_INDEX_ARRAY_OES         0x8844
+#define GL_WEIGHT_ARRAY_OES               0x86AD
+#define GL_CURRENT_PALETTE_MATRIX_OES     0x8843
+#define GL_MATRIX_INDEX_ARRAY_SIZE_OES    0x8846
+#define GL_MATRIX_INDEX_ARRAY_TYPE_OES    0x8847
+#define GL_MATRIX_INDEX_ARRAY_STRIDE_OES  0x8848
+#define GL_MATRIX_INDEX_ARRAY_POINTER_OES 0x8849
+#define GL_MATRIX_INDEX_ARRAY_BUFFER_BINDING_OES 0x8B9E
+#define GL_WEIGHT_ARRAY_SIZE_OES          0x86AB
+#define GL_WEIGHT_ARRAY_TYPE_OES          0x86A9
+#define GL_WEIGHT_ARRAY_STRIDE_OES        0x86AA
+#define GL_WEIGHT_ARRAY_POINTER_OES       0x86AC
+#define GL_WEIGHT_ARRAY_BUFFER_BINDING_OES 0x889E
+typedef void (GL_APIENTRYP PFNGLCURRENTPALETTEMATRIXOESPROC) (GLuint matrixpaletteindex);
+typedef void (GL_APIENTRYP PFNGLLOADPALETTEFROMMODELVIEWMATRIXOESPROC) (void);
+typedef void (GL_APIENTRYP PFNGLMATRIXINDEXPOINTEROESPROC) (GLint size, GLenum type, GLsizei stride, const void *pointer);
+typedef void (GL_APIENTRYP PFNGLWEIGHTPOINTEROESPROC) (GLint size, GLenum type, GLsizei stride, const void *pointer);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glCurrentPaletteMatrixOES (GLuint matrixpaletteindex);
 GL_API void GL_APIENTRY glLoadPaletteFromModelViewMatrixOES (void);
-GL_API void GL_APIENTRY glMatrixIndexPointerOES (GLint size, GLenum type, GLsizei stride, const GLvoid *pointer);
-GL_API void GL_APIENTRY glWeightPointerOES (GLint size, GLenum type, GLsizei stride, const GLvoid *pointer);
+GL_API void GL_APIENTRY glMatrixIndexPointerOES (GLint size, GLenum type, GLsizei stride, const void *pointer);
+GL_API void GL_APIENTRY glWeightPointerOES (GLint size, GLenum type, GLsizei stride, const void *pointer);
 #endif
-typedef void (GL_APIENTRYP PFNGLCURRENTPALETTEMATRIXOESPROC) (GLuint matrixpaletteindex);
-typedef void (GL_APIENTRYP PFNGLLOADPALETTEFROMMODELVIEWMATRIXOESPROC) (void);
-typedef void (GL_APIENTRYP PFNGLMATRIXINDEXPOINTEROESPROC) (GLint size, GLenum type, GLsizei stride, const GLvoid *pointer);
-typedef void (GL_APIENTRYP PFNGLWEIGHTPOINTEROESPROC) (GLint size, GLenum type, GLsizei stride, const GLvoid *pointer);
-#endif
+#endif /* GL_OES_matrix_palette */
 
-/* GL_OES_packed_depth_stencil */
 #ifndef GL_OES_packed_depth_stencil
 #define GL_OES_packed_depth_stencil 1
-#endif
+#define GL_DEPTH_STENCIL_OES              0x84F9
+#define GL_UNSIGNED_INT_24_8_OES          0x84FA
+#define GL_DEPTH24_STENCIL8_OES           0x88F0
+#endif /* GL_OES_packed_depth_stencil */
 
-/* GL_OES_required_internalformat */
-#ifndef GL_OES_required_internalformat
-#define GL_OES_required_internalformat 1
-#endif
-
-/* GL_OES_query_matrix */
 #ifndef GL_OES_query_matrix
 #define GL_OES_query_matrix 1
+typedef GLbitfield (GL_APIENTRYP PFNGLQUERYMATRIXXOESPROC) (GLfixed *mantissa, GLint *exponent);
 #ifdef GL_GLEXT_PROTOTYPES
-GL_API GLbitfield GL_APIENTRY glQueryMatrixxOES (GLfixed mantissa[16], GLint exponent[16]);
+GL_API GLbitfield GL_APIENTRY glQueryMatrixxOES (GLfixed *mantissa, GLint *exponent);
 #endif
-typedef GLbitfield (GL_APIENTRYP PFNGLQUERYMATRIXXOESPROC) (GLfixed mantissa[16], GLint exponent[16]);
-#endif
+#endif /* GL_OES_query_matrix */
 
-/* GL_OES_rgb8_rgba8 */
+#ifndef GL_OES_required_internalformat
+#define GL_OES_required_internalformat 1
+#define GL_ALPHA8_OES                     0x803C
+#define GL_LUMINANCE4_ALPHA4_OES          0x8043
+#define GL_LUMINANCE8_ALPHA8_OES          0x8045
+#define GL_LUMINANCE8_OES                 0x8040
+#define GL_RGB8_OES                       0x8051
+#define GL_RGBA8_OES                      0x8058
+#define GL_RGB10_EXT                      0x8052
+#define GL_RGB10_A2_EXT                   0x8059
+#endif /* GL_OES_required_internalformat */
+
 #ifndef GL_OES_rgb8_rgba8
 #define GL_OES_rgb8_rgba8 1
-#endif
+#endif /* GL_OES_rgb8_rgba8 */
 
-/* GL_OES_single_precision */
 #ifndef GL_OES_single_precision
 #define GL_OES_single_precision 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY glDepthRangefOES (GLclampf zNear, GLclampf zFar);
-GL_API void GL_APIENTRY glFrustumfOES (GLfloat left, GLfloat right, GLfloat bottom, GLfloat top, GLfloat zNear, GLfloat zFar);
-GL_API void GL_APIENTRY glOrthofOES (GLfloat left, GLfloat right, GLfloat bottom, GLfloat top, GLfloat zNear, GLfloat zFar);
-GL_API void GL_APIENTRY glClipPlanefOES (GLenum plane, const GLfloat *equation);
-GL_API void GL_APIENTRY glGetClipPlanefOES (GLenum pname, GLfloat eqn[4]);
-GL_API void GL_APIENTRY glClearDepthfOES (GLclampf depth);
-#endif
-typedef void (GL_APIENTRYP PFNGLDEPTHRANGEFOESPROC) (GLclampf zNear, GLclampf zFar);
-typedef void (GL_APIENTRYP PFNGLFRUSTUMFOESPROC) (GLfloat left, GLfloat right, GLfloat bottom, GLfloat top, GLfloat zNear, GLfloat zFar);
-typedef void (GL_APIENTRYP PFNGLORTHOFOESPROC) (GLfloat left, GLfloat right, GLfloat bottom, GLfloat top, GLfloat zNear, GLfloat zFar);
-typedef void (GL_APIENTRYP PFNGLCLIPPLANEFOESPROC) (GLenum plane, const GLfloat *equation);
-typedef void (GL_APIENTRYP PFNGLGETCLIPPLANEFOESPROC) (GLenum pname, GLfloat eqn[4]);
 typedef void (GL_APIENTRYP PFNGLCLEARDEPTHFOESPROC) (GLclampf depth);
+typedef void (GL_APIENTRYP PFNGLCLIPPLANEFOESPROC) (GLenum plane, const GLfloat *equation);
+typedef void (GL_APIENTRYP PFNGLDEPTHRANGEFOESPROC) (GLclampf n, GLclampf f);
+typedef void (GL_APIENTRYP PFNGLFRUSTUMFOESPROC) (GLfloat l, GLfloat r, GLfloat b, GLfloat t, GLfloat n, GLfloat f);
+typedef void (GL_APIENTRYP PFNGLGETCLIPPLANEFOESPROC) (GLenum plane, GLfloat *equation);
+typedef void (GL_APIENTRYP PFNGLORTHOFOESPROC) (GLfloat l, GLfloat r, GLfloat b, GLfloat t, GLfloat n, GLfloat f);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void GL_APIENTRY glClearDepthfOES (GLclampf depth);
+GL_API void GL_APIENTRY glClipPlanefOES (GLenum plane, const GLfloat *equation);
+GL_API void GL_APIENTRY glDepthRangefOES (GLclampf n, GLclampf f);
+GL_API void GL_APIENTRY glFrustumfOES (GLfloat l, GLfloat r, GLfloat b, GLfloat t, GLfloat n, GLfloat f);
+GL_API void GL_APIENTRY glGetClipPlanefOES (GLenum plane, GLfloat *equation);
+GL_API void GL_APIENTRY glOrthofOES (GLfloat l, GLfloat r, GLfloat b, GLfloat t, GLfloat n, GLfloat f);
 #endif
+#endif /* GL_OES_single_precision */
 
-/* GL_OES_stencil1 */
 #ifndef GL_OES_stencil1
 #define GL_OES_stencil1 1
-#endif
+#define GL_STENCIL_INDEX1_OES             0x8D46
+#endif /* GL_OES_stencil1 */
 
-/* GL_OES_stencil4 */
 #ifndef GL_OES_stencil4
 #define GL_OES_stencil4 1
-#endif
+#define GL_STENCIL_INDEX4_OES             0x8D47
+#endif /* GL_OES_stencil4 */
 
-/* GL_OES_stencil8 */
 #ifndef GL_OES_stencil8
 #define GL_OES_stencil8 1
-#endif
+#define GL_STENCIL_INDEX8_OES             0x8D48
+#endif /* GL_OES_stencil8 */
 
-/* GL_OES_stencil_wrap */
 #ifndef GL_OES_stencil_wrap
 #define GL_OES_stencil_wrap 1
-#endif
+#define GL_INCR_WRAP_OES                  0x8507
+#define GL_DECR_WRAP_OES                  0x8508
+#endif /* GL_OES_stencil_wrap */
 
-/* GL_OES_texture_cube_map */
 #ifndef GL_OES_texture_cube_map
 #define GL_OES_texture_cube_map 1
+#define GL_NORMAL_MAP_OES                 0x8511
+#define GL_REFLECTION_MAP_OES             0x8512
+#define GL_TEXTURE_CUBE_MAP_OES           0x8513
+#define GL_TEXTURE_BINDING_CUBE_MAP_OES   0x8514
+#define GL_TEXTURE_CUBE_MAP_POSITIVE_X_OES 0x8515
+#define GL_TEXTURE_CUBE_MAP_NEGATIVE_X_OES 0x8516
+#define GL_TEXTURE_CUBE_MAP_POSITIVE_Y_OES 0x8517
+#define GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_OES 0x8518
+#define GL_TEXTURE_CUBE_MAP_POSITIVE_Z_OES 0x8519
+#define GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_OES 0x851A
+#define GL_MAX_CUBE_MAP_TEXTURE_SIZE_OES  0x851C
+#define GL_TEXTURE_GEN_MODE_OES           0x2500
+#define GL_TEXTURE_GEN_STR_OES            0x8D60
+typedef void (GL_APIENTRYP PFNGLTEXGENFOESPROC) (GLenum coord, GLenum pname, GLfloat param);
+typedef void (GL_APIENTRYP PFNGLTEXGENFVOESPROC) (GLenum coord, GLenum pname, const GLfloat *params);
+typedef void (GL_APIENTRYP PFNGLTEXGENIOESPROC) (GLenum coord, GLenum pname, GLint param);
+typedef void (GL_APIENTRYP PFNGLTEXGENIVOESPROC) (GLenum coord, GLenum pname, const GLint *params);
+typedef void (GL_APIENTRYP PFNGLGETTEXGENFVOESPROC) (GLenum coord, GLenum pname, GLfloat *params);
+typedef void (GL_APIENTRYP PFNGLGETTEXGENIVOESPROC) (GLenum coord, GLenum pname, GLint *params);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glTexGenfOES (GLenum coord, GLenum pname, GLfloat param);
 GL_API void GL_APIENTRY glTexGenfvOES (GLenum coord, GLenum pname, const GLfloat *params);
 GL_API void GL_APIENTRY glTexGeniOES (GLenum coord, GLenum pname, GLint param);
 GL_API void GL_APIENTRY glTexGenivOES (GLenum coord, GLenum pname, const GLint *params);
-GL_API void GL_APIENTRY glTexGenxOES (GLenum coord, GLenum pname, GLfixed param);
-GL_API void GL_APIENTRY glTexGenxvOES (GLenum coord, GLenum pname, const GLfixed *params);
 GL_API void GL_APIENTRY glGetTexGenfvOES (GLenum coord, GLenum pname, GLfloat *params);
 GL_API void GL_APIENTRY glGetTexGenivOES (GLenum coord, GLenum pname, GLint *params);
-GL_API void GL_APIENTRY glGetTexGenxvOES (GLenum coord, GLenum pname, GLfixed *params);
 #endif
-typedef void (GL_APIENTRYP PFNGLTEXGENFOESPROC) (GLenum coord, GLenum pname, GLfloat param);
-typedef void (GL_APIENTRYP PFNGLTEXGENFVOESPROC) (GLenum coord, GLenum pname, const GLfloat *params);
-typedef void (GL_APIENTRYP PFNGLTEXGENIOESPROC) (GLenum coord, GLenum pname, GLint param);
-typedef void (GL_APIENTRYP PFNGLTEXGENIVOESPROC) (GLenum coord, GLenum pname, const GLint *params);
-typedef void (GL_APIENTRYP PFNGLTEXGENXOESPROC) (GLenum coord, GLenum pname, GLfixed param);
-typedef void (GL_APIENTRYP PFNGLTEXGENXVOESPROC) (GLenum coord, GLenum pname, const GLfixed *params);
-typedef void (GL_APIENTRYP PFNGLGETTEXGENFVOESPROC) (GLenum coord, GLenum pname, GLfloat *params);
-typedef void (GL_APIENTRYP PFNGLGETTEXGENIVOESPROC) (GLenum coord, GLenum pname, GLint *params);
-typedef void (GL_APIENTRYP PFNGLGETTEXGENXVOESPROC) (GLenum coord, GLenum pname, GLfixed *params);
-#endif
+#endif /* GL_OES_texture_cube_map */
 
-/* GL_OES_texture_env_crossbar */
 #ifndef GL_OES_texture_env_crossbar
 #define GL_OES_texture_env_crossbar 1
-#endif
+#endif /* GL_OES_texture_env_crossbar */
 
-/* GL_OES_texture_mirrored_repeat */
 #ifndef GL_OES_texture_mirrored_repeat
 #define GL_OES_texture_mirrored_repeat 1
-#endif
+#define GL_MIRRORED_REPEAT_OES            0x8370
+#endif /* GL_OES_texture_mirrored_repeat */
 
-/* GL_OES_vertex_array_object */
 #ifndef GL_OES_vertex_array_object
 #define GL_OES_vertex_array_object 1
+#define GL_VERTEX_ARRAY_BINDING_OES       0x85B5
+typedef void (GL_APIENTRYP PFNGLBINDVERTEXARRAYOESPROC) (GLuint array);
+typedef void (GL_APIENTRYP PFNGLDELETEVERTEXARRAYSOESPROC) (GLsizei n, const GLuint *arrays);
+typedef void (GL_APIENTRYP PFNGLGENVERTEXARRAYSOESPROC) (GLsizei n, GLuint *arrays);
+typedef GLboolean (GL_APIENTRYP PFNGLISVERTEXARRAYOESPROC) (GLuint array);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glBindVertexArrayOES (GLuint array);
 GL_API void GL_APIENTRY glDeleteVertexArraysOES (GLsizei n, const GLuint *arrays);
 GL_API void GL_APIENTRY glGenVertexArraysOES (GLsizei n, GLuint *arrays);
 GL_API GLboolean GL_APIENTRY glIsVertexArrayOES (GLuint array);
 #endif
-typedef void (GL_APIENTRYP PFNGLBINDVERTEXARRAYOESPROC) (GLuint array);
-typedef void (GL_APIENTRYP PFNGLDELETEVERTEXARRAYSOESPROC) (GLsizei n, const GLuint *arrays);
-typedef void (GL_APIENTRYP PFNGLGENVERTEXARRAYSOESPROC) (GLsizei n, GLuint *arrays);
-typedef GLboolean (GL_APIENTRYP PFNGLISVERTEXARRAYOESPROC) (GLuint array);
-#endif
+#endif /* GL_OES_vertex_array_object */
 
-/*------------------------------------------------------------------------*
- * AMD extension functions
- *------------------------------------------------------------------------*/
-
-/* GL_AMD_compressed_3DC_texture */
 #ifndef GL_AMD_compressed_3DC_texture
 #define GL_AMD_compressed_3DC_texture 1
-#endif
+#define GL_3DC_X_AMD                      0x87F9
+#define GL_3DC_XY_AMD                     0x87FA
+#endif /* GL_AMD_compressed_3DC_texture */
 
-/* GL_AMD_compressed_ATC_texture */
 #ifndef GL_AMD_compressed_ATC_texture
 #define GL_AMD_compressed_ATC_texture 1
-#endif
+#define GL_ATC_RGB_AMD                    0x8C92
+#define GL_ATC_RGBA_EXPLICIT_ALPHA_AMD    0x8C93
+#define GL_ATC_RGBA_INTERPOLATED_ALPHA_AMD 0x87EE
+#endif /* GL_AMD_compressed_ATC_texture */
 
-/*------------------------------------------------------------------------*
- * APPLE extension functions
- *------------------------------------------------------------------------*/
-
-/* GL_APPLE_copy_texture_levels */
 #ifndef GL_APPLE_copy_texture_levels
 #define GL_APPLE_copy_texture_levels 1
+typedef void (GL_APIENTRYP PFNGLCOPYTEXTURELEVELSAPPLEPROC) (GLuint destinationTexture, GLuint sourceTexture, GLint sourceBaseLevel, GLsizei sourceLevelCount);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glCopyTextureLevelsAPPLE (GLuint destinationTexture, GLuint sourceTexture, GLint sourceBaseLevel, GLsizei sourceLevelCount);
 #endif
-typedef void (GL_APIENTRYP PFNGLCOPYTEXTURELEVELSAPPLEPROC) (GLuint destinationTexture, GLuint sourceTexture, GLint sourceBaseLevel, GLsizei sourceLevelCount);
-#endif
+#endif /* GL_APPLE_copy_texture_levels */
 
-/* GL_APPLE_framebuffer_multisample */
 #ifndef GL_APPLE_framebuffer_multisample
 #define GL_APPLE_framebuffer_multisample 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY glRenderbufferStorageMultisampleAPPLE (GLenum, GLsizei, GLenum, GLsizei, GLsizei);
-GL_API void GL_APIENTRY glResolveMultisampleFramebufferAPPLE (void);
-#endif /* GL_GLEXT_PROTOTYPES */
+#define GL_RENDERBUFFER_SAMPLES_APPLE     0x8CAB
+#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_APPLE 0x8D56
+#define GL_MAX_SAMPLES_APPLE              0x8D57
+#define GL_READ_FRAMEBUFFER_APPLE         0x8CA8
+#define GL_DRAW_FRAMEBUFFER_APPLE         0x8CA9
+#define GL_DRAW_FRAMEBUFFER_BINDING_APPLE 0x8CA6
+#define GL_READ_FRAMEBUFFER_BINDING_APPLE 0x8CAA
 typedef void (GL_APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEAPPLEPROC) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
 typedef void (GL_APIENTRYP PFNGLRESOLVEMULTISAMPLEFRAMEBUFFERAPPLEPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void GL_APIENTRY glRenderbufferStorageMultisampleAPPLE (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
+GL_API void GL_APIENTRY glResolveMultisampleFramebufferAPPLE (void);
 #endif
+#endif /* GL_APPLE_framebuffer_multisample */
 
-/* GL_APPLE_sync */
 #ifndef GL_APPLE_sync
 #define GL_APPLE_sync 1
+typedef struct __GLsync *GLsync;
+typedef khronos_uint64_t GLuint64;
+typedef khronos_int64_t GLint64;
+#define GL_SYNC_OBJECT_APPLE              0x8A53
+#define GL_MAX_SERVER_WAIT_TIMEOUT_APPLE  0x9111
+#define GL_OBJECT_TYPE_APPLE              0x9112
+#define GL_SYNC_CONDITION_APPLE           0x9113
+#define GL_SYNC_STATUS_APPLE              0x9114
+#define GL_SYNC_FLAGS_APPLE               0x9115
+#define GL_SYNC_FENCE_APPLE               0x9116
+#define GL_SYNC_GPU_COMMANDS_COMPLETE_APPLE 0x9117
+#define GL_UNSIGNALED_APPLE               0x9118
+#define GL_SIGNALED_APPLE                 0x9119
+#define GL_ALREADY_SIGNALED_APPLE         0x911A
+#define GL_TIMEOUT_EXPIRED_APPLE          0x911B
+#define GL_CONDITION_SATISFIED_APPLE      0x911C
+#define GL_WAIT_FAILED_APPLE              0x911D
+#define GL_SYNC_FLUSH_COMMANDS_BIT_APPLE  0x00000001
+#define GL_TIMEOUT_IGNORED_APPLE          0xFFFFFFFFFFFFFFFFull
+typedef GLsync (GL_APIENTRYP PFNGLFENCESYNCAPPLEPROC) (GLenum condition, GLbitfield flags);
+typedef GLboolean (GL_APIENTRYP PFNGLISSYNCAPPLEPROC) (GLsync sync);
+typedef void (GL_APIENTRYP PFNGLDELETESYNCAPPLEPROC) (GLsync sync);
+typedef GLenum (GL_APIENTRYP PFNGLCLIENTWAITSYNCAPPLEPROC) (GLsync sync, GLbitfield flags, GLuint64 timeout);
+typedef void (GL_APIENTRYP PFNGLWAITSYNCAPPLEPROC) (GLsync sync, GLbitfield flags, GLuint64 timeout);
+typedef void (GL_APIENTRYP PFNGLGETINTEGER64VAPPLEPROC) (GLenum pname, GLint64 *params);
+typedef void (GL_APIENTRYP PFNGLGETSYNCIVAPPLEPROC) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length, GLint *values);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API GLsync GL_APIENTRY glFenceSyncAPPLE (GLenum condition, GLbitfield flags);
 GL_API GLboolean GL_APIENTRY glIsSyncAPPLE (GLsync sync);
@@ -978,138 +574,170 @@
 GL_API void GL_APIENTRY glGetInteger64vAPPLE (GLenum pname, GLint64 *params);
 GL_API void GL_APIENTRY glGetSyncivAPPLE (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length, GLint *values);
 #endif
-typedef GLsync (GL_APIENTRYP PFNGLFENCESYNCAPPLEPROC) (GLenum condition, GLbitfield flags);
-typedef GLboolean (GL_APIENTRYP PFNGLISSYNCAPPLEPROC) (GLsync sync);
-typedef void (GL_APIENTRYP PFNGLDELETESYNCAPPLEPROC) (GLsync sync);
-typedef GLenum (GL_APIENTRYP PFNGLCLIENTWAITSYNCAPPLEPROC) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-typedef void (GL_APIENTRYP PFNGLWAITSYNCAPPLEPROC) (GLsync sync, GLbitfield flags, GLuint64 timeout);
-typedef void (GL_APIENTRYP PFNGLGETINTEGER64VAPPLEPROC) (GLenum pname, GLint64 *params);
-typedef void (GL_APIENTRYP PFNGLGETSYNCIVAPPLEPROC) (GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length, GLint *values);
-#endif
+#endif /* GL_APPLE_sync */
 
-/* GL_APPLE_texture_2D_limited_npot */
 #ifndef GL_APPLE_texture_2D_limited_npot
 #define GL_APPLE_texture_2D_limited_npot 1
-#endif
+#endif /* GL_APPLE_texture_2D_limited_npot */
 
-/* GL_APPLE_texture_format_BGRA8888 */
 #ifndef GL_APPLE_texture_format_BGRA8888
 #define GL_APPLE_texture_format_BGRA8888 1
-#endif
+#define GL_BGRA_EXT                       0x80E1
+#define GL_BGRA8_EXT                      0x93A1
+#endif /* GL_APPLE_texture_format_BGRA8888 */
 
-/* GL_APPLE_texture_max_level */
 #ifndef GL_APPLE_texture_max_level
 #define GL_APPLE_texture_max_level 1
-#endif
+#define GL_TEXTURE_MAX_LEVEL_APPLE        0x813D
+#endif /* GL_APPLE_texture_max_level */
 
-/*------------------------------------------------------------------------*
- * ARM extension functions
- *------------------------------------------------------------------------*/
-
-/* GL_ARM_rgba8 */
 #ifndef GL_ARM_rgba8
 #define GL_ARM_rgba8 1
-#endif
+#endif /* GL_ARM_rgba8 */
 
-/*------------------------------------------------------------------------*
- * EXT extension functions
- *------------------------------------------------------------------------*/
-
-/* GL_EXT_blend_minmax */
 #ifndef GL_EXT_blend_minmax
 #define GL_EXT_blend_minmax 1
-#endif
+#define GL_MIN_EXT                        0x8007
+#define GL_MAX_EXT                        0x8008
+#endif /* GL_EXT_blend_minmax */
 
-/* GL_EXT_discard_framebuffer */
 #ifndef GL_EXT_discard_framebuffer
 #define GL_EXT_discard_framebuffer 1
+#define GL_COLOR_EXT                      0x1800
+#define GL_DEPTH_EXT                      0x1801
+#define GL_STENCIL_EXT                    0x1802
+typedef void (GL_APIENTRYP PFNGLDISCARDFRAMEBUFFEREXTPROC) (GLenum target, GLsizei numAttachments, const GLenum *attachments);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glDiscardFramebufferEXT (GLenum target, GLsizei numAttachments, const GLenum *attachments);
 #endif
-typedef void (GL_APIENTRYP PFNGLDISCARDFRAMEBUFFEREXTPROC) (GLenum target, GLsizei numAttachments, const GLenum *attachments);
-#endif
+#endif /* GL_EXT_discard_framebuffer */
 
-/* GL_EXT_map_buffer_range */
 #ifndef GL_EXT_map_buffer_range
 #define GL_EXT_map_buffer_range 1
+#define GL_MAP_READ_BIT_EXT               0x0001
+#define GL_MAP_WRITE_BIT_EXT              0x0002
+#define GL_MAP_INVALIDATE_RANGE_BIT_EXT   0x0004
+#define GL_MAP_INVALIDATE_BUFFER_BIT_EXT  0x0008
+#define GL_MAP_FLUSH_EXPLICIT_BIT_EXT     0x0010
+#define GL_MAP_UNSYNCHRONIZED_BIT_EXT     0x0020
+typedef void *(GL_APIENTRYP PFNGLMAPBUFFERRANGEEXTPROC) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
+typedef void (GL_APIENTRYP PFNGLFLUSHMAPPEDBUFFERRANGEEXTPROC) (GLenum target, GLintptr offset, GLsizeiptr length);
 #ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY *glMapBufferRangeEXT (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
+GL_API void *GL_APIENTRY glMapBufferRangeEXT (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
 GL_API void GL_APIENTRY glFlushMappedBufferRangeEXT (GLenum target, GLintptr offset, GLsizeiptr length);
 #endif
-typedef void* (GL_APIENTRYP PFNGLMAPBUFFERRANGEEXTPROC) (GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
-typedef void (GL_APIENTRYP PFNGLFLUSHMAPPEDBUFFERRANGEEXTPROC) (GLenum target, GLintptr offset, GLsizeiptr length);
-#endif
+#endif /* GL_EXT_map_buffer_range */
 
-/* GL_EXT_multisampled_render_to_texture */
-#ifndef GL_EXT_multisampled_render_to_texture
-#define GL_EXT_multisampled_render_to_texture 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY glRenderbufferStorageMultisampleEXT (GLenum, GLsizei, GLenum, GLsizei, GLsizei);
-GL_API void GL_APIENTRY glFramebufferTexture2DMultisampleEXT (GLenum, GLenum, GLenum, GLuint, GLint, GLsizei);
-#endif
-typedef void (GL_APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
-typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERTEXTURE2DMULTISAMPLEEXTPROC) (GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLsizei samples);
-#endif
-
-/* GL_EXT_multi_draw_arrays */
 #ifndef GL_EXT_multi_draw_arrays
 #define GL_EXT_multi_draw_arrays 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY glMultiDrawArraysEXT (GLenum, const GLint *, const GLsizei *, GLsizei);
-GL_API void GL_APIENTRY glMultiDrawElementsEXT (GLenum, const GLsizei *, GLenum, const GLvoid* *, GLsizei);
-#endif /* GL_GLEXT_PROTOTYPES */
 typedef void (GL_APIENTRYP PFNGLMULTIDRAWARRAYSEXTPROC) (GLenum mode, const GLint *first, const GLsizei *count, GLsizei primcount);
-typedef void (GL_APIENTRYP PFNGLMULTIDRAWELEMENTSEXTPROC) (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* *indices, GLsizei primcount);
+typedef void (GL_APIENTRYP PFNGLMULTIDRAWELEMENTSEXTPROC) (GLenum mode, const GLsizei *count, GLenum type, const void *const*indices, GLsizei primcount);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void GL_APIENTRY glMultiDrawArraysEXT (GLenum mode, const GLint *first, const GLsizei *count, GLsizei primcount);
+GL_API void GL_APIENTRY glMultiDrawElementsEXT (GLenum mode, const GLsizei *count, GLenum type, const void *const*indices, GLsizei primcount);
 #endif
+#endif /* GL_EXT_multi_draw_arrays */
 
-/* GL_EXT_read_format_bgra */
+#ifndef GL_EXT_multisampled_render_to_texture
+#define GL_EXT_multisampled_render_to_texture 1
+#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_SAMPLES_EXT 0x8D6C
+#define GL_RENDERBUFFER_SAMPLES_EXT       0x8CAB
+#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_EXT 0x8D56
+#define GL_MAX_SAMPLES_EXT                0x8D57
+typedef void (GL_APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERTEXTURE2DMULTISAMPLEEXTPROC) (GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLsizei samples);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void GL_APIENTRY glRenderbufferStorageMultisampleEXT (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
+GL_API void GL_APIENTRY glFramebufferTexture2DMultisampleEXT (GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLsizei samples);
+#endif
+#endif /* GL_EXT_multisampled_render_to_texture */
+
 #ifndef GL_EXT_read_format_bgra
 #define GL_EXT_read_format_bgra 1
-#endif
+#define GL_UNSIGNED_SHORT_4_4_4_4_REV_EXT 0x8365
+#define GL_UNSIGNED_SHORT_1_5_5_5_REV_EXT 0x8366
+#endif /* GL_EXT_read_format_bgra */
 
-/* GL_EXT_robustness */
 #ifndef GL_EXT_robustness
 #define GL_EXT_robustness 1
+#define GL_GUILTY_CONTEXT_RESET_EXT       0x8253
+#define GL_INNOCENT_CONTEXT_RESET_EXT     0x8254
+#define GL_UNKNOWN_CONTEXT_RESET_EXT      0x8255
+#define GL_CONTEXT_ROBUST_ACCESS_EXT      0x90F3
+#define GL_RESET_NOTIFICATION_STRATEGY_EXT 0x8256
+#define GL_LOSE_CONTEXT_ON_RESET_EXT      0x8252
+#define GL_NO_RESET_NOTIFICATION_EXT      0x8261
+typedef GLenum (GL_APIENTRYP PFNGLGETGRAPHICSRESETSTATUSEXTPROC) (void);
+typedef void (GL_APIENTRYP PFNGLREADNPIXELSEXTPROC) (GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLsizei bufSize, void *data);
+typedef void (GL_APIENTRYP PFNGLGETNUNIFORMFVEXTPROC) (GLuint program, GLint location, GLsizei bufSize, GLfloat *params);
+typedef void (GL_APIENTRYP PFNGLGETNUNIFORMIVEXTPROC) (GLuint program, GLint location, GLsizei bufSize, GLint *params);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API GLenum GL_APIENTRY glGetGraphicsResetStatusEXT (void);
 GL_API void GL_APIENTRY glReadnPixelsEXT (GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLsizei bufSize, void *data);
-GL_API void GL_APIENTRY glGetnUniformfvEXT (GLuint program, GLint location, GLsizei bufSize, float *params);
+GL_API void GL_APIENTRY glGetnUniformfvEXT (GLuint program, GLint location, GLsizei bufSize, GLfloat *params);
 GL_API void GL_APIENTRY glGetnUniformivEXT (GLuint program, GLint location, GLsizei bufSize, GLint *params);
 #endif
-typedef GLenum (GL_APIENTRYP PFNGLGETGRAPHICSRESETSTATUSEXTPROC) (void);
-typedef void (GL_APIENTRYP PFNGLREADNPIXELSEXTPROC) (GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLsizei bufSize, void *data);
-typedef void (GL_APIENTRYP PFNGLGETNUNIFORMFVEXTPROC) (GLuint program, GLint location, GLsizei bufSize, float *params);
-typedef void (GL_APIENTRYP PFNGLGETNUNIFORMIVEXTPROC) (GLuint program, GLint location, GLsizei bufSize, GLint *params);
-#endif
+#endif /* GL_EXT_robustness */
 
-/* GL_EXT_sRGB */
 #ifndef GL_EXT_sRGB
 #define GL_EXT_sRGB 1
-#endif
+#define GL_SRGB_EXT                       0x8C40
+#define GL_SRGB_ALPHA_EXT                 0x8C42
+#define GL_SRGB8_ALPHA8_EXT               0x8C43
+#define GL_FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING_EXT 0x8210
+#endif /* GL_EXT_sRGB */
 
-/* GL_EXT_texture_compression_dxt1 */
 #ifndef GL_EXT_texture_compression_dxt1
 #define GL_EXT_texture_compression_dxt1 1
-#endif
+#define GL_COMPRESSED_RGB_S3TC_DXT1_EXT   0x83F0
+#define GL_COMPRESSED_RGBA_S3TC_DXT1_EXT  0x83F1
+#endif /* GL_EXT_texture_compression_dxt1 */
 
-/* GL_EXT_texture_filter_anisotropic */
 #ifndef GL_EXT_texture_filter_anisotropic
 #define GL_EXT_texture_filter_anisotropic 1
-#endif
+#define GL_TEXTURE_MAX_ANISOTROPY_EXT     0x84FE
+#define GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT 0x84FF
+#endif /* GL_EXT_texture_filter_anisotropic */
 
-/* GL_EXT_texture_format_BGRA8888 */
 #ifndef GL_EXT_texture_format_BGRA8888
 #define GL_EXT_texture_format_BGRA8888 1
-#endif
+#endif /* GL_EXT_texture_format_BGRA8888 */
 
-/* GL_EXT_texture_lod_bias */
 #ifndef GL_EXT_texture_lod_bias
 #define GL_EXT_texture_lod_bias 1
-#endif
+#define GL_MAX_TEXTURE_LOD_BIAS_EXT       0x84FD
+#define GL_TEXTURE_FILTER_CONTROL_EXT     0x8500
+#define GL_TEXTURE_LOD_BIAS_EXT           0x8501
+#endif /* GL_EXT_texture_lod_bias */
 
-/* GL_EXT_texture_storage */
 #ifndef GL_EXT_texture_storage
 #define GL_EXT_texture_storage 1
+#define GL_TEXTURE_IMMUTABLE_FORMAT_EXT   0x912F
+#define GL_ALPHA8_EXT                     0x803C
+#define GL_LUMINANCE8_EXT                 0x8040
+#define GL_LUMINANCE8_ALPHA8_EXT          0x8045
+#define GL_RGBA32F_EXT                    0x8814
+#define GL_RGB32F_EXT                     0x8815
+#define GL_ALPHA32F_EXT                   0x8816
+#define GL_LUMINANCE32F_EXT               0x8818
+#define GL_LUMINANCE_ALPHA32F_EXT         0x8819
+#define GL_RGBA16F_EXT                    0x881A
+#define GL_RGB16F_EXT                     0x881B
+#define GL_ALPHA16F_EXT                   0x881C
+#define GL_LUMINANCE16F_EXT               0x881E
+#define GL_LUMINANCE_ALPHA16F_EXT         0x881F
+#define GL_R8_EXT                         0x8229
+#define GL_RG8_EXT                        0x822B
+#define GL_R32F_EXT                       0x822E
+#define GL_RG32F_EXT                      0x8230
+#define GL_R16F_EXT                       0x822D
+#define GL_RG16F_EXT                      0x822F
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGE1DEXTPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGE2DEXTPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGE3DEXTPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGE1DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGE2DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGE3DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glTexStorage1DEXT (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
 GL_API void GL_APIENTRY glTexStorage2DEXT (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
@@ -1118,71 +746,69 @@
 GL_API void GL_APIENTRY glTextureStorage2DEXT (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
 GL_API void GL_APIENTRY glTextureStorage3DEXT (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
 #endif
-typedef void (GL_APIENTRYP PFNGLTEXSTORAGE1DEXTPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
-typedef void (GL_APIENTRYP PFNGLTEXSTORAGE2DEXTPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
-typedef void (GL_APIENTRYP PFNGLTEXSTORAGE3DEXTPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
-typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGE1DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
-typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGE2DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
-typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGE3DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
-#endif
+#endif /* GL_EXT_texture_storage */
 
-/*------------------------------------------------------------------------*
- * IMG extension functions
- *------------------------------------------------------------------------*/
-
-/* GL_IMG_read_format */
-#ifndef GL_IMG_read_format
-#define GL_IMG_read_format 1
-#endif
-
-/* GL_IMG_texture_compression_pvrtc */
-#ifndef GL_IMG_texture_compression_pvrtc
-#define GL_IMG_texture_compression_pvrtc 1
-#endif
-
-/* GL_IMG_texture_env_enhanced_fixed_function */
-#ifndef GL_IMG_texture_env_enhanced_fixed_function
-#define GL_IMG_texture_env_enhanced_fixed_function 1
-#endif
-
-/* GL_IMG_user_clip_plane */
-#ifndef GL_IMG_user_clip_plane
-#define GL_IMG_user_clip_plane 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY glClipPlanefIMG (GLenum, const GLfloat *);
-GL_API void GL_APIENTRY glClipPlanexIMG (GLenum, const GLfixed *);
-#endif
-typedef void (GL_APIENTRYP PFNGLCLIPPLANEFIMGPROC) (GLenum p, const GLfloat *eqn);
-typedef void (GL_APIENTRYP PFNGLCLIPPLANEXIMGPROC) (GLenum p, const GLfixed *eqn);
-#endif
-
-/* GL_IMG_multisampled_render_to_texture */
 #ifndef GL_IMG_multisampled_render_to_texture
 #define GL_IMG_multisampled_render_to_texture 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY glRenderbufferStorageMultisampleIMG (GLenum, GLsizei, GLenum, GLsizei, GLsizei);
-GL_API void GL_APIENTRY glFramebufferTexture2DMultisampleIMG (GLenum, GLenum, GLenum, GLuint, GLint, GLsizei);
-#endif
+#define GL_RENDERBUFFER_SAMPLES_IMG       0x9133
+#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_IMG 0x9134
+#define GL_MAX_SAMPLES_IMG                0x9135
+#define GL_TEXTURE_SAMPLES_IMG            0x9136
 typedef void (GL_APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEIMGPROC) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
 typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERTEXTURE2DMULTISAMPLEIMGPROC) (GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLsizei samples);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void GL_APIENTRY glRenderbufferStorageMultisampleIMG (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
+GL_API void GL_APIENTRY glFramebufferTexture2DMultisampleIMG (GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLsizei samples);
 #endif
+#endif /* GL_IMG_multisampled_render_to_texture */
 
-/*------------------------------------------------------------------------*
- * NV extension functions
- *------------------------------------------------------------------------*/
+#ifndef GL_IMG_read_format
+#define GL_IMG_read_format 1
+#define GL_BGRA_IMG                       0x80E1
+#define GL_UNSIGNED_SHORT_4_4_4_4_REV_IMG 0x8365
+#endif /* GL_IMG_read_format */
 
-/* NV_fence */
+#ifndef GL_IMG_texture_compression_pvrtc
+#define GL_IMG_texture_compression_pvrtc 1
+#define GL_COMPRESSED_RGB_PVRTC_4BPPV1_IMG 0x8C00
+#define GL_COMPRESSED_RGB_PVRTC_2BPPV1_IMG 0x8C01
+#define GL_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG 0x8C02
+#define GL_COMPRESSED_RGBA_PVRTC_2BPPV1_IMG 0x8C03
+#endif /* GL_IMG_texture_compression_pvrtc */
+
+#ifndef GL_IMG_texture_env_enhanced_fixed_function
+#define GL_IMG_texture_env_enhanced_fixed_function 1
+#define GL_MODULATE_COLOR_IMG             0x8C04
+#define GL_RECIP_ADD_SIGNED_ALPHA_IMG     0x8C05
+#define GL_TEXTURE_ALPHA_MODULATE_IMG     0x8C06
+#define GL_FACTOR_ALPHA_MODULATE_IMG      0x8C07
+#define GL_FRAGMENT_ALPHA_MODULATE_IMG    0x8C08
+#define GL_ADD_BLEND_IMG                  0x8C09
+#define GL_DOT3_RGBA_IMG                  0x86AF
+#endif /* GL_IMG_texture_env_enhanced_fixed_function */
+
+#ifndef GL_IMG_user_clip_plane
+#define GL_IMG_user_clip_plane 1
+#define GL_CLIP_PLANE0_IMG                0x3000
+#define GL_CLIP_PLANE1_IMG                0x3001
+#define GL_CLIP_PLANE2_IMG                0x3002
+#define GL_CLIP_PLANE3_IMG                0x3003
+#define GL_CLIP_PLANE4_IMG                0x3004
+#define GL_CLIP_PLANE5_IMG                0x3005
+#define GL_MAX_CLIP_PLANES_IMG            0x0D32
+typedef void (GL_APIENTRYP PFNGLCLIPPLANEFIMGPROC) (GLenum p, const GLfloat *eqn);
+typedef void (GL_APIENTRYP PFNGLCLIPPLANEXIMGPROC) (GLenum p, const GLfixed *eqn);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void GL_APIENTRY glClipPlanefIMG (GLenum p, const GLfloat *eqn);
+GL_API void GL_APIENTRY glClipPlanexIMG (GLenum p, const GLfixed *eqn);
+#endif
+#endif /* GL_IMG_user_clip_plane */
+
 #ifndef GL_NV_fence
 #define GL_NV_fence 1
-#ifdef GL_GLEXT_PROTOTYPES
-GL_API void GL_APIENTRY glDeleteFencesNV (GLsizei, const GLuint *);
-GL_API void GL_APIENTRY glGenFencesNV (GLsizei, GLuint *);
-GL_API GLboolean GL_APIENTRY glIsFenceNV (GLuint);
-GL_API GLboolean GL_APIENTRY glTestFenceNV (GLuint);
-GL_API void GL_APIENTRY glGetFenceivNV (GLuint, GLenum, GLint *);
-GL_API void GL_APIENTRY glFinishFenceNV (GLuint);
-GL_API void GL_APIENTRY glSetFenceNV (GLuint, GLenum);
-#endif
+#define GL_ALL_COMPLETED_NV               0x84F2
+#define GL_FENCE_STATUS_NV                0x84F3
+#define GL_FENCE_CONDITION_NV             0x84F4
 typedef void (GL_APIENTRYP PFNGLDELETEFENCESNVPROC) (GLsizei n, const GLuint *fences);
 typedef void (GL_APIENTRYP PFNGLGENFENCESNVPROC) (GLsizei n, GLuint *fences);
 typedef GLboolean (GL_APIENTRYP PFNGLISFENCENVPROC) (GLuint fence);
@@ -1190,30 +816,53 @@
 typedef void (GL_APIENTRYP PFNGLGETFENCEIVNVPROC) (GLuint fence, GLenum pname, GLint *params);
 typedef void (GL_APIENTRYP PFNGLFINISHFENCENVPROC) (GLuint fence);
 typedef void (GL_APIENTRYP PFNGLSETFENCENVPROC) (GLuint fence, GLenum condition);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_API void GL_APIENTRY glDeleteFencesNV (GLsizei n, const GLuint *fences);
+GL_API void GL_APIENTRY glGenFencesNV (GLsizei n, GLuint *fences);
+GL_API GLboolean GL_APIENTRY glIsFenceNV (GLuint fence);
+GL_API GLboolean GL_APIENTRY glTestFenceNV (GLuint fence);
+GL_API void GL_APIENTRY glGetFenceivNV (GLuint fence, GLenum pname, GLint *params);
+GL_API void GL_APIENTRY glFinishFenceNV (GLuint fence);
+GL_API void GL_APIENTRY glSetFenceNV (GLuint fence, GLenum condition);
 #endif
+#endif /* GL_NV_fence */
 
-/*------------------------------------------------------------------------*
- * QCOM extension functions
- *------------------------------------------------------------------------*/
-
-/* GL_QCOM_driver_control */
 #ifndef GL_QCOM_driver_control
 #define GL_QCOM_driver_control 1
+typedef char GLchar;
+typedef void (GL_APIENTRYP PFNGLGETDRIVERCONTROLSQCOMPROC) (GLint *num, GLsizei size, GLuint *driverControls);
+typedef void (GL_APIENTRYP PFNGLGETDRIVERCONTROLSTRINGQCOMPROC) (GLuint driverControl, GLsizei bufSize, GLsizei *length, GLchar *driverControlString);
+typedef void (GL_APIENTRYP PFNGLENABLEDRIVERCONTROLQCOMPROC) (GLuint driverControl);
+typedef void (GL_APIENTRYP PFNGLDISABLEDRIVERCONTROLQCOMPROC) (GLuint driverControl);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glGetDriverControlsQCOM (GLint *num, GLsizei size, GLuint *driverControls);
 GL_API void GL_APIENTRY glGetDriverControlStringQCOM (GLuint driverControl, GLsizei bufSize, GLsizei *length, GLchar *driverControlString);
 GL_API void GL_APIENTRY glEnableDriverControlQCOM (GLuint driverControl);
 GL_API void GL_APIENTRY glDisableDriverControlQCOM (GLuint driverControl);
 #endif
-typedef void (GL_APIENTRYP PFNGLGETDRIVERCONTROLSQCOMPROC) (GLint *num, GLsizei size, GLuint *driverControls);
-typedef void (GL_APIENTRYP PFNGLGETDRIVERCONTROLSTRINGQCOMPROC) (GLuint driverControl, GLsizei bufSize, GLsizei *length, GLchar *driverControlString);
-typedef void (GL_APIENTRYP PFNGLENABLEDRIVERCONTROLQCOMPROC) (GLuint driverControl);
-typedef void (GL_APIENTRYP PFNGLDISABLEDRIVERCONTROLQCOMPROC) (GLuint driverControl);
-#endif
+#endif /* GL_QCOM_driver_control */
 
-/* GL_QCOM_extended_get */
 #ifndef GL_QCOM_extended_get
 #define GL_QCOM_extended_get 1
+#define GL_TEXTURE_WIDTH_QCOM             0x8BD2
+#define GL_TEXTURE_HEIGHT_QCOM            0x8BD3
+#define GL_TEXTURE_DEPTH_QCOM             0x8BD4
+#define GL_TEXTURE_INTERNAL_FORMAT_QCOM   0x8BD5
+#define GL_TEXTURE_FORMAT_QCOM            0x8BD6
+#define GL_TEXTURE_TYPE_QCOM              0x8BD7
+#define GL_TEXTURE_IMAGE_VALID_QCOM       0x8BD8
+#define GL_TEXTURE_NUM_LEVELS_QCOM        0x8BD9
+#define GL_TEXTURE_TARGET_QCOM            0x8BDA
+#define GL_TEXTURE_OBJECT_VALID_QCOM      0x8BDB
+#define GL_STATE_RESTORE                  0x8BDC
+typedef void (GL_APIENTRYP PFNGLEXTGETTEXTURESQCOMPROC) (GLuint *textures, GLint maxTextures, GLint *numTextures);
+typedef void (GL_APIENTRYP PFNGLEXTGETBUFFERSQCOMPROC) (GLuint *buffers, GLint maxBuffers, GLint *numBuffers);
+typedef void (GL_APIENTRYP PFNGLEXTGETRENDERBUFFERSQCOMPROC) (GLuint *renderbuffers, GLint maxRenderbuffers, GLint *numRenderbuffers);
+typedef void (GL_APIENTRYP PFNGLEXTGETFRAMEBUFFERSQCOMPROC) (GLuint *framebuffers, GLint maxFramebuffers, GLint *numFramebuffers);
+typedef void (GL_APIENTRYP PFNGLEXTGETTEXLEVELPARAMETERIVQCOMPROC) (GLuint texture, GLenum face, GLint level, GLenum pname, GLint *params);
+typedef void (GL_APIENTRYP PFNGLEXTTEXOBJECTSTATEOVERRIDEIQCOMPROC) (GLenum target, GLenum pname, GLint param);
+typedef void (GL_APIENTRYP PFNGLEXTGETTEXSUBIMAGEQCOMPROC) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, void *texels);
+typedef void (GL_APIENTRYP PFNGLEXTGETBUFFERPOINTERVQCOMPROC) (GLenum target, void **params);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glExtGetTexturesQCOM (GLuint *textures, GLint maxTextures, GLint *numTextures);
 GL_API void GL_APIENTRY glExtGetBuffersQCOM (GLuint *buffers, GLint maxBuffers, GLint *numBuffers);
@@ -1221,58 +870,79 @@
 GL_API void GL_APIENTRY glExtGetFramebuffersQCOM (GLuint *framebuffers, GLint maxFramebuffers, GLint *numFramebuffers);
 GL_API void GL_APIENTRY glExtGetTexLevelParameterivQCOM (GLuint texture, GLenum face, GLint level, GLenum pname, GLint *params);
 GL_API void GL_APIENTRY glExtTexObjectStateOverrideiQCOM (GLenum target, GLenum pname, GLint param);
-GL_API void GL_APIENTRY glExtGetTexSubImageQCOM (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, GLvoid *texels);
-GL_API void GL_APIENTRY glExtGetBufferPointervQCOM (GLenum target, GLvoid **params);
+GL_API void GL_APIENTRY glExtGetTexSubImageQCOM (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, void *texels);
+GL_API void GL_APIENTRY glExtGetBufferPointervQCOM (GLenum target, void **params);
 #endif
-typedef void (GL_APIENTRYP PFNGLEXTGETTEXTURESQCOMPROC) (GLuint *textures, GLint maxTextures, GLint *numTextures);
-typedef void (GL_APIENTRYP PFNGLEXTGETBUFFERSQCOMPROC) (GLuint *buffers, GLint maxBuffers, GLint *numBuffers);
-typedef void (GL_APIENTRYP PFNGLEXTGETRENDERBUFFERSQCOMPROC) (GLuint *renderbuffers, GLint maxRenderbuffers, GLint *numRenderbuffers);
-typedef void (GL_APIENTRYP PFNGLEXTGETFRAMEBUFFERSQCOMPROC) (GLuint *framebuffers, GLint maxFramebuffers, GLint *numFramebuffers);
-typedef void (GL_APIENTRYP PFNGLEXTGETTEXLEVELPARAMETERIVQCOMPROC) (GLuint texture, GLenum face, GLint level, GLenum pname, GLint *params);
-typedef void (GL_APIENTRYP PFNGLEXTTEXOBJECTSTATEOVERRIDEIQCOMPROC) (GLenum target, GLenum pname, GLint param);
-typedef void (GL_APIENTRYP PFNGLEXTGETTEXSUBIMAGEQCOMPROC) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, GLvoid *texels);
-typedef void (GL_APIENTRYP PFNGLEXTGETBUFFERPOINTERVQCOMPROC) (GLenum target, GLvoid **params);
-#endif
+#endif /* GL_QCOM_extended_get */
 
-/* GL_QCOM_extended_get2 */
 #ifndef GL_QCOM_extended_get2
 #define GL_QCOM_extended_get2 1
+typedef void (GL_APIENTRYP PFNGLEXTGETSHADERSQCOMPROC) (GLuint *shaders, GLint maxShaders, GLint *numShaders);
+typedef void (GL_APIENTRYP PFNGLEXTGETPROGRAMSQCOMPROC) (GLuint *programs, GLint maxPrograms, GLint *numPrograms);
+typedef GLboolean (GL_APIENTRYP PFNGLEXTISPROGRAMBINARYQCOMPROC) (GLuint program);
+typedef void (GL_APIENTRYP PFNGLEXTGETPROGRAMBINARYSOURCEQCOMPROC) (GLuint program, GLenum shadertype, GLchar *source, GLint *length);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glExtGetShadersQCOM (GLuint *shaders, GLint maxShaders, GLint *numShaders);
 GL_API void GL_APIENTRY glExtGetProgramsQCOM (GLuint *programs, GLint maxPrograms, GLint *numPrograms);
 GL_API GLboolean GL_APIENTRY glExtIsProgramBinaryQCOM (GLuint program);
 GL_API void GL_APIENTRY glExtGetProgramBinarySourceQCOM (GLuint program, GLenum shadertype, GLchar *source, GLint *length);
 #endif
-typedef void (GL_APIENTRYP PFNGLEXTGETSHADERSQCOMPROC) (GLuint *shaders, GLint maxShaders, GLint *numShaders);
-typedef void (GL_APIENTRYP PFNGLEXTGETPROGRAMSQCOMPROC) (GLuint *programs, GLint maxPrograms, GLint *numPrograms);
-typedef GLboolean (GL_APIENTRYP PFNGLEXTISPROGRAMBINARYQCOMPROC) (GLuint program);
-typedef void (GL_APIENTRYP PFNGLEXTGETPROGRAMBINARYSOURCEQCOMPROC) (GLuint program, GLenum shadertype, GLchar *source, GLint *length);
-#endif
+#endif /* GL_QCOM_extended_get2 */
 
-/* GL_QCOM_perfmon_global_mode */
 #ifndef GL_QCOM_perfmon_global_mode
 #define GL_QCOM_perfmon_global_mode 1
-#endif
+#define GL_PERFMON_GLOBAL_MODE_QCOM       0x8FA0
+#endif /* GL_QCOM_perfmon_global_mode */
 
-/* GL_QCOM_writeonly_rendering */
-#ifndef GL_QCOM_writeonly_rendering
-#define GL_QCOM_writeonly_rendering 1
-#endif
-
-/* GL_QCOM_tiled_rendering */
 #ifndef GL_QCOM_tiled_rendering
 #define GL_QCOM_tiled_rendering 1
+#define GL_COLOR_BUFFER_BIT0_QCOM         0x00000001
+#define GL_COLOR_BUFFER_BIT1_QCOM         0x00000002
+#define GL_COLOR_BUFFER_BIT2_QCOM         0x00000004
+#define GL_COLOR_BUFFER_BIT3_QCOM         0x00000008
+#define GL_COLOR_BUFFER_BIT4_QCOM         0x00000010
+#define GL_COLOR_BUFFER_BIT5_QCOM         0x00000020
+#define GL_COLOR_BUFFER_BIT6_QCOM         0x00000040
+#define GL_COLOR_BUFFER_BIT7_QCOM         0x00000080
+#define GL_DEPTH_BUFFER_BIT0_QCOM         0x00000100
+#define GL_DEPTH_BUFFER_BIT1_QCOM         0x00000200
+#define GL_DEPTH_BUFFER_BIT2_QCOM         0x00000400
+#define GL_DEPTH_BUFFER_BIT3_QCOM         0x00000800
+#define GL_DEPTH_BUFFER_BIT4_QCOM         0x00001000
+#define GL_DEPTH_BUFFER_BIT5_QCOM         0x00002000
+#define GL_DEPTH_BUFFER_BIT6_QCOM         0x00004000
+#define GL_DEPTH_BUFFER_BIT7_QCOM         0x00008000
+#define GL_STENCIL_BUFFER_BIT0_QCOM       0x00010000
+#define GL_STENCIL_BUFFER_BIT1_QCOM       0x00020000
+#define GL_STENCIL_BUFFER_BIT2_QCOM       0x00040000
+#define GL_STENCIL_BUFFER_BIT3_QCOM       0x00080000
+#define GL_STENCIL_BUFFER_BIT4_QCOM       0x00100000
+#define GL_STENCIL_BUFFER_BIT5_QCOM       0x00200000
+#define GL_STENCIL_BUFFER_BIT6_QCOM       0x00400000
+#define GL_STENCIL_BUFFER_BIT7_QCOM       0x00800000
+#define GL_MULTISAMPLE_BUFFER_BIT0_QCOM   0x01000000
+#define GL_MULTISAMPLE_BUFFER_BIT1_QCOM   0x02000000
+#define GL_MULTISAMPLE_BUFFER_BIT2_QCOM   0x04000000
+#define GL_MULTISAMPLE_BUFFER_BIT3_QCOM   0x08000000
+#define GL_MULTISAMPLE_BUFFER_BIT4_QCOM   0x10000000
+#define GL_MULTISAMPLE_BUFFER_BIT5_QCOM   0x20000000
+#define GL_MULTISAMPLE_BUFFER_BIT6_QCOM   0x40000000
+#define GL_MULTISAMPLE_BUFFER_BIT7_QCOM   0x80000000
+typedef void (GL_APIENTRYP PFNGLSTARTTILINGQCOMPROC) (GLuint x, GLuint y, GLuint width, GLuint height, GLbitfield preserveMask);
+typedef void (GL_APIENTRYP PFNGLENDTILINGQCOMPROC) (GLbitfield preserveMask);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_API void GL_APIENTRY glStartTilingQCOM (GLuint x, GLuint y, GLuint width, GLuint height, GLbitfield preserveMask);
 GL_API void GL_APIENTRY glEndTilingQCOM (GLbitfield preserveMask);
 #endif
-typedef void (GL_APIENTRYP PFNGLSTARTTILINGQCOMPROC) (GLuint x, GLuint y, GLuint width, GLuint height, GLbitfield preserveMask);
-typedef void (GL_APIENTRYP PFNGLENDTILINGQCOMPROC) (GLbitfield preserveMask);
-#endif
+#endif /* GL_QCOM_tiled_rendering */
+
+#ifndef GL_QCOM_writeonly_rendering
+#define GL_QCOM_writeonly_rendering 1
+#define GL_WRITEONLY_RENDERING_QCOM       0x8823
+#endif /* GL_QCOM_writeonly_rendering */
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif /* __glext_h_ */
-
+#endif
diff --git a/include/GLES/glplatform.h b/include/GLES/glplatform.h
index 2db6ee2..16060a9 100644
--- a/include/GLES/glplatform.h
+++ b/include/GLES/glplatform.h
@@ -1,20 +1,28 @@
 #ifndef __glplatform_h_
 #define __glplatform_h_
 
-/* $Revision: 10601 $ on $Date:: 2010-03-04 22:15:27 -0800 #$ */
-
 /*
- * This document is licensed under the SGI Free Software B License Version
- * 2.0. For details, see http://oss.sgi.com/projects/FreeB/ .
- */
+** Copyright (c) 2017 The Khronos Group Inc.
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
 
 /* Platform-specific types and definitions for OpenGL ES 1.X  gl.h
  *
  * Adopters may modify khrplatform.h and this file to suit their platform.
- * You are encouraged to submit all modifications to the Khronos group so that
- * they can be included in future versions of this file.  Please submit changes
- * by sending them to the public Khronos Bugzilla (http://khronos.org/bugzilla)
- * by filing a bug against product "OpenGL-ES" component "Registry".
+ * Please contribute modifications back to Khronos as pull requests on the
+ * public github repository:
+ *      https://github.com/KhronosGroup/OpenGL-Registry
  */
 
 #include <KHR/khrplatform.h>
diff --git a/include/GLES2/gl2.h b/include/GLES2/gl2.h
index 57e1b50..8ba907c 100644
--- a/include/GLES2/gl2.h
+++ b/include/GLES2/gl2.h
@@ -6,7 +6,7 @@
 #endif
 
 /*
-** Copyright (c) 2013-2016 The Khronos Group Inc.
+** Copyright (c) 2013-2017 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -31,9 +31,7 @@
 ** This header is generated from the Khronos OpenGL / OpenGL ES XML
 ** API Registry. The current version of the Registry, generator scripts
 ** used to make the header, and the header can be found at
-**   http://www.opengl.org/registry/
-**
-** Khronos $Revision: 32749 $ on $Date: 2016-04-28 09:03:03 -0700 (Thu, 28 Apr 2016) $
+**   https://github.com/KhronosGroup/OpenGL-Registry
 */
 
 #include <GLES2/gl2platform.h>
@@ -42,7 +40,11 @@
 #define GL_APIENTRYP GL_APIENTRY*
 #endif
 
-/* Generated on date 20160428 */
+#ifndef GL_GLES_PROTOTYPES
+#define GL_GLES_PROTOTYPES 1
+#endif
+
+/* Generated on date 20170606 */
 
 /* Generated C header for:
  * API: gles2
@@ -520,6 +522,7 @@
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIB4FVPROC) (GLuint index, const GLfloat *v);
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIBPOINTERPROC) (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void *pointer);
 typedef void (GL_APIENTRYP PFNGLVIEWPORTPROC) (GLint x, GLint y, GLsizei width, GLsizei height);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glActiveTexture (GLenum texture);
 GL_APICALL void GL_APIENTRY glAttachShader (GLuint program, GLuint shader);
 GL_APICALL void GL_APIENTRY glBindAttribLocation (GLuint program, GLuint index, const GLchar *name);
@@ -662,6 +665,7 @@
 GL_APICALL void GL_APIENTRY glVertexAttrib4fv (GLuint index, const GLfloat *v);
 GL_APICALL void GL_APIENTRY glVertexAttribPointer (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void *pointer);
 GL_APICALL void GL_APIENTRY glViewport (GLint x, GLint y, GLsizei width, GLsizei height);
+#endif
 #endif /* GL_ES_VERSION_2_0 */
 
 #ifdef __cplusplus
diff --git a/include/GLES2/gl2ext.h b/include/GLES2/gl2ext.h
index b2d750e..e306833 100644
--- a/include/GLES2/gl2ext.h
+++ b/include/GLES2/gl2ext.h
@@ -6,7 +6,7 @@
 #endif
 
 /*
-** Copyright (c) 2013-2016 The Khronos Group Inc.
+** Copyright (c) 2013-2017 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -31,16 +31,14 @@
 ** This header is generated from the Khronos OpenGL / OpenGL ES XML
 ** API Registry. The current version of the Registry, generator scripts
 ** used to make the header, and the header can be found at
-**   http://www.opengl.org/registry/
-**
-** Khronos $Revision: 33080 $ on $Date: 2016-08-05 04:09:22 -0700 (Fri, 05 Aug 2016) $
+**   https://github.com/KhronosGroup/OpenGL-Registry
 */
 
 #ifndef GL_APIENTRYP
 #define GL_APIENTRYP GL_APIENTRY*
 #endif
 
-/* Generated on date 20160805 */
+/* Generated on date 20170606 */
 
 /* Generated C header for:
  * API: gles2
@@ -52,10 +50,6 @@
  * Extensions removed: _nomatch_^
  */
 
-#ifndef GL_ARB_sparse_texture2
-#define GL_ARB_sparse_texture2 1
-#endif /* GL_ARB_sparse_texture2 */
-
 #ifndef GL_KHR_blend_equation_advanced
 #define GL_KHR_blend_equation_advanced 1
 #define GL_MULTIPLY_KHR                   0x9294
@@ -1057,6 +1051,10 @@
 #define GL_SHADER_BINARY_DMP              0x9250
 #endif /* GL_DMP_shader_binary */
 
+#ifndef GL_EXT_EGL_image_array
+#define GL_EXT_EGL_image_array 1
+#endif /* GL_EXT_EGL_image_array */
+
 #ifndef GL_EXT_YUV_target
 #define GL_EXT_YUV_target 1
 #define GL_SAMPLER_EXTERNAL_2D_Y2Y_EXT    0x8BE7
@@ -1118,6 +1116,16 @@
 #endif
 #endif /* GL_EXT_buffer_storage */
 
+#ifndef GL_EXT_clear_texture
+#define GL_EXT_clear_texture 1
+typedef void (GL_APIENTRYP PFNGLCLEARTEXIMAGEEXTPROC) (GLuint texture, GLint level, GLenum format, GLenum type, const void *data);
+typedef void (GL_APIENTRYP PFNGLCLEARTEXSUBIMAGEEXTPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *data);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glClearTexImageEXT (GLuint texture, GLint level, GLenum format, GLenum type, const void *data);
+GL_APICALL void GL_APIENTRY glClearTexSubImageEXT (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *data);
+#endif
+#endif /* GL_EXT_clear_texture */
+
 #ifndef GL_EXT_clip_cull_distance
 #define GL_EXT_clip_cull_distance 1
 #define GL_MAX_CLIP_DISTANCES_EXT         0x0D32
@@ -1147,6 +1155,10 @@
 #define GL_UNSIGNED_NORMALIZED_EXT        0x8C17
 #endif /* GL_EXT_color_buffer_half_float */
 
+#ifndef GL_EXT_conservative_depth
+#define GL_EXT_conservative_depth 1
+#endif /* GL_EXT_conservative_depth */
+
 #ifndef GL_EXT_copy_image
 #define GL_EXT_copy_image 1
 typedef void (GL_APIENTRYP PFNGLCOPYIMAGESUBDATAEXTPROC) (GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth);
@@ -1318,6 +1330,16 @@
 #endif
 #endif /* GL_EXT_draw_instanced */
 
+#ifndef GL_EXT_draw_transform_feedback
+#define GL_EXT_draw_transform_feedback 1
+typedef void (GL_APIENTRYP PFNGLDRAWTRANSFORMFEEDBACKEXTPROC) (GLenum mode, GLuint id);
+typedef void (GL_APIENTRYP PFNGLDRAWTRANSFORMFEEDBACKINSTANCEDEXTPROC) (GLenum mode, GLuint id, GLsizei instancecount);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glDrawTransformFeedbackEXT (GLenum mode, GLuint id);
+GL_APICALL void GL_APIENTRY glDrawTransformFeedbackInstancedEXT (GLenum mode, GLuint id, GLsizei instancecount);
+#endif
+#endif /* GL_EXT_draw_transform_feedback */
+
 #ifndef GL_EXT_float_blend
 #define GL_EXT_float_blend 1
 #endif /* GL_EXT_float_blend */
@@ -1396,6 +1418,85 @@
 #endif
 #endif /* GL_EXT_map_buffer_range */
 
+#ifndef GL_EXT_memory_object
+#define GL_EXT_memory_object 1
+#define GL_TEXTURE_TILING_EXT             0x9580
+#define GL_DEDICATED_MEMORY_OBJECT_EXT    0x9581
+#define GL_PROTECTED_MEMORY_OBJECT_EXT    0x959B
+#define GL_NUM_TILING_TYPES_EXT           0x9582
+#define GL_TILING_TYPES_EXT               0x9583
+#define GL_OPTIMAL_TILING_EXT             0x9584
+#define GL_LINEAR_TILING_EXT              0x9585
+#define GL_NUM_DEVICE_UUIDS_EXT           0x9596
+#define GL_DEVICE_UUID_EXT                0x9597
+#define GL_DRIVER_UUID_EXT                0x9598
+#define GL_UUID_SIZE_EXT                  16
+typedef void (GL_APIENTRYP PFNGLGETUNSIGNEDBYTEVEXTPROC) (GLenum pname, GLubyte *data);
+typedef void (GL_APIENTRYP PFNGLGETUNSIGNEDBYTEI_VEXTPROC) (GLenum target, GLuint index, GLubyte *data);
+typedef void (GL_APIENTRYP PFNGLDELETEMEMORYOBJECTSEXTPROC) (GLsizei n, const GLuint *memoryObjects);
+typedef GLboolean (GL_APIENTRYP PFNGLISMEMORYOBJECTEXTPROC) (GLuint memoryObject);
+typedef void (GL_APIENTRYP PFNGLCREATEMEMORYOBJECTSEXTPROC) (GLsizei n, GLuint *memoryObjects);
+typedef void (GL_APIENTRYP PFNGLMEMORYOBJECTPARAMETERIVEXTPROC) (GLuint memoryObject, GLenum pname, const GLint *params);
+typedef void (GL_APIENTRYP PFNGLGETMEMORYOBJECTPARAMETERIVEXTPROC) (GLuint memoryObject, GLenum pname, GLint *params);
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGEMEM2DEXTPROC) (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset);
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGEMEM2DMULTISAMPLEEXTPROC) (GLenum target, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGEMEM3DEXTPROC) (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLuint memory, GLuint64 offset);
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGEMEM3DMULTISAMPLEEXTPROC) (GLenum target, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+typedef void (GL_APIENTRYP PFNGLBUFFERSTORAGEMEMEXTPROC) (GLenum target, GLsizeiptr size, GLuint memory, GLuint64 offset);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGEMEM2DEXTPROC) (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGEMEM2DMULTISAMPLEEXTPROC) (GLuint texture, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGEMEM3DEXTPROC) (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLuint memory, GLuint64 offset);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGEMEM3DMULTISAMPLEEXTPROC) (GLuint texture, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+typedef void (GL_APIENTRYP PFNGLNAMEDBUFFERSTORAGEMEMEXTPROC) (GLuint buffer, GLsizeiptr size, GLuint memory, GLuint64 offset);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glGetUnsignedBytevEXT (GLenum pname, GLubyte *data);
+GL_APICALL void GL_APIENTRY glGetUnsignedBytei_vEXT (GLenum target, GLuint index, GLubyte *data);
+GL_APICALL void GL_APIENTRY glDeleteMemoryObjectsEXT (GLsizei n, const GLuint *memoryObjects);
+GL_APICALL GLboolean GL_APIENTRY glIsMemoryObjectEXT (GLuint memoryObject);
+GL_APICALL void GL_APIENTRY glCreateMemoryObjectsEXT (GLsizei n, GLuint *memoryObjects);
+GL_APICALL void GL_APIENTRY glMemoryObjectParameterivEXT (GLuint memoryObject, GLenum pname, const GLint *params);
+GL_APICALL void GL_APIENTRY glGetMemoryObjectParameterivEXT (GLuint memoryObject, GLenum pname, GLint *params);
+GL_APICALL void GL_APIENTRY glTexStorageMem2DEXT (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset);
+GL_APICALL void GL_APIENTRY glTexStorageMem2DMultisampleEXT (GLenum target, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+GL_APICALL void GL_APIENTRY glTexStorageMem3DEXT (GLenum target, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLuint memory, GLuint64 offset);
+GL_APICALL void GL_APIENTRY glTexStorageMem3DMultisampleEXT (GLenum target, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+GL_APICALL void GL_APIENTRY glBufferStorageMemEXT (GLenum target, GLsizeiptr size, GLuint memory, GLuint64 offset);
+GL_APICALL void GL_APIENTRY glTextureStorageMem2DEXT (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset);
+GL_APICALL void GL_APIENTRY glTextureStorageMem2DMultisampleEXT (GLuint texture, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+GL_APICALL void GL_APIENTRY glTextureStorageMem3DEXT (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLuint memory, GLuint64 offset);
+GL_APICALL void GL_APIENTRY glTextureStorageMem3DMultisampleEXT (GLuint texture, GLsizei samples, GLenum internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedSampleLocations, GLuint memory, GLuint64 offset);
+GL_APICALL void GL_APIENTRY glNamedBufferStorageMemEXT (GLuint buffer, GLsizeiptr size, GLuint memory, GLuint64 offset);
+#endif
+#endif /* GL_EXT_memory_object */
+
+#ifndef GL_EXT_memory_object_fd
+#define GL_EXT_memory_object_fd 1
+#define GL_HANDLE_TYPE_OPAQUE_FD_EXT      0x9586
+typedef void (GL_APIENTRYP PFNGLIMPORTMEMORYFDEXTPROC) (GLuint memory, GLuint64 size, GLenum handleType, GLint fd);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glImportMemoryFdEXT (GLuint memory, GLuint64 size, GLenum handleType, GLint fd);
+#endif
+#endif /* GL_EXT_memory_object_fd */
+
+#ifndef GL_EXT_memory_object_win32
+#define GL_EXT_memory_object_win32 1
+#define GL_HANDLE_TYPE_OPAQUE_WIN32_EXT   0x9587
+#define GL_HANDLE_TYPE_OPAQUE_WIN32_KMT_EXT 0x9588
+#define GL_DEVICE_LUID_EXT                0x9599
+#define GL_DEVICE_NODE_MASK_EXT           0x959A
+#define GL_LUID_SIZE_EXT                  8
+#define GL_HANDLE_TYPE_D3D12_TILEPOOL_EXT 0x9589
+#define GL_HANDLE_TYPE_D3D12_RESOURCE_EXT 0x958A
+#define GL_HANDLE_TYPE_D3D11_IMAGE_EXT    0x958B
+#define GL_HANDLE_TYPE_D3D11_IMAGE_KMT_EXT 0x958C
+typedef void (GL_APIENTRYP PFNGLIMPORTMEMORYWIN32HANDLEEXTPROC) (GLuint memory, GLuint64 size, GLenum handleType, void *handle);
+typedef void (GL_APIENTRYP PFNGLIMPORTMEMORYWIN32NAMEEXTPROC) (GLuint memory, GLuint64 size, GLenum handleType, const void *name);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glImportMemoryWin32HandleEXT (GLuint memory, GLuint64 size, GLenum handleType, void *handle);
+GL_APICALL void GL_APIENTRY glImportMemoryWin32NameEXT (GLuint memory, GLuint64 size, GLenum handleType, const void *name);
+#endif
+#endif /* GL_EXT_memory_object_win32 */
+
 #ifndef GL_EXT_multi_draw_arrays
 #define GL_EXT_multi_draw_arrays 1
 typedef void (GL_APIENTRYP PFNGLMULTIDRAWARRAYSEXTPROC) (GLenum mode, const GLint *first, const GLsizei *count, GLsizei primcount);
@@ -1561,6 +1662,53 @@
 #define GL_FRAMEBUFFER_SRGB_EXT           0x8DB9
 #endif /* GL_EXT_sRGB_write_control */
 
+#ifndef GL_EXT_semaphore
+#define GL_EXT_semaphore 1
+#define GL_LAYOUT_GENERAL_EXT             0x958D
+#define GL_LAYOUT_COLOR_ATTACHMENT_EXT    0x958E
+#define GL_LAYOUT_DEPTH_STENCIL_ATTACHMENT_EXT 0x958F
+#define GL_LAYOUT_DEPTH_STENCIL_READ_ONLY_EXT 0x9590
+#define GL_LAYOUT_SHADER_READ_ONLY_EXT    0x9591
+#define GL_LAYOUT_TRANSFER_SRC_EXT        0x9592
+#define GL_LAYOUT_TRANSFER_DST_EXT        0x9593
+typedef void (GL_APIENTRYP PFNGLGENSEMAPHORESEXTPROC) (GLsizei n, GLuint *semaphores);
+typedef void (GL_APIENTRYP PFNGLDELETESEMAPHORESEXTPROC) (GLsizei n, const GLuint *semaphores);
+typedef GLboolean (GL_APIENTRYP PFNGLISSEMAPHOREEXTPROC) (GLuint semaphore);
+typedef void (GL_APIENTRYP PFNGLSEMAPHOREPARAMETERUI64VEXTPROC) (GLuint semaphore, GLenum pname, const GLuint64 *params);
+typedef void (GL_APIENTRYP PFNGLGETSEMAPHOREPARAMETERUI64VEXTPROC) (GLuint semaphore, GLenum pname, GLuint64 *params);
+typedef void (GL_APIENTRYP PFNGLWAITSEMAPHOREEXTPROC) (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *srcLayouts);
+typedef void (GL_APIENTRYP PFNGLSIGNALSEMAPHOREEXTPROC) (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *dstLayouts);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glGenSemaphoresEXT (GLsizei n, GLuint *semaphores);
+GL_APICALL void GL_APIENTRY glDeleteSemaphoresEXT (GLsizei n, const GLuint *semaphores);
+GL_APICALL GLboolean GL_APIENTRY glIsSemaphoreEXT (GLuint semaphore);
+GL_APICALL void GL_APIENTRY glSemaphoreParameterui64vEXT (GLuint semaphore, GLenum pname, const GLuint64 *params);
+GL_APICALL void GL_APIENTRY glGetSemaphoreParameterui64vEXT (GLuint semaphore, GLenum pname, GLuint64 *params);
+GL_APICALL void GL_APIENTRY glWaitSemaphoreEXT (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *srcLayouts);
+GL_APICALL void GL_APIENTRY glSignalSemaphoreEXT (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *dstLayouts);
+#endif
+#endif /* GL_EXT_semaphore */
+
+#ifndef GL_EXT_semaphore_fd
+#define GL_EXT_semaphore_fd 1
+typedef void (GL_APIENTRYP PFNGLIMPORTSEMAPHOREFDEXTPROC) (GLuint semaphore, GLenum handleType, GLint fd);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glImportSemaphoreFdEXT (GLuint semaphore, GLenum handleType, GLint fd);
+#endif
+#endif /* GL_EXT_semaphore_fd */
+
+#ifndef GL_EXT_semaphore_win32
+#define GL_EXT_semaphore_win32 1
+#define GL_HANDLE_TYPE_D3D12_FENCE_EXT    0x9594
+#define GL_D3D12_FENCE_VALUE_EXT          0x9595
+typedef void (GL_APIENTRYP PFNGLIMPORTSEMAPHOREWIN32HANDLEEXTPROC) (GLuint semaphore, GLenum handleType, void *handle);
+typedef void (GL_APIENTRYP PFNGLIMPORTSEMAPHOREWIN32NAMEEXTPROC) (GLuint semaphore, GLenum handleType, const void *name);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glImportSemaphoreWin32HandleEXT (GLuint semaphore, GLenum handleType, void *handle);
+GL_APICALL void GL_APIENTRY glImportSemaphoreWin32NameEXT (GLuint semaphore, GLenum handleType, const void *name);
+#endif
+#endif /* GL_EXT_semaphore_win32 */
+
 #ifndef GL_EXT_separate_shader_objects
 #define GL_EXT_separate_shader_objects 1
 #define GL_ACTIVE_PROGRAM_EXT             0x8259
@@ -1741,6 +1889,10 @@
 #endif
 #endif /* GL_EXT_sparse_texture */
 
+#ifndef GL_EXT_sparse_texture2
+#define GL_EXT_sparse_texture2 1
+#endif /* GL_EXT_sparse_texture2 */
+
 #ifndef GL_EXT_tessellation_point_size
 #define GL_EXT_tessellation_point_size 1
 #endif /* GL_EXT_tessellation_point_size */
@@ -1844,6 +1996,11 @@
 #endif
 #endif /* GL_EXT_texture_buffer */
 
+#ifndef GL_EXT_texture_compression_astc_decode_mode
+#define GL_EXT_texture_compression_astc_decode_mode 1
+#define GL_TEXTURE_ASTC_DECODE_PRECISION_EXT 0x8F69
+#endif /* GL_EXT_texture_compression_astc_decode_mode */
+
 #ifndef GL_EXT_texture_compression_dxt1
 #define GL_EXT_texture_compression_dxt1 1
 #define GL_COMPRESSED_RGB_S3TC_DXT1_EXT   0x83F0
@@ -1973,6 +2130,16 @@
 #define GL_UNPACK_SKIP_PIXELS_EXT         0x0CF4
 #endif /* GL_EXT_unpack_subimage */
 
+#ifndef GL_EXT_win32_keyed_mutex
+#define GL_EXT_win32_keyed_mutex 1
+typedef GLboolean (GL_APIENTRYP PFNGLACQUIREKEYEDMUTEXWIN32EXTPROC) (GLuint memory, GLuint64 key, GLuint timeout);
+typedef GLboolean (GL_APIENTRYP PFNGLRELEASEKEYEDMUTEXWIN32EXTPROC) (GLuint memory, GLuint64 key);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL GLboolean GL_APIENTRY glAcquireKeyedMutexWin32EXT (GLuint memory, GLuint64 key, GLuint timeout);
+GL_APICALL GLboolean GL_APIENTRY glReleaseKeyedMutexWin32EXT (GLuint memory, GLuint64 key);
+#endif
+#endif /* GL_EXT_win32_keyed_mutex */
+
 #ifndef GL_EXT_window_rectangles
 #define GL_EXT_window_rectangles 1
 #define GL_INCLUSIVE_EXT                  0x8F10
@@ -1992,6 +2159,24 @@
 #define GL_GCCSO_SHADER_BINARY_FJ         0x9260
 #endif /* GL_FJ_shader_binary_GCCSO */
 
+#ifndef GL_IMG_bindless_texture
+#define GL_IMG_bindless_texture 1
+typedef GLuint64 (GL_APIENTRYP PFNGLGETTEXTUREHANDLEIMGPROC) (GLuint texture);
+typedef GLuint64 (GL_APIENTRYP PFNGLGETTEXTURESAMPLERHANDLEIMGPROC) (GLuint texture, GLuint sampler);
+typedef void (GL_APIENTRYP PFNGLUNIFORMHANDLEUI64IMGPROC) (GLint location, GLuint64 value);
+typedef void (GL_APIENTRYP PFNGLUNIFORMHANDLEUI64VIMGPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORMHANDLEUI64IMGPROC) (GLuint program, GLint location, GLuint64 value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORMHANDLEUI64VIMGPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *values);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL GLuint64 GL_APIENTRY glGetTextureHandleIMG (GLuint texture);
+GL_APICALL GLuint64 GL_APIENTRY glGetTextureSamplerHandleIMG (GLuint texture, GLuint sampler);
+GL_APICALL void GL_APIENTRY glUniformHandleui64IMG (GLint location, GLuint64 value);
+GL_APICALL void GL_APIENTRY glUniformHandleui64vIMG (GLint location, GLsizei count, const GLuint64 *value);
+GL_APICALL void GL_APIENTRY glProgramUniformHandleui64IMG (GLuint program, GLint location, GLuint64 value);
+GL_APICALL void GL_APIENTRY glProgramUniformHandleui64vIMG (GLuint program, GLint location, GLsizei count, const GLuint64 *values);
+#endif
+#endif /* GL_IMG_bindless_texture */
+
 #ifndef GL_IMG_framebuffer_downsample
 #define GL_IMG_framebuffer_downsample 1
 #define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_AND_DOWNSAMPLE_IMG 0x913C
@@ -2116,6 +2301,14 @@
 #endif
 #endif /* GL_INTEL_performance_query */
 
+#ifndef GL_MESA_shader_integer_functions
+#define GL_MESA_shader_integer_functions 1
+#endif /* GL_MESA_shader_integer_functions */
+
+#ifndef GL_NVX_blend_equation_advanced_multi_draw_buffers
+#define GL_NVX_blend_equation_advanced_multi_draw_buffers 1
+#endif /* GL_NVX_blend_equation_advanced_multi_draw_buffers */
+
 #ifndef GL_NV_bindless_texture
 #define GL_NV_bindless_texture 1
 typedef GLuint64 (GL_APIENTRYP PFNGLGETTEXTUREHANDLENVPROC) (GLuint texture);
@@ -2334,6 +2527,23 @@
 #endif
 #endif /* GL_NV_draw_instanced */
 
+#ifndef GL_NV_draw_vulkan_image
+#define GL_NV_draw_vulkan_image 1
+typedef void (GL_APIENTRY  *GLVULKANPROCNV)(void);
+typedef void (GL_APIENTRYP PFNGLDRAWVKIMAGENVPROC) (GLuint64 vkImage, GLuint sampler, GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1, GLfloat z, GLfloat s0, GLfloat t0, GLfloat s1, GLfloat t1);
+typedef GLVULKANPROCNV (GL_APIENTRYP PFNGLGETVKPROCADDRNVPROC) (const GLchar *name);
+typedef void (GL_APIENTRYP PFNGLWAITVKSEMAPHORENVPROC) (GLuint64 vkSemaphore);
+typedef void (GL_APIENTRYP PFNGLSIGNALVKSEMAPHORENVPROC) (GLuint64 vkSemaphore);
+typedef void (GL_APIENTRYP PFNGLSIGNALVKFENCENVPROC) (GLuint64 vkFence);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glDrawVkImageNV (GLuint64 vkImage, GLuint sampler, GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1, GLfloat z, GLfloat s0, GLfloat t0, GLfloat s1, GLfloat t1);
+GL_APICALL GLVULKANPROCNV GL_APIENTRY glGetVkProcAddrNV (const GLchar *name);
+GL_APICALL void GL_APIENTRY glWaitVkSemaphoreNV (GLuint64 vkSemaphore);
+GL_APICALL void GL_APIENTRY glSignalVkSemaphoreNV (GLuint64 vkSemaphore);
+GL_APICALL void GL_APIENTRY glSignalVkFenceNV (GLuint64 vkFence);
+#endif
+#endif /* GL_NV_draw_vulkan_image */
+
 #ifndef GL_NV_explicit_attrib_location
 #define GL_NV_explicit_attrib_location 1
 #endif /* GL_NV_explicit_attrib_location */
@@ -3040,6 +3250,7 @@
 #define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_NUM_VIEWS_OVR 0x9630
 #define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_BASE_VIEW_INDEX_OVR 0x9632
 #define GL_MAX_VIEWS_OVR                  0x9631
+#define GL_FRAMEBUFFER_INCOMPLETE_VIEW_TARGETS_OVR 0x9633
 typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_APICALL void GL_APIENTRY glFramebufferTextureMultiviewOVR (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews);
@@ -3138,11 +3349,32 @@
 #endif
 #endif /* GL_QCOM_extended_get2 */
 
+#ifndef GL_QCOM_framebuffer_foveated
+#define GL_QCOM_framebuffer_foveated 1
+#define GL_FOVEATION_ENABLE_BIT_QCOM      0x00000001
+#define GL_FOVEATION_SCALED_BIN_METHOD_BIT_QCOM 0x00000002
+typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERFOVEATIONCONFIGQCOMPROC) (GLuint framebuffer, GLuint numLayers, GLuint focalPointsPerLayer, GLuint requestedFeatures, GLuint *providedFeatures);
+typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERFOVEATIONPARAMETERSQCOMPROC) (GLuint framebuffer, GLuint layer, GLuint focalPoint, GLfloat focalX, GLfloat focalY, GLfloat gainX, GLfloat gainY, GLfloat foveaArea);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glFramebufferFoveationConfigQCOM (GLuint framebuffer, GLuint numLayers, GLuint focalPointsPerLayer, GLuint requestedFeatures, GLuint *providedFeatures);
+GL_APICALL void GL_APIENTRY glFramebufferFoveationParametersQCOM (GLuint framebuffer, GLuint layer, GLuint focalPoint, GLfloat focalX, GLfloat focalY, GLfloat gainX, GLfloat gainY, GLfloat foveaArea);
+#endif
+#endif /* GL_QCOM_framebuffer_foveated */
+
 #ifndef GL_QCOM_perfmon_global_mode
 #define GL_QCOM_perfmon_global_mode 1
 #define GL_PERFMON_GLOBAL_MODE_QCOM       0x8FA0
 #endif /* GL_QCOM_perfmon_global_mode */
 
+#ifndef GL_QCOM_shader_framebuffer_fetch_noncoherent
+#define GL_QCOM_shader_framebuffer_fetch_noncoherent 1
+#define GL_FRAMEBUFFER_FETCH_NONCOHERENT_QCOM 0x96A2
+typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERFETCHBARRIERQCOMPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glFramebufferFetchBarrierQCOM (void);
+#endif
+#endif /* GL_QCOM_shader_framebuffer_fetch_noncoherent */
+
 #ifndef GL_QCOM_tiled_rendering
 #define GL_QCOM_tiled_rendering 1
 #define GL_COLOR_BUFFER_BIT0_QCOM         0x00000001
diff --git a/include/GLES2/gl2platform.h b/include/GLES2/gl2platform.h
index 89d4d44..eb318dc 100644
--- a/include/GLES2/gl2platform.h
+++ b/include/GLES2/gl2platform.h
@@ -1,20 +1,28 @@
 #ifndef __gl2platform_h_
 #define __gl2platform_h_
 
-/* $Revision: 23328 $ on $Date:: 2013-10-02 02:28:28 -0700 #$ */
-
 /*
- * This document is licensed under the SGI Free Software B License Version
- * 2.0. For details, see http://oss.sgi.com/projects/FreeB/ .
- */
+** Copyright (c) 2017 The Khronos Group Inc.
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
 
 /* Platform-specific types and definitions for OpenGL ES 2.X  gl2.h
  *
  * Adopters may modify khrplatform.h and this file to suit their platform.
- * You are encouraged to submit all modifications to the Khronos group so that
- * they can be included in future versions of this file.  Please submit changes
- * by sending them to the public Khronos Bugzilla (http://khronos.org/bugzilla)
- * by filing a bug against product "OpenGL-ES" component "Registry".
+ * Please contribute modifications back to Khronos as pull requests on the
+ * public github repository:
+ *      https://github.com/KhronosGroup/OpenGL-Registry
  */
 
 #include <KHR/khrplatform.h>
diff --git a/include/GLES3/gl3.h b/include/GLES3/gl3.h
index d8153c0..71e72b4 100644
--- a/include/GLES3/gl3.h
+++ b/include/GLES3/gl3.h
@@ -6,7 +6,7 @@
 #endif
 
 /*
-** Copyright (c) 2013-2016 The Khronos Group Inc.
+** Copyright (c) 2013-2017 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -31,9 +31,7 @@
 ** This header is generated from the Khronos OpenGL / OpenGL ES XML
 ** API Registry. The current version of the Registry, generator scripts
 ** used to make the header, and the header can be found at
-**   http://www.opengl.org/registry/
-**
-** Khronos $Revision: 32749 $ on $Date: 2016-04-28 09:03:03 -0700 (Thu, 28 Apr 2016) $
+**   https://github.com/KhronosGroup/OpenGL-Registry
 */
 
 #include <GLES3/gl3platform.h>
@@ -42,7 +40,11 @@
 #define GL_APIENTRYP GL_APIENTRY*
 #endif
 
-/* Generated on date 20160428 */
+#ifndef GL_GLES_PROTOTYPES
+#define GL_GLES_PROTOTYPES 1
+#endif
+
+/* Generated on date 20170606 */
 
 /* Generated C header for:
  * API: gles2
@@ -520,6 +522,7 @@
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIB4FVPROC) (GLuint index, const GLfloat *v);
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIBPOINTERPROC) (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void *pointer);
 typedef void (GL_APIENTRYP PFNGLVIEWPORTPROC) (GLint x, GLint y, GLsizei width, GLsizei height);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glActiveTexture (GLenum texture);
 GL_APICALL void GL_APIENTRY glAttachShader (GLuint program, GLuint shader);
 GL_APICALL void GL_APIENTRY glBindAttribLocation (GLuint program, GLuint index, const GLchar *name);
@@ -662,6 +665,7 @@
 GL_APICALL void GL_APIENTRY glVertexAttrib4fv (GLuint index, const GLfloat *v);
 GL_APICALL void GL_APIENTRY glVertexAttribPointer (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void *pointer);
 GL_APICALL void GL_APIENTRY glViewport (GLint x, GLint y, GLsizei width, GLsizei height);
+#endif
 #endif /* GL_ES_VERSION_2_0 */
 
 #ifndef GL_ES_VERSION_3_0
@@ -1092,6 +1096,7 @@
 typedef void (GL_APIENTRYP PFNGLTEXSTORAGE2DPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
 typedef void (GL_APIENTRYP PFNGLTEXSTORAGE3DPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
 typedef void (GL_APIENTRYP PFNGLGETINTERNALFORMATIVPROC) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint *params);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glReadBuffer (GLenum src);
 GL_APICALL void GL_APIENTRY glDrawRangeElements (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const void *indices);
 GL_APICALL void GL_APIENTRY glTexImage3D (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
@@ -1196,6 +1201,7 @@
 GL_APICALL void GL_APIENTRY glTexStorage2D (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
 GL_APICALL void GL_APIENTRY glTexStorage3D (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
 GL_APICALL void GL_APIENTRY glGetInternalformativ (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint *params);
+#endif
 #endif /* GL_ES_VERSION_3_0 */
 
 #ifdef __cplusplus
diff --git a/include/GLES3/gl31.h b/include/GLES3/gl31.h
index 12a94a5..4806181 100644
--- a/include/GLES3/gl31.h
+++ b/include/GLES3/gl31.h
@@ -42,7 +42,11 @@
 #define GL_APIENTRYP GL_APIENTRY*
 #endif
 
-/* Generated on date 20160428 */
+#ifndef GL_GLES_PROTOTYPES
+#define GL_GLES_PROTOTYPES 1
+#endif
+
+/* Generated on date 20161024 */
 
 /* Generated C header for:
  * API: gles2
@@ -520,6 +524,7 @@
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIB4FVPROC) (GLuint index, const GLfloat *v);
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIBPOINTERPROC) (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void *pointer);
 typedef void (GL_APIENTRYP PFNGLVIEWPORTPROC) (GLint x, GLint y, GLsizei width, GLsizei height);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glActiveTexture (GLenum texture);
 GL_APICALL void GL_APIENTRY glAttachShader (GLuint program, GLuint shader);
 GL_APICALL void GL_APIENTRY glBindAttribLocation (GLuint program, GLuint index, const GLchar *name);
@@ -662,6 +667,7 @@
 GL_APICALL void GL_APIENTRY glVertexAttrib4fv (GLuint index, const GLfloat *v);
 GL_APICALL void GL_APIENTRY glVertexAttribPointer (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void *pointer);
 GL_APICALL void GL_APIENTRY glViewport (GLint x, GLint y, GLsizei width, GLsizei height);
+#endif
 #endif /* GL_ES_VERSION_2_0 */
 
 #ifndef GL_ES_VERSION_3_0
@@ -1092,6 +1098,7 @@
 typedef void (GL_APIENTRYP PFNGLTEXSTORAGE2DPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
 typedef void (GL_APIENTRYP PFNGLTEXSTORAGE3DPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
 typedef void (GL_APIENTRYP PFNGLGETINTERNALFORMATIVPROC) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint *params);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glReadBuffer (GLenum src);
 GL_APICALL void GL_APIENTRY glDrawRangeElements (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const void *indices);
 GL_APICALL void GL_APIENTRY glTexImage3D (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
@@ -1196,6 +1203,7 @@
 GL_APICALL void GL_APIENTRY glTexStorage2D (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
 GL_APICALL void GL_APIENTRY glTexStorage3D (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
 GL_APICALL void GL_APIENTRY glGetInternalformativ (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint *params);
+#endif
 #endif /* GL_ES_VERSION_3_0 */
 
 #ifndef GL_ES_VERSION_3_1
@@ -1441,6 +1449,7 @@
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIBIFORMATPROC) (GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIBBINDINGPROC) (GLuint attribindex, GLuint bindingindex);
 typedef void (GL_APIENTRYP PFNGLVERTEXBINDINGDIVISORPROC) (GLuint bindingindex, GLuint divisor);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glDispatchCompute (GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z);
 GL_APICALL void GL_APIENTRY glDispatchComputeIndirect (GLintptr indirect);
 GL_APICALL void GL_APIENTRY glDrawArraysIndirect (GLenum mode, const void *indirect);
@@ -1509,6 +1518,7 @@
 GL_APICALL void GL_APIENTRY glVertexAttribIFormat (GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
 GL_APICALL void GL_APIENTRY glVertexAttribBinding (GLuint attribindex, GLuint bindingindex);
 GL_APICALL void GL_APIENTRY glVertexBindingDivisor (GLuint bindingindex, GLuint divisor);
+#endif
 #endif /* GL_ES_VERSION_3_1 */
 
 #ifdef __cplusplus
diff --git a/include/GLES3/gl32.h b/include/GLES3/gl32.h
index 7345a2f..a1af7c6 100644
--- a/include/GLES3/gl32.h
+++ b/include/GLES3/gl32.h
@@ -42,7 +42,11 @@
 #define GL_APIENTRYP GL_APIENTRY*
 #endif
 
-/* Generated on date 20160428 */
+#ifndef GL_GLES_PROTOTYPES
+#define GL_GLES_PROTOTYPES 1
+#endif
+
+/* Generated on date 20161024 */
 
 /* Generated C header for:
  * API: gles2
@@ -520,6 +524,7 @@
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIB4FVPROC) (GLuint index, const GLfloat *v);
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIBPOINTERPROC) (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void *pointer);
 typedef void (GL_APIENTRYP PFNGLVIEWPORTPROC) (GLint x, GLint y, GLsizei width, GLsizei height);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glActiveTexture (GLenum texture);
 GL_APICALL void GL_APIENTRY glAttachShader (GLuint program, GLuint shader);
 GL_APICALL void GL_APIENTRY glBindAttribLocation (GLuint program, GLuint index, const GLchar *name);
@@ -662,6 +667,7 @@
 GL_APICALL void GL_APIENTRY glVertexAttrib4fv (GLuint index, const GLfloat *v);
 GL_APICALL void GL_APIENTRY glVertexAttribPointer (GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void *pointer);
 GL_APICALL void GL_APIENTRY glViewport (GLint x, GLint y, GLsizei width, GLsizei height);
+#endif
 #endif /* GL_ES_VERSION_2_0 */
 
 #ifndef GL_ES_VERSION_3_0
@@ -1092,6 +1098,7 @@
 typedef void (GL_APIENTRYP PFNGLTEXSTORAGE2DPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
 typedef void (GL_APIENTRYP PFNGLTEXSTORAGE3DPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
 typedef void (GL_APIENTRYP PFNGLGETINTERNALFORMATIVPROC) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint *params);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glReadBuffer (GLenum src);
 GL_APICALL void GL_APIENTRY glDrawRangeElements (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const void *indices);
 GL_APICALL void GL_APIENTRY glTexImage3D (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
@@ -1196,6 +1203,7 @@
 GL_APICALL void GL_APIENTRY glTexStorage2D (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
 GL_APICALL void GL_APIENTRY glTexStorage3D (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
 GL_APICALL void GL_APIENTRY glGetInternalformativ (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint *params);
+#endif
 #endif /* GL_ES_VERSION_3_0 */
 
 #ifndef GL_ES_VERSION_3_1
@@ -1441,6 +1449,7 @@
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIBIFORMATPROC) (GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
 typedef void (GL_APIENTRYP PFNGLVERTEXATTRIBBINDINGPROC) (GLuint attribindex, GLuint bindingindex);
 typedef void (GL_APIENTRYP PFNGLVERTEXBINDINGDIVISORPROC) (GLuint bindingindex, GLuint divisor);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glDispatchCompute (GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z);
 GL_APICALL void GL_APIENTRY glDispatchComputeIndirect (GLintptr indirect);
 GL_APICALL void GL_APIENTRY glDrawArraysIndirect (GLenum mode, const void *indirect);
@@ -1509,6 +1518,7 @@
 GL_APICALL void GL_APIENTRY glVertexAttribIFormat (GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
 GL_APICALL void GL_APIENTRY glVertexAttribBinding (GLuint attribindex, GLuint bindingindex);
 GL_APICALL void GL_APIENTRY glVertexBindingDivisor (GLuint bindingindex, GLuint divisor);
+#endif
 #endif /* GL_ES_VERSION_3_1 */
 
 #ifndef GL_ES_VERSION_3_2
@@ -1764,6 +1774,7 @@
 typedef void (GL_APIENTRYP PFNGLTEXBUFFERPROC) (GLenum target, GLenum internalformat, GLuint buffer);
 typedef void (GL_APIENTRYP PFNGLTEXBUFFERRANGEPROC) (GLenum target, GLenum internalformat, GLuint buffer, GLintptr offset, GLsizeiptr size);
 typedef void (GL_APIENTRYP PFNGLTEXSTORAGE3DMULTISAMPLEPROC) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedsamplelocations);
+#if GL_GLES_PROTOTYPES
 GL_APICALL void GL_APIENTRY glBlendBarrier (void);
 GL_APICALL void GL_APIENTRY glCopyImageSubData (GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth);
 GL_APICALL void GL_APIENTRY glDebugMessageControl (GLenum source, GLenum type, GLenum severity, GLsizei count, const GLuint *ids, GLboolean enabled);
@@ -1808,6 +1819,7 @@
 GL_APICALL void GL_APIENTRY glTexBuffer (GLenum target, GLenum internalformat, GLuint buffer);
 GL_APICALL void GL_APIENTRY glTexBufferRange (GLenum target, GLenum internalformat, GLuint buffer, GLintptr offset, GLsizeiptr size);
 GL_APICALL void GL_APIENTRY glTexStorage3DMultisample (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedsamplelocations);
+#endif
 #endif /* GL_ES_VERSION_3_2 */
 
 #ifdef __cplusplus
diff --git a/include/GLES3/gl3platform.h b/include/GLES3/gl3platform.h
index b1e869d..ca9d7a6 100644
--- a/include/GLES3/gl3platform.h
+++ b/include/GLES3/gl3platform.h
@@ -1,20 +1,28 @@
 #ifndef __gl3platform_h_
 #define __gl3platform_h_
 
-/* $Revision: 23328 $ on $Date:: 2013-10-02 02:28:28 -0700 #$ */
-
 /*
- * This document is licensed under the SGI Free Software B License Version
- * 2.0. For details, see http://oss.sgi.com/projects/FreeB/ .
- */
+** Copyright (c) 2017 The Khronos Group Inc.
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
 
 /* Platform-specific types and definitions for OpenGL ES 3.X  gl3.h
  *
  * Adopters may modify khrplatform.h and this file to suit their platform.
- * You are encouraged to submit all modifications to the Khronos group so that
- * they can be included in future versions of this file.  Please submit changes
- * by sending them to the public Khronos Bugzilla (http://khronos.org/bugzilla)
- * by filing a bug against product "OpenGL-ES" component "Registry".
+ * Please contribute modifications back to Khronos as pull requests on the
+ * public github repository:
+ *      https://github.com/KhronosGroup/OpenGL-Registry
  */
 
 #include <KHR/khrplatform.h>
diff --git a/include/KHR/khrplatform.h b/include/KHR/khrplatform.h
index 790de44..93acb07 100644
--- a/include/KHR/khrplatform.h
+++ b/include/KHR/khrplatform.h
@@ -26,7 +26,7 @@
 
 /* Khronos platform-specific types and definitions.
  *
- * $Revision: 23298 $ on $Date: 2013-09-30 17:07:13 -0700 (Mon, 30 Sep 2013) $
+ * $Revision: 32517 $ on $Date: 2016-03-11 02:41:19 -0800 (Fri, 11 Mar 2016) $
  *
  * Adopters may modify this file to suit their platform. Adopters are
  * encouraged to submit platform specific modifications to the Khronos
@@ -98,11 +98,7 @@
  * This precedes the return type of the function in the function prototype.
  */
 #if defined(_WIN32) && !defined(__SCITECH_SNAP__)
-#   if defined(KHRONOS_DLL_EXPORTS)
-#      define KHRONOS_APICALL __declspec(dllexport)
-#   else
-#      define KHRONOS_APICALL __declspec(dllimport)
-#   endif
+#   define KHRONOS_APICALL __declspec(dllimport)
 #elif defined (__SYMBIAN32__)
 #   define KHRONOS_APICALL IMPORT_C
 #elif (defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303) \
@@ -231,7 +227,7 @@
 typedef unsigned short int     khronos_uint16_t;
 
 /*
- * Types that differ between LLP64 and LP64 architectures - in LLP64, 
+ * Types that differ between LLP64 and LP64 architectures - in LLP64,
  * pointers are 64 bits, but 'long' is still 32 bits. Win64 appears
  * to be the only LLP64 architecture in current use.
  */
diff --git a/include/c11/threads_win32.h b/include/c11/threads_win32.h
index d017c31..af7df4b 100644
--- a/include/c11/threads_win32.h
+++ b/include/c11/threads_win32.h
@@ -502,9 +502,13 @@
     HANDLE hCurrentThread;
     BOOL bRet;
 
-    /* GetCurrentThread() returns a pseudo-handle, which is useless.  We need
-     * to call DuplicateHandle to get a real handle.  However the handle value
-     * will not match the one returned by thread_create.
+    /* GetCurrentThread() returns a pseudo-handle, which we need
+     * to pass to DuplicateHandle(). Only the resulting handle can be used
+     * from other threads.
+     *
+     * Note that neither handle can be compared to the one returned by thread_create.
+     * Only the thread IDs - as returned by GetThreadId() and GetCurrentThreadId() -
+     * can be compared directly.
      *
      * Other potential solutions would be:
      * - define thrd_t as a thread Ids, but this would mean we'd need to OpenThread for many operations
diff --git a/include/drm-uapi/README b/include/drm-uapi/README
new file mode 100644
index 0000000..eb37096
--- /dev/null
+++ b/include/drm-uapi/README
@@ -0,0 +1,18 @@
+This directory contains a copy of the installed kernel headers
+required by the anv & i965 drivers to communicate with the kernel.
+Whenever either of those drivers needs new definitions for new kernel
+APIs, these files should be updated.
+
+You can copy the files installed by running the following command from the
+kernel repository, at the version the drivers require:
+
+$ make headers_install INSTALL_HDR_PATH=/path/to/install
+
+The last update was done at the following kernel commit:
+
+commit 6d61e70ccc21606ffb8a0a03bd3aba24f659502b
+Merge: 338ffbf7cb5e c0bc126f97fb
+Author: Dave Airlie <airlied@redhat.com>
+Date:   Tue Jun 27 07:24:49 2017 +1000
+
+    Backmerge tag 'v4.12-rc7' into drm-next
diff --git a/include/drm-uapi/drm.h b/include/drm-uapi/drm.h
new file mode 100644
index 0000000..0f750fc
--- /dev/null
+++ b/include/drm-uapi/drm.h
@@ -0,0 +1,940 @@
+/**
+ * \file drm.h
+ * Header for the Direct Rendering Manager
+ *
+ * \author Rickard E. (Rik) Faith <faith@valinux.com>
+ *
+ * \par Acknowledgments:
+ * Dec 1999, Richard Henderson <rth@twiddle.net>, move to generic \c cmpxchg.
+ */
+
+/*
+ * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DRM_H_
+#define _DRM_H_
+
+#if   defined(__linux__)
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+typedef unsigned int drm_handle_t;
+
+#elif defined(__Fuchsia__)
+
+#include <bits/ioctl.h>
+#include <sys/types.h>
+typedef int8_t __s8;
+typedef uint8_t __u8;
+typedef int16_t __s16;
+typedef uint16_t __u16;
+typedef int32_t __s32;
+typedef uint32_t __u32;
+typedef int64_t __s64;
+typedef uint64_t __u64;
+typedef size_t __kernel_size_t;
+typedef unsigned long drm_handle_t;
+
+#else /* One of the BSDs */
+
+#include <sys/ioccom.h>
+#include <sys/types.h>
+typedef int8_t   __s8;
+typedef uint8_t  __u8;
+typedef int16_t  __s16;
+typedef uint16_t __u16;
+typedef int32_t  __s32;
+typedef uint32_t __u32;
+typedef int64_t  __s64;
+typedef uint64_t __u64;
+typedef size_t   __kernel_size_t;
+typedef unsigned long drm_handle_t;
+
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_NAME	"drm"	  /**< Name in kernel, /dev, and /proc */
+#define DRM_MIN_ORDER	5	  /**< At least 2^5 bytes = 32 bytes */
+#define DRM_MAX_ORDER	22	  /**< Up to 2^22 bytes = 4MB */
+#define DRM_RAM_PERCENT 10	  /**< How much system ram can we lock? */
+
+#define _DRM_LOCK_HELD	0x80000000U /**< Hardware lock is held */
+#define _DRM_LOCK_CONT	0x40000000U /**< Hardware lock is contended */
+#define _DRM_LOCK_IS_HELD(lock)	   ((lock) & _DRM_LOCK_HELD)
+#define _DRM_LOCK_IS_CONT(lock)	   ((lock) & _DRM_LOCK_CONT)
+#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT))
+
+typedef unsigned int drm_context_t;
+typedef unsigned int drm_drawable_t;
+typedef unsigned int drm_magic_t;
+
+/**
+ * Cliprect.
+ *
+ * \warning: If you change this structure, make sure you change
+ * XF86DRIClipRectRec in the server as well
+ *
+ * \note KW: Actually it's illegal to change either for
+ * backwards-compatibility reasons.
+ */
+struct drm_clip_rect {
+	unsigned short x1;
+	unsigned short y1;
+	unsigned short x2;
+	unsigned short y2;
+};
+
+/**
+ * Drawable information.
+ */
+struct drm_drawable_info {
+	unsigned int num_rects;
+	struct drm_clip_rect *rects;
+};
+
+/**
+ * Texture region.
+ */
+struct drm_tex_region {
+	unsigned char next;
+	unsigned char prev;
+	unsigned char in_use;
+	unsigned char padding;
+	unsigned int age;
+};
+
+/**
+ * Hardware lock.
+ *
+ * The lock structure is a simple cache-line aligned integer.  To avoid
+ * processor bus contention on a multiprocessor system, there should not be any
+ * other data stored in the same cache line.
+ */
+struct drm_hw_lock {
+	__volatile__ unsigned int lock;		/**< lock variable */
+	char padding[60];			/**< Pad to cache line */
+};
+
+/**
+ * DRM_IOCTL_VERSION ioctl argument type.
+ *
+ * \sa drmGetVersion().
+ */
+struct drm_version {
+	int version_major;	  /**< Major version */
+	int version_minor;	  /**< Minor version */
+	int version_patchlevel;	  /**< Patch level */
+	__kernel_size_t name_len;	  /**< Length of name buffer */
+	char *name;	  /**< Name of driver */
+	__kernel_size_t date_len;	  /**< Length of date buffer */
+	char *date;	  /**< User-space buffer to hold date */
+	__kernel_size_t desc_len;	  /**< Length of desc buffer */
+	char *desc;	  /**< User-space buffer to hold desc */
+};
+
+/**
+ * DRM_IOCTL_GET_UNIQUE ioctl argument type.
+ *
+ * \sa drmGetBusid() and drmSetBusid().
+ */
+struct drm_unique {
+	__kernel_size_t unique_len;	  /**< Length of unique */
+	char *unique;	  /**< Unique name for driver instantiation */
+};
+
+struct drm_list {
+	int count;		  /**< Length of user-space structures */
+	struct drm_version *version;
+};
+
+struct drm_block {
+	int unused;
+};
+
+/**
+ * DRM_IOCTL_CONTROL ioctl argument type.
+ *
+ * \sa drmCtlInstHandler() and drmCtlUninstHandler().
+ */
+struct drm_control {
+	enum {
+		DRM_ADD_COMMAND,
+		DRM_RM_COMMAND,
+		DRM_INST_HANDLER,
+		DRM_UNINST_HANDLER
+	} func;
+	int irq;
+};
+
+/**
+ * Type of memory to map.
+ */
+enum drm_map_type {
+	_DRM_FRAME_BUFFER = 0,	  /**< WC (no caching), no core dump */
+	_DRM_REGISTERS = 1,	  /**< no caching, no core dump */
+	_DRM_SHM = 2,		  /**< shared, cached */
+	_DRM_AGP = 3,		  /**< AGP/GART */
+	_DRM_SCATTER_GATHER = 4,  /**< Scatter/gather memory for PCI DMA */
+	_DRM_CONSISTENT = 5	  /**< Consistent memory for PCI DMA */
+};
+
+/**
+ * Memory mapping flags.
+ */
+enum drm_map_flags {
+	_DRM_RESTRICTED = 0x01,	     /**< Cannot be mapped to user-virtual */
+	_DRM_READ_ONLY = 0x02,
+	_DRM_LOCKED = 0x04,	     /**< shared, cached, locked */
+	_DRM_KERNEL = 0x08,	     /**< kernel requires access */
+	_DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */
+	_DRM_CONTAINS_LOCK = 0x20,   /**< SHM page that contains lock */
+	_DRM_REMOVABLE = 0x40,	     /**< Removable mapping */
+	_DRM_DRIVER = 0x80	     /**< Managed by driver */
+};
+
+struct drm_ctx_priv_map {
+	unsigned int ctx_id;	 /**< Context requesting private mapping */
+	void *handle;		 /**< Handle of map */
+};
+
+/**
+ * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls
+ * argument type.
+ *
+ * \sa drmAddMap().
+ */
+struct drm_map {
+	unsigned long offset;	 /**< Requested physical address (0 for SAREA)*/
+	unsigned long size;	 /**< Requested physical size (bytes) */
+	enum drm_map_type type;	 /**< Type of memory to map */
+	enum drm_map_flags flags;	 /**< Flags */
+	void *handle;		 /**< User-space: "Handle" to pass to mmap() */
+				 /**< Kernel-space: kernel-virtual address */
+	int mtrr;		 /**< MTRR slot used */
+	/*   Private data */
+};
+
+/**
+ * DRM_IOCTL_GET_CLIENT ioctl argument type.
+ */
+struct drm_client {
+	int idx;		/**< Which client desired? */
+	int auth;		/**< Is client authenticated? */
+	unsigned long pid;	/**< Process ID */
+	unsigned long uid;	/**< User ID */
+	unsigned long magic;	/**< Magic */
+	unsigned long iocs;	/**< Ioctl count */
+};
+
+enum drm_stat_type {
+	_DRM_STAT_LOCK,
+	_DRM_STAT_OPENS,
+	_DRM_STAT_CLOSES,
+	_DRM_STAT_IOCTLS,
+	_DRM_STAT_LOCKS,
+	_DRM_STAT_UNLOCKS,
+	_DRM_STAT_VALUE,	/**< Generic value */
+	_DRM_STAT_BYTE,		/**< Generic byte counter (1024bytes/K) */
+	_DRM_STAT_COUNT,	/**< Generic non-byte counter (1000/k) */
+
+	_DRM_STAT_IRQ,		/**< IRQ */
+	_DRM_STAT_PRIMARY,	/**< Primary DMA bytes */
+	_DRM_STAT_SECONDARY,	/**< Secondary DMA bytes */
+	_DRM_STAT_DMA,		/**< DMA */
+	_DRM_STAT_SPECIAL,	/**< Special DMA (e.g., priority or polled) */
+	_DRM_STAT_MISSED	/**< Missed DMA opportunity */
+	    /* Add to the *END* of the list */
+};
+
+/**
+ * DRM_IOCTL_GET_STATS ioctl argument type.
+ */
+struct drm_stats {
+	unsigned long count;
+	struct {
+		unsigned long value;
+		enum drm_stat_type type;
+	} data[15];
+};
+
+/**
+ * Hardware locking flags.
+ */
+enum drm_lock_flags {
+	_DRM_LOCK_READY = 0x01,	     /**< Wait until hardware is ready for DMA */
+	_DRM_LOCK_QUIESCENT = 0x02,  /**< Wait until hardware quiescent */
+	_DRM_LOCK_FLUSH = 0x04,	     /**< Flush this context's DMA queue first */
+	_DRM_LOCK_FLUSH_ALL = 0x08,  /**< Flush all DMA queues first */
+	/* These *HALT* flags aren't supported yet
+	   -- they will be used to support the
+	   full-screen DGA-like mode. */
+	_DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */
+	_DRM_HALT_CUR_QUEUES = 0x20  /**< Halt all current queues */
+};
+
+/**
+ * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type.
+ *
+ * \sa drmGetLock() and drmUnlock().
+ */
+struct drm_lock {
+	int context;
+	enum drm_lock_flags flags;
+};
+
+/**
+ * DMA flags
+ *
+ * \warning
+ * These values \e must match xf86drm.h.
+ *
+ * \sa drm_dma.
+ */
+enum drm_dma_flags {
+	/* Flags for DMA buffer dispatch */
+	_DRM_DMA_BLOCK = 0x01,	      /**<
+				       * Block until buffer dispatched.
+				       *
+				       * \note The buffer may not yet have
+				       * been processed by the hardware --
+				       * getting a hardware lock with the
+				       * hardware quiescent will ensure
+				       * that the buffer has been
+				       * processed.
+				       */
+	_DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */
+	_DRM_DMA_PRIORITY = 0x04,     /**< High priority dispatch */
+
+	/* Flags for DMA buffer request */
+	_DRM_DMA_WAIT = 0x10,	      /**< Wait for free buffers */
+	_DRM_DMA_SMALLER_OK = 0x20,   /**< Smaller-than-requested buffers OK */
+	_DRM_DMA_LARGER_OK = 0x40     /**< Larger-than-requested buffers OK */
+};
+
+/**
+ * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type.
+ *
+ * \sa drmAddBufs().
+ */
+struct drm_buf_desc {
+	int count;		 /**< Number of buffers of this size */
+	int size;		 /**< Size in bytes */
+	int low_mark;		 /**< Low water mark */
+	int high_mark;		 /**< High water mark */
+	enum {
+		_DRM_PAGE_ALIGN = 0x01,	/**< Align on page boundaries for DMA */
+		_DRM_AGP_BUFFER = 0x02,	/**< Buffer is in AGP space */
+		_DRM_SG_BUFFER = 0x04,	/**< Scatter/gather memory buffer */
+		_DRM_FB_BUFFER = 0x08,	/**< Buffer is in frame buffer */
+		_DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */
+	} flags;
+	unsigned long agp_start; /**<
+				  * Start address of where the AGP buffers are
+				  * in the AGP aperture
+				  */
+};
+
+/**
+ * DRM_IOCTL_INFO_BUFS ioctl argument type.
+ */
+struct drm_buf_info {
+	int count;		/**< Entries in list */
+	struct drm_buf_desc *list;
+};
+
+/**
+ * DRM_IOCTL_FREE_BUFS ioctl argument type.
+ */
+struct drm_buf_free {
+	int count;
+	int *list;
+};
+
+/**
+ * Buffer information
+ *
+ * \sa drm_buf_map.
+ */
+struct drm_buf_pub {
+	int idx;		       /**< Index into the master buffer list */
+	int total;		       /**< Buffer size */
+	int used;		       /**< Amount of buffer in use (for DMA) */
+	void *address;	       /**< Address of buffer */
+};
+
+/**
+ * DRM_IOCTL_MAP_BUFS ioctl argument type.
+ */
+struct drm_buf_map {
+	int count;		/**< Length of the buffer list */
+#ifdef __cplusplus
+	void *virt;
+#else
+	void *virtual;		/**< Mmap'd area in user-virtual */
+#endif
+	struct drm_buf_pub *list;	/**< Buffer information */
+};
+
+/**
+ * DRM_IOCTL_DMA ioctl argument type.
+ *
+ * Indices here refer to the offset into the buffer list in drm_buf_get.
+ *
+ * \sa drmDMA().
+ */
+struct drm_dma {
+	int context;			  /**< Context handle */
+	int send_count;			  /**< Number of buffers to send */
+	int *send_indices;	  /**< List of handles to buffers */
+	int *send_sizes;		  /**< Lengths of data to send */
+	enum drm_dma_flags flags;	  /**< Flags */
+	int request_count;		  /**< Number of buffers requested */
+	int request_size;		  /**< Desired size for buffers */
+	int *request_indices;	  /**< Buffer information */
+	int *request_sizes;
+	int granted_count;		  /**< Number of buffers granted */
+};
+
+enum drm_ctx_flags {	/* type of the 'flags' field of struct drm_ctx */
+	_DRM_CONTEXT_PRESERVED = 0x01,
+	_DRM_CONTEXT_2DONLY = 0x02
+};
+
+/**
+ * DRM_IOCTL_ADD_CTX ioctl argument type.
+ *
+ * \sa drmCreateContext() and drmDestroyContext().
+ */
+struct drm_ctx {
+	drm_context_t handle;
+	enum drm_ctx_flags flags;
+};
+
+/**
+ * DRM_IOCTL_RES_CTX ioctl argument type.
+ */
+struct drm_ctx_res {
+	int count;
+	struct drm_ctx *contexts;
+};
+
+/**
+ * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type.
+ */
+struct drm_draw {
+	drm_drawable_t handle;
+};
+
+/**
+ * DRM_IOCTL_UPDATE_DRAW ioctl argument type.
+ */
+typedef enum {
+	DRM_DRAWABLE_CLIPRECTS
+} drm_drawable_info_type_t;
+
+struct drm_update_draw {	/* DRM_IOCTL_UPDATE_DRAW ioctl argument type */
+	drm_drawable_t handle;
+	unsigned int type;	/* presumably a drm_drawable_info_type_t value -- confirm */
+	unsigned int num;
+	unsigned long long data;
+};
+
+/**
+ * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type.
+ */
+struct drm_auth {
+	drm_magic_t magic;
+};
+
+/**
+ * DRM_IOCTL_IRQ_BUSID ioctl argument type.
+ *
+ * \sa drmGetInterruptFromBusID().
+ */
+struct drm_irq_busid {
+	int irq;	/**< IRQ number */
+	int busnum;	/**< bus number */
+	int devnum;	/**< device number */
+	int funcnum;	/**< function number */
+};
+
+enum drm_vblank_seq_type {	/* 'type' field of the drm_wait_vblank request and reply */
+	_DRM_VBLANK_ABSOLUTE = 0x0,	/**< Wait for specific vblank sequence number */
+	_DRM_VBLANK_RELATIVE = 0x1,	/**< Wait for given number of vblanks */
+	/* bits 1-5 (mask 0x3e, see _DRM_VBLANK_HIGH_CRTC_SHIFT) are reserved for high crtcs */
+	_DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e,
+	_DRM_VBLANK_EVENT = 0x4000000,   /**< Send event instead of blocking */
+	_DRM_VBLANK_FLIP = 0x8000000,   /**< Scheduled buffer swap should flip */
+	_DRM_VBLANK_NEXTONMISS = 0x10000000,	/**< If missed, wait for next vblank */
+	_DRM_VBLANK_SECONDARY = 0x20000000,	/**< Secondary display controller */
+	_DRM_VBLANK_SIGNAL = 0x40000000	/**< Send signal instead of blocking, unsupported */
+};
+#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1
+
+#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE)
+#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \
+				_DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS)
+
+struct drm_wait_vblank_request {	/* 'request' member of union drm_wait_vblank */
+	enum drm_vblank_seq_type type;
+	unsigned int sequence;
+	unsigned long signal;
+};
+
+struct drm_wait_vblank_reply {	/* 'reply' member of union drm_wait_vblank */
+	enum drm_vblank_seq_type type;
+	unsigned int sequence;
+	long tval_sec;
+	long tval_usec;
+};
+
+/**
+ * DRM_IOCTL_WAIT_VBLANK ioctl argument type.
+ *
+ * \sa drmWaitVBlank().
+ */
+union drm_wait_vblank {
+	struct drm_wait_vblank_request request;
+	struct drm_wait_vblank_reply reply;
+};
+
+#define _DRM_PRE_MODESET 1
+#define _DRM_POST_MODESET 2
+
+/**
+ * DRM_IOCTL_MODESET_CTL ioctl argument type
+ *
+ * \sa drmModesetCtl().
+ */
+struct drm_modeset_ctl {
+	__u32 crtc;
+	__u32 cmd;
+};
+
+/**
+ * DRM_IOCTL_AGP_ENABLE ioctl argument type.
+ *
+ * \sa drmAgpEnable().
+ */
+struct drm_agp_mode {
+	unsigned long mode;	/**< AGP mode */
+};
+
+/**
+ * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type.
+ *
+ * \sa drmAgpAlloc() and drmAgpFree().
+ */
+struct drm_agp_buffer {
+	unsigned long size;	/**< In bytes -- will round to page boundary */
+	unsigned long handle;	/**< Used for binding / unbinding */
+	unsigned long type;	/**< Type of memory to allocate */
+	unsigned long physical;	/**< Physical used by i810 */
+};
+
+/**
+ * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type.
+ *
+ * \sa drmAgpBind() and drmAgpUnbind().
+ */
+struct drm_agp_binding {
+	unsigned long handle;	/**< From drm_agp_buffer */
+	unsigned long offset;	/**< In bytes -- will round to page boundary */
+};
+
+/**
+ * DRM_IOCTL_AGP_INFO ioctl argument type.
+ *
+ * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(),
+ * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(),
+ * drmAgpVendorId() and drmAgpDeviceId().
+ */
+struct drm_agp_info {
+	int agp_version_major;
+	int agp_version_minor;
+	unsigned long mode;
+	unsigned long aperture_base;	/* physical address */
+	unsigned long aperture_size;	/* bytes */
+	unsigned long memory_allowed;	/* bytes */
+	unsigned long memory_used;
+
+	/* PCI information */
+	unsigned short id_vendor;
+	unsigned short id_device;
+};
+
+/**
+ * DRM_IOCTL_SG_ALLOC ioctl argument type.
+ */
+struct drm_scatter_gather {
+	unsigned long size;	/**< In bytes -- will round to page boundary */
+	unsigned long handle;	/**< Used for mapping / unmapping */
+};
+
+/**
+ * DRM_IOCTL_SET_VERSION ioctl argument type.
+ */
+struct drm_set_version {
+	int drm_di_major;
+	int drm_di_minor;
+	int drm_dd_major;
+	int drm_dd_minor;
+};
+
+/** DRM_IOCTL_GEM_CLOSE ioctl argument type */
+struct drm_gem_close {
+	/** Handle of the object to be closed. */
+	__u32 handle;
+	__u32 pad;
+};
+
+/** DRM_IOCTL_GEM_FLINK ioctl argument type */
+struct drm_gem_flink {
+	/** Handle for the object being named */
+	__u32 handle;
+
+	/** Returned global name */
+	__u32 name;
+};
+
+/** DRM_IOCTL_GEM_OPEN ioctl argument type */
+struct drm_gem_open {
+	/** Name of object being opened */
+	__u32 name;
+
+	/** Returned handle for the object */
+	__u32 handle;
+
+	/** Returned size of the object */
+	__u64 size;
+};
+
+#define DRM_CAP_DUMB_BUFFER		0x1
+#define DRM_CAP_VBLANK_HIGH_CRTC	0x2
+#define DRM_CAP_DUMB_PREFERRED_DEPTH	0x3
+#define DRM_CAP_DUMB_PREFER_SHADOW	0x4
+#define DRM_CAP_PRIME			0x5
+#define  DRM_PRIME_CAP_IMPORT		0x1
+#define  DRM_PRIME_CAP_EXPORT		0x2
+#define DRM_CAP_TIMESTAMP_MONOTONIC	0x6
+#define DRM_CAP_ASYNC_PAGE_FLIP		0x7
+/*
+ * The CURSOR_WIDTH and CURSOR_HEIGHT capabilities return a valid widthxheight
+ * combination for the hardware cursor. The intention is that a hardware
+ * agnostic userspace can query a cursor plane size to use.
+ *
+ * Note that the cross-driver contract is to merely return a valid size;
+ * drivers are free to attach another meaning on top, eg. i915 returns the
+ * maximum plane size.
+ */
+#define DRM_CAP_CURSOR_WIDTH		0x8
+#define DRM_CAP_CURSOR_HEIGHT		0x9
+#define DRM_CAP_ADDFB2_MODIFIERS	0x10
+#define DRM_CAP_PAGE_FLIP_TARGET	0x11
+#define DRM_CAP_CRTC_IN_VBLANK_EVENT	0x12
+#define DRM_CAP_SYNCOBJ		0x13
+
+/** DRM_IOCTL_GET_CAP ioctl argument type */
+struct drm_get_cap {
+	__u64 capability;
+	__u64 value;
+};
+
+/**
+ * DRM_CLIENT_CAP_STEREO_3D
+ *
+ * if set to 1, the DRM core will expose the stereo 3D capabilities of the
+ * monitor by advertising the supported 3D layouts in the flags of struct
+ * drm_mode_modeinfo.
+ */
+#define DRM_CLIENT_CAP_STEREO_3D	1
+
+/**
+ * DRM_CLIENT_CAP_UNIVERSAL_PLANES
+ *
+ * If set to 1, the DRM core will expose all planes (overlay, primary, and
+ * cursor) to userspace.
+ */
+#define DRM_CLIENT_CAP_UNIVERSAL_PLANES  2
+
+/**
+ * DRM_CLIENT_CAP_ATOMIC
+ *
+ * If set to 1, the DRM core will expose atomic properties to userspace
+ */
+#define DRM_CLIENT_CAP_ATOMIC	3
+
+/** DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */
+struct drm_set_client_cap {
+	__u64 capability;
+	__u64 value;
+};
+
+#define DRM_RDWR O_RDWR
+#define DRM_CLOEXEC O_CLOEXEC
+struct drm_prime_handle {	/* DRM_IOCTL_PRIME_HANDLE_TO_FD and DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl argument type */
+	__u32 handle;
+
+	/** Flags.. only applicable for handle->fd */
+	__u32 flags;
+
+	/** Returned dmabuf file descriptor */
+	__s32 fd;
+};
+
+struct drm_syncobj_create {	/* DRM_IOCTL_SYNCOBJ_CREATE ioctl argument type */
+	__u32 handle;
+	__u32 flags;
+};
+
+struct drm_syncobj_destroy {	/* DRM_IOCTL_SYNCOBJ_DESTROY ioctl argument type */
+	__u32 handle;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0)
+#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0)
+struct drm_syncobj_handle {	/* DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD and DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE ioctl argument type */
+	__u32 handle;
+	__u32 flags;	/* presumably the DRM_SYNCOBJ_*_FLAGS_* bits -- confirm */
+
+	__s32 fd;
+	__u32 pad;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "drm_mode.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_IOCTL_BASE			'd'
+#define DRM_IO(nr)			_IO(DRM_IOCTL_BASE,nr)
+#define DRM_IOR(nr,type)		_IOR(DRM_IOCTL_BASE,nr,type)
+#define DRM_IOW(nr,type)		_IOW(DRM_IOCTL_BASE,nr,type)
+#define DRM_IOWR(nr,type)		_IOWR(DRM_IOCTL_BASE,nr,type)
+
+#define DRM_IOCTL_VERSION		DRM_IOWR(0x00, struct drm_version)
+#define DRM_IOCTL_GET_UNIQUE		DRM_IOWR(0x01, struct drm_unique)
+#define DRM_IOCTL_GET_MAGIC		DRM_IOR( 0x02, struct drm_auth)
+#define DRM_IOCTL_IRQ_BUSID		DRM_IOWR(0x03, struct drm_irq_busid)
+#define DRM_IOCTL_GET_MAP               DRM_IOWR(0x04, struct drm_map)
+#define DRM_IOCTL_GET_CLIENT            DRM_IOWR(0x05, struct drm_client)
+#define DRM_IOCTL_GET_STATS             DRM_IOR( 0x06, struct drm_stats)
+#define DRM_IOCTL_SET_VERSION		DRM_IOWR(0x07, struct drm_set_version)
+#define DRM_IOCTL_MODESET_CTL           DRM_IOW(0x08, struct drm_modeset_ctl)
+#define DRM_IOCTL_GEM_CLOSE		DRM_IOW (0x09, struct drm_gem_close)
+#define DRM_IOCTL_GEM_FLINK		DRM_IOWR(0x0a, struct drm_gem_flink)
+#define DRM_IOCTL_GEM_OPEN		DRM_IOWR(0x0b, struct drm_gem_open)
+#define DRM_IOCTL_GET_CAP		DRM_IOWR(0x0c, struct drm_get_cap)
+#define DRM_IOCTL_SET_CLIENT_CAP	DRM_IOW( 0x0d, struct drm_set_client_cap)
+
+#define DRM_IOCTL_SET_UNIQUE		DRM_IOW( 0x10, struct drm_unique)
+#define DRM_IOCTL_AUTH_MAGIC		DRM_IOW( 0x11, struct drm_auth)
+#define DRM_IOCTL_BLOCK			DRM_IOWR(0x12, struct drm_block)
+#define DRM_IOCTL_UNBLOCK		DRM_IOWR(0x13, struct drm_block)
+#define DRM_IOCTL_CONTROL		DRM_IOW( 0x14, struct drm_control)
+#define DRM_IOCTL_ADD_MAP		DRM_IOWR(0x15, struct drm_map)
+#define DRM_IOCTL_ADD_BUFS		DRM_IOWR(0x16, struct drm_buf_desc)
+#define DRM_IOCTL_MARK_BUFS		DRM_IOW( 0x17, struct drm_buf_desc)
+#define DRM_IOCTL_INFO_BUFS		DRM_IOWR(0x18, struct drm_buf_info)
+#define DRM_IOCTL_MAP_BUFS		DRM_IOWR(0x19, struct drm_buf_map)
+#define DRM_IOCTL_FREE_BUFS		DRM_IOW( 0x1a, struct drm_buf_free)
+
+#define DRM_IOCTL_RM_MAP		DRM_IOW( 0x1b, struct drm_map)
+
+#define DRM_IOCTL_SET_SAREA_CTX		DRM_IOW( 0x1c, struct drm_ctx_priv_map)
+#define DRM_IOCTL_GET_SAREA_CTX 	DRM_IOWR(0x1d, struct drm_ctx_priv_map)
+
+#define DRM_IOCTL_SET_MASTER            DRM_IO(0x1e)
+#define DRM_IOCTL_DROP_MASTER           DRM_IO(0x1f)
+
+#define DRM_IOCTL_ADD_CTX		DRM_IOWR(0x20, struct drm_ctx)
+#define DRM_IOCTL_RM_CTX		DRM_IOWR(0x21, struct drm_ctx)
+#define DRM_IOCTL_MOD_CTX		DRM_IOW( 0x22, struct drm_ctx)
+#define DRM_IOCTL_GET_CTX		DRM_IOWR(0x23, struct drm_ctx)
+#define DRM_IOCTL_SWITCH_CTX		DRM_IOW( 0x24, struct drm_ctx)
+#define DRM_IOCTL_NEW_CTX		DRM_IOW( 0x25, struct drm_ctx)
+#define DRM_IOCTL_RES_CTX		DRM_IOWR(0x26, struct drm_ctx_res)
+#define DRM_IOCTL_ADD_DRAW		DRM_IOWR(0x27, struct drm_draw)
+#define DRM_IOCTL_RM_DRAW		DRM_IOWR(0x28, struct drm_draw)
+#define DRM_IOCTL_DMA			DRM_IOWR(0x29, struct drm_dma)
+#define DRM_IOCTL_LOCK			DRM_IOW( 0x2a, struct drm_lock)
+#define DRM_IOCTL_UNLOCK		DRM_IOW( 0x2b, struct drm_lock)
+#define DRM_IOCTL_FINISH		DRM_IOW( 0x2c, struct drm_lock)
+
+#define DRM_IOCTL_PRIME_HANDLE_TO_FD    DRM_IOWR(0x2d, struct drm_prime_handle)
+#define DRM_IOCTL_PRIME_FD_TO_HANDLE    DRM_IOWR(0x2e, struct drm_prime_handle)
+
+#define DRM_IOCTL_AGP_ACQUIRE		DRM_IO(  0x30)
+#define DRM_IOCTL_AGP_RELEASE		DRM_IO(  0x31)
+#define DRM_IOCTL_AGP_ENABLE		DRM_IOW( 0x32, struct drm_agp_mode)
+#define DRM_IOCTL_AGP_INFO		DRM_IOR( 0x33, struct drm_agp_info)
+#define DRM_IOCTL_AGP_ALLOC		DRM_IOWR(0x34, struct drm_agp_buffer)
+#define DRM_IOCTL_AGP_FREE		DRM_IOW( 0x35, struct drm_agp_buffer)
+#define DRM_IOCTL_AGP_BIND		DRM_IOW( 0x36, struct drm_agp_binding)
+#define DRM_IOCTL_AGP_UNBIND		DRM_IOW( 0x37, struct drm_agp_binding)
+
+#define DRM_IOCTL_SG_ALLOC		DRM_IOWR(0x38, struct drm_scatter_gather)
+#define DRM_IOCTL_SG_FREE		DRM_IOW( 0x39, struct drm_scatter_gather)
+
+#define DRM_IOCTL_WAIT_VBLANK		DRM_IOWR(0x3a, union drm_wait_vblank)
+
+#define DRM_IOCTL_UPDATE_DRAW		DRM_IOW(0x3f, struct drm_update_draw)
+
+#define DRM_IOCTL_MODE_GETRESOURCES	DRM_IOWR(0xA0, struct drm_mode_card_res)
+#define DRM_IOCTL_MODE_GETCRTC		DRM_IOWR(0xA1, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_SETCRTC		DRM_IOWR(0xA2, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_CURSOR		DRM_IOWR(0xA3, struct drm_mode_cursor)
+#define DRM_IOCTL_MODE_GETGAMMA		DRM_IOWR(0xA4, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_SETGAMMA		DRM_IOWR(0xA5, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_GETENCODER	DRM_IOWR(0xA6, struct drm_mode_get_encoder)
+#define DRM_IOCTL_MODE_GETCONNECTOR	DRM_IOWR(0xA7, struct drm_mode_get_connector)
+#define DRM_IOCTL_MODE_ATTACHMODE	DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */
+#define DRM_IOCTL_MODE_DETACHMODE	DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */
+
+#define DRM_IOCTL_MODE_GETPROPERTY	DRM_IOWR(0xAA, struct drm_mode_get_property)
+#define DRM_IOCTL_MODE_SETPROPERTY	DRM_IOWR(0xAB, struct drm_mode_connector_set_property)
+#define DRM_IOCTL_MODE_GETPROPBLOB	DRM_IOWR(0xAC, struct drm_mode_get_blob)
+#define DRM_IOCTL_MODE_GETFB		DRM_IOWR(0xAD, struct drm_mode_fb_cmd)
+#define DRM_IOCTL_MODE_ADDFB		DRM_IOWR(0xAE, struct drm_mode_fb_cmd)
+#define DRM_IOCTL_MODE_RMFB		DRM_IOWR(0xAF, unsigned int)
+#define DRM_IOCTL_MODE_PAGE_FLIP	DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip)
+#define DRM_IOCTL_MODE_DIRTYFB		DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd)
+
+#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb)
+#define DRM_IOCTL_MODE_MAP_DUMB    DRM_IOWR(0xB3, struct drm_mode_map_dumb)
+#define DRM_IOCTL_MODE_DESTROY_DUMB    DRM_IOWR(0xB4, struct drm_mode_destroy_dumb)
+#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res)
+#define DRM_IOCTL_MODE_GETPLANE	DRM_IOWR(0xB6, struct drm_mode_get_plane)
+#define DRM_IOCTL_MODE_SETPLANE	DRM_IOWR(0xB7, struct drm_mode_set_plane)
+#define DRM_IOCTL_MODE_ADDFB2		DRM_IOWR(0xB8, struct drm_mode_fb_cmd2)
+#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES	DRM_IOWR(0xB9, struct drm_mode_obj_get_properties)
+#define DRM_IOCTL_MODE_OBJ_SETPROPERTY	DRM_IOWR(0xBA, struct drm_mode_obj_set_property)
+#define DRM_IOCTL_MODE_CURSOR2		DRM_IOWR(0xBB, struct drm_mode_cursor2)
+#define DRM_IOCTL_MODE_ATOMIC		DRM_IOWR(0xBC, struct drm_mode_atomic)
+#define DRM_IOCTL_MODE_CREATEPROPBLOB	DRM_IOWR(0xBD, struct drm_mode_create_blob)
+#define DRM_IOCTL_MODE_DESTROYPROPBLOB	DRM_IOWR(0xBE, struct drm_mode_destroy_blob)
+
+#define DRM_IOCTL_SYNCOBJ_CREATE	DRM_IOWR(0xBF, struct drm_syncobj_create)
+#define DRM_IOCTL_SYNCOBJ_DESTROY	DRM_IOWR(0xC0, struct drm_syncobj_destroy)
+#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD	DRM_IOWR(0xC1, struct drm_syncobj_handle)
+#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE	DRM_IOWR(0xC2, struct drm_syncobj_handle)
+
+/**
+ * Device specific ioctls should only be in their respective headers
+ * The device specific ioctl range is from 0x40 to 0x9f.
+ * Generic IOCTLS restart at 0xA0.
+ *
+ * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and
+ * drmCommandReadWrite().
+ */
+#define DRM_COMMAND_BASE                0x40
+#define DRM_COMMAND_END			0xA0
+
+/**
+ * Header for events written back to userspace on the drm fd.  The
+ * type defines the type of event, the length specifies the total
+ * length of the event (including the header), and user_data is
+ * typically a 64 bit value passed with the ioctl that triggered the
+ * event.  A read on the drm fd will always only return complete
+ * events, that is, if for example the read buffer is 100 bytes, and
+ * there are two 64 byte events pending, only one will be returned.
+ *
+ * Event types 0 - 0x7fffffff are generic drm events, 0x80000000 and
+ * up are chipset specific.
+ */
+struct drm_event {
+	__u32 type;
+	__u32 length;
+};
+
+#define DRM_EVENT_VBLANK 0x01
+#define DRM_EVENT_FLIP_COMPLETE 0x02
+
+struct drm_event_vblank {
+	struct drm_event base;
+	__u64 user_data;
+	__u32 tv_sec;
+	__u32 tv_usec;
+	__u32 sequence;
+	__u32 crtc_id; /* 0 on older kernels that do not support this */
+};
+
+/* typedef area */
+typedef struct drm_clip_rect drm_clip_rect_t;
+typedef struct drm_drawable_info drm_drawable_info_t;
+typedef struct drm_tex_region drm_tex_region_t;
+typedef struct drm_hw_lock drm_hw_lock_t;
+typedef struct drm_version drm_version_t;
+typedef struct drm_unique drm_unique_t;
+typedef struct drm_list drm_list_t;
+typedef struct drm_block drm_block_t;
+typedef struct drm_control drm_control_t;
+typedef enum drm_map_type drm_map_type_t;
+typedef enum drm_map_flags drm_map_flags_t;
+typedef struct drm_ctx_priv_map drm_ctx_priv_map_t;
+typedef struct drm_map drm_map_t;
+typedef struct drm_client drm_client_t;
+typedef enum drm_stat_type drm_stat_type_t;
+typedef struct drm_stats drm_stats_t;
+typedef enum drm_lock_flags drm_lock_flags_t;
+typedef struct drm_lock drm_lock_t;
+typedef enum drm_dma_flags drm_dma_flags_t;
+typedef struct drm_buf_desc drm_buf_desc_t;
+typedef struct drm_buf_info drm_buf_info_t;
+typedef struct drm_buf_free drm_buf_free_t;
+typedef struct drm_buf_pub drm_buf_pub_t;
+typedef struct drm_buf_map drm_buf_map_t;
+typedef struct drm_dma drm_dma_t;
+typedef union drm_wait_vblank drm_wait_vblank_t;
+typedef struct drm_agp_mode drm_agp_mode_t;
+typedef enum drm_ctx_flags drm_ctx_flags_t;
+typedef struct drm_ctx drm_ctx_t;
+typedef struct drm_ctx_res drm_ctx_res_t;
+typedef struct drm_draw drm_draw_t;
+typedef struct drm_update_draw drm_update_draw_t;
+typedef struct drm_auth drm_auth_t;
+typedef struct drm_irq_busid drm_irq_busid_t;
+typedef enum drm_vblank_seq_type drm_vblank_seq_type_t;
+
+typedef struct drm_agp_buffer drm_agp_buffer_t;
+typedef struct drm_agp_binding drm_agp_binding_t;
+typedef struct drm_agp_info drm_agp_info_t;
+typedef struct drm_scatter_gather drm_scatter_gather_t;
+typedef struct drm_set_version drm_set_version_t;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/include/drm-uapi/drm_fourcc.h b/include/drm-uapi/drm_fourcc.h
new file mode 100644
index 0000000..7586c46
--- /dev/null
+++ b/include/drm-uapi/drm_fourcc.h
@@ -0,0 +1,379 @@
+/*
+ * Copyright 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef DRM_FOURCC_H
+#define DRM_FOURCC_H
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define fourcc_code(a, b, c, d) ((__u32)(a) | ((__u32)(b) << 8) | \
+				 ((__u32)(c) << 16) | ((__u32)(d) << 24))
+
+#define DRM_FORMAT_BIG_ENDIAN (1U<<31) /* format is big endian instead of little endian */
+
+/* color index */
+#define DRM_FORMAT_C8		fourcc_code('C', '8', ' ', ' ') /* [7:0] C */
+
+/* 8 bpp Red */
+#define DRM_FORMAT_R8		fourcc_code('R', '8', ' ', ' ') /* [7:0] R */
+
+/* 16 bpp Red */
+#define DRM_FORMAT_R16		fourcc_code('R', '1', '6', ' ') /* [15:0] R little endian */
+
+/* 16 bpp RG */
+#define DRM_FORMAT_RG88		fourcc_code('R', 'G', '8', '8') /* [15:0] R:G 8:8 little endian */
+#define DRM_FORMAT_GR88		fourcc_code('G', 'R', '8', '8') /* [15:0] G:R 8:8 little endian */
+
+/* 32 bpp RG */
+#define DRM_FORMAT_RG1616	fourcc_code('R', 'G', '3', '2') /* [31:0] R:G 16:16 little endian */
+#define DRM_FORMAT_GR1616	fourcc_code('G', 'R', '3', '2') /* [31:0] G:R 16:16 little endian */
+
+/* 8 bpp RGB */
+#define DRM_FORMAT_RGB332	fourcc_code('R', 'G', 'B', '8') /* [7:0] R:G:B 3:3:2 */
+#define DRM_FORMAT_BGR233	fourcc_code('B', 'G', 'R', '8') /* [7:0] B:G:R 2:3:3 */
+
+/* 16 bpp RGB */
+#define DRM_FORMAT_XRGB4444	fourcc_code('X', 'R', '1', '2') /* [15:0] x:R:G:B 4:4:4:4 little endian */
+#define DRM_FORMAT_XBGR4444	fourcc_code('X', 'B', '1', '2') /* [15:0] x:B:G:R 4:4:4:4 little endian */
+#define DRM_FORMAT_RGBX4444	fourcc_code('R', 'X', '1', '2') /* [15:0] R:G:B:x 4:4:4:4 little endian */
+#define DRM_FORMAT_BGRX4444	fourcc_code('B', 'X', '1', '2') /* [15:0] B:G:R:x 4:4:4:4 little endian */
+
+#define DRM_FORMAT_ARGB4444	fourcc_code('A', 'R', '1', '2') /* [15:0] A:R:G:B 4:4:4:4 little endian */
+#define DRM_FORMAT_ABGR4444	fourcc_code('A', 'B', '1', '2') /* [15:0] A:B:G:R 4:4:4:4 little endian */
+#define DRM_FORMAT_RGBA4444	fourcc_code('R', 'A', '1', '2') /* [15:0] R:G:B:A 4:4:4:4 little endian */
+#define DRM_FORMAT_BGRA4444	fourcc_code('B', 'A', '1', '2') /* [15:0] B:G:R:A 4:4:4:4 little endian */
+
+#define DRM_FORMAT_XRGB1555	fourcc_code('X', 'R', '1', '5') /* [15:0] x:R:G:B 1:5:5:5 little endian */
+#define DRM_FORMAT_XBGR1555	fourcc_code('X', 'B', '1', '5') /* [15:0] x:B:G:R 1:5:5:5 little endian */
+#define DRM_FORMAT_RGBX5551	fourcc_code('R', 'X', '1', '5') /* [15:0] R:G:B:x 5:5:5:1 little endian */
+#define DRM_FORMAT_BGRX5551	fourcc_code('B', 'X', '1', '5') /* [15:0] B:G:R:x 5:5:5:1 little endian */
+
+#define DRM_FORMAT_ARGB1555	fourcc_code('A', 'R', '1', '5') /* [15:0] A:R:G:B 1:5:5:5 little endian */
+#define DRM_FORMAT_ABGR1555	fourcc_code('A', 'B', '1', '5') /* [15:0] A:B:G:R 1:5:5:5 little endian */
+#define DRM_FORMAT_RGBA5551	fourcc_code('R', 'A', '1', '5') /* [15:0] R:G:B:A 5:5:5:1 little endian */
+#define DRM_FORMAT_BGRA5551	fourcc_code('B', 'A', '1', '5') /* [15:0] B:G:R:A 5:5:5:1 little endian */
+
+#define DRM_FORMAT_RGB565	fourcc_code('R', 'G', '1', '6') /* [15:0] R:G:B 5:6:5 little endian */
+#define DRM_FORMAT_BGR565	fourcc_code('B', 'G', '1', '6') /* [15:0] B:G:R 5:6:5 little endian */
+
+/* 24 bpp RGB */
+#define DRM_FORMAT_RGB888	fourcc_code('R', 'G', '2', '4') /* [23:0] R:G:B little endian */
+#define DRM_FORMAT_BGR888	fourcc_code('B', 'G', '2', '4') /* [23:0] B:G:R little endian */
+
+/* 32 bpp RGB */
+#define DRM_FORMAT_XRGB8888	fourcc_code('X', 'R', '2', '4') /* [31:0] x:R:G:B 8:8:8:8 little endian */
+#define DRM_FORMAT_XBGR8888	fourcc_code('X', 'B', '2', '4') /* [31:0] x:B:G:R 8:8:8:8 little endian */
+#define DRM_FORMAT_RGBX8888	fourcc_code('R', 'X', '2', '4') /* [31:0] R:G:B:x 8:8:8:8 little endian */
+#define DRM_FORMAT_BGRX8888	fourcc_code('B', 'X', '2', '4') /* [31:0] B:G:R:x 8:8:8:8 little endian */
+
+#define DRM_FORMAT_ARGB8888	fourcc_code('A', 'R', '2', '4') /* [31:0] A:R:G:B 8:8:8:8 little endian */
+#define DRM_FORMAT_ABGR8888	fourcc_code('A', 'B', '2', '4') /* [31:0] A:B:G:R 8:8:8:8 little endian */
+#define DRM_FORMAT_RGBA8888	fourcc_code('R', 'A', '2', '4') /* [31:0] R:G:B:A 8:8:8:8 little endian */
+#define DRM_FORMAT_BGRA8888	fourcc_code('B', 'A', '2', '4') /* [31:0] B:G:R:A 8:8:8:8 little endian */
+
+#define DRM_FORMAT_XRGB2101010	fourcc_code('X', 'R', '3', '0') /* [31:0] x:R:G:B 2:10:10:10 little endian */
+#define DRM_FORMAT_XBGR2101010	fourcc_code('X', 'B', '3', '0') /* [31:0] x:B:G:R 2:10:10:10 little endian */
+#define DRM_FORMAT_RGBX1010102	fourcc_code('R', 'X', '3', '0') /* [31:0] R:G:B:x 10:10:10:2 little endian */
+#define DRM_FORMAT_BGRX1010102	fourcc_code('B', 'X', '3', '0') /* [31:0] B:G:R:x 10:10:10:2 little endian */
+
+#define DRM_FORMAT_ARGB2101010	fourcc_code('A', 'R', '3', '0') /* [31:0] A:R:G:B 2:10:10:10 little endian */
+#define DRM_FORMAT_ABGR2101010	fourcc_code('A', 'B', '3', '0') /* [31:0] A:B:G:R 2:10:10:10 little endian */
+#define DRM_FORMAT_RGBA1010102	fourcc_code('R', 'A', '3', '0') /* [31:0] R:G:B:A 10:10:10:2 little endian */
+#define DRM_FORMAT_BGRA1010102	fourcc_code('B', 'A', '3', '0') /* [31:0] B:G:R:A 10:10:10:2 little endian */
+
+/* packed YCbCr */
+#define DRM_FORMAT_YUYV		fourcc_code('Y', 'U', 'Y', 'V') /* [31:0] Cr0:Y1:Cb0:Y0 8:8:8:8 little endian */
+#define DRM_FORMAT_YVYU		fourcc_code('Y', 'V', 'Y', 'U') /* [31:0] Cb0:Y1:Cr0:Y0 8:8:8:8 little endian */
+#define DRM_FORMAT_UYVY		fourcc_code('U', 'Y', 'V', 'Y') /* [31:0] Y1:Cr0:Y0:Cb0 8:8:8:8 little endian */
+#define DRM_FORMAT_VYUY		fourcc_code('V', 'Y', 'U', 'Y') /* [31:0] Y1:Cb0:Y0:Cr0 8:8:8:8 little endian */
+
+#define DRM_FORMAT_AYUV		fourcc_code('A', 'Y', 'U', 'V') /* [31:0] A:Y:Cb:Cr 8:8:8:8 little endian */
+
+/*
+ * 2 plane RGB + A
+ * index 0 = RGB plane, same format as the corresponding non _A8 format has
+ * index 1 = A plane, [7:0] A
+ */
+#define DRM_FORMAT_XRGB8888_A8	fourcc_code('X', 'R', 'A', '8')
+#define DRM_FORMAT_XBGR8888_A8	fourcc_code('X', 'B', 'A', '8')
+#define DRM_FORMAT_RGBX8888_A8	fourcc_code('R', 'X', 'A', '8')
+#define DRM_FORMAT_BGRX8888_A8	fourcc_code('B', 'X', 'A', '8')
+#define DRM_FORMAT_RGB888_A8	fourcc_code('R', '8', 'A', '8')
+#define DRM_FORMAT_BGR888_A8	fourcc_code('B', '8', 'A', '8')
+#define DRM_FORMAT_RGB565_A8	fourcc_code('R', '5', 'A', '8')
+#define DRM_FORMAT_BGR565_A8	fourcc_code('B', '5', 'A', '8')
+
+/*
+ * 2 plane YCbCr
+ * index 0 = Y plane, [7:0] Y
+ * index 1 = Cr:Cb plane, [15:0] Cr:Cb little endian
+ * or
+ * index 1 = Cb:Cr plane, [15:0] Cb:Cr little endian
+ */
+#define DRM_FORMAT_NV12		fourcc_code('N', 'V', '1', '2') /* 2x2 subsampled Cr:Cb plane */
+#define DRM_FORMAT_NV21		fourcc_code('N', 'V', '2', '1') /* 2x2 subsampled Cb:Cr plane */
+#define DRM_FORMAT_NV16		fourcc_code('N', 'V', '1', '6') /* 2x1 subsampled Cr:Cb plane */
+#define DRM_FORMAT_NV61		fourcc_code('N', 'V', '6', '1') /* 2x1 subsampled Cb:Cr plane */
+#define DRM_FORMAT_NV24		fourcc_code('N', 'V', '2', '4') /* non-subsampled Cr:Cb plane */
+#define DRM_FORMAT_NV42		fourcc_code('N', 'V', '4', '2') /* non-subsampled Cb:Cr plane */
+
+/*
+ * 3 plane YCbCr
+ * index 0: Y plane, [7:0] Y
+ * index 1: Cb plane, [7:0] Cb
+ * index 2: Cr plane, [7:0] Cr
+ * or
+ * index 1: Cr plane, [7:0] Cr
+ * index 2: Cb plane, [7:0] Cb
+ */
+#define DRM_FORMAT_YUV410	fourcc_code('Y', 'U', 'V', '9') /* 4x4 subsampled Cb (1) and Cr (2) planes */
+#define DRM_FORMAT_YVU410	fourcc_code('Y', 'V', 'U', '9') /* 4x4 subsampled Cr (1) and Cb (2) planes */
+#define DRM_FORMAT_YUV411	fourcc_code('Y', 'U', '1', '1') /* 4x1 subsampled Cb (1) and Cr (2) planes */
+#define DRM_FORMAT_YVU411	fourcc_code('Y', 'V', '1', '1') /* 4x1 subsampled Cr (1) and Cb (2) planes */
+#define DRM_FORMAT_YUV420	fourcc_code('Y', 'U', '1', '2') /* 2x2 subsampled Cb (1) and Cr (2) planes */
+#define DRM_FORMAT_YVU420	fourcc_code('Y', 'V', '1', '2') /* 2x2 subsampled Cr (1) and Cb (2) planes */
+#define DRM_FORMAT_YUV422	fourcc_code('Y', 'U', '1', '6') /* 2x1 subsampled Cb (1) and Cr (2) planes */
+#define DRM_FORMAT_YVU422	fourcc_code('Y', 'V', '1', '6') /* 2x1 subsampled Cr (1) and Cb (2) planes */
+#define DRM_FORMAT_YUV444	fourcc_code('Y', 'U', '2', '4') /* non-subsampled Cb (1) and Cr (2) planes */
+#define DRM_FORMAT_YVU444	fourcc_code('Y', 'V', '2', '4') /* non-subsampled Cr (1) and Cb (2) planes */
+
+
+/*
+ * Format Modifiers:
+ *
+ * Format modifiers describe, typically, a re-ordering or modification
+ * of the data in a plane of an FB.  This can be used to express tiled/
+ * swizzled formats, or compression, or a combination of the two.
+ *
+ * The upper 8 bits of the format modifier are a vendor-id as assigned
+ * below.  The lower 56 bits are assigned as vendor sees fit.
+ */
+
+/* Vendor Ids: */
+#define DRM_FORMAT_MOD_NONE           0
+#define DRM_FORMAT_MOD_VENDOR_NONE    0
+#define DRM_FORMAT_MOD_VENDOR_INTEL   0x01
+#define DRM_FORMAT_MOD_VENDOR_AMD     0x02
+#define DRM_FORMAT_MOD_VENDOR_NV      0x03
+#define DRM_FORMAT_MOD_VENDOR_SAMSUNG 0x04
+#define DRM_FORMAT_MOD_VENDOR_QCOM    0x05
+#define DRM_FORMAT_MOD_VENDOR_VIVANTE 0x06
+#define DRM_FORMAT_MOD_VENDOR_BROADCOM 0x07
+/* add more to the end as needed */
+
+#define fourcc_mod_code(vendor, val) \
+	((((__u64)DRM_FORMAT_MOD_VENDOR_## vendor) << 56) | ((val) & 0x00ffffffffffffffULL)) /* vendor id in bits 63:56, val masked to the low 56 bits */
+
+/*
+ * Format Modifier tokens:
+ *
+ * When adding a new token please document the layout with a code comment,
+ * similar to the fourcc codes above. drm_fourcc.h is considered the
+ * authoritative source for all of these.
+ */
+
+/*
+ * Linear Layout
+ *
+ * Just plain linear layout. Note that this is different from not specifying any
+ * modifier (e.g. not setting DRM_MODE_FB_MODIFIERS in the DRM_ADDFB2 ioctl),
+ * which tells the driver to also take driver-internal information into account
+ * and so might actually result in a tiled framebuffer.
+ */
+#define DRM_FORMAT_MOD_LINEAR	fourcc_mod_code(NONE, 0)
+
+/* Intel framebuffer modifiers */
+
+/*
+ * Intel X-tiling layout
+ *
+ * This is a tiled layout using 4Kb tiles (except on gen2 where the tiles are 2Kb)
+ * in row-major layout. Within the tile bytes are laid out row-major, with
+ * a platform-dependent stride. On top of that the memory can apply
+ * platform-dependent swizzling of some higher address bits into bit6.
+ *
+ * This format is highly platform-specific and not useful for cross-driver
+ * sharing. It exists since on a given platform it does uniquely identify the
+ * layout in a simple way for i915-specific userspace.
+ */
+#define I915_FORMAT_MOD_X_TILED	fourcc_mod_code(INTEL, 1)
+
+/*
+ * Intel Y-tiling layout
+ *
+ * This is a tiled layout using 4Kb tiles (except on gen2 where the tiles are 2Kb)
+ * in row-major layout. Within the tile bytes are laid out in OWORD (16 bytes)
+ * chunks column-major, with a platform-dependent height. On top of that the
+ * memory can apply platform-dependent swizzling of some higher address bits
+ * into bit6.
+ *
+ * This format is highly platform-specific and not useful for cross-driver
+ * sharing. It exists since on a given platform it does uniquely identify the
+ * layout in a simple way for i915-specific userspace.
+ */
+#define I915_FORMAT_MOD_Y_TILED	fourcc_mod_code(INTEL, 2)
+
+/*
+ * Intel Yf-tiling layout
+ *
+ * This is a tiled layout using 4Kb tiles in row-major layout.
+ * Within the tile pixels are laid out in 16 256 byte units / sub-tiles which
+ * are arranged in four groups (two wide, two high) with column-major layout.
+ * Each group therefore consists of four 256 byte units, which are also laid
+ * out as 2x2 column-major.
+ * 256 byte units are made out of four 64 byte blocks of pixels, producing
+ * either a square block or a 2:1 unit.
+ * 64 byte blocks of pixels contain four pixel rows of 16 bytes, where the width
+ * in pixel depends on the pixel depth.
+ */
+#define I915_FORMAT_MOD_Yf_TILED fourcc_mod_code(INTEL, 3)
+
+/*
+ * Tiled, NV12MT, grouped in 64 (pixels) x 32 (lines) -sized macroblocks
+ *
+ * Macroblocks are laid in a Z-shape, and each pixel data is following the
+ * standard NV12 style.
+ * As for NV12, an image is the result of two frame buffers: one for Y,
+ * one for the interleaved Cb/Cr components (1/2 the height of the Y buffer).
+ * Alignment requirements are (for each buffer):
+ * - multiple of 128 pixels for the width
+ * - multiple of  32 pixels for the height
+ *
+ * For more information: see https://linuxtv.org/downloads/v4l-dvb-apis/re32.html
+ */
+#define DRM_FORMAT_MOD_SAMSUNG_64_32_TILE	fourcc_mod_code(SAMSUNG, 1)
+
+/* Vivante framebuffer modifiers */
+
+/*
+ * Vivante 4x4 tiling layout
+ *
+ * This is a simple tiled layout using tiles of 4x4 pixels in a row-major
+ * layout.
+ */
+#define DRM_FORMAT_MOD_VIVANTE_TILED		fourcc_mod_code(VIVANTE, 1)
+
+/*
+ * Vivante 64x64 super-tiling layout
+ *
+ * This is a tiled layout using 64x64 pixel super-tiles, where each super-tile
+ * contains 8x4 groups of 2x4 tiles of 4x4 pixels (like above) each, all in row-
+ * major layout.
+ *
+ * For more information: see
+ * https://github.com/etnaviv/etna_viv/blob/master/doc/hardware.md#texture-tiling
+ */
+#define DRM_FORMAT_MOD_VIVANTE_SUPER_TILED	fourcc_mod_code(VIVANTE, 2)
+
+/*
+ * Vivante 4x4 tiling layout for dual-pipe
+ *
+ * Same as the 4x4 tiling layout, except every second 4x4 pixel tile starts at a
+ * different base address. Offsets from the base addresses are therefore halved
+ * compared to the non-split tiled layout.
+ */
+#define DRM_FORMAT_MOD_VIVANTE_SPLIT_TILED	fourcc_mod_code(VIVANTE, 3)
+
+/*
+ * Vivante 64x64 super-tiling layout for dual-pipe
+ *
+ * Same as the 64x64 super-tiling layout, except every second 4x4 pixel tile
+ * starts at a different base address. Offsets from the base addresses are
+ * therefore halved compared to the non-split super-tiled layout.
+ */
+#define DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED fourcc_mod_code(VIVANTE, 4)
+
+/* NVIDIA Tegra frame buffer modifiers */
+
+/*
+ * Some modifiers take parameters, for example the number of vertical GOBs in
+ * a block. The low 32 bits hold the parameters; the bits above hold the mode.
+ */
+#define __fourcc_mod_tegra_mode_shift 32
+#define fourcc_mod_tegra_code(val, params) \
+	fourcc_mod_code(NV, ((((__u64)(val)) << __fourcc_mod_tegra_mode_shift) | (params)))
+#define fourcc_mod_tegra_mod(m) \
+	((m) & ~((1ULL << __fourcc_mod_tegra_mode_shift) - 1))
+#define fourcc_mod_tegra_param(m) \
+	((m) & ((1ULL << __fourcc_mod_tegra_mode_shift) - 1))
+
+/*
+ * Tegra Tiled Layout, used by Tegra 2, 3 and 4.
+ *
+ * Pixels are arranged in simple tiles of 16 x 16 bytes.
+ */
+#define NV_FORMAT_MOD_TEGRA_TILED fourcc_mod_tegra_code(1, 0)
+
+/*
+ * Tegra 16Bx2 Block Linear layout, used by TK1/TX1
+ *
+ * Pixels are arranged in 64x8 Groups Of Bytes (GOBs). GOBs are then stacked
+ * vertically by a power of 2 (1 to 32 GOBs) to form a block.
+ *
+ * Within a GOB, data is ordered as 16B x 2 lines sectors laid in Z-shape.
+ *
+ * Parameter 'v' is the log2 encoding of the number of GOBs stacked vertically.
+ * Valid values are:
+ *
+ * 0 == ONE_GOB
+ * 1 == TWO_GOBS
+ * 2 == FOUR_GOBS
+ * 3 == EIGHT_GOBS
+ * 4 == SIXTEEN_GOBS
+ * 5 == THIRTYTWO_GOBS
+ *
+ * Chapter 20 "Pixel Memory Formats" of the Tegra X1 TRM describes this format
+ * in full detail.
+ */
+#define NV_FORMAT_MOD_TEGRA_16BX2_BLOCK(v) fourcc_mod_tegra_code(2, v)
+
+/*
+ * Broadcom VC4 "T" format
+ *
+ * This is the primary layout that the V3D GPU can texture from (it
+ * can't do linear).  The T format has:
+ *
+ * - 64b utiles of pixels in a raster-order grid according to cpp.  It's 4x4
+ *   pixels at 32 bit depth.
+ *
+ * - 1k subtiles made of a 4x4 raster-order grid of 64b utiles (so usually
+ *   16x16 pixels).
+ *
+ * - 4k tiles made of a 2x2 grid of 1k subtiles (so usually 32x32 pixels).  On
+ *   even 4k tile rows, they're arranged as (BL, TL, TR, BR), and on odd rows
+ *   they're (TR, BR, BL, TL), where bottom left is start of memory.
+ *
+ * - an image made of 4k tiles in rows either left-to-right (even rows of 4k
+ *   tiles) or right-to-left (odd rows of 4k tiles).
+ */
+#define DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED fourcc_mod_code(BROADCOM, 1)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* DRM_FOURCC_H */
diff --git a/include/drm-uapi/drm_mode.h b/include/drm-uapi/drm_mode.h
new file mode 100644
index 0000000..403339f
--- /dev/null
+++ b/include/drm-uapi/drm_mode.h
@@ -0,0 +1,739 @@
+/*
+ * Copyright (c) 2007 Dave Airlie <airlied@linux.ie>
+ * Copyright (c) 2007 Jakob Bornecrantz <wallbraker@gmail.com>
+ * Copyright (c) 2008 Red Hat Inc.
+ * Copyright (c) 2007-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA
+ * Copyright (c) 2007-2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _DRM_MODE_H
+#define _DRM_MODE_H
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_DISPLAY_INFO_LEN	32
+#define DRM_CONNECTOR_NAME_LEN	32
+#define DRM_DISPLAY_MODE_LEN	32
+#define DRM_PROP_NAME_LEN	32
+
+#define DRM_MODE_TYPE_BUILTIN	(1<<0)
+#define DRM_MODE_TYPE_CLOCK_C	((1<<1) | DRM_MODE_TYPE_BUILTIN)
+#define DRM_MODE_TYPE_CRTC_C	((1<<2) | DRM_MODE_TYPE_BUILTIN)
+#define DRM_MODE_TYPE_PREFERRED	(1<<3)
+#define DRM_MODE_TYPE_DEFAULT	(1<<4)
+#define DRM_MODE_TYPE_USERDEF	(1<<5)
+#define DRM_MODE_TYPE_DRIVER	(1<<6)
+
+/* Video mode flags */
+/* bit compatible with the xrandr RR_ definitions (bits 0-13)
+ *
+ * ABI warning: Existing userspace really expects
+ * the mode flags to match the xrandr definitions. Any
+ * changes that don't match the xrandr definitions will
+ * likely need a new client cap or some other mechanism
+ * to avoid breaking existing userspace. This includes
+ * allocating new flags in the previously unused bits!
+ */
+#define DRM_MODE_FLAG_PHSYNC			(1<<0)
+#define DRM_MODE_FLAG_NHSYNC			(1<<1)
+#define DRM_MODE_FLAG_PVSYNC			(1<<2)
+#define DRM_MODE_FLAG_NVSYNC			(1<<3)
+#define DRM_MODE_FLAG_INTERLACE			(1<<4)
+#define DRM_MODE_FLAG_DBLSCAN			(1<<5)
+#define DRM_MODE_FLAG_CSYNC			(1<<6)
+#define DRM_MODE_FLAG_PCSYNC			(1<<7)
+#define DRM_MODE_FLAG_NCSYNC			(1<<8)
+#define DRM_MODE_FLAG_HSKEW			(1<<9) /* hskew provided */
+#define DRM_MODE_FLAG_BCAST			(1<<10)
+#define DRM_MODE_FLAG_PIXMUX			(1<<11)
+#define DRM_MODE_FLAG_DBLCLK			(1<<12)
+#define DRM_MODE_FLAG_CLKDIV2			(1<<13)
+ /*
+  * When adding a new stereo mode don't forget to adjust DRM_MODE_FLAGS_3D_MAX
+  * (define not exposed to user space).
+  */
+#define DRM_MODE_FLAG_3D_MASK			(0x1f<<14)
+#define  DRM_MODE_FLAG_3D_NONE		(0<<14)
+#define  DRM_MODE_FLAG_3D_FRAME_PACKING		(1<<14)
+#define  DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE	(2<<14)
+#define  DRM_MODE_FLAG_3D_LINE_ALTERNATIVE	(3<<14)
+#define  DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL	(4<<14)
+#define  DRM_MODE_FLAG_3D_L_DEPTH		(5<<14)
+#define  DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH	(6<<14)
+#define  DRM_MODE_FLAG_3D_TOP_AND_BOTTOM	(7<<14)
+#define  DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF	(8<<14)
+
+/* Picture aspect ratio options */
+#define DRM_MODE_PICTURE_ASPECT_NONE		0
+#define DRM_MODE_PICTURE_ASPECT_4_3		1
+#define DRM_MODE_PICTURE_ASPECT_16_9		2
+
+/* Aspect ratio flag bitmask (4 bits 22:19) */
+#define DRM_MODE_FLAG_PIC_AR_MASK		(0x0F<<19)
+#define  DRM_MODE_FLAG_PIC_AR_NONE \
+			(DRM_MODE_PICTURE_ASPECT_NONE<<19)
+#define  DRM_MODE_FLAG_PIC_AR_4_3 \
+			(DRM_MODE_PICTURE_ASPECT_4_3<<19)
+#define  DRM_MODE_FLAG_PIC_AR_16_9 \
+			(DRM_MODE_PICTURE_ASPECT_16_9<<19)
+
+/* DPMS flags */
+/* bit compatible with the xorg definitions. */
+#define DRM_MODE_DPMS_ON	0
+#define DRM_MODE_DPMS_STANDBY	1
+#define DRM_MODE_DPMS_SUSPEND	2
+#define DRM_MODE_DPMS_OFF	3
+
+/* Scaling mode options */
+#define DRM_MODE_SCALE_NONE		0 /* Unmodified timing (display or
+					     software can still scale) */
+#define DRM_MODE_SCALE_FULLSCREEN	1 /* Full screen, ignore aspect */
+#define DRM_MODE_SCALE_CENTER		2 /* Centered, no scaling */
+#define DRM_MODE_SCALE_ASPECT		3 /* Full screen, preserve aspect */
+
+/* Dithering mode options */
+#define DRM_MODE_DITHERING_OFF	0
+#define DRM_MODE_DITHERING_ON	1
+#define DRM_MODE_DITHERING_AUTO 2
+
+/* Dirty info options */
+#define DRM_MODE_DIRTY_OFF      0
+#define DRM_MODE_DIRTY_ON       1
+#define DRM_MODE_DIRTY_ANNOTATE 2
+
+/* Link Status options */
+#define DRM_MODE_LINK_STATUS_GOOD	0
+#define DRM_MODE_LINK_STATUS_BAD	1
+
+/*
+ * DRM_MODE_ROTATE_<degrees>
+ *
+ * Signals that a drm plane is rotated <degrees> degrees in the
+ * counter-clockwise direction.
+ *
+ * This define is provided as a convenience, looking up the property id
+ * using the name->prop id lookup is the preferred method.
+ */
+#define DRM_MODE_ROTATE_0       (1<<0)
+#define DRM_MODE_ROTATE_90      (1<<1)
+#define DRM_MODE_ROTATE_180     (1<<2)
+#define DRM_MODE_ROTATE_270     (1<<3)
+
+/*
+ * DRM_MODE_ROTATE_MASK
+ *
+ * Bitmask used to look for drm plane rotations.
+ */
+#define DRM_MODE_ROTATE_MASK (\
+		DRM_MODE_ROTATE_0  | \
+		DRM_MODE_ROTATE_90  | \
+		DRM_MODE_ROTATE_180 | \
+		DRM_MODE_ROTATE_270)
+
+/*
+ * DRM_MODE_REFLECT_<axis>
+ *
+ * Signals that the contents of a drm plane is reflected in the <axis> axis,
+ * in the same way as mirroring.
+ *
+ * This define is provided as a convenience, looking up the property id
+ * using the name->prop id lookup is the preferred method.
+ */
+#define DRM_MODE_REFLECT_X      (1<<4)
+#define DRM_MODE_REFLECT_Y      (1<<5)
+
+/*
+ * DRM_MODE_REFLECT_MASK
+ *
+ * Bitmask used to look for drm plane reflections.
+ */
+#define DRM_MODE_REFLECT_MASK (\
+		DRM_MODE_REFLECT_X | \
+		DRM_MODE_REFLECT_Y)
+
+
+struct drm_mode_modeinfo {
+	__u32 clock;
+	__u16 hdisplay;
+	__u16 hsync_start;
+	__u16 hsync_end;
+	__u16 htotal;
+	__u16 hskew;
+	__u16 vdisplay;
+	__u16 vsync_start;
+	__u16 vsync_end;
+	__u16 vtotal;
+	__u16 vscan;
+
+	__u32 vrefresh;
+
+	__u32 flags;
+	__u32 type;
+	char name[DRM_DISPLAY_MODE_LEN];
+};
+
+struct drm_mode_card_res {
+	__u64 fb_id_ptr;
+	__u64 crtc_id_ptr;
+	__u64 connector_id_ptr;
+	__u64 encoder_id_ptr;
+	__u32 count_fbs;
+	__u32 count_crtcs;
+	__u32 count_connectors;
+	__u32 count_encoders;
+	__u32 min_width;
+	__u32 max_width;
+	__u32 min_height;
+	__u32 max_height;
+};
+
+struct drm_mode_crtc {
+	__u64 set_connectors_ptr;
+	__u32 count_connectors;
+
+	__u32 crtc_id; /**< Id */
+	__u32 fb_id; /**< Id of framebuffer */
+
+	__u32 x; /**< x Position on the framebuffer */
+	__u32 y; /**< y Position on the framebuffer */
+
+	__u32 gamma_size;
+	__u32 mode_valid;
+	struct drm_mode_modeinfo mode;
+};
+
+#define DRM_MODE_PRESENT_TOP_FIELD	(1<<0)
+#define DRM_MODE_PRESENT_BOTTOM_FIELD	(1<<1)
+
+/* Planes blend with or override other bits on the CRTC */
+struct drm_mode_set_plane {
+	__u32 plane_id;
+	__u32 crtc_id;
+	__u32 fb_id; /* fb object contains surface format type */
+	__u32 flags; /* see above flags */
+
+	/* Signed dest location allows it to be partially off screen */
+	__s32 crtc_x;
+	__s32 crtc_y;
+	__u32 crtc_w;
+	__u32 crtc_h;
+
+	/* Source values are 16.16 fixed point */
+	__u32 src_x;
+	__u32 src_y;
+	__u32 src_h;
+	__u32 src_w;
+};
+
+struct drm_mode_get_plane {
+	__u32 plane_id;
+
+	__u32 crtc_id;
+	__u32 fb_id;
+
+	__u32 possible_crtcs;
+	__u32 gamma_size;
+
+	__u32 count_format_types;
+	__u64 format_type_ptr;
+};
+
+struct drm_mode_get_plane_res {
+	__u64 plane_id_ptr;
+	__u32 count_planes;
+};
+
+#define DRM_MODE_ENCODER_NONE	0
+#define DRM_MODE_ENCODER_DAC	1
+#define DRM_MODE_ENCODER_TMDS	2
+#define DRM_MODE_ENCODER_LVDS	3
+#define DRM_MODE_ENCODER_TVDAC	4
+#define DRM_MODE_ENCODER_VIRTUAL 5
+#define DRM_MODE_ENCODER_DSI	6
+#define DRM_MODE_ENCODER_DPMST	7
+#define DRM_MODE_ENCODER_DPI	8
+
+struct drm_mode_get_encoder {
+	__u32 encoder_id;
+	__u32 encoder_type;
+
+	__u32 crtc_id; /**< Id of crtc */
+
+	__u32 possible_crtcs;
+	__u32 possible_clones;
+};
+
+/* This is for connectors with multiple signal types. */
+/* Try to match DRM_MODE_CONNECTOR_X as closely as possible. */
+enum drm_mode_subconnector {
+	DRM_MODE_SUBCONNECTOR_Automatic = 0,
+	DRM_MODE_SUBCONNECTOR_Unknown = 0,
+	DRM_MODE_SUBCONNECTOR_DVID = 3,
+	DRM_MODE_SUBCONNECTOR_DVIA = 4,
+	DRM_MODE_SUBCONNECTOR_Composite = 5,
+	DRM_MODE_SUBCONNECTOR_SVIDEO = 6,
+	DRM_MODE_SUBCONNECTOR_Component = 8,
+	DRM_MODE_SUBCONNECTOR_SCART = 9,
+};
+
+#define DRM_MODE_CONNECTOR_Unknown	0
+#define DRM_MODE_CONNECTOR_VGA		1
+#define DRM_MODE_CONNECTOR_DVII		2
+#define DRM_MODE_CONNECTOR_DVID		3
+#define DRM_MODE_CONNECTOR_DVIA		4
+#define DRM_MODE_CONNECTOR_Composite	5
+#define DRM_MODE_CONNECTOR_SVIDEO	6
+#define DRM_MODE_CONNECTOR_LVDS		7
+#define DRM_MODE_CONNECTOR_Component	8
+#define DRM_MODE_CONNECTOR_9PinDIN	9
+#define DRM_MODE_CONNECTOR_DisplayPort	10
+#define DRM_MODE_CONNECTOR_HDMIA	11
+#define DRM_MODE_CONNECTOR_HDMIB	12
+#define DRM_MODE_CONNECTOR_TV		13
+#define DRM_MODE_CONNECTOR_eDP		14
+#define DRM_MODE_CONNECTOR_VIRTUAL      15
+#define DRM_MODE_CONNECTOR_DSI		16
+#define DRM_MODE_CONNECTOR_DPI		17
+
+struct drm_mode_get_connector {
+
+	__u64 encoders_ptr;
+	__u64 modes_ptr;
+	__u64 props_ptr;
+	__u64 prop_values_ptr;
+
+	__u32 count_modes;
+	__u32 count_props;
+	__u32 count_encoders;
+
+	__u32 encoder_id; /**< Current Encoder */
+	__u32 connector_id; /**< Id */
+	__u32 connector_type;
+	__u32 connector_type_id;
+
+	__u32 connection;
+	__u32 mm_width;  /**< width in millimeters */
+	__u32 mm_height; /**< height in millimeters */
+	__u32 subpixel;
+
+	__u32 pad;
+};
+
+#define DRM_MODE_PROP_PENDING	(1<<0)
+#define DRM_MODE_PROP_RANGE	(1<<1)
+#define DRM_MODE_PROP_IMMUTABLE	(1<<2)
+#define DRM_MODE_PROP_ENUM	(1<<3) /* enumerated type with text strings */
+#define DRM_MODE_PROP_BLOB	(1<<4)
+#define DRM_MODE_PROP_BITMASK	(1<<5) /* bitmask of enumerated types */
+
+/* non-extended types: legacy bitmask, one bit per type: */
+#define DRM_MODE_PROP_LEGACY_TYPE  ( \
+		DRM_MODE_PROP_RANGE | \
+		DRM_MODE_PROP_ENUM | \
+		DRM_MODE_PROP_BLOB | \
+		DRM_MODE_PROP_BITMASK)
+
+/* extended-types: rather than continue to consume a bit per type,
+ * grab a chunk of the bits to use as integer type id.
+ */
+#define DRM_MODE_PROP_EXTENDED_TYPE	0x0000ffc0
+#define DRM_MODE_PROP_TYPE(n)		((n) << 6)
+#define DRM_MODE_PROP_OBJECT		DRM_MODE_PROP_TYPE(1)
+#define DRM_MODE_PROP_SIGNED_RANGE	DRM_MODE_PROP_TYPE(2)
+
+/* the PROP_ATOMIC flag is used to hide properties from userspace that
+ * is not aware of atomic properties.  This is mostly to work around
+ * older userspace (DDX drivers) that read/write each prop they find,
+ * without being aware that this could be triggering a lengthy modeset.
+ */
+#define DRM_MODE_PROP_ATOMIC        0x80000000
+
+struct drm_mode_property_enum {
+	__u64 value;
+	char name[DRM_PROP_NAME_LEN];
+};
+
+struct drm_mode_get_property {
+	__u64 values_ptr; /* values and blob lengths */
+	__u64 enum_blob_ptr; /* enum and blob id ptrs */
+
+	__u32 prop_id;
+	__u32 flags;
+	char name[DRM_PROP_NAME_LEN];
+
+	__u32 count_values;
+	/* This is only used to count enum values, not blobs. The _blobs is
+	 * simply because of a historical reason, i.e. backwards compat. */
+	__u32 count_enum_blobs;
+};
+
+struct drm_mode_connector_set_property {
+	__u64 value;
+	__u32 prop_id;
+	__u32 connector_id;
+};
+
+#define DRM_MODE_OBJECT_CRTC 0xcccccccc
+#define DRM_MODE_OBJECT_CONNECTOR 0xc0c0c0c0
+#define DRM_MODE_OBJECT_ENCODER 0xe0e0e0e0
+#define DRM_MODE_OBJECT_MODE 0xdededede
+#define DRM_MODE_OBJECT_PROPERTY 0xb0b0b0b0
+#define DRM_MODE_OBJECT_FB 0xfbfbfbfb
+#define DRM_MODE_OBJECT_BLOB 0xbbbbbbbb
+#define DRM_MODE_OBJECT_PLANE 0xeeeeeeee
+#define DRM_MODE_OBJECT_ANY 0
+
+struct drm_mode_obj_get_properties {
+	__u64 props_ptr;
+	__u64 prop_values_ptr;
+	__u32 count_props;
+	__u32 obj_id;
+	__u32 obj_type;
+};
+
+struct drm_mode_obj_set_property {
+	__u64 value;
+	__u32 prop_id;
+	__u32 obj_id;
+	__u32 obj_type;
+};
+
+struct drm_mode_get_blob {
+	__u32 blob_id;
+	__u32 length;
+	__u64 data;
+};
+
+struct drm_mode_fb_cmd {
+	__u32 fb_id;
+	__u32 width;
+	__u32 height;
+	__u32 pitch;
+	__u32 bpp;
+	__u32 depth;
+	/* driver specific handle */
+	__u32 handle;
+};
+
+#define DRM_MODE_FB_INTERLACED	(1<<0) /* for interlaced framebuffers */
+#define DRM_MODE_FB_MODIFIERS	(1<<1) /* enables ->modifier[] */
+
+struct drm_mode_fb_cmd2 {
+	__u32 fb_id;
+	__u32 width;
+	__u32 height;
+	__u32 pixel_format; /* fourcc code from drm_fourcc.h */
+	__u32 flags; /* see above flags */
+
+	/*
+	 * In case of planar formats, this ioctl allows up to 4
+	 * buffer objects with offsets and pitches per plane.
+	 * The pitch and offset order is dictated by the fourcc,
+	 * e.g. NV12 (http://fourcc.org/yuv.php#NV12) is described as:
+	 *
+	 *   YUV 4:2:0 image with a plane of 8 bit Y samples
+	 *   followed by an interleaved U/V plane containing
+	 *   8 bit 2x2 subsampled colour difference samples.
+	 *
+	 * So it would consist of Y as offsets[0] and UV as
+	 * offsets[1].  Note that offsets[0] will generally
+	 * be 0 (but this is not required).
+	 *
+	 * To accommodate tiled, compressed, etc formats, a
+	 * modifier can be specified.  The default value of zero
+	 * indicates "native" format as specified by the fourcc.
+	 * Vendor specific modifier token.  Note that even though
+	 * it looks like we have a modifier per-plane, we in fact
+	 * do not. The modifier for each plane must be identical.
+	 * Thus all combinations of different data layouts for
+	 * multi plane formats must be enumerated as separate
+	 * modifiers.
+	 */
+	__u32 handles[4];
+	__u32 pitches[4]; /* pitch for each plane */
+	__u32 offsets[4]; /* offset of each plane */
+	__u64 modifier[4]; /* ie, tiling, compress */
+};
+
+#define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01
+#define DRM_MODE_FB_DIRTY_ANNOTATE_FILL 0x02
+#define DRM_MODE_FB_DIRTY_FLAGS         0x03
+
+#define DRM_MODE_FB_DIRTY_MAX_CLIPS     256
+
+/*
+ * Mark a region of a framebuffer as dirty.
+ *
+ * Some hardware does not automatically update display contents
+ * as a hardware or software draw to a framebuffer. This ioctl
+ * allows userspace to tell the kernel and the hardware what
+ * regions of the framebuffer have changed.
+ *
+ * The kernel or hardware is free to update more than just the
+ * region specified by the clip rects. The kernel or hardware
+ * may also delay and/or coalesce several calls to dirty into a
+ * single update.
+ *
+ * Userspace may annotate the updates, the annotates are a
+ * promise made by the caller that the change is either a copy
+ * of pixels or a fill of a single color in the region specified.
+ *
+ * If the DRM_MODE_FB_DIRTY_ANNOTATE_COPY flag is given then
+ * the number of updated regions are half of num_clips given,
+ * where the clip rects are paired in src and dst. The width and
+ * height of each one of the pairs must match.
+ *
+ * If the DRM_MODE_FB_DIRTY_ANNOTATE_FILL flag is given the caller
+ * promises that the region specified of the clip rects is filled
+ * completely with a single color as given in the color argument.
+ */
+
+struct drm_mode_fb_dirty_cmd {
+	__u32 fb_id; /* framebuffer whose regions are marked dirty */
+	__u32 flags; /* DRM_MODE_FB_DIRTY_ANNOTATE_* */
+	__u32 color; /* fill color for DRM_MODE_FB_DIRTY_ANNOTATE_FILL */
+	__u32 num_clips; /* number of clip rects */
+	__u64 clips_ptr; /* presumably a user pointer to the clip rects; confirm */
+};
+
+struct drm_mode_mode_cmd {
+	__u32 connector_id;
+	struct drm_mode_modeinfo mode;
+};
+
+#define DRM_MODE_CURSOR_BO	0x01
+#define DRM_MODE_CURSOR_MOVE	0x02
+#define DRM_MODE_CURSOR_FLAGS	0x03
+
+/*
+ * depending on the value in flags different members are used.
+ *
+ * CURSOR_BO uses
+ *    crtc_id
+ *    width
+ *    height
+ *    handle - if 0 turns the cursor off
+ *
+ * CURSOR_MOVE uses
+ *    crtc_id
+ *    x
+ *    y
+ */
+struct drm_mode_cursor {
+	__u32 flags; /* DRM_MODE_CURSOR_*; selects which members are used */
+	__u32 crtc_id; /* used by CURSOR_BO and CURSOR_MOVE */
+	__s32 x; /* used by CURSOR_MOVE */
+	__s32 y; /* used by CURSOR_MOVE */
+	__u32 width; /* used by CURSOR_BO */
+	__u32 height; /* used by CURSOR_BO */
+	/* driver specific handle; used by CURSOR_BO, 0 turns the cursor off */
+	__u32 handle;
+};
+
+struct drm_mode_cursor2 {
+	__u32 flags;
+	__u32 crtc_id;
+	__s32 x;
+	__s32 y;
+	__u32 width;
+	__u32 height;
+	/* driver specific handle */
+	__u32 handle;
+	__s32 hot_x;
+	__s32 hot_y;
+};
+
+struct drm_mode_crtc_lut {
+	__u32 crtc_id;
+	__u32 gamma_size;
+
+	/* pointers to arrays */
+	__u64 red;
+	__u64 green;
+	__u64 blue;
+};
+
+struct drm_color_ctm {
+	/* Conversion matrix in S31.32 format. */
+	__s64 matrix[9];
+};
+
+struct drm_color_lut {
+	/*
+	 * Data is U0.16 fixed point format.
+	 */
+	__u16 red;
+	__u16 green;
+	__u16 blue;
+	__u16 reserved;
+};
+
+#define DRM_MODE_PAGE_FLIP_EVENT 0x01
+#define DRM_MODE_PAGE_FLIP_ASYNC 0x02
+#define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4
+#define DRM_MODE_PAGE_FLIP_TARGET_RELATIVE 0x8
+#define DRM_MODE_PAGE_FLIP_TARGET (DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE | \
+				   DRM_MODE_PAGE_FLIP_TARGET_RELATIVE)
+#define DRM_MODE_PAGE_FLIP_FLAGS (DRM_MODE_PAGE_FLIP_EVENT | \
+				  DRM_MODE_PAGE_FLIP_ASYNC | \
+				  DRM_MODE_PAGE_FLIP_TARGET)
+
+/*
+ * Request a page flip on the specified crtc.
+ *
+ * This ioctl will ask KMS to schedule a page flip for the specified
+ * crtc.  Once any pending rendering targeting the specified fb (as of
+ * ioctl time) has completed, the crtc will be reprogrammed to display
+ * that fb after the next vertical refresh.  The ioctl returns
+ * immediately, but subsequent rendering to the current fb will block
+ * in the execbuffer ioctl until the page flip happens.  If a page
+ * flip is already pending as the ioctl is called, EBUSY will be
+ * returned.
+ *
+ * Flag DRM_MODE_PAGE_FLIP_EVENT requests that drm sends back a vblank
+ * event (see drm.h: struct drm_event_vblank) when the page flip is
+ * done.  The user_data field passed in with this ioctl will be
+ * returned as the user_data field in the vblank event struct.
+ *
+ * Flag DRM_MODE_PAGE_FLIP_ASYNC requests that the flip happen
+ * 'as soon as possible', meaning that it does not wait for vblank.
+ * This may cause tearing on the screen.
+ *
+ * The reserved field must be zero.
+ */
+
+struct drm_mode_crtc_page_flip {
+	__u32 crtc_id; /* crtc to flip */
+	__u32 fb_id; /* fb to display after the flip */
+	__u32 flags; /* DRM_MODE_PAGE_FLIP_* flags */
+	__u32 reserved; /* must be zero */
+	__u64 user_data; /* returned as user_data in the vblank event */
+};
+
+/*
+ * Request a page flip on the specified crtc.
+ *
+ * Same as struct drm_mode_crtc_page_flip, but supports new flags and
+ * re-purposes the reserved field:
+ *
+ * The sequence field must be zero unless either of the
+ * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is specified. When
+ * the ABSOLUTE flag is specified, the sequence field denotes the absolute
+ * vblank sequence when the flip should take effect. When the RELATIVE
+ * flag is specified, the sequence field denotes the relative (to the
+ * current one when the ioctl is called) vblank sequence when the flip
+ * should take effect. NOTE: DRM_IOCTL_WAIT_VBLANK must still be used to
+ * make sure the vblank sequence before the target one has passed before
+ * calling this ioctl. The purpose of the
+ * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is merely to clarify
+ * the target for when code dealing with a page flip runs during a
+ * vertical blank period.
+ */
+
+struct drm_mode_crtc_page_flip_target {
+	__u32 crtc_id;
+	__u32 fb_id;
+	__u32 flags; /* DRM_MODE_PAGE_FLIP_* flags, including TARGET_* */
+	__u32 sequence; /* target vblank; 0 unless a TARGET_* flag is set */
+	__u64 user_data;
+};
+
+/* create a dumb scanout buffer */
+struct drm_mode_create_dumb {
+	__u32 height;
+	__u32 width;
+	__u32 bpp;
+	__u32 flags;
+	/* handle, pitch, size will be returned */
+	__u32 handle;
+	__u32 pitch;
+	__u64 size;
+};
+
+/* set up for mmap of a dumb scanout buffer */
+struct drm_mode_map_dumb {
+	/** Handle for the object being mapped. */
+	__u32 handle;
+	__u32 pad;
+	/**
+	 * Fake offset to use for subsequent mmap call
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 offset;
+};
+
+struct drm_mode_destroy_dumb {
+	__u32 handle;
+};
+
+/* page-flip flags are valid, plus: */
+#define DRM_MODE_ATOMIC_TEST_ONLY 0x0100
+#define DRM_MODE_ATOMIC_NONBLOCK  0x0200
+#define DRM_MODE_ATOMIC_ALLOW_MODESET 0x0400
+
+#define DRM_MODE_ATOMIC_FLAGS (\
+		DRM_MODE_PAGE_FLIP_EVENT |\
+		DRM_MODE_PAGE_FLIP_ASYNC |\
+		DRM_MODE_ATOMIC_TEST_ONLY |\
+		DRM_MODE_ATOMIC_NONBLOCK |\
+		DRM_MODE_ATOMIC_ALLOW_MODESET)
+
+struct drm_mode_atomic {
+	__u32 flags;
+	__u32 count_objs;
+	__u64 objs_ptr;
+	__u64 count_props_ptr;
+	__u64 props_ptr;
+	__u64 prop_values_ptr;
+	__u64 reserved;
+	__u64 user_data;
+};
+
+/**
+ * Create a new 'blob' data property, copying length bytes from data pointer,
+ * and returning new blob ID.
+ */
+struct drm_mode_create_blob {
+	/** Pointer to data to copy. */
+	__u64 data;
+	/** Length of data to copy. */
+	__u32 length;
+	/** Return: new property ID. */
+	__u32 blob_id;
+};
+
+/**
+ * Destroy a user-created blob property.
+ */
+struct drm_mode_destroy_blob {
+	__u32 blob_id;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/include/drm-uapi/i915_drm.h b/include/drm-uapi/i915_drm.h
new file mode 100644
index 0000000..e1cb682
--- /dev/null
+++ b/include/drm-uapi/i915_drm.h
@@ -0,0 +1,1475 @@
+/*
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _I915_DRM_H_
+#define _I915_DRM_H_
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Please note that modifications to all structs defined here are
+ * subject to backwards-compatibility constraints.
+ */
+
+/**
+ * DOC: uevents generated by i915 on its device node
+ *
+ * I915_L3_PARITY_UEVENT - Generated when the driver receives a parity mismatch
+ *	event from the gpu l3 cache. Additional information supplied is ROW,
+ *	BANK, SUBBANK, SLICE of the affected cacheline. Userspace should keep
+ *	track of these events and if a specific cache-line seems to have a
+ *	persistent error remap it with the l3 remapping tool supplied in
+ *	intel-gpu-tools.  The value supplied with the event is always 1.
+ *
+ * I915_ERROR_UEVENT - Generated upon error detection, currently only via
+ *	hangcheck. The error detection event is a good indicator of when things
+ *	began to go badly. The value supplied with the event is a 1 upon error
+ *	detection, and a 0 upon reset completion, signifying no more error
+ *	exists. NOTE: Disabling hangcheck or reset via module parameter will
+ *	cause the related events to not be seen.
+ *
+ * I915_RESET_UEVENT - Event is generated just before an attempt to reset
+ *	the GPU. The value supplied with the event is always 1. NOTE: Disabling
+ *	reset via module parameter will cause this event to not be seen.
+ */
+#define I915_L3_PARITY_UEVENT		"L3_PARITY_ERROR"
+#define I915_ERROR_UEVENT		"ERROR"
+#define I915_RESET_UEVENT		"RESET"
+
+/*
+ * MOCS indexes used for GPU surfaces, defining the cacheability of the
+ * surface data and the coherency for this data wrt. CPU vs. GPU accesses.
+ */
+enum i915_mocs_table_index {
+	/*
+	 * Not cached anywhere, coherency between CPU and GPU accesses is
+	 * guaranteed.
+	 */
+	I915_MOCS_UNCACHED,
+	/*
+	 * Cacheability and coherency controlled by the kernel automatically
+	 * based on the DRM_I915_GEM_SET_CACHING IOCTL setting and the current
+	 * usage of the surface (used for display scanout or not).
+	 */
+	I915_MOCS_PTE,
+	/*
+	 * Cached in all GPU caches available on the platform.
+	 * Coherency between CPU and GPU accesses to the surface is not
+	 * guaranteed without extra synchronization.
+	 */
+	I915_MOCS_CACHED,
+};
+
+/* Each region is a minimum of 16k, and there are at most 255 of them.
+ */
+#define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use
+				 * of chars for next/prev indices */
+#define I915_LOG_MIN_TEX_REGION_SIZE 14
+
+typedef struct _drm_i915_init {
+	enum {
+		I915_INIT_DMA = 0x01,
+		I915_CLEANUP_DMA = 0x02,
+		I915_RESUME_DMA = 0x03
+	} func;
+	unsigned int mmio_offset;
+	int sarea_priv_offset;
+	unsigned int ring_start;
+	unsigned int ring_end;
+	unsigned int ring_size;
+	unsigned int front_offset;
+	unsigned int back_offset;
+	unsigned int depth_offset;
+	unsigned int w;
+	unsigned int h;
+	unsigned int pitch;
+	unsigned int pitch_bits;
+	unsigned int back_pitch;
+	unsigned int depth_pitch;
+	unsigned int cpp;
+	unsigned int chipset;
+} drm_i915_init_t;
+
+typedef struct _drm_i915_sarea {
+	struct drm_tex_region texList[I915_NR_TEX_REGIONS + 1];
+	int last_upload;	/* last time texture was uploaded */
+	int last_enqueue;	/* last time a buffer was enqueued */
+	int last_dispatch;	/* age of the most recently dispatched buffer */
+	int ctxOwner;		/* last context to upload state */
+	int texAge;
+	int pf_enabled;		/* is pageflipping allowed? */
+	int pf_active;
+	int pf_current_page;	/* which buffer is being displayed? */
+	int perf_boxes;		/* performance boxes to be displayed */
+	int width, height;      /* screen size in pixels */
+
+	drm_handle_t front_handle;
+	int front_offset;
+	int front_size;
+
+	drm_handle_t back_handle;
+	int back_offset;
+	int back_size;
+
+	drm_handle_t depth_handle;
+	int depth_offset;
+	int depth_size;
+
+	drm_handle_t tex_handle;
+	int tex_offset;
+	int tex_size;
+	int log_tex_granularity;
+	int pitch;
+	int rotation;           /* 0, 90, 180 or 270 */
+	int rotated_offset;
+	int rotated_size;
+	int rotated_pitch;
+	int virtualX, virtualY;
+
+	unsigned int front_tiled;
+	unsigned int back_tiled;
+	unsigned int depth_tiled;
+	unsigned int rotated_tiled;
+	unsigned int rotated2_tiled;
+
+	int pipeA_x;
+	int pipeA_y;
+	int pipeA_w;
+	int pipeA_h;
+	int pipeB_x;
+	int pipeB_y;
+	int pipeB_w;
+	int pipeB_h;
+
+	/* fill out some space for old userspace triple buffer */
+	drm_handle_t unused_handle;
+	__u32 unused1, unused2, unused3;
+
+	/* buffer object handles for static buffers. May change
+	 * over the lifetime of the client.
+	 */
+	__u32 front_bo_handle;
+	__u32 back_bo_handle;
+	__u32 unused_bo_handle;
+	__u32 depth_bo_handle;
+
+} drm_i915_sarea_t;
+
+/* due to userspace building against these headers we need some compat here */
+#define planeA_x pipeA_x
+#define planeA_y pipeA_y
+#define planeA_w pipeA_w
+#define planeA_h pipeA_h
+#define planeB_x pipeB_x
+#define planeB_y pipeB_y
+#define planeB_w pipeB_w
+#define planeB_h pipeB_h
+
+/* Flags for perf_boxes
+ */
+#define I915_BOX_RING_EMPTY    0x1
+#define I915_BOX_FLIP          0x2
+#define I915_BOX_WAIT          0x4
+#define I915_BOX_TEXTURE_LOAD  0x8
+#define I915_BOX_LOST_CONTEXT  0x10
+
+/*
+ * i915 specific ioctls.
+ *
+ * The device specific ioctl range is [DRM_COMMAND_BASE, DRM_COMMAND_END) ie
+ * [0x40, 0xa0) (a0 is excluded). The numbers below are defined as offset
+ * against DRM_COMMAND_BASE and should be between [0x0, 0x60).
+ */
+#define DRM_I915_INIT		0x00
+#define DRM_I915_FLUSH		0x01
+#define DRM_I915_FLIP		0x02
+#define DRM_I915_BATCHBUFFER	0x03
+#define DRM_I915_IRQ_EMIT	0x04
+#define DRM_I915_IRQ_WAIT	0x05
+#define DRM_I915_GETPARAM	0x06
+#define DRM_I915_SETPARAM	0x07
+#define DRM_I915_ALLOC		0x08
+#define DRM_I915_FREE		0x09
+#define DRM_I915_INIT_HEAP	0x0a
+#define DRM_I915_CMDBUFFER	0x0b
+#define DRM_I915_DESTROY_HEAP	0x0c
+#define DRM_I915_SET_VBLANK_PIPE	0x0d
+#define DRM_I915_GET_VBLANK_PIPE	0x0e
+#define DRM_I915_VBLANK_SWAP	0x0f
+#define DRM_I915_HWS_ADDR	0x11
+#define DRM_I915_GEM_INIT	0x13
+#define DRM_I915_GEM_EXECBUFFER	0x14
+#define DRM_I915_GEM_PIN	0x15
+#define DRM_I915_GEM_UNPIN	0x16
+#define DRM_I915_GEM_BUSY	0x17
+#define DRM_I915_GEM_THROTTLE	0x18
+#define DRM_I915_GEM_ENTERVT	0x19
+#define DRM_I915_GEM_LEAVEVT	0x1a
+#define DRM_I915_GEM_CREATE	0x1b
+#define DRM_I915_GEM_PREAD	0x1c
+#define DRM_I915_GEM_PWRITE	0x1d
+#define DRM_I915_GEM_MMAP	0x1e
+#define DRM_I915_GEM_SET_DOMAIN	0x1f
+#define DRM_I915_GEM_SW_FINISH	0x20
+#define DRM_I915_GEM_SET_TILING	0x21
+#define DRM_I915_GEM_GET_TILING	0x22
+#define DRM_I915_GEM_GET_APERTURE 0x23
+#define DRM_I915_GEM_MMAP_GTT	0x24
+#define DRM_I915_GET_PIPE_FROM_CRTC_ID	0x25
+#define DRM_I915_GEM_MADVISE	0x26
+#define DRM_I915_OVERLAY_PUT_IMAGE	0x27
+#define DRM_I915_OVERLAY_ATTRS	0x28
+#define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_EXECBUFFER2_WR	DRM_I915_GEM_EXECBUFFER2
+#define DRM_I915_GET_SPRITE_COLORKEY	0x2a
+#define DRM_I915_SET_SPRITE_COLORKEY	0x2b
+#define DRM_I915_GEM_WAIT	0x2c
+#define DRM_I915_GEM_CONTEXT_CREATE	0x2d
+#define DRM_I915_GEM_CONTEXT_DESTROY	0x2e
+#define DRM_I915_GEM_SET_CACHING	0x2f
+#define DRM_I915_GEM_GET_CACHING	0x30
+#define DRM_I915_REG_READ		0x31
+#define DRM_I915_GET_RESET_STATS	0x32
+#define DRM_I915_GEM_USERPTR		0x33
+#define DRM_I915_GEM_CONTEXT_GETPARAM	0x34
+#define DRM_I915_GEM_CONTEXT_SETPARAM	0x35
+#define DRM_I915_PERF_OPEN		0x36
+
+#define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
+#define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
+#define DRM_IOCTL_I915_FLIP		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLIP)
+#define DRM_IOCTL_I915_BATCHBUFFER	DRM_IOW( DRM_COMMAND_BASE + DRM_I915_BATCHBUFFER, drm_i915_batchbuffer_t)
+#define DRM_IOCTL_I915_IRQ_EMIT         DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_IRQ_EMIT, drm_i915_irq_emit_t)
+#define DRM_IOCTL_I915_IRQ_WAIT         DRM_IOW( DRM_COMMAND_BASE + DRM_I915_IRQ_WAIT, drm_i915_irq_wait_t)
+#define DRM_IOCTL_I915_GETPARAM         DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GETPARAM, drm_i915_getparam_t)
+#define DRM_IOCTL_I915_SETPARAM         DRM_IOW( DRM_COMMAND_BASE + DRM_I915_SETPARAM, drm_i915_setparam_t)
+#define DRM_IOCTL_I915_ALLOC            DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_ALLOC, drm_i915_mem_alloc_t)
+#define DRM_IOCTL_I915_FREE             DRM_IOW( DRM_COMMAND_BASE + DRM_I915_FREE, drm_i915_mem_free_t)
+#define DRM_IOCTL_I915_INIT_HEAP        DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT_HEAP, drm_i915_mem_init_heap_t)
+#define DRM_IOCTL_I915_CMDBUFFER	DRM_IOW( DRM_COMMAND_BASE + DRM_I915_CMDBUFFER, drm_i915_cmdbuffer_t)
+#define DRM_IOCTL_I915_DESTROY_HEAP	DRM_IOW( DRM_COMMAND_BASE + DRM_I915_DESTROY_HEAP, drm_i915_mem_destroy_heap_t)
+#define DRM_IOCTL_I915_SET_VBLANK_PIPE	DRM_IOW( DRM_COMMAND_BASE + DRM_I915_SET_VBLANK_PIPE, drm_i915_vblank_pipe_t)
+#define DRM_IOCTL_I915_GET_VBLANK_PIPE	DRM_IOR( DRM_COMMAND_BASE + DRM_I915_GET_VBLANK_PIPE, drm_i915_vblank_pipe_t)
+#define DRM_IOCTL_I915_VBLANK_SWAP	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_VBLANK_SWAP, drm_i915_vblank_swap_t)
+#define DRM_IOCTL_I915_HWS_ADDR		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_HWS_ADDR, struct drm_i915_gem_init)
+#define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
+#define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
+#define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
+#define DRM_IOCTL_I915_GEM_SET_CACHING		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_SET_CACHING, struct drm_i915_gem_caching)
+#define DRM_IOCTL_I915_GEM_GET_CACHING		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_GET_CACHING, struct drm_i915_gem_caching)
+#define DRM_IOCTL_I915_GEM_THROTTLE	DRM_IO ( DRM_COMMAND_BASE + DRM_I915_GEM_THROTTLE)
+#define DRM_IOCTL_I915_GEM_ENTERVT	DRM_IO(DRM_COMMAND_BASE + DRM_I915_GEM_ENTERVT)
+#define DRM_IOCTL_I915_GEM_LEAVEVT	DRM_IO(DRM_COMMAND_BASE + DRM_I915_GEM_LEAVEVT)
+#define DRM_IOCTL_I915_GEM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_CREATE, struct drm_i915_gem_create)
+#define DRM_IOCTL_I915_GEM_PREAD	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_PREAD, struct drm_i915_gem_pread)
+#define DRM_IOCTL_I915_GEM_PWRITE	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_PWRITE, struct drm_i915_gem_pwrite)
+#define DRM_IOCTL_I915_GEM_MMAP		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP, struct drm_i915_gem_mmap)
+#define DRM_IOCTL_I915_GEM_MMAP_GTT	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP_GTT, struct drm_i915_gem_mmap_gtt)
+#define DRM_IOCTL_I915_GEM_SET_DOMAIN	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_SET_DOMAIN, struct drm_i915_gem_set_domain)
+#define DRM_IOCTL_I915_GEM_SW_FINISH	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_SW_FINISH, struct drm_i915_gem_sw_finish)
+#define DRM_IOCTL_I915_GEM_SET_TILING	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_SET_TILING, struct drm_i915_gem_set_tiling)
+#define DRM_IOCTL_I915_GEM_GET_TILING	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_GET_TILING, struct drm_i915_gem_get_tiling)
+#define DRM_IOCTL_I915_GEM_GET_APERTURE	DRM_IOR  (DRM_COMMAND_BASE + DRM_I915_GEM_GET_APERTURE, struct drm_i915_gem_get_aperture)
+#define DRM_IOCTL_I915_GET_PIPE_FROM_CRTC_ID DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_PIPE_FROM_CRTC_ID, struct drm_i915_get_pipe_from_crtc_id)
+#define DRM_IOCTL_I915_GEM_MADVISE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise)
+#define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_OVERLAY_PUT_IMAGE, struct drm_intel_overlay_put_image)
+#define DRM_IOCTL_I915_OVERLAY_ATTRS	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs)
+#define DRM_IOCTL_I915_SET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_SET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey)
+#define DRM_IOCTL_I915_GET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey)
+#define DRM_IOCTL_I915_GEM_WAIT		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_WAIT, struct drm_i915_gem_wait)
+#define DRM_IOCTL_I915_GEM_CONTEXT_CREATE	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create)
+#define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy)
+#define DRM_IOCTL_I915_REG_READ			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read)
+#define DRM_IOCTL_I915_GET_RESET_STATS		DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats)
+#define DRM_IOCTL_I915_GEM_USERPTR			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_USERPTR, struct drm_i915_gem_userptr)
+#define DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_GETPARAM, struct drm_i915_gem_context_param)
+#define DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_SETPARAM, struct drm_i915_gem_context_param)
+#define DRM_IOCTL_I915_PERF_OPEN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_OPEN, struct drm_i915_perf_open_param)
+
+/* Allow drivers to submit batchbuffers directly to hardware, relying
+ * on the security mechanisms provided by hardware.
+ */
+typedef struct drm_i915_batchbuffer {
+	int start;		/* agp offset */
+	int used;		/* nr bytes in use */
+	int DR1;		/* hw flags for GFX_OP_DRAWRECT_INFO */
+	int DR4;		/* window origin for GFX_OP_DRAWRECT_INFO */
+	int num_cliprects;	/* multipass with multiple cliprects? */
+	struct drm_clip_rect *cliprects;	/* pointer to userspace cliprects */
+} drm_i915_batchbuffer_t;
+
+/* As above, but pass a pointer to userspace buffer which can be
+ * validated by the kernel prior to sending to hardware.
+ */
+typedef struct _drm_i915_cmdbuffer {
+	char *buf;	/* pointer to userspace command buffer */
+	int sz;			/* nr bytes in buf */
+	int DR1;		/* hw flags for GFX_OP_DRAWRECT_INFO */
+	int DR4;		/* window origin for GFX_OP_DRAWRECT_INFO */
+	int num_cliprects;	/* multipass with multiple cliprects? */
+	struct drm_clip_rect *cliprects;	/* pointer to userspace cliprects */
+} drm_i915_cmdbuffer_t;
+
+/* Userspace can request & wait on irq's:
+ */
+typedef struct drm_i915_irq_emit {
+	int *irq_seq;
+} drm_i915_irq_emit_t;
+
+typedef struct drm_i915_irq_wait {
+	int irq_seq;
+} drm_i915_irq_wait_t;
+
+/* Ioctl to query kernel params:
+ */
+#define I915_PARAM_IRQ_ACTIVE            1
+#define I915_PARAM_ALLOW_BATCHBUFFER     2
+#define I915_PARAM_LAST_DISPATCH         3
+#define I915_PARAM_CHIPSET_ID            4
+#define I915_PARAM_HAS_GEM               5
+#define I915_PARAM_NUM_FENCES_AVAIL      6
+#define I915_PARAM_HAS_OVERLAY           7
+#define I915_PARAM_HAS_PAGEFLIPPING	 8
+#define I915_PARAM_HAS_EXECBUF2          9
+#define I915_PARAM_HAS_BSD		 10
+#define I915_PARAM_HAS_BLT		 11
+#define I915_PARAM_HAS_RELAXED_FENCING	 12
+#define I915_PARAM_HAS_COHERENT_RINGS	 13
+#define I915_PARAM_HAS_EXEC_CONSTANTS	 14
+#define I915_PARAM_HAS_RELAXED_DELTA	 15
+#define I915_PARAM_HAS_GEN7_SOL_RESET	 16
+#define I915_PARAM_HAS_LLC     	 	 17
+#define I915_PARAM_HAS_ALIASING_PPGTT	 18
+#define I915_PARAM_HAS_WAIT_TIMEOUT	 19
+#define I915_PARAM_HAS_SEMAPHORES	 20
+#define I915_PARAM_HAS_PRIME_VMAP_FLUSH	 21
+#define I915_PARAM_HAS_VEBOX		 22
+#define I915_PARAM_HAS_SECURE_BATCHES	 23
+#define I915_PARAM_HAS_PINNED_BATCHES	 24
+#define I915_PARAM_HAS_EXEC_NO_RELOC	 25
+#define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
+#define I915_PARAM_HAS_WT     	 	 27
+#define I915_PARAM_CMD_PARSER_VERSION	 28
+#define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
+#define I915_PARAM_MMAP_VERSION          30
+#define I915_PARAM_HAS_BSD2		 31
+#define I915_PARAM_REVISION              32
+#define I915_PARAM_SUBSLICE_TOTAL	 33
+#define I915_PARAM_EU_TOTAL		 34
+#define I915_PARAM_HAS_GPU_RESET	 35
+#define I915_PARAM_HAS_RESOURCE_STREAMER 36
+#define I915_PARAM_HAS_EXEC_SOFTPIN	 37
+#define I915_PARAM_HAS_POOLED_EU	 38
+#define I915_PARAM_MIN_EU_IN_POOL	 39
+#define I915_PARAM_MMAP_GTT_VERSION	 40
+
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
+ * priorities and the driver will attempt to execute batches in priority order.
+ */
+#define I915_PARAM_HAS_SCHEDULER	 41
+#define I915_PARAM_HUC_STATUS		 42
+
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to opt-out of
+ * synchronisation with implicit fencing on individual objects.
+ * See EXEC_OBJECT_ASYNC.
+ */
+#define I915_PARAM_HAS_EXEC_ASYNC	 43
+
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports explicit fence support -
+ * both being able to pass in a sync_file fd to wait upon before executing,
+ * and being able to return a new sync_file fd that is signaled when the
+ * current request is complete. See I915_EXEC_FENCE_IN and I915_EXEC_FENCE_OUT.
+ */
+#define I915_PARAM_HAS_EXEC_FENCE	 44
+
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to capture
+ * user specified buffers for post-mortem debugging of GPU hangs. See
+ * EXEC_OBJECT_CAPTURE.
+ */
+#define I915_PARAM_HAS_EXEC_CAPTURE	 45
+
+#define I915_PARAM_SLICE_MASK		 46
+
+/* Assuming it's uniform for each slice, this queries the mask of subslices
+ * per-slice for this system.
+ */
+#define I915_PARAM_SUBSLICE_MASK	 47
+
+/*
+ * Query whether DRM_I915_GEM_EXECBUFFER2 supports supplying the batch buffer
+ * as the first execobject as opposed to the last. See I915_EXEC_BATCH_FIRST.
+ */
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST	 48
+
+typedef struct drm_i915_getparam {
+	__s32 param;
+	/*
+	 * WARNING: Using pointers instead of fixed-size u64 means we need to write
+	 * compat32 code. Don't repeat this mistake.
+	 */
+	int *value;
+} drm_i915_getparam_t;
+
+/* Ioctl to set kernel params:
+ */
+#define I915_SETPARAM_USE_MI_BATCHBUFFER_START            1
+#define I915_SETPARAM_TEX_LRU_LOG_GRANULARITY             2
+#define I915_SETPARAM_ALLOW_BATCHBUFFER                   3
+#define I915_SETPARAM_NUM_USED_FENCES                     4
+
+typedef struct drm_i915_setparam {
+	int param;
+	int value;
+} drm_i915_setparam_t;
+
+/* A memory manager for regions of shared memory:
+ */
+#define I915_MEM_REGION_AGP 1
+
+typedef struct drm_i915_mem_alloc {
+	int region;
+	int alignment;
+	int size;
+	int *region_offset;	/* offset from start of fb or agp */
+} drm_i915_mem_alloc_t;
+
+typedef struct drm_i915_mem_free {
+	int region;
+	int region_offset;
+} drm_i915_mem_free_t;
+
+typedef struct drm_i915_mem_init_heap {
+	int region;
+	int size;
+	int start;
+} drm_i915_mem_init_heap_t;
+
+/* Allow memory manager to be torn down and re-initialized (eg on
+ * rotate):
+ */
+typedef struct drm_i915_mem_destroy_heap {
+	int region;
+} drm_i915_mem_destroy_heap_t;
+
+/* Allow X server to configure which pipes to monitor for vblank signals
+ */
+#define	DRM_I915_VBLANK_PIPE_A	1
+#define	DRM_I915_VBLANK_PIPE_B	2
+
+typedef struct drm_i915_vblank_pipe {
+	int pipe;
+} drm_i915_vblank_pipe_t;
+
+/* Schedule buffer swap at given vertical blank:
+ */
+typedef struct drm_i915_vblank_swap {
+	drm_drawable_t drawable;
+	enum drm_vblank_seq_type seqtype;
+	unsigned int sequence;
+} drm_i915_vblank_swap_t;
+
+typedef struct drm_i915_hws_addr {
+	__u64 addr;
+} drm_i915_hws_addr_t;
+
+struct drm_i915_gem_init {
+	/**
+	 * Beginning offset in the GTT to be managed by the DRM memory
+	 * manager.
+	 */
+	__u64 gtt_start;
+	/**
+	 * Ending offset in the GTT to be managed by the DRM memory
+	 * manager.
+	 */
+	__u64 gtt_end;
+};
+
+struct drm_i915_gem_create {
+	/**
+	 * Requested size for the object.
+	 *
+	 * The (page-aligned) allocated size for the object will be returned.
+	 */
+	__u64 size;
+	/**
+	 * Returned handle for the object.
+	 *
+	 * Object handles are nonzero.
+	 */
+	__u32 handle;
+	__u32 pad;
+};
+
+struct drm_i915_gem_pread {
+	/** Handle for the object being read. */
+	__u32 handle;
+	__u32 pad;
+	/** Offset into the object to read from */
+	__u64 offset;
+	/** Length of data to read */
+	__u64 size;
+	/**
+	 * Pointer to write the data into.
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 data_ptr;
+};
+
+struct drm_i915_gem_pwrite {
+	/** Handle for the object being written to. */
+	__u32 handle;
+	__u32 pad;
+	/** Offset into the object to write to */
+	__u64 offset;
+	/** Length of data to write */
+	__u64 size;
+	/**
+	 * Pointer to read the data from.
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 data_ptr;
+};
+
+struct drm_i915_gem_mmap {
+	/** Handle for the object being mapped. */
+	__u32 handle;
+	__u32 pad;
+	/** Offset in the object to map. */
+	__u64 offset;
+	/**
+	 * Length of data to map.
+	 *
+	 * The value will be page-aligned.
+	 */
+	__u64 size;
+	/**
+	 * Returned pointer the data was mapped at.
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 addr_ptr;
+
+	/**
+	 * Flags for extended behaviour.
+	 *
+	 * Added in version 2.
+	 */
+	__u64 flags;
+#define I915_MMAP_WC 0x1
+};
+
+struct drm_i915_gem_mmap_gtt {
+	/** Handle for the object being mapped. */
+	__u32 handle;
+	__u32 pad;
+	/**
+	 * Fake offset to use for subsequent mmap call
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 offset;
+};
+
+struct drm_i915_gem_set_domain {
+	/** Handle for the object */
+	__u32 handle;
+
+	/** New read domains */
+	__u32 read_domains;
+
+	/** New write domain */
+	__u32 write_domain;
+};
+
+struct drm_i915_gem_sw_finish {
+	/** Handle for the object */
+	__u32 handle;
+};
+
+struct drm_i915_gem_relocation_entry {
+	/**
+	 * Handle of the buffer being pointed to by this relocation entry.
+	 *
+	 * It's appealing to make this be an index into the mm_validate_entry
+	 * list to refer to the buffer, but this allows the driver to create
+	 * a relocation list for state buffers and not re-write it per
+	 * exec using the buffer.
+	 */
+	__u32 target_handle;
+
+	/**
+	 * Value to be added to the offset of the target buffer to make up
+	 * the relocation entry.
+	 */
+	__u32 delta;
+
+	/** Offset in the buffer the relocation entry will be written into */
+	__u64 offset;
+
+	/**
+	 * Offset value of the target buffer that the relocation entry was last
+	 * written as.
+	 *
+	 * If the buffer has the same offset as last time, we can skip syncing
+	 * and writing the relocation.  This value is written back out by
+	 * the execbuffer ioctl when the relocation is written.
+	 */
+	__u64 presumed_offset;
+
+	/**
+	 * Target memory domains read by this operation.
+	 */
+	__u32 read_domains;
+
+	/**
+	 * Target memory domains written by this operation.
+	 *
+	 * Note that only one domain may be written by the whole
+	 * execbuffer operation, so that where there are conflicts,
+	 * the application will get -EINVAL back.
+	 */
+	__u32 write_domain;
+};
+
+/** @{
+ * Intel memory domains
+ *
+ * Most of these just align with the various caches in
+ * the system and are used to flush and invalidate as
+ * objects end up cached in different domains.
+ */
+/** CPU cache */
+#define I915_GEM_DOMAIN_CPU		0x00000001
+/** Render cache, used by 2D and 3D drawing */
+#define I915_GEM_DOMAIN_RENDER		0x00000002
+/** Sampler cache, used by texture engine */
+#define I915_GEM_DOMAIN_SAMPLER		0x00000004
+/** Command queue, used to load batch buffers */
+#define I915_GEM_DOMAIN_COMMAND		0x00000008
+/** Instruction cache, used by shader programs */
+#define I915_GEM_DOMAIN_INSTRUCTION	0x00000010
+/** Vertex address cache */
+#define I915_GEM_DOMAIN_VERTEX		0x00000020
+/** GTT domain - aperture and scanout */
+#define I915_GEM_DOMAIN_GTT		0x00000040
+/** WC domain - uncached access */
+#define I915_GEM_DOMAIN_WC		0x00000080
+/** @} */
+
+struct drm_i915_gem_exec_object {
+	/**
+	 * User's handle for a buffer to be bound into the GTT for this
+	 * operation.
+	 */
+	//__u32 handle;
+
+	/** Number of relocations to be performed on this buffer */
+	//__u32 relocation_count;
+	/**
+	 * Pointer to array of struct drm_i915_gem_relocation_entry containing
+	 * the relocations to be performed in this buffer.
+	 */
+	//__u64 relocs_ptr;
+
+	/** Required alignment in graphics aperture */
+	//__u64 alignment;
+
+	/**
+	 * Returned value of the updated offset of the object, for future
+	 * presumed_offset writes.
+	 */
+	//__u64 offset;
+	__u32 unused;
+};
+
+struct drm_i915_gem_execbuffer {
+	/**
+	 * List of buffers to be validated with their relocations to be
+	 * performed on them.
+	 *
+	 * This is a pointer to an array of struct drm_i915_gem_validate_entry.
+	 *
+	 * These buffers must be listed in an order such that all relocations
+	 * a buffer is performing refer to buffers that have already appeared
+	 * in the validate list.
+	 */
+	__u64 buffers_ptr;
+	__u32 buffer_count;
+
+	/** Offset in the batchbuffer to start execution from. */
+	__u32 batch_start_offset;
+	/** Bytes used in batchbuffer from batch_start_offset */
+	__u32 batch_len;
+	__u32 DR1;
+	__u32 DR4;
+	__u32 num_cliprects;
+	/** This is a struct drm_clip_rect *cliprects */
+	__u64 cliprects_ptr;
+};
+
+struct drm_i915_gem_exec_object2 {
+	/**
+	 * User's handle for a buffer to be bound into the GTT for this
+	 * operation.
+	 */
+	 __u64 handle; // magma uses 64bit buffer 'handles'
+
+	/** Number of relocations to be performed on this buffer */
+	__u32 relocation_count;
+	/**
+	 * Pointer to array of struct drm_i915_gem_relocation_entry containing
+	 * the relocations to be performed in this buffer.
+	 */
+	__u64 relocs_ptr;
+
+	/** Required alignment in graphics aperture */
+	__u64 alignment;
+
+	/**
+	 * When the EXEC_OBJECT_PINNED flag is specified this is populated by
+	 * the user with the GTT offset at which this object will be pinned.
+	 * When the I915_EXEC_NO_RELOC flag is specified this must contain the
+	 * presumed_offset of the object.
+	 * During execbuffer2 the kernel populates it with the value of the
+	 * current GTT offset of the object, for future presumed_offset writes.
+	 */
+	__u64 offset;
+
+#define EXEC_OBJECT_NEEDS_FENCE		 (1<<0)
+#define EXEC_OBJECT_NEEDS_GTT		 (1<<1)
+#define EXEC_OBJECT_WRITE		 (1<<2)
+#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
+#define EXEC_OBJECT_PINNED		 (1<<4)
+#define EXEC_OBJECT_PAD_TO_SIZE		 (1<<5)
+/* The kernel implicitly tracks GPU activity on all GEM objects, and
+ * synchronises operations with outstanding rendering. This includes
+ * rendering on other devices if exported via dma-buf. However, sometimes
+ * this tracking is too coarse and the user knows better. For example,
+ * if the object is split into non-overlapping ranges shared between different
+ * clients or engines (i.e. suballocating objects), the implicit tracking
+ * by kernel assumes that each operation affects the whole object rather
+ * than an individual range, causing needless synchronisation between clients.
+ * The kernel will also forgo any CPU cache flushes prior to rendering from
+ * the object as the client is expected to be also handling such domain
+ * tracking.
+ *
+ * The kernel maintains the implicit tracking in order to manage resources
+ * used by the GPU - this flag only disables the synchronisation prior to
+ * rendering with this object in this execbuf.
+ *
+ * Opting out of implicit synchronisation requires the user to do its own
+ * explicit tracking to avoid rendering corruption. See, for example,
+ * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
+ */
+#define EXEC_OBJECT_ASYNC		(1<<6)
+/* Request that the contents of this execobject be copied into the error
+ * state upon a GPU hang involving this batch for post-mortem debugging.
+ * These buffers are recorded in no particular order as "user" in
+ * /sys/class/drm/cardN/error. Query I915_PARAM_HAS_EXEC_CAPTURE to see
+ * if the kernel supports this flag.
+ */
+#define EXEC_OBJECT_CAPTURE		(1<<7)
+/* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
+#define __EXEC_OBJECT_UNKNOWN_FLAGS (-(EXEC_OBJECT_CAPTURE<<1))
+	__u64 flags;
+
+	union {
+		__u64 rsvd1;
+		__u64 pad_to_size;
+	};
+	__u64 rsvd2;
+};
+
+struct drm_i915_gem_execbuffer2 {
+	/**
+	 * List of gem_exec_object2 structs
+	 */
+	__u64 buffers_ptr;
+	__u32 buffer_count;
+
+	/** Offset in the batchbuffer to start execution from. */
+	__u32 batch_start_offset;
+	/** Bytes used in batchbuffer from batch_start_offset */
+	__u32 batch_len;
+	__u32 DR1;
+	__u32 DR4;
+	__u32 num_cliprects;
+	/** This is a struct drm_clip_rect *cliprects */
+	__u64 cliprects_ptr;
+#define I915_EXEC_RING_MASK              (7<<0)
+#define I915_EXEC_DEFAULT                (0<<0)
+#define I915_EXEC_RENDER                 (1<<0)
+#define I915_EXEC_BSD                    (2<<0)
+#define I915_EXEC_BLT                    (3<<0)
+#define I915_EXEC_VEBOX                  (4<<0)
+
+/* Used for switching the constants addressing mode on gen4+ RENDER ring.
+ * Gen6+ only supports relative addressing to dynamic state (default) and
+ * absolute addressing.
+ *
+ * These flags are ignored for the BSD and BLT rings.
+ */
+#define I915_EXEC_CONSTANTS_MASK 	(3<<6)
+#define I915_EXEC_CONSTANTS_REL_GENERAL (0<<6) /* default */
+#define I915_EXEC_CONSTANTS_ABSOLUTE 	(1<<6)
+#define I915_EXEC_CONSTANTS_REL_SURFACE (2<<6) /* gen4/5 only */
+	__u64 flags;
+	__u64 rsvd1; /* now used for context info */
+	__u64 rsvd2;
+};
+
+/** Resets the SO write offset registers for transform feedback on gen7. */
+#define I915_EXEC_GEN7_SOL_RESET	(1<<8)
+
+/** Request a privileged ("secure") batch buffer. Note only available for
+ * DRM_ROOT_ONLY | DRM_MASTER processes.
+ */
+#define I915_EXEC_SECURE		(1<<9)
+
+/** Inform the kernel that the batch is and will always be pinned. This
+ * negates the requirement for a workaround to be performed to avoid
+ * an incoherent CS (such as can be found on 830/845). If this flag is
+ * not passed, the kernel will endeavour to make sure the batch is
+ * coherent with the CS before execution. If this flag is passed,
+ * userspace assumes the responsibility for ensuring the same.
+ */
+#define I915_EXEC_IS_PINNED		(1<<10)
+
+/** Provide a hint to the kernel that the command stream and auxiliary
+ * state buffers already holds the correct presumed addresses and so the
+ * relocation process may be skipped if no buffers need to be moved in
+ * preparation for the execbuffer.
+ */
+#define I915_EXEC_NO_RELOC		(1<<11)
+
+/** Use the reloc.handle as an index into the exec object array rather
+ * than as the per-file handle.
+ */
+#define I915_EXEC_HANDLE_LUT		(1<<12)
+
+/** Used for switching BSD rings on the platforms with two BSD rings */
+#define I915_EXEC_BSD_SHIFT	 (13)
+#define I915_EXEC_BSD_MASK	 (3 << I915_EXEC_BSD_SHIFT)
+/* default ping-pong mode */
+#define I915_EXEC_BSD_DEFAULT	 (0 << I915_EXEC_BSD_SHIFT)
+#define I915_EXEC_BSD_RING1	 (1 << I915_EXEC_BSD_SHIFT)
+#define I915_EXEC_BSD_RING2	 (2 << I915_EXEC_BSD_SHIFT)
+
+/** Tell the kernel that the batchbuffer is processed by
+ *  the resource streamer.
+ */
+#define I915_EXEC_RESOURCE_STREAMER     (1<<15)
+
+/* Setting I915_EXEC_FENCE_IN implies that lower_32_bits(rsvd2) represent
+ * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
+ * the batch.
+ *
+ * Returns -EINVAL if the sync_file fd cannot be found.
+ */
+#define I915_EXEC_FENCE_IN		(1<<16)
+
+/* Setting I915_EXEC_FENCE_OUT causes the ioctl to return a sync_file fd
+ * in the upper_32_bits(rsvd2) upon success. Ownership of the fd is given
+ * to the caller, and it should be close()d after use. (The fd is a regular
+ * file descriptor and will be cleaned up on process termination. It holds
+ * a reference to the request, but nothing else.)
+ *
+ * The sync_file fd can be combined with other sync_file and passed either
+ * to execbuf using I915_EXEC_FENCE_IN, to atomic KMS ioctls (so that a flip
+ * will only occur after this request completes), or to other devices.
+ *
+ * Using I915_EXEC_FENCE_OUT requires use of
+ * DRM_IOCTL_I915_GEM_EXECBUFFER2_WR ioctl so that the result is written
+ * back to userspace. Failure to do so will cause the out-fence to always
+ * be reported as zero, and the real fence fd to be leaked.
+ */
+#define I915_EXEC_FENCE_OUT		(1<<17)
+
+/*
+ * Traditionally the execbuf ioctl has only considered the final element in
+ * the execobject[] to be the executable batch. Often though, the client
+ * will know the batch object prior to construction and being able to place
+ * it into the execobject[] array first can simplify the relocation tracking.
+ * Setting I915_EXEC_BATCH_FIRST tells execbuf to use element 0 of the
+ * execobject[] as the batch instead (the default is to use the last
+ * element).
+ */
+#define I915_EXEC_BATCH_FIRST		(1<<18)
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_BATCH_FIRST<<1))
+
+#define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff) /* context id is kept in the low 32 bits of rsvd1 */
+#define i915_execbuffer2_set_context_id(eb2, context) \
+	(eb2).rsvd1 = (context) & I915_EXEC_CONTEXT_ID_MASK
+#define i915_execbuffer2_get_context_id(eb2) \
+	((eb2).rsvd1 & I915_EXEC_CONTEXT_ID_MASK)
+
+struct drm_i915_gem_pin {
+	/** Handle of the buffer to be pinned. */
+	__u32 handle;
+	__u32 pad;
+
+	/** alignment required within the aperture */
+	__u64 alignment;
+
+	/** Returned GTT offset of the buffer. */
+	__u64 offset;
+};
+
+struct drm_i915_gem_unpin {
+	/** Handle of the buffer to be unpinned. */
+	__u32 handle;
+	__u32 pad;
+};
+
+struct drm_i915_gem_busy {
+	/** Handle of the buffer to check for busy */
+	__u32 handle;
+
+	/** Return busy status
+	 *
+	 * A return of 0 implies that the object is idle (after
+	 * having flushed any pending activity), and a non-zero return that
+	 * the object is still in-flight on the GPU. (The GPU has not yet
+	 * signaled completion for all pending requests that reference the
+	 * object.) An object is guaranteed to become idle eventually (so
+	 * long as no new GPU commands are executed upon it). Due to the
+	 * asynchronous nature of the hardware, an object reported
+	 * as busy may become idle before the ioctl is completed.
+	 *
+	 * Furthermore, if the object is busy, which engine is busy is only
+	 * provided as a guide. There are race conditions which prevent the
+	 * report of which engines are busy from being always accurate.
+	 * However, the converse is not true. If the object is idle, the
+	 * result of the ioctl, that all engines are idle, is accurate.
+	 *
+	 * The returned dword is split into two fields to indicate both
+	 * the engines on which the object is being read, and the
+	 * engine on which it is currently being written (if any).
+	 *
+	 * The low word (bits 0:15) indicate if the object is being written
+	 * to by any engine (there can only be one, as the GEM implicit
+	 * synchronisation rules force writes to be serialised). Only the
+	 * engine for the last write is reported.
+	 *
+	 * The high word (bits 16:31) are a bitmask of which engines are
+	 * currently reading from the object. Multiple engines may be
+	 * reading from the object simultaneously.
+	 *
+	 * The value of each engine is the same as specified in the
+	 * EXECBUFFER2 ioctl, i.e. I915_EXEC_RENDER, I915_EXEC_BSD etc.
+	 * Note I915_EXEC_DEFAULT is a symbolic value and is mapped to
+	 * the I915_EXEC_RENDER engine for execution, and so it is never
+	 * reported as active itself. Some hardware may have parallel
+	 * execution engines, e.g. multiple media engines, which are
+	 * mapped to the same identifier in the EXECBUFFER2 ioctl and
+	 * so are not separately reported for busyness.
+	 *
+	 * Caveat emptor:
+	 * Only the boolean result of this query is reliable; that is whether
+	 * the object is idle or busy. The report of which engines are busy
+	 * should be only used as a heuristic.
+	 */
+	__u32 busy;
+};
+
+/**
+ * I915_CACHING_NONE
+ *
+ * GPU access is not coherent with cpu caches. Default for machines without an
+ * LLC.
+ */
+#define I915_CACHING_NONE		0
+/**
+ * I915_CACHING_CACHED
+ *
+ * GPU access is coherent with cpu caches and furthermore the data is cached in
+ * last-level caches shared between cpu cores and the gpu GT. Default on
+ * machines with HAS_LLC.
+ */
+#define I915_CACHING_CACHED		1
+/**
+ * I915_CACHING_DISPLAY
+ *
+ * Special GPU caching mode which is coherent with the scanout engines.
+ * Transparently falls back to I915_CACHING_NONE on platforms where no special
+ * cache mode (like write-through or gfdt flushing) is available. The kernel
+ * automatically sets this mode when using a buffer as a scanout target.
+ * Userspace can manually set this mode to avoid a costly stall and clflush in
+ * the hotpath of drawing the first frame.
+ */
+#define I915_CACHING_DISPLAY		2
+
+struct drm_i915_gem_caching {
+	/**
+	 * Handle of the buffer to set/get the caching level of. */
+	__u32 handle;
+
+	/**
+	 * Caching level to apply or return value
+	 *
+	 * bits0-15 are for generic caching control (i.e. the above defined
+	 * values). bits16-31 are reserved for platform-specific variations
+	 * (e.g. l3$ caching on gen7). */
+	__u32 caching;
+};
+
+#define I915_TILING_NONE	0
+#define I915_TILING_X		1
+#define I915_TILING_Y		2
+#define I915_TILING_LAST	I915_TILING_Y
+
+#define I915_BIT_6_SWIZZLE_NONE		0
+#define I915_BIT_6_SWIZZLE_9		1
+#define I915_BIT_6_SWIZZLE_9_10		2
+#define I915_BIT_6_SWIZZLE_9_11		3
+#define I915_BIT_6_SWIZZLE_9_10_11	4
+/* Not seen by userland */
+#define I915_BIT_6_SWIZZLE_UNKNOWN	5
+/* Seen by userland. */
+#define I915_BIT_6_SWIZZLE_9_17		6
+#define I915_BIT_6_SWIZZLE_9_10_17	7
+
+struct drm_i915_gem_set_tiling {
+	/** Handle of the buffer to have its tiling state updated */
+	__u32 handle;
+
+	/**
+	 * Tiling mode for the object (I915_TILING_NONE, I915_TILING_X,
+	 * I915_TILING_Y).
+	 *
+	 * This value is to be set on request, and will be updated by the
+	 * kernel on successful return with the actual chosen tiling layout.
+	 *
+	 * The tiling mode may be demoted to I915_TILING_NONE when the system
+	 * has bit 6 swizzling that can't be managed correctly by GEM.
+	 *
+	 * Buffer contents become undefined when changing tiling_mode.
+	 */
+	__u32 tiling_mode;
+
+	/**
+	 * Stride in bytes for the object when in I915_TILING_X or
+	 * I915_TILING_Y.
+	 */
+	__u32 stride;
+
+	/**
+	 * Returned address bit 6 swizzling required for CPU access through
+	 * mmap mapping.
+	 */
+	__u32 swizzle_mode;
+};
+
+struct drm_i915_gem_get_tiling {
+	/** Handle of the buffer to get tiling state for. */
+	__u32 handle;
+
+	/**
+	 * Current tiling mode for the object (I915_TILING_NONE, I915_TILING_X,
+	 * I915_TILING_Y).
+	 */
+	__u32 tiling_mode;
+
+	/**
+	 * Returned address bit 6 swizzling required for CPU access through
+	 * mmap mapping.
+	 */
+	__u32 swizzle_mode;
+
+	/**
+	 * Returned address bit 6 swizzling required for CPU access through
+	 * mmap mapping whilst bound.
+	 */
+	__u32 phys_swizzle_mode;
+};
+
+struct drm_i915_gem_get_aperture {
+	/** Total size of the aperture used by i915_gem_execbuffer, in bytes */
+	__u64 aper_size;
+
+	/**
+	 * Available space in the aperture used by i915_gem_execbuffer, in
+	 * bytes
+	 */
+	__u64 aper_available_size;
+};
+
+struct drm_i915_get_pipe_from_crtc_id {
+	/** ID of CRTC being requested **/
+	__u32 crtc_id;
+
+	/** pipe of requested CRTC **/
+	__u32 pipe;
+};
+
+#define I915_MADV_WILLNEED 0
+#define I915_MADV_DONTNEED 1
+#define __I915_MADV_PURGED 2 /* internal state */
+
+struct drm_i915_gem_madvise {
+	/** Handle of the buffer to change the backing store advice */
+	__u32 handle;
+
+	/* Advice: either the buffer will be needed again in the near future,
+	 *         or won't be and could be discarded under memory pressure.
+	 */
+	__u32 madv;
+
+	/** Whether the backing store still exists. */
+	__u32 retained;
+};
+
+/* flags */
+#define I915_OVERLAY_TYPE_MASK 		0xff
+#define I915_OVERLAY_YUV_PLANAR 	0x01
+#define I915_OVERLAY_YUV_PACKED 	0x02
+#define I915_OVERLAY_RGB		0x03
+
+#define I915_OVERLAY_DEPTH_MASK		0xff00
+#define I915_OVERLAY_RGB24		0x1000
+#define I915_OVERLAY_RGB16		0x2000
+#define I915_OVERLAY_RGB15		0x3000
+#define I915_OVERLAY_YUV422		0x0100
+#define I915_OVERLAY_YUV411		0x0200
+#define I915_OVERLAY_YUV420		0x0300
+#define I915_OVERLAY_YUV410		0x0400
+
+#define I915_OVERLAY_SWAP_MASK		0xff0000
+#define I915_OVERLAY_NO_SWAP		0x000000
+#define I915_OVERLAY_UV_SWAP		0x010000
+#define I915_OVERLAY_Y_SWAP		0x020000
+#define I915_OVERLAY_Y_AND_UV_SWAP	0x030000
+
+#define I915_OVERLAY_FLAGS_MASK		0xff000000
+#define I915_OVERLAY_ENABLE		0x01000000
+
+struct drm_intel_overlay_put_image {
+	/* various flags and src format description */
+	__u32 flags;
+	/* source picture description */
+	__u32 bo_handle;
+	/* stride values and offsets are in bytes, buffer relative */
+	__u16 stride_Y; /* stride for packed formats */
+	__u16 stride_UV;
+	__u32 offset_Y; /* offset for packed formats */
+	__u32 offset_U;
+	__u32 offset_V;
+	/* in pixels */
+	__u16 src_width;
+	__u16 src_height;
+	/* to compensate the scaling factors for partially covered surfaces */
+	__u16 src_scan_width;
+	__u16 src_scan_height;
+	/* output crtc description */
+	__u32 crtc_id;
+	__u16 dst_x;
+	__u16 dst_y;
+	__u16 dst_width;
+	__u16 dst_height;
+};
+
+/* flags */
+#define I915_OVERLAY_UPDATE_ATTRS	(1<<0)
+#define I915_OVERLAY_UPDATE_GAMMA	(1<<1)
+#define I915_OVERLAY_DISABLE_DEST_COLORKEY	(1<<2)
+struct drm_intel_overlay_attrs {
+	__u32 flags;
+	__u32 color_key;
+	__s32 brightness;
+	__u32 contrast;
+	__u32 saturation;
+	__u32 gamma0;
+	__u32 gamma1;
+	__u32 gamma2;
+	__u32 gamma3;
+	__u32 gamma4;
+	__u32 gamma5;
+};
+
+/*
+ * Intel sprite handling
+ *
+ * Color keying works with a min/mask/max tuple.  Both source and destination
+ * color keying is allowed.
+ *
+ * Source keying:
+ * Sprite pixels within the min & max values, masked against the color channels
+ * specified in the mask field, will be transparent.  All other pixels will
+ * be displayed on top of the primary plane.  For RGB surfaces, only the min
+ * and mask fields will be used; ranged compares are not allowed.
+ *
+ * Destination keying:
+ * Primary plane pixels that match the min value, masked against the color
+ * channels specified in the mask field, will be replaced by corresponding
+ * pixels from the sprite plane.
+ *
+ * Note that source & destination keying are exclusive; only one can be
+ * active on a given plane.
+ */
+
+#define I915_SET_COLORKEY_NONE		(1<<0) /* disable color key matching */
+#define I915_SET_COLORKEY_DESTINATION	(1<<1)
+#define I915_SET_COLORKEY_SOURCE	(1<<2)
+struct drm_intel_sprite_colorkey {
+	__u32 plane_id;
+	__u32 min_value;
+	__u32 channel_mask;
+	__u32 max_value;
+	__u32 flags;
+};
+
+struct drm_i915_gem_wait {
+	/** Handle of BO we shall wait on */
+	__u32 bo_handle;
+	__u32 flags;
+	/** Number of nanoseconds to wait, Returns time remaining. */
+	__s64 timeout_ns;
+};
+
+struct drm_i915_gem_context_create {
+	/*  output: id of new context*/
+	__u32 ctx_id;
+	__u32 pad;
+};
+
+struct drm_i915_gem_context_destroy {
+	__u32 ctx_id;
+	__u32 pad;
+};
+
+struct drm_i915_reg_read {
+	/*
+	 * Register offset.
+	 * For 64bit wide registers where the upper 32bits don't immediately
+	 * follow the lower 32bits, the offset of the lower 32bits must
+	 * be specified
+	 */
+	__u64 offset;
+	__u64 val; /* Return value */
+};
+/* Known registers:
+ *
+ * Render engine timestamp - 0x2358 + 64bit - gen7+
+ * - Note this register returns an invalid value if using the default
+ *   single instruction 8byte read, in order to workaround that use
+ *   offset (0x2358 | 1) instead.
+ *
+ */
+
+struct drm_i915_reset_stats {
+	__u32 ctx_id;
+	__u32 flags;
+
+	/* All resets since boot/module reload, for all contexts */
+	__u32 reset_count;
+
+	/* Number of batches lost when active in GPU, for this context */
+	__u32 batch_active;
+
+	/* Number of batches lost pending for execution, for this context */
+	__u32 batch_pending;
+
+	__u32 pad;
+};
+
+struct drm_i915_gem_userptr {
+	__u64 user_ptr;
+	__u64 user_size;
+	__u32 flags;
+#define I915_USERPTR_READ_ONLY 0x1
+#define I915_USERPTR_UNSYNCHRONIZED 0x80000000
+	/**
+	 * Returned handle for the object.
+	 *
+	 * Object handles are nonzero.
+	 */
+	__u32 handle;
+};
+
+struct drm_i915_gem_context_param {
+	__u32 ctx_id;
+	__u32 size;
+	__u64 param;
+#define I915_CONTEXT_PARAM_BAN_PERIOD	0x1
+#define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
+#define I915_CONTEXT_PARAM_GTT_SIZE	0x3
+#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
+#define I915_CONTEXT_PARAM_BANNABLE	0x5
+	__u64 value;
+};
+
+enum drm_i915_oa_format {
+	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
+	I915_OA_FORMAT_A29,	    /* HSW only */
+	I915_OA_FORMAT_A13_B8_C8,   /* HSW only */
+	I915_OA_FORMAT_B4_C8,	    /* HSW only */
+	I915_OA_FORMAT_A45_B8_C8,   /* HSW only */
+	I915_OA_FORMAT_B4_C8_A16,   /* HSW only */
+	I915_OA_FORMAT_C4_B8,	    /* HSW+ */
+
+	/* Gen8+ */
+	I915_OA_FORMAT_A12,
+	I915_OA_FORMAT_A12_B8_C8,
+	I915_OA_FORMAT_A32u40_A4u32_B8_C8,
+
+	I915_OA_FORMAT_MAX	    /* non-ABI */
+};
+
+enum drm_i915_perf_property_id {
+	/**
+	 * Open the stream for a specific context handle (as used with
+	 * execbuffer2). A stream opened for a specific context this way
+	 * won't typically require root privileges.
+	 */
+	DRM_I915_PERF_PROP_CTX_HANDLE = 1,
+
+	/**
+	 * A value of 1 requests the inclusion of raw OA unit reports as
+	 * part of stream samples.
+	 */
+	DRM_I915_PERF_PROP_SAMPLE_OA,
+
+	/**
+	 * The value specifies which set of OA unit metrics should be
+	 * configured, defining the contents of any OA unit reports.
+	 */
+	DRM_I915_PERF_PROP_OA_METRICS_SET,
+
+	/**
+	 * The value specifies the size and layout of OA unit reports.
+	 */
+	DRM_I915_PERF_PROP_OA_FORMAT,
+
+	/**
+	 * Specifying this property implicitly requests periodic OA unit
+	 * sampling and (at least on Haswell) the sampling period is derived
+	 * from this exponent as follows:
+	 *
+	 *   80ns * 2^(period_exponent + 1)
+	 */
+	DRM_I915_PERF_PROP_OA_EXPONENT,
+
+	DRM_I915_PERF_PROP_MAX /* non-ABI */
+};
+
+struct drm_i915_perf_open_param {
+	__u32 flags;
+#define I915_PERF_FLAG_FD_CLOEXEC	(1<<0)
+#define I915_PERF_FLAG_FD_NONBLOCK	(1<<1)
+#define I915_PERF_FLAG_DISABLED		(1<<2)
+
+	/** The number of u64 (id, value) pairs */
+	__u32 num_properties;
+
+	/**
+	 * Pointer to array of u64 (id, value) pairs configuring the stream
+	 * to open.
+	 */
+	__u64 properties_ptr;
+};
+
+/**
+ * Enable data capture for a stream that was either opened in a disabled state
+ * via I915_PERF_FLAG_DISABLED or was later disabled via
+ * I915_PERF_IOCTL_DISABLE.
+ *
+ * It is intended to be cheaper to disable and enable a stream than it may be
+ * to close and re-open a stream with the same configuration.
+ *
+ * It's undefined whether any pending data for the stream will be lost.
+ */
+#define I915_PERF_IOCTL_ENABLE	_IO('i', 0x0)
+
+/**
+ * Disable data capture for a stream.
+ *
+ * It is an error to try and read a stream that is disabled.
+ */
+#define I915_PERF_IOCTL_DISABLE	_IO('i', 0x1)
+
+/**
+ * Common to all i915 perf records
+ */
+struct drm_i915_perf_record_header {
+	__u32 type;
+	__u16 pad;
+	__u16 size;
+};
+
+enum drm_i915_perf_record_type {
+
+	/**
+	 * Samples are the work horse record type whose contents are extensible
+	 * and defined when opening an i915 perf stream based on the given
+	 * properties.
+	 *
+	 * Boolean properties following the naming convention
+	 * DRM_I915_PERF_PROP_SAMPLE_xyz request the inclusion of 'xyz' data in
+	 * every sample.
+	 *
+	 * The order of these sample properties given by userspace has no
+	 * effect on the ordering of data within a sample. The order is
+	 * documented here.
+	 *
+	 * struct {
+	 *     struct drm_i915_perf_record_header header;
+	 *
+	 *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
+	 * };
+	 */
+	DRM_I915_PERF_RECORD_SAMPLE = 1,
+
+	/**
+	 * Indicates that one or more OA reports were not written by the
+	 * hardware. This can happen for example if an MI_REPORT_PERF_COUNT
+	 * command collides with periodic sampling - which would be more likely
+	 * at higher sampling frequencies.
+	 */
+	DRM_I915_PERF_RECORD_OA_REPORT_LOST = 2,
+
+	/**
+	 * An error occurred that resulted in all pending OA reports being lost.
+	 */
+	DRM_I915_PERF_RECORD_OA_BUFFER_LOST = 3,
+
+	DRM_I915_PERF_RECORD_MAX /* non-ABI */
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* _I915_DRM_H_ */
diff --git a/include/drm-uapi/vc4_drm.h b/include/drm-uapi/vc4_drm.h
new file mode 100644
index 0000000..0caeaf3
--- /dev/null
+++ b/include/drm-uapi/vc4_drm.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _VC4_DRM_H_
+#define _VC4_DRM_H_
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_VC4_SUBMIT_CL                         0x00
+#define DRM_VC4_WAIT_SEQNO                        0x01
+#define DRM_VC4_WAIT_BO                           0x02
+#define DRM_VC4_CREATE_BO                         0x03
+#define DRM_VC4_MMAP_BO                           0x04
+#define DRM_VC4_CREATE_SHADER_BO                  0x05
+#define DRM_VC4_GET_HANG_STATE                    0x06
+#define DRM_VC4_GET_PARAM                         0x07
+#define DRM_VC4_SET_TILING                        0x08
+#define DRM_VC4_GET_TILING                        0x09
+
+#define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
+#define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
+#define DRM_IOCTL_VC4_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_BO, struct drm_vc4_wait_bo)
+#define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
+#define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
+#define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
+#define DRM_IOCTL_VC4_GET_HANG_STATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state)
+#define DRM_IOCTL_VC4_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_PARAM, struct drm_vc4_get_param)
+#define DRM_IOCTL_VC4_SET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SET_TILING, struct drm_vc4_set_tiling)
+#define DRM_IOCTL_VC4_GET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
+
+struct drm_vc4_submit_rcl_surface {
+	__u32 hindex; /* Handle index, or ~0 if not present. */
+	__u32 offset; /* Offset to start of buffer. */
+	/*
+	 * Bits for either render config (color_write) or load/store packet.
+	 * Bits should all be 0 for MSAA load/stores.
+	 */
+	__u16 bits;
+
+#define VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES		(1 << 0)
+	__u16 flags;
+};
+
+/**
+ * struct drm_vc4_submit_cl - ioctl argument for submitting commands to the 3D
+ * engine.
+ *
+ * Drivers typically use GPU BOs to store batchbuffers / command lists and
+ * their associated state.  However, because the VC4 lacks an MMU, we have to
+ * do validation of memory accesses by the GPU commands.  If we were to store
+ * our commands in BOs, we'd need to do uncached readback from them to do the
+ * validation process, which is too expensive.  Instead, userspace accumulates
+ * commands and associated state in plain memory, then the kernel copies the
+ * data to its own address space, and then validates and stores it in a GPU
+ * BO.
+ */
+struct drm_vc4_submit_cl {
+	/* Pointer to the binner command list.
+	 *
+	 * This is the first set of commands executed, which runs the
+	 * coordinate shader to determine where primitives land on the screen,
+	 * then writes out the state updates and draw calls necessary per tile
+	 * to the tile allocation BO.
+	 */
+	__u64 bin_cl;
+
+	/* Pointer to the shader records.
+	 *
+	 * Shader records are the structures read by the hardware that contain
+	 * pointers to uniforms, shaders, and vertex attributes.  The
+	 * reference to the shader record has enough information to determine
+	 * how many pointers are necessary (fixed number for shaders/uniforms,
+	 * and an attribute count), so those BO indices into bo_handles are
+	 * just stored as __u32s before each shader record passed in.
+	 */
+	__u64 shader_rec;
+
+	/* Pointer to uniform data and texture handles for the textures
+	 * referenced by the shader.
+	 *
+	 * For each shader state record, there is a set of uniform data in the
+	 * order referenced by the record (FS, VS, then CS).  Each set of
+	 * uniform data has a __u32 index into bo_handles per texture
+	 * sample operation, in the order the QPU_W_TMUn_S writes appear in
+	 * the program.  Following the texture BO handle indices is the actual
+	 * uniform data.
+	 *
+	 * The individual uniform state blocks don't have sizes passed in,
+	 * because the kernel has to determine the sizes anyway during shader
+	 * code validation.
+	 */
+	__u64 uniforms;
+	__u64 bo_handles;
+
+	/* Size in bytes of the binner command list. */
+	__u32 bin_cl_size;
+	/* Size in bytes of the set of shader records. */
+	__u32 shader_rec_size;
+	/* Number of shader records.
+	 *
+	 * This could just be computed from the contents of shader_records and
+	 * the address bits of references to them from the bin CL, but it
+	 * keeps the kernel from having to resize some allocations it makes.
+	 */
+	__u32 shader_rec_count;
+	/* Size in bytes of the uniform state. */
+	__u32 uniforms_size;
+
+	/* Number of BO handles passed in (size is that times 4). */
+	__u32 bo_handle_count;
+
+	/* RCL setup: */
+	__u16 width;
+	__u16 height;
+	__u8 min_x_tile;
+	__u8 min_y_tile;
+	__u8 max_x_tile;
+	__u8 max_y_tile;
+	struct drm_vc4_submit_rcl_surface color_read;
+	struct drm_vc4_submit_rcl_surface color_write;
+	struct drm_vc4_submit_rcl_surface zs_read;
+	struct drm_vc4_submit_rcl_surface zs_write;
+	struct drm_vc4_submit_rcl_surface msaa_color_write;
+	struct drm_vc4_submit_rcl_surface msaa_zs_write;
+	__u32 clear_color[2];
+	__u32 clear_z;
+	__u8 clear_s;
+
+	__u32 pad:24;
+
+#define VC4_SUBMIT_CL_USE_CLEAR_COLOR			(1 << 0)
+	__u32 flags;
+
+	/* Returned value of the seqno of this render job (for the
+	 * wait ioctl).
+	 */
+	__u64 seqno;
+};
+
+/**
+ * struct drm_vc4_wait_seqno - ioctl argument for waiting for
+ * DRM_VC4_SUBMIT_CL completion using its returned seqno.
+ *
+ * timeout_ns is the timeout in nanoseconds, where "0" means "don't
+ * block, just return the status."
+ */
+struct drm_vc4_wait_seqno {
+	__u64 seqno;
+	__u64 timeout_ns;
+};
+
+/**
+ * struct drm_vc4_wait_bo - ioctl argument for waiting for
+ * completion of the last DRM_VC4_SUBMIT_CL on a BO.
+ *
+ * This is useful for cases where multiple processes might be
+ * rendering to a BO and you want to wait for all rendering to be
+ * completed.
+ */
+struct drm_vc4_wait_bo {
+	__u32 handle;
+	__u32 pad;
+	__u64 timeout_ns;
+};
+
+/**
+ * struct drm_vc4_create_bo - ioctl argument for creating VC4 BOs.
+ *
+ * There are currently no values for the flags argument, but it may be
+ * used in a future extension.
+ */
+struct drm_vc4_create_bo {
+	__u32 size;
+	__u32 flags;
+	/** Returned GEM handle for the BO. */
+	__u32 handle;
+	__u32 pad;
+};
+
+/**
+ * struct drm_vc4_mmap_bo - ioctl argument for mapping VC4 BOs.
+ *
+ * This doesn't actually perform an mmap.  Instead, it returns the
+ * offset you need to use in an mmap on the DRM device node.  This
+ * means that tools like valgrind end up knowing about the mapped
+ * memory.
+ *
+ * There are currently no values for the flags argument, but it may be
+ * used in a future extension.
+ */
+struct drm_vc4_mmap_bo {
+	/** Handle for the object being mapped. */
+	__u32 handle;
+	__u32 flags;
+	/** offset into the drm node to use for subsequent mmap call. */
+	__u64 offset;
+};
+
+/**
+ * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4
+ * shader BOs.
+ *
+ * Since allowing a shader to be overwritten while it's also being
+ * executed from would allow privilege escalation, shaders must be
+ * created using this ioctl, and they can't be mmapped later.
+ */
+struct drm_vc4_create_shader_bo {
+	/* Size of the data argument. */
+	__u32 size;
+	/* Flags, currently must be 0. */
+	__u32 flags;
+
+	/* Pointer to the data. */
+	__u64 data;
+
+	/** Returned GEM handle for the BO. */
+	__u32 handle;
+	/* Pad, must be 0. */
+	__u32 pad;
+};
+
+struct drm_vc4_get_hang_state_bo {
+	__u32 handle;
+	__u32 paddr;
+	__u32 size;
+	__u32 pad;
+};
+
+/**
+ * struct drm_vc4_get_hang_state - ioctl argument for collecting state
+ * from a GPU hang for analysis.
+ */
+struct drm_vc4_get_hang_state {
+	/** Pointer to array of struct drm_vc4_get_hang_state_bo. */
+	__u64 bo;
+	/**
+	 * On input, the size of the bo array.  Output is the number
+	 * of bos to be returned.
+	 */
+	__u32 bo_count;
+
+	__u32 start_bin, start_render;
+
+	__u32 ct0ca, ct0ea;
+	__u32 ct1ca, ct1ea;
+	__u32 ct0cs, ct1cs;
+	__u32 ct0ra0, ct1ra0;
+
+	__u32 bpca, bpcs;
+	__u32 bpoa, bpos;
+
+	__u32 vpmbase;
+
+	__u32 dbge;
+	__u32 fdbgo;
+	__u32 fdbgb;
+	__u32 fdbgr;
+	__u32 fdbgs;
+	__u32 errstat;
+
+	/* Pad that we may save more registers into in the future. */
+	__u32 pad[16];
+};
+
+#define DRM_VC4_PARAM_V3D_IDENT0		0
+#define DRM_VC4_PARAM_V3D_IDENT1		1
+#define DRM_VC4_PARAM_V3D_IDENT2		2
+#define DRM_VC4_PARAM_SUPPORTS_BRANCHES		3
+#define DRM_VC4_PARAM_SUPPORTS_ETC1		4
+#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS	5
+
+struct drm_vc4_get_param {
+	__u32 param;
+	__u32 pad;
+	__u64 value;
+};
+
+struct drm_vc4_get_tiling {
+	__u32 handle;
+	__u32 flags;
+	__u64 modifier;
+};
+
+struct drm_vc4_set_tiling {
+	__u32 handle;
+	__u32 flags;
+	__u64 modifier;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* _VC4_DRM_H_ */
diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index 17504f5..57e70b7 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -165,3 +165,26 @@
 CHIPSET(0x593B, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x3184, glk,     "Intel(R) HD Graphics (Geminilake)")
 CHIPSET(0x3185, glk_2x6, "Intel(R) HD Graphics (Geminilake 2x6)")
+CHIPSET(0x3E90, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)")
+CHIPSET(0x3E93, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)")
+CHIPSET(0x3E91, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
+CHIPSET(0x3E92, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
+CHIPSET(0x3E96, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
+CHIPSET(0x3E9B, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
+CHIPSET(0x3E94, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
+CHIPSET(0x3EA6, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)")
+CHIPSET(0x3EA7, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)")
+CHIPSET(0x3EA8, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)")
+CHIPSET(0x3EA5, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)")
+CHIPSET(0x5A49, cnl_2x8, "Intel(R) HD Graphics (Cannonlake 2x8 GT0.5)")
+CHIPSET(0x5A4A, cnl_2x8, "Intel(R) HD Graphics (Cannonlake 2x8 GT0.5)")
+CHIPSET(0x5A41, cnl_3x8, "Intel(R) HD Graphics (Cannonlake 3x8 GT1)")
+CHIPSET(0x5A42, cnl_3x8, "Intel(R) HD Graphics (Cannonlake 3x8 GT1)")
+CHIPSET(0x5A44, cnl_3x8, "Intel(R) HD Graphics (Cannonlake 3x8 GT1)")
+CHIPSET(0x5A59, cnl_4x8, "Intel(R) HD Graphics (Cannonlake 4x8 GT1.5)")
+CHIPSET(0x5A5A, cnl_4x8, "Intel(R) HD Graphics (Cannonlake 4x8 GT1.5)")
+CHIPSET(0x5A5C, cnl_4x8, "Intel(R) HD Graphics (Cannonlake 4x8 GT1.5)")
+CHIPSET(0x5A50, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)")
+CHIPSET(0x5A51, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)")
+CHIPSET(0x5A52, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)")
+CHIPSET(0x5A54, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)")
diff --git a/scons/custom.py b/scons/custom.py
index 544b15d..955247c 100644
--- a/scons/custom.py
+++ b/scons/custom.py
@@ -281,7 +281,7 @@
                     # cause duplicate actions.
                     f = f[len(cur_srcdir + '/'):]
                 # do not include any headers
-                if f.endswith(tuple(['.h','.hpp'])):
+                if f.endswith(tuple(['.h','.hpp','.inl'])):
                     continue
                 srcs.append(f)
 
diff --git a/scons/gallium.py b/scons/gallium.py
index 61643a6..e394bf8 100755
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -145,6 +145,30 @@
     sys.stdout.write(' %s\n' % ['no', 'yes'][int(bool(result))])
     return result
 
+def check_header(env, header):
+    '''Check if the header exist'''
+
+    conf = SCons.Script.Configure(env)
+    have_header = False
+
+    if conf.CheckHeader(header):
+        have_header = True
+
+    env = conf.Finish()
+    return have_header
+
+def check_functions(env, functions):
+    '''Check if all of the functions exist'''
+
+    conf = SCons.Script.Configure(env)
+    have_functions = True
+
+    for function in functions:
+        if not conf.CheckFunc(function):
+            have_functions = False
+
+    env = conf.Finish()
+    return have_functions
 
 def check_prog(env, prog):
     """Check whether this program exists."""
@@ -233,16 +257,16 @@
     # Backwards compatability with the debug= profile= options
     if env['build'] == 'debug':
         if not env['debug']:
-            print 'scons: warning: debug option is deprecated and will be removed eventually; use instead'
-            print
-            print ' scons build=release'
-            print
+            print('scons: warning: debug option is deprecated and will be removed eventually; use instead')
+            print('')
+            print(' scons build=release')
+            print('')
             env['build'] = 'release'
         if env['profile']:
-            print 'scons: warning: profile option is deprecated and will be removed eventually; use instead'
-            print
-            print ' scons build=profile'
-            print
+            print('scons: warning: profile option is deprecated and will be removed eventually; use instead')
+            print('')
+            print(' scons build=profile')
+            print('')
             env['build'] = 'profile'
     if False:
         # Enforce SConscripts to use the new build variable
@@ -276,7 +300,7 @@
     env['build_dir'] = build_dir
     env.SConsignFile(os.path.join(build_dir, '.sconsign'))
     if 'SCONS_CACHE_DIR' in os.environ:
-        print 'scons: Using build cache in %s.' % (os.environ['SCONS_CACHE_DIR'],)
+        print('scons: Using build cache in %s.' % (os.environ['SCONS_CACHE_DIR'],))
         env.CacheDir(os.environ['SCONS_CACHE_DIR'])
     env['CONFIGUREDIR'] = os.path.join(build_dir, 'conf')
     env['CONFIGURELOG'] = os.path.join(os.path.abspath(build_dir), 'config.log')
@@ -325,10 +349,11 @@
                 'GLX_INDIRECT_RENDERING',
             ]
 
-        conf = SCons.Script.Configure(env)
-        if conf.CheckHeader('xlocale.h'):
+        if check_header(env, 'xlocale.h'):
             cppdefines += ['HAVE_XLOCALE_H']
-        env = conf.Finish()
+
+        if check_functions(env, ['strtod_l', 'strtof_l']):
+            cppdefines += ['HAVE_STRTOD_L']
 
     if platform == 'windows':
         cppdefines += [
@@ -360,8 +385,8 @@
     if env['embedded']:
         cppdefines += ['PIPE_SUBSYSTEM_EMBEDDED']
     if env['texture_float']:
-        print 'warning: Floating-point textures enabled.'
-        print 'warning: Please consult docs/patents.txt with your lawyer before building Mesa.'
+        print('warning: Floating-point textures enabled.')
+        print('warning: Please consult docs/patents.txt with your lawyer before building Mesa.')
         cppdefines += ['TEXTURE_FLOAT_ENABLED']
     env.Append(CPPDEFINES = cppdefines)
 
diff --git a/scons/llvm.py b/scons/llvm.py
index 928fc97..eaa2ecb 100644
--- a/scons/llvm.py
+++ b/scons/llvm.py
@@ -68,13 +68,13 @@
     if env['platform'] == 'windows':
         # XXX: There is no llvm-config on Windows, so assume a standard layout
         if llvm_dir is None:
-            print 'scons: LLVM environment variable must be specified when building for windows'
+            print('scons: LLVM environment variable must be specified when building for windows')
             return
 
         # Try to determine the LLVM version from llvm/Config/config.h
         llvm_config = os.path.join(llvm_dir, 'include/llvm/Config/llvm-config.h')
         if not os.path.exists(llvm_config):
-            print 'scons: could not find %s' % llvm_config
+            print('scons: could not find %s' % llvm_config)
             return
         llvm_version_major_re = re.compile(r'^#define LLVM_VERSION_MAJOR ([0-9]+)')
         llvm_version_minor_re = re.compile(r'^#define LLVM_VERSION_MINOR ([0-9]+)')
@@ -92,10 +92,10 @@
             llvm_version = distutils.version.LooseVersion('%s.%s' % (llvm_version_major, llvm_version_minor))
 
         if llvm_version is None:
-            print 'scons: could not determine the LLVM version from %s' % llvm_config
+            print('scons: could not determine the LLVM version from %s' % llvm_config)
             return
         if llvm_version < distutils.version.LooseVersion(required_llvm_version):
-            print 'scons: LLVM version %s found, but %s is required' % (llvm_version, required_llvm_version)
+            print('scons: LLVM version %s found, but %s is required' % (llvm_version, required_llvm_version))
             return
 
         env.Prepend(CPPPATH = [os.path.join(llvm_dir, 'include')])
@@ -104,7 +104,26 @@
         ])
         env.Prepend(LIBPATH = [os.path.join(llvm_dir, 'lib')])
         # LIBS should match the output of `llvm-config --libs engine mcjit bitwriter x86asmprinter irreader`
-        if llvm_version >= distutils.version.LooseVersion('4.0'):
+        if llvm_version >= distutils.version.LooseVersion('5.0'):
+            env.Prepend(LIBS = [
+                'LLVMX86Disassembler', 'LLVMX86AsmParser',
+                'LLVMX86CodeGen', 'LLVMSelectionDAG', 'LLVMAsmPrinter',
+                'LLVMDebugInfoCodeView', 'LLVMCodeGen',
+                'LLVMScalarOpts', 'LLVMInstCombine',
+                'LLVMTransformUtils',
+                'LLVMBitWriter', 'LLVMX86Desc',
+                'LLVMMCDisassembler', 'LLVMX86Info',
+                'LLVMX86AsmPrinter', 'LLVMX86Utils',
+                'LLVMMCJIT', 'LLVMExecutionEngine', 'LLVMTarget',
+                'LLVMAnalysis', 'LLVMProfileData',
+                'LLVMRuntimeDyld', 'LLVMObject', 'LLVMMCParser',
+                'LLVMBitReader', 'LLVMMC', 'LLVMCore',
+                'LLVMSupport',
+                'LLVMIRReader', 'LLVMAsmParser',
+                'LLVMDemangle', 'LLVMGlobalISel', 'LLVMDebugInfoMSF',
+                'LLVMBinaryFormat',
+            ])
+        elif llvm_version >= distutils.version.LooseVersion('4.0'):
             env.Prepend(LIBS = [
                 'LLVMX86Disassembler', 'LLVMX86AsmParser',
                 'LLVMX86CodeGen', 'LLVMSelectionDAG', 'LLVMAsmPrinter',
@@ -212,14 +231,14 @@
     else:
         llvm_config = os.environ.get('LLVM_CONFIG', 'llvm-config')
         if not env.Detect(llvm_config):
-            print 'scons: %s script not found' % llvm_config
+            print('scons: %s script not found' % llvm_config)
             return
 
         llvm_version = env.backtick('%s --version' % llvm_config).rstrip()
         llvm_version = distutils.version.LooseVersion(llvm_version)
 
         if llvm_version < distutils.version.LooseVersion(required_llvm_version):
-            print 'scons: LLVM version %s found, but %s is required' % (llvm_version, required_llvm_version)
+            print('scons: LLVM version %s found, but %s is required' % (llvm_version, required_llvm_version))
             return
 
         try:
@@ -245,13 +264,13 @@
                 env.ParseConfig('%s --system-libs' % llvm_config)
                 env.Append(CXXFLAGS = ['-std=c++11'])
         except OSError:
-            print 'scons: llvm-config version %s failed' % llvm_version
+            print('scons: llvm-config version %s failed' % llvm_version)
             return
 
     assert llvm_version is not None
     env['llvm'] = True
 
-    print 'scons: Found LLVM version %s' % llvm_version
+    print('scons: Found LLVM version %s' % llvm_version)
     env['LLVM_VERSION'] = llvm_version
 
     # Define HAVE_LLVM macro with the major/minor version number (e.g., 0x0206 for 2.6)
diff --git a/src/Makefile.am b/src/Makefile.am
index aa5f8aa..5aee6b0 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -21,18 +21,7 @@
 
 .PHONY: git_sha1.h.tmp
 git_sha1.h.tmp:
-	@# Don't assume that $(top_srcdir)/.git is a directory. It may be
-	@# a gitlink file if $(top_srcdir) is a submodule checkout or a linked
-	@# worktree.
-	@# If we are building from a release tarball copy the bundled header.
-	@touch git_sha1.h.tmp
-	@if test -e $(top_srcdir)/.git; then \
-		if which git > /dev/null; then \
-		    git --git-dir=$(top_srcdir)/.git log -n 1 --oneline | \
-			sed 's/^\([^ ]*\) .*/#define MESA_GIT_SHA1 "git-\1"/' \
-			> git_sha1.h.tmp ; \
-		fi \
-	fi
+	@sh $(top_srcdir)/bin/git_sha1_gen.sh > $@
 
 git_sha1.h: git_sha1.h.tmp
 	@echo "updating git_sha1.h"
@@ -93,6 +82,10 @@
 SUBDIRS += intel
 endif
 
+if HAVE_GALLIUM_VC4
+SUBDIRS += broadcom
+endif
+
 if NEED_OPENGL_COMMON
 SUBDIRS += mesa
 endif
diff --git a/src/SConscript b/src/SConscript
index d861af8..5e84398 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -22,27 +22,15 @@
     to retrieve the git hashid and write the header file.  An empty file
     will be created if anything goes wrong."""
 
-    args = [ 'git', 'rev-parse', '--short=10', 'HEAD' ]
-    try:
-        (commit, foo) = subprocess.Popen(args, stdout=subprocess.PIPE).communicate()
-    except:
-        print "Warning: exception in write_git_sha1_h_file()"
-        # git log command didn't work
-        if not os.path.exists(filename):
-            dirname = os.path.dirname(filename)
-            if dirname and not os.path.exists(dirname):
-                os.makedirs(dirname)
-            # create an empty file if none already exists
-            f = open(filename, "w")
-            f.close()
-        return
-
-    # note that commit[:-1] removes the trailing newline character
-    commit = '#define MESA_GIT_SHA1 "git-%s"\n' % commit[:-1]
     tempfile = "git_sha1.h.tmp"
-    f = open(tempfile, "w")
-    f.write(commit)
-    f.close()
+    with open(tempfile, "w") as f:
+        args = [ 'sh', Dir('#').abspath + '/bin/git_sha1_gen.sh' ]
+        try:
+            subprocess.Popen(args, stdout=f).wait()
+        except:
+            print("Warning: exception in write_git_sha1_h_file()")
+            return
+
     if not os.path.exists(filename) or not filecmp.cmp(tempfile, filename):
         # The filename does not exist or it's different from the new file,
         # so replace old file with new.
diff --git a/src/amd/Android.addrlib.mk b/src/amd/Android.addrlib.mk
index 540de55..a29f7c1 100644
--- a/src/amd/Android.addrlib.mk
+++ b/src/amd/Android.addrlib.mk
@@ -42,5 +42,11 @@
 	$(MESA_TOP)/src/amd/addrlib/gfx9/chip \
 	$(MESA_TOP)/src/amd/addrlib/r800/chip
 
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+	$(LOCAL_PATH) \
+	$(LOCAL_PATH)/addrlib/core \
+	$(LOCAL_PATH)/addrlib/inc/chip/r800 \
+	$(LOCAL_PATH)/addrlib/r800/chip
+
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
diff --git a/src/amd/Android.common.mk b/src/amd/Android.common.mk
index faace71..92b2452 100644
--- a/src/amd/Android.common.mk
+++ b/src/amd/Android.common.mk
@@ -29,6 +29,7 @@
 LOCAL_MODULE := libmesa_amd_common
 
 LOCAL_SRC_FILES := \
+	$(AMD_COMMON_FILES) \
 	$(AMD_COMPILER_FILES) \
 	$(AMD_DEBUG_FILES)
 
@@ -49,15 +50,25 @@
 	$(MESA_TOP)/include \
 	$(MESA_TOP)/src \
 	$(MESA_TOP)/src/amd/common \
+	$(MESA_TOP)/src/compiler \
+	$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir \
 	$(MESA_TOP)/src/gallium/include \
 	$(MESA_TOP)/src/gallium/auxiliary \
-	$(intermediates)/common \
-	external/llvm/include \
-	external/llvm/device/include \
-	external/libcxx/include \
-	$(ELF_INCLUDES)
+	$(intermediates)/common
 
-LOCAL_STATIC_LIBRARIES := libLLVMCore
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+	$(LOCAL_PATH)/common
+
+LOCAL_SHARED_LIBRARIES := \
+	libdrm_amdgpu
+
+LOCAL_STATIC_LIBRARIES := \
+	libmesa_nir
+
+LOCAL_WHOLE_STATIC_LIBRARIES := \
+	libelf
+
+$(call mesa-build-with-llvm)
 
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
diff --git a/src/amd/Makefile.common.am b/src/amd/Makefile.common.am
index 83f148b..dff461c 100644
--- a/src/amd/Makefile.common.am
+++ b/src/amd/Makefile.common.am
@@ -25,6 +25,7 @@
 
 # TODO cleanup these
 common_libamd_common_la_CPPFLAGS = \
+	$(AMDGPU_CFLAGS) \
 	$(VALGRIND_CFLAGS) \
 	$(DEFINES) \
 	-I$(top_srcdir)/include \
@@ -54,6 +55,7 @@
 noinst_LTLIBRARIES += $(COMMON_LIBS)
 
 common_libamd_common_la_SOURCES = \
+	$(AMD_COMMON_FILES) \
 	$(AMD_COMPILER_FILES) \
 	$(AMD_DEBUG_FILES) \
 	$(AMD_GENERATED_FILES)
diff --git a/src/amd/Makefile.sources b/src/amd/Makefile.sources
index 816e7e4..098f6b9 100644
--- a/src/amd/Makefile.sources
+++ b/src/amd/Makefile.sources
@@ -42,16 +42,25 @@
 AMD_COMPILER_FILES = \
 	common/ac_binary.c \
 	common/ac_binary.h \
+	common/ac_exp_param.h \
 	common/ac_llvm_build.c \
 	common/ac_llvm_build.h \
 	common/ac_llvm_helper.cpp \
 	common/ac_llvm_util.c \
-	common/ac_llvm_util.h
+	common/ac_llvm_util.h \
+	common/ac_shader_info.c \
+	common/ac_shader_info.h
 
 AMD_NIR_FILES = \
 	common/ac_nir_to_llvm.c \
 	common/ac_nir_to_llvm.h
 
+AMD_COMMON_FILES = \
+	common/ac_gpu_info.c \
+	common/ac_gpu_info.h \
+	common/ac_surface.c \
+	common/ac_surface.h
+
 AMD_DEBUG_FILES = \
 	common/ac_debug.c \
 	common/ac_debug.h
diff --git a/src/amd/addrlib/addrinterface.cpp b/src/amd/addrlib/addrinterface.cpp
index ea2506e..85e298d 100644
--- a/src/amd/addrlib/addrinterface.cpp
+++ b/src/amd/addrlib/addrinterface.cpp
@@ -1054,7 +1054,7 @@
 */
 ADDR_E_RETURNCODE ADDR_API AddrGetMaxAlignments(
     ADDR_HANDLE                     hLib, ///< address lib handle
-    ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut) ///< [out] output structure
+    ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut) ///< [out] output structure
 {
     Addr::Lib* pLib = Lib::GetLib(hLib);
 
diff --git a/src/amd/addrlib/addrinterface.h b/src/amd/addrlib/addrinterface.h
index c36d465..b9d4d8f 100644
--- a/src/amd/addrlib/addrinterface.h
+++ b/src/amd/addrlib/addrinterface.h
@@ -2295,17 +2295,17 @@
 
 /**
 ****************************************************************************************************
-*   ADDR_GET_MAX_ALINGMENTS_OUTPUT
+*   ADDR_GET_MAX_ALIGNMENTS_OUTPUT
 *
 *   @brief
 *       Output structure of AddrGetMaxAlignments
 ****************************************************************************************************
 */
-typedef struct _ADDR_GET_MAX_ALINGMENTS_OUTPUT
+typedef struct _ADDR_GET_MAX_ALIGNMENTS_OUTPUT
 {
     UINT_32 size;                   ///< Size of this structure in bytes
     UINT_64 baseAlign;              ///< Maximum base alignment in bytes
-} ADDR_GET_MAX_ALINGMENTS_OUTPUT;
+} ADDR_GET_MAX_ALIGNMENTS_OUTPUT;
 
 /**
 ****************************************************************************************************
@@ -2317,7 +2317,7 @@
 */
 ADDR_E_RETURNCODE ADDR_API AddrGetMaxAlignments(
     ADDR_HANDLE                     hLib,
-    ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut);
+    ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut);
 
 
 
diff --git a/src/amd/addrlib/core/addrlib.cpp b/src/amd/addrlib/core/addrlib.cpp
index b86fd81..65fd345 100644
--- a/src/amd/addrlib/core/addrlib.cpp
+++ b/src/amd/addrlib/core/addrlib.cpp
@@ -356,14 +356,14 @@
 ****************************************************************************************************
 */
 ADDR_E_RETURNCODE Lib::GetMaxAlignments(
-    ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut    ///< [out] output structure
+    ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut    ///< [out] output structure
     ) const
 {
     ADDR_E_RETURNCODE returnCode = ADDR_OK;
 
     if (GetFillSizeFieldsFlags() == TRUE)
     {
-        if (pOut->size != sizeof(ADDR_GET_MAX_ALINGMENTS_OUTPUT))
+        if (pOut->size != sizeof(ADDR_GET_MAX_ALIGNMENTS_OUTPUT))
         {
             returnCode = ADDR_PARAMSIZEMISMATCH;
         }
diff --git a/src/amd/addrlib/core/addrlib.h b/src/amd/addrlib/core/addrlib.h
index 736604e..2070084 100644
--- a/src/amd/addrlib/core/addrlib.h
+++ b/src/amd/addrlib/core/addrlib.h
@@ -169,14 +169,14 @@
 
     BOOL_32 GetExportNorm(const ELEM_GETEXPORTNORM_INPUT* pIn) const;
 
-    ADDR_E_RETURNCODE GetMaxAlignments(ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut) const;
+    ADDR_E_RETURNCODE GetMaxAlignments(ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut) const;
 
 protected:
     Lib();  // Constructor is protected
     Lib(const Client* pClient);
 
     /// Pure virtual function to get max alignments
-    virtual ADDR_E_RETURNCODE HwlGetMaxAlignments(ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut) const = 0;
+    virtual ADDR_E_RETURNCODE HwlGetMaxAlignments(ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut) const = 0;
 
     //
     // Initialization
diff --git a/src/amd/addrlib/core/addrobject.cpp b/src/amd/addrlib/core/addrobject.cpp
index dcdb1bf..452feb5 100644
--- a/src/amd/addrlib/core/addrobject.cpp
+++ b/src/amd/addrlib/core/addrobject.cpp
@@ -216,20 +216,16 @@
 #if DEBUG
     if (m_client.callbacks.debugPrint != NULL)
     {
-        va_list ap;
-
-        va_start(ap, pDebugString);
-
         ADDR_DEBUGPRINT_INPUT debugPrintInput = {0};
 
         debugPrintInput.size         = sizeof(ADDR_DEBUGPRINT_INPUT);
         debugPrintInput.pDebugString = const_cast<CHAR*>(pDebugString);
         debugPrintInput.hClient      = m_client.handle;
-        va_copy(debugPrintInput.ap, ap);
+        va_start(debugPrintInput.ap, pDebugString);
 
         m_client.callbacks.debugPrint(&debugPrintInput);
 
-        va_end(ap);
+        va_end(debugPrintInput.ap);
     }
 #endif
 }
diff --git a/src/amd/addrlib/gfx9/gfx9addrlib.cpp b/src/amd/addrlib/gfx9/gfx9addrlib.cpp
index 9b25371..57ecb05 100644
--- a/src/amd/addrlib/gfx9/gfx9addrlib.cpp
+++ b/src/amd/addrlib/gfx9/gfx9addrlib.cpp
@@ -663,7 +663,7 @@
 ************************************************************************************************************************
 */
 ADDR_E_RETURNCODE Gfx9Lib::HwlGetMaxAlignments(
-    ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut    ///< [out] output structure
+    ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut    ///< [out] output structure
     ) const
 {
     pOut->baseAlign = HwlComputeSurfaceBaseAlign(ADDR_SW_64KB);
diff --git a/src/amd/addrlib/gfx9/gfx9addrlib.h b/src/amd/addrlib/gfx9/gfx9addrlib.h
index 9623610..b7d1287 100644
--- a/src/amd/addrlib/gfx9/gfx9addrlib.h
+++ b/src/amd/addrlib/gfx9/gfx9addrlib.h
@@ -374,7 +374,7 @@
 
 private:
     virtual ADDR_E_RETURNCODE HwlGetMaxAlignments(
-        ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut) const;
+        ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut) const;
 
     virtual BOOL_32 HwlInitGlobalParams(
         const ADDR_CREATE_INPUT* pCreateIn);
diff --git a/src/amd/addrlib/r800/ciaddrlib.cpp b/src/amd/addrlib/r800/ciaddrlib.cpp
index fe965b8..5dd52dd 100644
--- a/src/amd/addrlib/r800/ciaddrlib.cpp
+++ b/src/amd/addrlib/r800/ciaddrlib.cpp
@@ -2177,7 +2177,7 @@
 ****************************************************************************************************
 */
 ADDR_E_RETURNCODE CiLib::HwlGetMaxAlignments(
-    ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut    ///< [out] output structure
+    ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut    ///< [out] output structure
     ) const
 {
     const UINT_32 pipes = HwlGetPipes(&m_tileTable[0].info);
diff --git a/src/amd/addrlib/r800/ciaddrlib.h b/src/amd/addrlib/r800/ciaddrlib.h
index 7e331dd..3c838df 100644
--- a/src/amd/addrlib/r800/ciaddrlib.h
+++ b/src/amd/addrlib/r800/ciaddrlib.h
@@ -168,7 +168,7 @@
         const ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT*  pIn,
         ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT*       pOut) const;
 
-    virtual ADDR_E_RETURNCODE HwlGetMaxAlignments(ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut) const;
+    virtual ADDR_E_RETURNCODE HwlGetMaxAlignments(ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut) const;
 
     virtual VOID HwlPadDimensions(
         AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
diff --git a/src/amd/addrlib/r800/siaddrlib.cpp b/src/amd/addrlib/r800/siaddrlib.cpp
index ffa5488..9ee1335 100644
--- a/src/amd/addrlib/r800/siaddrlib.cpp
+++ b/src/amd/addrlib/r800/siaddrlib.cpp
@@ -3483,7 +3483,7 @@
 ****************************************************************************************************
 */
 ADDR_E_RETURNCODE SiLib::HwlGetMaxAlignments(
-    ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut    ///< [out] output structure
+    ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut    ///< [out] output structure
     ) const
 {
     const UINT_32 pipes = HwlGetPipes(&m_tileTable[0].info);
diff --git a/src/amd/addrlib/r800/siaddrlib.h b/src/amd/addrlib/r800/siaddrlib.h
index 37e26ff..faf63fd 100644
--- a/src/amd/addrlib/r800/siaddrlib.h
+++ b/src/amd/addrlib/r800/siaddrlib.h
@@ -245,7 +245,7 @@
         return TRUE;
     }
 
-    virtual ADDR_E_RETURNCODE HwlGetMaxAlignments(ADDR_GET_MAX_ALINGMENTS_OUTPUT* pOut) const;
+    virtual ADDR_E_RETURNCODE HwlGetMaxAlignments(ADDR_GET_MAX_ALIGNMENTS_OUTPUT* pOut) const;
 
     virtual VOID HwlComputeSurfaceAlignmentsMacroTiled(
         AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c
index 618b5cf..1bf52c7 100644
--- a/src/amd/common/ac_binary.c
+++ b/src/amd/common/ac_binary.c
@@ -109,7 +109,7 @@
 	}
 }
 
-void ac_elf_read(const char *elf_data, unsigned elf_size,
+bool ac_elf_read(const char *elf_data, unsigned elf_size,
 		 struct ac_shader_binary *binary)
 {
 	char *elf_buffer;
@@ -118,6 +118,7 @@
 	Elf_Data *symbols = NULL, *relocs = NULL;
 	size_t section_str_index;
 	unsigned symbol_sh_link = 0;
+	bool success = true;
 
 	/* One of the libelf implementations
 	 * (http://www.mr511.de/software/english.htm) requires calling
@@ -137,7 +138,8 @@
 		GElf_Shdr section_header;
 		if (gelf_getshdr(section, &section_header) != &section_header) {
 			fprintf(stderr, "Failed to read ELF section header\n");
-			return;
+			success = false;
+			break;
 		}
 		name = elf_strptr(elf, section_str_index, section_header.sh_name);
 		if (!strcmp(name, ".text")) {
@@ -148,6 +150,11 @@
 		} else if (!strcmp(name, ".AMDGPU.config")) {
 			section_data = elf_getdata(section, section_data);
 			binary->config_size = section_data->d_size;
+			if (!binary->config_size) {
+				fprintf(stderr, ".AMDGPU.config is empty!\n");
+				success = false;
+				break;
+			}
 			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
 			memcpy(binary->config, section_data->d_buf, binary->config_size);
 		} else if (!strcmp(name, ".AMDGPU.disasm")) {
@@ -186,6 +193,7 @@
 		binary->global_symbol_count = 1;
 		binary->config_size_per_symbol = binary->config_size;
 	}
+	return success;
 }
 
 const unsigned char *ac_shader_binary_config_start(
diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h
index a784a72..45f554e 100644
--- a/src/amd/common/ac_binary.h
+++ b/src/amd/common/ac_binary.h
@@ -83,7 +83,7 @@
  * Parse the elf binary stored in \p elf_data and create a
  * ac_shader_binary object.
  */
-void ac_elf_read(const char *elf_data, unsigned elf_size,
+bool ac_elf_read(const char *elf_data, unsigned elf_size,
 		 struct ac_shader_binary *binary);
 
 /**
diff --git a/src/amd/common/ac_debug.c b/src/amd/common/ac_debug.c
index 9d051f9..79473ec 100644
--- a/src/amd/common/ac_debug.c
+++ b/src/amd/common/ac_debug.c
@@ -132,9 +132,15 @@
 static void ac_parse_set_reg_packet(FILE *f, uint32_t *ib, unsigned count,
 				    unsigned reg_offset)
 {
-	unsigned reg = (ib[1] << 2) + reg_offset;
+	unsigned reg = ((ib[1] & 0xFFFF) << 2) + reg_offset;
+	unsigned index = ib[1] >> 28;
 	int i;
 
+	if (index != 0) {
+		print_spaces(f, INDENT_PKT);
+		fprintf(f, "INDEX = %u\n", index);
+	}
+
 	for (i = 0; i < count; i++)
 		ac_dump_reg(f, reg + i*4, ib[2+i], ~0);
 }
@@ -214,6 +220,52 @@
 			print_named_value(f, "ADDRESS_HI", ib[3], 16);
 		}
 		break;
+	case PKT3_EVENT_WRITE_EOP:
+		ac_dump_reg(f, R_028A90_VGT_EVENT_INITIATOR, ib[1],
+			    S_028A90_EVENT_TYPE(~0));
+		print_named_value(f, "EVENT_INDEX", (ib[1] >> 8) & 0xf, 4);
+		print_named_value(f, "TCL1_VOL_ACTION_ENA", (ib[1] >> 12) & 0x1, 1);
+		print_named_value(f, "TC_VOL_ACTION_ENA", (ib[1] >> 13) & 0x1, 1);
+		print_named_value(f, "TC_WB_ACTION_ENA", (ib[1] >> 15) & 0x1, 1);
+		print_named_value(f, "TCL1_ACTION_ENA", (ib[1] >> 16) & 0x1, 1);
+		print_named_value(f, "TC_ACTION_ENA", (ib[1] >> 17) & 0x1, 1);
+		print_named_value(f, "ADDRESS_LO", ib[2], 32);
+		print_named_value(f, "ADDRESS_HI", ib[3], 16);
+		print_named_value(f, "DST_SEL", (ib[3] >> 16) & 0x3, 2);
+		print_named_value(f, "INT_SEL", (ib[3] >> 24) & 0x7, 3);
+		print_named_value(f, "DATA_SEL", ib[3] >> 29, 3);
+		print_named_value(f, "DATA_LO", ib[4], 32);
+		print_named_value(f, "DATA_HI", ib[5], 32);
+		break;
+	case PKT3_RELEASE_MEM:
+		ac_dump_reg(f, R_028A90_VGT_EVENT_INITIATOR, ib[1],
+			    S_028A90_EVENT_TYPE(~0));
+		print_named_value(f, "EVENT_INDEX", (ib[1] >> 8) & 0xf, 4);
+		print_named_value(f, "TCL1_VOL_ACTION_ENA", (ib[1] >> 12) & 0x1, 1);
+		print_named_value(f, "TC_VOL_ACTION_ENA", (ib[1] >> 13) & 0x1, 1);
+		print_named_value(f, "TC_WB_ACTION_ENA", (ib[1] >> 15) & 0x1, 1);
+		print_named_value(f, "TCL1_ACTION_ENA", (ib[1] >> 16) & 0x1, 1);
+		print_named_value(f, "TC_ACTION_ENA", (ib[1] >> 17) & 0x1, 1);
+		print_named_value(f, "TC_NC_ACTION_ENA", (ib[1] >> 19) & 0x1, 1);
+		print_named_value(f, "TC_WC_ACTION_ENA", (ib[1] >> 20) & 0x1, 1);
+		print_named_value(f, "TC_MD_ACTION_ENA", (ib[1] >> 21) & 0x1, 1);
+		print_named_value(f, "DST_SEL", (ib[2] >> 16) & 0x3, 2);
+		print_named_value(f, "INT_SEL", (ib[2] >> 24) & 0x7, 3);
+		print_named_value(f, "DATA_SEL", ib[2] >> 29, 3);
+		print_named_value(f, "ADDRESS_LO", ib[3], 32);
+		print_named_value(f, "ADDRESS_HI", ib[4], 32);
+		print_named_value(f, "DATA_LO", ib[5], 32);
+		print_named_value(f, "DATA_HI", ib[6], 32);
+		print_named_value(f, "CTXID", ib[7], 32);
+		break;
+	case PKT3_WAIT_REG_MEM:
+		print_named_value(f, "OP", ib[1], 32);
+		print_named_value(f, "ADDRESS_LO", ib[2], 32);
+		print_named_value(f, "ADDRESS_HI", ib[3], 32);
+		print_named_value(f, "REF", ib[4], 32);
+		print_named_value(f, "MASK", ib[5], 32);
+		print_named_value(f, "POLL_INTERVAL", ib[6], 16);
+		break;
 	case PKT3_DRAW_INDEX_AUTO:
 		ac_dump_reg(f, R_030930_VGT_NUM_INDICES, ib[1], ~0);
 		ac_dump_reg(f, R_0287F0_VGT_DRAW_INITIATOR, ib[2], ~0);
diff --git a/src/amd/common/ac_exp_param.h b/src/amd/common/ac_exp_param.h
new file mode 100644
index 0000000..b97ce81
--- /dev/null
+++ b/src/amd/common/ac_exp_param.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ */
+#ifndef AC_EXP_PARAM_H
+#define AC_EXP_PARAM_H
+
+enum {
+	/* SPI_PS_INPUT_CNTL_i.OFFSET[0:4]: first (0) and last (31) offset. */
+	AC_EXP_PARAM_OFFSET_0 = 0,
+	AC_EXP_PARAM_OFFSET_31 = 31,
+	/* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1], numbered consecutively from 64. */
+	AC_EXP_PARAM_DEFAULT_VAL_0000 = 64,
+	AC_EXP_PARAM_DEFAULT_VAL_0001 = 65,
+	AC_EXP_PARAM_DEFAULT_VAL_1110 = 66,
+	AC_EXP_PARAM_DEFAULT_VAL_1111 = 67,
+	AC_EXP_PARAM_UNDEFINED = 255,
+};
+
+#endif
diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
new file mode 100644
index 0000000..929dfd2
--- /dev/null
+++ b/src/amd/common/ac_gpu_info.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright © 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+#include "ac_gpu_info.h"
+#include "sid.h"
+#include "gfx9d.h"
+
+#include "util/u_math.h"
+
+#include <stdio.h>
+
+#include <xf86drm.h>
+#include <amdgpu_drm.h>
+
+#include <amdgpu.h>
+
+#define CIK_TILE_MODE_COLOR_2D			14
+
+#define CIK__GB_TILE_MODE__PIPE_CONFIG(x)        (((x) >> 6) & 0x1f)
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P2               0
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16          4
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16         5
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32         6
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32         7
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16    8
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16    9
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16    10
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16   11
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16   12
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32   13
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32   14
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16   16
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16  17
+
+static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
+{
+	/* The PIPE_CONFIG field of the 2D color tile mode gives the pipe count. */
+	unsigned tile_mode_2d = info->gb_tile_mode[CIK_TILE_MODE_COLOR_2D];
+	switch (CIK__GB_TILE_MODE__PIPE_CONFIG(tile_mode_2d)) {
+	case CIK__PIPE_CONFIG__ADDR_SURF_P2:
+		return 2;
+	case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32:
+		return 4;
+	case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32:
+		return 8;
+	case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
+	case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
+		return 16;
+	default:
+		fprintf(stderr, "Invalid CIK pipe configuration, assuming P2\n");
+		assert(!"this should never occur");
+		return 2;
+	}
+}
+
+static bool has_syncobj(int fd)
+{
+	uint64_t cap_value = 0;
+
+	/* A failed query (non-zero drmGetCap() result) counts as unsupported. */
+	return drmGetCap(fd, DRM_CAP_SYNCOBJ, &cap_value) == 0 && cap_value != 0;
+}
+
+/**
+ * Fill \p info with PCI, memory, firmware and chip data queried for \p dev.
+ * \p amdinfo receives the raw amdgpu_query_gpu_info() result. The function
+ * reads info->drm_major/drm_minor without setting them, so the caller is
+ * expected to fill those in first. Returns false, after printing a message
+ * to stderr, when a query fails or the chip is not recognized. */
+bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
+		       struct radeon_info *info, struct amdgpu_gpu_info *amdinfo)
+{
+	struct amdgpu_buffer_size_alignments alignment_info = {};
+	struct amdgpu_heap_info vram, vram_vis, gtt;
+	struct drm_amdgpu_info_hw_ip dma = {}, compute = {}, uvd = {}, vce = {}, vcn_dec = {};
+	uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0;
+	uint32_t unused_feature;
+	int r, i, j;
+	drmDevicePtr devinfo;
+
+	/* Get PCI info. */
+	r = drmGetDevice2(fd, 0, &devinfo);
+	if (r) {
+		fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n");
+		return false;
+	}
+	info->pci_domain = devinfo->businfo.pci->domain;
+	info->pci_bus = devinfo->businfo.pci->bus;
+	info->pci_dev = devinfo->businfo.pci->dev;
+	info->pci_func = devinfo->businfo.pci->func;
+	drmFreeDevice(&devinfo);
+
+	/* Query hardware and driver information. */
+	r = amdgpu_query_gpu_info(dev, amdinfo);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_buffer_size_alignment(dev, &alignment_info);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM,
+				AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, &vram_vis);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_DMA, 0, &dma);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_COMPUTE, 0, &compute);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(compute) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD, 0, &uvd);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n");
+		return false;
+	}
+
+	if (info->drm_major == 3 && info->drm_minor >= 17) {
+		r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_DEC, 0, &vcn_dec);
+		if (r) {
+			fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_dec) failed.\n");
+			return false;
+		}
+	}
+
+	r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0,
+					&info->me_fw_version, &unused_feature);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0,
+					&info->pfp_fw_version, &unused_feature);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_CE, 0, 0,
+					&info->ce_fw_version, &unused_feature);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0,
+					&uvd_version, &uvd_feature);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCE, 0, &vce);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n");
+		return false;
+	}
+
+	r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0,
+					&vce_version, &vce_feature);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
+		return false;
+	}
+
+	/* Set chip identification. */
+	info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */
+	info->vce_harvest_config = amdinfo->vce_harvest_config;
+
+	switch (info->pci_id) {
+#define CHIPSET(pci_id, name, cfamily) case pci_id: info->family = CHIP_##cfamily; break;
+#include "pci_ids/radeonsi_pci_ids.h"
+#undef CHIPSET
+
+	default:
+		fprintf(stderr, "amdgpu: Invalid PCI ID.\n");
+		return false;
+	}
+
+	if (info->family >= CHIP_VEGA10)
+		info->chip_class = GFX9;
+	else if (info->family >= CHIP_TONGA)
+		info->chip_class = VI;
+	else if (info->family >= CHIP_BONAIRE)
+		info->chip_class = CIK;
+	else if (info->family >= CHIP_TAHITI)
+		info->chip_class = SI;
+	else {
+		fprintf(stderr, "amdgpu: Unknown family.\n");
+		return false;
+	}
+
+	/* Set which chips have dedicated VRAM. */
+	info->has_dedicated_vram = !(amdinfo->ids_flags & AMDGPU_IDS_FLAGS_FUSION);
+
+	/* Set hardware information. */
+	info->gart_size = gtt.heap_size;
+	info->vram_size = vram.heap_size;
+	info->vram_vis_size = vram_vis.heap_size;
+	/* The kernel can split large buffers in VRAM but not in GTT, so large
+	 * allocations can fail or cause buffer movement failures in the kernel.
+	 */
+	info->max_alloc_size = MIN2(info->vram_size * 0.9, info->gart_size * 0.7);
+	/* convert the shader clock from KHz to MHz */
+	info->max_shader_clock = amdinfo->max_engine_clk / 1000;
+	info->max_se = amdinfo->num_shader_engines;
+	info->max_sh_per_se = amdinfo->num_shader_arrays_per_engine;
+	info->has_hw_decode = (uvd.available_rings != 0) || (vcn_dec.available_rings != 0);
+	info->uvd_fw_version = uvd.available_rings ? uvd_version : 0;
+	info->vce_fw_version = vce.available_rings ? vce_version : 0;
+	info->has_userptr = true;
+	info->has_syncobj = has_syncobj(fd);
+	info->num_render_backends = amdinfo->rb_pipes;
+	info->clock_crystal_freq = amdinfo->gpu_counter_freq;
+	if (!info->clock_crystal_freq) {
+		fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
+		info->clock_crystal_freq = 1;
+	}
+	info->tcc_cache_line_size = 64; /* TC L2 line size on GCN */
+	if (info->chip_class == GFX9) {
+		info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(amdinfo->gb_addr_cfg);
+		info->pipe_interleave_bytes =
+			256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(amdinfo->gb_addr_cfg);
+	} else {
+		info->num_tile_pipes = cik_get_num_tile_pipes(amdinfo);
+		info->pipe_interleave_bytes =
+			256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(amdinfo->gb_addr_cfg);
+	}
+	info->has_virtual_memory = true;
+
+	assert(util_is_power_of_two(dma.available_rings + 1));
+	assert(util_is_power_of_two(compute.available_rings + 1));
+
+	info->num_sdma_rings = util_bitcount(dma.available_rings);
+	info->num_compute_rings = util_bitcount(compute.available_rings);
+
+	/* Get the number of good compute units. */
+	info->num_good_compute_units = 0;
+	for (i = 0; i < info->max_se; i++)
+		for (j = 0; j < info->max_sh_per_se; j++)
+			info->num_good_compute_units +=
+				util_bitcount(amdinfo->cu_bitmap[i][j]);
+
+	memcpy(info->si_tile_mode_array, amdinfo->gb_tile_mode,
+		sizeof(amdinfo->gb_tile_mode));
+	info->enabled_rb_mask = amdinfo->enabled_rb_pipes_mask;
+
+	memcpy(info->cik_macrotile_mode_array, amdinfo->gb_macro_tile_mode,
+		sizeof(amdinfo->gb_macro_tile_mode));
+
+	info->pte_fragment_size = alignment_info.size_local;
+	info->gart_page_size = alignment_info.size_remote;
+
+	if (info->chip_class == SI)
+		info->gfx_ib_pad_with_type2 = true;
+
+	return true;
+}
+
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
new file mode 100644
index 0000000..20907c2
--- /dev/null
+++ b/src/amd/common/ac_gpu_info.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright © 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+#ifndef AC_GPU_INFO_H
+#define AC_GPU_INFO_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#include "amd_family.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Prior to C11 the following may trigger a typedef redeclaration warning */
+typedef struct amdgpu_device *amdgpu_device_handle;
+struct amdgpu_gpu_info;
+
+struct radeon_info {
+	/* PCI info: domain:bus:dev:func */
+	uint32_t                    pci_domain;
+	uint32_t                    pci_bus;
+	uint32_t                    pci_dev;
+	uint32_t                    pci_func;
+
+	/* Device info. */
+	uint32_t                    pci_id;
+	enum radeon_family          family;
+	enum chip_class             chip_class;
+	uint32_t                    pte_fragment_size;
+	uint32_t                    gart_page_size;
+	uint64_t                    gart_size;
+	uint64_t                    vram_size;
+	uint64_t                    vram_vis_size;
+	uint64_t                    max_alloc_size;
+	uint32_t                    min_alloc_size;
+	bool                        has_dedicated_vram;
+	bool                        has_virtual_memory;
+	bool                        gfx_ib_pad_with_type2;
+	bool                        has_hw_decode;
+	uint32_t                    num_sdma_rings;
+	uint32_t                    num_compute_rings;
+	uint32_t                    uvd_fw_version;
+	uint32_t                    vce_fw_version;
+	uint32_t                    me_fw_version;
+	uint32_t                    pfp_fw_version;
+	uint32_t                    ce_fw_version;
+	uint32_t                    vce_harvest_config;
+	uint32_t                    clock_crystal_freq;
+	uint32_t                    tcc_cache_line_size;
+
+	/* Kernel info. */
+	uint32_t                    drm_major; /* version */
+	uint32_t                    drm_minor;
+	uint32_t                    drm_patchlevel;
+	bool                        has_userptr;
+	bool                        has_syncobj;
+
+	/* Shader cores. */
+	uint32_t                    r600_max_quad_pipes; /* wave size / 16 */
+	uint32_t                    max_shader_clock;
+	uint32_t                    num_good_compute_units;
+	uint32_t                    max_se; /* shader engines */
+	uint32_t                    max_sh_per_se; /* shader arrays per shader engine */
+
+	/* Render backends (color + depth blocks). */
+	uint32_t                    r300_num_gb_pipes;
+	uint32_t                    r300_num_z_pipes;
+	uint32_t                    r600_gb_backend_map; /* R600 harvest config */
+	bool                        r600_gb_backend_map_valid;
+	uint32_t                    r600_num_banks;
+	uint32_t                    num_render_backends;
+	uint32_t                    num_tile_pipes; /* pipe count from PIPE_CONFIG */
+	uint32_t                    pipe_interleave_bytes;
+	uint32_t                    enabled_rb_mask; /* GCN harvest config */
+
+	uint64_t                    max_alignment; /* from addrlib */
+	/* Tile modes. */
+	uint32_t                    si_tile_mode_array[32];
+	uint32_t                    cik_macrotile_mode_array[16];
+};
+
+bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
+		       struct radeon_info *info,
+		       struct amdgpu_gpu_info *amdinfo);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* AC_GPU_INFO_H */
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index d45094c..47db2b3 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -33,20 +33,25 @@
 #include <stdio.h>
 
 #include "ac_llvm_util.h"
-
+#include "ac_exp_param.h"
 #include "util/bitscan.h"
 #include "util/macros.h"
 #include "sid.h"
 
+#include "shader_enums.h"
+
 /* Initialize module-independent parts of the context.
  *
  * The caller is responsible for initializing ctx::module and ctx::builder.
  */
 void
-ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context)
+ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
+		     enum chip_class chip_class)
 {
 	LLVMValueRef args[1];
 
+	ctx->chip_class = chip_class;
+
 	ctx->context = context;
 	ctx->module = NULL;
 	ctx->builder = NULL;
@@ -54,11 +59,20 @@
 	ctx->voidt = LLVMVoidTypeInContext(ctx->context);
 	ctx->i1 = LLVMInt1TypeInContext(ctx->context);
 	ctx->i8 = LLVMInt8TypeInContext(ctx->context);
+	ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
 	ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
+	ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
+	ctx->f16 = LLVMHalfTypeInContext(ctx->context);
 	ctx->f32 = LLVMFloatTypeInContext(ctx->context);
+	ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
 	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
 	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
-	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
+	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
+
+	ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
+	ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
+	ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
+	ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
 
 	ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 						     "range", 5);
@@ -165,6 +179,20 @@
 	}
 }
 
+/**
+ * Helper function that builds an LLVM IR PHI node and immediately adds
+ * incoming edges.
+ */
+LLVMValueRef
+ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
+	     unsigned count_incoming, LLVMValueRef *values,
+	     LLVMBasicBlockRef *blocks)
+{
+	LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
+	LLVMAddIncoming(phi, values, blocks, count_incoming);
+	return phi;
+}
+
 LLVMValueRef
 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
 				LLVMValueRef *values,
@@ -231,42 +259,16 @@
 		     LLVMValueRef in[3],
 		     struct cube_selection_coords *out)
 {
-	LLVMBuilderRef builder = ctx->builder;
+	LLVMTypeRef f32 = ctx->f32;
 
-	if (HAVE_LLVM >= 0x0309) {
-		LLVMTypeRef f32 = ctx->f32;
-
-		out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
-					f32, in, 3, AC_FUNC_ATTR_READNONE);
-		out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
-					f32, in, 3, AC_FUNC_ATTR_READNONE);
-		out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
-					f32, in, 3, AC_FUNC_ATTR_READNONE);
-		out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
-					f32, in, 3, AC_FUNC_ATTR_READNONE);
-	} else {
-		LLVMValueRef c[4] = {
-			in[0],
-			in[1],
-			in[2],
-			LLVMGetUndef(LLVMTypeOf(in[0]))
-		};
-		LLVMValueRef vec = ac_build_gather_values(ctx, c, 4);
-
-		LLVMValueRef tmp =
-			ac_build_intrinsic(ctx, "llvm.AMDGPU.cube",
-					   LLVMTypeOf(vec), &vec, 1,
-					   AC_FUNC_ATTR_READNONE);
-
-		out->stc[1] = LLVMBuildExtractElement(builder, tmp,
-				LLVMConstInt(ctx->i32, 0, 0), "");
-		out->stc[0] = LLVMBuildExtractElement(builder, tmp,
-				LLVMConstInt(ctx->i32, 1, 0), "");
-		out->ma = LLVMBuildExtractElement(builder, tmp,
-				LLVMConstInt(ctx->i32, 2, 0), "");
-		out->id = LLVMBuildExtractElement(builder, tmp,
-				LLVMConstInt(ctx->i32, 3, 0), "");
-	}
+	out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
+					 f32, in, 3, AC_FUNC_ATTR_READNONE);
+	out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
+					 f32, in, 3, AC_FUNC_ATTR_READNONE);
+	out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
+				     f32, in, 3, AC_FUNC_ATTR_READNONE);
+	out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
+				     f32, in, 3, AC_FUNC_ATTR_READNONE);
 }
 
 /**
@@ -305,15 +307,15 @@
 	is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
 
 	/* Select sc */
-	tmp = LLVMBuildSelect(builder, is_ma_z, coords[2], coords[0], "");
+	tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
 	sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
-		LLVMBuildSelect(builder, is_ma_x, sgn_ma,
+		LLVMBuildSelect(builder, is_ma_z, sgn_ma,
 			LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
 	out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
 
 	/* Select tc */
 	tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
-	sgn = LLVMBuildSelect(builder, is_ma_y, LLVMBuildFNeg(builder, sgn_ma, ""),
+	sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
 		LLVMConstReal(f32, -1.0), "");
 	out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
 
@@ -327,7 +329,7 @@
 
 void
 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
-		       bool is_deriv, bool is_array,
+		       bool is_deriv, bool is_array, bool is_lod,
 		       LLVMValueRef *coords_arg,
 		       LLVMValueRef *derivs_arg)
 {
@@ -337,6 +339,38 @@
 	LLVMValueRef coords[3];
 	LLVMValueRef invma;
 
+	if (is_array && !is_lod) {
+		LLVMValueRef tmp = coords_arg[3];
+		tmp = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &tmp, 1, 0);
+
+		/* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
+		 *
+		 *    "For Array forms, the array layer used will be
+		 *
+		 *       max(0, min(d−1, floor(layer+0.5)))
+		 *
+		 *     where d is the depth of the texture array and layer
+		 *     comes from the component indicated in the tables below.
+		 *     Workaround for an issue where the layer is taken from a
+		 *     helper invocation which happens to fall on a different
+		 *     layer due to extrapolation."
+		 *
+		 * VI and earlier attempt to implement this in hardware by
+		 * clamping the value of coords[2] = (8 * layer) + face.
+		 * Unfortunately, this means that we end up with the wrong
+		 * face when clamping occurs.
+		 *
+		 * Clamp the layer earlier to work around the issue.
+		 */
+		if (ctx->chip_class <= VI) {
+			LLVMValueRef ge0;
+			ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
+			tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
+		}
+
+		coords_arg[3] = tmp;
+	}
+
 	build_cube_intrinsic(ctx, coords_arg, &selcoords);
 
 	invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
@@ -556,7 +590,7 @@
 			    bool has_add_tid)
 {
 	/* TODO: Fix stores with ADD_TID and remove the "has_add_tid" flag. */
-	if (HAVE_LLVM >= 0x0309 && !has_add_tid) {
+	if (!has_add_tid) {
 		/* Split 3 channel stores, becase LLVM doesn't support 3-channel
 		 * intrinsics. */
 		if (num_channels == 3) {
@@ -657,114 +691,89 @@
 		     unsigned inst_offset,
 		     unsigned glc,
 		     unsigned slc,
-		     bool readonly_memory)
+		     bool can_speculate,
+		     bool allow_smem)
 {
+	LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
+	if (voffset)
+		offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
+	if (soffset)
+		offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
+
+	/* TODO: VI and later generations can use SMEM with GLC=1.*/
+	if (allow_smem && !glc && !slc) {
+		assert(vindex == NULL);
+
+		LLVMValueRef result[4];
+
+		for (int i = 0; i < num_channels; i++) {
+			if (i) {
+				offset = LLVMBuildAdd(ctx->builder, offset,
+						      LLVMConstInt(ctx->i32, 4, 0), "");
+			}
+			LLVMValueRef args[2] = {rsrc, offset};
+			result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
+						       ctx->f32, args, 2,
+						       AC_FUNC_ATTR_READNONE |
+						       AC_FUNC_ATTR_LEGACY);
+		}
+		if (num_channels == 1)
+			return result[0];
+
+		if (num_channels == 3)
+			result[num_channels++] = LLVMGetUndef(ctx->f32);
+		return ac_build_gather_values(ctx, result, num_channels);
+	}
+
 	unsigned func = CLAMP(num_channels, 1, 3) - 1;
 
-	if (HAVE_LLVM >= 0x309) {
-		LLVMValueRef args[] = {
-			LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
-			vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
-			LLVMConstInt(ctx->i32, inst_offset, 0),
-			LLVMConstInt(ctx->i1, glc, 0),
-			LLVMConstInt(ctx->i1, slc, 0)
-		};
+	LLVMValueRef args[] = {
+		LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
+		vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
+		offset,
+		LLVMConstInt(ctx->i1, glc, 0),
+		LLVMConstInt(ctx->i1, slc, 0)
+	};
 
-		LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
-		                       ctx->v4f32};
-		const char *type_names[] = {"f32", "v2f32", "v4f32"};
-		char name[256];
+	LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
+			       ctx->v4f32};
+	const char *type_names[] = {"f32", "v2f32", "v4f32"};
+	char name[256];
 
-		if (voffset) {
-			args[2] = LLVMBuildAdd(ctx->builder, args[2], voffset,
-			                       "");
-		}
+	snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
+		 type_names[func]);
 
-		if (soffset) {
-			args[2] = LLVMBuildAdd(ctx->builder, args[2], soffset,
-			                       "");
-		}
-
-		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
-		         type_names[func]);
-
-		return ac_build_intrinsic(ctx, name, types[func], args,
-					  ARRAY_SIZE(args),
-					  /* READNONE means writes can't
-					   * affect it, while READONLY means
-					   * that writes can affect it. */
-					  readonly_memory && HAVE_LLVM >= 0x0400 ?
-						  AC_FUNC_ATTR_READNONE :
-						  AC_FUNC_ATTR_READONLY);
-	} else {
-		LLVMValueRef args[] = {
-			LLVMBuildBitCast(ctx->builder, rsrc, ctx->v16i8, ""),
-			voffset ? voffset : vindex,
-			soffset,
-			LLVMConstInt(ctx->i32, inst_offset, 0),
-			LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
-			LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
-			LLVMConstInt(ctx->i32, glc, 0),
-			LLVMConstInt(ctx->i32, slc, 0),
-			LLVMConstInt(ctx->i32, 0, 0), // TFE
-		};
-
-		LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
-		                       ctx->v4i32};
-		const char *type_names[] = {"i32", "v2i32", "v4i32"};
-		const char *arg_type = "i32";
-		char name[256];
-
-		if (voffset && vindex) {
-			LLVMValueRef vaddr[] = {vindex, voffset};
-
-			arg_type = "v2i32";
-			args[1] = ac_build_gather_values(ctx, vaddr, 2);
-		}
-
-		snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
-		         type_names[func], arg_type);
-
-		return ac_build_intrinsic(ctx, name, types[func], args,
-					  ARRAY_SIZE(args), AC_FUNC_ATTR_READONLY);
-	}
+	return ac_build_intrinsic(ctx, name, types[func], args,
+				  ARRAY_SIZE(args),
+				  /* READNONE means writes can't affect it, while
+				   * READONLY means that writes can affect it. */
+				  can_speculate && HAVE_LLVM >= 0x0400 ?
+					  AC_FUNC_ATTR_READNONE :
+					  AC_FUNC_ATTR_READONLY);
 }
 
 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
 					 LLVMValueRef rsrc,
 					 LLVMValueRef vindex,
 					 LLVMValueRef voffset,
-					 bool readonly_memory)
+					 bool can_speculate)
 {
-	if (HAVE_LLVM >= 0x0309) {
-		LLVMValueRef args [] = {
-			LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
-			vindex,
-			voffset,
-			LLVMConstInt(ctx->i1, 0, 0), /* glc */
-			LLVMConstInt(ctx->i1, 0, 0), /* slc */
-		};
-
-		return ac_build_intrinsic(ctx,
-					  "llvm.amdgcn.buffer.load.format.v4f32",
-					  ctx->v4f32, args, ARRAY_SIZE(args),
-					  /* READNONE means writes can't
-					   * affect it, while READONLY means
-					   * that writes can affect it. */
-					  readonly_memory && HAVE_LLVM >= 0x0400 ?
-						  AC_FUNC_ATTR_READNONE :
-						  AC_FUNC_ATTR_READONLY);
-	}
-
-	LLVMValueRef args[] = {
-		rsrc,
-		voffset,
+	LLVMValueRef args [] = {
+		LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
 		vindex,
+		voffset,
+		LLVMConstInt(ctx->i1, 0, 0), /* glc */
+		LLVMConstInt(ctx->i1, 0, 0), /* slc */
 	};
-	return ac_build_intrinsic(ctx, "llvm.SI.vs.load.input",
-				  ctx->v4f32, args, 3,
-				  AC_FUNC_ATTR_READNONE |
-				  AC_FUNC_ATTR_LEGACY);
+
+	return ac_build_intrinsic(ctx,
+				  "llvm.amdgcn.buffer.load.format.v4f32",
+				  ctx->v4f32, args, ARRAY_SIZE(args),
+				  /* READNONE means writes can't affect it, while
+				   * READONLY means that writes can affect it. */
+				  can_speculate && HAVE_LLVM >= 0x0400 ?
+					  AC_FUNC_ATTR_READNONE :
+					  AC_FUNC_ATTR_READONLY);
 }
 
 /**
@@ -835,21 +844,21 @@
 	      bool has_ds_bpermute,
 	      uint32_t mask,
 	      int idx,
-	      LLVMValueRef lds,
 	      LLVMValueRef val)
 {
-	LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
+	LLVMValueRef tl, trbl, args[2];
 	LLVMValueRef result;
 
-	thread_id = ac_get_thread_id(ctx);
-
-	tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
-			      LLVMConstInt(ctx->i32, mask, false), "");
-
-	trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
-				LLVMConstInt(ctx->i32, idx, false), "");
-
 	if (has_ds_bpermute) {
+		LLVMValueRef thread_id, tl_tid, trbl_tid;
+		thread_id = ac_get_thread_id(ctx);
+
+		tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
+				      LLVMConstInt(ctx->i32, mask, false), "");
+
+		trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
+					LLVMConstInt(ctx->i32, idx, false), "");
+
 		args[0] = LLVMBuildMul(ctx->builder, tl_tid,
 				       LLVMConstInt(ctx->i32, 4, false), "");
 		args[1] = val;
@@ -867,15 +876,42 @@
 					  AC_FUNC_ATTR_READNONE |
 					  AC_FUNC_ATTR_CONVERGENT);
 	} else {
-		LLVMValueRef store_ptr, load_ptr0, load_ptr1;
+		uint32_t masks[2];
 
-		store_ptr = ac_build_gep0(ctx, lds, thread_id);
-		load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
-		load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
+		switch (mask) {
+		case AC_TID_MASK_TOP_LEFT:
+			masks[0] = 0x8000;
+			if (idx == 1)
+				masks[1] = 0x8055;
+			else
+				masks[1] = 0x80aa;
 
-		LLVMBuildStore(ctx->builder, val, store_ptr);
-		tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
-		trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
+			break;
+		case AC_TID_MASK_TOP:
+			masks[0] = 0x8044;
+			masks[1] = 0x80ee;
+			break;
+		case AC_TID_MASK_LEFT:
+			masks[0] = 0x80a0;
+			masks[1] = 0x80f5;
+			break;
+		}
+
+		args[0] = val;
+		args[1] = LLVMConstInt(ctx->i32, masks[0], false);
+
+		tl = ac_build_intrinsic(ctx,
+					"llvm.amdgcn.ds.swizzle", ctx->i32,
+					args, 2,
+					AC_FUNC_ATTR_READNONE |
+					AC_FUNC_ATTR_CONVERGENT);
+
+		args[1] = LLVMConstInt(ctx->i32, masks[1], false);
+		trbl = ac_build_intrinsic(ctx,
+					"llvm.amdgcn.ds.swizzle", ctx->i32,
+					args, 2,
+					AC_FUNC_ATTR_READNONE |
+					AC_FUNC_ATTR_CONVERGENT);
 	}
 
 	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
@@ -1244,3 +1280,265 @@
                          data_type_name, coords_type_name, rsrc_type_name);
         }
 }
+
+#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
+#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
+
+enum ac_ir_type {
+	AC_IR_UNDEF,
+	AC_IR_CONST,
+	AC_IR_VALUE,
+};
+
+struct ac_vs_exp_chan
+{
+	LLVMValueRef value;
+	float const_float;
+	enum ac_ir_type type;
+};
+
+struct ac_vs_exp_inst {
+	unsigned offset;
+	LLVMValueRef inst;
+	struct ac_vs_exp_chan chan[4];
+};
+
+struct ac_vs_exports {
+	unsigned num;
+	struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
+};
+
+/* Return true if the PARAM export has been eliminated. */
+static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
+				      uint32_t num_outputs,
+				      struct ac_vs_exp_inst *exp)
+{
+	unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
+	bool is_zero[4] = {}, is_one[4] = {};
+
+	for (i = 0; i < 4; i++) {
+		/* It's a constant expression. Undef outputs are eliminated too. */
+		if (exp->chan[i].type == AC_IR_UNDEF) {
+			is_zero[i] = true;
+			is_one[i] = true;
+		} else if (exp->chan[i].type == AC_IR_CONST) {
+			if (exp->chan[i].const_float == 0)
+				is_zero[i] = true;
+			else if (exp->chan[i].const_float == 1)
+				is_one[i] = true;
+			else
+				return false; /* other constant */
+		} else
+			return false;
+	}
+
+	/* Only certain combinations of 0 and 1 can be eliminated. */
+	if (is_zero[0] && is_zero[1] && is_zero[2])
+		default_val = is_zero[3] ? 0 : 1;
+	else if (is_one[0] && is_one[1] && is_one[2])
+		default_val = is_zero[3] ? 2 : 3;
+	else
+		return false;
+
+	/* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
+	LLVMInstructionEraseFromParent(exp->inst);
+
+	/* Change OFFSET to DEFAULT_VAL. */
+	for (i = 0; i < num_outputs; i++) {
+		if (vs_output_param_offset[i] == exp->offset) {
+			vs_output_param_offset[i] =
+				AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
+			break;
+		}
+	}
+	return true;
+}
+
+static bool ac_eliminate_duplicated_output(uint8_t *vs_output_param_offset,
+					   uint32_t num_outputs,
+					   struct ac_vs_exports *processed,
+				           struct ac_vs_exp_inst *exp)
+{
+	unsigned p, copy_back_channels = 0;
+
+	/* See if the output is already in the list of processed outputs.
+	 * The LLVMValueRef comparison relies on SSA.
+	 */
+	for (p = 0; p < processed->num; p++) {
+		bool different = false;
+
+		for (unsigned j = 0; j < 4; j++) {
+			struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
+			struct ac_vs_exp_chan *c2 = &exp->chan[j];
+
+			/* Treat undef as a match. */
+			if (c2->type == AC_IR_UNDEF)
+				continue;
+
+			/* If c1 is undef but c2 isn't, we can copy c2 to c1
+			 * and consider the instruction duplicated.
+			 */
+			if (c1->type == AC_IR_UNDEF) {
+				copy_back_channels |= 1 << j;
+				continue;
+			}
+
+			/* Test whether the channels are not equal. */
+			if (c1->type != c2->type ||
+			    (c1->type == AC_IR_CONST &&
+			     c1->const_float != c2->const_float) ||
+			    (c1->type == AC_IR_VALUE &&
+			     c1->value != c2->value)) {
+				different = true;
+				break;
+			}
+		}
+		if (!different)
+			break;
+
+		copy_back_channels = 0;
+	}
+	if (p == processed->num)
+		return false;
+
+	/* If a match was found, but the matching export has undef where the new
+	 * one has a normal value, copy the normal value to the undef channel.
+	 */
+	struct ac_vs_exp_inst *match = &processed->exp[p];
+
+	while (copy_back_channels) {
+		unsigned chan = u_bit_scan(&copy_back_channels);
+
+		assert(match->chan[chan].type == AC_IR_UNDEF);
+		LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
+			       exp->chan[chan].value);
+		match->chan[chan] = exp->chan[chan];
+	}
+
+	/* The PARAM export is duplicated. Kill it. */
+	LLVMInstructionEraseFromParent(exp->inst);
+
+	/* Change OFFSET to the matching export. */
+	for (unsigned i = 0; i < num_outputs; i++) {
+		if (vs_output_param_offset[i] == exp->offset) {
+			vs_output_param_offset[i] = match->offset;
+			break;
+		}
+	}
+	return true;
+}
+
+void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
+			    LLVMValueRef main_fn,
+			    uint8_t *vs_output_param_offset,
+			    uint32_t num_outputs,
+			    uint8_t *num_param_exports)
+{
+	LLVMBasicBlockRef bb;
+	bool removed_any = false;
+	struct ac_vs_exports exports;
+
+	exports.num = 0;
+
+	/* Process all LLVM instructions. */
+	bb = LLVMGetFirstBasicBlock(main_fn);
+	while (bb) {
+		LLVMValueRef inst = LLVMGetFirstInstruction(bb);
+
+		while (inst) {
+			LLVMValueRef cur = inst;
+			inst = LLVMGetNextInstruction(inst);
+			struct ac_vs_exp_inst exp;
+
+			if (LLVMGetInstructionOpcode(cur) != LLVMCall)
+				continue;
+
+			LLVMValueRef callee = ac_llvm_get_called_value(cur);
+
+			if (!ac_llvm_is_function(callee))
+				continue;
+
+			const char *name = LLVMGetValueName(callee);
+			unsigned num_args = LLVMCountParams(callee);
+
+			/* Check if this is an export instruction. */
+			if ((num_args != 9 && num_args != 8) ||
+			    (strcmp(name, "llvm.SI.export") &&
+			     strcmp(name, "llvm.amdgcn.exp.f32")))
+				continue;
+
+			LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
+			unsigned target = LLVMConstIntGetZExtValue(arg);
+
+			if (target < V_008DFC_SQ_EXP_PARAM)
+				continue;
+
+			target -= V_008DFC_SQ_EXP_PARAM;
+
+			/* Parse the instruction. */
+			memset(&exp, 0, sizeof(exp));
+			exp.offset = target;
+			exp.inst = cur;
+
+			for (unsigned i = 0; i < 4; i++) {
+				LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
+
+				exp.chan[i].value = v;
+
+				if (LLVMIsUndef(v)) {
+					exp.chan[i].type = AC_IR_UNDEF;
+				} else if (LLVMIsAConstantFP(v)) {
+					LLVMBool loses_info;
+					exp.chan[i].type = AC_IR_CONST;
+					exp.chan[i].const_float =
+						LLVMConstRealGetDouble(v, &loses_info);
+				} else {
+					exp.chan[i].type = AC_IR_VALUE;
+				}
+			}
+
+			/* Eliminate constant and duplicated PARAM exports. */
+			if (ac_eliminate_const_output(vs_output_param_offset,
+						      num_outputs, &exp) ||
+			    ac_eliminate_duplicated_output(vs_output_param_offset,
+							   num_outputs, &exports,
+							   &exp)) {
+				removed_any = true;
+			} else {
+				exports.exp[exports.num++] = exp;
+			}
+		}
+		bb = LLVMGetNextBasicBlock(bb);
+	}
+
+	/* Remove holes in export memory due to removed PARAM exports.
+	 * This is done by renumbering all PARAM exports.
+	 */
+	if (removed_any) {
+		uint8_t old_offset[VARYING_SLOT_MAX];
+		unsigned out, i;
+
+		/* Make a copy of the offsets. We need the old version while
+		 * we are modifying some of them. */
+		memcpy(old_offset, vs_output_param_offset,
+		       sizeof(old_offset));
+
+		for (i = 0; i < exports.num; i++) {
+			unsigned offset = exports.exp[i].offset;
+
+			/* Update vs_output_param_offset. Multiple outputs can
+			 * have the same offset.
+			 */
+			for (out = 0; out < num_outputs; out++) {
+				if (old_offset[out] == offset)
+					vs_output_param_offset[out] = i;
+			}
+
+			/* Change the PARAM offset in the instruction. */
+			LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
+				       LLVMConstInt(ctx->i32,
+						    V_008DFC_SQ_EXP_PARAM + i, 0));
+		}
+		*num_param_exports = exports.num;
+	}
+}
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index d6edcde..f4f485d 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -28,6 +28,8 @@
 #include <stdbool.h>
 #include <llvm-c/TargetMachine.h>
 
+#include "amd_family.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -40,11 +42,20 @@
 	LLVMTypeRef voidt;
 	LLVMTypeRef i1;
 	LLVMTypeRef i8;
+	LLVMTypeRef i16;
 	LLVMTypeRef i32;
+	LLVMTypeRef i64;
+	LLVMTypeRef f16;
 	LLVMTypeRef f32;
+	LLVMTypeRef f64;
 	LLVMTypeRef v4i32;
 	LLVMTypeRef v4f32;
-	LLVMTypeRef v16i8;
+	LLVMTypeRef v8i32;
+
+	LLVMValueRef i32_0;
+	LLVMValueRef i32_1;
+	LLVMValueRef f32_0;
+	LLVMValueRef f32_1;
 
 	unsigned range_md_kind;
 	unsigned invariant_load_md_kind;
@@ -52,10 +63,13 @@
 	unsigned fpmath_md_kind;
 	LLVMValueRef fpmath_md_2p5_ulp;
 	LLVMValueRef empty_md;
+
+	enum chip_class chip_class;
 };
 
 void
-ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context);
+ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
+		     enum chip_class chip_class);
 
 LLVMValueRef
 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
@@ -65,6 +79,11 @@
 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize);
 
 LLVMValueRef
+ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
+	     unsigned count_incoming, LLVMValueRef *values,
+	     LLVMBasicBlockRef *blocks);
+
+LLVMValueRef
 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
 				LLVMValueRef *values,
 				unsigned value_count,
@@ -82,7 +101,7 @@
 
 void
 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
-		       bool is_deriv, bool is_array,
+		       bool is_deriv, bool is_array, bool is_lod,
 		       LLVMValueRef *coords_arg,
 		       LLVMValueRef *derivs_arg);
 
@@ -143,13 +162,14 @@
 		     unsigned inst_offset,
 		     unsigned glc,
 		     unsigned slc,
-		     bool readonly_memory);
+		     bool can_speculate,
+		     bool allow_smem);
 
 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
 					 LLVMValueRef rsrc,
 					 LLVMValueRef vindex,
 					 LLVMValueRef voffset,
-					 bool readonly_memory);
+					 bool can_speculate);
 
 LLVMValueRef
 ac_get_thread_id(struct ac_llvm_context *ctx);
@@ -163,7 +183,6 @@
 	      bool has_ds_bpermute,
 	      uint32_t mask,
 	      int idx,
-	      LLVMValueRef lds,
 	      LLVMValueRef val);
 
 #define AC_SENDMSG_GS 2
@@ -239,6 +258,12 @@
 			    LLVMTypeRef coords_type,
 			    LLVMTypeRef rsrc_type,
 			    char *out_name, unsigned out_len);
+
+void ac_optimize_vs_outputs(struct ac_llvm_context *ac,
+			    LLVMValueRef main_fn,
+			    uint8_t *vs_output_param_offset,
+			    uint32_t num_outputs,
+			    uint8_t *num_param_exports);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/common/ac_llvm_helper.cpp b/src/amd/common/ac_llvm_helper.cpp
index d9ea4b1..4db7036 100644
--- a/src/amd/common/ac_llvm_helper.cpp
+++ b/src/amd/common/ac_llvm_helper.cpp
@@ -34,6 +34,7 @@
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
 #include <llvm/IR/Attributes.h>
+#include <llvm/IR/CallSite.h>
 
 #if HAVE_LLVM < 0x0500
 namespace llvm {
@@ -44,9 +45,13 @@
 void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes)
 {
    llvm::Argument *A = llvm::unwrap<llvm::Argument>(val);
+#if HAVE_LLVM < 0x0500
    llvm::AttrBuilder B;
    B.addDereferenceableAttr(bytes);
    A->addAttr(llvm::AttributeList::get(A->getContext(), A->getArgNo() + 1,  B));
+#else
+   A->addAttr(llvm::Attribute::getWithDereferenceableBytes(A->getContext(), bytes));
+#endif
 }
 
 bool ac_is_sgpr_param(LLVMValueRef arg)
@@ -57,3 +62,21 @@
 	return AS.hasAttribute(ArgNo + 1, llvm::Attribute::ByVal) ||
 	       AS.hasAttribute(ArgNo + 1, llvm::Attribute::InReg);
 }
+
+LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call)
+{
+#if HAVE_LLVM >= 0x0309
+	return LLVMGetCalledValue(call);
+#else
+	return llvm::wrap(llvm::CallSite(llvm::unwrap<llvm::Instruction>(call)).getCalledValue());
+#endif
+}
+
+bool ac_llvm_is_function(LLVMValueRef v)
+{
+#if HAVE_LLVM >= 0x0309
+	return LLVMGetValueKind(v) == LLVMFunctionValueKind;
+#else
+	return llvm::isa<llvm::Function>(llvm::unwrap(v));
+#endif
+}
diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
index bc4d9d9..675926e 100644
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -40,21 +40,23 @@
 	LLVMInitializeAMDGPUTargetMC();
 	LLVMInitializeAMDGPUAsmPrinter();
 
-	/*
-	 * Workaround for bug in llvm 4.0 that causes image intrinsics
+	/* For inline assembly. */
+	LLVMInitializeAMDGPUAsmParser();
+
+	/* Workaround for bug in llvm 4.0 that causes image intrinsics
 	 * to disappear.
 	 * https://reviews.llvm.org/D26348
 	 */
-#if HAVE_LLVM >= 0x0400
-	const char *argv[2] = {"mesa", "-simplifycfg-sink-common=false"};
-	LLVMParseCommandLineOptions(2, argv, NULL);
-#endif
-
+	if (HAVE_LLVM >= 0x0400) {
+		/* "mesa" is the prefix for error messages */
+		const char *argv[2] = { "mesa", "-simplifycfg-sink-common=false" };
+		LLVMParseCommandLineOptions(2, argv, NULL);
+	}
 }
 
 static once_flag ac_init_llvm_target_once_flag = ONCE_FLAG_INIT;
 
-static LLVMTargetRef ac_get_llvm_target(const char *triple)
+LLVMTargetRef ac_get_llvm_target(const char *triple)
 {
 	LLVMTargetRef target = NULL;
 	char *err_message = NULL;
@@ -105,34 +107,35 @@
 		return "fiji";
 	case CHIP_STONEY:
 		return "stoney";
-#if HAVE_LLVM == 0x0308
-	case CHIP_POLARIS10:
-		return "tonga";
-	case CHIP_POLARIS11:
-		return "tonga";
-#else
 	case CHIP_POLARIS10:
 		return "polaris10";
 	case CHIP_POLARIS11:
 	case CHIP_POLARIS12:
 		return "polaris11";
-#endif
+	case CHIP_VEGA10:
+	case CHIP_RAVEN:
+		return "gfx900";
 	default:
 		return "";
 	}
 }
 
-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill)
+LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, enum ac_target_machine_options tm_options)
 {
 	assert(family >= CHIP_TAHITI);
-
-	const char *triple = supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--";
+	char features[256];
+	const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? "amdgcn-mesa-mesa3d" : "amdgcn--";
 	LLVMTargetRef target = ac_get_llvm_target(triple);
+
+	snprintf(features, sizeof(features),
+		 "+DumpCode,+vgpr-spilling,-fp32-denormals%s",
+		 tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "");
+	
 	LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
 	                             target,
 	                             triple,
 	                             ac_get_llvm_processor_name(family),
-	                             "+DumpCode,+vgpr-spilling",
+				     features,
 	                             LLVMCodeGenLevelDefault,
 	                             LLVMRelocDefault,
 	                             LLVMCodeModelDefault);
@@ -224,3 +227,13 @@
 	fprintf(stderr, "%s", str);
 	LLVMDisposeMessage(str);
 }
+
+void
+ac_llvm_add_target_dep_function_attr(LLVMValueRef F,
+				     const char *name, int value)
+{
+	char str[16];
+
+	snprintf(str, sizeof(str), "%i", value);
+	LLVMAddTargetDependentFunctionAttr(F, name, str);
+}
diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
index faecf1e..cc4fe3b 100644
--- a/src/amd/common/ac_llvm_util.h
+++ b/src/amd/common/ac_llvm_util.h
@@ -54,8 +54,13 @@
 	AC_FUNC_ATTR_LEGACY       = (1u << 31),
 };
 
-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill);
+enum ac_target_machine_options {
+	AC_TM_SUPPORTS_SPILL = (1 << 0),
+	AC_TM_SISCHED = (1 << 1),
+};
+LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, enum ac_target_machine_options tm_options);
 
+LLVMTargetRef ac_get_llvm_target(const char *triple);
 void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
 bool ac_is_sgpr_param(LLVMValueRef param);
 void ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function,
@@ -64,6 +69,13 @@
 			    unsigned attrib_mask);
 void ac_dump_module(LLVMModuleRef module);
 
+LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call);
+bool ac_llvm_is_function(LLVMValueRef v);
+
+void
+ac_llvm_add_target_dep_function_attr(LLVMValueRef F,
+				     const char *name, int value);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 62b7598..a9c231c 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -30,6 +30,8 @@
 #include "../vulkan/radv_descriptor_set.h"
 #include "util/bitscan.h"
 #include <llvm-c/Transforms/Scalar.h>
+#include "ac_shader_info.h"
+#include "ac_exp_param.h"
 
 enum radeon_llvm_calling_convention {
 	RADEON_LLVM_AMDGPU_VS = 87,
@@ -55,7 +57,7 @@
 	struct ac_llvm_context ac;
 	const struct ac_nir_compiler_options *options;
 	struct ac_shader_variant_info *shader_info;
-
+	unsigned max_workgroup_size;
 	LLVMContextRef context;
 	LLVMModuleRef module;
 	LLVMBuilderRef builder;
@@ -63,6 +65,7 @@
 
 	struct hash_table *defs;
 	struct hash_table *phis;
+	struct hash_table *vars;
 
 	LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
 	LLVMValueRef ring_offsets;
@@ -134,7 +137,6 @@
 	LLVMTypeRef f16;
 	LLVMTypeRef v2f32;
 	LLVMTypeRef v4f32;
-	LLVMTypeRef v16i8;
 	LLVMTypeRef voidt;
 
 	LLVMValueRef i1true;
@@ -153,12 +155,10 @@
 	LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
 	LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS * 4];
 
-	LLVMValueRef shared_memory;
 	uint64_t input_mask;
 	uint64_t output_mask;
 	int num_locals;
 	LLVMValueRef *locals;
-	bool has_ddxy;
 	uint8_t num_output_clips;
 	uint8_t num_output_culls;
 
@@ -174,7 +174,7 @@
 };
 
 static LLVMValueRef get_sampler_desc(struct nir_to_llvm_context *ctx,
-				     nir_deref_var *deref,
+				     const nir_deref_var *deref,
 				     enum desc_type desc_type);
 static unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan)
 {
@@ -250,12 +250,83 @@
 	LLVMSetFunctionCallConv(func, calling_conv);
 }
 
+#define MAX_ARGS 23
+struct arg_info {
+	LLVMTypeRef types[MAX_ARGS];
+	LLVMValueRef *assign[MAX_ARGS];
+	unsigned array_params_mask;
+	uint8_t count;
+	uint8_t user_sgpr_count;
+	uint8_t sgpr_count;
+	uint8_t num_user_sgprs_used;
+	uint8_t num_sgprs_used;
+	uint8_t num_vgprs_used;
+};
+
+static inline void
+add_argument(struct arg_info *info,
+	     LLVMTypeRef type, LLVMValueRef *param_ptr)
+{
+	assert(info->count < MAX_ARGS);
+	info->assign[info->count] = param_ptr;
+	info->types[info->count] = type;
+	info->count++;
+}
+
+static inline void
+add_sgpr_argument(struct arg_info *info,
+		  LLVMTypeRef type, LLVMValueRef *param_ptr)
+{
+	add_argument(info, type, param_ptr);
+	info->num_sgprs_used += llvm_get_type_size(type) / 4;
+	info->sgpr_count++;
+}
+
+static inline void
+add_user_sgpr_argument(struct arg_info *info,
+		       LLVMTypeRef type,
+		       LLVMValueRef *param_ptr)
+{
+	add_sgpr_argument(info, type, param_ptr);
+	info->num_user_sgprs_used += llvm_get_type_size(type) / 4;
+	info->user_sgpr_count++;
+}
+
+static inline void
+add_vgpr_argument(struct arg_info *info,
+		  LLVMTypeRef type,
+		  LLVMValueRef *param_ptr)
+{
+	add_argument(info, type, param_ptr);
+	info->num_vgprs_used += llvm_get_type_size(type) / 4;
+}
+
+static inline void
+add_user_sgpr_array_argument(struct arg_info *info,
+			     LLVMTypeRef type,
+			     LLVMValueRef *param_ptr)
+{
+	info->array_params_mask |= (1 << info->count);
+	add_user_sgpr_argument(info, type, param_ptr);
+}
+
+static void assign_arguments(LLVMValueRef main_function,
+			     struct arg_info *info)
+{
+	unsigned i;
+	for (i = 0; i < info->count; i++) {
+		if (info->assign[i])
+			*info->assign[i] = LLVMGetParam(main_function, i);
+	}
+}
+
 static LLVMValueRef
 create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module,
                      LLVMBuilderRef builder, LLVMTypeRef *return_types,
-                     unsigned num_return_elems, LLVMTypeRef *param_types,
-                     unsigned param_count, unsigned array_params_mask,
-                     unsigned sgpr_params, bool unsafe_math)
+                     unsigned num_return_elems,
+		     struct arg_info *args,
+		     unsigned max_workgroup_size,
+		     bool unsafe_math)
 {
 	LLVMTypeRef main_function_type, ret_type;
 	LLVMBasicBlockRef main_function_body;
@@ -268,7 +339,7 @@
 
 	/* Setup the function */
 	main_function_type =
-	    LLVMFunctionType(ret_type, param_types, param_count, 0);
+	    LLVMFunctionType(ret_type, args->types, args->count, 0);
 	LLVMValueRef main_function =
 	    LLVMAddFunction(module, "main", main_function_type);
 	main_function_body =
@@ -276,8 +347,8 @@
 	LLVMPositionBuilderAtEnd(builder, main_function_body);
 
 	LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS);
-	for (unsigned i = 0; i < sgpr_params; ++i) {
-		if (array_params_mask & (1 << i)) {
+	for (unsigned i = 0; i < args->sgpr_count; ++i) {
+		if (args->array_params_mask & (1 << i)) {
 			LLVMValueRef P = LLVMGetParam(main_function, i);
 			ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_BYVAL);
 			ac_add_attr_dereferenceable(P, UINT64_MAX);
@@ -287,6 +358,11 @@
 		}
 	}
 
+	if (max_workgroup_size) {
+		ac_llvm_add_target_dep_function_attr(main_function,
+						     "amdgpu-max-work-group-size",
+						     max_workgroup_size);
+	}
 	if (unsafe_math) {
 		/* These were copied from some LLVM test. */
 		LLVMAddTargetDependentFunctionAttr(main_function,
@@ -311,24 +387,7 @@
 	                       CONST_ADDR_SPACE);
 }
 
-static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx,
-					  int idx,
-					  LLVMTypeRef type)
-{
-	LLVMValueRef offset;
-	LLVMValueRef ptr;
-	int addr_space;
-
-	offset = LLVMConstInt(ctx->i32, idx * 16, false);
-
-	ptr = ctx->shared_memory;
-	ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
-	addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
-	ptr = LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), "");
-	return ptr;
-}
-
-static LLVMTypeRef to_integer_type_scalar(struct nir_to_llvm_context *ctx, LLVMTypeRef t)
+static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 {
 	if (t == ctx->f16 || t == ctx->i16)
 		return ctx->i16;
@@ -340,7 +399,7 @@
 		unreachable("Unhandled integer size");
 }
 
-static LLVMTypeRef to_integer_type(struct nir_to_llvm_context *ctx, LLVMTypeRef t)
+static LLVMTypeRef to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 {
 	if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 		LLVMTypeRef elem_type = LLVMGetElementType(t);
@@ -350,13 +409,13 @@
 	return to_integer_type_scalar(ctx, t);
 }
 
-static LLVMValueRef to_integer(struct nir_to_llvm_context *ctx, LLVMValueRef v)
+static LLVMValueRef to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
 {
 	LLVMTypeRef type = LLVMTypeOf(v);
 	return LLVMBuildBitCast(ctx->builder, v, to_integer_type(ctx, type), "");
 }
 
-static LLVMTypeRef to_float_type_scalar(struct nir_to_llvm_context *ctx, LLVMTypeRef t)
+static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 {
 	if (t == ctx->i16 || t == ctx->f16)
 		return ctx->f16;
@@ -368,7 +427,7 @@
 		unreachable("Unhandled float size");
 }
 
-static LLVMTypeRef to_float_type(struct nir_to_llvm_context *ctx, LLVMTypeRef t)
+static LLVMTypeRef to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 {
 	if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 		LLVMTypeRef elem_type = LLVMGetElementType(t);
@@ -378,13 +437,13 @@
 	return to_float_type_scalar(ctx, t);
 }
 
-static LLVMValueRef to_float(struct nir_to_llvm_context *ctx, LLVMValueRef v)
+static LLVMValueRef to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
 {
 	LLVMTypeRef type = LLVMTypeOf(v);
 	return LLVMBuildBitCast(ctx->builder, v, to_float_type(ctx, type), "");
 }
 
-static int get_elem_bits(struct nir_to_llvm_context *ctx, LLVMTypeRef type)
+static int get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
 {
 	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
 		type = LLVMGetElementType(type);
@@ -523,21 +582,22 @@
 			    "");
 }
 
-static void set_userdata_location(struct ac_userdata_info *ud_info, uint8_t sgpr_idx, uint8_t num_sgprs)
+static void set_userdata_location(struct ac_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs)
 {
-	ud_info->sgpr_idx = sgpr_idx;
+	ud_info->sgpr_idx = *sgpr_idx;
 	ud_info->num_sgprs = num_sgprs;
 	ud_info->indirect = false;
 	ud_info->indirect_offset = 0;
+	*sgpr_idx += num_sgprs;
 }
 
 static void set_userdata_location_shader(struct nir_to_llvm_context *ctx,
-					 int idx, uint8_t sgpr_idx, uint8_t num_sgprs)
+					 int idx, uint8_t *sgpr_idx, uint8_t num_sgprs)
 {
 	set_userdata_location(&ctx->shader_info->user_sgprs_locs.shader_data[idx], sgpr_idx, num_sgprs);
 }
 
-#if 0
+
 static void set_userdata_location_indirect(struct ac_userdata_info *ud_info, uint8_t sgpr_idx, uint8_t num_sgprs,
 					   uint32_t indirect_offset)
 {
@@ -546,7 +606,6 @@
 	ud_info->indirect = true;
 	ud_info->indirect_offset = indirect_offset;
 }
-#endif
 
 static void declare_tess_lds(struct nir_to_llvm_context *ctx)
 {
@@ -556,314 +615,293 @@
 		"tess_lds");
 }
 
-static void create_function(struct nir_to_llvm_context *ctx)
+struct user_sgpr_info {
+	bool need_ring_offsets;
+	uint8_t sgpr_count;
+	bool indirect_all_descriptor_sets;
+};
+
+static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
+				struct user_sgpr_info *user_sgpr_info)
 {
-	LLVMTypeRef arg_types[23];
-	unsigned arg_idx = 0;
-	unsigned array_params_mask = 0;
-	unsigned sgpr_count = 0, user_sgpr_count;
-	unsigned i;
-	unsigned num_sets = ctx->options->layout ? ctx->options->layout->num_sets : 0;
-	unsigned user_sgpr_idx;
-	bool need_push_constants;
-	bool need_ring_offsets = false;
+	memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info));
 
 	/* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
 	if (ctx->stage == MESA_SHADER_GEOMETRY ||
 	    ctx->stage == MESA_SHADER_VERTEX ||
 	    ctx->stage == MESA_SHADER_TESS_CTRL ||
 	    ctx->stage == MESA_SHADER_TESS_EVAL ||
-	    ctx->stage == MESA_SHADER_FRAGMENT ||
 	    ctx->is_gs_copy_shader)
-		need_ring_offsets = true;
+		user_sgpr_info->need_ring_offsets = true;
 
-	need_push_constants = true;
-	if (!ctx->options->layout)
-		need_push_constants = false;
-	else if (!ctx->options->layout->push_constant_size &&
-		 !ctx->options->layout->dynamic_offset_count)
-		need_push_constants = false;
+	if (ctx->stage == MESA_SHADER_FRAGMENT &&
+	    ctx->shader_info->info.ps.needs_sample_positions)
+		user_sgpr_info->need_ring_offsets = true;
 
-	if (need_ring_offsets && !ctx->options->supports_spill) {
-		arg_types[arg_idx++] = const_array(ctx->v16i8, 16); /* address of rings */
-	}
-
-	/* 1 for each descriptor set */
-	for (unsigned i = 0; i < num_sets; ++i) {
-		if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
-			array_params_mask |= (1 << arg_idx);
-			arg_types[arg_idx++] = const_array(ctx->i8, 1024 * 1024);
-		}
-	}
-
-	if (need_push_constants) {
-		/* 1 for push constants and dynamic descriptors */
-		array_params_mask |= (1 << arg_idx);
-		arg_types[arg_idx++] = const_array(ctx->i8, 1024 * 1024);
+	/* 2 user sgprs will nearly always be allocated for scratch/rings */
+	if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) {
+		user_sgpr_info->sgpr_count += 2;
 	}
 
 	switch (ctx->stage) {
 	case MESA_SHADER_COMPUTE:
-		arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3); /* grid size */
-		user_sgpr_count = arg_idx;
-		arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3);
-		arg_types[arg_idx++] = ctx->i32;
-		sgpr_count = arg_idx;
-
-		arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3);
+		user_sgpr_info->sgpr_count += ctx->shader_info->info.cs.grid_components_used;
+		break;
+	case MESA_SHADER_FRAGMENT:
+		user_sgpr_info->sgpr_count += ctx->shader_info->info.ps.needs_sample_positions;
 		break;
 	case MESA_SHADER_VERTEX:
 		if (!ctx->is_gs_copy_shader) {
-			arg_types[arg_idx++] = const_array(ctx->v16i8, 16); /* vertex buffers */
-			arg_types[arg_idx++] = ctx->i32; // base vertex
-			arg_types[arg_idx++] = ctx->i32; // start instance
-			arg_types[arg_idx++] = ctx->i32; // draw index
+			user_sgpr_info->sgpr_count += ctx->shader_info->info.vs.has_vertex_buffers ? 2 : 0;
+			if (ctx->shader_info->info.vs.needs_draw_id) {
+				user_sgpr_info->sgpr_count += 3;
+			} else {
+				user_sgpr_info->sgpr_count += 2;
+			}
 		}
-		user_sgpr_count = arg_idx;
-		if (ctx->options->key.vs.as_es)
-			arg_types[arg_idx++] = ctx->i32; //es2gs offset
-		else if (ctx->options->key.vs.as_ls) {
-			arg_types[arg_idx++] = ctx->i32; //ls out layout
-			user_sgpr_count++;
+		if (ctx->options->key.vs.as_ls)
+			user_sgpr_info->sgpr_count++;
+		break;
+	case MESA_SHADER_TESS_CTRL:
+		user_sgpr_info->sgpr_count += 4;
+		break;
+	case MESA_SHADER_TESS_EVAL:
+		user_sgpr_info->sgpr_count += 1;
+		break;
+	case MESA_SHADER_GEOMETRY:
+		user_sgpr_info->sgpr_count += 2;
+		break;
+	default:
+		break;
+	}
+
+	if (ctx->shader_info->info.needs_push_constants)
+		user_sgpr_info->sgpr_count += 2;
+
+	uint32_t remaining_sgprs = 16 - user_sgpr_info->sgpr_count;
+	if (remaining_sgprs / 2 < util_bitcount(ctx->shader_info->info.desc_set_used_mask)) {
+		user_sgpr_info->sgpr_count += 2;
+		user_sgpr_info->indirect_all_descriptor_sets = true;
+	} else {
+		user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
+	}
+}
+
+static void create_function(struct nir_to_llvm_context *ctx)
+{
+	unsigned num_sets = ctx->options->layout ? ctx->options->layout->num_sets : 0;
+	uint8_t user_sgpr_idx;
+	struct user_sgpr_info user_sgpr_info;
+	struct arg_info args = {};
+	LLVMValueRef desc_sets;
+
+	allocate_user_sgprs(ctx, &user_sgpr_info);
+	if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) {
+		add_user_sgpr_argument(&args, const_array(ctx->v4i32, 16), &ctx->ring_offsets); /* address of rings */
+	}
+
+	/* 1 for each descriptor set */
+	if (!user_sgpr_info.indirect_all_descriptor_sets) {
+		for (unsigned i = 0; i < num_sets; ++i) {
+			if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
+				add_user_sgpr_array_argument(&args, const_array(ctx->i8, 1024 * 1024), &ctx->descriptor_sets[i]);
+			}
 		}
-		sgpr_count = arg_idx;
-		arg_types[arg_idx++] = ctx->i32; // vertex id
+	} else
+		add_user_sgpr_array_argument(&args, const_array(const_array(ctx->i8, 1024 * 1024), 32), &desc_sets);
+
+	if (ctx->shader_info->info.needs_push_constants) {
+		/* 1 for push constants and dynamic descriptors */
+		add_user_sgpr_array_argument(&args, const_array(ctx->i8, 1024 * 1024), &ctx->push_constants);
+	}
+
+	switch (ctx->stage) {
+	case MESA_SHADER_COMPUTE:
+		if (ctx->shader_info->info.cs.grid_components_used)
+			add_user_sgpr_argument(&args, LLVMVectorType(ctx->i32, ctx->shader_info->info.cs.grid_components_used), &ctx->num_work_groups); /* grid size */
+		add_sgpr_argument(&args, LLVMVectorType(ctx->i32, 3), &ctx->workgroup_ids);
+		add_sgpr_argument(&args, ctx->i32, &ctx->tg_size);
+		add_vgpr_argument(&args, LLVMVectorType(ctx->i32, 3), &ctx->local_invocation_ids);
+		break;
+	case MESA_SHADER_VERTEX:
 		if (!ctx->is_gs_copy_shader) {
-			arg_types[arg_idx++] = ctx->i32; // rel auto id
-			arg_types[arg_idx++] = ctx->i32; // vs prim id
-			arg_types[arg_idx++] = ctx->i32; // instance id
+			if (ctx->shader_info->info.vs.has_vertex_buffers)
+				add_user_sgpr_argument(&args, const_array(ctx->v4i32, 16), &ctx->vertex_buffers); /* vertex buffers */
+			add_user_sgpr_argument(&args, ctx->i32, &ctx->base_vertex); // base vertex
+			add_user_sgpr_argument(&args, ctx->i32, &ctx->start_instance);// start instance
+			if (ctx->shader_info->info.vs.needs_draw_id)
+				add_user_sgpr_argument(&args, ctx->i32, &ctx->draw_index); // draw id
+		}
+		if (ctx->options->key.vs.as_es)
+			add_sgpr_argument(&args, ctx->i32, &ctx->es2gs_offset); // es2gs offset
+		else if (ctx->options->key.vs.as_ls)
+			add_user_sgpr_argument(&args, ctx->i32, &ctx->ls_out_layout); // ls out layout
+		add_vgpr_argument(&args, ctx->i32, &ctx->vertex_id); // vertex id
+		if (!ctx->is_gs_copy_shader) {
+			add_vgpr_argument(&args, ctx->i32, &ctx->rel_auto_id); // rel auto id
+			add_vgpr_argument(&args, ctx->i32, &ctx->vs_prim_id); // vs prim id
+			add_vgpr_argument(&args, ctx->i32, &ctx->instance_id); // instance id
 		}
 		break;
 	case MESA_SHADER_TESS_CTRL:
-		arg_types[arg_idx++] = ctx->i32; // tcs offchip layout
-		arg_types[arg_idx++] = ctx->i32; // tcs out offsets
-		arg_types[arg_idx++] = ctx->i32; // tcs out layout
-		arg_types[arg_idx++] = ctx->i32; // tcs in layout
-		user_sgpr_count = arg_idx;
-		arg_types[arg_idx++] = ctx->i32; // param oc lds
-		arg_types[arg_idx++] = ctx->i32; // tess factor offset
-		sgpr_count = arg_idx;
-		arg_types[arg_idx++] = ctx->i32; // patch id
-		arg_types[arg_idx++] = ctx->i32; // rel ids;
+		add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_offchip_layout); // tcs offchip layout
+		add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_out_offsets); // tcs out offsets
+		add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_out_layout); // tcs out layout
+		add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_in_layout); // tcs in layout
+		add_sgpr_argument(&args, ctx->i32, &ctx->oc_lds); // param oc lds
+		add_sgpr_argument(&args, ctx->i32, &ctx->tess_factor_offset); // tess factor offset
+		add_vgpr_argument(&args, ctx->i32, &ctx->tcs_patch_id); // patch id
+		add_vgpr_argument(&args, ctx->i32, &ctx->tcs_rel_ids); // rel ids
 		break;
 	case MESA_SHADER_TESS_EVAL:
-		arg_types[arg_idx++] = ctx->i32; // tcs offchip layout
-		user_sgpr_count = arg_idx;
+		add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_offchip_layout); // tcs offchip layout
 		if (ctx->options->key.tes.as_es) {
-			arg_types[arg_idx++] = ctx->i32; // OC LDS
-			arg_types[arg_idx++] = ctx->i32; //
-			arg_types[arg_idx++] = ctx->i32; // es2gs offset
+			add_sgpr_argument(&args, ctx->i32, &ctx->oc_lds); // OC LDS
+			add_sgpr_argument(&args, ctx->i32, NULL); //
+			add_sgpr_argument(&args, ctx->i32, &ctx->es2gs_offset); // es2gs offset
 		} else {
-			arg_types[arg_idx++] = ctx->i32; //
-			arg_types[arg_idx++] = ctx->i32; // OC LDS
+			add_sgpr_argument(&args, ctx->i32, NULL); //
+			add_sgpr_argument(&args, ctx->i32, &ctx->oc_lds); // OC LDS
 		}
-		sgpr_count = arg_idx;
-		arg_types[arg_idx++] = ctx->f32; // tes_u
-		arg_types[arg_idx++] = ctx->f32; // tes_v
-		arg_types[arg_idx++] = ctx->i32; // tes rel patch id
-		arg_types[arg_idx++] = ctx->i32; // tes patch id
+		add_vgpr_argument(&args, ctx->f32, &ctx->tes_u); // tes_u
+		add_vgpr_argument(&args, ctx->f32, &ctx->tes_v); // tes_v
+		add_vgpr_argument(&args, ctx->i32, &ctx->tes_rel_patch_id); // tes rel patch id
+		add_vgpr_argument(&args, ctx->i32, &ctx->tes_patch_id); // tes patch id
 		break;
 	case MESA_SHADER_GEOMETRY:
-		arg_types[arg_idx++] = ctx->i32; // gsvs stride
-		arg_types[arg_idx++] = ctx->i32; // gsvs num entires
-		user_sgpr_count = arg_idx;
-		arg_types[arg_idx++] = ctx->i32; // gs2vs offset
-	        arg_types[arg_idx++] = ctx->i32; // wave id
-		sgpr_count = arg_idx;
-		arg_types[arg_idx++] = ctx->i32; // vtx0
-		arg_types[arg_idx++] = ctx->i32; // vtx1
-		arg_types[arg_idx++] = ctx->i32; // prim id
-		arg_types[arg_idx++] = ctx->i32; // vtx2
-		arg_types[arg_idx++] = ctx->i32; // vtx3
-		arg_types[arg_idx++] = ctx->i32; // vtx4
-		arg_types[arg_idx++] = ctx->i32; // vtx5
-		arg_types[arg_idx++] = ctx->i32; // GS instance id
+		add_user_sgpr_argument(&args, ctx->i32, &ctx->gsvs_ring_stride); // gsvs stride
+		add_user_sgpr_argument(&args, ctx->i32, &ctx->gsvs_num_entries); // gsvs num entries
+		add_sgpr_argument(&args, ctx->i32, &ctx->gs2vs_offset); // gs2vs offset
+	        add_sgpr_argument(&args, ctx->i32, &ctx->gs_wave_id); // wave id
+		add_vgpr_argument(&args, ctx->i32, &ctx->gs_vtx_offset[0]); // vtx0
+		add_vgpr_argument(&args, ctx->i32, &ctx->gs_vtx_offset[1]); // vtx1
+		add_vgpr_argument(&args, ctx->i32, &ctx->gs_prim_id); // prim id
+		add_vgpr_argument(&args, ctx->i32, &ctx->gs_vtx_offset[2]);
+		add_vgpr_argument(&args, ctx->i32, &ctx->gs_vtx_offset[3]);
+		add_vgpr_argument(&args, ctx->i32, &ctx->gs_vtx_offset[4]);
+		add_vgpr_argument(&args, ctx->i32, &ctx->gs_vtx_offset[5]);
+		add_vgpr_argument(&args, ctx->i32, &ctx->gs_invocation_id);
 		break;
 	case MESA_SHADER_FRAGMENT:
-		arg_types[arg_idx++] = ctx->i32; /* sample position offset */
-		user_sgpr_count = arg_idx;
-		arg_types[arg_idx++] = ctx->i32; /* prim mask */
-		sgpr_count = arg_idx;
-		arg_types[arg_idx++] = ctx->v2i32; /* persp sample */
-		arg_types[arg_idx++] = ctx->v2i32; /* persp center */
-		arg_types[arg_idx++] = ctx->v2i32; /* persp centroid */
-		arg_types[arg_idx++] = ctx->v3i32; /* persp pull model */
-		arg_types[arg_idx++] = ctx->v2i32; /* linear sample */
-		arg_types[arg_idx++] = ctx->v2i32; /* linear center */
-		arg_types[arg_idx++] = ctx->v2i32; /* linear centroid */
-		arg_types[arg_idx++] = ctx->f32;  /* line stipple tex */
-		arg_types[arg_idx++] = ctx->f32;  /* pos x float */
-		arg_types[arg_idx++] = ctx->f32;  /* pos y float */
-		arg_types[arg_idx++] = ctx->f32;  /* pos z float */
-		arg_types[arg_idx++] = ctx->f32;  /* pos w float */
-		arg_types[arg_idx++] = ctx->i32;  /* front face */
-		arg_types[arg_idx++] = ctx->i32;  /* ancillary */
-		arg_types[arg_idx++] = ctx->i32;  /* sample coverage */
-		arg_types[arg_idx++] = ctx->i32;  /* fixed pt */
+		if (ctx->shader_info->info.ps.needs_sample_positions)
+			add_user_sgpr_argument(&args, ctx->i32, &ctx->sample_pos_offset); /* sample position offset */
+		add_sgpr_argument(&args, ctx->i32, &ctx->prim_mask); /* prim mask */
+		add_vgpr_argument(&args, ctx->v2i32, &ctx->persp_sample); /* persp sample */
+		add_vgpr_argument(&args, ctx->v2i32, &ctx->persp_center); /* persp center */
+		add_vgpr_argument(&args, ctx->v2i32, &ctx->persp_centroid); /* persp centroid */
+		add_vgpr_argument(&args, ctx->v3i32, NULL); /* persp pull model */
+		add_vgpr_argument(&args, ctx->v2i32, &ctx->linear_sample); /* linear sample */
+		add_vgpr_argument(&args, ctx->v2i32, &ctx->linear_center); /* linear center */
+		add_vgpr_argument(&args, ctx->v2i32, &ctx->linear_centroid); /* linear centroid */
+		add_vgpr_argument(&args, ctx->f32, NULL);  /* line stipple tex */
+		add_vgpr_argument(&args, ctx->f32, &ctx->frag_pos[0]);  /* pos x float */
+		add_vgpr_argument(&args, ctx->f32, &ctx->frag_pos[1]);  /* pos y float */
+		add_vgpr_argument(&args, ctx->f32, &ctx->frag_pos[2]);  /* pos z float */
+		add_vgpr_argument(&args, ctx->f32, &ctx->frag_pos[3]);  /* pos w float */
+		add_vgpr_argument(&args, ctx->i32, &ctx->front_face);  /* front face */
+		add_vgpr_argument(&args, ctx->i32, &ctx->ancillary);  /* ancillary */
+		add_vgpr_argument(&args, ctx->i32, &ctx->sample_coverage);  /* sample coverage */
+		add_vgpr_argument(&args, ctx->i32, NULL);  /* fixed pt */
 		break;
 	default:
 		unreachable("Shader stage not implemented");
 	}
 
 	ctx->main_function = create_llvm_function(
-	    ctx->context, ctx->module, ctx->builder, NULL, 0, arg_types,
-	    arg_idx, array_params_mask, sgpr_count, ctx->options->unsafe_math);
+	    ctx->context, ctx->module, ctx->builder, NULL, 0, &args,
+	    ctx->max_workgroup_size,
+	    ctx->options->unsafe_math);
 	set_llvm_calling_convention(ctx->main_function, ctx->stage);
 
-	ctx->shader_info->num_input_sgprs = 0;
+
 	ctx->shader_info->num_input_vgprs = 0;
+	ctx->shader_info->num_input_sgprs = ctx->shader_info->num_user_sgprs =
+	  ctx->options->supports_spill ? 2 : 0;
 
-	ctx->shader_info->num_user_sgprs = ctx->options->supports_spill ? 2 : 0;
-	for (i = 0; i < user_sgpr_count; i++)
-		ctx->shader_info->num_user_sgprs += llvm_get_type_size(arg_types[i]) / 4;
-
-	ctx->shader_info->num_input_sgprs = ctx->shader_info->num_user_sgprs;
-	for (; i < sgpr_count; i++)
-		ctx->shader_info->num_input_sgprs += llvm_get_type_size(arg_types[i]) / 4;
+	ctx->shader_info->num_user_sgprs += args.num_user_sgprs_used;
+	ctx->shader_info->num_input_sgprs += args.num_sgprs_used;
 
 	if (ctx->stage != MESA_SHADER_FRAGMENT)
-		for (; i < arg_idx; ++i)
-			ctx->shader_info->num_input_vgprs += llvm_get_type_size(arg_types[i]) / 4;
+		ctx->shader_info->num_input_vgprs = args.num_vgprs_used;
 
-	arg_idx = 0;
+	assign_arguments(ctx->main_function, &args);
+
 	user_sgpr_idx = 0;
 
-	if (ctx->options->supports_spill || need_ring_offsets) {
-		set_userdata_location_shader(ctx, AC_UD_SCRATCH_RING_OFFSETS, user_sgpr_idx, 2);
-		user_sgpr_idx += 2;
+	if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) {
+		set_userdata_location_shader(ctx, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_idx, 2);
 		if (ctx->options->supports_spill) {
 			ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
 							       LLVMPointerType(ctx->i8, CONST_ADDR_SPACE),
 							       NULL, 0, AC_FUNC_ATTR_READNONE);
 			ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
-							     const_array(ctx->v16i8, 16), "");
-		} else
-			ctx->ring_offsets = LLVMGetParam(ctx->main_function, arg_idx++);
+							     const_array(ctx->v4i32, 16), "");
+		}
 	}
 
-	for (unsigned i = 0; i < num_sets; ++i) {
-		if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
-			set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], user_sgpr_idx, 2);
-			user_sgpr_idx += 2;
-			ctx->descriptor_sets[i] =
-				LLVMGetParam(ctx->main_function, arg_idx++);
-		} else
-			ctx->descriptor_sets[i] = NULL;
+	if (!user_sgpr_info.indirect_all_descriptor_sets) {
+		for (unsigned i = 0; i < num_sets; ++i) {
+			if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
+				set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], &user_sgpr_idx, 2);
+			} else
+				ctx->descriptor_sets[i] = NULL;
+		}
+	} else {
+		uint32_t desc_sgpr_idx = user_sgpr_idx;
+		set_userdata_location_shader(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, &user_sgpr_idx, 2);
+
+		for (unsigned i = 0; i < num_sets; ++i) {
+			if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
+				set_userdata_location_indirect(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], desc_sgpr_idx, 2, i * 8);
+				ctx->descriptor_sets[i] = ac_build_indexed_load_const(&ctx->ac, desc_sets, LLVMConstInt(ctx->i32, i, false));
+
+			} else
+				ctx->descriptor_sets[i] = NULL;
+		}
+		ctx->shader_info->need_indirect_descriptor_sets = true;
 	}
 
-	if (need_push_constants) {
-		ctx->push_constants = LLVMGetParam(ctx->main_function, arg_idx++);
-		set_userdata_location_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
-		user_sgpr_idx += 2;
+	if (ctx->shader_info->info.needs_push_constants) {
+		set_userdata_location_shader(ctx, AC_UD_PUSH_CONSTANTS, &user_sgpr_idx, 2);
 	}
 
 	switch (ctx->stage) {
 	case MESA_SHADER_COMPUTE:
-		set_userdata_location_shader(ctx, AC_UD_CS_GRID_SIZE, user_sgpr_idx, 3);
-		user_sgpr_idx += 3;
-		ctx->num_work_groups =
-		    LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->workgroup_ids =
-		    LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tg_size =
-		    LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->local_invocation_ids =
-		    LLVMGetParam(ctx->main_function, arg_idx++);
+		if (ctx->shader_info->info.cs.grid_components_used) {
+			set_userdata_location_shader(ctx, AC_UD_CS_GRID_SIZE, &user_sgpr_idx, ctx->shader_info->info.cs.grid_components_used);
+		}
 		break;
 	case MESA_SHADER_VERTEX:
 		if (!ctx->is_gs_copy_shader) {
-			set_userdata_location_shader(ctx, AC_UD_VS_VERTEX_BUFFERS, user_sgpr_idx, 2);
-			user_sgpr_idx += 2;
-			ctx->vertex_buffers = LLVMGetParam(ctx->main_function, arg_idx++);
-			set_userdata_location_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, user_sgpr_idx, 3);
-			user_sgpr_idx += 3;
-			ctx->base_vertex = LLVMGetParam(ctx->main_function, arg_idx++);
-			ctx->start_instance = LLVMGetParam(ctx->main_function, arg_idx++);
-			ctx->draw_index = LLVMGetParam(ctx->main_function, arg_idx++);
+			if (ctx->shader_info->info.vs.has_vertex_buffers) {
+				set_userdata_location_shader(ctx, AC_UD_VS_VERTEX_BUFFERS, &user_sgpr_idx, 2);
+			}
+			unsigned vs_num = 2;
+			if (ctx->shader_info->info.vs.needs_draw_id)
+				vs_num++;
+
+			set_userdata_location_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_idx, vs_num);
 		}
-		if (ctx->options->key.vs.as_es)
-			ctx->es2gs_offset = LLVMGetParam(ctx->main_function, arg_idx++);
-		else if (ctx->options->key.vs.as_ls) {
-			set_userdata_location_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT, user_sgpr_idx, 1);
-			user_sgpr_idx += 1;
-			ctx->ls_out_layout = LLVMGetParam(ctx->main_function, arg_idx++);
-		}
-		ctx->vertex_id = LLVMGetParam(ctx->main_function, arg_idx++);
-		if (!ctx->is_gs_copy_shader) {
-			ctx->rel_auto_id = LLVMGetParam(ctx->main_function, arg_idx++);
-			ctx->vs_prim_id = LLVMGetParam(ctx->main_function, arg_idx++);
-			ctx->instance_id = LLVMGetParam(ctx->main_function, arg_idx++);
+		if (ctx->options->key.vs.as_ls) {
+			set_userdata_location_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT, &user_sgpr_idx, 1);
 		}
 		if (ctx->options->key.vs.as_ls)
 			declare_tess_lds(ctx);
 		break;
 	case MESA_SHADER_TESS_CTRL:
-		set_userdata_location_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, user_sgpr_idx, 4);
-		user_sgpr_idx += 4;
-		ctx->tcs_offchip_layout = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tcs_out_offsets = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tcs_out_layout = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tcs_in_layout = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->oc_lds = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tess_factor_offset = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tcs_patch_id = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tcs_rel_ids = LLVMGetParam(ctx->main_function, arg_idx++);
-
+		set_userdata_location_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, &user_sgpr_idx, 4);
 		declare_tess_lds(ctx);
 		break;
 	case MESA_SHADER_TESS_EVAL:
-		set_userdata_location_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, user_sgpr_idx, 1);
-		user_sgpr_idx += 1;
-		ctx->tcs_offchip_layout = LLVMGetParam(ctx->main_function, arg_idx++);
-		if (ctx->options->key.tes.as_es) {
-			ctx->oc_lds = LLVMGetParam(ctx->main_function, arg_idx++);
-			arg_idx++;
-			ctx->es2gs_offset = LLVMGetParam(ctx->main_function, arg_idx++);
-		} else {
-			arg_idx++;
-			ctx->oc_lds = LLVMGetParam(ctx->main_function, arg_idx++);
-		}
-		ctx->tes_u = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tes_v = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tes_rel_patch_id = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->tes_patch_id = LLVMGetParam(ctx->main_function, arg_idx++);
+		set_userdata_location_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, &user_sgpr_idx, 1);
 		break;
 	case MESA_SHADER_GEOMETRY:
-		set_userdata_location_shader(ctx, AC_UD_GS_VS_RING_STRIDE_ENTRIES, user_sgpr_idx, 2);
-		user_sgpr_idx += 2;
-		ctx->gsvs_ring_stride = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gsvs_num_entries = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs2vs_offset = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs_wave_id = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs_vtx_offset[0] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs_vtx_offset[1] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs_prim_id = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs_vtx_offset[2] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs_vtx_offset[3] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs_vtx_offset[4] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs_vtx_offset[5] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->gs_invocation_id = LLVMGetParam(ctx->main_function, arg_idx++);
+		set_userdata_location_shader(ctx, AC_UD_GS_VS_RING_STRIDE_ENTRIES, &user_sgpr_idx, 2);
 		break;
 	case MESA_SHADER_FRAGMENT:
-		set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET, user_sgpr_idx, 1);
-		user_sgpr_idx += 1;
-		ctx->sample_pos_offset = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->prim_mask = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->persp_sample = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->persp_center = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->persp_centroid = LLVMGetParam(ctx->main_function, arg_idx++);
-		arg_idx++;
-		ctx->linear_sample = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->linear_center = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->linear_centroid = LLVMGetParam(ctx->main_function, arg_idx++);
-		arg_idx++; /* line stipple */
-		ctx->frag_pos[0] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->frag_pos[1] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->frag_pos[2] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->frag_pos[3] = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->front_face = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->ancillary = LLVMGetParam(ctx->main_function, arg_idx++);
-		ctx->sample_coverage = LLVMGetParam(ctx->main_function, arg_idx++);
+		if (ctx->shader_info->info.ps.needs_sample_positions) {
+			set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET, &user_sgpr_idx, 1);
+		}
 		break;
 	default:
 		unreachable("Shader stage not implemented");
@@ -889,7 +927,6 @@
 	ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
 	ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
 	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
-	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
 
 	ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
 	ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
@@ -977,7 +1014,7 @@
 }
 
 static LLVMTypeRef get_def_type(struct nir_to_llvm_context *ctx,
-                                nir_ssa_def *def)
+                                const nir_ssa_def *def)
 {
 	LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, def->bit_size);
 	if (def->num_components > 1) {
@@ -995,7 +1032,7 @@
 
 
 static LLVMBasicBlockRef get_block(struct nir_to_llvm_context *ctx,
-                                   struct nir_block *b)
+                                   const struct nir_block *b)
 {
 	struct hash_entry *entry = _mesa_hash_table_search(ctx->defs, b);
 	return (LLVMBasicBlockRef)entry->data;
@@ -1044,7 +1081,7 @@
 	return value;
 }
 
-static LLVMValueRef emit_int_cmp(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx,
                                  LLVMIntPredicate pred, LLVMValueRef src0,
                                  LLVMValueRef src1)
 {
@@ -1054,7 +1091,7 @@
 	                       LLVMConstInt(ctx->i32, 0, false), "");
 }
 
-static LLVMValueRef emit_float_cmp(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx,
                                    LLVMRealPredicate pred, LLVMValueRef src0,
                                    LLVMValueRef src1)
 {
@@ -1067,7 +1104,7 @@
 	                       LLVMConstInt(ctx->i32, 0, false), "");
 }
 
-static LLVMValueRef emit_intrin_1f_param(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx,
 					 const char *intrin,
 					 LLVMTypeRef result_type,
 					 LLVMValueRef src0)
@@ -1077,11 +1114,13 @@
 		to_float(ctx, src0),
 	};
 
-	sprintf(name, "%s.f%d", intrin, get_elem_bits(ctx, result_type));
-	return ac_build_intrinsic(&ctx->ac, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
+	MAYBE_UNUSED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
+						 get_elem_bits(ctx, result_type));
+	assert(length < sizeof(name));
+	return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
 }
 
-static LLVMValueRef emit_intrin_2f_param(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx,
 				       const char *intrin,
 				       LLVMTypeRef result_type,
 				       LLVMValueRef src0, LLVMValueRef src1)
@@ -1092,11 +1131,13 @@
 		to_float(ctx, src1),
 	};
 
-	sprintf(name, "%s.f%d", intrin, get_elem_bits(ctx, result_type));
-	return ac_build_intrinsic(&ctx->ac, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
+	MAYBE_UNUSED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
+						 get_elem_bits(ctx, result_type));
+	assert(length < sizeof(name));
+	return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
 }
 
-static LLVMValueRef emit_intrin_3f_param(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx,
 					 const char *intrin,
 					 LLVMTypeRef result_type,
 					 LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
@@ -1108,19 +1149,21 @@
 		to_float(ctx, src2),
 	};
 
-	sprintf(name, "%s.f%d", intrin, get_elem_bits(ctx, result_type));
-	return ac_build_intrinsic(&ctx->ac, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
+	MAYBE_UNUSED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
+						 get_elem_bits(ctx, result_type));
+	assert(length < sizeof(name));
+	return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
 }
 
-static LLVMValueRef emit_bcsel(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx,
 			       LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
 {
 	LLVMValueRef v = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0,
-				       ctx->i32zero, "");
+				       ctx->i32_0, "");
 	return LLVMBuildSelect(ctx->builder, v, src1, src2, "");
 }
 
-static LLVMValueRef emit_find_lsb(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_find_lsb(struct ac_llvm_context *ctx,
 				  LLVMValueRef src0)
 {
 	LLVMValueRef params[2] = {
@@ -1133,24 +1176,34 @@
 		 *
 		 * The hardware already implements the correct behavior.
 		 */
-		LLVMConstInt(ctx->i32, 1, false),
+		LLVMConstInt(ctx->i1, 1, false),
 	};
-	return ac_build_intrinsic(&ctx->ac, "llvm.cttz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);
+
+	LLVMValueRef lsb = ac_build_intrinsic(ctx, "llvm.cttz.i32", ctx->i32,
+					      params, 2,
+					      AC_FUNC_ATTR_READNONE);
+
+	/* TODO: We need an intrinsic to skip this conditional. */
+	/* Check for zero: */
+	return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
+							   LLVMIntEQ, src0,
+							   ctx->i32_0, ""),
+			       LLVMConstInt(ctx->i32, -1, 0), lsb, "");
 }
 
-static LLVMValueRef emit_ifind_msb(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_ifind_msb(struct ac_llvm_context *ctx,
 				   LLVMValueRef src0)
 {
-	return ac_build_imsb(&ctx->ac, src0, ctx->i32);
+	return ac_build_imsb(ctx, src0, ctx->i32);
 }
 
-static LLVMValueRef emit_ufind_msb(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_ufind_msb(struct ac_llvm_context *ctx,
 				   LLVMValueRef src0)
 {
-	return ac_build_umsb(&ctx->ac, src0, ctx->i32);
+	return ac_build_umsb(ctx, src0, ctx->i32);
 }
 
-static LLVMValueRef emit_minmax_int(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_minmax_int(struct ac_llvm_context *ctx,
 				    LLVMIntPredicate pred,
 				    LLVMValueRef src0, LLVMValueRef src1)
 {
@@ -1160,38 +1213,38 @@
 			       src1, "");
 
 }
-static LLVMValueRef emit_iabs(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx,
 			      LLVMValueRef src0)
 {
 	return emit_minmax_int(ctx, LLVMIntSGT, src0,
 			       LLVMBuildNeg(ctx->builder, src0, ""));
 }
 
-static LLVMValueRef emit_fsign(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_fsign(struct ac_llvm_context *ctx,
 			       LLVMValueRef src0)
 {
 	LLVMValueRef cmp, val;
 
-	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, ctx->f32zero, "");
-	val = LLVMBuildSelect(ctx->builder, cmp, ctx->f32one, src0, "");
-	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, ctx->f32zero, "");
+	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, ctx->f32_0, "");
+	val = LLVMBuildSelect(ctx->builder, cmp, ctx->f32_1, src0, "");
+	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, ctx->f32_0, "");
 	val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(ctx->f32, -1.0), "");
 	return val;
 }
 
-static LLVMValueRef emit_isign(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_isign(struct ac_llvm_context *ctx,
 			       LLVMValueRef src0)
 {
 	LLVMValueRef cmp, val;
 
-	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, ctx->i32zero, "");
-	val = LLVMBuildSelect(ctx->builder, cmp, ctx->i32one, src0, "");
-	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, ctx->i32zero, "");
+	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, ctx->i32_0, "");
+	val = LLVMBuildSelect(ctx->builder, cmp, ctx->i32_1, src0, "");
+	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, ctx->i32_0, "");
 	val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(ctx->i32, -1, true), "");
 	return val;
 }
 
-static LLVMValueRef emit_ffract(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_ffract(struct ac_llvm_context *ctx,
 				LLVMValueRef src0)
 {
 	const char *intr = "llvm.floor.f32";
@@ -1199,13 +1252,13 @@
 	LLVMValueRef params[] = {
 		fsrc0,
 	};
-	LLVMValueRef floor = ac_build_intrinsic(&ctx->ac, intr,
+	LLVMValueRef floor = ac_build_intrinsic(ctx, intr,
 						ctx->f32, params, 1,
 						AC_FUNC_ATTR_READNONE);
 	return LLVMBuildFSub(ctx->builder, fsrc0, floor, "");
 }
 
-static LLVMValueRef emit_uint_carry(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx,
 				    const char *intrin,
 				    LLVMValueRef src0, LLVMValueRef src1)
 {
@@ -1216,7 +1269,7 @@
 	ret_type = LLVMStructTypeInContext(ctx->context, types,
 					   2, true);
 
-	res = ac_build_intrinsic(&ctx->ac, intrin, ret_type,
+	res = ac_build_intrinsic(ctx, intrin, ret_type,
 				 params, 2, AC_FUNC_ATTR_READNONE);
 
 	res = LLVMBuildExtractValue(ctx->builder, res, 1, "");
@@ -1224,22 +1277,44 @@
 	return res;
 }
 
-static LLVMValueRef emit_b2f(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx,
 			     LLVMValueRef src0)
 {
 	return LLVMBuildAnd(ctx->builder, src0, LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), "");
 }
 
+/* ~0 (sign-extended i1) if to_float(src0) is unordered or != 0.0, else 0. */
+static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, LLVMValueRef src0)
+{
+	LLVMValueRef as_float = to_float(ctx, src0);
+	LLVMValueRef nonzero = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
+					     as_float, ctx->f32_0, "");
+	return LLVMBuildSExt(ctx->builder, nonzero, ctx->i32, "");
+}
+
+/* Masks src0 with the i32 constant 1, keeping only its lowest bit. */
+static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, LLVMValueRef src0)
+{
+	return LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, "");
+}
+
+/* ~0 (sign-extended i1) if src0 != 0 as an integer compare, else 0. */
+static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, LLVMValueRef src0)
+{
+	LLVMValueRef nonzero = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0,
+					     ctx->i32_0, "");
+	return LLVMBuildSExt(ctx->builder, nonzero, ctx->i32, "");
+}
+
 static LLVMValueRef emit_f2f16(struct nir_to_llvm_context *ctx,
 			       LLVMValueRef src0)
 {
 	LLVMValueRef result;
 	LLVMValueRef cond;
 
-	src0 = to_float(ctx, src0);
+	src0 = to_float(&ctx->ac, src0);
 	result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, "");
 
-	/* TODO SI/CIK options here */
 	if (ctx->options->chip_class >= VI) {
 		LLVMValueRef args[2];
 		/* Check if the result is a denormal - and flush to 0 if so. */
@@ -1253,11 +1328,26 @@
 
 	if (ctx->options->chip_class >= VI)
 		result = LLVMBuildSelect(ctx->builder, cond, ctx->f32zero, result, "");
-
+	else {
+		/* for SI/CIK */
+		/* 0x38800000 is the smallest normal half-float value (2^-14) as a
+		 * 32-bit float, so flush the result to 0 if its magnitude is smaller.
+		 */
+		LLVMValueRef temp, cond2;
+		temp = emit_intrin_1f_param(&ctx->ac, "llvm.fabs",
+					    ctx->f32, result);
+		cond = LLVMBuildFCmp(ctx->builder, LLVMRealUGT,
+				     LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""),
+				     temp, "");
+		cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
+				      temp, ctx->f32zero, "");
+		cond = LLVMBuildAnd(ctx->builder, cond, cond2, "");
+		result = LLVMBuildSelect(ctx->builder, cond, ctx->f32zero, result, "");
+	}
 	return result;
 }
 
-static LLVMValueRef emit_umul_high(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx,
 				   LLVMValueRef src0, LLVMValueRef src1)
 {
 	LLVMValueRef dst64, result;
@@ -1270,7 +1360,7 @@
 	return result;
 }
 
-static LLVMValueRef emit_imul_high(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx,
 				   LLVMValueRef src0, LLVMValueRef src1)
 {
 	LLVMValueRef dst64, result;
@@ -1283,19 +1373,19 @@
 	return result;
 }
 
-static LLVMValueRef emit_bitfield_extract(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_bitfield_extract(struct ac_llvm_context *ctx,
 					  bool is_signed,
-					  LLVMValueRef srcs[3])
+					  const LLVMValueRef srcs[3])
 {
 	LLVMValueRef result;
 	LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, srcs[2], LLVMConstInt(ctx->i32, 32, false), "");
 
-	result = ac_build_bfe(&ctx->ac, srcs[0], srcs[1], srcs[2], is_signed);
+	result = ac_build_bfe(ctx, srcs[0], srcs[1], srcs[2], is_signed);
 	result = LLVMBuildSelect(ctx->builder, icond, srcs[0], result, "");
 	return result;
 }
 
-static LLVMValueRef emit_bitfield_insert(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_bitfield_insert(struct ac_llvm_context *ctx,
 					 LLVMValueRef src0, LLVMValueRef src1,
 					 LLVMValueRef src2, LLVMValueRef src3)
 {
@@ -1304,9 +1394,9 @@
 	bfi_args[0] = LLVMBuildShl(ctx->builder,
 				   LLVMBuildSub(ctx->builder,
 						LLVMBuildShl(ctx->builder,
-							     ctx->i32one,
+							     ctx->i32_1,
 							     src3, ""),
-						ctx->i32one, ""),
+						ctx->i32_1, ""),
 				   src2, "");
 	bfi_args[1] = LLVMBuildShl(ctx->builder, src1, src2, "");
 	bfi_args[2] = src0;
@@ -1325,7 +1415,7 @@
 	return result;
 }
 
-static LLVMValueRef emit_pack_half_2x16(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_pack_half_2x16(struct ac_llvm_context *ctx,
 					LLVMValueRef src0)
 {
 	LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false);
@@ -1333,8 +1423,8 @@
 	LLVMValueRef comp[2];
 
 	src0 = to_float(ctx, src0);
-	comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32zero, "");
-	comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32one, "");
+	comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, "");
+	comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, "");
 	for (i = 0; i < 2; i++) {
 		comp[i] = LLVMBuildFPTrunc(ctx->builder, comp[i], ctx->f16, "");
 		comp[i] = LLVMBuildBitCast(ctx->builder, comp[i], ctx->i16, "");
@@ -1347,7 +1437,7 @@
 	return comp[0];
 }
 
-static LLVMValueRef emit_unpack_half_2x16(struct nir_to_llvm_context *ctx,
+static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx,
 					  LLVMValueRef src0)
 {
 	LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false);
@@ -1361,10 +1451,11 @@
 		temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, "");
 	}
 
-	result = LLVMBuildInsertElement(ctx->builder, LLVMGetUndef(ctx->v2f32), temps[0],
-					ctx->i32zero, "");
+	LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
+	result = LLVMBuildInsertElement(ctx->builder, LLVMGetUndef(v2f32), temps[0],
+					ctx->i32_0, "");
 	result = LLVMBuildInsertElement(ctx->builder, result, temps[1],
-					ctx->i32one, "");
+					ctx->i32_1, "");
 	return result;
 }
 
@@ -1375,12 +1466,6 @@
 	unsigned mask;
 	int idx;
 	LLVMValueRef result;
-	ctx->has_ddxy = true;
-
-	if (!ctx->lds && !ctx->has_ds_bpermute)
-		ctx->lds = LLVMAddGlobalInAddressSpace(ctx->module,
-						       LLVMArrayType(ctx->i32, 64),
-						       "ddxy_lds", LOCAL_ADDR_SPACE);
 
 	if (op == nir_op_fddx_fine || op == nir_op_fddx)
 		mask = AC_TID_MASK_LEFT;
@@ -1398,7 +1483,7 @@
 		idx = 2;
 
 	result = ac_build_ddxy(&ctx->ac, ctx->has_ds_bpermute,
-			      mask, idx, ctx->lds,
+			      mask, idx,
 			      src0);
 	return result;
 }
@@ -1424,7 +1509,7 @@
 	return ac_build_gather_values(&ctx->ac, result, 4);
 }
 
-static void visit_alu(struct nir_to_llvm_context *ctx, nir_alu_instr *instr)
+static void visit_alu(struct nir_to_llvm_context *ctx, const nir_alu_instr *instr)
 {
 	LLVMValueRef src[4], result = NULL;
 	unsigned num_components = instr->dest.dest.ssa.num_components;
@@ -1457,7 +1542,7 @@
 		result = src[0];
 		break;
 	case nir_op_fneg:
-	        src[0] = to_float(ctx, src[0]);
+	        src[0] = to_float(&ctx->ac, src[0]);
 		result = LLVMBuildFNeg(ctx->builder, src[0], "");
 		break;
 	case nir_op_ineg:
@@ -1470,13 +1555,13 @@
 		result = LLVMBuildAdd(ctx->builder, src[0], src[1], "");
 		break;
 	case nir_op_fadd:
-		src[0] = to_float(ctx, src[0]);
-		src[1] = to_float(ctx, src[1]);
+		src[0] = to_float(&ctx->ac, src[0]);
+		src[1] = to_float(&ctx->ac, src[1]);
 		result = LLVMBuildFAdd(ctx->builder, src[0], src[1], "");
 		break;
 	case nir_op_fsub:
-		src[0] = to_float(ctx, src[0]);
-		src[1] = to_float(ctx, src[1]);
+		src[0] = to_float(&ctx->ac, src[0]);
+		src[1] = to_float(&ctx->ac, src[1]);
 		result = LLVMBuildFSub(ctx->builder, src[0], src[1], "");
 		break;
 	case nir_op_isub:
@@ -1492,17 +1577,17 @@
 		result = LLVMBuildURem(ctx->builder, src[0], src[1], "");
 		break;
 	case nir_op_fmod:
-		src[0] = to_float(ctx, src[0]);
-		src[1] = to_float(ctx, src[1]);
+		src[0] = to_float(&ctx->ac, src[0]);
+		src[1] = to_float(&ctx->ac, src[1]);
 		result = ac_build_fdiv(&ctx->ac, src[0], src[1]);
-		result = emit_intrin_1f_param(ctx, "llvm.floor",
-		                              to_float_type(ctx, def_type), result);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.floor",
+		                              to_float_type(&ctx->ac, def_type), result);
 		result = LLVMBuildFMul(ctx->builder, src[1] , result, "");
 		result = LLVMBuildFSub(ctx->builder, src[0], result, "");
 		break;
 	case nir_op_frem:
-		src[0] = to_float(ctx, src[0]);
-		src[1] = to_float(ctx, src[1]);
+		src[0] = to_float(&ctx->ac, src[0]);
+		src[1] = to_float(&ctx->ac, src[1]);
 		result = LLVMBuildFRem(ctx->builder, src[0], src[1], "");
 		break;
 	case nir_op_irem:
@@ -1515,17 +1600,17 @@
 		result = LLVMBuildUDiv(ctx->builder, src[0], src[1], "");
 		break;
 	case nir_op_fmul:
-		src[0] = to_float(ctx, src[0]);
-		src[1] = to_float(ctx, src[1]);
+		src[0] = to_float(&ctx->ac, src[0]);
+		src[1] = to_float(&ctx->ac, src[1]);
 		result = LLVMBuildFMul(ctx->builder, src[0], src[1], "");
 		break;
 	case nir_op_fdiv:
-		src[0] = to_float(ctx, src[0]);
-		src[1] = to_float(ctx, src[1]);
+		src[0] = to_float(&ctx->ac, src[0]);
+		src[1] = to_float(&ctx->ac, src[1]);
 		result = ac_build_fdiv(&ctx->ac, src[0], src[1]);
 		break;
 	case nir_op_frcp:
-		src[0] = to_float(ctx, src[0]);
+		src[0] = to_float(&ctx->ac, src[0]);
 		result = ac_build_fdiv(&ctx->ac, ctx->f32one, src[0]);
 		break;
 	case nir_op_iand:
@@ -1538,146 +1623,155 @@
 		result = LLVMBuildXor(ctx->builder, src[0], src[1], "");
 		break;
 	case nir_op_ishl:
-		result = LLVMBuildShl(ctx->builder, src[0], src[1], "");
+		result = LLVMBuildShl(ctx->builder, src[0],
+				      LLVMBuildZExt(ctx->builder, src[1],
+						    LLVMTypeOf(src[0]), ""),
+				      "");
 		break;
 	case nir_op_ishr:
-		result = LLVMBuildAShr(ctx->builder, src[0], src[1], "");
+		result = LLVMBuildAShr(ctx->builder, src[0],
+				       LLVMBuildZExt(ctx->builder, src[1],
+						     LLVMTypeOf(src[0]), ""),
+				       "");
 		break;
 	case nir_op_ushr:
-		result = LLVMBuildLShr(ctx->builder, src[0], src[1], "");
+		result = LLVMBuildLShr(ctx->builder, src[0],
+				       LLVMBuildZExt(ctx->builder, src[1],
+						     LLVMTypeOf(src[0]), ""),
+				       "");
 		break;
 	case nir_op_ilt:
-		result = emit_int_cmp(ctx, LLVMIntSLT, src[0], src[1]);
+		result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]);
 		break;
 	case nir_op_ine:
-		result = emit_int_cmp(ctx, LLVMIntNE, src[0], src[1]);
+		result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]);
 		break;
 	case nir_op_ieq:
-		result = emit_int_cmp(ctx, LLVMIntEQ, src[0], src[1]);
+		result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]);
 		break;
 	case nir_op_ige:
-		result = emit_int_cmp(ctx, LLVMIntSGE, src[0], src[1]);
+		result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]);
 		break;
 	case nir_op_ult:
-		result = emit_int_cmp(ctx, LLVMIntULT, src[0], src[1]);
+		result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]);
 		break;
 	case nir_op_uge:
-		result = emit_int_cmp(ctx, LLVMIntUGE, src[0], src[1]);
+		result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]);
 		break;
 	case nir_op_feq:
-		result = emit_float_cmp(ctx, LLVMRealUEQ, src[0], src[1]);
+		result = emit_float_cmp(&ctx->ac, LLVMRealUEQ, src[0], src[1]);
 		break;
 	case nir_op_fne:
-		result = emit_float_cmp(ctx, LLVMRealUNE, src[0], src[1]);
+		result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]);
 		break;
 	case nir_op_flt:
-		result = emit_float_cmp(ctx, LLVMRealULT, src[0], src[1]);
+		result = emit_float_cmp(&ctx->ac, LLVMRealULT, src[0], src[1]);
 		break;
 	case nir_op_fge:
-		result = emit_float_cmp(ctx, LLVMRealUGE, src[0], src[1]);
+		result = emit_float_cmp(&ctx->ac, LLVMRealUGE, src[0], src[1]);
 		break;
 	case nir_op_fabs:
-		result = emit_intrin_1f_param(ctx, "llvm.fabs",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.fabs",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_iabs:
-		result = emit_iabs(ctx, src[0]);
+		result = emit_iabs(&ctx->ac, src[0]);
 		break;
 	case nir_op_imax:
-		result = emit_minmax_int(ctx, LLVMIntSGT, src[0], src[1]);
+		result = emit_minmax_int(&ctx->ac, LLVMIntSGT, src[0], src[1]);
 		break;
 	case nir_op_imin:
-		result = emit_minmax_int(ctx, LLVMIntSLT, src[0], src[1]);
+		result = emit_minmax_int(&ctx->ac, LLVMIntSLT, src[0], src[1]);
 		break;
 	case nir_op_umax:
-		result = emit_minmax_int(ctx, LLVMIntUGT, src[0], src[1]);
+		result = emit_minmax_int(&ctx->ac, LLVMIntUGT, src[0], src[1]);
 		break;
 	case nir_op_umin:
-		result = emit_minmax_int(ctx, LLVMIntULT, src[0], src[1]);
+		result = emit_minmax_int(&ctx->ac, LLVMIntULT, src[0], src[1]);
 		break;
 	case nir_op_isign:
-		result = emit_isign(ctx, src[0]);
+		result = emit_isign(&ctx->ac, src[0]);
 		break;
 	case nir_op_fsign:
-		src[0] = to_float(ctx, src[0]);
-		result = emit_fsign(ctx, src[0]);
+		src[0] = to_float(&ctx->ac, src[0]);
+		result = emit_fsign(&ctx->ac, src[0]);
 		break;
 	case nir_op_ffloor:
-		result = emit_intrin_1f_param(ctx, "llvm.floor",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.floor",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_ftrunc:
-		result = emit_intrin_1f_param(ctx, "llvm.trunc",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.trunc",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_fceil:
-		result = emit_intrin_1f_param(ctx, "llvm.ceil",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.ceil",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_fround_even:
-		result = emit_intrin_1f_param(ctx, "llvm.rint",
-		                              to_float_type(ctx, def_type),src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.rint",
+		                              to_float_type(&ctx->ac, def_type),src[0]);
 		break;
 	case nir_op_ffract:
-		result = emit_ffract(ctx, src[0]);
+		result = emit_ffract(&ctx->ac, src[0]);
 		break;
 	case nir_op_fsin:
-		result = emit_intrin_1f_param(ctx, "llvm.sin",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.sin",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_fcos:
-		result = emit_intrin_1f_param(ctx, "llvm.cos",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.cos",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_fsqrt:
-		result = emit_intrin_1f_param(ctx, "llvm.sqrt",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_fexp2:
-		result = emit_intrin_1f_param(ctx, "llvm.exp2",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.exp2",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_flog2:
-		result = emit_intrin_1f_param(ctx, "llvm.log2",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.log2",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_frsq:
-		result = emit_intrin_1f_param(ctx, "llvm.sqrt",
-		                              to_float_type(ctx, def_type), src[0]);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
+		                              to_float_type(&ctx->ac, def_type), src[0]);
 		result = ac_build_fdiv(&ctx->ac, ctx->f32one, result);
 		break;
 	case nir_op_fpow:
-		result = emit_intrin_2f_param(ctx, "llvm.pow",
-		                              to_float_type(ctx, def_type), src[0], src[1]);
+		result = emit_intrin_2f_param(&ctx->ac, "llvm.pow",
+		                              to_float_type(&ctx->ac, def_type), src[0], src[1]);
 		break;
 	case nir_op_fmax:
-		result = emit_intrin_2f_param(ctx, "llvm.maxnum",
-		                              to_float_type(ctx, def_type), src[0], src[1]);
+		result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum",
+		                              to_float_type(&ctx->ac, def_type), src[0], src[1]);
 		if (instr->dest.dest.ssa.bit_size == 32)
-			result = emit_intrin_1f_param(ctx, "llvm.canonicalize",
-						      to_float_type(ctx, def_type),
+			result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize",
+						      to_float_type(&ctx->ac, def_type),
 						      result);
 		break;
 	case nir_op_fmin:
-		result = emit_intrin_2f_param(ctx, "llvm.minnum",
-		                              to_float_type(ctx, def_type), src[0], src[1]);
+		result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum",
+		                              to_float_type(&ctx->ac, def_type), src[0], src[1]);
 		if (instr->dest.dest.ssa.bit_size == 32)
-			result = emit_intrin_1f_param(ctx, "llvm.canonicalize",
-						      to_float_type(ctx, def_type),
+			result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize",
+						      to_float_type(&ctx->ac, def_type),
 						      result);
 		break;
 	case nir_op_ffma:
-		result = emit_intrin_3f_param(ctx, "llvm.fma",
-		                              to_float_type(ctx, def_type), src[0], src[1], src[2]);
+		result = emit_intrin_3f_param(&ctx->ac, "llvm.fmuladd",
+		                              to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
 		break;
 	case nir_op_ibitfield_extract:
-		result = emit_bitfield_extract(ctx, true, src);
+		result = emit_bitfield_extract(&ctx->ac, true, src);
 		break;
 	case nir_op_ubitfield_extract:
-		result = emit_bitfield_extract(ctx, false, src);
+		result = emit_bitfield_extract(&ctx->ac, false, src);
 		break;
 	case nir_op_bitfield_insert:
-		result = emit_bitfield_insert(ctx, src[0], src[1], src[2], src[3]);
+		result = emit_bitfield_insert(&ctx->ac, src[0], src[1], src[2], src[3]);
 		break;
 	case nir_op_bitfield_reverse:
 		result = ac_build_intrinsic(&ctx->ac, "llvm.bitreverse.i32", ctx->i32, src, 1, AC_FUNC_ATTR_READNONE);
@@ -1689,82 +1783,107 @@
 	case nir_op_vec3:
 	case nir_op_vec4:
 		for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
-			src[i] = to_integer(ctx, src[i]);
+			src[i] = to_integer(&ctx->ac, src[i]);
 		result = ac_build_gather_values(&ctx->ac, src, num_components);
 		break;
 	case nir_op_f2i32:
 	case nir_op_f2i64:
-		src[0] = to_float(ctx, src[0]);
+		src[0] = to_float(&ctx->ac, src[0]);
 		result = LLVMBuildFPToSI(ctx->builder, src[0], def_type, "");
 		break;
 	case nir_op_f2u32:
 	case nir_op_f2u64:
-		src[0] = to_float(ctx, src[0]);
+		src[0] = to_float(&ctx->ac, src[0]);
 		result = LLVMBuildFPToUI(ctx->builder, src[0], def_type, "");
 		break;
 	case nir_op_i2f32:
 	case nir_op_i2f64:
-		result = LLVMBuildSIToFP(ctx->builder, src[0], to_float_type(ctx, def_type), "");
+		src[0] = to_integer(&ctx->ac, src[0]);
+		result = LLVMBuildSIToFP(ctx->builder, src[0], to_float_type(&ctx->ac, def_type), "");
 		break;
 	case nir_op_u2f32:
 	case nir_op_u2f64:
-		result = LLVMBuildUIToFP(ctx->builder, src[0], to_float_type(ctx, def_type), "");
+		src[0] = to_integer(&ctx->ac, src[0]);
+		result = LLVMBuildUIToFP(ctx->builder, src[0], to_float_type(&ctx->ac, def_type), "");
 		break;
 	case nir_op_f2f64:
-		result = LLVMBuildFPExt(ctx->builder, src[0], to_float_type(ctx, def_type), "");
+		result = LLVMBuildFPExt(ctx->builder, src[0], to_float_type(&ctx->ac, def_type), "");
 		break;
 	case nir_op_f2f32:
-		result = LLVMBuildFPTrunc(ctx->builder, src[0], to_float_type(ctx, def_type), "");
+		result = LLVMBuildFPTrunc(ctx->builder, src[0], to_float_type(&ctx->ac, def_type), "");
 		break;
 	case nir_op_u2u32:
 	case nir_op_u2u64:
-		if (get_elem_bits(ctx, LLVMTypeOf(src[0])) < get_elem_bits(ctx, def_type))
+		src[0] = to_integer(&ctx->ac, src[0]);
+		if (get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < get_elem_bits(&ctx->ac, def_type))
 			result = LLVMBuildZExt(ctx->builder, src[0], def_type, "");
 		else
 			result = LLVMBuildTrunc(ctx->builder, src[0], def_type, "");
 		break;
 	case nir_op_i2i32:
 	case nir_op_i2i64:
-		if (get_elem_bits(ctx, LLVMTypeOf(src[0])) < get_elem_bits(ctx, def_type))
+		src[0] = to_integer(&ctx->ac, src[0]);
+		if (get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < get_elem_bits(&ctx->ac, def_type))
 			result = LLVMBuildSExt(ctx->builder, src[0], def_type, "");
 		else
 			result = LLVMBuildTrunc(ctx->builder, src[0], def_type, "");
 		break;
 	case nir_op_bcsel:
-		result = emit_bcsel(ctx, src[0], src[1], src[2]);
+		result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]);
 		break;
 	case nir_op_find_lsb:
-		result = emit_find_lsb(ctx, src[0]);
+		src[0] = to_integer(&ctx->ac, src[0]);
+		result = emit_find_lsb(&ctx->ac, src[0]);
 		break;
 	case nir_op_ufind_msb:
-		result = emit_ufind_msb(ctx, src[0]);
+		src[0] = to_integer(&ctx->ac, src[0]);
+		result = emit_ufind_msb(&ctx->ac, src[0]);
 		break;
 	case nir_op_ifind_msb:
-		result = emit_ifind_msb(ctx, src[0]);
+		src[0] = to_integer(&ctx->ac, src[0]);
+		result = emit_ifind_msb(&ctx->ac, src[0]);
 		break;
 	case nir_op_uadd_carry:
-		result = emit_uint_carry(ctx, "llvm.uadd.with.overflow.i32", src[0], src[1]);
+		src[0] = to_integer(&ctx->ac, src[0]);
+		src[1] = to_integer(&ctx->ac, src[1]);
+		result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]);
 		break;
 	case nir_op_usub_borrow:
-		result = emit_uint_carry(ctx, "llvm.usub.with.overflow.i32", src[0], src[1]);
+		src[0] = to_integer(&ctx->ac, src[0]);
+		src[1] = to_integer(&ctx->ac, src[1]);
+		result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]);
 		break;
 	case nir_op_b2f:
-		result = emit_b2f(ctx, src[0]);
+		result = emit_b2f(&ctx->ac, src[0]);
+		break;
+	case nir_op_f2b:
+		result = emit_f2b(&ctx->ac, src[0]);
+		break;
+	case nir_op_b2i:
+		result = emit_b2i(&ctx->ac, src[0]);
+		break;
+	case nir_op_i2b:
+		src[0] = to_integer(&ctx->ac, src[0]);
+		result = emit_i2b(&ctx->ac, src[0]);
 		break;
 	case nir_op_fquantize2f16:
 		result = emit_f2f16(ctx, src[0]);
 		break;
 	case nir_op_umul_high:
-		result = emit_umul_high(ctx, src[0], src[1]);
+		src[0] = to_integer(&ctx->ac, src[0]);
+		src[1] = to_integer(&ctx->ac, src[1]);
+		result = emit_umul_high(&ctx->ac, src[0], src[1]);
 		break;
 	case nir_op_imul_high:
-		result = emit_imul_high(ctx, src[0], src[1]);
+		src[0] = to_integer(&ctx->ac, src[0]);
+		src[1] = to_integer(&ctx->ac, src[1]);
+		result = emit_imul_high(&ctx->ac, src[0], src[1]);
 		break;
 	case nir_op_pack_half_2x16:
-		result = emit_pack_half_2x16(ctx, src[0]);
+		result = emit_pack_half_2x16(&ctx->ac, src[0]);
 		break;
 	case nir_op_unpack_half_2x16:
-		result = emit_unpack_half_2x16(ctx, src[0]);
+		result = emit_unpack_half_2x16(&ctx->ac, src[0]);
 		break;
 	case nir_op_fddx:
 	case nir_op_fddy:
@@ -1814,14 +1933,14 @@
 
 	if (result) {
 		assert(instr->dest.dest.is_ssa);
-		result = to_integer(ctx, result);
+		result = to_integer(&ctx->ac, result);
 		_mesa_hash_table_insert(ctx->defs, &instr->dest.dest.ssa,
 		                        result);
 	}
 }
 
 static void visit_load_const(struct nir_to_llvm_context *ctx,
-                             nir_load_const_instr *instr)
+                             const nir_load_const_instr *instr)
 {
 	LLVMValueRef values[4], value = NULL;
 	LLVMTypeRef element_type =
@@ -1905,7 +2024,7 @@
 
 static LLVMValueRef radv_lower_gather4_integer(struct nir_to_llvm_context *ctx,
 					       struct ac_image_args *args,
-					       nir_tex_instr *instr)
+					       const nir_tex_instr *instr)
 {
 	enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
 	LLVMValueRef coord = args->addr;
@@ -2018,7 +2137,8 @@
 }
 
 static LLVMValueRef build_tex_intrinsic(struct nir_to_llvm_context *ctx,
-					nir_tex_instr *instr,
+					const nir_tex_instr *instr,
+					bool lod_is_zero,
 					struct ac_image_args *args)
 {
 	if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
@@ -2044,7 +2164,10 @@
 		args->bias = true;
 		break;
 	case nir_texop_txl:
-		args->lod = true;
+		if (lod_is_zero)
+			args->level_zero = true;
+		else
+			args->lod = true;
 		break;
 	case nir_texop_txs:
 	case nir_texop_query_levels:
@@ -2070,7 +2193,7 @@
 		break;
 	}
 
-	if (instr->op == nir_texop_tg4) {
+	if (instr->op == nir_texop_tg4 && ctx->options->chip_class <= VI) {
 		enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
 		if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) {
 			return radv_lower_gather4_integer(ctx, args, instr);
@@ -2127,7 +2250,7 @@
 }
 
 static LLVMValueRef visit_get_buffer_size(struct nir_to_llvm_context *ctx,
-                                          nir_intrinsic_instr *instr)
+                                          const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef desc = get_src(ctx, instr->src[0]);
 
@@ -2139,7 +2262,7 @@
 	const char *store_name;
 	LLVMValueRef src_data = get_src(ctx, instr->src[0]);
 	LLVMTypeRef data_type = ctx->f32;
-	int elem_size_mult = get_elem_bits(ctx, LLVMTypeOf(src_data)) / 32;
+	int elem_size_mult = get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 32;
 	int components_32bit = elem_size_mult * instr->num_components;
 	unsigned writemask = nir_intrinsic_write_mask(instr);
 	LLVMValueRef base_data, base_offset;
@@ -2156,7 +2279,7 @@
 	if (components_32bit > 1)
 		data_type = LLVMVectorType(ctx->f32, components_32bit);
 
-	base_data = to_float(ctx, src_data);
+	base_data = to_float(&ctx->ac, src_data);
 	base_data = trim_vector(ctx, base_data, instr->num_components);
 	base_data = LLVMBuildBitCast(ctx->builder, base_data,
 				     data_type, "");
@@ -2220,7 +2343,7 @@
 }
 
 static LLVMValueRef visit_atomic_ssbo(struct nir_to_llvm_context *ctx,
-                                      nir_intrinsic_instr *instr)
+                                      const nir_intrinsic_instr *instr)
 {
 	const char *name;
 	LLVMValueRef params[6];
@@ -2276,7 +2399,7 @@
 }
 
 static LLVMValueRef visit_load_buffer(struct nir_to_llvm_context *ctx,
-                                      nir_intrinsic_instr *instr)
+                                      const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef results[2];
 	int load_components;
@@ -2336,15 +2459,13 @@
 }
 
 static LLVMValueRef visit_load_ubo_buffer(struct nir_to_llvm_context *ctx,
-                                          nir_intrinsic_instr *instr)
+                                          const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef results[8], ret;
 	LLVMValueRef rsrc = get_src(ctx, instr->src[0]);
 	LLVMValueRef offset = get_src(ctx, instr->src[1]);
 	int num_components = instr->num_components;
 
-	rsrc = LLVMBuildBitCast(ctx->builder, rsrc, LLVMVectorType(ctx->i8, 16), "");
-
 	if (instr->dest.ssa.bit_size == 64)
 		num_components *= 2;
 
@@ -2354,7 +2475,7 @@
 			LLVMBuildAdd(ctx->builder, LLVMConstInt(ctx->i32, 4 * i, 0),
 				     offset, "")
 		};
-		results[i] = ac_build_intrinsic(&ctx->ac, "llvm.SI.load.const", ctx->f32,
+		results[i] = ac_build_intrinsic(&ctx->ac, "llvm.SI.load.const.v4i32", ctx->f32,
 						params, 2,
 						AC_FUNC_ATTR_READNONE |
 						AC_FUNC_ATTR_LEGACY);
@@ -2731,7 +2852,7 @@
 
 static LLVMValueRef
 load_tes_input(struct nir_to_llvm_context *ctx,
-	       nir_intrinsic_instr *instr)
+	       const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef buf_addr;
 	LLVMValueRef result;
@@ -2755,7 +2876,7 @@
 						     is_compact, vertex_index, indir_index);
 
 	result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, instr->num_components, NULL,
-				      buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true);
+				      buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true, false);
 	result = trim_vector(ctx, result, instr->num_components);
 	result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx, &instr->dest.ssa), "");
 	return result;
@@ -2802,6 +2923,45 @@
 	return result;
 }
 
+static LLVMValueRef
+build_gep_for_deref(struct nir_to_llvm_context *ctx,
+		    nir_deref_var *deref)
+{
+	struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, deref->var);
+	assert(entry->data);
+	LLVMValueRef val = entry->data;
+	nir_deref *tail = deref->deref.child;
+	while (tail != NULL) {
+		LLVMValueRef offset;
+		switch (tail->deref_type) {
+		case nir_deref_type_array: {
+			nir_deref_array *array = nir_deref_as_array(tail);
+			offset = LLVMConstInt(ctx->i32, array->base_offset, 0);
+			if (array->deref_array_type ==
+			    nir_deref_array_type_indirect) {
+				offset = LLVMBuildAdd(ctx->builder, offset,
+						      get_src(ctx,
+							      array->indirect),
+						      "");
+			}
+			break;
+		}
+		case nir_deref_type_struct: {
+			nir_deref_struct *deref_struct =
+				nir_deref_as_struct(tail);
+			offset = LLVMConstInt(ctx->i32,
+					      deref_struct->index, 0);
+			break;
+		}
+		default:
+			unreachable("bad deref type");
+		}
+		val = ac_build_gep0(&ctx->ac, val, offset);
+		tail = tail->child;
+	}
+	return val;
+}
+
 static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
 				   nir_intrinsic_instr *instr)
 {
@@ -2863,6 +3023,14 @@
 			}
 		}
 		break;
+	case nir_var_shared: {
+		LLVMValueRef address = build_gep_for_deref(ctx,
+							   instr->variables[0]);
+		LLVMValueRef val = LLVMBuildLoad(ctx->builder, address, "");
+		return LLVMBuildBitCast(ctx->builder, val,
+					get_def_type(ctx, &instr->dest.ssa),
+					"");
+	}
 	case nir_var_shader_out:
 		if (ctx->stage == MESA_SHADER_TESS_CTRL)
 			return load_tcs_output(ctx, instr);
@@ -2885,23 +3053,6 @@
 			}
 		}
 		break;
-	case nir_var_shared: {
-		LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
-		LLVMValueRef derived_ptr;
-
-		if (indir_index)
-			indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
-
-		for (unsigned chan = 0; chan < ve; chan++) {
-			LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
-			if (indir_index)
-				index = LLVMBuildAdd(ctx->builder, index, indir_index, "");
-			derived_ptr = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
-
-			values[chan] = LLVMBuildLoad(ctx->builder, derived_ptr, "");
-		}
-		break;
-	}
 	default:
 		unreachable("unhandle variable mode");
 	}
@@ -2915,14 +3066,14 @@
 {
 	LLVMValueRef temp_ptr, value;
 	int idx = instr->variables[0]->var->data.driver_location;
-	LLVMValueRef src = to_float(ctx, get_src(ctx, instr->src[0]));
+	LLVMValueRef src = to_float(&ctx->ac, get_src(ctx, instr->src[0]));
 	int writemask = instr->const_index[0];
 	LLVMValueRef indir_index;
 	unsigned const_index;
 	radv_get_deref_offset(ctx, instr->variables[0], false,
 	                      NULL, NULL, &const_index, &indir_index);
 
-	if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
+	if (get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) {
 		int old_writemask = writemask;
 
 		src = LLVMBuildBitCast(ctx->builder, src,
@@ -3002,24 +3153,32 @@
 		}
 		break;
 	case nir_var_shared: {
-		LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
-
-		if (indir_index)
-			indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
-
-		for (unsigned chan = 0; chan < 8; chan++) {
-			if (!(writemask & (1 << chan)))
-				continue;
-			LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
-			LLVMValueRef derived_ptr;
-
-			if (indir_index)
-				index = LLVMBuildAdd(ctx->builder, index, indir_index, "");
-
-			value = llvm_extract_elem(ctx, src, chan);
-			derived_ptr = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
-			LLVMBuildStore(ctx->builder,
-			               to_integer(ctx, value), derived_ptr);
+		int writemask = instr->const_index[0];
+		LLVMValueRef address = build_gep_for_deref(ctx,
+							   instr->variables[0]);
+		LLVMValueRef val = get_src(ctx, instr->src[0]);
+		unsigned components =
+			glsl_get_vector_elements(
+			   nir_deref_tail(&instr->variables[0]->deref)->type);
+		if (writemask == (1 << components) - 1) {
+			val = LLVMBuildBitCast(
+			   ctx->builder, val,
+			   LLVMGetElementType(LLVMTypeOf(address)), "");
+			LLVMBuildStore(ctx->builder, val, address);
+		} else {
+			for (unsigned chan = 0; chan < 4; chan++) {
+				if (!(writemask & (1 << chan)))
+					continue;
+				LLVMValueRef ptr =
+					LLVMBuildStructGEP(ctx->builder,
+							   address, chan, "");
+				LLVMValueRef src = llvm_extract_elem(ctx, val,
+								     chan);
+				src = LLVMBuildBitCast(
+				   ctx->builder, src,
+				   LLVMGetElementType(LLVMTypeOf(ptr)), "");
+				LLVMBuildStore(ctx->builder, src, ptr);
+			}
 		}
 		break;
 	}
@@ -3095,7 +3254,7 @@
 
 	res = ac_build_image_opcode(&ctx->ac, &args);
 
-	res = to_integer(ctx, res);
+	res = to_integer(&ctx->ac, res);
 	LLVMValueRef four = LLVMConstInt(ctx->i32, 4, false);
 	LLVMValueRef F = LLVMConstInt(ctx->i32, 0xf, false);
 
@@ -3133,7 +3292,7 @@
 }
 
 static LLVMValueRef get_image_coords(struct nir_to_llvm_context *ctx,
-				     nir_intrinsic_instr *instr)
+				     const nir_intrinsic_instr *instr)
 {
 	const struct glsl_type *type = instr->variables[0]->var->type;
 	if(instr->variables[0]->deref.child)
@@ -3150,13 +3309,13 @@
 
 	int count;
 	enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
+	bool is_array = glsl_sampler_type_is_array(type);
 	bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS ||
 			     dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
 	bool is_ms = (dim == GLSL_SAMPLER_DIM_MS ||
 		      dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
-
-	count = image_type_to_components_count(dim,
-					       glsl_sampler_type_is_array(type));
+	bool gfx9_1d = ctx->options->chip_class >= GFX9 && dim == GLSL_SAMPLER_DIM_1D;
+	count = image_type_to_components_count(dim, is_array);
 
 	if (is_ms) {
 		LLVMValueRef fmask_load_address[3];
@@ -3164,7 +3323,7 @@
 
 		fmask_load_address[0] = LLVMBuildExtractElement(ctx->builder, src0, masks[0], "");
 		fmask_load_address[1] = LLVMBuildExtractElement(ctx->builder, src0, masks[1], "");
-		if (glsl_sampler_type_is_array(type))
+		if (is_array)
 			fmask_load_address[2] = LLVMBuildExtractElement(ctx->builder, src0, masks[2], "");
 		else
 			fmask_load_address[2] = NULL;
@@ -3179,7 +3338,7 @@
 							       sample_index,
 							       get_sampler_desc(ctx, instr->variables[0], DESC_FMASK));
 	}
-	if (count == 1) {
+	if (count == 1 && !gfx9_1d) {
 		if (instr->src[0].ssa->num_components)
 			res = LLVMBuildExtractElement(ctx->builder, src0, masks[0], "");
 		else
@@ -3189,13 +3348,22 @@
 		if (is_ms)
 			count--;
 		for (chan = 0; chan < count; ++chan) {
-			coords[chan] = LLVMBuildExtractElement(ctx->builder, src0, masks[chan], "");
+			coords[chan] = llvm_extract_elem(ctx, src0, chan);
 		}
-
 		if (add_frag_pos) {
 			for (chan = 0; chan < count; ++chan)
 				coords[chan] = LLVMBuildAdd(ctx->builder, coords[chan], LLVMBuildFPToUI(ctx->builder, ctx->frag_pos[chan], ctx->i32, ""), "");
 		}
+
+		if (gfx9_1d) {
+			if (is_array) {
+				coords[2] = coords[1];
+				coords[1] = ctx->ac.i32_0;
+			} else
+				coords[1] = ctx->ac.i32_0;
+			count++;
+		}
+
 		if (is_ms) {
 			coords[count] = sample_index;
 			count++;
@@ -3211,7 +3379,7 @@
 }
 
 static LLVMValueRef visit_image_load(struct nir_to_llvm_context *ctx,
-				     nir_intrinsic_instr *instr)
+				     const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef params[7];
 	LLVMValueRef res;
@@ -3233,7 +3401,7 @@
 					 params, 5, 0);
 
 		res = trim_vector(ctx, res, instr->dest.ssa.num_components);
-		res = to_integer(ctx, res);
+		res = to_integer(&ctx->ac, res);
 	} else {
 		bool is_da = glsl_sampler_type_is_array(type) ||
 			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;
@@ -3266,7 +3434,7 @@
 		res = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->v4f32,
 					 params, 7, AC_FUNC_ATTR_READONLY);
 	}
-	return to_integer(ctx, res);
+	return to_integer(&ctx->ac, res);
 }
 
 static void visit_image_store(struct nir_to_llvm_context *ctx,
@@ -3276,17 +3444,20 @@
 	char intrinsic_name[64];
 	const nir_variable *var = instr->variables[0]->var;
 	const struct glsl_type *type = glsl_without_array(var->type);
-
+	LLVMValueRef glc = ctx->i1false;
+	bool force_glc = ctx->options->chip_class == SI;
+	if (force_glc)
+		glc = ctx->i1true;
 	if (ctx->stage == MESA_SHADER_FRAGMENT)
 		ctx->shader_info->fs.writes_memory = true;
 
 	if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
-		params[0] = to_float(ctx, get_src(ctx, instr->src[2])); /* data */
+		params[0] = to_float(&ctx->ac, get_src(ctx, instr->src[2])); /* data */
 		params[1] = get_sampler_desc(ctx, instr->variables[0], DESC_BUFFER);
 		params[2] = LLVMBuildExtractElement(ctx->builder, get_src(ctx, instr->src[0]),
 						    LLVMConstInt(ctx->i32, 0, false), ""); /* vindex */
 		params[3] = LLVMConstInt(ctx->i32, 0, false); /* voffset */
-		params[4] = ctx->i1false;  /* glc */
+		params[4] = glc;  /* glc */
 		params[5] = ctx->i1false;  /* slc */
 		ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32", ctx->voidt,
 				   params, 6, 0);
@@ -3294,10 +3465,9 @@
 		bool is_da = glsl_sampler_type_is_array(type) ||
 			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;
 		LLVMValueRef da = is_da ? ctx->i1true : ctx->i1false;
-		LLVMValueRef glc = ctx->i1false;
 		LLVMValueRef slc = ctx->i1false;
 
-		params[0] = to_float(ctx, get_src(ctx, instr->src[2]));
+		params[0] = to_float(&ctx->ac, get_src(ctx, instr->src[2]));
 		params[1] = get_image_coords(ctx, instr); /* coords */
 		params[2] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
 		params[3] = LLVMConstInt(ctx->i32, 15, false); /* dmask */
@@ -3327,9 +3497,9 @@
 }
 
 static LLVMValueRef visit_image_atomic(struct nir_to_llvm_context *ctx,
-                                       nir_intrinsic_instr *instr)
+                                       const nir_intrinsic_instr *instr)
 {
-	LLVMValueRef params[6];
+	LLVMValueRef params[7];
 	int param_count = 0;
 	const nir_variable *var = instr->variables[0]->var;
 
@@ -3341,15 +3511,17 @@
 	if (ctx->stage == MESA_SHADER_FRAGMENT)
 		ctx->shader_info->fs.writes_memory = true;
 
+	bool is_unsigned = glsl_get_sampler_result_type(type) == GLSL_TYPE_UINT;
+
 	switch (instr->intrinsic) {
 	case nir_intrinsic_image_atomic_add:
 		atomic_name = "add";
 		break;
 	case nir_intrinsic_image_atomic_min:
-		atomic_name = "smin";
+		atomic_name = is_unsigned ? "umin" : "smin";
 		break;
 	case nir_intrinsic_image_atomic_max:
-		atomic_name = "smax";
+		atomic_name = is_unsigned ? "umax" : "smax";
 		break;
 	case nir_intrinsic_image_atomic_and:
 		atomic_name = "and";
@@ -3402,11 +3574,12 @@
 				  "llvm.amdgcn.image.atomic.%s.%s", atomic_name, coords_type);
 	}
 
+	assert(length < sizeof(intrinsic_name));
 	return ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->i32, params, param_count, 0);
 }
 
 static LLVMValueRef visit_image_size(struct nir_to_llvm_context *ctx,
-				     nir_intrinsic_instr *instr)
+				     const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef res;
 	const nir_variable *var = instr->variables[0]->var;
@@ -3429,14 +3602,23 @@
 
 	res = ac_build_image_opcode(&ctx->ac, &args);
 
+	LLVMValueRef two = LLVMConstInt(ctx->i32, 2, false);
+
 	if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
 	    glsl_sampler_type_is_array(type)) {
-		LLVMValueRef two = LLVMConstInt(ctx->i32, 2, false);
 		LLVMValueRef six = LLVMConstInt(ctx->i32, 6, false);
 		LLVMValueRef z = LLVMBuildExtractElement(ctx->builder, res, two, "");
 		z = LLVMBuildSDiv(ctx->builder, z, six, "");
 		res = LLVMBuildInsertElement(ctx->builder, res, z, two, "");
 	}
+	if (ctx->options->chip_class >= GFX9 &&
+	    glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
+	    glsl_sampler_type_is_array(type)) {
+		LLVMValueRef layers = LLVMBuildExtractElement(ctx->builder, res, two, "");
+		res = LLVMBuildInsertElement(ctx->builder, res, layers,
+						ctx->ac.i32_1, "");
+
+	}
 	return res;
 }
 
@@ -3470,7 +3652,7 @@
 }
 
 static void emit_discard_if(struct nir_to_llvm_context *ctx,
-			    nir_intrinsic_instr *instr)
+			    const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef cond;
 	ctx->shader_info->fs.can_discard = true;
@@ -3497,12 +3679,11 @@
 }
 
 static LLVMValueRef visit_var_atomic(struct nir_to_llvm_context *ctx,
-				     nir_intrinsic_instr *instr)
+				     const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef ptr, result;
-	int idx = instr->variables[0]->var->data.driver_location;
 	LLVMValueRef src = get_src(ctx, instr->src[0]);
-	ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
+	ptr = build_gep_for_deref(ctx, instr->variables[0]);
 
 	if (instr->intrinsic == nir_intrinsic_var_atomic_comp_swap) {
 		LLVMValueRef src1 = get_src(ctx, instr->src[1]);
@@ -3545,7 +3726,7 @@
 			return NULL;
 		}
 
-		result = LLVMBuildAtomicRMW(ctx->builder, op, ptr, to_integer(ctx, src),
+		result = LLVMBuildAtomicRMW(ctx->builder, op, ptr, to_integer(&ctx->ac, src),
 					    LLVMAtomicOrderingSequentiallyConsistent,
 					    false);
 	}
@@ -3596,7 +3777,6 @@
 	sample_id = LLVMBuildAdd(ctx->builder, sample_id, ctx->sample_pos_offset, "");
 	result = ac_build_indexed_load(&ctx->ac, ptr, sample_id, false);
 
-	ctx->shader_info->fs.uses_sample_positions = true;
 	return result;
 }
 
@@ -3604,13 +3784,13 @@
 {
 	LLVMValueRef values[2];
 
-	values[0] = emit_ffract(ctx, ctx->frag_pos[0]);
-	values[1] = emit_ffract(ctx, ctx->frag_pos[1]);
+	values[0] = emit_ffract(&ctx->ac, ctx->frag_pos[0]);
+	values[1] = emit_ffract(&ctx->ac, ctx->frag_pos[1]);
 	return ac_build_gather_values(&ctx->ac, values, 2);
 }
 
 static LLVMValueRef visit_interp(struct nir_to_llvm_context *ctx,
-				 nir_intrinsic_instr *instr)
+				 const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef result[2];
 	LLVMValueRef interp_param, attr_number;
@@ -3633,8 +3813,8 @@
 	}
 
 	if (instr->intrinsic == nir_intrinsic_interp_var_at_offset) {
-		src_c0 = to_float(ctx, LLVMBuildExtractElement(ctx->builder, src0, ctx->i32zero, ""));
-		src_c1 = to_float(ctx, LLVMBuildExtractElement(ctx->builder, src0, ctx->i32one, ""));
+		src_c0 = to_float(&ctx->ac, LLVMBuildExtractElement(ctx->builder, src0, ctx->i32zero, ""));
+		src_c1 = to_float(&ctx->ac, LLVMBuildExtractElement(ctx->builder, src0, ctx->i32one, ""));
 	} else if (instr->intrinsic == nir_intrinsic_interp_var_at_sample) {
 		LLVMValueRef sample_position;
 		LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
@@ -3715,7 +3895,7 @@
 
 static void
 visit_emit_vertex(struct nir_to_llvm_context *ctx,
-		  nir_intrinsic_instr *instr)
+		  const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef gs_next_vertex;
 	LLVMValueRef can_emit, kill;
@@ -3783,14 +3963,14 @@
 
 static void
 visit_end_primitive(struct nir_to_llvm_context *ctx,
-		    nir_intrinsic_instr *instr)
+		    const nir_intrinsic_instr *instr)
 {
 	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (0 << 8), ctx->gs_wave_id);
 }
 
 static LLVMValueRef
 visit_load_tess_coord(struct nir_to_llvm_context *ctx,
-		      nir_intrinsic_instr *instr)
+		      const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef coord[4] = {
 		ctx->tes_u,
@@ -3843,13 +4023,16 @@
 			result = ctx->gs_invocation_id;
 		break;
 	case nir_intrinsic_load_primitive_id:
-		if (ctx->stage == MESA_SHADER_GEOMETRY)
+		if (ctx->stage == MESA_SHADER_GEOMETRY) {
+			ctx->shader_info->gs.uses_prim_id = true;
 			result = ctx->gs_prim_id;
-		else if (ctx->stage == MESA_SHADER_TESS_CTRL)
+		} else if (ctx->stage == MESA_SHADER_TESS_CTRL) {
+			ctx->shader_info->tcs.uses_prim_id = true;
 			result = ctx->tcs_patch_id;
-		else if (ctx->stage == MESA_SHADER_TESS_EVAL)
+		} else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
+			ctx->shader_info->tcs.uses_prim_id = true;
 			result = ctx->tes_patch_id;
-		else
+		} else
 			fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage);
 		break;
 	case nir_intrinsic_load_sample_id:
@@ -3988,8 +4171,8 @@
 }
 
 static LLVMValueRef get_sampler_desc(struct nir_to_llvm_context *ctx,
-					  nir_deref_var *deref,
-					  enum desc_type desc_type)
+				     const nir_deref_var *deref,
+				     enum desc_type desc_type)
 {
 	unsigned desc_set = deref->var->data.descriptor_set;
 	LLVMValueRef list = ctx->descriptor_sets[desc_set];
@@ -4031,7 +4214,8 @@
 	}
 
 	if (deref->deref.child) {
-		nir_deref_array *child = (nir_deref_array*)deref->deref.child;
+		const nir_deref_array *child =
+			(const nir_deref_array *)deref->deref.child;
 
 		assert(child->deref_array_type != nir_deref_array_type_wildcard);
 		offset += child->base_offset * stride;
@@ -4072,7 +4256,7 @@
 
 static void set_tex_fetch_args(struct nir_to_llvm_context *ctx,
 			       struct ac_image_args *args,
-			       nir_tex_instr *instr,
+			       const nir_tex_instr *instr,
 			       nir_texop op,
 			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
 			       LLVMValueRef *param, unsigned count,
@@ -4159,9 +4343,9 @@
 static LLVMValueRef apply_round_slice(struct nir_to_llvm_context *ctx,
 				      LLVMValueRef coord)
 {
-	coord = to_float(ctx, coord);
+	coord = to_float(&ctx->ac, coord);
 	coord = ac_build_intrinsic(&ctx->ac, "llvm.rint.f32", ctx->f32, &coord, 1, 0);
-	coord = to_integer(ctx, coord);
+	coord = to_integer(&ctx->ac, coord);
 	return coord;
 }
 
@@ -4179,7 +4363,7 @@
 	LLVMValueRef derivs[6];
 	unsigned chan, count = 0;
 	unsigned const_src = 0, num_deriv_comp = 0;
-
+	bool lod_is_zero = false;
 	tex_fetch_ptrs(ctx, instr, &res_ptr, &samp_ptr, &fmask_ptr);
 
 	for (unsigned i = 0; i < instr->num_srcs; i++) {
@@ -4199,9 +4383,14 @@
 		case nir_tex_src_bias:
 			bias = get_src(ctx, instr->src[i].src);
 			break;
-		case nir_tex_src_lod:
+		case nir_tex_src_lod: {
+			nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
+
+			if (val && val->i32[0] == 0)
+				lod_is_zero = true;
 			lod = get_src(ctx, instr->src[i].src);
 			break;
+		}
 		case nir_tex_src_ms_index:
 			sample_index = get_src(ctx, instr->src[i].src);
 			break;
@@ -4286,36 +4475,50 @@
 
 	/* pack derivatives */
 	if (ddx || ddy) {
+		int num_src_deriv_channels, num_dest_deriv_channels;
 		switch (instr->sampler_dim) {
 		case GLSL_SAMPLER_DIM_3D:
 		case GLSL_SAMPLER_DIM_CUBE:
 			num_deriv_comp = 3;
+			num_src_deriv_channels = 3;
+			num_dest_deriv_channels = 3;
 			break;
 		case GLSL_SAMPLER_DIM_2D:
 		default:
+			num_src_deriv_channels = 2;
+			num_dest_deriv_channels = 2;
 			num_deriv_comp = 2;
 			break;
 		case GLSL_SAMPLER_DIM_1D:
-			num_deriv_comp = 1;
+			num_src_deriv_channels = 1;
+			if (ctx->options->chip_class >= GFX9) {
+				num_dest_deriv_channels = 2;
+				num_deriv_comp = 2;
+			} else {
+				num_dest_deriv_channels = 1;
+				num_deriv_comp = 1;
+			}
 			break;
 		}
 
-		for (unsigned i = 0; i < num_deriv_comp; i++) {
-			derivs[i] = to_float(ctx, llvm_extract_elem(ctx, ddx, i));
-			derivs[num_deriv_comp + i] = to_float(ctx, llvm_extract_elem(ctx, ddy, i));
+		for (unsigned i = 0; i < num_src_deriv_channels; i++) {
+			derivs[i] = to_float(&ctx->ac, llvm_extract_elem(ctx, ddx, i));
+			derivs[num_dest_deriv_channels + i] = to_float(&ctx->ac, llvm_extract_elem(ctx, ddy, i));
+		}
+		for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) {
+			derivs[i] = ctx->ac.f32_0;
+			derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0;
 		}
 	}
 
 	if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && coord) {
-		if (instr->is_array && instr->op != nir_texop_lod)
-			coords[3] = apply_round_slice(ctx, coords[3]);
 		for (chan = 0; chan < instr->coord_components; chan++)
-			coords[chan] = to_float(ctx, coords[chan]);
+			coords[chan] = to_float(&ctx->ac, coords[chan]);
 		if (instr->coord_components == 3)
 			coords[3] = LLVMGetUndef(ctx->f32);
 		ac_prepare_cube_coords(&ctx->ac,
 			instr->op == nir_texop_txd, instr->is_array,
-			coords, derivs);
+			instr->op == nir_texop_lod, coords, derivs);
 		if (num_deriv_comp)
 			num_deriv_comp--;
 	}
@@ -4343,10 +4546,30 @@
 			}
 			address[count++] = coords[2];
 		}
+
+		if (ctx->options->chip_class >= GFX9) {
+			LLVMValueRef filler;
+			if (instr->op == nir_texop_txf)
+				filler = ctx->ac.i32_0;
+			else
+				filler = LLVMConstReal(ctx->f32, 0.5);
+
+			if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D) {
+				/* No nir_texop_lod, because it does not take a slice
+				 * even with array textures. */
+				if (instr->is_array && instr->op != nir_texop_lod ) {
+					address[count] = address[count - 1];
+					address[count - 1] = filler;
+					count++;
+				} else
+					address[count++] = filler;
+			}
+		}
 	}
 
 	/* Pack LOD */
-	if ((instr->op == nir_texop_txl || instr->op == nir_texop_txf) && lod) {
+	if (lod && ((instr->op == nir_texop_txl && !lod_is_zero) ||
+		    instr->op == nir_texop_txf)) {
 		address[count++] = lod;
 	} else if (instr->op == nir_texop_txf_ms && sample_index) {
 		address[count++] = sample_index;
@@ -4377,10 +4600,10 @@
 				   fmask_ptr, NULL,
 				   txf_address, txf_count, 0xf);
 
-		result = build_tex_intrinsic(ctx, instr, &txf_args);
+		result = build_tex_intrinsic(ctx, instr, false, &txf_args);
 
 		result = LLVMBuildExtractElement(ctx->builder, result, ctx->i32zero, "");
-		result = emit_int_cmp(ctx, LLVMIntEQ, result, ctx->i32zero);
+		result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->i32zero);
 		goto write_result;
 	}
 
@@ -4422,7 +4645,7 @@
 	set_tex_fetch_args(ctx, &args, instr, instr->op,
 			   res_ptr, samp_ptr, address, count, dmask);
 
-	result = build_tex_intrinsic(ctx, instr, &args);
+	result = build_tex_intrinsic(ctx, instr, lod_is_zero, &args);
 
 	if (instr->op == nir_texop_query_levels)
 		result = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, 3, false), "");
@@ -4436,13 +4659,21 @@
 		LLVMValueRef z = LLVMBuildExtractElement(ctx->builder, result, two, "");
 		z = LLVMBuildSDiv(ctx->builder, z, six, "");
 		result = LLVMBuildInsertElement(ctx->builder, result, z, two, "");
+	} else if (ctx->options->chip_class >= GFX9 &&
+		   instr->op == nir_texop_txs &&
+		   instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
+		   instr->is_array) {
+		LLVMValueRef two = LLVMConstInt(ctx->i32, 2, false);
+		LLVMValueRef layers = LLVMBuildExtractElement(ctx->builder, result, two, "");
+		result = LLVMBuildInsertElement(ctx->builder, result, layers,
+						ctx->ac.i32_1, "");
 	} else if (instr->dest.ssa.num_components != 4)
 		result = trim_vector(ctx, result, instr->dest.ssa.num_components);
 
 write_result:
 	if (result) {
 		assert(instr->dest.is_ssa);
-		result = to_integer(ctx, result);
+		result = to_integer(&ctx->ac, result);
 		_mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
 	}
 }
@@ -4480,7 +4711,7 @@
 
 
 static void visit_ssa_undef(struct nir_to_llvm_context *ctx,
-			    nir_ssa_undef_instr *instr)
+			    const nir_ssa_undef_instr *instr)
 {
 	unsigned num_components = instr->def.num_components;
 	LLVMValueRef undef;
@@ -4494,7 +4725,7 @@
 }
 
 static void visit_jump(struct nir_to_llvm_context *ctx,
-		       nir_jump_instr *instr)
+		       const nir_jump_instr *instr)
 {
 	switch (instr->type) {
 	case nir_jump_break:
@@ -4669,7 +4900,7 @@
 		for (unsigned chan = 0; chan < 4; chan++) {
 			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, false);
 			ctx->inputs[radeon_llvm_reg_index_soa(idx, chan)] =
-				to_integer(ctx, LLVMBuildExtractElement(ctx->builder,
+				to_integer(&ctx->ac, LLVMBuildExtractElement(ctx->builder,
 							input, llvm_chan, ""));
 		}
 	}
@@ -4892,6 +5123,68 @@
 	ctx->output_mask |= mask_attribs;
 }
 
+static LLVMTypeRef
+glsl_base_to_llvm_type(struct nir_to_llvm_context *ctx, enum glsl_base_type type)
+{
+	/* Pick the LLVM scalar type held in ctx that backs a GLSL base type. */
+	switch (type) {
+	case GLSL_TYPE_FLOAT: /* TODO handle mediump */
+		return ctx->f32;
+	case GLSL_TYPE_DOUBLE:
+		return ctx->f64;
+	case GLSL_TYPE_INT64:
+	case GLSL_TYPE_UINT64:
+		return ctx->i64;
+	case GLSL_TYPE_BOOL:
+	case GLSL_TYPE_INT:
+	case GLSL_TYPE_UINT:
+	case GLSL_TYPE_SUBROUTINE:
+		return ctx->i32;
+	default:
+		unreachable("unknown GLSL type");
+	}
+}
+
+static LLVMTypeRef
+glsl_to_llvm_type(struct nir_to_llvm_context *ctx,
+		  const struct glsl_type *type)
+{
+	/* Scalars use the base type directly; vectors wrap it in an LLVM vector. */
+	if (glsl_type_is_scalar(type))
+		return glsl_base_to_llvm_type(ctx, glsl_get_base_type(type));
+
+	if (glsl_type_is_vector(type)) {
+		LLVMTypeRef elem =
+			glsl_base_to_llvm_type(ctx, glsl_get_base_type(type));
+		return LLVMVectorType(elem, glsl_get_vector_elements(type));
+	}
+
+	/* A matrix becomes an LLVM array of its column type. */
+	if (glsl_type_is_matrix(type)) {
+		LLVMTypeRef column =
+			glsl_to_llvm_type(ctx, glsl_get_column_type(type));
+		return LLVMArrayType(column, glsl_get_matrix_columns(type));
+	}
+
+	if (glsl_type_is_array(type)) {
+		LLVMTypeRef elem =
+			glsl_to_llvm_type(ctx, glsl_get_array_element(type));
+		return LLVMArrayType(elem, glsl_get_length(type));
+	}
+
+	/* Anything left must be a struct: convert each field in order. */
+	assert(glsl_type_is_struct(type));
+
+	const unsigned num_fields = glsl_get_length(type);
+	LLVMTypeRef member_types[num_fields];
+
+	for (unsigned f = 0; f < num_fields; f++)
+		member_types[f] = glsl_to_llvm_type(ctx, glsl_get_struct_field(type, f));
+
+	return LLVMStructTypeInContext(ctx->context, member_types,
+				       num_fields, false);
+}
+
 static void
 setup_locals(struct nir_to_llvm_context *ctx,
 	     struct nir_function *func)
@@ -4915,8 +5208,22 @@
 	}
 }
 
+static void
+setup_shared(struct nir_to_llvm_context *ctx,
+	     struct nir_shader *nir)
+{
+	/* Create one LOCAL_ADDR_SPACE global per shared variable and record it in ctx->vars. */
+	nir_foreach_variable(variable, &nir->shared) {
+		const char *name = variable->name ? variable->name : "";
+		LLVMTypeRef type = glsl_to_llvm_type(ctx, variable->type);
+		LLVMValueRef global = LLVMAddGlobalInAddressSpace(ctx->module, type,
+								   name, LOCAL_ADDR_SPACE);
+		_mesa_hash_table_insert(ctx->vars, variable, global);
+	}
+}
+
 static LLVMValueRef
-emit_float_saturate(struct nir_to_llvm_context *ctx, LLVMValueRef v, float lo, float hi)
+emit_float_saturate(struct ac_llvm_context *ctx, LLVMValueRef v, float lo, float hi)
 {
 	v = to_float(ctx, v);
 	v = emit_intrin_2f_param(ctx, "llvm.maxnum.f32", ctx->f32, v, LLVMConstReal(ctx->f32, lo));
@@ -4969,6 +5276,7 @@
 		unsigned index = target - V_008DFC_SQ_EXP_MRT;
 		unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
 		bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
+		bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
 
 		switch(col_format) {
 		case V_028714_SPI_SHADER_ZERO:
@@ -5026,7 +5334,7 @@
 
 		case V_028714_SPI_SHADER_SNORM16_ABGR:
 			for (unsigned chan = 0; chan < 4; chan++) {
-				val[chan] = emit_float_saturate(ctx, values[chan], -1, 1);
+				val[chan] = emit_float_saturate(&ctx->ac, values[chan], -1, 1);
 				val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
 							LLVMConstReal(ctx->f32, 32767), "");
 
@@ -5046,11 +5354,13 @@
 			break;
 
 		case V_028714_SPI_SHADER_UINT16_ABGR: {
-			LLVMValueRef max = LLVMConstInt(ctx->i32, is_int8 ? 255 : 65535, 0);
+			LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
+							    is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
+			LLVMValueRef max_alpha = !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
 
 			for (unsigned chan = 0; chan < 4; chan++) {
-				val[chan] = to_integer(ctx, values[chan]);
-				val[chan] = emit_minmax_int(ctx, LLVMIntULT, val[chan], max);
+				val[chan] = to_integer(&ctx->ac, values[chan]);
+				val[chan] = emit_minmax_int(&ctx->ac, LLVMIntULT, val[chan], chan == 3 ? max_alpha : max_rgb);
 			}
 
 			args->compr = 1;
@@ -5060,14 +5370,18 @@
 		}
 
 		case V_028714_SPI_SHADER_SINT16_ABGR: {
-			LLVMValueRef max = LLVMConstInt(ctx->i32, is_int8 ? 127 : 32767, 0);
-			LLVMValueRef min = LLVMConstInt(ctx->i32, is_int8 ? -128 : -32768, 0);
+			LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
+							    is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
+			LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
+							    is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
+			LLVMValueRef max_alpha = !is_int10 ? max_rgb : ctx->i32one;
+			LLVMValueRef min_alpha = !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
 
 			/* Clamp. */
 			for (unsigned chan = 0; chan < 4; chan++) {
-				val[chan] = to_integer(ctx, values[chan]);
-				val[chan] = emit_minmax_int(ctx, LLVMIntSLT, val[chan], max);
-				val[chan] = emit_minmax_int(ctx, LLVMIntSGT, val[chan], min);
+				val[chan] = to_integer(&ctx->ac, values[chan]);
+				val[chan] = emit_minmax_int(&ctx->ac, LLVMIntSLT, val[chan], chan == 3 ? max_alpha : max_rgb);
+				val[chan] = emit_minmax_int(&ctx->ac, LLVMIntSGT, val[chan], chan == 3 ? min_alpha : min_rgb);
 			}
 
 			args->compr = 1;
@@ -5085,11 +5399,12 @@
 		memcpy(&args->out[0], values, sizeof(values[0]) * 4);
 
 	for (unsigned i = 0; i < 4; ++i)
-		args->out[i] = to_float(ctx, args->out[i]);
+		args->out[i] = to_float(&ctx->ac, args->out[i]);
 }
 
 static void
 handle_vs_outputs_post(struct nir_to_llvm_context *ctx,
+		       bool export_prim_id,
 		       struct ac_vs_output_info *outinfo)
 {
 	uint32_t param_count = 0;
@@ -5099,8 +5414,9 @@
 	LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_index_value = NULL;
 	int i;
 
-	outinfo->prim_id_output = 0xffffffff;
-	outinfo->layer_output = 0xffffffff;
+	memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
+	       sizeof(outinfo->vs_output_param_offset));
+
 	if (ctx->output_mask & (1ull << VARYING_SLOT_CLIP_DIST0)) {
 		LLVMValueRef slots[8];
 		unsigned j;
@@ -5110,7 +5426,7 @@
 
 		i = VARYING_SLOT_CLIP_DIST0;
 		for (j = 0; j < ctx->num_output_clips + ctx->num_output_culls; j++)
-			slots[j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
+			slots[j] = to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
 							       ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
 
 		for (i = ctx->num_output_clips + ctx->num_output_culls; i < 8; i++)
@@ -5130,72 +5446,37 @@
 
 	}
 
-	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
-		LLVMValueRef values[4];
-		if (!(ctx->output_mask & (1ull << i)))
-			continue;
-
+	LLVMValueRef pos_values[4] = {ctx->f32zero, ctx->f32zero, ctx->f32zero, ctx->f32one};
+	if (ctx->output_mask & (1ull << VARYING_SLOT_POS)) {
 		for (unsigned j = 0; j < 4; j++)
-			values[j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
-					      ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
+			pos_values[j] = LLVMBuildLoad(ctx->builder,
+			                         ctx->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_POS, j)], "");
+	}
+	si_llvm_init_export_args(ctx, pos_values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
 
-		if (i == VARYING_SLOT_POS) {
-			target = V_008DFC_SQ_EXP_POS;
-		} else if (i == VARYING_SLOT_CLIP_DIST0) {
-			continue;
-		} else if (i == VARYING_SLOT_PSIZ) {
-			outinfo->writes_pointsize = true;
-			psize_value = values[0];
-			continue;
-		} else if (i == VARYING_SLOT_LAYER) {
-			outinfo->writes_layer = true;
-			layer_value = values[0];
-			outinfo->layer_output = param_count;
-			target = V_008DFC_SQ_EXP_PARAM + param_count;
-			param_count++;
-		} else if (i == VARYING_SLOT_VIEWPORT) {
-			outinfo->writes_viewport_index = true;
-			viewport_index_value = values[0];
-			continue;
-		} else if (i == VARYING_SLOT_PRIMITIVE_ID) {
-			outinfo->prim_id_output = param_count;
-			target = V_008DFC_SQ_EXP_PARAM + param_count;
-			param_count++;
-		} else if (i >= VARYING_SLOT_VAR0) {
-			outinfo->export_mask |= 1u << (i - VARYING_SLOT_VAR0);
-			target = V_008DFC_SQ_EXP_PARAM + param_count;
-			param_count++;
-		}
-
-		si_llvm_init_export_args(ctx, values, target, &args);
-
-		if (target >= V_008DFC_SQ_EXP_POS &&
-		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
-			memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
-			       &args, sizeof(args));
-		} else {
-			ac_build_export(&ctx->ac, &args);
-		}
+	if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) {
+		outinfo->writes_pointsize = true;
+		psize_value = LLVMBuildLoad(ctx->builder,
+		                            ctx->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_PSIZ, 0)], "");
 	}
 
-	/* We need to add the position output manually if it's missing. */
-	if (!pos_args[0].out[0]) {
-		pos_args[0].enabled_channels = 0xf;
-		pos_args[0].valid_mask = 0;
-		pos_args[0].done = 0;
-		pos_args[0].target = V_008DFC_SQ_EXP_POS;
-		pos_args[0].compr = 0;
-		pos_args[0].out[0] = ctx->f32zero; /* X */
-		pos_args[0].out[1] = ctx->f32zero; /* Y */
-		pos_args[0].out[2] = ctx->f32zero; /* Z */
-		pos_args[0].out[3] = ctx->f32one;  /* W */
+	if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) {
+		outinfo->writes_layer = true;
+		layer_value = LLVMBuildLoad(ctx->builder,
+		                            ctx->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)], "");
 	}
 
-	uint32_t mask = ((outinfo->writes_pointsize == true ? 1 : 0) |
-			 (outinfo->writes_layer == true ? 4 : 0) |
-			 (outinfo->writes_viewport_index == true ? 8 : 0));
-	if (mask) {
-		pos_args[1].enabled_channels = mask;
+	if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) {
+		outinfo->writes_viewport_index = true;
+		viewport_index_value = LLVMBuildLoad(ctx->builder,
+		                                     ctx->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_VIEWPORT, 0)], "");
+	}
+
+	if (outinfo->writes_pointsize ||
+	    outinfo->writes_layer ||
+	    outinfo->writes_viewport_index) {
+		pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
+						(outinfo->writes_layer == true ? 4 : 0));
 		pos_args[1].valid_mask = 0;
 		pos_args[1].done = 0;
 		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
@@ -5209,8 +5490,26 @@
 			pos_args[1].out[0] = psize_value;
 		if (outinfo->writes_layer == true)
 			pos_args[1].out[2] = layer_value;
-		if (outinfo->writes_viewport_index == true)
-			pos_args[1].out[3] = viewport_index_value;
+		if (outinfo->writes_viewport_index == true) {
+			if (ctx->options->chip_class >= GFX9) {
+				/* GFX9 has the layer in out.z[10:0] and the viewport
+				 * index in out.z[19:16].
+				 */
+				LLVMValueRef v = viewport_index_value;
+				v = to_integer(&ctx->ac, v);
+				v = LLVMBuildShl(ctx->builder, v,
+						 LLVMConstInt(ctx->i32, 16, false),
+						 "");
+				v = LLVMBuildOr(ctx->builder, v,
+						to_integer(&ctx->ac, pos_args[1].out[2]), "");
+
+				pos_args[1].out[2] = to_float(&ctx->ac, v);
+				pos_args[1].enabled_channels |= 1 << 2;
+			} else {
+				pos_args[1].out[3] = viewport_index_value;
+				pos_args[1].enabled_channels |= 1 << 3;
+			}
+		}
 	}
 	for (i = 0; i < 4; i++) {
 		if (pos_args[i].out[0])
@@ -5229,6 +5528,58 @@
 		ac_build_export(&ctx->ac, &pos_args[i]);
 	}
 
+	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
+		LLVMValueRef values[4];
+		if (!(ctx->output_mask & (1ull << i)))
+			continue;
+
+		for (unsigned j = 0; j < 4; j++)
+			values[j] = to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
+					      ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
+
+		if (i == VARYING_SLOT_LAYER) {
+			target = V_008DFC_SQ_EXP_PARAM + param_count;
+			outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = param_count;
+			param_count++;
+		} else if (i == VARYING_SLOT_PRIMITIVE_ID) {
+			target = V_008DFC_SQ_EXP_PARAM + param_count;
+			outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count;
+			param_count++;
+		} else if (i >= VARYING_SLOT_VAR0) {
+			outinfo->export_mask |= 1u << (i - VARYING_SLOT_VAR0);
+			target = V_008DFC_SQ_EXP_PARAM + param_count;
+			outinfo->vs_output_param_offset[i] = param_count;
+			param_count++;
+		} else
+			continue;
+
+		si_llvm_init_export_args(ctx, values, target, &args);
+
+		if (target >= V_008DFC_SQ_EXP_POS &&
+		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
+			memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
+			       &args, sizeof(args));
+		} else {
+			ac_build_export(&ctx->ac, &args);
+		}
+	}
+
+	if (export_prim_id) {
+		LLVMValueRef values[4];
+		target = V_008DFC_SQ_EXP_PARAM + param_count;
+		outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count;
+		param_count++;
+
+		values[0] = ctx->vs_prim_id;
+		ctx->shader_info->vs.vgpr_comp_cnt = MAX2(2,
+							  ctx->shader_info->vs.vgpr_comp_cnt);
+		for (unsigned j = 1; j < 4; j++)
+			values[j] = ctx->f32zero;
+		si_llvm_init_export_args(ctx, values, target, &args);
+		ac_build_export(&ctx->ac, &args);
+		outinfo->export_prim_id = true;
+	}
+
 	outinfo->pos_exports = num_pos_exports;
 	outinfo->param_exports = param_count;
 }
@@ -5536,24 +5887,22 @@
 	write_tess_factors(ctx);
 }
 
-static void
+static bool
 si_export_mrt_color(struct nir_to_llvm_context *ctx,
-		    LLVMValueRef *color, unsigned param, bool is_last)
+		    LLVMValueRef *color, unsigned param, bool is_last,
+		    struct ac_export_args *args) /* out: export arguments filled in here */
 {
-
-	struct ac_export_args args;
-
 	/* Export */
 	si_llvm_init_export_args(ctx, color, param,
-				 &args);
+				 args);
 
 	if (is_last) {
-		args.valid_mask = 1; /* whether the EXEC mask is valid */
-		args.done = 1; /* DONE bit */
-	} else if (!args.enabled_channels)
-		return; /* unnecessary NULL export */
+		args->valid_mask = 1; /* whether the EXEC mask is valid */
+		args->done = 1; /* DONE bit */
+	} else if (!args->enabled_channels)
+		return false; /* unnecessary NULL export */
 
-	ac_build_export(&ctx->ac, &args);
+	return true; /* args is ready; the caller issues ac_build_export() itself */
 }
 
 static void
@@ -5589,10 +5938,11 @@
 		args.enabled_channels |= 0x4;
 	}
 
-	/* SI (except OLAND) has a bug that it only looks
+	/* SI (except OLAND and HAINAN) has a bug that it only looks
 	 * at the X writemask component. */
 	if (ctx->options->chip_class == SI &&
-	    ctx->options->family != CHIP_OLAND)
+	    ctx->options->family != CHIP_OLAND &&
+	    ctx->options->family != CHIP_HAINAN)
 		args.enabled_channels |= 0x1;
 
 	ac_build_export(&ctx->ac, &args);
@@ -5603,6 +5953,7 @@
 {
 	unsigned index = 0;
 	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+	struct ac_export_args color_args[8];
 
 	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
 		LLVMValueRef values[4];
@@ -5612,34 +5963,39 @@
 
 		if (i == FRAG_RESULT_DEPTH) {
 			ctx->shader_info->fs.writes_z = true;
-			depth = to_float(ctx, LLVMBuildLoad(ctx->builder,
+			depth = to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
 							    ctx->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
 		} else if (i == FRAG_RESULT_STENCIL) {
 			ctx->shader_info->fs.writes_stencil = true;
-			stencil = to_float(ctx, LLVMBuildLoad(ctx->builder,
+			stencil = to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
 							      ctx->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
 		} else if (i == FRAG_RESULT_SAMPLE_MASK) {
 			ctx->shader_info->fs.writes_sample_mask = true;
-			samplemask = to_float(ctx, LLVMBuildLoad(ctx->builder,
+			samplemask = to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
 								  ctx->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
 		} else {
 			bool last = false;
 			for (unsigned j = 0; j < 4; j++)
-				values[j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
+				values[j] = to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
 									ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
 
 			if (!ctx->shader_info->fs.writes_z && !ctx->shader_info->fs.writes_stencil && !ctx->shader_info->fs.writes_sample_mask)
 				last = ctx->output_mask <= ((1ull << (i + 1)) - 1);
 
-			si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + index, last);
-			index++;
+			bool ret = si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + (i - FRAG_RESULT_DATA0), last, &color_args[index]);
+			if (ret)
+				index++;
 		}
 	}
 
+	for (unsigned i = 0; i < index; i++)
+		ac_build_export(&ctx->ac, &color_args[i]);
 	if (depth || stencil || samplemask)
 		si_export_mrt_z(ctx, depth, stencil, samplemask);
-	else if (!index)
-		si_export_mrt_color(ctx, NULL, V_008DFC_SQ_EXP_NULL, true);
+	else if (!index) {
+		si_export_mrt_color(ctx, NULL, V_008DFC_SQ_EXP_NULL, true, &color_args[0]);
+		ac_build_export(&ctx->ac, &color_args[0]);
+	}
 
 	ctx->shader_info->fs.output_mask = index ? ((1ull << index) - 1) : 0;
 }
@@ -5660,7 +6016,8 @@
 		else if (ctx->options->key.vs.as_es)
 			handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
 		else
-			handle_vs_outputs_post(ctx, &ctx->shader_info->vs.outinfo);
+			handle_vs_outputs_post(ctx, ctx->options->key.vs.export_prim_id,
+					       &ctx->shader_info->vs.outinfo);
 		break;
 	case MESA_SHADER_FRAGMENT:
 		handle_fs_outputs_post(ctx);
@@ -5675,22 +6032,14 @@
 		if (ctx->options->key.tes.as_es)
 			handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info);
 		else
-			handle_vs_outputs_post(ctx, &ctx->shader_info->tes.outinfo);
+			handle_vs_outputs_post(ctx, ctx->options->key.tes.export_prim_id,
+					       &ctx->shader_info->tes.outinfo);
 		break;
 	default:
 		break;
 	}
 }
 
-static void
-handle_shared_compute_var(struct nir_to_llvm_context *ctx,
-			  struct nir_variable *variable, uint32_t *offset, int idx)
-{
-	unsigned size = glsl_count_attribute_slots(variable->type, false);
-	variable->data.driver_location = *offset;
-	*offset += size;
-}
-
 static void ac_llvm_finalize_module(struct nir_to_llvm_context * ctx)
 {
 	LLVMPassManagerRef passmgr;
@@ -5718,6 +6067,39 @@
 }
 
 static void
+ac_nir_eliminate_const_vs_outputs(struct nir_to_llvm_context *ctx)
+{
+	struct ac_vs_output_info *outinfo;
+	/* Hand the param exports of a non-LS/ES VS or a non-ES TES to ac_optimize_vs_outputs(); other stages return early. */
+	switch (ctx->stage) {
+	case MESA_SHADER_FRAGMENT:
+	case MESA_SHADER_COMPUTE:
+	case MESA_SHADER_TESS_CTRL:
+	case MESA_SHADER_GEOMETRY:
+		return;
+	case MESA_SHADER_VERTEX:
+		if (ctx->options->key.vs.as_ls ||
+		    ctx->options->key.vs.as_es)
+			return;
+		outinfo = &ctx->shader_info->vs.outinfo;
+		break;
+	case MESA_SHADER_TESS_EVAL:
+		if (ctx->options->key.tes.as_es) /* TES variants are described by key.tes, not key.vs */
+			return;
+		outinfo = &ctx->shader_info->tes.outinfo;
+		break;
+	default:
+		unreachable("Unhandled shader type");
+	}
+
+	ac_optimize_vs_outputs(&ctx->ac,
+			       ctx->main_function,
+			       outinfo->vs_output_param_offset,
+			       VARYING_SLOT_MAX,
+			       &outinfo->param_exports);
+}
+
+static void
 ac_setup_rings(struct nir_to_llvm_context *ctx)
 {
 	if ((ctx->stage == MESA_SHADER_VERTEX && ctx->options->key.vs.as_es) ||
@@ -5739,8 +6121,6 @@
 		tmp = LLVMBuildExtractElement(ctx->builder, ctx->gsvs_ring, ctx->i32one, "");
 		tmp = LLVMBuildOr(ctx->builder, tmp, ctx->gsvs_ring_stride, "");
 		ctx->gsvs_ring = LLVMBuildInsertElement(ctx->builder, ctx->gsvs_ring, tmp, ctx->i32one, "");
-
-		ctx->gsvs_ring = LLVMBuildBitCast(ctx->builder, ctx->gsvs_ring, ctx->v16i8, "");
 	}
 
 	if (ctx->stage == MESA_SHADER_TESS_CTRL ||
@@ -5750,6 +6130,27 @@
 	}
 }
 
+static unsigned
+ac_nir_get_max_workgroup_size(enum chip_class chip_class,
+			      const struct nir_shader *nir)
+{
+	/* TCS and GS return fixed sizes, compute its declared local size, other stages 0. */
+	switch (nir->stage) {
+	case MESA_SHADER_TESS_CTRL:
+		if (chip_class >= CIK)
+			return 128;
+		return 64;
+	case MESA_SHADER_GEOMETRY:
+		return 64;
+	case MESA_SHADER_COMPUTE:
+		return nir->info.cs.local_size[0] *
+		       nir->info.cs.local_size[1] *
+		       nir->info.cs.local_size[2];
+	default:
+		return 0;
+	}
+}
+
 static
 LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
                                        struct nir_shader *nir,
@@ -5764,13 +6165,15 @@
 	ctx.context = LLVMContextCreate();
 	ctx.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);
 
-	ac_llvm_context_init(&ctx.ac, ctx.context);
+	ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class);
 	ctx.ac.module = ctx.module;
 
 	ctx.has_ds_bpermute = ctx.options->chip_class >= VI;
 
 	memset(shader_info, 0, sizeof(*shader_info));
 
+	ac_nir_shader_info_pass(nir, options, &shader_info->info);
+		
 	LLVMSetTarget(ctx.module, options->supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--");
 
 	LLVMTargetDataRef data_layout = LLVMCreateTargetDataLayout(tm);
@@ -5784,6 +6187,7 @@
 	ctx.builder = LLVMCreateBuilderInContext(ctx.context);
 	ctx.ac.builder = ctx.builder;
 	ctx.stage = nir->stage;
+	ctx.max_workgroup_size = ac_nir_get_max_workgroup_size(ctx.options->chip_class, nir);
 
 	for (i = 0; i < AC_UD_MAX_SETS; i++)
 		shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
@@ -5792,34 +6196,12 @@
 
 	create_function(&ctx);
 
-	if (nir->stage == MESA_SHADER_COMPUTE) {
-		int num_shared = 0;
-		nir_foreach_variable(variable, &nir->shared)
-			num_shared++;
-		if (num_shared) {
-			int idx = 0;
-			uint32_t shared_size = 0;
-			LLVMValueRef var;
-			LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE);
-			nir_foreach_variable(variable, &nir->shared) {
-				handle_shared_compute_var(&ctx, variable, &shared_size, idx);
-				idx++;
-			}
-
-			shared_size *= 16;
-			var = LLVMAddGlobalInAddressSpace(ctx.module,
-							  LLVMArrayType(ctx.i8, shared_size),
-							  "compute_lds",
-							  LOCAL_ADDR_SPACE);
-			LLVMSetAlignment(var, 4);
-			ctx.shared_memory = LLVMBuildBitCast(ctx.builder, var, i8p, "");
-		}
-	} else if (nir->stage == MESA_SHADER_GEOMETRY) {
+	if (nir->stage == MESA_SHADER_GEOMETRY) {
 		ctx.gs_next_vertex = ac_build_alloca(&ctx, ctx.i32, "gs_next_vertex");
 
-		ctx.gs_max_out_vertices = nir->info->gs.vertices_out;
+		ctx.gs_max_out_vertices = nir->info.gs.vertices_out;
 	} else if (nir->stage == MESA_SHADER_TESS_EVAL) {
-		ctx.tes_primitive_mode = nir->info->tess.primitive_mode;
+		ctx.tes_primitive_mode = nir->info.tess.primitive_mode;
 	}
 
 	ac_setup_rings(&ctx);
@@ -5830,8 +6212,8 @@
 	if (nir->stage == MESA_SHADER_FRAGMENT)
 		handle_fs_inputs_pre(&ctx, nir);
 
-	ctx.num_output_clips = nir->info->clip_distance_array_size;
-	ctx.num_output_culls = nir->info->cull_distance_array_size;
+	ctx.num_output_clips = nir->info.clip_distance_array_size;
+	ctx.num_output_culls = nir->info.cull_distance_array_size;
 
 	nir_foreach_variable(variable, &nir->outputs)
 		handle_shader_output_decl(&ctx, variable);
@@ -5840,11 +6222,16 @@
 	                                   _mesa_key_pointer_equal);
 	ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
 	                                   _mesa_key_pointer_equal);
+	ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+	                                     _mesa_key_pointer_equal);
 
 	func = (struct nir_function *)exec_list_get_head(&nir->functions);
 
 	setup_locals(&ctx, func);
 
+	if (nir->stage == MESA_SHADER_COMPUTE)
+		setup_shared(&ctx, nir);
+
 	visit_cf_list(&ctx, &func->impl->body);
 	phi_post_pass(&ctx);
 
@@ -5852,15 +6239,18 @@
 	LLVMBuildRetVoid(ctx.builder);
 
 	ac_llvm_finalize_module(&ctx);
+
+	ac_nir_eliminate_const_vs_outputs(&ctx);
 	free(ctx.locals);
 	ralloc_free(ctx.defs);
 	ralloc_free(ctx.phis);
+	ralloc_free(ctx.vars);
 
 	if (nir->stage == MESA_SHADER_GEOMETRY) {
 		unsigned addclip = ctx.num_output_clips + ctx.num_output_culls > 4;
 		shader_info->gs.gsvs_vertex_size = (util_bitcount64(ctx.output_mask) + addclip) * 16;
 		shader_info->gs.max_gsvs_emit_size = shader_info->gs.gsvs_vertex_size *
-			nir->info->gs.vertices_out;
+			nir->info.gs.vertices_out;
 	} else if (nir->stage == MESA_SHADER_TESS_CTRL) {
 		shader_info->tcs.outputs_written = ctx.tess_outputs_written;
 		shader_info->tcs.patch_outputs_written = ctx.tess_patch_outputs_written;
@@ -6013,26 +6403,26 @@
 	switch (nir->stage) {
 	case MESA_SHADER_COMPUTE:
 		for (int i = 0; i < 3; ++i)
-			shader_info->cs.block_size[i] = nir->info->cs.local_size[i];
+			shader_info->cs.block_size[i] = nir->info.cs.local_size[i];
 		break;
 	case MESA_SHADER_FRAGMENT:
-		shader_info->fs.early_fragment_test = nir->info->fs.early_fragment_tests;
+		shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests;
 		break;
 	case MESA_SHADER_GEOMETRY:
-		shader_info->gs.vertices_in = nir->info->gs.vertices_in;
-		shader_info->gs.vertices_out = nir->info->gs.vertices_out;
-		shader_info->gs.output_prim = nir->info->gs.output_primitive;
-		shader_info->gs.invocations = nir->info->gs.invocations;
+		shader_info->gs.vertices_in = nir->info.gs.vertices_in;
+		shader_info->gs.vertices_out = nir->info.gs.vertices_out;
+		shader_info->gs.output_prim = nir->info.gs.output_primitive;
+		shader_info->gs.invocations = nir->info.gs.invocations;
 		break;
 	case MESA_SHADER_TESS_EVAL:
-		shader_info->tes.primitive_mode = nir->info->tess.primitive_mode;
-		shader_info->tes.spacing = nir->info->tess.spacing;
-		shader_info->tes.ccw = nir->info->tess.ccw;
-		shader_info->tes.point_mode = nir->info->tess.point_mode;
+		shader_info->tes.primitive_mode = nir->info.tess.primitive_mode;
+		shader_info->tes.spacing = nir->info.tess.spacing;
+		shader_info->tes.ccw = nir->info.tess.ccw;
+		shader_info->tes.point_mode = nir->info.tess.point_mode;
 		shader_info->tes.as_es = options->key.tes.as_es;
 		break;
 	case MESA_SHADER_TESS_CTRL:
-		shader_info->tcs.tcs_vertices_out = nir->info->tess.tcs_vertices_out;
+		shader_info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
 		break;
 	case MESA_SHADER_VERTEX:
 		shader_info->vs.as_es = options->key.vs.as_es;
@@ -6088,11 +6478,11 @@
 						   AC_FUNC_ATTR_LEGACY);
 
 			LLVMBuildStore(ctx->builder,
-				       to_float(ctx, value), ctx->outputs[radeon_llvm_reg_index_soa(i, j)]);
+				       to_float(&ctx->ac, value), ctx->outputs[radeon_llvm_reg_index_soa(i, j)]);
 		}
 		idx += slot_inc;
 	}
-	handle_vs_outputs_post(ctx, &ctx->shader_info->vs.outinfo);
+	handle_vs_outputs_post(ctx, false, &ctx->shader_info->vs.outinfo);
 }
 
 void ac_create_gs_copy_shader(LLVMTargetMachineRef tm,
@@ -6109,7 +6499,7 @@
 	ctx.options = options;
 	ctx.shader_info = shader_info;
 
-	ac_llvm_context_init(&ctx.ac, ctx.context);
+	ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class);
 	ctx.ac.module = ctx.module;
 
 	ctx.is_gs_copy_shader = true;
@@ -6122,11 +6512,11 @@
 
 	create_function(&ctx);
 
-	ctx.gs_max_out_vertices = geom_shader->info->gs.vertices_out;
+	ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
 	ac_setup_rings(&ctx);
 
-	ctx.num_output_clips = geom_shader->info->clip_distance_array_size;
-	ctx.num_output_culls = geom_shader->info->cull_distance_array_size;
+	ctx.num_output_clips = geom_shader->info.clip_distance_array_size;
+	ctx.num_output_culls = geom_shader->info.cull_distance_array_size;
 
 	nir_foreach_variable(variable, &geom_shader->outputs)
 		handle_shader_output_decl(&ctx, variable);
diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
index 3d0b456..af93a1c 100644
--- a/src/amd/common/ac_nir_to_llvm.h
+++ b/src/amd/common/ac_nir_to_llvm.h
@@ -29,7 +29,7 @@
 #include "llvm-c/TargetMachine.h"
 #include "amd_family.h"
 #include "../vulkan/radv_descriptor_set.h"
-
+#include "ac_shader_info.h"
 #include "shader_enums.h"
 struct ac_shader_binary;
 struct ac_shader_config;
@@ -41,10 +41,12 @@
 	uint32_t instance_rate_inputs;
 	uint32_t as_es:1;
 	uint32_t as_ls:1;
+	uint32_t export_prim_id:1;
 };
 
 struct ac_tes_variant_key {
 	uint32_t as_es:1;
+	uint32_t export_prim_id:1; /* request an extra primitive-ID param export from handle_vs_outputs_post() */
 };
 
 struct ac_tcs_variant_key {
@@ -55,6 +57,7 @@
 struct ac_fs_variant_key {
 	uint32_t col_format;
 	uint32_t is_int8;
+	uint32_t is_int10;
 };
 
 union ac_shader_variant_key {
@@ -83,7 +86,8 @@
 enum ac_ud_index {
 	AC_UD_SCRATCH_RING_OFFSETS = 0,
 	AC_UD_PUSH_CONSTANTS = 1,
-	AC_UD_SHADER_START = 2,
+	AC_UD_INDIRECT_DESCRIPTOR_SETS = 2,
+	AC_UD_SHADER_START = 3,
 	AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
 	AC_UD_VS_BASE_VERTEX_START_INSTANCE,
 	AC_UD_VS_LS_TCS_IN_LAYOUT,
@@ -120,15 +124,15 @@
 };
 
 struct ac_vs_output_info {
+	uint8_t	vs_output_param_offset[VARYING_SLOT_MAX]; /* param export index per varying slot; AC_EXP_PARAM_UNDEFINED when not exported */
 	uint8_t clip_dist_mask;
 	uint8_t cull_dist_mask;
+	uint8_t param_exports; /* number of param exports; may be rewritten by ac_optimize_vs_outputs() */
 	bool writes_pointsize;
 	bool writes_layer;
 	bool writes_viewport_index;
-	uint32_t prim_id_output;
-	uint32_t layer_output;
+	bool export_prim_id; /* set when the shader key asked for a primitive-ID param export */
 	uint32_t export_mask;
-	unsigned param_exports;
 	unsigned pos_exports;
 };
 
@@ -138,10 +142,11 @@
 
 struct ac_shader_variant_info {
 	struct ac_userdata_locations user_sgprs_locs;
+	struct ac_shader_info info;
 	unsigned num_user_sgprs;
 	unsigned num_input_sgprs;
 	unsigned num_input_vgprs;
-
+	bool need_indirect_descriptor_sets;
 	union {
 		struct {
 			struct ac_vs_output_info outinfo;
@@ -166,7 +171,6 @@
 			bool force_persample;
 			bool prim_id_input;
 			bool layer_input;
-			bool uses_sample_positions;
 		} fs;
 		struct {
 			unsigned block_size[3];
@@ -178,6 +182,7 @@
 			unsigned invocations;
 			unsigned gsvs_vertex_size;
 			unsigned max_gsvs_emit_size;
+			bool uses_prim_id;
 		} gs;
 		struct {
 			bool uses_prim_id;
diff --git a/src/amd/common/ac_shader_info.c b/src/amd/common/ac_shader_info.c
new file mode 100644
index 0000000..13d73df
--- /dev/null
+++ b/src/amd/common/ac_shader_info.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2017 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "nir/nir.h"
+#include "ac_shader_info.h"
+#include "ac_nir_to_llvm.h"
+
+static void mark_sampler_desc(nir_variable *var, struct ac_shader_info *info)
+{	/* Record in 'info' that the descriptor set holding 'var' is referenced by the shader. */
+	info->desc_set_used_mask |= (1 << var->data.descriptor_set); /* OR the bit in so sets marked earlier are kept */
+}
+
+static void
+gather_intrinsic_info(nir_intrinsic_instr *instr, struct ac_shader_info *info)
+{	/* Update 'info' from a single NIR intrinsic; intrinsics not listed below are ignored. */
+	switch (instr->intrinsic) {
+	case nir_intrinsic_interp_var_at_sample:
+		info->ps.needs_sample_positions = true;
+		break;
+	case nir_intrinsic_load_draw_id:
+		info->vs.needs_draw_id = true;
+		break;
+	case nir_intrinsic_load_num_work_groups:
+		info->cs.grid_components_used = instr->num_components; /* component count of this load; a later load overwrites it */
+		break;
+	case nir_intrinsic_vulkan_resource_index:
+		info->desc_set_used_mask |= (1 << nir_intrinsic_desc_set(instr)); /* one bit per descriptor set index */
+		break;
+	case nir_intrinsic_image_load: /* all image intrinsics below share one handler */
+	case nir_intrinsic_image_store:
+	case nir_intrinsic_image_atomic_add:
+	case nir_intrinsic_image_atomic_min:
+	case nir_intrinsic_image_atomic_max:
+	case nir_intrinsic_image_atomic_and:
+	case nir_intrinsic_image_atomic_or:
+	case nir_intrinsic_image_atomic_xor:
+	case nir_intrinsic_image_atomic_exchange:
+	case nir_intrinsic_image_atomic_comp_swap:
+	case nir_intrinsic_image_size:
+		mark_sampler_desc(instr->variables[0]->var, info); /* the image variable is the first deref of the intrinsic */
+		break;
+	default:
+		break;
+	}
+}
+
+static void
+gather_tex_info(nir_tex_instr *instr, struct ac_shader_info *info)
+{	/* Pass the sampler and texture variables of a tex instruction, when present, to mark_sampler_desc(). */
+	if (instr->sampler)
+		mark_sampler_desc(instr->sampler->var, info);
+	if (instr->texture)
+		mark_sampler_desc(instr->texture->var, info);
+}
+
+static void
+gather_info_block(nir_block *block, struct ac_shader_info *info)
+{	/* Walk every instruction of 'block' and dispatch intrinsic and tex instructions to their gatherers. */
+	nir_foreach_instr(instr, block) {
+		switch (instr->type) {
+		case nir_instr_type_intrinsic:
+			gather_intrinsic_info(nir_instr_as_intrinsic(instr), info);
+			break;
+		case nir_instr_type_tex:
+			gather_tex_info(nir_instr_as_tex(instr), info);
+			break;
+		default: /* nothing is gathered from other instruction types */
+			break;
+		}
+	}
+}
+
+static void
+gather_info_input_decl(nir_shader *nir,
+		       const struct ac_nir_compiler_options *options,
+		       nir_variable *var,
+		       struct ac_shader_info *info)
+{	/* Called once per shader input variable; 'options' and 'var' are not read by this function. */
+	switch (nir->stage) {
+	case MESA_SHADER_VERTEX:
+		info->vs.has_vertex_buffers = true; /* any input declaration on a vertex shader sets this */
+		break;
+	default:
+		break;
+	}
+}
+
+void
+ac_nir_shader_info_pass(struct nir_shader *nir,
+			const struct ac_nir_compiler_options *options,
+			struct ac_shader_info *info)
+{	/* Fill 'info' from options->layout, the shader's input variables and every block of its first function. */
+	struct nir_function *func = (struct nir_function *)exec_list_get_head(&nir->functions); /* only the head of the function list is scanned */
+
+	info->needs_push_constants = true;
+	if (!options->layout) /* no layout supplied */
+		info->needs_push_constants = false;
+	else if (!options->layout->push_constant_size &&
+		 !options->layout->dynamic_offset_count) /* layout has neither push constants nor dynamic offsets */
+		info->needs_push_constants = false;
+
+	nir_foreach_variable(variable, &nir->inputs)
+		gather_info_input_decl(nir, options, variable, info);
+
+	nir_foreach_block(block, func->impl) { /* NOTE(review): func is dereferenced unchecked; assumes the shader has a function with an impl */
+		gather_info_block(block, info);
+	}
+}
diff --git a/src/amd/common/ac_shader_info.h b/src/amd/common/ac_shader_info.h
new file mode 100644
index 0000000..5f03e79
--- /dev/null
+++ b/src/amd/common/ac_shader_info.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright © 2017 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef AC_SHADER_INFO_H
+#define AC_SHADER_INFO_H
+
+struct nir_shader;
+struct ac_nir_compiler_options;
+
+struct ac_shader_info { /* per-shader facts collected by ac_nir_shader_info_pass() */
+	bool needs_push_constants; /* false when the layout is missing or has no push constants and no dynamic offsets */
+	uint32_t desc_set_used_mask; /* bit N corresponds to descriptor set N */
+	struct {
+		bool has_vertex_buffers; /* needs vertex buffers and base/start */
+		bool needs_draw_id; /* set when a load_draw_id intrinsic is seen */
+	} vs;
+	struct {
+		bool needs_sample_positions; /* set when an interp_var_at_sample intrinsic is seen */
+	} ps;
+	struct {
+		uint8_t grid_components_used; /* num_components of the load_num_work_groups intrinsic */
+	} cs;
+};
+
+/* A NIR pass to gather all the info needed to optimise the allocation patterns
+ * for the RADV user sgprs
+ */
+void
+ac_nir_shader_info_pass(struct nir_shader *nir,
+			const struct ac_nir_compiler_options *options,
+			struct ac_shader_info *info);
+
+#endif
diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
new file mode 100644
index 0000000..cf07760
--- /dev/null
+++ b/src/amd/common/ac_surface.c
@@ -0,0 +1,1211 @@
+/*
+ * Copyright © 2011 Red Hat All Rights Reserved.
+ * Copyright © 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+#include "ac_surface.h"
+#include "amd_family.h"
+#include "amdgpu_id.h"
+#include "ac_gpu_info.h"
+#include "util/macros.h"
+#include "util/u_math.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <amdgpu.h>
+#include <amdgpu_drm.h>
+
+#include "addrlib/addrinterface.h"
+
+#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND
+#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A
+#endif
+
+#ifndef CIASICIDGFXENGINE_ARCTICISLAND
+#define CIASICIDGFXENGINE_ARCTICISLAND 0x0000000D
+#endif
+
+static void addrlib_family_rev_id(enum radeon_family family,
+				  unsigned *addrlib_family,
+				  unsigned *addrlib_revid)
+{	/* Map a radeon chip family to the addrlib family and revision ids, written through the two out-pointers. */
+	switch (family) {
+	case CHIP_TAHITI:
+		*addrlib_family = FAMILY_SI;
+		*addrlib_revid = SI_TAHITI_P_A0;
+		break;
+	case CHIP_PITCAIRN:
+		*addrlib_family = FAMILY_SI;
+		*addrlib_revid = SI_PITCAIRN_PM_A0;
+		break;
+	case CHIP_VERDE:
+		*addrlib_family = FAMILY_SI;
+		*addrlib_revid = SI_CAPEVERDE_M_A0;
+		break;
+	case CHIP_OLAND:
+		*addrlib_family = FAMILY_SI;
+		*addrlib_revid = SI_OLAND_M_A0;
+		break;
+	case CHIP_HAINAN:
+		*addrlib_family = FAMILY_SI;
+		*addrlib_revid = SI_HAINAN_V_A0;
+		break;
+	case CHIP_BONAIRE:
+		*addrlib_family = FAMILY_CI;
+		*addrlib_revid = CI_BONAIRE_M_A0;
+		break;
+	case CHIP_KAVERI:
+		*addrlib_family = FAMILY_KV;
+		*addrlib_revid = KV_SPECTRE_A0;
+		break;
+	case CHIP_KABINI:
+		*addrlib_family = FAMILY_KV;
+		*addrlib_revid = KB_KALINDI_A0;
+		break;
+	case CHIP_HAWAII:
+		*addrlib_family = FAMILY_CI;
+		*addrlib_revid = CI_HAWAII_P_A0;
+		break;
+	case CHIP_MULLINS:
+		*addrlib_family = FAMILY_KV;
+		*addrlib_revid = ML_GODAVARI_A0;
+		break;
+	case CHIP_TONGA:
+		*addrlib_family = FAMILY_VI;
+		*addrlib_revid = VI_TONGA_P_A0;
+		break;
+	case CHIP_ICELAND:
+		*addrlib_family = FAMILY_VI;
+		*addrlib_revid = VI_ICELAND_M_A0;
+		break;
+	case CHIP_CARRIZO:
+		*addrlib_family = FAMILY_CZ;
+		*addrlib_revid = CARRIZO_A0;
+		break;
+	case CHIP_STONEY:
+		*addrlib_family = FAMILY_CZ;
+		*addrlib_revid = STONEY_A0;
+		break;
+	case CHIP_FIJI:
+		*addrlib_family = FAMILY_VI;
+		*addrlib_revid = VI_FIJI_P_A0;
+		break;
+	case CHIP_POLARIS10:
+		*addrlib_family = FAMILY_VI;
+		*addrlib_revid = VI_POLARIS10_P_A0;
+		break;
+	case CHIP_POLARIS11:
+		*addrlib_family = FAMILY_VI;
+		*addrlib_revid = VI_POLARIS11_M_A0;
+		break;
+	case CHIP_POLARIS12:
+		*addrlib_family = FAMILY_VI;
+		*addrlib_revid = VI_POLARIS12_V_A0;
+		break;
+	case CHIP_VEGA10:
+		*addrlib_family = FAMILY_AI;
+		*addrlib_revid = AI_VEGA10_P_A0;
+		break;
+	case CHIP_RAVEN:
+		*addrlib_family = FAMILY_RV;
+		*addrlib_revid = RAVEN_A0;
+		break;
+	default:
+		fprintf(stderr, "amdgpu: Unknown family.\n"); /* both outputs are left unmodified for unlisted chips */
+	}
+}
+
+static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput)
+{	/* Allocation callback handed to addrlib: a plain malloc() of the requested size (NULL on failure). */
+	return malloc(pInput->sizeInBytes);
+}
+
+static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput)
+{	/* Free callback handed to addrlib: releases the pointer with free() and always reports ADDR_OK. */
+	free(pInput->pVirtAddr);
+	return ADDR_OK;
+}
+
+ADDR_HANDLE amdgpu_addr_create(const struct radeon_info *info,
+			       const struct amdgpu_gpu_info *amdinfo,
+			       uint64_t *max_alignment)
+{	/* Create an addrlib handle for the GPU described by info/amdinfo; returns NULL on failure. */
+	ADDR_CREATE_INPUT addrCreateInput = {0};
+	ADDR_CREATE_OUTPUT addrCreateOutput = {0};
+	ADDR_REGISTER_VALUE regValue = {0};
+	ADDR_CREATE_FLAGS createFlags = {{0}};
+	ADDR_GET_MAX_ALIGNMENTS_OUTPUT addrGetMaxAlignmentsOutput = {0};
+	ADDR_E_RETURNCODE addrRet;
+
+	addrCreateInput.size = sizeof(ADDR_CREATE_INPUT);
+	addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT);
+
+	regValue.gbAddrConfig = amdinfo->gb_addr_cfg;
+	createFlags.value = 0;
+
+	addrlib_family_rev_id(info->family, &addrCreateInput.chipFamily, &addrCreateInput.chipRevision);
+	if (addrCreateInput.chipFamily == FAMILY_UNKNOWN) /* unknown chips leave the zeroed field untouched; presumably FAMILY_UNKNOWN is 0 - confirm */
+		return NULL;
+
+	if (addrCreateInput.chipFamily >= FAMILY_AI) { /* FAMILY_AI and above need none of the tile-mode tables set up below */
+		addrCreateInput.chipEngine = CIASICIDGFXENGINE_ARCTICISLAND;
+		regValue.blockVarSizeLog2 = 0;
+	} else {
+		regValue.noOfBanks = amdinfo->mc_arb_ramcfg & 0x3; /* bits 1:0 of mc_arb_ramcfg */
+		regValue.noOfRanks = (amdinfo->mc_arb_ramcfg & 0x4) >> 2; /* bit 2 of mc_arb_ramcfg */
+
+		regValue.backendDisables = amdinfo->enabled_rb_pipes_mask;
+		regValue.pTileConfig = amdinfo->gb_tile_mode;
+		regValue.noOfEntries = ARRAY_SIZE(amdinfo->gb_tile_mode);
+		if (addrCreateInput.chipFamily == FAMILY_SI) { /* no macro tile table is passed for FAMILY_SI */
+			regValue.pMacroTileConfig = NULL;
+			regValue.noOfMacroEntries = 0;
+		} else {
+			regValue.pMacroTileConfig = amdinfo->gb_macro_tile_mode;
+			regValue.noOfMacroEntries = ARRAY_SIZE(amdinfo->gb_macro_tile_mode);
+		}
+
+		createFlags.useTileIndex = 1;
+		createFlags.useHtileSliceAlign = 1;
+
+		addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
+	}
+
+	addrCreateInput.callbacks.allocSysMem = allocSysMem;
+	addrCreateInput.callbacks.freeSysMem = freeSysMem;
+	addrCreateInput.callbacks.debugPrint = 0; /* no debug print callback */
+	addrCreateInput.createFlags = createFlags;
+	addrCreateInput.regValue = regValue;
+
+	addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput);
+	if (addrRet != ADDR_OK)
+		return NULL;
+
+	if (max_alignment) { /* optional output */
+		addrRet = AddrGetMaxAlignments(addrCreateOutput.hLib, &addrGetMaxAlignmentsOutput);
+		if (addrRet == ADDR_OK){ /* on failure *max_alignment is left unmodified and the handle is still returned */
+			*max_alignment = addrGetMaxAlignmentsOutput.baseAlign;
+		}
+	}
+	return addrCreateOutput.hLib;
+}
+
+static int surf_config_sanity(const struct ac_surf_config *config)
+{	/* Validate 'config'; returns 0 when it is acceptable and -EINVAL otherwise. */
+	/* All dimensions must be at least 1. */
+	if (!config->info.width || !config->info.height || !config->info.depth ||
+	    !config->info.array_size || !config->info.levels)
+		return -EINVAL;
+
+	switch (config->info.samples) { /* only 0, 1, 2, 4 or 8 samples are accepted */
+	case 0:
+	case 1:
+	case 2:
+	case 4:
+	case 8:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (config->is_3d && config->info.array_size > 1) /* a 3D surface cannot also be an array */
+		return -EINVAL;
+	if (config->is_cube && config->info.depth > 1) /* a cube surface cannot have depth */
+		return -EINVAL;
+
+	return 0;
+}
+
+static int gfx6_compute_level(ADDR_HANDLE addrlib,
+			      const struct ac_surf_config *config,
+			      struct radeon_surf *surf, bool is_stencil,
+			      unsigned level, bool compressed,
+			      ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
+			      ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
+			      ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
+			      ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
+			      ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
+			      ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
+{	/* Compute one mip level (stencil when is_stencil) via addrlib and append it to 'surf'; returns 0 or the addrlib error. */
+	struct legacy_surf_level *surf_level;
+	ADDR_E_RETURNCODE ret;
+
+	AddrSurfInfoIn->mipLevel = level;
+	AddrSurfInfoIn->width = u_minify(config->info.width, level);
+	AddrSurfInfoIn->height = u_minify(config->info.height, level);
+
+	/* Make GFX6 linear surfaces compatible with GFX9 for hybrid graphics,
+	 * because GFX9 needs linear alignment of 256 bytes.
+	 */
+	if (config->info.levels == 1 &&
+	    AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED &&
+	    AddrSurfInfoIn->bpp) {
+		unsigned alignment = 256 / (AddrSurfInfoIn->bpp / 8); /* 256 bytes expressed in elements */
+
+		assert(util_is_power_of_two(AddrSurfInfoIn->bpp));
+		AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, alignment);
+	}
+
+	if (config->is_3d)
+		AddrSurfInfoIn->numSlices = u_minify(config->info.depth, level);
+	else if (config->is_cube)
+		AddrSurfInfoIn->numSlices = 6; /* six cube faces */
+	else
+		AddrSurfInfoIn->numSlices = config->info.array_size;
+
+	if (level > 0) {
+		/* Set the base level pitch. This is needed for calculation
+		 * of non-zero levels. */
+		if (is_stencil)
+			AddrSurfInfoIn->basePitch = surf->u.legacy.stencil_level[0].nblk_x;
+		else
+			AddrSurfInfoIn->basePitch = surf->u.legacy.level[0].nblk_x;
+
+		/* Convert blocks to pixels for compressed formats. */
+		if (compressed)
+			AddrSurfInfoIn->basePitch *= surf->blk_w;
+	}
+
+	ret = AddrComputeSurfaceInfo(addrlib,
+				     AddrSurfInfoIn,
+				     AddrSurfInfoOut);
+	if (ret != ADDR_OK) {
+		return ret; /* 'surf' has not been modified at this point */
+	}
+
+	surf_level = is_stencil ? &surf->u.legacy.stencil_level[level] : &surf->u.legacy.level[level];
+	surf_level->offset = align64(surf->surf_size, AddrSurfInfoOut->baseAlign); /* level starts at the aligned current end of the surface */
+	surf_level->slice_size = AddrSurfInfoOut->sliceSize;
+	surf_level->nblk_x = AddrSurfInfoOut->pitch;
+	surf_level->nblk_y = AddrSurfInfoOut->height;
+
+	switch (AddrSurfInfoOut->tileMode) {
+	case ADDR_TM_LINEAR_ALIGNED:
+		surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+		break;
+	case ADDR_TM_1D_TILED_THIN1:
+		surf_level->mode = RADEON_SURF_MODE_1D;
+		break;
+	case ADDR_TM_2D_TILED_THIN1:
+		surf_level->mode = RADEON_SURF_MODE_2D;
+		break;
+	default:
+		assert(0); /* no other tile mode is expected back from addrlib here */
+	}
+
+	if (is_stencil)
+		surf->u.legacy.stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex;
+	else
+		surf->u.legacy.tiling_index[level] = AddrSurfInfoOut->tileIndex;
+
+	surf->surf_size = surf_level->offset + AddrSurfInfoOut->surfSize; /* grow the surface to include this level */
+
+	/* Clear DCC fields at the beginning. */
+	surf_level->dcc_offset = 0;
+
+	/* The previous level's flag tells us if we can use DCC for this level. */
+	if (AddrSurfInfoIn->flags.dccCompatible &&
+	    (level == 0 || AddrDccOut->subLvlCompressible)) {
+		AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize;
+		AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
+		AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
+		AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex;
+		AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+		ret = AddrComputeDccInfo(addrlib,
+					 AddrDccIn,
+					 AddrDccOut);
+
+		if (ret == ADDR_OK) { /* a DCC failure is not reported: the level is just left without DCC */
+			surf_level->dcc_offset = surf->dcc_size;
+			surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
+			surf->num_dcc_levels = level + 1;
+			surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
+			surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
+		}
+	}
+
+	/* TC-compatible HTILE. */
+	if (!is_stencil && /* this guard also keeps the HTILE pointers untouched on stencil calls */
+	    AddrSurfInfoIn->flags.depth &&
+	    surf_level->mode == RADEON_SURF_MODE_2D &&
+	    level == 0) {
+		AddrHtileIn->flags.tcCompatible = AddrSurfInfoIn->flags.tcCompatible;
+		AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
+		AddrHtileIn->height = AddrSurfInfoOut->height;
+		AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
+		AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
+		AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
+		AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
+		AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
+		AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+		ret = AddrComputeHtileInfo(addrlib,
+					   AddrHtileIn,
+					   AddrHtileOut);
+
+		if (ret == ADDR_OK) { /* an HTILE failure is not reported either; the htile fields keep their values */
+			surf->htile_size = AddrHtileOut->htileBytes;
+			surf->htile_slice_size = AddrHtileOut->sliceSize;
+			surf->htile_alignment = AddrHtileOut->baseAlign;
+		}
+	}
+
+	return 0;
+}
+
+#define   G_009910_MICRO_TILE_MODE(x)          (((x) >> 0) & 0x03)
+#define   G_009910_MICRO_TILE_MODE_NEW(x)      (((x) >> 22) & 0x07)
+
+static void gfx6_set_micro_tile_mode(struct radeon_surf *surf,
+				     const struct radeon_info *info)
+{	/* Store in surf->micro_tile_mode the micro tile field of the tile mode value used by level 0. */
+	uint32_t tile_mode = info->si_tile_mode_array[surf->u.legacy.tiling_index[0]];
+
+	if (info->chip_class >= CIK) /* CIK and newer: 3-bit field starting at bit 22 */
+		surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode);
+	else /* older chips: 2-bit field starting at bit 0 */
+		surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode);
+}
+
+static unsigned cik_get_macro_tile_index(struct radeon_surf *surf)
+{	/* Returns how many halvings bring min(tile_split, 64 * bpe) down to 64 or less. */
+	unsigned index, tileb;
+
+	tileb = 8 * 8 * surf->bpe; /* presumably the byte size of an 8x8-element tile - confirm bpe is bytes per element */
+	tileb = MIN2(surf->u.legacy.tile_split, tileb);
+
+	for (index = 0; tileb > 64; index++) /* index ends up as ceil(log2(tileb / 64)) for tileb > 64, else 0 */
+		tileb >>= 1;
+
+	assert(index < 16);
+	return index;
+}
+
+/**
+ * Copy surface-global settings like pipe/bank config from level 0 surface
+ * computation.
+ */
+static void gfx6_surface_settings(const struct radeon_info* info,
+				  ADDR_COMPUTE_SURFACE_INFO_OUTPUT* csio,
+				  struct radeon_surf *surf)
+{	/* 'csio' is the addrlib output for level 0; 'info' is only used to pick the micro tile mode. */
+	surf->surf_alignment = csio->baseAlign;
+	surf->u.legacy.pipe_config = csio->pTileInfo->pipeConfig - 1; /* stored one lower than the addrlib value */
+	gfx6_set_micro_tile_mode(surf, info);
+
+	/* For 2D modes only. */
+	if (csio->tileMode >= ADDR_TM_2D_TILED_THIN1) {
+		surf->u.legacy.bankw = csio->pTileInfo->bankWidth;
+		surf->u.legacy.bankh = csio->pTileInfo->bankHeight;
+		surf->u.legacy.mtilea = csio->pTileInfo->macroAspectRatio;
+		surf->u.legacy.tile_split = csio->pTileInfo->tileSplitBytes;
+		surf->u.legacy.num_banks = csio->pTileInfo->banks;
+		surf->u.legacy.macro_tile_index = csio->macroModeIndex;
+	} else {
+		surf->u.legacy.macro_tile_index = 0; /* the other macro tile fields are left as they were */
+	}
+}
+
+/**
+ * Fill in the tiling information in \p surf based on the given surface config.
+ *
+ * The following fields of \p surf must be initialized by the caller:
+ * blk_w, blk_h, bpe, flags.
+ */
+static int gfx6_compute_surface(ADDR_HANDLE addrlib,
+				const struct radeon_info *info,
+				const struct ac_surf_config *config,
+				enum radeon_surf_mode mode,
+				struct radeon_surf *surf)
+{	/* Returns 0 on success, or the first non-zero result of gfx6_compute_level(). */
+	unsigned level;
+	bool compressed;
+	ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
+	ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
+	ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
+	ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
+	ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
+	ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
+	ADDR_TILEINFO AddrTileInfoIn = {0};
+	ADDR_TILEINFO AddrTileInfoOut = {0};
+	int r;
+
+	AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
+	AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
+	AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
+	AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
+	AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT);
+	AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT);
+	AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut; /* addrlib writes the resulting tile info here */
+
+	compressed = surf->blk_w == 4 && surf->blk_h == 4; /* 4x4 blocks are treated as block-compressed formats */
+
+	/* MSAA and FMASK require 2D tiling. */
+	if (config->info.samples > 1 ||
+	    (surf->flags & RADEON_SURF_FMASK))
+		mode = RADEON_SURF_MODE_2D;
+
+	/* DB doesn't support linear layouts. */
+	if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) &&
+	    mode < RADEON_SURF_MODE_1D)
+		mode = RADEON_SURF_MODE_1D;
+
+	/* Set the requested tiling mode. */
+	switch (mode) {
+	case RADEON_SURF_MODE_LINEAR_ALIGNED:
+		AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED;
+		break;
+	case RADEON_SURF_MODE_1D:
+		AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1;
+		break;
+	case RADEON_SURF_MODE_2D:
+		AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1;
+		break;
+	default:
+		assert(0);
+	}
+
+	/* The format must be set correctly for the allocation of compressed
+	 * textures to work. In other cases, setting the bpp is sufficient.
+	 */
+	if (compressed) {
+		switch (surf->bpe) {
+		case 8: /* 8 bytes per 4x4 block */
+			AddrSurfInfoIn.format = ADDR_FMT_BC1;
+			break;
+		case 16: /* 16 bytes per 4x4 block */
+			AddrSurfInfoIn.format = ADDR_FMT_BC3;
+			break;
+		default:
+			assert(0);
+		}
+	}
+	else {
+		AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8; /* bytes to bits */
+	}
+
+	AddrDccIn.numSamples = AddrSurfInfoIn.numSamples =
+		config->info.samples ? config->info.samples : 1; /* 0 samples is treated as 1 */
+	AddrSurfInfoIn.tileIndex = -1; /* presumably asks addrlib to choose the tile index - confirm */
+
+	/* Set the micro tile type. */
+	if (surf->flags & RADEON_SURF_SCANOUT)
+		AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE;
+	else if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_FMASK))
+		AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
+	else
+		AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
+
+	AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
+	AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
+	AddrSurfInfoIn.flags.cube = config->is_cube;
+	AddrSurfInfoIn.flags.fmask = (surf->flags & RADEON_SURF_FMASK) != 0;
+	AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
+	AddrSurfInfoIn.flags.pow2Pad = config->info.levels > 1;
+	AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
+
+	/* Only degrade the tile mode for space if TC-compatible HTILE hasn't been
+	 * requested, because TC-compatible HTILE requires 2D tiling.
+	 */
+	AddrSurfInfoIn.flags.opt4Space = !AddrSurfInfoIn.flags.tcCompatible &&
+					 !AddrSurfInfoIn.flags.fmask &&
+					 config->info.samples <= 1 &&
+					 (surf->flags & RADEON_SURF_OPTIMIZE_FOR_SPACE);
+
+	/* DCC notes:
+	 * - If we add MSAA support, keep in mind that CB can't decompress 8bpp
+	 *   with samples >= 4.
+	 * - Mipmapped array textures have low performance (discovered by a closed
+	 *   driver team).
+	 */
+	AddrSurfInfoIn.flags.dccCompatible =
+		info->chip_class >= VI &&
+		!(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
+		!(surf->flags & RADEON_SURF_DISABLE_DCC) &&
+		!compressed && AddrDccIn.numSamples <= 1 &&
+		((config->info.array_size == 1 && config->info.depth == 1) ||
+		 config->info.levels == 1);
+
+	AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0;
+	AddrSurfInfoIn.flags.compressZ = AddrSurfInfoIn.flags.depth;
+
+	/* On CI/VI, the DB uses the same pitch and tile mode (except tilesplit)
+	 * for Z and stencil. This can cause a number of problems which we work
+	 * around here:
+	 *
+	 * - a depth part that is incompatible with mipmapped texturing
+	 * - at least on Stoney, entirely incompatible Z/S aspects (e.g.
+	 *   incorrect tiling applied to the stencil part, stencil buffer
+	 *   memory accesses that go out of bounds) even without mipmapping
+	 *
+	 * Some piglit tests that are prone to different types of related
+	 * failures:
+	 *  ./bin/ext_framebuffer_multisample-upsample 2 stencil
+	 *  ./bin/framebuffer-blit-levels {draw,read} stencil
+	 *  ./bin/ext_framebuffer_multisample-unaligned-blit N {depth,stencil} {msaa,upsample,downsample}
+	 *  ./bin/fbo-depth-array fs-writes-{depth,stencil} / {depth,stencil}-{clear,layered-clear,draw}
+	 *  ./bin/depthstencil-render-miplevels 1024 d=s=z24_s8
+	 */
+	int stencil_tile_idx = -1; /* stays -1 unless the matchStencilTileCfg path below runs */
+
+	if (AddrSurfInfoIn.flags.depth && !AddrSurfInfoIn.flags.noStencil &&
+	    (config->info.levels > 1 || info->family == CHIP_STONEY)) {
+		/* Compute stencilTileIdx that is compatible with the (depth)
+		 * tileIdx. This degrades the depth surface if necessary to
+		 * ensure that a matching stencilTileIdx exists. */
+		AddrSurfInfoIn.flags.matchStencilTileCfg = 1;
+
+		/* Keep the depth mip-tail compatible with texturing. */
+		AddrSurfInfoIn.flags.noStencil = 1;
+	}
+
+	/* Set preferred macrotile parameters. This is usually required
+	 * for shared resources. This is for 2D tiling only. */
+	if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 &&
+	    surf->u.legacy.bankw && surf->u.legacy.bankh &&
+	    surf->u.legacy.mtilea && surf->u.legacy.tile_split) {
+		assert(!(surf->flags & RADEON_SURF_FMASK));
+
+		/* If any of these parameters are incorrect, the calculation
+		 * will fail. */
+		AddrTileInfoIn.banks = surf->u.legacy.num_banks;
+		AddrTileInfoIn.bankWidth = surf->u.legacy.bankw;
+		AddrTileInfoIn.bankHeight = surf->u.legacy.bankh;
+		AddrTileInfoIn.macroAspectRatio = surf->u.legacy.mtilea;
+		AddrTileInfoIn.tileSplitBytes = surf->u.legacy.tile_split;
+		AddrTileInfoIn.pipeConfig = surf->u.legacy.pipe_config + 1; /* +1 compared to GB_TILE_MODE */
+		AddrSurfInfoIn.flags.opt4Space = 0;
+		AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn;
+
+		/* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set
+		 * the tile index, because we are expected to know it if
+		 * we know the other parameters.
+		 *
+		 * This is something that can easily be fixed in Addrlib.
+		 * For now, just figure it out here.
+		 * Note that only 2D_TILE_THIN1 is handled here.
+		 */
+		assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+		assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1);
+
+		if (info->chip_class == SI) {
+			if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) {
+				if (surf->bpe == 2)
+					AddrSurfInfoIn.tileIndex = 11; /* 16bpp */
+				else
+					AddrSurfInfoIn.tileIndex = 12; /* 32bpp */
+			} else {
+				if (surf->bpe == 1)
+					AddrSurfInfoIn.tileIndex = 14; /* 8bpp */
+				else if (surf->bpe == 2)
+					AddrSurfInfoIn.tileIndex = 15; /* 16bpp */
+				else if (surf->bpe == 4)
+					AddrSurfInfoIn.tileIndex = 16; /* 32bpp */
+				else
+					AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */
+			}
+		} else {
+			/* CIK - VI */
+			if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
+				AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
+			else
+				AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
+
+			/* Addrlib doesn't set this if tileIndex is forced like above. */
+			AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf);
+		}
+	}
+
+	surf->num_dcc_levels = 0; /* reset every accumulated output before the per-level loops */
+	surf->surf_size = 0;
+	surf->dcc_size = 0;
+	surf->dcc_alignment = 1;
+	surf->htile_size = 0;
+	surf->htile_slice_size = 0;
+	surf->htile_alignment = 1;
+
+	const bool only_stencil = (surf->flags & RADEON_SURF_SBUFFER) &&
+				  !(surf->flags & RADEON_SURF_ZBUFFER);
+
+	/* Calculate texture layout information. */
+	if (!only_stencil) {
+		for (level = 0; level < config->info.levels; level++) {
+			r = gfx6_compute_level(addrlib, config, surf, false, level, compressed,
+					       &AddrSurfInfoIn, &AddrSurfInfoOut,
+					       &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut);
+			if (r)
+				return r;
+
+			if (level > 0) /* everything below is level-0-only bookkeeping */
+				continue;
+
+			/* Check that we actually got a TC-compatible HTILE if
+			 * we requested it (only for level 0, since we're not
+			 * supporting HTILE on higher mip levels anyway). */
+			assert(AddrSurfInfoOut.tcCompatible ||
+			       !AddrSurfInfoIn.flags.tcCompatible ||
+			       AddrSurfInfoIn.flags.matchStencilTileCfg);
+
+			if (AddrSurfInfoIn.flags.matchStencilTileCfg) {
+				if (!AddrSurfInfoOut.tcCompatible) { /* drop the TC-compatible request when level 0 did not get it */
+					AddrSurfInfoIn.flags.tcCompatible = 0;
+					surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE;
+				}
+
+				AddrSurfInfoIn.flags.matchStencilTileCfg = 0;
+				AddrSurfInfoIn.tileIndex = AddrSurfInfoOut.tileIndex; /* later levels reuse the level 0 tile index */
+				stencil_tile_idx = AddrSurfInfoOut.stencilTileIdx;
+
+				assert(stencil_tile_idx >= 0);
+			}
+
+			gfx6_surface_settings(info, &AddrSurfInfoOut, surf);
+		}
+	}
+
+	/* Calculate texture layout information for stencil. */
+	if (surf->flags & RADEON_SURF_SBUFFER) {
+		AddrSurfInfoIn.tileIndex = stencil_tile_idx;
+		AddrSurfInfoIn.bpp = 8; /* stencil is computed as 8 bits per element */
+		AddrSurfInfoIn.flags.depth = 0;
+		AddrSurfInfoIn.flags.stencil = 1;
+		AddrSurfInfoIn.flags.tcCompatible = 0;
+		/* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
+		AddrTileInfoIn.tileSplitBytes = surf->u.legacy.stencil_tile_split;
+
+		for (level = 0; level < config->info.levels; level++) {
+			r = gfx6_compute_level(addrlib, config, surf, true, level, compressed,
+					       &AddrSurfInfoIn, &AddrSurfInfoOut,
+					       &AddrDccIn, &AddrDccOut,
+					       NULL, NULL); /* no HTILE structs: gfx6_compute_level skips HTILE when is_stencil */
+			if (r)
+				return r;
+
+			/* DB uses the depth pitch for both stencil and depth. */
+			if (!only_stencil) {
+				if (surf->u.legacy.stencil_level[level].nblk_x !=
+				    surf->u.legacy.level[level].nblk_x)
+					surf->u.legacy.stencil_adjusted = true;
+			} else {
+				surf->u.legacy.level[level].nblk_x =
+					surf->u.legacy.stencil_level[level].nblk_x;
+			}
+
+			if (level == 0) {
+				if (only_stencil)
+					gfx6_surface_settings(info, &AddrSurfInfoOut, surf);
+
+				/* For 2D modes only. */
+				if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
+					surf->u.legacy.stencil_tile_split =
+						AddrSurfInfoOut.pTileInfo->tileSplitBytes;
+				}
+			}
+		}
+	}
+
+	/* Recalculate the whole DCC miptree size including disabled levels.
+	 * This is what addrlib does, but calling addrlib would be a lot more
+	 * complicated.
+	 */
+	if (surf->dcc_size && config->info.levels > 1) {
+		surf->dcc_size = align64(surf->surf_size >> 8, /* surface size divided by 256 */
+					 info->pipe_interleave_bytes *
+					 info->num_tile_pipes);
+	}
+
+	/* Make sure HTILE covers the whole miptree, because the shader reads
+	 * TC-compatible HTILE even for levels where it's disabled by DB.
+	 */
+	if (surf->htile_size && config->info.levels > 1)
+		surf->htile_size *= 2;
+
+	surf->is_linear = surf->u.legacy.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+	/* workout base swizzle */
+	if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)) { /* color surfaces only */
+		ADDR_COMPUTE_BASE_SWIZZLE_INPUT AddrBaseSwizzleIn = {0};
+		ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT AddrBaseSwizzleOut = {0};
+
+		AddrBaseSwizzleIn.surfIndex = config->info.surf_index;
+		AddrBaseSwizzleIn.tileIndex = AddrSurfInfoIn.tileIndex;
+		AddrBaseSwizzleIn.macroModeIndex = AddrSurfInfoOut.macroModeIndex;
+		AddrBaseSwizzleIn.pTileInfo = AddrSurfInfoOut.pTileInfo;
+		AddrBaseSwizzleIn.tileMode = AddrSurfInfoOut.tileMode;
+		AddrComputeBaseSwizzle(addrlib, &AddrBaseSwizzleIn, &AddrBaseSwizzleOut); /* NOTE(review): return code is not checked; the zeroed output is used on failure */
+		surf->u.legacy.tile_swizzle = AddrBaseSwizzleOut.tileSwizzle;
+	}
+	return 0;
+}
+
+/* This is only called when expecting a tiled layout. */
+static int
+gfx9_get_preferred_swizzle_mode(ADDR_HANDLE addrlib,
+				ADDR2_COMPUTE_SURFACE_INFO_INPUT *in,
+				bool is_fmask, AddrSwizzleMode *swizzle_mode)
+{	/* Ask addrlib for its preferred swizzle mode for 'in'; returns 0 and writes *swizzle_mode, or the addrlib error. */
+	ADDR_E_RETURNCODE ret;
+	ADDR2_GET_PREFERRED_SURF_SETTING_INPUT sin = {0};
+	ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT sout = {0};
+
+	sin.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_INPUT);
+	sout.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT);
+
+	sin.flags = in->flags; /* the query mirrors the caller's surface description */
+	sin.resourceType = in->resourceType;
+	sin.format = in->format;
+	sin.resourceLoction = ADDR_RSRC_LOC_INVIS; /* field name is spelled this way by the addrlib struct used here */
+	/* TODO: We could allow some of these: */
+	sin.forbiddenBlock.micro = 1; /* don't allow the 256B swizzle modes */
+	sin.forbiddenBlock.var = 1; /* don't allow the variable-sized swizzle modes */
+	sin.forbiddenBlock.linear = 1; /* don't allow linear swizzle modes */
+	sin.bpp = in->bpp;
+	sin.width = in->width;
+	sin.height = in->height;
+	sin.numSlices = in->numSlices;
+	sin.numMipLevels = in->numMipLevels;
+	sin.numSamples = in->numSamples;
+	sin.numFrags = in->numFrags;
+
+	if (is_fmask) { /* query as an FMASK surface instead of a color surface */
+		sin.flags.color = 0;
+		sin.flags.fmask = 1;
+	}
+
+	ret = Addr2GetPreferredSurfaceSetting(addrlib, &sin, &sout);
+	if (ret != ADDR_OK)
+		return ret; /* *swizzle_mode is left unmodified on failure */
+
+	*swizzle_mode = sout.swizzleMode;
+	return 0;
+}
+
+/* Compute one miptree layout: stencil if in->flags.stencil, else the main surface and its metadata. */
+static int gfx9_compute_miptree(ADDR_HANDLE addrlib, struct radeon_surf *surf,
+				bool compressed, ADDR2_COMPUTE_SURFACE_INFO_INPUT *in)
+{
+	ADDR2_MIP_INFO mip_info[RADEON_SURF_MAX_LEVELS] = {0};
+	ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
+	ADDR_E_RETURNCODE ret;
+
+	out.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT);
+	out.pMipInfo = mip_info; /* addrlib fills one entry per mip level */
+
+	ret = Addr2ComputeSurfaceInfo(addrlib, in, &out);
+	if (ret != ADDR_OK)
+		return ret;
+
+	if (in->flags.stencil) {
+		surf->u.gfx9.stencil.swizzle_mode = in->swizzleMode;
+		surf->u.gfx9.stencil.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 :
+								   out.mipChainPitch - 1;
+		surf->surf_alignment = MAX2(surf->surf_alignment, out.baseAlign);
+		surf->u.gfx9.stencil_offset = align(surf->surf_size, out.baseAlign);
+		surf->surf_size = surf->u.gfx9.stencil_offset + out.surfSize;
+		return 0; /* none of the metadata below is computed for stencil */
+	}
+
+	surf->u.gfx9.surf.swizzle_mode = in->swizzleMode;
+	surf->u.gfx9.surf.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 :
+							out.mipChainPitch - 1;
+
+	/* CMASK fast clear uses these even if FMASK isn't allocated.
+	 * FMASK only supports the Z swizzle modes, whose numbers are multiples of 4.
+	 */
+	surf->u.gfx9.fmask.swizzle_mode = surf->u.gfx9.surf.swizzle_mode & ~0x3;
+	surf->u.gfx9.fmask.epitch = surf->u.gfx9.surf.epitch;
+
+	surf->u.gfx9.surf_slice_size = out.sliceSize;
+	surf->u.gfx9.surf_pitch = out.pitch;
+	surf->u.gfx9.surf_height = out.height;
+	surf->surf_size = out.surfSize;
+	surf->surf_alignment = out.baseAlign;
+
+	if (in->swizzleMode == ADDR_SW_LINEAR) {
+		for (unsigned i = 0; i < in->numMipLevels; i++)
+			surf->u.gfx9.offset[i] = mip_info[i].offset;
+	}
+
+	if (in->flags.depth) {
+		assert(in->swizzleMode != ADDR_SW_LINEAR);
+
+		/* HTILE */
+		ADDR2_COMPUTE_HTILE_INFO_INPUT hin = {0};
+		ADDR2_COMPUTE_HTILE_INFO_OUTPUT hout = {0};
+
+		hin.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_INPUT);
+		hout.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_OUTPUT);
+
+		hin.hTileFlags.pipeAligned = 1;
+		hin.hTileFlags.rbAligned = 1;
+		hin.depthFlags = in->flags;
+		hin.swizzleMode = in->swizzleMode;
+		hin.unalignedWidth = in->width;
+		hin.unalignedHeight = in->height;
+		hin.numSlices = in->numSlices;
+		hin.numMipLevels = in->numMipLevels;
+
+		ret = Addr2ComputeHtileInfo(addrlib, &hin, &hout);
+		if (ret != ADDR_OK)
+			return ret;
+
+		surf->u.gfx9.htile.rb_aligned = hin.hTileFlags.rbAligned;
+		surf->u.gfx9.htile.pipe_aligned = hin.hTileFlags.pipeAligned;
+		surf->htile_size = hout.htileBytes;
+		surf->htile_slice_size = hout.sliceSize;
+		surf->htile_alignment = hout.baseAlign;
+	} else {
+		/* DCC */
+		if (!(surf->flags & RADEON_SURF_DISABLE_DCC) &&
+		    !(surf->flags & RADEON_SURF_SCANOUT) &&
+		    !compressed &&
+		    in->swizzleMode != ADDR_SW_LINEAR &&
+		    /* TODO: We could support DCC with MSAA. */
+		    in->numSamples == 1) {
+			ADDR2_COMPUTE_DCCINFO_INPUT din = {0};
+			ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0};
+			ADDR2_META_MIP_INFO meta_mip_info[RADEON_SURF_MAX_LEVELS] = {0};
+
+			din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT);
+			dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT);
+			dout.pMipInfo = meta_mip_info;
+
+			din.dccKeyFlags.pipeAligned = 1;
+			din.dccKeyFlags.rbAligned = 1;
+			din.colorFlags = in->flags;
+			din.resourceType = in->resourceType;
+			din.swizzleMode = in->swizzleMode;
+			din.bpp = in->bpp;
+			din.unalignedWidth = in->width;
+			din.unalignedHeight = in->height;
+			din.numSlices = in->numSlices;
+			din.numFrags = in->numFrags;
+			din.numMipLevels = in->numMipLevels;
+			din.dataSurfaceSize = out.surfSize;
+
+			ret = Addr2ComputeDccInfo(addrlib, &din, &dout);
+			if (ret != ADDR_OK)
+				return ret;
+
+			surf->u.gfx9.dcc.rb_aligned = din.dccKeyFlags.rbAligned;
+			surf->u.gfx9.dcc.pipe_aligned = din.dccKeyFlags.pipeAligned;
+			surf->u.gfx9.dcc_pitch_max = dout.pitch - 1;
+			surf->dcc_size = dout.dccRamSize;
+			surf->dcc_alignment = dout.dccRamBaseAlign;
+			surf->num_dcc_levels = in->numMipLevels;
+
+			/* Disable DCC for levels that are in the mip tail.
+			 *
+			 * There are two issues that this is intended to
+			 * address:
+			 *
+			 * 1. Multiple mip levels may share a cache line. This
+			 *    can lead to corruption when switching between
+			 *    rendering to different mip levels because the
+			 *    RBs don't maintain coherency.
+			 *
+			 * 2. Texturing with metadata after rendering sometimes
+			 *    fails with corruption, probably for a similar
+			 *    reason.
+			 *
+			 * Working around these issues for all levels in the
+			 * mip tail may be overly conservative, but it's what
+			 * Vulkan does.
+			 *
+			 * Alternative solutions that also work but are worse:
+			 * - Disable DCC entirely.
+			 * - Flush TC L2 after rendering.
+			 */
+			for (unsigned i = 0; i < in->numMipLevels; i++) {
+				if (meta_mip_info[i].inMiptail) {
+					surf->num_dcc_levels = i;
+					break;
+				}
+			}
+
+			if (!surf->num_dcc_levels)
+				surf->dcc_size = 0;
+		}
+
+		/* FMASK */
+		if (in->numSamples > 1) {
+			ADDR2_COMPUTE_FMASK_INFO_INPUT fin = {0};
+			ADDR2_COMPUTE_FMASK_INFO_OUTPUT fout = {0};
+
+			fin.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_INPUT);
+			fout.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_OUTPUT);
+
+			ret = gfx9_get_preferred_swizzle_mode(addrlib, in, true, &fin.swizzleMode);
+			if (ret != ADDR_OK)
+				return ret;
+
+			fin.unalignedWidth = in->width;
+			fin.unalignedHeight = in->height;
+			fin.numSlices = in->numSlices;
+			fin.numSamples = in->numSamples;
+			fin.numFrags = in->numFrags;
+
+			ret = Addr2ComputeFmaskInfo(addrlib, &fin, &fout);
+			if (ret != ADDR_OK)
+				return ret;
+
+			surf->u.gfx9.fmask.swizzle_mode = fin.swizzleMode;
+			surf->u.gfx9.fmask.epitch = fout.pitch - 1;
+			surf->u.gfx9.fmask_size = fout.fmaskBytes;
+			surf->u.gfx9.fmask_alignment = fout.baseAlign;
+		}
+
+		/* CMASK */
+		if (in->swizzleMode != ADDR_SW_LINEAR) {
+			ADDR2_COMPUTE_CMASK_INFO_INPUT cin = {0};
+			ADDR2_COMPUTE_CMASK_INFO_OUTPUT cout = {0};
+
+			cin.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_INPUT);
+			cout.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_OUTPUT);
+
+			cin.cMaskFlags.pipeAligned = 1;
+			cin.cMaskFlags.rbAligned = 1;
+			cin.colorFlags = in->flags;
+			cin.resourceType = in->resourceType;
+			cin.unalignedWidth = in->width;
+			cin.unalignedHeight = in->height;
+			cin.numSlices = in->numSlices;
+
+			if (in->numSamples > 1) /* MSAA: CMASK follows the FMASK swizzle mode */
+				cin.swizzleMode = surf->u.gfx9.fmask.swizzle_mode;
+			else
+				cin.swizzleMode = in->swizzleMode;
+
+			ret = Addr2ComputeCmaskInfo(addrlib, &cin, &cout);
+			if (ret != ADDR_OK)
+				return ret;
+
+			surf->u.gfx9.cmask.rb_aligned = cin.cMaskFlags.rbAligned;
+			surf->u.gfx9.cmask.pipe_aligned = cin.cMaskFlags.pipeAligned;
+			surf->u.gfx9.cmask_size = cout.cmaskBytes;
+			surf->u.gfx9.cmask_alignment = cout.baseAlign;
+		}
+	}
+
+	return 0;
+}
+/* Fill surf with the GFX9 layout for config and mode. Returns 0, or the error of a failed helper call. */
+static int gfx9_compute_surface(ADDR_HANDLE addrlib,
+				const struct ac_surf_config *config,
+				enum radeon_surf_mode mode,
+				struct radeon_surf *surf)
+{
+	bool compressed; /* true for formats with 4x4 blocks */
+	ADDR2_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
+	int r;
+
+	assert(!(surf->flags & RADEON_SURF_FMASK));
+
+	AddrSurfInfoIn.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT);
+
+	compressed = surf->blk_w == 4 && surf->blk_h == 4;
+
+	/* The format must be set correctly for the allocation of compressed
+	 * textures to work. In other cases, setting the bpp is sufficient. */
+	if (compressed) {
+		switch (surf->bpe) {
+		case 8:
+			AddrSurfInfoIn.format = ADDR_FMT_BC1;
+			break;
+		case 16:
+			AddrSurfInfoIn.format = ADDR_FMT_BC3;
+			break;
+		default:
+			assert(0); /* only 8- and 16-byte blocks are handled */
+		}
+	} else {
+		AddrSurfInfoIn.bpp = surf->bpe * 8;
+	}
+
+	AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
+	AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
+	AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
+	/* flags.texture currently refers to TC-compatible HTILE */
+	AddrSurfInfoIn.flags.texture = AddrSurfInfoIn.flags.color ||
+				       surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE;
+	AddrSurfInfoIn.flags.opt4space = 1;
+
+	AddrSurfInfoIn.numMipLevels = config->info.levels;
+	AddrSurfInfoIn.numSamples = config->info.samples ? config->info.samples : 1; /* 0 means 1 */
+	AddrSurfInfoIn.numFrags = AddrSurfInfoIn.numSamples;
+
+	/* GFX9 doesn't support 1D depth textures, so allocate all 1D textures
+	 * as 2D to avoid having shader variants for 1D vs 2D, so all shaders
+	 * must sample 1D textures as 2D. */
+	if (config->is_3d)
+		AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_3D;
+	else
+		AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_2D;
+
+	AddrSurfInfoIn.width = config->info.width;
+	AddrSurfInfoIn.height = config->info.height;
+
+	if (config->is_3d)
+		AddrSurfInfoIn.numSlices = config->info.depth;
+	else if (config->is_cube)
+		AddrSurfInfoIn.numSlices = 6; /* one slice per cube face */
+	else
+		AddrSurfInfoIn.numSlices = config->info.array_size;
+
+	switch (mode) {
+	case RADEON_SURF_MODE_LINEAR_ALIGNED:
+		assert(config->info.samples <= 1);
+		assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+		AddrSurfInfoIn.swizzleMode = ADDR_SW_LINEAR;
+		break;
+
+	case RADEON_SURF_MODE_1D:
+	case RADEON_SURF_MODE_2D:
+		if (surf->flags & RADEON_SURF_IMPORTED) { /* keep the mode already stored in surf */
+			AddrSurfInfoIn.swizzleMode = surf->u.gfx9.surf.swizzle_mode;
+			break;
+		}
+
+		r = gfx9_get_preferred_swizzle_mode(addrlib, &AddrSurfInfoIn, false,
+						    &AddrSurfInfoIn.swizzleMode);
+		if (r)
+			return r;
+		break;
+
+	default:
+		assert(0);
+	}
+
+	surf->u.gfx9.resource_type = AddrSurfInfoIn.resourceType;
+
+	surf->num_dcc_levels = 0; /* clear outputs first; gfx9_compute_miptree sets most only conditionally */
+	surf->surf_size = 0;
+	surf->dcc_size = 0;
+	surf->htile_size = 0;
+	surf->htile_slice_size = 0;
+	surf->u.gfx9.surf_offset = 0;
+	surf->u.gfx9.stencil_offset = 0;
+	surf->u.gfx9.fmask_size = 0;
+	surf->u.gfx9.cmask_size = 0;
+
+	/* Calculate texture layout information. */
+	r = gfx9_compute_miptree(addrlib, surf, compressed, &AddrSurfInfoIn);
+	if (r)
+		return r;
+
+	/* Calculate texture layout information for stencil. */
+	if (surf->flags & RADEON_SURF_SBUFFER) {
+		AddrSurfInfoIn.flags.stencil = 1;
+		AddrSurfInfoIn.bpp = 8; /* stencil is laid out with 8 bits per element */
+
+		if (!AddrSurfInfoIn.flags.depth) { /* stencil only: query a swizzle mode for stencil */
+			r = gfx9_get_preferred_swizzle_mode(addrlib, &AddrSurfInfoIn, false,
+							    &AddrSurfInfoIn.swizzleMode);
+			if (r)
+				return r;
+		} else
+			AddrSurfInfoIn.flags.depth = 0; /* with depth: the depth swizzle mode is reused */
+
+		r = gfx9_compute_miptree(addrlib, surf, compressed, &AddrSurfInfoIn);
+		if (r)
+			return r;
+	}
+
+	surf->is_linear = surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR;
+
+	switch (surf->u.gfx9.surf.swizzle_mode) {
+		/* S = standard. */
+		case ADDR_SW_256B_S:
+		case ADDR_SW_4KB_S:
+		case ADDR_SW_64KB_S:
+		case ADDR_SW_VAR_S:
+		case ADDR_SW_64KB_S_T:
+		case ADDR_SW_4KB_S_X:
+		case ADDR_SW_64KB_S_X:
+		case ADDR_SW_VAR_S_X:
+			surf->micro_tile_mode = RADEON_MICRO_MODE_THIN;
+			break;
+
+		/* D = display. */
+		case ADDR_SW_LINEAR: /* linear is reported as display */
+		case ADDR_SW_256B_D:
+		case ADDR_SW_4KB_D:
+		case ADDR_SW_64KB_D:
+		case ADDR_SW_VAR_D:
+		case ADDR_SW_64KB_D_T:
+		case ADDR_SW_4KB_D_X:
+		case ADDR_SW_64KB_D_X:
+		case ADDR_SW_VAR_D_X:
+			surf->micro_tile_mode = RADEON_MICRO_MODE_DISPLAY;
+			break;
+
+		/* R = rotated. */
+		case ADDR_SW_256B_R:
+		case ADDR_SW_4KB_R:
+		case ADDR_SW_64KB_R:
+		case ADDR_SW_VAR_R:
+		case ADDR_SW_64KB_R_T:
+		case ADDR_SW_4KB_R_X:
+		case ADDR_SW_64KB_R_X:
+		case ADDR_SW_VAR_R_X:
+			surf->micro_tile_mode = RADEON_MICRO_MODE_ROTATED;
+			break;
+
+		/* Z = depth. */
+		case ADDR_SW_4KB_Z:
+		case ADDR_SW_64KB_Z:
+		case ADDR_SW_VAR_Z:
+		case ADDR_SW_64KB_Z_T:
+		case ADDR_SW_4KB_Z_X:
+		case ADDR_SW_64KB_Z_X:
+		case ADDR_SW_VAR_Z_X:
+			surf->micro_tile_mode = RADEON_MICRO_MODE_DEPTH;
+			break;
+
+		default:
+			assert(0);
+	}
+
+	return 0;
+}
+
+/* Check config with surf_config_sanity, then run the layout code for the chip class. */
+int ac_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *info,
+		       const struct ac_surf_config *config,
+		       enum radeon_surf_mode mode,
+		       struct radeon_surf *surf)
+{
+	const int err = surf_config_sanity(config);
+
+	if (err)
+		return err;
+
+	if (info->chip_class < GFX9)
+		return gfx6_compute_surface(addrlib, info, config, mode, surf);
+
+	return gfx9_compute_surface(addrlib, config, mode, surf);
+}
diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h
new file mode 100644
index 0000000..3eaef63
--- /dev/null
+++ b/src/amd/common/ac_surface.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright © 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+#ifndef AC_SURFACE_H
+#define AC_SURFACE_H
+
+#include <stdint.h>
+
+#include "amd_family.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Forward declarations. */
+typedef void* ADDR_HANDLE;
+
+struct amdgpu_gpu_info;
+struct radeon_info;
+
+#define RADEON_SURF_MAX_LEVELS                  15
+
+enum radeon_surf_mode {
+    RADEON_SURF_MODE_LINEAR_ALIGNED = 1, /* GFX9 path: ADDR_SW_LINEAR */
+    RADEON_SURF_MODE_1D = 2, /* GFX9 path: handled exactly like MODE_2D */
+    RADEON_SURF_MODE_2D = 3, /* GFX9 path: swizzle mode queried from addrlib, or kept if imported */
+};
+
+/* These are defined exactly like GB_TILE_MODEn.MICRO_TILE_MODE_NEW. */
+enum radeon_micro_mode {
+    RADEON_MICRO_MODE_DISPLAY = 0, /* GFX9: the D swizzle modes and ADDR_SW_LINEAR */
+    RADEON_MICRO_MODE_THIN = 1, /* GFX9: the S (standard) swizzle modes */
+    RADEON_MICRO_MODE_DEPTH = 2, /* GFX9: the Z swizzle modes */
+    RADEON_MICRO_MODE_ROTATED = 3, /* GFX9: the R swizzle modes */
+};
+
+/* the first 16 bits are reserved for libdrm_radeon, don't use them */
+#define RADEON_SURF_SCANOUT                     (1 << 16) /* GFX9: addrlib display flag; no DCC */
+#define RADEON_SURF_ZBUFFER                     (1 << 17) /* depth buffer */
+#define RADEON_SURF_SBUFFER                     (1 << 18) /* stencil buffer */
+#define RADEON_SURF_Z_OR_SBUFFER                (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
+/* bits 19 and 20 are reserved for libdrm_radeon, don't use them */
+#define RADEON_SURF_HAS_TILE_MODE_INDEX         (1 << 20)
+#define RADEON_SURF_FMASK                       (1 << 21) /* must not be set on the GFX9 path */
+#define RADEON_SURF_DISABLE_DCC                 (1 << 22) /* GFX9: skip the DCC computation */
+#define RADEON_SURF_TC_COMPATIBLE_HTILE         (1 << 23) /* GFX9: sets addrlib's texture flag */
+#define RADEON_SURF_IMPORTED                    (1 << 24) /* GFX9: keep the swizzle mode stored in the surf */
+#define RADEON_SURF_OPTIMIZE_FOR_SPACE          (1 << 25)
+
+struct legacy_surf_level { /* one mip level entry of struct legacy_surf_layout */
+    uint64_t                    offset;
+    uint64_t                    slice_size;
+    uint64_t                    dcc_offset;
+    uint64_t                    dcc_fast_clear_size;
+    uint16_t                    nblk_x; /* presumably the width in blocks - TODO confirm */
+    uint16_t                    nblk_y; /* presumably the height in blocks - TODO confirm */
+    enum radeon_surf_mode       mode; /* level 0 being LINEAR_ALIGNED makes the surface is_linear */
+};
+
+struct legacy_surf_layout { /* radeon_surf::u.legacy, the pre-GFX9 layout */
+    unsigned                    bankw:4;  /* max 8 */
+    unsigned                    bankh:4;  /* max 8 */
+    unsigned                    mtilea:4; /* max 8 */
+    unsigned                    tile_split:13;         /* max 4K */
+    unsigned                    stencil_tile_split:13; /* max 4K */
+    unsigned                    pipe_config:5;      /* max 17 */
+    unsigned                    num_banks:5;        /* max 16 */
+    unsigned                    macro_tile_index:4; /* max 15 */
+
+    /* Whether the depth miptree or stencil miptree as used by the DB are
+     * adjusted from their TC compatible form to ensure depth/stencil
+     * compatibility. If either is true, the corresponding plane cannot be
+     * sampled from.
+     */
+    unsigned                    depth_adjusted:1;
+    unsigned                    stencil_adjusted:1;
+
+    uint8_t                     tile_swizzle; /* from AddrComputeBaseSwizzle; not computed for Z/S surfaces */
+    struct legacy_surf_level    level[RADEON_SURF_MAX_LEVELS]; /* indexed by mip level */
+    struct legacy_surf_level    stencil_level[RADEON_SURF_MAX_LEVELS]; /* indexed by mip level */
+    uint8_t                     tiling_index[RADEON_SURF_MAX_LEVELS];
+    uint8_t                     stencil_tiling_index[RADEON_SURF_MAX_LEVELS];
+};
+
+/* Same as addrlib - AddrResourceType. */
+enum gfx9_resource_type {
+    RADEON_RESOURCE_1D = 0, /* gfx9_compute_surface never selects this; it allocates 1D as 2D */
+    RADEON_RESOURCE_2D,
+    RADEON_RESOURCE_3D,
+};
+
+struct gfx9_surf_flags {
+    uint16_t                    swizzle_mode; /* tile mode (an addrlib swizzle mode value) */
+    uint16_t                    epitch; /* (pitch - 1) or (height - 1), as selected by addrlib's epitchIsHeight */
+};
+
+struct gfx9_surf_meta_flags { /* copies of the alignment flags passed to addrlib for a metadata surface */
+    unsigned                    rb_aligned:1;   /* optimal for RBs */
+    unsigned                    pipe_aligned:1; /* optimal for TC */
+};
+
+struct gfx9_surf_layout { /* radeon_surf::u.gfx9 */
+    struct gfx9_surf_flags      surf;    /* color or depth surface */
+    struct gfx9_surf_flags      fmask;   /* not added to surf_size */
+    struct gfx9_surf_flags      stencil; /* added to surf_size, use stencil_offset */
+
+    struct gfx9_surf_meta_flags dcc;   /* metadata of color */
+    struct gfx9_surf_meta_flags htile; /* metadata of depth and stencil */
+    struct gfx9_surf_meta_flags cmask; /* metadata of fmask */
+
+    enum gfx9_resource_type     resource_type; /* 1D, 2D or 3D */
+    uint64_t                    surf_offset; /* 0 unless imported with an offset */
+    /* The size of the 2D plane containing all mipmap levels. */
+    uint64_t                    surf_slice_size;
+    uint16_t                    surf_pitch; /* in blocks */
+    uint16_t                    surf_height; /* the height reported by addrlib */
+    /* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */
+    uint32_t                    offset[RADEON_SURF_MAX_LEVELS];
+
+    uint16_t                    dcc_pitch_max;  /* (mip chain pitch - 1) */
+
+    uint64_t                    stencil_offset; /* separate stencil */
+    uint64_t                    fmask_size; /* 0 when no FMASK was computed */
+    uint64_t                    cmask_size; /* 0 when no CMASK was computed */
+
+    uint32_t                    fmask_alignment; /* written together with fmask_size */
+    uint32_t                    cmask_alignment; /* written together with cmask_size */
+};
+
+struct radeon_surf {
+    /* Format properties. */
+    unsigned                    blk_w:4;
+    unsigned                    blk_h:4; /* the GFX9 path treats blk_w == blk_h == 4 as a compressed format */
+    unsigned                    bpe:5; /* the GFX9 path uses bpe * 8 as bits per pixel */
+    /* Number of mipmap levels where DCC is enabled starting from level 0.
+     * Non-zero levels may be disabled due to alignment constraints, but not
+     * the first level.
+     */
+    unsigned                    num_dcc_levels:4;
+    unsigned                    is_linear:1; /* set by ac_compute_surface from the computed layout */
+    /* Displayable, thin, depth, rotated. AKA D,S,Z,R swizzle modes. */
+    unsigned                    micro_tile_mode:3;
+    uint32_t                    flags; /* RADEON_SURF_* bits */
+
+    /* These are return values. Some of them can be set by the caller, but
+     * they will be treated as hints (e.g. bankw, bankh) and might be
+     * changed by the calculator.
+     */
+    uint64_t                    surf_size; /* on GFX9 this includes the stencil miptree, if any */
+    uint64_t                    dcc_size; /* 0 when DCC is not used */
+    uint64_t                    htile_size;
+
+    uint32_t                    htile_slice_size;
+
+    uint32_t                    surf_alignment;
+    uint32_t                    dcc_alignment;
+    uint32_t                    htile_alignment;
+
+    union {
+        /* R600-VI return values.
+         *
+         * Some of them can be set by the caller if certain parameters are
+         * desirable. The allocator will try to obey them.
+         */
+        struct legacy_surf_layout legacy;
+
+        /* GFX9+ return values. */
+        struct gfx9_surf_layout gfx9;
+    } u;
+};
+
+struct ac_surf_info {
+	uint32_t width;
+	uint32_t height;
+	uint32_t depth; /* GFX9 path: slice count when ac_surf_config::is_3d is set */
+	uint32_t surf_index; /* passed to AddrComputeBaseSwizzle as surfIndex */
+	uint8_t samples; /* GFX9 path: 0 is treated as 1 */
+	uint8_t levels; /* number of mip levels */
+	uint16_t array_size; /* GFX9 path: slice count when neither is_3d nor is_cube is set */
+};
+
+struct ac_surf_config {
+	struct ac_surf_info info;
+	unsigned is_3d : 1; /* GFX9 path: 3D resource with info.depth slices */
+	unsigned is_cube : 1; /* GFX9 path: 6 slices; only checked when is_3d is not set */
+};
+
+ADDR_HANDLE amdgpu_addr_create(const struct radeon_info *info,
+			       const struct amdgpu_gpu_info *amdinfo,
+			       uint64_t *max_alignment);
+/* Checks config, then fills surf using the GFX9 or the older layout code. Returns 0 on success. */
+int ac_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *info,
+		       const struct ac_surf_config * config,
+		       enum radeon_surf_mode mode,
+		       struct radeon_surf *surf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* AC_SURFACE_H */
diff --git a/src/amd/common/amd_kernel_code_t.h b/src/amd/common/amd_kernel_code_t.h
index d0d7809..f8e9508 100644
--- a/src/amd/common/amd_kernel_code_t.h
+++ b/src/amd/common/amd_kernel_code_t.h
@@ -36,7 +36,7 @@
 
 // Gets bits for specified mask from specified src packed instance.
 #define AMD_HSA_BITS_GET(src, mask)                                            \
-  ((src & mask) >> mask ## _SHIFT)                                             \
+  ((src & mask) >> mask ## _SHIFT)
 
 /* Every amd_*_code_t has the following properties, which are composed of
  * a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
diff --git a/src/amd/common/gfx9d.h b/src/amd/common/gfx9d.h
index 787d0a9..8c61645 100644
--- a/src/amd/common/gfx9d.h
+++ b/src/amd/common/gfx9d.h
@@ -1345,8 +1345,8 @@
 #define     V_008F14_IMG_DATA_FORMAT_RESERVED_56                    0x38
 #define     V_008F14_IMG_DATA_FORMAT_4_4                            0x39
 #define     V_008F14_IMG_DATA_FORMAT_6_5_5                          0x3A
-#define     V_008F14_IMG_DATA_S8_16                                 0x3B
-#define     V_008F14_IMG_DATA_S8_32                                 0x3C
+#define     V_008F14_IMG_DATA_FORMAT_S8_16                          0x3B
+#define     V_008F14_IMG_DATA_FORMAT_S8_32                          0x3C
 #define     V_008F14_IMG_DATA_FORMAT_8_AS_32                        0x3D
 #define     V_008F14_IMG_DATA_FORMAT_8_AS_32_32                     0x3E
 #define     V_008F14_IMG_DATA_FORMAT_32_AS_32_32_32_32              0x3F
diff --git a/src/amd/common/r600d_common.h b/src/amd/common/r600d_common.h
index 3fdfb7c..3374475 100644
--- a/src/amd/common/r600d_common.h
+++ b/src/amd/common/r600d_common.h
@@ -54,6 +54,17 @@
 #define PKT3_WAIT_REG_MEM                      0x3C
 #define		WAIT_REG_MEM_EQUAL		3
 #define         WAIT_REG_MEM_MEM_SPACE(x)       (((unsigned)(x) & 0x3) << 4)
+#define PKT3_COPY_DATA			       0x40
+#define		COPY_DATA_SRC_SEL(x)		((x) & 0xf)
+#define			COPY_DATA_REG		0
+#define			COPY_DATA_MEM		1
+#define                 COPY_DATA_PERF          4
+#define                 COPY_DATA_IMM           5
+#define                 COPY_DATA_TIMESTAMP     9
+#define		COPY_DATA_DST_SEL(x)		(((unsigned)(x) & 0xf) << 8)
+#define                 COPY_DATA_MEM_ASYNC     5
+#define		COPY_DATA_COUNT_SEL		(1 << 16)
+#define		COPY_DATA_WR_CONFIRM		(1 << 20)
 #define PKT3_EVENT_WRITE                       0x46
 #define PKT3_EVENT_WRITE_EOP                   0x47
 #define         EOP_DATA_SEL(x)                         ((x) << 29)
diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h
index b9ddadc..1016f67 100644
--- a/src/amd/common/sid.h
+++ b/src/amd/common/sid.h
@@ -154,6 +154,7 @@
 #define			COPY_DATA_MEM		1
 #define                 COPY_DATA_PERF          4
 #define                 COPY_DATA_IMM           5
+#define                 COPY_DATA_TIMESTAMP     9
 #define		COPY_DATA_DST_SEL(x)		(((unsigned)(x) & 0xf) << 8)
 #define		COPY_DATA_COUNT_SEL		(1 << 16)
 #define		COPY_DATA_WR_CONFIRM		(1 << 20)
@@ -169,7 +170,7 @@
  */
 /* fix CP DMA before uncommenting: */
 /*#define PKT3_EVENT_WRITE_EOS                   0x48*/ /* not on GFX9 */
-#define PKT3_RELEASE_MEM                       0x49 /* GFX9+ (any ring) or GFX8 (compute ring only) */
+#define PKT3_RELEASE_MEM                       0x49 /* GFX9+ [any ring] or GFX8 [compute ring only] */
 #define PKT3_ONE_REG_WRITE                     0x57 /* not on CIK */
 #define PKT3_ACQUIRE_MEM                       0x58 /* new for CIK */
 #define PKT3_SET_CONFIG_REG                    0x68
@@ -279,6 +280,7 @@
 #define     S_500_DSL_SEL(x)		(((unsigned)(x) & 0x3) << 20)
 #define       V_500_DST_ADDR		0
 #define       V_500_GDS			1 /* program DAS to 1 as well */
+#define       V_500_NOWHERE		2 /* new for GFX9 */
 #define       V_500_DST_ADDR_TC_L2	3 /* new for CIK */
 #define     S_500_ENGINE(x)		((x) & 0x1)
 #define       V_500_ME			0
@@ -2451,6 +2453,8 @@
 #define   S_008F3C_BORDER_COLOR_PTR(x)                                (((unsigned)(x) & 0xFFF) << 0)
 #define   G_008F3C_BORDER_COLOR_PTR(x)                                (((x) >> 0) & 0xFFF)
 #define   C_008F3C_BORDER_COLOR_PTR                                   0xFFFFF000
+/* The UPGRADED_DEPTH field is driver-specific and does not exist in hardware. */
+#define   S_008F3C_UPGRADED_DEPTH(x)                                  (((unsigned)(x) & 0x1) << 29)
 #define   S_008F3C_BORDER_COLOR_TYPE(x)                               (((unsigned)(x) & 0x03) << 30)
 #define   G_008F3C_BORDER_COLOR_TYPE(x)                               (((x) >> 30) & 0x03)
 #define   C_008F3C_BORDER_COLOR_TYPE                                  0x3FFFFFFF
diff --git a/src/amd/vulkan/Makefile.am b/src/amd/vulkan/Makefile.am
index a645432..3350f54 100644
--- a/src/amd/vulkan/Makefile.am
+++ b/src/amd/vulkan/Makefile.am
@@ -107,13 +107,11 @@
 
 vulkan_api_xml = $(top_srcdir)/src/vulkan/registry/vk.xml
 
-radv_entrypoints.h : radv_entrypoints_gen.py $(vulkan_api_xml)
-	$(AM_V_GEN) cat $(vulkan_api_xml) |\
-	$(PYTHON2) $(srcdir)/radv_entrypoints_gen.py header > $@
-
-radv_entrypoints.c : radv_entrypoints_gen.py $(vulkan_api_xml)
-	$(AM_V_GEN) cat $(vulkan_api_xml) |\
-	$(PYTHON2) $(srcdir)/radv_entrypoints_gen.py code > $@
+radv_entrypoints.c: radv_entrypoints_gen.py $(vulkan_api_xml)
+	$(MKDIR_GEN)
+	$(AM_V_GEN)$(PYTHON2) $(srcdir)/radv_entrypoints_gen.py \
+		--xml $(vulkan_api_xml) --outdir $(builddir)
+radv_entrypoints.h: radv_entrypoints.c
 
 vk_format_table.c: vk_format_table.py \
 		   vk_format_parse.py \
diff --git a/src/amd/vulkan/Makefile.sources b/src/amd/vulkan/Makefile.sources
index 4896952..d3e0c81 100644
--- a/src/amd/vulkan/Makefile.sources
+++ b/src/amd/vulkan/Makefile.sources
@@ -51,6 +51,7 @@
 	radv_meta_fast_clear.c \
 	radv_meta_resolve.c \
 	radv_meta_resolve_cs.c \
+	radv_meta_resolve_fs.c \
 	radv_pass.c \
 	radv_pipeline.c \
 	radv_pipeline_cache.c \
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index fd15541..d4c4217 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -29,6 +29,7 @@
 #include "radv_radeon_winsys.h"
 #include "radv_cs.h"
 #include "sid.h"
+#include "gfx9d.h"
 #include "vk_format.h"
 #include "radv_meta.h"
 
@@ -233,6 +234,14 @@
 	cmd_buffer->record_fail = false;
 
 	cmd_buffer->ring_offsets_idx = -1;
+
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		void *fence_ptr;
+		radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0,
+					     &cmd_buffer->gfx9_fence_offset,
+					     &fence_ptr);
+		cmd_buffer->gfx9_fence_bo = cmd_buffer->upload.upload_bo;
+	}
 }
 
 static bool
@@ -357,6 +366,17 @@
 			  8);
 	radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, pipeline->graphics.blend.cb_color_control);
 	radeon_set_context_reg(cmd_buffer->cs, R_028B70_DB_ALPHA_TO_MASK, pipeline->graphics.blend.db_alpha_to_mask);
+
+	if (cmd_buffer->device->physical_device->has_rbplus) {
+
+		radeon_set_context_reg_seq(cmd_buffer->cs, R_028760_SX_MRT0_BLEND_OPT, 8);
+		radeon_emit_array(cmd_buffer->cs, pipeline->graphics.blend.sx_mrt_blend_opt, 8);
+
+		radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
+		radeon_emit(cmd_buffer->cs, 0);	/* R_028754_SX_PS_DOWNCONVERT */
+		radeon_emit(cmd_buffer->cs, 0);	/* R_028758_SX_BLEND_OPT_EPSILON */
+		radeon_emit(cmd_buffer->cs, 0);	/* R_02875C_SX_BLEND_OPT_CONTROL */
+	}
 }
 
 static void
@@ -378,8 +398,8 @@
 	       x >= 4096 ? 0xffff : x * 16;
 }
 
-static uint32_t
-shader_stage_to_user_data_0(gl_shader_stage stage, bool has_gs, bool has_tess)
+uint32_t
+radv_shader_stage_to_user_data_0(gl_shader_stage stage, bool has_gs, bool has_tess)
 {
 	switch (stage) {
 	case MESA_SHADER_FRAGMENT:
@@ -405,7 +425,7 @@
 	}
 }
 
-static struct ac_userdata_info *
+struct ac_userdata_info *
 radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
 		      gl_shader_stage stage,
 		      int idx)
@@ -420,7 +440,7 @@
 			   int idx, uint64_t va)
 {
 	struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
-	uint32_t base_reg = shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
+	uint32_t base_reg = radv_shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
 	if (loc->sgpr_idx == -1)
 		return;
 	assert(loc->num_sgprs == 2);
@@ -454,10 +474,15 @@
 
 	radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples);
 
-	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.uses_sample_positions) {
+	/* GFX9: Flush DFSM when the AA mode changes. */
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+	}
+	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions) {
 		uint32_t offset;
 		struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, AC_UD_PS_SAMPLE_POS_OFFSET);
-		uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
+		uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
 		if (loc->sgpr_idx == -1)
 			return;
 		assert(loc->num_sgprs == 1);
@@ -510,6 +535,14 @@
 			       raster->pa_su_sc_mode_cntl);
 }
 
+static inline void
+radv_emit_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va, unsigned size)
+{
+	if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK)
+		return; /* si_cp_dma_prefetch is only issued on CIK and newer */
+	si_cp_dma_prefetch(cmd_buffer, va, size);
+}
+
 static void
 radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer,
 		struct radv_pipeline *pipeline,
@@ -521,6 +554,7 @@
 	unsigned export_count;
 
 	ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
+	radv_emit_prefetch(cmd_buffer, va, shader->code_size);
 
 	export_count = MAX2(1, outinfo->param_exports);
 	radeon_set_context_reg(cmd_buffer->cs, R_0286C4_SPI_VS_OUT_CONFIG,
@@ -555,8 +589,9 @@
 	radeon_set_context_reg(cmd_buffer->cs, R_02881C_PA_CL_VS_OUT_CNTL,
 			       pipeline->graphics.pa_cl_vs_out_cntl);
 
-	radeon_set_context_reg(cmd_buffer->cs, R_028AB4_VGT_REUSE_OFF,
-			       S_028AB4_REUSE_OFF(outinfo->writes_viewport_index));
+	if (cmd_buffer->device->physical_device->rad_info.chip_class <= VI)
+		radeon_set_context_reg(cmd_buffer->cs, R_028AB4_VGT_REUSE_OFF,
+				       S_028AB4_REUSE_OFF(outinfo->writes_viewport_index));
 }
 
 static void
@@ -568,6 +603,7 @@
 	uint64_t va = ws->buffer_get_va(shader->bo);
 
 	ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
+	radv_emit_prefetch(cmd_buffer, va, shader->code_size);
 
 	radeon_set_context_reg(cmd_buffer->cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
 			       outinfo->esgs_itemsize / 4);
@@ -587,6 +623,7 @@
 	uint32_t rsrc2 = shader->rsrc2;
 
 	ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
+	radv_emit_prefetch(cmd_buffer, va, shader->code_size);
 
 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2);
 	radeon_emit(cmd_buffer->cs, va >> 8);
@@ -610,6 +647,7 @@
 	uint64_t va = ws->buffer_get_va(shader->bo);
 
 	ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
+	radv_emit_prefetch(cmd_buffer, va, shader->code_size);
 
 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4);
 	radeon_emit(cmd_buffer->cs, va >> 8);
@@ -635,7 +673,7 @@
 	else
 		radv_emit_hw_vs(cmd_buffer, pipeline, vs, &vs->info.vs.outinfo);
 
-	radeon_set_context_reg(cmd_buffer->cs, R_028A84_VGT_PRIMITIVEID_EN, 0);
+	radeon_set_context_reg(cmd_buffer->cs, R_028A84_VGT_PRIMITIVEID_EN, pipeline->graphics.vgt_primitiveid_en);
 }
 
 
@@ -672,7 +710,7 @@
 
 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_CTRL, AC_UD_TCS_OFFCHIP_LAYOUT);
 	if (loc->sgpr_idx != -1) {
-		uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_TESS_CTRL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
+		uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_TESS_CTRL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
 		assert(loc->num_sgprs == 4);
 		assert(!loc->indirect);
 		radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 4);
@@ -685,7 +723,7 @@
 
 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_EVAL, AC_UD_TES_OFFCHIP_LAYOUT);
 	if (loc->sgpr_idx != -1) {
-		uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_TESS_EVAL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
+		uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_TESS_EVAL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
 		assert(loc->num_sgprs == 1);
 		assert(!loc->indirect);
 
@@ -695,7 +733,7 @@
 
 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX, AC_UD_VS_LS_TCS_IN_LAYOUT);
 	if (loc->sgpr_idx != -1) {
-		uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
+		uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
 		assert(loc->num_sgprs == 1);
 		assert(!loc->indirect);
 
@@ -743,6 +781,8 @@
 
 	va = ws->buffer_get_va(gs->bo);
 	ws->cs_add_buffer(cmd_buffer->cs, gs->bo, 8);
+	radv_emit_prefetch(cmd_buffer, va, gs->code_size);
+
 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
 	radeon_emit(cmd_buffer->cs, va >> 8);
 	radeon_emit(cmd_buffer->cs, va >> 40);
@@ -783,6 +823,7 @@
 
 	va = ws->buffer_get_va(ps->bo);
 	ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8);
+	radv_emit_prefetch(cmd_buffer, va, ps->code_size);
 
 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
 	radeon_emit(cmd_buffer->cs, va >> 8);
@@ -815,6 +856,12 @@
 	radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, blend->cb_target_mask);
 	radeon_set_context_reg(cmd_buffer->cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
 
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		/* optimise this? */
+		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+	}
+
 	if (pipeline->graphics.ps_input_cntl_num) {
 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0, pipeline->graphics.ps_input_cntl_num);
 		for (unsigned i = 0; i < pipeline->graphics.ps_input_cntl_num; i++) {
@@ -867,6 +914,16 @@
 	    cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
 	     pipeline->graphics.can_use_guardband)
 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
+
+	radeon_set_context_reg(cmd_buffer->cs, R_028B54_VGT_SHADER_STAGES_EN, pipeline->graphics.vgt_shader_stages_en);
+
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
+		radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, pipeline->graphics.prim);
+	} else {
+		radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, pipeline->graphics.prim);
+	}
+	radeon_set_context_reg(cmd_buffer->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, pipeline->graphics.gs_out);
+
 	cmd_buffer->state.emitted_pipeline = pipeline;
 }
 
@@ -881,6 +938,11 @@
 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
 {
 	uint32_t count = cmd_buffer->state.dynamic.scissor.count;
+
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
+		si_emit_cache_flush(cmd_buffer);
+	}
 	si_write_scissors(cmd_buffer->cs, 0, count,
 			  cmd_buffer->state.dynamic.scissor.scissors,
 			  cmd_buffer->state.dynamic.viewport.viewports,
@@ -895,21 +957,44 @@
 			 struct radv_color_buffer_info *cb)
 {
 	bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= VI;
-	radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_base);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_view);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_info);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
-	radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
-	radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
 
-	if (is_vi) { /* DCC BASE */
-		radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_base >> 32);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_info);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
+		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask >> 32);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask >> 32);
+
+		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
+		radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
+		radeon_emit(cmd_buffer->cs, cb->cb_dcc_base >> 32);
+		
+		radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
+				       cb->gfx9_epitch);
+	} else {
+		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_info);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
+		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
+		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
+
+		if (is_vi) { /* DCC BASE */
+			radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
+		}
 	}
 }
 
@@ -920,62 +1005,61 @@
 		      VkImageLayout layout)
 {
 	uint32_t db_z_info = ds->db_z_info;
+	uint32_t db_stencil_info = ds->db_stencil_info;
 
-	if (!radv_layout_has_htile(image, layout))
+	if (!radv_layout_has_htile(image, layout,
+	                           radv_image_queue_family_mask(image,
+	                                                        cmd_buffer->queue_family_index,
+	                                                        cmd_buffer->queue_family_index))) {
 		db_z_info &= C_028040_TILE_SURFACE_ENABLE;
-
-	if (!radv_layout_can_expclear(image, layout))
-		db_z_info &= C_028040_ALLOW_EXPCLEAR & C_028044_ALLOW_EXPCLEAR;
+		db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
+	}
 
 	radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
-	radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
-
-	radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
-	radeon_emit(cmd_buffer->cs, ds->db_depth_info);	/* R_02803C_DB_DEPTH_INFO */
-	radeon_emit(cmd_buffer->cs, db_z_info);			/* R_028040_DB_Z_INFO */
-	radeon_emit(cmd_buffer->cs, ds->db_stencil_info);	/* R_028044_DB_STENCIL_INFO */
-	radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* R_028048_DB_Z_READ_BASE */
-	radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* R_02804C_DB_STENCIL_READ_BASE */
-	radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* R_028050_DB_Z_WRITE_BASE */
-	radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* R_028054_DB_STENCIL_WRITE_BASE */
-	radeon_emit(cmd_buffer->cs, ds->db_depth_size);	/* R_028058_DB_DEPTH_SIZE */
-	radeon_emit(cmd_buffer->cs, ds->db_depth_slice);	/* R_02805C_DB_DEPTH_SLICE */
-
 	radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
+
+
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
+		radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
+		radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
+		radeon_emit(cmd_buffer->cs, ds->db_depth_size);
+
+		radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
+		radeon_emit(cmd_buffer->cs, db_z_info);			/* DB_Z_INFO */
+		radeon_emit(cmd_buffer->cs, db_stencil_info);	        /* DB_STENCIL_INFO */
+		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* DB_Z_READ_BASE */
+		radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);	/* DB_Z_READ_BASE_HI */
+		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* DB_STENCIL_READ_BASE */
+		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32); /* DB_STENCIL_READ_BASE_HI */
+		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* DB_Z_WRITE_BASE */
+		radeon_emit(cmd_buffer->cs, ds->db_z_write_base >> 32);	/* DB_Z_WRITE_BASE_HI */
+		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* DB_STENCIL_WRITE_BASE */
+		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */
+
+		radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
+		radeon_emit(cmd_buffer->cs, ds->db_z_info2);
+		radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
+	} else {
+		radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
+
+		radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
+		radeon_emit(cmd_buffer->cs, ds->db_depth_info);	/* R_02803C_DB_DEPTH_INFO */
+		radeon_emit(cmd_buffer->cs, db_z_info);			/* R_028040_DB_Z_INFO */
+		radeon_emit(cmd_buffer->cs, db_stencil_info);	        /* R_028044_DB_STENCIL_INFO */
+		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* R_028048_DB_Z_READ_BASE */
+		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* R_02804C_DB_STENCIL_READ_BASE */
+		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* R_028050_DB_Z_WRITE_BASE */
+		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* R_028054_DB_STENCIL_WRITE_BASE */
+		radeon_emit(cmd_buffer->cs, ds->db_depth_size);	/* R_028058_DB_DEPTH_SIZE */
+		radeon_emit(cmd_buffer->cs, ds->db_depth_slice);	/* R_02805C_DB_DEPTH_SLICE */
+
+	}
+
 	radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
 			       ds->pa_su_poly_offset_db_fmt_cntl);
 }
 
-/*
- * To hw resolve multisample images both src and dst need to have the same
- * micro tiling mode. However we don't always know in advance when creating
- * the images. This function gets called if we have a resolve attachment,
- * and tests if the attachment image has the same tiling mode, then it
- * checks if the generated framebuffer data has the same tiling mode, and
- * updates it if not.
- */
-static void radv_set_optimal_micro_tile_mode(struct radv_device *device,
-					     struct radv_attachment_info *att,
-					     uint32_t micro_tile_mode)
-{
-	struct radv_image *image = att->attachment->image;
-	uint32_t tile_mode_index;
-	if (image->surface.nsamples <= 1)
-		return;
-
-	if (image->surface.micro_tile_mode != micro_tile_mode) {
-		radv_image_set_optimal_micro_tile_mode(device, image, micro_tile_mode);
-	}
-
-	if (att->cb.micro_tile_mode != micro_tile_mode) {
-		tile_mode_index = image->surface.tiling_index[0];
-
-		att->cb.cb_color_attrib &= C_028C74_TILE_MODE_INDEX;
-		att->cb.cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index);
-		att->cb.micro_tile_mode = micro_tile_mode;
-	}
-}
-
 void
 radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
 			  struct radv_image *image,
@@ -1043,6 +1127,35 @@
 	radeon_emit(cmd_buffer->cs, 0);
 }
 
+/*
+ * With DCC, some colors don't require CMASK elimination before being
+ * used as a texture. This sets a predicate value to determine if the
+ * CMASK eliminate is required.
+ */
+void
+radv_set_dcc_need_cmask_elim_pred(struct radv_cmd_buffer *cmd_buffer,
+				  struct radv_image *image,
+				  bool value)
+{
+	uint64_t pred_val = value;
+	uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo);
+	va += image->offset + image->dcc_pred_offset;
+
+	if (!image->surface.dcc_size)
+		return;
+
+	cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8);
+
+	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0));
+	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+				    S_370_WR_CONFIRM(1) |
+				    S_370_ENGINE_SEL(V_370_PFP));
+	radeon_emit(cmd_buffer->cs, va);
+	radeon_emit(cmd_buffer->cs, va >> 32);
+	radeon_emit(cmd_buffer->cs, pred_val);
+	radeon_emit(cmd_buffer->cs, pred_val >> 32);
+}
+
 void
 radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
 			  struct radv_image *image,
@@ -1085,7 +1198,7 @@
 	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c;
 	cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8);
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0));
+	radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
 	radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
 				    COPY_DATA_DST_SEL(COPY_DATA_REG) |
 				    COPY_DATA_COUNT_SEL);
@@ -1094,7 +1207,7 @@
 	radeon_emit(cmd_buffer->cs, reg >> 2);
 	radeon_emit(cmd_buffer->cs, 0);
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+	radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
 	radeon_emit(cmd_buffer->cs, 0);
 }
 
@@ -1104,21 +1217,21 @@
 	int i;
 	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
-	int dst_resolve_micro_tile_mode = -1;
 
-	if (subpass->has_resolve) {
-		uint32_t a = subpass->resolve_attachments[0].attachment;
-		const struct radv_image *image = framebuffer->attachments[a].attachment->image;
-		dst_resolve_micro_tile_mode = image->surface.micro_tile_mode;
-	}
-	for (i = 0; i < subpass->color_count; ++i) {
+	/* this may happen for inherited secondary recording */
+	if (!framebuffer)
+		return;
+
+	for (i = 0; i < 8; ++i) {
+		if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
+			radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
+				       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
+			continue;
+		}
+
 		int idx = subpass->color_attachments[i].attachment;
 		struct radv_attachment_info *att = &framebuffer->attachments[idx];
 
-		if (dst_resolve_micro_tile_mode != -1) {
-			radv_set_optimal_micro_tile_mode(cmd_buffer->device,
-							 att, dst_resolve_micro_tile_mode);
-		}
 		cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, att->attachment->bo, 8);
 
 		assert(att->attachment->aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT);
@@ -1127,16 +1240,18 @@
 		radv_load_color_clear_regs(cmd_buffer, att->attachment->image, i);
 	}
 
-	for (i = subpass->color_count; i < 8; i++)
-		radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
-				       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
-
 	if(subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
 		int idx = subpass->depth_stencil_attachment.attachment;
 		VkImageLayout layout = subpass->depth_stencil_attachment.layout;
 		struct radv_attachment_info *att = &framebuffer->attachments[idx];
 		struct radv_image *image = att->attachment->image;
 		cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, att->attachment->bo, 8);
+		uint32_t queue_mask = radv_image_queue_family_mask(image,
+		                                                   cmd_buffer->queue_family_index,
+		                                                   cmd_buffer->queue_family_index);
+		/* We currently don't support writing decompressed HTILE */
+		assert(radv_layout_has_htile(image, layout, queue_mask) ==
+		       radv_layout_is_htile_compressed(image, layout, queue_mask));
 
 		radv_emit_fb_ds_state(cmd_buffer, &att->ds, image, layout);
 
@@ -1146,13 +1261,22 @@
 		}
 		radv_load_depth_clear_regs(cmd_buffer, image);
 	} else {
-		radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
-		radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* R_028040_DB_Z_INFO */
-		radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* R_028044_DB_STENCIL_INFO */
+		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
+			radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
+		else
+			radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
+
+		radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
+		radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
 	}
 	radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
 			       S_028208_BR_X(framebuffer->width) |
 			       S_028208_BR_Y(framebuffer->height));
+
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
+	}
 }
 
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
@@ -1253,9 +1377,9 @@
 				   gl_shader_stage stage)
 {
 	struct ac_userdata_info *desc_set_loc = &pipeline->shaders[stage]->info.user_sgprs_locs.descriptor_sets[idx];
-	uint32_t base_reg = shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
+	uint32_t base_reg = radv_shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
 
-	if (desc_set_loc->sgpr_idx == -1)
+	if (desc_set_loc->sgpr_idx == -1 || desc_set_loc->indirect)
 		return;
 
 	assert(!desc_set_loc->indirect);
@@ -1273,30 +1397,12 @@
 				  unsigned idx)
 {
 	if (cmd_buffer->state.pipeline) {
-		if (stages & VK_SHADER_STAGE_FRAGMENT_BIT)
-			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
-							   idx, set->va,
-							   MESA_SHADER_FRAGMENT);
-
-		if (stages & VK_SHADER_STAGE_VERTEX_BIT)
-			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
-							   idx, set->va,
-							   MESA_SHADER_VERTEX);
-
-		if ((stages & VK_SHADER_STAGE_GEOMETRY_BIT) && radv_pipeline_has_gs(cmd_buffer->state.pipeline))
-			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
-							   idx, set->va,
-							   MESA_SHADER_GEOMETRY);
-
-		if ((stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) && radv_pipeline_has_tess(cmd_buffer->state.pipeline))
-			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
-							   idx, set->va,
-							   MESA_SHADER_TESS_CTRL);
-
-		if ((stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) && radv_pipeline_has_tess(cmd_buffer->state.pipeline))
-			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
-							   idx, set->va,
-							   MESA_SHADER_TESS_EVAL);
+		radv_foreach_stage(stage, stages) {
+			if (cmd_buffer->state.pipeline->shaders[stage])
+				emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
+								   idx, set->va,
+								   stage);
+		}
 	}
 
 	if (cmd_buffer->state.compute_pipeline && (stages & VK_SHADER_STAGE_COMPUTE_BIT))
@@ -1324,22 +1430,79 @@
 }
 
 static void
+radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer)
+{
+	uint32_t size = MAX_SETS * 2 * 4;
+	uint32_t offset;
+	void *ptr;
+	
+	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size,
+					  256, &offset, &ptr))
+		return;
+
+	for (unsigned i = 0; i < MAX_SETS; i++) {
+		uint32_t *uptr = ((uint32_t *)ptr) + i * 2;
+		uint64_t set_va = 0;
+		struct radv_descriptor_set *set = cmd_buffer->state.descriptors[i];
+		if (set)
+			set_va = set->va;
+		uptr[0] = set_va & 0xffffffff;
+		uptr[1] = set_va >> 32;
+	}
+
+	uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
+	va += offset;
+
+	if (cmd_buffer->state.pipeline) {
+		if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX])
+			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
+						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
+
+		if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT])
+			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT,
+						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
+
+		if (radv_pipeline_has_gs(cmd_buffer->state.pipeline))
+			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
+						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
+
+		if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
+			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL,
+						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
+
+		if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
+			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL,
+						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
+	}
+
+	if (cmd_buffer->state.compute_pipeline)
+		radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE,
+					   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
+}
+
+static void
 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
 		       VkShaderStageFlags stages)
 {
 	unsigned i;
+
 	if (!cmd_buffer->state.descriptors_dirty)
 		return;
 
 	if (cmd_buffer->state.push_descriptors_dirty)
 		radv_flush_push_descriptors(cmd_buffer);
 
+	if ((cmd_buffer->state.pipeline && cmd_buffer->state.pipeline->need_indirect_descriptor_sets) ||
+	    (cmd_buffer->state.compute_pipeline && cmd_buffer->state.compute_pipeline->need_indirect_descriptor_sets)) {
+		radv_flush_indirect_descriptor_sets(cmd_buffer);
+	}
+
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
 	                                                   cmd_buffer->cs,
 	                                                   MAX_SETS * MESA_SHADER_STAGES * 4);
 
 	for (i = 0; i < MAX_SETS; i++) {
-		if (!(cmd_buffer->state.descriptors_dirty & (1 << i)))
+		if (!(cmd_buffer->state.descriptors_dirty & (1u << i)))
 			continue;
 		struct radv_descriptor_set *set = cmd_buffer->state.descriptors[i];
 		if (!set)
@@ -1380,29 +1543,13 @@
 
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
 	                                                   cmd_buffer->cs, MESA_SHADER_STAGES * 4);
-	if (stages & VK_SHADER_STAGE_VERTEX_BIT)
-		radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX,
-					   AC_UD_PUSH_CONSTANTS, va);
 
-	if (stages & VK_SHADER_STAGE_FRAGMENT_BIT)
-		radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
-					   AC_UD_PUSH_CONSTANTS, va);
-
-	if ((stages & VK_SHADER_STAGE_GEOMETRY_BIT) && radv_pipeline_has_gs(pipeline))
-		radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_GEOMETRY,
-					   AC_UD_PUSH_CONSTANTS, va);
-
-	if ((stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) && radv_pipeline_has_tess(pipeline))
-		radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_CTRL,
-					   AC_UD_PUSH_CONSTANTS, va);
-
-	if ((stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) && radv_pipeline_has_tess(pipeline))
-		radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_EVAL,
-					   AC_UD_PUSH_CONSTANTS, va);
-
-	if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
-		radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE,
-					   AC_UD_PUSH_CONSTANTS, va);
+	radv_foreach_stage(stage, stages) {
+		if (pipeline->shaders[stage]) {
+			radv_emit_userdata_address(cmd_buffer, pipeline, stage,
+						   AC_UD_PUSH_CONSTANTS, va);
+		}
+	}
 
 	cmd_buffer->push_constant_stages &= ~stages;
 	assert(cmd_buffer->cs->cdw <= cdw_max);
@@ -1415,8 +1562,13 @@
 
 	if (primitive_reset_en != cmd_buffer->state.last_primitive_reset_en) {
 		cmd_buffer->state.last_primitive_reset_en = primitive_reset_en;
-		radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
-				       primitive_reset_en);
+		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+			radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
+					       primitive_reset_en);
+		} else {
+			radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
+					       primitive_reset_en);
+		}
 	}
 
 	if (primitive_reset_en) {
@@ -1431,20 +1583,13 @@
 }
 
 static void
-radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer,
-			    bool indexed_draw, bool instanced_draw,
-			    bool indirect_draw,
-			    uint32_t draw_vertex_count)
+radv_cmd_buffer_update_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer)
 {
-	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
 	struct radv_device *device = cmd_buffer->device;
-	uint32_t ia_multi_vgt_param;
 
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
-							   cmd_buffer->cs, 4096);
-
-	if ((cmd_buffer->state.vertex_descriptors_dirty || cmd_buffer->state.vb_dirty) &&
-	    cmd_buffer->state.pipeline->num_vertex_attribs) {
+	if ((cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline || cmd_buffer->state.vb_dirty) &&
+	    cmd_buffer->state.pipeline->num_vertex_attribs &&
+	    cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX]->info.info.vs.has_vertex_buffers) {
 		unsigned vb_offset;
 		void *vb_ptr;
 		uint32_t i = 0;
@@ -1479,12 +1624,26 @@
 		va = device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
 		va += vb_offset;
 
-		radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX,
+		radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
 					   AC_UD_VS_VERTEX_BUFFERS, va);
 	}
-
-	cmd_buffer->state.vertex_descriptors_dirty = false;
 	cmd_buffer->state.vb_dirty = 0;
+}
+
+static void
+radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer,
+			    bool indexed_draw, bool instanced_draw,
+			    bool indirect_draw,
+			    uint32_t draw_vertex_count)
+{
+	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+	uint32_t ia_multi_vgt_param;
+
+	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
+							   cmd_buffer->cs, 4096);
+
+	radv_cmd_buffer_update_vertex_descriptors(cmd_buffer);
+
 	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
 		radv_emit_graphics_pipeline(cmd_buffer, pipeline);
 
@@ -1493,24 +1652,15 @@
 
 	ia_multi_vgt_param = si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, draw_vertex_count);
 	if (cmd_buffer->state.last_ia_multi_vgt_param != ia_multi_vgt_param) {
-		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK)
+		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
+			radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
+		else if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK)
 			radeon_set_context_reg_idx(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
 		else
 			radeon_set_context_reg(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
 		cmd_buffer->state.last_ia_multi_vgt_param = ia_multi_vgt_param;
 	}
 
-	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) {
-		radeon_set_context_reg(cmd_buffer->cs, R_028B54_VGT_SHADER_STAGES_EN, pipeline->graphics.vgt_shader_stages_en);
-
-		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
-			radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, cmd_buffer->state.pipeline->graphics.prim);
-		} else {
-			radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, cmd_buffer->state.pipeline->graphics.prim);
-		}
-		radeon_set_context_reg(cmd_buffer->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, cmd_buffer->state.pipeline->graphics.gs_out);
-	}
-
 	radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
 
 	radv_emit_primitive_reset_state(cmd_buffer, indexed_draw);
@@ -1667,8 +1817,9 @@
 		radv_subpass_barrier(cmd_buffer, &subpass->start_barrier);
 
 		for (unsigned i = 0; i < subpass->color_count; ++i) {
-			radv_handle_subpass_image_transition(cmd_buffer,
-							subpass->color_attachments[i]);
+			if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
+				radv_handle_subpass_image_transition(cmd_buffer,
+				                                     subpass->color_attachments[i]);
 		}
 
 		for (unsigned i = 0; i < subpass->input_count; ++i) {
@@ -1723,6 +1874,9 @@
 			if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
 			    att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
 				clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
+				if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
+				    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
+					clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
 			}
 			if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
 			    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
@@ -1820,7 +1974,7 @@
 		device->ws->cs_add_buffer(cmd_buffer->cs, device->gfx_init, 8);
 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
 		radeon_emit(cmd_buffer->cs, va);
-		radeon_emit(cmd_buffer->cs, (va >> 32) & 0xffff);
+		radeon_emit(cmd_buffer->cs, va >> 32);
 		radeon_emit(cmd_buffer->cs, device->gfx_init_size_dw & 0xffff);
 	} else
 		si_init_config(cmd_buffer);
@@ -1835,6 +1989,7 @@
 
 	memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
 	cmd_buffer->state.last_primitive_reset_en = -1;
+	cmd_buffer->usage_flags = pBeginInfo->flags;
 
 	/* setup initial configuration into command buffer */
 	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
@@ -1880,7 +2035,7 @@
 	/* We have to defer setting up vertex buffer since we need the buffer
 	 * stride from the pipeline. */
 
-	assert(firstBinding + bindingCount < MAX_VBS);
+	assert(firstBinding + bindingCount <= MAX_VBS);
 	for (uint32_t i = 0; i < bindingCount; i++) {
 		vb[firstBinding + i].buffer = radv_buffer_from_handle(pBuffers[i]);
 		vb[firstBinding + i].offset = pOffsets[i];
@@ -1895,12 +2050,16 @@
 	VkIndexType indexType)
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
 
-	cmd_buffer->state.index_buffer = radv_buffer_from_handle(buffer);
-	cmd_buffer->state.index_offset = offset;
 	cmd_buffer->state.index_type = indexType; /* vk matches hw */
+	cmd_buffer->state.index_va = cmd_buffer->device->ws->buffer_get_va(index_buffer->bo);
+	cmd_buffer->state.index_va += index_buffer->offset + offset;
+
+	int index_size_shift = cmd_buffer->state.index_type ? 2 : 1;
+	cmd_buffer->state.max_index_count = (index_buffer->size - offset) >> index_size_shift;
 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
-	cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->state.index_buffer->bo, 8);
+	cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, index_buffer->bo, 8);
 }
 
 
@@ -1910,13 +2069,13 @@
 {
 	struct radeon_winsys *ws = cmd_buffer->device->ws;
 
-	assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
-
 	cmd_buffer->state.descriptors[idx] = set;
-	cmd_buffer->state.descriptors_dirty |= (1 << idx);
+	cmd_buffer->state.descriptors_dirty |= (1u << idx);
 	if (!set)
 		return;
 
+	assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
+
 	for (unsigned j = 0; j < set->layout->buffer_count; ++j)
 		if (set->descriptors[j])
 			ws->cs_add_buffer(cmd_buffer->cs, set->descriptors[j], 7);
@@ -2023,7 +2182,7 @@
 	                            descriptorWriteCount, pDescriptorWrites, 0, NULL);
 
 	cmd_buffer->state.descriptors[set] = push_set;
-	cmd_buffer->state.descriptors_dirty |= (1 << set);
+	cmd_buffer->state.descriptors_dirty |= (1u << set);
 }
 
 void radv_CmdPushDescriptorSetKHR(
@@ -2048,7 +2207,7 @@
 	                            descriptorWriteCount, pDescriptorWrites, 0, NULL);
 
 	cmd_buffer->state.descriptors[set] = push_set;
-	cmd_buffer->state.descriptors_dirty |= (1 << set);
+	cmd_buffer->state.descriptors_dirty |= (1u << set);
 	cmd_buffer->state.push_descriptors_dirty = true;
 }
 
@@ -2072,7 +2231,7 @@
 						 descriptorUpdateTemplate, pData);
 
 	cmd_buffer->state.descriptors[set] = push_set;
-	cmd_buffer->state.descriptors_dirty |= (1 << set);
+	cmd_buffer->state.descriptors_dirty |= (1u << set);
 	cmd_buffer->state.push_descriptors_dirty = true;
 }
 
@@ -2093,8 +2252,13 @@
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
-	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
+	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
+		if (cmd_buffer->device->physical_device->rad_info.chip_class == SI)
+			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
 		si_emit_cache_flush(cmd_buffer);
+	}
+
+	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
 
 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
 	    cmd_buffer->record_fail)
@@ -2119,6 +2283,7 @@
 	va = ws->buffer_get_va(compute_shader->bo);
 
 	ws->cs_add_buffer(cmd_buffer->cs, compute_shader->bo, 8);
+	radv_emit_prefetch(cmd_buffer, va, compute_shader->code_size);
 
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
 							   cmd_buffer->cs, 16);
@@ -2180,7 +2345,6 @@
 		if (!pipeline)
 			break;
 
-		cmd_buffer->state.vertex_descriptors_dirty = true;
 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
 		cmd_buffer->push_constant_stages |= pipeline->active_stages;
 
@@ -2526,20 +2690,18 @@
 
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10);
 
-	struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
-							     AC_UD_VS_BASE_VERTEX_START_INSTANCE);
-	if (loc->sgpr_idx != -1) {
-		uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(cmd_buffer->state.pipeline),
-								radv_pipeline_has_tess(cmd_buffer->state.pipeline));
-		radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 3);
-		radeon_emit(cmd_buffer->cs, firstVertex);
-		radeon_emit(cmd_buffer->cs, firstInstance);
+	assert(cmd_buffer->state.pipeline->graphics.vtx_base_sgpr);
+	radeon_set_sh_reg_seq(cmd_buffer->cs, cmd_buffer->state.pipeline->graphics.vtx_base_sgpr,
+			      cmd_buffer->state.pipeline->graphics.vtx_emit_num);
+	radeon_emit(cmd_buffer->cs, firstVertex);
+	radeon_emit(cmd_buffer->cs, firstInstance);
+	if (cmd_buffer->state.pipeline->graphics.vtx_emit_num == 3)
 		radeon_emit(cmd_buffer->cs, 0);
-	}
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
+
+	radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, cmd_buffer->state.predicating));
 	radeon_emit(cmd_buffer->cs, instanceCount);
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0));
+	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
 	radeon_emit(cmd_buffer->cs, vertexCount);
 	radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
 		    S_0287F0_USE_OPAQUE(0));
@@ -2559,33 +2721,35 @@
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 	int index_size = cmd_buffer->state.index_type ? 4 : 2;
-	uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size;
 	uint64_t index_va;
 
 	radv_cmd_buffer_flush_state(cmd_buffer, true, (instanceCount > 1), false, indexCount);
 
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15);
+	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 16);
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
-	radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type);
-
-	struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
-							     AC_UD_VS_BASE_VERTEX_START_INSTANCE);
-	if (loc->sgpr_idx != -1) {
-		uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(cmd_buffer->state.pipeline),
-								radv_pipeline_has_tess(cmd_buffer->state.pipeline));
-		radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 3);
-		radeon_emit(cmd_buffer->cs, vertexOffset);
-		radeon_emit(cmd_buffer->cs, firstInstance);
-		radeon_emit(cmd_buffer->cs, 0);
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_03090C_VGT_INDEX_TYPE,
+					   2, cmd_buffer->state.index_type);
+	} else {
+		radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
+		radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type);
 	}
+
+	assert(cmd_buffer->state.pipeline->graphics.vtx_base_sgpr);
+	radeon_set_sh_reg_seq(cmd_buffer->cs, cmd_buffer->state.pipeline->graphics.vtx_base_sgpr,
+			      cmd_buffer->state.pipeline->graphics.vtx_emit_num);
+	radeon_emit(cmd_buffer->cs, vertexOffset);
+	radeon_emit(cmd_buffer->cs, firstInstance);
+	if (cmd_buffer->state.pipeline->graphics.vtx_emit_num == 3)
+		radeon_emit(cmd_buffer->cs, 0);
+
 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
 	radeon_emit(cmd_buffer->cs, instanceCount);
 
-	index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo);
-	index_va += firstIndex * index_size + cmd_buffer->state.index_buffer->offset + cmd_buffer->state.index_offset;
+	index_va = cmd_buffer->state.index_va;
+	index_va += firstIndex * index_size;
 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, false));
-	radeon_emit(cmd_buffer->cs, index_max_size);
+	radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count);
 	radeon_emit(cmd_buffer->cs, index_va);
 	radeon_emit(cmd_buffer->cs, (index_va >> 32UL) & 0xFF);
 	radeon_emit(cmd_buffer->cs, indexCount);
@@ -2617,37 +2781,47 @@
 	if (count_buffer) {
 		count_va = cmd_buffer->device->ws->buffer_get_va(count_buffer->bo);
 		count_va += count_offset + count_buffer->offset;
+
+		cmd_buffer->device->ws->cs_add_buffer(cs, count_buffer->bo, 8);
 	}
 
 	if (!draw_count)
 		return;
 
 	cmd_buffer->device->ws->cs_add_buffer(cs, buffer->bo, 8);
+	bool draw_id_enable = cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX]->info.info.vs.needs_draw_id;
+	uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
+	assert(base_reg);
 
-	struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
-							     AC_UD_VS_BASE_VERTEX_START_INSTANCE);
-	uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(cmd_buffer->state.pipeline),
-							radv_pipeline_has_tess(cmd_buffer->state.pipeline));
-	assert(loc->sgpr_idx != -1);
 	radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
 	radeon_emit(cs, 1);
 	radeon_emit(cs, indirect_va);
 	radeon_emit(cs, indirect_va >> 32);
 
-	radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
-				       PKT3_DRAW_INDIRECT_MULTI,
-			     8, false));
-	radeon_emit(cs, 0);
-	radeon_emit(cs, ((base_reg + loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2);
-	radeon_emit(cs, ((base_reg + (loc->sgpr_idx + 1) * 4) - SI_SH_REG_OFFSET) >> 2);
-	radeon_emit(cs, (((base_reg + (loc->sgpr_idx + 2) * 4) - SI_SH_REG_OFFSET) >> 2) |
-	                S_2C3_DRAW_INDEX_ENABLE(1) |
-	                S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
-	radeon_emit(cs, draw_count); /* count */
-	radeon_emit(cs, count_va); /* count_addr */
-	radeon_emit(cs, count_va >> 32);
-	radeon_emit(cs, stride); /* stride */
-	radeon_emit(cs, di_src_sel);
+	if (draw_count == 1 && !count_va && !draw_id_enable) {
+		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT :
+				     PKT3_DRAW_INDIRECT, 3, false));
+		radeon_emit(cs, 0);
+		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
+		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
+		radeon_emit(cs, di_src_sel);
+	} else {
+		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
+				     PKT3_DRAW_INDIRECT_MULTI,
+				     8, false));
+		radeon_emit(cs, 0);
+		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
+		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
+		radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) |
+			    S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
+			    S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
+		radeon_emit(cs, draw_count); /* count */
+		radeon_emit(cs, count_va); /* count_addr */
+		radeon_emit(cs, count_va >> 32);
+		radeon_emit(cs, stride); /* stride */
+		radeon_emit(cs, di_src_sel);
+	}
+
 	radv_cmd_buffer_trace_emit(cmd_buffer);
 }
 
@@ -2683,13 +2857,10 @@
 	uint32_t                                    stride)
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
-	int index_size = cmd_buffer->state.index_type ? 4 : 2;
-	uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size;
 	uint64_t index_va;
 	radv_cmd_buffer_flush_state(cmd_buffer, true, false, true, 0);
 
-	index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo);
-	index_va += cmd_buffer->state.index_buffer->offset + cmd_buffer->state.index_offset;
+	index_va = cmd_buffer->state.index_va;
 
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 21);
 
@@ -2701,7 +2872,7 @@
 	radeon_emit(cmd_buffer->cs, index_va >> 32);
 
 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
-	radeon_emit(cmd_buffer->cs, index_max_size);
+	radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count);
 
 	radv_emit_indirect_draw(cmd_buffer, buffer, offset,
 	                        countBuffer, countBufferOffset, maxDrawCount, stride, true);
@@ -2785,11 +2956,14 @@
 							     MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
 	if (loc->sgpr_idx != -1) {
 		assert(!loc->indirect);
-		assert(loc->num_sgprs == 3);
-		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
+		uint8_t grid_used = cmd_buffer->state.compute_pipeline->shaders[MESA_SHADER_COMPUTE]->info.info.cs.grid_components_used;
+		assert(loc->num_sgprs == grid_used);
+		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, grid_used);
 		radeon_emit(cmd_buffer->cs, x);
-		radeon_emit(cmd_buffer->cs, y);
-		radeon_emit(cmd_buffer->cs, z);
+		if (grid_used > 1)
+			radeon_emit(cmd_buffer->cs, y);
+		if (grid_used > 2)
+			radeon_emit(cmd_buffer->cs, z);
 	}
 
 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
@@ -2821,7 +2995,8 @@
 	struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
 							     MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
 	if (loc->sgpr_idx != -1) {
-		for (unsigned i = 0; i < 3; ++i) {
+		uint8_t grid_used = cmd_buffer->state.compute_pipeline->shaders[MESA_SHADER_COMPUTE]->info.info.cs.grid_components_used;
+		for (unsigned i = 0; i < grid_used; ++i) {
 			radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0));
 			radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
 				    COPY_DATA_DST_SEL(COPY_DATA_REG));
@@ -2892,10 +3067,13 @@
 	struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
 							     MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
 	if (loc->sgpr_idx != -1) {
-		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
+		uint8_t grid_used = cmd_buffer->state.compute_pipeline->shaders[MESA_SHADER_COMPUTE]->info.info.cs.grid_components_used;
+		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, grid_used);
 		radeon_emit(cmd_buffer->cs, blocks[0]);
-		radeon_emit(cmd_buffer->cs, blocks[1]);
-		radeon_emit(cmd_buffer->cs, blocks[2]);
+		if (grid_used > 1)
+			radeon_emit(cmd_buffer->cs, blocks[1]);
+		if (grid_used > 2)
+			radeon_emit(cmd_buffer->cs, blocks[2]);
 	}
 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
 		    PKT3_SHADER_TYPE_S(1));
@@ -2932,10 +3110,16 @@
 	cmd_buffer->state.framebuffer = NULL;
 }
 
-
+/*
+ * For HTILE we have the following interesting clear words:
+ *   0x0000030f: Uncompressed.
+ *   0xfffffff0: Clear depth to 1.0
+ *   0x00000000: Clear depth to 0.0
+ */
 static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer,
                                   struct radv_image *image,
-                                  const VkImageSubresourceRange *range)
+                                  const VkImageSubresourceRange *range,
+                                  uint32_t clear_word)
 {
 	assert(range->baseMipLevel == 0);
 	assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_ARRAY_LAYERS);
@@ -2947,7 +3131,7 @@
 	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
 	                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
 
-	radv_fill_buffer(cmd_buffer, image->bo, offset, size, 0xffffffff);
+	radv_fill_buffer(cmd_buffer, image->bo, offset, size, clear_word);
 
 	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META |
 	                                RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
@@ -2959,27 +3143,27 @@
 					       struct radv_image *image,
 					       VkImageLayout src_layout,
 					       VkImageLayout dst_layout,
+					       unsigned src_queue_mask,
+					       unsigned dst_queue_mask,
 					       const VkImageSubresourceRange *range,
 					       VkImageAspectFlags pending_clears)
 {
 	if (dst_layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL &&
 	    (pending_clears & vk_format_aspects(image->vk_format)) == vk_format_aspects(image->vk_format) &&
 	    cmd_buffer->state.render_area.offset.x == 0 && cmd_buffer->state.render_area.offset.y == 0 &&
-	    cmd_buffer->state.render_area.extent.width == image->extent.width &&
-	    cmd_buffer->state.render_area.extent.height == image->extent.height) {
+	    cmd_buffer->state.render_area.extent.width == image->info.width &&
+	    cmd_buffer->state.render_area.extent.height == image->info.height) {
 		/* The clear will initialize htile. */
 		return;
 	} else if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED &&
-	           radv_layout_has_htile(image, dst_layout)) {
+	           radv_layout_has_htile(image, dst_layout, dst_queue_mask)) {
 		/* TODO: merge with the clear if applicable */
-		radv_initialize_htile(cmd_buffer, image, range);
-	} else if (!radv_layout_has_htile(image, src_layout) &&
-	           radv_layout_has_htile(image, dst_layout)) {
-		radv_initialize_htile(cmd_buffer, image, range);
-	} else if ((radv_layout_has_htile(image, src_layout) &&
-	            !radv_layout_has_htile(image, dst_layout)) ||
-	           (radv_layout_is_htile_compressed(image, src_layout) &&
-	            !radv_layout_is_htile_compressed(image, dst_layout))) {
+		radv_initialize_htile(cmd_buffer, image, range, 0);
+	} else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
+	           radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
+		radv_initialize_htile(cmd_buffer, image, range, 0xffffffff);
+	} else if (radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
+	           !radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
 		VkImageSubresourceRange local_range = *range;
 		local_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
 		local_range.baseMipLevel = 0;
@@ -3095,7 +3279,9 @@
 
 	if (image->surface.htile_size)
 		radv_handle_depth_image_transition(cmd_buffer, image, src_layout,
-						   dst_layout, range, pending_clears);
+						   dst_layout, src_queue_mask,
+						   dst_queue_mask, range,
+						   pending_clears);
 
 	if (image->cmask.size)
 		radv_handle_cmask_image_transition(cmd_buffer, image, src_layout,
@@ -3173,28 +3359,17 @@
 
 	cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8);
 
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12);
+	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 18);
 
 	/* TODO: this is overkill. Probably should figure something out from
 	 * the stage mask. */
 
-	if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK) {
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) |
-				EVENT_INDEX(5));
-		radeon_emit(cs, va);
-		radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
-		radeon_emit(cs, 2);
-		radeon_emit(cs, 0);
-	}
-
-	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) |
-			EVENT_INDEX(5));
-	radeon_emit(cs, va);
-	radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
-	radeon_emit(cs, value);
-	radeon_emit(cs, 0);
+	si_cs_emit_write_event_eop(cs,
+				   cmd_buffer->state.predicating,
+				   cmd_buffer->device->physical_device->rad_info.chip_class,
+				   false,
+				   EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0,
+				   1, va, 2, value);
 
 	assert(cmd_buffer->cs->cdw <= cdw_max);
 }
@@ -3242,14 +3417,7 @@
 
 		MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
 
-		radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
-		radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
-		radeon_emit(cs, va);
-		radeon_emit(cs, va >> 32);
-		radeon_emit(cs, 1); /* reference value */
-		radeon_emit(cs, 0xffffffff); /* mask */
-		radeon_emit(cs, 4); /* poll interval */
-
+		si_emit_wait_fence(cs, false, va, 1, 0xffffffff);
 		assert(cmd_buffer->cs->cdw <= cdw_max);
 	}
 
diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
index 4d1398e..c345d04 100644
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -37,4 +37,8 @@
 	RADV_DEBUG_NO_IBS            = 0x200,
 };
 
+enum {
+	RADV_PERFTEST_BATCHCHAIN     =   0x1,
+	RADV_PERFTEST_SISCHED        =   0x2,
+};
 #endif
diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c
index 48cb8c2..d8593cf 100644
--- a/src/amd/vulkan/radv_descriptor_set.c
+++ b/src/amd/vulkan/radv_descriptor_set.c
@@ -66,6 +66,7 @@
 
 	set_layout->binding_count = max_binding + 1;
 	set_layout->shader_stages = 0;
+	set_layout->dynamic_shader_stages = 0;
 	set_layout->size = 0;
 
 	memset(set_layout->binding, 0, size - sizeof(struct radv_descriptor_set_layout));
@@ -77,6 +78,7 @@
 		const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j];
 		uint32_t b = binding->binding;
 		uint32_t alignment;
+		unsigned binding_buffer_count = 0;
 
 		switch (binding->descriptorType) {
 		case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@@ -85,7 +87,7 @@
 			set_layout->binding[b].dynamic_offset_count = 1;
 			set_layout->dynamic_shader_stages |= binding->stageFlags;
 			set_layout->binding[b].size = 0;
-			set_layout->binding[b].buffer_count = 1;
+			binding_buffer_count = 1;
 			alignment = 1;
 			break;
 		case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
@@ -93,7 +95,7 @@
 		case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
 		case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
 			set_layout->binding[b].size = 16;
-			set_layout->binding[b].buffer_count = 1;
+			binding_buffer_count = 1;
 			alignment = 16;
 			break;
 		case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
@@ -101,13 +103,13 @@
 		case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
 			/* main descriptor + fmask descriptor */
 			set_layout->binding[b].size = 64;
-			set_layout->binding[b].buffer_count = 1;
+			binding_buffer_count = 1;
 			alignment = 32;
 			break;
 		case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
 			/* main descriptor + fmask descriptor + sampler */
 			set_layout->binding[b].size = 96;
-			set_layout->binding[b].buffer_count = 1;
+			binding_buffer_count = 1;
 			alignment = 32;
 			break;
 		case VK_DESCRIPTOR_TYPE_SAMPLER:
@@ -150,7 +152,7 @@
 		}
 
 		set_layout->size += binding->descriptorCount * set_layout->binding[b].size;
-		buffer_count += binding->descriptorCount * set_layout->binding[b].buffer_count;
+		buffer_count += binding->descriptorCount * binding_buffer_count;
 		dynamic_offset_count += binding->descriptorCount *
 			set_layout->binding[b].dynamic_offset_count;
 		set_layout->shader_stages |= binding->stageFlags;
@@ -261,26 +263,29 @@
 			   struct radv_descriptor_set **out_set)
 {
 	struct radv_descriptor_set *set;
-	unsigned mem_size = sizeof(struct radv_descriptor_set) +
+	unsigned range_offset = sizeof(struct radv_descriptor_set) +
 		sizeof(struct radeon_winsys_bo *) * layout->buffer_count;
-	set = vk_alloc2(&device->alloc, NULL, mem_size, 8,
-			  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+	unsigned mem_size = range_offset +
+		sizeof(struct radv_descriptor_range) * layout->dynamic_offset_count;
 
-	if (!set)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+	if (pool->host_memory_base) {
+		if (pool->host_memory_end - pool->host_memory_ptr < mem_size)
+			return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
+
+		set = (struct radv_descriptor_set*)pool->host_memory_ptr;
+		pool->host_memory_ptr += mem_size;
+	} else {
+		set = vk_alloc2(&device->alloc, NULL, mem_size, 8,
+		                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+
+		if (!set)
+			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+	}
 
 	memset(set, 0, mem_size);
 
 	if (layout->dynamic_offset_count) {
-		unsigned size = sizeof(struct radv_descriptor_range) *
-		                layout->dynamic_offset_count;
-		set->dynamic_descriptors = vk_alloc2(&device->alloc, NULL, size, 8,
-			                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-
-		if (!set->dynamic_descriptors) {
-			vk_free2(&device->alloc, NULL, set);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-		}
+		set->dynamic_descriptors = (struct radv_descriptor_range*)((uint8_t*)set + range_offset);
 	}
 
 	set->layout = layout;
@@ -297,10 +302,12 @@
 			set->va = device->ws->buffer_get_va(set->bo) + pool->current_offset;
 			pool->current_offset += layout_size;
 			list_addtail(&set->vram_list, &pool->vram_list);
-		} else {
+		} else if (!pool->host_memory_base) {
 			uint64_t offset = 0;
 			struct list_head *prev = &pool->vram_list;
 			struct radv_descriptor_set *cur;
+
+			assert(!pool->host_memory_base);
 			LIST_FOR_EACH_ENTRY(cur, &pool->vram_list, vram_list) {
 				uint64_t start = (uint8_t*)cur->mapped_ptr - pool->mapped_ptr;
 				if (start - offset >= layout_size)
@@ -311,7 +318,6 @@
 			}
 
 			if (pool->size - offset < layout_size) {
-				vk_free2(&device->alloc, NULL, set->dynamic_descriptors);
 				vk_free2(&device->alloc, NULL, set);
 				return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
 			}
@@ -319,7 +325,8 @@
 			set->mapped_ptr = (uint32_t*)(pool->mapped_ptr + offset);
 			set->va = device->ws->buffer_get_va(set->bo) + offset;
 			list_add(&set->vram_list, prev);
-		}
+		} else
+			return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
 	}
 
 	for (unsigned i = 0; i < layout->binding_count; ++i) {
@@ -348,10 +355,10 @@
 			    struct radv_descriptor_set *set,
 			    bool free_bo)
 {
+	assert(!pool->host_memory_base);
+
 	if (free_bo && set->size)
 		list_del(&set->vram_list);
-	if (set->dynamic_descriptors)
-		vk_free2(&device->alloc, NULL, set->dynamic_descriptors);
 	vk_free2(&device->alloc, NULL, set);
 }
 
@@ -364,18 +371,17 @@
 	RADV_FROM_HANDLE(radv_device, device, _device);
 	struct radv_descriptor_pool *pool;
 	int size = sizeof(struct radv_descriptor_pool);
-	uint64_t bo_size = 0;
-	pool = vk_alloc2(&device->alloc, pAllocator, size, 8,
-			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-	if (!pool)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+	uint64_t bo_size = 0, bo_count = 0, range_count = 0;
 
-	memset(pool, 0, sizeof(*pool));
 
 	for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
+		if (pCreateInfo->pPoolSizes[i].type != VK_DESCRIPTOR_TYPE_SAMPLER)
+			bo_count += pCreateInfo->pPoolSizes[i].descriptorCount;
+
 		switch(pCreateInfo->pPoolSizes[i].type) {
 		case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
 		case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+			range_count += pCreateInfo->pPoolSizes[i].descriptorCount;
 			break;
 		case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
 		case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
@@ -399,6 +405,26 @@
 		}
 	}
 
+	if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
+		uint64_t host_size = pCreateInfo->maxSets * sizeof(struct radv_descriptor_set);
+		host_size += sizeof(struct radeon_winsys_bo*) * bo_count;
+		host_size += sizeof(struct radv_descriptor_range) * range_count;
+		size += host_size;
+	}
+
+	pool = vk_alloc2(&device->alloc, pAllocator, size, 8,
+	                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+	if (!pool)
+		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+	memset(pool, 0, sizeof(*pool));
+
+	if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
+		pool->host_memory_base = (uint8_t*)pool + sizeof(struct radv_descriptor_pool);
+		pool->host_memory_ptr = pool->host_memory_base;
+		pool->host_memory_end = (uint8_t*)pool + size;
+	}
+
 	if (bo_size) {
 		pool->bo = device->ws->buffer_create(device->ws, bo_size,
 							32, RADEON_DOMAIN_VRAM, 0);
@@ -422,9 +448,11 @@
 	if (!pool)
 		return;
 
-	list_for_each_entry_safe(struct radv_descriptor_set, set,
-				 &pool->vram_list, vram_list) {
-		radv_descriptor_set_destroy(device, pool, set, false);
+	if (!pool->host_memory_base) {
+		list_for_each_entry_safe(struct radv_descriptor_set, set,
+		                         &pool->vram_list, vram_list) {
+			radv_descriptor_set_destroy(device, pool, set, false);
+		}
 	}
 
 	if (pool->bo)
@@ -440,14 +468,17 @@
 	RADV_FROM_HANDLE(radv_device, device, _device);
 	RADV_FROM_HANDLE(radv_descriptor_pool, pool, descriptorPool);
 
-	list_for_each_entry_safe(struct radv_descriptor_set, set,
-				 &pool->vram_list, vram_list) {
-		radv_descriptor_set_destroy(device, pool, set, false);
+	if (!pool->host_memory_base) {
+		list_for_each_entry_safe(struct radv_descriptor_set, set,
+		                         &pool->vram_list, vram_list) {
+			radv_descriptor_set_destroy(device, pool, set, false);
+		}
 	}
 
 	list_inithead(&pool->vram_list);
 
 	pool->current_offset = 0;
+	pool->host_memory_ptr = pool->host_memory_base;
 
 	return VK_SUCCESS;
 }
@@ -496,7 +527,7 @@
 	for (uint32_t i = 0; i < count; i++) {
 		RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
 
-		if (set)
+		if (set && !pool->host_memory_base)
 			radv_descriptor_set_destroy(device, pool, set, true);
 	}
 	return VK_SUCCESS;
@@ -572,11 +603,18 @@
 		       struct radv_cmd_buffer *cmd_buffer,
 		       unsigned *dst,
 		       struct radeon_winsys_bo **buffer_list,
+		       VkDescriptorType descriptor_type,
 		       const VkDescriptorImageInfo *image_info)
 {
 	RADV_FROM_HANDLE(radv_image_view, iview, image_info->imageView);
-	memcpy(dst, iview->descriptor, 8 * 4);
-	memcpy(dst + 8, iview->fmask_descriptor, 8 * 4);
+
+	if (descriptor_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) {
+		memcpy(dst, iview->storage_descriptor, 8 * 4);
+		memcpy(dst + 8, iview->storage_fmask_descriptor, 8 * 4);
+	} else {
+		memcpy(dst, iview->descriptor, 8 * 4);
+		memcpy(dst + 8, iview->fmask_descriptor, 8 * 4);
+	}
 
 	if (cmd_buffer)
 		device->ws->cs_add_buffer(cmd_buffer->cs, iview->bo, 7);
@@ -589,12 +627,13 @@
 					struct radv_cmd_buffer *cmd_buffer,
 					unsigned *dst,
 					struct radeon_winsys_bo **buffer_list,
+					VkDescriptorType descriptor_type,
 					const VkDescriptorImageInfo *image_info,
 					bool has_sampler)
 {
 	RADV_FROM_HANDLE(radv_sampler, sampler, image_info->sampler);
 
-	write_image_descriptor(device, cmd_buffer, dst, buffer_list, image_info);
+	write_image_descriptor(device, cmd_buffer, dst, buffer_list, descriptor_type, image_info);
 	/* copy over sampler state */
 	if (has_sampler)
 		memcpy(dst + 16, sampler->state, 16);
@@ -639,7 +678,7 @@
 		ptr += binding_layout->offset / 4;
 		ptr += binding_layout->size * writeset->dstArrayElement / 4;
 		buffer_list += binding_layout->buffer_offset;
-		buffer_list += binding_layout->buffer_count * writeset->dstArrayElement;
+		buffer_list += writeset->dstArrayElement;
 		for (j = 0; j < writeset->descriptorCount; ++j) {
 			switch(writeset->descriptorType) {
 			case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@@ -665,10 +704,12 @@
 			case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
 			case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
 				write_image_descriptor(device, cmd_buffer, ptr, buffer_list,
+						       writeset->descriptorType,
 						       writeset->pImageInfo + j);
 				break;
 			case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
 				write_combined_image_sampler_descriptor(device, cmd_buffer, ptr, buffer_list,
+									writeset->descriptorType,
 									writeset->pImageInfo + j,
 									!binding_layout->immutable_samplers_offset);
 				if (copy_immutable_samplers) {
@@ -690,12 +731,63 @@
 				break;
 			}
 			ptr += binding_layout->size / 4;
-			buffer_list += binding_layout->buffer_count;
+			++buffer_list;
 		}
 
 	}
-	if (descriptorCopyCount)
-		radv_finishme("copy descriptors");
+
+	for (i = 0; i < descriptorCopyCount; i++) {
+		const VkCopyDescriptorSet *copyset = &pDescriptorCopies[i];
+		RADV_FROM_HANDLE(radv_descriptor_set, src_set,
+		                 copyset->srcSet);
+		RADV_FROM_HANDLE(radv_descriptor_set, dst_set,
+		                 copyset->dstSet);
+		const struct radv_descriptor_set_binding_layout *src_binding_layout =
+			src_set->layout->binding + copyset->srcBinding;
+		const struct radv_descriptor_set_binding_layout *dst_binding_layout =
+			dst_set->layout->binding + copyset->dstBinding;
+		uint32_t *src_ptr = src_set->mapped_ptr;
+		uint32_t *dst_ptr = dst_set->mapped_ptr;
+		struct radeon_winsys_bo **src_buffer_list = src_set->descriptors;
+		struct radeon_winsys_bo **dst_buffer_list = dst_set->descriptors;
+
+		src_ptr += src_binding_layout->offset / 4;
+		dst_ptr += dst_binding_layout->offset / 4;
+
+		src_ptr += src_binding_layout->size * copyset->srcArrayElement / 4;
+		dst_ptr += dst_binding_layout->size * copyset->dstArrayElement / 4;
+
+		src_buffer_list += src_binding_layout->buffer_offset;
+		src_buffer_list += copyset->srcArrayElement;
+
+		dst_buffer_list += dst_binding_layout->buffer_offset;
+		dst_buffer_list += copyset->dstArrayElement;
+
+		for (j = 0; j < copyset->descriptorCount; ++j) {
+			switch (src_binding_layout->type) {
+			case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+			case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
+				unsigned src_idx = copyset->srcArrayElement + j;
+				unsigned dst_idx = copyset->dstArrayElement + j;
+				struct radv_descriptor_range *src_range, *dst_range;
+				src_idx += src_binding_layout->dynamic_offset_offset;
+				dst_idx += dst_binding_layout->dynamic_offset_offset;
+				/* Dynamic buffers are copied via the sets' dynamic_descriptors ranges. */
+				src_range = src_set->dynamic_descriptors + src_idx;
+				dst_range = dst_set->dynamic_descriptors + dst_idx;
+				*dst_range = *src_range;
+				break;
+			}
+			default:
+				memcpy(dst_ptr, src_ptr, src_binding_layout->size);
+			}
+			src_ptr += src_binding_layout->size / 4;
+			dst_ptr += dst_binding_layout->size / 4;
+			/* Index from the fixed base only: also bumping the list pointers
+			 * made iteration j touch slot 2*j and run past the copied range. */
+			dst_buffer_list[j] = src_buffer_list[j];
+		}
+	}
 }
 
 void radv_UpdateDescriptorSets(
@@ -734,8 +826,7 @@
 		const VkDescriptorUpdateTemplateEntryKHR *entry = &pCreateInfo->pDescriptorUpdateEntries[i];
 		const struct radv_descriptor_set_binding_layout *binding_layout =
 			set_layout->binding + entry->dstBinding;
-		const uint32_t buffer_offset = binding_layout->buffer_offset +
-			binding_layout->buffer_count * entry->dstArrayElement;
+		const uint32_t buffer_offset = binding_layout->buffer_offset + entry->dstArrayElement;
 		const uint32_t *immutable_samplers = NULL;
 		uint32_t dst_offset;
 		uint32_t dst_stride;
@@ -775,7 +866,6 @@
 			.dst_offset = dst_offset,
 			.dst_stride = dst_stride,
 			.buffer_offset = buffer_offset,
-			.buffer_count = binding_layout->buffer_count,
 			.has_sampler = !binding_layout->immutable_samplers_offset,
 			.immutable_samplers = immutable_samplers
 		};
@@ -837,10 +927,12 @@
 			case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
 			case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
 				write_image_descriptor(device, cmd_buffer, pDst, buffer_list,
+						       templ->entry[i].descriptor_type,
 					               (struct VkDescriptorImageInfo *) pSrc);
 				break;
 			case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
 				write_combined_image_sampler_descriptor(device, cmd_buffer, pDst, buffer_list,
+									templ->entry[i].descriptor_type,
 									(struct VkDescriptorImageInfo *) pSrc,
 									templ->entry[i].has_sampler);
 				if (templ->entry[i].immutable_samplers)
@@ -859,7 +951,7 @@
 			}
 		        pSrc += templ->entry[i].src_stride;
 			pDst += templ->entry[i].dst_stride;
-			buffer_list += templ->entry[i].buffer_count;
+			++buffer_list;
 		}
 	}
 }
diff --git a/src/amd/vulkan/radv_descriptor_set.h b/src/amd/vulkan/radv_descriptor_set.h
index a9f4bc6..4b63992 100644
--- a/src/amd/vulkan/radv_descriptor_set.h
+++ b/src/amd/vulkan/radv_descriptor_set.h
@@ -26,7 +26,7 @@
 
 #include <vulkan/vulkan.h>
 
-#define MAX_SETS         8
+#define MAX_SETS         32
 
 struct radv_descriptor_set_binding_layout {
    VkDescriptorType type;
@@ -38,10 +38,9 @@
    uint32_t buffer_offset;
    uint16_t dynamic_offset_offset;
 
+   uint16_t dynamic_offset_count;
    /* redundant with the type, each for a single array element */
    uint32_t size;
-   uint32_t buffer_count;
-   uint16_t dynamic_offset_count;
 
    /* Offset in the radv_descriptor_set_layout of the immutable samplers, or 0
     * if there are no immutable samplers. */
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 10783eb..6b4c43c 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -33,7 +33,7 @@
 #include "radv_cs.h"
 #include "util/disk_cache.h"
 #include "util/strtod.h"
-#include "util/vk_util.h"
+#include "vk_util.h"
 #include <xf86drm.h>
 #include <amdgpu.h>
 #include <amdgpu_drm.h>
@@ -42,6 +42,7 @@
 #include "ac_llvm_util.h"
 #include "vk_format.h"
 #include "sid.h"
+#include "gfx9d.h"
 #include "util/debug.h"
 
 static int
@@ -61,6 +62,15 @@
 	return 0;
 }
 
+static void
+radv_get_device_uuid(drmDevicePtr device, void *uuid) {
+	memset(uuid, 0, VK_UUID_SIZE);
+	memcpy((char*)uuid + 0, &device->businfo.pci->domain, 2);
+	memcpy((char*)uuid + 2, &device->businfo.pci->bus, 1);
+	memcpy((char*)uuid + 3, &device->businfo.pci->dev, 1);
+	memcpy((char*)uuid + 4, &device->businfo.pci->func, 1);
+}
+
 static const VkExtensionProperties instance_extensions[] = {
 	{
 		.extensionName = VK_KHR_SURFACE_EXTENSION_NAME,
@@ -81,13 +91,21 @@
 #ifdef VK_USE_PLATFORM_WAYLAND_KHR
 	{
 		.extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME,
-		.specVersion = 5,
+		.specVersion = 6,
 	},
 #endif
 	{
 		.extensionName = VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
 		.specVersion = 1,
 	},
+	{
+		.extensionName = VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
+		.specVersion = 1,
+	},
+	{
+		.extensionName = VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME,
+		.specVersion = 1,
+	},
 };
 
 static const VkExtensionProperties common_device_extensions[] = {
@@ -124,7 +142,37 @@
 		.specVersion = 1,
 	},
 	{
-		.extensionName = VK_NV_DEDICATED_ALLOCATION_EXTENSION_NAME,
+		.extensionName = VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME,
+		.specVersion = 1,
+	},
+	{
+		.extensionName = VK_KHR_DEDICATED_ALLOCATION_EXTENSION_NAME,
+		.specVersion = 1,
+	},
+	{
+		.extensionName = VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
+		.specVersion = 1,
+	},
+	{
+		.extensionName = VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+		.specVersion = 1,
+	},
+	{
+		.extensionName = VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME,
+		.specVersion = 1,
+	},
+	{
+		.extensionName = VK_KHR_VARIABLE_POINTERS_EXTENSION_NAME,
+		.specVersion = 1,
+	},
+};
+static const VkExtensionProperties ext_sema_device_extensions[] = {
+	{
+		.extensionName = VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME,
+		.specVersion = 1,
+	},
+	{
+		.extensionName = VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
 		.specVersion = 1,
 	},
 };
@@ -187,11 +235,109 @@
 	return false;
 }
 
+static const char *
+get_chip_name(enum radeon_family family)
+{
+	switch (family) {
+	case CHIP_TAHITI: return "AMD RADV TAHITI";
+	case CHIP_PITCAIRN: return "AMD RADV PITCAIRN";
+	case CHIP_VERDE: return "AMD RADV CAPE VERDE";
+	case CHIP_OLAND: return "AMD RADV OLAND";
+	case CHIP_HAINAN: return "AMD RADV HAINAN";
+	case CHIP_BONAIRE: return "AMD RADV BONAIRE";
+	case CHIP_KAVERI: return "AMD RADV KAVERI";
+	case CHIP_KABINI: return "AMD RADV KABINI";
+	case CHIP_HAWAII: return "AMD RADV HAWAII";
+	case CHIP_MULLINS: return "AMD RADV MULLINS";
+	case CHIP_TONGA: return "AMD RADV TONGA";
+	case CHIP_ICELAND: return "AMD RADV ICELAND";
+	case CHIP_CARRIZO: return "AMD RADV CARRIZO";
+	case CHIP_FIJI: return "AMD RADV FIJI";
+	case CHIP_POLARIS10: return "AMD RADV POLARIS10";
+	case CHIP_POLARIS11: return "AMD RADV POLARIS11";
+	case CHIP_POLARIS12: return "AMD RADV POLARIS12";
+	case CHIP_STONEY: return "AMD RADV STONEY";
+	case CHIP_VEGA10: return "AMD RADV VEGA";
+	case CHIP_RAVEN: return "AMD RADV RAVEN";
+	default: return "AMD RADV unknown";
+	}
+}
+
+static void
+radv_physical_device_init_mem_types(struct radv_physical_device *device)
+{
+	STATIC_ASSERT(RADV_MEM_HEAP_COUNT <= VK_MAX_MEMORY_HEAPS);
+	uint64_t visible_vram_size = MIN2(device->rad_info.vram_size,
+	                                  device->rad_info.vram_vis_size);
+
+	int vram_index = -1, visible_vram_index = -1, gart_index = -1;
+	device->memory_properties.memoryHeapCount = 0;
+	if (device->rad_info.vram_size - visible_vram_size > 0) {
+		vram_index = device->memory_properties.memoryHeapCount++;
+		device->memory_properties.memoryHeaps[vram_index] = (VkMemoryHeap) {
+			.size = device->rad_info.vram_size - visible_vram_size,
+			.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
+		};
+	}
+	if (visible_vram_size) {
+		visible_vram_index = device->memory_properties.memoryHeapCount++;
+		device->memory_properties.memoryHeaps[visible_vram_index] = (VkMemoryHeap) {
+			.size = visible_vram_size,
+			.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
+		};
+	}
+	if (device->rad_info.gart_size > 0) {
+		gart_index = device->memory_properties.memoryHeapCount++;
+		device->memory_properties.memoryHeaps[gart_index] = (VkMemoryHeap) {
+			.size = device->rad_info.gart_size,
+			.flags = 0,
+		};
+	}
+
+	STATIC_ASSERT(RADV_MEM_TYPE_COUNT <= VK_MAX_MEMORY_TYPES);
+	unsigned type_count = 0;
+	if (vram_index >= 0) {
+		device->mem_type_indices[type_count] = RADV_MEM_TYPE_VRAM;
+		device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
+			.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+			.heapIndex = vram_index,
+		};
+	}
+	if (gart_index >= 0) {
+		device->mem_type_indices[type_count] = RADV_MEM_TYPE_GTT_WRITE_COMBINE;
+		device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
+			.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+			VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+			.heapIndex = gart_index,
+		};
+	}
+	if (visible_vram_index >= 0) {
+		device->mem_type_indices[type_count] = RADV_MEM_TYPE_VRAM_CPU_ACCESS;
+		device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
+			.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+			VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+			VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+			.heapIndex = visible_vram_index,
+		};
+	}
+	if (gart_index >= 0) {
+		device->mem_type_indices[type_count] = RADV_MEM_TYPE_GTT_CACHED;
+		device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
+			.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+			VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+			VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+			.heapIndex = gart_index,
+		};
+	}
+	device->memory_properties.memoryTypeCount = type_count;
+}
+
 static VkResult
 radv_physical_device_init(struct radv_physical_device *device,
 			  struct radv_instance *instance,
-			  const char *path)
+			  drmDevicePtr drm_device)
 {
+	const char *path = drm_device->nodes[DRM_NODE_RENDER];
 	VkResult result;
 	drmVersionPtr version;
 	int fd;
@@ -219,7 +365,8 @@
 	assert(strlen(path) < ARRAY_SIZE(device->path));
 	strncpy(device->path, path, ARRAY_SIZE(device->path));
 
-	device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags);
+	device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags,
+					       instance->perftest_flags);
 	if (!device->ws) {
 		result = VK_ERROR_INCOMPATIBLE_DRIVER;
 		goto fail;
@@ -248,9 +395,27 @@
 	if (result != VK_SUCCESS)
 		goto fail;
 
-	fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n");
-	device->name = device->rad_info.name;
+	if (device->rad_info.has_syncobj) {
+		result = radv_extensions_register(instance,
+						  &device->extensions,
+						  ext_sema_device_extensions,
+						  ARRAY_SIZE(ext_sema_device_extensions));
+		if (result != VK_SUCCESS)
+			goto fail;
+	}
 
+	fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n");
+	device->name = get_chip_name(device->rad_info.family);
+
+	radv_get_device_uuid(drm_device, device->device_uuid);
+
+	if (device->rad_info.family == CHIP_STONEY ||
+	    device->rad_info.chip_class >= GFX9) {
+		device->has_rbplus = true;
+		device->rbplus_allowed = device->rad_info.family == CHIP_STONEY;
+	}
+
+	radv_physical_device_init_mem_types(device);
 	return VK_SUCCESS;
 
 fail:
@@ -267,7 +432,6 @@
 	close(device->local_fd);
 }
 
-
 static void *
 default_alloc_func(void *pUserData, size_t size, size_t align,
                    VkSystemAllocationScope allocationScope)
@@ -309,6 +473,12 @@
 	{NULL, 0}
 };
 
+static const struct debug_control radv_perftest_options[] = {
+	{"batchchain", RADV_PERFTEST_BATCHCHAIN},
+	{"sisched", RADV_PERFTEST_SISCHED},
+	{NULL, 0}
+};
+
 VkResult radv_CreateInstance(
 	const VkInstanceCreateInfo*                 pCreateInfo,
 	const VkAllocationCallbacks*                pAllocator,
@@ -366,6 +536,9 @@
 	instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"),
 						   radv_debug_options);
 
+	instance->perftest_flags = parse_debug_string(getenv("RADV_PERFTEST"),
+						   radv_perftest_options);
+
 	*pInstance = radv_instance_to_handle(instance);
 
 	return VK_SUCCESS;
@@ -413,7 +586,7 @@
 			result = radv_physical_device_init(instance->physicalDevices +
 			                                   instance->physicalDeviceCount,
 			                                   instance,
-			                                   devices[i]->nodes[DRM_NODE_RENDER]);
+			                                   devices[i]);
 			if (result == VK_SUCCESS)
 				++instance->physicalDeviceCount;
 			else if (result != VK_ERROR_INCOMPATIBLE_DRIVER)
@@ -456,8 +629,8 @@
 	VkPhysicalDevice                            physicalDevice,
 	VkPhysicalDeviceFeatures*                   pFeatures)
 {
-	//   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
-
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
+	bool is_gfx9 = pdevice->rad_info.chip_class >= GFX9;
 	memset(pFeatures, 0, sizeof(*pFeatures));
 
 	*pFeatures = (VkPhysicalDeviceFeatures) {
@@ -465,9 +638,9 @@
 		.fullDrawIndexUint32                      = true,
 		.imageCubeArray                           = true,
 		.independentBlend                         = true,
-		.geometryShader                           = true,
-		.tessellationShader                       = true,
-		.sampleRateShading                        = false,
+		.geometryShader                           = !is_gfx9,
+		.tessellationShader                       = !is_gfx9,
+		.sampleRateShading                        = true,
 		.dualSrcBlend                             = true,
 		.logicOp                                  = true,
 		.multiDrawIndirect                        = true,
@@ -501,7 +674,7 @@
 		.shaderClipDistance                       = true,
 		.shaderCullDistance                       = true,
 		.shaderFloat64                            = true,
-		.shaderInt64                              = false,
+		.shaderInt64                              = true,
 		.shaderInt16                              = false,
 		.sparseBinding                            = true,
 		.variableMultisampleRate                  = true,
@@ -513,29 +686,19 @@
 	VkPhysicalDevice                            physicalDevice,
 	VkPhysicalDeviceFeatures2KHR               *pFeatures)
 {
-	return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
-}
-
-static uint32_t radv_get_driver_version()
-{
-	const char *minor_string = strchr(VERSION, '.');
-	const char *patch_string = minor_string ? strchr(minor_string + 1, ','): NULL;
-	int major = atoi(VERSION);
-	int minor = minor_string ? atoi(minor_string + 1) : 0;
-	int patch = patch_string ? atoi(patch_string + 1) : 0;
-	if (strstr(VERSION, "devel")) {
-		if (patch == 0) {
-			patch = 99;
-			if (minor == 0) {
-				minor = 99;
-				--major;
-			} else
-				--minor;
-		} else
-			--patch;
+	vk_foreach_struct(ext, pFeatures->pNext) {
+		switch (ext->sType) {
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES_KHR: {
+			VkPhysicalDeviceVariablePointerFeaturesKHR *features = (void *)ext;
+			features->variablePointersStorageBuffer = true;
+			features->variablePointers = false;
+			break;
+		}
+		default:
+			break;
+		}
 	}
-	uint32_t version = VK_MAKE_VERSION(major, minor, patch);
-	return version;
+	return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
 }
 
 void radv_GetPhysicalDeviceProperties(
@@ -654,7 +817,7 @@
 		.sampledImageStencilSampleCounts          = sample_counts,
 		.storageImageSampleCounts                 = VK_SAMPLE_COUNT_1_BIT,
 		.maxSampleMaskWords                       = 1,
-		.timestampComputeAndGraphics              = false,
+		.timestampComputeAndGraphics              = true,
 		.timestampPeriod                          = 1000000.0 / pdevice->rad_info.clock_crystal_freq,
 		.maxClipDistances                         = 8,
 		.maxCullDistances                         = 8,
@@ -673,7 +836,7 @@
 
 	*pProperties = (VkPhysicalDeviceProperties) {
 		.apiVersion = VK_MAKE_VERSION(1, 0, 42),
-		.driverVersion = radv_get_driver_version(),
+		.driverVersion = vk_get_driver_version(),
 		.vendorID = 0x1002,
 		.deviceID = pdevice->rad_info.pci_id,
 		.deviceType = pdevice->rad_info.has_dedicated_vram ? VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU : VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
@@ -689,6 +852,7 @@
 	VkPhysicalDevice                            physicalDevice,
 	VkPhysicalDeviceProperties2KHR             *pProperties)
 {
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
 	radv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties);
 
 	vk_foreach_struct(ext, pProperties->pNext) {
@@ -699,6 +863,13 @@
 			properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
 			break;
 		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR: {
+			VkPhysicalDeviceIDPropertiesKHR *properties = (VkPhysicalDeviceIDPropertiesKHR*)ext;
+			radv_device_get_cache_uuid(0, properties->driverUUID);
+			memcpy(properties->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
+			properties->deviceLUIDValid = false;
+			break;
+		}
 		default:
 			break;
 		}
@@ -712,7 +883,7 @@
 {
 	int num_queue_families = 1;
 	int idx;
-	if (pdevice->rad_info.compute_rings > 0 &&
+	if (pdevice->rad_info.num_compute_rings > 0 &&
 	    pdevice->rad_info.chip_class >= CIK &&
 	    !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE))
 		num_queue_families++;
@@ -739,7 +910,7 @@
 		idx++;
 	}
 
-	if (pdevice->rad_info.compute_rings > 0 &&
+	if (pdevice->rad_info.num_compute_rings > 0 &&
 	    pdevice->rad_info.chip_class >= CIK &&
 	    !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
 		if (*pCount > idx) {
@@ -747,7 +918,7 @@
 				.queueFlags = VK_QUEUE_COMPUTE_BIT |
 				              VK_QUEUE_TRANSFER_BIT |
 				              VK_QUEUE_SPARSE_BINDING_BIT,
-				.queueCount = pdevice->rad_info.compute_rings,
+				.queueCount = pdevice->rad_info.num_compute_rings,
 				.timestampValidBits = 64,
 				.minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
 			};
@@ -801,47 +972,7 @@
 {
 	RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice);
 
-	STATIC_ASSERT(RADV_MEM_TYPE_COUNT <= VK_MAX_MEMORY_TYPES);
-
-	pMemoryProperties->memoryTypeCount = RADV_MEM_TYPE_COUNT;
-	pMemoryProperties->memoryTypes[RADV_MEM_TYPE_VRAM] = (VkMemoryType) {
-		.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
-		.heapIndex = RADV_MEM_HEAP_VRAM,
-	};
-	pMemoryProperties->memoryTypes[RADV_MEM_TYPE_GTT_WRITE_COMBINE] = (VkMemoryType) {
-		.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-		VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
-		.heapIndex = RADV_MEM_HEAP_GTT,
-	};
-	pMemoryProperties->memoryTypes[RADV_MEM_TYPE_VRAM_CPU_ACCESS] = (VkMemoryType) {
-		.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
-		VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-		VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
-		.heapIndex = RADV_MEM_HEAP_VRAM_CPU_ACCESS,
-	};
-	pMemoryProperties->memoryTypes[RADV_MEM_TYPE_GTT_CACHED] = (VkMemoryType) {
-		.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-		VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
-		VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
-		.heapIndex = RADV_MEM_HEAP_GTT,
-	};
-
-	STATIC_ASSERT(RADV_MEM_HEAP_COUNT <= VK_MAX_MEMORY_HEAPS);
-
-	pMemoryProperties->memoryHeapCount = RADV_MEM_HEAP_COUNT;
-	pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_VRAM] = (VkMemoryHeap) {
-		.size = physical_device->rad_info.vram_size -
-				physical_device->rad_info.visible_vram_size,
-		.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
-	};
-	pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_VRAM_CPU_ACCESS] = (VkMemoryHeap) {
-		.size = physical_device->rad_info.visible_vram_size,
-		.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
-	};
-	pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_GTT] = (VkMemoryHeap) {
-		.size = physical_device->rad_info.gart_size,
-		.flags = 0,
-	};
+	*pMemoryProperties = physical_device->memory_properties;
 }
 
 void radv_GetPhysicalDeviceMemoryProperties2KHR(
@@ -918,6 +1049,8 @@
 	case CHIP_POLARIS10:
 	case CHIP_POLARIS11:
 	case CHIP_POLARIS12:
+	case CHIP_VEGA10:
+	case CHIP_RAVEN:
 		device->gs_table_depth = 32;
 		return;
 	default:
@@ -942,6 +1075,19 @@
 			return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
 	}
 
+	/* Check enabled features */
+	if (pCreateInfo->pEnabledFeatures) {
+		VkPhysicalDeviceFeatures supported_features;
+		radv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features);
+		VkBool32 *supported_feature = (VkBool32 *)&supported_features;
+		VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures;
+		unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
+		for (uint32_t i = 0; i < num_features; i++) {
+			if (enabled_feature[i] && !supported_feature[i])
+				return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+		}
+	}
+
 	device = vk_alloc2(&physical_device->instance->alloc, pAllocator,
 			     sizeof(*device), 8,
 			     VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -1040,7 +1186,9 @@
 		case RADV_QUEUE_GENERAL:
 		case RADV_QUEUE_COMPUTE:
 			si_cs_emit_cache_flush(device->flush_cs[family],
+					       false,
 			                       device->physical_device->rad_info.chip_class,
+					       NULL, 0,
 			                       family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
 			                       RADV_CMD_FLAG_INV_ICACHE |
 			                       RADV_CMD_FLAG_INV_SMEM_L1 |
@@ -1055,7 +1203,9 @@
 		case RADV_QUEUE_GENERAL:
 		case RADV_QUEUE_COMPUTE:
 			si_cs_emit_cache_flush(device->flush_shader_cs[family],
+					       false,
 			                       device->physical_device->rad_info.chip_class,
+					       NULL, 0,
 			                       family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
 					       family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) |
 			                       RADV_CMD_FLAG_INV_ICACHE |
@@ -1418,11 +1568,10 @@
 		max_offchip_buffers = MIN2(max_offchip_buffers, 126);
 		break;
 	case CIK:
-		max_offchip_buffers = MIN2(max_offchip_buffers, 508);
-		break;
 	case VI:
+	case GFX9:
 	default:
-		max_offchip_buffers = MIN2(max_offchip_buffers, 512);
+		max_offchip_buffers = MIN2(max_offchip_buffers, 508);
 		break;
 	}
 
@@ -1659,6 +1808,10 @@
 						       S_030938_SIZE(tess_factor_ring_size / 4));
 				radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE,
 						       tf_va >> 8);
+				if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
+					radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI,
+							       tf_va >> 40);
+				}
 				radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param);
 			} else {
 				radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE,
@@ -1701,7 +1854,9 @@
 
 		if (!i) {
 			si_cs_emit_cache_flush(cs,
+					       false,
 			                       queue->device->physical_device->rad_info.chip_class,
+					       NULL, 0,
 			                       queue->queue_family_index == RING_COMPUTE &&
 			                         queue->device->physical_device->rad_info.chip_class >= CIK,
 			                       RADV_CMD_FLAG_INV_ICACHE |
@@ -1796,6 +1951,89 @@
 	return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 }
 
+/* Sort sems[] into counts: syncobj handles (a temp_syncobj takes precedence
+ * over syncobj) vs. winsys semaphores. counts must arrive zeroed. */
+static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
+				      int num_sems,
+				      const VkSemaphore *sems,
+				      bool reset_temp)
+{
+	int syncobj_idx = 0, sem_idx = 0;
+
+	if (num_sems == 0)
+		return VK_SUCCESS;
+	for (uint32_t i = 0; i < num_sems; i++) {
+		RADV_FROM_HANDLE(radv_semaphore, sem, sems[i]);
+		if (sem->temp_syncobj || sem->syncobj)
+			counts->syncobj_count++;
+		else
+			counts->sem_count++;
+	}
+
+	if (counts->syncobj_count) {
+		counts->syncobj = (uint32_t *)malloc(sizeof(uint32_t) * counts->syncobj_count);
+		if (!counts->syncobj)
+			return VK_ERROR_OUT_OF_HOST_MEMORY;
+	}
+
+	if (counts->sem_count) {
+		counts->sem = (struct radeon_winsys_sem **)malloc(sizeof(struct radeon_winsys_sem *) * counts->sem_count);
+		if (!counts->sem) {
+			free(counts->syncobj);
+			/* radv_alloc_sem_info() may still call radv_free_sem_info(). */
+			counts->syncobj = NULL;
+			return VK_ERROR_OUT_OF_HOST_MEMORY;
+		}
+	}
+
+	for (uint32_t i = 0; i < num_sems; i++) {
+		RADV_FROM_HANDLE(radv_semaphore, sem, sems[i]);
+		if (sem->temp_syncobj) {
+			counts->syncobj[syncobj_idx++] = sem->temp_syncobj;
+			/* after we wait on a temp import - drop it */
+			if (reset_temp)
+				sem->temp_syncobj = 0;
+		} else if (sem->syncobj)
+			counts->syncobj[syncobj_idx++] = sem->syncobj;
+		else {
+			assert(sem->sem);
+			counts->sem[sem_idx++] = sem->sem;
+		}
+	}
+
+	return VK_SUCCESS;
+}
+
+void radv_free_sem_info(struct radv_winsys_sem_info *sem_info)
+{
+	free(sem_info->wait.syncobj);
+	free(sem_info->wait.sem);
+	free(sem_info->signal.syncobj);
+	free(sem_info->signal.sem);
+}
+
+/* Fill sem_info with the wait and signal semaphore lists of one submission. */
+VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
+			     int num_wait_sems,
+			     const VkSemaphore *wait_sems,
+			     int num_signal_sems,
+			     const VkSemaphore *signal_sems)
+{
+	memset(sem_info, 0, sizeof(*sem_info));
+
+	VkResult result = radv_alloc_sem_counts(&sem_info->wait, num_wait_sems, wait_sems, true);
+	if (result != VK_SUCCESS)
+		return result;
+	result = radv_alloc_sem_counts(&sem_info->signal, num_signal_sems, signal_sems, false);
+	if (result != VK_SUCCESS)
+		radv_free_sem_info(sem_info);
+
+	/* Defaults only; callers may clear either flag before submitting. */
+	sem_info->cs_emit_wait = true;
+	sem_info->cs_emit_signal = true;
+	return result;
+}
+
 VkResult radv_QueueSubmit(
 	VkQueue                                     _queue,
 	uint32_t                                    submitCount,
@@ -1846,16 +2084,22 @@
 		bool do_flush = !i || pSubmits[i].pWaitDstStageMask;
 		bool can_patch = !do_flush;
 		uint32_t advance;
+		struct radv_winsys_sem_info sem_info;
+
+		result = radv_alloc_sem_info(&sem_info,
+					     pSubmits[i].waitSemaphoreCount,
+					     pSubmits[i].pWaitSemaphores,
+					     pSubmits[i].signalSemaphoreCount,
+					     pSubmits[i].pSignalSemaphores);
+		if (result != VK_SUCCESS)
+			return result;
 
 		if (!pSubmits[i].commandBufferCount) {
 			if (pSubmits[i].waitSemaphoreCount || pSubmits[i].signalSemaphoreCount) {
 				ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
 								   &queue->device->empty_cs[queue->queue_family_index],
 								   1, NULL, NULL,
-								   (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
-								   pSubmits[i].waitSemaphoreCount,
-								   (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
-								   pSubmits[i].signalSemaphoreCount,
+								   &sem_info,
 								   false, base_fence);
 				if (ret) {
 					radv_loge("failed to submit CS %d\n", i);
@@ -1863,6 +2107,7 @@
 				}
 				fence_emitted = true;
 			}
+			radv_free_sem_info(&sem_info);
 			continue;
 		}
 
@@ -1887,18 +2132,16 @@
 		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount + do_flush; j += advance) {
 			advance = MIN2(max_cs_submission,
 				       pSubmits[i].commandBufferCount + do_flush - j);
-			bool b = j == 0;
-			bool e = j + advance == pSubmits[i].commandBufferCount + do_flush;
 
 			if (queue->device->trace_bo)
 				*queue->device->trace_id_ptr = 0;
 
+			sem_info.cs_emit_wait = j == 0;
+			sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount + do_flush;
+
 			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
 							advance, initial_preamble_cs, continue_preamble_cs,
-							(struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
-							b ? pSubmits[i].waitSemaphoreCount : 0,
-							(struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
-							e ? pSubmits[i].signalSemaphoreCount : 0,
+							   &sem_info,
 							can_patch, base_fence);
 
 			if (ret) {
@@ -1919,16 +2162,19 @@
 				}
 			}
 		}
+
+		radv_free_sem_info(&sem_info);
 		free(cs_array);
 	}
 
 	if (fence) {
-		if (!fence_emitted)
+		if (!fence_emitted) {
+			struct radv_winsys_sem_info sem_info = {0};
 			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
 							   &queue->device->empty_cs[queue->queue_family_index],
-							   1, NULL, NULL, NULL, 0, NULL, 0,
+							   1, NULL, NULL, &sem_info,
 							   false, base_fence);
-
+		}
 		fence->submitted = true;
 	}
 
@@ -2015,7 +2261,8 @@
 	VkResult result;
 	enum radeon_bo_domain domain;
 	uint32_t flags = 0;
-	const VkDedicatedAllocationMemoryAllocateInfoNV *dedicate_info = NULL;
+	enum radv_mem_type mem_type_index = device->physical_device->mem_type_indices[pAllocateInfo->memoryTypeIndex];
+
 	assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO);
 
 	if (pAllocateInfo->allocationSize == 0) {
@@ -2024,15 +2271,10 @@
 		return VK_SUCCESS;
 	}
 
-	vk_foreach_struct(ext, pAllocateInfo->pNext) {
-		switch (ext->sType) {
-		case VK_STRUCTURE_TYPE_DEDICATED_ALLOCATION_MEMORY_ALLOCATE_INFO_NV:
-			dedicate_info = (const VkDedicatedAllocationMemoryAllocateInfoNV *)ext;
-			break;
-		default:
-			break;
-		}
-	}
+	const VkImportMemoryFdInfoKHR *import_info =
+		vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR);
+	const VkMemoryDedicatedAllocateInfoKHR *dedicate_info =
+		vk_find_struct_const(pAllocateInfo->pNext, MEMORY_DEDICATED_ALLOCATE_INFO_KHR);
 
 	mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8,
 			  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -2047,30 +2289,44 @@
 		mem->buffer = NULL;
 	}
 
+	if (import_info) {
+		assert(import_info->handleType ==
+		       VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR);
+		mem->bo = device->ws->buffer_from_fd(device->ws, import_info->fd,
+						     NULL, NULL);
+		if (!mem->bo) {
+			result = VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR;
+			goto fail;
+		} else {
+			close(import_info->fd);
+			goto out_success;
+		}
+	}
+
 	uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
-	if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
-	    pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_CACHED)
+	if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
+	    mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
 		domain = RADEON_DOMAIN_GTT;
 	else
 		domain = RADEON_DOMAIN_VRAM;
 
-	if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_VRAM)
+	if (mem_type_index == RADV_MEM_TYPE_VRAM)
 		flags |= RADEON_FLAG_NO_CPU_ACCESS;
 	else
 		flags |= RADEON_FLAG_CPU_ACCESS;
 
-	if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
+	if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
 		flags |= RADEON_FLAG_GTT_WC;
 
-	mem->bo = device->ws->buffer_create(device->ws, alloc_size, 65536,
+	mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
 					       domain, flags);
 
 	if (!mem->bo) {
 		result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
 		goto fail;
 	}
-	mem->type_index = pAllocateInfo->memoryTypeIndex;
-
+	mem->type_index = mem_type_index;
+out_success:
 	*pMem = radv_device_memory_to_handle(mem);
 
 	return VK_SUCCESS;
@@ -2153,13 +2409,14 @@
 }
 
 void radv_GetBufferMemoryRequirements(
-	VkDevice                                    device,
+	VkDevice                                    _device,
 	VkBuffer                                    _buffer,
 	VkMemoryRequirements*                       pMemoryRequirements)
 {
+	RADV_FROM_HANDLE(radv_device, device, _device);
 	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
 
-	pMemoryRequirements->memoryTypeBits = (1u << RADV_MEM_TYPE_COUNT) - 1;
+	pMemoryRequirements->memoryTypeBits = (1u << device->physical_device->memory_properties.memoryTypeCount) - 1;
 
 	if (buffer->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT)
 		pMemoryRequirements->alignment = 4096;
@@ -2169,19 +2426,68 @@
 	pMemoryRequirements->size = align64(buffer->size, pMemoryRequirements->alignment);
 }
 
+/* Extensible variant of radv_GetBufferMemoryRequirements: fills the core
+ * requirements, then answers any VkMemoryDedicatedRequirementsKHR in pNext. */
+void radv_GetBufferMemoryRequirements2KHR(
+	VkDevice                                     device,
+	const VkBufferMemoryRequirementsInfo2KHR*    pInfo,
+	VkMemoryRequirements2KHR*                    pMemoryRequirements)
+{
+	radv_GetBufferMemoryRequirements(device, pInfo->buffer,
+					 &pMemoryRequirements->memoryRequirements);
+
+	vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+		/* Only the dedicated-allocation query is recognised; other structs are skipped. */
+		if (ext->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR) {
+			VkMemoryDedicatedRequirementsKHR *dedicated =
+				(VkMemoryDedicatedRequirementsKHR *) ext;
+
+			/* Buffers neither require nor prefer a dedicated allocation. */
+			dedicated->requiresDedicatedAllocation = false;
+			dedicated->prefersDedicatedAllocation = false;
+		}
+	}
+}
+
 void radv_GetImageMemoryRequirements(
-	VkDevice                                    device,
+	VkDevice                                    _device,
 	VkImage                                     _image,
 	VkMemoryRequirements*                       pMemoryRequirements)
 {
+	RADV_FROM_HANDLE(radv_device, device, _device);
 	RADV_FROM_HANDLE(radv_image, image, _image);
 
-	pMemoryRequirements->memoryTypeBits = (1u << RADV_MEM_TYPE_COUNT) - 1;
+	pMemoryRequirements->memoryTypeBits = (1u << device->physical_device->memory_properties.memoryTypeCount) - 1;
 
 	pMemoryRequirements->size = image->size;
 	pMemoryRequirements->alignment = image->alignment;
 }
 
+/* Extensible variant of radv_GetImageMemoryRequirements: fills the core
+ * requirements, then answers any VkMemoryDedicatedRequirementsKHR in pNext. */
+void radv_GetImageMemoryRequirements2KHR(
+	VkDevice                                    device,
+	const VkImageMemoryRequirementsInfo2KHR*    pInfo,
+	VkMemoryRequirements2KHR*                   pMemoryRequirements)
+{
+	RADV_FROM_HANDLE(radv_image, image, pInfo->image);
+
+	radv_GetImageMemoryRequirements(device, pInfo->image,
+					&pMemoryRequirements->memoryRequirements);
+
+	vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+		/* Only the dedicated-allocation query is recognised; other structs are skipped. */
+		if (ext->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR) {
+			VkMemoryDedicatedRequirementsKHR *dedicated =
+				(VkMemoryDedicatedRequirementsKHR *) ext;
+
+			/* Both answers follow the image's shareable flag. */
+			dedicated->requiresDedicatedAllocation = image->shareable;
+			dedicated->prefersDedicatedAllocation = dedicated->requiresDedicatedAllocation;
+		}
+	}
+}
+
 void radv_GetImageSparseMemoryRequirements(
 	VkDevice                                    device,
 	VkImage                                     image,
@@ -2191,6 +2497,15 @@
 	stub();
 }
 
+void radv_GetImageSparseMemoryRequirements2KHR(
+	VkDevice                                    device,
+	const VkImageSparseMemoryRequirementsInfo2KHR* pInfo,
+	uint32_t*                                   pSparseMemoryRequirementCount,
+	VkSparseImageMemoryRequirements2KHR*            pSparseMemoryRequirements)
+{
+	stub(); /* placeholder body: no parameter is read or written here — NOTE(review): confirm what stub() expands to */
+}
+
 void radv_GetDeviceMemoryCommitment(
 	VkDevice                                    device,
 	VkDeviceMemory                              memory,
@@ -2292,6 +2607,7 @@
 	bool fence_emitted = false;
 
 	for (uint32_t i = 0; i < bindInfoCount; ++i) {
+		struct radv_winsys_sem_info sem_info;
 		for (uint32_t j = 0; j < pBindInfo[i].bufferBindCount; ++j) {
 			radv_sparse_buffer_bind_memory(queue->device,
 			                               pBindInfo[i].pBufferBinds + j);
@@ -2302,19 +2618,28 @@
 			                                     pBindInfo[i].pImageOpaqueBinds + j);
 		}
 
+		VkResult result;
+		result = radv_alloc_sem_info(&sem_info,
+					     pBindInfo[i].waitSemaphoreCount,
+					     pBindInfo[i].pWaitSemaphores,
+					     pBindInfo[i].signalSemaphoreCount,
+					     pBindInfo[i].pSignalSemaphores);
+		if (result != VK_SUCCESS)
+			return result;
+
 		if (pBindInfo[i].waitSemaphoreCount || pBindInfo[i].signalSemaphoreCount) {
 			queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
 			                             &queue->device->empty_cs[queue->queue_family_index],
 			                             1, NULL, NULL,
-			                             (struct radeon_winsys_sem **)pBindInfo[i].pWaitSemaphores,
-			                             pBindInfo[i].waitSemaphoreCount,
-			                             (struct radeon_winsys_sem **)pBindInfo[i].pSignalSemaphores,
-			                             pBindInfo[i].signalSemaphoreCount,
+						     &sem_info,
 			                             false, base_fence);
 			fence_emitted = true;
 			if (fence)
 				fence->submitted = true;
 		}
+
+		radv_free_sem_info(&sem_info);
+
 	}
 
 	if (fence && !fence_emitted) {
@@ -2451,13 +2776,38 @@
 	VkSemaphore*                                pSemaphore)
 {
 	RADV_FROM_HANDLE(radv_device, device, _device);
-	struct radeon_winsys_sem *sem;
+	const VkExportSemaphoreCreateInfoKHR *export =
+		vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO_KHR);
+	VkExternalSemaphoreHandleTypeFlagsKHR handleTypes =
+		export ? export->handleTypes : 0;
 
-	sem = device->ws->create_sem(device->ws);
+	struct radv_semaphore *sem = vk_alloc2(&device->alloc, pAllocator,
+					       sizeof(*sem), 8,
+					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!sem)
 		return VK_ERROR_OUT_OF_HOST_MEMORY;
 
-	*pSemaphore = radeon_winsys_sem_to_handle(sem);
+	sem->temp_syncobj = 0;
+	/* A semaphore that will be exported is backed by a syncobj; any other uses a winsys semaphore. */
+	if (handleTypes) {
+		assert (device->physical_device->rad_info.has_syncobj);
+		assert (handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR);
+		int ret = device->ws->create_syncobj(device->ws, &sem->syncobj);
+		if (ret) {
+			vk_free2(&device->alloc, pAllocator, sem);
+			return VK_ERROR_OUT_OF_HOST_MEMORY;
+		}
+		sem->sem = NULL;
+	} else {
+		sem->sem = device->ws->create_sem(device->ws);
+		if (!sem->sem) {
+			vk_free2(&device->alloc, pAllocator, sem);
+			return VK_ERROR_OUT_OF_HOST_MEMORY;
+		}
+		sem->syncobj = 0;
+	}
+
+	*pSemaphore = radv_semaphore_to_handle(sem);
 	return VK_SUCCESS;
 }
 
@@ -2467,11 +2817,15 @@
 	const VkAllocationCallbacks*                pAllocator)
 {
 	RADV_FROM_HANDLE(radv_device, device, _device);
-	RADV_FROM_HANDLE(radeon_winsys_sem, sem, _semaphore);
+	RADV_FROM_HANDLE(radv_semaphore, sem, _semaphore);
 	if (!_semaphore)
 		return;
 
-	device->ws->destroy_sem(sem);
+	if (sem->syncobj)
+		device->ws->destroy_syncobj(device->ws, sem->syncobj);
+	else
+		device->ws->destroy_sem(sem->sem);
+	vk_free2(&device->alloc, pAllocator, sem);
 }
 
 VkResult radv_CreateEvent(
@@ -2490,7 +2844,7 @@
 
 	event->bo = device->ws->buffer_create(device->ws, 8, 8,
 					      RADEON_DOMAIN_GTT,
-					      RADEON_FLAG_CPU_ACCESS);
+					      RADEON_FLAG_VA_UNCACHED | RADEON_FLAG_CPU_ACCESS);
 	if (!event->bo) {
 		vk_free2(&device->alloc, pAllocator, event);
 		return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@@ -2606,9 +2960,9 @@
 si_tile_mode_index(const struct radv_image *image, unsigned level, bool stencil)
 {
 	if (stencil)
-		return image->surface.stencil_tiling_index[level];
+		return image->surface.u.legacy.stencil_tiling_index[level];
 	else
-		return image->surface.tiling_index[level];
+		return image->surface.u.legacy.tiling_index[level];
 }
 
 static uint32_t radv_surface_layer_count(struct radv_image_view *iview)
@@ -2624,47 +2978,82 @@
 	const struct vk_format_description *desc;
 	unsigned ntype, format, swap, endian;
 	unsigned blend_clamp = 0, blend_bypass = 0;
-	unsigned pitch_tile_max, slice_tile_max, tile_mode_index;
 	uint64_t va;
 	const struct radeon_surf *surf = &iview->image->surface;
-	const struct radeon_surf_level *level_info = &surf->level[iview->base_mip];
 
 	desc = vk_format_description(iview->vk_format);
 
 	memset(cb, 0, sizeof(*cb));
 
-	va = device->ws->buffer_get_va(iview->bo) + iview->image->offset;
-	va += level_info->offset;
-	cb->cb_color_base = va >> 8;
+	/* Intensity is implemented as Red, so treat it that way. */
+	cb->cb_color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == VK_SWIZZLE_1);
 
+	va = device->ws->buffer_get_va(iview->bo) + iview->image->offset;
+
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		struct gfx9_surf_meta_flags meta;
+		if (iview->image->dcc_offset)
+			meta = iview->image->surface.u.gfx9.dcc;
+		else
+			meta = iview->image->surface.u.gfx9.cmask;
+
+		cb->cb_color_attrib |= S_028C74_COLOR_SW_MODE(iview->image->surface.u.gfx9.surf.swizzle_mode) |
+			S_028C74_FMASK_SW_MODE(iview->image->surface.u.gfx9.fmask.swizzle_mode) |
+			S_028C74_RB_ALIGNED(meta.rb_aligned) |
+			S_028C74_PIPE_ALIGNED(meta.pipe_aligned);
+		/* va is still a byte address here; it is shifted by 8 exactly once, when cb_color_base is formed. */
+		va += iview->image->surface.u.gfx9.surf_offset;
+	} else {
+		const struct legacy_surf_level *level_info = &surf->u.legacy.level[iview->base_mip];
+		unsigned pitch_tile_max, slice_tile_max, tile_mode_index;
+
+		va += level_info->offset;
+
+		pitch_tile_max = level_info->nblk_x / 8 - 1;
+		slice_tile_max = (level_info->nblk_x * level_info->nblk_y) / 64 - 1;
+		tile_mode_index = si_tile_mode_index(iview->image, iview->base_mip, false);
+
+		cb->cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
+		cb->cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);
+		cb->cb_color_cmask_slice = iview->image->cmask.slice_tile_max;
+
+		cb->cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index);
+		cb->micro_tile_mode = iview->image->surface.micro_tile_mode;
+
+		if (iview->image->fmask.size) {
+			if (device->physical_device->rad_info.chip_class >= CIK)
+				cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(iview->image->fmask.pitch_in_pixels / 8 - 1);
+			cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(iview->image->fmask.tile_mode_index);
+			cb->cb_color_fmask_slice = S_028C88_TILE_MAX(iview->image->fmask.slice_tile_max);
+		} else {
+			/* This must be set for fast clear to work without FMASK. */
+			if (device->physical_device->rad_info.chip_class >= CIK)
+				cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max);
+			cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index);
+			cb->cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
+		}
+	}
+
+	cb->cb_color_base = va >> 8;
+	if (device->physical_device->rad_info.chip_class < GFX9)
+		cb->cb_color_base |= iview->image->surface.u.legacy.tile_swizzle;
 	/* CMASK variables */
 	va = device->ws->buffer_get_va(iview->bo) + iview->image->offset;
 	va += iview->image->cmask.offset;
 	cb->cb_color_cmask = va >> 8;
-	cb->cb_color_cmask_slice = iview->image->cmask.slice_tile_max;
 
 	va = device->ws->buffer_get_va(iview->bo) + iview->image->offset;
 	va += iview->image->dcc_offset;
 	cb->cb_dcc_base = va >> 8;
+	if (device->physical_device->rad_info.chip_class < GFX9)
+		cb->cb_dcc_base |= iview->image->surface.u.legacy.tile_swizzle;
 
 	uint32_t max_slice = radv_surface_layer_count(iview);
 	cb->cb_color_view = S_028C6C_SLICE_START(iview->base_layer) |
 		S_028C6C_SLICE_MAX(iview->base_layer + max_slice - 1);
 
-	cb->micro_tile_mode = iview->image->surface.micro_tile_mode;
-	pitch_tile_max = level_info->nblk_x / 8 - 1;
-	slice_tile_max = (level_info->nblk_x * level_info->nblk_y) / 64 - 1;
-	tile_mode_index = si_tile_mode_index(iview->image, iview->base_mip, false);
-
-	cb->cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
-	cb->cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);
-
-	/* Intensity is implemented as Red, so treat it that way. */
-	cb->cb_color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == VK_SWIZZLE_1) |
-		S_028C74_TILE_MODE_INDEX(tile_mode_index);
-
-	if (iview->image->samples > 1) {
-		unsigned log_samples = util_logbase2(iview->image->samples);
+	if (iview->image->info.samples > 1) {
+		unsigned log_samples = util_logbase2(iview->image->info.samples);
 
 		cb->cb_color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
 			S_028C74_NUM_FRAGMENTS(log_samples);
@@ -2672,18 +3061,11 @@
 
 	if (iview->image->fmask.size) {
 		va = device->ws->buffer_get_va(iview->bo) + iview->image->offset + iview->image->fmask.offset;
-		if (device->physical_device->rad_info.chip_class >= CIK)
-			cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(iview->image->fmask.pitch_in_pixels / 8 - 1);
-		cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(iview->image->fmask.tile_mode_index);
 		cb->cb_color_fmask = va >> 8;
-		cb->cb_color_fmask_slice = S_028C88_TILE_MAX(iview->image->fmask.slice_tile_max);
+		if (device->physical_device->rad_info.chip_class < GFX9)
+			cb->cb_color_fmask |= iview->image->surface.u.legacy.tile_swizzle;
 	} else {
-		/* This must be set for fast clear to work without FMASK. */
-		if (device->physical_device->rad_info.chip_class >= CIK)
-			cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max);
-		cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index);
 		cb->cb_color_fmask = cb->cb_color_base;
-		cb->cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
 	}
 
 	ntype = radv_translate_color_numformat(iview->vk_format,
@@ -2728,20 +3110,24 @@
 				    format != V_028C70_COLOR_24_8) |
 		S_028C70_NUMBER_TYPE(ntype) |
 		S_028C70_ENDIAN(endian);
-	if (iview->image->samples > 1)
-		if (iview->image->fmask.size)
-			cb->cb_color_info |= S_028C70_COMPRESSION(1);
+	if ((iview->image->info.samples > 1) && iview->image->fmask.size) {
+		cb->cb_color_info |= S_028C70_COMPRESSION(1);
+		if (device->physical_device->rad_info.chip_class == SI) {
+			unsigned fmask_bankh = util_logbase2(iview->image->fmask.bank_height);
+			cb->cb_color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
+		}
+	}
 
 	if (iview->image->cmask.size &&
 	    !(device->debug_flags & RADV_DEBUG_NO_FAST_CLEARS))
 		cb->cb_color_info |= S_028C70_FAST_CLEAR(1);
 
-	if (iview->image->surface.dcc_size && level_info->dcc_enabled)
+	if (iview->image->surface.dcc_size && iview->base_mip < surf->num_dcc_levels)
 		cb->cb_color_info |= S_028C70_DCC_ENABLE(1);
 
 	if (device->physical_device->rad_info.chip_class >= VI) {
 		unsigned max_uncompressed_block_size = 2;
-		if (iview->image->samples > 1) {
+		if (iview->image->info.samples > 1) {
 			if (iview->image->surface.bpe == 1)
 				max_uncompressed_block_size = 0;
 			else if (iview->image->surface.bpe == 2)
@@ -2755,9 +3141,24 @@
 	/* This must be set for fast clear to work without FMASK. */
 	if (!iview->image->fmask.size &&
 	    device->physical_device->rad_info.chip_class == SI) {
-		unsigned bankh = util_logbase2(iview->image->surface.bankh);
+		unsigned bankh = util_logbase2(iview->image->surface.u.legacy.bankh);
 		cb->cb_color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
 	}
+
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		unsigned mip0_depth = iview->image->type == VK_IMAGE_TYPE_3D ?
+		  (iview->extent.depth - 1) : (iview->image->info.array_size - 1);
+
+		cb->cb_color_view |= S_028C6C_MIP_LEVEL(iview->base_mip);
+		cb->cb_color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
+			S_028C74_RESOURCE_TYPE(iview->image->surface.u.gfx9.resource_type);
+		cb->cb_color_attrib2 = S_028C68_MIP0_WIDTH(iview->extent.width - 1) |
+			S_028C68_MIP0_HEIGHT(iview->extent.height - 1) |
+			S_028C68_MAX_MIP(iview->image->info.levels - 1);
+
+		cb->gfx9_epitch = S_0287A0_EPITCH(iview->image->surface.u.gfx9.surf.epitch);
+
+	}
 }
 
 static void
@@ -2766,12 +3167,11 @@
 			   struct radv_image_view *iview)
 {
 	unsigned level = iview->base_mip;
-	unsigned format;
+	unsigned format, stencil_format;
 	uint64_t va, s_offs, z_offs;
-	const struct radeon_surf_level *level_info = &iview->image->surface.level[level];
 	bool stencil_only = false;
 	memset(ds, 0, sizeof(*ds));
-	switch (iview->vk_format) {
+	switch (iview->image->vk_format) {
 	case VK_FORMAT_D24_UNORM_S8_UINT:
 	case VK_FORMAT_X8_D24_UNORM_PACK32:
 		ds->pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
@@ -2790,98 +3190,123 @@
 		break;
 	case VK_FORMAT_S8_UINT:
 		stencil_only = true;
-		level_info = &iview->image->surface.stencil_level[level];
 		break;
 	default:
 		break;
 	}
 
-	format = radv_translate_dbformat(iview->vk_format);
-
-	va = device->ws->buffer_get_va(iview->bo) + iview->image->offset;
-	s_offs = z_offs = va;
-	z_offs += iview->image->surface.level[level].offset;
-	s_offs += iview->image->surface.stencil_level[level].offset;
+	format = radv_translate_dbformat(iview->image->vk_format);
+	stencil_format = iview->image->surface.flags & RADEON_SURF_SBUFFER ?
+		V_028044_STENCIL_8 : V_028044_STENCIL_INVALID;
 
 	uint32_t max_slice = radv_surface_layer_count(iview);
 	ds->db_depth_view = S_028008_SLICE_START(iview->base_layer) |
 		S_028008_SLICE_MAX(iview->base_layer + max_slice - 1);
-	ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
-	ds->db_z_info = S_028040_FORMAT(format) | S_028040_ZRANGE_PRECISION(1);
 
-	if (iview->image->samples > 1)
-		ds->db_z_info |= S_028040_NUM_SAMPLES(util_logbase2(iview->image->samples));
+	ds->db_htile_data_base = 0;
+	ds->db_htile_surface = 0;
 
-	if (iview->image->surface.flags & RADEON_SURF_SBUFFER)
-		ds->db_stencil_info = S_028044_FORMAT(V_028044_STENCIL_8);
-	else
-		ds->db_stencil_info = S_028044_FORMAT(V_028044_STENCIL_INVALID);
+	va = device->ws->buffer_get_va(iview->bo) + iview->image->offset;
+	s_offs = z_offs = va;
 
-	if (device->physical_device->rad_info.chip_class >= CIK) {
-		struct radeon_info *info = &device->physical_device->rad_info;
-		unsigned tiling_index = iview->image->surface.tiling_index[level];
-		unsigned stencil_index = iview->image->surface.stencil_tiling_index[level];
-		unsigned macro_index = iview->image->surface.macro_tile_index;
-		unsigned tile_mode = info->si_tile_mode_array[tiling_index];
-		unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
-		unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		assert(iview->image->surface.u.gfx9.surf_offset == 0);
+		s_offs += iview->image->surface.u.gfx9.stencil_offset;
+
+		ds->db_z_info = S_028038_FORMAT(format) |
+			S_028038_NUM_SAMPLES(util_logbase2(iview->image->info.samples)) |
+			S_028038_SW_MODE(iview->image->surface.u.gfx9.surf.swizzle_mode) |
+			S_028038_MAXMIP(iview->image->info.levels - 1);
+		ds->db_stencil_info = S_02803C_FORMAT(stencil_format) |
+			S_02803C_SW_MODE(iview->image->surface.u.gfx9.stencil.swizzle_mode);
+
+		ds->db_z_info2 = S_028068_EPITCH(iview->image->surface.u.gfx9.surf.epitch);
+		ds->db_stencil_info2 = S_02806C_EPITCH(iview->image->surface.u.gfx9.stencil.epitch);
+		ds->db_depth_view |= S_028008_MIPID(level);
+
+		ds->db_depth_size = S_02801C_X_MAX(iview->image->info.width - 1) |
+			S_02801C_Y_MAX(iview->image->info.height - 1);
+
+		/* Only use HTILE for the first level. */
+		if (iview->image->surface.htile_size && !level) {
+			ds->db_z_info |= S_028038_TILE_SURFACE_ENABLE(1);
+
+			if (!(iview->image->surface.flags & RADEON_SURF_SBUFFER))
+				/* Use all of the htile_buffer for depth if there's no stencil. */
+				ds->db_stencil_info |= S_02803C_TILE_STENCIL_DISABLE(1);
+			va = device->ws->buffer_get_va(iview->bo) + iview->image->offset +
+				iview->image->htile_offset;
+			ds->db_htile_data_base = va >> 8;
+			ds->db_htile_surface = S_028ABC_FULL_CACHE(1) |
+				S_028ABC_PIPE_ALIGNED(iview->image->surface.u.gfx9.htile.pipe_aligned) |
+				S_028ABC_RB_ALIGNED(iview->image->surface.u.gfx9.htile.rb_aligned);
+		}
+	} else {
+		const struct legacy_surf_level *level_info = &iview->image->surface.u.legacy.level[level];
 
 		if (stencil_only)
-			tile_mode = stencil_tile_mode;
+			level_info = &iview->image->surface.u.legacy.stencil_level[level];
 
-		ds->db_depth_info |=
-			S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) |
-			S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) |
-			S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) |
-			S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) |
-			S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) |
-			S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode));
-		ds->db_z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode));
-		ds->db_stencil_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode));
-	} else {
-		unsigned tile_mode_index = si_tile_mode_index(iview->image, level, false);
-		ds->db_z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
-		tile_mode_index = si_tile_mode_index(iview->image, level, true);
-		ds->db_stencil_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
-	}
+		z_offs += iview->image->surface.u.legacy.level[level].offset;
+		s_offs += iview->image->surface.u.legacy.stencil_level[level].offset;
 
-	if (iview->image->surface.htile_size && !level) {
-		ds->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1) |
-			S_028040_ALLOW_EXPCLEAR(1);
+		ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
+		ds->db_z_info = S_028040_FORMAT(format) | S_028040_ZRANGE_PRECISION(1);
+		ds->db_stencil_info = S_028044_FORMAT(stencil_format);
 
-		if (iview->image->surface.flags & RADEON_SURF_SBUFFER) {
-			/* Workaround: For a not yet understood reason, the
-			 * combination of MSAA, fast stencil clear and stencil
-			 * decompress messes with subsequent stencil buffer
-			 * uses. Problem was reproduced on Verde, Bonaire,
-			 * Tonga, and Carrizo.
-			 *
-			 * Disabling EXPCLEAR works around the problem.
-			 *
-			 * Check piglit's arb_texture_multisample-stencil-clear
-			 * test if you want to try changing this.
-			 */
-			if (iview->image->samples <= 1)
-				ds->db_stencil_info |= S_028044_ALLOW_EXPCLEAR(1);
-		} else
-			/* Use all of the htile_buffer for depth if there's no stencil. */
-			ds->db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
+		if (iview->image->info.samples > 1)
+			ds->db_z_info |= S_028040_NUM_SAMPLES(util_logbase2(iview->image->info.samples));
 
-		va = device->ws->buffer_get_va(iview->bo) + iview->image->offset +
-		     iview->image->htile_offset;
-		ds->db_htile_data_base = va >> 8;
-		ds->db_htile_surface = S_028ABC_FULL_CACHE(1);
-	} else {
-		ds->db_htile_data_base = 0;
-		ds->db_htile_surface = 0;
+		if (device->physical_device->rad_info.chip_class >= CIK) {
+			struct radeon_info *info = &device->physical_device->rad_info;
+			unsigned tiling_index = iview->image->surface.u.legacy.tiling_index[level];
+			unsigned stencil_index = iview->image->surface.u.legacy.stencil_tiling_index[level];
+			unsigned macro_index = iview->image->surface.u.legacy.macro_tile_index;
+			unsigned tile_mode = info->si_tile_mode_array[tiling_index];
+			unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
+			unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];
+
+			if (stencil_only)
+				tile_mode = stencil_tile_mode;
+
+			ds->db_depth_info |=
+				S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) |
+				S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) |
+				S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) |
+				S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) |
+				S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) |
+				S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode));
+			ds->db_z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode));
+			ds->db_stencil_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode));
+		} else {
+			unsigned tile_mode_index = si_tile_mode_index(iview->image, level, false);
+			ds->db_z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
+			tile_mode_index = si_tile_mode_index(iview->image, level, true);
+			ds->db_stencil_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
+			if (stencil_only)
+				ds->db_z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
+		}
+
+		ds->db_depth_size = S_028058_PITCH_TILE_MAX((level_info->nblk_x / 8) - 1) |
+			S_028058_HEIGHT_TILE_MAX((level_info->nblk_y / 8) - 1);
+		ds->db_depth_slice = S_02805C_SLICE_TILE_MAX((level_info->nblk_x * level_info->nblk_y) / 64 - 1);
+
+		if (iview->image->surface.htile_size && !level) {
+			ds->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1);
+
+			if (!(iview->image->surface.flags & RADEON_SURF_SBUFFER))
+				/* Use all of the htile_buffer for depth if there's no stencil. */
+				ds->db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
+
+			va = device->ws->buffer_get_va(iview->bo) + iview->image->offset +
+				iview->image->htile_offset;
+			ds->db_htile_data_base = va >> 8;
+			ds->db_htile_surface = S_028ABC_FULL_CACHE(1);
+		}
 	}
 
 	ds->db_z_read_base = ds->db_z_write_base = z_offs >> 8;
 	ds->db_stencil_read_base = ds->db_stencil_write_base = s_offs >> 8;
-
-	ds->db_depth_size = S_028058_PITCH_TILE_MAX((level_info->nblk_x / 8) - 1) |
-		S_028058_HEIGHT_TILE_MAX((level_info->nblk_y / 8) - 1);
-	ds->db_depth_slice = S_02805C_SLICE_TILE_MAX((level_info->nblk_x * level_info->nblk_y) / 64 - 1);
 }
 
 VkResult radv_CreateFramebuffer(
@@ -3115,7 +3540,6 @@
 	vk_free2(&device->alloc, pAllocator, sampler);
 }
 
-
 /* vk_icd.h does not declare this function, so we declare it here to
  * suppress Wmissing-prototypes.
  */
@@ -3159,3 +3583,94 @@
 	*pSupportedVersion = MIN2(*pSupportedVersion, 3u);
 	return VK_SUCCESS;
 }
+
+/* Export a VkDeviceMemory as a file descriptor via radv_get_memory_fd(),
+ * which receives pFD; only opaque-fd handles are accepted. */
+VkResult radv_GetMemoryFdKHR(VkDevice _device,
+			     const VkMemoryGetFdInfoKHR *pGetFdInfo,
+			     int *pFD)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	RADV_FROM_HANDLE(radv_device_memory, memory, pGetFdInfo->memory);
+
+	assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR);
+	/* Opaque fds are the sole handle type supported. */
+	assert(pGetFdInfo->handleType ==
+	       VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR);
+
+	if (!radv_get_memory_fd(device, memory, pFD))
+		return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+	return VK_SUCCESS;
+}
+
+VkResult radv_GetMemoryFdPropertiesKHR(VkDevice _device,
+				       VkExternalMemoryHandleTypeFlagBitsKHR handleType,
+				       int fd,
+				       VkMemoryFdPropertiesKHR *pMemoryFdProperties)
+{
+	/* Valid usage for this entry point states:
+	 *
+	 *    "handleType must not be one of the handle types defined as opaque."
+	 *
+	 * Opaque handles are the only ones handled for now, so there are no FD
+	 * properties to report and every query is rejected. */
+	return VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR;
+}
+
+/* Import an opaque-fd syncobj into a semaphore; the fd is closed on success. */
+VkResult radv_ImportSemaphoreFdKHR(VkDevice _device,
+				   const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	RADV_FROM_HANDLE(radv_semaphore, sem, pImportSemaphoreFdInfo->semaphore);
+	/* Temporary imports go to their own slot and leave the permanent syncobj alone. */
+	uint32_t *syncobj_dst = (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT_KHR) ?
+		&sem->temp_syncobj : &sem->syncobj;
+	uint32_t syncobj_handle = 0;
+	assert(pImportSemaphoreFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR);
+	if (device->ws->import_syncobj(device->ws, pImportSemaphoreFdInfo->fd, &syncobj_handle) != 0)
+		return VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR;
+	/* Drop the syncobj this import replaces; overwriting the handle would leak it. */
+	if (*syncobj_dst)
+		device->ws->destroy_syncobj(device->ws, *syncobj_dst);
+	*syncobj_dst = syncobj_handle;
+	close(pImportSemaphoreFdInfo->fd);
+	return VK_SUCCESS;
+}
+
+/* Export a semaphore's syncobj as an opaque fd; a temporarily imported syncobj,
+ * when present, is exported in preference to the permanent one. */
+VkResult radv_GetSemaphoreFdKHR(VkDevice _device,
+				const VkSemaphoreGetFdInfoKHR *pGetFdInfo,
+				int *pFd)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	RADV_FROM_HANDLE(radv_semaphore, sem, pGetFdInfo->semaphore);
+
+	assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR);
+
+	/* A non-zero temp_syncobj takes precedence over the permanent handle. */
+	uint32_t exported = sem->temp_syncobj ? sem->temp_syncobj : sem->syncobj;
+
+	/* A non-zero return from export_syncobj() is treated as failure. */
+	if (device->ws->export_syncobj(device->ws, exported, pFd))
+		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+	return VK_SUCCESS;
+}
+
+/* Report external-semaphore capabilities: opaque fds are importable and
+ * exportable; every other handle type reports no support. */
+void radv_GetPhysicalDeviceExternalSemaphorePropertiesKHR(
+	VkPhysicalDevice                            physicalDevice,
+	const VkPhysicalDeviceExternalSemaphoreInfoKHR* pExternalSemaphoreInfo,
+	VkExternalSemaphorePropertiesKHR*           pExternalSemaphoreProperties)
+{
+	const bool opaque_fd = pExternalSemaphoreInfo->handleType ==
+		VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+	pExternalSemaphoreProperties->exportFromImportedHandleTypes =
+		opaque_fd ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR : 0;
+	pExternalSemaphoreProperties->compatibleHandleTypes =
+		opaque_fd ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR : 0;
+	pExternalSemaphoreProperties->externalSemaphoreFeatures = !opaque_fd ? 0 :
+		VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR | VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT_KHR;
+}
diff --git a/src/amd/vulkan/radv_entrypoints_gen.py b/src/amd/vulkan/radv_entrypoints_gen.py
index 3474c78..9634f76 100644
--- a/src/amd/vulkan/radv_entrypoints_gen.py
+++ b/src/amd/vulkan/radv_entrypoints_gen.py
@@ -1,6 +1,6 @@
 # coding=utf-8
 #
-# Copyright © 2015 Intel Corporation
+# Copyright © 2015, 2017 Intel Corporation
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -22,26 +22,41 @@
 # IN THE SOFTWARE.
 #
 
-import sys
-import xml.etree.ElementTree as ET
+import argparse
+import functools
+import os
+import textwrap
+import xml.etree.cElementTree as et
 
-max_api_version = 1.0
+from mako.template import Template
 
-supported_extensions = [
-   'VK_AMD_draw_indirect_count',
-   'VK_NV_dedicated_allocation',
-   'VK_KHR_descriptor_update_template',
-   'VK_KHR_get_physical_device_properties2',
-   'VK_KHR_incremental_present',
-   'VK_KHR_maintenance1',
-   'VK_KHR_push_descriptor',
-   'VK_KHR_sampler_mirror_clamp_to_edge',
-   'VK_KHR_shader_draw_parameters',
-   'VK_KHR_surface',
-   'VK_KHR_swapchain',
-   'VK_KHR_wayland_surface',
-   'VK_KHR_xcb_surface',
-   'VK_KHR_xlib_surface',
+MAX_API_VERSION = 1.0
+
+SUPPORTED_EXTENSIONS = [
+    'VK_AMD_draw_indirect_count',
+    'VK_NV_dedicated_allocation',
+    'VK_KHR_descriptor_update_template',
+    'VK_KHR_get_physical_device_properties2',
+    'VK_KHR_incremental_present',
+    'VK_KHR_maintenance1',
+    'VK_KHR_push_descriptor',
+    'VK_KHR_sampler_mirror_clamp_to_edge',
+    'VK_KHR_shader_draw_parameters',
+    'VK_KHR_surface',
+    'VK_KHR_swapchain',
+    'VK_KHR_wayland_surface',
+    'VK_KHR_xcb_surface',
+    'VK_KHR_xlib_surface',
+    'VK_KHR_get_memory_requirements2',
+    'VK_KHR_dedicated_allocation',
+    'VK_KHR_external_memory_capabilities',
+    'VK_KHR_external_memory',
+    'VK_KHR_external_memory_fd',
+    'VK_KHR_storage_buffer_storage_class',
+    'VK_KHR_variable_pointers',
+    'VK_KHR_external_semaphore_capabilities',
+    'VK_KHR_external_semaphore',
+    'VK_KHR_external_semaphore_fd',
 ]
 
 # We generate a static hash table for entry point lookup
@@ -49,54 +64,204 @@
 # function and a power-of-two size table. The prime numbers are determined
 # experimentally.
 
-none = 0xffff
-hash_size = 256
-u32_mask = 2**32 - 1
-hash_mask = hash_size - 1
+TEMPLATE_H = Template(textwrap.dedent("""\
+    /* This file generated from ${filename}, don't edit directly. */
 
-prime_factor = 5024183
-prime_step = 19
+    struct radv_dispatch_table {
+       union {
+          void *entrypoints[${len(entrypoints)}];
+          struct {
+          % for _, name, _, _, _, guard in entrypoints:
+            % if guard is not None:
+    #ifdef ${guard}
+              PFN_vk${name} ${name};
+    #else
+              void *${name};
+    # endif
+            % else:
+              PFN_vk${name} ${name};
+            % endif
+          % endfor
+          };
+       };
+    };
 
-def hash(name):
-    h = 0;
-    for c in name:
-        h = (h * prime_factor + ord(c)) & u32_mask
+    % for type_, name, args, num, h, guard in entrypoints:
+      % if guard is not None:
+    #ifdef ${guard}
+      % endif
+      ${type_} radv_${name}(${args});
+      % if guard is not None:
+    #endif // ${guard}
+      % endif
+    % endfor
+    """), output_encoding='utf-8')
 
-    return h
+TEMPLATE_C = Template(textwrap.dedent(u"""\
+    /*
+     * Copyright © 2015 Intel Corporation
+     *
+     * Permission is hereby granted, free of charge, to any person obtaining a
+     * copy of this software and associated documentation files (the "Software"),
+     * to deal in the Software without restriction, including without limitation
+     * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+     * and/or sell copies of the Software, and to permit persons to whom the
+     * Software is furnished to do so, subject to the following conditions:
+     *
+     * The above copyright notice and this permission notice (including the next
+     * paragraph) shall be included in all copies or substantial portions of the
+     * Software.
+     *
+     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+     * IN THE SOFTWARE.
+     */
 
-def print_guard_start(guard):
-    if guard is not None:
-        print "#ifdef {0}".format(guard)
+    /* This file generated from ${filename}, don't edit directly. */
 
-def print_guard_end(guard):
-    if guard is not None:
-        print "#endif // {0}".format(guard)
+    #include "radv_private.h"
 
-opt_header = False
-opt_code = False
+    struct radv_entrypoint {
+       uint32_t name;
+       uint32_t hash;
+    };
 
-if (sys.argv[1] == "header"):
-    opt_header = True
-    sys.argv.pop()
-elif (sys.argv[1] == "code"):
-    opt_code = True
-    sys.argv.pop()
+    /* We use a big string constant to avoid lots of relocations from the entry
+     * point table to lots of little strings. The entries in the entry point table
+     * store the index into this big string.
+     */
 
-# Extract the entry points from the registry
+    static const char strings[] =
+    % for _, name, _, _, _, _ in entrypoints:
+        "vk${name}\\0"
+    % endfor
+    ;
+
+    static const struct radv_entrypoint entrypoints[] = {
+    % for _, _, _, num, h, _ in entrypoints:
+        { ${offsets[num]}, ${'{:0=#8x}'.format(h)} },
+    % endfor
+    };
+
+    /* Weak aliases for all potential implementations. These will resolve to
+     * NULL if they're not defined, which lets the resolve_entrypoint() function
+     * pick the correct entry point.
+     */
+
+    % for layer in ['radv']:
+      % for type_, name, args, _, _, guard in entrypoints:
+        % if guard is not None:
+    #ifdef ${guard}
+        % endif
+        ${type_} ${layer}_${name}(${args}) __attribute__ ((weak));
+        % if guard is not None:
+    #endif // ${guard}
+        % endif
+      % endfor
+
+      const struct radv_dispatch_table ${layer}_layer = {
+      % for _, name, args, _, _, guard in entrypoints:
+        % if guard is not None:
+    #ifdef ${guard}
+        % endif
+        .${name} = ${layer}_${name},
+        % if guard is not None:
+    #endif // ${guard}
+        % endif
+      % endfor
+      };
+    % endfor
+
+    static void * __attribute__ ((noinline))
+    radv_resolve_entrypoint(uint32_t index)
+    {
+       return radv_layer.entrypoints[index];
+    }
+
+    /* Hash table stats:
+     * size ${hash_size} entries
+     * collisions entries:
+    % for i in xrange(10):
+     *     ${i}${'+' if i == 9 else ''}     ${collisions[i]}
+    % endfor
+     */
+
+    #define none ${'{:#x}'.format(none)}
+    static const uint16_t map[] = {
+    % for i in xrange(0, hash_size, 8):
+      % for j in xrange(i, i + 8):
+        ## This is 6 because the 0x is counted in the length
+        % if mapping[j] & 0xffff == 0xffff:
+          none,
+        % else:
+          ${'{:0=#6x}'.format(mapping[j] & 0xffff)},
+        % endif
+      % endfor
+    % endfor
+    };
+
+    void *
+    radv_lookup_entrypoint(const char *name)
+    {
+       static const uint32_t prime_factor = ${prime_factor};
+       static const uint32_t prime_step = ${prime_step};
+       const struct radv_entrypoint *e;
+       uint32_t hash, h, i;
+       const char *p;
+
+       hash = 0;
+       for (p = name; *p; p++)
+          hash = hash * prime_factor + *p;
+
+       h = hash;
+       do {
+          i = map[h & ${hash_mask}];
+          if (i == none)
+             return NULL;
+          e = &entrypoints[i];
+          h += prime_step;
+       } while (e->hash != hash);
+
+       if (strcmp(name, strings + e->name) != 0)
+          return NULL;
+
+       return radv_resolve_entrypoint(i);
+    }"""), output_encoding='utf-8')
+
+NONE = 0xffff
+HASH_SIZE = 256
+U32_MASK = 2**32 - 1
+HASH_MASK = HASH_SIZE - 1
+
+PRIME_FACTOR = 5024183
+PRIME_STEP = 19
+
+
+def cal_hash(name):
+    """Calculate the same hash value that Mesa will calculate in C."""
+    return functools.reduce(
+        lambda h, c: (h * PRIME_FACTOR + ord(c)) & U32_MASK, name, 0)
+
+
 def get_entrypoints(doc, entrypoints_to_defines):
+    """Extract the entry points from the registry."""
     entrypoints = []
 
     enabled_commands = set()
     for feature in doc.findall('./feature'):
         assert feature.attrib['api'] == 'vulkan'
-        if float(feature.attrib['number']) > max_api_version:
+        if float(feature.attrib['number']) > MAX_API_VERSION:
             continue
 
         for command in feature.findall('./require/command'):
             enabled_commands.add(command.attrib['name'])
 
     for extension in doc.findall('.extensions/extension'):
-        if extension.attrib['name'] not in supported_extensions:
+        if extension.attrib['name'] not in SUPPORTED_EXTENSIONS:
             continue
 
         assert extension.attrib['supported'] == 'vulkan'
@@ -112,219 +277,78 @@
             continue
 
         shortname = fullname[2:]
-        params = map(lambda p: "".join(p.itertext()), command.findall('./param'))
+        params = (''.join(p.itertext()) for p in command.findall('./param'))
         params = ', '.join(params)
-        if fullname in entrypoints_to_defines:
-            guard = entrypoints_to_defines[fullname]
-        else:
-            guard = None
-        entrypoints.append((type, shortname, params, index, hash(fullname), guard))
+        guard = entrypoints_to_defines.get(fullname)
+        entrypoints.append((type, shortname, params, index, cal_hash(fullname), guard))
         index += 1
 
     return entrypoints
 
-# Maps entry points to extension defines
+
 def get_entrypoints_defines(doc):
+    """Maps entry points to extension defines."""
     entrypoints_to_defines = {}
-    extensions = doc.findall('./extensions/extension')
-    for extension in extensions:
-        define = extension.get('protect')
-        entrypoints = extension.findall('./require/command')
-        for entrypoint in entrypoints:
-            fullname = entrypoint.get('name')
+
+    for extension in doc.findall('./extensions/extension[@protect]'):
+        define = extension.attrib['protect']
+
+        for entrypoint in extension.findall('./require/command'):
+            fullname = entrypoint.attrib['name']
             entrypoints_to_defines[fullname] = define
+
     return entrypoints_to_defines
 
-doc = ET.parse(sys.stdin)
-entrypoints = get_entrypoints(doc, get_entrypoints_defines(doc))
 
-# For outputting entrypoints.h we generate a radv_EntryPoint() prototype
-# per entry point.
+def gen_code(entrypoints):
+    """Generate the C code."""
+    i = 0
+    offsets = []
+    for _, name, _, _, _, _ in entrypoints:
+        offsets.append(i)
+        i += 2 + len(name) + 1
 
-if opt_header:
-    print "/* This file generated from vk_gen.py, don't edit directly. */\n"
-
-    print "struct radv_dispatch_table {"
-    print "   union {"
-    print "      void *entrypoints[%d];" % len(entrypoints)
-    print "      struct {"
-
-    for type, name, args, num, h, guard in entrypoints:
-        if guard is not None:
-            print "#ifdef {0}".format(guard)
-            print "         PFN_vk{0} {0};".format(name)
-            print "#else"
-            print "         void *{0};".format(name)
-            print "#endif"
+    mapping = [NONE] * HASH_SIZE
+    collisions = [0] * 10
+    for _, name, _, num, h, _ in entrypoints:
+        level = 0
+        while mapping[h & HASH_MASK] != NONE:
+            h = h + PRIME_STEP
+            level = level + 1
+        if level > 9:
+            collisions[9] += 1
         else:
-            print "         PFN_vk{0} {0};".format(name)
-    print "      };\n"
-    print "   };\n"
-    print "};\n"
+            collisions[level] += 1
+        mapping[h & HASH_MASK] = num
 
-    for type, name, args, num, h, guard in entrypoints:
-        print_guard_start(guard)
-        print "%s radv_%s(%s);" % (type, name, args)
-        print_guard_end(guard)
-    exit()
+    return TEMPLATE_C.render(entrypoints=entrypoints,
+                             offsets=offsets,
+                             collisions=collisions,
+                             mapping=mapping,
+                             hash_mask=HASH_MASK,
+                             prime_step=PRIME_STEP,
+                             prime_factor=PRIME_FACTOR,
+                             none=NONE,
+                             hash_size=HASH_SIZE,
+                             filename=os.path.basename(__file__))
 
 
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--outdir', help='Where to write the files.',
+                        required=True)
+    parser.add_argument('--xml', help='Vulkan API XML file.', required=True)
+    args = parser.parse_args()
 
-print """/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
+    doc = et.parse(args.xml)
+    entrypoints = get_entrypoints(doc, get_entrypoints_defines(doc))
 
-/* DO NOT EDIT! This is a generated file. */
+    with open(os.path.join(args.outdir, 'radv_entrypoints.h'), 'wb') as f:
+        f.write(TEMPLATE_H.render(entrypoints=entrypoints,
+                                  filename=os.path.basename(__file__)))
+    with open(os.path.join(args.outdir, 'radv_entrypoints.c'), 'wb') as f:
+        f.write(gen_code(entrypoints))
 
-#include "radv_private.h"
 
-struct radv_entrypoint {
-   uint32_t name;
-   uint32_t hash;
-};
-
-/* We use a big string constant to avoid lots of reloctions from the entry
- * point table to lots of little strings. The entries in the entry point table
- * store the index into this big string.
- */
-
-static const char strings[] ="""
-
-offsets = []
-i = 0;
-for type, name, args, num, h, guard in entrypoints:
-    print "   \"vk%s\\0\"" % name
-    offsets.append(i)
-    i += 2 + len(name) + 1
-print "   ;"
-
-# Now generate the table of all entry points
-
-print "\nstatic const struct radv_entrypoint entrypoints[] = {"
-for type, name, args, num, h, guard in entrypoints:
-    print "   { %5d, 0x%08x }," % (offsets[num], h)
-print "};\n"
-
-print """
-
-/* Weak aliases for all potential implementations. These will resolve to
- * NULL if they're not defined, which lets the resolve_entrypoint() function
- * either pick the correct entry point.
- */
-"""
-
-for layer in [ "radv" ]:
-    for type, name, args, num, h, guard in entrypoints:
-        print_guard_start(guard)
-        print "%s %s_%s(%s) __attribute__ ((weak));" % (type, layer, name, args)
-        print_guard_end(guard)
-    print "\nconst struct radv_dispatch_table %s_layer = {" % layer
-    for type, name, args, num, h, guard in entrypoints:
-        print_guard_start(guard)
-        print "   .%s = %s_%s," % (name, layer, name)
-        print_guard_end(guard)
-    print "};\n"
-
-print """
-
-static void * __attribute__ ((noinline))
-radv_resolve_entrypoint(uint32_t index)
-{
-   return radv_layer.entrypoints[index];
-}
-"""
-
-# Now generate the hash table used for entry point look up.  This is a
-# uint16_t table of entry point indices. We use 0xffff to indicate an entry
-# in the hash table is empty.
-
-map = [none for f in xrange(hash_size)]
-collisions = [0 for f in xrange(10)]
-for type, name, args, num, h, guard in entrypoints:
-    level = 0
-    while map[h & hash_mask] != none:
-        h = h + prime_step
-        level = level + 1
-    if level > 9:
-        collisions[9] += 1
-    else:
-        collisions[level] += 1
-    map[h & hash_mask] = num
-
-print "/* Hash table stats:"
-print " * size %d entries" % hash_size
-print " * collisions  entries"
-for i in xrange(10):
-    if (i == 9):
-        plus = "+"
-    else:
-        plus = " "
-
-    print " *     %2d%s     %4d" % (i, plus, collisions[i])
-print " */\n"
-
-print "#define none 0x%04x\n" % none
-
-print "static const uint16_t map[] = {"
-for i in xrange(0, hash_size, 8):
-    print "   ",
-    for j in xrange(i, i + 8):
-        if map[j] & 0xffff == 0xffff:
-            print "  none,",
-        else:
-            print "0x%04x," % (map[j] & 0xffff),
-    print
-
-print "};"    
-
-# Finally we generate the hash table lookup function.  The hash function and
-# linear probing algorithm matches the hash table generated above.
-
-print """
-void *
-radv_lookup_entrypoint(const char *name)
-{
-   static const uint32_t prime_factor = %d;
-   static const uint32_t prime_step = %d;
-   const struct radv_entrypoint *e;
-   uint32_t hash, h, i;
-   const char *p;
-
-   hash = 0;
-   for (p = name; *p; p++)
-      hash = hash * prime_factor + *p;
-
-   h = hash;
-   do {
-      i = map[h & %d];
-      if (i == none)
-         return NULL;
-      e = &entrypoints[i];
-      h += prime_step;
-   } while (e->hash != hash);
-
-   if (strcmp(name, strings + e->name) != 0)
-      return NULL;
-
-   return radv_resolve_entrypoint(i);
-}
-""" % (prime_factor, prime_step, hash_mask)
+if __name__ == '__main__':
+    main()
diff --git a/src/amd/vulkan/radv_formats.c b/src/amd/vulkan/radv_formats.c
index 61cc673..b18fce3 100644
--- a/src/amd/vulkan/radv_formats.c
+++ b/src/amd/vulkan/radv_formats.c
@@ -28,6 +28,8 @@
 #include "sid.h"
 #include "r600d_common.h"
 
+#include "vk_util.h"
+
 #include "util/u_half.h"
 #include "util/format_srgb.h"
 #include "util/format_r11g11b10f.h"
@@ -576,6 +578,10 @@
 			         VK_FORMAT_FEATURE_BLIT_DST_BIT;
 			tiled |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR |
 			         VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR;
+
+			/* GFX9 doesn't support linear depth surfaces */
+			if (physical_device->rad_info.chip_class >= GFX9)
+				linear = 0;
 		}
 	} else {
 		bool linear_sampling;
@@ -956,6 +962,12 @@
 		clear_vals[1] = ((uint16_t)util_iround(CLAMP(value->float32[2], 0.0f, 1.0f) * 0xffff)) & 0xffff;
 		clear_vals[1] |= ((uint16_t)util_iround(CLAMP(value->float32[3], 0.0f, 1.0f) * 0xffff)) << 16;
 		break;
+	case VK_FORMAT_R16G16B16A16_SNORM:
+		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], -1.0f, 1.0f) * 0x7fff)) & 0xffff;
+		clear_vals[0] |= ((uint16_t)util_iround(CLAMP(value->float32[1], -1.0f, 1.0f) * 0x7fff)) << 16;
+		clear_vals[1] = ((uint16_t)util_iround(CLAMP(value->float32[2], -1.0f, 1.0f) * 0x7fff)) & 0xffff;
+		clear_vals[1] |= ((uint16_t)util_iround(CLAMP(value->float32[3], -1.0f, 1.0f) * 0x7fff)) << 16;
+		break;
 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
 		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0x3ff)) & 0x3ff;
 		clear_vals[0] |= (((uint16_t)util_iround(CLAMP(value->float32[1], 0.0f, 1.0f) * 0x3ff)) & 0x3ff) << 10;
@@ -975,6 +987,27 @@
 		clear_vals[0] = float3_to_r11g11b10f(value->float32);
 		clear_vals[1] = 0;
 		break;
+	case VK_FORMAT_R32G32B32A32_SFLOAT:
+		if (value->float32[0] != value->float32[1] ||
+		    value->float32[0] != value->float32[2])
+			return false;
+		clear_vals[0] = fui(value->float32[0]);
+		clear_vals[1] = fui(value->float32[3]);
+		break;
+	case VK_FORMAT_R32G32B32A32_UINT:
+		if (value->uint32[0] != value->uint32[1] ||
+		    value->uint32[0] != value->uint32[2])
+			return false;
+		clear_vals[0] = value->uint32[0];
+		clear_vals[1] = value->uint32[3];
+		break;
+	case VK_FORMAT_R32G32B32A32_SINT:
+		if (value->int32[0] != value->int32[1] ||
+		    value->int32[0] != value->int32[2])
+			return false;
+		clear_vals[0] = value->int32[0];
+		clear_vals[1] = value->int32[3];
+		break;
 	default:
 		fprintf(stderr, "failed to fast clear %d\n", format);
 		return false;
@@ -1006,16 +1039,11 @@
 						   &pFormatProperties->formatProperties);
 }
 
-VkResult radv_GetPhysicalDeviceImageFormatProperties(
-	VkPhysicalDevice                            physicalDevice,
-	VkFormat                                    format,
-	VkImageType                                 type,
-	VkImageTiling                               tiling,
-	VkImageUsageFlags                           usage,
-	VkImageCreateFlags                          createFlags,
-	VkImageFormatProperties*                    pImageFormatProperties)
+static VkResult radv_get_image_format_properties(struct radv_physical_device *physical_device,
+						 const VkPhysicalDeviceImageFormatInfo2KHR *info,
+						 VkImageFormatProperties *pImageFormatProperties)
+
 {
-	RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice);
 	VkFormatProperties format_props;
 	VkFormatFeatureFlags format_feature_flags;
 	VkExtent3D maxExtent;
@@ -1023,11 +1051,11 @@
 	uint32_t maxArraySize;
 	VkSampleCountFlags sampleCounts = VK_SAMPLE_COUNT_1_BIT;
 
-	radv_physical_device_get_format_properties(physical_device, format,
+	radv_physical_device_get_format_properties(physical_device, info->format,
 						   &format_props);
-	if (tiling == VK_IMAGE_TILING_LINEAR) {
+	if (info->tiling == VK_IMAGE_TILING_LINEAR) {
 		format_feature_flags = format_props.linearTilingFeatures;
-	} else if (tiling == VK_IMAGE_TILING_OPTIMAL) {
+	} else if (info->tiling == VK_IMAGE_TILING_OPTIMAL) {
 		format_feature_flags = format_props.optimalTilingFeatures;
 	} else {
 		unreachable("bad VkImageTiling");
@@ -1036,7 +1064,7 @@
 	if (format_feature_flags == 0)
 		goto unsupported;
 
-	switch (type) {
+	switch (info->type) {
 	default:
 		unreachable("bad vkimage type\n");
 	case VK_IMAGE_TYPE_1D:
@@ -1062,34 +1090,34 @@
 		break;
 	}
 
-	if (tiling == VK_IMAGE_TILING_OPTIMAL &&
-	    type == VK_IMAGE_TYPE_2D &&
+	if (info->tiling == VK_IMAGE_TILING_OPTIMAL &&
+	    info->type == VK_IMAGE_TYPE_2D &&
 	    (format_feature_flags & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
 				     VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
-	    !(createFlags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) &&
-	    !(usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
+	    !(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) &&
+	    !(info->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
 		sampleCounts |= VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT | VK_SAMPLE_COUNT_8_BIT;
 	}
 
-	if (usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
+	if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
 		if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
 			goto unsupported;
 		}
 	}
 
-	if (usage & VK_IMAGE_USAGE_STORAGE_BIT) {
+	if (info->usage & VK_IMAGE_USAGE_STORAGE_BIT) {
 		if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) {
 			goto unsupported;
 		}
 	}
 
-	if (usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+	if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
 		if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) {
 			goto unsupported;
 		}
 	}
 
-	if (usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
+	if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
 		if (!(format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
 			goto unsupported;
 		}
@@ -1120,18 +1148,132 @@
 	return VK_ERROR_FORMAT_NOT_SUPPORTED;
 }
 
+VkResult radv_GetPhysicalDeviceImageFormatProperties(
+	VkPhysicalDevice                            physicalDevice,
+	VkFormat                                    format,
+	VkImageType                                 type,
+	VkImageTiling                               tiling,
+	VkImageUsageFlags                           usage,
+	VkImageCreateFlags                          createFlags,
+	VkImageFormatProperties*                    pImageFormatProperties)
+{
+	RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice);
+
+	const VkPhysicalDeviceImageFormatInfo2KHR info = {
+		.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR,
+		.pNext = NULL,
+		.format = format,
+		.type = type,
+		.tiling = tiling,
+		.usage = usage,
+		.flags = createFlags,
+	};
+
+	return radv_get_image_format_properties(physical_device, &info,
+						pImageFormatProperties);
+}
+
+static void
+get_external_image_format_properties(const VkPhysicalDeviceImageFormatInfo2KHR *pImageFormatInfo,
+				     VkExternalMemoryPropertiesKHR *external_properties)
+{
+	VkExternalMemoryFeatureFlagBitsKHR flags = 0;
+	VkExternalMemoryHandleTypeFlagsKHR export_flags = 0;
+	VkExternalMemoryHandleTypeFlagsKHR compat_flags = 0;
+	switch (pImageFormatInfo->type) {
+	case VK_IMAGE_TYPE_2D:
+		flags = VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT_KHR|VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR|VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR;
+		compat_flags = export_flags = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+		break;
+	default:
+		break;
+	}
+
+	*external_properties = (VkExternalMemoryPropertiesKHR) {
+		.externalMemoryFeatures = flags,
+		.exportFromImportedHandleTypes = export_flags,
+		.compatibleHandleTypes = compat_flags,
+	};
+}
+
 VkResult radv_GetPhysicalDeviceImageFormatProperties2KHR(
 	VkPhysicalDevice                            physicalDevice,
-	const VkPhysicalDeviceImageFormatInfo2KHR*  pImageFormatInfo,
-	VkImageFormatProperties2KHR                *pImageFormatProperties)
+	const VkPhysicalDeviceImageFormatInfo2KHR  *base_info,
+	VkImageFormatProperties2KHR                *base_props)
 {
-	return radv_GetPhysicalDeviceImageFormatProperties(physicalDevice,
-							   pImageFormatInfo->format,
-							   pImageFormatInfo->type,
-							   pImageFormatInfo->tiling,
-							   pImageFormatInfo->usage,
-							   pImageFormatInfo->flags,
-							   &pImageFormatProperties->imageFormatProperties);
+	RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice);
+	const VkPhysicalDeviceExternalImageFormatInfoKHR *external_info = NULL;
+	VkExternalImageFormatPropertiesKHR *external_props = NULL;
+	VkResult result;
+
+	result = radv_get_image_format_properties(physical_device, base_info,
+						&base_props->imageFormatProperties);
+	if (result != VK_SUCCESS)
+		return result;
+
+	/* Extract input structs */
+	vk_foreach_struct_const(s, base_info->pNext) {
+		switch (s->sType) {
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR:
+			external_info = (const void *) s;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Extract output structs */
+	vk_foreach_struct(s, base_props->pNext) {
+		switch (s->sType) {
+		case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR:
+			external_props = (void *) s;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* From the Vulkan 1.0.42 spec:
+	 *
+	 *    If handleType is 0, vkGetPhysicalDeviceImageFormatProperties2KHR will
+	 *    behave as if VkPhysicalDeviceExternalImageFormatInfoKHR was not
+	 *    present and VkExternalImageFormatPropertiesKHR will be ignored.
+	 */
+	if (external_info && external_info->handleType != 0) {
+		switch (external_info->handleType) {
+		case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR:
+			if (external_props) /* the output struct is optional in the pNext chain */
+				get_external_image_format_properties(base_info, &external_props->externalMemoryProperties);
+			break;
+		default:
+			/* From the Vulkan 1.0.42 spec:
+			 *
+			 *    If handleType is not compatible with the [parameters] specified
+			 *    in VkPhysicalDeviceImageFormatInfo2KHR, then
+			 *    vkGetPhysicalDeviceImageFormatProperties2KHR returns
+			 *    VK_ERROR_FORMAT_NOT_SUPPORTED.
+			 */
+			result = vk_errorf(VK_ERROR_FORMAT_NOT_SUPPORTED,
+					   "unsupported VkExternalMemoryTypeFlagBitsKHR 0x%x", external_info->handleType);
+			goto fail;
+		}
+	}
+
+	return VK_SUCCESS;
+
+fail:
+	if (result == VK_ERROR_FORMAT_NOT_SUPPORTED) {
+		/* From the Vulkan 1.0.42 spec:
+		 *
+		 *    If the combination of parameters to
+		 *    vkGetPhysicalDeviceImageFormatProperties2KHR is not supported by
+		 *    the implementation for use in vkCreateImage, then all members of
+		 *    imageFormatProperties will be filled with zero.
+		 */
+		base_props->imageFormatProperties = (VkImageFormatProperties) {0};
+	}
+
+	return result;
 }
 
 void radv_GetPhysicalDeviceSparseImageFormatProperties(
@@ -1157,3 +1299,27 @@
 	/* Sparse images are not yet supported. */
 	*pPropertyCount = 0;
 }
+
+void radv_GetPhysicalDeviceExternalBufferPropertiesKHR(
+	VkPhysicalDevice                            physicalDevice,
+	const VkPhysicalDeviceExternalBufferInfoKHR *pExternalBufferInfo,
+	VkExternalBufferPropertiesKHR               *pExternalBufferProperties)
+{
+	VkExternalMemoryFeatureFlagBitsKHR flags = 0;
+	VkExternalMemoryHandleTypeFlagsKHR export_flags = 0;
+	VkExternalMemoryHandleTypeFlagsKHR compat_flags = 0;
+	switch(pExternalBufferInfo->handleType) {
+	case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR:
+		flags = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR |
+		        VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR;
+		compat_flags = export_flags = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+		break;
+	default:
+		break;
+	}
+	pExternalBufferProperties->externalMemoryProperties = (VkExternalMemoryPropertiesKHR) {
+		.externalMemoryFeatures = flags,
+		.exportFromImportedHandleTypes = export_flags,
+		.compatibleHandleTypes = compat_flags,
+	};
+}
diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
index 4e6453c..6cc4540 100644
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -27,11 +27,14 @@
 
 #include "radv_private.h"
 #include "vk_format.h"
+#include "vk_util.h"
 #include "radv_radeon_winsys.h"
 #include "sid.h"
+#include "gfx9d.h"
 #include "util/debug.h"
+#include "util/u_atomic.h"
 static unsigned
-radv_choose_tiling(struct radv_device *Device,
+radv_choose_tiling(struct radv_device *device,
 		   const struct radv_image_create_info *create_info)
 {
 	const VkImageCreateInfo *pCreateInfo = create_info->vk_info;
@@ -41,12 +44,17 @@
 		return RADEON_SURF_MODE_LINEAR_ALIGNED;
 	}
 
-	/* Textures with a very small height are recommended to be linear. */
-	if (pCreateInfo->imageType == VK_IMAGE_TYPE_1D ||
-	    /* Only very thin and long 2D textures should benefit from
-	     * linear_aligned. */
-	    (pCreateInfo->extent.width > 8 && pCreateInfo->extent.height <= 2))
-		return RADEON_SURF_MODE_LINEAR_ALIGNED;
+	if (!vk_format_is_compressed(pCreateInfo->format) &&
+	    !vk_format_is_depth_or_stencil(pCreateInfo->format)
+	    && device->physical_device->rad_info.chip_class <= VI) {
+		/* this causes hangs in some VK CTS tests on GFX9. */
+		/* Textures with a very small height are recommended to be linear. */
+		if (pCreateInfo->imageType == VK_IMAGE_TYPE_1D ||
+		    /* Only very thin and long 2D textures should benefit from
+		     * linear_aligned. */
+		    (pCreateInfo->extent.width > 8 && pCreateInfo->extent.height <= 2))
+			return RADEON_SURF_MODE_LINEAR_ALIGNED;
+	}
 
 	/* MSAA resources must be 2D tiled. */
 	if (pCreateInfo->samples > 1)
@@ -67,22 +75,15 @@
 
 	is_depth = vk_format_has_depth(desc);
 	is_stencil = vk_format_has_stencil(desc);
-	surface->npix_x = pCreateInfo->extent.width;
-	surface->npix_y = pCreateInfo->extent.height;
-	surface->npix_z = pCreateInfo->extent.depth;
 
 	surface->blk_w = vk_format_get_blockwidth(pCreateInfo->format);
 	surface->blk_h = vk_format_get_blockheight(pCreateInfo->format);
-	surface->blk_d = 1;
-	surface->array_size = pCreateInfo->arrayLayers;
-	surface->last_level = pCreateInfo->mipLevels - 1;
 
-	surface->bpe = vk_format_get_blocksize(pCreateInfo->format);
+	surface->bpe = vk_format_get_blocksize(vk_format_depth_only(pCreateInfo->format));
 	/* align byte per element on dword */
 	if (surface->bpe == 3) {
 		surface->bpe = 4;
 	}
-	surface->nsamples = pCreateInfo->samples ? pCreateInfo->samples : 1;
 	surface->flags = RADEON_SURF_SET(array_mode, MODE);
 
 	switch (pCreateInfo->imageType){
@@ -110,15 +111,16 @@
 	}
 
 	if (is_stencil)
-		surface->flags |= RADEON_SURF_SBUFFER |
-			RADEON_SURF_HAS_SBUFFER_MIPTREE;
+		surface->flags |= RADEON_SURF_SBUFFER;
 
 	surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
+	surface->flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;
 
 	if ((pCreateInfo->usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
 	                           VK_IMAGE_USAGE_STORAGE_BIT)) ||
 	    (pCreateInfo->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) ||
             (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR) ||
+            pCreateInfo->mipLevels > 1 || pCreateInfo->arrayLayers > 1 ||
             device->physical_device->rad_info.chip_class < VI ||
             create_info->scanout || (device->debug_flags & RADV_DEBUG_NO_DCC) ||
             !radv_is_colorbuffer_format_supported(pCreateInfo->format, &blendable))
@@ -137,9 +139,9 @@
 si_tile_mode_index(const struct radv_image *image, unsigned level, bool stencil)
 {
 	if (stencil)
-		return image->surface.stencil_tiling_index[level];
+		return image->surface.u.legacy.stencil_tiling_index[level];
 	else
-		return image->surface.tiling_index[level];
+		return image->surface.u.legacy.tiling_index[level];
 }
 
 static unsigned radv_map_swizzle(unsigned swizzle)
@@ -185,6 +187,11 @@
 	state[0] = va;
 	state[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 		S_008F04_STRIDE(stride);
+
+	if (device->physical_device->rad_info.chip_class != VI && stride) {
+		range /= stride;
+	}
+
 	state[2] = range;
 	state[3] = S_008F0C_DST_SEL_X(radv_map_swizzle(desc->swizzle[0])) |
 		   S_008F0C_DST_SEL_Y(radv_map_swizzle(desc->swizzle[1])) |
@@ -197,41 +204,91 @@
 static void
 si_set_mutable_tex_desc_fields(struct radv_device *device,
 			       struct radv_image *image,
-			       const struct radeon_surf_level *base_level_info,
+			       const struct legacy_surf_level *base_level_info,
 			       unsigned base_level, unsigned first_level,
 			       unsigned block_width, bool is_stencil,
 			       uint32_t *state)
 {
-	uint64_t gpu_address = device->ws->buffer_get_va(image->bo) + image->offset;
-	uint64_t va = gpu_address + base_level_info->offset;
-	unsigned pitch = base_level_info->nblk_x * block_width;
-
-	state[1] &= C_008F14_BASE_ADDRESS_HI;
-	state[3] &= C_008F1C_TILING_INDEX;
-	state[4] &= C_008F20_PITCH_GFX6;
-	state[6] &= C_008F28_COMPRESSION_EN;
-
-	assert(!(va & 255));
+	uint64_t gpu_address = image->bo ? device->ws->buffer_get_va(image->bo) + image->offset : 0;
+	uint64_t va = gpu_address;
+	enum chip_class chip_class = device->physical_device->rad_info.chip_class;
+	uint64_t meta_va = 0;
+	if (chip_class >= GFX9) {
+		if (is_stencil)
+			va += image->surface.u.gfx9.stencil_offset;
+		else
+			va += image->surface.u.gfx9.surf_offset;
+	} else
+		va += base_level_info->offset;
 
 	state[0] = va >> 8;
+	if (chip_class < GFX9)
+		state[0] |= image->surface.u.legacy.tile_swizzle;
+	state[1] &= C_008F14_BASE_ADDRESS_HI;
 	state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
-	state[3] |= S_008F1C_TILING_INDEX(si_tile_mode_index(image, base_level,
-							     is_stencil));
-	state[4] |= S_008F20_PITCH_GFX6(pitch - 1);
 
-	if (image->surface.dcc_size && image->surface.level[first_level].dcc_enabled) {
-		state[6] |= S_008F28_COMPRESSION_EN(1);
-		state[7] = (gpu_address +
-			    image->dcc_offset +
-			    base_level_info->dcc_offset) >> 8;
+	if (chip_class >= VI) {
+		state[6] &= C_008F28_COMPRESSION_EN;
+		state[7] = 0;
+		if (image->surface.dcc_size && first_level < image->surface.num_dcc_levels) {
+			meta_va = gpu_address + image->dcc_offset;
+			if (chip_class <= VI)
+				meta_va += base_level_info->dcc_offset;
+			state[6] |= S_008F28_COMPRESSION_EN(1);
+			state[7] = meta_va >> 8;
+			if (chip_class < GFX9)
+				state[7] |= image->surface.u.legacy.tile_swizzle;
+		}
+	}
+
+	if (chip_class >= GFX9) {
+		state[3] &= C_008F1C_SW_MODE;
+		state[4] &= C_008F20_PITCH_GFX9;
+
+		if (is_stencil) {
+			state[3] |= S_008F1C_SW_MODE(image->surface.u.gfx9.stencil.swizzle_mode);
+			state[4] |= S_008F20_PITCH_GFX9(image->surface.u.gfx9.stencil.epitch);
+		} else {
+			state[3] |= S_008F1C_SW_MODE(image->surface.u.gfx9.surf.swizzle_mode);
+			state[4] |= S_008F20_PITCH_GFX9(image->surface.u.gfx9.surf.epitch);
+		}
+
+		state[5] &= C_008F24_META_DATA_ADDRESS &
+			    C_008F24_META_PIPE_ALIGNED &
+			    C_008F24_META_RB_ALIGNED;
+		if (meta_va) {
+			struct gfx9_surf_meta_flags meta;
+
+			if (image->dcc_offset)
+				meta = image->surface.u.gfx9.dcc;
+			else
+				meta = image->surface.u.gfx9.htile;
+
+			state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) |
+				    S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) |
+				    S_008F24_META_RB_ALIGNED(meta.rb_aligned);
+		}
+	} else {
+		/* SI-CI-VI */
+		unsigned pitch = base_level_info->nblk_x * block_width;
+		unsigned index = si_tile_mode_index(image, base_level, is_stencil);
+
+		state[3] &= C_008F1C_TILING_INDEX;
+		state[3] |= S_008F1C_TILING_INDEX(index);
+		state[4] &= C_008F20_PITCH_GFX6;
+		state[4] |= S_008F20_PITCH_GFX6(pitch - 1);
 	}
 }
 
 static unsigned radv_tex_dim(VkImageType image_type, VkImageViewType view_type,
-			     unsigned nr_layers, unsigned nr_samples, bool is_storage_image)
+			     unsigned nr_layers, unsigned nr_samples, bool is_storage_image, bool gfx9)
 {
 	if (view_type == VK_IMAGE_VIEW_TYPE_CUBE || view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
 		return is_storage_image ? V_008F1C_SQ_RSRC_IMG_2D_ARRAY : V_008F1C_SQ_RSRC_IMG_CUBE;
+
+	/* GFX9 allocates 1D textures as 2D. */
+	if (gfx9 && image_type == VK_IMAGE_TYPE_1D)
+		image_type = VK_IMAGE_TYPE_2D;
 	switch (image_type) {
 	case VK_IMAGE_TYPE_1D:
 		return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_1D_ARRAY : V_008F1C_SQ_RSRC_IMG_1D;
@@ -249,13 +306,43 @@
 		unreachable("illegale image type");
 	}
 }
+
+static unsigned gfx9_border_color_swizzle(const unsigned char swizzle[4])
+{
+	unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
+
+	if (swizzle[3] == VK_SWIZZLE_X) {
+		/* For the pre-defined border color values (white, opaque
+		 * black, transparent black), the only thing that matters is
+		 * that the alpha channel winds up in the correct place
+		 * (because the RGB channels are all the same) so either of
+		 * these enumerations will work.
+		 */
+		if (swizzle[2] == VK_SWIZZLE_Y)
+			bc_swizzle = V_008F20_BC_SWIZZLE_WZYX;
+		else
+			bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ;
+	} else if (swizzle[0] == VK_SWIZZLE_X) {
+		if (swizzle[1] == VK_SWIZZLE_Y)
+			bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
+		else
+			bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ;
+	} else if (swizzle[1] == VK_SWIZZLE_X) {
+		bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ;
+	} else if (swizzle[2] == VK_SWIZZLE_X) {
+		bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW;
+	}
+
+	return bc_swizzle;
+}
+
 /**
  * Build the sampler view descriptor for a texture.
  */
 static void
 si_make_texture_descriptor(struct radv_device *device,
 			   struct radv_image *image,
-			   bool sampler,
+			   bool is_storage_image,
 			   VkImageViewType view_type,
 			   VkFormat vk_format,
 			   const VkComponentMapping *mapping,
@@ -291,40 +378,59 @@
 		data_format = 0;
 	}
 
-	type = radv_tex_dim(image->type, view_type, image->array_size, image->samples,
-			    (image->usage & VK_IMAGE_USAGE_STORAGE_BIT));
+	type = radv_tex_dim(image->type, view_type, image->info.array_size, image->info.samples,
+			    is_storage_image, device->physical_device->rad_info.chip_class >= GFX9);
 	if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
 	        height = 1;
-		depth = image->array_size;
+		depth = image->info.array_size;
 	} else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
 		   type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
 		if (view_type != VK_IMAGE_VIEW_TYPE_3D)
-			depth = image->array_size;
+			depth = image->info.array_size;
 	} else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
-		depth = image->array_size / 6;
+		depth = image->info.array_size / 6;
 
 	state[0] = 0;
 	state[1] = (S_008F14_DATA_FORMAT_GFX6(data_format) |
 		    S_008F14_NUM_FORMAT_GFX6(num_format));
 	state[2] = (S_008F18_WIDTH(width - 1) |
-		    S_008F18_HEIGHT(height - 1));
+		    S_008F18_HEIGHT(height - 1) |
+		    S_008F18_PERF_MOD(4));
 	state[3] = (S_008F1C_DST_SEL_X(radv_map_swizzle(swizzle[0])) |
 		    S_008F1C_DST_SEL_Y(radv_map_swizzle(swizzle[1])) |
 		    S_008F1C_DST_SEL_Z(radv_map_swizzle(swizzle[2])) |
 		    S_008F1C_DST_SEL_W(radv_map_swizzle(swizzle[3])) |
-		    S_008F1C_BASE_LEVEL(image->samples > 1 ?
+		    S_008F1C_BASE_LEVEL(image->info.samples > 1 ?
 					0 : first_level) |
-		    S_008F1C_LAST_LEVEL(image->samples > 1 ?
-					util_logbase2(image->samples) :
+		    S_008F1C_LAST_LEVEL(image->info.samples > 1 ?
+					util_logbase2(image->info.samples) :
 					last_level) |
-		    S_008F1C_POW2_PAD(image->levels > 1) |
 		    S_008F1C_TYPE(type));
-	state[4] = S_008F20_DEPTH(depth - 1);
-	state[5] = (S_008F24_BASE_ARRAY(first_layer) |
-		    S_008F24_LAST_ARRAY(last_layer));
+	state[4] = 0;
+	state[5] = S_008F24_BASE_ARRAY(first_layer);
 	state[6] = 0;
 	state[7] = 0;
 
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle);
+
+		/* Depth is the last accessible layer on Gfx9.
+		 * The hw doesn't need to know the total number of layers.
+		 */
+		if (type == V_008F1C_SQ_RSRC_IMG_3D)
+			state[4] |= S_008F20_DEPTH(depth - 1);
+		else
+			state[4] |= S_008F20_DEPTH(last_layer);
+
+		state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle);
+		state[5] |= S_008F24_MAX_MIP(image->info.samples > 1 ?
+					     util_logbase2(image->info.samples) :
+					     image->info.levels - 1);
+	} else {
+		state[3] |= S_008F1C_POW2_PAD(image->info.levels > 1);
+		state[4] |= S_008F20_DEPTH(depth - 1);
+		state[5] |= S_008F24_LAST_ARRAY(last_layer);
+	}
 	if (image->dcc_offset) {
 		unsigned swap = radv_translate_colorswap(vk_format, FALSE);
 
@@ -333,7 +439,7 @@
 		/* The last dword is unused by hw. The shader uses it to clear
 		 * bits in the first dword of sampler state.
 		 */
-		if (device->physical_device->rad_info.chip_class <= CIK && image->samples <= 1) {
+		if (device->physical_device->rad_info.chip_class <= CIK && image->info.samples <= 1) {
 			if (first_level == last_level)
 				state[7] = C_008F30_MAX_ANISO_RATIO;
 			else
@@ -343,45 +449,75 @@
 
 	/* Initialize the sampler view for FMASK. */
 	if (image->fmask.size) {
-		uint32_t fmask_format;
+		uint32_t fmask_format, num_format;
 		uint64_t gpu_address = device->ws->buffer_get_va(image->bo);
 		uint64_t va;
 
 		va = gpu_address + image->offset + image->fmask.offset;
 
-		switch (image->samples) {
-		case 2:
-			fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
-			break;
-		case 4:
-			fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
-			break;
-		case 8:
-			fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
-			break;
-		default:
-			assert(0);
-			fmask_format = V_008F14_IMG_DATA_FORMAT_INVALID;
+		if (device->physical_device->rad_info.chip_class >= GFX9) {
+			fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK;
+			switch (image->info.samples) {
+			case 2:
+				num_format = V_008F14_IMG_FMASK_8_2_2;
+				break;
+			case 4:
+				num_format = V_008F14_IMG_FMASK_8_4_4;
+				break;
+			case 8:
+				num_format = V_008F14_IMG_FMASK_32_8_8;
+				break;
+			default:
+				unreachable("invalid nr_samples");
+			}
+		} else {
+			switch (image->info.samples) {
+			case 2:
+				fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
+				break;
+			case 4:
+				fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
+				break;
+			case 8:
+				fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
+				break;
+			default:
+				assert(0);
+				fmask_format = V_008F14_IMG_DATA_FORMAT_INVALID;
+			}
+			num_format = V_008F14_IMG_NUM_FORMAT_UINT;
 		}
 
 		fmask_state[0] = va >> 8;
+		if (device->physical_device->rad_info.chip_class < GFX9)
+			fmask_state[0] |= image->surface.u.legacy.tile_swizzle;
 		fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) |
 			S_008F14_DATA_FORMAT_GFX6(fmask_format) |
-			S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_UINT);
+			S_008F14_NUM_FORMAT_GFX6(num_format);
 		fmask_state[2] = S_008F18_WIDTH(width - 1) |
 			S_008F18_HEIGHT(height - 1);
 		fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) |
 			S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
 			S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
 			S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
-			S_008F1C_TILING_INDEX(image->fmask.tile_mode_index) |
-			S_008F1C_TYPE(radv_tex_dim(image->type, view_type, 1, 0, false));
-		fmask_state[4] = S_008F20_DEPTH(depth - 1) |
-			S_008F20_PITCH_GFX6(image->fmask.pitch_in_pixels - 1);
-		fmask_state[5] = S_008F24_BASE_ARRAY(first_layer) |
-			S_008F24_LAST_ARRAY(last_layer);
+			S_008F1C_TYPE(radv_tex_dim(image->type, view_type, 1, 0, false, false));
+		fmask_state[4] = 0;
+		fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
 		fmask_state[6] = 0;
 		fmask_state[7] = 0;
+
+		if (device->physical_device->rad_info.chip_class >= GFX9) {
+			fmask_state[3] |= S_008F1C_SW_MODE(image->surface.u.gfx9.fmask.swizzle_mode);
+			fmask_state[4] |= S_008F20_DEPTH(last_layer) |
+					  S_008F20_PITCH_GFX9(image->surface.u.gfx9.fmask.epitch);
+			fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(image->surface.u.gfx9.cmask.pipe_aligned) |
+					  S_008F24_META_RB_ALIGNED(image->surface.u.gfx9.cmask.rb_aligned);
+		} else {
+			fmask_state[3] |= S_008F1C_TILING_INDEX(image->fmask.tile_mode_index);
+			fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
+				S_008F20_PITCH_GFX6(image->fmask.pitch_in_pixels - 1);
+			fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer);
+		}
 	} else if (fmask_state)
 		memset(fmask_state, 0, 8 * 4);
 }
@@ -409,15 +545,15 @@
 	md->metadata[1] = si_get_bo_metadata_word1(device);
 
 
-	si_make_texture_descriptor(device, image, true,
+	si_make_texture_descriptor(device, image, false,
 				   (VkImageViewType)image->type, image->vk_format,
-				   &fixedmapping, 0, image->levels - 1, 0,
-				   image->array_size,
-				   image->extent.width, image->extent.height,
-				   image->extent.depth,
+				   &fixedmapping, 0, image->info.levels - 1, 0,
+				   image->info.array_size,
+				   image->info.width, image->info.height,
+				   image->info.depth,
 				   desc, NULL);
 
-	si_set_mutable_tex_desc_fields(device, image, &image->surface.level[0], 0, 0,
+	si_set_mutable_tex_desc_fields(device, image, &image->surface.u.legacy.level[0], 0, 0,
 				       image->surface.blk_w, false, desc);
 
 	/* Clear the base address and set the relative DCC offset. */
@@ -429,10 +565,11 @@
 	memcpy(&md->metadata[2], desc, sizeof(desc));
 
 	/* Dwords [10:..] contain the mipmap level offsets. */
-	for (i = 0; i <= image->levels - 1; i++)
-		md->metadata[10+i] = image->surface.level[i].offset >> 8;
-
-	md->size_metadata = (11 + image->levels - 1) * 4;
+	if (device->physical_device->rad_info.chip_class <= VI) {
+		for (i = 0; i <= image->info.levels - 1; i++)
+			md->metadata[10+i] = image->surface.u.legacy.level[i].offset >> 8;
+		md->size_metadata = (11 + image->info.levels - 1) * 4;
+	}
 }
 
 void
@@ -443,19 +580,23 @@
 	struct radeon_surf *surface = &image->surface;
 
 	memset(metadata, 0, sizeof(*metadata));
-	metadata->microtile = surface->level[0].mode >= RADEON_SURF_MODE_1D ?
-		RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
-	metadata->macrotile = surface->level[0].mode >= RADEON_SURF_MODE_2D ?
-		RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
-	metadata->pipe_config = surface->pipe_config;
-	metadata->bankw = surface->bankw;
-	metadata->bankh = surface->bankh;
-	metadata->tile_split = surface->tile_split;
-	metadata->mtilea = surface->mtilea;
-	metadata->num_banks = surface->num_banks;
-	metadata->stride = surface->level[0].pitch_bytes;
-	metadata->scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
 
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		metadata->u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;
+	} else {
+		metadata->u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ?
+			RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+		metadata->u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ?
+			RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+		metadata->u.legacy.pipe_config = surface->u.legacy.pipe_config;
+		metadata->u.legacy.bankw = surface->u.legacy.bankw;
+		metadata->u.legacy.bankh = surface->u.legacy.bankh;
+		metadata->u.legacy.tile_split = surface->u.legacy.tile_split;
+		metadata->u.legacy.mtilea = surface->u.legacy.mtilea;
+		metadata->u.legacy.num_banks = surface->u.legacy.num_banks;
+		metadata->u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe;
+		metadata->u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
+	}
 	radv_query_opaque_metadata(device, image, metadata);
 }
 
@@ -467,14 +608,20 @@
 			  struct radv_fmask_info *out)
 {
 	/* FMASK is allocated like an ordinary texture. */
-	struct radeon_surf fmask = image->surface;
-
+	struct radeon_surf fmask = {};
+	struct ac_surf_info info = image->info;
 	memset(out, 0, sizeof(*out));
 
-	fmask.bo_alignment = 0;
-	fmask.bo_size = 0;
-	fmask.nsamples = 1;
-	fmask.flags |= RADEON_SURF_FMASK;
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		out->alignment = image->surface.u.gfx9.fmask_alignment;
+		out->size = image->surface.u.gfx9.fmask_size;
+		return;
+	}
+
+	fmask.blk_w = image->surface.blk_w;
+	fmask.blk_h = image->surface.blk_h;
+	info.samples = 1;
+	fmask.flags = image->surface.flags | RADEON_SURF_FMASK;
 
 	/* Force 2D tiling if it wasn't set. This may occur when creating
 	 * FMASK for MSAA resolve on R6xx. On R6xx, the single-sample
@@ -482,8 +629,6 @@
 	fmask.flags = RADEON_SURF_CLR(fmask.flags, MODE);
 	fmask.flags |= RADEON_SURF_SET(RADEON_SURF_MODE_2D, MODE);
 
-	fmask.flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
-
 	switch (nr_samples) {
 	case 2:
 	case 4:
@@ -496,25 +641,25 @@
 		return;
 	}
 
-	device->ws->surface_init(device->ws, &fmask);
-	assert(fmask.level[0].mode == RADEON_SURF_MODE_2D);
+	device->ws->surface_init(device->ws, &info, &fmask);
+	assert(fmask.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
 
-	out->slice_tile_max = (fmask.level[0].nblk_x * fmask.level[0].nblk_y) / 64;
+	out->slice_tile_max = (fmask.u.legacy.level[0].nblk_x * fmask.u.legacy.level[0].nblk_y) / 64;
 	if (out->slice_tile_max)
 		out->slice_tile_max -= 1;
 
-	out->tile_mode_index = fmask.tiling_index[0];
-	out->pitch_in_pixels = fmask.level[0].nblk_x;
-	out->bank_height = fmask.bankh;
-	out->alignment = MAX2(256, fmask.bo_alignment);
-	out->size = fmask.bo_size;
+	out->tile_mode_index = fmask.u.legacy.tiling_index[0];
+	out->pitch_in_pixels = fmask.u.legacy.level[0].nblk_x;
+	out->bank_height = fmask.u.legacy.bankh;
+	out->alignment = MAX2(256, fmask.surf_alignment);
+	out->size = fmask.surf_size;
 }
 
 static void
 radv_image_alloc_fmask(struct radv_device *device,
 		       struct radv_image *image)
 {
-	radv_image_get_fmask_info(device, image, image->samples, &image->fmask);
+	radv_image_get_fmask_info(device, image, image->info.samples, &image->fmask);
 
 	image->fmask.offset = align64(image->size, image->fmask.alignment);
 	image->size = image->fmask.offset + image->fmask.size;
@@ -530,6 +675,12 @@
 	unsigned num_pipes = device->physical_device->rad_info.num_tile_pipes;
 	unsigned cl_width, cl_height;
 
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		out->alignment = image->surface.u.gfx9.cmask_alignment;
+		out->size = image->surface.u.gfx9.cmask_size;
+		return;
+	}
+
 	switch (num_pipes) {
 	case 2:
 		cl_width = 32;
@@ -554,8 +705,8 @@
 
 	unsigned base_align = num_pipes * pipe_interleave_bytes;
 
-	unsigned width = align(image->surface.npix_x, cl_width*8);
-	unsigned height = align(image->surface.npix_y, cl_height*8);
+	unsigned width = align(image->info.width, cl_width*8);
+	unsigned height = align(image->info.height, cl_height*8);
 	unsigned slice_elements = (width * height) / (8*8);
 
 	/* Each element of CMASK is a nibble. */
@@ -566,7 +717,7 @@
 		out->slice_tile_max -= 1;
 
 	out->alignment = MAX2(256, base_align);
-	out->size = (image->type == VK_IMAGE_TYPE_3D ? image->extent.depth : image->array_size) *
+	out->size = (image->type == VK_IMAGE_TYPE_3D ? image->info.depth : image->info.array_size) *
 		    align(slice_bytes, base_align);
 }
 
@@ -574,12 +725,16 @@
 radv_image_alloc_cmask(struct radv_device *device,
 		       struct radv_image *image)
 {
+	uint32_t clear_value_size = 0;
 	radv_image_get_cmask_info(device, image, &image->cmask);
 
 	image->cmask.offset = align64(image->size, image->cmask.alignment);
 	/* + 8 for storing the clear values */
-	image->clear_value_offset = image->cmask.offset + image->cmask.size;
-	image->size = image->cmask.offset + image->cmask.size + 8;
+	if (!image->clear_value_offset) {
+		image->clear_value_offset = image->cmask.offset + image->cmask.size;
+		clear_value_size = 8;
+	}
+	image->size = image->cmask.offset + image->cmask.size + clear_value_size;
 	image->alignment = MAX2(image->alignment, image->cmask.alignment);
 }
 
@@ -588,9 +743,10 @@
 		       struct radv_image *image)
 {
 	image->dcc_offset = align64(image->size, image->surface.dcc_alignment);
-	/* + 8 for storing the clear values */
+	/* + 16 for storing the clear values + dcc pred */
 	image->clear_value_offset = image->dcc_offset + image->surface.dcc_size;
-	image->size = image->dcc_offset + image->surface.dcc_size + 8;
+	image->dcc_pred_offset = image->clear_value_offset + 8;
+	image->size = image->dcc_offset + image->surface.dcc_size + 16;
 	image->alignment = MAX2(image->alignment, image->surface.dcc_alignment);
 }
 
@@ -598,7 +754,7 @@
 radv_image_alloc_htile(struct radv_device *device,
 		       struct radv_image *image)
 {
-	if ((device->debug_flags & RADV_DEBUG_NO_HIZ) || image->levels > 1) {
+	if ((device->debug_flags & RADV_DEBUG_NO_HIZ) || image->info.levels > 1) {
 		image->surface.htile_size = 0;
 		return;
 	}
@@ -637,11 +793,14 @@
 
 	memset(image, 0, sizeof(*image));
 	image->type = pCreateInfo->imageType;
-	image->extent = pCreateInfo->extent;
+	image->info.width = pCreateInfo->extent.width;
+	image->info.height = pCreateInfo->extent.height;
+	image->info.depth = pCreateInfo->extent.depth;
+	image->info.samples = pCreateInfo->samples;
+	image->info.array_size = pCreateInfo->arrayLayers;
+	image->info.levels = pCreateInfo->mipLevels;
+
 	image->vk_format = pCreateInfo->format;
-	image->levels = pCreateInfo->mipLevels;
-	image->array_size = pCreateInfo->arrayLayers;
-	image->samples = pCreateInfo->samples;
 	image->tiling = pCreateInfo->tiling;
 	image->usage = pCreateInfo->usage;
 	image->flags = pCreateInfo->flags;
@@ -649,15 +808,24 @@
 	image->exclusive = pCreateInfo->sharingMode == VK_SHARING_MODE_EXCLUSIVE;
 	if (pCreateInfo->sharingMode == VK_SHARING_MODE_CONCURRENT) {
 		for (uint32_t i = 0; i < pCreateInfo->queueFamilyIndexCount; ++i)
-			image->queue_family_mask |= 1u << pCreateInfo->pQueueFamilyIndices[i];
+			if (pCreateInfo->pQueueFamilyIndices[i] == VK_QUEUE_FAMILY_EXTERNAL_KHR)
+				image->queue_family_mask |= (1u << RADV_MAX_QUEUE_FAMILIES) - 1u;
+			else
+				image->queue_family_mask |= 1u << pCreateInfo->pQueueFamilyIndices[i];
+	}
+
+	image->shareable = vk_find_struct_const(pCreateInfo->pNext,
+	                                        EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR) != NULL;
+	if (!vk_format_is_depth(pCreateInfo->format) && !create_info->scanout && !image->shareable) {
+		image->info.surf_index = p_atomic_inc_return(&device->image_mrt_offset_counter) - 1;
 	}
 
 	radv_init_surface(device, &image->surface, create_info);
 
-	device->ws->surface_init(device->ws, &image->surface);
+	device->ws->surface_init(device->ws, &image->info, &image->surface);
 
-	image->size = image->surface.bo_size;
-	image->alignment = image->surface.bo_alignment;
+	image->size = image->surface.surf_size;
+	image->alignment = image->surface.surf_alignment;
 
 	if (image->exclusive || image->queue_family_mask == 1)
 		can_cmask_dcc = true;
@@ -670,22 +838,17 @@
 
 	if ((pCreateInfo->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) &&
 	    pCreateInfo->mipLevels == 1 &&
-	    !image->surface.dcc_size && image->extent.depth == 1 && can_cmask_dcc)
+	    !image->surface.dcc_size && image->info.depth == 1 && can_cmask_dcc &&
+	    !image->surface.is_linear)
 		radv_image_alloc_cmask(device, image);
-	if (image->samples > 1 && vk_format_is_color(pCreateInfo->format)) {
+
+	if (image->info.samples > 1 && vk_format_is_color(pCreateInfo->format)) {
 		radv_image_alloc_fmask(device, image);
 	} else if (vk_format_is_depth(pCreateInfo->format)) {
 
 		radv_image_alloc_htile(device, image);
 	}
 
-
-	if (create_info->stride && create_info->stride != image->surface.level[0].pitch_bytes) {
-		image->surface.level[0].nblk_x = create_info->stride / image->surface.bpe;
-		image->surface.level[0].pitch_bytes = create_info->stride;
-		image->surface.level[0].slice_size = create_info->stride * image->surface.level[0].nblk_y;
-	}
-
 	if (pCreateInfo->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) {
 		image->alignment = MAX2(image->alignment, 4096);
 		image->size = align64(image->size, image->alignment);
@@ -704,25 +867,75 @@
 	return VK_SUCCESS;
 }
 
+static void
+radv_image_view_make_descriptor(struct radv_image_view *iview,
+				struct radv_device *device,
+				const VkComponentMapping *components,
+				bool is_storage_image)
+{
+	struct radv_image *image = iview->image;
+	bool is_stencil = iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT;
+	uint32_t blk_w;
+	uint32_t *descriptor;
+	uint32_t *fmask_descriptor;
+	uint32_t hw_level = 0;
+
+	if (is_storage_image) {
+		descriptor = iview->storage_descriptor;
+		fmask_descriptor = iview->storage_fmask_descriptor;
+	} else {
+		descriptor = iview->descriptor;
+		fmask_descriptor = iview->fmask_descriptor;
+	}
+
+	assert(image->surface.blk_w % vk_format_get_blockwidth(image->vk_format) == 0);
+	blk_w = image->surface.blk_w / vk_format_get_blockwidth(image->vk_format) * vk_format_get_blockwidth(iview->vk_format);
+
+	if (device->physical_device->rad_info.chip_class >= GFX9)
+		hw_level = iview->base_mip;
+	si_make_texture_descriptor(device, image, is_storage_image,
+				   iview->type,
+				   iview->vk_format,
+				   components,
+				   hw_level, hw_level + iview->level_count - 1,
+				   iview->base_layer,
+				   iview->base_layer + iview->layer_count - 1,
+				   iview->extent.width,
+				   iview->extent.height,
+				   iview->extent.depth,
+				   descriptor,
+				   fmask_descriptor);
+
+	const struct legacy_surf_level *base_level_info = NULL;
+	if (device->physical_device->rad_info.chip_class <= GFX9) {
+		if (is_stencil)
+			base_level_info = &image->surface.u.legacy.stencil_level[iview->base_mip];
+		else
+			base_level_info = &image->surface.u.legacy.level[iview->base_mip];
+	}
+	si_set_mutable_tex_desc_fields(device, image,
+				       base_level_info,
+				       iview->base_mip,
+				       iview->base_mip,
+				       blk_w, is_stencil, descriptor);
+}
+
 void
 radv_image_view_init(struct radv_image_view *iview,
 		     struct radv_device *device,
-		     const VkImageViewCreateInfo* pCreateInfo,
-		     struct radv_cmd_buffer *cmd_buffer,
-		     VkImageUsageFlags usage_mask)
+		     const VkImageViewCreateInfo* pCreateInfo)
 {
 	RADV_FROM_HANDLE(radv_image, image, pCreateInfo->image);
 	const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange;
-	uint32_t blk_w;
-	bool is_stencil = false;
+
 	switch (image->type) {
 	case VK_IMAGE_TYPE_1D:
 	case VK_IMAGE_TYPE_2D:
-		assert(range->baseArrayLayer + radv_get_layerCount(image, range) - 1 <= image->array_size);
+		assert(range->baseArrayLayer + radv_get_layerCount(image, range) - 1 <= image->info.array_size);
 		break;
 	case VK_IMAGE_TYPE_3D:
 		assert(range->baseArrayLayer + radv_get_layerCount(image, range) - 1
-		       <= radv_minify(image->extent.depth, range->baseMipLevel));
+		       <= radv_minify(image->info.depth, range->baseMipLevel));
 		break;
 	default:
 		unreachable("bad VkImageType");
@@ -734,127 +947,59 @@
 	iview->aspect_mask = pCreateInfo->subresourceRange.aspectMask;
 
 	if (iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
-		is_stencil = true;
 		iview->vk_format = vk_format_stencil_only(iview->vk_format);
 	} else if (iview->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
 		iview->vk_format = vk_format_depth_only(iview->vk_format);
 	}
 
-	iview->extent = (VkExtent3D) {
-		.width  = radv_minify(image->extent.width , range->baseMipLevel),
-		.height = radv_minify(image->extent.height, range->baseMipLevel),
-		.depth  = radv_minify(image->extent.depth , range->baseMipLevel),
-	};
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		iview->extent = (VkExtent3D) {
+			.width = image->info.width,
+			.height = image->info.height,
+			.depth = image->info.depth,
+		};
+	} else {
+		iview->extent = (VkExtent3D) {
+			.width  = radv_minify(image->info.width , range->baseMipLevel),
+			.height = radv_minify(image->info.height, range->baseMipLevel),
+			.depth  = radv_minify(image->info.depth , range->baseMipLevel),
+		};
+	}
 
-	iview->extent.width = round_up_u32(iview->extent.width * vk_format_get_blockwidth(iview->vk_format),
-					   vk_format_get_blockwidth(image->vk_format));
-	iview->extent.height = round_up_u32(iview->extent.height * vk_format_get_blockheight(iview->vk_format),
-					    vk_format_get_blockheight(image->vk_format));
+	if (iview->vk_format != image->vk_format) {
+		iview->extent.width = round_up_u32(iview->extent.width * vk_format_get_blockwidth(iview->vk_format),
+						   vk_format_get_blockwidth(image->vk_format));
+		iview->extent.height = round_up_u32(iview->extent.height * vk_format_get_blockheight(iview->vk_format),
+						    vk_format_get_blockheight(image->vk_format));
+	}
 
-	assert(image->surface.blk_w % vk_format_get_blockwidth(image->vk_format) == 0);
-	blk_w = image->surface.blk_w / vk_format_get_blockwidth(image->vk_format) * vk_format_get_blockwidth(iview->vk_format);
 	iview->base_layer = range->baseArrayLayer;
 	iview->layer_count = radv_get_layerCount(image, range);
 	iview->base_mip = range->baseMipLevel;
+	iview->level_count = radv_get_levelCount(image, range);
 
-	si_make_texture_descriptor(device, image, false,
-				   iview->type,
-				   iview->vk_format,
-				   &pCreateInfo->components,
-				   0, radv_get_levelCount(image, range) - 1,
-				   range->baseArrayLayer,
-				   range->baseArrayLayer + radv_get_layerCount(image, range) - 1,
-				   iview->extent.width,
-				   iview->extent.height,
-				   iview->extent.depth,
-				   iview->descriptor,
-				   iview->fmask_descriptor);
-	si_set_mutable_tex_desc_fields(device, image,
-				       is_stencil ? &image->surface.stencil_level[range->baseMipLevel] : &image->surface.level[range->baseMipLevel], range->baseMipLevel,
-				       range->baseMipLevel,
-				       blk_w, is_stencil, iview->descriptor);
-}
-
-void radv_image_set_optimal_micro_tile_mode(struct radv_device *device,
-					    struct radv_image *image, uint32_t micro_tile_mode)
-{
-	/* These magic numbers were copied from addrlib. It doesn't use any
-	 * definitions for them either. They are all 2D_TILED_THIN1 modes with
-	 * different bpp and micro tile mode.
-	 */
-	if (device->physical_device->rad_info.chip_class >= CIK) {
-		switch (micro_tile_mode) {
-		case 0: /* displayable */
-			image->surface.tiling_index[0] = 10;
-			break;
-		case 1: /* thin */
-			image->surface.tiling_index[0] = 14;
-			break;
-		case 3: /* rotated */
-			image->surface.tiling_index[0] = 28;
-			break;
-		default: /* depth, thick */
-			assert(!"unexpected micro mode");
-			return;
-		}
-	} else { /* SI */
-		switch (micro_tile_mode) {
-		case 0: /* displayable */
-			switch (image->surface.bpe) {
-			case 1:
-                            image->surface.tiling_index[0] = 10;
-                            break;
-			case 2:
-                            image->surface.tiling_index[0] = 11;
-                            break;
-			default: /* 4, 8 */
-                            image->surface.tiling_index[0] = 12;
-                            break;
-			}
-			break;
-		case 1: /* thin */
-			switch (image->surface.bpe) {
-			case 1:
-                                image->surface.tiling_index[0] = 14;
-                                break;
-			case 2:
-                                image->surface.tiling_index[0] = 15;
-                                break;
-			case 4:
-                                image->surface.tiling_index[0] = 16;
-                                break;
-			default: /* 8, 16 */
-                                image->surface.tiling_index[0] = 17;
-                                break;
-			}
-			break;
-		default: /* depth, thick */
-			assert(!"unexpected micro mode");
-			return;
-		}
-	}
-
-	image->surface.micro_tile_mode = micro_tile_mode;
+	radv_image_view_make_descriptor(iview, device, &pCreateInfo->components, false);
+	radv_image_view_make_descriptor(iview, device, &pCreateInfo->components, true);
 }
 
 bool radv_layout_has_htile(const struct radv_image *image,
-                           VkImageLayout layout)
+                           VkImageLayout layout,
+                           unsigned queue_mask)
 {
-	return (layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
-		layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+	return image->surface.htile_size &&
+	       (layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
+	        layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) &&
+	       queue_mask == (1u << RADV_QUEUE_GENERAL);
 }
 
 bool radv_layout_is_htile_compressed(const struct radv_image *image,
-                                     VkImageLayout layout)
+                                     VkImageLayout layout,
+                                     unsigned queue_mask)
 {
-	return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
-}
-
-bool radv_layout_can_expclear(const struct radv_image *image,
-                              VkImageLayout layout)
-{
-	return (layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
-		layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+	return image->surface.htile_size &&
+	       (layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
+	        layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) &&
+	       queue_mask == (1u << RADV_QUEUE_GENERAL);
 }
 
 bool radv_layout_can_fast_clear(const struct radv_image *image,
@@ -870,6 +1015,8 @@
 {
 	if (!image->exclusive)
 		return image->queue_family_mask;
+	if (family == VK_QUEUE_FAMILY_EXTERNAL_KHR)
+		return (1u << RADV_MAX_QUEUE_FAMILIES) - 1u;
 	if (family == VK_QUEUE_FAMILY_IGNORED)
 		return 1u << queue_family;
 	return 1u << family;
@@ -907,22 +1054,34 @@
 }
 
 void radv_GetImageSubresourceLayout(
-	VkDevice                                    device,
+	VkDevice                                    _device,
 	VkImage                                     _image,
 	const VkImageSubresource*                   pSubresource,
 	VkSubresourceLayout*                        pLayout)
 {
 	RADV_FROM_HANDLE(radv_image, image, _image);
+	RADV_FROM_HANDLE(radv_device, device, _device);
 	int level = pSubresource->mipLevel;
 	int layer = pSubresource->arrayLayer;
+	struct radeon_surf *surface = &image->surface;
 
-	pLayout->offset = image->surface.level[level].offset + image->surface.level[level].slice_size * layer;
-	pLayout->rowPitch = image->surface.level[level].pitch_bytes;
-	pLayout->arrayPitch = image->surface.level[level].slice_size;
-	pLayout->depthPitch = image->surface.level[level].slice_size;
-	pLayout->size = image->surface.level[level].slice_size;
-	if (image->type == VK_IMAGE_TYPE_3D)
-		pLayout->size *= image->surface.level[level].nblk_z;
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		pLayout->offset = surface->u.gfx9.offset[level] + surface->u.gfx9.surf_slice_size * layer;
+		pLayout->rowPitch = surface->u.gfx9.surf_pitch * surface->bpe;
+		pLayout->arrayPitch = surface->u.gfx9.surf_slice_size;
+		pLayout->depthPitch = surface->u.gfx9.surf_slice_size;
+		pLayout->size = surface->u.gfx9.surf_slice_size;
+		if (image->type == VK_IMAGE_TYPE_3D)
+			pLayout->size *= u_minify(image->info.depth, level);
+	} else {
+		pLayout->offset = surface->u.legacy.level[level].offset + surface->u.legacy.level[level].slice_size * layer;
+		pLayout->rowPitch = surface->u.legacy.level[level].nblk_x * surface->bpe;
+		pLayout->arrayPitch = surface->u.legacy.level[level].slice_size;
+		pLayout->depthPitch = surface->u.legacy.level[level].slice_size;
+		pLayout->size = surface->u.legacy.level[level].slice_size;
+		if (image->type == VK_IMAGE_TYPE_3D)
+			pLayout->size *= u_minify(image->info.depth, level);
+	}
 }
 
 
@@ -940,7 +1099,7 @@
 	if (view == NULL)
 		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
-	radv_image_view_init(view, device, pCreateInfo, NULL, ~0);
+	radv_image_view_init(view, device, pCreateInfo);
 
 	*pView = radv_image_view_to_handle(view);
 
diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c
index fac0dcf..af56f49 100644
--- a/src/amd/vulkan/radv_meta.c
+++ b/src/amd/vulkan/radv_meta.c
@@ -30,21 +30,20 @@
 #include <pwd.h>
 #include <sys/stat.h>
 
-void
-radv_meta_save(struct radv_meta_saved_state *state,
+static void
+radv_meta_save_novertex(struct radv_meta_saved_state *state,
 	       const struct radv_cmd_buffer *cmd_buffer,
 	       uint32_t dynamic_mask)
 {
 	state->old_pipeline = cmd_buffer->state.pipeline;
 	state->old_descriptor_set0 = cmd_buffer->state.descriptors[0];
-	memcpy(state->old_vertex_bindings, cmd_buffer->state.vertex_bindings,
-	       sizeof(state->old_vertex_bindings));
 
 	state->dynamic_mask = dynamic_mask;
 	radv_dynamic_state_copy(&state->dynamic, &cmd_buffer->state.dynamic,
 				dynamic_mask);
 
 	memcpy(state->push_constants, cmd_buffer->push_constants, MAX_PUSH_CONSTANTS_SIZE);
+	state->vertex_saved = false;
 }
 
 void
@@ -53,12 +52,13 @@
 {
 	radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_GRAPHICS,
 			     radv_pipeline_to_handle(state->old_pipeline));
-
 	cmd_buffer->state.descriptors[0] = state->old_descriptor_set0;
-	memcpy(cmd_buffer->state.vertex_bindings, state->old_vertex_bindings,
-	       sizeof(state->old_vertex_bindings));
+	if (state->vertex_saved) {
+		memcpy(cmd_buffer->state.vertex_bindings, state->old_vertex_bindings,
+		       sizeof(state->old_vertex_bindings));
+		cmd_buffer->state.vb_dirty |= (1 << RADV_META_VERTEX_BINDING_COUNT) - 1;
+	}
 
-	cmd_buffer->state.vb_dirty |= (1 << RADV_META_VERTEX_BINDING_COUNT) - 1;
 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
 
 	radv_dynamic_state_copy(&cmd_buffer->state.dynamic, &state->dynamic,
@@ -338,8 +338,14 @@
 	result = radv_device_init_meta_resolve_compute_state(device);
 	if (result != VK_SUCCESS)
 		goto fail_resolve_compute;
+
+	result = radv_device_init_meta_resolve_fragment_state(device);
+	if (result != VK_SUCCESS)
+		goto fail_resolve_fragment;
 	return VK_SUCCESS;
 
+fail_resolve_fragment:
+	radv_device_finish_meta_resolve_compute_state(device);
 fail_resolve_compute:
 	radv_device_finish_meta_fast_clear_flush_state(device);
 fail_fast_clear:
@@ -376,6 +382,7 @@
 	radv_device_finish_meta_buffer_state(device);
 	radv_device_finish_meta_fast_clear_flush_state(device);
 	radv_device_finish_meta_resolve_compute_state(device);
+	radv_device_finish_meta_resolve_fragment_state(device);
 
 	radv_store_meta_pipeline(device);
 	radv_pipeline_cache_finish(&device->meta_state.cache);
@@ -387,12 +394,166 @@
  * should have no effect.
  */
 void
-radv_meta_save_graphics_reset_vport_scissor(struct radv_meta_saved_state *saved_state,
-					    struct radv_cmd_buffer *cmd_buffer)
+radv_meta_save_graphics_reset_vport_scissor_novertex(struct radv_meta_saved_state *saved_state,
+						     struct radv_cmd_buffer *cmd_buffer)
 {
 	uint32_t dirty_state = (1 << VK_DYNAMIC_STATE_VIEWPORT) | (1 << VK_DYNAMIC_STATE_SCISSOR);
-	radv_meta_save(saved_state, cmd_buffer, dirty_state);
+	radv_meta_save_novertex(saved_state, cmd_buffer, dirty_state);
 	cmd_buffer->state.dynamic.viewport.count = 0;
 	cmd_buffer->state.dynamic.scissor.count = 0;
 	cmd_buffer->state.dirty |= dirty_state;
 }
+
+nir_ssa_def *radv_meta_gen_rect_vertices_comp2(nir_builder *vs_b, nir_ssa_def *comp2)
+{
+	/* Emits a vec4: x/y chosen from the vertex id (table below), z = comp2, w = 1.0. */
+	nir_intrinsic_instr *vertex_id = nir_intrinsic_instr_create(vs_b->shader, nir_intrinsic_load_vertex_id_zero_base);
+	nir_ssa_dest_init(&vertex_id->instr, &vertex_id->dest, 1, 32, "vertexid");
+	nir_builder_instr_insert(vs_b, &vertex_id->instr);
+
+	/* vertex 0 - -1.0, -1.0 */
+	/* vertex 1 - -1.0, 1.0 */
+	/* vertex 2 - 1.0, -1.0 */
+	/* so channel 0 is vertex_id != 2 ? -1.0 : 1.0
+	   channel 1 is vertex_id != 1 ? -1.0 : 1.0 */
+
+	nir_ssa_def *c0cmp = nir_ine(vs_b, &vertex_id->dest.ssa,
+				     nir_imm_int(vs_b, 2));
+	nir_ssa_def *c1cmp = nir_ine(vs_b, &vertex_id->dest.ssa,
+				     nir_imm_int(vs_b, 1));
+
+	nir_ssa_def *comp[4];
+	comp[0] = nir_bcsel(vs_b, c0cmp,
+			    nir_imm_float(vs_b, -1.0),
+			    nir_imm_float(vs_b, 1.0));
+
+	comp[1] = nir_bcsel(vs_b, c1cmp,
+			    nir_imm_float(vs_b, -1.0),
+			    nir_imm_float(vs_b, 1.0));
+	comp[2] = comp2;
+	comp[3] = nir_imm_float(vs_b, 1.0);
+	nir_ssa_def *outvec = nir_vec(vs_b, comp, 4);
+
+	return outvec;
+}
+
+nir_ssa_def *radv_meta_gen_rect_vertices(nir_builder *vs_b)
+{
+	return radv_meta_gen_rect_vertices_comp2(vs_b, nir_imm_float(vs_b, 0.0));
+}
+
+/* Builds the "meta_vs_gen_verts" vertex shader: stores radv_meta_gen_rect_vertices() into gl_Position. */
+nir_shader *
+radv_meta_build_nir_vs_generate_vertices(void)
+{
+	const struct glsl_type *vec4 = glsl_vec4_type();
+
+	nir_builder b;
+	nir_variable *v_position;
+
+	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL);
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_vs_gen_verts");
+
+	nir_ssa_def *outvec = radv_meta_gen_rect_vertices(&b);
+
+	v_position = nir_variable_create(b.shader, nir_var_shader_out, vec4,
+					 "gl_Position");
+	v_position->data.location = VARYING_SLOT_POS;
+
+	nir_store_var(&b, v_position, outvec, 0xf);
+
+	return b.shader;
+}
+
+nir_shader *
+radv_meta_build_nir_fs_noop(void)
+{
+	nir_builder b;
+
+	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
+	b.shader->info.name = ralloc_asprintf(b.shader,
+					       "meta_noop_fs");
+
+	return b.shader;
+}
+
+void radv_meta_build_resolve_shader_core(nir_builder *b,
+					 bool is_integer,
+					 int samples,
+					 nir_variable *input_img,
+					 nir_variable *color,
+					 nir_ssa_def *img_coord)
+{
+	/* Resolves input_img at img_coord into color: sample 0 alone, or the average of all samples when needed. */
+	nir_ssa_def *tmp;
+	nir_if *outer_if = NULL;
+	/* Sample 0 is always fetched; tmp starts as its value and accumulates the others. */
+	nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
+	tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
+	tex->op = nir_texop_txf_ms;
+	tex->src[0].src_type = nir_tex_src_coord;
+	tex->src[0].src = nir_src_for_ssa(img_coord);
+	tex->src[1].src_type = nir_tex_src_ms_index;
+	tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
+	tex->dest_type = nir_type_float;
+	tex->is_array = false;
+	tex->coord_components = 2;
+	tex->texture = nir_deref_var_create(tex, input_img);
+	tex->sampler = NULL;
+
+	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
+	nir_builder_instr_insert(b, &tex->instr);
+
+	tmp = &tex->dest.ssa;
+	/* Only float formats (!is_integer) with samples > 1 are averaged. */
+	if (!is_integer && samples > 1) {
+		nir_tex_instr *tex_all_same = nir_tex_instr_create(b->shader, 1);
+		tex_all_same->sampler_dim = GLSL_SAMPLER_DIM_MS;
+		tex_all_same->op = nir_texop_samples_identical;
+		tex_all_same->src[0].src_type = nir_tex_src_coord;
+		tex_all_same->src[0].src = nir_src_for_ssa(img_coord);
+		tex_all_same->dest_type = nir_type_float;
+		tex_all_same->is_array = false;
+		tex_all_same->coord_components = 2;
+		tex_all_same->texture = nir_deref_var_create(tex_all_same, input_img);
+		tex_all_same->sampler = NULL;
+
+		nir_ssa_dest_init(&tex_all_same->instr, &tex_all_same->dest, 1, 32, "tex");
+		nir_builder_instr_insert(b, &tex_all_same->instr);
+		/* samples_identical is non-zero when every sample matches; average only when it is zero. */
+		nir_ssa_def *needs_avg = nir_ieq(b, &tex_all_same->dest.ssa, nir_imm_int(b, 0));
+		nir_if *if_stmt = nir_if_create(b->shader);
+		if_stmt->condition = nir_src_for_ssa(needs_avg);
+		nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
+
+		b->cursor = nir_after_cf_list(&if_stmt->then_list);
+		for (int i = 1; i < samples; i++) {
+			nir_tex_instr *tex_add = nir_tex_instr_create(b->shader, 2);
+			tex_add->sampler_dim = GLSL_SAMPLER_DIM_MS;
+			tex_add->op = nir_texop_txf_ms;
+			tex_add->src[0].src_type = nir_tex_src_coord;
+			tex_add->src[0].src = nir_src_for_ssa(img_coord);
+			tex_add->src[1].src_type = nir_tex_src_ms_index;
+			tex_add->src[1].src = nir_src_for_ssa(nir_imm_int(b, i));
+			tex_add->dest_type = nir_type_float;
+			tex_add->is_array = false;
+			tex_add->coord_components = 2;
+			tex_add->texture = nir_deref_var_create(tex_add, input_img);
+			tex_add->sampler = NULL;
+
+			nir_ssa_dest_init(&tex_add->instr, &tex_add->dest, 4, 32, "tex");
+			nir_builder_instr_insert(b, &tex_add->instr);
+
+			tmp = nir_fadd(b, tmp, &tex_add->dest.ssa);
+		}
+		/* The average is stored in the then branch; the store after this block goes to the else branch. */
+		tmp = nir_fdiv(b, tmp, nir_imm_float(b, samples));
+		nir_store_var(b, color, tmp, 0xf);
+		b->cursor = nir_after_cf_list(&if_stmt->else_list);
+		outer_if = if_stmt;
+	}
+	nir_store_var(b, color, &tex->dest.ssa, 0xf);
+	/* Resume emitting after the if when one was created. */
+	if (outer_if)
+		b->cursor = nir_after_cf_node(&outer_if->cf_node);
+}
diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h
index 6cfc613..adc889b 100644
--- a/src/amd/vulkan/radv_meta.h
+++ b/src/amd/vulkan/radv_meta.h
@@ -35,6 +35,7 @@
 #define RADV_META_VERTEX_BINDING_COUNT 2
 
 struct radv_meta_saved_state {
+	bool vertex_saved;
 	struct radv_vertex_binding old_vertex_bindings[RADV_META_VERTEX_BINDING_COUNT];
 	struct radv_descriptor_set *old_descriptor_set0;
 	struct radv_pipeline *old_pipeline;
@@ -90,9 +91,9 @@
 
 VkResult radv_device_init_meta_resolve_compute_state(struct radv_device *device);
 void radv_device_finish_meta_resolve_compute_state(struct radv_device *device);
-void radv_meta_save(struct radv_meta_saved_state *state,
-		    const struct radv_cmd_buffer *cmd_buffer,
-		    uint32_t dynamic_mask);
+
+VkResult radv_device_init_meta_resolve_fragment_state(struct radv_device *device);
+void radv_device_finish_meta_resolve_fragment_state(struct radv_device *device);
 
 void radv_meta_restore(const struct radv_meta_saved_state *state,
 		       struct radv_cmd_buffer *cmd_buffer);
@@ -200,8 +201,8 @@
 					 struct radv_image *image,
 					 const VkImageSubresourceRange *subresourceRange);
 
-void radv_meta_save_graphics_reset_vport_scissor(struct radv_meta_saved_state *saved_state,
-						 struct radv_cmd_buffer *cmd_buffer);
+void radv_meta_save_graphics_reset_vport_scissor_novertex(struct radv_meta_saved_state *saved_state,
+							  struct radv_cmd_buffer *cmd_buffer);
 
 void radv_meta_resolve_compute_image(struct radv_cmd_buffer *cmd_buffer,
 				     struct radv_image *src_image,
@@ -211,9 +212,32 @@
 				     uint32_t region_count,
 				     const VkImageResolve *regions);
 
+void radv_meta_resolve_fragment_image(struct radv_cmd_buffer *cmd_buffer,
+				      struct radv_image *src_image,
+				      VkImageLayout src_image_layout,
+				      struct radv_image *dest_image,
+				      VkImageLayout dest_image_layout,
+				      uint32_t region_count,
+				      const VkImageResolve *regions);
+
 void radv_blit_to_prime_linear(struct radv_cmd_buffer *cmd_buffer,
 			       struct radv_image *image,
 			       struct radv_image *linear_image);
+
+/* common nir builder helpers */
+#include "nir/nir_builder.h"
+
+nir_ssa_def *radv_meta_gen_rect_vertices(nir_builder *vs_b);
+nir_ssa_def *radv_meta_gen_rect_vertices_comp2(nir_builder *vs_b, nir_ssa_def *comp2);
+nir_shader *radv_meta_build_nir_vs_generate_vertices(void);
+nir_shader *radv_meta_build_nir_fs_noop(void);
+
+void radv_meta_build_resolve_shader_core(nir_builder *b,
+					 bool is_integer,
+					 int samples,
+					 nir_variable *input_img,
+					 nir_variable *color,
+					 nir_ssa_def *img_coord);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/vulkan/radv_meta_blit.c b/src/amd/vulkan/radv_meta_blit.c
index a3256ab..2c1a132 100644
--- a/src/amd/vulkan/radv_meta_blit.c
+++ b/src/amd/vulkan/radv_meta_blit.c
@@ -38,25 +38,64 @@
 	nir_builder b;
 
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_blit_vs");
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_blit_vs");
 
-	nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in,
-						   vec4, "a_pos");
-	pos_in->data.location = VERT_ATTRIB_GENERIC0;
 	nir_variable *pos_out = nir_variable_create(b.shader, nir_var_shader_out,
 						    vec4, "gl_Position");
 	pos_out->data.location = VARYING_SLOT_POS;
-	nir_copy_var(&b, pos_out, pos_in);
 
-	nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
-						       vec4, "a_tex_pos");
-	tex_pos_in->data.location = VERT_ATTRIB_GENERIC1;
 	nir_variable *tex_pos_out = nir_variable_create(b.shader, nir_var_shader_out,
 							vec4, "v_tex_pos");
 	tex_pos_out->data.location = VARYING_SLOT_VAR0;
 	tex_pos_out->data.interpolation = INTERP_MODE_SMOOTH;
-	nir_copy_var(&b, tex_pos_out, tex_pos_in);
 
+	nir_ssa_def *outvec = radv_meta_gen_rect_vertices(&b);
+
+	nir_store_var(&b, pos_out, outvec, 0xf);
+
+	nir_intrinsic_instr *src_box = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	src_box->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+	nir_intrinsic_set_base(src_box, 0);
+	nir_intrinsic_set_range(src_box, 16);
+	src_box->num_components = 4;
+	nir_ssa_dest_init(&src_box->instr, &src_box->dest, 4, 32, "src_box");
+	nir_builder_instr_insert(&b, &src_box->instr);
+
+	nir_intrinsic_instr *src0_z = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	src0_z->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+	nir_intrinsic_set_base(src0_z, 16);
+	nir_intrinsic_set_range(src0_z, 4);
+	src0_z->num_components = 1;
+	nir_ssa_dest_init(&src0_z->instr, &src0_z->dest, 1, 32, "src0_z");
+	nir_builder_instr_insert(&b, &src0_z->instr);
+
+	nir_intrinsic_instr *vertex_id = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_vertex_id_zero_base);
+	nir_ssa_dest_init(&vertex_id->instr, &vertex_id->dest, 1, 32, "vertexid");
+	nir_builder_instr_insert(&b, &vertex_id->instr);
+
+	/* vertex 0 - src0_x, src0_y, src0_z */
+	/* vertex 1 - src0_x, src1_y, src0_z */
+	/* vertex 2 - src1_x, src0_y, src0_z */
+	/* so channel 0 is vertex_id != 2 ? src0_x : src1_x
+	   channel 1 is vertex_id != 1 ? src0_y : src1_y */
+
+	nir_ssa_def *c0cmp = nir_ine(&b, &vertex_id->dest.ssa,
+				     nir_imm_int(&b, 2));
+	nir_ssa_def *c1cmp = nir_ine(&b, &vertex_id->dest.ssa,
+				     nir_imm_int(&b, 1));
+
+	nir_ssa_def *comp[4];
+	comp[0] = nir_bcsel(&b, c0cmp,
+			    nir_channel(&b, &src_box->dest.ssa, 0),
+			    nir_channel(&b, &src_box->dest.ssa, 2));
+
+	comp[1] = nir_bcsel(&b, c1cmp,
+			    nir_channel(&b, &src_box->dest.ssa, 1),
+			    nir_channel(&b, &src_box->dest.ssa, 3));
+	comp[2] = &src0_z->dest.ssa;
+	comp[3] = nir_imm_float(&b, 1.0);
+	nir_ssa_def *out_tex_vec = nir_vec(&b, comp, 4);
+	nir_store_var(&b, tex_pos_out, out_tex_vec, 0xf);
 	return b.shader;
 }
 
@@ -70,7 +109,7 @@
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
 
 	sprintf(shader_name, "meta_blit_fs.%d", tex_dim);
-	b.shader->info->name = ralloc_strdup(b.shader, shader_name);
+	b.shader->info.name = ralloc_strdup(b.shader, shader_name);
 
 	nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
 						       vec4, "v_tex_pos");
@@ -124,7 +163,7 @@
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
 
 	sprintf(shader_name, "meta_blit_depth_fs.%d", tex_dim);
-	b.shader->info->name = ralloc_strdup(b.shader, shader_name);
+	b.shader->info.name = ralloc_strdup(b.shader, shader_name);
 
 	nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
 						       vec4, "v_tex_pos");
@@ -178,7 +217,7 @@
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
 
 	sprintf(shader_name, "meta_blit_stencil_fs.%d", tex_dim);
-	b.shader->info->name = ralloc_strdup(b.shader, shader_name);
+	b.shader->info.name = ralloc_strdup(b.shader, shader_name);
 
 	nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
 						       vec4, "v_tex_pos");
@@ -236,65 +275,26 @@
                VkFilter blit_filter)
 {
 	struct radv_device *device = cmd_buffer->device;
-	unsigned offset = 0;
-	struct blit_vb_data {
-		float pos[2];
-		float tex_coord[3];
-	} vb_data[3];
+	uint32_t src_width = radv_minify(src_iview->image->info.width, src_iview->base_mip);
+	uint32_t src_height = radv_minify(src_iview->image->info.height, src_iview->base_mip);
+	uint32_t src_depth = radv_minify(src_iview->image->info.depth, src_iview->base_mip);
+	uint32_t dst_width = radv_minify(dest_iview->image->info.width, dest_iview->base_mip);
+	uint32_t dst_height = radv_minify(dest_iview->image->info.height, dest_iview->base_mip);
 
-	assert(src_image->samples == dest_image->samples);
-	unsigned vb_size = 3 * sizeof(*vb_data);
-	vb_data[0] = (struct blit_vb_data) {
-		.pos = {
-			-1.0,
-			-1.0,
-		},
-		.tex_coord = {
-			(float)src_offset_0.x / (float)src_iview->extent.width,
-			(float)src_offset_0.y / (float)src_iview->extent.height,
-			(float)src_offset_0.z / (float)src_iview->extent.depth,
-		},
+	assert(src_image->info.samples == dest_image->info.samples);
+
+	float vertex_push_constants[5] = {
+		(float)src_offset_0.x / (float)src_width,
+		(float)src_offset_0.y / (float)src_height,
+		(float)src_offset_1.x / (float)src_width,
+		(float)src_offset_1.y / (float)src_height,
+		(float)src_offset_0.z / (float)src_depth,
 	};
 
-	vb_data[1] = (struct blit_vb_data) {
-		.pos = {
-			-1.0,
-			1.0,
-		},
-		.tex_coord = {
-			(float)src_offset_0.x / (float)src_iview->extent.width,
-			(float)src_offset_1.y / (float)src_iview->extent.height,
-			(float)src_offset_0.z / (float)src_iview->extent.depth,
-		},
-	};
-
-	vb_data[2] = (struct blit_vb_data) {
-		.pos = {
-			1.0,
-			-1.0,
-		},
-		.tex_coord = {
-			(float)src_offset_1.x / (float)src_iview->extent.width,
-			(float)src_offset_0.y / (float)src_iview->extent.height,
-			(float)src_offset_0.z / (float)src_iview->extent.depth,
-		},
-	};
-	radv_cmd_buffer_upload_data(cmd_buffer, vb_size, 16, vb_data, &offset);
-
-	struct radv_buffer vertex_buffer = {
-		.device = device,
-		.size = vb_size,
-		.bo = cmd_buffer->upload.upload_bo,
-		.offset = offset,
-	};
-
-	radv_CmdBindVertexBuffers(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1,
-				  (VkBuffer[]) {
-						  radv_buffer_to_handle(&vertex_buffer)
-						  },
-				  (VkDeviceSize[]) {
-					  0,
-						  });
+	radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+			      device->meta_state.blit.pipeline_layout,
+			      VK_SHADER_STAGE_VERTEX_BIT, 0, 20,
+			      vertex_push_constants);
 
 	VkSampler sampler;
 	radv_CreateSampler(radv_device_to_handle(device),
@@ -315,8 +315,8 @@
 					       .pAttachments = (VkImageView[]) {
 					       radv_image_view_to_handle(dest_iview),
 				       },
-				       .width = dest_iview->extent.width,
-				       .height = dest_iview->extent.height,
+				       .width = dst_width,
+				       .height = dst_height,
 				       .layers = 1,
 				}, &cmd_buffer->pool->alloc, &fb);
 	VkPipeline pipeline;
@@ -509,10 +509,10 @@
 	 *    vkCmdBlitImage must not be used for multisampled source or
 	 *    destination images. Use vkCmdResolveImage for this purpose.
 	 */
-	assert(src_image->samples == 1);
-	assert(dest_image->samples == 1);
+	assert(src_image->info.samples == 1);
+	assert(dest_image->info.samples == 1);
 
-	radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer);
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
 
 	for (unsigned r = 0; r < regionCount; r++) {
 		const VkImageSubresourceLayers *src_res = &pRegions[r].srcSubresource;
@@ -531,8 +531,7 @@
 						     .baseArrayLayer = src_res->baseArrayLayer,
 						     .layerCount = 1
 					     },
-						     },
-				     cmd_buffer, VK_IMAGE_USAGE_SAMPLED_BIT);
+				     });
 
 		unsigned dst_start, dst_end;
 		if (dest_image->type == VK_IMAGE_TYPE_3D) {
@@ -580,12 +579,6 @@
 		dest_box.extent.height = abs(dst_y1 - dst_y0);
 
 		struct radv_image_view dest_iview;
-		unsigned usage;
-		if (dst_res->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT)
-			usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
-		else
-			usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
-
 		const unsigned num_layers = dst_end - dst_start;
 		for (unsigned i = 0; i < num_layers; i++) {
 			const VkOffset3D dest_offset_0 = {
@@ -625,8 +618,7 @@
 							     .baseArrayLayer = dest_array_slice,
 							     .layerCount = 1
 						     },
-					     },
-					     cmd_buffer, usage);
+					     });
 			meta_emit_blit(cmd_buffer,
 				       src_image, &src_iview,
 				       src_offset_0, src_offset_1,
@@ -708,6 +700,8 @@
    VK_FORMAT_R8G8B8A8_UNORM,
    VK_FORMAT_R8G8B8A8_UINT,
    VK_FORMAT_R8G8B8A8_SINT,
+   VK_FORMAT_A2R10G10B10_UINT_PACK32,
+   VK_FORMAT_A2R10G10B10_SINT_PACK32,
    VK_FORMAT_R16G16B16A16_UNORM,
    VK_FORMAT_R16G16B16A16_SNORM,
    VK_FORMAT_R16G16B16A16_UINT,
@@ -765,31 +759,8 @@
 
 		VkPipelineVertexInputStateCreateInfo vi_create_info = {
 			.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-			.vertexBindingDescriptionCount = 1,
-			.pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
-				{
-					.binding = 0,
-					.stride = 5 * sizeof(float),
-					.inputRate = VK_VERTEX_INPUT_RATE_VERTEX
-				},
-			},
-			.vertexAttributeDescriptionCount = 2,
-			.pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
-				{
-					/* Position */
-					.location = 0,
-					.binding = 0,
-					.format = VK_FORMAT_R32G32_SFLOAT,
-					.offset = 0
-				},
-				{
-					/* Texture Coordinate */
-					.location = 1,
-					.binding = 0,
-					.format = VK_FORMAT_R32G32B32_SFLOAT,
-					.offset = 8
-				}
-			}
+			.vertexBindingDescriptionCount = 0,
+			.vertexAttributeDescriptionCount = 0,
 		};
 
 		VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = {
@@ -917,7 +888,7 @@
 					       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
 						       .attachmentCount = 1,
 						       .pAttachments = &(VkAttachmentDescription) {
-						       .format = 0,
+						       .format = VK_FORMAT_D32_SFLOAT,
 						       .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
 						       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
 						       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
@@ -944,31 +915,8 @@
 
 	VkPipelineVertexInputStateCreateInfo vi_create_info = {
 		.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-		.vertexBindingDescriptionCount = 1,
-		.pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
-			{
-				.binding = 0,
-				.stride = 5 * sizeof(float),
-				.inputRate = VK_VERTEX_INPUT_RATE_VERTEX
-			},
-		},
-		.vertexAttributeDescriptionCount = 2,
-		.pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
-			{
-				/* Position */
-				.location = 0,
-				.binding = 0,
-				.format = VK_FORMAT_R32G32_SFLOAT,
-				.offset = 0
-			},
-			{
-				/* Texture Coordinate */
-				.location = 1,
-				.binding = 0,
-				.format = VK_FORMAT_R32G32B32_SFLOAT,
-				.offset = 8
-			}
-		}
+		.vertexBindingDescriptionCount = 0,
+		.vertexAttributeDescriptionCount = 0,
 	};
 
 	VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = {
@@ -1098,7 +1046,7 @@
 					       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
 						       .attachmentCount = 1,
 						       .pAttachments = &(VkAttachmentDescription) {
-						       .format = 0,
+						       .format = VK_FORMAT_S8_UINT,
 						       .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
 						       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
 						       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
@@ -1125,31 +1073,8 @@
 
 	VkPipelineVertexInputStateCreateInfo vi_create_info = {
 		.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-		.vertexBindingDescriptionCount = 1,
-		.pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
-			{
-				.binding = 0,
-				.stride = 5 * sizeof(float),
-				.inputRate = VK_VERTEX_INPUT_RATE_VERTEX
-			},
-		},
-		.vertexAttributeDescriptionCount = 2,
-		.pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
-			{
-				/* Position */
-				.location = 0,
-				.binding = 0,
-				.format = VK_FORMAT_R32G32_SFLOAT,
-				.offset = 0
-			},
-			{
-				/* Texture Coordinate */
-				.location = 1,
-				.binding = 0,
-				.format = VK_FORMAT_R32G32B32_SFLOAT,
-				.offset = 8
-			}
-		}
+		.vertexBindingDescriptionCount = 0,
+		.vertexAttributeDescriptionCount = 0,
 	};
 
 	VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = {
@@ -1308,11 +1233,15 @@
 	if (result != VK_SUCCESS)
 		goto fail;
 
+	const VkPushConstantRange push_constant_range = {VK_SHADER_STAGE_VERTEX_BIT, 0, 20};
+
 	result = radv_CreatePipelineLayout(radv_device_to_handle(device),
 					   &(VkPipelineLayoutCreateInfo) {
 						   .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
 							   .setLayoutCount = 1,
 							   .pSetLayouts = &device->meta_state.blit.ds_layout,
+							   .pushConstantRangeCount = 1,
+							   .pPushConstantRanges = &push_constant_range,
 							   },
 					   &device->meta_state.alloc, &device->meta_state.blit.pipeline_layout);
 	if (result != VK_SUCCESS)
@@ -1329,12 +1258,10 @@
 		goto fail;
 
 	result = radv_device_init_meta_blit_stencil(device, &vs);
-	if (result != VK_SUCCESS)
-		goto fail;
-	return VK_SUCCESS;
 
 fail:
 	ralloc_free(vs.nir);
-	radv_device_finish_meta_blit_state(device);
+	if (result != VK_SUCCESS)
+		radv_device_finish_meta_blit_state(device);
 	return result;
 }
diff --git a/src/amd/vulkan/radv_meta_blit2d.c b/src/amd/vulkan/radv_meta_blit2d.c
index f69fec8..724a88f 100644
--- a/src/amd/vulkan/radv_meta_blit2d.c
+++ b/src/amd/vulkan/radv_meta_blit2d.c
@@ -53,8 +53,8 @@
 static void
 create_iview(struct radv_cmd_buffer *cmd_buffer,
              struct radv_meta_blit2d_surf *surf,
-             VkImageUsageFlags usage,
-             struct radv_image_view *iview, VkFormat depth_format)
+             struct radv_image_view *iview, VkFormat depth_format,
+              VkImageAspectFlagBits aspects)
 {
 	VkFormat format;
 
@@ -70,13 +70,13 @@
 					     .viewType = VK_IMAGE_VIEW_TYPE_2D,
 					     .format = format,
 					     .subresourceRange = {
-					     .aspectMask = surf->aspect_mask,
+					     .aspectMask = aspects,
 					     .baseMipLevel = surf->level,
 					     .levelCount = 1,
 					     .baseArrayLayer = surf->layer,
 					     .layerCount = 1
 				     },
-					     }, cmd_buffer, usage);
+			     });
 }
 
 static void
@@ -112,7 +112,8 @@
                 struct radv_meta_blit2d_surf *src_img,
                 struct radv_meta_blit2d_buffer *src_buf,
                 struct blit2d_src_temps *tmp,
-                enum blit2d_src_type src_type, VkFormat depth_format)
+                enum blit2d_src_type src_type, VkFormat depth_format,
+                VkImageAspectFlagBits aspects)
 {
 	struct radv_device *device = cmd_buffer->device;
 
@@ -136,11 +137,10 @@
 
 		radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
 				      device->meta_state.blit2d.p_layouts[src_type],
-				      VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4,
+				      VK_SHADER_STAGE_FRAGMENT_BIT, 16, 4,
 				      &src_buf->pitch);
 	} else {
-		create_iview(cmd_buffer, src_img, VK_IMAGE_USAGE_SAMPLED_BIT, &tmp->iview,
-			     depth_format);
+		create_iview(cmd_buffer, src_img, &tmp->iview, depth_format, aspects);
 
 		radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS,
 					      device->meta_state.blit2d.p_layouts[src_type],
@@ -177,17 +177,10 @@
                 uint32_t width,
                 uint32_t height,
 		VkFormat depth_format,
-                struct blit2d_dst_temps *tmp)
+                struct blit2d_dst_temps *tmp,
+                VkImageAspectFlagBits aspects)
 {
-	VkImageUsageFlagBits bits;
-
-	if (dst->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT)
-		bits = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
-	else
-		bits = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
-
-	create_iview(cmd_buffer, dst, bits,
-		     &tmp->iview, depth_format);
+	create_iview(cmd_buffer, dst, &tmp->iview, depth_format, aspects);
 
 	radv_CreateFramebuffer(radv_device_to_handle(cmd_buffer->device),
 			       &(VkFramebufferCreateInfo) {
@@ -260,154 +253,111 @@
 	struct radv_device *device = cmd_buffer->device;
 
 	for (unsigned r = 0; r < num_rects; ++r) {
-		VkFormat depth_format = 0;
-		if (dst->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
-			depth_format = vk_format_stencil_only(dst->image->vk_format);
-		else if (dst->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
-			depth_format = vk_format_depth_only(dst->image->vk_format);
-		struct blit2d_src_temps src_temps;
-		blit2d_bind_src(cmd_buffer, src_img, src_buf, &src_temps, src_type, depth_format);
+		unsigned i;
+		for_each_bit(i, dst->aspect_mask) {
+			unsigned aspect_mask = 1u << i;
+			VkFormat depth_format = 0;
+			if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
+				depth_format = vk_format_stencil_only(dst->image->vk_format);
+			else if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
+				depth_format = vk_format_depth_only(dst->image->vk_format);
+			struct blit2d_src_temps src_temps;
+			blit2d_bind_src(cmd_buffer, src_img, src_buf, &src_temps, src_type, depth_format, aspect_mask);
 
-		uint32_t offset = 0;
-		struct blit2d_dst_temps dst_temps;
-		blit2d_bind_dst(cmd_buffer, dst, rects[r].dst_x + rects[r].width,
-				rects[r].dst_y + rects[r].height, depth_format, &dst_temps);
+			struct blit2d_dst_temps dst_temps;
+			blit2d_bind_dst(cmd_buffer, dst, rects[r].dst_x + rects[r].width,
+					rects[r].dst_y + rects[r].height, depth_format, &dst_temps, aspect_mask);
 
-		struct blit_vb_data {
-			float pos[2];
-			float tex_coord[2];
-		} vb_data[3];
-
-		unsigned vb_size = 3 * sizeof(*vb_data);
-
-		vb_data[0] = (struct blit_vb_data) {
-			.pos = {
-				-1.0,
-				-1.0,
-			},
-			.tex_coord = {
+			float vertex_push_constants[4] = {
 				rects[r].src_x,
 				rects[r].src_y,
-			},
-		};
-
-		vb_data[1] = (struct blit_vb_data) {
-			.pos = {
-				-1.0,
-				1.0,
-			},
-			.tex_coord = {
-				rects[r].src_x,
-				rects[r].src_y + rects[r].height,
-			},
-		};
-
-		vb_data[2] = (struct blit_vb_data) {
-			.pos = {
-				1.0,
-				-1.0,
-			},
-			.tex_coord = {
 				rects[r].src_x + rects[r].width,
-				rects[r].src_y,
-			},
-		};
+				rects[r].src_y + rects[r].height,
+			};
+
+			radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+					device->meta_state.blit2d.p_layouts[src_type],
+					VK_SHADER_STAGE_VERTEX_BIT, 0, 16,
+					vertex_push_constants);
+
+			if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) {
+				unsigned fs_key = radv_format_meta_fs_key(dst_temps.iview.vk_format);
+
+				radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
+							&(VkRenderPassBeginInfo) {
+								.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+									.renderPass = device->meta_state.blit2d.render_passes[fs_key],
+									.framebuffer = dst_temps.fb,
+									.renderArea = {
+									.offset = { rects[r].dst_x, rects[r].dst_y, },
+									.extent = { rects[r].width, rects[r].height },
+								},
+									.clearValueCount = 0,
+										.pClearValues = NULL,
+										}, VK_SUBPASS_CONTENTS_INLINE);
 
 
-		radv_cmd_buffer_upload_data(cmd_buffer, vb_size, 16, vb_data, &offset);
-
-		struct radv_buffer vertex_buffer = {
-			.device = device,
-			.size = vb_size,
-			.bo = cmd_buffer->upload.upload_bo,
-			.offset = offset,
-		};
-
-		radv_CmdBindVertexBuffers(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1,
-					  (VkBuffer[]) {
-						  radv_buffer_to_handle(&vertex_buffer),
-							  },
-					  (VkDeviceSize[]) {
-						  0,
-							  });
+				bind_pipeline(cmd_buffer, src_type, fs_key);
+			} else if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
+				radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
+							&(VkRenderPassBeginInfo) {
+								.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+									.renderPass = device->meta_state.blit2d.depth_only_rp,
+									.framebuffer = dst_temps.fb,
+									.renderArea = {
+									.offset = { rects[r].dst_x, rects[r].dst_y, },
+									.extent = { rects[r].width, rects[r].height },
+								},
+									.clearValueCount = 0,
+										.pClearValues = NULL,
+										}, VK_SUBPASS_CONTENTS_INLINE);
 
 
-		if (dst->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) {
-			unsigned fs_key = radv_format_meta_fs_key(dst_temps.iview.vk_format);
+				bind_depth_pipeline(cmd_buffer, src_type);
 
-			radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
-						      &(VkRenderPassBeginInfo) {
-							      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-								      .renderPass = device->meta_state.blit2d.render_passes[fs_key],
-								      .framebuffer = dst_temps.fb,
-								      .renderArea = {
-								      .offset = { rects[r].dst_x, rects[r].dst_y, },
-								      .extent = { rects[r].width, rects[r].height },
-							      },
-								      .clearValueCount = 0,
-									       .pClearValues = NULL,
-									       }, VK_SUBPASS_CONTENTS_INLINE);
+			} else if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
+				radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
+							&(VkRenderPassBeginInfo) {
+								.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+									.renderPass = device->meta_state.blit2d.stencil_only_rp,
+									.framebuffer = dst_temps.fb,
+									.renderArea = {
+									.offset = { rects[r].dst_x, rects[r].dst_y, },
+									.extent = { rects[r].width, rects[r].height },
+								},
+									.clearValueCount = 0,
+										.pClearValues = NULL,
+										}, VK_SUBPASS_CONTENTS_INLINE);
 
 
-			bind_pipeline(cmd_buffer, src_type, fs_key);
-		} else if (dst->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
-			radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
-						      &(VkRenderPassBeginInfo) {
-							      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-								      .renderPass = device->meta_state.blit2d.depth_only_rp,
-								      .framebuffer = dst_temps.fb,
-								      .renderArea = {
-								      .offset = { rects[r].dst_x, rects[r].dst_y, },
-								      .extent = { rects[r].width, rects[r].height },
-							      },
-								      .clearValueCount = 0,
-									       .pClearValues = NULL,
-									       }, VK_SUBPASS_CONTENTS_INLINE);
+				bind_stencil_pipeline(cmd_buffer, src_type);
+			} else
+				unreachable("Processing blit2d with multiple aspects.");
+
+			radv_CmdSetViewport(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkViewport) {
+				.x = rects[r].dst_x,
+				.y = rects[r].dst_y,
+				.width = rects[r].width,
+				.height = rects[r].height,
+				.minDepth = 0.0f,
+				.maxDepth = 1.0f
+			});
+
+			radv_CmdSetScissor(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkRect2D) {
+				.offset = (VkOffset2D) { rects[r].dst_x, rects[r].dst_y },
+				.extent = (VkExtent2D) { rects[r].width, rects[r].height },
+			});
 
 
-			bind_depth_pipeline(cmd_buffer, src_type);
 
-		} else if (dst->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
-			radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
-						      &(VkRenderPassBeginInfo) {
-							      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-								      .renderPass = device->meta_state.blit2d.stencil_only_rp,
-								      .framebuffer = dst_temps.fb,
-								      .renderArea = {
-								      .offset = { rects[r].dst_x, rects[r].dst_y, },
-								      .extent = { rects[r].width, rects[r].height },
-							      },
-								      .clearValueCount = 0,
-									       .pClearValues = NULL,
-									       }, VK_SUBPASS_CONTENTS_INLINE);
+			radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);
+			radv_CmdEndRenderPass(radv_cmd_buffer_to_handle(cmd_buffer));
 
-
-			bind_stencil_pipeline(cmd_buffer, src_type);
+			/* At the point where we emit the draw call, all data from the
+			* descriptor sets, etc. has been used.  We are free to delete it.
+			*/
+			blit2d_unbind_dst(cmd_buffer, &dst_temps);
 		}
-
-		radv_CmdSetViewport(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkViewport) {
-			.x = rects[r].dst_x,
-			.y = rects[r].dst_y,
-			.width = rects[r].width,
-			.height = rects[r].height,
-			.minDepth = 0.0f,
-			.maxDepth = 1.0f
-		});
-
-		radv_CmdSetScissor(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkRect2D) {
-			.offset = (VkOffset2D) { rects[r].dst_x, rects[r].dst_y },
-			.extent = (VkExtent2D) { rects[r].width, rects[r].height },
-		});
-
-
-
-		radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);
-		radv_CmdEndRenderPass(radv_cmd_buffer_to_handle(cmd_buffer));
-
-		/* At the point where we emit the draw call, all data from the
-		 * descriptor sets, etc. has been used.  We are free to delete it.
-		 */
-		blit2d_unbind_dst(cmd_buffer, &dst_temps);
 	}
 }
 
@@ -433,25 +383,53 @@
 	nir_builder b;
 
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_blit_vs");
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_blit2d_vs");
 
-	nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in,
-						   vec4, "a_pos");
-	pos_in->data.location = VERT_ATTRIB_GENERIC0;
 	nir_variable *pos_out = nir_variable_create(b.shader, nir_var_shader_out,
 						    vec4, "gl_Position");
 	pos_out->data.location = VARYING_SLOT_POS;
-	nir_copy_var(&b, pos_out, pos_in);
 
-	nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
-						       vec2, "a_tex_pos");
-	tex_pos_in->data.location = VERT_ATTRIB_GENERIC1;
 	nir_variable *tex_pos_out = nir_variable_create(b.shader, nir_var_shader_out,
 							vec2, "v_tex_pos");
 	tex_pos_out->data.location = VARYING_SLOT_VAR0;
 	tex_pos_out->data.interpolation = INTERP_MODE_SMOOTH;
-	nir_copy_var(&b, tex_pos_out, tex_pos_in);
 
+	nir_ssa_def *outvec = radv_meta_gen_rect_vertices(&b);
+	nir_store_var(&b, pos_out, outvec, 0xf);
+
+	nir_intrinsic_instr *src_box = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	src_box->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+	nir_intrinsic_set_base(src_box, 0);
+	nir_intrinsic_set_range(src_box, 16);
+	src_box->num_components = 4;
+	nir_ssa_dest_init(&src_box->instr, &src_box->dest, 4, 32, "src_box");
+	nir_builder_instr_insert(&b, &src_box->instr);
+
+	nir_intrinsic_instr *vertex_id = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_vertex_id_zero_base);
+	nir_ssa_dest_init(&vertex_id->instr, &vertex_id->dest, 1, 32, "vertexid");
+	nir_builder_instr_insert(&b, &vertex_id->instr);
+
+	/* vertex 0 - src_x, src_y */
+	/* vertex 1 - src_x, src_y+h */
+	/* vertex 2 - src_x+w, src_y */
+	/* so channel 0 is vertex_id != 2 ? src_x : src_x + w
+	   channel 1 is vertex_id != 1 ? src_y : src_y + h */
+
+	nir_ssa_def *c0cmp = nir_ine(&b, &vertex_id->dest.ssa,
+				     nir_imm_int(&b, 2));
+	nir_ssa_def *c1cmp = nir_ine(&b, &vertex_id->dest.ssa,
+				     nir_imm_int(&b, 1));
+
+	nir_ssa_def *comp[2];
+	comp[0] = nir_bcsel(&b, c0cmp,
+			    nir_channel(&b, &src_box->dest.ssa, 0),
+			    nir_channel(&b, &src_box->dest.ssa, 2));
+
+	comp[1] = nir_bcsel(&b, c1cmp,
+			    nir_channel(&b, &src_box->dest.ssa, 1),
+			    nir_channel(&b, &src_box->dest.ssa, 3));
+	nir_ssa_def *out_tex_vec = nir_vec(&b, comp, 2);
+	nir_store_var(&b, tex_pos_out, out_tex_vec, 0x3);
 	return b.shader;
 }
 
@@ -502,6 +480,8 @@
 	sampler->data.binding = 0;
 
 	nir_intrinsic_instr *width = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(width, 16);
+	nir_intrinsic_set_range(width, 4);
 	width->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
 	width->num_components = 1;
 	nir_ssa_dest_init(&width->instr, &width->dest, 1, 32, "width");
@@ -532,31 +512,8 @@
 
 static const VkPipelineVertexInputStateCreateInfo normal_vi_create_info = {
 	.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-	.vertexBindingDescriptionCount = 1,
-	.pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
-		{
-			.binding = 0,
-			.stride = 4 * sizeof(float),
-			.inputRate = VK_VERTEX_INPUT_RATE_VERTEX
-		},
-	},
-	.vertexAttributeDescriptionCount = 2,
-	.pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
-		{
-			/* Position */
-			.location = 0,
-			.binding = 0,
-			.format = VK_FORMAT_R32G32_SFLOAT,
-			.offset = 0
-		},
-		{
-			/* Texture Coordinate */
-			.location = 1,
-			.binding = 0,
-			.format = VK_FORMAT_R32G32_SFLOAT,
-			.offset = 8
-		},
-	},
+	.vertexBindingDescriptionCount = 0,
+	.vertexAttributeDescriptionCount = 0,
 };
 
 static nir_shader *
@@ -568,7 +525,7 @@
 	nir_builder b;
 
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, name);
+	b.shader->info.name = ralloc_strdup(b.shader, name);
 
 	nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
 						       vec2, "v_tex_pos");
@@ -597,7 +554,7 @@
 	nir_builder b;
 
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, name);
+	b.shader->info.name = ralloc_strdup(b.shader, name);
 
 	nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
 						       vec2, "v_tex_pos");
@@ -626,7 +583,7 @@
 	nir_builder b;
 
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, name);
+	b.shader->info.name = ralloc_strdup(b.shader, name);
 
 	nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
 						       vec2, "v_tex_pos");
@@ -754,8 +711,8 @@
 						       .format = format,
 						       .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
 						       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
-						       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
-						       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
+						       .initialLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+						       .finalLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 						       },
 					       .subpassCount = 1,
 					       .pSubpasses = &(VkSubpassDescription) {
@@ -764,12 +721,12 @@
 						       .colorAttachmentCount = 1,
 						       .pColorAttachments = &(VkAttachmentReference) {
 							       .attachment = 0,
-							       .layout = VK_IMAGE_LAYOUT_GENERAL,
+							       .layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 							},
 					       .pResolveAttachments = NULL,
 					       .pDepthStencilAttachment = &(VkAttachmentReference) {
 						       .attachment = VK_ATTACHMENT_UNUSED,
-						       .layout = VK_IMAGE_LAYOUT_GENERAL,
+						       .layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 					       },
 					       .preserveAttachmentCount = 1,
 					       .pPreserveAttachments = (uint32_t[]) { 0 },
@@ -909,11 +866,11 @@
 						       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
 							       .attachmentCount = 1,
 							       .pAttachments = &(VkAttachmentDescription) {
-							       .format = 0,
+							       .format = VK_FORMAT_D32_SFLOAT,
 							       .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
 							       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
-							       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
-							       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
+							       .initialLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+							       .finalLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 						       },
 						       .subpassCount = 1,
 						       .pSubpasses = &(VkSubpassDescription) {
@@ -924,7 +881,7 @@
 						       .pResolveAttachments = NULL,
 						       .pDepthStencilAttachment = &(VkAttachmentReference) {
 							       .attachment = 0,
-							       .layout = VK_IMAGE_LAYOUT_GENERAL,
+							       .layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 						       },
 						       .preserveAttachmentCount = 1,
 						       .pPreserveAttachments = (uint32_t[]) { 0 },
@@ -1064,11 +1021,11 @@
 						       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
 							       .attachmentCount = 1,
 							       .pAttachments = &(VkAttachmentDescription) {
-							       .format = 0,
+							       .format = VK_FORMAT_S8_UINT,
 							       .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
 							       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
-							       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
-							       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
+							       .initialLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+							       .finalLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 						       },
 						       .subpassCount = 1,
 						       .pSubpasses = &(VkSubpassDescription) {
@@ -1079,7 +1036,7 @@
 						       .pResolveAttachments = NULL,
 						       .pDepthStencilAttachment = &(VkAttachmentReference) {
 							       .attachment = 0,
-							       .layout = VK_IMAGE_LAYOUT_GENERAL,
+							       .layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 						       },
 						       .preserveAttachmentCount = 1,
 						       .pPreserveAttachments = (uint32_t[]) { 0 },
@@ -1185,6 +1142,8 @@
    VK_FORMAT_R8G8B8A8_UNORM,
    VK_FORMAT_R8G8B8A8_UINT,
    VK_FORMAT_R8G8B8A8_SINT,
+   VK_FORMAT_A2R10G10B10_UINT_PACK32,
+   VK_FORMAT_A2R10G10B10_SINT_PACK32,
    VK_FORMAT_R16G16B16A16_UNORM,
    VK_FORMAT_R16G16B16A16_SNORM,
    VK_FORMAT_R16G16B16A16_UINT,
@@ -1201,6 +1160,10 @@
 
 	zero(device->meta_state.blit2d);
 
+	const VkPushConstantRange push_constant_ranges[] = {
+		{VK_SHADER_STAGE_VERTEX_BIT, 0, 16},
+		{VK_SHADER_STAGE_FRAGMENT_BIT, 16, 4},
+	};
 	result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
 						&(VkDescriptorSetLayoutCreateInfo) {
 							.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
@@ -1224,6 +1187,8 @@
 						   .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
 							   .setLayoutCount = 1,
 							   .pSetLayouts = &device->meta_state.blit2d.ds_layouts[BLIT2D_SRC_TYPE_IMAGE],
+							   .pushConstantRangeCount = 1,
+							   .pPushConstantRanges = push_constant_ranges,
 							   },
 					   &device->meta_state.alloc, &device->meta_state.blit2d.p_layouts[BLIT2D_SRC_TYPE_IMAGE]);
 	if (result != VK_SUCCESS)
@@ -1247,14 +1212,14 @@
 	if (result != VK_SUCCESS)
 		goto fail;
 
-	const VkPushConstantRange push_constant_range = {VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4};
+
 	result = radv_CreatePipelineLayout(radv_device_to_handle(device),
 					   &(VkPipelineLayoutCreateInfo) {
 						   .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
 						   .setLayoutCount = 1,
 						   .pSetLayouts = &device->meta_state.blit2d.ds_layouts[BLIT2D_SRC_TYPE_BUFFER],
-						   .pushConstantRangeCount = 1,
-						   .pPushConstantRanges = &push_constant_range,
+						   .pushConstantRangeCount = 2,
+						   .pPushConstantRanges = push_constant_ranges,
 					   },
 					   &device->meta_state.alloc, &device->meta_state.blit2d.p_layouts[BLIT2D_SRC_TYPE_BUFFER]);
 	if (result != VK_SUCCESS)
diff --git a/src/amd/vulkan/radv_meta_buffer.c b/src/amd/vulkan/radv_meta_buffer.c
index 0bb926f..a8a41e0 100644
--- a/src/amd/vulkan/radv_meta_buffer.c
+++ b/src/amd/vulkan/radv_meta_buffer.c
@@ -10,17 +10,17 @@
 	nir_builder b;
 
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_buffer_fill");
-	b.shader->info->cs.local_size[0] = 64;
-	b.shader->info->cs.local_size[1] = 1;
-	b.shader->info->cs.local_size[2] = 1;
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_buffer_fill");
+	b.shader->info.cs.local_size[0] = 64;
+	b.shader->info.cs.local_size[1] = 1;
+	b.shader->info.cs.local_size[2] = 1;
 
 	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
 	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
-						b.shader->info->cs.local_size[0],
-						b.shader->info->cs.local_size[1],
-						b.shader->info->cs.local_size[2], 0);
+						b.shader->info.cs.local_size[0],
+						b.shader->info.cs.local_size[1],
+						b.shader->info.cs.local_size[2], 0);
 
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 
@@ -36,6 +36,8 @@
 	nir_builder_instr_insert(&b, &dst_buf->instr);
 
 	nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(load, 0);
+	nir_intrinsic_set_range(load, 4);
 	load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
 	load->num_components = 1;
 	nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, "fill_value");
@@ -60,17 +62,17 @@
 	nir_builder b;
 
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_buffer_copy");
-	b.shader->info->cs.local_size[0] = 64;
-	b.shader->info->cs.local_size[1] = 1;
-	b.shader->info->cs.local_size[2] = 1;
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_buffer_copy");
+	b.shader->info.cs.local_size[0] = 64;
+	b.shader->info.cs.local_size[1] = 1;
+	b.shader->info.cs.local_size[2] = 1;
 
 	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
 	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
-						b.shader->info->cs.local_size[0],
-						b.shader->info->cs.local_size[1],
-						b.shader->info->cs.local_size[2], 0);
+						b.shader->info.cs.local_size[0],
+						b.shader->info.cs.local_size[1],
+						b.shader->info.cs.local_size[2], 0);
 
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 
diff --git a/src/amd/vulkan/radv_meta_bufimage.c b/src/amd/vulkan/radv_meta_bufimage.c
index 09a29d2..91af80c 100644
--- a/src/amd/vulkan/radv_meta_bufimage.c
+++ b/src/amd/vulkan/radv_meta_bufimage.c
@@ -42,10 +42,10 @@
 							     false,
 							     GLSL_TYPE_FLOAT);
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_itob_cs");
-	b.shader->info->cs.local_size[0] = 16;
-	b.shader->info->cs.local_size[1] = 16;
-	b.shader->info->cs.local_size[2] = 1;
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_itob_cs");
+	b.shader->info.cs.local_size[0] = 16;
+	b.shader->info.cs.local_size[1] = 16;
+	b.shader->info.cs.local_size[2] = 1;
 	nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform,
 						      sampler_type, "s_tex");
 	input_img->data.descriptor_set = 0;
@@ -59,21 +59,25 @@
 	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
 	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
-						b.shader->info->cs.local_size[0],
-						b.shader->info->cs.local_size[1],
-						b.shader->info->cs.local_size[2], 0);
+						b.shader->info.cs.local_size[0],
+						b.shader->info.cs.local_size[1],
+						b.shader->info.cs.local_size[2], 0);
 
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 
 
 
 	nir_intrinsic_instr *offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(offset, 0);
+	nir_intrinsic_set_range(offset, 12);
 	offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
 	offset->num_components = 2;
 	nir_ssa_dest_init(&offset->instr, &offset->dest, 2, 32, "offset");
 	nir_builder_instr_insert(&b, &offset->instr);
 
 	nir_intrinsic_instr *stride = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(stride, 0);
+	nir_intrinsic_set_range(stride, 12);
 	stride->src[0] = nir_src_for_ssa(nir_imm_int(&b, 8));
 	stride->num_components = 1;
 	nir_ssa_dest_init(&stride->instr, &stride->dest, 1, 32, "stride");
@@ -240,10 +244,10 @@
 							     false,
 							     GLSL_TYPE_FLOAT);
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_btoi_cs");
-	b.shader->info->cs.local_size[0] = 16;
-	b.shader->info->cs.local_size[1] = 16;
-	b.shader->info->cs.local_size[2] = 1;
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_btoi_cs");
+	b.shader->info.cs.local_size[0] = 16;
+	b.shader->info.cs.local_size[1] = 16;
+	b.shader->info.cs.local_size[2] = 1;
 	nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform,
 						      buf_type, "s_tex");
 	input_img->data.descriptor_set = 0;
@@ -257,19 +261,23 @@
 	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
 	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
-						b.shader->info->cs.local_size[0],
-						b.shader->info->cs.local_size[1],
-						b.shader->info->cs.local_size[2], 0);
+						b.shader->info.cs.local_size[0],
+						b.shader->info.cs.local_size[1],
+						b.shader->info.cs.local_size[2], 0);
 
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 
 	nir_intrinsic_instr *offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(offset, 0);
+	nir_intrinsic_set_range(offset, 12);
 	offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
 	offset->num_components = 2;
 	nir_ssa_dest_init(&offset->instr, &offset->dest, 2, 32, "offset");
 	nir_builder_instr_insert(&b, &offset->instr);
 
 	nir_intrinsic_instr *stride = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(stride, 0);
+	nir_intrinsic_set_range(stride, 12);
 	stride->src[0] = nir_src_for_ssa(nir_imm_int(&b, 8));
 	stride->num_components = 1;
 	nir_ssa_dest_init(&stride->instr, &stride->dest, 1, 32, "stride");
@@ -436,10 +444,10 @@
 							     false,
 							     GLSL_TYPE_FLOAT);
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_itoi_cs");
-	b.shader->info->cs.local_size[0] = 16;
-	b.shader->info->cs.local_size[1] = 16;
-	b.shader->info->cs.local_size[2] = 1;
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_itoi_cs");
+	b.shader->info.cs.local_size[0] = 16;
+	b.shader->info.cs.local_size[1] = 16;
+	b.shader->info.cs.local_size[2] = 1;
 	nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform,
 						      buf_type, "s_tex");
 	input_img->data.descriptor_set = 0;
@@ -453,19 +461,23 @@
 	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
 	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
-						b.shader->info->cs.local_size[0],
-						b.shader->info->cs.local_size[1],
-						b.shader->info->cs.local_size[2], 0);
+						b.shader->info.cs.local_size[0],
+						b.shader->info.cs.local_size[1],
+						b.shader->info.cs.local_size[2], 0);
 
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 
 	nir_intrinsic_instr *src_offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(src_offset, 0);
+	nir_intrinsic_set_range(src_offset, 16);
 	src_offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
 	src_offset->num_components = 2;
 	nir_ssa_dest_init(&src_offset->instr, &src_offset->dest, 2, 32, "src_offset");
 	nir_builder_instr_insert(&b, &src_offset->instr);
 
 	nir_intrinsic_instr *dst_offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(dst_offset, 0);
+	nir_intrinsic_set_range(dst_offset, 16);
 	dst_offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 8));
 	dst_offset->num_components = 2;
 	nir_ssa_dest_init(&dst_offset->instr, &dst_offset->dest, 2, 32, "dst_offset");
@@ -622,10 +634,10 @@
 							     false,
 							     GLSL_TYPE_FLOAT);
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_cleari_cs");
-	b.shader->info->cs.local_size[0] = 16;
-	b.shader->info->cs.local_size[1] = 16;
-	b.shader->info->cs.local_size[2] = 1;
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_cleari_cs");
+	b.shader->info.cs.local_size[0] = 16;
+	b.shader->info.cs.local_size[1] = 16;
+	b.shader->info.cs.local_size[2] = 1;
 
 	nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform,
 						       img_type, "out_img");
@@ -635,13 +647,15 @@
 	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
 	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
-						b.shader->info->cs.local_size[0],
-						b.shader->info->cs.local_size[1],
-						b.shader->info->cs.local_size[2], 0);
+						b.shader->info.cs.local_size[0],
+						b.shader->info.cs.local_size[1],
+						b.shader->info.cs.local_size[2], 0);
 
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 
 	nir_intrinsic_instr *clear_val = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(clear_val, 0);
+	nir_intrinsic_set_range(clear_val, 16);
 	clear_val->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
 	clear_val->num_components = 4;
 	nir_ssa_dest_init(&clear_val->instr, &clear_val->dest, 4, 32, "clear_value");
@@ -845,7 +859,6 @@
 static void
 create_iview(struct radv_cmd_buffer *cmd_buffer,
              struct radv_meta_blit2d_surf *surf,
-             VkImageUsageFlags usage,
              struct radv_image_view *iview)
 {
 
@@ -862,7 +875,7 @@
 					     .baseArrayLayer = surf->layer,
 					     .layerCount = 1
 				     },
-					     }, cmd_buffer, usage);
+			     });
 }
 
 static void
@@ -948,7 +961,7 @@
 	struct radv_device *device = cmd_buffer->device;
 	struct itob_temps temps;
 
-	create_iview(cmd_buffer, src, VK_IMAGE_USAGE_SAMPLED_BIT, &temps.src_iview);
+	create_iview(cmd_buffer, src, &temps.src_iview);
 	create_bview(cmd_buffer, dst->buffer, dst->offset, dst->format, &temps.dst_bview);
 	itob_bind_descriptors(cmd_buffer, &temps);
 
@@ -1034,7 +1047,7 @@
 	struct btoi_temps temps;
 
 	create_bview(cmd_buffer, src->buffer, src->offset, src->format, &temps.src_bview);
-	create_iview(cmd_buffer, dst, VK_IMAGE_USAGE_STORAGE_BIT, &temps.dst_iview);
+	create_iview(cmd_buffer, dst, &temps.dst_iview);
 	btoi_bind_descriptors(cmd_buffer, &temps);
 
 	btoi_bind_pipeline(cmd_buffer);
@@ -1124,8 +1137,8 @@
 	struct radv_device *device = cmd_buffer->device;
 	struct itoi_temps temps;
 
-	create_iview(cmd_buffer, src, VK_IMAGE_USAGE_SAMPLED_BIT, &temps.src_iview);
-	create_iview(cmd_buffer, dst, VK_IMAGE_USAGE_STORAGE_BIT, &temps.dst_iview);
+	create_iview(cmd_buffer, src, &temps.src_iview);
+	create_iview(cmd_buffer, dst, &temps.dst_iview);
 
 	itoi_bind_descriptors(cmd_buffer, &temps);
 
@@ -1196,7 +1209,7 @@
 	struct radv_device *device = cmd_buffer->device;
 	struct radv_image_view dst_iview;
 
-	create_iview(cmd_buffer, dst, VK_IMAGE_USAGE_STORAGE_BIT, &dst_iview);
+	create_iview(cmd_buffer, dst, &dst_iview);
 	cleari_bind_descriptors(cmd_buffer, &dst_iview);
 
 	cleari_bind_pipeline(cmd_buffer);
@@ -1213,5 +1226,5 @@
 			      VK_SHADER_STAGE_COMPUTE_BIT, 0, 16,
 			      push_constants);
 
-	radv_unaligned_dispatch(cmd_buffer, dst->image->extent.width, dst->image->extent.height, 1);
+	radv_unaligned_dispatch(cmd_buffer, dst->image->info.width, dst->image->info.height, 1);
 }
diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c
index a5502ce..451078b 100644
--- a/src/amd/vulkan/radv_meta_clear.c
+++ b/src/amd/vulkan/radv_meta_clear.c
@@ -27,17 +27,6 @@
 
 #include "util/format_rgb9e5.h"
 #include "vk_format.h"
-/** Vertex attributes for color clears.  */
-struct color_clear_vattrs {
-	float position[2];
-	VkClearColorValue color;
-};
-
-/** Vertex attributes for depthstencil clears.  */
-struct depthstencil_clear_vattrs {
-	float position[2];
-	float depth_clear;
-};
 
 enum {
 	DEPTH_CLEAR_SLOW,
@@ -56,47 +45,34 @@
 	nir_builder_init_simple_shader(&vs_b, NULL, MESA_SHADER_VERTEX, NULL);
 	nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL);
 
-	vs_b.shader->info->name = ralloc_strdup(vs_b.shader, "meta_clear_color_vs");
-	fs_b.shader->info->name = ralloc_strdup(fs_b.shader, "meta_clear_color_fs");
+	vs_b.shader->info.name = ralloc_strdup(vs_b.shader, "meta_clear_color_vs");
+	fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "meta_clear_color_fs");
 
 	const struct glsl_type *position_type = glsl_vec4_type();
 	const struct glsl_type *color_type = glsl_vec4_type();
 
-	nir_variable *vs_in_pos =
-		nir_variable_create(vs_b.shader, nir_var_shader_in, position_type,
-				    "a_position");
-	vs_in_pos->data.location = VERT_ATTRIB_GENERIC0;
-
 	nir_variable *vs_out_pos =
 		nir_variable_create(vs_b.shader, nir_var_shader_out, position_type,
 				    "gl_Position");
 	vs_out_pos->data.location = VARYING_SLOT_POS;
 
-	nir_variable *vs_in_color =
-		nir_variable_create(vs_b.shader, nir_var_shader_in, color_type,
-				    "a_color");
-	vs_in_color->data.location = VERT_ATTRIB_GENERIC1;
-
-	nir_variable *vs_out_color =
-		nir_variable_create(vs_b.shader, nir_var_shader_out, color_type,
-				    "v_color");
-	vs_out_color->data.location = VARYING_SLOT_VAR0;
-	vs_out_color->data.interpolation = INTERP_MODE_FLAT;
-
-	nir_variable *fs_in_color =
-		nir_variable_create(fs_b.shader, nir_var_shader_in, color_type,
-				    "v_color");
-	fs_in_color->data.location = vs_out_color->data.location;
-	fs_in_color->data.interpolation = vs_out_color->data.interpolation;
+	nir_intrinsic_instr *in_color_load = nir_intrinsic_instr_create(fs_b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(in_color_load, 0);
+	nir_intrinsic_set_range(in_color_load, 16);
+	in_color_load->src[0] = nir_src_for_ssa(nir_imm_int(&fs_b, 0));
+	in_color_load->num_components = 4;
+	nir_ssa_dest_init(&in_color_load->instr, &in_color_load->dest, 4, 32, "clear color");
+	nir_builder_instr_insert(&fs_b, &in_color_load->instr);
 
 	nir_variable *fs_out_color =
 		nir_variable_create(fs_b.shader, nir_var_shader_out, color_type,
 				    "f_color");
 	fs_out_color->data.location = FRAG_RESULT_DATA0 + frag_output;
 
-	nir_copy_var(&vs_b, vs_out_pos, vs_in_pos);
-	nir_copy_var(&vs_b, vs_out_color, vs_in_color);
-	nir_copy_var(&fs_b, fs_out_color, fs_in_color);
+	nir_store_var(&fs_b, fs_out_color, &in_color_load->dest.ssa, 0xf);
+
+	nir_ssa_def *outvec = radv_meta_gen_rect_vertices(&vs_b);
+	nir_store_var(&vs_b, vs_out_pos, outvec, 0xf);
 
 	const struct glsl_type *layer_type = glsl_int_type();
 	nir_variable *vs_out_layer =
@@ -105,8 +81,10 @@
 	vs_out_layer->data.location = VARYING_SLOT_LAYER;
 	vs_out_layer->data.interpolation = INTERP_MODE_FLAT;
 	nir_ssa_def *inst_id = nir_load_system_value(&vs_b, nir_intrinsic_load_instance_id, 0);
+	nir_ssa_def *base_instance = nir_load_system_value(&vs_b, nir_intrinsic_load_base_instance, 0);
 
-	nir_store_var(&vs_b, vs_out_layer, inst_id, 0x1);
+	nir_ssa_def *layer_id = nir_iadd(&vs_b, inst_id, base_instance);
+	nir_store_var(&vs_b, vs_out_layer, layer_id, 0x1);
 
 	*out_vs = vs_b.shader;
 	*out_fs = fs_b.shader;
@@ -121,6 +99,7 @@
                 const VkPipelineVertexInputStateCreateInfo *vi_state,
                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
                 const VkPipelineColorBlendStateCreateInfo *cb_state,
+		const VkPipelineLayout layout,
 		const struct radv_graphics_pipeline_create_info *extra,
                 const VkAllocationCallbacks *alloc,
                 struct radv_pipeline **pipeline)
@@ -200,10 +179,11 @@
 								       VK_DYNAMIC_STATE_STENCIL_REFERENCE,
 							       },
 						       },
-													    .flags = 0,
-														     .renderPass = radv_render_pass_to_handle(render_pass),
-														     .subpass = 0,
-														     },
+						    .layout = layout,
+						    .flags = 0,
+						    .renderPass = radv_render_pass_to_handle(render_pass),
+						    .subpass = 0,
+						},
 					       extra,
 					       alloc,
 					       &pipeline_h);
@@ -269,31 +249,8 @@
 
 	const VkPipelineVertexInputStateCreateInfo vi_state = {
 		.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-		.vertexBindingDescriptionCount = 1,
-		.pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
-			{
-				.binding = 0,
-				.stride = sizeof(struct color_clear_vattrs),
-				.inputRate = VK_VERTEX_INPUT_RATE_VERTEX
-			},
-		},
-		.vertexAttributeDescriptionCount = 2,
-		.pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
-			{
-				/* Position */
-				.location = 0,
-				.binding = 0,
-				.format = VK_FORMAT_R32G32_SFLOAT,
-				.offset = offsetof(struct color_clear_vattrs, position),
-			},
-			{
-				/* Color */
-				.location = 1,
-				.binding = 0,
-				.format = VK_FORMAT_R32G32B32A32_SFLOAT,
-				.offset = offsetof(struct color_clear_vattrs, color),
-			},
-		},
+		.vertexBindingDescriptionCount = 0,
+		.vertexAttributeDescriptionCount = 0,
 	};
 
 	const VkPipelineDepthStencilStateCreateInfo ds_state = {
@@ -326,6 +283,7 @@
 	};
 	result = create_pipeline(device, radv_render_pass_from_handle(pass),
 				 samples, vs_nir, fs_nir, &vi_state, &ds_state, &cb_state,
+				 device->meta_state.clear_color_p_layout,
 				 &extra, &device->meta_state.alloc, pipeline);
 
 	return result;
@@ -368,7 +326,12 @@
 		}
 		destroy_render_pass(device, state->clear[i].depthstencil_rp);
 	}
-
+	radv_DestroyPipelineLayout(radv_device_to_handle(device),
+				   state->clear_color_p_layout,
+				   &state->alloc);
+	radv_DestroyPipelineLayout(radv_device_to_handle(device),
+				   state->clear_depth_p_layout,
+				   &state->alloc);
 }
 
 static void
@@ -382,14 +345,13 @@
 	const uint32_t subpass_att = clear_att->colorAttachment;
 	const uint32_t pass_att = subpass->color_attachments[subpass_att].attachment;
 	const struct radv_image_view *iview = fb->attachments[pass_att].attachment;
-	const uint32_t samples = iview->image->samples;
+	const uint32_t samples = iview->image->info.samples;
 	const uint32_t samples_log2 = ffs(samples) - 1;
 	unsigned fs_key = radv_format_meta_fs_key(iview->vk_format);
 	struct radv_pipeline *pipeline;
 	VkClearColorValue clear_value = clear_att->clearValue.color;
 	VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
 	VkPipeline pipeline_h;
-	uint32_t offset;
 
 	if (fs_key == -1) {
 		radv_finishme("color clears incomplete");
@@ -407,29 +369,10 @@
 	assert(clear_att->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
 	assert(clear_att->colorAttachment < subpass->color_count);
 
-	const struct color_clear_vattrs vertex_data[3] = {
-		{
-			.position = {
-				-1.0,
-				-1.0,
-			},
-			.color = clear_value,
-		},
-		{
-			.position = {
-				-1.0,
-				1.0,
-			},
-			.color = clear_value,
-		},
-		{
-			.position = {
-				1.0,
-				-1.0,
-			},
-			.color = clear_value,
-		},
-	};
+	radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+			      device->meta_state.clear_color_p_layout,
+			      VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16,
+			      &clear_value);
 
 	struct radv_subpass clear_subpass = {
 		.color_count = 1,
@@ -441,19 +384,6 @@
 
 	radv_cmd_buffer_set_subpass(cmd_buffer, &clear_subpass, false);
 
-	radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset);
-	struct radv_buffer vertex_buffer = {
-		.device = device,
-		.size = sizeof(vertex_data),
-		.bo = cmd_buffer->upload.upload_bo,
-		.offset = offset,
-	};
-
-
-	radv_CmdBindVertexBuffers(cmd_buffer_h, 0, 1,
-					(VkBuffer[]) { radv_buffer_to_handle(&vertex_buffer) },
-					(VkDeviceSize[]) { 0 });
-
 	if (cmd_buffer->state.pipeline != pipeline) {
 		radv_CmdBindPipeline(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS,
 					   pipeline_h);
@@ -470,7 +400,7 @@
 
 	radv_CmdSetScissor(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &clear_rect->rect);
 
-	radv_CmdDraw(cmd_buffer_h, 3, clear_rect->layerCount, 0, 0);
+	radv_CmdDraw(cmd_buffer_h, 3, clear_rect->layerCount, 0, clear_rect->baseArrayLayer);
 
 	radv_cmd_buffer_set_subpass(cmd_buffer, subpass, false);
 }
@@ -484,21 +414,25 @@
 	nir_builder_init_simple_shader(&vs_b, NULL, MESA_SHADER_VERTEX, NULL);
 	nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL);
 
-	vs_b.shader->info->name = ralloc_strdup(vs_b.shader, "meta_clear_depthstencil_vs");
-	fs_b.shader->info->name = ralloc_strdup(fs_b.shader, "meta_clear_depthstencil_fs");
-	const struct glsl_type *position_type = glsl_vec4_type();
-
-	nir_variable *vs_in_pos =
-		nir_variable_create(vs_b.shader, nir_var_shader_in, position_type,
-				    "a_position");
-	vs_in_pos->data.location = VERT_ATTRIB_GENERIC0;
+	vs_b.shader->info.name = ralloc_strdup(vs_b.shader, "meta_clear_depthstencil_vs");
+	fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "meta_clear_depthstencil_fs");
+	const struct glsl_type *position_out_type = glsl_vec4_type();
 
 	nir_variable *vs_out_pos =
-		nir_variable_create(vs_b.shader, nir_var_shader_out, position_type,
+		nir_variable_create(vs_b.shader, nir_var_shader_out, position_out_type,
 				    "gl_Position");
 	vs_out_pos->data.location = VARYING_SLOT_POS;
 
-	nir_copy_var(&vs_b, vs_out_pos, vs_in_pos);
+	nir_intrinsic_instr *in_color_load = nir_intrinsic_instr_create(vs_b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(in_color_load, 0);
+	nir_intrinsic_set_range(in_color_load, 4);
+	in_color_load->src[0] = nir_src_for_ssa(nir_imm_int(&vs_b, 0));
+	in_color_load->num_components = 1;
+	nir_ssa_dest_init(&in_color_load->instr, &in_color_load->dest, 1, 32, "depth value");
+	nir_builder_instr_insert(&vs_b, &in_color_load->instr);
+
+	nir_ssa_def *outvec = radv_meta_gen_rect_vertices_comp2(&vs_b, &in_color_load->dest.ssa);
+	nir_store_var(&vs_b, vs_out_pos, outvec, 0xf);
 
 	const struct glsl_type *layer_type = glsl_int_type();
 	nir_variable *vs_out_layer =
@@ -507,7 +441,10 @@
 	vs_out_layer->data.location = VARYING_SLOT_LAYER;
 	vs_out_layer->data.interpolation = INTERP_MODE_FLAT;
 	nir_ssa_def *inst_id = nir_load_system_value(&vs_b, nir_intrinsic_load_instance_id, 0);
-	nir_store_var(&vs_b, vs_out_layer, inst_id, 0x1);
+	nir_ssa_def *base_instance = nir_load_system_value(&vs_b, nir_intrinsic_load_base_instance, 0);
+
+	nir_ssa_def *layer_id = nir_iadd(&vs_b, inst_id, base_instance);
+	nir_store_var(&vs_b, vs_out_layer, layer_id, 0x1);
 
 	*out_vs = vs_b.shader;
 	*out_fs = fs_b.shader;
@@ -523,7 +460,7 @@
 					       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
 						       .attachmentCount = 1,
 						       .pAttachments = &(VkAttachmentDescription) {
-						       .format = VK_FORMAT_UNDEFINED,
+						       .format = VK_FORMAT_D32_SFLOAT_S8_UINT,
 						       .samples = samples,
 						       .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
 						       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
@@ -562,24 +499,8 @@
 
 	const VkPipelineVertexInputStateCreateInfo vi_state = {
 		.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-		.vertexBindingDescriptionCount = 1,
-		.pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
-			{
-				.binding = 0,
-				.stride = sizeof(struct depthstencil_clear_vattrs),
-				.inputRate = VK_VERTEX_INPUT_RATE_VERTEX
-			},
-		},
-		.vertexAttributeDescriptionCount = 1,
-		.pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
-			{
-				/* Position */
-				.location = 0,
-				.binding = 0,
-				.format = VK_FORMAT_R32G32B32_SFLOAT,
-				.offset = offsetof(struct depthstencil_clear_vattrs, position),
-			},
-		},
+		.vertexBindingDescriptionCount = 0,
+		.vertexAttributeDescriptionCount = 0,
 	};
 
 	const VkPipelineDepthStencilStateCreateInfo ds_state = {
@@ -619,14 +540,19 @@
 	}
 	result = create_pipeline(device, radv_render_pass_from_handle(render_pass),
 				 samples, vs_nir, fs_nir, &vi_state, &ds_state, &cb_state,
+				 device->meta_state.clear_depth_p_layout,
 				 &extra, &device->meta_state.alloc, pipeline);
 	return result;
 }
 
-static bool depth_view_can_fast_clear(const struct radv_image_view *iview,
+static bool depth_view_can_fast_clear(struct radv_cmd_buffer *cmd_buffer,
+				      const struct radv_image_view *iview,
 				      VkImageLayout layout,
 				      const VkClearRect *clear_rect)
 {
+	uint32_t queue_mask = radv_image_queue_family_mask(iview->image,
+	                                                   cmd_buffer->queue_family_index,
+	                                                   cmd_buffer->queue_family_index);
 	if (clear_rect->rect.offset.x || clear_rect->rect.offset.y ||
 	    clear_rect->rect.extent.width != iview->extent.width ||
 	    clear_rect->rect.extent.height != iview->extent.height)
@@ -634,14 +560,15 @@
 	if (iview->image->surface.htile_size &&
 	    iview->base_mip == 0 &&
 	    iview->base_layer == 0 &&
-	    radv_layout_can_expclear(iview->image, layout) &&
-	    memcmp(&iview->extent, &iview->image->extent, sizeof(iview->extent)) == 0)
+	    radv_layout_is_htile_compressed(iview->image, layout, queue_mask) &&
+	    !radv_image_extent_compare(iview->image, &iview->extent))
 		return true;
 	return false;
 }
 
 static struct radv_pipeline *
-pick_depthstencil_pipeline(struct radv_meta_state *meta_state,
+pick_depthstencil_pipeline(struct radv_cmd_buffer *cmd_buffer,
+			   struct radv_meta_state *meta_state,
 			   const struct radv_image_view *iview,
 			   int samples_log2,
 			   VkImageAspectFlags aspects,
@@ -649,7 +576,7 @@
 			   const VkClearRect *clear_rect,
 			   VkClearDepthStencilValue clear_value)
 {
-	bool fast = depth_view_can_fast_clear(iview, layout, clear_rect);
+	bool fast = depth_view_can_fast_clear(cmd_buffer, iview, layout, clear_rect);
 	int index = DEPTH_CLEAR_SLOW;
 
 	if (fast) {
@@ -682,10 +609,9 @@
 	VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil;
 	VkImageAspectFlags aspects = clear_att->aspectMask;
 	const struct radv_image_view *iview = fb->attachments[pass_att].attachment;
-	const uint32_t samples = iview->image->samples;
+	const uint32_t samples = iview->image->info.samples;
 	const uint32_t samples_log2 = ffs(samples) - 1;
 	VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
-	uint32_t offset;
 
 	assert(aspects == VK_IMAGE_ASPECT_DEPTH_BIT ||
 	       aspects == VK_IMAGE_ASPECT_STENCIL_BIT ||
@@ -696,48 +622,18 @@
 	if (!(aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
 		clear_value.depth = 1.0f;
 
-	const struct depthstencil_clear_vattrs vertex_data[3] = {
-		{
-			.position = {
-				-1.0,
-				-1.0
-			},
-			.depth_clear = clear_value.depth,
-		},
-		{
-			.position = {
-				-1.0,
-				1.0,
-			},
-			.depth_clear = clear_value.depth,
-		},
-		{
-			.position = {
-				1.0,
-				-1.0,
-			},
-			.depth_clear = clear_value.depth,
-		},
-	};
-
-	radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset);
-	struct radv_buffer vertex_buffer = {
-		.device = device,
-		.size = sizeof(vertex_data),
-		.bo = cmd_buffer->upload.upload_bo,
-		.offset = offset,
-	};
+	radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+			      device->meta_state.clear_depth_p_layout,
+			      VK_SHADER_STAGE_VERTEX_BIT, 0, 4,
+			      &clear_value.depth);
 
 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
 		radv_CmdSetStencilReference(cmd_buffer_h, VK_STENCIL_FACE_FRONT_BIT,
 						  clear_value.stencil);
 	}
 
-	radv_CmdBindVertexBuffers(cmd_buffer_h, 0, 1,
-					(VkBuffer[]) { radv_buffer_to_handle(&vertex_buffer) },
-					(VkDeviceSize[]) { 0 });
-
-	struct radv_pipeline *pipeline = pick_depthstencil_pipeline(meta_state,
+	struct radv_pipeline *pipeline = pick_depthstencil_pipeline(cmd_buffer,
+								    meta_state,
 								    iview,
 								    samples_log2,
 								    aspects,
@@ -749,7 +645,7 @@
 					   radv_pipeline_to_handle(pipeline));
 	}
 
-	if (depth_view_can_fast_clear(iview, subpass->depth_stencil_attachment.layout, clear_rect))
+	if (depth_view_can_fast_clear(cmd_buffer, iview, subpass->depth_stencil_attachment.layout, clear_rect))
 		radv_set_depth_clear_regs(cmd_buffer, iview->image, clear_value, aspects);
 
 	radv_CmdSetViewport(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkViewport) {
@@ -763,14 +659,103 @@
 
 	radv_CmdSetScissor(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &clear_rect->rect);
 
-	radv_CmdDraw(cmd_buffer_h, 3, clear_rect->layerCount, 0, 0);
+	radv_CmdDraw(cmd_buffer_h, 3, clear_rect->layerCount, 0, clear_rect->baseArrayLayer);
 }
 
+static bool
+emit_fast_htile_clear(struct radv_cmd_buffer *cmd_buffer,
+		      const VkClearAttachment *clear_att,
+		      const VkClearRect *clear_rect,
+		      enum radv_cmd_flush_bits *pre_flush,
+		      enum radv_cmd_flush_bits *post_flush)
+{
+	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+	const uint32_t pass_att = subpass->depth_stencil_attachment.attachment;
+	VkImageLayout image_layout = subpass->depth_stencil_attachment.layout;
+	const struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
+	const struct radv_image_view *iview = fb->attachments[pass_att].attachment;
+	VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil;
+	VkImageAspectFlags aspects = clear_att->aspectMask;
+	uint32_t clear_word;
+
+	if (!iview->image->surface.htile_size)
+		return false;
+
+	if (cmd_buffer->device->debug_flags & RADV_DEBUG_NO_FAST_CLEARS)
+		return false;
+
+	if (!radv_layout_is_htile_compressed(iview->image, image_layout, radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index, cmd_buffer->queue_family_index)))
+		goto fail;
+
+	/* don't fast clear 3D */
+	if (iview->image->type == VK_IMAGE_TYPE_3D)
+		goto fail;
+
+	/* all layers are bound */
+	if (iview->base_layer > 0)
+		goto fail;
+	if (iview->image->info.array_size != iview->layer_count)
+		goto fail;
+
+	if (iview->image->info.levels > 1)
+		goto fail;
+
+	if (!radv_image_extent_compare(iview->image, &iview->extent))
+		goto fail;
+
+	if (clear_rect->rect.offset.x || clear_rect->rect.offset.y ||
+	    clear_rect->rect.extent.width != iview->image->info.width ||
+	    clear_rect->rect.extent.height != iview->image->info.height)
+		goto fail;
+
+	if (clear_rect->baseArrayLayer != 0)
+		goto fail;
+	if (clear_rect->layerCount != iview->image->info.array_size)
+		goto fail;
+
+	if ((clear_value.depth != 0.0 && clear_value.depth != 1.0) || !(aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
+		goto fail;
+
+	if (vk_format_aspects(iview->image->vk_format) & VK_IMAGE_ASPECT_STENCIL_BIT) {
+		if (clear_value.stencil != 0 || !(aspects & VK_IMAGE_ASPECT_STENCIL_BIT))
+			goto fail;
+		clear_word = clear_value.depth ? 0xfffc0000 : 0;
+	} else
+		clear_word = clear_value.depth ? 0xfffffff0 : 0;
+
+	if (pre_flush) {
+		cmd_buffer->state.flush_bits |= (RADV_CMD_FLAG_FLUSH_AND_INV_DB |
+						 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) & ~ *pre_flush;
+		*pre_flush |= cmd_buffer->state.flush_bits;
+	} else
+		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
+		                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
+
+	radv_fill_buffer(cmd_buffer, iview->image->bo,
+	                 iview->image->offset + iview->image->htile_offset,
+	                 iview->image->surface.htile_size, clear_word);
+
+
+	radv_set_depth_clear_regs(cmd_buffer, iview->image, clear_value, aspects);
+	if (post_flush)
+		*post_flush |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+	                       RADV_CMD_FLAG_INV_VMEM_L1 |
+	                       RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
+	else
+		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+	                                        RADV_CMD_FLAG_INV_VMEM_L1 |
+	                                        RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
+	return true;
+fail:
+	return false;
+}
 
 static VkFormat pipeline_formats[] = {
 	VK_FORMAT_R8G8B8A8_UNORM,
 	VK_FORMAT_R8G8B8A8_UINT,
 	VK_FORMAT_R8G8B8A8_SINT,
+	VK_FORMAT_A2R10G10B10_UINT_PACK32,
+	VK_FORMAT_A2R10G10B10_SINT_PACK32,
 	VK_FORMAT_R16G16B16A16_UNORM,
 	VK_FORMAT_R16G16B16A16_SNORM,
 	VK_FORMAT_R16G16B16A16_UINT,
@@ -788,6 +773,34 @@
 
 	memset(&device->meta_state.clear, 0, sizeof(device->meta_state.clear));
 
+	VkPipelineLayoutCreateInfo pl_color_create_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+		.setLayoutCount = 0,
+		.pushConstantRangeCount = 1,
+		.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16},
+	};
+
+	res = radv_CreatePipelineLayout(radv_device_to_handle(device),
+					&pl_color_create_info,
+					&device->meta_state.alloc,
+					&device->meta_state.clear_color_p_layout);
+	if (res != VK_SUCCESS)
+		goto fail;
+
+	VkPipelineLayoutCreateInfo pl_depth_create_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+		.setLayoutCount = 0,
+		.pushConstantRangeCount = 1,
+		.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_VERTEX_BIT, 0, 4},
+	};
+
+	res = radv_CreatePipelineLayout(radv_device_to_handle(device),
+					&pl_depth_create_info,
+					&device->meta_state.alloc,
+					&device->meta_state.clear_depth_p_layout);
+	if (res != VK_SUCCESS)
+		goto fail;
+
 	for (uint32_t i = 0; i < ARRAY_SIZE(state->clear); ++i) {
 		uint32_t samples = 1 << i;
 		for (uint32_t j = 0; j < ARRAY_SIZE(pipeline_formats); ++j) {
@@ -850,6 +863,83 @@
 	return res;
 }
 
+static void vi_get_fast_clear_parameters(VkFormat format,
+					 const VkClearColorValue *clear_value,
+					 uint32_t* reset_value,
+					 bool *can_avoid_fast_clear_elim)
+{
+	bool values[4] = {};
+	int extra_channel;
+	bool main_value = false;
+	bool extra_value = false;
+	int i;
+	*can_avoid_fast_clear_elim = false;
+
+	*reset_value = 0x20202020U;
+
+	const struct vk_format_description *desc = vk_format_description(format);
+	if (format == VK_FORMAT_B10G11R11_UFLOAT_PACK32 ||
+	    format == VK_FORMAT_R5G6B5_UNORM_PACK16 ||
+	    format == VK_FORMAT_B5G6R5_UNORM_PACK16)
+		extra_channel = -1;
+	else if (desc->layout == VK_FORMAT_LAYOUT_PLAIN) {
+		if (radv_translate_colorswap(format, false) <= 1)
+			extra_channel = desc->nr_channels - 1;
+		else
+			extra_channel = 0;
+	} else
+		return;
+
+	for (i = 0; i < 4; i++) {
+		int index = desc->swizzle[i] - VK_SWIZZLE_X;
+		if (desc->swizzle[i] < VK_SWIZZLE_X ||
+		    desc->swizzle[i] > VK_SWIZZLE_W)
+			continue;
+
+		if (desc->channel[i].pure_integer &&
+		    desc->channel[i].type == VK_FORMAT_TYPE_SIGNED) {
+			/* Use the maximum value for clamping the clear color. */
+			int max = u_bit_consecutive(0, desc->channel[i].size - 1);
+
+			values[i] = clear_value->int32[i] != 0;
+			if (clear_value->int32[i] != 0 && MIN2(clear_value->int32[i], max) != max)
+				return;
+		} else if (desc->channel[i].pure_integer &&
+			   desc->channel[i].type == VK_FORMAT_TYPE_UNSIGNED) {
+			/* Use the maximum value for clamping the clear color. */
+			unsigned max = u_bit_consecutive(0, desc->channel[i].size);
+
+			values[i] = clear_value->uint32[i] != 0U;
+			if (clear_value->uint32[i] != 0U && MIN2(clear_value->uint32[i], max) != max)
+				return;
+		} else {
+			values[i] = clear_value->float32[i] != 0.0F;
+			if (clear_value->float32[i] != 0.0F && clear_value->float32[i] != 1.0F)
+				return;
+		}
+
+		if (index == extra_channel)
+			extra_value = values[i];
+		else
+			main_value = values[i];
+	}
+
+	for (int i = 0; i < 4; ++i)
+		if (values[i] != main_value &&
+		    desc->swizzle[i] - VK_SWIZZLE_X != extra_channel &&
+		    desc->swizzle[i] >= VK_SWIZZLE_X &&
+		    desc->swizzle[i] <= VK_SWIZZLE_W)
+			return;
+
+	*can_avoid_fast_clear_elim = true;
+	if (main_value)
+		*reset_value |= 0x80808080U;
+
+	if (extra_value)
+		*reset_value |= 0x40404040U;
+	return;
+}
+
 static bool
 emit_fast_color_clear(struct radv_cmd_buffer *cmd_buffer,
 		      const VkClearAttachment *clear_att,
@@ -875,8 +965,6 @@
 
 	if (!radv_layout_can_fast_clear(iview->image, image_layout, radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index, cmd_buffer->queue_family_index)))
 		goto fail;
-	if (vk_format_get_blocksizebits(iview->image->vk_format) > 64)
-		goto fail;
 
 	/* don't fast clear 3D */
 	if (iview->image->type == VK_IMAGE_TYPE_3D)
@@ -885,26 +973,30 @@
 	/* all layers are bound */
 	if (iview->base_layer > 0)
 		goto fail;
-	if (iview->image->array_size != iview->layer_count)
+	if (iview->image->info.array_size != iview->layer_count)
 		goto fail;
 
-	if (iview->image->levels > 1)
+	if (iview->image->info.levels > 1)
 		goto fail;
 
-	if (iview->image->surface.level[0].mode < RADEON_SURF_MODE_1D)
+	if (iview->image->surface.is_linear)
 		goto fail;
-
-	if (memcmp(&iview->extent, &iview->image->extent, sizeof(iview->extent)))
+	if (!radv_image_extent_compare(iview->image, &iview->extent))
 		goto fail;
 
 	if (clear_rect->rect.offset.x || clear_rect->rect.offset.y ||
-	    clear_rect->rect.extent.width != iview->image->extent.width ||
-	    clear_rect->rect.extent.height != iview->image->extent.height)
+	    clear_rect->rect.extent.width != iview->image->info.width ||
+	    clear_rect->rect.extent.height != iview->image->info.height)
 		goto fail;
 
 	if (clear_rect->baseArrayLayer != 0)
 		goto fail;
-	if (clear_rect->layerCount != iview->image->array_size)
+	if (clear_rect->layerCount != iview->image->info.array_size)
+		goto fail;
+
+	/* RB+ doesn't work with CMASK fast clear on Stoney. */
+	if (!iview->image->surface.dcc_size &&
+	    cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY)
 		goto fail;
 
 	/* DCC */
@@ -922,10 +1014,23 @@
 		                                RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
 	/* clear cmask buffer */
 	if (iview->image->surface.dcc_size) {
+		uint32_t reset_value;
+		bool can_avoid_fast_clear_elim;
+		vi_get_fast_clear_parameters(iview->image->vk_format,
+					     &clear_value, &reset_value,
+					     &can_avoid_fast_clear_elim);
+
 		radv_fill_buffer(cmd_buffer, iview->image->bo,
 				 iview->image->offset + iview->image->dcc_offset,
-				 iview->image->surface.dcc_size, 0x20202020);
+				 iview->image->surface.dcc_size, reset_value);
+		radv_set_dcc_need_cmask_elim_pred(cmd_buffer, iview->image,
+						  !can_avoid_fast_clear_elim);
 	} else {
+
+		if (iview->image->surface.bpe > 8) {
+			/* 128 bit formats not supported */
+			return false;
+		}
 		radv_fill_buffer(cmd_buffer, iview->image->bo,
 				 iview->image->offset + iview->image->cmask.offset,
 				 iview->image->cmask.size, 0);
@@ -965,7 +1070,9 @@
 	} else {
 		assert(clear_att->aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT |
 						VK_IMAGE_ASPECT_STENCIL_BIT));
-		emit_depthstencil_clear(cmd_buffer, clear_att, clear_rect);
+		if (!emit_fast_htile_clear(cmd_buffer, clear_att, clear_rect,
+		                           pre_flush, post_flush))
+			emit_depthstencil_clear(cmd_buffer, clear_att, clear_rect);
 	}
 }
 
@@ -980,7 +1087,8 @@
 	ds = cmd_state->subpass->depth_stencil_attachment.attachment;
 	for (uint32_t i = 0; i < cmd_state->subpass->color_count; ++i) {
 		uint32_t a = cmd_state->subpass->color_attachments[i].attachment;
-		if (cmd_state->attachments[a].pending_clear_aspects) {
+		if (a != VK_ATTACHMENT_UNUSED &&
+		    cmd_state->attachments[a].pending_clear_aspects) {
 			return true;
 		}
 	}
@@ -1009,7 +1117,7 @@
 	if (!subpass_needs_clear(cmd_buffer))
 		return;
 
-	radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer);
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
 
 	VkClearRect clear_rect = {
 		.rect = cmd_state->render_area,
@@ -1020,7 +1128,8 @@
 	for (uint32_t i = 0; i < cmd_state->subpass->color_count; ++i) {
 		uint32_t a = cmd_state->subpass->color_attachments[i].attachment;
 
-		if (!cmd_state->attachments[a].pending_clear_aspects)
+		if (a == VK_ATTACHMENT_UNUSED ||
+		    !cmd_state->attachments[a].pending_clear_aspects)
 			continue;
 
 		assert(cmd_state->attachments[a].pending_clear_aspects ==
@@ -1067,6 +1176,9 @@
 {
 	VkDevice device_h = radv_device_to_handle(cmd_buffer->device);
 	struct radv_image_view iview;
+	uint32_t width = radv_minify(image->info.width, range->baseMipLevel + level);
+	uint32_t height = radv_minify(image->info.height, range->baseMipLevel + level);
+
 	radv_image_view_init(&iview, cmd_buffer->device,
 			     &(VkImageViewCreateInfo) {
 				     .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
@@ -1080,8 +1192,7 @@
 					     .baseArrayLayer = range->baseArrayLayer + layer,
 					     .layerCount = 1
 				     },
-			     },
-			     cmd_buffer, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT);
+			     });
 
 	VkFramebuffer fb;
 	radv_CreateFramebuffer(device_h,
@@ -1091,9 +1202,9 @@
 					       .pAttachments = (VkImageView[]) {
 					       radv_image_view_to_handle(&iview),
 				       },
-					       .width = iview.extent.width,
-							.height = iview.extent.height,
-							.layers = 1
+					       .width = width,
+					       .height = height,
+					       .layers = 1
 			       },
 			       &cmd_buffer->pool->alloc,
 			       &fb);
@@ -1149,8 +1260,8 @@
 						.renderArea = {
 						.offset = { 0, 0, },
 						.extent = {
-							.width = iview.extent.width,
-							.height = iview.extent.height,
+							.width = width,
+							.height = height,
 						},
 					},
 						.renderPass = pass,
@@ -1169,7 +1280,7 @@
 	VkClearRect clear_rect = {
 		.rect = {
 			.offset = { 0, 0 },
-			.extent = { iview.extent.width, iview.extent.height },
+			.extent = { width, height },
 		},
 		.baseArrayLayer = range->baseArrayLayer,
 		.layerCount = 1, /* FINISHME: clear multi-layer framebuffer */
@@ -1214,7 +1325,7 @@
 		const VkImageSubresourceRange *range = &ranges[r];
 		for (uint32_t l = 0; l < radv_get_levelCount(image, range); ++l) {
 			const uint32_t layer_count = image->type == VK_IMAGE_TYPE_3D ?
-				radv_minify(image->extent.depth, range->baseMipLevel + l) :
+				radv_minify(image->info.depth, range->baseMipLevel + l) :
 				radv_get_layerCount(image, range);
 			for (uint32_t s = 0; s < layer_count; ++s) {
 
@@ -1257,7 +1368,7 @@
 	if (cs)
 		radv_meta_begin_cleari(cmd_buffer, &saved_state.compute);
 	else
-		radv_meta_save_graphics_reset_vport_scissor(&saved_state.gfx, cmd_buffer);
+		radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state.gfx, cmd_buffer);
 
 	radv_cmd_clear_image(cmd_buffer, image, imageLayout,
 			     (const VkClearValue *) pColor,
@@ -1281,7 +1392,7 @@
 	RADV_FROM_HANDLE(radv_image, image, image_h);
 	struct radv_meta_saved_state saved_state;
 
-	radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer);
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
 
 	radv_cmd_clear_image(cmd_buffer, image, imageLayout,
 			     (const VkClearValue *) pDepthStencil,
@@ -1305,7 +1416,7 @@
 	if (!cmd_buffer->state.subpass)
 		return;
 
-	radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer);
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
 
 	/* FINISHME: We can do better than this dumb loop. It thrashes too much
 	 * state.
diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c
index 54dadde..411d0b8 100644
--- a/src/amd/vulkan/radv_meta_copy.c
+++ b/src/amd/vulkan/radv_meta_copy.c
@@ -118,12 +118,12 @@
 	/* The Vulkan 1.0 spec says "dstImage must have a sample count equal to
 	 * VK_SAMPLE_COUNT_1_BIT."
 	 */
-	assert(image->samples == 1);
+	assert(image->info.samples == 1);
 
 	if (cs)
 		radv_meta_begin_bufimage(cmd_buffer, &saved_state.compute);
 	else
-		radv_meta_save_graphics_reset_vport_scissor(&saved_state.gfx, cmd_buffer);
+		radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state.gfx, cmd_buffer);
 
 	for (unsigned r = 0; r < regionCount; r++) {
 
@@ -337,11 +337,11 @@
 	 *    vkCmdCopyImage can be used to copy image data between multisample
 	 *    images, but both images must have the same number of samples.
 	 */
-	assert(src_image->samples == dest_image->samples);
+	assert(src_image->info.samples == dest_image->info.samples);
 	if (cs)
 		radv_meta_begin_itoi(cmd_buffer, &saved_state.compute);
 	else
-		radv_meta_save_graphics_reset_vport_scissor(&saved_state.gfx, cmd_buffer);
+		radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state.gfx, cmd_buffer);
 
 	for (unsigned r = 0; r < regionCount; r++) {
 		assert(pRegions[r].srcSubresource.aspectMask ==
@@ -447,8 +447,8 @@
 	image_copy.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
 	image_copy.dstSubresource.layerCount = 1;
 
-	image_copy.extent.width = image->extent.width;
-	image_copy.extent.height = image->extent.height;
+	image_copy.extent.width = image->info.width;
+	image_copy.extent.height = image->info.height;
 	image_copy.extent.depth = 1;
 
 	meta_copy_image(cmd_buffer, image, linear_image,
diff --git a/src/amd/vulkan/radv_meta_decompress.c b/src/amd/vulkan/radv_meta_decompress.c
index 854b88a..f68ce8d 100644
--- a/src/amd/vulkan/radv_meta_decompress.c
+++ b/src/amd/vulkan/radv_meta_decompress.c
@@ -26,64 +26,20 @@
 
 #include "radv_meta.h"
 #include "radv_private.h"
-#include "nir/nir_builder.h"
 #include "sid.h"
-/**
- * Vertex attributes used by all pipelines.
- */
-struct vertex_attrs {
-	float position[2]; /**< 3DPRIM_RECTLIST */
-};
-
-/* passthrough vertex shader */
-static nir_shader *
-build_nir_vs(void)
-{
-	const struct glsl_type *vec4 = glsl_vec4_type();
-
-	nir_builder b;
-	nir_variable *a_position;
-	nir_variable *v_position;
-
-	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_depth_decomp_vs");
-
-	a_position = nir_variable_create(b.shader, nir_var_shader_in, vec4,
-					 "a_position");
-	a_position->data.location = VERT_ATTRIB_GENERIC0;
-
-	v_position = nir_variable_create(b.shader, nir_var_shader_out, vec4,
-					 "gl_Position");
-	v_position->data.location = VARYING_SLOT_POS;
-
-	nir_copy_var(&b, v_position, a_position);
-
-	return b.shader;
-}
-
-/* simple passthrough shader */
-static nir_shader *
-build_nir_fs(void)
-{
-	nir_builder b;
-
-	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
-	b.shader->info->name = ralloc_asprintf(b.shader,
-					       "meta_depth_decomp_noop_fs");
-
-	return b.shader;
-}
 
 static VkResult
-create_pass(struct radv_device *device)
+create_pass(struct radv_device *device,
+	    uint32_t samples,
+	    VkRenderPass *pass)
 {
 	VkResult result;
 	VkDevice device_h = radv_device_to_handle(device);
 	const VkAllocationCallbacks *alloc = &device->meta_state.alloc;
 	VkAttachmentDescription attachment;
 
-	attachment.format = VK_FORMAT_UNDEFINED;
-	attachment.samples = 1;
+	attachment.format = VK_FORMAT_D32_SFLOAT_S8_UINT;
+	attachment.samples = samples;
 	attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
 	attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE;
 	attachment.initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
@@ -111,20 +67,24 @@
 								.dependencyCount = 0,
 								   },
 				       alloc,
-				       &device->meta_state.depth_decomp.pass);
+				       pass);
 
 	return result;
 }
 
 static VkResult
 create_pipeline(struct radv_device *device,
-                VkShaderModule vs_module_h)
+                VkShaderModule vs_module_h,
+		uint32_t samples,
+		VkRenderPass pass,
+		VkPipeline *decompress_pipeline,
+		VkPipeline *resummarize_pipeline)
 {
 	VkResult result;
 	VkDevice device_h = radv_device_to_handle(device);
 
 	struct radv_shader_module fs_module = {
-		.nir = build_nir_fs(),
+		.nir = radv_meta_build_nir_fs_noop(),
 	};
 
 	if (!fs_module.nir) {
@@ -152,24 +112,8 @@
 		},
 		.pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) {
 			.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-			.vertexBindingDescriptionCount = 1,
-			.pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
-				{
-					.binding = 0,
-					.stride = sizeof(struct vertex_attrs),
-					.inputRate = VK_VERTEX_INPUT_RATE_VERTEX
-				},
-			},
-			.vertexAttributeDescriptionCount = 1,
-			.pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
-				{
-					/* Position */
-					.location = 0,
-					.binding = 0,
-					.format = VK_FORMAT_R32G32_SFLOAT,
-					.offset = offsetof(struct vertex_attrs, position),
-				},
-			},
+			.vertexBindingDescriptionCount = 0,
+			.vertexAttributeDescriptionCount = 0,
 		},
 		.pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
 			.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
@@ -191,7 +135,7 @@
 		},
 		.pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
 			.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
-			.rasterizationSamples = 1,
+			.rasterizationSamples = samples,
 			.sampleShadingEnable = false,
 			.pSampleMask = NULL,
 			.alphaToCoverageEnable = false,
@@ -218,7 +162,7 @@
 				VK_DYNAMIC_STATE_SCISSOR,
 			},
 		},
-		.renderPass = device->meta_state.depth_decomp.pass,
+		.renderPass = pass,
 		.subpass = 0,
 	};
 
@@ -231,7 +175,7 @@
 							.db_flush_stencil_inplace = true,
 					       },
 					       &device->meta_state.alloc,
-					       &device->meta_state.depth_decomp.decompress_pipeline);
+					       decompress_pipeline);
 	if (result != VK_SUCCESS)
 		goto cleanup;
 
@@ -245,7 +189,7 @@
 							.db_resummarize = true,
 					       },
 					       &device->meta_state.alloc,
-					       &device->meta_state.depth_decomp.resummarize_pipeline);
+					       resummarize_pipeline);
 	if (result != VK_SUCCESS)
 		goto cleanup;
 
@@ -261,45 +205,55 @@
 {
 	struct radv_meta_state *state = &device->meta_state;
 	VkDevice device_h = radv_device_to_handle(device);
-	VkRenderPass pass_h = device->meta_state.depth_decomp.pass;
 	const VkAllocationCallbacks *alloc = &device->meta_state.alloc;
 
-	if (pass_h)
-		radv_DestroyRenderPass(device_h, pass_h,
-					     &device->meta_state.alloc);
-
-	VkPipeline pipeline_h = state->depth_decomp.decompress_pipeline;
-	if (pipeline_h) {
-		radv_DestroyPipeline(device_h, pipeline_h, alloc);
-	}
-	pipeline_h = state->depth_decomp.resummarize_pipeline;
-	if (pipeline_h) {
-		radv_DestroyPipeline(device_h, pipeline_h, alloc);
+	for (uint32_t i = 0; i < ARRAY_SIZE(state->depth_decomp); ++i) {
+		VkRenderPass pass_h = state->depth_decomp[i].pass;
+		if (pass_h) {
+			radv_DestroyRenderPass(device_h, pass_h, alloc);
+		}
+		VkPipeline pipeline_h = state->depth_decomp[i].decompress_pipeline;
+		if (pipeline_h) {
+			radv_DestroyPipeline(device_h, pipeline_h, alloc);
+		}
+		pipeline_h = state->depth_decomp[i].resummarize_pipeline;
+		if (pipeline_h) {
+			radv_DestroyPipeline(device_h, pipeline_h, alloc);
+		}
 	}
 }
 
 VkResult
 radv_device_init_meta_depth_decomp_state(struct radv_device *device)
 {
+	struct radv_meta_state *state = &device->meta_state;
 	VkResult res = VK_SUCCESS;
 
-	zero(device->meta_state.depth_decomp);
+	zero(state->depth_decomp);
 
-	struct radv_shader_module vs_module = { .nir = build_nir_vs() };
+	struct radv_shader_module vs_module = { .nir = radv_meta_build_nir_vs_generate_vertices() };
 	if (!vs_module.nir) {
 		/* XXX: Need more accurate error */
 		res = VK_ERROR_OUT_OF_HOST_MEMORY;
 		goto fail;
 	}
 
-	res = create_pass(device);
-	if (res != VK_SUCCESS)
-		goto fail;
-
 	VkShaderModule vs_module_h = radv_shader_module_to_handle(&vs_module);
-	res = create_pipeline(device, vs_module_h);
-	if (res != VK_SUCCESS)
-		goto fail;
+
+	for (uint32_t i = 0; i < ARRAY_SIZE(state->depth_decomp); ++i) {
+		uint32_t samples = 1 << i;
+
+		res = create_pass(device, samples, &state->depth_decomp[i].pass);
+		if (res != VK_SUCCESS)
+			goto fail;
+
+		res = create_pipeline(device, vs_module_h, samples,
+				      state->depth_decomp[i].pass,
+				      &state->depth_decomp[i].decompress_pipeline,
+				      &state->depth_decomp[i].resummarize_pipeline);
+		if (res != VK_SUCCESS)
+			goto fail;
+	}
 
 	goto cleanup;
 
@@ -318,45 +272,7 @@
 		  const VkExtent2D *depth_decomp_extent,
 		  VkPipeline pipeline_h)
 {
-	struct radv_device *device = cmd_buffer->device;
 	VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
-	uint32_t offset;
-	const struct vertex_attrs vertex_data[3] = {
-		{
-			.position = {
-				-1.0,
-				-1.0,
-			},
-		},
-		{
-			.position = {
-				-1.0,
-				1.0,
-			},
-		},
-		{
-			.position = {
-				1.0,
-				-1.0,
-			},
-		},
-	};
-
-	radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset);
-	struct radv_buffer vertex_buffer = {
-		.device = device,
-		.size = sizeof(vertex_data),
-		.bo = cmd_buffer->upload.upload_bo,
-		.offset = offset,
-	};
-
-	VkBuffer vertex_buffer_h = radv_buffer_to_handle(&vertex_buffer);
-
-	radv_CmdBindVertexBuffers(cmd_buffer_h,
-				  /*firstBinding*/ 0,
-				  /*bindingCount*/ 1,
-				  (VkBuffer[]) { vertex_buffer_h },
-				  (VkDeviceSize[]) { 0 });
 
 	RADV_FROM_HANDLE(radv_pipeline, pipeline, pipeline_h);
 
@@ -383,25 +299,33 @@
 }
 
 
+enum radv_depth_op {
+	DEPTH_DECOMPRESS,
+	DEPTH_RESUMMARIZE,
+};
+
 static void radv_process_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 					     struct radv_image *image,
 					     VkImageSubresourceRange *subresourceRange,
-					     VkPipeline pipeline_h)
+					     enum radv_depth_op op)
 {
 	struct radv_meta_saved_state saved_state;
 	struct radv_meta_saved_pass_state saved_pass_state;
 	VkDevice device_h = radv_device_to_handle(cmd_buffer->device);
 	VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
-	uint32_t width = radv_minify(image->extent.width,
+	uint32_t width = radv_minify(image->info.width,
 				     subresourceRange->baseMipLevel);
-	uint32_t height = radv_minify(image->extent.height,
+	uint32_t height = radv_minify(image->info.height,
 				     subresourceRange->baseMipLevel);
+	uint32_t samples = image->info.samples;
+	uint32_t samples_log2 = ffs(samples) - 1;
+	struct radv_meta_state *meta_state = &cmd_buffer->device->meta_state;
 
 	if (!image->surface.htile_size)
 		return;
 	radv_meta_save_pass(&saved_pass_state, cmd_buffer);
 
-	radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer);
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
 
 	for (uint32_t layer = 0; layer < radv_get_layerCount(image, subresourceRange); layer++) {
 		struct radv_image_view iview;
@@ -418,8 +342,7 @@
 						     .baseArrayLayer = subresourceRange->baseArrayLayer + layer,
 						     .layerCount = 1,
 					     },
-				     },
-				     cmd_buffer, VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT);
+				     });
 
 
 		VkFramebuffer fb_h;
@@ -440,7 +363,7 @@
 		radv_CmdBeginRenderPass(cmd_buffer_h,
 					      &(VkRenderPassBeginInfo) {
 						      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-							      .renderPass = cmd_buffer->device->meta_state.depth_decomp.pass,
+							      .renderPass = meta_state->depth_decomp[samples_log2].pass,
 							      .framebuffer = fb_h,
 							      .renderArea = {
 							      .offset = {
@@ -457,6 +380,18 @@
 					   },
 					   VK_SUBPASS_CONTENTS_INLINE);
 
+		VkPipeline pipeline_h;
+		switch (op) {
+		case DEPTH_DECOMPRESS:
+			pipeline_h = meta_state->depth_decomp[samples_log2].decompress_pipeline;
+			break;
+		case DEPTH_RESUMMARIZE:
+			pipeline_h = meta_state->depth_decomp[samples_log2].resummarize_pipeline;
+			break;
+		default:
+			unreachable("unknown operation");
+		}
+
 		emit_depth_decomp(cmd_buffer, &(VkOffset2D){0, 0 }, &(VkExtent2D){width, height}, pipeline_h);
 		radv_CmdEndRenderPass(cmd_buffer_h);
 
@@ -472,8 +407,7 @@
 					 VkImageSubresourceRange *subresourceRange)
 {
 	assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL);
-	radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange,
-					 cmd_buffer->device->meta_state.depth_decomp.decompress_pipeline);
+	radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange, DEPTH_DECOMPRESS);
 }
 
 void radv_resummarize_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
@@ -481,6 +415,5 @@
 					 VkImageSubresourceRange *subresourceRange)
 {
 	assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL);
-	radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange,
-					 cmd_buffer->device->meta_state.depth_decomp.resummarize_pipeline);
+	radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange, DEPTH_RESUMMARIZE);
 }
diff --git a/src/amd/vulkan/radv_meta_fast_clear.c b/src/amd/vulkan/radv_meta_fast_clear.c
index 3393bcb..27f8c16 100644
--- a/src/amd/vulkan/radv_meta_fast_clear.c
+++ b/src/amd/vulkan/radv_meta_fast_clear.c
@@ -26,53 +26,7 @@
 
 #include "radv_meta.h"
 #include "radv_private.h"
-#include "nir/nir_builder.h"
 #include "sid.h"
-/**
- * Vertex attributes used by all pipelines.
- */
-struct vertex_attrs {
-	float position[2]; /**< 3DPRIM_RECTLIST */
-};
-
-/* passthrough vertex shader */
-static nir_shader *
-build_nir_vs(void)
-{
-	const struct glsl_type *vec4 = glsl_vec4_type();
-
-	nir_builder b;
-	nir_variable *a_position;
-	nir_variable *v_position;
-
-	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_fast_clear_vs");
-
-	a_position = nir_variable_create(b.shader, nir_var_shader_in, vec4,
-					 "a_position");
-	a_position->data.location = VERT_ATTRIB_GENERIC0;
-
-	v_position = nir_variable_create(b.shader, nir_var_shader_out, vec4,
-					 "gl_Position");
-	v_position->data.location = VARYING_SLOT_POS;
-
-	nir_copy_var(&b, v_position, a_position);
-
-	return b.shader;
-}
-
-/* simple passthrough shader */
-static nir_shader *
-build_nir_fs(void)
-{
-	nir_builder b;
-
-	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
-	b.shader->info->name = ralloc_asprintf(b.shader,
-					      "meta_fast_clear_noop_fs");
-
-	return b.shader;
-}
 
 static VkResult
 create_pass(struct radv_device *device)
@@ -128,7 +82,7 @@
 	VkDevice device_h = radv_device_to_handle(device);
 
 	struct radv_shader_module fs_module = {
-		.nir = build_nir_fs(),
+		.nir = radv_meta_build_nir_fs_noop(),
 	};
 
 	if (!fs_module.nir) {
@@ -154,24 +108,8 @@
 
 	const VkPipelineVertexInputStateCreateInfo vi_state = {
 		.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-		.vertexBindingDescriptionCount = 1,
-		.pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
-			{
-				.binding = 0,
-				.stride = sizeof(struct vertex_attrs),
-				.inputRate = VK_VERTEX_INPUT_RATE_VERTEX
-			},
-		},
-		.vertexAttributeDescriptionCount = 1,
-		.pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
-			{
-				/* Position */
-				.location = 0,
-				.binding = 0,
-				.format = VK_FORMAT_R32G32_SFLOAT,
-				.offset = offsetof(struct vertex_attrs, position),
-			},
-		}
+		.vertexBindingDescriptionCount = 0,
+		.vertexAttributeDescriptionCount = 0,
 	};
 
 	const VkPipelineInputAssemblyStateCreateInfo ia_state = {
@@ -330,7 +268,7 @@
 
 	zero(device->meta_state.fast_clear_flush);
 
-	struct radv_shader_module vs_module = { .nir = build_nir_vs() };
+	struct radv_shader_module vs_module = { .nir = radv_meta_build_nir_vs_generate_vertices() };
 	if (!vs_module.nir) {
 		/* XXX: Need more accurate error */
 		res = VK_ERROR_OUT_OF_HOST_MEMORY;
@@ -364,43 +302,6 @@
 {
 	struct radv_device *device = cmd_buffer->device;
 	VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
-	uint32_t offset;
-	const struct vertex_attrs vertex_data[3] = {
-		{
-			.position = {
-				-1.0,
-				-1.0,
-			},
-		},
-		{
-			.position = {
-				-1.0,
-				1.0,
-			},
-		},
-		{
-			.position = {
-				1.0,
-				-1.0,
-			},
-		},
-	};
-
-	radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset);
-	struct radv_buffer vertex_buffer = {
-		.device = device,
-		.size = sizeof(vertex_data),
-		.bo = cmd_buffer->upload.upload_bo,
-		.offset = offset,
-	};
-
-	VkBuffer vertex_buffer_h = radv_buffer_to_handle(&vertex_buffer);
-
-	radv_CmdBindVertexBuffers(cmd_buffer_h,
-				  /*firstBinding*/ 0,
-				  /*bindingCount*/ 1,
-				  (VkBuffer[]) { vertex_buffer_h },
-				  (VkDeviceSize[]) { 0 });
 
 	VkPipeline pipeline_h;
 	if (fmask_decompress)
@@ -433,6 +334,20 @@
 					 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META);
 }
 
+/* Emit predication state with va = 0 when !value, else bo VA + image->offset + dcc_pred_offset. */
+static void
+radv_emit_set_predication_state_from_image(struct radv_cmd_buffer *cmd_buffer,
+				      struct radv_image *image, bool value)
+{
+	uint64_t va = 0;
+
+	if (value) {
+		va = cmd_buffer->device->ws->buffer_get_va(image->bo) + image->offset;
+		va += image->dcc_pred_offset;
+	}
+	si_emit_set_predication_state(cmd_buffer, va);
+}
+
 /**
  */
 void
@@ -448,8 +363,12 @@
 
 	assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL);
 	radv_meta_save_pass(&saved_pass_state, cmd_buffer);
-	radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer);
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
 
+	if (image->surface.dcc_size) {
+		radv_emit_set_predication_state_from_image(cmd_buffer, image, true);
+		cmd_buffer->state.predicating = true;
+	}
 	for (uint32_t layer = 0; layer < layer_count; ++layer) {
 		struct radv_image_view iview;
 
@@ -466,8 +385,7 @@
 						     .baseArrayLayer = subresourceRange->baseArrayLayer + layer,
 						     .layerCount = 1,
 					      },
-				     },
-				     cmd_buffer, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT);
+				     });
 
 		VkFramebuffer fb_h;
 		radv_CreateFramebuffer(device_h,
@@ -477,8 +395,8 @@
 					.pAttachments = (VkImageView[]) {
 						radv_image_view_to_handle(&iview)
 					},
-				       .width = image->extent.width,
-				       .height = image->extent.height,
+				       .width = image->info.width,
+				       .height = image->info.height,
 				       .layers = 1
 				},
 				&cmd_buffer->pool->alloc,
@@ -495,8 +413,8 @@
 							      0,
 						      },
 						      .extent = {
-							      image->extent.width,
-							      image->extent.height,
+							      image->info.width,
+							      image->info.height,
 						      }
 					      },
 					      .clearValueCount = 0,
@@ -505,7 +423,7 @@
 				     VK_SUBPASS_CONTENTS_INLINE);
 
 		emit_fast_clear_flush(cmd_buffer,
-				      &(VkExtent2D) { image->extent.width, image->extent.height },
+				      &(VkExtent2D) { image->info.width, image->info.height },
 				      image->fmask.size > 0);
 		radv_CmdEndRenderPass(cmd_buffer_h);
 
@@ -513,6 +431,10 @@
 					&cmd_buffer->pool->alloc);
 
 	}
+	if (image->surface.dcc_size) {
+		cmd_buffer->state.predicating = false;
+		radv_emit_set_predication_state_from_image(cmd_buffer, image, false);
+	}
 	radv_meta_restore(&saved_state, cmd_buffer);
 	radv_meta_restore_pass(&saved_pass_state, cmd_buffer);
 }
diff --git a/src/amd/vulkan/radv_meta_resolve.c b/src/amd/vulkan/radv_meta_resolve.c
index 52f7246..dd811c2 100644
--- a/src/amd/vulkan/radv_meta_resolve.c
+++ b/src/amd/vulkan/radv_meta_resolve.c
@@ -28,40 +28,8 @@
 #include "radv_private.h"
 #include "nir/nir_builder.h"
 #include "sid.h"
-/**
- * Vertex attributes used by all pipelines.
- */
-struct vertex_attrs {
-	float position[2]; /**< 3DPRIM_RECTLIST */
-};
 
-/* passthrough vertex shader */
-static nir_shader *
-build_nir_vs(void)
-{
-	const struct glsl_type *vec4 = glsl_vec4_type();
-
-	nir_builder b;
-	nir_variable *a_position;
-	nir_variable *v_position;
-
-	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "meta_resolve_vs");
-
-	a_position = nir_variable_create(b.shader, nir_var_shader_in, vec4,
-					 "a_position");
-	a_position->data.location = VERT_ATTRIB_GENERIC0;
-
-	v_position = nir_variable_create(b.shader, nir_var_shader_out, vec4,
-					 "gl_Position");
-	v_position->data.location = VARYING_SLOT_POS;
-
-	nir_copy_var(&b, v_position, a_position);
-
-	return b.shader;
-}
-
-/* simple passthrough shader */
+/* emit 0, 0, 0, 1 */
 static nir_shader *
 build_nir_fs(void)
 {
@@ -70,7 +38,7 @@
 	nir_variable *f_color; /* vec4, fragment output color */
 
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
-	b.shader->info->name = ralloc_asprintf(b.shader,
+	b.shader->info.name = ralloc_asprintf(b.shader,
 					       "meta_resolve_fs");
 
 	f_color = nir_variable_create(b.shader, nir_var_shader_out, vec4,
@@ -174,24 +142,8 @@
 					       },
 					       .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) {
 						       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-						       .vertexBindingDescriptionCount = 1,
-						       .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
-							       {
-								       .binding = 0,
-								       .stride = sizeof(struct vertex_attrs),
-								       .inputRate = VK_VERTEX_INPUT_RATE_VERTEX
-							       },
-						       },
-						       .vertexAttributeDescriptionCount = 1,
-						       .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
-							       {
-								       /* Position */
-								       .location = 0,
-								       .binding = 0,
-								       .format = VK_FORMAT_R32G32_SFLOAT,
-								       .offset = offsetof(struct vertex_attrs, position),
-							       },
-						       },
+						       .vertexBindingDescriptionCount = 0,
+						       .vertexAttributeDescriptionCount = 0,
 					       },
 					       .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
 						       .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
@@ -288,7 +240,7 @@
 
 	zero(device->meta_state.resolve);
 
-	struct radv_shader_module vs_module = { .nir = build_nir_vs() };
+	struct radv_shader_module vs_module = { .nir = radv_meta_build_nir_vs_generate_vertices() };
 	if (!vs_module.nir) {
 		/* XXX: Need more accurate error */
 		res = VK_ERROR_OUT_OF_HOST_MEMORY;
@@ -322,44 +274,8 @@
 {
 	struct radv_device *device = cmd_buffer->device;
 	VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
-	uint32_t offset;
-	const struct vertex_attrs vertex_data[3] = {
-		{
-			.position = {
-				-1.0,
-				-1.0,
-			},
-		},
-		{
-			.position = {
-				-1.0,
-				1.0,
-			},
-		},
-		{
-			.position = {
-				1.0,
-				-1.0,
-			},
-		},
-	};
 
 	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
-	radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset);
-	struct radv_buffer vertex_buffer = {
-		.device = device,
-		.size = sizeof(vertex_data),
-		.bo = cmd_buffer->upload.upload_bo,
-		.offset = offset,
-	};
-
-	VkBuffer vertex_buffer_h = radv_buffer_to_handle(&vertex_buffer);
-
-	radv_CmdBindVertexBuffers(cmd_buffer_h,
-				  /*firstBinding*/ 0,
-				  /*bindingCount*/ 1,
-				  (VkBuffer[]) { vertex_buffer_h },
-				  (VkDeviceSize[]) { 0 });
 
 	VkPipeline pipeline_h = device->meta_state.resolve.pipeline;
 	RADV_FROM_HANDLE(radv_pipeline, pipeline, pipeline_h);
@@ -387,6 +303,25 @@
 	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
 }
 
+enum radv_resolve_method {
+	RESOLVE_HW,
+	RESOLVE_COMPUTE,
+	RESOLVE_FRAGMENT,
+};
+
+/* Overrides *method only when the src/dest micro tile modes differ; otherwise leaves it as passed in. */
+static void radv_pick_resolve_method_images(struct radv_image *src_image,
+					    struct radv_image *dest_image,
+					    enum radv_resolve_method *method)
+{
+	if (dest_image->surface.micro_tile_mode != src_image->surface.micro_tile_mode) {
+		if (dest_image->surface.num_dcc_levels > 0)
+			*method = RESOLVE_FRAGMENT;
+		else
+			*method = RESOLVE_COMPUTE;
+	}
+}
+
 void radv_CmdResolveImage(
 	VkCommandBuffer                             cmd_buffer_h,
 	VkImage                                     src_image_h,
@@ -402,28 +337,39 @@
 	struct radv_device *device = cmd_buffer->device;
 	struct radv_meta_saved_state saved_state;
 	VkDevice device_h = radv_device_to_handle(device);
-	bool use_compute_resolve = false;
-
+	enum radv_resolve_method resolve_method = RESOLVE_HW;
 	/* we can use the hw resolve only for single full resolves */
 	if (region_count == 1) {
 		if (regions[0].srcOffset.x ||
 		    regions[0].srcOffset.y ||
 		    regions[0].srcOffset.z)
-			use_compute_resolve = true;
+			resolve_method = RESOLVE_COMPUTE;
 		if (regions[0].dstOffset.x ||
 		    regions[0].dstOffset.y ||
 		    regions[0].dstOffset.z)
-			use_compute_resolve = true;
+			resolve_method = RESOLVE_COMPUTE;
 
-		if (regions[0].extent.width != src_image->extent.width ||
-		    regions[0].extent.height != src_image->extent.height ||
-		    regions[0].extent.depth != src_image->extent.depth)
-			use_compute_resolve = true;
+		if (regions[0].extent.width != src_image->info.width ||
+		    regions[0].extent.height != src_image->info.height ||
+		    regions[0].extent.depth != src_image->info.depth)
+			resolve_method = RESOLVE_COMPUTE;
 	} else
-		use_compute_resolve = true;
+		resolve_method = RESOLVE_COMPUTE;
 
-	if (use_compute_resolve) {
+	radv_pick_resolve_method_images(src_image, dest_image,
+					&resolve_method);
 
+	if (resolve_method == RESOLVE_FRAGMENT) {
+		radv_meta_resolve_fragment_image(cmd_buffer,
+						 src_image,
+						 src_image_layout,
+						 dest_image,
+						 dest_image_layout,
+						 region_count, regions);
+		return;
+	}
+
+	if (resolve_method == RESOLVE_COMPUTE) {
 		radv_meta_resolve_compute_image(cmd_buffer,
 						src_image,
 						src_image_layout,
@@ -433,12 +379,17 @@
 		return;
 	}
 
-	radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer);
+	assert(src_image->info.samples > 1);
+	if (src_image->info.samples <= 1) {
+		/* Bail out before any meta state is saved (nothing to restore); going on would hang the GPU. */
+		fprintf(stderr, "radv: Illegal resolve operation (src not multisampled), will hang GPU.\n");
+		return;
+	}
+	assert(dest_image->info.samples == 1);
 
-	assert(src_image->samples > 1);
-	assert(dest_image->samples == 1);
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
 
-	if (src_image->samples >= 16) {
+	if (src_image->info.samples >= 16) {
 		/* See commit aa3f9aaf31e9056a255f9e0472ebdfdaa60abe54 for the
 		 * glBlitFramebuffer workaround for samples >= 16.
 		 */
@@ -446,7 +397,7 @@
 			      "samples >= 16");
 	}
 
-	if (src_image->array_size > 1)
+	if (src_image->info.array_size > 1)
 		radv_finishme("vkCmdResolveImage: multisample array images");
 
 	if (dest_image->surface.dcc_size) {
@@ -512,8 +463,7 @@
 							     .baseArrayLayer = src_base_layer + layer,
 							     .layerCount = 1,
 						     },
-							     },
-					     cmd_buffer, VK_IMAGE_USAGE_SAMPLED_BIT);
+					     });
 
 			struct radv_image_view dest_iview;
 			radv_image_view_init(&dest_iview, cmd_buffer->device,
@@ -529,8 +479,7 @@
 							     .baseArrayLayer = dest_base_layer + layer,
 							     .layerCount = 1,
 						     },
-							     },
-					     cmd_buffer, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT);
+					      });
 
 			VkFramebuffer fb_h;
 			radv_CreateFramebuffer(device_h,
@@ -541,9 +490,9 @@
 							       radv_image_view_to_handle(&src_iview),
 							       radv_image_view_to_handle(&dest_iview),
 						       },
-						       .width = radv_minify(dest_image->extent.width,
+						       .width = radv_minify(dest_image->info.width,
 									    region->dstSubresource.mipLevel),
-						       .height = radv_minify(dest_image->extent.height,
+						       .height = radv_minify(dest_image->info.height,
 									      region->dstSubresource.mipLevel),
 						       .layers = 1
 					       },
@@ -599,6 +548,7 @@
 	struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
 	struct radv_meta_saved_state saved_state;
+	enum radv_resolve_method resolve_method = RESOLVE_HW;
 
 	/* FINISHME(perf): Skip clears for resolve attachments.
 	 *
@@ -612,15 +562,43 @@
 	if (!subpass->has_resolve)
 		return;
 
-	radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer);
+	for (uint32_t i = 0; i < subpass->color_count; ++i) {
+		VkAttachmentReference src_att = subpass->color_attachments[i];
+		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+
+		if (src_att.attachment == VK_ATTACHMENT_UNUSED ||
+		    dest_att.attachment == VK_ATTACHMENT_UNUSED)
+			continue;
+
+		struct radv_image *dst_img = cmd_buffer->state.framebuffer->attachments[dest_att.attachment].attachment->image;
+		struct radv_image *src_img = cmd_buffer->state.framebuffer->attachments[src_att.attachment].attachment->image;
+
+		radv_pick_resolve_method_images(src_img, dst_img, &resolve_method);
+		if (resolve_method == RESOLVE_FRAGMENT) {
+			break;
+		}
+	}
+
+	if (resolve_method == RESOLVE_COMPUTE) {
+		radv_cmd_buffer_resolve_subpass_cs(cmd_buffer);
+		return;
+	} else if (resolve_method == RESOLVE_FRAGMENT) {
+		radv_cmd_buffer_resolve_subpass_fs(cmd_buffer);
+		return;
+	}
+
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
 
 	for (uint32_t i = 0; i < subpass->color_count; ++i) {
 		VkAttachmentReference src_att = subpass->color_attachments[i];
 		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
-		struct radv_image *dst_img = cmd_buffer->state.framebuffer->attachments[dest_att.attachment].attachment->image;
-		if (dest_att.attachment == VK_ATTACHMENT_UNUSED)
+
+		if (src_att.attachment == VK_ATTACHMENT_UNUSED ||
+		    dest_att.attachment == VK_ATTACHMENT_UNUSED)
 			continue;
 
+		struct radv_image *dst_img = cmd_buffer->state.framebuffer->attachments[dest_att.attachment].attachment->image;
+
 		if (dst_img->surface.dcc_size) {
 			radv_initialize_dcc(cmd_buffer, dst_img, 0xffffffff);
 			cmd_buffer->state.attachments[dest_att.attachment].current_layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
@@ -634,13 +612,6 @@
 
 		radv_cmd_buffer_set_subpass(cmd_buffer, &resolve_subpass, false);
 
-		/* Subpass resolves must respect the render area. We can ignore the
-		 * render area here because vkCmdBeginRenderPass set the render area
-		 * with 3DSTATE_DRAWING_RECTANGLE.
-		 *
-		 * XXX(chadv): Does the hardware really respect
-		 * 3DSTATE_DRAWING_RECTANGLE when draing a 3DPRIM_RECTLIST?
-		 */
 		emit_resolve(cmd_buffer,
 			     &(VkOffset2D) { 0, 0 },
 			     &(VkExtent2D) { fb->width, fb->height });
diff --git a/src/amd/vulkan/radv_meta_resolve_cs.c b/src/amd/vulkan/radv_meta_resolve_cs.c
index ffa07ca..ce02884 100644
--- a/src/amd/vulkan/radv_meta_resolve_cs.c
+++ b/src/amd/vulkan/radv_meta_resolve_cs.c
@@ -31,12 +31,50 @@
 #include "sid.h"
 #include "vk_format.h"
 
+/* Emits NIR applying the linear -> sRGB curve to channels 0-2 of input; channel 3 is copied as-is. */
+static nir_ssa_def *radv_meta_build_resolve_srgb_conversion(nir_builder *b,
+							    nir_ssa_def *input)
+{
+	nir_const_value v;
+	unsigned i;
+	v.u32[0] = 0x3b4d2e1c; // 0.00313080009
+
+	nir_ssa_def *cmp[3]; /* per channel: c < the threshold loaded into v above */
+	for (i = 0; i < 3; i++)
+		cmp[i] = nir_flt(b, nir_channel(b, input, i),
+				 nir_build_imm(b, 1, 32, v));
+
+	nir_ssa_def *ltvals[3]; /* result used below the threshold: 12.92 * c */
+	v.f32[0] = 12.92;
+	for (i = 0; i < 3; i++)
+		ltvals[i] = nir_fmul(b, nir_channel(b, input, i),
+				     nir_build_imm(b, 1, 32, v));
+
+	nir_ssa_def *gtvals[3]; /* result used otherwise: 1.055 * pow(c, 1/2.4) - 0.055 */
+	for (i = 0; i < 3; i++) {
+		v.f32[0] = 1.0/2.4;
+		gtvals[i] = nir_fpow(b, nir_channel(b, input, i),
+				     nir_build_imm(b, 1, 32, v));
+		v.f32[0] = 1.055;
+		gtvals[i] = nir_fmul(b, gtvals[i],
+				     nir_build_imm(b, 1, 32, v));
+		v.f32[0] = 0.055;
+		gtvals[i] = nir_fsub(b, gtvals[i],
+				     nir_build_imm(b, 1, 32, v));
+	}
+
+	nir_ssa_def *comp[4]; /* selected value for channels 0-2, then input channel 3 unchanged */
+	for (i = 0; i < 3; i++)
+		comp[i] = nir_bcsel(b, cmp[i], ltvals[i], gtvals[i]);
+	comp[3] = nir_channels(b, input, 1 << 3);
+	return nir_vec(b, comp, 4);
+}
+
 static nir_shader *
-build_resolve_compute_shader(struct radv_device *dev, bool is_integer, int samples)
+build_resolve_compute_shader(struct radv_device *dev, bool is_integer, bool is_srgb, int samples)
 {
 	nir_builder b;
 	char name[64];
-	nir_if *outer_if = NULL;
 	const struct glsl_type *sampler_type = glsl_sampler_type(GLSL_SAMPLER_DIM_MS,
 								 false,
 								 false,
@@ -45,12 +83,12 @@
 							     false,
 							     false,
 							     GLSL_TYPE_FLOAT);
-	snprintf(name, 64, "meta_resolve_cs-%d-%s", samples, is_integer ? "int" : "float");
+	snprintf(name, 64, "meta_resolve_cs-%d-%s", samples, is_integer ? "int" : (is_srgb ? "srgb" : "float"));
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, name);
-	b.shader->info->cs.local_size[0] = 16;
-	b.shader->info->cs.local_size[1] = 16;
-	b.shader->info->cs.local_size[2] = 1;
+	b.shader->info.name = ralloc_strdup(b.shader, name);
+	b.shader->info.cs.local_size[0] = 16;
+	b.shader->info.cs.local_size[1] = 16;
+	b.shader->info.cs.local_size[2] = 1;
 
 	nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform,
 						      sampler_type, "s_tex");
@@ -64,105 +102,43 @@
 	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
 	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
-						b.shader->info->cs.local_size[0],
-						b.shader->info->cs.local_size[1],
-						b.shader->info->cs.local_size[2], 0);
+						b.shader->info.cs.local_size[0],
+						b.shader->info.cs.local_size[1],
+						b.shader->info.cs.local_size[2], 0);
 
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 
 	nir_intrinsic_instr *src_offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(src_offset, 0);
+	nir_intrinsic_set_range(src_offset, 16);
 	src_offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
 	src_offset->num_components = 2;
 	nir_ssa_dest_init(&src_offset->instr, &src_offset->dest, 2, 32, "src_offset");
 	nir_builder_instr_insert(&b, &src_offset->instr);
 
 	nir_intrinsic_instr *dst_offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(dst_offset, 0);
+	nir_intrinsic_set_range(dst_offset, 16);
 	dst_offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 8));
 	dst_offset->num_components = 2;
 	nir_ssa_dest_init(&dst_offset->instr, &dst_offset->dest, 2, 32, "dst_offset");
 	nir_builder_instr_insert(&b, &dst_offset->instr);
 
 	nir_ssa_def *img_coord = nir_channels(&b, nir_iadd(&b, global_id, &src_offset->dest.ssa), 0x3);
-	/* do a txf_ms on each sample */
-	nir_ssa_def *tmp;
+	nir_variable *color = nir_local_variable_create(b.impl, glsl_vec4_type(), "color");
 
-	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
-	tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
-	tex->op = nir_texop_txf_ms;
-	tex->src[0].src_type = nir_tex_src_coord;
-	tex->src[0].src = nir_src_for_ssa(img_coord);
-	tex->src[1].src_type = nir_tex_src_ms_index;
-	tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0));
-	tex->dest_type = nir_type_float;
-	tex->is_array = false;
-	tex->coord_components = 2;
-	tex->texture = nir_deref_var_create(tex, input_img);
-	tex->sampler = NULL;
+	radv_meta_build_resolve_shader_core(&b, is_integer, samples, input_img,
+	                                    color, img_coord);
 
-	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
-	nir_builder_instr_insert(&b, &tex->instr);
+	nir_ssa_def *outval = nir_load_var(&b, color);
+	if (is_srgb)
+		outval = radv_meta_build_resolve_srgb_conversion(&b, outval);
 
-	tmp = &tex->dest.ssa;
-	nir_variable *color =
-		nir_local_variable_create(b.impl, glsl_vec4_type(), "color");
-
-	if (!is_integer && samples > 1) {
-		nir_tex_instr *tex_all_same = nir_tex_instr_create(b.shader, 1);
-		tex_all_same->sampler_dim = GLSL_SAMPLER_DIM_MS;
-		tex_all_same->op = nir_texop_samples_identical;
-		tex_all_same->src[0].src_type = nir_tex_src_coord;
-		tex_all_same->src[0].src = nir_src_for_ssa(img_coord);
-		tex_all_same->dest_type = nir_type_float;
-		tex_all_same->is_array = false;
-		tex_all_same->coord_components = 2;
-		tex_all_same->texture = nir_deref_var_create(tex_all_same, input_img);
-		tex_all_same->sampler = NULL;
-
-		nir_ssa_dest_init(&tex_all_same->instr, &tex_all_same->dest, 1, 32, "tex");
-		nir_builder_instr_insert(&b, &tex_all_same->instr);
-
-		nir_ssa_def *all_same = nir_ine(&b, &tex_all_same->dest.ssa, nir_imm_int(&b, 0));
-		nir_if *if_stmt = nir_if_create(b.shader);
-		if_stmt->condition = nir_src_for_ssa(all_same);
-		nir_cf_node_insert(b.cursor, &if_stmt->cf_node);
-
-		b.cursor = nir_after_cf_list(&if_stmt->then_list);
-		for (int i = 1; i < samples; i++) {
-			nir_tex_instr *tex_add = nir_tex_instr_create(b.shader, 2);
-			tex_add->sampler_dim = GLSL_SAMPLER_DIM_MS;
-			tex_add->op = nir_texop_txf_ms;
-			tex_add->src[0].src_type = nir_tex_src_coord;
-			tex_add->src[0].src = nir_src_for_ssa(img_coord);
-			tex_add->src[1].src_type = nir_tex_src_ms_index;
-			tex_add->src[1].src = nir_src_for_ssa(nir_imm_int(&b, i));
-			tex_add->dest_type = nir_type_float;
-			tex_add->is_array = false;
-			tex_add->coord_components = 2;
-			tex_add->texture = nir_deref_var_create(tex_add, input_img);
-			tex_add->sampler = NULL;
-
-			nir_ssa_dest_init(&tex_add->instr, &tex_add->dest, 4, 32, "tex");
-			nir_builder_instr_insert(&b, &tex_add->instr);
-
-			tmp = nir_fadd(&b, tmp, &tex_add->dest.ssa);
-		}
-
-		tmp = nir_fdiv(&b, tmp, nir_imm_float(&b, samples));
-		nir_store_var(&b, color, tmp, 0xf);
-		b.cursor = nir_after_cf_list(&if_stmt->else_list);
-		outer_if = if_stmt;
-	}
-	nir_store_var(&b, color, &tex->dest.ssa, 0xf);
-
-	if (outer_if)
-		b.cursor = nir_after_cf_node(&outer_if->cf_node);
-
-	nir_ssa_def *newv = nir_load_var(&b, color);
 	nir_ssa_def *coord = nir_iadd(&b, global_id, &dst_offset->dest.ssa);
 	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_store);
 	store->src[0] = nir_src_for_ssa(coord);
 	store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
-	store->src[2] = nir_src_for_ssa(newv);
+	store->src[2] = nir_src_for_ssa(outval);
 	store->variables[0] = nir_deref_var_create(store, output_img);
 	nir_builder_instr_insert(&b, &store->instr);
 	return b.shader;
@@ -230,12 +206,13 @@
 create_resolve_pipeline(struct radv_device *device,
 			int samples,
 			bool is_integer,
+			bool is_srgb,
 			VkPipeline *pipeline)
 {
 	VkResult result;
 	struct radv_shader_module cs = { .nir = NULL };
 
-	cs.nir = build_resolve_compute_shader(device, is_integer, samples);
+	cs.nir = build_resolve_compute_shader(device, is_integer, is_srgb, samples);
 
 	/* compute shader */
 
@@ -282,12 +259,15 @@
 	for (uint32_t i = 0; i < MAX_SAMPLES_LOG2; ++i) {
 		uint32_t samples = 1 << i;
 
-		res = create_resolve_pipeline(device, samples, false,
+		res = create_resolve_pipeline(device, samples, false, false,
 					      &state->resolve_compute.rc[i].pipeline);
 
-		res = create_resolve_pipeline(device, samples, true,
+		res = create_resolve_pipeline(device, samples, true, false,
 					      &state->resolve_compute.rc[i].i_pipeline);
 
+		res = create_resolve_pipeline(device, samples, false, true,
+					      &state->resolve_compute.rc[i].srgb_pipeline);
+
 	}
 
 	return res;
@@ -305,6 +285,10 @@
 		radv_DestroyPipeline(radv_device_to_handle(device),
 				     state->resolve_compute.rc[i].i_pipeline,
 				     &state->alloc);
+
+		radv_DestroyPipeline(radv_device_to_handle(device),
+				     state->resolve_compute.rc[i].srgb_pipeline,
+				     &state->alloc);
 	}
 
 	radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
@@ -315,6 +299,78 @@
 				   &state->alloc);
 }
 
+static void
+emit_resolve(struct radv_cmd_buffer *cmd_buffer,
+	     struct radv_image_view *src_iview,
+	     struct radv_image_view *dest_iview,
+	     const VkOffset2D *src_offset,
+             const VkOffset2D *dest_offset,
+             const VkExtent2D *resolve_extent)
+{	/* Record one compute resolve: bind both views, pick a pipeline, push the offsets, dispatch. */
+	struct radv_device *device = cmd_buffer->device;
+	const uint32_t samples = src_iview->image->info.samples;
+	const uint32_t samples_log2 = ffs(samples) - 1;
+	radv_meta_push_descriptor_set(cmd_buffer,
+				      VK_PIPELINE_BIND_POINT_COMPUTE,
+				      device->meta_state.resolve_compute.p_layout,
+				      0, /* set */
+				      2, /* descriptorWriteCount */
+				      (VkWriteDescriptorSet[]) {
+					{
+						.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+						.dstBinding = 0,
+						.dstArrayElement = 0,
+						.descriptorCount = 1,
+						.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+			                      .pImageInfo = (VkDescriptorImageInfo[]) {
+		                              {
+	                                      .sampler = VK_NULL_HANDLE,
+					      .imageView = radv_image_view_to_handle(src_iview),
+	                                      .imageLayout = VK_IMAGE_LAYOUT_GENERAL	                              },
+	                      }
+		              },
+		              {
+		                      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+		                      .dstBinding = 1,
+		                      .dstArrayElement = 0,
+				      .descriptorCount = 1,
+				      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+	                      .pImageInfo = (VkDescriptorImageInfo[]) {
+                              {
+                                      .sampler = VK_NULL_HANDLE,
+                                     .imageView = radv_image_view_to_handle(dest_iview),
+                                     .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+                              },
+                      }
+			      }
+				      });
+	/* The pipeline variant follows the source image format: integer, sRGB, or the default one. */
+	VkPipeline pipeline;
+	if (vk_format_is_int(src_iview->image->vk_format))
+		pipeline = device->meta_state.resolve_compute.rc[samples_log2].i_pipeline;
+	else if (vk_format_is_srgb(src_iview->image->vk_format))
+		pipeline = device->meta_state.resolve_compute.rc[samples_log2].srgb_pipeline;
+	else
+		pipeline = device->meta_state.resolve_compute.rc[samples_log2].pipeline;
+	if (cmd_buffer->state.compute_pipeline != radv_pipeline_from_handle(pipeline)) {
+		radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
+				     VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
+	}
+	/* 16 bytes of push constants: source x, y followed by destination x, y. */
+	unsigned push_constants[4] = {
+		src_offset->x,
+		src_offset->y,
+		dest_offset->x,
+		dest_offset->y,
+	};
+	radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+			      device->meta_state.resolve_compute.p_layout,
+			      VK_SHADER_STAGE_COMPUTE_BIT, 0, 16,
+			      push_constants);
+	radv_unaligned_dispatch(cmd_buffer, resolve_extent->width, resolve_extent->height, 1);
+
+}
+
 void radv_meta_resolve_compute_image(struct radv_cmd_buffer *cmd_buffer,
 				     struct radv_image *src_image,
 				     VkImageLayout src_image_layout,
@@ -323,10 +379,7 @@
 				     uint32_t region_count,
 				     const VkImageResolve *regions)
 {
-	struct radv_device *device = cmd_buffer->device;
 	struct radv_meta_saved_compute_state saved_state;
-	const uint32_t samples = src_image->samples;
-	const uint32_t samples_log2 = ffs(samples) - 1;
 
 	for (uint32_t r = 0; r < region_count; ++r) {
 		const VkImageResolve *region = &regions[r];
@@ -383,8 +436,7 @@
 							     .baseArrayLayer = src_base_layer + layer,
 							     .layerCount = 1,
 						     },
-					     },
-					     cmd_buffer, VK_IMAGE_USAGE_SAMPLED_BIT);
+					     });
 
 			struct radv_image_view dest_iview;
 			radv_image_view_init(&dest_iview, cmd_buffer->device,
@@ -392,7 +444,7 @@
 						     .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
 							     .image = radv_image_to_handle(dest_image),
 							     .viewType = radv_meta_get_view_type(dest_image),
-							     .format = dest_image->vk_format,
+							     .format = vk_to_non_srgb_format(dest_image->vk_format),
 							     .subresourceRange = {
 							     .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
 							     .baseMipLevel = region->dstSubresource.mipLevel,
@@ -400,68 +452,96 @@
 							     .baseArrayLayer = dest_base_layer + layer,
 							     .layerCount = 1,
 						     },
-							     },
-					     cmd_buffer, VK_IMAGE_USAGE_STORAGE_BIT);
+					     });
 
-
-			radv_meta_push_descriptor_set(cmd_buffer,
-						      VK_PIPELINE_BIND_POINT_COMPUTE,
-						      device->meta_state.resolve_compute.p_layout,
-						      0, /* set */
-						      2, /* descriptorWriteCount */
-						      (VkWriteDescriptorSet[]) {
-						              {
-						                      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
-						                      .dstBinding = 0,
-						                      .dstArrayElement = 0,
-						                      .descriptorCount = 1,
-						                      .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
-						                      .pImageInfo = (VkDescriptorImageInfo[]) {
-						                              {
-						                                      .sampler = VK_NULL_HANDLE,
-						                                      .imageView = radv_image_view_to_handle(&src_iview),
-						                                      .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
-						                              },
-						                      }
-						              },
-						              {
-						                      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
-						                      .dstBinding = 1,
-						                      .dstArrayElement = 0,
-						                      .descriptorCount = 1,
-						                      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-						                      .pImageInfo = (VkDescriptorImageInfo[]) {
-						                              {
-						                                      .sampler = VK_NULL_HANDLE,
-						                                      .imageView = radv_image_view_to_handle(&dest_iview),
-						                                      .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
-						                              },
-						                      }
-					                      }
-				                      });
-
-			VkPipeline pipeline;
-			if (vk_format_is_int(src_image->vk_format))
-				pipeline = device->meta_state.resolve_compute.rc[samples_log2].i_pipeline;
-			else
-				pipeline = device->meta_state.resolve_compute.rc[samples_log2].pipeline;
-			if (cmd_buffer->state.compute_pipeline != radv_pipeline_from_handle(pipeline)) {
-				radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
-						     VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
-			}
-
-			unsigned push_constants[4] = {
-				srcOffset.x,
-				srcOffset.y,
-				dstOffset.x,
-				dstOffset.y,
-			};
-			radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
-					      device->meta_state.resolve_compute.p_layout,
-					      VK_SHADER_STAGE_COMPUTE_BIT, 0, 16,
-					      push_constants);
-			radv_unaligned_dispatch(cmd_buffer, extent.width, extent.height, 1);
+			emit_resolve(cmd_buffer,
+				     &src_iview,
+				     &dest_iview,
+				     &(VkOffset2D) {srcOffset.x, srcOffset.y },
+				     &(VkOffset2D) {dstOffset.x, dstOffset.y },
+				     &(VkExtent2D) {extent.width, extent.height });
 		}
 	}
 	radv_meta_restore_compute(&saved_state, cmd_buffer, 16);
 }
+
+/**
+ * Emit any needed resolves for the current subpass using the compute path.
+ * Pairs whose colour or resolve reference is VK_ATTACHMENT_UNUSED are skipped.
+ */
+void
+radv_cmd_buffer_resolve_subpass_cs(struct radv_cmd_buffer *cmd_buffer)
+{
+	struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
+	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+	struct radv_meta_saved_compute_state saved_state;
+	/* FINISHME(perf): Skip clears for resolve attachments.
+	 *
+	 * From the Vulkan 1.0 spec:
+	 *
+	 *    If the first use of an attachment in a render pass is as a resolve
+	 *    attachment, then the loadOp is effectively ignored as the resolve is
+	 *    guaranteed to overwrite all pixels in the render area.
+	 */
+
+	if (!subpass->has_resolve)
+		return;
+
+	for (uint32_t i = 0; i < subpass->color_count; ++i) {
+		VkAttachmentReference src_att = subpass->color_attachments[i];
+		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+		if (src_att.attachment == VK_ATTACHMENT_UNUSED ||
+		    dest_att.attachment == VK_ATTACHMENT_UNUSED)
+			continue;
+
+		struct radv_image *dst_img = fb->attachments[dest_att.attachment].attachment->image;
+		struct radv_image_view *src_iview = fb->attachments[src_att.attachment].attachment;
+		if (dst_img->surface.dcc_size) {
+			radv_initialize_dcc(cmd_buffer, dst_img, 0xffffffff);
+			cmd_buffer->state.attachments[dest_att.attachment].current_layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+		}
+
+		VkImageSubresourceRange range;
+		range.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+		range.baseMipLevel = 0;
+		range.levelCount = 1;
+		range.baseArrayLayer = 0;
+		range.layerCount = 1;
+		radv_fast_clear_flush_image_inplace(cmd_buffer, src_iview->image, &range);
+	}
+
+	radv_meta_save_compute(&saved_state, cmd_buffer, 16);
+
+	for (uint32_t i = 0; i < subpass->color_count; ++i) {
+		VkAttachmentReference src_att = subpass->color_attachments[i];
+		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+		/* Test for UNUSED before indexing fb->attachments[]: it is not a valid index. */
+		if (src_att.attachment == VK_ATTACHMENT_UNUSED ||
+		    dest_att.attachment == VK_ATTACHMENT_UNUSED)
+			continue;
+		struct radv_image_view *src_iview = fb->attachments[src_att.attachment].attachment;
+		struct radv_image_view *dst_iview = fb->attachments[dest_att.attachment].attachment;
+		emit_resolve(cmd_buffer,
+			     src_iview,
+			     dst_iview,
+			     &(VkOffset2D) { 0, 0 },
+			     &(VkOffset2D) { 0, 0 },
+			     &(VkExtent2D) { fb->width, fb->height });
+	}
+
+	radv_meta_restore_compute(&saved_state, cmd_buffer, 16);
+
+	for (uint32_t i = 0; i < subpass->color_count; ++i) {
+		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+		if (dest_att.attachment == VK_ATTACHMENT_UNUSED)
+			continue;
+		struct radv_image *dst_img = fb->attachments[dest_att.attachment].attachment->image;
+		VkImageSubresourceRange range;
+		range.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+		range.baseMipLevel = 0;
+		range.levelCount = 1;
+		range.baseArrayLayer = 0;
+		range.layerCount = 1;
+		radv_fast_clear_flush_image_inplace(cmd_buffer, dst_img, &range);
+	}
+}
diff --git a/src/amd/vulkan/radv_meta_resolve_fs.c b/src/amd/vulkan/radv_meta_resolve_fs.c
new file mode 100644
index 0000000..373dd96
--- /dev/null
+++ b/src/amd/vulkan/radv_meta_resolve_fs.c
@@ -0,0 +1,646 @@
+/*
+ * Copyright © 2016 Dave Airlie
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "radv_meta.h"
+#include "radv_private.h"
+#include "nir/nir_builder.h"
+#include "sid.h"
+#include "vk_format.h"
+/* Build the vertex shader shared by the fragment-resolve pipelines; it only writes gl_Position. */
+static nir_shader *
+build_nir_vertex_shader(void)
+{
+	const struct glsl_type *vec4 = glsl_vec4_type();
+	nir_builder b;
+
+	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL);
+	b.shader->info.name = ralloc_strdup(b.shader, "meta_resolve_vs");
+
+	nir_variable *pos_out = nir_variable_create(b.shader, nir_var_shader_out,
+						    vec4, "gl_Position");
+	pos_out->data.location = VARYING_SLOT_POS;
+	/* The position comes from radv_meta_gen_rect_vertices(); no shader inputs are declared here. */
+	nir_ssa_def *outvec = radv_meta_gen_rect_vertices(&b);
+
+	nir_store_var(&b, pos_out, outvec, 0xf);
+	return b.shader;
+}
+/* Build the resolve fragment shader for `samples` samples; `dev` is not referenced in this function. */
+static nir_shader *
+build_resolve_fragment_shader(struct radv_device *dev, bool is_integer, int samples)
+{
+	nir_builder b;
+	char name[64];
+	const struct glsl_type *vec2 = glsl_vector_type(GLSL_TYPE_FLOAT, 2);
+	const struct glsl_type *vec4 = glsl_vec4_type();
+	const struct glsl_type *sampler_type = glsl_sampler_type(GLSL_SAMPLER_DIM_MS,
+								 false,
+								 false,
+								 GLSL_TYPE_FLOAT);
+
+	snprintf(name, 64, "meta_resolve_fs-%d-%s", samples, is_integer ? "int" : "float");
+	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
+	b.shader->info.name = ralloc_strdup(b.shader, name);
+
+	nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform,
+						      sampler_type, "s_tex");
+	input_img->data.descriptor_set = 0;
+	input_img->data.binding = 0;
+
+	nir_variable *fs_pos_in = nir_variable_create(b.shader, nir_var_shader_in, vec2, "fs_pos_in");
+	fs_pos_in->data.location = VARYING_SLOT_POS;
+
+	nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out,
+						      vec4, "f_color");
+	color_out->data.location = FRAG_RESULT_DATA0;
+	/* src_offset: two 32-bit components loaded from push-constant byte offset 0. */
+	nir_ssa_def *pos_in = nir_load_var(&b, fs_pos_in);
+	nir_intrinsic_instr *src_offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(src_offset, 0);
+	nir_intrinsic_set_range(src_offset, 8);
+	src_offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+	src_offset->num_components = 2;
+	nir_ssa_dest_init(&src_offset->instr, &src_offset->dest, 2, 32, "src_offset");
+	nir_builder_instr_insert(&b, &src_offset->instr);
+	/* Source texel coordinate = fragment position converted to integer + src_offset (x, y only). */
+	nir_ssa_def *pos_int = nir_f2i32(&b, pos_in);
+
+	nir_ssa_def *img_coord = nir_channels(&b, nir_iadd(&b, pos_int, &src_offset->dest.ssa), 0x3);
+	nir_variable *color = nir_local_variable_create(b.impl, glsl_vec4_type(), "color");
+	/* NOTE(review): the core helper is defined elsewhere; presumably it stores the result in `color` - confirm. */
+	radv_meta_build_resolve_shader_core(&b, is_integer, samples, input_img,
+	                                    color, img_coord);
+
+	nir_ssa_def *outval = nir_load_var(&b, color);
+	nir_store_var(&b, color_out, outval, 0xf);
+	return b.shader;
+}
+
+/* Create the descriptor set layout and pipeline layout stored in meta_state.resolve_fragment. */
+static VkResult
+create_layout(struct radv_device *device)
+{
+	VkResult result;
+	/*
+	 * one descriptor for the image being sampled
+	 */
+	VkDescriptorSetLayoutCreateInfo ds_create_info = {
+		.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+		.flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
+		.bindingCount = 1,
+		.pBindings = (VkDescriptorSetLayoutBinding[]) {
+			{
+				.binding = 0,
+				.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+				.descriptorCount = 1,
+				.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
+				.pImmutableSamplers = NULL
+			},
+		}
+	};
+
+	result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
+						&ds_create_info,
+						&device->meta_state.alloc,
+						&device->meta_state.resolve_fragment.ds_layout);
+	if (result != VK_SUCCESS)
+		goto fail;
+
+	/* One fragment-stage push-constant range of 8 bytes starting at offset 0. */
+	VkPipelineLayoutCreateInfo pl_create_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+		.setLayoutCount = 1,
+		.pSetLayouts = &device->meta_state.resolve_fragment.ds_layout,
+		.pushConstantRangeCount = 1,
+		.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_FRAGMENT_BIT, 0, 8},
+	};
+
+	result = radv_CreatePipelineLayout(radv_device_to_handle(device),
+					  &pl_create_info,
+					  &device->meta_state.alloc,
+					  &device->meta_state.resolve_fragment.p_layout);
+	if (result != VK_SUCCESS)
+		goto fail;
+	return VK_SUCCESS;
+fail:
+	return result;
+}
+/* Vertex input state declaring no bindings and no attributes. */
+static const VkPipelineVertexInputStateCreateInfo normal_vi_create_info = {
+	.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+	.vertexBindingDescriptionCount = 0,
+	.vertexAttributeDescriptionCount = 0,
+};
+/* Destination formats for which a resolve pipeline is created per sample count; read-only. */
+static const VkFormat pipeline_formats[] = {
+   VK_FORMAT_R8G8B8A8_UNORM,
+   VK_FORMAT_R8G8B8A8_UINT,
+   VK_FORMAT_R8G8B8A8_SINT,
+   VK_FORMAT_A2R10G10B10_UINT_PACK32,
+   VK_FORMAT_A2R10G10B10_SINT_PACK32,
+   VK_FORMAT_R16G16B16A16_UNORM,
+   VK_FORMAT_R16G16B16A16_SNORM,
+   VK_FORMAT_R16G16B16A16_UINT,
+   VK_FORMAT_R16G16B16A16_SINT,
+   VK_FORMAT_R32_SFLOAT,
+   VK_FORMAT_R32G32_SFLOAT,
+   VK_FORMAT_R32G32B32A32_SFLOAT
+};
+
+/*
+ * Create the render pass and graphics pipeline that resolve into `format` for
+ * 1 << samples_log2 samples, storing both in meta_state.resolve_fragment.
+ */
+static VkResult
+create_resolve_pipeline(struct radv_device *device,
+			int samples_log2,
+			VkFormat format)
+{
+	VkResult result;
+	bool is_integer = false;
+	uint32_t samples = 1 << samples_log2;
+	unsigned fs_key = radv_format_meta_fs_key(format);
+	const VkPipelineVertexInputStateCreateInfo *vi_create_info;
+	vi_create_info = &normal_vi_create_info;
+	if (vk_format_is_int(format))
+		is_integer = true;
+
+	struct radv_shader_module fs = { .nir = NULL };
+	fs.nir = build_resolve_fragment_shader(device, is_integer, samples);
+	struct radv_shader_module vs = {
+		.nir = build_nir_vertex_shader(),
+	};
+
+	VkRenderPass *rp = &device->meta_state.resolve_fragment.rc[samples_log2].render_pass[fs_key];
+
+	assert(!*rp);
+
+	VkPipeline *pipeline = &device->meta_state.resolve_fragment.rc[samples_log2].pipeline[fs_key];
+	assert(!*pipeline);
+
+	VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = {
+		{
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+			.stage = VK_SHADER_STAGE_VERTEX_BIT,
+			.module = radv_shader_module_to_handle(&vs),
+			.pName = "main",
+			.pSpecializationInfo = NULL
+		}, {
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+			.stage = VK_SHADER_STAGE_FRAGMENT_BIT,
+			.module = radv_shader_module_to_handle(&fs),
+			.pName = "main",
+			.pSpecializationInfo = NULL
+		},
+	};
+
+
+	result = radv_CreateRenderPass(radv_device_to_handle(device),
+				       &(VkRenderPassCreateInfo) {
+					       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+					       .attachmentCount = 1,
+					       .pAttachments = &(VkAttachmentDescription) {
+						       .format = format,
+						       .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
+						       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+						       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
+						       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
+					       },
+					       .subpassCount = 1,
+					       .pSubpasses = &(VkSubpassDescription) {
+						       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+						       .inputAttachmentCount = 0,
+						       .colorAttachmentCount = 1,
+						       .pColorAttachments = &(VkAttachmentReference) {
+							       .attachment = 0,
+							       .layout = VK_IMAGE_LAYOUT_GENERAL,
+						},
+					       .pResolveAttachments = NULL,
+					       .pDepthStencilAttachment = &(VkAttachmentReference) {
+						       .attachment = VK_ATTACHMENT_UNUSED,
+						       .layout = VK_IMAGE_LAYOUT_GENERAL,
+					       },
+					       .preserveAttachmentCount = 1,
+					       .pPreserveAttachments = (uint32_t[]) { 0 },
+				       },
+				       .dependencyCount = 0,
+						}, &device->meta_state.alloc, rp);
+	if (result != VK_SUCCESS)
+		goto cleanup;
+
+	const VkGraphicsPipelineCreateInfo vk_pipeline_info = {
+		.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+		.stageCount = ARRAY_SIZE(pipeline_shader_stages),
+		.pStages = pipeline_shader_stages,
+		.pVertexInputState = vi_create_info,
+		.pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+			.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
+			.primitiveRestartEnable = false,
+		},
+		.pViewportState = &(VkPipelineViewportStateCreateInfo) {
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+			.viewportCount = 1,
+			.scissorCount = 1,
+		},
+		.pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+			.rasterizerDiscardEnable = false,
+			.polygonMode = VK_POLYGON_MODE_FILL,
+			.cullMode = VK_CULL_MODE_NONE,
+			.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE
+		},
+		.pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+			.rasterizationSamples = 1,
+			.sampleShadingEnable = false,
+			.pSampleMask = (VkSampleMask[]) { UINT32_MAX },
+		},
+		.pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+			.attachmentCount = 1,
+			.pAttachments = (VkPipelineColorBlendAttachmentState []) {
+				{ .colorWriteMask =
+				  VK_COLOR_COMPONENT_A_BIT |
+				  VK_COLOR_COMPONENT_R_BIT |
+				  VK_COLOR_COMPONENT_G_BIT |
+				  VK_COLOR_COMPONENT_B_BIT },
+			}
+		},
+		.pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+			.dynamicStateCount = 9,
+			.pDynamicStates = (VkDynamicState[]) {
+				VK_DYNAMIC_STATE_VIEWPORT,
+				VK_DYNAMIC_STATE_SCISSOR,
+				VK_DYNAMIC_STATE_LINE_WIDTH,
+				VK_DYNAMIC_STATE_DEPTH_BIAS,
+				VK_DYNAMIC_STATE_BLEND_CONSTANTS,
+				VK_DYNAMIC_STATE_DEPTH_BOUNDS,
+				VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
+				VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
+				VK_DYNAMIC_STATE_STENCIL_REFERENCE,
+			},
+		},
+		.flags = 0,
+		.layout = device->meta_state.resolve_fragment.p_layout,
+		.renderPass = *rp,
+		.subpass = 0,
+	};
+
+	const struct radv_graphics_pipeline_create_info radv_pipeline_info = {
+		.use_rectlist = true
+	};
+
+	result = radv_graphics_pipeline_create(radv_device_to_handle(device),
+					       radv_pipeline_cache_to_handle(&device->meta_state.cache),
+					       &vk_pipeline_info, &radv_pipeline_info,
+					       &device->meta_state.alloc,
+					       pipeline);
+
+cleanup:
+	/* Single exit: the NIR shaders are freed exactly once on success and on either failure. */
+	ralloc_free(vs.nir);
+	ralloc_free(fs.nir);
+	return result;
+}
+
+/* Create the fragment-resolve layouts, then one pipeline per sample count and format. */
+VkResult
+radv_device_init_meta_resolve_fragment_state(struct radv_device *device)
+{
+	VkResult res;
+	memset(&device->meta_state.resolve_fragment, 0, sizeof(device->meta_state.resolve_fragment));
+
+	res = create_layout(device);
+	if (res != VK_SUCCESS)
+		return res;
+	for (uint32_t i = 0; i < MAX_SAMPLES_LOG2; ++i) {
+		for (unsigned j = 0; j < ARRAY_SIZE(pipeline_formats); ++j) {
+			res = create_resolve_pipeline(device, i, pipeline_formats[j]);
+			if (res != VK_SUCCESS)
+				return res; /* report the first failure instead of overwriting it */
+		}
+	}
+	return VK_SUCCESS;
+}
+/* Destroy every render pass and pipeline slot of resolve_fragment, then its two layouts. */
+void
+radv_device_finish_meta_resolve_fragment_state(struct radv_device *device)
+{
+	struct radv_meta_state *state = &device->meta_state;
+	for (uint32_t i = 0; i < MAX_SAMPLES_LOG2; ++i) {
+		for (unsigned j = 0; j < NUM_META_FS_KEYS; ++j) {
+			radv_DestroyRenderPass(radv_device_to_handle(device),
+					       state->resolve_fragment.rc[i].render_pass[j],
+					       &state->alloc);
+			radv_DestroyPipeline(radv_device_to_handle(device),
+					     state->resolve_fragment.rc[i].pipeline[j],
+					     &state->alloc);
+		}
+	}
+	/* There is a single descriptor set layout and pipeline layout, so they are destroyed once. */
+	radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
+					state->resolve_fragment.ds_layout,
+					&state->alloc);
+	radv_DestroyPipelineLayout(radv_device_to_handle(device),
+				   state->resolve_fragment.p_layout,
+				   &state->alloc);
+}
+/* Record the descriptors, push constants, pipeline, viewport/scissor and draw for one resolve. */
+static void
+emit_resolve(struct radv_cmd_buffer *cmd_buffer,
+	     struct radv_image_view *src_iview,
+	     struct radv_image_view *dest_iview,
+	     const VkOffset2D *src_offset,
+             const VkOffset2D *dest_offset,
+             const VkExtent2D *resolve_extent)
+{
+	struct radv_device *device = cmd_buffer->device;
+	VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
+	const uint32_t samples = src_iview->image->info.samples;
+	const uint32_t samples_log2 = ffs(samples) - 1;
+	radv_meta_push_descriptor_set(cmd_buffer,
+				      VK_PIPELINE_BIND_POINT_GRAPHICS,
+				      cmd_buffer->device->meta_state.resolve_fragment.p_layout,
+				      0, /* set */
+				      1, /* descriptorWriteCount */
+				      (VkWriteDescriptorSet[]) {
+					      {
+						      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+					              .dstBinding = 0,
+					              .dstArrayElement = 0,
+					              .descriptorCount = 1,
+					              .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+						      .pImageInfo = (VkDescriptorImageInfo[]) {
+						      {
+						      .sampler = VK_NULL_HANDLE,
+						      .imageView = radv_image_view_to_handle(src_iview),
+						      .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+						      },
+						      }
+					      },
+				      });
+	/* NOTE(review): the same CB flush/invalidate flag is also set again after the draw below. */
+	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
+
+	unsigned push_constants[2] = {
+		src_offset->x,
+		src_offset->y,
+	};
+	radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+			      device->meta_state.resolve_fragment.p_layout,
+			      VK_SHADER_STAGE_FRAGMENT_BIT, 0, 8,
+			      push_constants);
+	/* The pipeline is looked up by source sample count and by the destination view's format key. */
+	unsigned fs_key = radv_format_meta_fs_key(dest_iview->vk_format);
+	VkPipeline pipeline_h = device->meta_state.resolve_fragment.rc[samples_log2].pipeline[fs_key];
+
+	radv_CmdBindPipeline(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS,
+			     pipeline_h);
+	/* Viewport and scissor both cover the destination rectangle. */
+	radv_CmdSetViewport(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkViewport) {
+		.x = dest_offset->x,
+		.y = dest_offset->y,
+		.width = resolve_extent->width,
+		.height = resolve_extent->height,
+		.minDepth = 0.0f,
+		.maxDepth = 1.0f
+	});
+
+	radv_CmdSetScissor(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkRect2D) {
+		.offset = *dest_offset,
+		.extent = *resolve_extent,
+	});
+	/* 3 vertices, 1 instance; create_resolve_pipeline builds these pipelines with use_rectlist set. */
+	radv_CmdDraw(cmd_buffer_h, 3, 1, 0, 0);
+	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
+}
+
+/**
+ * Resolve each region of src_image into dest_image with the fragment-shader
+ * resolve pipelines, using one render pass and one draw per array layer.
+ * Regions must be color-aspect (asserted); the layout arguments are unused.
+ */
+void radv_meta_resolve_fragment_image(struct radv_cmd_buffer *cmd_buffer,
+				      struct radv_image *src_image,
+				      VkImageLayout src_image_layout,
+				      struct radv_image *dest_image,
+				      VkImageLayout dest_image_layout,
+				      uint32_t region_count,
+				      const VkImageResolve *regions)
+{
+	struct radv_device *device = cmd_buffer->device;
+	struct radv_meta_saved_state saved_state;
+	const uint32_t samples_log2 = ffs(src_image->info.samples) - 1;
+	unsigned fs_key = radv_format_meta_fs_key(dest_image->vk_format);
+	VkRenderPass rp;
+
+	/* Run the in-place fast-clear flush on every source range first. */
+	for (uint32_t r = 0; r < region_count; ++r) {
+		const VkImageResolve *region = &regions[r];
+		const uint32_t src_base_layer =
+			radv_meta_get_iview_layer(src_image, &region->srcSubresource,
+						  &region->srcOffset);
+		VkImageSubresourceRange range;
+		range.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+		range.baseMipLevel = region->srcSubresource.mipLevel;
+		range.levelCount = 1;
+		range.baseArrayLayer = src_base_layer;
+		range.layerCount = region->srcSubresource.layerCount;
+		radv_fast_clear_flush_image_inplace(cmd_buffer, src_image, &range);
+	}
+
+	rp = device->meta_state.resolve_fragment.rc[samples_log2].render_pass[fs_key];
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
+
+	for (uint32_t r = 0; r < region_count; ++r) {
+		const VkImageResolve *region = &regions[r];
+		assert(region->srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
+		assert(region->dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
+		assert(region->srcSubresource.layerCount == region->dstSubresource.layerCount);
+
+		const uint32_t src_base_layer =
+			radv_meta_get_iview_layer(src_image, &region->srcSubresource,
+						  &region->srcOffset);
+		const uint32_t dest_base_layer =
+			radv_meta_get_iview_layer(dest_image, &region->dstSubresource,
+						  &region->dstOffset);
+
+		const struct VkExtent3D extent =
+			radv_sanitize_image_extent(src_image->type, region->extent);
+		const struct VkOffset3D srcOffset =
+			radv_sanitize_image_offset(src_image->type, region->srcOffset);
+		const struct VkOffset3D dstOffset =
+			radv_sanitize_image_offset(dest_image->type, region->dstOffset);
+
+		for (uint32_t layer = 0; layer < region->srcSubresource.layerCount;
+		     ++layer) {
+			/* Single-level, single-layer view of the source layer. */
+			struct radv_image_view src_iview;
+			radv_image_view_init(&src_iview, cmd_buffer->device,
+					     &(VkImageViewCreateInfo) {
+						     .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+						     .image = radv_image_to_handle(src_image),
+						     .viewType = radv_meta_get_view_type(src_image),
+						     .format = src_image->vk_format,
+						     .subresourceRange = {
+							     .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+							     .baseMipLevel = region->srcSubresource.mipLevel,
+							     .levelCount = 1,
+							     .baseArrayLayer = src_base_layer + layer,
+							     .layerCount = 1,
+						     },
+					     });
+
+			/* Matching view of the destination layer, used as render target. */
+			struct radv_image_view dest_iview;
+			radv_image_view_init(&dest_iview, cmd_buffer->device,
+					     &(VkImageViewCreateInfo) {
+						     .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+						     .image = radv_image_to_handle(dest_image),
+						     .viewType = radv_meta_get_view_type(dest_image),
+						     .format = dest_image->vk_format,
+						     .subresourceRange = {
+							     .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+							     .baseMipLevel = region->dstSubresource.mipLevel,
+							     .levelCount = 1,
+							     .baseArrayLayer = dest_base_layer + layer,
+							     .layerCount = 1,
+						     },
+					     });
+
+			/* The render area starts at dstOffset, so the framebuffer must
+			 * reach dstOffset + extent. NOTE(review): result is unchecked. */
+			VkFramebuffer fb;
+			radv_CreateFramebuffer(radv_device_to_handle(cmd_buffer->device),
+			       &(VkFramebufferCreateInfo) {
+				       .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+				       .attachmentCount = 1,
+				       .pAttachments = (VkImageView[]) {
+					       radv_image_view_to_handle(&dest_iview),
+				       },
+				       .width = dstOffset.x + extent.width,
+				       .height = dstOffset.y + extent.height,
+				       .layers = 1
+				}, &cmd_buffer->pool->alloc, &fb);
+
+			radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
+						&(VkRenderPassBeginInfo) {
+							.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+							.renderPass = rp,
+							.framebuffer = fb,
+							.renderArea = {
+								.offset = { dstOffset.x, dstOffset.y, },
+								.extent = { extent.width, extent.height },
+							},
+							.clearValueCount = 0,
+							.pClearValues = NULL,
+						}, VK_SUBPASS_CONTENTS_INLINE);
+
+			emit_resolve(cmd_buffer, &src_iview, &dest_iview,
+				     &(VkOffset2D) { srcOffset.x, srcOffset.y },
+				     &(VkOffset2D) { dstOffset.x, dstOffset.y },
+				     &(VkExtent2D) { extent.width, extent.height });
+			radv_CmdEndRenderPass(radv_cmd_buffer_to_handle(cmd_buffer));
+			radv_DestroyFramebuffer(radv_device_to_handle(cmd_buffer->device), fb, &cmd_buffer->pool->alloc);
+		}
+	}
+
+	radv_meta_restore(&saved_state, cmd_buffer);
+}
+
+
+/**
+ * Resolve every color attachment of the current subpass that has a resolve
+ * attachment, using the fragment-shader resolve path.
+ */
+void
+radv_cmd_buffer_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer)
+{
+	struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
+	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+	struct radv_meta_saved_state saved_state;
+
+	/* FINISHME(perf): Skip clears for resolve attachments.
+	 *
+	 * From the Vulkan 1.0 spec:
+	 *
+	 *    If the first use of an attachment in a render pass is as a resolve
+	 *    attachment, then the loadOp is effectively ignored as the resolve is
+	 *    guaranteed to overwrite all pixels in the render area.
+	 */
+	if (!subpass->has_resolve)
+		return;
+
+	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);
+
+	for (uint32_t i = 0; i < subpass->color_count; ++i) {
+		const VkAttachmentReference src_att = subpass->color_attachments[i];
+		const VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+		/* Mip 0, layer 0 of the source is the only range flushed here. */
+		VkImageSubresourceRange range = {
+			.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+			.baseMipLevel = 0,
+			.levelCount = 1,
+			.baseArrayLayer = 0,
+			.layerCount = 1,
+		};
+
+		/* Nothing to do unless both ends of the pair are in use. */
+		if (src_att.attachment == VK_ATTACHMENT_UNUSED)
+			continue;
+		if (dest_att.attachment == VK_ATTACHMENT_UNUSED)
+			continue;
+
+		struct radv_image_view *dest_iview = cmd_buffer->state.framebuffer->attachments[dest_att.attachment].attachment;
+		struct radv_image_view *src_iview = cmd_buffer->state.framebuffer->attachments[src_att.attachment].attachment;
+		struct radv_image *dst_img = dest_iview->image;
+
+		if (dst_img->surface.dcc_size) {
+			radv_initialize_dcc(cmd_buffer, dst_img, 0xffffffff);
+			cmd_buffer->state.attachments[dest_att.attachment].current_layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+		}
+		radv_fast_clear_flush_image_inplace(cmd_buffer, src_iview->image, &range);
+
+		/* Temporary subpass with the resolve target as sole color attachment. */
+		struct radv_subpass resolve_subpass = {
+			.color_count = 1,
+			.color_attachments = (VkAttachmentReference[]) { dest_att },
+			.depth_stencil_attachment = { .attachment = VK_ATTACHMENT_UNUSED },
+		};
+		radv_cmd_buffer_set_subpass(cmd_buffer, &resolve_subpass, false);
+
+		emit_resolve(cmd_buffer, src_iview, dest_iview,
+			     &(VkOffset2D) { 0, 0 }, &(VkOffset2D) { 0, 0 },
+			     &(VkExtent2D) { fb->width, fb->height });
+	}
+
+	/* Put the caller's subpass pointer back before restoring saved state. */
+	cmd_buffer->state.subpass = subpass;
+	radv_meta_restore(&saved_state, cmd_buffer);
+}
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index e0c67ce..bf01026 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -26,6 +26,7 @@
  */
 
 #include "util/mesa-sha1.h"
+#include "util/u_atomic.h"
 #include "radv_private.h"
 #include "nir/nir.h"
 #include "nir/nir_builder.h"
@@ -35,12 +36,14 @@
 #include <llvm-c/TargetMachine.h>
 
 #include "sid.h"
+#include "gfx9d.h"
 #include "r600d_common.h"
 #include "ac_binary.h"
 #include "ac_llvm_util.h"
 #include "ac_nir_to_llvm.h"
 #include "vk_format.h"
 #include "util/debug.h"
+#include "ac_exp_param.h"
 
 void radv_shader_variant_destroy(struct radv_device *device,
                                  struct radv_shader_variant *variant);
@@ -50,6 +53,8 @@
 	.lower_scmp = true,
 	.lower_flrp32 = true,
 	.lower_fsat = true,
+	.lower_fdiv = true,
+	.lower_sub = true,
 	.lower_pack_snorm_2x16 = true,
 	.lower_pack_snorm_4x8 = true,
 	.lower_pack_unorm_2x16 = true,
@@ -60,6 +65,8 @@
 	.lower_unpack_unorm_4x8 = true,
 	.lower_extract_byte = true,
 	.lower_extract_word = true,
+	.lower_ffma = true,
+	.max_unroll_iterations = 32
 };
 
 VkResult radv_CreateShaderModule(
@@ -152,6 +159,13 @@
                 NIR_PASS(progress, shader, nir_copy_prop);
                 NIR_PASS(progress, shader, nir_opt_remove_phis);
                 NIR_PASS(progress, shader, nir_opt_dce);
+                if (nir_opt_trivial_continues(shader)) {
+                        progress = true;
+                        NIR_PASS(progress, shader, nir_copy_prop);
+                        NIR_PASS(progress, shader, nir_opt_remove_phis);
+                        NIR_PASS(progress, shader, nir_opt_dce);
+                }
+                NIR_PASS(progress, shader, nir_opt_if);
                 NIR_PASS(progress, shader, nir_opt_dead_cf);
                 NIR_PASS(progress, shader, nir_opt_cse);
                 NIR_PASS(progress, shader, nir_opt_peephole_select, 8);
@@ -159,6 +173,9 @@
                 NIR_PASS(progress, shader, nir_opt_constant_folding);
                 NIR_PASS(progress, shader, nir_opt_undef);
                 NIR_PASS(progress, shader, nir_opt_conditional_discard);
+                if (shader->options->max_unroll_iterations) {
+                        NIR_PASS(progress, shader, nir_opt_loop_unroll, 0);
+                }
         } while (progress);
 }
 
@@ -214,6 +231,8 @@
 			.image_read_without_format = true,
 			.image_write_without_format = true,
 			.tessellation = true,
+			.int64 = true,
+			.variable_pointers = true,
 		};
 		entry_point = spirv_to_nir(spirv, module->size / 4,
 					   spec_entries, num_spec_entries,
@@ -252,12 +271,36 @@
 	}
 
 	/* Vulkan uses the separate-shader linking model */
-	nir->info->separate_shader = true;
+	nir->info.separate_shader = true;
 
 	nir_shader_gather_info(nir, entry_point->impl);
 
+	/* While it would be nice not to have this flag, we are constrained
+	 * by the reality that LLVM 5.0 doesn't have working VGPR indexing
+	 * on GFX9.
+	 */
+	bool llvm_has_working_vgpr_indexing =
+		device->physical_device->rad_info.chip_class <= VI;
+
+	/* TODO: Indirect indexing of GS inputs is unimplemented.
+	 *
+	 * TCS and TES load inputs directly from LDS or offchip memory, so
+	 * indirect indexing is trivial.
+	 */
 	nir_variable_mode indirect_mask = 0;
 	indirect_mask |= nir_var_shader_in;
+
+	if (!llvm_has_working_vgpr_indexing &&
+	    nir->info.stage != MESA_SHADER_TESS_CTRL)
+		indirect_mask |= nir_var_shader_out;
+
+        /* TODO: We shouldn't need to do this, however LLVM isn't currently
+	 * smart enough to handle indirects without causing excess spilling
+	 * causing the gpu to hang.
+	 *
+	 * See the following thread for more details of the problem:
+	 * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html
+	 */
 	indirect_mask |= nir_var_local;
 
 	nir_lower_indirect_derefs(nir, indirect_mask);
@@ -361,7 +404,7 @@
 void radv_shader_variant_destroy(struct radv_device *device,
                                  struct radv_shader_variant *variant)
 {
-	if (__sync_fetch_and_sub(&variant->ref_count, 1) != 1)
+	if (!p_atomic_dec_zero(&variant->ref_count))
 		return;
 
 	device->ws->buffer_destroy(variant->bo);
@@ -444,12 +487,16 @@
 		options.key = *key;
 
 	struct ac_shader_binary binary;
-
+	enum ac_target_machine_options tm_options = 0;
 	options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH);
 	options.family = chip_family;
 	options.chip_class = device->physical_device->rad_info.chip_class;
 	options.supports_spill = device->llvm_supports_spill;
-	tm = ac_create_target_machine(chip_family, options.supports_spill);
+	if (options.supports_spill)
+		tm_options |= AC_TM_SUPPORTS_SPILL;
+	if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED)
+		tm_options |= AC_TM_SISCHED;
+	tm = ac_create_target_machine(chip_family, tm_options);
 	ac_compile_nir_shader(tm, &binary, &variant->config,
 			      &variant->info, shader, &options, dump);
 	LLVMDisposeTargetMachine(tm);
@@ -485,10 +532,14 @@
 
 	struct ac_nir_compiler_options options = {0};
 	struct ac_shader_binary binary;
+	enum ac_target_machine_options tm_options = 0;
 	options.family = chip_family;
 	options.chip_class = pipeline->device->physical_device->rad_info.chip_class;
-	options.supports_spill = pipeline->device->llvm_supports_spill;
-	tm = ac_create_target_machine(chip_family, options.supports_spill);
+	if (options.supports_spill)
+		tm_options |= AC_TM_SUPPORTS_SPILL;
+	if (pipeline->device->instance->perftest_flags & RADV_PERFTEST_SISCHED)
+		tm_options |= AC_TM_SISCHED;
+	tm = ac_create_target_machine(chip_family, tm_options);
 	ac_create_gs_copy_shader(tm, nir, &binary, &variant->config, &variant->info, &options, dump_shader);
 	LLVMDisposeTargetMachine(tm);
 
@@ -527,8 +578,8 @@
 	bool dump = (pipeline->device->debug_flags & RADV_DEBUG_DUMP_SHADERS);
 
 	if (module->nir)
-		_mesa_sha1_compute(module->nir->info->name,
-				   strlen(module->nir->info->name),
+		_mesa_sha1_compute(module->nir->info.name,
+				   strlen(module->nir->info.name),
 				   module->sha1);
 
 	radv_hash_shader(sha1, module, entrypoint, spec_info, layout, key, 0);
@@ -592,11 +643,14 @@
 }
 
 static union ac_shader_variant_key
-radv_compute_tes_key(bool as_es)
+radv_compute_tes_key(bool as_es, bool export_prim_id)
 {
 	union ac_shader_variant_key key;
 	memset(&key, 0, sizeof(key));
 	key.tes.as_es = as_es;
+	/* export prim id only happens when no geom shader */
+	if (!as_es)
+		key.tes.export_prim_id = export_prim_id;
 	return key;
 }
 
@@ -627,13 +681,15 @@
 	nir_shader *tes_nir, *tcs_nir;
 	void *tes_code = NULL, *tcs_code = NULL;
 	unsigned tes_code_size = 0, tcs_code_size = 0;
-	union ac_shader_variant_key tes_key = radv_compute_tes_key(radv_pipeline_has_gs(pipeline));
+	union ac_shader_variant_key tes_key;
 	union ac_shader_variant_key tcs_key;
 	bool dump = (pipeline->device->debug_flags & RADV_DEBUG_DUMP_SHADERS);
 
+	tes_key = radv_compute_tes_key(radv_pipeline_has_gs(pipeline),
+				       pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.prim_id_input);
 	if (tes_module->nir)
-		_mesa_sha1_compute(tes_module->nir->info->name,
-				   strlen(tes_module->nir->info->name),
+		_mesa_sha1_compute(tes_module->nir->info.name,
+				   strlen(tes_module->nir->info.name),
 				   tes_module->sha1);
 	radv_hash_shader(tes_sha1, tes_module, tes_entrypoint, tes_spec_info, layout, &tes_key, 0);
 
@@ -645,8 +701,8 @@
 		tcs_key = radv_compute_tcs_key(tes_variant->info.tes.primitive_mode, input_vertices);
 
 		if (tcs_module->nir)
-			_mesa_sha1_compute(tcs_module->nir->info->name,
-					   strlen(tcs_module->nir->info->name),
+			_mesa_sha1_compute(tcs_module->nir->info.name,
+					   strlen(tcs_module->nir->info.name),
 					   tcs_module->sha1);
 
 		radv_hash_shader(tcs_sha1, tcs_module, tcs_entrypoint, tcs_spec_info, layout, &tcs_key, 0);
@@ -675,16 +731,16 @@
 		return;
 
 	nir_lower_tes_patch_vertices(tes_nir,
-				     tcs_nir->info->tess.tcs_vertices_out);
+				     tcs_nir->info.tess.tcs_vertices_out);
 
 	tes_variant = radv_shader_variant_create(pipeline->device, tes_nir,
 						 layout, &tes_key, &tes_code,
 						 &tes_code_size, dump);
 
-	tcs_key = radv_compute_tcs_key(tes_nir->info->tess.primitive_mode, input_vertices);
+	tcs_key = radv_compute_tcs_key(tes_nir->info.tess.primitive_mode, input_vertices);
 	if (tcs_module->nir)
-		_mesa_sha1_compute(tcs_module->nir->info->name,
-				   strlen(tcs_module->nir->info->name),
+		_mesa_sha1_compute(tcs_module->nir->info.name,
+				   strlen(tcs_module->nir->info.name),
 				   tcs_module->sha1);
 
 	radv_hash_shader(tcs_sha1, tcs_module, tcs_entrypoint, tcs_spec_info, layout, &tcs_key, 0);
@@ -822,6 +878,79 @@
 	}
 }
 
+/* Map a VkBlendOp to the V_028760_OPT_COMB_* value placed in sx_mrt_blend_opt. */
+static uint32_t si_translate_blend_opt_function(VkBlendOp op)
+{
+	uint32_t comb_fcn = V_028760_OPT_COMB_BLEND_DISABLED;
+
+	if (op == VK_BLEND_OP_ADD)
+		comb_fcn = V_028760_OPT_COMB_ADD;
+	else if (op == VK_BLEND_OP_SUBTRACT)
+		comb_fcn = V_028760_OPT_COMB_SUBTRACT;
+	else if (op == VK_BLEND_OP_REVERSE_SUBTRACT)
+		comb_fcn = V_028760_OPT_COMB_REVSUBTRACT;
+	else if (op == VK_BLEND_OP_MIN)
+		comb_fcn = V_028760_OPT_COMB_MIN;
+	else if (op == VK_BLEND_OP_MAX)
+		comb_fcn = V_028760_OPT_COMB_MAX;
+	return comb_fcn;
+}
+
+/* Pick the V_028760_BLEND_OPT_* value for a blend factor; is_alpha selects the alpha variant. */
+static uint32_t si_translate_blend_opt_factor(VkBlendFactor factor, bool is_alpha)
+{
+	if (factor == VK_BLEND_FACTOR_ZERO)
+		return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
+	if (factor == VK_BLEND_FACTOR_ONE)
+		return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
+	if (factor == VK_BLEND_FACTOR_SRC_ALPHA)
+		return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
+	if (factor == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
+		return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
+
+	/* The remaining handled factors differ between color and alpha. */
+	if (factor == VK_BLEND_FACTOR_SRC_COLOR)
+		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
+				: V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
+	if (factor == VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR)
+		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
+				: V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
+	if (factor == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE)
+		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
+				: V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
+	return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+}
+
+/**
+ * Rewrite func(src * expected_dst, dst * 0) as func(src * 0, dst * replacement_src)
+ * so the source factor no longer refers to the destination; no-op otherwise.
+ */
+static void si_blend_remove_dst(unsigned *func, unsigned *src_factor,
+				unsigned *dst_factor, unsigned expected_dst,
+				unsigned replacement_src)
+{
+	if (*src_factor != expected_dst || *dst_factor != VK_BLEND_FACTOR_ZERO)
+		return;
+
+	*src_factor = VK_BLEND_FACTOR_ZERO;
+	*dst_factor = replacement_src;
+
+	/* Swapping the operands flips the direction of a subtraction. */
+	if (*func == VK_BLEND_OP_SUBTRACT)
+		*func = VK_BLEND_OP_REVERSE_SUBTRACT;
+	else if (*func == VK_BLEND_OP_REVERSE_SUBTRACT)
+		*func = VK_BLEND_OP_SUBTRACT;
+}
+
+/* True when the factor is one of the DST_* factors or SRC_ALPHA_SATURATE. */
+static bool si_blend_factor_uses_dst(unsigned factor)
+{
+	const bool dst_color = factor == VK_BLEND_FACTOR_DST_COLOR || factor == VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR;
+	const bool dst_alpha = factor == VK_BLEND_FACTOR_DST_ALPHA || factor == VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA;
+
+	return dst_color || dst_alpha || factor == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE;
+}
+
 static bool is_dual_src(VkBlendFactor factor)
 {
 	switch (factor) {
@@ -1005,14 +1134,17 @@
 	unsigned col_format = 0;
 
 	for (unsigned i = 0; i < (single_cb_enable ? 1 : subpass->color_count); ++i) {
-		struct radv_render_pass_attachment *attachment;
 		unsigned cf;
 
-		attachment = pass->attachments + subpass->color_attachments[i].attachment;
+		if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
+			cf = V_028714_SPI_SHADER_ZERO;
+		} else {
+			struct radv_render_pass_attachment *attachment = pass->attachments + subpass->color_attachments[i].attachment;
 
-		cf = si_choose_spi_color_format(attachment->format,
-						blend_enable & (1 << i),
-						blend_need_alpha & (1 << i));
+			cf = si_choose_spi_color_format(attachment->format,
+			                                blend_enable & (1 << i),
+			                                blend_need_alpha & (1 << i));
+		}
 
 		col_format |= cf << (4 * i);
 	}
@@ -1034,31 +1166,51 @@
 	       desc->channel[channel].size == 8;
 }
 
+/* True if a 4-channel format has at least one pure-integer channel of size 10. */
+static bool
+format_is_int10(VkFormat format)
+{
+	const struct vk_format_description *desc = vk_format_description(format);
+	bool found = false;
+
+	if (desc->nr_channels == 4) {
+		for (unsigned c = 0; c < 4 && !found; c++)
+			found = desc->channel[c].pure_integer && desc->channel[c].size == 10;
+	}
+	return found;
+}
+
 unsigned radv_format_meta_fs_key(VkFormat format)
 {
 	unsigned col_format = si_choose_spi_color_format(format, false, false) - 1;
 	bool is_int8 = format_is_int8(format);
+	bool is_int10 = format_is_int10(format);
 
-	return col_format + (is_int8 ? 3 : 0);
+	return col_format + (is_int8 ? 3 : is_int10 ? 5 : 0);
 }
 
-static unsigned
-radv_pipeline_compute_is_int8(const VkGraphicsPipelineCreateInfo *pCreateInfo)
+static void
+radv_pipeline_compute_get_int_clamp(const VkGraphicsPipelineCreateInfo *pCreateInfo,
+				    unsigned *is_int8, unsigned *is_int10)
 {
 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
-	unsigned is_int8 = 0;
+	*is_int8 = 0;
+	*is_int10 = 0;
 
 	for (unsigned i = 0; i < subpass->color_count; ++i) {
 		struct radv_render_pass_attachment *attachment;
 
+		if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
+			continue;
+
 		attachment = pass->attachments + subpass->color_attachments[i].attachment;
 
 		if (format_is_int8(attachment->format))
-			is_int8 |= 1 << i;
+			*is_int8 |= 1 << i;
+		if (format_is_int10(attachment->format))
+			*is_int10 |= 1 << i;
 	}
-
-	return is_int8;
 }
 
 static void
@@ -1096,6 +1248,7 @@
 	for (i = 0; i < vkblend->attachmentCount; i++) {
 		const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
 		unsigned blend_cntl = 0;
+		unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
 		VkBlendOp eqRGB = att->colorBlendOp;
 		VkBlendFactor srcRGB = att->srcColorBlendFactor;
 		VkBlendFactor dstRGB = att->dstColorBlendFactor;
@@ -1103,7 +1256,7 @@
 		VkBlendFactor srcA = att->srcAlphaBlendFactor;
 		VkBlendFactor dstA = att->dstAlphaBlendFactor;
 
-		blend->sx_mrt0_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
+		blend->sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
 
 		if (!att->colorWriteMask)
 			continue;
@@ -1127,6 +1280,50 @@
 			dstA = VK_BLEND_FACTOR_ONE;
 		}
 
+		/* Blending optimizations for RB+.
+		 * These transformations don't change the behavior.
+		 *
+		 * First, get rid of DST in the blend factors:
+		 *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
+		 */
+		si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB,
+				    VK_BLEND_FACTOR_DST_COLOR,
+				    VK_BLEND_FACTOR_SRC_COLOR);
+
+		si_blend_remove_dst(&eqA, &srcA, &dstA,
+				    VK_BLEND_FACTOR_DST_COLOR,
+				    VK_BLEND_FACTOR_SRC_COLOR);
+
+		si_blend_remove_dst(&eqA, &srcA, &dstA,
+				    VK_BLEND_FACTOR_DST_ALPHA,
+				    VK_BLEND_FACTOR_SRC_ALPHA);
+
+		/* Look up the ideal settings from tables. */
+		srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
+		dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
+		srcA_opt = si_translate_blend_opt_factor(srcA, true);
+		dstA_opt = si_translate_blend_opt_factor(dstA, true);
+
+				/* Handle interdependencies. */
+		if (si_blend_factor_uses_dst(srcRGB))
+			dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+		if (si_blend_factor_uses_dst(srcA))
+			dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+
+		if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
+		    (dstRGB == VK_BLEND_FACTOR_ZERO ||
+		     dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
+		     dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
+			dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
+
+		/* Set the final value. */
+		blend->sx_mrt_blend_opt[i] =
+			S_028760_COLOR_SRC_OPT(srcRGB_opt) |
+			S_028760_COLOR_DST_OPT(dstRGB_opt) |
+			S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
+			S_028760_ALPHA_SRC_OPT(srcA_opt) |
+			S_028760_ALPHA_DST_OPT(dstA_opt) |
+			S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
 		blend_cntl |= S_028780_ENABLE(1);
 
 		blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
@@ -1150,8 +1347,14 @@
 		    dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
 			blend_need_alpha |= 1 << i;
 	}
-	for (i = vkblend->attachmentCount; i < 8; i++)
+	for (i = vkblend->attachmentCount; i < 8; i++) {
 		blend->cb_blend_control[i] = 0;
+		blend->sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
+	}
+
+	/* disable RB+ for now */
+	if (pipeline->device->physical_device->has_rbplus)
+		blend->cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
 
 	if (blend->cb_target_mask)
 		blend->cb_color_control |= S_028808_MODE(mode);
@@ -1196,12 +1399,24 @@
 	memset(ds, 0, sizeof(*ds));
 	if (!vkds)
 		return;
-	ds->db_depth_control = S_028800_Z_ENABLE(vkds->depthTestEnable ? 1 : 0) |
-		S_028800_Z_WRITE_ENABLE(vkds->depthWriteEnable ? 1 : 0) |
-		S_028800_ZFUNC(vkds->depthCompareOp) |
-		S_028800_DEPTH_BOUNDS_ENABLE(vkds->depthBoundsTestEnable ? 1 : 0);
 
-	if (vkds->stencilTestEnable) {
+	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
+	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
+	if (subpass->depth_stencil_attachment.attachment == VK_ATTACHMENT_UNUSED)
+		return;
+
+	struct radv_render_pass_attachment *attachment = pass->attachments + subpass->depth_stencil_attachment.attachment;
+	bool has_depth_attachment = vk_format_is_depth(attachment->format);
+	bool has_stencil_attachment = vk_format_is_stencil(attachment->format);
+
+	if (has_depth_attachment) {
+		ds->db_depth_control = S_028800_Z_ENABLE(vkds->depthTestEnable ? 1 : 0) |
+		                       S_028800_Z_WRITE_ENABLE(vkds->depthWriteEnable ? 1 : 0) |
+		                       S_028800_ZFUNC(vkds->depthCompareOp) |
+		                       S_028800_DEPTH_BOUNDS_ENABLE(vkds->depthBoundsTestEnable ? 1 : 0);
+	}
+
+	if (has_stencil_attachment && vkds->stencilTestEnable) {
 		ds->db_depth_control |= S_028800_STENCIL_ENABLE(1) | S_028800_BACKFACE_ENABLE(1);
 		ds->db_depth_control |= S_028800_STENCILFUNC(vkds->front.compareOp);
 		ds->db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(vkds->front.failOp));
@@ -1301,7 +1516,9 @@
 	else
 		ms->num_samples = 1;
 
-	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.force_persample) {
+	if (vkms && vkms->sampleShadingEnable) {
+		ps_iter_samples = ceil(vkms->minSampleShading * ms->num_samples);
+	} else if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.force_persample) {
 		ps_iter_samples = ms->num_samples;
 	}
 
@@ -1319,11 +1536,12 @@
 		S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
 		EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
 		EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1);
+	ms->pa_sc_mode_cntl_0 = S_028A48_ALTERNATE_RBS_PER_TILE(pipeline->device->physical_device->rad_info.chip_class >= GFX9);
 
 	if (ms->num_samples > 1) {
 		unsigned log_samples = util_logbase2(ms->num_samples);
 		unsigned log_ps_iter_samples = util_logbase2(util_next_power_of_two(ps_iter_samples));
-		ms->pa_sc_mode_cntl_0 = S_028A48_MSAA_ENABLE(1);
+		ms->pa_sc_mode_cntl_0 |= S_028A48_MSAA_ENABLE(1);
 		ms->pa_sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); /* CM_R_028BDC_PA_SC_LINE_CNTL */
 		ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_samples) |
 			S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
@@ -1592,7 +1810,7 @@
 }
 
 static union ac_shader_variant_key
-radv_compute_vs_key(const VkGraphicsPipelineCreateInfo *pCreateInfo, bool as_es, bool as_ls)
+radv_compute_vs_key(const VkGraphicsPipelineCreateInfo *pCreateInfo, bool as_es, bool as_ls, bool export_prim_id)
 {
 	union ac_shader_variant_key key;
 	const VkPipelineVertexInputStateCreateInfo *input_state =
@@ -1602,6 +1820,7 @@
 	key.vs.instance_rate_inputs = 0;
 	key.vs.as_es = as_es;
 	key.vs.as_ls = as_ls;
+	key.vs.export_prim_id = export_prim_id;
 
 	for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) {
 		unsigned binding;
@@ -1843,6 +2062,24 @@
 	       S_028A40_GS_WRITE_OPTIMIZE(1);
 }
 
+/* Derive graphics.vgt_gs_mode and graphics.vgt_primitiveid_en for a pipeline. */
+static void calculate_vgt_gs_mode(struct radv_pipeline *pipeline)
+{
+	const bool has_gs = radv_pipeline_has_gs(pipeline);
+	struct radv_shader_variant *last_vs = pipeline->shaders[MESA_SHADER_VERTEX];
+	if (has_gs)
+		last_vs = pipeline->gs_copy_shader;
+	else if (radv_pipeline_has_tess(pipeline))
+		last_vs = pipeline->shaders[MESA_SHADER_TESS_EVAL];
+
+	/* The export_prim_id flag is only consulted when there is no geometry shader. */
+	const bool prim_id = !has_gs && last_vs->info.vs.outinfo.export_prim_id;
+	pipeline->graphics.vgt_primitiveid_en = prim_id;
+	pipeline->graphics.vgt_gs_mode = prim_id ? S_028A40_MODE(V_028A40_GS_SCENARIO_A) : 0;
+	if (has_gs)
+		pipeline->graphics.vgt_gs_mode = si_vgt_gs_mode(pipeline->shaders[MESA_SHADER_GEOMETRY]);
+}
+
 static void calculate_pa_cl_vs_out_cntl(struct radv_pipeline *pipeline)
 {
 	struct radv_shader_variant *vs;
@@ -1870,6 +2107,25 @@
 		clip_dist_mask;
 
 }
+
+/* Build the ps_input_cntl word for one PS input from its VS output param offset. */
+static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade)
+{
+	if (offset > AC_EXP_PARAM_OFFSET_31) {
+		/* The input is a DEFAULT_VAL constant: OFFSET is 0x20 and
+		 * DEFAULT_VAL is the index relative to DEFAULT_VAL_0000. */
+		assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
+		       offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
+		return S_028644_OFFSET(0x20) |
+		       S_028644_DEFAULT_VAL(offset - AC_EXP_PARAM_DEFAULT_VAL_0000);
+	}
+
+	uint32_t ps_input_cntl = S_028644_OFFSET(offset);
+	if (flat_shade)
+		ps_input_cntl |= S_028644_FLAT_SHADE(1);
+	return ps_input_cntl;
+}
+
 static void calculate_ps_inputs(struct radv_pipeline *pipeline)
 {
 	struct radv_shader_variant *ps, *vs;
@@ -1881,6 +2137,23 @@
 	outinfo = &vs->info.vs.outinfo;
 
 	unsigned ps_offset = 0;
+
+	if (ps->info.fs.prim_id_input) {
+		unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID];
+		if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
+			pipeline->graphics.ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true);
+			++ps_offset;
+		}
+	}
+
+	if (ps->info.fs.layer_input) {
+		unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER];
+		if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
+			pipeline->graphics.ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true);
+			++ps_offset;
+		}
+	}
+
 	if (ps->info.fs.has_pcoord) {
 		unsigned val;
 		val = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20);
@@ -1888,52 +2161,22 @@
 		ps_offset++;
 	}
 
-	if (ps->info.fs.prim_id_input && (outinfo->prim_id_output != 0xffffffff)) {
-		unsigned vs_offset, flat_shade;
-		unsigned val;
-		vs_offset = outinfo->prim_id_output;
-		flat_shade = true;
-		val = S_028644_OFFSET(vs_offset) | S_028644_FLAT_SHADE(flat_shade);
-		pipeline->graphics.ps_input_cntl[ps_offset] = val;
-		++ps_offset;
-	}
-
-	if (ps->info.fs.layer_input && (outinfo->layer_output != 0xffffffff)) {
-		unsigned vs_offset, flat_shade;
-		unsigned val;
-		vs_offset = outinfo->layer_output;
-		flat_shade = true;
-		val = S_028644_OFFSET(vs_offset) | S_028644_FLAT_SHADE(flat_shade);
-		pipeline->graphics.ps_input_cntl[ps_offset] = val;
-		++ps_offset;
-	}
-
 	for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.fs.input_mask; ++i) {
-		unsigned vs_offset, flat_shade;
-		unsigned val;
-
+		unsigned vs_offset;
+		bool flat_shade;
 		if (!(ps->info.fs.input_mask & (1u << i)))
 			continue;
 
-		if (!(outinfo->export_mask & (1u << i))) {
+		vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i];
+		if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
 			pipeline->graphics.ps_input_cntl[ps_offset] = S_028644_OFFSET(0x20);
 			++ps_offset;
 			continue;
 		}
 
-		vs_offset = util_bitcount(outinfo->export_mask & ((1u << i) - 1));
-		if (outinfo->prim_id_output != 0xffffffff) {
-			if (vs_offset >= outinfo->prim_id_output)
-				vs_offset++;
-		}
-		if (outinfo->layer_output != 0xffffffff) {
-			if (vs_offset >= outinfo->layer_output)
-			  vs_offset++;
-		}
 		flat_shade = !!(ps->info.fs.flat_shaded_mask & (1u << ps_offset));
 
-		val = S_028644_OFFSET(vs_offset) | S_028644_FLAT_SHADE(flat_shade);
-		pipeline->graphics.ps_input_cntl[ps_offset] = val;
+		pipeline->graphics.ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade);
 		++ps_offset;
 	}
 
@@ -1968,14 +2211,46 @@
 
 	radv_pipeline_init_blend_state(pipeline, pCreateInfo, extra);
 
+	if (!modules[MESA_SHADER_FRAGMENT]) {
+		nir_builder fs_b;
+		nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL);
+		fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "noop_fs");
+		fs_m.nir = fs_b.shader;
+		modules[MESA_SHADER_FRAGMENT] = &fs_m;
+	}
+
+	if (modules[MESA_SHADER_FRAGMENT]) {
+		union ac_shader_variant_key key = {0};
+		key.fs.col_format = pipeline->graphics.blend.spi_shader_col_format;
+
+		if (pipeline->device->physical_device->rad_info.chip_class < VI)
+			radv_pipeline_compute_get_int_clamp(pCreateInfo, &key.fs.is_int8, &key.fs.is_int10);
+
+		const VkPipelineShaderStageCreateInfo *stage = pStages[MESA_SHADER_FRAGMENT];
+
+		pipeline->shaders[MESA_SHADER_FRAGMENT] =
+			 radv_pipeline_compile(pipeline, cache, modules[MESA_SHADER_FRAGMENT],
+					       stage ? stage->pName : "main",
+					       MESA_SHADER_FRAGMENT,
+					       stage ? stage->pSpecializationInfo : NULL,
+					       pipeline->layout, &key);
+		pipeline->active_stages |= mesa_to_vk_shader_stage(MESA_SHADER_FRAGMENT);
+	}
+
+	if (fs_m.nir)
+		ralloc_free(fs_m.nir);
+
 	if (modules[MESA_SHADER_VERTEX]) {
 		bool as_es = false;
 		bool as_ls = false;
+		bool export_prim_id = false;
 		if (modules[MESA_SHADER_TESS_CTRL])
 			as_ls = true;
 		else if (modules[MESA_SHADER_GEOMETRY])
 			as_es = true;
-		union ac_shader_variant_key key = radv_compute_vs_key(pCreateInfo, as_es, as_ls);
+		else if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.prim_id_input)
+			export_prim_id = true;
+		union ac_shader_variant_key key = radv_compute_vs_key(pCreateInfo, as_es, as_ls, export_prim_id);
 
 		pipeline->shaders[MESA_SHADER_VERTEX] =
 			 radv_pipeline_compile(pipeline, cache, modules[MESA_SHADER_VERTEX],
@@ -1988,7 +2263,7 @@
 	}
 
 	if (modules[MESA_SHADER_GEOMETRY]) {
-		union ac_shader_variant_key key = radv_compute_vs_key(pCreateInfo, false, false);
+		union ac_shader_variant_key key = radv_compute_vs_key(pCreateInfo, false, false, false);
 
 		pipeline->shaders[MESA_SHADER_GEOMETRY] =
 			 radv_pipeline_compile(pipeline, cache, modules[MESA_SHADER_GEOMETRY],
@@ -1998,10 +2273,7 @@
 					       pipeline->layout, &key);
 
 		pipeline->active_stages |= mesa_to_vk_shader_stage(MESA_SHADER_GEOMETRY);
-
-		pipeline->graphics.vgt_gs_mode = si_vgt_gs_mode(pipeline->shaders[MESA_SHADER_GEOMETRY]);
-	} else
-		pipeline->graphics.vgt_gs_mode = 0;
+	}
 
 	if (modules[MESA_SHADER_TESS_EVAL]) {
 		assert(modules[MESA_SHADER_TESS_CTRL]);
@@ -2020,33 +2292,6 @@
 			mesa_to_vk_shader_stage(MESA_SHADER_TESS_CTRL);
 	}
 
-	if (!modules[MESA_SHADER_FRAGMENT]) {
-		nir_builder fs_b;
-		nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL);
-		fs_b.shader->info->name = ralloc_strdup(fs_b.shader, "noop_fs");
-		fs_m.nir = fs_b.shader;
-		modules[MESA_SHADER_FRAGMENT] = &fs_m;
-	}
-
-	if (modules[MESA_SHADER_FRAGMENT]) {
-		union ac_shader_variant_key key;
-		key.fs.col_format = pipeline->graphics.blend.spi_shader_col_format;
-		key.fs.is_int8 = radv_pipeline_compute_is_int8(pCreateInfo);
-
-		const VkPipelineShaderStageCreateInfo *stage = pStages[MESA_SHADER_FRAGMENT];
-
-		pipeline->shaders[MESA_SHADER_FRAGMENT] =
-			 radv_pipeline_compile(pipeline, cache, modules[MESA_SHADER_FRAGMENT],
-					       stage ? stage->pName : "main",
-					       MESA_SHADER_FRAGMENT,
-					       stage ? stage->pSpecializationInfo : NULL,
-					       pipeline->layout, &key);
-		pipeline->active_stages |= mesa_to_vk_shader_stage(MESA_SHADER_FRAGMENT);
-	}
-
-	if (fs_m.nir)
-		ralloc_free(fs_m.nir);
-
 	radv_pipeline_init_depth_stencil_state(pipeline, pCreateInfo, extra);
 	radv_pipeline_init_raster_state(pipeline, pCreateInfo);
 	radv_pipeline_init_multisample_state(pipeline, pCreateInfo);
@@ -2104,15 +2349,25 @@
 		S_02880C_EXEC_ON_HIER_FAIL(ps->info.fs.writes_memory) |
 		S_02880C_EXEC_ON_NOOP(ps->info.fs.writes_memory);
 
+	if (pipeline->device->physical_device->has_rbplus)
+		pipeline->graphics.db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
+
 	pipeline->graphics.shader_z_format =
 		ps->info.fs.writes_sample_mask ? V_028710_SPI_SHADER_32_ABGR :
 		ps->info.fs.writes_stencil ? V_028710_SPI_SHADER_32_GR :
 		ps->info.fs.writes_z ? V_028710_SPI_SHADER_32_R :
 		V_028710_SPI_SHADER_ZERO;
 
+	calculate_vgt_gs_mode(pipeline);
 	calculate_pa_cl_vs_out_cntl(pipeline);
 	calculate_ps_inputs(pipeline);
 
+	for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+		if (pipeline->shaders[i]) {
+			pipeline->need_indirect_descriptor_sets |= pipeline->shaders[i]->info.need_indirect_descriptor_sets;
+		}
+	}
+
 	uint32_t stages = 0;
 	if (radv_pipeline_has_tess(pipeline)) {
 		stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
@@ -2124,10 +2379,15 @@
 				S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
 		else
 			stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
+
 	} else if (radv_pipeline_has_gs(pipeline))
 		stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
 			S_028B54_GS_EN(1) |
 			S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+
+	if (device->physical_device->rad_info.chip_class >= GFX9)
+		stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
+
 	pipeline->graphics.vgt_shader_stages_en = stages;
 
 	if (radv_pipeline_has_gs(pipeline))
@@ -2175,6 +2435,16 @@
 		pipeline->binding_stride[desc->binding] = desc->stride;
 	}
 
+	struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX,
+							     AC_UD_VS_BASE_VERTEX_START_INSTANCE);
+	if (loc->sgpr_idx != -1) {
+		pipeline->graphics.vtx_base_sgpr = radv_shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
+		pipeline->graphics.vtx_base_sgpr += loc->sgpr_idx * 4;
+		if (pipeline->shaders[MESA_SHADER_VERTEX]->info.info.vs.needs_draw_id)
+			pipeline->graphics.vtx_emit_num = 3;
+		else
+			pipeline->graphics.vtx_emit_num = 2;
+	}
 	if (device->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {
 		radv_dump_pipeline_stats(device, pipeline);
 	}
@@ -2271,6 +2541,7 @@
 				       pipeline->layout, NULL);
 
 
+	pipeline->need_indirect_descriptor_sets |= pipeline->shaders[MESA_SHADER_COMPUTE]->info.need_indirect_descriptor_sets;
 	result = radv_pipeline_scratch_init(device, pipeline);
 	if (result != VK_SUCCESS) {
 		radv_pipeline_destroy(device, pipeline, pAllocator);
diff --git a/src/amd/vulkan/radv_pipeline_cache.c b/src/amd/vulkan/radv_pipeline_cache.c
index 5f6355f..9a82426 100644
--- a/src/amd/vulkan/radv_pipeline_cache.c
+++ b/src/amd/vulkan/radv_pipeline_cache.c
@@ -23,6 +23,7 @@
 
 #include "util/mesa-sha1.h"
 #include "util/debug.h"
+#include "util/u_atomic.h"
 #include "radv_private.h"
 
 #include "ac_nir_to_llvm.h"
@@ -117,6 +118,9 @@
 	const uint32_t mask = cache->table_size - 1;
 	const uint32_t start = (*(uint32_t *) sha1);
 
+	if (cache->table_size == 0)
+		return NULL;
+
 	for (uint32_t i = 0; i < cache->table_size; i++) {
 		const uint32_t index = (start + i) & mask;
 		struct cache_entry *entry = cache->hash_table[index];
@@ -171,6 +175,7 @@
 		variant->info = entry->variant_info;
 		variant->rsrc1 = entry->rsrc1;
 		variant->rsrc2 = entry->rsrc2;
+		variant->code_size = entry->code_size;
 		variant->ref_count = 1;
 
 		variant->bo = device->ws->buffer_create(device->ws, entry->code_size, 256,
@@ -183,7 +188,7 @@
 		entry->variant = variant;
 	}
 
-	__sync_fetch_and_add(&entry->variant->ref_count, 1);
+	p_atomic_inc(&entry->variant->ref_count);
 	return entry->variant;
 }
 
@@ -275,7 +280,7 @@
 		} else {
 			entry->variant = variant;
 		}
-		__sync_fetch_and_add(&variant->ref_count, 1);
+		p_atomic_inc(&variant->ref_count);
 		pthread_mutex_unlock(&cache->mutex);
 		return variant;
 	}
@@ -295,7 +300,7 @@
 	entry->rsrc2 = variant->rsrc2;
 	entry->code_size = code_size;
 	entry->variant = variant;
-	__sync_fetch_and_add(&variant->ref_count, 1);
+	p_atomic_inc(&variant->ref_count);
 
 	radv_pipeline_cache_add_entry(cache, entry);
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 08f53a1..c7d5884 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -47,12 +47,14 @@
 #include "compiler/shader_enums.h"
 #include "util/macros.h"
 #include "util/list.h"
-#include "util/vk_alloc.h"
 #include "main/macros.h"
+#include "vk_alloc.h"
 
 #include "radv_radeon_winsys.h"
 #include "ac_binary.h"
 #include "ac_nir_to_llvm.h"
+#include "ac_gpu_info.h"
+#include "ac_surface.h"
 #include "radv_debug.h"
 #include "radv_descriptor_set.h"
 
@@ -82,7 +84,7 @@
 #define MAX_PUSH_DESCRIPTORS 32
 #define MAX_DYNAMIC_BUFFERS 16
 #define MAX_SAMPLES_LOG2 4
-#define NUM_META_FS_KEYS 11
+#define NUM_META_FS_KEYS 13
 #define RADV_MAX_DRM_DEVICES 8
 
 #define NUM_DEPTH_CLEAR_PIPELINES 3
@@ -266,10 +268,17 @@
 	char                                        path[20];
 	const char *                                name;
 	uint8_t                                     uuid[VK_UUID_SIZE];
+	uint8_t                                     device_uuid[VK_UUID_SIZE];
 
 	int local_fd;
 	struct wsi_device                       wsi_device;
 	struct radv_extensions                      extensions;
+
+	bool has_rbplus; /* if RB+ registers exist */
+	bool rbplus_allowed; /* if RB+ is allowed */
+
+	VkPhysicalDeviceMemoryProperties memory_properties;
+	enum radv_mem_type mem_type_indices[RADV_MEM_TYPE_COUNT];
 };
 
 struct radv_instance {
@@ -282,6 +291,7 @@
 	struct radv_physical_device                 physicalDevices[RADV_MAX_DRM_DEVICES];
 
 	uint64_t debug_flags;
+	uint64_t perftest_flags;
 };
 
 VkResult radv_init_wsi(struct radv_physical_device *physical_device);
@@ -343,6 +353,8 @@
 		struct radv_pipeline *depthstencil_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
 	} clear[1 + MAX_SAMPLES_LOG2];
 
+	VkPipelineLayout                          clear_color_p_layout;
+	VkPipelineLayout                          clear_depth_p_layout;
 	struct {
 		VkRenderPass render_pass[NUM_META_FS_KEYS];
 
@@ -415,14 +427,25 @@
 		struct {
 			VkPipeline                                pipeline;
 			VkPipeline                                i_pipeline;
+			VkPipeline                                srgb_pipeline;
 		} rc[MAX_SAMPLES_LOG2];
 	} resolve_compute;
 
 	struct {
+		VkDescriptorSetLayout                     ds_layout;
+		VkPipelineLayout                          p_layout;
+
+		struct {
+			VkRenderPass render_pass[NUM_META_FS_KEYS];
+			VkPipeline   pipeline[NUM_META_FS_KEYS];
+		} rc[MAX_SAMPLES_LOG2];
+	} resolve_fragment;
+
+	struct {
 		VkPipeline                                decompress_pipeline;
 		VkPipeline                                resummarize_pipeline;
 		VkRenderPass                              pass;
-	} depth_decomp;
+	} depth_decomp[1 + MAX_SAMPLES_LOG2];
 
 	struct {
 		VkPipeline                                cmask_eliminate_pipeline;
@@ -525,6 +548,8 @@
 
 	/* Backup in-memory cache to be used if the app doesn't provide one */
 	struct radv_pipeline_cache *                mem_cache;
+
+	uint32_t image_mrt_offset_counter;
 };
 
 struct radv_device_memory {
@@ -570,6 +595,10 @@
 	uint64_t size;
 
 	struct list_head vram_list;
+
+	uint8_t *host_memory_base;
+	uint8_t *host_memory_ptr;
+	uint8_t *host_memory_end;
 };
 
 struct radv_descriptor_update_template_entry {
@@ -585,7 +614,6 @@
 	uint32_t dst_stride;
 
 	uint32_t buffer_offset;
-	uint32_t buffer_count;
 
 	/* Only valid for combined image samplers and samplers */
 	uint16_t has_sampler;
@@ -726,7 +754,6 @@
 struct radv_cmd_state {
 	uint32_t                                      vb_dirty;
 	radv_cmd_dirty_mask_t                         dirty;
-	bool                                          vertex_descriptors_dirty;
 	bool                                          push_descriptors_dirty;
 
 	struct radv_pipeline *                        pipeline;
@@ -741,9 +768,9 @@
 	struct radv_descriptor_set *                  descriptors[MAX_SETS];
 	struct radv_attachment_state *                attachments;
 	VkRect2D                                     render_area;
-	struct radv_buffer *                         index_buffer;
 	uint32_t                                     index_type;
-	uint32_t                                     index_offset;
+	uint64_t                                     index_va;
+	uint32_t                                     max_index_count;
 	int32_t                                      last_primitive_reset_en;
 	uint32_t                                     last_primitive_reset_index;
 	enum radv_cmd_flush_bits                     flush_bits;
@@ -752,6 +779,7 @@
 	uint32_t                                      descriptors_dirty;
 	uint32_t                                      trace_id;
 	uint32_t                                      last_ia_multi_vgt_param;
+	bool predicating;
 };
 
 struct radv_cmd_pool {
@@ -791,8 +819,6 @@
 
 	struct radv_cmd_buffer_upload upload;
 
-	bool record_fail;
-
 	uint32_t scratch_size_needed;
 	uint32_t compute_scratch_size_needed;
 	uint32_t esgs_ring_size_needed;
@@ -800,7 +826,12 @@
 	bool tess_rings_needed;
 	bool sample_positions_needed;
 
+	bool record_fail;
+
 	int ring_offsets_idx; /* just used for verification */
+	uint32_t gfx9_fence_offset;
+	struct radeon_winsys_bo *gfx9_fence_bo;
+	uint32_t gfx9_fence_idx;
 };
 
 struct radv_image;
@@ -820,18 +851,33 @@
 uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
 				   bool instanced_draw, bool indirect_draw,
 				   uint32_t draw_vertex_count);
+void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
+				bool predicated,
+				enum chip_class chip_class,
+				bool is_mec,
+				unsigned event, unsigned event_flags,
+				unsigned data_sel,
+				uint64_t va,
+				uint32_t old_fence,
+				uint32_t new_fence);
+
+void si_emit_wait_fence(struct radeon_winsys_cs *cs,
+			bool predicated,
+			uint64_t va, uint32_t ref,
+			uint32_t mask);
 void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
-                            enum chip_class chip_class,
-                            bool is_mec,
-                            enum radv_cmd_flush_bits flush_bits);
-void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
-                            enum chip_class chip_class,
-                            bool is_mec,
-                            enum radv_cmd_flush_bits flush_bits);
+			    bool predicated,
+			    enum chip_class chip_class,
+			    uint32_t *fence_ptr, uint64_t va,
+			    bool is_mec,
+			    enum radv_cmd_flush_bits flush_bits);
 void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
+void si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va);
 void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
 			   uint64_t src_va, uint64_t dest_va,
 			   uint64_t size);
+void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
+                        unsigned size);
 void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
 			    uint64_t size, unsigned value);
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
@@ -856,6 +902,8 @@
 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer);
 void radv_cmd_buffer_clear_subpass(struct radv_cmd_buffer *cmd_buffer);
 void radv_cmd_buffer_resolve_subpass(struct radv_cmd_buffer *cmd_buffer);
+void radv_cmd_buffer_resolve_subpass_cs(struct radv_cmd_buffer *cmd_buffer);
+void radv_cmd_buffer_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer);
 void radv_cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples);
 unsigned radv_cayman_get_maxdist(int log_samples);
 void radv_device_init_msaa(struct radv_device *device);
@@ -867,6 +915,9 @@
 			       struct radv_image *image,
 			       int idx,
 			       uint32_t color_values[2]);
+void radv_set_dcc_need_cmask_elim_pred(struct radv_cmd_buffer *cmd_buffer,
+				       struct radv_image *image,
+				       bool value);
 void radv_fill_buffer(struct radv_cmd_buffer *cmd_buffer,
 		      struct radeon_winsys_bo *bo,
 		      uint64_t offset, uint64_t size, uint32_t value);
@@ -952,7 +1003,7 @@
 struct radv_blend_state {
 	uint32_t cb_color_control;
 	uint32_t cb_target_mask;
-	uint32_t sx_mrt0_blend_opt[8];
+	uint32_t sx_mrt_blend_opt[8];
 	uint32_t cb_blend_control[8];
 
 	uint32_t spi_shader_col_format;
@@ -1007,7 +1058,7 @@
 	struct radv_pipeline_layout *                 layout;
 
 	bool                                         needs_data_cache;
-
+	bool					     need_indirect_descriptor_sets;
 	struct radv_shader_variant *                 shaders[MESA_SHADER_STAGES];
 	struct radv_shader_variant *gs_copy_shader;
 	VkShaderStageFlags                           active_stages;
@@ -1031,6 +1082,7 @@
 			unsigned prim;
 			unsigned gs_out;
 			uint32_t vgt_gs_mode;
+			bool vgt_primitiveid_en;
 			bool prim_restart_enable;
 			unsigned esgs_ring_size;
 			unsigned gsvs_ring_size;
@@ -1038,6 +1090,8 @@
 			uint32_t ps_input_cntl_num;
 			uint32_t pa_cl_vs_out_cntl;
 			uint32_t vgt_shader_stages_en;
+			uint32_t vtx_base_sgpr;
+			uint8_t vtx_emit_num;
 			struct radv_prim_vertex_count prim_vertex_count;
  			bool can_use_guardband;
 		} graphics;
@@ -1057,6 +1111,11 @@
 	return pipeline->shaders[MESA_SHADER_TESS_EVAL] ? true : false;
 }
 
+uint32_t radv_shader_stage_to_user_data_0(gl_shader_stage stage, bool has_gs, bool has_tess);
+struct ac_userdata_info *radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
+					       gl_shader_stage stage,
+					       int idx);
+
 struct radv_graphics_pipeline_create_info {
 	bool use_rectlist;
 	bool db_depth_clear;
@@ -1141,10 +1200,7 @@
 	 */
 	VkFormat vk_format;
 	VkImageAspectFlags aspects;
-	VkExtent3D extent;
-	uint32_t levels;
-	uint32_t array_size;
-	uint32_t samples; /**< VkImageCreateInfo::samples */
+	struct ac_surf_info info;
 	VkImageUsageFlags usage; /**< Superset of VkImageCreateInfo::usage. */
 	VkImageTiling tiling; /** VkImageCreateInfo::tiling */
 	VkImageCreateFlags flags; /** VkImageCreateInfo::flags */
@@ -1155,24 +1211,37 @@
 	bool exclusive;
 	unsigned queue_family_mask;
 
+	bool shareable;
+
 	/* Set when bound */
 	struct radeon_winsys_bo *bo;
 	VkDeviceSize offset;
-	uint32_t dcc_offset;
-	uint32_t htile_offset;
+	uint64_t dcc_offset;
+	uint64_t htile_offset;
 	struct radeon_surf surface;
 
 	struct radv_fmask_info fmask;
 	struct radv_cmask_info cmask;
-	uint32_t clear_value_offset;
+	uint64_t clear_value_offset;
+	uint64_t dcc_pred_offset;
 };
 
+/* Whether the image has a htile that is known consistent with the contents of
+ * the image. */
 bool radv_layout_has_htile(const struct radv_image *image,
-                           VkImageLayout layout);
+                           VkImageLayout layout,
+                           unsigned queue_mask);
+
+/* Whether the image has a htile that is known consistent with the contents of
+ * the image and is allowed to be in compressed form.
+ *
+ * If this is false, reads that don't use the htile should be able to return
+ * correct results.
+ */
 bool radv_layout_is_htile_compressed(const struct radv_image *image,
-                                     VkImageLayout layout);
-bool radv_layout_can_expclear(const struct radv_image *image,
-                              VkImageLayout layout);
+                                     VkImageLayout layout,
+                                     unsigned queue_mask);
+
 bool radv_layout_can_fast_clear(const struct radv_image *image,
 			        VkImageLayout layout,
 			        unsigned queue_mask);
@@ -1185,7 +1254,7 @@
 		    const VkImageSubresourceRange *range)
 {
 	return range->layerCount == VK_REMAINING_ARRAY_LAYERS ?
-		image->array_size - range->baseArrayLayer : range->layerCount;
+		image->info.array_size - range->baseArrayLayer : range->layerCount;
 }
 
 static inline uint32_t
@@ -1193,7 +1262,7 @@
 		    const VkImageSubresourceRange *range)
 {
 	return range->levelCount == VK_REMAINING_MIP_LEVELS ?
-		image->levels - range->baseMipLevel : range->levelCount;
+		image->info.levels - range->baseMipLevel : range->levelCount;
 }
 
 struct radeon_bo_metadata;
@@ -1212,15 +1281,21 @@
 	uint32_t base_layer;
 	uint32_t layer_count;
 	uint32_t base_mip;
+	uint32_t level_count;
 	VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */
 
 	uint32_t descriptor[8];
 	uint32_t fmask_descriptor[8];
+
+	/* Descriptor for use as a storage image as opposed to a sampled image.
+	 * This has a few differences for cube maps (e.g. type).
+	 */
+	uint32_t storage_descriptor[8];
+	uint32_t storage_fmask_descriptor[8];
 };
 
 struct radv_image_create_info {
 	const VkImageCreateInfo *vk_info;
-	uint32_t stride;
 	bool scanout;
 };
 
@@ -1231,11 +1306,8 @@
 
 void radv_image_view_init(struct radv_image_view *view,
 			  struct radv_device *device,
-			  const VkImageViewCreateInfo* pCreateInfo,
-			  struct radv_cmd_buffer *cmd_buffer,
-			  VkImageUsageFlags usage_mask);
-void radv_image_set_optimal_micro_tile_mode(struct radv_device *device,
-					    struct radv_image *image, uint32_t micro_tile_mode);
+			  const VkImageViewCreateInfo* pCreateInfo);
+
 struct radv_buffer_view {
 	struct radeon_winsys_bo *bo;
 	VkFormat vk_format;
@@ -1279,42 +1351,57 @@
 	}
 }
 
+static inline bool
+radv_image_extent_compare(const struct radv_image *image,
+			  const VkExtent3D *extent)
+{ /* Returns true only when extent matches image->info in width, height and depth. */
+	if (extent->width != image->info.width ||
+	    extent->height != image->info.height ||
+	    extent->depth != image->info.depth)
+		return false; /* at least one dimension differs */
+	return true;
+}
+
 struct radv_sampler {
 	uint32_t state[4];
 };
 
 struct radv_color_buffer_info {
-	uint32_t cb_color_base;
+	uint64_t cb_color_base;
+	uint64_t cb_color_cmask;
+	uint64_t cb_color_fmask;
+	uint64_t cb_dcc_base;
 	uint32_t cb_color_pitch;
 	uint32_t cb_color_slice;
 	uint32_t cb_color_view;
 	uint32_t cb_color_info;
 	uint32_t cb_color_attrib;
+	uint32_t cb_color_attrib2;
 	uint32_t cb_dcc_control;
-	uint32_t cb_color_cmask;
 	uint32_t cb_color_cmask_slice;
-	uint32_t cb_color_fmask;
 	uint32_t cb_color_fmask_slice;
 	uint32_t cb_clear_value0;
 	uint32_t cb_clear_value1;
-	uint32_t cb_dcc_base;
 	uint32_t micro_tile_mode;
+	uint32_t gfx9_epitch;
 };
 
 struct radv_ds_buffer_info {
+	uint64_t db_z_read_base;
+	uint64_t db_stencil_read_base;
+	uint64_t db_z_write_base;
+	uint64_t db_stencil_write_base;
+	uint64_t db_htile_data_base;
 	uint32_t db_depth_info;
 	uint32_t db_z_info;
 	uint32_t db_stencil_info;
-	uint32_t db_z_read_base;
-	uint32_t db_stencil_read_base;
-	uint32_t db_z_write_base;
-	uint32_t db_stencil_write_base;
 	uint32_t db_depth_view;
 	uint32_t db_depth_size;
 	uint32_t db_depth_slice;
 	uint32_t db_htile_surface;
-	uint32_t db_htile_data_base;
 	uint32_t pa_su_poly_offset_db_fmt_cntl;
+	uint32_t db_z_info2;
+	uint32_t db_stencil_info2;
 	float offset_scale;
 };
 
@@ -1343,8 +1430,8 @@
 
 struct radv_subpass {
 	uint32_t                                     input_count;
-	VkAttachmentReference *                      input_attachments;
 	uint32_t                                     color_count;
+	VkAttachmentReference *                      input_attachments;
 	VkAttachmentReference *                      color_attachments;
 	VkAttachmentReference *                      resolve_attachments;
 	VkAttachmentReference                        depth_stencil_attachment;
@@ -1385,6 +1472,20 @@
 	uint32_t pipeline_stats_mask;
 };
 
+struct radv_semaphore {
+	/* Winsys semaphore, used when the semaphore is not exportable. */
+	struct radeon_winsys_sem *sem;
+	uint32_t syncobj; /* NOTE(review): presumably the handle used for exportable semaphores - confirm */
+	uint32_t temp_syncobj; /* NOTE(review): presumably a temporarily imported handle - confirm */
+};
+
+VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
+			     int num_wait_sems,
+			     const VkSemaphore *wait_sems,
+			     int num_signal_sems,
+			     const VkSemaphore *signal_sems);
+void radv_free_sem_info(struct radv_winsys_sem_info *sem_info);
+
 void
 radv_update_descriptor_sets(struct radv_device *device,
                             struct radv_cmd_buffer *cmd_buffer,
@@ -1478,6 +1579,6 @@
 RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_render_pass, VkRenderPass)
 RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_sampler, VkSampler)
 RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_shader_module, VkShaderModule)
-RADV_DEFINE_NONDISP_HANDLE_CASTS(radeon_winsys_sem, VkSemaphore)
+RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_semaphore, VkSemaphore)
 
 #endif /* RADV_PRIVATE_H */
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index d581ea5..bd7b14a 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -72,6 +72,8 @@
 radv_load_push_int(nir_builder *b, unsigned offset, const char *name)
 {
 	nir_intrinsic_instr *flags = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(flags, 0);
+	nir_intrinsic_set_range(flags, 16);
 	flags->src[0] = nir_src_for_ssa(nir_imm_int(b, offset));
 	flags->num_components = 1;
 	nir_ssa_dest_init(&flags->instr, &flags->dest, 1, 32, name);
@@ -120,10 +122,10 @@
 	 */
 	nir_builder b;
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "occlusion_query");
-	b.shader->info->cs.local_size[0] = 64;
-	b.shader->info->cs.local_size[1] = 1;
-	b.shader->info->cs.local_size[2] = 1;
+	b.shader->info.name = ralloc_strdup(b.shader, "occlusion_query");
+	b.shader->info.cs.local_size[0] = 64;
+	b.shader->info.cs.local_size[1] = 1;
+	b.shader->info.cs.local_size[2] = 1;
 
 	nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
 	nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
@@ -153,9 +155,9 @@
 	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
 	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
-	                                        b.shader->info->cs.local_size[0],
-	                                        b.shader->info->cs.local_size[1],
-	                                        b.shader->info->cs.local_size[2], 0);
+	                                        b.shader->info.cs.local_size[0],
+	                                        b.shader->info.cs.local_size[1],
+	                                        b.shader->info.cs.local_size[2], 0);
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 	global_id = nir_channel(&b, global_id, 0); // We only care about x here.
 
@@ -315,10 +317,10 @@
 	 */
 	nir_builder b;
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
-	b.shader->info->name = ralloc_strdup(b.shader, "pipeline_statistics_query");
-	b.shader->info->cs.local_size[0] = 64;
-	b.shader->info->cs.local_size[1] = 1;
-	b.shader->info->cs.local_size[2] = 1;
+	b.shader->info.name = ralloc_strdup(b.shader, "pipeline_statistics_query");
+	b.shader->info.cs.local_size[0] = 64;
+	b.shader->info.cs.local_size[1] = 1;
+	b.shader->info.cs.local_size[2] = 1;
 
 	nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
 
@@ -345,9 +347,9 @@
 	nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
 	nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
-	                                        b.shader->info->cs.local_size[0],
-	                                        b.shader->info->cs.local_size[1],
-	                                        b.shader->info->cs.local_size[2], 0);
+	                                        b.shader->info.cs.local_size[0],
+	                                        b.shader->info.cs.local_size[1],
+	                                        b.shader->info.cs.local_size[2], 0);
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 	global_id = nir_channel(&b, global_id, 0); // We only care about x here.
 
@@ -607,12 +609,10 @@
 					     radv_pipeline_cache_to_handle(&device->meta_state.cache),
 					     1, &pipeline_statistics_vk_pipeline_info, NULL,
 					     &device->meta_state.query.pipeline_statistics_query_pipeline);
-	if (result != VK_SUCCESS)
-		goto fail;
 
-	return VK_SUCCESS;
 fail:
-	radv_device_finish_meta_query_state(device);
+	if (result != VK_SUCCESS)
+		radv_device_finish_meta_query_state(device);
 	ralloc_free(occlusion_cs.nir);
 	ralloc_free(pipeline_statistics_cs.nir);
 	return result;
@@ -653,7 +653,7 @@
 	struct radv_device *device = cmd_buffer->device;
 	struct radv_meta_saved_compute_state saved_state;
 
-	radv_meta_save_compute(&saved_state, cmd_buffer, 4);
+	radv_meta_save_compute(&saved_state, cmd_buffer, 16);
 
 	struct radv_buffer dst_buffer = {
 		.bo = dst_bo,
@@ -737,7 +737,7 @@
 	                                RADV_CMD_FLAG_INV_VMEM_L1 |
 	                                RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
 
-	radv_meta_restore_compute(&saved_state, cmd_buffer, 4);
+	radv_meta_restore_compute(&saved_state, cmd_buffer, 16);
 }
 
 VkResult radv_CreateQueryPool(
@@ -992,13 +992,7 @@
 				uint64_t avail_va = va + pool->availability_offset + 4 * query;
 
 				/* This waits on the ME. All copies below are done on the ME */
-				radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
-				radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
-				radeon_emit(cs, avail_va);
-				radeon_emit(cs, avail_va >> 32);
-				radeon_emit(cs, 1); /* reference value */
-				radeon_emit(cs, 0xffffffff); /* mask */
-				radeon_emit(cs, 4); /* poll interval */
+				si_emit_wait_fence(cs, false, avail_va, 1, 0xffffffff);
 			}
 		}
 		radv_query_shader(cmd_buffer, cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
@@ -1021,13 +1015,7 @@
 				uint64_t avail_va = va + pool->availability_offset + 4 * query;
 
 				/* This waits on the ME. All copies below are done on the ME */
-				radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
-				radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
-				radeon_emit(cs, avail_va);
-				radeon_emit(cs, avail_va >> 32);
-				radeon_emit(cs, 1); /* reference value */
-				radeon_emit(cs, 0xffffffff); /* mask */
-				radeon_emit(cs, 4); /* poll interval */
+				si_emit_wait_fence(cs, false, avail_va, 1, 0xffffffff);
 			}
 			if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
 				uint64_t avail_va = va + pool->availability_offset + 4 * query;
@@ -1151,7 +1139,7 @@
 
 		break;
 	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
-		radeon_check_space(cmd_buffer->device->ws, cs, 10);
+		radeon_check_space(cmd_buffer->device->ws, cs, 16);
 
 		va += pipelinestat_block_size;
 
@@ -1160,13 +1148,12 @@
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
 
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) |
-				EVENT_INDEX(5));
-		radeon_emit(cs, avail_va);
-		radeon_emit(cs, (avail_va >> 32) | EOP_DATA_SEL(1));
-		radeon_emit(cs, 1);
-		radeon_emit(cs, 0);
+		si_cs_emit_write_event_eop(cs,
+					   false,
+					   cmd_buffer->device->physical_device->rad_info.chip_class,
+					   false,
+					   EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0,
+					   1, avail_va, 0, 1);
 		break;
 	default:
 		unreachable("ending unhandled query type");
@@ -1189,32 +1176,42 @@
 
 	cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 5);
 
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12);
+	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
 
-	if (mec) {
-		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 5, 0));
-		radeon_emit(cs, EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5));
-		radeon_emit(cs, 3 << 29);
+	switch(pipelineStage) {
+	case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
+		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+		radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
+		                COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
+		                COPY_DATA_DST_SEL(V_370_MEM_ASYNC));
+		radeon_emit(cs, 0);
+		radeon_emit(cs, 0);
 		radeon_emit(cs, query_va);
 		radeon_emit(cs, query_va >> 32);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, 0);
-	} else {
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-		radeon_emit(cs, EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5));
-		radeon_emit(cs, query_va);
-		radeon_emit(cs, (3 << 29) | ((query_va >> 32) & 0xFFFF));
-		radeon_emit(cs, 0);
-		radeon_emit(cs, 0);
-	}
 
-	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
-	radeon_emit(cs, S_370_DST_SEL(mec ? V_370_MEM_ASYNC : V_370_MEMORY_SYNC) |
-		    S_370_WR_CONFIRM(1) |
-		    S_370_ENGINE_SEL(V_370_ME));
-	radeon_emit(cs, avail_va);
-	radeon_emit(cs, avail_va >> 32);
-	radeon_emit(cs, 1);
+		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+		radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+		                S_370_WR_CONFIRM(1) |
+		                S_370_ENGINE_SEL(V_370_ME));
+		radeon_emit(cs, avail_va);
+		radeon_emit(cs, avail_va >> 32);
+		radeon_emit(cs, 1);
+		break;
+	default:
+		si_cs_emit_write_event_eop(cs,
+					   false,
+					   cmd_buffer->device->physical_device->rad_info.chip_class,
+					   mec,
+					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
+					   3, query_va, 0, 0);
+		si_cs_emit_write_event_eop(cs,
+					   false,
+					   cmd_buffer->device->physical_device->rad_info.chip_class,
+					   mec,
+					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
+					   1, avail_va, 0, 1);
+		break;
+	}
 
 	assert(cmd_buffer->cs->cdw <= cdw_max);
 }
diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h
index f6bab74..285ae2c 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -35,6 +35,10 @@
 #include "main/macros.h"
 #include "amd_family.h"
 
+struct radeon_info;
+struct ac_surf_info;
+struct radeon_surf;
+
 #define FREE(x) free(x)
 
 enum radeon_bo_domain { /* bitfield */
@@ -47,7 +51,8 @@
 	RADEON_FLAG_GTT_WC =        (1 << 0),
 	RADEON_FLAG_CPU_ACCESS =    (1 << 1),
 	RADEON_FLAG_NO_CPU_ACCESS = (1 << 2),
-	RADEON_FLAG_VIRTUAL =       (1 << 3)
+	RADEON_FLAG_VIRTUAL =       (1 << 3),
+	RADEON_FLAG_VA_UNCACHED =   (1 << 4),
 };
 
 enum radeon_bo_usage { /* bitfield */
@@ -71,63 +76,6 @@
 	uint32_t *buf; /* The base pointer of the chunk. */
 };
 
-struct radeon_info {
-	/* PCI info: domain:bus:dev:func */
-	uint32_t                    pci_domain;
-	uint32_t                    pci_bus;
-	uint32_t                    pci_dev;
-	uint32_t                    pci_func;
-
-	/* Device info. */
-	uint32_t                    pci_id;
-	enum radeon_family          family;
-	const char                  *name;
-	enum chip_class             chip_class;
-	uint32_t                    gart_page_size;
-	uint64_t                    gart_size;
-	uint64_t                    vram_size;
-	uint64_t                    visible_vram_size;
-	bool                        has_dedicated_vram;
-	bool                     has_virtual_memory;
-	bool                        gfx_ib_pad_with_type2;
-	bool                     has_uvd;
-	uint32_t                    sdma_rings;
-	uint32_t                    compute_rings;
-	uint32_t                    vce_fw_version;
-	uint32_t                    vce_harvest_config;
-	uint32_t                    clock_crystal_freq; /* in kHz */
-
-	/* Kernel info. */
-	uint32_t                    drm_major; /* version */
-	uint32_t                    drm_minor;
-	uint32_t                    drm_patchlevel;
-	bool                     has_userptr;
-
-	/* Shader cores. */
-	uint32_t                    r600_max_quad_pipes; /* wave size / 16 */
-	uint32_t                    max_shader_clock;
-	uint32_t                    num_good_compute_units;
-	uint32_t                    max_se; /* shader engines */
-	uint32_t                    max_sh_per_se; /* shader arrays per shader engine */
-
-	/* Render backends (color + depth blocks). */
-	uint32_t                    r300_num_gb_pipes;
-	uint32_t                    r300_num_z_pipes;
-	uint32_t                    r600_gb_backend_map; /* R600 harvest config */
-	bool                     r600_gb_backend_map_valid;
-	uint32_t                    r600_num_banks;
-	uint32_t                    num_render_backends;
-	uint32_t                    num_tile_pipes; /* pipe count from PIPE_CONFIG */
-	uint32_t                    pipe_interleave_bytes;
-	uint32_t                    enabled_rb_mask; /* GCN harvest config */
-
-	/* Tile modes. */
-	uint32_t                    si_tile_mode_array[32];
-	uint32_t                    cik_macrotile_mode_array[16];
-};
-
-#define RADEON_SURF_MAX_LEVEL                   32
-
 #define RADEON_SURF_TYPE_MASK                   0xFF
 #define RADEON_SURF_TYPE_SHIFT                  0
 #define     RADEON_SURF_TYPE_1D                     0
@@ -138,93 +86,11 @@
 #define     RADEON_SURF_TYPE_2D_ARRAY               5
 #define RADEON_SURF_MODE_MASK                   0xFF
 #define RADEON_SURF_MODE_SHIFT                  8
-#define     RADEON_SURF_MODE_LINEAR_ALIGNED         1
-#define     RADEON_SURF_MODE_1D                     2
-#define     RADEON_SURF_MODE_2D                     3
-#define RADEON_SURF_SCANOUT                     (1 << 16)
-#define RADEON_SURF_ZBUFFER                     (1 << 17)
-#define RADEON_SURF_SBUFFER                     (1 << 18)
-#define RADEON_SURF_Z_OR_SBUFFER                (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
-#define RADEON_SURF_HAS_SBUFFER_MIPTREE         (1 << 19)
-#define RADEON_SURF_HAS_TILE_MODE_INDEX         (1 << 20)
-#define RADEON_SURF_FMASK                       (1 << 21)
-#define RADEON_SURF_DISABLE_DCC                 (1 << 22)
-#define RADEON_SURF_TC_COMPATIBLE_HTILE         (1 << 23)
 
 #define RADEON_SURF_GET(v, field)   (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK)
 #define RADEON_SURF_SET(v, field)   (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT)
 #define RADEON_SURF_CLR(v, field)   ((v) & ~(RADEON_SURF_ ## field ## _MASK << RADEON_SURF_ ## field ## _SHIFT))
 
-struct radeon_surf_level {
-	uint64_t                    offset;
-	uint64_t                    slice_size;
-	uint32_t                    npix_x;
-	uint32_t                    npix_y;
-	uint32_t                    npix_z;
-	uint32_t                    nblk_x;
-	uint32_t                    nblk_y;
-	uint32_t                    nblk_z;
-	uint32_t                    pitch_bytes;
-	uint32_t                    mode;
-	uint64_t                    dcc_offset;
-	uint64_t                    dcc_fast_clear_size;
-	bool                        dcc_enabled;
-};
-
-
-/* surface defintions from the winsys */
-struct radeon_surf {
-	/* These are inputs to the calculator. */
-	uint32_t                    npix_x;
-	uint32_t                    npix_y;
-	uint32_t                    npix_z;
-	uint32_t                    blk_w;
-	uint32_t                    blk_h;
-	uint32_t                    blk_d;
-	uint32_t                    array_size;
-	uint32_t                    last_level;
-	uint32_t                    bpe;
-	uint32_t                    nsamples;
-	uint32_t                    flags;
-
-	/* These are return values. Some of them can be set by the caller, but
-	 * they will be treated as hints (e.g. bankw, bankh) and might be
-	 * changed by the calculator.
-	 */
-	uint64_t                    bo_size;
-	uint64_t                    bo_alignment;
-	/* This applies to EG and later. */
-	uint32_t                    bankw;
-	uint32_t                    bankh;
-	uint32_t                    mtilea;
-	uint32_t                    tile_split;
-	uint32_t                    stencil_tile_split;
-	uint64_t                    stencil_offset;
-	struct radeon_surf_level    level[RADEON_SURF_MAX_LEVEL];
-	struct radeon_surf_level    stencil_level[RADEON_SURF_MAX_LEVEL];
-	uint32_t                    tiling_index[RADEON_SURF_MAX_LEVEL];
-	uint32_t                    stencil_tiling_index[RADEON_SURF_MAX_LEVEL];
-	uint32_t                    pipe_config;
-	uint32_t                    num_banks;
-	uint32_t                    macro_tile_index;
-	uint32_t                    micro_tile_mode; /* displayable, thin, depth, rotated */
-
-	/* Whether the depth miptree or stencil miptree as used by the DB are
-	 * adjusted from their TC compatible form to ensure depth/stencil
-	 * compatibility. If either is true, the corresponding plane cannot be
-	 * sampled from.
-	 */
-	bool                        depth_adjusted;
-	bool                        stencil_adjusted;
-
-	uint64_t                    dcc_size;
-	uint64_t                    dcc_alignment;
-
-	uint64_t                    htile_size;
-	uint64_t                    htile_slice_size;
-	uint64_t                    htile_alignment;
-};
-
 enum radeon_bo_layout {
 	RADEON_LAYOUT_LINEAR = 0,
 	RADEON_LAYOUT_TILED,
@@ -238,16 +104,25 @@
 	/* Tiling flags describing the texture layout for display code
 	 * and DRI sharing.
 	 */
-	enum radeon_bo_layout   microtile;
-	enum radeon_bo_layout   macrotile;
-	unsigned                pipe_config;
-	unsigned                bankw;
-	unsigned                bankh;
-	unsigned                tile_split;
-	unsigned                mtilea;
-	unsigned                num_banks;
-	unsigned                stride;
-	bool                    scanout;
+	union {
+		struct {
+			enum radeon_bo_layout   microtile;
+			enum radeon_bo_layout   macrotile;
+			unsigned                pipe_config;
+			unsigned                bankw;
+			unsigned                bankh;
+			unsigned                tile_split;
+			unsigned                mtilea;
+			unsigned                num_banks;
+			unsigned                stride;
+			bool                    scanout;
+		} legacy;
+
+		struct {
+			/* surface flags */
+			unsigned swizzle_mode:5;
+		} gfx9;
+	} u;
 
 	/* Additional metadata associated with the buffer, in bytes.
 	 * The maximum size is 64 * 4. This is opaque for the winsys & kernel.
@@ -257,9 +132,23 @@
 	uint32_t                metadata[64];
 };
 
+/* Forward declarations of winsys object types. */
 struct radeon_winsys_bo;
 struct radeon_winsys_fence;
-struct radeon_winsys_sem;
+
+struct radv_winsys_sem_counts { /* one group of semaphores; presumably sync-object handles plus winsys semaphore pointers */
+	uint32_t syncobj_count; /* presumably the length of syncobj -- TODO confirm */
+	uint32_t sem_count; /* presumably the length of sem -- TODO confirm */
+	uint32_t *syncobj; /* uint32_t values, the same type create_syncobj hands back as a handle */
+	struct radeon_winsys_sem **sem; /* same pointer type that create_sem returns */
+};
+
+struct radv_winsys_sem_info { /* semaphore argument taken by radeon_winsys::cs_submit */
+	bool cs_emit_signal; /* NOTE(review): exact meaning is not visible in this header -- confirm with the winsys */
+	bool cs_emit_wait; /* NOTE(review): as above */
+	struct radv_winsys_sem_counts wait; /* presumably the semaphores a submission waits on */
+	struct radv_winsys_sem_counts signal; /* presumably the semaphores a submission signals */
+};
 
 struct radeon_winsys {
 	void (*destroy)(struct radeon_winsys *ws);
@@ -317,10 +206,7 @@
 			 unsigned cs_count,
 			 struct radeon_winsys_cs *initial_preamble_cs,
 			 struct radeon_winsys_cs *continue_preamble_cs,
-			 struct radeon_winsys_sem **wait_sem,
-			 unsigned wait_sem_count,
-			 struct radeon_winsys_sem **signal_sem,
-			 unsigned signal_sem_count,
+			 struct radv_winsys_sem_info *sem_info,
 			 bool can_patch,
 			 struct radeon_winsys_fence *fence);
 
@@ -334,6 +220,7 @@
 	void (*cs_dump)(struct radeon_winsys_cs *cs, FILE* file, uint32_t trace_id);
 
 	int (*surface_init)(struct radeon_winsys *ws,
+			    const struct ac_surf_info *surf_info,
 			    struct radeon_surf *surf);
 
 	int (*surface_best)(struct radeon_winsys *ws,
@@ -346,9 +233,17 @@
 			   bool absolute,
 			   uint64_t timeout);
 
+	/* old semaphores - non shareable */
 	struct radeon_winsys_sem *(*create_sem)(struct radeon_winsys *ws);
 	void (*destroy_sem)(struct radeon_winsys_sem *sem);
 
+	/* new shareable sync objects */
+	int (*create_syncobj)(struct radeon_winsys *ws, uint32_t *handle);
+	void (*destroy_syncobj)(struct radeon_winsys *ws, uint32_t handle);
+
+	int (*export_syncobj)(struct radeon_winsys *ws, uint32_t syncobj, int *fd);
+	int (*import_syncobj)(struct radeon_winsys *ws, int fd, uint32_t *syncobj);
+
 };
 
 static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
diff --git a/src/amd/vulkan/radv_wsi.c b/src/amd/vulkan/radv_wsi.c
index 3a8617f..c939bc3 100644
--- a/src/amd/vulkan/radv_wsi.c
+++ b/src/amd/vulkan/radv_wsi.c
@@ -26,7 +26,7 @@
 #include "radv_private.h"
 #include "radv_meta.h"
 #include "wsi_common.h"
-#include "util/vk_util.h"
+#include "vk_util.h"
 
 static const struct wsi_callbacks wsi_cbs = {
    .get_phys_device_format_properties = radv_GetPhysicalDeviceFormatProperties,
@@ -154,6 +154,7 @@
 	VkImage image_h;
 	struct radv_image *image;
 	int fd;
+	RADV_FROM_HANDLE(radv_device, device, device_h);
 
 	result = radv_image_create(device_h,
 				   &(struct radv_image_create_info) {
@@ -185,19 +186,33 @@
 
 	VkDeviceMemory memory_h;
 
-	const VkDedicatedAllocationMemoryAllocateInfoNV ded_alloc = {
-		.sType = VK_STRUCTURE_TYPE_DEDICATED_ALLOCATION_MEMORY_ALLOCATE_INFO_NV,
+	const VkMemoryDedicatedAllocateInfoKHR ded_alloc = {
+		.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
 		.pNext = NULL,
 		.buffer = VK_NULL_HANDLE,
 		.image = image_h
 	};
 
+	/* Find the first VRAM memory type, or GART for PRIME images. */
+	int memory_type_index = -1;
+	for (int i = 0; i < device->physical_device->memory_properties.memoryTypeCount; ++i) {
+		bool is_local = !!(device->physical_device->memory_properties.memoryTypes[i].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+		if ((linear && !is_local) || (!linear && is_local)) {
+			memory_type_index = i;
+			break;
+		}
+	}
+
+	/* fallback */
+	if (memory_type_index == -1)
+		memory_type_index = 0;
+
 	result = radv_AllocateMemory(device_h,
 				     &(VkMemoryAllocateInfo) {
 					     .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
 					     .pNext = &ded_alloc,
 					     .allocationSize = image->size,
-					     .memoryTypeIndex = linear ? 1 : 0,
+					     .memoryTypeIndex = memory_type_index,
 				     },
 				     NULL /* XXX: pAllocator */,
 				     &memory_h);
@@ -211,7 +226,6 @@
 	 * or the fd for the linear image if a copy is required.
 	 */
 	if (!needs_linear_copy || (needs_linear_copy && linear)) {
-		RADV_FROM_HANDLE(radv_device, device, device_h);
 		RADV_FROM_HANDLE(radv_device_memory, memory, memory_h);
 		if (!radv_get_memory_fd(device, memory, &fd))
 			goto fail_alloc_memory;
@@ -224,7 +238,11 @@
 	*memory_p = memory_h;
 	*size = image->size;
 	*offset = image->offset;
-	*row_pitch = surface->level[0].pitch_bytes;
+
+	if (device->physical_device->rad_info.chip_class >= GFX9)
+		*row_pitch = surface->u.gfx9.surf_pitch * surface->bpe;
+	else
+		*row_pitch = surface->u.legacy.level[0].nblk_x * surface->bpe;
 	return VK_SUCCESS;
  fail_alloc_memory:
 	radv_FreeMemory(device_h, memory_h, pAllocator);
@@ -438,11 +456,10 @@
 	VkResult result = swapchain->acquire_next_image(swapchain, timeout, semaphore,
 	                                                pImageIndex);
 
-	if (fence && result == VK_SUCCESS) {
+	if (fence && (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR)) {
 		fence->submitted = true;
 		fence->signalled = true;
 	}
-
 	return result;
 }
 
@@ -452,7 +469,6 @@
 {
 	RADV_FROM_HANDLE(radv_queue, queue, _queue);
 	VkResult result = VK_SUCCESS;
-
 	const VkPresentRegionsKHR *regions =
 	         vk_find_struct_const(pPresentInfo->pNext, PRESENT_REGIONS_KHR);
 
@@ -461,6 +477,20 @@
 		struct radeon_winsys_cs *cs;
 		const VkPresentRegionKHR *region = NULL;
 		VkResult item_result;
+		struct radv_winsys_sem_info sem_info;
+
+		item_result = radv_alloc_sem_info(&sem_info,
+						  pPresentInfo->waitSemaphoreCount,
+						  pPresentInfo->pWaitSemaphores,
+						  0,
+						  NULL);
+		if (pPresentInfo->pResults != NULL)
+			pPresentInfo->pResults[i] = item_result;
+		result = result == VK_SUCCESS ? item_result : result;
+		if (item_result != VK_SUCCESS) {
+			radv_free_sem_info(&sem_info);
+			continue;
+		}
 
 		assert(radv_device_from_handle(swapchain->device) == queue->device);
 		if (swapchain->fences[0] == VK_NULL_HANDLE) {
@@ -472,8 +502,10 @@
 			if (pPresentInfo->pResults != NULL)
 				pPresentInfo->pResults[i] = item_result;
 			result = result == VK_SUCCESS ? item_result : result;
-			if (item_result != VK_SUCCESS)
+			if (item_result != VK_SUCCESS) {
+				radv_free_sem_info(&sem_info);
 				continue;
+			}
 		} else {
 			radv_ResetFences(radv_device_to_handle(queue->device),
 					 1, &swapchain->fences[0]);
@@ -487,11 +519,12 @@
 		RADV_FROM_HANDLE(radv_fence, fence, swapchain->fences[0]);
 		struct radeon_winsys_fence *base_fence = fence->fence;
 		struct radeon_winsys_ctx *ctx = queue->hw_ctx;
+
 		queue->device->ws->cs_submit(ctx, queue->queue_idx,
 					     &cs,
 					     1, NULL, NULL,
-					     (struct radeon_winsys_sem **)pPresentInfo->pWaitSemaphores,
-					     pPresentInfo->waitSemaphoreCount, NULL, 0, false, base_fence);
+					     &sem_info,
+					     false, base_fence);
 		fence->submitted = true;
 
 		if (regions && regions->pRegions)
@@ -504,8 +537,10 @@
 		if (pPresentInfo->pResults != NULL)
 			pPresentInfo->pResults[i] = item_result;
 		result = result == VK_SUCCESS ? item_result : result;
-		if (item_result != VK_SUCCESS)
+		if (item_result != VK_SUCCESS) {
+			radv_free_sem_info(&sem_info);
 			continue;
+		}
 
 		VkFence last = swapchain->fences[2];
 		swapchain->fences[2] = swapchain->fences[1];
@@ -517,6 +552,7 @@
 					   1, &last, true, 1);
 		}
 
+		radv_free_sem_info(&sem_info);
 	}
 
 	return VK_SUCCESS;
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 8d7db96..ef4f926 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -30,6 +30,7 @@
 #include "radv_private.h"
 #include "radv_cs.h"
 #include "sid.h"
+#include "gfx9d.h"
 #include "radv_util.h"
 #include "main/macros.h"
 
@@ -241,6 +242,9 @@
 	radeon_set_context_reg(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
 
 	radeon_set_context_reg(cs, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
+	radeon_set_context_reg(cs, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
+	if (physical_device->rad_info.chip_class >= GFX9)
+		radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF, 0);
 	radeon_set_context_reg(cs, R_028AB8_VGT_VTX_CNT_EN, 0x0);
 	if (physical_device->rad_info.chip_class < CIK)
 		radeon_set_config_reg(cs, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
@@ -328,24 +332,28 @@
 		raster_config_1 = 0x00000000;
 		break;
 	default:
-		fprintf(stderr,
-			"radeonsi: Unknown GPU, using 0 for raster_config\n");
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
+		if (physical_device->rad_info.chip_class <= VI) {
+			fprintf(stderr,
+				"radeonsi: Unknown GPU, using 0 for raster_config\n");
+			raster_config = 0x00000000;
+			raster_config_1 = 0x00000000;
+		}
 		break;
 	}
 
 	/* Always use the default config when all backends are enabled
 	 * (or when we failed to determine the enabled backends).
 	 */
-	if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
-		radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG,
-				       raster_config);
-		if (physical_device->rad_info.chip_class >= CIK)
-			radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1,
-					       raster_config_1);
-	} else {
-		si_write_harvested_raster_configs(physical_device, cs, raster_config, raster_config_1);
+	if (physical_device->rad_info.chip_class <= VI) {
+		if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
+			radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG,
+					       raster_config);
+			if (physical_device->rad_info.chip_class >= CIK)
+				radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1,
+						       raster_config_1);
+		} else {
+			si_write_harvested_raster_configs(physical_device, cs, raster_config, raster_config_1);
+		}
 	}
 
 	radeon_set_context_reg(cs, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
@@ -369,22 +377,31 @@
 			       S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
 			       S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE));
 
-	radeon_set_context_reg(cs, R_028400_VGT_MAX_VTX_INDX, ~0);
-	radeon_set_context_reg(cs, R_028404_VGT_MIN_VTX_INDX, 0);
-	radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0);
+	if (physical_device->rad_info.chip_class >= GFX9) {
+		radeon_set_uconfig_reg(cs, R_030920_VGT_MAX_VTX_INDX, ~0);
+		radeon_set_uconfig_reg(cs, R_030924_VGT_MIN_VTX_INDX, 0);
+		radeon_set_uconfig_reg(cs, R_030928_VGT_INDX_OFFSET, 0);
+	} else {
+		radeon_set_context_reg(cs, R_028400_VGT_MAX_VTX_INDX, ~0);
+		radeon_set_context_reg(cs, R_028404_VGT_MIN_VTX_INDX, 0);
+		radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0);
+	}
 
 	if (physical_device->rad_info.chip_class >= CIK) {
-		/* If this is 0, Bonaire can hang even if GS isn't being used.
-		 * Other chips are unaffected. These are suboptimal values,
-		 * but we don't use on-chip GS.
-		 */
-		radeon_set_context_reg(cs, R_028A44_VGT_GS_ONCHIP_CNTL,
-				       S_028A44_ES_VERTS_PER_SUBGRP(64) |
-				       S_028A44_GS_PRIMS_PER_SUBGRP(4));
-
-		radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
-		radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
-		radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff));
+		if (physical_device->rad_info.chip_class >= GFX9) {
+			radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_CU_EN(0xffff));
+		} else {
+			radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
+			radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
+			radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff));
+			/* If this is 0, Bonaire can hang even if GS isn't being used.
+			 * Other chips are unaffected. These are suboptimal values,
+			 * but we don't use on-chip GS.
+			 */
+			radeon_set_context_reg(cs, R_028A44_VGT_GS_ONCHIP_CNTL,
+					       S_028A44_ES_VERTS_PER_SUBGRP(64) |
+					       S_028A44_GS_PRIMS_PER_SUBGRP(4));
+		}
 		radeon_set_sh_reg(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
 
 		if (physical_device->rad_info.num_good_compute_units /
@@ -435,9 +452,41 @@
 		radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
 	}
 
-	if (physical_device->rad_info.family == CHIP_STONEY)
+	if (physical_device->has_rbplus)
 		radeon_set_context_reg(cs, R_028C40_PA_SC_SHADER_CONTROL, 0);
 
+	if (physical_device->rad_info.chip_class >= GFX9) {
+		unsigned num_se = physical_device->rad_info.max_se;
+		unsigned pc_lines = 0;
+
+		switch (physical_device->rad_info.family) {
+		case CHIP_VEGA10:
+			pc_lines = 4096;
+			break;
+		case CHIP_RAVEN:
+			pc_lines = 1024;
+			break;
+		default:
+			assert(0);
+		}
+
+		radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
+				       S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
+		radeon_set_context_reg(cs, R_028064_DB_RENDER_FILTER, 0);
+		/* TODO: We can use this to disable RBs for rendering to GART: */
+		radeon_set_context_reg(cs, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, 0);
+		radeon_set_context_reg(cs, R_02883C_PA_SU_OVER_RASTERIZATION_CNTL, 0);
+		/* TODO: Enable the binner: */
+		radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
+				       S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
+				       S_028C44_DISABLE_START_OF_PRIM(1));
+		radeon_set_context_reg(cs, R_028C48_PA_SC_BINNER_CNTL_1,
+				       S_028C48_MAX_ALLOC_COUNT(MIN2(128, pc_lines / (4 * num_se))) |
+				       S_028C48_MAX_PRIM_PER_BATCH(1023));
+		radeon_set_context_reg(cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
+				       S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
+		radeon_set_uconfig_reg(cs, R_030968_VGT_INSTANCE_BASE_ID, 0);
+	}
 	si_emit_compute(physical_device, cs);
 }
 
@@ -651,6 +700,9 @@
 
 	multi_instances_smaller_than_primgroup = indirect_draw || (instanced_draw &&
 								   num_prims < primgroup_size);
+	if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.prim_id_input)
+		ia_switch_on_eoi = true;
+
 	if (radv_pipeline_has_tess(cmd_buffer->state.pipeline)) {
 		/* SWITCH_ON_EOI must be set if PrimID is used. */
 		if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.uses_prim_id ||
@@ -667,7 +719,8 @@
 		/* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
 		if (cmd_buffer->device->has_distributed_tess) {
 			if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) {
-				partial_es_wave = true;
+				if (chip_class <= VI)
+					partial_es_wave = true;
 
 				if (family == CHIP_TONGA ||
 				    family == CHIP_FIJI ||
@@ -735,10 +788,15 @@
 		assert(wd_switch_on_eop || !ia_switch_on_eop);
 	}
 	/* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
-	if (ia_switch_on_eoi)
+	if (chip_class <= VI && ia_switch_on_eoi)
 		partial_es_wave = true;
 
 	if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) {
+
+		if (radv_pipeline_has_gs(cmd_buffer->state.pipeline) &&
+		    cmd_buffer->state.pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs.uses_prim_id)
+			ia_switch_on_eoi = true;
+
 		/* GS requirement. */
 		if (SI_GS_PER_ES / primgroup_size >= cmd_buffer->device->gs_table_depth - 3)
 			partial_es_wave = true;
@@ -757,28 +815,98 @@
 		S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
 		S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) |
 		S_028AA8_WD_SWITCH_ON_EOP(chip_class >= CIK ? wd_switch_on_eop : 0) |
-		S_028AA8_MAX_PRIMGRP_IN_WAVE(chip_class >= VI ?
-					     max_primgroup_in_wave : 0);
+		/* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
+		S_028AA8_MAX_PRIMGRP_IN_WAVE(chip_class == VI ?
+					     max_primgroup_in_wave : 0) |
+		S_030960_EN_INST_OPT_BASIC(chip_class >= GFX9) |
+		S_030960_EN_INST_OPT_ADV(chip_class >= GFX9);
 
 }
 
+void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs, /* command stream the packet(s) are appended to */
+				bool predicated, /* forwarded as the last argument of every PKT3() header */
+				enum chip_class chip_class, /* decides which packet sequence is emitted */
+				bool is_mec, /* on chips before GFX9, selects the shorter RELEASE_MEM form */
+				unsigned event, unsigned event_flags, /* EVENT_TYPE(event); flags are OR'ed into the same dword */
+				unsigned data_sel, /* wrapped in EOP_DATA_SEL() */
+				uint64_t va, /* address placed in the packet */
+				uint32_t old_fence, /* immediate data of the extra first EOP on CIK/VI */
+				uint32_t new_fence) /* immediate data of the final packet */
+{ /* Appends an end-of-pipe event to cs: RELEASE_MEM or EVENT_WRITE_EOP, depending on chip_class and is_mec. */
+	unsigned op = EVENT_TYPE(event) |
+		EVENT_INDEX(5) |
+		event_flags;
+	unsigned is_gfx8_mec = is_mec && chip_class < GFX9; /* 1 only for the MEC path on pre-GFX9 chips */
+
+	if (chip_class >= GFX9 || is_gfx8_mec) { /* single RELEASE_MEM packet */
+		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, predicated)); /* the GFX9 form carries one more dword */
+		radeon_emit(cs, op);
+		radeon_emit(cs, EOP_DATA_SEL(data_sel));
+		radeon_emit(cs, va);            /* address lo */
+		radeon_emit(cs, va >> 32);      /* address hi */
+		radeon_emit(cs, new_fence);     /* immediate data lo */
+		radeon_emit(cs, 0); /* immediate data hi */
+		if (!is_gfx8_mec)
+			radeon_emit(cs, 0); /* unused */
+	} else { /* EVENT_WRITE_EOP path: two packets on CIK/VI, one otherwise */
+		if (chip_class == CIK ||
+		    chip_class == VI) {
+			/* Two EOP events are required to make all engines go idle
+			 * (and optional cache flushes executed) before the timestamp
+			 * is written.
+			 */
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, predicated));
+			radeon_emit(cs, op);
+			radeon_emit(cs, va); /* address lo */
+			radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel)); /* low 16 bits of address hi share a dword with data_sel */
+			radeon_emit(cs, old_fence); /* immediate data */
+			radeon_emit(cs, 0); /* unused */
+		}
+
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, predicated));
+		radeon_emit(cs, op);
+		radeon_emit(cs, va); /* address lo */
+		radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel)); /* low 16 bits of address hi share a dword with data_sel */
+		radeon_emit(cs, new_fence); /* immediate data */
+		radeon_emit(cs, 0); /* unused */
+	}
+}
+
+void
+si_emit_wait_fence(struct radeon_winsys_cs *cs, /* command stream the packet is appended to */
+		   bool predicated, /* forwarded as the last argument of PKT3() */
+		   uint64_t va, uint32_t ref, /* address emitted as lo/hi dwords; reference value dword */
+		   uint32_t mask) /* mask dword of the packet */
+{ /* Appends one 7-dword PKT3_WAIT_REG_MEM packet (WAIT_REG_MEM_EQUAL, MEM_SPACE(1)) to cs. */
+	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, predicated));
+	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1)); /* compare function and address space */
+	radeon_emit(cs, va); /* address lo */
+	radeon_emit(cs, va >> 32); /* address hi */
+	radeon_emit(cs, ref); /* reference value */
+	radeon_emit(cs, mask); /* mask */
+	radeon_emit(cs, 4); /* poll interval */
+}
+
 static void
 si_emit_acquire_mem(struct radeon_winsys_cs *cs,
                     bool is_mec,
+		    bool predicated,
+		    bool is_gfx9,
                     unsigned cp_coher_cntl)
 {
-	if (is_mec) {
-		radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) |
-		                            PKT3_SHADER_TYPE_S(1));
+	if (is_mec || is_gfx9) {
+		uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff;
+		radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, predicated) |
+		                            PKT3_SHADER_TYPE_S(is_mec));
 		radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
 		radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
-		radeon_emit(cs, 0xff);            /* CP_COHER_SIZE_HI */
+		radeon_emit(cs, hi_val);          /* CP_COHER_SIZE_HI */
 		radeon_emit(cs, 0);               /* CP_COHER_BASE */
 		radeon_emit(cs, 0);               /* CP_COHER_BASE_HI */
 		radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
 	} else {
 		/* ACQUIRE_MEM is only required on a compute ring. */
-		radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+		radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, predicated));
 		radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
 		radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
 		radeon_emit(cs, 0);               /* CP_COHER_BASE */
@@ -788,109 +916,179 @@
 
 void
 si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
+		       bool predicated,
                        enum chip_class chip_class,
+		       uint32_t *flush_cnt,
+		       uint64_t flush_va,
                        bool is_mec,
                        enum radv_cmd_flush_bits flush_bits)
 {
 	unsigned cp_coher_cntl = 0;
-
+	uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+					     RADV_CMD_FLAG_FLUSH_AND_INV_DB);
+	
 	if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
 		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
 	if (flush_bits & RADV_CMD_FLAG_INV_SMEM_L1)
 		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
 
-	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
-		cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
-			S_0085F0_CB0_DEST_BASE_ENA(1) |
-			S_0085F0_CB1_DEST_BASE_ENA(1) |
-			S_0085F0_CB2_DEST_BASE_ENA(1) |
-			S_0085F0_CB3_DEST_BASE_ENA(1) |
-			S_0085F0_CB4_DEST_BASE_ENA(1) |
-			S_0085F0_CB5_DEST_BASE_ENA(1) |
-			S_0085F0_CB6_DEST_BASE_ENA(1) |
-			S_0085F0_CB7_DEST_BASE_ENA(1);
+	if (chip_class <= VI) {
+		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
+			cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
+				S_0085F0_CB0_DEST_BASE_ENA(1) |
+				S_0085F0_CB1_DEST_BASE_ENA(1) |
+				S_0085F0_CB2_DEST_BASE_ENA(1) |
+				S_0085F0_CB3_DEST_BASE_ENA(1) |
+				S_0085F0_CB4_DEST_BASE_ENA(1) |
+				S_0085F0_CB5_DEST_BASE_ENA(1) |
+				S_0085F0_CB6_DEST_BASE_ENA(1) |
+				S_0085F0_CB7_DEST_BASE_ENA(1);
 
-		/* Necessary for DCC */
-		if (chip_class >= VI) {
-			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) |
-			                            EVENT_INDEX(5));
-			radeon_emit(cs, 0);
-			radeon_emit(cs, 0);
-			radeon_emit(cs, 0);
-			radeon_emit(cs, 0);
+			/* Necessary for DCC */
+			if (chip_class >= VI) {
+				si_cs_emit_write_event_eop(cs,
+							   predicated,
+							   chip_class,
+							   is_mec,
+							   V_028A90_FLUSH_AND_INV_CB_DATA_TS,
+							   0, 0, 0, 0, 0);
+			}
+		}
+		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
+			cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
+				S_0085F0_DB_DEST_BASE_ENA(1);
 		}
 	}
 
-	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
-		cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
-			S_0085F0_DB_DEST_BASE_ENA(1);
-	}
-
 	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
 	}
 
 	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
 	}
 
-	if (!(flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
-					      RADV_CMD_FLAG_FLUSH_AND_INV_DB))) {
+	if (!flush_cb_db) {
 		if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
-			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
 			radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 		} else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
-			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
 			radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 		}
 	}
 
 	if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 	}
 
+	if (chip_class >= GFX9 && flush_cb_db) {
+		unsigned cb_db_event, tc_flags;
+
+		/* Set the CB/DB flush event. */
+		switch (flush_cb_db) {
+		case RADV_CMD_FLAG_FLUSH_AND_INV_CB:
+			cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+			break;
+		case RADV_CMD_FLAG_FLUSH_AND_INV_DB:
+			cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+			break;
+		default:
+			/* both CB & DB */
+			cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+		}
+
+		/* TC    | TC_WB         = invalidate L2 data
+		 * TC_MD | TC_WB         = invalidate L2 metadata
+		 * TC    | TC_WB | TC_MD = invalidate L2 data & metadata
+		 *
+		 * The metadata cache must always be invalidated for coherency
+		 * between CB/DB and shaders. (metadata = HTILE, CMASK, DCC)
+		 *
+		 * TC must be invalidated on GFX9 only if the CB/DB surface is
+		 * not pipe-aligned. If the surface is RB-aligned, it might not
+		 * strictly be pipe-aligned since RB alignment takes precedence.
+		 */
+		tc_flags = EVENT_TC_WB_ACTION_ENA |
+			   EVENT_TC_MD_ACTION_ENA;
+
+		/* Ideally flush TC together with CB/DB. */
+		if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) {
+			tc_flags |= EVENT_TC_ACTION_ENA |
+				    EVENT_TCL1_ACTION_ENA;
+
+			/* Clear the flags. */
+		        flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 |
+					 RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 |
+					 RADV_CMD_FLAG_INV_VMEM_L1);
+		}
+		assert(flush_cnt);
+		uint32_t old_fence = (*flush_cnt)++;
+
+		si_cs_emit_write_event_eop(cs, predicated, chip_class, false, cb_db_event, tc_flags, 1,
+					   flush_va, old_fence, *flush_cnt);
+		si_emit_wait_fence(cs, predicated, flush_va, *flush_cnt, 0xffffffff);
+	}
+
 	/* VGT state sync */
 	if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) {
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 	}
 
 	/* Make sure ME is idle (it executes most packets) before continuing.
 	 * This prevents read-after-write hazards between PFP and ME.
 	 */
-	if ((cp_coher_cntl || (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) &&
+	if ((cp_coher_cntl ||
+	     (flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+			    RADV_CMD_FLAG_INV_VMEM_L1 |
+			    RADV_CMD_FLAG_INV_GLOBAL_L2 |
+			    RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) &&
 	    !is_mec) {
-		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, predicated));
 		radeon_emit(cs, 0);
 	}
 
 	if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
 	    (chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
-		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
-		if (chip_class >= VI)
-			cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
-	} else	if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
-		cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1) |
-		                 S_0301F0_TC_NC_ACTION_ENA(1);
-
-		/* L2 writeback doesn't combine with L1 invalidate */
-		si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
-
+		si_emit_acquire_mem(cs, is_mec, predicated, chip_class >= GFX9,
+				    cp_coher_cntl |
+				    S_0085F0_TC_ACTION_ENA(1) |
+				    S_0085F0_TCL1_ACTION_ENA(1) |
+				    S_0301F0_TC_WB_ACTION_ENA(chip_class >= VI));
 		cp_coher_cntl = 0;
+	} else {
+		if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
+			/* WB = write-back
+			 * NC = apply to non-coherent MTYPEs
+			 *      (i.e. MTYPE <= 1, which is what we use everywhere)
+			 *
+			 * WB doesn't work without NC.
+			 */
+			si_emit_acquire_mem(cs, is_mec, predicated,
+					    chip_class >= GFX9,
+					    cp_coher_cntl |
+					    S_0301F0_TC_WB_ACTION_ENA(1) |
+					    S_0301F0_TC_NC_ACTION_ENA(1));
+			cp_coher_cntl = 0;
+		}
+		if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
+			si_emit_acquire_mem(cs, is_mec,
+					    predicated, chip_class >= GFX9,
+					    cp_coher_cntl |
+					    S_0085F0_TCL1_ACTION_ENA(1));
+			cp_coher_cntl = 0;
+		}
 	}
 
-	if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1)
-		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-
 	/* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
 	 * Therefore, it should be last. Done in PFP.
 	 */
 	if (cp_coher_cntl)
-		si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
+		si_emit_acquire_mem(cs, is_mec, predicated, chip_class >= GFX9, cp_coher_cntl);
 }
 
 void
@@ -907,67 +1105,138 @@
 	                                          RADV_CMD_FLAG_VS_PARTIAL_FLUSH |
 	                                          RADV_CMD_FLAG_VGT_FLUSH);
 
+	if (!cmd_buffer->state.flush_bits)
+		return;
+
+	enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
 	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
 
+	uint32_t *ptr = NULL;
+	uint64_t va = 0;
+	if (chip_class == GFX9) {
+		va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->gfx9_fence_bo) + cmd_buffer->gfx9_fence_offset;
+		ptr = &cmd_buffer->gfx9_fence_idx;
+	}
 	si_cs_emit_cache_flush(cmd_buffer->cs,
+			       cmd_buffer->state.predicating,
 	                       cmd_buffer->device->physical_device->rad_info.chip_class,
+			       ptr, va,
 	                       radv_cmd_buffer_uses_mec(cmd_buffer),
 	                       cmd_buffer->state.flush_bits);
 
 
-	if (cmd_buffer->state.flush_bits)
-		radv_cmd_buffer_trace_emit(cmd_buffer);
+	radv_cmd_buffer_trace_emit(cmd_buffer);
 	cmd_buffer->state.flush_bits = 0;
 }
 
+/* Set the CP predication state from a boolean stored at va. A zero va leaves
+ * op at 0; the packet layout differs between GFX9+ and older chips. */
+void
+si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
+{
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	const uint32_t op = va ? (PRED_OP(PREDICATION_OP_BOOL64) | PREDICATION_DRAW_VISIBLE) : 0;
+
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
+		radeon_emit(cs, op);
+		radeon_emit(cs, va);
+		radeon_emit(cs, va >> 32);
+	} else {
+		radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+		radeon_emit(cs, va);
+		radeon_emit(cs, op | ((va >> 32) & 0xFF)); /* op shares the dword with VA bits [39:32] */
+	}
+}
 
 /* Set this if you want the 3D engine to wait until CP DMA is done.
  * It should be set on the last CP DMA packet. */
-#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */
+#define CP_DMA_SYNC	(1 << 0)
 
 /* Set this if the source data was used as a destination in a previous CP DMA
  * packet. It's for preventing a read-after-write (RAW) hazard between two
  * CP DMA packets. */
-#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
-#define CIK_CP_DMA_USE_L2	(1 << 2)
+#define CP_DMA_RAW_WAIT	(1 << 1)
+#define CP_DMA_USE_L2	(1 << 2)
+#define CP_DMA_CLEAR	(1 << 3)
 
 /* Alignment for optimal performance. */
-#define CP_DMA_ALIGNMENT	32
-/* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT	((1 << 21) - CP_DMA_ALIGNMENT)
+#define SI_CPDMA_ALIGNMENT	32
 
-static void si_emit_cp_dma_copy_buffer(struct radv_cmd_buffer *cmd_buffer,
-				       uint64_t dst_va, uint64_t src_va,
-				       unsigned size, unsigned flags)
+/* Largest byte count a single CP DMA packet can carry on this chip, rounded
+ * down to a multiple of SI_CPDMA_ALIGNMENT for optimal performance. */
+static inline unsigned cp_dma_max_byte_count(struct radv_cmd_buffer *cmd_buffer)
+{
+	const unsigned align_mask = SI_CPDMA_ALIGNMENT - 1;
+
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
+		return S_414_BYTE_COUNT_GFX9(~0u) & ~align_mask;
+	return S_414_BYTE_COUNT_GFX6(~0u) & ~align_mask;
+}
+
+/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
+ * a buffer. The size must not exceed cp_dma_max_byte_count(). If CP_DMA_CLEAR
+ * is set, src_va is a 32-bit clear value.
+ */
+static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
+			   uint64_t dst_va, uint64_t src_va,
+			   unsigned size, unsigned flags)
 {
 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
-	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
-	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM_GFX6(1) : 0;
-	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
-	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
-			   S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
-			   S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0;
+	uint32_t header = 0, command = 0;
 
 	assert(size);
-	assert((size & ((1<<21)-1)) == size);
+	assert(size <= cp_dma_max_byte_count(cmd_buffer));
 
 	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
+		command |= S_414_BYTE_COUNT_GFX9(size);
+	else
+		command |= S_414_BYTE_COUNT_GFX6(size);
+
+	/* Sync flags. */
+	if (flags & CP_DMA_SYNC)
+		header |= S_411_CP_SYNC(1);
+	else {
+		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
+			command |= S_414_DISABLE_WR_CONFIRM_GFX9(1);
+		else
+			command |= S_414_DISABLE_WR_CONFIRM_GFX6(1);
+	}
+
+	if (flags & CP_DMA_RAW_WAIT)
+		command |= S_414_RAW_WAIT(1);
+
+	/* Src and dst flags. */
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
+	    !(flags & CP_DMA_CLEAR) &&
+	    src_va == dst_va)
+		header |= S_411_DSL_SEL(V_411_NOWHERE); /* prefetch only */
+	else if (flags & CP_DMA_USE_L2)
+		header |= S_411_DSL_SEL(V_411_DST_ADDR_TC_L2);
+
+	if (flags & CP_DMA_CLEAR)
+		header |= S_411_SRC_SEL(V_411_DATA);
+	else if (flags & CP_DMA_USE_L2)
+		header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2);
 
 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
-		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
-		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
+		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, cmd_buffer->state.predicating));
+		radeon_emit(cs, header);
 		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
 		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
 		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
-		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, command);
 	} else {
-		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
+		assert(!(flags & CP_DMA_USE_L2));
+		header |= S_411_SRC_ADDR_HI(src_va >> 32);
+		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, cmd_buffer->state.predicating));
 		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
-		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
+		radeon_emit(cs, header);			/* SRC_ADDR_HI [15:0] + flags. */
 		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, command);
 	}
 
 	/* CP DMA is executed in ME, but index buffers are read by PFP.
@@ -975,53 +1244,22 @@
 	 * indices. If we wanted to execute CP DMA in PFP, this packet
 	 * should precede it.
 	 */
-	if (sync_flag && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
-		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+	if ((flags & CP_DMA_SYNC) && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
+		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
 		radeon_emit(cs, 0);
 	}
 
 	radv_cmd_buffer_trace_emit(cmd_buffer);
 }
 
-/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
-static void si_emit_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer,
-					uint64_t dst_va, unsigned size,
-					uint32_t clear_value, unsigned flags)
+void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
+                        unsigned size)
 {
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
-	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
-	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM_GFX6(1) : 0;
-	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
-	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0;
+	uint64_t aligned_va = va & ~(SI_CPDMA_ALIGNMENT - 1);
+	uint64_t aligned_size = ((va + size + SI_CPDMA_ALIGNMENT -1) & ~(SI_CPDMA_ALIGNMENT - 1)) - aligned_va;
 
-	assert(size);
-	assert((size & ((1<<21)-1)) == size);
-
-	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
-
-	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
-		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
-		radeon_emit(cs, sync_flag | dst_sel | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
-		radeon_emit(cs, clear_value);		/* DATA [31:0] */
-		radeon_emit(cs, 0);
-		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	} else {
-		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
-		radeon_emit(cs, clear_value);		/* DATA [31:0] */
-		radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
-		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	}
-
-	/* See "copy_buffer" for explanation. */
-	if (sync_flag && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
-		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-		radeon_emit(cs, 0);
-	}
-	radv_cmd_buffer_trace_emit(cmd_buffer);
+	si_emit_cp_dma(cmd_buffer, aligned_va, aligned_va,
+		       aligned_size, CP_DMA_USE_L2);
 }
 
 static void si_cp_dma_prepare(struct radv_cmd_buffer *cmd_buffer, uint64_t byte_count,
@@ -1033,14 +1271,14 @@
 	 */
 	if (cmd_buffer->state.flush_bits) {
 		si_emit_cache_flush(cmd_buffer);
-		*flags |= SI_CP_DMA_RAW_WAIT;
+		*flags |= CP_DMA_RAW_WAIT;
 	}
 
 	/* Do the synchronization after the last dma, so that all data
 	 * is written to memory.
 	 */
 	if (byte_count == remaining_size)
-		*flags |= R600_CP_DMA_SYNC;
+		*flags |= CP_DMA_SYNC;
 }
 
 static void si_cp_dma_realign_engine(struct radv_cmd_buffer *cmd_buffer, unsigned size)
@@ -1048,20 +1286,20 @@
 	uint64_t va;
 	uint32_t offset;
 	unsigned dma_flags = 0;
-	unsigned buf_size = CP_DMA_ALIGNMENT * 2;
+	unsigned buf_size = SI_CPDMA_ALIGNMENT * 2;
 	void *ptr;
 
-	assert(size < CP_DMA_ALIGNMENT);
+	assert(size < SI_CPDMA_ALIGNMENT);
 
-	radv_cmd_buffer_upload_alloc(cmd_buffer, buf_size, CP_DMA_ALIGNMENT,  &offset, &ptr);
+	radv_cmd_buffer_upload_alloc(cmd_buffer, buf_size, SI_CPDMA_ALIGNMENT,  &offset, &ptr);
 
 	va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
 	va += offset;
 
 	si_cp_dma_prepare(cmd_buffer, size, size, &dma_flags);
 
-	si_emit_cp_dma_copy_buffer(cmd_buffer, va, va + CP_DMA_ALIGNMENT, size,
-				   dma_flags);
+	si_emit_cp_dma(cmd_buffer, va, va + SI_CPDMA_ALIGNMENT, size,
+		       dma_flags);
 }
 
 void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
@@ -1078,15 +1316,15 @@
 		 * just to align the internal counter. Otherwise, the DMA engine
 		 * would slow down by an order of magnitude for following copies.
 		 */
-		if (size % CP_DMA_ALIGNMENT)
-			realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
+		if (size % SI_CPDMA_ALIGNMENT)
+			realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);
 
 		/* If the copy begins unaligned, we must start copying from the next
 		 * aligned block and the skipped part should be copied after everything
 		 * else has been copied. Only the src alignment matters, not dst.
 		 */
-		if (src_va % CP_DMA_ALIGNMENT) {
-			skipped_size = CP_DMA_ALIGNMENT - (src_va % CP_DMA_ALIGNMENT);
+		if (src_va % SI_CPDMA_ALIGNMENT) {
+			skipped_size = SI_CPDMA_ALIGNMENT - (src_va % SI_CPDMA_ALIGNMENT);
 			/* The main part will be skipped if the size is too small. */
 			skipped_size = MIN2(skipped_size, size);
 			size -= skipped_size;
@@ -1097,14 +1335,14 @@
 
 	while (size) {
 		unsigned dma_flags = 0;
-		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
+		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
 
 		si_cp_dma_prepare(cmd_buffer, byte_count,
 				  size + skipped_size + realign_size,
 				  &dma_flags);
 
-		si_emit_cp_dma_copy_buffer(cmd_buffer, main_dest_va, main_src_va,
-					   byte_count, dma_flags);
+		si_emit_cp_dma(cmd_buffer, main_dest_va, main_src_va,
+			       byte_count, dma_flags);
 
 		size -= byte_count;
 		main_src_va += byte_count;
@@ -1118,8 +1356,8 @@
 				  size + skipped_size + realign_size,
 				  &dma_flags);
 
-		si_emit_cp_dma_copy_buffer(cmd_buffer, dest_va, src_va,
-					   skipped_size, dma_flags);
+		si_emit_cp_dma(cmd_buffer, dest_va, src_va,
+			       skipped_size, dma_flags);
 	}
 	if (realign_size)
 		si_cp_dma_realign_engine(cmd_buffer, realign_size);
@@ -1135,14 +1373,14 @@
 	assert(va % 4 == 0 && size % 4 == 0);
 
 	while (size) {
-		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
-		unsigned dma_flags = 0;
+		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
+		unsigned dma_flags = CP_DMA_CLEAR;
 
 		si_cp_dma_prepare(cmd_buffer, byte_count, size, &dma_flags);
 
 		/* Emit the clear packet. */
-		si_emit_cp_dma_clear_buffer(cmd_buffer, va, byte_count, value,
-					    dma_flags);
+		si_emit_cp_dma(cmd_buffer, va, value, byte_count,
+			       dma_flags);
 
 		size -= byte_count;
 		va += byte_count;
diff --git a/src/amd/vulkan/vk_format.h b/src/amd/vulkan/vk_format.h
index 13ac179..43265ed 100644
--- a/src/amd/vulkan/vk_format.h
+++ b/src/amd/vulkan/vk_format.h
@@ -367,6 +367,19 @@
 }
 
 static inline bool
+vk_format_is_stencil(VkFormat format)
+{
+	/* Returns vk_format_has_stencil() for the format's description.
+	 * A missing description trips the assert when assertions are enabled
+	 * and yields false otherwise.
+	 */
+	const struct vk_format_description *desc = vk_format_description(format);
+
+	assert(desc);
+	return desc ? vk_format_has_stencil(desc) : false;
+}
+
+static inline bool
 vk_format_is_color(VkFormat format)
 {
 	return !vk_format_is_depth_or_stencil(format);
@@ -396,6 +409,13 @@
 	return channel >= 0 && desc->channel[channel].pure_integer;
 }
 
+/* True when the format's description reports the sRGB colorspace; false when no description exists. */
+static inline bool vk_format_is_srgb(VkFormat format)
+{
+	const struct vk_format_description *desc = vk_format_description(format);
+	return desc && desc->colorspace == VK_FORMAT_COLORSPACE_SRGB;
+}
+
 static inline VkFormat
 vk_format_stencil_only(VkFormat format)
 {
@@ -445,4 +465,27 @@
 	}
 }
 
+/* Map each listed sRGB format to its UNORM counterpart; any other format is returned unchanged. */
+static inline VkFormat vk_to_non_srgb_format(VkFormat format)
+{
+	switch (format) {
+	case VK_FORMAT_R8_SRGB:
+		return VK_FORMAT_R8_UNORM;
+	case VK_FORMAT_R8G8_SRGB:
+		return VK_FORMAT_R8G8_UNORM;
+	case VK_FORMAT_R8G8B8_SRGB:
+		return VK_FORMAT_R8G8B8_UNORM;
+	case VK_FORMAT_B8G8R8_SRGB:
+		return VK_FORMAT_B8G8R8_UNORM;
+	case VK_FORMAT_R8G8B8A8_SRGB:
+		return VK_FORMAT_R8G8B8A8_UNORM;
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		return VK_FORMAT_B8G8R8A8_UNORM;
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
+	default:
+		return format;
+	}
+}
+
 #endif /* VK_FORMAT_H */
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
index 7b67945..0e587f5 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
@@ -39,6 +39,23 @@
 
 static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys_bo *_bo);
 
+/* Wrapper around amdgpu_bo_va_op_raw(): rounds size up to the page size and
+ * always ORs the readable/writeable/executable page flags into flags.
+ */
+static int
+radv_amdgpu_bo_va_op(amdgpu_device_handle dev, amdgpu_bo_handle bo,
+		     uint64_t offset, uint64_t size, uint64_t addr,
+		     uint64_t flags, uint32_t ops)
+{
+	const uint64_t page_flags = AMDGPU_VM_PAGE_READABLE |
+				    AMDGPU_VM_PAGE_WRITEABLE |
+				    AMDGPU_VM_PAGE_EXECUTABLE;
+	const uint64_t aligned_size = ALIGN(size, getpagesize());
+
+	return amdgpu_bo_va_op_raw(dev, bo, offset, aligned_size, addr,
+				   flags | page_flags, ops);
+}
+
 static void
 radv_amdgpu_winsys_virtual_map(struct radv_amdgpu_winsys_bo *bo,
                                const struct radv_amdgpu_map_range *range)
@@ -49,8 +66,8 @@
 		return; /* TODO: PRT mapping */
 
 	p_atomic_inc(&range->bo->ref_count);
-	int r = amdgpu_bo_va_op(range->bo->bo, range->bo_offset, range->size,
-	                        range->offset + bo->va, 0, AMDGPU_VA_OP_MAP);
+	int r = radv_amdgpu_bo_va_op(bo->ws->dev, range->bo->bo, range->bo_offset, range->size,
+				     range->offset + bo->va, 0, AMDGPU_VA_OP_MAP);
 	if (r)
 		abort();
 }
@@ -64,8 +81,8 @@
 	if (!range->bo)
 		return; /* TODO: PRT mapping */
 
-	int r = amdgpu_bo_va_op(range->bo->bo, range->bo_offset, range->size,
-	                        range->offset + bo->va, 0, AMDGPU_VA_OP_UNMAP);
+	int r = radv_amdgpu_bo_va_op(bo->ws->dev, range->bo->bo, range->bo_offset, range->size,
+				     range->offset + bo->va, 0, AMDGPU_VA_OP_UNMAP);
 	if (r)
 		abort();
 	radv_amdgpu_winsys_bo_destroy((struct radeon_winsys_bo *)range->bo);
@@ -149,6 +166,7 @@
 	if (parent->ranges[first].bo == bo && (!bo || offset - bo_offset == parent->ranges[first].offset - parent->ranges[first].bo_offset)) {
 		size += offset - parent->ranges[first].offset;
 		offset = parent->ranges[first].offset;
+		bo_offset = parent->ranges[first].bo_offset;
 		remove_first = true;
 	}
 
@@ -234,7 +252,7 @@
 			bo->ws->num_buffers--;
 			pthread_mutex_unlock(&bo->ws->global_bo_list_lock);
 		}
-		amdgpu_bo_va_op(bo->bo, 0, bo->size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
+		radv_amdgpu_bo_va_op(bo->ws->dev, bo->bo, 0, bo->size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
 		amdgpu_bo_free(bo->bo);
 	}
 	amdgpu_va_range_free(bo->va_handle);
@@ -322,7 +340,11 @@
 		goto error_bo_alloc;
 	}
 
-	r = amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP);
+
+	uint32_t va_flags = 0;
+	if ((flags & RADEON_FLAG_VA_UNCACHED) && ws->info.chip_class >= GFX9)
+		va_flags |= AMDGPU_VM_MTYPE_UC;
+	r = radv_amdgpu_bo_va_op(ws->dev, buf_handle, 0, size, va, va_flags, AMDGPU_VA_OP_MAP);
 	if (r)
 		goto error_va_map;
 
@@ -398,7 +420,7 @@
 	if (r)
 		goto error_query;
 
-	r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
+	r = radv_amdgpu_bo_va_op(ws->dev, result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
 	if (r)
 		goto error_va_map;
 
@@ -467,25 +489,29 @@
 	struct amdgpu_bo_metadata metadata = {0};
 	uint32_t tiling_flags = 0;
 
-	if (md->macrotile == RADEON_LAYOUT_TILED)
-		tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
-	else if (md->microtile == RADEON_LAYOUT_TILED)
-		tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
-	else
-		tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
+	if (bo->ws->info.chip_class >= GFX9) {
+		tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode);
+	} else {
+		if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED)
+			tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
+		else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED)
+			tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
+		else
+			tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
 
-	tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->pipe_config);
-	tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->bankw));
-	tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->bankh));
-	if (md->tile_split)
-		tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, radv_eg_tile_split_rev(md->tile_split));
-	tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->mtilea));
-	tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->num_banks)-1);
+		tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config);
+		tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw));
+		tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh));
+		if (md->u.legacy.tile_split)
+			tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, radv_eg_tile_split_rev(md->u.legacy.tile_split));
+		tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea));
+		tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks)-1);
 
-	if (md->scanout)
-		tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
-	else
-		tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
+		if (md->u.legacy.scanout)
+			tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
+		else
+			tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
+	}
 
 	metadata.tiling_info = tiling_flags;
 	metadata.size_metadata = md->size_metadata;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index ca7d647..0d89b95 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -89,26 +89,35 @@
 	}
 }
 
+static int radv_amdgpu_signal_sems(struct radv_amdgpu_ctx *ctx,
+				   uint32_t ip_type,
+				   uint32_t ring,
+				   struct radv_winsys_sem_info *sem_info);
+static int radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
+				 struct amdgpu_cs_request *request,
+				 struct radv_winsys_sem_info *sem_info);
+
 static void radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx,
-					 struct amdgpu_cs_fence *fence,
+					 struct radv_amdgpu_fence *fence,
 					 struct amdgpu_cs_request *req)
 {
-	fence->context = ctx->ctx;
-	fence->ip_type = req->ip_type;
-	fence->ip_instance = req->ip_instance;
-	fence->ring = req->ring;
-	fence->fence = req->seq_no;
+	fence->fence.context = ctx->ctx;
+	fence->fence.ip_type = req->ip_type;
+	fence->fence.ip_instance = req->ip_instance;
+	fence->fence.ring = req->ring;
+	fence->fence.fence = req->seq_no;
+	fence->user_ptr = (volatile uint64_t*)(ctx->fence_map + (req->ip_type * MAX_RINGS_PER_TYPE + req->ring) * sizeof(uint64_t));
 }
 
 static struct radeon_winsys_fence *radv_amdgpu_create_fence()
 {
-	struct radv_amdgpu_cs_fence *fence = calloc(1, sizeof(struct amdgpu_cs_fence));
+	struct radv_amdgpu_fence *fence = calloc(1, sizeof(struct radv_amdgpu_fence)); /* zeroed, so user_ptr starts out unset */
 	return (struct radeon_winsys_fence*)fence;
 }
 
 static void radv_amdgpu_destroy_fence(struct radeon_winsys_fence *_fence)
 {
-	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
+	struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
 	free(fence);
 }
 
@@ -117,16 +126,23 @@
 			      bool absolute,
 			      uint64_t timeout)
 {
-	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
+	struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
 	unsigned flags = absolute ? AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE : 0;
 	int r;
 	uint32_t expired = 0;
 
+	if (fence->user_ptr) {
+		if (*fence->user_ptr >= fence->fence.fence)
+			return true;
+		if (!absolute && !timeout)
+			return false;
+	}
+
 	/* Now use the libdrm query. */
-	r = amdgpu_cs_query_fence_status(fence,
-					 timeout,
-					 flags,
-					 &expired);
+	r = amdgpu_cs_query_fence_status(&fence->fence,
+	                                 timeout,
+	                                 flags,
+	                                 &expired);
 
 	if (r) {
 		fprintf(stderr, "amdgpu: radv_amdgpu_cs_query_fence_status failed.\n");
@@ -619,6 +635,16 @@
 	return r;
 }
 
+static struct amdgpu_cs_fence_info radv_set_cs_fence(struct radv_amdgpu_ctx *ctx, int ip_type, int ring)
+{
+	struct amdgpu_cs_fence_info info = {0}; /* returned all-zero when ctx has no fence map */
+	if (!ctx->fence_map)
+		return info;
+	info.handle = radv_amdgpu_winsys_bo(ctx->fence_bo)->bo;
+	info.offset = (ip_type * MAX_RINGS_PER_TYPE + ring) * sizeof(uint64_t); /* one uint64_t slot per (ip_type, ring) */
+	return info;
+}
+
 static void radv_assign_last_submit(struct radv_amdgpu_ctx *ctx,
 				    struct amdgpu_cs_request *request)
 {
@@ -629,6 +655,7 @@
 
 static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
 						int queue_idx,
+						struct radv_winsys_sem_info *sem_info,
 						struct radeon_winsys_cs **cs_array,
 						unsigned cs_count,
 						struct radeon_winsys_cs *initial_preamble_cs,
@@ -637,7 +664,7 @@
 {
 	int r;
 	struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
-	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
+	struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
 	struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
 	amdgpu_bo_list_handle bo_list;
 	struct amdgpu_cs_request request = {0};
@@ -676,6 +703,7 @@
 	request.number_of_ibs = 1;
 	request.ibs = &cs0->ib;
 	request.resources = bo_list;
+	request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
 
 	if (initial_preamble_cs) {
 		request.ibs = ibs;
@@ -684,7 +712,7 @@
 		ibs[0] = ((struct radv_amdgpu_cs*)initial_preamble_cs)->ib;
 	}
 
-	r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1);
+	r = radv_amdgpu_cs_submit(ctx, &request, sem_info);
 	if (r) {
 		if (r == -ENOMEM)
 			fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
@@ -705,6 +733,7 @@
 
 static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
 						 int queue_idx,
+						 struct radv_winsys_sem_info *sem_info,
 						 struct radeon_winsys_cs **cs_array,
 						 unsigned cs_count,
 						 struct radeon_winsys_cs *initial_preamble_cs,
@@ -713,10 +742,10 @@
 {
 	int r;
 	struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
-	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
+	struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
 	amdgpu_bo_list_handle bo_list;
 	struct amdgpu_cs_request request;
-
+	bool emit_signal_sem = sem_info->cs_emit_signal;
 	assert(cs_count);
 
 	for (unsigned i = 0; i < cs_count;) {
@@ -740,6 +769,7 @@
 		request.resources = bo_list;
 		request.number_of_ibs = cnt + !!preamble_cs;
 		request.ibs = ibs;
+		request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
 
 		if (preamble_cs) {
 			ibs[0] = radv_amdgpu_cs(preamble_cs)->ib;
@@ -755,7 +785,8 @@
 			}
 		}
 
-		r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1);
+		sem_info->cs_emit_signal = (i == cs_count - cnt) ? emit_signal_sem : false;
+		r = radv_amdgpu_cs_submit(ctx, &request, sem_info);
 		if (r) {
 			if (r == -ENOMEM)
 				fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
@@ -781,6 +812,7 @@
 
 static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
 					       int queue_idx,
+					       struct radv_winsys_sem_info *sem_info,
 					       struct radeon_winsys_cs **cs_array,
 					       unsigned cs_count,
 					       struct radeon_winsys_cs *initial_preamble_cs,
@@ -789,14 +821,15 @@
 {
 	int r;
 	struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
-	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
+	struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
 	struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
 	struct radeon_winsys *ws = (struct radeon_winsys*)cs0->ws;
 	amdgpu_bo_list_handle bo_list;
 	struct amdgpu_cs_request request;
 	uint32_t pad_word = 0xffff1000U;
+	bool emit_signal_sem = sem_info->cs_emit_signal;
 
-	if (radv_amdgpu_winsys(ws)->family == FAMILY_SI)
+	if (radv_amdgpu_winsys(ws)->info.chip_class == SI)
 		pad_word = 0x80000000;
 
 	assert(cs_count);
@@ -858,8 +891,10 @@
 		request.resources = bo_list;
 		request.number_of_ibs = 1;
 		request.ibs = &ib;
+		request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
 
-		r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1);
+		sem_info->cs_emit_signal = (i == cs_count - cnt) ? emit_signal_sem : false;
+		r = radv_amdgpu_cs_submit(ctx, &request, sem_info);
 		if (r) {
 			if (r == -ENOMEM)
 				fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
@@ -890,39 +925,27 @@
 					unsigned cs_count,
 					struct radeon_winsys_cs *initial_preamble_cs,
 					struct radeon_winsys_cs *continue_preamble_cs,
-					struct radeon_winsys_sem **wait_sem,
-					unsigned wait_sem_count,
-					struct radeon_winsys_sem **signal_sem,
-					unsigned signal_sem_count,
+					struct radv_winsys_sem_info *sem_info,
 					bool can_patch,
 					struct radeon_winsys_fence *_fence)
 {
 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[0]);
 	struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
 	int ret;
-	int i;
-	
-	for (i = 0; i < wait_sem_count; i++) {
-		amdgpu_semaphore_handle sem = (amdgpu_semaphore_handle)wait_sem[i];
-		amdgpu_cs_wait_semaphore(ctx->ctx, cs->hw_ip, 0, queue_idx,
-					 sem);
-	}
+
+	assert(sem_info);
 	if (!cs->ws->use_ib_bos) {
-		ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, cs_array,
+		ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, cs_array,
 							   cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
-	} else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && false) {
-		ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, cs_array,
+	} else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && cs->ws->batchchain) {
+		ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, cs_array,
 							    cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
 	} else {
-		ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, cs_array,
+		ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, cs_array,
 							     cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
 	}
 
-	for (i = 0; i < signal_sem_count; i++) {
-		amdgpu_semaphore_handle sem = (amdgpu_semaphore_handle)signal_sem[i];
-		amdgpu_cs_signal_semaphore(ctx->ctx, cs->hw_ip, 0, queue_idx,
-					   sem);
-	}
+	radv_amdgpu_signal_sems(ctx, cs->hw_ip, queue_idx, sem_info);
 	return ret;
 }
 
@@ -978,6 +1001,15 @@
 		goto error_create;
 	}
 	ctx->ws = ws;
+
+	assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * sizeof(uint64_t) <= 4096);
+	ctx->fence_bo = ws->base.buffer_create(&ws->base, 4096, 8,
+	                                      RADEON_DOMAIN_GTT,
+	                                      RADEON_FLAG_CPU_ACCESS);
+	if (ctx->fence_bo)
+		ctx->fence_map = (uint64_t*)ws->base.buffer_map(ctx->fence_bo);
+	if (ctx->fence_map)
+		memset(ctx->fence_map, 0, 4096);
 	return (struct radeon_winsys_ctx *)ctx;
 error_create:
 	FREE(ctx);
@@ -987,6 +1019,7 @@
 static void radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
 {
 	struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
+	if (ctx->fence_bo) ctx->ws->base.buffer_destroy(ctx->fence_bo); /* fence_bo stays NULL when its creation failed; nothing to destroy then */
 	amdgpu_cs_ctx_free(ctx->ctx);
 	FREE(ctx);
 }
@@ -997,9 +1030,9 @@
 	struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
 	int ip_type = ring_to_hw_ip(ring_type);
 
-	if (ctx->last_submission[ip_type][ring_index].fence) {
+	if (ctx->last_submission[ip_type][ring_index].fence.fence) {
 		uint32_t expired;
-		int ret = amdgpu_cs_query_fence_status(&ctx->last_submission[ip_type][ring_index],
+		int ret = amdgpu_cs_query_fence_status(&ctx->last_submission[ip_type][ring_index].fence,
 		                                       1000000000ull, 0, &expired);
 
 		if (ret || !expired)
@@ -1011,19 +1044,202 @@
 
 static struct radeon_winsys_sem *radv_amdgpu_create_sem(struct radeon_winsys *_ws)
 {
-	int ret;
-	amdgpu_semaphore_handle sem;
-
-	ret = amdgpu_cs_create_semaphore(&sem);
-	if (ret)
+	struct amdgpu_cs_fence *sem = CALLOC_STRUCT(amdgpu_cs_fence);
+	if (!sem)
 		return NULL;
+
 	return (struct radeon_winsys_sem *)sem;
 }
 
 static void radv_amdgpu_destroy_sem(struct radeon_winsys_sem *_sem)
 {
-	amdgpu_semaphore_handle sem = (amdgpu_semaphore_handle)_sem;
-	amdgpu_cs_destroy_semaphore(sem);
+	struct amdgpu_cs_fence *sem = (struct amdgpu_cs_fence *)_sem;
+	FREE(sem);
+}
+
+static int radv_amdgpu_signal_sems(struct radv_amdgpu_ctx *ctx,
+				   uint32_t ip_type,
+				   uint32_t ring,
+				   struct radv_winsys_sem_info *sem_info)
+{
+	/* Copy the ring's last-submission fence into every signal semaphore; a semaphore whose context is already set yields -EINVAL. */
+	for (unsigned idx = 0; idx < sem_info->signal.sem_count; ++idx) {
+		struct amdgpu_cs_fence *target = (struct amdgpu_cs_fence *)sem_info->signal.sem[idx];
+
+		if (target->context != NULL)
+			return -EINVAL;
+		*target = ctx->last_submission[ip_type][ring].fence;
+	}
+	return 0;
+}
+
+static struct drm_amdgpu_cs_chunk_sem *radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts,
+									  struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
+{
+	/* Heap array with one entry per syncobj handle; handed back to the caller, NULL when malloc fails. */
+	struct drm_amdgpu_cs_chunk_sem *entries = malloc(sizeof(*entries) * counts->syncobj_count);
+	if (entries == NULL)
+		return NULL;
+
+	for (unsigned idx = 0; idx < counts->syncobj_count; ++idx)
+		entries[idx].handle = counts->syncobj[idx];
+
+	/* Describe the array in *chunk; its length is expressed in 4-byte units. */
+	chunk->chunk_id = chunk_id;
+	chunk->length_dw = sizeof(*entries) / 4 * counts->syncobj_count;
+	chunk->chunk_data = (uint64_t)(uintptr_t)entries;
+	return entries;
+}
+
+static int radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
+				 struct amdgpu_cs_request *request,
+				 struct radv_winsys_sem_info *sem_info)
+{
+	int r;
+	int num_chunks;
+	int size;
+	bool user_fence;
+	struct drm_amdgpu_cs_chunk *chunks;
+	struct drm_amdgpu_cs_chunk_data *chunk_data;
+	struct drm_amdgpu_cs_chunk_dep *sem_dependencies = NULL;
+	struct drm_amdgpu_cs_chunk_sem *wait_syncobj = NULL, *signal_syncobj = NULL;
+	int i;
+	struct amdgpu_cs_fence *sem;
+	/* Builds the chunk list for 'request' (IBs, optional user fence, optional wait/signal chunks from 'sem_info') and submits it with amdgpu_cs_submit_raw(); returns that call's result, or -ENOMEM if a temporary array cannot be allocated. */
+	user_fence = (request->fence_info.handle != NULL);
+	size = request->number_of_ibs + (user_fence ? 2 : 1) + 3; /* IBs + optional fence + up to three semaphore chunks below; one slot stays spare */
+
+	chunks = alloca(sizeof(struct drm_amdgpu_cs_chunk) * size);
+
+	size = request->number_of_ibs + (user_fence ? 1 : 0); /* only IB and fence chunks take their payload from chunk_data */
+
+	chunk_data = alloca(sizeof(struct drm_amdgpu_cs_chunk_data) * size);
+	/* One IB chunk per entry of request->ibs, in order. */
+	num_chunks = request->number_of_ibs;
+	for (i = 0; i < request->number_of_ibs; i++) {
+		struct amdgpu_cs_ib_info *ib;
+		chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
+		chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
+		chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
+
+		ib = &request->ibs[i];
+
+		chunk_data[i].ib_data._pad = 0;
+		chunk_data[i].ib_data.va_start = ib->ib_mc_address;
+		chunk_data[i].ib_data.ib_bytes = ib->size * 4; /* presumably ib->size counts dwords - TODO confirm */
+		chunk_data[i].ib_data.ip_type = request->ip_type;
+		chunk_data[i].ib_data.ip_instance = request->ip_instance;
+		chunk_data[i].ib_data.ring = request->ring;
+		chunk_data[i].ib_data.flags = ib->flags;
+	}
+	/* Optional user-fence chunk, present only when the request carries a fence handle. */
+	if (user_fence) {
+		i = num_chunks++;
+
+		chunks[i].chunk_id = AMDGPU_CHUNK_ID_FENCE;
+		chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
+		chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
+
+		amdgpu_cs_chunk_fence_info_to_data(&request->fence_info,
+						   &chunk_data[i]);
+	}
+	/* Wait syncobjs: emitted only while cs_emit_wait is still set. */
+	if (sem_info->wait.syncobj_count && sem_info->cs_emit_wait) {
+		wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait,
+								  &chunks[num_chunks],
+								  AMDGPU_CHUNK_ID_SYNCOBJ_IN);
+		if (!wait_syncobj) {
+			r = -ENOMEM;
+			goto error_out;
+		}
+		num_chunks++;
+		/* With no fence-based wait semaphores the block below is skipped, so the flag is cleared here instead. */
+		if (sem_info->wait.sem_count == 0)
+			sem_info->cs_emit_wait = false;
+
+	}
+	/* Fence-based wait semaphores become a single dependencies chunk. */
+	if (sem_info->wait.sem_count && sem_info->cs_emit_wait) {
+		sem_dependencies = malloc(sizeof(struct drm_amdgpu_cs_chunk_dep) * sem_info->wait.sem_count);
+		if (!sem_dependencies) {
+			r = -ENOMEM;
+			goto error_out;
+		}
+		int sem_count = 0; /* number of semaphores that actually carried a fence */
+		for (unsigned j = 0; j < sem_info->wait.sem_count; j++) {
+			sem = (struct amdgpu_cs_fence *)sem_info->wait.sem[j];
+			if (!sem->context) /* no fence stored in this semaphore: nothing to wait on */
+				continue;
+			struct drm_amdgpu_cs_chunk_dep *dep = &sem_dependencies[sem_count++];
+
+			amdgpu_cs_chunk_fence_to_dep(sem, dep);
+
+			sem->context = NULL; /* consumed: radv_amdgpu_signal_sems() only accepts semaphores with a NULL context */
+		}
+		i = num_chunks++;
+
+		/* dependencies chunk; it is appended even when sem_count ended up 0 */
+		chunks[i].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
+		chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_dep) / 4 * sem_count;
+		chunks[i].chunk_data = (uint64_t)(uintptr_t)sem_dependencies;
+
+		sem_info->cs_emit_wait = false; /* a later call with the same sem_info skips both wait blocks */
+	}
+	/* Signal syncobjs: emitted only when the caller left cs_emit_signal set for this submission. */
+	if (sem_info->signal.syncobj_count && sem_info->cs_emit_signal) {
+		signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal,
+								    &chunks[num_chunks],
+								    AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
+		if (!signal_syncobj) {
+			r = -ENOMEM;
+			goto error_out;
+		}
+		num_chunks++;
+	}
+
+	r = amdgpu_cs_submit_raw(ctx->ws->dev,
+				 ctx->ctx,
+				 request->resources,
+				 num_chunks,
+				 chunks,
+				 &request->seq_no);
+error_out: /* reached on success too; pointers that were never allocated are still NULL here */
+	free(sem_dependencies);
+	free(wait_syncobj);
+	free(signal_syncobj);
+	return r;
+}
+
+static int radv_amdgpu_create_syncobj(struct radeon_winsys *_ws,
+				      uint32_t *handle)
+{
+	/* Thin wrapper: passes the winsys device and the caller's handle pointer to libdrm and returns its result. */
+	return amdgpu_cs_create_syncobj(radv_amdgpu_winsys(_ws)->dev, handle);
+}
+
+static void radv_amdgpu_destroy_syncobj(struct radeon_winsys *_ws,
+				    uint32_t handle)
+{
+	/* Thin wrapper around amdgpu_cs_destroy_syncobj() on the winsys device; nothing is reported back to the caller. */
+	amdgpu_cs_destroy_syncobj(radv_amdgpu_winsys(_ws)->dev, handle);
+}
+
+static int radv_amdgpu_export_syncobj(struct radeon_winsys *_ws,
+				      uint32_t syncobj,
+				      int *fd)
+{
+	/* Thin wrapper around amdgpu_cs_export_syncobj() on the winsys device;
+	 * the libdrm result is returned unchanged. */
+	return amdgpu_cs_export_syncobj(radv_amdgpu_winsys(_ws)->dev, syncobj, fd);
+}
+
+static int radv_amdgpu_import_syncobj(struct radeon_winsys *_ws,
+				      int fd,
+				      uint32_t *syncobj)
+{
+	/* Thin wrapper around amdgpu_cs_import_syncobj() on the winsys device;
+	 * the libdrm result is returned unchanged. */
+	return amdgpu_cs_import_syncobj(radv_amdgpu_winsys(_ws)->dev, fd, syncobj);
 }
 
 void radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
@@ -1044,5 +1260,9 @@
 	ws->base.destroy_fence = radv_amdgpu_destroy_fence;
 	ws->base.create_sem = radv_amdgpu_create_sem;
 	ws->base.destroy_sem = radv_amdgpu_destroy_sem;
+	ws->base.create_syncobj = radv_amdgpu_create_syncobj;
+	ws->base.destroy_syncobj = radv_amdgpu_destroy_syncobj;
+	ws->base.export_syncobj = radv_amdgpu_export_syncobj;
+	ws->base.import_syncobj = radv_amdgpu_import_syncobj;
 	ws->base.fence_wait = radv_amdgpu_fence_wait;
 }
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h
index fc6a2c8..42d89ee 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h
@@ -42,10 +42,19 @@
 	MAX_RINGS_PER_TYPE = 8
 };
 
+
+struct radv_amdgpu_fence { /* libdrm fence plus an optional pointer to a CPU-readable fence value */
+	struct amdgpu_cs_fence fence; /* passed to amdgpu_cs_query_fence_status() by the fence-wait path */
+	volatile uint64_t *user_ptr; /* when non-NULL, the fence-wait path compares *user_ptr with fence.fence before querying libdrm */
+};
+
 struct radv_amdgpu_ctx {
 	struct radv_amdgpu_winsys *ws;
 	amdgpu_context_handle ctx;
-	struct amdgpu_cs_fence last_submission[AMDGPU_HW_IP_DMA + 1][MAX_RINGS_PER_TYPE];
+	struct radv_amdgpu_fence last_submission[AMDGPU_HW_IP_DMA + 1][MAX_RINGS_PER_TYPE];
+
+	struct radeon_winsys_bo *fence_bo;
+	uint64_t *fence_map;
 };
 
 static inline struct radv_amdgpu_ctx *
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c
index 511f464..eaa978e 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c
@@ -35,63 +35,39 @@
 #include "radv_amdgpu_surface.h"
 #include "sid.h"
 
-#ifndef NO_ENTRIES
-#define NO_ENTRIES 32
-#endif
+#include "ac_surface.h"
 
-#ifndef NO_MACRO_ENTRIES
-#define NO_MACRO_ENTRIES 16
-#endif
-
-#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND
-#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A
-#endif
-
-static int radv_amdgpu_surface_sanity(const struct radeon_surf *surf)
+static int radv_amdgpu_surface_sanity(const struct ac_surf_info *surf_info,
+				      const struct radeon_surf *surf)
 {
 	unsigned type = RADEON_SURF_GET(surf->flags, TYPE);
 
 	if (!(surf->flags & RADEON_SURF_HAS_TILE_MODE_INDEX))
 		return -EINVAL;
 
-	/* all dimension must be at least 1 ! */
-	if (!surf->npix_x || !surf->npix_y || !surf->npix_z ||
-	    !surf->array_size)
+	if (!surf->blk_w || !surf->blk_h)
 		return -EINVAL;
 
-	if (!surf->blk_w || !surf->blk_h || !surf->blk_d)
-		return -EINVAL;
-
-	switch (surf->nsamples) {
-	case 1:
-	case 2:
-	case 4:
-	case 8:
-		break;
-	default:
-		return -EINVAL;
-	}
-
 	switch (type) {
 	case RADEON_SURF_TYPE_1D:
-		if (surf->npix_y > 1)
+		if (surf_info->height > 1)
 			return -EINVAL;
 		/* fall through */
 	case RADEON_SURF_TYPE_2D:
 	case RADEON_SURF_TYPE_CUBEMAP:
-		if (surf->npix_z > 1 || surf->array_size > 1)
+		if (surf_info->depth > 1 || surf_info->array_size > 1)
 			return -EINVAL;
 		break;
 	case RADEON_SURF_TYPE_3D:
-		if (surf->array_size > 1)
+		if (surf_info->array_size > 1)
 			return -EINVAL;
 		break;
 	case RADEON_SURF_TYPE_1D_ARRAY:
-		if (surf->npix_y > 1)
+		if (surf_info->height > 1)
 			return -EINVAL;
 		/* fall through */
 	case RADEON_SURF_TYPE_2D_ARRAY:
-		if (surf->npix_z > 1)
+		if (surf_info->depth > 1)
 			return -EINVAL;
 		break;
 	default:
@@ -100,453 +76,28 @@
 	return 0;
 }
 
-static void *ADDR_API radv_allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput)
-{
-	return malloc(pInput->sizeInBytes);
-}
-
-static ADDR_E_RETURNCODE ADDR_API radv_freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput)
-{
-	free(pInput->pVirtAddr);
-	return ADDR_OK;
-}
-
-ADDR_HANDLE radv_amdgpu_addr_create(struct amdgpu_gpu_info *amdinfo, int family, int rev_id,
-				    enum chip_class chip_class)
-{
-	ADDR_CREATE_INPUT addrCreateInput = {0};
-	ADDR_CREATE_OUTPUT addrCreateOutput = {0};
-	ADDR_REGISTER_VALUE regValue = {0};
-	ADDR_CREATE_FLAGS createFlags = {{0}};
-	ADDR_E_RETURNCODE addrRet;
-
-	addrCreateInput.size = sizeof(ADDR_CREATE_INPUT);
-	addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT);
-
-	regValue.noOfBanks = amdinfo->mc_arb_ramcfg & 0x3;
-	regValue.gbAddrConfig = amdinfo->gb_addr_cfg;
-	regValue.noOfRanks = (amdinfo->mc_arb_ramcfg & 0x4) >> 2;
-
-	regValue.backendDisables = amdinfo->backend_disable[0];
-	regValue.pTileConfig = amdinfo->gb_tile_mode;
-	regValue.noOfEntries = ARRAY_SIZE(amdinfo->gb_tile_mode);
-	if (chip_class == SI) {
-		regValue.pMacroTileConfig = NULL;
-		regValue.noOfMacroEntries = 0;
-	} else {
-		regValue.pMacroTileConfig = amdinfo->gb_macro_tile_mode;
-		regValue.noOfMacroEntries = ARRAY_SIZE(amdinfo->gb_macro_tile_mode);
-	}
-
-	createFlags.value = 0;
-	createFlags.useTileIndex = 1;
-
-	addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
-	addrCreateInput.chipFamily = family;
-	addrCreateInput.chipRevision = rev_id;
-	addrCreateInput.createFlags = createFlags;
-	addrCreateInput.callbacks.allocSysMem = radv_allocSysMem;
-	addrCreateInput.callbacks.freeSysMem = radv_freeSysMem;
-	addrCreateInput.callbacks.debugPrint = 0;
-	addrCreateInput.regValue = regValue;
-
-	addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput);
-	if (addrRet != ADDR_OK)
-		return NULL;
-
-	return addrCreateOutput.hLib;
-}
-
-static int radv_compute_level(ADDR_HANDLE addrlib,
-                              struct radeon_surf *surf, bool is_stencil,
-                              unsigned level, unsigned type, bool compressed,
-                              ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
-                              ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
-                              ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
-                              ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut)
-{
-	struct radeon_surf_level *surf_level;
-	ADDR_E_RETURNCODE ret;
-
-	AddrSurfInfoIn->mipLevel = level;
-	AddrSurfInfoIn->width = u_minify(surf->npix_x, level);
-	AddrSurfInfoIn->height = u_minify(surf->npix_y, level);
-
-	if (type == RADEON_SURF_TYPE_3D)
-		AddrSurfInfoIn->numSlices = u_minify(surf->npix_z, level);
-	else if (type == RADEON_SURF_TYPE_CUBEMAP)
-		AddrSurfInfoIn->numSlices = 6;
-	else
-		AddrSurfInfoIn->numSlices = surf->array_size;
-
-	if (level > 0) {
-		/* Set the base level pitch. This is needed for calculation
-		 * of non-zero levels. */
-		if (is_stencil)
-			AddrSurfInfoIn->basePitch = surf->stencil_level[0].nblk_x;
-		else
-			AddrSurfInfoIn->basePitch = surf->level[0].nblk_x;
-
-		/* Convert blocks to pixels for compressed formats. */
-		if (compressed)
-			AddrSurfInfoIn->basePitch *= surf->blk_w;
-	}
-
-	ret = AddrComputeSurfaceInfo(addrlib,
-				     AddrSurfInfoIn,
-				     AddrSurfInfoOut);
-	if (ret != ADDR_OK)
-		return ret;
-
-	surf_level = is_stencil ? &surf->stencil_level[level] : &surf->level[level];
-	surf_level->offset = align64(surf->bo_size, AddrSurfInfoOut->baseAlign);
-	surf_level->slice_size = AddrSurfInfoOut->sliceSize;
-	surf_level->pitch_bytes = AddrSurfInfoOut->pitch * (is_stencil ? 1 : surf->bpe);
-	surf_level->npix_x = u_minify(surf->npix_x, level);
-	surf_level->npix_y = u_minify(surf->npix_y, level);
-	surf_level->npix_z = u_minify(surf->npix_z, level);
-	surf_level->nblk_x = AddrSurfInfoOut->pitch;
-	surf_level->nblk_y = AddrSurfInfoOut->height;
-	if (type == RADEON_SURF_TYPE_3D)
-		surf_level->nblk_z = AddrSurfInfoOut->depth;
-	else
-		surf_level->nblk_z = 1;
-
-	switch (AddrSurfInfoOut->tileMode) {
-	case ADDR_TM_LINEAR_ALIGNED:
-		surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
-		break;
-	case ADDR_TM_1D_TILED_THIN1:
-		surf_level->mode = RADEON_SURF_MODE_1D;
-		break;
-	case ADDR_TM_2D_TILED_THIN1:
-		surf_level->mode = RADEON_SURF_MODE_2D;
-		break;
-	default:
-		assert(0);
-	}
-
-	if (is_stencil)
-		surf->stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex;
-	else
-		surf->tiling_index[level] = AddrSurfInfoOut->tileIndex;
-
-	surf->bo_size = surf_level->offset + AddrSurfInfoOut->surfSize;
-
-	/* Clear DCC fields at the beginning. */
-	surf_level->dcc_offset = 0;
-	surf_level->dcc_enabled = false;
-
-	/* The previous level's flag tells us if we can use DCC for this level. */
-	if (AddrSurfInfoIn->flags.dccCompatible &&
-	    (level == 0 || AddrDccOut->subLvlCompressible)) {
-		AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize;
-		AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
-		AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
-		AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex;
-		AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
-
-		ret = AddrComputeDccInfo(addrlib,
-					 AddrDccIn,
-					 AddrDccOut);
-
-		if (ret == ADDR_OK) {
-			surf_level->dcc_offset = surf->dcc_size;
-			surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
-			surf_level->dcc_enabled = true;
-			surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
-			surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
-		}
-	}
-
-	if (!is_stencil && AddrSurfInfoIn->flags.depth &&
-	    surf_level->mode == RADEON_SURF_MODE_2D && level == 0) {
-		ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
-		ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
-		AddrHtileIn.flags.tcCompatible = AddrSurfInfoIn->flags.tcCompatible;
-		AddrHtileIn.pitch = AddrSurfInfoOut->pitch;
-		AddrHtileIn.height = AddrSurfInfoOut->height;
-		AddrHtileIn.numSlices = AddrSurfInfoOut->depth;
-		AddrHtileIn.blockWidth = ADDR_HTILE_BLOCKSIZE_8;
-		AddrHtileIn.blockHeight = ADDR_HTILE_BLOCKSIZE_8;
-		AddrHtileIn.pTileInfo = AddrSurfInfoOut->pTileInfo;
-		AddrHtileIn.tileIndex = AddrSurfInfoOut->tileIndex;
-		AddrHtileIn.macroModeIndex = AddrSurfInfoOut->macroModeIndex;
-
-		ret = AddrComputeHtileInfo(addrlib,
-		                           &AddrHtileIn,
-		                           &AddrHtileOut);
-
-		if (ret == ADDR_OK) {
-			surf->htile_size = AddrHtileOut.htileBytes;
-			surf->htile_slice_size = AddrHtileOut.sliceSize;
-			surf->htile_alignment = AddrHtileOut.baseAlign;
-		}
-	}
-	return 0;
-}
-
-static void radv_set_micro_tile_mode(struct radeon_surf *surf,
-                                     struct radeon_info *info)
-{
-	uint32_t tile_mode = info->si_tile_mode_array[surf->tiling_index[0]];
-
-	if (info->chip_class >= CIK)
-		surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode);
-	else
-		surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode);
-}
-
-static unsigned cik_get_macro_tile_index(struct radeon_surf *surf)
-{
-	unsigned index, tileb;
-
-	tileb = 8 * 8 * surf->bpe;
-	tileb = MIN2(surf->tile_split, tileb);
-
-	for (index = 0; tileb > 64; index++)
-		tileb >>= 1;
-
-	assert(index < 16);
-	return index;
-}
-
 static int radv_amdgpu_winsys_surface_init(struct radeon_winsys *_ws,
+					   const struct ac_surf_info *surf_info,
 					   struct radeon_surf *surf)
 {
 	struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
-	unsigned level, mode, type;
-	bool compressed;
-	ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
-	ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
-	ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
-	ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
-	ADDR_TILEINFO AddrTileInfoIn = {0};
-	ADDR_TILEINFO AddrTileInfoOut = {0};
+	unsigned mode, type;
 	int r;
 
-	r = radv_amdgpu_surface_sanity(surf);
+	r = radv_amdgpu_surface_sanity(surf_info, surf);
 	if (r)
 		return r;
 
-	AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
-	AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
-	AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
-	AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
-	AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
-
 	type = RADEON_SURF_GET(surf->flags, TYPE);
 	mode = RADEON_SURF_GET(surf->flags, MODE);
-	compressed = surf->blk_w == 4 && surf->blk_h == 4;
 
-	/* MSAA and FMASK require 2D tiling. */
-	if (surf->nsamples > 1 ||
-	    (surf->flags & RADEON_SURF_FMASK))
-		mode = RADEON_SURF_MODE_2D;
+	struct ac_surf_config config;
 
-	/* DB doesn't support linear layouts. */
-	if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) &&
-	    mode < RADEON_SURF_MODE_1D)
-		mode = RADEON_SURF_MODE_1D;
+	memcpy(&config.info, surf_info, sizeof(config.info));
+	config.is_3d = !!(type == RADEON_SURF_TYPE_3D);
+	config.is_cube = !!(type == RADEON_SURF_TYPE_CUBEMAP);
 
-	/* Set the requested tiling mode. */
-	switch (mode) {
-	case RADEON_SURF_MODE_LINEAR_ALIGNED:
-		AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED;
-		break;
-	case RADEON_SURF_MODE_1D:
-		AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1;
-		break;
-	case RADEON_SURF_MODE_2D:
-		AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1;
-		break;
-	default:
-		assert(0);
-	}
-
-	/* The format must be set correctly for the allocation of compressed
-	 * textures to work. In other cases, setting the bpp is sufficient. */
-	if (compressed) {
-		switch (surf->bpe) {
-		case 8:
-			AddrSurfInfoIn.format = ADDR_FMT_BC1;
-			break;
-		case 16:
-			AddrSurfInfoIn.format = ADDR_FMT_BC3;
-			break;
-		default:
-			assert(0);
-		}
-	} else {
-		AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8;
-	}
-
-	AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = surf->nsamples;
-	AddrSurfInfoIn.tileIndex = -1;
-
-	/* Set the micro tile type. */
-	if (surf->flags & RADEON_SURF_SCANOUT)
-		AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE;
-	else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
-		AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
-	else
-		AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
-
-	AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
-	AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
-	AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP;
-	AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
-	AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0;
-	AddrSurfInfoIn.flags.opt4Space = 1;
-
-	/* DCC notes:
-	 * - If we add MSAA support, keep in mind that CB can't decompress 8bpp
-	 *   with samples >= 4.
-	 * - Mipmapped array textures have low performance (discovered by a closed
-	 *   driver team).
-	 */
-	AddrSurfInfoIn.flags.dccCompatible = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
-		!(surf->flags & RADEON_SURF_DISABLE_DCC) &&
-		!compressed && AddrDccIn.numSamples <= 1 &&
-		((surf->array_size == 1 && surf->npix_z == 1) ||
-		 surf->last_level == 0);
-
-	AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0;
-	AddrSurfInfoIn.flags.compressZ = AddrSurfInfoIn.flags.depth;
-
-	/* noStencil = 0 can result in a depth part that is incompatible with
-	 * mipmapped texturing. So set noStencil = 1 when mipmaps are requested (in
-	 * this case, we may end up setting stencil_adjusted).
-	 *
-	 * TODO: update addrlib to a newer version, remove this, and
-	 * use flags.matchStencilTileCfg = 1 as an alternative fix.
-	 */
-	if (surf->last_level > 0)
-		AddrSurfInfoIn.flags.noStencil = 1;
-
-	/* Set preferred macrotile parameters. This is usually required
-	 * for shared resources. This is for 2D tiling only. */
-	if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 &&
-	    surf->bankw && surf->bankh && surf->mtilea && surf->tile_split) {
-		/* If any of these parameters are incorrect, the calculation
-		 * will fail. */
-		AddrTileInfoIn.banks = surf->num_banks;
-		AddrTileInfoIn.bankWidth = surf->bankw;
-		AddrTileInfoIn.bankHeight = surf->bankh;
-		AddrTileInfoIn.macroAspectRatio = surf->mtilea;
-		AddrTileInfoIn.tileSplitBytes = surf->tile_split;
-		AddrTileInfoIn.pipeConfig = surf->pipe_config + 1; /* +1 compared to GB_TILE_MODE */
-		AddrSurfInfoIn.flags.opt4Space = 0;
-		AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn;
-
-		/* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set
-		 * the tile index, because we are expected to know it if
-		 * we know the other parameters.
-		 *
-		 * This is something that can easily be fixed in Addrlib.
-		 * For now, just figure it out here.
-		 * Note that only 2D_TILE_THIN1 is handled here.
-		 */
-		assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
-		assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1);
-
-		if (ws->info.chip_class == SI) {
-			if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) {
-				if (surf->bpe == 2)
-					AddrSurfInfoIn.tileIndex = 11; /* 16bpp */
-				else
-					AddrSurfInfoIn.tileIndex = 12; /* 32bpp */
-			} else {
-				if (surf->bpe == 1)
-					AddrSurfInfoIn.tileIndex = 14; /* 8bpp */
-				else if (surf->bpe == 2)
-					AddrSurfInfoIn.tileIndex = 15; /* 16bpp */
-				else if (surf->bpe == 4)
-					AddrSurfInfoIn.tileIndex = 16; /* 32bpp */
-				else
-					AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */
-			}
-		} else {
-			if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
-				AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
-			else
-				AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
-			AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf);
-		}
-	}
-
-	surf->bo_size = 0;
-	surf->dcc_size = 0;
-	surf->dcc_alignment = 1;
-	surf->htile_size = surf->htile_slice_size = 0;
-	surf->htile_alignment = 1;
-
-	/* Calculate texture layout information. */
-	for (level = 0; level <= surf->last_level; level++) {
-		r = radv_compute_level(ws->addrlib, surf, false, level, type, compressed,
-				       &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
-		if (r)
-			break;
-
-		if (level == 0) {
-			surf->bo_alignment = AddrSurfInfoOut.baseAlign;
-			surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1;
-			radv_set_micro_tile_mode(surf, &ws->info);
-
-			/* For 2D modes only. */
-			if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
-				surf->bankw = AddrSurfInfoOut.pTileInfo->bankWidth;
-				surf->bankh = AddrSurfInfoOut.pTileInfo->bankHeight;
-				surf->mtilea = AddrSurfInfoOut.pTileInfo->macroAspectRatio;
-				surf->tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes;
-				surf->num_banks = AddrSurfInfoOut.pTileInfo->banks;
-				surf->macro_tile_index = AddrSurfInfoOut.macroModeIndex;
-			} else {
-				surf->macro_tile_index = 0;
-			}
-		}
-	}
-
-	/* Calculate texture layout information for stencil. */
-	if (surf->flags & RADEON_SURF_SBUFFER) {
-		AddrSurfInfoIn.bpp = 8;
-		AddrSurfInfoIn.flags.depth = 0;
-		AddrSurfInfoIn.flags.stencil = 1;
-		/* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
-		AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split;
-
-		for (level = 0; level <= surf->last_level; level++) {
-			r = radv_compute_level(ws->addrlib, surf, true, level, type, compressed,
-					       &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
-			if (r)
-				return r;
-
-			/* DB uses the depth pitch for both stencil and depth. */
-			if (surf->stencil_level[level].nblk_x != surf->level[level].nblk_x)
-				surf->stencil_adjusted = true;
-
-			if (level == 0) {
-				/* For 2D modes only. */
-				if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
-					surf->stencil_tile_split =
-						AddrSurfInfoOut.pTileInfo->tileSplitBytes;
-				}
-			}
-		}
-	}
-
-	/* Recalculate the whole DCC miptree size including disabled levels.
-	 * This is what addrlib does, but calling addrlib would be a lot more
-	 * complicated.
-	 */
-#if 0
-	if (surf->dcc_size && surf->last_level > 0) {
-		surf->dcc_size = align64(surf->bo_size >> 8,
-					 ws->info.pipe_interleave_bytes *
-					 ws->info.num_tile_pipes);
-	}
-#endif
-	return 0;
+	return ac_compute_surface(ws->addrlib, &ws->info, &config, mode, surf);
 }
 
 static int radv_amdgpu_winsys_surface_best(struct radeon_winsys *rws,
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.h
index cdc8c81..a5652a3 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.h
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.h
@@ -28,6 +28,5 @@
 #include <amdgpu.h>
 
 void radv_amdgpu_surface_init_functions(struct radv_amdgpu_winsys *ws);
-ADDR_HANDLE radv_amdgpu_addr_create(struct amdgpu_gpu_info *amdinfo, int family, int rev_id, enum chip_class chip_class);
 
 #endif /* RADV_AMDGPU_SURFACE_H */
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
index 629da31..2503489 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
@@ -29,6 +29,7 @@
 #include "radv_amdgpu_surface.h"
 #include "radv_debug.h"
 #include "amdgpu_id.h"
+#include "ac_surface.h"
 #include "xf86drm.h"
 #include <stdio.h>
 #include <stdlib.h>
@@ -39,302 +40,30 @@
 #include "radv_amdgpu_bo.h"
 #include "radv_amdgpu_surface.h"
 
-#define CIK_TILE_MODE_COLOR_2D			14
-
-#define CIK__GB_TILE_MODE__PIPE_CONFIG(x)        (((x) >> 6) & 0x1f)
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P2               0
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16          4
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16         5
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32         6
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32         7
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16    8
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16    9
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16    10
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16   11
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16   12
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32   13
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32   14
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16   16
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16  17
-
-static unsigned radv_cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
-{
-	unsigned mode2d = info->gb_tile_mode[CIK_TILE_MODE_COLOR_2D];
-
-	switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) {
-	case CIK__PIPE_CONFIG__ADDR_SURF_P2:
-		return 2;
-	case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32:
-		return 4;
-	case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32:
-		return 8;
-	case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
-	case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
-		return 16;
-	default:
-		fprintf(stderr, "Invalid CIK pipe configuration, assuming P2\n");
-		assert(!"this should never occur");
-		return 2;
-	}
-}
-
-static const char *
-get_chip_name(enum radeon_family family)
-{
-	switch (family) {
-	case CHIP_TAHITI: return "AMD RADV TAHITI";
-	case CHIP_PITCAIRN: return "AMD RADV PITCAIRN";
-	case CHIP_VERDE: return "AMD RADV CAPE VERDE";
-	case CHIP_OLAND: return "AMD RADV OLAND";
-	case CHIP_HAINAN: return "AMD RADV HAINAN";
-	case CHIP_BONAIRE: return "AMD RADV BONAIRE";
-	case CHIP_KAVERI: return "AMD RADV KAVERI";
-	case CHIP_KABINI: return "AMD RADV KABINI";
-	case CHIP_HAWAII: return "AMD RADV HAWAII";
-	case CHIP_MULLINS: return "AMD RADV MULLINS";
-	case CHIP_TONGA: return "AMD RADV TONGA";
-	case CHIP_ICELAND: return "AMD RADV ICELAND";
-	case CHIP_CARRIZO: return "AMD RADV CARRIZO";
-	case CHIP_FIJI: return "AMD RADV FIJI";
-	case CHIP_POLARIS10: return "AMD RADV POLARIS10";
-	case CHIP_POLARIS11: return "AMD RADV POLARIS11";
-	case CHIP_POLARIS12: return "AMD RADV POLARIS12";
-	case CHIP_STONEY: return "AMD RADV STONEY";
-	default: return "AMD RADV unknown";
-	}
-}
-
-
 static bool
 do_winsys_init(struct radv_amdgpu_winsys *ws, int fd)
 {
-	struct amdgpu_buffer_size_alignments alignment_info = {};
-	struct amdgpu_heap_info vram, visible_vram, gtt;
-	struct drm_amdgpu_info_hw_ip dma = {};
-	struct drm_amdgpu_info_hw_ip compute = {};
-	drmDevicePtr devinfo;
-	int r;
-	int i, j;
-	/* Get PCI info. */
-	r = drmGetDevice2(fd, 0, &devinfo);
-	if (r) {
-		fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n");
-		goto fail;
-	}
-	ws->info.pci_domain = devinfo->businfo.pci->domain;
-	ws->info.pci_bus = devinfo->businfo.pci->bus;
-	ws->info.pci_dev = devinfo->businfo.pci->dev;
-	ws->info.pci_func = devinfo->businfo.pci->func;
-	drmFreeDevice(&devinfo);
+	if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
+		return false;
 
-	/* Query hardware and driver information. */
-	r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
-		goto fail;
+	/* LLVM 5.0 is required for GFX9. */
+	if (ws->info.chip_class >= GFX9 && HAVE_LLVM < 0x0500) {
+		fprintf(stderr, "amdgpu: LLVM 5.0 is required, got LLVM %i.%i\n",
+			HAVE_LLVM >> 8, HAVE_LLVM & 255);
+		return false;
 	}
 
-	r = amdgpu_query_buffer_size_alignment(ws->dev, &alignment_info);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
-		goto fail;
-	}
-
-	r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
-		goto fail;
-	}
-
-	r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM,
-	                           AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, &visible_vram);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_heap_info(visible_vram) failed.\n");
-		goto fail;
-	}
-
-	r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
-		goto fail;
-	}
-
-	r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_DMA, 0, &dma);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
-		goto fail;
-	}
-
-	r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_COMPUTE, 0, &compute);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(compute) failed.\n");
-		goto fail;
-	}
-	ws->info.pci_id = ws->amdinfo.asic_id; /* TODO: is this correct? */
-	ws->info.vce_harvest_config = ws->amdinfo.vce_harvest_config;
-
-	switch (ws->info.pci_id) {
-#define CHIPSET(pci_id, name, cfamily) case pci_id: ws->info.family = CHIP_##cfamily; break;
-#include "pci_ids/radeonsi_pci_ids.h"
-#undef CHIPSET
-	default:
-		fprintf(stderr, "amdgpu: Invalid PCI ID.\n");
-		goto fail;
-	}
-
-	if (ws->info.family >= CHIP_TONGA)
-		ws->info.chip_class = VI;
-	else if (ws->info.family >= CHIP_BONAIRE)
-		ws->info.chip_class = CIK;
-	else if (ws->info.family >= CHIP_TAHITI)
-		ws->info.chip_class = SI;
-	else {
-		fprintf(stderr, "amdgpu: Unknown family.\n");
-		goto fail;
-	}
-
-	/* family and rev_id are for addrlib */
-	switch (ws->info.family) {
-	case CHIP_TAHITI:
-		ws->family = FAMILY_SI;
-		ws->rev_id = SI_TAHITI_P_A0;
-		break;
-	case CHIP_PITCAIRN:
-		ws->family = FAMILY_SI;
-		ws->rev_id = SI_PITCAIRN_PM_A0;
-	  break;
-	case CHIP_VERDE:
-		ws->family = FAMILY_SI;
-		ws->rev_id = SI_CAPEVERDE_M_A0;
-		break;
-	case CHIP_OLAND:
-		ws->family = FAMILY_SI;
-		ws->rev_id = SI_OLAND_M_A0;
-		break;
-	case CHIP_HAINAN:
-		ws->family = FAMILY_SI;
-		ws->rev_id = SI_HAINAN_V_A0;
-		break;
-	case CHIP_BONAIRE:
-		ws->family = FAMILY_CI;
-		ws->rev_id = CI_BONAIRE_M_A0;
-		break;
-	case CHIP_KAVERI:
-		ws->family = FAMILY_KV;
-		ws->rev_id = KV_SPECTRE_A0;
-		break;
-	case CHIP_KABINI:
-		ws->family = FAMILY_KV;
-		ws->rev_id = KB_KALINDI_A0;
-		break;
-	case CHIP_HAWAII:
-		ws->family = FAMILY_CI;
-		ws->rev_id = CI_HAWAII_P_A0;
-		break;
-	case CHIP_MULLINS:
-		ws->family = FAMILY_KV;
-		ws->rev_id = ML_GODAVARI_A0;
-		break;
-	case CHIP_TONGA:
-		ws->family = FAMILY_VI;
-		ws->rev_id = VI_TONGA_P_A0;
-		break;
-	case CHIP_ICELAND:
-		ws->family = FAMILY_VI;
-		ws->rev_id = VI_ICELAND_M_A0;
-		break;
-	case CHIP_CARRIZO:
-		ws->family = FAMILY_CZ;
-		ws->rev_id = CARRIZO_A0;
-		break;
-	case CHIP_STONEY:
-		ws->family = FAMILY_CZ;
-		ws->rev_id = STONEY_A0;
-		break;
-	case CHIP_FIJI:
-		ws->family = FAMILY_VI;
-		ws->rev_id = VI_FIJI_P_A0;
-		break;
-	case CHIP_POLARIS10:
-		ws->family = FAMILY_VI;
-		ws->rev_id = VI_POLARIS10_P_A0;
-		break;
-	case CHIP_POLARIS11:
-		ws->family = FAMILY_VI;
-		ws->rev_id = VI_POLARIS11_M_A0;
-		break;
-	case CHIP_POLARIS12:
-		ws->family = FAMILY_VI;
-		ws->rev_id = VI_POLARIS12_V_A0;
-		break;
-	default:
-		fprintf(stderr, "amdgpu: Unknown family.\n");
-		goto fail;
-	}
-
-	ws->addrlib = radv_amdgpu_addr_create(&ws->amdinfo, ws->family, ws->rev_id, ws->info.chip_class);
+	ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment);
 	if (!ws->addrlib) {
 		fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
-		goto fail;
+		return false;
 	}
 
-	assert(util_is_power_of_two(dma.available_rings + 1));
-	assert(util_is_power_of_two(compute.available_rings + 1));
+	ws->info.num_sdma_rings = MIN2(ws->info.num_sdma_rings, MAX_RINGS_PER_TYPE);
+	ws->info.num_compute_rings = MIN2(ws->info.num_compute_rings, MAX_RINGS_PER_TYPE);
 
-	/* Set hardware information. */
-	ws->info.name = get_chip_name(ws->info.family);
-	ws->info.gart_size = gtt.heap_size;
-	ws->info.vram_size = vram.heap_size;
-	ws->info.visible_vram_size = visible_vram.heap_size;
-	/* convert the shader clock from KHz to MHz */
-	ws->info.max_shader_clock = ws->amdinfo.max_engine_clk / 1000;
-	ws->info.max_se = ws->amdinfo.num_shader_engines;
-	ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine;
-	ws->info.has_uvd = 0;
-	ws->info.vce_fw_version = 0;
-	ws->info.has_userptr = TRUE;
-	ws->info.num_render_backends = ws->amdinfo.rb_pipes;
-	ws->info.clock_crystal_freq = ws->amdinfo.gpu_counter_freq;
-	ws->info.num_tile_pipes = radv_cik_get_num_tile_pipes(&ws->amdinfo);
-	ws->info.pipe_interleave_bytes = 256 << ((ws->amdinfo.gb_addr_cfg >> 4) & 0x7);
-	ws->info.has_virtual_memory = TRUE;
-	ws->info.sdma_rings = MIN2(util_bitcount(dma.available_rings),
-	                           MAX_RINGS_PER_TYPE);
-	ws->info.compute_rings = MIN2(util_bitcount(compute.available_rings),
-	                              MAX_RINGS_PER_TYPE);
-
-	/* Get the number of good compute units. */
-	ws->info.num_good_compute_units = 0;
-	for (i = 0; i < ws->info.max_se; i++)
-		for (j = 0; j < ws->info.max_sh_per_se; j++)
-			ws->info.num_good_compute_units +=
-				util_bitcount(ws->amdinfo.cu_bitmap[i][j]);
-
-	memcpy(ws->info.si_tile_mode_array, ws->amdinfo.gb_tile_mode,
-	       sizeof(ws->amdinfo.gb_tile_mode));
-	ws->info.enabled_rb_mask = ws->amdinfo.enabled_rb_pipes_mask;
-
-	memcpy(ws->info.cik_macrotile_mode_array, ws->amdinfo.gb_macro_tile_mode,
-	       sizeof(ws->amdinfo.gb_macro_tile_mode));
-
-	ws->info.gart_page_size = alignment_info.size_remote;
-
-	if (ws->info.chip_class == SI)
-		ws->info.gfx_ib_pad_with_type2 = TRUE;
-
-	ws->use_ib_bos = ws->family >= FAMILY_CI;
+	ws->use_ib_bos = ws->info.chip_class >= CIK;
 	return true;
-fail:
-	return false;
 }
 
 static void radv_amdgpu_winsys_query_info(struct radeon_winsys *rws,
@@ -353,7 +82,7 @@
 }
 
 struct radeon_winsys *
-radv_amdgpu_winsys_create(int fd, uint32_t debug_flags)
+radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags)
 {
 	uint32_t drm_major, drm_minor, r;
 	amdgpu_device_handle dev;
@@ -377,6 +106,7 @@
 	if (debug_flags & RADV_DEBUG_NO_IBS)
 		ws->use_ib_bos = false;
 
+	ws->batchchain = !!(perftest_flags & RADV_PERFTEST_BATCHCHAIN);
 	LIST_INITHEAD(&ws->global_bo_list);
 	pthread_mutex_init(&ws->global_bo_list_lock, NULL);
 	ws->base.query_info = radv_amdgpu_winsys_query_info;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
index abb238b..426cf69 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
@@ -29,6 +29,7 @@
 #define RADV_AMDGPU_WINSYS_H
 
 #include "radv_radeon_winsys.h"
+#include "ac_gpu_info.h"
 #include "addrlib/addrinterface.h"
 #include <amdgpu.h>
 #include "util/list.h"
@@ -41,10 +42,8 @@
 	struct amdgpu_gpu_info amdinfo;
 	ADDR_HANDLE addrlib;
 
-	uint32_t rev_id;
-	unsigned family;
-
 	bool debug_all_bos;
+	bool batchchain;
 	pthread_mutex_t global_bo_list_lock;
 	struct list_head global_bo_list;
 	unsigned num_buffers;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h
index d5d0ff5..854e216 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h
@@ -29,6 +29,7 @@
 #ifndef RADV_AMDGPU_WINSYS_PUBLIC_H
 #define RADV_AMDGPU_WINSYS_PUBLIC_H
 
-struct radeon_winsys *radv_amdgpu_winsys_create(int fd, uint32_t debug_flags);
+struct radeon_winsys *radv_amdgpu_winsys_create(int fd, uint64_t debug_flags,
+						uint64_t perftest_flags);
 
 #endif /* RADV_AMDGPU_WINSYS_PUBLIC_H */
diff --git a/src/broadcom/.gitignore b/src/broadcom/.gitignore
new file mode 100644
index 0000000..fcc603f
--- /dev/null
+++ b/src/broadcom/.gitignore
@@ -0,0 +1 @@
+cle/*_pack.h
diff --git a/src/broadcom/Android.genxml.mk b/src/broadcom/Android.genxml.mk
new file mode 100644
index 0000000..a504326
--- /dev/null
+++ b/src/broadcom/Android.genxml.mk
@@ -0,0 +1,58 @@
+# Copyright © 2016 Intel Corporation
+# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_broadcom_genxml
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+intermediates := $(call local-generated-sources-dir)
+
+# dummy.c source file is generated to meet the build system's rules.
+LOCAL_GENERATED_SOURCES += $(intermediates)/dummy.c
+
+$(intermediates)/dummy.c:
+	@mkdir -p $(dir $@)
+	@echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))"
+	$(hide) touch $@
+
+# This is the list of auto-generated header files
+LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/broadcom/, $(BROADCOM_GENXML_GENERATED_FILES))
+
+define header-gen
+	@mkdir -p $(dir $@)
+	@echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+	$(hide) $(PRIVATE_SCRIPT) $(PRIVATE_SCRIPT_FLAGS) $(PRIVATE_XML) > $@
+endef
+
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v21.xml
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v21.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+	$(call header-gen)
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+	$(MESA_TOP)/src/broadcom/cle \
+	$(intermediates)
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/broadcom/Android.mk b/src/broadcom/Android.mk
new file mode 100644
index 0000000..d2da907
--- /dev/null
+++ b/src/broadcom/Android.mk
@@ -0,0 +1,28 @@
+# Copyright © 2016 Intel Corporation
+# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+
+LOCAL_PATH := $(call my-dir)
+
+# Import variables
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(LOCAL_PATH)/Android.genxml.mk
diff --git a/src/broadcom/Makefile.am b/src/broadcom/Makefile.am
new file mode 100644
index 0000000..f4a005b
--- /dev/null
+++ b/src/broadcom/Makefile.am
@@ -0,0 +1,42 @@
+# Copyright © 2016 Broadcom
+# Copyright © 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+include Makefile.sources
+
+lib_LTLIBRARIES =
+check_LTLIBRARIES =
+noinst_DATA =
+noinst_HEADERS =
+noinst_LTLIBRARIES =
+noinst_PROGRAMS =
+check_PROGRAMS =
+TESTS =
+BUILT_SOURCES =
+CLEANFILES =
+EXTRA_DIST = $(BROADCOM_FILES)
+
+MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
+PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
+
+include Makefile.genxml.am
+
+CLEANFILES += $(BUILT_SOURCES)
diff --git a/src/broadcom/Makefile.genxml.am b/src/broadcom/Makefile.genxml.am
new file mode 100644
index 0000000..b5ff528
--- /dev/null
+++ b/src/broadcom/Makefile.genxml.am
@@ -0,0 +1,37 @@
+# Copyright © 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+BUILT_SOURCES += $(BROADCOM_GENXML_GENERATED_FILES)
+
+EXTRA_DIST += $(BROADCOM_GENXML_XML_FILES)
+EXTRA_DIST += $(BROADCOM_GENXML_GENERATED_FILES)
+
+SUFFIXES = _pack.h .xml
+
+$(BROADCOM_GENXML_GENERATED_FILES): cle/gen_pack_header.py
+
+.xml_pack.h:
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/cle/gen_pack_header.py $< > $@ || ($(RM) $@; false)
+
+EXTRA_DIST += \
+	cle/gen_pack_header.py \
+	$()
diff --git a/src/broadcom/Makefile.sources b/src/broadcom/Makefile.sources
new file mode 100644
index 0000000..4dadb68
--- /dev/null
+++ b/src/broadcom/Makefile.sources
@@ -0,0 +1,13 @@
+BROADCOM_GENXML_GENERATED_FILES = \
+	cle/v3d_packet_v21_pack.h \
+	$()
+
+BROADCOM_GENXML_XML_FILES = \
+	cle/v3d_packet_v21.xml \
+	$()
+
+BROADCOM_FILES = \
+	cle/v3d_packet_helpers.h \
+	common/v3d_device_info.h \
+	$()
+
diff --git a/src/broadcom/cle/gen_pack_header.py b/src/broadcom/cle/gen_pack_header.py
new file mode 100644
index 0000000..1ebeec7
--- /dev/null
+++ b/src/broadcom/cle/gen_pack_header.py
@@ -0,0 +1,551 @@
+#encoding=utf-8
+
+# Copyright (C) 2016 Intel Corporation
+# Copyright (C) 2016 Broadcom
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+from __future__ import (
+    absolute_import, division, print_function, unicode_literals
+)
+import xml.parsers.expat
+import re
+import sys
+import copy
+
+license =  """/* Generated code, see v3d_packet_*.xml and gen_pack_header.py */
+"""  # substituted into pack_header through %(license)s
+
+pack_header = """%(license)s
+
+/* Packets, enums and structures for %(platform)s.
+ *
+ * This file has been generated, do not hand edit.
+ */
+
+#ifndef %(guard)s
+#define %(guard)s
+
+#include "v3d_packet_helpers.h"
+
+"""
+
+def to_alphanum(name):
+    """Strip or transliterate characters so name can be used in an identifier.
+
+    Spaces, slashes and dashes become underscores, the Greek letter alpha is
+    spelled out, and the remaining listed punctuation is removed outright.
+    """
+
+    to_underscore = [' ', '/', '-']
+    to_delete = [
+        '[', ']',
+        '(', ')',
+        ':', '.', ',',
+        '=', '>', '#',
+        '&', '*', '"',
+        '+', '\'',
+    ]
+
+    # The passes may run in any order: none of the replacement texts
+    # ('_', 'alpha', '') contains a character that another pass rewrites.
+    cleaned = name
+    for ch in to_underscore:
+        cleaned = cleaned.replace(ch, '_')
+    cleaned = cleaned.replace('α', 'alpha')
+    for ch in to_delete:
+        cleaned = cleaned.replace(ch, '')
+
+    return cleaned
+
+def safe_name(name):
+    """Return to_alphanum(name), prefixed with '_' unless it starts with a letter."""
+    cleaned = to_alphanum(name)
+    if cleaned[0].isalpha():
+        return cleaned
+    return '_' + cleaned
+
+def num_from_str(num_str):
+    """Parse a decimal or 0x-prefixed hex literal; other leading-zero forms are rejected."""
+    if num_str.lower().startswith('0x'):
+        return int(num_str, base=16)
+    assert num_str == '0' or not num_str.startswith('0'), 'octal numbers not allowed'
+    return int(num_str)
+
+class Field(object):
+    """One <field> of a packet or struct.
+
+    Holds the inclusive bit range [start, end], the type, and any prefix/default.
+    """
+    ufixed_pattern = re.compile(r"u(\d+)\.(\d+)")
+    sfixed_pattern = re.compile(r"s(\d+)\.(\d+)")
+
+    def __init__(self, parser, attrs):
+        """Read the field's layout and type from the XML attribute dict attrs."""
+        self.parser = parser
+        if "name" in attrs:
+            self.name = safe_name(attrs["name"]).lower()
+
+        # A "start" value ending in "b" is multiplied by 8 (bytes to bits).
+        if str(attrs["start"]).endswith("b"):
+            self.start = int(attrs["start"][:-1]) * 8
+        else:
+            self.start = int(attrs["start"])
+        # packet <field> entries in XML start from the bit after the
+        # opcode, so shift everything up by 8 since we'll also have a
+        # Field for the opcode.
+        if not parser.struct:
+            self.start += 8
+
+        self.end = self.start + int(attrs["size"]) - 1
+        self.type = attrs["type"]
+
+        self.prefix = safe_name(attrs["prefix"]).upper() if "prefix" in attrs else None
+        self.default = int(attrs["default"]) if "default" in attrs else None
+
+        # uN.M / sN.M types become 'ufixed' / 'sfixed'; M is kept as fractional_size.
+        ufixed_match = Field.ufixed_pattern.match(self.type)
+        if ufixed_match:
+            self.type = 'ufixed'
+            self.fractional_size = int(ufixed_match.group(2))
+
+        sfixed_match = Field.sfixed_pattern.match(self.type)
+        if sfixed_match:
+            self.type = 'sfixed'
+            self.fractional_size = int(sfixed_match.group(2))
+
+    def emit_template_struct(self, dim):
+        """Print the C struct member for this field, then #defines for its values."""
+        if self.type == 'address':
+            type = '__gen_address_type'
+        elif self.type == 'bool':
+            type = 'bool'
+        elif self.type in ('float', 'ufixed', 'sfixed'):
+            type = 'float'
+        elif self.type == 'uint' and self.end - self.start >= 32:
+            # end is inclusive, so a difference of 32 already means 33 bits.
+            type = 'uint64_t'
+        elif self.type == 'offset':
+            type = 'uint64_t'
+        elif self.type == 'int':
+            type = 'int32_t'
+        elif self.type == 'uint':
+            type = 'uint32_t'
+        elif self.type in self.parser.structs:
+            type = 'struct ' + self.parser.gen_prefix(safe_name(self.type))
+        elif self.type == 'mbo':
+            # mbo fields get neither a struct member nor #defines.
+            return
+        else:
+            print("#error unhandled type: %s" % self.type)
+
+        print("   %-36s %s%s;" % (type, self.name, dim))
+
+        # Value #defines carry the field's prefix only when no default is set.
+        # Bind prefix first so a field with values and a default cannot hit a NameError.
+        prefix = ""
+        if self.values and self.default is None and self.prefix:
+            prefix = self.prefix + "_"
+
+        for value in self.values:
+            print("#define %-40s %d" % ((prefix + value.name).replace("__", "_"),
+                                        value.value))
+
+    def overlaps(self, field):
+        """True when field is a different object whose bit range intersects this one."""
+        return self != field and max(self.start, field.start) <= min(self.end, field.end)
+
+
+class Group(object):
+    def __init__(self, parser, parent, start, count):
+        """Create an empty field group; start and count are stored as given."""
+        self.parser, self.parent = parser, parent
+        self.start, self.count = start, count
+        # size and fields start out empty; this constructor does not fill them in.
+        self.size = 0
+        self.fields = []
+
+    def emit_template_struct(self, dim):
+        """Emit each field's struct member; count == 0 prints only a placeholder comment."""
+        if self.count == 0:
+            print("   /* variable length fields follow */")
+            return
+        # A group repeated more than once gets an array suffix appended to dim.
+        member_dim = "%s[%d]" % (dim, self.count) if self.count > 1 else dim
+        for member in self.fields:
+            member.emit_template_struct(member_dim)
+
+    class Byte:
+        # Per-byte bookkeeping: the fields touching the byte, plus the
+        # address-typed field among them (None until collect_bytes sets one).
+        def __init__(self):
+            self.size, self.fields, self.address = 8, [], None
+
+    def collect_bytes(self, bytes):
+        """Fill the dict bytes (byte index -> Byte) with every field touching each byte."""
+        for field in self.fields:
+            first_byte = field.start // 8
+            last_byte = field.end // 8
+
+            # range() rather than xrange() so the script also runs under Python 3.
+            for b in range(first_byte, last_byte + 1):
+                if b not in bytes:
+                    bytes[b] = self.Byte()
+
+                bytes[b].fields.append(field)
+                if field.type == "address":
+                    # A later address field in the same byte replaces an earlier one.
+                    bytes[b].address = field
+
+    def emit_pack_function(self, start):
+        """Print the C statements that fill cl[] from values; the start argument is never read."""
+        # Determine number of bytes in this group.
+        self.length = max(field.end // 8 for field in self.fields) + 1
+
+        bytes = {}
+        self.collect_bytes(bytes)
+
+        relocs_emitted = set()
+        memcpy_fields = set()
+
+        for index in range(self.length):
+            # Handle MBZ bytes
+            if not index in bytes:
+                print("   cl[%2d] = 0;" % index)
+                continue
+            byte = bytes[index]
+
+            # Call out to the driver to note our relocations.  Inside of the
+            # packet we only store offsets within the BOs, and we store the
+            # handle to the packet outside.  Unlike Intel genxml, we don't
+            # need to have the other bits that will be stored together with
+            # the address during the reloc process, so there's no need for the
+            # complicated combine_address() function.
+            if byte.address and byte.address not in relocs_emitted:
+                print("   __gen_emit_reloc(data, &values->%s);" % byte.address.name)
+                relocs_emitted.add(byte.address)
+
+            # Special case: floats can't have any other fields packed into
+            # them (since they'd change the meaning of the float), and the
+            # per-byte bitshifting math below bloats the pack code for floats,
+            # so just copy them directly here.  Also handle byte-aligned 32-bit
+            # uints/ints with no merged fields (the size test is 32 bits only).
+            if len(byte.fields) == 1:
+                field = byte.fields[0]
+                if field.type in ["float", "uint", "int"] and field.start % 8 == 0 and field.end - field.start == 31:
+                    if field in memcpy_fields:
+                        continue
+
+                    if not any(field.overlaps(scan_field) for scan_field in self.fields):
+                        assert(field.start == index * 8)
+                        print("")
+                        print("   memcpy(&cl[%d], &values->%s, sizeof(values->%s));" %
+                                (index, field.name, field.name))
+                        memcpy_fields.add(field)
+                        continue
+
+            byte_start = index * 8
+            v = None
+            prefix = "   cl[%2d] =" % index
+
+            field_index = 0
+            for field in byte.fields:
+                if field.type != "mbo":
+                    name = field.name
+
+                start = field.start
+                end = field.end
+                field_byte_start = (field.start // 8) * 8
+                start -= field_byte_start
+                end -= field_byte_start
+
+                if field.type == "mbo":
+                    s = "__gen_mbo(%d, %d)" % \
+                        (start, end)
+                elif field.type == "address":
+                    s = "__gen_address_offset(&values->%s)" % byte.address.name
+                elif field.type == "uint":
+                    s = "__gen_uint(values->%s, %d, %d)" % \
+                        (name, start, end)
+                elif field.type == "int":
+                    s = "__gen_sint(values->%s, %d, %d)" % \
+                        (name, start, end)
+                elif field.type == "bool":
+                    s = "__gen_uint(values->%s, %d, %d)" % \
+                        (name, start, end)
+                elif field.type == "float":
+                    s = "#error %s float value mixed in with other fields" % name
+                elif field.type == "offset":
+                    s = "__gen_offset(values->%s, %d, %d)" % \
+                        (name, start, end)
+                elif field.type == 'ufixed':
+                    s = "__gen_ufixed(values->%s, %d, %d, %d)" % \
+                        (name, start, end, field.fractional_size)
+                elif field.type == 'sfixed':
+                    s = "__gen_sfixed(values->%s, %d, %d, %d)" % \
+                        (name, start, end, field.fractional_size)
+                elif field.type in self.parser.structs:
+                    s = "__gen_uint(v%d_%d, %d, %d)" % \
+                        (index, field_index, start, end)
+                    field_index = field_index + 1
+                else:
+                    print("/* unhandled field %s, type %s */\n" % (name, field.type))
+                    s = None
+
+                if not s == None:
+                    if byte_start - field_byte_start != 0:
+                        s = "%s >> %d" % (s, byte_start - field_byte_start)
+
+                    if field == byte.fields[-1]:
+                        print("%s %s;" % (prefix, s))
+                    else:
+                        print("%s %s |" % (prefix, s))
+                    prefix = "           "
+
+            print("")
+            continue
+
+    def emit_unpack_function(self, start):
+        """Print one `values->X = __gen_unpack_*(cl, ...);` line per field.
+
+        `start` is added to each field's start and end bit.  "mbo" fields are
+        skipped; a field of an unrecognized type gets a comment in the output
+        instead of an unpack call.
+        """
+        for field in self.fields:
+            if field.type == "mbo":
+                continue
+            args = ['cl', str(start + field.start), str(start + field.end)]
+
+            if field.type == "address":
+                convert = "__gen_unpack_address"
+            elif field.type in ("uint", "bool"):
+                convert = "__gen_unpack_uint"
+            elif field.type == "int":
+                convert = "__gen_unpack_sint"
+            elif field.type == "float":
+                convert = "__gen_unpack_float"
+            elif field.type == "offset":
+                convert = "__gen_unpack_offset"
+            elif field.type in ('ufixed', 'sfixed'):
+                # The fixed-point helpers also take the fractional bit count.
+                args.append(str(field.fractional_size))
+                convert = "__gen_unpack_" + field.type
+            else:
+                # Used to read `name` (never assigned here -> NameError) and
+                # then fell through to print "None(...)"; report and move on.
+                print("/* unhandled field %s, type %s */\n" % (field.name, field.type))
+                continue
+
+            print("   values->%s = %s(%s);" % \
+                  (field.name, convert, ', '.join(args)))
+
+class Value(object):
+    def __init__(self, attrs):
+        # attrs: attributes of one <value> element; "value" must parse with int().
+        self.name, self.value = safe_name(attrs["name"]).upper(), int(attrs["value"])
+
+class Parser(object):
+    """Expat-driven reader of a <vcxml> file; prints a C pack header to stdout."""
+
+    def __init__(self):
+        self.parser = xml.parsers.expat.ParserCreate()
+        self.parser.StartElementHandler = self.start_element
+        self.parser.EndElementHandler = self.end_element
+
+        self.packet = None
+        self.struct = None
+        self.structs = {}
+        self.registers = {}
+
+    def gen_prefix(self, name):
+        """Return 'V3D<ver>' + name, inserting '_' unless name starts with one."""
+        separator = "" if name[0] == "_" else "_"
+        return 'V3D%s%s%s' % (self.ver, separator, name)
+
+    def gen_guard(self):
+        """Return the include-guard macro name used by the generated header."""
+        return self.gen_prefix("PACK_H")
+
+    def start_element(self, name, attrs):
+        """Expat start-tag handler: record state for the element being opened."""
+        if name == "vcxml":
+            self.platform = "V3D {}".format(attrs["gen"])
+            self.ver = attrs["gen"].replace('.', '')
+            print(pack_header % {'license': license, 'platform': self.platform, 'guard': self.gen_guard()})
+        elif name in ("packet", "struct", "register"):
+            default_field = None
+
+            object_name = self.gen_prefix(safe_name(attrs["name"].upper()))
+            if name == "packet":
+                self.packet = object_name
+
+                # Add a fixed Field for the opcode.  We only make <field>s in
+                # the XML for the fields listed in the spec, and all of those
+                # start from bit 0 after the opcode.
+                default_field = {
+                    "name" : "opcode",
+                    "default" : attrs["code"],
+                    "type" : "uint",
+                    "start" : -8,
+                    "size" : 8,
+                }
+            elif name == "struct":
+                self.struct = object_name
+                self.structs[attrs["name"]] = 1
+            elif name == "register":
+                self.register = object_name
+                self.reg_num = num_from_str(attrs["num"])
+                self.registers[attrs["name"]] = 1
+
+            self.group = Group(self, None, 0, 1)
+            if default_field:
+                field = Field(self, default_field)
+                field.values = []
+                self.group.fields.append(field)
+
+        elif name == "field":
+            self.group.fields.append(Field(self, attrs))
+            self.values = []
+        elif name == "enum":
+            self.values = []
+            self.enum = safe_name(attrs["name"])
+            self.prefix = safe_name(attrs["prefix"]) if "prefix" in attrs else None
+        elif name == "value":
+            self.values.append(Value(attrs))
+
+    def end_element(self, name):
+        """Expat end-tag handler: emit the finished element and clear its state."""
+        if name == "packet":
+            self.emit_packet()
+            self.packet = None
+            self.group = None
+        elif name == "struct":
+            self.emit_struct()
+            self.struct = None
+            self.group = None
+        elif name == "register":
+            self.emit_register()
+            self.register = None
+            self.reg_num = None
+            self.group = None
+        elif name == "field":
+            self.group.fields[-1].values = self.values
+        elif name == "enum":
+            self.emit_enum()
+            self.enum = None
+        elif name == "vcxml":
+            print('#endif /* %s */' % self.gen_guard())
+
+    def emit_template_struct(self, name, group):
+        """Print `struct <name> { ... };`, with the members printed by group."""
+        print("struct %s {" % name)
+        group.emit_template_struct("")
+        print("};\n")
+
+    def emit_pack_function(self, name, group):
+        """Print the inline <name>_pack() function and the <name>_length define."""
+        print("static inline void\n%s_pack(__gen_user_data *data, uint8_t * restrict cl,\n%sconst struct %s * restrict values)\n{" %
+              (name, ' ' * (len(name) + 6), name))
+        group.emit_pack_function(0)
+        print("}\n")
+
+        # Length comes from the group argument (was self.group), like the body.
+        print('#define %-33s %6d' % (name + "_length", group.length))
+
+    def emit_unpack_function(self, name, group):
+        """Print <name>_unpack(), wrapped in #ifdef __gen_unpack_address."""
+        print("#ifdef __gen_unpack_address")
+        print("static inline void")
+        print("%s_unpack(const uint8_t * restrict cl,\n%sstruct %s * restrict values)\n{" %
+              (name, ' ' * (len(name) + 8), name))
+        group.emit_unpack_function(0)
+        print("}\n#endif\n")
+
+    def emit_packet(self):
+        """Print opcode and header defines, then struct/pack/unpack, for a packet."""
+        name = self.packet
+        assert(self.group.fields[0].name == "opcode")
+        print('#define %-33s %6d' %
+              (name + "_opcode", self.group.fields[0].default))
+
+        default_fields = []
+        for field in self.group.fields:
+            if type(field) is Field and field.default is not None:
+                default_fields.append("   .%-35s = %6d" % (field.name, field.default))
+
+        if default_fields:
+            print('#define %-40s\\' % (name + '_header'))
+            print(",  \\\n".join(default_fields))
+            print('')
+
+        self.emit_template_struct(self.packet, self.group)
+        self.emit_pack_function(self.packet, self.group)
+        self.emit_unpack_function(self.packet, self.group)
+
+        print('')
+
+    def emit_register(self):
+        """Print the register-number define, then struct/pack/unpack."""
+        name = self.register
+        if self.reg_num is not None:
+            # name is already prefixed (see start_element); no second gen_prefix.
+            print('#define %-33s 0x%04x' % (name + "_num", self.reg_num))
+
+        self.emit_template_struct(self.register, self.group)
+        self.emit_pack_function(self.register, self.group)
+        self.emit_unpack_function(self.register, self.group)
+
+    def emit_struct(self):
+        """Print an empty <name>_header define, then struct/pack/unpack."""
+        name = self.struct
+        # Emit an empty header define so that we can use the CL pack functions
+        # with structs.
+        print('#define ' + name + '_header')
+
+        self.emit_template_struct(self.struct, self.group)
+        self.emit_pack_function(self.struct, self.group)
+        self.emit_unpack_function(self.struct, self.group)
+
+        print('')
+
+    def emit_enum(self):
+        """Print one #define per collected <value>, with the enum prefix if set."""
+        print('/* enum %s */' % self.gen_prefix(self.enum))
+        for value in self.values:
+            name = value.name
+            if self.prefix:
+                name = self.prefix + "_" + name
+            # Printed for every value: it used to live in the no-prefix branch.
+            print('#define %-36s %6d' % (name.upper(), value.value))
+        print('')
+
+    def parse(self, filename):
+        """Parse the XML file at filename; the header is printed as a side effect."""
+        with open(filename, "rb") as file:
+            self.parser.ParseFile(file)
+
+# Entry point: argv[1] names the XML description to turn into a C header.
+if len(sys.argv) <= 1:
+    print("No input xml file specified")
+    sys.exit(1)
+
+input_file = sys.argv[1]
+p = Parser()
+p.parse(input_file)
diff --git a/src/broadcom/cle/v3d_packet_helpers.h b/src/broadcom/cle/v3d_packet_helpers.h
new file mode 100644
index 0000000..c86cad8
--- /dev/null
+++ b/src/broadcom/cle/v3d_packet_helpers.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <math.h>
+
+#ifdef HAVE_VALGRIND
+#include <valgrind.h>
+#include <memcheck.h>
+#define VG(x) x
+#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
+#else /* !HAVE_VALGRIND: VG() expands to nothing */
+#define VG(x)
+#endif /* HAVE_VALGRIND */
+
+#ifndef __gen_validate_value
+#define __gen_validate_value(x) /* no-op unless defined above or by the includer */
+#endif
+/*
+#ifndef __gen_address_type
+#error #define __gen_address_type before including this file
+#endif
+
+#ifndef __gen_user_data
+#error #define __gen_user_data before including this file
+#endif
+*/
+union __gen_value {   /* one storage cell viewed as float or as uint32_t */
+   float f;
+   uint32_t dw;
+};
+
+static inline uint64_t
+__gen_mbo(uint32_t start, uint32_t end)
+{
+   return (~0ull >> (63 - end + start)) << start; /* ones in bits start..end */
+}
+
+static inline uint64_t
+__gen_uint(uint64_t v, uint32_t start, uint32_t end)
+{
+   /* Returns v moved up to bit `start`; `end` is only read by the DEBUG
+    * range check, which asserts that v fits in bits start..end. */
+   __gen_validate_value(v);
+#if DEBUG
+   const int nbits = end - start + 1;
+   if (nbits < 64) {
+      const uint64_t limit = (1ull << nbits) - 1;
+      assert(v <= limit);
+   }
+#endif
+   return v << start;
+}
+
+static inline uint64_t
+__gen_sint(int64_t v, uint32_t start, uint32_t end)
+{
+   /* Keeps the low (end - start + 1) bits of v, i.e. its two's-complement
+    * encoding at that width, and returns them shifted up to bit `start`.
+    * DEBUG builds assert that v is representable at that width. */
+   const int nbits = end - start + 1;
+   const uint64_t keep = ~0ull >> (64 - nbits);
+
+   __gen_validate_value(v);
+#if DEBUG
+   if (nbits < 64) {
+      const int64_t hi = (1ll << (nbits - 1)) - 1;
+      const int64_t lo = -(1ll << (nbits - 1));
+      assert(lo <= v && v <= hi);
+   }
+#endif
+   return ((uint64_t)v & keep) << start;
+}
+
+static inline uint64_t
+__gen_offset(uint64_t v, uint32_t start, uint32_t end)
+{
+   /* v is returned as-is (no shift); DEBUG builds assert that it has no
+    * bits set outside start..end. */
+   __gen_validate_value(v);
+#if DEBUG
+   const uint64_t field_bits = (~0ull >> (64 - (end - start + 1))) << start;
+   assert((v & ~field_bits) == 0);
+#endif
+   return v;
+}
+
+static inline uint32_t
+__gen_float(float v)
+{
+   __gen_validate_value(v);
+   return ((union __gen_value) { .f = (v) }).dw; /* float storage read as uint32_t */
+}
+
+static inline uint64_t
+__gen_sfixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits)
+{
+   /* Signed fixed-point: v is scaled by 2^fract_bits, rounded with
+    * llroundf(), truncated to the field width and shifted to bit `start`. */
+   const float scale = (1 << fract_bits);
+   const uint64_t keep = ~0ull >> (64 - (end - start + 1));
+
+   __gen_validate_value(v);
+#if DEBUG
+   const float hi = ((1 << (end - start)) - 1) / scale;
+   const float lo = -(1 << (end - start)) / scale;
+   assert(lo <= v && v <= hi);
+#endif
+
+   const int64_t scaled = llroundf(v * scale);
+   return ((uint64_t)scaled & keep) << start;
+}
+
+static inline uint64_t
+__gen_ufixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits)
+{
+   /* Unsigned fixed-point: v is scaled by 2^fract_bits, rounded with
+    * llroundf() and shifted to bit `start`.  `end` is only read by the
+    * DEBUG range check. */
+   const float scale = (1 << fract_bits);
+
+   __gen_validate_value(v);
+#if DEBUG
+   const float hi = ((1 << (end - start + 1)) - 1) / scale;
+   const float lo = 0.0f;
+   assert(lo <= v && v <= hi);
+#endif
+   const uint64_t scaled = llroundf(v * scale);
+   return scaled << start;
+}
+
+static inline uint64_t
+__gen_unpack_uint(const uint8_t *restrict cl, uint32_t start, uint32_t end)
+{
+   /* Returns bits start..end of cl (lowest byte first), right-aligned.  All
+    * math is 64-bit: int shifts and a 32-bit mask broke fields > 32 bits. */
+   const int width = end - start + 1;
+   const uint64_t mask = width >= 64 ? ~0ull : (1ull << width) - 1;
+   uint64_t val = 0;
+   for (uint32_t byte = start / 8; byte <= end / 8; byte++) {
+      val |= (uint64_t)cl[byte] << ((byte - start / 8) * 8);
+   }
+   return (val >> (start % 8)) & mask;
+}
+
+static inline uint64_t
+__gen_unpack_sint(const uint8_t *restrict cl, uint32_t start, uint32_t end)
+{
+   const int shift = 64 - (int)(end - start + 1);
+   /* Move the field's top bit to bit 63 while still unsigned (an overflowing
+    * signed left shift is undefined), then shift back down as int64_t. */
+   const uint64_t top_aligned = __gen_unpack_uint(cl, start, end) << shift;
+   return (uint64_t)((int64_t)top_aligned >> shift);
+}
+
+static inline float
+__gen_unpack_sfixed(const uint8_t *restrict cl, uint32_t start, uint32_t end,
+                    uint32_t fractional_size)
+{
+        const int32_t raw = __gen_unpack_sint(cl, start, end); /* signed field */
+        return (float)raw / (1 << fractional_size); /* undo 2^frac scaling */
+}
+
+static inline float
+__gen_unpack_ufixed(const uint8_t *restrict cl, uint32_t start, uint32_t end,
+                    uint32_t fractional_size)
+{
+        uint32_t raw = __gen_unpack_uint(cl, start, end); /* bit 31 is data */
+        return (float)raw / (1 << fractional_size); /* undo 2^frac scaling */
+}
+
+static inline float
+__gen_unpack_float(const uint8_t *restrict cl, uint32_t start, uint32_t end)
+{
+   assert(start % 8 == 0 && end - start == 31);
+   float f; /* filled bytewise: cl + start / 8 may be unaligned for float */
+   for (unsigned i = 0; i < sizeof(f); i++) {
+      ((unsigned char *)&f)[i] = cl[start / 8 + i];
+   }
+   return f;
+}
+
diff --git a/src/broadcom/cle/v3d_packet_v21.xml b/src/broadcom/cle/v3d_packet_v21.xml
new file mode 100644
index 0000000..350cf29
--- /dev/null
+++ b/src/broadcom/cle/v3d_packet_v21.xml
@@ -0,0 +1,334 @@
+<vcxml gen="2.1">
+  <packet name="Halt" code="0"/>
+  <packet name="NOP" code="1"/>
+  <packet name="Flush" code="4" cl="B"/>
+  <packet name="Flush All State" code="5" cl="B"/>
+  <packet name="Start Tile Binning" code="6" cl="B"/>
+  <packet name="Increment Semaphore" code="7"/>
+  <packet name="Wait on Semaphore" code="8"/>
+  <packet name="Branch" code="16">
+    <field name="Address" size="32" start="0" type="address"/>
+  </packet>
+  <packet name="Branch to sub-list" code="17">
+    <field name="Address" size="32" start="0" type="address"/>
+  </packet>
+  <packet name="Return from sub-list" code="18"/>
+
+  <packet name="Store Multi-sample Resolved Tile Color Buffer" code="24" cl="R"/>
+  <packet name="Store Multi-sample Resolved Tile Color Buffer and EOF" code="25" cl="R"/>
+
+  <packet name="Store Full Resolution Tile Buffer" cl="R" code="26">
+    <field name="Address" size="32" start="0" type="address"/>
+    <field name="Last Tile" size="1" start="3" type="bool"/>
+    <field name="Disable Clear on Write" size="1" start="2" type="bool"/>
+    <field name="Disable Z/Stencil Buffer write" size="1" start="1" type="bool"/>
+    <field name="Disable Color Buffer write" size="1" start="0" type="bool"/>
+  </packet>
+
+  <packet name="Re-load Full Resolution Tile Buffer" cl="R" code="27">
+    <field name="Address" size="32" start="0" type="address"/>
+    <field name="Disable Z/Stencil Buffer read" size="1" start="1" type="bool"/>
+    <field name="Disable Color Buffer read" size="1" start="0" type="bool"/>
+  </packet>
+
+  <packet name="Store Tile Buffer General" code="28" cl="R">
+    <field name="Memory base address of frame/tile dump buffer" size="32" start="16" type="address"/>
+    <field name="Last Tile of Frame" size="1" start="19" type="bool"/>
+    <field name="Disable VG-Mask buffer dump" size="1" start="18" type="bool"/>
+    <field name="Disable Z/Stencil buffer dump" size="1" start="17" type="bool"/>
+    <field name="Disable Color buffer dump" size="1" start="16" type="bool"/>
+    <field name="Disable VG-Mask buffer clear on store/dump" size="1" start="15" type="bool"/>
+    <field name="Disable Z/Stencil buffer clear on store/dump" size="1" start="14" type="bool"/>
+    <field name="Disable Color buffer clear on store/dump" size="1" start="13" type="bool"/>
+
+    <field name="Pixel Color Format" size="2" start="8" type="uint">
+      <value name="rgba8888" value="0"/>
+      <value name="bgr565 dithered" value="1"/>
+      <value name="bgr565 no dither" value="2"/>
+    </field>
+
+    <field name="Mode" size="2" start="6" type="uint">
+      <value name="Sample 0" value="0"/>
+      <value name="Decimate x4" value="1"/>
+      <value name="Decimate x16" value="2"/>
+    </field>
+
+    <field name="Format" size="2" start="4" type="uint">
+      <value name="Raster" value="0"/>
+      <value name="T" value="1"/>
+      <value name="LT" value="2"/>
+    </field>
+
+    <field name="Buffer to Store" size="3" start="0" type="uint">
+      <value name="None" value="0"/>
+      <value name="Color" value="1"/>
+      <value name="Z/stencil" value="2"/>
+      <value name="Z" value="3"/>
+      <value name="VG-Mask" value="4"/>
+    </field>
+  </packet>
+
+  <packet name="Load Tile Buffer General" code="29" cl="R">
+    <field name="Memory base address of frame/tile dump buffer" size="32" start="16" type="address"/>
+    <field name="Disable VG-Mask buffer load" size="1" start="18" type="bool"/>
+    <field name="Disable Z/Stencil buffer load" size="1" start="17" type="bool"/>
+    <field name="Disable Color buffer load" size="1" start="16" type="bool"/>
+
+    <field name="Pixel Color Format" size="2" start="8" type="uint">
+      <value name="rgba8888" value="0"/>
+      <value name="bgr565 dithered" value="1"/>
+      <value name="bgr565 no dither" value="2"/>
+    </field>
+
+    <field name="Mode" size="2" start="6" type="uint">
+      <value name="Sample 0" value="0"/>
+      <value name="Decimate x4" value="1"/>
+      <value name="Decimate x16" value="2"/>
+    </field>
+
+    <field name="Format" size="2" start="4" type="uint">
+      <value name="Raster" value="0"/>
+      <value name="T" value="1"/>
+      <value name="LT" value="2"/>
+    </field>
+
+    <field name="Buffer to Store" size="3" start="0" type="uint">
+      <value name="None" value="0"/>
+      <value name="Color" value="1"/>
+      <value name="Z/stencil" value="2"/>
+      <value name="Z" value="3"/>
+      <value name="VG-Mask" value="4"/>
+    </field>
+  </packet>
+
+  <packet name="Indexed Primitive List" code="32">
+    <field name="Maximum Index" size="32" start="72" type="uint"/>
+    <field name="Address of Indices List" size="32" start="40" type="uint"/>
+    <field name="Length" size="32" start="8" type="uint"/>
+    <field name="Index type" size="4" start="4" type="uint">
+      <value name="8-bit" value="0"/>
+      <value name="16-bit" value="1"/>
+    </field>
+    <field name="Primitive mode" size="4" start="0" type="uint">
+      <value name="points" value="0"/>
+      <value name="lines" value="1"/>
+      <value name="line loop" value="2"/>
+      <value name="line strip" value="3"/>
+      <value name="triangles" value="4"/>
+      <value name="triangles strip" value="5"/>
+      <value name="triangles fan" value="6"/>
+    </field>
+  </packet>
+
+  <packet name="Vertex Array Primitives" code="33">
+    <field name="Index of First Vertex" size="32" start="40" type="uint"/>
+    <field name="Length" size="32" start="8" type="uint"/>
+    <field name="Primitive mode" size="4" start="0" type="uint">
+      <value name="points" value="0"/>
+      <value name="lines" value="1"/>
+      <value name="line loop" value="2"/>
+      <value name="line strip" value="3"/>
+      <value name="triangles" value="4"/>
+      <value name="triangles strip" value="5"/>
+      <value name="triangles fan" value="6"/>
+    </field>
+  </packet>
+
+  <packet name="Primitive List Format" cl="R" code="56">
+    <field name="Data Type" size="4" start="4" type="uint">
+      <value name="16-bit index" value="1"/>
+      <value name="32-bit x/y" value="3"/>
+    </field>
+
+    <field name="Primitive Type" size="4" start="0" type="uint">
+      <value name="Points List" value="0"/>
+      <value name="Lines List" value="1"/>
+      <value name="Triangles List" value="2"/>
+      <value name="RHY List" value="3"/>
+    </field>
+  </packet>
+
+  <packet name="GL Shader State" code="64">
+    <!-- The address field will be filled in by kernel validation code. -->
+    <field name="Address" size="28" start="0" type="uint"/>
+    <field name="Extended shader record" size="1" start="3" type="bool"/>
+    <field name="Number of attribute arrays" size="3" start="0" type="uint"/>
+  </packet>
+
+  <packet name="Clear Colors" cl="R" code="114">
+    <field name="Clear Stencil" size="8" start="96" type="uint"/>
+    <field name="Clear VG Mask" size="8" start="88" type="uint"/>
+    <field name="Clear ZS" size="24" start="64" type="uint"/>
+    <field name="Clear Color" size="64" start="0" type="uint"/>
+  </packet>
+
+  <packet name="Configuration Bits" code="96">
+    <field name="Early Z updates enable" size="1" start="17" type="bool"/>
+    <field name="Early Z enable" size="1" start="16" type="bool"/>
+    <field name="Z updates enable" size="1" start="15" type="bool"/>
+    <field name="Depth-Test Function" size="3" start="12" type="uint"/>
+    <!-- add values -->
+    <field name="Coverage Read Mode" size="1" start="11" type="uint"/>
+    <!-- add values -->
+    <field name="Coverage Pipe Select" size="1" start="8" type="bool"/>
+    <field name="Rasteriser Oversample Mode" size="2" start="6" type="bool"/>
+    <!-- add values -->
+    <field name="Coverage Read Type" size="1" start="5" type="uint"/>
+    <!-- add values -->
+    <field name="Antialiased Points and Lines" size="1" start="4" type="bool"/>
+    <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
+    <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
+    <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
+    <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
+  </packet>
+
+  <packet name="Flat Shade Flags" code="97">
+    <field name="Flat-shading Flags" size="32" start="0" type="uint"/>
+  </packet>
+
+  <packet name="Point size" code="98">
+    <field name="Point Size" size="32" start="0" type="float"/>
+  </packet>
+
+  <packet name="Line width" code="99">
+    <field name="Line width" size="32" start="0" type="float"/>
+  </packet>
+
+  <packet name="RHT X boundary" code="100">
+    <field name="RHT primitive X boundary" size="16" start="0" type="int"/>
+  </packet>
+
+  <packet name="Depth Offset" code="101">
+    <!-- these fields are both float-1-8-7 encoded (top 16 bits of a float32) -->
+    <field name="Depth Offset Units" size="16" start="16" type="uint"/>
+    <field name="Depth Offset Factor" size="16" start="0" type="uint"/>
+  </packet>
+
+  <packet name="Clip Window" code="102">
+    <field name="Clip Window Height in pixels" size="16" start="48" type="uint"/>
+    <field name="Clip Window Width in pixels" size="16" start="32" type="uint"/>
+    <field name="Clip Window Bottom Pixel Coordinate" size="16" start="16" type="uint"/>
+    <field name="Clip Window Left Pixel Coordinate" size="16" start="0" type="uint"/>
+  </packet>
+
+  <packet name="Viewport Offset" code="103">
+    <field name="Viewport Centre Y-coordinate" size="16" start="16" type="int"/>
+    <field name="Viewport Centre X-coordinate" size="16" start="0" type="int"/>
+  </packet>
+
+  <packet name="Z min and max clipping planes" code="104">
+    <field name="Maximum Zw" size="32" start="32" type="float"/>
+    <field name="Minimum Zw" size="32" start="0" type="float"/>
+  </packet>
+
+  <packet name="Clipper XY Scaling" code="105" cl="B">
+    <field name="Viewport Half-Height in 1/16th of pixel" size="32" start="32" type="float"/>
+    <field name="Viewport Half-Width in 1/16th of pixel" size="32" start="0" type="float"/>
+  </packet>
+
+  <packet name="Clipper Z Scale and Offset" code="106" cl="B">
+    <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
+    <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
+  </packet>
+
+  <packet name="Tile Binning Mode Configuration" code="112" cl="B">
+    <field name="Double-buffer in non-ms mode" size="1" start="119" type="bool"/>
+
+    <field name="Tile Allocation Block Size" size="2" start="117" type="uint">
+      <value name="block size 32" value="0"/>
+      <value name="block size 64" value="1"/>
+      <value name="block size 128" value="2"/>
+      <value name="block size 256" value="3"/>
+    </field>
+
+    <field name="Tile Allocation Initial Block Size" size="2" start="115" type="uint">
+      <value name="block size 32" value="0"/>
+      <value name="block size 64" value="1"/>
+      <value name="block size 128" value="2"/>
+      <value name="block size 256" value="3"/>
+    </field>
+
+    <field name="Auto-initialise Tile State Data Array" size="1" start="114" type="bool"/>
+    <field name="Tile Buffer 64-bit Color Depth" size="1" start="113" type="bool"/>
+    <field name="Multisample Mode (4x)" size="1" start="112" type="bool"/>
+
+    <field name="Height (in tiles)" size="8" start="104" type="uint"/>
+    <field name="Width (in tiles)" size="8" start="96" type="uint"/>
+
+    <field name="Tile State Data Array Address" size="32" start="64" type="uint"/>
+    <field name="Tile Allocation memory size" size="32" start="32" type="uint"/>
+    <field name="Tile Allocation memory address" size="32" start="0" type="uint"/>
+
+  </packet>
+
+  <packet name="Tile Rendering Mode Configuration" code="113" cl="R">
+    <field name="Double-buffer in non-ms mode" size="1" start="76" type="bool"/>
+    <field name="Early-Z/Early-Cov disable" size="1" start="75" type="bool"/>
+    <field name="Early-Z Update Direction GT/GE" size="1" start="74" type="bool"/>
+    <field name="Select Coverage Mode" size="1" start="73" type="bool"/>
+    <field name="Enable VG Mask Buffer" size="1" start="72" type="bool"/>
+    <field name="Memory Format" size="2" start="70" type="uint">
+      <value name="Raster" value="0"/>
+      <value name="T" value="1"/>
+      <value name="LT" value="2"/>
+    </field>
+    <field name="Decimate Mode" size="2" start="68" type="uint"/>
+
+    <field name="Non-HDR Frame Buffer Color Format" size="2" start="66" type="uint">
+      <value name="rendering config bgr565 dithered" value="0"/>
+      <value name="rendering config rgba8888" value="1"/>
+      <value name="rendering config bgr565 no dither" value="2"/>
+    </field>
+
+    <field name="Tile Buffer 64-bit Color Depth" size="1" start="65" type="bool"/>
+    <field name="Multisample Mode (4x)" size="1" start="64" type="bool"/>
+    <field name="Height (pixels)" size="16" start="48" type="uint"/>
+    <field name="Width (pixels)" size="16" start="32" type="uint"/>
+    <field name="Memory Address" size="32" start="0" type="address"/>
+  </packet>
+
+  <packet name="Tile Coordinates" code="115" cl="R">
+    <field name="Tile Row Number" size="8" start="8" type="uint"/>
+    <field name="Tile Column Number" size="8" start="0" type="uint"/>
+  </packet>
+
+  <packet name="Gem Relocations" code="254" cl="B">
+    <field name="buffer 1" size="32" start="32" type="uint"/>
+    <field name="buffer 0" size="32" start="0" type="uint"/>
+  </packet>
+
+  <struct name="Shader Record">
+    <field name="Fragment Shader is single threaded" size="1" start="0" type="bool"/>
+    <field name="Point Size included in shaded vertex data" size="1" start="1" type="bool"/>
+    <field name="Enable Clipping" size="1" start="2" type="bool"/>
+
+    <field name="Fragment Shader Number of Uniforms (not used currently)" size="8" start="2b" type="uint"/>
+    <field name="Fragment Shader Number of Varyings" size="8" start="3b" type="uint"/>
+    <field name="Fragment Shader Code Address" size="32" start="4b" type="address"/>
+    <!-- set up by the kernel -->
+    <field name="Fragment Shader Uniforms Address" size="32" start="8b" type="uint"/>
+
+    <field name="Vertex Shader Number of Uniforms (not used currently)" size="16" start="12b" type="uint"/>
+    <field name="Vertex Shader Attribute Array select bits" size="8" start="14b" type="uint"/>
+    <field name="Vertex Shader Total Attributes Size" size="8" start="15b" type="uint"/>
+    <field name="Vertex Shader Code Address" size="32" start="16b" type="address"/>
+    <!-- set up by the kernel; follows the code address, at byte 20 -->
+    <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="uint"/>
+
+    <field name="Coordinate Shader Number of Uniforms (not used currently)" size="16" start="24b" type="uint"/>
+    <field name="Coordinate Shader Attribute Array select bits" size="8" start="26b" type="uint"/>
+    <field name="Coordinate Shader Total Attributes Size" size="8" start="27b" type="uint"/>
+    <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
+    <!-- set up by the kernel -->
+    <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="uint"/>
+  </struct>
+
+  <struct name="Attribute Record">
+    <field name="Address" size="32" start="0b" type="address"/>
+    <field name="Number of Bytes minus 1" size="8" start="4b" type="uint"/>
+    <field name="Stride" size="8" start="5b" type="uint"/>
+    <field name="Vertex Shader VPM offset" size="8" start="6b" type="uint"/>
+    <field name="Coordinate Shader VPM offset" size="8" start="7b" type="uint"/>
+  </struct>
+
+</vcxml>
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
new file mode 100644
index 0000000..5685c7a
--- /dev/null
+++ b/src/broadcom/common/v3d_device_info.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef V3D_CHIP_H
+#define V3D_CHIP_H
+
+#include <stdint.h>
+
+/**
+ * Tracks features of the V3D chip.  Boolean flags for features tied to a
+ * specific version are meant to be stored here; for now the only thing
+ * recorded is the version number.
+ */
+struct v3d_device_info {
+        /** Simple V3D version: major * 10 + minor (so 2.1 is stored as 21) */
+        uint8_t ver;
+};
+
+#endif
diff --git a/src/compiler/Android.nir.gen.mk b/src/compiler/Android.nir.gen.mk
index 96fc750..e2187d0 100644
--- a/src/compiler/Android.nir.gen.mk
+++ b/src/compiler/Android.nir.gen.mk
@@ -37,10 +37,11 @@
 
 LOCAL_EXPORT_C_INCLUDE_DIRS += \
 	$(intermediates)/nir \
+	$(MESA_TOP)/src/compiler \
 	$(MESA_TOP)/src/compiler/nir
 
 LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, \
-	$(NIR_GENERATED_FILES))
+	$(NIR_GENERATED_FILES) $(SPIRV_GENERATED_FILES))
 
 # Modules using libmesa_nir must set LOCAL_GENERATED_SOURCES to this
 MESA_GEN_NIR_H := $(addprefix $(call local-generated-sources-dir)/, \
@@ -93,3 +94,7 @@
 $(intermediates)/nir/nir_opt_algebraic.c: $(nir_opt_algebraic_deps)
 	@mkdir -p $(dir $@)
 	$(hide) $(MESA_PYTHON2) $(nir_opt_algebraic_gen) $< > $@
+
+$(intermediates)/spirv/spirv_info.c: $(LOCAL_PATH)/spirv/spirv_info_c.py $(LOCAL_PATH)/spirv/spirv.core.grammar.json
+	@mkdir -p $(dir $@)
+	$(hide) $(MESA_PYTHON2) $^ $@ || ($(RM) $@; false)
diff --git a/src/compiler/Android.nir.mk b/src/compiler/Android.nir.mk
index f021b80..75a247a 100644
--- a/src/compiler/Android.nir.mk
+++ b/src/compiler/Android.nir.mk
@@ -37,6 +37,7 @@
 LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/mapi \
 	$(MESA_TOP)/src/mesa \
+	$(MESA_TOP)/src/compiler/spirv \
 	$(MESA_TOP)/src/gallium/include \
 	$(MESA_TOP)/src/gallium/auxiliary
 
diff --git a/src/compiler/Makefile.am b/src/compiler/Makefile.am
index d52da91..001ff81 100644
--- a/src/compiler/Makefile.am
+++ b/src/compiler/Makefile.am
@@ -33,9 +33,11 @@
 	-I$(top_srcdir)/src/compiler/glsl/glcpp\
 	-I$(top_builddir)/src/compiler/nir \
 	-I$(top_srcdir)/src/compiler/nir \
+	-I$(top_srcdir)/src/compiler/spirv \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/gtest/include \
+	$(VALGRIND_CFLAGS) \
 	$(DEFINES)
 
 AM_CFLAGS = \
diff --git a/src/compiler/Makefile.nir.am b/src/compiler/Makefile.nir.am
index 13f02a7..1533ee5 100644
--- a/src/compiler/Makefile.nir.am
+++ b/src/compiler/Makefile.nir.am
@@ -29,6 +29,7 @@
 nir_libnir_la_SOURCES =					\
 	$(NIR_FILES)					\
 	$(SPIRV_FILES)					\
+	$(SPIRV_GENERATED_FILES) 			\
 	$(NIR_GENERATED_FILES)
 
 nir/nir_builder_opcodes.h: nir/nir_opcodes.py nir/nir_builder_opcodes_h.py
@@ -51,6 +52,10 @@
 	$(MKDIR_GEN)
 	$(PYTHON_GEN) $(srcdir)/nir/nir_opt_algebraic.py > $@ || ($(RM) $@; false)
 
+spirv/spirv_info.c: spirv/spirv_info_c.py spirv/spirv.core.grammar.json
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/spirv/spirv_info_c.py $(srcdir)/spirv/spirv.core.grammar.json $@ || ($(RM) $@; false)
+
 noinst_PROGRAMS += spirv2nir
 
 spirv2nir_SOURCES = \
@@ -91,8 +96,13 @@
 TESTS += nir/tests/control_flow_tests
 
 
-BUILT_SOURCES += $(NIR_GENERATED_FILES)
-CLEANFILES += $(NIR_GENERATED_FILES)
+BUILT_SOURCES += \
+	$(NIR_GENERATED_FILES) \
+	$(SPIRV_GENERATED_FILES)
+
+CLEANFILES += \
+	$(NIR_GENERATED_FILES) \
+	$(SPIRV_GENERATED_FILES)
 
 EXTRA_DIST += \
 	nir/nir_algebraic.py				\
@@ -104,4 +114,6 @@
 	nir/nir_opt_algebraic.py			\
 	nir/tests \
 	nir/README \
+	spirv/spirv_info_c.py				\
+	spirv/spirv.core.grammar.json			\
 	SConscript.nir
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 2455d4e..a8309a1 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -140,7 +140,9 @@
 	glsl/program.h \
 	glsl/propagate_invariance.cpp \
 	glsl/s_expression.cpp \
-	glsl/s_expression.h
+	glsl/s_expression.h \
+	glsl/string_to_uint_map.cpp \
+	glsl/string_to_uint_map.h
 
 LIBGLSL_SHADER_CACHE_FILES = \
 	glsl/shader_cache.cpp \
@@ -186,7 +188,6 @@
 NIR_FILES = \
 	nir/nir.c \
 	nir/nir.h \
-	nir/nir_array.h \
 	nir/nir_builder.h \
 	nir/nir_clone.c \
 	nir/nir_constant_expressions.h \
@@ -208,6 +209,7 @@
 	nir/nir_lower_64bit_packing.c \
 	nir/nir_lower_alu_to_scalar.c \
 	nir/nir_lower_atomics.c \
+	nir/nir_lower_atomics_to_ssbo.c \
 	nir/nir_lower_bitmap.c \
 	nir/nir_lower_clamp_color_outputs.c \
 	nir/nir_lower_clip.c \
@@ -229,6 +231,7 @@
 	nir/nir_lower_passthrough_edgeflags.c \
 	nir/nir_lower_patch_vertices.c \
 	nir/nir_lower_phis_to_scalar.c \
+	nir/nir_lower_read_invocation_to_scalar.c \
 	nir/nir_lower_regs_to_ssa.c \
 	nir/nir_lower_returns.c \
 	nir/nir_lower_samplers.c \
@@ -254,6 +257,7 @@
 	nir/nir_opt_gcm.c \
 	nir/nir_opt_global_to_local.c \
 	nir/nir_opt_if.c \
+	nir/nir_opt_intrinsics.c \
 	nir/nir_opt_loop_unroll.c \
 	nir/nir_opt_move_comparisons.c \
 	nir/nir_opt_peephole_select.c \
@@ -277,12 +281,14 @@
 	nir/nir_worklist.c \
 	nir/nir_worklist.h
 
+SPIRV_GENERATED_FILES = \
+	spirv/spirv_info.c
+
 SPIRV_FILES = \
 	spirv/GLSL.std.450.h \
 	spirv/nir_spirv.h \
 	spirv/spirv.h \
 	spirv/spirv_info.h \
-	spirv/spirv_info.c \
 	spirv/spirv_to_nir.c \
 	spirv/vtn_alu.c \
 	spirv/vtn_cfg.c \
diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index 455cb81..3bf4b08 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -622,6 +622,14 @@
           * is used.
           */
          unsigned inner_coverage:1;
+
+         /** \name Layout qualifiers for GL_ARB_bindless_texture */
+         /** \{ */
+         unsigned bindless_sampler:1;
+         unsigned bindless_image:1;
+         unsigned bound_sampler:1;
+         unsigned bound_image:1;
+         /** \} */
       }
       /** \brief Set of flags, accessed by name. */
       q;
@@ -836,6 +844,7 @@
    /* List of ast_declarator_list * */
    exec_list declarations;
    bool is_declaration;
+   const glsl_type *type;
 };
 
 
diff --git a/src/compiler/glsl/ast_array_index.cpp b/src/compiler/glsl/ast_array_index.cpp
index dfa44b7..f6b7a64 100644
--- a/src/compiler/glsl/ast_array_index.cpp
+++ b/src/compiler/glsl/ast_array_index.cpp
@@ -299,12 +299,18 @@
        * values must not diverge between shader invocations run together. If the
        * values *do* diverge, then the behavior of the operation requiring a
        * dynamically uniform expression is undefined.
+       *
+       * From section 4.1.7 of the ARB_bindless_texture spec:
+       *
+       *    "Samplers aggregated into arrays within a shader (using square
+       *    brackets []) can be indexed with arbitrary integer expressions."
        */
       if (array->type->without_array()->is_sampler()) {
          if (!state->is_version(400, 320) &&
              !state->ARB_gpu_shader5_enable &&
              !state->EXT_gpu_shader5_enable &&
-             !state->OES_gpu_shader5_enable) {
+             !state->OES_gpu_shader5_enable &&
+             !state->has_bindless()) {
             if (state->is_version(130, 300))
                _mesa_glsl_error(&loc, state,
                                 "sampler arrays indexed with non-constant "
diff --git a/src/compiler/glsl/ast_function.cpp b/src/compiler/glsl/ast_function.cpp
index 0665e0c..2d156ae 100644
--- a/src/compiler/glsl/ast_function.cpp
+++ b/src/compiler/glsl/ast_function.cpp
@@ -107,35 +107,35 @@
     *  qualifiers. [...] It is legal to have additional qualifiers
     *  on a formal parameter, but not to have fewer."
     */
-   if (actual->data.image_coherent && !formal->data.image_coherent) {
+   if (actual->data.memory_coherent && !formal->data.memory_coherent) {
       _mesa_glsl_error(loc, state,
                        "function call parameter `%s' drops "
                        "`coherent' qualifier", formal->name);
       return false;
    }
 
-   if (actual->data.image_volatile && !formal->data.image_volatile) {
+   if (actual->data.memory_volatile && !formal->data.memory_volatile) {
       _mesa_glsl_error(loc, state,
                        "function call parameter `%s' drops "
                        "`volatile' qualifier", formal->name);
       return false;
    }
 
-   if (actual->data.image_restrict && !formal->data.image_restrict) {
+   if (actual->data.memory_restrict && !formal->data.memory_restrict) {
       _mesa_glsl_error(loc, state,
                        "function call parameter `%s' drops "
                        "`restrict' qualifier", formal->name);
       return false;
    }
 
-   if (actual->data.image_read_only && !formal->data.image_read_only) {
+   if (actual->data.memory_read_only && !formal->data.memory_read_only) {
       _mesa_glsl_error(loc, state,
                        "function call parameter `%s' drops "
                        "`readonly' qualifier", formal->name);
       return false;
    }
 
-   if (actual->data.image_write_only && !formal->data.image_write_only) {
+   if (actual->data.memory_write_only && !formal->data.memory_write_only) {
       _mesa_glsl_error(loc, state,
                        "function call parameter `%s' drops "
                        "`writeonly' qualifier", formal->name);
@@ -235,6 +235,8 @@
                              formal->name);
             return false;
          }
+
+         val->variable_referenced()->data.must_be_shader_input = 1;
       }
 
       /* Verify that 'out' and 'inout' actual parameters are lvalues. */
@@ -281,7 +283,7 @@
                              mode, formal->name,
                              actual->variable_referenced()->name);
             return false;
-         } else if (!actual->is_lvalue()) {
+         } else if (!actual->is_lvalue(state)) {
             _mesa_glsl_error(&loc, state,
                              "function parameter '%s %s' is not an lvalue",
                              mode, formal->name);
@@ -738,8 +740,8 @@
    if (src->type->is_error())
       return src;
 
-   assert(a <= GLSL_TYPE_BOOL);
-   assert(b <= GLSL_TYPE_BOOL);
+   assert(a <= GLSL_TYPE_IMAGE);
+   assert(b <= GLSL_TYPE_IMAGE);
 
    if (a == b)
       return src;
@@ -767,6 +769,12 @@
       case GLSL_TYPE_INT64:
          result = new(ctx) ir_expression(ir_unop_i642u, src);
          break;
+      case GLSL_TYPE_SAMPLER:
+         result = new(ctx) ir_expression(ir_unop_unpack_sampler_2x32, src);
+         break;
+      case GLSL_TYPE_IMAGE:
+         result = new(ctx) ir_expression(ir_unop_unpack_image_2x32, src);
+         break;
       }
       break;
    case GLSL_TYPE_INT:
@@ -909,6 +917,22 @@
          break;
       }
       break;
+   case GLSL_TYPE_SAMPLER:
+      switch (b) {
+      case GLSL_TYPE_UINT:
+         result = new(ctx)
+            ir_expression(ir_unop_pack_sampler_2x32, desired_type, src);
+         break;
+      }
+      break;
+   case GLSL_TYPE_IMAGE:
+      switch (b) {
+      case GLSL_TYPE_UINT:
+         result = new(ctx)
+            ir_expression(ir_unop_pack_image_2x32, desired_type, src);
+         break;
+      }
+      break;
    }
 
    assert(result != NULL);
@@ -1504,8 +1528,7 @@
        * components with zero.
        */
       glsl_base_type param_base_type = first_param->type->base_type;
-      assert(param_base_type == GLSL_TYPE_FLOAT ||
-             param_base_type == GLSL_TYPE_DOUBLE);
+      assert(first_param->type->is_float() || first_param->type->is_double());
       ir_variable *rhs_var =
          new(ctx) ir_variable(glsl_type::get_instance(param_base_type, 4, 1),
                               "mat_ctor_vec",
@@ -1514,7 +1537,7 @@
 
       ir_constant_data zero;
       for (unsigned i = 0; i < 4; i++)
-         if (param_base_type == GLSL_TYPE_FLOAT)
+         if (first_param->type->is_float())
             zero.f[i] = 0.0;
          else
             zero.d[i] = 0.0;
@@ -1929,6 +1952,13 @@
    return ir_rvalue::error_value(ctx);
 }
 
+static inline bool is_valid_constructor(const glsl_type *type,
+                                        struct _mesa_glsl_parse_state *state)
+{
+   if (type->is_numeric() || type->is_boolean()) return true; /* always */
+   return state->has_bindless() && (type->is_sampler() || type->is_image());
+}
+
 ir_rvalue *
 ast_function_expression::hir(exec_list *instructions,
                              struct _mesa_glsl_parse_state *state)
@@ -1961,9 +1991,21 @@
 
 
       /* Constructors for opaque types are illegal.
+       *
+       * From section 4.1.7 of the ARB_bindless_texture spec:
+       *
+       * "Samplers are represented using 64-bit integer handles, and may be
+       *  converted to and from 64-bit integers using constructors."
+       *
+       * From section 4.1.X of the ARB_bindless_texture spec:
+       *
+       * "Images are represented using 64-bit integer handles, and may be
+       *  converted to and from 64-bit integers using constructors."
        */
-      if (constructor_type->contains_opaque()) {
-         _mesa_glsl_error(& loc, state, "cannot construct opaque type `%s'",
+      if (constructor_type->contains_atomic() ||
+          (!state->has_bindless() && constructor_type->contains_opaque())) {
+         _mesa_glsl_error(& loc, state, "cannot construct %s type `%s'",
+                          state->has_bindless() ? "atomic" : "opaque",
                           constructor_type->name);
          return ir_rvalue::error_value(ctx);
       }
@@ -2006,7 +2048,7 @@
                                            state);
       }
 
-      if (!constructor_type->is_numeric() && !constructor_type->is_boolean())
+      if (!is_valid_constructor(constructor_type, state))
          return ir_rvalue::error_value(ctx);
 
       /* Total number of components of the type being constructed. */
@@ -2036,7 +2078,7 @@
             return ir_rvalue::error_value(ctx);
          }
 
-         if (!result->type->is_numeric() && !result->type->is_boolean()) {
+         if (!is_valid_constructor(result->type, state)) {
             _mesa_glsl_error(& loc, state, "cannot construct `%s' from a "
                              "non-numeric data type",
                              constructor_type->name);
@@ -2128,10 +2170,51 @@
 
       /* Type cast each parameter and, if possible, fold constants.*/
       foreach_in_list_safe(ir_rvalue, ir, &actual_parameters) {
-         const glsl_type *desired_type =
-            glsl_type::get_instance(constructor_type->base_type,
-                                    ir->type->vector_elements,
-                                    ir->type->matrix_columns);
+         const glsl_type *desired_type;
+
+         /* From section 5.4.1 of the ARB_bindless_texture spec:
+          *
+          * "In the following four constructors, the low 32 bits of the sampler
+          *  type correspond to the .x component of the uvec2 and the high 32
+          *  bits correspond to the .y component."
+          *
+          *  uvec2(any sampler type)     // Converts a sampler type to a
+          *                              //   pair of 32-bit unsigned integers
+          *  any sampler type(uvec2)     // Converts a pair of 32-bit unsigned integers to
+          *                              //   a sampler type
+          *  uvec2(any image type)       // Converts an image type to a
+          *                              //   pair of 32-bit unsigned integers
+          *  any image type(uvec2)       // Converts a pair of 32-bit unsigned integers to
+          *                              //   an image type
+          */
+         if (ir->type->is_sampler() || ir->type->is_image()) {
+            /* Convert a sampler/image type to a pair of 32-bit unsigned
+             * integers as defined by ARB_bindless_texture.
+             */
+            if (constructor_type != glsl_type::uvec2_type) {
+               _mesa_glsl_error(&loc, state, "sampler and image types can only "
+                                "be converted to a pair of 32-bit unsigned "
+                                "integers");
+            }
+            desired_type = glsl_type::uvec2_type;
+         } else if (constructor_type->is_sampler() ||
+                    constructor_type->is_image()) {
+            /* Convert a pair of 32-bit unsigned integers to a sampler or image
+             * type as defined by ARB_bindless_texture.
+             */
+            if (ir->type != glsl_type::uvec2_type) {
+               _mesa_glsl_error(&loc, state, "sampler and image types can only "
+                                "be converted from a pair of 32-bit unsigned "
+                                "integers");
+            }
+            desired_type = constructor_type;
+         } else {
+            desired_type =
+               glsl_type::get_instance(constructor_type->base_type,
+                                       ir->type->vector_elements,
+                                       ir->type->matrix_columns);
+         }
+
          ir_rvalue *result = convert_component(ir, desired_type);
 
          /* Attempt to convert the parameter to a constant valued expression.
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index a9c0d05..73d8d62 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -86,17 +86,17 @@
          return visit_continue;
 
       ir_variable *var = ir->variable_referenced();
-      /* We can have image_write_only set on both images and buffer variables,
+      /* We can have memory_write_only set on both images and buffer variables,
        * but in the former there is a distinction between reads from
        * the variable itself (write_only) and from the memory they point to
-       * (image_write_only), while in the case of buffer variables there is
+       * (memory_write_only), while in the case of buffer variables there is
        * no such distinction, that is why this check here is limited to
        * buffer variables alone.
        */
       if (!var || var->data.mode != ir_var_shader_storage)
          return visit_continue;
 
-      if (var->data.image_write_only) {
+      if (var->data.memory_write_only) {
          found = var;
          return visit_stop;
       }
@@ -444,10 +444,8 @@
     * type of both operands must be float.
     */
    assert(type_a->is_matrix() || type_b->is_matrix());
-   assert(type_a->base_type == GLSL_TYPE_FLOAT ||
-          type_a->base_type == GLSL_TYPE_DOUBLE);
-   assert(type_b->base_type == GLSL_TYPE_FLOAT ||
-          type_b->base_type == GLSL_TYPE_DOUBLE);
+   assert(type_a->is_float() || type_a->is_double());
+   assert(type_b->is_float() || type_b->is_double());
 
    /*   "* The operator is add (+), subtract (-), or divide (/), and the
     *      operands are matrices with the same number of rows and the same
@@ -949,11 +947,11 @@
          error_emitted = true;
       } else if (lhs_var != NULL && (lhs_var->data.read_only ||
                  (lhs_var->data.mode == ir_var_shader_storage &&
-                  lhs_var->data.image_read_only))) {
-         /* We can have image_read_only set on both images and buffer variables,
+                  lhs_var->data.memory_read_only))) {
+         /* We can have memory_read_only set on both images and buffer variables,
           * but in the former there is a distinction between assignments to
           * the variable itself (read_only) and to the memory they point to
-          * (image_read_only), while in the case of buffer variables there is
+          * (memory_read_only), while in the case of buffer variables there is
           * no such distinction, that is why this check here is limited to
           * buffer variables alone.
           */
@@ -973,7 +971,7 @@
           * The restriction on arrays is lifted in GLSL 1.20 and GLSL ES 3.00.
           */
          error_emitted = true;
-      } else if (!lhs->is_lvalue()) {
+      } else if (!lhs->is_lvalue(state)) {
          _mesa_glsl_error(& lhs_loc, state, "non-lvalue in assignment");
          error_emitted = true;
       }
@@ -1487,8 +1485,7 @@
        * in a scalar boolean.  See page 57 of the GLSL 1.50 spec.
        */
       assert(type->is_error()
-             || ((type->base_type == GLSL_TYPE_BOOL)
-                 && type->is_scalar()));
+             || (type->is_boolean() && type->is_scalar()));
 
       result = new(ctx) ir_expression(operations[this->oper], type,
                                       op[0], op[1]);
@@ -2362,7 +2359,10 @@
 {
    const struct glsl_type *type;
 
-   type = state->symbols->get_type(this->type_name);
+   if (structure)
+      type = structure->type;
+   else
+      type = state->symbols->get_type(this->type_name);
    *name = this->type_name;
 
    YYLTYPE loc = this->get_location();
@@ -2633,8 +2633,7 @@
     *    declare an atomic type with a different precision or to specify the
     *    default precision for an atomic type to be lowp or mediump."
     */
-   if (type->base_type == GLSL_TYPE_ATOMIC_UINT &&
-       precision != ast_precision_high) {
+   if (type->is_atomic_uint() && precision != ast_precision_high) {
       _mesa_glsl_error(loc, state,
                        "atomic_uint can only have highp precision qualifier");
    }
@@ -2900,7 +2899,7 @@
       assert(ctx->Const.MaxAtomicBufferBindings <= MAX_COMBINED_ATOMIC_BUFFERS);
       if (qual_binding >= ctx->Const.MaxAtomicBufferBindings) {
          _mesa_glsl_error(loc, state, "layout(binding = %d) exceeds the "
-                          " maximum number of atomic counter buffer bindings"
+                          "maximum number of atomic counter buffer bindings "
                           "(%u)", qual_binding,
                           ctx->Const.MaxAtomicBufferBindings);
 
@@ -2912,7 +2911,7 @@
       assert(ctx->Const.MaxImageUnits <= MAX_IMAGE_UNITS);
       if (max_index >= ctx->Const.MaxImageUnits) {
          _mesa_glsl_error(loc, state, "Image binding %d exceeds the "
-                          " maximum number of image units (%d)", max_index,
+                          "maximum number of image units (%d)", max_index,
                           ctx->Const.MaxImageUnits);
          return;
       }
@@ -2920,7 +2919,8 @@
    } else {
       _mesa_glsl_error(loc, state,
                        "the \"binding\" qualifier only applies to uniform "
-                       "blocks, opaque variables, or arrays thereof");
+                       "blocks, storage blocks, opaque variables, or arrays "
+                       "thereof");
       return;
    }
 
@@ -2999,6 +2999,26 @@
       _mesa_glsl_error(loc, state, "if a fragment input is (or contains) "
                        "a double, then it must be qualified with 'flat'");
    }
+
+   /* Bindless sampler/image fragment inputs must be qualified with 'flat'.
+    *
+    * From section 4.3.4 of the ARB_bindless_texture spec:
+    *
+    *    "(modify last paragraph, p. 35, allowing samplers and images as
+    *     fragment shader inputs) ... Fragment inputs can only be signed and
+    *     unsigned integers and integer vectors, floating point scalars,
+    *     floating-point vectors, matrices, sampler and image types, or arrays
+    *     or structures of these.  Fragment shader inputs that are signed or
+    *     unsigned integers, integer vectors, or any double-precision floating-
+    *     point type, or any sampler or image type must be qualified with the
+    *     interpolation qualifier "flat"."
+    */
+   if (state->has_bindless()
+       && (var_type->contains_sampler() || var_type->contains_image())) {
+      _mesa_glsl_error(loc, state, "if a fragment input is (or contains) "
+                       "a bindless sampler (or image), then it must be "
+                       "qualified with 'flat'");
+   }
 }
 
 static void
@@ -3228,6 +3248,9 @@
                        "compute shader variables cannot be given "
                        "explicit locations");
       return;
+   default:
+      fail = true;
+      break;
    };
 
    if (fail) {
@@ -3259,7 +3282,7 @@
             ? (qual_location + FRAG_RESULT_DATA0)
             : (qual_location + VARYING_SLOT_VAR0);
          break;
-      case MESA_SHADER_COMPUTE:
+      default:
          assert(!"Unexpected shader type");
          break;
       }
@@ -3295,6 +3318,98 @@
    }
 }
 
+static bool
+validate_storage_for_sampler_image_types(ir_variable *var,
+                                         struct _mesa_glsl_parse_state *state,
+                                         YYLTYPE *loc)
+{
+   /* Check the storage class of a sampler/image variable.
+    *
+    * Without bindless, section 4.1.7 of the GLSL 4.40 spec applies:
+    *
+    *    "[Opaque types] can only be declared as function
+    *     parameters or uniform-qualified variables."
+    *
+    * Sections 4.1.7 and 4.1.X of the ARB_bindless_texture spec say:
+    *
+    *    "Samplers may be declared as shader inputs and outputs, as uniform
+    *     variables, as temporary variables, and as function parameters."
+    *
+    *    "Images may be declared as shader inputs and outputs, as uniform
+    *     variables, as temporary variables, and as function parameters."
+    */
+   const unsigned mode = var->data.mode;
+   const bool bindless = state->has_bindless();
+   /* Accepted whether or not bindless is available. */
+   if (mode == ir_var_uniform || mode == ir_var_function_in)
+      return true;
+
+   if (!bindless) {
+      _mesa_glsl_error(loc, state, "image/sampler variables may only be "
+                       "declared as function parameters or "
+                       "uniform-qualified global variables");
+      return false;
+   }
+
+   /* Storage classes that are only open to bindless handles. */
+   if (mode == ir_var_auto ||
+       mode == ir_var_shader_in || mode == ir_var_shader_out ||
+       mode == ir_var_function_out || mode == ir_var_function_inout)
+      return true;
+
+   _mesa_glsl_error(loc, state, "bindless image/sampler variables may "
+                   "only be declared as shader inputs and outputs, as "
+                   "uniform variables, as temporary variables and as "
+                   "function parameters");
+   return false;
+}
+
+static bool
+validate_memory_qualifier_for_type(struct _mesa_glsl_parse_state *state,
+                                   YYLTYPE *loc,
+                                   const struct ast_type_qualifier *qual,
+                                   const glsl_type *type)
+{
+   /* From Section 4.10 (Memory Qualifiers) of the GLSL 4.50 spec:
+    *
+    * "Memory qualifiers are only supported in the declarations of image
+    *  variables, buffer variables, and shader storage blocks; it is an error
+    *  to use such qualifiers in any other declarations."
+    */
+   if (type->is_image() || qual->flags.q.buffer)
+      return true;
+
+   const bool has_memory_qualifier =
+      qual->flags.q.read_only || qual->flags.q.write_only ||
+      qual->flags.q.coherent || qual->flags.q._volatile ||
+      qual->flags.q.restrict_flag;
+   if (!has_memory_qualifier)
+      return true;
+   _mesa_glsl_error(loc, state, "memory qualifiers may only be applied "
+                    "in the declarations of image variables, buffer "
+                    "variables, and shader storage blocks");
+   return false;
+}
+
+static bool
+validate_image_format_qualifier_for_type(struct _mesa_glsl_parse_state *state,
+                                         YYLTYPE *loc,
+                                         const struct ast_type_qualifier *qual,
+                                         const glsl_type *type)
+{
+   /* From section 4.4.6.2 (Format Layout Qualifiers) of the GLSL 4.50 spec:
+    *
+    * "Format layout qualifiers can be used on image variable declarations
+    *  (those declared with a basic type having "image" in its keyword)."
+    */
+   /* Nothing to reject for image types or when no format was specified. */
+   if (type->is_image() || !qual->flags.q.explicit_image_format)
+      return true;
+   _mesa_glsl_error(loc, state, "format layout qualifiers may only be "
+                    "applied to images");
+   return false;
+}
+
 static void
 apply_image_qualifier_to_variable(const struct ast_type_qualifier *qual,
                                   ir_variable *var,
@@ -3303,32 +3418,21 @@
 {
    const glsl_type *base_type = var->type->without_array();
 
-   if (!base_type->is_image()) {
-      if (qual->flags.q.read_only ||
-          qual->flags.q.write_only ||
-          qual->flags.q.coherent ||
-          qual->flags.q._volatile ||
-          qual->flags.q.restrict_flag ||
-          qual->flags.q.explicit_image_format) {
-         _mesa_glsl_error(loc, state, "memory qualifiers may only be applied "
-                          "to images");
-      }
+   if (!validate_image_format_qualifier_for_type(state, loc, qual, base_type) ||
+       !validate_memory_qualifier_for_type(state, loc, qual, base_type))
       return;
-   }
 
-   if (var->data.mode != ir_var_uniform &&
-       var->data.mode != ir_var_function_in) {
-      _mesa_glsl_error(loc, state, "image variables may only be declared as "
-                       "function parameters or uniform-qualified "
-                       "global variables");
-   }
+   if (!base_type->is_image())
+      return;
 
-   var->data.image_read_only |= qual->flags.q.read_only;
-   var->data.image_write_only |= qual->flags.q.write_only;
-   var->data.image_coherent |= qual->flags.q.coherent;
-   var->data.image_volatile |= qual->flags.q._volatile;
-   var->data.image_restrict |= qual->flags.q.restrict_flag;
-   var->data.read_only = true;
+   if (!validate_storage_for_sampler_image_types(var, state, loc))
+      return;
+
+   var->data.memory_read_only |= qual->flags.q.read_only;
+   var->data.memory_write_only |= qual->flags.q.write_only;
+   var->data.memory_coherent |= qual->flags.q.coherent;
+   var->data.memory_volatile |= qual->flags.q._volatile;
+   var->data.memory_restrict |= qual->flags.q.restrict_flag;
 
    if (qual->flags.q.explicit_image_format) {
       if (var->data.mode == ir_var_function_in) {
@@ -3365,8 +3469,8 @@
        var->data.image_format != GL_R32F &&
        var->data.image_format != GL_R32I &&
        var->data.image_format != GL_R32UI &&
-       !var->data.image_read_only &&
-       !var->data.image_write_only) {
+       !var->data.memory_read_only &&
+       !var->data.memory_write_only) {
       _mesa_glsl_error(loc, state, "image variables of format other than r32f, "
                        "r32i or r32ui must be qualified `readonly' or "
                        "`writeonly'");
@@ -3421,6 +3525,69 @@
 }
 
 static void
+apply_bindless_qualifier_to_variable(const struct ast_type_qualifier *qual,
+                                     ir_variable *var,
+                                     struct _mesa_glsl_parse_state *state,
+                                     YYLTYPE *loc)
+{
+   bool has_local_qualifiers = qual->flags.q.bindless_sampler ||
+                               qual->flags.q.bindless_image ||
+                               qual->flags.q.bound_sampler ||
+                               qual->flags.q.bound_image;
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "Modify Section 4.4.6 Opaque-Uniform Layout Qualifiers of the GLSL 4.30
+    *  spec"
+    *
+    * "If these layout qualifiers are applied to other types of default block
+    *  uniforms, or variables with non-uniform storage, a compile-time error
+    *  will be generated."
+    */
+   if (has_local_qualifiers && !qual->flags.q.uniform) {
+      _mesa_glsl_error(loc, state, "ARB_bindless_texture layout qualifiers "
+                       "can only be applied to default block uniforms or "
+                       "variables with uniform storage");
+      return;
+   }
+
+   /* The ARB_bindless_texture spec doesn't state anything in this situation,
+    * but it makes sense to only allow bindless_sampler/bound_sampler for
+    * sampler types, and respectively bindless_image/bound_image for image
+    * types.
+    */
+   if ((qual->flags.q.bindless_sampler || qual->flags.q.bound_sampler) &&
+       !var->type->contains_sampler()) {
+      _mesa_glsl_error(loc, state, "bindless_sampler or bound_sampler can only "
+                       "be applied to sampler types");
+      return;
+   }
+
+   if ((qual->flags.q.bindless_image || qual->flags.q.bound_image) &&
+       !var->type->contains_image()) {
+      _mesa_glsl_error(loc, state, "bindless_image or bound_image can only be "
+                       "applied to image types");
+      return;
+   }
+
+   /* The bindless_sampler/bindless_image (and respectively
+    * bound_sampler/bound_image) layout qualifiers can be set at global and at
+    * local scope.
+    */
+   if (var->type->contains_sampler() || var->type->contains_image()) {
+      var->data.bindless = qual->flags.q.bindless_sampler ||
+                           qual->flags.q.bindless_image ||
+                           state->bindless_sampler_specified ||
+                           state->bindless_image_specified;
+
+      var->data.bound = qual->flags.q.bound_sampler ||
+                        qual->flags.q.bound_image ||
+                        state->bound_sampler_specified ||
+                        state->bound_image_specified;
+   }
+}
+
+static void
 apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
                                    ir_variable *var,
                                    struct _mesa_glsl_parse_state *state,
@@ -3597,14 +3764,9 @@
       }
    }
 
-   if (var->type->contains_sampler()) {
-      if (var->data.mode != ir_var_uniform &&
-          var->data.mode != ir_var_function_in) {
-         _mesa_glsl_error(loc, state, "sampler variables may only be declared "
-                          "as function parameters or uniform-qualified "
-                          "global variables");
-      }
-   }
+   if (var->type->contains_sampler() &&
+       !validate_storage_for_sampler_image_types(var, state, loc))
+      return;
 
    /* Is the 'layout' keyword used with parameters that allow relaxed checking.
     * Many implementations of GL_ARB_fragment_coord_conventions_enable and some
@@ -3717,6 +3879,9 @@
       _mesa_glsl_error(loc, state, "post_depth_coverage layout qualifier only "
                        "valid in fragment shader input layout declaration.");
    }
+
+   if (state->has_bindless())
+      apply_bindless_qualifier_to_variable(qual, var, state, loc);
 }
 
 static void
@@ -3861,8 +4026,23 @@
        * Similar text exists in the GLSL ES 3.00 spec, except that the GLSL ES
        * 3.00 spec allows structs as well.  Varying structs are also allowed
        * in GLSL 1.50.
+       *
+       * From section 4.3.4 of the ARB_bindless_texture spec:
+       *
+       *     "(modify third paragraph of the section to allow sampler and image
+       *     types) ...  Vertex shader inputs can only be float,
+       *     single-precision floating-point scalars, single-precision
+       *     floating-point vectors, matrices, signed and unsigned integers
+       *     and integer vectors, sampler and image types."
+       *
+       * From section 4.3.6 of the ARB_bindless_texture spec:
+       *
+       *     "Output variables can only be floating-point scalars,
+       *     floating-point vectors, matrices, signed or unsigned integers or
+       *     integer vectors, sampler or image types, or arrays or structures
+       *     of any these."
        */
-      switch (var->type->get_scalar_type()->base_type) {
+      switch (var->type->without_array()->base_type) {
       case GLSL_TYPE_FLOAT:
          /* Ok in all GLSL versions */
          break;
@@ -3884,6 +4064,11 @@
       case GLSL_TYPE_UINT64:
       case GLSL_TYPE_INT64:
          break;
+      case GLSL_TYPE_SAMPLER:
+      case GLSL_TYPE_IMAGE:
+         if (state->has_bindless())
+            break;
+         /* fallthrough */
       default:
          _mesa_glsl_error(loc, state, "illegal type for a varying variable");
          break;
@@ -3910,6 +4095,8 @@
       case MESA_SHADER_COMPUTE:
          /* Invariance isn't meaningful in compute shaders. */
          break;
+      default:
+         break;
       }
    }
 
@@ -4097,6 +4284,22 @@
        */
       earlier->data.precision = var->data.precision;
 
+   } else if (earlier->data.how_declared == ir_var_declared_implicitly &&
+              state->allow_builtin_variable_redeclaration) {
+      /* Allow verbatim redeclarations of built-in variables. Not explicitly
+       * valid, but some applications do it.
+       */
+      if (earlier->data.mode != var->data.mode &&
+          !(earlier->data.mode == ir_var_system_value &&
+            var->data.mode == ir_var_shader_in)) {
+         _mesa_glsl_error(&loc, state,
+                          "redeclaration of `%s' with incorrect qualifiers",
+                          var->name);
+      } else if (earlier->type != var->type) {
+         _mesa_glsl_error(&loc, state,
+                          "redeclaration of `%s' has incorrect type",
+                          var->name);
+      }
    } else if (allow_all_redeclarations) {
       if (earlier->data.mode != var->data.mode) {
          _mesa_glsl_error(&loc, state,
@@ -4154,11 +4357,22 @@
     *    "Opaque variables [...] are initialized only through the
     *     OpenGL API; they cannot be declared with an initializer in a
     *     shader."
+    *
+    * From section 4.1.7 of the ARB_bindless_texture spec:
+    *
+    *    "Samplers may be declared as shader inputs and outputs, as uniform
+    *     variables, as temporary variables, and as function parameters."
+    *
+    * From section 4.1.X of the ARB_bindless_texture spec:
+    *
+    *    "Images may be declared as shader inputs and outputs, as uniform
+    *     variables, as temporary variables, and as function parameters."
     */
-   if (var->type->contains_opaque()) {
+   if (var->type->contains_atomic() ||
+       (!state->has_bindless() && var->type->contains_opaque())) {
       _mesa_glsl_error(&initializer_loc, state,
-                       "cannot initialize opaque variable %s",
-                       var->name);
+                       "cannot initialize %s variable %s",
+                       state->has_bindless() ? "atomic" : "opaque", var->name);
    }
 
    if ((var->data.mode == ir_var_shader_in) && (state->current_function == NULL)) {
@@ -4281,7 +4495,7 @@
       } else {
          if (var->type->is_numeric()) {
             /* Reduce cascading errors. */
-            var->constant_value = type->qualifier.flags.q.constant
+            rhs = var->constant_value = type->qualifier.flags.q.constant
                ? ir_constant::zero(state, var->type) : NULL;
          }
       }
@@ -4713,7 +4927,7 @@
                           "invalid type `%s' in empty declaration",
                           type_name);
       } else {
-         if (decl_type->base_type == GLSL_TYPE_ARRAY) {
+         if (decl_type->is_array()) {
             /* From Section 13.22 (Array Declarations) of the GLSL ES 3.2
              * spec:
              *
@@ -4735,7 +4949,7 @@
             validate_array_dimensions(decl_type, state, &loc);
          }
 
-         if (decl_type->base_type == GLSL_TYPE_ATOMIC_UINT) {
+         if (decl_type->is_atomic_uint()) {
             /* Empty atomic counter declarations are allowed and useful
              * to set the default offset qualifier.
              */
@@ -4938,6 +5152,14 @@
              *    vectors, matrices, signed and unsigned integers and integer
              *    vectors. Vertex shader inputs cannot be arrays or
              *    structures."
+             *
+             * From section 4.3.4 of the ARB_bindless_texture spec:
+             *
+             *    "(modify third paragraph of the section to allow sampler and
+             *    image types) ...  Vertex shader inputs can only be float,
+             *    single-precision floating-point scalars, single-precision
+             *    floating-point vectors, matrices, signed and unsigned
+             *    integers and integer vectors, sampler and image types."
              */
             const glsl_type *check_type = var->type->without_array();
 
@@ -4952,7 +5174,13 @@
                if (state->is_version(120, 300))
                   break;
             case GLSL_TYPE_DOUBLE:
-               if (check_type->base_type == GLSL_TYPE_DOUBLE && (state->is_version(410, 0) || state->ARB_vertex_attrib_64bit_enable))
+               if (check_type->is_double() && (state->is_version(410, 0) || state->ARB_vertex_attrib_64bit_enable))
+                  break;
+            case GLSL_TYPE_SAMPLER:
+               if (check_type->is_sampler() && state->has_bindless())
+                  break;
+            case GLSL_TYPE_IMAGE:
+               if (check_type->is_image() && state->has_bindless())
                   break;
             /* FALLTHROUGH */
             default:
@@ -5039,21 +5267,6 @@
 
          /* From section 4.3.6 (Output variables) of the GLSL 4.40 spec:
           *
-          *     It is a compile-time error to declare a vertex, tessellation
-          *     evaluation, tessellation control, or geometry shader output
-          *     that contains any of the following:
-          *
-          *     * A Boolean type (bool, bvec2 ...)
-          *     * An opaque type
-          */
-         if (check_type->is_boolean() || check_type->contains_opaque())
-            _mesa_glsl_error(&loc, state,
-                             "%s shader output cannot have type %s",
-                             _mesa_shader_stage_to_string(state->stage),
-                             check_type->name);
-
-         /* From section 4.3.6 (Output variables) of the GLSL 4.40 spec:
-          *
           *     It is a compile-time error to declare a fragment shader output
           *     that contains any of the following:
           *
@@ -5207,11 +5420,23 @@
        *
        *    "[Opaque types] can only be declared as function
        *     parameters or uniform-qualified variables."
+       *
+       * From section 4.1.7 of the ARB_bindless_texture spec:
+       *
+       *    "Samplers may be declared as shader inputs and outputs, as uniform
+       *     variables, as temporary variables, and as function parameters."
+       *
+       * From section 4.1.X of the ARB_bindless_texture spec:
+       *
+       *    "Images may be declared as shader inputs and outputs, as uniform
+       *     variables, as temporary variables, and as function parameters."
        */
-      if (var_type->contains_opaque() &&
-          !this->type->qualifier.flags.q.uniform) {
+      if (!this->type->qualifier.flags.q.uniform &&
+          (var_type->contains_atomic() ||
+           (!state->has_bindless() && var_type->contains_opaque()))) {
          _mesa_glsl_error(&loc, state,
-                          "opaque variables must be declared uniform");
+                          "%s variables must be declared uniform",
+                          state->has_bindless() ? "atomic" : "opaque");
       }
 
       /* Process the initializer and add its instructions to a temporary
@@ -5441,11 +5666,23 @@
     *   "Opaque variables cannot be treated as l-values; hence cannot
     *    be used as out or inout function parameters, nor can they be
     *    assigned into."
+    *
+    * From section 4.1.7 of the ARB_bindless_texture spec:
+    *
+    *   "Samplers can be used as l-values, so can be assigned into and used
+    *    as "out" and "inout" function parameters."
+    *
+    * From section 4.1.X of the ARB_bindless_texture spec:
+    *
+    *   "Images can be used as l-values, so can be assigned into and used as
+    *    "out" and "inout" function parameters."
     */
    if ((var->data.mode == ir_var_function_inout || var->data.mode == ir_var_function_out)
-       && type->contains_opaque()) {
+       && (type->contains_atomic() ||
+           (!state->has_bindless() && type->contains_opaque()))) {
       _mesa_glsl_error(&loc, state, "out and inout parameters cannot "
-                       "contain opaque variables");
+                       "contain %s variables",
+                       state->has_bindless() ? "atomic" : "opaque");
       type = glsl_type::error_type;
    }
 
@@ -5612,16 +5849,33 @@
                        "sized", name);
    }
 
+   /* From Section 6.1 (Function Definitions) of the GLSL 1.00 spec:
+    *
+    *     "Arrays are allowed as arguments, but not as the return type. [...]
+    *      The return type can also be a structure if the structure does not
+    *      contain an array."
+    */
+   if (state->language_version == 100 && return_type->contains_array()) {
+      YYLTYPE loc = this->get_location();
+      _mesa_glsl_error(& loc, state,
+                       "function `%s' return type contains an array", name);
+   }
+
    /* From section 4.1.7 of the GLSL 4.40 spec:
     *
     *    "[Opaque types] can only be declared as function parameters
     *     or uniform-qualified variables."
+    *
+    * The ARB_bindless_texture spec doesn't clearly state this, but as it says
+    * "Replace Section 4.1.7 (Samplers), p. 25" and, "Replace Section 4.1.X,
+    * (Images)", this should be allowed.
     */
-   if (return_type->contains_opaque()) {
+   if (return_type->contains_atomic() ||
+       (!state->has_bindless() && return_type->contains_opaque())) {
       YYLTYPE loc = this->get_location();
       _mesa_glsl_error(&loc, state,
-                       "function `%s' return type can't contain an opaque type",
-                       name);
+                       "function `%s' return type can't contain an %s type",
+                       name, state->has_bindless() ? "atomic" : "opaque");
    }
 
    /**/
@@ -5658,16 +5912,27 @@
     * "User code can overload the built-in functions but cannot redefine
     * them."
     */
-   if (state->es_shader && state->language_version >= 300) {
+   if (state->es_shader) {
       /* Local shader has no exact candidates; check the built-ins. */
       _mesa_glsl_initialize_builtin_functions();
-      if (_mesa_glsl_has_builtin_function(state, name)) {
+      if (state->language_version >= 300 &&
+          _mesa_glsl_has_builtin_function(state, name)) {
          YYLTYPE loc = this->get_location();
          _mesa_glsl_error(& loc, state,
                           "A shader cannot redefine or overload built-in "
                           "function `%s' in GLSL ES 3.00", name);
          return NULL;
       }
+
+      if (state->language_version == 100) {
+         ir_function_signature *sig =
+            _mesa_glsl_find_builtin_function(state, name, &hir_parameters);
+         if (sig && sig->is_builtin()) {
+            _mesa_glsl_error(& loc, state,
+                             "A shader cannot redefine built-in "
+                             "function `%s' in GLSL ES 1.00", name);
+         }
+      }
    }
 
    /* Verify that this function's signature either doesn't match a previously
@@ -5703,6 +5968,16 @@
                 */
                return NULL;
             }
+         } else if (state->language_version == 100 && !is_definition) {
+            /* From the GLSL 1.00 spec, section 4.2.7:
+             *
+             *     "A particular variable, structure or function declaration
+             *      may occur at most once within a scope with the exception
+             *      that a single function prototype plus the corresponding
+             *      function definition are allowed."
+             */
+            YYLTYPE loc = this->get_location();
+            _mesa_glsl_error(&loc, state, "function `%s' redeclared", name);
          }
       }
    }
@@ -6812,9 +7087,19 @@
       assert(decl_type);
 
       if (is_interface) {
-         if (decl_type->contains_opaque()) {
+         /* From section 4.3.7 of the ARB_bindless_texture spec:
+          *
+          *    "(remove the following bullet from the last list on p. 39,
+          *     thereby permitting sampler types in interface blocks; image
+          *     types are also permitted in blocks by this extension)"
+          *
+          *     * sampler types are not allowed
+          */
+         if (decl_type->contains_atomic() ||
+             (!state->has_bindless() && decl_type->contains_opaque())) {
             _mesa_glsl_error(&loc, state, "uniform/buffer in non-default "
-                             "interface block contains opaque variable");
+                             "interface block contains %s variable",
+                             state->has_bindless() ? "atomic" : "opaque");
          }
       } else {
          if (decl_type->contains_atomic()) {
@@ -6826,7 +7111,7 @@
             _mesa_glsl_error(&loc, state, "atomic counter in structure");
          }
 
-         if (decl_type->contains_image()) {
+         if (!state->has_bindless() && decl_type->contains_image()) {
             /* FINISHME: Same problem as with atomic counters.
              * FINISHME: Request clarification from Khronos and add
              * FINISHME: spec quotation here.
@@ -6879,6 +7164,9 @@
                           "to struct or interface block members");
       }
 
+      validate_memory_qualifier_for_type(state, &loc, qual, decl_type);
+      validate_image_format_qualifier_for_type(state, &loc, qual, decl_type);
+
       /* From Section 4.4.2.3 (Geometry Outputs) of the GLSL 4.50 spec:
        *
        *   "A block member may be declared with a stream identifier, but
@@ -7121,33 +7409,56 @@
                    || fields[i].matrix_layout == GLSL_MATRIX_LAYOUT_COLUMN_MAJOR);
          }
 
-         /* Image qualifiers are allowed on buffer variables, which can only
-          * be defined inside shader storage buffer objects
+         /* Memory qualifiers are allowed on buffer and image variables, while
+          * the format qualifier is only accepted for images.
           */
-         if (layout && var_mode == ir_var_shader_storage) {
+         if (var_mode == ir_var_shader_storage ||
+             field_type->without_array()->is_image()) {
             /* For readonly and writeonly qualifiers the field definition,
              * if set, overwrites the layout qualifier.
              */
             if (qual->flags.q.read_only) {
-               fields[i].image_read_only = true;
-               fields[i].image_write_only = false;
+               fields[i].memory_read_only = true;
+               fields[i].memory_write_only = false;
             } else if (qual->flags.q.write_only) {
-               fields[i].image_read_only = false;
-               fields[i].image_write_only = true;
+               fields[i].memory_read_only = false;
+               fields[i].memory_write_only = true;
             } else {
-               fields[i].image_read_only = layout->flags.q.read_only;
-               fields[i].image_write_only = layout->flags.q.write_only;
+               fields[i].memory_read_only =
+                  layout ? layout->flags.q.read_only : 0;
+               fields[i].memory_write_only =
+                  layout ? layout->flags.q.write_only : 0;
             }
 
             /* For other qualifiers, we set the flag if either the layout
              * qualifier or the field qualifier are set
              */
-            fields[i].image_coherent = qual->flags.q.coherent ||
-                                        layout->flags.q.coherent;
-            fields[i].image_volatile = qual->flags.q._volatile ||
-                                        layout->flags.q._volatile;
-            fields[i].image_restrict = qual->flags.q.restrict_flag ||
-                                        layout->flags.q.restrict_flag;
+            fields[i].memory_coherent = qual->flags.q.coherent ||
+                                        (layout && layout->flags.q.coherent);
+            fields[i].memory_volatile = qual->flags.q._volatile ||
+                                        (layout && layout->flags.q._volatile);
+            fields[i].memory_restrict = qual->flags.q.restrict_flag ||
+                                        (layout && layout->flags.q.restrict_flag);
+
+            if (field_type->without_array()->is_image()) {
+               if (qual->flags.q.explicit_image_format) {
+                  if (qual->image_base_type !=
+                      field_type->without_array()->sampled_type) {
+                     _mesa_glsl_error(&loc, state, "format qualifier doesn't "
+                                      "match the base data type of the image");
+                  }
+
+                  fields[i].image_format = qual->image_format;
+               } else {
+                  if (!qual->flags.q.write_only) {
+                     _mesa_glsl_error(&loc, state, "image not qualified with "
+                                      "`writeonly' must have a format layout "
+                                      "qualifier");
+                  }
+
+                  fields[i].image_format = GL_NONE;
+               }
+            }
          }
 
          i++;
@@ -7196,13 +7507,12 @@
 
    validate_identifier(this->name, loc, state);
 
-   const glsl_type *t =
-      glsl_type::get_record_instance(fields, decl_count, this->name);
+   type = glsl_type::get_record_instance(fields, decl_count, this->name);
 
-   if (!state->symbols->add_type(name, t)) {
+   if (!type->is_anonymous() && !state->symbols->add_type(name, type)) {
       const glsl_type *match = state->symbols->get_type(name);
       /* allow struct matching for desktop GL - older UE4 does this */
-      if (match != NULL && state->is_version(130, 0) && match->record_compare(t, false))
+      if (match != NULL && state->is_version(130, 0) && match->record_compare(type, false))
          _mesa_glsl_warning(& loc, state, "struct `%s' previously defined", name);
       else
          _mesa_glsl_error(& loc, state, "struct `%s' previously defined", name);
@@ -7211,7 +7521,7 @@
                                      const glsl_type *,
                                      state->num_user_structures + 1);
       if (s != NULL) {
-         s[state->num_user_structures] = t;
+         s[state->num_user_structures] = type;
          state->user_structures = s;
          state->num_user_structures++;
       }
@@ -7271,11 +7581,11 @@
 static void
 apply_memory_qualifiers(ir_variable *var, glsl_struct_field field)
 {
-   var->data.image_read_only = field.image_read_only;
-   var->data.image_write_only = field.image_write_only;
-   var->data.image_coherent = field.image_coherent;
-   var->data.image_volatile = field.image_volatile;
-   var->data.image_restrict = field.image_restrict;
+   var->data.memory_read_only = field.memory_read_only;
+   var->data.memory_write_only = field.memory_write_only;
+   var->data.memory_coherent = field.memory_coherent;
+   var->data.memory_volatile = field.memory_volatile;
+   var->data.memory_restrict = field.memory_restrict;
 }
 
 ir_rvalue *
@@ -7367,21 +7677,17 @@
                                "invalid qualifier for block",
                                this->block_name);
 
-   /* The ast_interface_block has a list of ast_declarator_lists.  We
-    * need to turn those into ir_variables with an association
-    * with this uniform block.
-    */
    enum glsl_interface_packing packing;
-   if (this->layout.flags.q.shared) {
-      packing = GLSL_INTERFACE_PACKING_SHARED;
+   if (this->layout.flags.q.std140) {
+      packing = GLSL_INTERFACE_PACKING_STD140;
    } else if (this->layout.flags.q.packed) {
       packing = GLSL_INTERFACE_PACKING_PACKED;
    } else if (this->layout.flags.q.std430) {
       packing = GLSL_INTERFACE_PACKING_STD430;
    } else {
-      /* The default layout is std140.
+      /* The default layout is shared.
        */
-      packing = GLSL_INTERFACE_PACKING_STD140;
+      packing = GLSL_INTERFACE_PACKING_SHARED;
    }
 
    ir_variable_mode var_mode;
@@ -7474,7 +7780,7 @@
          return NULL;
       } else {
          if (expl_align == 0 || expl_align & (expl_align - 1)) {
-            _mesa_glsl_error(&loc, state, "align layout qualifier in not a "
+            _mesa_glsl_error(&loc, state, "align layout qualifier is not a "
                              "power of 2.");
             return NULL;
          }
diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index d302fc4..63c026a 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -69,6 +69,10 @@
           || this->flags.q.column_major
           || this->flags.q.row_major
           || this->flags.q.packed
+          || this->flags.q.bindless_sampler
+          || this->flags.q.bindless_image
+          || this->flags.q.bound_sampler
+          || this->flags.q.bound_image
           || this->flags.q.explicit_align
           || this->flags.q.explicit_component
           || this->flags.q.explicit_location
@@ -181,6 +185,33 @@
    return true;
 }
 
+static void
+merge_bindless_qualifier(YYLTYPE *loc,
+                         _mesa_glsl_parse_state *state,
+                         const ast_type_qualifier &qualifier,
+                         const ast_type_qualifier &new_qualifier)
+{
+   if (state->default_uniform_qualifier->flags.q.bindless_sampler) {
+      state->bindless_sampler_specified = true;
+      state->default_uniform_qualifier->flags.q.bindless_sampler = false;
+   }
+
+   if (state->default_uniform_qualifier->flags.q.bindless_image) {
+      state->bindless_image_specified = true;
+      state->default_uniform_qualifier->flags.q.bindless_image = false;
+   }
+
+   if (state->default_uniform_qualifier->flags.q.bound_sampler) {
+      state->bound_sampler_specified = true;
+      state->default_uniform_qualifier->flags.q.bound_sampler = false;
+   }
+
+   if (state->default_uniform_qualifier->flags.q.bound_image) {
+      state->bound_image_specified = true;
+      state->default_uniform_qualifier->flags.q.bound_image = false;
+   }
+}
+
 /**
  * This function merges duplicate layout identifiers.
  *
@@ -243,6 +274,16 @@
    input_layout_mask.flags.q.sample = 1;
    input_layout_mask.flags.q.smooth = 1;
 
+   if (state->has_bindless()) {
+      /* Allow to use image qualifiers with shader inputs/outputs. */
+      input_layout_mask.flags.q.coherent = 1;
+      input_layout_mask.flags.q._volatile = 1;
+      input_layout_mask.flags.q.restrict_flag = 1;
+      input_layout_mask.flags.q.read_only = 1;
+      input_layout_mask.flags.q.write_only = 1;
+      input_layout_mask.flags.q.explicit_image_format = 1;
+   }
+
    /* Uniform block layout qualifiers get to overwrite each
     * other (rightmost having priority), while all other
     * qualifiers currently don't allow duplicates.
@@ -393,6 +434,18 @@
    if (q.flags.q.local_size_variable)
       this->flags.q.local_size_variable = true;
 
+   if (q.flags.q.bindless_sampler)
+      this->flags.q.bindless_sampler = true;
+
+   if (q.flags.q.bindless_image)
+      this->flags.q.bindless_image = true;
+
+   if (q.flags.q.bound_sampler)
+      this->flags.q.bound_sampler = true;
+
+   if (q.flags.q.bound_image)
+      this->flags.q.bound_image = true;
+
    this->flags.i |= q.flags.i;
 
    if (this->flags.q.in &&
@@ -427,6 +480,12 @@
       this->image_base_type = q.image_base_type;
    }
 
+   if (q.flags.q.bindless_sampler ||
+       q.flags.q.bindless_image ||
+       q.flags.q.bound_sampler ||
+       q.flags.q.bound_image)
+      merge_bindless_qualifier(loc, state, *this, q);
+
    return r;
 }
 
@@ -778,6 +837,10 @@
                     bad.flags.q.subroutine ? " subroutine" : "",
                     bad.flags.q.blend_support ? " blend_support" : "",
                     bad.flags.q.inner_coverage ? " inner_coverage" : "",
+                    bad.flags.q.bindless_sampler ? " bindless_sampler" : "",
+                    bad.flags.q.bindless_image ? " bindless_image" : "",
+                    bad.flags.q.bound_sampler ? " bound_sampler" : "",
+                    bad.flags.q.bound_image ? " bound_image" : "",
                     bad.flags.q.post_depth_coverage ? " post_depth_coverage" : "");
    return false;
 }
diff --git a/src/compiler/glsl/blob.c b/src/compiler/glsl/blob.c
index 769ebf1..65e1376 100644
--- a/src/compiler/glsl/blob.c
+++ b/src/compiler/glsl/blob.c
@@ -26,6 +26,14 @@
 #include "main/macros.h"
 #include "blob.h"
 
+#ifdef HAVE_VALGRIND
+#include <valgrind.h>
+#include <memcheck.h>
+#define VG(x) x
+#else
+#define VG(x)
+#endif
+
 #define BLOB_INITIAL_SIZE 4096
 
 /* Ensure that \blob will be able to fit an additional object of size
@@ -38,6 +46,9 @@
    size_t to_allocate;
    uint8_t *new_data;
 
+   if (blob->out_of_memory)
+      return false;
+
    if (blob->size + additional <= blob->allocated)
       return true;
 
@@ -49,8 +60,10 @@
    to_allocate = MAX2(to_allocate, blob->allocated + additional);
 
    new_data = realloc(blob->data, to_allocate);
-   if (new_data == NULL)
+   if (new_data == NULL) {
+      blob->out_of_memory = true;
       return false;
+   }
 
    blob->data = new_data;
    blob->allocated = to_allocate;
@@ -96,6 +109,7 @@
    blob->data = NULL;
    blob->allocated = 0;
    blob->size = 0;
+   blob->out_of_memory = false;
 
    return blob;
 }
@@ -107,9 +121,11 @@
                      size_t to_write)
 {
    /* Detect an attempt to overwrite data out of bounds. */
-   if (offset < 0 || blob->size - offset < to_write)
+   if (offset > blob->size || to_write > blob->size - offset)
       return false;
 
+   VG(VALGRIND_CHECK_MEM_IS_DEFINED(bytes, to_write));
+
    memcpy(blob->data + offset, bytes, to_write);
 
    return true;
@@ -121,6 +137,8 @@
    if (! grow_to_fit(blob, to_write))
        return false;
 
+   VG(VALGRIND_CHECK_MEM_IS_DEFINED(bytes, to_write));
+
    memcpy(blob->data + blob->size, bytes, to_write);
    blob->size += to_write;
 
@@ -195,6 +213,9 @@
 static bool
 ensure_can_read(struct blob_reader *blob, size_t size)
 {
+   if (blob->overrun)
+      return false;
+
    if (blob->current < blob->end && blob->end - blob->current >= size)
       return true;
 
diff --git a/src/compiler/glsl/blob.h b/src/compiler/glsl/blob.h
index 940c81e..4cbbb01 100644
--- a/src/compiler/glsl/blob.h
+++ b/src/compiler/glsl/blob.h
@@ -55,6 +55,12 @@
 
    /** The number of bytes that have actual data written to them. */
    size_t size;
+
+   /**
+    * True if we've ever failed to realloc or if we go past the end of a fixed
+    * allocation blob.
+    */
+   bool out_of_memory;
 };
 
 /* When done reading, the caller can ensure that everything was consumed by
diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
index 92e7ea0..84833bd 100644
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -948,8 +948,11 @@
    ir_function_signature *_memory_barrier(const char *intrinsic_name,
                                           builtin_available_predicate avail);
 
+   ir_function_signature *_ballot_intrinsic();
    ir_function_signature *_ballot();
+   ir_function_signature *_read_first_invocation_intrinsic(const glsl_type *type);
    ir_function_signature *_read_first_invocation(const glsl_type *type);
+   ir_function_signature *_read_invocation_intrinsic(const glsl_type *type);
    ir_function_signature *_read_invocation(const glsl_type *type);
 
    ir_function_signature *_shader_clock_intrinsic(builtin_available_predicate avail,
@@ -957,7 +960,9 @@
    ir_function_signature *_shader_clock(builtin_available_predicate avail,
                                         const glsl_type *type);
 
-   ir_function_signature *_vote(enum ir_expression_operation opcode);
+   ir_function_signature *_vote_intrinsic(builtin_available_predicate avail,
+                                          enum ir_intrinsic_id id);
+   ir_function_signature *_vote(const char *intrinsic_name);
 
 #undef B0
 #undef B1
@@ -1190,6 +1195,53 @@
                 _shader_clock_intrinsic(shader_clock,
                                         glsl_type::uvec2_type),
                 NULL);
+
+   add_function("__intrinsic_vote_all",
+                _vote_intrinsic(vote, ir_intrinsic_vote_all),
+                NULL);
+   add_function("__intrinsic_vote_any",
+                _vote_intrinsic(vote, ir_intrinsic_vote_any),
+                NULL);
+   add_function("__intrinsic_vote_eq",
+                _vote_intrinsic(vote, ir_intrinsic_vote_eq),
+                NULL);
+
+   add_function("__intrinsic_ballot", _ballot_intrinsic(), NULL);
+
+   add_function("__intrinsic_read_invocation",
+                _read_invocation_intrinsic(glsl_type::float_type),
+                _read_invocation_intrinsic(glsl_type::vec2_type),
+                _read_invocation_intrinsic(glsl_type::vec3_type),
+                _read_invocation_intrinsic(glsl_type::vec4_type),
+
+                _read_invocation_intrinsic(glsl_type::int_type),
+                _read_invocation_intrinsic(glsl_type::ivec2_type),
+                _read_invocation_intrinsic(glsl_type::ivec3_type),
+                _read_invocation_intrinsic(glsl_type::ivec4_type),
+
+                _read_invocation_intrinsic(glsl_type::uint_type),
+                _read_invocation_intrinsic(glsl_type::uvec2_type),
+                _read_invocation_intrinsic(glsl_type::uvec3_type),
+                _read_invocation_intrinsic(glsl_type::uvec4_type),
+                NULL);
+
+   add_function("__intrinsic_read_first_invocation",
+                _read_first_invocation_intrinsic(glsl_type::float_type),
+                _read_first_invocation_intrinsic(glsl_type::vec2_type),
+                _read_first_invocation_intrinsic(glsl_type::vec3_type),
+                _read_first_invocation_intrinsic(glsl_type::vec4_type),
+
+                _read_first_invocation_intrinsic(glsl_type::int_type),
+                _read_first_invocation_intrinsic(glsl_type::ivec2_type),
+                _read_first_invocation_intrinsic(glsl_type::ivec3_type),
+                _read_first_invocation_intrinsic(glsl_type::ivec4_type),
+
+                _read_first_invocation_intrinsic(glsl_type::uint_type),
+                _read_first_invocation_intrinsic(glsl_type::uvec2_type),
+                _read_first_invocation_intrinsic(glsl_type::uvec3_type),
+                _read_first_invocation_intrinsic(glsl_type::uvec4_type),
+                NULL);
+
 }
 
 /**
@@ -3168,9 +3220,9 @@
                               glsl_type::uint64_t_type),
                 NULL);
 
-   add_function("anyInvocationARB", _vote(ir_unop_vote_any), NULL);
-   add_function("allInvocationsARB", _vote(ir_unop_vote_all), NULL);
-   add_function("allInvocationsEqualARB", _vote(ir_unop_vote_eq), NULL);
+   add_function("anyInvocationARB", _vote("__intrinsic_vote_any"), NULL);
+   add_function("allInvocationsARB", _vote("__intrinsic_vote_all"), NULL);
+   add_function("allInvocationsEqualARB", _vote("__intrinsic_vote_eq"), NULL);
 
    add_function("__builtin_idiv64",
                 generate_ir::idiv64(mem_ctx, integer_functions_supported),
@@ -3417,7 +3469,7 @@
    return new(mem_ctx) ir_constant(type, &data);
 }
 
-#define IMM_FP(type, val) (type->base_type == GLSL_TYPE_DOUBLE) ? imm(val) : imm((float)val)
+#define IMM_FP(type, val) (type->is_double()) ? imm(val) : imm((float)val)
 
 ir_dereference_variable *
 builtin_builder::var_ref(ir_variable *var)
@@ -3985,14 +4037,14 @@
    ir_variable *t = body.make_temp(x_type, "t");
    if (x_type->vector_elements == 1) {
       /* Both are floats */
-      if (edge_type->base_type == GLSL_TYPE_DOUBLE)
+      if (edge_type->is_double())
          body.emit(assign(t, f2d(b2f(gequal(x, edge)))));
       else
          body.emit(assign(t, b2f(gequal(x, edge))));
    } else if (edge_type->vector_elements == 1) {
       /* x is a vector but edge is a float */
       for (int i = 0; i < x_type->vector_elements; i++) {
-         if (edge_type->base_type == GLSL_TYPE_DOUBLE)
+         if (edge_type->is_double())
             body.emit(assign(t, f2d(b2f(gequal(swizzle(x, i, 1), edge))), 1 << i));
          else
             body.emit(assign(t, b2f(gequal(swizzle(x, i, 1), edge)), 1 << i));
@@ -4000,7 +4052,7 @@
    } else {
       /* Both are vectors */
       for (int i = 0; i < x_type->vector_elements; i++) {
-         if (edge_type->base_type == GLSL_TYPE_DOUBLE)
+         if (edge_type->is_double())
             body.emit(assign(t, f2d(b2f(gequal(swizzle(x, i, 1), swizzle(edge, i, 1)))),
                              1 << i));
          else
@@ -4452,7 +4504,7 @@
    ir_variable *c;
    ir_variable *r;
 
-   if (type->base_type == GLSL_TYPE_DOUBLE) {
+   if (type->is_double()) {
       r = in_var(glsl_type::dvec(type->matrix_columns), "r");
       c = in_var(glsl_type::dvec(type->vector_elements), "c");
    } else {
@@ -5475,7 +5527,7 @@
 ir_function_signature *
 builtin_builder::_ldexp(const glsl_type *x_type, const glsl_type *exp_type)
 {
-   return binop(x_type->base_type == GLSL_TYPE_DOUBLE ? fp64 : gpu_shader5_or_es31_or_integer_functions,
+   return binop(x_type->is_double() ? fp64 : gpu_shader5_or_es31_or_integer_functions,
                 ir_binop_ldexp, x_type, x_type, exp_type);
 }
 
@@ -5877,11 +5929,11 @@
     * accept everything that needs to be accepted, and reject cases
     * like loads from write-only or stores to read-only images.
     */
-   image->data.image_read_only = (flags & IMAGE_FUNCTION_READ_ONLY) != 0;
-   image->data.image_write_only = (flags & IMAGE_FUNCTION_WRITE_ONLY) != 0;
-   image->data.image_coherent = true;
-   image->data.image_volatile = true;
-   image->data.image_restrict = true;
+   image->data.memory_read_only = (flags & IMAGE_FUNCTION_READ_ONLY) != 0;
+   image->data.memory_write_only = (flags & IMAGE_FUNCTION_WRITE_ONLY) != 0;
+   image->data.memory_coherent = true;
+   image->data.memory_volatile = true;
+   image->data.memory_restrict = true;
 
    return sig;
 }
@@ -5917,11 +5969,11 @@
     * accept everything that needs to be accepted, and reject cases
     * like loads from write-only or stores to read-only images.
     */
-   image->data.image_read_only = true;
-   image->data.image_write_only = true;
-   image->data.image_coherent = true;
-   image->data.image_volatile = true;
-   image->data.image_restrict = true;
+   image->data.memory_read_only = true;
+   image->data.memory_write_only = true;
+   image->data.memory_coherent = true;
+   image->data.memory_volatile = true;
+   image->data.memory_restrict = true;
 
    return sig;
 }
@@ -5942,11 +5994,11 @@
     * accept everything that needs to be accepted, and reject cases
     * like loads from write-only or stores to read-only images.
     */
-   image->data.image_read_only = true;
-   image->data.image_write_only = true;
-   image->data.image_coherent = true;
-   image->data.image_volatile = true;
-   image->data.image_restrict = true;
+   image->data.memory_read_only = true;
+   image->data.memory_write_only = true;
+   image->data.memory_coherent = true;
+   image->data.memory_volatile = true;
+   image->data.memory_restrict = true;
 
    return sig;
 }
@@ -6003,12 +6055,34 @@
 }
 
 ir_function_signature *
+builtin_builder::_ballot_intrinsic()
+{
+   ir_variable *value = in_var(glsl_type::bool_type, "value");
+   MAKE_INTRINSIC(glsl_type::uint64_t_type, ir_intrinsic_ballot, shader_ballot,
+                  1, value);
+   return sig;
+}
+
+ir_function_signature *
 builtin_builder::_ballot()
 {
    ir_variable *value = in_var(glsl_type::bool_type, "value");
 
    MAKE_SIG(glsl_type::uint64_t_type, shader_ballot, 1, value);
-   body.emit(ret(expr(ir_unop_ballot, value)));
+   ir_variable *retval = body.make_temp(glsl_type::uint64_t_type, "retval");
+
+   body.emit(call(shader->symbols->get_function("__intrinsic_ballot"),
+                  retval, sig->parameters));
+   body.emit(ret(retval));
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_read_first_invocation_intrinsic(const glsl_type *type)
+{
+   ir_variable *value = in_var(type, "value");
+   MAKE_INTRINSIC(type, ir_intrinsic_read_first_invocation, shader_ballot,
+                  1, value);
    return sig;
 }
 
@@ -6018,7 +6092,21 @@
    ir_variable *value = in_var(type, "value");
 
    MAKE_SIG(type, shader_ballot, 1, value);
-   body.emit(ret(expr(ir_unop_read_first_invocation, value)));
+   ir_variable *retval = body.make_temp(type, "retval");
+
+   body.emit(call(shader->symbols->get_function("__intrinsic_read_first_invocation"),
+                  retval, sig->parameters));
+   body.emit(ret(retval));
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_read_invocation_intrinsic(const glsl_type *type)
+{
+   ir_variable *value = in_var(type, "value");
+   ir_variable *invocation = in_var(glsl_type::uint_type, "invocation");
+   MAKE_INTRINSIC(type, ir_intrinsic_read_invocation, shader_ballot,
+                  2, value, invocation);
    return sig;
 }
 
@@ -6029,7 +6117,11 @@
    ir_variable *invocation = in_var(glsl_type::uint_type, "invocation");
 
    MAKE_SIG(type, shader_ballot, 2, value, invocation);
-   body.emit(ret(expr(ir_binop_read_invocation, value, invocation)));
+   ir_variable *retval = body.make_temp(type, "retval");
+
+   body.emit(call(shader->symbols->get_function("__intrinsic_read_invocation"),
+                  retval, sig->parameters));
+   body.emit(ret(retval));
    return sig;
 }
 
@@ -6062,12 +6154,26 @@
 }
 
 ir_function_signature *
-builtin_builder::_vote(enum ir_expression_operation opcode)
+builtin_builder::_vote_intrinsic(builtin_available_predicate avail,
+                                 enum ir_intrinsic_id id)
+{
+   ir_variable *value = in_var(glsl_type::bool_type, "value");
+   MAKE_INTRINSIC(glsl_type::bool_type, id, avail, 1, value);
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_vote(const char *intrinsic_name)
 {
    ir_variable *value = in_var(glsl_type::bool_type, "value");
 
    MAKE_SIG(glsl_type::bool_type, vote, 1, value);
-   body.emit(ret(expr(opcode, value)));
+
+   ir_variable *retval = body.make_temp(glsl_type::bool_type, "retval");
+
+   body.emit(call(shader->symbols->get_function(intrinsic_name),
+                  retval, sig->parameters));
+   body.emit(ret(retval));
    return sig;
 }
 
diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index 83a2f02..19d427e 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -336,11 +336,12 @@
    this->fields[this->num_fields].sample = 0;
    this->fields[this->num_fields].patch = 0;
    this->fields[this->num_fields].precision = GLSL_PRECISION_NONE;
-   this->fields[this->num_fields].image_read_only = 0;
-   this->fields[this->num_fields].image_write_only = 0;
-   this->fields[this->num_fields].image_coherent = 0;
-   this->fields[this->num_fields].image_volatile = 0;
-   this->fields[this->num_fields].image_restrict = 0;
+   this->fields[this->num_fields].memory_read_only = 0;
+   this->fields[this->num_fields].memory_write_only = 0;
+   this->fields[this->num_fields].memory_coherent = 0;
+   this->fields[this->num_fields].memory_volatile = 0;
+   this->fields[this->num_fields].memory_restrict = 0;
+   this->fields[this->num_fields].image_format = 0;
    this->fields[this->num_fields].explicit_xfb_buffer = 0;
    this->fields[this->num_fields].xfb_buffer = -1;
    this->fields[this->num_fields].xfb_stride = -1;
@@ -1325,6 +1326,8 @@
    case MESA_SHADER_COMPUTE:
       /* Compute shaders don't have varyings. */
       break;
+   default:
+      break;
    }
 }
 
@@ -1462,6 +1465,8 @@
    case MESA_SHADER_COMPUTE:
       gen.generate_cs_special_vars();
       break;
+   default:
+      break;
    }
 }
 
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index e113253..f1719f9 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -284,30 +284,44 @@
                  *    It is an error to undefine or to redefine a built-in
                  *    (pre-defined) macro name.
                  *
-                 * The GLSL ES 1.00 spec does not contain this text.
+                 * The GLSL ES 1.00 spec does not contain this text, but
+                 * dEQP's preprocess test in GLES2 checks for it.
                  *
-                 * Section 3.3 (Preprocessor) of the GLSL 1.30 spec says:
+                 * Section 3.3 (Preprocessor) revision 7, of the GLSL 4.50
+                 * spec says:
                  *
-                 *    #define and #undef functionality are defined as is
-                 *    standard for C++ preprocessors for macro definitions
-                 *    both with and without macro parameters.
+                 *    By convention, all macro names containing two consecutive
+                 *    underscores ( __ ) are reserved for use by underlying
+                 *    software layers. Defining or undefining such a name
+                 *    in a shader does not itself result in an error, but may
+                 *    result in unintended behaviors that stem from having
+                 *    multiple definitions of the same name. All macro names
+                 *    prefixed with "GL_" (...) are also reserved, and defining
+                 *    such a name results in a compile-time error.
                  *
-                 * At least as far as I can tell GCC allow '#undef __FILE__'.
-                 * Furthermore, there are desktop OpenGL conformance tests
-                 * that expect '#undef __VERSION__' and '#undef
-                 * GL_core_profile' to work.
-                 *
-                 * Only disallow #undef of pre-defined macros on GLSL ES >=
-                 * 3.00 shaders.
+                 * The code below implements the same checks as GLSLang.
                  */
-		if (parser->is_gles &&
-                    parser->version >= 300 &&
-                    (strcmp("__LINE__", $3) == 0
-                     || strcmp("__FILE__", $3) == 0
-                     || strcmp("__VERSION__", $3) == 0
-                     || strncmp("GL_", $3, 3) == 0))
+		if (strncmp("GL_", $3, 3) == 0)
 			glcpp_error(& @1, parser, "Built-in (pre-defined)"
-				    " macro names cannot be undefined.");
+				    " names beginning with GL_ cannot be undefined.");
+		else if (strstr($3, "__") != NULL) {
+			if (parser->is_gles
+			    && parser->version >= 300
+			    && (strcmp("__LINE__", $3) == 0
+				|| strcmp("__FILE__", $3) == 0
+				|| strcmp("__VERSION__", $3) == 0)) {
+				glcpp_error(& @1, parser, "Built-in (pre-defined)"
+					    " names cannot be undefined.");
+			} else if (parser->is_gles && parser->version <= 300) {
+				glcpp_error(& @1, parser,
+					    " names containing consecutive underscores"
+					    " are reserved.");
+			} else {
+				glcpp_warning(& @1, parser,
+					      " names containing consecutive underscores"
+					      " are reserved.");
+			}
+		}
 
 		entry = _mesa_hash_table_search (parser->defines, $3);
 		if (entry) {
@@ -1832,11 +1846,15 @@
 
    /* Special handling for __LINE__ and __FILE__, (not through
     * the hash table). */
-   if (strcmp(identifier, "__LINE__") == 0)
-      return _token_list_create_with_one_integer(parser, node->token->location.first_line);
+   if (*identifier == '_') {
+      if (strcmp(identifier, "__LINE__") == 0)
+         return _token_list_create_with_one_integer(parser,
+                                                    node->token->location.first_line);
 
-   if (strcmp(identifier, "__FILE__") == 0)
-      return _token_list_create_with_one_integer(parser, node->token->location.source);
+      if (strcmp(identifier, "__FILE__") == 0)
+         return _token_list_create_with_one_integer(parser,
+                                                    node->token->location.source);
+   }
 
    /* Look up this identifier in the hash table. */
    entry = _mesa_hash_table_search(parser->defines, identifier);
diff --git a/src/compiler/glsl/glcpp/pp.c b/src/compiler/glsl/glcpp/pp.c
index c526f37..96125f2 100644
--- a/src/compiler/glsl/glcpp/pp.c
+++ b/src/compiler/glsl/glcpp/pp.c
@@ -117,6 +117,12 @@
         char newline_separator[3];
 	int collapsed_newlines = 0;
 
+	backslash = strchr(shader, '\\');
+
+	/* No line continuations were found in this shader, our job is done */
+	if (backslash == NULL)
+		return (char *) shader;
+
 	search_start = shader;
 
 	/* Determine what flavor of newlines this shader is using. GLSL
@@ -157,8 +163,6 @@
 	}
 
 	while (true) {
-		backslash = strchr(search_start, '\\');
-
 		/* If we have previously collapsed any line-continuations,
 		 * then we want to insert additional newlines at the next
 		 * occurrence of a newline character to avoid changing any
@@ -204,6 +208,8 @@
 			shader = skip_newline (backslash + 1);
 			search_start = shader;
 		}
+
+		backslash = strchr(search_start, '\\');
 	}
 
 	ralloc_strcat(&clean, shader);
diff --git a/src/compiler/glsl/glcpp/tests/120-undef-builtin.c.expected b/src/compiler/glsl/glcpp/tests/120-undef-builtin.c.expected
index 498dc0f..623438f 100644
--- a/src/compiler/glsl/glcpp/tests/120-undef-builtin.c.expected
+++ b/src/compiler/glsl/glcpp/tests/120-undef-builtin.c.expected
@@ -1,6 +1,6 @@
-0:2(1): preprocessor error: Built-in (pre-defined) macro names cannot be undefined.
-0:3(1): preprocessor error: Built-in (pre-defined) macro names cannot be undefined.
-0:4(1): preprocessor error: Built-in (pre-defined) macro names cannot be undefined.
+0:2(1): preprocessor error: Built-in (pre-defined) names cannot be undefined.
+0:3(1): preprocessor error: Built-in (pre-defined) names cannot be undefined.
+0:4(1): preprocessor error: Built-in (pre-defined) names cannot be undefined.
 #version 300 es
 
 
diff --git a/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c.expected b/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c.expected
index cd0071f..f5517da 100644
--- a/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c.expected
+++ b/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c.expected
@@ -1,3 +1,6 @@
+0:2(1): preprocessor warning:  names containing consecutive underscores are reserved.
+0:3(1): preprocessor warning:  names containing consecutive underscores are reserved.
+0:4(1): preprocessor warning:  names containing consecutive underscores are reserved.
 #version 110
 
 
diff --git a/src/compiler/glsl/glsl_parser.yy b/src/compiler/glsl/glsl_parser.yy
index e703073..7b93d34 100644
--- a/src/compiler/glsl/glsl_parser.yy
+++ b/src/compiler/glsl/glsl_parser.yy
@@ -1589,6 +1589,27 @@
          }
       }
 
+      /* Layout qualifiers for ARB_bindless_texture. */
+      if (!$$.flags.i) {
+         if (match_layout_qualifier($1, "bindless_sampler", state) == 0)
+            $$.flags.q.bindless_sampler = 1;
+         if (match_layout_qualifier($1, "bound_sampler", state) == 0)
+            $$.flags.q.bound_sampler = 1;
+
+         if (state->has_shader_image_load_store()) {
+            if (match_layout_qualifier($1, "bindless_image", state) == 0)
+               $$.flags.q.bindless_image = 1;
+            if (match_layout_qualifier($1, "bound_image", state) == 0)
+               $$.flags.q.bound_image = 1;
+         }
+
+         if ($$.flags.i && !state->has_bindless()) {
+            _mesa_glsl_error(& @1, state,
+                             "qualifier `%s` requires "
+                             "ARB_bindless_texture", $1);
+         }
+      }
+
       if (!$$.flags.i) {
          _mesa_glsl_error(& @1, state, "unrecognized layout identifier "
                           "`%s'", $1);
@@ -2388,10 +2409,29 @@
       ast_fully_specified_type *const type = $1;
       type->set_location(@1);
 
-      if (type->qualifier.flags.i != 0)
-         _mesa_glsl_error(&@1, state,
-			  "only precision qualifiers may be applied to "
-			  "structure members");
+      if (state->has_bindless()) {
+         ast_type_qualifier input_layout_mask;
+
+         /* Allow to declare qualifiers for images. */
+         input_layout_mask.flags.i = 0;
+         input_layout_mask.flags.q.coherent = 1;
+         input_layout_mask.flags.q._volatile = 1;
+         input_layout_mask.flags.q.restrict_flag = 1;
+         input_layout_mask.flags.q.read_only = 1;
+         input_layout_mask.flags.q.write_only = 1;
+         input_layout_mask.flags.q.explicit_image_format = 1;
+
+         if ((type->qualifier.flags.i & ~input_layout_mask.flags.i) != 0) {
+            _mesa_glsl_error(&@1, state,
+                             "only precision and image qualifiers may be "
+                             "applied to structure members");
+         }
+      } else {
+         if (type->qualifier.flags.i != 0)
+            _mesa_glsl_error(&@1, state,
+                             "only precision qualifiers may be applied to "
+                             "structure members");
+      }
 
       $$ = new(ctx) ast_declarator_list(type);
       $$->set_location(@2);
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index eb12eff..68af6ba 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -305,8 +305,16 @@
           sizeof(this->atomic_counter_offsets));
    this->allow_extension_directive_midshader =
       ctx->Const.AllowGLSLExtensionDirectiveMidShader;
+   this->allow_builtin_variable_redeclaration =
+      ctx->Const.AllowGLSLBuiltinVariableRedeclaration;
 
    this->cs_input_local_size_variable_specified = false;
+
+   /* ARB_bindless_texture */
+   this->bindless_sampler_specified = false;
+   this->bindless_image_specified = false;
+   this->bound_sampler_specified = false;
+   this->bound_image_specified = false;
 }
 
 /**
@@ -607,6 +615,7 @@
    EXT(ARB_ES3_1_compatibility),
    EXT(ARB_ES3_2_compatibility),
    EXT(ARB_arrays_of_arrays),
+   EXT(ARB_bindless_texture),
    EXT(ARB_compute_shader),
    EXT(ARB_compute_variable_group_size),
    EXT(ARB_conservative_depth),
@@ -1667,20 +1676,19 @@
 					   ast_declarator_list *declarator_list)
 {
    if (identifier == NULL) {
-      static mtx_t mutex = _MTX_INITIALIZER_NP;
-      static unsigned anon_count = 1;
-      unsigned count;
-
-      mtx_lock(&mutex);
-      count = anon_count++;
-      mtx_unlock(&mutex);
-
-      identifier = linear_asprintf(lin_ctx, "#anon_struct_%04x", count);
+      /* All anonymous structs have the same name. This simplifies matching of
+       * globals whose type is an unnamed struct.
+       *
+       * It also avoids a memory leak when the same shader is compiled over and
+       * over again.
+       */
+      identifier = "#anon_struct";
    }
    name = identifier;
    this->declarations.push_degenerate_list_at_head(&declarator_list->link);
    is_declaration = true;
    layout = NULL;
+   type = NULL;
 }
 
 void ast_subroutine_list::print(void) const
@@ -1848,6 +1856,11 @@
       /* Nothing to do. */
       break;
    }
+
+   shader->bindless_sampler = state->bindless_sampler_specified;
+   shader->bindless_image = state->bindless_image_specified;
+   shader->bound_sampler = state->bound_sampler_specified;
+   shader->bound_image = state->bound_image_specified;
 }
 
 extern "C" {
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 6c3bc8a..be6c8dc 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -334,6 +334,11 @@
       return ARB_shader_image_load_store_enable || is_version(420, 310);
    }
 
+   bool has_bindless() const
+   {
+      return ARB_bindless_texture_enable;
+   }
+
    void process_version_directive(YYLTYPE *locp, int version,
                                   const char *ident);
 
@@ -423,6 +428,16 @@
    bool cs_input_local_size_variable_specified;
 
    /**
+    * True if a shader declares bindless_sampler/bindless_image, and
+    * respectively bound_sampler/bound_image at global scope as specified by
+    * ARB_bindless_texture.
+    */
+   bool bindless_sampler_specified;
+   bool bindless_image_specified;
+   bool bound_sampler_specified;
+   bool bound_image_specified;
+
+   /**
     * Output layout qualifiers from GLSL 1.50 (geometry shader controls),
     * and GLSL 4.00 (tessellation control shader).
     */
@@ -592,6 +607,8 @@
    bool ARB_ES3_2_compatibility_warn;
    bool ARB_arrays_of_arrays_enable;
    bool ARB_arrays_of_arrays_warn;
+   bool ARB_bindless_texture_enable;
+   bool ARB_bindless_texture_warn;
    bool ARB_compute_shader_enable;
    bool ARB_compute_shader_warn;
    bool ARB_compute_variable_group_size_enable;
@@ -825,6 +842,7 @@
    unsigned atomic_counter_offsets[MAX_COMBINED_ATOMIC_BUFFERS];
 
    bool allow_extension_directive_midshader;
+   bool allow_builtin_variable_redeclaration;
 
    /**
     * Known subroutine type declarations.
diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp
index 870d457..9f5e92a 100644
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@@ -133,13 +133,13 @@
 nir_remap_attributes(nir_shader *shader)
 {
    nir_foreach_variable(var, &shader->inputs) {
-      var->data.location += _mesa_bitcount_64(shader->info->double_inputs_read &
+      var->data.location += _mesa_bitcount_64(shader->info.double_inputs_read &
                                               BITFIELD64_MASK(var->data.location));
    }
 
    /* Once the remap is done, reset double_inputs_read, so later it will have
     * which location/slots are doubles */
-   shader->info->double_inputs_read = 0;
+   shader->info.double_inputs_read = 0;
 }
 
 nir_shader *
@@ -166,10 +166,10 @@
    if (shader->stage == MESA_SHADER_VERTEX)
       nir_remap_attributes(shader);
 
-   shader->info->name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
+   shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
    if (shader_prog->Label)
-      shader->info->label = ralloc_strdup(shader, shader_prog->Label);
-   shader->info->has_transform_feedback_varyings =
+      shader->info.label = ralloc_strdup(shader, shader_prog->Label);
+   shader->info.has_transform_feedback_varyings =
       shader_prog->TransformFeedback.NumVarying > 0;
 
    return shader;
@@ -308,6 +308,13 @@
 void
 nir_visitor::visit(ir_variable *ir)
 {
+   /* TODO: In future we should switch to using the NIR lowering pass but for
+    * now just ignore these variables as GLSL IR should have lowered them.
+    * Anything remaining are just dead vars that weren't cleaned up.
+    */
+   if (ir->data.mode == ir_var_shader_shared)
+      return;
+
    nir_variable *var = ralloc(shader, nir_variable);
    var->type = ir->type;
    var->name = ralloc_strdup(var, ir->name);
@@ -361,7 +368,7 @@
       if (glsl_type_is_dual_slot(glsl_without_array(var->type))) {
          for (uint i = 0; i < glsl_count_attribute_slots(var->type, true); i++) {
             uint64_t bitfield = BITFIELD64_BIT(var->data.location + i);
-            shader->info->double_inputs_read |= bitfield;
+            shader->info.double_inputs_read |= bitfield;
          }
       }
       break;
@@ -417,13 +424,14 @@
    }
 
    var->data.index = ir->data.index;
+   var->data.descriptor_set = 0;
    var->data.binding = ir->data.binding;
    var->data.offset = ir->data.offset;
-   var->data.image.read_only = ir->data.image_read_only;
-   var->data.image.write_only = ir->data.image_write_only;
-   var->data.image.coherent = ir->data.image_coherent;
-   var->data.image._volatile = ir->data.image_volatile;
-   var->data.image.restrict_flag = ir->data.image_restrict;
+   var->data.image.read_only = ir->data.memory_read_only;
+   var->data.image.write_only = ir->data.memory_write_only;
+   var->data.image.coherent = ir->data.memory_coherent;
+   var->data.image._volatile = ir->data.memory_volatile;
+   var->data.image.restrict_flag = ir->data.memory_restrict;
    var->data.image.format = ir->data.image_format;
    var->data.fb_fetch_output = ir->data.fb_fetch_output;
 
@@ -791,6 +799,24 @@
       case ir_intrinsic_shared_atomic_comp_swap:
          op = nir_intrinsic_shared_atomic_comp_swap;
          break;
+      case ir_intrinsic_vote_any:
+         op = nir_intrinsic_vote_any;
+         break;
+      case ir_intrinsic_vote_all:
+         op = nir_intrinsic_vote_all;
+         break;
+      case ir_intrinsic_vote_eq:
+         op = nir_intrinsic_vote_eq;
+         break;
+      case ir_intrinsic_ballot:
+         op = nir_intrinsic_ballot;
+         break;
+      case ir_intrinsic_read_invocation:
+         op = nir_intrinsic_read_invocation;
+         break;
+      case ir_intrinsic_read_first_invocation:
+         op = nir_intrinsic_read_first_invocation;
+         break;
       default:
          unreachable("not reached");
       }
@@ -985,7 +1011,7 @@
           * consider a true boolean to be ~0. Fix this up with a != 0
           * comparison.
           */
-         if (type->base_type == GLSL_TYPE_BOOL) {
+         if (type->is_boolean()) {
             nir_alu_instr *load_ssbo_compare =
                nir_alu_instr_create(shader, nir_op_ine);
             load_ssbo_compare->src[0].src.is_ssa = true;
@@ -1127,6 +1153,53 @@
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
+      case nir_intrinsic_vote_any:
+      case nir_intrinsic_vote_all:
+      case nir_intrinsic_vote_eq: {
+         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
+
+         ir_rvalue *value = (ir_rvalue *) ir->actual_parameters.get_head();
+         instr->src[0] = nir_src_for_ssa(evaluate_rvalue(value));
+
+         nir_builder_instr_insert(&b, &instr->instr);
+         break;
+      }
+
+      case nir_intrinsic_ballot: {
+         nir_ssa_dest_init(&instr->instr, &instr->dest,
+                           ir->return_deref->type->vector_elements, 64, NULL);
+
+         ir_rvalue *value = (ir_rvalue *) ir->actual_parameters.get_head();
+         instr->src[0] = nir_src_for_ssa(evaluate_rvalue(value));
+
+         nir_builder_instr_insert(&b, &instr->instr);
+         break;
+      }
+      case nir_intrinsic_read_invocation: {
+         nir_ssa_dest_init(&instr->instr, &instr->dest,
+                           ir->return_deref->type->vector_elements, 32, NULL);
+         instr->num_components = ir->return_deref->type->vector_elements;
+
+         ir_rvalue *value = (ir_rvalue *) ir->actual_parameters.get_head();
+         instr->src[0] = nir_src_for_ssa(evaluate_rvalue(value));
+
+         ir_rvalue *invocation = (ir_rvalue *) ir->actual_parameters.get_head()->next;
+         instr->src[1] = nir_src_for_ssa(evaluate_rvalue(invocation));
+
+         nir_builder_instr_insert(&b, &instr->instr);
+         break;
+      }
+      case nir_intrinsic_read_first_invocation: {
+         nir_ssa_dest_init(&instr->instr, &instr->dest,
+                           ir->return_deref->type->vector_elements, 32, NULL);
+         instr->num_components = ir->return_deref->type->vector_elements;
+
+         ir_rvalue *value = (ir_rvalue *) ir->actual_parameters.get_head();
+         instr->src[0] = nir_src_for_ssa(evaluate_rvalue(value));
+
+         nir_builder_instr_insert(&b, &instr->instr);
+         break;
+      }
       default:
          unreachable("not reached");
       }
@@ -1334,7 +1407,7 @@
        * a true boolean to be ~0. Fix this up with a != 0 comparison.
        */
 
-      if (ir->type->base_type == GLSL_TYPE_BOOL)
+      if (ir->type->is_boolean())
          this->result = nir_ine(&b, &load->dest.ssa, nir_imm_int(&b, 0));
 
       return;
diff --git a/src/compiler/glsl/hir_field_selection.cpp b/src/compiler/glsl/hir_field_selection.cpp
index eab08ad..9f23643 100644
--- a/src/compiler/glsl/hir_field_selection.cpp
+++ b/src/compiler/glsl/hir_field_selection.cpp
@@ -46,8 +46,7 @@
    YYLTYPE loc = expr->get_location();
    if (op->type->is_error()) {
       /* silently propagate the error */
-   } else if (op->type->base_type == GLSL_TYPE_STRUCT
-              || op->type->base_type == GLSL_TYPE_INTERFACE) {
+   } else if (op->type->is_record() || op->type->is_interface()) {
       result = new(ctx) ir_dereference_record(op,
 					      expr->primary_expression.identifier);
 
diff --git a/src/compiler/glsl/int64.glsl b/src/compiler/glsl/int64.glsl
index 1ac8d12..538f56c 100644
--- a/src/compiler/glsl/int64.glsl
+++ b/src/compiler/glsl/int64.glsl
@@ -1,8 +1,8 @@
 /* Compile with:
  *
- * glsl_compiler --version 140 --dump-builder int64.glsl > builtin_int64.h
+ * glsl_compiler --version 400 --dump-builder int64.glsl > builtin_int64.h
  *
- * Using version 1.40+ prevents built-in variables from being included.
+ * Version 4.00+ is required for umulExtended.
  */
 #version 400
 #extension GL_ARB_gpu_shader_int64: require
diff --git a/src/compiler/glsl/ir.cpp b/src/compiler/glsl/ir.cpp
index 2bbc7a1..78889bd 100644
--- a/src/compiler/glsl/ir.cpp
+++ b/src/compiler/glsl/ir.cpp
@@ -24,6 +24,8 @@
 #include "main/core.h" /* for MAX2 */
 #include "ir.h"
 #include "compiler/glsl_types.h"
+#include "glsl_parser_extras.h"
+
 
 ir_rvalue::ir_rvalue(enum ir_node_type t)
    : ir_instruction(t)
@@ -368,6 +370,16 @@
       this->type = glsl_type::vec4_type;
       break;
 
+   case ir_unop_unpack_sampler_2x32:
+   case ir_unop_unpack_image_2x32:
+      this->type = glsl_type::uvec2_type;
+      break;
+
+   case ir_unop_pack_sampler_2x32:
+   case ir_unop_pack_image_2x32:
+      this->type = op0->type;
+      break;
+
    case ir_unop_frexp_sig:
       this->type = op0->type;
       break;
@@ -381,20 +393,6 @@
       this->type = glsl_type::int_type;
       break;
 
-   case ir_unop_ballot:
-      this->type = glsl_type::uint64_t_type;
-      break;
-
-   case ir_unop_read_first_invocation:
-      this->type = op0->type;
-      break;
-
-   case ir_unop_vote_any:
-   case ir_unop_vote_all:
-   case ir_unop_vote_eq:
-      this->type = glsl_type::bool_type;
-      break;
-
    case ir_unop_bitcast_i642d:
    case ir_unop_bitcast_u642d:
       this->type = glsl_type::get_instance(GLSL_TYPE_DOUBLE,
@@ -504,10 +502,6 @@
       this->type = op0->type->get_scalar_type();
       break;
 
-   case ir_binop_read_invocation:
-      this->type = op0->type;
-      break;
-
    default:
       assert(!"not reached: missing automatic type setup for ir_expression");
       this->type = glsl_type::float_type;
@@ -622,7 +616,7 @@
    this->array_elements = NULL;
 
    assert((type->base_type >= GLSL_TYPE_UINT)
-	  && (type->base_type <= GLSL_TYPE_BOOL));
+	  && (type->base_type <= GLSL_TYPE_IMAGE));
 
    this->type = type;
    memcpy(& this->value, data, sizeof(this->value));
@@ -783,10 +777,9 @@
    if (value->type->is_scalar() && value->next->is_tail_sentinel()) {
       if (type->is_matrix()) {
 	 /* Matrix - fill diagonal (rest is already set to 0) */
-         assert(type->base_type == GLSL_TYPE_FLOAT ||
-                type->base_type == GLSL_TYPE_DOUBLE);
+         assert(type->is_float() || type->is_double());
          for (unsigned i = 0; i < type->matrix_columns; i++) {
-            if (type->base_type == GLSL_TYPE_FLOAT)
+            if (type->is_float())
                this->value.f[i * type->vector_elements + i] =
                   value->value.f[0];
             else
@@ -1237,7 +1230,7 @@
       return true;
    }
 
-   if (this->type->base_type == GLSL_TYPE_STRUCT) {
+   if (this->type->is_record()) {
       const exec_node *a_node = this->components.get_head_raw();
       const exec_node *b_node = c->components.get_head_raw();
 
@@ -1455,7 +1448,7 @@
 }
 
 bool
-ir_dereference::is_lvalue() const
+ir_dereference::is_lvalue(const struct _mesa_glsl_parse_state *state) const
 {
    ir_variable *var = this->variable_referenced();
 
@@ -1464,6 +1457,20 @@
    if ((var == NULL) || var->data.read_only)
       return false;
 
+   /* From section 4.1.7 of the ARB_bindless_texture spec:
+    *
+    * "Samplers can be used as l-values, so can be assigned into and used as
+    *  "out" and "inout" function parameters."
+    *
+    * From section 4.1.X of the ARB_bindless_texture spec:
+    *
+    * "Images can be used as l-values, so can be assigned into and used as
+    *  "out" and "inout" function parameters."
+    */
+   if ((!state || state->has_bindless()) &&
+       (this->type->contains_sampler() || this->type->contains_image()))
+      return true;
+
    /* From section 4.1.7 of the GLSL 4.40 spec:
     *
     *   "Opaque variables cannot be treated as l-values; hence cannot
@@ -1510,7 +1517,7 @@
       assert(type->base_type == GLSL_TYPE_INT);
    } else if (this->op == ir_lod) {
       assert(type->vector_elements == 2);
-      assert(type->base_type == GLSL_TYPE_FLOAT);
+      assert(type->is_float());
    } else if (this->op == ir_samples_identical) {
       assert(type == glsl_type::bool_type);
       assert(sampler->type->is_sampler());
@@ -1583,10 +1590,8 @@
 }
 
 ir_swizzle::ir_swizzle(ir_rvalue *val, ir_swizzle_mask mask)
-   : ir_rvalue(ir_type_swizzle)
+   : ir_rvalue(ir_type_swizzle), val(val), mask(mask)
 {
-   this->val = val;
-   this->mask = mask;
    this->type = glsl_type::get_instance(val->type->base_type,
 					mask.num_components, 1);
 }
@@ -1741,18 +1746,17 @@
    this->data.max_array_access = -1;
    this->data.offset = 0;
    this->data.precision = GLSL_PRECISION_NONE;
-   this->data.image_read_only = false;
-   this->data.image_write_only = false;
-   this->data.image_coherent = false;
-   this->data.image_volatile = false;
-   this->data.image_restrict = false;
+   this->data.memory_read_only = false;
+   this->data.memory_write_only = false;
+   this->data.memory_coherent = false;
+   this->data.memory_volatile = false;
+   this->data.memory_restrict = false;
    this->data.from_ssbo_unsized_array = false;
    this->data.fb_fetch_output = false;
+   this->data.bindless = false;
+   this->data.bound = false;
 
    if (type != NULL) {
-      if (type->is_sampler())
-         this->data.read_only = true;
-
       if (type->is_interface())
          this->init_interface_type(type);
       else if (type->without_array()->is_interface())
@@ -1865,11 +1869,11 @@
 	  a->data.centroid != b->data.centroid ||
           a->data.sample != b->data.sample ||
           a->data.patch != b->data.patch ||
-          a->data.image_read_only != b->data.image_read_only ||
-          a->data.image_write_only != b->data.image_write_only ||
-          a->data.image_coherent != b->data.image_coherent ||
-          a->data.image_volatile != b->data.image_volatile ||
-          a->data.image_restrict != b->data.image_restrict) {
+          a->data.memory_read_only != b->data.memory_read_only ||
+          a->data.memory_write_only != b->data.memory_write_only ||
+          a->data.memory_coherent != b->data.memory_coherent ||
+          a->data.memory_volatile != b->data.memory_volatile ||
+          a->data.memory_restrict != b->data.memory_restrict) {
 
 	 /* parameter a's qualifiers don't match */
 	 return a->name;
diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index d7a81c5..840c06e 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -233,7 +233,7 @@
 
    ir_rvalue *as_rvalue_to_saturate();
 
-   virtual bool is_lvalue() const
+   virtual bool is_lvalue(const struct _mesa_glsl_parse_state *state = NULL) const
    {
       return false;
    }
@@ -475,6 +475,17 @@
    }
 
    /**
+    * Return whether this variable contains a bindless sampler/image.
+    */
+   inline bool contains_bindless() const
+   {
+      if (!this->type->contains_sampler() && !this->type->contains_image())
+         return false;
+
+      return this->data.bindless || this->data.mode != ir_var_uniform;
+   }
+
+   /**
     * Set this->interface_type on a newly created variable.
     */
    void init_interface_type(const struct glsl_type *type)
@@ -827,13 +838,13 @@
       ir_depth_layout depth_layout:3;
 
       /**
-       * ARB_shader_image_load_store qualifiers.
+       * Memory qualifiers.
        */
-      unsigned image_read_only:1; /**< "readonly" qualifier. */
-      unsigned image_write_only:1; /**< "writeonly" qualifier. */
-      unsigned image_coherent:1;
-      unsigned image_volatile:1;
-      unsigned image_restrict:1;
+      unsigned memory_read_only:1; /**< "readonly" qualifier. */
+      unsigned memory_write_only:1; /**< "writeonly" qualifier. */
+      unsigned memory_coherent:1;
+      unsigned memory_volatile:1;
+      unsigned memory_restrict:1;
 
       /**
        * ARB_shader_storage_buffer_object
@@ -850,6 +861,18 @@
       unsigned fb_fetch_output:1;
 
       /**
+       * Non-zero if this variable is considered bindless as defined by
+       * ARB_bindless_texture.
+       */
+      unsigned bindless:1;
+
+      /**
+       * Non-zero if this variable is considered bound as defined by
+       * ARB_bindless_texture.
+       */
+      unsigned bound:1;
+
+      /**
        * Emit a warning if this variable is accessed.
        */
    private:
@@ -1098,6 +1121,13 @@
    ir_intrinsic_memory_barrier_image,
    ir_intrinsic_memory_barrier_shared,
 
+   ir_intrinsic_vote_all,
+   ir_intrinsic_vote_any,
+   ir_intrinsic_vote_eq,
+   ir_intrinsic_ballot,
+   ir_intrinsic_read_invocation,
+   ir_intrinsic_read_first_invocation,
+
    ir_intrinsic_shared_load,
    ir_intrinsic_shared_store = MAKE_INTRINSIC_FOR_TYPE(store, shared),
    ir_intrinsic_shared_atomic_add = MAKE_INTRINSIC_FOR_TYPE(atomic_add, shared),
@@ -1922,9 +1952,9 @@
    virtual bool equals(const ir_instruction *ir,
                        enum ir_node_type ignore = ir_type_unset) const;
 
-   bool is_lvalue() const
+   bool is_lvalue(const struct _mesa_glsl_parse_state *state) const
    {
-      return val->is_lvalue() && !mask.has_duplicates;
+      return val->is_lvalue(state) && !mask.has_duplicates;
    }
 
    /**
@@ -1949,7 +1979,7 @@
 public:
    virtual ir_dereference *clone(void *mem_ctx, struct hash_table *) const = 0;
 
-   bool is_lvalue() const;
+   bool is_lvalue(const struct _mesa_glsl_parse_state *state) const;
 
    /**
     * Get the variable that is ultimately referenced by an r-value
diff --git a/src/compiler/glsl/ir_builder_print_visitor.cpp b/src/compiler/glsl/ir_builder_print_visitor.cpp
index 825dbe1..02f15e7 100644
--- a/src/compiler/glsl/ir_builder_print_visitor.cpp
+++ b/src/compiler/glsl/ir_builder_print_visitor.cpp
@@ -372,17 +372,17 @@
          switch (ir->type->base_type) {
          case GLSL_TYPE_UINT:
             if (ir->value.u[i] != 0)
-               print_without_indent("r%04X_data.u[%u] = %u;\n",
+               print_with_indent("r%04X_data.u[%u] = %u;\n",
                                     my_index, i, ir->value.u[i]);
             break;
          case GLSL_TYPE_INT:
             if (ir->value.i[i] != 0)
-               print_without_indent("r%04X_data.i[%u] = %i;\n",
+               print_with_indent("r%04X_data.i[%u] = %i;\n",
                                     my_index, i, ir->value.i[i]);
             break;
          case GLSL_TYPE_FLOAT:
             if (ir->value.u[i] != 0)
-               print_without_indent("r%04X_data.u[%u] = 0x%08x; /* %f */\n",
+               print_with_indent("r%04X_data.u[%u] = 0x%08x; /* %f */\n",
                                     my_index,
                                     i,
                                     ir->value.u[i],
@@ -395,27 +395,27 @@
 
             memcpy(&v, &ir->value.d[i], sizeof(v));
             if (v != 0)
-               print_without_indent("r%04X_data.u64[%u] = 0x%016" PRIx64 "; /* %g */\n",
+               print_with_indent("r%04X_data.u64[%u] = 0x%016" PRIx64 "; /* %g */\n",
                                     my_index, i, v, ir->value.d[i]);
             break;
          }
          case GLSL_TYPE_UINT64:
             if (ir->value.u64[i] != 0)
-               print_without_indent("r%04X_data.u64[%u] = %" PRIu64 ";\n",
+               print_with_indent("r%04X_data.u64[%u] = %" PRIu64 ";\n",
                                     my_index,
                                     i,
                                     ir->value.u64[i]);
             break;
          case GLSL_TYPE_INT64:
             if (ir->value.i64[i] != 0)
-               print_without_indent("r%04X_data.i64[%u] = %" PRId64 ";\n",
+               print_with_indent("r%04X_data.i64[%u] = %" PRId64 ";\n",
                                     my_index,
                                     i,
                                     ir->value.i64[i]);
             break;
          case GLSL_TYPE_BOOL:
             if (ir->value.u[i] != 0)
-               print_without_indent("r%04X_data.u[%u] = 1;\n", my_index, i);
+               print_with_indent("r%04X_data.u[%u] = 1;\n", my_index, i);
             break;
          default:
             unreachable("Invalid constant type");
diff --git a/src/compiler/glsl/ir_clone.cpp b/src/compiler/glsl/ir_clone.cpp
index bfe2573..a64c7af 100644
--- a/src/compiler/glsl/ir_clone.cpp
+++ b/src/compiler/glsl/ir_clone.cpp
@@ -339,6 +339,8 @@
    case GLSL_TYPE_BOOL:
    case GLSL_TYPE_UINT64:
    case GLSL_TYPE_INT64:
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
       return new(mem_ctx) ir_constant(this->type, &this->value);
 
    case GLSL_TYPE_STRUCT: {
@@ -367,8 +369,6 @@
       return c;
    }
 
-   case GLSL_TYPE_SAMPLER:
-   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
diff --git a/src/compiler/glsl/ir_constant_expression.cpp b/src/compiler/glsl/ir_constant_expression.cpp
index cd3cd1b..fe13674 100644
--- a/src/compiler/glsl/ir_constant_expression.cpp
+++ b/src/compiler/glsl/ir_constant_expression.cpp
@@ -725,6 +725,8 @@
          case GLSL_TYPE_FLOAT: data.f[i] = v->value.f[swiz_idx[i]]; break;
          case GLSL_TYPE_BOOL:  data.b[i] = v->value.b[swiz_idx[i]]; break;
          case GLSL_TYPE_DOUBLE:data.d[i] = v->value.d[swiz_idx[i]]; break;
+         case GLSL_TYPE_UINT64:data.u64[i] = v->value.u64[swiz_idx[i]]; break;
+         case GLSL_TYPE_INT64: data.i64[i] = v->value.i64[swiz_idx[i]]; break;
          default:              assert(!"Should not get here."); break;
          }
       }
diff --git a/src/compiler/glsl/ir_equals.cpp b/src/compiler/glsl/ir_equals.cpp
index 58b08d4..81980eb 100644
--- a/src/compiler/glsl/ir_equals.cpp
+++ b/src/compiler/glsl/ir_equals.cpp
@@ -58,7 +58,7 @@
       return false;
 
    for (unsigned i = 0; i < type->components(); i++) {
-      if (type->base_type == GLSL_TYPE_DOUBLE) {
+      if (type->is_double()) {
          if (value.d[i] != other->value.d[i])
             return false;
       } else {
diff --git a/src/compiler/glsl/ir_expression_operation.py b/src/compiler/glsl/ir_expression_operation.py
index 1d29560..52f2550 100644
--- a/src/compiler/glsl/ir_expression_operation.py
+++ b/src/compiler/glsl/ir_expression_operation.py
@@ -180,7 +180,7 @@
          for (unsigned j = 0; j < p; j++) {
             for (unsigned i = 0; i < n; i++) {
                for (unsigned k = 0; k < m; k++) {
-                  if (op[0]->type->base_type == GLSL_TYPE_DOUBLE)
+                  if (op[0]->type->is_double())
                      data.d[i+n*j] += op[0]->value.d[i+n*k]*op[1]->value.d[k+m*j];
                   else
                      data.f[i+n*j] += op[0]->value.f[i+n*k]*op[1]->value.f[k+m*j];
@@ -276,12 +276,9 @@
 # This template is for ir_triop_lrp.
 constant_template_lrp = mako.template.Template("""\
    case ${op.get_enum_name()}: {
-      assert(op[0]->type->base_type == GLSL_TYPE_FLOAT ||
-             op[0]->type->base_type == GLSL_TYPE_DOUBLE);
-      assert(op[1]->type->base_type == GLSL_TYPE_FLOAT ||
-             op[1]->type->base_type == GLSL_TYPE_DOUBLE);
-      assert(op[2]->type->base_type == GLSL_TYPE_FLOAT ||
-             op[2]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(op[0]->type->is_float() || op[0]->type->is_double());
+      assert(op[1]->type->is_float() || op[1]->type->is_double());
+      assert(op[2]->type->is_float() || op[2]->type->is_double());
 
       unsigned c2_inc = op[2]->type->is_scalar() ? 0 : 1;
       for (unsigned c = 0, c2 = 0; c < components; c2 += c2_inc, c++) {
@@ -546,6 +543,12 @@
    operation("pack_double_2x32", 1, printable_name="packDouble2x32", source_types=(uint_type,), dest_type=double_type, c_expression="memcpy(&data.d[0], &op[0]->value.u[0], sizeof(double))", flags=frozenset((horizontal_operation, non_assign_operation))),
    operation("unpack_double_2x32", 1, printable_name="unpackDouble2x32", source_types=(double_type,), dest_type=uint_type, c_expression="memcpy(&data.u[0], &op[0]->value.d[0], sizeof(double))", flags=frozenset((horizontal_operation, non_assign_operation))),
 
+   # Sampler/Image packing, part of ARB_bindless_texture.
+   operation("pack_sampler_2x32", 1, printable_name="packSampler2x32", source_types=(uint_type,), dest_type=uint64_type, c_expression="memcpy(&data.u64[0], &op[0]->value.u[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
+   operation("pack_image_2x32", 1, printable_name="packImage2x32", source_types=(uint_type,), dest_type=uint64_type, c_expression="memcpy(&data.u64[0], &op[0]->value.u[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
+   operation("unpack_sampler_2x32", 1, printable_name="unpackSampler2x32", source_types=(uint64_type,), dest_type=uint_type, c_expression="memcpy(&data.u[0], &op[0]->value.u64[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
+   operation("unpack_image_2x32", 1, printable_name="unpackImage2x32", source_types=(uint64_type,), dest_type=uint_type, c_expression="memcpy(&data.u[0], &op[0]->value.u64[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
+
    operation("frexp_sig", 1),
    operation("frexp_exp", 1),
 
@@ -570,15 +573,6 @@
    # of its length.
    operation("ssbo_unsized_array_length", 1),
 
-   # ARB_shader_ballot operations
-   operation("ballot", 1, source_types=(bool_type,), dest_type=uint64_type),
-   operation("read_first_invocation", 1),
-
-   # Vote among threads on the value of the boolean argument.
-   operation("vote_any", 1),
-   operation("vote_all", 1),
-   operation("vote_eq", 1),
-
    # 64-bit integer packing ops.
    operation("pack_int_2x32", 1, printable_name="packInt2x32", source_types=(int_type,), dest_type=int64_type, c_expression="memcpy(&data.i64[0], &op[0]->value.i[0], sizeof(int64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
    operation("pack_uint_2x32", 1, printable_name="packUint2x32", source_types=(uint_type,), dest_type=uint64_type, c_expression="memcpy(&data.u64[0], &op[0]->value.u[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
@@ -670,9 +664,6 @@
    # operand1 is the sample ID
    operation("interpolate_at_sample", 2),
 
-   # ARB_shader_ballot operation
-   operation("read_invocation", 2),
-
    # Fused floating-point multiply-add, part of ARB_gpu_shader5.
    operation("fma", 3, source_types=real_types, c_expression="{src0} * {src1} + {src2}"),
 
diff --git a/src/compiler/glsl/ir_function.cpp b/src/compiler/glsl/ir_function.cpp
index 3aeba81..3ee0d17 100644
--- a/src/compiler/glsl/ir_function.cpp
+++ b/src/compiler/glsl/ir_function.cpp
@@ -149,13 +149,13 @@
    if (from_type == to_type)
       return PARAMETER_EXACT_MATCH;
 
-   if (to_type->base_type == GLSL_TYPE_DOUBLE) {
-      if (from_type->base_type == GLSL_TYPE_FLOAT)
+   if (to_type->is_double()) {
+      if (from_type->is_float())
          return PARAMETER_FLOAT_TO_DOUBLE;
       return PARAMETER_INT_TO_DOUBLE;
    }
 
-   if (to_type->base_type == GLSL_TYPE_FLOAT)
+   if (to_type->is_float())
       return PARAMETER_INT_TO_FLOAT;
 
    /* int -> uint and any other oddball conversions */
diff --git a/src/compiler/glsl/ir_print_visitor.cpp b/src/compiler/glsl/ir_print_visitor.cpp
index 1c84c1b..86ddea6 100644
--- a/src/compiler/glsl/ir_print_visitor.cpp
+++ b/src/compiler/glsl/ir_print_visitor.cpp
@@ -145,12 +145,11 @@
 static void
 print_type(FILE *f, const glsl_type *t)
 {
-   if (t->base_type == GLSL_TYPE_ARRAY) {
+   if (t->is_array()) {
       fprintf(f, "(array ");
       print_type(f, t->fields.array);
       fprintf(f, " %u)", t->length);
-   } else if ((t->base_type == GLSL_TYPE_STRUCT)
-              && !is_gl_identifier(t->name)) {
+   } else if (t->is_record() && !is_gl_identifier(t->name)) {
       fprintf(f, "%s@%p", t->name, (void *) t);
    } else {
       fprintf(f, "%s", t->name);
@@ -175,7 +174,7 @@
       snprintf(loc, sizeof(loc), "location=%i ", ir->data.location);
 
    char component[32] = {0};
-   if (ir->data.explicit_component)
+   if (ir->data.explicit_component || ir->data.location_frac != 0)
       snprintf(component, sizeof(component), "component=%i ", ir->data.location_frac);
 
    char stream[32] = {0};
@@ -189,11 +188,24 @@
       snprintf(stream, sizeof(stream), "stream%u ", ir->data.stream);
    }
 
+   char image_format[32] = {0};
+   if (ir->data.image_format) {
+      snprintf(image_format, sizeof(image_format), "format=%x ",
+               ir->data.image_format);
+   }
+
    const char *const cent = (ir->data.centroid) ? "centroid " : "";
    const char *const samp = (ir->data.sample) ? "sample " : "";
    const char *const patc = (ir->data.patch) ? "patch " : "";
    const char *const inv = (ir->data.invariant) ? "invariant " : "";
    const char *const prec = (ir->data.precise) ? "precise " : "";
+   const char *const bindless = (ir->data.bindless) ? "bindless " : "";
+   const char *const bound = (ir->data.bound) ? "bound " : "";
+   const char *const memory_read_only = (ir->data.memory_read_only) ? "readonly " : "";
+   const char *const memory_write_only = (ir->data.memory_write_only) ? "writeonly " : "";
+   const char *const memory_coherent = (ir->data.memory_coherent) ? "coherent " : "";
+   const char *const memory_volatile = (ir->data.memory_volatile) ? "volatile " : "";
+   const char *const memory_restrict = (ir->data.memory_restrict) ? "restrict " : "";
    const char *const mode[] = { "", "uniform ", "shader_storage ",
                                 "shader_shared ", "shader_in ", "shader_out ",
                                 "in ", "out ", "inout ",
@@ -202,8 +214,11 @@
    const char *const interp[] = { "", "smooth", "flat", "noperspective" };
    STATIC_ASSERT(ARRAY_SIZE(interp) == INTERP_MODE_COUNT);
 
-   fprintf(f, "(%s%s%s%s%s%s%s%s%s%s%s) ",
-           binding, loc, component, cent, samp, patc, inv, prec, mode[ir->data.mode],
+   fprintf(f, "(%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s) ",
+           binding, loc, component, cent, bindless, bound,
+           image_format, memory_read_only, memory_write_only,
+           memory_coherent, memory_volatile, memory_restrict,
+           samp, patc, inv, prec, mode[ir->data.mode],
            stream,
            interp[ir->data.interpolation]);
 
@@ -477,7 +492,11 @@
             else
                fprintf(f, "%f", ir->value.f[i]);
             break;
-	 case GLSL_TYPE_UINT64:fprintf(f, "%" PRIu64, ir->value.u64[i]); break;
+	 case GLSL_TYPE_SAMPLER:
+	 case GLSL_TYPE_IMAGE:
+	 case GLSL_TYPE_UINT64:
+            fprintf(f, "%" PRIu64, ir->value.u64[i]);
+            break;
 	 case GLSL_TYPE_INT64: fprintf(f, "%" PRIi64, ir->value.i64[i]); break;
 	 case GLSL_TYPE_BOOL:  fprintf(f, "%d", ir->value.b[i]); break;
 	 case GLSL_TYPE_DOUBLE:
diff --git a/src/compiler/glsl/ir_reader.cpp b/src/compiler/glsl/ir_reader.cpp
index 6d3d048..b87933b 100644
--- a/src/compiler/glsl/ir_reader.cpp
+++ b/src/compiler/glsl/ir_reader.cpp
@@ -833,7 +833,7 @@
 	 return NULL;
       }
 
-      if (type->base_type == GLSL_TYPE_FLOAT) {
+      if (type->is_float()) {
 	 s_number *value = SX_AS_NUMBER(expr);
 	 if (value == NULL) {
 	    ir_read_error(values, "expected numbers");
diff --git a/src/compiler/glsl/ir_uniform.h b/src/compiler/glsl/ir_uniform.h
index b6aec7fc..9545c49 100644
--- a/src/compiler/glsl/ir_uniform.h
+++ b/src/compiler/glsl/ir_uniform.h
@@ -107,6 +107,11 @@
    struct gl_opaque_uniform_index opaque[MESA_SHADER_STAGES];
 
    /**
+    * Mask of shader stages (1 << MESA_SHADER_xxx) where this uniform is used.
+    */
+   unsigned active_shader_mask;
+
+   /**
     * Storage used by the driver for the uniform
     */
    unsigned num_driver_storage;
@@ -201,6 +206,12 @@
     * top-level shader storage block member. (GL_TOP_LEVEL_ARRAY_STRIDE).
     */
    unsigned top_level_array_stride;
+
+   /**
+    * Whether this uniform variable has the bindless_sampler or bindless_image
+    * layout qualifier as specified by ARB_bindless_texture.
+    */
+   bool is_bindless;
 };
 
 #ifdef __cplusplus
diff --git a/src/compiler/glsl/ir_validate.cpp b/src/compiler/glsl/ir_validate.cpp
index 76a4ed1..6e2f3e5 100644
--- a/src/compiler/glsl/ir_validate.cpp
+++ b/src/compiler/glsl/ir_validate.cpp
@@ -241,8 +241,8 @@
       assert(ir->operands[0]->type == ir->type);
       break;
    case ir_unop_logic_not:
-      assert(ir->type->base_type == GLSL_TYPE_BOOL);
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->type->is_boolean());
+      assert(ir->operands[0]->type->is_boolean());
       break;
 
    case ir_unop_neg:
@@ -252,8 +252,8 @@
    case ir_unop_abs:
    case ir_unop_sign:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT ||
-             ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT ||
-             ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE ||
+             ir->operands[0]->type->is_float() ||
+             ir->operands[0]->type->is_double() ||
              ir->operands[0]->type->base_type == GLSL_TYPE_INT64);
       assert(ir->type == ir->operands[0]->type);
       break;
@@ -261,8 +261,8 @@
    case ir_unop_rcp:
    case ir_unop_rsq:
    case ir_unop_sqrt:
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT ||
-             ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->type->is_float() ||
+             ir->type->is_double());
       assert(ir->type == ir->operands[0]->type);
       break;
 
@@ -271,41 +271,41 @@
    case ir_unop_exp2:
    case ir_unop_log2:
    case ir_unop_saturate:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_float());
       assert(ir->type == ir->operands[0]->type);
       break;
 
    case ir_unop_f2i:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_float());
       assert(ir->type->base_type == GLSL_TYPE_INT);
       break;
    case ir_unop_f2u:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_float());
       assert(ir->type->base_type == GLSL_TYPE_UINT);
       break;
    case ir_unop_i2f:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->type->is_float());
       break;
    case ir_unop_f2b:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
-      assert(ir->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->operands[0]->type->is_float());
+      assert(ir->type->is_boolean());
       break;
    case ir_unop_b2f:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_BOOL);
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_boolean());
+      assert(ir->type->is_float());
       break;
    case ir_unop_i2b:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
-      assert(ir->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->type->is_boolean());
       break;
    case ir_unop_b2i:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->operands[0]->type->is_boolean());
       assert(ir->type->base_type == GLSL_TYPE_INT);
       break;
    case ir_unop_u2f:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->type->is_float());
       break;
    case ir_unop_i2u:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
@@ -317,35 +317,35 @@
       break;
    case ir_unop_bitcast_i2f:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->type->is_float());
       break;
    case ir_unop_bitcast_f2i:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_float());
       assert(ir->type->base_type == GLSL_TYPE_INT);
       break;
    case ir_unop_bitcast_u2f:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->type->is_float());
       break;
    case ir_unop_bitcast_f2u:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_float());
       assert(ir->type->base_type == GLSL_TYPE_UINT);
       break;
 
    case ir_unop_bitcast_u642d:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT64);
-      assert(ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->type->is_double());
       break;
    case ir_unop_bitcast_i642d:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT64);
-      assert(ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->type->is_double());
       break;
    case ir_unop_bitcast_d2u64:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_double());
       assert(ir->type->base_type == GLSL_TYPE_UINT64);
       break;
    case ir_unop_bitcast_d2i64:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_double());
       assert(ir->type->base_type == GLSL_TYPE_INT64);
       break;
    case ir_unop_i642i:
@@ -366,23 +366,23 @@
       break;
    case ir_unop_i642b:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT64);
-      assert(ir->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->type->is_boolean());
       break;
    case ir_unop_i642f:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT64);
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->type->is_float());
       break;
    case ir_unop_u642f:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT64);
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->type->is_float());
       break;
    case ir_unop_i642d:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT64);
-      assert(ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->type->is_double());
       break;
    case ir_unop_u642d:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT64);
-      assert(ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->type->is_double());
       break;
    case ir_unop_i2i64:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
@@ -393,15 +393,15 @@
       assert(ir->type->base_type == GLSL_TYPE_INT64);
       break;
    case ir_unop_b2i64:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->operands[0]->type->is_boolean());
       assert(ir->type->base_type == GLSL_TYPE_INT64);
       break;
    case ir_unop_f2i64:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_float());
       assert(ir->type->base_type == GLSL_TYPE_INT64);
       break;
    case ir_unop_d2i64:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_double());
       assert(ir->type->base_type == GLSL_TYPE_INT64);
       break;
    case ir_unop_i2u64:
@@ -413,11 +413,11 @@
       assert(ir->type->base_type == GLSL_TYPE_UINT64);
       break;
    case ir_unop_f2u64:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_float());
       assert(ir->type->base_type == GLSL_TYPE_UINT64);
       break;
    case ir_unop_d2u64:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_double());
       assert(ir->type->base_type == GLSL_TYPE_UINT64);
       break;
    case ir_unop_u642i64:
@@ -433,8 +433,8 @@
    case ir_unop_ceil:
    case ir_unop_floor:
    case ir_unop_fract:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT ||
-             ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_float() ||
+             ir->operands[0]->type->is_double());
       assert(ir->operands[0]->type == ir->type);
       break;
    case ir_unop_sin:
@@ -445,7 +445,7 @@
    case ir_unop_dFdy:
    case ir_unop_dFdy_coarse:
    case ir_unop_dFdy_fine:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_float());
       assert(ir->operands[0]->type == ir->type);
       break;
 
@@ -477,6 +477,16 @@
       assert(ir->operands[0]->type == glsl_type::uvec2_type);
       break;
 
+   case ir_unop_pack_sampler_2x32:
+      assert(ir->type->is_sampler());
+      assert(ir->operands[0]->type == glsl_type::uvec2_type);
+      break;
+
+   case ir_unop_pack_image_2x32:
+      assert(ir->type->is_image());
+      assert(ir->operands[0]->type == glsl_type::uvec2_type);
+      break;
+
    case ir_unop_unpack_snorm_2x16:
    case ir_unop_unpack_unorm_2x16:
    case ir_unop_unpack_half_2x16:
@@ -505,6 +515,16 @@
       assert(ir->operands[0]->type == glsl_type::uint64_t_type);
       break;
 
+   case ir_unop_unpack_sampler_2x32:
+      assert(ir->type == glsl_type::uvec2_type);
+      assert(ir->operands[0]->type->is_sampler());
+      break;
+
+   case ir_unop_unpack_image_2x32:
+      assert(ir->type == glsl_type::uvec2_type);
+      assert(ir->operands[0]->type->is_image());
+      break;
+
    case ir_unop_bitfield_reverse:
       assert(ir->operands[0]->type == ir->type);
       assert(ir->type->is_integer());
@@ -539,42 +559,42 @@
       break;
 
    case ir_unop_d2f:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+      assert(ir->operands[0]->type->is_double());
+      assert(ir->type->is_float());
       break;
    case ir_unop_f2d:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
-      assert(ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_float());
+      assert(ir->type->is_double());
       break;
    case ir_unop_d2i:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_double());
       assert(ir->type->base_type == GLSL_TYPE_INT);
       break;
    case ir_unop_i2d:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
-      assert(ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->type->is_double());
       break;
    case ir_unop_d2u:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_double());
       assert(ir->type->base_type == GLSL_TYPE_UINT);
       break;
    case ir_unop_u2d:
       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
-      assert(ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->type->is_double());
       break;
    case ir_unop_d2b:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
-      assert(ir->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->operands[0]->type->is_double());
+      assert(ir->type->is_boolean());
       break;
 
    case ir_unop_frexp_sig:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT ||
-             ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
-      assert(ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_float() ||
+             ir->operands[0]->type->is_double());
+      assert(ir->type->is_double());
       break;
    case ir_unop_frexp_exp:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT ||
-             ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_float() ||
+             ir->operands[0]->type->is_double());
       assert(ir->type->base_type == GLSL_TYPE_INT);
       break;
    case ir_unop_subroutine_to_int:
@@ -582,29 +602,6 @@
       assert(ir->type->base_type == GLSL_TYPE_INT);
       break;
 
-   case ir_unop_ballot:
-      assert(ir->type == glsl_type::uint64_t_type);
-      assert(ir->operands[0]->type == glsl_type::bool_type);
-      break;
-
-   case ir_binop_read_invocation:
-      assert(ir->operands[1]->type == glsl_type::uint_type);
-      /* fall-through */
-   case ir_unop_read_first_invocation:
-      assert(ir->type == ir->operands[0]->type);
-      assert(ir->type->is_scalar() || ir->type->is_vector());
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT ||
-             ir->type->base_type == GLSL_TYPE_INT ||
-             ir->type->base_type == GLSL_TYPE_UINT);
-      break;
-
-   case ir_unop_vote_any:
-   case ir_unop_vote_all:
-   case ir_unop_vote_eq:
-      assert(ir->type == glsl_type::bool_type);
-      assert(ir->operands[0]->type == glsl_type::bool_type);
-      break;
-
    case ir_binop_add:
    case ir_binop_sub:
    case ir_binop_mul:
@@ -651,7 +648,7 @@
        * comparison on scalar or vector types and return a boolean scalar or
        * vector type of the same size.
        */
-      assert(ir->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->type->is_boolean());
       assert(ir->operands[0]->type == ir->operands[1]->type);
       assert(ir->operands[0]->type->is_vector()
 	     || ir->operands[0]->type->is_scalar());
@@ -699,16 +696,16 @@
    case ir_binop_logic_and:
    case ir_binop_logic_xor:
    case ir_binop_logic_or:
-      assert(ir->type->base_type == GLSL_TYPE_BOOL);
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_BOOL);
-      assert(ir->operands[1]->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->type->is_boolean());
+      assert(ir->operands[0]->type->is_boolean());
+      assert(ir->operands[1]->type->is_boolean());
       break;
 
    case ir_binop_dot:
       assert(ir->type == glsl_type::float_type ||
              ir->type == glsl_type::double_type);
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT ||
-             ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_float() ||
+             ir->operands[0]->type->is_double());
       assert(ir->operands[0]->type->is_vector());
       assert(ir->operands[0]->type == ir->operands[1]->type);
       break;
@@ -748,16 +745,16 @@
       break;
 
    case ir_triop_fma:
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT ||
-             ir->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->type->is_float() ||
+             ir->type->is_double());
       assert(ir->type == ir->operands[0]->type);
       assert(ir->type == ir->operands[1]->type);
       assert(ir->type == ir->operands[2]->type);
       break;
 
    case ir_triop_lrp:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT ||
-             ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(ir->operands[0]->type->is_float() ||
+             ir->operands[0]->type->is_double());
       assert(ir->operands[0]->type == ir->operands[1]->type);
       assert(ir->operands[2]->type == ir->operands[0]->type ||
              ir->operands[2]->type == glsl_type::float_type ||
@@ -765,7 +762,7 @@
       break;
 
    case ir_triop_csel:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_BOOL);
+      assert(ir->operands[0]->type->is_boolean());
       assert(ir->type->vector_elements == ir->operands[0]->type->vector_elements);
       assert(ir->type == ir->operands[1]->type);
       assert(ir->type == ir->operands[2]->type);
diff --git a/src/compiler/glsl/link_uniform_initializers.cpp b/src/compiler/glsl/link_uniform_initializers.cpp
index 8911c3d..84a3879 100644
--- a/src/compiler/glsl/link_uniform_initializers.cpp
+++ b/src/compiler/glsl/link_uniform_initializers.cpp
@@ -25,7 +25,7 @@
 #include "ir.h"
 #include "linker.h"
 #include "ir_uniform.h"
-#include "util/string_to_uint_map.h"
+#include "string_to_uint_map.h"
 
 /* These functions are put in a "private" namespace instead of being marked
  * static so that the unit tests can access them.  See
@@ -97,7 +97,8 @@
  */
 void
 set_opaque_binding(void *mem_ctx, gl_shader_program *prog,
-                   const glsl_type *type, const char *name, int *binding)
+                   const ir_variable *var, const glsl_type *type,
+                   const char *name, int *binding)
 {
 
    if (type->is_array() && type->fields.array->is_array()) {
@@ -106,7 +107,7 @@
       for (unsigned int i = 0; i < type->length; i++) {
          const char *element_name = ralloc_asprintf(mem_ctx, "%s[%d]", name, i);
 
-         set_opaque_binding(mem_ctx, prog, element_type,
+         set_opaque_binding(mem_ctx, prog, var, element_type,
                             element_name, binding);
       }
    } else {
@@ -117,7 +118,7 @@
 
       const unsigned elements = MAX2(storage->array_elements, 1);
 
-      /* Section 4.4.4 (Opaque-Uniform Layout Qualifiers) of the GLSL 4.20 spec
+      /* Section 4.4.6 (Opaque-Uniform Layout Qualifiers) of the GLSL 4.50 spec
        * says:
        *
        *     "If the binding identifier is used with an array, the first element
@@ -129,20 +130,44 @@
       }
 
       for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
-        gl_linked_shader *shader = prog->_LinkedShaders[sh];
+         gl_linked_shader *shader = prog->_LinkedShaders[sh];
 
-         if (shader) {
-            if (storage->type->is_sampler() && storage->opaque[sh].active) {
-               for (unsigned i = 0; i < elements; i++) {
-                  const unsigned index = storage->opaque[sh].index + i;
+         if (!shader)
+            continue;
+         if (!storage->opaque[sh].active)
+            continue;
+
+         if (storage->type->is_sampler()) {
+            for (unsigned i = 0; i < elements; i++) {
+               const unsigned index = storage->opaque[sh].index + i;
+
+               if (var->data.bindless) {
+                  if (index >= shader->Program->sh.NumBindlessSamplers)
+                     break;
+                  shader->Program->sh.BindlessSamplers[index].unit =
+                     storage->storage[i].i;
+                  shader->Program->sh.BindlessSamplers[index].bound = true;
+                  shader->Program->sh.HasBoundBindlessSampler = true;
+               } else {
+                  if (index >= ARRAY_SIZE(shader->Program->SamplerUnits))
+                     break;
                   shader->Program->SamplerUnits[index] =
                      storage->storage[i].i;
                }
+            }
+         } else if (storage->type->is_image()) {
+            for (unsigned i = 0; i < elements; i++) {
+               const unsigned index = storage->opaque[sh].index + i;
 
-            } else if (storage->type->is_image() &&
-                    storage->opaque[sh].active) {
-               for (unsigned i = 0; i < elements; i++) {
-                  const unsigned index = storage->opaque[sh].index + i;
+
+               if (var->data.bindless) {
+                  if (index >= shader->Program->sh.NumBindlessImages)
+                     break;
+                  shader->Program->sh.BindlessImages[index].unit =
+                     storage->storage[i].i;
+                  shader->Program->sh.BindlessImages[index].bound = true;
+                  shader->Program->sh.HasBoundBindlessImage = true;
+               } else {
                   if (index >= ARRAY_SIZE(shader->Program->sh.ImageUnits))
                      break;
                   shader->Program->sh.ImageUnits[index] =
@@ -280,7 +305,7 @@
             if (type->without_array()->is_sampler() ||
                 type->without_array()->is_image()) {
                int binding = var->data.binding;
-               linker::set_opaque_binding(mem_ctx, prog, var->type,
+               linker::set_opaque_binding(mem_ctx, prog, var, var->type,
                                           var->name, &binding);
             } else if (var->is_in_buffer_block()) {
                const glsl_type *const iface_type = var->get_interface_type();
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index c29fbed..99b171d 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -27,7 +27,7 @@
 #include "ir_uniform.h"
 #include "glsl_symbol_table.h"
 #include "program.h"
-#include "util/string_to_uint_map.h"
+#include "string_to_uint_map.h"
 #include "ir_array_refcount.h"
 
 /**
@@ -42,21 +42,6 @@
  */
 #define UNMAPPED_UNIFORM_LOC ~0u
 
-/**
- * Count the backing storage requirements for a type
- */
-unsigned
-values_for_type(const glsl_type *type)
-{
-   if (type->is_sampler()) {
-      return 1;
-   } else if (type->is_array() && type->fields.array->is_sampler()) {
-      return type->array_size();
-   } else {
-      return type->component_slots();
-   }
-}
-
 void
 program_resource_visitor::process(const glsl_type *type, const char *name)
 {
@@ -273,7 +258,7 @@
         num_shader_samplers(0), num_shader_images(0),
         num_shader_uniform_components(0), num_shader_subroutines(0),
         is_buffer_block(false), is_shader_storage(false), map(map),
-        hidden_map(hidden_map)
+        hidden_map(hidden_map), current_var(NULL)
    {
       /* empty */
    }
@@ -351,13 +336,17 @@
        * uniform for multiple shader targets, but in this case we want to
        * count it for each shader target.
        */
-      const unsigned values = values_for_type(type);
+      const unsigned values = type->component_slots();
       if (type->contains_subroutine()) {
          this->num_shader_subroutines += values;
-      } else if (type->contains_sampler()) {
-         this->num_shader_samplers += values;
-      } else if (type->contains_image()) {
-         this->num_shader_images += values;
+      } else if (type->contains_sampler() && !current_var->data.bindless) {
+         /* Samplers (bound or bindless) are counted as two components as
+          * specified by ARB_bindless_texture. */
+         this->num_shader_samplers += values / 2;
+      } else if (type->contains_image() && !current_var->data.bindless) {
+         /* Images (bound or bindless) are counted as two components as
+          * specified by ARB_bindless_texture. */
+         this->num_shader_images += values / 2;
 
          /* As drivers are likely to represent image uniforms as
           * scalar indices, count them against the limit of uniform
@@ -429,10 +418,17 @@
                               struct string_to_uint_map *map,
                               struct gl_uniform_storage *uniforms,
                               union gl_constant_value *values)
-      : prog(prog), map(map), uniforms(uniforms), values(values)
+      : prog(prog), map(map), uniforms(uniforms), values(values),
+        bindless_targets(NULL), bindless_access(NULL)
    {
    }
 
+   virtual ~parcel_out_uniform_storage()
+   {
+      free(this->bindless_targets);
+      free(this->bindless_access);
+   }
+
    void start_shader(gl_shader_stage shader_type)
    {
       assert(shader_type < MESA_SHADER_STAGES);
@@ -445,6 +441,16 @@
       this->next_subroutine = 0;
       this->record_array_count = 1;
       memset(this->targets, 0, sizeof(this->targets));
+
+      this->num_bindless_samplers = 0;
+      this->next_bindless_sampler = 0;
+      free(this->bindless_targets);
+      this->bindless_targets = NULL;
+
+      this->num_bindless_images = 0;
+      this->next_bindless_image = 0;
+      free(this->bindless_access);
+      this->bindless_access = NULL;
    }
 
    void set_and_process(ir_variable *var)
@@ -452,6 +458,9 @@
       current_var = var;
       field_counter = 0;
       this->record_next_sampler = new string_to_uint_map;
+      this->record_next_bindless_sampler = new string_to_uint_map;
+      this->record_next_image = new string_to_uint_map;
+      this->record_next_bindless_image = new string_to_uint_map;
 
       buffer_block_index = -1;
       if (var->is_in_buffer_block()) {
@@ -512,6 +521,9 @@
          process(var);
       }
       delete this->record_next_sampler;
+      delete this->record_next_bindless_sampler;
+      delete this->record_next_image;
+      delete this->record_next_bindless_image;
    }
 
    int buffer_block_index;
@@ -519,101 +531,152 @@
    gl_shader_stage shader_type;
 
 private:
+   bool set_opaque_indices(const glsl_type *base_type,
+                           struct gl_uniform_storage *uniform,
+                           const char *name, unsigned &next_index,
+                           struct string_to_uint_map *record_next_index)
+   {
+      assert(base_type->is_sampler() || base_type->is_image());
+
+      if (this->record_array_count > 1) {
+         unsigned inner_array_size = MAX2(1, uniform->array_elements);
+         char *name_copy = ralloc_strdup(NULL, name);
+
+         /* Remove all array subscripts from the sampler/image name */
+         char *str_start;
+         const char *str_end;
+         while((str_start = strchr(name_copy, '[')) &&
+               (str_end = strchr(name_copy, ']'))) {
+            memmove(str_start, str_end + 1, 1 + strlen(str_end + 1));
+         }
+
+         unsigned index = 0;
+         if (record_next_index->get(index, name_copy)) {
+            /* In this case, we've already seen this uniform so we just use the
+             * next sampler/image index recorded the last time we visited.
+             */
+            uniform->opaque[shader_type].index = index;
+            index = inner_array_size + uniform->opaque[shader_type].index;
+            record_next_index->put(index, name_copy);
+
+            ralloc_free(name_copy);
+            /* Return as everything else has already been initialised in a
+             * previous pass.
+             */
+            return false;
+         } else {
+            /* We've never seen this uniform before so we need to allocate
+             * enough indices to store it.
+             *
+             * Nested struct arrays behave like arrays of arrays so we need to
+             * increase the index by the total number of elements of the
+             * sampler/image in case there is more than one sampler/image
+             * inside the structs. This allows the offset to be easily
+             * calculated for indirect indexing.
+             */
+            uniform->opaque[shader_type].index = next_index;
+            next_index += inner_array_size * this->record_array_count;
+
+            /* Store the next index for future passes over the struct array
+             */
+            index = uniform->opaque[shader_type].index + inner_array_size;
+            record_next_index->put(index, name_copy);
+            ralloc_free(name_copy);
+         }
+      } else {
+         /* Increment the sampler/image index by 1 for non-arrays and by the
+          * number of array elements for arrays.
+          */
+         uniform->opaque[shader_type].index = next_index;
+         next_index += MAX2(1, uniform->array_elements);
+      }
+      return true;
+   }
+
    void handle_samplers(const glsl_type *base_type,
                         struct gl_uniform_storage *uniform, const char *name)
    {
       if (base_type->is_sampler()) {
          uniform->opaque[shader_type].active = true;
 
-         /* Handle multiple samplers inside struct arrays */
-         if (this->record_array_count > 1) {
-            unsigned inner_array_size = MAX2(1, uniform->array_elements);
-            char *name_copy = ralloc_strdup(NULL, name);
-
-            /* Remove all array subscripts from the sampler name */
-            char *str_start;
-            const char *str_end;
-            while((str_start = strchr(name_copy, '[')) &&
-                  (str_end = strchr(name_copy, ']'))) {
-               memmove(str_start, str_end + 1, 1 + strlen(str_end + 1));
-            }
-
-            unsigned index = 0;
-            if (this->record_next_sampler->get(index, name_copy)) {
-               /* In this case, we've already seen this uniform so we just use
-                * the next sampler index recorded the last time we visited.
-                */
-               uniform->opaque[shader_type].index = index;
-               index = inner_array_size + uniform->opaque[shader_type].index;
-               this->record_next_sampler->put(index, name_copy);
-
-               ralloc_free(name_copy);
-               /* Return as everything else has already been initialised in a
-                * previous pass.
-                */
-               return;
-            } else {
-               /* We've never seen this uniform before so we need to allocate
-                * enough indices to store it.
-                *
-                * Nested struct arrays behave like arrays of arrays so we need
-                * to increase the index by the total number of elements of the
-                * sampler in case there is more than one sampler inside the
-                * structs. This allows the offset to be easily calculated for
-                * indirect indexing.
-                */
-               uniform->opaque[shader_type].index = this->next_sampler;
-               this->next_sampler +=
-                  inner_array_size * this->record_array_count;
-
-               /* Store the next index for future passes over the struct array
-                */
-               index = uniform->opaque[shader_type].index + inner_array_size;
-               this->record_next_sampler->put(index, name_copy);
-               ralloc_free(name_copy);
-            }
-         } else {
-            /* Increment the sampler by 1 for non-arrays and by the number of
-             * array elements for arrays.
-             */
-            uniform->opaque[shader_type].index = this->next_sampler;
-            this->next_sampler += MAX2(1, uniform->array_elements);
-         }
-
          const gl_texture_index target = base_type->sampler_index();
          const unsigned shadow = base_type->sampler_shadow;
-         for (unsigned i = uniform->opaque[shader_type].index;
-              i < MIN2(this->next_sampler, MAX_SAMPLERS);
-              i++) {
-            this->targets[i] = target;
-            this->shader_samplers_used |= 1U << i;
-            this->shader_shadow_samplers |= shadow << i;
+
+         if (current_var->data.bindless) {
+            if (!set_opaque_indices(base_type, uniform, name,
+                                    this->next_bindless_sampler,
+                                    this->record_next_bindless_sampler))
+               return;
+
+            this->num_bindless_samplers = this->next_bindless_sampler;
+
+            this->bindless_targets = (gl_texture_index *)
+               realloc(this->bindless_targets,
+                       this->num_bindless_samplers * sizeof(gl_texture_index));
+
+            for (unsigned i = uniform->opaque[shader_type].index;
+                 i < this->num_bindless_samplers;
+                 i++) {
+               this->bindless_targets[i] = target;
+            }
+         } else {
+            if (!set_opaque_indices(base_type, uniform, name,
+                                    this->next_sampler,
+                                    this->record_next_sampler))
+               return;
+
+            for (unsigned i = uniform->opaque[shader_type].index;
+                 i < MIN2(this->next_sampler, MAX_SAMPLERS);
+                 i++) {
+               this->targets[i] = target;
+               this->shader_samplers_used |= 1U << i;
+               this->shader_shadow_samplers |= shadow << i;
+            }
          }
       }
    }
 
    void handle_images(const glsl_type *base_type,
-                      struct gl_uniform_storage *uniform)
+                      struct gl_uniform_storage *uniform, const char *name)
    {
       if (base_type->is_image()) {
-         uniform->opaque[shader_type].index = this->next_image;
          uniform->opaque[shader_type].active = true;
 
          /* Set image access qualifiers */
          const GLenum access =
-            (current_var->data.image_read_only ? GL_READ_ONLY :
-             current_var->data.image_write_only ? GL_WRITE_ONLY :
+            (current_var->data.memory_read_only ? GL_READ_ONLY :
+             current_var->data.memory_write_only ? GL_WRITE_ONLY :
                 GL_READ_WRITE);
 
-         const unsigned first = this->next_image;
+         if (current_var->data.bindless) {
+            if (!set_opaque_indices(base_type, uniform, name,
+                                    this->next_bindless_image,
+                                    this->record_next_bindless_image))
+               return;
 
-         /* Increment the image index by 1 for non-arrays and by the
-          * number of array elements for arrays.
-          */
-         this->next_image += MAX2(1, uniform->array_elements);
+            this->num_bindless_images = this->next_bindless_image;
 
-         for (unsigned i = first; i < MIN2(next_image, MAX_IMAGE_UNIFORMS); i++)
-            prog->_LinkedShaders[shader_type]->Program->sh.ImageAccess[i] = access;
+            this->bindless_access = (GLenum *)
+               realloc(this->bindless_access,
+                       this->num_bindless_images * sizeof(GLenum));
+
+            for (unsigned i = uniform->opaque[shader_type].index;
+                 i < this->num_bindless_images;
+                 i++) {
+               this->bindless_access[i] = access;
+            }
+         } else {
+            if (!set_opaque_indices(base_type, uniform, name,
+                                    this->next_image,
+                                    this->record_next_image))
+               return;
+
+            for (unsigned i = uniform->opaque[shader_type].index;
+                 i < MIN2(this->next_image, MAX_IMAGE_UNIFORMS);
+                 i++) {
+               prog->_LinkedShaders[shader_type]->Program->sh.ImageAccess[i] = access;
+            }
+         }
       }
    }
 
@@ -703,9 +766,11 @@
       this->uniforms[id].opaque[shader_type].index = ~0;
       this->uniforms[id].opaque[shader_type].active = false;
 
+      this->uniforms[id].active_shader_mask |= 1 << shader_type;
+
       /* This assigns uniform indices to sampler and image uniforms. */
       handle_samplers(base_type, &this->uniforms[id], name);
-      handle_images(base_type, &this->uniforms[id]);
+      handle_images(base_type, &this->uniforms[id], name);
       handle_subroutines(base_type, &this->uniforms[id]);
 
       /* For array of arrays or struct arrays the base location may have
@@ -753,6 +818,7 @@
 
       this->uniforms[id].is_shader_storage =
          current_var->is_in_shader_storage_block();
+      this->uniforms[id].is_bindless = current_var->data.bindless;
 
       /* Do not assign storage if the uniform is a builtin or buffer object */
       if (!this->uniforms[id].builtin &&
@@ -787,7 +853,7 @@
 
          if (type->without_array()->is_matrix()) {
             const glsl_type *matrix = type->without_array();
-            const unsigned N = matrix->base_type == GLSL_TYPE_DOUBLE ? 8 : 4;
+            const unsigned N = matrix->is_double() ? 8 : 4;
             const unsigned items =
                row_major ? matrix->matrix_columns : matrix->vector_elements;
 
@@ -813,7 +879,7 @@
       if (!this->uniforms[id].builtin &&
           !this->uniforms[id].is_shader_storage &&
           this->buffer_block_index == -1)
-         this->values += values_for_type(type);
+         this->values += type->component_slots();
    }
 
    /**
@@ -825,7 +891,9 @@
 
    struct gl_uniform_storage *uniforms;
    unsigned next_sampler;
+   unsigned next_bindless_sampler;
    unsigned next_image;
+   unsigned next_bindless_image;
    unsigned next_subroutine;
 
    /**
@@ -852,6 +920,21 @@
     */
    struct string_to_uint_map *record_next_sampler;
 
+   /* Map for temporarily storing next image index when handling images in
+    * struct arrays.
+    */
+   struct string_to_uint_map *record_next_image;
+
+   /* Map for temporarily storing next bindless sampler index when handling
+    * bindless samplers in struct arrays.
+    */
+   struct string_to_uint_map *record_next_bindless_sampler;
+
+   /* Map for temporarily storing next bindless image index when handling
+    * bindless images in struct arrays.
+    */
+   struct string_to_uint_map *record_next_bindless_image;
+
 public:
    union gl_constant_value *values;
 
@@ -866,6 +949,27 @@
     * Mask of samplers used by the current shader stage for shadows.
     */
    unsigned shader_shadow_samplers;
+
+   /**
+    * Number of bindless samplers used by the current shader stage.
+    */
+   unsigned num_bindless_samplers;
+
+   /**
+    * Texture targets for bindless samplers used by the current stage.
+    */
+   gl_texture_index *bindless_targets;
+
+   /**
+    * Number of bindless images used by the current shader stage.
+    */
+   unsigned num_bindless_images;
+
+   /**
+    * Access types for bindless images used by the current stage.
+    */
+   GLenum *bindless_access;
+
 };
 
 static bool
@@ -1232,12 +1336,14 @@
                                      prog->data->UniformStorage, data);
 
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
-      if (prog->_LinkedShaders[i] == NULL)
+      struct gl_linked_shader *shader = prog->_LinkedShaders[i];
+
+      if (!shader)
          continue;
 
       parcel.start_shader((gl_shader_stage)i);
 
-      foreach_in_list(ir_instruction, node, prog->_LinkedShaders[i]->ir) {
+      foreach_in_list(ir_instruction, node, shader->ir) {
          ir_variable *const var = node->as_variable();
 
          if ((var == NULL) || (var->data.mode != ir_var_uniform &&
@@ -1247,15 +1353,36 @@
          parcel.set_and_process(var);
       }
 
-      prog->_LinkedShaders[i]->Program->SamplersUsed =
-         parcel.shader_samplers_used;
-      prog->_LinkedShaders[i]->shadow_samplers = parcel.shader_shadow_samplers;
+      shader->Program->SamplersUsed = parcel.shader_samplers_used;
+      shader->shadow_samplers = parcel.shader_shadow_samplers;
 
-      STATIC_ASSERT(sizeof(prog->_LinkedShaders[i]->Program->sh.SamplerTargets) ==
+      if (parcel.num_bindless_samplers > 0) {
+         shader->Program->sh.NumBindlessSamplers = parcel.num_bindless_samplers;
+         shader->Program->sh.BindlessSamplers =
+            rzalloc_array(shader->Program, gl_bindless_sampler,
+                          parcel.num_bindless_samplers);
+         for (unsigned j = 0; j < parcel.num_bindless_samplers; j++) {
+            shader->Program->sh.BindlessSamplers[j].target =
+               parcel.bindless_targets[j];
+         }
+      }
+
+      if (parcel.num_bindless_images > 0) {
+         shader->Program->sh.NumBindlessImages = parcel.num_bindless_images;
+         shader->Program->sh.BindlessImages =
+            rzalloc_array(shader->Program, gl_bindless_image,
+                          parcel.num_bindless_images);
+         for (unsigned j = 0; j < parcel.num_bindless_images; j++) {
+            shader->Program->sh.BindlessImages[j].access =
+               parcel.bindless_access[j];
+         }
+      }
+
+      STATIC_ASSERT(sizeof(shader->Program->sh.SamplerTargets) ==
                     sizeof(parcel.targets));
-      memcpy(prog->_LinkedShaders[i]->Program->sh.SamplerTargets,
+      memcpy(shader->Program->sh.SamplerTargets,
              parcel.targets,
-             sizeof(prog->_LinkedShaders[i]->Program->sh.SamplerTargets));
+             sizeof(shader->Program->sh.SamplerTargets));
    }
 
    /* If this is a fallback compile for a cache miss we already have the
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index f0df3d6..17d5560 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -184,25 +184,6 @@
    return has_xfb_qualifiers;
 }
 
-static bool
-anonymous_struct_type_matches(const glsl_type *output_type,
-                              const glsl_type *to_match)
-{
-    while (output_type->is_array() && to_match->is_array()) {
-        /* if the lengths at each level don't match fail. */
-        if (output_type->length != to_match->length)
-            return false;
-        output_type = output_type->fields.array;
-        to_match = to_match->fields.array;
-    }
-
-    if (output_type->is_array() || to_match->is_array())
-        return false;
-    return output_type->is_anonymous() &&
-           to_match->is_anonymous() &&
-           to_match->record_compare(output_type);
-}
-
 /**
  * Validate the types and qualifiers of an output from one stage against the
  * matching input to another stage.
@@ -247,19 +228,15 @@
        *     fragment language."
        */
       if (!output->type->is_array() || !is_gl_identifier(output->name)) {
-         bool anon_matches = anonymous_struct_type_matches(output->type, type_to_match);
-
-         if (!anon_matches) {
-            linker_error(prog,
-                         "%s shader output `%s' declared as type `%s', "
-                         "but %s shader input declared as type `%s'\n",
-                         _mesa_shader_stage_to_string(producer_stage),
-                         output->name,
-                         output->type->name,
-                         _mesa_shader_stage_to_string(consumer_stage),
-                         input->type->name);
-            return;
-         }
+         linker_error(prog,
+                      "%s shader output `%s' declared as type `%s', "
+                      "but %s shader input declared as type `%s'\n",
+                      _mesa_shader_stage_to_string(producer_stage),
+                      output->name,
+                      output->type->name,
+                      _mesa_shader_stage_to_string(consumer_stage),
+                      input->type->name);
+         return;
       }
    }
 
@@ -1217,6 +1194,7 @@
 {
 public:
    varying_matches(bool disable_varying_packing, bool xfb_enabled,
+                   bool enhanced_layouts_enabled,
                    gl_shader_stage producer_stage,
                    gl_shader_stage consumer_stage);
    ~varying_matches();
@@ -1250,6 +1228,8 @@
     */
    const bool xfb_enabled;
 
+   const bool enhanced_layouts_enabled;
+
    /**
     * Enum representing the order in which varyings are packed within a
     * packing class.
@@ -1326,10 +1306,12 @@
 
 varying_matches::varying_matches(bool disable_varying_packing,
                                  bool xfb_enabled,
+                                 bool enhanced_layouts_enabled,
                                  gl_shader_stage producer_stage,
                                  gl_shader_stage consumer_stage)
    : disable_varying_packing(disable_varying_packing),
      xfb_enabled(xfb_enabled),
+     enhanced_layouts_enabled(enhanced_layouts_enabled),
      producer_stage(producer_stage),
      consumer_stage(consumer_stage)
 {
@@ -1407,7 +1389,7 @@
 
    if (!disable_varying_packing &&
        (needs_flat_qualifier ||
-        (consumer_stage != -1 && consumer_stage != MESA_SHADER_FRAGMENT))) {
+        (consumer_stage != MESA_SHADER_NONE && consumer_stage != MESA_SHADER_FRAGMENT))) {
       /* Since this varying is not being consumed by the fragment shader, its
        * interpolation type varying cannot possibly affect rendering.
        * Also, this variable is non-flat and is (or contains) an integer
@@ -1461,17 +1443,24 @@
       ? consumer_stage : producer_stage;
    const glsl_type *type = get_varying_type(var, stage);
 
+   if (producer_var && consumer_var &&
+       consumer_var->data.must_be_shader_input) {
+      producer_var->data.must_be_shader_input = 1;
+   }
+
    this->matches[this->num_matches].packing_class
       = this->compute_packing_class(var);
    this->matches[this->num_matches].packing_order
       = this->compute_packing_order(var);
-   if (this->disable_varying_packing && !is_varying_packing_safe(type, var)) {
+   if ((this->disable_varying_packing && !is_varying_packing_safe(type, var)) ||
+       var->data.must_be_shader_input) {
       unsigned slots = type->count_attribute_slots(false);
       this->matches[this->num_matches].num_components = slots * 4;
    } else {
       this->matches[this->num_matches].num_components
          = type->component_slots();
    }
+
    this->matches[this->num_matches].producer_var = producer_var;
    this->matches[this->num_matches].consumer_var = consumer_var;
    this->num_matches++;
@@ -1544,7 +1533,8 @@
        * we can pack varyings together that are only used for transform
        * feedback.
        */
-      if ((this->disable_varying_packing &&
+      if (var->data.must_be_shader_input ||
+          (this->disable_varying_packing &&
            !(previous_var_xfb_only && var->data.is_xfb_only)) ||
           (i > 0 && this->matches[i - 1].packing_class
           != this->matches[i].packing_class )) {
@@ -1614,6 +1604,12 @@
 void
 varying_matches::store_locations() const
 {
+   /* Check if location needs to be packed with lower_packed_varyings() or if
+    * we can just use ARB_enhanced_layouts packing.
+    */
+   bool pack_loc[MAX_VARYINGS_INCL_PATCH] = { 0 };
+   const glsl_type *loc_type[MAX_VARYINGS_INCL_PATCH][4] = { {NULL, NULL} };
+
    for (unsigned i = 0; i < this->num_matches; i++) {
       ir_variable *producer_var = this->matches[i].producer_var;
       ir_variable *consumer_var = this->matches[i].consumer_var;
@@ -1631,6 +1627,64 @@
          consumer_var->data.location = VARYING_SLOT_VAR0 + slot;
          consumer_var->data.location_frac = offset;
       }
+
+      /* Find locations suitable for native packing via
+       * ARB_enhanced_layouts.
+       */
+      if (producer_var && consumer_var) {
+         if (enhanced_layouts_enabled) {
+            const glsl_type *type =
+               get_varying_type(producer_var, producer_stage);
+            if (type->is_array() || type->is_matrix() || type->is_record() ||
+                type->is_double()) {
+               unsigned comp_slots = type->component_slots() + offset;
+               unsigned slots = comp_slots / 4;
+               if (comp_slots % 4)
+                  slots += 1;
+
+               for (unsigned j = 0; j < slots; j++) {
+                  pack_loc[slot + j] = true;
+               }
+            } else if (offset + type->vector_elements > 4) {
+               pack_loc[slot] = true;
+               pack_loc[slot + 1] = true;
+            } else {
+               loc_type[slot][offset] = type;
+            }
+         }
+      }
+   }
+
+   /* Attempt to use ARB_enhanced_layouts for more efficient packing if
+    * suitable.
+    */
+   if (enhanced_layouts_enabled) {
+      for (unsigned i = 0; i < this->num_matches; i++) {
+         ir_variable *producer_var = this->matches[i].producer_var;
+         ir_variable *consumer_var = this->matches[i].consumer_var;
+         unsigned generic_location = this->matches[i].generic_location;
+         unsigned slot = generic_location / 4;
+
+         if (pack_loc[slot] || !producer_var || !consumer_var)
+            continue;
+
+         const glsl_type *type =
+            get_varying_type(producer_var, producer_stage);
+         bool type_match = true;
+         for (unsigned j = 0; j < 4; j++) {
+            if (loc_type[slot][j]) {
+               if (type->base_type != loc_type[slot][j]->base_type)
+                  type_match = false;
+            }
+         }
+
+         if (type_match) {
+            producer_var->data.explicit_location = 1;
+            consumer_var->data.explicit_location = 1;
+            producer_var->data.explicit_component = 1;
+            consumer_var->data.explicit_component = 1;
+         }
+      }
    }
 }
 
@@ -1660,8 +1714,9 @@
     * Therefore, the packing class depends only on the interpolation type.
     */
    unsigned packing_class = var->data.centroid | (var->data.sample << 1) |
-                            (var->data.patch << 2);
-   packing_class *= 4;
+                            (var->data.patch << 2) |
+                            (var->data.must_be_shader_input << 3);
+   packing_class *= 8;
    packing_class += var->is_interpolation_flat()
       ? unsigned(INTERP_MODE_FLAT) : var->data.interpolation;
    return packing_class;
@@ -1678,7 +1733,7 @@
 {
    const glsl_type *element_type = var->type;
 
-   while (element_type->base_type == GLSL_TYPE_ARRAY) {
+   while (element_type->is_array()) {
       element_type = element_type->fields.array;
    }
 
@@ -2017,7 +2072,8 @@
       var_slot = var->data.location - VARYING_SLOT_VAR0;
 
       unsigned num_elements = get_varying_type(var, stage->Stage)
-         ->count_attribute_slots(stage->Stage == MESA_SHADER_VERTEX);
+         ->count_attribute_slots(io_mode == ir_var_shader_in &&
+                                 stage->Stage == MESA_SHADER_VERTEX);
       for (unsigned i = 0; i < num_elements; i++) {
          if (var_slot >= 0 && var_slot < MAX_VARYINGS_INCL_PATCH)
             slots |= UINT64_C(1) << var_slot;
@@ -2091,8 +2147,9 @@
       disable_varying_packing = true;
 
    varying_matches matches(disable_varying_packing, xfb_enabled,
-                           producer ? producer->Stage : (gl_shader_stage)-1,
-                           consumer ? consumer->Stage : (gl_shader_stage)-1);
+                           ctx->Extensions.ARB_enhanced_layouts,
+                           producer ? producer->Stage : MESA_SHADER_NONE,
+                           consumer ? consumer->Stage : MESA_SHADER_NONE);
    hash_table *tfeedback_candidates =
          _mesa_hash_table_create(NULL, _mesa_key_hash_string,
                                  _mesa_key_string_equal);
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 97eca76..c7a7fc5 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -75,7 +75,7 @@
 #include "program/program.h"
 #include "util/mesa-sha1.h"
 #include "util/set.h"
-#include "util/string_to_uint_map.h"
+#include "string_to_uint_map.h"
 #include "linker.h"
 #include "link_varyings.h"
 #include "ir_optimization.h"
@@ -90,27 +90,31 @@
 
 namespace {
 
+struct find_variable {   /* Search target for find_assignment_visitor. */
+   const char *name;     /**< Name of the variable to search for. */
+   bool found;           /**< Set once a write to the variable is seen. */
+
+   find_variable(const char *name) : name(name), found(false) {}
+};
+
 /**
  * Visitor that determines whether or not a variable is ever written.
+ *
+ * Use \ref find_assignments for convenience.
  */
 class find_assignment_visitor : public ir_hierarchical_visitor {
 public:
-   find_assignment_visitor(const char *name)
-      : name(name), found(false)
+   find_assignment_visitor(unsigned num_vars,
+                           find_variable * const *vars)
+      : num_variables(num_vars), num_found(0), variables(vars)
    {
-      /* empty */
    }
 
    virtual ir_visitor_status visit_enter(ir_assignment *ir)
    {
       ir_variable *const var = ir->lhs->variable_referenced();
 
-      if (strcmp(name, var->name) == 0) {
-         found = true;
-         return visit_stop;
-      }
-
-      return visit_continue_with_parent;
+      return check_variable_name(var->name);
    }
 
    virtual ir_visitor_status visit_enter(ir_call *ir)
@@ -123,35 +127,71 @@
          if (sig_param->data.mode == ir_var_function_out ||
              sig_param->data.mode == ir_var_function_inout) {
             ir_variable *var = param_rval->variable_referenced();
-            if (var && strcmp(name, var->name) == 0) {
-               found = true;
+            if (var && check_variable_name(var->name) == visit_stop)
                return visit_stop;
-            }
          }
       }
 
       if (ir->return_deref != NULL) {
          ir_variable *const var = ir->return_deref->variable_referenced();
 
-         if (strcmp(name, var->name) == 0) {
-            found = true;
+         if (check_variable_name(var->name) == visit_stop)
             return visit_stop;
+      }
+
+      return visit_continue_with_parent;
+   }
+
+private:
+   ir_visitor_status check_variable_name(const char *name)
+   {
+      for (unsigned i = 0; i < num_variables; ++i) {
+         if (strcmp(variables[i]->name, name) == 0) {
+            if (!variables[i]->found) {
+               variables[i]->found = true;
+
+               assert(num_found < num_variables);
+               if (++num_found == num_variables)
+                  return visit_stop;
+            }
+            break;
          }
       }
 
       return visit_continue_with_parent;
    }
 
-   bool variable_found()
-   {
-      return found;
-   }
-
 private:
-   const char *name;       /**< Find writes to a variable with this name. */
-   bool found;             /**< Was a write to the variable found? */
+   unsigned num_variables;           /**< Number of variables to find */
+   unsigned num_found;               /**< Number of variables already found */
+   find_variable * const *variables; /**< Variables to find */
 };
 
+/**
+ * For each entry of a NULL-terminated list of variables, record in its
+ * \c found flag whether the variable is ever written to in \p ir.
+ */
+static void
+find_assignments(exec_list *ir, find_variable * const *vars)
+{
+   /* Count the entries that precede the terminating NULL pointer. */
+   unsigned count = 0;
+   while (vars[count] != NULL)
+      count++;
+
+   find_assignment_visitor finder(count, vars);
+   finder.run(ir);
+}
+
+/**
+ * Single-variable convenience form: sets \c var->found if \p ir writes it.
+ */
+static void
+find_assignments(exec_list *ir, find_variable *var)
+{
+   find_variable * const single[] = { var };
+   find_assignment_visitor(1, single).run(ir);
+}
 
 /**
  * Visitor that determines whether or not a variable is ever read.
@@ -567,11 +607,16 @@
        * gl_ClipVertex nor gl_ClipDistance. However with
        * GL_EXT_clip_cull_distance, this functionality is exposed in ES 3.0.
        */
-      find_assignment_visitor clip_distance("gl_ClipDistance");
-      find_assignment_visitor cull_distance("gl_CullDistance");
-
-      clip_distance.run(shader->ir);
-      cull_distance.run(shader->ir);
+      find_variable gl_ClipDistance("gl_ClipDistance");
+      find_variable gl_CullDistance("gl_CullDistance");
+      find_variable gl_ClipVertex("gl_ClipVertex");
+      find_variable * const variables[] = {
+         &gl_ClipDistance,
+         &gl_CullDistance,
+         !prog->IsES ? &gl_ClipVertex : NULL,
+         NULL
+      };
+      find_assignments(shader->ir, variables);
 
       /* From the ARB_cull_distance spec:
        *
@@ -583,17 +628,13 @@
        * gl_ClipVertex.
        */
       if (!prog->IsES) {
-         find_assignment_visitor clip_vertex("gl_ClipVertex");
-
-         clip_vertex.run(shader->ir);
-
-         if (clip_vertex.variable_found() && clip_distance.variable_found()) {
+         if (gl_ClipVertex.found && gl_ClipDistance.found) {
             linker_error(prog, "%s shader writes to both `gl_ClipVertex' "
                          "and `gl_ClipDistance'\n",
                          _mesa_shader_stage_to_string(shader->Stage));
             return;
          }
-         if (clip_vertex.variable_found() && cull_distance.variable_found()) {
+         if (gl_ClipVertex.found && gl_CullDistance.found) {
             linker_error(prog, "%s shader writes to both `gl_ClipVertex' "
                          "and `gl_CullDistance'\n",
                          _mesa_shader_stage_to_string(shader->Stage));
@@ -601,13 +642,13 @@
          }
       }
 
-      if (clip_distance.variable_found()) {
+      if (gl_ClipDistance.found) {
          ir_variable *clip_distance_var =
                 shader->symbols->get_variable("gl_ClipDistance");
          assert(clip_distance_var);
          *clip_distance_array_size = clip_distance_var->type->length;
       }
-      if (cull_distance.variable_found()) {
+      if (gl_CullDistance.found) {
          ir_variable *cull_distance_var =
                 shader->symbols->get_variable("gl_CullDistance");
          assert(cull_distance_var);
@@ -676,9 +717,9 @@
     * gl_Position is not an error.
     */
    if (prog->data->Version < (prog->IsES ? 300 : 140)) {
-      find_assignment_visitor find("gl_Position");
-      find.run(shader->ir);
-      if (!find.variable_found()) {
+      find_variable gl_Position("gl_Position");
+      find_assignments(shader->ir, &gl_Position);
+      if (!gl_Position.found) {
         if (prog->IsES) {
           linker_warning(prog,
                          "vertex shader does not write to `gl_Position'. "
@@ -722,13 +763,12 @@
    if (shader == NULL)
       return;
 
-   find_assignment_visitor frag_color("gl_FragColor");
-   find_assignment_visitor frag_data("gl_FragData");
+   find_variable gl_FragColor("gl_FragColor");
+   find_variable gl_FragData("gl_FragData");
+   find_variable * const variables[] = { &gl_FragColor, &gl_FragData, NULL };
+   find_assignments(shader->ir, variables);
 
-   frag_color.run(shader->ir);
-   frag_data.run(shader->ir);
-
-   if (frag_color.variable_found() && frag_data.variable_found()) {
+   if (gl_FragColor.found && gl_FragData.found) {
       linker_error(prog,  "fragment shader writes to both "
                    "`gl_FragColor' and `gl_FragData'\n");
    }
@@ -850,14 +890,6 @@
             }
             return true;
          }
-      } else {
-         /* The arrays of structs could have different glsl_type pointers but
-          * they are actually the same type. Use record_compare() to check that.
-          */
-         if (existing->type->fields.array->is_record() &&
-             var->type->fields.array->is_record() &&
-             existing->type->fields.array->record_compare(var->type->fields.array))
-            return true;
       }
    }
    return false;
@@ -907,28 +939,23 @@
          /* Check if types match. */
          if (var->type != existing->type) {
             if (!validate_intrastage_arrays(prog, var, existing)) {
-               if (var->type->is_record() && existing->type->is_record()
-                   && existing->type->record_compare(var->type)) {
-                   existing->type = var->type;
-               } else {
-                  /* If it is an unsized array in a Shader Storage Block,
-                   * two different shaders can access to different elements.
-                   * Because of that, they might be converted to different
-                   * sized arrays, then check that they are compatible but
-                   * ignore the array size.
-                   */
-                  if (!(var->data.mode == ir_var_shader_storage &&
-                        var->data.from_ssbo_unsized_array &&
-                        existing->data.mode == ir_var_shader_storage &&
-                        existing->data.from_ssbo_unsized_array &&
-                        var->type->gl_type == existing->type->gl_type)) {
-                     linker_error(prog, "%s `%s' declared as type "
-                                  "`%s' and type `%s'\n",
-                                  mode_string(var),
-                                  var->name, var->type->name,
-                                  existing->type->name);
-                     return;
-                  }
+               /* If it is an unsized array in a Shader Storage Block,
+                * two different shaders can access to different elements.
+                * Because of that, they might be converted to different
+                * sized arrays, then check that they are compatible but
+                * ignore the array size.
+                */
+               if (!(var->data.mode == ir_var_shader_storage &&
+                     var->data.from_ssbo_unsized_array &&
+                     existing->data.mode == ir_var_shader_storage &&
+                     existing->data.from_ssbo_unsized_array &&
+                     var->type->gl_type == existing->type->gl_type)) {
+                  linker_error(prog, "%s `%s' declared as type "
+                                 "`%s' and type `%s'\n",
+                                 mode_string(var),
+                                 var->name, var->type->name,
+                                 existing->type->name);
+                  return;
                }
             }
          }
@@ -1101,10 +1128,16 @@
          if (prog->IsES && (prog->data->Version != 310 ||
                             !var->get_interface_type()) &&
              existing->data.precision != var->data.precision) {
-            linker_error(prog, "declarations for %s `%s` have "
-                         "mismatching precision qualifiers\n",
-                         mode_string(var), var->name);
-            return;
+            if ((existing->data.used && var->data.used) || prog->data->Version >= 300) {
+               linker_error(prog, "declarations for %s `%s` have "
+                            "mismatching precision qualifiers\n",
+                            mode_string(var), var->name);
+               return;
+            } else {
+               linker_warning(prog, "declarations for %s `%s` have "
+                              "mismatching precision qualifiers\n",
+                              mode_string(var), var->name);
+            }
          }
       } else
          variables->add_variable(var);
@@ -1665,6 +1698,49 @@
 }
 
 /**
+ * Check for conflicting bindless/bound sampler/image layout qualifiers at
+ * global scope.
+ */
+static void
+link_bindless_layout_qualifiers(struct gl_shader_program *prog,
+                                struct gl_program *gl_prog,
+                                struct gl_shader **shader_list,
+                                unsigned num_shaders)
+{
+   /* Accumulated over the shaders visited so far; gl_prog is unused here. */
+   bool bindless_sampler = false, bindless_image = false;
+   bool bound_sampler = false, bound_image = false;
+
+   for (unsigned i = 0; i < num_shaders; i++) {
+      struct gl_shader *shader = shader_list[i];
+
+      if (shader->bindless_sampler)
+         bindless_sampler = true;
+      if (shader->bindless_image)
+         bindless_image = true;
+      if (shader->bound_sampler)
+         bound_sampler = true;
+      if (shader->bound_image)
+         bound_image = true;
+
+      if ((bindless_sampler && bound_sampler) ||
+          (bindless_image && bound_image)) {
+         /* From section 4.4.6 of the ARB_bindless_texture spec:
+          *
+          *     "If both bindless_sampler and bound_sampler, or bindless_image
+          *      and bound_image, are declared at global scope in any
+          *      compilation unit, a link-time error will be generated."
+          */
+         linker_error(prog, "both bindless_sampler and bound_sampler, or "
+                      "bindless_image and bound_image, can't be declared at "
+                      "global scope\n");
+         /* The flags never reset, so report the conflict only once. */
+         return;
+      }
+   }
+}
+
+/**
  * Performs the cross-validation of tessellation control shader vertices and
  * layout qualifiers for the attached tessellation control shaders,
  * and propagates them to the linked TCS and linked shader program.
@@ -2226,6 +2302,8 @@
       link_xfb_stride_layout_qualifiers(ctx, prog, linked, shader_list,
                                         num_shaders);
 
+   link_bindless_layout_qualifiers(prog, gl_prog, shader_list, num_shaders);
+
    populate_symbol_table(linked);
 
    /* The pointer to the main function in the final linked shader (i.e., the
@@ -2589,12 +2667,14 @@
    } to_assign[32];
    assert(max_index <= 32);
 
-   /* Temporary array for the set of attributes that have locations assigned.
+   /* Temporary array for the set of attributes that have locations assigned,
+    * for the purpose of checking overlapping slots/components of (non-ES)
+    * fragment shader outputs.
     */
-   ir_variable *assigned[16];
+   ir_variable *assigned[12 * 4]; /* (max # of FS outputs) * # components */
+   unsigned assigned_attr = 0;
 
    unsigned num_attr = 0;
-   unsigned assigned_attr = 0;
 
    foreach_in_list(ir_instruction, node, sh->ir) {
       ir_variable *const var = node->as_variable();
@@ -2833,6 +2913,18 @@
                }
             }
 
+            if (target_index == MESA_SHADER_FRAGMENT && !prog->IsES) {
+               /* Only track assigned variables for non-ES fragment shaders
+                * to avoid overflowing the array.
+                *
+                * At most one variable per fragment output component should
+                * reach this.
+                */
+               assert(assigned_attr < ARRAY_SIZE(assigned));
+               assigned[assigned_attr] = var;
+               assigned_attr++;
+            }
+
             used_locations |= (use_mask << attr);
 
             /* From the GL 4.5 core spec, section 11.1.1 (Vertex Attributes):
@@ -2859,9 +2951,6 @@
                double_storage_locations |= (use_mask << attr);
          }
 
-         assigned[assigned_attr] = var;
-         assigned_attr++;
-
          continue;
       }
 
@@ -3660,7 +3749,10 @@
                        bool use_implicit_location, int location,
                        const glsl_type *outermost_struct_type)
 {
-   gl_shader_variable *out = ralloc(shProg, struct gl_shader_variable);
+   /* Allocate zero-initialized memory to ensure that bitfield padding
+    * is zero.
+    */
+   gl_shader_variable *out = rzalloc(shProg, struct gl_shader_variable);
    if (!out)
       return NULL;
 
@@ -3704,8 +3796,7 @@
     *        qualifier, except for vertex shader inputs and fragment shader
     *        outputs."
     */
-   if (in->type->base_type == GLSL_TYPE_ATOMIC_UINT ||
-       is_gl_identifier(in->name) ||
+   if (in->type->is_atomic_uint() || is_gl_identifier(in->name) ||
        !(in->data.explicit_location || use_implicit_location)) {
       out->location = -1;
    } else {
@@ -4486,24 +4577,17 @@
       if (!sh)
          continue;
 
-      if (first == last) {
-         /* For a single shader program only allow inputs to the vertex shader
-          * and outputs from the fragment shader to be removed.
-          */
-         if (stage != MESA_SHADER_VERTEX)
-            set_always_active_io(sh->ir, ir_var_shader_in);
-         if (stage != MESA_SHADER_FRAGMENT)
-            set_always_active_io(sh->ir, ir_var_shader_out);
-      } else {
-         /* For multi-stage separate shader programs only allow inputs and
-          * outputs between the shader stages to be removed as well as inputs
-          * to the vertex shader and outputs from the fragment shader.
-          */
-         if (stage == first && stage != MESA_SHADER_VERTEX)
-            set_always_active_io(sh->ir, ir_var_shader_in);
-         else if (stage == last && stage != MESA_SHADER_FRAGMENT)
-            set_always_active_io(sh->ir, ir_var_shader_out);
-      }
+      /* Prevent the removal of inputs to the first and outputs from the last
+       * stage, unless they are the initial pipeline inputs or final pipeline
+       * outputs, respectively.
+       *
+       * The removal of IO between shaders in the same program is always
+       * allowed.
+       */
+      if (stage == first && stage != MESA_SHADER_VERTEX)
+         set_always_active_io(sh->ir, ir_var_shader_in);
+      if (stage == last && stage != MESA_SHADER_FRAGMENT)
+         set_always_active_io(sh->ir, ir_var_shader_out);
    }
 }
 
diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h
index d06f419..dd627be 100644
--- a/src/compiler/glsl/linker.h
+++ b/src/compiler/glsl/linker.h
@@ -75,9 +75,6 @@
 validate_interstage_uniform_blocks(struct gl_shader_program *prog,
                                    gl_linked_shader **stages);
 
-unsigned
-values_for_type(const glsl_type *type);
-
 extern void
 link_assign_atomic_counter_resources(struct gl_context *ctx,
                                      struct gl_shader_program *prog);
diff --git a/src/compiler/glsl/lower_buffer_access.cpp b/src/compiler/glsl/lower_buffer_access.cpp
index 51e6560..24a96e2 100644
--- a/src/compiler/glsl/lower_buffer_access.cpp
+++ b/src/compiler/glsl/lower_buffer_access.cpp
@@ -164,8 +164,8 @@
       /* We're dereffing a column out of a row-major matrix, so we
        * gather the vector from each stored row.
       */
-      assert(deref->type->base_type == GLSL_TYPE_FLOAT ||
-             deref->type->base_type == GLSL_TYPE_DOUBLE);
+      assert(deref->type->is_float() || deref->type->is_double());
+
       /* Matrices, row_major or not, are stored as if they were
        * arrays of vectors of the appropriate size in std140.
        * Arrays have their strides rounded up to a vec4, so the
@@ -199,7 +199,7 @@
       else
          matrix_stride = glsl_align(matrix_columns * N, 16);
 
-      const glsl_type *deref_type = deref->type->base_type == GLSL_TYPE_FLOAT ?
+      const glsl_type *deref_type = deref->type->is_float() ?
          glsl_type::float_type : glsl_type::double_type;
 
       for (unsigned i = 0; i < deref->type->vector_elements; i++) {
diff --git a/src/compiler/glsl/lower_instructions.cpp b/src/compiler/glsl/lower_instructions.cpp
index 697bb84..423d599 100644
--- a/src/compiler/glsl/lower_instructions.cpp
+++ b/src/compiler/glsl/lower_instructions.cpp
@@ -358,13 +358,21 @@
     * into
     *
     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
-    *    resulting_biased_exp = extracted_biased_exp + exp;
+    *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
     *
-    *    if (resulting_biased_exp < 1 || x == 0.0f) {
-    *       return copysign(0.0, x);
+    *    if (extracted_biased_exp >= 255)
+    *       return x; // +/-inf, NaN
+    *
+    *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
+    *
+    *    if (min(resulting_biased_exp, extracted_biased_exp) < 1)
+    *       resulting_biased_exp = 0;
+    *    if (resulting_biased_exp >= 255 ||
+    *        min(resulting_biased_exp, extracted_biased_exp) < 1) {
+    *       sign_mantissa &= sign_mask;
     *    }
     *
-    *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
+    *    return bitcast_u2f(sign_mantissa |
     *                       lshift(i2u(resulting_biased_exp), exp_shift));
     *
     * which we can't actually implement as such, since the GLSL IR doesn't
@@ -372,45 +380,58 @@
     * using conditional-select:
     *
     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
-    *    resulting_biased_exp = extracted_biased_exp + exp;
+    *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
     *
-    *    is_not_zero_or_underflow = logic_and(nequal(x, 0.0f),
-    *                                         gequal(resulting_biased_exp, 1);
-    *    x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
-    *    resulting_biased_exp = csel(is_not_zero_or_underflow,
-    *                                resulting_biased_exp, 0);
+    *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
     *
-    *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
-    *                       lshift(i2u(resulting_biased_exp), exp_shift));
+    *    flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0);
+    *    resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp);
+    *    zero_mantissa = logic_or(flush_to_zero,
+    *                             gequal(resulting_biased_exp, 255));
+    *    sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa);
+    *
+    *    result = sign_mantissa |
+    *             lshift(i2u(resulting_biased_exp), exp_shift);
+    *
+    *    return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result));
+    *
+    * The definition of ldexp in the GLSL spec says:
+    *
+    *    "If this product is too large to be represented in the
+    *     floating-point type, the result is undefined."
+    *
+    * However, the definition of ldexp in the GLSL ES spec does not contain
+    * this sentence, so we do need to handle overflow correctly.
+    *
+    * There is additional language limiting the defined range of exp, but this
+    * is merely to allow implementations that store 2^exp in a temporary
+    * variable.
     */
 
    const unsigned vec_elem = ir->type->vector_elements;
 
    /* Types */
    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
+   const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
 
-   /* Constants */
-   ir_constant *zeroi = ir_constant::zero(ir, ivec);
-
-   ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
-
-   ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
-
    /* Temporary variables */
    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
-
-   ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
-                                                  ir_var_temporary);
+   ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary);
 
    ir_variable *extracted_biased_exp =
       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
    ir_variable *resulting_biased_exp =
       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
 
-   ir_variable *is_not_zero_or_underflow =
-      new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
+   ir_variable *sign_mantissa =
+      new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary);
+
+   ir_variable *flush_to_zero =
+      new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary);
+   ir_variable *zero_mantissa =
+      new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary);
 
    ir_instruction &i = *base_ir;
 
@@ -423,58 +444,82 @@
    /* Extract the biased exponent from <x>. */
    i.insert_before(extracted_biased_exp);
    i.insert_before(assign(extracted_biased_exp,
-                          rshift(bitcast_f2i(abs(x)), exp_shift)));
+                          rshift(bitcast_f2i(abs(x)),
+                                 new(ir) ir_constant(23, vec_elem))));
 
+   /* The definition of ldexp in the GLSL 4.60 spec says:
+    *
+    *    "If exp is greater than +128 (single-precision) or +1024
+    *     (double-precision), the value returned is undefined. If exp is less
+    *     than -126 (single-precision) or -1022 (double-precision), the value
+    *     returned may be flushed to zero."
+    *
+    * So we do not have to guard against the possibility of addition overflow,
+    * which could happen when exp is close to INT_MAX. Addition underflow
+    * cannot happen (the worst case is 0 + INT_MIN, which is representable).
+    */
    i.insert_before(resulting_biased_exp);
    i.insert_before(assign(resulting_biased_exp,
-                          add(extracted_biased_exp, exp)));
+                          min2(add(extracted_biased_exp, exp),
+                               new(ir) ir_constant(255, vec_elem))));
 
-   /* Test if result is ±0.0, subnormal, or underflow by checking if the
-    * resulting biased exponent would be less than 0x1. If so, the result is
-    * 0.0 with the sign of x. (Actually, invert the conditions so that
-    * immediate values are the second arguments, which is better for i965)
-    */
-   i.insert_before(zero_sign_x);
-   i.insert_before(assign(zero_sign_x,
-                          bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));
+   i.insert_before(sign_mantissa);
+   i.insert_before(assign(sign_mantissa,
+                          bit_and(bitcast_f2u(x),
+                                  new(ir) ir_constant(0x807fffffu, vec_elem))));
 
-   i.insert_before(is_not_zero_or_underflow);
-   i.insert_before(assign(is_not_zero_or_underflow,
-                          logic_and(nequal(x, new(ir) ir_constant(0.0f, vec_elem)),
-                                    gequal(resulting_biased_exp,
-                                           new(ir) ir_constant(0x1, vec_elem)))));
-   i.insert_before(assign(x, csel(is_not_zero_or_underflow,
-                                  x, zero_sign_x)));
-   i.insert_before(assign(resulting_biased_exp,
-                          csel(is_not_zero_or_underflow,
-                               resulting_biased_exp, zeroi)));
-
-   /* We could test for overflows by checking if the resulting biased exponent
-    * would be greater than 0xFE. Turns out we don't need to because the GLSL
-    * spec says:
+   /* We flush to zero if the original or resulting biased exponent is <= 0,
+    * indicating a +/-0.0 or subnormal input, or an underflowing output.
     *
-    *    "If this product is too large to be represented in the
-    *     floating-point type, the result is undefined."
+    * The mantissa is set to 0 if the resulting biased exponent is 255, since
+    * an overflow should produce a +/-inf result.
+    *
+    * Note that NaN inputs are handled separately.
     */
+   i.insert_before(flush_to_zero);
+   i.insert_before(assign(flush_to_zero,
+                          lequal(min2(resulting_biased_exp,
+                                      extracted_biased_exp),
+                                 ir_constant::zero(ir, ivec))));
+   i.insert_before(assign(resulting_biased_exp,
+                          csel(flush_to_zero,
+                               ir_constant::zero(ir, ivec),
+                               resulting_biased_exp)));
 
-   ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
+   i.insert_before(zero_mantissa);
+   i.insert_before(assign(zero_mantissa,
+                          logic_or(flush_to_zero,
+                                   equal(resulting_biased_exp,
+                                         new(ir) ir_constant(255, vec_elem)))));
+   i.insert_before(assign(sign_mantissa,
+                          csel(zero_mantissa,
+                               bit_and(sign_mantissa,
+                                       new(ir) ir_constant(0x80000000u, vec_elem)),
+                               sign_mantissa)));
 
    /* Don't generate new IR that would need to be lowered in an additional
     * pass.
     */
+   i.insert_before(result);
    if (!lowering(INSERT_TO_SHIFTS)) {
-      ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
-      ir->operation = ir_unop_bitcast_i2f;
-      ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
-                                        exp_shift_clone, exp_width);
-      ir->operands[1] = NULL;
+      i.insert_before(assign(result,
+                             bitfield_insert(sign_mantissa,
+                                             i2u(resulting_biased_exp),
+                                             new(ir) ir_constant(23u, vec_elem),
+                                             new(ir) ir_constant(8u, vec_elem))));
    } else {
-      ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x807fffffu, vec_elem);
-      ir->operation = ir_unop_bitcast_u2f;
-      ir->operands[0] = bit_or(bit_and(bitcast_f2u(x), sign_mantissa_mask),
-                               lshift(i2u(resulting_biased_exp), exp_shift_clone));
+      i.insert_before(assign(result,
+                             bit_or(sign_mantissa,
+                                    lshift(i2u(resulting_biased_exp),
+                                           new(ir) ir_constant(23, vec_elem)))));
    }
 
+   ir->operation = ir_triop_csel;
+   ir->operands[0] = gequal(extracted_biased_exp,
+                            new(ir) ir_constant(255, vec_elem));
+   ir->operands[1] = new(ir) ir_dereference_variable(x);
+   ir->operands[2] = bitcast_u2f(result);
+
    this->progress = true;
 }
 
diff --git a/src/compiler/glsl/lower_int64.cpp b/src/compiler/glsl/lower_int64.cpp
index 0a19ce4..9770d31 100644
--- a/src/compiler/glsl/lower_int64.cpp
+++ b/src/compiler/glsl/lower_int64.cpp
@@ -128,12 +128,6 @@
 
 } /* anonymous namespace */
 
-static bool
-is_integer_64(const glsl_type *t)
-{
-   return t->base_type == GLSL_TYPE_UINT64 || t->base_type == GLSL_TYPE_INT64;
-}
-
 /**
  * Determine if a particular type of lowering should occur
  */
@@ -208,8 +202,7 @@
                            ir_rvalue *val,
                            ir_variable **expanded_src)
 {
-   assert(val->type->base_type == GLSL_TYPE_UINT64 ||
-          val->type->base_type == GLSL_TYPE_INT64);
+   assert(val->type->is_integer_64());
 
    ir_variable *const temp = body.make_temp(val->type, "tmp");
 
@@ -327,7 +320,7 @@
                                function_generator generator)
 {
    for (unsigned i = 0; i < ir->get_num_operands(); i++)
-      if (!is_integer_64(ir->operands[i]->type))
+      if (!ir->operands[i]->type->is_integer_64())
          return ir;
 
    /* Get a handle to the correct ir_function_signature for the core
@@ -348,6 +341,7 @@
       add_function(f);
    }
 
+   this->progress = true;
    return lower_op_to_function_call(this->base_ir, ir, callee);
 }
 
@@ -364,7 +358,6 @@
    case ir_unop_sign:
       if (lowering(SIGN64)) {
          *rvalue = handle_op(ir, "__builtin_sign64", generate_ir::sign64);
-         this->progress = true;
       }
       break;
 
@@ -375,7 +368,6 @@
          } else {
             *rvalue = handle_op(ir, "__builtin_idiv64", generate_ir::idiv64);
          }
-         this->progress = true;
       }
       break;
 
@@ -386,14 +378,12 @@
          } else {
             *rvalue = handle_op(ir, "__builtin_imod64", generate_ir::imod64);
          }
-         this->progress = true;
       }
       break;
 
    case ir_binop_mul:
       if (lowering(MUL64)) {
          *rvalue = handle_op(ir, "__builtin_umul64", generate_ir::umul64);
-         this->progress = true;
       }
       break;
 
diff --git a/src/compiler/glsl/lower_packed_varyings.cpp b/src/compiler/glsl/lower_packed_varyings.cpp
index 13f7e5b..1aec7ee 100644
--- a/src/compiler/glsl/lower_packed_varyings.cpp
+++ b/src/compiler/glsl/lower_packed_varyings.cpp
@@ -383,6 +383,12 @@
             rhs = u2i(expr(ir_unop_unpack_uint_2x32, rhs));
          }
          break;
+      case GLSL_TYPE_SAMPLER:
+         rhs = u2i(expr(ir_unop_unpack_sampler_2x32, rhs));
+         break;
+      case GLSL_TYPE_IMAGE:
+         rhs = u2i(expr(ir_unop_unpack_image_2x32, rhs));
+         break;
       default:
          assert(!"Unexpected type conversion while lowering varyings");
          break;
@@ -462,6 +468,14 @@
             rhs = expr(ir_unop_pack_uint_2x32, i2u(rhs));
          }
          break;
+      case GLSL_TYPE_SAMPLER:
+         rhs = new(mem_ctx)
+            ir_expression(ir_unop_pack_sampler_2x32, lhs->type, i2u(rhs));
+         break;
+      case GLSL_TYPE_IMAGE:
+         rhs = new(mem_ctx)
+            ir_expression(ir_unop_pack_image_2x32, lhs->type, i2u(rhs));
+         break;
       default:
          assert(!"Unexpected type conversion while lowering varyings");
          break;
@@ -742,10 +756,11 @@
 bool
 lower_packed_varyings_visitor::needs_lowering(ir_variable *var)
 {
-   /* Things composed of vec4's and varyings with explicitly assigned
-    * locations don't need lowering.  Everything else does.
+   /* Things composed of vec4's, varyings with explicitly assigned
+    * locations or varyings marked as must_be_shader_input (which might be used
+    * by interpolateAt* functions) shouldn't be lowered. Everything else can be.
     */
-   if (var->data.explicit_location)
+   if (var->data.explicit_location || var->data.must_be_shader_input)
       return false;
 
    /* Override disable_varying_packing if the var is only used by transform
diff --git a/src/compiler/glsl/lower_ubo_reference.cpp b/src/compiler/glsl/lower_ubo_reference.cpp
index bfaddac..163c25e 100644
--- a/src/compiler/glsl/lower_ubo_reference.cpp
+++ b/src/compiler/glsl/lower_ubo_reference.cpp
@@ -104,6 +104,7 @@
    ir_call *lower_ssbo_atomic_intrinsic(ir_call *ir);
    ir_call *check_for_ssbo_atomic_intrinsic(ir_call *ir);
    ir_visitor_status visit_enter(ir_call *ir);
+   ir_visitor_status visit_enter(ir_texture *ir);
 
    struct gl_linked_shader *shader;
    bool clamp_block_indices;
@@ -411,13 +412,13 @@
    if (variable->is_interface_instance()) {
       assert(struct_field);
 
-      return ((struct_field->image_coherent ? ACCESS_COHERENT : 0) |
-              (struct_field->image_restrict ? ACCESS_RESTRICT : 0) |
-              (struct_field->image_volatile ? ACCESS_VOLATILE : 0));
+      return ((struct_field->memory_coherent ? ACCESS_COHERENT : 0) |
+              (struct_field->memory_restrict ? ACCESS_RESTRICT : 0) |
+              (struct_field->memory_volatile ? ACCESS_VOLATILE : 0));
    } else {
-      return ((variable->data.image_coherent ? ACCESS_COHERENT : 0) |
-              (variable->data.image_restrict ? ACCESS_RESTRICT : 0) |
-              (variable->data.image_volatile ? ACCESS_VOLATILE : 0));
+      return ((variable->data.memory_coherent ? ACCESS_COHERENT : 0) |
+              (variable->data.memory_restrict ? ACCESS_RESTRICT : 0) |
+              (variable->data.memory_volatile ? ACCESS_VOLATILE : 0));
    }
 }
 
@@ -892,7 +893,7 @@
    if (!lhs_deref)
       return false;
 
-   assert(lhs_deref->type->record_compare(rhs_deref->type));
+   assert(lhs_deref->type == rhs_deref->type);
    void *mem_ctx = ralloc_parent(shader->ir);
 
    for (unsigned i = 0; i < lhs_deref->type->length; i++) {
@@ -1090,6 +1091,20 @@
 }
 
 
+ir_visitor_status
+lower_ubo_reference_visitor::visit_enter(ir_texture *ir)
+{
+   ir_dereference *sampler = ir->sampler;
+
+   if (sampler->ir_type == ir_type_dereference_record) {
+      handle_rvalue((ir_rvalue **)&ir->sampler);
+      return visit_continue_with_parent;
+   }
+
+   return rvalue_visit(ir);
+}
+
+
 } /* unnamed namespace */
 
 void
diff --git a/src/compiler/glsl/opt_algebraic.cpp b/src/compiler/glsl/opt_algebraic.cpp
index 0ec3315..b44ab59 100644
--- a/src/compiler/glsl/opt_algebraic.cpp
+++ b/src/compiler/glsl/opt_algebraic.cpp
@@ -144,7 +144,7 @@
 static inline bool
 is_less_than_one(ir_constant *ir)
 {
-   assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+   assert(ir->type->is_float());
 
    if (!is_valid_vec_const(ir))
       return false;
@@ -161,7 +161,7 @@
 static inline bool
 is_greater_than_zero(ir_constant *ir)
 {
-   assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+   assert(ir->type->is_float());
 
    if (!is_valid_vec_const(ir))
       return false;
@@ -246,7 +246,7 @@
 /**
  * Reassociates a constant down a tree of adds or multiplies.
  *
- * Consider (2 * (a * (b * 0.5))).  We want to send up with a * b.
+ * Consider (2 * (a * (b * 0.5))).  We want to end up with a * b.
  */
 bool
 ir_algebraic_visitor::reassociate_constant(ir_expression *ir1, int const_index,
@@ -313,7 +313,6 @@
 {
    ir_constant *op_const[4] = {NULL, NULL, NULL, NULL};
    ir_expression *op_expr[4] = {NULL, NULL, NULL, NULL};
-   unsigned int i;
 
    if (ir->operation == ir_binop_mul &&
        ir->operands[0]->type->is_matrix() &&
@@ -330,7 +329,7 @@
    }
 
    assert(ir->get_num_operands() <= 4);
-   for (i = 0; i < ir->get_num_operands(); i++) {
+   for (unsigned i = 0; i < ir->get_num_operands(); i++) {
       if (ir->operands[i]->type->is_matrix())
 	 return ir;
 
@@ -649,8 +648,7 @@
 
    case ir_binop_div:
       if (is_vec_one(op_const[0]) && (
-                ir->type->base_type == GLSL_TYPE_FLOAT ||
-                ir->type->base_type == GLSL_TYPE_DOUBLE)) {
+                ir->type->is_float() || ir->type->is_double())) {
 	 return new(mem_ctx) ir_expression(ir_unop_rcp,
 					   ir->operands[1]->type,
 					   ir->operands[1],
@@ -845,7 +843,7 @@
 
    case ir_binop_min:
    case ir_binop_max:
-      if (ir->type->base_type != GLSL_TYPE_FLOAT || options->EmitNoSat)
+      if (!ir->type->is_float() || options->EmitNoSat)
          break;
 
       /* Replace min(max) operations and its commutative combinations with
diff --git a/src/compiler/glsl/opt_array_splitting.cpp b/src/compiler/glsl/opt_array_splitting.cpp
index e3073b0..d2e81665 100644
--- a/src/compiler/glsl/opt_array_splitting.cpp
+++ b/src/compiler/glsl/opt_array_splitting.cpp
@@ -140,6 +140,29 @@
    if (var->type->is_unsized_array())
       return NULL;
 
+   /* FIXME: arrays of arrays are not handled correctly by this pass so we
+    * skip it for now. While the pass will create functioning code it actually
+    * produces worse code.
+    *
+    * For example the array:
+    *
+    *    int[3][2] a;
+    *
+    * ends up being split up into:
+    *
+    *    int[3][2] a_0;
+    *    int[3][2] a_1;
+    *    int[3][2] a_2;
+    *
+    * And we end up referencing each of these new arrays for example:
+    *
+    *    a[0][1] will be turned into a_0[0][1]
+    *    a[1][0] will be turned into a_1[1][0]
+    *    a[2][0] will be turned into a_2[2][0]
+    */
+   if (var->type->is_array() && var->type->fields.array->is_array())
+      return NULL;
+
    foreach_in_list(variable_entry, entry, &this->variable_list) {
       if (entry->var == var)
          return entry;
@@ -449,9 +472,20 @@
       for (unsigned int i = 0; i < entry->size; i++) {
          const char *name = ralloc_asprintf(mem_ctx, "%s_%d",
                                             entry->var->name, i);
-
-         entry->components[i] =
+         ir_variable *new_var =
             new(entry->mem_ctx) ir_variable(subtype, name, ir_var_temporary);
+
+         /* Do not lose memory/format qualifiers when arrays of images are
+          * split.
+          */
+         new_var->data.memory_read_only = entry->var->data.memory_read_only;
+         new_var->data.memory_write_only = entry->var->data.memory_write_only;
+         new_var->data.memory_coherent = entry->var->data.memory_coherent;
+         new_var->data.memory_volatile = entry->var->data.memory_volatile;
+         new_var->data.memory_restrict = entry->var->data.memory_restrict;
+         new_var->data.image_format = entry->var->data.image_format;
+
+         entry->components[i] = new_var;
          entry->var->insert_before(entry->components[i]);
       }
 
diff --git a/src/compiler/glsl/opt_constant_propagation.cpp b/src/compiler/glsl/opt_constant_propagation.cpp
index 4039512..c5baf98 100644
--- a/src/compiler/glsl/opt_constant_propagation.cpp
+++ b/src/compiler/glsl/opt_constant_propagation.cpp
@@ -237,6 +237,12 @@
       case GLSL_TYPE_BOOL:
 	 data.b[i] = found->constant->value.b[rhs_channel];
 	 break;
+      case GLSL_TYPE_UINT64:
+	 data.u64[i] = found->constant->value.u64[rhs_channel];
+	 break;
+      case GLSL_TYPE_INT64:
+	 data.i64[i] = found->constant->value.i64[rhs_channel];
+	 break;
       default:
 	 assert(!"not reached");
 	 break;
diff --git a/src/compiler/glsl/opt_structure_splitting.cpp b/src/compiler/glsl/opt_structure_splitting.cpp
index f4c129e..8439430 100644
--- a/src/compiler/glsl/opt_structure_splitting.cpp
+++ b/src/compiler/glsl/opt_structure_splitting.cpp
@@ -316,13 +316,13 @@
    /* Trim out variables we can't split. */
    foreach_in_list_safe(variable_entry, entry, &refs.variable_list) {
       if (debug) {
-	 printf("structure %s@%p: decl %d, whole_access %d\n",
-		entry->var->name, (void *) entry->var, entry->declaration,
-		entry->whole_structure_access);
+         printf("structure %s@%p: decl %d, whole_access %d\n",
+                entry->var->name, (void *) entry->var, entry->declaration,
+                entry->whole_structure_access);
       }
 
       if (!entry->declaration || entry->whole_structure_access) {
-	 entry->remove();
+         entry->remove();
       }
    }
 
@@ -339,20 +339,36 @@
 
       entry->mem_ctx = ralloc_parent(entry->var);
 
-      entry->components = ralloc_array(mem_ctx,
-				       ir_variable *,
-				       type->length);
+      entry->components = ralloc_array(mem_ctx, ir_variable *, type->length);
 
       for (unsigned int i = 0; i < entry->var->type->length; i++) {
-	 const char *name = ralloc_asprintf(mem_ctx, "%s_%s",
-					    entry->var->name,
-					    type->fields.structure[i].name);
+         const char *name = ralloc_asprintf(mem_ctx, "%s_%s", entry->var->name,
+                                            type->fields.structure[i].name);
+         ir_variable *new_var =
+            new(entry->mem_ctx) ir_variable(type->fields.structure[i].type,
+                                            name,
+                                            (ir_variable_mode) entry->var->data.mode);
 
-	 entry->components[i] =
-	    new(entry->mem_ctx) ir_variable(type->fields.structure[i].type,
-					    name,
-					    (ir_variable_mode) entry->var->data.mode);
-	 entry->var->insert_before(entry->components[i]);
+         if (type->fields.structure[i].type->without_array()->is_image()) {
+            /* Do not lose memory/format qualifiers for images declared inside
+             * structures as allowed by ARB_bindless_texture.
+             */
+            new_var->data.memory_read_only =
+               type->fields.structure[i].memory_read_only;
+            new_var->data.memory_write_only =
+               type->fields.structure[i].memory_write_only;
+            new_var->data.memory_coherent =
+               type->fields.structure[i].memory_coherent;
+            new_var->data.memory_volatile =
+               type->fields.structure[i].memory_volatile;
+            new_var->data.memory_restrict =
+               type->fields.structure[i].memory_restrict;
+            new_var->data.image_format =
+               type->fields.structure[i].image_format;
+         }
+
+         entry->components[i] = new_var;
+         entry->var->insert_before(entry->components[i]);
       }
 
       entry->var->remove();
diff --git a/src/compiler/glsl/opt_tree_grafting.cpp b/src/compiler/glsl/opt_tree_grafting.cpp
index 28b6e18..b0a1604 100644
--- a/src/compiler/glsl/opt_tree_grafting.cpp
+++ b/src/compiler/glsl/opt_tree_grafting.cpp
@@ -371,6 +371,17 @@
       if (lhs_var->data.precise)
          continue;
 
+      /* Do not graft sampler and image variables. This is a workaround to
+       * st/glsl_to_tgsi being unable to handle expression parameters to image
+       * intrinsics.
+       *
+       * Note that if this is ever fixed, we still need to skip grafting when
+       * any image layout qualifiers (including the image format) are set,
+       * since we must not lose those.
+       */
+      if (lhs_var->type->is_sampler() || lhs_var->type->is_image())
+         continue;
+
       ir_variable_refcount_entry *entry = info->refs->get_variable_entry(lhs_var);
 
       if (!entry->declaration ||
diff --git a/src/compiler/glsl/shader_cache.cpp b/src/compiler/glsl/shader_cache.cpp
index 1da32d3..05fd953 100644
--- a/src/compiler/glsl/shader_cache.cpp
+++ b/src/compiler/glsl/shader_cache.cpp
@@ -59,7 +59,7 @@
 #include "program.h"
 #include "shader_cache.h"
 #include "util/mesa-sha1.h"
-#include "util/string_to_uint_map.h"
+#include "string_to_uint_map.h"
 
 extern "C" {
 #include "main/enums.h"
@@ -75,10 +75,25 @@
 }
 
 static void
+get_struct_type_field_and_pointer_sizes(size_t *s_field_size,
+                                        size_t *s_field_ptrs)
+{
+   *s_field_size = sizeof(glsl_struct_field);
+   *s_field_ptrs =
+     sizeof(((glsl_struct_field *)0)->type) +
+     sizeof(((glsl_struct_field *)0)->name);
+}
+
+static void
 encode_type_to_blob(struct blob *blob, const glsl_type *type)
 {
    uint32_t encoding;
 
+   if (!type) {
+      blob_write_uint32(blob, 0);
+      return;
+   }
+
    switch (type->base_type) {
    case GLSL_TYPE_UINT:
    case GLSL_TYPE_INT:
@@ -122,14 +137,21 @@
       blob_write_uint32(blob, (type->base_type) << 24);
       blob_write_string(blob, type->name);
       blob_write_uint32(blob, type->length);
-      blob_write_bytes(blob, type->fields.structure,
-                       sizeof(glsl_struct_field) * type->length);
+
+      size_t s_field_size, s_field_ptrs;
+      get_struct_type_field_and_pointer_sizes(&s_field_size, &s_field_ptrs);
+
       for (unsigned i = 0; i < type->length; i++) {
          encode_type_to_blob(blob, type->fields.structure[i].type);
          blob_write_string(blob, type->fields.structure[i].name);
+
+         /* Write the struct field skipping the pointers */
+         blob_write_bytes(blob,
+                          ((char *)&type->fields.structure[i]) + s_field_ptrs,
+                          s_field_size - s_field_ptrs);
       }
 
-      if (type->base_type == GLSL_TYPE_INTERFACE) {
+      if (type->is_interface()) {
          blob_write_uint32(blob, type->interface_packing);
          blob_write_uint32(blob, type->interface_row_major);
       }
@@ -149,6 +171,11 @@
 decode_type_from_blob(struct blob_reader *blob)
 {
    uint32_t u = blob_read_uint32(blob);
+
+   if (u == 0) {
+      return NULL;
+   }
+
    glsl_base_type base_type = (glsl_base_type) (u >> 24);
 
    switch (base_type) {
@@ -182,22 +209,33 @@
    case GLSL_TYPE_INTERFACE: {
       char *name = blob_read_string(blob);
       unsigned num_fields = blob_read_uint32(blob);
-      glsl_struct_field *fields = (glsl_struct_field *)
-         blob_read_bytes(blob, sizeof(glsl_struct_field) * num_fields);
+
+      size_t s_field_size, s_field_ptrs;
+      get_struct_type_field_and_pointer_sizes(&s_field_size, &s_field_ptrs);
+
+      glsl_struct_field *fields =
+         (glsl_struct_field *) malloc(s_field_size * num_fields);
       for (unsigned i = 0; i < num_fields; i++) {
          fields[i].type = decode_type_from_blob(blob);
          fields[i].name = blob_read_string(blob);
+
+         blob_copy_bytes(blob, ((uint8_t *) &fields[i]) + s_field_ptrs,
+                         s_field_size - s_field_ptrs);
       }
 
+      const glsl_type *t;
       if (base_type == GLSL_TYPE_INTERFACE) {
          enum glsl_interface_packing packing =
             (glsl_interface_packing) blob_read_uint32(blob);
          bool row_major = blob_read_uint32(blob);
-         return glsl_type::get_interface_instance(fields, num_fields,
-                                                  packing, row_major, name);
+         t = glsl_type::get_interface_instance(fields, num_fields, packing,
+                                               row_major, name);
       } else {
-         return glsl_type::get_record_instance(fields, num_fields, name);
+         t = glsl_type::get_record_instance(fields, num_fields, name);
       }
+
+      free(fields);
+      return t;
    }
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
@@ -555,6 +593,17 @@
                       MAX_FEEDBACK_BUFFERS);
 }
 
+static bool
+has_uniform_storage(struct gl_shader_program *prog, unsigned idx)
+{
+   if (!prog->data->UniformStorage[idx].builtin &&
+       !prog->data->UniformStorage[idx].is_shader_storage &&
+       prog->data->UniformStorage[idx].block_index == -1)
+      return true;
+
+   return false;
+}
+
 static void
 write_uniforms(struct blob *metadata, struct gl_shader_program *prog)
 {
@@ -566,8 +615,6 @@
       encode_type_to_blob(metadata, prog->data->UniformStorage[i].type);
       blob_write_uint32(metadata, prog->data->UniformStorage[i].array_elements);
       blob_write_string(metadata, prog->data->UniformStorage[i].name);
-      blob_write_uint32(metadata, prog->data->UniformStorage[i].storage -
-                                  prog->data->UniformDataSlots);
       blob_write_uint32(metadata, prog->data->UniformStorage[i].builtin);
       blob_write_uint32(metadata, prog->data->UniformStorage[i].remap_location);
       blob_write_uint32(metadata, prog->data->UniformStorage[i].block_index);
@@ -576,14 +623,22 @@
       blob_write_uint32(metadata, prog->data->UniformStorage[i].array_stride);
       blob_write_uint32(metadata, prog->data->UniformStorage[i].hidden);
       blob_write_uint32(metadata, prog->data->UniformStorage[i].is_shader_storage);
+      blob_write_uint32(metadata, prog->data->UniformStorage[i].active_shader_mask);
       blob_write_uint32(metadata, prog->data->UniformStorage[i].matrix_stride);
       blob_write_uint32(metadata, prog->data->UniformStorage[i].row_major);
+      blob_write_uint32(metadata, prog->data->UniformStorage[i].is_bindless);
       blob_write_uint32(metadata,
                         prog->data->UniformStorage[i].num_compatible_subroutines);
       blob_write_uint32(metadata,
                         prog->data->UniformStorage[i].top_level_array_size);
       blob_write_uint32(metadata,
                         prog->data->UniformStorage[i].top_level_array_stride);
+
+     if (has_uniform_storage(prog, i)) {
+         blob_write_uint32(metadata, prog->data->UniformStorage[i].storage -
+                                     prog->data->UniformDataSlots);
+      }
+
       blob_write_bytes(metadata, prog->data->UniformStorage[i].opaque,
                        sizeof(prog->data->UniformStorage[i].opaque));
    }
@@ -595,11 +650,9 @@
     */
    blob_write_uint32(metadata, prog->data->NumHiddenUniforms);
    for (unsigned i = 0; i < prog->data->NumUniformStorage; i++) {
-      if (!prog->data->UniformStorage[i].builtin &&
-          !prog->data->UniformStorage[i].is_shader_storage &&
-          prog->data->UniformStorage[i].block_index == -1) {
+      if (has_uniform_storage(prog, i)) {
          unsigned vec_size =
-            values_for_type(prog->data->UniformStorage[i].type) *
+            prog->data->UniformStorage[i].type->component_slots() *
             MAX2(prog->data->UniformStorage[i].array_elements, 1);
          blob_write_bytes(metadata, prog->data->UniformStorage[i].storage,
                           sizeof(union gl_constant_value) * vec_size);
@@ -631,7 +684,6 @@
       uniforms[i].type = decode_type_from_blob(metadata);
       uniforms[i].array_elements = blob_read_uint32(metadata);
       uniforms[i].name = ralloc_strdup(prog, blob_read_string (metadata));
-      uniforms[i].storage = data + blob_read_uint32(metadata);
       uniforms[i].builtin = blob_read_uint32(metadata);
       uniforms[i].remap_location = blob_read_uint32(metadata);
       uniforms[i].block_index = blob_read_uint32(metadata);
@@ -640,13 +692,19 @@
       uniforms[i].array_stride = blob_read_uint32(metadata);
       uniforms[i].hidden = blob_read_uint32(metadata);
       uniforms[i].is_shader_storage = blob_read_uint32(metadata);
+      uniforms[i].active_shader_mask = blob_read_uint32(metadata);
       uniforms[i].matrix_stride = blob_read_uint32(metadata);
       uniforms[i].row_major = blob_read_uint32(metadata);
+      uniforms[i].is_bindless = blob_read_uint32(metadata);
       uniforms[i].num_compatible_subroutines = blob_read_uint32(metadata);
       uniforms[i].top_level_array_size = blob_read_uint32(metadata);
       uniforms[i].top_level_array_stride = blob_read_uint32(metadata);
       prog->UniformHash->put(i, uniforms[i].name);
 
+      if (has_uniform_storage(prog, i)) {
+         uniforms[i].storage = data + blob_read_uint32(metadata);
+      }
+
       memcpy(uniforms[i].opaque,
              blob_read_bytes(metadata, sizeof(uniforms[i].opaque)),
              sizeof(uniforms[i].opaque));
@@ -655,11 +713,9 @@
    /* Restore uniform values. */
    prog->data->NumHiddenUniforms = blob_read_uint32(metadata);
    for (unsigned i = 0; i < prog->data->NumUniformStorage; i++) {
-      if (!prog->data->UniformStorage[i].builtin &&
-          !prog->data->UniformStorage[i].is_shader_storage &&
-          prog->data->UniformStorage[i].block_index == -1) {
+      if (has_uniform_storage(prog, i)) {
          unsigned vec_size =
-            values_for_type(prog->data->UniformStorage[i].type) *
+            prog->data->UniformStorage[i].type->component_slots() *
             MAX2(prog->data->UniformStorage[i].array_elements, 1);
          blob_copy_bytes(metadata,
                          (uint8_t *) prog->data->UniformStorage[i].storage,
@@ -864,6 +920,18 @@
 }
 
 static void
+get_shader_var_and_pointer_sizes(size_t *s_var_size, size_t *s_var_ptrs,
+                                 const gl_shader_variable *var)
+{
+   /* Report the full struct size and the combined size of the four pointer
+    * members; callers use the latter as the byte offset at which the
+    * non-pointer remainder of the struct begins. */
+   *s_var_ptrs = sizeof(var->type) + sizeof(var->interface_type) +
+                 sizeof(var->outermost_struct_type) + sizeof(var->name);
+   *s_var_size = sizeof(gl_shader_variable);
+}
+
+static void
 write_program_resource_data(struct blob *metadata,
                             struct gl_shader_program *prog,
                             struct gl_program_resource *res)
@@ -874,16 +942,19 @@
    case GL_PROGRAM_INPUT:
    case GL_PROGRAM_OUTPUT: {
       const gl_shader_variable *var = (gl_shader_variable *)res->Data;
-      blob_write_bytes(metadata, var, sizeof(gl_shader_variable));
+
       encode_type_to_blob(metadata, var->type);
-
-      if (var->interface_type)
-         encode_type_to_blob(metadata, var->interface_type);
-
-      if (var->outermost_struct_type)
-         encode_type_to_blob(metadata, var->outermost_struct_type);
+      encode_type_to_blob(metadata, var->interface_type);
+      encode_type_to_blob(metadata, var->outermost_struct_type);
 
       blob_write_string(metadata, var->name);
+
+      size_t s_var_size, s_var_ptrs;
+      get_shader_var_and_pointer_sizes(&s_var_size, &s_var_ptrs, var);
+
+      /* Write gl_shader_variable skipping over the pointers */
+      blob_write_bytes(metadata, ((char *)var) + s_var_ptrs,
+                       s_var_size - s_var_ptrs);
       break;
    }
    case GL_UNIFORM_BLOCK:
@@ -974,17 +1045,18 @@
    case GL_PROGRAM_OUTPUT: {
       gl_shader_variable *var = ralloc(prog, struct gl_shader_variable);
 
-      blob_copy_bytes(metadata, (uint8_t *) var, sizeof(gl_shader_variable));
       var->type = decode_type_from_blob(metadata);
-
-      if (var->interface_type)
-         var->interface_type = decode_type_from_blob(metadata);
-
-      if (var->outermost_struct_type)
-         var->outermost_struct_type = decode_type_from_blob(metadata);
+      var->interface_type = decode_type_from_blob(metadata);
+      var->outermost_struct_type = decode_type_from_blob(metadata);
 
       var->name = ralloc_strdup(prog, blob_read_string(metadata));
 
+      size_t s_var_size, s_var_ptrs;
+      get_shader_var_and_pointer_sizes(&s_var_size, &s_var_ptrs, var);
+
+      blob_copy_bytes(metadata, ((uint8_t *) var) + s_var_ptrs,
+                      s_var_size - s_var_ptrs);
+
       res->Data = var;
       break;
    }
@@ -1127,6 +1199,7 @@
 {
    assert(shader->Program);
    struct gl_program *glprog = shader->Program;
+   unsigned i;
 
    blob_write_bytes(metadata, glprog->TexturesUsed,
                     sizeof(glprog->TexturesUsed));
@@ -1143,6 +1216,22 @@
    blob_write_bytes(metadata, glprog->sh.ImageUnits,
                     sizeof(glprog->sh.ImageUnits));
 
+   size_t ptr_size = sizeof(GLvoid *);
+
+   blob_write_uint32(metadata, glprog->sh.NumBindlessSamplers);
+   blob_write_uint32(metadata, glprog->sh.HasBoundBindlessSampler);
+   for (i = 0; i < glprog->sh.NumBindlessSamplers; i++) {
+      blob_write_bytes(metadata, &glprog->sh.BindlessSamplers[i],
+                       sizeof(struct gl_bindless_sampler) - ptr_size);
+   }
+
+   blob_write_uint32(metadata, glprog->sh.NumBindlessImages);
+   blob_write_uint32(metadata, glprog->sh.HasBoundBindlessImage);
+   for (i = 0; i < glprog->sh.NumBindlessImages; i++) {
+      blob_write_bytes(metadata, &glprog->sh.BindlessImages[i],
+                       sizeof(struct gl_bindless_image) - ptr_size);
+   }
+
    write_shader_parameters(metadata, glprog->Parameters);
 }
 
@@ -1151,6 +1240,8 @@
                      struct gl_program *glprog,
                      gl_linked_shader *linked)
 {
+   unsigned i;
+
    blob_copy_bytes(metadata, (uint8_t *) glprog->TexturesUsed,
                    sizeof(glprog->TexturesUsed));
    glprog->SamplersUsed = blob_read_uint64(metadata);
@@ -1166,6 +1257,34 @@
    blob_copy_bytes(metadata, (uint8_t *) glprog->sh.ImageUnits,
                    sizeof(glprog->sh.ImageUnits));
 
+   size_t ptr_size = sizeof(GLvoid *);
+
+   glprog->sh.NumBindlessSamplers = blob_read_uint32(metadata);
+   glprog->sh.HasBoundBindlessSampler = blob_read_uint32(metadata);
+   if (glprog->sh.NumBindlessSamplers > 0) {
+      glprog->sh.BindlessSamplers =
+         rzalloc_array(glprog, gl_bindless_sampler,
+                       glprog->sh.NumBindlessSamplers);
+
+      for (i = 0; i < glprog->sh.NumBindlessSamplers; i++) {
+         blob_copy_bytes(metadata, (uint8_t *) &glprog->sh.BindlessSamplers[i],
+                         sizeof(struct gl_bindless_sampler) - ptr_size);
+      }
+   }
+
+   glprog->sh.NumBindlessImages = blob_read_uint32(metadata);
+   glprog->sh.HasBoundBindlessImage = blob_read_uint32(metadata);
+   if (glprog->sh.NumBindlessImages > 0) {
+      glprog->sh.BindlessImages =
+         rzalloc_array(glprog, gl_bindless_image,
+                       glprog->sh.NumBindlessImages);
+
+      for (i = 0; i < glprog->sh.NumBindlessImages; i++) {
+         blob_copy_bytes(metadata, (uint8_t *) &glprog->sh.BindlessImages[i],
+                        sizeof(struct gl_bindless_image) - ptr_size);
+      }
+   }
+
    glprog->Parameters = _mesa_new_parameter_list();
    read_shader_parameters(metadata, glprog->Parameters);
 }
@@ -1178,6 +1297,14 @@
 }
 
 static void
+get_shader_info_and_pointer_sizes(size_t *s_info_size, size_t *s_info_ptrs,
+                                  const shader_info *info)
+{
+   *s_info_size = sizeof(shader_info); /* whole struct */
+   *s_info_ptrs = sizeof(info->name) + sizeof(info->label); /* ptrs skipped */
+}
+
+static void
 create_linked_shader_and_program(struct gl_context *ctx,
                                  gl_shader_stage stage,
                                  struct gl_shader_program *prog,
@@ -1195,12 +1322,16 @@
 
    read_shader_metadata(metadata, glprog, linked);
 
+   glprog->info.name = ralloc_strdup(glprog, blob_read_string(metadata));
+   glprog->info.label = ralloc_strdup(glprog, blob_read_string(metadata));
+
+   size_t s_info_size, s_info_ptrs;
+   get_shader_info_and_pointer_sizes(&s_info_size, &s_info_ptrs,
+                                     &glprog->info);
+
    /* Restore shader info */
-   blob_copy_bytes(metadata, (uint8_t *) &glprog->info, sizeof(shader_info));
-   if (glprog->info.name)
-      glprog->info.name = ralloc_strdup(glprog, blob_read_string(metadata));
-   if (glprog->info.label)
-      glprog->info.label = ralloc_strdup(glprog, blob_read_string(metadata));
+   blob_copy_bytes(metadata, ((uint8_t *) &glprog->info) + s_info_ptrs,
+                   s_info_size - s_info_ptrs);
 
    _mesa_reference_shader_program_data(ctx, &glprog->sh.data, prog->data);
    _mesa_reference_program(ctx, &linked->Program, glprog);
@@ -1239,14 +1370,24 @@
       if (sh) {
          write_shader_metadata(metadata, sh);
 
-         /* Store nir shader info */
-         blob_write_bytes(metadata, &sh->Program->info, sizeof(shader_info));
-
          if (sh->Program->info.name)
             blob_write_string(metadata, sh->Program->info.name);
+         else
+            blob_write_string(metadata, "");
 
          if (sh->Program->info.label)
             blob_write_string(metadata, sh->Program->info.label);
+         else
+            blob_write_string(metadata, "");
+
+         size_t s_info_size, s_info_ptrs;
+         get_shader_info_and_pointer_sizes(&s_info_size, &s_info_ptrs,
+                                           &sh->Program->info);
+
+         /* Store shader info */
+         blob_write_bytes(metadata,
+                          ((char *) &sh->Program->info) + s_info_ptrs,
+                          s_info_size - s_info_ptrs);
       }
    }
 
diff --git a/src/compiler/glsl/standalone.cpp b/src/compiler/glsl/standalone.cpp
index 52554bb..7a84ca7 100644
--- a/src/compiler/glsl/standalone.cpp
+++ b/src/compiler/glsl/standalone.cpp
@@ -36,7 +36,7 @@
 #include "loop_analysis.h"
 #include "standalone_scaffolding.h"
 #include "standalone.h"
-#include "util/string_to_uint_map.h"
+#include "string_to_uint_map.h"
 #include "util/set.h"
 #include "linker.h"
 #include "glsl_parser_extras.h"
diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp
index 5f1b2d0..bc91682 100644
--- a/src/compiler/glsl/standalone_scaffolding.cpp
+++ b/src/compiler/glsl/standalone_scaffolding.cpp
@@ -183,7 +183,6 @@
    ctx->Extensions.ARB_shader_bit_encoding = true;
    ctx->Extensions.ARB_shader_draw_parameters = true;
    ctx->Extensions.ARB_shader_stencil_export = true;
-   ctx->Extensions.ARB_shader_subroutine = true;
    ctx->Extensions.ARB_shader_texture_lod = true;
    ctx->Extensions.ARB_shading_language_420pack = true;
    ctx->Extensions.ARB_shading_language_packing = true;
@@ -196,6 +195,7 @@
    ctx->Extensions.ARB_uniform_buffer_object = true;
    ctx->Extensions.ARB_viewport_array = true;
    ctx->Extensions.ARB_cull_distance = true;
+   ctx->Extensions.ARB_bindless_texture = true;
 
    ctx->Extensions.OES_EGL_image_external = true;
    ctx->Extensions.OES_standard_derivatives = true;
diff --git a/src/util/string_to_uint_map.cpp b/src/compiler/glsl/string_to_uint_map.cpp
similarity index 100%
rename from src/util/string_to_uint_map.cpp
rename to src/compiler/glsl/string_to_uint_map.cpp
diff --git a/src/util/string_to_uint_map.h b/src/compiler/glsl/string_to_uint_map.h
similarity index 100%
rename from src/util/string_to_uint_map.h
rename to src/compiler/glsl/string_to_uint_map.h
diff --git a/src/compiler/glsl/tests/cache_test.c b/src/compiler/glsl/tests/cache_test.c
index bec1d24..af1b66f 100644
--- a/src/compiler/glsl/tests/cache_test.c
+++ b/src/compiler/glsl/tests/cache_test.c
@@ -159,7 +159,7 @@
     * MESA_GLSL_CACHE_DISABLE set, that disk_cache_create returns NULL.
     */
    setenv("MESA_GLSL_CACHE_DISABLE", "1", 1);
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
    expect_null(cache, "disk_cache_create with MESA_GLSL_CACHE_DISABLE set");
 
    unsetenv("MESA_GLSL_CACHE_DISABLE");
@@ -170,19 +170,19 @@
    unsetenv("MESA_GLSL_CACHE_DIR");
    unsetenv("XDG_CACHE_HOME");
 
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
    expect_non_null(cache, "disk_cache_create with no environment variables");
 
    disk_cache_destroy(cache);
 
    /* Test with XDG_CACHE_HOME set */
    setenv("XDG_CACHE_HOME", CACHE_TEST_TMP "/xdg-cache-home", 1);
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
    expect_null(cache, "disk_cache_create with XDG_CACHE_HOME set with"
                "a non-existing parent directory");
 
    mkdir(CACHE_TEST_TMP, 0755);
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
    expect_non_null(cache, "disk_cache_create with XDG_CACHE_HOME set");
 
    check_directories_created(CACHE_TEST_TMP "/xdg-cache-home/mesa");
@@ -194,12 +194,12 @@
    expect_equal(err, 0, "Removing " CACHE_TEST_TMP);
 
    setenv("MESA_GLSL_CACHE_DIR", CACHE_TEST_TMP "/mesa-glsl-cache-dir", 1);
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
    expect_null(cache, "disk_cache_create with MESA_GLSL_CACHE_DIR set with"
                "a non-existing parent directory");
 
    mkdir(CACHE_TEST_TMP, 0755);
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
    expect_non_null(cache, "disk_cache_create with MESA_GLSL_CACHE_DIR set");
 
    check_directories_created(CACHE_TEST_TMP "/mesa-glsl-cache-dir/mesa");
@@ -256,7 +256,7 @@
    uint8_t one_KB_key[20], one_MB_key[20];
    int count;
 
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
 
    disk_cache_compute_key(cache, blob, sizeof(blob), blob_key);
 
@@ -298,7 +298,7 @@
    disk_cache_destroy(cache);
 
    setenv("MESA_GLSL_CACHE_MAX_SIZE", "1K", 1);
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
 
    one_KB = calloc(1, 1024);
 
@@ -363,7 +363,7 @@
    disk_cache_destroy(cache);
 
    setenv("MESA_GLSL_CACHE_MAX_SIZE", "1M", 1);
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
 
    disk_cache_put(cache, blob_key, blob, sizeof(blob));
    disk_cache_put(cache, string_key, string, sizeof(string));
@@ -438,7 +438,7 @@
                         { 0,  1, 42, 43, 44, 45, 46, 47, 48, 49,
                          50, 55, 52, 53, 54, 55, 56, 57, 58, 59};
 
-   cache = disk_cache_create("test", "make_check");
+   cache = disk_cache_create("test", "make_check", 0);
 
    /* First test that disk_cache_has_key returns false before disk_cache_put_key */
    result = disk_cache_has_key(cache, key_a);
diff --git a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
index d30abc3..94a6d270e 100644
--- a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
+++ b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
@@ -25,7 +25,7 @@
 #include "main/mtypes.h"
 #include "main/macros.h"
 #include "util/ralloc.h"
-#include "util/string_to_uint_map.h"
+#include "string_to_uint_map.h"
 #include "uniform_initializer_utils.h"
 
 namespace linker {
diff --git a/src/compiler/glsl/tests/uniform_initializer_utils.cpp b/src/compiler/glsl/tests/uniform_initializer_utils.cpp
index 2a1a168..9a66eba 100644
--- a/src/compiler/glsl/tests/uniform_initializer_utils.cpp
+++ b/src/compiler/glsl/tests/uniform_initializer_utils.cpp
@@ -214,7 +214,7 @@
             ir_constant *val, unsigned red_zone_size,
             unsigned int boolean_true)
 {
-   if (val->type->base_type == GLSL_TYPE_ARRAY) {
+   if (val->type->is_array()) {
       const glsl_type *const element_type = val->array_elements[0]->type;
 
       for (unsigned i = 0; i < storage_array_size; i++) {
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index db65bb0..188b72f 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -28,7 +28,8 @@
 #include "util/hash_table.h"
 
 
-mtx_t glsl_type::mutex = _MTX_INITIALIZER_NP;
+mtx_t glsl_type::mem_mutex = _MTX_INITIALIZER_NP;
+mtx_t glsl_type::hash_mutex = _MTX_INITIALIZER_NP;
 hash_table *glsl_type::array_types = NULL;
 hash_table *glsl_type::record_types = NULL;
 hash_table *glsl_type::interface_types = NULL;
@@ -62,13 +63,13 @@
    STATIC_ASSERT((unsigned(GLSL_TYPE_INT)   & 3) == unsigned(GLSL_TYPE_INT));
    STATIC_ASSERT((unsigned(GLSL_TYPE_FLOAT) & 3) == unsigned(GLSL_TYPE_FLOAT));
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::mem_mutex);
 
    init_ralloc_type_ctx();
    assert(name != NULL);
    this->name = ralloc_strdup(this->mem_ctx, name);
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::mem_mutex);
 
    /* Neither dimension is zero or both dimensions are zero.
     */
@@ -85,22 +86,17 @@
    sampler_array(array), sampled_type(type), interface_packing(0),
    interface_row_major(0), length(0)
 {
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::mem_mutex);
 
    init_ralloc_type_ctx();
    assert(name != NULL);
    this->name = ralloc_strdup(this->mem_ctx, name);
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::mem_mutex);
 
    memset(& fields, 0, sizeof(fields));
 
-   if (is_sampler()) {
-      /* Samplers take no storage whatsoever. */
-      matrix_columns = vector_elements = 0;
-   } else {
-      matrix_columns = vector_elements = 1;
-   }
+   matrix_columns = vector_elements = 1;
 }
 
 glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
@@ -114,7 +110,7 @@
 {
    unsigned int i;
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::mem_mutex);
 
    init_ralloc_type_ctx();
    assert(name != NULL);
@@ -128,7 +124,7 @@
                                                      fields[i].name);
    }
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::mem_mutex);
 }
 
 glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
@@ -144,7 +140,7 @@
 {
    unsigned int i;
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::mem_mutex);
 
    init_ralloc_type_ctx();
    assert(name != NULL);
@@ -157,7 +153,7 @@
                                                      fields[i].name);
    }
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::mem_mutex);
 }
 
 glsl_type::glsl_type(const glsl_type *return_type,
@@ -171,7 +167,7 @@
 {
    unsigned int i;
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::mem_mutex);
 
    init_ralloc_type_ctx();
 
@@ -190,7 +186,7 @@
       this->fields.parameters[i + 1].out = params[i].out;
    }
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::mem_mutex);
 }
 
 glsl_type::glsl_type(const char *subroutine_name) :
@@ -201,12 +197,12 @@
    vector_elements(1), matrix_columns(1),
    length(0)
 {
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::mem_mutex);
 
    init_ralloc_type_ctx();
    assert(subroutine_name != NULL);
    this->name = ralloc_strdup(this->mem_ctx, subroutine_name);
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::mem_mutex);
 }
 
 bool
@@ -225,6 +221,19 @@
    }
 }
 
+bool
+glsl_type::contains_array() const
+{
+   /* Anything that is not a struct or interface block answers for itself. */
+   if (!this->is_record() && !this->is_interface())
+      return this->is_array();
+   /* Aggregates: true as soon as any member is, or contains, an array. */
+   for (unsigned int idx = 0; idx < this->length; idx++) {
+      if (this->fields.structure[idx].type->contains_array())
+         return true;
+   }
+   return false;
+}
 
 bool
 glsl_type::contains_integer() const
@@ -417,6 +426,16 @@
       glsl_type::interface_types = NULL;
    }
 
+   if (glsl_type::function_types != NULL) {
+      _mesa_hash_table_destroy(glsl_type::function_types, NULL);
+      glsl_type::function_types = NULL;
+   }
+
+   if (glsl_type::subroutine_types != NULL) {
+      _mesa_hash_table_destroy(glsl_type::subroutine_types, NULL);
+      glsl_type::subroutine_types = NULL;
+   }
+
    ralloc_free(glsl_type::mem_ctx);
    glsl_type::mem_ctx = NULL;
 }
@@ -442,9 +461,9 @@
     */
    const unsigned name_length = strlen(array->name) + 10 + 3;
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::mem_mutex);
    char *const n = (char *) ralloc_size(this->mem_ctx, name_length);
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::mem_mutex);
 
    if (length == 0)
       snprintf(n, name_length, "%s[]", array->name);
@@ -864,7 +883,7 @@
    char key[128];
    snprintf(key, sizeof(key), "%p[%u]", (void *) base, array_size);
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::hash_mutex);
 
    if (array_types == NULL) {
       array_types = _mesa_hash_table_create(NULL, _mesa_key_hash_string,
@@ -873,9 +892,7 @@
 
    const struct hash_entry *entry = _mesa_hash_table_search(array_types, key);
    if (entry == NULL) {
-      mtx_unlock(&glsl_type::mutex);
       const glsl_type *t = new glsl_type(base, array_size);
-      mtx_lock(&glsl_type::mutex);
 
       entry = _mesa_hash_table_insert(array_types,
                                       ralloc_strdup(mem_ctx, key),
@@ -886,7 +903,7 @@
    assert(((glsl_type *) entry->data)->length == array_size);
    assert(((glsl_type *) entry->data)->fields.array == base);
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::hash_mutex);
 
    return (glsl_type *) entry->data;
 }
@@ -910,13 +927,9 @@
     *     type definitions, and field names to be considered the same type."
     *
     * GLSL ES behaves the same (Ver 1.00 Sec 4.2.4, Ver 3.00 Sec 4.2.5).
-    *
-    * Note that we cannot force type name check when comparing unnamed
-    * structure types, these have a unique name assigned during parsing.
     */
-   if (!this->is_anonymous() && !b->is_anonymous())
-      if (strcmp(this->name, b->name) != 0)
-         return false;
+   if (strcmp(this->name, b->name) != 0)
+      return false;
 
    for (unsigned i = 0; i < this->length; i++) {
       if (this->fields.structure[i].type != b->fields.structure[i].type)
@@ -945,20 +958,23 @@
       if (this->fields.structure[i].patch
           != b->fields.structure[i].patch)
          return false;
-      if (this->fields.structure[i].image_read_only
-          != b->fields.structure[i].image_read_only)
+      if (this->fields.structure[i].memory_read_only
+          != b->fields.structure[i].memory_read_only)
          return false;
-      if (this->fields.structure[i].image_write_only
-          != b->fields.structure[i].image_write_only)
+      if (this->fields.structure[i].memory_write_only
+          != b->fields.structure[i].memory_write_only)
          return false;
-      if (this->fields.structure[i].image_coherent
-          != b->fields.structure[i].image_coherent)
+      if (this->fields.structure[i].memory_coherent
+          != b->fields.structure[i].memory_coherent)
          return false;
-      if (this->fields.structure[i].image_volatile
-          != b->fields.structure[i].image_volatile)
+      if (this->fields.structure[i].memory_volatile
+          != b->fields.structure[i].memory_volatile)
          return false;
-      if (this->fields.structure[i].image_restrict
-          != b->fields.structure[i].image_restrict)
+      if (this->fields.structure[i].memory_restrict
+          != b->fields.structure[i].memory_restrict)
+         return false;
+      if (this->fields.structure[i].image_format
+          != b->fields.structure[i].image_format)
          return false;
       if (this->fields.structure[i].precision
           != b->fields.structure[i].precision)
@@ -1019,7 +1035,7 @@
 {
    const glsl_type key(fields, num_fields, name);
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::hash_mutex);
 
    if (record_types == NULL) {
       record_types = _mesa_hash_table_create(NULL, record_key_hash,
@@ -1029,9 +1045,7 @@
    const struct hash_entry *entry = _mesa_hash_table_search(record_types,
                                                             &key);
    if (entry == NULL) {
-      mtx_unlock(&glsl_type::mutex);
       const glsl_type *t = new glsl_type(fields, num_fields, name);
-      mtx_lock(&glsl_type::mutex);
 
       entry = _mesa_hash_table_insert(record_types, t, (void *) t);
    }
@@ -1040,7 +1054,7 @@
    assert(((glsl_type *) entry->data)->length == num_fields);
    assert(strcmp(((glsl_type *) entry->data)->name, name) == 0);
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::hash_mutex);
 
    return (glsl_type *) entry->data;
 }
@@ -1055,7 +1069,7 @@
 {
    const glsl_type key(fields, num_fields, packing, row_major, block_name);
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::hash_mutex);
 
    if (interface_types == NULL) {
       interface_types = _mesa_hash_table_create(NULL, record_key_hash,
@@ -1065,10 +1079,8 @@
    const struct hash_entry *entry = _mesa_hash_table_search(interface_types,
                                                             &key);
    if (entry == NULL) {
-      mtx_unlock(&glsl_type::mutex);
       const glsl_type *t = new glsl_type(fields, num_fields,
                                          packing, row_major, block_name);
-      mtx_lock(&glsl_type::mutex);
 
       entry = _mesa_hash_table_insert(interface_types, t, (void *) t);
    }
@@ -1077,7 +1089,7 @@
    assert(((glsl_type *) entry->data)->length == num_fields);
    assert(strcmp(((glsl_type *) entry->data)->name, block_name) == 0);
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::hash_mutex);
 
    return (glsl_type *) entry->data;
 }
@@ -1087,7 +1099,7 @@
 {
    const glsl_type key(subroutine_name);
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::hash_mutex);
 
    if (subroutine_types == NULL) {
       subroutine_types = _mesa_hash_table_create(NULL, record_key_hash,
@@ -1097,9 +1109,7 @@
    const struct hash_entry *entry = _mesa_hash_table_search(subroutine_types,
                                                             &key);
    if (entry == NULL) {
-      mtx_unlock(&glsl_type::mutex);
       const glsl_type *t = new glsl_type(subroutine_name);
-      mtx_lock(&glsl_type::mutex);
 
       entry = _mesa_hash_table_insert(subroutine_types, t, (void *) t);
    }
@@ -1107,7 +1117,7 @@
    assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_SUBROUTINE);
    assert(strcmp(((glsl_type *) entry->data)->name, subroutine_name) == 0);
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::hash_mutex);
 
    return (glsl_type *) entry->data;
 }
@@ -1142,7 +1152,7 @@
 {
    const glsl_type key(return_type, params, num_params);
 
-   mtx_lock(&glsl_type::mutex);
+   mtx_lock(&glsl_type::hash_mutex);
 
    if (function_types == NULL) {
       function_types = _mesa_hash_table_create(NULL, function_key_hash,
@@ -1151,9 +1161,7 @@
 
    struct hash_entry *entry = _mesa_hash_table_search(function_types, &key);
    if (entry == NULL) {
-      mtx_unlock(&glsl_type::mutex);
       const glsl_type *t = new glsl_type(return_type, params, num_params);
-      mtx_lock(&glsl_type::mutex);
 
       entry = _mesa_hash_table_insert(function_types, t, (void *) t);
    }
@@ -1163,7 +1171,7 @@
    assert(t->base_type == GLSL_TYPE_FUNCTION);
    assert(t->length == num_params);
 
-   mtx_unlock(&glsl_type::mutex);
+   mtx_unlock(&glsl_type::hash_mutex);
 
    return t;
 }
@@ -1296,13 +1304,14 @@
    case GLSL_TYPE_ARRAY:
       return this->length * this->fields.array->component_slots();
 
+   case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
-      return 1;
+      return 2;
+
    case GLSL_TYPE_SUBROUTINE:
-     return 1;
+      return 1;
 
    case GLSL_TYPE_FUNCTION:
-   case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
@@ -1967,6 +1976,8 @@
    case GLSL_TYPE_INT:
    case GLSL_TYPE_FLOAT:
    case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
       return this->matrix_columns;
    case GLSL_TYPE_DOUBLE:
    case GLSL_TYPE_UINT64:
@@ -1989,8 +2000,6 @@
       return this->length * this->fields.array->count_attribute_slots(is_vertex_input);
 
    case GLSL_TYPE_FUNCTION:
-   case GLSL_TYPE_SAMPLER:
-   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_SUBROUTINE:
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index 7709556..2857dc9 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -72,7 +72,20 @@
 {
    return type == GLSL_TYPE_DOUBLE ||
           type == GLSL_TYPE_UINT64 ||
-          type == GLSL_TYPE_INT64;
+          type == GLSL_TYPE_INT64  ||
+          type == GLSL_TYPE_IMAGE  ||
+          type == GLSL_TYPE_SAMPLER;
+}
+
+static inline bool glsl_base_type_is_integer(enum glsl_base_type type)
+{
+   /* Grouped by kind; the result is the OR of all seven comparisons. */
+   const bool is_int32 = type == GLSL_TYPE_UINT || type == GLSL_TYPE_INT;
+   const bool is_int64 = type == GLSL_TYPE_UINT64 || type == GLSL_TYPE_INT64;
+   const bool is_opaque = type == GLSL_TYPE_SAMPLER || type == GLSL_TYPE_IMAGE;
+   const bool is_bool = type == GLSL_TYPE_BOOL;
+
+   return is_int32 || is_int64 || is_bool || is_opaque;
 }
 
 enum glsl_sampler_dim {
@@ -147,7 +160,7 @@
     * easier to just ralloc_free 'mem_ctx' (or any of its ancestors). */
    static void* operator new(size_t size)
    {
-      mtx_lock(&glsl_type::mutex);
+      mtx_lock(&glsl_type::mem_mutex);
 
       /* mem_ctx should have been created by the static members */
       assert(glsl_type::mem_ctx != NULL);
@@ -157,7 +170,7 @@
       type = ralloc_size(glsl_type::mem_ctx, size);
       assert(type != NULL);
 
-      mtx_unlock(&glsl_type::mutex);
+      mtx_unlock(&glsl_type::mem_mutex);
 
       return type;
    }
@@ -166,9 +179,9 @@
     * ralloc_free in that case. */
    static void operator delete(void *type)
    {
-      mtx_lock(&glsl_type::mutex);
+      mtx_lock(&glsl_type::mem_mutex);
       ralloc_free(type);
-      mtx_unlock(&glsl_type::mutex);
+      mtx_unlock(&glsl_type::mem_mutex);
    }
 
    /**
@@ -439,7 +452,7 @@
    {
       return (vector_elements == 1)
 	 && (base_type >= GLSL_TYPE_UINT)
-	 && (base_type <= GLSL_TYPE_BOOL);
+	 && (base_type <= GLSL_TYPE_IMAGE);
    }
 
    /**
@@ -479,12 +492,19 @@
    }
 
    /**
+    * Query whether or not a type is a 64-bit integer.
+    */
+   bool is_integer_64() const
+   {
+      return base_type == GLSL_TYPE_UINT64 || base_type == GLSL_TYPE_INT64;
+   }
+
+   /**
     * Query whether or not a type is a 32-bit or 64-bit integer
     */
    bool is_integer_32_64() const
    {
-      return (base_type == GLSL_TYPE_UINT) || (base_type == GLSL_TYPE_INT) ||
-             (base_type == GLSL_TYPE_UINT64) || (base_type == GLSL_TYPE_INT64);
+      return is_integer() || is_integer_64();
    }
 
    /**
@@ -554,6 +574,12 @@
    bool contains_sampler() const;
 
    /**
+    * Query whether or not a type is an array or, for struct and interface
+    * types, has a member that is or contains an array.
+    */
+   bool contains_array() const;
+
+   /**
     * Get the Mesa texture target index for a sampler type.
     */
    gl_texture_index sampler_index() const;
@@ -669,11 +695,19 @@
    }
 
    /**
+    * Query whether or not a type is an atomic_uint.
+    */
+   bool is_atomic_uint() const
+   {
+      return base_type == GLSL_TYPE_ATOMIC_UINT;
+   }
+
+   /**
     * Return the amount of atomic counter storage required for a type.
     */
    unsigned atomic_size() const
    {
-      if (base_type == GLSL_TYPE_ATOMIC_UINT)
+      if (is_atomic_uint())
          return ATOMIC_COUNTER_SIZE;
       else if (is_array())
          return length * fields.array->atomic_size();
@@ -797,7 +831,8 @@
 
 private:
 
-   static mtx_t mutex;
+   static mtx_t mem_mutex;
+   static mtx_t hash_mutex;
 
    /**
     * ralloc context for all glsl_type allocations
@@ -951,14 +986,19 @@
    unsigned precision:2;
 
    /**
-    * Image qualifiers, applicable to buffer variables defined in shader
+    * Memory qualifiers, applicable to buffer variables defined in shader
     * storage buffer objects (SSBOs)
     */
-   unsigned image_read_only:1;
-   unsigned image_write_only:1;
-   unsigned image_coherent:1;
-   unsigned image_volatile:1;
-   unsigned image_restrict:1;
+   unsigned memory_read_only:1;
+   unsigned memory_write_only:1;
+   unsigned memory_coherent:1;
+   unsigned memory_volatile:1;
+   unsigned memory_restrict:1;
+
+   /**
+    * Layout format, applicable to image variables only.
+    */
+   unsigned image_format:16;
 
    /**
     * Any of the xfb_* qualifiers trigger the shader to be in transform
@@ -973,9 +1013,10 @@
       : type(_type), name(_name), location(-1), offset(0), xfb_buffer(0),
         xfb_stride(0), interpolation(0), centroid(0),
         sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0),
-        precision(GLSL_PRECISION_NONE), image_read_only(0), image_write_only(0),
-        image_coherent(0), image_volatile(0), image_restrict(0),
-        explicit_xfb_buffer(0), implicit_sized_array(0)
+        precision(GLSL_PRECISION_NONE), memory_read_only(0),
+        memory_write_only(0), memory_coherent(0), memory_volatile(0),
+        memory_restrict(0), image_format(0), explicit_xfb_buffer(0),
+        implicit_sized_array(0)
    {
       /* empty */
    }
diff --git a/src/compiler/nir/BUILD.gn b/src/compiler/nir/BUILD.gn
index 60f2ae0..6f3955f 100644
--- a/src/compiler/nir/BUILD.gn
+++ b/src/compiler/nir/BUILD.gn
@@ -51,7 +51,6 @@
   sources = [
     "nir.c",
     "nir.h",
-    "nir_array.h",
     "nir_builder.h",
     "nir_clone.c",
     "nir_constant_expressions.h",
@@ -87,6 +86,7 @@
     "nir_lower_locals_to_regs.c",
     "nir_lower_patch_vertices.c",
     "nir_lower_phis_to_scalar.c",
+    "nir_lower_read_invocation_to_scalar.c",
     "nir_lower_regs_to_ssa.c",
     "nir_lower_returns.c",
     "nir_lower_samplers.c",
@@ -110,6 +110,7 @@
     "nir_opt_gcm.c",
     "nir_opt_global_to_local.c",
     "nir_opt_if.c",
+    "nir_opt_intrinsics.c",
     "nir_opt_loop_unroll.c",
     "nir_opt_move_comparisons.c",
     "nir_opt_peephole_select.c",
diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index 0abf9b6..7e83dbd 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -44,7 +44,8 @@
 
    shader->options = options;
 
-   shader->info = si ? si : rzalloc(shader, shader_info);
+   if (si)
+      shader->info = *si;
 
    exec_list_make_empty(&shader->functions);
    exec_list_make_empty(&shader->registers);
@@ -1905,6 +1906,22 @@
       return nir_intrinsic_load_patch_vertices_in;
    case SYSTEM_VALUE_HELPER_INVOCATION:
       return nir_intrinsic_load_helper_invocation;
+   case SYSTEM_VALUE_VIEW_INDEX:
+      return nir_intrinsic_load_view_index;
+   case SYSTEM_VALUE_SUBGROUP_SIZE:
+      return nir_intrinsic_load_subgroup_size;
+   case SYSTEM_VALUE_SUBGROUP_INVOCATION:
+      return nir_intrinsic_load_subgroup_invocation;
+   case SYSTEM_VALUE_SUBGROUP_EQ_MASK:
+	return nir_intrinsic_load_subgroup_eq_mask;
+   case SYSTEM_VALUE_SUBGROUP_GE_MASK:
+	return nir_intrinsic_load_subgroup_ge_mask;
+   case SYSTEM_VALUE_SUBGROUP_GT_MASK:
+	return nir_intrinsic_load_subgroup_gt_mask;
+   case SYSTEM_VALUE_SUBGROUP_LE_MASK:
+	return nir_intrinsic_load_subgroup_le_mask;
+   case SYSTEM_VALUE_SUBGROUP_LT_MASK:
+	return nir_intrinsic_load_subgroup_lt_mask;
    default:
       unreachable("system value does not directly correspond to intrinsic");
    }
@@ -1956,6 +1973,22 @@
       return SYSTEM_VALUE_VERTICES_IN;
    case nir_intrinsic_load_helper_invocation:
       return SYSTEM_VALUE_HELPER_INVOCATION;
+   case nir_intrinsic_load_view_index:
+      return SYSTEM_VALUE_VIEW_INDEX;
+   case nir_intrinsic_load_subgroup_size:
+      return SYSTEM_VALUE_SUBGROUP_SIZE;
+   case nir_intrinsic_load_subgroup_invocation:
+      return SYSTEM_VALUE_SUBGROUP_INVOCATION;
+   case nir_intrinsic_load_subgroup_eq_mask:
+      return SYSTEM_VALUE_SUBGROUP_EQ_MASK;
+   case nir_intrinsic_load_subgroup_ge_mask:
+      return SYSTEM_VALUE_SUBGROUP_GE_MASK;
+   case nir_intrinsic_load_subgroup_gt_mask:
+      return SYSTEM_VALUE_SUBGROUP_GT_MASK;
+   case nir_intrinsic_load_subgroup_le_mask:
+      return SYSTEM_VALUE_SUBGROUP_LE_MASK;
+   case nir_intrinsic_load_subgroup_lt_mask:
+      return SYSTEM_VALUE_SUBGROUP_LT_MASK;
    default:
       unreachable("intrinsic doesn't produce a system value");
    }
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index ce5b434..78684fd 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -47,6 +47,10 @@
 
 #include "nir_opcodes.h"
 
+#if defined(_WIN32) && !defined(snprintf)
+#define snprintf _snprintf
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -434,15 +438,15 @@
 }
 
 static inline bool
-nir_instr_is_first(nir_instr *instr)
+nir_instr_is_first(const nir_instr *instr)
 {
-   return exec_node_is_head_sentinel(exec_node_get_prev(&instr->node));
+   return exec_node_is_head_sentinel(exec_node_get_prev_const(&instr->node));
 }
 
 static inline bool
-nir_instr_is_last(nir_instr *instr)
+nir_instr_is_last(const nir_instr *instr)
 {
-   return exec_node_is_tail_sentinel(exec_node_get_next(&instr->node));
+   return exec_node_is_tail_sentinel(exec_node_get_next_const(&instr->node));
 }
 
 typedef struct nir_ssa_def {
@@ -800,7 +804,8 @@
 
 /* is this source channel used? */
 static inline bool
-nir_alu_instr_channel_used(nir_alu_instr *instr, unsigned src, unsigned channel)
+nir_alu_instr_channel_used(const nir_alu_instr *instr, unsigned src,
+                           unsigned channel)
 {
    if (nir_op_infos[instr->op].input_sizes[src] > 0)
       return channel < nir_op_infos[instr->op].input_sizes[src];
@@ -1083,7 +1088,7 @@
 
 #define INTRINSIC_IDX_ACCESSORS(name, flag, type)                             \
 static inline type                                                            \
-nir_intrinsic_##name(nir_intrinsic_instr *instr)                              \
+nir_intrinsic_##name(const nir_intrinsic_instr *instr)                        \
 {                                                                             \
    const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];   \
    assert(info->index_map[NIR_INTRINSIC_##flag] > 0);                         \
@@ -1217,7 +1222,7 @@
 } nir_tex_instr;
 
 static inline unsigned
-nir_tex_instr_dest_size(nir_tex_instr *instr)
+nir_tex_instr_dest_size(const nir_tex_instr *instr)
 {
    switch (instr->op) {
    case nir_texop_txs: {
@@ -1266,7 +1271,7 @@
  * rather than actually sampling it.
  */
 static inline bool
-nir_tex_instr_is_query(nir_tex_instr *instr)
+nir_tex_instr_is_query(const nir_tex_instr *instr)
 {
    switch (instr->op) {
    case nir_texop_txs:
@@ -1289,7 +1294,7 @@
 }
 
 static inline nir_alu_type
-nir_tex_instr_src_type(nir_tex_instr *instr, unsigned src)
+nir_tex_instr_src_type(const nir_tex_instr *instr, unsigned src)
 {
    switch (instr->src[src].src_type) {
    case nir_tex_src_coord:
@@ -1333,7 +1338,7 @@
 }
 
 static inline unsigned
-nir_tex_instr_src_size(nir_tex_instr *instr, unsigned src)
+nir_tex_instr_src_size(const nir_tex_instr *instr, unsigned src)
 {
    if (instr->src[src].src_type == nir_tex_src_coord)
       return instr->coord_components;
@@ -1355,7 +1360,7 @@
 }
 
 static inline int
-nir_tex_instr_src_index(nir_tex_instr *instr, nir_tex_src_type type)
+nir_tex_instr_src_index(const nir_tex_instr *instr, nir_tex_src_type type)
 {
    for (unsigned i = 0; i < instr->num_srcs; i++)
       if (instr->src[i].src_type == type)
@@ -1816,6 +1821,9 @@
    bool lower_extract_byte;
    bool lower_extract_word;
 
+   bool lower_vote_trivial;
+   bool lower_subgroup_masks;
+
    /**
     * Does the driver support real 32-bit integers?  (Otherwise, integers
     * are simulated by floats.)
@@ -1835,6 +1843,8 @@
     */
    bool use_interpolated_input_intrinsics;
 
+   unsigned max_subgroup_size;
+
    unsigned max_unroll_iterations;
 } nir_shader_compiler_options;
 
@@ -1859,7 +1869,7 @@
    const struct nir_shader_compiler_options *options;
 
    /** Various bits of compile-time information about a given shader */
-   struct shader_info *info;
+   struct shader_info info;
 
    /** list of global variables in the shader (nir_variable) */
    struct exec_list globals;
@@ -2303,11 +2313,22 @@
 
    return should_clone;
 }
+
+static inline bool
+should_print_nir(void)
+{
+   static int should_print = -1;
+   if (should_print < 0)
+      should_print = env_var_as_boolean("NIR_PRINT", false);
+
+   return should_print;
+}
 #else
 static inline void nir_validate_shader(nir_shader *shader) { (void) shader; }
 static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; }
 static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; }
 static inline bool should_clone_nir(void) { return false; }
+static inline bool should_print_nir(void) { return false; }
 #endif /* DEBUG */
 
 #define _PASS(nir, do_pass) do {                                     \
@@ -2322,14 +2343,22 @@
 
 #define NIR_PASS(progress, nir, pass, ...) _PASS(nir,                \
    nir_metadata_set_validation_flag(nir);                            \
+   if (should_print_nir())                                           \
+      printf("%s\n", #pass);                                         \
    if (pass(nir, ##__VA_ARGS__)) {                                   \
       progress = true;                                               \
+      if (should_print_nir())                                        \
+         nir_print_shader(nir, stdout);                              \
       nir_metadata_check_validation_flag(nir);                       \
    }                                                                 \
 )
 
 #define NIR_PASS_V(nir, pass, ...) _PASS(nir,                        \
+   if (should_print_nir())                                           \
+      printf("%s\n", #pass);                                         \
    pass(nir, ##__VA_ARGS__);                                         \
+   if (should_print_nir())                                           \
+      nir_print_shader(nir, stdout);                                 \
 )
 
 void nir_calc_dominance_impl(nir_function_impl *impl);
@@ -2390,7 +2419,7 @@
 nir_src *nir_get_io_offset_src(nir_intrinsic_instr *instr);
 nir_src *nir_get_io_vertex_index_src(nir_intrinsic_instr *instr);
 
-bool nir_is_per_vertex_io(nir_variable *var, gl_shader_stage stage);
+bool nir_is_per_vertex_io(const nir_variable *var, gl_shader_stage stage);
 
 void nir_lower_io_types(nir_shader *shader);
 bool nir_lower_regs_to_ssa_impl(nir_function_impl *impl);
@@ -2405,7 +2434,7 @@
 bool nir_lower_vec_to_movs(nir_shader *shader);
 bool nir_lower_alu_to_scalar(nir_shader *shader);
 bool nir_lower_load_const_to_scalar(nir_shader *shader);
-
+bool nir_lower_read_invocation_to_scalar(nir_shader *shader);
 bool nir_lower_phis_to_scalar(nir_shader *shader);
 void nir_lower_io_to_scalar(nir_shader *shader, nir_variable_mode mask);
 
@@ -2444,6 +2473,7 @@
    unsigned lower_y_uv_external;
    unsigned lower_y_u_v_external;
    unsigned lower_yx_xuxv_external;
+   unsigned lower_xy_uxvx_external;
 
    /**
     * To emulate certain texture wrap modes, this can be used
@@ -2546,6 +2576,7 @@
 
 bool nir_lower_atomics(nir_shader *shader,
                        const struct gl_shader_program *shader_program);
+bool nir_lower_atomics_to_ssbo(nir_shader *shader, unsigned ssbo_offset);
 bool nir_lower_to_source_mods(nir_shader *shader);
 
 bool nir_lower_gs_intrinsics(nir_shader *shader);
@@ -2598,6 +2629,7 @@
 bool nir_lower_ssa_defs_to_regs_block(nir_block *block);
 
 bool nir_opt_algebraic(nir_shader *shader);
+bool nir_opt_algebraic_before_ffma(nir_shader *shader);
 bool nir_opt_algebraic_late(nir_shader *shader);
 bool nir_opt_constant_folding(nir_shader *shader);
 
@@ -2617,6 +2649,8 @@
 
 bool nir_opt_if(nir_shader *shader);
 
+bool nir_opt_intrinsics(nir_shader *shader);
+
 bool nir_opt_loop_unroll(nir_shader *shader, nir_variable_mode indirect_mask);
 
 bool nir_opt_move_comparisons(nir_shader *shader);
diff --git a/src/compiler/nir/nir_array.h b/src/compiler/nir/nir_array.h
deleted file mode 100644
index 4d7a532..0000000
--- a/src/compiler/nir/nir_array.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Jason Ekstrand (jason@jlekstrand.net)
- *
- */
-
-#ifndef NIR_ARRAY_H
-#define NIR_ARRAY_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
-   void *mem_ctx;
-   size_t size;
-   size_t alloc;
-   void *data;
-} nir_array;
-
-static inline void
-nir_array_init(nir_array *arr, void *mem_ctx)
-{
-   arr->mem_ctx = mem_ctx;
-   arr->size = 0;
-   arr->alloc = 0;
-   arr->data = NULL;
-}
-
-static inline void
-nir_array_fini(nir_array *arr)
-{
-   if (arr->mem_ctx)
-      ralloc_free(arr->data);
-   else
-      free(arr->data);
-}
-
-#define NIR_ARRAY_INITIAL_SIZE 64
-
-/* Increments the size of the array by the given ammount and returns a
- * pointer to the beginning of the newly added space.
- */
-static inline void *
-nir_array_grow(nir_array *arr, size_t additional)
-{
-   size_t new_size = arr->size + additional;
-   if (new_size > arr->alloc) {
-      if (arr->alloc == 0)
-         arr->alloc = NIR_ARRAY_INITIAL_SIZE;
-
-      while (new_size > arr->alloc)
-         arr->alloc *= 2;
-
-      if (arr->mem_ctx)
-         arr->data = reralloc_size(arr->mem_ctx, arr->data, arr->alloc);
-      else
-         arr->data = realloc(arr->data, arr->alloc);
-   }
-
-   void *ptr = (void *)((char *)arr->data + arr->size);
-   arr->size = new_size;
-
-   return ptr;
-}
-
-#define nir_array_add(arr, type, elem) \
-   *(type *)nir_array_grow(arr, sizeof(type)) = (elem)
-
-#define nir_array_foreach(arr, type, elem) \
-   for (type *elem = (type *)(arr)->data; \
-        elem < (type *)((char *)(arr)->data + (arr)->size); elem++)
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* NIR_ARRAY_H */
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 7dbf8ef..7c65886 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -621,7 +621,7 @@
    nir_load_##name(nir_builder *build)                                   \
    {                                                                     \
       return nir_load_system_value(build, nir_intrinsic_load_##name, 0); \
-   }                                                                     \
+   }
 
 #include "nir_intrinsics.h"
 
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index e2204c4e..c13163f 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -765,10 +765,10 @@
    clone_reg_list(&state, &ns->registers, &s->registers);
    ns->reg_alloc = s->reg_alloc;
 
-   *ns->info = *s->info;
-   ns->info->name = ralloc_strdup(ns, ns->info->name);
-   if (ns->info->label)
-      ns->info->label = ralloc_strdup(ns, ns->info->label);
+   ns->info = s->info;
+   ns->info.name = ralloc_strdup(ns, ns->info.name);
+   if (ns->info.label)
+      ns->info.label = ralloc_strdup(ns, ns->info.label);
 
    ns->num_inputs = s->num_inputs;
    ns->num_uniforms = s->num_uniforms;
diff --git a/src/compiler/nir/nir_gather_info.c b/src/compiler/nir/nir_gather_info.c
index 0c70787..24ac74e 100644
--- a/src/compiler/nir/nir_gather_info.c
+++ b/src/compiler/nir/nir_gather_info.c
@@ -49,23 +49,23 @@
 
       if (var->data.mode == nir_var_shader_in) {
          if (is_patch_generic)
-            shader->info->patch_inputs_read |= bitfield;
+            shader->info.patch_inputs_read |= bitfield;
          else
-            shader->info->inputs_read |= bitfield;
+            shader->info.inputs_read |= bitfield;
 
          if (shader->stage == MESA_SHADER_FRAGMENT) {
-            shader->info->fs.uses_sample_qualifier |= var->data.sample;
+            shader->info.fs.uses_sample_qualifier |= var->data.sample;
          }
       } else {
          assert(var->data.mode == nir_var_shader_out);
          if (is_patch_generic) {
-            shader->info->patch_outputs_written |= bitfield;
+            shader->info.patch_outputs_written |= bitfield;
          } else if (!var->data.read_only) {
-            shader->info->outputs_written |= bitfield;
+            shader->info.outputs_written |= bitfield;
          }
 
          if (var->data.fb_fetch_output)
-            shader->info->outputs_read |= bitfield;
+            shader->info.outputs_read |= bitfield;
       }
    }
 }
@@ -197,7 +197,7 @@
    case nir_intrinsic_discard:
    case nir_intrinsic_discard_if:
       assert(shader->stage == MESA_SHADER_FRAGMENT);
-      shader->info->fs.uses_discard = true;
+      shader->info.fs.uses_discard = true;
       break;
 
    case nir_intrinsic_interp_var_at_centroid:
@@ -219,7 +219,7 @@
              glsl_type_is_dual_slot(glsl_without_array(var->type))) {
             for (uint i = 0; i < glsl_count_attribute_slots(var->type, false); i++) {
                int idx = var->data.location + i;
-               shader->info->double_inputs_read |= BITFIELD64_BIT(idx);
+               shader->info.double_inputs_read |= BITFIELD64_BIT(idx);
             }
          }
       }
@@ -245,14 +245,14 @@
    case nir_intrinsic_load_tess_coord:
    case nir_intrinsic_load_tess_level_outer:
    case nir_intrinsic_load_tess_level_inner:
-      shader->info->system_values_read |=
+      shader->info.system_values_read |=
          (1ull << nir_system_value_from_intrinsic(instr->intrinsic));
       break;
 
    case nir_intrinsic_end_primitive:
    case nir_intrinsic_end_primitive_with_counter:
       assert(shader->stage == MESA_SHADER_GEOMETRY);
-      shader->info->gs.uses_end_primitive = 1;
+      shader->info.gs.uses_end_primitive = 1;
       break;
 
    default:
@@ -264,7 +264,7 @@
 gather_tex_info(nir_tex_instr *instr, nir_shader *shader)
 {
    if (instr->op == nir_texop_tg4)
-      shader->info->uses_texture_gather = true;
+      shader->info.uses_texture_gather = true;
 }
 
 static void
@@ -290,8 +290,8 @@
 void
 nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint)
 {
-   shader->info->num_textures = 0;
-   shader->info->num_images = 0;
+   shader->info.num_textures = 0;
+   shader->info.num_images = 0;
    nir_foreach_variable(var, &shader->uniforms) {
       const struct glsl_type *type = var->type;
       unsigned count = 1;
@@ -301,21 +301,21 @@
       }
 
       if (glsl_type_is_image(type)) {
-         shader->info->num_images += count;
+         shader->info.num_images += count;
       } else if (glsl_type_is_sampler(type)) {
-         shader->info->num_textures += count;
+         shader->info.num_textures += count;
       }
    }
 
-   shader->info->inputs_read = 0;
-   shader->info->outputs_written = 0;
-   shader->info->outputs_read = 0;
-   shader->info->double_inputs_read = 0;
-   shader->info->patch_inputs_read = 0;
-   shader->info->patch_outputs_written = 0;
-   shader->info->system_values_read = 0;
+   shader->info.inputs_read = 0;
+   shader->info.outputs_written = 0;
+   shader->info.outputs_read = 0;
+   shader->info.double_inputs_read = 0;
+   shader->info.patch_inputs_read = 0;
+   shader->info.patch_outputs_written = 0;
+   shader->info.system_values_read = 0;
    if (shader->stage == MESA_SHADER_FRAGMENT) {
-      shader->info->fs.uses_sample_qualifier = false;
+      shader->info.fs.uses_sample_qualifier = false;
    }
    nir_foreach_block(block, entrypoint) {
       gather_info_block(block, shader);
diff --git a/src/compiler/nir/nir_intrinsics.h b/src/compiler/nir/nir_intrinsics.h
index 3a519a73..4082820 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -94,6 +94,19 @@
 INTRINSIC(shader_clock, 0, ARR(0), true, 2, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
 
 /*
+ * Shader ballot intrinsics with semantics analogous to the
+ *
+ *    ballotARB()
+ *    readInvocationARB()
+ *    readFirstInvocationARB()
+ *
+ * GLSL functions from ARB_shader_ballot.
+ */
+INTRINSIC(ballot, 1, ARR(1), true, 1, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
+INTRINSIC(read_invocation, 2, ARR(0, 1), true, 0, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
+INTRINSIC(read_first_invocation, 1, ARR(0), true, 0, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
+
+/*
  * Memory barrier with semantics analogous to the compute shader
  * groupMemoryBarrier(), memoryBarrierAtomicCounter(), memoryBarrierBuffer(),
  * memoryBarrierImage() and memoryBarrierShared() GLSL intrinsics.
@@ -107,6 +120,11 @@
 /** A conditional discard, with a single boolean source. */
 INTRINSIC(discard_if, 1, ARR(1), false, 0, 0, 0, xx, xx, xx, 0)
 
+/** ARB_shader_group_vote intrinsics */
+INTRINSIC(vote_any, 1, ARR(1), true, 1, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
+INTRINSIC(vote_all, 1, ARR(1), true, 1, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
+INTRINSIC(vote_eq,  1, ARR(1), true, 1, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
+
 /**
  * Basic Geometry Shader intrinsics.
  *
@@ -326,9 +344,16 @@
 SYSTEM_VALUE(user_clip_plane, 4, 1, UCP_ID, xx, xx)
 SYSTEM_VALUE(num_work_groups, 3, 0, xx, xx, xx)
 SYSTEM_VALUE(helper_invocation, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(channel_num, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(alpha_ref_float, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(layer_id, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(view_index, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_size, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_invocation, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_eq_mask, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_ge_mask, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_gt_mask, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_le_mask, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_lt_mask, 1, 0, xx, xx, xx)
 
 /* Blend constant color values.  Float values are clamped. */
 SYSTEM_VALUE(blend_const_color_r_float, 1, 0, xx, xx, xx)
@@ -408,7 +433,7 @@
 /* src[] = { buffer_index, offset }. No const_index */
 LOAD(ssbo, 2, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
 /* src[] = { offset }. const_index[] = { base, component } */
-LOAD(output, 1, 1, BASE, COMPONENT, xx, NIR_INTRINSIC_CAN_ELIMINATE)
+LOAD(output, 1, 2, BASE, COMPONENT, xx, NIR_INTRINSIC_CAN_ELIMINATE)
 /* src[] = { vertex, offset }. const_index[] = { base, component } */
 LOAD(per_vertex_output, 2, 1, BASE, COMPONENT, xx, NIR_INTRINSIC_CAN_ELIMINATE)
 /* src[] = { offset }. const_index[] = { base } */
diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c
index 1993013..2252e16 100644
--- a/src/compiler/nir/nir_lower_atomics.c
+++ b/src/compiler/nir/nir_lower_atomics.c
@@ -155,7 +155,7 @@
     * instruction.
     */
    for (unsigned i = 0; i < nir_intrinsic_infos[instr->intrinsic].num_srcs; i++)
-      new_instr->src[i + 1] = instr->src[i];
+      nir_src_copy(&new_instr->src[i + 1], &instr->src[i], new_instr);
 
    if (instr->dest.is_ssa) {
       nir_ssa_dest_init(&new_instr->instr, &new_instr->dest,
diff --git a/src/compiler/nir/nir_lower_atomics_to_ssbo.c b/src/compiler/nir/nir_lower_atomics_to_ssbo.c
new file mode 100644
index 0000000..371eb0b
--- /dev/null
+++ b/src/compiler/nir/nir_lower_atomics_to_ssbo.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright © 2017 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+
+#if defined(_WIN32) && !defined(snprintf)
+#define snprintf _snprintf
+#endif
+
+/*
+ * Remap atomic counters to SSBOs.  Atomic counters get remapped to
+ * SSBO binding points [0..ssbo_offset) and the original SSBOs are
+ * remapped to [ssbo_offset..n) (mostly to align with what mesa/st
+ * does).
+ */
+
+static bool
+lower_instr(nir_intrinsic_instr *instr, unsigned ssbo_offset, nir_builder *b)
+{
+   nir_intrinsic_op op;
+   int idx_src;
+   /* Emit any new instructions right before the one being lowered. */
+   b->cursor = nir_before_instr(&instr->instr);
+   /* Select the replacement opcode; unrelated intrinsics return false. */
+   switch (instr->intrinsic) {
+   case nir_intrinsic_ssbo_atomic_add:
+   case nir_intrinsic_ssbo_atomic_imin:
+   case nir_intrinsic_ssbo_atomic_umin:
+   case nir_intrinsic_ssbo_atomic_imax:
+   case nir_intrinsic_ssbo_atomic_umax:
+   case nir_intrinsic_ssbo_atomic_and:
+   case nir_intrinsic_ssbo_atomic_or:
+   case nir_intrinsic_ssbo_atomic_xor:
+   case nir_intrinsic_ssbo_atomic_exchange:
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+   case nir_intrinsic_store_ssbo:
+   case nir_intrinsic_load_ssbo:
+      /* easy case, keep same opcode and just remap SSBO buffer index: */
+      op = instr->intrinsic;
+      idx_src = (op == nir_intrinsic_store_ssbo) ? 1 : 0;
+      nir_ssa_def *old_idx = nir_ssa_for_src(b, instr->src[idx_src], 1);
+      nir_ssa_def *new_idx = nir_iadd(b, old_idx, nir_imm_int(b, ssbo_offset));
+      nir_instr_rewrite_src(&instr->instr,
+                            &instr->src[idx_src],
+                            nir_src_for_ssa(new_idx));
+      return true;
+   case nir_intrinsic_atomic_counter_inc:
+   case nir_intrinsic_atomic_counter_add:
+   case nir_intrinsic_atomic_counter_dec:
+      /* inc and dec get remapped to add: */
+      op = nir_intrinsic_ssbo_atomic_add;
+      break;
+   case nir_intrinsic_atomic_counter_read:
+      op = nir_intrinsic_load_ssbo;
+      break;
+   case nir_intrinsic_atomic_counter_min:
+      op = nir_intrinsic_ssbo_atomic_umin;
+      break;
+   case nir_intrinsic_atomic_counter_max:
+      op = nir_intrinsic_ssbo_atomic_umax;
+      break;
+   case nir_intrinsic_atomic_counter_and:
+      op = nir_intrinsic_ssbo_atomic_and;
+      break;
+   case nir_intrinsic_atomic_counter_or:
+      op = nir_intrinsic_ssbo_atomic_or;
+      break;
+   case nir_intrinsic_atomic_counter_xor:
+      op = nir_intrinsic_ssbo_atomic_xor;
+      break;
+   case nir_intrinsic_atomic_counter_exchange:
+      op = nir_intrinsic_ssbo_atomic_exchange;
+      break;
+   case nir_intrinsic_atomic_counter_comp_swap:
+      op = nir_intrinsic_ssbo_atomic_comp_swap;
+      break;
+   default:
+      return false;
+   }
+   /* The intrinsic's BASE index becomes the SSBO buffer index (src[0]). */
+   nir_ssa_def *buffer = nir_imm_int(b, nir_intrinsic_base(instr));
+   nir_ssa_def *temp = NULL;
+   nir_intrinsic_instr *new_instr =
+         nir_intrinsic_instr_create(ralloc_parent(instr), op);
+
+   /* a couple instructions need special handling since they don't map
+    * 1:1 with ssbo atomics
+    */
+   switch (instr->intrinsic) {
+   case nir_intrinsic_atomic_counter_inc:
+      /* remapped to ssbo_atomic_add: { buffer_idx, offset, +1 } */
+      temp = nir_imm_int(b, +1);
+      new_instr->src[0] = nir_src_for_ssa(buffer);
+      nir_src_copy(&new_instr->src[1], &instr->src[0], new_instr);
+      new_instr->src[2] = nir_src_for_ssa(temp);
+      break;
+   case nir_intrinsic_atomic_counter_dec:
+      /* remapped to ssbo_atomic_add: { buffer_idx, offset, -1 } */
+      /* NOTE semantic difference so we adjust the return value below */
+      temp = nir_imm_int(b, -1);
+      new_instr->src[0] = nir_src_for_ssa(buffer);
+      nir_src_copy(&new_instr->src[1], &instr->src[0], new_instr);
+      new_instr->src[2] = nir_src_for_ssa(temp);
+      break;
+   case nir_intrinsic_atomic_counter_read:
+      /* remapped to load_ssbo: { buffer_idx, offset } */
+      new_instr->src[0] = nir_src_for_ssa(buffer);
+      nir_src_copy(&new_instr->src[1], &instr->src[0], new_instr);
+      break;
+   default:
+      /* remapped to ssbo_atomic_x: { buffer_idx, offset, data, (compare)? } */
+      new_instr->src[0] = nir_src_for_ssa(buffer);
+      nir_src_copy(&new_instr->src[1], &instr->src[0], new_instr);
+      nir_src_copy(&new_instr->src[2], &instr->src[1], new_instr);
+      if (op == nir_intrinsic_ssbo_atomic_comp_swap)
+         nir_src_copy(&new_instr->src[3], &instr->src[2], new_instr);
+      break;
+   }
+   /* New SSA dest mirrors the old one (reads dest.ssa; assumes SSA form). */
+   nir_ssa_dest_init(&new_instr->instr, &new_instr->dest,
+                     instr->dest.ssa.num_components,
+                     instr->dest.ssa.bit_size, NULL);
+   nir_instr_insert_before(&instr->instr, &new_instr->instr);
+   nir_instr_remove(&instr->instr);
+   /* ssbo_atomic_add result + temp (-1) stands in for the old dec result. */
+   if (instr->intrinsic == nir_intrinsic_atomic_counter_dec) {
+      b->cursor = nir_after_instr(&new_instr->instr);
+      nir_ssa_def *result = nir_iadd(b, &new_instr->dest.ssa, temp);
+      nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(result));
+   } else {
+      nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(&new_instr->dest.ssa));
+   }
+
+   /* we could be replacing an intrinsic with fixed # of dest num_components
+    * with one that has variable number.  So best to take this from the dest:
+    */
+   new_instr->num_components = instr->dest.ssa.num_components;
+
+   return true;
+}
+
+static bool
+is_atomic_uint(const struct glsl_type *type)
+{
+   if (glsl_get_base_type(type) == GLSL_TYPE_ARRAY) /* look through arrays */
+      return is_atomic_uint(glsl_get_array_element(type));
+   return glsl_get_base_type(type) == GLSL_TYPE_ATOMIC_UINT;
+}
+
+bool
+nir_lower_atomics_to_ssbo(nir_shader *shader, unsigned ssbo_offset)
+{
+   bool progress = false;
+   /* Pass 1: rewrite intrinsics; progress is set if any was lowered. */
+   nir_foreach_function(function, shader) {
+      if (function->impl) {
+         nir_builder builder;
+         nir_builder_init(&builder, function->impl);
+         nir_foreach_block(block, function->impl) {
+            nir_foreach_instr_safe(instr, block) {
+               if (instr->type == nir_instr_type_intrinsic)
+                  progress |= lower_instr(nir_instr_as_intrinsic(instr),
+                                          ssbo_offset, &builder);
+            }
+         }
+
+         nir_metadata_preserve(function->impl, nir_metadata_block_index |
+                                               nir_metadata_dominance);
+      }
+   }
+
+   if (progress) {
+      /* replace atomic_uint uniforms with SSBOs: */
+      unsigned replaced = 0;
+      nir_foreach_variable_safe(var, &shader->uniforms) {
+         if (is_atomic_uint(var->type)) {
+            exec_node_remove(&var->node);
+            /* One SSBO per binding: skip bindings already replaced. */
+            if (replaced & (1u << var->data.binding))
+               continue;
+
+            nir_variable *ssbo;
+            char name[16];
+
+            /* A length of 0 is used to denote unsized arrays */
+            const struct glsl_type *type = glsl_array_type(glsl_uint_type(), 0);
+
+            snprintf(name, sizeof(name), "counter%d", var->data.binding);
+
+            ssbo = nir_variable_create(shader, nir_var_shader_storage,
+                                       type, name);
+            ssbo->data.binding = var->data.binding;
+
+            struct glsl_struct_field field = {
+                  .type = type,
+                  .name = "counters",
+                  .location = -1,
+            };
+
+            ssbo->interface_type =
+                  glsl_interface_type(&field, 1, GLSL_INTERFACE_PACKING_STD430,
+                                      false, "counters");
+            /* 1u: unsigned shift, so bit 31 is not signed overflow. */
+            replaced |= (1u << var->data.binding);
+         }
+      }
+   }
+
+   return progress;
+}
+
diff --git a/src/compiler/nir/nir_lower_bitmap.c b/src/compiler/nir/nir_lower_bitmap.c
index a1b4a32..9d04ae7 100644
--- a/src/compiler/nir/nir_lower_bitmap.c
+++ b/src/compiler/nir/nir_lower_bitmap.c
@@ -111,7 +111,7 @@
    discard->src[0] = nir_src_for_ssa(cond);
    nir_builder_instr_insert(b, &discard->instr);
 
-   shader->info->fs.uses_discard = true;
+   shader->info.fs.uses_discard = true;
 }
 
 static void
diff --git a/src/compiler/nir/nir_lower_clip.c b/src/compiler/nir/nir_lower_clip.c
index 7bed46b..ea12f51 100644
--- a/src/compiler/nir/nir_lower_clip.c
+++ b/src/compiler/nir/nir_lower_clip.c
@@ -289,7 +289,7 @@
          discard->src[0] = nir_src_for_ssa(cond);
          nir_builder_instr_insert(&b, &discard->instr);
 
-         b.shader->info->fs.uses_discard = true;
+         b.shader->info.fs.uses_discard = true;
       }
    }
 
diff --git a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c
index 6705a3c..ea23a60 100644
--- a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c
+++ b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c
@@ -142,8 +142,8 @@
    const unsigned cull_array_size = get_unwrapped_array_length(nir, cull);
 
    if (store_info) {
-      nir->info->clip_distance_array_size = clip_array_size;
-      nir->info->cull_distance_array_size = cull_array_size;
+      nir->info.clip_distance_array_size = clip_array_size;
+      nir->info.cull_distance_array_size = cull_array_size;
    }
 
    if (clip)
diff --git a/src/compiler/nir/nir_lower_gs_intrinsics.c b/src/compiler/nir/nir_lower_gs_intrinsics.c
index 68e20dd..4ddace9 100644
--- a/src/compiler/nir/nir_lower_gs_intrinsics.c
+++ b/src/compiler/nir/nir_lower_gs_intrinsics.c
@@ -77,7 +77,7 @@
    nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);
 
    nir_ssa_def *max_vertices =
-      nir_imm_int(b, b->shader->info->gs.vertices_out);
+      nir_imm_int(b, b->shader->info.gs.vertices_out);
 
    /* Create: if (vertex_count < max_vertices) and insert it.
     *
diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c
index 1ae2cc7..bb1cdec 100644
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@@ -68,7 +68,7 @@
  * (such as geometry shader inputs).
  */
 bool
-nir_is_per_vertex_io(nir_variable *var, gl_shader_stage stage)
+nir_is_per_vertex_io(const nir_variable *var, gl_shader_stage stage)
 {
    if (var->data.patch || !glsl_type_is_array(var->type))
       return false;
diff --git a/src/compiler/nir/nir_lower_io_to_scalar.c b/src/compiler/nir/nir_lower_io_to_scalar.c
index f2345d5..fffd1d3 100644
--- a/src/compiler/nir/nir_lower_io_to_scalar.c
+++ b/src/compiler/nir/nir_lower_io_to_scalar.c
@@ -49,7 +49,7 @@
       nir_intrinsic_set_base(chan_intr, nir_intrinsic_base(intr));
       nir_intrinsic_set_component(chan_intr, nir_intrinsic_component(intr) + i);
       /* offset */
-      chan_intr->src[0] = intr->src[0];
+      nir_src_copy(&chan_intr->src[0], &intr->src[0], chan_intr);
 
       nir_builder_instr_insert(b, &chan_intr->instr);
 
@@ -84,7 +84,7 @@
       /* value */
       chan_intr->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
       /* offset */
-      chan_intr->src[1] = intr->src[1];
+      nir_src_copy(&chan_intr->src[1], &intr->src[1], chan_intr);
 
       nir_builder_instr_insert(b, &chan_intr->instr);
    }
diff --git a/src/compiler/nir/nir_lower_io_to_temporaries.c b/src/compiler/nir/nir_lower_io_to_temporaries.c
index 6031bbd..d2df14e 100644
--- a/src/compiler/nir/nir_lower_io_to_temporaries.c
+++ b/src/compiler/nir/nir_lower_io_to_temporaries.c
@@ -141,6 +141,7 @@
    temp->data.mode = nir_var_global;
    temp->data.read_only = false;
    temp->data.fb_fetch_output = false;
+   temp->data.compact = false;
 
    return nvar;
 }
diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c
index f1af237..d0667bc 100644
--- a/src/compiler/nir/nir_lower_locals_to_regs.c
+++ b/src/compiler/nir/nir_lower_locals_to_regs.c
@@ -26,7 +26,6 @@
  */
 
 #include "nir.h"
-#include "nir_array.h"
 
 struct locals_to_regs_state {
    nir_shader *shader;
diff --git a/src/compiler/nir/nir_lower_read_invocation_to_scalar.c b/src/compiler/nir/nir_lower_read_invocation_to_scalar.c
new file mode 100644
index 0000000..69e7c0a
--- /dev/null
+++ b/src/compiler/nir/nir_lower_read_invocation_to_scalar.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+
+/** @file nir_lower_read_invocation_to_scalar.c
+ *
+ * Replaces nir_intrinsic_read_invocation/nir_intrinsic_read_first_invocation
+ * operations with num_components != 1 with individual per-channel operations.
+ */
+
+static void
+lower_read_invocation_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
+{
+   const unsigned count = intrin->num_components;
+   const bool has_invocation_src =
+      intrin->intrinsic == nir_intrinsic_read_invocation;
+   nir_ssa_def *channels[4];
+
+   /* New instructions go right before the vector operation. */
+   b->cursor = nir_before_instr(&intrin->instr);
+   nir_ssa_def *vec_src = nir_ssa_for_src(b, intrin->src[0], count);
+
+   for (unsigned c = 0; c < count; c++) {
+      nir_intrinsic_instr *scalar =
+         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
+      nir_ssa_dest_init(&scalar->instr, &scalar->dest,
+                        1, intrin->dest.ssa.bit_size, NULL);
+      scalar->num_components = 1;
+      /* src[0] is one channel of the value; src[1], if any, is copied. */
+      scalar->src[0] = nir_src_for_ssa(nir_channel(b, vec_src, c));
+      if (has_invocation_src)
+         nir_src_copy(&scalar->src[1], &intrin->src[1], scalar);
+      nir_builder_instr_insert(b, &scalar->instr);
+      channels[c] = &scalar->dest.ssa;
+   }
+
+   /* Reassemble the per-channel results and retire the original. */
+   nir_ssa_def *merged = nir_vec(b, channels, count);
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(merged));
+   nir_instr_remove(&intrin->instr);
+}
+
+static bool
+nir_lower_read_invocation_to_scalar_impl(nir_function_impl *impl)
+{
+   nir_builder build;
+   nir_builder_init(&build, impl);
+   bool changed = false;
+
+   nir_foreach_block(blk, impl) {
+      nir_foreach_instr_safe(cur, blk) {
+         if (cur->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(cur);
+
+         /* Only the two read-invocation intrinsics are handled here. */
+         const bool is_read =
+            intr->intrinsic == nir_intrinsic_read_invocation ||
+            intr->intrinsic == nir_intrinsic_read_first_invocation;
+
+         /* Already-scalar operations need no lowering. */
+         if (!is_read || intr->num_components == 1)
+            continue;
+
+         lower_read_invocation_to_scalar(&build, intr);
+         changed = true;
+      }
+   }
+
+   /* Metadata is only touched when something was actually lowered. */
+   if (!changed)
+      return false;
+   nir_metadata_preserve(impl, nir_metadata_block_index |
+                               nir_metadata_dominance);
+   return true;
+}
+
+bool
+nir_lower_read_invocation_to_scalar(nir_shader *shader)
+{
+   /* Run the per-impl lowering on every function with a non-NULL impl. */
+   bool any_changed = false;
+   nir_foreach_function(func, shader) {
+      if (!func->impl)
+         continue;
+      any_changed |= nir_lower_read_invocation_to_scalar_impl(func->impl);
+   }
+   return any_changed;
+}
diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
index 6ad5ad6..ba20d30 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -58,9 +58,9 @@
           */
 
          nir_const_value local_size;
-         local_size.u32[0] = b->shader->info->cs.local_size[0];
-         local_size.u32[1] = b->shader->info->cs.local_size[1];
-         local_size.u32[2] = b->shader->info->cs.local_size[2];
+         local_size.u32[0] = b->shader->info.cs.local_size[0];
+         local_size.u32[1] = b->shader->info.cs.local_size[1];
+         local_size.u32[2] = b->shader->info.cs.local_size[2];
 
          nir_ssa_def *group_id = nir_load_work_group_id(b);
          nir_ssa_def *local_id = nir_load_local_invocation_id(b);
@@ -88,9 +88,9 @@
          nir_ssa_def *local_id = nir_load_local_invocation_id(b);
 
          nir_ssa_def *size_x =
-            nir_imm_int(b, b->shader->info->cs.local_size[0]);
+            nir_imm_int(b, b->shader->info.cs.local_size[0]);
          nir_ssa_def *size_y =
-            nir_imm_int(b, b->shader->info->cs.local_size[1]);
+            nir_imm_int(b, b->shader->info.cs.local_size[1]);
 
          sysval = nir_imul(b, nir_channel(b, local_id, 2),
                               nir_imul(b, size_x, size_y));
@@ -116,6 +116,20 @@
                            nir_load_base_instance(b));
          break;
 
+      case SYSTEM_VALUE_SUBGROUP_EQ_MASK:
+      case SYSTEM_VALUE_SUBGROUP_GE_MASK:
+      case SYSTEM_VALUE_SUBGROUP_GT_MASK:
+      case SYSTEM_VALUE_SUBGROUP_LE_MASK:
+      case SYSTEM_VALUE_SUBGROUP_LT_MASK: {
+         nir_intrinsic_op op =
+            nir_intrinsic_from_system_value(var->data.location);
+         nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op);
+         nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
+         nir_builder_instr_insert(b, &load->instr);
+         sysval = &load->dest.ssa;
+         break;
+      }
+
       default:
          break;
       }
diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c
index 4ef8195..65681de 100644
--- a/src/compiler/nir/nir_lower_tex.c
+++ b/src/compiler/nir/nir_lower_tex.c
@@ -301,6 +301,20 @@
                       nir_channel(b, xuxv, 3));
 }
 
+static void
+lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex)
+{
+  b->cursor = nir_after_instr(&tex->instr);
+  /* Sample plane 0 and plane 1; new code lands after the original tex. */
+  nir_ssa_def *y = sample_plane(b, tex, 0);
+  nir_ssa_def *uxvx = sample_plane(b, tex, 1);
+  /* Convert channel 1 of plane 0 with channels 0 and 2 of plane 1. */
+  convert_yuv_to_rgb(b, tex,
+                     nir_channel(b, y, 1),
+                     nir_channel(b, uxvx, 0),
+                     nir_channel(b, uxvx, 2));
+}
+
 /*
  * Emits a textureLod operation used to replace an existing
  * textureGrad instruction.
@@ -760,6 +774,10 @@
          progress = true;
       }
 
+      if ((1 << tex->texture_index) & options->lower_xy_uxvx_external) {
+         lower_xy_uxvx_external(b, tex);
+         progress = true;
+      }
 
       if (sat_mask) {
          saturate_src(b, tex, sat_mask);
diff --git a/src/compiler/nir/nir_lower_two_sided_color.c b/src/compiler/nir/nir_lower_two_sided_color.c
index 7d1a3bd..90da101 100644
--- a/src/compiler/nir/nir_lower_two_sided_color.c
+++ b/src/compiler/nir/nir_lower_two_sided_color.c
@@ -46,7 +46,8 @@
  */
 
 static nir_variable *
-create_input(nir_shader *shader, unsigned drvloc, gl_varying_slot slot)
+create_input(nir_shader *shader, unsigned drvloc, gl_varying_slot slot,
+             enum glsl_interp_mode interpolation)
 {
    nir_variable *var = rzalloc(shader, nir_variable);
 
@@ -56,6 +57,7 @@
    var->name = ralloc_asprintf(var, "in_%d", drvloc);
    var->data.index = 0;
    var->data.location = slot;
+   var->data.interpolation = interpolation;
 
    exec_list_push_tail(&shader->inputs, &var->node);
 
@@ -116,7 +118,9 @@
       else
          slot = VARYING_SLOT_BFC1;
 
-      state->colors[i].back = create_input(state->shader, ++maxloc, slot);
+      state->colors[i].back = create_input(
+            state->shader, ++maxloc, slot,
+            state->colors[i].front->data.interpolation);
    }
 
    return 0;
diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c
index e5a12eb..e8cfe30 100644
--- a/src/compiler/nir/nir_lower_vars_to_ssa.c
+++ b/src/compiler/nir/nir_lower_vars_to_ssa.c
@@ -245,8 +245,12 @@
 
       case nir_deref_type_struct: {
          nir_deref_struct *str = nir_deref_as_struct(deref->child);
-         return foreach_deref_node_worker(node->children[str->index],
-                                          deref->child, cb, state);
+         if (node->children[str->index] &&
+             !foreach_deref_node_worker(node->children[str->index],
+                                        deref->child, cb, state))
+            return false;
+
+         return true;
       }
 
       default:
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 31b4615..a64a28e 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -308,7 +308,7 @@
 
 unop_convert("ufind_msb", tint32, tuint32, """
 dst = -1;
-for (int bit = 31; bit > 0; bit--) {
+for (int bit = 31; bit >= 0; bit--) {
    if ((src0 >> bit) & 1) {
       dst = bit;
       break;
@@ -467,7 +467,7 @@
 # and false respectively
 
 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
-binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
+binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 49c1460..ad75228 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -158,6 +158,10 @@
    # a != 0.0
    (('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
 
+   # Skip this optimization when the result is used by a bcsel or an if, so
+   # that we can make use of conditional modifiers on supported hardware.
+   (('flt(is_not_used_by_conditional)', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),
+
    (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
    (('bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
    (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
@@ -246,8 +250,8 @@
    (('ishr', a, 0), a),
    (('ushr', 0, a), 0),
    (('ushr', a, 0), a),
-   (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)),
-   (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)),
+   (('iand', 0xff, ('ushr@32', a, 24)), ('ushr', a, 24)),
+   (('iand', 0xffff, ('ushr@32', a, 16)), ('ushr', a, 16)),
    # Exponential/logarithmic identities
    (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
    (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
@@ -339,14 +343,31 @@
    (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
    (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
 
+   # Propagate constants up multiplication chains
+   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
+   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
+   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
+   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
+
    # Reassociate constants in add/mul chains so they can be folded together.
-   # For now, we only handle cases where the constants are separated by
+   # For now, we mostly only handle cases where the constants are separated by
    # a single non-constant.  We could do better eventually.
    (('~fmul', '#a', ('fmul', b, '#c')), ('fmul', ('fmul', a, c), b)),
    (('imul', '#a', ('imul', b, '#c')), ('imul', ('imul', a, c), b)),
    (('~fadd', '#a', ('fadd', b, '#c')), ('fadd', ('fadd', a, c), b)),
    (('iadd', '#a', ('iadd', b, '#c')), ('iadd', ('iadd', a, c), b)),
 
+   # By definition...
+   (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
+   (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
+   (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
+
+   (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
+   (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
+   (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
+
+   (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
+
    # Misc. lowering
    (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod32'),
    (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod64'),
@@ -520,6 +541,27 @@
        ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
    ]
 
+# This section contains "late" optimizations that should be run before
+# creating ffmas and calling regular optimizations for the final time.
+# Optimizations should go here if they help code generation and conflict
+# with the regular optimizations.
+before_ffma_optimizations = [
+   # Propagate constants down multiplication chains
+   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
+   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
+   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
+   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),
+   # Factor out a common multiplicand; cancel a value against its negation
+   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
+   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
+   (('~fadd', ('fneg', a), a), 0.0),
+   (('iadd', ('ineg', a), a), 0),
+   (('iadd', ('ineg', a), ('iadd', a, b)), b),
+   (('iadd', a, ('iadd', ('ineg', a), b)), b),
+   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
+   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
+]
+
 # This section contains "late" optimizations that should be run after the
 # regular optimizations have finished.  Optimizations should go here if
 # they help code generation but do not necessarily produce code that is
@@ -546,5 +588,7 @@
 ]
 
 print nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render()
+print nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
+                                  before_ffma_optimizations).render()
 print nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
                                   late_optimizations).render()
diff --git a/src/compiler/nir/nir_opt_copy_prop_vars.c b/src/compiler/nir/nir_opt_copy_prop_vars.c
index 7f17469..89ddc8d 100644
--- a/src/compiler/nir/nir_opt_copy_prop_vars.c
+++ b/src/compiler/nir/nir_opt_copy_prop_vars.c
@@ -469,8 +469,8 @@
    nir_deref_var *ret = nir_deref_var_create(mem_ctx, deref->var);
 
    nir_deref *deref_tail = deref->deref.child;
-   nir_deref *guide_tail = guide->deref.child;
-   nir_deref *spec_tail = specific->deref.child;
+   nir_deref *guide_tail = &guide->deref;
+   nir_deref *spec_tail = &specific->deref;
    nir_deref *ret_tail = &ret->deref;
    while (deref_tail) {
       switch (deref_tail->deref_type) {
@@ -495,14 +495,14 @@
              * the entry deref to find its corresponding wildcard and fill
              * this slot in with the value from the src.
              */
-            while (guide_tail) {
+            while (guide_tail->child) {
+               guide_tail = guide_tail->child;
+               spec_tail = spec_tail->child;
+
                if (guide_tail->deref_type == nir_deref_type_array &&
                    nir_deref_as_array(guide_tail)->deref_array_type ==
                    nir_deref_array_type_wildcard)
                   break;
-
-               guide_tail = guide_tail->child;
-               spec_tail = spec_tail->child;
             }
 
             nir_deref_array *spec_arr = nir_deref_as_array(spec_tail);
diff --git a/src/compiler/nir/nir_opt_intrinsics.c b/src/compiler/nir/nir_opt_intrinsics.c
new file mode 100644
index 0000000..68c0041
--- /dev/null
+++ b/src/compiler/nir/nir_opt_intrinsics.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+
+/**
+ * \file nir_opt_intrinsics.c
+ */
+
+static nir_ssa_def *
+high_subgroup_mask(nir_builder *b,
+                   nir_ssa_def *count,
+                   uint64_t base_mask)
+{
+   /* group_mask could probably be calculated more efficiently but we want to
+    * be sure not to shift by 64 if the subgroup size is 64 because the GLSL
+    * shift operator is undefined in that case. In any case if we were worried
+    * about efficiency this should probably be done further down because the
+    * subgroup size is likely to be known at compile time.
+    */
+   nir_ssa_def *subgroup_size = nir_load_subgroup_size(b);
+   nir_ssa_def *all_bits = nir_imm_int64(b, ~0ull);
+   nir_ssa_def *shift = nir_isub(b, nir_imm_int(b, 64), subgroup_size);
+   nir_ssa_def *group_mask = nir_ushr(b, all_bits, shift);
+   nir_ssa_def *higher_bits = nir_ishl(b, nir_imm_int64(b, base_mask), count);
+   /* Result: base_mask shifted left by count, restricted to group_mask. */
+   return nir_iand(b, higher_bits, group_mask);
+}
+
+static bool
+opt_intrinsics_impl(nir_function_impl *impl)
+{
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   bool progress = false;
+   /* Visit every intrinsic in impl; handled ones are replaced below. */
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         nir_ssa_def *replacement = NULL;
+         b.cursor = nir_before_instr(instr);
+         /* Each case sets replacement, leaves it NULL, or skips the instr. */
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_vote_any:
+         case nir_intrinsic_vote_all: {
+            nir_const_value *val = nir_src_as_const_value(intrin->src[0]);
+            if (!val && !b.shader->options->lower_vote_trivial)
+               continue;
+            /* Constant source or lower_vote_trivial: use the source itself. */
+            replacement = nir_ssa_for_src(&b, intrin->src[0], 1);
+            break;
+         }
+         case nir_intrinsic_vote_eq: {
+            nir_const_value *val = nir_src_as_const_value(intrin->src[0]);
+            if (!val && !b.shader->options->lower_vote_trivial)
+               continue;
+            /* Constant source or lower_vote_trivial: fold to NIR_TRUE. */
+            replacement = nir_imm_int(&b, NIR_TRUE);
+            break;
+         }
+         case nir_intrinsic_ballot: {
+            assert(b.shader->options->max_subgroup_size != 0);
+            if (b.shader->options->max_subgroup_size > 32 ||
+                intrin->dest.ssa.bit_size <= 32)
+               continue;
+            /* max_subgroup_size <= 32, dest wider than 32: 32-bit ballot. */
+            nir_intrinsic_instr *ballot =
+               nir_intrinsic_instr_create(b.shader, nir_intrinsic_ballot);
+            nir_ssa_dest_init(&ballot->instr, &ballot->dest, 1, 32, NULL);
+            nir_src_copy(&ballot->src[0], &intrin->src[0], ballot);
+
+            nir_builder_instr_insert(&b, &ballot->instr);
+            /* Pack the 32-bit ballot with a zero word into the result. */
+            replacement = nir_pack_64_2x32_split(&b,
+                                                 &ballot->dest.ssa,
+                                                 nir_imm_int(&b, 0));
+            break;
+         }
+         case nir_intrinsic_load_subgroup_eq_mask:
+         case nir_intrinsic_load_subgroup_ge_mask:
+         case nir_intrinsic_load_subgroup_gt_mask:
+         case nir_intrinsic_load_subgroup_le_mask:
+         case nir_intrinsic_load_subgroup_lt_mask: {
+            if (!b.shader->options->lower_subgroup_masks)
+               break;
+            /* Masks are 64-bit constants shifted by the subgroup invocation. */
+            nir_ssa_def *count = nir_load_subgroup_invocation(&b);
+
+            switch (intrin->intrinsic) {
+            case nir_intrinsic_load_subgroup_eq_mask:
+               replacement = nir_ishl(&b, nir_imm_int64(&b, 1ull), count);
+               break;
+            case nir_intrinsic_load_subgroup_ge_mask:
+               replacement = high_subgroup_mask(&b, count, ~0ull);
+               break;
+            case nir_intrinsic_load_subgroup_gt_mask:
+               replacement = high_subgroup_mask(&b, count, ~1ull);
+               break;
+            case nir_intrinsic_load_subgroup_le_mask:
+               replacement = nir_inot(&b, nir_ishl(&b, nir_imm_int64(&b, ~1ull), count));
+               break;
+            case nir_intrinsic_load_subgroup_lt_mask:
+               replacement = nir_inot(&b, nir_ishl(&b, nir_imm_int64(&b, ~0ull), count));
+               break;
+            default:
+               unreachable("you seriously can't tell this is unreachable?");
+            }
+            break;
+         }
+         default:
+            break;
+         }
+         /* No replacement chosen: leave the instruction untouched. */
+         if (!replacement)
+            continue;
+         /* Redirect all uses to the replacement and drop the original. */
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                  nir_src_for_ssa(replacement));
+         nir_instr_remove(instr);
+         nir_metadata_preserve(impl, nir_metadata_block_index |
+                                     nir_metadata_dominance);
+         progress = true;
+      }
+   }
+
+   return progress;
+}
+
+bool
+nir_opt_intrinsics(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function(function, shader) {
+      if (function->impl)
+         progress |= opt_intrinsics_impl(function->impl);
+   }
+   /* True if any function implementation was changed by the pass. */
+   return progress;
+}
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index e6c5c9c..f4811fe 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -31,10 +31,6 @@
 #include <stdlib.h>
 #include <inttypes.h> /* for PRIx64 macro */
 
-#if defined(_WIN32) && !defined(snprintf)
-#define snprintf _snprintf
-#endif
-
 static void
 print_tabs(unsigned num_tabs, FILE *fp)
 {
@@ -261,7 +257,7 @@
 get_var_name(nir_variable *var, print_state *state)
 {
    if (state->ht == NULL)
-      return var->name;
+      return var->name ? var->name : "unnamed";
 
    assert(state->syms);
 
@@ -1163,20 +1159,20 @@
 
    fprintf(fp, "shader: %s\n", gl_shader_stage_name(shader->stage));
 
-   if (shader->info->name)
-      fprintf(fp, "name: %s\n", shader->info->name);
+   if (shader->info.name)
+      fprintf(fp, "name: %s\n", shader->info.name);
 
-   if (shader->info->label)
-      fprintf(fp, "label: %s\n", shader->info->label);
+   if (shader->info.label)
+      fprintf(fp, "label: %s\n", shader->info.label);
 
    switch (shader->stage) {
    case MESA_SHADER_COMPUTE:
       fprintf(fp, "local-size: %u, %u, %u%s\n",
-              shader->info->cs.local_size[0],
-              shader->info->cs.local_size[1],
-              shader->info->cs.local_size[2],
-              shader->info->cs.local_size_variable ? " (variable)" : "");
-      fprintf(fp, "shared-size: %u\n", shader->info->cs.shared_size);
+              shader->info.cs.local_size[0],
+              shader->info.cs.local_size[1],
+              shader->info.cs.local_size[2],
+              shader->info.cs.local_size_variable ? " (variable)" : "");
+      fprintf(fp, "shared-size: %u\n", shader->info.cs.shared_size);
       break;
    default:
       break;
diff --git a/src/compiler/nir/nir_search_helpers.h b/src/compiler/nir/nir_search_helpers.h
index faa3bdf..200f247 100644
--- a/src/compiler/nir/nir_search_helpers.h
+++ b/src/compiler/nir/nir_search_helpers.h
@@ -115,6 +115,18 @@
 }
 
 static inline bool
+is_not_const(nir_alu_instr *instr, unsigned src, unsigned num_components,
+             const uint8_t *swizzle)
+{
+   /* Search-expression predicate: true when ALU source `src` of `instr`
+    * does not resolve to a constant value.  num_components and swizzle
+    * are not consulted.
+    */
+   nir_const_value *konst = nir_src_as_const_value(instr->src[src].src);
+   return konst ? false : true;
+}
+
+static inline bool
 is_used_more_than_once(nir_alu_instr *instr)
 {
    bool zero_if_use = list_empty(&instr->dest.dest.ssa.if_uses);
@@ -158,4 +170,19 @@
    return list_empty(&instr->dest.dest.ssa.if_uses);
 }
 
+static inline bool
+is_not_used_by_conditional(nir_alu_instr *instr)
+{
+   /* Reject results that feed an if condition or any bcsel instruction. */
+   if (!is_not_used_by_if(instr))
+      return false;
+   nir_foreach_use(src, &instr->dest.dest.ssa) {
+      nir_instr *user = src->parent_instr;
+      if (user->type == nir_instr_type_alu &&
+          nir_instr_as_alu(user)->op == nir_op_bcsel)
+         return false;
+   }
+   return true;
+}
+
 #endif /* _NIR_SEARCH_ */
diff --git a/src/compiler/nir/nir_sweep.c b/src/compiler/nir/nir_sweep.c
index e6ae298..0f1debc 100644
--- a/src/compiler/nir/nir_sweep.c
+++ b/src/compiler/nir/nir_sweep.c
@@ -150,20 +150,12 @@
 {
    void *rubbish = ralloc_context(NULL);
 
-   /* The shader may not own shader_info so check first */
-   bool steal_info = false;
-   if (nir == ralloc_parent(nir->info))
-      steal_info = true;
-
    /* First, move ownership of all the memory to a temporary context; assume dead. */
    ralloc_adopt(rubbish, nir);
 
-   if (steal_info)
-      ralloc_steal(nir, nir->info);
-
-   ralloc_steal(nir, (char *)nir->info->name);
-   if (nir->info->label)
-      ralloc_steal(nir, (char *)nir->info->label);
+   ralloc_steal(nir, (char *)nir->info.name);
+   if (nir->info.label)
+      ralloc_steal(nir, (char *)nir->info.label);
 
    /* Variables and registers are not dead.  Steal them back. */
    steal_list(nir, nir_variable, &nir->uniforms);
diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp
index 52fd0e9..25980b9 100644
--- a/src/compiler/nir_types.cpp
+++ b/src/compiler/nir_types.cpp
@@ -339,6 +339,17 @@
    return glsl_type::get_record_instance(fields, num_fields, name);
 }
 
+/* Wrapper that forwards to glsl_type::get_interface_instance(). */
+const glsl_type *
+glsl_interface_type(const glsl_struct_field *fields, unsigned num_fields,
+                    enum glsl_interface_packing packing, bool row_major,
+                    const char *block_name)
+{
+   const glsl_type *const iface = glsl_type::get_interface_instance(
+      fields, num_fields, packing, row_major, block_name);
+   return iface;
+}
+
 const struct glsl_type *
 glsl_sampler_type(enum glsl_sampler_dim dim, bool is_shadow, bool is_array,
                   enum glsl_base_type base_type)
diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h
index 228d33a..0c52bb9 100644
--- a/src/compiler/nir_types.h
+++ b/src/compiler/nir_types.h
@@ -142,6 +142,11 @@
                                         unsigned elements);
 const struct glsl_type *glsl_struct_type(const struct glsl_struct_field *fields,
                                          unsigned num_fields, const char *name);
+const struct glsl_type *glsl_interface_type(const struct glsl_struct_field *fields,
+                                            unsigned num_fields,
+                                            enum glsl_interface_packing packing,
+                                            bool row_major,
+                                            const char *block_name);
 const struct glsl_type *glsl_sampler_type(enum glsl_sampler_dim dim,
                                           bool is_shadow, bool is_array,
                                           enum glsl_base_type base_type);
diff --git a/src/compiler/shader_enums.c b/src/compiler/shader_enums.c
index ca62cda..b2ca80b 100644
--- a/src/compiler/shader_enums.c
+++ b/src/compiler/shader_enums.c
@@ -162,6 +162,7 @@
       ENUM(VARYING_SLOT_TESS_LEVEL_INNER),
       ENUM(VARYING_SLOT_BOUNDING_BOX0),
       ENUM(VARYING_SLOT_BOUNDING_BOX1),
+      ENUM(VARYING_SLOT_VIEW_INDEX),
       ENUM(VARYING_SLOT_VAR0),
       ENUM(VARYING_SLOT_VAR1),
       ENUM(VARYING_SLOT_VAR2),
@@ -232,6 +233,7 @@
      ENUM(SYSTEM_VALUE_GLOBAL_INVOCATION_ID),
      ENUM(SYSTEM_VALUE_WORK_GROUP_ID),
      ENUM(SYSTEM_VALUE_NUM_WORK_GROUPS),
+     ENUM(SYSTEM_VALUE_VIEW_INDEX),
      ENUM(SYSTEM_VALUE_VERTEX_CNT),
    };
    STATIC_ASSERT(ARRAY_SIZE(names) == SYSTEM_VALUE_MAX);
diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h
index 930d997..352f270 100644
--- a/src/compiler/shader_enums.h
+++ b/src/compiler/shader_enums.h
@@ -39,6 +39,7 @@
  */
 typedef enum
 {
+   MESA_SHADER_NONE = -1,
    MESA_SHADER_VERTEX = 0,
    MESA_SHADER_TESS_CTRL = 1,
    MESA_SHADER_TESS_EVAL = 2,
@@ -217,6 +218,7 @@
    VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
    VARYING_SLOT_BOUNDING_BOX0, /* Only appears as TCS output. */
    VARYING_SLOT_BOUNDING_BOX1, /* Only appears as TCS output. */
+   VARYING_SLOT_VIEW_INDEX,
    VARYING_SLOT_VAR0, /* First generic varying slot */
    /* the remaining are simply for the benefit of gl_varying_slot_name()
     * and not to be construed as an upper bound:
@@ -535,6 +537,9 @@
    SYSTEM_VALUE_LOCAL_GROUP_SIZE,
    /*@}*/
 
+   /** Required for VK_KHX_multiview */
+   SYSTEM_VALUE_VIEW_INDEX,
+
    /**
     * Driver internal vertex-count, used (for example) for drivers to
     * calculate stride for stream-out outputs.  Not externally visible.
diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index a670841..3841394 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -32,14 +32,14 @@
 #endif
 
 typedef struct shader_info {
-   /** The shader stage, such as MESA_SHADER_VERTEX. */
-   gl_shader_stage stage;
-
    const char *name;
 
    /* Descriptive name provided by the client; may be NULL */
    const char *label;
 
+   /** The shader stage, such as MESA_SHADER_VERTEX. */
+   gl_shader_stage stage;
+
    /* Number of textures used by this shader */
    unsigned num_textures;
    /* Number of uniform buffers used by this shader */
diff --git a/src/compiler/spirv/.gitignore b/src/compiler/spirv/.gitignore
new file mode 100644
index 0000000..f723c31
--- /dev/null
+++ b/src/compiler/spirv/.gitignore
@@ -0,0 +1 @@
+/spirv_info.c
diff --git a/src/compiler/spirv/BUILD.gn b/src/compiler/spirv/BUILD.gn
index 974da27..388c458 100644
--- a/src/compiler/spirv/BUILD.gn
+++ b/src/compiler/spirv/BUILD.gn
@@ -32,6 +32,7 @@
   ]
 
   deps = [
+    ":spirv_info",
     "$mesa_build_root/include:c_compat",
     "$mesa_build_root/src/compiler/nir",
     "$mesa_build_root/src/util",
@@ -40,7 +41,7 @@
   include_dirs = [ ".." ]
 
   sources = [
-    "spirv_info.c",
+    "$target_gen_dir/spirv_info.c",
     "spirv_to_nir.c",
     "vtn_alu.c",
     "vtn_cfg.c",
@@ -49,3 +50,27 @@
     "vtn_variables.c",
   ]
 }
+
+action("spirv_info") {
+  output_name = "spirv_info.c"
+  script_name = "spirv_info_c.py"
+
+  script = "$mesa_build_root/scripts/gn_script_wrapper.py"
+
+  outputs = [
+    "$target_gen_dir/$output_name",
+  ]
+
+  inputs = [
+    "spirv.core.grammar.json",
+    script_name,
+  ]
+
+  args = [
+    "$magma_python_path",
+    rebase_path(target_gen_dir) + "/stdout",
+    rebase_path(".") + "/$script_name",
+    rebase_path(".") + "/spirv.core.grammar.json",
+    rebase_path(target_gen_dir) + "/$output_name",
+  ]
+}
\ No newline at end of file
diff --git a/src/compiler/spirv/nir_spirv.h b/src/compiler/spirv/nir_spirv.h
index 1779d1c..83577fb 100644
--- a/src/compiler/spirv/nir_spirv.h
+++ b/src/compiler/spirv/nir_spirv.h
@@ -50,6 +50,8 @@
    bool image_read_without_format;
    bool image_write_without_format;
    bool int64;
+   bool multiview;
+   bool variable_pointers;
 };
 
 nir_function *spirv_to_nir(const uint32_t *words, size_t word_count,
diff --git a/src/compiler/spirv/spirv.core.grammar.json b/src/compiler/spirv/spirv.core.grammar.json
new file mode 100644
index 0000000..e2950dd
--- /dev/null
+++ b/src/compiler/spirv/spirv.core.grammar.json
@@ -0,0 +1,5792 @@
+{
+  "copyright" : [
+    "Copyright (c) 2014-2016 The Khronos Group Inc.",
+    "",
+    "Permission is hereby granted, free of charge, to any person obtaining a copy",
+    "of this software and/or associated documentation files (the \"Materials\"),",
+    "to deal in the Materials without restriction, including without limitation",
+    "the rights to use, copy, modify, merge, publish, distribute, sublicense,",
+    "and/or sell copies of the Materials, and to permit persons to whom the",
+    "Materials are furnished to do so, subject to the following conditions:",
+    "",
+    "The above copyright notice and this permission notice shall be included in",
+    "all copies or substantial portions of the Materials.",
+    "",
+    "MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS",
+    "STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND",
+    "HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ ",
+    "",
+    "THE MATERIALS ARE PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS",
+    "OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,",
+    "FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL",
+    "THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER",
+    "LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING",
+    "FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS",
+    "IN THE MATERIALS."
+  ],
+  "magic_number" : "0x07230203",
+  "major_version" : 1,
+  "minor_version" : 2,
+  "revision" : 1,
+  "instructions" : [
+    {
+      "opname" : "OpNop",
+      "opcode" : 0
+    },
+    {
+      "opname" : "OpUndef",
+      "opcode" : 1,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpSourceContinued",
+      "opcode" : 2,
+      "operands" : [
+        { "kind" : "LiteralString", "name" : "'Continued Source'" }
+      ]
+    },
+    {
+      "opname" : "OpSource",
+      "opcode" : 3,
+      "operands" : [
+        { "kind" : "SourceLanguage" },
+        { "kind" : "LiteralInteger",                     "name" : "'Version'" },
+        { "kind" : "IdRef",          "quantifier" : "?", "name" : "'File'" },
+        { "kind" : "LiteralString",  "quantifier" : "?", "name" : "'Source'" }
+      ]
+    },
+    {
+      "opname" : "OpSourceExtension",
+      "opcode" : 4,
+      "operands" : [
+        { "kind" : "LiteralString", "name" : "'Extension'" }
+      ]
+    },
+    {
+      "opname" : "OpName",
+      "opcode" : 5,
+      "operands" : [
+        { "kind" : "IdRef",         "name" : "'Target'" },
+        { "kind" : "LiteralString", "name" : "'Name'" }
+      ]
+    },
+    {
+      "opname" : "OpMemberName",
+      "opcode" : 6,
+      "operands" : [
+        { "kind" : "IdRef",          "name" : "'Type'" },
+        { "kind" : "LiteralInteger", "name" : "'Member'" },
+        { "kind" : "LiteralString",  "name" : "'Name'" }
+      ]
+    },
+    {
+      "opname" : "OpString",
+      "opcode" : 7,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "LiteralString", "name" : "'String'" }
+      ]
+    },
+    {
+      "opname" : "OpLine",
+      "opcode" : 8,
+      "operands" : [
+        { "kind" : "IdRef",          "name" : "'File'" },
+        { "kind" : "LiteralInteger", "name" : "'Line'" },
+        { "kind" : "LiteralInteger", "name" : "'Column'" }
+      ]
+    },
+    {
+      "opname" : "OpExtension",
+      "opcode" : 10,
+      "operands" : [
+        { "kind" : "LiteralString", "name" : "'Name'" }
+      ]
+    },
+    {
+      "opname" : "OpExtInstImport",
+      "opcode" : 11,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "LiteralString", "name" : "'Name'" }
+      ]
+    },
+    {
+      "opname" : "OpExtInst",
+      "opcode" : 12,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                                     "name" : "'Set'" },
+        { "kind" : "LiteralExtInstInteger",                     "name" : "'Instruction'" },
+        { "kind" : "IdRef",                 "quantifier" : "*", "name" : "'Operand 1', +\n'Operand 2', +\n..." }
+      ]
+    },
+    {
+      "opname" : "OpMemoryModel",
+      "opcode" : 14,
+      "operands" : [
+        { "kind" : "AddressingModel" },
+        { "kind" : "MemoryModel" }
+      ]
+    },
+    {
+      "opname" : "OpEntryPoint",
+      "opcode" : 15,
+      "operands" : [
+        { "kind" : "ExecutionModel" },
+        { "kind" : "IdRef",                              "name" : "'Entry Point'" },
+        { "kind" : "LiteralString",                      "name" : "'Name'" },
+        { "kind" : "IdRef",          "quantifier" : "*", "name" : "'Interface'" }
+      ]
+    },
+    {
+      "opname" : "OpExecutionMode",
+      "opcode" : 16,
+      "operands" : [
+        { "kind" : "IdRef",         "name" : "'Entry Point'" },
+        { "kind" : "ExecutionMode", "name" : "'Mode'" }
+      ]
+    },
+    {
+      "opname" : "OpCapability",
+      "opcode" : 17,
+      "operands" : [
+        { "kind" : "Capability", "name" : "'Capability'" }
+      ]
+    },
+    {
+      "opname" : "OpTypeVoid",
+      "opcode" : 19,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpTypeBool",
+      "opcode" : 20,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpTypeInt",
+      "opcode" : 21,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "LiteralInteger", "name" : "'Width'" },
+        { "kind" : "LiteralInteger", "name" : "'Signedness'" }
+      ]
+    },
+    {
+      "opname" : "OpTypeFloat",
+      "opcode" : 22,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "LiteralInteger", "name" : "'Width'" }
+      ]
+    },
+    {
+      "opname" : "OpTypeVector",
+      "opcode" : 23,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",          "name" : "'Component Type'" },
+        { "kind" : "LiteralInteger", "name" : "'Component Count'" }
+      ]
+    },
+    {
+      "opname" : "OpTypeMatrix",
+      "opcode" : 24,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",          "name" : "'Column Type'" },
+        { "kind" : "LiteralInteger", "name" : "'Column Count'" }
+      ],
+      "capabilities" : [ "Matrix" ]
+    },
+    {
+      "opname" : "OpTypeImage",
+      "opcode" : 25,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                               "name" : "'Sampled Type'" },
+        { "kind" : "Dim" },
+        { "kind" : "LiteralInteger",                      "name" : "'Depth'" },
+        { "kind" : "LiteralInteger",                      "name" : "'Arrayed'" },
+        { "kind" : "LiteralInteger",                      "name" : "'MS'" },
+        { "kind" : "LiteralInteger",                      "name" : "'Sampled'" },
+        { "kind" : "ImageFormat" },
+        { "kind" : "AccessQualifier", "quantifier" : "?" }
+      ]
+    },
+    {
+      "opname" : "OpTypeSampler",
+      "opcode" : 26,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpTypeSampledImage",
+      "opcode" : 27,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",    "name" : "'Image Type'" }
+      ]
+    },
+    {
+      "opname" : "OpTypeArray",
+      "opcode" : 28,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",    "name" : "'Element Type'" },
+        { "kind" : "IdRef",    "name" : "'Length'" }
+      ]
+    },
+    {
+      "opname" : "OpTypeRuntimeArray",
+      "opcode" : 29,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",    "name" : "'Element Type'" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpTypeStruct",
+      "opcode" : 30,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",    "quantifier" : "*", "name" : "'Member 0 type', +\n'member 1 type', +\n..." }
+      ]
+    },
+    {
+      "opname" : "OpTypeOpaque",
+      "opcode" : 31,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "LiteralString", "name" : "The name of the opaque type." }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpTypePointer",
+      "opcode" : 32,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "StorageClass" },
+        { "kind" : "IdRef",        "name" : "'Type'" }
+      ]
+    },
+    {
+      "opname" : "OpTypeFunction",
+      "opcode" : 33,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                        "name" : "'Return Type'" },
+        { "kind" : "IdRef",    "quantifier" : "*", "name" : "'Parameter 0 Type', +\n'Parameter 1 Type', +\n..." }
+      ]
+    },
+    {
+      "opname" : "OpTypeEvent",
+      "opcode" : 34,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpTypeDeviceEvent",
+      "opcode" : 35,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpTypeReserveId",
+      "opcode" : 36,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpTypeQueue",
+      "opcode" : 37,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpTypePipe",
+      "opcode" : 38,
+      "operands" : [
+        { "kind" : "IdResult" },
+        { "kind" : "AccessQualifier", "name" : "'Qualifier'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpTypeForwardPointer",
+      "opcode" : 39,
+      "operands" : [
+        { "kind" : "IdRef",        "name" : "'Pointer Type'" },
+        { "kind" : "StorageClass" }
+      ],
+      "capabilities" : [ "Addresses" ]
+    },
+    {
+      "opname" : "OpConstantTrue",
+      "opcode" : 41,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpConstantFalse",
+      "opcode" : 42,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpConstant",
+      "opcode" : 43,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "LiteralContextDependentNumber", "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpConstantComposite",
+      "opcode" : 44,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "quantifier" : "*", "name" : "'Constituents'" }
+      ]
+    },
+    {
+      "opname" : "OpConstantSampler",
+      "opcode" : 45,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "SamplerAddressingMode" },
+        { "kind" : "LiteralInteger",        "name" : "'Param'" },
+        { "kind" : "SamplerFilterMode" }
+      ],
+      "capabilities" : [ "LiteralSampler" ]
+    },
+    {
+      "opname" : "OpConstantNull",
+      "opcode" : 46,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpSpecConstantTrue",
+      "opcode" : 48,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpSpecConstantFalse",
+      "opcode" : 49,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpSpecConstant",
+      "opcode" : 50,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "LiteralContextDependentNumber", "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpSpecConstantComposite",
+      "opcode" : 51,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "quantifier" : "*", "name" : "'Constituents'" }
+      ]
+    },
+    {
+      "opname" : "OpSpecConstantOp",
+      "opcode" : 52,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "LiteralSpecConstantOpInteger", "name" : "'Opcode'" }
+      ]
+    },
+    {
+      "opname" : "OpFunction",
+      "opcode" : 54,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "FunctionControl" },
+        { "kind" : "IdRef",           "name" : "'Function Type'" }
+      ]
+    },
+    {
+      "opname" : "OpFunctionParameter",
+      "opcode" : 55,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpFunctionEnd",
+      "opcode" : 56
+    },
+    {
+      "opname" : "OpFunctionCall",
+      "opcode" : 57,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                            "name" : "'Function'" },
+        { "kind" : "IdRef",        "quantifier" : "*", "name" : "'Argument 0', +\n'Argument 1', +\n..." }
+      ]
+    },
+    {
+      "opname" : "OpVariable",
+      "opcode" : 59,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "StorageClass" },
+        { "kind" : "IdRef",        "quantifier" : "?", "name" : "'Initializer'" }
+      ]
+    },
+    {
+      "opname" : "OpImageTexelPointer",
+      "opcode" : 60,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Image'" },
+        { "kind" : "IdRef",        "name" : "'Coordinate'" },
+        { "kind" : "IdRef",        "name" : "'Sample'" }
+      ]
+    },
+    {
+      "opname" : "OpLoad",
+      "opcode" : 61,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                            "name" : "'Pointer'" },
+        { "kind" : "MemoryAccess", "quantifier" : "?" }
+      ]
+    },
+    {
+      "opname" : "OpStore",
+      "opcode" : 62,
+      "operands" : [
+        { "kind" : "IdRef",                            "name" : "'Pointer'" },
+        { "kind" : "IdRef",                            "name" : "'Object'" },
+        { "kind" : "MemoryAccess", "quantifier" : "?" }
+      ]
+    },
+    {
+      "opname" : "OpCopyMemory",
+      "opcode" : 63,
+      "operands" : [
+        { "kind" : "IdRef",                            "name" : "'Target'" },
+        { "kind" : "IdRef",                            "name" : "'Source'" },
+        { "kind" : "MemoryAccess", "quantifier" : "?" }
+      ]
+    },
+    {
+      "opname" : "OpCopyMemorySized",
+      "opcode" : 64,
+      "operands" : [
+        { "kind" : "IdRef",                            "name" : "'Target'" },
+        { "kind" : "IdRef",                            "name" : "'Source'" },
+        { "kind" : "IdRef",                            "name" : "'Size'" },
+        { "kind" : "MemoryAccess", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "Addresses" ]
+    },
+    {
+      "opname" : "OpAccessChain",
+      "opcode" : 65,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                            "name" : "'Base'" },
+        { "kind" : "IdRef",        "quantifier" : "*", "name" : "'Indexes'" }
+      ]
+    },
+    {
+      "opname" : "OpInBoundsAccessChain",
+      "opcode" : 66,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                            "name" : "'Base'" },
+        { "kind" : "IdRef",        "quantifier" : "*", "name" : "'Indexes'" }
+      ]
+    },
+    {
+      "opname" : "OpPtrAccessChain",
+      "opcode" : 67,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                            "name" : "'Base'" },
+        { "kind" : "IdRef",                            "name" : "'Element'" },
+        { "kind" : "IdRef",        "quantifier" : "*", "name" : "'Indexes'" }
+      ],
+      "capabilities" : [
+        "Addresses",
+        "VariablePointers",
+        "VariablePointersStorageBuffer"
+      ]
+    },
+    {
+      "opname" : "OpArrayLength",
+      "opcode" : 68,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",          "name" : "'Structure'" },
+        { "kind" : "LiteralInteger", "name" : "'Array member'" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpGenericPtrMemSemantics",
+      "opcode" : 69,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpInBoundsPtrAccessChain",
+      "opcode" : 70,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                            "name" : "'Base'" },
+        { "kind" : "IdRef",                            "name" : "'Element'" },
+        { "kind" : "IdRef",        "quantifier" : "*", "name" : "'Indexes'" }
+      ],
+      "capabilities" : [ "Addresses" ]
+    },
+    {
+      "opname" : "OpDecorate",
+      "opcode" : 71,
+      "operands" : [
+        { "kind" : "IdRef",      "name" : "'Target'" },
+        { "kind" : "Decoration" }
+      ]
+    },
+    {
+      "opname" : "OpMemberDecorate",
+      "opcode" : 72,
+      "operands" : [
+        { "kind" : "IdRef",          "name" : "'Structure Type'" },
+        { "kind" : "LiteralInteger", "name" : "'Member'" },
+        { "kind" : "Decoration" }
+      ]
+    },
+    {
+      "opname" : "OpDecorationGroup",
+      "opcode" : 73,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpGroupDecorate",
+      "opcode" : 74,
+      "operands" : [
+        { "kind" : "IdRef",                     "name" : "'Decoration Group'" },
+        { "kind" : "IdRef", "quantifier" : "*", "name" : "'Targets'" }
+      ]
+    },
+    {
+      "opname" : "OpGroupMemberDecorate",
+      "opcode" : 75,
+      "operands" : [
+        { "kind" : "IdRef",                                       "name" : "'Decoration Group'" },
+        { "kind" : "PairIdRefLiteralInteger", "quantifier" : "*", "name" : "'Targets'" }
+      ]
+    },
+    {
+      "opname" : "OpVectorExtractDynamic",
+      "opcode" : 77,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Vector'" },
+        { "kind" : "IdRef",        "name" : "'Index'" }
+      ]
+    },
+    {
+      "opname" : "OpVectorInsertDynamic",
+      "opcode" : 78,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Vector'" },
+        { "kind" : "IdRef",        "name" : "'Component'" },
+        { "kind" : "IdRef",        "name" : "'Index'" }
+      ]
+    },
+    {
+      "opname" : "OpVectorShuffle",
+      "opcode" : 79,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                              "name" : "'Vector 1'" },
+        { "kind" : "IdRef",                              "name" : "'Vector 2'" },
+        { "kind" : "LiteralInteger", "quantifier" : "*", "name" : "'Components'" }
+      ]
+    },
+    {
+      "opname" : "OpCompositeConstruct",
+      "opcode" : 80,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "quantifier" : "*", "name" : "'Constituents'" }
+      ]
+    },
+    {
+      "opname" : "OpCompositeExtract",
+      "opcode" : 81,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                              "name" : "'Composite'" },
+        { "kind" : "LiteralInteger", "quantifier" : "*", "name" : "'Indexes'" }
+      ]
+    },
+    {
+      "opname" : "OpCompositeInsert",
+      "opcode" : 82,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                              "name" : "'Object'" },
+        { "kind" : "IdRef",                              "name" : "'Composite'" },
+        { "kind" : "LiteralInteger", "quantifier" : "*", "name" : "'Indexes'" }
+      ]
+    },
+    {
+      "opname" : "OpCopyObject",
+      "opcode" : 83,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand'" }
+      ]
+    },
+    {
+      "opname" : "OpTranspose",
+      "opcode" : 84,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Matrix'" }
+      ],
+      "capabilities" : [ "Matrix" ]
+    },
+    {
+      "opname" : "OpSampledImage",
+      "opcode" : 86,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Image'" },
+        { "kind" : "IdRef",        "name" : "'Sampler'" }
+      ]
+    },
+    {
+      "opname" : "OpImageSampleImplicitLod",
+      "opcode" : 87,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpImageSampleExplicitLod",
+      "opcode" : 88,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",         "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",         "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands" }
+      ]
+    },
+    {
+      "opname" : "OpImageSampleDrefImplicitLod",
+      "opcode" : 89,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "IdRef",                             "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpImageSampleDrefExplicitLod",
+      "opcode" : 90,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",         "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",         "name" : "'Coordinate'" },
+        { "kind" : "IdRef",         "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpImageSampleProjImplicitLod",
+      "opcode" : 91,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpImageSampleProjExplicitLod",
+      "opcode" : 92,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",         "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",         "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpImageSampleProjDrefImplicitLod",
+      "opcode" : 93,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "IdRef",                             "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpImageSampleProjDrefExplicitLod",
+      "opcode" : 94,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",         "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",         "name" : "'Coordinate'" },
+        { "kind" : "IdRef",         "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpImageFetch",
+      "opcode" : 95,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ]
+    },
+    {
+      "opname" : "OpImageGather",
+      "opcode" : 96,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "IdRef",                             "name" : "'Component'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpImageDrefGather",
+      "opcode" : 97,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "IdRef",                             "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpImageRead",
+      "opcode" : 98,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ]
+    },
+    {
+      "opname" : "OpImageWrite",
+      "opcode" : 99,
+      "operands" : [
+        { "kind" : "IdRef",                             "name" : "'Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "IdRef",                             "name" : "'Texel'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ]
+    },
+    {
+      "opname" : "OpImage",
+      "opcode" : 100,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Sampled Image'" }
+      ]
+    },
+    {
+      "opname" : "OpImageQueryFormat",
+      "opcode" : 101,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Image'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpImageQueryOrder",
+      "opcode" : 102,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Image'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpImageQuerySizeLod",
+      "opcode" : 103,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Image'" },
+        { "kind" : "IdRef",        "name" : "'Level of Detail'" }
+      ],
+      "capabilities" : [ "Kernel", "ImageQuery" ]
+    },
+    {
+      "opname" : "OpImageQuerySize",
+      "opcode" : 104,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Image'" }
+      ],
+      "capabilities" : [ "Kernel", "ImageQuery" ]
+    },
+    {
+      "opname" : "OpImageQueryLod",
+      "opcode" : 105,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",        "name" : "'Coordinate'" }
+      ],
+      "capabilities" : [ "ImageQuery" ]
+    },
+    {
+      "opname" : "OpImageQueryLevels",
+      "opcode" : 106,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Image'" }
+      ],
+      "capabilities" : [ "Kernel", "ImageQuery" ]
+    },
+    {
+      "opname" : "OpImageQuerySamples",
+      "opcode" : 107,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Image'" }
+      ],
+      "capabilities" : [ "Kernel", "ImageQuery" ]
+    },
+    {
+      "opname" : "OpConvertFToU",
+      "opcode" : 109,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Float Value'" }
+      ]
+    },
+    {
+      "opname" : "OpConvertFToS",
+      "opcode" : 110,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Float Value'" }
+      ]
+    },
+    {
+      "opname" : "OpConvertSToF",
+      "opcode" : 111,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Signed Value'" }
+      ]
+    },
+    {
+      "opname" : "OpConvertUToF",
+      "opcode" : 112,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Unsigned Value'" }
+      ]
+    },
+    {
+      "opname" : "OpUConvert",
+      "opcode" : 113,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Unsigned Value'" }
+      ]
+    },
+    {
+      "opname" : "OpSConvert",
+      "opcode" : 114,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Signed Value'" }
+      ]
+    },
+    {
+      "opname" : "OpFConvert",
+      "opcode" : 115,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Float Value'" }
+      ]
+    },
+    {
+      "opname" : "OpQuantizeToF16",
+      "opcode" : 116,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpConvertPtrToU",
+      "opcode" : 117,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" }
+      ],
+      "capabilities" : [ "Addresses" ]
+    },
+    {
+      "opname" : "OpSatConvertSToU",
+      "opcode" : 118,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Signed Value'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpSatConvertUToS",
+      "opcode" : 119,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Unsigned Value'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpConvertUToPtr",
+      "opcode" : 120,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Integer Value'" }
+      ],
+      "capabilities" : [ "Addresses" ]
+    },
+    {
+      "opname" : "OpPtrCastToGeneric",
+      "opcode" : 121,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpGenericCastToPtr",
+      "opcode" : 122,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpGenericCastToPtrExplicit",
+      "opcode" : 123,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" },
+        { "kind" : "StorageClass", "name" : "'Storage'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpBitcast",
+      "opcode" : 124,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand'" }
+      ]
+    },
+    {
+      "opname" : "OpSNegate",
+      "opcode" : 126,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand'" }
+      ]
+    },
+    {
+      "opname" : "OpFNegate",
+      "opcode" : 127,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand'" }
+      ]
+    },
+    {
+      "opname" : "OpIAdd",
+      "opcode" : 128,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFAdd",
+      "opcode" : 129,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpISub",
+      "opcode" : 130,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFSub",
+      "opcode" : 131,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpIMul",
+      "opcode" : 132,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFMul",
+      "opcode" : 133,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpUDiv",
+      "opcode" : 134,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpSDiv",
+      "opcode" : 135,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFDiv",
+      "opcode" : 136,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpUMod",
+      "opcode" : 137,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpSRem",
+      "opcode" : 138,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpSMod",
+      "opcode" : 139,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFRem",
+      "opcode" : 140,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFMod",
+      "opcode" : 141,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpVectorTimesScalar",
+      "opcode" : 142,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Vector'" },
+        { "kind" : "IdRef",        "name" : "'Scalar'" }
+      ]
+    },
+    {
+      "opname" : "OpMatrixTimesScalar",
+      "opcode" : 143,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Matrix'" },
+        { "kind" : "IdRef",        "name" : "'Scalar'" }
+      ],
+      "capabilities" : [ "Matrix" ]
+    },
+    {
+      "opname" : "OpVectorTimesMatrix",
+      "opcode" : 144,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Vector'" },
+        { "kind" : "IdRef",        "name" : "'Matrix'" }
+      ],
+      "capabilities" : [ "Matrix" ]
+    },
+    {
+      "opname" : "OpMatrixTimesVector",
+      "opcode" : 145,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Matrix'" },
+        { "kind" : "IdRef",        "name" : "'Vector'" }
+      ],
+      "capabilities" : [ "Matrix" ]
+    },
+    {
+      "opname" : "OpMatrixTimesMatrix",
+      "opcode" : 146,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'LeftMatrix'" },
+        { "kind" : "IdRef",        "name" : "'RightMatrix'" }
+      ],
+      "capabilities" : [ "Matrix" ]
+    },
+    {
+      "opname" : "OpOuterProduct",
+      "opcode" : 147,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Vector 1'" },
+        { "kind" : "IdRef",        "name" : "'Vector 2'" }
+      ],
+      "capabilities" : [ "Matrix" ]
+    },
+    {
+      "opname" : "OpDot",
+      "opcode" : 148,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Vector 1'" },
+        { "kind" : "IdRef",        "name" : "'Vector 2'" }
+      ]
+    },
+    {
+      "opname" : "OpIAddCarry",
+      "opcode" : 149,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpISubBorrow",
+      "opcode" : 150,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpUMulExtended",
+      "opcode" : 151,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpSMulExtended",
+      "opcode" : 152,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpAny",
+      "opcode" : 154,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Vector'" }
+      ]
+    },
+    {
+      "opname" : "OpAll",
+      "opcode" : 155,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Vector'" }
+      ]
+    },
+    {
+      "opname" : "OpIsNan",
+      "opcode" : 156,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'x'" }
+      ]
+    },
+    {
+      "opname" : "OpIsInf",
+      "opcode" : 157,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'x'" }
+      ]
+    },
+    {
+      "opname" : "OpIsFinite",
+      "opcode" : 158,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'x'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpIsNormal",
+      "opcode" : 159,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'x'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpSignBitSet",
+      "opcode" : 160,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'x'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpLessOrGreater",
+      "opcode" : 161,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'x'" },
+        { "kind" : "IdRef",        "name" : "'y'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpOrdered",
+      "opcode" : 162,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'x'" },
+        { "kind" : "IdRef",        "name" : "'y'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpUnordered",
+      "opcode" : 163,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'x'" },
+        { "kind" : "IdRef",        "name" : "'y'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpLogicalEqual",
+      "opcode" : 164,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpLogicalNotEqual",
+      "opcode" : 165,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpLogicalOr",
+      "opcode" : 166,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpLogicalAnd",
+      "opcode" : 167,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpLogicalNot",
+      "opcode" : 168,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand'" }
+      ]
+    },
+    {
+      "opname" : "OpSelect",
+      "opcode" : 169,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Condition'" },
+        { "kind" : "IdRef",        "name" : "'Object 1'" },
+        { "kind" : "IdRef",        "name" : "'Object 2'" }
+      ]
+    },
+    {
+      "opname" : "OpIEqual",
+      "opcode" : 170,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpINotEqual",
+      "opcode" : 171,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpUGreaterThan",
+      "opcode" : 172,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpSGreaterThan",
+      "opcode" : 173,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpUGreaterThanEqual",
+      "opcode" : 174,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpSGreaterThanEqual",
+      "opcode" : 175,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpULessThan",
+      "opcode" : 176,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpSLessThan",
+      "opcode" : 177,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpULessThanEqual",
+      "opcode" : 178,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpSLessThanEqual",
+      "opcode" : 179,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFOrdEqual",
+      "opcode" : 180,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFUnordEqual",
+      "opcode" : 181,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFOrdNotEqual",
+      "opcode" : 182,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFUnordNotEqual",
+      "opcode" : 183,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFOrdLessThan",
+      "opcode" : 184,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFUnordLessThan",
+      "opcode" : 185,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFOrdGreaterThan",
+      "opcode" : 186,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFUnordGreaterThan",
+      "opcode" : 187,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFOrdLessThanEqual",
+      "opcode" : 188,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFUnordLessThanEqual",
+      "opcode" : 189,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFOrdGreaterThanEqual",
+      "opcode" : 190,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpFUnordGreaterThanEqual",
+      "opcode" : 191,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpShiftRightLogical",
+      "opcode" : 194,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Base'" },
+        { "kind" : "IdRef",        "name" : "'Shift'" }
+      ]
+    },
+    {
+      "opname" : "OpShiftRightArithmetic",
+      "opcode" : 195,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Base'" },
+        { "kind" : "IdRef",        "name" : "'Shift'" }
+      ]
+    },
+    {
+      "opname" : "OpShiftLeftLogical",
+      "opcode" : 196,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Base'" },
+        { "kind" : "IdRef",        "name" : "'Shift'" }
+      ]
+    },
+    {
+      "opname" : "OpBitwiseOr",
+      "opcode" : 197,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpBitwiseXor",
+      "opcode" : 198,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpBitwiseAnd",
+      "opcode" : 199,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand 1'" },
+        { "kind" : "IdRef",        "name" : "'Operand 2'" }
+      ]
+    },
+    {
+      "opname" : "OpNot",
+      "opcode" : 200,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Operand'" }
+      ]
+    },
+    {
+      "opname" : "OpBitFieldInsert",
+      "opcode" : 201,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Base'" },
+        { "kind" : "IdRef",        "name" : "'Insert'" },
+        { "kind" : "IdRef",        "name" : "'Offset'" },
+        { "kind" : "IdRef",        "name" : "'Count'" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpBitFieldSExtract",
+      "opcode" : 202,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Base'" },
+        { "kind" : "IdRef",        "name" : "'Offset'" },
+        { "kind" : "IdRef",        "name" : "'Count'" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpBitFieldUExtract",
+      "opcode" : 203,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Base'" },
+        { "kind" : "IdRef",        "name" : "'Offset'" },
+        { "kind" : "IdRef",        "name" : "'Count'" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpBitReverse",
+      "opcode" : 204,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Base'" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpBitCount",
+      "opcode" : 205,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Base'" }
+      ]
+    },
+    {
+      "opname" : "OpDPdx",
+      "opcode" : 207,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'P'" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpDPdy",
+      "opcode" : 208,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'P'" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpFwidth",
+      "opcode" : 209,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'P'" }
+      ],
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpDPdxFine",
+      "opcode" : 210,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'P'" }
+      ],
+      "capabilities" : [ "DerivativeControl" ]
+    },
+    {
+      "opname" : "OpDPdyFine",
+      "opcode" : 211,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'P'" }
+      ],
+      "capabilities" : [ "DerivativeControl" ]
+    },
+    {
+      "opname" : "OpFwidthFine",
+      "opcode" : 212,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'P'" }
+      ],
+      "capabilities" : [ "DerivativeControl" ]
+    },
+    {
+      "opname" : "OpDPdxCoarse",
+      "opcode" : 213,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'P'" }
+      ],
+      "capabilities" : [ "DerivativeControl" ]
+    },
+    {
+      "opname" : "OpDPdyCoarse",
+      "opcode" : 214,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'P'" }
+      ],
+      "capabilities" : [ "DerivativeControl" ]
+    },
+    {
+      "opname" : "OpFwidthCoarse",
+      "opcode" : 215,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'P'" }
+      ],
+      "capabilities" : [ "DerivativeControl" ]
+    },
+    {
+      "opname" : "OpEmitVertex",
+      "opcode" : 218,
+      "capabilities" : [ "Geometry" ]
+    },
+    {
+      "opname" : "OpEndPrimitive",
+      "opcode" : 219,
+      "capabilities" : [ "Geometry" ]
+    },
+    {
+      "opname" : "OpEmitStreamVertex",
+      "opcode" : 220,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Stream'" }
+      ],
+      "capabilities" : [ "GeometryStreams" ]
+    },
+    {
+      "opname" : "OpEndStreamPrimitive",
+      "opcode" : 221,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Stream'" }
+      ],
+      "capabilities" : [ "GeometryStreams" ]
+    },
+    {
+      "opname" : "OpControlBarrier",
+      "opcode" : 224,
+      "operands" : [
+        { "kind" : "IdScope",           "name" : "'Execution'" },
+        { "kind" : "IdScope",           "name" : "'Memory'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" }
+      ]
+    },
+    {
+      "opname" : "OpMemoryBarrier",
+      "opcode" : 225,
+      "operands" : [
+        { "kind" : "IdScope",           "name" : "'Memory'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicLoad",
+      "opcode" : 227,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicStore",
+      "opcode" : 228,
+      "operands" : [
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicExchange",
+      "opcode" : 229,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicCompareExchange",
+      "opcode" : 230,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Equal'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Unequal'" },
+        { "kind" : "IdRef",             "name" : "'Value'" },
+        { "kind" : "IdRef",             "name" : "'Comparator'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicCompareExchangeWeak",
+      "opcode" : 231,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Equal'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Unequal'" },
+        { "kind" : "IdRef",             "name" : "'Value'" },
+        { "kind" : "IdRef",             "name" : "'Comparator'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpAtomicIIncrement",
+      "opcode" : 232,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicIDecrement",
+      "opcode" : 233,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicIAdd",
+      "opcode" : 234,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicISub",
+      "opcode" : 235,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicSMin",
+      "opcode" : 236,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicUMin",
+      "opcode" : 237,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicSMax",
+      "opcode" : 238,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicUMax",
+      "opcode" : 239,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicAnd",
+      "opcode" : 240,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicOr",
+      "opcode" : 241,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpAtomicXor",
+      "opcode" : 242,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" },
+        { "kind" : "IdRef",             "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpPhi",
+      "opcode" : 245,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "PairIdRefIdRef", "quantifier" : "*", "name" : "'Variable, Parent, ...'" }
+      ]
+    },
+    {
+      "opname" : "OpLoopMerge",
+      "opcode" : 246,
+      "operands" : [
+        { "kind" : "IdRef",       "name" : "'Merge Block'" },
+        { "kind" : "IdRef",       "name" : "'Continue Target'" },
+        { "kind" : "LoopControl" }
+      ]
+    },
+    {
+      "opname" : "OpSelectionMerge",
+      "opcode" : 247,
+      "operands" : [
+        { "kind" : "IdRef",            "name" : "'Merge Block'" },
+        { "kind" : "SelectionControl" }
+      ]
+    },
+    {
+      "opname" : "OpLabel",
+      "opcode" : 248,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ]
+    },
+    {
+      "opname" : "OpBranch",
+      "opcode" : 249,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Target Label'" }
+      ]
+    },
+    {
+      "opname" : "OpBranchConditional",
+      "opcode" : 250,
+      "operands" : [
+        { "kind" : "IdRef",                              "name" : "'Condition'" },
+        { "kind" : "IdRef",                              "name" : "'True Label'" },
+        { "kind" : "IdRef",                              "name" : "'False Label'" },
+        { "kind" : "LiteralInteger", "quantifier" : "*", "name" : "'Branch weights'" }
+      ]
+    },
+    {
+      "opname" : "OpSwitch",
+      "opcode" : 251,
+      "operands" : [
+        { "kind" : "IdRef",                                       "name" : "'Selector'" },
+        { "kind" : "IdRef",                                       "name" : "'Default'" },
+        { "kind" : "PairLiteralIntegerIdRef", "quantifier" : "*", "name" : "'Target'" }
+      ]
+    },
+    {
+      "opname" : "OpKill",
+      "opcode" : 252,
+      "capabilities" : [ "Shader" ]
+    },
+    {
+      "opname" : "OpReturn",
+      "opcode" : 253
+    },
+    {
+      "opname" : "OpReturnValue",
+      "opcode" : 254,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Value'" }
+      ]
+    },
+    {
+      "opname" : "OpUnreachable",
+      "opcode" : 255
+    },
+    {
+      "opname" : "OpLifetimeStart",
+      "opcode" : 256,
+      "operands" : [
+        { "kind" : "IdRef",          "name" : "'Pointer'" },
+        { "kind" : "LiteralInteger", "name" : "'Size'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpLifetimeStop",
+      "opcode" : 257,
+      "operands" : [
+        { "kind" : "IdRef",          "name" : "'Pointer'" },
+        { "kind" : "LiteralInteger", "name" : "'Size'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpGroupAsyncCopy",
+      "opcode" : 259,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",      "name" : "'Execution'" },
+        { "kind" : "IdRef",        "name" : "'Destination'" },
+        { "kind" : "IdRef",        "name" : "'Source'" },
+        { "kind" : "IdRef",        "name" : "'Num Elements'" },
+        { "kind" : "IdRef",        "name" : "'Stride'" },
+        { "kind" : "IdRef",        "name" : "'Event'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpGroupWaitEvents",
+      "opcode" : 260,
+      "operands" : [
+        { "kind" : "IdScope", "name" : "'Execution'" },
+        { "kind" : "IdRef",   "name" : "'Num Events'" },
+        { "kind" : "IdRef",   "name" : "'Events List'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpGroupAll",
+      "opcode" : 261,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",      "name" : "'Execution'" },
+        { "kind" : "IdRef",        "name" : "'Predicate'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupAny",
+      "opcode" : 262,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",      "name" : "'Execution'" },
+        { "kind" : "IdRef",        "name" : "'Predicate'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupBroadcast",
+      "opcode" : 263,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",      "name" : "'Execution'" },
+        { "kind" : "IdRef",        "name" : "'Value'" },
+        { "kind" : "IdRef",        "name" : "'LocalId'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupIAdd",
+      "opcode" : 264,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupFAdd",
+      "opcode" : 265,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupFMin",
+      "opcode" : 266,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupUMin",
+      "opcode" : 267,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupSMin",
+      "opcode" : 268,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupFMax",
+      "opcode" : 269,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupUMax",
+      "opcode" : 270,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupSMax",
+      "opcode" : 271,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpReadPipe",
+      "opcode" : 274,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpWritePipe",
+      "opcode" : 275,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpReservedReadPipe",
+      "opcode" : 276,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Reserve Id'" },
+        { "kind" : "IdRef",        "name" : "'Index'" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpReservedWritePipe",
+      "opcode" : 277,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Reserve Id'" },
+        { "kind" : "IdRef",        "name" : "'Index'" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpReserveReadPipePackets",
+      "opcode" : 278,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Num Packets'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpReserveWritePipePackets",
+      "opcode" : 279,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Num Packets'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpCommitReadPipe",
+      "opcode" : 280,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Pipe'" },
+        { "kind" : "IdRef", "name" : "'Reserve Id'" },
+        { "kind" : "IdRef", "name" : "'Packet Size'" },
+        { "kind" : "IdRef", "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpCommitWritePipe",
+      "opcode" : 281,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Pipe'" },
+        { "kind" : "IdRef", "name" : "'Reserve Id'" },
+        { "kind" : "IdRef", "name" : "'Packet Size'" },
+        { "kind" : "IdRef", "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpIsValidReserveId",
+      "opcode" : 282,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Reserve Id'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpGetNumPipePackets",
+      "opcode" : 283,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpGetMaxPipePackets",
+      "opcode" : 284,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpGroupReserveReadPipePackets",
+      "opcode" : 285,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",      "name" : "'Execution'" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Num Packets'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpGroupReserveWritePipePackets",
+      "opcode" : 286,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",      "name" : "'Execution'" },
+        { "kind" : "IdRef",        "name" : "'Pipe'" },
+        { "kind" : "IdRef",        "name" : "'Num Packets'" },
+        { "kind" : "IdRef",        "name" : "'Packet Size'" },
+        { "kind" : "IdRef",        "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpGroupCommitReadPipe",
+      "opcode" : 287,
+      "operands" : [
+        { "kind" : "IdScope", "name" : "'Execution'" },
+        { "kind" : "IdRef",   "name" : "'Pipe'" },
+        { "kind" : "IdRef",   "name" : "'Reserve Id'" },
+        { "kind" : "IdRef",   "name" : "'Packet Size'" },
+        { "kind" : "IdRef",   "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpGroupCommitWritePipe",
+      "opcode" : 288,
+      "operands" : [
+        { "kind" : "IdScope", "name" : "'Execution'" },
+        { "kind" : "IdRef",   "name" : "'Pipe'" },
+        { "kind" : "IdRef",   "name" : "'Reserve Id'" },
+        { "kind" : "IdRef",   "name" : "'Packet Size'" },
+        { "kind" : "IdRef",   "name" : "'Packet Alignment'" }
+      ],
+      "capabilities" : [ "Pipes" ]
+    },
+    {
+      "opname" : "OpEnqueueMarker",
+      "opcode" : 291,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Queue'" },
+        { "kind" : "IdRef",        "name" : "'Num Events'" },
+        { "kind" : "IdRef",        "name" : "'Wait Events'" },
+        { "kind" : "IdRef",        "name" : "'Ret Event'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpEnqueueKernel",
+      "opcode" : 292,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                            "name" : "'Queue'" },
+        { "kind" : "IdRef",                            "name" : "'Flags'" },
+        { "kind" : "IdRef",                            "name" : "'ND Range'" },
+        { "kind" : "IdRef",                            "name" : "'Num Events'" },
+        { "kind" : "IdRef",                            "name" : "'Wait Events'" },
+        { "kind" : "IdRef",                            "name" : "'Ret Event'" },
+        { "kind" : "IdRef",                            "name" : "'Invoke'" },
+        { "kind" : "IdRef",                            "name" : "'Param'" },
+        { "kind" : "IdRef",                            "name" : "'Param Size'" },
+        { "kind" : "IdRef",                            "name" : "'Param Align'" },
+        { "kind" : "IdRef",        "quantifier" : "*", "name" : "'Local Size'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpGetKernelNDrangeSubGroupCount",
+      "opcode" : 293,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'ND Range'" },
+        { "kind" : "IdRef",        "name" : "'Invoke'" },
+        { "kind" : "IdRef",        "name" : "'Param'" },
+        { "kind" : "IdRef",        "name" : "'Param Size'" },
+        { "kind" : "IdRef",        "name" : "'Param Align'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpGetKernelNDrangeMaxSubGroupSize",
+      "opcode" : 294,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'ND Range'" },
+        { "kind" : "IdRef",        "name" : "'Invoke'" },
+        { "kind" : "IdRef",        "name" : "'Param'" },
+        { "kind" : "IdRef",        "name" : "'Param Size'" },
+        { "kind" : "IdRef",        "name" : "'Param Align'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpGetKernelWorkGroupSize",
+      "opcode" : 295,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Invoke'" },
+        { "kind" : "IdRef",        "name" : "'Param'" },
+        { "kind" : "IdRef",        "name" : "'Param Size'" },
+        { "kind" : "IdRef",        "name" : "'Param Align'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpGetKernelPreferredWorkGroupSizeMultiple",
+      "opcode" : 296,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Invoke'" },
+        { "kind" : "IdRef",        "name" : "'Param'" },
+        { "kind" : "IdRef",        "name" : "'Param Size'" },
+        { "kind" : "IdRef",        "name" : "'Param Align'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpRetainEvent",
+      "opcode" : 297,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Event'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpReleaseEvent",
+      "opcode" : 298,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Event'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpCreateUserEvent",
+      "opcode" : 299,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpIsValidEvent",
+      "opcode" : 300,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Event'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpSetUserEventStatus",
+      "opcode" : 301,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Event'" },
+        { "kind" : "IdRef", "name" : "'Status'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpCaptureEventProfilingInfo",
+      "opcode" : 302,
+      "operands" : [
+        { "kind" : "IdRef", "name" : "'Event'" },
+        { "kind" : "IdRef", "name" : "'Profiling Info'" },
+        { "kind" : "IdRef", "name" : "'Value'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpGetDefaultQueue",
+      "opcode" : 303,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpBuildNDRange",
+      "opcode" : 304,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'GlobalWorkSize'" },
+        { "kind" : "IdRef",        "name" : "'LocalWorkSize'" },
+        { "kind" : "IdRef",        "name" : "'GlobalWorkOffset'" }
+      ],
+      "capabilities" : [ "DeviceEnqueue" ]
+    },
+    {
+      "opname" : "OpImageSparseSampleImplicitLod",
+      "opcode" : 305,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseSampleExplicitLod",
+      "opcode" : 306,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",         "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",         "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseSampleDrefImplicitLod",
+      "opcode" : 307,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "IdRef",                             "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseSampleDrefExplicitLod",
+      "opcode" : 308,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",         "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",         "name" : "'Coordinate'" },
+        { "kind" : "IdRef",         "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseSampleProjImplicitLod",
+      "opcode" : 309,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseSampleProjExplicitLod",
+      "opcode" : 310,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",         "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",         "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseSampleProjDrefImplicitLod",
+      "opcode" : 311,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "IdRef",                             "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseSampleProjDrefExplicitLod",
+      "opcode" : 312,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",         "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",         "name" : "'Coordinate'" },
+        { "kind" : "IdRef",         "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseFetch",
+      "opcode" : 313,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseGather",
+      "opcode" : 314,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "IdRef",                             "name" : "'Component'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseDrefGather",
+      "opcode" : 315,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Sampled Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "IdRef",                             "name" : "'D~ref~'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpImageSparseTexelsResident",
+      "opcode" : 316,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Resident Code'" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpNoLine",
+      "opcode" : 317
+    },
+    {
+      "opname" : "OpAtomicFlagTestAndSet",
+      "opcode" : 318,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpAtomicFlagClear",
+      "opcode" : 319,
+      "operands" : [
+        { "kind" : "IdRef",             "name" : "'Pointer'" },
+        { "kind" : "IdScope",           "name" : "'Scope'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" }
+      ],
+      "capabilities" : [ "Kernel" ]
+    },
+    {
+      "opname" : "OpImageSparseRead",
+      "opcode" : 320,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",                             "name" : "'Image'" },
+        { "kind" : "IdRef",                             "name" : "'Coordinate'" },
+        { "kind" : "ImageOperands", "quantifier" : "?" }
+      ],
+      "capabilities" : [ "SparseResidency" ]
+    },
+    {
+      "opname" : "OpSizeOf",
+      "opcode" : 321,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pointer'" }
+      ],
+      "capabilities" : [ "Addresses" ]
+    },
+    {
+      "opname" : "OpTypePipeStorage",
+      "opcode" : 322,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ],
+      "capabilities" : [ "PipeStorage" ]
+    },
+    {
+      "opname" : "OpConstantPipeStorage",
+      "opcode" : 323,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "LiteralInteger", "name" : "'Packet Size'" },
+        { "kind" : "LiteralInteger", "name" : "'Packet Alignment'" },
+        { "kind" : "LiteralInteger", "name" : "'Capacity'" }
+      ],
+      "capabilities" : [ "PipeStorage" ]
+    },
+    {
+      "opname" : "OpCreatePipeFromPipeStorage",
+      "opcode" : 324,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Pipe Storage'" }
+      ],
+      "capabilities" : [ "PipeStorage" ]
+    },
+    {
+      "opname" : "OpGetKernelLocalSizeForSubgroupCount",
+      "opcode" : 325,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Subgroup Count'" },
+        { "kind" : "IdRef",        "name" : "'Invoke'" },
+        { "kind" : "IdRef",        "name" : "'Param'" },
+        { "kind" : "IdRef",        "name" : "'Param Size'" },
+        { "kind" : "IdRef",        "name" : "'Param Align'" }
+      ],
+      "capabilities" : [ "SubgroupDispatch" ]
+    },
+    {
+      "opname" : "OpGetKernelMaxNumSubgroups",
+      "opcode" : 326,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Invoke'" },
+        { "kind" : "IdRef",        "name" : "'Param'" },
+        { "kind" : "IdRef",        "name" : "'Param Size'" },
+        { "kind" : "IdRef",        "name" : "'Param Align'" }
+      ],
+      "capabilities" : [ "SubgroupDispatch" ]
+    },
+    {
+      "opname" : "OpTypeNamedBarrier",
+      "opcode" : 327,
+      "operands" : [
+        { "kind" : "IdResult" }
+      ],
+      "capabilities" : [ "NamedBarrier" ]
+    },
+    {
+      "opname" : "OpNamedBarrierInitialize",
+      "opcode" : 328,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef",        "name" : "'Subgroup Count'" }
+      ],
+      "capabilities" : [ "NamedBarrier" ]
+    },
+    {
+      "opname" : "OpMemoryNamedBarrier",
+      "opcode" : 329,
+      "operands" : [
+        { "kind" : "IdRef",             "name" : "'Named Barrier'" },
+        { "kind" : "IdScope",           "name" : "'Memory'" },
+        { "kind" : "IdMemorySemantics", "name" : "'Semantics'" }
+      ],
+      "capabilities" : [ "NamedBarrier" ]
+    },
+    {
+      "opname" : "OpModuleProcessed",
+      "opcode" : 330,
+      "operands" : [
+        { "kind" : "LiteralString", "name" : "'Process'" }
+      ]
+    },
+    {
+      "opname" : "OpExecutionModeId",
+      "opcode" : 331,
+      "operands" : [
+        { "kind" : "IdRef",           "name" : "'Entry Point'" },
+        { "kind" : "ExecutionMode",   "name" : "'Mode'" }
+      ]
+    },
+    {
+      "opname" : "OpDecorateId",
+      "opcode" : 332,
+      "operands" : [
+        { "kind" : "IdRef",      "name" : "'Target'" },
+        { "kind" : "Decoration" }
+      ]
+    },
+    {
+      "opname" : "OpSubgroupBallotKHR",
+      "opcode" : 4421,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef", "name" : "'Predicate'" }
+      ],
+      "capabilities" : [ "SubgroupBallotKHR" ]
+    },
+    {
+      "opname" : "OpSubgroupFirstInvocationKHR",
+      "opcode" : 4422,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef", "name" : "'Value'" }
+      ],
+      "capabilities" : [ "SubgroupBallotKHR" ]
+    },
+    {
+      "opname" : "OpSubgroupAllKHR",
+      "opcode" : 4428,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef", "name" : "'Predicate'" }
+      ],
+      "capabilities" : [ "SubgroupVoteKHR" ]
+    },
+    {
+      "opname" : "OpSubgroupAnyKHR",
+      "opcode" : 4429,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef", "name" : "'Predicate'" }
+      ],
+      "capabilities" : [ "SubgroupVoteKHR" ]
+    },
+    {
+      "opname" : "OpSubgroupAllEqualKHR",
+      "opcode" : 4430,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef", "name" : "'Predicate'" }
+      ],
+      "capabilities" : [ "SubgroupVoteKHR" ]
+    },
+    {
+      "opname" : "OpSubgroupReadInvocationKHR",
+      "opcode" : 4432,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdRef", "name" : "'Value'" },
+        { "kind" : "IdRef", "name" : "'Index'" }
+      ],
+      "capabilities" : [ "SubgroupBallotKHR" ]
+    },
+    {
+      "opname" : "OpGroupIAddNonUniformAMD",
+      "opcode" : 5000,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupFAddNonUniformAMD",
+      "opcode" : 5001,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupFMinNonUniformAMD",
+      "opcode" : 5002,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupUMinNonUniformAMD",
+      "opcode" : 5003,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupSMinNonUniformAMD",
+      "opcode" : 5004,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupFMaxNonUniformAMD",
+      "opcode" : 5005,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupUMaxNonUniformAMD",
+      "opcode" : 5006,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    },
+    {
+      "opname" : "OpGroupSMaxNonUniformAMD",
+      "opcode" : 5007,
+      "operands" : [
+        { "kind" : "IdResultType" },
+        { "kind" : "IdResult" },
+        { "kind" : "IdScope",        "name" : "'Execution'" },
+        { "kind" : "GroupOperation", "name" : "'Operation'" },
+        { "kind" : "IdRef",          "name" : "'X'" }
+      ],
+      "capabilities" : [ "Groups" ]
+    }
+  ],
+  "operand_kinds" : [
+    {
+      "category" : "BitEnum",
+      "kind" : "ImageOperands",
+      "enumerants" : [
+        {
+          "enumerant" : "None",
+          "value" : "0x0000"
+        },
+        {
+          "enumerant" : "Bias",
+          "value" : "0x0001",
+          "capabilities" : [ "Shader" ],
+          "parameters" : [
+            { "kind" : "IdRef" }
+          ]
+        },
+        {
+          "enumerant" : "Lod",
+          "value" : "0x0002",
+          "parameters" : [
+            { "kind" : "IdRef" }
+          ]
+        },
+        {
+          "enumerant" : "Grad",
+          "value" : "0x0004",
+          "parameters" : [
+            { "kind" : "IdRef" },
+            { "kind" : "IdRef" }
+          ]
+        },
+        {
+          "enumerant" : "ConstOffset",
+          "value" : "0x0008",
+          "parameters" : [
+            { "kind" : "IdRef" }
+          ]
+        },
+        {
+          "enumerant" : "Offset",
+          "value" : "0x0010",
+          "capabilities" : [ "ImageGatherExtended" ],
+          "parameters" : [
+            { "kind" : "IdRef" }
+          ]
+        },
+        {
+          "enumerant" : "ConstOffsets",
+          "value" : "0x0020",
+          "parameters" : [
+            { "kind" : "IdRef" }
+          ]
+        },
+        {
+          "enumerant" : "Sample",
+          "value" : "0x0040",
+          "parameters" : [
+            { "kind" : "IdRef" }
+          ]
+        },
+        {
+          "enumerant" : "MinLod",
+          "value" : "0x0080",
+          "capabilities" : [ "MinLod" ],
+          "parameters" : [
+            { "kind" : "IdRef" }
+          ]
+        }
+      ]
+    },
+    {
+      "category" : "BitEnum",
+      "kind" : "FPFastMathMode",
+      "enumerants" : [
+        {
+          "enumerant" : "None",
+          "value" : "0x0000"
+        },
+        {
+          "enumerant" : "NotNaN",
+          "value" : "0x0001",
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "NotInf",
+          "value" : "0x0002",
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "NSZ",
+          "value" : "0x0004",
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "AllowRecip",
+          "value" : "0x0008",
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Fast",
+          "value" : "0x0010",
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "BitEnum",
+      "kind" : "SelectionControl",
+      "enumerants" : [
+        {
+          "enumerant" : "None",
+          "value" : "0x0000"
+        },
+        {
+          "enumerant" : "Flatten",
+          "value" : "0x0001"
+        },
+        {
+          "enumerant" : "DontFlatten",
+          "value" : "0x0002"
+        }
+      ]
+    },
+    {
+      "category" : "BitEnum",
+      "kind" : "LoopControl",
+      "enumerants" : [
+        {
+          "enumerant" : "None",
+          "value" : "0x0000"
+        },
+        {
+          "enumerant" : "Unroll",
+          "value" : "0x0001"
+        },
+        {
+          "enumerant" : "DontUnroll",
+          "value" : "0x0002"
+        },
+        {
+          "enumerant" : "DependencyInfinite",
+          "value" : "0x0004"
+        },
+        {
+          "enumerant" : "DependencyLength",
+          "value" : "0x0008",
+          "parameters" : [
+            { "kind" : "LiteralInteger" }
+          ]
+
+        }
+      ]
+    },
+    {
+      "category" : "BitEnum",
+      "kind" : "FunctionControl",
+      "enumerants" : [
+        {
+          "enumerant" : "None",
+          "value" : "0x0000"
+        },
+        {
+          "enumerant" : "Inline",
+          "value" : "0x0001"
+        },
+        {
+          "enumerant" : "DontInline",
+          "value" : "0x0002"
+        },
+        {
+          "enumerant" : "Pure",
+          "value" : "0x0004"
+        },
+        {
+          "enumerant" : "Const",
+          "value" : "0x0008"
+        }
+      ]
+    },
+    {
+      "category" : "BitEnum",
+      "kind" : "MemorySemantics",
+      "enumerants" : [
+        {
+          "enumerant" : "Relaxed",
+          "value" : "0x0000"
+        },
+        {
+          "enumerant" : "None",
+          "value" : "0x0000"
+        },
+        {
+          "enumerant" : "Acquire",
+          "value" : "0x0002"
+        },
+        {
+          "enumerant" : "Release",
+          "value" : "0x0004"
+        },
+        {
+          "enumerant" : "AcquireRelease",
+          "value" : "0x0008"
+        },
+        {
+          "enumerant" : "SequentiallyConsistent",
+          "value" : "0x0010"
+        },
+        {
+          "enumerant" : "UniformMemory",
+          "value" : "0x0040",
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "SubgroupMemory",
+          "value" : "0x0080"
+        },
+        {
+          "enumerant" : "WorkgroupMemory",
+          "value" : "0x0100"
+        },
+        {
+          "enumerant" : "CrossWorkgroupMemory",
+          "value" : "0x0200"
+        },
+        {
+          "enumerant" : "AtomicCounterMemory",
+          "value" : "0x0400",
+          "capabilities" : [ "AtomicStorage" ]
+        },
+        {
+          "enumerant" : "ImageMemory",
+          "value" : "0x0800"
+        }
+      ]
+    },
+    {
+      "category" : "BitEnum",
+      "kind" : "MemoryAccess",
+      "enumerants" : [
+        {
+          "enumerant" : "None",
+          "value" : "0x0000"
+        },
+        {
+          "enumerant" : "Volatile",
+          "value" : "0x0001"
+        },
+        {
+          "enumerant" : "Aligned",
+          "value" : "0x0002",
+          "parameters" : [
+            { "kind" : "LiteralInteger" }
+          ]
+        },
+        {
+          "enumerant" : "Nontemporal",
+          "value" : "0x0004"
+        }
+      ]
+    },
+    {
+      "category" : "BitEnum",
+      "kind" : "KernelProfilingInfo",
+      "enumerants" : [
+        {
+          "enumerant" : "None",
+          "value" : "0x0000"
+        },
+        {
+          "enumerant" : "CmdExecTime",
+          "value" : "0x0001",
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "SourceLanguage",
+      "enumerants" : [
+        {
+          "enumerant" : "Unknown",
+          "value" : 0
+        },
+        {
+          "enumerant" : "ESSL",
+          "value" : 1
+        },
+        {
+          "enumerant" : "GLSL",
+          "value" : 2
+        },
+        {
+          "enumerant" : "OpenCL_C",
+          "value" : 3
+        },
+        {
+          "enumerant" : "OpenCL_CPP",
+          "value" : 4
+        },
+        {
+          "enumerant" : "HLSL",
+          "value" : 5
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "ExecutionModel",
+      "enumerants" : [
+        {
+          "enumerant" : "Vertex",
+          "value" : 0,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "TessellationControl",
+          "value" : 1,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "TessellationEvaluation",
+          "value" : 2,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "Geometry",
+          "value" : 3,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "Fragment",
+          "value" : 4,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "GLCompute",
+          "value" : 5,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Kernel",
+          "value" : 6,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "AddressingModel",
+      "enumerants" : [
+        {
+          "enumerant" : "Logical",
+          "value" : 0
+        },
+        {
+          "enumerant" : "Physical32",
+          "value" : 1,
+          "capabilities" : [ "Addresses" ]
+        },
+        {
+          "enumerant" : "Physical64",
+          "value" : 2,
+          "capabilities" : [ "Addresses" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "MemoryModel",
+      "enumerants" : [
+        {
+          "enumerant" : "Simple",
+          "value" : 0,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "GLSL450",
+          "value" : 1,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "OpenCL",
+          "value" : 2,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "ExecutionMode",
+      "enumerants" : [
+        {
+          "enumerant" : "Invocations",
+          "value" : 0,
+          "capabilities" : [ "Geometry" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Number of <<Invocation,invocations>>'" }
+          ]
+        },
+        {
+          "enumerant" : "SpacingEqual",
+          "value" : 1,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "SpacingFractionalEven",
+          "value" : 2,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "SpacingFractionalOdd",
+          "value" : 3,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "VertexOrderCw",
+          "value" : 4,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "VertexOrderCcw",
+          "value" : 5,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "PixelCenterInteger",
+          "value" : 6,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "OriginUpperLeft",
+          "value" : 7,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "OriginLowerLeft",
+          "value" : 8,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "EarlyFragmentTests",
+          "value" : 9,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "PointMode",
+          "value" : 10,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "Xfb",
+          "value" : 11,
+          "capabilities" : [ "TransformFeedback" ]
+        },
+        {
+          "enumerant" : "DepthReplacing",
+          "value" : 12,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "DepthGreater",
+          "value" : 14,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "DepthLess",
+          "value" : 15,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "DepthUnchanged",
+          "value" : 16,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "LocalSize",
+          "value" : 17,
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'x size'" },
+            { "kind" : "LiteralInteger", "name" : "'y size'" },
+            { "kind" : "LiteralInteger", "name" : "'z size'" }
+          ]
+        },
+        {
+          "enumerant" : "LocalSizeHint",
+          "value" : 18,
+          "capabilities" : [ "Kernel" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'x size'" },
+            { "kind" : "LiteralInteger", "name" : "'y size'" },
+            { "kind" : "LiteralInteger", "name" : "'z size'" }
+          ]
+        },
+        {
+          "enumerant" : "InputPoints",
+          "value" : 19,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "InputLines",
+          "value" : 20,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "InputLinesAdjacency",
+          "value" : 21,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "Triangles",
+          "value" : 22,
+          "capabilities" : [ "Geometry", "Tessellation" ]
+        },
+        {
+          "enumerant" : "InputTrianglesAdjacency",
+          "value" : 23,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "Quads",
+          "value" : 24,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "Isolines",
+          "value" : 25,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "OutputVertices",
+          "value" : 26,
+          "capabilities" : [ "Geometry", "Tessellation" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Vertex count'" }
+          ]
+        },
+        {
+          "enumerant" : "OutputPoints",
+          "value" : 27,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "OutputLineStrip",
+          "value" : 28,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "OutputTriangleStrip",
+          "value" : 29,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "VecTypeHint",
+          "value" : 30,
+          "capabilities" : [ "Kernel" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Vector type'" }
+          ]
+        },
+        {
+          "enumerant" : "ContractionOff",
+          "value" : 31,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Initializer",
+          "value" : 33,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Finalizer",
+          "value" : 34,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "SubgroupSize",
+          "value" : 35,
+          "capabilities" : [ "SubgroupDispatch" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Subgroup Size'" }
+          ]
+        },
+        {
+          "enumerant" : "SubgroupsPerWorkgroup",
+          "value" : 36,
+          "capabilities" : [ "SubgroupDispatch" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Subgroups Per Workgroup'" }
+          ]
+        },
+        {
+          "enumerant" : "SubgroupsPerWorkgroupId",
+          "value" : 37,
+          "capabilities" : [ "SubgroupDispatch" ],
+          "parameters" : [
+            { "kind" : "IdRef", "name" : "'Subgroups Per Workgroup'" }
+          ]
+        },
+        {
+          "enumerant" : "LocalSizeId",
+          "value" : 38,
+          "parameters" : [
+            { "kind" : "IdRef", "name" : "'x size'" },
+            { "kind" : "IdRef", "name" : "'y size'" },
+            { "kind" : "IdRef", "name" : "'z size'" }
+          ]
+        },
+        {
+          "enumerant" : "LocalSizeHintId",
+          "value" : 39,
+          "capabilities" : [ "Kernel" ],
+          "parameters" : [
+            { "kind" : "IdRef", "name" : "'Local Size Hint'" }
+          ]
+        },
+        {
+          "enumerant" : "PostDepthCoverage",
+          "value" : 4446,
+          "capabilities" : [ "SampleMaskPostDepthCoverage" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "StorageClass",
+      "enumerants" : [
+        {
+          "enumerant" : "UniformConstant",
+          "value" : 0
+        },
+        {
+          "enumerant" : "Input",
+          "value" : 1
+        },
+        {
+          "enumerant" : "Uniform",
+          "value" : 2,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Output",
+          "value" : 3,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Workgroup",
+          "value" : 4
+        },
+        {
+          "enumerant" : "CrossWorkgroup",
+          "value" : 5
+        },
+        {
+          "enumerant" : "Private",
+          "value" : 6,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Function",
+          "value" : 7
+        },
+        {
+          "enumerant" : "Generic",
+          "value" : 8,
+          "capabilities" : [ "GenericPointer" ]
+        },
+        {
+          "enumerant" : "PushConstant",
+          "value" : 9,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "AtomicCounter",
+          "value" : 10,
+          "capabilities" : [ "AtomicStorage" ]
+        },
+        {
+          "enumerant" : "Image",
+          "value" : 11
+        },
+        {
+          "enumerant" : "StorageBuffer",
+          "value" : 12,
+          "extensions" : [
+            "SPV_KHR_storage_buffer_storage_class",
+            "SPV_KHR_variable_pointers"
+          ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "Dim",
+      "enumerants" : [
+        {
+          "enumerant" : "1D",
+          "value" : 0,
+          "capabilities" : [ "Sampled1D" ]
+        },
+        {
+          "enumerant" : "2D",
+          "value" : 1
+        },
+        {
+          "enumerant" : "3D",
+          "value" : 2
+        },
+        {
+          "enumerant" : "Cube",
+          "value" : 3,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rect",
+          "value" : 4,
+          "capabilities" : [ "SampledRect" ]
+        },
+        {
+          "enumerant" : "Buffer",
+          "value" : 5,
+          "capabilities" : [ "SampledBuffer" ]
+        },
+        {
+          "enumerant" : "SubpassData",
+          "value" : 6,
+          "capabilities" : [ "InputAttachment" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "SamplerAddressingMode",
+      "enumerants" : [
+        {
+          "enumerant" : "None",
+          "value" : 0,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "ClampToEdge",
+          "value" : 1,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Clamp",
+          "value" : 2,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Repeat",
+          "value" : 3,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "RepeatMirrored",
+          "value" : 4,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "SamplerFilterMode",
+      "enumerants" : [
+        {
+          "enumerant" : "Nearest",
+          "value" : 0,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Linear",
+          "value" : 1,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "ImageFormat",
+      "enumerants" : [
+        {
+          "enumerant" : "Unknown",
+          "value" : 0
+        },
+        {
+          "enumerant" : "Rgba32f",
+          "value" : 1,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rgba16f",
+          "value" : 2,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "R32f",
+          "value" : 3,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rgba8",
+          "value" : 4,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rgba8Snorm",
+          "value" : 5,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rg32f",
+          "value" : 6,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg16f",
+          "value" : 7,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R11fG11fB10f",
+          "value" : 8,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R16f",
+          "value" : 9,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rgba16",
+          "value" : 10,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rgb10A2",
+          "value" : 11,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg16",
+          "value" : 12,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg8",
+          "value" : 13,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R16",
+          "value" : 14,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R8",
+          "value" : 15,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rgba16Snorm",
+          "value" : 16,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg16Snorm",
+          "value" : 17,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg8Snorm",
+          "value" : 18,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R16Snorm",
+          "value" : 19,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R8Snorm",
+          "value" : 20,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rgba32i",
+          "value" : 21,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rgba16i",
+          "value" : 22,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rgba8i",
+          "value" : 23,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "R32i",
+          "value" : 24,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rg32i",
+          "value" : 25,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg16i",
+          "value" : 26,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg8i",
+          "value" : 27,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R16i",
+          "value" : 28,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R8i",
+          "value" : 29,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rgba32ui",
+          "value" : 30,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rgba16ui",
+          "value" : 31,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rgba8ui",
+          "value" : 32,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "R32ui",
+          "value" : 33,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Rgb10a2ui",
+          "value" : 34,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg32ui",
+          "value" : 35,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg16ui",
+          "value" : 36,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "Rg8ui",
+          "value" : 37,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R16ui",
+          "value" : 38,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        },
+        {
+          "enumerant" : "R8ui",
+          "value" : 39,
+          "capabilities" : [ "StorageImageExtendedFormats" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "ImageChannelOrder",
+      "enumerants" : [
+        {
+          "enumerant" : "R",
+          "value" : 0,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "A",
+          "value" : 1,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "RG",
+          "value" : 2,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "RA",
+          "value" : 3,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "RGB",
+          "value" : 4,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "RGBA",
+          "value" : 5,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "BGRA",
+          "value" : 6,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "ARGB",
+          "value" : 7,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Intensity",
+          "value" : 8,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Luminance",
+          "value" : 9,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Rx",
+          "value" : 10,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "RGx",
+          "value" : 11,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "RGBx",
+          "value" : 12,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Depth",
+          "value" : 13,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "DepthStencil",
+          "value" : 14,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "sRGB",
+          "value" : 15,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "sRGBx",
+          "value" : 16,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "sRGBA",
+          "value" : 17,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "sBGRA",
+          "value" : 18,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "ABGR",
+          "value" : 19,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "ImageChannelDataType",
+      "enumerants" : [
+        {
+          "enumerant" : "SnormInt8",
+          "value" : 0,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "SnormInt16",
+          "value" : 1,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnormInt8",
+          "value" : 2,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnormInt16",
+          "value" : 3,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnormShort565",
+          "value" : 4,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnormShort555",
+          "value" : 5,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnormInt101010",
+          "value" : 6,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "SignedInt8",
+          "value" : 7,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "SignedInt16",
+          "value" : 8,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "SignedInt32",
+          "value" : 9,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnsignedInt8",
+          "value" : 10,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnsignedInt16",
+          "value" : 11,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnsignedInt32",
+          "value" : 12,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "HalfFloat",
+          "value" : 13,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Float",
+          "value" : 14,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnormInt24",
+          "value" : 15,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "UnormInt101010_2",
+          "value" : 16,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "FPRoundingMode",
+      "enumerants" : [
+        {
+          "enumerant" : "RTE",
+          "value" : 0,
+          "capabilities" : [
+            "Kernel",
+            "StorageUniformBufferBlock16",
+            "StorageUniform16",
+            "StoragePushConstant16",
+            "StorageInputOutput16"
+          ]
+        },
+        {
+          "enumerant" : "RTZ",
+          "value" : 1,
+          "capabilities" : [
+            "Kernel",
+            "StorageUniformBufferBlock16",
+            "StorageUniform16",
+            "StoragePushConstant16",
+            "StorageInputOutput16"
+          ]
+        },
+        {
+          "enumerant" : "RTP",
+          "value" : 2,
+          "capabilities" : [
+            "Kernel",
+            "StorageUniformBufferBlock16",
+            "StorageUniform16",
+            "StoragePushConstant16",
+            "StorageInputOutput16"
+          ]
+        },
+        {
+          "enumerant" : "RTN",
+          "value" : 3,
+          "capabilities" : [
+            "Kernel",
+            "StorageUniformBufferBlock16",
+            "StorageUniform16",
+            "StoragePushConstant16",
+            "StorageInputOutput16"
+          ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "LinkageType",
+      "enumerants" : [
+        {
+          "enumerant" : "Export",
+          "value" : 0,
+          "capabilities" : [ "Linkage" ]
+        },
+        {
+          "enumerant" : "Import",
+          "value" : 1,
+          "capabilities" : [ "Linkage" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "AccessQualifier",
+      "enumerants" : [
+        {
+          "enumerant" : "ReadOnly",
+          "value" : 0,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "WriteOnly",
+          "value" : 1,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "ReadWrite",
+          "value" : 2,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "FunctionParameterAttribute",
+      "enumerants" : [
+        {
+          "enumerant" : "Zext",
+          "value" : 0,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Sext",
+          "value" : 1,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "ByVal",
+          "value" : 2,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Sret",
+          "value" : 3,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "NoAlias",
+          "value" : 4,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "NoCapture",
+          "value" : 5,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "NoWrite",
+          "value" : 6,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "NoReadWrite",
+          "value" : 7,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "Decoration",
+      "enumerants" : [
+        {
+          "enumerant" : "RelaxedPrecision",
+          "value" : 0,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "SpecId",
+          "value" : 1,
+          "capabilities" : [ "Shader", "Kernel" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Specialization Constant ID'" }
+          ]
+        },
+        {
+          "enumerant" : "Block",
+          "value" : 2,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "BufferBlock",
+          "value" : 3,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "RowMajor",
+          "value" : 4,
+          "capabilities" : [ "Matrix" ]
+        },
+        {
+          "enumerant" : "ColMajor",
+          "value" : 5,
+          "capabilities" : [ "Matrix" ]
+        },
+        {
+          "enumerant" : "ArrayStride",
+          "value" : 6,
+          "capabilities" : [ "Shader" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Array Stride'" }
+          ]
+        },
+        {
+          "enumerant" : "MatrixStride",
+          "value" : 7,
+          "capabilities" : [ "Matrix" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Matrix Stride'" }
+          ]
+        },
+        {
+          "enumerant" : "GLSLShared",
+          "value" : 8,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "GLSLPacked",
+          "value" : 9,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "CPacked",
+          "value" : 10,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "BuiltIn",
+          "value" : 11,
+          "parameters" : [
+            { "kind" : "BuiltIn" }
+          ]
+        },
+        {
+          "enumerant" : "NoPerspective",
+          "value" : 13,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Flat",
+          "value" : 14,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Patch",
+          "value" : 15,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "Centroid",
+          "value" : 16,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Sample",
+          "value" : 17,
+          "capabilities" : [ "SampleRateShading" ]
+        },
+        {
+          "enumerant" : "Invariant",
+          "value" : 18,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Restrict",
+          "value" : 19
+        },
+        {
+          "enumerant" : "Aliased",
+          "value" : 20
+        },
+        {
+          "enumerant" : "Volatile",
+          "value" : 21
+        },
+        {
+          "enumerant" : "Constant",
+          "value" : 22,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Coherent",
+          "value" : 23
+        },
+        {
+          "enumerant" : "NonWritable",
+          "value" : 24
+        },
+        {
+          "enumerant" : "NonReadable",
+          "value" : 25
+        },
+        {
+          "enumerant" : "Uniform",
+          "value" : 26,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "SaturatedConversion",
+          "value" : 28,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Stream",
+          "value" : 29,
+          "capabilities" : [ "GeometryStreams" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Stream Number'" }
+          ]
+        },
+        {
+          "enumerant" : "Location",
+          "value" : 30,
+          "capabilities" : [ "Shader" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Location'" }
+          ]
+        },
+        {
+          "enumerant" : "Component",
+          "value" : 31,
+          "capabilities" : [ "Shader" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Component'" }
+          ]
+        },
+        {
+          "enumerant" : "Index",
+          "value" : 32,
+          "capabilities" : [ "Shader" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Index'" }
+          ]
+        },
+        {
+          "enumerant" : "Binding",
+          "value" : 33,
+          "capabilities" : [ "Shader" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Binding Point'" }
+          ]
+        },
+        {
+          "enumerant" : "DescriptorSet",
+          "value" : 34,
+          "capabilities" : [ "Shader" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Descriptor Set'" }
+          ]
+        },
+        {
+          "enumerant" : "Offset",
+          "value" : 35,
+          "capabilities" : [ "Shader" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Byte Offset'" }
+          ]
+        },
+        {
+          "enumerant" : "XfbBuffer",
+          "value" : 36,
+          "capabilities" : [ "TransformFeedback" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'XFB Buffer Number'" }
+          ]
+        },
+        {
+          "enumerant" : "XfbStride",
+          "value" : 37,
+          "capabilities" : [ "TransformFeedback" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'XFB Stride'" }
+          ]
+        },
+        {
+          "enumerant" : "FuncParamAttr",
+          "value" : 38,
+          "capabilities" : [ "Kernel" ],
+          "parameters" : [
+            { "kind" : "FunctionParameterAttribute", "name" : "'Function Parameter Attribute'" }
+          ]
+        },
+        {
+          "enumerant" : "FPRoundingMode",
+          "value" : 39,
+          "capabilities" : [
+            "Kernel",
+            "StorageUniformBufferBlock16",
+            "StorageUniform16",
+            "StoragePushConstant16",
+            "StorageInputOutput16"
+          ],
+          "parameters" : [
+            { "kind" : "FPRoundingMode", "name" : "'Floating-Point Rounding Mode'" }
+          ]
+        },
+        {
+          "enumerant" : "FPFastMathMode",
+          "value" : 40,
+          "capabilities" : [ "Kernel" ],
+          "parameters" : [
+            { "kind" : "FPFastMathMode", "name" : "'Fast-Math Mode'" }
+          ]
+        },
+        {
+          "enumerant" : "LinkageAttributes",
+          "value" : 41,
+          "capabilities" : [ "Linkage" ],
+          "parameters" : [
+            { "kind" : "LiteralString", "name" : "'Name'" },
+            { "kind" : "LinkageType",   "name" : "'Linkage Type'" }
+          ]
+        },
+        {
+          "enumerant" : "NoContraction",
+          "value" : 42,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "InputAttachmentIndex",
+          "value" : 43,
+          "capabilities" : [ "InputAttachment" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Attachment Index'" }
+          ]
+        },
+        {
+          "enumerant" : "Alignment",
+          "value" : 44,
+          "capabilities" : [ "Kernel" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Alignment'" }
+          ]
+        },
+        {
+          "enumerant" : "MaxByteOffset",
+          "value" : 45,
+          "capabilities" : [ "Addresses" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Max Byte Offset'" }
+          ]
+        },
+        {
+          "enumerant" : "AlignmentId",
+          "value" : 46,
+          "capabilities" : [ "Kernel" ],
+          "parameters" : [
+            { "kind" : "IdRef", "name" : "'Alignment'" }
+          ]
+        },
+        {
+          "enumerant" : "MaxByteOffsetId",
+          "value" : 47,
+          "capabilities" : [ "Addresses" ],
+          "parameters" : [
+            { "kind" : "IdRef", "name" : "'Max Byte Offset'" }
+          ]
+        },
+        {
+          "enumerant" : "ExplicitInterpAMD",
+          "value" : 4999
+        },
+        {
+          "enumerant" : "OverrideCoverageNV",
+          "value" : 5248,
+          "capabilities" : [ "SampleMaskOverrideCoverageNV" ]
+        },
+        {
+          "enumerant" : "PassthroughNV",
+          "value" : 5250,
+          "capabilities" : [ "GeometryShaderPassthroughNV" ]
+        },
+        {
+          "enumerant" : "ViewportRelativeNV",
+          "value" : 5252,
+          "capabilities" : [ "ShaderViewportMaskNV" ]
+        },
+        {
+          "enumerant" : "SecondaryViewportRelativeNV",
+          "value" : 5256,
+          "capabilities" : [ "ShaderStereoViewNV" ],
+          "parameters" : [
+            { "kind" : "LiteralInteger", "name" : "'Offset'" }
+          ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "BuiltIn",
+      "enumerants" : [
+        {
+          "enumerant" : "Position",
+          "value" : 0,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "PointSize",
+          "value" : 1,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "ClipDistance",
+          "value" : 3,
+          "capabilities" : [ "ClipDistance" ]
+        },
+        {
+          "enumerant" : "CullDistance",
+          "value" : 4,
+          "capabilities" : [ "CullDistance" ]
+        },
+        {
+          "enumerant" : "VertexId",
+          "value" : 5,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "InstanceId",
+          "value" : 6,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "PrimitiveId",
+          "value" : 7,
+          "capabilities" : [ "Geometry", "Tessellation" ]
+        },
+        {
+          "enumerant" : "InvocationId",
+          "value" : 8,
+          "capabilities" : [ "Geometry", "Tessellation" ]
+        },
+        {
+          "enumerant" : "Layer",
+          "value" : 9,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "ViewportIndex",
+          "value" : 10,
+          "capabilities" : [ "MultiViewport" ]
+        },
+        {
+          "enumerant" : "TessLevelOuter",
+          "value" : 11,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "TessLevelInner",
+          "value" : 12,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "TessCoord",
+          "value" : 13,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "PatchVertices",
+          "value" : 14,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "FragCoord",
+          "value" : 15,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "PointCoord",
+          "value" : 16,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "FrontFacing",
+          "value" : 17,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "SampleId",
+          "value" : 18,
+          "capabilities" : [ "SampleRateShading" ]
+        },
+        {
+          "enumerant" : "SamplePosition",
+          "value" : 19,
+          "capabilities" : [ "SampleRateShading" ]
+        },
+        {
+          "enumerant" : "SampleMask",
+          "value" : 20,
+          "capabilities" : [ "SampleRateShading" ]
+        },
+        {
+          "enumerant" : "FragDepth",
+          "value" : 22,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "HelperInvocation",
+          "value" : 23,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "NumWorkgroups",
+          "value" : 24
+        },
+        {
+          "enumerant" : "WorkgroupSize",
+          "value" : 25
+        },
+        {
+          "enumerant" : "WorkgroupId",
+          "value" : 26
+        },
+        {
+          "enumerant" : "LocalInvocationId",
+          "value" : 27
+        },
+        {
+          "enumerant" : "GlobalInvocationId",
+          "value" : 28
+        },
+        {
+          "enumerant" : "LocalInvocationIndex",
+          "value" : 29
+        },
+        {
+          "enumerant" : "WorkDim",
+          "value" : 30,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "GlobalSize",
+          "value" : 31,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "EnqueuedWorkgroupSize",
+          "value" : 32,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "GlobalOffset",
+          "value" : 33,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "GlobalLinearId",
+          "value" : 34,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "SubgroupSize",
+          "value" : 36,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "SubgroupMaxSize",
+          "value" : 37,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "NumSubgroups",
+          "value" : 38,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "NumEnqueuedSubgroups",
+          "value" : 39,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "SubgroupId",
+          "value" : 40,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "SubgroupLocalInvocationId",
+          "value" : 41,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "VertexIndex",
+          "value" : 42,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "InstanceIndex",
+          "value" : 43,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "SubgroupEqMaskKHR",
+          "value" : 4416,
+          "capabilities" : [ "SubgroupBallotKHR" ]
+        },
+        {
+          "enumerant" : "SubgroupGeMaskKHR",
+          "value" : 4417,
+          "capabilities" : [ "SubgroupBallotKHR" ]
+        },
+        {
+          "enumerant" : "SubgroupGtMaskKHR",
+          "value" : 4418,
+          "capabilities" : [ "SubgroupBallotKHR" ]
+        },
+        {
+          "enumerant" : "SubgroupLeMaskKHR",
+          "value" : 4419,
+          "capabilities" : [ "SubgroupBallotKHR" ]
+        },
+        {
+          "enumerant" : "SubgroupLtMaskKHR",
+          "value" : 4420,
+          "capabilities" : [ "SubgroupBallotKHR" ]
+        },
+        {
+          "enumerant" : "BaseVertex",
+          "value" : 4424,
+          "capabilities" : [ "DrawParameters" ]
+        },
+        {
+          "enumerant" : "BaseInstance",
+          "value" : 4425,
+          "capabilities" : [ "DrawParameters" ]
+        },
+        {
+          "enumerant" : "DrawIndex",
+          "value" : 4426,
+          "capabilities" : [ "DrawParameters" ]
+        },
+        {
+          "enumerant" : "DeviceIndex",
+          "value" : 4438,
+          "capabilities" : [ "DeviceGroup" ]
+        },
+        {
+          "enumerant" : "ViewIndex",
+          "value" : 4440,
+          "capabilities" : [ "MultiView" ]
+        },
+        {
+          "enumerant" : "BaryCoordNoPerspAMD",
+          "value" : 4992
+        },
+        {
+          "enumerant" : "BaryCoordNoPerspCentroidAMD",
+          "value" : 4993
+        },
+        {
+          "enumerant" : "BaryCoordNoPerspSampleAMD",
+          "value" : 4994
+        },
+        {
+          "enumerant" : "BaryCoordSmoothAMD",
+          "value" : 4995
+        },
+        {
+          "enumerant" : "BaryCoordSmoothCentroidAMD",
+          "value" : 4996
+        },
+        {
+          "enumerant" : "BaryCoordSmoothSampleAMD",
+          "value" : 4997
+        },
+        {
+          "enumerant" : "BaryCoordPullModelAMD",
+          "value" : 4998
+        },
+        {
+          "enumerant" : "ViewportMaskNV",
+          "value" : 5253,
+          "capabilities" : [ "ShaderViewportMaskNV" ]
+        },
+        {
+          "enumerant" : "SecondaryPositionNV",
+          "value" : 5257,
+          "capabilities" : [ "ShaderStereoViewNV" ]
+        },
+        {
+          "enumerant" : "SecondaryViewportMaskNV",
+          "value" : 5258,
+          "capabilities" : [ "ShaderStereoViewNV" ]
+        },
+        {
+          "enumerant" : "PositionPerViewNV",
+          "value" : 5261,
+          "capabilities" : [ "PerViewAttributesNV" ]
+        },
+        {
+          "enumerant" : "ViewportMaskPerViewNV",
+          "value" : 5262,
+          "capabilities" : [ "PerViewAttributesNV" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "Scope",
+      "enumerants" : [
+        {
+          "enumerant" : "CrossDevice",
+          "value" : 0
+        },
+        {
+          "enumerant" : "Device",
+          "value" : 1
+        },
+        {
+          "enumerant" : "Workgroup",
+          "value" : 2
+        },
+        {
+          "enumerant" : "Subgroup",
+          "value" : 3
+        },
+        {
+          "enumerant" : "Invocation",
+          "value" : 4
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "GroupOperation",
+      "enumerants" : [
+        {
+          "enumerant" : "Reduce",
+          "value" : 0,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "InclusiveScan",
+          "value" : 1,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "ExclusiveScan",
+          "value" : 2,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "KernelEnqueueFlags",
+      "enumerants" : [
+        {
+          "enumerant" : "NoWait",
+          "value" : 0,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "WaitKernel",
+          "value" : 1,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "WaitWorkGroup",
+          "value" : 2,
+          "capabilities" : [ "Kernel" ]
+        }
+      ]
+    },
+    {
+      "category" : "ValueEnum",
+      "kind" : "Capability",
+      "enumerants" : [
+        {
+          "enumerant" : "Matrix",
+          "value" : 0
+        },
+        {
+          "enumerant" : "Shader",
+          "value" : 1,
+          "capabilities" : [ "Matrix" ]
+        },
+        {
+          "enumerant" : "Geometry",
+          "value" : 2,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Tessellation",
+          "value" : 3,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Addresses",
+          "value" : 4
+        },
+        {
+          "enumerant" : "Linkage",
+          "value" : 5
+        },
+        {
+          "enumerant" : "Kernel",
+          "value" : 6
+        },
+        {
+          "enumerant" : "Vector16",
+          "value" : 7,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Float16Buffer",
+          "value" : 8,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Float16",
+          "value" : 9
+        },
+        {
+          "enumerant" : "Float64",
+          "value" : 10
+        },
+        {
+          "enumerant" : "Int64",
+          "value" : 11
+        },
+        {
+          "enumerant" : "Int64Atomics",
+          "value" : 12,
+          "capabilities" : [ "Int64" ]
+        },
+        {
+          "enumerant" : "ImageBasic",
+          "value" : 13,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "ImageReadWrite",
+          "value" : 14,
+          "capabilities" : [ "ImageBasic" ]
+        },
+        {
+          "enumerant" : "ImageMipmap",
+          "value" : 15,
+          "capabilities" : [ "ImageBasic" ]
+        },
+        {
+          "enumerant" : "Pipes",
+          "value" : 17,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "Groups",
+          "value" : 18
+        },
+        {
+          "enumerant" : "DeviceEnqueue",
+          "value" : 19,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "LiteralSampler",
+          "value" : 20,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "AtomicStorage",
+          "value" : 21,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Int16",
+          "value" : 22
+        },
+        {
+          "enumerant" : "TessellationPointSize",
+          "value" : 23,
+          "capabilities" : [ "Tessellation" ]
+        },
+        {
+          "enumerant" : "GeometryPointSize",
+          "value" : 24,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "ImageGatherExtended",
+          "value" : 25,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "StorageImageMultisample",
+          "value" : 27,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "UniformBufferArrayDynamicIndexing",
+          "value" : 28,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "SampledImageArrayDynamicIndexing",
+          "value" : 29,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "StorageBufferArrayDynamicIndexing",
+          "value" : 30,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "StorageImageArrayDynamicIndexing",
+          "value" : 31,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "ClipDistance",
+          "value" : 32,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "CullDistance",
+          "value" : 33,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "ImageCubeArray",
+          "value" : 34,
+          "capabilities" : [ "SampledCubeArray" ]
+        },
+        {
+          "enumerant" : "SampleRateShading",
+          "value" : 35,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "ImageRect",
+          "value" : 36,
+          "capabilities" : [ "SampledRect" ]
+        },
+        {
+          "enumerant" : "SampledRect",
+          "value" : 37,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "GenericPointer",
+          "value" : 38,
+          "capabilities" : [ "Addresses" ]
+        },
+        {
+          "enumerant" : "Int8",
+          "value" : 39,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "InputAttachment",
+          "value" : 40,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "SparseResidency",
+          "value" : 41,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "MinLod",
+          "value" : 42,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "Sampled1D",
+          "value" : 43
+        },
+        {
+          "enumerant" : "Image1D",
+          "value" : 44,
+          "capabilities" : [ "Sampled1D" ]
+        },
+        {
+          "enumerant" : "SampledCubeArray",
+          "value" : 45,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "SampledBuffer",
+          "value" : 46
+        },
+        {
+          "enumerant" : "ImageBuffer",
+          "value" : 47,
+          "capabilities" : [ "SampledBuffer" ]
+        },
+        {
+          "enumerant" : "ImageMSArray",
+          "value" : 48,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "StorageImageExtendedFormats",
+          "value" : 49,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "ImageQuery",
+          "value" : 50,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "DerivativeControl",
+          "value" : 51,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "InterpolationFunction",
+          "value" : 52,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "TransformFeedback",
+          "value" : 53,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "GeometryStreams",
+          "value" : 54,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "StorageImageReadWithoutFormat",
+          "value" : 55,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "StorageImageWriteWithoutFormat",
+          "value" : 56,
+          "capabilities" : [ "Shader" ]
+        },
+        {
+          "enumerant" : "MultiViewport",
+          "value" : 57,
+          "capabilities" : [ "Geometry" ]
+        },
+        {
+          "enumerant" : "SubgroupDispatch",
+          "value" : 58,
+          "capabilities" : [ "DeviceEnqueue" ]
+        },
+        {
+          "enumerant" : "NamedBarrier",
+          "value" : 59,
+          "capabilities" : [ "Kernel" ]
+        },
+        {
+          "enumerant" : "PipeStorage",
+          "value" : 60,
+          "capabilities" : [ "Pipes" ]
+        },
+        {
+          "enumerant" : "SubgroupBallotKHR",
+          "value" : 4423,
+          "extensions" : [ "SPV_KHR_shader_ballot" ]
+        },
+        {
+          "enumerant" : "DrawParameters",
+          "value" : 4427,
+          "extensions" : [ "SPV_KHR_shader_draw_parameters" ]
+        },
+        {
+          "enumerant" : "SubgroupVoteKHR",
+          "value" : 4431,
+          "extensions" : [ "SPV_KHR_subgroup_vote" ]
+        },
+        {
+          "enumerant" : "StorageBuffer16BitAccess",
+          "value" : 4433,
+          "extensions" : [ "SPV_KHR_16bit_storage" ]
+        },
+        {
+          "enumerant" : "StorageUniformBufferBlock16",
+          "value" : 4433,
+          "extensions" : [ "SPV_KHR_16bit_storage" ]
+        },
+        {
+          "enumerant" : "UniformAndStorageBuffer16BitAccess",
+          "value" : 4434,
+          "capabilities" : [
+            "StorageBuffer16BitAccess",
+            "StorageUniformBufferBlock16"
+          ],
+          "extensions" : [ "SPV_KHR_16bit_storage" ]
+        },
+        {
+          "enumerant" : "StorageUniform16",
+          "value" : 4434,
+          "capabilities" : [
+            "StorageBuffer16BitAccess",
+            "StorageUniformBufferBlock16"
+          ],
+          "extensions" : [ "SPV_KHR_16bit_storage" ]
+        },
+        {
+          "enumerant" : "StoragePushConstant16",
+          "value" : 4435,
+          "extensions" : [ "SPV_KHR_16bit_storage" ]
+        },
+        {
+          "enumerant" : "StorageInputOutput16",
+          "value" : 4436,
+          "extensions" : [ "SPV_KHR_16bit_storage" ]
+        },
+        {
+          "enumerant" : "DeviceGroup",
+          "value" : 4437,
+          "extensions" : [ "SPV_KHR_device_group" ]
+        },
+        {
+          "enumerant" : "MultiView",
+          "value" : 4439,
+          "capabilities" : [ "Shader" ],
+          "extensions" : [ "SPV_KHR_multiview" ]
+        },
+        {
+          "enumerant" : "VariablePointersStorageBuffer",
+          "value" : 4441,
+          "capabilities" : [ "Shader" ],
+          "extensions" : [ "SPV_KHR_variable_pointers" ]
+        },
+        {
+          "enumerant" : "VariablePointers",
+          "value" : 4442,
+          "capabilities" : [ "VariablePointersStorageBuffer" ],
+          "extensions" : [ "SPV_KHR_variable_pointers" ]
+        },
+        {          "enumerant" : "AtomicStorageOps",
+          "value" : 4445,
+          "extensions" : [ "SPV_KHR_shader_atomic_counter_ops" ]
+        },
+        {
+          "enumerant" : "SampleMaskPostDepthCoverage",
+          "value" : 4447,
+          "capabilities" : [ "SampleRateShading" ],
+          "extensions" : [ "SPV_KHR_post_depth_coverage" ]
+        },
+        {
+          "enumerant" : "ImageGatherBiasLodAMD",
+          "value" : 5009,
+          "capabilities" : [ "Shader" ],
+          "extensions" : [ "SPV_AMD_texture_gather_bias_lod" ]
+        },
+        {
+          "enumerant" : "SampleMaskOverrideCoverageNV",
+          "value" : 5249,
+          "capabilities" : [ "SampleRateShading" ],
+          "extensions" : [ "SPV_NV_sample_mask_override_coverage" ]
+        },
+        {
+          "enumerant" : "GeometryShaderPassthroughNV",
+          "value" : 5251,
+          "capabilities" : [ "Geometry" ],
+          "extensions" : [ "SPV_NV_geometry_shader_passthrough" ]
+        },
+        {
+          "enumerant" : "ShaderViewportIndexLayerNV",
+          "value" : 5254,
+          "capabilities" : [ "MultiViewport" ],
+          "extensions" : [ "SPV_NV_viewport_array2" ]
+        },
+        {
+          "enumerant" : "ShaderViewportMaskNV",
+          "value" : 5255,
+          "capabilities" : [ "ShaderViewportIndexLayerNV" ],
+          "extensions" : [ "SPV_NV_viewport_array2" ]
+        },
+        {
+          "enumerant" : "ShaderStereoViewNV",
+          "value" : 5259,
+          "capabilities" : [ "ShaderViewportMaskNV" ],
+          "extensions" : [ "SPV_NV_stereo_view_rendering" ]
+        },
+        {
+          "enumerant" : "PerViewAttributesNV",
+          "value" : 5260,
+          "capabilities" : [ "MultiView" ],
+          "extensions" : [ "SPV_NVX_multiview_per_view_attributes" ]
+        }
+      ]
+    },
+    {
+      "category" : "Id",
+      "kind" : "IdResultType",
+      "doc" : "Reference to an <id> representing the result's type of the enclosing instruction"
+    },
+    {
+      "category" : "Id",
+      "kind" : "IdResult",
+      "doc" : "Definition of an <id> representing the result of the enclosing instruction"
+    },
+    {
+      "category" : "Id",
+      "kind" : "IdMemorySemantics",
+      "doc" : "Reference to an <id> representing a 32-bit integer that is a mask from the MemorySemantics operand kind"
+    },
+    {
+      "category" : "Id",
+      "kind" : "IdScope",
+      "doc" : "Reference to an <id> representing a 32-bit integer that is a value from the Scope operand kind"
+    },
+    {
+      "category" : "Id",
+      "kind" : "IdRef",
+      "doc" : "Reference to an <id>"
+    },
+    {
+      "category" : "Literal",
+      "kind" : "LiteralInteger",
+      "doc" : "An integer consuming one or more words"
+    },
+    {
+      "category" : "Literal",
+      "kind" : "LiteralString",
+      "doc" : "A null-terminated stream of characters consuming an integral number of words"
+    },
+    {
+      "category" : "Literal",
+      "kind" : "LiteralContextDependentNumber",
+      "doc" : "A literal number whose size and format are determined by a previous operand in the enclosing instruction"
+    },
+    {
+      "category" : "Literal",
+      "kind" : "LiteralExtInstInteger",
+      "doc" : "A 32-bit unsigned integer indicating which instruction to use and determining the layout of following operands (for OpExtInst)"
+    },
+    {
+      "category" : "Literal",
+      "kind" : "LiteralSpecConstantOpInteger",
+      "doc" : "An opcode indicating the operation to be performed and determining the layout of following operands (for OpSpecConstantOp)"
+    },
+    {
+      "category" : "Composite",
+      "kind" : "PairLiteralIntegerIdRef",
+      "bases" : [ "LiteralInteger", "IdRef" ]
+    },
+    {
+      "category" : "Composite",
+      "kind" : "PairIdRefLiteralInteger",
+      "bases" : [ "IdRef", "LiteralInteger" ]
+    },
+    {
+      "category" : "Composite",
+      "kind" : "PairIdRefIdRef",
+      "bases" : [ "IdRef", "IdRef" ]
+    }
+  ]
+}
diff --git a/src/compiler/spirv/spirv.h b/src/compiler/spirv/spirv.h
index 769c102..61559a1 100644
--- a/src/compiler/spirv/spirv.h
+++ b/src/compiler/spirv/spirv.h
@@ -1,5 +1,5 @@
 /*
-** Copyright (c) 2014-2016 The Khronos Group Inc.
+** Copyright (c) 2014-2017 The Khronos Group Inc.
 ** 
 ** Permission is hereby granted, free of charge, to any person obtaining a copy
 ** of this software and/or associated documentation files (the "Materials"),
@@ -50,12 +50,12 @@
 
 typedef unsigned int SpvId;
 
-#define SPV_VERSION 0x10100
-#define SPV_REVISION 4
+#define SPV_VERSION 0x10200
+#define SPV_REVISION 1
 
 static const unsigned int SpvMagicNumber = 0x07230203;
-static const unsigned int SpvVersion = 0x00010100;
-static const unsigned int SpvRevision = 4;
+static const unsigned int SpvVersion = 0x00010200;
+static const unsigned int SpvRevision = 1;
 static const unsigned int SpvOpCodeMask = 0xffff;
 static const unsigned int SpvWordCountShift = 16;
 
@@ -65,6 +65,7 @@
     SpvSourceLanguageGLSL = 2,
     SpvSourceLanguageOpenCL_C = 3,
     SpvSourceLanguageOpenCL_CPP = 4,
+    SpvSourceLanguageHLSL = 5,
     SpvSourceLanguageMax = 0x7fffffff,
 } SpvSourceLanguage;
 
@@ -129,6 +130,10 @@
     SpvExecutionModeFinalizer = 34,
     SpvExecutionModeSubgroupSize = 35,
     SpvExecutionModeSubgroupsPerWorkgroup = 36,
+    SpvExecutionModeSubgroupsPerWorkgroupId = 37,
+    SpvExecutionModeLocalSizeId = 38,
+    SpvExecutionModeLocalSizeHintId = 39,
+    SpvExecutionModePostDepthCoverage = 4446,
     SpvExecutionModeMax = 0x7fffffff,
 } SpvExecutionMode;
 
@@ -145,6 +150,7 @@
     SpvStorageClassPushConstant = 9,
     SpvStorageClassAtomicCounter = 10,
     SpvStorageClassImage = 11,
+    SpvStorageClassStorageBuffer = 12,
     SpvStorageClassMax = 0x7fffffff,
 } SpvStorageClass;
 
@@ -383,6 +389,13 @@
     SpvDecorationInputAttachmentIndex = 43,
     SpvDecorationAlignment = 44,
     SpvDecorationMaxByteOffset = 45,
+    SpvDecorationAlignmentId = 46,
+    SpvDecorationMaxByteOffsetId = 47,
+    SpvDecorationExplicitInterpAMD = 4999,
+    SpvDecorationOverrideCoverageNV = 5248,
+    SpvDecorationPassthroughNV = 5250,
+    SpvDecorationViewportRelativeNV = 5252,
+    SpvDecorationSecondaryViewportRelativeNV = 5256,
     SpvDecorationMax = 0x7fffffff,
 } SpvDecoration;
 
@@ -436,6 +449,20 @@
     SpvBuiltInBaseVertex = 4424,
     SpvBuiltInBaseInstance = 4425,
     SpvBuiltInDrawIndex = 4426,
+    SpvBuiltInDeviceIndex = 4438,
+    SpvBuiltInViewIndex = 4440,
+    SpvBuiltInBaryCoordNoPerspAMD = 4992,
+    SpvBuiltInBaryCoordNoPerspCentroidAMD = 4993,
+    SpvBuiltInBaryCoordNoPerspSampleAMD = 4994,
+    SpvBuiltInBaryCoordSmoothAMD = 4995,
+    SpvBuiltInBaryCoordSmoothCentroidAMD = 4996,
+    SpvBuiltInBaryCoordSmoothSampleAMD = 4997,
+    SpvBuiltInBaryCoordPullModelAMD = 4998,
+    SpvBuiltInViewportMaskNV = 5253,
+    SpvBuiltInSecondaryPositionNV = 5257,
+    SpvBuiltInSecondaryViewportMaskNV = 5258,
+    SpvBuiltInPositionPerViewNV = 5261,
+    SpvBuiltInViewportMaskPerViewNV = 5262,
     SpvBuiltInMax = 0x7fffffff,
 } SpvBuiltIn;
 
@@ -620,6 +647,26 @@
     SpvCapabilityPipeStorage = 60,
     SpvCapabilitySubgroupBallotKHR = 4423,
     SpvCapabilityDrawParameters = 4427,
+    SpvCapabilitySubgroupVoteKHR = 4431,
+    SpvCapabilityStorageBuffer16BitAccess = 4433,
+    SpvCapabilityStorageUniformBufferBlock16 = 4433,
+    SpvCapabilityStorageUniform16 = 4434,
+    SpvCapabilityUniformAndStorageBuffer16BitAccess = 4434,
+    SpvCapabilityStoragePushConstant16 = 4435,
+    SpvCapabilityStorageInputOutput16 = 4436,
+    SpvCapabilityDeviceGroup = 4437,
+    SpvCapabilityMultiView = 4439,
+    SpvCapabilityVariablePointersStorageBuffer = 4441,
+    SpvCapabilityVariablePointers = 4442,
+    SpvCapabilityAtomicStorageOps = 4445,
+    SpvCapabilitySampleMaskPostDepthCoverage = 4447,
+    SpvCapabilityImageGatherBiasLodAMD = 5009,
+    SpvCapabilitySampleMaskOverrideCoverageNV = 5249,
+    SpvCapabilityGeometryShaderPassthroughNV = 5251,
+    SpvCapabilityShaderViewportIndexLayerNV = 5254,
+    SpvCapabilityShaderViewportMaskNV = 5255,
+    SpvCapabilityShaderStereoViewNV = 5259,
+    SpvCapabilityPerViewAttributesNV = 5260,
     SpvCapabilityMax = 0x7fffffff,
 } SpvCapability;
 
@@ -928,8 +975,22 @@
     SpvOpNamedBarrierInitialize = 328,
     SpvOpMemoryNamedBarrier = 329,
     SpvOpModuleProcessed = 330,
+    SpvOpExecutionModeId = 331,
+    SpvOpDecorateId = 332,
     SpvOpSubgroupBallotKHR = 4421,
     SpvOpSubgroupFirstInvocationKHR = 4422,
+    SpvOpSubgroupAllKHR = 4428,
+    SpvOpSubgroupAnyKHR = 4429,
+    SpvOpSubgroupAllEqualKHR = 4430,
+    SpvOpSubgroupReadInvocationKHR = 4432,
+    SpvOpGroupIAddNonUniformAMD = 5000,
+    SpvOpGroupFAddNonUniformAMD = 5001,
+    SpvOpGroupFMinNonUniformAMD = 5002,
+    SpvOpGroupUMinNonUniformAMD = 5003,
+    SpvOpGroupSMinNonUniformAMD = 5004,
+    SpvOpGroupFMaxNonUniformAMD = 5005,
+    SpvOpGroupUMaxNonUniformAMD = 5006,
+    SpvOpGroupSMaxNonUniformAMD = 5007,
     SpvOpMax = 0x7fffffff,
 } SpvOp;
 
diff --git a/src/compiler/spirv/spirv_info.c b/src/compiler/spirv/spirv_info.c
deleted file mode 100644
index 1036b41..0000000
--- a/src/compiler/spirv/spirv_info.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright © 2016 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "spirv_info.h"
-#include "util/macros.h"
-
-#define CAPABILITY(cap) [SpvCapability##cap] = #cap
-static const char * const capability_to_string[] = {
-   CAPABILITY(Matrix),
-   CAPABILITY(Shader),
-   CAPABILITY(Geometry),
-   CAPABILITY(Tessellation),
-   CAPABILITY(Addresses),
-   CAPABILITY(Linkage),
-   CAPABILITY(Kernel),
-   CAPABILITY(Vector16),
-   CAPABILITY(Float16Buffer),
-   CAPABILITY(Float16),
-   CAPABILITY(Float64),
-   CAPABILITY(Int64),
-   CAPABILITY(Int64Atomics),
-   CAPABILITY(ImageBasic),
-   CAPABILITY(ImageReadWrite),
-   CAPABILITY(ImageMipmap),
-   CAPABILITY(Pipes),
-   CAPABILITY(Groups),
-   CAPABILITY(DeviceEnqueue),
-   CAPABILITY(LiteralSampler),
-   CAPABILITY(AtomicStorage),
-   CAPABILITY(Int16),
-   CAPABILITY(TessellationPointSize),
-   CAPABILITY(GeometryPointSize),
-   CAPABILITY(ImageGatherExtended),
-   CAPABILITY(StorageImageMultisample),
-   CAPABILITY(UniformBufferArrayDynamicIndexing),
-   CAPABILITY(SampledImageArrayDynamicIndexing),
-   CAPABILITY(StorageBufferArrayDynamicIndexing),
-   CAPABILITY(StorageImageArrayDynamicIndexing),
-   CAPABILITY(ClipDistance),
-   CAPABILITY(CullDistance),
-   CAPABILITY(ImageCubeArray),
-   CAPABILITY(SampleRateShading),
-   CAPABILITY(ImageRect),
-   CAPABILITY(SampledRect),
-   CAPABILITY(GenericPointer),
-   CAPABILITY(Int8),
-   CAPABILITY(InputAttachment),
-   CAPABILITY(SparseResidency),
-   CAPABILITY(MinLod),
-   CAPABILITY(Sampled1D),
-   CAPABILITY(Image1D),
-   CAPABILITY(SampledCubeArray),
-   CAPABILITY(SampledBuffer),
-   CAPABILITY(ImageBuffer),
-   CAPABILITY(ImageMSArray),
-   CAPABILITY(StorageImageExtendedFormats),
-   CAPABILITY(ImageQuery),
-   CAPABILITY(DerivativeControl),
-   CAPABILITY(InterpolationFunction),
-   CAPABILITY(TransformFeedback),
-   CAPABILITY(GeometryStreams),
-   CAPABILITY(StorageImageReadWithoutFormat),
-   CAPABILITY(StorageImageWriteWithoutFormat),
-   CAPABILITY(MultiViewport),
-   CAPABILITY(SubgroupDispatch),
-   CAPABILITY(NamedBarrier),
-   CAPABILITY(PipeStorage),
-   CAPABILITY(SubgroupBallotKHR),
-   CAPABILITY(DrawParameters),
-};
-
-const char *
-spirv_capability_to_string(SpvCapability cap)
-{
-   if (cap < ARRAY_SIZE(capability_to_string))
-      return capability_to_string[cap];
-   else
-      return "unknown";
-}
-
-#define DECORATION(dec) [SpvDecoration##dec] = #dec
-static const char * const decoration_to_string[] = {
-   DECORATION(RelaxedPrecision),
-   DECORATION(SpecId),
-   DECORATION(Block),
-   DECORATION(BufferBlock),
-   DECORATION(RowMajor),
-   DECORATION(ColMajor),
-   DECORATION(ArrayStride),
-   DECORATION(MatrixStride),
-   DECORATION(GLSLShared),
-   DECORATION(GLSLPacked),
-   DECORATION(CPacked),
-   DECORATION(BuiltIn),
-   DECORATION(NoPerspective),
-   DECORATION(Flat),
-   DECORATION(Patch),
-   DECORATION(Centroid),
-   DECORATION(Sample),
-   DECORATION(Invariant),
-   DECORATION(Restrict),
-   DECORATION(Aliased),
-   DECORATION(Volatile),
-   DECORATION(Constant),
-   DECORATION(Coherent),
-   DECORATION(NonWritable),
-   DECORATION(NonReadable),
-   DECORATION(Uniform),
-   DECORATION(SaturatedConversion),
-   DECORATION(Stream),
-   DECORATION(Location),
-   DECORATION(Component),
-   DECORATION(Index),
-   DECORATION(Binding),
-   DECORATION(DescriptorSet),
-   DECORATION(Offset),
-   DECORATION(XfbBuffer),
-   DECORATION(XfbStride),
-   DECORATION(FuncParamAttr),
-   DECORATION(FPRoundingMode),
-   DECORATION(FPFastMathMode),
-   DECORATION(LinkageAttributes),
-   DECORATION(NoContraction),
-   DECORATION(InputAttachmentIndex),
-   DECORATION(Alignment),
-   DECORATION(MaxByteOffset),
-};
-
-const char *
-spirv_decoration_to_string(SpvDecoration dec)
-{
-   if (dec < ARRAY_SIZE(decoration_to_string))
-      return decoration_to_string[dec];
-   else
-      return "unknown";
-}
diff --git a/src/compiler/spirv/spirv_info_c.py b/src/compiler/spirv/spirv_info_c.py
new file mode 100644
index 0000000..c5e11df
--- /dev/null
+++ b/src/compiler/spirv/spirv_info_c.py
@@ -0,0 +1,82 @@
+COPYRIGHT = """\
+/*
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+"""
+
+import argparse
+import json
+from sys import stdout
+from mako.template import Template
+
+def collect_data(spirv, kind):
+    """Return (kind, names) with one enumerant name per distinct value.
+
+    Aliases sharing a value would become duplicate C `case` labels, so only
+    the first name per value is kept.  Raises KeyError for an unknown kind.
+    """
+    matches = [x for x in spirv["operand_kinds"] if x["kind"] == kind]
+    if not matches:
+        raise KeyError("no operand kind %r in the SPIR-V grammar" % kind)
+    seen, values = set(), []
+    for x in matches[0]["enumerants"]:
+        if x["value"] not in seen:
+            seen.add(x["value"])
+            values.append(x["enumerant"])
+
+    return (kind, values)
+
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("json")  # path of the grammar JSON that the script reads
+    p.add_argument("out")   # path of the C file that the script writes
+    return p.parse_args()
+
+TEMPLATE  = Template(COPYRIGHT + """\
+#include "spirv_info.h"
+% for kind,values in info:
+
+const char *
+spirv_${kind.lower()}_to_string(Spv${kind} v)
+{
+   switch (v) {
+    % for name in values:
+   case Spv${kind}${name}: return "Spv${kind}${name}";
+    % endfor
+   case Spv${kind}Max: break; /* silence warnings about unhandled enums. */
+   }
+
+   return "unknown";
+}
+% endfor
+""")  # One spirv_<kind>_to_string() function is rendered per (kind, values) pair.
+
+if __name__ == "__main__":
+    pargs = parse_args()
+
+    # Use a context manager so the grammar file is closed after parsing.
+    with open(pargs.json, "r") as f:
+        spirv_info = json.load(f)
+
+    kinds = [collect_data(spirv_info, k) for k in ("Capability", "Decoration")]
+    with open(pargs.out, 'w') as f:
+        f.write(TEMPLATE.render(info=kinds))
diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c
index 2a13636..8265a58 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -185,9 +185,12 @@
    case vtn_value_type_ssa:
       return val->ssa;
 
-   case vtn_value_type_access_chain:
-      /* This is needed for function parameters */
-      return vtn_variable_load(b, val->access_chain);
+   case vtn_value_type_pointer:
+      assert(val->pointer->ptr_type && val->pointer->ptr_type->type);
+      struct vtn_ssa_value *ssa =
+         vtn_create_ssa_value(b, val->pointer->ptr_type->type);
+      ssa->def = vtn_pointer_to_ssa(b, val->pointer);
+      return ssa;
 
    default:
       unreachable("Invalid type for an SSA value");
@@ -413,40 +416,34 @@
 vtn_type_copy(struct vtn_builder *b, struct vtn_type *src)
 {
    struct vtn_type *dest = ralloc(b, struct vtn_type);
-   dest->type = src->type;
-   dest->is_builtin = src->is_builtin;
-   if (src->is_builtin)
-      dest->builtin = src->builtin;
+   *dest = *src;
 
-   if (!glsl_type_is_scalar(src->type)) {
-      switch (glsl_get_base_type(src->type)) {
-      case GLSL_TYPE_INT:
-      case GLSL_TYPE_UINT:
-      case GLSL_TYPE_INT64:
-      case GLSL_TYPE_UINT64:
-      case GLSL_TYPE_BOOL:
-      case GLSL_TYPE_FLOAT:
-      case GLSL_TYPE_DOUBLE:
-      case GLSL_TYPE_ARRAY:
-         dest->row_major = src->row_major;
-         dest->stride = src->stride;
-         dest->array_element = src->array_element;
-         break;
+   switch (src->base_type) {
+   case vtn_base_type_void:
+   case vtn_base_type_scalar:
+   case vtn_base_type_vector:
+   case vtn_base_type_matrix:
+   case vtn_base_type_array:
+   case vtn_base_type_pointer:
+   case vtn_base_type_image:
+   case vtn_base_type_sampler:
+      /* Nothing more to do */
+      break;
 
-      case GLSL_TYPE_STRUCT: {
-         unsigned elems = glsl_get_length(src->type);
+   case vtn_base_type_struct:
+      dest->members = ralloc_array(b, struct vtn_type *, src->length);
+      memcpy(dest->members, src->members,
+             src->length * sizeof(src->members[0]));
 
-         dest->members = ralloc_array(b, struct vtn_type *, elems);
-         memcpy(dest->members, src->members, elems * sizeof(struct vtn_type *));
+      dest->offsets = ralloc_array(b, unsigned, src->length);
+      memcpy(dest->offsets, src->offsets,
+             src->length * sizeof(src->offsets[0]));
+      break;
 
-         dest->offsets = ralloc_array(b, unsigned, elems);
-         memcpy(dest->offsets, src->offsets, elems * sizeof(unsigned));
-         break;
-      }
-
-      default:
-         unreachable("unhandled type");
-      }
+   case vtn_base_type_function:
+      dest->params = ralloc_array(b, struct vtn_type *, src->length);
+      memcpy(dest->params, src->params, src->length * sizeof(src->params[0]));
+      break;
    }
 
    return dest;
@@ -520,7 +517,7 @@
       ctx->type->offsets[member] = dec->literals[0];
       break;
    case SpvDecorationMatrixStride:
-      mutable_matrix_member(b, ctx->type, member)->stride = dec->literals[0];
+      /* Handled as a second pass */
       break;
    case SpvDecorationColMajor:
       break; /* Nothing to do here.  Column-major is the default. */
@@ -571,6 +568,32 @@
    }
 }
 
+/* Decoration callback that applies MatrixStride to one struct member.  It is
+ * a separate pass because we must first know whether the matrix is row-major.
+ */
+static void
+struct_member_matrix_stride_cb(struct vtn_builder *b,
+                               struct vtn_value *val, int member,
+                               const struct vtn_decoration *dec,
+                               void *void_ctx)
+{
+   if (dec->decoration != SpvDecorationMatrixStride)
+      return; /* every other decoration is ignored by this pass */
+   assert(member >= 0);
+
+   struct member_decoration_ctx *ctx = void_ctx;
+
+   struct vtn_type *mat_type = mutable_matrix_member(b, ctx->type, member);
+   if (mat_type->row_major) { /* literal goes on a copy of the element type */
+      mat_type->array_element = vtn_type_copy(b, mat_type->array_element);
+      mat_type->stride = mat_type->array_element->stride;
+      mat_type->array_element->stride = dec->literals[0];
+   } else { /* not row-major: the literal is the matrix stride itself */
+      assert(mat_type->array_element->stride > 0);
+      mat_type->stride = dec->literals[0];
+   }
+}
+
 static void
 type_decoration_cb(struct vtn_builder *b,
                    struct vtn_value *val, int member,
@@ -583,12 +606,17 @@
 
    switch (dec->decoration) {
    case SpvDecorationArrayStride:
+      assert(type->base_type == vtn_base_type_matrix ||
+             type->base_type == vtn_base_type_array ||
+             type->base_type == vtn_base_type_pointer);
       type->stride = dec->literals[0];
       break;
    case SpvDecorationBlock:
+      assert(type->base_type == vtn_base_type_struct);
       type->block = true;
       break;
    case SpvDecorationBufferBlock:
+      assert(type->base_type == vtn_base_type_struct);
       type->buffer_block = true;
       break;
    case SpvDecorationGLSLShared:
@@ -693,7 +721,7 @@
    case SpvImageFormatRg32ui:       return 0x823C; /* GL_RG32UI */
    case SpvImageFormatRg16ui:       return 0x823A; /* GL_RG16UI */
    case SpvImageFormatRg8ui:        return 0x8238; /* GL_RG8UI */
-   case SpvImageFormatR16ui:        return 0x823A; /* GL_RG16UI */
+   case SpvImageFormatR16ui:        return 0x8234; /* GL_R16UI */
    case SpvImageFormatR8ui:         return 0x8232; /* GL_R8UI */
    default:
       assert(!"Invalid image format");
@@ -708,19 +736,21 @@
    struct vtn_value *val = vtn_push_value(b, w[1], vtn_value_type_type);
 
    val->type = rzalloc(b, struct vtn_type);
-   val->type->is_builtin = false;
    val->type->val = val;
 
    switch (opcode) {
    case SpvOpTypeVoid:
+      val->type->base_type = vtn_base_type_void;
       val->type->type = glsl_void_type();
       break;
    case SpvOpTypeBool:
+      val->type->base_type = vtn_base_type_scalar;
       val->type->type = glsl_bool_type();
       break;
    case SpvOpTypeInt: {
       int bit_size = w[2];
       const bool signedness = w[3];
+      val->type->base_type = vtn_base_type_scalar;
       if (bit_size == 64)
          val->type->type = (signedness ? glsl_int64_t_type() : glsl_uint64_t_type());
       else
@@ -729,6 +759,7 @@
    }
    case SpvOpTypeFloat: {
       int bit_size = w[2];
+      val->type->base_type = vtn_base_type_scalar;
       val->type->type = bit_size == 64 ? glsl_double_type() : glsl_float_type();
       break;
    }
@@ -738,13 +769,9 @@
       unsigned elems = w[3];
 
       assert(glsl_type_is_scalar(base->type));
+      val->type->base_type = vtn_base_type_vector;
       val->type->type = glsl_vector_type(glsl_get_base_type(base->type), elems);
-
-      /* Vectors implicitly have sizeof(base_type) stride.  For now, this
-       * is always 4 bytes.  This will have to change if we want to start
-       * supporting doubles or half-floats.
-       */
-      val->type->stride = 4;
+      val->type->stride = glsl_get_bit_size(base->type) / 8;
       val->type->array_element = base;
       break;
    }
@@ -754,10 +781,12 @@
       unsigned columns = w[3];
 
       assert(glsl_type_is_vector(base->type));
+      val->type->base_type = vtn_base_type_matrix;
       val->type->type = glsl_matrix_type(glsl_get_base_type(base->type),
                                          glsl_get_vector_elements(base->type),
                                          columns);
       assert(!glsl_type_is_error(val->type->type));
+      val->type->length = columns;
       val->type->array_element = base;
       val->type->row_major = false;
       val->type->stride = 0;
@@ -769,16 +798,16 @@
       struct vtn_type *array_element =
          vtn_value(b, w[2], vtn_value_type_type)->type;
 
-      unsigned length;
       if (opcode == SpvOpTypeRuntimeArray) {
          /* A length of 0 is used to denote unsized arrays */
-         length = 0;
+         val->type->length = 0;
       } else {
-         length =
+         val->type->length =
             vtn_value(b, w[3], vtn_value_type_constant)->constant->values[0].u32[0];
       }
 
-      val->type->type = glsl_array_type(array_element->type, length);
+      val->type->base_type = vtn_base_type_array;
+      val->type->type = glsl_array_type(array_element->type, val->type->length);
       val->type->array_element = array_element;
       val->type->stride = 0;
       break;
@@ -786,6 +815,8 @@
 
    case SpvOpTypeStruct: {
       unsigned num_fields = count - 2;
+      val->type->base_type = vtn_base_type_struct;
+      val->type->length = num_fields;
       val->type->members = ralloc_array(b, struct vtn_type *, num_fields);
       val->type->offsets = ralloc_array(b, unsigned, num_fields);
 
@@ -807,6 +838,7 @@
       };
 
       vtn_foreach_decoration(b, val, struct_member_decoration_cb, &ctx);
+      vtn_foreach_decoration(b, val, struct_member_matrix_stride_cb, &ctx);
 
       const char *name = val->name ? val->name : "struct";
 
@@ -815,29 +847,43 @@
    }
 
    case SpvOpTypeFunction: {
-      const struct glsl_type *return_type =
-         vtn_value(b, w[2], vtn_value_type_type)->type->type;
-      NIR_VLA(struct glsl_function_param, params, count - 3);
-      for (unsigned i = 0; i < count - 3; i++) {
-         params[i].type = vtn_value(b, w[i + 3], vtn_value_type_type)->type->type;
+      val->type->base_type = vtn_base_type_function;
+      val->type->type = NULL;
 
-         /* FIXME: */
-         params[i].in = true;
-         params[i].out = true;
+      val->type->return_type = vtn_value(b, w[2], vtn_value_type_type)->type;
+
+      const unsigned num_params = count - 3;
+      val->type->length = num_params;
+      val->type->params = ralloc_array(b, struct vtn_type *, num_params);
+      for (unsigned i = 0; i < count - 3; i++) {
+         val->type->params[i] =
+            vtn_value(b, w[i + 3], vtn_value_type_type)->type;
       }
-      val->type->type = glsl_function_type(return_type, params, count - 3);
       break;
    }
 
-   case SpvOpTypePointer:
-      /* FIXME:  For now, we'll just do the really lame thing and return
-       * the same type.  The validator should ensure that the proper number
-       * of dereferences happen
-       */
-      val->type = vtn_value(b, w[3], vtn_value_type_type)->type;
+   case SpvOpTypePointer: {
+      SpvStorageClass storage_class = w[2];
+      struct vtn_type *deref_type =
+         vtn_value(b, w[3], vtn_value_type_type)->type;
+
+      val->type->base_type = vtn_base_type_pointer;
+      val->type->storage_class = storage_class;
+      val->type->deref = deref_type;
+
+      if (storage_class == SpvStorageClassUniform ||
+          storage_class == SpvStorageClassStorageBuffer) {
+         /* These can actually be stored to nir_variables and used as SSA
+          * values so they need a real glsl_type.
+          */
+         val->type->type = glsl_vector_type(GLSL_TYPE_UINT, 2);
+      }
       break;
+   }
 
    case SpvOpTypeImage: {
+      val->type->base_type = vtn_base_type_image;
+
       const struct glsl_type *sampled_type =
          vtn_value(b, w[2], vtn_value_type_type)->type->type;
 
@@ -879,10 +925,12 @@
       val->type->image_format = translate_image_format(format);
 
       if (sampled == 1) {
+         val->type->sampled = true;
          val->type->type = glsl_sampler_type(dim, is_shadow, is_array,
                                              glsl_get_base_type(sampled_type));
       } else if (sampled == 2) {
          assert(!is_shadow);
+         val->type->sampled = false;
          val->type->type = glsl_image_type(dim, is_array,
                                            glsl_get_base_type(sampled_type));
       } else {
@@ -901,6 +949,7 @@
        * matters is that it's a sampler type as opposed to an integer type
        * so the backend knows what to do.
        */
+      val->type->base_type = vtn_base_type_sampler;
       val->type->type = glsl_bare_sampler_type();
       break;
 
@@ -922,6 +971,12 @@
 {
    nir_constant *c = rzalloc(b, nir_constant);
 
+   /* For pointers and other typeless things, we have to return something but
+    * it doesn't matter what.
+    */
+   if (!type)
+      return c;
+
    switch (glsl_get_base_type(type)) {
    case GLSL_TYPE_INT:
    case GLSL_TYPE_UINT:
@@ -1017,9 +1072,9 @@
 
    assert(val->const_type == glsl_vector_type(GLSL_TYPE_UINT, 3));
 
-   b->shader->info->cs.local_size[0] = val->constant->values[0].u32[0];
-   b->shader->info->cs.local_size[1] = val->constant->values[0].u32[1];
-   b->shader->info->cs.local_size[2] = val->constant->values[0].u32[2];
+   b->shader->info.cs.local_size[0] = val->constant->values[0].u32[0];
+   b->shader->info.cs.local_size[1] = val->constant->values[0].u32[1];
+   b->shader->info.cs.local_size[2] = val->constant->values[0].u32[2];
 }
 
 static void
@@ -1338,6 +1393,7 @@
 vtn_handle_function_call(struct vtn_builder *b, SpvOp opcode,
                          const uint32_t *w, unsigned count)
 {
+   struct vtn_type *res_type = vtn_value(b, w[1], vtn_value_type_type)->type;
    struct nir_function *callee =
       vtn_value(b, w[3], vtn_value_type_function)->func->impl->function;
 
@@ -1345,8 +1401,9 @@
    for (unsigned i = 0; i < call->num_params; i++) {
       unsigned arg_id = w[4 + i];
       struct vtn_value *arg = vtn_untyped_value(b, arg_id);
-      if (arg->value_type == vtn_value_type_access_chain) {
-         nir_deref_var *d = vtn_access_chain_to_deref(b, arg->access_chain);
+      if (arg->value_type == vtn_value_type_pointer &&
+          arg->pointer->ptr_type->type == NULL) {
+         nir_deref_var *d = vtn_pointer_to_deref(b, arg->pointer);
          call->params[i] = nir_deref_var_clone(d, call);
       } else {
          struct vtn_ssa_value *arg_ssa = vtn_ssa_value(b, arg_id);
@@ -1361,6 +1418,7 @@
    }
 
    nir_variable *out_tmp = NULL;
+   assert(res_type->type == callee->return_type);
    if (!glsl_type_is_void(callee->return_type)) {
       out_tmp = nir_local_variable_create(b->impl, callee->return_type,
                                           "out_tmp");
@@ -1372,8 +1430,7 @@
    if (glsl_type_is_void(callee->return_type)) {
       vtn_push_value(b, w[2], vtn_value_type_undef);
    } else {
-      struct vtn_value *retval = vtn_push_value(b, w[2], vtn_value_type_ssa);
-      retval->ssa = vtn_local_load(b, call->return_deref);
+      vtn_push_ssa(b, w[2], res_type, vtn_local_load(b, call->return_deref));
    }
 }
 
@@ -1434,19 +1491,18 @@
          vtn_push_value(b, w[2], vtn_value_type_sampled_image);
       val->sampled_image = ralloc(b, struct vtn_sampled_image);
       val->sampled_image->image =
-         vtn_value(b, w[3], vtn_value_type_access_chain)->access_chain;
+         vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
       val->sampled_image->sampler =
-         vtn_value(b, w[4], vtn_value_type_access_chain)->access_chain;
+         vtn_value(b, w[4], vtn_value_type_pointer)->pointer;
       return;
    } else if (opcode == SpvOpImage) {
-      struct vtn_value *val =
-         vtn_push_value(b, w[2], vtn_value_type_access_chain);
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_pointer);
       struct vtn_value *src_val = vtn_untyped_value(b, w[3]);
       if (src_val->value_type == vtn_value_type_sampled_image) {
-         val->access_chain = src_val->sampled_image->image;
+         val->pointer = src_val->sampled_image->image;
       } else {
-         assert(src_val->value_type == vtn_value_type_access_chain);
-         val->access_chain = src_val->access_chain;
+         assert(src_val->value_type == vtn_value_type_pointer);
+         val->pointer = src_val->pointer;
       }
       return;
    }
@@ -1459,9 +1515,9 @@
    if (sampled_val->value_type == vtn_value_type_sampled_image) {
       sampled = *sampled_val->sampled_image;
    } else {
-      assert(sampled_val->value_type == vtn_value_type_access_chain);
+      assert(sampled_val->value_type == vtn_value_type_pointer);
       sampled.image = NULL;
-      sampled.sampler = sampled_val->access_chain;
+      sampled.sampler = sampled_val->pointer;
    }
 
    const struct glsl_type *image_type;
@@ -1685,10 +1741,10 @@
       unreachable("Invalid base type for sampler result");
    }
 
-   nir_deref_var *sampler = vtn_access_chain_to_deref(b, sampled.sampler);
+   nir_deref_var *sampler = vtn_pointer_to_deref(b, sampled.sampler);
    nir_deref_var *texture;
    if (sampled.image) {
-      nir_deref_var *image = vtn_access_chain_to_deref(b, sampled.image);
+      nir_deref_var *image = vtn_pointer_to_deref(b, sampled.image);
       texture = image;
    } else {
       texture = sampler;
@@ -1850,8 +1906,7 @@
          vtn_push_value(b, w[2], vtn_value_type_image_pointer);
       val->image = ralloc(b, struct vtn_image_pointer);
 
-      val->image->image =
-         vtn_value(b, w[3], vtn_value_type_access_chain)->access_chain;
+      val->image->image = vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
       val->image->coord = get_image_coord(b, w[4]);
       val->image->sample = vtn_ssa_value(b, w[5])->def;
       return;
@@ -1883,15 +1938,13 @@
       break;
 
    case SpvOpImageQuerySize:
-      image.image =
-         vtn_value(b, w[3], vtn_value_type_access_chain)->access_chain;
+      image.image = vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
       image.coord = NULL;
       image.sample = NULL;
       break;
 
    case SpvOpImageRead:
-      image.image =
-         vtn_value(b, w[3], vtn_value_type_access_chain)->access_chain;
+      image.image = vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
       image.coord = get_image_coord(b, w[4]);
 
       if (count > 5 && (w[5] & SpvImageOperandsSampleMask)) {
@@ -1903,8 +1956,7 @@
       break;
 
    case SpvOpImageWrite:
-      image.image =
-         vtn_value(b, w[1], vtn_value_type_access_chain)->access_chain;
+      image.image = vtn_value(b, w[1], vtn_value_type_pointer)->pointer;
       image.coord = get_image_coord(b, w[2]);
 
       /* texel = w[3] */
@@ -1949,7 +2001,7 @@
 
    nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(b->shader, op);
 
-   nir_deref_var *image_deref = vtn_access_chain_to_deref(b, image.image);
+   nir_deref_var *image_deref = vtn_pointer_to_deref(b, image.image);
    intrin->variables[0] = nir_deref_var_clone(image_deref, intrin);
 
    /* ImageQuerySize doesn't take any extra parameters */
@@ -1982,6 +2034,7 @@
    case SpvOpAtomicIDecrement:
    case SpvOpAtomicExchange:
    case SpvOpAtomicIAdd:
+   case SpvOpAtomicISub:
    case SpvOpAtomicSMin:
    case SpvOpAtomicUMin:
    case SpvOpAtomicSMax:
@@ -2075,7 +2128,7 @@
 vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
                                  const uint32_t *w, unsigned count)
 {
-   struct vtn_access_chain *chain;
+   struct vtn_pointer *ptr;
    nir_intrinsic_instr *atomic;
 
    switch (opcode) {
@@ -2094,13 +2147,11 @@
    case SpvOpAtomicAnd:
    case SpvOpAtomicOr:
    case SpvOpAtomicXor:
-      chain =
-         vtn_value(b, w[3], vtn_value_type_access_chain)->access_chain;
+      ptr = vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
       break;
 
    case SpvOpAtomicStore:
-      chain =
-         vtn_value(b, w[1], vtn_value_type_access_chain)->access_chain;
+      ptr = vtn_value(b, w[1], vtn_value_type_pointer)->pointer;
       break;
 
    default:
@@ -2112,8 +2163,8 @@
    SpvMemorySemanticsMask semantics = w[5];
    */
 
-   if (chain->var->mode == vtn_variable_mode_workgroup) {
-      nir_deref_var *deref = vtn_access_chain_to_deref(b, chain);
+   if (ptr->mode == vtn_variable_mode_workgroup) {
+      nir_deref_var *deref = vtn_pointer_to_deref(b, ptr);
       const struct glsl_type *deref_type = nir_deref_tail(&deref->deref)->type;
       nir_intrinsic_op op = get_shared_nir_atomic_op(opcode);
       atomic = nir_intrinsic_instr_create(b->nb.shader, op);
@@ -2152,10 +2203,9 @@
 
       }
    } else {
-      assert(chain->var->mode == vtn_variable_mode_ssbo);
-      struct vtn_type *type;
+      assert(ptr->mode == vtn_variable_mode_ssbo);
       nir_ssa_def *offset, *index;
-      offset = vtn_access_chain_to_offset(b, chain, &index, &type, NULL, false);
+      offset = vtn_pointer_to_offset(b, ptr, &index, NULL);
 
       nir_intrinsic_op op = get_ssbo_nir_atomic_op(opcode);
 
@@ -2163,13 +2213,13 @@
 
       switch (opcode) {
       case SpvOpAtomicLoad:
-         atomic->num_components = glsl_get_vector_elements(type->type);
+         atomic->num_components = glsl_get_vector_elements(ptr->type->type);
          atomic->src[0] = nir_src_for_ssa(index);
          atomic->src[1] = nir_src_for_ssa(offset);
          break;
 
       case SpvOpAtomicStore:
-         atomic->num_components = glsl_get_vector_elements(type->type);
+         atomic->num_components = glsl_get_vector_elements(ptr->type->type);
          nir_intrinsic_set_write_mask(atomic, (1 << atomic->num_components) - 1);
          atomic->src[0] = nir_src_for_ssa(vtn_ssa_value(b, w[4])->def);
          atomic->src[1] = nir_src_for_ssa(index);
@@ -2731,6 +2781,15 @@
          spv_check_supported(image_write_without_format, cap);
          break;
 
+      case SpvCapabilityMultiView:
+         spv_check_supported(multiview, cap);
+         break;
+
+      case SpvCapabilityVariablePointersStorageBuffer:
+      case SpvCapabilityVariablePointers:
+         spv_check_supported(variable_pointers, cap);
+         break;
+
       default:
          unreachable("Unhandled capability");
       }
@@ -2743,7 +2802,8 @@
 
    case SpvOpMemoryModel:
       assert(w[1] == SpvAddressingModelLogical);
-      assert(w[2] == SpvMemoryModelGLSL450);
+      assert(w[2] == SpvMemoryModelSimple ||
+             w[2] == SpvMemoryModelGLSL450);
       break;
 
    case SpvOpEntryPoint: {
@@ -2805,36 +2865,36 @@
 
    case SpvExecutionModeEarlyFragmentTests:
       assert(b->shader->stage == MESA_SHADER_FRAGMENT);
-      b->shader->info->fs.early_fragment_tests = true;
+      b->shader->info.fs.early_fragment_tests = true;
       break;
 
    case SpvExecutionModeInvocations:
       assert(b->shader->stage == MESA_SHADER_GEOMETRY);
-      b->shader->info->gs.invocations = MAX2(1, mode->literals[0]);
+      b->shader->info.gs.invocations = MAX2(1, mode->literals[0]);
       break;
 
    case SpvExecutionModeDepthReplacing:
       assert(b->shader->stage == MESA_SHADER_FRAGMENT);
-      b->shader->info->fs.depth_layout = FRAG_DEPTH_LAYOUT_ANY;
+      b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_ANY;
       break;
    case SpvExecutionModeDepthGreater:
       assert(b->shader->stage == MESA_SHADER_FRAGMENT);
-      b->shader->info->fs.depth_layout = FRAG_DEPTH_LAYOUT_GREATER;
+      b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_GREATER;
       break;
    case SpvExecutionModeDepthLess:
       assert(b->shader->stage == MESA_SHADER_FRAGMENT);
-      b->shader->info->fs.depth_layout = FRAG_DEPTH_LAYOUT_LESS;
+      b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_LESS;
       break;
    case SpvExecutionModeDepthUnchanged:
       assert(b->shader->stage == MESA_SHADER_FRAGMENT);
-      b->shader->info->fs.depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
+      b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
       break;
 
    case SpvExecutionModeLocalSize:
       assert(b->shader->stage == MESA_SHADER_COMPUTE);
-      b->shader->info->cs.local_size[0] = mode->literals[0];
-      b->shader->info->cs.local_size[1] = mode->literals[1];
-      b->shader->info->cs.local_size[2] = mode->literals[2];
+      b->shader->info.cs.local_size[0] = mode->literals[0];
+      b->shader->info.cs.local_size[1] = mode->literals[1];
+      b->shader->info.cs.local_size[2] = mode->literals[2];
       break;
    case SpvExecutionModeLocalSizeHint:
       break; /* Nothing to do with this */
@@ -2842,10 +2902,10 @@
    case SpvExecutionModeOutputVertices:
       if (b->shader->stage == MESA_SHADER_TESS_CTRL ||
           b->shader->stage == MESA_SHADER_TESS_EVAL) {
-         b->shader->info->tess.tcs_vertices_out = mode->literals[0];
+         b->shader->info.tess.tcs_vertices_out = mode->literals[0];
       } else {
          assert(b->shader->stage == MESA_SHADER_GEOMETRY);
-         b->shader->info->gs.vertices_out = mode->literals[0];
+         b->shader->info.gs.vertices_out = mode->literals[0];
       }
       break;
 
@@ -2858,11 +2918,11 @@
    case SpvExecutionModeIsolines:
       if (b->shader->stage == MESA_SHADER_TESS_CTRL ||
           b->shader->stage == MESA_SHADER_TESS_EVAL) {
-         b->shader->info->tess.primitive_mode =
+         b->shader->info.tess.primitive_mode =
             gl_primitive_from_spv_execution_mode(mode->exec_mode);
       } else {
          assert(b->shader->stage == MESA_SHADER_GEOMETRY);
-         b->shader->info->gs.vertices_in =
+         b->shader->info.gs.vertices_in =
             vertices_in_from_spv_execution_mode(mode->exec_mode);
       }
       break;
@@ -2871,24 +2931,24 @@
    case SpvExecutionModeOutputLineStrip:
    case SpvExecutionModeOutputTriangleStrip:
       assert(b->shader->stage == MESA_SHADER_GEOMETRY);
-      b->shader->info->gs.output_primitive =
+      b->shader->info.gs.output_primitive =
          gl_primitive_from_spv_execution_mode(mode->exec_mode);
       break;
 
    case SpvExecutionModeSpacingEqual:
       assert(b->shader->stage == MESA_SHADER_TESS_CTRL ||
              b->shader->stage == MESA_SHADER_TESS_EVAL);
-      b->shader->info->tess.spacing = TESS_SPACING_EQUAL;
+      b->shader->info.tess.spacing = TESS_SPACING_EQUAL;
       break;
    case SpvExecutionModeSpacingFractionalEven:
       assert(b->shader->stage == MESA_SHADER_TESS_CTRL ||
              b->shader->stage == MESA_SHADER_TESS_EVAL);
-      b->shader->info->tess.spacing = TESS_SPACING_FRACTIONAL_EVEN;
+      b->shader->info.tess.spacing = TESS_SPACING_FRACTIONAL_EVEN;
       break;
    case SpvExecutionModeSpacingFractionalOdd:
       assert(b->shader->stage == MESA_SHADER_TESS_CTRL ||
              b->shader->stage == MESA_SHADER_TESS_EVAL);
-      b->shader->info->tess.spacing = TESS_SPACING_FRACTIONAL_ODD;
+      b->shader->info.tess.spacing = TESS_SPACING_FRACTIONAL_ODD;
       break;
    case SpvExecutionModeVertexOrderCw:
       assert(b->shader->stage == MESA_SHADER_TESS_CTRL ||
@@ -2897,18 +2957,18 @@
        * but be the opposite of OpenGL.  Currently NIR follows GL semantics,
        * so we set it backwards here.
        */
-      b->shader->info->tess.ccw = true;
+      b->shader->info.tess.ccw = true;
       break;
    case SpvExecutionModeVertexOrderCcw:
       assert(b->shader->stage == MESA_SHADER_TESS_CTRL ||
              b->shader->stage == MESA_SHADER_TESS_EVAL);
       /* Backwards; see above */
-      b->shader->info->tess.ccw = false;
+      b->shader->info.tess.ccw = false;
       break;
    case SpvExecutionModePointMode:
       assert(b->shader->stage == MESA_SHADER_TESS_CTRL ||
              b->shader->stage == MESA_SHADER_TESS_EVAL);
-      b->shader->info->tess.point_mode = true;
+      b->shader->info.tess.point_mode = true;
       break;
 
    case SpvExecutionModePixelCenterInteger:
@@ -3031,6 +3091,7 @@
    case SpvOpCopyMemory:
    case SpvOpCopyMemorySized:
    case SpvOpAccessChain:
+   case SpvOpPtrAccessChain:
    case SpvOpInBoundsAccessChain:
    case SpvOpArrayLength:
       vtn_handle_variables(b, opcode, w, count);
@@ -3067,11 +3128,12 @@
       break;
 
    case SpvOpImageQuerySize: {
-      struct vtn_access_chain *image =
-         vtn_value(b, w[3], vtn_value_type_access_chain)->access_chain;
-      if (glsl_type_is_image(image->var->var->interface_type)) {
+      struct vtn_pointer *image =
+         vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
+      if (image->mode == vtn_variable_mode_image) {
          vtn_handle_image(b, opcode, w, count);
       } else {
+         assert(image->mode == vtn_variable_mode_sampler);
          vtn_handle_texture(b, opcode, w, count);
       }
       break;
@@ -3096,7 +3158,7 @@
       if (pointer->value_type == vtn_value_type_image_pointer) {
          vtn_handle_image(b, opcode, w, count);
       } else {
-         assert(pointer->value_type == vtn_value_type_access_chain);
+         assert(pointer->value_type == vtn_value_type_pointer);
          vtn_handle_ssbo_or_shared_atomic(b, opcode, w, count);
       }
       break;
@@ -3107,12 +3169,25 @@
       if (pointer->value_type == vtn_value_type_image_pointer) {
          vtn_handle_image(b, opcode, w, count);
       } else {
-         assert(pointer->value_type == vtn_value_type_access_chain);
+         assert(pointer->value_type == vtn_value_type_pointer);
          vtn_handle_ssbo_or_shared_atomic(b, opcode, w, count);
       }
       break;
    }
 
+   case SpvOpSelect: {
+      /* Handle OpSelect up-front here because it needs to be able to handle
+       * pointers and not just regular vectors and scalars.
+       */
+      struct vtn_type *res_type = vtn_value(b, w[1], vtn_value_type_type)->type;
+      struct vtn_ssa_value *ssa = vtn_create_ssa_value(b, res_type->type);
+      ssa->def = nir_bcsel(&b->nb, vtn_ssa_value(b, w[3])->def,
+                                   vtn_ssa_value(b, w[4])->def,
+                                   vtn_ssa_value(b, w[5])->def);
+      vtn_push_ssa(b, w[2], res_type, ssa);
+      break;
+   }
+
    case SpvOpSNegate:
    case SpvOpFNegate:
    case SpvOpNot:
@@ -3170,7 +3245,6 @@
    case SpvOpBitwiseOr:
    case SpvOpBitwiseXor:
    case SpvOpBitwiseAnd:
-   case SpvOpSelect:
    case SpvOpIEqual:
    case SpvOpFOrdEqual:
    case SpvOpFUnordEqual:
@@ -3284,7 +3358,7 @@
    b->shader = nir_shader_create(NULL, stage, options, NULL);
 
    /* Set shader info defaults */
-   b->shader->info->gs.invocations = 1;
+   b->shader->info.gs.invocations = 1;
 
    /* Parse execution modes */
    vtn_foreach_execution_mode(b, b->entry_point,
diff --git a/src/compiler/spirv/vtn_cfg.c b/src/compiler/spirv/vtn_cfg.c
index 54248b1..3e57608 100644
--- a/src/compiler/spirv/vtn_cfg.c
+++ b/src/compiler/spirv/vtn_cfg.c
@@ -41,38 +41,32 @@
       struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_function);
       val->func = b->func;
 
-      const struct glsl_type *func_type =
-         vtn_value(b, w[4], vtn_value_type_type)->type->type;
+      const struct vtn_type *func_type =
+         vtn_value(b, w[4], vtn_value_type_type)->type;
 
-      assert(glsl_get_function_return_type(func_type) == result_type);
+      assert(func_type->return_type->type == result_type);
 
       nir_function *func =
          nir_function_create(b->shader, ralloc_strdup(b->shader, val->name));
 
-      func->num_params = glsl_get_length(func_type);
+      func->num_params = func_type->length;
       func->params = ralloc_array(b->shader, nir_parameter, func->num_params);
       for (unsigned i = 0; i < func->num_params; i++) {
-         const struct glsl_function_param *param =
-            glsl_get_function_param(func_type, i);
-         func->params[i].type = param->type;
-         if (param->in) {
-            if (param->out) {
-               func->params[i].param_type = nir_parameter_inout;
-            } else {
-               func->params[i].param_type = nir_parameter_in;
-            }
+         if (func_type->params[i]->base_type == vtn_base_type_pointer &&
+             func_type->params[i]->type == NULL) {
+            func->params[i].type = func_type->params[i]->deref->type;
          } else {
-            if (param->out) {
-               func->params[i].param_type = nir_parameter_out;
-            } else {
-               assert(!"Parameter is neither in nor out");
-            }
+            func->params[i].type = func_type->params[i]->type;
          }
+
+         /* TODO: We could do something smarter here. */
+         func->params[i].param_type = nir_parameter_inout;
       }
 
-      func->return_type = glsl_get_function_return_type(func_type);
+      func->return_type = func_type->return_type->type;
 
       b->func->impl = nir_function_impl_create(func);
+      b->nb.cursor = nir_before_cf_list(&b->func->impl->body);
 
       b->func_param_idx = 0;
       break;
@@ -84,40 +78,48 @@
       break;
 
    case SpvOpFunctionParameter: {
-      struct vtn_value *val =
-         vtn_push_value(b, w[2], vtn_value_type_access_chain);
-
       struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
 
       assert(b->func_param_idx < b->func->impl->num_params);
       nir_variable *param = b->func->impl->params[b->func_param_idx++];
 
-      assert(param->type == type->type);
+      if (type->base_type == vtn_base_type_pointer && type->type == NULL) {
+         struct vtn_variable *vtn_var = rzalloc(b, struct vtn_variable);
+         vtn_var->type = type->deref;
+         vtn_var->var = param;
 
-      /* Name the parameter so it shows up nicely in NIR */
-      param->name = ralloc_strdup(param, val->name);
+         assert(vtn_var->type->type == param->type);
 
-      struct vtn_variable *vtn_var = rzalloc(b, struct vtn_variable);
-      vtn_var->type = type;
-      vtn_var->var = param;
-      vtn_var->chain.var = vtn_var;
-      vtn_var->chain.length = 0;
+         struct vtn_type *without_array = vtn_var->type;
+         while(glsl_type_is_array(without_array->type))
+            without_array = without_array->array_element;
 
-      struct vtn_type *without_array = type;
-      while(glsl_type_is_array(without_array->type))
-         without_array = without_array->array_element;
+         if (glsl_type_is_image(without_array->type)) {
+            vtn_var->mode = vtn_variable_mode_image;
+            param->interface_type = without_array->type;
+         } else if (glsl_type_is_sampler(without_array->type)) {
+            vtn_var->mode = vtn_variable_mode_sampler;
+            param->interface_type = without_array->type;
+         } else {
+            vtn_var->mode = vtn_variable_mode_param;
+         }
 
-      if (glsl_type_is_image(without_array->type)) {
-         vtn_var->mode = vtn_variable_mode_image;
-         param->interface_type = without_array->type;
-      } else if (glsl_type_is_sampler(without_array->type)) {
-         vtn_var->mode = vtn_variable_mode_sampler;
-         param->interface_type = without_array->type;
+         struct vtn_value *val =
+            vtn_push_value(b, w[2], vtn_value_type_pointer);
+
+         /* Name the parameter so it shows up nicely in NIR */
+         param->name = ralloc_strdup(param, val->name);
+
+         val->pointer = vtn_pointer_for_variable(b, vtn_var, type);
       } else {
-         vtn_var->mode = vtn_variable_mode_param;
-      }
+         /* We're a regular SSA value. */
+         struct vtn_ssa_value *param_ssa =
+            vtn_local_load(b, nir_deref_var_create(b, param));
+         struct vtn_value *val = vtn_push_ssa(b, w[2], type, param_ssa);
 
-      val->access_chain = &vtn_var->chain;
+         /* Name the parameter so it shows up nicely in NIR */
+         param->name = ralloc_strdup(param, val->name);
+      }
       break;
    }
 
@@ -183,7 +185,7 @@
       list_inithead(&c->body);
       c->start_block = case_block;
       c->fallthrough = NULL;
-      nir_array_init(&c->values, b);
+      util_dynarray_init(&c->values, b);
       c->is_default = false;
       c->visited = false;
 
@@ -195,7 +197,7 @@
    if (is_default) {
       case_block->switch_case->is_default = true;
    } else {
-      nir_array_add(&case_block->switch_case->values, uint32_t, val);
+      util_dynarray_append(&case_block->switch_case->values, uint32_t, val);
    }
 }
 
@@ -425,7 +427,7 @@
          list_for_each_entry(struct vtn_case, cse, &swtch->cases, link) {
             assert(cse->start_block != break_block);
             vtn_cfg_walk_blocks(b, &cse->body, cse->start_block, cse,
-                                break_block, NULL, loop_cont, NULL);
+                                break_block, loop_break, loop_cont, NULL);
          }
 
          /* Finally, we walk over all of the cases one more time and put
@@ -503,14 +505,13 @@
     * algorithm all over again.  It's easier if we just let
     * lower_vars_to_ssa do that for us instead of repeating it here.
     */
-   struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
-
    struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
    nir_variable *phi_var =
       nir_local_variable_create(b->nb.impl, type->type, "phi");
    _mesa_hash_table_insert(b->phi_table, w, phi_var);
 
-   val->ssa = vtn_local_load(b, nir_deref_var_create(b, phi_var));
+   vtn_push_ssa(b, w[2], type,
+                vtn_local_load(b, nir_deref_var_create(b, phi_var)));
 
    return true;
 }
@@ -709,7 +710,7 @@
             }
 
             nir_ssa_def *cond = NULL;
-            nir_array_foreach(&cse->values, uint32_t, val) {
+            util_dynarray_foreach(&cse->values, uint32_t, val) {
                nir_ssa_def *is_val =
                   nir_ieq(&b->nb, sel, nir_imm_int(&b->nb, *val));
 
diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index 5d38431..c30dcc7 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -433,9 +433,11 @@
    case GLSLstd450Log2:          return nir_op_flog2;
    case GLSLstd450Sqrt:          return nir_op_fsqrt;
    case GLSLstd450InverseSqrt:   return nir_op_frsq;
+   case GLSLstd450NMin:          return nir_op_fmin;
    case GLSLstd450FMin:          return nir_op_fmin;
    case GLSLstd450UMin:          return nir_op_umin;
    case GLSLstd450SMin:          return nir_op_imin;
+   case GLSLstd450NMax:          return nir_op_fmax;
    case GLSLstd450FMax:          return nir_op_fmax;
    case GLSLstd450UMax:          return nir_op_umax;
    case GLSLstd450SMax:          return nir_op_imax;
@@ -465,6 +467,8 @@
    }
 }
 
+#define NIR_IMM_FP(n, v) (src[0]->bit_size == 64 ? nir_imm_double(n, v) : nir_imm_float(n, v))
+
 static void
 handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
                    const uint32_t *w, unsigned count)
@@ -479,8 +483,13 @@
    /* Collect the various SSA sources */
    unsigned num_inputs = count - 5;
    nir_ssa_def *src[3] = { NULL, };
-   for (unsigned i = 0; i < num_inputs; i++)
+   for (unsigned i = 0; i < num_inputs; i++) {
+      /* These are handled specially below */
+      if (vtn_untyped_value(b, w[i + 5])->value_type == vtn_value_type_pointer)
+         continue;
+
       src[i] = vtn_ssa_value(b, w[i + 5])->def;
+   }
 
    switch (entrypoint) {
    case GLSLstd450Radians:
@@ -535,6 +544,7 @@
       return;
 
    case GLSLstd450FClamp:
+   case GLSLstd450NClamp:
       val->ssa->def = build_fclamp(nb, src[0], src[1], src[2]);
       return;
    case GLSLstd450UClamp:
@@ -560,12 +570,12 @@
       nir_ssa_def *t =
          build_fclamp(nb, nir_fdiv(nb, nir_fsub(nb, src[2], src[0]),
                                        nir_fsub(nb, src[1], src[0])),
-                          nir_imm_float(nb, 0.0), nir_imm_float(nb, 1.0));
+                          NIR_IMM_FP(nb, 0.0), NIR_IMM_FP(nb, 1.0));
       /* result = t * t * (3 - 2 * t) */
       val->ssa->def =
          nir_fmul(nb, t, nir_fmul(nb, t,
-            nir_fsub(nb, nir_imm_float(nb, 3.0),
-                         nir_fmul(nb, nir_imm_float(nb, 2.0), t))));
+            nir_fsub(nb, NIR_IMM_FP(nb, 3.0),
+                         nir_fmul(nb, NIR_IMM_FP(nb, 2.0), t))));
       return;
    }
 
diff --git a/src/compiler/spirv/vtn_private.h b/src/compiler/spirv/vtn_private.h
index a159461..8458462 100644
--- a/src/compiler/spirv/vtn_private.h
+++ b/src/compiler/spirv/vtn_private.h
@@ -30,7 +30,7 @@
 
 #include "nir/nir.h"
 #include "nir/nir_builder.h"
-#include "nir/nir_array.h"
+#include "util/u_dynarray.h"
 #include "nir_spirv.h"
 #include "spirv.h"
 
@@ -44,7 +44,7 @@
    vtn_value_type_decoration_group,
    vtn_value_type_type,
    vtn_value_type_constant,
-   vtn_value_type_access_chain,
+   vtn_value_type_pointer,
    vtn_value_type_function,
    vtn_value_type_block,
    vtn_value_type_ssa,
@@ -115,7 +115,7 @@
    struct vtn_case *fallthrough;
 
    /* The uint32_t values that map to this case */
-   nir_array values;
+   struct util_dynarray values;
 
    /* True if this is the default case */
    bool is_default;
@@ -196,48 +196,101 @@
    const struct glsl_type *type;
 };
 
+enum vtn_base_type {
+   vtn_base_type_void,
+   vtn_base_type_scalar,
+   vtn_base_type_vector,
+   vtn_base_type_matrix,
+   vtn_base_type_array,
+   vtn_base_type_struct,
+   vtn_base_type_pointer,
+   vtn_base_type_image,
+   vtn_base_type_sampler,
+   vtn_base_type_function,
+};
+
 struct vtn_type {
+   enum vtn_base_type base_type;
+
    const struct glsl_type *type;
 
    /* The value that declares this type.  Used for finding decorations */
    struct vtn_value *val;
 
-   /* for matrices, whether the matrix is stored row-major */
-   bool row_major;
+   /* Specifies the length of complex types. */
+   unsigned length;
 
-   /* for structs, the offset of each member */
-   unsigned *offsets;
-
-   /* for structs, whether it was decorated as a "non-SSBO-like" block */
-   bool block;
-
-   /* for structs, whether it was decorated as an "SSBO-like" block */
-   bool buffer_block;
-
-   /* for structs with block == true, whether this is a builtin block (i.e. a
-    * block that contains only builtins).
-    */
-   bool builtin_block;
-
-   /* Image format for image_load_store type images */
-   unsigned image_format;
-
-   /* Access qualifier for storage images */
-   SpvAccessQualifier access_qualifier;
-
-   /* for arrays and matrices, the array stride */
+   /* for arrays, matrices and pointers, the array stride */
    unsigned stride;
 
-   /* for arrays, the vtn_type for the elements of the array */
-   struct vtn_type *array_element;
+   union {
+      /* Members for scalar, vector, and array-like types */
+      struct {
+         /* for arrays, the vtn_type for the elements of the array */
+         struct vtn_type *array_element;
 
-   /* for structures, the vtn_type for each member */
-   struct vtn_type **members;
+         /* for matrices, whether the matrix is stored row-major */
+         bool row_major:1;
 
-   /* Whether this type, or a parent type, has been decorated as a builtin */
-   bool is_builtin;
+         /* Whether this type, or a parent type, has been decorated as a
+          * builtin
+          */
+         bool is_builtin:1;
 
-   SpvBuiltIn builtin;
+         /* Which built-in to use */
+         SpvBuiltIn builtin;
+      };
+
+      /* Members for struct types */
+      struct {
+         /* for structures, the vtn_type for each member */
+         struct vtn_type **members;
+
+         /* for structs, the offset of each member */
+         unsigned *offsets;
+
+         /* for structs, whether it was decorated as a "non-SSBO-like" block */
+         bool block:1;
+
+         /* for structs, whether it was decorated as an "SSBO-like" block */
+         bool buffer_block:1;
+
+         /* for structs with block == true, whether this is a builtin block
+          * (i.e. a block that contains only builtins).
+          */
+         bool builtin_block:1;
+      };
+
+      /* Members for pointer types */
+      struct {
+         /* For pointers, the vtn_type of the pointed-to (dereferenced) type */
+         struct vtn_type *deref;
+
+         /* Storage class for pointers */
+         SpvStorageClass storage_class;
+      };
+
+      /* Members for image types */
+      struct {
+         /* For images, indicates whether it's sampled or storage */
+         bool sampled;
+
+         /* Image format for image_load_store type images */
+         unsigned image_format;
+
+         /* Access qualifier for storage images */
+         SpvAccessQualifier access_qualifier;
+      };
+
+      /* Members for function types */
+      struct {
+         /* For functions, the vtn_type for each parameter */
+         struct vtn_type **params;
+
+         /* Return type for functions */
+         struct vtn_type *return_type;
+      };
+   };
 };
 
 struct vtn_variable;
@@ -253,12 +306,19 @@
 };
 
 struct vtn_access_chain {
-   struct vtn_variable *var;
-
    uint32_t length;
 
-   /* Struct elements and array offsets */
-   struct vtn_access_link link[0];
+   /** Whether or not to treat the base pointer as an array.  This is only
+    * true if this access chain came from an OpPtrAccessChain.
+    */
+   bool ptr_as_array;
+
+   /** Struct elements and array offsets.
+    *
+    * This is an array of 1 so that it can conveniently be created on the
+    * stack but the real length is given by the length field.
+    */
+   struct vtn_access_link link[1];
 };
 
 enum vtn_variable_mode {
@@ -275,6 +335,47 @@
    vtn_variable_mode_output,
 };
 
+struct vtn_pointer {
+   /** The variable mode for the referenced data */
+   enum vtn_variable_mode mode;
+
+   /** The dereferenced type of this pointer */
+   struct vtn_type *type;
+
+   /** The pointer type of this pointer
+    *
+    * This may be NULL for some temporary pointers constructed as part of a
+    * large load, store, or copy.  It MUST be valid for all pointers which are
+    * stored as SPIR-V SSA values.
+    */
+   struct vtn_type *ptr_type;
+
+   /** The referenced variable, if known
+    *
+    * This field may be NULL if the pointer uses a (block_index, offset) pair
+    * instead of an access chain.
+    */
+   struct vtn_variable *var;
+
+   /** An access chain describing how to get from var to the referenced data
+    *
+    * This field may be NULL if the pointer references the entire variable or
+    * if a (block_index, offset) pair is used instead of an access chain.
+    */
+   struct vtn_access_chain *chain;
+
+   /** A (block_index, offset) pair representing a UBO or SSBO position. */
+   struct nir_ssa_def *block_index;
+   struct nir_ssa_def *offset;
+};
+
+static inline bool
+vtn_pointer_uses_ssa_offset(struct vtn_pointer *ptr)
+{
+   return ptr->mode == vtn_variable_mode_ubo ||
+          ptr->mode == vtn_variable_mode_ssbo;
+}
+
 struct vtn_variable {
    enum vtn_variable_mode mode;
 
@@ -300,20 +401,18 @@
     * around this GLSLang issue in SPIR-V -> NIR.  Hopefully, we can drop this
     * hack at some point in the future.
     */
-   struct vtn_access_chain *copy_prop_sampler;
-
-   struct vtn_access_chain chain;
+   struct vtn_pointer *copy_prop_sampler;
 };
 
 struct vtn_image_pointer {
-   struct vtn_access_chain *image;
+   struct vtn_pointer *image;
    nir_ssa_def *coord;
    nir_ssa_def *sample;
 };
 
 struct vtn_sampled_image {
-   struct vtn_access_chain *image; /* Image or array of images */
-   struct vtn_access_chain *sampler; /* Sampler */
+   struct vtn_pointer *image; /* Image or array of images */
+   struct vtn_pointer *sampler; /* Sampler */
 };
 
 struct vtn_value {
@@ -328,7 +427,7 @@
          nir_constant *constant;
          const struct glsl_type *const_type;
       };
-      struct vtn_access_chain *access_chain;
+      struct vtn_pointer *pointer;
       struct vtn_image_pointer *image;
       struct vtn_sampled_image *sampled_image;
       struct vtn_function *func;
@@ -409,6 +508,12 @@
    bool has_loop_continue;
 };
 
+nir_ssa_def *
+vtn_pointer_to_ssa(struct vtn_builder *b, struct vtn_pointer *ptr);
+struct vtn_pointer *
+vtn_pointer_from_ssa(struct vtn_builder *b, nir_ssa_def *ssa,
+                     struct vtn_type *ptr_type);
+
 static inline struct vtn_value *
 vtn_push_value(struct vtn_builder *b, uint32_t value_id,
                enum vtn_value_type value_type)
@@ -422,6 +527,21 @@
 }
 
 static inline struct vtn_value *
+vtn_push_ssa(struct vtn_builder *b, uint32_t value_id,
+             struct vtn_type *type, struct vtn_ssa_value *ssa)
+{
+   struct vtn_value *val;
+   if (type->base_type == vtn_base_type_pointer) {
+      val = vtn_push_value(b, value_id, vtn_value_type_pointer);
+      val->pointer = vtn_pointer_from_ssa(b, ssa->def, type);
+   } else {
+      val = vtn_push_value(b, value_id, vtn_value_type_ssa);
+      val->ssa = ssa;
+   }
+   return val;
+}
+
+static inline struct vtn_value *
 vtn_untyped_value(struct vtn_builder *b, uint32_t value_id)
 {
    assert(value_id < b->value_id_bound);
@@ -459,13 +579,15 @@
 
 nir_deref_var *vtn_nir_deref(struct vtn_builder *b, uint32_t id);
 
-nir_deref_var *vtn_access_chain_to_deref(struct vtn_builder *b,
-                                         struct vtn_access_chain *chain);
+struct vtn_pointer *vtn_pointer_for_variable(struct vtn_builder *b,
+                                             struct vtn_variable *var,
+                                             struct vtn_type *ptr_type);
+
+nir_deref_var *vtn_pointer_to_deref(struct vtn_builder *b,
+                                    struct vtn_pointer *ptr);
 nir_ssa_def *
-vtn_access_chain_to_offset(struct vtn_builder *b,
-                           struct vtn_access_chain *chain,
-                           nir_ssa_def **index_out, struct vtn_type **type_out,
-                           unsigned *end_idx_out, bool stop_at_matrix);
+vtn_pointer_to_offset(struct vtn_builder *b, struct vtn_pointer *ptr,
+                      nir_ssa_def **index_out, unsigned *end_idx_out);
 
 struct vtn_ssa_value *vtn_local_load(struct vtn_builder *b, nir_deref_var *src);
 
@@ -473,10 +595,10 @@
                      nir_deref_var *dest);
 
 struct vtn_ssa_value *
-vtn_variable_load(struct vtn_builder *b, struct vtn_access_chain *src);
+vtn_variable_load(struct vtn_builder *b, struct vtn_pointer *src);
 
 void vtn_variable_store(struct vtn_builder *b, struct vtn_ssa_value *src,
-                        struct vtn_access_chain *dest);
+                        struct vtn_pointer *dest);
 
 void vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
                           const uint32_t *w, unsigned count);
diff --git a/src/compiler/spirv/vtn_variables.c b/src/compiler/spirv/vtn_variables.c
index cf03628..c3eec0c 100644
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@@ -29,24 +29,71 @@
 #include "spirv_info.h"
 
 static struct vtn_access_chain *
+vtn_access_chain_create(struct vtn_builder *b, unsigned length)
+{
+   struct vtn_access_chain *chain;
+
+   /* Subtract 1 since struct vtn_access_chain already embeds one link */
+   size_t size = sizeof(*chain) +
+                 (MAX2(length, 1) - 1) * sizeof(chain->link[0]);
+   chain = rzalloc_size(b, size);
+   chain->length = length;
+
+   return chain;
+}
+
+static struct vtn_access_chain *
 vtn_access_chain_extend(struct vtn_builder *b, struct vtn_access_chain *old,
                         unsigned new_ids)
 {
    struct vtn_access_chain *chain;
 
-   unsigned new_len = old->length + new_ids;
-   /* TODO: don't use rzalloc */
-   chain = rzalloc_size(b, sizeof(*chain) + new_len * sizeof(chain->link[0]));
+   unsigned old_len = old ? old->length : 0;
+   chain = vtn_access_chain_create(b, old_len + new_ids);
 
-   chain->var = old->var;
-   chain->length = new_len;
-
-   for (unsigned i = 0; i < old->length; i++)
+   for (unsigned i = 0; i < old_len; i++)
       chain->link[i] = old->link[i];
 
    return chain;
 }
 
+/* Dereference base by appending deref_chain to base's own access chain */
+static struct vtn_pointer *
+vtn_access_chain_pointer_dereference(struct vtn_builder *b,
+                                     struct vtn_pointer *base,
+                                     struct vtn_access_chain *deref_chain)
+{
+   struct vtn_access_chain *chain =
+      vtn_access_chain_extend(b, base->chain, deref_chain->length);
+   struct vtn_type *type = base->type;
+
+   /* OpPtrAccessChain is only allowed on things which support variable
+    * pointers.  For everything else, the client is expected to just pass us
+    * the right access chain.
+    */
+   assert(!deref_chain->ptr_as_array);
+
+   unsigned start = base->chain ? base->chain->length : 0;
+   for (unsigned i = 0; i < deref_chain->length; i++) {
+      chain->link[start + i] = deref_chain->link[i];
+
+      if (glsl_type_is_struct(type->type)) {
+         assert(deref_chain->link[i].mode == vtn_access_mode_literal);
+         type = type->members[deref_chain->link[i].id];
+      } else {
+         type = type->array_element;
+      }
+   }
+
+   struct vtn_pointer *ptr = rzalloc(b, struct vtn_pointer);
+   ptr->mode = base->mode;
+   ptr->type = type;
+   ptr->var = base->var;
+   ptr->chain = chain;
+
+   return ptr;
+}
+
 static nir_ssa_def *
 vtn_access_link_as_ssa(struct vtn_builder *b, struct vtn_access_link link,
                        unsigned stride)
@@ -62,20 +109,125 @@
    }
 }
 
-static struct vtn_type *
-vtn_access_chain_tail_type(struct vtn_builder *b,
-                           struct vtn_access_chain *chain)
+static nir_ssa_def *
+vtn_variable_resource_index(struct vtn_builder *b, struct vtn_variable *var,
+                            nir_ssa_def *desc_array_index)
 {
-   struct vtn_type *type = chain->var->type;
-   for (unsigned i = 0; i < chain->length; i++) {
-      if (glsl_type_is_struct(type->type)) {
-         assert(chain->link[i].mode == vtn_access_mode_literal);
-         type = type->members[chain->link[i].id];
-      } else {
+   if (!desc_array_index) {
+      assert(glsl_type_is_struct(var->type->type));
+      desc_array_index = nir_imm_int(&b->nb, 0);
+   }
+
+   nir_intrinsic_instr *instr =
+      nir_intrinsic_instr_create(b->nb.shader,
+                                 nir_intrinsic_vulkan_resource_index);
+   instr->src[0] = nir_src_for_ssa(desc_array_index);
+   nir_intrinsic_set_desc_set(instr, var->descriptor_set);
+   nir_intrinsic_set_binding(instr, var->binding);
+
+   nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
+   nir_builder_instr_insert(&b->nb, &instr->instr);
+
+   return &instr->dest.ssa;
+}
+
+static struct vtn_pointer *
+vtn_ssa_offset_pointer_dereference(struct vtn_builder *b,
+                                   struct vtn_pointer *base,
+                                   struct vtn_access_chain *deref_chain)
+{
+   nir_ssa_def *block_index = base->block_index;
+   nir_ssa_def *offset = base->offset;
+   struct vtn_type *type = base->type;
+
+   unsigned idx = 0;
+   if (deref_chain->ptr_as_array) {
+      /* We need ptr_type for the stride */
+      assert(base->ptr_type);
+      /* This must be a pointer to an actual element somewhere */
+      assert(block_index && offset);
+      /* We need at least one element in the chain */
+      assert(deref_chain->length >= 1);
+
+      nir_ssa_def *elem_offset =
+         vtn_access_link_as_ssa(b, deref_chain->link[idx],
+                                base->ptr_type->stride);
+      offset = nir_iadd(&b->nb, offset, elem_offset);
+      idx++;
+   }
+
+   if (!block_index) {
+      assert(base->var);
+      if (glsl_type_is_array(type->type)) {
+         /* We need at least one element in the chain */
+         assert(deref_chain->length >= 1);
+
+         nir_ssa_def *desc_arr_idx =
+            vtn_access_link_as_ssa(b, deref_chain->link[0], 1);
+         block_index = vtn_variable_resource_index(b, base->var, desc_arr_idx);
          type = type->array_element;
+         idx++;
+      } else {
+         block_index = vtn_variable_resource_index(b, base->var, NULL);
+      }
+
+      /* This is the first access chain so we also need an offset */
+      assert(!offset);
+      offset = nir_imm_int(&b->nb, 0);
+   }
+   assert(offset);
+
+   for (; idx < deref_chain->length; idx++) {
+      switch (glsl_get_base_type(type->type)) {
+      case GLSL_TYPE_UINT:
+      case GLSL_TYPE_INT:
+      case GLSL_TYPE_UINT64:
+      case GLSL_TYPE_INT64:
+      case GLSL_TYPE_FLOAT:
+      case GLSL_TYPE_DOUBLE:
+      case GLSL_TYPE_BOOL:
+      case GLSL_TYPE_ARRAY: {
+         nir_ssa_def *elem_offset =
+            vtn_access_link_as_ssa(b, deref_chain->link[idx], type->stride);
+         offset = nir_iadd(&b->nb, offset, elem_offset);
+         type = type->array_element;
+         break;
+      }
+
+      case GLSL_TYPE_STRUCT: {
+         assert(deref_chain->link[idx].mode == vtn_access_mode_literal);
+         unsigned member = deref_chain->link[idx].id;
+         nir_ssa_def *mem_offset = nir_imm_int(&b->nb, type->offsets[member]);
+         offset = nir_iadd(&b->nb, offset, mem_offset);
+         type = type->members[member];
+         break;
+      }
+
+      default:
+         unreachable("Invalid type for deref");
       }
    }
-   return type;
+
+   struct vtn_pointer *ptr = rzalloc(b, struct vtn_pointer);
+   ptr->mode = base->mode;
+   ptr->type = type;
+   ptr->block_index = block_index;
+   ptr->offset = offset;
+
+   return ptr;
+}
+
+/* Dereference base via the SSA-offset or access-chain path, per its mode */
+static struct vtn_pointer *
+vtn_pointer_dereference(struct vtn_builder *b,
+                        struct vtn_pointer *base,
+                        struct vtn_access_chain *deref_chain)
+{
+   if (vtn_pointer_uses_ssa_offset(base)) {
+      return vtn_ssa_offset_pointer_dereference(b, base, deref_chain);
+   } else {
+      return vtn_access_chain_pointer_dereference(b, base, deref_chain);
+   }
 }
 
 /* Crawls a chain of array derefs and rewrites the types so that the
@@ -93,26 +245,48 @@
    }
 }
 
+struct vtn_pointer *
+vtn_pointer_for_variable(struct vtn_builder *b,
+                         struct vtn_variable *var, struct vtn_type *ptr_type)
+{
+   struct vtn_pointer *pointer = rzalloc(b, struct vtn_pointer);
+
+   pointer->mode = var->mode;
+   pointer->type = var->type;
+   assert(ptr_type->base_type == vtn_base_type_pointer);
+   assert(ptr_type->deref->type == var->type->type);
+   pointer->ptr_type = ptr_type;
+   pointer->var = var;
+
+   return pointer;
+}
+
 nir_deref_var *
-vtn_access_chain_to_deref(struct vtn_builder *b, struct vtn_access_chain *chain)
+vtn_pointer_to_deref(struct vtn_builder *b, struct vtn_pointer *ptr)
 {
    /* Do on-the-fly copy propagation for samplers. */
-   if (chain->var->copy_prop_sampler)
-      return vtn_access_chain_to_deref(b, chain->var->copy_prop_sampler);
+   if (ptr->var->copy_prop_sampler)
+      return vtn_pointer_to_deref(b, ptr->var->copy_prop_sampler);
 
    nir_deref_var *deref_var;
-   if (chain->var->var) {
-      deref_var = nir_deref_var_create(b, chain->var->var);
+   if (ptr->var->var) {
+      deref_var = nir_deref_var_create(b, ptr->var->var);
+      /* Raw variable access */
+      if (!ptr->chain)
+         return deref_var;
    } else {
-      assert(chain->var->members);
+      assert(ptr->var->members);
       /* Create the deref_var manually.  It will get filled out later. */
       deref_var = rzalloc(b, nir_deref_var);
       deref_var->deref.deref_type = nir_deref_type_var;
    }
 
-   struct vtn_type *deref_type = chain->var->type;
+   struct vtn_access_chain *chain = ptr->chain;
+   assert(chain);
+
+   struct vtn_type *deref_type = ptr->var->type;
    nir_deref *tail = &deref_var->deref;
-   nir_variable **members = chain->var->members;
+   nir_variable **members = ptr->var->members;
 
    for (unsigned i = 0; i < chain->length; i++) {
       enum glsl_base_type base_type = glsl_get_base_type(deref_type->type);
@@ -236,10 +410,8 @@
 nir_deref_var *
 vtn_nir_deref(struct vtn_builder *b, uint32_t id)
 {
-   struct vtn_access_chain *chain =
-      vtn_value(b, id, vtn_value_type_access_chain)->access_chain;
-
-   return vtn_access_chain_to_deref(b, chain);
+   struct vtn_pointer *ptr = vtn_value(b, id, vtn_value_type_pointer)->pointer;
+   return vtn_pointer_to_deref(b, ptr);
 }
 
 /*
@@ -302,53 +474,46 @@
 }
 
 static nir_ssa_def *
-get_vulkan_resource_index(struct vtn_builder *b, struct vtn_access_chain *chain,
+get_vulkan_resource_index(struct vtn_builder *b, struct vtn_pointer *ptr,
                           struct vtn_type **type, unsigned *chain_idx)
 {
    /* Push constants have no explicit binding */
-   if (chain->var->mode == vtn_variable_mode_push_constant) {
+   if (ptr->mode == vtn_variable_mode_push_constant) {
       *chain_idx = 0;
-      *type = chain->var->type;
+      *type = ptr->var->type;
       return NULL;
    }
 
-   nir_ssa_def *array_index;
-   if (glsl_type_is_array(chain->var->type->type)) {
-      assert(chain->length > 0);
-      array_index = vtn_access_link_as_ssa(b, chain->link[0], 1);
+   if (glsl_type_is_array(ptr->var->type->type)) {
+      assert(ptr->chain->length > 0);
+      nir_ssa_def *desc_array_index =
+         vtn_access_link_as_ssa(b, ptr->chain->link[0], 1);
       *chain_idx = 1;
-      *type = chain->var->type->array_element;
+      *type = ptr->var->type->array_element;
+      return vtn_variable_resource_index(b, ptr->var, desc_array_index);
    } else {
-      array_index = nir_imm_int(&b->nb, 0);
       *chain_idx = 0;
-      *type = chain->var->type;
+      *type = ptr->var->type;
+      return vtn_variable_resource_index(b, ptr->var, NULL);
    }
-
-   nir_intrinsic_instr *instr =
-      nir_intrinsic_instr_create(b->nb.shader,
-                                 nir_intrinsic_vulkan_resource_index);
-   instr->src[0] = nir_src_for_ssa(array_index);
-   nir_intrinsic_set_desc_set(instr, chain->var->descriptor_set);
-   nir_intrinsic_set_binding(instr, chain->var->binding);
-
-   nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
-   nir_builder_instr_insert(&b->nb, &instr->instr);
-
-   return &instr->dest.ssa;
 }
 
 nir_ssa_def *
-vtn_access_chain_to_offset(struct vtn_builder *b,
-                           struct vtn_access_chain *chain,
-                           nir_ssa_def **index_out, struct vtn_type **type_out,
-                           unsigned *end_idx_out, bool stop_at_matrix)
+vtn_pointer_to_offset(struct vtn_builder *b, struct vtn_pointer *ptr,
+                      nir_ssa_def **index_out, unsigned *end_idx_out)
 {
+   if (ptr->offset) {
+      assert(ptr->block_index);
+      *index_out = ptr->block_index;
+      return ptr->offset;
+   }
+
    unsigned idx = 0;
    struct vtn_type *type;
-   *index_out = get_vulkan_resource_index(b, chain, &type, &idx);
+   *index_out = get_vulkan_resource_index(b, ptr, &type, &idx);
 
    nir_ssa_def *offset = nir_imm_int(&b->nb, 0);
-   for (; idx < chain->length; idx++) {
+   for (; idx < ptr->chain->length; idx++) {
       enum glsl_base_type base_type = glsl_get_base_type(type->type);
       switch (base_type) {
       case GLSL_TYPE_UINT:
@@ -358,22 +523,17 @@
       case GLSL_TYPE_FLOAT:
       case GLSL_TYPE_DOUBLE:
       case GLSL_TYPE_BOOL:
-         /* Some users may not want matrix or vector derefs */
-         if (stop_at_matrix)
-            goto end;
-         /* Fall through */
-
       case GLSL_TYPE_ARRAY:
          offset = nir_iadd(&b->nb, offset,
-                           vtn_access_link_as_ssa(b, chain->link[idx],
+                           vtn_access_link_as_ssa(b, ptr->chain->link[idx],
                                                   type->stride));
 
          type = type->array_element;
          break;
 
       case GLSL_TYPE_STRUCT: {
-         assert(chain->link[idx].mode == vtn_access_mode_literal);
-         unsigned member = chain->link[idx].id;
+         assert(ptr->chain->link[idx].mode == vtn_access_mode_literal);
+         unsigned member = ptr->chain->link[idx].id;
          offset = nir_iadd(&b->nb, offset,
                            nir_imm_int(&b->nb, type->offsets[member]));
          type = type->members[member];
@@ -385,8 +545,7 @@
       }
    }
 
-end:
-   *type_out = type;
+   assert(type == ptr->type);
    if (end_idx_out)
       *end_idx_out = idx;
 
@@ -447,14 +606,10 @@
 
 static void
 vtn_access_chain_get_offset_size(struct vtn_access_chain *chain,
+                                 struct vtn_type *type,
                                  unsigned *access_offset,
                                  unsigned *access_size)
 {
-   /* Only valid for push constants accesses now. */
-   assert(chain->var->mode == vtn_variable_mode_push_constant);
-
-   struct vtn_type *type = chain->var->type;
-
    *access_offset = 0;
 
    for (unsigned i = 0; i < chain->length; i++) {
@@ -547,114 +702,77 @@
        * a vector, a scalar, or a matrix.
        */
       if (glsl_type_is_matrix(type->type)) {
-         if (chain == NULL) {
-            /* Loading the whole matrix */
-            struct vtn_ssa_value *transpose;
-            unsigned num_ops, vec_width;
-            if (type->row_major) {
-               num_ops = glsl_get_vector_elements(type->type);
-               vec_width = glsl_get_matrix_columns(type->type);
-               if (load) {
-                  const struct glsl_type *transpose_type =
-                     glsl_matrix_type(base_type, vec_width, num_ops);
-                  *inout = vtn_create_ssa_value(b, transpose_type);
-               } else {
-                  transpose = vtn_ssa_transpose(b, *inout);
-                  inout = &transpose;
-               }
+         /* Loading the whole matrix */
+         struct vtn_ssa_value *transpose;
+         unsigned num_ops, vec_width, col_stride;
+         if (type->row_major) {
+            num_ops = glsl_get_vector_elements(type->type);
+            vec_width = glsl_get_matrix_columns(type->type);
+            col_stride = type->array_element->stride;
+            if (load) {
+               const struct glsl_type *transpose_type =
+                  glsl_matrix_type(base_type, vec_width, num_ops);
+               *inout = vtn_create_ssa_value(b, transpose_type);
             } else {
-               num_ops = glsl_get_matrix_columns(type->type);
-               vec_width = glsl_get_vector_elements(type->type);
-            }
-
-            for (unsigned i = 0; i < num_ops; i++) {
-               nir_ssa_def *elem_offset =
-                  nir_iadd(&b->nb, offset,
-                           nir_imm_int(&b->nb, i * type->stride));
-               _vtn_load_store_tail(b, op, load, index, elem_offset,
-                                    access_offset, access_size,
-                                    &(*inout)->elems[i],
-                                    glsl_vector_type(base_type, vec_width));
-            }
-
-            if (load && type->row_major)
-               *inout = vtn_ssa_transpose(b, *inout);
-         } else if (type->row_major) {
-            /* Row-major but with an access chiain. */
-            nir_ssa_def *col_offset =
-               vtn_access_link_as_ssa(b, chain->link[chain_idx],
-                                      type->array_element->stride);
-            offset = nir_iadd(&b->nb, offset, col_offset);
-
-            if (chain_idx + 1 < chain->length) {
-               /* Picking off a single element */
-               nir_ssa_def *row_offset =
-                  vtn_access_link_as_ssa(b, chain->link[chain_idx + 1],
-                                         type->stride);
-               offset = nir_iadd(&b->nb, offset, row_offset);
-               if (load)
-                  *inout = vtn_create_ssa_value(b, glsl_scalar_type(base_type));
-               _vtn_load_store_tail(b, op, load, index, offset,
-                                    access_offset, access_size,
-                                    inout, glsl_scalar_type(base_type));
-            } else {
-               /* Grabbing a column; picking one element off each row */
-               unsigned num_comps = glsl_get_vector_elements(type->type);
-               const struct glsl_type *column_type =
-                  glsl_get_column_type(type->type);
-
-               nir_ssa_def *comps[4];
-               for (unsigned i = 0; i < num_comps; i++) {
-                  nir_ssa_def *elem_offset =
-                     nir_iadd(&b->nb, offset,
-                              nir_imm_int(&b->nb, i * type->stride));
-
-                  struct vtn_ssa_value *comp, temp_val;
-                  if (!load) {
-                     temp_val.def = nir_channel(&b->nb, (*inout)->def, i);
-                     temp_val.type = glsl_scalar_type(base_type);
-                  }
-                  comp = &temp_val;
-                  _vtn_load_store_tail(b, op, load, index, elem_offset,
-                                       access_offset, access_size,
-                                       &comp, glsl_scalar_type(base_type));
-                  comps[i] = comp->def;
-               }
-
-               if (load) {
-                  if (*inout == NULL)
-                     *inout = vtn_create_ssa_value(b, column_type);
-
-                  (*inout)->def = nir_vec(&b->nb, comps, num_comps);
-               }
+               transpose = vtn_ssa_transpose(b, *inout);
+               inout = &transpose;
             }
          } else {
-            /* Column-major with a deref. Fall through to array case. */
-            nir_ssa_def *col_offset =
-               vtn_access_link_as_ssa(b, chain->link[chain_idx], type->stride);
-            offset = nir_iadd(&b->nb, offset, col_offset);
-
-            _vtn_block_load_store(b, op, load, index, offset,
-                                  access_offset, access_size,
-                                  chain, chain_idx + 1,
-                                  type->array_element, inout);
+            num_ops = glsl_get_matrix_columns(type->type);
+            vec_width = glsl_get_vector_elements(type->type);
+            col_stride = type->stride;
          }
-      } else if (chain == NULL) {
-         /* Single whole vector */
-         assert(glsl_type_is_vector_or_scalar(type->type));
-         _vtn_load_store_tail(b, op, load, index, offset,
-                              access_offset, access_size,
-                              inout, type->type);
-      } else {
-         /* Single component of a vector. Fall through to array case. */
-         nir_ssa_def *elem_offset =
-            vtn_access_link_as_ssa(b, chain->link[chain_idx], type->stride);
-         offset = nir_iadd(&b->nb, offset, elem_offset);
 
-         _vtn_block_load_store(b, op, load, index, offset,
-                               access_offset, access_size,
-                               NULL, 0,
-                               type->array_element, inout);
+         for (unsigned i = 0; i < num_ops; i++) {
+            nir_ssa_def *elem_offset =
+               nir_iadd(&b->nb, offset, nir_imm_int(&b->nb, i * col_stride));
+            _vtn_load_store_tail(b, op, load, index, elem_offset,
+                                 access_offset, access_size,
+                                 &(*inout)->elems[i],
+                                 glsl_vector_type(base_type, vec_width));
+         }
+
+         if (load && type->row_major)
+            *inout = vtn_ssa_transpose(b, *inout);
+      } else {
+         unsigned elems = glsl_get_vector_elements(type->type);
+         unsigned type_size = glsl_get_bit_size(type->type) / 8;
+         if (elems == 1 || type->stride == type_size) {
+            /* This is a tightly-packed normal scalar or vector load */
+            assert(glsl_type_is_vector_or_scalar(type->type));
+            _vtn_load_store_tail(b, op, load, index, offset,
+                                 access_offset, access_size,
+                                 inout, type->type);
+         } else {
+            /* This is a strided load.  We have to load N things separately.
+             * This is the single column of a row-major matrix case.
+             */
+            assert(type->stride > type_size);
+            assert(type->stride % type_size == 0);
+
+            nir_ssa_def *per_comp[4];
+            for (unsigned i = 0; i < elems; i++) {
+               nir_ssa_def *elem_offset =
+                  nir_iadd(&b->nb, offset,
+                                   nir_imm_int(&b->nb, i * type->stride));
+               struct vtn_ssa_value *comp, temp_val;
+               if (!load) {
+                  temp_val.def = nir_channel(&b->nb, (*inout)->def, i);
+                  temp_val.type = glsl_scalar_type(base_type);
+               }
+               comp = &temp_val;
+               _vtn_load_store_tail(b, op, load, index, elem_offset,
+                                    access_offset, access_size,
+                                    &comp, glsl_scalar_type(base_type));
+               per_comp[i] = comp->def;
+            }
+
+            if (load) {
+               if (*inout == NULL)
+                  *inout = vtn_create_ssa_value(b, type->type);
+               (*inout)->def = nir_vec(&b->nb, per_comp, elems);
+            }
+         }
       }
       return;
 
@@ -690,11 +808,11 @@
 }
 
 static struct vtn_ssa_value *
-vtn_block_load(struct vtn_builder *b, struct vtn_access_chain *src)
+vtn_block_load(struct vtn_builder *b, struct vtn_pointer *src)
 {
    nir_intrinsic_op op;
    unsigned access_offset = 0, access_size = 0;
-   switch (src->var->mode) {
+   switch (src->mode) {
    case vtn_variable_mode_ubo:
       op = nir_intrinsic_load_ubo;
       break;
@@ -703,53 +821,50 @@
       break;
    case vtn_variable_mode_push_constant:
       op = nir_intrinsic_load_push_constant;
-      vtn_access_chain_get_offset_size(src, &access_offset, &access_size);
+      vtn_access_chain_get_offset_size(src->chain, src->var->type,
+                                       &access_offset, &access_size);
       break;
    default:
       assert(!"Invalid block variable mode");
-      return NULL;
    }
 
    nir_ssa_def *offset, *index = NULL;
-   struct vtn_type *type;
    unsigned chain_idx;
-   offset = vtn_access_chain_to_offset(b, src, &index, &type, &chain_idx, true);
+   offset = vtn_pointer_to_offset(b, src, &index, &chain_idx);
 
    struct vtn_ssa_value *value = NULL;
    _vtn_block_load_store(b, op, true, index, offset,
                          access_offset, access_size,
-                         src, chain_idx, type, &value);
+                         src->chain, chain_idx, src->type, &value);
    return value;
 }
 
 static void
 vtn_block_store(struct vtn_builder *b, struct vtn_ssa_value *src,
-                struct vtn_access_chain *dst)
+                struct vtn_pointer *dst)
 {
    nir_ssa_def *offset, *index = NULL;
-   struct vtn_type *type;
    unsigned chain_idx;
-   offset = vtn_access_chain_to_offset(b, dst, &index, &type, &chain_idx, true);
+   offset = vtn_pointer_to_offset(b, dst, &index, &chain_idx);
 
    _vtn_block_load_store(b, nir_intrinsic_store_ssbo, false, index, offset,
-                         0, 0, dst, chain_idx, type, &src);
+                         0, 0, dst->chain, chain_idx, dst->type, &src);
 }
 
 static bool
-vtn_variable_is_external_block(struct vtn_variable *var)
+vtn_pointer_is_external_block(struct vtn_pointer *ptr)
 {
-   return var->mode == vtn_variable_mode_ssbo ||
-          var->mode == vtn_variable_mode_ubo ||
-          var->mode == vtn_variable_mode_push_constant;
+   return ptr->mode == vtn_variable_mode_ssbo ||
+          ptr->mode == vtn_variable_mode_ubo ||
+          ptr->mode == vtn_variable_mode_push_constant;
 }
 
 static void
 _vtn_variable_load_store(struct vtn_builder *b, bool load,
-                         struct vtn_access_chain *chain,
-                         struct vtn_type *tail_type,
+                         struct vtn_pointer *ptr,
                          struct vtn_ssa_value **inout)
 {
-   enum glsl_base_type base_type = glsl_get_base_type(tail_type->type);
+   enum glsl_base_type base_type = glsl_get_base_type(ptr->type->type);
    switch (base_type) {
    case GLSL_TYPE_UINT:
    case GLSL_TYPE_INT:
@@ -765,30 +880,32 @@
        * are storred row-major in a UBO.
        */
       if (load) {
-         *inout = vtn_local_load(b, vtn_access_chain_to_deref(b, chain));
+         *inout = vtn_local_load(b, vtn_pointer_to_deref(b, ptr));
       } else {
-         vtn_local_store(b, *inout, vtn_access_chain_to_deref(b, chain));
+         vtn_local_store(b, *inout, vtn_pointer_to_deref(b, ptr));
       }
       return;
 
    case GLSL_TYPE_ARRAY:
    case GLSL_TYPE_STRUCT: {
-      struct vtn_access_chain *new_chain =
-         vtn_access_chain_extend(b, chain, 1);
-      new_chain->link[chain->length].mode = vtn_access_mode_literal;
-      unsigned elems = glsl_get_length(tail_type->type);
+      unsigned elems = glsl_get_length(ptr->type->type);
       if (load) {
          assert(*inout == NULL);
          *inout = rzalloc(b, struct vtn_ssa_value);
-         (*inout)->type = tail_type->type;
+         (*inout)->type = ptr->type->type;
          (*inout)->elems = rzalloc_array(b, struct vtn_ssa_value *, elems);
       }
+
+      struct vtn_access_chain chain = {
+         .length = 1,
+         .link = {
+            { .mode = vtn_access_mode_literal, },
+         }
+      };
       for (unsigned i = 0; i < elems; i++) {
-         new_chain->link[chain->length].id = i;
-         struct vtn_type *elem_type = base_type == GLSL_TYPE_ARRAY ?
-            tail_type->array_element : tail_type->members[i];
-         _vtn_variable_load_store(b, load, new_chain, elem_type,
-                                  &(*inout)->elems[i]);
+         chain.link[0].id = i;
+         struct vtn_pointer *elem = vtn_pointer_dereference(b, ptr, &chain);
+         _vtn_variable_load_store(b, load, elem, &(*inout)->elems[i]);
       }
       return;
    }
@@ -799,36 +916,35 @@
 }
 
 struct vtn_ssa_value *
-vtn_variable_load(struct vtn_builder *b, struct vtn_access_chain *src)
+vtn_variable_load(struct vtn_builder *b, struct vtn_pointer *src)
 {
-   if (vtn_variable_is_external_block(src->var)) {
+   if (vtn_pointer_is_external_block(src)) {
       return vtn_block_load(b, src);
    } else {
-      struct vtn_type *tail_type = vtn_access_chain_tail_type(b, src);
       struct vtn_ssa_value *val = NULL;
-      _vtn_variable_load_store(b, true, src, tail_type, &val);
+      _vtn_variable_load_store(b, true, src, &val);
       return val;
    }
 }
 
 void
 vtn_variable_store(struct vtn_builder *b, struct vtn_ssa_value *src,
-                   struct vtn_access_chain *dest)
+                   struct vtn_pointer *dest)
 {
-   if (vtn_variable_is_external_block(dest->var)) {
-      assert(dest->var->mode == vtn_variable_mode_ssbo);
+   if (vtn_pointer_is_external_block(dest)) {
+      assert(dest->mode == vtn_variable_mode_ssbo);
       vtn_block_store(b, src, dest);
    } else {
-      struct vtn_type *tail_type = vtn_access_chain_tail_type(b, dest);
-      _vtn_variable_load_store(b, false, dest, tail_type, &src);
+      _vtn_variable_load_store(b, false, dest, &src);
    }
 }
 
 static void
-_vtn_variable_copy(struct vtn_builder *b, struct vtn_access_chain *dest,
-                   struct vtn_access_chain *src, struct vtn_type *tail_type)
+_vtn_variable_copy(struct vtn_builder *b, struct vtn_pointer *dest,
+                   struct vtn_pointer *src)
 {
-   enum glsl_base_type base_type = glsl_get_base_type(tail_type->type);
+   assert(src->type->type == dest->type->type);
+   enum glsl_base_type base_type = glsl_get_base_type(src->type->type);
    switch (base_type) {
    case GLSL_TYPE_UINT:
    case GLSL_TYPE_INT:
@@ -848,18 +964,21 @@
 
    case GLSL_TYPE_ARRAY:
    case GLSL_TYPE_STRUCT: {
-      struct vtn_access_chain *new_src, *new_dest;
-      new_src = vtn_access_chain_extend(b, src, 1);
-      new_dest = vtn_access_chain_extend(b, dest, 1);
-      new_src->link[src->length].mode = vtn_access_mode_literal;
-      new_dest->link[dest->length].mode = vtn_access_mode_literal;
-      unsigned elems = glsl_get_length(tail_type->type);
+      struct vtn_access_chain chain = {
+         .length = 1,
+         .link = {
+            { .mode = vtn_access_mode_literal, },
+         }
+      };
+      unsigned elems = glsl_get_length(src->type->type);
       for (unsigned i = 0; i < elems; i++) {
-         new_src->link[src->length].id = i;
-         new_dest->link[dest->length].id = i;
-         struct vtn_type *elem_type = base_type == GLSL_TYPE_ARRAY ?
-            tail_type->array_element : tail_type->members[i];
-         _vtn_variable_copy(b, new_dest, new_src, elem_type);
+         chain.link[0].id = i;
+         struct vtn_pointer *src_elem =
+            vtn_pointer_dereference(b, src, &chain);
+         struct vtn_pointer *dest_elem =
+            vtn_pointer_dereference(b, dest, &chain);
+
+         _vtn_variable_copy(b, dest_elem, src_elem);
       }
       return;
    }
@@ -870,16 +989,13 @@
 }
 
 static void
-vtn_variable_copy(struct vtn_builder *b, struct vtn_access_chain *dest,
-                  struct vtn_access_chain *src)
+vtn_variable_copy(struct vtn_builder *b, struct vtn_pointer *dest,
+                  struct vtn_pointer *src)
 {
-   struct vtn_type *tail_type = vtn_access_chain_tail_type(b, src);
-   assert(vtn_access_chain_tail_type(b, dest)->type == tail_type->type);
-
    /* TODO: At some point, we should add a special-case for when we can
     * just emit a copy_var intrinsic.
     */
-   _vtn_variable_copy(b, dest, src, tail_type);
+   _vtn_variable_copy(b, dest, src);
 }
 
 static void
@@ -1005,6 +1121,10 @@
       *location = FRAG_RESULT_DEPTH;
       assert(*mode == nir_var_shader_out);
       break;
+   case SpvBuiltInHelperInvocation:
+      *location = SYSTEM_VALUE_HELPER_INVOCATION;
+      set_mode_system_value(mode);
+      break;
    case SpvBuiltInNumWorkgroups:
       *location = SYSTEM_VALUE_NUM_WORK_GROUPS;
       set_mode_system_value(mode);
@@ -1041,7 +1161,10 @@
       *location = SYSTEM_VALUE_DRAW_ID;
       set_mode_system_value(mode);
       break;
-   case SpvBuiltInHelperInvocation:
+   case SpvBuiltInViewIndex:
+      *location = SYSTEM_VALUE_VIEW_INDEX;
+      set_mode_system_value(mode);
+      break;
    default:
       unreachable("unsupported builtin");
    }
@@ -1095,9 +1218,9 @@
          nir_var->data.read_only = true;
 
          nir_constant *c = rzalloc(nir_var, nir_constant);
-         c->values[0].u32[0] = b->shader->info->cs.local_size[0];
-         c->values[0].u32[1] = b->shader->info->cs.local_size[1];
-         c->values[0].u32[2] = b->shader->info->cs.local_size[2];
+         c->values[0].u32[0] = b->shader->info.cs.local_size[0];
+         c->values[0].u32[1] = b->shader->info.cs.local_size[1];
+         c->values[0].u32[2] = b->shader->info.cs.local_size[2];
          nir_var->constant_initializer = c;
          break;
       }
@@ -1212,9 +1335,9 @@
       break;
    }
 
-   if (val->value_type == vtn_value_type_access_chain) {
-      assert(val->access_chain->length == 0);
-      assert(val->access_chain->var == void_var);
+   if (val->value_type == vtn_value_type_pointer) {
+      assert(val->pointer->var == void_var);
+      assert(val->pointer->chain == NULL);
       assert(member == -1);
    } else {
       assert(val->value_type == vtn_value_type_type);
@@ -1286,6 +1409,124 @@
    }
 }
 
+static enum vtn_variable_mode
+vtn_storage_class_to_mode(SpvStorageClass class,
+                          struct vtn_type *interface_type,
+                          nir_variable_mode *nir_mode_out)
+{
+   enum vtn_variable_mode mode;
+   nir_variable_mode nir_mode;
+   switch (class) {
+   case SpvStorageClassUniform:
+      if (interface_type->block) {
+         mode = vtn_variable_mode_ubo;
+         nir_mode = 0;
+      } else if (interface_type->buffer_block) {
+         mode = vtn_variable_mode_ssbo;
+         nir_mode = 0;
+      } else {
+         assert(!"Invalid uniform variable type");
+      }
+      break;
+   case SpvStorageClassStorageBuffer:
+      mode = vtn_variable_mode_ssbo;
+      nir_mode = 0;
+      break;
+   case SpvStorageClassUniformConstant:
+      if (glsl_type_is_image(interface_type->type)) {
+         mode = vtn_variable_mode_image;
+         nir_mode = nir_var_uniform;
+      } else if (glsl_type_is_sampler(interface_type->type)) {
+         mode = vtn_variable_mode_sampler;
+         nir_mode = nir_var_uniform;
+      } else {
+         assert(!"Invalid uniform constant variable type");
+      }
+      break;
+   case SpvStorageClassPushConstant:
+      mode = vtn_variable_mode_push_constant;
+      nir_mode = nir_var_uniform;
+      break;
+   case SpvStorageClassInput:
+      mode = vtn_variable_mode_input;
+      nir_mode = nir_var_shader_in;
+      break;
+   case SpvStorageClassOutput:
+      mode = vtn_variable_mode_output;
+      nir_mode = nir_var_shader_out;
+      break;
+   case SpvStorageClassPrivate:
+      mode = vtn_variable_mode_global;
+      nir_mode = nir_var_global;
+      break;
+   case SpvStorageClassFunction:
+      mode = vtn_variable_mode_local;
+      nir_mode = nir_var_local;
+      break;
+   case SpvStorageClassWorkgroup:
+      mode = vtn_variable_mode_workgroup;
+      nir_mode = nir_var_shared;
+      break;
+   case SpvStorageClassCrossWorkgroup:
+   case SpvStorageClassGeneric:
+   case SpvStorageClassAtomicCounter:
+   default:
+      unreachable("Unhandled variable storage class");
+   }
+
+   if (nir_mode_out)
+      *nir_mode_out = nir_mode;
+
+   return mode;
+}
+
+nir_ssa_def *
+vtn_pointer_to_ssa(struct vtn_builder *b, struct vtn_pointer *ptr)
+{
+   /* This pointer needs to have a pointer type with actual storage */
+   assert(ptr->ptr_type);
+   assert(ptr->ptr_type->type);
+
+   if (ptr->offset && ptr->block_index) {
+      return nir_vec2(&b->nb, ptr->block_index, ptr->offset);
+   } else {
+      /* If we don't have an offset or block index, then we must be a pointer
+       * to the variable itself.
+       */
+      assert(!ptr->offset && !ptr->block_index);
+
+      /* We can't handle a pointer to an array of descriptors because we have
+       * no way of knowing later on that we need to update the block index
+       * when dereferencing.
+       */
+      assert(ptr->var && ptr->var->type->base_type == vtn_base_type_struct);
+
+      return nir_vec2(&b->nb, vtn_variable_resource_index(b, ptr->var, NULL),
+                              nir_imm_int(&b->nb, 0));
+   }
+}
+
+struct vtn_pointer *
+vtn_pointer_from_ssa(struct vtn_builder *b, nir_ssa_def *ssa,
+                     struct vtn_type *ptr_type)
+{
+   assert(ssa->num_components == 2 && ssa->bit_size == 32);
+   assert(ptr_type->base_type == vtn_base_type_pointer);
+   assert(ptr_type->deref->base_type != vtn_base_type_pointer);
+   /* This pointer type needs to have actual storage */
+   assert(ptr_type->type);
+
+   struct vtn_pointer *ptr = rzalloc(b, struct vtn_pointer);
+   ptr->mode = vtn_storage_class_to_mode(ptr_type->storage_class,
+                                         ptr_type, NULL);
+   ptr->type = ptr_type->deref;
+   ptr->ptr_type = ptr_type;
+   ptr->block_index = nir_channel(&b->nb, ssa, 0);
+   ptr->offset = nir_channel(&b->nb, ssa, 1);
+
+   return ptr;
+}
+
 static bool
 is_per_vertex_inout(const struct vtn_variable *var, gl_shader_stage stage)
 {
@@ -1304,6 +1545,203 @@
    return false;
 }
 
+static void
+vtn_create_variable(struct vtn_builder *b, struct vtn_value *val,
+                    struct vtn_type *ptr_type, SpvStorageClass storage_class,
+                    nir_constant *initializer)
+{
+   assert(ptr_type->base_type == vtn_base_type_pointer);
+   struct vtn_type *type = ptr_type->deref;
+
+   struct vtn_type *without_array = type;
+   while(glsl_type_is_array(without_array->type))
+      without_array = without_array->array_element;
+
+   enum vtn_variable_mode mode;
+   nir_variable_mode nir_mode;
+   mode = vtn_storage_class_to_mode(storage_class, without_array, &nir_mode);
+
+   switch (mode) {
+   case vtn_variable_mode_ubo:
+      b->shader->info.num_ubos++;
+      break;
+   case vtn_variable_mode_ssbo:
+      b->shader->info.num_ssbos++;
+      break;
+   case vtn_variable_mode_image:
+      b->shader->info.num_images++;
+      break;
+   case vtn_variable_mode_sampler:
+      b->shader->info.num_textures++;
+      break;
+   case vtn_variable_mode_push_constant:
+      b->shader->num_uniforms = vtn_type_block_size(type);
+      break;
+   default:
+      /* No tallying is needed */
+      break;
+   }
+
+   struct vtn_variable *var = rzalloc(b, struct vtn_variable);
+   var->type = type;
+   var->mode = mode;
+
+   assert(val->value_type == vtn_value_type_pointer);
+   val->pointer = vtn_pointer_for_variable(b, var, ptr_type);
+
+   switch (var->mode) {
+   case vtn_variable_mode_local:
+   case vtn_variable_mode_global:
+   case vtn_variable_mode_image:
+   case vtn_variable_mode_sampler:
+   case vtn_variable_mode_workgroup:
+      /* For these, we create the variable normally */
+      var->var = rzalloc(b->shader, nir_variable);
+      var->var->name = ralloc_strdup(var->var, val->name);
+      var->var->type = var->type->type;
+      var->var->data.mode = nir_mode;
+
+      switch (var->mode) {
+      case vtn_variable_mode_image:
+      case vtn_variable_mode_sampler:
+         var->var->interface_type = without_array->type;
+         break;
+      default:
+         var->var->interface_type = NULL;
+         break;
+      }
+      break;
+
+   case vtn_variable_mode_input:
+   case vtn_variable_mode_output: {
+      /* In order to know whether or not we're a per-vertex inout, we need
+       * the patch qualifier.  This means walking the variable decorations
+       * early before we actually create any variables.  Not a big deal.
+       *
+       * GLSLang really likes to place decorations in the most interior
+       * thing it possibly can.  In particular, if you have a struct, it
+       * will place the patch decorations on the struct members.  This
+       * should be handled by the variable splitting below just fine.
+       *
+       * If you have an array-of-struct, things get even more weird as it
+       * will place the patch decorations on the struct even though it's
+       * inside an array and some of the members being patch and others not
+       * makes no sense whatsoever.  Since the only sensible thing is for
+       * it to be all or nothing, we'll call it patch if any of the members
+       * are declared patch.
+       */
+      var->patch = false;
+      vtn_foreach_decoration(b, val, var_is_patch_cb, &var->patch);
+      if (glsl_type_is_array(var->type->type) &&
+          glsl_type_is_struct(without_array->type)) {
+         vtn_foreach_decoration(b, without_array->val,
+                                var_is_patch_cb, &var->patch);
+      }
+
+      /* For inputs and outputs, we immediately split structures.  This
+       * is for a couple of reasons.  For one, builtins may all come in
+       * a struct and we really want those split out into separate
+       * variables.  For another, interpolation qualifiers can be
+       * applied to members of the top-level struct and we need to be
+       * able to preserve that information.
+       */
+
+      int array_length = -1;
+      struct vtn_type *interface_type = var->type;
+      if (is_per_vertex_inout(var, b->shader->stage)) {
+         /* In Geometry shaders (and some tessellation), inputs come
+          * in per-vertex arrays.  However, some builtins come in
+          * non-per-vertex, hence the need for the is_array check.  In
+          * any case, there are no non-builtin arrays allowed so this
+          * check should be sufficient.
+          */
+         interface_type = var->type->array_element;
+         array_length = glsl_get_length(var->type->type);
+      }
+
+      if (glsl_type_is_struct(interface_type->type)) {
+         /* It's a struct.  Split it. */
+         unsigned num_members = glsl_get_length(interface_type->type);
+         var->members = ralloc_array(b, nir_variable *, num_members);
+
+         for (unsigned i = 0; i < num_members; i++) {
+            const struct glsl_type *mtype = interface_type->members[i]->type;
+            if (array_length >= 0)
+               mtype = glsl_array_type(mtype, array_length);
+
+            var->members[i] = rzalloc(b->shader, nir_variable);
+            var->members[i]->name =
+               ralloc_asprintf(var->members[i], "%s.%d", val->name, i);
+            var->members[i]->type = mtype;
+            var->members[i]->interface_type =
+               interface_type->members[i]->type;
+            var->members[i]->data.mode = nir_mode;
+            var->members[i]->data.patch = var->patch;
+         }
+      } else {
+         var->var = rzalloc(b->shader, nir_variable);
+         var->var->name = ralloc_strdup(var->var, val->name);
+         var->var->type = var->type->type;
+         var->var->interface_type = interface_type->type;
+         var->var->data.mode = nir_mode;
+         var->var->data.patch = var->patch;
+      }
+
+      /* For inputs and outputs, we need to grab locations and builtin
+       * information from the interface type.
+       */
+      vtn_foreach_decoration(b, interface_type->val, var_decoration_cb, var);
+      break;
+   }
+
+   case vtn_variable_mode_param:
+      unreachable("Not created through OpVariable");
+
+   case vtn_variable_mode_ubo:
+   case vtn_variable_mode_ssbo:
+   case vtn_variable_mode_push_constant:
+      /* These don't need actual variables. */
+      break;
+   }
+
+   if (initializer) {
+      var->var->constant_initializer =
+         nir_constant_clone(initializer, var->var);
+   }
+
+   vtn_foreach_decoration(b, val, var_decoration_cb, var);
+
+   if (var->mode == vtn_variable_mode_image ||
+       var->mode == vtn_variable_mode_sampler) {
+      /* XXX: We still need the binding information in the nir_variable
+       * for these. We should fix that.
+       */
+      var->var->data.binding = var->binding;
+      var->var->data.descriptor_set = var->descriptor_set;
+      var->var->data.index = var->input_attachment_index;
+
+      if (var->mode == vtn_variable_mode_image)
+         var->var->data.image.format = without_array->image_format;
+   }
+
+   if (var->mode == vtn_variable_mode_local) {
+      assert(var->members == NULL && var->var != NULL);
+      nir_function_impl_add_variable(b->impl, var->var);
+   } else if (var->var) {
+      nir_shader_add_variable(b->shader, var->var);
+   } else if (var->members) {
+      unsigned count = glsl_get_length(without_array->type);
+      for (unsigned i = 0; i < count; i++) {
+         assert(var->members[i]->data.mode != nir_var_local);
+         nir_shader_add_variable(b->shader, var->members[i]);
+      }
+   } else {
+      assert(var->mode == vtn_variable_mode_ubo ||
+             var->mode == vtn_variable_mode_ssbo ||
+             var->mode == vtn_variable_mode_push_constant);
+   }
+}
+
 void
 vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
                      const uint32_t *w, unsigned count)
@@ -1316,234 +1754,40 @@
    }
 
    case SpvOpVariable: {
-      struct vtn_variable *var = rzalloc(b, struct vtn_variable);
-      var->type = vtn_value(b, w[1], vtn_value_type_type)->type;
+      struct vtn_type *ptr_type = vtn_value(b, w[1], vtn_value_type_type)->type;
 
-      var->chain.var = var;
-      var->chain.length = 0;
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_pointer);
 
-      struct vtn_value *val =
-         vtn_push_value(b, w[2], vtn_value_type_access_chain);
-      val->access_chain = &var->chain;
+      SpvStorageClass storage_class = w[3];
+      nir_constant *initializer = NULL;
+      if (count > 4)
+         initializer = vtn_value(b, w[4], vtn_value_type_constant)->constant;
 
-      struct vtn_type *without_array = var->type;
-      while(glsl_type_is_array(without_array->type))
-         without_array = without_array->array_element;
-
-      nir_variable_mode nir_mode;
-      switch ((SpvStorageClass)w[3]) {
-      case SpvStorageClassUniform:
-      case SpvStorageClassUniformConstant:
-         if (without_array->block) {
-            var->mode = vtn_variable_mode_ubo;
-            b->shader->info->num_ubos++;
-         } else if (without_array->buffer_block) {
-            var->mode = vtn_variable_mode_ssbo;
-            b->shader->info->num_ssbos++;
-         } else if (glsl_type_is_image(without_array->type)) {
-            var->mode = vtn_variable_mode_image;
-            nir_mode = nir_var_uniform;
-            b->shader->info->num_images++;
-         } else if (glsl_type_is_sampler(without_array->type)) {
-            var->mode = vtn_variable_mode_sampler;
-            nir_mode = nir_var_uniform;
-            b->shader->info->num_textures++;
-         } else {
-            assert(!"Invalid uniform variable type");
-         }
-         break;
-      case SpvStorageClassPushConstant:
-         var->mode = vtn_variable_mode_push_constant;
-         assert(b->shader->num_uniforms == 0);
-         b->shader->num_uniforms = vtn_type_block_size(var->type);
-         break;
-      case SpvStorageClassInput:
-         var->mode = vtn_variable_mode_input;
-         nir_mode = nir_var_shader_in;
-         break;
-      case SpvStorageClassOutput:
-         var->mode = vtn_variable_mode_output;
-         nir_mode = nir_var_shader_out;
-         break;
-      case SpvStorageClassPrivate:
-         var->mode = vtn_variable_mode_global;
-         nir_mode = nir_var_global;
-         break;
-      case SpvStorageClassFunction:
-         var->mode = vtn_variable_mode_local;
-         nir_mode = nir_var_local;
-         break;
-      case SpvStorageClassWorkgroup:
-         var->mode = vtn_variable_mode_workgroup;
-         nir_mode = nir_var_shared;
-         break;
-      case SpvStorageClassCrossWorkgroup:
-      case SpvStorageClassGeneric:
-      case SpvStorageClassAtomicCounter:
-      default:
-         unreachable("Unhandled variable storage class");
-      }
-
-      switch (var->mode) {
-      case vtn_variable_mode_local:
-      case vtn_variable_mode_global:
-      case vtn_variable_mode_image:
-      case vtn_variable_mode_sampler:
-      case vtn_variable_mode_workgroup:
-         /* For these, we create the variable normally */
-         var->var = rzalloc(b->shader, nir_variable);
-         var->var->name = ralloc_strdup(var->var, val->name);
-         var->var->type = var->type->type;
-         var->var->data.mode = nir_mode;
-
-         switch (var->mode) {
-         case vtn_variable_mode_image:
-         case vtn_variable_mode_sampler:
-            var->var->interface_type = without_array->type;
-            break;
-         default:
-            var->var->interface_type = NULL;
-            break;
-         }
-         break;
-
-      case vtn_variable_mode_input:
-      case vtn_variable_mode_output: {
-         /* In order to know whether or not we're a per-vertex inout, we need
-          * the patch qualifier.  This means walking the variable decorations
-          * early before we actually create any variables.  Not a big deal.
-          *
-          * GLSLang really likes to place decorations in the most interior
-          * thing it possibly can.  In particular, if you have a struct, it
-          * will place the patch decorations on the struct members.  This
-          * should be handled by the variable splitting below just fine.
-          *
-          * If you have an array-of-struct, things get even more weird as it
-          * will place the patch decorations on the struct even though it's
-          * inside an array and some of the members being patch and others not
-          * makes no sense whatsoever.  Since the only sensible thing is for
-          * it to be all or nothing, we'll call it patch if any of the members
-          * are declared patch.
-          */
-         var->patch = false;
-         vtn_foreach_decoration(b, val, var_is_patch_cb, &var->patch);
-         if (glsl_type_is_array(var->type->type) &&
-             glsl_type_is_struct(without_array->type)) {
-            vtn_foreach_decoration(b, without_array->val,
-                                   var_is_patch_cb, &var->patch);
-         }
-
-         /* For inputs and outputs, we immediately split structures.  This
-          * is for a couple of reasons.  For one, builtins may all come in
-          * a struct and we really want those split out into separate
-          * variables.  For another, interpolation qualifiers can be
-          * applied to members of the top-level struct ane we need to be
-          * able to preserve that information.
-          */
-
-         int array_length = -1;
-         struct vtn_type *interface_type = var->type;
-         if (is_per_vertex_inout(var, b->shader->stage)) {
-            /* In Geometry shaders (and some tessellation), inputs come
-             * in per-vertex arrays.  However, some builtins come in
-             * non-per-vertex, hence the need for the is_array check.  In
-             * any case, there are no non-builtin arrays allowed so this
-             * check should be sufficient.
-             */
-            interface_type = var->type->array_element;
-            array_length = glsl_get_length(var->type->type);
-         }
-
-         if (glsl_type_is_struct(interface_type->type)) {
-            /* It's a struct.  Split it. */
-            unsigned num_members = glsl_get_length(interface_type->type);
-            var->members = ralloc_array(b, nir_variable *, num_members);
-
-            for (unsigned i = 0; i < num_members; i++) {
-               const struct glsl_type *mtype = interface_type->members[i]->type;
-               if (array_length >= 0)
-                  mtype = glsl_array_type(mtype, array_length);
-
-               var->members[i] = rzalloc(b->shader, nir_variable);
-               var->members[i]->name =
-                  ralloc_asprintf(var->members[i], "%s.%d", val->name, i);
-               var->members[i]->type = mtype;
-               var->members[i]->interface_type =
-                  interface_type->members[i]->type;
-               var->members[i]->data.mode = nir_mode;
-               var->members[i]->data.patch = var->patch;
-            }
-         } else {
-            var->var = rzalloc(b->shader, nir_variable);
-            var->var->name = ralloc_strdup(var->var, val->name);
-            var->var->type = var->type->type;
-            var->var->interface_type = interface_type->type;
-            var->var->data.mode = nir_mode;
-            var->var->data.patch = var->patch;
-         }
-
-         /* For inputs and outputs, we need to grab locations and builtin
-          * information from the interface type.
-          */
-         vtn_foreach_decoration(b, interface_type->val, var_decoration_cb, var);
-         break;
-
-      case vtn_variable_mode_param:
-         unreachable("Not created through OpVariable");
-      }
-
-      case vtn_variable_mode_ubo:
-      case vtn_variable_mode_ssbo:
-      case vtn_variable_mode_push_constant:
-         /* These don't need actual variables. */
-         break;
-      }
-
-      if (count > 4) {
-         assert(count == 5);
-         nir_constant *constant =
-            vtn_value(b, w[4], vtn_value_type_constant)->constant;
-         var->var->constant_initializer =
-            nir_constant_clone(constant, var->var);
-      }
-
-      vtn_foreach_decoration(b, val, var_decoration_cb, var);
-
-      if (var->mode == vtn_variable_mode_image ||
-          var->mode == vtn_variable_mode_sampler) {
-         /* XXX: We still need the binding information in the nir_variable
-          * for these. We should fix that.
-          */
-         var->var->data.binding = var->binding;
-         var->var->data.descriptor_set = var->descriptor_set;
-         var->var->data.index = var->input_attachment_index;
-
-         if (var->mode == vtn_variable_mode_image)
-            var->var->data.image.format = without_array->image_format;
-      }
-
-      if (var->mode == vtn_variable_mode_local) {
-         assert(var->members == NULL && var->var != NULL);
-         nir_function_impl_add_variable(b->impl, var->var);
-      } else if (var->var) {
-         nir_shader_add_variable(b->shader, var->var);
-      } else if (var->members) {
-         unsigned count = glsl_get_length(without_array->type);
-         for (unsigned i = 0; i < count; i++) {
-            assert(var->members[i]->data.mode != nir_var_local);
-            nir_shader_add_variable(b->shader, var->members[i]);
-         }
-      } else {
-         assert(var->mode == vtn_variable_mode_ubo ||
-                var->mode == vtn_variable_mode_ssbo ||
-                var->mode == vtn_variable_mode_push_constant);
-      }
+      vtn_create_variable(b, val, ptr_type, storage_class, initializer);
       break;
    }
 
    case SpvOpAccessChain:
+   case SpvOpPtrAccessChain:
    case SpvOpInBoundsAccessChain: {
-      struct vtn_access_chain *base, *chain;
+      struct vtn_access_chain *chain = vtn_access_chain_create(b, count - 4);
+      chain->ptr_as_array = (opcode == SpvOpPtrAccessChain);
+
+      unsigned idx = 0;
+      for (int i = 4; i < count; i++) {
+         struct vtn_value *link_val = vtn_untyped_value(b, w[i]);
+         if (link_val->value_type == vtn_value_type_constant) {
+            chain->link[idx].mode = vtn_access_mode_literal;
+            chain->link[idx].id = link_val->constant->values[0].u32[0];
+         } else {
+            chain->link[idx].mode = vtn_access_mode_id;
+            chain->link[idx].id = w[i];
+
+         }
+         idx++;
+      }
+
+      struct vtn_type *ptr_type = vtn_value(b, w[1], vtn_value_type_type)->type;
       struct vtn_value *base_val = vtn_untyped_value(b, w[3]);
       if (base_val->value_type == vtn_value_type_sampled_image) {
          /* This is rather insane.  SPIR-V allows you to use OpSampledImage
@@ -1553,74 +1797,56 @@
           * sampler when crawling the access chain, but it does leave us
           * with this rather awkward little special-case.
           */
-         base = base_val->sampled_image->image;
-      } else {
-         assert(base_val->value_type == vtn_value_type_access_chain);
-         base = base_val->access_chain;
-      }
-
-      chain = vtn_access_chain_extend(b, base, count - 4);
-
-      unsigned idx = base->length;
-      for (int i = 4; i < count; i++) {
-         struct vtn_value *link_val = vtn_untyped_value(b, w[i]);
-         if (link_val->value_type == vtn_value_type_constant) {
-            chain->link[idx].mode = vtn_access_mode_literal;
-            chain->link[idx].id = link_val->constant->values[0].u32[0];
-         } else {
-            chain->link[idx].mode = vtn_access_mode_id;
-            chain->link[idx].id = w[i];
-         }
-         idx++;
-      }
-
-      if (base_val->value_type == vtn_value_type_sampled_image) {
          struct vtn_value *val =
             vtn_push_value(b, w[2], vtn_value_type_sampled_image);
          val->sampled_image = ralloc(b, struct vtn_sampled_image);
-         val->sampled_image->image = chain;
+         val->sampled_image->image =
+            vtn_pointer_dereference(b, base_val->sampled_image->image, chain);
          val->sampled_image->sampler = base_val->sampled_image->sampler;
       } else {
+         assert(base_val->value_type == vtn_value_type_pointer);
          struct vtn_value *val =
-            vtn_push_value(b, w[2], vtn_value_type_access_chain);
-         val->access_chain = chain;
+            vtn_push_value(b, w[2], vtn_value_type_pointer);
+         val->pointer = vtn_pointer_dereference(b, base_val->pointer, chain);
+         val->pointer->ptr_type = ptr_type;
       }
       break;
    }
 
    case SpvOpCopyMemory: {
-      struct vtn_value *dest = vtn_value(b, w[1], vtn_value_type_access_chain);
-      struct vtn_value *src = vtn_value(b, w[2], vtn_value_type_access_chain);
+      struct vtn_value *dest = vtn_value(b, w[1], vtn_value_type_pointer);
+      struct vtn_value *src = vtn_value(b, w[2], vtn_value_type_pointer);
 
-      vtn_variable_copy(b, dest->access_chain, src->access_chain);
+      vtn_variable_copy(b, dest->pointer, src->pointer);
       break;
    }
 
    case SpvOpLoad: {
-      struct vtn_access_chain *src =
-         vtn_value(b, w[3], vtn_value_type_access_chain)->access_chain;
+      struct vtn_type *res_type =
+         vtn_value(b, w[1], vtn_value_type_type)->type;
+      struct vtn_pointer *src =
+         vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
 
-      if (src->var->mode == vtn_variable_mode_image ||
-          src->var->mode == vtn_variable_mode_sampler) {
-         vtn_push_value(b, w[2], vtn_value_type_access_chain)->access_chain = src;
+      if (src->mode == vtn_variable_mode_image ||
+          src->mode == vtn_variable_mode_sampler) {
+         vtn_push_value(b, w[2], vtn_value_type_pointer)->pointer = src;
          return;
       }
 
-      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
-      val->ssa = vtn_variable_load(b, src);
+      vtn_push_ssa(b, w[2], res_type, vtn_variable_load(b, src));
       break;
    }
 
    case SpvOpStore: {
-      struct vtn_access_chain *dest =
-         vtn_value(b, w[1], vtn_value_type_access_chain)->access_chain;
+      struct vtn_pointer *dest =
+         vtn_value(b, w[1], vtn_value_type_pointer)->pointer;
 
-      if (glsl_type_is_sampler(dest->var->type->type)) {
+      if (glsl_type_is_sampler(dest->type->type)) {
          vtn_warn("OpStore of a sampler detected.  Doing on-the-fly copy "
                   "propagation to workaround the problem.");
          assert(dest->var->copy_prop_sampler == NULL);
          dest->var->copy_prop_sampler =
-            vtn_value(b, w[2], vtn_value_type_access_chain)->access_chain;
+            vtn_value(b, w[2], vtn_value_type_pointer)->pointer;
          break;
       }
 
@@ -1630,16 +1856,16 @@
    }
 
    case SpvOpArrayLength: {
-      struct vtn_access_chain *chain =
-         vtn_value(b, w[3], vtn_value_type_access_chain)->access_chain;
+      struct vtn_pointer *ptr =
+         vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
 
-      const uint32_t offset = chain->var->type->offsets[w[4]];
-      const uint32_t stride = chain->var->type->members[w[4]]->stride;
+      const uint32_t offset = ptr->var->type->offsets[w[4]];
+      const uint32_t stride = ptr->var->type->members[w[4]]->stride;
 
       unsigned chain_idx;
       struct vtn_type *type;
       nir_ssa_def *index =
-         get_vulkan_resource_index(b, chain, &type, &chain_idx);
+         get_vulkan_resource_index(b, ptr, &type, &chain_idx);
 
       nir_intrinsic_instr *instr =
          nir_intrinsic_instr_create(b->nb.shader,
diff --git a/src/egl/.gitignore b/src/egl/.gitignore
new file mode 100644
index 0000000..32331e9
--- /dev/null
+++ b/src/egl/.gitignore
@@ -0,0 +1,2 @@
+g_egldispatchstubs.c
+g_egldispatchstubs.h
diff --git a/src/egl/Android.mk b/src/egl/Android.mk
index a122c1d..0055322 100644
--- a/src/egl/Android.mk
+++ b/src/egl/Android.mk
@@ -58,16 +58,16 @@
 	libgralloc_drm \
 	libsync
 
-ifeq ($(strip $(MESA_BUILD_CLASSIC)),true)
-# require i915_dri and/or i965_dri
-LOCAL_REQUIRED_MODULES += \
-	$(addsuffix _dri, $(filter i915 i965, $(MESA_GPU_DRIVERS)))
-endif # MESA_BUILD_CLASSIC
-
-ifeq ($(strip $(MESA_BUILD_GALLIUM)),true)
+# Add each enabled driver library to the required modules
+ifneq ($(HAVE_I915_DRI),)
+LOCAL_REQUIRED_MODULES += i915_dri
+endif
+ifneq ($(HAVE_I965_DRI),)
+LOCAL_REQUIRED_MODULES += i965_dri
+endif
+ifneq ($(MESA_BUILD_GALLIUM),)
 LOCAL_REQUIRED_MODULES += gallium_dri
-endif # MESA_BUILD_GALLIUM
-
+endif
 
 LOCAL_MODULE := libGLES_mesa
 LOCAL_MODULE_RELATIVE_PATH := egl
diff --git a/src/egl/Makefile.am b/src/egl/Makefile.am
index 8109038..ecaf148 100644
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -21,6 +21,10 @@
 
 include Makefile.sources
 
+MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
+
+BUILT_SOURCES =
+
 AM_CFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/egl/main \
@@ -60,12 +64,29 @@
 endif
 endif
 
+WL_DMABUF_XML = $(WAYLAND_PROTOCOLS_DATADIR)/unstable/linux-dmabuf/linux-dmabuf-unstable-v1.xml
+
+drivers/dri2/linux-dmabuf-unstable-v1-protocol.c: $(WL_DMABUF_XML)
+	$(MKDIR_GEN)
+	$(AM_V_GEN)$(WAYLAND_SCANNER) code < $< > $@
+
+drivers/dri2/linux-dmabuf-unstable-v1-client-protocol.h: $(WL_DMABUF_XML)
+	$(MKDIR_GEN)
+	$(AM_V_GEN)$(WAYLAND_SCANNER) client-header < $< > $@
+
 if HAVE_PLATFORM_WAYLAND
+drivers/dri2/linux-dmabuf-unstable-v1-protocol.lo: drivers/dri2/linux-dmabuf-unstable-v1-client-protocol.h
+drivers/dri2/egl_dri2.lo: drivers/dri2/linux-dmabuf-unstable-v1-client-protocol.h
+
 AM_CFLAGS += $(WAYLAND_CFLAGS)
 libEGL_common_la_LIBADD += $(WAYLAND_LIBS)
 libEGL_common_la_LIBADD += $(LIBDRM_LIBS)
 libEGL_common_la_LIBADD += $(top_builddir)/src/egl/wayland/wayland-drm/libwayland-drm.la
-dri2_backend_FILES += drivers/dri2/platform_wayland.c
+libEGL_common_la_LIBADD += $(top_builddir)/src/util/libmesautil.la
+dri2_backend_FILES += \
+	drivers/dri2/platform_wayland.c	\
+	drivers/dri2/linux-dmabuf-unstable-v1-protocol.c \
+	drivers/dri2/linux-dmabuf-unstable-v1-client-protocol.h
 endif
 
 if HAVE_PLATFORM_DRM
@@ -85,6 +106,7 @@
 
 AM_CFLAGS += \
 	-I$(top_srcdir)/src/loader \
+	-I$(top_builddir)/src/egl/drivers/dri2 \
 	-I$(top_srcdir)/src/egl/drivers/dri2 \
 	-I$(top_srcdir)/src/gbm/backends/dri \
 	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
@@ -118,7 +140,7 @@
 		$(top_srcdir)/src/egl/generate/egl.xml \
 		$(top_srcdir)/src/egl/generate/egl_other.xml > $@
 
-BUILT_SOURCES = g_egldispatchstubs.c g_egldispatchstubs.h
+BUILT_SOURCES += g_egldispatchstubs.c g_egldispatchstubs.h
 CLEANFILES = $(BUILT_SOURCES)
 
 if USE_LIBGLVND
diff --git a/src/egl/Makefile.sources b/src/egl/Makefile.sources
index e6fd3f1..82f13ad 100644
--- a/src/egl/Makefile.sources
+++ b/src/egl/Makefile.sources
@@ -3,7 +3,6 @@
 	main/eglapi.h \
 	main/eglarray.c \
 	main/eglarray.h \
-	main/eglcompiler.h \
 	main/eglconfig.c \
 	main/eglconfig.h \
 	main/eglcontext.c \
diff --git a/src/egl/drivers/dri2/.gitignore b/src/egl/drivers/dri2/.gitignore
new file mode 100644
index 0000000..e96becb
--- /dev/null
+++ b/src/egl/drivers/dri2/.gitignore
@@ -0,0 +1,2 @@
+linux-dmabuf-unstable-v1-client-protocol.h
+linux-dmabuf-unstable-v1-protocol.c
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 2cab7d0..1d3fe52 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -53,11 +53,17 @@
 #ifdef HAVE_WAYLAND_PLATFORM
 #include "wayland-drm.h"
 #include "wayland-drm-client-protocol.h"
+#include "linux-dmabuf-unstable-v1-client-protocol.h"
+#endif
+
+#ifdef HAVE_X11_PLATFORM
+#include "X11/Xlibint.h"
 #endif
 
 #include "egl_dri2.h"
 #include "loader/loader.h"
 #include "util/u_atomic.h"
+#include "util/u_vector.h"
 
 /* The kernel header drm_fourcc.h defines the DRM formats below.  We duplicate
  * some of the definitions here so that building Mesa won't bleeding-edge
@@ -83,6 +89,10 @@
 #define DRM_FORMAT_GR1616        fourcc_code('G', 'R', '3', '2') /* [31:0] R:G 16:16 little endian */
 #endif
 
+#ifndef DRM_FORMAT_MOD_INVALID
+#define DRM_FORMAT_MOD_INVALID ((1ULL<<56) - 1)
+#endif
+
 static void
 dri_set_background_context(void *loaderPrivate)
 {
@@ -92,76 +102,72 @@
    _eglBindContextToThread(ctx, t);
 }
 
+static GLboolean
+dri_is_thread_safe(void *loaderPrivate)
+{
+   struct dri2_egl_surface *dri2_surf = loaderPrivate;
+   _EGLDisplay *display =  dri2_surf->base.Resource.Display;
+
+#ifdef HAVE_X11_PLATFORM
+   Display *xdpy = (Display*)display->PlatformDisplay;
+
+   /* Check Xlib is running in thread safe mode when running on EGL/X11-xlib
+    * platform
+    *
+    * 'lock_fns' is the XLockDisplay function pointer of the X11 display
+    * 'xdpy'. It will be NULL if XInitThreads wasn't called.
+    */
+   if (display->Platform == _EGL_PLATFORM_X11 && xdpy && !xdpy->lock_fns)
+      return false;
+#endif
+
+#ifdef HAVE_WAYLAND_PLATFORM
+   if (display->Platform == _EGL_PLATFORM_WAYLAND)
+      return true;
+#endif
+
+   return true;
+}
+
 const __DRIbackgroundCallableExtension background_callable_extension = {
-   .base = { __DRI_BACKGROUND_CALLABLE, 1 },
+   .base = { __DRI_BACKGROUND_CALLABLE, 2 },
 
    .setBackgroundContext = dri_set_background_context,
+   .isThreadSafe         = dri_is_thread_safe,
 };
 
 const __DRIuseInvalidateExtension use_invalidate = {
    .base = { __DRI_USE_INVALIDATE, 1 }
 };
 
-EGLint dri2_to_egl_attribute_map[] = {
-   0,
-   EGL_BUFFER_SIZE,                /* __DRI_ATTRIB_BUFFER_SIZE */
-   EGL_LEVEL,                        /* __DRI_ATTRIB_LEVEL */
-   EGL_RED_SIZE,                /* __DRI_ATTRIB_RED_SIZE */
-   EGL_GREEN_SIZE,                /* __DRI_ATTRIB_GREEN_SIZE */
-   EGL_BLUE_SIZE,                /* __DRI_ATTRIB_BLUE_SIZE */
-   EGL_LUMINANCE_SIZE,                /* __DRI_ATTRIB_LUMINANCE_SIZE */
-   EGL_ALPHA_SIZE,                /* __DRI_ATTRIB_ALPHA_SIZE */
-   0,                                /* __DRI_ATTRIB_ALPHA_MASK_SIZE */
-   EGL_DEPTH_SIZE,                /* __DRI_ATTRIB_DEPTH_SIZE */
-   EGL_STENCIL_SIZE,                /* __DRI_ATTRIB_STENCIL_SIZE */
-   0,                                /* __DRI_ATTRIB_ACCUM_RED_SIZE */
-   0,                                /* __DRI_ATTRIB_ACCUM_GREEN_SIZE */
-   0,                                /* __DRI_ATTRIB_ACCUM_BLUE_SIZE */
-   0,                                /* __DRI_ATTRIB_ACCUM_ALPHA_SIZE */
-   EGL_SAMPLE_BUFFERS,                /* __DRI_ATTRIB_SAMPLE_BUFFERS */
-   EGL_SAMPLES,                        /* __DRI_ATTRIB_SAMPLES */
-   0,                                /* __DRI_ATTRIB_RENDER_TYPE, */
-   0,                                /* __DRI_ATTRIB_CONFIG_CAVEAT */
-   0,                                /* __DRI_ATTRIB_CONFORMANT */
-   0,                                /* __DRI_ATTRIB_DOUBLE_BUFFER */
-   0,                                /* __DRI_ATTRIB_STEREO */
-   0,                                /* __DRI_ATTRIB_AUX_BUFFERS */
-   0,                                /* __DRI_ATTRIB_TRANSPARENT_TYPE */
-   0,                                /* __DRI_ATTRIB_TRANSPARENT_INDEX_VALUE */
-   0,                                /* __DRI_ATTRIB_TRANSPARENT_RED_VALUE */
-   0,                                /* __DRI_ATTRIB_TRANSPARENT_GREEN_VALUE */
-   0,                                /* __DRI_ATTRIB_TRANSPARENT_BLUE_VALUE */
-   0,                                /* __DRI_ATTRIB_TRANSPARENT_ALPHA_VALUE */
-   0,                                /* __DRI_ATTRIB_FLOAT_MODE (deprecated) */
-   0,                                /* __DRI_ATTRIB_RED_MASK */
-   0,                                /* __DRI_ATTRIB_GREEN_MASK */
-   0,                                /* __DRI_ATTRIB_BLUE_MASK */
-   0,                                /* __DRI_ATTRIB_ALPHA_MASK */
-   EGL_MAX_PBUFFER_WIDTH,        /* __DRI_ATTRIB_MAX_PBUFFER_WIDTH */
-   EGL_MAX_PBUFFER_HEIGHT,        /* __DRI_ATTRIB_MAX_PBUFFER_HEIGHT */
-   EGL_MAX_PBUFFER_PIXELS,        /* __DRI_ATTRIB_MAX_PBUFFER_PIXELS */
-   0,                                /* __DRI_ATTRIB_OPTIMAL_PBUFFER_WIDTH */
-   0,                                /* __DRI_ATTRIB_OPTIMAL_PBUFFER_HEIGHT */
-   0,                                /* __DRI_ATTRIB_VISUAL_SELECT_GROUP */
-   0,                                /* __DRI_ATTRIB_SWAP_METHOD */
-   EGL_MAX_SWAP_INTERVAL,        /* __DRI_ATTRIB_MAX_SWAP_INTERVAL */
-   EGL_MIN_SWAP_INTERVAL,        /* __DRI_ATTRIB_MIN_SWAP_INTERVAL */
-   0,                                /* __DRI_ATTRIB_BIND_TO_TEXTURE_RGB */
-   0,                                /* __DRI_ATTRIB_BIND_TO_TEXTURE_RGBA */
-   0,                                /* __DRI_ATTRIB_BIND_TO_MIPMAP_TEXTURE */
-   0,                                /* __DRI_ATTRIB_BIND_TO_TEXTURE_TARGETS */
-   EGL_Y_INVERTED_NOK,                /* __DRI_ATTRIB_YINVERTED */
-   0,                                /* __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE */
+static const EGLint dri2_to_egl_attribute_map[__DRI_ATTRIB_MAX] = {
+   [__DRI_ATTRIB_BUFFER_SIZE ]          = EGL_BUFFER_SIZE,
+   [__DRI_ATTRIB_LEVEL]                 = EGL_LEVEL,
+   [__DRI_ATTRIB_RED_SIZE]              = EGL_RED_SIZE,
+   [__DRI_ATTRIB_GREEN_SIZE]            = EGL_GREEN_SIZE,
+   [__DRI_ATTRIB_BLUE_SIZE]             = EGL_BLUE_SIZE,
+   [__DRI_ATTRIB_LUMINANCE_SIZE]        = EGL_LUMINANCE_SIZE,
+   [__DRI_ATTRIB_ALPHA_SIZE]            = EGL_ALPHA_SIZE,
+   [__DRI_ATTRIB_DEPTH_SIZE]            = EGL_DEPTH_SIZE,
+   [__DRI_ATTRIB_STENCIL_SIZE]          = EGL_STENCIL_SIZE,
+   [__DRI_ATTRIB_SAMPLE_BUFFERS]        = EGL_SAMPLE_BUFFERS,
+   [__DRI_ATTRIB_SAMPLES]               = EGL_SAMPLES,
+   [__DRI_ATTRIB_MAX_PBUFFER_WIDTH]     = EGL_MAX_PBUFFER_WIDTH,
+   [__DRI_ATTRIB_MAX_PBUFFER_HEIGHT]    = EGL_MAX_PBUFFER_HEIGHT,
+   [__DRI_ATTRIB_MAX_PBUFFER_PIXELS]    = EGL_MAX_PBUFFER_PIXELS,
+   [__DRI_ATTRIB_MAX_SWAP_INTERVAL]     = EGL_MAX_SWAP_INTERVAL,
+   [__DRI_ATTRIB_MIN_SWAP_INTERVAL]     = EGL_MIN_SWAP_INTERVAL,
+   [__DRI_ATTRIB_YINVERTED]             = EGL_Y_INVERTED_NOK,
 };
 
 const __DRIconfig *
 dri2_get_dri_config(struct dri2_egl_config *conf, EGLint surface_type,
                     EGLenum colorspace)
 {
+   const bool double_buffer = surface_type == EGL_WINDOW_BIT;
    const bool srgb = colorspace == EGL_GL_COLORSPACE_SRGB_KHR;
 
-   return surface_type == EGL_WINDOW_BIT ? conf->dri_double_config[srgb] :
-                                           conf->dri_single_config[srgb];
+   return conf->dri_config[double_buffer][srgb];
 }
 
 static EGLBoolean
@@ -191,16 +197,15 @@
    _EGLConfig *matching_config;
    EGLint num_configs = 0;
    EGLint config_id;
-   int i;
 
    _eglInitConfig(&base, disp, id);
 
-   i = 0;
    double_buffer = 0;
    bind_to_texture_rgb = 0;
    bind_to_texture_rgba = 0;
 
-   while (dri2_dpy->core->indexConfigAttrib(dri_config, i++, &attrib, &value)) {
+   for (int i = 0; dri2_dpy->core->indexConfigAttrib(dri_config, i, &attrib,
+                                                     &value); ++i) {
       switch (attrib) {
       case __DRI_ATTRIB_RENDER_TYPE:
          if (value & __DRI_ATTRIB_RGBA_BIT)
@@ -283,7 +288,7 @@
    }
 
    if (attr_list)
-      for (i = 0; attr_list[i] != EGL_NONE; i += 2)
+      for (int i = 0; attr_list[i] != EGL_NONE; i += 2)
          _eglSetConfigKey(&base, attr_list[i], attr_list[i+1]);
 
    if (rgba_masks && memcmp(rgba_masks, dri_masks, sizeof(dri_masks)))
@@ -319,10 +324,8 @@
    if (num_configs == 1) {
       conf = (struct dri2_egl_config *) matching_config;
 
-      if (double_buffer && !conf->dri_double_config[srgb])
-         conf->dri_double_config[srgb] = dri_config;
-      else if (!double_buffer && !conf->dri_single_config[srgb])
-         conf->dri_single_config[srgb] = dri_config;
+      if (!conf->dri_config[double_buffer][srgb])
+         conf->dri_config[double_buffer][srgb] = dri_config;
       else
          /* a similar config type is already added (unlikely) => discard */
          return NULL;
@@ -332,10 +335,7 @@
       if (conf == NULL)
          return NULL;
 
-      if (double_buffer)
-         conf->dri_double_config[srgb] = dri_config;
-      else
-         conf->dri_single_config[srgb] = dri_config;
+      conf->dri_config[double_buffer][srgb] = dri_config;
 
       memcpy(&conf->base, &base, sizeof base);
       conf->base.SurfaceType = 0;
@@ -430,6 +430,7 @@
 
 static const struct dri2_extension_match optional_core_extensions[] = {
    { __DRI2_ROBUSTNESS, 1, offsetof(struct dri2_egl_display, robustness) },
+   { __DRI2_NO_ERROR, 1, offsetof(struct dri2_egl_display, no_error) },
    { __DRI2_CONFIG_QUERY, 1, offsetof(struct dri2_egl_display, config) },
    { __DRI2_FENCE, 1, offsetof(struct dri2_egl_display, fence) },
    { __DRI2_RENDERER_QUERY, 1, offsetof(struct dri2_egl_display, rendererQuery) },
@@ -443,12 +444,12 @@
                      const __DRIextension **extensions,
                      bool optional)
 {
-   int i, j, ret = EGL_TRUE;
+   int ret = EGL_TRUE;
    void *field;
 
-   for (i = 0; extensions[i]; i++) {
+   for (int i = 0; extensions[i]; i++) {
       _eglLog(_EGL_DEBUG, "found extension `%s'", extensions[i]->name);
-      for (j = 0; matches[j].name; j++) {
+      for (int j = 0; matches[j].name; j++) {
          if (strcmp(extensions[i]->name, matches[j].name) == 0 &&
              extensions[i]->version >= matches[j].version) {
             field = ((char *) dri2_dpy + matches[j].offset);
@@ -460,7 +461,7 @@
       }
    }
 
-   for (j = 0; matches[j].name; j++) {
+   for (int j = 0; matches[j].name; j++) {
       field = ((char *) dri2_dpy + matches[j].offset);
       if (*(const __DRIextension **) field == NULL) {
          if (optional) {
@@ -482,7 +483,7 @@
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    const __DRIextension **extensions = NULL;
-   char path[PATH_MAX], *search_paths, *p, *next, *end;
+   char path[PATH_MAX], *search_paths, *next, *end;
    char *get_extensions_name;
    const __DRIextension **(*get_extensions)(void);
 
@@ -496,7 +497,7 @@
 
    dri2_dpy->driver = NULL;
    end = search_paths + strlen(search_paths);
-   for (p = search_paths; p < end; p = next + 1) {
+   for (char *p = search_paths; p < end; p = next + 1) {
       int len;
       next = strchr(p, ':');
       if (next == NULL)
@@ -518,15 +519,6 @@
       /* not need continue to loop all paths once the driver is found */
       if (dri2_dpy->driver != NULL)
          break;
-
-#ifdef ANDROID
-      snprintf(path, sizeof path, "%.*s/gallium_dri.so", len, p);
-      dri2_dpy->driver = dlopen(path, RTLD_NOW | RTLD_GLOBAL);
-      if (dri2_dpy->driver == NULL)
-         _eglLog(_EGL_DEBUG, "failed to open %s: %s\n", path, dlerror());
-      else
-         break;
-#endif
    }
 
    if (dri2_dpy->driver == NULL) {
@@ -637,6 +629,18 @@
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    unsigned int api_mask;
 
+   /*
+    * The EGL 1.5 specification defines the default swap interval as 1.
+    * Moreover, eglSwapInterval() is required to clamp the requested value to
+    * the supported range. Since the default value is implicitly assumed to be
+    * supported, use it as both minimum and maximum for the platforms that do
+    * not allow changing the interval. Platforms that do allow it (e.g. x11,
+    * wayland) already override these values.
+    */
+   dri2_dpy->min_swap_interval = 1;
+   dri2_dpy->max_swap_interval = 1;
+   dri2_dpy->default_swap_interval = 1;
+
    if (dri2_dpy->image_driver) {
       api_mask = dri2_dpy->image_driver->getAPIMask(dri2_dpy->dri_screen);
    } else if (dri2_dpy->dri2) {
@@ -676,12 +680,16 @@
          disp->Extensions.EXT_create_context_robustness = EGL_TRUE;
    }
 
+   if (dri2_dpy->no_error)
+      disp->Extensions.KHR_create_context_no_error = EGL_TRUE;
+
    if (dri2_dpy->fence) {
       disp->Extensions.KHR_fence_sync = EGL_TRUE;
       disp->Extensions.KHR_wait_sync = EGL_TRUE;
       if (dri2_dpy->fence->get_fence_from_cl_event)
          disp->Extensions.KHR_cl_event2 = EGL_TRUE;
-      if (dri2_dpy->fence->base.version >= 2) {
+      if (dri2_dpy->fence->base.version >= 2 &&
+          dri2_dpy->fence->get_capabilities) {
          unsigned capabilities =
             dri2_dpy->fence->get_capabilities(dri2_dpy->dri_screen);
          disp->Extensions.ANDROID_native_fence_sync =
@@ -713,27 +721,32 @@
           dri2_dpy->image->createImageFromTexture) {
          disp->Extensions.KHR_gl_texture_2D_image = EGL_TRUE;
          disp->Extensions.KHR_gl_texture_cubemap_image = EGL_TRUE;
+
+         if (dri2_renderer_query_integer(dri2_dpy,
+                                         __DRI2_RENDERER_HAS_TEXTURE_3D))
+             disp->Extensions.KHR_gl_texture_3D_image = EGL_TRUE;
       }
-      if (dri2_renderer_query_integer(dri2_dpy,
-                                      __DRI2_RENDERER_HAS_TEXTURE_3D))
-         disp->Extensions.KHR_gl_texture_3D_image = EGL_TRUE;
 #ifdef HAVE_LIBDRM
       if (dri2_dpy->image->base.version >= 8 &&
           dri2_dpy->image->createImageFromDmaBufs) {
          disp->Extensions.EXT_image_dma_buf_import = EGL_TRUE;
       }
+      if (dri2_dpy->image->base.version >= 15 &&
+          dri2_dpy->image->createImageFromDmaBufs2 &&
+          dri2_dpy->image->queryDmaBufFormats &&
+          dri2_dpy->image->queryDmaBufModifiers) {
+         disp->Extensions.EXT_image_dma_buf_import_modifiers = EGL_TRUE;
+      }
 #endif
    }
 }
 
-/* All platforms but DRM call this function to create the screen, query the
- * dri extensions, setup the vtables and populate the driver_configs.
- * DRM inherits all that information from its display - GBM.
+/* All platforms but DRM call this function to create the screen and populate
+ * the driver_configs. DRM inherits that information from its display - GBM.
  */
 EGLBoolean
 dri2_create_screen(_EGLDisplay *disp)
 {
-   const __DRIextension **extensions;
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
 
    if (dri2_dpy->image_driver) {
@@ -775,28 +788,29 @@
       return EGL_FALSE;
    }
 
-   dri2_dpy->own_dri_screen = 1;
+   dri2_dpy->own_dri_screen = true;
+   return EGL_TRUE;
+}
+
+EGLBoolean
+dri2_setup_extensions(_EGLDisplay *disp)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   const struct dri2_extension_match *mandatory_core_extensions;
+   const __DRIextension **extensions;
 
    extensions = dri2_dpy->core->getExtensions(dri2_dpy->dri_screen);
 
-   if (dri2_dpy->image_driver || dri2_dpy->dri2) {
-      if (!dri2_bind_extensions(dri2_dpy, dri2_core_extensions, extensions, false))
-         goto cleanup_dri_screen;
-   } else {
-      assert(dri2_dpy->swrast);
-      if (!dri2_bind_extensions(dri2_dpy, swrast_core_extensions, extensions, false))
-         goto cleanup_dri_screen;
-   }
+   if (dri2_dpy->image_driver || dri2_dpy->dri2)
+      mandatory_core_extensions = dri2_core_extensions;
+   else
+      mandatory_core_extensions = swrast_core_extensions;
+
+   if (!dri2_bind_extensions(dri2_dpy, mandatory_core_extensions, extensions, false))
+      return EGL_FALSE;
 
    dri2_bind_extensions(dri2_dpy, optional_core_extensions, extensions, true);
-   dri2_setup_screen(disp);
-
    return EGL_TRUE;
-
- cleanup_dri_screen:
-   dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
-
-   return EGL_FALSE;
 }
 
 /**
@@ -887,7 +901,6 @@
 dri2_display_release(_EGLDisplay *disp)
 {
    struct dri2_egl_display *dri2_dpy;
-   unsigned i;
 
    if (!disp)
       return;
@@ -901,6 +914,13 @@
       return;
 
    _eglCleanupDisplay(disp);
+   dri2_display_destroy(disp);
+}
+
+void
+dri2_display_destroy(_EGLDisplay *disp)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
 
    if (dri2_dpy->own_dri_screen)
       dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
@@ -925,7 +945,7 @@
 #ifdef HAVE_DRM_PLATFORM
    case _EGL_PLATFORM_DRM:
       if (dri2_dpy->own_device) {
-         gbm_device_destroy(&dri2_dpy->gbm_dri->base.base);
+         gbm_device_destroy(&dri2_dpy->gbm_dri->base);
       }
       break;
 #endif
@@ -933,11 +953,19 @@
    case _EGL_PLATFORM_WAYLAND:
       if (dri2_dpy->wl_drm)
           wl_drm_destroy(dri2_dpy->wl_drm);
+      if (dri2_dpy->wl_dmabuf)
+          zwp_linux_dmabuf_v1_destroy(dri2_dpy->wl_dmabuf);
       if (dri2_dpy->wl_shm)
           wl_shm_destroy(dri2_dpy->wl_shm);
-      wl_registry_destroy(dri2_dpy->wl_registry);
-      wl_event_queue_destroy(dri2_dpy->wl_queue);
-      wl_proxy_wrapper_destroy(dri2_dpy->wl_dpy_wrapper);
+      if (dri2_dpy->wl_registry)
+         wl_registry_destroy(dri2_dpy->wl_registry);
+      if (dri2_dpy->wl_queue)
+         wl_event_queue_destroy(dri2_dpy->wl_queue);
+      if (dri2_dpy->wl_dpy_wrapper)
+         wl_proxy_wrapper_destroy(dri2_dpy->wl_dpy_wrapper);
+      u_vector_finish(&dri2_dpy->wl_modifiers.argb8888);
+      u_vector_finish(&dri2_dpy->wl_modifiers.xrgb8888);
+      u_vector_finish(&dri2_dpy->wl_modifiers.rgb565);
       if (dri2_dpy->own_device) {
          wl_display_disconnect(dri2_dpy->wl_dpy);
       }
@@ -951,8 +979,8 @@
     * the ones from the gbm device. As such the gbm itself is responsible
     * for the cleanup.
     */
-   if (disp->Platform != _EGL_PLATFORM_DRM) {
-      for (i = 0; dri2_dpy->driver_configs[i]; i++)
+   if (disp->Platform != _EGL_PLATFORM_DRM && dri2_dpy->driver_configs) {
+      for (unsigned i = 0; dri2_dpy->driver_configs[i]; i++)
          free((__DRIconfig *) dri2_dpy->driver_configs[i]);
       free(dri2_dpy->driver_configs);
    }
@@ -1056,7 +1084,7 @@
    ctx_attribs[pos++] = __DRI_CTX_ATTRIB_MINOR_VERSION;
    ctx_attribs[pos++] = dri2_ctx->base.ClientMinorVersion;
 
-   if (dri2_ctx->base.Flags != 0) {
+   if (dri2_ctx->base.Flags != 0 || dri2_ctx->base.NoError) {
       /* If the implementation doesn't support the __DRI2_ROBUSTNESS
        * extension, don't even try to send it the robust-access flag.
        * It may explode.  Instead, generate the required EGL error here.
@@ -1068,7 +1096,8 @@
       }
 
       ctx_attribs[pos++] = __DRI_CTX_ATTRIB_FLAGS;
-      ctx_attribs[pos++] = dri2_ctx->base.Flags;
+      ctx_attribs[pos++] = dri2_ctx->base.Flags |
+         (dri2_ctx->base.NoError ? __DRI_CTX_FLAG_NO_ERROR : 0);
    }
 
    if (dri2_ctx->base.ResetNotificationStrategy != EGL_NO_RESET_NOTIFICATION_KHR) {
@@ -1131,6 +1160,17 @@
       goto cleanup;
    }
 
+   /* The EGL_KHR_create_context_no_error spec says:
+    *
+    *    "BAD_MATCH is generated if the value of EGL_CONTEXT_OPENGL_NO_ERROR_KHR
+    *    used to create <share_context> does not match the value of
+    *    EGL_CONTEXT_OPENGL_NO_ERROR_KHR for the context being created."
+    */
+   if (share_list && share_list->NoError != dri2_ctx->base.NoError) {
+      _eglError(EGL_BAD_MATCH, "eglCreateContext");
+      goto cleanup;
+   }
+
    switch (dri2_ctx->base.ClientAPI) {
    case EGL_OPENGL_ES_API:
       switch (dri2_ctx->base.ClientMajorVersion) {
@@ -1172,13 +1212,13 @@
        * doubleBufferMode check in
        * src/mesa/main/context.c:check_compatible()
        */
-      if (dri2_config->dri_double_config[0])
-         dri_config = dri2_config->dri_double_config[0];
+      if (dri2_config->dri_config[1][0])
+         dri_config = dri2_config->dri_config[1][0];
       else
-         dri_config = dri2_config->dri_single_config[0];
+         dri_config = dri2_config->dri_config[0][0];
 
-      /* EGL_WINDOW_BIT is set only when there is a dri_double_config.  This
-       * makes sure the back buffer will always be used.
+      /* EGL_WINDOW_BIT is set only when there is a double-buffered dri_config.
+       * This makes sure the back buffer will always be used.
        */
       if (conf->SurfaceType & EGL_WINDOW_BIT)
          dri2_ctx->base.WindowRenderBuffer = EGL_BACK_BUFFER;
@@ -1496,6 +1536,14 @@
 }
 
 static EGLBoolean
+dri2_set_damage_region(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf,
+                       EGLint *rects, EGLint n_rects)
+{
+   /* Thin dispatcher: hand the call to the display's platform vtbl hook. */
+   return dri2_egl_display(dpy)->vtbl->set_damage_region(drv, dpy, surf, rects, n_rects);
+}
+
+static EGLBoolean
 dri2_post_sub_buffer(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf,
                      EGLint x, EGLint y, EGLint width, EGLint height)
 {
@@ -1652,10 +1700,7 @@
       return NULL;
    }
 
-   if (!_eglInitImage(&dri2_img->base, disp)) {
-      free(dri2_img);
-      return NULL;
-   }
+   _eglInitImage(&dri2_img->base, disp);
 
    dri2_img->dri_image = dri_image;
 
@@ -1677,6 +1722,11 @@
       return EGL_NO_IMAGE_KHR;
    }
 
+   if (!disp->Extensions.KHR_gl_renderbuffer_image) {
+      _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+      return EGL_NO_IMAGE_KHR;
+   }
+
    dri_image =
       dri2_dpy->image->createImageFromRenderbuffer(dri2_ctx->dri_context,
                                                    renderbuffer, NULL);
@@ -1714,7 +1764,6 @@
    const struct wl_drm_components_descriptor *f;
    __DRIimage *dri_image;
    _EGLImageAttribs attrs;
-   EGLint err;
    int32_t plane;
 
    buffer = wayland_drm_buffer_get(dri2_dpy->wl_server_drm,
@@ -1722,13 +1771,10 @@
    if (!buffer)
        return NULL;
 
-   err = _eglParseImageAttribList(&attrs, disp, attr_list);
-   plane = attrs.PlaneWL;
-   if (err != EGL_SUCCESS) {
-      _eglError(EGL_BAD_PARAMETER, "dri2_create_image_wayland_wl_buffer");
+   if (!_eglParseImageAttribList(&attrs, disp, attr_list))
       return NULL;
-   }
 
+   plane = attrs.PlaneWL;
    f = buffer->driver_format;
    if (plane < 0 || plane >= f->nplanes) {
       _eglError(EGL_BAD_PARAMETER,
@@ -1814,35 +1860,43 @@
       return EGL_NO_IMAGE_KHR;
    }
 
-   if (_eglParseImageAttribList(&attrs, disp, attr_list) != EGL_SUCCESS)
+   if (!_eglParseImageAttribList(&attrs, disp, attr_list))
       return EGL_NO_IMAGE_KHR;
 
    switch (target) {
    case EGL_GL_TEXTURE_2D_KHR:
+      if (!disp->Extensions.KHR_gl_texture_2D_image) {
+         _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+         return EGL_NO_IMAGE_KHR;
+      }
       depth = 0;
       gl_target = GL_TEXTURE_2D;
       break;
    case EGL_GL_TEXTURE_3D_KHR:
-      if (disp->Extensions.KHR_gl_texture_3D_image) {
-         depth = attrs.GLTextureZOffset;
-         gl_target = GL_TEXTURE_3D;
-         break;
-      }
-      else {
+      if (!disp->Extensions.KHR_gl_texture_3D_image) {
          _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
          return EGL_NO_IMAGE_KHR;
       }
+
+      depth = attrs.GLTextureZOffset;
+      gl_target = GL_TEXTURE_3D;
+      break;
    case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_KHR:
+      if (!disp->Extensions.KHR_gl_texture_cubemap_image) {
+         _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+         return EGL_NO_IMAGE_KHR;
+      }
+
       depth = target - EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X_KHR;
       gl_target = GL_TEXTURE_CUBE_MAP;
       break;
    default:
-      _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+      unreachable("Unexpected target in dri2_create_image_khr_texture()");
       return EGL_NO_IMAGE_KHR;
    }
 
@@ -1852,11 +1906,7 @@
       return EGL_NO_IMAGE_KHR;
    }
 
-   if (!_eglInitImage(&dri2_img->base, disp)) {
-      _eglError(EGL_BAD_ALLOC, "dri2_create_image_khr");
-      free(dri2_img);
-      return EGL_NO_IMAGE_KHR;
-   }
+   _eglInitImage(&dri2_img->base, disp);
 
    dri2_img->dri_image =
       dri2_dpy->image->createImageFromTexture(dri2_ctx->dri_context,
@@ -1899,14 +1949,13 @@
                                   EGLClientBuffer buffer, const EGLint *attr_list)
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   EGLint format, name, pitch, err;
+   EGLint format, name, pitch;
    _EGLImageAttribs attrs;
    __DRIimage *dri_image;
 
    name = (EGLint) (uintptr_t) buffer;
 
-   err = _eglParseImageAttribList(&attrs, disp, attr_list);
-   if (err != EGL_SUCCESS)
+   if (!_eglParseImageAttribList(&attrs, disp, attr_list))
       return NULL;
 
    if (attrs.Width <= 0 || attrs.Height <= 0 ||
@@ -1942,8 +1991,6 @@
 static EGLBoolean
 dri2_check_dma_buf_attribs(const _EGLImageAttribs *attrs)
 {
-   unsigned i;
-
    /**
      * The spec says:
      *
@@ -1961,10 +2008,8 @@
      *    incomplete, EGL_BAD_PARAMETER is generated."
      */
    if (attrs->Width <= 0 || attrs->Height <= 0 ||
-       !attrs->DMABufFourCC.IsPresent) {
-      _eglError(EGL_BAD_PARAMETER, "attribute(s) missing");
-      return EGL_FALSE;
-   }
+       !attrs->DMABufFourCC.IsPresent)
+      return _eglError(EGL_BAD_PARAMETER, "attribute(s) missing");
 
    /**
     * Also:
@@ -1973,11 +2018,36 @@
     *  specified for a plane's pitch or offset isn't supported by EGL,
     *  EGL_BAD_ACCESS is generated."
     */
-   for (i = 0; i < ARRAY_SIZE(attrs->DMABufPlanePitches); ++i) {
+   for (unsigned i = 0; i < ARRAY_SIZE(attrs->DMABufPlanePitches); ++i) {
       if (attrs->DMABufPlanePitches[i].IsPresent &&
-          attrs->DMABufPlanePitches[i].Value <= 0) {
-         _eglError(EGL_BAD_ACCESS, "invalid pitch");
-         return EGL_FALSE;
+          attrs->DMABufPlanePitches[i].Value <= 0)
+         return _eglError(EGL_BAD_ACCESS, "invalid pitch");
+   }
+
+   /**
+    * If <target> is EGL_LINUX_DMA_BUF_EXT, both or neither of the following
+    * attribute values may be given.
+    *
+    * This is referring to EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT and
+    * EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, and the same for other planes.
+    */
+   for (unsigned i = 0; i < DMA_BUF_MAX_PLANES; ++i) {
+      if (attrs->DMABufPlaneModifiersLo[i].IsPresent !=
+          attrs->DMABufPlaneModifiersHi[i].IsPresent)
+         return _eglError(EGL_BAD_PARAMETER, "modifier attribute lo or hi missing");
+   }
+
+   /* Although the EGL_EXT_image_dma_buf_import_modifiers spec doesn't
+    * mandate it, we only accept the same modifier across all planes. */
+   for (unsigned i = 1; i < DMA_BUF_MAX_PLANES; ++i) {
+      if (attrs->DMABufPlaneFds[i].IsPresent) {
+         if ((attrs->DMABufPlaneModifiersLo[0].IsPresent !=
+               attrs->DMABufPlaneModifiersLo[i].IsPresent) ||
+             (attrs->DMABufPlaneModifiersLo[0].Value !=
+               attrs->DMABufPlaneModifiersLo[i].Value) ||
+             (attrs->DMABufPlaneModifiersHi[0].Value !=
+               attrs->DMABufPlaneModifiersHi[i].Value))
+            return _eglError(EGL_BAD_PARAMETER, "modifier attributes not equal");
       }
    }
 
@@ -1988,7 +2058,7 @@
 static unsigned
 dri2_check_dma_buf_format(const _EGLImageAttribs *attrs)
 {
-   unsigned i, plane_n;
+   unsigned plane_n;
 
    switch (attrs->DMABufFourCC.Value) {
    case DRM_FORMAT_R8:
@@ -2069,7 +2139,7 @@
      * "* If <target> is EGL_LINUX_DMA_BUF_EXT, and the list of attributes is
      *    incomplete, EGL_BAD_PARAMETER is generated."
      */
-   for (i = 0; i < plane_n; ++i) {
+   for (unsigned i = 0; i < plane_n; ++i) {
       if (!attrs->DMABufPlaneFds[i].IsPresent ||
           !attrs->DMABufPlaneOffsets[i].IsPresent ||
           !attrs->DMABufPlanePitches[i].IsPresent) {
@@ -2084,12 +2154,30 @@
     * "If <target> is EGL_LINUX_DMA_BUF_EXT, and the EGL_LINUX_DRM_FOURCC_EXT
     *  attribute indicates a single-plane format, EGL_BAD_ATTRIBUTE is
     *  generated if any of the EGL_DMA_BUF_PLANE1_* or EGL_DMA_BUF_PLANE2_*
-    *  attributes are specified."
+    *  or EGL_DMA_BUF_PLANE3_* attributes are specified."
     */
-   for (i = plane_n; i < 3; ++i) {
+   for (unsigned i = plane_n; i < DMA_BUF_MAX_PLANES; ++i) {
       if (attrs->DMABufPlaneFds[i].IsPresent ||
           attrs->DMABufPlaneOffsets[i].IsPresent ||
-          attrs->DMABufPlanePitches[i].IsPresent) {
+          attrs->DMABufPlanePitches[i].IsPresent ||
+          attrs->DMABufPlaneModifiersLo[i].IsPresent ||
+          attrs->DMABufPlaneModifiersHi[i].IsPresent) {
+
+         /**
+          * The modifiers extension spec says:
+          *
+          * "Modifiers may modify any attribute of a buffer import, including
+          *  but not limited to adding extra planes to a format which
+          *  otherwise does not have those planes. As an example, a modifier
+          *  may add a plane for an external compression buffer to a
+          *  single-plane format. The exact meaning and effect of any
+          *  modifier is canonically defined by drm_fourcc.h, not as part of
+          *  this extension."
+          */
+         if (attrs->DMABufPlaneModifiersLo[i].IsPresent &&
+             attrs->DMABufPlaneModifiersHi[i].IsPresent)
+            continue;
+
          _eglError(EGL_BAD_ATTRIBUTE, "too many plane attributes");
          return 0;
       }
@@ -2098,6 +2186,51 @@
    return plane_n;
 }
 
+/* QueryDmaBufFormatsEXT hook: checks max/formats, then defers to the DRI image
+ * extension (v15+); EGL_FALSE without an error if that is missing or fails. */
+static EGLBoolean
+dri2_query_dma_buf_formats(_EGLDriver *drv, _EGLDisplay *disp,
+                            EGLint max, EGLint *formats, EGLint *count)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   const __DRIimageExtension *image = dri2_dpy->image;
+
+   if (max < 0 || (max > 0 && !formats))
+      return _eglError(EGL_BAD_PARAMETER, "invalid value for max count of formats");
+   if (image->base.version < 15 || !image->queryDmaBufFormats)
+      return EGL_FALSE;
+
+   if (image->queryDmaBufFormats(dri2_dpy->dri_screen, max, formats, count))
+      return EGL_TRUE;
+   return EGL_FALSE;
+}
+
+/* QueryDmaBufModifiersEXT hook: validates max/modifiers, then asks the DRI
+ * image extension (v15+). EGL_FALSE without an error if it is unavailable. */
+static EGLBoolean
+dri2_query_dma_buf_modifiers(_EGLDriver *drv, _EGLDisplay *disp, EGLint format,
+                             EGLint max, EGLuint64KHR *modifiers,
+                             EGLBoolean *external_only, EGLint *count)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+
+   if (max < 0)
+      return _eglError(EGL_BAD_PARAMETER, "invalid value for max count of modifiers");
+   if (max > 0 && modifiers == NULL)
+      return _eglError(EGL_BAD_PARAMETER, "invalid modifiers array");
+
+   if (dri2_dpy->image->base.version < 15 ||
+       dri2_dpy->image->queryDmaBufModifiers == NULL)
+      return EGL_FALSE;
+
+   if (dri2_dpy->image->queryDmaBufModifiers(dri2_dpy->dri_screen, format,
+                                             max, modifiers,
+                                             (unsigned int *) external_only,
+                                             count) == false)
+      return _eglError(EGL_BAD_PARAMETER, "invalid format");
+   return EGL_TRUE;
+}
+
 /**
  * The spec says:
  *
@@ -2114,14 +2247,14 @@
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    _EGLImage *res;
-   EGLint err;
    _EGLImageAttribs attrs;
    __DRIimage *dri_image;
    unsigned num_fds;
-   unsigned i;
-   int fds[3];
-   int pitches[3];
-   int offsets[3];
+   int fds[DMA_BUF_MAX_PLANES];
+   int pitches[DMA_BUF_MAX_PLANES];
+   int offsets[DMA_BUF_MAX_PLANES];
+   uint64_t modifier;
+   bool has_modifier = false;
    unsigned error;
 
    /**
@@ -2135,11 +2268,8 @@
       return NULL;
    }
 
-   err = _eglParseImageAttribList(&attrs, disp, attr_list);
-   if (err != EGL_SUCCESS) {
-      _eglError(err, "bad attribute");
+   if (!_eglParseImageAttribList(&attrs, disp, attr_list))
       return NULL;
-   }
 
    if (!dri2_check_dma_buf_attribs(&attrs))
       return NULL;
@@ -2148,22 +2278,50 @@
    if (!num_fds)
       return NULL;
 
-   for (i = 0; i < num_fds; ++i) {
+   for (unsigned i = 0; i < num_fds; ++i) {
       fds[i] = attrs.DMABufPlaneFds[i].Value;
       pitches[i] = attrs.DMABufPlanePitches[i].Value;
       offsets[i] = attrs.DMABufPlaneOffsets[i].Value;
    }
 
-   dri_image =
-      dri2_dpy->image->createImageFromDmaBufs(dri2_dpy->dri_screen,
-         attrs.Width, attrs.Height, attrs.DMABufFourCC.Value,
-         fds, num_fds, pitches, offsets,
-         attrs.DMABufYuvColorSpaceHint.Value,
-         attrs.DMABufSampleRangeHint.Value,
-         attrs.DMABufChromaHorizontalSiting.Value,
-         attrs.DMABufChromaVerticalSiting.Value,
-         &error,
-         NULL);
+   /* dri2_check_dma_buf_attribs ensures that the modifier, if available,
+    * will be present in attrs.DMABufPlaneModifiersLo[0] and
+    * attrs.DMABufPlaneModifiersHi[0] */
+   if (attrs.DMABufPlaneModifiersLo[0].IsPresent) {
+      modifier = (uint64_t) attrs.DMABufPlaneModifiersHi[0].Value << 32;
+      modifier |= (uint64_t) (attrs.DMABufPlaneModifiersLo[0].Value & 0xffffffff);
+      has_modifier = true;
+   }
+
+   if (has_modifier) {
+      if (dri2_dpy->image->base.version < 15 ||
+          dri2_dpy->image->createImageFromDmaBufs2 == NULL) {
+         _eglError(EGL_BAD_MATCH, "unsupported dma_buf format modifier");
+         return EGL_NO_IMAGE_KHR;
+      }
+      dri_image =
+         dri2_dpy->image->createImageFromDmaBufs2(dri2_dpy->dri_screen,
+            attrs.Width, attrs.Height, attrs.DMABufFourCC.Value,
+            modifier, fds, num_fds, pitches, offsets,
+            attrs.DMABufYuvColorSpaceHint.Value,
+            attrs.DMABufSampleRangeHint.Value,
+            attrs.DMABufChromaHorizontalSiting.Value,
+            attrs.DMABufChromaVerticalSiting.Value,
+            &error,
+            NULL);
+   }
+   else {
+      dri_image =
+         dri2_dpy->image->createImageFromDmaBufs(dri2_dpy->dri_screen,
+            attrs.Width, attrs.Height, attrs.DMABufFourCC.Value,
+            fds, num_fds, pitches, offsets,
+            attrs.DMABufYuvColorSpaceHint.Value,
+            attrs.DMABufSampleRangeHint.Value,
+            attrs.DMABufChromaHorizontalSiting.Value,
+            attrs.DMABufChromaVerticalSiting.Value,
+            &error,
+            NULL);
+   }
    dri2_create_image_khr_texture_error(error);
 
    if (!dri_image)
@@ -2182,34 +2340,20 @@
    _EGLImageAttribs attrs;
    unsigned int dri_use, valid_mask;
    int format;
-   EGLint err = EGL_SUCCESS;
 
    (void) drv;
 
-   dri2_img = malloc(sizeof *dri2_img);
-   if (!dri2_img) {
-      _eglError(EGL_BAD_ALLOC, "dri2_create_image_khr");
+   if (!attr_list) {
+      _eglError(EGL_BAD_PARAMETER, __func__);
       return EGL_NO_IMAGE_KHR;
    }
 
-   if (!attr_list) {
-      err = EGL_BAD_PARAMETER;
-      goto cleanup_img;
-   }
-
-   if (!_eglInitImage(&dri2_img->base, disp)) {
-      err = EGL_BAD_PARAMETER;
-      goto cleanup_img;
-   }
-
-   err = _eglParseImageAttribList(&attrs, disp, attr_list);
-   if (err != EGL_SUCCESS)
-      goto cleanup_img;
+   if (!_eglParseImageAttribList(&attrs, disp, attr_list))
+      return EGL_NO_IMAGE_KHR;
 
    if (attrs.Width <= 0 || attrs.Height <= 0) {
-      _eglLog(_EGL_WARNING, "bad width or height (%dx%d)",
-            attrs.Width, attrs.Height);
-      goto cleanup_img;
+      _eglError(EGL_BAD_PARAMETER, __func__);
+      return EGL_NO_IMAGE_KHR;
    }
 
    switch (attrs.DRMBufferFormatMESA) {
@@ -2217,9 +2361,8 @@
       format = __DRI_IMAGE_FORMAT_ARGB8888;
       break;
    default:
-      _eglLog(_EGL_WARNING, "bad image format value 0x%04x",
-            attrs.DRMBufferFormatMESA);
-      goto cleanup_img;
+      _eglError(EGL_BAD_PARAMETER, __func__);
+      return EGL_NO_IMAGE_KHR;
    }
 
    valid_mask =
@@ -2227,9 +2370,8 @@
       EGL_DRM_BUFFER_USE_SHARE_MESA |
       EGL_DRM_BUFFER_USE_CURSOR_MESA;
    if (attrs.DRMBufferUseMESA & ~valid_mask) {
-      _eglLog(_EGL_WARNING, "bad image use bit 0x%04x",
-            attrs.DRMBufferUseMESA & ~valid_mask);
-      goto cleanup_img;
+      _eglError(EGL_BAD_PARAMETER, __func__);
+      return EGL_NO_IMAGE_KHR;
    }
 
    dri_use = 0;
@@ -2240,22 +2382,25 @@
    if (attrs.DRMBufferUseMESA & EGL_DRM_BUFFER_USE_CURSOR_MESA)
       dri_use |= __DRI_IMAGE_USE_CURSOR;
 
+   dri2_img = malloc(sizeof *dri2_img);
+   if (!dri2_img) {
+      _eglError(EGL_BAD_ALLOC, "dri2_create_image_khr");
+      return EGL_NO_IMAGE_KHR;
+   }
+
+   _eglInitImage(&dri2_img->base, disp);
+
    dri2_img->dri_image =
       dri2_dpy->image->createImage(dri2_dpy->dri_screen,
                                    attrs.Width, attrs.Height,
                                    format, dri_use, dri2_img);
    if (dri2_img->dri_image == NULL) {
-      err = EGL_BAD_ALLOC;
-      goto cleanup_img;
+      free(dri2_img);
+       _eglError(EGL_BAD_ALLOC, "dri2_create_drm_image_mesa");
+      return EGL_NO_IMAGE_KHR;
    }
 
    return &dri2_img->base;
-
- cleanup_img:
-   free(dri2_img);
-   _eglError(err, "dri2_create_drm_image_mesa");
-
-   return EGL_NO_IMAGE_KHR;
 }
 
 static EGLBoolean
@@ -2268,10 +2413,8 @@
    (void) drv;
 
    if (name && !dri2_dpy->image->queryImage(dri2_img->dri_image,
-                                            __DRI_IMAGE_ATTRIB_NAME, name)) {
-      _eglError(EGL_BAD_ALLOC, "dri2_export_drm_image_mesa");
-      return EGL_FALSE;
-   }
+                                            __DRI_IMAGE_ATTRIB_NAME, name))
+      return _eglError(EGL_BAD_ALLOC, "dri2_export_drm_image_mesa");
 
    if (handle)
       dri2_dpy->image->queryImage(dri2_img->dri_image,
@@ -2357,15 +2500,8 @@
    case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_KHR:
-      return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
    case EGL_GL_TEXTURE_3D_KHR:
-      if (disp->Extensions.KHR_gl_texture_3D_image) {
-         return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
-      }
-      else {
-         _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
-         return EGL_NO_IMAGE_KHR;
-      }
+      return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
    case EGL_GL_RENDERBUFFER_KHR:
       return dri2_create_image_khr_renderbuffer(disp, ctx, buffer, attr_list);
 #ifdef HAVE_LIBDRM
@@ -2407,7 +2543,7 @@
    _EGLDisplay *disp = user_data;
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    __DRIimage *img;
-   int i, dri_components = 0;
+   int dri_components = 0;
 
    if (fd == -1)
       img = dri2_dpy->image->createImageFromNames(dri2_dpy->dri_screen,
@@ -2434,7 +2570,7 @@
    dri2_dpy->image->queryImage(img, __DRI_IMAGE_ATTRIB_COMPONENTS, &dri_components);
 
    buffer->driver_format = NULL;
-   for (i = 0; i < ARRAY_SIZE(wl_drm_components); i++)
+   for (int i = 0; i < ARRAY_SIZE(wl_drm_components); i++)
       if (wl_drm_components[i].dri_components == dri_components)
          buffer->driver_format = &wl_drm_components[i];
 
@@ -2828,15 +2964,11 @@
    struct dri2_egl_sync *dri2_sync = dri2_egl_sync(sync);
    EGLint ret;
 
-   if (sync->Type != EGL_SYNC_REUSABLE_KHR) {
-      _eglError(EGL_BAD_MATCH, "eglSignalSyncKHR");
-      return EGL_FALSE;
-   }
+   if (sync->Type != EGL_SYNC_REUSABLE_KHR)
+      return _eglError(EGL_BAD_MATCH, "eglSignalSyncKHR");
 
-   if (mode != EGL_SIGNALED_KHR && mode != EGL_UNSIGNALED_KHR) {
-      _eglError(EGL_BAD_ATTRIBUTE, "eglSignalSyncKHR");
-      return EGL_FALSE;
-   }
+   if (mode != EGL_SIGNALED_KHR && mode != EGL_UNSIGNALED_KHR)
+      return _eglError(EGL_BAD_ATTRIBUTE, "eglSignalSyncKHR");
 
    dri2_sync->base.SyncStatus = mode;
 
@@ -2844,10 +2976,8 @@
       ret = cnd_broadcast(&dri2_sync->cond);
 
       /* fail to broadcast */
-      if (ret) {
-         _eglError(EGL_BAD_ACCESS, "eglSignalSyncKHR");
-         return EGL_FALSE;
-      }
+      if (ret)
+         return _eglError(EGL_BAD_ACCESS, "eglSignalSyncKHR");
    }
 
    return EGL_TRUE;
@@ -2990,6 +3120,7 @@
    dri2_drv->base.API.SwapBuffers = dri2_swap_buffers;
    dri2_drv->base.API.SwapBuffersWithDamageEXT = dri2_swap_buffers_with_damage;
    dri2_drv->base.API.SwapBuffersRegionNOK = dri2_swap_buffers_region;
+   dri2_drv->base.API.SetDamageRegion = dri2_set_damage_region;
    dri2_drv->base.API.PostSubBufferNV = dri2_post_sub_buffer;
    dri2_drv->base.API.CopyBuffers = dri2_copy_buffers,
    dri2_drv->base.API.QueryBufferAge = dri2_query_buffer_age;
@@ -3002,6 +3133,8 @@
    dri2_drv->base.API.ExportDRMImageMESA = dri2_export_drm_image_mesa;
    dri2_drv->base.API.ExportDMABUFImageQueryMESA = dri2_export_dma_buf_image_query_mesa;
    dri2_drv->base.API.ExportDMABUFImageMESA = dri2_export_dma_buf_image_mesa;
+   dri2_drv->base.API.QueryDmaBufFormatsEXT = dri2_query_dma_buf_formats;
+   dri2_drv->base.API.QueryDmaBufModifiersEXT = dri2_query_dma_buf_modifiers;
 #endif
 #ifdef HAVE_WAYLAND_PLATFORM
    dri2_drv->base.API.BindWaylandDisplayWL = dri2_bind_wayland_display_wl;
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index bd77247..ccfefef 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -28,6 +28,7 @@
 #ifndef EGL_DRI2_INCLUDED
 #define EGL_DRI2_INCLUDED
 
+#include <stdbool.h>
 #include <stdint.h>
 
 #ifdef HAVE_X11_PLATFORM
@@ -44,6 +45,8 @@
 #ifdef HAVE_WAYLAND_PLATFORM
 #include <wayland-client.h>
 #include "wayland-egl-priv.h"
+/* forward declarations of protocol elements */
+struct zwp_linux_dmabuf_v1;
 #endif
 
 #include <GL/gl.h>
@@ -59,7 +62,6 @@
 #include <system/window.h>
 #include <hardware/gralloc.h>
 #include <gralloc_drm_handle.h>
-#include <cutils/log.h>
 
 #endif /* HAVE_ANDROID_PLATFORM */
 
@@ -73,6 +75,8 @@
 #include "eglimage.h"
 #include "eglsync.h"
 
+#include "util/u_vector.h"
+
 struct wl_buffer;
 
 struct dri2_egl_driver
@@ -119,6 +123,10 @@
                                           _EGLSurface *surface,
                                           const EGLint *rects, EGLint n_rects);
 
+   EGLBoolean (*set_damage_region)(_EGLDriver *drv, _EGLDisplay *dpy,
+                                   _EGLSurface *surface,
+                                   const EGLint *rects, EGLint n_rects);
+
    EGLBoolean (*swap_buffers_region)(_EGLDriver *drv, _EGLDisplay *dpy,
                                      _EGLSurface *surf, EGLint numRects,
                                      const EGLint *rects);
@@ -155,7 +163,7 @@
    int                       dri2_major;
    int                       dri2_minor;
    __DRIscreen              *dri_screen;
-   int                       own_dri_screen;
+   bool                      own_dri_screen;
    const __DRIconfig       **driver_configs;
    void                     *driver;
    const __DRIcoreExtension       *core;
@@ -166,6 +174,7 @@
    const __DRItexBufferExtension  *tex_buffer;
    const __DRIimageExtension      *image;
    const __DRIrobustnessExtension *robustness;
+   const __DRInoErrorExtension    *no_error;
    const __DRI2configQueryExtension *config;
    const __DRI2fenceExtension *fence;
    const __DRI2rendererQueryExtension *rendererQuery;
@@ -176,8 +185,8 @@
     * dri2_make_current (tracks if there are active contexts/surfaces). */
    int                       ref_count;
 
-   int                       own_device;
-   int                       invalidate_available;
+   bool                      own_device;
+   bool                      invalidate_available;
    int                       min_swap_interval;
    int                       max_swap_interval;
    int                       default_swap_interval;
@@ -193,7 +202,7 @@
 #ifdef HAVE_X11_PLATFORM
    xcb_connection_t         *conn;
    xcb_screen_t             *screen;
-   int                      swap_available;
+   bool                     swap_available;
 #ifdef HAVE_DRI3
    struct loader_dri3_extensions loader_dri3_ext;
 #endif
@@ -207,7 +216,13 @@
    struct wl_drm            *wl_drm;
    struct wl_shm            *wl_shm;
    struct wl_event_queue    *wl_queue;
-   int                       authenticated;
+   struct zwp_linux_dmabuf_v1 *wl_dmabuf;
+   struct {
+      struct u_vector        xrgb8888;
+      struct u_vector        argb8888;
+      struct u_vector        rgb565;
+   } wl_modifiers;
+   bool                      authenticated;
    int                       formats;
    uint32_t                  capabilities;
    char                     *device_name;
@@ -217,8 +232,8 @@
    const gralloc_module_t *gralloc;
 #endif
 
-   int                       is_render_node;
-   int                       is_different_gpu;
+   bool                      is_render_node;
+   bool                      is_different_gpu;
 };
 
 struct dri2_egl_context
@@ -241,8 +256,7 @@
    _EGLSurface          base;
    __DRIdrawable       *dri_drawable;
    __DRIbuffer          buffers[5];
-   int                  buffer_count;
-   int                  have_fake_front;
+   bool                 have_fake_front;
 
 #ifdef HAVE_X11_PLATFORM
    xcb_drawable_t       drawable;
@@ -284,7 +298,7 @@
 #ifdef HAVE_DRM_PLATFORM
       struct gbm_bo       *bo;
 #endif
-      int                 locked;
+      bool                locked;
       int                 age;
    } color_buffers[4], *back, *current;
 #endif
@@ -317,8 +331,7 @@
 struct dri2_egl_config
 {
    _EGLConfig         base;
-   const __DRIconfig *dri_single_config[2];
-   const __DRIconfig *dri_double_config[2];
+   const __DRIconfig *dri_config[2][2];
 };
 
 struct dri2_egl_image
@@ -366,6 +379,9 @@
 EGLBoolean
 dri2_create_screen(_EGLDisplay *disp);
 
+EGLBoolean
+dri2_setup_extensions(_EGLDisplay *disp);
+
 __DRIdrawable *
 dri2_surface_get_dri_drawable(_EGLSurface *surf);
 
@@ -432,4 +448,7 @@
 #endif
 }
 
+void
+dri2_display_destroy(_EGLDisplay *disp);
+
 #endif /* EGL_DRI2_INCLUDED */
diff --git a/src/egl/drivers/dri2/egl_dri2_fallbacks.h b/src/egl/drivers/dri2/egl_dri2_fallbacks.h
index 67a9c50..c70c686 100644
--- a/src/egl/drivers/dri2/egl_dri2_fallbacks.h
+++ b/src/egl/drivers/dri2/egl_dri2_fallbacks.h
@@ -59,7 +59,14 @@
 dri2_fallback_swap_interval(_EGLDriver *drv, _EGLDisplay *dpy,
                             _EGLSurface *surf, EGLint interval)
 {
-   return EGL_FALSE;
+   if (interval > surf->Config->MaxSwapInterval)
+      interval = surf->Config->MaxSwapInterval;
+   else if (interval < surf->Config->MinSwapInterval)
+      interval = surf->Config->MinSwapInterval;
+
+   surf->SwapInterval = interval;
+
+   return EGL_TRUE;
 }
 
 static inline EGLBoolean
@@ -68,6 +75,7 @@
                                       const EGLint *rects, EGLint n_rects)
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(dpy);
+   dri2_dpy->vtbl->set_damage_region(drv, dpy, surf, rects, n_rects);
    return dri2_dpy->vtbl->swap_buffers(drv, dpy, surf);
 }
 
@@ -95,6 +103,14 @@
    return EGL_FALSE;
 }
 
+static inline EGLBoolean
+dri2_fallback_set_damage_region(_EGLDriver *drv, _EGLDisplay *dpy,
+                                _EGLSurface *surf,
+                                const EGLint *rects, EGLint n_rects)
+{
+   return EGL_FALSE;
+}
+
 static inline EGLint
 dri2_fallback_query_buffer_age(_EGLDriver *drv, _EGLDisplay *dpy,
                                _EGLSurface *surf)
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index d675cdc..300e2d9 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -64,9 +64,7 @@
 static int
 get_fourcc_yuv(int native, int is_ycrcb, int chroma_step)
 {
-   int i;
-
-   for (i = 0; i < ARRAY_SIZE(droid_yuv_formats); ++i)
+   for (int i = 0; i < ARRAY_SIZE(droid_yuv_formats); ++i)
       if (droid_yuv_formats[i].native == native &&
           droid_yuv_formats[i].is_ycrcb == is_ycrcb &&
           droid_yuv_formats[i].chroma_step == chroma_step)
@@ -78,9 +76,7 @@
 static bool
 is_yuv(int native)
 {
-   int i;
-
-   for (i = 0; i < ARRAY_SIZE(droid_yuv_formats); ++i)
+   for (int i = 0; i < ARRAY_SIZE(droid_yuv_formats); ++i)
       if (droid_yuv_formats[i].native == native)
          return true;
 
@@ -299,9 +295,8 @@
 {
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
-   int i;
 
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->local_buffers); i++) {
+   for (int i = 0; i < ARRAY_SIZE(dri2_surf->local_buffers); i++) {
       if (dri2_surf->local_buffers[i]) {
          dri2_dpy->dri2->releaseBuffer(dri2_dpy->dri_screen,
                dri2_surf->local_buffers[i]);
@@ -315,6 +310,7 @@
 		    _EGLConfig *conf, void *native_window,
 		    const EGLint *attrib_list)
 {
+   __DRIcreateNewDrawableFunc createNewDrawable;
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    struct dri2_egl_config *dri2_conf = dri2_egl_config(conf);
    struct dri2_egl_surface *dri2_surf;
@@ -356,11 +352,15 @@
    if (!config)
       goto cleanup_surface;
 
-   dri2_surf->dri_drawable =
-      dri2_dpy->dri2->createNewDrawable(dri2_dpy->dri_screen, config,
-                                        dri2_surf);
+   if (dri2_dpy->image_driver)
+      createNewDrawable = dri2_dpy->image_driver->createNewDrawable;
+   else
+      createNewDrawable = dri2_dpy->dri2->createNewDrawable;
+
+   dri2_surf->dri_drawable = (*createNewDrawable)(dri2_dpy->dri_screen, config,
+                                                  dri2_surf);
    if (dri2_surf->dri_drawable == NULL) {
-      _eglError(EGL_BAD_ALLOC, "dri2->createNewDrawable");
+      _eglError(EGL_BAD_ALLOC, "createNewDrawable");
       goto cleanup_surface;
    }
 
@@ -489,7 +489,7 @@
 }
 
 static int
-get_back_bo(struct dri2_egl_surface *dri2_surf, unsigned int format)
+get_back_bo(struct dri2_egl_surface *dri2_surf)
 {
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
@@ -589,7 +589,7 @@
    }
 
    if (buffer_mask & __DRI_IMAGE_BUFFER_BACK) {
-      if (get_back_bo(dri2_surf, format) < 0)
+      if (get_back_bo(dri2_surf) < 0)
          return 0;
 
       if (dri2_surf->dri_image_back) {
@@ -651,6 +651,43 @@
    return EGL_TRUE;
 }
 
+#if ANDROID_API_LEVEL >= 23
+static EGLBoolean
+droid_set_damage_region(_EGLDriver *drv,
+                        _EGLDisplay *disp,
+                        _EGLSurface *draw, const EGLint* rects, EGLint n_rects)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   struct dri2_egl_surface *dri2_surf = dri2_egl_surface(draw);
+   android_native_rect_t* droid_rects = NULL;
+   int ret;
+
+   if (n_rects == 0)
+      return EGL_TRUE;
+
+   droid_rects = malloc(n_rects * sizeof(android_native_rect_t));
+   if (droid_rects == NULL)
+     return _eglError(EGL_BAD_ALLOC, "eglSetDamageRegionKHR");
+
+   for (EGLint num_drects = 0; num_drects < n_rects; num_drects++) {
+      EGLint i = num_drects * 4;
+      droid_rects[num_drects].left = rects[i];
+      droid_rects[num_drects].bottom = rects[i + 1];
+      droid_rects[num_drects].right = rects[i] + rects[i + 2];
+      droid_rects[num_drects].top = rects[i + 1] + rects[i + 3];
+   }
+
+   /*
+    * XXX/TODO: Need to check for other return values
+    */
+
+   ret = native_window_set_surface_damage(dri2_surf->window, droid_rects, n_rects);
+   free(droid_rects);
+
+   return ret == 0 ? EGL_TRUE : EGL_FALSE;
+}
+#endif
+
 static _EGLImage *
 droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx,
                                      struct ANativeWindowBuffer *buf, int fd)
@@ -708,7 +745,7 @@
    }
 
    if (ycbcr.chroma_step == 2) {
-      /* Semi-planar Y + CbCr or Y + CbCr format. */
+      /* Semi-planar Y + CbCr or Y + CrCb format. */
       const EGLint attr_list_2plane[] = {
          EGL_WIDTH, buf->width,
          EGL_HEIGHT, buf->height,
@@ -804,10 +841,7 @@
       return NULL;
    }
 
-   if (!_eglInitImage(&dri2_img->base, disp)) {
-      free(dri2_img);
-      return NULL;
-   }
+   _eglInitImage(&dri2_img->base, disp);
 
    dri2_img->dri_image =
       dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
@@ -907,10 +941,10 @@
 droid_get_buffers_parse_attachments(struct dri2_egl_surface *dri2_surf,
                                     unsigned int *attachments, int count)
 {
-   int num_buffers = 0, i;
+   int num_buffers = 0;
 
    /* fill dri2_surf->buffers */
-   for (i = 0; i < count * 2; i += 2) {
+   for (int i = 0; i < count * 2; i += 2) {
       __DRIbuffer *buf, *local;
 
       assert(num_buffers < ARRAY_SIZE(dri2_surf->buffers));
@@ -969,16 +1003,13 @@
    if (update_buffers(dri2_surf) < 0)
       return NULL;
 
-   dri2_surf->buffer_count =
-      droid_get_buffers_parse_attachments(dri2_surf, attachments, count);
+   *out_count = droid_get_buffers_parse_attachments(dri2_surf, attachments, count);
 
    if (width)
       *width = dri2_surf->base.Width;
    if (height)
       *height = dri2_surf->base.Height;
 
-   *out_count = dri2_surf->buffer_count;
-
    return dri2_surf->buffers;
 }
 
@@ -990,20 +1021,14 @@
       int format;
       unsigned int rgba_masks[4];
    } visuals[] = {
-      { HAL_PIXEL_FORMAT_RGBA_8888, { 0xff, 0xff00, 0xff0000, 0xff000000 } },
-      { HAL_PIXEL_FORMAT_RGBX_8888, { 0xff, 0xff00, 0xff0000, 0x0 } },
-      { HAL_PIXEL_FORMAT_RGB_565,   { 0xf800, 0x7e0, 0x1f, 0x0 } },
-      { HAL_PIXEL_FORMAT_BGRA_8888, { 0xff0000, 0xff00, 0xff, 0xff000000 } },
+      { HAL_PIXEL_FORMAT_RGBA_8888, { 0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000 } },
+      { HAL_PIXEL_FORMAT_RGBX_8888, { 0x000000ff, 0x0000ff00, 0x00ff0000, 0x00000000 } },
+      { HAL_PIXEL_FORMAT_RGB_565,   { 0x0000f800, 0x000007e0, 0x0000001f, 0x00000000 } },
+      { HAL_PIXEL_FORMAT_BGRA_8888, { 0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000 } },
    };
-   EGLint config_attrs[] = {
-     EGL_NATIVE_VISUAL_ID,   0,
-     EGL_NATIVE_VISUAL_TYPE, 0,
-     EGL_FRAMEBUFFER_TARGET_ANDROID, EGL_TRUE,
-     EGL_RECORDABLE_ANDROID, EGL_TRUE,
-     EGL_NONE
-   };
+
    unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 };
-   int count, i, j;
+   int config_count = 0;
 
    /* The nesting of loops is significant here. Also significant is the order
     * of the HAL pixel formats. Many Android apps (such as Google's official
@@ -1023,33 +1048,38 @@
     * (chadversary) testing on Android Nougat, this was good enough to pacify
     * the buggy clients.
     */
-   count = 0;
-   for (i = 0; i < ARRAY_SIZE(visuals); i++) {
-      const EGLint surface_type = EGL_WINDOW_BIT | EGL_PBUFFER_BIT;
-      struct dri2_egl_config *dri2_conf;
+   for (int i = 0; i < ARRAY_SIZE(visuals); i++) {
+      for (int j = 0; dri2_dpy->driver_configs[j]; j++) {
+         const EGLint surface_type = EGL_WINDOW_BIT | EGL_PBUFFER_BIT;
 
-      for (j = 0; dri2_dpy->driver_configs[j]; j++) {
-         config_attrs[1] = visuals[i].format;
-         config_attrs[3] = visuals[i].format;
+         const EGLint config_attrs[] = {
+           EGL_NATIVE_VISUAL_ID,   visuals[i].format,
+           EGL_NATIVE_VISUAL_TYPE, visuals[i].format,
+           EGL_FRAMEBUFFER_TARGET_ANDROID, EGL_TRUE,
+           EGL_RECORDABLE_ANDROID, EGL_TRUE,
+           EGL_NONE
+         };
 
-         dri2_conf = dri2_add_config(dpy, dri2_dpy->driver_configs[j],
-               count + 1, surface_type, config_attrs, visuals[i].rgba_masks);
+         struct dri2_egl_config *dri2_conf =
+            dri2_add_config(dpy, dri2_dpy->driver_configs[j],
+                            config_count + 1, surface_type, config_attrs,
+                            visuals[i].rgba_masks);
          if (dri2_conf) {
-            if (dri2_conf->base.ConfigID == count + 1)
-               count++;
+            if (dri2_conf->base.ConfigID == config_count + 1)
+               config_count++;
             format_count[i]++;
          }
       }
    }
 
-   for (i = 0; i < ARRAY_SIZE(format_count); i++) {
+   for (int i = 0; i < ARRAY_SIZE(format_count); i++) {
       if (!format_count[i]) {
          _eglLog(_EGL_DEBUG, "No DRI config supports native format 0x%x",
                  visuals[i].format);
       }
    }
 
-   return (count != 0);
+   return (config_count != 0);
 }
 
 static int
@@ -1069,39 +1099,7 @@
    return (fd >= 0) ? fcntl(fd, F_DUPFD_CLOEXEC, 3) : -1;
 }
 
-/* support versions < JellyBean */
-#ifndef ALOGW
-#define ALOGW LOGW
-#endif
-#ifndef ALOGD
-#define ALOGD LOGD
-#endif
-#ifndef ALOGI
-#define ALOGI LOGI
-#endif
-
-static void
-droid_log(EGLint level, const char *msg)
-{
-   switch (level) {
-   case _EGL_DEBUG:
-      ALOGD("%s", msg);
-      break;
-   case _EGL_INFO:
-      ALOGI("%s", msg);
-      break;
-   case _EGL_WARNING:
-      ALOGW("%s", msg);
-      break;
-   case _EGL_FATAL:
-      LOG_FATAL("%s", msg);
-      break;
-   default:
-      break;
-   }
-}
-
-static struct dri2_egl_display_vtbl droid_display_vtbl = {
+static const struct dri2_egl_display_vtbl droid_display_vtbl = {
    .authenticate = NULL,
    .create_window_surface = droid_create_window_surface,
    .create_pixmap_surface = dri2_fallback_create_pixmap_surface,
@@ -1112,6 +1110,11 @@
    .swap_buffers = droid_swap_buffers,
    .swap_buffers_with_damage = dri2_fallback_swap_buffers_with_damage,
    .swap_buffers_region = dri2_fallback_swap_buffers_region,
+#if ANDROID_API_LEVEL >= 23
+   .set_damage_region = droid_set_damage_region,
+#else
+   .set_damage_region = dri2_fallback_set_damage_region,
+#endif
    .post_sub_buffer = dri2_fallback_post_sub_buffer,
    .copy_buffers = dri2_fallback_copy_buffers,
    .query_buffer_age = droid_query_buffer_age,
@@ -1157,19 +1160,18 @@
    const char *err;
    int ret;
 
-   _eglSetLogProc(droid_log);
-
    loader_set_logger(_eglLog);
 
    dri2_dpy = calloc(1, sizeof(*dri2_dpy));
    if (!dri2_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
+   dri2_dpy->fd = -1;
    ret = hw_get_module(GRALLOC_HARDWARE_MODULE_ID,
                        (const hw_module_t **)&dri2_dpy->gralloc);
    if (ret) {
       err = "DRI2: failed to get gralloc module";
-      goto cleanup_display;
+      goto cleanup;
    }
 
    dpy->DriverData = (void *) dri2_dpy;
@@ -1177,43 +1179,57 @@
    dri2_dpy->fd = droid_open_device(dri2_dpy);
    if (dri2_dpy->fd < 0) {
       err = "DRI2: failed to open device";
-      goto cleanup_display;
+      goto cleanup;
    }
 
    dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd);
    if (dri2_dpy->driver_name == NULL) {
       err = "DRI2: failed to get driver name";
-      goto cleanup_device;
-   }
-
-   if (!dri2_load_driver(dpy)) {
-      err = "DRI2: failed to load driver";
-      goto cleanup_driver_name;
+      goto cleanup;
    }
 
    dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER;
 
    /* render nodes cannot use Gem names, and thus do not support
     * the __DRI_DRI2_LOADER extension */
-   if (!dri2_dpy->is_render_node)
+   if (!dri2_dpy->is_render_node) {
       dri2_dpy->loader_extensions = droid_dri2_loader_extensions;
-   else
+      if (!dri2_load_driver(dpy)) {
+         err = "DRI2: failed to load driver";
+         goto cleanup;
+      }
+   } else {
       dri2_dpy->loader_extensions = droid_image_loader_extensions;
+      if (!dri2_load_driver_dri3(dpy)) {
+         err = "DRI3: failed to load driver";
+         goto cleanup;
+      }
+   }
 
    if (!dri2_create_screen(dpy)) {
       err = "DRI2: failed to create screen";
-      goto cleanup_driver;
+      goto cleanup;
    }
 
+   if (!dri2_setup_extensions(dpy)) {
+      err = "DRI2: failed to setup extensions";
+      goto cleanup;
+   }
+
+   dri2_setup_screen(dpy);
+
    if (!droid_add_configs_for_visuals(drv, dpy)) {
       err = "DRI2: failed to add configs";
-      goto cleanup_screen;
+      goto cleanup;
    }
 
    dpy->Extensions.ANDROID_framebuffer_target = EGL_TRUE;
    dpy->Extensions.ANDROID_image_native_buffer = EGL_TRUE;
    dpy->Extensions.ANDROID_recordable = EGL_TRUE;
    dpy->Extensions.EXT_buffer_age = EGL_TRUE;
+#if ANDROID_API_LEVEL >= 23
+   dpy->Extensions.KHR_partial_update = EGL_TRUE;
+#endif
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
@@ -1222,17 +1238,7 @@
 
    return EGL_TRUE;
 
-cleanup_screen:
-   dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
-cleanup_driver:
-   dlclose(dri2_dpy->driver);
-cleanup_driver_name:
-   free(dri2_dpy->driver_name);
-cleanup_device:
-   close(dri2_dpy->fd);
-cleanup_display:
-   free(dri2_dpy);
-   dpy->DriverData = NULL;
-
+cleanup:
+   dri2_display_destroy(dpy);
    return _eglError(EGL_NOT_INITIALIZED, err);
 }
diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index b178eaa..d59009f 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -56,7 +56,7 @@
    bo = dri2_surf->current->bo;
 
    if (device->dri2) {
-      dri2_surf->current->locked = 1;
+      dri2_surf->current->locked = true;
       dri2_surf->current = NULL;
    }
 
@@ -68,11 +68,11 @@
 {
    struct gbm_dri_surface *surf = (struct gbm_dri_surface *) _surf;
    struct dri2_egl_surface *dri2_surf = surf->dri_private;
-   unsigned i;
 
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+   for (unsigned i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
       if (dri2_surf->color_buffers[i].bo == bo) {
-	 dri2_surf->color_buffers[i].locked = 0;
+	 dri2_surf->color_buffers[i].locked = false;
+	 break;
       }
    }
 }
@@ -82,9 +82,8 @@
 {
    struct gbm_dri_surface *surf = (struct gbm_dri_surface *) _surf;
    struct dri2_egl_surface *dri2_surf = surf->dri_private;
-   unsigned i;
 
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++)
+   for (unsigned i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++)
       if (!dri2_surf->color_buffers[i].locked)
 	 return 1;
 
@@ -189,16 +188,15 @@
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf);
-   unsigned i;
 
    dri2_dpy->core->destroyDrawable(dri2_surf->dri_drawable);
 
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+   for (unsigned i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
       if (dri2_surf->color_buffers[i].bo)
 	 gbm_bo_destroy(dri2_surf->color_buffers[i].bo);
    }
 
-   for (i = 0; i < __DRI_BUFFER_COUNT; i++) {
+   for (unsigned i = 0; i < __DRI_BUFFER_COUNT; i++) {
       if (dri2_surf->dri_buffers[i])
          dri2_dpy->dri2->releaseBuffer(dri2_dpy->dri_screen,
                                        dri2_surf->dri_buffers[i]);
@@ -216,10 +214,9 @@
       dri2_egl_display(dri2_surf->base.Resource.Display);
    struct gbm_dri_surface *surf = dri2_surf->gbm_surf;
    int age = 0;
-   unsigned i;
 
    if (dri2_surf->back == NULL) {
-      for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+      for (unsigned i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
 	 if (!dri2_surf->color_buffers[i].locked &&
 	      dri2_surf->color_buffers[i].age >= age) {
 	    dri2_surf->back = &dri2_surf->color_buffers[i];
@@ -232,13 +229,14 @@
       return -1;
    if (dri2_surf->back->bo == NULL) {
       if (surf->base.modifiers)
-         dri2_surf->back->bo = gbm_bo_create_with_modifiers(&dri2_dpy->gbm_dri->base.base,
-                                                            surf->base.width, surf->base.height,
+         dri2_surf->back->bo = gbm_bo_create_with_modifiers(&dri2_dpy->gbm_dri->base,
+                                                            surf->base.width,
+                                                            surf->base.height,
                                                             surf->base.format,
                                                             surf->base.modifiers,
                                                             surf->base.count);
       else
-         dri2_surf->back->bo = gbm_bo_create(&dri2_dpy->gbm_dri->base.base,
+         dri2_surf->back->bo = gbm_bo_create(&dri2_dpy->gbm_dri->base,
                                              surf->base.width,
                                              surf->base.height,
                                              surf->base.format,
@@ -264,7 +262,7 @@
    }
 
    if (dri2_surf->current->bo == NULL)
-      dri2_surf->current->bo = gbm_bo_create(&dri2_dpy->gbm_dri->base.base,
+      dri2_surf->current->bo = gbm_bo_create(&dri2_dpy->gbm_dri->base,
                                              surf->base.width, surf->base.height,
                                              surf->base.format, surf->base.flags);
    if (dri2_surf->current->bo == NULL)
@@ -325,10 +323,9 @@
    struct dri2_egl_surface *dri2_surf = loaderPrivate;
    int i, j;
 
-   dri2_surf->buffer_count = 0;
    for (i = 0, j = 0; i < 2 * count; i += 2, j++) {
       assert(attachments[i] < __DRI_BUFFER_COUNT);
-      assert(dri2_surf->buffer_count < 5);
+      assert(j < ARRAY_SIZE(dri2_surf->buffers));
 
       switch (attachments[i]) {
       case __DRI_BUFFER_BACK_LEFT:
@@ -367,7 +364,6 @@
    unsigned int *attachments_with_format;
    __DRIbuffer *buffer;
    const unsigned int format = 32;
-   int i;
 
    attachments_with_format = calloc(count, 2 * sizeof(unsigned int));
    if (!attachments_with_format) {
@@ -375,7 +371,7 @@
       return NULL;
    }
 
-   for (i = 0; i < count; ++i) {
+   for (int i = 0; i < count; ++i) {
       attachments_with_format[2*i] = attachments[i];
       attachments_with_format[2*i + 1] = format;
    }
@@ -424,34 +420,32 @@
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    struct dri2_egl_surface *dri2_surf = dri2_egl_surface(draw);
-   unsigned i;
 
-   if (dri2_dpy->swrast) {
+   if (!dri2_dpy->flush) {
       dri2_dpy->core->swapBuffers(dri2_surf->dri_drawable);
-   } else {
-      if (dri2_surf->base.Type == EGL_WINDOW_BIT) {
-         if (dri2_surf->current)
-            _eglError(EGL_BAD_SURFACE, "dri2_swap_buffers");
-         for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++)
-            if (dri2_surf->color_buffers[i].age > 0)
-               dri2_surf->color_buffers[i].age++;
-
-         /* Make sure we have a back buffer in case we're swapping without
-          * ever rendering. */
-         if (get_back_bo(dri2_surf) < 0) {
-            _eglError(EGL_BAD_ALLOC, "dri2_swap_buffers");
-            return EGL_FALSE;
-         }
-
-         dri2_surf->current = dri2_surf->back;
-         dri2_surf->current->age = 1;
-         dri2_surf->back = NULL;
-      }
-
-      dri2_flush_drawable_for_swapbuffers(disp, draw);
-      dri2_dpy->flush->invalidate(dri2_surf->dri_drawable);
+      return EGL_TRUE;
    }
 
+   if (dri2_surf->base.Type == EGL_WINDOW_BIT) {
+      if (dri2_surf->current)
+         _eglError(EGL_BAD_SURFACE, "dri2_swap_buffers");
+      for (unsigned i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++)
+         if (dri2_surf->color_buffers[i].age > 0)
+            dri2_surf->color_buffers[i].age++;
+
+      /* Make sure we have a back buffer in case we're swapping without
+       * ever rendering. */
+      if (get_back_bo(dri2_surf) < 0)
+         return _eglError(EGL_BAD_ALLOC, "dri2_swap_buffers");
+
+      dri2_surf->current = dri2_surf->back;
+      dri2_surf->current->age = 1;
+      dri2_surf->back = NULL;
+   }
+
+   dri2_flush_drawable_for_swapbuffers(disp, draw);
+   dri2_dpy->flush->invalidate(dri2_surf->dri_drawable);
+
    return EGL_TRUE;
 }
 
@@ -483,10 +477,7 @@
       return NULL;
    }
 
-   if (!_eglInitImage(&dri2_img->base, disp)) {
-      free(dri2_img);
-      return NULL;
-   }
+   _eglInitImage(&dri2_img->base, disp);
 
    dri2_img->dri_image = dri2_dpy->image->dupImage(dri_bo->image, dri2_img);
    if (dri2_img->dri_image == NULL) {
@@ -533,7 +524,7 @@
                   void          *loaderPrivate)
 {
    struct dri2_egl_surface *dri2_surf = loaderPrivate;
-   int internal_stride, i;
+   int internal_stride;
    struct gbm_dri_bo *bo;
 
    if (op != __DRI_SWRAST_IMAGE_OP_DRAW &&
@@ -547,9 +538,9 @@
    if (gbm_dri_bo_map_dumb(bo) == NULL)
       return;
 
-   internal_stride = bo->base.base.stride;
+   internal_stride = bo->base.stride;
 
-   for (i = 0; i < height; i++) {
+   for (int i = 0; i < height; i++) {
       memcpy(bo->map + (x + i) * internal_stride + y,
              data + i * stride, stride);
    }
@@ -567,7 +558,7 @@
                  void          *loaderPrivate)
 {
    struct dri2_egl_surface *dri2_surf = loaderPrivate;
-   int internal_stride, stride, i;
+   int internal_stride, stride;
    struct gbm_dri_bo *bo;
 
    if (get_swrast_front_bo(dri2_surf) < 0)
@@ -577,10 +568,10 @@
    if (gbm_dri_bo_map_dumb(bo) == NULL)
       return;
 
-   internal_stride = bo->base.base.stride;
+   internal_stride = bo->base.stride;
    stride = width * 4;
 
-   for (i = 0; i < height; i++) {
+   for (int i = 0; i < height; i++) {
       memcpy(data + i * stride,
              bo->map + (x + i) * internal_stride + y, stride);
    }
@@ -603,15 +594,11 @@
       { GBM_FORMAT_ARGB8888,    0x00ff0000, 0xff000000 },
       { GBM_FORMAT_RGB565,      0x0000f800, 0x00000000 },
    };
-   EGLint attr_list[] = {
-      EGL_NATIVE_VISUAL_ID, 0,
-      EGL_NONE,
-   };
-   unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 };
-   unsigned int count, i, j;
 
-   count = 0;
-   for (i = 0; dri2_dpy->driver_configs[i]; i++) {
+   unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 };
+   unsigned int config_count = 0;
+
+   for (unsigned i = 0; dri2_dpy->driver_configs[i]; i++) {
       unsigned int red, alpha;
 
       dri2_dpy->core->getConfigAttrib(dri2_dpy->driver_configs[i],
@@ -619,35 +606,38 @@
       dri2_dpy->core->getConfigAttrib(dri2_dpy->driver_configs[i],
                                       __DRI_ATTRIB_ALPHA_MASK, &alpha);
 
-      for (j = 0; j < ARRAY_SIZE(visuals); j++) {
+      for (unsigned j = 0; j < ARRAY_SIZE(visuals); j++) {
          struct dri2_egl_config *dri2_conf;
 
          if (visuals[j].red_mask != red || visuals[j].alpha_mask != alpha)
             continue;
 
-         attr_list[1] = visuals[j].format;
+         const EGLint attr_list[] = {
+            EGL_NATIVE_VISUAL_ID,  visuals[j].format,
+            EGL_NONE,
+         };
 
          dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[i],
-               count + 1, EGL_WINDOW_BIT, attr_list, NULL);
+               config_count + 1, EGL_WINDOW_BIT, attr_list, NULL);
          if (dri2_conf) {
-            if (dri2_conf->base.ConfigID == count + 1)
-               count++;
+            if (dri2_conf->base.ConfigID == config_count + 1)
+               config_count++;
             format_count[j]++;
          }
       }
    }
 
-   for (i = 0; i < ARRAY_SIZE(format_count); i++) {
+   for (unsigned i = 0; i < ARRAY_SIZE(format_count); i++) {
       if (!format_count[i]) {
          _eglLog(_EGL_DEBUG, "No DRI config supports native format 0x%x",
                  visuals[i].format);
       }
    }
 
-   return (count != 0);
+   return (config_count != 0);
 }
 
-static struct dri2_egl_display_vtbl dri2_drm_display_vtbl = {
+static const struct dri2_egl_display_vtbl dri2_drm_display_vtbl = {
    .authenticate = dri2_drm_authenticate,
    .create_window_surface = dri2_drm_create_window_surface,
    .create_pixmap_surface = dri2_drm_create_pixmap_surface,
@@ -658,6 +648,7 @@
    .swap_buffers = dri2_drm_swap_buffers,
    .swap_buffers_with_damage = dri2_fallback_swap_buffers_with_damage,
    .swap_buffers_region = dri2_fallback_swap_buffers_region,
+   .set_damage_region = dri2_fallback_set_damage_region,
    .post_sub_buffer = dri2_fallback_post_sub_buffer,
    .copy_buffers = dri2_fallback_copy_buffers,
    .query_buffer_age = dri2_drm_query_buffer_age,
@@ -672,7 +663,6 @@
    struct dri2_egl_display *dri2_dpy;
    struct gbm_device *gbm;
    const char *err;
-   int fd = -1;
 
    loader_set_logger(_eglLog);
 
@@ -680,6 +670,7 @@
    if (!dri2_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
+   dri2_dpy->fd = -1;
    disp->DriverData = (void *) dri2_dpy;
 
    gbm = disp->PlatformDisplay;
@@ -687,18 +678,18 @@
       char buf[64];
       int n = snprintf(buf, sizeof(buf), DRM_DEV_NAME, DRM_DIR_NAME, 0);
       if (n != -1 && n < sizeof(buf))
-         fd = loader_open_device(buf);
-      if (fd < 0)
-         fd = loader_open_device("/dev/dri/card0");
-      gbm = gbm_create_device(fd);
+         dri2_dpy->fd = loader_open_device(buf);
+      if (dri2_dpy->fd < 0)
+         dri2_dpy->fd = loader_open_device("/dev/dri/card0");
+      gbm = gbm_create_device(dri2_dpy->fd);
       if (gbm == NULL) {
          err = "DRI2: failed to create gbm device";
          goto cleanup;
       }
-      dri2_dpy->own_device = 1;
+      dri2_dpy->own_device = true;
    } else {
-      fd = fcntl(gbm_device_get_fd(gbm), F_DUPFD_CLOEXEC, 3);
-      if (fd < 0) {
+      dri2_dpy->fd = fcntl(gbm_device_get_fd(gbm), F_DUPFD_CLOEXEC, 3);
+      if (dri2_dpy->fd < 0) {
          err = "DRI2: failed to fcntl() existing gbm device";
          goto cleanup;
       }
@@ -710,23 +701,13 @@
    }
 
    dri2_dpy->gbm_dri = gbm_dri_device(gbm);
-   if (dri2_dpy->gbm_dri->base.type != GBM_DRM_DRIVER_TYPE_DRI) {
-      err = "DRI2: gbm device using incorrect/incompatible type";
-      goto cleanup;
-   }
-
-   dri2_dpy->fd = fd;
-   dri2_dpy->driver_name = strdup(dri2_dpy->gbm_dri->base.driver_name);
+   dri2_dpy->driver_name = strdup(dri2_dpy->gbm_dri->driver_name);
 
    dri2_dpy->dri_screen = dri2_dpy->gbm_dri->screen;
    dri2_dpy->core = dri2_dpy->gbm_dri->core;
    dri2_dpy->dri2 = dri2_dpy->gbm_dri->dri2;
-   dri2_dpy->fence = dri2_dpy->gbm_dri->fence;
-   dri2_dpy->image = dri2_dpy->gbm_dri->image;
-   dri2_dpy->flush = dri2_dpy->gbm_dri->flush;
    dri2_dpy->swrast = dri2_dpy->gbm_dri->swrast;
    dri2_dpy->driver_configs = dri2_dpy->gbm_dri->driver_configs;
-   dri2_dpy->interop = dri2_dpy->gbm_dri->interop;
 
    dri2_dpy->gbm_dri->lookup_image = dri2_lookup_egl_image;
    dri2_dpy->gbm_dri->lookup_user_data = disp;
@@ -738,9 +719,14 @@
    dri2_dpy->gbm_dri->swrast_put_image2 = swrast_put_image2;
    dri2_dpy->gbm_dri->swrast_get_image = swrast_get_image;
 
-   dri2_dpy->gbm_dri->base.base.surface_lock_front_buffer = lock_front_buffer;
-   dri2_dpy->gbm_dri->base.base.surface_release_buffer = release_buffer;
-   dri2_dpy->gbm_dri->base.base.surface_has_free_buffers = has_free_buffers;
+   dri2_dpy->gbm_dri->base.surface_lock_front_buffer = lock_front_buffer;
+   dri2_dpy->gbm_dri->base.surface_release_buffer = release_buffer;
+   dri2_dpy->gbm_dri->base.surface_has_free_buffers = has_free_buffers;
+
+   if (!dri2_setup_extensions(disp)) {
+      err = "DRI2: failed to find required DRI extensions";
+      goto cleanup;
+   }
 
    dri2_setup_screen(disp);
 
@@ -766,10 +752,6 @@
    return EGL_TRUE;
 
 cleanup:
-   if (fd >= 0)
-      close(fd);
-
-   free(dri2_dpy);
-   disp->DriverData = NULL;
+   dri2_display_destroy(disp);
    return _eglError(EGL_NOT_INITIALIZED, err);
 }
diff --git a/src/egl/drivers/dri2/platform_surfaceless.c b/src/egl/drivers/dri2/platform_surfaceless.c
index 3e19169..1091b4f 100644
--- a/src/egl/drivers/dri2/platform_surfaceless.c
+++ b/src/egl/drivers/dri2/platform_surfaceless.c
@@ -134,10 +134,10 @@
       goto cleanup_surface;
 
    dri2_surf->dri_drawable =
-      dri2_dpy->dri2->createNewDrawable(dri2_dpy->dri_screen, config,
-                                        dri2_surf);
+      dri2_dpy->image_driver->createNewDrawable(dri2_dpy->dri_screen, config,
+                                                dri2_surf);
    if (dri2_surf->dri_drawable == NULL) {
-      _eglError(EGL_BAD_ALLOC, "dri2->createNewDrawable");
+      _eglError(EGL_BAD_ALLOC, "image->createNewDrawable");
       goto cleanup_surface;
     }
 
@@ -201,35 +201,35 @@
       { "RGB565",   { 0x00f800, 0x07e0, 0x1f, 0x0 } },
    };
    unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 };
-   unsigned int count, i, j;
+   unsigned int config_count = 0;
 
-   count = 0;
-   for (i = 0; dri2_dpy->driver_configs[i] != NULL; i++) {
-      for (j = 0; j < ARRAY_SIZE(visuals); j++) {
+   for (unsigned i = 0; dri2_dpy->driver_configs[i] != NULL; i++) {
+      for (unsigned j = 0; j < ARRAY_SIZE(visuals); j++) {
          struct dri2_egl_config *dri2_conf;
 
          dri2_conf = dri2_add_config(dpy, dri2_dpy->driver_configs[i],
-               count + 1, EGL_PBUFFER_BIT, NULL, visuals[j].rgba_masks);
+               config_count + 1, EGL_PBUFFER_BIT, NULL,
+               visuals[j].rgba_masks);
 
          if (dri2_conf) {
-            if (dri2_conf->base.ConfigID == count + 1)
-               count++;
+            if (dri2_conf->base.ConfigID == config_count + 1)
+               config_count++;
             format_count[j]++;
          }
       }
    }
 
-   for (i = 0; i < ARRAY_SIZE(format_count); i++) {
+   for (unsigned i = 0; i < ARRAY_SIZE(format_count); i++) {
       if (!format_count[i]) {
          _eglLog(_EGL_DEBUG, "No DRI config supports native format %s",
                visuals[i].format_name);
       }
    }
 
-   return (count != 0);
+   return (config_count != 0);
 }
 
-static struct dri2_egl_display_vtbl dri2_surfaceless_display_vtbl = {
+static const struct dri2_egl_display_vtbl dri2_surfaceless_display_vtbl = {
    .create_pixmap_surface = dri2_fallback_create_pixmap_surface,
    .create_pbuffer_surface = dri2_surfaceless_create_pbuffer_surface,
    .destroy_surface = surfaceless_destroy_surface,
@@ -238,6 +238,7 @@
    .swap_buffers = surfaceless_swap_buffers,
    .swap_buffers_with_damage = dri2_fallback_swap_buffers_with_damage,
    .swap_buffers_region = dri2_fallback_swap_buffers_region,
+   .set_damage_region = dri2_fallback_set_damage_region,
    .post_sub_buffer = dri2_fallback_post_sub_buffer,
    .copy_buffers = dri2_fallback_copy_buffers,
    .query_buffer_age = dri2_fallback_query_buffer_age,
@@ -271,7 +272,6 @@
 {
    struct dri2_egl_display *dri2_dpy;
    const char* err;
-   int i;
    int driver_loaded = 0;
 
    loader_set_logger(_eglLog);
@@ -280,11 +280,12 @@
    if (!dri2_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
+   dri2_dpy->fd = -1;
    disp->DriverData = (void *) dri2_dpy;
 
    const int limit = 64;
    const int base = 128;
-   for (i = 0; i < limit; ++i) {
+   for (int i = 0; i < limit; ++i) {
       char *card_path;
       if (asprintf(&card_path, DRM_RENDER_DEV_NAME, DRM_DIR_NAME, base + i) < 0)
          continue;
@@ -297,30 +298,39 @@
 
       dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd);
       if (dri2_dpy->driver_name) {
-         if (dri2_load_driver(disp)) {
+         if (dri2_load_driver_dri3(disp)) {
             driver_loaded = 1;
             break;
          }
          free(dri2_dpy->driver_name);
+         dri2_dpy->driver_name = NULL;
       }
       close(dri2_dpy->fd);
+      dri2_dpy->fd = -1;
    }
 
    if (!driver_loaded) {
       err = "DRI2: failed to load driver";
-      goto cleanup_display;
+      goto cleanup;
    }
 
    dri2_dpy->loader_extensions = image_loader_extensions;
 
    if (!dri2_create_screen(disp)) {
       err = "DRI2: failed to create screen";
-      goto cleanup_driver;
+      goto cleanup;
    }
 
+   if (!dri2_setup_extensions(disp)) {
+      err = "DRI2: failed to find required DRI extensions";
+      goto cleanup;
+   }
+
+   dri2_setup_screen(disp);
+
    if (!surfaceless_add_configs_for_visuals(drv, disp)) {
       err = "DRI2: failed to add configs";
-      goto cleanup_screen;
+      goto cleanup;
    }
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
@@ -330,16 +340,7 @@
 
    return EGL_TRUE;
 
-cleanup_screen:
-   dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
-
-cleanup_driver:
-   dlclose(dri2_dpy->driver);
-   free(dri2_dpy->driver_name);
-   close(dri2_dpy->fd);
-cleanup_display:
-   free(dri2_dpy);
-   disp->DriverData = NULL;
-
+cleanup:
+   dri2_display_destroy(disp);
    return _eglError(EGL_NOT_INITIALIZED, err);
 }
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index b8af6ef..b4b412c 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -36,14 +36,26 @@
 #include <unistd.h>
 #include <fcntl.h>
 #include <xf86drm.h>
+#include <drm_fourcc.h>
 #include <sys/mman.h>
 
 #include "egl_dri2.h"
 #include "egl_dri2_fallbacks.h"
 #include "loader.h"
+#include "util/u_vector.h"
+#include "eglglobals.h"
 
 #include <wayland-client.h>
 #include "wayland-drm-client-protocol.h"
+#include "linux-dmabuf-unstable-v1-client-protocol.h"
+
+#ifndef DRM_FORMAT_MOD_INVALID
+#define DRM_FORMAT_MOD_INVALID ((1ULL << 56) - 1)
+#endif
+
+#ifndef DRM_FORMAT_MOD_LINEAR
+#define DRM_FORMAT_MOD_LINEAR 0
+#endif
 
 enum wl_drm_format_flags {
    HAS_ARGB8888 = 1,
@@ -76,7 +88,7 @@
       return;
    }
 
-   dri2_surf->color_buffers[i].locked = 0;
+   dri2_surf->color_buffers[i].locked = false;
 }
 
 static const struct wl_buffer_listener wl_buffer_listener = {
@@ -100,6 +112,19 @@
    dri2_surf->wl_win = NULL;
 }
 
+static struct wl_surface *
+get_wl_surface_proxy(struct wl_egl_window *window)
+{
+   /* Version 3 of wl_egl_window put a version field where older layouts
+    * kept the wl_surface pointer. If window->version is dereferenceable it
+    * is really that old-layout wl_surface pointer, so wrap it; otherwise
+    * wrap the window->surface member. */
+   if (_eglPointerIsDereferencable((void *)(window->version))) {
+      return wl_proxy_create_wrapper((void *)(window->version));
+   }
+   return wl_proxy_create_wrapper(window->surface);
+}
+
 /**
  * Called via eglCreateWindowSurface(), drv->API.CreateWindowSurface().
  */
@@ -124,7 +149,7 @@
    if (!_eglInitSurface(&dri2_surf->base, disp, EGL_WINDOW_BIT, conf, attrib_list))
       goto cleanup_surf;
 
-   if (dri2_dpy->wl_drm) {
+   if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm) {
       if (conf->RedSize == 5)
          dri2_surf->format = WL_DRM_FORMAT_RGB565;
       else if (conf->AlphaSize == 0)
@@ -171,10 +196,10 @@
    wl_proxy_set_queue((struct wl_proxy *)dri2_surf->wl_dpy_wrapper,
                       dri2_surf->wl_queue);
 
-   dri2_surf->wl_surface_wrapper = wl_proxy_create_wrapper(window->surface);
+   dri2_surf->wl_surface_wrapper = get_wl_surface_proxy(window);
    if (!dri2_surf->wl_surface_wrapper) {
       _eglError(EGL_BAD_ALLOC, "dri2_create_surface");
-      goto cleanup_drm;
+      goto cleanup_dpy_wrapper;
    }
    wl_proxy_set_queue((struct wl_proxy *)dri2_surf->wl_surface_wrapper,
                       dri2_surf->wl_queue);
@@ -188,19 +213,21 @@
    config = dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
                                 dri2_surf->base.GLColorspace);
 
-   if (dri2_dpy->dri2) {
+   if (dri2_dpy->flush)
       dri2_surf->wl_win->resize_callback = resize_callback;
 
+   if (dri2_dpy->image_driver)
+      createNewDrawable = dri2_dpy->image_driver->createNewDrawable;
+   else if (dri2_dpy->dri2)
       createNewDrawable = dri2_dpy->dri2->createNewDrawable;
-   } else {
+   else
       createNewDrawable = dri2_dpy->swrast->createNewDrawable;
-   }
 
    dri2_surf->dri_drawable = (*createNewDrawable)(dri2_dpy->dri_screen, config,
                                                   dri2_surf);
     if (dri2_surf->dri_drawable == NULL) {
       _eglError(EGL_BAD_ALLOC, "createNewDrawable");
-       goto cleanup_surf;
+       goto cleanup_surf_wrapper;
     }
 
    dri2_wl_swap_interval(drv, disp, &dri2_surf->base,
@@ -208,6 +235,10 @@
 
    return &dri2_surf->base;
 
+ cleanup_surf_wrapper:
+   wl_proxy_wrapper_destroy(dri2_surf->wl_surface_wrapper);
+ cleanup_dpy_wrapper:
+   wl_proxy_wrapper_destroy(dri2_surf->wl_dpy_wrapper);
  cleanup_drm:
    if (dri2_surf->wl_drm_wrapper)
       wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper);
@@ -243,13 +274,12 @@
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf);
-   int i;
 
    (void) drv;
 
    dri2_dpy->core->destroyDrawable(dri2_surf->dri_drawable);
 
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+   for (int i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
       if (dri2_surf->color_buffers[i].wl_buffer)
          wl_buffer_destroy(dri2_surf->color_buffers[i].wl_buffer);
       if (dri2_surf->color_buffers[i].dri_image)
@@ -262,7 +292,7 @@
    }
 
    if (dri2_dpy->dri2) {
-      for (i = 0; i < __DRI_BUFFER_COUNT; i++)
+      for (int i = 0; i < __DRI_BUFFER_COUNT; i++)
          if (dri2_surf->dri_buffers[i] &&
              dri2_surf->dri_buffers[i]->attachment != __DRI_BUFFER_BACK_LEFT)
             dri2_dpy->dri2->releaseBuffer(dri2_dpy->dri_screen,
@@ -278,10 +308,10 @@
       dri2_surf->wl_win->destroy_window_callback = NULL;
    }
 
-   if (dri2_surf->wl_drm_wrapper)
-      wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper);
    wl_proxy_wrapper_destroy(dri2_surf->wl_surface_wrapper);
    wl_proxy_wrapper_destroy(dri2_surf->wl_dpy_wrapper);
+   if (dri2_surf->wl_drm_wrapper)
+      wl_proxy_wrapper_destroy(dri2_surf->wl_drm_wrapper);
    wl_event_queue_destroy(dri2_surf->wl_queue);
 
    free(surf);
@@ -294,9 +324,8 @@
 {
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
-   int i;
 
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+   for (int i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
       if (dri2_surf->color_buffers[i].wl_buffer &&
           !dri2_surf->color_buffers[i].locked)
          wl_buffer_destroy(dri2_surf->color_buffers[i].wl_buffer);
@@ -312,11 +341,11 @@
       dri2_surf->color_buffers[i].dri_image = NULL;
       dri2_surf->color_buffers[i].linear_copy = NULL;
       dri2_surf->color_buffers[i].data = NULL;
-      dri2_surf->color_buffers[i].locked = 0;
+      dri2_surf->color_buffers[i].locked = false;
    }
 
    if (dri2_dpy->dri2) {
-      for (i = 0; i < __DRI_BUFFER_COUNT; i++)
+      for (int i = 0; i < __DRI_BUFFER_COUNT; i++)
          if (dri2_surf->dri_buffers[i] &&
              dri2_surf->dri_buffers[i]->attachment != __DRI_BUFFER_BACK_LEFT)
             dri2_dpy->dri2->releaseBuffer(dri2_dpy->dri_screen,
@@ -329,8 +358,10 @@
 {
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
-   int i, use_flags;
+   int use_flags;
    unsigned int dri_image_format;
+   uint64_t *modifiers;
+   int num_modifiers;
 
    /* currently supports three WL DRM formats,
     * WL_DRM_FORMAT_ARGB8888, WL_DRM_FORMAT_XRGB8888,
@@ -339,12 +370,18 @@
    switch (dri2_surf->format) {
    case WL_DRM_FORMAT_ARGB8888:
       dri_image_format = __DRI_IMAGE_FORMAT_ARGB8888;
+      modifiers = u_vector_tail(&dri2_dpy->wl_modifiers.argb8888);
+      num_modifiers = u_vector_length(&dri2_dpy->wl_modifiers.argb8888);
       break;
    case WL_DRM_FORMAT_XRGB8888:
       dri_image_format = __DRI_IMAGE_FORMAT_XRGB8888;
+      modifiers = u_vector_tail(&dri2_dpy->wl_modifiers.xrgb8888);
+      num_modifiers = u_vector_length(&dri2_dpy->wl_modifiers.xrgb8888);
       break;
    case WL_DRM_FORMAT_RGB565:
       dri_image_format = __DRI_IMAGE_FORMAT_RGB565;
+      modifiers = u_vector_tail(&dri2_dpy->wl_modifiers.rgb565);
+      num_modifiers = u_vector_length(&dri2_dpy->wl_modifiers.rgb565);
       break;
    default:
       /* format is not supported */
@@ -355,7 +392,7 @@
    wl_display_dispatch_queue_pending(dri2_dpy->wl_dpy, dri2_surf->wl_queue);
 
    while (dri2_surf->back == NULL) {
-      for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+      for (int i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
          /* Get an unlocked buffer, preferrably one with a dri_buffer
           * already allocated. */
          if (dri2_surf->color_buffers[i].locked)
@@ -370,8 +407,13 @@
          break;
 
       /* If we don't have a buffer, then block on the server to release one for
-       * us, and try again. */
-      if (wl_display_dispatch_queue(dri2_dpy->wl_dpy, dri2_surf->wl_queue) < 0)
+       * us, and try again. wl_display_dispatch_queue will process any pending
+       * events, however not all servers flush on issuing a buffer release
+       * event. So, we spam the server with roundtrips as they always cause a
+       * client flush.
+       */
+      if (wl_display_roundtrip_queue(dri2_dpy->wl_dpy,
+                                     dri2_surf->wl_queue) < 0)
           return -1;
    }
 
@@ -382,33 +424,66 @@
 
    if (dri2_dpy->is_different_gpu &&
        dri2_surf->back->linear_copy == NULL) {
-       dri2_surf->back->linear_copy =
-          dri2_dpy->image->createImage(dri2_dpy->dri_screen,
-                                      dri2_surf->base.Width,
-                                      dri2_surf->base.Height,
-                                      dri_image_format,
-                                      use_flags |
-                                      __DRI_IMAGE_USE_LINEAR,
-                                      NULL);
+      /* The LINEAR modifier should be a perfect alias of the LINEAR use
+       * flag; try the new interface first before the old, then fall back. */
+      if (dri2_dpy->image->base.version >= 15 &&
+           dri2_dpy->image->createImageWithModifiers) {
+         uint64_t linear_mod = DRM_FORMAT_MOD_LINEAR;
+
+         dri2_surf->back->linear_copy =
+            dri2_dpy->image->createImageWithModifiers(dri2_dpy->dri_screen,
+                                                      dri2_surf->base.Width,
+                                                      dri2_surf->base.Height,
+                                                      dri_image_format,
+                                                      &linear_mod,
+                                                      1,
+                                                      NULL);
+      } else {
+         dri2_surf->back->linear_copy =
+            dri2_dpy->image->createImage(dri2_dpy->dri_screen,
+                                         dri2_surf->base.Width,
+                                         dri2_surf->base.Height,
+                                         dri_image_format,
+                                         use_flags |
+                                         __DRI_IMAGE_USE_LINEAR,
+                                         NULL);
+      }
       if (dri2_surf->back->linear_copy == NULL)
           return -1;
    }
 
    if (dri2_surf->back->dri_image == NULL) {
-      dri2_surf->back->dri_image =
-         dri2_dpy->image->createImage(dri2_dpy->dri_screen,
-                                      dri2_surf->base.Width,
-                                      dri2_surf->base.Height,
-                                      dri_image_format,
-                                      dri2_dpy->is_different_gpu ?
-                                         0 : use_flags,
-                                      NULL);
+      /* Use createImageWithModifiers when modifiers were advertised for
+       * this format and the DRIImage extension provides the entry point.
+       * Otherwise fall back to the old createImage, and hope it allocates
+       * an image which is acceptable to the winsys. */
+      if (num_modifiers && dri2_dpy->image->base.version >= 15 &&
+          dri2_dpy->image->createImageWithModifiers) {
+         dri2_surf->back->dri_image =
+           dri2_dpy->image->createImageWithModifiers(dri2_dpy->dri_screen,
+                                                     dri2_surf->base.Width,
+                                                     dri2_surf->base.Height,
+                                                     dri_image_format,
+                                                     modifiers,
+                                                     num_modifiers,
+                                                     NULL);
+      } else {
+         dri2_surf->back->dri_image =
+            dri2_dpy->image->createImage(dri2_dpy->dri_screen,
+                                         dri2_surf->base.Width,
+                                         dri2_surf->base.Height,
+                                         dri_image_format,
+                                         dri2_dpy->is_different_gpu ?
+                                              0 : use_flags,
+                                         NULL);
+      }
+
       dri2_surf->back->age = 0;
    }
    if (dri2_surf->back->dri_image == NULL)
       return -1;
 
-   dri2_surf->back->locked = 1;
+   dri2_surf->back->locked = true;
 
    return 0;
 }
@@ -436,7 +511,7 @@
 
 static int
 get_aux_bo(struct dri2_egl_surface *dri2_surf,
-	   unsigned int attachment, unsigned int format, __DRIbuffer *buffer)
+           unsigned int attachment, unsigned int format, __DRIbuffer *buffer)
 {
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
@@ -444,9 +519,9 @@
 
    if (b == NULL) {
       b = dri2_dpy->dri2->allocateBuffer(dri2_dpy->dri_screen,
-					 attachment, format,
-					 dri2_surf->base.Width,
-					 dri2_surf->base.Height);
+                                         attachment, format,
+                                         dri2_surf->base.Width,
+                                         dri2_surf->base.Height);
       dri2_surf->dri_buffers[attachment] = b;
    }
    if (b == NULL)
@@ -462,7 +537,6 @@
 {
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
-   int i;
 
    if (dri2_surf->base.Width != dri2_surf->wl_win->width ||
        dri2_surf->base.Height != dri2_surf->wl_win->height) {
@@ -483,7 +557,7 @@
    /* If we have an extra unlocked buffer at this point, we had to do triple
     * buffering for a while, but now can go back to just double buffering.
     * That means we can free any unlocked buffer now. */
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+   for (int i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
       if (!dri2_surf->color_buffers[i].locked &&
           dri2_surf->color_buffers[i].wl_buffer) {
          wl_buffer_destroy(dri2_surf->color_buffers[i].wl_buffer);
@@ -515,20 +589,20 @@
       switch (attachments[i]) {
       case __DRI_BUFFER_BACK_LEFT:
          back_bo_to_dri_buffer(dri2_surf, &dri2_surf->buffers[j]);
-	 break;
+         break;
       default:
-	 if (get_aux_bo(dri2_surf, attachments[i], attachments[i + 1],
-			&dri2_surf->buffers[j]) < 0) {
-	    _eglError(EGL_BAD_ALLOC, "failed to allocate aux buffer");
-	    return NULL;
-	 }
-	 break;
+         if (get_aux_bo(dri2_surf, attachments[i], attachments[i + 1],
+                        &dri2_surf->buffers[j]) < 0) {
+            _eglError(EGL_BAD_ALLOC, "failed to allocate aux buffer");
+            return NULL;
+         }
+         break;
       }
    }
 
    *out_count = j;
    if (j == 0)
-	   return NULL;
+      return NULL;
 
    *width = dri2_surf->base.Width;
    *height = dri2_surf->base.Height;
@@ -547,8 +621,6 @@
    __DRIbuffer *buffer;
    unsigned int bpp;
 
-   int i;
-
    switch (dri2_surf->format) {
    case WL_DRM_FORMAT_ARGB8888:
    case WL_DRM_FORMAT_XRGB8888:
@@ -568,7 +640,7 @@
       return NULL;
    }
 
-   for (i = 0; i < count; ++i) {
+   for (int i = 0; i < count; ++i) {
       attachments_with_format[2*i] = attachments[i];
       attachments_with_format[2*i + 1] = bpp;
    }
@@ -640,51 +712,156 @@
    .done = wayland_throttle_callback
 };
 
-static void
-create_wl_buffer(struct dri2_egl_surface *dri2_surf)
+/**
+ * Wrap a DRI image in a wl_buffer.
+ *
+ * zwp_linux_dmabuf_v1 is used when the display has it and the image reports
+ * a modifier other than DRM_FORMAT_MOD_INVALID; every plane is added to the
+ * params object before the buffer is created. Otherwise the buffer goes
+ * through wl_drm, by fd if WL_DRM_CAPABILITY_PRIME is set and by name if
+ * not; both wl_drm paths reject images with more than one plane.
+ *
+ * dri2_surf may be NULL. When it is set, its event queue is put on the
+ * dmabuf params and its wl_drm wrapper is used instead of the display's
+ * wl_drm.
+ *
+ * Returns the wl_buffer from the create request, or NULL when a required
+ * image attribute cannot be queried, a plane image or the params object
+ * cannot be obtained, or the image has too many planes for wl_drm.
+ */
+static struct wl_buffer *
+create_wl_buffer(struct dri2_egl_display *dri2_dpy,
+                 struct dri2_egl_surface *dri2_surf,
+                 __DRIimage *image)
 {
-   struct dri2_egl_display *dri2_dpy =
-      dri2_egl_display(dri2_surf->base.Resource.Display);
-   __DRIimage *image;
-   int fd, stride, name;
+   struct wl_buffer *ret;
+   EGLBoolean query;
+   int width, height, fourcc, num_planes;
+   uint64_t modifier = DRM_FORMAT_MOD_INVALID;
 
-   if (dri2_surf->current->wl_buffer != NULL)
-      return;
+   query = dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_WIDTH, &width);
+   query &= dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_HEIGHT,
+                                        &height);
+   query &= dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FOURCC,
+                                        &fourcc);
+   if (!query)
+      return NULL;
 
-   if (dri2_dpy->is_different_gpu) {
-      image = dri2_surf->current->linear_copy;
-   } else {
-      image = dri2_surf->current->dri_image;
+   query = dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_NUM_PLANES,
+                                       &num_planes);
+   if (!query)
+      num_planes = 1;
+
+   if (dri2_dpy->image->base.version >= 15) {
+      int mod_hi, mod_lo;
+
+      query = dri2_dpy->image->queryImage(image,
+                                          __DRI_IMAGE_ATTRIB_MODIFIER_UPPER,
+                                          &mod_hi);
+      query &= dri2_dpy->image->queryImage(image,
+                                           __DRI_IMAGE_ATTRIB_MODIFIER_LOWER,
+                                           &mod_lo);
+      if (query) {
+         modifier = (uint64_t) mod_hi << 32;
+         modifier |= (uint64_t) (mod_lo & 0xffffffff);
+      }
    }
-   if (dri2_dpy->capabilities & WL_DRM_CAPABILITY_PRIME) {
+
+   if (dri2_dpy->wl_dmabuf && modifier != DRM_FORMAT_MOD_INVALID) {
+      struct zwp_linux_buffer_params_v1 *params;
+      int i;
+
+      /* We don't need a wrapper for wl_dmabuf objects, because we have to
+       * create the intermediate params object; we can set the queue on this,
+       * and the wl_buffer inherits it race-free. */
+      params = zwp_linux_dmabuf_v1_create_params(dri2_dpy->wl_dmabuf);
+      /* Proxy creation can fail; params is dereferenced below. */
+      if (!params)
+         return NULL;
+      if (dri2_surf)
+         wl_proxy_set_queue((struct wl_proxy *) params, dri2_surf->wl_queue);
+
+      for (i = 0; i < num_planes; i++) {
+         __DRIimage *p_image;
+         int stride, offset;
+         int fd = -1;
+
+         if (i == 0)
+            p_image = image;
+         else
+            p_image = dri2_dpy->image->fromPlanar(image, i, NULL);
+         if (!p_image) {
+            zwp_linux_buffer_params_v1_destroy(params);
+            return NULL;
+         }
+
+         query = dri2_dpy->image->queryImage(p_image,
+                                             __DRI_IMAGE_ATTRIB_FD,
+                                             &fd);
+         query &= dri2_dpy->image->queryImage(p_image,
+                                              __DRI_IMAGE_ATTRIB_STRIDE,
+                                              &stride);
+         query &= dri2_dpy->image->queryImage(p_image,
+                                              __DRI_IMAGE_ATTRIB_OFFSET,
+                                              &offset);
+         if (image != p_image)
+            dri2_dpy->image->destroyImage(p_image);
+
+         if (!query) {
+            if (fd >= 0)
+               close(fd);
+            zwp_linux_buffer_params_v1_destroy(params);
+            return NULL;
+         }
+
+         zwp_linux_buffer_params_v1_add(params, fd, i, offset, stride,
+                                        modifier >> 32, modifier & 0xffffffff);
+         close(fd);
+      }
+
+      ret = zwp_linux_buffer_params_v1_create_immed(params, width, height,
+                                                    fourcc, 0);
+      zwp_linux_buffer_params_v1_destroy(params);
+   } else if (dri2_dpy->capabilities & WL_DRM_CAPABILITY_PRIME) {
+      struct wl_drm *wl_drm =
+         dri2_surf ? dri2_surf->wl_drm_wrapper : dri2_dpy->wl_drm;
+      int fd = -1, stride;
+
+      if (num_planes > 1)
+         return NULL;
+
-      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &fd);
-      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &stride);
-
-      dri2_surf->current->wl_buffer =
-         wl_drm_create_prime_buffer(dri2_surf->wl_drm_wrapper,
-                                    fd,
-                                    dri2_surf->base.Width,
-                                    dri2_surf->base.Height,
-                                    dri2_surf->format,
-                                    0, stride,
-                                    0, 0,
-                                    0, 0);
+      /* fd and stride are only usable if both queries succeed. */
+      query = dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &fd);
+      query &= dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE,
+                                           &stride);
+      if (!query) {
+         if (fd >= 0)
+            close(fd);
+         return NULL;
+      }
+
+      ret = wl_drm_create_prime_buffer(wl_drm, fd, width, height, fourcc, 0,
+                                       stride, 0, 0, 0, 0);
       close(fd);
    } else {
+      struct wl_drm *wl_drm =
+         dri2_surf ? dri2_surf->wl_drm_wrapper : dri2_dpy->wl_drm;
+      int name, stride;
+
+      if (num_planes > 1)
+         return NULL;
+
-      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_NAME, &name);
-      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &stride);
-
-      dri2_surf->current->wl_buffer =
-         wl_drm_create_buffer(dri2_surf->wl_drm_wrapper,
-                              name,
-                              dri2_surf->base.Width,
-                              dri2_surf->base.Height,
-                              stride,
-                              dri2_surf->format);
+      query = dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_NAME,
+                                          &name);
+      query &= dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE,
+                                           &stride);
+      if (!query)
+         return NULL;
+
+      ret = wl_drm_create_buffer(wl_drm, name, width, height, stride, fourcc);
    }
 
-   wl_buffer_add_listener(dri2_surf->current->wl_buffer,
-                          &wl_buffer_listener, dri2_surf);
+   return ret;
 }
 
 static EGLBoolean
@@ -692,13 +836,11 @@
                   const EGLint *rects,
                   EGLint n_rects)
 {
-   int i;
-
    if (wl_proxy_get_version((struct wl_proxy *) dri2_surf->wl_surface_wrapper)
        < WL_SURFACE_DAMAGE_BUFFER_SINCE_VERSION)
       return EGL_FALSE;
 
-   for (i = 0; i < n_rects; i++) {
+   for (int i = 0; i < n_rects; i++) {
       const int *rect = &rects[i * 4];
 
       wl_surface_damage_buffer(dri2_surf->wl_surface_wrapper,
@@ -720,23 +862,20 @@
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    struct dri2_egl_surface *dri2_surf = dri2_egl_surface(draw);
-   int i;
 
    while (dri2_surf->throttle_callback != NULL)
       if (wl_display_dispatch_queue(dri2_dpy->wl_dpy,
                                     dri2_surf->wl_queue) == -1)
          return -1;
 
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++)
+   for (int i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++)
       if (dri2_surf->color_buffers[i].age > 0)
          dri2_surf->color_buffers[i].age++;
 
    /* Make sure we have a back buffer in case we're swapping without ever
     * rendering. */
-   if (get_back_bo(dri2_surf) < 0) {
-      _eglError(EGL_BAD_ALLOC, "dri2_swap_buffers");
-      return EGL_FALSE;
-   }
+   if (get_back_bo(dri2_surf) < 0)
+      return _eglError(EGL_BAD_ALLOC, "dri2_swap_buffers");
 
    if (draw->SwapInterval > 0) {
       dri2_surf->throttle_callback =
@@ -749,7 +888,20 @@
    dri2_surf->current = dri2_surf->back;
    dri2_surf->back = NULL;
 
-   create_wl_buffer(dri2_surf);
+   if (!dri2_surf->current->wl_buffer) {
+      __DRIimage *image;
+
+      if (dri2_dpy->is_different_gpu)
+         image = dri2_surf->current->linear_copy;
+      else
+         image = dri2_surf->current->dri_image;
+
+      dri2_surf->current->wl_buffer =
+         create_wl_buffer(dri2_dpy, dri2_surf, image);
+
+      wl_buffer_add_listener(dri2_surf->current->wl_buffer,
+                             &wl_buffer_listener, dri2_surf);
+   }
 
    wl_surface_attach(dri2_surf->wl_surface_wrapper,
                      dri2_surf->current->wl_buffer,
@@ -829,62 +981,30 @@
    struct dri2_egl_image *dri2_img = dri2_egl_image(img);
    __DRIimage *image = dri2_img->dri_image;
    struct wl_buffer *buffer;
-   int width, height, format, pitch;
-   enum wl_drm_format wl_format;
+   int format;
 
    dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FORMAT, &format);
-
    switch (format) {
    case __DRI_IMAGE_FORMAT_ARGB8888:
       if (!(dri2_dpy->formats & HAS_ARGB8888))
          goto bad_format;
-      wl_format = WL_DRM_FORMAT_ARGB8888;
       break;
    case __DRI_IMAGE_FORMAT_XRGB8888:
       if (!(dri2_dpy->formats & HAS_XRGB8888))
          goto bad_format;
-      wl_format = WL_DRM_FORMAT_XRGB8888;
       break;
    default:
       goto bad_format;
    }
 
-   dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_WIDTH, &width);
-   dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_HEIGHT, &height);
-   dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &pitch);
-
-   if (dri2_dpy->capabilities & WL_DRM_CAPABILITY_PRIME) {
-      int fd;
-
-      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &fd);
-
-      buffer =
-         wl_drm_create_prime_buffer(dri2_dpy->wl_drm,
-                                    fd,
-                                    width, height,
-                                    wl_format,
-                                    0, pitch,
-                                    0, 0,
-                                    0, 0);
-
-      close(fd);
-   } else {
-      int name;
-
-      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_NAME, &name);
-
-      buffer =
-         wl_drm_create_buffer(dri2_dpy->wl_drm,
-                              name,
-                              width, height,
-                              pitch,
-                              wl_format);
-   }
+   buffer = create_wl_buffer(dri2_dpy, NULL, image);
 
    /* The buffer object will have been created with our internal event queue
     * because it is using the wl_drm object as a proxy factory. We want the
     * buffer to be used by the application so we'll reset it to the display's
-    * default event queue */
+    * default event queue. This isn't actually racy, as the only event the
+    * buffer can get is a buffer release, which doesn't happen with an explicit
+    * attach. */
    if (buffer)
       wl_proxy_set_queue((struct wl_proxy *) buffer, NULL);
 
@@ -906,7 +1026,7 @@
                             "authenticate for render-nodes");
       return 0;
    }
-   dri2_dpy->authenticated = 0;
+   dri2_dpy->authenticated = false;
 
    wl_drm_authenticate(dri2_dpy->wl_drm, id);
    if (roundtrip(dri2_dpy) < 0)
@@ -916,7 +1036,7 @@
       ret = -1;
 
    /* reset authenticated */
-   dri2_dpy->authenticated = 1;
+   dri2_dpy->authenticated = true;
 
    return ret;
 }
@@ -934,12 +1054,12 @@
    dri2_dpy->fd = loader_open_device(dri2_dpy->device_name);
    if (dri2_dpy->fd == -1) {
       _eglLog(_EGL_WARNING, "wayland-egl: could not open %s (%s)",
-	      dri2_dpy->device_name, strerror(errno));
+              dri2_dpy->device_name, strerror(errno));
       return;
    }
 
    if (drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER) {
-      dri2_dpy->authenticated = 1;
+      dri2_dpy->authenticated = true;
    } else {
       drmGetMagic(dri2_dpy->fd, &magic);
       wl_drm_authenticate(dri2_dpy->wl_drm, magic);
@@ -977,7 +1097,7 @@
 {
    struct dri2_egl_display *dri2_dpy = data;
 
-   dri2_dpy->authenticated = 1;
+   dri2_dpy->authenticated = true;
 }
 
 static const struct wl_drm_listener drm_listener = {
@@ -988,23 +1108,76 @@
 };
 
 static void
-registry_handle_global_drm(void *data, struct wl_registry *registry, uint32_t name,
-		       const char *interface, uint32_t version)
+dmabuf_ignore_format(void *data, struct zwp_linux_dmabuf_v1 *dmabuf,
+                     uint32_t format)
+{
+   /* formats are implicitly advertised by the 'modifier' event, so ignore */
+}
+
+static void
+dmabuf_handle_modifier(void *data, struct zwp_linux_dmabuf_v1 *dmabuf,
+                       uint32_t format, uint32_t modifier_hi,
+                       uint32_t modifier_lo)
+{
+   struct dri2_egl_display *dri2_dpy = data;
+   uint64_t *mod = NULL;
+
+   if (modifier_hi == (DRM_FORMAT_MOD_INVALID >> 32) &&
+       modifier_lo == (DRM_FORMAT_MOD_INVALID & 0xffffffff))
+      return;
+
+   switch (format) {
+   case WL_DRM_FORMAT_ARGB8888:
+      mod = u_vector_add(&dri2_dpy->wl_modifiers.argb8888);
+      dri2_dpy->formats |= HAS_ARGB8888;
+      break;
+   case WL_DRM_FORMAT_XRGB8888:
+      mod = u_vector_add(&dri2_dpy->wl_modifiers.xrgb8888);
+      dri2_dpy->formats |= HAS_XRGB8888;
+      break;
+   case WL_DRM_FORMAT_RGB565:
+      mod = u_vector_add(&dri2_dpy->wl_modifiers.rgb565);
+      dri2_dpy->formats |= HAS_RGB565;
+      break;
+   default:
+      break;
+   }
+
+   if (!mod)
+      return;
+
+   *mod = (uint64_t) modifier_hi << 32;
+   *mod |= (uint64_t) (modifier_lo & 0xffffffff);
+}
+
+static const struct zwp_linux_dmabuf_v1_listener dmabuf_listener = {
+   .format = dmabuf_ignore_format,
+   .modifier = dmabuf_handle_modifier,
+};
+
+static void
+registry_handle_global_drm(void *data, struct wl_registry *registry,
+                           uint32_t name, const char *interface,
+                           uint32_t version)
 {
    struct dri2_egl_display *dri2_dpy = data;
 
-   if (version > 1)
-      version = 2;
    if (strcmp(interface, "wl_drm") == 0) {
       dri2_dpy->wl_drm =
-         wl_registry_bind(registry, name, &wl_drm_interface, version);
+         wl_registry_bind(registry, name, &wl_drm_interface, MIN2(version, 2));
       wl_drm_add_listener(dri2_dpy->wl_drm, &drm_listener, dri2_dpy);
+   } else if (strcmp(interface, "zwp_linux_dmabuf_v1") == 0 && version >= 3) {
+      dri2_dpy->wl_dmabuf =
+         wl_registry_bind(registry, name, &zwp_linux_dmabuf_v1_interface,
+                          MIN2(version, 3));
+      zwp_linux_dmabuf_v1_add_listener(dri2_dpy->wl_dmabuf, &dmabuf_listener,
+                                       dri2_dpy);
    }
 }
 
 static void
 registry_handle_global_remove(void *data, struct wl_registry *registry,
-			      uint32_t name)
+                              uint32_t name)
 {
 }
 
@@ -1067,7 +1240,7 @@
    }
 }
 
-static struct dri2_egl_display_vtbl dri2_wl_display_vtbl = {
+static const struct dri2_egl_display_vtbl dri2_wl_display_vtbl = {
    .authenticate = dri2_wl_authenticate,
    .create_window_surface = dri2_wl_create_window_surface,
    .create_pixmap_surface = dri2_wl_create_pixmap_surface,
@@ -1078,6 +1251,7 @@
    .swap_buffers = dri2_wl_swap_buffers,
    .swap_buffers_with_damage = dri2_wl_swap_buffers_with_damage,
    .swap_buffers_region = dri2_fallback_swap_buffers_region,
+   .set_damage_region = dri2_fallback_set_damage_region,
    .post_sub_buffer = dri2_fallback_post_sub_buffer,
    .copy_buffers = dri2_fallback_copy_buffers,
    .query_buffer_age = dri2_wl_query_buffer_age,
@@ -1115,11 +1289,10 @@
       { "RGB565",   HAS_RGB565,   { 0x00f800, 0x07e0, 0x001f, 0 } },
    };
    unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 };
-   unsigned int count, i, j;
+   unsigned int count = 0;
 
-   count = 0;
-   for (i = 0; dri2_dpy->driver_configs[i]; i++) {
-      for (j = 0; j < ARRAY_SIZE(visuals); j++) {
+   for (unsigned i = 0; dri2_dpy->driver_configs[i]; i++) {
+      for (unsigned j = 0; j < ARRAY_SIZE(visuals); j++) {
          struct dri2_egl_config *dri2_conf;
 
          if (!(dri2_dpy->formats & visuals[j].has_format))
@@ -1135,7 +1308,7 @@
       }
    }
 
-   for (i = 0; i < ARRAY_SIZE(format_count); i++) {
+   for (unsigned i = 0; i < ARRAY_SIZE(format_count); i++) {
       if (!format_count[i]) {
          _eglLog(_EGL_DEBUG, "No DRI config supports native format %s",
                  visuals[i].format_name);
@@ -1156,21 +1329,28 @@
    if (!dri2_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
+   dri2_dpy->fd = -1;
    disp->DriverData = (void *) dri2_dpy;
    if (disp->PlatformDisplay == NULL) {
       dri2_dpy->wl_dpy = wl_display_connect(NULL);
       if (dri2_dpy->wl_dpy == NULL)
-         goto cleanup_dpy;
-      dri2_dpy->own_device = 1;
+         goto cleanup;
+      dri2_dpy->own_device = true;
    } else {
       dri2_dpy->wl_dpy = disp->PlatformDisplay;
    }
 
+   if (!u_vector_init(&dri2_dpy->wl_modifiers.xrgb8888, sizeof(uint64_t), 32) ||
+       !u_vector_init(&dri2_dpy->wl_modifiers.argb8888, sizeof(uint64_t), 32) ||
+       !u_vector_init(&dri2_dpy->wl_modifiers.rgb565, sizeof(uint64_t), 32)) {
+      goto cleanup;
+   }
+
    dri2_dpy->wl_queue = wl_display_create_queue(dri2_dpy->wl_dpy);
 
    dri2_dpy->wl_dpy_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_dpy);
    if (dri2_dpy->wl_dpy_wrapper == NULL)
-      goto cleanup_dpy_wrapper;
+      goto cleanup;
 
    wl_proxy_set_queue((struct wl_proxy *) dri2_dpy->wl_dpy_wrapper,
                       dri2_dpy->wl_queue);
@@ -1182,13 +1362,13 @@
    wl_registry_add_listener(dri2_dpy->wl_registry,
                             &registry_listener_drm, dri2_dpy);
    if (roundtrip(dri2_dpy) < 0 || dri2_dpy->wl_drm == NULL)
-      goto cleanup_registry;
+      goto cleanup;
 
    if (roundtrip(dri2_dpy) < 0 || dri2_dpy->fd == -1)
-      goto cleanup_drm;
+      goto cleanup;
 
    if (roundtrip(dri2_dpy) < 0 || !dri2_dpy->authenticated)
-      goto cleanup_fd;
+      goto cleanup;
 
    dri2_dpy->fd = loader_get_user_preferred_fd(dri2_dpy->fd,
                                                &dri2_dpy->is_different_gpu);
@@ -1198,7 +1378,7 @@
       if (!dri2_dpy->device_name) {
          _eglError(EGL_BAD_ALLOC, "wayland-egl: failed to get device name "
                                   "for requested GPU");
-         goto cleanup_fd;
+         goto cleanup;
       }
    }
 
@@ -1211,21 +1391,32 @@
    dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd);
    if (dri2_dpy->driver_name == NULL) {
       _eglError(EGL_BAD_ALLOC, "DRI2: failed to get driver name");
-      goto cleanup_fd;
+      goto cleanup;
    }
 
-   if (!dri2_load_driver(disp))
-      goto cleanup_driver_name;
-
    /* render nodes cannot use Gem names, and thus do not support
     * the __DRI_DRI2_LOADER extension */
-   if (!dri2_dpy->is_render_node)
+   if (!dri2_dpy->is_render_node) {
       dri2_dpy->loader_extensions = dri2_loader_extensions;
-   else
+      if (!dri2_load_driver(disp)) {
+         _eglError(EGL_BAD_ALLOC, "DRI2: failed to load driver");
+         goto cleanup;
+      }
+   } else {
       dri2_dpy->loader_extensions = image_loader_extensions;
+      if (!dri2_load_driver_dri3(disp)) {
+         _eglError(EGL_BAD_ALLOC, "DRI3: failed to load driver");
+         goto cleanup;
+      }
+   }
 
    if (!dri2_create_screen(disp))
-      goto cleanup_driver;
+      goto cleanup;
+
+   if (!dri2_setup_extensions(disp))
+      goto cleanup;
+
+   dri2_setup_screen(disp);
 
    dri2_wl_setup_swap_interval(dri2_dpy);
 
@@ -1243,7 +1434,7 @@
    if (dri2_dpy->is_render_node &&
        !(dri2_dpy->capabilities & WL_DRM_CAPABILITY_PRIME)) {
       _eglLog(_EGL_WARNING, "wayland-egl: display is not render-node capable");
-      goto cleanup_screen;
+      goto cleanup;
    }
 
    if (dri2_dpy->is_different_gpu &&
@@ -1253,12 +1444,12 @@
                             "Image extension in the driver is not "
                             "compatible. Version 9 or later and blitImage() "
                             "are required");
-      goto cleanup_screen;
+      goto cleanup;
    }
 
    if (!dri2_wl_add_configs_for_visuals(drv, disp)) {
       _eglError(EGL_NOT_INITIALIZED, "DRI2: failed to add configs");
-      goto cleanup_screen;
+      goto cleanup;
    }
 
    dri2_set_WL_bind_wayland_display(drv, disp);
@@ -1266,12 +1457,9 @@
     * because the buffer of the EGLImage has likely a tiling mode the server
     * gpu won't support. These is no way to check for now. Thus do not support the
     * extension */
-   if (!dri2_dpy->is_different_gpu) {
+   if (!dri2_dpy->is_different_gpu)
       disp->Extensions.WL_create_wayland_buffer_from_image = EGL_TRUE;
-   } else {
-      dri2_wl_display_vtbl.create_wayland_buffer_from_image =
-         dri2_fallback_create_wayland_buffer_from_image;
-   }
+
    disp->Extensions.EXT_buffer_age = EGL_TRUE;
 
    disp->Extensions.EXT_swap_buffers_with_damage = EGL_TRUE;
@@ -1283,28 +1471,8 @@
 
    return EGL_TRUE;
 
- cleanup_screen:
-   dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
- cleanup_driver:
-   dlclose(dri2_dpy->driver);
- cleanup_driver_name:
-   free(dri2_dpy->driver_name);
- cleanup_fd:
-   close(dri2_dpy->fd);
- cleanup_drm:
-   free(dri2_dpy->device_name);
-   wl_drm_destroy(dri2_dpy->wl_drm);
- cleanup_registry:
-   wl_registry_destroy(dri2_dpy->wl_registry);
-   wl_proxy_wrapper_destroy(dri2_dpy->wl_dpy_wrapper);
- cleanup_dpy_wrapper:
-   wl_event_queue_destroy(dri2_dpy->wl_queue);
-   if (disp->PlatformDisplay == NULL)
-      wl_display_disconnect(dri2_dpy->wl_dpy);
- cleanup_dpy:
-   free(dri2_dpy);
-   disp->DriverData = NULL;
-
+ cleanup:
+   dri2_display_destroy(disp);
    return EGL_FALSE;
 }
 
@@ -1476,7 +1644,6 @@
 {
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
-   int i;
 
    /* we need to do the following operations only once per frame */
    if (dri2_surf->back)
@@ -1500,7 +1667,7 @@
    wl_display_dispatch_queue_pending(dri2_dpy->wl_dpy, dri2_surf->wl_queue);
 
    /* try get free buffer already created */
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+   for (int i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
       if (!dri2_surf->color_buffers[i].locked &&
           dri2_surf->color_buffers[i].wl_buffer) {
           dri2_surf->back = &dri2_surf->color_buffers[i];
@@ -1510,7 +1677,7 @@
 
    /* else choose any another free location */
    if (!dri2_surf->back) {
-      for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+      for (int i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
          if (!dri2_surf->color_buffers[i].locked) {
              dri2_surf->back = &dri2_surf->color_buffers[i];
              if (!dri2_wl_swrast_allocate_buffer(dri2_surf,
@@ -1535,12 +1702,12 @@
       return -1;
    }
 
-   dri2_surf->back->locked = 1;
+   dri2_surf->back->locked = true;
 
    /* If we have an extra unlocked buffer at this point, we had to do triple
     * buffering for a while, but now can go back to just double buffering.
     * That means we can free any unlocked buffer now. */
-   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+   for (int i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
       if (!dri2_surf->color_buffers[i].locked &&
           dri2_surf->color_buffers[i].wl_buffer) {
          wl_buffer_destroy(dri2_surf->color_buffers[i].wl_buffer);
@@ -1756,8 +1923,9 @@
 };
 
 static void
-registry_handle_global_swrast(void *data, struct wl_registry *registry, uint32_t name,
-                              const char *interface, uint32_t version)
+registry_handle_global_swrast(void *data, struct wl_registry *registry,
+                              uint32_t name, const char *interface,
+                              uint32_t version)
 {
    struct dri2_egl_display *dri2_dpy = data;
 
@@ -1773,7 +1941,7 @@
    .global_remove = registry_handle_global_remove
 };
 
-static struct dri2_egl_display_vtbl dri2_wl_swrast_display_vtbl = {
+static const struct dri2_egl_display_vtbl dri2_wl_swrast_display_vtbl = {
    .authenticate = NULL,
    .create_window_surface = dri2_wl_create_window_surface,
    .create_pixmap_surface = dri2_wl_create_pixmap_surface,
@@ -1817,12 +1985,13 @@
    if (!dri2_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
+   dri2_dpy->fd = -1;
    disp->DriverData = (void *) dri2_dpy;
    if (disp->PlatformDisplay == NULL) {
       dri2_dpy->wl_dpy = wl_display_connect(NULL);
       if (dri2_dpy->wl_dpy == NULL)
-         goto cleanup_dpy;
-      dri2_dpy->own_device = 1;
+         goto cleanup;
+      dri2_dpy->own_device = true;
    } else {
       dri2_dpy->wl_dpy = disp->PlatformDisplay;
    }
@@ -1831,7 +2000,7 @@
 
    dri2_dpy->wl_dpy_wrapper = wl_proxy_create_wrapper(dri2_dpy->wl_dpy);
    if (dri2_dpy->wl_dpy_wrapper == NULL)
-      goto cleanup_dpy_wrapper;
+      goto cleanup;
 
    wl_proxy_set_queue((struct wl_proxy *) dri2_dpy->wl_dpy_wrapper,
                       dri2_dpy->wl_queue);
@@ -1844,26 +2013,30 @@
                             &registry_listener_swrast, dri2_dpy);
 
    if (roundtrip(dri2_dpy) < 0 || dri2_dpy->wl_shm == NULL)
-      goto cleanup_registry;
+      goto cleanup;
 
    if (roundtrip(dri2_dpy) < 0 || dri2_dpy->formats == 0)
-      goto cleanup_shm;
+      goto cleanup;
 
-   dri2_dpy->fd = -1;
    dri2_dpy->driver_name = strdup("swrast");
    if (!dri2_load_driver_swrast(disp))
-      goto cleanup_shm;
+      goto cleanup;
 
    dri2_dpy->loader_extensions = swrast_loader_extensions;
 
    if (!dri2_create_screen(disp))
-      goto cleanup_driver;
+      goto cleanup;
+
+   if (!dri2_setup_extensions(disp))
+      goto cleanup;
+
+   dri2_setup_screen(disp);
 
    dri2_wl_setup_swap_interval(dri2_dpy);
 
    if (!dri2_wl_add_configs_for_visuals(drv, disp)) {
       _eglError(EGL_NOT_INITIALIZED, "DRI2: failed to add configs");
-      goto cleanup_screen;
+      goto cleanup;
    }
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
@@ -1873,23 +2046,8 @@
 
    return EGL_TRUE;
 
- cleanup_screen:
-   dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
- cleanup_driver:
-   dlclose(dri2_dpy->driver);
- cleanup_shm:
-   wl_shm_destroy(dri2_dpy->wl_shm);
- cleanup_registry:
-   wl_registry_destroy(dri2_dpy->wl_registry);
-   wl_proxy_wrapper_destroy(dri2_dpy->wl_dpy_wrapper);
- cleanup_dpy_wrapper:
-   wl_event_queue_destroy(dri2_dpy->wl_queue);
-   if (disp->PlatformDisplay == NULL)
-      wl_display_disconnect(dri2_dpy->wl_dpy);
- cleanup_dpy:
-   free(dri2_dpy);
-   disp->DriverData = NULL;
-
+ cleanup:
+   dri2_display_destroy(disp);
    return EGL_FALSE;
 }
 
diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index ea21355..6cb7b20 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -40,6 +40,7 @@
 #endif
 #include <sys/types.h>
 #include <sys/stat.h>
+#include "util/macros.h"
 
 #include "egl_dri2.h"
 #include "egl_dri2_fallbacks.h"
@@ -443,14 +444,12 @@
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
    xcb_rectangle_t rectangle;
-   unsigned i;
 
-   dri2_surf->buffer_count = count;
-   dri2_surf->have_fake_front = 0;
+   dri2_surf->have_fake_front = false;
 
    /* This assumes the DRI2 buffer attachment tokens matches the
     * __DRIbuffer tokens. */
-   for (i = 0; i < count; i++) {
+   for (unsigned i = 0; i < count; i++) {
       dri2_surf->buffers[i].attachment = buffers[i].attachment;
       dri2_surf->buffers[i].name = buffers[i].name;
       dri2_surf->buffers[i].pitch = buffers[i].pitch;
@@ -463,7 +462,7 @@
        * Note that EGL doesn't require that several clients rendering
        * to the same window must see the same aux buffers. */
       if (dri2_surf->buffers[i].attachment == __DRI_BUFFER_FAKE_FRONT_LEFT)
-         dri2_surf->have_fake_front = 1;
+         dri2_surf->have_fake_front = true;
    }
 
    if (dri2_surf->region != XCB_NONE)
@@ -647,6 +646,7 @@
        error != NULL || xfixes_query->major_version < 2) {
       _eglLog(_EGL_WARNING, "DRI2: failed to query xfixes version");
       free(error);
+      free(xfixes_query);
       return EGL_FALSE;
    }
    free(xfixes_query);
@@ -731,17 +731,10 @@
 {
    xcb_depth_iterator_t d;
    xcb_visualtype_t *visuals;
-   int i, j, count;
-   unsigned int rgba_masks[4];
+   int config_count = 0;
    EGLint surface_type;
-   EGLint config_attrs[] = {
-	   EGL_NATIVE_VISUAL_ID,   0,
-	   EGL_NATIVE_VISUAL_TYPE, 0,
-	   EGL_NONE
-   };
 
    d = xcb_screen_allowed_depths_iterator(dri2_dpy->screen);
-   count = 0;
 
    surface_type =
       EGL_WINDOW_BIT |
@@ -755,27 +748,36 @@
       EGLBoolean class_added[6] = { 0, };
 
       visuals = xcb_depth_visuals(d.data);
-      for (i = 0; i < xcb_depth_visuals_length(d.data); i++) {
+
+      for (int i = 0; i < xcb_depth_visuals_length(d.data); i++) {
 	 if (class_added[visuals[i]._class])
 	    continue;
 
 	 class_added[visuals[i]._class] = EGL_TRUE;
-	 for (j = 0; dri2_dpy->driver_configs[j]; j++) {
+
+	 for (int j = 0; dri2_dpy->driver_configs[j]; j++) {
             struct dri2_egl_config *dri2_conf;
             const __DRIconfig *config = dri2_dpy->driver_configs[j];
 
-            config_attrs[1] = visuals[i].visual_id;
-            config_attrs[3] = visuals[i]._class;
+            const EGLint config_attrs[] = {
+                    EGL_NATIVE_VISUAL_ID,    visuals[i].visual_id,
+                    EGL_NATIVE_VISUAL_TYPE,  visuals[i]._class,
+                    EGL_NONE
+            };
 
-            rgba_masks[0] = visuals[i].red_mask;
-            rgba_masks[1] = visuals[i].green_mask;
-            rgba_masks[2] = visuals[i].blue_mask;
-            rgba_masks[3] = 0;
-            dri2_conf = dri2_add_config(disp, config, count + 1, surface_type,
-                                        config_attrs, rgba_masks);
+            unsigned int rgba_masks[4] = {
+               visuals[i].red_mask,
+               visuals[i].green_mask,
+               visuals[i].blue_mask,
+               0,
+            };
+
+            dri2_conf = dri2_add_config(disp, config, config_count + 1,
+                                        surface_type, config_attrs,
+                                        rgba_masks);
             if (dri2_conf)
-               if (dri2_conf->base.ConfigID == count + 1)
-                  count++;
+               if (dri2_conf->base.ConfigID == config_count + 1)
+                  config_count++;
 
             /* Allow a 24-bit RGB visual to match a 32-bit RGBA EGLConfig.
              * Otherwise it will only match a 32-bit RGBA visual.  On a
@@ -787,11 +789,12 @@
             if (d.data->depth == 24) {
                rgba_masks[3] =
                   ~(rgba_masks[0] | rgba_masks[1] | rgba_masks[2]);
-               dri2_conf = dri2_add_config(disp, config, count + 1, surface_type,
-                                           config_attrs, rgba_masks);
+               dri2_conf = dri2_add_config(disp, config, config_count + 1,
+                                           surface_type, config_attrs,
+                                           rgba_masks);
                if (dri2_conf)
-                  if (dri2_conf->base.ConfigID == count + 1)
-                     count++;
+                  if (dri2_conf->base.ConfigID == config_count + 1)
+                     config_count++;
             }
 	 }
       }
@@ -799,7 +802,7 @@
       xcb_depth_next(&d);
    }
 
-   if (!count) {
+   if (!config_count) {
       _eglLog(_EGL_WARNING, "DRI2: failed to create any config");
       return EGL_FALSE;
    }
@@ -820,8 +823,7 @@
    if (draw->Type == EGL_PIXMAP_BIT || draw->Type == EGL_PBUFFER_BIT)
       return EGL_TRUE;
 
-   if (dri2_dpy->flush)
-      dri2_dpy->flush->flush(dri2_surf->dri_drawable);
+   dri2_dpy->flush->flush(dri2_surf->dri_drawable);
 
    if (dri2_surf->have_fake_front)
       render_attachment = XCB_DRI2_ATTACHMENT_BUFFER_FAKE_FRONT_LEFT;
@@ -883,8 +885,7 @@
     * happened.  The driver should still be using the viewport hack to catch
     * window resizes.
     */
-   if (dri2_dpy->flush &&
-       dri2_dpy->flush->base.version >= 3 && dri2_dpy->flush->invalidate)
+   if (dri2_dpy->flush->base.version >= 3 && dri2_dpy->flush->invalidate)
       dri2_dpy->flush->invalidate(dri2_surf->dri_drawable);
 
    return swap_count;
@@ -896,19 +897,16 @@
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    struct dri2_egl_surface *dri2_surf = dri2_egl_surface(draw);
 
-   if (dri2_dpy->dri2) {
-      if (dri2_x11_swap_buffers_msc(drv, disp, draw, 0, 0, 0) != -1) {
-          return EGL_TRUE;
-      }
-      /* Swap failed with a window drawable. */
-      _eglError(EGL_BAD_NATIVE_WINDOW, __func__);
-      return EGL_FALSE;
-   } else {
-      assert(dri2_dpy->swrast);
-
+   if (!dri2_dpy->flush) {
       dri2_dpy->core->swapBuffers(dri2_surf->dri_drawable);
       return EGL_TRUE;
    }
+
+   if (dri2_x11_swap_buffers_msc(drv, disp, draw, 0, 0, 0) == -1) {
+      /* Swap failed with a window drawable. */
+      return _eglError(EGL_BAD_NATIVE_WINDOW, __func__);
+   }
+   return EGL_TRUE;
 }
 
 static EGLBoolean
@@ -921,12 +919,11 @@
    EGLBoolean ret;
    xcb_xfixes_region_t region;
    xcb_rectangle_t rectangles[16];
-   int i;
 
    if (numRects > (int)ARRAY_SIZE(rectangles))
       return dri2_copy_region(drv, disp, draw, dri2_surf->region);
 
-   for (i = 0; i < numRects; i++) {
+   for (int i = 0; i < numRects; i++) {
       rectangles[i].x = rects[i * 4];
       rectangles[i].y = dri2_surf->base.Height - rects[i * 4 + 1] - rects[i * 4 + 3];
       rectangles[i].width = rects[i * 4 + 2];
@@ -1074,12 +1071,7 @@
       return EGL_NO_IMAGE_KHR;
    }
 
-   if (!_eglInitImage(&dri2_img->base, disp)) {
-      free(buffers_reply);
-      free(geometry_reply);
-      free(dri2_img);
-      return EGL_NO_IMAGE_KHR;
-   }
+   _eglInitImage(&dri2_img->base, disp);
 
    stride = buffers[0].pitch / buffers[0].cpp;
    dri2_img->dri_image =
@@ -1125,10 +1117,8 @@
    cookie = xcb_dri2_get_msc(dri2_dpy->conn, dri2_surf->drawable);
    reply = xcb_dri2_get_msc_reply(dri2_dpy->conn, cookie, NULL);
 
-   if (!reply) {
-      _eglError(EGL_BAD_ACCESS, __func__);
-      return EGL_FALSE;
-   }
+   if (!reply)
+      return _eglError(EGL_BAD_ACCESS, __func__);
 
    *ust = ((EGLuint64KHR) reply->ust_hi << 32) | reply->ust_lo;
    *msc = ((EGLuint64KHR) reply->msc_hi << 32) | reply->msc_lo;
@@ -1138,7 +1128,7 @@
    return EGL_TRUE;
 }
 
-static struct dri2_egl_display_vtbl dri2_x11_swrast_display_vtbl = {
+static const struct dri2_egl_display_vtbl dri2_x11_swrast_display_vtbl = {
    .authenticate = NULL,
    .create_window_surface = dri2_x11_create_window_surface,
    .create_pixmap_surface = dri2_x11_create_pixmap_surface,
@@ -1147,6 +1137,7 @@
    .create_image = dri2_fallback_create_image_khr,
    .swap_interval = dri2_fallback_swap_interval,
    .swap_buffers = dri2_x11_swap_buffers,
+   .set_damage_region = dri2_fallback_set_damage_region,
    .swap_buffers_region = dri2_fallback_swap_buffers_region,
    .post_sub_buffer = dri2_fallback_post_sub_buffer,
    .copy_buffers = dri2_x11_copy_buffers,
@@ -1157,7 +1148,7 @@
    .get_dri_drawable = dri2_surface_get_dri_drawable,
 };
 
-static struct dri2_egl_display_vtbl dri2_x11_display_vtbl = {
+static const struct dri2_egl_display_vtbl dri2_x11_display_vtbl = {
    .authenticate = dri2_x11_authenticate,
    .create_window_surface = dri2_x11_create_window_surface,
    .create_pixmap_surface = dri2_x11_create_pixmap_surface,
@@ -1168,6 +1159,7 @@
    .swap_buffers = dri2_x11_swap_buffers,
    .swap_buffers_with_damage = dri2_fallback_swap_buffers_with_damage,
    .swap_buffers_region = dri2_x11_swap_buffers_region,
+   .set_damage_region = dri2_fallback_set_damage_region,
    .post_sub_buffer = dri2_x11_post_sub_buffer,
    .copy_buffers = dri2_x11_copy_buffers,
    .query_buffer_age = dri2_fallback_query_buffer_age,
@@ -1195,7 +1187,7 @@
                         struct dri2_egl_display *dri2_dpy)
 {
    xcb_screen_iterator_t s;
-   int screen = 0;
+   int screen = (uintptr_t)disp->Options.Platform;
    const char *msg;
 
    disp->DriverData = (void *) dri2_dpy;
@@ -1238,25 +1230,30 @@
    if (!dri2_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
+   dri2_dpy->fd = -1;
    if (!dri2_get_xcb_connection(drv, disp, dri2_dpy))
-      goto cleanup_dpy;
+      goto cleanup;
 
    /*
     * Every hardware driver_name is set using strdup. Doing the same in
     * here will allow is to simply free the memory at dri2_terminate().
     */
-   dri2_dpy->fd = -1;
    dri2_dpy->driver_name = strdup("swrast");
    if (!dri2_load_driver_swrast(disp))
-      goto cleanup_conn;
+      goto cleanup;
 
    dri2_dpy->loader_extensions = swrast_loader_extensions;
 
    if (!dri2_create_screen(disp))
-      goto cleanup_driver;
+      goto cleanup;
+
+   if (!dri2_setup_extensions(disp))
+      goto cleanup;
+
+   dri2_setup_screen(disp);
 
    if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp, true))
-      goto cleanup_configs;
+      goto cleanup;
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
@@ -1265,19 +1262,8 @@
 
    return EGL_TRUE;
 
- cleanup_configs:
-   _eglCleanupDisplay(disp);
-   dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
- cleanup_driver:
-   dlclose(dri2_dpy->driver);
- cleanup_conn:
-   free(dri2_dpy->driver_name);
-   if (disp->PlatformDisplay == NULL)
-      xcb_disconnect(dri2_dpy->conn);
- cleanup_dpy:
-   free(dri2_dpy);
-   disp->DriverData = NULL;
-
+ cleanup:
+   dri2_display_destroy(disp);
    return EGL_FALSE;
 }
 
@@ -1292,6 +1278,7 @@
     */
    dri2_dpy->min_swap_interval = 0;
    dri2_dpy->max_swap_interval = 0;
+   dri2_dpy->default_swap_interval = 0;
 
    if (!dri2_dpy->swap_available)
       return;
@@ -1333,6 +1320,7 @@
    &dri3_image_loader_extension.base,
    &image_lookup_extension.base,
    &use_invalidate.base,
+   &background_callable_extension.base,
    NULL,
 };
 
@@ -1345,14 +1333,15 @@
    if (!dri2_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
+   dri2_dpy->fd = -1;
    if (!dri2_get_xcb_connection(drv, disp, dri2_dpy))
-      goto cleanup_dpy;
+      goto cleanup;
 
    if (!dri3_x11_connect(dri2_dpy))
-      goto cleanup_conn;
+      goto cleanup;
 
    if (!dri2_load_driver_dri3(disp))
-      goto cleanup_conn;
+      goto cleanup;
 
    dri2_dpy->loader_extensions = dri3_image_loader_extensions;
 
@@ -1360,7 +1349,12 @@
    dri2_dpy->invalidate_available = true;
 
    if (!dri2_create_screen(disp))
-      goto cleanup_fd;
+      goto cleanup;
+
+   if (!dri2_setup_extensions(disp))
+      goto cleanup;
+
+   dri2_setup_screen(disp);
 
    dri2_x11_setup_swap_interval(dri2_dpy);
 
@@ -1373,7 +1367,7 @@
    dri2_set_WL_bind_wayland_display(drv, disp);
 
    if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp, false))
-      goto cleanup_configs;
+      goto cleanup;
 
    dri2_dpy->loader_dri3_ext.core = dri2_dpy->core;
    dri2_dpy->loader_dri3_ext.image_driver = dri2_dpy->image_driver;
@@ -1391,19 +1385,8 @@
 
    return EGL_TRUE;
 
- cleanup_configs:
-   _eglCleanupDisplay(disp);
-   dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
-   dlclose(dri2_dpy->driver);
- cleanup_fd:
-   close(dri2_dpy->fd);
- cleanup_conn:
-   if (disp->PlatformDisplay == NULL)
-      xcb_disconnect(dri2_dpy->conn);
- cleanup_dpy:
-   free(dri2_dpy);
-   disp->DriverData = NULL;
-
+ cleanup:
+   dri2_display_destroy(disp);
    return EGL_FALSE;
 }
 #endif
@@ -1447,14 +1430,15 @@
    if (!dri2_dpy)
       return _eglError(EGL_BAD_ALLOC, "eglInitialize");
 
+   dri2_dpy->fd = -1;
    if (!dri2_get_xcb_connection(drv, disp, dri2_dpy))
-      goto cleanup_dpy;
+      goto cleanup;
 
    if (!dri2_x11_connect(dri2_dpy))
-      goto cleanup_conn;
+      goto cleanup;
 
    if (!dri2_load_driver(disp))
-      goto cleanup_fd;
+      goto cleanup;
 
    if (dri2_dpy->dri2_minor >= 1)
       dri2_dpy->loader_extensions = dri2_loader_extensions;
@@ -1465,7 +1449,12 @@
    dri2_dpy->invalidate_available = (dri2_dpy->dri2_minor >= 3);
 
    if (!dri2_create_screen(disp))
-      goto cleanup_driver;
+      goto cleanup;
+
+   if (!dri2_setup_extensions(disp))
+      goto cleanup;
+
+   dri2_setup_screen(disp);
 
    dri2_x11_setup_swap_interval(dri2_dpy);
 
@@ -1478,7 +1467,7 @@
    dri2_set_WL_bind_wayland_display(drv, disp);
 
    if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp, true))
-      goto cleanup_configs;
+      goto cleanup;
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
@@ -1489,20 +1478,8 @@
 
    return EGL_TRUE;
 
- cleanup_configs:
-   _eglCleanupDisplay(disp);
-   dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
- cleanup_driver:
-   dlclose(dri2_dpy->driver);
- cleanup_fd:
-   close(dri2_dpy->fd);
- cleanup_conn:
-   if (disp->PlatformDisplay == NULL)
-      xcb_disconnect(dri2_dpy->conn);
- cleanup_dpy:
-   free(dri2_dpy);
-   disp->DriverData = NULL;
-
+ cleanup:
+   dri2_display_destroy(disp);
    return EGL_FALSE;
 }
 
diff --git a/src/egl/drivers/dri2/platform_x11_dri3.c b/src/egl/drivers/dri2/platform_x11_dri3.c
index c4a5443..515be27 100644
--- a/src/egl/drivers/dri2/platform_x11_dri3.c
+++ b/src/egl/drivers/dri2/platform_x11_dri3.c
@@ -30,6 +30,7 @@
 #include <xcb/present.h>
 
 #include <xf86drm.h>
+#include "util/macros.h"
 
 #include "egl_dri2.h"
 #include "egl_dri2_fallbacks.h"
@@ -326,10 +327,7 @@
       return EGL_NO_IMAGE_KHR;
    }
 
-   if (!_eglInitImage(&dri2_img->base, disp)) {
-      free(dri2_img);
-      return EGL_NO_IMAGE_KHR;
-   }
+   _eglInitImage(&dri2_img->base, disp);
 
    dri2_img->dri_image = loader_dri3_create_image(dri2_dpy->conn,
                                                   bp_reply,
@@ -457,6 +455,7 @@
    .swap_buffers = dri3_swap_buffers,
    .swap_buffers_with_damage = dri2_fallback_swap_buffers_with_damage,
    .swap_buffers_region = dri2_fallback_swap_buffers_region,
+   .set_damage_region = dri2_fallback_set_damage_region,
    .post_sub_buffer = dri2_fallback_post_sub_buffer,
    .copy_buffers = dri3_copy_buffers,
    .query_buffer_age = dri3_query_buffer_age,
diff --git a/src/egl/drivers/haiku/egl_haiku.cpp b/src/egl/drivers/haiku/egl_haiku.cpp
index ef74f65..10f3abc 100644
--- a/src/egl/drivers/haiku/egl_haiku.cpp
+++ b/src/egl/drivers/haiku/egl_haiku.cpp
@@ -150,10 +150,8 @@
 
 	struct haiku_egl_config* conf;
 	conf = (struct haiku_egl_config*) calloc(1, sizeof (*conf));
-	if (!conf) {
-		_eglError(EGL_BAD_ALLOC, "haiku_add_configs_for_visuals");
-		return EGL_FALSE;
-	}
+	if (!conf)
+		return _eglError(EGL_BAD_ALLOC, "haiku_add_configs_for_visuals");
 
 	_eglInitConfig(&conf->base, dpy, 1);
 	TRACE("Config inited\n");
diff --git a/src/egl/generate/egl.xml b/src/egl/generate/egl.xml
index f6dbbc0..9250f93 100644
--- a/src/egl/generate/egl.xml
+++ b/src/egl/generate/egl.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <registry>
     <!--
-    Copyright (c) 2013-2014 The Khronos Group Inc.
+    Copyright (c) 2013-2017 The Khronos Group Inc.
 
     Permission is hereby granted, free of charge, to any person obtaining a
     copy of this software and/or associated documentation files (the
@@ -29,7 +29,7 @@
     together with documentation, schema, and Python generator scripts used
     to generate C header files for EGL, can be found in the Khronos Registry
     at
-        http://www.opengl.org/registry/
+        https://www.github.com/KhronosGroup/EGL-Registry
     -->
 
     <!-- SECTION: EGL type definitions. Does not include GL types. -->
@@ -76,6 +76,7 @@
         <type requires="khrplatform">typedef khronos_utime_nanoseconds_t <name>EGLTimeNV</name>;</type>
         <type requires="khrplatform">typedef khronos_utime_nanoseconds_t <name>EGLuint64NV</name>;</type>
         <type requires="khrplatform">typedef khronos_uint64_t <name>EGLuint64KHR</name>;</type>
+        <type requires="khrplatform">typedef khronos_stime_nanoseconds_t <name>EGLnsecsANDROID</name>;</type>
         <type>typedef int <name>EGLNativeFileDescriptorKHR</name>;</type>
         <type requires="khrplatform">typedef khronos_ssize_t <name>EGLsizeiANDROID</name>;</type>
         <type requires="EGLsizeiANDROID">typedef void (*<name>EGLSetBlobFuncANDROID</name>) (const void *key, EGLsizeiANDROID keySize, const void *value, EGLsizeiANDROID valueSize);</type>
@@ -112,6 +113,7 @@
             <!--
         <enum value="0x0800"      name="EGL_STREAM_BIT_NV" comment="Draft EGL_NV_stream_producer_eglsurface extension (bug 8064)"/>
             -->
+        <enum value="0x1000" name="EGL_MUTABLE_RENDER_BUFFER_BIT_KHR"/>
     </enums>
 
     <enums namespace="EGLRenderableTypeMask" type="bitmask" comment="EGL_RENDERABLE_TYPE bits">
@@ -130,6 +132,12 @@
         <enum value="0x0002" name="EGL_WRITE_SURFACE_BIT_KHR"/>
     </enums>
 
+    <enums namespace="EGLNativeBufferUsageFlags" type="bitmask" comment="EGL_NATIVE_BUFFER_USAGE_ANDROID bits">
+        <enum value="0x00000001" name="EGL_NATIVE_BUFFER_USAGE_PROTECTED_BIT_ANDROID"/>
+        <enum value="0x00000002" name="EGL_NATIVE_BUFFER_USAGE_RENDERBUFFER_BIT_ANDROID"/>
+        <enum value="0x00000004" name="EGL_NATIVE_BUFFER_USAGE_TEXTURE_BIT_ANDROID"/>
+    </enums>
+
     <enums namespace="EGLSyncFlagsKHR" type="bitmask" comment="Fence/reusable sync wait bits">
         <enum value="0x0001" name="EGL_SYNC_FLUSH_COMMANDS_BIT"/>
         <enum value="0x0001" name="EGL_SYNC_FLUSH_COMMANDS_BIT_KHR" alias="EGL_SYNC_FLUSH_COMMANDS_BIT"/>
@@ -165,7 +173,11 @@
          tokens are reused for different purposes in different
          extensions and API versions). -->
 
-    <enums namespace="EGL" start="0x0000" end="0x2FFF" vendor="ARB"/>
+    <enums namespace="EGL" start="0x0000" end="0x2FFF" vendor="KHR" comment="Reserved for enumerants shared with WGL, GLX, and GL">
+        <enum value="0" name="EGL_CONTEXT_RELEASE_BEHAVIOR_NONE_KHR"/>
+        <enum value="0x2097" name="EGL_CONTEXT_RELEASE_BEHAVIOR_KHR"/>
+        <enum value="0x2098" name="EGL_CONTEXT_RELEASE_BEHAVIOR_FLUSH_KHR"/>
+    </enums>
 
     <enums namespace="EGL" group="Boolean" vendor="ARB">
         <enum value="0" name="EGL_FALSE"/>
@@ -173,24 +185,25 @@
     </enums>
 
     <enums namespace="EGL" group="SpecialNumbers" vendor="ARB" comment="Tokens whose numeric value is intrinsically meaningful">
-        <enum value="((EGLint)-1)" name="EGL_DONT_CARE"/>
-        <enum value="((EGLint)-1)" name="EGL_UNKNOWN"/>
+        <enum value="EGL_CAST(EGLint,-1)" name="EGL_DONT_CARE"/>
+        <enum value="EGL_CAST(EGLint,-1)" name="EGL_UNKNOWN"/>
         <enum value="-1" name="EGL_NO_NATIVE_FENCE_FD_ANDROID"/>
         <enum value="0" name="EGL_DEPTH_ENCODING_NONE_NV"/>
-        <enum value="((EGLContext)0)" name="EGL_NO_CONTEXT"/>
-        <enum value="((EGLDeviceEXT)(0))" name="EGL_NO_DEVICE_EXT"/>
-        <enum value="((EGLDisplay)0)" name="EGL_NO_DISPLAY"/>
-        <enum value="((EGLImage)0)" name="EGL_NO_IMAGE"/>
-        <enum value="((EGLImageKHR)0)" name="EGL_NO_IMAGE_KHR"/>
-        <enum value="((EGLNativeDisplayType)0)" name="EGL_DEFAULT_DISPLAY"/>
-        <enum value="((EGLNativeFileDescriptorKHR)(-1))" name="EGL_NO_FILE_DESCRIPTOR_KHR"/>
-        <enum value="((EGLOutputLayerEXT)0)" name="EGL_NO_OUTPUT_LAYER_EXT"/>
-        <enum value="((EGLOutputPortEXT)0)" name="EGL_NO_OUTPUT_PORT_EXT"/>
-        <enum value="((EGLStreamKHR)0)" name="EGL_NO_STREAM_KHR"/>
-        <enum value="((EGLSurface)0)" name="EGL_NO_SURFACE"/>
-        <enum value="((EGLSync)0)" name="EGL_NO_SYNC"/>
-        <enum value="((EGLSyncKHR)0)" name="EGL_NO_SYNC_KHR" alias="EGL_NO_SYNC"/>
-        <enum value="((EGLSyncNV)0)" name="EGL_NO_SYNC_NV" alias="EGL_NO_SYNC"/>
+        <enum value="EGL_CAST(EGLContext,0)" name="EGL_NO_CONTEXT"/>
+        <enum value="EGL_CAST(EGLDeviceEXT,0)" name="EGL_NO_DEVICE_EXT"/>
+        <enum value="EGL_CAST(EGLDisplay,0)" name="EGL_NO_DISPLAY"/>
+        <enum value="EGL_CAST(EGLImage,0)" name="EGL_NO_IMAGE"/>
+        <enum value="EGL_CAST(EGLImageKHR,0)" name="EGL_NO_IMAGE_KHR"/>
+        <enum value="EGL_CAST(EGLNativeDisplayType,0)" name="EGL_DEFAULT_DISPLAY"/>
+        <enum value="EGL_CAST(EGLNativeFileDescriptorKHR,-1)" name="EGL_NO_FILE_DESCRIPTOR_KHR"/>
+        <enum value="EGL_CAST(EGLOutputLayerEXT,0)" name="EGL_NO_OUTPUT_LAYER_EXT"/>
+        <enum value="EGL_CAST(EGLOutputPortEXT,0)" name="EGL_NO_OUTPUT_PORT_EXT"/>
+        <enum value="EGL_CAST(EGLStreamKHR,0)" name="EGL_NO_STREAM_KHR"/>
+        <enum value="EGL_CAST(EGLSurface,0)" name="EGL_NO_SURFACE"/>
+        <enum value="EGL_CAST(EGLSync,0)" name="EGL_NO_SYNC"/>
+        <enum value="EGL_CAST(EGLSyncKHR,0)" name="EGL_NO_SYNC_KHR" alias="EGL_NO_SYNC"/>
+        <enum value="EGL_CAST(EGLSyncNV,0)" name="EGL_NO_SYNC_NV" alias="EGL_NO_SYNC"/>
+        <enum value="EGL_CAST(EGLConfig,0)" name="EGL_NO_CONFIG_KHR"/>
         <enum value="10000" name="EGL_DISPLAY_SCALING"/>
         <enum value="0xFFFFFFFFFFFFFFFF" name="EGL_FOREVER" type="ull"/>
         <enum value="0xFFFFFFFFFFFFFFFF" name="EGL_FOREVER_KHR" type="ull" alias="EGL_FOREVER"/>
@@ -356,7 +369,7 @@
         <enum value="0x30BD" name="EGL_GL_TEXTURE_ZOFFSET"/>
         <enum value="0x30BD" name="EGL_GL_TEXTURE_ZOFFSET_KHR" alias="EGL_GL_TEXTURE_ZOFFSET"/>
         <enum value="0x30BE" name="EGL_POST_SUB_BUFFER_SUPPORTED_NV"/>
-        <enum value="0x30BF" name="EGL_CONTEXT_OPENGL_ROBUST_ACCESS_EXT" alias="EGL_CONTEXT_OPENGL_ROBUST_ACCESS"/>
+        <enum value="0x30BF" name="EGL_CONTEXT_OPENGL_ROBUST_ACCESS_EXT"/>
     </enums>
 
     <enums namespace="EGL" start="0x30C0-0x30CF" vendor="KHR">
@@ -441,7 +454,10 @@
         <enum value="0x3101" name="EGL_CONTEXT_PRIORITY_HIGH_IMG"/>
         <enum value="0x3102" name="EGL_CONTEXT_PRIORITY_MEDIUM_IMG"/>
         <enum value="0x3103" name="EGL_CONTEXT_PRIORITY_LOW_IMG"/>
-            <unused start="0x3104" end="0x310F"/>
+            <unused start="0x3104"/>
+        <enum value="0x3105" name="EGL_NATIVE_BUFFER_MULTIPLANE_SEPARATE_IMG"/>
+        <enum value="0x3106" name="EGL_NATIVE_BUFFER_PLANE_OFFSET_IMG"/>
+            <unused start="0x3107" end="0x310F"/>
     </enums>
 
     <enums namespace="EGL" start="0x3110" end="0x311F" vendor="ATX" comment="Reserved for Tim Renouf, Antix (Khronos bug 4949)">
@@ -474,12 +490,14 @@
         <enum value="0x3140" name="EGL_NATIVE_BUFFER_ANDROID"/>
         <enum value="0x3141" name="EGL_PLATFORM_ANDROID_KHR"/>
         <enum value="0x3142" name="EGL_RECORDABLE_ANDROID"/>
-            <unused start="0x3143"/>
+        <enum value="0x3143" name="EGL_NATIVE_BUFFER_USAGE_ANDROID"/>
         <enum value="0x3144" name="EGL_SYNC_NATIVE_FENCE_ANDROID"/>
         <enum value="0x3145" name="EGL_SYNC_NATIVE_FENCE_FD_ANDROID"/>
         <enum value="0x3146" name="EGL_SYNC_NATIVE_FENCE_SIGNALED_ANDROID"/>
         <enum value="0x3147" name="EGL_FRAMEBUFFER_TARGET_ANDROID"/>
-            <unused start="0x3148" end="0x314F"/>
+            <unused start="0x3148" end="0x314B"/>
+        <enum value="0x314C"     name="EGL_FRONT_BUFFER_AUTO_REFRESH_ANDROID"/>
+            <unused start="0x314D" end="0x314F"/>
     </enums>
 
     <enums namespace="EGL" start="0x3150" end="0x315F" vendor="NOK" comment="Reserved for Robert Palmer (Khronos bug 5368)">
@@ -532,7 +550,9 @@
         <enum value="0x31D7" name="EGL_PLATFORM_GBM_MESA" alias="EGL_PLATFORM_GBM_KHR"/>
         <enum value="0x31D8" name="EGL_PLATFORM_WAYLAND_KHR"/>
         <enum value="0x31D8" name="EGL_PLATFORM_WAYLAND_EXT" alias="EGL_PLATFORM_WAYLAND_KHR"/>
-            <unused start="0x31D9" end="0x31DF"/>
+            <unused start="0x31D9" end="0x31DC"/>
+        <enum value="0x31DD" name="EGL_PLATFORM_SURFACELESS_MESA"/>
+            <unused start="0x31DE" end="0x31DF"/>
     </enums>
 
     <enums namespace="EGL" start="0x31E0" end="0x31EF" vendor="HI" comment="Reserved for Mark Callow (Khronos bug 6799)">
@@ -605,7 +625,37 @@
         <enum value="0x323B" name="EGL_CUDA_EVENT_HANDLE_NV"/>
         <enum value="0x323C" name="EGL_SYNC_CUDA_EVENT_NV"/>
         <enum value="0x323D" name="EGL_SYNC_CUDA_EVENT_COMPLETE_NV"/>
-            <unused start="0x323E" end="0x325F"/>
+            <unused start="0x323E"/>
+        <enum value="0x323F" name="EGL_STREAM_CROSS_PARTITION_NV"/>
+        <enum value="0x3240" name="EGL_STREAM_STATE_INITIALIZING_NV"/>
+        <enum value="0x3241" name="EGL_STREAM_TYPE_NV"/>
+        <enum value="0x3242" name="EGL_STREAM_PROTOCOL_NV"/>
+        <enum value="0x3243" name="EGL_STREAM_ENDPOINT_NV"/>
+        <enum value="0x3244" name="EGL_STREAM_LOCAL_NV"/>
+        <enum value="0x3245" name="EGL_STREAM_CROSS_PROCESS_NV"/>
+        <enum value="0x3246" name="EGL_STREAM_PROTOCOL_FD_NV"/>
+        <enum value="0x3247" name="EGL_STREAM_PRODUCER_NV"/>
+        <enum value="0x3248" name="EGL_STREAM_CONSUMER_NV"/>
+            <unused start="0x3249" end="0x324A"/>
+        <enum value="0x324B" name="EGL_STREAM_PROTOCOL_SOCKET_NV"/>
+        <enum value="0x324C" name="EGL_SOCKET_HANDLE_NV"/>
+        <enum value="0x324D" name="EGL_SOCKET_TYPE_NV"/>
+        <enum value="0x324E" name="EGL_SOCKET_TYPE_UNIX_NV"/>
+        <enum value="0x324F" name="EGL_SOCKET_TYPE_INET_NV"/>
+        <enum value="0x3250" name="EGL_MAX_STREAM_METADATA_BLOCKS_NV"/>
+        <enum value="0x3251" name="EGL_MAX_STREAM_METADATA_BLOCK_SIZE_NV"/>
+        <enum value="0x3252" name="EGL_MAX_STREAM_METADATA_TOTAL_SIZE_NV"/>
+        <enum value="0x3253" name="EGL_PRODUCER_METADATA_NV"/>
+        <enum value="0x3254" name="EGL_CONSUMER_METADATA_NV"/>
+        <enum value="0x3255" name="EGL_METADATA0_SIZE_NV"/>
+        <enum value="0x3256" name="EGL_METADATA1_SIZE_NV"/>
+        <enum value="0x3257" name="EGL_METADATA2_SIZE_NV"/>
+        <enum value="0x3258" name="EGL_METADATA3_SIZE_NV"/>
+        <enum value="0x3259" name="EGL_METADATA0_TYPE_NV"/>
+        <enum value="0x325A" name="EGL_METADATA1_TYPE_NV"/>
+        <enum value="0x325B" name="EGL_METADATA2_TYPE_NV"/>
+        <enum value="0x325C" name="EGL_METADATA3_TYPE_NV"/>
+            <unused start="0x325D" end="0x325F"/>
     </enums>
 
     <enums namespace="EGL" start="0x3260" end="0x326F" vendor="BCOM" comment="Reserved for Gary Sweet, Broadcom (Public bug 620)">
@@ -636,7 +686,9 @@
         <enum value="0x3284" name="EGL_YUV_CHROMA_SITING_0_EXT"/>
         <enum value="0x3285" name="EGL_YUV_CHROMA_SITING_0_5_EXT"/>
         <enum value="0x3286" name="EGL_DISCARD_SAMPLES_ARM"/>
-            <unused start="0x3287" end="0x328F"/>
+            <unused start="0x3287" end="0x3289"/>
+        <enum value="0x328A" name="EGL_SYNC_PRIOR_COMMANDS_IMPLICIT_EXTERNAL_ARM"/>
+            <unused start="0x328B" end="0x328F"/>
     </enums>
 
     <enums namespace="EGL" start="0x3290" end="0x329F" vendor="MESA" comment="Reserved for John K&#229;re Alsaker (Public bug 757)">
@@ -699,7 +751,51 @@
     </enums>
 
     <enums namespace="EGL" start="0x3320" end="0x339F" vendor="NV" comment="Reserved for James Jones (Bug 13209)">
-            <unused start="0x3320" end="0x339F"/>
+            <unused start="0x3320" end="0x3327"/>
+        <enum value="0x3328" name="EGL_PENDING_METADATA_NV"/>
+        <enum value="0x3329" name="EGL_PENDING_FRAME_NV"/>
+        <enum value="0x332A" name="EGL_STREAM_TIME_PENDING_NV"/>
+            <unused start="0x332B"/>
+        <enum value="0x332C" name="EGL_YUV_PLANE0_TEXTURE_UNIT_NV"/>
+        <enum value="0x332D" name="EGL_YUV_PLANE1_TEXTURE_UNIT_NV"/>
+        <enum value="0x332E" name="EGL_YUV_PLANE2_TEXTURE_UNIT_NV"/>
+            <unused start="0x332F" end="0x3333"/>
+        <enum value="0x3334" name="EGL_SUPPORT_RESET_NV"/>
+        <enum value="0x3335" name="EGL_SUPPORT_REUSE_NV"/>
+        <enum value="0x3336" name="EGL_STREAM_FIFO_SYNCHRONOUS_NV"/>
+        <enum value="0x3337" name="EGL_PRODUCER_MAX_FRAME_HINT_NV"/>
+        <enum value="0x3338" name="EGL_CONSUMER_MAX_FRAME_HINT_NV"/>
+        <enum value="0x3339" name="EGL_COLOR_COMPONENT_TYPE_EXT"/>
+        <enum value="0x333A" name="EGL_COLOR_COMPONENT_TYPE_FIXED_EXT"/>
+        <enum value="0x333B" name="EGL_COLOR_COMPONENT_TYPE_FLOAT_EXT"/>
+            <unused start="0x333C" end="0x333E"/>
+        <enum value="0x333F" name="EGL_GL_COLORSPACE_BT2020_LINEAR_EXT"/>
+        <enum value="0x3340" name="EGL_GL_COLORSPACE_BT2020_PQ_EXT"/>
+        <enum value="0x3341" name="EGL_SMPTE2086_DISPLAY_PRIMARY_RX_EXT"/>
+        <enum value="0x3342" name="EGL_SMPTE2086_DISPLAY_PRIMARY_RY_EXT"/>
+        <enum value="0x3343" name="EGL_SMPTE2086_DISPLAY_PRIMARY_GX_EXT"/>
+        <enum value="0x3344" name="EGL_SMPTE2086_DISPLAY_PRIMARY_GY_EXT"/>
+        <enum value="0x3345" name="EGL_SMPTE2086_DISPLAY_PRIMARY_BX_EXT"/>
+        <enum value="0x3346" name="EGL_SMPTE2086_DISPLAY_PRIMARY_BY_EXT"/>
+        <enum value="0x3347" name="EGL_SMPTE2086_WHITE_POINT_X_EXT"/>
+        <enum value="0x3348" name="EGL_SMPTE2086_WHITE_POINT_Y_EXT"/>
+        <enum value="0x3349" name="EGL_SMPTE2086_MAX_LUMINANCE_EXT"/>
+        <enum value="0x334A" name="EGL_SMPTE2086_MIN_LUMINANCE_EXT"/>
+        <enum value="50000"  name="EGL_METADATA_SCALING_EXT"/>
+            <unused start="0x334B"/>
+        <enum value="0x334C" name="EGL_GENERATE_RESET_ON_VIDEO_MEMORY_PURGE_NV"/>
+        <enum value="0x334D" name="EGL_STREAM_CROSS_OBJECT_NV"/>
+        <enum value="0x334E" name="EGL_STREAM_CROSS_DISPLAY_NV"/>
+        <enum value="0x334F" name="EGL_STREAM_CROSS_SYSTEM_NV"/>
+        <enum value="0x3350" name="EGL_GL_COLORSPACE_SCRGB_LINEAR_EXT"/>
+        <enum value="0x3351" name="EGL_GL_COLORSPACE_SCRGB_EXT"/>
+        <enum value="0x3352" name="EGL_TRACK_REFERENCES_KHR"/>
+            <unused start="0x3353" end="0x335F"/>
+        <enum value="0x3360" name="EGL_CTA861_3_MAX_CONTENT_LIGHT_LEVEL_EXT"/>
+        <enum value="0x3361" name="EGL_CTA861_3_MAX_FRAME_AVERAGE_LEVEL_EXT"/>
+        <enum value="0x3362" name="EGL_GL_COLORSPACE_DISPLAY_P3_LINEAR_EXT"/>
+        <enum value="0x3363" name="EGL_GL_COLORSPACE_DISPLAY_P3_EXT"/>
+            <unused start="0x3364" end="0x339F"/>
     </enums>
 
     <enums namespace="EGL" start="0x33A0" end="0x33AF" vendor="ANGLE" comment="Reserved for Shannon Woods (Bug 13175)">
@@ -733,6 +829,44 @@
             <unused start="0x33E0" end="0x342F"/>
     </enums>
 
+    <enums namespace="EGL" start="0x3430" end="0x343F" vendor="ANDROID" comment="Reserved for Pablo Ceballos (Bug 15874)">
+            <unused start="0x3430" end="0x343F"/>
+    </enums>
+
+    <enums namespace="EGL" start="0x3440" end="0x344F" vendor="ANDROID" comment="Reserved for Kristian Kristensen (Bug 16033)">
+        <enum value="0x3440" name="EGL_DMA_BUF_PLANE3_FD_EXT"/>
+        <enum value="0x3441" name="EGL_DMA_BUF_PLANE3_OFFSET_EXT"/>
+        <enum value="0x3442" name="EGL_DMA_BUF_PLANE3_PITCH_EXT"/>
+        <enum value="0x3443" name="EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT"/>
+        <enum value="0x3444" name="EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT"/>
+        <enum value="0x3445" name="EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT"/>
+        <enum value="0x3446" name="EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT"/>
+        <enum value="0x3447" name="EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT"/>
+        <enum value="0x3448" name="EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT"/>
+        <enum value="0x3449" name="EGL_DMA_BUF_PLANE3_MODIFIER_LO_EXT"/>
+        <enum value="0x344A" name="EGL_DMA_BUF_PLANE3_MODIFIER_HI_EXT"/>
+            <unused start="0x344B" end="0x344F"/>
+    </enums>
+
+    <enums namespace="EGL" start="0x3450" end="0x345F" vendor="ANGLE" comment="Reserved for Shannon Woods (Bug 16106)">
+            <unused start="0x3450" end="0x345F"/>
+    </enums>
+
+    <enums namespace="EGL" start="0x3460" end="0x346F" vendor="COREAVI" comment="Reserved for Daniel Herring (Bug 16162)">
+        <enum value="0x3460" name="EGL_PRIMARY_COMPOSITOR_CONTEXT_EXT"/>
+        <enum value="0x3461" name="EGL_EXTERNAL_REF_ID_EXT"/>
+        <enum value="0x3462" name="EGL_COMPOSITOR_DROP_NEWEST_FRAME_EXT"/>
+        <enum value="0x3463" name="EGL_COMPOSITOR_KEEP_NEWEST_FRAME_EXT"/>
+        <enum value="0x3464" name="EGL_FRONT_BUFFER_EXT"/>
+        <unused start="0x3465" end="0x346F"/>
+    </enums>
+
+    <enums namespace="EGL" start="0x3470" end="0x347F" vendor="EXT" comment="Reserved for Daniel Stone (PR 14)">
+	<enum value="0x3470" name="EGL_IMPORT_SYNC_TYPE_EXT"/>
+	<enum value="0x3471" name="EGL_IMPORT_IMPLICIT_SYNC_EXT"/>
+	<enum value="0x3472" name="EGL_IMPORT_EXPLICIT_SYNC_EXT"/>
+    </enums>
+
 <!-- Please remember that new enumerant allocations must be obtained by
      request to the Khronos API registrar (see comments at the top of this
      file) File requests in the Khronos Bugzilla, EGL project, Registry
@@ -742,8 +876,8 @@
 
 <!-- Reservable for future use. To generate a new range, allocate multiples
      of 16 starting at the lowest available point in this block. -->
-    <enums namespace="EGL" start="0x3420" end="0x3FFF" vendor="KHR">
-            <unused start="0x3420" end="0x3FFF" comment="Reserved for future use"/>
+    <enums namespace="EGL" start="0x3480" end="0x3FFF" vendor="KHR" comment="Reserved for future use">
+            <unused start="0x3480" end="0x3FFF"/>
     </enums>
 
     <enums namespace="EGL" start="0x8F70" end="0x8F7F" vendor="HI" comment="For Mark Callow, Khronos bug 4055. Shared with GL.">
@@ -836,6 +970,10 @@
             <param>const <ptype>EGLint</ptype> *<name>attrib_list</name></param>
         </command>
         <command>
+            <proto><ptype>EGLClientBuffer</ptype> <name>eglCreateNativeClientBufferANDROID</name></proto>
+            <param>const <ptype>EGLint</ptype> *<name>attrib_list</name></param>
+        </command>
+        <command>
             <proto><ptype>EGLSurface</ptype> <name>eglCreatePbufferFromClientBuffer</name></proto>
             <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
             <param><ptype>EGLenum</ptype> <name>buftype</name></param>
@@ -901,6 +1039,11 @@
             <param>const <ptype>EGLint</ptype> *<name>attrib_list</name></param>
         </command>
         <command>
+            <proto><ptype>EGLStreamKHR</ptype> <name>eglCreateStreamAttribKHR</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param>const <ptype>EGLAttrib</ptype> *<name>attrib_list</name></param>
+        </command>
+        <command>
             <proto><ptype>EGLSurface</ptype> <name>eglCreateStreamProducerSurfaceKHR</name></proto>
             <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
             <param><ptype>EGLConfig</ptype> <name>config</name></param>
@@ -1163,6 +1306,12 @@
             <param><ptype>EGLint</ptype> <name>height</name></param>
         </command>
         <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglPresentationTimeANDROID</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLSurface</ptype> <name>surface</name></param>
+            <param><ptype>EGLnsecsANDROID</ptype> <name>time</name></param>
+        </command>
+        <command>
             <proto><ptype>EGLenum</ptype> <name>eglQueryAPI</name></proto>
         </command>
         <command>
@@ -1199,6 +1348,36 @@
             <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
             <param><ptype>EGLint</ptype> <name>attribute</name></param>
             <param><ptype>EGLAttrib</ptype> *<name>value</name></param>
+            <alias name="eglQueryDisplayAttribKHR"/>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglQueryDisplayAttribKHR</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLint</ptype> <name>name</name></param>
+            <param><ptype>EGLAttrib</ptype> *<name>value</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglQueryDisplayAttribNV</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLint</ptype> <name>attribute</name></param>
+            <param><ptype>EGLAttrib</ptype> *<name>value</name></param>
+            <alias name="eglQueryDisplayAttribKHR"/>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglQueryDmaBufFormatsEXT</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLint</ptype> <name>max_formats</name></param>
+            <param><ptype>EGLint</ptype> *<name>formats</name></param>
+            <param><ptype>EGLint</ptype> *<name>num_formats</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglQueryDmaBufModifiersEXT</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLint</ptype> <name>format</name></param>
+            <param><ptype>EGLint</ptype> <name>max_modifiers</name></param>
+            <param><ptype>EGLuint64KHR</ptype> *<name>modifiers</name></param>
+            <param><ptype>EGLBoolean</ptype> *<name>external_only</name></param>
+            <param><ptype>EGLint</ptype> *<name>num_modifiers</name></param>
         </command>
         <command>
             <proto><ptype>EGLBoolean</ptype> <name>eglQueryNativeDisplayNV</name></proto>
@@ -1251,6 +1430,23 @@
             <param><ptype>EGLint</ptype> *<name>value</name></param>
         </command>
         <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglQueryStreamAttribKHR</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
+            <param><ptype>EGLenum</ptype> <name>attribute</name></param>
+            <param><ptype>EGLAttrib</ptype> *<name>value</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglQueryStreamMetadataNV</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
+            <param><ptype>EGLenum</ptype> <name>name</name></param>
+            <param><ptype>EGLint</ptype> <name>n</name></param>
+            <param><ptype>EGLint</ptype> <name>offset</name></param>
+            <param><ptype>EGLint</ptype> <name>size</name></param>
+            <param>void *<name>data</name></param>
+        </command>
+        <command>
             <proto><ptype>EGLBoolean</ptype> <name>eglQueryStreamTimeKHR</name></proto>
             <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
             <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
@@ -1300,6 +1496,11 @@
             <proto><ptype>EGLBoolean</ptype> <name>eglReleaseThread</name></proto>
         </command>
         <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglResetStreamNV</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
+        </command>
+        <command>
             <proto>void <name>eglSetBlobCacheFuncsANDROID</name></proto>
             <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
             <param><ptype>EGLSetBlobFuncANDROID</ptype> <name>set</name></param>
@@ -1313,6 +1514,22 @@
             <param><ptype>EGLint</ptype> <name>n_rects</name></param>
         </command>
         <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglSetStreamAttribKHR</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
+            <param><ptype>EGLenum</ptype> <name>attribute</name></param>
+            <param><ptype>EGLAttrib</ptype> <name>value</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglSetStreamMetadataNV</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
+            <param><ptype>EGLint</ptype> <name>n</name></param>
+            <param><ptype>EGLint</ptype> <name>offset</name></param>
+            <param><ptype>EGLint</ptype> <name>size</name></param>
+            <param>const void *<name>data</name></param>
+        </command>
+        <command>
             <proto><ptype>EGLBoolean</ptype> <name>eglSignalSyncKHR</name></proto>
             <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
             <param><ptype>EGLSyncKHR</ptype> <name>sync</name></param>
@@ -1336,11 +1553,23 @@
             <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
         </command>
         <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglStreamConsumerAcquireAttribKHR</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
+            <param>const <ptype>EGLAttrib</ptype> *<name>attrib_list</name></param>
+        </command>
+        <command>
             <proto><ptype>EGLBoolean</ptype> <name>eglStreamConsumerGLTextureExternalKHR</name></proto>
             <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
             <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
         </command>
         <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglStreamConsumerGLTextureExternalAttribsNV</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
+            <param><ptype>EGLAttrib</ptype> *<name>attrib_list</name></param>
+        </command>
+        <command>
             <proto><ptype>EGLBoolean</ptype> <name>eglStreamConsumerOutputEXT</name></proto>
             <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
             <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
@@ -1352,6 +1581,12 @@
             <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
         </command>
         <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglStreamConsumerReleaseAttribKHR</name></proto>
+            <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
+            <param><ptype>EGLStreamKHR</ptype> <name>stream</name></param>
+            <param>const <ptype>EGLAttrib</ptype> *<name>attrib_list</name></param>
+        </command>
+        <command>
             <proto><ptype>EGLBoolean</ptype> <name>eglSurfaceAttrib</name></proto>
             <param><ptype>EGLDisplay</ptype> <name>dpy</name></param>
             <param><ptype>EGLSurface</ptype> <name>surface</name></param>
@@ -1427,6 +1662,44 @@
             <param><ptype>EGLSyncKHR</ptype> <name>sync</name></param>
             <param><ptype>EGLint</ptype> <name>flags</name></param>
         </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglCompositorSetContextListEXT</name></proto>
+            <param>const <ptype>EGLint</ptype> *<name>external_ref_ids</name></param>
+            <param><ptype>EGLint</ptype> <name>num_entries</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglCompositorSetContextAttributesEXT</name></proto>
+            <param><ptype>EGLint</ptype> <name>external_ref_id</name></param>
+            <param>const <ptype>EGLint</ptype> *<name>context_attributes</name></param>
+            <param><ptype>EGLint</ptype> <name>num_entries</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglCompositorSetWindowListEXT</name></proto>
+            <param><ptype>EGLint</ptype> <name>external_ref_id</name></param>
+            <param>const <ptype>EGLint</ptype> *<name>external_win_ids</name></param>
+            <param><ptype>EGLint</ptype> <name>num_entries</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglCompositorSetWindowAttributesEXT</name></proto>
+            <param><ptype>EGLint</ptype> <name>external_win_id</name></param>
+            <param>const <ptype>EGLint</ptype> *<name>window_attributes</name></param>
+            <param><ptype>EGLint</ptype> <name>num_entries</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglCompositorBindTexWindowEXT</name></proto>
+            <param><ptype>EGLint</ptype> <name>external_win_id</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglCompositorSetSizeEXT</name></proto>
+            <param><ptype>EGLint</ptype> <name>external_win_id</name></param>
+            <param><ptype>EGLint</ptype> <name>width</name></param>
+            <param><ptype>EGLint</ptype> <name>height</name></param>
+        </command>
+        <command>
+            <proto><ptype>EGLBoolean</ptype> <name>eglCompositorSwapPolicyEXT</name></proto>
+            <param><ptype>EGLint</ptype> <name>external_win_id</name></param>
+            <param><ptype>EGLint</ptype> <name>policy</name></param>
+        </command>
     </commands>
 
     <!-- SECTION: EGL API interface definitions. -->
@@ -1699,11 +1972,25 @@
                 <command name="eglSetBlobCacheFuncsANDROID"/>
             </require>
         </extension>
+        <extension name="EGL_ANDROID_create_native_client_buffer" supported="egl">
+            <require>
+                <enum name="EGL_NATIVE_BUFFER_USAGE_ANDROID"/>
+                <enum name="EGL_NATIVE_BUFFER_USAGE_PROTECTED_BIT_ANDROID"/>
+                <enum name="EGL_NATIVE_BUFFER_USAGE_RENDERBUFFER_BIT_ANDROID"/>
+                <enum name="EGL_NATIVE_BUFFER_USAGE_TEXTURE_BIT_ANDROID"/>
+                <command name="eglCreateNativeClientBufferANDROID"/>
+            </require>
+        </extension>
         <extension name="EGL_ANDROID_framebuffer_target" supported="egl">
             <require>
                 <enum name="EGL_FRAMEBUFFER_TARGET_ANDROID"/>
             </require>
         </extension>
+        <extension name="EGL_ANDROID_front_buffer_auto_refresh" supported="egl">
+            <require>
+                <enum name="EGL_FRONT_BUFFER_AUTO_REFRESH_ANDROID"/>
+            </require>
+        </extension>
         <extension name="EGL_ANDROID_image_native_buffer" supported="egl">
             <require>
                 <enum name="EGL_NATIVE_BUFFER_ANDROID"/>
@@ -1718,6 +2005,11 @@
                 <command name="eglDupNativeFenceFDANDROID"/>
             </require>
         </extension>
+        <extension name="EGL_ANDROID_presentation_time" supported="egl">
+            <require>
+                <command name="eglPresentationTimeANDROID"/>
+            </require>
+        </extension>
         <extension name="EGL_ANDROID_recordable" supported="egl">
             <require>
                 <enum name="EGL_RECORDABLE_ANDROID"/>
@@ -1749,6 +2041,11 @@
                 <enum name="EGL_FIXED_SIZE_ANGLE"/>
             </require>
         </extension>
+        <extension name="EGL_ARM_implicit_external_sync" supported="egl">
+            <require>
+                <enum name="EGL_SYNC_PRIOR_COMMANDS_IMPLICIT_EXTERNAL_ARM"/>
+            </require>
+        </extension>
         <extension name="EGL_ARM_pixmap_multisample_discard" supported="egl">
             <require>
                 <enum name="EGL_DISCARD_SAMPLES_ARM"/>
@@ -1804,6 +2101,36 @@
                 <command name="eglQueryDisplayAttribEXT"/>
             </require>
         </extension>
+        <extension name="EGL_EXT_gl_colorspace_bt2020_linear" supported="egl">
+            <require>
+                <enum name="EGL_GL_COLORSPACE_BT2020_LINEAR_EXT"/>
+            </require>
+        </extension>
+        <extension name="EGL_EXT_gl_colorspace_bt2020_pq" supported="egl">
+            <require>
+                <enum name="EGL_GL_COLORSPACE_BT2020_PQ_EXT"/>
+            </require>
+        </extension>
+        <extension name="EGL_EXT_gl_colorspace_scrgb" supported="egl">
+            <require>
+                <enum name="EGL_GL_COLORSPACE_SCRGB_EXT"/>
+            </require>
+        </extension>
+        <extension name="EGL_EXT_gl_colorspace_scrgb_linear" supported="egl">
+            <require>
+                <enum name="EGL_GL_COLORSPACE_SCRGB_LINEAR_EXT"/>
+            </require>
+        </extension>
+        <extension name="EGL_EXT_gl_colorspace_display_p3_linear" supported="egl">
+            <require>
+                <enum name="EGL_GL_COLORSPACE_DISPLAY_P3_LINEAR_EXT"/>
+            </require>
+        </extension>
+        <extension name="EGL_EXT_gl_colorspace_display_p3" supported="egl">
+            <require>
+                <enum name="EGL_GL_COLORSPACE_DISPLAY_P3_EXT"/>
+            </require>
+        </extension>
         <extension name="EGL_EXT_image_dma_buf_import" supported="egl">
             <require>
                 <enum name="EGL_LINUX_DMA_BUF_EXT"/>
@@ -1830,6 +2157,23 @@
                 <enum name="EGL_YUV_CHROMA_SITING_0_5_EXT"/>
             </require>
         </extension>
+        <extension name="EGL_EXT_image_dma_buf_import_modifiers" supported="egl">
+            <require>
+                <enum name="EGL_DMA_BUF_PLANE3_FD_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE3_OFFSET_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE3_PITCH_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE3_MODIFIER_LO_EXT"/>
+                <enum name="EGL_DMA_BUF_PLANE3_MODIFIER_HI_EXT"/>
+                <command name="eglQueryDmaBufFormatsEXT"/>
+                <command name="eglQueryDmaBufModifiersEXT"/>
+            </require>
+        </extension>
         <extension name="EGL_EXT_multiview_window" supported="egl">
             <require>
                 <enum name="EGL_MULTIVIEW_VIEW_COUNT_EXT"/>
@@ -1867,6 +2211,13 @@
                 <enum name="EGL_OPENWF_PORT_ID_EXT"/>
             </require>
         </extension>
+        <extension name="EGL_EXT_pixel_format_float" supported="egl">
+            <require>
+                <enum name="EGL_COLOR_COMPONENT_TYPE_EXT"/>
+                <enum name="EGL_COLOR_COMPONENT_TYPE_FIXED_EXT"/>
+                <enum name="EGL_COLOR_COMPONENT_TYPE_FLOAT_EXT"/>
+            </require>
+        </extension>
         <extension name="EGL_EXT_platform_base" supported="egl">
             <require>
                 <command name="eglGetPlatformDisplayEXT"/>
@@ -1890,6 +2241,11 @@
                 <enum name="EGL_PLATFORM_X11_SCREEN_EXT"/>
             </require>
         </extension>
+        <extension name="EGL_EXT_protected_content" supported="egl">
+            <require>
+                <enum name="EGL_PROTECTED_CONTENT_EXT"/>
+            </require>
+        </extension>
         <extension name="EGL_EXT_protected_surface" supported="egl">
             <require>
                 <enum name="EGL_PROTECTED_CONTENT_EXT"/>
@@ -1900,6 +2256,21 @@
                 <command name="eglStreamConsumerOutputEXT"/>
             </require>
         </extension>
+        <extension name="EGL_EXT_surface_SMPTE2086_metadata" supported="egl">
+            <require>
+                <enum name="EGL_SMPTE2086_DISPLAY_PRIMARY_RX_EXT"/>
+                <enum name="EGL_SMPTE2086_DISPLAY_PRIMARY_RY_EXT"/>
+                <enum name="EGL_SMPTE2086_DISPLAY_PRIMARY_GX_EXT"/>
+                <enum name="EGL_SMPTE2086_DISPLAY_PRIMARY_GY_EXT"/>
+                <enum name="EGL_SMPTE2086_DISPLAY_PRIMARY_BX_EXT"/>
+                <enum name="EGL_SMPTE2086_DISPLAY_PRIMARY_BY_EXT"/>
+                <enum name="EGL_SMPTE2086_WHITE_POINT_X_EXT"/>
+                <enum name="EGL_SMPTE2086_WHITE_POINT_Y_EXT"/>
+                <enum name="EGL_SMPTE2086_MAX_LUMINANCE_EXT"/>
+                <enum name="EGL_SMPTE2086_MIN_LUMINANCE_EXT"/>
+                <enum name="EGL_METADATA_SCALING_EXT"/>
+            </require>
+        </extension>
         <extension name="EGL_EXT_swap_buffers_with_damage" supported="egl">
             <require>
                 <command name="eglSwapBuffersWithDamageEXT"/>
@@ -1956,6 +2327,12 @@
                 <enum name="EGL_CONTEXT_PRIORITY_LOW_IMG"/>
             </require>
         </extension>
+        <extension name="EGL_IMG_image_plane_attribs" supported="egl">
+            <require>
+                <enum name="EGL_NATIVE_BUFFER_MULTIPLANE_SEPARATE_IMG"/>
+                <enum name="EGL_NATIVE_BUFFER_PLANE_OFFSET_IMG"/>
+            </require>
+        </extension>
         <extension name="EGL_KHR_cl_event" supported="egl">
             <require>
                 <enum name="EGL_CL_EVENT_HANDLE_KHR"/>
@@ -1979,6 +2356,13 @@
             </require>
         </extension>
         <extension name="EGL_KHR_client_get_all_proc_addresses" supported="egl" comment="Alias of EGL_KHR_get_all_proc_addresses"/>
+        <extension name="EGL_KHR_context_flush_control" supported="egl">
+            <require>
+                <enum name="EGL_CONTEXT_RELEASE_BEHAVIOR_NONE_KHR"/>
+                <enum name="EGL_CONTEXT_RELEASE_BEHAVIOR_KHR"/>
+                <enum name="EGL_CONTEXT_RELEASE_BEHAVIOR_FLUSH_KHR"/>
+            </require>
+        </extension>
         <extension name="EGL_KHR_create_context" supported="egl">
             <require>
                 <enum name="EGL_CONTEXT_MAJOR_VERSION_KHR"/>
@@ -2024,6 +2408,12 @@
                 <command name="eglLabelObjectKHR"/>
             </require>
         </extension>
+        <extension name="EGL_KHR_display_reference" supported="egl">
+            <require>
+                <enum name="EGL_TRACK_REFERENCES_KHR"/>
+                <command name="eglQueryDisplayAttribKHR"/>
+            </require>
+        </extension>
         <extension name="EGL_KHR_fence_sync" protect="KHRONOS_SUPPORT_INT64" supported="egl">
             <require>
                 <!-- Most interfaces also defined by EGL_KHR_reusable sync -->
@@ -2153,6 +2543,16 @@
                 <command name="eglQuerySurface64KHR"/>
             </require>
         </extension>
+        <extension name="EGL_KHR_mutable_render_buffer" supported="egl">
+            <require>
+                <enum name="EGL_MUTABLE_RENDER_BUFFER_BIT_KHR"/>
+            </require>
+        </extension>
+        <extension name="EGL_KHR_no_config_context" supported="egl">
+            <require>
+                <enum name="EGL_NO_CONFIG_KHR"/>
+            </require>
+        </extension>
         <extension name="EGL_KHR_partial_update" supported="egl">
             <require>
                 <enum name="EGL_BUFFER_AGE_KHR"/>
@@ -2221,6 +2621,19 @@
                 <command name="eglQueryStreamu64KHR"/>
             </require>
         </extension>
+        <extension name="EGL_KHR_stream_attrib" protect="KHRONOS_SUPPORT_INT64" supported="egl">
+            <require>
+                <enum name="EGL_CONSUMER_LATENCY_USEC_KHR"/>
+                <enum name="EGL_STREAM_STATE_KHR"/>
+                <enum name="EGL_STREAM_STATE_CREATED_KHR"/>
+                <enum name="EGL_STREAM_STATE_CONNECTING_KHR"/>
+                <command name="eglCreateStreamAttribKHR"/>
+                <command name="eglSetStreamAttribKHR"/>
+                <command name="eglQueryStreamAttribKHR"/>
+                <command name="eglStreamConsumerAcquireAttribKHR"/>
+                <command name="eglStreamConsumerReleaseAttribKHR"/>
+            </require>
+        </extension>
         <extension name="EGL_KHR_stream_consumer_gltexture" protect="EGL_KHR_stream" supported="egl">
             <require>
                 <enum name="EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR"/>
@@ -2293,6 +2706,11 @@
                 <enum name="EGL_PLATFORM_GBM_MESA"/>
             </require>
         </extension>
+        <extension name="EGL_MESA_platform_surfaceless" supported="egl">
+            <require>
+                <enum name="EGL_PLATFORM_SURFACELESS_MESA"/>
+            </require>
+        </extension>
         <extension name="EGL_NOK_swap_region" supported="egl">
             <require>
                 <command name="eglSwapBuffersRegionNOK"/>
@@ -2362,6 +2780,122 @@
                 <command name="eglPostSubBufferNV"/>
             </require>
         </extension>
+        <extension name="EGL_NV_robustness_video_memory_purge" supported="egl">
+            <require>
+                <enum name="EGL_GENERATE_RESET_ON_VIDEO_MEMORY_PURGE_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_consumer_gltexture_yuv" supported="egl">
+            <require>
+                <enum name="EGL_YUV_PLANE0_TEXTURE_UNIT_NV"/>
+                <enum name="EGL_YUV_PLANE1_TEXTURE_UNIT_NV"/>
+                <enum name="EGL_YUV_PLANE2_TEXTURE_UNIT_NV"/>
+                <enum name="EGL_YUV_NUMBER_OF_PLANES_EXT"/>
+                <enum name="EGL_YUV_BUFFER_EXT"/>
+                <command name="eglStreamConsumerGLTextureExternalAttribsNV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_cross_object" supported="egl">
+            <require>
+                <enum name="EGL_STREAM_CROSS_OBJECT_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_cross_display" supported="egl">
+            <require>
+                <enum name="EGL_STREAM_CROSS_DISPLAY_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_cross_partition" supported="egl">
+            <require>
+                <enum name="EGL_STREAM_CROSS_PARTITION_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_cross_process" supported="egl">
+            <require>
+                <enum name="EGL_STREAM_CROSS_PROCESS_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_cross_system" supported="egl">
+            <require>
+                <enum name="EGL_STREAM_CROSS_SYSTEM_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_fifo_next" supported="egl">
+            <require>
+                <enum name="EGL_PENDING_FRAME_NV"/>
+                <enum name="EGL_STREAM_TIME_PENDING_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_fifo_synchronous" supported="egl">
+            <require>
+                <enum name="EGL_STREAM_FIFO_SYNCHRONOUS_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_frame_limits" supported="egl">
+            <require>
+                <enum name="EGL_PRODUCER_MAX_FRAME_HINT_NV"/>
+                <enum name="EGL_CONSUMER_MAX_FRAME_HINT_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_metadata" supported="egl">
+            <require>
+                <enum name="EGL_MAX_STREAM_METADATA_BLOCKS_NV"/>
+                <enum name="EGL_MAX_STREAM_METADATA_BLOCK_SIZE_NV"/>
+                <enum name="EGL_MAX_STREAM_METADATA_TOTAL_SIZE_NV"/>
+                <enum name="EGL_PRODUCER_METADATA_NV"/>
+                <enum name="EGL_CONSUMER_METADATA_NV"/>
+                <enum name="EGL_PENDING_METADATA_NV"/>
+                <enum name="EGL_METADATA0_SIZE_NV"/>
+                <enum name="EGL_METADATA1_SIZE_NV"/>
+                <enum name="EGL_METADATA2_SIZE_NV"/>
+                <enum name="EGL_METADATA3_SIZE_NV"/>
+                <enum name="EGL_METADATA0_TYPE_NV"/>
+                <enum name="EGL_METADATA1_TYPE_NV"/>
+                <enum name="EGL_METADATA2_TYPE_NV"/>
+                <enum name="EGL_METADATA3_TYPE_NV"/>
+                <command name="eglQueryDisplayAttribNV"/>
+                <command name="eglSetStreamMetadataNV"/>
+                <command name="eglQueryStreamMetadataNV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_reset" supported="egl">
+            <require>
+                <enum name="EGL_SUPPORT_RESET_NV"/>
+                <enum name="EGL_SUPPORT_REUSE_NV"/>
+                <command name="eglResetStreamNV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_remote" supported="egl">
+            <require>
+                <enum name="EGL_STREAM_STATE_INITIALIZING_NV"/>
+                <enum name="EGL_STREAM_TYPE_NV"/>
+                <enum name="EGL_STREAM_PROTOCOL_NV"/>
+                <enum name="EGL_STREAM_ENDPOINT_NV"/>
+                <enum name="EGL_STREAM_LOCAL_NV"/>
+                <enum name="EGL_STREAM_PRODUCER_NV"/>
+                <enum name="EGL_STREAM_CONSUMER_NV"/>
+            </require>
+            <require comment="Supported only if EGL_KHR_stream_cross_process_fd is supported">
+                <enum name="EGL_STREAM_PROTOCOL_FD_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_socket" supported="egl">
+            <require>
+                <enum name="EGL_STREAM_PROTOCOL_SOCKET_NV"/>
+                <enum name="EGL_SOCKET_HANDLE_NV"/>
+                <enum name="EGL_SOCKET_TYPE_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_socket_inet" supported="egl">
+            <require>
+                <enum name="EGL_SOCKET_TYPE_INET_NV"/>
+            </require>
+        </extension>
+        <extension name="EGL_NV_stream_socket_unix" supported="egl">
+            <require>
+                <enum name="EGL_SOCKET_TYPE_UNIX_NV"/>
+            </require>
+        </extension>
         <extension name="EGL_NV_stream_sync" supported="egl">
             <require>
                 <enum name="EGL_SYNC_TYPE_KHR"/>
@@ -2408,5 +2942,39 @@
                 <enum name="EGL_NATIVE_SURFACE_TIZEN"/>
             </require>
         </extension>
+        <extension name="EGL_EXT_compositor" supported="egl">
+            <require>
+                <enum name="EGL_PRIMARY_COMPOSITOR_CONTEXT_EXT"/>
+                <enum name="EGL_EXTERNAL_REF_ID_EXT"/>
+                <enum name="EGL_COMPOSITOR_DROP_NEWEST_FRAME_EXT"/>
+                <enum name="EGL_COMPOSITOR_KEEP_NEWEST_FRAME_EXT"/>
+
+                <command name="eglCompositorSetContextListEXT"/>
+                <command name="eglCompositorSetContextAttributesEXT"/>
+                <command name="eglCompositorSetWindowListEXT"/>
+                <command name="eglCompositorSetWindowAttributesEXT"/>
+                <command name="eglCompositorBindTexWindowEXT"/>
+                <command name="eglCompositorSetSizeEXT"/>
+                <command name="eglCompositorSwapPolicyEXT"/>
+            </require>
+        </extension>
+        <extension name="EGL_EXT_surface_CTA861_3_metadata" supported="egl">
+            <require>
+                <enum name="EGL_CTA861_3_MAX_CONTENT_LIGHT_LEVEL_EXT"/>
+                <enum name="EGL_CTA861_3_MAX_FRAME_AVERAGE_LEVEL_EXT"/>
+            </require>
+        </extension>
+        <extension name="EGL_EXT_image_implicit_sync_control" supported="egl">
+	    <require>
+	        <enum name="EGL_IMPORT_SYNC_TYPE_EXT"/>
+		<enum name="EGL_IMPORT_IMPLICIT_SYNC_EXT"/>
+		<enum name="EGL_IMPORT_EXPLICIT_SYNC_EXT"/>
+	    </require>
+	</extension>
+        <extension name="EGL_EXT_bind_to_front" supported="egl">
+            <require>
+                <enum name="EGL_FRONT_BUFFER_EXT"/>
+            </require>
+        </extension>
     </extensions>
 </registry>
diff --git a/src/egl/generate/eglFunctionList.py b/src/egl/generate/eglFunctionList.py
index 80cb834..fb5b3c3 100644
--- a/src/egl/generate/eglFunctionList.py
+++ b/src/egl/generate/eglFunctionList.py
@@ -195,5 +195,9 @@
 
     # EGL_ANDROID_native_fence_sync
     _eglFunc("eglDupNativeFenceFDANDROID",           "display"),
+
+    # EGL_EXT_image_dma_buf_import_modifiers
+    _eglFunc("eglQueryDmaBufFormatsEXT",             "display"),
+    _eglFunc("eglQueryDmaBufModifiersEXT",           "display"),
 )
 
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 08faa78..16ae741 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -89,7 +89,7 @@
 #include "c99_compat.h"
 #include "c11/threads.h"
 #include "GL/mesa_glinterop.h"
-#include "eglcompiler.h"
+#include "util/macros.h"
 
 #include "eglglobals.h"
 #include "eglcontext.h"
@@ -488,11 +488,13 @@
    _EGL_CHECK_EXTENSION(EXT_buffer_age);
    _EGL_CHECK_EXTENSION(EXT_create_context_robustness);
    _EGL_CHECK_EXTENSION(EXT_image_dma_buf_import);
+   _EGL_CHECK_EXTENSION(EXT_image_dma_buf_import_modifiers);
    _EGL_CHECK_EXTENSION(EXT_swap_buffers_with_damage);
 
    _EGL_CHECK_EXTENSION(KHR_cl_event2);
    _EGL_CHECK_EXTENSION(KHR_config_attribs);
    _EGL_CHECK_EXTENSION(KHR_create_context);
+   _EGL_CHECK_EXTENSION(KHR_create_context_no_error);
    _EGL_CHECK_EXTENSION(KHR_fence_sync);
    _EGL_CHECK_EXTENSION(KHR_get_all_proc_addresses);
    _EGL_CHECK_EXTENSION(KHR_gl_colorspace);
@@ -505,6 +507,7 @@
    _EGL_CHECK_EXTENSION(KHR_image_base);
    _EGL_CHECK_EXTENSION(KHR_image_pixmap);
    _EGL_CHECK_EXTENSION(KHR_no_config_context);
+   _EGL_CHECK_EXTENSION(KHR_partial_update);
    _EGL_CHECK_EXTENSION(KHR_reusable_sync);
    _EGL_CHECK_EXTENSION(KHR_surfaceless_context);
    if (dpy->Extensions.EXT_swap_buffers_with_damage)
@@ -920,7 +923,7 @@
 _fixupNativeWindow(_EGLDisplay *disp, void *native_window)
 {
 #ifdef HAVE_X11_PLATFORM
-   if (disp->Platform == _EGL_PLATFORM_X11 && native_window != NULL) {
+   if (disp && disp->Platform == _EGL_PLATFORM_X11 && native_window != NULL) {
       /* The `native_window` parameter for the X11 platform differs between
        * eglCreateWindowSurface() and eglCreatePlatformPixmapSurfaceEXT(). In
        * eglCreateWindowSurface(), the type of `native_window` is an Xlib
@@ -982,7 +985,7 @@
        * `Pixmap*`.  Convert `Pixmap*` to `Pixmap` because that's what
        * dri2_x11_create_pixmap_surface() expects.
        */
-   if (disp->Platform == _EGL_PLATFORM_X11 && native_pixmap != NULL)
+   if (disp && disp->Platform == _EGL_PLATFORM_X11 && native_pixmap != NULL)
       return (void *)(* (Pixmap*) native_pixmap);
 #endif
    return native_pixmap;
@@ -1234,6 +1237,15 @@
 
    ret = drv->API.SwapBuffers(drv, disp, surf);
 
+   /* EGL_KHR_partial_update
+    * Frame boundary successfully reached,
+    * reset damage region and reset BufferAgeRead
+    */
+   if (ret) {
+      surf->SetDamageRegionCalled = EGL_FALSE;
+      surf->BufferAgeRead = EGL_FALSE;
+   }
+
    RETURN_EGL_EVAL(disp, ret);
 }
 
@@ -1258,6 +1270,15 @@
 
    ret = drv->API.SwapBuffersWithDamageEXT(drv, disp, surf, rects, n_rects);
 
+   /* EGL_KHR_partial_update
+    * Frame boundary successfully reached,
+    * reset damage region and reset BufferAgeRead
+    */
+   if (ret) {
+      surf->SetDamageRegionCalled = EGL_FALSE;
+      surf->BufferAgeRead = EGL_FALSE;
+   }
+
    RETURN_EGL_EVAL(disp, ret);
 }
 
@@ -1281,6 +1302,70 @@
    return _eglSwapBuffersWithDamageCommon(disp, surf, rects, n_rects);
 }
 
+/**
+ * Clamp every damage rect (x, y, width, height) to the surface bounds.
+ * The origin is clamped to [0, size] and the extent is trimmed so that
+ * the rect never extends past the surface nor gets a negative size.
+ */
+
+static void
+_eglSetDamageRegionKHRClampRects(_EGLDisplay* disp, _EGLSurface* surf,
+                                 EGLint *rects, EGLint n_rects)
+{
+   /* upper bounds, indexed like the rect fields: 0 = x/width, 1 = y/height */
+   const EGLint limit[2] = { surf->Width, surf->Height };
+   EGLint i, axis;
+
+   for (i = 0; i < (4 * n_rects); i += 4) {
+      for (axis = 0; axis < 2; axis++) {
+         /* wide arithmetic: origin + extent may not fit in an EGLint */
+         long long lo = rects[i + axis];
+         long long hi = lo + rects[i + axis + 2];
+
+         /* clamp the origin first, then the far edge against it */
+         lo = lo < 0 ? 0 : (lo > limit[axis] ? limit[axis] : lo);
+         hi = hi < lo ? lo : (hi > limit[axis] ? limit[axis] : hi);
+         rects[i + axis] = (EGLint) lo;
+         rects[i + axis + 2] = (EGLint) (hi - lo);
+      }
+   }
+}
+
+static EGLBoolean EGLAPIENTRY
+eglSetDamageRegionKHR(EGLDisplay dpy, EGLSurface surface,
+                      EGLint *rects, EGLint n_rects)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLSurface *surf = _eglLookupSurface(surface, disp);
+   _EGL_FUNC_START(disp, EGL_OBJECT_SURFACE_KHR, surf, EGL_FALSE);
+   _EGLContext *ctx = _eglGetCurrentContext();
+   _EGLDriver *drv;
+   EGLBoolean ret;
+   _EGL_CHECK_SURFACE(disp, surf, EGL_FALSE, drv);
+   /* surf must be the current draw window with EGL_BUFFER_DESTROYED swaps */
+   if (_eglGetContextHandle(ctx) == EGL_NO_CONTEXT ||
+       surf->Type != EGL_WINDOW_BIT ||
+       ctx->DrawSurface != surf ||
+       surf->SwapBehavior != EGL_BUFFER_DESTROYED)
+      RETURN_EGL_ERROR(disp, EGL_BAD_MATCH, EGL_FALSE);
+
+   /* Setting the damage region a second time within one frame, or
+    * setting it before the buffer age has been read in that frame, is
+    * EGL_BAD_ACCESS. A successful buffer swap clears both flags again.
+    */
+
+   if (surf->SetDamageRegionCalled || !surf->BufferAgeRead)
+      RETURN_EGL_ERROR(disp, EGL_BAD_ACCESS, EGL_FALSE);
+   /* Clamp rects in place, then hand the region to the driver hook */
+   _eglSetDamageRegionKHRClampRects(disp, surf, rects, n_rects);
+   ret = drv->API.SetDamageRegion(drv, disp, surf, rects, n_rects);
+   /* Record the call only when the driver accepted the region */
+   if (ret)
+      surf->SetDamageRegionCalled = EGL_TRUE;
+
+   RETURN_EGL_EVAL(disp, ret);
+}
+
 EGLBoolean EGLAPIENTRY
 eglCopyBuffers(EGLDisplay dpy, EGLSurface surface, EGLNativePixmapType target)
 {
@@ -2132,7 +2217,8 @@
    _EGL_FUNC_START(disp, EGL_OBJECT_DISPLAY_KHR, NULL, EGL_FALSE);
 
    _EGL_CHECK_DISPLAY(disp, NULL, drv);
-   assert(disp->Extensions.WL_create_wayland_buffer_from_image);
+   if (!disp->Extensions.WL_create_wayland_buffer_from_image)
+      RETURN_EGL_EVAL(disp, NULL);
 
    img = _eglLookupImage(image, disp);
 
@@ -2385,6 +2471,44 @@
    return strcmp(procname, entrypoint->name);
 }
 
+static EGLBoolean EGLAPIENTRY
+eglQueryDmaBufFormatsEXT(EGLDisplay dpy, EGLint max_formats,
+                         EGLint *formats, EGLint *num_formats)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLDriver *drv;
+   EGLBoolean ret;
+   /* NOTE(review): siblings pass the locked disp here, not NULL -- confirm */
+   _EGL_FUNC_START(NULL, EGL_NONE, NULL, EGL_FALSE);
+
+   _EGL_CHECK_DISPLAY(disp, EGL_FALSE, drv);
+   /* Forward the query to the driver hook with the arguments unchanged */
+   ret = drv->API.QueryDmaBufFormatsEXT(drv, disp, max_formats, formats,
+                                        num_formats);
+
+   RETURN_EGL_EVAL(disp, ret);
+}
+
+static EGLBoolean EGLAPIENTRY
+eglQueryDmaBufModifiersEXT(EGLDisplay dpy, EGLint format, EGLint max_modifiers,
+                           EGLuint64KHR *modifiers, EGLBoolean *external_only,
+                           EGLint *num_modifiers)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLDriver *drv;
+   EGLBoolean ret;
+   /* NOTE(review): siblings pass the locked disp here, not NULL -- confirm */
+   _EGL_FUNC_START(NULL, EGL_NONE, NULL, EGL_FALSE);
+
+   _EGL_CHECK_DISPLAY(disp, EGL_FALSE, drv);
+   /* Forward the query to the driver hook with the arguments unchanged */
+   ret = drv->API.QueryDmaBufModifiersEXT(drv, disp, format, max_modifiers,
+                                          modifiers, external_only,
+                                          num_modifiers);
+
+   RETURN_EGL_EVAL(disp, ret);
+}
+
 __eglMustCastToProperFunctionPointerType EGLAPIENTRY
 eglGetProcAddress(const char *procname)
 {
diff --git a/src/egl/main/eglapi.h b/src/egl/main/eglapi.h
index 710c5d8..852a345 100644
--- a/src/egl/main/eglapi.h
+++ b/src/egl/main/eglapi.h
@@ -110,6 +110,8 @@
                              _EGLSurface *draw);
    EGLBoolean (*CopyBuffers)(_EGLDriver *drv, _EGLDisplay *dpy,
                              _EGLSurface *surface, void *native_pixmap_target);
+   EGLBoolean (*SetDamageRegion)(_EGLDriver *drv, _EGLDisplay *dpy,
+                                 _EGLSurface *surface, EGLint *rects, EGLint n_rects);
 
    /* misc functions */
    EGLBoolean (*WaitClient)(_EGLDriver *drv, _EGLDisplay *dpy,
@@ -198,6 +200,15 @@
    int (*GLInteropExportObject)(_EGLDisplay *dpy, _EGLContext *ctx,
                                 struct mesa_glinterop_export_in *in,
                                 struct mesa_glinterop_export_out *out);
+
+   EGLBoolean (*QueryDmaBufFormatsEXT)(_EGLDriver *drv, _EGLDisplay *dpy,
+                                       EGLint max_formats, EGLint *formats,
+                                       EGLint *num_formats);
+   EGLBoolean (*QueryDmaBufModifiersEXT) (_EGLDriver *drv, _EGLDisplay *dpy,
+                                          EGLint format, EGLint max_modifiers,
+                                          EGLuint64KHR *modifiers,
+                                          EGLBoolean *external_only,
+                                          EGLint *num_modifiers);
 };
 
 #ifdef __cplusplus
diff --git a/src/egl/main/eglcompiler.h b/src/egl/main/eglcompiler.h
deleted file mode 100644
index 9804ca4..0000000
--- a/src/egl/main/eglcompiler.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009-2010 Chia-I Wu <olvaffe@gmail.com>
- * Copyright 2010 LunarG, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifndef EGLCOMPILER_INCLUDED
-#define EGLCOMPILER_INCLUDED
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define STATIC_ASSERT(COND) \
-   do { \
-      (void) sizeof(char [1 - 2*!(COND)]); \
-   } while (0)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* EGLCOMPILER_INCLUDED */
diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index 876c8f0..f747c33 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -37,8 +37,8 @@
 #include <string.h>
 #include <assert.h>
 #include "c99_compat.h"
+#include "util/macros.h"
 
-#include "eglcompiler.h"
 #include "eglconfig.h"
 #include "egldisplay.h"
 #include "eglcurrent.h"
diff --git a/src/egl/main/eglcontext.c b/src/egl/main/eglcontext.c
index 05cc523..1b03160 100644
--- a/src/egl/main/eglcontext.c
+++ b/src/egl/main/eglcontext.c
@@ -312,6 +312,26 @@
             ctx->Flags |= EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE_BIT_KHR;
          break;
 
+      case EGL_CONTEXT_OPENGL_NO_ERROR_KHR:
+         if (dpy->Version < 14 ||
+             !dpy->Extensions.KHR_create_context_no_error) {
+            err = EGL_BAD_ATTRIBUTE;
+            break;
+         }
+
+         /* The KHR_no_error spec only applies against OpenGL 2.0+ and
+          * OpenGL ES 2.0+
+          */
+         if ((api != EGL_OPENGL_API && api != EGL_OPENGL_ES_API) ||
+             ctx->ClientMajorVersion < 2) {
+            err = EGL_BAD_ATTRIBUTE;
+            break;
+         }
+
+         /* Canonicalize value to EGL_TRUE/EGL_FALSE definitions */
+         ctx->NoError = !!val;
+         break;
+
       default:
          err = EGL_BAD_ATTRIBUTE;
          break;
@@ -458,6 +478,16 @@
       break;
    }
 
+   /* The EGL_KHR_create_context_no_error spec says:
+    *
+    *    "BAD_MATCH is generated if the EGL_CONTEXT_OPENGL_NO_ERROR_KHR is TRUE at
+    *    the same time as a debug or robustness context is specified."
+    */
+   if (ctx->NoError && (ctx->Flags & EGL_CONTEXT_OPENGL_DEBUG_BIT_KHR ||
+                        ctx->Flags & EGL_CONTEXT_OPENGL_ROBUST_ACCESS_BIT_KHR)) {
+      err = EGL_BAD_MATCH;
+   }
+
    if ((ctx->Flags & ~(EGL_CONTEXT_OPENGL_DEBUG_BIT_KHR
                       | EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE_BIT_KHR
                       | EGL_CONTEXT_OPENGL_ROBUST_ACCESS_BIT_KHR)) != 0) {
@@ -489,10 +519,8 @@
    const EGLenum api = eglQueryAPI();
    EGLint err;
 
-   if (api == EGL_NONE) {
-      _eglError(EGL_BAD_MATCH, "eglCreateContext(no client API)");
-      return EGL_FALSE;
-   }
+   if (api == EGL_NONE)
+      return _eglError(EGL_BAD_MATCH, "eglCreateContext(no client API)");
 
    _eglInitResource(&ctx->Resource, sizeof(*ctx), dpy);
    ctx->ClientAPI = api;
diff --git a/src/egl/main/eglcontext.h b/src/egl/main/eglcontext.h
index f2fe806..0667622 100644
--- a/src/egl/main/eglcontext.h
+++ b/src/egl/main/eglcontext.h
@@ -62,6 +62,7 @@
    EGLint Flags;
    EGLint Profile;
    EGLint ResetNotificationStrategy;
+   EGLBoolean NoError;
 
    /* The real render buffer when a window surface is bound */
    EGLint WindowRenderBuffer;
diff --git a/src/egl/main/eglcurrent.c b/src/egl/main/eglcurrent.c
index eae7bdc..26f4276 100644
--- a/src/egl/main/eglcurrent.c
+++ b/src/egl/main/eglcurrent.c
@@ -37,12 +37,8 @@
 #include "eglcurrent.h"
 #include "eglglobals.h"
 
-/* This should be kept in sync with _eglInitThreadInfo() */
-#define _EGL_THREAD_INFO_INITIALIZER \
-   { EGL_SUCCESS, NULL, EGL_OPENGL_ES_API, NULL, NULL, NULL }
-
 /* a fallback thread info to guarantee that every thread always has one */
-static _EGLThreadInfo dummy_thread = _EGL_THREAD_INFO_INITIALIZER;
+static _EGLThreadInfo dummy_thread;
 static mtx_t _egl_TSDMutex = _MTX_INITIALIZER_NP;
 static EGLBoolean _egl_TSDInitialized;
 static tss_t _egl_TSD;
@@ -109,7 +105,6 @@
 static void
 _eglInitThreadInfo(_EGLThreadInfo *t)
 {
-   memset(t, 0, sizeof(*t));
    t->LastError = EGL_SUCCESS;
    /* default, per EGL spec */
    t->CurrentAPI = EGL_OPENGL_ES_API;
@@ -123,10 +118,10 @@
 _eglCreateThreadInfo(void)
 {
    _EGLThreadInfo *t = calloc(1, sizeof(_EGLThreadInfo));
-   if (t)
-      _eglInitThreadInfo(t);
-   else
+   if (!t)
       t = &dummy_thread;
+
+   _eglInitThreadInfo(t);
    return t;
 }
 
diff --git a/src/egl/main/egldisplay.c b/src/egl/main/egldisplay.c
index bb5076c..690728d 100644
--- a/src/egl/main/egldisplay.c
+++ b/src/egl/main/egldisplay.c
@@ -49,10 +49,6 @@
 #include "eglsync.h"
 
 /* Includes for _eglNativePlatformDetectNativeDisplay */
-#ifdef HAVE_MINCORE
-#include <unistd.h>
-#include <sys/mman.h>
-#endif
 #ifdef HAVE_WAYLAND_PLATFORM
 #include <wayland-client.h>
 #endif
@@ -106,35 +102,6 @@
 
 
 /**
- * Perform validity checks on a generic pointer.
- */
-static EGLBoolean
-_eglPointerIsDereferencable(void *p)
-{
-#ifdef HAVE_MINCORE
-   uintptr_t addr = (uintptr_t) p;
-   unsigned char valid = 0;
-   const long page_size = getpagesize();
-
-   if (p == NULL)
-      return EGL_FALSE;
-
-   /* align addr to page_size */
-   addr &= ~(page_size - 1);
-
-   if (mincore((void *) addr, page_size, &valid) < 0) {
-      _eglLog(_EGL_DEBUG, "mincore failed: %m");
-      return EGL_FALSE;
-   }
-
-   return (valid & 0x01) == 0x01;
-#else
-   return p != NULL;
-#endif
-}
-
-
-/**
  * Try detecting native platform with the help of native display characteristcs.
  */
 static _EGLPlatformType
@@ -480,7 +447,7 @@
 
 #ifdef HAVE_X11_PLATFORM
 static EGLBoolean
-_eglParseX11DisplayAttribList(const EGLint *attrib_list)
+_eglParseX11DisplayAttribList(_EGLDisplay *display, const EGLint *attrib_list)
 {
    int i;
 
@@ -494,14 +461,11 @@
 
       /* EGL_EXT_platform_x11 recognizes exactly one attribute,
        * EGL_PLATFORM_X11_SCREEN_EXT, which is optional.
-       * 
-       * Mesa supports connecting to only the default screen, so we reject
-       * screen != 0.
        */
-      if (attrib != EGL_PLATFORM_X11_SCREEN_EXT || value != 0) {
-         _eglError(EGL_BAD_ATTRIBUTE, "eglGetPlatformDisplay");
-         return EGL_FALSE;
-      }
+      if (attrib != EGL_PLATFORM_X11_SCREEN_EXT)
+         return _eglError(EGL_BAD_ATTRIBUTE, "eglGetPlatformDisplay");
+
+      display->Options.Platform = (void *)(uintptr_t)value;
    }
 
    return EGL_TRUE;
@@ -511,11 +475,19 @@
 _eglGetX11Display(Display *native_display,
                   const EGLint *attrib_list)
 {
-   if (!_eglParseX11DisplayAttribList(attrib_list)) {
+   _EGLDisplay *display = _eglFindDisplay(_EGL_PLATFORM_X11,
+                                          native_display);
+
+   if (!display) {
+      _eglError(EGL_BAD_ALLOC, "eglGetPlatformDisplay");
       return NULL;
    }
 
-   return _eglFindDisplay(_EGL_PLATFORM_X11, native_display);
+   if (!_eglParseX11DisplayAttribList(display, attrib_list)) {
+      return NULL;
+   }
+
+   return display;
 }
 #endif /* HAVE_X11_PLATFORM */
 
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index 6c1049d..3d5a445 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -102,6 +102,7 @@
    EGLBoolean EXT_buffer_age;
    EGLBoolean EXT_create_context_robustness;
    EGLBoolean EXT_image_dma_buf_import;
+   EGLBoolean EXT_image_dma_buf_import_modifiers;
    EGLBoolean EXT_swap_buffers_with_damage;
 
    EGLBoolean KHR_cl_event2;
@@ -117,9 +118,11 @@
    EGLBoolean KHR_image_base;
    EGLBoolean KHR_image_pixmap;
    EGLBoolean KHR_no_config_context;
+   EGLBoolean KHR_partial_update;
    EGLBoolean KHR_reusable_sync;
    EGLBoolean KHR_surfaceless_context;
    EGLBoolean KHR_wait_sync;
+   EGLBoolean KHR_create_context_no_error;
 
    EGLBoolean MESA_drm_image;
    EGLBoolean MESA_image_dma_buf_export;
@@ -151,6 +154,7 @@
    struct {
       EGLBoolean TestOnly;    /**< Driver should not set fields when true */
       EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
+      void *Platform;         /**< Platform-specific options */
    } Options;
 
    /* these fields are set by the driver during init */
diff --git a/src/egl/main/eglentrypoint.h b/src/egl/main/eglentrypoint.h
index e6318b9..f7fe774 100644
--- a/src/egl/main/eglentrypoint.h
+++ b/src/egl/main/eglentrypoint.h
@@ -56,11 +56,14 @@
 EGL_ENTRYPOINT(eglQueryAPI)
 EGL_ENTRYPOINT(eglQueryContext)
 EGL_ENTRYPOINT(eglQueryDebugKHR)
+EGL_ENTRYPOINT(eglQueryDmaBufFormatsEXT)
+EGL_ENTRYPOINT(eglQueryDmaBufModifiersEXT)
 EGL_ENTRYPOINT(eglQueryString)
 EGL_ENTRYPOINT(eglQuerySurface)
 EGL_ENTRYPOINT(eglQueryWaylandBufferWL)
 EGL_ENTRYPOINT(eglReleaseTexImage)
 EGL_ENTRYPOINT(eglReleaseThread)
+EGL_ENTRYPOINT(eglSetDamageRegionKHR)
 EGL_ENTRYPOINT(eglSignalSyncKHR)
 EGL_ENTRYPOINT(eglSurfaceAttrib)
 EGL_ENTRYPOINT(eglSwapBuffers)
diff --git a/src/egl/main/eglfallbacks.c b/src/egl/main/eglfallbacks.c
index 017d337..1575ab5 100644
--- a/src/egl/main/eglfallbacks.c
+++ b/src/egl/main/eglfallbacks.c
@@ -77,6 +77,7 @@
    drv->API.ReleaseTexImage = (void*) _eglReturnFalse;
    drv->API.CopyBuffers = (void*) _eglReturnFalse;
    drv->API.SwapBuffers = (void*) _eglReturnFalse;
+   drv->API.SetDamageRegion = (void*) _eglReturnFalse;
    drv->API.SwapInterval = _eglSwapInterval;
 
    drv->API.WaitClient = (void*) _eglReturnFalse;
diff --git a/src/egl/main/eglglobals.c b/src/egl/main/eglglobals.c
index baf96bb..9071226 100644
--- a/src/egl/main/eglglobals.c
+++ b/src/egl/main/eglglobals.c
@@ -37,6 +37,12 @@
 #include "eglglobals.h"
 #include "egldisplay.h"
 #include "egldriver.h"
+#include "egllog.h"
+
+#ifdef HAVE_MINCORE
+#include <unistd.h>
+#include <sys/mman.h>
+#endif
 
 
 static mtx_t _eglGlobalMutex = _MTX_INITIALIZER_NP;
@@ -142,3 +148,39 @@
    mtx_unlock(_eglGlobal.Mutex);
    return ret;
 }
+
+EGLBoolean
+_eglPointerIsDereferencable(void *p)
+{
+#ifdef HAVE_MINCORE
+   uintptr_t addr = (uintptr_t) p;
+   unsigned char valid = 0;
+   const long page_size = getpagesize();
+
+   if (p == NULL)
+      return EGL_FALSE;
+
+   /* align addr to page_size */
+   addr &= ~(page_size - 1);
+
+   if (mincore((void *) addr, page_size, &valid) < 0) {
+      _eglLog(_EGL_DEBUG, "mincore failed: %m");
+      return EGL_FALSE;
+   }
+
+   /* mincore() returns 0 on success, and -1 on failure.  The last parameter
+    * is a vector of bytes with one entry for each page queried.  mincore
+    * returns page residency information in the first bit of each byte in the
+    * vector.
+    *
+    * Residency doesn't actually matter when determining whether a pointer is
+    * dereferenceable, so the output vector can be ignored.  What matters is
+    * whether mincore succeeds. See:
+    *
+    *   http://man7.org/linux/man-pages/man2/mincore.2.html
+    */
+   return EGL_TRUE;
+#else
+   return p != NULL;
+#endif
+}
diff --git a/src/egl/main/eglglobals.h b/src/egl/main/eglglobals.h
index c6ef59d..6655cca 100644
--- a/src/egl/main/eglglobals.h
+++ b/src/egl/main/eglglobals.h
@@ -87,4 +87,10 @@
 extern const char *
 _eglGetClientExtensionString(void);
 
+/**
+ * Perform validity checks on a generic pointer.
+ */
+extern EGLBoolean
+_eglPointerIsDereferencable(void *p);
+
 #endif /* EGLGLOBALS_INCLUDED */
diff --git a/src/egl/main/eglimage.c b/src/egl/main/eglimage.c
index d062cbf..72a556e 100644
--- a/src/egl/main/eglimage.c
+++ b/src/egl/main/eglimage.c
@@ -30,161 +30,291 @@
 #include <assert.h>
 #include <string.h>
 
+#include "eglcurrent.h"
 #include "eglimage.h"
 #include "egllog.h"
 
+static EGLint
+_eglParseKHRImageAttribs(_EGLImageAttribs *attrs, _EGLDisplay *dpy,
+                         EGLint attr, EGLint val)
+{
+   switch (attr) {
+   case EGL_IMAGE_PRESERVED_KHR:
+      if (!dpy->Extensions.KHR_image_base)
+          return EGL_BAD_PARAMETER;
+
+      attrs->ImagePreserved = val;
+      break;
+
+   case EGL_GL_TEXTURE_LEVEL_KHR:
+      if (!dpy->Extensions.KHR_gl_texture_2D_image)
+         return EGL_BAD_PARAMETER;
+
+      attrs->GLTextureLevel = val;
+      break;
+   case EGL_GL_TEXTURE_ZOFFSET_KHR:
+      if (!dpy->Extensions.KHR_gl_texture_3D_image)
+         return EGL_BAD_PARAMETER;
+
+      attrs->GLTextureZOffset = val;
+      break;
+   default:
+      return EGL_BAD_PARAMETER;
+   }
+
+   return EGL_SUCCESS;
+}
+
+static EGLint
+_eglParseMESADrmImageAttribs(_EGLImageAttribs *attrs, _EGLDisplay *dpy,
+                             EGLint attr, EGLint val)
+{
+   if (!dpy->Extensions.MESA_drm_image)
+      return EGL_BAD_PARAMETER;
+
+   switch (attr) {
+   case EGL_WIDTH:
+      attrs->Width = val;
+      break;
+   case EGL_HEIGHT:
+      attrs->Height = val;
+      break;
+   case EGL_DRM_BUFFER_FORMAT_MESA:
+      attrs->DRMBufferFormatMESA = val;
+      break;
+   case EGL_DRM_BUFFER_USE_MESA:
+      attrs->DRMBufferUseMESA = val;
+      break;
+   case EGL_DRM_BUFFER_STRIDE_MESA:
+      attrs->DRMBufferStrideMESA = val;
+      break;
+   default:
+      return EGL_BAD_PARAMETER;
+   }
+
+   return EGL_SUCCESS;
+}
+
+static EGLint
+_eglParseWLBindWaylandDisplayAttribs(_EGLImageAttribs *attrs, _EGLDisplay *dpy,
+                                     EGLint attr, EGLint val)
+{
+   if (!dpy->Extensions.WL_bind_wayland_display)
+      return EGL_BAD_PARAMETER;
+
+   switch (attr) {
+   case EGL_WAYLAND_PLANE_WL:
+      attrs->PlaneWL = val;
+      break;
+   default:
+      return EGL_BAD_PARAMETER;
+   }
+
+   return EGL_SUCCESS;
+}
+
+static EGLint
+_eglParseEXTImageDmaBufImportAttribs(_EGLImageAttribs *attrs, _EGLDisplay *dpy,
+                                     EGLint attr, EGLint val)
+{
+   if (!dpy->Extensions.EXT_image_dma_buf_import)
+      return EGL_BAD_PARAMETER;
+
+   switch (attr) {
+   case EGL_WIDTH:
+      attrs->Width = val;
+      break;
+   case EGL_HEIGHT:
+      attrs->Height = val;
+      break;
+   case EGL_LINUX_DRM_FOURCC_EXT:
+      attrs->DMABufFourCC.Value = val;
+      attrs->DMABufFourCC.IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE0_FD_EXT:
+      attrs->DMABufPlaneFds[0].Value = val;
+      attrs->DMABufPlaneFds[0].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE0_OFFSET_EXT:
+      attrs->DMABufPlaneOffsets[0].Value = val;
+      attrs->DMABufPlaneOffsets[0].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE0_PITCH_EXT:
+      attrs->DMABufPlanePitches[0].Value = val;
+      attrs->DMABufPlanePitches[0].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE1_FD_EXT:
+       attrs->DMABufPlaneFds[1].Value = val;
+       attrs->DMABufPlaneFds[1].IsPresent = EGL_TRUE;
+       break;
+   case EGL_DMA_BUF_PLANE1_OFFSET_EXT:
+      attrs->DMABufPlaneOffsets[1].Value = val;
+      attrs->DMABufPlaneOffsets[1].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE1_PITCH_EXT:
+      attrs->DMABufPlanePitches[1].Value = val;
+      attrs->DMABufPlanePitches[1].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE2_FD_EXT:
+      attrs->DMABufPlaneFds[2].Value = val;
+      attrs->DMABufPlaneFds[2].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE2_OFFSET_EXT:
+      attrs->DMABufPlaneOffsets[2].Value = val;
+      attrs->DMABufPlaneOffsets[2].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE2_PITCH_EXT:
+      attrs->DMABufPlanePitches[2].Value = val;
+      attrs->DMABufPlanePitches[2].IsPresent = EGL_TRUE;
+      break;
+   case EGL_YUV_COLOR_SPACE_HINT_EXT:
+      if (val != EGL_ITU_REC601_EXT && val != EGL_ITU_REC709_EXT &&
+          val != EGL_ITU_REC2020_EXT)
+         return EGL_BAD_ATTRIBUTE;
+
+      attrs->DMABufYuvColorSpaceHint.Value = val;
+      attrs->DMABufYuvColorSpaceHint.IsPresent = EGL_TRUE;
+      break;
+   case EGL_SAMPLE_RANGE_HINT_EXT:
+      if (val != EGL_YUV_FULL_RANGE_EXT && val != EGL_YUV_NARROW_RANGE_EXT)
+         return EGL_BAD_ATTRIBUTE;
+
+      attrs->DMABufSampleRangeHint.Value = val;
+      attrs->DMABufSampleRangeHint.IsPresent = EGL_TRUE;
+      break;
+   case EGL_YUV_CHROMA_HORIZONTAL_SITING_HINT_EXT:
+      if (val != EGL_YUV_CHROMA_SITING_0_EXT &&
+          val != EGL_YUV_CHROMA_SITING_0_5_EXT)
+         return EGL_BAD_ATTRIBUTE;
+
+      attrs->DMABufChromaHorizontalSiting.Value = val;
+      attrs->DMABufChromaHorizontalSiting.IsPresent = EGL_TRUE;
+      break;
+   case EGL_YUV_CHROMA_VERTICAL_SITING_HINT_EXT:
+      if (val != EGL_YUV_CHROMA_SITING_0_EXT &&
+          val != EGL_YUV_CHROMA_SITING_0_5_EXT)
+         return EGL_BAD_ATTRIBUTE;
+
+      attrs->DMABufChromaVerticalSiting.Value = val;
+      attrs->DMABufChromaVerticalSiting.IsPresent = EGL_TRUE;
+      break;
+   default:
+      return EGL_BAD_PARAMETER;
+   }
+
+   return EGL_SUCCESS;
+}
+
+static EGLint
+_eglParseEXTImageDmaBufImportModifiersAttribs(_EGLImageAttribs *attrs,
+                                              _EGLDisplay *dpy,
+                                              EGLint attr, EGLint val)
+{
+   if (!dpy->Extensions.EXT_image_dma_buf_import_modifiers)
+      return EGL_BAD_PARAMETER;
+
+   switch (attr) {
+   case EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT:
+      attrs->DMABufPlaneModifiersLo[0].Value = val;
+      attrs->DMABufPlaneModifiersLo[0].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT:
+      attrs->DMABufPlaneModifiersHi[0].Value = val;
+      attrs->DMABufPlaneModifiersHi[0].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT:
+      attrs->DMABufPlaneModifiersLo[1].Value = val;
+      attrs->DMABufPlaneModifiersLo[1].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT:
+      attrs->DMABufPlaneModifiersHi[1].Value = val;
+      attrs->DMABufPlaneModifiersHi[1].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT:
+      attrs->DMABufPlaneModifiersLo[2].Value = val;
+      attrs->DMABufPlaneModifiersLo[2].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT:
+      attrs->DMABufPlaneModifiersHi[2].Value = val;
+      attrs->DMABufPlaneModifiersHi[2].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE3_FD_EXT:
+      attrs->DMABufPlaneFds[3].Value = val;
+      attrs->DMABufPlaneFds[3].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE3_OFFSET_EXT:
+      attrs->DMABufPlaneOffsets[3].Value = val;
+      attrs->DMABufPlaneOffsets[3].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE3_PITCH_EXT:
+      attrs->DMABufPlanePitches[3].Value = val;
+      attrs->DMABufPlanePitches[3].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE3_MODIFIER_LO_EXT:
+      attrs->DMABufPlaneModifiersLo[3].Value = val;
+      attrs->DMABufPlaneModifiersLo[3].IsPresent = EGL_TRUE;
+      break;
+   case EGL_DMA_BUF_PLANE3_MODIFIER_HI_EXT:
+      attrs->DMABufPlaneModifiersHi[3].Value = val;
+      attrs->DMABufPlaneModifiersHi[3].IsPresent = EGL_TRUE;
+      break;
+   default:
+      return EGL_BAD_PARAMETER;
+   }
+
+   return EGL_SUCCESS;
+}
 
 /**
- * Parse the list of image attributes and return the proper error code.
+ * Parse the list of image attributes.
+ *
+ * Returns EGL_TRUE on success and EGL_FALSE otherwise.
+ * On failure, calls _eglError to set the correct error code.
  */
-EGLint
+EGLBoolean
 _eglParseImageAttribList(_EGLImageAttribs *attrs, _EGLDisplay *dpy,
                          const EGLint *attrib_list)
 {
-   EGLint i, err = EGL_SUCCESS;
-
-   (void) dpy;
+   EGLint i, err;
 
    memset(attrs, 0, sizeof(*attrs));
 
    if (!attrib_list)
-      return err;
+      return EGL_TRUE;
 
    for (i = 0; attrib_list[i] != EGL_NONE; i++) {
       EGLint attr = attrib_list[i++];
       EGLint val = attrib_list[i];
 
-      switch (attr) {
-      /* EGL_KHR_image_base */
-      case EGL_IMAGE_PRESERVED_KHR:
-         attrs->ImagePreserved = val;
-         break;
+      err = _eglParseKHRImageAttribs(attrs, dpy, attr, val);
+      if (err == EGL_SUCCESS)
+          continue;
 
-      /* EGL_KHR_gl_image */
-      case EGL_GL_TEXTURE_LEVEL_KHR:
-         attrs->GLTextureLevel = val;
-         break;
-      case EGL_GL_TEXTURE_ZOFFSET_KHR:
-         attrs->GLTextureZOffset = val;
-         break;
+      err = _eglParseMESADrmImageAttribs(attrs, dpy, attr, val);
+      if (err == EGL_SUCCESS)
+          continue;
 
-      /* EGL_MESA_drm_image */
-      case EGL_WIDTH:
-         attrs->Width = val;
-         break;
-      case EGL_HEIGHT:
-         attrs->Height = val;
-         break;
-      case EGL_DRM_BUFFER_FORMAT_MESA:
-         attrs->DRMBufferFormatMESA = val;
-         break;
-      case EGL_DRM_BUFFER_USE_MESA:
-         attrs->DRMBufferUseMESA = val;
-         break;
-      case EGL_DRM_BUFFER_STRIDE_MESA:
-         attrs->DRMBufferStrideMESA = val;
-         break;
+      err = _eglParseWLBindWaylandDisplayAttribs(attrs, dpy, attr, val);
+      if (err == EGL_SUCCESS)
+          continue;
 
-      /* EGL_WL_bind_wayland_display */
-      case EGL_WAYLAND_PLANE_WL:
-         attrs->PlaneWL = val;
-         break;
+      err = _eglParseEXTImageDmaBufImportAttribs(attrs, dpy, attr, val);
+      if (err == EGL_SUCCESS)
+          continue;
 
-      case EGL_LINUX_DRM_FOURCC_EXT:
-         attrs->DMABufFourCC.Value = val;
-         attrs->DMABufFourCC.IsPresent = EGL_TRUE;
-         break;
-      case EGL_DMA_BUF_PLANE0_FD_EXT:
-         attrs->DMABufPlaneFds[0].Value = val;
-         attrs->DMABufPlaneFds[0].IsPresent = EGL_TRUE;
-         break;
-      case EGL_DMA_BUF_PLANE0_OFFSET_EXT:
-         attrs->DMABufPlaneOffsets[0].Value = val;
-         attrs->DMABufPlaneOffsets[0].IsPresent = EGL_TRUE;
-         break;
-      case EGL_DMA_BUF_PLANE0_PITCH_EXT:
-         attrs->DMABufPlanePitches[0].Value = val;
-         attrs->DMABufPlanePitches[0].IsPresent = EGL_TRUE;
-         break;
-      case EGL_DMA_BUF_PLANE1_FD_EXT:
-         attrs->DMABufPlaneFds[1].Value = val;
-         attrs->DMABufPlaneFds[1].IsPresent = EGL_TRUE;
-         break;
-      case EGL_DMA_BUF_PLANE1_OFFSET_EXT:
-         attrs->DMABufPlaneOffsets[1].Value = val;
-         attrs->DMABufPlaneOffsets[1].IsPresent = EGL_TRUE;
-         break;
-      case EGL_DMA_BUF_PLANE1_PITCH_EXT:
-         attrs->DMABufPlanePitches[1].Value = val;
-         attrs->DMABufPlanePitches[1].IsPresent = EGL_TRUE;
-         break;
-      case EGL_DMA_BUF_PLANE2_FD_EXT:
-         attrs->DMABufPlaneFds[2].Value = val;
-         attrs->DMABufPlaneFds[2].IsPresent = EGL_TRUE;
-         break;
-      case EGL_DMA_BUF_PLANE2_OFFSET_EXT:
-         attrs->DMABufPlaneOffsets[2].Value = val;
-         attrs->DMABufPlaneOffsets[2].IsPresent = EGL_TRUE;
-         break;
-      case EGL_DMA_BUF_PLANE2_PITCH_EXT:
-         attrs->DMABufPlanePitches[2].Value = val;
-         attrs->DMABufPlanePitches[2].IsPresent = EGL_TRUE;
-         break;
-      case EGL_YUV_COLOR_SPACE_HINT_EXT:
-         if (val != EGL_ITU_REC601_EXT && val != EGL_ITU_REC709_EXT &&
-             val != EGL_ITU_REC2020_EXT) {
-            err = EGL_BAD_ATTRIBUTE;
-         } else {
-            attrs->DMABufYuvColorSpaceHint.Value = val;
-            attrs->DMABufYuvColorSpaceHint.IsPresent = EGL_TRUE;
-         }
-         break;
-      case EGL_SAMPLE_RANGE_HINT_EXT:
-         if (val != EGL_YUV_FULL_RANGE_EXT && val != EGL_YUV_NARROW_RANGE_EXT) {
-            err = EGL_BAD_ATTRIBUTE;
-         } else {
-            attrs->DMABufSampleRangeHint.Value = val;
-            attrs->DMABufSampleRangeHint.IsPresent = EGL_TRUE;
-         }
-         break;
-      case EGL_YUV_CHROMA_HORIZONTAL_SITING_HINT_EXT:
-         if (val != EGL_YUV_CHROMA_SITING_0_EXT &&
-             val != EGL_YUV_CHROMA_SITING_0_5_EXT) {
-            err = EGL_BAD_ATTRIBUTE;
-         } else {
-            attrs->DMABufChromaHorizontalSiting.Value = val;
-            attrs->DMABufChromaHorizontalSiting.IsPresent = EGL_TRUE;
-         }
-         break;
-      case EGL_YUV_CHROMA_VERTICAL_SITING_HINT_EXT:
-         if (val != EGL_YUV_CHROMA_SITING_0_EXT &&
-             val != EGL_YUV_CHROMA_SITING_0_5_EXT) {
-            err = EGL_BAD_ATTRIBUTE;
-         } else {
-            attrs->DMABufChromaVerticalSiting.Value = val;
-            attrs->DMABufChromaVerticalSiting.IsPresent = EGL_TRUE;
-         }
-         break;
+      /* EXT_image_dma_buf_import states that if an invalid value is provided
+       * for one of its attributes, we should return EGL_BAD_ATTRIBUTE.
+       * Bail out ASAP, since follow-up calls can return another EGL_BAD error.
+       */
+      if (err == EGL_BAD_ATTRIBUTE)
+          return _eglError(err, __func__);
 
-      default:
-         /* unknown attrs are ignored */
-         break;
-      }
+      err = _eglParseEXTImageDmaBufImportModifiersAttribs(attrs, dpy, attr, val);
+      if (err == EGL_SUCCESS)
+          continue;
 
-      if (err != EGL_SUCCESS) {
-         _eglLog(_EGL_DEBUG, "bad image attribute 0x%04x", attr);
-         break;
-      }
+      return _eglError(err, __func__);
    }
 
-   return err;
-}
-
-
-EGLBoolean
-_eglInitImage(_EGLImage *img, _EGLDisplay *dpy)
-{
-   _eglInitResource(&img->Resource, sizeof(*img), dpy);
-
    return EGL_TRUE;
 }
diff --git a/src/egl/main/eglimage.h b/src/egl/main/eglimage.h
index 0dd5e12..8751792 100644
--- a/src/egl/main/eglimage.h
+++ b/src/egl/main/eglimage.h
@@ -46,6 +46,8 @@
    EGLBoolean IsPresent;
 };
 
+#define DMA_BUF_MAX_PLANES 4
+
 struct _egl_image_attribs
 {
    /* EGL_KHR_image_base */
@@ -65,11 +67,14 @@
    /* EGL_WL_bind_wayland_display */
    EGLint PlaneWL;
 
-   /* EGL_EXT_image_dma_buf_import */
+   /* EGL_EXT_image_dma_buf_import and
+    * EGL_EXT_image_dma_buf_import_modifiers */
    struct _egl_image_attrib_int DMABufFourCC;
-   struct _egl_image_attrib_int DMABufPlaneFds[3];
-   struct _egl_image_attrib_int DMABufPlaneOffsets[3];
-   struct _egl_image_attrib_int DMABufPlanePitches[3];
+   struct _egl_image_attrib_int DMABufPlaneFds[DMA_BUF_MAX_PLANES];
+   struct _egl_image_attrib_int DMABufPlaneOffsets[DMA_BUF_MAX_PLANES];
+   struct _egl_image_attrib_int DMABufPlanePitches[DMA_BUF_MAX_PLANES];
+   struct _egl_image_attrib_int DMABufPlaneModifiersLo[DMA_BUF_MAX_PLANES];
+   struct _egl_image_attrib_int DMABufPlaneModifiersHi[DMA_BUF_MAX_PLANES];
    struct _egl_image_attrib_int DMABufYuvColorSpaceHint;
    struct _egl_image_attrib_int DMABufSampleRangeHint;
    struct _egl_image_attrib_int DMABufChromaHorizontalSiting;
@@ -86,13 +91,16 @@
 };
 
 
-extern EGLint
+EGLBoolean
 _eglParseImageAttribList(_EGLImageAttribs *attrs, _EGLDisplay *dpy,
                          const EGLint *attrib_list);
 
 
-extern EGLBoolean
-_eglInitImage(_EGLImage *img, _EGLDisplay *dpy);
+static inline void
+_eglInitImage(_EGLImage *img, _EGLDisplay *dpy)
+{
+   _eglInitResource(&img->Resource, sizeof(*img), dpy);
+}
 
 
 /**
diff --git a/src/egl/main/egllog.c b/src/egl/main/egllog.c
index 9569465..64ff474 100644
--- a/src/egl/main/egllog.c
+++ b/src/egl/main/egllog.c
@@ -41,9 +41,16 @@
 #include <string.h>
 #include <strings.h>
 #include "c11/threads.h"
+#include "util/macros.h"
 
 #include "egllog.h"
 
+#ifdef HAVE_ANDROID_PLATFORM
+#define LOG_TAG "EGL-MAIN"
+#include <cutils/log.h>
+
+#endif /* HAVE_ANDROID_PLATFORM */
+
 #define MAXSTRING 1000
 #define FALLBACK_LOG_LEVEL _EGL_WARNING
 
@@ -53,82 +60,37 @@
 
    EGLBoolean initialized;
    EGLint level;
-   _EGLLogProc logger;
-   EGLint num_messages;
 } logging = {
-   _MTX_INITIALIZER_NP,
-   EGL_FALSE,
-   FALLBACK_LOG_LEVEL,
-   NULL,
-   0
+   .mutex = _MTX_INITIALIZER_NP,
+   .initialized = EGL_FALSE,
+   .level = FALLBACK_LOG_LEVEL,
 };
 
 static const char *level_strings[] = {
-   /* the order is important */
-   "fatal",
-   "warning",
-   "info",
-   "debug",
-   NULL
+   [_EGL_FATAL] = "fatal",
+   [_EGL_WARNING]  = "warning",
+   [_EGL_INFO] = "info",
+   [_EGL_DEBUG] = "debug",
 };
 
 
 /**
- * Set the function to be called when there is a message to log.
- * Note that the function will be called with an internal lock held.
- * Recursive logging is not allowed.
- */
-void
-_eglSetLogProc(_EGLLogProc logger)
-{
-   EGLint num_messages = 0;
-
-   mtx_lock(&logging.mutex);
-
-   if (logging.logger != logger) {
-      logging.logger = logger;
-
-      num_messages = logging.num_messages;
-      logging.num_messages = 0;
-   }
-
-   mtx_unlock(&logging.mutex);
-
-   if (num_messages)
-      _eglLog(_EGL_DEBUG,
-              "New logger installed. "
-              "Messages before the new logger might not be available.");
-}
-
-
-/**
- * Set the log reporting level.
- */
-void
-_eglSetLogLevel(EGLint level)
-{
-   switch (level) {
-   case _EGL_FATAL:
-   case _EGL_WARNING:
-   case _EGL_INFO:
-   case _EGL_DEBUG:
-      mtx_lock(&logging.mutex);
-      logging.level = level;
-      mtx_unlock(&logging.mutex);
-      break;
-   default:
-      break;
-   }
-}
-
-
-/**
  * The default logger.  It prints the message to stderr.
  */
 static void
 _eglDefaultLogger(EGLint level, const char *msg)
 {
+#ifdef HAVE_ANDROID_PLATFORM
+   static const int egl2alog[] = {
+      [_EGL_FATAL] = ANDROID_LOG_ERROR,
+      [_EGL_WARNING]  = ANDROID_LOG_WARN,
+      [_EGL_INFO] = ANDROID_LOG_INFO,
+      [_EGL_DEBUG] = ANDROID_LOG_DEBUG,
+   };
+   LOG_PRI(egl2alog[level], LOG_TAG, "%s", msg);
+#else
    fprintf(stderr, "libEGL %s: %s\n", level_strings[level], msg);
+#endif /* HAVE_ANDROID_PLATFORM */
 }
 
 
@@ -146,18 +108,14 @@
 
    log_env = getenv("EGL_LOG_LEVEL");
    if (log_env) {
-      for (i = 0; level_strings[i]; i++) {
+      for (i = 0; i < ARRAY_SIZE(level_strings); i++) {
          if (strcasecmp(log_env, level_strings[i]) == 0) {
             level = i;
             break;
          }
       }
    }
-   else {
-      level = FALLBACK_LOG_LEVEL;
-   }
 
-   logging.logger = _eglDefaultLogger;
    logging.level = (level >= 0) ? level : FALLBACK_LOG_LEVEL;
    logging.initialized = EGL_TRUE;
 
@@ -191,16 +149,13 @@
 
    mtx_lock(&logging.mutex);
 
-   if (logging.logger) {
-      va_start(args, fmtStr);
-      ret = vsnprintf(msg, MAXSTRING, fmtStr, args);
-      if (ret < 0 || ret >= MAXSTRING)
-         strcpy(msg, "<message truncated>");
-      va_end(args);
+   va_start(args, fmtStr);
+   ret = vsnprintf(msg, MAXSTRING, fmtStr, args);
+   if (ret < 0 || ret >= MAXSTRING)
+      strcpy(msg, "<message truncated>");
+   va_end(args);
 
-      logging.logger(level, msg);
-      logging.num_messages++;
-   }
+   _eglDefaultLogger(level, msg);
 
    mtx_unlock(&logging.mutex);
 
diff --git a/src/egl/main/egllog.h b/src/egl/main/egllog.h
index cf58525..2a06a34 100644
--- a/src/egl/main/egllog.h
+++ b/src/egl/main/egllog.h
@@ -44,17 +44,6 @@
 #define _EGL_DEBUG   3   /* useful info for debugging */
 
 
-typedef void (*_EGLLogProc)(EGLint level, const char *msg);
-
-
-extern void
-_eglSetLogProc(_EGLLogProc logger);
-
-
-extern void
-_eglSetLogLevel(EGLint level);
-
-
 extern void
 _eglLog(EGLint level, const char *fmtStr, ...);
 
diff --git a/src/egl/main/eglsurface.c b/src/egl/main/eglsurface.c
index 5b3e83e..f6e41f1 100644
--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -286,11 +286,9 @@
       return EGL_FALSE;
    }
 
-   if ((conf->SurfaceType & type) == 0) {
+   if ((conf->SurfaceType & type) == 0)
       /* The config can't be used to create a surface of this type */
-      _eglError(EGL_BAD_MATCH, func);
-      return EGL_FALSE;
-   }
+      return _eglError(EGL_BAD_MATCH, func);
 
    _eglInitResource(&surf->Resource, sizeof(*surf), dpy);
    surf->Type = type;
@@ -317,6 +315,8 @@
    surf->AspectRatio = EGL_UNKNOWN;
 
    surf->PostSubBufferSupportedNV = EGL_FALSE;
+   surf->SetDamageRegionCalled = EGL_FALSE;
+   surf->BufferAgeRead = EGL_FALSE;
 
    /* the default swap interval is 1 */
    _eglClampSwapInterval(surf, 1);
@@ -395,29 +395,32 @@
       *value = surface->VGColorspace;
       break;
    case EGL_GL_COLORSPACE_KHR:
-      if (!dpy->Extensions.KHR_gl_colorspace) {
-         _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
-         return EGL_FALSE;
-      }
+      if (!dpy->Extensions.KHR_gl_colorspace)
+         return _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
+
       *value = surface->GLColorspace;
       break;
    case EGL_POST_SUB_BUFFER_SUPPORTED_NV:
       *value = surface->PostSubBufferSupportedNV;
       break;
    case EGL_BUFFER_AGE_EXT:
-      if (!dpy->Extensions.EXT_buffer_age) {
-         _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
-         return EGL_FALSE;
-      }
+      if (!dpy->Extensions.EXT_buffer_age)
+         return _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
+
+      _EGLContext *ctx = _eglGetCurrentContext();
       EGLint result = drv->API.QueryBufferAge(drv, dpy, surface);
       /* error happened */
       if (result < 0)
          return EGL_FALSE;
+      if (_eglGetContextHandle(ctx) == EGL_NO_CONTEXT ||
+          ctx->DrawSurface != surface)
+         return _eglError(EGL_BAD_SURFACE, "eglQuerySurface");
+
       *value = result;
+      surface->BufferAgeRead = EGL_TRUE;
       break;
    default:
-      _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
-      return EGL_FALSE;
+      return _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
    }
 
    return EGL_TRUE;
@@ -504,25 +507,17 @@
    if (dpy->Extensions.NOK_texture_from_pixmap)
       texture_type |= EGL_PIXMAP_BIT;
 
-   if (!(surface->Type & texture_type)) {
-      _eglError(EGL_BAD_SURFACE, "eglBindTexImage");
-      return EGL_FALSE;
-   }
+   if (!(surface->Type & texture_type))
+      return _eglError(EGL_BAD_SURFACE, "eglBindTexImage");
 
-   if (surface->TextureFormat == EGL_NO_TEXTURE) {
-      _eglError(EGL_BAD_MATCH, "eglBindTexImage");
-      return EGL_FALSE;
-   }
+   if (surface->TextureFormat == EGL_NO_TEXTURE)
+      return _eglError(EGL_BAD_MATCH, "eglBindTexImage");
 
-   if (surface->TextureTarget == EGL_NO_TEXTURE) {
-      _eglError(EGL_BAD_MATCH, "eglBindTexImage");
-      return EGL_FALSE;
-   }
+   if (surface->TextureTarget == EGL_NO_TEXTURE)
+      return _eglError(EGL_BAD_MATCH, "eglBindTexImage");
 
-   if (buffer != EGL_BACK_BUFFER) {
-      _eglError(EGL_BAD_PARAMETER, "eglBindTexImage");
-      return EGL_FALSE;
-   }
+   if (buffer != EGL_BACK_BUFFER)
+      return _eglError(EGL_BAD_PARAMETER, "eglBindTexImage");
 
    surface->BoundToTexture = EGL_TRUE;
 
@@ -540,10 +535,7 @@
    EGLint texture_type = EGL_PBUFFER_BIT;
 
    if (surf == EGL_NO_SURFACE)
-   {
-      _eglError(EGL_BAD_SURFACE, "eglReleaseTexImage");
-      return EGL_FALSE;
-   }
+      return _eglError(EGL_BAD_SURFACE, "eglReleaseTexImage");
 
    if (!surf->BoundToTexture)
    {
@@ -552,25 +544,16 @@
    }
 
    if (surf->TextureFormat == EGL_NO_TEXTURE)
-   {
-      _eglError(EGL_BAD_MATCH, "eglReleaseTexImage");
-      return EGL_FALSE;
-   }
+      return _eglError(EGL_BAD_MATCH, "eglReleaseTexImage");
 
    if (buffer != EGL_BACK_BUFFER)
-   {
-      _eglError(EGL_BAD_PARAMETER, "eglReleaseTexImage");
-      return EGL_FALSE;
-   }
+      return _eglError(EGL_BAD_PARAMETER, "eglReleaseTexImage");
 
    if (dpy->Extensions.NOK_texture_from_pixmap)
       texture_type |= EGL_PIXMAP_BIT;
 
    if (!(surf->Type & texture_type))
-   {
-      _eglError(EGL_BAD_SURFACE, "eglReleaseTexImage");
-      return EGL_FALSE;
-   }
+      return _eglError(EGL_BAD_SURFACE, "eglReleaseTexImage");
 
    surf->BoundToTexture = EGL_FALSE;
 
diff --git a/src/egl/main/eglsurface.h b/src/egl/main/eglsurface.h
index f13cf49..c53e8d0 100644
--- a/src/egl/main/eglsurface.h
+++ b/src/egl/main/eglsurface.h
@@ -82,6 +82,18 @@
 
    EGLint SwapInterval;
 
+   /* EGL_KHR_partial_update
+    * True if the damage region is already set
+    * between frame boundaries.
+    */
+   EGLBoolean SetDamageRegionCalled;
+
+   /* EGL_KHR_partial_update
+    * True if the buffer age is read by the client
+    * between frame boundaries.
+    */
+   EGLBoolean BufferAgeRead;
+
    /* True if the surface is bound to an OpenGL ES texture */
    EGLBoolean BoundToTexture;
 
diff --git a/src/egl/main/egltypedefs.h b/src/egl/main/egltypedefs.h
index 7facdb4..19524a1 100644
--- a/src/egl/main/egltypedefs.h
+++ b/src/egl/main/egltypedefs.h
@@ -34,9 +34,6 @@
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
-#include "eglcompiler.h"
-
-
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/egl/wayland/wayland-drm/wayland-drm.c b/src/egl/wayland/wayland-drm/wayland-drm.c
index 4fc1252..674cd10 100644
--- a/src/egl/wayland/wayland-drm/wayland-drm.c
+++ b/src/egl/wayland/wayland-drm/wayland-drm.c
@@ -47,7 +47,7 @@
 	char *device_name;
         uint32_t flags;
 
-	struct wayland_drm_callbacks *callbacks;
+	struct wayland_drm_callbacks callbacks;
 
         struct wl_buffer_interface buffer_interface;
 };
@@ -58,7 +58,7 @@
 	struct wl_drm_buffer *buffer = resource->data;
 	struct wl_drm *drm = buffer->drm;
 
-	drm->callbacks->release_buffer(drm->user_data, buffer);
+	drm->callbacks.release_buffer(drm->user_data, buffer);
 	free(buffer);
 }
 
@@ -97,7 +97,7 @@
 	buffer->offset[2] = offset2;
 	buffer->stride[2] = stride2;
 
-        drm->callbacks->reference_buffer(drm->user_data, name, fd, buffer);
+        drm->callbacks.reference_buffer(drm->user_data, name, fd, buffer);
 	if (buffer->driver_buffer == NULL) {
 		wl_resource_post_error(resource,
 				       WL_DRM_ERROR_INVALID_NAME,
@@ -189,7 +189,7 @@
 {
 	struct wl_drm *drm = resource->data;
 
-	if (drm->callbacks->authenticate(drm->user_data, id) < 0)
+	if (drm->callbacks.authenticate(drm->user_data, id) < 0)
 		wl_resource_post_error(resource,
 				       WL_DRM_ERROR_AUTHENTICATE_FAIL,
 				       "authenicate failed");
@@ -270,7 +270,7 @@
 
 	drm->display = display;
 	drm->device_name = strdup(device_name);
-	drm->callbacks = callbacks;
+	drm->callbacks = *callbacks;
 	drm->user_data = user_data;
         drm->flags = flags;
 
diff --git a/src/egl/wayland/wayland-drm/wayland-drm.xml b/src/egl/wayland/wayland-drm/wayland-drm.xml
index 5e64622..83aa561 100644
--- a/src/egl/wayland/wayland-drm/wayland-drm.xml
+++ b/src/egl/wayland/wayland-drm/wayland-drm.xml
@@ -39,7 +39,8 @@
     <enum name="format">
       <!-- The drm format codes match the #defines in drm_fourcc.h.
            The formats actually supported by the compositor will be
-           reported by the format event. -->
+           reported by the format event. New codes must not be added,
+           unless directly taken from drm_fourcc.h. -->
       <entry name="c8" value="0x20203843"/>
       <entry name="rgb332" value="0x38424752"/>
       <entry name="bgr233" value="0x38524742"/>
diff --git a/src/egl/wayland/wayland-egl/Makefile.am b/src/egl/wayland/wayland-egl/Makefile.am
index 8c45e8e..846fa62 100644
--- a/src/egl/wayland/wayland-egl/Makefile.am
+++ b/src/egl/wayland/wayland-egl/Makefile.am
@@ -14,7 +14,11 @@
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)
 
-TESTS = wayland-egl-symbols-check
+TESTS = wayland-egl-symbols-check \
+        wayland-egl-abi-check
+
 EXTRA_DIST = wayland-egl-symbols-check
 
+check_PROGRAMS = wayland-egl-abi-check
+
 include $(top_srcdir)/install-lib-links.mk
diff --git a/src/egl/wayland/wayland-egl/wayland-egl-abi-check.c b/src/egl/wayland/wayland-egl/wayland-egl-abi-check.c
new file mode 100644
index 0000000..4ea3854
--- /dev/null
+++ b/src/egl/wayland/wayland-egl/wayland-egl-abi-check.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stddef.h> // offsetof
+#include <stdio.h>  // printf
+
+#include "wayland-egl-priv.h" // Current struct wl_egl_window implementation
+
+/*
+ * Following are previous implementations of wl_egl_window.
+ *
+ * DO NOT EVER CHANGE!
+ */
+
+/* From: 214fc6e850 - Benjamin Franzke : egl: Implement libwayland-egl */
+struct wl_egl_window_v0 {
+    struct wl_surface *surface;
+
+    int width;
+    int height;
+    int dx;
+    int dy;
+
+    int attached_width;
+    int attached_height;
+};
+
+/* From: ca3ed3e024 - Ander Conselvan de Oliveira : egl/wayland: Don't invalidate drawable on swap buffers */
+struct wl_egl_window_v1 {
+    struct wl_surface *surface;
+
+    int width;
+    int height;
+    int dx;
+    int dy;
+
+    int attached_width;
+    int attached_height;
+
+    void *private;
+    void (*resize_callback)(struct wl_egl_window *, void *);
+};
+
+/* From: 690ead4a13 - Stencel, Joanna : egl/wayland-egl: Fix for segfault in dri2_wl_destroy_surface. */
+#define WL_EGL_WINDOW_VERSION_v2 2
+struct wl_egl_window_v2 {
+    struct wl_surface *surface;
+
+    int width;
+    int height;
+    int dx;
+    int dy;
+
+    int attached_width;
+    int attached_height;
+
+    void *private;
+    void (*resize_callback)(struct wl_egl_window *, void *);
+    void (*destroy_window_callback)(void *);
+};
+
+/* From: 2d5d61bc49 - Miguel A. Vico : wayland-egl: Make wl_egl_window a versioned struct */
+#define WL_EGL_WINDOW_VERSION_v3 3
+struct wl_egl_window_v3 {
+    const intptr_t version;
+
+    int width;
+    int height;
+    int dx;
+    int dy;
+
+    int attached_width;
+    int attached_height;
+
+    void *private;
+    void (*resize_callback)(struct wl_egl_window *, void *);
+    void (*destroy_window_callback)(void *);
+
+    struct wl_surface *surface;
+};
+
+
+/* This program checks we keep a backwards-compatible struct wl_egl_window
+ * definition whenever it is modified in wayland-egl-priv.h.
+ *
+ * The previous definition should be added above as a new struct
+ * wl_egl_window_vN, and the appropriate checks should be added below
+ */
+
+#define MEMBER_SIZE(type, member) sizeof(((type *)0)->member)
+
+#define CHECK_RENAMED_MEMBER(a_ver, b_ver, a_member, b_member)                      \
+    do { /* On mismatch: print why, then return 1 from the enclosing function. */   \
+        if (offsetof(struct wl_egl_window ## a_ver, a_member) !=                    \
+            offsetof(struct wl_egl_window ## b_ver, b_member)) {                    \
+            printf("Backwards incompatible change detected!\n   "                   \
+                   "offsetof(struct wl_egl_window" #a_ver "::" #a_member ") != "    \
+                   "offsetof(struct wl_egl_window" #b_ver "::" #b_member ")\n");    \
+            return 1;                                                               \
+        }                                                                           \
+        /* Same offset is not enough: the member must keep its size too. */         \
+        if (MEMBER_SIZE(struct wl_egl_window ## a_ver, a_member) !=                 \
+            MEMBER_SIZE(struct wl_egl_window ## b_ver, b_member)) {                 \
+            printf("Backwards incompatible change detected!\n   "                   \
+                   "MEMBER_SIZE(struct wl_egl_window" #a_ver "::" #a_member ") != " \
+                   "MEMBER_SIZE(struct wl_egl_window" #b_ver "::" #b_member ")\n"); \
+            return 1;                                                               \
+        }                                                                           \
+    } while (0)
+
+#define CHECK_MEMBER(a_ver, b_ver, member) CHECK_RENAMED_MEMBER(a_ver, b_ver, member, member)
+#define CHECK_MEMBER_CURRENT(a_ver, member) CHECK_MEMBER(a_ver,, member)
+
+#define CHECK_SIZE(a_ver, b_ver)                                                    \
+    do { /* Fail (return 1) if struct a_ver is larger than struct b_ver. */         \
+        if (sizeof(struct wl_egl_window ## a_ver) >                                 \
+            sizeof(struct wl_egl_window ## b_ver)) {                                \
+            printf("Backwards incompatible change detected!\n   "                   \
+                   "sizeof(struct wl_egl_window" #a_ver ") > "                      \
+                   "sizeof(struct wl_egl_window" #b_ver ")\n");                     \
+            return 1;                                                               \
+        }                                                                           \
+    } while (0)
+
+#define CHECK_SIZE_CURRENT(a_ver)                                                   \
+    do { /* Fail (return 1) unless a_ver and the current struct match in size. */   \
+        if (sizeof(struct wl_egl_window ## a_ver) !=                                \
+            sizeof(struct wl_egl_window)) {                                         \
+            printf("Backwards incompatible change detected!\n   "                   \
+                   "sizeof(struct wl_egl_window" #a_ver ") != "                     \
+                   "sizeof(struct wl_egl_window)\n");                               \
+            return 1;                                                               \
+        }                                                                           \
+    } while (0)
+
+#define CHECK_VERSION(a_ver, b_ver)                                                 \
+    do { /* Fail (return 1) unless version a_ver < version b_ver. */                \
+        if ((WL_EGL_WINDOW_VERSION ## a_ver) >=                                     \
+            (WL_EGL_WINDOW_VERSION ## b_ver)) {                                     \
+            printf("Backwards incompatible change detected!\n   "                   \
+                   "WL_EGL_WINDOW_VERSION" #a_ver " >= "                            \
+                   "WL_EGL_WINDOW_VERSION" #b_ver "\n");                            \
+            return 1;                                                               \
+        }                                                                           \
+    } while (0)
+
+#define CHECK_VERSION_CURRENT(a_ver)                                                \
+    do { /* Fail (return 1) unless version a_ver == WL_EGL_WINDOW_VERSION. */       \
+        if ((WL_EGL_WINDOW_VERSION ## a_ver) !=                                     \
+            (WL_EGL_WINDOW_VERSION)) {                                              \
+            printf("Backwards incompatible change detected!\n   "                   \
+                   "WL_EGL_WINDOW_VERSION" #a_ver " != "                            \
+                   "WL_EGL_WINDOW_VERSION\n");                                      \
+            return 1;                                                               \
+        }                                                                           \
+    } while (0)
+
+int main(int argc, char **argv)
+{   /* ABI self-test: returns 1 at the first failed check, 0 otherwise. */
+    /* v0 -> v1: every v0 member must keep its offset and size in v1. */
+    CHECK_MEMBER(_v0, _v1, surface);
+    CHECK_MEMBER(_v0, _v1, width);
+    CHECK_MEMBER(_v0, _v1, height);
+    CHECK_MEMBER(_v0, _v1, dx);
+    CHECK_MEMBER(_v0, _v1, dy);
+    CHECK_MEMBER(_v0, _v1, attached_width);
+    CHECK_MEMBER(_v0, _v1, attached_height);
+
+    CHECK_SIZE(_v0, _v1); /* fails if v0 is larger than v1 */
+
+    /* v1 -> v2: every v1 member must keep its offset and size in v2. */
+    CHECK_MEMBER(_v1, _v2, surface);
+    CHECK_MEMBER(_v1, _v2, width);
+    CHECK_MEMBER(_v1, _v2, height);
+    CHECK_MEMBER(_v1, _v2, dx);
+    CHECK_MEMBER(_v1, _v2, dy);
+    CHECK_MEMBER(_v1, _v2, attached_width);
+    CHECK_MEMBER(_v1, _v2, attached_height);
+    CHECK_MEMBER(_v1, _v2, private);
+    CHECK_MEMBER(_v1, _v2, resize_callback);
+
+    CHECK_SIZE(_v1, _v2); /* fails if v1 is larger than v2 */
+
+    /* v2 -> v3: 'version' must match the offset and size of v2's 'surface'. */
+    CHECK_RENAMED_MEMBER(_v2, _v3, surface, version);
+    CHECK_MEMBER        (_v2, _v3, width);
+    CHECK_MEMBER        (_v2, _v3, height);
+    CHECK_MEMBER        (_v2, _v3, dx);
+    CHECK_MEMBER        (_v2, _v3, dy);
+    CHECK_MEMBER        (_v2, _v3, attached_width);
+    CHECK_MEMBER        (_v2, _v3, attached_height);
+    CHECK_MEMBER        (_v2, _v3, private);
+    CHECK_MEMBER        (_v2, _v3, resize_callback);
+    CHECK_MEMBER        (_v2, _v3, destroy_window_callback);
+
+    CHECK_SIZE   (_v2, _v3); /* fails if v2 is larger than v3 */
+    CHECK_VERSION(_v2, _v3); /* fails unless version number v2 < v3 */
+
+    /* v3 -> current: each member keeps offset and size; size and version equal. */
+    CHECK_MEMBER_CURRENT(_v3, version);
+    CHECK_MEMBER_CURRENT(_v3, width);
+    CHECK_MEMBER_CURRENT(_v3, height);
+    CHECK_MEMBER_CURRENT(_v3, dx);
+    CHECK_MEMBER_CURRENT(_v3, dy);
+    CHECK_MEMBER_CURRENT(_v3, attached_width);
+    CHECK_MEMBER_CURRENT(_v3, attached_height);
+    CHECK_MEMBER_CURRENT(_v3, private);
+    CHECK_MEMBER_CURRENT(_v3, resize_callback);
+    CHECK_MEMBER_CURRENT(_v3, destroy_window_callback);
+    CHECK_MEMBER_CURRENT(_v3, surface);
+
+    CHECK_SIZE_CURRENT   (_v3); /* fails unless sizes are equal */
+    CHECK_VERSION_CURRENT(_v3); /* fails unless versions are equal */
+
+    return 0; /* no incompatibility detected */
+}
diff --git a/src/egl/wayland/wayland-egl/wayland-egl-priv.h b/src/egl/wayland/wayland-egl/wayland-egl-priv.h
index 92c31d9..3b59908 100644
--- a/src/egl/wayland/wayland-egl/wayland-egl-priv.h
+++ b/src/egl/wayland/wayland-egl/wayland-egl-priv.h
@@ -41,8 +41,10 @@
 extern "C" {
 #endif
 
+#define WL_EGL_WINDOW_VERSION 3
+
 struct wl_egl_window {
-	struct wl_surface *surface;
+	const intptr_t version;
 
 	int width;
 	int height;
@@ -55,6 +57,8 @@
 	void *private;
 	void (*resize_callback)(struct wl_egl_window *, void *);
 	void (*destroy_window_callback)(void *);
+
+	struct wl_surface *surface;
 };
 
 #ifdef  __cplusplus
diff --git a/src/egl/wayland/wayland-egl/wayland-egl.c b/src/egl/wayland/wayland-egl/wayland-egl.c
index 4a4701a..f16324c 100644
--- a/src/egl/wayland/wayland-egl/wayland-egl.c
+++ b/src/egl/wayland/wayland-egl/wayland-egl.c
@@ -28,6 +28,7 @@
  */
 
 #include <stdlib.h>
+#include <string.h>
 
 #include <wayland-client.h>
 #include "wayland-egl.h"
@@ -54,6 +55,7 @@
 wl_egl_window_create(struct wl_surface *surface,
 		     int width, int height)
 {
+	struct wl_egl_window _INIT_ = { .version = WL_EGL_WINDOW_VERSION };
 	struct wl_egl_window *egl_window;
 
 	if (width <= 0 || height <= 0)
@@ -63,6 +65,8 @@
 	if (!egl_window)
 		return NULL;
 
+	memcpy(egl_window, &_INIT_, sizeof *egl_window);
+
 	egl_window->surface = surface;
 	egl_window->private = NULL;
 	egl_window->resize_callback = NULL;
diff --git a/src/gallium/Android.common.mk b/src/gallium/Android.common.mk
index 8559b5b..782510f 100644
--- a/src/gallium/Android.common.mk
+++ b/src/gallium/Android.common.mk
@@ -29,12 +29,4 @@
 	$(GALLIUM_TOP)/winsys \
 	$(GALLIUM_TOP)/drivers
 
-ifeq ($(MESA_ENABLE_LLVM),true)
-LOCAL_C_INCLUDES += \
-	external/llvm/include \
-	external/llvm/device/include \
-	external/libcxx/include \
-	$(ELF_INCLUDES)
-endif
-
 include $(MESA_COMMON_MK)
diff --git a/src/gallium/Android.mk b/src/gallium/Android.mk
index e67cfab..8be365a 100644
--- a/src/gallium/Android.mk
+++ b/src/gallium/Android.mk
@@ -25,6 +25,7 @@
 
 GALLIUM_TOP := $(call my-dir)
 GALLIUM_COMMON_MK := $(GALLIUM_TOP)/Android.common.mk
+GALLIUM_TARGET_DRIVERS :=
 
 SUBDIRS := auxiliary
 SUBDIRS += auxiliary/pipe-loader
@@ -33,62 +34,24 @@
 # Gallium drivers and their respective winsys
 #
 
-# swrast
-ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
 SUBDIRS += winsys/sw/dri drivers/softpipe
-endif
-
-# freedreno
-ifneq ($(filter freedreno, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += winsys/freedreno/drm drivers/freedreno
-endif
-
-# i915g
-ifneq ($(filter i915g, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += winsys/i915/drm drivers/i915
-endif
-
-# nouveau
-ifneq ($(filter nouveau, $(MESA_GPU_DRIVERS)),)
-SUBDIRS += \
-	winsys/nouveau/drm \
-	drivers/nouveau
-endif
-
-# r300g/r600g/radeonsi
-ifneq ($(filter r300g r600g radeonsi, $(MESA_GPU_DRIVERS)),)
-SUBDIRS += winsys/radeon/drm
-ifneq ($(filter r300g, $(MESA_GPU_DRIVERS)),)
-SUBDIRS += drivers/r300
-endif
-ifneq ($(filter r600g radeonsi, $(MESA_GPU_DRIVERS)),)
-SUBDIRS += drivers/radeon
-ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),)
-SUBDIRS += drivers/r600
-endif
-ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),)
-SUBDIRS += drivers/radeonsi
-SUBDIRS += winsys/amdgpu/drm
-endif
-endif
-endif
-
-# vc4
-ifneq ($(filter vc4, $(MESA_GPU_DRIVERS)),)
+SUBDIRS += winsys/nouveau/drm drivers/nouveau
+SUBDIRS += winsys/pl111/drm drivers/pl111
+SUBDIRS += winsys/radeon/drm drivers/r300
+SUBDIRS += winsys/radeon/drm drivers/r600 drivers/radeon
+SUBDIRS += winsys/radeon/drm winsys/amdgpu/drm drivers/radeonsi drivers/radeon
 SUBDIRS += winsys/vc4/drm drivers/vc4
-endif
-
-# virgl
-ifneq ($(filter virgl, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += winsys/virgl/drm winsys/virgl/vtest drivers/virgl
-endif
-
-# vmwgfx
-ifneq ($(filter vmwgfx, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += winsys/svga/drm drivers/svga
-endif
+SUBDIRS += winsys/etnaviv/drm drivers/etnaviv drivers/renderonly
+SUBDIRS += winsys/imx/drm
+SUBDIRS += state_trackers/dri
 
-# Gallium state trackers and target for dri
-SUBDIRS += state_trackers/dri targets/dri
+# sort to eliminate any duplicates
+INC_DIRS := $(call all-named-subdir-makefiles,$(sort $(SUBDIRS)))
+# targets/dri must be included last
+INC_DIRS += $(call all-named-subdir-makefiles,targets/dri)
 
-include $(call all-named-subdir-makefiles,$(SUBDIRS))
+include $(INC_DIRS)
diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am
index 38da63b..9f98a7e 100644
--- a/src/gallium/Makefile.am
+++ b/src/gallium/Makefile.am
@@ -71,6 +71,10 @@
 SUBDIRS += drivers/imx winsys/imx/drm
 endif
 
+if HAVE_GALLIUM_PL111
+SUBDIRS += drivers/pl111 winsys/pl111/drm
+endif
+
 ## swrast/softpipe
 if HAVE_GALLIUM_SOFTPIPE
 SUBDIRS += drivers/softpipe
diff --git a/src/gallium/auxiliary/Android.mk b/src/gallium/auxiliary/Android.mk
index e8628e4..2693838 100644
--- a/src/gallium/auxiliary/Android.mk
+++ b/src/gallium/auxiliary/Android.mk
@@ -31,6 +31,7 @@
 LOCAL_SRC_FILES := \
 	$(C_SOURCES) \
 	$(NIR_SOURCES) \
+	$(RENDERONLY_SOURCES) \
 	$(VL_STUB_SOURCES)
 
 LOCAL_C_INCLUDES := \
@@ -38,16 +39,19 @@
 
 ifeq ($(MESA_ENABLE_LLVM),true)
 LOCAL_SRC_FILES += \
-	$(GALLIVM_SOURCES) \
-	$(GALLIVM_CPP_SOURCES)
-LOCAL_STATIC_LIBRARIES += libLLVMCore
-LOCAL_CPPFLAGS := -std=c++11
+	$(GALLIVM_SOURCES)
+$(call mesa-build-with-llvm)
 endif
 
+LOCAL_CPPFLAGS += -std=c++11
+
 # We need libmesa_nir to get NIR's generated include directories.
 LOCAL_MODULE := libmesa_gallium
 LOCAL_STATIC_LIBRARIES += libmesa_nir
 
+LOCAL_WHOLE_STATIC_LIBRARIES += cpufeatures
+LOCAL_CFLAGS += -DHAS_ANDROID_CPUFEATURES
+
 # generate sources
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 intermediates := $(call local-generated-sources-dir)
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 192f8b3..9ae8e6c 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -147,6 +147,8 @@
 	tgsi/tgsi_exec.h \
 	tgsi/tgsi_emulate.c \
 	tgsi/tgsi_emulate.h \
+	tgsi/tgsi_from_mesa.c \
+	tgsi/tgsi_from_mesa.h \
 	tgsi/tgsi_info.c \
 	tgsi/tgsi_info.h \
 	tgsi/tgsi_iterate.c \
@@ -192,8 +194,6 @@
 	util/u_box.h \
 	util/u_cache.c \
 	util/u_cache.h \
-	util/u_caps.c \
-	util/u_caps.h \
 	util/u_cpu_detect.c \
 	util/u_cpu_detect.h \
 	util/u_debug.c \
@@ -223,7 +223,6 @@
 	util/u_dump_defines.c \
 	util/u_dump.h \
 	util/u_dump_state.c \
-	util/u_dynarray.h \
 	util/u_fifo.h \
 	util/u_format.c \
 	util/u_format.h \
@@ -303,6 +302,9 @@
 	util/u_time.h \
 	util/u_transfer.c \
 	util/u_transfer.h \
+	util/u_threaded_context.c \
+	util/u_threaded_context.h \
+	util/u_threaded_context_calls.h \
 	util/u_upload_mgr.c \
 	util/u_upload_mgr.h \
 	util/u_vbuf.c \
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 3d3c44c..757bcf3 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -57,7 +57,6 @@
 {
    struct cso_sampler *cso_samplers[PIPE_MAX_SAMPLERS];
    void *samplers[PIPE_MAX_SAMPLERS];
-   unsigned nr_samplers;
 };
 
 
@@ -83,6 +82,11 @@
    struct sampler_info fragment_samplers_saved;
    struct sampler_info samplers[PIPE_SHADER_TYPES];
 
+   /* Temporary number until cso_single_sampler_done is called.
+    * It tracks the highest sampler seen in cso_single_sampler.
+    */
+   int max_sampler_seen;
+
    struct pipe_vertex_buffer aux_vertex_buffer_current;
    struct pipe_vertex_buffer aux_vertex_buffer_saved;
    unsigned aux_vertex_buffer_index;
@@ -240,7 +244,7 @@
        * table, to prevent them from being deleted
        */
       for (i = 0; i < PIPE_SHADER_TYPES; i++) {
-         for (j = 0; j < ctx->samplers[i].nr_samplers; j++) {
+         for (j = 0; j < PIPE_MAX_SAMPLERS; j++) {
             struct cso_sampler *sampler = ctx->samplers[i].cso_samplers[j];
 
             if (sampler && cso_hash_take(hash, sampler->hash_key))
@@ -334,6 +338,7 @@
       ctx->has_streamout = TRUE;
    }
 
+   ctx->max_sampler_seen = -1;
    return ctx;
 
 out:
@@ -349,8 +354,6 @@
    unsigned i;
 
    if (ctx->pipe) {
-      ctx->pipe->set_index_buffer(ctx->pipe, NULL);
-
       ctx->pipe->bind_blend_state( ctx->pipe, NULL );
       ctx->pipe->bind_rasterizer_state( ctx->pipe, NULL );
 
@@ -408,8 +411,8 @@
    util_unreference_framebuffer_state(&ctx->fb);
    util_unreference_framebuffer_state(&ctx->fb_saved);
 
-   pipe_resource_reference(&ctx->aux_vertex_buffer_current.buffer, NULL);
-   pipe_resource_reference(&ctx->aux_vertex_buffer_saved.buffer, NULL);
+   pipe_vertex_buffer_unreference(&ctx->aux_vertex_buffer_current);
+   pipe_vertex_buffer_unreference(&ctx->aux_vertex_buffer_saved);
 
    for (i = 0; i < PIPE_SHADER_TYPES; i++) {
       pipe_resource_reference(&ctx->aux_constbuf_current[i].buffer, NULL);
@@ -1150,15 +1153,9 @@
          const struct pipe_vertex_buffer *vb =
                buffers + (ctx->aux_vertex_buffer_index - start_slot);
 
-         pipe_resource_reference(&ctx->aux_vertex_buffer_current.buffer,
-                                 vb->buffer);
-         memcpy(&ctx->aux_vertex_buffer_current, vb,
-                sizeof(struct pipe_vertex_buffer));
-      }
-      else {
-         pipe_resource_reference(&ctx->aux_vertex_buffer_current.buffer,
-                                 NULL);
-         ctx->aux_vertex_buffer_current.user_buffer = NULL;
+         pipe_vertex_buffer_reference(&ctx->aux_vertex_buffer_current, vb);
+      } else {
+         pipe_vertex_buffer_unreference(&ctx->aux_vertex_buffer_current);
       }
    }
 
@@ -1175,10 +1172,8 @@
       return;
    }
 
-   pipe_resource_reference(&ctx->aux_vertex_buffer_saved.buffer,
-                           ctx->aux_vertex_buffer_current.buffer);
-   memcpy(&ctx->aux_vertex_buffer_saved, &ctx->aux_vertex_buffer_current,
-          sizeof(struct pipe_vertex_buffer));
+   pipe_vertex_buffer_reference(&ctx->aux_vertex_buffer_saved,
+                                &ctx->aux_vertex_buffer_current);
 }
 
 static void
@@ -1193,7 +1188,7 @@
 
    cso_set_vertex_buffers(ctx, ctx->aux_vertex_buffer_index, 1,
                           &ctx->aux_vertex_buffer_saved);
-   pipe_resource_reference(&ctx->aux_vertex_buffer_saved.buffer, NULL);
+   pipe_vertex_buffer_unreference(&ctx->aux_vertex_buffer_saved);
 }
 
 unsigned cso_get_aux_vertex_buffer_slot(struct cso_context *ctx)
@@ -1203,7 +1198,7 @@
 
 
 
-enum pipe_error
+void
 cso_single_sampler(struct cso_context *ctx, enum pipe_shader_type shader_stage,
                    unsigned idx, const struct pipe_sampler_state *templ)
 {
@@ -1219,7 +1214,7 @@
       if (cso_hash_iter_is_null(iter)) {
          cso = MALLOC(sizeof(struct cso_sampler));
          if (!cso)
-            return PIPE_ERROR_OUT_OF_MEMORY;
+            return;
 
          memcpy(&cso->state, templ, sizeof(*templ));
          cso->data = ctx->pipe->create_sampler_state(ctx->pipe, &cso->state);
@@ -1231,7 +1226,7 @@
          iter = cso_insert_state(ctx->cache, hash_key, CSO_SAMPLER, cso);
          if (cso_hash_iter_is_null(iter)) {
             FREE(cso);
-            return PIPE_ERROR_OUT_OF_MEMORY;
+            return;
          }
       }
       else {
@@ -1240,12 +1235,8 @@
 
       ctx->samplers[shader_stage].cso_samplers[idx] = cso;
       ctx->samplers[shader_stage].samplers[idx] = cso->data;
-   } else {
-      ctx->samplers[shader_stage].cso_samplers[idx] = NULL;
-      ctx->samplers[shader_stage].samplers[idx] = NULL;
+      ctx->max_sampler_seen = MAX2(ctx->max_sampler_seen, (int)idx);
    }
-
-   return PIPE_OK;
 }
 
 
@@ -1257,19 +1248,14 @@
                         enum pipe_shader_type shader_stage)
 {
    struct sampler_info *info = &ctx->samplers[shader_stage];
-   const unsigned old_nr_samplers = info->nr_samplers;
-   unsigned i;
 
-   /* find highest non-null sampler */
-   for (i = PIPE_MAX_SAMPLERS; i > 0; i--) {
-      if (info->samplers[i - 1] != NULL)
-         break;
-   }
+   if (ctx->max_sampler_seen == -1)
+      return;
 
-   info->nr_samplers = i;
    ctx->pipe->bind_sampler_states(ctx->pipe, shader_stage, 0,
-                                  MAX2(old_nr_samplers, info->nr_samplers),
+                                  ctx->max_sampler_seen + 1,
                                   info->samplers);
+   ctx->max_sampler_seen = -1;
 }
 
 
@@ -1278,31 +1264,16 @@
  * last one. Done to always try to set as many samplers
  * as possible.
  */
-enum pipe_error
+void
 cso_set_samplers(struct cso_context *ctx,
                  enum pipe_shader_type shader_stage,
                  unsigned nr,
                  const struct pipe_sampler_state **templates)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   unsigned i;
-   enum pipe_error temp, error = PIPE_OK;
-
-   for (i = 0; i < nr; i++) {
-      temp = cso_single_sampler(ctx, shader_stage, i, templates[i]);
-      if (temp != PIPE_OK)
-         error = temp;
-   }
-
-   for ( ; i < info->nr_samplers; i++) {
-      temp = cso_single_sampler(ctx, shader_stage, i, NULL);
-      if (temp != PIPE_OK)
-         error = temp;
-   }
+   for (unsigned i = 0; i < nr; i++)
+      cso_single_sampler(ctx, shader_stage, i, templates[i]);
 
    cso_single_sampler_done(ctx, shader_stage);
-
-   return error;
 }
 
 static void
@@ -1311,11 +1282,9 @@
    struct sampler_info *info = &ctx->samplers[PIPE_SHADER_FRAGMENT];
    struct sampler_info *saved = &ctx->fragment_samplers_saved;
 
-   saved->nr_samplers = info->nr_samplers;
-   memcpy(saved->cso_samplers, info->cso_samplers, info->nr_samplers *
-          sizeof(*info->cso_samplers));
-   memcpy(saved->samplers, info->samplers, info->nr_samplers *
-          sizeof(*info->samplers));
+   memcpy(saved->cso_samplers, info->cso_samplers,
+          sizeof(info->cso_samplers));
+   memcpy(saved->samplers, info->samplers, sizeof(info->samplers));
 }
 
 
@@ -1324,18 +1293,16 @@
 {
    struct sampler_info *info = &ctx->samplers[PIPE_SHADER_FRAGMENT];
    struct sampler_info *saved = &ctx->fragment_samplers_saved;
-   int delta = (int)info->nr_samplers - saved->nr_samplers;
 
    memcpy(info->cso_samplers, saved->cso_samplers,
-          saved->nr_samplers * sizeof(*info->cso_samplers));
-   memcpy(info->samplers, saved->samplers,
-          saved->nr_samplers * sizeof(*info->samplers));
+          sizeof(info->cso_samplers));
+   memcpy(info->samplers, saved->samplers, sizeof(info->samplers));
 
-   if (delta > 0) {
-      memset(&info->cso_samplers[saved->nr_samplers], 0,
-             delta * sizeof(*info->cso_samplers));
-      memset(&info->samplers[saved->nr_samplers], 0,
-             delta * sizeof(*info->samplers));
+   for (int i = PIPE_MAX_SAMPLERS - 1; i >= 0; i--) {
+      if (info->samplers[i]) {
+         ctx->max_sampler_seen = i;
+         break;
+      }
    }
 
    cso_single_sampler_done(ctx, PIPE_SHADER_FRAGMENT);
@@ -1705,20 +1672,6 @@
 /* drawing */
 
 void
-cso_set_index_buffer(struct cso_context *cso,
-                     const struct pipe_index_buffer *ib)
-{
-   struct u_vbuf *vbuf = cso->vbuf;
-
-   if (vbuf) {
-      u_vbuf_set_index_buffer(vbuf, ib);
-   } else {
-      struct pipe_context *pipe = cso->pipe;
-      pipe->set_index_buffer(pipe, ib);
-   }
-}
-
-void
 cso_draw_vbo(struct cso_context *cso,
              const struct pipe_draw_info *info)
 {
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index 742bbb5..190d0dc 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -59,7 +59,7 @@
                                     const struct pipe_rasterizer_state *rasterizer );
 
 
-enum pipe_error
+void
 cso_set_samplers(struct cso_context *cso,
                  enum pipe_shader_type shader_stage,
                  unsigned count,
@@ -69,7 +69,7 @@
 /* Alternate interface to support state trackers that like to modify
  * samplers one at a time:
  */
-enum pipe_error
+void
 cso_single_sampler(struct cso_context *cso, enum pipe_shader_type shader_stage,
                    unsigned idx, const struct pipe_sampler_state *states);
 
@@ -222,10 +222,6 @@
 /* drawing */
 
 void
-cso_set_index_buffer(struct cso_context *cso,
-                     const struct pipe_index_buffer *ib);
-
-void
 cso_draw_vbo(struct cso_context *cso,
              const struct pipe_draw_info *info);
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index 2a3f361..4d3e261 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -73,12 +73,6 @@
    return numBits;
 }
 
-struct cso_node {
-   struct cso_node *next;
-   unsigned key;
-   void *value;
-};
-
 struct cso_hash_data {
    struct cso_node *fakeNext;
    struct cso_node **buckets;
@@ -89,13 +83,6 @@
    int numBuckets;
 };
 
-struct cso_hash {
-   union {
-      struct cso_hash_data *d;
-      struct cso_node      *e;
-   } data;
-};
-
 static void *cso_data_allocate_node(struct cso_hash_data *hash)
 {
    return MALLOC(hash->nodeSize);
@@ -293,13 +280,6 @@
    return iter.node->key;
 }
 
-void * cso_hash_iter_data(struct cso_hash_iter iter)
-{
-   if (!iter.node || iter.hash->data.e == iter.node)
-      return 0;
-   return iter.node->value;
-}
-
 static struct cso_node *cso_hash_data_next(struct cso_node *node)
 {
    union {
@@ -374,13 +354,6 @@
    return next;
 }
 
-int cso_hash_iter_is_null(struct cso_hash_iter iter)
-{
-   if (!iter.node || iter.node == iter.hash->data.e)
-      return 1;
-   return 0;
-}
-
 void * cso_hash_take(struct cso_hash *hash,
                       unsigned akey)
 {
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h
index e58981c..d6eeb04 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.h
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.h
@@ -51,9 +51,18 @@
 #endif
 
 
-struct cso_hash;
-struct cso_node;
+struct cso_node {
+   struct cso_node *next;
+   unsigned key;
+   void *value;
+};
 
+struct cso_hash {
+   union {
+      struct cso_hash_data *d;
+      struct cso_node      *e;
+   } data;
+};
 
 struct cso_hash_iter {
    struct cso_hash *hash;
@@ -102,9 +111,7 @@
 boolean   cso_hash_contains(struct cso_hash *hash, unsigned key);
 
 
-int       cso_hash_iter_is_null(struct cso_hash_iter iter);
 unsigned  cso_hash_iter_key(struct cso_hash_iter iter);
-void     *cso_hash_iter_data(struct cso_hash_iter iter);
 
 
 struct cso_hash_iter cso_hash_iter_next(struct cso_hash_iter iter);
@@ -121,6 +128,21 @@
 				        void *templ,
 				        int size );
 
+static inline int
+cso_hash_iter_is_null(struct cso_hash_iter iter)
+{
+   if (!iter.node || iter.node == iter.hash->data.e)
+      return 1;
+   return 0;
+}
+
+static inline void *
+cso_hash_iter_data(struct cso_hash_iter iter)
+{
+   if (!iter.node || iter.hash->data.e == iter.node)
+      return 0;
+   return iter.node->value;
+}
 
 #ifdef	__cplusplus
 }
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 8f1189a..9791ec5 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -206,9 +206,8 @@
       }
    }
 
-   for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
-      pipe_resource_reference(&draw->pt.vertex_buffer[i].buffer, NULL);
-   }
+   for (i = 0; i < draw->pt.nr_vertex_buffers; i++)
+      pipe_vertex_buffer_unreference(&draw->pt.vertex_buffer[i]);
 
    /* Not so fast -- we're just borrowing this at the moment.
     * 
@@ -778,9 +777,6 @@
 /**
  * Tell the draw module where vertex indexes/elements are located, and
  * their size (in bytes).
- *
- * Note: the caller must apply the pipe_index_buffer::offset value to
- * the address.  The draw module doesn't do that.
  */
 void
 draw_set_indexes(struct draw_context *draw,
diff --git a/src/gallium/auxiliary/draw/draw_gs_tmp.h b/src/gallium/auxiliary/draw/draw_gs_tmp.h
index b10bbc4..bf276d3 100644
--- a/src/gallium/auxiliary/draw/draw_gs_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_gs_tmp.h
@@ -22,7 +22,7 @@
       default:                                                    \
          break;                                                   \
       }                                                           \
-   } while (0)                                                    \
+   } while (0)
 
 #define POINT(i0)                             gs_point(gs,i0)
 #define LINE(flags,i0,i1)                     gs_line(gs,i0,i1)
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index bb08f66..2035720 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -352,9 +352,9 @@
    LLVMTypeRef elem_types[4];
    LLVMTypeRef vb_type;
 
-   elem_types[0] =
-   elem_types[1] = LLVMInt32TypeInContext(gallivm->context);
-   elem_types[2] =
+   elem_types[0] = LLVMInt16TypeInContext(gallivm->context);
+   elem_types[1] = LLVMInt8TypeInContext(gallivm->context);
+   elem_types[2] = LLVMInt32TypeInContext(gallivm->context);
    elem_types[3] = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
 
    vb_type = LLVMStructTypeInContext(gallivm->context, elem_types,
@@ -363,8 +363,12 @@
    (void) target; /* silence unused var warning for non-debug build */
    LP_CHECK_MEMBER_OFFSET(struct pipe_vertex_buffer, stride,
                           target, vb_type, 0);
-   LP_CHECK_MEMBER_OFFSET(struct pipe_vertex_buffer, buffer_offset,
+   LP_CHECK_MEMBER_OFFSET(struct pipe_vertex_buffer, is_user_buffer,
                           target, vb_type, 1);
+   LP_CHECK_MEMBER_OFFSET(struct pipe_vertex_buffer, buffer_offset,
+                          target, vb_type, 2);
+   LP_CHECK_MEMBER_OFFSET(struct pipe_vertex_buffer, buffer.resource,
+                          target, vb_type, 3);
 
    LP_CHECK_STRUCT_SIZE(struct pipe_vertex_buffer, target, vb_type);
 
@@ -1699,6 +1703,8 @@
          vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr, &vb_index, 1, "");
          vb_info = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
          vb_stride[j] = draw_jit_vbuffer_stride(gallivm, vb_info);
+         vb_stride[j] = LLVMBuildZExt(gallivm->builder, vb_stride[j],
+                                      LLVMInt32TypeInContext(context), "");
          vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vb_info);
          map_ptr[j] = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
          buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index 57c9e72..a968be0 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -172,7 +172,7 @@
    lp_build_struct_get(_gallivm, _ptr, 0, "stride")
 
 #define draw_jit_vbuffer_offset(_gallivm, _ptr)         \
-   lp_build_struct_get(_gallivm, _ptr, 1, "buffer_offset")
+   lp_build_struct_get(_gallivm, _ptr, 2, "buffer_offset")
 
 enum {
    DRAW_JIT_DVBUFFER_MAP = 0,
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 57ca12e..a859dbc 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -1,5 +1,5 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 VMware, Inc.
  * All Rights Reserved.
  *
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 /**
@@ -423,9 +423,9 @@
 
       assert(aaline->texture->width0 == aaline->texture->height0);
 
-      u_box_origin_2d( size, size, &box );
+      u_box_origin_2d(size, size, &box);
 
-      /* This texture is new, no need to flush. 
+      /* This texture is new, no need to flush.
        */
       data = pipe->transfer_map(pipe,
                                 aaline->texture,
@@ -502,8 +502,7 @@
    struct draw_context *draw = aaline->stage.draw;
    struct pipe_context *pipe = draw->pipe;
 
-   if (!aaline->fs->aaline_fs && 
-       !generate_aaline_fs(aaline))
+   if (!aaline->fs->aaline_fs && !generate_aaline_fs(aaline))
       return FALSE;
 
    draw->suspend_flushing = TRUE;
@@ -516,7 +515,7 @@
 
 
 static inline struct aaline_stage *
-aaline_stage( struct draw_stage *stage )
+aaline_stage(struct draw_stage *stage)
 {
    return (struct aaline_stage *) stage;
 }
@@ -573,12 +572,12 @@
    pos[1] += (-dx * s_a + -dy * c_a);
 
    pos = v[2]->data[posPos];
-   pos[0] += ( dx * c_a -  dy * s_a);
-   pos[1] += ( dx * s_a +  dy * c_a);
+   pos[0] += (dx * c_a -  dy * s_a);
+   pos[1] += (dx * s_a +  dy * c_a);
 
    pos = v[3]->data[posPos];
-   pos[0] += ( dx * c_a - -dy * s_a);
-   pos[1] += ( dx * s_a + -dy * c_a);
+   pos[0] += (dx * c_a - -dy * s_a);
+   pos[1] += (dx * s_a + -dy * c_a);
 
    pos = v[4]->data[posPos];
    pos[0] += (-dx * c_a -  dy * s_a);
@@ -589,12 +588,12 @@
    pos[1] += (-dx * s_a + -dy * c_a);
 
    pos = v[6]->data[posPos];
-   pos[0] += ( dx * c_a -  dy * s_a);
-   pos[1] += ( dx * s_a +  dy * c_a);
+   pos[0] += (dx * c_a -  dy * s_a);
+   pos[1] += (dx * s_a +  dy * c_a);
 
    pos = v[7]->data[posPos];
-   pos[0] += ( dx * c_a - -dy * s_a);
-   pos[1] += ( dx * s_a + -dy * c_a);
+   pos[0] += (dx * c_a - -dy * s_a);
+   pos[1] += (dx * s_a + -dy * c_a);
 
    /* new texcoords */
    tex = v[0]->data[texPos];
@@ -623,22 +622,22 @@
 
    /* emit 6 tris for the quad strip */
    tri.v[0] = v[2];  tri.v[1] = v[1];  tri.v[2] = v[0];
-   stage->next->tri( stage->next, &tri );
+   stage->next->tri(stage->next, &tri);
 
    tri.v[0] = v[3];  tri.v[1] = v[1];  tri.v[2] = v[2];
-   stage->next->tri( stage->next, &tri );
+   stage->next->tri(stage->next, &tri);
 
    tri.v[0] = v[4];  tri.v[1] = v[3];  tri.v[2] = v[2];
-   stage->next->tri( stage->next, &tri );
+   stage->next->tri(stage->next, &tri);
 
    tri.v[0] = v[5];  tri.v[1] = v[3];  tri.v[2] = v[4];
-   stage->next->tri( stage->next, &tri );
+   stage->next->tri(stage->next, &tri);
 
    tri.v[0] = v[6];  tri.v[1] = v[5];  tri.v[2] = v[4];
-   stage->next->tri( stage->next, &tri );
+   stage->next->tri(stage->next, &tri);
 
    tri.v[0] = v[7];  tri.v[1] = v[5];  tri.v[2] = v[6];
-   stage->next->tri( stage->next, &tri );
+   stage->next->tri(stage->next, &tri);
 }
 
 
@@ -708,7 +707,7 @@
    struct pipe_context *pipe = draw->pipe;
 
    stage->line = aaline_first_line;
-   stage->next->flush( stage->next, flags );
+   stage->next->flush(stage->next, flags);
 
    /* restore original frag shader, texture, sampler state */
    draw->suspend_flushing = TRUE;
@@ -736,7 +735,7 @@
 static void
 aaline_reset_stipple_counter(struct draw_stage *stage)
 {
-   stage->next->reset_stipple_counter( stage->next );
+   stage->next->reset_stipple_counter(stage->next);
 }
 
 
@@ -761,7 +760,7 @@
       pipe_sampler_view_reference(&aaline->sampler_view, NULL);
    }
 
-   draw_free_temp_verts( stage );
+   draw_free_temp_verts(stage);
 
    /* restore the old entry points */
    pipe->create_fs_state = aaline->driver_create_fs_state;
@@ -771,7 +770,7 @@
    pipe->bind_sampler_states = aaline->driver_bind_sampler_states;
    pipe->set_sampler_views = aaline->driver_set_sampler_views;
 
-   FREE( stage );
+   FREE(stage);
 }
 
 
@@ -792,7 +791,7 @@
    aaline->stage.reset_stipple_counter = aaline_reset_stipple_counter;
    aaline->stage.destroy = aaline_destroy;
 
-   if (!draw_alloc_temp_verts( &aaline->stage, 8 ))
+   if (!draw_alloc_temp_verts(&aaline->stage, 8))
       goto fail;
 
    return aaline;
@@ -970,7 +969,7 @@
    /*
     * Create / install AA line drawing / prim stage
     */
-   aaline = draw_aaline_stage( draw );
+   aaline = draw_aaline_stage(draw);
    if (!aaline)
       goto fail;
 
@@ -996,16 +995,16 @@
 
    pipe->bind_sampler_states = aaline_bind_sampler_states;
    pipe->set_sampler_views = aaline_set_sampler_views;
-   
+
    /* Install once everything is known to be OK:
     */
    draw->pipeline.aaline = &aaline->stage;
 
    return TRUE;
 
- fail:
+fail:
    if (aaline)
-      aaline->stage.destroy( &aaline->stage );
-   
+      aaline->stage.destroy(&aaline->stage);
+
    return FALSE;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index cf2b417..4cfa54b 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -771,8 +771,9 @@
    int interp;
    /* If it's gl_{Front,Back}{,Secondary}Color, pick up the mode
     * from the array we've filled before. */
-   if (semantic_name == TGSI_SEMANTIC_COLOR ||
-       semantic_name == TGSI_SEMANTIC_BCOLOR) {
+   if ((semantic_name == TGSI_SEMANTIC_COLOR ||
+        semantic_name == TGSI_SEMANTIC_BCOLOR) &&
+       semantic_index < 2) {
       interp = indexed_interp[semantic_index];
    } else if (semantic_name == TGSI_SEMANTIC_POSITION ||
               semantic_name == TGSI_SEMANTIC_CLIPVERTEX) {
@@ -851,7 +852,8 @@
 
    if (fs) {
       for (i = 0; i < fs->info.num_inputs; i++) {
-         if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
+         if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+             fs->info.input_semantic_index[i] < 2) {
             if (fs->info.input_interpolate[i] != TGSI_INTERPOLATE_COLOR)
                indexed_interp[fs->info.input_semantic_index[i]] = fs->info.input_interpolate[i];
          }
@@ -881,6 +883,15 @@
          clipper->perspect_attribs[clipper->num_perspect_attribs] = i;
          clipper->num_perspect_attribs++;
          break;
+      case TGSI_INTERPOLATE_COLOR:
+         if (draw->rasterizer->flatshade) {
+            clipper->const_attribs[clipper->num_const_attribs] = i;
+            clipper->num_const_attribs++;
+         } else {
+            clipper->perspect_attribs[clipper->num_perspect_attribs] = i;
+            clipper->num_perspect_attribs++;
+         }
+         break;
       default:
          assert(interp == -1);
          break;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
index cd285e6..2830435 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
@@ -170,8 +170,9 @@
    int interp;
    /* If it's gl_{Front,Back}{,Secondary}Color, pick up the mode
     * from the array we've filled before. */
-   if (semantic_name == TGSI_SEMANTIC_COLOR ||
-       semantic_name == TGSI_SEMANTIC_BCOLOR) {
+   if ((semantic_name == TGSI_SEMANTIC_COLOR ||
+        semantic_name == TGSI_SEMANTIC_BCOLOR) &&
+       semantic_index < 2) {
       interp = indexed_interp[semantic_index];
    } else {
       /* Otherwise, search in the FS inputs, with a decent default
@@ -216,7 +217,8 @@
 
    if (fs) {
       for (i = 0; i < fs->info.num_inputs; i++) {
-         if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
+         if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+             fs->info.input_semantic_index[i] < 2) {
             if (fs->info.input_interpolate[i] != TGSI_INTERPOLATE_COLOR)
                indexed_interp[fs->info.input_semantic_index[i]] = fs->info.input_interpolate[i];
          }
@@ -236,7 +238,8 @@
                                info->output_semantic_index[i]);
       /* If it's flat, add it to the flat vector. */
 
-      if (interp == TGSI_INTERPOLATE_CONSTANT) {
+      if (interp == TGSI_INTERPOLATE_CONSTANT ||
+          (interp == TGSI_INTERPOLATE_COLOR && draw->rasterizer->flatshade)) {
          flat->flat_attribs[flat->num_flat_attribs] = i;
          flat->num_flat_attribs++;
       }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 0d39ee4..3a84d6c 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -1,5 +1,5 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 VMware, Inc.
  * All Rights Reserved.
  *
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 /* Authors:  Keith Whitwell <keithw@vmware.com>
@@ -61,16 +61,16 @@
 
 
 /**
- * Compute interpolated vertex attributes for 'dst' at position 't' 
+ * Compute interpolated vertex attributes for 'dst' at position 't'
  * between 'v0' and 'v1'.
  * XXX using linear interpolation for all attribs at this time.
  */
 static void
-screen_interp( struct draw_context *draw,
-               struct vertex_header *dst,
-               float t,
-               const struct vertex_header *v0, 
-               const struct vertex_header *v1 )
+screen_interp(struct draw_context *draw,
+              struct vertex_header *dst,
+              float t,
+              const struct vertex_header *v0,
+              const struct vertex_header *v1)
 {
    uint attr;
    uint num_outputs = draw_current_shader_outputs(draw);
@@ -95,16 +95,16 @@
    struct prim_header newprim = *header;
 
    if (t0 > 0.0) {
-      screen_interp( stage->draw, v0new, t0, header->v[0], header->v[1] );
+      screen_interp(stage->draw, v0new, t0, header->v[0], header->v[1]);
       newprim.v[0] = v0new;
    }
 
    if (t1 < 1.0) {
-      screen_interp( stage->draw, v1new, t1, header->v[0], header->v[1] );
+      screen_interp(stage->draw, v1new, t1, header->v[0], header->v[1]);
       newprim.v[1] = v1new;
    }
 
-   stage->next->line( stage->next, &newprim );
+   stage->next->line(stage->next, &newprim);
 }
 
 
@@ -176,7 +176,7 @@
 {
    struct stipple_stage *stipple = stipple_stage(stage);
    stipple->counter = 0;
-   stage->next->reset_stipple_counter( stage->next );
+   stage->next->reset_stipple_counter(stage->next);
 }
 
 static void
@@ -197,8 +197,8 @@
 
 
 static void
-stipple_first_line(struct draw_stage *stage, 
-		   struct prim_header *header)
+stipple_first_line(struct draw_stage *stage,
+                   struct prim_header *header)
 {
    struct stipple_stage *stipple = stipple_stage(stage);
    struct draw_context *draw = stage->draw;
@@ -207,7 +207,7 @@
    stipple->factor = draw->rasterizer->line_stipple_factor + 1;
 
    stage->line = stipple_line;
-   stage->line( stage, header );
+   stage->line(stage, header);
 }
 
 
@@ -215,24 +215,23 @@
 stipple_flush(struct draw_stage *stage, unsigned flags)
 {
    stage->line = stipple_first_line;
-   stage->next->flush( stage->next, flags );
+   stage->next->flush(stage->next, flags);
 }
 
 
-
-
-static void 
-stipple_destroy( struct draw_stage *stage )
+static void
+stipple_destroy(struct draw_stage *stage)
 {
-   draw_free_temp_verts( stage );
-   FREE( stage );
+   draw_free_temp_verts(stage);
+   FREE(stage);
 }
 
 
 /**
  * Create line stippler stage
  */
-struct draw_stage *draw_stipple_stage( struct draw_context *draw )
+struct draw_stage *
+draw_stipple_stage(struct draw_context *draw)
 {
    struct stipple_stage *stipple = CALLOC_STRUCT(stipple_stage);
    if (!stipple)
@@ -248,14 +247,14 @@
    stipple->stage.flush = stipple_flush;
    stipple->stage.destroy = stipple_destroy;
 
-   if (!draw_alloc_temp_verts( &stipple->stage, 2 ))
+   if (!draw_alloc_temp_verts(&stipple->stage, 2))
       goto fail;
 
    return &stipple->stage;
 
 fail:
    if (stipple)
-      stipple->stage.destroy( &stipple->stage );
+      stipple->stage.destroy(&stipple->stage);
 
    return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
index 52d87c6..7e76835 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
@@ -111,18 +111,21 @@
    twoside->attrib_back0 = -1;
    twoside->attrib_back1 = -1;
 
-   /* Find which vertex shader outputs are front/back colors */
+   /*
+    * Find which vertex shader outputs are front/back colors
+    * (only first two can be front or back).
+    */
    for (i = 0; i < vs->info.num_outputs; i++) {
       if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
          if (vs->info.output_semantic_index[i] == 0)
             twoside->attrib_front0 = i;
-         else
+         else if (vs->info.output_semantic_index[i] == 1)
             twoside->attrib_front1 = i;
       }
       if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) {
          if (vs->info.output_semantic_index[i] == 0)
             twoside->attrib_back0 = i;
-         else
+         else if (vs->info.output_semantic_index[i] == 1)
             twoside->attrib_back1 = i;
       }
    }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index f26063d..8592f51 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -28,7 +28,7 @@
 /**
  * \file
  * Vertex buffer drawing stage.
- * 
+ *
  * \author Jose Fonseca <jfonseca@vmware.com>
  * \author Keith Whitwell <keithw@vmware.com>
  */
@@ -37,7 +37,6 @@
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-
 #include "draw_vbuf.h"
 #include "draw_private.h"
 #include "draw_vertex.h"
@@ -53,14 +52,14 @@
    struct draw_stage stage; /**< This must be first (base class) */
 
    struct vbuf_render *render;
-   
+
    const struct vertex_info *vinfo;
-   
+
    /** Vertex size in bytes */
    unsigned vertex_size;
 
    struct translate *translate;
-   
+
    /* FIXME: we have no guarantee that 'unsigned' is 32bit */
 
    /** Vertices in hardware format */
@@ -68,7 +67,7 @@
    unsigned *vertex_ptr;
    unsigned max_vertices;
    unsigned nr_vertices;
-   
+
    /** Indices */
    ushort *indices;
    unsigned max_indices;
@@ -87,31 +86,28 @@
  * Basically a cast wrapper.
  */
 static inline struct vbuf_stage *
-vbuf_stage( struct draw_stage *stage )
+vbuf_stage(struct draw_stage *stage)
 {
    assert(stage);
    return (struct vbuf_stage *)stage;
 }
 
 
-static void vbuf_flush_vertices( struct vbuf_stage *vbuf );
-static void vbuf_alloc_vertices( struct vbuf_stage *vbuf );
+static void vbuf_flush_vertices(struct vbuf_stage *vbuf);
+static void vbuf_alloc_vertices(struct vbuf_stage *vbuf);
 
 
-static inline void 
-check_space( struct vbuf_stage *vbuf, unsigned nr )
+static inline void
+check_space(struct vbuf_stage *vbuf, unsigned nr)
 {
    if (vbuf->nr_vertices + nr > vbuf->max_vertices ||
-       vbuf->nr_indices + nr > vbuf->max_indices)
-   {
-      vbuf_flush_vertices( vbuf );
-      vbuf_alloc_vertices( vbuf );
+       vbuf->nr_indices + nr > vbuf->max_indices) {
+      vbuf_flush_vertices(vbuf);
+      vbuf_alloc_vertices(vbuf);
    }
 }
 
 
-
-
 /**
  * Extract the needed fields from post-transformed vertex and emit
  * a hardware(driver) vertex.
@@ -119,22 +115,21 @@
  * have a couple of slots at the beginning (1-dword header, 4-dword
  * clip pos) that we ignore here.  We only use the vertex->data[] fields.
  */
-static inline ushort 
-emit_vertex( struct vbuf_stage *vbuf,
-             struct vertex_header *vertex )
+static inline ushort
+emit_vertex(struct vbuf_stage *vbuf, struct vertex_header *vertex)
 {
    if (vertex->vertex_id == UNDEFINED_VERTEX_ID && vbuf->vertex_ptr) {
       /* Hmm - vertices are emitted one at a time - better make sure
        * set_buffer is efficient.  Consider a special one-shot mode for
        * translate.
        */
-      /* Note: we really do want data[0] here, not data[pos]: 
+      /* Note: we really do want data[0] here, not data[pos]:
        */
       vbuf->translate->set_buffer(vbuf->translate, 0, vertex->data[0], 0, ~0);
       vbuf->translate->run(vbuf->translate, 0, 1, 0, 0, vbuf->vertex_ptr);
 
       if (0) draw_dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr);
-      
+
       vbuf->vertex_ptr += vbuf->vertex_size/4;
       vertex->vertex_id = vbuf->nr_vertices++;
    }
@@ -143,57 +138,52 @@
 }
 
 
-static void 
-vbuf_tri( struct draw_stage *stage,
-          struct prim_header *prim )
+static void
+vbuf_tri(struct draw_stage *stage, struct prim_header *prim)
 {
-   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   struct vbuf_stage *vbuf = vbuf_stage(stage);
    unsigned i;
 
-   check_space( vbuf, 3 );
+   check_space(vbuf, 3);
 
    for (i = 0; i < 3; i++) {
-      vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] );
+      vbuf->indices[vbuf->nr_indices++] = emit_vertex(vbuf, prim->v[i]);
    }
 }
 
 
-static void 
-vbuf_line( struct draw_stage *stage, 
-           struct prim_header *prim )
+static void
+vbuf_line(struct draw_stage *stage, struct prim_header *prim)
 {
-   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   struct vbuf_stage *vbuf = vbuf_stage(stage);
    unsigned i;
 
-   check_space( vbuf, 2 );
+   check_space(vbuf, 2);
 
    for (i = 0; i < 2; i++) {
-      vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] );
-   }   
+      vbuf->indices[vbuf->nr_indices++] = emit_vertex(vbuf, prim->v[i]);
+   }
 }
 
 
-static void 
-vbuf_point( struct draw_stage *stage, 
-            struct prim_header *prim )
+static void
+vbuf_point(struct draw_stage *stage, struct prim_header *prim)
 {
-   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   struct vbuf_stage *vbuf = vbuf_stage(stage);
 
-   check_space( vbuf, 1 );
+   check_space(vbuf, 1);
 
-   vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[0] );
+   vbuf->indices[vbuf->nr_indices++] = emit_vertex(vbuf, prim->v[0]);
 }
 
 
-
-
 /**
  * Set the prim type for subsequent vertices.
  * This may result in a new vertex size.  The existing vbuffer (if any)
  * will be flushed if needed and a new one allocated.
  */
 static void
-vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
+vbuf_start_prim(struct vbuf_stage *vbuf, uint prim)
 {
    struct translate_key hw_key;
    unsigned dst_offset;
@@ -203,7 +193,7 @@
    vbuf->render->set_primitive(vbuf->render, prim);
 
    /* Must do this after set_primitive() above:
-    * 
+    *
     * XXX: need some state managment to track when this needs to be
     * recalculated.  The driver should tell us whether there was a
     * state change.
@@ -220,7 +210,7 @@
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       enum pipe_format output_format;
-      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
+      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float));
 
       output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
       emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
@@ -255,8 +245,7 @@
    /* Don't bother with caching at this stage:
     */
    if (!vbuf->translate ||
-       translate_key_compare(&vbuf->translate->key, &hw_key) != 0) 
-   {
+       translate_key_compare(&vbuf->translate->key, &hw_key) != 0) {
       translate_key_sanitize(&hw_key);
       vbuf->translate = translate_cache_find(vbuf->cache, &hw_key);
 
@@ -273,42 +262,39 @@
 }
 
 
-static void 
-vbuf_first_tri( struct draw_stage *stage,
-                struct prim_header *prim )
+static void
+vbuf_first_tri(struct draw_stage *stage, struct prim_header *prim)
 {
-   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   struct vbuf_stage *vbuf = vbuf_stage(stage);
 
-   vbuf_flush_vertices( vbuf );
+   vbuf_flush_vertices(vbuf);
    vbuf_start_prim(vbuf, PIPE_PRIM_TRIANGLES);
    stage->tri = vbuf_tri;
-   stage->tri( stage, prim );
+   stage->tri(stage, prim);
 }
 
 
-static void 
-vbuf_first_line( struct draw_stage *stage,
-                 struct prim_header *prim )
+static void
+vbuf_first_line(struct draw_stage *stage, struct prim_header *prim)
 {
-   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   struct vbuf_stage *vbuf = vbuf_stage(stage);
 
-   vbuf_flush_vertices( vbuf );
+   vbuf_flush_vertices(vbuf);
    vbuf_start_prim(vbuf, PIPE_PRIM_LINES);
    stage->line = vbuf_line;
-   stage->line( stage, prim );
+   stage->line(stage, prim);
 }
 
 
-static void 
-vbuf_first_point( struct draw_stage *stage,
-                  struct prim_header *prim )
+static void
+vbuf_first_point(struct draw_stage *stage, struct prim_header *prim)
 {
-   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   struct vbuf_stage *vbuf = vbuf_stage(stage);
 
    vbuf_flush_vertices(vbuf);
    vbuf_start_prim(vbuf, PIPE_PRIM_POINTS);
    stage->point = vbuf_point;
-   stage->point( stage, prim );
+   stage->point(stage, prim);
 }
 
 
@@ -316,28 +302,26 @@
 /**
  * Flush existing vertex buffer and allocate a new one.
  */
-static void 
-vbuf_flush_vertices( struct vbuf_stage *vbuf )
+static void
+vbuf_flush_vertices(struct vbuf_stage *vbuf)
 {
-   if(vbuf->vertices) {
+   if (vbuf->vertices) {
+      vbuf->render->unmap_vertices(vbuf->render, 0, vbuf->nr_vertices - 1);
 
-      vbuf->render->unmap_vertices( vbuf->render, 0, vbuf->nr_vertices - 1 );
+      if (vbuf->nr_indices) {
+         vbuf->render->draw_elements(vbuf->render,
+                                     vbuf->indices,
+                                     vbuf->nr_indices);
 
-      if (vbuf->nr_indices) 
-      {
-         vbuf->render->draw_elements(vbuf->render, 
-                                     vbuf->indices, 
-                                     vbuf->nr_indices );
-   
          vbuf->nr_indices = 0;
       }
-     
+
       /* Reset temporary vertices ids */
-      if(vbuf->nr_vertices)
-	 draw_reset_vertex_ids( vbuf->stage.draw );
-      
+      if (vbuf->nr_vertices)
+         draw_reset_vertex_ids(vbuf->stage.draw);
+
       /* Free the vertex buffer */
-      vbuf->render->release_vertices( vbuf->render );
+      vbuf->render->release_vertices(vbuf->render);
 
       vbuf->max_vertices = vbuf->nr_vertices = 0;
       vbuf->vertex_ptr = vbuf->vertices = NULL;
@@ -353,20 +337,21 @@
    vbuf->stage.line = vbuf_first_line;
    vbuf->stage.tri = vbuf_first_tri;
 }
-   
 
-static void 
-vbuf_alloc_vertices( struct vbuf_stage *vbuf )
+
+static void
+vbuf_alloc_vertices(struct vbuf_stage *vbuf)
 {
    if (vbuf->vertex_ptr) {
       assert(!vbuf->nr_indices);
       assert(!vbuf->vertices);
    }
-   
-   /* Allocate a new vertex buffer */
-   vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size;
 
-   if(vbuf->max_vertices >= UNDEFINED_VERTEX_ID)
+   /* Allocate a new vertex buffer */
+   vbuf->max_vertices =
+      vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size;
+
+   if (vbuf->max_vertices >= UNDEFINED_VERTEX_ID)
       vbuf->max_vertices = UNDEFINED_VERTEX_ID - 1;
 
    /* Must always succeed -- driver gives us a
@@ -378,24 +363,23 @@
                                    (ushort) vbuf->vertex_size,
                                    (ushort) vbuf->max_vertices);
 
-   vbuf->vertices = (uint *) vbuf->render->map_vertices( vbuf->render );
-   
+   vbuf->vertices = (uint *) vbuf->render->map_vertices(vbuf->render);
+
    vbuf->vertex_ptr = vbuf->vertices;
 }
 
 
-
-static void 
-vbuf_flush( struct draw_stage *stage, unsigned flags )
+static void
+vbuf_flush(struct draw_stage *stage, unsigned flags)
 {
-   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   struct vbuf_stage *vbuf = vbuf_stage(stage);
 
-   vbuf_flush_vertices( vbuf );
+   vbuf_flush_vertices(vbuf);
 }
 
 
-static void 
-vbuf_reset_stipple_counter( struct draw_stage *stage )
+static void
+vbuf_reset_stipple_counter(struct draw_stage *stage)
 {
    /* XXX: Need to do something here for hardware with linestipple.
     */
@@ -403,28 +387,29 @@
 }
 
 
-static void vbuf_destroy( struct draw_stage *stage )
+static void
+vbuf_destroy(struct draw_stage *stage)
 {
-   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   struct vbuf_stage *vbuf = vbuf_stage(stage);
 
-   if(vbuf->indices)
-      align_free( vbuf->indices );
-   
+   if (vbuf->indices)
+      align_free(vbuf->indices);
+
    if (vbuf->render)
-      vbuf->render->destroy( vbuf->render );
+      vbuf->render->destroy(vbuf->render);
 
    if (vbuf->cache)
       translate_cache_destroy(vbuf->cache);
 
-   FREE( stage );
+   FREE(stage);
 }
 
 
 /**
  * Create a new primitive vbuf/render stage.
  */
-struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
-                                    struct vbuf_render *render )
+struct draw_stage *
+draw_vbuf_stage(struct draw_context *draw, struct vbuf_render *render)
 {
    struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
    if (!vbuf)
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index adb6120..1329ab4 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -76,7 +76,7 @@
    uint texcoord_gen_slot[PIPE_MAX_SHADER_OUTPUTS];
 
    /* TGSI_SEMANTIC to which sprite_coord_enable applies */
-   unsigned sprite_coord_semantic;
+   enum tgsi_semantic sprite_coord_semantic;
 
    int psize_slot;
 };
@@ -242,7 +242,7 @@
        */
       for (i = 0; i < fs->info.num_inputs; i++) {
          int slot;
-         const unsigned sn = fs->info.input_semantic_name[i];
+         const enum tgsi_semantic sn = fs->info.input_semantic_name[i];
          const unsigned si = fs->info.input_semantic_index[i];
 
          if (sn == wide->sprite_coord_semantic) {
diff --git a/src/gallium/auxiliary/draw/draw_prim_assembler_tmp.h b/src/gallium/auxiliary/draw/draw_prim_assembler_tmp.h
index bff6d55..145a8ca 100644
--- a/src/gallium/auxiliary/draw/draw_prim_assembler_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_prim_assembler_tmp.h
@@ -19,7 +19,7 @@
       return;                                                       \
    default:                                                         \
       break;                                                        \
-   }                                                                \
+   }
 
 
 #define POINT(i0)                             prim_point(asmblr, i0)
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 5a49acb..be76a30 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -443,7 +443,7 @@
       info->count = target->internal_offset / vertex_buffer->stride;
 
       /* Stream output draw can not be indexed */
-      debug_assert(!info->indexed);
+      debug_assert(!info->index_size);
       info->max_index = info->count - 1;
    }
 }
@@ -473,7 +473,7 @@
    info = &resolved_info;
 
    assert(info->instance_count > 0);
-   if (info->indexed)
+   if (info->index_size)
       assert(draw->pt.user.elts);
 
    count = info->count;
@@ -481,7 +481,7 @@
    draw->pt.user.eltBias = info->index_bias;
    draw->pt.user.min_index = info->min_index;
    draw->pt.user.max_index = info->max_index;
-   draw->pt.user.eltSize = info->indexed ? draw->pt.user.eltSizeIB : 0;
+   draw->pt.user.eltSize = info->index_size ? draw->pt.user.eltSizeIB : 0;
 
    if (0)
       debug_printf("draw_vbo(mode=%u start=%u count=%u):\n",
diff --git a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
index 282a52d..c3a4695 100644
--- a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
@@ -22,7 +22,7 @@
       default:                                                    \
          break;                                                   \
       }                                                           \
-   } while (0)                                                    \
+   } while (0)
 
 #define POINT(i0)                so_point(so,i0)
 #define LINE(flags,i0,i1)        so_line(so,i0,i1)
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 5125eb4..4f11ac7 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
  /*
@@ -49,7 +49,9 @@
    struct tgsi_exec_machine *machine;
 };
 
-static struct exec_vertex_shader *exec_vertex_shader( struct draw_vertex_shader *vs )
+
+static struct exec_vertex_shader *
+exec_vertex_shader(struct draw_vertex_shader *vs)
 {
    return (struct exec_vertex_shader *)vs;
 }
@@ -58,8 +60,8 @@
 /* Not required for run_linear.
  */
 static void
-vs_exec_prepare( struct draw_vertex_shader *shader,
-		 struct draw_context *draw )
+vs_exec_prepare(struct draw_vertex_shader *shader,
+                struct draw_context *draw)
 {
    struct exec_vertex_shader *evs = exec_vertex_shader(shader);
 
@@ -78,20 +80,20 @@
 
 
 
-
-/* Simplified vertex shader interface for the pt paths.  Given the
+/**
+ * Simplified vertex shader interface for the pt paths.  Given the
  * complexity of code-generating all the above operations together,
  * it's time to try doing all the other stuff separately.
  */
 static void
-vs_exec_run_linear( struct draw_vertex_shader *shader,
-		    const float (*input)[4],
-		    float (*output)[4],
-                    const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+vs_exec_run_linear(struct draw_vertex_shader *shader,
+                   const float (*input)[4],
+                   float (*output)[4],
+                   const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
                     const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS],
-		    unsigned count,
-		    unsigned input_stride,
-		    unsigned output_stride )
+                   unsigned count,
+                   unsigned input_stride,
+                   unsigned output_stride)
 {
    struct exec_vertex_shader *evs = exec_vertex_shader(shader);
    struct tgsi_exec_machine *machine = evs->machine;
@@ -113,17 +115,17 @@
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
       unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
 
-      /* Swizzle inputs.  
+      /* Swizzle inputs.
        */
       for (j = 0; j < max_vertices; j++) {
 #if 0
          debug_printf("%d) Input vert:\n", i + j);
          for (slot = 0; slot < shader->info.num_inputs; slot++) {
             debug_printf("\t%d: %f %f %f %f\n", slot,
-			 input[slot][0],
-			 input[slot][1],
-			 input[slot][2],
-			 input[slot][3]);
+                         input[slot][0],
+                         input[slot][1],
+                         input[slot][2],
+                         input[slot][3]);
          }
 #endif
 
@@ -159,27 +161,24 @@
          }
 
          input = (const float (*)[4])((const char *)input + input_stride);
-      } 
+      }
 
       machine->NonHelperMask = (1 << max_vertices) - 1;
       /* run interpreter */
-      tgsi_exec_machine_run( machine, 0 );
+      tgsi_exec_machine_run(machine, 0);
 
-      /* Unswizzle all output results.  
+      /* Unswizzle all output results.
        */
       for (j = 0; j < max_vertices; j++) {
          for (slot = 0; slot < shader->info.num_outputs; slot++) {
-            unsigned name = shader->info.output_semantic_name[slot];
-            if(clamp_vertex_color &&
-                  (name == TGSI_SEMANTIC_COLOR || name == TGSI_SEMANTIC_BCOLOR))
-            {
+            enum tgsi_semantic name = shader->info.output_semantic_name[slot];
+            if (clamp_vertex_color &&
+                (name == TGSI_SEMANTIC_COLOR || name == TGSI_SEMANTIC_BCOLOR)) {
                output[slot][0] = CLAMP(machine->Outputs[slot].xyzw[0].f[j], 0.0f, 1.0f);
                output[slot][1] = CLAMP(machine->Outputs[slot].xyzw[1].f[j], 0.0f, 1.0f);
                output[slot][2] = CLAMP(machine->Outputs[slot].xyzw[2].f[j], 0.0f, 1.0f);
                output[slot][3] = CLAMP(machine->Outputs[slot].xyzw[3].f[j], 0.0f, 1.0f);
-            }
-            else
-            {
+            } else {
                output[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
                output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
                output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
@@ -188,39 +187,36 @@
          }
 
 #if 0
-	 debug_printf("%d) Post xform vert:\n", i + j);
-	 for (slot = 0; slot < shader->info.num_outputs; slot++) {
-	    debug_printf("\t%d: %f %f %f %f\n", slot,
-			 output[slot][0],
-			 output[slot][1],
-			 output[slot][2],
-			 output[slot][3]);
+         debug_printf("%d) Post xform vert:\n", i + j);
+         for (slot = 0; slot < shader->info.num_outputs; slot++) {
+            debug_printf("\t%d: %f %f %f %f\n", slot,
+                         output[slot][0],
+                         output[slot][1],
+                         output[slot][2],
+                         output[slot][3]);
             assert(!util_is_inf_or_nan(output[slot][0]));
          }
 #endif
 
-	 output = (float (*)[4])((char *)output + output_stride);
-      } 
-
+         output = (float (*)[4])((char *)output + output_stride);
+      }
    }
 }
 
 
-
-
 static void
-vs_exec_delete( struct draw_vertex_shader *dvs )
+vs_exec_delete(struct draw_vertex_shader *dvs)
 {
    FREE((void*) dvs->state.tokens);
-   FREE( dvs );
+   FREE(dvs);
 }
 
 
 struct draw_vertex_shader *
 draw_create_vs_exec(struct draw_context *draw,
-		    const struct pipe_shader_state *state)
+                    const struct pipe_shader_state *state)
 {
-   struct exec_vertex_shader *vs = CALLOC_STRUCT( exec_vertex_shader );
+   struct exec_vertex_shader *vs = CALLOC_STRUCT(exec_vertex_shader);
 
    if (!vs)
       return NULL;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 98eb694..22c19b1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -650,7 +650,13 @@
          for (i = 0; i < format_desc->nr_channels; i++) {
             struct util_format_channel_description chan_desc = format_desc->channel[i];
             unsigned blockbits = type.width;
-            unsigned vec_nr = chan_desc.shift / type.width;
+            unsigned vec_nr;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+            vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
+#else
+            vec_nr = chan_desc.shift / type.width;
+#endif
             chan_desc.shift %= type.width;
 
             output[i] = lp_build_extract_soa_chan(&bld,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
index ccd0376..7d11dcd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -234,13 +234,39 @@
           */
          res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
 
-         if (vector_justify) {
 #ifdef PIPE_ARCH_BIG_ENDIAN
+         if (vector_justify) {
          res = LLVMBuildShl(gallivm->builder, res,
                             LLVMConstInt(dst_elem_type,
                                          dst_type.width - src_width, 0), "");
-#endif
          }
+         if (src_width == 48) {
+            /* Load 3x16 bit vector.
+             * The sequence of loads on big-endian hardware proceeds as follows.
+             * 16-bit fields are denoted by X, Y, Z, and 0.  In memory, the sequence
+             * of three fields appears in the order X, Y, Z.
+             *
+             * Load 32-bit word: 0.0.X.Y
+             * Load 16-bit halfword: 0.0.0.Z
+             * Rotate left: 0.X.Y.0
+             * Bitwise OR: 0.X.Y.Z
+             *
+             * The order in which we need the fields in the result is 0.Z.Y.X,
+             * the same as on little-endian; permute 16-bit fields accordingly
+             * within 64-bit register:
+             */
+            LLVMValueRef shuffles[4] = {
+               lp_build_const_int32(gallivm, 2),
+               lp_build_const_int32(gallivm, 1),
+               lp_build_const_int32(gallivm, 0),
+               lp_build_const_int32(gallivm, 3),
+            };
+            res = LLVMBuildBitCast(gallivm->builder, res,
+                                   lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), "");
+            res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), "");
+            res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, "");
+         }
+#endif
       }
    }
    return res;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 9f1ade6..c456a97 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -606,7 +606,7 @@
       LLVMWriteBitcodeToFile(gallivm->module, filename);
       debug_printf("%s written\n", filename);
       debug_printf("Invoke as \"llc %s%s -o - %s\"\n",
-                   (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option] " : "",
+                   (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "",
                    "[-mattr=<-mattr option(s)>]",
                    filename);
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
index 19f98bb..b924555 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -168,10 +168,14 @@
 
 #if HAVE_LLVM < 0x0400
    LLVMAttribute llvm_attr = lp_attr_to_llvm_attr(attr);
-   if (attr_idx == -1) {
-      LLVMAddFunctionAttr(function_or_call, llvm_attr);
+   if (LLVMIsAFunction(function_or_call)) {
+      if (attr_idx == -1) {
+         LLVMAddFunctionAttr(function_or_call, llvm_attr);
+      } else {
+         LLVMAddAttribute(LLVMGetParam(function_or_call, attr_idx - 1), llvm_attr);
+      }
    } else {
-      LLVMAddAttribute(LLVMGetParam(function_or_call, attr_idx - 1), llvm_attr);
+      LLVMAddInstrAttribute(function_or_call, attr_idx, llvm_attr);
    }
 #else
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index b8c5c80..354e2a4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -135,6 +135,7 @@
    case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
    case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
    case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+   case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 2a388cb..d988910 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -49,6 +49,9 @@
 #endif
 
 #include <llvm-c/Core.h>
+#if HAVE_LLVM >= 0x0306
+#include <llvm-c/Support.h>
+#endif
 #include <llvm-c/ExecutionEngine.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
@@ -122,20 +125,26 @@
    llvm::InitializeNativeTargetAsmPrinter();
 
    llvm::InitializeNativeTargetDisassembler();
-}
-
-/**
- * The llvm target registry is not thread-safe, so drivers and state-trackers
- * that want to initialize targets should use the gallivm_init_llvm_targets()
- * function to safely initialize targets.
- *
- * LLVM targets should be initialized before the driver or state-tracker tries
- * to access the registry.
- */
-extern "C" void
-gallivm_init_llvm_targets(void)
-{
-   call_once(&init_native_targets_once_flag, init_native_targets);
+#if DEBUG && HAVE_LLVM >= 0x0306
+   {
+      char *env_llc_options = getenv("GALLIVM_LLC_OPTIONS");
+      if (env_llc_options) {
+         char *options[64] = {(char *) "llc"};      // Warning without cast
+         const int max_args = (int) (sizeof(options) / sizeof(options[0])) - 1;
+         int n = 0;
+         /* Cap the token count so a long GALLIVM_LLC_OPTIONS cannot overflow options[]. */
+         for (char *option = strtok(env_llc_options, " "); option && n < max_args; option = strtok(NULL, " "))
+            options[++n] = option;
+         if (gallivm_debug & (GALLIVM_DEBUG_IR | GALLIVM_DEBUG_ASM | GALLIVM_DEBUG_DUMP_BC)) {
+            debug_printf("llc additional options (%d):\n", n);
+            for (int i = 1; i <= n; i++)
+               debug_printf("\t%s\n", options[i]);
+            debug_printf("\n");
+         }
+         LLVMParseCommandLineOptions(n + 1, options, NULL);
+      }
+   }
+#endif
 }
 
 extern "C" void
@@ -150,7 +159,14 @@
    llvm::DisablePrettyStackTrace = true;
 #endif
 
-   gallivm_init_llvm_targets();
+   /* The llvm target registry is not thread-safe, so drivers and state-trackers
+    * that want to initialize targets should use the lp_set_target_options()
+    * function to safely initialize targets.
+    *
+    * LLVM targets should be initialized before the driver or state-tracker tries
+    * to access the registry.
+    */
+   call_once(&init_native_targets_once_flag, init_native_targets);
 }
 
 extern "C"
@@ -342,14 +358,20 @@
       virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
          mgr()->registerEHFrames(Addr, LoadAddr, Size);
       }
-      virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
-         mgr()->deregisterEHFrames(Addr, LoadAddr, Size);
-      }
 #else
       virtual void registerEHFrames(llvm::StringRef SectionData) {
          mgr()->registerEHFrames(SectionData);
       }
 #endif
+#if HAVE_LLVM >= 0x0500
+      virtual void deregisterEHFrames() {
+         mgr()->deregisterEHFrames();
+      }
+#elif HAVE_LLVM >= 0x0304
+      virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
+         mgr()->deregisterEHFrames(Addr, LoadAddr, Size);
+      }
+#endif
       virtual void *getPointerToNamedFunction(const std::string &Name,
                                               bool AbortOnFailure=true) {
          return mgr()->getPointerToNamedFunction(Name, AbortOnFailure);
@@ -608,23 +630,46 @@
 #if defined(PIPE_ARCH_PPC)
    MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec");
 #if (HAVE_LLVM >= 0x0304)
-#if (HAVE_LLVM <= 0x0307) || (HAVE_LLVM == 0x0308 && MESA_LLVM_VERSION_PATCH == 0)
+#if (HAVE_LLVM < 0x0400)
    /*
     * Make sure VSX instructions are disabled
-    * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=25503#c7
+    * See LLVM bugs:
+    * https://llvm.org/bugs/show_bug.cgi?id=25503#c7 (fixed in 3.8.1)
+    * https://llvm.org/bugs/show_bug.cgi?id=26775 (fixed in 3.8.1)
+    * https://llvm.org/bugs/show_bug.cgi?id=33531 (fixed in 4.0)
+    * https://llvm.org/bugs/show_bug.cgi?id=34647 (llc performance on certain unusual shader IR; introduced in 4.0, still pending as of 5.0)
     */
    if (util_cpu_caps.has_altivec) {
       MAttrs.push_back("-vsx");
    }
 #else
    /*
-    * However, bug 25503 is fixed, by the same fix that fixed
-    * bug 26775, in versions of LLVM later than 3.8 (starting with 3.8.1):
-    * Make sure VSX instructions are ENABLED
-    * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=26775
+    * Bug 25503 is fixed, by the same fix that fixed
+    * bug 26775, in versions of LLVM later than 3.8 (starting with 3.8.1).
+    * BZ 33531 actually comprises more than one bug, all of
+    * which are fixed in LLVM 4.0.
+    *
+    * With LLVM 4.0 or higher:
+    * Make sure VSX instructions are ENABLED, unless
+    * a) the entire -mattr option is overridden via GALLIVM_MATTRS, or
+    * b) VSX instructions are explicitly enabled/disabled via GALLIVM_VSX=1 or 0.
     */
    if (util_cpu_caps.has_altivec) {
-      MAttrs.push_back("+vsx");
+      char *env_mattrs = getenv("GALLIVM_MATTRS");
+      if (env_mattrs) {
+         MAttrs.push_back(env_mattrs);
+      }
+      else {
+         boolean enable_vsx = true;
+         char *env_vsx = getenv("GALLIVM_VSX");
+         if (env_vsx && env_vsx[0] == '0') {
+            enable_vsx = false;
+         }
+         if (enable_vsx)
+            MAttrs.push_back("+vsx");
+         else
+            MAttrs.push_back("-vsx");
+      }
    }
 #endif
 #endif
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
index dafb4cf..1b725d1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
@@ -42,9 +42,6 @@
 
 struct lp_generated_code;
 
-extern void
-gallivm_init_llvm_targets(void);
-
 extern LLVMTargetLibraryInfoRef
 gallivm_create_target_library_info(const char *triple);
 
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index f492c81..2deb48d 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -39,7 +39,6 @@
 
 #include "hud/hud_context.h"
 #include "hud/hud_private.h"
-#include "hud/font.h"
 
 #include "cso_cache/cso_context.h"
 #include "util/u_draw_quad.h"
@@ -57,51 +56,6 @@
 /* Control the visibility of all HUD contexts */
 static boolean huds_visible = TRUE;
 
-struct hud_context {
-   struct pipe_context *pipe;
-   struct cso_context *cso;
-
-   struct hud_batch_query_context *batch_query;
-   struct list_head pane_list;
-
-   /* states */
-   struct pipe_blend_state no_blend, alpha_blend;
-   struct pipe_depth_stencil_alpha_state dsa;
-   void *fs_color, *fs_text;
-   struct pipe_rasterizer_state rasterizer, rasterizer_aa_lines;
-   void *vs;
-   struct pipe_vertex_element velems[2];
-
-   /* font */
-   struct util_font font;
-   struct pipe_sampler_view *font_sampler_view;
-   struct pipe_sampler_state font_sampler_state;
-
-   /* VS constant buffer */
-   struct {
-      float color[4];
-      float two_div_fb_width;
-      float two_div_fb_height;
-      float translate[2];
-      float scale[2];
-      float padding[2];
-   } constants;
-   struct pipe_constant_buffer constbuf;
-
-   unsigned fb_width, fb_height;
-
-   /* vertices for text and background drawing are accumulated here and then
-    * drawn all at once */
-   struct vertex_queue {
-      float *vertices;
-      struct pipe_vertex_buffer vbuf;
-      unsigned max_num_vertices;
-      unsigned num_vertices;
-      unsigned buffer_size;
-   } text, bg, whitelines, color_prims;
-
-   bool has_srgb;
-};
 
 #ifdef PIPE_OS_UNIX
 static void
@@ -250,7 +204,7 @@
 }
 
 static void
-number_to_human_readable(uint64_t num, enum pipe_driver_query_type type,
+number_to_human_readable(double num, enum pipe_driver_query_type type,
                          char *out)
 {
    static const char *byte_units[] =
@@ -579,15 +533,15 @@
                   hud->whitelines.buffer_size +
                   hud->text.buffer_size +
                   hud->color_prims.buffer_size,
-                  16, &hud->bg.vbuf.buffer_offset, &hud->bg.vbuf.buffer,
+                  16, &hud->bg.vbuf.buffer_offset, &hud->bg.vbuf.buffer.resource,
                   (void**)&hud->bg.vertices);
    if (!hud->bg.vertices) {
       goto out;
    }
 
-   pipe_resource_reference(&hud->whitelines.vbuf.buffer, hud->bg.vbuf.buffer);
-   pipe_resource_reference(&hud->text.vbuf.buffer, hud->bg.vbuf.buffer);
-   pipe_resource_reference(&hud->color_prims.vbuf.buffer, hud->bg.vbuf.buffer);
+   pipe_resource_reference(&hud->whitelines.vbuf.buffer.resource, hud->bg.vbuf.buffer.resource);
+   pipe_resource_reference(&hud->text.vbuf.buffer.resource, hud->bg.vbuf.buffer.resource);
+   pipe_resource_reference(&hud->color_prims.vbuf.buffer.resource, hud->bg.vbuf.buffer.resource);
 
    hud->whitelines.vbuf.buffer_offset = hud->bg.vbuf.buffer_offset +
                                         hud->bg.buffer_size;
@@ -654,7 +608,7 @@
                              &hud->bg.vbuf);
       cso_draw_arrays(cso, PIPE_PRIM_QUADS, 0, hud->bg.num_vertices);
    }
-   pipe_resource_reference(&hud->bg.vbuf.buffer, NULL);
+   pipe_resource_reference(&hud->bg.vbuf.buffer.resource, NULL);
 
    /* draw accumulated vertices for white lines */
    cso_set_blend(cso, &hud->no_blend);
@@ -675,7 +629,7 @@
       cso_set_fragment_shader_handle(hud->cso, hud->fs_color);
       cso_draw_arrays(cso, PIPE_PRIM_LINES, 0, hud->whitelines.num_vertices);
    }
-   pipe_resource_reference(&hud->whitelines.vbuf.buffer, NULL);
+   pipe_resource_reference(&hud->whitelines.vbuf.buffer.resource, NULL);
 
    /* draw accumulated vertices for text */
    cso_set_blend(cso, &hud->alpha_blend);
@@ -685,7 +639,7 @@
       cso_set_fragment_shader_handle(hud->cso, hud->fs_text);
       cso_draw_arrays(cso, PIPE_PRIM_QUADS, 0, hud->text.num_vertices);
    }
-   pipe_resource_reference(&hud->text.vbuf.buffer, NULL);
+   pipe_resource_reference(&hud->text.vbuf.buffer.resource, NULL);
 
    /* draw the rest */
    cso_set_rasterizer(cso, &hud->rasterizer_aa_lines);
@@ -824,7 +778,8 @@
 }
 
 static struct hud_pane *
-hud_pane_create(unsigned x1, unsigned y1, unsigned x2, unsigned y2,
+hud_pane_create(struct hud_context *hud,
+                unsigned x1, unsigned y1, unsigned x2, unsigned y2,
                 unsigned period, uint64_t max_value, uint64_t ceiling,
                 boolean dyn_ceiling, boolean sort_items)
 {
@@ -833,6 +788,7 @@
    if (!pane)
       return NULL;
 
+   pane->hud = hud;
    pane->x1 = x1;
    pane->y1 = y1;
    pane->x2 = x2;
@@ -905,13 +861,19 @@
 }
 
 void
-hud_graph_add_value(struct hud_graph *gr, uint64_t value)
+hud_graph_add_value(struct hud_graph *gr, double value)
 {
    gr->current_value = value;
    value = value > gr->pane->ceiling ? gr->pane->ceiling : value;
 
-   if (gr->fd)
-      fprintf(gr->fd, "%" PRIu64 "\n", value);
+   if (gr->fd) {
+      if (fabs(value - lround(value)) > FLT_EPSILON) {
+         fprintf(gr->fd, "%f\n", value);
+      }
+      else {
+         fprintf(gr->fd, "%" PRIu64 "\n", (uint64_t) lround(value));
+      }
+   }
 
    if (gr->index == gr->pane->max_num_vertices) {
       gr->vertices[0] = 0;
@@ -958,26 +920,50 @@
    *dst = 0;
 }
 
+
+#ifdef PIPE_OS_WINDOWS
+#define W_OK 0
+static int
+access(const char *pathname, int mode)
+{
+   /* no-op */
+   return 0;
+}
+
+#define PATH_SEP "\\"
+
+#else
+
+#define PATH_SEP "/"
+
+#endif
+
+
+/**
+ * If the GALLIUM_HUD_DUMP_DIR env var is set, we'll write the raw
+ * HUD values to files at ${GALLIUM_HUD_DUMP_DIR}/<stat> where <stat>
+ * is a HUD variable such as "fps", or "cpu"
+ */
 static void
 hud_graph_set_dump_file(struct hud_graph *gr)
 {
-#ifndef PIPE_OS_WINDOWS
    const char *hud_dump_dir = getenv("GALLIUM_HUD_DUMP_DIR");
-   char *dump_file;
 
    if (hud_dump_dir && access(hud_dump_dir, W_OK) == 0) {
-      dump_file = malloc(strlen(hud_dump_dir) + sizeof("/") + sizeof(gr->name));
+      char *dump_file = malloc(strlen(hud_dump_dir) + sizeof(PATH_SEP)
+                               + sizeof(gr->name));
       if (dump_file) {
          strcpy(dump_file, hud_dump_dir);
-         strcat(dump_file, "/");
+         strcat(dump_file, PATH_SEP);
          strcat_without_spaces(dump_file, gr->name);
          gr->fd = fopen(dump_file, "w+");
-         if (gr->fd)
+         if (gr->fd) {
+            /* flush output after each line is written */
             setvbuf(gr->fd, NULL, _IOLBF, 0);
+         }
          free(dump_file);
       }
    }
-#endif
 }
 
 /**
@@ -1145,8 +1131,8 @@
      column_width = width > column_width ? width : column_width;
 
       if (!pane) {
-         pane = hud_pane_create(x, y, x + width, y + height, period, 10,
-                         ceiling, dyn_ceiling, sort_items);
+         pane = hud_pane_create(hud, x, y, x + width, y + height, period, 10,
+                                ceiling, dyn_ceiling, sort_items);
          if (!pane)
             return;
       }
@@ -1171,7 +1157,19 @@
          hud_cpu_graph_install(pane, i);
       }
       else if (strcmp(name, "API-thread-busy") == 0) {
-         hud_api_thread_busy_install(pane);
+         hud_thread_busy_install(pane, name, false);
+      }
+      else if (strcmp(name, "API-thread-offloaded-slots") == 0) {
+         hud_thread_counter_install(pane, name, HUD_COUNTER_OFFLOADED);
+      }
+      else if (strcmp(name, "API-thread-direct-slots") == 0) {
+         hud_thread_counter_install(pane, name, HUD_COUNTER_DIRECT);
+      }
+      else if (strcmp(name, "API-thread-num-syncs") == 0) {
+         hud_thread_counter_install(pane, name, HUD_COUNTER_SYNCS);
+      }
+      else if (strcmp(name, "main-thread-busy") == 0) {
+         hud_thread_busy_install(pane, name, true);
       }
 #if HAVE_GALLIUM_EXTRA_HUD
       else if (sscanf(name, "nic-rx-%s", arg_name) == 1) {
@@ -1716,3 +1714,11 @@
    pipe_resource_reference(&hud->font.texture, NULL);
    FREE(hud);
 }
+
+void
+hud_add_queue_for_monitoring(struct hud_context *hud,
+                             struct util_queue_monitoring *queue_info)
+{
+   assert(!hud->monitored_queue);
+   hud->monitored_queue = queue_info;
+}
diff --git a/src/gallium/auxiliary/hud/hud_context.h b/src/gallium/auxiliary/hud/hud_context.h
index abf2ad5..5a7e13b 100644
--- a/src/gallium/auxiliary/hud/hud_context.h
+++ b/src/gallium/auxiliary/hud/hud_context.h
@@ -32,6 +32,7 @@
 struct cso_context;
 struct pipe_context;
 struct pipe_resource;
+struct util_queue_monitoring;
 
 struct hud_context *
 hud_create(struct pipe_context *pipe, struct cso_context *cso);
@@ -42,4 +43,8 @@
 void
 hud_draw(struct hud_context *hud, struct pipe_resource *tex);
 
+void
+hud_add_queue_for_monitoring(struct hud_context *hud,
+                             struct util_queue_monitoring *queue_info);
+
 #endif
diff --git a/src/gallium/auxiliary/hud/hud_cpu.c b/src/gallium/auxiliary/hud/hud_cpu.c
index 302445d..4caaab6 100644
--- a/src/gallium/auxiliary/hud/hud_cpu.c
+++ b/src/gallium/auxiliary/hud/hud_cpu.c
@@ -32,6 +32,7 @@
 #include "os/os_time.h"
 #include "os/os_thread.h"
 #include "util/u_memory.h"
+#include "util/u_queue.h"
 #include <stdio.h>
 #include <inttypes.h>
 #ifdef PIPE_OS_WINDOWS
@@ -231,6 +232,7 @@
 }
 
 struct thread_info {
+   bool main_thread;   /* true: sample the calling thread's CPU time; false: sample thread 0 of the HUD's monitored queue */
    int64_t last_time;
    int64_t last_thread_time;
 };
@@ -243,7 +245,19 @@
 
    if (info->last_time) {
       if (info->last_time + gr->pane->period*1000 <= now) {
-         int64_t thread_now = pipe_current_thread_get_time_nano();
+         int64_t thread_now;
+
+         if (info->main_thread) {
+            thread_now = pipe_current_thread_get_time_nano();
+         } else {
+            struct util_queue_monitoring *mon = gr->pane->hud->monitored_queue;
+
+            if (mon && mon->queue)
+               thread_now = util_queue_get_thread_time_nano(mon->queue, 0);
+            else
+               thread_now = 0;
+         }
+
          unsigned percent = (thread_now - info->last_thread_time) * 100 /
                             (now - info->last_time);
 
@@ -266,7 +280,7 @@
 }
 
 void
-hud_api_thread_busy_install(struct hud_pane *pane)
+hud_thread_busy_install(struct hud_pane *pane, const char *name, bool main)
 {
    struct hud_graph *gr;
 
@@ -274,7 +288,7 @@
    if (!gr)
       return;
 
-   strcpy(gr->name, "API-thread-busy");
+   strcpy(gr->name, name);
 
    gr->query_data = CALLOC_STRUCT(thread_info);
    if (!gr->query_data) {
@@ -282,6 +296,7 @@
       return;
    }
 
+   ((struct thread_info*)gr->query_data)->main_thread = main;
    gr->query_new_value = query_api_thread_busy_status;
 
    /* Don't use free() as our callback as that messes up Gallium's
@@ -292,3 +307,77 @@
    hud_pane_add_graph(pane, gr);
    hud_pane_set_max_value(pane, 100);
 }
+
+struct counter_info {
+   enum hud_counter counter;   /* which monitored-queue counter this graph samples */
+   unsigned last_value;        /* counter reading taken at the previous sample */
+   int64_t last_time;          /* os_time_get_nano() value at the previous sample; 0 means not yet initialized */
+};
+
+static unsigned get_counter(struct hud_graph *gr, enum hud_counter counter)
+{
+   struct util_queue_monitoring *mon = gr->pane->hud->monitored_queue;
+
+   if (!mon || !mon->queue)
+      return 0;
+
+   switch (counter) {
+   case HUD_COUNTER_OFFLOADED:
+      return mon->num_offloaded_items;
+   case HUD_COUNTER_DIRECT:
+      return mon->num_direct_items;
+   case HUD_COUNTER_SYNCS:
+      return mon->num_syncs;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+static void
+query_thread_counter(struct hud_graph *gr)
+{
+   struct counter_info *info = gr->query_data;
+   int64_t now = os_time_get_nano();
+
+   if (info->last_time) {
+      if (info->last_time + gr->pane->period*1000 <= now) {
+         unsigned current_value = get_counter(gr, info->counter);
+
+         hud_graph_add_value(gr, current_value - info->last_value);
+         info->last_value = current_value;
+         info->last_time = now;
+      }
+   } else {
+      /* initialize */
+      info->last_value = get_counter(gr, info->counter);
+      info->last_time = now;
+   }
+}
+
+void hud_thread_counter_install(struct hud_pane *pane, const char *name,
+                                enum hud_counter counter)
+{
+   struct hud_graph *gr = CALLOC_STRUCT(hud_graph);
+   if (!gr)
+      return;
+
+   strcpy(gr->name, name);
+
+   gr->query_data = CALLOC_STRUCT(counter_info);
+   if (!gr->query_data) {
+      FREE(gr);
+      return;
+   }
+
+   ((struct counter_info*)gr->query_data)->counter = counter;
+   gr->query_new_value = query_thread_counter;
+
+   /* Don't use free() as our callback as that messes up Gallium's
+    * memory debugger.  Use simple free_query_data() wrapper.
+    */
+   gr->free_query_data = free_query_data;
+
+   hud_pane_add_graph(pane, gr);
+   hud_pane_set_max_value(pane, 100);
+}
diff --git a/src/gallium/auxiliary/hud/hud_fps.c b/src/gallium/auxiliary/hud/hud_fps.c
index a360bc2..8aa7a66 100644
--- a/src/gallium/auxiliary/hud/hud_fps.c
+++ b/src/gallium/auxiliary/hud/hud_fps.c
@@ -47,12 +47,12 @@
 
    if (info->last_time) {
       if (info->last_time + gr->pane->period <= now) {
-         double fps = (uint64_t)info->frames * 1000000 /
+         double fps = ((uint64_t)info->frames) * 1000000 /
                       (double)(now - info->last_time);
          info->frames = 0;
          info->last_time = now;
 
-         hud_graph_add_value(gr, (uint64_t) fps);
+         hud_graph_add_value(gr, fps);
       }
    }
    else {
diff --git a/src/gallium/auxiliary/hud/hud_private.h b/src/gallium/auxiliary/hud/hud_private.h
index bbc5ec7..65baa8a 100644
--- a/src/gallium/auxiliary/hud/hud_private.h
+++ b/src/gallium/auxiliary/hud/hud_private.h
@@ -29,7 +29,63 @@
 #define HUD_PRIVATE_H
 
 #include "pipe/p_context.h"
+#include "pipe/p_state.h"
 #include "util/list.h"
+#include "hud/font.h"
+
+enum hud_counter {
+   HUD_COUNTER_OFFLOADED, /* reads util_queue_monitoring num_offloaded_items */
+   HUD_COUNTER_DIRECT, /* reads util_queue_monitoring num_direct_items */
+   HUD_COUNTER_SYNCS, /* reads util_queue_monitoring num_syncs */
+};
+
+struct hud_context {
+   struct pipe_context *pipe;
+   struct cso_context *cso;
+
+   struct hud_batch_query_context *batch_query;
+   struct list_head pane_list;
+
+   struct util_queue_monitoring *monitored_queue; /* may be NULL */
+
+   /* states */
+   struct pipe_blend_state no_blend, alpha_blend;
+   struct pipe_depth_stencil_alpha_state dsa;
+   void *fs_color, *fs_text;
+   struct pipe_rasterizer_state rasterizer, rasterizer_aa_lines;
+   void *vs;
+   struct pipe_vertex_element velems[2];
+
+   /* font */
+   struct util_font font;
+   struct pipe_sampler_view *font_sampler_view;
+   struct pipe_sampler_state font_sampler_state;
+
+   /* VS constant buffer */
+   struct {
+      float color[4];
+      float two_div_fb_width;
+      float two_div_fb_height;
+      float translate[2];
+      float scale[2];
+      float padding[2]; /* NOTE(review): presumably pads to 12 floats - confirm */
+   } constants;
+   struct pipe_constant_buffer constbuf;
+
+   unsigned fb_width, fb_height;
+
+   /* vertices for text and background drawing are accumulated here and then
+    * drawn all at once */
+   struct vertex_queue {
+      float *vertices;
+      struct pipe_vertex_buffer vbuf;
+      unsigned max_num_vertices;
+      unsigned num_vertices;
+      unsigned buffer_size;
+   } text, bg, whitelines, color_prims; /* one queue per kind of primitive */
+
+   bool has_srgb;
+};
 
 struct hud_graph {
    /* initialized by common code */
@@ -48,12 +104,13 @@
    /* mutable variables */
    unsigned num_vertices;
    unsigned index; /* vertex index being updated */
-   uint64_t current_value;
+   double current_value;
    FILE *fd;
 };
 
 struct hud_pane {
    struct list_head head;
+   struct hud_context *hud;
    unsigned x1, y1, x2, y2;
    unsigned inner_x1;
    unsigned inner_y1;
@@ -82,7 +139,7 @@
 /* core */
 void hud_pane_add_graph(struct hud_pane *pane, struct hud_graph *gr);
 void hud_pane_set_max_value(struct hud_pane *pane, uint64_t value);
-void hud_graph_add_value(struct hud_graph *gr, uint64_t value);
+void hud_graph_add_value(struct hud_graph *gr, double value);
 
 /* graphs/queries */
 struct hud_batch_query_context;
@@ -93,7 +150,9 @@
 
 void hud_fps_graph_install(struct hud_pane *pane);
 void hud_cpu_graph_install(struct hud_pane *pane, unsigned cpu_index);
-void hud_api_thread_busy_install(struct hud_pane *pane);
+void hud_thread_busy_install(struct hud_pane *pane, const char *name, bool main);
+void hud_thread_counter_install(struct hud_pane *pane, const char *name,
+                                enum hud_counter counter);
 void hud_pipe_query_install(struct hud_batch_query_context **pbq,
                             struct hud_pane *pane, struct pipe_context *pipe,
                             const char *name, unsigned query_type,
diff --git a/src/gallium/auxiliary/indices/u_primconvert.c b/src/gallium/auxiliary/indices/u_primconvert.c
index 1ffca4b..778f174 100644
--- a/src/gallium/auxiliary/indices/u_primconvert.c
+++ b/src/gallium/auxiliary/indices/u_primconvert.c
@@ -33,7 +33,6 @@
  *
  *    // emulate unsupported primitives:
  *    if (info->mode needs emulating) {
- *       util_primconvert_save_index_buffer(ctx->primconvert, &ctx->indexbuf);
  *       util_primconvert_save_rasterizer_state(ctx->primconvert, ctx->rasterizer);
  *       util_primconvert_draw_vbo(ctx->primconvert, info);
  *       return;
@@ -53,7 +52,6 @@
 struct primconvert_context
 {
    struct pipe_context *pipe;
-   struct pipe_index_buffer saved_ib;
    uint32_t primtypes_mask;
    unsigned api_pv;
 };
@@ -73,26 +71,10 @@
 void
 util_primconvert_destroy(struct primconvert_context *pc)
 {
-   util_primconvert_save_index_buffer(pc, NULL);
    FREE(pc);
 }
 
 void
-util_primconvert_save_index_buffer(struct primconvert_context *pc,
-                                   const struct pipe_index_buffer *ib)
-{
-   if (ib) {
-      pipe_resource_reference(&pc->saved_ib.buffer, ib->buffer);
-      pc->saved_ib.index_size = ib->index_size;
-      pc->saved_ib.offset = ib->offset;
-      pc->saved_ib.user_buffer = ib->user_buffer;
-   }
-   else {
-      pipe_resource_reference(&pc->saved_ib.buffer, NULL);
-   }
-}
-
-void
 util_primconvert_save_rasterizer_state(struct primconvert_context *pc,
                                        const struct pipe_rasterizer_state
                                        *rast)
@@ -108,18 +90,15 @@
 util_primconvert_draw_vbo(struct primconvert_context *pc,
                           const struct pipe_draw_info *info)
 {
-   struct pipe_index_buffer *ib = &pc->saved_ib;
-   struct pipe_index_buffer new_ib;
    struct pipe_draw_info new_info;
    struct pipe_transfer *src_transfer = NULL;
    u_translate_func trans_func;
    u_generate_func gen_func;
    const void *src = NULL;
    void *dst;
+   unsigned ib_offset;
 
-   memset(&new_ib, 0, sizeof(new_ib));
    util_draw_init_info(&new_info);
-   new_info.indexed = true;
    new_info.min_index = info->min_index;
    new_info.max_index = info->max_index;
    new_info.index_bias = info->index_bias;
@@ -127,38 +106,43 @@
    new_info.instance_count = info->instance_count;
    new_info.primitive_restart = info->primitive_restart;
    new_info.restart_index = info->restart_index;
-   if (info->indexed) {
+   if (info->index_size) {
       enum pipe_prim_type mode = 0;
+      unsigned index_size;
 
       u_index_translator(pc->primtypes_mask,
-                         info->mode, pc->saved_ib.index_size, info->count,
+                         info->mode, info->index_size, info->count,
                          pc->api_pv, pc->api_pv,
                          info->primitive_restart ? PR_ENABLE : PR_DISABLE,
-                         &mode, &new_ib.index_size, &new_info.count,
+                         &mode, &index_size, &new_info.count,
                          &trans_func);
       new_info.mode = mode;
-      src = ib->user_buffer;
+      new_info.index_size = index_size;
+      src = info->has_user_indices ? info->index.user : NULL;
       if (!src) {
-         src = pipe_buffer_map(pc->pipe, ib->buffer,
+         src = pipe_buffer_map(pc->pipe, info->index.resource,
                                PIPE_TRANSFER_READ, &src_transfer);
       }
-      src = (const uint8_t *)src + ib->offset;
+      src = (const uint8_t *)src;
    }
    else {
       enum pipe_prim_type mode = 0;
+      unsigned index_size;
 
       u_index_generator(pc->primtypes_mask,
                         info->mode, info->start, info->count,
                         pc->api_pv, pc->api_pv,
-                        &mode, &new_ib.index_size, &new_info.count,
+                        &mode, &index_size, &new_info.count,
                         &gen_func);
       new_info.mode = mode;
+      new_info.index_size = index_size;
    }
 
-   u_upload_alloc(pc->pipe->stream_uploader, 0, new_ib.index_size * new_info.count, 4,
-                  &new_ib.offset, &new_ib.buffer, &dst);
+   u_upload_alloc(pc->pipe->stream_uploader, 0, new_info.index_size * new_info.count, 4,
+                  &ib_offset, &new_info.index.resource, &dst);
+   new_info.start = ib_offset / new_info.index_size;
 
-   if (info->indexed) {
+   if (info->index_size) {
       trans_func(src, info->start, info->count, new_info.count, info->restart_index, dst);
    }
    else {
@@ -170,14 +154,8 @@
 
    u_upload_unmap(pc->pipe->stream_uploader);
 
-   /* bind new index buffer: */
-   pc->pipe->set_index_buffer(pc->pipe, &new_ib);
-
    /* to the translated draw: */
    pc->pipe->draw_vbo(pc->pipe, &new_info);
 
-   /* and then restore saved ib: */
-   pc->pipe->set_index_buffer(pc->pipe, ib);
-
-   pipe_resource_reference(&new_ib.buffer, NULL);
+   pipe_resource_reference(&new_info.index.resource, NULL);
 }
diff --git a/src/gallium/auxiliary/indices/u_primconvert.h b/src/gallium/auxiliary/indices/u_primconvert.h
index 73ffea0..02ee063 100644
--- a/src/gallium/auxiliary/indices/u_primconvert.h
+++ b/src/gallium/auxiliary/indices/u_primconvert.h
@@ -34,8 +34,6 @@
 struct primconvert_context *util_primconvert_create(struct pipe_context *pipe,
                                                     uint32_t primtypes_mask);
 void util_primconvert_destroy(struct primconvert_context *pc);
-void util_primconvert_save_index_buffer(struct primconvert_context *pc,
-                                        const struct pipe_index_buffer *ib);
 void util_primconvert_save_rasterizer_state(struct primconvert_context *pc,
                                             const struct pipe_rasterizer_state
                                             *rast);
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index de33375..d4914ac 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -315,8 +315,8 @@
 
       /* nothing to do for UBOs: */
       if ((file == TGSI_FILE_CONSTANT) && decl->Declaration.Dimension) {
-         b->shader->info->num_ubos =
-            MAX2(b->shader->info->num_ubos, decl->Dim.Index2D);
+         b->shader->info.num_ubos =
+            MAX2(b->shader->info.num_ubos, decl->Dim.Index2D);
          return;
       }
 
@@ -374,7 +374,7 @@
             exec_list_push_tail(&b->shader->inputs, &var->node);
 
             for (int i = 0; i < array_size; i++)
-               b->shader->info->inputs_read |= 1 << (var->data.location + i);
+               b->shader->info.inputs_read |= 1 << (var->data.location + i);
 
             break;
          case TGSI_FILE_OUTPUT: {
@@ -440,7 +440,7 @@
             exec_list_push_tail(&b->shader->outputs, &var->node);
 
             for (int i = 0; i < array_size; i++)
-               b->shader->info->outputs_written |= 1 << (var->data.location + i);
+               b->shader->info.outputs_written |= 1 << (var->data.location + i);
          }
             break;
          case TGSI_FILE_CONSTANT:
@@ -587,7 +587,7 @@
 
       src = nir_src_for_ssa(&load->dest.ssa);
 
-      b->shader->info->system_values_read |=
+      b->shader->info.system_values_read |=
          (1 << nir_system_value_from_intrinsic(op));
 
       break;
@@ -1068,7 +1068,7 @@
    nir_intrinsic_instr *discard =
       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard);
    nir_builder_instr_insert(b, &discard->instr);
-   b->shader->info->fs.uses_discard = true;
+   b->shader->info.fs.uses_discard = true;
 }
 
 static void
@@ -1081,7 +1081,7 @@
       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
    discard->src[0] = nir_src_for_ssa(cmp);
    nir_builder_instr_insert(b, &discard->instr);
-   b->shader->info->fs.uses_discard = true;
+   b->shader->info.fs.uses_discard = true;
 }
 
 static void
@@ -1411,15 +1411,17 @@
    }
 
    if (tgsi_inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
+      instr->src[src_number].src_type = nir_tex_src_ddx;
       instr->src[src_number].src =
          nir_src_for_ssa(nir_swizzle(b, src[1], SWIZ(X, Y, Z, W),
-              instr->coord_components, false));
-      instr->src[src_number].src_type = nir_tex_src_ddx;
+				     nir_tex_instr_src_size(instr, src_number),
+				     false));
       src_number++;
+      instr->src[src_number].src_type = nir_tex_src_ddy;
       instr->src[src_number].src =
          nir_src_for_ssa(nir_swizzle(b, src[2], SWIZ(X, Y, Z, W),
-              instr->coord_components, false));
-      instr->src[src_number].src_type = nir_tex_src_ddy;
+				     nir_tex_instr_src_size(instr, src_number),
+				     false));
       src_number++;
    }
 
@@ -1462,7 +1464,9 @@
 
    assert(src_number == num_srcs);
 
-   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, 32, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest,
+		     nir_tex_instr_dest_size(instr),
+		     32, NULL);
    nir_builder_instr_insert(b, &instr->instr);
 
    /* Resolve the writemask on the texture op. */
@@ -1501,7 +1505,8 @@
    txs->src[0].src = nir_src_for_ssa(ttn_channel(b, src[0], X));
    txs->src[0].src_type = nir_tex_src_lod;
 
-   nir_ssa_dest_init(&txs->instr, &txs->dest, 3, 32, NULL);
+   nir_ssa_dest_init(&txs->instr, &txs->dest,
+		     nir_tex_instr_dest_size(txs), 32, NULL);
    nir_builder_instr_insert(b, &txs->instr);
 
    nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, 32, NULL);
diff --git a/src/gallium/auxiliary/os/os_mman.h b/src/gallium/auxiliary/os/os_mman.h
index 2ae0027..c7ef9d0 100644
--- a/src/gallium/auxiliary/os/os_mman.h
+++ b/src/gallium/auxiliary/os/os_mman.h
@@ -45,30 +45,15 @@
 #  error Unsupported OS
 #endif
 
-#if defined(PIPE_OS_ANDROID)
-#  include <errno.h> /* for EINVAL */
-#endif
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
 
 #if defined(PIPE_OS_ANDROID) && !defined(__LP64__)
-
-extern void *__mmap2(void *, size_t, int, int, int, size_t);
-
-static inline void *os_mmap(void *addr, size_t length, int prot, int flags,
-                            int fd, loff_t offset)
-{
-   /* offset must be aligned to 4096 (not necessarily the page size) */
-   if (unlikely(offset & 4095)) {
-      errno = EINVAL;
-      return MAP_FAILED;
-   }
-
-   return __mmap2(addr, length, prot, flags, fd, (size_t) (offset >> 12));
-}
+/* 32-bit needs mmap64 for 64-bit offsets */
+#  define os_mmap(addr, length, prot, flags, fd, offset) \
+             mmap64(addr, length, prot, flags, fd, offset)
 
 #  define os_munmap(addr, length) \
              munmap(addr, length)
diff --git a/src/gallium/auxiliary/os/os_process.c b/src/gallium/auxiliary/os/os_process.c
index 6622b9b..035bd22 100644
--- a/src/gallium/auxiliary/os/os_process.c
+++ b/src/gallium/auxiliary/os/os_process.c
@@ -34,7 +34,7 @@
 #  include <windows.h>
 #elif defined(__GLIBC__) || defined(__CYGWIN__)
 #  include <errno.h>
-#elif defined(PIPE_OS_BSD) || defined(PIPE_OS_APPLE)
+#elif defined(PIPE_OS_BSD) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_ANDROID)
 #  include <stdlib.h>
 #elif defined(PIPE_OS_HAIKU)
 #  include <kernel/OS.h>
@@ -86,7 +86,7 @@
 
 #elif defined(__GLIBC__) || defined(__CYGWIN__)
       name = program_invocation_short_name;
-#elif defined(PIPE_OS_BSD) || defined(PIPE_OS_APPLE)
+#elif defined(PIPE_OS_BSD) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_ANDROID)
       /* *BSD and OS X */
       name = getprogname();
 #elif defined(PIPE_OS_HAIKU)
diff --git a/src/gallium/auxiliary/os/os_thread.h b/src/gallium/auxiliary/os/os_thread.h
index b6e0698..10d4695 100644
--- a/src/gallium/auxiliary/os/os_thread.h
+++ b/src/gallium/auxiliary/os/os_thread.h
@@ -42,17 +42,6 @@
 #include "util/u_thread.h"
 
 
-static inline int pipe_thread_is_self( thrd_t thread )
-{
-#if defined(HAVE_PTHREAD)
-#  if defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
-      (__GLIBC__ >= 3 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 12))
-   return pthread_equal(pthread_self(), thread);
-#  endif
-#endif
-   return 0;
-}
-
 #define pipe_mutex_assert_locked(mutex) \
    __pipe_mutex_assert_locked(&(mutex))
 
@@ -75,7 +64,7 @@
  * pipe_barrier
  */
 
-#if (defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_HURD)) && !defined(PIPE_OS_ANDROID)
+#if (defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_HURD)) && (!defined(PIPE_OS_ANDROID) || ANDROID_API_LEVEL >= 24)
 
 typedef pthread_barrier_t pipe_barrier;
 
diff --git a/src/gallium/auxiliary/os/os_time.c b/src/gallium/auxiliary/os/os_time.c
index e169139..e4a1cae 100644
--- a/src/gallium/auxiliary/os/os_time.c
+++ b/src/gallium/auxiliary/os/os_time.c
@@ -69,10 +69,17 @@
 
    static LARGE_INTEGER frequency;
    LARGE_INTEGER counter;
+   int64_t secs, nanosecs;
    if(!frequency.QuadPart)
       QueryPerformanceFrequency(&frequency);
    QueryPerformanceCounter(&counter);
-   return counter.QuadPart*INT64_C(1000000000)/frequency.QuadPart;
+   /* Compute seconds and nanoseconds parts separately to
+    * reduce severity of precision loss.
+    */
+   secs = counter.QuadPart / frequency.QuadPart;
+   nanosecs = (counter.QuadPart % frequency.QuadPart) * INT64_C(1000000000)
+      / frequency.QuadPart;
+   return secs*INT64_C(1000000000) + nanosecs;
 
 #else
 
diff --git a/src/gallium/auxiliary/pipe-loader/Android.mk b/src/gallium/auxiliary/pipe-loader/Android.mk
index 006bb0e..1e1bb11 100644
--- a/src/gallium/auxiliary/pipe-loader/Android.mk
+++ b/src/gallium/auxiliary/pipe-loader/Android.mk
@@ -33,14 +33,13 @@
 	-DDROP_PIPE_LOADER_MISC \
 	-DGALLIUM_STATIC_TARGETS
 
-LOCAL_SRC_FILES := $(COMMON_SOURCES)
+LOCAL_SRC_FILES := \
+	$(COMMON_SOURCES) \
+	$(DRM_SOURCES)
 
 LOCAL_MODULE := libmesa_pipe_loader
 
-ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
-LOCAL_SRC_FILES += $(DRM_SOURCES)
 LOCAL_STATIC_LIBRARIES := libmesa_loader
-endif
 
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.c b/src/gallium/auxiliary/pipe-loader/pipe_loader.c
index 1ddfeba..0857a2c 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.c
@@ -74,9 +74,9 @@
 }
 
 struct pipe_screen *
-pipe_loader_create_screen(struct pipe_loader_device *dev)
+pipe_loader_create_screen(struct pipe_loader_device *dev, unsigned flags)
 {
-   return dev->ops->create_screen(dev);
+   return dev->ops->create_screen(dev, flags);
 }
 
 struct util_dl_library *
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.h b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
index 690d088..73b7558 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader.h
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
@@ -84,7 +84,7 @@
  * \param dev Device the screen will be created for.
  */
 struct pipe_screen *
-pipe_loader_create_screen(struct pipe_loader_device *dev);
+pipe_loader_create_screen(struct pipe_loader_device *dev, unsigned flags);
 
 /**
  * Query the configuration parameters for the specified device.
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index a4f5cfc..ef446b6 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -129,6 +129,11 @@
         .configuration = configuration_query,
     },
     {
+        .driver_name = "pl111",
+        .create_screen = pipe_pl111_create_screen,
+        .configuration = configuration_query,
+    },
+    {
         .driver_name = "virtio_gpu",
         .create_screen = pipe_virgl_create_screen,
         .configuration = configuration_query,
@@ -276,11 +281,11 @@
 }
 
 static struct pipe_screen *
-pipe_loader_drm_create_screen(struct pipe_loader_device *dev)
+pipe_loader_drm_create_screen(struct pipe_loader_device *dev, unsigned flags)
 {
    struct pipe_loader_drm_device *ddev = pipe_loader_drm_device(dev);
 
-   return ddev->dd->create_screen(ddev->fd);
+   return ddev->dd->create_screen(ddev->fd, flags);
 }
 
 static const struct pipe_loader_ops pipe_loader_drm_ops = {
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h b/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h
index da2ca8c..58ab992 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_priv.h
@@ -31,7 +31,8 @@
 #include "pipe_loader.h"
 
 struct pipe_loader_ops {
-   struct pipe_screen *(*create_screen)(struct pipe_loader_device *dev);
+   struct pipe_screen *(*create_screen)(struct pipe_loader_device *dev,
+                                        unsigned flags);
 
    const struct drm_conf_ret *(*configuration)(struct pipe_loader_device *dev,
                                                enum drm_conf conf);
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
index 0fbc78e..46c6604 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
@@ -294,7 +294,8 @@
 }
 
 static struct pipe_screen *
-pipe_loader_sw_create_screen(struct pipe_loader_device *dev)
+pipe_loader_sw_create_screen(struct pipe_loader_device *dev,
+                             unsigned flags)
 {
    struct pipe_loader_sw_device *sdev = pipe_loader_sw_device(dev);
    struct pipe_screen *screen;
diff --git a/src/gallium/auxiliary/renderonly/renderonly.c b/src/gallium/auxiliary/renderonly/renderonly.c
index d3ed214..da91f12 100644
--- a/src/gallium/auxiliary/renderonly/renderonly.c
+++ b/src/gallium/auxiliary/renderonly/renderonly.c
@@ -50,27 +50,12 @@
    return copy;
 }
 
-struct renderonly_scanout *
-renderonly_scanout_for_prime(struct pipe_resource *rsc, struct renderonly *ro)
-{
-   struct renderonly_scanout *scanout;
-
-   scanout = CALLOC_STRUCT(renderonly_scanout);
-   if (!scanout)
-      return NULL;
-
-   scanout->prime = rsc;
-
-   return scanout;
-}
-
 void
 renderonly_scanout_destroy(struct renderonly_scanout *scanout,
 			   struct renderonly *ro)
 {
    struct drm_mode_destroy_dumb destroy_dumb = { };
 
-   pipe_resource_reference(&scanout->prime, NULL);
    if (ro->kms_fd != -1) {
       destroy_dumb.handle = scanout->handle;
       drmIoctl(ro->kms_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_dumb);
@@ -80,12 +65,11 @@
 
 struct renderonly_scanout *
 renderonly_create_kms_dumb_buffer_for_resource(struct pipe_resource *rsc,
-                                               struct renderonly *ro)
+                                               struct renderonly *ro,
+                                               struct winsys_handle *out_handle)
 {
-   struct pipe_screen *screen = rsc->screen;
    struct renderonly_scanout *scanout;
-   struct winsys_handle handle;
-   int prime_fd, err;
+   int err;
    struct drm_mode_create_dumb create_dumb = {
       .width = rsc->width0,
       .height = rsc->height0,
@@ -108,30 +92,21 @@
    scanout->handle = create_dumb.handle;
    scanout->stride = create_dumb.pitch;
 
-   /* export dumb buffer */
+   if (!out_handle)
+      return scanout;
+
+   /* fill in winsys handle */
+   memset(out_handle, 0, sizeof(*out_handle));
+   out_handle->type = DRM_API_HANDLE_TYPE_FD;
+   out_handle->stride = create_dumb.pitch;
+
    err = drmPrimeHandleToFD(ro->kms_fd, create_dumb.handle, O_CLOEXEC,
-         &prime_fd);
+         (int *)&out_handle->handle);
    if (err < 0) {
       fprintf(stderr, "failed to export dumb buffer: %s\n", strerror(errno));
       goto free_dumb;
    }
 
-   /* import dumb buffer */
-   memset(&handle, 0, sizeof(handle));
-   handle.type = DRM_API_HANDLE_TYPE_FD;
-   handle.handle = prime_fd;
-   handle.stride = create_dumb.pitch;
-
-   scanout->prime = screen->resource_from_handle(screen, rsc,
-         &handle, PIPE_HANDLE_USAGE_READ_WRITE);
-
-   close(prime_fd);
-
-   if (!scanout->prime) {
-      fprintf(stderr, "failed to create resource_from_handle: %s\n", strerror(errno));
-      goto free_dumb;
-   }
-
    return scanout;
 
 free_dumb:
@@ -146,7 +121,8 @@
 
 struct renderonly_scanout *
 renderonly_create_gpu_import_for_resource(struct pipe_resource *rsc,
-                                          struct renderonly *ro)
+                                          struct renderonly *ro,
+                                          struct winsys_handle *out_handle)
 {
    struct pipe_screen *screen = rsc->screen;
    struct renderonly_scanout *scanout;
diff --git a/src/gallium/auxiliary/renderonly/renderonly.h b/src/gallium/auxiliary/renderonly/renderonly.h
index d543073..6a89c29 100644
--- a/src/gallium/auxiliary/renderonly/renderonly.h
+++ b/src/gallium/auxiliary/renderonly/renderonly.h
@@ -34,8 +34,6 @@
 struct renderonly_scanout {
    uint32_t handle;
    uint32_t stride;
-
-   struct pipe_resource *prime;
 };
 
 struct renderonly {
@@ -59,7 +57,8 @@
     *   to be done in flush_resource(..) like a resolve to linear.
     */
    struct renderonly_scanout *(*create_for_resource)(struct pipe_resource *rsc,
-                                                     struct renderonly *ro);
+                                                     struct renderonly *ro,
+                                                     struct winsys_handle *out_handle);
    int kms_fd;
    int gpu_fd;
 };
@@ -68,14 +67,13 @@
 renderonly_dup(const struct renderonly *ro);
 
 static inline struct renderonly_scanout *
-renderonly_scanout_for_resource(struct pipe_resource *rsc, struct renderonly *ro)
+renderonly_scanout_for_resource(struct pipe_resource *rsc,
+                                struct renderonly *ro,
+                                struct winsys_handle *out_handle)
 {
-   return ro->create_for_resource(rsc, ro);
+   return ro->create_for_resource(rsc, ro, out_handle);
 }
 
-struct renderonly_scanout *
-renderonly_scanout_for_prime(struct pipe_resource *rsc, struct renderonly *ro);
-
 void
 renderonly_scanout_destroy(struct renderonly_scanout *scanout,
 			   struct renderonly *ro);
@@ -87,6 +85,7 @@
    if (!scanout)
       return FALSE;
 
+   assert(handle->type == DRM_API_HANDLE_TYPE_KMS);
    handle->handle = scanout->handle;
    handle->stride = scanout->stride;
 
@@ -98,13 +97,15 @@
  */
 struct renderonly_scanout *
 renderonly_create_kms_dumb_buffer_for_resource(struct pipe_resource *rsc,
-                                               struct renderonly *ro);
+                                               struct renderonly *ro,
+                                               struct winsys_handle *out_handle);
 
 /**
  * Import GPU resource into scanout hw.
  */
 struct renderonly_scanout *
 renderonly_create_gpu_import_for_resource(struct pipe_resource *rsc,
-                                          struct renderonly *ro);
+                                          struct renderonly *ro,
+                                          struct winsys_handle *out_handle);
 
 #endif /* RENDERONLY_H_ */
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h
index 3159df6..0027ede 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -10,7 +10,7 @@
 #include "i915/i915_public.h"
 
 struct pipe_screen *
-pipe_i915_create_screen(int fd)
+pipe_i915_create_screen(int fd, unsigned flags)
 {
    struct i915_winsys *iws;
    struct pipe_screen *screen;
@@ -26,7 +26,7 @@
 #else
 
 struct pipe_screen *
-pipe_i915_create_screen(int fd)
+pipe_i915_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "i915g: driver missing\n");
    return NULL;
@@ -38,7 +38,7 @@
 #include "nouveau/drm/nouveau_drm_public.h"
 
 struct pipe_screen *
-pipe_nouveau_create_screen(int fd)
+pipe_nouveau_create_screen(int fd, unsigned flags)
 {
    struct pipe_screen *screen;
 
@@ -49,7 +49,7 @@
 #else
 
 struct pipe_screen *
-pipe_nouveau_create_screen(int fd)
+pipe_nouveau_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "nouveau: driver missing\n");
    return NULL;
@@ -57,24 +57,47 @@
 
 #endif
 
+#ifdef GALLIUM_PL111
+#include "pl111/drm/pl111_drm_public.h"
+
+struct pipe_screen *
+pipe_pl111_create_screen(int fd, unsigned flags)
+{
+   struct pipe_screen *screen;
+
+   screen = pl111_drm_screen_create(fd); /* flags is not forwarded */
+   return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+#else /* !GALLIUM_PL111 */
+
+struct pipe_screen *
+pipe_pl111_create_screen(int fd, unsigned flags)
+{
+   fprintf(stderr, "pl111: driver missing\n"); /* stub when not built in */
+   return NULL;
+}
+
+#endif /* GALLIUM_PL111 */
+
+
 #ifdef GALLIUM_R300
 #include "radeon/radeon_winsys.h"
 #include "radeon/drm/radeon_drm_public.h"
 #include "r300/r300_public.h"
 
 struct pipe_screen *
-pipe_r300_create_screen(int fd)
+pipe_r300_create_screen(int fd, unsigned flags)
 {
    struct radeon_winsys *rw;
 
-   rw = radeon_drm_winsys_create(fd, r300_screen_create);
+   rw = radeon_drm_winsys_create(fd, flags, r300_screen_create);
    return rw ? debug_screen_wrap(rw->screen) : NULL;
 }
 
 #else
 
 struct pipe_screen *
-pipe_r300_create_screen(int fd)
+pipe_r300_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "r300: driver missing\n");
    return NULL;
@@ -88,18 +111,18 @@
 #include "r600/r600_public.h"
 
 struct pipe_screen *
-pipe_r600_create_screen(int fd)
+pipe_r600_create_screen(int fd, unsigned flags)
 {
    struct radeon_winsys *rw;
 
-   rw = radeon_drm_winsys_create(fd, r600_screen_create);
+   rw = radeon_drm_winsys_create(fd, flags, r600_screen_create);
    return rw ? debug_screen_wrap(rw->screen) : NULL;
 }
 
 #else
 
 struct pipe_screen *
-pipe_r600_create_screen(int fd)
+pipe_r600_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "r600: driver missing\n");
    return NULL;
@@ -114,15 +137,15 @@
 #include "radeonsi/si_public.h"
 
 struct pipe_screen *
-pipe_radeonsi_create_screen(int fd)
+pipe_radeonsi_create_screen(int fd, unsigned flags)
 {
    struct radeon_winsys *rw;
 
    /* First, try amdgpu. */
-   rw = amdgpu_winsys_create(fd, radeonsi_screen_create);
+   rw = amdgpu_winsys_create(fd, flags, radeonsi_screen_create);
 
    if (!rw)
-      rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+      rw = radeon_drm_winsys_create(fd, flags, radeonsi_screen_create);
 
    return rw ? debug_screen_wrap(rw->screen) : NULL;
 }
@@ -130,7 +153,7 @@
 #else
 
 struct pipe_screen *
-pipe_radeonsi_create_screen(int fd)
+pipe_radeonsi_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "radeonsi: driver missing\n");
    return NULL;
@@ -143,7 +166,7 @@
 #include "svga/svga_public.h"
 
 struct pipe_screen *
-pipe_vmwgfx_create_screen(int fd)
+pipe_vmwgfx_create_screen(int fd, unsigned flags)
 {
    struct svga_winsys_screen *sws;
    struct pipe_screen *screen;
@@ -159,7 +182,7 @@
 #else
 
 struct pipe_screen *
-pipe_vmwgfx_create_screen(int fd)
+pipe_vmwgfx_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "svga: driver missing\n");
    return NULL;
@@ -171,7 +194,7 @@
 #include "freedreno/drm/freedreno_drm_public.h"
 
 struct pipe_screen *
-pipe_freedreno_create_screen(int fd)
+pipe_freedreno_create_screen(int fd, unsigned flags)
 {
    struct pipe_screen *screen;
 
@@ -182,7 +205,7 @@
 #else
 
 struct pipe_screen *
-pipe_freedreno_create_screen(int fd)
+pipe_freedreno_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "freedreno: driver missing\n");
    return NULL;
@@ -195,7 +218,7 @@
 #include "virgl/virgl_public.h"
 
 struct pipe_screen *
-pipe_virgl_create_screen(int fd)
+pipe_virgl_create_screen(int fd, unsigned flags)
 {
    struct pipe_screen *screen;
 
@@ -206,7 +229,7 @@
 #else
 
 struct pipe_screen *
-pipe_virgl_create_screen(int fd)
+pipe_virgl_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "virgl: driver missing\n");
    return NULL;
@@ -218,7 +241,7 @@
 #include "vc4/drm/vc4_drm_public.h"
 
 struct pipe_screen *
-pipe_vc4_create_screen(int fd)
+pipe_vc4_create_screen(int fd, unsigned flags)
 {
    struct pipe_screen *screen;
 
@@ -229,7 +252,7 @@
 #else
 
 struct pipe_screen *
-pipe_vc4_create_screen(int fd)
+pipe_vc4_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "vc4: driver missing\n");
    return NULL;
@@ -241,7 +264,7 @@
 #include "etnaviv/drm/etnaviv_drm_public.h"
 
 struct pipe_screen *
-pipe_etna_create_screen(int fd)
+pipe_etna_create_screen(int fd, unsigned flags)
 {
    struct pipe_screen *screen;
 
@@ -252,7 +275,7 @@
 #else
 
 struct pipe_screen *
-pipe_etna_create_screen(int fd)
+pipe_etna_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "etnaviv: driver missing\n");
    return NULL;
@@ -264,7 +287,7 @@
 #include "imx/drm/imx_drm_public.h"
 
 struct pipe_screen *
-pipe_imx_drm_create_screen(int fd)
+pipe_imx_drm_create_screen(int fd, unsigned flags)
 {
    struct pipe_screen *screen;
 
@@ -275,7 +298,7 @@
 #else
 
 struct pipe_screen *
-pipe_imx_drm_create_screen(int fd)
+pipe_imx_drm_create_screen(int fd, unsigned flags)
 {
    fprintf(stderr, "imx-drm: driver missing\n");
    return NULL;
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper_public.h b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
index bc12b21..d4adc88 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper_public.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
@@ -5,39 +5,42 @@
 struct pipe_screen;
 
 struct pipe_screen *
-pipe_i915_create_screen(int fd);
+pipe_i915_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_ilo_create_screen(int fd);
+pipe_ilo_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_nouveau_create_screen(int fd);
+pipe_nouveau_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_r300_create_screen(int fd);
+pipe_r300_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_r600_create_screen(int fd);
+pipe_r600_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_radeonsi_create_screen(int fd);
+pipe_radeonsi_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_vmwgfx_create_screen(int fd);
+pipe_vmwgfx_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_freedreno_create_screen(int fd);
+pipe_freedreno_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_virgl_create_screen(int fd);
+pipe_virgl_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_vc4_create_screen(int fd);
+pipe_vc4_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_etna_create_screen(int fd);
+pipe_pl111_create_screen(int fd, unsigned flags);
 
 struct pipe_screen *
-pipe_imx_drm_create_screen(int fd);
+pipe_etna_create_screen(int fd, unsigned flags);
+
+struct pipe_screen *
+pipe_imx_drm_create_screen(int fd, unsigned flags);
 
 #endif /* _DRM_HELPER_PUBLIC_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 39c20b5..144a017 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -642,6 +642,7 @@
    instruction.Label = 0;
    instruction.Texture = 0;
    instruction.Memory = 0;
+   instruction.Precise = 0;
    instruction.Padding = 0;
 
    return instruction;
@@ -650,6 +651,7 @@
 static struct tgsi_instruction
 tgsi_build_instruction(unsigned opcode,
                        unsigned saturate,
+                       unsigned precise,
                        unsigned num_dst_regs,
                        unsigned num_src_regs,
                        struct tgsi_header *header)
@@ -664,6 +666,7 @@
    instruction = tgsi_default_instruction();
    instruction.Opcode = opcode;
    instruction.Saturate = saturate;
+   instruction.Precise = precise;
    instruction.NumDstRegs = num_dst_regs;
    instruction.NumSrcRegs = num_src_regs;
 
@@ -720,6 +723,7 @@
 
    instruction_texture.Texture = TGSI_TEXTURE_UNKNOWN;
    instruction_texture.NumOffsets = 0;
+   instruction_texture.ReturnType = TGSI_RETURN_TYPE_UNKNOWN;
    instruction_texture.Padding = 0;
 
    return instruction_texture;
@@ -729,6 +733,7 @@
 tgsi_build_instruction_texture(
    unsigned texture,
    unsigned num_offsets,
+   unsigned return_type,
    struct tgsi_token *prev_token,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header )
@@ -737,6 +742,7 @@
 
    instruction_texture.Texture = texture;
    instruction_texture.NumOffsets = num_offsets;
+   instruction_texture.ReturnType = return_type;
    instruction_texture.Padding = 0;
    instruction->Texture = 1;
 
@@ -1057,6 +1063,7 @@
 
    *instruction = tgsi_build_instruction(full_inst->Instruction.Opcode,
                                          full_inst->Instruction.Saturate,
+                                         full_inst->Instruction.Precise,
                                          full_inst->Instruction.NumDstRegs,
                                          full_inst->Instruction.NumSrcRegs,
                                          header);
@@ -1090,7 +1097,8 @@
 
       *instruction_texture = tgsi_build_instruction_texture(
          full_inst->Texture.Texture,
-	 full_inst->Texture.NumOffsets,
+         full_inst->Texture.NumOffsets,
+         full_inst->Texture.ReturnType,
          prev_token,
          instruction,
          header   );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index f6eba74..b58e645 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -584,6 +584,10 @@
       TXT( "_SAT" );
    }
 
+   if (inst->Instruction.Precise) {
+      TXT( "_PRECISE" );
+   }
+
    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
       const struct tgsi_full_dst_register *dst = &inst->Dst[i];
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index c41954c..97c75e9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -846,40 +846,40 @@
 micro_u64div(union tgsi_double_channel *dst,
              const union tgsi_double_channel *src)
 {
-   dst->u64[0] = src[0].u64[0] / src[1].u64[0];
-   dst->u64[1] = src[0].u64[1] / src[1].u64[1];
-   dst->u64[2] = src[0].u64[2] / src[1].u64[2];
-   dst->u64[3] = src[0].u64[3] / src[1].u64[3];
+   dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
+   dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
+   dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
+   dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
 }
 
 static void
 micro_i64div(union tgsi_double_channel *dst,
              const union tgsi_double_channel *src)
 {
-   dst->i64[0] = src[0].i64[0] / src[1].i64[0];
-   dst->i64[1] = src[0].i64[1] / src[1].i64[1];
-   dst->i64[2] = src[0].i64[2] / src[1].i64[2];
-   dst->i64[3] = src[0].i64[3] / src[1].i64[3];
+   dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
+   dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
+   dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
+   dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
 }
 
 static void
 micro_u64mod(union tgsi_double_channel *dst,
              const union tgsi_double_channel *src)
 {
-   dst->u64[0] = src[0].u64[0] % src[1].u64[0];
-   dst->u64[1] = src[0].u64[1] % src[1].u64[1];
-   dst->u64[2] = src[0].u64[2] % src[1].u64[2];
-   dst->u64[3] = src[0].u64[3] % src[1].u64[3];
+   dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
+   dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
+   dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
+   dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
 }
 
 static void
 micro_i64mod(union tgsi_double_channel *dst,
              const union tgsi_double_channel *src)
 {
-   dst->i64[0] = src[0].i64[0] % src[1].i64[0];
-   dst->i64[1] = src[0].i64[1] % src[1].i64[1];
-   dst->i64[2] = src[0].i64[2] % src[1].i64[2];
-   dst->i64[3] = src[0].i64[3] % src[1].i64[3];
+   dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
+   dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
+   dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
+   dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
 }
 
 static void
@@ -4653,10 +4653,10 @@
           const union tgsi_exec_channel *src0,
           const union tgsi_exec_channel *src1)
 {
-   dst->i[0] = src0->i[0] % src1->i[0];
-   dst->i[1] = src0->i[1] % src1->i[1];
-   dst->i[2] = src0->i[2] % src1->i[2];
-   dst->i[3] = src0->i[3] % src1->i[3];
+   dst->i[0] = src1->i[0] ? src0->i[0] % src1->i[0] : ~0;
+   dst->i[1] = src1->i[1] ? src0->i[1] % src1->i[1] : ~0;
+   dst->i[2] = src1->i[2] ? src0->i[2] % src1->i[2] : ~0;
+   dst->i[3] = src1->i[3] ? src0->i[3] % src1->i[3] : ~0;
 }
 
 static void
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 5708a50..9d7e65f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -527,6 +527,7 @@
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+   case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return PIPE_MAX_SHADER_BUFFERS;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_from_mesa.c b/src/gallium/auxiliary/tgsi/tgsi_from_mesa.c
new file mode 100644
index 0000000..b7a21f2
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_from_mesa.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "tgsi/tgsi_from_mesa.h"
+
+#include "pipe/p_compiler.h"
+
+/**
+ * Determine the semantic index that is used when the given varying is mapped
+ * to TGSI_SEMANTIC_GENERIC.
+ */
+unsigned
+tgsi_get_generic_gl_varying_index(gl_varying_slot attr,
+                                  bool needs_texcoord_semantic)
+{
+   if (attr >= VARYING_SLOT_VAR0) {
+      if (needs_texcoord_semantic)
+         return attr - VARYING_SLOT_VAR0;
+      else
+         return 9 + (attr - VARYING_SLOT_VAR0);
+   }
+   if (attr == VARYING_SLOT_PNTC) {
+      assert(!needs_texcoord_semantic);
+      return 8;
+   }
+   if (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) {
+      assert(!needs_texcoord_semantic);
+      return attr - VARYING_SLOT_TEX0;
+   }
+
+   assert(0);
+   return 0;
+}
+
+/**
+ * Determine the semantic name and index used for the given varying.
+ */
+void
+tgsi_get_gl_varying_semantic(gl_varying_slot attr,
+                             bool needs_texcoord_semantic,
+                             unsigned *semantic_name,
+                             unsigned *semantic_index)
+{
+   switch (attr) {
+   case VARYING_SLOT_POS:
+      *semantic_name = TGSI_SEMANTIC_POSITION;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_COL0:
+      *semantic_name = TGSI_SEMANTIC_COLOR;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_COL1:
+      *semantic_name = TGSI_SEMANTIC_COLOR;
+      *semantic_index = 1;
+      break;
+   case VARYING_SLOT_BFC0:
+      *semantic_name = TGSI_SEMANTIC_BCOLOR;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_BFC1:
+      *semantic_name = TGSI_SEMANTIC_BCOLOR;
+      *semantic_index = 1;
+      break;
+   case VARYING_SLOT_FOGC:
+      *semantic_name = TGSI_SEMANTIC_FOG;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_PSIZ:
+      *semantic_name = TGSI_SEMANTIC_PSIZE;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_CLIP_DIST0:
+      *semantic_name = TGSI_SEMANTIC_CLIPDIST;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_CLIP_DIST1:
+      *semantic_name = TGSI_SEMANTIC_CLIPDIST;
+      *semantic_index = 1;
+      break;
+   case VARYING_SLOT_CULL_DIST0:
+   case VARYING_SLOT_CULL_DIST1:
+      /* these should have been lowered by GLSL */
+      assert(0);
+      break;
+   case VARYING_SLOT_EDGE:
+      *semantic_name = TGSI_SEMANTIC_EDGEFLAG;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_CLIP_VERTEX:
+      *semantic_name = TGSI_SEMANTIC_CLIPVERTEX;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_LAYER:
+      *semantic_name = TGSI_SEMANTIC_LAYER;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_VIEWPORT:
+      *semantic_name = TGSI_SEMANTIC_VIEWPORT_INDEX;
+      *semantic_index = 0;
+      break;
+   case VARYING_SLOT_PNTC:
+      *semantic_name = TGSI_SEMANTIC_PCOORD;
+      *semantic_index = 0;
+      break;
+
+   case VARYING_SLOT_TEX0:
+   case VARYING_SLOT_TEX1:
+   case VARYING_SLOT_TEX2:
+   case VARYING_SLOT_TEX3:
+   case VARYING_SLOT_TEX4:
+   case VARYING_SLOT_TEX5:
+   case VARYING_SLOT_TEX6:
+   case VARYING_SLOT_TEX7:
+      if (needs_texcoord_semantic) {
+         *semantic_name = TGSI_SEMANTIC_TEXCOORD;
+         *semantic_index = attr - VARYING_SLOT_TEX0;
+         break;
+      }
+      /* fall through */
+   case VARYING_SLOT_VAR0:
+   default:
+      assert(attr >= VARYING_SLOT_VAR0 ||
+             (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
+      *semantic_name = TGSI_SEMANTIC_GENERIC;
+      *semantic_index =
+         tgsi_get_generic_gl_varying_index(attr, needs_texcoord_semantic);
+      break;
+   }
+}
+
+/**
+ * Determine the semantic name and index used for the given fragment shader
+ * result.
+ */
+void
+tgsi_get_gl_frag_result_semantic(gl_frag_result frag_result,
+                                 unsigned *semantic_name,
+                                 unsigned *semantic_index)
+{
+   if (frag_result >= FRAG_RESULT_DATA0) {
+      *semantic_name = TGSI_SEMANTIC_COLOR;
+      *semantic_index = frag_result - FRAG_RESULT_DATA0;
+      return;
+   }
+
+   *semantic_index = 0;
+
+   switch (frag_result) {
+   case FRAG_RESULT_DEPTH:
+      *semantic_name = TGSI_SEMANTIC_POSITION;
+      break;
+   case FRAG_RESULT_STENCIL:
+      *semantic_name = TGSI_SEMANTIC_STENCIL;
+      break;
+   case FRAG_RESULT_COLOR:
+      *semantic_name = TGSI_SEMANTIC_COLOR;
+      break;
+   case FRAG_RESULT_SAMPLE_MASK:
+      *semantic_name = TGSI_SEMANTIC_SAMPLEMASK;
+      break;
+   default:
+      assert(false);
+   }
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_from_mesa.h b/src/gallium/auxiliary/tgsi/tgsi_from_mesa.h
new file mode 100644
index 0000000..bfaa48d
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_from_mesa.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef TGSI_FROM_MESA_H
+#define TGSI_FROM_MESA_H
+
+#include <stdbool.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "compiler/shader_enums.h"
+
+void
+tgsi_get_gl_varying_semantic(gl_varying_slot attr,
+                             bool needs_texcoord_semantic,
+                             unsigned *semantic_name,
+                             unsigned *semantic_index);
+
+unsigned
+tgsi_get_generic_gl_varying_index(gl_varying_slot attr,
+                                  bool needs_texcoord_semantic);
+
+void
+tgsi_get_gl_frag_result_semantic(gl_frag_result frag_result,
+                                 unsigned *semantic_name,
+                                 unsigned *semantic_index);
+
+static inline enum pipe_shader_type
+pipe_shader_type_from_mesa(gl_shader_stage stage)
+{
+   switch (stage) {
+   case MESA_SHADER_VERTEX:
+      return PIPE_SHADER_VERTEX;
+   case MESA_SHADER_TESS_CTRL:
+      return PIPE_SHADER_TESS_CTRL;
+   case MESA_SHADER_TESS_EVAL:
+      return PIPE_SHADER_TESS_EVAL;
+   case MESA_SHADER_GEOMETRY:
+      return PIPE_SHADER_GEOMETRY;
+   case MESA_SHADER_FRAGMENT:
+      return PIPE_SHADER_FRAGMENT;
+   case MESA_SHADER_COMPUTE:
+      return PIPE_SHADER_COMPUTE;
+   default:
+      unreachable("bad shader stage");
+   }
+}
+
+#endif /* TGSI_FROM_MESA_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index ab73fab..d11d0e5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -28,26 +28,6 @@
 #define OP12_TEX(a) OP12(a)
 #endif
 
-#ifndef OP14_TEX
-#define OP14_TEX(a) OP14(a)
-#endif
-
-#ifndef OP12_SAMPLE
-#define OP12_SAMPLE(a) OP12(a)
-#endif
-
-#ifndef OP13_SAMPLE
-#define OP13_SAMPLE(a) OP13(a)
-#endif
-
-#ifndef OP14_SAMPLE
-#define OP14_SAMPLE(a) OP14(a)
-#endif
-
-#ifndef OP15_SAMPLE
-#define OP15_SAMPLE(a) OP15(a)
-#endif
-
 #ifndef OP00_LBL
 #define OP00_LBL(a) OP00(a)
 #endif
@@ -73,10 +53,12 @@
 OP12(SLT)
 OP12(SGE)
 OP13(MAD)
+OP12_TEX(TEX_LZ)
 OP13(LRP)
 OP11(SQRT)
 OP13(DP2A)
 OP11(FRC)
+OP12_TEX(TXF_LZ)
 OP11(FLR)
 OP11(ROUND)
 OP11(EX2)
@@ -178,15 +160,6 @@
 OP00(DEFAULT)
 OP00(ENDSWITCH)
 
-OP13_SAMPLE(SAMPLE)
-OP12_SAMPLE(SAMPLE_I)
-OP13_SAMPLE(SAMPLE_I_MS)
-OP14_SAMPLE(SAMPLE_B)
-OP14_SAMPLE(SAMPLE_C)
-OP14_SAMPLE(SAMPLE_C_LZ)
-OP15_SAMPLE(SAMPLE_D)
-OP14_SAMPLE(SAMPLE_L)
-OP13_SAMPLE(GATHER4)
 OP12(SVIEWINFO)
 OP13(SAMPLE_POS)
 OP12(SAMPLE_INFO)
@@ -206,21 +179,8 @@
 #undef OP12
 #undef OP13
 
-#ifdef OP14
-#undef OP14
-#endif
-
-#ifdef OP15
-#undef OP15
-#endif
-
 #undef OP00_LBL
 #undef OP01_LBL
 
 #undef OP12_TEX
 #undef OP14_TEX
-
-#undef OP12_SAMPLE
-#undef OP13_SAMPLE
-#undef OP14_SAMPLE
-#undef OP15_SAMPLE
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index bf614db..6e51972 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -115,6 +115,39 @@
 {
    int ind = src->Register.Index;
 
+   if (info->processor == PIPE_SHADER_COMPUTE &&
+       src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
+      unsigned swizzle[4], i, name;
+
+      name = info->system_value_semantic_name[src->Register.Index];
+      swizzle[0] = src->Register.SwizzleX;
+      swizzle[1] = src->Register.SwizzleY;
+      swizzle[2] = src->Register.SwizzleZ;
+      swizzle[3] = src->Register.SwizzleW;
+
+      switch (name) {
+      case TGSI_SEMANTIC_THREAD_ID:
+      case TGSI_SEMANTIC_BLOCK_ID:
+         for (i = 0; i < 4; i++) {
+            if (swizzle[i] <= TGSI_SWIZZLE_Z) {
+               if (name == TGSI_SEMANTIC_THREAD_ID)
+                  info->uses_thread_id[swizzle[i]] = true;
+               else
+                  info->uses_block_id[swizzle[i]] = true;
+            }
+         }
+         break;
+      case TGSI_SEMANTIC_BLOCK_SIZE:
+         /* The block size is translated to IMM with a fixed block size. */
+         if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
+            info->uses_block_size = true;
+         break;
+      case TGSI_SEMANTIC_GRID_SIZE:
+         info->uses_grid_size = true;
+         break;
+      }
+   }
+
    /* Mark which inputs are effectively used */
    if (src->Register.File == TGSI_FILE_INPUT) {
       if (src->Register.Indirect) {
@@ -319,6 +352,7 @@
    unsigned i;
    bool is_mem_inst = false;
    bool is_interp_instruction = false;
+   unsigned sampler_src;
 
    assert(fullinst->Instruction.Opcode < TGSI_OPCODE_LAST);
    info->opcode_count[fullinst->Instruction.Opcode]++;
@@ -334,6 +368,44 @@
    case TGSI_OPCODE_ENDLOOP:
       (*current_depth)--;
       break;
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TEX_LZ:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXD:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TXQ:
+   case TGSI_OPCODE_TXQS:
+   case TGSI_OPCODE_TXF:
+   case TGSI_OPCODE_TXF_LZ:
+   case TGSI_OPCODE_TEX2:
+   case TGSI_OPCODE_TXB2:
+   case TGSI_OPCODE_TXL2:
+   case TGSI_OPCODE_TG4:
+   case TGSI_OPCODE_LODQ:
+      sampler_src = fullinst->Instruction.NumSrcRegs - 1;
+      if (fullinst->Src[sampler_src].Register.File != TGSI_FILE_SAMPLER)
+         info->uses_bindless_samplers = true;
+      break;
+   case TGSI_OPCODE_RESQ:
+   case TGSI_OPCODE_LOAD:
+   case TGSI_OPCODE_ATOMUADD:
+   case TGSI_OPCODE_ATOMXCHG:
+   case TGSI_OPCODE_ATOMCAS:
+   case TGSI_OPCODE_ATOMAND:
+   case TGSI_OPCODE_ATOMOR:
+   case TGSI_OPCODE_ATOMXOR:
+   case TGSI_OPCODE_ATOMUMIN:
+   case TGSI_OPCODE_ATOMUMAX:
+   case TGSI_OPCODE_ATOMIMIN:
+   case TGSI_OPCODE_ATOMIMAX:
+      if (tgsi_is_bindless_image_file(fullinst->Src[0].Register.File))
+         info->uses_bindless_images = true;
+      break;
+   case TGSI_OPCODE_STORE:
+      if (tgsi_is_bindless_image_file(fullinst->Dst[0].Register.File))
+         info->uses_bindless_images = true;
+      break;
    default:
       break;
    }
@@ -524,13 +596,16 @@
          /* Vertex shaders can have inputs with holes between them. */
          info->num_inputs = MAX2(info->num_inputs, reg + 1);
 
-         if (semName == TGSI_SEMANTIC_PRIMID)
-            info->uses_primid = TRUE;
-         else if (procType == PIPE_SHADER_FRAGMENT) {
-            if (semName == TGSI_SEMANTIC_POSITION)
-               info->reads_position = TRUE;
-            else if (semName == TGSI_SEMANTIC_FACE)
-               info->uses_frontface = TRUE;
+         switch (semName) {
+         case TGSI_SEMANTIC_PRIMID:
+            info->uses_primid = true;
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            info->reads_position = true;
+            break;
+         case TGSI_SEMANTIC_FACE:
+            info->uses_frontface = true;
+            break;
          }
          break;
 
@@ -858,79 +933,3 @@
 
    return;
 }
-
-
-/**
- * Check if the given shader is a "passthrough" shader consisting of only
- * MOV instructions of the form:  MOV OUT[n], IN[n]
- *  
- */
-boolean
-tgsi_is_passthrough_shader(const struct tgsi_token *tokens)
-{
-   struct tgsi_parse_context parse;
-
-   /**
-    ** Setup to begin parsing input shader
-    **/
-   if (tgsi_parse_init(&parse, tokens) != TGSI_PARSE_OK) {
-      debug_printf("tgsi_parse_init() failed in tgsi_is_passthrough_shader()!\n");
-      return FALSE;
-   }
-
-   /**
-    ** Loop over incoming program tokens/instructions
-    */
-   while (!tgsi_parse_end_of_tokens(&parse)) {
-
-      tgsi_parse_token(&parse);
-
-      switch (parse.FullToken.Token.Type) {
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         {
-            struct tgsi_full_instruction *fullinst =
-               &parse.FullToken.FullInstruction;
-            const struct tgsi_full_src_register *src =
-               &fullinst->Src[0];
-            const struct tgsi_full_dst_register *dst =
-               &fullinst->Dst[0];
-
-            /* Do a whole bunch of checks for a simple move */
-            if (fullinst->Instruction.Opcode != TGSI_OPCODE_MOV ||
-                (src->Register.File != TGSI_FILE_INPUT &&
-                 src->Register.File != TGSI_FILE_SYSTEM_VALUE) ||
-                dst->Register.File != TGSI_FILE_OUTPUT ||
-                src->Register.Index != dst->Register.Index ||
-
-                src->Register.Negate ||
-                src->Register.Absolute ||
-
-                src->Register.SwizzleX != TGSI_SWIZZLE_X ||
-                src->Register.SwizzleY != TGSI_SWIZZLE_Y ||
-                src->Register.SwizzleZ != TGSI_SWIZZLE_Z ||
-                src->Register.SwizzleW != TGSI_SWIZZLE_W ||
-
-                dst->Register.WriteMask != TGSI_WRITEMASK_XYZW)
-            {
-               tgsi_parse_free(&parse);
-               return FALSE;
-            }
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         /* fall-through */
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* fall-through */
-      case TGSI_TOKEN_TYPE_PROPERTY:
-         /* fall-through */
-      default:
-         ; /* no-op */
-      }
-   }
-
-   tgsi_parse_free(&parse);
-
-   /* if we get here, it's a pass-through shader */
-   return TRUE;
-}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 3854827..857434f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -121,6 +121,10 @@
    boolean uses_primid;
    boolean uses_frontface;
    boolean uses_invocationid;
+   boolean uses_thread_id[3];
+   boolean uses_block_id[3];
+   boolean uses_block_size;
+   boolean uses_grid_size;
    boolean writes_position;
    boolean writes_psize;
    boolean writes_clipvertex;
@@ -131,6 +135,8 @@
    boolean is_msaa_sampler[PIPE_MAX_SAMPLERS];
    boolean uses_doubles; /**< uses any of the double instructions */
    boolean uses_derivatives;
+   boolean uses_bindless_samplers;
+   boolean uses_bindless_images;
    unsigned clipdist_writemask;
    unsigned culldist_writemask;
    unsigned num_written_culldistance;
@@ -192,8 +198,13 @@
                  unsigned max_array_id,
                  struct tgsi_array_info *arrays);
 
-extern boolean
-tgsi_is_passthrough_shader(const struct tgsi_token *tokens);
+static inline bool
+tgsi_is_bindless_image_file(unsigned file)
+{
+   return file != TGSI_FILE_IMAGE &&
+          file != TGSI_FILE_MEMORY &&
+          file != TGSI_FILE_BUFFER;
+}
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 2640350..7ce12d3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -151,6 +151,7 @@
    "NUM_CLIPDIST_ENABLED",
    "NUM_CULLDIST_ENABLED",
    "FS_EARLY_DEPTH_STENCIL",
+   "FS_POST_DEPTH_COVERAGE",
    "NEXT_SHADER",
    "CS_FIXED_BLOCK_WIDTH",
    "CS_FIXED_BLOCK_HEIGHT",
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 93a0556..4cb67c5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -999,6 +999,7 @@
 static boolean
 match_inst(const char **pcur,
            unsigned *saturate,
+           unsigned *precise,
            const struct tgsi_opcode_info *info)
 {
    const char *cur = *pcur;
@@ -1007,16 +1008,24 @@
    if (str_match_nocase_whole(&cur, info->mnemonic)) {
       *pcur = cur;
       *saturate = 0;
+      *precise = 0;
       return TRUE;
    }
 
    if (str_match_no_case(&cur, info->mnemonic)) {
       /* the instruction has a suffix, figure it out */
-      if (str_match_nocase_whole(&cur, "_SAT")) {
+      if (str_match_no_case(&cur, "_SAT")) {
          *pcur = cur;
          *saturate = 1;
-         return TRUE;
       }
+
+      if (str_match_no_case(&cur, "_PRECISE")) {
+         *pcur = cur;
+         *precise = 1;
+      }
+
+      if (!is_digit_alpha_underscore(cur))
+         return TRUE;
    }
 
    return FALSE;
@@ -1029,6 +1038,7 @@
 {
    uint i;
    uint saturate = 0;
+   uint precise = 0;
    const struct tgsi_opcode_info *info;
    struct tgsi_full_instruction inst;
    const char *cur;
@@ -1043,7 +1053,7 @@
       cur = ctx->cur;
 
       info = tgsi_get_opcode_info( i );
-      if (match_inst(&cur, &saturate, info)) {
+      if (match_inst(&cur, &saturate, &precise, info)) {
          if (info->num_dst + info->num_src + info->is_tex == 0) {
             ctx->cur = cur;
             break;
@@ -1064,6 +1074,7 @@
 
    inst.Instruction.Opcode = i;
    inst.Instruction.Saturate = saturate;
+   inst.Instruction.Precise = precise;
    inst.Instruction.NumDstRegs = info->num_dst;
    inst.Instruction.NumSrcRegs = info->num_src;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 9eb00d0..ca31bc4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -1140,8 +1140,6 @@
    unsigned n = 0;
 
    assert(dst.File != TGSI_FILE_NULL);
-   assert(dst.File != TGSI_FILE_CONSTANT);
-   assert(dst.File != TGSI_FILE_INPUT);
    assert(dst.File != TGSI_FILE_SAMPLER);
    assert(dst.File != TGSI_FILE_SAMPLER_VIEW);
    assert(dst.File != TGSI_FILE_IMMEDIATE);
@@ -1213,6 +1211,7 @@
 ureg_emit_insn(struct ureg_program *ureg,
                unsigned opcode,
                boolean saturate,
+               unsigned precise,
                unsigned num_dst,
                unsigned num_src)
 {
@@ -1226,6 +1225,7 @@
    out[0].insn = tgsi_default_instruction();
    out[0].insn.Opcode = opcode;
    out[0].insn.Saturate = saturate;
+   out[0].insn.Precise = precise;
    out[0].insn.NumDstRegs = num_dst;
    out[0].insn.NumSrcRegs = num_src;
 
@@ -1289,7 +1289,7 @@
 void
 ureg_emit_texture(struct ureg_program *ureg,
                   unsigned extended_token,
-                  unsigned target, unsigned num_offsets)
+                  unsigned target, unsigned return_type, unsigned num_offsets)
 {
    union tgsi_any_token *out, *insn;
 
@@ -1301,6 +1301,7 @@
    out[0].value = 0;
    out[0].insn_texture.Texture = target;
    out[0].insn_texture.NumOffsets = num_offsets;
+   out[0].insn_texture.ReturnType = return_type;
 }
 
 void
@@ -1353,7 +1354,8 @@
           const struct ureg_dst *dst,
           unsigned nr_dst,
           const struct ureg_src *src,
-          unsigned nr_src )
+          unsigned nr_src,
+          unsigned precise )
 {
    struct ureg_emit_insn_result insn;
    unsigned i;
@@ -1368,6 +1370,7 @@
    insn = ureg_emit_insn(ureg,
                          opcode,
                          saturate,
+                         precise,
                          nr_dst,
                          nr_src);
 
@@ -1386,6 +1389,7 @@
               const struct ureg_dst *dst,
               unsigned nr_dst,
               unsigned target,
+              unsigned return_type,
               const struct tgsi_texture_offset *texoffsets,
               unsigned nr_offset,
               const struct ureg_src *src,
@@ -1404,10 +1408,12 @@
    insn = ureg_emit_insn(ureg,
                          opcode,
                          saturate,
+                         0,
                          nr_dst,
                          nr_src);
 
-   ureg_emit_texture( ureg, insn.extended_token, target, nr_offset );
+   ureg_emit_texture( ureg, insn.extended_token, target, return_type,
+                      nr_offset );
 
    for (i = 0; i < nr_offset; i++)
       ureg_emit_texture_offset( ureg, &texoffsets[i]);
@@ -1439,6 +1445,7 @@
    insn = ureg_emit_insn(ureg,
                          opcode,
                          FALSE,
+                         0,
                          nr_dst,
                          nr_src);
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index d301915..ed8c177 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -546,7 +546,8 @@
           const struct ureg_dst *dst,
           unsigned nr_dst,
           const struct ureg_src *src,
-          unsigned nr_src );
+          unsigned nr_src,
+          unsigned precise );
 
 
 void
@@ -555,6 +556,7 @@
               const struct ureg_dst *dst,
               unsigned nr_dst,
               unsigned target,
+              unsigned return_type,
               const struct tgsi_texture_offset *texoffsets,
               unsigned nr_offset,
               const struct ureg_src *src,
@@ -585,6 +587,7 @@
 ureg_emit_insn(struct ureg_program *ureg,
                unsigned opcode,
                boolean saturate,
+               unsigned precise,
                unsigned num_dst,
                unsigned num_src);
 
@@ -596,7 +599,7 @@
 void
 ureg_emit_texture(struct ureg_program *ureg,
                   unsigned insn_token,
-                  unsigned target, unsigned num_offsets);
+                  unsigned target, unsigned return_type, unsigned num_offsets);
 
 void
 ureg_emit_texture_offset(struct ureg_program *ureg,
@@ -631,6 +634,7 @@
                          opcode,                                \
                          FALSE,                                 \
                          0,                                     \
+                         0,                                     \
                          0);                                    \
    ureg_fixup_insn_size( ureg, insn.insn_token );               \
 }
@@ -645,6 +649,7 @@
                          opcode,                                \
                          FALSE,                                 \
                          0,                                     \
+                         0,                                     \
                          1);                                    \
    ureg_emit_src( ureg, src );                                  \
    ureg_fixup_insn_size( ureg, insn.insn_token );               \
@@ -660,6 +665,7 @@
                          opcode,                                \
                          FALSE,                                 \
                          0,                                     \
+                         0,                                     \
                          0);                                    \
    ureg_emit_label( ureg, insn.extended_token, label_token );   \
    ureg_fixup_insn_size( ureg, insn.insn_token );               \
@@ -676,6 +682,7 @@
                          opcode,                                \
                          FALSE,                                 \
                          0,                                     \
+                         0,                                     \
                          1);                                    \
    ureg_emit_label( ureg, insn.extended_token, label_token );   \
    ureg_emit_src( ureg, src );                                  \
@@ -693,6 +700,7 @@
    insn = ureg_emit_insn(ureg,                                          \
                          opcode,                                        \
                          dst.Saturate,                                  \
+                         0,                                             \
                          1,                                             \
                          0);                                            \
    ureg_emit_dst( ureg, dst );                                          \
@@ -712,6 +720,7 @@
    insn = ureg_emit_insn(ureg,                                          \
                          opcode,                                        \
                          dst.Saturate,                                  \
+                         0,                                             \
                          1,                                             \
                          1);                                            \
    ureg_emit_dst( ureg, dst );                                          \
@@ -732,6 +741,7 @@
    insn = ureg_emit_insn(ureg,                                          \
                          opcode,                                        \
                          dst.Saturate,                                  \
+                         0,                                             \
                          1,                                             \
                          2);                                            \
    ureg_emit_dst( ureg, dst );                                          \
@@ -748,38 +758,18 @@
                               struct ureg_src src1 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
+   unsigned return_type = TGSI_RETURN_TYPE_UNKNOWN;                     \
    struct ureg_emit_insn_result insn;                                   \
    if (ureg_dst_is_empty(dst))                                          \
       return;                                                           \
    insn = ureg_emit_insn(ureg,                                          \
                          opcode,                                        \
                          dst.Saturate,                                  \
+                         0,                                             \
                          1,                                             \
                          2);                                            \
-   ureg_emit_texture( ureg, insn.extended_token, target, 0 );		\
-   ureg_emit_dst( ureg, dst );                                          \
-   ureg_emit_src( ureg, src0 );                                         \
-   ureg_emit_src( ureg, src1 );                                         \
-   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
-}
-
-#define OP12_SAMPLE( op )                                               \
-static inline void ureg_##op( struct ureg_program *ureg,                \
-                              struct ureg_dst dst,                      \
-                              struct ureg_src src0,                     \
-                              struct ureg_src src1 )                    \
-{                                                                       \
-   unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned target = TGSI_TEXTURE_UNKNOWN;                              \
-   struct ureg_emit_insn_result insn;                                   \
-   if (ureg_dst_is_empty(dst))                                          \
-      return;                                                           \
-   insn = ureg_emit_insn(ureg,                                          \
-                         opcode,                                        \
-                         dst.Saturate,                                  \
-                         1,                                             \
-                         2);                                            \
-   ureg_emit_texture( ureg, insn.extended_token, target, 0 );           \
+   ureg_emit_texture( ureg, insn.extended_token, target,                \
+                      return_type, 0 );                                 \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
@@ -800,6 +790,7 @@
    insn = ureg_emit_insn(ureg,                                          \
                          opcode,                                        \
                          dst.Saturate,                                  \
+                         0,                                             \
                          1,                                             \
                          3);                                            \
    ureg_emit_dst( ureg, dst );                                          \
@@ -809,31 +800,6 @@
    ureg_fixup_insn_size( ureg, insn.insn_token );                       \
 }
 
-#define OP13_SAMPLE( op )                                               \
-static inline void ureg_##op( struct ureg_program *ureg,                \
-                              struct ureg_dst dst,                      \
-                              struct ureg_src src0,                     \
-                              struct ureg_src src1,                     \
-                              struct ureg_src src2 )                    \
-{                                                                       \
-   unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned target = TGSI_TEXTURE_UNKNOWN;                              \
-   struct ureg_emit_insn_result insn;                                   \
-   if (ureg_dst_is_empty(dst))                                          \
-      return;                                                           \
-   insn = ureg_emit_insn(ureg,                                          \
-                         opcode,                                        \
-                         dst.Saturate,                                  \
-                         1,                                             \
-                         3);                                            \
-   ureg_emit_texture( ureg, insn.extended_token, target, 0 );           \
-   ureg_emit_dst( ureg, dst );                                          \
-   ureg_emit_src( ureg, src0 );                                         \
-   ureg_emit_src( ureg, src1 );                                         \
-   ureg_emit_src( ureg, src2 );                                         \
-   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
-}
-
 #define OP14_TEX( op )                                                  \
 static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
@@ -844,15 +810,18 @@
                               struct ureg_src src3 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
+   unsigned return_type = TGSI_RETURN_TYPE_UNKNOWN;                     \
    struct ureg_emit_insn_result insn;                                   \
    if (ureg_dst_is_empty(dst))                                          \
       return;                                                           \
    insn = ureg_emit_insn(ureg,                                          \
                          opcode,                                        \
                          dst.Saturate,                                  \
+                         0,                                             \
                          1,                                             \
                          4);                                            \
-   ureg_emit_texture( ureg, insn.extended_token, target, 0 );		\
+   ureg_emit_texture( ureg, insn.extended_token, target,                \
+                      return_type, 0 );                                 \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
@@ -861,116 +830,6 @@
    ureg_fixup_insn_size( ureg, insn.insn_token );                       \
 }
 
-#define OP14_SAMPLE( op )                                               \
-static inline void ureg_##op( struct ureg_program *ureg,                \
-                              struct ureg_dst dst,                      \
-                              struct ureg_src src0,                     \
-                              struct ureg_src src1,                     \
-                              struct ureg_src src2,                     \
-                              struct ureg_src src3 )                    \
-{                                                                       \
-   unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned target = TGSI_TEXTURE_UNKNOWN;                              \
-   struct ureg_emit_insn_result insn;                                   \
-   if (ureg_dst_is_empty(dst))                                          \
-      return;                                                           \
-   insn = ureg_emit_insn(ureg,                                          \
-                         opcode,                                        \
-                         dst.Saturate,                                  \
-                         1,                                             \
-                         4);                                            \
-   ureg_emit_texture( ureg, insn.extended_token, target, 0 );           \
-   ureg_emit_dst( ureg, dst );                                          \
-   ureg_emit_src( ureg, src0 );                                         \
-   ureg_emit_src( ureg, src1 );                                         \
-   ureg_emit_src( ureg, src2 );                                         \
-   ureg_emit_src( ureg, src3 );                                         \
-   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
-}
-
-
-#define OP14( op )                                                      \
-static inline void ureg_##op( struct ureg_program *ureg,                \
-                              struct ureg_dst dst,                      \
-                              struct ureg_src src0,                     \
-                              struct ureg_src src1,                     \
-                              struct ureg_src src2,                     \
-                              struct ureg_src src3 )                    \
-{                                                                       \
-   unsigned opcode = TGSI_OPCODE_##op;                                  \
-   struct ureg_emit_insn_result insn;                                   \
-   if (ureg_dst_is_empty(dst))                                          \
-      return;                                                           \
-   insn = ureg_emit_insn(ureg,                                          \
-                         opcode,                                        \
-                         dst.Saturate,                                  \
-                         1,                                             \
-                         4);                                            \
-   ureg_emit_dst( ureg, dst );                                          \
-   ureg_emit_src( ureg, src0 );                                         \
-   ureg_emit_src( ureg, src1 );                                         \
-   ureg_emit_src( ureg, src2 );                                         \
-   ureg_emit_src( ureg, src3 );                                         \
-   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
-}
-
-
-#define OP15( op )                                                      \
-static inline void ureg_##op( struct ureg_program *ureg,                \
-                              struct ureg_dst dst,                      \
-                              struct ureg_src src0,                     \
-                              struct ureg_src src1,                     \
-                              struct ureg_src src2,                     \
-                              struct ureg_src src3,                     \
-                              struct ureg_src src4 )                    \
-{                                                                       \
-   unsigned opcode = TGSI_OPCODE_##op;                                  \
-   struct ureg_emit_insn_result insn;                                   \
-   if (ureg_dst_is_empty(dst))                                          \
-      return;                                                           \
-   insn = ureg_emit_insn(ureg,                                          \
-                         opcode,                                        \
-                         dst.Saturate,                                  \
-                         1,                                             \
-                         5);                                            \
-   ureg_emit_dst( ureg, dst );                                          \
-   ureg_emit_src( ureg, src0 );                                         \
-   ureg_emit_src( ureg, src1 );                                         \
-   ureg_emit_src( ureg, src2 );                                         \
-   ureg_emit_src( ureg, src3 );                                         \
-   ureg_emit_src( ureg, src4 );                                         \
-   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
-}
-
-#define OP15_SAMPLE( op )                                               \
-static inline void ureg_##op( struct ureg_program *ureg,                \
-                              struct ureg_dst dst,                      \
-                              struct ureg_src src0,                     \
-                              struct ureg_src src1,                     \
-                              struct ureg_src src2,                     \
-                              struct ureg_src src3,                     \
-                              struct ureg_src src4 )                    \
-{                                                                       \
-   unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned target = TGSI_TEXTURE_UNKNOWN;                              \
-   struct ureg_emit_insn_result insn;                                   \
-   if (ureg_dst_is_empty(dst))                                          \
-      return;                                                           \
-   insn = ureg_emit_insn(ureg,                                          \
-                         opcode,                                        \
-                         dst.Saturate,                                  \
-                         1,                                             \
-                         5);                                            \
-   ureg_emit_texture( ureg, insn.extended_token, target, 0 );           \
-   ureg_emit_dst( ureg, dst );                                          \
-   ureg_emit_src( ureg, src0 );                                         \
-   ureg_emit_src( ureg, src1 );                                         \
-   ureg_emit_src( ureg, src2 );                                         \
-   ureg_emit_src( ureg, src3 );                                         \
-   ureg_emit_src( ureg, src4 );                                         \
-   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
-}
-
 /* Use a template include to generate a correctly-typed ureg_OP()
  * function for each TGSI opcode:
  */
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 3b460e1..9f8be2f 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
  /*
@@ -60,15 +60,17 @@
 
       emit_func emit;
       unsigned output_offset;
-      
+
       const uint8_t *input_ptr;
       unsigned input_stride;
       unsigned max_index;
 
-      /* this value is set to -1 if this is a normal element with output_format != input_format:
-       * in this case, u_format is used to do a full conversion
+      /* this value is set to -1 if this is a normal element with
+       * output_format != input_format: in this case, u_format is used
+       * to do a full conversion
        *
-       * this value is set to the format size in bytes if output_format == input_format or for 32-bit instance ids:
+       * this value is set to the format size in bytes if
+       * output_format == input_format or for 32-bit instance ids:
        * in this case, memcpy is used to copy this amount of bytes
        */
       int copy_size;
@@ -79,11 +81,13 @@
 };
 
 
-static struct translate_generic *translate_generic( struct translate *translate )
+static struct translate_generic *
+translate_generic(struct translate *translate)
 {
    return (struct translate_generic *)translate;
 }
 
+
 /**
  * Fetch a dword[4] vertex attribute from memory, doing format/type
  * conversion as needed.
@@ -91,7 +95,7 @@
  * This is probably needed/dupliocated elsewhere, eg format
  * conversion, texture sampling etc.
  */
-#define ATTRIB( NAME, SZ, SRCTYPE, DSTTYPE, TO )        \
+#define ATTRIB(NAME, SZ, SRCTYPE, DSTTYPE, TO)  	\
 static void						\
 emit_##NAME(const void *attrib, void *ptr)		\
 {  \
@@ -130,113 +134,113 @@
 #define TO_INT(x)        (x)
 
 
-ATTRIB( R64G64B64A64_FLOAT,   4, float, double, TO_64_FLOAT )
-ATTRIB( R64G64B64_FLOAT,      3, float, double, TO_64_FLOAT )
-ATTRIB( R64G64_FLOAT,         2, float, double, TO_64_FLOAT )
-ATTRIB( R64_FLOAT,            1, float, double, TO_64_FLOAT )
+ATTRIB(R64G64B64A64_FLOAT,   4, float, double, TO_64_FLOAT)
+ATTRIB(R64G64B64_FLOAT,      3, float, double, TO_64_FLOAT)
+ATTRIB(R64G64_FLOAT,         2, float, double, TO_64_FLOAT)
+ATTRIB(R64_FLOAT,            1, float, double, TO_64_FLOAT)
 
-ATTRIB( R32G32B32A32_FLOAT,   4, float, float, TO_32_FLOAT )
-ATTRIB( R32G32B32_FLOAT,      3, float, float, TO_32_FLOAT )
-ATTRIB( R32G32_FLOAT,         2, float, float, TO_32_FLOAT )
-ATTRIB( R32_FLOAT,            1, float, float, TO_32_FLOAT )
+ATTRIB(R32G32B32A32_FLOAT,   4, float, float, TO_32_FLOAT)
+ATTRIB(R32G32B32_FLOAT,      3, float, float, TO_32_FLOAT)
+ATTRIB(R32G32_FLOAT,         2, float, float, TO_32_FLOAT)
+ATTRIB(R32_FLOAT,            1, float, float, TO_32_FLOAT)
 
-ATTRIB( R16G16B16A16_FLOAT,   4, float, ushort, TO_16_FLOAT )
-ATTRIB( R16G16B16_FLOAT,      3, float, ushort, TO_16_FLOAT )
-ATTRIB( R16G16_FLOAT,         2, float, ushort, TO_16_FLOAT )
-ATTRIB( R16_FLOAT,            1, float, ushort, TO_16_FLOAT )
+ATTRIB(R16G16B16A16_FLOAT,   4, float, ushort, TO_16_FLOAT)
+ATTRIB(R16G16B16_FLOAT,      3, float, ushort, TO_16_FLOAT)
+ATTRIB(R16G16_FLOAT,         2, float, ushort, TO_16_FLOAT)
+ATTRIB(R16_FLOAT,            1, float, ushort, TO_16_FLOAT)
 
-ATTRIB( R32G32B32A32_USCALED, 4, float, unsigned, TO_32_USCALED )
-ATTRIB( R32G32B32_USCALED,    3, float, unsigned, TO_32_USCALED )
-ATTRIB( R32G32_USCALED,       2, float, unsigned, TO_32_USCALED )
-ATTRIB( R32_USCALED,          1, float, unsigned, TO_32_USCALED )
+ATTRIB(R32G32B32A32_USCALED, 4, float, unsigned, TO_32_USCALED)
+ATTRIB(R32G32B32_USCALED,    3, float, unsigned, TO_32_USCALED)
+ATTRIB(R32G32_USCALED,       2, float, unsigned, TO_32_USCALED)
+ATTRIB(R32_USCALED,          1, float, unsigned, TO_32_USCALED)
 
-ATTRIB( R32G32B32A32_SSCALED, 4, float, int, TO_32_SSCALED )
-ATTRIB( R32G32B32_SSCALED,    3, float, int, TO_32_SSCALED )
-ATTRIB( R32G32_SSCALED,       2, float, int, TO_32_SSCALED )
-ATTRIB( R32_SSCALED,          1, float, int, TO_32_SSCALED )
+ATTRIB(R32G32B32A32_SSCALED, 4, float, int, TO_32_SSCALED)
+ATTRIB(R32G32B32_SSCALED,    3, float, int, TO_32_SSCALED)
+ATTRIB(R32G32_SSCALED,       2, float, int, TO_32_SSCALED)
+ATTRIB(R32_SSCALED,          1, float, int, TO_32_SSCALED)
 
-ATTRIB( R32G32B32A32_UNORM, 4, float, unsigned, TO_32_UNORM )
-ATTRIB( R32G32B32_UNORM,    3, float, unsigned, TO_32_UNORM )
-ATTRIB( R32G32_UNORM,       2, float, unsigned, TO_32_UNORM )
-ATTRIB( R32_UNORM,          1, float, unsigned, TO_32_UNORM )
+ATTRIB(R32G32B32A32_UNORM, 4, float, unsigned, TO_32_UNORM)
+ATTRIB(R32G32B32_UNORM,    3, float, unsigned, TO_32_UNORM)
+ATTRIB(R32G32_UNORM,       2, float, unsigned, TO_32_UNORM)
+ATTRIB(R32_UNORM,          1, float, unsigned, TO_32_UNORM)
 
-ATTRIB( R32G32B32A32_SNORM, 4, float, int, TO_32_SNORM )
-ATTRIB( R32G32B32_SNORM,    3, float, int, TO_32_SNORM )
-ATTRIB( R32G32_SNORM,       2, float, int, TO_32_SNORM )
-ATTRIB( R32_SNORM,          1, float, int, TO_32_SNORM )
+ATTRIB(R32G32B32A32_SNORM, 4, float, int, TO_32_SNORM)
+ATTRIB(R32G32B32_SNORM,    3, float, int, TO_32_SNORM)
+ATTRIB(R32G32_SNORM,       2, float, int, TO_32_SNORM)
+ATTRIB(R32_SNORM,          1, float, int, TO_32_SNORM)
 
-ATTRIB( R16G16B16A16_USCALED, 4, float, ushort, TO_16_USCALED )
-ATTRIB( R16G16B16_USCALED,    3, float, ushort, TO_16_USCALED )
-ATTRIB( R16G16_USCALED,       2, float, ushort, TO_16_USCALED )
-ATTRIB( R16_USCALED,          1, float, ushort, TO_16_USCALED )
+ATTRIB(R16G16B16A16_USCALED, 4, float, ushort, TO_16_USCALED)
+ATTRIB(R16G16B16_USCALED,    3, float, ushort, TO_16_USCALED)
+ATTRIB(R16G16_USCALED,       2, float, ushort, TO_16_USCALED)
+ATTRIB(R16_USCALED,          1, float, ushort, TO_16_USCALED)
 
-ATTRIB( R16G16B16A16_SSCALED, 4, float, short, TO_16_SSCALED )
-ATTRIB( R16G16B16_SSCALED,    3, float, short, TO_16_SSCALED )
-ATTRIB( R16G16_SSCALED,       2, float, short, TO_16_SSCALED )
-ATTRIB( R16_SSCALED,          1, float, short, TO_16_SSCALED )
+ATTRIB(R16G16B16A16_SSCALED, 4, float, short, TO_16_SSCALED)
+ATTRIB(R16G16B16_SSCALED,    3, float, short, TO_16_SSCALED)
+ATTRIB(R16G16_SSCALED,       2, float, short, TO_16_SSCALED)
+ATTRIB(R16_SSCALED,          1, float, short, TO_16_SSCALED)
 
-ATTRIB( R16G16B16A16_UNORM, 4, float, ushort, TO_16_UNORM )
-ATTRIB( R16G16B16_UNORM,    3, float, ushort, TO_16_UNORM )
-ATTRIB( R16G16_UNORM,       2, float, ushort, TO_16_UNORM )
-ATTRIB( R16_UNORM,          1, float, ushort, TO_16_UNORM )
+ATTRIB(R16G16B16A16_UNORM, 4, float, ushort, TO_16_UNORM)
+ATTRIB(R16G16B16_UNORM,    3, float, ushort, TO_16_UNORM)
+ATTRIB(R16G16_UNORM,       2, float, ushort, TO_16_UNORM)
+ATTRIB(R16_UNORM,          1, float, ushort, TO_16_UNORM)
 
-ATTRIB( R16G16B16A16_SNORM, 4, float, short, TO_16_SNORM )
-ATTRIB( R16G16B16_SNORM,    3, float, short, TO_16_SNORM )
-ATTRIB( R16G16_SNORM,       2, float, short, TO_16_SNORM )
-ATTRIB( R16_SNORM,          1, float, short, TO_16_SNORM )
+ATTRIB(R16G16B16A16_SNORM, 4, float, short, TO_16_SNORM)
+ATTRIB(R16G16B16_SNORM,    3, float, short, TO_16_SNORM)
+ATTRIB(R16G16_SNORM,       2, float, short, TO_16_SNORM)
+ATTRIB(R16_SNORM,          1, float, short, TO_16_SNORM)
 
-ATTRIB( R8G8B8A8_USCALED,   4, float, ubyte, TO_8_USCALED )
-ATTRIB( R8G8B8_USCALED,     3, float, ubyte, TO_8_USCALED )
-ATTRIB( R8G8_USCALED,       2, float, ubyte, TO_8_USCALED )
-ATTRIB( R8_USCALED,         1, float, ubyte, TO_8_USCALED )
+ATTRIB(R8G8B8A8_USCALED,   4, float, ubyte, TO_8_USCALED)
+ATTRIB(R8G8B8_USCALED,     3, float, ubyte, TO_8_USCALED)
+ATTRIB(R8G8_USCALED,       2, float, ubyte, TO_8_USCALED)
+ATTRIB(R8_USCALED,         1, float, ubyte, TO_8_USCALED)
 
-ATTRIB( R8G8B8A8_SSCALED,  4, float, char, TO_8_SSCALED )
-ATTRIB( R8G8B8_SSCALED,    3, float, char, TO_8_SSCALED )
-ATTRIB( R8G8_SSCALED,      2, float, char, TO_8_SSCALED )
-ATTRIB( R8_SSCALED,        1, float, char, TO_8_SSCALED )
+ATTRIB(R8G8B8A8_SSCALED,  4, float, char, TO_8_SSCALED)
+ATTRIB(R8G8B8_SSCALED,    3, float, char, TO_8_SSCALED)
+ATTRIB(R8G8_SSCALED,      2, float, char, TO_8_SSCALED)
+ATTRIB(R8_SSCALED,        1, float, char, TO_8_SSCALED)
 
-ATTRIB( R8G8B8A8_UNORM,  4, float, ubyte, TO_8_UNORM )
-ATTRIB( R8G8B8_UNORM,    3, float, ubyte, TO_8_UNORM )
-ATTRIB( R8G8_UNORM,      2, float, ubyte, TO_8_UNORM )
-ATTRIB( R8_UNORM,        1, float, ubyte, TO_8_UNORM )
+ATTRIB(R8G8B8A8_UNORM,  4, float, ubyte, TO_8_UNORM)
+ATTRIB(R8G8B8_UNORM,    3, float, ubyte, TO_8_UNORM)
+ATTRIB(R8G8_UNORM,      2, float, ubyte, TO_8_UNORM)
+ATTRIB(R8_UNORM,        1, float, ubyte, TO_8_UNORM)
 
-ATTRIB( R8G8B8A8_SNORM,  4, float, char, TO_8_SNORM )
-ATTRIB( R8G8B8_SNORM,    3, float, char, TO_8_SNORM )
-ATTRIB( R8G8_SNORM,      2, float, char, TO_8_SNORM )
-ATTRIB( R8_SNORM,        1, float, char, TO_8_SNORM )
+ATTRIB(R8G8B8A8_SNORM,  4, float, char, TO_8_SNORM)
+ATTRIB(R8G8B8_SNORM,    3, float, char, TO_8_SNORM)
+ATTRIB(R8G8_SNORM,      2, float, char, TO_8_SNORM)
+ATTRIB(R8_SNORM,        1, float, char, TO_8_SNORM)
 
-ATTRIB( R32G32B32A32_UINT, 4, uint32_t, unsigned, TO_INT )
-ATTRIB( R32G32B32_UINT,    3, uint32_t, unsigned, TO_INT )
-ATTRIB( R32G32_UINT,       2, uint32_t, unsigned, TO_INT )
-ATTRIB( R32_UINT,          1, uint32_t, unsigned, TO_INT )
+ATTRIB(R32G32B32A32_UINT, 4, uint32_t, unsigned, TO_INT)
+ATTRIB(R32G32B32_UINT,    3, uint32_t, unsigned, TO_INT)
+ATTRIB(R32G32_UINT,       2, uint32_t, unsigned, TO_INT)
+ATTRIB(R32_UINT,          1, uint32_t, unsigned, TO_INT)
 
-ATTRIB( R16G16B16A16_UINT, 4, uint32_t, ushort, TO_INT )
-ATTRIB( R16G16B16_UINT,    3, uint32_t, ushort, TO_INT )
-ATTRIB( R16G16_UINT,       2, uint32_t, ushort, TO_INT )
-ATTRIB( R16_UINT,          1, uint32_t, ushort, TO_INT )
+ATTRIB(R16G16B16A16_UINT, 4, uint32_t, ushort, TO_INT)
+ATTRIB(R16G16B16_UINT,    3, uint32_t, ushort, TO_INT)
+ATTRIB(R16G16_UINT,       2, uint32_t, ushort, TO_INT)
+ATTRIB(R16_UINT,          1, uint32_t, ushort, TO_INT)
 
-ATTRIB( R8G8B8A8_UINT,   4, uint32_t, ubyte, TO_INT )
-ATTRIB( R8G8B8_UINT,     3, uint32_t, ubyte, TO_INT )
-ATTRIB( R8G8_UINT,       2, uint32_t, ubyte, TO_INT )
-ATTRIB( R8_UINT,         1, uint32_t, ubyte, TO_INT )
+ATTRIB(R8G8B8A8_UINT,   4, uint32_t, ubyte, TO_INT)
+ATTRIB(R8G8B8_UINT,     3, uint32_t, ubyte, TO_INT)
+ATTRIB(R8G8_UINT,       2, uint32_t, ubyte, TO_INT)
+ATTRIB(R8_UINT,         1, uint32_t, ubyte, TO_INT)
 
-ATTRIB( R32G32B32A32_SINT, 4, int32_t, int, TO_INT )
-ATTRIB( R32G32B32_SINT,    3, int32_t, int, TO_INT )
-ATTRIB( R32G32_SINT,       2, int32_t, int, TO_INT )
-ATTRIB( R32_SINT,          1, int32_t, int, TO_INT )
+ATTRIB(R32G32B32A32_SINT, 4, int32_t, int, TO_INT)
+ATTRIB(R32G32B32_SINT,    3, int32_t, int, TO_INT)
+ATTRIB(R32G32_SINT,       2, int32_t, int, TO_INT)
+ATTRIB(R32_SINT,          1, int32_t, int, TO_INT)
 
-ATTRIB( R16G16B16A16_SINT, 4, int32_t, short, TO_INT )
-ATTRIB( R16G16B16_SINT,    3, int32_t, short, TO_INT )
-ATTRIB( R16G16_SINT,       2, int32_t, short, TO_INT )
-ATTRIB( R16_SINT,          1, int32_t, short, TO_INT )
+ATTRIB(R16G16B16A16_SINT, 4, int32_t, short, TO_INT)
+ATTRIB(R16G16B16_SINT,    3, int32_t, short, TO_INT)
+ATTRIB(R16G16_SINT,       2, int32_t, short, TO_INT)
+ATTRIB(R16_SINT,          1, int32_t, short, TO_INT)
 
-ATTRIB( R8G8B8A8_SINT,   4, int32_t, char, TO_INT )
-ATTRIB( R8G8B8_SINT,     3, int32_t, char, TO_INT )
-ATTRIB( R8G8_SINT,       2, int32_t, char, TO_INT )
-ATTRIB( R8_SINT,         1, int32_t, char, TO_INT )
+ATTRIB(R8G8B8A8_SINT,   4, int32_t, char, TO_INT)
+ATTRIB(R8G8B8_SINT,     3, int32_t, char, TO_INT)
+ATTRIB(R8G8_SINT,       2, int32_t, char, TO_INT)
+ATTRIB(R8_SINT,         1, int32_t, char, TO_INT)
 
 static void
-emit_A8R8G8B8_UNORM( const void *attrib, void *ptr)
+emit_A8R8G8B8_UNORM(const void *attrib, void *ptr)
 {
    float *in = (float *)attrib;
    ubyte *out = (ubyte *)ptr;
@@ -247,7 +251,7 @@
 }
 
 static void
-emit_B8G8R8A8_UNORM( const void *attrib, void *ptr)
+emit_B8G8R8A8_UNORM(const void *attrib, void *ptr)
 {
    float *in = (float *)attrib;
    ubyte *out = (ubyte *)ptr;
@@ -258,7 +262,7 @@
 }
 
 static void
-emit_B10G10R10A2_UNORM( const void *attrib, void *ptr )
+emit_B10G10R10A2_UNORM(const void *attrib, void *ptr)
 {
    float *src = (float *)ptr;
    uint32_t value = 0;
@@ -270,7 +274,7 @@
 }
 
 static void
-emit_B10G10R10A2_USCALED( const void *attrib, void *ptr )
+emit_B10G10R10A2_USCALED(const void *attrib, void *ptr)
 {
    float *src = (float *)ptr;
    uint32_t value = 0;
@@ -282,7 +286,7 @@
 }
 
 static void
-emit_B10G10R10A2_SNORM( const void *attrib, void *ptr )
+emit_B10G10R10A2_SNORM(const void *attrib, void *ptr)
 {
    float *src = (float *)ptr;
    uint32_t value = 0;
@@ -294,7 +298,7 @@
 }
 
 static void
-emit_B10G10R10A2_SSCALED( const void *attrib, void *ptr )
+emit_B10G10R10A2_SSCALED(const void *attrib, void *ptr)
 {
    float *src = (float *)ptr;
    uint32_t value = 0;
@@ -306,7 +310,7 @@
 }
 
 static void
-emit_R10G10B10A2_UNORM( const void *attrib, void *ptr )
+emit_R10G10B10A2_UNORM(const void *attrib, void *ptr)
 {
    float *src = (float *)ptr;
    uint32_t value = 0;
@@ -318,7 +322,7 @@
 }
 
 static void
-emit_R10G10B10A2_USCALED( const void *attrib, void *ptr )
+emit_R10G10B10A2_USCALED(const void *attrib, void *ptr)
 {
    float *src = (float *)ptr;
    uint32_t value = 0;
@@ -330,7 +334,7 @@
 }
 
 static void
-emit_R10G10B10A2_SNORM( const void *attrib, void *ptr )
+emit_R10G10B10A2_SNORM(const void *attrib, void *ptr)
 {
    float *src = (float *)ptr;
    uint32_t value = 0;
@@ -342,7 +346,7 @@
 }
 
 static void
-emit_R10G10B10A2_SSCALED( const void *attrib, void *ptr)
+emit_R10G10B10A2_SSCALED(const void *attrib, void *ptr)
 {
    float *src = (float *)ptr;
    uint32_t value = 0;
@@ -353,13 +357,14 @@
    *(uint32_t *)attrib = util_le32_to_cpu(value);
 }
 
-static void 
-emit_NULL( const void *attrib, void *ptr )
+static void
+emit_NULL(const void *attrib, void *ptr)
 {
    /* do nothing is the only sensible option */
 }
 
-static emit_func get_emit_func( enum pipe_format format )
+static emit_func
+get_emit_func(enum pipe_format format)
 {
    switch (format) {
    case PIPE_FORMAT_R64_FLOAT:
@@ -576,16 +581,17 @@
       return &emit_R10G10B10A2_SSCALED;
 
    default:
-      assert(0); 
+      assert(0);
       return &emit_NULL;
    }
 }
 
-static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg,
-                                         unsigned elt,
-                                         unsigned start_instance,
-                                         unsigned instance_id,
-                                         void *vert )
+static ALWAYS_INLINE void PIPE_CDECL
+generic_run_one(struct translate_generic *tg,
+                unsigned elt,
+                unsigned start_instance,
+                unsigned instance_id,
+                void *vert)
 {
    unsigned nr_attrs = tg->nr_attrib;
    unsigned attr;
@@ -617,11 +623,10 @@
                (ptrdiff_t)tg->attrib[attr].input_stride * index;
 
          copy_size = tg->attrib[attr].copy_size;
-         if(likely(copy_size >= 0))
+         if (likely(copy_size >= 0)) {
             memcpy(dst, src, copy_size);
-         else
-         {
-            tg->attrib[attr].fetch( data, src, 0, 0 );
+         } else {
+            tg->attrib[attr].fetch(data, src, 0, 0);
 
             if (0)
                debug_printf("Fetch linear attr %d  from %p  stride %d  index %d: "
@@ -632,15 +637,14 @@
                          index,
                          data[0], data[1],data[2], data[3]);
 
-            tg->attrib[attr].emit( data, dst );
+            tg->attrib[attr].emit(data, dst);
          }
       } else {
-         if(likely(tg->attrib[attr].copy_size >= 0))
+         if (likely(tg->attrib[attr].copy_size >= 0)) {
             memcpy(data, &instance_id, 4);
-         else
-         {
+         } else {
             data[0] = (float)instance_id;
-            tg->attrib[attr].emit( data, dst );
+            tg->attrib[attr].emit(data, dst);
          }
       }
    }
@@ -649,12 +653,13 @@
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-static void PIPE_CDECL generic_run_elts( struct translate *translate,
-                                         const unsigned *elts,
-                                         unsigned count,
-                                         unsigned start_instance,
-                                         unsigned instance_id,
-                                         void *output_buffer )
+static void PIPE_CDECL
+generic_run_elts(struct translate *translate,
+                 const unsigned *elts,
+                 unsigned count,
+                 unsigned start_instance,
+                 unsigned instance_id,
+                 void *output_buffer)
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
@@ -666,12 +671,13 @@
    }
 }
 
-static void PIPE_CDECL generic_run_elts16( struct translate *translate,
-                                         const uint16_t *elts,
-                                         unsigned count,
-                                         unsigned start_instance,
-                                         unsigned instance_id,
-                                         void *output_buffer )
+static void PIPE_CDECL
+generic_run_elts16(struct translate *translate,
+                   const uint16_t *elts,
+                   unsigned count,
+                   unsigned start_instance,
+                   unsigned instance_id,
+                   void *output_buffer)
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
@@ -683,12 +689,13 @@
    }
 }
 
-static void PIPE_CDECL generic_run_elts8( struct translate *translate,
-                                         const uint8_t *elts,
-                                         unsigned count,
-                                         unsigned start_instance, 
-                                         unsigned instance_id,
-                                         void *output_buffer )
+static void PIPE_CDECL
+generic_run_elts8(struct translate *translate,
+                  const uint8_t *elts,
+                  unsigned count,
+                  unsigned start_instance,
+                  unsigned instance_id,
+                  void *output_buffer)
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
@@ -700,12 +707,13 @@
    }
 }
 
-static void PIPE_CDECL generic_run( struct translate *translate,
-                                    unsigned start,
-                                    unsigned count,
-                                    unsigned start_instance,
-                                    unsigned instance_id,
-                                    void *output_buffer )
+static void PIPE_CDECL
+generic_run(struct translate *translate,
+            unsigned start,
+            unsigned count,
+            unsigned start_instance,
+            unsigned instance_id,
+            void *output_buffer)
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
@@ -718,28 +726,30 @@
 }
 
 
-			       
-static void generic_set_buffer( struct translate *translate,
-				unsigned buf,
-				const void *ptr,
-				unsigned stride,
-				unsigned max_index )
+
+static void
+generic_set_buffer(struct translate *translate,
+                   unsigned buf,
+                   const void *ptr,
+                   unsigned stride,
+                   unsigned max_index)
 {
    struct translate_generic *tg = translate_generic(translate);
    unsigned i;
 
    for (i = 0; i < tg->nr_attrib; i++) {
       if (tg->attrib[i].buffer == buf) {
-	 tg->attrib[i].input_ptr = ((const uint8_t *)ptr +
-				    tg->attrib[i].input_offset);
-	 tg->attrib[i].input_stride = stride;
+         tg->attrib[i].input_ptr = ((const uint8_t *)ptr +
+                                    tg->attrib[i].input_offset);
+         tg->attrib[i].input_stride = stride;
          tg->attrib[i].max_index = max_index;
       }
    }
 }
 
 
-static void generic_release( struct translate *translate )
+static void
+generic_release(struct translate *translate)
 {
    /* Refcount?
     */
@@ -747,8 +757,8 @@
 }
 
 static boolean
-is_legal_int_format_combo( const struct util_format_description *src,
-                           const struct util_format_description *dst )
+is_legal_int_format_combo(const struct util_format_description *src,
+                          const struct util_format_description *dst)
 {
    unsigned i;
    unsigned nr = MIN2(src->nr_channels, dst->nr_channels);
@@ -767,7 +777,8 @@
    return TRUE;
 }
 
-struct translate *translate_generic_create( const struct translate_key *key )
+struct translate *
+translate_generic_create(const struct translate_key *key)
 {
    struct translate_generic *tg = CALLOC_STRUCT(translate_generic);
    unsigned i;
@@ -821,37 +832,33 @@
       tg->attrib[i].output_offset = key->element[i].output_offset;
 
       tg->attrib[i].copy_size = -1;
-      if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID)
-      {
-            if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED
-                  || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED)
-               tg->attrib[i].copy_size = 4;
-      }
-      else
-      {
-         if(key->element[i].input_format == key->element[i].output_format
-               && format_desc->block.width == 1
-               && format_desc->block.height == 1
-               && !(format_desc->block.bits & 7))
+      if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID) {
+         if (key->element[i].output_format == PIPE_FORMAT_R32_USCALED
+             || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED)
+            tg->attrib[i].copy_size = 4;
+      } else {
+         if (key->element[i].input_format == key->element[i].output_format
+             && format_desc->block.width == 1
+             && format_desc->block.height == 1
+             && !(format_desc->block.bits & 7))
             tg->attrib[i].copy_size = format_desc->block.bits >> 3;
       }
 
-      if(tg->attrib[i].copy_size < 0)
-	      tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
+      if (tg->attrib[i].copy_size < 0)
+         tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
       else
-	      tg->attrib[i].emit  = NULL;
+         tg->attrib[i].emit  = NULL;
    }
 
    tg->nr_attrib = key->nr_elements;
 
-
    return &tg->translate;
 }
 
-boolean translate_generic_is_output_format_supported(enum pipe_format format)
+boolean
+translate_generic_is_output_format_supported(enum pipe_format format)
 {
-   switch(format)
-   {
+   switch(format) {
    case PIPE_FORMAT_R64G64B64A64_FLOAT: return TRUE;
    case PIPE_FORMAT_R64G64B64_FLOAT: return TRUE;
    case PIPE_FORMAT_R64G64_FLOAT: return TRUE;
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 6d8178e..3e49667 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -186,7 +186,7 @@
          util_make_fragment_tex_shader_writemask(ctx->pipe, tgsi_tex,
                                                  TGSI_INTERPOLATE_LINEAR,
                                                  writemask,
-                                                 stype, stype);
+                                                 stype, stype, false, false);
    }
 
    cso_set_fragment_shader_handle(ctx->cso, ctx->fs[pipe_tex][writemask][idx]);
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 447d8d2..65c6f5d 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -76,16 +76,20 @@
    void *fs_write_one_cbuf;
    void *fs_write_all_cbufs;
 
-   /* FS which outputs a color from a texture.
-    * The first index indicates the texture type / destination type,
-    * the second index is the PIPE_TEXTURE_* to be sampled. */
-   void *fs_texfetch_col[5][PIPE_MAX_TEXTURE_TYPES];
+   /* FS which outputs a color from a texture where
+    * the 1st index indicates the texture type / destination type,
+    * the 2nd index is the PIPE_TEXTURE_* to be sampled,
+    * the 3rd index is 0 = use TEX, 1 = use TXF.
+    */
+   void *fs_texfetch_col[5][PIPE_MAX_TEXTURE_TYPES][2];
 
-   /* FS which outputs a depth from a texture,
-      where the index is PIPE_TEXTURE_* to be sampled. */
-   void *fs_texfetch_depth[PIPE_MAX_TEXTURE_TYPES];
-   void *fs_texfetch_depthstencil[PIPE_MAX_TEXTURE_TYPES];
-   void *fs_texfetch_stencil[PIPE_MAX_TEXTURE_TYPES];
+   /* FS which outputs a depth from a texture, where
+    * the 1st index is the PIPE_TEXTURE_* to be sampled,
+    * the 2nd index is 0 = use TEX, 1 = use TXF.
+    */
+   void *fs_texfetch_depth[PIPE_MAX_TEXTURE_TYPES][2];
+   void *fs_texfetch_depthstencil[PIPE_MAX_TEXTURE_TYPES][2];
+   void *fs_texfetch_stencil[PIPE_MAX_TEXTURE_TYPES][2];
 
    /* FS which outputs one sample from a multisample texture. */
    void *fs_texfetch_col_msaa[5][PIPE_MAX_TEXTURE_TYPES];
@@ -132,6 +136,9 @@
    boolean has_stream_out;
    boolean has_stencil_export;
    boolean has_texture_multisample;
+   boolean has_tex_lz;
+   boolean has_txf;
+   boolean cube_as_2darray;
    boolean cached_all_shaders;
 
    /* The Draw module overrides these functions.
@@ -197,6 +204,13 @@
    ctx->has_texture_multisample =
       pipe->screen->get_param(pipe->screen, PIPE_CAP_TEXTURE_MULTISAMPLE);
 
+   ctx->has_tex_lz = pipe->screen->get_param(pipe->screen,
+                                             PIPE_CAP_TGSI_TEX_TXF_LZ);
+   ctx->has_txf = pipe->screen->get_param(pipe->screen,
+                                          PIPE_CAP_GLSL_FEATURE_LEVEL) > 130;
+   ctx->cube_as_2darray = pipe->screen->get_param(pipe->screen,
+                                                  PIPE_CAP_SAMPLER_VIEW_TARGET);
+
    /* blend state objects */
    memset(&blend, 0, sizeof(blend));
 
@@ -449,18 +463,22 @@
 
    for (i = 0; i < PIPE_MAX_TEXTURE_TYPES; i++) {
       for (unsigned type = 0; type < ARRAY_SIZE(ctx->fs_texfetch_col); ++type) {
-         if (ctx->fs_texfetch_col[type][i])
-            ctx->delete_fs_state(pipe, ctx->fs_texfetch_col[type][i]);
+         for (unsigned inst = 0; inst < 2; inst++) {
+            if (ctx->fs_texfetch_col[type][i][inst])
+               ctx->delete_fs_state(pipe, ctx->fs_texfetch_col[type][i][inst]);
+         }
          if (ctx->fs_texfetch_col_msaa[type][i])
             ctx->delete_fs_state(pipe, ctx->fs_texfetch_col_msaa[type][i]);
       }
 
-      if (ctx->fs_texfetch_depth[i])
-         ctx->delete_fs_state(pipe, ctx->fs_texfetch_depth[i]);
-      if (ctx->fs_texfetch_depthstencil[i])
-         ctx->delete_fs_state(pipe, ctx->fs_texfetch_depthstencil[i]);
-      if (ctx->fs_texfetch_stencil[i])
-         ctx->delete_fs_state(pipe, ctx->fs_texfetch_stencil[i]);
+      for (unsigned inst = 0; inst < 2; inst++) {
+         if (ctx->fs_texfetch_depth[i][inst])
+            ctx->delete_fs_state(pipe, ctx->fs_texfetch_depth[i][inst]);
+         if (ctx->fs_texfetch_depthstencil[i][inst])
+            ctx->delete_fs_state(pipe, ctx->fs_texfetch_depthstencil[i][inst]);
+         if (ctx->fs_texfetch_stencil[i][inst])
+            ctx->delete_fs_state(pipe, ctx->fs_texfetch_stencil[i][inst]);
+      }
 
       if (ctx->fs_texfetch_depth_msaa[i])
          ctx->delete_fs_state(pipe, ctx->fs_texfetch_depth_msaa[i]);
@@ -539,7 +557,7 @@
    /* Vertex buffer. */
    pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1,
                             &ctx->base.saved_vertex_buffer);
-   pipe_resource_reference(&ctx->base.saved_vertex_buffer.buffer, NULL);
+   pipe_vertex_buffer_unreference(&ctx->base.saved_vertex_buffer);
 
    /* Vertex elements. */
    pipe->bind_vertex_elements_state(pipe, ctx->base.saved_velem_state);
@@ -751,13 +769,13 @@
 
 static void get_texcoords(struct pipe_sampler_view *src,
                           unsigned src_width0, unsigned src_height0,
-                          int x1, int y1, int x2, int y2,
+                          int x1, int y1, int x2, int y2, bool uses_txf,
                           float out[4])
 {
-   struct pipe_resource *tex = src->texture;
    unsigned level = src->u.tex.first_level;
-   boolean normalized = tex->target != PIPE_TEXTURE_RECT &&
-                        tex->nr_samples <= 1;
+   boolean normalized = !uses_txf &&
+                        src->target != PIPE_TEXTURE_RECT &&
+                        src->texture->nr_samples <= 1;
 
    if (normalized) {
       out[0] = x1 / (float)u_minify(src_width0,  level);
@@ -792,16 +810,18 @@
                                   struct pipe_sampler_view *src,
                                   unsigned src_width0, unsigned src_height0,
                                   float layer, unsigned sample,
-                                  int x1, int y1, int x2, int y2)
+                                  int x1, int y1, int x2, int y2,
+                                  bool uses_txf)
 {
    unsigned i;
    float coord[4];
    float face_coord[4][2];
 
-   get_texcoords(src, src_width0, src_height0, x1, y1, x2, y2, coord);
+   get_texcoords(src, src_width0, src_height0, x1, y1, x2, y2, uses_txf,
+                 coord);
 
-   if (src->texture->target == PIPE_TEXTURE_CUBE ||
-       src->texture->target == PIPE_TEXTURE_CUBE_ARRAY) {
+   if (src->target == PIPE_TEXTURE_CUBE ||
+       src->target == PIPE_TEXTURE_CUBE_ARRAY) {
       set_texcoords_in_vertices(coord, &face_coord[0][0], 2);
       util_map_texcoords2d_onto_cubemap((unsigned)layer % 6,
                                         /* pointer, stride in floats */
@@ -813,11 +833,14 @@
    }
 
    /* Set the layer. */
-   switch (src->texture->target) {
+   switch (src->target) {
    case PIPE_TEXTURE_3D:
       {
-         float r = layer / (float)u_minify(src->texture->depth0,
-                                           src->u.tex.first_level);
+         float r = layer;
+
+         if (!uses_txf)
+            r /= u_minify(src->texture->depth0, src->u.tex.first_level);
+
          for (i = 0; i < 4; i++)
             ctx->vertices[i][1][2] = r; /*r*/
       }
@@ -863,7 +886,8 @@
                                          enum pipe_texture_target target,
                                          unsigned src_nr_samples,
                                          unsigned dst_nr_samples,
-                                         unsigned filter)
+                                         unsigned filter,
+                                         bool use_txf)
 {
    struct pipe_context *pipe = ctx->base.pipe;
    unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(target, src_nr_samples);
@@ -945,14 +969,20 @@
 
       return *shader;
    } else {
-      void **shader = &ctx->fs_texfetch_col[type][target];
+      void **shader;
+
+      if (use_txf)
+         shader = &ctx->fs_texfetch_col[type][target][1];
+      else
+         shader = &ctx->fs_texfetch_col[type][target][0];
 
       /* Create the fragment shader on-demand. */
       if (!*shader) {
          assert(!ctx->cached_all_shaders);
          *shader = util_make_fragment_tex_shader(pipe, tgsi_tex,
                                                  TGSI_INTERPOLATE_LINEAR,
-                                                 stype, dtype);
+                                                 stype, dtype,
+                                                 ctx->has_tex_lz, use_txf);
       }
 
       return *shader;
@@ -962,7 +992,8 @@
 static inline
 void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
                                     enum pipe_texture_target target,
-                                    unsigned nr_samples)
+                                    unsigned nr_samples,
+                                    bool use_txf)
 {
    struct pipe_context *pipe = ctx->base.pipe;
 
@@ -981,7 +1012,12 @@
 
       return *shader;
    } else {
-      void **shader = &ctx->fs_texfetch_depth[target];
+      void **shader;
+
+      if (use_txf)
+         shader = &ctx->fs_texfetch_depth[target][1];
+      else
+         shader = &ctx->fs_texfetch_depth[target][0];
 
       /* Create the fragment shader on-demand. */
       if (!*shader) {
@@ -990,7 +1026,8 @@
          tgsi_tex = util_pipe_tex_to_tgsi_tex(target, 0);
          *shader =
             util_make_fragment_tex_shader_writedepth(pipe, tgsi_tex,
-                                                     TGSI_INTERPOLATE_LINEAR);
+                                                     TGSI_INTERPOLATE_LINEAR,
+                                                     ctx->has_tex_lz, use_txf);
       }
 
       return *shader;
@@ -1000,7 +1037,8 @@
 static inline
 void *blitter_get_fs_texfetch_depthstencil(struct blitter_context_priv *ctx,
                                            enum pipe_texture_target target,
-                                           unsigned nr_samples)
+                                           unsigned nr_samples,
+                                           bool use_txf)
 {
    struct pipe_context *pipe = ctx->base.pipe;
 
@@ -1019,7 +1057,12 @@
 
       return *shader;
    } else {
-      void **shader = &ctx->fs_texfetch_depthstencil[target];
+      void **shader;
+
+      if (use_txf)
+         shader = &ctx->fs_texfetch_depthstencil[target][1];
+      else
+         shader = &ctx->fs_texfetch_depthstencil[target][0];
 
       /* Create the fragment shader on-demand. */
       if (!*shader) {
@@ -1028,7 +1071,9 @@
          tgsi_tex = util_pipe_tex_to_tgsi_tex(target, 0);
          *shader =
             util_make_fragment_tex_shader_writedepthstencil(pipe, tgsi_tex,
-                                                     TGSI_INTERPOLATE_LINEAR);
+                                                            TGSI_INTERPOLATE_LINEAR,
+                                                            ctx->has_tex_lz,
+                                                            use_txf);
       }
 
       return *shader;
@@ -1038,7 +1083,8 @@
 static inline
 void *blitter_get_fs_texfetch_stencil(struct blitter_context_priv *ctx,
                                       enum pipe_texture_target target,
-                                      unsigned nr_samples)
+                                      unsigned nr_samples,
+                                      bool use_txf)
 {
    struct pipe_context *pipe = ctx->base.pipe;
 
@@ -1057,7 +1103,12 @@
 
       return *shader;
    } else {
-      void **shader = &ctx->fs_texfetch_stencil[target];
+      void **shader;
+
+      if (use_txf)
+         shader = &ctx->fs_texfetch_stencil[target][1];
+      else
+         shader = &ctx->fs_texfetch_stencil[target][0];
 
       /* Create the fragment shader on-demand. */
       if (!*shader) {
@@ -1066,7 +1117,8 @@
          tgsi_tex = util_pipe_tex_to_tgsi_tex(target, 0);
          *shader =
             util_make_fragment_tex_shader_writestencil(pipe, tgsi_tex,
-                                                       TGSI_INTERPOLATE_LINEAR);
+                                                       TGSI_INTERPOLATE_LINEAR,
+                                                       ctx->has_tex_lz, use_txf);
       }
 
       return *shader;
@@ -1085,7 +1137,7 @@
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
    struct pipe_context *pipe = blitter->pipe;
    struct pipe_screen *screen = pipe->screen;
-   unsigned samples, j, f, target, max_samples;
+   unsigned samples, j, f, target, max_samples, use_txf;
    boolean has_arraytex, has_cubearraytex;
 
    max_samples = ctx->has_texture_multisample ? 2 : 1;
@@ -1097,65 +1149,73 @@
    /* It only matters if i <= 1 or > 1. */
    for (samples = 1; samples <= max_samples; samples++) {
       for (target = PIPE_TEXTURE_1D; target < PIPE_MAX_TEXTURE_TYPES; target++) {
-         if (!has_arraytex &&
-             (target == PIPE_TEXTURE_1D_ARRAY ||
-              target == PIPE_TEXTURE_2D_ARRAY)) {
-            continue;
-         }
-         if (!has_cubearraytex &&
-             (target == PIPE_TEXTURE_CUBE_ARRAY))
-            continue;
-
-	 if (samples > 1 &&
-	     (target != PIPE_TEXTURE_2D &&
-	      target != PIPE_TEXTURE_2D_ARRAY))
-	    continue;
-
-         /* If samples == 1, the shaders read one texel. If samples >= 1,
-          * they read one sample.
-          */
-         blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_FLOAT,
-                                     PIPE_FORMAT_R32_FLOAT, target,
-                                     samples, samples, 0);
-         blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_UINT,
-                                     PIPE_FORMAT_R32_UINT, target,
-                                     samples, samples, 0);
-         blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_UINT,
-                                     PIPE_FORMAT_R32_SINT, target,
-                                     samples, samples, 0);
-         blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_SINT,
-                                     PIPE_FORMAT_R32_SINT, target,
-                                     samples, samples, 0);
-         blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_SINT,
-                                     PIPE_FORMAT_R32_UINT, target,
-                                     samples, samples, 0);
-         blitter_get_fs_texfetch_depth(ctx, target, samples);
-         if (ctx->has_stencil_export) {
-            blitter_get_fs_texfetch_depthstencil(ctx, target, samples);
-            blitter_get_fs_texfetch_stencil(ctx, target, samples);
-         }
-
-         if (samples == 1)
-            continue;
-
-         /* MSAA resolve shaders. */
-         for (j = 2; j < 32; j++) {
-            if (!screen->is_format_supported(screen, PIPE_FORMAT_R32_FLOAT,
-                                             target, j,
-                                             PIPE_BIND_SAMPLER_VIEW)) {
+         for (use_txf = 0; use_txf <= ctx->has_txf; use_txf++) {
+            if (!has_arraytex &&
+                (target == PIPE_TEXTURE_1D_ARRAY ||
+                 target == PIPE_TEXTURE_2D_ARRAY)) {
                continue;
             }
+            if (!has_cubearraytex &&
+                (target == PIPE_TEXTURE_CUBE_ARRAY))
+               continue;
 
-            for (f = 0; f < 2; f++) {
-               blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_FLOAT,
-                                           PIPE_FORMAT_R32_FLOAT, target,
-                                           j, 1, f);
-               blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_UINT,
-                                           PIPE_FORMAT_R32_UINT, target,
-                                           j, 1, f);
-               blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_SINT,
-                                           PIPE_FORMAT_R32_SINT, target,
-                                           j, 1, f);
+            if (samples > 1 &&
+                (target != PIPE_TEXTURE_2D &&
+                 target != PIPE_TEXTURE_2D_ARRAY))
+               continue;
+
+            if (samples > 1 && use_txf)
+               continue; /* TXF is the only option, use_txf has no effect */
+
+            /* If samples == 1, the shaders read one texel. If samples > 1,
+             * they read one sample.
+             */
+            blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_FLOAT,
+                                        PIPE_FORMAT_R32_FLOAT, target,
+                                        samples, samples, 0, use_txf);
+            blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_UINT,
+                                        PIPE_FORMAT_R32_UINT, target,
+                                        samples, samples, 0, use_txf);
+            blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_UINT,
+                                        PIPE_FORMAT_R32_SINT, target,
+                                        samples, samples, 0, use_txf);
+            blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_SINT,
+                                        PIPE_FORMAT_R32_SINT, target,
+                                        samples, samples, 0, use_txf);
+            blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_SINT,
+                                        PIPE_FORMAT_R32_UINT, target,
+                                        samples, samples, 0, use_txf);
+            blitter_get_fs_texfetch_depth(ctx, target, samples, use_txf);
+            if (ctx->has_stencil_export) {
+               blitter_get_fs_texfetch_depthstencil(ctx, target, samples, use_txf);
+               blitter_get_fs_texfetch_stencil(ctx, target, samples, use_txf);
+            }
+
+            if (samples == 1)
+               continue;
+
+            /* MSAA resolve shaders. */
+            for (j = 2; j < 32; j++) {
+               if (!screen->is_format_supported(screen, PIPE_FORMAT_R32_FLOAT,
+                                                target, j,
+                                                PIPE_BIND_SAMPLER_VIEW)) {
+                  continue;
+               }
+
+               for (f = 0; f < 2; f++) {
+                  if (f != PIPE_TEX_FILTER_NEAREST && use_txf)
+                     continue;
+
+                  blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_FLOAT,
+                                              PIPE_FORMAT_R32_FLOAT, target,
+                                              j, 1, f, use_txf);
+                  blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_UINT,
+                                              PIPE_FORMAT_R32_UINT, target,
+                                              j, 1, f, use_txf);
+                  blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_SINT,
+                                              PIPE_FORMAT_R32_SINT, target,
+                                              j, 1, f, use_txf);
+               }
             }
          }
       }
@@ -1209,15 +1269,15 @@
    vb.stride = 8 * sizeof(float);
 
    u_upload_data(pipe->stream_uploader, 0, sizeof(ctx->vertices), 4, ctx->vertices,
-                 &vb.buffer_offset, &vb.buffer);
-   if (!vb.buffer)
+                 &vb.buffer_offset, &vb.buffer.resource);
+   if (!vb.buffer.resource)
       return;
    u_upload_unmap(pipe->stream_uploader);
 
    pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb);
    util_draw_arrays_instanced(pipe, PIPE_PRIM_TRIANGLE_FAN, 0, 4,
                               0, num_instances);
-   pipe_resource_reference(&vb.buffer, NULL);
+   pipe_resource_reference(&vb.buffer.resource, NULL);
 }
 
 void util_blitter_draw_rectangle(struct blitter_context *blitter,
@@ -1400,12 +1460,22 @@
    return pipe->create_surface(pipe, surf->texture, &dst_templ);
 }
 
-void util_blitter_default_src_texture(struct pipe_sampler_view *src_templ,
+void util_blitter_default_src_texture(struct blitter_context *blitter,
+                                      struct pipe_sampler_view *src_templ,
                                       struct pipe_resource *src,
                                       unsigned srclevel)
 {
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+
    memset(src_templ, 0, sizeof(*src_templ));
-   src_templ->target = src->target;
+
+   if (ctx->cube_as_2darray &&
+       (src->target == PIPE_TEXTURE_CUBE ||
+        src->target == PIPE_TEXTURE_CUBE_ARRAY))
+      src_templ->target = PIPE_TEXTURE_2D_ARRAY;
+   else
+      src_templ->target = src->target;
+
    src_templ->format = util_format_linear(src->format);
    src_templ->u.tex.first_level = srclevel;
    src_templ->u.tex.last_level = srclevel;
@@ -1524,7 +1594,7 @@
    dst_view = pipe->create_surface(pipe, dst, &dst_templ);
 
    /* Initialize the sampler view. */
-   util_blitter_default_src_texture(&src_templ, src, src_level);
+   util_blitter_default_src_texture(blitter, &src_templ, src, src_level);
    src_view = pipe->create_sampler_view(pipe, src, &src_templ);
 
    /* Copy. */
@@ -1544,12 +1614,13 @@
                      unsigned src_width0,
                      unsigned src_height0,
                      const struct pipe_box *srcbox,
-                     bool is_zsbuf)
+                     bool is_zsbuf,
+                     bool uses_txf)
 {
    struct pipe_context *pipe = ctx->base.pipe;
    unsigned src_samples = src->texture->nr_samples;
    unsigned dst_samples = dst->texture->nr_samples;
-   enum pipe_texture_target src_target = src->texture->target;
+   enum pipe_texture_target src_target = src->target;
    struct pipe_framebuffer_state fb_state = {0};
 
    /* Initialize framebuffer state. */
@@ -1572,7 +1643,8 @@
        */
       union pipe_color_union coord;
       get_texcoords(src, src_width0, src_height0, srcbox->x, srcbox->y,
-                    srcbox->x+srcbox->width, srcbox->y+srcbox->height, coord.f);
+                    srcbox->x+srcbox->width, srcbox->y+srcbox->height,
+                    uses_txf, coord.f);
 
       /* Set framebuffer state. */
       if (is_zsbuf) {
@@ -1635,7 +1707,7 @@
                                      srcbox->z + src_z,
                                      i, srcbox->x, srcbox->y,
                                      srcbox->x + srcbox->width,
-                                     srcbox->y + srcbox->height);
+                                     srcbox->y + srcbox->height, uses_txf);
                blitter_draw(ctx, dstbox->x, dstbox->y,
                             dstbox->x + dstbox->width,
                             dstbox->y + dstbox->height, 0, 1);
@@ -1647,7 +1719,7 @@
                                   srcbox->z + src_z, 0,
                                   srcbox->x, srcbox->y,
                                   srcbox->x + srcbox->width,
-                                  srcbox->y + srcbox->height);
+                                  srcbox->y + srcbox->height, uses_txf);
             blitter_draw(ctx, dstbox->x, dstbox->y,
                          dstbox->x + dstbox->width,
                          dstbox->y + dstbox->height, 0, 1);
@@ -1678,7 +1750,7 @@
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
    struct pipe_context *pipe = ctx->base.pipe;
-   enum pipe_texture_target src_target = src->texture->target;
+   enum pipe_texture_target src_target = src->target;
    unsigned src_samples = src->texture->nr_samples;
    unsigned dst_samples = dst->texture->nr_samples;
    boolean has_depth, has_stencil, has_color;
@@ -1705,10 +1777,47 @@
       return;
    }
 
-   if (blit_stencil ||
-       (dstbox->width == abs(srcbox->width) &&
-        dstbox->height == abs(srcbox->height))) {
+   bool is_scaled = dstbox->width != abs(srcbox->width) ||
+                    dstbox->height != abs(srcbox->height);
+
+   if (blit_stencil || !is_scaled)
       filter = PIPE_TEX_FILTER_NEAREST;
+
+   bool use_txf = false;
+
+   /* Don't support scaled blits. The TXF shader uses F2I for rounding. */
+   if (ctx->has_txf &&
+       !is_scaled &&
+       filter == PIPE_TEX_FILTER_NEAREST &&
+       src->target != PIPE_TEXTURE_CUBE &&
+       src->target != PIPE_TEXTURE_CUBE_ARRAY) {
+      int src_width = u_minify(src_width0, src->u.tex.first_level);
+      int src_height = u_minify(src_height0, src->u.tex.first_level);
+      int src_depth = src->u.tex.last_layer + 1;
+      struct pipe_box box = *srcbox;
+
+      /* Eliminate negative width/height/depth. */
+      if (box.width < 0) {
+         box.x += box.width;
+         box.width *= -1;
+      }
+      if (box.height < 0) {
+         box.y += box.height;
+         box.height *= -1;
+      }
+      if (box.depth < 0) {
+         box.z += box.depth;
+         box.depth *= -1;
+      }
+
+      /* See if srcbox is in bounds. TXF doesn't clamp the coordinates. */
+      use_txf =
+         box.x >= 0 && box.x < src_width &&
+         box.y >= 0 && box.y < src_height &&
+         box.z >= 0 && box.z < src_depth &&
+         box.x + box.width > 0 && box.x + box.width <= src_width &&
+         box.y + box.height > 0 && box.y + box.height <= src_height &&
+         box.z + box.depth > 0 && box.z + box.depth <= src_depth;
    }
 
    /* Check whether the states are properly saved. */
@@ -1727,19 +1836,19 @@
                                               ctx->dsa_write_depth_stencil);
          ctx->bind_fs_state(pipe,
                blitter_get_fs_texfetch_depthstencil(ctx, src_target,
-                                                    src_samples));
+                                                    src_samples, use_txf));
       } else if (blit_depth) {
          pipe->bind_depth_stencil_alpha_state(pipe,
                                               ctx->dsa_write_depth_keep_stencil);
          ctx->bind_fs_state(pipe,
                blitter_get_fs_texfetch_depth(ctx, src_target,
-                                             src_samples));
+                                             src_samples, use_txf));
       } else { /* is_stencil */
          pipe->bind_depth_stencil_alpha_state(pipe,
                                               ctx->dsa_keep_depth_write_stencil);
          ctx->bind_fs_state(pipe,
                blitter_get_fs_texfetch_stencil(ctx, src_target,
-                                               src_samples));
+                                               src_samples, use_txf));
       }
 
    } else {
@@ -1749,7 +1858,8 @@
       pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
       ctx->bind_fs_state(pipe,
             blitter_get_fs_texfetch_col(ctx, src->format, dst->format, src_target,
-                                        src_samples, dst_samples, filter));
+                                        src_samples, dst_samples, filter,
+                                        use_txf));
    }
 
    /* Set the linear filter only for scaled color non-MSAA blits. */
@@ -1815,7 +1925,7 @@
    blitter_set_common_draw_rect_state(ctx, scissor != NULL, FALSE);
 
    do_blits(ctx, dst, dstbox, src, src_width0, src_height0,
-            srcbox, blit_depth || blit_stencil);
+            srcbox, blit_depth || blit_stencil, use_txf);
 
    util_blitter_restore_vertex_states(blitter);
    util_blitter_restore_fragment_states(blitter);
@@ -1846,7 +1956,7 @@
    dst_view = pipe->create_surface(pipe, dst, &dst_templ);
 
    /* Initialize the sampler view. */
-   util_blitter_default_src_texture(&src_templ, src, info->src.level);
+   util_blitter_default_src_texture(blitter, &src_templ, src, info->src.level);
    src_templ.format = info->src.format;
    src_view = pipe->create_sampler_view(pipe, src, &src_templ);
 
@@ -1876,6 +1986,11 @@
    const struct util_format_description *desc =
          util_format_description(format);
    unsigned src_level;
+   unsigned target = tex->target;
+
+   if (ctx->cube_as_2darray &&
+       (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY))
+      target = PIPE_TEXTURE_2D_ARRAY;
 
    assert(tex->nr_samples <= 1);
    assert(!util_format_has_stencil(desc));
@@ -1896,16 +2011,16 @@
       pipe->bind_depth_stencil_alpha_state(pipe,
                                            ctx->dsa_write_depth_keep_stencil);
       ctx->bind_fs_state(pipe,
-                         blitter_get_fs_texfetch_depth(ctx, tex->target, 1));
+                         blitter_get_fs_texfetch_depth(ctx, target, 1, false));
    } else {
       pipe->bind_blend_state(pipe, ctx->blend[PIPE_MASK_RGBA][0]);
       pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
       ctx->bind_fs_state(pipe,
-            blitter_get_fs_texfetch_col(ctx, tex->format, tex->format, tex->target,
-                                        1, 1, PIPE_TEX_FILTER_LINEAR));
+            blitter_get_fs_texfetch_col(ctx, tex->format, tex->format, target,
+                                        1, 1, PIPE_TEX_FILTER_LINEAR, false));
    }
 
-   if (tex->target == PIPE_TEXTURE_RECT) {
+   if (target == PIPE_TEXTURE_RECT) {
       sampler_state = ctx->sampler_state_rect_linear;
    } else {
       sampler_state = ctx->sampler_state_linear;
@@ -1926,7 +2041,7 @@
       srcbox.width = u_minify(tex->width0, src_level);
       srcbox.height = u_minify(tex->height0, src_level);
 
-      if (tex->target == PIPE_TEXTURE_3D) {
+      if (target == PIPE_TEXTURE_3D) {
          dstbox.depth = util_max_layer(tex, dst_level) + 1;
          srcbox.depth = util_max_layer(tex, src_level) + 1;
       } else {
@@ -1941,14 +2056,14 @@
       dst_view = pipe->create_surface(pipe, tex, &dst_templ);
 
       /* Initialize the sampler view. */
-      util_blitter_default_src_texture(&src_templ, tex, src_level);
+      util_blitter_default_src_texture(blitter, &src_templ, tex, src_level);
       src_templ.format = format;
       src_view = pipe->create_sampler_view(pipe, tex, &src_templ);
 
       pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, &src_view);
 
       do_blits(ctx, dst_view, &dstbox, src_view, tex->width0, tex->height0,
-               &srcbox, is_depth);
+               &srcbox, is_depth, false);
 
       pipe_surface_reference(&dst_view, NULL);
       pipe_sampler_view_reference(&src_view, NULL);
@@ -2199,7 +2314,8 @@
    blitter_check_saved_vertex_states(ctx);
    blitter_disable_render_cond(ctx);
 
-   vb.buffer = src;
+   vb.is_user_buffer = false;
+   vb.buffer.resource = src;
    vb.buffer_offset = srcx;
    vb.stride = 4;
 
@@ -2259,8 +2375,8 @@
    }
 
    u_upload_data(pipe->stream_uploader, 0, num_channels*4, 4, clear_value,
-                 &vb.buffer_offset, &vb.buffer);
-   if (!vb.buffer)
+                 &vb.buffer_offset, &vb.buffer.resource);
+   if (!vb.buffer.resource)
       goto out;
 
    vb.stride = 0;
@@ -2291,7 +2407,7 @@
    util_blitter_restore_render_cond(blitter);
    util_blitter_unset_running_flag(blitter);
    pipe_so_target_reference(&so_target, NULL);
-   pipe_resource_reference(&vb.buffer, NULL);
+   pipe_resource_reference(&vb.buffer.resource, NULL);
 }
 
 /* probably radeon specific */
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index f47c3dd..912af83 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -268,7 +268,8 @@
  * Helper function to initialize a view for copy_texture_view.
  * The parameters must match copy_texture_view.
  */
-void util_blitter_default_src_texture(struct pipe_sampler_view *src_templ,
+void util_blitter_default_src_texture(struct blitter_context *blitter,
+                                      struct pipe_sampler_view *src_templ,
                                       struct pipe_resource *src,
                                       unsigned srclevel);
 
@@ -503,10 +504,8 @@
 util_blitter_save_vertex_buffer_slot(struct blitter_context *blitter,
                                      struct pipe_vertex_buffer *vertex_buffers)
 {
-   pipe_resource_reference(&blitter->saved_vertex_buffer.buffer,
-                           vertex_buffers[blitter->vb_slot].buffer);
-   memcpy(&blitter->saved_vertex_buffer, &vertex_buffers[blitter->vb_slot],
-          sizeof(struct pipe_vertex_buffer));
+   pipe_vertex_buffer_reference(&blitter->saved_vertex_buffer,
+                                &vertex_buffers[blitter->vb_slot]);
 }
 
 static inline void
diff --git a/src/gallium/auxiliary/util/u_caps.c b/src/gallium/auxiliary/util/u_caps.c
deleted file mode 100644
index cd005d6..0000000
--- a/src/gallium/auxiliary/util/u_caps.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 Vmware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "pipe/p_screen.h"
-#include "util/u_format.h"
-#include "util/u_debug.h"
-#include "u_caps.h"
-
-/**
- * Iterates over a list of caps checks as defined in u_caps.h. Should
- * all checks pass returns TRUE and out is set to the last element of
- * the list (TERMINATE). Should any check fail returns FALSE and set
- * out to the index of the start of the first failing check.
- */
-boolean
-util_check_caps_out(struct pipe_screen *screen, const unsigned *list, int *out)
-{
-   int i, tmpi;
-   float tmpf;
-
-   for (i = 0; list[i];) {
-      switch(list[i++]) {
-      case UTIL_CAPS_CHECK_CAP:
-         if (!screen->get_param(screen, list[i++])) {
-            *out = i - 2;
-            return FALSE;
-         }
-         break;
-      case UTIL_CAPS_CHECK_INT:
-         tmpi = screen->get_param(screen, list[i++]);
-         if (tmpi < (int)list[i++]) {
-            *out = i - 3;
-            return FALSE;
-         }
-         break;
-      case UTIL_CAPS_CHECK_FLOAT:
-         tmpf = screen->get_paramf(screen, list[i++]);
-         if (tmpf < (float)list[i++]) {
-            *out = i - 3;
-            return FALSE;
-         }
-         break;
-      case UTIL_CAPS_CHECK_FORMAT:
-         if (!screen->is_format_supported(screen,
-                                          list[i++],
-                                          PIPE_TEXTURE_2D,
-                                          0,
-                                          PIPE_BIND_SAMPLER_VIEW)) {
-            *out = i - 2;
-            return FALSE;
-         }
-         break;
-      case UTIL_CAPS_CHECK_SHADER:
-         tmpi = screen->get_shader_param(screen, list[i] >> 24, list[i] & ((1 << 24) - 1));
-         ++i;
-         if (tmpi < (int)list[i++]) {
-            *out = i - 3;
-            return FALSE;
-         }
-         break;
-      case UTIL_CAPS_CHECK_UNIMPLEMENTED:
-         *out = i - 1;
-         return FALSE;
-      default:
-         assert(!"Unsupported check");
-         return FALSE;
-      }
-   }
-
-   *out = i;
-   return TRUE;
-}
-
-/**
- * Iterates over a list of caps checks as defined in u_caps.h.
- * Returns TRUE if all caps checks pass returns FALSE otherwise.
- */
-boolean
-util_check_caps(struct pipe_screen *screen, const unsigned *list)
-{
-   int out;
-   return util_check_caps_out(screen, list, &out);
-}
-
-
-/*
- * Below follows some demo lists.
- *
- * None of these lists are exhausting lists of what is
- * actually needed to support said API and more here for
- * as example on how to uses the above functions. Especially
- * for DX10 and DX11 where Gallium is missing features.
- */
-
-/* DX 9_1 */
-static unsigned caps_dx_9_1[] = {
-   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 1),
-   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 12),    /* 2048 */
-   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 9),     /* 256 */
-   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 10),  /* 512 */
-   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 2),
-   UTIL_CHECK_TERMINATE
-};
-
-/* DX 9_2 */
-static unsigned caps_dx_9_2[] = {
-   UTIL_CHECK_CAP(OCCLUSION_QUERY),
-   UTIL_CHECK_CAP(BLEND_EQUATION_SEPARATE),
-   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 1),
-   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 12),    /* 2048 */
-   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 9),     /* 256 */
-   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 10),  /* 512 */
-   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16),
-   UTIL_CHECK_TERMINATE
-};
-
-/* DX 9_3 */
-static unsigned caps_dx_9_3[] = {
-   UTIL_CHECK_CAP(SM3),
- //UTIL_CHECK_CAP(INSTANCING),
-   UTIL_CHECK_CAP(OCCLUSION_QUERY),
-   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 4),
-   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 13),    /* 4096 */
-   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 9),     /* 256 */
-   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 10),  /* 512 */
-   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16),
-   UTIL_CHECK_TERMINATE
-};
-
-/* DX 10 */
-static unsigned caps_dx_10[] = {
-   UTIL_CHECK_CAP(SM3),
- //UTIL_CHECK_CAP(INSTANCING),
-   UTIL_CHECK_CAP(OCCLUSION_QUERY),
-   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8),
-   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 14),    /* 8192 */
-   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 12),    /* 2048 */
-   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 14),  /* 8192 */
-   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16),
-   UTIL_CHECK_UNIMPLEMENTED, /* XXX Unimplemented features in Gallium */
-   UTIL_CHECK_TERMINATE
-};
-
-/* DX11 */
-static unsigned caps_dx_11[] = {
-   UTIL_CHECK_CAP(SM3),
- //UTIL_CHECK_CAP(INSTANCING),
-   UTIL_CHECK_CAP(OCCLUSION_QUERY),
-   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8),
-   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 14),    /* 16384 */
-   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 12),    /* 2048 */
-   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 14),  /* 16384 */
-   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16),
-   UTIL_CHECK_FORMAT(B8G8R8A8_UNORM),
-   UTIL_CHECK_UNIMPLEMENTED, /* XXX Unimplemented features in Gallium */
-   UTIL_CHECK_TERMINATE
-};
-
-/* OpenGL 2.1 */
-static unsigned caps_opengl_2_1[] = {
-   UTIL_CHECK_CAP(OCCLUSION_QUERY),
-   UTIL_CHECK_CAP(TWO_SIDED_STENCIL),
-   UTIL_CHECK_CAP(BLEND_EQUATION_SEPARATE),
-   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 2),
-   UTIL_CHECK_TERMINATE
-};
-
-/* OpenGL 3.0 */
-/* UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8), */
-
-/* Shader Model 3 */
-static unsigned caps_sm3[] = {
-    UTIL_CHECK_SHADER(FRAGMENT, MAX_INSTRUCTIONS, 512),
-    UTIL_CHECK_SHADER(FRAGMENT, MAX_INPUTS, 10),
-    UTIL_CHECK_SHADER(FRAGMENT, MAX_TEMPS, 32),
-    UTIL_CHECK_SHADER(FRAGMENT, MAX_CONST_BUFFER_SIZE, 224 * 16),
-
-    UTIL_CHECK_SHADER(VERTEX, MAX_INSTRUCTIONS, 512),
-    UTIL_CHECK_SHADER(VERTEX, MAX_INPUTS, 16),
-    UTIL_CHECK_SHADER(VERTEX, MAX_TEMPS, 32),
-    UTIL_CHECK_SHADER(VERTEX, MAX_CONST_BUFFER_SIZE, 256 * 16),
-
-    UTIL_CHECK_TERMINATE
-};
-
-/**
- * Demo function which checks against theoretical caps needed for different APIs.
- */
-void util_caps_demo_print(struct pipe_screen *screen)
-{
-   struct {
-      char* name;
-      unsigned *list;
-   } list[] = {
-      {"DX 9.1", caps_dx_9_1},
-      {"DX 9.2", caps_dx_9_2},
-      {"DX 9.3", caps_dx_9_3},
-      {"DX 10", caps_dx_10},
-      {"DX 11", caps_dx_11},
-      {"OpenGL 2.1", caps_opengl_2_1},
-/*    {"OpenGL 3.0", caps_opengl_3_0},*/
-      {"SM3", caps_sm3},
-      {NULL, NULL}
-   };
-   int i, out = 0;
-
-   for (i = 0; list[i].name; i++) {
-      if (util_check_caps_out(screen, list[i].list, &out)) {
-         debug_printf("%s: %s yes\n", __FUNCTION__, list[i].name);
-         continue;
-      }
-      switch (list[i].list[out]) {
-      case UTIL_CAPS_CHECK_CAP:
-         debug_printf("%s: %s no (cap %u not supported)\n", __FUNCTION__,
-                      list[i].name,
-                      list[i].list[out + 1]);
-         break;
-      case UTIL_CAPS_CHECK_INT:
-         debug_printf("%s: %s no (cap %u less then %u)\n", __FUNCTION__,
-                      list[i].name,
-                      list[i].list[out + 1],
-                      list[i].list[out + 2]);
-         break;
-      case UTIL_CAPS_CHECK_FLOAT:
-         debug_printf("%s: %s no (cap %u less then %f)\n", __FUNCTION__,
-                      list[i].name,
-                      list[i].list[out + 1],
-                      (double)(int)list[i].list[out + 2]);
-         break;
-      case UTIL_CAPS_CHECK_FORMAT:
-         debug_printf("%s: %s no (format %s not supported)\n", __FUNCTION__,
-                      list[i].name,
-                      util_format_name(list[i].list[out + 1]) + 12);
-         break;
-      case UTIL_CAPS_CHECK_UNIMPLEMENTED:
-         debug_printf("%s: %s no (not implemented in gallium or state tracker)\n",
-                      __FUNCTION__, list[i].name);
-         break;
-      default:
-            assert(!"Unsupported check");
-      }
-   }
-}
diff --git a/src/gallium/auxiliary/util/u_caps.h b/src/gallium/auxiliary/util/u_caps.h
deleted file mode 100644
index 038efc9..0000000
--- a/src/gallium/auxiliary/util/u_caps.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 Vmware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef U_CAPS_H
-#define U_CAPS_H
-
-#include "pipe/p_compiler.h"
-
-struct pipe_screen;
-
-enum u_caps_check_enum {
-   UTIL_CAPS_CHECK_TERMINATE = 0,
-   UTIL_CAPS_CHECK_CAP,
-   UTIL_CAPS_CHECK_INT,
-   UTIL_CAPS_CHECK_FLOAT,
-   UTIL_CAPS_CHECK_FORMAT,
-   UTIL_CAPS_CHECK_SHADER,
-   UTIL_CAPS_CHECK_UNIMPLEMENTED,
-};
-
-#define UTIL_CHECK_CAP(cap) \
-   UTIL_CAPS_CHECK_CAP, PIPE_CAP_##cap
-
-#define UTIL_CHECK_INT(cap, higher) \
-   UTIL_CAPS_CHECK_INT, PIPE_CAP_##cap, (unsigned)(higher)
-
-/* Floats currently lose precision */
-#define UTIL_CHECK_FLOAT(cap, higher) \
-   UTIL_CAPS_CHECK_FLOAT, PIPE_CAPF_##cap, (unsigned)(int)(higher)
-
-#define UTIL_CHECK_FORMAT(format) \
-   UTIL_CAPS_CHECK_FORMAT, PIPE_FORMAT_##format
-
-#define UTIL_CHECK_SHADER(shader, cap, higher) \
-   UTIL_CAPS_CHECK_SHADER, (PIPE_SHADER_##shader << 24) | PIPE_SHADER_CAP_##cap, (unsigned)(higher)
-
-#define UTIL_CHECK_UNIMPLEMENTED \
-   UTIL_CAPS_CHECK_UNIMPLEMENTED
-
-#define UTIL_CHECK_TERMINATE \
-   UTIL_CAPS_CHECK_TERMINATE
-
-boolean util_check_caps(struct pipe_screen *screen, const unsigned *list);
-boolean util_check_caps_out(struct pipe_screen *screen, const unsigned *list, int *out);
-void util_caps_demo_print(struct pipe_screen *screen);
-
-#endif
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index 845fc6b..6a59f27 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -59,12 +59,18 @@
 
 #if defined(PIPE_OS_LINUX)
 #include <signal.h>
+#include <fcntl.h>
+#include <elf.h>
 #endif
 
 #ifdef PIPE_OS_UNIX
 #include <unistd.h>
 #endif
 
+#if defined(HAS_ANDROID_CPUFEATURES)
+#include <cpu-features.h>
+#endif
+
 #if defined(PIPE_OS_WINDOWS)
 #include <windows.h>
 #if defined(PIPE_CC_MSVC)
@@ -126,16 +132,32 @@
    if (setjmp(__lv_powerpc_jmpbuf)) {
       signal(SIGILL, SIG_DFL);
    } else {
-      __lv_powerpc_canjump = 1;
+      boolean enable_altivec = TRUE;    /* Default: enable  if available, and if not overridden */
+#ifdef DEBUG
+      /* Disabling Altivec code generation is not the same as disabling VSX code generation,
+       * which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf.
+       * lp_build_create_jit_compiler_for_module().
+       * If you want to disable Altivec code generation, the best place to do it is here.
+       */
+      char *env_control = getenv("GALLIVM_ALTIVEC");    /* 1=enable (default); 0=disable */
+      if (env_control && env_control[0] == '0') {
+         enable_altivec = FALSE;
+      }
+#endif
+      if (enable_altivec) {
+         __lv_powerpc_canjump = 1;
 
-      __asm __volatile
-         ("mtspr 256, %0\n\t"
-          "vand %%v0, %%v0, %%v0"
-          :
-          : "r" (-1));
+         __asm __volatile
+            ("mtspr 256, %0\n\t"
+             "vand %%v0, %%v0, %%v0"
+             :
+             : "r" (-1));
 
-      signal(SIGILL, SIG_DFL);
-      util_cpu_caps.has_altivec = 1;
+         signal(SIGILL, SIG_DFL);
+         util_cpu_caps.has_altivec = 1;
+      } else {
+         util_cpu_caps.has_altivec = 0;
+      }
    }
 #endif /* !PIPE_OS_APPLE */
 }
@@ -294,6 +316,45 @@
 
 #endif /* X86 or X86_64 */
 
+#if defined(PIPE_ARCH_ARM)
+static void
+check_os_arm_support(void)
+{
+   /*
+    * Detect NEON and record it in util_cpu_caps.has_neon.  On Android the
+    * cpufeatures library is the preferred way of checking CPU capabilities,
+    * but it is unavailable in standalone Mesa builds (i.e. without the
+    * Android.mk-based build system), so instead of PIPE_OS_ANDROID a separate
+    * macro is used that only gets enabled from the respective Android.mk.
+    */
+#if defined(HAS_ANDROID_CPUFEATURES)
+   AndroidCpuFamily cpu_family = android_getCpuFamily();
+   uint64_t cpu_features = android_getCpuFeatures();
+
+   if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
+      if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON)
+         util_cpu_caps.has_neon = 1;
+   }
+#elif defined(PIPE_OS_LINUX)
+    Elf32_auxv_t aux;
+    int fd;
+
+    fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
+    if (fd >= 0) { /* if auxv cannot be opened, has_neon is left untouched */
+       while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) {
+          if (aux.a_type == AT_HWCAP) {
+             uint32_t hwcap = aux.a_un.a_val;
+
+             util_cpu_caps.has_neon = (hwcap >> 12) & 1; /* bit 12; presumably HWCAP_NEON */
+             break; /* only the AT_HWCAP entry is consulted */
+          }
+       }
+       close (fd);
+    }
+#endif /* PIPE_OS_LINUX */
+}
+#endif /* PIPE_ARCH_ARM */
+
 void
 util_cpu_detect(void)
 {
@@ -393,7 +454,7 @@
           (xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS
           ((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS
          uint32_t regs3[4];
-         cpuid(0x00000007, regs3);
+         cpuid_count(0x00000007, 0x00000000, regs3);
          util_cpu_caps.has_avx512f    = (regs3[1] >> 16) & 1;
          util_cpu_caps.has_avx512dq   = (regs3[1] >> 17) & 1;
          util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1;
@@ -443,6 +504,10 @@
    }
 #endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
 
+#if defined(PIPE_ARCH_ARM)
+   check_os_arm_support();
+#endif
+
 #if defined(PIPE_ARCH_PPC)
    check_os_altivec_support();
 #endif /* PIPE_ARCH_PPC */
@@ -471,6 +536,7 @@
       debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
       debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
       debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
+      debug_printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon);
       debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
       debug_printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f);
       debug_printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq);
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 3bd7294..4a34ac4d 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -72,6 +72,7 @@
    unsigned has_xop:1;
    unsigned has_altivec:1;
    unsigned has_daz:1;
+   unsigned has_neon:1;
 
    unsigned has_avx512f:1;
    unsigned has_avx512dq:1;
diff --git a/src/gallium/auxiliary/util/u_debug_flush.c b/src/gallium/auxiliary/util/u_debug_flush.c
index 06d0cfa..98a69b2 100644
--- a/src/gallium/auxiliary/util/u_debug_flush.c
+++ b/src/gallium/auxiliary/util/u_debug_flush.c
@@ -266,6 +266,12 @@
    mtx_unlock(&fbuf->mutex);
 }
 
+
+/**
+ * Add the given buffer to the list of active buffers.  Active buffers
+ * are those which are referenced by the command buffer currently being
+ * constructed.
+ */
 void
 debug_flush_cb_reference(struct debug_flush_ctx *fctx,
                          struct debug_flush_buf *fbuf)
@@ -314,14 +320,15 @@
    struct debug_flush_item *item =
       (struct debug_flush_item *) value;
    struct debug_flush_buf *fbuf = item->fbuf;
-   const char *reason = (const char *) data;
-   char message[80];
-
-   util_snprintf(message, sizeof(message),
-                 "%s referenced mapped buffer detected.", reason);
 
    mtx_lock(&fbuf->mutex);
    if (fbuf->mapped_sync) {
+      const char *reason = (const char *) data;
+      char message[80];
+
+      util_snprintf(message, sizeof(message),
+                    "%s referenced mapped buffer detected.", reason);
+
       debug_flush_alert(message, reason, 3, item->bt_depth, TRUE, TRUE, NULL);
       debug_flush_alert(NULL, "Map", 0, fbuf->bt_depth, TRUE, FALSE,
                         fbuf->map_frame);
@@ -333,6 +340,10 @@
    return PIPE_OK;
 }
 
+/**
+ * Called when we're about to possibly flush a command buffer.
+ * We check if any active buffers are in a mapped state.  If so, print an alert.
+ */
 void
 debug_flush_might_flush(struct debug_flush_ctx *fctx)
 {
@@ -356,6 +367,11 @@
 }
 
 
+/**
+ * Called when we flush a command buffer.  Two things are done:
+ * 1. Check if any of the active buffers are currently mapped (alert if so).
+ * 2. Discard/unreference all the active buffers.
+ */
 void
 debug_flush_flush(struct debug_flush_ctx *fctx)
 {
diff --git a/src/gallium/auxiliary/util/u_draw.c b/src/gallium/auxiliary/util/u_draw.c
index b9f8fcd..a7590f7 100644
--- a/src/gallium/auxiliary/util/u_draw.c
+++ b/src/gallium/auxiliary/util/u_draw.c
@@ -62,13 +62,13 @@
       const struct util_format_description *format_desc;
       unsigned format_size;
 
-      if (!buffer->buffer) {
+      if (buffer->is_user_buffer || !buffer->buffer.resource) {
          continue;
       }
 
-      assert(buffer->buffer->height0 == 1);
-      assert(buffer->buffer->depth0 == 1);
-      buffer_size = buffer->buffer->width0;
+      assert(buffer->buffer.resource->height0 == 1);
+      assert(buffer->buffer.resource->depth0 == 1);
+      buffer_size = buffer->buffer.resource->width0;
 
       format_desc = util_format_description(element->src_format);
       assert(format_desc->block.width == 1);
@@ -136,7 +136,7 @@
    struct pipe_draw_info info;
    struct pipe_transfer *transfer;
    uint32_t *params;
-   const unsigned num_params = info_in->indexed ? 5 : 4;
+   const unsigned num_params = info_in->index_size ? 5 : 4;
 
    assert(info_in->indirect);
    assert(!info_in->count_from_stream_output);
@@ -145,8 +145,8 @@
 
    params = (uint32_t *)
       pipe_buffer_map_range(pipe,
-                            info_in->indirect,
-                            info_in->indirect_offset,
+                            info_in->indirect->buffer,
+                            info_in->indirect->offset,
                             num_params * sizeof(uint32_t),
                             PIPE_TRANSFER_READ,
                             &transfer);
@@ -158,8 +158,8 @@
    info.count = params[0];
    info.instance_count = params[1];
    info.start = params[2];
-   info.index_bias = info_in->indexed ? params[3] : 0;
-   info.start_instance = info_in->indexed ? params[4] : params[3];
+   info.index_bias = info_in->index_size ? params[3] : 0;
+   info.start_instance = info_in->index_size ? params[4] : params[3];
    info.indirect = NULL;
 
    pipe_buffer_unmap(pipe, transfer);
diff --git a/src/gallium/auxiliary/util/u_draw.h b/src/gallium/auxiliary/util/u_draw.h
index b16f106..e8af140 100644
--- a/src/gallium/auxiliary/util/u_draw.h
+++ b/src/gallium/auxiliary/util/u_draw.h
@@ -67,15 +67,15 @@
 }
 
 static inline void
-util_draw_elements(struct pipe_context *pipe, int index_bias,
-                   enum pipe_prim_type mode,
+util_draw_elements(struct pipe_context *pipe, unsigned index_size,
+                   int index_bias, enum pipe_prim_type mode,
                    uint start,
                    uint count)
 {
    struct pipe_draw_info info;
 
    util_draw_init_info(&info);
-   info.indexed = TRUE;
+   info.index_size = index_size;
    info.mode = mode;
    info.start = start;
    info.count = count;
@@ -108,6 +108,7 @@
 
 static inline void
 util_draw_elements_instanced(struct pipe_context *pipe,
+                             unsigned index_size,
                              int index_bias,
                              enum pipe_prim_type mode,
                              uint start,
@@ -118,7 +119,7 @@
    struct pipe_draw_info info;
 
    util_draw_init_info(&info);
-   info.indexed = TRUE;
+   info.index_size = index_size;
    info.mode = mode;
    info.start = start;
    info.count = count;
@@ -129,29 +130,6 @@
    pipe->draw_vbo(pipe, &info);
 }
 
-static inline void
-util_draw_range_elements(struct pipe_context *pipe,
-                         int index_bias,
-                         uint min_index,
-                         uint max_index,
-                         enum pipe_prim_type mode,
-                         uint start,
-                         uint count)
-{
-   struct pipe_draw_info info;
-
-   util_draw_init_info(&info);
-   info.indexed = TRUE;
-   info.mode = mode;
-   info.start = start;
-   info.count = count;
-   info.index_bias = index_bias;
-   info.min_index = min_index;
-   info.max_index = max_index;
-
-   pipe->draw_vbo(pipe, &info);
-}
-
 
 /* This converts an indirect draw into a direct draw by mapping the indirect
  * buffer, extracting its arguments, and calling pipe->draw_vbo.
diff --git a/src/gallium/auxiliary/util/u_draw_quad.c b/src/gallium/auxiliary/util/u_draw_quad.c
index ce3fa41..fe9558e 100644
--- a/src/gallium/auxiliary/util/u_draw_quad.c
+++ b/src/gallium/auxiliary/util/u_draw_quad.c
@@ -54,7 +54,7 @@
 
    /* tell pipe about the vertex buffer */
    memset(&vbuffer, 0, sizeof(vbuffer));
-   vbuffer.buffer = vbuf;
+   vbuffer.buffer.resource = vbuf;
    vbuffer.stride = num_attribs * 4 * sizeof(float);  /* vertex size */
    vbuffer.buffer_offset = offset;
 
@@ -82,7 +82,8 @@
 
    assert(num_attribs <= PIPE_MAX_ATTRIBS);
 
-   vbuffer.user_buffer = buffer;
+   vbuffer.is_user_buffer = true;
+   vbuffer.buffer.user = buffer;
    vbuffer.stride = num_attribs * 4 * sizeof(float);  /* vertex size */
 
    /* note: vertex elements already set by caller */
diff --git a/src/gallium/auxiliary/util/u_dump.h b/src/gallium/auxiliary/util/u_dump.h
index bce8517..408c270 100644
--- a/src/gallium/auxiliary/util/u_dump.h
+++ b/src/gallium/auxiliary/util/u_dump.h
@@ -173,9 +173,6 @@
                           const struct pipe_constant_buffer *state);
 
 void
-util_dump_index_buffer(FILE *stream, const struct pipe_index_buffer *state);
-
-void
 util_dump_vertex_buffer(FILE *stream,
                         const struct pipe_vertex_buffer *state);
 
@@ -191,6 +188,9 @@
 util_dump_draw_info(FILE *stream, const struct pipe_draw_info *state);
 
 void
+util_dump_grid_info(FILE *stream, const struct pipe_grid_info *state);
+
+void
 util_dump_box(FILE *stream, const struct pipe_box *box);
 
 void
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index 105e5c4..c62229a5 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -463,11 +463,13 @@
 
    util_dump_struct_begin(stream, "pipe_shader_state");
 
-   util_dump_member_begin(stream, "tokens");
-   fprintf(stream, "\"\n");
-   tgsi_dump_to_file(state->tokens, 0, stream);
-   fprintf(stream, "\"");
-   util_dump_member_end(stream);
+   if (state->type == PIPE_SHADER_IR_TGSI) {
+      util_dump_member_begin(stream, "tokens");
+      fprintf(stream, "\"\n");
+      tgsi_dump_to_file(state->tokens, 0, stream);
+      fprintf(stream, "\"");
+      util_dump_member_end(stream);
+   }
 
    if (state->stream_output.num_outputs) {
       util_dump_member_begin(stream, "stream_output");
@@ -832,25 +834,6 @@
 
 
 void
-util_dump_index_buffer(FILE *stream, const struct pipe_index_buffer *state)
-{
-   if (!state) {
-      util_dump_null(stream);
-      return;
-   }
-
-   util_dump_struct_begin(stream, "pipe_index_buffer");
-
-   util_dump_member(stream, uint, state, index_size);
-   util_dump_member(stream, uint, state, offset);
-   util_dump_member(stream, ptr, state, buffer);
-   util_dump_member(stream, ptr, state, user_buffer);
-
-   util_dump_struct_end(stream);
-}
-
-
-void
 util_dump_vertex_buffer(FILE *stream, const struct pipe_vertex_buffer *state)
 {
    if (!state) {
@@ -861,9 +844,9 @@
    util_dump_struct_begin(stream, "pipe_vertex_buffer");
 
    util_dump_member(stream, uint, state, stride);
+   util_dump_member(stream, bool, state, is_user_buffer);
    util_dump_member(stream, uint, state, buffer_offset);
-   util_dump_member(stream, ptr, state, buffer);
-   util_dump_member(stream, ptr, state, user_buffer);
+   util_dump_member(stream, ptr, state, buffer.resource);
 
    util_dump_struct_end(stream);
 }
@@ -917,7 +900,8 @@
 
    util_dump_struct_begin(stream, "pipe_draw_info");
 
-   util_dump_member(stream, bool, state, indexed);
+   util_dump_member(stream, uint, state, index_size);
+   util_dump_member(stream, uint, state, has_user_indices);
 
    util_dump_member(stream, enum_prim_mode, state, mode);
    util_dump_member(stream, uint, state, start);
@@ -937,13 +921,46 @@
    util_dump_member(stream, bool, state, primitive_restart);
    util_dump_member(stream, uint, state, restart_index);
 
+   util_dump_member(stream, ptr, state, index.resource);
    util_dump_member(stream, ptr, state, count_from_stream_output);
 
+   if (!state->indirect) {
+      util_dump_member(stream, ptr, state, indirect);
+   } else {
+      util_dump_member(stream, uint, state, indirect->offset);
+      util_dump_member(stream, uint, state, indirect->stride);
+      util_dump_member(stream, uint, state, indirect->draw_count);
+      util_dump_member(stream, uint, state, indirect->indirect_draw_count_offset);
+      util_dump_member(stream, ptr, state, indirect->buffer);
+      util_dump_member(stream, ptr, state, indirect->indirect_draw_count);
+   }
+
+   util_dump_struct_end(stream);
+}
+
+void util_dump_grid_info(FILE *stream, const struct pipe_grid_info *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_grid_info");
+
+   util_dump_member(stream, uint, state, pc);
+   util_dump_member(stream, ptr, state, input);
+   util_dump_member(stream, uint, state, work_dim);
+
+   util_dump_member_begin(stream, "block");
+   util_dump_array(stream, uint, state->block, ARRAY_SIZE(state->block));
+   util_dump_member_end(stream);
+
+   util_dump_member_begin(stream, "grid");
+   util_dump_array(stream, uint, state->grid, ARRAY_SIZE(state->grid));
+   util_dump_member_end(stream);
+
    util_dump_member(stream, ptr, state, indirect);
    util_dump_member(stream, uint, state, indirect_offset);
-   util_dump_member(stream, uint, state, indirect_stride);
-   util_dump_member(stream, uint, state, indirect_count);
-   util_dump_member(stream, uint, state, indirect_params_offset);
 
    util_dump_struct_end(stream);
 }
diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h
deleted file mode 100644
index 7b7a093..0000000
--- a/src/gallium/auxiliary/util/u_dynarray.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 Luca Barbieri
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef U_DYNARRAY_H
-#define U_DYNARRAY_H
-
-#include "pipe/p_compiler.h"
-#include "util/u_memory.h"
-
-/* A zero-initialized version of this is guaranteed to represent an
- * empty array.
- *
- * Also, size <= capacity and data != 0 if and only if capacity != 0
- * capacity will always be the allocation size of data
- */
-struct util_dynarray
-{
-   void *data;
-   unsigned size;
-   unsigned capacity;
-};
-
-static inline void
-util_dynarray_init(struct util_dynarray *buf)
-{
-   memset(buf, 0, sizeof(*buf));
-}
-
-static inline void
-util_dynarray_fini(struct util_dynarray *buf)
-{
-   if(buf->data)
-   {
-      FREE(buf->data);
-      util_dynarray_init(buf);
-   }
-}
-
-/* use util_dynarray_trim to reduce the allocated storage */
-static inline void *
-util_dynarray_resize(struct util_dynarray *buf, unsigned newsize)
-{
-   char *p;
-   if(newsize > buf->capacity)
-   {
-      unsigned newcap = buf->capacity << 1;
-      if(newsize > newcap)
-	      newcap = newsize;
-      buf->data = REALLOC(buf->data, buf->capacity, newcap);
-      buf->capacity = newcap;
-   }
-
-   p = (char *)buf->data + buf->size;
-   buf->size = newsize;
-   return p;
-}
-
-static inline void *
-util_dynarray_grow(struct util_dynarray *buf, int diff)
-{
-   return util_dynarray_resize(buf, buf->size + diff);
-}
-
-static inline void
-util_dynarray_trim(struct util_dynarray *buf)
-{
-   if (buf->size != buf->capacity) {
-      if (buf->size) {
-         buf->data = REALLOC(buf->data, buf->capacity, buf->size);
-         buf->capacity = buf->size;
-      }
-      else {
-         FREE(buf->data);
-         buf->data = 0;
-         buf->capacity = 0;
-      }
-   }
-}
-
-#define util_dynarray_append(buf, type, v) do {type __v = (v); memcpy(util_dynarray_grow((buf), sizeof(type)), &__v, sizeof(type));} while(0)
-#define util_dynarray_top_ptr(buf, type) (type*)((char*)(buf)->data + (buf)->size - sizeof(type))
-#define util_dynarray_top(buf, type) *util_dynarray_top_ptr(buf, type)
-#define util_dynarray_pop_ptr(buf, type) (type*)((char*)(buf)->data + ((buf)->size -= sizeof(type)))
-#define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type)
-#define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type))
-#define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx))
-#define util_dynarray_begin(buf) ((buf)->data)
-#define util_dynarray_end(buf) ((void*)util_dynarray_element((buf), char, (buf)->size))
-
-#endif /* U_DYNARRAY_H */
-
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index d055778..88bfd72 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -495,6 +495,19 @@
    return desc->layout == UTIL_FORMAT_LAYOUT_S3TC ? TRUE : FALSE;
 }
 
+static inline boolean
+util_format_is_etc(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(desc);
+   if (!desc) {
+      return FALSE;
+   }
+
+   return desc->layout == UTIL_FORMAT_LAYOUT_ETC ? TRUE : FALSE;
+}
+
 static inline boolean 
 util_format_is_srgb(enum pipe_format format)
 {
diff --git a/src/gallium/auxiliary/util/u_helpers.c b/src/gallium/auxiliary/util/u_helpers.c
index 35cca82..e0feade 100644
--- a/src/gallium/auxiliary/util/u_helpers.c
+++ b/src/gallium/auxiliary/util/u_helpers.c
@@ -28,6 +28,7 @@
 #include "util/u_helpers.h"
 #include "util/u_inlines.h"
 #include "util/u_upload_mgr.h"
+#include <inttypes.h>
 
 /**
  * This function is used to copy an array of pipe_vertex_buffer structures,
@@ -50,10 +51,13 @@
 
    if (src) {
       for (i = 0; i < count; i++) {
-         if (src[i].buffer || src[i].user_buffer) {
+         if (src[i].buffer.resource)
             bitmask |= 1 << i;
-         }
-         pipe_resource_reference(&dst[i].buffer, src[i].buffer);
+
+         pipe_vertex_buffer_unreference(&dst[i]);
+
+         if (!src[i].is_user_buffer)
+            pipe_resource_reference(&dst[i].buffer.resource, src[i].buffer.resource);
       }
 
       /* Copy over the other members of pipe_vertex_buffer. */
@@ -64,10 +68,8 @@
    }
    else {
       /* Unreference the buffers. */
-      for (i = 0; i < count; i++) {
-         pipe_resource_reference(&dst[i].buffer, NULL);
-         dst[i].user_buffer = NULL;
-      }
+      for (i = 0; i < count; i++)
+         pipe_vertex_buffer_unreference(&dst[i]);
 
       *enabled_buffers &= ~(((1ull << count) - 1) << start_slot);
    }
@@ -86,7 +88,7 @@
    uint32_t enabled_buffers = 0;
 
    for (i = 0; i < *dst_count; i++) {
-      if (dst[i].buffer || dst[i].user_buffer)
+      if (dst[i].buffer.resource)
          enabled_buffers |= (1ull << i);
    }
 
@@ -96,46 +98,72 @@
    *dst_count = util_last_bit(enabled_buffers);
 }
 
-
-void
-util_set_index_buffer(struct pipe_index_buffer *dst,
-                      const struct pipe_index_buffer *src)
-{
-   if (src) {
-      pipe_resource_reference(&dst->buffer, src->buffer);
-      memcpy(dst, src, sizeof(*dst));
-   }
-   else {
-      pipe_resource_reference(&dst->buffer, NULL);
-      memset(dst, 0, sizeof(*dst));
-   }
-}
-
 /**
- * Given a user index buffer, save the structure to "saved", and upload it.
+ * Upload a draw's user indices; *out_offset is biased so info->start applies.
  */
 bool
-util_save_and_upload_index_buffer(struct pipe_context *pipe,
-                                  const struct pipe_draw_info *info,
-                                  const struct pipe_index_buffer *ib,
-                                  struct pipe_index_buffer *out_saved)
+util_upload_index_buffer(struct pipe_context *pipe,
+                         const struct pipe_draw_info *info,
+                         struct pipe_resource **out_buffer,
+                         unsigned *out_offset)
 {
-   struct pipe_index_buffer new_ib = {0};
-   unsigned start_offset = info->start * ib->index_size;
+   unsigned start_offset = info->start * info->index_size;
 
    u_upload_data(pipe->stream_uploader, start_offset,
-                 info->count * ib->index_size, 4,
-                 (char*)ib->user_buffer + start_offset,
-                 &new_ib.offset, &new_ib.buffer);
-   if (!new_ib.buffer)
-      return false;
+                 info->count * info->index_size, 4,
+                 (char*)info->index.user + start_offset,
+                 out_offset, out_buffer);
    u_upload_unmap(pipe->stream_uploader);
+   *out_offset -= start_offset;
+   return *out_buffer != NULL;
+}
 
-   new_ib.offset -= start_offset;
-   new_ib.index_size = ib->index_size;
+struct pipe_query *
+util_begin_pipestat_query(struct pipe_context *ctx)
+{
+   struct pipe_query *q =
+      ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+   if (!q)
+      return NULL;
 
-   util_set_index_buffer(out_saved, ib);
-   pipe->set_index_buffer(pipe, &new_ib);
-   pipe_resource_reference(&new_ib.buffer, NULL);
-   return true;
+   ctx->begin_query(ctx, q);
+   return q;
+}
+
+void
+util_end_pipestat_query(struct pipe_context *ctx, struct pipe_query *q,
+                        FILE *f)
+{
+   static unsigned counter;
+   struct pipe_query_data_pipeline_statistics stats;
+
+   ctx->end_query(ctx, q);
+   ctx->get_query_result(ctx, q, true, (void*)&stats);
+   ctx->destroy_query(ctx, q);
+
+   fprintf(f,
+           "Draw call %u:\n"
+           "    ia_vertices    = %"PRIu64"\n"
+           "    ia_primitives  = %"PRIu64"\n"
+           "    vs_invocations = %"PRIu64"\n"
+           "    gs_invocations = %"PRIu64"\n"
+           "    gs_primitives  = %"PRIu64"\n"
+           "    c_invocations  = %"PRIu64"\n"
+           "    c_primitives   = %"PRIu64"\n"
+           "    ps_invocations = %"PRIu64"\n"
+           "    hs_invocations = %"PRIu64"\n"
+           "    ds_invocations = %"PRIu64"\n"
+           "    cs_invocations = %"PRIu64"\n",
+           p_atomic_inc_return(&counter),
+           stats.ia_vertices,
+           stats.ia_primitives,
+           stats.vs_invocations,
+           stats.gs_invocations,
+           stats.gs_primitives,
+           stats.c_invocations,
+           stats.c_primitives,
+           stats.ps_invocations,
+           stats.hs_invocations,
+           stats.ds_invocations,
+           stats.cs_invocations);
 }
diff --git a/src/gallium/auxiliary/util/u_helpers.h b/src/gallium/auxiliary/util/u_helpers.h
index 7de960b..ab970d7 100644
--- a/src/gallium/auxiliary/util/u_helpers.h
+++ b/src/gallium/auxiliary/util/u_helpers.h
@@ -29,6 +29,7 @@
 #define U_HELPERS_H
 
 #include "pipe/p_state.h"
+#include <stdio.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -44,13 +45,17 @@
                                    const struct pipe_vertex_buffer *src,
                                    unsigned start_slot, unsigned count);
 
-void util_set_index_buffer(struct pipe_index_buffer *dst,
-                           const struct pipe_index_buffer *src);
+bool util_upload_index_buffer(struct pipe_context *pipe,
+                              const struct pipe_draw_info *info,
+                              struct pipe_resource **out_buffer,
+                              unsigned *out_offset);
 
-bool util_save_and_upload_index_buffer(struct pipe_context *pipe,
-                                       const struct pipe_draw_info *info,
-                                       const struct pipe_index_buffer *ib,
-                                       struct pipe_index_buffer *out_saved);
+struct pipe_query *
+util_begin_pipestat_query(struct pipe_context *ctx);
+
+void
+util_end_pipestat_query(struct pipe_context *ctx, struct pipe_query *q,
+                        FILE *f);
 
 #ifdef __cplusplus
 }
diff --git a/src/gallium/auxiliary/util/u_index_modify.c b/src/gallium/auxiliary/util/u_index_modify.c
index d86be24..4e9349a 100644
--- a/src/gallium/auxiliary/util/u_index_modify.c
+++ b/src/gallium/auxiliary/util/u_index_modify.c
@@ -27,7 +27,7 @@
 /* Ubyte indices. */
 
 void util_shorten_ubyte_elts_to_userptr(struct pipe_context *context,
-					const struct pipe_index_buffer *ib,
+					const struct pipe_draw_info *info,
                                         unsigned add_transfer_flags,
 					int index_bias,
 					unsigned start,
@@ -39,10 +39,10 @@
     unsigned short *out_map = out;
     unsigned i;
 
-    if (ib->user_buffer) {
-       in_map = ib->user_buffer;
+    if (info->has_user_indices) {
+       in_map = info->index.user;
     } else {
-       in_map = pipe_buffer_map(context, ib->buffer,
+       in_map = pipe_buffer_map(context, info->index.resource,
                                 PIPE_TRANSFER_READ |
                                 add_transfer_flags,
                                 &src_transfer);
@@ -62,7 +62,7 @@
 /* Ushort indices. */
 
 void util_rebuild_ushort_elts_to_userptr(struct pipe_context *context,
-					 const struct pipe_index_buffer *ib,
+					 const struct pipe_draw_info *info,
                                          unsigned add_transfer_flags,
 					 int index_bias,
 					 unsigned start, unsigned count,
@@ -73,10 +73,10 @@
     unsigned short *out_map = out;
     unsigned i;
 
-    if (ib->user_buffer) {
-       in_map = ib->user_buffer;
+    if (info->has_user_indices) {
+       in_map = info->index.user;
     } else {
-       in_map = pipe_buffer_map(context, ib->buffer,
+       in_map = pipe_buffer_map(context, info->index.resource,
                                 PIPE_TRANSFER_READ |
                                 add_transfer_flags,
                                 &in_transfer);
@@ -96,7 +96,7 @@
 /* Uint indices. */
 
 void util_rebuild_uint_elts_to_userptr(struct pipe_context *context,
-				       const struct pipe_index_buffer *ib,
+				       const struct pipe_draw_info *info,
                                        unsigned add_transfer_flags,
 				       int index_bias,
 				       unsigned start, unsigned count,
@@ -107,10 +107,10 @@
     unsigned int *out_map = out;
     unsigned i;
 
-    if (ib->user_buffer) {
-       in_map = ib->user_buffer;
+    if (info->has_user_indices) {
+       in_map = info->index.user;
     } else {
-       in_map = pipe_buffer_map(context, ib->buffer,
+       in_map = pipe_buffer_map(context, info->index.resource,
                                 PIPE_TRANSFER_READ |
                                 add_transfer_flags,
                                 &in_transfer);
diff --git a/src/gallium/auxiliary/util/u_index_modify.h b/src/gallium/auxiliary/util/u_index_modify.h
index d009199..ba96725 100644
--- a/src/gallium/auxiliary/util/u_index_modify.h
+++ b/src/gallium/auxiliary/util/u_index_modify.h
@@ -25,10 +25,9 @@
 
 struct pipe_context;
 struct pipe_resource;
-struct pipe_index_buffer;
 
 void util_shorten_ubyte_elts_to_userptr(struct pipe_context *context,
-					const struct pipe_index_buffer *ib,
+					const struct pipe_draw_info *info,
                                         unsigned add_transfer_flags,
 					int index_bias,
 					unsigned start,
@@ -36,14 +35,14 @@
 					void *out);
 
 void util_rebuild_ushort_elts_to_userptr(struct pipe_context *context,
-					 const struct pipe_index_buffer *ib,
+					 const struct pipe_draw_info *info,
                                          unsigned add_transfer_flags,
 					 int index_bias,
 					 unsigned start, unsigned count,
 					 void *out);
 
 void util_rebuild_uint_elts_to_userptr(struct pipe_context *context,
-				       const struct pipe_index_buffer *ib,
+				       const struct pipe_draw_info *info,
                                        unsigned add_transfer_flags,
 				       int index_bias,
 				       unsigned start, unsigned count,
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
index 1ca3e53..4fc683a 100644
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -194,6 +194,25 @@
 }
 
 static inline void
+pipe_vertex_buffer_unreference(struct pipe_vertex_buffer *dst)
+{
+   if (dst->is_user_buffer)
+      dst->buffer.user = NULL;
+   else
+      pipe_resource_reference(&dst->buffer.resource, NULL);
+}
+
+static inline void
+pipe_vertex_buffer_reference(struct pipe_vertex_buffer *dst,
+                             const struct pipe_vertex_buffer *src)
+{
+   pipe_vertex_buffer_unreference(dst);
+   if (!src->is_user_buffer)
+      pipe_resource_reference(&dst->buffer.resource, src->buffer.resource);
+   memcpy(dst, src, sizeof(*src));
+}
+
+static inline void
 pipe_surface_reset(struct pipe_context *ctx, struct pipe_surface* ps,
                    struct pipe_resource *pt, unsigned level, unsigned layer)
 {
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 2ab5f03..a441b54 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -605,8 +605,9 @@
 /**
  * Clamp X to [MIN, MAX].
  * This is a macro to allow float, int, uint, etc. types.
+ * A NaN X fails the (X)>(MIN) test, so NaN is (arbitrarily) turned into MIN.
  */
-#define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )
+#define CLAMP( X, MIN, MAX )  ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
 
 #define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
 #define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
diff --git a/src/gallium/auxiliary/util/u_prim_restart.c b/src/gallium/auxiliary/util/u_prim_restart.c
index e45aa56..9ff93a7 100644
--- a/src/gallium/auxiliary/util/u_prim_restart.c
+++ b/src/gallium/auxiliary/util/u_prim_restart.c
@@ -39,28 +39,26 @@
  */
 enum pipe_error
 util_translate_prim_restart_ib(struct pipe_context *context,
-                               struct pipe_index_buffer *src_buffer,
-                               struct pipe_resource **dst_buffer,
-                               unsigned num_indexes,
-                               unsigned restart_index)
+                               const struct pipe_draw_info *info,
+                               struct pipe_resource **dst_buffer)
 {
    struct pipe_screen *screen = context->screen;
    struct pipe_transfer *src_transfer = NULL, *dst_transfer = NULL;
    void *src_map = NULL, *dst_map = NULL;
-   const unsigned src_index_size = src_buffer->index_size;
+   const unsigned src_index_size = info->index_size;
    unsigned dst_index_size;
 
    /* 1-byte indexes are converted to 2-byte indexes, 4-byte stays 4-byte */
-   dst_index_size = MAX2(2, src_buffer->index_size);
+   dst_index_size = MAX2(2, info->index_size);
    assert(dst_index_size == 2 || dst_index_size == 4);
 
    /* no user buffers for now */
-   assert(src_buffer->user_buffer == NULL);
+   assert(!info->has_user_indices);
 
    /* Create new index buffer */
    *dst_buffer = pipe_buffer_create(screen, PIPE_BIND_INDEX_BUFFER,
                                     PIPE_USAGE_STREAM,
-                                    num_indexes * dst_index_size);
+                                    info->count * dst_index_size);
    if (!*dst_buffer)
       goto error;
 
@@ -71,9 +69,9 @@
       goto error;
 
    /* Map original / src index buffer */
-   src_map = pipe_buffer_map_range(context, src_buffer->buffer,
-                                   src_buffer->offset,
-                                   num_indexes * src_index_size,
+   src_map = pipe_buffer_map_range(context, info->index.resource,
+                                   info->start * src_index_size,
+                                   info->count * src_index_size,
                                    PIPE_TRANSFER_READ,
                                    &src_transfer);
    if (!src_map)
@@ -83,16 +81,16 @@
       uint8_t *src = (uint8_t *) src_map;
       uint16_t *dst = (uint16_t *) dst_map;
       unsigned i;
-      for (i = 0; i < num_indexes; i++) {
-         dst[i] = (src[i] == restart_index) ? 0xffff : src[i];
+      for (i = 0; i < info->count; i++) {
+         dst[i] = (src[i] == info->restart_index) ? 0xffff : src[i];
       }
    }
    else if (src_index_size == 2 && dst_index_size == 2) {
       uint16_t *src = (uint16_t *) src_map;
       uint16_t *dst = (uint16_t *) dst_map;
       unsigned i;
-      for (i = 0; i < num_indexes; i++) {
-         dst[i] = (src[i] == restart_index) ? 0xffff : src[i];
+      for (i = 0; i < info->count; i++) {
+         dst[i] = (src[i] == info->restart_index) ? 0xffff : src[i];
       }
    }
    else {
@@ -101,8 +99,8 @@
       unsigned i;
       assert(src_index_size == 4);
       assert(dst_index_size == 4);
-      for (i = 0; i < num_indexes; i++) {
-         dst[i] = (src[i] == restart_index) ? 0xffffffff : src[i];
+      for (i = 0; i < info->count; i++) {
+         dst[i] = (src[i] == info->restart_index) ? 0xffffffff : src[i];
       }
    }
 
@@ -177,7 +175,6 @@
  */
 enum pipe_error
 util_draw_vbo_without_prim_restart(struct pipe_context *context,
-                                   const struct pipe_index_buffer *ib,
                                    const struct pipe_draw_info *info)
 {
    const void *src_map;
@@ -186,15 +183,15 @@
    struct pipe_transfer *src_transfer = NULL;
    unsigned i, start, count;
 
-   assert(info->indexed);
+   assert(info->index_size);
    assert(info->primitive_restart);
 
    /* Get pointer to the index data */
-   if (ib->buffer) {
+   if (!info->has_user_indices) {
       /* map the index buffer (only the range we need to scan) */
-      src_map = pipe_buffer_map_range(context, ib->buffer,
-                                      ib->offset + info->start * ib->index_size,
-                                      info->count * ib->index_size,
+      src_map = pipe_buffer_map_range(context, info->index.resource,
+                                      info->start * info->index_size,
+                                      info->count * info->index_size,
                                       PIPE_TRANSFER_READ,
                                       &src_transfer);
       if (!src_map) {
@@ -202,13 +199,12 @@
       }
    }
    else {
-      if (!ib->user_buffer) {
+      if (!info->index.user) {
          debug_printf("User-space index buffer is null!");
          return PIPE_ERROR_BAD_INPUT;
       }
-      src_map = (const uint8_t *) ib->user_buffer
-         + ib->offset
-         + info->start * ib->index_size;
+      src_map = (const uint8_t *) info->index.user
+         + info->start * info->index_size;
    }
 
 #define SCAN_INDEXES(TYPE) \
@@ -231,9 +227,9 @@
       } \
    }
 
-   start = info->start;
+   start = 0;
    count = 0;
-   switch (ib->index_size) {
+   switch (info->index_size) {
    case 1:
       SCAN_INDEXES(uint8_t);
       break;
diff --git a/src/gallium/auxiliary/util/u_prim_restart.h b/src/gallium/auxiliary/util/u_prim_restart.h
index 1e98e0e..0e17ce5 100644
--- a/src/gallium/auxiliary/util/u_prim_restart.h
+++ b/src/gallium/auxiliary/util/u_prim_restart.h
@@ -38,20 +38,17 @@
 
 struct pipe_context;
 struct pipe_draw_info;
-struct pipe_index_buffer;
+union pipe_index_binding;
 struct pipe_resource;
 
 
 enum pipe_error
 util_translate_prim_restart_ib(struct pipe_context *context,
-                               struct pipe_index_buffer *src_buffer,
-                               struct pipe_resource **dst_buffer,
-                               unsigned num_indexes,
-                               unsigned restart_index);
+                               const struct pipe_draw_info *info,
+                               struct pipe_resource **dst_buffer);
 
 enum pipe_error
 util_draw_vbo_without_prim_restart(struct pipe_context *context,
-                                   const struct pipe_index_buffer *ib,
                                    const struct pipe_draw_info *info);
 
 
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 613ec4a..79331b5 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -205,6 +205,28 @@
    return pipe->create_gs_state(pipe, &state);
 }
 
+/* Emits one texture load into "out", as TXF or TEX depending on use_txf. */
+static void
+ureg_load_tex(struct ureg_program *ureg, struct ureg_dst out,
+              struct ureg_src coord, struct ureg_src sampler,
+              unsigned tex_target, bool load_level_zero, bool use_txf)
+{
+   if (!use_txf) {
+      if (load_level_zero)
+         ureg_TEX_LZ(ureg, out, tex_target, coord, sampler);
+      else
+         ureg_TEX(ureg, out, tex_target, coord, sampler);
+   } else {
+      /* The TXF path runs the coordinate through F2I into a temporary. */
+      struct ureg_dst icoord = ureg_DECL_temporary(ureg);
+      ureg_F2I(ureg, icoord, coord);
+      if (load_level_zero)
+         ureg_TXF_LZ(ureg, out, tex_target, ureg_src(icoord), sampler);
+      else
+         ureg_TXF(ureg, out, tex_target, ureg_src(icoord), sampler);
+   }
+}
+
 /**
  * Make simple fragment texture shader:
  *  IMM {0,0,0,1}                         // (if writemask != 0xf)
@@ -224,7 +246,9 @@
                                         unsigned interp_mode,
                                         unsigned writemask,
                                         enum tgsi_return_type stype,
-                                        enum tgsi_return_type dtype)
+                                        enum tgsi_return_type dtype,
+                                        bool load_level_zero,
+                                        bool use_txf)
 {
    struct ureg_program *ureg;
    struct ureg_src sampler;
@@ -265,9 +289,8 @@
                ureg_writemask(temp, writemask),
                tex_target, tex, sampler);
    else
-      ureg_TEX(ureg,
-               ureg_writemask(temp, writemask),
-               tex_target, tex, sampler);
+      ureg_load_tex(ureg, ureg_writemask(temp, writemask), tex, sampler,
+                    tex_target, load_level_zero, use_txf);
 
    if (stype != dtype) {
       if (stype == TGSI_RETURN_TYPE_SINT) {
@@ -299,13 +322,16 @@
 util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target,
                               unsigned interp_mode,
                               enum tgsi_return_type stype,
-                              enum tgsi_return_type dtype)
+                              enum tgsi_return_type dtype,
+                              bool load_level_zero,
+                              bool use_txf)
 {
    return util_make_fragment_tex_shader_writemask( pipe,
                                                    tex_target,
                                                    interp_mode,
                                                    TGSI_WRITEMASK_XYZW,
-                                                   stype, dtype );
+                                                   stype, dtype, load_level_zero,
+                                                   use_txf);
 }
 
 
@@ -316,7 +342,9 @@
 void *
 util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
                                          unsigned tex_target,
-                                         unsigned interp_mode)
+                                         unsigned interp_mode,
+                                         bool load_level_zero,
+                                         bool use_txf)
 {
    struct ureg_program *ureg;
    struct ureg_src sampler;
@@ -352,9 +380,8 @@
 
    ureg_MOV( ureg, out, imm );
 
-   ureg_TEX( ureg,
-             ureg_writemask(depth, TGSI_WRITEMASK_Z),
-             tex_target, tex, sampler );
+   ureg_load_tex(ureg, ureg_writemask(depth, TGSI_WRITEMASK_Z), tex, sampler,
+                 tex_target, load_level_zero, use_txf);
    ureg_END( ureg );
 
    return ureg_create_shader_and_destroy( ureg, pipe );
@@ -368,7 +395,9 @@
 void *
 util_make_fragment_tex_shader_writedepthstencil(struct pipe_context *pipe,
                                                 unsigned tex_target,
-                                                unsigned interp_mode)
+                                                unsigned interp_mode,
+                                                bool load_level_zero,
+                                                bool use_txf)
 {
    struct ureg_program *ureg;
    struct ureg_src depth_sampler, stencil_sampler;
@@ -413,12 +442,10 @@
 
    ureg_MOV( ureg, out, imm );
 
-   ureg_TEX( ureg,
-             ureg_writemask(depth, TGSI_WRITEMASK_Z),
-             tex_target, tex, depth_sampler );
-   ureg_TEX( ureg,
-             ureg_writemask(stencil, TGSI_WRITEMASK_Y),
-             tex_target, tex, stencil_sampler );
+   ureg_load_tex(ureg, ureg_writemask(depth, TGSI_WRITEMASK_Z), tex,
+                 depth_sampler, tex_target, load_level_zero, use_txf);
+   ureg_load_tex(ureg, ureg_writemask(stencil, TGSI_WRITEMASK_Y), tex,
+                 stencil_sampler, tex_target, load_level_zero, use_txf);
    ureg_END( ureg );
 
    return ureg_create_shader_and_destroy( ureg, pipe );
@@ -432,7 +459,9 @@
 void *
 util_make_fragment_tex_shader_writestencil(struct pipe_context *pipe,
                                            unsigned tex_target,
-                                           unsigned interp_mode)
+                                           unsigned interp_mode,
+                                           bool load_level_zero,
+                                           bool use_txf)
 {
    struct ureg_program *ureg;
    struct ureg_src stencil_sampler;
@@ -468,9 +497,8 @@
 
    ureg_MOV( ureg, out, imm );
 
-   ureg_TEX( ureg,
-             ureg_writemask(stencil, TGSI_WRITEMASK_Y),
-             tex_target, tex, stencil_sampler );
+   ureg_load_tex(ureg, ureg_writemask(stencil, TGSI_WRITEMASK_Y), tex,
+                 stencil_sampler, tex_target, load_level_zero, use_txf);
    ureg_END( ureg );
 
    return ureg_create_shader_and_destroy( ureg, pipe );
@@ -926,7 +954,7 @@
    }
 
    /* EMIT IMM[0] */
-   ureg_insn(ureg, TGSI_OPCODE_EMIT, NULL, 0, &imm, 1);
+   ureg_insn(ureg, TGSI_OPCODE_EMIT, NULL, 0, &imm, 1, 0);
 
    /* END */
    ureg_END(ureg);
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.h b/src/gallium/auxiliary/util/u_simple_shaders.h
index 0481098..de05aad 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.h
+++ b/src/gallium/auxiliary/util/u_simple_shaders.h
@@ -74,30 +74,40 @@
                                         unsigned interp_mode,
                                         unsigned writemask,
                                         enum tgsi_return_type stype,
-                                        enum tgsi_return_type dtype);
+                                        enum tgsi_return_type dtype,
+                                        bool load_level_zero,
+                                        bool use_txf);
 
 extern void *
 util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target,
                               unsigned interp_mode,
                               enum tgsi_return_type stype,
-                              enum tgsi_return_type dtype);
+                              enum tgsi_return_type dtype,
+                              bool load_level_zero,
+                              bool use_txf);
 
 extern void *
 util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
                                          unsigned tex_target,
-                                         unsigned interp_mode);
+                                         unsigned interp_mode,
+                                         bool load_level_zero,
+                                         bool use_txf);
 
 
 extern void *
 util_make_fragment_tex_shader_writedepthstencil(struct pipe_context *pipe,
                                                 unsigned tex_target,
-                                                unsigned interp_mode);
+                                                unsigned interp_mode,
+                                                bool load_level_zero,
+                                                bool use_txf);
 
 
 extern void *
 util_make_fragment_tex_shader_writestencil(struct pipe_context *pipe,
                                            unsigned tex_target,
-                                           unsigned interp_mode);
+                                           unsigned interp_mode,
+                                           bool load_level_zero,
+                                           bool use_txf);
 
 
 extern void *
diff --git a/src/gallium/auxiliary/util/u_tests.c b/src/gallium/auxiliary/util/u_tests.c
index 30eb589..7ec8eef 100644
--- a/src/gallium/auxiliary/util/u_tests.c
+++ b/src/gallium/auxiliary/util/u_tests.c
@@ -375,7 +375,7 @@
    fs = util_make_fragment_tex_shader(ctx, tgsi_tex_target,
                                       TGSI_INTERPOLATE_LINEAR,
                                       TGSI_RETURN_TYPE_FLOAT,
-                                      TGSI_RETURN_TYPE_FLOAT);
+                                      TGSI_RETURN_TYPE_FLOAT, false, false);
    cso_set_fragment_shader_handle(cso, fs);
 
    /* Vertex shader. */
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
new file mode 100644
index 0000000..cb9ea3a
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -0,0 +1,2459 @@
+/**************************************************************************
+ *
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_threaded_context.h"
+#include "util/u_cpu_detect.h"
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+
+/* Debug level: 0 = disabled, 1 = tc_assert is assert, 2 = also tc_printf etc. */
+#define TC_DEBUG 0
+
+#if TC_DEBUG >= 1
+#define tc_assert assert
+#else
+#define tc_assert(x) /* expands to nothing, so x is not evaluated */
+#endif
+
+#if TC_DEBUG >= 2
+#define tc_printf printf
+#define tc_asprintf asprintf
+#define tc_strcmp strcmp
+#else
+#define tc_printf(...)
+#define tc_asprintf(...) 0
+#define tc_strcmp(...) 0
+#endif
+
+#define TC_SENTINEL 0x5ca1ab1e
+
+enum tc_call_id { /* TC_CALL_<name> for each CALL(name) the include expands */
+#define CALL(name) TC_CALL_##name,
+#include "u_threaded_context_calls.h"
+#undef CALL
+   TC_NUM_CALLS, /* comes after every generated id; sizes execute_func[] */
+};
+
+typedef void (*tc_execute)(struct pipe_context *pipe, union tc_payload *payload);
+/* Replay handlers indexed by call_id; declared here without an initializer. */
+static const tc_execute execute_func[TC_NUM_CALLS];
+
+static void
+tc_batch_check(struct tc_batch *batch)
+{
+   tc_assert(batch->sentinel == TC_SENTINEL); /* magic value still in place */
+   tc_assert(batch->num_total_call_slots <= TC_CALLS_PER_BATCH); /* in capacity */
+}
+
+static void
+tc_debug_check(struct threaded_context *tc)
+{
+   for (unsigned slot = 0; slot < TC_MAX_BATCHES; slot++) {
+      tc_batch_check(&tc->batch_slots[slot]); /* per-batch invariants */
+      tc_assert(tc->batch_slots[slot].pipe == tc->pipe); /* same target context */
+   }
+}
+
+/* Runs all calls recorded in a batch in order, then resets it to empty. */
+static void
+tc_batch_execute(void *job, int thread_index)
+{
+   struct tc_batch *batch = job;
+   struct pipe_context *pipe = batch->pipe;
+   struct tc_call *cur = batch->call;
+   struct tc_call *const end = cur + batch->num_total_call_slots;
+
+   tc_batch_check(batch);
+   while (cur != end) {
+      tc_assert(cur->sentinel == TC_SENTINEL);
+      execute_func[cur->call_id](pipe, &cur->payload);
+      cur += cur->num_call_slots; /* a call may span several slots */
+   }
+   tc_batch_check(batch);
+   batch->num_total_call_slots = 0;
+}
+
+/* Hands the batch being filled to the queue and advances to the next slot. */
+static void
+tc_batch_flush(struct threaded_context *tc)
+{
+   struct tc_batch *batch = &tc->batch_slots[tc->next];
+
+   tc_assert(batch->num_total_call_slots != 0);
+   tc_batch_check(batch);
+   tc_debug_check(tc);
+   p_atomic_add(&tc->num_offloaded_slots, batch->num_total_call_slots);
+
+   util_queue_add_job(&tc->queue, batch, &batch->fence, tc_batch_execute, NULL);
+   tc->last = tc->next;
+   tc->next = (tc->next + 1) % TC_MAX_BATCHES;
+}
+
+/* Reserves slots for a call with "payload_size" bytes of payload in the
+ * current batch, flushing the batch first if the call would not fit, and
+ * returns the payload to fill in. All other "add" helpers go through it.
+ */
+static union tc_payload *
+tc_add_sized_call(struct threaded_context *tc, enum tc_call_id id,
+                  unsigned payload_size)
+{
+   struct tc_batch *next = &tc->batch_slots[tc->next];
+   unsigned total_size = offsetof(struct tc_call, payload) + payload_size;
+   unsigned num_call_slots = DIV_ROUND_UP(total_size, sizeof(struct tc_call));
+
+   tc_debug_check(tc);
+
+   if (unlikely(next->num_total_call_slots + num_call_slots > TC_CALLS_PER_BATCH)) {
+      tc_batch_flush(tc);
+      next = &tc->batch_slots[tc->next]; /* tc_batch_flush advanced tc->next */
+      tc_assert(next->num_total_call_slots == 0);
+   }
+
+   tc_assert(util_queue_fence_is_signalled(&next->fence));
+
+   struct tc_call *call = &next->call[next->num_total_call_slots];
+   next->num_total_call_slots += num_call_slots;
+
+   call->sentinel = TC_SENTINEL; /* checked again when the call is replayed */
+   call->call_id = id;
+   call->num_call_slots = num_call_slots;
+
+   tc_debug_check(tc);
+   return &call->payload;
+}
+
+#define tc_add_struct_typed_call(tc, execute, type) \
+   ((struct type*)tc_add_sized_call(tc, execute, sizeof(struct type)))
+/* Same, plus room for "num_slots" elements of the type's trailing slot[]. */
+#define tc_add_slot_based_call(tc, execute, type, num_slots) \
+   ((struct type*)tc_add_sized_call(tc, execute, \
+                                    sizeof(struct type) + \
+                                    sizeof(((struct type*)NULL)->slot[0]) * \
+                                    (num_slots)))
+
+static union tc_payload *
+tc_add_small_call(struct threaded_context *tc, enum tc_call_id id)
+{
+   return tc_add_sized_call(tc, id, 0); /* payload_size 0: no extra bytes */
+}
+
+static void
+_tc_sync(struct threaded_context *tc, const char *info, const char *func)
+{
+   struct tc_batch *last = &tc->batch_slots[tc->last];
+   struct tc_batch *next = &tc->batch_slots[tc->next];
+   bool synced = false;
+
+   tc_debug_check(tc);
+
+   /* Wait for the most recently queued batch if its fence is unsignalled... */
+   if (!util_queue_fence_is_signalled(&last->fence)) {
+      util_queue_fence_wait(&last->fence);
+      synced = true;
+   }
+
+   tc_debug_check(tc);
+
+   /* ...then run the not-yet-flushed batch directly on the calling thread. */
+   if (next->num_total_call_slots) {
+      p_atomic_add(&tc->num_direct_slots, next->num_total_call_slots);
+      tc_batch_execute(next, 0);
+      synced = true;
+   }
+
+   if (synced) {
+      p_atomic_inc(&tc->num_syncs);
+
+      if (tc_strcmp(func, "tc_destroy") != 0) /* no message for tc_destroy */
+         tc_printf("sync %s %s\n", func, info);
+   }
+
+   tc_debug_check(tc);
+}
+
+#define tc_sync(tc) _tc_sync(tc, "", __func__) /* caller's name, empty note */
+#define tc_sync_msg(tc, info) _tc_sync(tc, info, __func__) /* with debug note */
+
+static void
+tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource *src)
+{
+   *dst = NULL; /* presumably so the old *dst is never treated as a reference */
+   pipe_resource_reference(dst, src);
+}
+
+void
+threaded_resource_init(struct pipe_resource *res)
+{
+   struct threaded_resource *tres = threaded_resource(res);
+
+   tres->latest = &tres->b; /* "latest" starts out as the resource itself */
+   util_range_init(&tres->valid_buffer_range);
+   tres->base_valid_buffer_range = &tres->valid_buffer_range; /* own range */
+   tres->is_shared = false;
+   tres->is_user_ptr = false;
+}
+
+void
+threaded_resource_deinit(struct pipe_resource *res)
+{
+   struct threaded_resource *tres = threaded_resource(res);
+
+   if (tres->latest != &tres->b) /* "latest" refers to a different resource */
+      pipe_resource_reference(&tres->latest, NULL);
+   util_range_destroy(&tres->valid_buffer_range);
+}
+
+struct pipe_context *
+threaded_context_unwrap_sync(struct pipe_context *pipe)
+{
+   /* NULL or no priv pointer: hand the argument back without syncing. */
+   if (pipe == NULL || pipe->priv == NULL)
+      return pipe;
+   tc_sync(threaded_context(pipe));
+   return (struct pipe_context *)pipe->priv;
+}
+
+
+/********************************************************************
+ * simple functions
+ */
+/* Defines tc_<func> (copies one argument into the batch) and tc_call_<func>. */
+#define TC_FUNC1(func, m_payload, qualifier, type, deref, deref2) \
+   static void \
+   tc_call_##func(struct pipe_context *pipe, union tc_payload *payload) \
+   { \
+      pipe->func(pipe, deref2((type*)payload)); \
+   } \
+   \
+   static void \
+   tc_##func(struct pipe_context *_pipe, qualifier type deref param) \
+   { \
+      struct threaded_context *tc = threaded_context(_pipe); \
+      type *p = (type*)tc_add_sized_call(tc, TC_CALL_##func, sizeof(type)); \
+      *p = deref(param); \
+   }
+
+TC_FUNC1(set_active_query_state, flags, , boolean, , *)
+/* "const, <type>, *, " entries take a pointer; its target is copied. */
+TC_FUNC1(set_blend_color, blend_color, const, struct pipe_blend_color, *, )
+TC_FUNC1(set_stencil_ref, stencil_ref, const, struct pipe_stencil_ref, *, )
+TC_FUNC1(set_clip_state, clip_state, const, struct pipe_clip_state, *, )
+TC_FUNC1(set_sample_mask, sample_mask, , unsigned, , *)
+TC_FUNC1(set_min_samples, min_samples, , unsigned, , *)
+TC_FUNC1(set_polygon_stipple, polygon_stipple, const, struct pipe_poly_stipple, *, )
+
+TC_FUNC1(texture_barrier, flags, , unsigned, , *)
+TC_FUNC1(memory_barrier, flags, , unsigned, , *)
+
+
+/********************************************************************
+ * queries
+ */
+
+static struct pipe_query *
+tc_create_query(struct pipe_context *_pipe, unsigned query_type,
+                unsigned index)
+{
+   /* Not recorded into a batch: forwarded to the wrapped context at once. */
+   struct pipe_context *driver = threaded_context(_pipe)->pipe;
+
+   return driver->create_query(driver, query_type, index);
+}
+
+static struct pipe_query *
+tc_create_batch_query(struct pipe_context *_pipe, unsigned num_queries,
+                      unsigned *query_types)
+{
+   /* Not recorded into a batch: forwarded to the wrapped context at once. */
+   struct pipe_context *driver = threaded_context(_pipe)->pipe;
+
+   return driver->create_batch_query(driver, num_queries, query_types);
+}
+
+static void
+tc_call_destroy_query(struct pipe_context *pipe, union tc_payload *payload)
+{
+   pipe->destroy_query(pipe, payload->query); /* replays tc_destroy_query */
+}
+
+static void
+tc_destroy_query(struct pipe_context *_pipe, struct pipe_query *query)
+{
+   struct threaded_query *tq = threaded_query(query);
+   union tc_payload *payload;
+
+   if (tq->head_unflushed.next) /* still linked into a list: unlink it */
+      LIST_DEL(&tq->head_unflushed);
+   payload = tc_add_small_call(threaded_context(_pipe), TC_CALL_destroy_query);
+   payload->query = query;
+}
+
+static void
+tc_call_begin_query(struct pipe_context *pipe, union tc_payload *payload)
+{
+   pipe->begin_query(pipe, payload->query); /* return value is dropped */
+}
+
+static boolean
+tc_begin_query(struct pipe_context *_pipe, struct pipe_query *query)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+
+   /* Only recorded here, so the wrapped call's result is not available. */
+   tc_add_small_call(tc, TC_CALL_begin_query)->query = query;
+   return true;
+}
+
+static void
+tc_call_end_query(struct pipe_context *pipe, union tc_payload *payload)
+{
+   pipe->end_query(pipe, payload->query); /* return value is dropped */
+}
+
+static bool
+tc_end_query(struct pipe_context *_pipe, struct pipe_query *query)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct threaded_query *tq = threaded_query(query);
+
+   tc_add_small_call(tc, TC_CALL_end_query)->query = query;
+
+   /* Mark the query unflushed and link it into the list only once. */
+   tq->flushed = false;
+   if (tq->head_unflushed.next == NULL)
+      LIST_ADD(&tq->head_unflushed, &tc->unflushed_queries);
+
+   return true; /* the wrapped call's result is not available here */
+}
+
+static boolean
+tc_get_query_result(struct pipe_context *_pipe,
+                    struct pipe_query *query, boolean wait,
+                    union pipe_query_result *result)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct threaded_query *tq = threaded_query(query);
+   struct pipe_context *pipe = tc->pipe;
+
+   /* Recorded calls may not have run yet: sync before asking the driver. */
+   if (!tq->flushed)
+      tc_sync_msg(tc, wait ? "wait" : "nowait");
+
+   if (!pipe->get_query_result(pipe, query, wait, result))
+      return false;
+
+   tq->flushed = true;
+   if (tq->head_unflushed.next)
+      LIST_DEL(&tq->head_unflushed);
+   return true;
+}
+
+struct tc_query_result_resource {
+   struct pipe_query *query;
+   boolean wait;
+   enum pipe_query_value_type result_type;
+   int index;
+   struct pipe_resource *resource; /* holds a reference until replayed */
+   unsigned offset;
+};
+
+static void
+tc_call_get_query_result_resource(struct pipe_context *pipe,
+                                  union tc_payload *payload)
+{
+   struct tc_query_result_resource *p = (struct tc_query_result_resource *)payload;
+
+   pipe->get_query_result_resource(pipe, p->query, p->wait, p->result_type,
+                                   p->index, p->resource, p->offset);
+   pipe_resource_reference(&p->resource, NULL); /* drop the recorded reference */
+}
+
+static void
+tc_get_query_result_resource(struct pipe_context *_pipe,
+                             struct pipe_query *query, boolean wait,
+                             enum pipe_query_value_type result_type, int index,
+                             struct pipe_resource *resource, unsigned offset)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_query_result_resource *rec =
+      tc_add_struct_typed_call(tc, TC_CALL_get_query_result_resource,
+                               tc_query_result_resource);
+
+   rec->query = query;
+   rec->wait = wait;
+   rec->result_type = result_type;
+   rec->index = index;
+   rec->offset = offset;
+   tc_set_resource_reference(&rec->resource, resource); /* dropped on replay */
+}
+
+struct tc_render_condition {
+   struct pipe_query *query; /* plain pointer; no reference is taken */
+   bool condition;
+   unsigned mode; /* holds the enum pipe_render_cond_flag value */
+};
+
+static void
+tc_call_render_condition(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_render_condition *p = (struct tc_render_condition *)payload;
+   pipe->render_condition(pipe, p->query, p->condition, p->mode);
+}
+
+static void
+tc_render_condition(struct pipe_context *_pipe,
+                    struct pipe_query *query, boolean condition,
+                    enum pipe_render_cond_flag mode)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_render_condition *rec = tc_add_struct_typed_call(
+      tc, TC_CALL_render_condition, tc_render_condition);
+
+   rec->query = query; /* stored as-is for the replay handler */
+   rec->condition = condition;
+   rec->mode = mode;
+}
+
+
+/********************************************************************
+ * constant (immutable) states
+ */
+
+#define TC_CSO_CREATE(name, sname) \
+   static void * \
+   tc_create_##name##_state(struct pipe_context *_pipe, \
+                            const struct pipe_##sname##_state *state) \
+   { \
+      struct pipe_context *pipe = threaded_context(_pipe)->pipe; \
+      return pipe->create_##name##_state(pipe, state); \
+   }
+/* Create calls the wrapped context directly; bind/delete record the CSO. */
+#define TC_CSO_BIND(name) TC_FUNC1(bind_##name##_state, cso, , void *, , *)
+#define TC_CSO_DELETE(name) TC_FUNC1(delete_##name##_state, cso, , void *, , *)
+
+#define TC_CSO_WHOLE2(name, sname) \
+   TC_CSO_CREATE(name, sname) \
+   TC_CSO_BIND(name) \
+   TC_CSO_DELETE(name)
+
+#define TC_CSO_WHOLE(name) TC_CSO_WHOLE2(name, name)
+
+TC_CSO_WHOLE(blend)
+TC_CSO_WHOLE(rasterizer)
+TC_CSO_WHOLE(depth_stencil_alpha)
+TC_CSO_WHOLE(compute)
+TC_CSO_WHOLE2(fs, shader)
+TC_CSO_WHOLE2(vs, shader)
+TC_CSO_WHOLE2(gs, shader)
+TC_CSO_WHOLE2(tcs, shader)
+TC_CSO_WHOLE2(tes, shader)
+TC_CSO_CREATE(sampler, sampler)
+TC_CSO_DELETE(sampler)
+TC_CSO_BIND(vertex_elements)
+TC_CSO_DELETE(vertex_elements)
+
+static void *
+tc_create_vertex_elements_state(struct pipe_context *_pipe, unsigned count,
+                                const struct pipe_vertex_element *elems)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   /* Creation is forwarded to the wrapped context right away. */
+   return tc->pipe->create_vertex_elements_state(tc->pipe, count, elems);
+}
+
+struct tc_sampler_states {
+   ubyte shader, start, count; /* the unsigned arguments are narrowed to ubyte */
+   void *slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_bind_sampler_states(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_sampler_states *p = (struct tc_sampler_states *)payload;
+   pipe->bind_sampler_states(pipe, p->shader, p->start, p->count, p->slot);
+}
+
+/* Records bind_sampler_states with a copy of the "count" state pointers. */
+static void
+tc_bind_sampler_states(struct pipe_context *_pipe,
+                       enum pipe_shader_type shader,
+                       unsigned start, unsigned count, void **states)
+{
+   if (count == 0)
+      return;
+
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_sampler_states *rec = tc_add_slot_based_call(
+      tc, TC_CALL_bind_sampler_states, tc_sampler_states, count);
+   rec->shader = shader;
+   rec->start = start;
+   rec->count = count;
+   memcpy(rec->slot, states, sizeof(*states) * count);
+}
+
+
+/********************************************************************
+ * immediate states
+ */
+
+static void
+tc_call_set_framebuffer_state(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct pipe_framebuffer_state *fb = (struct pipe_framebuffer_state *)payload;
+
+   pipe->set_framebuffer_state(pipe, fb);
+
+   /* Release the surface references held by the recorded state. */
+   for (unsigned i = 0, n = fb->nr_cbufs; i < n; i++)
+      pipe_surface_reference(&fb->cbufs[i], NULL);
+   pipe_surface_reference(&fb->zsbuf, NULL);
+}
+
+static void
+tc_set_framebuffer_state(struct pipe_context *_pipe,
+                         const struct pipe_framebuffer_state *fb)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_framebuffer_state *p =
+      tc_add_struct_typed_call(tc, TC_CALL_set_framebuffer_state,
+                               pipe_framebuffer_state);
+   unsigned nr_cbufs = fb->nr_cbufs;
+
+   p->width = fb->width;
+   p->height = fb->height;
+   p->samples = fb->samples;
+   p->layers = fb->layers;
+   p->nr_cbufs = nr_cbufs;
+   /* Only the first nr_cbufs colour slots are written and referenced. */
+   for (unsigned i = 0; i < nr_cbufs; i++) {
+      p->cbufs[i] = NULL; /* cleared before pipe_surface_reference reads it */
+      pipe_surface_reference(&p->cbufs[i], fb->cbufs[i]);
+   }
+   p->zsbuf = NULL;
+   pipe_surface_reference(&p->zsbuf, fb->zsbuf);
+}
+
+static void
+tc_call_set_tess_state(struct pipe_context *pipe, union tc_payload *payload)
+{
+   float *p = (float*)payload; /* 6 floats written by tc_set_tess_state */
+   pipe->set_tess_state(pipe, p, p + 4); /* [0..3] outer, [4..5] inner */
+}
+
+static void
+tc_set_tess_state(struct pipe_context *_pipe,
+                  const float default_outer_level[4],
+                  const float default_inner_level[2])
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   float *levels = (float *)tc_add_sized_call(tc, TC_CALL_set_tess_state,
+                                              6 * sizeof(float));
+   /* Payload layout: 4 outer levels followed by 2 inner levels. */
+   memcpy(&levels[0], default_outer_level, sizeof(float) * 4);
+   memcpy(&levels[4], default_inner_level, sizeof(float) * 2);
+}
+
+struct tc_constant_buffer {
+   ubyte shader, index;
+   struct pipe_constant_buffer cb;
+};
+
+/* Replays a recorded set_constant_buffer call. */
+static void
+tc_call_set_constant_buffer(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_constant_buffer *rec = (struct tc_constant_buffer *)payload;
+   struct pipe_constant_buffer *cb = &rec->cb;
+
+   pipe->set_constant_buffer(pipe, rec->shader, rec->index, cb);
+   /* The record carried its own reference on the buffer, if any. */
+   pipe_resource_reference(&cb->buffer, NULL);
+}
+
+static void
+tc_set_constant_buffer(struct pipe_context *_pipe,
+                       enum pipe_shader_type shader, uint index,
+                       const struct pipe_constant_buffer *cb)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_resource *buffer = NULL;
+   unsigned offset; /* only written and read on the user_buffer path */
+
+   /* This must be done before adding set_constant_buffer, because it could
+    * generate e.g. transfer_unmap and flush partially-uninitialized
+    * set_constant_buffer to the driver if it was done afterwards.
+    */
+   if (cb && cb->user_buffer) {
+      u_upload_data(tc->base.const_uploader, 0, cb->buffer_size, 64,
+                    cb->user_buffer, &offset, &buffer);
+   }
+
+   struct tc_constant_buffer *p =
+      tc_add_struct_typed_call(tc, TC_CALL_set_constant_buffer,
+                               tc_constant_buffer);
+   p->shader = shader;
+   p->index = index;
+
+   if (cb) {
+      if (cb->user_buffer) {
+         p->cb.buffer_size = cb->buffer_size;
+         p->cb.user_buffer = NULL; /* the uploaded "buffer" is recorded instead */
+         p->cb.buffer_offset = offset;
+         p->cb.buffer = buffer; /* pointer from u_upload_data, stored as-is */
+      } else {
+         tc_set_resource_reference(&p->cb.buffer,
+                                   cb->buffer);
+         memcpy(&p->cb, cb, sizeof(*cb)); /* writes the same .buffer pointer */
+      }
+   } else {
+      memset(&p->cb, 0, sizeof(*cb)); /* NULL cb is recorded as all zeros */
+   }
+}
+
+struct tc_scissors {
+   ubyte start, count; /* the unsigned arguments are narrowed to ubyte */
+   struct pipe_scissor_state slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_set_scissor_states(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_scissors *p = (struct tc_scissors *)payload;
+   pipe->set_scissor_states(pipe, p->start, p->count, p->slot);
+}
+
+static void
+tc_set_scissor_states(struct pipe_context *_pipe,
+                      unsigned start, unsigned count,
+                      const struct pipe_scissor_state *states)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_scissors *rec = tc_add_slot_based_call(
+      tc, TC_CALL_set_scissor_states, tc_scissors, count);
+
+   rec->start = start;
+   rec->count = count;
+   memcpy(rec->slot, states, sizeof(*states) * count); /* copy by value */
+}
+
+struct tc_viewports {
+   ubyte start, count; /* the unsigned arguments are narrowed to ubyte */
+   struct pipe_viewport_state slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_set_viewport_states(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_viewports *p = (struct tc_viewports *)payload;
+   pipe->set_viewport_states(pipe, p->start, p->count, p->slot);
+}
+
+static void
+tc_set_viewport_states(struct pipe_context *_pipe,
+                       unsigned start, unsigned count,
+                       const struct pipe_viewport_state *states)
+{
+   if (count == 0)
+      return; /* nothing is recorded for an empty range */
+
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_viewports *rec = tc_add_slot_based_call(
+      tc, TC_CALL_set_viewport_states, tc_viewports, count);
+
+   rec->start = start;
+   rec->count = count;
+   memcpy(rec->slot, states, sizeof(*states) * count);
+}
+
+struct tc_window_rects {
+   bool include;
+   ubyte count; /* the unsigned argument is narrowed to ubyte */
+   struct pipe_scissor_state slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_set_window_rectangles(struct pipe_context *pipe,
+                              union tc_payload *payload)
+{
+   struct tc_window_rects *p = (struct tc_window_rects *)payload;
+   pipe->set_window_rectangles(pipe, p->include, p->count, p->slot);
+}
+
+static void
+tc_set_window_rectangles(struct pipe_context *_pipe, boolean include,
+                         unsigned count,
+                         const struct pipe_scissor_state *rects)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_window_rects *rec = tc_add_slot_based_call(
+      tc, TC_CALL_set_window_rectangles, tc_window_rects, count);
+
+   rec->include = include;
+   rec->count = count; /* no early return here: count == 0 is recorded too */
+   memcpy(rec->slot, rects, sizeof(*rects) * count);
+}
+
+struct tc_sampler_views {
+   ubyte shader, start, count; /* the unsigned arguments are narrowed to ubyte */
+   struct pipe_sampler_view *slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_set_sampler_views(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_sampler_views *p = (struct tc_sampler_views *)payload;
+   unsigned count = p->count;
+
+   pipe->set_sampler_views(pipe, p->shader, p->start, p->count, p->slot);
+   for (unsigned i = 0; i < count; i++) /* drop the recorded references */
+      pipe_sampler_view_reference(&p->slot[i], NULL);
+}
+
+static void
+tc_set_sampler_views(struct pipe_context *_pipe,
+                     enum pipe_shader_type shader,
+                     unsigned start, unsigned count,
+                     struct pipe_sampler_view **views)
+{ /* Records a set_sampler_views call; views are referenced into the call's slots. */
+   if (!count)
+      return; /* empty range: nothing is enqueued */
+
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_sampler_views *p =
+      tc_add_slot_based_call(tc, TC_CALL_set_sampler_views, tc_sampler_views, count);
+
+   p->shader = shader;
+   p->start = start;
+   p->count = count;
+
+   if (views) {
+      for (unsigned i = 0; i < count; i++) {
+         p->slot[i] = NULL; /* presumably so the reference helper sees no stale pointer */
+         pipe_sampler_view_reference(&p->slot[i], views[i]);
+      }
+   } else {
+      memset(p->slot, 0, count * sizeof(views[0])); /* views == NULL: every slot becomes NULL */
+   }
+}
+
+struct tc_shader_images {
+   ubyte shader, start, count; /* arguments forwarded to set_shader_images */
+   bool unbind; /* true when the caller passed images == NULL */
+   struct pipe_image_view slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_set_shader_images(struct pipe_context *pipe, union tc_payload *payload)
+{ /* Replays the recorded call; an unbind passes NULL instead of slot[]. */
+   struct tc_shader_images *p = (struct tc_shader_images *)payload;
+   unsigned count = p->count;
+
+   if (p->unbind) {
+      pipe->set_shader_images(pipe, p->shader, p->start, p->count, NULL);
+      return; /* an unbind call was recorded with zero slots, nothing to release */
+   }
+
+   pipe->set_shader_images(pipe, p->shader, p->start, p->count, p->slot);
+
+   for (unsigned i = 0; i < count; i++)
+      pipe_resource_reference(&p->slot[i].resource, NULL); /* release the queued reference */
+}
+
+static void
+tc_set_shader_images(struct pipe_context *_pipe,
+                     enum pipe_shader_type shader,
+                     unsigned start, unsigned count,
+                     const struct pipe_image_view *images)
+{ /* Records a set_shader_images call; images == NULL is recorded as an unbind. */
+   if (!count)
+      return; /* empty range: nothing is enqueued */
+
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_shader_images *p =
+      tc_add_slot_based_call(tc, TC_CALL_set_shader_images, tc_shader_images,
+                             images ? count : 0); /* an unbind needs no slots */
+
+   p->shader = shader;
+   p->start = start;
+   p->count = count;
+   p->unbind = images == NULL;
+
+   if (images) {
+      for (unsigned i = 0; i < count; i++) {
+         tc_set_resource_reference(&p->slot[i].resource, images[i].resource);
+
+         if (images[i].access & PIPE_IMAGE_ACCESS_WRITE && /* writable buffer image: */
+             images[i].resource &&
+             images[i].resource->target == PIPE_BUFFER) {
+            struct threaded_resource *tres =
+               threaded_resource(images[i].resource);
+
+            util_range_add(&tres->valid_buffer_range, images[i].u.buf.offset, /* add its range */
+                           images[i].u.buf.offset + images[i].u.buf.size);
+         }
+      }
+      memcpy(p->slot, images, count * sizeof(images[0])); /* copies every field, incl. the same resource pointers */
+   }
+}
+
+struct tc_shader_buffers {
+   ubyte shader, start, count; /* arguments forwarded to set_shader_buffers */
+   bool unbind; /* true when the caller passed buffers == NULL */
+   struct pipe_shader_buffer slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_set_shader_buffers(struct pipe_context *pipe, union tc_payload *payload)
+{ /* Replays the recorded call; an unbind passes NULL instead of slot[]. */
+   struct tc_shader_buffers *p = (struct tc_shader_buffers *)payload;
+   unsigned count = p->count;
+
+   if (p->unbind) {
+      pipe->set_shader_buffers(pipe, p->shader, p->start, p->count, NULL);
+      return; /* an unbind call was recorded with zero slots, nothing to release */
+   }
+
+   pipe->set_shader_buffers(pipe, p->shader, p->start, p->count, p->slot);
+
+   for (unsigned i = 0; i < count; i++)
+      pipe_resource_reference(&p->slot[i].buffer, NULL); /* release the queued reference */
+}
+
+static void
+tc_set_shader_buffers(struct pipe_context *_pipe,
+                      enum pipe_shader_type shader,
+                      unsigned start, unsigned count,
+                      const struct pipe_shader_buffer *buffers)
+{ /* Records a set_shader_buffers call; buffers == NULL is recorded as an unbind. */
+   if (!count)
+      return; /* empty range: nothing is enqueued */
+
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_shader_buffers *p =
+      tc_add_slot_based_call(tc, TC_CALL_set_shader_buffers, tc_shader_buffers,
+                             buffers ? count : 0); /* an unbind needs no slots */
+
+   p->shader = shader;
+   p->start = start;
+   p->count = count;
+   p->unbind = buffers == NULL;
+
+   if (buffers) {
+      for (unsigned i = 0; i < count; i++) {
+         struct pipe_shader_buffer *dst = &p->slot[i];
+         const struct pipe_shader_buffer *src = buffers + i;
+
+         tc_set_resource_reference(&dst->buffer, src->buffer);
+         dst->buffer_offset = src->buffer_offset; /* copied field by field, not memcpy'd */
+         dst->buffer_size = src->buffer_size;
+
+         if (src->buffer) { /* every bound buffer's range is added to its valid range */
+            struct threaded_resource *tres = threaded_resource(src->buffer);
+
+            util_range_add(&tres->valid_buffer_range, src->buffer_offset,
+                           src->buffer_offset + src->buffer_size);
+         }
+      }
+   }
+}
+
+struct tc_vertex_buffers {
+   ubyte start, count; /* arguments forwarded to set_vertex_buffers */
+   bool unbind; /* true when the caller passed buffers == NULL */
+   struct pipe_vertex_buffer slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_set_vertex_buffers(struct pipe_context *pipe, union tc_payload *payload)
+{ /* Replays the recorded call; an unbind passes NULL instead of slot[]. */
+   struct tc_vertex_buffers *p = (struct tc_vertex_buffers *)payload;
+   unsigned count = p->count;
+
+   if (p->unbind) {
+      pipe->set_vertex_buffers(pipe, p->start, count, NULL);
+      return; /* an unbind call was recorded with zero slots, nothing to release */
+   }
+
+   for (unsigned i = 0; i < count; i++)
+      tc_assert(!p->slot[i].is_user_buffer); /* the recording side always stores false */
+
+   pipe->set_vertex_buffers(pipe, p->start, count, p->slot);
+   for (unsigned i = 0; i < count; i++)
+      pipe_resource_reference(&p->slot[i].buffer.resource, NULL); /* release the queued reference */
+}
+
+static void
+tc_set_vertex_buffers(struct pipe_context *_pipe,
+                      unsigned start, unsigned count,
+                      const struct pipe_vertex_buffer *buffers)
+{ /* Records a set_vertex_buffers call; buffers == NULL is recorded as an unbind. */
+   struct threaded_context *tc = threaded_context(_pipe);
+
+   if (!count)
+      return; /* empty range: nothing is enqueued */
+
+   if (buffers) {
+      struct tc_vertex_buffers *p =
+         tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, count);
+      p->start = start;
+      p->count = count;
+      p->unbind = false;
+
+      for (unsigned i = 0; i < count; i++) {
+         struct pipe_vertex_buffer *dst = &p->slot[i];
+         const struct pipe_vertex_buffer *src = buffers + i;
+
+         tc_assert(!src->is_user_buffer); /* only resource-backed buffers are handled here */
+         dst->stride = src->stride;
+         dst->is_user_buffer = false;
+         tc_set_resource_reference(&dst->buffer.resource,
+                                   src->buffer.resource);
+         dst->buffer_offset = src->buffer_offset;
+      }
+   } else {
+      struct tc_vertex_buffers *p =
+         tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, 0); /* no slots */
+      p->start = start;
+      p->count = count;
+      p->unbind = true;
+   }
+}
+
+struct tc_stream_outputs {
+   unsigned count; /* number of valid entries in targets[] and offsets[] */
+   struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
+   unsigned offsets[PIPE_MAX_SO_BUFFERS];
+};
+
+static void
+tc_call_set_stream_output_targets(struct pipe_context *pipe, union tc_payload *payload)
+{ /* Replays the recorded call, then drops the references held in targets[]. */
+   struct tc_stream_outputs *p = (struct tc_stream_outputs *)payload;
+   unsigned count = p->count;
+
+   pipe->set_stream_output_targets(pipe, count, p->targets, p->offsets);
+   for (unsigned i = 0; i < count; i++)
+      pipe_so_target_reference(&p->targets[i], NULL); /* release the queued reference */
+}
+
+static void
+tc_set_stream_output_targets(struct pipe_context *_pipe,
+                             unsigned count,
+                             struct pipe_stream_output_target **tgs,
+                             const unsigned *offsets)
+{ /* Records the call; count == 0 is still enqueued. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_stream_outputs *p =
+      tc_add_struct_typed_call(tc, TC_CALL_set_stream_output_targets,
+                               tc_stream_outputs);
+
+   for (unsigned i = 0; i < count; i++) { /* NOTE(review): assumes count <= PIPE_MAX_SO_BUFFERS; unchecked */
+      p->targets[i] = NULL; /* presumably so the reference helper sees no stale pointer */
+      pipe_so_target_reference(&p->targets[i], tgs[i]);
+   }
+   p->count = count;
+   memcpy(p->offsets, offsets, count * sizeof(unsigned)); /* private copy of the offsets */
+}
+
+static void
+tc_set_compute_resources(struct pipe_context *_pipe, unsigned start,
+                         unsigned count, struct pipe_surface **resources)
+{ /* Not queued: calls the driver directly after tc_sync(). */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+   pipe->set_compute_resources(pipe, start, count, resources);
+}
+
+static void
+tc_set_global_binding(struct pipe_context *_pipe, unsigned first,
+                      unsigned count, struct pipe_resource **resources,
+                      uint32_t **handles)
+{ /* Not queued: calls the driver directly after tc_sync(). */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+   pipe->set_global_binding(pipe, first, count, resources, handles);
+}
+
+
+/********************************************************************
+ * views
+ */
+
+static struct pipe_surface *
+tc_create_surface(struct pipe_context *_pipe,
+                  struct pipe_resource *resource,
+                  const struct pipe_surface *surf_tmpl)
+{ /* Calls the driver directly, without tc_sync() and without queuing. */
+   struct pipe_context *pipe = threaded_context(_pipe)->pipe;
+   struct pipe_surface *view =
+         pipe->create_surface(pipe, resource, surf_tmpl);
+
+   if (view)
+      view->context = _pipe; /* point the surface at the threaded context, not the driver's */
+   return view;
+}
+
+static void
+tc_surface_destroy(struct pipe_context *_pipe,
+                   struct pipe_surface *surf)
+{ /* Forwards straight to the driver, without tc_sync() and without queuing. */
+   struct pipe_context *pipe = threaded_context(_pipe)->pipe;
+
+   pipe->surface_destroy(pipe, surf);
+}
+
+static struct pipe_sampler_view *
+tc_create_sampler_view(struct pipe_context *_pipe,
+                       struct pipe_resource *resource,
+                       const struct pipe_sampler_view *templ)
+{ /* Calls the driver directly, without tc_sync() and without queuing. */
+   struct pipe_context *pipe = threaded_context(_pipe)->pipe;
+   struct pipe_sampler_view *view =
+         pipe->create_sampler_view(pipe, resource, templ);
+
+   if (view)
+      view->context = _pipe; /* point the view at the threaded context, not the driver's */
+   return view;
+}
+
+static void
+tc_sampler_view_destroy(struct pipe_context *_pipe,
+                        struct pipe_sampler_view *view)
+{ /* Forwards straight to the driver, without tc_sync() and without queuing. */
+   struct pipe_context *pipe = threaded_context(_pipe)->pipe;
+
+   pipe->sampler_view_destroy(pipe, view);
+}
+
+static struct pipe_stream_output_target *
+tc_create_stream_output_target(struct pipe_context *_pipe,
+                               struct pipe_resource *res,
+                               unsigned buffer_offset,
+                               unsigned buffer_size)
+{ /* Creates the target in the driver after tc_sync(); returns NULL if the driver does. */
+   struct pipe_context *pipe = threaded_context(_pipe)->pipe;
+   struct threaded_resource *tres = threaded_resource(res);
+   struct pipe_stream_output_target *view;
+
+   tc_sync(threaded_context(_pipe)); /* presumably waits for queued calls to finish - confirm */
+   util_range_add(&tres->valid_buffer_range, buffer_offset, /* done even if creation then fails */
+                  buffer_offset + buffer_size);
+
+   view = pipe->create_stream_output_target(pipe, res, buffer_offset,
+                                            buffer_size);
+   if (view)
+      view->context = _pipe; /* point the target at the threaded context, not the driver's */
+   return view;
+}
+
+static void
+tc_stream_output_target_destroy(struct pipe_context *_pipe,
+                                struct pipe_stream_output_target *target)
+{ /* Forwards straight to the driver, without tc_sync() and without queuing. */
+   struct pipe_context *pipe = threaded_context(_pipe)->pipe;
+
+   pipe->stream_output_target_destroy(pipe, target);
+}
+
+
+/********************************************************************
+ * bindless
+ */
+
+static uint64_t
+tc_create_texture_handle(struct pipe_context *_pipe,
+                         struct pipe_sampler_view *view,
+                         const struct pipe_sampler_state *state)
+{ /* Not queued: the handle is returned to the caller, so the driver is called directly. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+   return pipe->create_texture_handle(pipe, view, state);
+}
+
+static void
+tc_call_delete_texture_handle(struct pipe_context *pipe,
+                              union tc_payload *payload)
+{ /* Replays a recorded delete_texture_handle call on "pipe". */
+   pipe->delete_texture_handle(pipe, payload->handle);
+}
+
+static void
+tc_delete_texture_handle(struct pipe_context *_pipe, uint64_t handle)
+{ /* Records the call; the handle is the only argument stored. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   union tc_payload *payload =
+      tc_add_small_call(tc, TC_CALL_delete_texture_handle);
+
+   payload->handle = handle;
+}
+
+struct tc_make_texture_handle_resident
+{ /* Recorded arguments of make_texture_handle_resident. */
+   uint64_t handle;
+   bool resident;
+};
+
+static void
+tc_call_make_texture_handle_resident(struct pipe_context *pipe,
+                                     union tc_payload *payload)
+{ /* Replays a recorded make_texture_handle_resident call on "pipe". */
+   struct tc_make_texture_handle_resident *p =
+      (struct tc_make_texture_handle_resident *)payload;
+
+   pipe->make_texture_handle_resident(pipe, p->handle, p->resident);
+}
+
+static void
+tc_make_texture_handle_resident(struct pipe_context *_pipe, uint64_t handle,
+                                bool resident)
+{ /* Records the call with both arguments; no tc_sync() is done here. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_make_texture_handle_resident *p =
+      tc_add_struct_typed_call(tc, TC_CALL_make_texture_handle_resident,
+                               tc_make_texture_handle_resident);
+
+   p->handle = handle;
+   p->resident = resident;
+}
+
+static uint64_t
+tc_create_image_handle(struct pipe_context *_pipe,
+                       const struct pipe_image_view *image)
+{ /* Not queued: the handle is returned to the caller, so the driver is called directly. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+   return pipe->create_image_handle(pipe, image);
+}
+
+static void
+tc_call_delete_image_handle(struct pipe_context *pipe,
+                            union tc_payload *payload)
+{ /* Replays a recorded delete_image_handle call on "pipe". */
+   pipe->delete_image_handle(pipe, payload->handle);
+}
+
+static void
+tc_delete_image_handle(struct pipe_context *_pipe, uint64_t handle)
+{ /* Records the call; the handle is the only argument stored. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   union tc_payload *payload =
+      tc_add_small_call(tc, TC_CALL_delete_image_handle);
+
+   payload->handle = handle;
+}
+
+struct tc_make_image_handle_resident
+{ /* Recorded arguments of make_image_handle_resident. */
+   uint64_t handle;
+   unsigned access;
+   bool resident;
+};
+
+static void
+tc_call_make_image_handle_resident(struct pipe_context *pipe,
+                                     union tc_payload *payload)
+{ /* Replays a recorded make_image_handle_resident call on "pipe". */
+   struct tc_make_image_handle_resident *p =
+      (struct tc_make_image_handle_resident *)payload;
+
+   pipe->make_image_handle_resident(pipe, p->handle, p->access, p->resident);
+}
+
+static void
+tc_make_image_handle_resident(struct pipe_context *_pipe, uint64_t handle,
+                              unsigned access, bool resident)
+{ /* Records the call with all three arguments; no tc_sync() is done here. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_make_image_handle_resident *p =
+      tc_add_struct_typed_call(tc, TC_CALL_make_image_handle_resident,
+                               tc_make_image_handle_resident);
+
+   p->handle = handle;
+   p->access = access;
+   p->resident = resident;
+}
+
+
+/********************************************************************
+ * transfer
+ */
+
+struct tc_replace_buffer_storage {
+   struct pipe_resource *dst; /* original buffer; referenced while queued */
+   struct pipe_resource *src; /* newly allocated buffer; referenced while queued */
+   tc_replace_buffer_storage_func func; /* callback invoked as func(pipe, dst, src) */
+};
+
+static void
+tc_call_replace_buffer_storage(struct pipe_context *pipe,
+                               union tc_payload *payload)
+{ /* Runs the recorded replacement callback, then drops both queued references. */
+   struct tc_replace_buffer_storage *p =
+      (struct tc_replace_buffer_storage *)payload;
+
+   p->func(pipe, p->dst, p->src);
+   pipe_resource_reference(&p->dst, NULL);
+   pipe_resource_reference(&p->src, NULL);
+}
+
+static bool
+tc_invalidate_buffer(struct threaded_context *tc,
+                     struct threaded_resource *tbuf)
+{ /* Returns true if new storage was allocated and its replacement was queued. */
+   /* We can't check if the buffer is idle, so we invalidate it
+    * unconditionally. */
+   struct pipe_screen *screen = tc->base.screen;
+   struct pipe_resource *new_buf;
+
+   /* Shared, pinned, and sparse buffers can't be reallocated. */
+   if (tbuf->is_shared ||
+       tbuf->is_user_ptr ||
+       tbuf->b.flags & PIPE_RESOURCE_FLAG_SPARSE)
+      return false;
+
+   /* Allocate a new one. */
+   new_buf = screen->resource_create(screen, &tbuf->b); /* tbuf->b serves as the template */
+   if (!new_buf)
+      return false; /* allocation failed: tbuf is left untouched */
+
+   /* Replace the "latest" pointer. */
+   if (tbuf->latest != &tbuf->b) /* an earlier replacement exists: drop its reference */
+      pipe_resource_reference(&tbuf->latest, NULL);
+
+   tbuf->latest = new_buf;
+   util_range_set_empty(&tbuf->valid_buffer_range); /* the new storage has no valid data yet */
+
+   /* The valid range should point to the original buffer. */
+   threaded_resource(new_buf)->base_valid_buffer_range =
+      &tbuf->valid_buffer_range;
+
+   /* Enqueue storage replacement of the original buffer. */
+   struct tc_replace_buffer_storage *p =
+      tc_add_struct_typed_call(tc, TC_CALL_replace_buffer_storage,
+                               tc_replace_buffer_storage);
+
+   p->func = tc->replace_buffer_storage;
+   tc_set_resource_reference(&p->dst, &tbuf->b);
+   tc_set_resource_reference(&p->src, new_buf);
+   return true;
+}
+
+static unsigned
+tc_improve_map_buffer_flags(struct threaded_context *tc,
+                            struct threaded_resource *tres, unsigned usage,
+                            unsigned offset, unsigned size)
+{ /* Returns "usage" rewritten for a buffer map of [offset, offset + size). */
+   /* Sparse buffers can't be mapped directly and can't be reallocated
+    * (fully invalidated). That may just be a radeonsi limitation, but
+    * the threaded context must obey it with radeonsi.
+    */
+   if (tres->b.flags & PIPE_RESOURCE_FLAG_SPARSE) {
+      /* We can use DISCARD_RANGE instead of full discard. This is the only
+       * fast path for sparse buffers that doesn't need thread synchronization.
+       */
+      if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)
+         usage |= PIPE_TRANSFER_DISCARD_RANGE;
+
+      /* Allow DISCARD_WHOLE_RESOURCE and inferring UNSYNCHRONIZED in drivers.
+       * The threaded context doesn't do unsynchronized mappings and invalida-
+       * tions of sparse buffers, therefore a correct driver behavior won't
+       * result in an incorrect behavior with the threaded context.
+       */
+      return usage; /* note: none of the TC_TRANSFER_MAP_* flags are added on this path */
+   }
+
+   /* Handle CPU reads trivially. */
+   if (usage & PIPE_TRANSFER_READ) {
+      /* Drivers aren't allowed to do buffer invalidations. */
+      return (usage & ~PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) |
+             TC_TRANSFER_MAP_NO_INVALIDATE |
+             TC_TRANSFER_MAP_IGNORE_VALID_RANGE;
+   }
+
+   /* See if the buffer range being mapped has never been initialized,
+    * in which case it can be mapped unsynchronized. */
+   if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
+       !tres->is_shared &&
+       !util_ranges_intersect(&tres->valid_buffer_range, offset, offset + size))
+      usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+
+   if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
+      /* If discarding the entire range, discard the whole resource instead. */
+      if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
+          offset == 0 && size == tres->b.width0)
+         usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
+
+      /* Discard the whole resource if needed. */
+      if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
+         if (tc_invalidate_buffer(tc, tres)) /* true: fresh storage was allocated and queued */
+            usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+         else
+            usage |= PIPE_TRANSFER_DISCARD_RANGE; /* fallback */
+      }
+   }
+
+   /* We won't need this flag anymore. */
+   /* TODO: We might not need TC_TRANSFER_MAP_NO_INVALIDATE with this. */
+   usage &= ~PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
+
+   /* GL_AMD_pinned_memory and persistent mappings can't use staging
+    * buffers. */
+   if (usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+                PIPE_TRANSFER_PERSISTENT) ||
+       tres->is_user_ptr)
+      usage &= ~PIPE_TRANSFER_DISCARD_RANGE; /* tc_transfer_map stages only when this bit is set */
+
+   /* Unsynchronized buffer mappings don't have to synchronize the thread. */
+   if (usage & PIPE_TRANSFER_UNSYNCHRONIZED)
+      usage |= TC_TRANSFER_MAP_THREADED_UNSYNC; /* notify the driver */
+
+   /* Never invalidate inside the driver and never infer "unsynchronized". */
+   return usage |
+          TC_TRANSFER_MAP_NO_INVALIDATE |
+          TC_TRANSFER_MAP_IGNORE_VALID_RANGE;
+}
+
+static void *
+tc_transfer_map(struct pipe_context *_pipe,
+                struct pipe_resource *resource, unsigned level,
+                unsigned usage, const struct pipe_box *box,
+                struct pipe_transfer **transfer)
+{ /* Maps a resource; buffer maps left with DISCARD_RANGE are served from a staging upload. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct threaded_resource *tres = threaded_resource(resource);
+   struct pipe_context *pipe = tc->pipe;
+
+   if (resource->target == PIPE_BUFFER) {
+      usage = tc_improve_map_buffer_flags(tc, tres, usage, box->x, box->width);
+
+      /* Do a staging transfer within the threaded context. The driver should
+       * only get resource_copy_region.
+       */
+      if (usage & PIPE_TRANSFER_DISCARD_RANGE) {
+         struct threaded_transfer *ttrans = slab_alloc(&tc->pool_transfers);
+         uint8_t *map;
+
+         ttrans->staging = NULL;
+
+         u_upload_alloc(tc->base.stream_uploader, 0,
+                        box->width + (box->x % tc->map_buffer_alignment),
+                        64, &ttrans->offset, &ttrans->staging, (void**)&map);
+         if (!map) { /* upload allocation failed */
+            slab_free(&tc->pool_transfers, ttrans);
+            return NULL; /* *transfer is not written on this path */
+         }
+
+         tc_set_resource_reference(&ttrans->b.resource, resource);
+         ttrans->b.level = 0; /* the "level" argument is not used for buffers */
+         ttrans->b.usage = usage;
+         ttrans->b.box = *box;
+         ttrans->b.stride = 0;
+         ttrans->b.layer_stride = 0;
+         *transfer = &ttrans->b;
+         return map + (box->x % tc->map_buffer_alignment); /* keep box->x's offset within the alignment */
+      }
+   }
+
+   /* Unsynchronized buffer mappings don't have to synchronize the thread. */
+   if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)) /* NOTE(review): "discard_range" arm below looks unreachable */
+      tc_sync_msg(tc, resource->target != PIPE_BUFFER ? "  texture" :
+                      usage & PIPE_TRANSFER_DISCARD_RANGE ? "  discard_range" :
+                      usage & PIPE_TRANSFER_READ ? "  read" : "  ??");
+
+   return pipe->transfer_map(pipe, tres->latest ? tres->latest : resource,
+                             level, usage, box, transfer); /* maps "latest" when it is set */
+}
+
+struct tc_transfer_flush_region {
+   struct pipe_transfer *transfer; /* stored as a plain pointer; no reference helper is used */
+   struct pipe_box box; /* copy of the caller's box, relative to the transfer */
+};
+
+static void
+tc_call_transfer_flush_region(struct pipe_context *pipe,
+                              union tc_payload *payload)
+{ /* Replays a recorded transfer_flush_region call on "pipe". */
+   struct tc_transfer_flush_region *p =
+      (struct tc_transfer_flush_region *)payload;
+
+   pipe->transfer_flush_region(pipe, p->transfer, &p->box);
+}
+
+struct tc_resource_copy_region { /* fields mirror the resource_copy_region arguments */
+   struct pipe_resource *dst;
+   unsigned dst_level;
+   unsigned dstx, dsty, dstz;
+   struct pipe_resource *src;
+   unsigned src_level;
+   struct pipe_box src_box;
+};
+
+static void /* forward declaration: used by tc_buffer_do_flush_region below */
+tc_resource_copy_region(struct pipe_context *_pipe,
+                        struct pipe_resource *dst, unsigned dst_level,
+                        unsigned dstx, unsigned dsty, unsigned dstz,
+                        struct pipe_resource *src, unsigned src_level,
+                        const struct pipe_box *src_box);
+
+static void
+tc_buffer_do_flush_region(struct threaded_context *tc,
+                          struct threaded_transfer *ttrans,
+                          const struct pipe_box *box)
+{ /* Flushes "box" (in buffer coordinates) and adds it to the buffer's valid range. */
+   struct threaded_resource *tres = threaded_resource(ttrans->b.resource);
+
+   if (ttrans->staging) {
+      struct pipe_box src_box;
+
+      u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment,
+               box->width, &src_box); /* where box's bytes sit inside the staging buffer */
+
+      /* Copy the staging buffer into the original one. */
+      tc_resource_copy_region(&tc->base, ttrans->b.resource, 0, box->x, 0, 0,
+                              ttrans->staging, 0, &src_box);
+   }
+
+   util_range_add(tres->base_valid_buffer_range, box->x, box->x + box->width);
+}
+
+static void
+tc_transfer_flush_region(struct pipe_context *_pipe,
+                         struct pipe_transfer *transfer,
+                         const struct pipe_box *rel_box)
+{ /* Flushes part of a mapping; only non-staging transfers are queued for the driver. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct threaded_transfer *ttrans = threaded_transfer(transfer);
+   struct threaded_resource *tres = threaded_resource(transfer->resource);
+   unsigned required_usage = PIPE_TRANSFER_WRITE |
+                             PIPE_TRANSFER_FLUSH_EXPLICIT;
+
+   if (tres->b.target == PIPE_BUFFER) {
+      if ((transfer->usage & required_usage) == required_usage) { /* both bits must be set */
+         struct pipe_box box;
+
+         u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box); /* rel_box is relative to the mapped box */
+         tc_buffer_do_flush_region(tc, ttrans, &box);
+      }
+
+      /* Staging transfers don't send the call to the driver. */
+      if (ttrans->staging)
+         return;
+   }
+
+   struct tc_transfer_flush_region *p =
+      tc_add_struct_typed_call(tc, TC_CALL_transfer_flush_region,
+                               tc_transfer_flush_region);
+   p->transfer = transfer;
+   p->box = *rel_box; /* the driver gets the caller's relative box unchanged */
+}
+
+static void
+tc_call_transfer_unmap(struct pipe_context *pipe, union tc_payload *payload)
+{ /* Replays a recorded transfer_unmap call on "pipe". */
+   pipe->transfer_unmap(pipe, payload->transfer);
+}
+
+static void
+tc_transfer_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer)
+{ /* Unmaps a transfer; staging transfers are released here, others are queued. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct threaded_transfer *ttrans = threaded_transfer(transfer);
+   struct threaded_resource *tres = threaded_resource(transfer->resource);
+
+   if (tres->b.target == PIPE_BUFFER) {
+      if (transfer->usage & PIPE_TRANSFER_WRITE &&
+          !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) /* no explicit flushes: flush the whole box */
+         tc_buffer_do_flush_region(tc, ttrans, &transfer->box);
+
+      /* Staging transfers don't send the call to the driver. */
+      if (ttrans->staging) {
+         pipe_resource_reference(&ttrans->staging, NULL);
+         pipe_resource_reference(&ttrans->b.resource, NULL);
+         slab_free(&tc->pool_transfers, ttrans); /* returns the slab_alloc'd transfer from tc_transfer_map */
+         return;
+      }
+   }
+
+   tc_add_small_call(tc, TC_CALL_transfer_unmap)->transfer = transfer; /* queue the driver's unmap */
+}
+
+struct tc_buffer_subdata {
+   struct pipe_resource *resource; /* referenced while queued */
+   unsigned usage, offset, size; /* size is also the byte count held in slot[] */
+   char slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_buffer_subdata(struct pipe_context *pipe, union tc_payload *payload)
+{ /* Replays a recorded buffer_subdata call using the copied bytes in slot[]. */
+   struct tc_buffer_subdata *p = (struct tc_buffer_subdata *)payload;
+
+   pipe->buffer_subdata(pipe, p->resource, p->usage, p->offset, p->size,
+                        p->slot);
+   pipe_resource_reference(&p->resource, NULL); /* release the queued reference */
+}
+
+static void
+tc_buffer_subdata(struct pipe_context *_pipe,
+                  struct pipe_resource *resource,
+                  unsigned usage, unsigned offset,
+                  unsigned size, const void *data)
+{ /* Uploads "size" bytes: via map + memcpy, or queued with a copy of the data. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct threaded_resource *tres = threaded_resource(resource);
+
+   if (!size)
+      return; /* nothing to upload */
+
+   usage |= PIPE_TRANSFER_WRITE |
+            PIPE_TRANSFER_DISCARD_RANGE;
+
+   usage = tc_improve_map_buffer_flags(tc, tres, usage, offset, size);
+
+   /* Unsynchronized and big transfers should use transfer_map. Also handle
+    * full invalidations, because drivers aren't allowed to do them.
+    */
+   if (usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+                PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) ||
+       size > TC_MAX_SUBDATA_BYTES) {
+      struct pipe_transfer *transfer;
+      struct pipe_box box;
+      uint8_t *map = NULL;
+
+      u_box_1d(offset, size, &box);
+
+      map = tc_transfer_map(_pipe, resource, 0, usage, &box, &transfer);
+      if (map) { /* a failed map silently drops the upload */
+         memcpy(map, data, size);
+         tc_transfer_unmap(_pipe, transfer);
+      }
+      return;
+   }
+
+   util_range_add(&tres->valid_buffer_range, offset, offset + size);
+
+   /* The upload is small. Enqueue it. */
+   struct tc_buffer_subdata *p =
+      tc_add_slot_based_call(tc, TC_CALL_buffer_subdata, tc_buffer_subdata, size);
+
+   tc_set_resource_reference(&p->resource, resource);
+   p->usage = usage;
+   p->offset = offset;
+   p->size = size;
+   memcpy(p->slot, data, size); /* private copy of the caller's bytes */
+}
+
+struct tc_texture_subdata {
+   struct pipe_resource *resource; /* referenced while queued */
+   unsigned level, usage, stride, layer_stride; /* forwarded to texture_subdata */
+   struct pipe_box box;
+   char slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_texture_subdata(struct pipe_context *pipe, union tc_payload *payload)
+{ /* Replays a recorded texture_subdata call using the copied bytes in slot[]. */
+   struct tc_texture_subdata *p = (struct tc_texture_subdata *)payload;
+
+   pipe->texture_subdata(pipe, p->resource, p->level, p->usage, &p->box,
+                         p->slot, p->stride, p->layer_stride);
+   pipe_resource_reference(&p->resource, NULL); /* release the queued reference */
+}
+
+static void
+tc_texture_subdata(struct pipe_context *_pipe,
+                   struct pipe_resource *resource,
+                   unsigned level, unsigned usage,
+                   const struct pipe_box *box,
+                   const void *data, unsigned stride,
+                   unsigned layer_stride)
+{ /* Small uploads are queued with a copy of the data; large ones call the driver directly. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   unsigned size;
+
+   assert(box->height >= 1);
+   assert(box->depth >= 1);
+
+   size = (box->depth - 1) * layer_stride + /* full strides for all but the last layer and row */
+          (box->height - 1) * stride +
+          box->width * util_format_get_blocksize(resource->format);
+   if (!size)
+      return; /* nothing to upload */
+
+   /* Small uploads can be enqueued, big uploads must sync. */
+   if (size <= TC_MAX_SUBDATA_BYTES) {
+      struct tc_texture_subdata *p =
+         tc_add_slot_based_call(tc, TC_CALL_texture_subdata, tc_texture_subdata, size);
+
+      tc_set_resource_reference(&p->resource, resource);
+      p->level = level;
+      p->usage = usage;
+      p->box = *box;
+      p->stride = stride;
+      p->layer_stride = layer_stride;
+      memcpy(p->slot, data, size); /* private copy of the caller's bytes */
+   } else {
+      struct pipe_context *pipe = tc->pipe;
+
+      tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+      pipe->texture_subdata(pipe, resource, level, usage, box, data,
+                            stride, layer_stride);
+   }
+}
+
+
+/********************************************************************
+ * miscellaneous
+ */
+
+#define TC_FUNC_SYNC_RET0(ret_type, func) /* defines tc_<func>(): tc_sync(), then call the driver */ \
+   static ret_type \
+   tc_##func(struct pipe_context *_pipe) \
+   { \
+      struct threaded_context *tc = threaded_context(_pipe); \
+      struct pipe_context *pipe = tc->pipe; \
+      tc_sync(tc); \
+      return pipe->func(pipe); \
+   }
+
+TC_FUNC_SYNC_RET0(enum pipe_reset_status, get_device_reset_status) /* tc_get_device_reset_status() */
+TC_FUNC_SYNC_RET0(uint64_t, get_timestamp) /* tc_get_timestamp() */
+
+static void
+tc_get_sample_position(struct pipe_context *_pipe,
+                       unsigned sample_count, unsigned sample_index,
+                       float *out_value)
+{ /* Not queued: the driver writes through out_value, so it is called directly. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+   pipe->get_sample_position(pipe, sample_count, sample_index,
+                             out_value);
+}
+
+static void
+tc_set_device_reset_callback(struct pipe_context *_pipe,
+                             const struct pipe_device_reset_callback *cb)
+{ /* Not queued: calls the driver directly after tc_sync(). */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+   pipe->set_device_reset_callback(pipe, cb);
+}
+
+struct tc_string_marker {
+   int len; /* byte count copied into slot[]; forwarded to emit_string_marker */
+   char slot[0]; /* more will be allocated if needed */
+};
+
+static void
+tc_call_emit_string_marker(struct pipe_context *pipe, union tc_payload *payload)
+{ /* Replays a recorded emit_string_marker call using the copied string. */
+   struct tc_string_marker *p = (struct tc_string_marker *)payload;
+   pipe->emit_string_marker(pipe, p->slot, p->len);
+}
+
+static void
+tc_emit_string_marker(struct pipe_context *_pipe,
+                      const char *string, int len)
+{ /* Short markers are queued with a copy of the string; long ones call the driver directly. */
+   struct threaded_context *tc = threaded_context(_pipe);
+
+   if (len <= TC_MAX_STRING_MARKER_BYTES) { /* NOTE(review): assumes len >= 0; a negative len also lands here */
+      struct tc_string_marker *p =
+         tc_add_slot_based_call(tc, TC_CALL_emit_string_marker, tc_string_marker, len);
+
+      memcpy(p->slot, string, len); /* exactly len bytes; no terminator is appended */
+      p->len = len;
+   } else {
+      struct pipe_context *pipe = tc->pipe;
+
+      tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+      pipe->emit_string_marker(pipe, string, len);
+   }
+}
+
+static void
+tc_dump_debug_state(struct pipe_context *_pipe, FILE *stream,
+                    unsigned flags)
+{ /* Not queued: calls the driver directly after tc_sync(). */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+   pipe->dump_debug_state(pipe, stream, flags);
+}
+
+static void
+tc_set_debug_callback(struct pipe_context *_pipe,
+                      const struct pipe_debug_callback *cb)
+{ /* Installs a debug callback in the driver unless it is a synchronous one. */
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   /* Drop all synchronous debug callbacks. Drivers are expected to be OK
+    * with this. shader-db will use an environment variable to disable
+    * the threaded context.
+    */
+   if (cb && cb->debug_message && !cb->async)
+      return; /* the driver is not called, so any previously set callback stays */
+
+   tc_sync(tc); /* presumably waits for queued calls to finish - confirm in tc_sync */
+   pipe->set_debug_callback(pipe, cb);
+}
+
+static void
+tc_create_fence_fd(struct pipe_context *_pipe,
+                   struct pipe_fence_handle **fence, int fd)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc);
+   pipe->create_fence_fd(pipe, fence, fd);
+}
+
+static void
+tc_fence_server_sync(struct pipe_context *_pipe,
+                     struct pipe_fence_handle *fence)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc);
+   pipe->fence_server_sync(pipe, fence);
+}
+
+static struct pipe_video_codec *
+tc_create_video_codec(struct pipe_context *_pipe,
+                      const struct pipe_video_codec *templ)
+{
+   unreachable("Threaded context should not be enabled for video APIs");
+   return NULL;
+}
+
+static struct pipe_video_buffer *
+tc_create_video_buffer(struct pipe_context *_pipe,
+                       const struct pipe_video_buffer *templ)
+{
+   unreachable("Threaded context should not be enabled for video APIs");
+   return NULL;
+}
+
+
+/********************************************************************
+ * draw, launch, clear, blit, copy, flush
+ */
+
+static void
+tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
+         unsigned flags)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+   struct threaded_query *tq, *tmp;
+
+   LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) {
+      tq->flushed = true;
+      LIST_DEL(&tq->head_unflushed);
+   }
+
+   /* TODO: deferred flushes? */
+   tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
+                   flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
+   pipe->flush(pipe, fence, flags);
+}
+
+/* This is actually variable-sized, because indirect isn't allocated if it's
+ * not needed. */
+struct tc_full_draw_info {
+   struct pipe_draw_info draw;
+   struct pipe_draw_indirect_info indirect;
+};
+
+static void
+tc_call_draw_vbo(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_full_draw_info *info = (struct tc_full_draw_info*)payload;
+
+   pipe->draw_vbo(pipe, &info->draw);
+   pipe_so_target_reference(&info->draw.count_from_stream_output, NULL);
+   if (info->draw.index_size)
+      pipe_resource_reference(&info->draw.index.resource, NULL);
+   if (info->draw.indirect) {
+      pipe_resource_reference(&info->indirect.buffer, NULL);
+      pipe_resource_reference(&info->indirect.indirect_draw_count, NULL);
+   }
+}
+
+static struct tc_full_draw_info *
+tc_add_draw_vbo(struct pipe_context *_pipe, bool indirect)
+{
+   return (struct tc_full_draw_info*)
+          tc_add_sized_call(threaded_context(_pipe), TC_CALL_draw_vbo,
+                            indirect ? sizeof(struct tc_full_draw_info) :
+                                       sizeof(struct pipe_draw_info));
+}
+
+/* Queue a draw. User indices are uploaded first, starting at info->start;
+ * every resource stored in the queued call holds its own reference. */
+static void
+tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_draw_indirect_info *indirect = info->indirect;
+   unsigned index_size = info->index_size;
+
+   if (index_size && info->has_user_indices) {
+      unsigned size = info->count * index_size;
+      struct pipe_resource *buffer = NULL;
+      unsigned offset;
+
+      tc_assert(!indirect);
+      /* This must be done before adding draw_vbo, because it could generate
+       * e.g. transfer_unmap and flush partially-uninitialized draw_vbo
+       * to the driver if it was done afterwards. */
+      u_upload_data(tc->base.stream_uploader, 0, size, 4,
+                    (const char *)info->index.user + info->start * index_size,
+                    &offset, &buffer);
+      if (unlikely(!buffer))
+         return;
+
+      struct tc_full_draw_info *p = tc_add_draw_vbo(_pipe, false);
+      p->draw.count_from_stream_output = NULL;
+      pipe_so_target_reference(&p->draw.count_from_stream_output,
+                               info->count_from_stream_output);
+      memcpy(&p->draw, info, sizeof(*info));
+      p->draw.has_user_indices = false;
+      p->draw.index.resource = buffer;
+      p->draw.start = offset / index_size;
+   } else {
+      /* Non-indexed call or indexed with a real index buffer. */
+      struct tc_full_draw_info *p = tc_add_draw_vbo(_pipe, indirect != NULL);
+      p->draw.count_from_stream_output = NULL;
+      pipe_so_target_reference(&p->draw.count_from_stream_output,
+                               info->count_from_stream_output);
+      if (index_size) {
+         tc_set_resource_reference(&p->draw.index.resource,
+                                   info->index.resource);
+      }
+      memcpy(&p->draw, info, sizeof(*info));
+      if (indirect) {
+         /* Take the reference in the queued copy, not the caller's struct. */
+         tc_set_resource_reference(&p->indirect.buffer, indirect->buffer);
+         tc_set_resource_reference(&p->indirect.indirect_draw_count,
+                                   indirect->indirect_draw_count);
+         memcpy(&p->indirect, indirect, sizeof(*indirect));
+         p->draw.indirect = &p->indirect;
+      }
+   }
+}
+
+static void
+tc_call_launch_grid(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct pipe_grid_info *p = (struct pipe_grid_info *)payload;
+
+   pipe->launch_grid(pipe, p);
+   pipe_resource_reference(&p->indirect, NULL);
+}
+
+static void
+tc_launch_grid(struct pipe_context *_pipe,
+               const struct pipe_grid_info *info)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_grid_info *p = tc_add_struct_typed_call(tc, TC_CALL_launch_grid,
+                                                       pipe_grid_info);
+   assert(info->input == NULL);
+
+   tc_set_resource_reference(&p->indirect, info->indirect);
+   memcpy(p, info, sizeof(*info));
+}
+
+static void
+tc_call_resource_copy_region(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_resource_copy_region *p = (struct tc_resource_copy_region *)payload;
+
+   pipe->resource_copy_region(pipe, p->dst, p->dst_level, p->dstx, p->dsty,
+                              p->dstz, p->src, p->src_level, &p->src_box);
+   pipe_resource_reference(&p->dst, NULL);
+   pipe_resource_reference(&p->src, NULL);
+}
+
+static void
+tc_resource_copy_region(struct pipe_context *_pipe,
+                        struct pipe_resource *dst, unsigned dst_level,
+                        unsigned dstx, unsigned dsty, unsigned dstz,
+                        struct pipe_resource *src, unsigned src_level,
+                        const struct pipe_box *src_box)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct threaded_resource *tdst = threaded_resource(dst);
+   struct tc_resource_copy_region *p =
+      tc_add_struct_typed_call(tc, TC_CALL_resource_copy_region,
+                               tc_resource_copy_region);
+
+   tc_set_resource_reference(&p->dst, dst);
+   p->dst_level = dst_level;
+   p->dstx = dstx;
+   p->dsty = dsty;
+   p->dstz = dstz;
+   tc_set_resource_reference(&p->src, src);
+   p->src_level = src_level;
+   p->src_box = *src_box;
+
+   if (dst->target == PIPE_BUFFER)
+      util_range_add(&tdst->valid_buffer_range, dstx, dstx + src_box->width);
+}
+
+static void
+tc_call_blit(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct pipe_blit_info *blit = (struct pipe_blit_info*)payload;
+
+   pipe->blit(pipe, blit);
+   pipe_resource_reference(&blit->dst.resource, NULL);
+   pipe_resource_reference(&blit->src.resource, NULL);
+}
+
+static void
+tc_blit(struct pipe_context *_pipe, const struct pipe_blit_info *info)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_blit_info *blit =
+      tc_add_struct_typed_call(tc, TC_CALL_blit, pipe_blit_info);
+
+   tc_set_resource_reference(&blit->dst.resource, info->dst.resource);
+   tc_set_resource_reference(&blit->src.resource, info->src.resource);
+   memcpy(blit, info, sizeof(*info));
+}
+
+struct tc_generate_mipmap {
+   struct pipe_resource *res;
+   enum pipe_format format;
+   unsigned base_level;
+   unsigned last_level;
+   unsigned first_layer;
+   unsigned last_layer;
+};
+
+static void
+tc_call_generate_mipmap(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_generate_mipmap *p = (struct tc_generate_mipmap *)payload;
+   bool result = pipe->generate_mipmap(pipe, p->res, p->format, p->base_level,
+                                       p->last_level, p->first_layer,
+                                       p->last_layer);
+   assert(result);
+   pipe_resource_reference(&p->res, NULL);
+}
+
+static boolean
+tc_generate_mipmap(struct pipe_context *_pipe,
+                   struct pipe_resource *res,
+                   enum pipe_format format,
+                   unsigned base_level,
+                   unsigned last_level,
+                   unsigned first_layer,
+                   unsigned last_layer)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   unsigned bind = PIPE_BIND_SAMPLER_VIEW; /* NOTE(review): overwritten below */
+
+   if (util_format_is_depth_or_stencil(format))
+      bind = PIPE_BIND_DEPTH_STENCIL;
+   else
+      bind = PIPE_BIND_RENDER_TARGET;
+
+   if (!screen->is_format_supported(screen, format, res->target,
+                                    res->nr_samples, bind))
+      return false;
+
+   struct tc_generate_mipmap *p =
+      tc_add_struct_typed_call(tc, TC_CALL_generate_mipmap, tc_generate_mipmap);
+
+   tc_set_resource_reference(&p->res, res);
+   p->format = format;
+   p->base_level = base_level;
+   p->last_level = last_level;
+   p->first_layer = first_layer;
+   p->last_layer = last_layer;
+   return true;
+}
+
+static void
+tc_call_flush_resource(struct pipe_context *pipe, union tc_payload *payload)
+{
+   pipe->flush_resource(pipe, payload->resource);
+   pipe_resource_reference(&payload->resource, NULL);
+}
+
+static void
+tc_flush_resource(struct pipe_context *_pipe,
+                  struct pipe_resource *resource)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   union tc_payload *payload = tc_add_small_call(tc, TC_CALL_flush_resource);
+
+   tc_set_resource_reference(&payload->resource, resource);
+}
+
+static void
+tc_call_invalidate_resource(struct pipe_context *pipe, union tc_payload *payload)
+{
+   pipe->invalidate_resource(pipe, payload->resource);
+   pipe_resource_reference(&payload->resource, NULL);
+}
+
+static void
+tc_invalidate_resource(struct pipe_context *_pipe,
+                       struct pipe_resource *resource)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+
+   if (resource->target == PIPE_BUFFER) {
+      tc_invalidate_buffer(tc, threaded_resource(resource));
+      return;
+   }
+
+   union tc_payload *payload = tc_add_small_call(tc, TC_CALL_invalidate_resource);
+   tc_set_resource_reference(&payload->resource, resource);
+}
+
+struct tc_clear {
+   unsigned buffers;
+   union pipe_color_union color;
+   double depth;
+   unsigned stencil;
+};
+
+static void
+tc_call_clear(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_clear *p = (struct tc_clear *)payload;
+   pipe->clear(pipe, p->buffers, &p->color, p->depth, p->stencil);
+}
+
+static void
+tc_clear(struct pipe_context *_pipe, unsigned buffers,
+         const union pipe_color_union *color, double depth,
+         unsigned stencil)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_clear *p = tc_add_struct_typed_call(tc, TC_CALL_clear, tc_clear);
+
+   p->buffers = buffers;
+   p->color = *color;
+   p->depth = depth;
+   p->stencil = stencil;
+}
+
+static void
+tc_clear_render_target(struct pipe_context *_pipe,
+                       struct pipe_surface *dst,
+                       const union pipe_color_union *color,
+                       unsigned dstx, unsigned dsty,
+                       unsigned width, unsigned height,
+                       bool render_condition_enabled)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc);
+   pipe->clear_render_target(pipe, dst, color, dstx, dsty, width, height,
+                             render_condition_enabled);
+}
+
+static void
+tc_clear_depth_stencil(struct pipe_context *_pipe,
+                       struct pipe_surface *dst, unsigned clear_flags,
+                       double depth, unsigned stencil, unsigned dstx,
+                       unsigned dsty, unsigned width, unsigned height,
+                       bool render_condition_enabled)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc);
+   pipe->clear_depth_stencil(pipe, dst, clear_flags, depth, stencil,
+                             dstx, dsty, width, height,
+                             render_condition_enabled);
+}
+
+struct tc_clear_buffer {
+   struct pipe_resource *res;
+   unsigned offset;
+   unsigned size;
+   char clear_value[16];
+   int clear_value_size;
+};
+
+static void
+tc_call_clear_buffer(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_clear_buffer *p = (struct tc_clear_buffer *)payload;
+
+   pipe->clear_buffer(pipe, p->res, p->offset, p->size, p->clear_value,
+                      p->clear_value_size);
+   pipe_resource_reference(&p->res, NULL);
+}
+
+static void
+tc_clear_buffer(struct pipe_context *_pipe, struct pipe_resource *res,
+                unsigned offset, unsigned size,
+                const void *clear_value, int clear_value_size)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct threaded_resource *tres = threaded_resource(res);
+   struct tc_clear_buffer *p =
+      tc_add_struct_typed_call(tc, TC_CALL_clear_buffer, tc_clear_buffer);
+
+   tc_set_resource_reference(&p->res, res);
+   p->offset = offset;
+   p->size = size;
+   memcpy(p->clear_value, clear_value, clear_value_size);
+   p->clear_value_size = clear_value_size;
+
+   util_range_add(&tres->valid_buffer_range, offset, offset + size);
+}
+
+struct tc_clear_texture {
+   struct pipe_resource *res;
+   unsigned level;
+   struct pipe_box box;
+   char data[16];
+};
+
+static void
+tc_call_clear_texture(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_clear_texture *p = (struct tc_clear_texture *)payload;
+
+   pipe->clear_texture(pipe, p->res, p->level, &p->box, p->data);
+   pipe_resource_reference(&p->res, NULL);
+}
+
+static void
+tc_clear_texture(struct pipe_context *_pipe, struct pipe_resource *res,
+                 unsigned level, const struct pipe_box *box, const void *data)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_clear_texture *p =
+      tc_add_struct_typed_call(tc, TC_CALL_clear_texture, tc_clear_texture);
+
+   tc_set_resource_reference(&p->res, res);
+   p->level = level;
+   p->box = *box;
+   memcpy(p->data, data,
+          util_format_get_blocksize(res->format));
+}
+
+struct tc_resource_commit {
+   struct pipe_resource *res;
+   unsigned level;
+   struct pipe_box box;
+   bool commit;
+};
+
+static void
+tc_call_resource_commit(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_resource_commit *p = (struct tc_resource_commit *)payload;
+
+   pipe->resource_commit(pipe, p->res, p->level, &p->box, p->commit);
+   pipe_resource_reference(&p->res, NULL);
+}
+
+static bool
+tc_resource_commit(struct pipe_context *_pipe, struct pipe_resource *res,
+                   unsigned level, struct pipe_box *box, bool commit)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_resource_commit *p =
+      tc_add_struct_typed_call(tc, TC_CALL_resource_commit, tc_resource_commit);
+
+   tc_set_resource_reference(&p->res, res);
+   p->level = level;
+   p->box = *box;
+   p->commit = commit;
+   return true; /* we don't care about the return value for this call */
+}
+
+
+/********************************************************************
+ * create & destroy
+ */
+
+static void
+tc_destroy(struct pipe_context *_pipe)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc);
+
+   if (util_queue_is_initialized(&tc->queue)) {
+      util_queue_destroy(&tc->queue);
+
+      for (unsigned i = 0; i < TC_MAX_BATCHES; i++)
+         util_queue_fence_destroy(&tc->batch_slots[i].fence);
+   }
+
+   if (tc->base.const_uploader &&
+       tc->base.stream_uploader != tc->base.const_uploader)
+      u_upload_destroy(tc->base.const_uploader);
+
+   if (tc->base.stream_uploader)
+      u_upload_destroy(tc->base.stream_uploader);
+
+   slab_destroy_child(&tc->pool_transfers);
+   pipe->destroy(pipe);
+   os_free_aligned(tc);
+}
+
+static const tc_execute execute_func[TC_NUM_CALLS] = {
+#define CALL(name) tc_call_##name,
+#include "u_threaded_context_calls.h"
+#undef CALL
+};
+
+/**
+ * Wrap an existing pipe_context into a threaded_context.
+ *
+ * \param pipe                 pipe_context to wrap
+ * \param parent_transfer_pool parent slab pool for pipe_transfer objects;
+ *                             the driver should have one in pipe_screen.
+ * \param replace_buffer  callback for replacing a pipe_resource's storage
+ *                        with another pipe_resource's storage.
+ * \param out  if non-NULL, receives the threaded_context when one is created
+ * \return the wrapper; "pipe" itself if GALLIUM_THREAD disables threading;
+ *         NULL on failure (a non-NULL "pipe" is destroyed in that case)
+ */
+struct pipe_context *
+threaded_context_create(struct pipe_context *pipe,
+                        struct slab_parent_pool *parent_transfer_pool,
+                        tc_replace_buffer_storage_func replace_buffer,
+                        struct threaded_context **out)
+{
+   struct threaded_context *tc;
+
+   STATIC_ASSERT(sizeof(union tc_payload) <= 8);
+   STATIC_ASSERT(sizeof(struct tc_call) <= 16);
+
+   if (!pipe)
+      return NULL;
+
+   util_cpu_detect();
+
+   if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1))
+      return pipe;
+
+   tc = os_malloc_aligned(sizeof(struct threaded_context), 16);
+   if (!tc) {
+      pipe->destroy(pipe);
+      return NULL;
+   }
+   memset(tc, 0, sizeof(*tc));
+
+   assert((uintptr_t)tc % 16 == 0);
+   /* These should be static asserts, but they don't work with MSVC */
+   assert(offsetof(struct threaded_context, batch_slots) % 16 == 0);
+   assert(offsetof(struct threaded_context, batch_slots[0].call) % 16 == 0);
+   assert(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == 0);
+   assert(offsetof(struct threaded_context, batch_slots[1].call) % 16 == 0);
+
+   /* The driver context isn't wrapped, so set its "priv" to NULL. */
+   pipe->priv = NULL;
+
+   tc->pipe = pipe;
+   tc->replace_buffer_storage = replace_buffer;
+   tc->map_buffer_alignment =
+      pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
+   tc->base.priv = pipe; /* priv points to the wrapped driver context */
+   tc->base.screen = pipe->screen;
+   tc->base.destroy = tc_destroy;
+
+   tc->base.stream_uploader = u_upload_clone(&tc->base, pipe->stream_uploader);
+   if (pipe->stream_uploader == pipe->const_uploader)
+      tc->base.const_uploader = tc->base.stream_uploader;
+   else
+      tc->base.const_uploader = u_upload_clone(&tc->base, pipe->const_uploader);
+
+   if (!tc->base.stream_uploader || !tc->base.const_uploader)
+      goto fail;
+
+   /* The queue size is the number of batches "waiting". Batches are removed
+    * from the queue before being executed, so keep one tc_batch slot for that
+    * execution. Also, keep one unused slot for an unflushed batch.
+    */
+   if (!util_queue_init(&tc->queue, "gallium_drv", TC_MAX_BATCHES - 2, 1, 0))
+      goto fail;
+
+   for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
+      tc->batch_slots[i].sentinel = TC_SENTINEL;
+      tc->batch_slots[i].pipe = pipe;
+      util_queue_fence_init(&tc->batch_slots[i].fence);
+   }
+
+   LIST_INITHEAD(&tc->unflushed_queries);
+
+   slab_create_child(&tc->pool_transfers, parent_transfer_pool);
+
+#define CTX_INIT(_member) \
+   tc->base._member = tc->pipe->_member ? tc_##_member : NULL
+
+   CTX_INIT(flush);
+   CTX_INIT(draw_vbo);
+   CTX_INIT(launch_grid);
+   CTX_INIT(resource_copy_region);
+   CTX_INIT(blit);
+   CTX_INIT(clear);
+   CTX_INIT(clear_render_target);
+   CTX_INIT(clear_depth_stencil);
+   CTX_INIT(clear_buffer);
+   CTX_INIT(clear_texture);
+   CTX_INIT(flush_resource);
+   CTX_INIT(generate_mipmap);
+   CTX_INIT(render_condition);
+   CTX_INIT(create_query);
+   CTX_INIT(create_batch_query);
+   CTX_INIT(destroy_query);
+   CTX_INIT(begin_query);
+   CTX_INIT(end_query);
+   CTX_INIT(get_query_result);
+   CTX_INIT(get_query_result_resource);
+   CTX_INIT(set_active_query_state);
+   CTX_INIT(create_blend_state);
+   CTX_INIT(bind_blend_state);
+   CTX_INIT(delete_blend_state);
+   CTX_INIT(create_sampler_state);
+   CTX_INIT(bind_sampler_states);
+   CTX_INIT(delete_sampler_state);
+   CTX_INIT(create_rasterizer_state);
+   CTX_INIT(bind_rasterizer_state);
+   CTX_INIT(delete_rasterizer_state);
+   CTX_INIT(create_depth_stencil_alpha_state);
+   CTX_INIT(bind_depth_stencil_alpha_state);
+   CTX_INIT(delete_depth_stencil_alpha_state);
+   CTX_INIT(create_fs_state);
+   CTX_INIT(bind_fs_state);
+   CTX_INIT(delete_fs_state);
+   CTX_INIT(create_vs_state);
+   CTX_INIT(bind_vs_state);
+   CTX_INIT(delete_vs_state);
+   CTX_INIT(create_gs_state);
+   CTX_INIT(bind_gs_state);
+   CTX_INIT(delete_gs_state);
+   CTX_INIT(create_tcs_state);
+   CTX_INIT(bind_tcs_state);
+   CTX_INIT(delete_tcs_state);
+   CTX_INIT(create_tes_state);
+   CTX_INIT(bind_tes_state);
+   CTX_INIT(delete_tes_state);
+   CTX_INIT(create_compute_state);
+   CTX_INIT(bind_compute_state);
+   CTX_INIT(delete_compute_state);
+   CTX_INIT(create_vertex_elements_state);
+   CTX_INIT(bind_vertex_elements_state);
+   CTX_INIT(delete_vertex_elements_state);
+   CTX_INIT(set_blend_color);
+   CTX_INIT(set_stencil_ref);
+   CTX_INIT(set_sample_mask);
+   CTX_INIT(set_min_samples);
+   CTX_INIT(set_clip_state);
+   CTX_INIT(set_constant_buffer);
+   CTX_INIT(set_framebuffer_state);
+   CTX_INIT(set_polygon_stipple);
+   CTX_INIT(set_scissor_states);
+   CTX_INIT(set_viewport_states);
+   CTX_INIT(set_window_rectangles);
+   CTX_INIT(set_sampler_views);
+   CTX_INIT(set_tess_state);
+   CTX_INIT(set_shader_buffers);
+   CTX_INIT(set_shader_images);
+   CTX_INIT(set_vertex_buffers);
+   CTX_INIT(create_stream_output_target);
+   CTX_INIT(stream_output_target_destroy);
+   CTX_INIT(set_stream_output_targets);
+   CTX_INIT(create_sampler_view);
+   CTX_INIT(sampler_view_destroy);
+   CTX_INIT(create_surface);
+   CTX_INIT(surface_destroy);
+   CTX_INIT(transfer_map);
+   CTX_INIT(transfer_flush_region);
+   CTX_INIT(transfer_unmap);
+   CTX_INIT(buffer_subdata);
+   CTX_INIT(texture_subdata);
+   CTX_INIT(texture_barrier);
+   CTX_INIT(memory_barrier);
+   CTX_INIT(resource_commit);
+   CTX_INIT(create_video_codec);
+   CTX_INIT(create_video_buffer);
+   CTX_INIT(set_compute_resources);
+   CTX_INIT(set_global_binding);
+   CTX_INIT(get_sample_position);
+   CTX_INIT(invalidate_resource);
+   CTX_INIT(get_device_reset_status);
+   CTX_INIT(set_device_reset_callback);
+   CTX_INIT(dump_debug_state);
+   CTX_INIT(emit_string_marker);
+   CTX_INIT(set_debug_callback);
+   CTX_INIT(create_fence_fd);
+   CTX_INIT(fence_server_sync);
+   CTX_INIT(get_timestamp);
+   CTX_INIT(create_texture_handle);
+   CTX_INIT(delete_texture_handle);
+   CTX_INIT(make_texture_handle_resident);
+   CTX_INIT(create_image_handle);
+   CTX_INIT(delete_image_handle);
+   CTX_INIT(make_image_handle_resident);
+#undef CTX_INIT
+
+   if (out)
+      *out = tc;
+
+   return &tc->base;
+
+fail:
+   tc_destroy(&tc->base);
+   return NULL;
+}
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
new file mode 100644
index 0000000..0742fae
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -0,0 +1,353 @@
+/**************************************************************************
+ *
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/* This is a wrapper for pipe_context that executes all pipe_context calls
+ * in another thread.
+ *
+ *
+ * Guidelines for adopters and deviations from Gallium
+ * ---------------------------------------------------
+ *
+ * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
+ *    driver functions that take a context (fence_finish, texture_get_handle)
+ *    should manually unwrap pipe_context by doing:
+ *      pipe = threaded_context_unwrap_sync(pipe);
+ *
+ *    pipe_context::priv is used to unwrap the context, so drivers and state
+ *    trackers shouldn't use it.
+ *
+ *    No other objects are wrapped.
+ *
+ * 2) Drivers must subclass and initialize these structures:
+ *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
+ *    - threaded_query for pipe_query (zero memory)
+ *    - threaded_transfer for pipe_transfer (zero memory)
+ *
+ * 3) The threaded context must not be enabled for contexts that can use video
+ *    codecs.
+ *
+ * 4) Changes in driver behavior:
+ *    - begin_query and end_query always return true; return values from
+ *      the driver are ignored.
+ *    - generate_mipmap uses is_format_supported to determine success;
+ *      the return value from the driver is ignored.
+ *    - resource_commit always returns true; failures are ignored.
+ *    - set_debug_callback is skipped if the callback is synchronous.
+ *
+ *
+ * Thread-safety requirements on context functions
+ * -----------------------------------------------
+ *
+ * These pipe_context functions are executed directly, so they shouldn't use
+ * pipe_context in an unsafe way. They are de-facto screen functions now:
+ * - create_query
+ * - create_batch_query
+ * - create_*_state (all CSOs and shaders)
+ *     - Make sure the shader compiler doesn't use any per-context stuff.
+ *       (e.g. LLVM target machine)
+ *     - Only pipe_context's debug callback for shader dumps is guaranteed to
+ *       be up to date, because set_debug_callback synchronizes execution.
+ * - create_surface
+ * - surface_destroy
+ * - create_sampler_view
+ * - sampler_view_destroy
+ * - stream_output_target_destroy
+ * - transfer_map (only unsynchronized buffer mappings)
+ * - get_query_result (when threaded_query::flushed == true)
+ *
+ * Create calls causing a sync that can't be async due to driver limitations:
+ * - create_stream_output_target
+ *
+ *
+ * Transfer_map rules for buffer mappings
+ * --------------------------------------
+ *
+ * 1) If transfer_map has PIPE_TRANSFER_UNSYNCHRONIZED, the call is made
+ *    in the non-driver thread without flushing the queue. The driver will
+ *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to PIPE_TRANSFER_-
+ *    UNSYNCHRONIZED to indicate this.
+ *    Note that transfer_unmap is always enqueued and called from the driver
+ *    thread.
+ *
+ * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
+ *    the valid buffer range. The threaded context always sends TC_TRANSFER_-
+ *    MAP_IGNORE_VALID_RANGE to indicate this. Ignoring the flag will lead
+ *    to failures.
+ *    The threaded context does its own detection of unsynchronized mappings.
+ *
+ * 3) The driver isn't allowed to do buffer invalidations by itself under any
+ *    circumstances. This is necessary for unsynchronized maps to map the latest
+ *    version of the buffer. (because invalidations can be queued, while
+ *    unsynchronized maps are not queued and they should return the latest
+ *    storage after invalidation). The threaded context always sends
+ *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
+ *    indicate this. Ignoring the flag will lead to failures.
+ *    The threaded context uses its own buffer invalidation mechanism.
+ *
+ *
+ * Additional requirements
+ * -----------------------
+ *
+ * get_query_result:
+ *    If threaded_query::flushed == true, get_query_result should assume that
+ *    it's called from a non-driver thread, in which case the driver shouldn't
+ *    use the context in an unsafe way.
+ *
+ * replace_buffer_storage:
+ *    The driver has to implement this callback, which will be called when
+ *    the threaded context wants to replace a resource's backing storage with
+ *    another resource's backing storage. The threaded context uses it to
+ *    implement buffer invalidation. This call is always queued.
+ *
+ *
+ * Performance gotchas
+ * -------------------
+ *
+ * Buffer invalidations are done unconditionally - they don't check whether
+ * the buffer is busy. This can cause drivers to have more live allocations
+ * and CPU mappings than necessary.
+ *
+ *
+ * How it works (queue architecture)
+ * ---------------------------------
+ *
+ * There is a multithreaded queue consisting of batches, each batch consisting
+ * of call slots. Each call slot consists of an 8-byte header (call ID +
+ * call size + constant 32-bit marker for integrity checking) and an 8-byte
+ * body for per-call data. That is 16 bytes per call slot.
+ *
+ * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger
+ * calls occupy multiple call slots depending on the size needed by call
+ * parameters. That means that calls can have a variable size in the batch.
+ * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only
+ * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots.
+ * Even though the first call slot can use only 8 bytes for data, additional
+ * call slots used by the same call can use all 16 bytes for data.
+ * For example, a call using 2 call slots has 24 bytes of space for data.
+ *
+ * Once a batch is full and there is no space for the next call, it's flushed,
+ * meaning that it's added to the queue for execution in the other thread.
+ * The batches are ordered in a ring and reused once they are idle again.
+ * The batching is necessary for low queue/mutex overhead.
+ *
+ */
+
+#ifndef U_THREADED_CONTEXT_H
+#define U_THREADED_CONTEXT_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_queue.h"
+#include "util/u_range.h"
+#include "util/slab.h"
+
+/* These are transfer flags sent to drivers. */
+/* Never infer whether it's safe to use unsynchronized mappings: */
+#define TC_TRANSFER_MAP_IGNORE_VALID_RANGE   (1u << 29)
+/* Don't invalidate buffers: */
+#define TC_TRANSFER_MAP_NO_INVALIDATE        (1u << 30)
+/* transfer_map is called from a non-driver thread: */
+#define TC_TRANSFER_MAP_THREADED_UNSYNC      (1u << 31)
+
+/* Number of batch slots in memory.
+ * - 1 batch is always idle and records new commands
+ * - 1 batch is being executed
+ * so at most TC_MAX_BATCHES - 2 batches can be waiting in the queue.
+ *
+ * Use a size as small as possible for low CPU L2 cache usage but large enough
+ * so that the queue isn't stalled too often for not having enough idle batch
+ * slots.
+ */
+#define TC_MAX_BATCHES        10
+
+/* The size of one batch in call slots. Non-trivial calls (i.e. not setting
+ * a CSO pointer) can occupy multiple call slots.
+ *
+ * The idea is to have batches as small as possible but large enough so that
+ * the queuing and mutex overhead is negligible.
+ */
+#define TC_CALLS_PER_BATCH    192
+
+/* Threshold for when to use the queue or sync. */
+#define TC_MAX_STRING_MARKER_BYTES  512
+
+/* Threshold for when to enqueue buffer/texture_subdata as-is.
+ * If the upload size is greater than this, the following is done instead:
+ * - for buffers: DISCARD_RANGE is done by the threaded context
+ * - for textures: sync and call the driver directly
+ */
+#define TC_MAX_SUBDATA_BYTES        320
+
+typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
+                                               struct pipe_resource *dst,
+                                               struct pipe_resource *src);
+
+struct threaded_resource {
+   struct pipe_resource b;
+   const struct u_resource_vtbl *vtbl;
+
+   /* Since buffer invalidations are queued, we can't use the base resource
+    * for unsynchronized mappings. This points to the latest version of the
+    * buffer after the latest invalidation. It's only used for unsynchronized
+    * mappings in the non-driver thread. Initially it's set to &b.
+    */
+   struct pipe_resource *latest;
+
+   /* The buffer range which is initialized (with a write transfer, streamout,
+    * or writable shader resources). The remainder of the buffer is considered
+    * invalid and can be mapped unsynchronized.
+    *
+    * This allows unsynchronized mapping of a buffer range which hasn't been
+    * used yet. It's for applications which forget to use the unsynchronized
+    * map flag and expect the driver to figure it out.
+    *
+    * Drivers should set this to the full range for buffers backed by user
+    * memory.
+    */
+   struct util_range valid_buffer_range;
+
+   /* If "this" is not the base instance of the buffer, but it's one of its
+    * reallocations (set in "latest" of the base instance), this points to
+    * the valid range of the base instance. It's used for transfers after
+    * a buffer invalidation, because such transfers operate on "latest", not
+    * the base instance. Initially it's set to &valid_buffer_range.
+    */
+   struct util_range *base_valid_buffer_range;
+
+   /* Drivers are required to update this for shared resources and user
+    * pointers. */
+   bool	is_shared;
+   bool is_user_ptr;
+};
+
+struct threaded_transfer {
+   struct pipe_transfer b;
+
+   /* Staging buffer for DISCARD_RANGE transfers. */
+   struct pipe_resource *staging;
+
+   /* Offset into the staging buffer, because the backing buffer is
+    * sub-allocated. */
+   unsigned offset;
+};
+
+struct threaded_query {
+   /* The query is added to the list in end_query and removed in flush. */
+   struct list_head head_unflushed;
+
+   /* Whether pipe->flush has been called after end_query. */
+   bool flushed;
+};
+
+/* This is the second half of tc_call containing call data.
+ * Most calls will typecast this to the type they need, typically larger
+ * than 8 bytes.
+ */
+union tc_payload {
+   struct pipe_query *query;
+   struct pipe_resource *resource;
+   struct pipe_transfer *transfer;
+   uint64_t handle;
+};
+
+#ifdef _MSC_VER
+#define ALIGN16 __declspec(align(16))
+#else
+#define ALIGN16 __attribute__((aligned(16)))
+#endif
+
+/* Each call slot should be aligned to its own size for optimal cache usage. */
+struct ALIGN16 tc_call {
+   unsigned sentinel;         /* constant marker for integrity checking */
+   ushort num_call_slots;     /* size of this call in call slots */
+   ushort call_id;            /* call ID */
+   union tc_payload payload;  /* per-call data */
+};
+
+struct tc_batch {
+   struct pipe_context *pipe;
+   unsigned sentinel;
+   unsigned num_total_call_slots;
+   struct util_queue_fence fence;
+   struct tc_call call[TC_CALLS_PER_BATCH];
+};
+
+struct threaded_context {
+   struct pipe_context base;
+   struct pipe_context *pipe;
+   struct slab_child_pool pool_transfers;
+   tc_replace_buffer_storage_func replace_buffer_storage;
+   unsigned map_buffer_alignment;
+
+   struct list_head unflushed_queries;
+
+   /* Counters for the HUD. */
+   unsigned num_offloaded_slots;
+   unsigned num_direct_slots;
+   unsigned num_syncs;
+
+   struct util_queue queue;
+   struct util_queue_fence *fence;
+
+   unsigned last, next;
+   struct tc_batch batch_slots[TC_MAX_BATCHES];
+};
+
+void threaded_resource_init(struct pipe_resource *res);
+void threaded_resource_deinit(struct pipe_resource *res);
+struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
+
+struct pipe_context *
+threaded_context_create(struct pipe_context *pipe,
+                        struct slab_parent_pool *parent_transfer_pool,
+                        tc_replace_buffer_storage_func replace_buffer,
+                        struct threaded_context **out);
+
+static inline struct threaded_context *
+threaded_context(struct pipe_context *pipe)
+{
+   return (struct threaded_context*)pipe; /* "base" is the first member */
+}
+
+static inline struct threaded_resource *
+threaded_resource(struct pipe_resource *res)
+{
+   return (struct threaded_resource*)res; /* "b" is the first member */
+}
+
+static inline struct threaded_query *
+threaded_query(struct pipe_query *q)
+{
+   return (struct threaded_query*)q; /* assumes q starts with a threaded_query */
+}
+
+static inline struct threaded_transfer *
+threaded_transfer(struct pipe_transfer *transfer)
+{
+   return (struct threaded_transfer*)transfer; /* "b" is the first member */
+}
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_threaded_context_calls.h b/src/gallium/auxiliary/util/u_threaded_context_calls.h
new file mode 100644
index 0000000..546819a
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_threaded_context_calls.h
@@ -0,0 +1,70 @@
+CALL(destroy_query)
+CALL(begin_query)
+CALL(end_query)
+CALL(get_query_result_resource)
+CALL(render_condition)
+CALL(bind_sampler_states)
+CALL(set_framebuffer_state)
+CALL(set_tess_state)
+CALL(set_constant_buffer)
+CALL(set_scissor_states)
+CALL(set_viewport_states)
+CALL(set_window_rectangles)
+CALL(set_sampler_views)
+CALL(set_shader_images)
+CALL(set_shader_buffers)
+CALL(set_vertex_buffers)
+CALL(set_stream_output_targets)
+CALL(replace_buffer_storage)
+CALL(transfer_flush_region)
+CALL(transfer_unmap)
+CALL(buffer_subdata)
+CALL(texture_subdata)
+CALL(emit_string_marker)
+CALL(draw_vbo)
+CALL(launch_grid)
+CALL(resource_copy_region)
+CALL(blit)
+CALL(generate_mipmap)
+CALL(flush_resource)
+CALL(invalidate_resource)
+CALL(clear)
+CALL(clear_buffer)
+CALL(clear_texture)
+CALL(resource_commit)
+CALL(set_active_query_state)
+CALL(set_blend_color)
+CALL(set_stencil_ref)
+CALL(set_clip_state)
+CALL(set_sample_mask)
+CALL(set_min_samples)
+CALL(set_polygon_stipple)
+CALL(texture_barrier)
+CALL(memory_barrier)
+CALL(delete_texture_handle)
+CALL(make_texture_handle_resident)
+CALL(delete_image_handle)
+CALL(make_image_handle_resident)
+
+CALL(bind_blend_state)
+CALL(bind_rasterizer_state)
+CALL(bind_depth_stencil_alpha_state)
+CALL(bind_compute_state)
+CALL(bind_fs_state)
+CALL(bind_vs_state)
+CALL(bind_gs_state)
+CALL(bind_tcs_state)
+CALL(bind_tes_state)
+CALL(bind_vertex_elements_state)
+
+CALL(delete_blend_state)
+CALL(delete_rasterizer_state)
+CALL(delete_depth_stencil_alpha_state)
+CALL(delete_compute_state)
+CALL(delete_fs_state)
+CALL(delete_vs_state)
+CALL(delete_gs_state)
+CALL(delete_tcs_state)
+CALL(delete_tes_state)
+CALL(delete_vertex_elements_state)
+CALL(delete_sampler_state)
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 32697b8..4bb14d6 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2009 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 /* Helper utility for uploading user buffers & other data, and
@@ -59,7 +59,7 @@
 u_upload_create(struct pipe_context *pipe, unsigned default_size,
                 unsigned bind, enum pipe_resource_usage usage)
 {
-   struct u_upload_mgr *upload = CALLOC_STRUCT( u_upload_mgr );
+   struct u_upload_mgr *upload = CALLOC_STRUCT(u_upload_mgr);
    if (!upload)
       return NULL;
 
@@ -97,7 +97,15 @@
                           PIPE_USAGE_STREAM);
 }
 
-static void upload_unmap_internal(struct u_upload_mgr *upload, boolean destroying)
+struct u_upload_mgr *
+u_upload_clone(struct pipe_context *pipe, struct u_upload_mgr *upload)
+{
+   return u_upload_create(pipe, upload->default_size, upload->bind,
+                          upload->usage);
+}
+
+static void
+upload_unmap_internal(struct u_upload_mgr *upload, boolean destroying)
 {
    if (!destroying && upload->map_persistent)
       return;
@@ -117,30 +125,32 @@
 }
 
 
-void u_upload_unmap( struct u_upload_mgr *upload )
+void
+u_upload_unmap(struct u_upload_mgr *upload)
 {
    upload_unmap_internal(upload, FALSE);
 }
 
 
-static void u_upload_release_buffer(struct u_upload_mgr *upload)
+static void
+u_upload_release_buffer(struct u_upload_mgr *upload)
 {
    /* Unmap and unreference the upload buffer. */
    upload_unmap_internal(upload, TRUE);
-   pipe_resource_reference( &upload->buffer, NULL );
+   pipe_resource_reference(&upload->buffer, NULL);
 }
 
 
-void u_upload_destroy( struct u_upload_mgr *upload )
+void
+u_upload_destroy(struct u_upload_mgr *upload)
 {
-   u_upload_release_buffer( upload );
-   FREE( upload );
+   u_upload_release_buffer(upload);
+   FREE(upload);
 }
 
 
 static void
-u_upload_alloc_buffer(struct u_upload_mgr *upload,
-                      unsigned min_size)
+u_upload_alloc_buffer(struct u_upload_mgr *upload, unsigned min_size)
 {
    struct pipe_screen *screen = upload->pipe->screen;
    struct pipe_resource buffer;
@@ -148,9 +158,9 @@
 
    /* Release the old buffer, if present:
     */
-   u_upload_release_buffer( upload );
+   u_upload_release_buffer(upload);
 
-   /* Allocate a new one: 
+   /* Allocate a new one:
     */
    size = align(MAX2(upload->default_size, min_size), 4096);
 
@@ -225,7 +235,7 @@
                                           offset,
                                           buffer_size - offset,
                                           upload->map_flags,
-					  &upload->transfer);
+                                          &upload->transfer);
       if (unlikely(!upload->map)) {
          upload->transfer = NULL;
          *out_offset = ~0;
@@ -249,13 +259,14 @@
    upload->offset = offset + size;
 }
 
-void u_upload_data(struct u_upload_mgr *upload,
-                   unsigned min_out_offset,
-                   unsigned size,
-                   unsigned alignment,
-                   const void *data,
-                   unsigned *out_offset,
-                   struct pipe_resource **outbuf)
+void
+u_upload_data(struct u_upload_mgr *upload,
+              unsigned min_out_offset,
+              unsigned size,
+              unsigned alignment,
+              const void *data,
+              unsigned *out_offset,
+              struct pipe_resource **outbuf)
 {
    uint8_t *ptr;
 
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index 45382917..536467e 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -62,6 +62,13 @@
 u_upload_create_default(struct pipe_context *pipe);
 
 /**
+ * Create an uploader with the same parameters as another one, but using
+ * the given pipe_context instead.
+ */
+struct u_upload_mgr *
+u_upload_clone(struct pipe_context *pipe, struct u_upload_mgr *upload);
+
+/**
  * Destroy the upload manager.
  */
 void u_upload_destroy( struct u_upload_mgr *upload );
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 7d4a44b..6dc8bc7 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -162,9 +162,6 @@
    uint32_t dirty_real_vb_mask; /* which buffers are dirty since the last
                                    call of set_vertex_buffers */
 
-   /* The index buffer. */
-   struct pipe_index_buffer index_buffer;
-
    /* Vertex elements. */
    struct u_vbuf_elements *ve, *ve_saved;
 
@@ -369,21 +366,17 @@
 {
    struct pipe_screen *screen = mgr->pipe->screen;
    unsigned i;
-   unsigned num_vb = screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
-                                              PIPE_SHADER_CAP_MAX_INPUTS);
-
-   mgr->pipe->set_index_buffer(mgr->pipe, NULL);
-   pipe_resource_reference(&mgr->index_buffer.buffer, NULL);
+   const unsigned num_vb = screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
+                                                    PIPE_SHADER_CAP_MAX_INPUTS);
 
    mgr->pipe->set_vertex_buffers(mgr->pipe, 0, num_vb, NULL);
 
-   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
-      pipe_resource_reference(&mgr->vertex_buffer[i].buffer, NULL);
-   }
-   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
-      pipe_resource_reference(&mgr->real_vertex_buffer[i].buffer, NULL);
-   }
-   pipe_resource_reference(&mgr->aux_vertex_buffer_saved.buffer, NULL);
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
+      pipe_vertex_buffer_unreference(&mgr->vertex_buffer[i]);
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
+      pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[i]);
+
+   pipe_vertex_buffer_unreference(&mgr->aux_vertex_buffer_saved);
 
    translate_cache_destroy(mgr->translate_cache);
    cso_cache_delete(mgr->cso_cache);
@@ -392,10 +385,10 @@
 
 static enum pipe_error
 u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
+                         const struct pipe_draw_info *info,
                          unsigned vb_mask, unsigned out_vb,
                          int start_vertex, unsigned num_vertices,
-                         int start_index, unsigned num_indices, int min_index,
-                         boolean unroll_indices)
+                         int min_index, boolean unroll_indices)
 {
    struct translate *tr;
    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0};
@@ -417,19 +410,19 @@
       vb = &mgr->vertex_buffer[i];
       offset = vb->buffer_offset + vb->stride * start_vertex;
 
-      if (vb->user_buffer) {
-         map = (uint8_t*)vb->user_buffer + offset;
+      if (vb->is_user_buffer) {
+         map = (uint8_t*)vb->buffer.user + offset;
       } else {
          unsigned size = vb->stride ? num_vertices * vb->stride
                                     : sizeof(double)*4;
 
-         if (offset + size > vb->buffer->width0) {
+         if (offset + size > vb->buffer.resource->width0) {
             /* Don't try to map past end of buffer.  This often happens when
              * we're translating an attribute that's at offset > 0 from the
              * start of the vertex.  If we'd subtract attrib's offset from
              * the size, this probably wouldn't happen.
              */
-            size = vb->buffer->width0 - offset;
+            size = vb->buffer.resource->width0 - offset;
 
             /* Also adjust num_vertices.  A common user error is to call
              * glDrawRangeElements() with incorrect 'end' argument.  The 'end
@@ -441,7 +434,7 @@
             num_vertices = (size + vb->stride - 1) / vb->stride;
          }
 
-         map = pipe_buffer_map_range(mgr->pipe, vb->buffer, offset, size,
+         map = pipe_buffer_map_range(mgr->pipe, vb->buffer.resource, offset, size,
                                      PIPE_TRANSFER_READ, &vb_transfer[i]);
       }
 
@@ -455,38 +448,35 @@
 
    /* Translate. */
    if (unroll_indices) {
-      struct pipe_index_buffer *ib = &mgr->index_buffer;
       struct pipe_transfer *transfer = NULL;
-      unsigned offset = ib->offset + start_index * ib->index_size;
+      const unsigned offset = info->start * info->index_size;
       uint8_t *map;
 
-      assert((ib->buffer || ib->user_buffer) && ib->index_size);
-
       /* Create and map the output buffer. */
       u_upload_alloc(mgr->pipe->stream_uploader, 0,
-                     key->output_stride * num_indices, 4,
+                     key->output_stride * info->count, 4,
                      &out_offset, &out_buffer,
                      (void**)&out_map);
       if (!out_buffer)
          return PIPE_ERROR_OUT_OF_MEMORY;
 
-      if (ib->user_buffer) {
-         map = (uint8_t*)ib->user_buffer + offset;
+      if (info->has_user_indices) {
+         map = (uint8_t*)info->index.user + offset;
       } else {
-         map = pipe_buffer_map_range(mgr->pipe, ib->buffer, offset,
-                                     num_indices * ib->index_size,
+         map = pipe_buffer_map_range(mgr->pipe, info->index.resource, offset,
+                                     info->count * info->index_size,
                                      PIPE_TRANSFER_READ, &transfer);
       }
 
-      switch (ib->index_size) {
+      switch (info->index_size) {
       case 4:
-         tr->run_elts(tr, (unsigned*)map, num_indices, 0, 0, out_map);
+         tr->run_elts(tr, (unsigned*)map, info->count, 0, 0, out_map);
          break;
       case 2:
-         tr->run_elts16(tr, (uint16_t*)map, num_indices, 0, 0, out_map);
+         tr->run_elts16(tr, (uint16_t*)map, info->count, 0, 0, out_map);
          break;
       case 1:
-         tr->run_elts8(tr, map, num_indices, 0, 0, out_map);
+         tr->run_elts8(tr, map, info->count, 0, 0, out_map);
          break;
       }
 
@@ -524,8 +514,8 @@
 
    /* Move the buffer reference. */
    pipe_resource_reference(
-      &mgr->real_vertex_buffer[out_vb].buffer, NULL);
-   mgr->real_vertex_buffer[out_vb].buffer = out_buffer;
+      &mgr->real_vertex_buffer[out_vb].buffer.resource, NULL);
+   mgr->real_vertex_buffer[out_vb].buffer.resource = out_buffer;
 
    return PIPE_OK;
 }
@@ -571,28 +561,27 @@
 
 static boolean
 u_vbuf_translate_begin(struct u_vbuf *mgr,
+                       const struct pipe_draw_info *info,
                        int start_vertex, unsigned num_vertices,
-                       int start_instance, unsigned num_instances,
-                       int start_index, unsigned num_indices, int min_index,
-                       boolean unroll_indices)
+                       int min_index, boolean unroll_indices)
 {
    unsigned mask[VB_NUM] = {0};
    struct translate_key key[VB_NUM];
    unsigned elem_index[VB_NUM][PIPE_MAX_ATTRIBS]; /* ... into key.elements */
    unsigned i, type;
-   unsigned incompatible_vb_mask = mgr->incompatible_vb_mask &
-                                   mgr->ve->used_vb_mask;
+   const unsigned incompatible_vb_mask = mgr->incompatible_vb_mask &
+                                         mgr->ve->used_vb_mask;
 
-   int start[VB_NUM] = {
-      start_vertex,     /* VERTEX */
-      start_instance,   /* INSTANCE */
-      0                 /* CONST */
+   const int start[VB_NUM] = {
+      start_vertex,           /* VERTEX */
+      info->start_instance,   /* INSTANCE */
+      0                       /* CONST */
    };
 
-   unsigned num[VB_NUM] = {
-      num_vertices,     /* VERTEX */
-      num_instances,    /* INSTANCE */
-      1                 /* CONST */
+   const unsigned num[VB_NUM] = {
+      num_vertices,           /* VERTEX */
+      info->instance_count,   /* INSTANCE */
+      1                       /* CONST */
    };
 
    memset(key, 0, sizeof(key));
@@ -679,10 +668,9 @@
    for (type = 0; type < VB_NUM; type++) {
       if (key[type].nr_elements) {
          enum pipe_error err;
-         err = u_vbuf_translate_buffers(mgr, &key[type], mask[type],
+         err = u_vbuf_translate_buffers(mgr, &key[type], info, mask[type],
                                         mgr->fallback_vbs[type],
-                                        start[type], num[type],
-                                        start_index, num_indices, min_index,
+                                        start[type], num[type], min_index,
                                         unroll_indices && type == VB_VERTEX);
          if (err != PIPE_OK)
             return FALSE;
@@ -735,7 +723,7 @@
    for (i = 0; i < VB_NUM; i++) {
       unsigned vb = mgr->fallback_vbs[i];
       if (vb != ~0u) {
-         pipe_resource_reference(&mgr->real_vertex_buffer[vb].buffer, NULL);
+         pipe_resource_reference(&mgr->real_vertex_buffer[vb].buffer.resource, NULL);
          mgr->fallback_vbs[i] = ~0;
 
          /* This will cause the buffer to be unbound in the driver later. */
@@ -828,7 +816,7 @@
    uint32_t incompatible_vb_mask = 0;
    /* which buffers have a non-zero stride */
    uint32_t nonzero_stride_vb_mask = 0;
-   uint32_t mask = ~(((1ull << count) - 1) << start_slot);
+   const uint32_t mask = ~(((1ull << count) - 1) << start_slot);
 
    /* Zero out the bits we are going to rewrite completely. */
    mgr->user_vb_mask &= mask;
@@ -844,8 +832,8 @@
       for (i = 0; i < count; i++) {
          unsigned dst_index = start_slot + i;
 
-         pipe_resource_reference(&mgr->vertex_buffer[dst_index].buffer, NULL);
-         pipe_resource_reference(&mgr->real_vertex_buffer[dst_index].buffer,
+         pipe_vertex_buffer_unreference(&mgr->vertex_buffer[dst_index]);
+         pipe_resource_reference(&mgr->real_vertex_buffer[dst_index].buffer.resource,
                                  NULL);
       }
 
@@ -859,18 +847,13 @@
       struct pipe_vertex_buffer *orig_vb = &mgr->vertex_buffer[dst_index];
       struct pipe_vertex_buffer *real_vb = &mgr->real_vertex_buffer[dst_index];
 
-      if (!vb->buffer && !vb->user_buffer) {
-         pipe_resource_reference(&orig_vb->buffer, NULL);
-         pipe_resource_reference(&real_vb->buffer, NULL);
-         real_vb->user_buffer = NULL;
+      if (!vb->buffer.resource) {
+         pipe_vertex_buffer_unreference(orig_vb);
+         pipe_vertex_buffer_unreference(real_vb);
          continue;
       }
 
-      pipe_resource_reference(&orig_vb->buffer, vb->buffer);
-      orig_vb->user_buffer = vb->user_buffer;
-
-      real_vb->buffer_offset = orig_vb->buffer_offset = vb->buffer_offset;
-      real_vb->stride = orig_vb->stride = vb->stride;
+      pipe_vertex_buffer_reference(orig_vb, vb);
 
       if (vb->stride) {
          nonzero_stride_vb_mask |= 1 << dst_index;
@@ -880,18 +863,23 @@
       if ((!mgr->caps.buffer_offset_unaligned && vb->buffer_offset % 4 != 0) ||
           (!mgr->caps.buffer_stride_unaligned && vb->stride % 4 != 0)) {
          incompatible_vb_mask |= 1 << dst_index;
-         pipe_resource_reference(&real_vb->buffer, NULL);
+         real_vb->buffer_offset = vb->buffer_offset;
+         real_vb->stride = vb->stride;
+         pipe_vertex_buffer_unreference(real_vb);
+         real_vb->is_user_buffer = false;
          continue;
       }
 
-      if (!mgr->caps.user_vertex_buffers && vb->user_buffer) {
+      if (!mgr->caps.user_vertex_buffers && vb->is_user_buffer) {
          user_vb_mask |= 1 << dst_index;
-         pipe_resource_reference(&real_vb->buffer, NULL);
+         real_vb->buffer_offset = vb->buffer_offset;
+         real_vb->stride = vb->stride;
+         pipe_vertex_buffer_unreference(real_vb);
+         real_vb->is_user_buffer = false;
          continue;
       }
 
-      pipe_resource_reference(&real_vb->buffer, vb->buffer);
-      real_vb->user_buffer = vb->user_buffer;
+      pipe_vertex_buffer_reference(real_vb, vb);
    }
 
    mgr->user_vb_mask |= user_vb_mask;
@@ -904,22 +892,6 @@
    mgr->dirty_real_vb_mask |= ~mask;
 }
 
-void u_vbuf_set_index_buffer(struct u_vbuf *mgr,
-                             const struct pipe_index_buffer *ib)
-{
-   struct pipe_context *pipe = mgr->pipe;
-
-   if (ib) {
-      assert(ib->offset % ib->index_size == 0);
-      pipe_resource_reference(&mgr->index_buffer.buffer, ib->buffer);
-      memcpy(&mgr->index_buffer, ib, sizeof(*ib));
-   } else {
-      pipe_resource_reference(&mgr->index_buffer.buffer, NULL);
-   }
-
-   pipe->set_index_buffer(pipe, ib);
-}
-
 static enum pipe_error
 u_vbuf_upload_buffers(struct u_vbuf *mgr,
                       int start_vertex, unsigned num_vertices,
@@ -927,7 +899,7 @@
 {
    unsigned i;
    unsigned nr_velems = mgr->ve->count;
-   struct pipe_vertex_element *velems =
+   const struct pipe_vertex_element *velems =
          mgr->using_translate ? mgr->fallback_velems : mgr->ve->ve;
    unsigned start_offset[PIPE_MAX_ATTRIBS];
    unsigned end_offset[PIPE_MAX_ATTRIBS];
@@ -935,7 +907,7 @@
 
    /* Determine how much data needs to be uploaded. */
    for (i = 0; i < nr_velems; i++) {
-      struct pipe_vertex_element *velem = &velems[i];
+      const struct pipe_vertex_element *velem = &velems[i];
       unsigned index = velem->vertex_buffer_index;
       struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
       unsigned instance_div, first, size, index_bit;
@@ -947,7 +919,7 @@
          continue;
       }
 
-      if (!vb->user_buffer) {
+      if (!vb->is_user_buffer) {
          continue;
       }
 
@@ -997,11 +969,11 @@
       assert(start < end);
 
       real_vb = &mgr->real_vertex_buffer[i];
-      ptr = mgr->vertex_buffer[i].user_buffer;
+      ptr = mgr->vertex_buffer[i].buffer.user;
 
-      u_upload_data(mgr->pipe->stream_uploader, start, end - start, 4, ptr + start,
-                    &real_vb->buffer_offset, &real_vb->buffer);
-      if (!real_vb->buffer)
+      u_upload_data(mgr->pipe->stream_uploader, start, end - start, 4,
+                    ptr + start, &real_vb->buffer_offset, &real_vb->buffer.resource);
+      if (!real_vb->buffer.resource)
          return PIPE_ERROR_OUT_OF_MEMORY;
 
       real_vb->buffer_offset -= start;
@@ -1038,42 +1010,38 @@
 }
 
 static void u_vbuf_get_minmax_index(struct pipe_context *pipe,
-                                    struct pipe_index_buffer *ib,
-                                    boolean primitive_restart,
-                                    unsigned restart_index,
-                                    unsigned start, unsigned count,
-                                    int *out_min_index,
-                                    int *out_max_index)
+                                    const struct pipe_draw_info *info,
+                                    int *out_min_index, int *out_max_index)
 {
    struct pipe_transfer *transfer = NULL;
    const void *indices;
    unsigned i;
 
-   if (ib->user_buffer) {
-      indices = (uint8_t*)ib->user_buffer +
-                ib->offset + start * ib->index_size;
+   if (info->has_user_indices) {
+      indices = (uint8_t*)info->index.user +
+                info->start * info->index_size;
    } else {
-      indices = pipe_buffer_map_range(pipe, ib->buffer,
-                                      ib->offset + start * ib->index_size,
-                                      count * ib->index_size,
+      indices = pipe_buffer_map_range(pipe, info->index.resource,
+                                      info->start * info->index_size,
+                                      info->count * info->index_size,
                                       PIPE_TRANSFER_READ, &transfer);
    }
 
-   switch (ib->index_size) {
+   switch (info->index_size) {
    case 4: {
       const unsigned *ui_indices = (const unsigned*)indices;
       unsigned max_ui = 0;
       unsigned min_ui = ~0U;
-      if (primitive_restart) {
-         for (i = 0; i < count; i++) {
-            if (ui_indices[i] != restart_index) {
+      if (info->primitive_restart) {
+         for (i = 0; i < info->count; i++) {
+            if (ui_indices[i] != info->restart_index) {
                if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
                if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
             }
          }
       }
       else {
-         for (i = 0; i < count; i++) {
+         for (i = 0; i < info->count; i++) {
             if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
             if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
          }
@@ -1086,16 +1054,16 @@
       const unsigned short *us_indices = (const unsigned short*)indices;
       unsigned max_us = 0;
       unsigned min_us = ~0U;
-      if (primitive_restart) {
-         for (i = 0; i < count; i++) {
-            if (us_indices[i] != restart_index) {
+      if (info->primitive_restart) {
+         for (i = 0; i < info->count; i++) {
+            if (us_indices[i] != info->restart_index) {
                if (us_indices[i] > max_us) max_us = us_indices[i];
                if (us_indices[i] < min_us) min_us = us_indices[i];
             }
          }
       }
       else {
-         for (i = 0; i < count; i++) {
+         for (i = 0; i < info->count; i++) {
             if (us_indices[i] > max_us) max_us = us_indices[i];
             if (us_indices[i] < min_us) min_us = us_indices[i];
          }
@@ -1108,16 +1076,16 @@
       const unsigned char *ub_indices = (const unsigned char*)indices;
       unsigned max_ub = 0;
       unsigned min_ub = ~0U;
-      if (primitive_restart) {
-         for (i = 0; i < count; i++) {
-            if (ub_indices[i] != restart_index) {
+      if (info->primitive_restart) {
+         for (i = 0; i < info->count; i++) {
+            if (ub_indices[i] != info->restart_index) {
                if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
                if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
             }
          }
       }
       else {
-         for (i = 0; i < count; i++) {
+         for (i = 0; i < info->count; i++) {
             if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
             if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
          }
@@ -1156,9 +1124,10 @@
    int start_vertex, min_index;
    unsigned num_vertices;
    boolean unroll_indices = FALSE;
-   uint32_t used_vb_mask = mgr->ve->used_vb_mask;
+   const uint32_t used_vb_mask = mgr->ve->used_vb_mask;
    uint32_t user_vb_mask = mgr->user_vb_mask & used_vb_mask;
-   uint32_t incompatible_vb_mask = mgr->incompatible_vb_mask & used_vb_mask;
+   const uint32_t incompatible_vb_mask =
+      mgr->incompatible_vb_mask & used_vb_mask;
    struct pipe_draw_info new_info;
 
    /* Normal draw. No fallback and no user buffers. */
@@ -1182,16 +1151,16 @@
       struct pipe_transfer *transfer = NULL;
       int *data;
 
-      if (new_info.indexed) {
-         data = pipe_buffer_map_range(pipe, new_info.indirect,
-                                      new_info.indirect_offset, 20,
+      if (new_info.index_size) {
+         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
+                                      new_info.indirect->offset, 20,
                                       PIPE_TRANSFER_READ, &transfer);
          new_info.index_bias = data[3];
          new_info.start_instance = data[4];
       }
       else {
-         data = pipe_buffer_map_range(pipe, new_info.indirect,
-                                      new_info.indirect_offset, 16,
+         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
+                                      new_info.indirect->offset, 16,
                                       PIPE_TRANSFER_READ, &transfer);
          new_info.start_instance = data[3];
       }
@@ -1203,7 +1172,7 @@
       new_info.indirect = NULL;
    }
 
-   if (new_info.indexed) {
+   if (new_info.index_size) {
       /* See if anything needs to be done for per-vertex attribs. */
       if (u_vbuf_need_minmax_index(mgr)) {
          int max_index;
@@ -1212,10 +1181,8 @@
             min_index = new_info.min_index;
             max_index = new_info.max_index;
          } else {
-            u_vbuf_get_minmax_index(mgr->pipe, &mgr->index_buffer,
-                                    new_info.primitive_restart,
-                                    new_info.restart_index, new_info.start,
-                                    new_info.count, &min_index, &max_index);
+            u_vbuf_get_minmax_index(mgr->pipe, &new_info,
+                                    &min_index, &max_index);
          }
 
          assert(min_index <= max_index);
@@ -1251,16 +1218,14 @@
    if (unroll_indices ||
        incompatible_vb_mask ||
        mgr->ve->incompatible_elem_mask) {
-      if (!u_vbuf_translate_begin(mgr, start_vertex, num_vertices,
-                                  new_info.start_instance,
-                                  new_info.instance_count, new_info.start,
-                                  new_info.count, min_index, unroll_indices)) {
+      if (!u_vbuf_translate_begin(mgr, &new_info, start_vertex, num_vertices,
+                                  min_index, unroll_indices)) {
          debug_warn_once("u_vbuf_translate_begin() failed");
          return;
       }
 
       if (unroll_indices) {
-         new_info.indexed = FALSE;
+         new_info.index_size = 0;
          new_info.index_bias = 0;
          new_info.min_index = 0;
          new_info.max_index = new_info.count - 1;
@@ -1334,16 +1299,13 @@
 
 void u_vbuf_save_aux_vertex_buffer_slot(struct u_vbuf *mgr)
 {
-   struct pipe_vertex_buffer *vb =
-         &mgr->vertex_buffer[mgr->aux_vertex_buffer_slot];
-
-   pipe_resource_reference(&mgr->aux_vertex_buffer_saved.buffer, vb->buffer);
-   memcpy(&mgr->aux_vertex_buffer_saved, vb, sizeof(*vb));
+   pipe_vertex_buffer_reference(&mgr->aux_vertex_buffer_saved,
+                           &mgr->vertex_buffer[mgr->aux_vertex_buffer_slot]);
 }
 
 void u_vbuf_restore_aux_vertex_buffer_slot(struct u_vbuf *mgr)
 {
    u_vbuf_set_vertex_buffers(mgr, mgr->aux_vertex_buffer_slot, 1,
                              &mgr->aux_vertex_buffer_saved);
-   pipe_resource_reference(&mgr->aux_vertex_buffer_saved.buffer, NULL);
+   pipe_vertex_buffer_unreference(&mgr->aux_vertex_buffer_saved);
 }
diff --git a/src/gallium/auxiliary/util/u_vbuf.h b/src/gallium/auxiliary/util/u_vbuf.h
index ddfa844..d070452 100644
--- a/src/gallium/auxiliary/util/u_vbuf.h
+++ b/src/gallium/auxiliary/util/u_vbuf.h
@@ -72,8 +72,6 @@
 void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr,
                                unsigned start_slot, unsigned count,
                                const struct pipe_vertex_buffer *bufs);
-void u_vbuf_set_index_buffer(struct u_vbuf *mgr,
-                             const struct pipe_index_buffer *ib);
 void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info);
 
 /* Save/restore functionality. */
diff --git a/src/gallium/auxiliary/vl/vl_bicubic_filter.c b/src/gallium/auxiliary/vl/vl_bicubic_filter.c
index efd8a1c..a3dc6c8 100644
--- a/src/gallium/auxiliary/vl/vl_bicubic_filter.c
+++ b/src/gallium/auxiliary/vl/vl_bicubic_filter.c
@@ -295,7 +295,7 @@
       goto error_sampler;
 
    filter->quad = vl_vb_upload_quads(pipe);
-   if(!filter->quad.buffer)
+   if(!filter->quad.buffer.resource)
       goto error_quad;
 
    memset(&ve, 0, sizeof(ve));
@@ -349,7 +349,7 @@
    pipe->delete_vertex_elements_state(pipe, filter->ves);
 
 error_ves:
-   pipe_resource_reference(&filter->quad.buffer, NULL);
+   pipe_resource_reference(&filter->quad.buffer.resource, NULL);
 
 error_quad:
    pipe->delete_sampler_state(pipe, filter->sampler);
@@ -373,7 +373,7 @@
    filter->pipe->delete_blend_state(filter->pipe, filter->blend);
    filter->pipe->delete_rasterizer_state(filter->pipe, filter->rs_state);
    filter->pipe->delete_vertex_elements_state(filter->pipe, filter->ves);
-   pipe_resource_reference(&filter->quad.buffer, NULL);
+   pipe_resource_reference(&filter->quad.buffer.resource, NULL);
 
    filter->pipe->delete_vs_state(filter->pipe, filter->vs);
    filter->pipe->delete_fs_state(filter->pipe, filter->fs);
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
index 693d685..a79bf11 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -605,7 +605,8 @@
     */
    c->vertex_buf.stride = sizeof(struct vertex2f) + sizeof(struct vertex4f) * 2;
    c->vertex_buf.buffer_offset = 0;
-   c->vertex_buf.buffer = NULL;
+   c->vertex_buf.buffer.resource = NULL;
+   c->vertex_buf.is_user_buffer = false;
 
    vertex_elems[0].src_offset = 0;
    vertex_elems[0].instance_divisor = 0;
@@ -630,7 +631,7 @@
    assert(c);
 
    c->pipe->delete_vertex_elements_state(c->pipe, c->vertex_elems_state);
-   pipe_resource_reference(&c->vertex_buf.buffer, NULL);
+   pipe_resource_reference(&c->vertex_buf.buffer.resource, NULL);
 }
 
 static inline struct u_rect
@@ -812,7 +813,7 @@
    u_upload_alloc(c->pipe->stream_uploader, 0,
                   c->vertex_buf.stride * VL_COMPOSITOR_MAX_LAYERS * 4, /* size */
                   4, /* alignment */
-                  &c->vertex_buf.buffer_offset, &c->vertex_buf.buffer,
+                  &c->vertex_buf.buffer_offset, &c->vertex_buf.buffer.resource,
                   (void**)&vb);
 
    for (i = 0; i < VL_COMPOSITOR_MAX_LAYERS; i++) {
diff --git a/src/gallium/auxiliary/vl/vl_deint_filter.c b/src/gallium/auxiliary/vl/vl_deint_filter.c
index 2eec5cb..d2c48bd 100644
--- a/src/gallium/auxiliary/vl/vl_deint_filter.c
+++ b/src/gallium/auxiliary/vl/vl_deint_filter.c
@@ -308,7 +308,7 @@
       goto error_sampler;
 
    filter->quad = vl_vb_upload_quads(pipe);
-   if(!filter->quad.buffer)
+   if(!filter->quad.buffer.resource)
       goto error_quad;
 
    memset(&ve, 0, sizeof(ve));
@@ -361,7 +361,7 @@
    pipe->delete_vertex_elements_state(pipe, filter->ves);
 
 error_ves:
-   pipe_resource_reference(&filter->quad.buffer, NULL);
+   pipe_resource_reference(&filter->quad.buffer.resource, NULL);
 
 error_quad:
    pipe->delete_sampler_state(pipe, filter->sampler);
@@ -396,7 +396,7 @@
    filter->pipe->delete_blend_state(filter->pipe, filter->blend[2]);
    filter->pipe->delete_rasterizer_state(filter->pipe, filter->rs_state);
    filter->pipe->delete_vertex_elements_state(filter->pipe, filter->ves);
-   pipe_resource_reference(&filter->quad.buffer, NULL);
+   pipe_resource_reference(&filter->quad.buffer.resource, NULL);
 
    filter->pipe->delete_vs_state(filter->pipe, filter->vs);
    filter->pipe->delete_fs_state(filter->pipe, filter->fs_copy_top);
diff --git a/src/gallium/auxiliary/vl/vl_matrix_filter.c b/src/gallium/auxiliary/vl/vl_matrix_filter.c
index b498d1f..1dacc7c 100644
--- a/src/gallium/auxiliary/vl/vl_matrix_filter.c
+++ b/src/gallium/auxiliary/vl/vl_matrix_filter.c
@@ -184,7 +184,7 @@
       goto error_sampler;
 
    filter->quad = vl_vb_upload_quads(pipe);
-   if(!filter->quad.buffer)
+   if(!filter->quad.buffer.resource)
       goto error_quad;
 
    memset(&ve, 0, sizeof(ve));
@@ -233,7 +233,7 @@
    pipe->delete_vertex_elements_state(pipe, filter->ves);
 
 error_ves:
-   pipe_resource_reference(&filter->quad.buffer, NULL);
+   pipe_resource_reference(&filter->quad.buffer.resource, NULL);
 
 error_quad:
    pipe->delete_sampler_state(pipe, filter->sampler);
@@ -257,7 +257,7 @@
    filter->pipe->delete_blend_state(filter->pipe, filter->blend);
    filter->pipe->delete_rasterizer_state(filter->pipe, filter->rs_state);
    filter->pipe->delete_vertex_elements_state(filter->pipe, filter->ves);
-   pipe_resource_reference(&filter->quad.buffer, NULL);
+   pipe_resource_reference(&filter->quad.buffer.resource, NULL);
 
    filter->pipe->delete_vs_state(filter->pipe, filter->vs);
    filter->pipe->delete_fs_state(filter->pipe, filter->fs);
diff --git a/src/gallium/auxiliary/vl/vl_median_filter.c b/src/gallium/auxiliary/vl/vl_median_filter.c
index 0183b875..273afaf 100644
--- a/src/gallium/auxiliary/vl/vl_median_filter.c
+++ b/src/gallium/auxiliary/vl/vl_median_filter.c
@@ -295,7 +295,7 @@
       goto error_sampler;
 
    filter->quad = vl_vb_upload_quads(pipe);
-   if(!filter->quad.buffer)
+   if(!filter->quad.buffer.resource)
       goto error_quad;
 
    memset(&ve, 0, sizeof(ve));
@@ -337,7 +337,7 @@
    pipe->delete_vertex_elements_state(pipe, filter->ves);
 
 error_ves:
-   pipe_resource_reference(&filter->quad.buffer, NULL);
+   pipe_resource_reference(&filter->quad.buffer.resource, NULL);
 
 error_quad:
    pipe->delete_sampler_state(pipe, filter->sampler);
@@ -361,7 +361,7 @@
    filter->pipe->delete_blend_state(filter->pipe, filter->blend);
    filter->pipe->delete_rasterizer_state(filter->pipe, filter->rs_state);
    filter->pipe->delete_vertex_elements_state(filter->pipe, filter->ves);
-   pipe_resource_reference(&filter->quad.buffer, NULL);
+   pipe_resource_reference(&filter->quad.buffer.resource, NULL);
 
    filter->pipe->delete_vs_state(filter->pipe, filter->vs);
    filter->pipe->delete_fs_state(filter->pipe, filter->fs);
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
index db62b44..8a2dae3 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
@@ -509,8 +509,8 @@
    dec->context->delete_vertex_elements_state(dec->context, dec->ves_ycbcr);
    dec->context->delete_vertex_elements_state(dec->context, dec->ves_mv);
 
-   pipe_resource_reference(&dec->quads.buffer, NULL);
-   pipe_resource_reference(&dec->pos.buffer, NULL);
+   pipe_resource_reference(&dec->quads.buffer.resource, NULL);
+   pipe_resource_reference(&dec->pos.buffer.resource, NULL);
 
    pipe_sampler_view_reference(&dec->zscan_linear, NULL);
    pipe_sampler_view_reference(&dec->zscan_normal, NULL);
diff --git a/src/gallium/auxiliary/vl/vl_vertex_buffers.c b/src/gallium/auxiliary/vl/vl_vertex_buffers.c
index 1721227..45a9bad 100644
--- a/src/gallium/auxiliary/vl/vl_vertex_buffers.c
+++ b/src/gallium/auxiliary/vl/vl_vertex_buffers.c
@@ -49,23 +49,23 @@
    /* create buffer */
    quad.stride = sizeof(struct vertex2f);
    quad.buffer_offset = 0;
-   quad.buffer = pipe_buffer_create
+   quad.buffer.resource = pipe_buffer_create
    (
       pipe->screen,
       PIPE_BIND_VERTEX_BUFFER,
       PIPE_USAGE_DEFAULT,
       sizeof(struct vertex2f) * 4
    );
-   quad.user_buffer = NULL;
+   quad.is_user_buffer = false;
 
-   if(!quad.buffer)
+   if(!quad.buffer.resource)
       return quad;
 
    /* and fill it */
    v = pipe_buffer_map
    (
       pipe,
-      quad.buffer,
+      quad.buffer.resource,
       PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
       &buf_transfer
    );
@@ -94,23 +94,23 @@
    /* create buffer */
    pos.stride = sizeof(struct vertex2s);
    pos.buffer_offset = 0;
-   pos.buffer = pipe_buffer_create
+   pos.buffer.resource = pipe_buffer_create
    (
       pipe->screen,
       PIPE_BIND_VERTEX_BUFFER,
       PIPE_USAGE_DEFAULT,
       sizeof(struct vertex2s) * width * height
    );
-   pos.user_buffer = NULL;
+   pos.is_user_buffer = false;
 
-   if(!pos.buffer)
+   if(!pos.buffer.resource)
       return pos;
 
    /* and fill it */
    v = pipe_buffer_map
    (
       pipe,
-      pos.buffer,
+      pos.buffer.resource,
       PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
       &buf_transfer
    );
@@ -268,8 +268,8 @@
 
    buf.stride = sizeof(struct vl_ycbcr_block);
    buf.buffer_offset = 0;
-   buf.buffer = buffer->ycbcr[component].resource;
-   buf.user_buffer = NULL;
+   buf.buffer.resource = buffer->ycbcr[component].resource;
+   buf.is_user_buffer = false;
 
    return buf;
 }
@@ -283,8 +283,8 @@
 
    buf.stride = sizeof(struct vl_motionvector);
    buf.buffer_offset = 0;
-   buf.buffer = buffer->mv[motionvector].resource;
-   buf.user_buffer = NULL;
+   buf.buffer.resource = buffer->mv[motionvector].resource;
+   buf.is_user_buffer = false;
 
    return buf;
 }
diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri.c b/src/gallium/auxiliary/vl/vl_winsys_dri.c
index b4fb47e..043483b 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c
@@ -406,7 +406,7 @@
       goto free_authenticate;
 
    if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd))
-      scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev);
+      scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev, 0);
 
    if (!scrn->base.pscreen)
       goto release_pipe;
diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri3.c b/src/gallium/auxiliary/vl/vl_winsys_dri3.c
index a810dea..68bac44 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri3.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri3.c
@@ -89,7 +89,7 @@
    int64_t last_ust, ns_frame, last_msc, next_msc;
 
    bool flushed;
-   int is_different_gpu;
+   bool is_different_gpu;
 };
 
 static void
@@ -817,13 +817,13 @@
    free(geom_reply);
 
    if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd))
-      scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev);
+      scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev, 0);
 
    if (!scrn->base.pscreen)
       goto release_pipe;
 
    scrn->pipe = scrn->base.pscreen->context_create(scrn->base.pscreen,
-                                                   &scrn->base, 0);
+                                                   NULL, 0);
    if (!scrn->pipe)
        goto no_context;
 
diff --git a/src/gallium/auxiliary/vl/vl_winsys_drm.c b/src/gallium/auxiliary/vl/vl_winsys_drm.c
index df8809c..ebde5b8 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_drm.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c
@@ -52,7 +52,7 @@
       goto free_screen;
 
    if (pipe_loader_drm_probe_fd(&vscreen->dev, new_fd))
-      vscreen->pscreen = pipe_loader_create_screen(vscreen->dev);
+      vscreen->pscreen = pipe_loader_create_screen(vscreen->dev, 0);
 
    if (!vscreen->pscreen)
       goto release_pipe;
diff --git a/src/gallium/auxiliary/vl/vl_zscan.c b/src/gallium/auxiliary/vl/vl_zscan.c
index 24d6452..75013c4 100644
--- a/src/gallium/auxiliary/vl/vl_zscan.c
+++ b/src/gallium/auxiliary/vl/vl_zscan.c
@@ -95,6 +95,27 @@
    38,46,54,62,39,47,55,63
 };
 
+const int vl_zscan_h265_up_right_diagonal_16[] =
+{
+   /* Up-right diagonal scan order for 4x4 blocks - see H.265 section 6.5.3. */
+    0,  4,  1,  8,  5,  2, 12,  9,
+    6,  3, 13, 10,  7, 14, 11, 15,
+};
+
+const int vl_zscan_h265_up_right_diagonal[] =
+{
+   /* Up-right diagonal scan order for 8x8 blocks - see H.265 section 6.5.3. */
+    0,  8,  1, 16,  9,  2, 24, 17,
+   10,  3, 32, 25, 18, 11,  4, 40,
+   33, 26, 19, 12,  5, 48, 41, 34,
+   27, 20, 13,  6, 56, 49, 42, 35,
+   28, 21, 14,  7, 57, 50, 43, 36,
+   29, 22, 15, 58, 51, 44, 37, 30,
+   23, 59, 52, 45, 38, 31, 60, 53,
+   46, 39, 61, 54, 47, 62, 55, 63,
+};
+
+
 static void *
 create_vert_shader(struct vl_zscan *zscan)
 {
diff --git a/src/gallium/auxiliary/vl/vl_zscan.h b/src/gallium/auxiliary/vl/vl_zscan.h
index 268cf0a..292152e 100644
--- a/src/gallium/auxiliary/vl/vl_zscan.h
+++ b/src/gallium/auxiliary/vl/vl_zscan.h
@@ -68,6 +68,8 @@
 extern const int vl_zscan_linear[];
 extern const int vl_zscan_normal[];
 extern const int vl_zscan_alternate[];
+extern const int vl_zscan_h265_up_right_diagonal_16[];
+extern const int vl_zscan_h265_up_right_diagonal[];
 
 struct pipe_sampler_view *
 vl_zscan_layout(struct pipe_context *pipe, const int layout[64], unsigned blocks_per_line);
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index 5949ff2..a46131c 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -53,8 +53,6 @@
 
 * ``set_vertex_buffers``
 
-* ``set_index_buffer``
-
 
 Non-CSO State
 ^^^^^^^^^^^^^
@@ -101,6 +99,14 @@
   various debug messages, eventually reported via KHR_debug and
   similar mechanisms.
 
+Samplers
+^^^^^^^^
+
+pipe_sampler_state objects control how textures are sampled (coordinate
+wrap modes, interpolation modes, etc).  Note that samplers are not used
+for texture buffer objects.  That is, pipe_context::bind_sampler_views()
+will not bind a sampler if the corresponding sampler view refers to a
+PIPE_BUFFER resource.
 
 Sampler Views
 ^^^^^^^^^^^^^
@@ -290,8 +296,8 @@
 Every instance with instanceID in the range between ``start_instance`` and
 ``start_instance``+``instance_count``-1, inclusive, will be drawn.
 
-If there is an index buffer bound, and ``indexed`` field is true, all vertex
-indices will be looked up in the index buffer.
+If ``index_size`` != 0, all vertex indices will be looked up from the index
+buffer.
 
 In indexed draw, ``min_index`` and ``max_index`` respectively provide a lower
 and upper bound of the indices contained in the index buffer inside the range
@@ -759,6 +765,26 @@
 * ``set_device_reset_callback`` sets a callback which will be called when
   a device reset is detected. The callback is only called synchronously.
 
+Bindless
+^^^^^^^^
+
+If PIPE_CAP_BINDLESS_TEXTURE is TRUE, the following ``pipe_context`` functions
+are used to create/delete bindless handles, and to make them resident in the
+current context when they are going to be used by shaders.
+
+* ``create_texture_handle`` creates a 64-bit unsigned integer texture handle
+  that is going to be directly used in shaders.
+* ``delete_texture_handle`` deletes a 64-bit unsigned integer texture handle.
+* ``make_texture_handle_resident`` makes a 64-bit unsigned texture handle
+  resident in the current context to be accessible by shaders for texture
+  mapping.
+* ``create_image_handle`` creates a 64-bit unsigned integer image handle that
+  is going to be directly used in shaders.
+* ``delete_image_handle`` deletes a 64-bit unsigned integer image handle.
+* ``make_image_handle_resident`` makes a 64-bit unsigned integer image handle
+  resident in the current context to be accessible by shaders for image loads,
+  stores and atomic operations.
+
 Using several contexts
 ----------------------
 
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index bb2803a..32da228 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -217,7 +217,7 @@
   pipe_draw_info::indirect_stride and ::indirect_count
 * ``PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS``: Whether the driver supports
   taking the number of indirect draws from a separate parameter
-  buffer, see pipe_draw_info::indirect_params.
+  buffer, see pipe_draw_indirect_info::indirect_draw_count.
 * ``PIPE_CAP_TGSI_FS_FINE_DERIVATIVE``: Whether the fragment shader supports
   the FINE versions of DDX/DDY.
 * ``PIPE_CAP_VENDOR_ID``: The vendor ID of the underlying hardware. If it's
@@ -389,6 +389,13 @@
 * ``PIPE_CAP_TGSI_TES_LAYER_VIEWPORT``: Whether ``TGSI_SEMANTIC_LAYER`` and
   ``TGSI_SEMANTIC_VIEWPORT_INDEX`` are supported as tessellation evaluation
   shader outputs.
+* ``PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX``: Whether a buffer with just
+  PIPE_BIND_CONSTANT_BUFFER can be legally passed to set_vertex_buffers.
+* ``PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION``: As the name says.
+* ``PIPE_CAP_POST_DEPTH_COVERAGE``: whether
+  ``TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE`` is supported.
+* ``PIPE_CAP_BINDLESS_TEXTURE``: Whether bindless texture operations are
+  supported.
 
 
 .. _pipe_capf:
@@ -489,6 +496,9 @@
   cost than this value should be lowered by the state tracker for better
   performance. This is a tunable for the GLSL compiler and the behavior is
   specific to the compiler.
+* ``PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS``: Whether the merge registers
+  TGSI pass is skipped. This might reduce code size and register pressure if
+  the underlying driver has a real backend compiler.
 
 
 .. _pipe_compute_cap:
@@ -616,17 +626,26 @@
 
 Returns an identifying name for the screen.
 
+The returned string should remain valid and immutable for the lifetime of
+pipe_screen.
+
 get_vendor
 ^^^^^^^^^^
 
 Returns the screen vendor.
 
+The returned string should remain valid and immutable for the lifetime of
+pipe_screen.
+
 get_device_vendor
 ^^^^^^^^^^^^^^^^^
 
 Returns the actual vendor of the device driving the screen
 (as opposed to the driver vendor).
 
+The returned string should remain valid and immutable for the lifetime of
+pipe_screen.
+
 .. _get_param:
 
 get_param
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index e740476..0dd2ac0 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -26,7 +26,13 @@
 Modifiers
 ^^^^^^^^^^^^^^^
 
-TGSI supports modifiers on inputs (as well as saturate modifier on instructions).
+TGSI supports modifiers on inputs (as well as saturate and precise modifiers
+on instructions).
+
+For arithmetic instructions having a precise modifier, certain optimizations
+which may alter the result are disallowed. Example: *add(mul(a,b),c)* can't be
+optimized to TGSI_OPCODE_MAD, because some hardware only supports the fused
+MAD instruction.
 
 For inputs which have a floating point type, both absolute value and
 negation modifiers are supported (with absolute value being applied
@@ -237,6 +243,9 @@
 
 .. opcode:: MAD - Multiply And Add
 
+Perform a * b + c. The implementation is free to decide whether there is an
+intermediate rounding step or not.
+
 .. math::
 
   dst.x = src0.x \times src1.x + src2.x
@@ -939,7 +948,8 @@
 .. opcode:: TXF - Texel Fetch
 
   As per NV_gpu_shader4, extract a single texel from a specified texture
-  image. The source sampler may not be a CUBE or SHADOW.  src 0 is a
+  image or PIPE_BUFFER resource. The source sampler may not be a CUBE or
+  SHADOW.  src 0 is a
   four-component signed integer vector used to identify the single texel
   accessed. 3 components + level.  Just like texture instructions, an optional
   offset vector is provided, which is subject to various driver restrictions
@@ -982,7 +992,9 @@
 .. opcode:: TXQS - Texture Samples Query
 
   This retrieves the number of samples in the texture, and stores it
-  into the x component. The other components are undefined.
+  into the x component as an unsigned integer. The other components are
+  undefined.  If the texture is not multisampled, this function returns
+  (1, undef, undef, undef).
 
 .. math::
 
@@ -2538,15 +2550,46 @@
 
 .. opcode:: SAMPLE_POS
 
-  Query the position of a given sample.  dst receives float4 (x, y, 0, 0)
-  indicated where the sample is located. If the resource is not a multi-sample
-  resource and not a render target, the result is 0.
+  Query the position of a sample in the given resource or render target
+  when per-sample fragment shading is in effect.
+
+  Syntax: ``SAMPLE_POS dst, source, sample_index``
+
+  dst receives float4 (x, y, undef, undef) indicating where the sample is
+  located. Sample locations are in the range [0, 1] where 0.5 is the center
+  of the fragment.
+
+  source is either a sampler view (to indicate a shader resource) or temp
+  register (to indicate the render target).  The source register may have
+  an optional swizzle to apply to the returned result.
+
+  sample_index is an integer scalar indicating which sample position is to
+  be queried.
+
+  If per-sample shading is not in effect or the source resource or render
+  target is not multisampled, the result is (0.5, 0.5, undef, undef).
+
+  NOTE: no driver has implemented this opcode yet (and no state tracker
+  emits it).  This information is subject to change.
 
 .. opcode:: SAMPLE_INFO
 
-  dst receives number of samples in x.  If the resource is not a multi-sample
-  resource and not a render target, the result is 0.
+  Query the number of samples in a multisampled resource or render target.
 
+  Syntax: ``SAMPLE_INFO dst, source``
+
+  dst receives int4 (n, 0, 0, 0) where n is the number of samples in a
+  resource or the render target.
+
+  source is either a sampler view (to indicate a shader resource) or temp
+  register (to indicate the render target).  The source register may have
+  an optional swizzle to apply to the returned result.
+
+  If per-sample shading is not in effect or the source resource or render
+  target is not multisampled, the result is (1, 0, 0, 0).
+
+  NOTE: no driver has implemented this opcode yet (and no state tracker
+  emits it).  This information is subject to change.
 
 .. _resourceopcodes:
 
@@ -3284,22 +3327,33 @@
 """"""""""""""""""""""
 
 For fragment shaders, this semantic label indicates that a system value
-contains the current sample id (i.e. gl_SampleID).
-This is an integer value, and only the X component is used.
+contains the current sample id (i.e. gl_SampleID) as an unsigned int.
+Only the X component is used.  If per-sample shading is not enabled,
+the result is (0, undef, undef, undef).
 
 TGSI_SEMANTIC_SAMPLEPOS
 """""""""""""""""""""""
 
-For fragment shaders, this semantic label indicates that a system value
-contains the current sample's position (i.e. gl_SamplePosition). Only the X
-and Y values are used.
+For fragment shaders, this semantic label indicates that a system
+value contains the current sample's position as float4(x, y, undef, undef)
+in the render target (i.e. gl_SamplePosition) when per-sample shading
+is in effect.  Position values are in the range [0, 1] where 0.5 is
+the center of the fragment.
 
 TGSI_SEMANTIC_SAMPLEMASK
 """"""""""""""""""""""""
 
-For fragment shaders, this semantic label indicates that an output contains
-the sample mask used to disable further sample processing
-(i.e. gl_SampleMask). Only the X value is used, up to 32x MS.
+For fragment shaders, this semantic label can be applied to either a
+shader system value input or output.
+
+For a system value, the sample mask indicates the set of samples covered by
+the current primitive.  If MSAA is not enabled, the value is (1, 0, 0, 0).
+
+For an output, the sample mask is used to disable further sample processing.
+
+For both, the register type is uint[4] but only the X component is used
+(i.e. gl_SampleMask[0]). Each bit corresponds to one sample position (up
+to 32x MSAA is supported).
 
 TGSI_SEMANTIC_INVOCATIONID
 """"""""""""""""""""""""""
@@ -3720,6 +3774,13 @@
 mismatch between shaders, then it is unspecified whether this behavior
 will be enabled.
 
+FS_POST_DEPTH_COVERAGE
+""""""""""""""""""""""
+
+When enabled, the input for TGSI_SEMANTIC_SAMPLEMASK will exclude samples
+that have failed the depth/stencil tests. This is only valid when
+FS_EARLY_DEPTH_STENCIL is also specified.
+
 
 Texture Sampling and Texture Formats
 ------------------------------------
diff --git a/src/gallium/drivers/ddebug/dd_context.c b/src/gallium/drivers/ddebug/dd_context.c
index f9d3de5..6b1ddc9 100644
--- a/src/gallium/drivers/ddebug/dd_context.c
+++ b/src/gallium/drivers/ddebug/dd_context.c
@@ -151,6 +151,21 @@
 }
 
 static void
+dd_context_get_query_result_resource(struct pipe_context *_pipe,
+                                     struct pipe_query *query,
+                                     boolean wait,
+                                     enum pipe_query_value_type result_type,
+                                     int index,
+                                     struct pipe_resource *resource,
+                                     unsigned offset)
+{
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+
+   pipe->get_query_result_resource(pipe, dd_query_unwrap(query), wait,
+                                   result_type, index, resource, offset);
+}
+
+static void
 dd_context_set_active_query_state(struct pipe_context *_pipe, boolean enable)
 {
    struct pipe_context *pipe = dd_context(_pipe)->pipe;
@@ -299,7 +314,8 @@
       struct dd_state *hstate = state; \
    \
       pipe->delete_##name##_state(pipe, hstate->cso); \
-      tgsi_free_tokens(hstate->state.shader.tokens); \
+      if (hstate->state.shader.type == PIPE_SHADER_IR_TGSI) \
+         tgsi_free_tokens(hstate->state.shader.tokens); \
       FREE(hstate); \
    }
 
@@ -315,7 +331,8 @@
          return NULL; \
       hstate->cso = pipe->create_##name##_state(pipe, state); \
       hstate->state.shader = *state; \
-      hstate->state.shader.tokens = tgsi_dup_tokens(state->tokens); \
+      if (hstate->state.shader.type == PIPE_SHADER_IR_TGSI) \
+         hstate->state.shader.tokens = tgsi_dup_tokens(state->tokens); \
       return hstate; \
    } \
     \
@@ -338,6 +355,8 @@
       return NULL;
    hstate->cso = pipe->create_compute_state(pipe, state);
 
+   hstate->state.shader.type = state->ir_type;
+
    if (state->ir_type == PIPE_SHADER_IR_TGSI)
       hstate->state.shader.tokens = tgsi_dup_tokens(state->prog);
 
@@ -561,17 +580,6 @@
 }
 
 static void
-dd_context_set_index_buffer(struct pipe_context *_pipe,
-                            const struct pipe_index_buffer *ib)
-{
-   struct dd_context *dctx = dd_context(_pipe);
-   struct pipe_context *pipe = dctx->pipe;
-
-   safe_memcpy(&dctx->draw_state.index_buffer, ib, sizeof(*ib));
-   pipe->set_index_buffer(pipe, ib);
-}
-
-static void
 dd_context_set_stream_output_targets(struct pipe_context *_pipe,
                                      unsigned num_targets,
                                      struct pipe_stream_output_target **tgs,
@@ -758,6 +766,60 @@
    return pipe->dump_debug_state(pipe, stream, flags);
 }
 
+static uint64_t
+dd_context_create_texture_handle(struct pipe_context *_pipe,
+                                 struct pipe_sampler_view *view,
+                                 const struct pipe_sampler_state *state)
+{
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+
+   return pipe->create_texture_handle(pipe, view, state);
+}
+
+static void
+dd_context_delete_texture_handle(struct pipe_context *_pipe, uint64_t handle)
+{
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+
+   pipe->delete_texture_handle(pipe, handle);
+}
+
+static void
+dd_context_make_texture_handle_resident(struct pipe_context *_pipe,
+                                        uint64_t handle, bool resident)
+{
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+
+   pipe->make_texture_handle_resident(pipe, handle, resident);
+}
+
+static uint64_t
+dd_context_create_image_handle(struct pipe_context *_pipe,
+                               const struct pipe_image_view *image)
+{
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+
+   return pipe->create_image_handle(pipe, image);
+}
+
+static void
+dd_context_delete_image_handle(struct pipe_context *_pipe, uint64_t handle)
+{
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+
+   pipe->delete_image_handle(pipe, handle);
+}
+
+static void
+dd_context_make_image_handle_resident(struct pipe_context *_pipe,
+                                      uint64_t handle, unsigned access,
+                                      bool resident)
+{
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+
+   pipe->make_image_handle_resident(pipe, handle, access, resident);
+}
+
 struct pipe_context *
 dd_context_create(struct dd_screen *dscreen, struct pipe_context *pipe)
 {
@@ -785,6 +847,7 @@
    CTX_INIT(begin_query);
    CTX_INIT(end_query);
    CTX_INIT(get_query_result);
+   CTX_INIT(get_query_result_resource);
    CTX_INIT(set_active_query_state);
    CTX_INIT(create_blend_state);
    CTX_INIT(bind_blend_state);
@@ -834,7 +897,6 @@
    CTX_INIT(set_shader_buffers);
    CTX_INIT(set_shader_images);
    CTX_INIT(set_vertex_buffers);
-   CTX_INIT(set_index_buffer);
    CTX_INIT(create_stream_output_target);
    CTX_INIT(stream_output_target_destroy);
    CTX_INIT(set_stream_output_targets);
@@ -860,6 +922,12 @@
    CTX_INIT(set_device_reset_callback);
    CTX_INIT(dump_debug_state);
    CTX_INIT(emit_string_marker);
+   CTX_INIT(create_texture_handle);
+   CTX_INIT(delete_texture_handle);
+   CTX_INIT(make_texture_handle_resident);
+   CTX_INIT(create_image_handle);
+   CTX_INIT(delete_image_handle);
+   CTX_INIT(make_image_handle_resident);
 
    dd_init_draw_functions(dctx);
 
diff --git a/src/gallium/drivers/ddebug/dd_draw.c b/src/gallium/drivers/ddebug/dd_draw.c
index 6e96c72..9c95d25 100644
--- a/src/gallium/drivers/ddebug/dd_draw.c
+++ b/src/gallium/drivers/ddebug/dd_draw.c
@@ -98,8 +98,13 @@
    else
       return 1;
 
-   tgsi_scan_shader(tokens, &info);
-   return info.writes_viewport_index ? PIPE_MAX_VIEWPORTS : 1;
+   if (tokens) {
+      tgsi_scan_shader(tokens, &info);
+      if (info.writes_viewport_index)
+         return PIPE_MAX_VIEWPORTS;
+   }
+
+   return 1;
 }
 
 #define COLOR_RESET	"\033[0m"
@@ -197,9 +202,9 @@
 }
 
 static void
-dd_dump_draw_vbo(struct dd_draw_state *dstate, struct pipe_draw_info *info, FILE *f)
+dd_dump_shader(struct dd_draw_state *dstate, enum pipe_shader_type sh, FILE *f)
 {
-   int sh, i;
+   int i;
    const char *shader_str[PIPE_SHADER_TYPES];
 
    shader_str[PIPE_SHADER_VERTEX] = "VERTEX";
@@ -209,19 +214,95 @@
    shader_str[PIPE_SHADER_FRAGMENT] = "FRAGMENT";
    shader_str[PIPE_SHADER_COMPUTE] = "COMPUTE";
 
+   if (sh == PIPE_SHADER_TESS_CTRL &&
+       !dstate->shaders[PIPE_SHADER_TESS_CTRL] &&
+       dstate->shaders[PIPE_SHADER_TESS_EVAL])
+      fprintf(f, "tess_state: {default_outer_level = {%f, %f, %f, %f}, "
+              "default_inner_level = {%f, %f}}\n",
+              dstate->tess_default_levels[0],
+              dstate->tess_default_levels[1],
+              dstate->tess_default_levels[2],
+              dstate->tess_default_levels[3],
+              dstate->tess_default_levels[4],
+              dstate->tess_default_levels[5]);
+
+   if (sh == PIPE_SHADER_FRAGMENT)
+      if (dstate->rs) {
+         unsigned num_viewports = dd_num_active_viewports(dstate);
+
+         if (dstate->rs->state.rs.clip_plane_enable)
+            DUMP(clip_state, &dstate->clip_state);
+
+         for (i = 0; i < num_viewports; i++)
+            DUMP_I(viewport_state, &dstate->viewports[i], i);
+
+         if (dstate->rs->state.rs.scissor)
+            for (i = 0; i < num_viewports; i++)
+               DUMP_I(scissor_state, &dstate->scissors[i], i);
+
+         DUMP(rasterizer_state, &dstate->rs->state.rs);
+
+         if (dstate->rs->state.rs.poly_stipple_enable)
+            DUMP(poly_stipple, &dstate->polygon_stipple);
+         fprintf(f, "\n");
+      }
+
+   if (!dstate->shaders[sh])
+      return;
+
+   fprintf(f, COLOR_SHADER "begin shader: %s" COLOR_RESET "\n", shader_str[sh]);
+   DUMP(shader_state, &dstate->shaders[sh]->state.shader);
+
+   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++)
+      if (dstate->constant_buffers[sh][i].buffer ||
+            dstate->constant_buffers[sh][i].user_buffer) {
+         DUMP_I(constant_buffer, &dstate->constant_buffers[sh][i], i);
+         if (dstate->constant_buffers[sh][i].buffer)
+            DUMP_M(resource, &dstate->constant_buffers[sh][i], buffer);
+      }
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+      if (dstate->sampler_states[sh][i])
+         DUMP_I(sampler_state, &dstate->sampler_states[sh][i]->state.sampler, i);
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+      if (dstate->sampler_views[sh][i]) {
+         DUMP_I(sampler_view, dstate->sampler_views[sh][i], i);
+         DUMP_M(resource, dstate->sampler_views[sh][i], texture);
+      }
+
+   for (i = 0; i < PIPE_MAX_SHADER_IMAGES; i++)
+      if (dstate->shader_images[sh][i].resource) {
+         DUMP_I(image_view, &dstate->shader_images[sh][i], i);
+         if (dstate->shader_images[sh][i].resource)
+            DUMP_M(resource, &dstate->shader_images[sh][i], resource);
+      }
+
+   for (i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++)
+      if (dstate->shader_buffers[sh][i].buffer) {
+         DUMP_I(shader_buffer, &dstate->shader_buffers[sh][i], i);
+         if (dstate->shader_buffers[sh][i].buffer)
+            DUMP_M(resource, &dstate->shader_buffers[sh][i], buffer);
+      }
+
+   fprintf(f, COLOR_SHADER "end shader: %s" COLOR_RESET "\n\n", shader_str[sh]);
+}
+
+static void
+dd_dump_draw_vbo(struct dd_draw_state *dstate, struct pipe_draw_info *info, FILE *f)
+{
+   int sh, i;
+
    DUMP(draw_info, info);
-   if (info->indexed) {
-      DUMP(index_buffer, &dstate->index_buffer);
-      if (dstate->index_buffer.buffer)
-         DUMP_M(resource, &dstate->index_buffer, buffer);
-   }
    if (info->count_from_stream_output)
       DUMP_M(stream_output_target, info,
              count_from_stream_output);
-   if (info->indirect)
-      DUMP_M(resource, info, indirect);
-   if (info->indirect_params)
-      DUMP_M(resource, info, indirect_params);
+   if (info->indirect) {
+      DUMP_M(resource, info, indirect->buffer);
+      if (info->indirect->indirect_draw_count)
+         DUMP_M(resource, info, indirect->indirect_draw_count);
+   }
+
    fprintf(f, "\n");
 
    /* TODO: dump active queries */
@@ -229,11 +310,10 @@
    dd_dump_render_condition(dstate, f);
 
    for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
-      if (dstate->vertex_buffers[i].buffer ||
-          dstate->vertex_buffers[i].user_buffer) {
+      if (dstate->vertex_buffers[i].buffer.resource) {
          DUMP_I(vertex_buffer, &dstate->vertex_buffers[i], i);
-         if (dstate->vertex_buffers[i].buffer)
-            DUMP_M(resource, &dstate->vertex_buffers[i], buffer);
+         if (!dstate->vertex_buffers[i].is_user_buffer)
+            DUMP_M(resource, &dstate->vertex_buffers[i], buffer.resource);
       }
 
    if (dstate->velems) {
@@ -258,78 +338,7 @@
       if (sh == PIPE_SHADER_COMPUTE)
          continue;
 
-      if (sh == PIPE_SHADER_TESS_CTRL &&
-          !dstate->shaders[PIPE_SHADER_TESS_CTRL] &&
-          dstate->shaders[PIPE_SHADER_TESS_EVAL])
-         fprintf(f, "tess_state: {default_outer_level = {%f, %f, %f, %f}, "
-                 "default_inner_level = {%f, %f}}\n",
-                 dstate->tess_default_levels[0],
-                 dstate->tess_default_levels[1],
-                 dstate->tess_default_levels[2],
-                 dstate->tess_default_levels[3],
-                 dstate->tess_default_levels[4],
-                 dstate->tess_default_levels[5]);
-
-      if (sh == PIPE_SHADER_FRAGMENT)
-         if (dstate->rs) {
-            unsigned num_viewports = dd_num_active_viewports(dstate);
-
-            if (dstate->rs->state.rs.clip_plane_enable)
-               DUMP(clip_state, &dstate->clip_state);
-
-            for (i = 0; i < num_viewports; i++)
-               DUMP_I(viewport_state, &dstate->viewports[i], i);
-
-            if (dstate->rs->state.rs.scissor)
-               for (i = 0; i < num_viewports; i++)
-                  DUMP_I(scissor_state, &dstate->scissors[i], i);
-
-            DUMP(rasterizer_state, &dstate->rs->state.rs);
-
-            if (dstate->rs->state.rs.poly_stipple_enable)
-               DUMP(poly_stipple, &dstate->polygon_stipple);
-            fprintf(f, "\n");
-         }
-
-      if (!dstate->shaders[sh])
-         continue;
-
-      fprintf(f, COLOR_SHADER "begin shader: %s" COLOR_RESET "\n", shader_str[sh]);
-      DUMP(shader_state, &dstate->shaders[sh]->state.shader);
-
-      for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++)
-         if (dstate->constant_buffers[sh][i].buffer ||
-             dstate->constant_buffers[sh][i].user_buffer) {
-            DUMP_I(constant_buffer, &dstate->constant_buffers[sh][i], i);
-            if (dstate->constant_buffers[sh][i].buffer)
-               DUMP_M(resource, &dstate->constant_buffers[sh][i], buffer);
-         }
-
-      for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-         if (dstate->sampler_states[sh][i])
-            DUMP_I(sampler_state, &dstate->sampler_states[sh][i]->state.sampler, i);
-
-      for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-         if (dstate->sampler_views[sh][i]) {
-            DUMP_I(sampler_view, dstate->sampler_views[sh][i], i);
-            DUMP_M(resource, dstate->sampler_views[sh][i], texture);
-         }
-
-      for (i = 0; i < PIPE_MAX_SHADER_IMAGES; i++)
-         if (dstate->shader_images[sh][i].resource) {
-            DUMP_I(image_view, &dstate->shader_images[sh][i], i);
-            if (dstate->shader_images[sh][i].resource)
-               DUMP_M(resource, &dstate->shader_images[sh][i], resource);
-         }
-
-      for (i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++)
-         if (dstate->shader_buffers[sh][i].buffer) {
-            DUMP_I(shader_buffer, &dstate->shader_buffers[sh][i], i);
-            if (dstate->shader_buffers[sh][i].buffer)
-               DUMP_M(resource, &dstate->shader_buffers[sh][i], buffer);
-         }
-
-      fprintf(f, COLOR_SHADER "end shader: %s" COLOR_RESET "\n\n", shader_str[sh]);
+      dd_dump_shader(dstate, sh, f);
    }
 
    if (dstate->dsa)
@@ -365,7 +374,11 @@
 dd_dump_launch_grid(struct dd_draw_state *dstate, struct pipe_grid_info *info, FILE *f)
 {
    fprintf(f, "%s:\n", __func__+8);
-   /* TODO */
+   DUMP(grid_info, info);
+   fprintf(f, "\n");
+
+   dd_dump_shader(dstate, PIPE_SHADER_COMPUTE, f);
+   fprintf(f, "\n");
 }
 
 static void
@@ -489,7 +502,7 @@
 {
    switch (call->type) {
    case CALL_DRAW_VBO:
-      dd_dump_draw_vbo(state, &call->info.draw_vbo, f);
+      dd_dump_draw_vbo(state, &call->info.draw_vbo.draw, f);
       break;
    case CALL_LAUNCH_GRID:
       dd_dump_launch_grid(state, &call->info.launch_grid, f);
@@ -608,9 +621,14 @@
 {
    switch (dst->type) {
    case CALL_DRAW_VBO:
-      pipe_so_target_reference(&dst->info.draw_vbo.count_from_stream_output, NULL);
-      pipe_resource_reference(&dst->info.draw_vbo.indirect, NULL);
-      pipe_resource_reference(&dst->info.draw_vbo.indirect_params, NULL);
+      pipe_so_target_reference(&dst->info.draw_vbo.draw.count_from_stream_output, NULL);
+      pipe_resource_reference(&dst->info.draw_vbo.indirect.buffer, NULL);
+      pipe_resource_reference(&dst->info.draw_vbo.indirect.indirect_draw_count, NULL);
+      if (dst->info.draw_vbo.draw.index_size &&
+          !dst->info.draw_vbo.draw.has_user_indices)
+         pipe_resource_reference(&dst->info.draw_vbo.draw.index.resource, NULL);
+      else
+         dst->info.draw_vbo.draw.index.user = NULL;
       break;
    case CALL_LAUNCH_GRID:
       pipe_resource_reference(&dst->info.launch_grid.indirect, NULL);
@@ -650,13 +668,30 @@
 
    switch (src->type) {
    case CALL_DRAW_VBO:
-      pipe_so_target_reference(&dst->info.draw_vbo.count_from_stream_output,
-                               src->info.draw_vbo.count_from_stream_output);
-      pipe_resource_reference(&dst->info.draw_vbo.indirect,
-                              src->info.draw_vbo.indirect);
-      pipe_resource_reference(&dst->info.draw_vbo.indirect_params,
-                              src->info.draw_vbo.indirect_params);
+      pipe_so_target_reference(&dst->info.draw_vbo.draw.count_from_stream_output,
+                               src->info.draw_vbo.draw.count_from_stream_output);
+      pipe_resource_reference(&dst->info.draw_vbo.indirect.buffer,
+                              src->info.draw_vbo.indirect.buffer);
+      pipe_resource_reference(&dst->info.draw_vbo.indirect.indirect_draw_count,
+                              src->info.draw_vbo.indirect.indirect_draw_count);
+
+      if (dst->info.draw_vbo.draw.index_size &&
+          !dst->info.draw_vbo.draw.has_user_indices)
+         pipe_resource_reference(&dst->info.draw_vbo.draw.index.resource, NULL);
+      else
+         dst->info.draw_vbo.draw.index.user = NULL;
+
+      if (src->info.draw_vbo.draw.index_size &&
+          !src->info.draw_vbo.draw.has_user_indices) {
+         pipe_resource_reference(&dst->info.draw_vbo.draw.index.resource,
+                                 src->info.draw_vbo.draw.index.resource);
+      }
+
       dst->info.draw_vbo = src->info.draw_vbo;
+      if (!src->info.draw_vbo.draw.indirect)
+         dst->info.draw_vbo.draw.indirect = NULL;
+      else
+         dst->info.draw_vbo.draw.indirect = &dst->info.draw_vbo.indirect;
       break;
    case CALL_LAUNCH_GRID:
       pipe_resource_reference(&dst->info.launch_grid.indirect,
@@ -711,8 +746,6 @@
    /* Just clear pointers to gallium objects. Don't clear the whole structure,
     * because it would kill performance with its size of 130 KB.
     */
-   memset(&state->base.index_buffer, 0,
-          sizeof(state->base.index_buffer));
    memset(state->base.vertex_buffers, 0,
           sizeof(state->base.vertex_buffers));
    memset(state->base.so_targets, 0,
@@ -750,10 +783,8 @@
    struct dd_draw_state *dst = &state->base;
    unsigned i,j;
 
-   util_set_index_buffer(&dst->index_buffer, NULL);
-
    for (i = 0; i < ARRAY_SIZE(dst->vertex_buffers); i++)
-      pipe_resource_reference(&dst->vertex_buffers[i].buffer, NULL);
+      pipe_vertex_buffer_unreference(&dst->vertex_buffers[i]);
    for (i = 0; i < ARRAY_SIZE(dst->so_targets); i++)
       pipe_so_target_reference(&dst->so_targets[i], NULL);
 
@@ -787,13 +818,9 @@
       dst->render_cond.query = NULL;
    }
 
-   util_set_index_buffer(&dst->index_buffer, &src->index_buffer);
-
    for (i = 0; i < ARRAY_SIZE(src->vertex_buffers); i++) {
-      pipe_resource_reference(&dst->vertex_buffers[i].buffer,
-                              src->vertex_buffers[i].buffer);
-      memcpy(&dst->vertex_buffers[i], &src->vertex_buffers[i],
-             sizeof(src->vertex_buffers[i]));
+      pipe_vertex_buffer_reference(&dst->vertex_buffers[i],
+                                   &src->vertex_buffers[i]);
    }
 
    dst->num_so_targets = src->num_so_targets;
@@ -809,8 +836,12 @@
 
       if (src->shaders[i]) {
          dst->shaders[i]->state.shader = src->shaders[i]->state.shader;
-         dst->shaders[i]->state.shader.tokens =
-            tgsi_dup_tokens(src->shaders[i]->state.shader.tokens);
+         if (src->shaders[i]->state.shader.tokens) {
+            dst->shaders[i]->state.shader.tokens =
+               tgsi_dup_tokens(src->shaders[i]->state.shader.tokens);
+         } else {
+            dst->shaders[i]->state.shader.ir.nir = NULL;
+         }
       } else {
          dst->shaders[i] = NULL;
       }
@@ -1164,7 +1195,13 @@
    struct dd_call call;
 
    call.type = CALL_DRAW_VBO;
-   call.info.draw_vbo = *info;
+   call.info.draw_vbo.draw = *info;
+   if (info->indirect) {
+      call.info.draw_vbo.indirect = *info->indirect;
+      call.info.draw_vbo.draw.indirect = &call.info.draw_vbo.indirect;
+   } else {
+      memset(&call.info.draw_vbo.indirect, 0, sizeof(*info->indirect));
+   }
 
    dd_before_draw(dctx);
    pipe->draw_vbo(pipe, info);
diff --git a/src/gallium/drivers/ddebug/dd_pipe.h b/src/gallium/drivers/ddebug/dd_pipe.h
index deae1f5..caad45b 100644
--- a/src/gallium/drivers/ddebug/dd_pipe.h
+++ b/src/gallium/drivers/ddebug/dd_pipe.h
@@ -104,12 +104,17 @@
    unsigned last_layer;
 };
 
+struct call_draw_info {
+   struct pipe_draw_info draw; /* by-value copy of the recorded draw info */
+   struct pipe_draw_indirect_info indirect; /* target of draw.indirect when set */
+};
+
 struct dd_call
 {
    enum call_type type;
 
    union {
-      struct pipe_draw_info draw_vbo;
+      struct call_draw_info draw_vbo;
       struct pipe_grid_info launch_grid;
       struct call_resource_copy_region resource_copy_region;
       struct pipe_blit_info blit;
@@ -151,7 +156,6 @@
       unsigned mode;
    } render_cond;
 
-   struct pipe_index_buffer index_buffer;
    struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
 
    unsigned num_so_targets;
diff --git a/src/gallium/drivers/ddebug/dd_screen.c b/src/gallium/drivers/ddebug/dd_screen.c
index 9642532..14e6f6b 100644
--- a/src/gallium/drivers/ddebug/dd_screen.c
+++ b/src/gallium/drivers/ddebug/dd_screen.c
@@ -55,6 +55,16 @@
    return screen->get_device_vendor(screen);
 }
 
+static const void *
+dd_screen_get_compiler_options(struct pipe_screen *_screen,
+                               enum pipe_shader_ir ir,
+                               enum pipe_shader_type shader)
+{
+   struct pipe_screen *screen = dd_screen(_screen)->screen;
+   /* Pass-through: return what the screen stored in dd_screen reports. */
+   return screen->get_compiler_options(screen, ir, shader);
+}
+
 static struct disk_cache *
 dd_screen_get_disk_shader_cache(struct pipe_screen *_screen)
 {
@@ -364,7 +374,7 @@
 
       if (sscanf(option+8, "%u", &apitrace_dump_call) != 1)
          return screen;
-   } else if (!strncmp(option, "pipelined", 8)) {
+   } else if (!strncmp(option, "pipelined", 9)) {
       mode = DD_DETECT_HANGS_PIPELINED;
 
       if (sscanf(option+10, "%u", &timeout) != 1)
@@ -411,6 +421,7 @@
    SCR_INIT(fence_finish);
    SCR_INIT(get_driver_query_info);
    SCR_INIT(get_driver_query_group_info);
+   SCR_INIT(get_compiler_options);
 
 #undef SCR_INIT
 
diff --git a/src/gallium/drivers/etnaviv/Android.mk b/src/gallium/drivers/etnaviv/Android.mk
new file mode 100644
index 0000000..6976d22
--- /dev/null
+++ b/src/gallium/drivers/etnaviv/Android.mk
@@ -0,0 +1,41 @@
+# Copyright (C) 2016 Linaro, Ltd, Rob Herring <robh@kernel.org>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+# Makefile.sources provides C_SOURCES, used as the source list below.
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+	$(C_SOURCES)
+
+LOCAL_SHARED_LIBRARIES := libdrm_etnaviv
+LOCAL_MODULE := libmesa_pipe_etnaviv
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+# Only when HAVE_GALLIUM_ETNAVIV is non-empty: add this driver to the gallium lists.
+ifneq ($(HAVE_GALLIUM_ETNAVIV),)
+GALLIUM_TARGET_DRIVERS += etnaviv
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_etnaviv)
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/drivers/etnaviv/etnaviv_blend.c b/src/gallium/drivers/etnaviv/etnaviv_blend.c
index 8ea09a3..6ed0e0f 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_blend.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_blend.c
@@ -129,3 +129,38 @@
 
    return true;
 }
+
+void
+etna_set_blend_color(struct pipe_context *pctx, const struct pipe_blend_color *bc)
+{
+   struct etna_context *ctx = etna_context(pctx);
+   struct compiled_blend_color *cs = &ctx->blend_color;
+   /* Keep the four float channels as given; only the dirty flag is raised. */
+   memcpy(cs->color, bc->color, sizeof(float) * 4);
+
+   ctx->dirty |= ETNA_DIRTY_BLEND_COLOR;
+}
+
+bool
+etna_update_blend_color(struct etna_context *ctx)
+{
+   struct pipe_framebuffer_state *pfb = &ctx->framebuffer_s;
+   struct compiled_blend_color *cs = &ctx->blend_color;
+   /* Pack the stored color into PE_ALPHA_BLEND_COLOR; always returns true. */
+   if (pfb->cbufs[0] &&
+       translate_rs_format_rb_swap(pfb->cbufs[0]->texture->format)) {
+      cs->PE_ALPHA_BLEND_COLOR = /* color[2] goes to R, color[0] to B */
+         VIVS_PE_ALPHA_BLEND_COLOR_R(etna_cfloat_to_uint8(cs->color[2])) |
+         VIVS_PE_ALPHA_BLEND_COLOR_G(etna_cfloat_to_uint8(cs->color[1])) |
+         VIVS_PE_ALPHA_BLEND_COLOR_B(etna_cfloat_to_uint8(cs->color[0])) |
+         VIVS_PE_ALPHA_BLEND_COLOR_A(etna_cfloat_to_uint8(cs->color[3]));
+   } else { /* no cbuf 0 or no swap: channels in stored order */
+      cs->PE_ALPHA_BLEND_COLOR =
+         VIVS_PE_ALPHA_BLEND_COLOR_R(etna_cfloat_to_uint8(cs->color[0])) |
+         VIVS_PE_ALPHA_BLEND_COLOR_G(etna_cfloat_to_uint8(cs->color[1])) |
+         VIVS_PE_ALPHA_BLEND_COLOR_B(etna_cfloat_to_uint8(cs->color[2])) |
+         VIVS_PE_ALPHA_BLEND_COLOR_A(etna_cfloat_to_uint8(cs->color[3]));
+   }
+
+   return true;
+}
diff --git a/src/gallium/drivers/etnaviv/etnaviv_blend.h b/src/gallium/drivers/etnaviv/etnaviv_blend.h
index e26864d..c219396 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_blend.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_blend.h
@@ -56,4 +56,10 @@
 bool
 etna_update_blend(struct etna_context *ctx);
 
+void
+etna_set_blend_color(struct pipe_context *pctx, const struct pipe_blend_color *bc);
+
+bool
+etna_update_blend_color(struct etna_context *ctx);
+
 #endif
diff --git a/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c b/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c
index 8b705f6..d73d0e3 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c
@@ -106,7 +106,7 @@
    union util_color uc;
    util_pack_color(rgba, format, &uc);
    if (util_format_get_blocksize(format) == 2)
-      return uc.ui[0] << 16 | uc.ui[0];
+      return uc.ui[0] << 16 | (uc.ui[0] & 0xffff);
    else
       return uc.ui[0];
 }
@@ -122,8 +122,7 @@
    if (surf->surf.ts_size) { /* TS: use precompiled clear command */
       ctx->framebuffer.TS_COLOR_CLEAR_VALUE = new_clear_value;
 
-      if (!DBG_ENABLED(ETNA_DBG_NO_AUTODISABLE) &&
-          VIV_FEATURE(ctx->screen, chipMinorFeatures1, AUTO_DISABLE)) {
+      if (VIV_FEATURE(ctx->screen, chipMinorFeatures1, AUTO_DISABLE)) {
          /* Set number of color tiles to be filled */
          etna_set_state(ctx->stream, VIVS_TS_COLOR_AUTO_DISABLE_COUNT,
                         surf->surf.padded_width * surf->surf.padded_height / 16);
@@ -182,8 +181,7 @@
    if (surf->surf.ts_size) { /* TS: use precompiled clear command */
       /* Set new clear depth value */
       ctx->framebuffer.TS_DEPTH_CLEAR_VALUE = new_clear_value;
-      if (!DBG_ENABLED(ETNA_DBG_NO_AUTODISABLE) &&
-          VIV_FEATURE(ctx->screen, chipMinorFeatures1, AUTO_DISABLE)) {
+      if (VIV_FEATURE(ctx->screen, chipMinorFeatures1, AUTO_DISABLE)) {
          /* Set number of depth tiles to be filled */
          etna_set_state(ctx->stream, VIVS_TS_DEPTH_AUTO_DISABLE_COUNT,
                         surf->surf.padded_width * surf->surf.padded_height / 16);
@@ -398,7 +396,7 @@
    }
 
    unsigned src_format = etna_compatible_rs_format(blit_info->src.format);
-   unsigned dst_format = etna_compatible_rs_format(blit_info->src.format);
+   unsigned dst_format = etna_compatible_rs_format(blit_info->dst.format);
    if (translate_rs_format(src_format) == ETNA_NO_MATCH ||
        translate_rs_format(dst_format) == ETNA_NO_MATCH ||
        blit_info->scissor_enable || blit_info->src.box.x != 0 ||
@@ -467,16 +465,24 @@
       ts_mem_config |= VIVS_TS_MEM_CONFIG_MSAA | msaa_format;
    }
 
-   uint32_t to_flush = 0;
-
-   if (src->base.bind & PIPE_BIND_RENDER_TARGET)
-      to_flush |= VIVS_GL_FLUSH_CACHE_COLOR;
-   if (src->base.bind & PIPE_BIND_DEPTH_STENCIL)
-      to_flush |= VIVS_GL_FLUSH_CACHE_DEPTH;
-
-   if (to_flush) {
-      etna_set_state(ctx->stream, VIVS_GL_FLUSH_CACHE, to_flush);
+   /* Always flush color and depth cache together before resolving. This works
+    * around artifacts that appear in some cases when scanning out a texture
+    * directly after it has been rendered to, such as rendering an animated web
+    * page in a QtWebEngine based WebView on GC2000. The artifacts look like
+    * the texture sampler samples zeroes instead of texture data in a small,
+    * irregular triangle in the lower right of each browser tile quad. Other
+    * attempts to avoid these artifacts, including a pipeline stall before the
+    * color flush or a TS cache flush afterwards, or flushing multiple times,
+    * with stalls before and after each flush, have shown no effect. */
+   if (src->base.bind & PIPE_BIND_RENDER_TARGET ||
+       src->base.bind & PIPE_BIND_DEPTH_STENCIL) {
+      etna_set_state(ctx->stream, VIVS_GL_FLUSH_CACHE,
+		     VIVS_GL_FLUSH_CACHE_COLOR | VIVS_GL_FLUSH_CACHE_DEPTH);
       etna_stall(ctx->stream, SYNC_RECIPIENT_RA, SYNC_RECIPIENT_PE);
+
+      if (src->levels[blit_info->src.level].ts_size &&
+          src->levels[blit_info->src.level].ts_valid)
+         etna_set_state(ctx->stream, VIVS_TS_FLUSH_CACHE, VIVS_TS_FLUSH_CACHE_FLUSH);
    }
 
    /* Set up color TS to source surface before blit, if needed */
@@ -540,8 +546,9 @@
 
 manual:
    if (src->layout == ETNA_LAYOUT_TILED && dst->layout == ETNA_LAYOUT_TILED) {
-      etna_resource_wait(pctx, dst);
-      etna_resource_wait(pctx, src);
+      if ((src->status & ETNA_PENDING_WRITE) ||
+          (dst->status & ETNA_PENDING_WRITE))
+         pctx->flush(pctx, NULL, 0);
       return etna_manual_blit(dst, dst_lev, dst_offset, src, src_lev, src_offset, blit_info);
    }
 
@@ -603,10 +610,10 @@
 {
    struct etna_resource *rsc = etna_resource(prsc);
 
-   if (rsc->scanout) {
-      if (etna_resource_older(etna_resource(rsc->scanout->prime), rsc)) {
-         etna_copy_resource(pctx, rsc->scanout->prime, prsc, 0, 0);
-         etna_resource(rsc->scanout->prime)->seqno = rsc->seqno;
+   if (rsc->external) {
+      if (etna_resource_older(etna_resource(rsc->external), rsc)) {
+         etna_copy_resource(pctx, rsc->external, prsc, 0, 0);
+         etna_resource(rsc->external)->seqno = rsc->seqno;
       }
    } else if (etna_resource_needs_flush(rsc)) {
       etna_copy_resource(pctx, prsc, prsc, 0, 0);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_compiler.c b/src/gallium/drivers/etnaviv/etnaviv_compiler.c
index af0f76b..165ab74 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_compiler.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_compiler.c
@@ -885,6 +885,8 @@
    default:
       assert(!"Invalid swizzle");
    }
+
+   unreachable("bad swizzle");
 }
 
 /* convert destination operand */
@@ -1389,12 +1391,27 @@
    else
       src_w = swizzle(src[0], SWIZZLE(W, W, W, W));
 
-   struct etna_inst ins[3] = { };
-   ins[0].opcode = INST_OPCODE_LOG;
-   ins[0].dst = etna_native_to_dst(inner_temp, INST_COMPS_X);
-   ins[0].src[2] = src_y;
+   if (c->specs->has_new_transcendentals) { /* Alternative LOG sequence */
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = INST_OPCODE_LOG,
+         .dst = etna_native_to_dst(inner_temp, INST_COMPS_X | INST_COMPS_Y),
+         .src[2] = src_y,
+         .tex = { .amode=1 }, /* Unknown bit needs to be set */
+      });
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = INST_OPCODE_MUL,
+         .dst = etna_native_to_dst(inner_temp, INST_COMPS_X),
+         .src[0] = etna_native_to_src(inner_temp, SWIZZLE(X, X, X, X)),
+         .src[1] = etna_native_to_src(inner_temp, SWIZZLE(Y, Y, Y, Y)),
+      });
+   } else {
+      struct etna_inst ins[3] = { };
+      ins[0].opcode = INST_OPCODE_LOG;
+      ins[0].dst = etna_native_to_dst(inner_temp, INST_COMPS_X);
+      ins[0].src[2] = src_y;
 
-   emit_inst(c, &ins[0]);
+      emit_inst(c, &ins[0]);
+   }
    emit_inst(c, &(struct etna_inst) {
       .opcode = INST_OPCODE_MUL,
       .sat = 0,
@@ -1450,7 +1467,7 @@
 trans_trig(const struct instr_translater *t, struct etna_compile *c,
            const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
 {
-   if (c->specs->has_new_sin_cos) { /* Alternative SIN/COS */
+   if (c->specs->has_new_transcendentals) { /* Alternative SIN/COS */
       /* On newer chips alternative SIN/COS instructions are implemented,
        * which:
        * - Need their input scaled by 1/pi instead of 2/pi
@@ -1613,6 +1630,40 @@
 }
 
 static void
+trans_lg2(const struct instr_translater *t, struct etna_compile *c,
+            const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
+{
+   if (c->specs->has_new_transcendentals) {
+      /* Newer chips implement an alternative LOG instruction that writes
+       * an x and a y component; the two must be multiplied together to
+       * obtain the final result.
+       */
+      struct etna_native_reg temp = etna_compile_get_inner_temp(c); /* only using .xy */
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = INST_OPCODE_LOG,
+         .sat = 0, /* saturation is applied on the final MUL only */
+         .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y),
+         .src[2] = src[0],
+         .tex = { .amode=1 }, /* Unknown bit needs to be set */
+      });
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = INST_OPCODE_MUL,
+         .sat = inst->Instruction.Saturate,
+         .dst = convert_dst(c, &inst->Dst[0]),
+         .src[0] = etna_native_to_src(temp, SWIZZLE(X, X, X, X)),
+         .src[1] = etna_native_to_src(temp, SWIZZLE(Y, Y, Y, Y)),
+      });
+   } else { /* otherwise a single LOG writes the destination directly */
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = INST_OPCODE_LOG,
+         .sat = inst->Instruction.Saturate,
+         .dst = convert_dst(c, &inst->Dst[0]),
+         .src[2] = src[0],
+      });
+   }
+}
+
+static void
 trans_dph(const struct instr_translater *t, struct etna_compile *c,
           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
 {
@@ -1753,7 +1804,7 @@
    INSTR(DST, trans_instr, .opc = INST_OPCODE_DST, .src = {0, 1, -1}),
    INSTR(MAD, trans_instr, .opc = INST_OPCODE_MAD, .src = {0, 1, 2}),
    INSTR(EX2, trans_instr, .opc = INST_OPCODE_EXP, .src = {2, -1, -1}),
-   INSTR(LG2, trans_instr, .opc = INST_OPCODE_LOG, .src = {2, -1, -1}),
+   INSTR(LG2, trans_lg2),
    INSTR(SQRT, trans_instr, .opc = INST_OPCODE_SQRT, .src = {2, -1, -1}),
    INSTR(FRC, trans_instr, .opc = INST_OPCODE_FRC, .src = {2, -1, -1}),
    INSTR(CEIL, trans_instr, .opc = INST_OPCODE_CEIL, .src = {2, -1, -1}),
@@ -2302,7 +2353,7 @@
    if (!c)
       return false;
 
-   memset(&c->lbl_usage, -1, ARRAY_SIZE(c->lbl_usage));
+   memset(&c->lbl_usage, -1, sizeof(c->lbl_usage));
 
    const struct tgsi_token *tokens = v->shader->tokens;
 
diff --git a/src/gallium/drivers/etnaviv/etnaviv_context.c b/src/gallium/drivers/etnaviv/etnaviv_context.c
index cfbc906..2ca09ce 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_context.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_context.c
@@ -92,7 +92,7 @@
     *   buffer state as dirty
     */
 
-   if (info->indexed) {
+   if (info->index_size) {
       uint32_t new_control = ctx->index_buffer.FE_INDEX_STREAM_CONTROL;
 
       if (info->primitive_restart)
@@ -149,12 +149,16 @@
    uint32_t draw_mode;
    unsigned i;
 
+   if (!info->count_from_stream_output && !info->indirect &&
+       !info->primitive_restart &&
+       !u_trim_pipe_prim(info->mode, (unsigned*)&info->count))
+      return;
+
    if (ctx->vertex_elements == NULL || ctx->vertex_elements->num_elements == 0)
       return; /* Nothing to do */
 
    if (!(ctx->prim_hwsupport & (1 << info->mode))) {
       struct primconvert_context *primconvert = ctx->primconvert;
-      util_primconvert_save_index_buffer(primconvert, &ctx->index_buffer.ib);
       util_primconvert_save_rasterizer_state(primconvert, ctx->rasterizer);
       util_primconvert_draw_vbo(primconvert, info);
       return;
@@ -173,18 +177,33 @@
    }
 
    /* Upload a user index buffer. */
-   struct pipe_index_buffer ibuffer_saved = {};
-   if (info->indexed && ctx->index_buffer.ib.user_buffer &&
-       !util_save_and_upload_index_buffer(pctx, info, &ctx->index_buffer.ib,
-                                          &ibuffer_saved)) {
-      BUG("Index buffer upload failed.");
-      return;
-   }
+   unsigned index_offset = 0;
+   struct pipe_resource *indexbuf = NULL;
 
-   if (info->indexed && !ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.bo) {
-      BUG("Unsupported or no index buffer");
-      return;
+   if (info->index_size) {
+      indexbuf = info->has_user_indices ? NULL : info->index.resource;
+      if (info->has_user_indices &&
+          !util_upload_index_buffer(pctx, info, &indexbuf, &index_offset)) {
+         BUG("Index buffer upload failed.");
+         return;
+      }
+
+      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.bo = etna_resource(indexbuf)->bo;
+      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.offset = index_offset;
+      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.flags = ETNA_RELOC_READ;
+      ctx->index_buffer.FE_INDEX_STREAM_CONTROL = translate_index_size(info->index_size);
+
+      if (!ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.bo) {
+         BUG("Unsupported or no index buffer");
+         return;
+      }
+   } else {
+      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.bo = 0;
+      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.offset = 0;
+      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.flags = 0;
+      ctx->index_buffer.FE_INDEX_STREAM_CONTROL = 0;
    }
+   ctx->dirty |= ETNA_DIRTY_INDEX_BUFFER;
 
    struct etna_shader_key key = {};
    struct etna_surface *cbuf = etna_surface(pfb->cbufs[0]);
@@ -229,12 +248,12 @@
 
    /* Mark VBOs as being read */
    for (i = 0; i < ctx->vertex_buffer.count; i++) {
-      assert(!ctx->vertex_buffer.vb[i].user_buffer);
-      resource_read(ctx, ctx->vertex_buffer.vb[i].buffer);
+      assert(!ctx->vertex_buffer.vb[i].is_user_buffer);
+      resource_read(ctx, ctx->vertex_buffer.vb[i].buffer.resource);
    }
 
    /* Mark index buffer as being read */
-   resource_read(ctx, ctx->index_buffer.ib.buffer);
+   resource_read(ctx, indexbuf);
 
    /* Mark textures as being read */
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
@@ -250,7 +269,7 @@
    /* First, sync state, then emit DRAW_PRIMITIVES or DRAW_INDEXED_PRIMITIVES */
    etna_emit_state(ctx);
 
-   if (info->indexed)
+   if (info->index_size)
       etna_draw_indexed_primitives(ctx->stream, draw_mode, info->start, prims, info->index_bias);
    else
       etna_draw_primitives(ctx->stream, draw_mode, info->start, prims);
@@ -269,8 +288,8 @@
       etna_resource(ctx->framebuffer.cbuf->texture)->seqno++;
    if (ctx->framebuffer.zsbuf)
       etna_resource(ctx->framebuffer.zsbuf->texture)->seqno++;
-   if (info->indexed && ibuffer_saved.user_buffer)
-      pctx->set_index_buffer(pctx, &ibuffer_saved);
+   if (info->index_size && indexbuf != info->index.resource)
+      pipe_resource_reference(&indexbuf, NULL);
 }
 
 static void
diff --git a/src/gallium/drivers/etnaviv/etnaviv_context.h b/src/gallium/drivers/etnaviv/etnaviv_context.h
index 56b57b5..2c9b24d 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_context.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_context.h
@@ -44,7 +44,6 @@
 struct etna_shader_variant;
 
 struct etna_index_buffer {
-   struct pipe_index_buffer ib;
    struct etna_reloc FE_INDEX_STREAM_BASE_ADDR;
    uint32_t FE_INDEX_STREAM_CONTROL;
    uint32_t FE_PRIMITIVE_RESTART_INDEX;
@@ -175,6 +174,7 @@
    struct {
       uint64_t prims_emitted;
       uint64_t draw_calls;
+      uint64_t rs_operations;
    } stats;
 
    struct pipe_debug_callback debug;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_emit.c b/src/gallium/drivers/etnaviv/etnaviv_emit.c
index 7ced5fc..bfff699 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_emit.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_emit.c
@@ -171,6 +171,8 @@
    struct etna_cmd_stream *stream = ctx->stream;
    struct etna_coalesce coalesce;
 
+   ctx->stats.rs_operations++;
+
    if (screen->specs.pixel_pipes == 1) {
       etna_cmd_stream_reserve(stream, 22);
       etna_coalesce_start(stream, &coalesce);
@@ -369,8 +371,7 @@
 
       /*03818*/ EMIT_STATE(GL_MULTI_SAMPLE_CONFIG, val);
    }
-   if (likely(dirty & (ETNA_DIRTY_INDEX_BUFFER)) &&
-       ctx->index_buffer.ib.buffer) {
+   if (likely(dirty & (ETNA_DIRTY_INDEX_BUFFER))) {
       /*00644*/ EMIT_STATE_RELOC(FE_INDEX_STREAM_BASE_ADDR, &ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR);
       /*00648*/ EMIT_STATE(FE_INDEX_STREAM_CONTROL, ctx->index_buffer.FE_INDEX_STREAM_CONTROL);
    }
diff --git a/src/gallium/drivers/etnaviv/etnaviv_fence.c b/src/gallium/drivers/etnaviv/etnaviv_fence.c
index 65402aa..d82708e 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_fence.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_fence.c
@@ -65,10 +65,8 @@
 etna_screen_fence_finish(struct pipe_screen *pscreen, struct pipe_context *ctx,
                          struct pipe_fence_handle *fence, uint64_t timeout)
 {
-   if (fence->fence_fd != -1) {
-      int ret = sync_wait(fence->fence_fd, timeout / 1000000);
-      return ret == 0;
-   }
+   if (fence->fence_fd != -1)
+	return !sync_wait(fence->fence_fd, timeout / 1000000);
 
    if (etna_pipe_wait_ns(fence->screen->pipe, fence->timestamp, timeout))
       return false;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_format.c b/src/gallium/drivers/etnaviv/etnaviv_format.c
index 7c24386..69e07bc 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_format.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_format.c
@@ -40,6 +40,7 @@
    unsigned tex;
    unsigned rs;
    boolean present;
+   const unsigned char tex_swiz[4];
 };
 
 #define RS_FORMAT_NONE ~0
@@ -51,22 +52,31 @@
 #define RS_FORMAT_X8B8G8R8    (RS_FORMAT_X8R8G8B8 | RS_FORMAT_RB_SWAP)
 #define RS_FORMAT_A8B8G8R8    (RS_FORMAT_A8R8G8B8 | RS_FORMAT_RB_SWAP)
 
+#define SWIZ(x,y,z,w) {    \
+   PIPE_SWIZZLE_##x,       \
+   PIPE_SWIZZLE_##y,       \
+   PIPE_SWIZZLE_##z,       \
+   PIPE_SWIZZLE_##w        \
+}
+
 /* vertex + texture */
-#define VT(pipe, vtxfmt, texfmt, rsfmt)                   \
+#define VT(pipe, vtxfmt, texfmt, texswiz, rsfmt)          \
    [PIPE_FORMAT_##pipe] = {                               \
       .vtx = VIVS_FE_VERTEX_ELEMENT_CONFIG_TYPE_##vtxfmt, \
       .tex = TEXTURE_FORMAT_##texfmt,                     \
       .rs = RS_FORMAT_##rsfmt,                            \
       .present = 1,                                       \
+      .tex_swiz = texswiz,                                \
    }
 
 /* texture-only */
-#define _T(pipe, fmt, rsfmt)       \
+#define _T(pipe, fmt, swiz, rsfmt) \
    [PIPE_FORMAT_##pipe] = {        \
       .vtx = ETNA_NO_MATCH,        \
       .tex = TEXTURE_FORMAT_##fmt, \
       .rs = RS_FORMAT_##rsfmt,     \
       .present = 1,                \
+      .tex_swiz = swiz,            \
    }
 
 /* vertex-only */
@@ -87,9 +97,9 @@
    V_(R8_USCALED, UNSIGNED_BYTE, NONE),
    V_(R8_SSCALED, BYTE,          NONE),
 
-   _T(A8_UNORM, A8, NONE),
-   _T(L8_UNORM, L8, NONE),
-   _T(I8_UNORM, I8, NONE),
+   _T(A8_UNORM, A8, SWIZ(X, Y, Z, W), NONE),
+   _T(L8_UNORM, L8, SWIZ(X, Y, Z, W), NONE),
+   _T(I8_UNORM, I8, SWIZ(X, Y, Z, W), NONE),
 
    /* 16-bit */
    V_(R16_UNORM,   UNSIGNED_SHORT, NONE),
@@ -100,17 +110,17 @@
    V_(R16_SSCALED, SHORT,          NONE),
    V_(R16_FLOAT,   HALF_FLOAT,     NONE),
 
-   _T(B4G4R4A4_UNORM, A4R4G4B4, A4R4G4B4),
-   _T(B4G4R4X4_UNORM, X4R4G4B4, X4R4G4B4),
+   _T(B4G4R4A4_UNORM, A4R4G4B4, SWIZ(X, Y, Z, W), A4R4G4B4),
+   _T(B4G4R4X4_UNORM, X4R4G4B4, SWIZ(X, Y, Z, W), X4R4G4B4),
 
-   _T(L8A8_UNORM, A8L8, NONE),
+   _T(L8A8_UNORM, A8L8, SWIZ(X, Y, Z, W), NONE),
 
-   _T(Z16_UNORM,      D16,      A4R4G4B4),
-   _T(B5G6R5_UNORM,   R5G6B5,   R5G6B5),
-   _T(B5G5R5A1_UNORM, A1R5G5B5, A1R5G5B5),
-   _T(B5G5R5X1_UNORM, X1R5G5B5, X1R5G5B5),
+   _T(Z16_UNORM,      D16,      SWIZ(X, Y, Z, W), A4R4G4B4),
+   _T(B5G6R5_UNORM,   R5G6B5,   SWIZ(X, Y, Z, W), R5G6B5),
+   _T(B5G5R5A1_UNORM, A1R5G5B5, SWIZ(X, Y, Z, W), A1R5G5B5),
+   _T(B5G5R5X1_UNORM, X1R5G5B5, SWIZ(X, Y, Z, W), X1R5G5B5),
 
-   V_(R8G8_UNORM,   UNSIGNED_BYTE,  NONE),
+   VT(R8G8_UNORM,   UNSIGNED_BYTE,  EXT_G8R8 | EXT_FORMAT, SWIZ(X, Y, 0, 1), NONE),
    V_(R8G8_SNORM,   BYTE,           NONE),
    V_(R8G8_UINT,    UNSIGNED_BYTE,  NONE),
    V_(R8G8_SINT,    BYTE,           NONE),
@@ -145,27 +155,24 @@
 
    V_(A8B8G8R8_UNORM,   UNSIGNED_BYTE, NONE),
 
-   V_(R8G8B8A8_UNORM,   UNSIGNED_BYTE, A8B8G8R8),
+   VT(R8G8B8A8_UNORM,   UNSIGNED_BYTE, A8B8G8R8, SWIZ(X, Y, Z, W), A8B8G8R8),
    V_(R8G8B8A8_SNORM,   BYTE,          A8B8G8R8),
-   _T(R8G8B8X8_UNORM,   X8B8G8R8,      X8B8G8R8),
+   _T(R8G8B8X8_UNORM,   X8B8G8R8,      SWIZ(X, Y, Z, W), X8B8G8R8),
    V_(R8G8B8A8_UINT,    UNSIGNED_BYTE, A8B8G8R8),
    V_(R8G8B8A8_SINT,    BYTE,          A8B8G8R8),
    V_(R8G8B8A8_USCALED, UNSIGNED_BYTE, A8B8G8R8),
    V_(R8G8B8A8_SSCALED, BYTE,          A8B8G8R8),
 
-   _T(R8G8B8A8_UNORM, A8B8G8R8, A8B8G8R8),
-   _T(R8G8B8X8_UNORM, X8B8G8R8, X8B8G8R8),
-
-   _T(B8G8R8A8_UNORM, A8R8G8B8, A8R8G8B8),
-   _T(B8G8R8X8_UNORM, X8R8G8B8, X8R8G8B8),
+   _T(B8G8R8A8_UNORM, A8R8G8B8, SWIZ(X, Y, Z, W), A8R8G8B8),
+   _T(B8G8R8X8_UNORM, X8R8G8B8, SWIZ(X, Y, Z, W), X8R8G8B8),
 
    V_(R10G10B10A2_UNORM,   UNSIGNED_INT_10_10_10_2, NONE),
    V_(R10G10B10A2_SNORM,   INT_10_10_10_2,          NONE),
    V_(R10G10B10A2_USCALED, UNSIGNED_INT_10_10_10_2, NONE),
    V_(R10G10B10A2_SSCALED, INT_10_10_10_2,          NONE),
 
-   _T(X8Z24_UNORM,       D24S8, A8R8G8B8),
-   _T(S8_UINT_Z24_UNORM, D24S8, A8R8G8B8),
+   _T(X8Z24_UNORM,       D24S8, SWIZ(X, Y, Z, W), A8R8G8B8),
+   _T(S8_UINT_Z24_UNORM, D24S8, SWIZ(X, Y, Z, W), A8R8G8B8),
 
    /* 48-bit */
    V_(R16G16B16_UNORM,   UNSIGNED_SHORT, NONE),
@@ -215,30 +222,75 @@
    V_(R32G32B32A32_FIXED,   FIXED,        NONE),
 
    /* compressed */
-   _T(ETC1_RGB8, ETC1, NONE),
+   _T(ETC1_RGB8, ETC1, SWIZ(X, Y, Z, W), NONE),
 
-   _T(DXT1_RGB,  DXT1,      NONE),
-   _T(DXT1_RGBA, DXT1,      NONE),
-   _T(DXT3_RGBA, DXT2_DXT3, NONE),
-   _T(DXT3_RGBA, DXT2_DXT3, NONE),
-   _T(DXT5_RGBA, DXT4_DXT5, NONE),
+   _T(DXT1_RGB,  DXT1,      SWIZ(X, Y, Z, W), NONE),
+   _T(DXT1_RGBA, DXT1,      SWIZ(X, Y, Z, W), NONE),
+   _T(DXT3_RGBA, DXT2_DXT3, SWIZ(X, Y, Z, W), NONE),
+   _T(DXT5_RGBA, DXT4_DXT5, SWIZ(X, Y, Z, W), NONE),
+
+   _T(ETC2_RGB8,       EXT_NONE | EXT_FORMAT,                          SWIZ(X, Y, Z, W), NONE), /* Extd. format NONE doubles as ETC2_RGB8 */
+   _T(ETC2_SRGB8,      EXT_NONE | EXT_FORMAT,                          SWIZ(X, Y, Z, W), NONE),
+   _T(ETC2_RGB8A1,     EXT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2 | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE),
+   _T(ETC2_SRGB8A1,    EXT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2 | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE),
+   _T(ETC2_RGBA8,      EXT_RGBA8_ETC2_EAC | EXT_FORMAT,                SWIZ(X, Y, Z, W), NONE),
+   _T(ETC2_SRGBA8,     EXT_RGBA8_ETC2_EAC | EXT_FORMAT,                SWIZ(X, Y, Z, W), NONE),
+   _T(ETC2_R11_UNORM,  EXT_R11_EAC | EXT_FORMAT,                       SWIZ(X, Y, Z, W), NONE),
+   _T(ETC2_R11_SNORM,  EXT_SIGNED_R11_EAC | EXT_FORMAT,                SWIZ(X, Y, Z, W), NONE),
+   _T(ETC2_RG11_UNORM, EXT_RG11_EAC | EXT_FORMAT,                      SWIZ(X, Y, Z, W), NONE),
+   _T(ETC2_RG11_SNORM, EXT_SIGNED_RG11_EAC | EXT_FORMAT,               SWIZ(X, Y, Z, W), NONE),
 
    /* YUV */
-   _T(YUYV, YUY2, YUY2),
-   _T(UYVY, UYVY, NONE),
+   _T(YUYV, YUY2, SWIZ(X, Y, Z, W), YUY2),
+   _T(UYVY, UYVY, SWIZ(X, Y, Z, W), NONE),
 };
 
 uint32_t
 translate_texture_format(enum pipe_format fmt)
 {
-   /* XXX with TEXTURE_FORMAT_EXT and swizzle on newer chips we can
-    * support much more */
    if (!formats[fmt].present)
       return ETNA_NO_MATCH;
 
    return formats[fmt].tex;
 }
 
+bool
+texture_format_needs_swiz(enum pipe_format fmt)
+{
+   /* Identity swizzle: formats declared with it need no swizzling. */
+   static const unsigned char def[4] = SWIZ(X, Y, Z, W);
+
+   if (!formats[fmt].present)
+      return false;
+   /* memcmp() is 0 on a match, so "needs swizzle" means a non-zero result. */
+   return memcmp(def, formats[fmt].tex_swiz, sizeof(def)) != 0;
+}
+
+uint32_t
+get_texture_swiz(enum pipe_format fmt, unsigned swizzle_r,
+                 unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a)
+{
+   /* Sampler-view swizzle, composed below with the format's own swizzle. */
+   unsigned char view[4] = { swizzle_r, swizzle_g, swizzle_b, swizzle_a };
+   unsigned char composed[4];
+
+   assert(formats[fmt].present);
+   util_format_compose_swizzles(formats[fmt].tex_swiz, view, composed);
+
+   /* The register fields take PIPE_SWIZZLE_* values without translation. */
+   STATIC_ASSERT(PIPE_SWIZZLE_0 == TEXTURE_SWIZZLE_ZERO);
+   STATIC_ASSERT(PIPE_SWIZZLE_1 == TEXTURE_SWIZZLE_ONE);
+   STATIC_ASSERT(PIPE_SWIZZLE_X == TEXTURE_SWIZZLE_RED);
+   STATIC_ASSERT(PIPE_SWIZZLE_Y == TEXTURE_SWIZZLE_GREEN);
+   STATIC_ASSERT(PIPE_SWIZZLE_Z == TEXTURE_SWIZZLE_BLUE);
+   STATIC_ASSERT(PIPE_SWIZZLE_W == TEXTURE_SWIZZLE_ALPHA);
+
+   return VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_A(composed[3]) |
+          VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_B(composed[2]) |
+          VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_G(composed[1]) |
+          VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_R(composed[0]);
+}
+
 uint32_t
 translate_rs_format(enum pipe_format fmt)
 {
diff --git a/src/gallium/drivers/etnaviv/etnaviv_format.h b/src/gallium/drivers/etnaviv/etnaviv_format.h
index 549dfda..543e309 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_format.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_format.h
@@ -31,10 +31,18 @@
 #include <stdint.h>
 
 #define ETNA_NO_MATCH (~0)
+#define EXT_FORMAT (1U << 31) /* bit 31: OR'ed onto TEXTURE_FORMAT_EXT_* values */
 
 uint32_t
 translate_texture_format(enum pipe_format fmt);
 
+bool
+texture_format_needs_swiz(enum pipe_format fmt);
+
+uint32_t
+get_texture_swiz(enum pipe_format fmt, unsigned swizzle_r,
+                 unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a);
+
 uint32_t
 translate_rs_format(enum pipe_format fmt);
 
diff --git a/src/gallium/drivers/etnaviv/etnaviv_internal.h b/src/gallium/drivers/etnaviv/etnaviv_internal.h
index 2f8dacb..8a31167 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_internal.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_internal.h
@@ -70,8 +70,8 @@
    unsigned has_sign_floor_ceil : 1;
    /* can use VS_RANGE, PS_RANGE registers*/
    unsigned has_shader_range_registers : 1;
-   /* has the new sin/cos functions */
-   unsigned has_new_sin_cos : 1;
+   /* has the new sin/cos/log functions */
+   unsigned has_new_transcendentals : 1;
    /* supports single-buffer rendering with multiple pixel pipes */
    unsigned single_buffer : 1;
    /* can use any kind of wrapping mode on npot textures */
@@ -126,6 +126,7 @@
 
 /* Compiled pipe_blend_color */
 struct compiled_blend_color {
+   float color[4];
    uint32_t PE_ALPHA_BLEND_COLOR;
 };
 
diff --git a/src/gallium/drivers/etnaviv/etnaviv_query.c b/src/gallium/drivers/etnaviv/etnaviv_query.c
index b33e580..617e475 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_query.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_query.c
@@ -84,6 +84,7 @@
    struct pipe_driver_query_info list[] = {
       {"prims-emitted", PIPE_QUERY_PRIMITIVES_EMITTED, { 0 }},
       {"draw-calls", ETNA_QUERY_DRAW_CALLS, { 0 }},
+      {"rs-operations", ETNA_QUERY_RS_OPERATIONS, { 0 }},
    };
 
    if (!info)
diff --git a/src/gallium/drivers/etnaviv/etnaviv_query.h b/src/gallium/drivers/etnaviv/etnaviv_query.h
index 9a8d579..cebd662 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_query.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_query.h
@@ -54,6 +54,7 @@
 }
 
 #define ETNA_QUERY_DRAW_CALLS    (PIPE_QUERY_DRIVER_SPECIFIC + 0)
+#define ETNA_QUERY_RS_OPERATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 1)
 
 void
 etna_query_screen_init(struct pipe_screen *pscreen);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_query_sw.c b/src/gallium/drivers/etnaviv/etnaviv_query_sw.c
index d6420d9..213c61f 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_query_sw.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_query_sw.c
@@ -50,6 +50,8 @@
       return ctx->stats.prims_emitted;
    case ETNA_QUERY_DRAW_CALLS:
       return ctx->stats.draw_calls;
+   case ETNA_QUERY_RS_OPERATIONS:
+      return ctx->stats.rs_operations;
    }
 
    return 0;
@@ -106,6 +108,7 @@
    switch (query_type) {
    case PIPE_QUERY_PRIMITIVES_EMITTED:
    case ETNA_QUERY_DRAW_CALLS:
+   case ETNA_QUERY_RS_OPERATIONS:
       break;
    default:
       return NULL;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_resource.c b/src/gallium/drivers/etnaviv/etnaviv_resource.c
index 2bb849a..d6cccd2 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_resource.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_resource.c
@@ -36,6 +36,47 @@
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 
+#include <drm_fourcc.h>
+
+#ifndef DRM_FORMAT_MOD_INVALID
+#define DRM_FORMAT_MOD_INVALID ((1ULL<<56) - 1)
+#endif
+
+/* Translate a DRM format modifier into an etnaviv surface layout.
+ * DRM_FORMAT_MOD_LINEAR and any modifier not listed here yield LINEAR. */
+static enum etna_surface_layout modifier_to_layout(uint64_t modifier)
+{
+   if (modifier == DRM_FORMAT_MOD_VIVANTE_TILED)
+      return ETNA_LAYOUT_TILED;
+   if (modifier == DRM_FORMAT_MOD_VIVANTE_SUPER_TILED)
+      return ETNA_LAYOUT_SUPER_TILED;
+   if (modifier == DRM_FORMAT_MOD_VIVANTE_SPLIT_TILED)
+      return ETNA_LAYOUT_MULTI_TILED;
+   if (modifier == DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED)
+      return ETNA_LAYOUT_MULTI_SUPERTILED;
+
+   /* Linear, or a modifier we do not know about. */
+   return ETNA_LAYOUT_LINEAR;
+}
+
+/* Translate an etnaviv surface layout into its DRM format modifier.
+ * A layout with no matching modifier yields DRM_FORMAT_MOD_INVALID. */
+static uint64_t layout_to_modifier(enum etna_surface_layout layout)
+{
+   if (layout == ETNA_LAYOUT_TILED)
+      return DRM_FORMAT_MOD_VIVANTE_TILED;
+   if (layout == ETNA_LAYOUT_SUPER_TILED)
+      return DRM_FORMAT_MOD_VIVANTE_SUPER_TILED;
+   if (layout == ETNA_LAYOUT_MULTI_TILED)
+      return DRM_FORMAT_MOD_VIVANTE_SPLIT_TILED;
+   if (layout == ETNA_LAYOUT_MULTI_SUPERTILED)
+      return DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED;
+   if (layout == ETNA_LAYOUT_LINEAR)
+      return DRM_FORMAT_MOD_LINEAR;
+
+   return DRM_FORMAT_MOD_INVALID;
+}
+
 /* A tile is 4x4 pixels, having 'screen->specs.bits_per_tile' of tile status.
  * So, in a buffer of N pixels, there are N / (4 * 4) tiles.
  * We need N * screen->specs.bits_per_tile / (4 * 4) bits of tile status, or
@@ -138,9 +179,10 @@
 /* Create a new resource object, using the given template info */
 struct pipe_resource *
 etna_resource_alloc(struct pipe_screen *pscreen, unsigned layout,
-                    const struct pipe_resource *templat)
+                    uint64_t modifier, const struct pipe_resource *templat)
 {
    struct etna_screen *screen = etna_screen(pscreen);
+   struct etna_resource *rsc;
    unsigned size;
 
    DBG_F(ETNA_DBG_RESOURCE_MSGS,
@@ -180,14 +222,41 @@
                         &paddingY, &halign);
    assert(paddingX && paddingY);
 
-   if (templat->target != PIPE_BUFFER) {
-      unsigned min_paddingY = 4 * screen->specs.pixel_pipes;
-      if (paddingY < min_paddingY)
-         paddingY = min_paddingY;
+   if (templat->target != PIPE_BUFFER)
+      etna_adjust_rs_align(screen->specs.pixel_pipes, NULL, &paddingY);
+
+   if (templat->bind & PIPE_BIND_SCANOUT) {
+      struct pipe_resource scanout_templat = *templat;
+      struct renderonly_scanout *scanout;
+      struct winsys_handle handle;
+
+      /* pad scanout buffer size to be compatible with the RS */
+      if (modifier == DRM_FORMAT_MOD_LINEAR)
+         etna_adjust_rs_align(screen->specs.pixel_pipes, &paddingX, &paddingY);
+
+      scanout_templat.width0 = align(scanout_templat.width0, paddingX);
+      scanout_templat.height0 = align(scanout_templat.height0, paddingY);
+
+      scanout = renderonly_scanout_for_resource(&scanout_templat,
+                                                screen->ro, &handle);
+      if (!scanout)
+         return NULL;
+
+      assert(handle.type == DRM_API_HANDLE_TYPE_FD);
+      handle.modifier = modifier;
+      rsc = etna_resource(pscreen->resource_from_handle(pscreen, templat,
+                                                        &handle,
+                                                        PIPE_HANDLE_USAGE_WRITE));
+      close(handle.handle);
+      if (!rsc)
+         return NULL;
+
+      rsc->scanout = scanout;
+
+      return &rsc->base;
    }
 
-   struct etna_resource *rsc = CALLOC_STRUCT(etna_resource);
-
+   rsc = CALLOC_STRUCT(etna_resource);
    if (!rsc)
       return NULL;
 
@@ -208,21 +277,22 @@
    struct etna_bo *bo = etna_bo_new(screen->dev, size, flags);
    if (unlikely(bo == NULL)) {
       BUG("Problem allocating video memory for resource");
-      return NULL;
+      goto free_rsc;
    }
 
    rsc->bo = bo;
    rsc->ts_bo = 0; /* TS is only created when first bound to surface */
 
-   if (templat->bind & PIPE_BIND_SCANOUT)
-      rsc->scanout = renderonly_scanout_for_resource(&rsc->base, screen->ro);
-
    if (DBG_ENABLED(ETNA_DBG_ZERO)) {
       void *map = etna_bo_map(bo);
       memset(map, 0, size);
    }
 
    return &rsc->base;
+
+free_rsc:
+   FREE(rsc);
+   return NULL;
 }
 
 static struct pipe_resource *
@@ -231,12 +301,9 @@
 {
    struct etna_screen *screen = etna_screen(pscreen);
 
-   /* Figure out what tiling to use -- for now, assume that textures cannot be
-    * supertiled, and cannot be linear.
-    * There is a feature flag SUPERTILED_TEXTURE (not supported on any known hw)
-    * that may allow this, as well
-    * as LINEAR_TEXTURE_SUPPORT (supported on gc880 and gc2000 at least), but
-    * not sure how it works.
+   /* Figure out what tiling to use -- for now, assume that textures cannot be
+    * linear. There is a capability LINEAR_TEXTURE_SUPPORT (supported on gc880
+    * and gc2000 at least), but it is not clear how it works.
     * Buffers always have LINEAR layout.
     */
    unsigned layout = ETNA_LAYOUT_LINEAR;
@@ -250,7 +317,7 @@
          layout = ETNA_LAYOUT_LINEAR;
    } else if (templat->target != PIPE_BUFFER) {
       bool want_multitiled = false;
-      bool want_supertiled = screen->specs.can_supertile && !DBG_ENABLED(ETNA_DBG_NO_SUPERTILE);
+      bool want_supertiled = screen->specs.can_supertile;
 
       /* When this GPU supports single-buffer rendering, don't ever enable
        * multi-tiling. This replicates the blob behavior on GC3000.
@@ -281,7 +348,89 @@
    if (templat->target == PIPE_TEXTURE_3D)
       layout = ETNA_LAYOUT_LINEAR;
 
-   return etna_resource_alloc(pscreen, layout, templat);
+   /* modifier is only used for scanout surfaces, so safe to use LINEAR here */
+   return etna_resource_alloc(pscreen, layout, DRM_FORMAT_MOD_LINEAR, templat);
+}
+
+enum modifier_priority { /* ascending: a larger value is preferred */
+   MODIFIER_PRIORITY_INVALID = 0,
+   MODIFIER_PRIORITY_LINEAR,
+   MODIFIER_PRIORITY_SPLIT_TILED,
+   MODIFIER_PRIORITY_SPLIT_SUPER_TILED,
+   MODIFIER_PRIORITY_TILED,
+   MODIFIER_PRIORITY_SUPER_TILED,
+};
+/* Maps each modifier_priority back to its DRM format modifier (file-local). */
+static const uint64_t priority_to_modifier[] = {
+   [MODIFIER_PRIORITY_INVALID] = DRM_FORMAT_MOD_INVALID,
+   [MODIFIER_PRIORITY_LINEAR] = DRM_FORMAT_MOD_LINEAR,
+   [MODIFIER_PRIORITY_SPLIT_TILED] = DRM_FORMAT_MOD_VIVANTE_SPLIT_TILED,
+   [MODIFIER_PRIORITY_SPLIT_SUPER_TILED] = DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED,
+   [MODIFIER_PRIORITY_TILED] = DRM_FORMAT_MOD_VIVANTE_TILED,
+   [MODIFIER_PRIORITY_SUPER_TILED] = DRM_FORMAT_MOD_VIVANTE_SUPER_TILED,
+};
+
+/* Pick the highest-priority modifier in modifiers[0..count) that this GPU
+ * can use; returns DRM_FORMAT_MOD_INVALID when none of them is usable. */
+static uint64_t
+select_best_modifier(const struct etna_screen * screen,
+                     const uint64_t *modifiers, const unsigned count)
+{
+   /* Non-split layouts need one pixel pipe or single-buffer rendering. */
+   const bool can_single = screen->specs.pixel_pipes <= 1 ||
+                           screen->specs.single_buffer;
+   const bool can_split = screen->specs.pixel_pipes >= 2;
+   enum modifier_priority prio = MODIFIER_PRIORITY_INVALID;
+
+   for (unsigned i = 0; i < count; i++) {
+      switch (modifiers[i]) {
+      case DRM_FORMAT_MOD_VIVANTE_SUPER_TILED:
+         if (can_single && screen->specs.can_supertile)
+            prio = MAX2(prio, MODIFIER_PRIORITY_SUPER_TILED);
+         break;
+      case DRM_FORMAT_MOD_VIVANTE_TILED:
+         if (can_single)
+            prio = MAX2(prio, MODIFIER_PRIORITY_TILED);
+         break;
+      case DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED:
+         if (can_split && screen->specs.can_supertile)
+            prio = MAX2(prio, MODIFIER_PRIORITY_SPLIT_SUPER_TILED);
+         break;
+      case DRM_FORMAT_MOD_VIVANTE_SPLIT_TILED:
+         if (can_split)
+            prio = MAX2(prio, MODIFIER_PRIORITY_SPLIT_TILED);
+         break;
+      case DRM_FORMAT_MOD_LINEAR:
+         prio = MAX2(prio, MODIFIER_PRIORITY_LINEAR);
+         break;
+      default: /* DRM_FORMAT_MOD_INVALID and modifiers we do not know */
+         break;
+      }
+   }
+
+   return priority_to_modifier[prio];
+}
+
+/* Allocate a resource using the best modifier the GPU supports, or NULL. */
+static struct pipe_resource *
+etna_resource_create_modifiers(struct pipe_screen *pscreen,
+                               const struct pipe_resource *templat,
+                               const uint64_t *modifiers, int count)
+{
+   struct pipe_resource scanout_tmpl = *templat;
+   const uint64_t best = select_best_modifier(etna_screen(pscreen),
+                                              modifiers, count);
+
+   if (best == DRM_FORMAT_MOD_INVALID)
+      return NULL;
+
+   /* Every buffer allocated through this entry point is currently assumed
+    * to be a scanout buffer, so force the bind flag on the private copy.
+    */
+   scanout_tmpl.bind |= PIPE_BIND_SCANOUT;
+
+   return etna_resource_alloc(pscreen, modifier_to_layout(best), best,
+                              &scanout_tmpl);
+}
 
 static void
@@ -289,11 +438,10 @@
 {
    struct etna_resource *res = etna_resource(prsc);
 
-   /* Make sure texture is older than the imported renderable buffer,
-    * so etna_update_sampler_source will copy the pixel data again.
-    */
-   if (res->texture)
-      etna_resource(res->texture)->seqno = res->seqno - 1;
+   if (res->external)
+      etna_resource(res->external)->seqno++;
+   else
+      res->seqno++;
 }
 
 static void
@@ -313,6 +461,7 @@
    list_delinit(&rsc->list);
 
    pipe_resource_reference(&rsc->texture, NULL);
+   pipe_resource_reference(&rsc->external, NULL);
 
    FREE(rsc);
 }
@@ -352,18 +501,26 @@
       goto fail;
 
    rsc->seqno = 1;
+   rsc->layout = modifier_to_layout(handle->modifier);
+   rsc->halign = TEXTURE_HALIGN_FOUR;
+
 
    level->width = tmpl->width0;
    level->height = tmpl->height0;
 
-   /* We will be using the RS to copy with this resource, so we must
-    * ensure that it is appropriately aligned for the RS requirements. */
-   unsigned paddingX = ETNA_RS_WIDTH_MASK + 1;
-   unsigned paddingY = (ETNA_RS_HEIGHT_MASK + 1) * screen->specs.pixel_pipes;
+   /* Determine padding of the imported resource. */
+   unsigned paddingX = 0, paddingY = 0;
+   etna_layout_multiple(rsc->layout, screen->specs.pixel_pipes,
+                        VIV_FEATURE(screen, chipMinorFeatures1, TEXTURE_HALIGN),
+                        &paddingX, &paddingY, &rsc->halign);
 
+   etna_adjust_rs_align(screen->specs.pixel_pipes, NULL, &paddingY);
    level->padded_width = align(level->width, paddingX);
    level->padded_height = align(level->height, paddingY);
 
+   level->layer_stride = level->stride * util_format_get_nblocksy(prsc->format,
+                                                                  level->padded_height);
+
    /* The DDX must give us a BO which conforms to our padding size.
     * The stride of the BO must be greater or equal to our padded
     * stride. The size of the BO must accomodate the padded height. */
@@ -376,20 +533,25 @@
       goto fail;
    }
 
-   if (handle->type == DRM_API_HANDLE_TYPE_SHARED && tmpl->bind & PIPE_BIND_RENDER_TARGET) {
-      /* Render targets are linear in Xorg but must be tiled
-      * here. It would be nice if dri_drawable_get_format()
-      * set scanout for these buffers too. */
-      struct etna_resource *tiled;
+   if (rsc->layout == ETNA_LAYOUT_LINEAR) {
+      /*
+       * Both sampler and pixel pipes can't handle linear, create a compatible
+       * base resource, where we can attach the imported buffer as an external
+       * resource.
+       */
+      struct pipe_resource tiled_templat = *tmpl;
 
-      ptiled = etna_resource_create(pscreen, tmpl);
+      /*
+       * Remove BIND_SCANOUT to avoid recursion, as etna_resource_create uses
+       * this function to import the scanout buffer and get a tiled resource.
+       */
+      tiled_templat.bind &= ~PIPE_BIND_SCANOUT;
+
+      ptiled = etna_resource_create(pscreen, &tiled_templat);
       if (!ptiled)
          goto fail;
 
-      tiled = etna_resource(ptiled);
-      tiled->scanout = renderonly_scanout_for_prime(prsc, screen->ro);
-      if (!tiled->scanout)
-         goto fail;
+      etna_resource(ptiled)->external = prsc;
 
       return ptiled;
    }
@@ -411,12 +573,34 @@
                          struct winsys_handle *handle, unsigned usage)
 {
    struct etna_resource *rsc = etna_resource(prsc);
+   /* Scanout is always attached to the base resource */
+   struct renderonly_scanout *scanout = rsc->scanout;
 
-   if (renderonly_get_handle(rsc->scanout, handle))
+   /*
+    * External resources are preferred, so that an import->export chain of
+    * render/sampler incompatible buffers yields the same handle.
+    */
+   if (rsc->external)
+      rsc = etna_resource(rsc->external);
+
+   handle->stride = rsc->levels[0].stride;
+   handle->modifier = layout_to_modifier(rsc->layout);
+
+   if (handle->type == DRM_API_HANDLE_TYPE_SHARED) {
+      return etna_bo_get_name(rsc->bo, &handle->handle) == 0;
+   } else if (handle->type == DRM_API_HANDLE_TYPE_KMS) {
+      if (renderonly_get_handle(scanout, handle)) {
+         return TRUE;
+      } else {
+         handle->handle = etna_bo_handle(rsc->bo);
+         return TRUE;
+      }
+   } else if (handle->type == DRM_API_HANDLE_TYPE_FD) {
+      handle->handle = etna_bo_dmabuf(rsc->bo);
       return TRUE;
-
-   return etna_screen_bo_get_handle(pscreen, rsc->bo, rsc->levels[0].stride,
-                                    handle);
+   } else {
+      return FALSE;
+   }
 }
 
 void
@@ -440,26 +624,11 @@
 }
 
 void
-etna_resource_wait(struct pipe_context *pctx, struct etna_resource *rsc)
-{
-   if (rsc->status & ETNA_PENDING_WRITE) {
-      struct pipe_fence_handle *fence;
-      struct pipe_screen *pscreen = pctx->screen;
-
-      pctx->flush(pctx, &fence, 0);
-
-      if (!pscreen->fence_finish(pscreen, pctx, fence, 5000000000ULL))
-         BUG("fence timed out (hung GPU?)");
-
-      pscreen->fence_reference(pscreen, &fence, NULL);
-   }
-}
-
-void
 etna_resource_screen_init(struct pipe_screen *pscreen)
 {
    pscreen->can_create_resource = etna_screen_can_create_resource;
    pscreen->resource_create = etna_resource_create;
+   pscreen->resource_create_with_modifiers = etna_resource_create_modifiers;
    pscreen->resource_from_handle = etna_resource_from_handle;
    pscreen->resource_get_handle = etna_resource_get_handle;
    pscreen->resource_changed = etna_resource_changed;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_resource.h b/src/gallium/drivers/etnaviv/etnaviv_resource.h
index a8d42ee..0b135e2 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_resource.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_resource.h
@@ -75,6 +75,11 @@
 
    /* When we are rendering to a texture, we need a differently tiled resource */
    struct pipe_resource *texture;
+   /*
+    * If an imported resource has a render/sampler incompatible tiling, we keep
+    * it as an external resource, which is blitted as needed.
+    */
+   struct pipe_resource *external;
 
    enum etna_resource_status status;
 
@@ -102,7 +107,7 @@
 static inline bool
 etna_resource_needs_flush(struct etna_resource *res)
 {
-   return (int)(res->seqno - res->flush_seqno) > 0;
+   return res->ts_bo && ((int)(res->seqno - res->flush_seqno) > 0);
 }
 
 /* is the resource only used on the sampler? */
@@ -124,9 +129,6 @@
 etna_resource_used(struct etna_context *ctx, struct pipe_resource *prsc,
                    enum etna_resource_status status);
 
-void
-etna_resource_wait(struct pipe_context *ctx, struct etna_resource *rsc);
-
 static inline void
 resource_read(struct etna_context *ctx, struct pipe_resource *prsc)
 {
@@ -149,7 +151,7 @@
 
 struct pipe_resource *
 etna_resource_alloc(struct pipe_screen *pscreen, unsigned layout,
-                    const struct pipe_resource *templat);
+                    uint64_t modifier, const struct pipe_resource *templat);
 
 void
 etna_resource_screen_init(struct pipe_screen *pscreen);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c
index 5f1d280..129b0cd 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_screen.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c
@@ -45,6 +45,8 @@
 
 #include "state_tracker/drm_driver.h"
 
+#include <drm_fourcc.h>
+
 #define ETNA_DRM_VERSION(major, minor) ((major) << 16 | (minor))
 #define ETNA_DRM_VERSION_FENCE_FD      ETNA_DRM_VERSION(1, 1)
 
@@ -155,6 +157,7 @@
       return true; /* VIV_FEATURE(priv->dev, chipMinorFeatures1,
                       NON_POWER_OF_TWO); */
 
+   case PIPE_CAP_TEXTURE_SWIZZLE:
    case PIPE_CAP_PRIMITIVE_RESTART:
       return VIV_FEATURE(screen, chipMinorFeatures1, HALTI0);
 
@@ -164,7 +167,6 @@
 
    /* Unsupported features. */
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
-   case PIPE_CAP_TEXTURE_SWIZZLE: /* XXX supported on gc2000 */
    case PIPE_CAP_COMPUTE: /* XXX supported on gc2000 */
    case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: /* only one colorbuffer supported, so mixing makes no sense */
    case PIPE_CAP_CONDITIONAL_RENDER: /* no occlusion queries */
@@ -254,6 +256,10 @@
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
 
    /* Stream output. */
@@ -440,6 +446,7 @@
    case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
    case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
    case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+   case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
       return 0;
    }
 
@@ -454,13 +461,35 @@
 }
 
 static bool
-gpu_supports_texure_format(struct etna_screen *screen, uint32_t fmt)
+gpu_supports_texure_format(struct etna_screen *screen, uint32_t fmt,
+                           enum pipe_format format)
 {
+   bool supported = true;
+
    if (fmt == TEXTURE_FORMAT_ETC1)
-      return VIV_FEATURE(screen, chipFeatures, ETC1_TEXTURE_COMPRESSION);
+      supported = VIV_FEATURE(screen, chipFeatures, ETC1_TEXTURE_COMPRESSION);
 
    if (fmt >= TEXTURE_FORMAT_DXT1 && fmt <= TEXTURE_FORMAT_DXT4_DXT5)
-      return VIV_FEATURE(screen, chipFeatures, DXT_TEXTURE_COMPRESSION);
+      supported = VIV_FEATURE(screen, chipFeatures, DXT_TEXTURE_COMPRESSION);
+
+   if (fmt & EXT_FORMAT) {
+      supported = VIV_FEATURE(screen, chipMinorFeatures1, HALTI0);
+
+      /* ETC1 is checked above, as it has its own feature bit. ETC2 is
+       * supported with HALTI0, however that implementation is buggy in hardware.
+       * The blob driver does per-block patching to work around this. As this
+       * is currently not implemented by etnaviv, enable it for HALTI1 (GC3000)
+       * only.
+       */
+      if (util_format_is_etc(format))
+         supported = VIV_FEATURE(screen, chipMinorFeatures2, HALTI1);
+   }
+
+   if (!supported)
+      return false;
+
+   if (texture_format_needs_swiz(format))
+      return VIV_FEATURE(screen, chipMinorFeatures1, HALTI0);
 
    return true;
 }
@@ -506,7 +535,7 @@
    if (usage & PIPE_BIND_SAMPLER_VIEW) {
       uint32_t fmt = translate_texture_format(format);
 
-      if (!gpu_supports_texure_format(screen, fmt))
+      if (!gpu_supports_texure_format(screen, fmt, format))
          fmt = ETNA_NO_MATCH;
 
       if (sample_count < 2 && fmt != ETNA_NO_MATCH)
@@ -540,6 +569,47 @@
    return usage == allowed;
 }
 
+static const uint64_t supported_modifiers[] = {
+   DRM_FORMAT_MOD_LINEAR,
+   DRM_FORMAT_MOD_VIVANTE_TILED,
+   DRM_FORMAT_MOD_VIVANTE_SUPER_TILED,
+   DRM_FORMAT_MOD_VIVANTE_SPLIT_TILED, /* split layouts stay at index >= 3 */
+   DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED,
+};
+
+static void
+etna_screen_query_dmabuf_modifiers(struct pipe_screen *pscreen,
+                                   enum pipe_format format, int max,
+                                   uint64_t *modifiers,
+                                   unsigned int *external_only, int *count)
+{
+   struct etna_screen *screen = etna_screen(pscreen);
+   int i, num_modifiers = 0;
+
+   if (max > ARRAY_SIZE(supported_modifiers))
+      max = ARRAY_SIZE(supported_modifiers);
+
+   if (!max) {
+      modifiers = NULL;
+      max = ARRAY_SIZE(supported_modifiers);
+   }
+
+   for (i = 0; num_modifiers < max; i++) {
+      /* don't advertise split tiled formats on single pipe/buffer GPUs */
+      if ((screen->specs.pixel_pipes == 1 || screen->specs.single_buffer) &&
+          i >= 3)
+         break;
+
+      if (modifiers)
+         modifiers[num_modifiers] = supported_modifiers[i];
+      if (external_only)
+         external_only[num_modifiers] = 0;
+      num_modifiers++;
+   }
+
+   *count = num_modifiers;
+}
+
 static boolean
 etna_get_specs(struct etna_screen *screen)
 {
@@ -621,7 +691,7 @@
       screen->model >= 0x1000 || screen->model == 0x880;
    screen->specs.npot_tex_any_wrap =
       VIV_FEATURE(screen, chipMinorFeatures1, NON_POWER_OF_TWO);
-   screen->specs.has_new_sin_cos =
+   screen->specs.has_new_transcendentals =
       VIV_FEATURE(screen, chipMinorFeatures3, HAS_FAST_TRANSCENDENTALS);
 
    if (VIV_FEATURE(screen, chipMinorFeatures3, INSTRUCTION_CACHE)) {
@@ -691,25 +761,6 @@
    return false;
 }
 
-boolean
-etna_screen_bo_get_handle(struct pipe_screen *pscreen, struct etna_bo *bo,
-                          unsigned stride, struct winsys_handle *whandle)
-{
-   whandle->stride = stride;
-
-   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
-      return etna_bo_get_name(bo, &whandle->handle) == 0;
-   } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
-      whandle->handle = etna_bo_handle(bo);
-      return TRUE;
-   } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
-      whandle->handle = etna_bo_dmabuf(bo);
-      return TRUE;
-   } else {
-      return FALSE;
-   }
-}
-
 struct etna_bo *
 etna_screen_bo_from_handle(struct pipe_screen *pscreen,
                            struct winsys_handle *whandle, unsigned *out_stride)
@@ -832,6 +883,16 @@
    if (!etna_get_specs(screen))
       goto fail;
 
+   /* apply debug options that disable individual features */
+   if (DBG_ENABLED(ETNA_DBG_NO_EARLY_Z))
+      screen->features[viv_chipFeatures] |= chipFeatures_NO_EARLY_Z;
+   if (DBG_ENABLED(ETNA_DBG_NO_TS))
+      screen->features[viv_chipFeatures] &= ~chipFeatures_FAST_CLEAR;
+   if (DBG_ENABLED(ETNA_DBG_NO_AUTODISABLE))
+      screen->features[viv_chipMinorFeatures1] &= ~chipMinorFeatures1_AUTO_DISABLE;
+   if (DBG_ENABLED(ETNA_DBG_NO_SUPERTILE))
+      screen->specs.can_supertile = 0;
+
+
    pscreen->destroy = etna_screen_destroy;
    pscreen->get_param = etna_screen_get_param;
    pscreen->get_paramf = etna_screen_get_paramf;
@@ -844,6 +905,7 @@
    pscreen->get_timestamp = etna_screen_get_timestamp;
    pscreen->context_create = etna_context_create;
    pscreen->is_format_supported = etna_screen_is_format_supported;
+   pscreen->query_dmabuf_modifiers = etna_screen_query_dmabuf_modifiers;
 
    etna_fence_screen_init(pscreen);
    etna_query_screen_init(pscreen);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.h b/src/gallium/drivers/etnaviv/etnaviv_screen.h
index bec740b..dc57a38 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_screen.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_screen.h
@@ -84,10 +84,6 @@
    return (struct etna_screen *)pscreen;
 }
 
-boolean
-etna_screen_bo_get_handle(struct pipe_screen *pscreen, struct etna_bo *bo,
-                          unsigned stride, struct winsys_handle *whandle);
-
 struct etna_bo *
 etna_screen_bo_from_handle(struct pipe_screen *pscreen,
                            struct winsys_handle *whandle, unsigned *out_stride);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_state.c b/src/gallium/drivers/etnaviv/etnaviv_state.c
index dbb41e5..fc3d9f1 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_state.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_state.c
@@ -43,20 +43,6 @@
 #include "util/u_memory.h"
 
 static void
-etna_set_blend_color(struct pipe_context *pctx, const struct pipe_blend_color *bc)
-{
-   struct etna_context *ctx = etna_context(pctx);
-   struct compiled_blend_color *cs = &ctx->blend_color;
-
-   cs->PE_ALPHA_BLEND_COLOR =
-      VIVS_PE_ALPHA_BLEND_COLOR_R(etna_cfloat_to_uint8(bc->color[0])) |
-      VIVS_PE_ALPHA_BLEND_COLOR_G(etna_cfloat_to_uint8(bc->color[1])) |
-      VIVS_PE_ALPHA_BLEND_COLOR_B(etna_cfloat_to_uint8(bc->color[2])) |
-      VIVS_PE_ALPHA_BLEND_COLOR_A(etna_cfloat_to_uint8(bc->color[3]));
-   ctx->dirty |= ETNA_DIRTY_BLEND_COLOR;
-}
-
-static void
 etna_set_stencil_ref(struct pipe_context *pctx, const struct pipe_stencil_ref *sr)
 {
    struct etna_context *ctx = etna_context(pctx);
@@ -429,11 +415,11 @@
       struct compiled_set_vertex_buffer *cs = &so->cvb[idx];
       struct pipe_vertex_buffer *vbi = &so->vb[idx];
 
-      assert(!vbi->user_buffer); /* XXX support user_buffer using
-                                    etna_usermem_map */
+      assert(!vbi->is_user_buffer); /* XXX support user_buffer using
+                                       etna_usermem_map */
 
-      if (vbi->buffer) { /* GPU buffer */
-         cs->FE_VERTEX_STREAM_BASE_ADDR.bo = etna_resource(vbi->buffer)->bo;
+      if (vbi->buffer.resource) { /* GPU buffer */
+         cs->FE_VERTEX_STREAM_BASE_ADDR.bo = etna_resource(vbi->buffer.resource)->bo;
          cs->FE_VERTEX_STREAM_BASE_ADDR.offset = vbi->buffer_offset;
          cs->FE_VERTEX_STREAM_BASE_ADDR.flags = ETNA_RELOC_READ;
          cs->FE_VERTEX_STREAM_CONTROL =
@@ -448,34 +434,6 @@
 }
 
 static void
-etna_set_index_buffer(struct pipe_context *pctx, const struct pipe_index_buffer *ib)
-{
-   struct etna_context *ctx = etna_context(pctx);
-   uint32_t ctrl;
-
-   if (ib) {
-      pipe_resource_reference(&ctx->index_buffer.ib.buffer, ib->buffer);
-      memcpy(&ctx->index_buffer.ib, ib, sizeof(ctx->index_buffer.ib));
-      ctrl = translate_index_size(ctx->index_buffer.ib.index_size);
-   } else {
-      pipe_resource_reference(&ctx->index_buffer.ib.buffer, NULL);
-      ctrl = 0;
-   }
-
-   if (ctx->index_buffer.ib.buffer && ctrl != ETNA_NO_MATCH) {
-      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.bo = etna_resource(ctx->index_buffer.ib.buffer)->bo;
-      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.offset = ctx->index_buffer.ib.offset;
-      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.flags = ETNA_RELOC_READ;
-      ctx->index_buffer.FE_INDEX_STREAM_CONTROL = ctrl;
-   } else {
-      ctx->index_buffer.FE_INDEX_STREAM_BASE_ADDR.bo = NULL;
-      ctx->index_buffer.FE_INDEX_STREAM_CONTROL = 0;
-   }
-
-   ctx->dirty |= ETNA_DIRTY_INDEX_BUFFER;
-}
-
-static void
 etna_blend_state_bind(struct pipe_context *pctx, void *bs)
 {
    struct etna_context *ctx = etna_context(pctx);
@@ -628,6 +586,9 @@
    },
    {
       etna_update_blend, ETNA_DIRTY_BLEND | ETNA_DIRTY_FRAMEBUFFER
+   },
+   {
+      etna_update_blend_color, ETNA_DIRTY_BLEND_COLOR | ETNA_DIRTY_FRAMEBUFFER,
    }
 };
 
@@ -656,7 +617,6 @@
    pctx->set_viewport_states = etna_set_viewport_states;
 
    pctx->set_vertex_buffers = etna_set_vertex_buffers;
-   pctx->set_index_buffer = etna_set_index_buffer;
 
    pctx->bind_blend_state = etna_blend_state_bind;
    pctx->delete_blend_state = etna_blend_state_delete;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_surface.c b/src/gallium/drivers/etnaviv/etnaviv_surface.c
index 1db9b40..4b95f65 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_surface.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_surface.c
@@ -66,7 +66,7 @@
 
    if (VIV_FEATURE(ctx->screen, chipFeatures, FAST_CLEAR) &&
        VIV_FEATURE(ctx->screen, chipMinorFeatures0, MC20) &&
-       !DBG_ENABLED(ETNA_DBG_NO_TS) && !rsc->ts_bo &&
+       !rsc->ts_bo &&
        (rsc->levels[level].padded_width & ETNA_RS_WIDTH_MASK) == 0 &&
        (rsc->levels[level].padded_height & ETNA_RS_HEIGHT_MASK) == 0) {
       etna_screen_resource_alloc_ts(pctx->screen, rsc);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_texture.c b/src/gallium/drivers/etnaviv/etnaviv_texture.c
index 93b077b..b8ebab6 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_texture.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_texture.c
@@ -36,6 +36,8 @@
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 
+#include <drm_fourcc.h>
+
 static void *
 etna_create_sampler_state(struct pipe_context *pipe,
                           const struct pipe_sampler_state *ss)
@@ -113,13 +115,24 @@
 static void
 etna_update_sampler_source(struct pipe_sampler_view *view)
 {
-   struct etna_resource *res = etna_resource(view->texture);
+   struct etna_resource *base = etna_resource(view->texture);
+   struct etna_resource *to = base, *from = base;
 
-   if (res->texture && etna_resource_older(etna_resource(res->texture), res)) {
-      /* Texture is older than render buffer, copy the texture using RS */
-      etna_copy_resource(view->context, res->texture, view->texture, 0,
+   if (base->external && etna_resource_newer(etna_resource(base->external), base))
+      from = etna_resource(base->external);
+
+   if (base->texture)
+      to = etna_resource(base->texture);
+
+   if ((to != from) && etna_resource_older(to, from)) {
+      etna_copy_resource(view->context, &to->base, &from->base, 0,
                          view->texture->last_level);
-      etna_resource(res->texture)->seqno = res->seqno;
+      to->seqno = from->seqno;
+   } else if ((to == from) && etna_resource_needs_flush(to)) {
+      /* Resolve TS if needed, remove when adding sampler TS */
+      etna_copy_resource(view->context, &to->base, &from->base, 0,
+                         view->texture->last_level);
+      to->flush_seqno = from->seqno;
    }
 }
 
@@ -129,12 +142,18 @@
    if (util_format_is_compressed(res->base.format))
       return true;
 
-   /* The sampler (as we currently know it) only accepts tiled layouts */
+   struct etna_screen *screen = etna_screen(res->base.screen);
+   /* This GPU supports texturing from supertiled textures? */
+   if (res->layout == ETNA_LAYOUT_SUPER_TILED && VIV_FEATURE(screen, chipMinorFeatures2, SUPERTILED_TEXTURE))
+      return true;
+
+   /* TODO: LINEAR_TEXTURE_SUPPORT */
+
+   /* Otherwise, only support tiled layouts */
    if (res->layout != ETNA_LAYOUT_TILED)
       return false;
 
    /* If we have HALIGN support, we can allow for the RS padding */
-   struct etna_screen *screen = etna_screen(res->base.screen);
    if (VIV_FEATURE(screen, chipMinorFeatures1, TEXTURE_HALIGN))
       return true;
 
@@ -152,6 +171,11 @@
    struct etna_sampler_view *sv = CALLOC_STRUCT(etna_sampler_view);
    struct etna_resource *res = etna_resource(prsc);
    struct etna_context *ctx = etna_context(pctx);
+   const uint32_t format = translate_texture_format(so->format);
+   const bool ext = !!(format & EXT_FORMAT);
+   const uint32_t swiz = get_texture_swiz(so->format, so->swizzle_r,
+                                          so->swizzle_g, so->swizzle_b,
+                                          so->swizzle_a);
 
    if (!sv)
       return NULL;
@@ -165,7 +189,8 @@
          templat.bind &= ~(PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET |
                            PIPE_BIND_BLENDABLE);
          res->texture =
-            etna_resource_alloc(pctx->screen, ETNA_LAYOUT_TILED, &templat);
+            etna_resource_alloc(pctx->screen, ETNA_LAYOUT_TILED,
+                                DRM_FORMAT_MOD_LINEAR, &templat);
       }
 
       if (!res->texture) {
@@ -176,14 +201,13 @@
    }
 
    sv->base = *so;
-   pipe_reference(NULL, &prsc->reference);
-   sv->base.texture = prsc;
-   sv->base.reference.count = 1;
+   pipe_reference_init(&sv->base.reference, 1);
+   sv->base.texture = NULL;
+   pipe_resource_reference(&sv->base.texture, prsc);
    sv->base.context = pctx;
 
    /* merged with sampler state */
-   sv->TE_SAMPLER_CONFIG0 =
-      VIVS_TE_SAMPLER_CONFIG0_FORMAT(translate_texture_format(sv->base.format));
+   sv->TE_SAMPLER_CONFIG0 = COND(!ext, VIVS_TE_SAMPLER_CONFIG0_FORMAT(format));
    sv->TE_SAMPLER_CONFIG0_MASK = 0xffffffff;
 
    switch (sv->base.target) {
@@ -206,11 +230,8 @@
       return NULL;
    }
 
-   sv->TE_SAMPLER_CONFIG1 = VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_R(so->swizzle_r) |
-                            VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_G(so->swizzle_g) |
-                            VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_B(so->swizzle_b) |
-                            VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_A(so->swizzle_a) |
-                            VIVS_TE_SAMPLER_CONFIG1_HALIGN(res->halign);
+   sv->TE_SAMPLER_CONFIG1 = COND(ext, VIVS_TE_SAMPLER_CONFIG1_FORMAT_EXT(format)) |
+                            VIVS_TE_SAMPLER_CONFIG1_HALIGN(res->halign) | swiz;
    sv->TE_SAMPLER_SIZE = VIVS_TE_SAMPLER_SIZE_WIDTH(res->base.width0) |
                          VIVS_TE_SAMPLER_SIZE_HEIGHT(res->base.height0);
    sv->TE_SAMPLER_LOG_SIZE =
diff --git a/src/gallium/drivers/etnaviv/etnaviv_transfer.c b/src/gallium/drivers/etnaviv/etnaviv_transfer.c
index 4809b04..6c1edd4 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_transfer.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_transfer.c
@@ -39,6 +39,8 @@
 #include "util/u_surface.h"
 #include "util/u_transfer.h"
 
+#include <drm_fourcc.h>
+
 /* Compute offset into a 1D/2D/3D buffer of a certain box.
  * This box must be aligned to the block width and height of the
  * underlying format. */
@@ -70,6 +72,10 @@
    if (rsc->texture && !etna_resource_newer(rsc, etna_resource(rsc->texture)))
       rsc = etna_resource(rsc->texture); /* switch to using the texture resource */
 
+   /*
+    * Temporary resources are always pulled into the CPU domain, so they must be
+    * pushed back into the GPU domain before the RS blits to the base resource.
+    */
    if (trans->rsc)
       etna_bo_cpu_fini(etna_resource(trans->rsc)->bo);
 
@@ -85,21 +91,19 @@
          struct etna_resource_level *res_level = &rsc->levels[ptrans->level];
          void *mapped = etna_bo_map(rsc->bo) + res_level->offset;
 
-         if (rsc->layout == ETNA_LAYOUT_LINEAR || rsc->layout == ETNA_LAYOUT_TILED) {
-            if (rsc->layout == ETNA_LAYOUT_TILED && !util_format_is_compressed(rsc->base.format)) {
-               etna_texture_tile(
-                  mapped + ptrans->box.z * res_level->layer_stride,
-                  trans->staging, ptrans->box.x, ptrans->box.y,
-                  res_level->stride, ptrans->box.width, ptrans->box.height,
-                  ptrans->stride, util_format_get_blocksize(rsc->base.format));
-            } else { /* non-tiled or compressed format */
-               util_copy_box(mapped, rsc->base.format, res_level->stride,
-                             res_level->layer_stride, ptrans->box.x,
-                             ptrans->box.y, ptrans->box.z, ptrans->box.width,
-                             ptrans->box.height, ptrans->box.depth,
-                             trans->staging, ptrans->stride,
-                             ptrans->layer_stride, 0, 0, 0 /* src x,y,z */);
-            }
+         if (rsc->layout == ETNA_LAYOUT_TILED) {
+            etna_texture_tile(
+               mapped + ptrans->box.z * res_level->layer_stride,
+               trans->staging, ptrans->box.x, ptrans->box.y,
+               res_level->stride, ptrans->box.width, ptrans->box.height,
+               ptrans->stride, util_format_get_blocksize(rsc->base.format));
+         } else if (rsc->layout == ETNA_LAYOUT_LINEAR) {
+            util_copy_box(mapped, rsc->base.format, res_level->stride,
+                          res_level->layer_stride, ptrans->box.x,
+                          ptrans->box.y, ptrans->box.z, ptrans->box.width,
+                          ptrans->box.height, ptrans->box.depth,
+                          trans->staging, ptrans->stride,
+                          ptrans->layer_stride, 0, 0, 0 /* src x,y,z */);
          } else {
             BUG("unsupported tiling %i", rsc->layout);
          }
@@ -114,7 +118,12 @@
       }
    }
 
-   if (!trans->rsc)
+   /*
+    * Transfers without a temporary are only pulled into the CPU domain if they
+    * are not mapped unsynchronized. If they were pulled in, they must be pushed
+    * back into the GPU domain after CPU access is finished.
+    */
+   if (!trans->rsc && !(ptrans->usage & PIPE_TRANSFER_UNSYNCHRONIZED))
       etna_bo_cpu_fini(rsc->bo);
 
    pipe_resource_reference(&trans->rsc, NULL);
@@ -150,6 +159,20 @@
 
    assert(level <= prsc->last_level);
 
+   /* Upgrade DISCARD_RANGE to WHOLE_RESOURCE if the whole resource is
+    * being mapped. If we add buffer reallocation to avoid CPU/GPU sync this
+    * check needs to be extended to coherent mappings and shared resources.
+    */
+   if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+       !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
+       prsc->last_level == 0 &&
+       prsc->width0 == box->width &&
+       prsc->height0 == box->height &&
+       prsc->depth0 == box->depth &&
+       prsc->array_size == 1) {
+      usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
+   }
+
    if (rsc->texture && !etna_resource_newer(rsc, etna_resource(rsc->texture))) {
       /* We have a texture resource which is the same age or newer than the
        * render resource. Use the texture resource, which avoids bouncing
@@ -182,13 +205,16 @@
       templ.nr_samples = 0;
       templ.bind = PIPE_BIND_RENDER_TARGET;
 
-      trans->rsc = etna_resource_alloc(pctx->screen, ETNA_LAYOUT_LINEAR, &templ);
+      trans->rsc = etna_resource_alloc(pctx->screen, ETNA_LAYOUT_LINEAR,
+                                       DRM_FORMAT_MOD_LINEAR, &templ);
       if (!trans->rsc) {
          slab_free(&ctx->transfer_pool, trans);
          return NULL;
       }
 
-      etna_copy_resource(pctx, trans->rsc, prsc, level, trans->rsc->last_level);
+      if (!(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE))
+         etna_copy_resource(pctx, trans->rsc, prsc, level,
+                            trans->rsc->last_level);
 
       /* Switch to using the temporary resource instead */
       rsc = etna_resource(trans->rsc);
@@ -199,8 +225,9 @@
    /* Always sync if we have the temporary resource.  The PIPE_TRANSFER_READ
     * case could be optimised if we knew whether the resource has outstanding
     * rendering. */
-   if (usage & PIPE_TRANSFER_READ || trans->rsc)
-      etna_resource_wait(pctx, rsc);
+   if ((usage & PIPE_TRANSFER_READ || trans->rsc) &&
+       rsc->status & ETNA_PENDING_WRITE)
+      pctx->flush(pctx, NULL, 0);
 
    /* XXX we don't handle PIPE_TRANSFER_FLUSH_EXPLICIT; this flag can be ignored
     * when mapping in-place,
@@ -252,26 +279,21 @@
       PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE is set.
     */
 
-   /* No need to allocate a buffer for copying if the resource is not in use,
-    * and no tiling is needed, can just return a direct pointer.
+   /*
+    * Pull resources into the CPU domain. Only skipped for unsynchronized
+    * transfers without a temporary resource.
     */
-   bool in_place = rsc->layout == ETNA_LAYOUT_LINEAR ||
-                   (rsc->layout == ETNA_LAYOUT_TILED &&
-                    util_format_is_compressed(prsc->format));
+   if (trans->rsc || !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
+      uint32_t prep_flags = 0;
 
-   /* Ignore PIPE_TRANSFER_UNSYNCHRONIZED and PIPE_TRANSFER_DONTBLOCK here.
-    * It appears that Gallium operates the index/vertex buffers in a
-    * circular fashion, and the CPU can catch up with the GPU and starts
-    * overwriting yet-to-be-processed entries, causing rendering corruption. */
-   uint32_t prep_flags = 0;
+      if (usage & PIPE_TRANSFER_READ)
+         prep_flags |= DRM_ETNA_PREP_READ;
+      if (usage & PIPE_TRANSFER_WRITE)
+         prep_flags |= DRM_ETNA_PREP_WRITE;
 
-   if (usage & PIPE_TRANSFER_READ)
-      prep_flags |= DRM_ETNA_PREP_READ;
-   if (usage & PIPE_TRANSFER_WRITE)
-      prep_flags |= DRM_ETNA_PREP_WRITE;
-
-   if (etna_bo_cpu_prep(rsc->bo, prep_flags))
-      goto fail_prep;
+      if (etna_bo_cpu_prep(rsc->bo, prep_flags))
+         goto fail_prep;
+   }
 
    /* map buffer object */
    void *mapped = etna_bo_map(rsc->bo);
@@ -280,7 +302,7 @@
 
    *out_transfer = ptrans;
 
-   if (in_place) {
+   if (rsc->layout == ETNA_LAYOUT_LINEAR) {
       ptrans->stride = res_level->stride;
       ptrans->layer_stride = res_level->layer_stride;
 
@@ -307,24 +329,21 @@
          goto fail;
 
       if (usage & PIPE_TRANSFER_READ) {
-         /* untile or copy resource for reading */
-         if (rsc->layout == ETNA_LAYOUT_LINEAR || rsc->layout == ETNA_LAYOUT_TILED) {
-            if (rsc->layout == ETNA_LAYOUT_TILED && !util_format_is_compressed(rsc->base.format)) {
-               etna_texture_untile(trans->staging,
-                                   mapped + ptrans->box.z * res_level->layer_stride,
-                                   ptrans->box.x, ptrans->box.y, res_level->stride,
-                                   ptrans->box.width, ptrans->box.height, ptrans->stride,
-                                   util_format_get_blocksize(rsc->base.format));
-            } else { /* non-tiled or compressed format */
-               util_copy_box(trans->staging, rsc->base.format, ptrans->stride,
-                             ptrans->layer_stride, 0, 0, 0, /* dst x,y,z */
-                             ptrans->box.width, ptrans->box.height,
-                             ptrans->box.depth, mapped, res_level->stride,
-                             res_level->layer_stride, ptrans->box.x,
-                             ptrans->box.y, ptrans->box.z);
-            }
-         } else /* TODO supertiling */
-         {
+         if (rsc->layout == ETNA_LAYOUT_TILED) {
+            etna_texture_untile(trans->staging,
+                                mapped + ptrans->box.z * res_level->layer_stride,
+                                ptrans->box.x, ptrans->box.y, res_level->stride,
+                                ptrans->box.width, ptrans->box.height, ptrans->stride,
+                                util_format_get_blocksize(rsc->base.format));
+         } else if (rsc->layout == ETNA_LAYOUT_LINEAR) {
+            util_copy_box(trans->staging, rsc->base.format, ptrans->stride,
+                          ptrans->layer_stride, 0, 0, 0, /* dst x,y,z */
+                          ptrans->box.width, ptrans->box.height,
+                          ptrans->box.depth, mapped, res_level->stride,
+                          res_level->layer_stride, ptrans->box.x,
+                          ptrans->box.y, ptrans->box.z);
+         } else {
+            /* TODO supertiling */
             BUG("unsupported tiling %i for reading", rsc->layout);
          }
       }
diff --git a/src/gallium/drivers/etnaviv/etnaviv_translate.h b/src/gallium/drivers/etnaviv/etnaviv_translate.h
index cbbfdf2..0761251 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_translate.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_translate.h
@@ -37,6 +37,7 @@
 #include "hw/state_3d.xml.h"
 
 #include "util/u_format.h"
+#include "util/u_math.h"
 
 #include <stdio.h>
 
@@ -405,6 +406,18 @@
    }
 }
 
+static inline void etna_adjust_rs_align(unsigned num_pixelpipes,
+                                        unsigned *paddingX, unsigned *paddingY)
+{
+   unsigned alignX = ETNA_RS_WIDTH_MASK + 1;
+   unsigned alignY = (ETNA_RS_HEIGHT_MASK + 1) * num_pixelpipes;
+
+   if (paddingX)
+      *paddingX = align(*paddingX, alignX);
+   if (paddingY)
+      *paddingY = align(*paddingY, alignY);
+}
+
 static inline uint32_t
 translate_clear_depth_stencil(enum pipe_format format, float depth,
                               unsigned stencil)
diff --git a/src/gallium/drivers/etnaviv/etnaviv_zsa.c b/src/gallium/drivers/etnaviv/etnaviv_zsa.c
index 7caba27..22c2020 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_zsa.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_zsa.c
@@ -27,13 +27,17 @@
 #include "etnaviv_zsa.h"
 
 #include "etnaviv_context.h"
+#include "etnaviv_screen.h"
 #include "etnaviv_translate.h"
 #include "util/u_memory.h"
 
+#include "hw/common.xml.h"
+
 void *
 etna_zsa_state_create(struct pipe_context *pctx,
                       const struct pipe_depth_stencil_alpha_state *so)
 {
+   struct etna_context *ctx = etna_context(pctx);
    struct etna_zsa_state *cs = CALLOC_STRUCT(etna_zsa_state);
 
    if (!cs)
@@ -42,7 +46,7 @@
    cs->base = *so;
 
    /* XXX does stencil[0] / stencil[1] order depend on rs->front_ccw? */
-   bool early_z = true;
+   bool early_z = !VIV_FEATURE(ctx->screen, chipFeatures, NO_EARLY_Z);
    bool disable_zs =
       (!so->depth.enabled || so->depth.func == PIPE_FUNC_ALWAYS) &&
       !so->depth.writemask;
@@ -88,9 +92,6 @@
    if (so->depth.enabled == false || so->depth.func == PIPE_FUNC_ALWAYS)
       early_z = false;
 
-   if (DBG_ENABLED(ETNA_DBG_NO_EARLY_Z))
-      early_z = false;
-
    /* compare funcs have 1 to 1 mapping */
    cs->PE_DEPTH_CONFIG =
       VIVS_PE_DEPTH_CONFIG_DEPTH_FUNC(so->depth.enabled ? so->depth.func
diff --git a/src/gallium/drivers/freedreno/Android.mk b/src/gallium/drivers/freedreno/Android.mk
index 5c97d9e..7b54309 100644
--- a/src/gallium/drivers/freedreno/Android.mk
+++ b/src/gallium/drivers/freedreno/Android.mk
@@ -48,3 +48,9 @@
 include $(LOCAL_PATH)/Android.gen.mk
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_FREEDRENO),)
+GALLIUM_TARGET_DRIVERS += msm
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_freedreno)
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index b53a23e..db716f3 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -18,6 +18,8 @@
 	freedreno_program.h \
 	freedreno_query.c \
 	freedreno_query.h \
+	freedreno_query_acc.c \
+	freedreno_query_acc.h \
 	freedreno_query_hw.c \
 	freedreno_query_hw.h \
 	freedreno_query_sw.c \
@@ -124,6 +126,8 @@
 	a5xx/a5xx.xml.h \
 	a5xx/fd5_blend.c \
 	a5xx/fd5_blend.h \
+	a5xx/fd5_compute.c \
+	a5xx/fd5_compute.h \
 	a5xx/fd5_context.c \
 	a5xx/fd5_context.h \
 	a5xx/fd5_draw.c \
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index 327b3be..0811bdc 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -8,17 +8,17 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2016-04-26 17:56:44)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2016-02-10 17:07:21)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32907 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  12025 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13324 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  31866 bytes, from 2017-06-02 15:50:23)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 111898 bytes, from 2017-05-30 19:25:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 142603 bytes, from 2017-06-06 17:02:32)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
 
-Copyright (C) 2013-2016 by the following authors:
+Copyright (C) 2013-2017 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 - Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
 
@@ -352,6 +352,38 @@
 #define REG_A2XX_RBBM_DEBUG					0x0000039b
 
 #define REG_A2XX_RBBM_PM_OVERRIDE1				0x0000039c
+#define A2XX_RBBM_PM_OVERRIDE1_RBBM_AHBCLK_PM_OVERRIDE		0x00000001
+#define A2XX_RBBM_PM_OVERRIDE1_SC_REG_SCLK_PM_OVERRIDE		0x00000002
+#define A2XX_RBBM_PM_OVERRIDE1_SC_SCLK_PM_OVERRIDE		0x00000004
+#define A2XX_RBBM_PM_OVERRIDE1_SP_TOP_SCLK_PM_OVERRIDE		0x00000008
+#define A2XX_RBBM_PM_OVERRIDE1_SP_V0_SCLK_PM_OVERRIDE		0x00000010
+#define A2XX_RBBM_PM_OVERRIDE1_SQ_REG_SCLK_PM_OVERRIDE		0x00000020
+#define A2XX_RBBM_PM_OVERRIDE1_SQ_REG_FIFOS_SCLK_PM_OVERRIDE	0x00000040
+#define A2XX_RBBM_PM_OVERRIDE1_SQ_CONST_MEM_SCLK_PM_OVERRIDE	0x00000080
+#define A2XX_RBBM_PM_OVERRIDE1_SQ_SQ_SCLK_PM_OVERRIDE		0x00000100
+#define A2XX_RBBM_PM_OVERRIDE1_SX_SCLK_PM_OVERRIDE		0x00000200
+#define A2XX_RBBM_PM_OVERRIDE1_SX_REG_SCLK_PM_OVERRIDE		0x00000400
+#define A2XX_RBBM_PM_OVERRIDE1_TCM_TCO_SCLK_PM_OVERRIDE		0x00000800
+#define A2XX_RBBM_PM_OVERRIDE1_TCM_TCM_SCLK_PM_OVERRIDE		0x00001000
+#define A2XX_RBBM_PM_OVERRIDE1_TCM_TCD_SCLK_PM_OVERRIDE		0x00002000
+#define A2XX_RBBM_PM_OVERRIDE1_TCM_REG_SCLK_PM_OVERRIDE		0x00004000
+#define A2XX_RBBM_PM_OVERRIDE1_TPC_TPC_SCLK_PM_OVERRIDE		0x00008000
+#define A2XX_RBBM_PM_OVERRIDE1_TPC_REG_SCLK_PM_OVERRIDE		0x00010000
+#define A2XX_RBBM_PM_OVERRIDE1_TCF_TCA_SCLK_PM_OVERRIDE		0x00020000
+#define A2XX_RBBM_PM_OVERRIDE1_TCF_TCB_SCLK_PM_OVERRIDE		0x00040000
+#define A2XX_RBBM_PM_OVERRIDE1_TCF_TCB_READ_SCLK_PM_OVERRIDE	0x00080000
+#define A2XX_RBBM_PM_OVERRIDE1_TP_TP_SCLK_PM_OVERRIDE		0x00100000
+#define A2XX_RBBM_PM_OVERRIDE1_TP_REG_SCLK_PM_OVERRIDE		0x00200000
+#define A2XX_RBBM_PM_OVERRIDE1_CP_G_SCLK_PM_OVERRIDE		0x00400000
+#define A2XX_RBBM_PM_OVERRIDE1_CP_REG_SCLK_PM_OVERRIDE		0x00800000
+#define A2XX_RBBM_PM_OVERRIDE1_CP_G_REG_SCLK_PM_OVERRIDE	0x01000000
+#define A2XX_RBBM_PM_OVERRIDE1_SPI_SCLK_PM_OVERRIDE		0x02000000
+#define A2XX_RBBM_PM_OVERRIDE1_RB_REG_SCLK_PM_OVERRIDE		0x04000000
+#define A2XX_RBBM_PM_OVERRIDE1_RB_SCLK_PM_OVERRIDE		0x08000000
+#define A2XX_RBBM_PM_OVERRIDE1_MH_MH_SCLK_PM_OVERRIDE		0x10000000
+#define A2XX_RBBM_PM_OVERRIDE1_MH_REG_SCLK_PM_OVERRIDE		0x20000000
+#define A2XX_RBBM_PM_OVERRIDE1_MH_MMU_SCLK_PM_OVERRIDE		0x40000000
+#define A2XX_RBBM_PM_OVERRIDE1_MH_TCROQ_SCLK_PM_OVERRIDE	0x80000000
 
 #define REG_A2XX_RBBM_PM_OVERRIDE2				0x0000039d
 
@@ -479,12 +511,43 @@
 #define REG_A2XX_PA_SU_DEBUG_DATA				0x00000c81
 
 #define REG_A2XX_PA_SU_FACE_DATA				0x00000c86
+#define A2XX_PA_SU_FACE_DATA_BASE_ADDR__MASK			0xffffffe0
+#define A2XX_PA_SU_FACE_DATA_BASE_ADDR__SHIFT			5
+static inline uint32_t A2XX_PA_SU_FACE_DATA_BASE_ADDR(uint32_t val)
+{
+	return ((val) << A2XX_PA_SU_FACE_DATA_BASE_ADDR__SHIFT) & A2XX_PA_SU_FACE_DATA_BASE_ADDR__MASK;
+}
 
 #define REG_A2XX_SQ_GPR_MANAGEMENT				0x00000d00
+#define A2XX_SQ_GPR_MANAGEMENT_REG_DYNAMIC			0x00000001
+#define A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_PIX__MASK		0x00000ff0
+#define A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_PIX__SHIFT		4
+static inline uint32_t A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_PIX(uint32_t val)
+{
+	return ((val) << A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_PIX__SHIFT) & A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_PIX__MASK;
+}
+#define A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_VTX__MASK		0x000ff000
+#define A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_VTX__SHIFT		12
+static inline uint32_t A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_VTX(uint32_t val)
+{
+	return ((val) << A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_VTX__SHIFT) & A2XX_SQ_GPR_MANAGEMENT_REG_SIZE_VTX__MASK;
+}
 
 #define REG_A2XX_SQ_FLOW_CONTROL				0x00000d01
 
 #define REG_A2XX_SQ_INST_STORE_MANAGMENT			0x00000d02
+#define A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_PIX__MASK	0x00000fff
+#define A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_PIX__SHIFT	0
+static inline uint32_t A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_PIX(uint32_t val)
+{
+	return ((val) << A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_PIX__SHIFT) & A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_PIX__MASK;
+}
+#define A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_VTX__MASK	0x0fff0000
+#define A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_VTX__SHIFT	16
+static inline uint32_t A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_VTX(uint32_t val)
+{
+	return ((val) << A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_VTX__SHIFT) & A2XX_SQ_INST_STORE_MANAGMENT_INST_BASE_VTX__MASK;
+}
 
 #define REG_A2XX_SQ_DEBUG_MISC					0x00000d05
 
@@ -746,6 +809,24 @@
 #define REG_A2XX_RB_BLEND_ALPHA					0x00002108
 
 #define REG_A2XX_RB_FOG_COLOR					0x00002109
+#define A2XX_RB_FOG_COLOR_FOG_RED__MASK				0x000000ff
+#define A2XX_RB_FOG_COLOR_FOG_RED__SHIFT			0
+static inline uint32_t A2XX_RB_FOG_COLOR_FOG_RED(uint32_t val)
+{
+	return ((val) << A2XX_RB_FOG_COLOR_FOG_RED__SHIFT) & A2XX_RB_FOG_COLOR_FOG_RED__MASK;
+}
+#define A2XX_RB_FOG_COLOR_FOG_GREEN__MASK			0x0000ff00
+#define A2XX_RB_FOG_COLOR_FOG_GREEN__SHIFT			8
+static inline uint32_t A2XX_RB_FOG_COLOR_FOG_GREEN(uint32_t val)
+{
+	return ((val) << A2XX_RB_FOG_COLOR_FOG_GREEN__SHIFT) & A2XX_RB_FOG_COLOR_FOG_GREEN__MASK;
+}
+#define A2XX_RB_FOG_COLOR_FOG_BLUE__MASK			0x00ff0000
+#define A2XX_RB_FOG_COLOR_FOG_BLUE__SHIFT			16
+static inline uint32_t A2XX_RB_FOG_COLOR_FOG_BLUE(uint32_t val)
+{
+	return ((val) << A2XX_RB_FOG_COLOR_FOG_BLUE__SHIFT) & A2XX_RB_FOG_COLOR_FOG_BLUE__MASK;
+}
 
 #define REG_A2XX_RB_STENCILREFMASK_BF				0x0000210c
 #define A2XX_RB_STENCILREFMASK_BF_STENCILREF__MASK		0x000000ff
@@ -894,14 +975,146 @@
 #define A2XX_SQ_CONTEXT_MISC_TX_CACHE_SEL			0x00040000
 
 #define REG_A2XX_SQ_INTERPOLATOR_CNTL				0x00002182
+#define A2XX_SQ_INTERPOLATOR_CNTL_PARAM_SHADE__MASK		0x0000ffff
+#define A2XX_SQ_INTERPOLATOR_CNTL_PARAM_SHADE__SHIFT		0
+static inline uint32_t A2XX_SQ_INTERPOLATOR_CNTL_PARAM_SHADE(uint32_t val)
+{
+	return ((val) << A2XX_SQ_INTERPOLATOR_CNTL_PARAM_SHADE__SHIFT) & A2XX_SQ_INTERPOLATOR_CNTL_PARAM_SHADE__MASK;
+}
+#define A2XX_SQ_INTERPOLATOR_CNTL_SAMPLING_PATTERN__MASK	0xffff0000
+#define A2XX_SQ_INTERPOLATOR_CNTL_SAMPLING_PATTERN__SHIFT	16
+static inline uint32_t A2XX_SQ_INTERPOLATOR_CNTL_SAMPLING_PATTERN(uint32_t val)
+{
+	return ((val) << A2XX_SQ_INTERPOLATOR_CNTL_SAMPLING_PATTERN__SHIFT) & A2XX_SQ_INTERPOLATOR_CNTL_SAMPLING_PATTERN__MASK;
+}
 
 #define REG_A2XX_SQ_WRAPPING_0					0x00002183
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_0__MASK			0x0000000f
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_0__SHIFT			0
+static inline uint32_t A2XX_SQ_WRAPPING_0_PARAM_WRAP_0(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_0_PARAM_WRAP_0__SHIFT) & A2XX_SQ_WRAPPING_0_PARAM_WRAP_0__MASK;
+}
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_1__MASK			0x000000f0
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_1__SHIFT			4
+static inline uint32_t A2XX_SQ_WRAPPING_0_PARAM_WRAP_1(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_0_PARAM_WRAP_1__SHIFT) & A2XX_SQ_WRAPPING_0_PARAM_WRAP_1__MASK;
+}
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_2__MASK			0x00000f00
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_2__SHIFT			8
+static inline uint32_t A2XX_SQ_WRAPPING_0_PARAM_WRAP_2(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_0_PARAM_WRAP_2__SHIFT) & A2XX_SQ_WRAPPING_0_PARAM_WRAP_2__MASK;
+}
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_3__MASK			0x0000f000
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_3__SHIFT			12
+static inline uint32_t A2XX_SQ_WRAPPING_0_PARAM_WRAP_3(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_0_PARAM_WRAP_3__SHIFT) & A2XX_SQ_WRAPPING_0_PARAM_WRAP_3__MASK;
+}
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_4__MASK			0x000f0000
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_4__SHIFT			16
+static inline uint32_t A2XX_SQ_WRAPPING_0_PARAM_WRAP_4(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_0_PARAM_WRAP_4__SHIFT) & A2XX_SQ_WRAPPING_0_PARAM_WRAP_4__MASK;
+}
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_5__MASK			0x00f00000
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_5__SHIFT			20
+static inline uint32_t A2XX_SQ_WRAPPING_0_PARAM_WRAP_5(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_0_PARAM_WRAP_5__SHIFT) & A2XX_SQ_WRAPPING_0_PARAM_WRAP_5__MASK;
+}
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_6__MASK			0x0f000000
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_6__SHIFT			24
+static inline uint32_t A2XX_SQ_WRAPPING_0_PARAM_WRAP_6(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_0_PARAM_WRAP_6__SHIFT) & A2XX_SQ_WRAPPING_0_PARAM_WRAP_6__MASK;
+}
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_7__MASK			0xf0000000
+#define A2XX_SQ_WRAPPING_0_PARAM_WRAP_7__SHIFT			28
+static inline uint32_t A2XX_SQ_WRAPPING_0_PARAM_WRAP_7(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_0_PARAM_WRAP_7__SHIFT) & A2XX_SQ_WRAPPING_0_PARAM_WRAP_7__MASK;
+}
 
 #define REG_A2XX_SQ_WRAPPING_1					0x00002184
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_8__MASK			0x0000000f
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_8__SHIFT			0
+static inline uint32_t A2XX_SQ_WRAPPING_1_PARAM_WRAP_8(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_1_PARAM_WRAP_8__SHIFT) & A2XX_SQ_WRAPPING_1_PARAM_WRAP_8__MASK;
+}
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_9__MASK			0x000000f0
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_9__SHIFT			4
+static inline uint32_t A2XX_SQ_WRAPPING_1_PARAM_WRAP_9(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_1_PARAM_WRAP_9__SHIFT) & A2XX_SQ_WRAPPING_1_PARAM_WRAP_9__MASK;
+}
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_10__MASK			0x00000f00
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_10__SHIFT			8
+static inline uint32_t A2XX_SQ_WRAPPING_1_PARAM_WRAP_10(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_1_PARAM_WRAP_10__SHIFT) & A2XX_SQ_WRAPPING_1_PARAM_WRAP_10__MASK;
+}
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_11__MASK			0x0000f000
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_11__SHIFT			12
+static inline uint32_t A2XX_SQ_WRAPPING_1_PARAM_WRAP_11(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_1_PARAM_WRAP_11__SHIFT) & A2XX_SQ_WRAPPING_1_PARAM_WRAP_11__MASK;
+}
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_12__MASK			0x000f0000
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_12__SHIFT			16
+static inline uint32_t A2XX_SQ_WRAPPING_1_PARAM_WRAP_12(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_1_PARAM_WRAP_12__SHIFT) & A2XX_SQ_WRAPPING_1_PARAM_WRAP_12__MASK;
+}
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_13__MASK			0x00f00000
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_13__SHIFT			20
+static inline uint32_t A2XX_SQ_WRAPPING_1_PARAM_WRAP_13(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_1_PARAM_WRAP_13__SHIFT) & A2XX_SQ_WRAPPING_1_PARAM_WRAP_13__MASK;
+}
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_14__MASK			0x0f000000
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_14__SHIFT			24
+static inline uint32_t A2XX_SQ_WRAPPING_1_PARAM_WRAP_14(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_1_PARAM_WRAP_14__SHIFT) & A2XX_SQ_WRAPPING_1_PARAM_WRAP_14__MASK;
+}
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_15__MASK			0xf0000000
+#define A2XX_SQ_WRAPPING_1_PARAM_WRAP_15__SHIFT			28
+static inline uint32_t A2XX_SQ_WRAPPING_1_PARAM_WRAP_15(uint32_t val)
+{
+	return ((val) << A2XX_SQ_WRAPPING_1_PARAM_WRAP_15__SHIFT) & A2XX_SQ_WRAPPING_1_PARAM_WRAP_15__MASK;
+}
 
 #define REG_A2XX_SQ_PS_PROGRAM					0x000021f6
+#define A2XX_SQ_PS_PROGRAM_BASE__MASK				0x00000fff
+#define A2XX_SQ_PS_PROGRAM_BASE__SHIFT				0
+static inline uint32_t A2XX_SQ_PS_PROGRAM_BASE(uint32_t val)
+{
+	return ((val) << A2XX_SQ_PS_PROGRAM_BASE__SHIFT) & A2XX_SQ_PS_PROGRAM_BASE__MASK;
+}
+#define A2XX_SQ_PS_PROGRAM_SIZE__MASK				0x00fff000
+#define A2XX_SQ_PS_PROGRAM_SIZE__SHIFT				12
+static inline uint32_t A2XX_SQ_PS_PROGRAM_SIZE(uint32_t val)
+{
+	return ((val) << A2XX_SQ_PS_PROGRAM_SIZE__SHIFT) & A2XX_SQ_PS_PROGRAM_SIZE__MASK;
+}
 
 #define REG_A2XX_SQ_VS_PROGRAM					0x000021f7
+#define A2XX_SQ_VS_PROGRAM_BASE__MASK				0x00000fff
+#define A2XX_SQ_VS_PROGRAM_BASE__SHIFT				0
+static inline uint32_t A2XX_SQ_VS_PROGRAM_BASE(uint32_t val)
+{
+	return ((val) << A2XX_SQ_VS_PROGRAM_BASE__SHIFT) & A2XX_SQ_VS_PROGRAM_BASE__MASK;
+}
+#define A2XX_SQ_VS_PROGRAM_SIZE__MASK				0x00fff000
+#define A2XX_SQ_VS_PROGRAM_SIZE__SHIFT				12
+static inline uint32_t A2XX_SQ_VS_PROGRAM_SIZE(uint32_t val)
+{
+	return ((val) << A2XX_SQ_VS_PROGRAM_SIZE__SHIFT) & A2XX_SQ_VS_PROGRAM_SIZE__MASK;
+}
 
 #define REG_A2XX_VGT_EVENT_INITIATOR				0x000021f9
 
@@ -1308,6 +1521,14 @@
 }
 
 #define REG_A2XX_PA_SC_VIZ_QUERY				0x00002293
+#define A2XX_PA_SC_VIZ_QUERY_VIZ_QUERY_ENA			0x00000001
+#define A2XX_PA_SC_VIZ_QUERY_VIZ_QUERY_ID__MASK			0x0000007e
+#define A2XX_PA_SC_VIZ_QUERY_VIZ_QUERY_ID__SHIFT		1
+static inline uint32_t A2XX_PA_SC_VIZ_QUERY_VIZ_QUERY_ID(uint32_t val)
+{
+	return ((val) << A2XX_PA_SC_VIZ_QUERY_VIZ_QUERY_ID__SHIFT) & A2XX_PA_SC_VIZ_QUERY_VIZ_QUERY_ID__MASK;
+}
+#define A2XX_PA_SC_VIZ_QUERY_KILL_PIX_POST_EARLY_Z		0x00000100
 
 #define REG_A2XX_VGT_ENHANCE					0x00002294
 
@@ -1323,6 +1544,18 @@
 #define A2XX_PA_SC_LINE_CNTL_LAST_PIXEL				0x00000400
 
 #define REG_A2XX_PA_SC_AA_CONFIG				0x00002301
+#define A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES__MASK		0x00000007
+#define A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES__SHIFT		0
+static inline uint32_t A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES(uint32_t val)
+{
+	return ((val) << A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES__SHIFT) & A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES__MASK;
+}
+#define A2XX_PA_SC_AA_CONFIG_MAX_SAMPLE_DIST__MASK		0x0001e000
+#define A2XX_PA_SC_AA_CONFIG_MAX_SAMPLE_DIST__SHIFT		13
+static inline uint32_t A2XX_PA_SC_AA_CONFIG_MAX_SAMPLE_DIST(uint32_t val)
+{
+	return ((val) << A2XX_PA_SC_AA_CONFIG_MAX_SAMPLE_DIST__SHIFT) & A2XX_PA_SC_AA_CONFIG_MAX_SAMPLE_DIST__MASK;
+}
 
 #define REG_A2XX_PA_SU_VTX_CNTL					0x00002302
 #define A2XX_PA_SU_VTX_CNTL_PIX_CENTER__MASK			0x00000001
@@ -1411,8 +1644,20 @@
 #define REG_A2XX_PA_SC_AA_MASK					0x00002312
 
 #define REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL			0x00002316
+#define A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL_VTX_REUSE_DEPTH__MASK	0x00000007
+#define A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL_VTX_REUSE_DEPTH__SHIFT	0
+static inline uint32_t A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL_VTX_REUSE_DEPTH(uint32_t val)
+{
+	return ((val) << A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL_VTX_REUSE_DEPTH__SHIFT) & A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL_VTX_REUSE_DEPTH__MASK;
+}
 
 #define REG_A2XX_VGT_OUT_DEALLOC_CNTL				0x00002317
+#define A2XX_VGT_OUT_DEALLOC_CNTL_DEALLOC_DIST__MASK		0x00000003
+#define A2XX_VGT_OUT_DEALLOC_CNTL_DEALLOC_DIST__SHIFT		0
+static inline uint32_t A2XX_VGT_OUT_DEALLOC_CNTL_DEALLOC_DIST(uint32_t val)
+{
+	return ((val) << A2XX_VGT_OUT_DEALLOC_CNTL_DEALLOC_DIST__SHIFT) & A2XX_VGT_OUT_DEALLOC_CNTL_DEALLOC_DIST__MASK;
+}
 
 #define REG_A2XX_RB_COPY_CONTROL				0x00002318
 #define A2XX_RB_COPY_CONTROL_COPY_SAMPLE_SELECT__MASK		0x00000007
diff --git a/src/gallium/drivers/freedreno/a2xx/disasm-a2xx.c b/src/gallium/drivers/freedreno/a2xx/disasm-a2xx.c
index fc309e8..c380450 100644
--- a/src/gallium/drivers/freedreno/a2xx/disasm-a2xx.c
+++ b/src/gallium/drivers/freedreno/a2xx/disasm-a2xx.c
@@ -111,7 +111,7 @@
 		case 0:  name = "gl_FragColor"; break;
 		}
 		break;
-	case SHADER_COMPUTE:
+	default:
 		unreachable("not reached");
 	}
 	/* if we had a symbol table here, we could look
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
index a824018..8df1793 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
@@ -69,8 +69,8 @@
 		struct pipe_vertex_buffer *vb =
 				&vertexbuf->vb[elem->vertex_buffer_index];
 		bufs[i].offset = vb->buffer_offset;
-		bufs[i].size = fd_bo_size(fd_resource(vb->buffer)->bo);
-		bufs[i].prsc = vb->buffer;
+		bufs[i].size = fd_bo_size(fd_resource(vb->buffer.resource)->bo);
+		bufs[i].prsc = vb->buffer.resource;
 	}
 
 	// NOTE I believe the 0x78 (or 0x9c in solid_vp) relates to the
@@ -80,7 +80,8 @@
 }
 
 static bool
-fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
+fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
+             unsigned index_offset)
 {
 	struct fd_ringbuffer *ring = ctx->batch->draw;
 
@@ -108,7 +109,7 @@
 	OUT_RING(ring, info->min_index);        /* VGT_MIN_VTX_INDX */
 
 	fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode],
-				 IGNORE_VISIBILITY, info);
+				 IGNORE_VISIBILITY, info, index_offset);
 
 	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 	OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010));
@@ -116,11 +117,13 @@
 
 	emit_cacheflush(ring);
 
+	fd_context_all_clean(ctx);
+
 	return true;
 }
 
 
-static void
+static bool
 fd2_clear(struct fd_context *ctx, unsigned buffers,
 		const union pipe_color_union *color, double depth, unsigned stencil)
 {
@@ -276,6 +279,20 @@
 	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 	OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL));
 	OUT_RING(ring, 0x00000000);
+
+	ctx->dirty |= FD_DIRTY_ZSA |
+			FD_DIRTY_VIEWPORT |
+			FD_DIRTY_RASTERIZER |
+			FD_DIRTY_SAMPLE_MASK |
+			FD_DIRTY_PROG |
+			FD_DIRTY_CONST |
+			FD_DIRTY_BLEND |
+			FD_DIRTY_FRAMEBUFFER;
+
+	ctx->dirty_shader[PIPE_SHADER_VERTEX]   |= FD_DIRTY_SHADER_PROG;
+	ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST;
+
+	return true;
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
index b3a1b3d..d745e44 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
@@ -152,16 +152,18 @@
 static void
 emit_textures(struct fd_ringbuffer *ring, struct fd_context *ctx)
 {
+	struct fd_texture_stateobj *fragtex = &ctx->tex[PIPE_SHADER_FRAGMENT];
+	struct fd_texture_stateobj *verttex = &ctx->tex[PIPE_SHADER_VERTEX];
 	texmask emitted = 0;
 	unsigned i;
 
-	for (i = 0; i < ctx->verttex.num_samplers; i++)
-		if (ctx->verttex.samplers[i])
-			emitted |= emit_texture(ring, ctx, &ctx->verttex, i, emitted);
+	for (i = 0; i < verttex->num_samplers; i++)
+		if (verttex->samplers[i])
+			emitted |= emit_texture(ring, ctx, verttex, i, emitted);
 
-	for (i = 0; i < ctx->fragtex.num_samplers; i++)
-		if (ctx->fragtex.samplers[i])
-			emitted |= emit_texture(ring, ctx, &ctx->fragtex, i, emitted);
+	for (i = 0; i < fragtex->num_samplers; i++)
+		if (fragtex->samplers[i])
+			emitted |= emit_texture(ring, ctx, fragtex, i, emitted);
 }
 
 void
@@ -180,7 +182,7 @@
 }
 
 void
-fd2_emit_state(struct fd_context *ctx, uint32_t dirty)
+fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty)
 {
 	struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend);
 	struct fd2_zsa_stateobj *zsa = fd2_zsa_stateobj(ctx->zsa);
@@ -282,7 +284,7 @@
 		fd2_program_emit(ring, &ctx->prog);
 	}
 
-	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) {
+	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) {
 		emit_constants(ring,  VS_CONST_BASE * 4,
 				&ctx->constbuf[PIPE_SHADER_VERTEX],
 				(dirty & FD_DIRTY_PROG) ? ctx->prog.vp : NULL);
@@ -307,10 +309,8 @@
 		OUT_RING(ring, blend->rb_colormask);
 	}
 
-	if (dirty & (FD_DIRTY_VERTTEX | FD_DIRTY_FRAGTEX | FD_DIRTY_PROG))
+	if (dirty & (FD_DIRTY_TEX | FD_DIRTY_PROG))
 		emit_textures(ring, ctx);
-
-	ctx->dirty &= ~dirty;
 }
 
 /* emit per-context initialization:
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
index 6a26c85..d908b11 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
@@ -42,7 +42,7 @@
 
 void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val,
 		struct fd2_vertex_buf *vbufs, uint32_t n);
-void fd2_emit_state(struct fd_context *ctx, uint32_t dirty);
+void fd2_emit_state(struct fd_context *ctx, enum fd_dirty_3d_state dirty);
 void fd2_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring);
 
 void fd2_emit_init(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
index 4f31772..9a77457 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
@@ -241,16 +241,18 @@
 fd2_program_validate(struct fd_context *ctx)
 {
 	struct fd_program_stateobj *prog = &ctx->prog;
+	bool dirty_fp = !!(ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_PROG);
+	bool dirty_vp = !!(ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_PROG);
 
 	/* if vertex or frag shader is dirty, we may need to recompile. Compile
 	 * frag shader first, as that assigns the register slots for exports
 	 * from the vertex shader.  And therefore if frag shader has changed we
 	 * need to recompile both vert and frag shader.
 	 */
-	if (ctx->dirty & FD_SHADER_DIRTY_FP)
+	if (dirty_fp)
 		compile(prog, prog->fp);
 
-	if (ctx->dirty & (FD_SHADER_DIRTY_FP | FD_SHADER_DIRTY_VP))
+	if (dirty_fp || dirty_vp)
 		compile(prog, prog->vp);
 
 	/* if necessary, fix up vertex fetch instructions: */
@@ -259,8 +261,8 @@
 
 	/* if necessary, fix up texture fetch instructions: */
 	if (ctx->dirty & (FD_DIRTY_TEXSTATE | FD_DIRTY_PROG)) {
-		patch_tex_fetches(ctx, prog->vp, &ctx->verttex);
-		patch_tex_fetches(ctx, prog->fp, &ctx->fragtex);
+		patch_tex_fetches(ctx, prog->vp, &ctx->tex[PIPE_SHADER_VERTEX]);
+		patch_tex_fetches(ctx, prog->fp, &ctx->tex[PIPE_SHADER_FRAGMENT]);
 	}
 }
 
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_texture.c b/src/gallium/drivers/freedreno/a2xx/fd2_texture.c
index 932383a..089c337 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_texture.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_texture.c
@@ -116,7 +116,7 @@
 		 * a change in # of fragment textures/samplers will trigger patching and
 		 * re-emitting the vertex shader:
 		 */
-		if (nr != ctx->fragtex.num_samplers)
+		if (nr != ctx->tex[PIPE_SHADER_FRAGMENT].num_samplers)
 			ctx->dirty |= FD_DIRTY_TEXSTATE;
 	}
 
@@ -151,6 +151,25 @@
 	return &so->base;
 }
 
+static void
+fd2_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader,
+		unsigned start, unsigned nr,
+		struct pipe_sampler_view **views)
+{
+	if (shader == PIPE_SHADER_FRAGMENT) {
+		struct fd_context *ctx = fd_context(pctx);
+
+		/* on a2xx, since there is a flat address space for textures/samplers,
+		 * a change in # of fragment textures/samplers will trigger patching and
+		 * re-emitting the vertex shader:
+		 */
+		if (nr != ctx->tex[PIPE_SHADER_FRAGMENT].num_textures)
+			ctx->dirty |= FD_DIRTY_TEXSTATE;
+	}
+
+	fd_set_sampler_views(pctx, shader, start, nr, views);
+}
+
 /* map gallium sampler-id to hw const-idx.. adreno uses a flat address
  * space of samplers (const-idx), so we need to map the gallium sampler-id
  * which is per-shader to a global const-idx space.
@@ -166,9 +185,9 @@
 fd2_get_const_idx(struct fd_context *ctx, struct fd_texture_stateobj *tex,
 		unsigned samp_id)
 {
-	if (tex == &ctx->fragtex)
+	if (tex == &ctx->tex[PIPE_SHADER_FRAGMENT])
 		return samp_id;
-	return samp_id + ctx->fragtex.num_samplers;
+	return samp_id + ctx->tex[PIPE_SHADER_FRAGMENT].num_samplers;
 }
 
 void
@@ -177,5 +196,5 @@
 	pctx->create_sampler_state = fd2_sampler_state_create;
 	pctx->bind_sampler_states = fd2_sampler_states_bind;
 	pctx->create_sampler_view = fd2_sampler_view_create;
-	pctx->set_sampler_views = fd_set_sampler_views;
+	pctx->set_sampler_views = fd2_set_sampler_views;
 }
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index 8a0c1a1..9574789 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -8,17 +8,17 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2016-04-26 17:56:44)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2016-02-10 17:07:21)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32907 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  12025 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13324 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  31866 bytes, from 2017-06-02 15:50:23)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 111898 bytes, from 2017-05-30 19:25:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 142603 bytes, from 2017-06-06 17:02:32)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
 
-Copyright (C) 2013-2016 by the following authors:
+Copyright (C) 2013-2017 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 - Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index dac5941..b432f59 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -26,6 +26,7 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
+#include "freedreno_query_hw.h"
 
 #include "fd3_context.h"
 #include "fd3_blend.h"
@@ -51,6 +52,8 @@
 
 	u_upload_destroy(fd3_ctx->border_color_uploader);
 
+	fd_hw_query_fini(pctx);
+
 	fd_context_destroy(pctx);
 }
 
@@ -95,6 +98,8 @@
 	if (!pctx)
 		return NULL;
 
+	fd_hw_query_init(pctx);
+
 	fd3_ctx->vs_pvt_mem = fd_bo_new(screen->dev, 0x2000,
 			DRM_FREEDRENO_GEM_TYPE_KMEM);
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index b3cd461..761f25b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -55,7 +55,7 @@
 
 static void
 draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		struct fd3_emit *emit)
+		struct fd3_emit *emit, unsigned index_offset)
 {
 	const struct pipe_draw_info *info = emit->info;
 	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
@@ -72,7 +72,7 @@
 	OUT_RING(ring, add_sat(info->min_index, info->index_bias)); /* VFD_INDEX_MIN */
 	OUT_RING(ring, add_sat(info->max_index, info->index_bias)); /* VFD_INDEX_MAX */
 	OUT_RING(ring, info->start_instance);   /* VFD_INSTANCEID_OFFSET */
-	OUT_RING(ring, info->indexed ? info->index_bias : info->start); /* VFD_INDEX_OFFSET */
+	OUT_RING(ring, info->index_size ? info->index_bias : info->start); /* VFD_INDEX_OFFSET */
 
 	OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1);
 	OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
@@ -86,7 +86,7 @@
 
 	fd_draw_emit(ctx->batch, ring, primtype,
 			emit->key.binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY,
-			info);
+			info, index_offset);
 }
 
 /* fixup dirty shader state in case some "unrelated" (from the state-
@@ -100,39 +100,23 @@
 	struct ir3_shader_key *last_key = &fd3_ctx->last_key;
 
 	if (!ir3_shader_key_equal(last_key, key)) {
-		if (last_key->has_per_samp || key->has_per_samp) {
-			if ((last_key->vsaturate_s != key->vsaturate_s) ||
-					(last_key->vsaturate_t != key->vsaturate_t) ||
-					(last_key->vsaturate_r != key->vsaturate_r))
-				ctx->dirty |= FD_SHADER_DIRTY_VP;
-
-			if ((last_key->fsaturate_s != key->fsaturate_s) ||
-					(last_key->fsaturate_t != key->fsaturate_t) ||
-					(last_key->fsaturate_r != key->fsaturate_r))
-				ctx->dirty |= FD_SHADER_DIRTY_FP;
+		if (ir3_shader_key_changes_fs(last_key, key)) {
+			ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG;
+			ctx->dirty |= FD_DIRTY_PROG;
 		}
 
-		if (last_key->vclamp_color != key->vclamp_color)
-			ctx->dirty |= FD_SHADER_DIRTY_VP;
-
-		if (last_key->fclamp_color != key->fclamp_color)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->color_two_side != key->color_two_side)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->half_precision != key->half_precision)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->ucp_enables != key->ucp_enables)
-			ctx->dirty |= FD_SHADER_DIRTY_FP | FD_SHADER_DIRTY_VP;
+		if (ir3_shader_key_changes_vs(last_key, key)) {
+			ctx->dirty_shader[PIPE_SHADER_VERTEX] |= FD_DIRTY_SHADER_PROG;
+			ctx->dirty |= FD_DIRTY_PROG;
+		}
 
 		fd3_ctx->last_key = *key;
 	}
 }
 
 static bool
-fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
+fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
+             unsigned index_offset)
 {
 	struct fd3_context *fd3_ctx = fd3_context(ctx);
 	struct fd3_emit emit = {
@@ -173,14 +157,16 @@
 
 	emit.key.binning_pass = false;
 	emit.dirty = dirty;
-	draw_impl(ctx, ctx->batch->draw, &emit);
+	draw_impl(ctx, ctx->batch->draw, &emit, index_offset);
 
 	/* and now binning pass: */
 	emit.key.binning_pass = true;
 	emit.dirty = dirty & ~(FD_DIRTY_BLEND);
 	emit.vp = NULL;   /* we changed key so need to refetch vp */
 	emit.fp = NULL;
-	draw_impl(ctx, ctx->batch->binning, &emit);
+	draw_impl(ctx, ctx->batch->binning, &emit, index_offset);
+
+	fd_context_all_clean(ctx);
 
 	return true;
 }
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 6c3458a..aefbbea 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -401,7 +401,7 @@
 			struct pipe_vertex_element *elem = &vtx->vtx->pipe[i];
 			const struct pipe_vertex_buffer *vb =
 					&vtx->vertexbuf.vb[elem->vertex_buffer_index];
-			struct fd_resource *rsc = fd_resource(vb->buffer);
+			struct fd_resource *rsc = fd_resource(vb->buffer.resource);
 			enum pipe_format pfmt = elem->src_format;
 			enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(pfmt);
 			bool switchnext = (i != last) ||
@@ -490,7 +490,7 @@
 {
 	const struct ir3_shader_variant *vp = fd3_emit_get_vp(emit);
 	const struct ir3_shader_variant *fp = fd3_emit_get_fp(emit);
-	uint32_t dirty = emit->dirty;
+	const enum fd_dirty_3d_state dirty = emit->dirty;
 
 	emit_marker(ring, 5);
 
@@ -622,7 +622,7 @@
 			val |= A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(stride_in_vpc);
 		}
 
-		if (info->indexed && info->primitive_restart) {
+		if (info->index_size && info->primitive_restart) {
 			val |= A3XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART;
 		}
 
@@ -713,9 +713,9 @@
 	OUT_RING(ring, HLSQ_FLUSH);
 
 	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
-		ir3_emit_consts(vp, ring, ctx, emit->info, dirty);
+		ir3_emit_vs_consts(vp, ring, ctx, emit->info);
 		if (!emit->key.binning_pass)
-			ir3_emit_consts(fp, ring, ctx, emit->info, dirty);
+			ir3_emit_fs_consts(fp, ring, ctx);
 	}
 
 	if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) {
@@ -783,24 +783,14 @@
 				A3XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]));
 	}
 
-	if (dirty & (FD_DIRTY_VERTTEX | FD_DIRTY_FRAGTEX))
+	if (dirty & FD_DIRTY_TEX)
 		fd_wfi(ctx->batch, ring);
 
-	if (dirty & FD_DIRTY_VERTTEX) {
-		if (vp->has_samp)
-			emit_textures(ctx, ring, SB_VERT_TEX, &ctx->verttex);
-		else
-			dirty &= ~FD_DIRTY_VERTTEX;
-	}
+	if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_TEX)
+		emit_textures(ctx, ring, SB_VERT_TEX, &ctx->tex[PIPE_SHADER_VERTEX]);
 
-	if (dirty & FD_DIRTY_FRAGTEX) {
-		if (fp->has_samp)
-			emit_textures(ctx, ring, SB_FRAG_TEX, &ctx->fragtex);
-		else
-			dirty &= ~FD_DIRTY_FRAGTEX;
-	}
-
-	ctx->dirty &= ~dirty;
+	if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_TEX)
+		emit_textures(ctx, ring, SB_FRAG_TEX, &ctx->tex[PIPE_SHADER_FRAGMENT]);
 }
 
 /* emit setup at begin of new cmdstream buffer (don't rely on previous
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
index 6e7dee2..5e574da 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -48,7 +48,7 @@
 	const struct fd_program_stateobj *prog;
 	const struct pipe_draw_info *info;
 	struct ir3_shader_key key;
-	uint32_t dirty;
+	enum fd_dirty_3d_state dirty;
 
 	uint32_t sprite_coord_enable;
 	bool sprite_coord_mode;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index 0ec769b..151ecfb 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -152,6 +152,9 @@
 	if ((gmem->maxpw * gmem->maxph) > 32)
 		return false;
 
+	if ((gmem->maxpw > 15) || (gmem->maxph > 15))
+		return false;
+
 	return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2);
 }
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_query.c b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
index cce165c..cde42c3 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_query.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
@@ -133,6 +133,13 @@
 
 void fd3_query_context_init(struct pipe_context *pctx)
 {
+	struct fd_context *ctx = fd_context(pctx);
+
+	ctx->create_query = fd_hw_create_query;
+	ctx->query_prepare = fd_hw_query_prepare;
+	ctx->query_prepare_tile = fd_hw_query_prepare_tile;
+	ctx->query_set_stage = fd_hw_query_set_stage;
+
 	fd_hw_query_register_provider(pctx, &occlusion_counter);
 	fd_hw_query_register_provider(pctx, &occlusion_predicate);
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index c0915e1..8f4b0da 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -8,17 +8,17 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2016-04-26 17:56:44)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2016-02-10 17:07:21)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32907 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  12025 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13324 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  31866 bytes, from 2017-06-02 15:50:23)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 111898 bytes, from 2017-05-30 19:25:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 142603 bytes, from 2017-06-06 17:02:32)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
 
-Copyright (C) 2013-2016 by the following authors:
+Copyright (C) 2013-2017 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 - Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
 
@@ -3843,6 +3843,44 @@
 
 #define REG_A4XX_VBIF_ROUND_ROBIN_QOS_ARB			0x00003049
 
+#define REG_A4XX_VBIF_PERF_CNT_EN0				0x000030c0
+
+#define REG_A4XX_VBIF_PERF_CNT_EN1				0x000030c1
+
+#define REG_A4XX_VBIF_PERF_CNT_EN2				0x000030c2
+
+#define REG_A4XX_VBIF_PERF_CNT_EN3				0x000030c3
+
+#define REG_A4XX_VBIF_PERF_CNT_SEL0				0x000030d0
+
+#define REG_A4XX_VBIF_PERF_CNT_SEL1				0x000030d1
+
+#define REG_A4XX_VBIF_PERF_CNT_SEL2				0x000030d2
+
+#define REG_A4XX_VBIF_PERF_CNT_SEL3				0x000030d3
+
+#define REG_A4XX_VBIF_PERF_CNT_LOW0				0x000030d8
+
+#define REG_A4XX_VBIF_PERF_CNT_LOW1				0x000030d9
+
+#define REG_A4XX_VBIF_PERF_CNT_LOW2				0x000030da
+
+#define REG_A4XX_VBIF_PERF_CNT_LOW3				0x000030db
+
+#define REG_A4XX_VBIF_PERF_CNT_HIGH0				0x000030e0
+
+#define REG_A4XX_VBIF_PERF_CNT_HIGH1				0x000030e1
+
+#define REG_A4XX_VBIF_PERF_CNT_HIGH2				0x000030e2
+
+#define REG_A4XX_VBIF_PERF_CNT_HIGH3				0x000030e3
+
+#define REG_A4XX_VBIF_PERF_PWR_CNT_EN0				0x00003100
+
+#define REG_A4XX_VBIF_PERF_PWR_CNT_EN1				0x00003101
+
+#define REG_A4XX_VBIF_PERF_PWR_CNT_EN2				0x00003102
+
 #define REG_A4XX_UNKNOWN_0CC5					0x00000cc5
 
 #define REG_A4XX_UNKNOWN_0CC6					0x00000cc6
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
index 291df2d..db292af 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
@@ -26,6 +26,7 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
+#include "freedreno_query_hw.h"
 
 #include "fd4_context.h"
 #include "fd4_blend.h"
@@ -51,6 +52,8 @@
 
 	u_upload_destroy(fd4_ctx->border_color_uploader);
 
+	fd_hw_query_fini(pctx);
+
 	fd_context_destroy(pctx);
 }
 
@@ -95,6 +98,8 @@
 	if (!pctx)
 		return NULL;
 
+	fd_hw_query_init(pctx);
+
 	fd4_ctx->vs_pvt_mem = fd_bo_new(screen->dev, 0x2000,
 			DRM_FREEDRENO_GEM_TYPE_KMEM);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index 641c58a..840e917 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -44,7 +44,7 @@
 
 static void
 draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		struct fd4_emit *emit)
+		struct fd4_emit *emit, unsigned index_offset)
 {
 	const struct pipe_draw_info *info = emit->info;
 	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
@@ -55,7 +55,7 @@
 		fd4_emit_vertex_bufs(ring, emit);
 
 	OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2);
-	OUT_RING(ring, info->indexed ? info->index_bias : info->start); /* VFD_INDEX_OFFSET */
+	OUT_RING(ring, info->index_size ? info->index_bias : info->start); /* VFD_INDEX_OFFSET */
 	OUT_RING(ring, info->start_instance);   /* ??? UNKNOWN_2209 */
 
 	OUT_PKT0(ring, REG_A4XX_PC_RESTART_INDEX, 1);
@@ -70,7 +70,7 @@
 
 	fd4_draw_emit(ctx->batch, ring, primtype,
 			emit->key.binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY,
-			info);
+			info, index_offset);
 }
 
 /* fixup dirty shader state in case some "unrelated" (from the state-
@@ -84,44 +84,23 @@
 	struct ir3_shader_key *last_key = &fd4_ctx->last_key;
 
 	if (!ir3_shader_key_equal(last_key, key)) {
-		if (last_key->has_per_samp || key->has_per_samp) {
-			if ((last_key->vsaturate_s != key->vsaturate_s) ||
-					(last_key->vsaturate_t != key->vsaturate_t) ||
-					(last_key->vsaturate_r != key->vsaturate_r) ||
-					(last_key->vastc_srgb != key->vastc_srgb))
-				ctx->dirty |= FD_SHADER_DIRTY_VP;
-
-			if ((last_key->fsaturate_s != key->fsaturate_s) ||
-					(last_key->fsaturate_t != key->fsaturate_t) ||
-					(last_key->fsaturate_r != key->fsaturate_r) ||
-					(last_key->fastc_srgb != key->fastc_srgb))
-				ctx->dirty |= FD_SHADER_DIRTY_FP;
+		if (ir3_shader_key_changes_fs(last_key, key)) {
+			ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG;
+			ctx->dirty |= FD_DIRTY_PROG;
 		}
 
-		if (last_key->vclamp_color != key->vclamp_color)
-			ctx->dirty |= FD_SHADER_DIRTY_VP;
-
-		if (last_key->fclamp_color != key->fclamp_color)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->color_two_side != key->color_two_side)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->half_precision != key->half_precision)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->rasterflat != key->rasterflat)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->ucp_enables != key->ucp_enables)
-			ctx->dirty |= FD_SHADER_DIRTY_FP | FD_SHADER_DIRTY_VP;
+		if (ir3_shader_key_changes_vs(last_key, key)) {
+			ctx->dirty_shader[PIPE_SHADER_VERTEX] |= FD_DIRTY_SHADER_PROG;
+			ctx->dirty |= FD_DIRTY_PROG;
+		}
 
 		fd4_ctx->last_key = *key;
 	}
 }
 
 static bool
-fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
+fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
+             unsigned index_offset)
 {
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
 	struct fd4_emit emit = {
@@ -155,7 +134,7 @@
 
 	fixup_shader_state(ctx, &emit.key);
 
-	unsigned dirty = ctx->dirty;
+	enum fd_dirty_3d_state dirty = ctx->dirty;
 
 	/* do regular pass first, since that is more likely to fail compiling: */
 
@@ -175,7 +154,7 @@
 		OUT_RING(ring, A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE);
 	}
 
-	draw_impl(ctx, ctx->batch->draw, &emit);
+	draw_impl(ctx, ctx->batch->draw, &emit, index_offset);
 
 	if (ctx->rasterizer->rasterizer_discard) {
 		fd_wfi(ctx->batch, ring);
@@ -190,7 +169,9 @@
 	emit.dirty = dirty & ~(FD_DIRTY_BLEND);
 	emit.vp = NULL;   /* we changed key so need to refetch vp */
 	emit.fp = NULL;
-	draw_impl(ctx, ctx->batch->binning, &emit);
+	draw_impl(ctx, ctx->batch->binning, &emit, index_offset);
+
+	fd_context_all_clean(ctx);
 
 	return true;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
index 634b64b..842a952 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
@@ -104,22 +104,21 @@
 fd4_draw_emit(struct fd_batch *batch, struct fd_ringbuffer *ring,
 		enum pc_di_primtype primtype,
 		enum pc_di_vis_cull_mode vismode,
-		const struct pipe_draw_info *info)
+		const struct pipe_draw_info *info,
+		unsigned index_offset)
 {
 	struct pipe_resource *idx_buffer = NULL;
 	enum a4xx_index_size idx_type;
 	enum pc_di_src_sel src_sel;
 	uint32_t idx_size, idx_offset;
 
-	if (info->indexed) {
-		struct pipe_index_buffer *idx = &batch->ctx->indexbuf;
+	if (info->index_size) {
+		assert(!info->has_user_indices);
 
-		assert(!idx->user_buffer);
-
-		idx_buffer = idx->buffer;
-		idx_type = fd4_size2indextype(idx->index_size);
-		idx_size = idx->index_size * info->count;
-		idx_offset = idx->offset + (info->start * idx->index_size);
+		idx_buffer = info->index.resource;
+		idx_type = fd4_size2indextype(info->index_size);
+		idx_size = info->index_size * info->count;
+		idx_offset = index_offset + info->start * info->index_size;
 		src_sel = DI_SRC_SEL_DMA;
 	} else {
 		idx_buffer = NULL;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 2f3e0a6..0f7c647 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -45,11 +45,6 @@
 #include "fd4_format.h"
 #include "fd4_zsa.h"
 
-static const enum adreno_state_block sb[] = {
-	[SHADER_VERTEX]   = SB_VERT_SHADER,
-	[SHADER_FRAGMENT] = SB_FRAG_SHADER,
-};
-
 /* regid:          base const register
  * prsc or dwords: buffer containing constant values
  * sizedwords:     size of const value buffer
@@ -60,31 +55,31 @@
 		const uint32_t *dwords, struct pipe_resource *prsc)
 {
 	uint32_t i, sz;
-	enum adreno_state_src src;
+	enum a4xx_state_src src;
 
 	debug_assert((regid % 4) == 0);
 	debug_assert((sizedwords % 4) == 0);
 
 	if (prsc) {
 		sz = 0;
-		src = 0x2;  // TODO ??
+		src = SS4_INDIRECT;
 	} else {
 		sz = sizedwords;
-		src = SS_DIRECT;
+		src = SS4_DIRECT;
 	}
 
-	OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
-	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
-			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
-			CP_LOAD_STATE_0_NUM_UNIT(sizedwords/4));
+	OUT_PKT3(ring, CP_LOAD_STATE4, 2 + sz);
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid/4) |
+			CP_LOAD_STATE4_0_STATE_SRC(src) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(type)) |
+			CP_LOAD_STATE4_0_NUM_UNIT(sizedwords/4));
 	if (prsc) {
 		struct fd_bo *bo = fd_resource(prsc)->bo;
 		OUT_RELOC(ring, bo, offset,
-				CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS), 0);
+				CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS), 0);
 	} else {
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
+		OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) |
+				CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS));
 		dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
 	}
 	for (i = 0; i < sz; i++) {
@@ -101,13 +96,13 @@
 
 	debug_assert((regid % 4) == 0);
 
-	OUT_PKT3(ring, CP_LOAD_STATE, 2 + anum);
-	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
-			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
-			CP_LOAD_STATE_0_NUM_UNIT(anum/4));
-	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
+	OUT_PKT3(ring, CP_LOAD_STATE4, 2 + anum);
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid/4) |
+			CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(type)) |
+			CP_LOAD_STATE4_0_NUM_UNIT(anum/4));
+	OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) |
+			CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS));
 
 	for (i = 0; i < num; i++) {
 		if (prscs[i]) {
@@ -127,12 +122,12 @@
 
 static void
 emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		enum adreno_state_block sb, struct fd_texture_stateobj *tex,
+		enum a4xx_state_block sb, struct fd_texture_stateobj *tex,
 		const struct ir3_shader_variant *v)
 {
 	static const uint32_t bcolor_reg[] = {
-			[SB_VERT_TEX] = REG_A4XX_TPL1_TP_VS_BORDER_COLOR_BASE_ADDR,
-			[SB_FRAG_TEX] = REG_A4XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR,
+			[SB4_VS_TEX] = REG_A4XX_TPL1_TP_VS_BORDER_COLOR_BASE_ADDR,
+			[SB4_FS_TEX] = REG_A4XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR,
 	};
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
 	bool needs_border = false;
@@ -148,13 +143,13 @@
 		num_samplers = align(tex->num_samplers, 2);
 
 		/* output sampler state: */
-		OUT_PKT3(ring, CP_LOAD_STATE, 2 + (2 * num_samplers));
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
-				CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				CP_LOAD_STATE_0_NUM_UNIT(num_samplers));
-		OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) |
-				CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
+		OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (2 * num_samplers));
+		OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+				CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+				CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+				CP_LOAD_STATE4_0_NUM_UNIT(num_samplers));
+		OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) |
+				CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
 		for (i = 0; i < tex->num_samplers; i++) {
 			static const struct fd4_sampler_stateobj dummy_sampler = {};
 			const struct fd4_sampler_stateobj *sampler = tex->samplers[i] ?
@@ -176,13 +171,13 @@
 		unsigned num_textures = tex->num_textures + v->astc_srgb.count;
 
 		/* emit texture state: */
-		OUT_PKT3(ring, CP_LOAD_STATE, 2 + (8 * num_textures));
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
-				CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				CP_LOAD_STATE_0_NUM_UNIT(num_textures));
-		OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) |
-				CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
+		OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (8 * num_textures));
+		OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+				CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+				CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+				CP_LOAD_STATE4_0_NUM_UNIT(num_textures));
+		OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) |
+				CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
 		for (i = 0; i < tex->num_textures; i++) {
 			static const struct fd4_pipe_sampler_view dummy_view = {};
 			const struct fd4_pipe_sampler_view *view = tex->textures[i] ?
@@ -267,13 +262,13 @@
 	}
 
 	/* output sampler state: */
-	OUT_PKT3(ring, CP_LOAD_STATE, 2 + (2 * nr_bufs));
-	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
-			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-			CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) |
-			CP_LOAD_STATE_0_NUM_UNIT(nr_bufs));
-	OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) |
-			CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
+	OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (2 * nr_bufs));
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+			CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(SB4_FS_TEX) |
+			CP_LOAD_STATE4_0_NUM_UNIT(nr_bufs));
+	OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) |
+			CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
 	for (i = 0; i < nr_bufs; i++) {
 		OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) |
 				A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) |
@@ -284,13 +279,13 @@
 	}
 
 	/* emit texture state: */
-	OUT_PKT3(ring, CP_LOAD_STATE, 2 + (8 * nr_bufs));
-	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
-			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-			CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) |
-			CP_LOAD_STATE_0_NUM_UNIT(nr_bufs));
-	OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) |
-			CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
+	OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (8 * nr_bufs));
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+			CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(SB4_FS_TEX) |
+			CP_LOAD_STATE4_0_NUM_UNIT(nr_bufs));
+	OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) |
+			CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
 	for (i = 0; i < nr_bufs; i++) {
 		if (bufs[i]) {
 			struct fd_resource *rsc = fd_resource(bufs[i]->texture);
@@ -408,7 +403,7 @@
 			struct pipe_vertex_element *elem = &vtx->vtx->pipe[i];
 			const struct pipe_vertex_buffer *vb =
 					&vtx->vertexbuf.vb[elem->vertex_buffer_index];
-			struct fd_resource *rsc = fd_resource(vb->buffer);
+			struct fd_resource *rsc = fd_resource(vb->buffer.resource);
 			enum pipe_format pfmt = elem->src_format;
 			enum a4xx_vtx_fmt fmt = fd4_pipe2vtx(pfmt);
 			bool switchnext = (i != last) ||
@@ -504,7 +499,7 @@
 {
 	const struct ir3_shader_variant *vp = fd4_emit_get_vp(emit);
 	const struct ir3_shader_variant *fp = fd4_emit_get_fp(emit);
-	uint32_t dirty = emit->dirty;
+	const enum fd_dirty_3d_state dirty = emit->dirty;
 
 	emit_marker(ring, 5);
 
@@ -605,7 +600,7 @@
 			fd4_rasterizer_stateobj(ctx->rasterizer);
 		uint32_t val = rast->pc_prim_vtx_cntl;
 
-		if (info->indexed && info->primitive_restart)
+		if (info->index_size && info->primitive_restart)
 			val |= A4XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART;
 
 		val |= COND(vp->writes_psize, A4XX_PC_PRIM_VTX_CNTL_PSIZE);
@@ -682,9 +677,9 @@
 	}
 
 	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
-		ir3_emit_consts(vp, ring, ctx, emit->info, dirty);
+		ir3_emit_vs_consts(vp, ring, ctx, emit->info);
 		if (!emit->key.binning_pass)
-			ir3_emit_consts(fp, ring, ctx, emit->info, dirty);
+			ir3_emit_fs_consts(fp, ring, ctx);
 	}
 
 	if ((dirty & FD_DIRTY_BLEND)) {
@@ -745,21 +740,11 @@
 		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
 	}
 
-	if (dirty & FD_DIRTY_VERTTEX) {
-		if (vp->has_samp)
-			emit_textures(ctx, ring, SB_VERT_TEX, &ctx->verttex, vp);
-		else
-			dirty &= ~FD_DIRTY_VERTTEX;
-	}
+	if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_TEX)
+		emit_textures(ctx, ring, SB4_VS_TEX, &ctx->tex[PIPE_SHADER_VERTEX], vp);
 
-	if (dirty & FD_DIRTY_FRAGTEX) {
-		if (fp->has_samp)
-			emit_textures(ctx, ring, SB_FRAG_TEX, &ctx->fragtex, fp);
-		else
-			dirty &= ~FD_DIRTY_FRAGTEX;
-	}
-
-	ctx->dirty &= ~dirty;
+	if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_TEX)
+		emit_textures(ctx, ring, SB4_FS_TEX, &ctx->tex[PIPE_SHADER_FRAGMENT], fp);
 }
 
 /* emit setup at begin of new cmdstream buffer (don't rely on previous
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index 00f92fa..a724cae 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -48,7 +48,7 @@
 	const struct fd_program_stateobj *prog;
 	const struct pipe_draw_info *info;
 	struct ir3_shader_key key;
-	uint32_t dirty;
+	enum fd_dirty_3d_state dirty;
 
 	uint32_t sprite_coord_enable;  /* bitmask */
 	bool sprite_coord_mode;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 5b7dc03..49476d8 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -135,10 +135,11 @@
 use_hw_binning(struct fd_batch *batch)
 {
 	struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
-	struct pipe_framebuffer_state *pfb = &batch->framebuffer;
 
-	/* this seems to be a hw bug.. but this hack fixes piglit fbo-maxsize: */
-	if ((pfb->width > 4096) && (pfb->height > 4096))
+	if ((gmem->maxpw * gmem->maxph) > 32)
+		return false;
+
+	if ((gmem->maxpw > 15) || (gmem->maxph > 15))
 		return false;
 
 	return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index 3e75125..05b0c4f 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -89,37 +89,31 @@
 emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
 {
 	const struct ir3_info *si = &so->info;
-	enum adreno_state_block sb;
+	enum a4xx_state_block sb = fd4_stage2shadersb(so->type);
 	enum adreno_state_src src;
 	uint32_t i, sz, *bin;
 
-	if (so->type == SHADER_VERTEX) {
-		sb = SB_VERT_SHADER;
-	} else {
-		sb = SB_FRAG_SHADER;
-	}
-
 	if (fd_mesa_debug & FD_DBG_DIRECT) {
 		sz = si->sizedwords;
-		src = SS_DIRECT;
+		src = SS4_DIRECT;
 		bin = fd_bo_map(so->bo);
 	} else {
 		sz = 0;
-		src = 2;  // enums different on a4xx..
+		src = SS4_INDIRECT;
 		bin = NULL;
 	}
 
-	OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
-	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
-			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-			CP_LOAD_STATE_0_NUM_UNIT(so->instrlen));
+	OUT_PKT3(ring, CP_LOAD_STATE4, 2 + sz);
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+			CP_LOAD_STATE4_0_STATE_SRC(src) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE4_0_NUM_UNIT(so->instrlen));
 	if (bin) {
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER));
+		OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) |
+				CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER));
 	} else {
 		OUT_RELOC(ring, so->bo, 0,
-				CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER), 0);
+				CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0);
 	}
 
 	/* for how clever coverity is, it is sometimes rather dull, and
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
index 3ae3971..f7b385d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
@@ -275,6 +275,13 @@
 
 void fd4_query_context_init(struct pipe_context *pctx)
 {
+	struct fd_context *ctx = fd_context(pctx);
+
+	ctx->create_query = fd_hw_create_query;
+	ctx->query_prepare = fd_hw_query_prepare;
+	ctx->query_prepare_tile = fd_hw_query_prepare_tile;
+	ctx->query_set_stage = fd_hw_query_set_stage;
+
 	fd_hw_query_register_provider(pctx, &occlusion_counter);
 	fd_hw_query_register_provider(pctx, &occlusion_predicate);
 	fd_hw_query_register_provider(pctx, &time_elapsed);
diff --git a/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h b/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
index 031cbdf..ae946d8 100644
--- a/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
+++ b/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
@@ -8,15 +8,10 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2016-04-26 17:56:44)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2016-02-10 17:07:21)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32907 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  12025 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
+- /home/ilia/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 141938 bytes, from 2017-07-08 01:02:47)
+- /home/ilia/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2016-02-11 01:04:14)
+- /home/ilia/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13324 bytes, from 2017-07-04 02:59:47)
+- /home/ilia/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  31866 bytes, from 2017-07-04 02:59:47)
 
 Copyright (C) 2013-2017 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -45,7 +40,9 @@
 
 
 enum a5xx_color_fmt {
+	RB5_A8_UNORM = 2,
 	RB5_R8_UNORM = 3,
+	RB5_R8_SNORM = 4,
 	RB5_R8_UINT = 5,
 	RB5_R8_SINT = 6,
 	RB5_R4G4B4A4_UNORM = 8,
@@ -62,6 +59,7 @@
 	RB5_R16_SINT = 25,
 	RB5_R8G8B8A8_UNORM = 48,
 	RB5_R8G8B8_UNORM = 49,
+	RB5_R8G8B8A8_SNORM = 50,
 	RB5_R8G8B8A8_UINT = 51,
 	RB5_R8G8B8A8_SINT = 52,
 	RB5_R10G10B10A2_UNORM = 55,
@@ -75,6 +73,8 @@
 	RB5_R32_FLOAT = 74,
 	RB5_R32_UINT = 75,
 	RB5_R32_SINT = 76,
+	RB5_R16G16B16A16_UNORM = 96,
+	RB5_R16G16B16A16_SNORM = 97,
 	RB5_R16G16B16A16_FLOAT = 98,
 	RB5_R16G16B16A16_UINT = 99,
 	RB5_R16G16B16A16_SINT = 100,
@@ -114,6 +114,11 @@
 	VFMT5_8_8_8_8_SNORM = 50,
 	VFMT5_8_8_8_8_UINT = 51,
 	VFMT5_8_8_8_8_SINT = 52,
+	VFMT5_10_10_10_2_UNORM = 54,
+	VFMT5_10_10_10_2_SNORM = 57,
+	VFMT5_10_10_10_2_UINT = 58,
+	VFMT5_10_10_10_2_SINT = 59,
+	VFMT5_11_11_10_FLOAT = 66,
 	VFMT5_16_16_UNORM = 67,
 	VFMT5_16_16_SNORM = 68,
 	VFMT5_16_16_FLOAT = 69,
@@ -158,6 +163,7 @@
 enum a5xx_tex_fmt {
 	TFMT5_A8_UNORM = 2,
 	TFMT5_8_UNORM = 3,
+	TFMT5_8_SNORM = 4,
 	TFMT5_8_UINT = 5,
 	TFMT5_8_SINT = 6,
 	TFMT5_4_4_4_4_UNORM = 8,
@@ -175,7 +181,7 @@
 	TFMT5_16_SINT = 25,
 	TFMT5_8_8_8_8_UNORM = 48,
 	TFMT5_8_8_8_UNORM = 49,
-	TFMT5_8_8_8_SNORM = 50,
+	TFMT5_8_8_8_8_SNORM = 50,
 	TFMT5_8_8_8_8_UINT = 51,
 	TFMT5_8_8_8_8_SINT = 52,
 	TFMT5_9_9_9_E5_FLOAT = 53,
@@ -190,16 +196,53 @@
 	TFMT5_32_FLOAT = 74,
 	TFMT5_32_UINT = 75,
 	TFMT5_32_SINT = 76,
+	TFMT5_16_16_16_16_UNORM = 96,
+	TFMT5_16_16_16_16_SNORM = 97,
 	TFMT5_16_16_16_16_FLOAT = 98,
 	TFMT5_16_16_16_16_UINT = 99,
 	TFMT5_16_16_16_16_SINT = 100,
 	TFMT5_32_32_FLOAT = 103,
 	TFMT5_32_32_UINT = 104,
 	TFMT5_32_32_SINT = 105,
+	TFMT5_32_32_32_UINT = 114,
+	TFMT5_32_32_32_SINT = 115,
+	TFMT5_32_32_32_FLOAT = 116,
 	TFMT5_32_32_32_32_FLOAT = 130,
 	TFMT5_32_32_32_32_UINT = 131,
 	TFMT5_32_32_32_32_SINT = 132,
 	TFMT5_X8Z24_UNORM = 160,
+	TFMT5_ETC2_RG11_UNORM = 171,
+	TFMT5_ETC2_RG11_SNORM = 172,
+	TFMT5_ETC2_R11_UNORM = 173,
+	TFMT5_ETC2_R11_SNORM = 174,
+	TFMT5_ETC1 = 175,
+	TFMT5_ETC2_RGB8 = 176,
+	TFMT5_ETC2_RGBA8 = 177,
+	TFMT5_ETC2_RGB8A1 = 178,
+	TFMT5_DXT1 = 179,
+	TFMT5_DXT3 = 180,
+	TFMT5_DXT5 = 181,
+	TFMT5_RGTC1_UNORM = 183,
+	TFMT5_RGTC1_SNORM = 184,
+	TFMT5_RGTC2_UNORM = 187,
+	TFMT5_RGTC2_SNORM = 188,
+	TFMT5_BPTC_UFLOAT = 190,
+	TFMT5_BPTC_FLOAT = 191,
+	TFMT5_BPTC = 192,
+	TFMT5_ASTC_4x4 = 193,
+	TFMT5_ASTC_5x4 = 194,
+	TFMT5_ASTC_5x5 = 195,
+	TFMT5_ASTC_6x5 = 196,
+	TFMT5_ASTC_6x6 = 197,
+	TFMT5_ASTC_8x5 = 198,
+	TFMT5_ASTC_8x6 = 199,
+	TFMT5_ASTC_8x8 = 200,
+	TFMT5_ASTC_10x5 = 201,
+	TFMT5_ASTC_10x6 = 202,
+	TFMT5_ASTC_10x8 = 203,
+	TFMT5_ASTC_10x10 = 204,
+	TFMT5_ASTC_12x10 = 205,
+	TFMT5_ASTC_12x12 = 206,
 };
 
 enum a5xx_tex_fetchsize {
@@ -230,6 +273,565 @@
 	BLIT_Z32 = 9,
 };
 
+enum a5xx_cp_perfcounter_select {	/* 31 selector values numbered contiguously 0..30; presumably programmed into the CP perfcounter select registers - TODO confirm */
+	PERF_CP_ALWAYS_COUNT = 0,
+	PERF_CP_BUSY_GFX_CORE_IDLE = 1,
+	PERF_CP_BUSY_CYCLES = 2,
+	PERF_CP_PFP_IDLE = 3,
+	PERF_CP_PFP_BUSY_WORKING = 4,
+	PERF_CP_PFP_STALL_CYCLES_ANY = 5,
+	PERF_CP_PFP_STARVE_CYCLES_ANY = 6,
+	PERF_CP_PFP_ICACHE_MISS = 7,
+	PERF_CP_PFP_ICACHE_HIT = 8,
+	PERF_CP_PFP_MATCH_PM4_PKT_PROFILE = 9,
+	PERF_CP_ME_BUSY_WORKING = 10,
+	PERF_CP_ME_IDLE = 11,
+	PERF_CP_ME_STARVE_CYCLES_ANY = 12,
+	PERF_CP_ME_FIFO_EMPTY_PFP_IDLE = 13,
+	PERF_CP_ME_FIFO_EMPTY_PFP_BUSY = 14,
+	PERF_CP_ME_FIFO_FULL_ME_BUSY = 15,
+	PERF_CP_ME_FIFO_FULL_ME_NON_WORKING = 16,
+	PERF_CP_ME_STALL_CYCLES_ANY = 17,
+	PERF_CP_ME_ICACHE_MISS = 18,
+	PERF_CP_ME_ICACHE_HIT = 19,
+	PERF_CP_NUM_PREEMPTIONS = 20,
+	PERF_CP_PREEMPTION_REACTION_DELAY = 21,
+	PERF_CP_PREEMPTION_SWITCH_OUT_TIME = 22,
+	PERF_CP_PREEMPTION_SWITCH_IN_TIME = 23,
+	PERF_CP_DEAD_DRAWS_IN_BIN_RENDER = 24,
+	PERF_CP_PREDICATED_DRAWS_KILLED = 25,
+	PERF_CP_MODE_SWITCH = 26,
+	PERF_CP_ZPASS_DONE = 27,
+	PERF_CP_CONTEXT_DONE = 28,
+	PERF_CP_CACHE_FLUSH = 29,
+	PERF_CP_LONG_PREEMPTIONS = 30,
+};
+
+enum a5xx_rbbm_perfcounter_select {	/* 14 selector values numbered contiguously 0..13; presumably programmed into the RBBM perfcounter select registers - TODO confirm */
+	PERF_RBBM_ALWAYS_COUNT = 0,
+	PERF_RBBM_ALWAYS_ON = 1,
+	PERF_RBBM_TSE_BUSY = 2,
+	PERF_RBBM_RAS_BUSY = 3,
+	PERF_RBBM_PC_DCALL_BUSY = 4,
+	PERF_RBBM_PC_VSD_BUSY = 5,
+	PERF_RBBM_STATUS_MASKED = 6,
+	PERF_RBBM_COM_BUSY = 7,
+	PERF_RBBM_DCOM_BUSY = 8,
+	PERF_RBBM_VBIF_BUSY = 9,
+	PERF_RBBM_VSC_BUSY = 10,
+	PERF_RBBM_TESS_BUSY = 11,
+	PERF_RBBM_UCHE_BUSY = 12,
+	PERF_RBBM_HLSQ_BUSY = 13,
+};
+
+enum a5xx_pc_perfcounter_select {	/* 37 selector values numbered contiguously 0..36; presumably programmed into the PC perfcounter select registers - TODO confirm */
+	PERF_PC_BUSY_CYCLES = 0,
+	PERF_PC_WORKING_CYCLES = 1,
+	PERF_PC_STALL_CYCLES_VFD = 2,
+	PERF_PC_STALL_CYCLES_TSE = 3,
+	PERF_PC_STALL_CYCLES_VPC = 4,
+	PERF_PC_STALL_CYCLES_UCHE = 5,
+	PERF_PC_STALL_CYCLES_TESS = 6,
+	PERF_PC_STALL_CYCLES_TSE_ONLY = 7,
+	PERF_PC_STALL_CYCLES_VPC_ONLY = 8,
+	PERF_PC_PASS1_TF_STALL_CYCLES = 9,
+	PERF_PC_STARVE_CYCLES_FOR_INDEX = 10,
+	PERF_PC_STARVE_CYCLES_FOR_TESS_FACTOR = 11,
+	PERF_PC_STARVE_CYCLES_FOR_VIZ_STREAM = 12,
+	PERF_PC_STARVE_CYCLES_FOR_POSITION = 13,
+	PERF_PC_STARVE_CYCLES_DI = 14,
+	PERF_PC_VIS_STREAMS_LOADED = 15,
+	PERF_PC_INSTANCES = 16,
+	PERF_PC_VPC_PRIMITIVES = 17,
+	PERF_PC_DEAD_PRIM = 18,
+	PERF_PC_LIVE_PRIM = 19,
+	PERF_PC_VERTEX_HITS = 20,
+	PERF_PC_IA_VERTICES = 21,
+	PERF_PC_IA_PRIMITIVES = 22,
+	PERF_PC_GS_PRIMITIVES = 23,
+	PERF_PC_HS_INVOCATIONS = 24,
+	PERF_PC_DS_INVOCATIONS = 25,
+	PERF_PC_VS_INVOCATIONS = 26,
+	PERF_PC_GS_INVOCATIONS = 27,
+	PERF_PC_DS_PRIMITIVES = 28,
+	PERF_PC_VPC_POS_DATA_TRANSACTION = 29,
+	PERF_PC_3D_DRAWCALLS = 30,
+	PERF_PC_2D_DRAWCALLS = 31,
+	PERF_PC_NON_DRAWCALL_GLOBAL_EVENTS = 32,
+	PERF_TESS_BUSY_CYCLES = 33,	/* entries 33..36 carry a PERF_TESS_ prefix but belong to this PC enum's numbering */
+	PERF_TESS_WORKING_CYCLES = 34,
+	PERF_TESS_STALL_CYCLES_PC = 35,
+	PERF_TESS_STARVE_CYCLES_PC = 36,
+};
+
+enum a5xx_vfd_perfcounter_select {	/* 31 selector values numbered contiguously 0..30; presumably programmed into the VFD perfcounter select registers - TODO confirm */
+	PERF_VFD_BUSY_CYCLES = 0,
+	PERF_VFD_STALL_CYCLES_UCHE = 1,
+	PERF_VFD_STALL_CYCLES_VPC_ALLOC = 2,
+	PERF_VFD_STALL_CYCLES_MISS_VB = 3,
+	PERF_VFD_STALL_CYCLES_MISS_Q = 4,
+	PERF_VFD_STALL_CYCLES_SP_INFO = 5,
+	PERF_VFD_STALL_CYCLES_SP_ATTR = 6,
+	PERF_VFD_STALL_CYCLES_VFDP_VB = 7,
+	PERF_VFD_STALL_CYCLES_VFDP_Q = 8,
+	PERF_VFD_DECODER_PACKER_STALL = 9,
+	PERF_VFD_STARVE_CYCLES_UCHE = 10,
+	PERF_VFD_RBUFFER_FULL = 11,
+	PERF_VFD_ATTR_INFO_FIFO_FULL = 12,
+	PERF_VFD_DECODED_ATTRIBUTE_BYTES = 13,
+	PERF_VFD_NUM_ATTRIBUTES = 14,
+	PERF_VFD_INSTRUCTIONS = 15,
+	PERF_VFD_UPPER_SHADER_FIBERS = 16,
+	PERF_VFD_LOWER_SHADER_FIBERS = 17,
+	PERF_VFD_MODE_0_FIBERS = 18,
+	PERF_VFD_MODE_1_FIBERS = 19,
+	PERF_VFD_MODE_2_FIBERS = 20,
+	PERF_VFD_MODE_3_FIBERS = 21,
+	PERF_VFD_MODE_4_FIBERS = 22,
+	PERF_VFD_TOTAL_VERTICES = 23,
+	PERF_VFD_NUM_ATTR_MISS = 24,
+	PERF_VFD_1_BURST_REQ = 25,
+	PERF_VFDP_STALL_CYCLES_VFD = 26,	/* entries 26..30 carry a PERF_VFDP_ prefix but continue this VFD enum's numbering */
+	PERF_VFDP_STALL_CYCLES_VFD_INDEX = 27,
+	PERF_VFDP_STALL_CYCLES_VFD_PROG = 28,
+	PERF_VFDP_STARVE_CYCLES_PC = 29,
+	PERF_VFDP_VS_STAGE_32_WAVES = 30,
+};
+
+enum a5xx_hlsq_perfcounter_select {	/* 15 selector values numbered contiguously 0..14; presumably programmed into the HLSQ perfcounter select registers - TODO confirm */
+	PERF_HLSQ_BUSY_CYCLES = 0,
+	PERF_HLSQ_STALL_CYCLES_UCHE = 1,
+	PERF_HLSQ_STALL_CYCLES_SP_STATE = 2,
+	PERF_HLSQ_STALL_CYCLES_SP_FS_STAGE = 3,
+	PERF_HLSQ_UCHE_LATENCY_CYCLES = 4,
+	PERF_HLSQ_UCHE_LATENCY_COUNT = 5,
+	PERF_HLSQ_FS_STAGE_32_WAVES = 6,
+	PERF_HLSQ_FS_STAGE_64_WAVES = 7,
+	PERF_HLSQ_QUADS = 8,
+	PERF_HLSQ_SP_STATE_COPY_TRANS_FS_STAGE = 9,
+	PERF_HLSQ_SP_STATE_COPY_TRANS_VS_STAGE = 10,
+	PERF_HLSQ_TP_STATE_COPY_TRANS_FS_STAGE = 11,
+	PERF_HLSQ_TP_STATE_COPY_TRANS_VS_STAGE = 12,
+	PERF_HLSQ_CS_INVOCATIONS = 13,
+	PERF_HLSQ_COMPUTE_DRAWCALLS = 14,
+};
+
+enum a5xx_vpc_perfcounter_select {	/* 17 selector values numbered contiguously 0..16; presumably programmed into the VPC perfcounter select registers - TODO confirm */
+	PERF_VPC_BUSY_CYCLES = 0,
+	PERF_VPC_WORKING_CYCLES = 1,
+	PERF_VPC_STALL_CYCLES_UCHE = 2,
+	PERF_VPC_STALL_CYCLES_VFD_WACK = 3,
+	PERF_VPC_STALL_CYCLES_HLSQ_PRIM_ALLOC = 4,
+	PERF_VPC_STALL_CYCLES_PC = 5,
+	PERF_VPC_STALL_CYCLES_SP_LM = 6,
+	PERF_VPC_POS_EXPORT_STALL_CYCLES = 7,
+	PERF_VPC_STARVE_CYCLES_SP = 8,
+	PERF_VPC_STARVE_CYCLES_LRZ = 9,
+	PERF_VPC_PC_PRIMITIVES = 10,
+	PERF_VPC_SP_COMPONENTS = 11,
+	PERF_VPC_SP_LM_PRIMITIVES = 12,
+	PERF_VPC_SP_LM_COMPONENTS = 13,
+	PERF_VPC_SP_LM_DWORDS = 14,
+	PERF_VPC_STREAMOUT_COMPONENTS = 15,
+	PERF_VPC_GRANT_PHASES = 16,
+};
+
+enum a5xx_tse_perfcounter_select {	/* 19 selector values numbered contiguously 0..18; presumably programmed into the TSE perfcounter select registers - TODO confirm */
+	PERF_TSE_BUSY_CYCLES = 0,
+	PERF_TSE_CLIPPING_CYCLES = 1,
+	PERF_TSE_STALL_CYCLES_RAS = 2,
+	PERF_TSE_STALL_CYCLES_LRZ_BARYPLANE = 3,
+	PERF_TSE_STALL_CYCLES_LRZ_ZPLANE = 4,
+	PERF_TSE_STARVE_CYCLES_PC = 5,
+	PERF_TSE_INPUT_PRIM = 6,
+	PERF_TSE_INPUT_NULL_PRIM = 7,
+	PERF_TSE_TRIVAL_REJ_PRIM = 8,	/* sic - likely "TRIVIAL"; spelling kept because the identifier is the API */
+	PERF_TSE_CLIPPED_PRIM = 9,
+	PERF_TSE_ZERO_AREA_PRIM = 10,
+	PERF_TSE_FACENESS_CULLED_PRIM = 11,
+	PERF_TSE_ZERO_PIXEL_PRIM = 12,
+	PERF_TSE_OUTPUT_NULL_PRIM = 13,
+	PERF_TSE_OUTPUT_VISIBLE_PRIM = 14,
+	PERF_TSE_CINVOCATION = 15,
+	PERF_TSE_CPRIMITIVES = 16,
+	PERF_TSE_2D_INPUT_PRIM = 17,
+	PERF_TSE_2D_ALIVE_CLCLES = 18,	/* sic - likely "CYCLES"; spelling kept because the identifier is the API */
+};
+
+enum a5xx_ras_perfcounter_select {	/* 10 selector values numbered contiguously 0..9; presumably programmed into the RAS perfcounter select registers - TODO confirm */
+	PERF_RAS_BUSY_CYCLES = 0,
+	PERF_RAS_SUPERTILE_ACTIVE_CYCLES = 1,
+	PERF_RAS_STALL_CYCLES_LRZ = 2,
+	PERF_RAS_STARVE_CYCLES_TSE = 3,
+	PERF_RAS_SUPER_TILES = 4,
+	PERF_RAS_8X4_TILES = 5,
+	PERF_RAS_MASKGEN_ACTIVE = 6,
+	PERF_RAS_FULLY_COVERED_SUPER_TILES = 7,
+	PERF_RAS_FULLY_COVERED_8X4_TILES = 8,
+	PERF_RAS_PRIM_KILLED_INVISILBE = 9,	/* sic - likely "INVISIBLE"; spelling kept because the identifier is the API */
+};
+
+enum a5xx_lrz_perfcounter_select {	/* 19 selector values numbered contiguously 0..18; presumably programmed into the LRZ perfcounter select registers - TODO confirm */
+	PERF_LRZ_BUSY_CYCLES = 0,
+	PERF_LRZ_STARVE_CYCLES_RAS = 1,
+	PERF_LRZ_STALL_CYCLES_RB = 2,
+	PERF_LRZ_STALL_CYCLES_VSC = 3,
+	PERF_LRZ_STALL_CYCLES_VPC = 4,
+	PERF_LRZ_STALL_CYCLES_FLAG_PREFETCH = 5,
+	PERF_LRZ_STALL_CYCLES_UCHE = 6,
+	PERF_LRZ_LRZ_READ = 7,
+	PERF_LRZ_LRZ_WRITE = 8,
+	PERF_LRZ_READ_LATENCY = 9,
+	PERF_LRZ_MERGE_CACHE_UPDATING = 10,
+	PERF_LRZ_PRIM_KILLED_BY_MASKGEN = 11,
+	PERF_LRZ_PRIM_KILLED_BY_LRZ = 12,
+	PERF_LRZ_VISIBLE_PRIM_AFTER_LRZ = 13,
+	PERF_LRZ_FULL_8X8_TILES = 14,
+	PERF_LRZ_PARTIAL_8X8_TILES = 15,
+	PERF_LRZ_TILE_KILLED = 16,
+	PERF_LRZ_TOTAL_PIXEL = 17,
+	PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ = 18,
+};
+
+enum a5xx_uche_perfcounter_select {	/* 31 selector values numbered contiguously 0..30; presumably programmed into the UCHE perfcounter select registers - TODO confirm */
+	PERF_UCHE_BUSY_CYCLES = 0,
+	PERF_UCHE_STALL_CYCLES_VBIF = 1,
+	PERF_UCHE_VBIF_LATENCY_CYCLES = 2,
+	PERF_UCHE_VBIF_LATENCY_SAMPLES = 3,
+	PERF_UCHE_VBIF_READ_BEATS_TP = 4,
+	PERF_UCHE_VBIF_READ_BEATS_VFD = 5,
+	PERF_UCHE_VBIF_READ_BEATS_HLSQ = 6,
+	PERF_UCHE_VBIF_READ_BEATS_LRZ = 7,
+	PERF_UCHE_VBIF_READ_BEATS_SP = 8,
+	PERF_UCHE_READ_REQUESTS_TP = 9,
+	PERF_UCHE_READ_REQUESTS_VFD = 10,
+	PERF_UCHE_READ_REQUESTS_HLSQ = 11,
+	PERF_UCHE_READ_REQUESTS_LRZ = 12,
+	PERF_UCHE_READ_REQUESTS_SP = 13,
+	PERF_UCHE_WRITE_REQUESTS_LRZ = 14,
+	PERF_UCHE_WRITE_REQUESTS_SP = 15,
+	PERF_UCHE_WRITE_REQUESTS_VPC = 16,
+	PERF_UCHE_WRITE_REQUESTS_VSC = 17,
+	PERF_UCHE_EVICTS = 18,
+	PERF_UCHE_BANK_REQ0 = 19,
+	PERF_UCHE_BANK_REQ1 = 20,
+	PERF_UCHE_BANK_REQ2 = 21,
+	PERF_UCHE_BANK_REQ3 = 22,
+	PERF_UCHE_BANK_REQ4 = 23,
+	PERF_UCHE_BANK_REQ5 = 24,
+	PERF_UCHE_BANK_REQ6 = 25,
+	PERF_UCHE_BANK_REQ7 = 26,
+	PERF_UCHE_VBIF_READ_BEATS_CH0 = 27,
+	PERF_UCHE_VBIF_READ_BEATS_CH1 = 28,
+	PERF_UCHE_GMEM_READ_BEATS = 29,
+	PERF_UCHE_FLAG_COUNT = 30,
+};
+
+enum a5xx_tp_perfcounter_select {	/* 42 selector values numbered contiguously 0..41; presumably programmed into the TP perfcounter select registers - TODO confirm */
+	PERF_TP_BUSY_CYCLES = 0,
+	PERF_TP_STALL_CYCLES_UCHE = 1,
+	PERF_TP_LATENCY_CYCLES = 2,
+	PERF_TP_LATENCY_TRANS = 3,
+	PERF_TP_FLAG_CACHE_REQUEST_SAMPLES = 4,
+	PERF_TP_FLAG_CACHE_REQUEST_LATENCY = 5,
+	PERF_TP_L1_CACHELINE_REQUESTS = 6,
+	PERF_TP_L1_CACHELINE_MISSES = 7,
+	PERF_TP_SP_TP_TRANS = 8,
+	PERF_TP_TP_SP_TRANS = 9,
+	PERF_TP_OUTPUT_PIXELS = 10,
+	PERF_TP_FILTER_WORKLOAD_16BIT = 11,
+	PERF_TP_FILTER_WORKLOAD_32BIT = 12,
+	PERF_TP_QUADS_RECEIVED = 13,
+	PERF_TP_QUADS_OFFSET = 14,
+	PERF_TP_QUADS_SHADOW = 15,
+	PERF_TP_QUADS_ARRAY = 16,
+	PERF_TP_QUADS_GRADIENT = 17,
+	PERF_TP_QUADS_1D = 18,
+	PERF_TP_QUADS_2D = 19,
+	PERF_TP_QUADS_BUFFER = 20,
+	PERF_TP_QUADS_3D = 21,
+	PERF_TP_QUADS_CUBE = 22,
+	PERF_TP_STATE_CACHE_REQUESTS = 23,
+	PERF_TP_STATE_CACHE_MISSES = 24,
+	PERF_TP_DIVERGENT_QUADS_RECEIVED = 25,
+	PERF_TP_BINDLESS_STATE_CACHE_REQUESTS = 26,
+	PERF_TP_BINDLESS_STATE_CACHE_MISSES = 27,
+	PERF_TP_PRT_NON_RESIDENT_EVENTS = 28,
+	PERF_TP_OUTPUT_PIXELS_POINT = 29,
+	PERF_TP_OUTPUT_PIXELS_BILINEAR = 30,
+	PERF_TP_OUTPUT_PIXELS_MIP = 31,
+	PERF_TP_OUTPUT_PIXELS_ANISO = 32,
+	PERF_TP_OUTPUT_PIXELS_ZERO_LOD = 33,
+	PERF_TP_FLAG_CACHE_REQUESTS = 34,
+	PERF_TP_FLAG_CACHE_MISSES = 35,
+	PERF_TP_L1_5_L2_REQUESTS = 36,
+	PERF_TP_2D_OUTPUT_PIXELS = 37,
+	PERF_TP_2D_OUTPUT_PIXELS_POINT = 38,
+	PERF_TP_2D_OUTPUT_PIXELS_BILINEAR = 39,
+	PERF_TP_2D_FILTER_WORKLOAD_16BIT = 40,
+	PERF_TP_2D_FILTER_WORKLOAD_32BIT = 41,
+};
+
+enum a5xx_sp_perfcounter_select {	/* 65 selector values numbered contiguously 0..64; presumably programmed into the SP perfcounter select registers - TODO confirm */
+	PERF_SP_BUSY_CYCLES = 0,
+	PERF_SP_ALU_WORKING_CYCLES = 1,
+	PERF_SP_EFU_WORKING_CYCLES = 2,
+	PERF_SP_STALL_CYCLES_VPC = 3,
+	PERF_SP_STALL_CYCLES_TP = 4,
+	PERF_SP_STALL_CYCLES_UCHE = 5,
+	PERF_SP_STALL_CYCLES_RB = 6,
+	PERF_SP_SCHEDULER_NON_WORKING = 7,
+	PERF_SP_WAVE_CONTEXTS = 8,
+	PERF_SP_WAVE_CONTEXT_CYCLES = 9,
+	PERF_SP_FS_STAGE_WAVE_CYCLES = 10,
+	PERF_SP_FS_STAGE_WAVE_SAMPLES = 11,
+	PERF_SP_VS_STAGE_WAVE_CYCLES = 12,
+	PERF_SP_VS_STAGE_WAVE_SAMPLES = 13,
+	PERF_SP_FS_STAGE_DURATION_CYCLES = 14,
+	PERF_SP_VS_STAGE_DURATION_CYCLES = 15,
+	PERF_SP_WAVE_CTRL_CYCLES = 16,
+	PERF_SP_WAVE_LOAD_CYCLES = 17,
+	PERF_SP_WAVE_EMIT_CYCLES = 18,
+	PERF_SP_WAVE_NOP_CYCLES = 19,
+	PERF_SP_WAVE_WAIT_CYCLES = 20,
+	PERF_SP_WAVE_FETCH_CYCLES = 21,
+	PERF_SP_WAVE_IDLE_CYCLES = 22,
+	PERF_SP_WAVE_END_CYCLES = 23,
+	PERF_SP_WAVE_LONG_SYNC_CYCLES = 24,
+	PERF_SP_WAVE_SHORT_SYNC_CYCLES = 25,
+	PERF_SP_WAVE_JOIN_CYCLES = 26,
+	PERF_SP_LM_LOAD_INSTRUCTIONS = 27,
+	PERF_SP_LM_STORE_INSTRUCTIONS = 28,
+	PERF_SP_LM_ATOMICS = 29,
+	PERF_SP_GM_LOAD_INSTRUCTIONS = 30,
+	PERF_SP_GM_STORE_INSTRUCTIONS = 31,
+	PERF_SP_GM_ATOMICS = 32,
+	PERF_SP_VS_STAGE_TEX_INSTRUCTIONS = 33,
+	PERF_SP_VS_STAGE_CFLOW_INSTRUCTIONS = 34,
+	PERF_SP_VS_STAGE_EFU_INSTRUCTIONS = 35,
+	PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS = 36,
+	PERF_SP_VS_STAGE_HALF_ALU_INSTRUCTIONS = 37,
+	PERF_SP_FS_STAGE_TEX_INSTRUCTIONS = 38,
+	PERF_SP_FS_STAGE_CFLOW_INSTRUCTIONS = 39,
+	PERF_SP_FS_STAGE_EFU_INSTRUCTIONS = 40,
+	PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = 41,
+	PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = 42,
+	PERF_SP_FS_STAGE_BARY_INSTRUCTIONS = 43,
+	PERF_SP_VS_INSTRUCTIONS = 44,
+	PERF_SP_FS_INSTRUCTIONS = 45,
+	PERF_SP_ADDR_LOCK_COUNT = 46,
+	PERF_SP_UCHE_READ_TRANS = 47,
+	PERF_SP_UCHE_WRITE_TRANS = 48,
+	PERF_SP_EXPORT_VPC_TRANS = 49,
+	PERF_SP_EXPORT_RB_TRANS = 50,
+	PERF_SP_PIXELS_KILLED = 51,
+	PERF_SP_ICL1_REQUESTS = 52,
+	PERF_SP_ICL1_MISSES = 53,
+	PERF_SP_ICL0_REQUESTS = 54,
+	PERF_SP_ICL0_MISSES = 55,
+	PERF_SP_HS_INSTRUCTIONS = 56,
+	PERF_SP_DS_INSTRUCTIONS = 57,
+	PERF_SP_GS_INSTRUCTIONS = 58,
+	PERF_SP_CS_INSTRUCTIONS = 59,
+	PERF_SP_GPR_READ = 60,
+	PERF_SP_GPR_WRITE = 61,
+	PERF_SP_LM_CH0_REQUESTS = 62,
+	PERF_SP_LM_CH1_REQUESTS = 63,
+	PERF_SP_LM_BANK_CONFLICTS = 64,
+};
+
+enum a5xx_rb_perfcounter_select {	/* 30 selector values numbered contiguously 0..29; presumably programmed into the RB perfcounter select registers - TODO confirm */
+	PERF_RB_BUSY_CYCLES = 0,
+	PERF_RB_STALL_CYCLES_CCU = 1,
+	PERF_RB_STALL_CYCLES_HLSQ = 2,
+	PERF_RB_STALL_CYCLES_FIFO0_FULL = 3,
+	PERF_RB_STALL_CYCLES_FIFO1_FULL = 4,
+	PERF_RB_STALL_CYCLES_FIFO2_FULL = 5,
+	PERF_RB_STARVE_CYCLES_SP = 6,
+	PERF_RB_STARVE_CYCLES_LRZ_TILE = 7,
+	PERF_RB_STARVE_CYCLES_CCU = 8,
+	PERF_RB_STARVE_CYCLES_Z_PLANE = 9,
+	PERF_RB_STARVE_CYCLES_BARY_PLANE = 10,
+	PERF_RB_Z_WORKLOAD = 11,
+	PERF_RB_HLSQ_ACTIVE = 12,
+	PERF_RB_Z_READ = 13,
+	PERF_RB_Z_WRITE = 14,
+	PERF_RB_C_READ = 15,
+	PERF_RB_C_WRITE = 16,
+	PERF_RB_TOTAL_PASS = 17,
+	PERF_RB_Z_PASS = 18,
+	PERF_RB_Z_FAIL = 19,
+	PERF_RB_S_FAIL = 20,
+	PERF_RB_BLENDED_FXP_COMPONENTS = 21,
+	PERF_RB_BLENDED_FP16_COMPONENTS = 22,
+	RB_RESERVED = 23,	/* only enumerator here without the PERF_ prefix; placeholder keeping the numbering contiguous */
+	PERF_RB_2D_ALIVE_CYCLES = 24,
+	PERF_RB_2D_STALL_CYCLES_A2D = 25,
+	PERF_RB_2D_STARVE_CYCLES_SRC = 26,
+	PERF_RB_2D_STARVE_CYCLES_SP = 27,
+	PERF_RB_2D_STARVE_CYCLES_DST = 28,
+	PERF_RB_2D_VALID_PIXELS = 29,
+};
+
+enum a5xx_rb_samples_perfcounter_select {	/* 4 selector values 0..3; NOTE(review): these enumerators are unprefixed global names (no PERF_/A5XX_), so they can collide with other identifiers */
+	TOTAL_SAMPLES = 0,
+	ZPASS_SAMPLES = 1,
+	ZFAIL_SAMPLES = 2,
+	SFAIL_SAMPLES = 3,
+};
+
+enum a5xx_vsc_perfcounter_select {	/* 4 selector values 0..3; presumably programmed into the VSC_PERFCTR_VSC_SEL_n registers - TODO confirm */
+	PERF_VSC_BUSY_CYCLES = 0,
+	PERF_VSC_WORKING_CYCLES = 1,
+	PERF_VSC_STALL_CYCLES_UCHE = 2,
+	PERF_VSC_EOT_NUM = 3,
+};
+
+enum a5xx_ccu_perfcounter_select {	/* 26 selector values numbered contiguously 0..25; presumably programmed into the CCU perfcounter select registers - TODO confirm */
+	PERF_CCU_BUSY_CYCLES = 0,
+	PERF_CCU_STALL_CYCLES_RB_DEPTH_RETURN = 1,
+	PERF_CCU_STALL_CYCLES_RB_COLOR_RETURN = 2,
+	PERF_CCU_STARVE_CYCLES_FLAG_RETURN = 3,
+	PERF_CCU_DEPTH_BLOCKS = 4,
+	PERF_CCU_COLOR_BLOCKS = 5,
+	PERF_CCU_DEPTH_BLOCK_HIT = 6,
+	PERF_CCU_COLOR_BLOCK_HIT = 7,
+	PERF_CCU_PARTIAL_BLOCK_READ = 8,
+	PERF_CCU_GMEM_READ = 9,
+	PERF_CCU_GMEM_WRITE = 10,
+	PERF_CCU_DEPTH_READ_FLAG0_COUNT = 11,
+	PERF_CCU_DEPTH_READ_FLAG1_COUNT = 12,
+	PERF_CCU_DEPTH_READ_FLAG2_COUNT = 13,
+	PERF_CCU_DEPTH_READ_FLAG3_COUNT = 14,
+	PERF_CCU_DEPTH_READ_FLAG4_COUNT = 15,
+	PERF_CCU_COLOR_READ_FLAG0_COUNT = 16,
+	PERF_CCU_COLOR_READ_FLAG1_COUNT = 17,
+	PERF_CCU_COLOR_READ_FLAG2_COUNT = 18,
+	PERF_CCU_COLOR_READ_FLAG3_COUNT = 19,
+	PERF_CCU_COLOR_READ_FLAG4_COUNT = 20,
+	PERF_CCU_2D_BUSY_CYCLES = 21,
+	PERF_CCU_2D_RD_REQ = 22,
+	PERF_CCU_2D_WR_REQ = 23,
+	PERF_CCU_2D_REORDER_STARVE_CYCLES = 24,
+	PERF_CCU_2D_PIXELS = 25,
+};
+
+enum a5xx_cmp_perfcounter_select {	/* 24 selector values numbered contiguously 0..23; enumerators use the PERF_CMPDECMP_ prefix although the enum tag says "cmp" */
+	PERF_CMPDECMP_STALL_CYCLES_VBIF = 0,
+	PERF_CMPDECMP_VBIF_LATENCY_CYCLES = 1,
+	PERF_CMPDECMP_VBIF_LATENCY_SAMPLES = 2,
+	PERF_CMPDECMP_VBIF_READ_DATA_CCU = 3,
+	PERF_CMPDECMP_VBIF_WRITE_DATA_CCU = 4,
+	PERF_CMPDECMP_VBIF_READ_REQUEST = 5,
+	PERF_CMPDECMP_VBIF_WRITE_REQUEST = 6,
+	PERF_CMPDECMP_VBIF_READ_DATA = 7,
+	PERF_CMPDECMP_VBIF_WRITE_DATA = 8,
+	PERF_CMPDECMP_FLAG_FETCH_CYCLES = 9,
+	PERF_CMPDECMP_FLAG_FETCH_SAMPLES = 10,
+	PERF_CMPDECMP_DEPTH_WRITE_FLAG1_COUNT = 11,	/* depth write-flag counters are FLAG1..FLAG4; no FLAG0 entry is defined */
+	PERF_CMPDECMP_DEPTH_WRITE_FLAG2_COUNT = 12,
+	PERF_CMPDECMP_DEPTH_WRITE_FLAG3_COUNT = 13,
+	PERF_CMPDECMP_DEPTH_WRITE_FLAG4_COUNT = 14,
+	PERF_CMPDECMP_COLOR_WRITE_FLAG1_COUNT = 15,	/* color write-flag counters are FLAG1..FLAG4; no FLAG0 entry is defined */
+	PERF_CMPDECMP_COLOR_WRITE_FLAG2_COUNT = 16,
+	PERF_CMPDECMP_COLOR_WRITE_FLAG3_COUNT = 17,
+	PERF_CMPDECMP_COLOR_WRITE_FLAG4_COUNT = 18,
+	PERF_CMPDECMP_2D_STALL_CYCLES_VBIF_REQ = 19,
+	PERF_CMPDECMP_2D_STALL_CYCLES_VBIF_WR = 20,
+	PERF_CMPDECMP_2D_STALL_CYCLES_VBIF_RETURN = 21,
+	PERF_CMPDECMP_2D_RD_DATA = 22,
+	PERF_CMPDECMP_2D_WR_DATA = 23,
+};
+
+enum a5xx_vbif_perfcounter_select {	/* 86 selector values numbered contiguously 0..85; NOTE(review): enumerators use bare AXI prefixes (no PERF_/A5XX_), so they are global names that can collide */
+	AXI_READ_REQUESTS_ID_0 = 0,	/* 0..15: per-ID read requests */
+	AXI_READ_REQUESTS_ID_1 = 1,
+	AXI_READ_REQUESTS_ID_2 = 2,
+	AXI_READ_REQUESTS_ID_3 = 3,
+	AXI_READ_REQUESTS_ID_4 = 4,
+	AXI_READ_REQUESTS_ID_5 = 5,
+	AXI_READ_REQUESTS_ID_6 = 6,
+	AXI_READ_REQUESTS_ID_7 = 7,
+	AXI_READ_REQUESTS_ID_8 = 8,
+	AXI_READ_REQUESTS_ID_9 = 9,
+	AXI_READ_REQUESTS_ID_10 = 10,
+	AXI_READ_REQUESTS_ID_11 = 11,
+	AXI_READ_REQUESTS_ID_12 = 12,
+	AXI_READ_REQUESTS_ID_13 = 13,
+	AXI_READ_REQUESTS_ID_14 = 14,
+	AXI_READ_REQUESTS_ID_15 = 15,
+	AXI0_READ_REQUESTS_TOTAL = 16,
+	AXI1_READ_REQUESTS_TOTAL = 17,
+	AXI2_READ_REQUESTS_TOTAL = 18,
+	AXI3_READ_REQUESTS_TOTAL = 19,
+	AXI_READ_REQUESTS_TOTAL = 20,
+	AXI_WRITE_REQUESTS_ID_0 = 21,	/* 21..36: per-ID write requests */
+	AXI_WRITE_REQUESTS_ID_1 = 22,
+	AXI_WRITE_REQUESTS_ID_2 = 23,
+	AXI_WRITE_REQUESTS_ID_3 = 24,
+	AXI_WRITE_REQUESTS_ID_4 = 25,
+	AXI_WRITE_REQUESTS_ID_5 = 26,
+	AXI_WRITE_REQUESTS_ID_6 = 27,
+	AXI_WRITE_REQUESTS_ID_7 = 28,
+	AXI_WRITE_REQUESTS_ID_8 = 29,
+	AXI_WRITE_REQUESTS_ID_9 = 30,
+	AXI_WRITE_REQUESTS_ID_10 = 31,
+	AXI_WRITE_REQUESTS_ID_11 = 32,
+	AXI_WRITE_REQUESTS_ID_12 = 33,
+	AXI_WRITE_REQUESTS_ID_13 = 34,
+	AXI_WRITE_REQUESTS_ID_14 = 35,
+	AXI_WRITE_REQUESTS_ID_15 = 36,
+	AXI0_WRITE_REQUESTS_TOTAL = 37,
+	AXI1_WRITE_REQUESTS_TOTAL = 38,
+	AXI2_WRITE_REQUESTS_TOTAL = 39,
+	AXI3_WRITE_REQUESTS_TOTAL = 40,
+	AXI_WRITE_REQUESTS_TOTAL = 41,
+	AXI_TOTAL_REQUESTS = 42,
+	AXI_READ_DATA_BEATS_ID_0 = 43,	/* 43..58: per-ID read data beats */
+	AXI_READ_DATA_BEATS_ID_1 = 44,
+	AXI_READ_DATA_BEATS_ID_2 = 45,
+	AXI_READ_DATA_BEATS_ID_3 = 46,
+	AXI_READ_DATA_BEATS_ID_4 = 47,
+	AXI_READ_DATA_BEATS_ID_5 = 48,
+	AXI_READ_DATA_BEATS_ID_6 = 49,
+	AXI_READ_DATA_BEATS_ID_7 = 50,
+	AXI_READ_DATA_BEATS_ID_8 = 51,
+	AXI_READ_DATA_BEATS_ID_9 = 52,
+	AXI_READ_DATA_BEATS_ID_10 = 53,
+	AXI_READ_DATA_BEATS_ID_11 = 54,
+	AXI_READ_DATA_BEATS_ID_12 = 55,
+	AXI_READ_DATA_BEATS_ID_13 = 56,
+	AXI_READ_DATA_BEATS_ID_14 = 57,
+	AXI_READ_DATA_BEATS_ID_15 = 58,
+	AXI0_READ_DATA_BEATS_TOTAL = 59,
+	AXI1_READ_DATA_BEATS_TOTAL = 60,
+	AXI2_READ_DATA_BEATS_TOTAL = 61,
+	AXI3_READ_DATA_BEATS_TOTAL = 62,
+	AXI_READ_DATA_BEATS_TOTAL = 63,
+	AXI_WRITE_DATA_BEATS_ID_0 = 64,	/* 64..79: per-ID write data beats */
+	AXI_WRITE_DATA_BEATS_ID_1 = 65,
+	AXI_WRITE_DATA_BEATS_ID_2 = 66,
+	AXI_WRITE_DATA_BEATS_ID_3 = 67,
+	AXI_WRITE_DATA_BEATS_ID_4 = 68,
+	AXI_WRITE_DATA_BEATS_ID_5 = 69,
+	AXI_WRITE_DATA_BEATS_ID_6 = 70,
+	AXI_WRITE_DATA_BEATS_ID_7 = 71,
+	AXI_WRITE_DATA_BEATS_ID_8 = 72,
+	AXI_WRITE_DATA_BEATS_ID_9 = 73,
+	AXI_WRITE_DATA_BEATS_ID_10 = 74,
+	AXI_WRITE_DATA_BEATS_ID_11 = 75,
+	AXI_WRITE_DATA_BEATS_ID_12 = 76,
+	AXI_WRITE_DATA_BEATS_ID_13 = 77,
+	AXI_WRITE_DATA_BEATS_ID_14 = 78,
+	AXI_WRITE_DATA_BEATS_ID_15 = 79,
+	AXI0_WRITE_DATA_BEATS_TOTAL = 80,
+	AXI1_WRITE_DATA_BEATS_TOTAL = 81,
+	AXI2_WRITE_DATA_BEATS_TOTAL = 82,
+	AXI3_WRITE_DATA_BEATS_TOTAL = 83,
+	AXI_WRITE_DATA_BEATS_TOTAL = 84,
+	AXI_DATA_BEATS_TOTAL = 85,
+};
+
 enum a5xx_tex_filter {
 	A5XX_TEX_NEAREST = 0,
 	A5XX_TEX_LINEAR = 1,
@@ -1337,25 +1939,85 @@
 
 #define REG_A5XX_RBBM_SECVID_TSB_ADDR_MODE_CNTL			0x0000f810
 
-#define REG_A5XX_VSC_PIPE_DATA_LENGTH_0				0x00000c00
+#define REG_A5XX_VSC_BIN_SIZE					0x00000bc2
+#define A5XX_VSC_BIN_SIZE_WIDTH__MASK				0x000000ff
+#define A5XX_VSC_BIN_SIZE_WIDTH__SHIFT				0
+static inline uint32_t A5XX_VSC_BIN_SIZE_WIDTH(uint32_t val)	/* packs val/32 into bits 7:0 of VSC_BIN_SIZE */
+{
+	assert(!(val & 0x1f));	/* val must be a multiple of 32: the field stores val >> 5 */
+	return ((val >> 5) << A5XX_VSC_BIN_SIZE_WIDTH__SHIFT) & A5XX_VSC_BIN_SIZE_WIDTH__MASK;	/* quotients above 255 are silently truncated by the mask */
+}
+#define A5XX_VSC_BIN_SIZE_HEIGHT__MASK				0x0001fe00
+#define A5XX_VSC_BIN_SIZE_HEIGHT__SHIFT				9
+static inline uint32_t A5XX_VSC_BIN_SIZE_HEIGHT(uint32_t val)	/* packs val/32 into bits 16:9 of VSC_BIN_SIZE */
+{
+	assert(!(val & 0x1f));	/* val must be a multiple of 32: the field stores val >> 5 */
+	return ((val >> 5) << A5XX_VSC_BIN_SIZE_HEIGHT__SHIFT) & A5XX_VSC_BIN_SIZE_HEIGHT__MASK;	/* quotients above 255 are silently truncated by the mask */
+}
+
+#define REG_A5XX_VSC_SIZE_ADDRESS_LO				0x00000bc3
+
+#define REG_A5XX_VSC_SIZE_ADDRESS_HI				0x00000bc4
+
+#define REG_A5XX_UNKNOWN_0BC5					0x00000bc5
+
+#define REG_A5XX_UNKNOWN_0BC6					0x00000bc6
+
+static inline uint32_t REG_A5XX_VSC_PIPE_CONFIG(uint32_t i0) { return 0x00000bd0 + 0x1*i0; }	/* offset of array element i0: base 0x0bd0, stride 1; i0 is not range-checked */
+
+static inline uint32_t REG_A5XX_VSC_PIPE_CONFIG_REG(uint32_t i0) { return 0x00000bd0 + 0x1*i0; }	/* same offset as REG_A5XX_VSC_PIPE_CONFIG(i0): 0x0bd0 + i0; i0 is not range-checked */
+#define A5XX_VSC_PIPE_CONFIG_REG_X__MASK			0x000003ff
+#define A5XX_VSC_PIPE_CONFIG_REG_X__SHIFT			0
+static inline uint32_t A5XX_VSC_PIPE_CONFIG_REG_X(uint32_t val)	/* packs val into the 10-bit X field, bits 9:0 */
+{
+	return ((val) << A5XX_VSC_PIPE_CONFIG_REG_X__SHIFT) & A5XX_VSC_PIPE_CONFIG_REG_X__MASK;	/* bits of val beyond the field width are masked off, not rejected */
+}
+#define A5XX_VSC_PIPE_CONFIG_REG_Y__MASK			0x000ffc00
+#define A5XX_VSC_PIPE_CONFIG_REG_Y__SHIFT			10
+static inline uint32_t A5XX_VSC_PIPE_CONFIG_REG_Y(uint32_t val)	/* packs val into the 10-bit Y field, bits 19:10 */
+{
+	return ((val) << A5XX_VSC_PIPE_CONFIG_REG_Y__SHIFT) & A5XX_VSC_PIPE_CONFIG_REG_Y__MASK;	/* bits of val beyond the field width are masked off, not rejected */
+}
+#define A5XX_VSC_PIPE_CONFIG_REG_W__MASK			0x00f00000
+#define A5XX_VSC_PIPE_CONFIG_REG_W__SHIFT			20
+static inline uint32_t A5XX_VSC_PIPE_CONFIG_REG_W(uint32_t val)	/* packs val into the 4-bit W field, bits 23:20 */
+{
+	return ((val) << A5XX_VSC_PIPE_CONFIG_REG_W__SHIFT) & A5XX_VSC_PIPE_CONFIG_REG_W__MASK;	/* values above 15 are masked off, not rejected */
+}
+#define A5XX_VSC_PIPE_CONFIG_REG_H__MASK			0x0f000000
+#define A5XX_VSC_PIPE_CONFIG_REG_H__SHIFT			24
+static inline uint32_t A5XX_VSC_PIPE_CONFIG_REG_H(uint32_t val)	/* packs val into the 4-bit H field, bits 27:24 */
+{
+	return ((val) << A5XX_VSC_PIPE_CONFIG_REG_H__SHIFT) & A5XX_VSC_PIPE_CONFIG_REG_H__MASK;	/* values above 15 are masked off, not rejected */
+}
+
+static inline uint32_t REG_A5XX_VSC_PIPE_DATA_ADDRESS(uint32_t i0) { return 0x00000be0 + 0x2*i0; }	/* offset of array element i0: base 0x0be0, stride 2 registers; i0 is not range-checked */
+
+static inline uint32_t REG_A5XX_VSC_PIPE_DATA_ADDRESS_LO(uint32_t i0) { return 0x00000be0 + 0x2*i0; }	/* first register of element i0's pair: 0x0be0 + 2*i0 */
+
+static inline uint32_t REG_A5XX_VSC_PIPE_DATA_ADDRESS_HI(uint32_t i0) { return 0x00000be1 + 0x2*i0; }	/* second register of element i0's pair: 0x0be1 + 2*i0 (one past the _LO offset) */
+
+static inline uint32_t REG_A5XX_VSC_PIPE_DATA_LENGTH(uint32_t i0) { return 0x00000c00 + 0x1*i0; }	/* offset of array element i0: base 0x0c00, stride 1; replaces the removed single REG_A5XX_VSC_PIPE_DATA_LENGTH_0 define */
+
+static inline uint32_t REG_A5XX_VSC_PIPE_DATA_LENGTH_REG(uint32_t i0) { return 0x00000c00 + 0x1*i0; }	/* same offset as REG_A5XX_VSC_PIPE_DATA_LENGTH(i0): 0x0c00 + i0 */
 
 #define REG_A5XX_VSC_PERFCTR_VSC_SEL_0				0x00000c60
 
 #define REG_A5XX_VSC_PERFCTR_VSC_SEL_1				0x00000c61
 
-#define REG_A5XX_VSC_BIN_SIZE					0x00000cdd
-#define A5XX_VSC_BIN_SIZE_WINDOW_OFFSET_DISABLE			0x80000000
-#define A5XX_VSC_BIN_SIZE_X__MASK				0x00007fff
-#define A5XX_VSC_BIN_SIZE_X__SHIFT				0
-static inline uint32_t A5XX_VSC_BIN_SIZE_X(uint32_t val)
+#define REG_A5XX_VSC_RESOLVE_CNTL				0x00000cdd
+#define A5XX_VSC_RESOLVE_CNTL_WINDOW_OFFSET_DISABLE		0x80000000
+#define A5XX_VSC_RESOLVE_CNTL_X__MASK				0x00007fff
+#define A5XX_VSC_RESOLVE_CNTL_X__SHIFT				0
+static inline uint32_t A5XX_VSC_RESOLVE_CNTL_X(uint32_t val)	/* packs val into the 15-bit X field, bits 14:0; excess bits are masked off */
 {
-	return ((val) << A5XX_VSC_BIN_SIZE_X__SHIFT) & A5XX_VSC_BIN_SIZE_X__MASK;
+	return ((val) << A5XX_VSC_RESOLVE_CNTL_X__SHIFT) & A5XX_VSC_RESOLVE_CNTL_X__MASK;
 }
-#define A5XX_VSC_BIN_SIZE_Y__MASK				0x7fff0000
-#define A5XX_VSC_BIN_SIZE_Y__SHIFT				16
-static inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val)
+#define A5XX_VSC_RESOLVE_CNTL_Y__MASK				0x7fff0000
+#define A5XX_VSC_RESOLVE_CNTL_Y__SHIFT				16
+static inline uint32_t A5XX_VSC_RESOLVE_CNTL_Y(uint32_t val)	/* packs val into the 15-bit Y field, bits 30:16; excess bits are masked off */
 {
-	return ((val) << A5XX_VSC_BIN_SIZE_Y__SHIFT) & A5XX_VSC_BIN_SIZE_Y__MASK;
+	return ((val) << A5XX_VSC_RESOLVE_CNTL_Y__SHIFT) & A5XX_VSC_RESOLVE_CNTL_Y__MASK;
 }
 
 #define REG_A5XX_GRAS_ADDR_MODE_CNTL				0x00000c81
@@ -1518,6 +2180,7 @@
 #define REG_A5XX_VPC_ADDR_MODE_CNTL				0x00000e61
 
 #define REG_A5XX_VPC_MODE_CNTL					0x00000e62
+#define A5XX_VPC_MODE_CNTL_BINNING_PASS				0x00000001
 
 #define REG_A5XX_VPC_PERFCTR_VPC_SEL_0				0x00000e64
 
@@ -1689,6 +2352,14 @@
 
 #define REG_A5XX_VBIF_TEST_BUS_OUT				0x0000308c
 
+#define REG_A5XX_VBIF_PERF_CNT_EN0				0x000030c0
+
+#define REG_A5XX_VBIF_PERF_CNT_EN1				0x000030c1
+
+#define REG_A5XX_VBIF_PERF_CNT_EN2				0x000030c2
+
+#define REG_A5XX_VBIF_PERF_CNT_EN3				0x000030c3
+
 #define REG_A5XX_VBIF_PERF_CNT_SEL0				0x000030d0
 
 #define REG_A5XX_VBIF_PERF_CNT_SEL1				0x000030d1
@@ -1952,6 +2623,7 @@
 #define REG_A5XX_GPU_CS_AMP_CALIBRATION_CONTROL1		0x0000c557
 
 #define REG_A5XX_GRAS_CL_CNTL					0x0000e000
+#define A5XX_GRAS_CL_CNTL_ZERO_GB_SCALE_Z			0x00000040
 
 #define REG_A5XX_UNKNOWN_E001					0x0000e001
 
@@ -2103,6 +2775,7 @@
 #define REG_A5XX_GRAS_SU_CONSERVATIVE_RAS_CNTL			0x0000e099
 
 #define REG_A5XX_GRAS_SC_CNTL					0x0000e0a0
+#define A5XX_GRAS_SC_CNTL_BINNING_PASS				0x00000001
 #define A5XX_GRAS_SC_CNTL_SAMPLES_PASSED			0x00008000
 
 #define REG_A5XX_GRAS_SC_BIN_CNTL				0x0000e0a1
@@ -2217,12 +2890,22 @@
 }
 
 #define REG_A5XX_GRAS_LRZ_CNTL					0x0000e100
+#define A5XX_GRAS_LRZ_CNTL_ENABLE				0x00000001
+#define A5XX_GRAS_LRZ_CNTL_LRZ_WRITE				0x00000002
+#define A5XX_GRAS_LRZ_CNTL_GREATER				0x00000004
 
 #define REG_A5XX_GRAS_LRZ_BUFFER_BASE_LO			0x0000e101
 
 #define REG_A5XX_GRAS_LRZ_BUFFER_BASE_HI			0x0000e102
 
 #define REG_A5XX_GRAS_LRZ_BUFFER_PITCH				0x0000e103
+#define A5XX_GRAS_LRZ_BUFFER_PITCH__MASK			0xffffffff
+#define A5XX_GRAS_LRZ_BUFFER_PITCH__SHIFT			0
+static inline uint32_t A5XX_GRAS_LRZ_BUFFER_PITCH(uint32_t val)	/* encodes val as val/32; the field spans the whole 32-bit register */
+{
+	assert(!(val & 0x1f));	/* val must be a multiple of 32: the register stores val >> 5 */
+	return ((val >> 5) << A5XX_GRAS_LRZ_BUFFER_PITCH__SHIFT) & A5XX_GRAS_LRZ_BUFFER_PITCH__MASK;
+}
 
 #define REG_A5XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO		0x0000e104
 
@@ -2246,7 +2929,9 @@
 #define A5XX_RB_CNTL_BYPASS					0x00020000
 
 #define REG_A5XX_RB_RENDER_CNTL					0x0000e141
+#define A5XX_RB_RENDER_CNTL_BINNING_PASS			0x00000001
 #define A5XX_RB_RENDER_CNTL_SAMPLES_PASSED			0x00000040
+#define A5XX_RB_RENDER_CNTL_DISABLE_COLOR_PIPE			0x00000080
 #define A5XX_RB_RENDER_CNTL_FLAG_DEPTH				0x00004000
 #define A5XX_RB_RENDER_CNTL_FLAG_DEPTH2				0x00008000
 #define A5XX_RB_RENDER_CNTL_FLAG_MRTS__MASK			0x00ff0000
@@ -2354,6 +3039,13 @@
 static inline uint32_t REG_A5XX_RB_MRT_CONTROL(uint32_t i0) { return 0x0000e150 + 0x7*i0; }
 #define A5XX_RB_MRT_CONTROL_BLEND				0x00000001
 #define A5XX_RB_MRT_CONTROL_BLEND2				0x00000002
+#define A5XX_RB_MRT_CONTROL_ROP_ENABLE				0x00000004
+#define A5XX_RB_MRT_CONTROL_ROP_CODE__MASK			0x00000078
+#define A5XX_RB_MRT_CONTROL_ROP_CODE__SHIFT			3
+static inline uint32_t A5XX_RB_MRT_CONTROL_ROP_CODE(enum a3xx_rop_code val)	/* packs an a3xx_rop_code into the 4-bit field at bits 6:3 (reuses the a3xx enum) */
+{
+	return ((val) << A5XX_RB_MRT_CONTROL_ROP_CODE__SHIFT) & A5XX_RB_MRT_CONTROL_ROP_CODE__MASK;	/* values above 15 are masked off, not rejected */
+}
 #define A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK		0x00000780
 #define A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE__SHIFT		7
 static inline uint32_t A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE(uint32_t val)
@@ -2727,7 +3419,25 @@
 	return ((val) << A5XX_RB_STENCILREFMASK_STENCILWRITEMASK__SHIFT) & A5XX_RB_STENCILREFMASK_STENCILWRITEMASK__MASK;
 }
 
-#define REG_A5XX_UNKNOWN_E1C7					0x0000e1c7
+#define REG_A5XX_RB_STENCILREFMASK_BF				0x0000e1c7
+#define A5XX_RB_STENCILREFMASK_BF_STENCILREF__MASK		0x000000ff
+#define A5XX_RB_STENCILREFMASK_BF_STENCILREF__SHIFT		0
+static inline uint32_t A5XX_RB_STENCILREFMASK_BF_STENCILREF(uint32_t val)	/* packs val into the 8-bit field at bits 7:0; higher bits of val are masked off */
+{
+	return ((val) << A5XX_RB_STENCILREFMASK_BF_STENCILREF__SHIFT) & A5XX_RB_STENCILREFMASK_BF_STENCILREF__MASK;
+}
+#define A5XX_RB_STENCILREFMASK_BF_STENCILMASK__MASK		0x0000ff00
+#define A5XX_RB_STENCILREFMASK_BF_STENCILMASK__SHIFT		8
+static inline uint32_t A5XX_RB_STENCILREFMASK_BF_STENCILMASK(uint32_t val)	/* packs val into the 8-bit field at bits 15:8; higher bits of val are masked off */
+{
+	return ((val) << A5XX_RB_STENCILREFMASK_BF_STENCILMASK__SHIFT) & A5XX_RB_STENCILREFMASK_BF_STENCILMASK__MASK;
+}
+#define A5XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK__MASK	0x00ff0000
+#define A5XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK__SHIFT	16
+static inline uint32_t A5XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(uint32_t val)	/* packs val into the 8-bit field at bits 23:16; higher bits of val are masked off */
+{
+	return ((val) << A5XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK__SHIFT) & A5XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK__MASK;
+}
 
 #define REG_A5XX_RB_WINDOW_OFFSET				0x0000e1d0
 #define A5XX_RB_WINDOW_OFFSET_WINDOW_OFFSET_DISABLE		0x80000000
@@ -2744,6 +3454,9 @@
 	return ((val) << A5XX_RB_WINDOW_OFFSET_Y__SHIFT) & A5XX_RB_WINDOW_OFFSET_Y__MASK;
 }
 
+#define REG_A5XX_RB_SAMPLE_COUNT_CONTROL			0x0000e1d1
+#define A5XX_RB_SAMPLE_COUNT_CONTROL_COPY			0x00000002
+
 #define REG_A5XX_RB_BLIT_CNTL					0x0000e210
 #define A5XX_RB_BLIT_CNTL_BUF__MASK				0x0000000f
 #define A5XX_RB_BLIT_CNTL_BUF__SHIFT				0
@@ -2875,6 +3588,10 @@
 	return ((val >> 6) << A5XX_RB_BLIT_FLAG_DST_ARRAY_PITCH__SHIFT) & A5XX_RB_BLIT_FLAG_DST_ARRAY_PITCH__MASK;
 }
 
+#define REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO			0x0000e267
+
+#define REG_A5XX_RB_SAMPLE_COUNT_ADDR_HI			0x0000e268
+
 #define REG_A5XX_VPC_CNTL_0					0x0000e280
 #define A5XX_VPC_CNTL_0_STRIDE_IN_VPC__MASK			0x0000007f
 #define A5XX_VPC_CNTL_0_STRIDE_IN_VPC__SHIFT			0
@@ -2986,11 +3703,26 @@
 {
 	return ((val) << A5XX_PC_PRIMITIVE_CNTL_STRIDE_IN_VPC__SHIFT) & A5XX_PC_PRIMITIVE_CNTL_STRIDE_IN_VPC__MASK;
 }
+#define A5XX_PC_PRIMITIVE_CNTL_PRIMITIVE_RESTART		0x00000100
+#define A5XX_PC_PRIMITIVE_CNTL_PROVOKING_VTX_LAST		0x00000400
 
 #define REG_A5XX_PC_PRIM_VTX_CNTL				0x0000e385
 #define A5XX_PC_PRIM_VTX_CNTL_PSIZE				0x00000800
 
 #define REG_A5XX_PC_RASTER_CNTL					0x0000e388
+#define A5XX_PC_RASTER_CNTL_POLYMODE_FRONT_PTYPE__MASK		0x00000007
+#define A5XX_PC_RASTER_CNTL_POLYMODE_FRONT_PTYPE__SHIFT		0
+static inline uint32_t A5XX_PC_RASTER_CNTL_POLYMODE_FRONT_PTYPE(enum adreno_pa_su_sc_draw val)	/* packs an adreno_pa_su_sc_draw value into the 3-bit field at bits 2:0 */
+{
+	return ((val) << A5XX_PC_RASTER_CNTL_POLYMODE_FRONT_PTYPE__SHIFT) & A5XX_PC_RASTER_CNTL_POLYMODE_FRONT_PTYPE__MASK;
+}
+#define A5XX_PC_RASTER_CNTL_POLYMODE_BACK_PTYPE__MASK		0x00000038
+#define A5XX_PC_RASTER_CNTL_POLYMODE_BACK_PTYPE__SHIFT		3
+static inline uint32_t A5XX_PC_RASTER_CNTL_POLYMODE_BACK_PTYPE(enum adreno_pa_su_sc_draw val)	/* packs an adreno_pa_su_sc_draw value into the 3-bit field at bits 5:3 */
+{
+	return ((val) << A5XX_PC_RASTER_CNTL_POLYMODE_BACK_PTYPE__SHIFT) & A5XX_PC_RASTER_CNTL_POLYMODE_BACK_PTYPE__MASK;
+}
+#define A5XX_PC_RASTER_CNTL_POLYMODE_ENABLE			0x00000040
 
 #define REG_A5XX_UNKNOWN_E389					0x0000e389
 
@@ -3058,12 +3790,18 @@
 	return ((val) << A5XX_VFD_DECODE_INSTR_IDX__SHIFT) & A5XX_VFD_DECODE_INSTR_IDX__MASK;
 }
 #define A5XX_VFD_DECODE_INSTR_INSTANCED				0x00020000
-#define A5XX_VFD_DECODE_INSTR_FORMAT__MASK			0x3ff00000
+#define A5XX_VFD_DECODE_INSTR_FORMAT__MASK			0x0ff00000
 #define A5XX_VFD_DECODE_INSTR_FORMAT__SHIFT			20
 static inline uint32_t A5XX_VFD_DECODE_INSTR_FORMAT(enum a5xx_vtx_fmt val)
 {
 	return ((val) << A5XX_VFD_DECODE_INSTR_FORMAT__SHIFT) & A5XX_VFD_DECODE_INSTR_FORMAT__MASK;
 }
+#define A5XX_VFD_DECODE_INSTR_SWAP__MASK			0x30000000
+#define A5XX_VFD_DECODE_INSTR_SWAP__SHIFT			28
+static inline uint32_t A5XX_VFD_DECODE_INSTR_SWAP(enum a3xx_color_swap val)	/* packs an a3xx_color_swap value into the 2-bit field at bits 29:28, carved out of the top of the old FORMAT mask */
+{
+	return ((val) << A5XX_VFD_DECODE_INSTR_SWAP__SHIFT) & A5XX_VFD_DECODE_INSTR_SWAP__MASK;
+}
 #define A5XX_VFD_DECODE_INSTR_UNK30				0x40000000
 #define A5XX_VFD_DECODE_INSTR_FLOAT				0x80000000
 
@@ -3089,82 +3827,95 @@
 
 #define REG_A5XX_SP_SP_CNTL					0x0000e580
 
-#define REG_A5XX_SP_VS_CONTROL_REG				0x0000e584
-#define A5XX_SP_VS_CONTROL_REG_ENABLED				0x00000001
-#define A5XX_SP_VS_CONTROL_REG_CONSTOBJECTOFFSET__MASK		0x000000fe
-#define A5XX_SP_VS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT		1
-static inline uint32_t A5XX_SP_VS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_SP_VS_CONFIG					0x0000e584
+#define A5XX_SP_VS_CONFIG_ENABLED				0x00000001
+#define A5XX_SP_VS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_SP_VS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_SP_VS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_VS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_VS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_SP_VS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_VS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_SP_VS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_SP_VS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_SP_VS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_SP_VS_CONFIG_SHADEROBJOFFSET__MASK			0x00007f00
+#define A5XX_SP_VS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_SP_VS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_VS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_VS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_SP_VS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_VS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
-#define REG_A5XX_SP_FS_CONTROL_REG				0x0000e585
-#define A5XX_SP_FS_CONTROL_REG_ENABLED				0x00000001
-#define A5XX_SP_FS_CONTROL_REG_CONSTOBJECTOFFSET__MASK		0x000000fe
-#define A5XX_SP_FS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT		1
-static inline uint32_t A5XX_SP_FS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_SP_FS_CONFIG					0x0000e585
+#define A5XX_SP_FS_CONFIG_ENABLED				0x00000001
+#define A5XX_SP_FS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_SP_FS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_SP_FS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_FS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_FS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_SP_FS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_FS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_SP_FS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_SP_FS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_SP_FS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_SP_FS_CONFIG_SHADEROBJOFFSET__MASK			0x00007f00
+#define A5XX_SP_FS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_SP_FS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_FS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_FS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_SP_FS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_FS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
-#define REG_A5XX_SP_HS_CONTROL_REG				0x0000e586
-#define A5XX_SP_HS_CONTROL_REG_ENABLED				0x00000001
-#define A5XX_SP_HS_CONTROL_REG_CONSTOBJECTOFFSET__MASK		0x000000fe
-#define A5XX_SP_HS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT		1
-static inline uint32_t A5XX_SP_HS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_SP_HS_CONFIG					0x0000e586
+#define A5XX_SP_HS_CONFIG_ENABLED				0x00000001
+#define A5XX_SP_HS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_SP_HS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_SP_HS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_HS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_HS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_SP_HS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_HS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_SP_HS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_SP_HS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_SP_HS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_SP_HS_CONFIG_SHADEROBJOFFSET__MASK			0x00007f00
+#define A5XX_SP_HS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_SP_HS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_HS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_HS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_SP_HS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_HS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
-#define REG_A5XX_SP_DS_CONTROL_REG				0x0000e587
-#define A5XX_SP_DS_CONTROL_REG_ENABLED				0x00000001
-#define A5XX_SP_DS_CONTROL_REG_CONSTOBJECTOFFSET__MASK		0x000000fe
-#define A5XX_SP_DS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT		1
-static inline uint32_t A5XX_SP_DS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_SP_DS_CONFIG					0x0000e587
+#define A5XX_SP_DS_CONFIG_ENABLED				0x00000001
+#define A5XX_SP_DS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_SP_DS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_SP_DS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_DS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_DS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_SP_DS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_DS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_SP_DS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_SP_DS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_SP_DS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_SP_DS_CONFIG_SHADEROBJOFFSET__MASK			0x00007f00
+#define A5XX_SP_DS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_SP_DS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_DS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_DS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_SP_DS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_DS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
-#define REG_A5XX_SP_GS_CONTROL_REG				0x0000e588
-#define A5XX_SP_GS_CONTROL_REG_ENABLED				0x00000001
-#define A5XX_SP_GS_CONTROL_REG_CONSTOBJECTOFFSET__MASK		0x000000fe
-#define A5XX_SP_GS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT		1
-static inline uint32_t A5XX_SP_GS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_SP_GS_CONFIG					0x0000e588
+#define A5XX_SP_GS_CONFIG_ENABLED				0x00000001
+#define A5XX_SP_GS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_SP_GS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_SP_GS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_GS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_GS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_SP_GS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_GS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_SP_GS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_SP_GS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_SP_GS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_SP_GS_CONFIG_SHADEROBJOFFSET__MASK			0x00007f00
+#define A5XX_SP_GS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_SP_GS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_SP_GS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_GS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_SP_GS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_GS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
 #define REG_A5XX_SP_CS_CONFIG					0x0000e589
+#define A5XX_SP_CS_CONFIG_ENABLED				0x00000001
+#define A5XX_SP_CS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_SP_CS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_SP_CS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
+{
+	return ((val) << A5XX_SP_CS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_SP_CS_CONFIG_CONSTOBJECTOFFSET__MASK;
+}
+#define A5XX_SP_CS_CONFIG_SHADEROBJOFFSET__MASK			0x00007f00
+#define A5XX_SP_CS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_SP_CS_CONFIG_SHADEROBJOFFSET(uint32_t val)
+{
+	return ((val) << A5XX_SP_CS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_SP_CS_CONFIG_SHADEROBJOFFSET__MASK;
+}
 
 #define REG_A5XX_SP_VS_CONFIG_MAX_CONST				0x0000e58a
 
@@ -3303,6 +4054,8 @@
 #define REG_A5XX_SP_FS_OBJ_START_HI				0x0000e5c4
 
 #define REG_A5XX_SP_BLEND_CNTL					0x0000e5c9
+#define A5XX_SP_BLEND_CNTL_ENABLED				0x00000001
+#define A5XX_SP_BLEND_CNTL_UNK8					0x00000100
 
 #define REG_A5XX_SP_FS_OUTPUT_CNTL				0x0000e5ca
 #define A5XX_SP_FS_OUTPUT_CNTL_MRT__MASK			0x0000000f
@@ -3344,16 +4097,68 @@
 {
 	return ((val) << A5XX_SP_FS_MRT_REG_COLOR_FORMAT__SHIFT) & A5XX_SP_FS_MRT_REG_COLOR_FORMAT__MASK;
 }
+#define A5XX_SP_FS_MRT_REG_COLOR_SINT				0x00000100
+#define A5XX_SP_FS_MRT_REG_COLOR_UINT				0x00000200
 #define A5XX_SP_FS_MRT_REG_COLOR_SRGB				0x00000400
 
 #define REG_A5XX_UNKNOWN_E5DB					0x0000e5db
 
-#define REG_A5XX_SP_CS_CNTL_0					0x0000e5f0
+#define REG_A5XX_UNKNOWN_E5F2					0x0000e5f2
+
+#define REG_A5XX_SP_CS_OBJ_START_LO				0x0000e5f3
+
+#define REG_A5XX_SP_CS_OBJ_START_HI				0x0000e5f4
+
+#define REG_A5XX_SP_CS_CTRL_REG0				0x0000e5f0
+#define A5XX_SP_CS_CTRL_REG0_THREADSIZE__MASK			0x00000008
+#define A5XX_SP_CS_CTRL_REG0_THREADSIZE__SHIFT			3
+static inline uint32_t A5XX_SP_CS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_SP_CS_CTRL_REG0_THREADSIZE__SHIFT) & A5XX_SP_CS_CTRL_REG0_THREADSIZE__MASK;
+}
+#define A5XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
+#define A5XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
+static inline uint32_t A5XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
+{
+	return ((val) << A5XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A5XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
+}
+#define A5XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT__MASK		0x0000fc00
+#define A5XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT		10
+static inline uint32_t A5XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
+{
+	return ((val) << A5XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A5XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT__MASK;
+}
+#define A5XX_SP_CS_CTRL_REG0_VARYING				0x00010000
+#define A5XX_SP_CS_CTRL_REG0_PIXLODENABLE			0x00100000
+#define A5XX_SP_CS_CTRL_REG0_BRANCHSTACK__MASK			0xfe000000
+#define A5XX_SP_CS_CTRL_REG0_BRANCHSTACK__SHIFT			25
+static inline uint32_t A5XX_SP_CS_CTRL_REG0_BRANCHSTACK(uint32_t val)
+{
+	return ((val) << A5XX_SP_CS_CTRL_REG0_BRANCHSTACK__SHIFT) & A5XX_SP_CS_CTRL_REG0_BRANCHSTACK__MASK;
+}
 
 #define REG_A5XX_UNKNOWN_E600					0x0000e600
 
+#define REG_A5XX_UNKNOWN_E602					0x0000e602
+
+#define REG_A5XX_SP_HS_OBJ_START_LO				0x0000e603
+
+#define REG_A5XX_SP_HS_OBJ_START_HI				0x0000e604
+
+#define REG_A5XX_UNKNOWN_E62B					0x0000e62b
+
+#define REG_A5XX_SP_DS_OBJ_START_LO				0x0000e62c
+
+#define REG_A5XX_SP_DS_OBJ_START_HI				0x0000e62d
+
 #define REG_A5XX_UNKNOWN_E640					0x0000e640
 
+#define REG_A5XX_UNKNOWN_E65B					0x0000e65b
+
+#define REG_A5XX_SP_GS_OBJ_START_LO				0x0000e65c
+
+#define REG_A5XX_SP_GS_OBJ_START_HI				0x0000e65d
+
 #define REG_A5XX_TPL1_TP_RAS_MSAA_CNTL				0x0000e704
 #define A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES__MASK		0x00000003
 #define A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES__SHIFT		0
@@ -3377,24 +4182,64 @@
 
 #define REG_A5XX_TPL1_VS_TEX_COUNT				0x0000e700
 
+#define REG_A5XX_TPL1_HS_TEX_COUNT				0x0000e701
+
+#define REG_A5XX_TPL1_DS_TEX_COUNT				0x0000e702
+
+#define REG_A5XX_TPL1_GS_TEX_COUNT				0x0000e703
+
 #define REG_A5XX_TPL1_VS_TEX_SAMP_LO				0x0000e722
 
 #define REG_A5XX_TPL1_VS_TEX_SAMP_HI				0x0000e723
 
+#define REG_A5XX_TPL1_HS_TEX_SAMP_LO				0x0000e724
+
+#define REG_A5XX_TPL1_HS_TEX_SAMP_HI				0x0000e725
+
+#define REG_A5XX_TPL1_DS_TEX_SAMP_LO				0x0000e726
+
+#define REG_A5XX_TPL1_DS_TEX_SAMP_HI				0x0000e727
+
+#define REG_A5XX_TPL1_GS_TEX_SAMP_LO				0x0000e728
+
+#define REG_A5XX_TPL1_GS_TEX_SAMP_HI				0x0000e729
+
 #define REG_A5XX_TPL1_VS_TEX_CONST_LO				0x0000e72a
 
 #define REG_A5XX_TPL1_VS_TEX_CONST_HI				0x0000e72b
 
+#define REG_A5XX_TPL1_HS_TEX_CONST_LO				0x0000e72c
+
+#define REG_A5XX_TPL1_HS_TEX_CONST_HI				0x0000e72d
+
+#define REG_A5XX_TPL1_DS_TEX_CONST_LO				0x0000e72e
+
+#define REG_A5XX_TPL1_DS_TEX_CONST_HI				0x0000e72f
+
+#define REG_A5XX_TPL1_GS_TEX_CONST_LO				0x0000e730
+
+#define REG_A5XX_TPL1_GS_TEX_CONST_HI				0x0000e731
+
 #define REG_A5XX_TPL1_FS_TEX_COUNT				0x0000e750
 
+#define REG_A5XX_TPL1_CS_TEX_COUNT				0x0000e751
+
 #define REG_A5XX_TPL1_FS_TEX_SAMP_LO				0x0000e75a
 
 #define REG_A5XX_TPL1_FS_TEX_SAMP_HI				0x0000e75b
 
+#define REG_A5XX_TPL1_CS_TEX_SAMP_LO				0x0000e75c
+
+#define REG_A5XX_TPL1_CS_TEX_SAMP_HI				0x0000e75d
+
 #define REG_A5XX_TPL1_FS_TEX_CONST_LO				0x0000e75e
 
 #define REG_A5XX_TPL1_FS_TEX_CONST_HI				0x0000e75f
 
+#define REG_A5XX_TPL1_CS_TEX_CONST_LO				0x0000e760
+
+#define REG_A5XX_TPL1_CS_TEX_CONST_HI				0x0000e761
+
 #define REG_A5XX_TPL1_TP_FS_ROTATION_CNTL			0x0000e764
 
 #define REG_A5XX_HLSQ_CONTROL_0_REG				0x0000e784
@@ -3404,6 +4249,12 @@
 {
 	return ((val) << A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT) & A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK;
 }
+#define A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE__MASK		0x00000004
+#define A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE__SHIFT		2
+static inline uint32_t A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE__SHIFT) & A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE__MASK;
+}
 
 #define REG_A5XX_HLSQ_CONTROL_1_REG				0x0000e785
 #define A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD__MASK	0x0000003f
@@ -3445,84 +4296,98 @@
 
 #define REG_A5XX_HLSQ_UPDATE_CNTL				0x0000e78a
 
-#define REG_A5XX_HLSQ_VS_CONTROL_REG				0x0000e78b
-#define A5XX_HLSQ_VS_CONTROL_REG_ENABLED			0x00000001
-#define A5XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET__MASK	0x000000fe
-#define A5XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT	1
-static inline uint32_t A5XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_HLSQ_VS_CONFIG					0x0000e78b
+#define A5XX_HLSQ_VS_CONFIG_ENABLED				0x00000001
+#define A5XX_HLSQ_VS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_HLSQ_VS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_HLSQ_VS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_VS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_VS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_HLSQ_VS_CONFIG_SHADEROBJOFFSET__MASK		0x00007f00
+#define A5XX_HLSQ_VS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_HLSQ_VS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_VS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_VS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
-#define REG_A5XX_HLSQ_FS_CONTROL_REG				0x0000e78c
-#define A5XX_HLSQ_FS_CONTROL_REG_ENABLED			0x00000001
-#define A5XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET__MASK	0x000000fe
-#define A5XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT	1
-static inline uint32_t A5XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_HLSQ_FS_CONFIG					0x0000e78c
+#define A5XX_HLSQ_FS_CONFIG_ENABLED				0x00000001
+#define A5XX_HLSQ_FS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_HLSQ_FS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_HLSQ_FS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_FS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_FS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_HLSQ_FS_CONFIG_SHADEROBJOFFSET__MASK		0x00007f00
+#define A5XX_HLSQ_FS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_HLSQ_FS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_FS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_FS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
-#define REG_A5XX_HLSQ_HS_CONTROL_REG				0x0000e78d
-#define A5XX_HLSQ_HS_CONTROL_REG_ENABLED			0x00000001
-#define A5XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET__MASK	0x000000fe
-#define A5XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT	1
-static inline uint32_t A5XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_HLSQ_HS_CONFIG					0x0000e78d
+#define A5XX_HLSQ_HS_CONFIG_ENABLED				0x00000001
+#define A5XX_HLSQ_HS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_HLSQ_HS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_HLSQ_HS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_HS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_HS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_HLSQ_HS_CONFIG_SHADEROBJOFFSET__MASK		0x00007f00
+#define A5XX_HLSQ_HS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_HLSQ_HS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_HS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_HS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
-#define REG_A5XX_HLSQ_DS_CONTROL_REG				0x0000e78e
-#define A5XX_HLSQ_DS_CONTROL_REG_ENABLED			0x00000001
-#define A5XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET__MASK	0x000000fe
-#define A5XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT	1
-static inline uint32_t A5XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_HLSQ_DS_CONFIG					0x0000e78e
+#define A5XX_HLSQ_DS_CONFIG_ENABLED				0x00000001
+#define A5XX_HLSQ_DS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_HLSQ_DS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_HLSQ_DS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_DS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_DS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_HLSQ_DS_CONFIG_SHADEROBJOFFSET__MASK		0x00007f00
+#define A5XX_HLSQ_DS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_HLSQ_DS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_DS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_DS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
-#define REG_A5XX_HLSQ_GS_CONTROL_REG				0x0000e78f
-#define A5XX_HLSQ_GS_CONTROL_REG_ENABLED			0x00000001
-#define A5XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET__MASK	0x000000fe
-#define A5XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT	1
-static inline uint32_t A5XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(uint32_t val)
+#define REG_A5XX_HLSQ_GS_CONFIG					0x0000e78f
+#define A5XX_HLSQ_GS_CONFIG_ENABLED				0x00000001
+#define A5XX_HLSQ_GS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_HLSQ_GS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_HLSQ_GS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_GS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_GS_CONFIG_CONSTOBJECTOFFSET__MASK;
 }
-#define A5XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET__MASK		0x00007f00
-#define A5XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET__SHIFT		8
-static inline uint32_t A5XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
+#define A5XX_HLSQ_GS_CONFIG_SHADEROBJOFFSET__MASK		0x00007f00
+#define A5XX_HLSQ_GS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_HLSQ_GS_CONFIG_SHADEROBJOFFSET(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET__MASK;
+	return ((val) << A5XX_HLSQ_GS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_GS_CONFIG_SHADEROBJOFFSET__MASK;
 }
 
 #define REG_A5XX_HLSQ_CS_CONFIG					0x0000e790
+#define A5XX_HLSQ_CS_CONFIG_ENABLED				0x00000001
+#define A5XX_HLSQ_CS_CONFIG_CONSTOBJECTOFFSET__MASK		0x000000fe
+#define A5XX_HLSQ_CS_CONFIG_CONSTOBJECTOFFSET__SHIFT		1
+static inline uint32_t A5XX_HLSQ_CS_CONFIG_CONSTOBJECTOFFSET(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_CONFIG_CONSTOBJECTOFFSET__SHIFT) & A5XX_HLSQ_CS_CONFIG_CONSTOBJECTOFFSET__MASK;
+}
+#define A5XX_HLSQ_CS_CONFIG_SHADEROBJOFFSET__MASK		0x00007f00
+#define A5XX_HLSQ_CS_CONFIG_SHADEROBJOFFSET__SHIFT		8
+static inline uint32_t A5XX_HLSQ_CS_CONFIG_SHADEROBJOFFSET(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_CONFIG_SHADEROBJOFFSET__SHIFT) & A5XX_HLSQ_CS_CONFIG_SHADEROBJOFFSET__MASK;
+}
 
 #define REG_A5XX_HLSQ_VS_CNTL					0x0000e791
+#define A5XX_HLSQ_VS_CNTL_SSBO_ENABLE				0x00000001
 #define A5XX_HLSQ_VS_CNTL_INSTRLEN__MASK			0xfffffffe
 #define A5XX_HLSQ_VS_CNTL_INSTRLEN__SHIFT			1
 static inline uint32_t A5XX_HLSQ_VS_CNTL_INSTRLEN(uint32_t val)
@@ -3531,6 +4396,7 @@
 }
 
 #define REG_A5XX_HLSQ_FS_CNTL					0x0000e792
+#define A5XX_HLSQ_FS_CNTL_SSBO_ENABLE				0x00000001
 #define A5XX_HLSQ_FS_CNTL_INSTRLEN__MASK			0xfffffffe
 #define A5XX_HLSQ_FS_CNTL_INSTRLEN__SHIFT			1
 static inline uint32_t A5XX_HLSQ_FS_CNTL_INSTRLEN(uint32_t val)
@@ -3539,6 +4405,7 @@
 }
 
 #define REG_A5XX_HLSQ_HS_CNTL					0x0000e793
+#define A5XX_HLSQ_HS_CNTL_SSBO_ENABLE				0x00000001
 #define A5XX_HLSQ_HS_CNTL_INSTRLEN__MASK			0xfffffffe
 #define A5XX_HLSQ_HS_CNTL_INSTRLEN__SHIFT			1
 static inline uint32_t A5XX_HLSQ_HS_CNTL_INSTRLEN(uint32_t val)
@@ -3547,6 +4414,7 @@
 }
 
 #define REG_A5XX_HLSQ_DS_CNTL					0x0000e794
+#define A5XX_HLSQ_DS_CNTL_SSBO_ENABLE				0x00000001
 #define A5XX_HLSQ_DS_CNTL_INSTRLEN__MASK			0xfffffffe
 #define A5XX_HLSQ_DS_CNTL_INSTRLEN__SHIFT			1
 static inline uint32_t A5XX_HLSQ_DS_CNTL_INSTRLEN(uint32_t val)
@@ -3555,6 +4423,7 @@
 }
 
 #define REG_A5XX_HLSQ_GS_CNTL					0x0000e795
+#define A5XX_HLSQ_GS_CNTL_SSBO_ENABLE				0x00000001
 #define A5XX_HLSQ_GS_CNTL_INSTRLEN__MASK			0xfffffffe
 #define A5XX_HLSQ_GS_CNTL_INSTRLEN__SHIFT			1
 static inline uint32_t A5XX_HLSQ_GS_CNTL_INSTRLEN(uint32_t val)
@@ -3563,6 +4432,7 @@
 }
 
 #define REG_A5XX_HLSQ_CS_CNTL					0x0000e796
+#define A5XX_HLSQ_CS_CNTL_SSBO_ENABLE				0x00000001
 #define A5XX_HLSQ_CS_CNTL_INSTRLEN__MASK			0xfffffffe
 #define A5XX_HLSQ_CS_CNTL_INSTRLEN__SHIFT			1
 static inline uint32_t A5XX_HLSQ_CS_CNTL_INSTRLEN(uint32_t val)
@@ -3577,20 +4447,86 @@
 #define REG_A5XX_HLSQ_CS_KERNEL_GROUP_Z				0x0000e7bb
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_0				0x0000e7b0
+#define A5XX_HLSQ_CS_NDRANGE_0_KERNELDIM__MASK			0x00000003
+#define A5XX_HLSQ_CS_NDRANGE_0_KERNELDIM__SHIFT			0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_0_KERNELDIM(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_0_KERNELDIM__SHIFT) & A5XX_HLSQ_CS_NDRANGE_0_KERNELDIM__MASK;
+}
+#define A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX__MASK			0x00000ffc
+#define A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX__SHIFT		2
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX__SHIFT) & A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX__MASK;
+}
+#define A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY__MASK			0x003ff000
+#define A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY__SHIFT		12
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY__SHIFT) & A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY__MASK;
+}
+#define A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ__MASK			0xffc00000
+#define A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ__SHIFT		22
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ__SHIFT) & A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ__MASK;
+}
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_1				0x0000e7b1
+#define A5XX_HLSQ_CS_NDRANGE_1_SIZE_X__MASK			0xffffffff
+#define A5XX_HLSQ_CS_NDRANGE_1_SIZE_X__SHIFT			0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_1_SIZE_X(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_1_SIZE_X__SHIFT) & A5XX_HLSQ_CS_NDRANGE_1_SIZE_X__MASK;
+}
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_2				0x0000e7b2
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_3				0x0000e7b3
+#define A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y__MASK			0xffffffff
+#define A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y__SHIFT			0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y__SHIFT) & A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y__MASK;
+}
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_4				0x0000e7b4
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_5				0x0000e7b5
+#define A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z__MASK			0xffffffff
+#define A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z__SHIFT			0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z__SHIFT) & A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z__MASK;
+}
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_6				0x0000e7b6
 
 #define REG_A5XX_HLSQ_CS_CNTL_0					0x0000e7b7
+#define A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID__MASK			0x000000ff
+#define A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID__SHIFT			0
+static inline uint32_t A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID__SHIFT) & A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID__MASK;
+}
+#define A5XX_HLSQ_CS_CNTL_0_UNK0__MASK				0x0000ff00
+#define A5XX_HLSQ_CS_CNTL_0_UNK0__SHIFT				8
+static inline uint32_t A5XX_HLSQ_CS_CNTL_0_UNK0(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_CNTL_0_UNK0__SHIFT) & A5XX_HLSQ_CS_CNTL_0_UNK0__MASK;
+}
+#define A5XX_HLSQ_CS_CNTL_0_UNK1__MASK				0x00ff0000
+#define A5XX_HLSQ_CS_CNTL_0_UNK1__SHIFT				16
+static inline uint32_t A5XX_HLSQ_CS_CNTL_0_UNK1(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_CNTL_0_UNK1__SHIFT) & A5XX_HLSQ_CS_CNTL_0_UNK1__MASK;
+}
+#define A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID__MASK			0xff000000
+#define A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID__SHIFT			24
+static inline uint32_t A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID__SHIFT) & A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID__MASK;
+}
 
 #define REG_A5XX_HLSQ_CS_CNTL_1					0x0000e7b8
 
@@ -3602,16 +4538,12 @@
 
 #define REG_A5XX_UNKNOWN_E7C5					0x0000e7c5
 
-#define REG_A5XX_UNKNOWN_E7CA					0x0000e7ca
-
-#define REG_A5XX_HLSQ_FS_CONSTLEN				0x0000e7d7
-
-#define REG_A5XX_HLSQ_FS_INSTRLEN				0x0000e7d8
-
 #define REG_A5XX_HLSQ_HS_CONSTLEN				0x0000e7c8
 
 #define REG_A5XX_HLSQ_HS_INSTRLEN				0x0000e7c9
 
+#define REG_A5XX_UNKNOWN_E7CA					0x0000e7ca
+
 #define REG_A5XX_HLSQ_DS_CONSTLEN				0x0000e7cd
 
 #define REG_A5XX_HLSQ_DS_INSTRLEN				0x0000e7ce
@@ -3624,11 +4556,15 @@
 
 #define REG_A5XX_UNKNOWN_E7D4					0x0000e7d4
 
+#define REG_A5XX_HLSQ_FS_CONSTLEN				0x0000e7d7
+
+#define REG_A5XX_HLSQ_FS_INSTRLEN				0x0000e7d8
+
 #define REG_A5XX_UNKNOWN_E7D9					0x0000e7d9
 
-#define REG_A5XX_HLSQ_CONTEXT_SWITCH_CS_SW_3			0x0000e7dc
+#define REG_A5XX_HLSQ_CS_CONSTLEN				0x0000e7dc
 
-#define REG_A5XX_HLSQ_CONTEXT_SWITCH_CS_SW_4			0x0000e7dd
+#define REG_A5XX_HLSQ_CS_INSTRLEN				0x0000e7dd
 
 #define REG_A5XX_RB_2D_SRC_SOLID_DW0				0x00002101
 
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_blend.c b/src/gallium/drivers/freedreno/a5xx/fd5_blend.c
index e5107a7..98b6d44 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_blend.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_blend.c
@@ -59,12 +59,12 @@
 		const struct pipe_blend_state *cso)
 {
 	struct fd5_blend_stateobj *so;
-//	enum a3xx_rop_code rop = ROP_COPY;
+	enum a3xx_rop_code rop = ROP_COPY;
 	bool reads_dest = false;
 	unsigned i, mrt_blend = 0;
 
 	if (cso->logicop_enable) {
-//		rop = cso->logicop_func;  /* maps 1:1 */
+		rop = cso->logicop_func;  /* maps 1:1 */
 
 		switch (cso->logicop_func) {
 		case PIPE_LOGICOP_NOR:
@@ -90,6 +90,8 @@
 
 	so->base = *cso;
 
+	so->lrz_write = true;  /* unless blend enabled for any MRT */
+
 	for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) {
 		const struct pipe_rt_blend_state *rt;
 
@@ -115,9 +117,8 @@
 
 
 		so->rb_mrt[i].control =
-//				A5XX_RB_MRT_CONTROL_ROP_CODE(rop) |
-//				COND(cso->logicop_enable, A5XX_RB_MRT_CONTROL_ROP_ENABLE) |
-				0x60 | /* XXX set other than RECTLIST clear blits?? */
+				A5XX_RB_MRT_CONTROL_ROP_CODE(rop) |
+				COND(cso->logicop_enable, A5XX_RB_MRT_CONTROL_ROP_ENABLE) |
 				A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask);
 
 		if (rt->blend_enable) {
@@ -126,6 +127,7 @@
 					A5XX_RB_MRT_CONTROL_BLEND |
 					A5XX_RB_MRT_CONTROL_BLEND2;
 			mrt_blend |= (1 << i);
+			so->lrz_write = false;
 		}
 
 		if (reads_dest) {
@@ -139,6 +141,8 @@
 
 	so->rb_blend_cntl = A5XX_RB_BLEND_CNTL_ENABLE_BLEND(mrt_blend) |
 		COND(cso->independent_blend_enable, A5XX_RB_BLEND_CNTL_INDEPENDENT_BLEND);
+	so->sp_blend_cntl = A5XX_SP_BLEND_CNTL_UNK8 |
+		COND(mrt_blend, A5XX_SP_BLEND_CNTL_ENABLED);
 
 	return so;
 }
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_blend.h b/src/gallium/drivers/freedreno/a5xx/fd5_blend.h
index 85c6158..6985495 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_blend.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_blend.h
@@ -46,6 +46,8 @@
 		uint32_t blend_control_alpha;
 	} rb_mrt[A5XX_MAX_RENDER_TARGETS];
 	uint32_t rb_blend_cntl;
+	uint32_t sp_blend_cntl;
+	bool lrz_write;
 };
 
 static inline struct fd5_blend_stateobj *
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c
new file mode 100644
index 0000000..2efcece
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "pipe/p_state.h"
+
+#include "fd5_compute.h"
+#include "fd5_context.h"
+#include "fd5_emit.h"
+
+struct fd5_compute_stateobj {   /* CSO handed back by fd5_create_compute_state() */
+	struct ir3_shader *shader;  /* set from ir3_shader_create_compute(); released in fd5_delete_compute_state() */
+};
+
+
+static void *  /* create_compute_state hook: new CSO for cso, or NULL if allocation fails */
+fd5_create_compute_state(struct pipe_context *pctx,
+		const struct pipe_compute_state *cso)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	struct fd5_compute_stateobj *so = CALLOC_STRUCT(fd5_compute_stateobj);
+	if (so)  /* CALLOC_STRUCT can fail; never write through a NULL state object */
+		so->shader = ir3_shader_create_compute(ctx->screen->compiler, cso, &ctx->debug);
+	return so;
+}
+
+static void  /* delete_compute_state hook: tears down a CSO from fd5_create_compute_state() */
+fd5_delete_compute_state(struct pipe_context *pctx, void *hwcso)
+{
+	struct fd5_compute_stateobj *cs = hwcso;
+	ir3_shader_destroy(cs->shader);  /* release the shader before its wrapper */
+	free(cs);
+}
+
+/* Emits the compute program registers for variant v, then the shader itself (maybe move to fd5_program?) */
+static void
+cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v)
+{
+	const struct ir3_info *i = &v->info;  /* source of max_reg / max_half_reg used below */
+	enum a3xx_threadsize thrsz;
+
+	/* note: blob uses local_size_x/y/z threshold to choose threadsize: */
+	thrsz = FOUR_QUADS;  /* hard-coded here; no threshold logic is implemented */
+
+	OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1);
+	OUT_RING(ring, 0x00000000);        /* SP_SP_CNTL */
+
+	OUT_PKT4(ring, REG_A5XX_HLSQ_CONTROL_0_REG, 1);
+	OUT_RING(ring, A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS) |
+		A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE(thrsz) |
+		0x00000880 /* XXX */);
+
+	OUT_PKT4(ring, REG_A5XX_SP_CS_CTRL_REG0, 1);
+	OUT_RING(ring, A5XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) |
+		A5XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) |
+		A5XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) |
+		A5XX_SP_CS_CTRL_REG0_BRANCHSTACK(0x3) |  // XXX need to figure this out somehow..
+		0x6 /* XXX */);
+
+	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONFIG, 1);
+	OUT_RING(ring, A5XX_HLSQ_CS_CONFIG_CONSTOBJECTOFFSET(0) |
+		A5XX_HLSQ_CS_CONFIG_SHADEROBJOFFSET(0) |
+		A5XX_HLSQ_CS_CONFIG_ENABLED);
+
+	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL, 1);
+	OUT_RING(ring, A5XX_HLSQ_CS_CNTL_INSTRLEN(v->instrlen) |
+		COND(v->has_ssbo, A5XX_HLSQ_CS_CNTL_SSBO_ENABLE));
+
+	OUT_PKT4(ring, REG_A5XX_SP_CS_CONFIG, 1);
+	OUT_RING(ring, A5XX_SP_CS_CONFIG_CONSTOBJECTOFFSET(0) |
+		A5XX_SP_CS_CONFIG_SHADEROBJOFFSET(0) |
+		A5XX_SP_CS_CONFIG_ENABLED);
+
+	unsigned constlen = align(v->constlen, 4) / 4;  /* presumably v->constlen rounded up to a multiple of 4, then in units of 4 -- TODO confirm align() */
+	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONSTLEN, 2);
+	OUT_RING(ring, constlen);          /* HLSQ_CS_CONSTLEN */
+	OUT_RING(ring, v->instrlen);       /* HLSQ_CS_INSTRLEN */
+
+	OUT_PKT4(ring, REG_A5XX_SP_CS_OBJ_START_LO, 2);
+	OUT_RELOC(ring, v->bo, 0, 0, 0);   /* SP_CS_OBJ_START_LO/HI */
+
+	OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1);
+	OUT_RING(ring, 0x1f00000);         /* HLSQ_UPDATE_CNTL (magic value, meaning not shown here) */
+
+	uint32_t local_invocation_id, work_group_id;  /* values looked up per sysval, fed to HLSQ_CS_CNTL_0 */
+	local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
+	work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID);
+
+	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL_0, 2);
+	OUT_RING(ring, A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
+			A5XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
+			A5XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
+			A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
+	OUT_RING(ring, 0x1);               /* HLSQ_CS_CNTL_1 */
+
+	fd5_emit_shader(ring, v);          /* last step; presumably uploads the instructions -- confirm in fd5_emit_shader() */
+}
+
+/* ctx->launch_grid hook: emits compute state, then one direct dispatch of info->grid groups. */
+static void
+fd5_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info)
+{
+	struct fd5_compute_stateobj *so = ctx->compute;
+	struct ir3_shader_key key = {0};
+	struct fd_ringbuffer *ring = ctx->batch->draw;
+
+	if (info->indirect)
+		return;  // TODO: indirect dispatch is not implemented; the call is dropped
+
+	struct ir3_shader_variant *v = ir3_shader_variant(so->shader, key, &ctx->debug);
+	if (!v)  /* no usable variant (e.g. compile failure): nothing can be dispatched */
+		return;
+
+	if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG)
+		cs_program_emit(ring, v);
+	fd5_emit_cs_state(ctx, ring, v);
+	ir3_emit_cs_consts(v, ring, ctx, info);
+
+	const unsigned *local_size = info->block; // v->shader->nir->info->cs.local_size;
+	/* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */
+	const unsigned work_dim = info->work_dim ? info->work_dim : 3;
+	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_NDRANGE_0, 7);
+	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_0_KERNELDIM(work_dim) |
+		A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) |
+		A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) |
+		A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1));
+	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_1_SIZE_X(local_size[0] * info->grid[0]));
+	OUT_RING(ring, 0);            /* HLSQ_CS_NDRANGE_2 */
+	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y(local_size[1] * info->grid[1]));
+	OUT_RING(ring, 0);            /* HLSQ_CS_NDRANGE_4 */
+	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z(local_size[2] * info->grid[2]));
+	OUT_RING(ring, 0);            /* HLSQ_CS_NDRANGE_6 */
+
+	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_KERNEL_GROUP_X, 3);
+	OUT_RING(ring, 1);            /* HLSQ_CS_KERNEL_GROUP_X */
+	OUT_RING(ring, 1);            /* HLSQ_CS_KERNEL_GROUP_Y */
+	OUT_RING(ring, 1);            /* HLSQ_CS_KERNEL_GROUP_Z */
+
+	OUT_PKT7(ring, CP_EXEC_CS, 4);
+	OUT_RING(ring, 0x00000000);
+	OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0]));
+	OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1]));
+	OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2]));
+}
+
+/* Installs the a5xx compute hooks: CSO create/delete on pctx, launch_grid on its fd_context. */
+void
+fd5_compute_init(struct pipe_context *pctx)
+{
+	pctx->create_compute_state = fd5_create_compute_state;
+	pctx->delete_compute_state = fd5_delete_compute_state;
+	fd_context(pctx)->launch_grid = fd5_launch_grid;
+}
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.h b/src/gallium/drivers/freedreno/a5xx/fd5_compute.h
new file mode 100644
index 0000000..d5cc8b8
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef FD5_COMPUTE_H_
+#define FD5_COMPUTE_H_
+
+#include "pipe/p_context.h"
+
+void fd5_compute_init(struct pipe_context *pctx);  /* installs the a5xx compute hooks on pctx */
+
+#endif /* FD5_COMPUTE_H_ */
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_context.c b/src/gallium/drivers/freedreno/a5xx/fd5_context.c
index 1e4e83c..3632cc5 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_context.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_context.c
@@ -24,8 +24,10 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
+#include "freedreno_query_acc.h"
 
 #include "fd5_context.h"
+#include "fd5_compute.h"
 #include "fd5_blend.h"
 #include "fd5_draw.h"
 #include "fd5_emit.h"
@@ -85,6 +87,7 @@
 	pctx->create_depth_stencil_alpha_state = fd5_zsa_state_create;
 
 	fd5_draw_init(pctx);
+	fd5_compute_init(pctx);
 	fd5_gmem_init(pctx);
 	fd5_texture_init(pctx);
 	fd5_prog_init(pctx);
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_context.h b/src/gallium/drivers/freedreno/a5xx/fd5_context.h
index 846c4b9..f6de6ca 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_context.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_context.h
@@ -80,6 +80,12 @@
 	 * due to variant change.  See fixup_shader_state()
 	 */
 	struct ir3_shader_key last_key;
+
+	/* number of active samples-passed queries: */
+	int samples_passed_queries;
+
+	/* cached state about current emitted shader program (3d): */
+	unsigned max_loc;
 };
 
 static inline struct fd5_context *
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
index d9fce2f..d1f1d03 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
@@ -42,7 +42,7 @@
 
 static void
 draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		struct fd5_emit *emit)
+		struct fd5_emit *emit, unsigned index_offset)
 {
 	const struct pipe_draw_info *info = emit->info;
 	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
@@ -53,17 +53,17 @@
 		fd5_emit_vertex_bufs(ring, emit);
 
 	OUT_PKT4(ring, REG_A5XX_VFD_INDEX_OFFSET, 2);
-	OUT_RING(ring, info->indexed ? info->index_bias : info->start); /* VFD_INDEX_OFFSET */
+	OUT_RING(ring, info->index_size ? info->index_bias : info->start); /* VFD_INDEX_OFFSET */
 	OUT_RING(ring, info->start_instance);   /* ??? UNKNOWN_2209 */
 
 	OUT_PKT4(ring, REG_A5XX_PC_RESTART_INDEX, 1);
 	OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
 			info->restart_index : 0xffffffff);
 
-	fd5_emit_render_cntl(ctx, false);
+	fd5_emit_render_cntl(ctx, false, emit->key.binning_pass);
 	fd5_draw_emit(ctx->batch, ring, primtype,
 			emit->key.binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY,
-			info);
+			info, index_offset);
 }
 
 /* fixup dirty shader state in case some "unrelated" (from the state-
@@ -77,44 +77,23 @@
 	struct ir3_shader_key *last_key = &fd5_ctx->last_key;
 
 	if (!ir3_shader_key_equal(last_key, key)) {
-		if (last_key->has_per_samp || key->has_per_samp) {
-			if ((last_key->vsaturate_s != key->vsaturate_s) ||
-					(last_key->vsaturate_t != key->vsaturate_t) ||
-					(last_key->vsaturate_r != key->vsaturate_r) ||
-					(last_key->vastc_srgb != key->vastc_srgb))
-				ctx->dirty |= FD_SHADER_DIRTY_VP;
-
-			if ((last_key->fsaturate_s != key->fsaturate_s) ||
-					(last_key->fsaturate_t != key->fsaturate_t) ||
-					(last_key->fsaturate_r != key->fsaturate_r) ||
-					(last_key->fastc_srgb != key->fastc_srgb))
-				ctx->dirty |= FD_SHADER_DIRTY_FP;
+		if (ir3_shader_key_changes_fs(last_key, key)) {
+			ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG;
+			ctx->dirty |= FD_DIRTY_PROG;
 		}
 
-		if (last_key->vclamp_color != key->vclamp_color)
-			ctx->dirty |= FD_SHADER_DIRTY_VP;
-
-		if (last_key->fclamp_color != key->fclamp_color)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->color_two_side != key->color_two_side)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->half_precision != key->half_precision)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->rasterflat != key->rasterflat)
-			ctx->dirty |= FD_SHADER_DIRTY_FP;
-
-		if (last_key->ucp_enables != key->ucp_enables)
-			ctx->dirty |= FD_SHADER_DIRTY_FP | FD_SHADER_DIRTY_VP;
+		if (ir3_shader_key_changes_vs(last_key, key)) {
+			ctx->dirty_shader[PIPE_SHADER_VERTEX] |= FD_DIRTY_SHADER_PROG;
+			ctx->dirty |= FD_DIRTY_PROG;
+		}
 
 		fd5_ctx->last_key = *key;
 	}
 }
 
 static bool
-fd5_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
+fd5_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
+             unsigned index_offset)
 {
 	struct fd5_context *fd5_ctx = fd5_context(ctx);
 	struct fd5_emit emit = {
@@ -149,23 +128,30 @@
 	fixup_shader_state(ctx, &emit.key);
 
 	unsigned dirty = ctx->dirty;
+	const struct ir3_shader_variant *vp = fd5_emit_get_vp(&emit);
+	const struct ir3_shader_variant *fp = fd5_emit_get_fp(&emit);
 
 	/* do regular pass first, since that is more likely to fail compiling: */
 
-	if (!(fd5_emit_get_vp(&emit) && fd5_emit_get_fp(&emit)))
+	if (!vp || !fp)
 		return false;
 
+	/* figure out whether we need to disable LRZ write for binning
+	 * pass using draw pass's fp:
+	 */
+	emit.no_lrz_write = fp->writes_pos || fp->has_kill;
+
 	emit.key.binning_pass = false;
 	emit.dirty = dirty;
 
-	draw_impl(ctx, ctx->batch->draw, &emit);
+	draw_impl(ctx, ctx->batch->draw, &emit, index_offset);
 
-//	/* and now binning pass: */
-//	emit.key.binning_pass = true;
-//	emit.dirty = dirty & ~(FD_DIRTY_BLEND);
-//	emit.vp = NULL;   /* we changed key so need to refetch vp */
-//	emit.fp = NULL;
-//	draw_impl(ctx, ctx->batch->binning, &emit);
+	/* and now binning pass: */
+	emit.key.binning_pass = true;
+	emit.dirty = dirty & ~(FD_DIRTY_BLEND);
+	emit.vp = NULL;   /* we changed key so need to refetch vp */
+	emit.fp = NULL;
+	draw_impl(ctx, ctx->batch->binning, &emit, index_offset);
 
 	if (emit.streamout_mask) {
 		struct fd_ringbuffer *ring = ctx->batch->draw;
@@ -178,10 +164,104 @@
 		}
 	}
 
+	fd_context_all_clean(ctx);
+
 	return true;
 }
 
+/* True only for the three Z32 depth formats compared below; every other
+ * format yields false.
+ */
+static bool is_z32(enum pipe_format format)
+{
+	const bool z32_float_s8 = (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+	const bool z32_unorm = (format == PIPE_FORMAT_Z32_UNORM);
+	const bool z32_float = (format == PIPE_FORMAT_Z32_FLOAT);
+
+	return z32_float_s8 || z32_unorm || z32_float;
+}
+
 static void
+fd5_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth)
+{
+	struct fd_ringbuffer *ring;
+	uint32_t clear = util_pack_z(PIPE_FORMAT_Z16_UNORM, depth);  /* depth packed as Z16_UNORM; written to RB_CLEAR_COLOR_DW0 below */
+	/* Records, on batch->lrz_clear, a fast-clear blit of zsbuf->lrz to the packed depth value. */
+	// TODO mid-frame clears (i.e. app doing crazy stuff)??  Maybe worth
+	// splitting both clear and lrz clear out into their own rb's.  And
+	// just throw away any draws prior to clear.  (Anything not fullscreen
+	// clear, just fallback to generic path that treats it as a normal
+	// draw.)
+
+	if (!batch->lrz_clear) {  /* ring is created lazily on the first LRZ clear of this batch */
+		batch->lrz_clear = fd_ringbuffer_new(batch->ctx->screen->pipe, 0x1000);
+		fd_ringbuffer_set_parent(batch->lrz_clear, batch->gmem);
+	}
+
+	ring = batch->lrz_clear;
+
+	OUT_WFI5(ring);
+
+	OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1);
+	OUT_RING(ring, 0x10000000);  /* RB_CCU_CNTL (magic value, meaning not shown here) */
+
+	OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1);
+	OUT_RING(ring, 0x20fffff);   /* HLSQ_UPDATE_CNTL (magic value, meaning not shown here) */
+
+	OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1);
+	OUT_RING(ring, A5XX_GRAS_SU_CNTL_LINEHALFWIDTH(0.0));
+
+	OUT_PKT4(ring, REG_A5XX_GRAS_CNTL, 1);
+	OUT_RING(ring, 0x00000000);
+
+	OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1);
+	OUT_RING(ring, 0x00000181);  /* GRAS_CL_CNTL (magic value, meaning not shown here) */
+
+	OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1);
+	OUT_RING(ring, 0x00000000);
+
+	OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(0), 5);  /* MRT0 is pointed at the LRZ buffer as a linear R16_UNORM target */
+	OUT_RING(ring, A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(RB5_R16_UNORM) |
+			A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(TILE5_LINEAR) |
+			A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(WZYX));
+	OUT_RING(ring, A5XX_RB_MRT_PITCH(zsbuf->lrz_pitch * 2));  /* presumably bytes per row (2 bytes per R16 texel) -- TODO confirm */
+	OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(fd_bo_size(zsbuf->lrz)));
+	OUT_RELOCW(ring, zsbuf->lrz, 0x1000, 0, 0);  /* NOTE(review): 0x1000 looks like a byte offset into the LRZ bo -- confirm */
+
+	OUT_PKT4(ring, REG_A5XX_RB_RENDER_CNTL, 1);
+	OUT_RING(ring, 0x00000000);
+
+	OUT_PKT4(ring, REG_A5XX_RB_DEST_MSAA_CNTL, 1);
+	OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE));
+
+	OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1);
+	OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(BLIT_MRT0));
+
+	OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1);
+	OUT_RING(ring, A5XX_RB_CLEAR_CNTL_FAST_CLEAR |
+			A5XX_RB_CLEAR_CNTL_MASK(0xf));
+
+	OUT_PKT4(ring, REG_A5XX_RB_CLEAR_COLOR_DW0, 1);
+	OUT_RING(ring, clear);  /* RB_CLEAR_COLOR_DW0 */
+
+	OUT_PKT4(ring, REG_A5XX_VSC_RESOLVE_CNTL, 2);
+	OUT_RING(ring, A5XX_VSC_RESOLVE_CNTL_X(zsbuf->lrz_width) |
+			 A5XX_VSC_RESOLVE_CNTL_Y(zsbuf->lrz_height));
+	OUT_RING(ring, 0x00000000);   // XXX UNKNOWN_0CDE
+
+	OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1);
+	OUT_RING(ring, A5XX_RB_CNTL_BYPASS);
+
+	OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2);  /* resolve window: (0,0) .. (lrz_width-1, lrz_height-1) */
+	OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(0) |
+			A5XX_RB_RESOLVE_CNTL_1_Y(0));
+	OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(zsbuf->lrz_width - 1) |
+			A5XX_RB_RESOLVE_CNTL_2_Y(zsbuf->lrz_height - 1));
+
+	fd5_emit_blit(batch->ctx, ring);
+}
+
+static bool
 fd5_clear(struct fd_context *ctx, unsigned buffers,
 		const union pipe_color_union *color, double depth, unsigned stencil)
 {
@@ -189,14 +269,16 @@
 	struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
 	struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
 
-	/* TODO handle scissor.. or fallback to slow-clear? */
+	if ((buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) &&
+			is_z32(pfb->zsbuf->format))
+		return false;
 
 	ctx->batch->max_scissor.minx = MIN2(ctx->batch->max_scissor.minx, scissor->minx);
 	ctx->batch->max_scissor.miny = MIN2(ctx->batch->max_scissor.miny, scissor->miny);
 	ctx->batch->max_scissor.maxx = MAX2(ctx->batch->max_scissor.maxx, scissor->maxx);
 	ctx->batch->max_scissor.maxy = MAX2(ctx->batch->max_scissor.maxy, scissor->maxy);
 
-	fd5_emit_render_cntl(ctx, true);
+	fd5_emit_render_cntl(ctx, true, false);
 
 	if (buffers & PIPE_CLEAR_COLOR) {
 		for (int i = 0; i < pfb->nr_cbufs; i++) {
@@ -286,11 +368,21 @@
 		OUT_RING(ring, clear);    /* RB_CLEAR_COLOR_DW0 */
 
 		fd5_emit_blit(ctx, ring);
+
+		if (pfb->zsbuf && (buffers & PIPE_CLEAR_DEPTH)) {
+			struct fd_resource *zsbuf = fd_resource(pfb->zsbuf->texture);
+			if (zsbuf->lrz) {
+				zsbuf->lrz_valid = true;
+				fd5_clear_lrz(ctx->batch, zsbuf, depth);
+			}
+		}
 	}
 
 	/* disable fast clear to not interfere w/ gmem->mem, etc.. */
 	OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1);
 	OUT_RING(ring, 0x00000000);   /* RB_CLEAR_CNTL */
+
+	return true;
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_draw.h b/src/gallium/drivers/freedreno/a5xx/fd5_draw.h
index 8ce70d3..de210e4 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_draw.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_draw.h
@@ -80,22 +80,21 @@
 fd5_draw_emit(struct fd_batch *batch, struct fd_ringbuffer *ring,
 		enum pc_di_primtype primtype,
 		enum pc_di_vis_cull_mode vismode,
-		const struct pipe_draw_info *info)
+		const struct pipe_draw_info *info,
+		unsigned index_offset)
 {
 	struct pipe_resource *idx_buffer = NULL;
 	enum a4xx_index_size idx_type;
 	enum pc_di_src_sel src_sel;
 	uint32_t idx_size, idx_offset;
 
-	if (info->indexed) {
-		struct pipe_index_buffer *idx = &batch->ctx->indexbuf;
+	if (info->index_size) {
+		assert(!info->has_user_indices);
 
-		assert(!idx->user_buffer);
-
-		idx_buffer = idx->buffer;
-		idx_type = fd4_size2indextype(idx->index_size);
-		idx_size = idx->index_size * info->count;
-		idx_offset = idx->offset + (info->start * idx->index_size);
+		idx_buffer = info->index.resource;
+		idx_type = fd4_size2indextype(info->index_size);
+		idx_size = info->index_size * info->count;
+		idx_offset = index_offset + info->start * info->index_size;
 		src_sel = DI_SRC_SEL_DMA;
 	} else {
 		idx_buffer = NULL;
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_emit.c b/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
index a12b143..21931e9 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
@@ -43,11 +43,6 @@
 #include "fd5_format.h"
 #include "fd5_zsa.h"
 
-static const enum adreno_state_block sb[] = {
-	[SHADER_VERTEX]   = SB_VERT_SHADER,
-	[SHADER_FRAGMENT] = SB_FRAG_SHADER,
-};
-
 /* regid:          base const register
  * prsc or dwords: buffer containing constant values
  * sizedwords:     size of const value buffer
@@ -58,32 +53,32 @@
 		const uint32_t *dwords, struct pipe_resource *prsc)
 {
 	uint32_t i, sz;
-	enum adreno_state_src src;
+	enum a4xx_state_src src;
 
 	debug_assert((regid % 4) == 0);
 	debug_assert((sizedwords % 4) == 0);
 
 	if (prsc) {
 		sz = 0;
-		src = 0x2;  // TODO ??
+		src = SS4_INDIRECT;
 	} else {
 		sz = sizedwords;
-		src = SS_DIRECT;
+		src = SS4_DIRECT;
 	}
 
-	OUT_PKT7(ring, CP_LOAD_STATE, 3 + sz);
-	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
-			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
-			CP_LOAD_STATE_0_NUM_UNIT(sizedwords/4));
+	OUT_PKT7(ring, CP_LOAD_STATE4, 3 + sz);
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid/4) |
+			CP_LOAD_STATE4_0_STATE_SRC(src) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(type)) |
+			CP_LOAD_STATE4_0_NUM_UNIT(sizedwords/4));
 	if (prsc) {
 		struct fd_bo *bo = fd_resource(prsc)->bo;
 		OUT_RELOC(ring, bo, offset,
-				CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS), 0);
+				CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS), 0);
 	} else {
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
-		OUT_RING(ring, CP_LOAD_STATE_2_EXT_SRC_ADDR_HI(0));
+		OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) |
+				CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS));
+		OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
 		dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
 	}
 	for (i = 0; i < sz; i++) {
@@ -100,14 +95,14 @@
 
 	debug_assert((regid % 4) == 0);
 
-	OUT_PKT7(ring, CP_LOAD_STATE, 3 + (2 * anum));
-	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
-			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
-			CP_LOAD_STATE_0_NUM_UNIT(anum/2));
-	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
-	OUT_RING(ring, CP_LOAD_STATE_2_EXT_SRC_ADDR_HI(0));
+	OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (2 * anum));
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid/4) |
+			CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(type)) |
+			CP_LOAD_STATE4_0_NUM_UNIT(anum/2));
+	OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) |
+			CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS));
+	OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
 
 	for (i = 0; i < num; i++) {
 		if (prscs[i]) {
@@ -132,38 +127,6 @@
  * the same as a6xx then move this somewhere common ;-)
  *
  * Entry layout looks like (total size, 0x60 bytes):
- *
- *   offset | description
- *   -------+-------------
- *     0x00 | fp32[0]
- *          | fp32[1]
- *          | fp32[2]
- *          | fp32[3]
- *     0x10 | uint16[0]
- *          | uint16[1]
- *          | uint16[2]
- *          | uint16[3]
- *     0x18 | int16[0]
- *          | int16[1]
- *          | int16[2]
- *          | int16[3]
- *     0x20 | fp16[0]
- *          | fp16[1]
- *          | fp16[2]
- *          | fp16[3]
- *     0x28 | ?? maybe padding ??
- *     0x30 | uint8[0]
- *          | uint8[1]
- *          | uint8[2]
- *          | uint8[3]
- *     0x34 | int8[0]
- *          | int8[1]
- *          | int8[2]
- *          | int8[3]
- *     0x38 | ?? maybe padding ??
- *
- * Some uncertainty, because not clear that this actually works properly
- * with blob, so who knows..
  */
 
 struct PACKED bcolor_entry {
@@ -171,22 +134,25 @@
 	uint16_t ui16[4];
 	int16_t  si16[4];
 	uint16_t fp16[4];
-	uint8_t  __pad0[8];
+	uint16_t rgb565;
+	uint16_t rgb5a1;
+	uint16_t rgba4;
+	uint8_t __pad0[2];
 	uint8_t  ui8[4];
 	int8_t   si8[4];
-	uint8_t  __pad1[40];
+	uint32_t rgb10a2;
+	uint32_t z24; /* also s8? */
+	uint8_t  __pad1[32];
 };
 
 #define FD5_BORDER_COLOR_SIZE        0x60
 #define FD5_BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * FD5_BORDER_COLOR_SIZE)
-#define FD5_BORDER_COLOR_OFFSET      8   /* TODO probably should be dynamic */
 
 static void
 setup_border_colors(struct fd_texture_stateobj *tex, struct bcolor_entry *entries)
 {
 	unsigned i, j;
-
-	debug_assert(tex->num_samplers < FD5_BORDER_COLOR_OFFSET);  // TODO
+	STATIC_ASSERT(sizeof(struct bcolor_entry) == FD5_BORDER_COLOR_SIZE);
 
 	for (i = 0; i < tex->num_samplers; i++) {
 		struct bcolor_entry *e = &entries[i];
@@ -213,6 +179,12 @@
 		const struct util_format_description *desc =
 				util_format_description(tex->textures[i]->format);
 
+		e->rgb565 = 0;
+		e->rgb5a1 = 0;
+		e->rgba4 = 0;
+		e->rgb10a2 = 0;
+		e->z24 = 0;
+
 		for (j = 0; j < 4; j++) {
 			int c = desc->swizzle[j];
 
@@ -220,23 +192,62 @@
 				continue;
 
 			if (desc->channel[c].pure_integer) {
-				float f = bc->i[c];
-
-				e->fp32[j] = fui(f);
-				e->fp16[j] = util_float_to_half(f);
-				e->ui16[j] = bc->ui[c];
-				e->si16[j] = bc->i[c];
-				e->ui8[j]  = bc->ui[c];
-				e->si8[j]  = bc->i[c];
+				uint16_t clamped;
+				switch (desc->channel[c].size) {
+				case 2:
+					assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
+					clamped = CLAMP(bc->ui[j], 0, 0x3);
+					break;
+				case 8:
+					if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
+						clamped = CLAMP(bc->i[j], -128, 127);
+					else
+						clamped = CLAMP(bc->ui[j], 0, 255);
+					break;
+				case 10:
+					assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
+					clamped = CLAMP(bc->ui[j], 0, 0x3ff);
+					break;
+				case 16:
+					if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
+						clamped = CLAMP(bc->i[j], -32768, 32767);
+					else
+						clamped = CLAMP(bc->ui[j], 0, 65535);
+					break;
+				default:
+					assert(!"Unexpected bit size");
+				case 32:
+					clamped = 0;
+					break;
+				}
+				e->fp32[c] = bc->ui[j];
+				e->fp16[c] = clamped;
 			} else {
-				float f = bc->f[c];
+				float f = bc->f[j];
+				float f_u = CLAMP(f, 0, 1);
+				float f_s = CLAMP(f, -1, 1);
 
-				e->fp32[j] = fui(f);
-				e->fp16[j] = util_float_to_half(f);
-				e->ui16[j] = f * 65535.0;
-				e->si16[j] = f * 32767.5;
-				e->ui8[j]  = f * 255.0;
-				e->si8[j]  = f * 128.0;
+				e->fp32[c] = fui(f);
+				e->fp16[c] = util_float_to_half(f);
+				e->ui16[c] = f_u * 0xffff;
+				e->si16[c] = f_s * 0x7fff;
+				e->ui8[c]  = f_u * 0xff;
+				e->si8[c]  = f_s * 0x7f;
+				if (c == 1)
+					e->rgb565 |= (int)(f_u * 0x3f) << 5;
+				else if (c < 3)
+					e->rgb565 |= (int)(f_u * 0x1f) << (c ? 11 : 0);
+				if (c == 3)
+					e->rgb5a1 |= (f_u > 0.5) ? 0x8000 : 0;
+				else
+					e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5);
+				if (c == 3)
+					e->rgb10a2 |= (int)(f_u * 0x3) << 30;
+				else
+					e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10);
+				e->rgba4 |= (int)(f_u * 0xf) << (c * 4);
+				if (c == 0)
+					e->z24 = f_u * 0xffffff;
 			}
 		}
 
@@ -265,8 +276,9 @@
 
 	entries = ptr;
 
-	setup_border_colors(&ctx->verttex, &entries[0]);
-	setup_border_colors(&ctx->fragtex, &entries[ctx->verttex.num_samplers]);
+	setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0]);
+	setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT],
+			&entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers]);
 
 	OUT_PKT4(ring, REG_A5XX_TPL1_TP_BORDER_COLOR_BASE_ADDR_LO, 2);
 	OUT_RELOC(ring, fd_resource(fd5_ctx->border_color_buf)->bo, off, 0, 0);
@@ -276,22 +288,22 @@
 
 static bool
 emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		enum adreno_state_block sb, struct fd_texture_stateobj *tex)
+		enum a4xx_state_block sb, struct fd_texture_stateobj *tex)
 {
 	bool needs_border = false;
-	unsigned bcolor_offset = (sb == SB_FRAG_TEX) ? ctx->verttex.num_samplers : 0;
+	unsigned bcolor_offset = (sb == SB4_FS_TEX) ? ctx->tex[PIPE_SHADER_VERTEX].num_samplers : 0;
 	unsigned i;
 
 	if (tex->num_samplers > 0) {
 		/* output sampler state: */
-		OUT_PKT7(ring, CP_LOAD_STATE, 3 + (4 * tex->num_samplers));
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
-				CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				CP_LOAD_STATE_0_NUM_UNIT(tex->num_samplers));
-		OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) |
-				CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
-		OUT_RING(ring, CP_LOAD_STATE_2_EXT_SRC_ADDR_HI(0));
+		OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (4 * tex->num_samplers));
+		OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+				CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+				CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+				CP_LOAD_STATE4_0_NUM_UNIT(tex->num_samplers));
+		OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) |
+				CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
+		OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
 		for (i = 0; i < tex->num_samplers; i++) {
 			static const struct fd5_sampler_stateobj dummy_sampler = {};
 			const struct fd5_sampler_stateobj *sampler = tex->samplers[i] ?
@@ -311,14 +323,14 @@
 		unsigned num_textures = tex->num_textures;
 
 		/* emit texture state: */
-		OUT_PKT7(ring, CP_LOAD_STATE, 3 + (12 * num_textures));
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
-				CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				CP_LOAD_STATE_0_NUM_UNIT(num_textures));
-		OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) |
-				CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
-		OUT_RING(ring, CP_LOAD_STATE_2_EXT_SRC_ADDR_HI(0));
+		OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (12 * num_textures));
+		OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+				CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+				CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+				CP_LOAD_STATE4_0_NUM_UNIT(num_textures));
+		OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) |
+				CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
+		OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
 		for (i = 0; i < tex->num_textures; i++) {
 			static const struct fd5_pipe_sampler_view dummy_view = {};
 			const struct fd5_pipe_sampler_view *view = tex->textures[i] ?
@@ -349,6 +361,72 @@
 	return needs_border;
 }
 
+static void
+emit_ssbos(struct fd_context *ctx, struct fd_ringbuffer *ring,
+		enum a4xx_state_block sb, struct fd_shaderbuf_stateobj *so)
+{
+	unsigned count = util_last_bit(so->enabled_mask);  /* presumably index of the highest enabled slot + 1 -- confirm util_last_bit() */
+	/* Emits three CP_LOAD_STATE4 packets (state types 0, 1, 2) to block sb, each covering slots [0, count). */
+	if (count == 0)
+		return;
+
+	OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (4 * count));  /* state type 0: 4 dwords per slot */
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+			CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE4_0_NUM_UNIT(count));
+	OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(0) |
+			CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
+	OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
+	for (unsigned i = 0; i < count; i++) {
+		struct pipe_shader_buffer *buf = &so->sb[i];
+		if (buf->buffer) {
+			struct fd_resource *rsc = fd_resource(buf->buffer);
+			OUT_RELOCW(ring, rsc->bo, 0, 0, 0);  /* buffer address; fills the same two dwords as the zero pair in the else branch */
+		} else {
+			OUT_RING(ring, 0x00000000);  /* unbound slot: zeros in place of the address */
+			OUT_RING(ring, 0x00000000);
+		}
+		OUT_RING(ring, 0x00000000);  /* remaining two dwords of the slot are always zero */
+		OUT_RING(ring, 0x00000000);
+	}
+
+	OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (2 * count));  /* state type 1: 2 dwords per slot */
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+			CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE4_0_NUM_UNIT(count));
+	OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(1) |
+			CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
+	OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
+	for (unsigned i = 0; i < count; i++) {
+		struct pipe_shader_buffer *buf = &so->sb[i];
+
+		// TODO maybe offset encoded somewhere here??
+		OUT_RING(ring, (buf->buffer_size << 16));  /* NOTE(review): the shift discards the top 16 bits of buffer_size -- confirm sizes >= 64K are handled */
+		OUT_RING(ring, 0x00000000);
+	}
+
+	OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (2 * count));  /* state type 2: 2 dwords per slot */
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+			CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE4_0_NUM_UNIT(count));
+	OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(2) |
+			CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
+	OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
+	for (unsigned i = 0; i < count; i++) {
+		struct pipe_shader_buffer *buf = &so->sb[i];
+		if (buf->buffer) {
+			struct fd_resource *rsc = fd_resource(buf->buffer);
+			OUT_RELOCW(ring, rsc->bo, 0, 0, 0);  /* NOTE(review): no field of buf other than buffer is read here, so any per-binding offset is not applied */
+		} else {
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+		}
+	}
+}
+
 void
 fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 {
@@ -363,7 +441,7 @@
 			struct pipe_vertex_element *elem = &vtx->vtx->pipe[i];
 			const struct pipe_vertex_buffer *vb =
 					&vtx->vertexbuf.vb[elem->vertex_buffer_index];
-			struct fd_resource *rsc = fd_resource(vb->buffer);
+			struct fd_resource *rsc = fd_resource(vb->buffer.resource);
 			enum pipe_format pfmt = elem->src_format;
 			enum a5xx_vtx_fmt fmt = fd5_pipe2vtx(pfmt);
 			bool isint = util_format_is_pure_integer(pfmt);
@@ -380,6 +458,7 @@
 			OUT_RING(ring, A5XX_VFD_DECODE_INSTR_IDX(j) |
 					A5XX_VFD_DECODE_INSTR_FORMAT(fmt) |
 					COND(elem->instance_divisor, A5XX_VFD_DECODE_INSTR_INSTANCED) |
+					A5XX_VFD_DECODE_INSTR_SWAP(fd5_pipe2swap(pfmt)) |
 					A5XX_VFD_DECODE_INSTR_UNK30 |
 					COND(!isint, A5XX_VFD_DECODE_INSTR_FLOAT));
 			OUT_RING(ring, MAX2(1, elem->instance_divisor)); /* VFD_DECODE[j].STEP_RATE */
@@ -400,15 +479,15 @@
 fd5_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		struct fd5_emit *emit)
 {
+	struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
 	const struct ir3_shader_variant *vp = fd5_emit_get_vp(emit);
 	const struct ir3_shader_variant *fp = fd5_emit_get_fp(emit);
-	uint32_t dirty = emit->dirty;
+	const enum fd_dirty_3d_state dirty = emit->dirty;
 	bool needs_border = false;
 
 	emit_marker5(ring, 5);
 
 	if ((dirty & FD_DIRTY_FRAMEBUFFER) && !emit->key.binning_pass) {
-		struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
 		unsigned char mrt_comp[A5XX_MAX_RENDER_TARGETS] = {0};
 
 		for (unsigned i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) {
@@ -428,7 +507,6 @@
 
 	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) {
 		struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa);
-		struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
 		uint32_t rb_alpha_control = zsa->rb_alpha_control;
 
 		if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])))
@@ -441,13 +519,33 @@
 		OUT_RING(ring, zsa->rb_stencil_control);
 	}
 
+	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_BLEND | FD_DIRTY_PROG)) {
+		struct fd5_blend_stateobj *blend = fd5_blend_stateobj(ctx->blend);
+		struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa);
+
+		if (pfb->zsbuf) {
+			struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
+			uint32_t gras_lrz_cntl = zsa->gras_lrz_cntl;
+
+			if (emit->no_lrz_write || !rsc->lrz || !rsc->lrz_valid)
+				gras_lrz_cntl = 0;
+			else if (emit->key.binning_pass && blend->lrz_write && zsa->lrz_write)
+				gras_lrz_cntl |= A5XX_GRAS_LRZ_CNTL_LRZ_WRITE;
+
+			OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1);
+			OUT_RING(ring, gras_lrz_cntl);
+		}
+	}
+
 	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) {
 		struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa);
 		struct pipe_stencil_ref *sr = &ctx->stencil_ref;
 
-		OUT_PKT4(ring, REG_A5XX_RB_STENCILREFMASK, 1);
+		OUT_PKT4(ring, REG_A5XX_RB_STENCILREFMASK, 2);
 		OUT_RING(ring, zsa->rb_stencilrefmask |
 				A5XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[0]));
+		OUT_RING(ring, zsa->rb_stencilrefmask_bf |
+				A5XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1]));
 	}
 
 	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
@@ -466,39 +564,6 @@
 				COND(fragz && fp->frag_coord, A5XX_GRAS_SU_DEPTH_PLANE_CNTL_UNK1));
 	}
 
-	if (dirty & FD_DIRTY_RASTERIZER) {
-		struct fd5_rasterizer_stateobj *rasterizer =
-				fd5_rasterizer_stateobj(ctx->rasterizer);
-
-		OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1);
-		OUT_RING(ring, rasterizer->gras_su_cntl);
-
-		OUT_PKT4(ring, REG_A5XX_GRAS_SU_POINT_MINMAX, 2);
-		OUT_RING(ring, rasterizer->gras_su_point_minmax);
-		OUT_RING(ring, rasterizer->gras_su_point_size);
-
-		OUT_PKT4(ring, REG_A5XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
-		OUT_RING(ring, rasterizer->gras_su_poly_offset_scale);
-		OUT_RING(ring, rasterizer->gras_su_poly_offset_offset);
-		OUT_RING(ring, rasterizer->gras_su_poly_offset_clamp);
-	}
-
-	/* NOTE: since primitive_restart is not actually part of any
-	 * state object, we need to make sure that we always emit
-	 * PRIM_VTX_CNTL.. either that or be more clever and detect
-	 * when it changes.
-	 */
-	if (emit->info) {
-		struct fd5_rasterizer_stateobj *rast =
-			fd5_rasterizer_stateobj(ctx->rasterizer);
-		uint32_t val = rast->pc_prim_vtx_cntl;
-
-		val |= COND(vp->writes_psize, A5XX_PC_PRIM_VTX_CNTL_PSIZE);
-
-		OUT_PKT4(ring, REG_A5XX_PC_PRIM_VTX_CNTL, 1);
-		OUT_RING(ring, val);
-	}
-
 	if (dirty & FD_DIRTY_SCISSOR) {
 		struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
 
@@ -532,10 +597,52 @@
 	}
 
 	if (dirty & FD_DIRTY_PROG)
-		fd5_program_emit(ring, emit);
+		fd5_program_emit(ctx, ring, emit);
 
-	if (dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER)) {
-		struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
+	if (dirty & FD_DIRTY_RASTERIZER) {
+		struct fd5_rasterizer_stateobj *rasterizer =
+				fd5_rasterizer_stateobj(ctx->rasterizer);
+
+		OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1);
+		OUT_RING(ring, rasterizer->gras_su_cntl);
+
+		OUT_PKT4(ring, REG_A5XX_GRAS_SU_POINT_MINMAX, 2);
+		OUT_RING(ring, rasterizer->gras_su_point_minmax);
+		OUT_RING(ring, rasterizer->gras_su_point_size);
+
+		OUT_PKT4(ring, REG_A5XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
+		OUT_RING(ring, rasterizer->gras_su_poly_offset_scale);
+		OUT_RING(ring, rasterizer->gras_su_poly_offset_offset);
+		OUT_RING(ring, rasterizer->gras_su_poly_offset_clamp);
+
+		OUT_PKT4(ring, REG_A5XX_PC_RASTER_CNTL, 1);
+		OUT_RING(ring, rasterizer->pc_raster_cntl);
+
+		OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1);
+		OUT_RING(ring, rasterizer->gras_cl_clip_cntl);
+	}
+
+	/* NOTE: this must come after program emit, because some registers
+	 * overlap (e.g. PC_PRIMITIVE_CNTL) and we rely on values cached by
+	 * fd5_program_emit() to avoid having to re-emit the program every
+	 * time the rasterizer state changes.
+	 *
+	 * Since the primitive restart state is not part of a tracked object, we
+	 * re-emit this register every time.
+	 */
+	if (emit->info && ctx->rasterizer) {
+		struct fd5_rasterizer_stateobj *rasterizer =
+				fd5_rasterizer_stateobj(ctx->rasterizer);
+		unsigned max_loc = fd5_context(ctx)->max_loc;
+
+		OUT_PKT4(ring, REG_A5XX_PC_PRIMITIVE_CNTL, 1);
+		OUT_RING(ring, rasterizer->pc_primitive_cntl |
+				 A5XX_PC_PRIMITIVE_CNTL_STRIDE_IN_VPC(max_loc) |
+				 COND(emit->info->primitive_restart && emit->info->index_size,
+					  A5XX_PC_PRIMITIVE_CNTL_PRIMITIVE_RESTART));
+	}
+
+	if (dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
 		uint32_t posz_regid = ir3_find_output_regid(fp, FRAG_RESULT_DEPTH);
 		unsigned nr = pfb->nr_cbufs;
 
@@ -555,9 +662,9 @@
 	}
 
 	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
-		ir3_emit_consts(vp, ring, ctx, emit->info, dirty);
+		ir3_emit_vs_consts(vp, ring, ctx, emit->info);
 		if (!emit->key.binning_pass)
-			ir3_emit_consts(fp, ring, ctx, emit->info, dirty);
+			ir3_emit_fs_consts(fp, ring, ctx);
 
 		struct pipe_stream_output_info *info = &vp->shader->stream_output;
 		if (info->num_outputs) {
@@ -595,8 +702,7 @@
 		uint32_t i;
 
 		for (i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) {
-			enum pipe_format format = pipe_surface_format(
-					ctx->batch->framebuffer.cbufs[i]);
+			enum pipe_format format = pipe_surface_format(pfb->cbufs[i]);
 			bool is_int = util_format_is_pure_integer(format);
 			bool has_alpha = util_format_has_alpha(format);
 			uint32_t control = blend->rb_mrt[i].control;
@@ -604,7 +710,7 @@
 
 			if (is_int) {
 				control &= A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK;
-//				control |= A5XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY);
+				control |= A5XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY);
 			}
 
 			if (has_alpha) {
@@ -626,7 +732,7 @@
 				A5XX_RB_BLEND_CNTL_SAMPLE_MASK(0xffff));
 
 		OUT_PKT4(ring, REG_A5XX_SP_BLEND_CNTL, 1);
-		OUT_RING(ring, 0x00000100);
+		OUT_RING(ring, blend->sp_blend_cntl);
 	}
 
 	if (dirty & FD_DIRTY_BLEND_COLOR) {
@@ -651,30 +757,65 @@
 		OUT_RING(ring, A5XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
 	}
 
-	if (dirty & FD_DIRTY_VERTTEX) {
-		if (vp->has_samp) {
-			needs_border |= emit_textures(ctx, ring, SB_VERT_TEX, &ctx->verttex);
-			OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 1);
-			OUT_RING(ring, ctx->verttex.num_textures);
-		} else {
-			dirty &= ~FD_DIRTY_VERTTEX;
-		}
+	if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_TEX) {
+		needs_border |= emit_textures(ctx, ring, SB4_VS_TEX,
+				&ctx->tex[PIPE_SHADER_VERTEX]);
+		OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 1);
+		OUT_RING(ring, ctx->tex[PIPE_SHADER_VERTEX].num_textures);
 	}
 
-	if (dirty & FD_DIRTY_FRAGTEX) {
-		if (fp->has_samp) {
-			needs_border |= emit_textures(ctx, ring, SB_FRAG_TEX, &ctx->fragtex);
-			OUT_PKT4(ring, REG_A5XX_TPL1_FS_TEX_COUNT, 1);
-			OUT_RING(ring, ctx->fragtex.num_textures);
-		} else {
-			dirty &= ~FD_DIRTY_FRAGTEX;
-		}
+	if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_TEX) {
+		needs_border |= emit_textures(ctx, ring, SB4_FS_TEX,
+				&ctx->tex[PIPE_SHADER_FRAGMENT]);
+		OUT_PKT4(ring, REG_A5XX_TPL1_FS_TEX_COUNT, 1);
+		OUT_RING(ring, ctx->tex[PIPE_SHADER_FRAGMENT].num_textures);
 	}
 
+	OUT_PKT4(ring, REG_A5XX_TPL1_CS_TEX_COUNT, 1);
+	OUT_RING(ring, 0);
+
 	if (needs_border)
 		emit_border_color(ctx, ring);
 
-	ctx->dirty &= ~dirty;
+	if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_SSBO)
+		emit_ssbos(ctx, ring, SB4_SSBO, &ctx->shaderbuf[PIPE_SHADER_FRAGMENT]);
+}
+
+void
+fd5_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
+		struct ir3_shader_variant *cp)
+{
+	enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE];
+	/* emit whichever compute-stage state is flagged dirty (cp is unused here): */
+	if (dirty & FD_DIRTY_SHADER_TEX) {
+		bool needs_border = false;
+		needs_border |= emit_textures(ctx, ring, SB4_CS_TEX,
+				&ctx->tex[PIPE_SHADER_COMPUTE]);
+		/* emit border color as well when emit_textures() returned nonzero: */
+		if (needs_border)
+			emit_border_color(ctx, ring);
+		/* zero the tex counts of the five non-compute stages (VS/HS/DS/GS/FS).. */
+		OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 1);
+		OUT_RING(ring, 0);
+
+		OUT_PKT4(ring, REG_A5XX_TPL1_HS_TEX_COUNT, 1);
+		OUT_RING(ring, 0);
+
+		OUT_PKT4(ring, REG_A5XX_TPL1_DS_TEX_COUNT, 1);
+		OUT_RING(ring, 0);
+
+		OUT_PKT4(ring, REG_A5XX_TPL1_GS_TEX_COUNT, 1);
+		OUT_RING(ring, 0);
+
+		OUT_PKT4(ring, REG_A5XX_TPL1_FS_TEX_COUNT, 1);
+		OUT_RING(ring, 0);
+		/* ..and program the compute tex count from the bound compute textures: */
+		OUT_PKT4(ring, REG_A5XX_TPL1_CS_TEX_COUNT, 1);
+		OUT_RING(ring, ctx->tex[PIPE_SHADER_COMPUTE].num_textures);
+	}
+
+	if (dirty & FD_DIRTY_SHADER_SSBO)
+		emit_ssbos(ctx, ring, SB4_CS_SSBO, &ctx->shaderbuf[PIPE_SHADER_COMPUTE]);
 }
 
 /* emit setup at begin of new cmdstream buffer (don't rely on previous
@@ -707,9 +848,6 @@
 	OUT_PKT4(ring, REG_A5XX_PC_RASTER_CNTL, 1);
 	OUT_RING(ring, 0x00000012);
 
-	OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1);
-	OUT_RING(ring, 0x00000000);
-
 	OUT_PKT4(ring, REG_A5XX_GRAS_SU_POINT_MINMAX, 2);
 	OUT_RING(ring, A5XX_GRAS_SU_POINT_MINMAX_MIN(1.0) |
 			A5XX_GRAS_SU_POINT_MINMAX_MAX(4092.0));
@@ -773,10 +911,6 @@
 	OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
 	OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
 
-	/* other regs not used (yet?) and always seem to have same value: */
-	OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1);
-	OUT_RING(ring, 0x00000080);   /* GRAS_CL_CNTL */
-
 	OUT_PKT4(ring, REG_A5XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 1);
 	OUT_RING(ring, 0x00000000);   /* GRAS_SU_CONSERVATIVE_RAS_CNTL */
 
@@ -819,9 +953,6 @@
 	OUT_PKT4(ring, REG_A5XX_UNKNOWN_E093, 1);
 	OUT_RING(ring, 0x00000000);   /* UNKNOWN_E093 */
 
-	OUT_PKT4(ring, REG_A5XX_UNKNOWN_E1C7, 1);
-	OUT_RING(ring, 0x00000000);   /* UNKNOWN_E1C7 */
-
 	OUT_PKT4(ring, REG_A5XX_UNKNOWN_E29A, 1);
 	OUT_RING(ring, 0x00ffff00);   /* UNKNOWN_E29A */
 
@@ -918,11 +1049,8 @@
 	OUT_RING(ring, 0x00000000);
 	OUT_RING(ring, 0x00000000);
 
-	// TODO hacks.. these should not be hardcoded:
-	OUT_PKT4(ring, REG_A5XX_GRAS_SC_CNTL, 1);
-	OUT_RING(ring, 0x00000008);   /* GRAS_SC_CNTL */
-
-	fd_hw_query_enable(batch, ring);
+	OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1);
+	OUT_RING(ring, 0x00000000);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_emit.h b/src/gallium/drivers/freedreno/a5xx/fd5_emit.h
index 0525b3e..2d8a0fd 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_emit.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_emit.h
@@ -44,13 +44,19 @@
 	const struct fd_program_stateobj *prog;
 	const struct pipe_draw_info *info;
 	struct ir3_shader_key key;
-	uint32_t dirty;
+	enum fd_dirty_3d_state dirty;
 
 	uint32_t sprite_coord_enable;  /* bitmask */
 	bool sprite_coord_mode;
 	bool rasterflat;
 	bool no_decode_srgb;
 
+	/* in binning pass, we don't have real frag shader, so we
+	 * don't know if real draw disqualifies lrz write.  So just
+	 * figure that out up-front and stash it in the emit.
+	 */
+	bool no_lrz_write;
+
 	/* cached to avoid repeated lookups of same variants: */
 	const struct ir3_shader_variant *vp, *fp;
 	/* TODO: other shader stages.. */
@@ -114,7 +120,8 @@
 	OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(mode));
 	OUT_RING(ring, 0x00000000);   /* ADDR_LO */
 	OUT_RING(ring, 0x00000000);   /* ADDR_HI */
-	OUT_RING(ring, COND(mode == GMEM, CP_SET_RENDER_MODE_3_GMEM_ENABLE));
+	OUT_RING(ring, COND(mode == GMEM, CP_SET_RENDER_MODE_3_GMEM_ENABLE) |
+			COND(mode == BINNING, CP_SET_RENDER_MODE_3_VSC_ENABLE));
 	OUT_RING(ring, 0x00000000);
 	emit_marker5(ring, 7);
 }
@@ -135,9 +142,9 @@
 }
 
 static inline void
-fd5_emit_render_cntl(struct fd_context *ctx, bool blit)
+fd5_emit_render_cntl(struct fd_context *ctx, bool blit, bool binning)
 {
-	struct fd_ringbuffer *ring = ctx->batch->draw;
+	struct fd_ringbuffer *ring = binning ? ctx->batch->binning : ctx->batch->draw;
 
 	/* TODO eventually this partially depends on the pfb state, ie.
 	 * which of the cbuf(s)/zsbuf has an UBWC flag buffer.. that part
@@ -147,9 +154,34 @@
 	 * Other bits seem to depend on query state, like if samples-passed
 	 * query is active.
 	 */
+	bool samples_passed = (fd5_context(ctx)->samples_passed_queries > 0);
 	OUT_PKT4(ring, REG_A5XX_RB_RENDER_CNTL, 1);
 	OUT_RING(ring, 0x00000000 |   /* RB_RENDER_CNTL */
+			COND(binning, A5XX_RB_RENDER_CNTL_BINNING_PASS) |
+			COND(binning, A5XX_RB_RENDER_CNTL_DISABLE_COLOR_PIPE) |
+			COND(samples_passed, A5XX_RB_RENDER_CNTL_SAMPLES_PASSED) |
 			COND(!blit, 0x8));
+
+	OUT_PKT4(ring, REG_A5XX_GRAS_SC_CNTL, 1);
+	OUT_RING(ring, 0x00000008 |   /* GRAS_SC_CNTL */
+			COND(binning, A5XX_GRAS_SC_CNTL_BINNING_PASS) |
+			COND(samples_passed, A5XX_GRAS_SC_CNTL_SAMPLES_PASSED));
+}
+
+static inline void
+fd5_emit_lrz_flush(struct fd_ringbuffer *ring)
+{
+	/* Emits an LRZ_FLUSH event bracketed by GRAS_LRZ_CNTL writes.  TODO I
+	 * think the extra writes are probably a workaround, not needed on all a5xx.
+	 */
+	OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1);
+	OUT_RING(ring, A5XX_GRAS_LRZ_CNTL_ENABLE);   /* set ENABLE before the flush */
+
+	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
+	OUT_RING(ring, LRZ_FLUSH);
+
+	OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1);
+	OUT_RING(ring, 0x0);   /* write GRAS_LRZ_CNTL back to zero afterwards */
 }
 
 void fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd5_emit *emit);
@@ -157,6 +189,9 @@
 void fd5_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		struct fd5_emit *emit);
 
+void fd5_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
+		struct ir3_shader_variant *cp);
+
 void fd5_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring);
 
 void fd5_emit_init(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_format.c b/src/gallium/drivers/freedreno/a5xx/fd5_format.c
index 4e3c834..ae5cc83 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_format.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_format.c
@@ -77,24 +77,24 @@
 static struct fd5_format formats[PIPE_FORMAT_COUNT] = {
 	/* 8-bit */
 	VT(R8_UNORM,   8_UNORM, R8_UNORM, WZYX),
-	V_(R8_SNORM,   8_SNORM, NONE,     WZYX),
+	VT(R8_SNORM,   8_SNORM, R8_SNORM, WZYX),
 	VT(R8_UINT,    8_UINT,  R8_UINT,  WZYX),
 	VT(R8_SINT,    8_SINT,  R8_SINT,  WZYX),
 	V_(R8_USCALED, 8_UINT,  NONE,     WZYX),
 	V_(R8_SSCALED, 8_UINT,  NONE,     WZYX),
 
-	_T(A8_UNORM,   8_UNORM, NONE,     WZYX),
+	_T(A8_UNORM,   8_UNORM, A8_UNORM, WZYX),
 	_T(L8_UNORM,   8_UNORM, R8_UNORM, WZYX),
 	_T(I8_UNORM,   8_UNORM, NONE,     WZYX),
 
-//	_T(A8_UINT,    8_UINT,  NONE,     WZYX),
-//	_T(A8_SINT,    8_SINT,  NONE,     WZYX),
-//	_T(L8_UINT,    8_UINT,  NONE,     WZYX),
-//	_T(L8_SINT,    8_SINT,  NONE,     WZYX),
-//	_T(I8_UINT,    8_UINT,  NONE,     WZYX),
-//	_T(I8_SINT,    8_SINT,  NONE,     WZYX),
+	_T(A8_UINT,    8_UINT,  NONE,     WZYX),
+	_T(A8_SINT,    8_SINT,  NONE,     WZYX),
+	_T(L8_UINT,    8_UINT,  NONE,     WZYX),
+	_T(L8_SINT,    8_SINT,  NONE,     WZYX),
+	_T(I8_UINT,    8_UINT,  NONE,     WZYX),
+	_T(I8_SINT,    8_SINT,  NONE,     WZYX),
 
-//	_T(S8_UINT,    8_UINT,  R8_UNORM, WZYX),
+	_T(S8_UINT,    8_UINT,  R8_UNORM, WZYX),
 
 	/* 16-bit */
 	VT(R16_UNORM,   16_UNORM, R16_UNORM, WZYX),
@@ -105,18 +105,18 @@
 	V_(R16_SSCALED, 16_UINT,  NONE,      WZYX),
 	VT(R16_FLOAT,   16_FLOAT, R16_FLOAT, WZYX),
 
-//	_T(A16_UNORM,   16_UNORM, NONE,      WZYX),
-//	_T(A16_SNORM,   16_SNORM, NONE,      WZYX),
-//	_T(A16_UINT,    16_UINT,  NONE,      WZYX),
-//	_T(A16_SINT,    16_SINT,  NONE,      WZYX),
-//	_T(L16_UNORM,   16_UNORM, NONE,      WZYX),
-//	_T(L16_SNORM,   16_SNORM, NONE,      WZYX),
-//	_T(L16_UINT,    16_UINT,  NONE,      WZYX),
-//	_T(L16_SINT,    16_SINT,  NONE,      WZYX),
-//	_T(I16_UNORM,   16_UNORM, NONE,      WZYX),
-//	_T(I16_SNORM,   16_SNORM, NONE,      WZYX),
-//	_T(I16_UINT,    16_UINT,  NONE,      WZYX),
-//	_T(I16_SINT,    16_SINT,  NONE,      WZYX),
+	_T(A16_UNORM,   16_UNORM, NONE,      WZYX),
+	_T(A16_SNORM,   16_SNORM, NONE,      WZYX),
+	_T(A16_UINT,    16_UINT,  NONE,      WZYX),
+	_T(A16_SINT,    16_SINT,  NONE,      WZYX),
+	_T(L16_UNORM,   16_UNORM, NONE,      WZYX),
+	_T(L16_SNORM,   16_SNORM, NONE,      WZYX),
+	_T(L16_UINT,    16_UINT,  NONE,      WZYX),
+	_T(L16_SINT,    16_SINT,  NONE,      WZYX),
+	_T(I16_UNORM,   16_UNORM, NONE,      WZYX),
+	_T(I16_SNORM,   16_SNORM, NONE,      WZYX),
+	_T(I16_UINT,    16_UINT,  NONE,      WZYX),
+	_T(I16_SINT,    16_SINT,  NONE,      WZYX),
 
 	VT(R8G8_UNORM,   8_8_UNORM, R8G8_UNORM, WZYX),
 	VT(R8G8_SNORM,   8_8_SNORM, R8G8_SNORM, WZYX),
@@ -125,8 +125,8 @@
 	V_(R8G8_USCALED, 8_8_UINT,  NONE,       WZYX),
 	V_(R8G8_SSCALED, 8_8_SINT,  NONE,       WZYX),
 
-//	_T(L8A8_UINT,    8_8_UINT,  NONE,       WZYX),
-//	_T(L8A8_SINT,    8_8_SINT,  NONE,       WZYX),
+	_T(L8A8_UINT,    8_8_UINT,  NONE,       WZYX),
+	_T(L8A8_SINT,    8_8_SINT,  NONE,       WZYX),
 
 	_T(B5G6R5_UNORM,   5_6_5_UNORM,   R5G6B5_UNORM,   WXYZ),
 	_T(B5G5R5A1_UNORM, 5_5_5_1_UNORM, R5G5B5A1_UNORM, WXYZ),
@@ -149,33 +149,33 @@
 	VT(R32_FLOAT,   32_FLOAT, R32_FLOAT,WZYX),
 	V_(R32_FIXED,   32_FIXED, NONE,     WZYX),
 
-//	_T(A32_UINT,    32_UINT,  NONE,     WZYX),
-//	_T(A32_SINT,    32_SINT,  NONE,     WZYX),
-//	_T(L32_UINT,    32_UINT,  NONE,     WZYX),
-//	_T(L32_SINT,    32_SINT,  NONE,     WZYX),
-//	_T(I32_UINT,    32_UINT,  NONE,     WZYX),
-//	_T(I32_SINT,    32_SINT,  NONE,     WZYX),
+	_T(A32_UINT,    32_UINT,  NONE,     WZYX),
+	_T(A32_SINT,    32_SINT,  NONE,     WZYX),
+	_T(L32_UINT,    32_UINT,  NONE,     WZYX),
+	_T(L32_SINT,    32_SINT,  NONE,     WZYX),
+	_T(I32_UINT,    32_UINT,  NONE,     WZYX),
+	_T(I32_SINT,    32_SINT,  NONE,     WZYX),
 
 	VT(R16G16_UNORM,   16_16_UNORM, R16G16_UNORM, WZYX),
 	VT(R16G16_SNORM,   16_16_SNORM, R16G16_SNORM, WZYX),
 	VT(R16G16_UINT,    16_16_UINT,  R16G16_UINT,  WZYX),
 	VT(R16G16_SINT,    16_16_SINT,  R16G16_SINT,  WZYX),
-	V_(R16G16_USCALED, 16_16_UINT,  NONE,         WZYX),
-	V_(R16G16_SSCALED, 16_16_SINT,  NONE,         WZYX),
+	VT(R16G16_USCALED, 16_16_UINT,  NONE,         WZYX),
+	VT(R16G16_SSCALED, 16_16_SINT,  NONE,         WZYX),
 	VT(R16G16_FLOAT,   16_16_FLOAT, R16G16_FLOAT, WZYX),
 
-//	_T(L16A16_UNORM,   16_16_UNORM, NONE,         WZYX),
-//	_T(L16A16_SNORM,   16_16_SNORM, NONE,         WZYX),
-//	_T(L16A16_UINT,    16_16_UINT,  NONE,         WZYX),
-//	_T(L16A16_SINT,    16_16_SINT,  NONE,         WZYX),
+	_T(L16A16_UNORM,   16_16_UNORM, NONE,         WZYX),
+	_T(L16A16_SNORM,   16_16_SNORM, NONE,         WZYX),
+	_T(L16A16_UINT,    16_16_UINT,  NONE,         WZYX),
+	_T(L16A16_SINT,    16_16_SINT,  NONE,         WZYX),
 
 	VT(R8G8B8A8_UNORM,   8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX),
 	_T(R8G8B8X8_UNORM,   8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX),
 	_T(R8G8B8A8_SRGB,    8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX),
 	_T(R8G8B8X8_SRGB,    8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX),
-	V_(R8G8B8A8_SNORM,   8_8_8_8_SNORM, NONE,           WZYX),
-	V_(R8G8B8A8_UINT,    8_8_8_8_UINT,  R8G8B8A8_UINT,  WZYX),
-	V_(R8G8B8A8_SINT,    8_8_8_8_SINT,  NONE,           WZYX),
+	VT(R8G8B8A8_SNORM,   8_8_8_8_SNORM, R8G8B8A8_SNORM, WZYX),
+	VT(R8G8B8A8_UINT,    8_8_8_8_UINT,  R8G8B8A8_UINT,  WZYX),
+	VT(R8G8B8A8_SINT,    8_8_8_8_SINT,  R8G8B8A8_SINT,  WZYX),
 	V_(R8G8B8A8_USCALED, 8_8_8_8_UINT,  NONE,           WZYX),
 	V_(R8G8B8A8_SSCALED, 8_8_8_8_SINT,  NONE,           WZYX),
 
@@ -194,19 +194,19 @@
 	_T(A8R8G8B8_SRGB,    8_8_8_8_UNORM, R8G8B8A8_UNORM, ZYXW),
 	_T(X8R8G8B8_SRGB,    8_8_8_8_UNORM, R8G8B8A8_UNORM, ZYXW),
 
-	_T(R10G10B10A2_UNORM,   10_10_10_2_UNORM, NONE,              WZYX),
-	_T(B10G10R10A2_UNORM,   10_10_10_2_UNORM, NONE,              WXYZ),
-	_T(B10G10R10X2_UNORM,   10_10_10_2_UNORM, NONE,              WXYZ),
-//	V_(R10G10B10A2_SNORM,   10_10_10_2_SNORM, NONE,              WZYX),
-//	V_(B10G10R10A2_SNORM,   10_10_10_2_SNORM, NONE,              WXYZ),
-	_T(R10G10B10A2_UINT,    10_10_10_2_UINT,  R10G10B10A2_UINT,  WZYX),
-	_T(B10G10R10A2_UINT,    10_10_10_2_UINT,  R10G10B10A2_UINT,  WXYZ),
-//	V_(R10G10B10A2_USCALED, 10_10_10_2_UINT,  NONE,              WZYX),
-//	V_(B10G10R10A2_USCALED, 10_10_10_2_UINT,  NONE,              WXYZ),
-//	V_(R10G10B10A2_SSCALED, 10_10_10_2_SINT,  NONE,              WZYX),
-//	V_(B10G10R10A2_SSCALED, 10_10_10_2_SINT,  NONE,              WXYZ),
+	VT(R10G10B10A2_UNORM,   10_10_10_2_UNORM, R10G10B10A2_UNORM, WZYX),
+	VT(B10G10R10A2_UNORM,   10_10_10_2_UNORM, R10G10B10A2_UNORM, WXYZ),
+	_T(B10G10R10X2_UNORM,   10_10_10_2_UNORM, R10G10B10A2_UNORM, WXYZ),
+	V_(R10G10B10A2_SNORM,   10_10_10_2_SNORM, NONE,              WZYX),
+	V_(B10G10R10A2_SNORM,   10_10_10_2_SNORM, NONE,              WXYZ),
+	VT(R10G10B10A2_UINT,    10_10_10_2_UINT,  R10G10B10A2_UINT,  WZYX),
+	VT(B10G10R10A2_UINT,    10_10_10_2_UINT,  R10G10B10A2_UINT,  WXYZ),
+	V_(R10G10B10A2_USCALED, 10_10_10_2_UINT,  NONE,              WZYX),
+	V_(B10G10R10A2_USCALED, 10_10_10_2_UINT,  NONE,              WXYZ),
+	V_(R10G10B10A2_SSCALED, 10_10_10_2_SINT,  NONE,              WZYX),
+	V_(B10G10R10A2_SSCALED, 10_10_10_2_SINT,  NONE,              WXYZ),
 
-	_T(R11G11B10_FLOAT, 11_11_10_FLOAT, R11G11B10_FLOAT, WZYX),
+	VT(R11G11B10_FLOAT, 11_11_10_FLOAT, R11G11B10_FLOAT, WZYX),
 	_T(R9G9B9E5_FLOAT,  9_9_9_E5_FLOAT, NONE,            WZYX),
 
 	_T(Z24X8_UNORM,       X8Z24_UNORM, R8G8B8A8_UNORM, WZYX),
@@ -224,18 +224,18 @@
 	V_(R16G16B16_FLOAT,   16_16_16_FLOAT, NONE, WZYX),
 
 	/* 64-bit */
-	V_(R16G16B16A16_UNORM,   16_16_16_16_UNORM, NONE,               WZYX),
-	V_(R16G16B16X16_UNORM,   16_16_16_16_UNORM, NONE,               WZYX),
-	V_(R16G16B16A16_SNORM,   16_16_16_16_SNORM, NONE,               WZYX),
-	V_(R16G16B16X16_SNORM,   16_16_16_16_SNORM, NONE,               WZYX),
-	V_(R16G16B16A16_UINT,    16_16_16_16_UINT,  NONE,               WZYX),
-//	_T(R16G16B16X16_UINT,    16_16_16_16_UINT,  R16G16B16A16_UINT,  WZYX),
-	V_(R16G16B16A16_SINT,    16_16_16_16_SINT,  NONE,               WZYX),
-//	_T(R16G16B16X16_SINT,    16_16_16_16_SINT,  R16G16B16A16_SINT,  WZYX),
-	V_(R16G16B16A16_USCALED, 16_16_16_16_UINT,  NONE,               WZYX),
-	V_(R16G16B16A16_SSCALED, 16_16_16_16_SINT,  NONE,               WZYX),
+	VT(R16G16B16A16_UNORM,   16_16_16_16_UNORM, R16G16B16A16_UNORM, WZYX),
+	VT(R16G16B16X16_UNORM,   16_16_16_16_UNORM, R16G16B16A16_UNORM, WZYX),
+	VT(R16G16B16A16_SNORM,   16_16_16_16_SNORM, R16G16B16A16_SNORM, WZYX),
+	VT(R16G16B16X16_SNORM,   16_16_16_16_SNORM, R16G16B16A16_SNORM, WZYX),
+	VT(R16G16B16A16_UINT,    16_16_16_16_UINT,  R16G16B16A16_UINT,  WZYX),
+	VT(R16G16B16X16_UINT,    16_16_16_16_UINT,  R16G16B16A16_UINT,  WZYX),
+	VT(R16G16B16A16_SINT,    16_16_16_16_SINT,  R16G16B16A16_SINT,  WZYX),
+	VT(R16G16B16X16_SINT,    16_16_16_16_SINT,  R16G16B16A16_SINT,  WZYX),
+	VT(R16G16B16A16_USCALED, 16_16_16_16_UINT,  NONE,               WZYX),
+	VT(R16G16B16A16_SSCALED, 16_16_16_16_SINT,  NONE,               WZYX),
 	VT(R16G16B16A16_FLOAT,   16_16_16_16_FLOAT, R16G16B16A16_FLOAT, WZYX),
-	_T(R16G16B16X16_FLOAT,   16_16_16_16_FLOAT, R16G16B16A16_FLOAT, WZYX),
+	VT(R16G16B16X16_FLOAT,   16_16_16_16_FLOAT, R16G16B16A16_FLOAT, WZYX),
 
 	VT(R32G32_UINT,    32_32_UINT,  R32G32_UINT, WZYX),
 	VT(R32G32_SINT,    32_32_SINT,  R32G32_SINT, WZYX),
@@ -244,15 +244,15 @@
 	VT(R32G32_FLOAT,   32_32_FLOAT, R32G32_FLOAT,WZYX),
 	V_(R32G32_FIXED,   32_32_FIXED, NONE,        WZYX),
 
-//	_T(L32A32_UINT,    32_32_UINT,  NONE,        WZYX),
-//	_T(L32A32_SINT,    32_32_SINT,  NONE,        WZYX),
+	_T(L32A32_UINT,    32_32_UINT,  NONE,        WZYX),
+	_T(L32A32_SINT,    32_32_SINT,  NONE,        WZYX),
 
 	/* 96-bit */
-	V_(R32G32B32_UINT,    32_32_32_UINT,  NONE, WZYX),
-	V_(R32G32B32_SINT,    32_32_32_SINT,  NONE, WZYX),
+	VT(R32G32B32_UINT,    32_32_32_UINT,  NONE, WZYX),
+	VT(R32G32B32_SINT,    32_32_32_SINT,  NONE, WZYX),
 	V_(R32G32B32_USCALED, 32_32_32_UINT,  NONE, WZYX),
 	V_(R32G32B32_SSCALED, 32_32_32_SINT,  NONE, WZYX),
-	V_(R32G32B32_FLOAT,   32_32_32_FLOAT, NONE, WZYX),
+	VT(R32G32B32_FLOAT,   32_32_32_FLOAT, NONE, WZYX),
 	V_(R32G32B32_FIXED,   32_32_32_FIXED, NONE, WZYX),
 
 	/* 128-bit */
@@ -267,70 +267,70 @@
 	V_(R32G32B32A32_FIXED,   32_32_32_32_FIXED, NONE,               WZYX),
 
 	/* compressed */
-//	_T(ETC1_RGB8, ETC1, NONE, WZYX),
-//	_T(ETC2_RGB8, ETC2_RGB8, NONE, WZYX),
-//	_T(ETC2_SRGB8, ETC2_RGB8, NONE, WZYX),
-//	_T(ETC2_RGB8A1, ETC2_RGB8A1, NONE, WZYX),
-//	_T(ETC2_SRGB8A1, ETC2_RGB8A1, NONE, WZYX),
-//	_T(ETC2_RGBA8, ETC2_RGBA8, NONE, WZYX),
-//	_T(ETC2_SRGBA8, ETC2_RGBA8, NONE, WZYX),
-//	_T(ETC2_R11_UNORM, ETC2_R11_UNORM, NONE, WZYX),
-//	_T(ETC2_R11_SNORM, ETC2_R11_SNORM, NONE, WZYX),
-//	_T(ETC2_RG11_UNORM, ETC2_RG11_UNORM, NONE, WZYX),
-//	_T(ETC2_RG11_SNORM, ETC2_RG11_SNORM, NONE, WZYX),
+	_T(ETC1_RGB8, ETC1, NONE, WZYX),
+	_T(ETC2_RGB8, ETC2_RGB8, NONE, WZYX),
+	_T(ETC2_SRGB8, ETC2_RGB8, NONE, WZYX),
+	_T(ETC2_RGB8A1, ETC2_RGB8A1, NONE, WZYX),
+	_T(ETC2_SRGB8A1, ETC2_RGB8A1, NONE, WZYX),
+	_T(ETC2_RGBA8, ETC2_RGBA8, NONE, WZYX),
+	_T(ETC2_SRGBA8, ETC2_RGBA8, NONE, WZYX),
+	_T(ETC2_R11_UNORM, ETC2_R11_UNORM, NONE, WZYX),
+	_T(ETC2_R11_SNORM, ETC2_R11_SNORM, NONE, WZYX),
+	_T(ETC2_RG11_UNORM, ETC2_RG11_UNORM, NONE, WZYX),
+	_T(ETC2_RG11_SNORM, ETC2_RG11_SNORM, NONE, WZYX),
 
-//	_T(DXT1_RGB,   DXT1, NONE, WZYX),
-//	_T(DXT1_SRGB,  DXT1, NONE, WZYX),
-//	_T(DXT1_RGBA,  DXT1, NONE, WZYX),
-//	_T(DXT1_SRGBA, DXT1, NONE, WZYX),
-//	_T(DXT3_RGBA,  DXT3, NONE, WZYX),
-//	_T(DXT3_SRGBA, DXT3, NONE, WZYX),
-//	_T(DXT5_RGBA,  DXT5, NONE, WZYX),
-//	_T(DXT5_SRGBA, DXT5, NONE, WZYX),
+	_T(DXT1_RGB,   DXT1, NONE, WZYX),
+	_T(DXT1_SRGB,  DXT1, NONE, WZYX),
+	_T(DXT1_RGBA,  DXT1, NONE, WZYX),
+	_T(DXT1_SRGBA, DXT1, NONE, WZYX),
+	_T(DXT3_RGBA,  DXT3, NONE, WZYX),
+	_T(DXT3_SRGBA, DXT3, NONE, WZYX),
+	_T(DXT5_RGBA,  DXT5, NONE, WZYX),
+	_T(DXT5_SRGBA, DXT5, NONE, WZYX),
 
-//	_T(BPTC_RGBA_UNORM, BPTC,        NONE, WZYX),
-//	_T(BPTC_SRGBA,      BPTC,        NONE, WZYX),
-//	_T(BPTC_RGB_FLOAT,  BPTC_FLOAT,  NONE, WZYX),
-//	_T(BPTC_RGB_UFLOAT, BPTC_UFLOAT, NONE, WZYX),
+	_T(BPTC_RGBA_UNORM, BPTC,        NONE, WZYX),
+	_T(BPTC_SRGBA,      BPTC,        NONE, WZYX),
+	_T(BPTC_RGB_FLOAT,  BPTC_FLOAT,  NONE, WZYX),
+	_T(BPTC_RGB_UFLOAT, BPTC_UFLOAT, NONE, WZYX),
 
-//	_T(RGTC1_UNORM, RGTC1_UNORM, NONE, WZYX),
-//	_T(RGTC1_SNORM, RGTC1_SNORM, NONE, WZYX),
-//	_T(RGTC2_UNORM, RGTC2_UNORM, NONE, WZYX),
-//	_T(RGTC2_SNORM, RGTC2_SNORM, NONE, WZYX),
-//	_T(LATC1_UNORM, RGTC1_UNORM, NONE, WZYX),
-//	_T(LATC1_SNORM, RGTC1_SNORM, NONE, WZYX),
-//	_T(LATC2_UNORM, RGTC2_UNORM, NONE, WZYX),
-//	_T(LATC2_SNORM, RGTC2_SNORM, NONE, WZYX),
+	_T(RGTC1_UNORM, RGTC1_UNORM, NONE, WZYX),
+	_T(RGTC1_SNORM, RGTC1_SNORM, NONE, WZYX),
+	_T(RGTC2_UNORM, RGTC2_UNORM, NONE, WZYX),
+	_T(RGTC2_SNORM, RGTC2_SNORM, NONE, WZYX),
+	_T(LATC1_UNORM, RGTC1_UNORM, NONE, WZYX),
+	_T(LATC1_SNORM, RGTC1_SNORM, NONE, WZYX),
+	_T(LATC2_UNORM, RGTC2_UNORM, NONE, WZYX),
+	_T(LATC2_SNORM, RGTC2_SNORM, NONE, WZYX),
 
-//	_T(ASTC_4x4,   ASTC_4x4,   NONE, WZYX),
-//	_T(ASTC_5x4,   ASTC_5x4,   NONE, WZYX),
-//	_T(ASTC_5x5,   ASTC_5x5,   NONE, WZYX),
-//	_T(ASTC_6x5,   ASTC_6x5,   NONE, WZYX),
-//	_T(ASTC_6x6,   ASTC_6x6,   NONE, WZYX),
-//	_T(ASTC_8x5,   ASTC_8x5,   NONE, WZYX),
-//	_T(ASTC_8x6,   ASTC_8x6,   NONE, WZYX),
-//	_T(ASTC_8x8,   ASTC_8x8,   NONE, WZYX),
-//	_T(ASTC_10x5,  ASTC_10x5,  NONE, WZYX),
-//	_T(ASTC_10x6,  ASTC_10x6,  NONE, WZYX),
-//	_T(ASTC_10x8,  ASTC_10x8,  NONE, WZYX),
-//	_T(ASTC_10x10, ASTC_10x10, NONE, WZYX),
-//	_T(ASTC_12x10, ASTC_12x10, NONE, WZYX),
-//	_T(ASTC_12x12, ASTC_12x12, NONE, WZYX),
+	_T(ASTC_4x4,   ASTC_4x4,   NONE, WZYX),
+	_T(ASTC_5x4,   ASTC_5x4,   NONE, WZYX),
+	_T(ASTC_5x5,   ASTC_5x5,   NONE, WZYX),
+	_T(ASTC_6x5,   ASTC_6x5,   NONE, WZYX),
+	_T(ASTC_6x6,   ASTC_6x6,   NONE, WZYX),
+	_T(ASTC_8x5,   ASTC_8x5,   NONE, WZYX),
+	_T(ASTC_8x6,   ASTC_8x6,   NONE, WZYX),
+	_T(ASTC_8x8,   ASTC_8x8,   NONE, WZYX),
+	_T(ASTC_10x5,  ASTC_10x5,  NONE, WZYX),
+	_T(ASTC_10x6,  ASTC_10x6,  NONE, WZYX),
+	_T(ASTC_10x8,  ASTC_10x8,  NONE, WZYX),
+	_T(ASTC_10x10, ASTC_10x10, NONE, WZYX),
+	_T(ASTC_12x10, ASTC_12x10, NONE, WZYX),
+	_T(ASTC_12x12, ASTC_12x12, NONE, WZYX),
 
-//	_T(ASTC_4x4_SRGB,   ASTC_4x4,   NONE, WZYX),
-//	_T(ASTC_5x4_SRGB,   ASTC_5x4,   NONE, WZYX),
-//	_T(ASTC_5x5_SRGB,   ASTC_5x5,   NONE, WZYX),
-//	_T(ASTC_6x5_SRGB,   ASTC_6x5,   NONE, WZYX),
-//	_T(ASTC_6x6_SRGB,   ASTC_6x6,   NONE, WZYX),
-//	_T(ASTC_8x5_SRGB,   ASTC_8x5,   NONE, WZYX),
-//	_T(ASTC_8x6_SRGB,   ASTC_8x6,   NONE, WZYX),
-//	_T(ASTC_8x8_SRGB,   ASTC_8x8,   NONE, WZYX),
-//	_T(ASTC_10x5_SRGB,  ASTC_10x5,  NONE, WZYX),
-//	_T(ASTC_10x6_SRGB,  ASTC_10x6,  NONE, WZYX),
-//	_T(ASTC_10x8_SRGB,  ASTC_10x8,  NONE, WZYX),
-//	_T(ASTC_10x10_SRGB, ASTC_10x10, NONE, WZYX),
-//	_T(ASTC_12x10_SRGB, ASTC_12x10, NONE, WZYX),
-//	_T(ASTC_12x12_SRGB, ASTC_12x12, NONE, WZYX),
+	_T(ASTC_4x4_SRGB,   ASTC_4x4,   NONE, WZYX),
+	_T(ASTC_5x4_SRGB,   ASTC_5x4,   NONE, WZYX),
+	_T(ASTC_5x5_SRGB,   ASTC_5x5,   NONE, WZYX),
+	_T(ASTC_6x5_SRGB,   ASTC_6x5,   NONE, WZYX),
+	_T(ASTC_6x6_SRGB,   ASTC_6x6,   NONE, WZYX),
+	_T(ASTC_8x5_SRGB,   ASTC_8x5,   NONE, WZYX),
+	_T(ASTC_8x6_SRGB,   ASTC_8x6,   NONE, WZYX),
+	_T(ASTC_8x8_SRGB,   ASTC_8x8,   NONE, WZYX),
+	_T(ASTC_10x5_SRGB,  ASTC_10x5,  NONE, WZYX),
+	_T(ASTC_10x6_SRGB,  ASTC_10x6,  NONE, WZYX),
+	_T(ASTC_10x8_SRGB,  ASTC_10x8,  NONE, WZYX),
+	_T(ASTC_10x10_SRGB, ASTC_10x10, NONE, WZYX),
+	_T(ASTC_12x10_SRGB, ASTC_12x10, NONE, WZYX),
+	_T(ASTC_12x12_SRGB, ASTC_12x12, NONE, WZYX),
 };
 
 /* convert pipe format to vertex buffer format: */
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c b/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
index b80a04f..c623b57 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
@@ -58,7 +58,7 @@
 	for (i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) {
 		enum a5xx_color_fmt format = 0;
 		enum a3xx_color_swap swap = WZYX;
-		bool srgb = false;
+		bool srgb = false, sint = false, uint = false;
 		struct fd_resource *rsc = NULL;
 		struct fd_resource_slice *slice = NULL;
 		uint32_t stride = 0;
@@ -76,6 +76,8 @@
 			format = fd5_pipe2color(pformat);
 			swap = fd5_pipe2swap(pformat);
 			srgb = util_format_is_srgb(pformat);
+			sint = util_format_is_pure_sint(pformat);
+			uint = util_format_is_pure_uint(pformat);
 
 			debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
 
@@ -110,6 +112,8 @@
 
 		OUT_PKT4(ring, REG_A5XX_SP_FS_MRT_REG(i), 1);
 		OUT_RING(ring, A5XX_SP_FS_MRT_REG_COLOR_FORMAT(format) |
+				COND(sint, A5XX_SP_FS_MRT_REG_COLOR_SINT) |
+				COND(uint, A5XX_SP_FS_MRT_REG_COLOR_UINT) |
 				COND(srgb, A5XX_SP_FS_MRT_REG_COLOR_SRGB));
 
 		/* when we support UBWC, these would be the system memory
@@ -162,6 +166,24 @@
 		OUT_RING(ring, 0x00000000);    /* RB_DEPTH_FLAG_BUFFER_BASE_HI */
 		OUT_RING(ring, 0x00000000);    /* RB_DEPTH_FLAG_BUFFER_PITCH */
 
+		if (rsc->lrz) {
+			OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_BUFFER_BASE_LO, 3);
+			OUT_RELOCW(ring, rsc->lrz, 0x1000, 0, 0);
+			OUT_RING(ring, A5XX_GRAS_LRZ_BUFFER_PITCH(rsc->lrz_pitch));
+
+			OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO, 2);
+			OUT_RELOCW(ring, rsc->lrz, 0, 0, 0);
+		} else {
+			OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_BUFFER_BASE_LO, 3);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);     /* GRAS_LRZ_BUFFER_PITCH */
+
+			OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO, 2);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+		}
+
 		if (rsc->stencil) {
 			if (gmem) {
 				stride = 1 * gmem->bin_w;
@@ -207,6 +229,21 @@
 	}
 }
 
+/* whether this batch should be rendered using the hw binning pass: */
+static bool
+use_hw_binning(struct fd_batch *batch)
+{
+	const struct fd_gmem_stateobj *g = &batch->ctx->gmem;
+	/* binning is only used while maxpw/maxph stay within these limits: */
+	bool within_limits = ((g->maxpw * g->maxph) <= 32) &&
+			(g->maxpw <= 15) && (g->maxph <= 15);
+	/* ..and only if enabled, with more than 2 bins and something drawn: */
+	bool worthwhile = fd_binning_enabled &&
+			((g->nbins_x * g->nbins_y) > 2) && (batch->num_draws > 0);
+
+	return within_limits && worthwhile;
+}
+
 static void
 patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode)
 {
@@ -218,16 +255,130 @@
 	util_dynarray_resize(&batch->draw_patches, 0);
 }
 
+static void
+update_vsc_pipe(struct fd_batch *batch)
+{
+	struct fd_context *ctx = batch->ctx;
+	struct fd5_context *fd5_ctx = fd5_context(ctx);
+	struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
+	struct fd_ringbuffer *ring = batch->gmem;
+	int i;
+	/* Emit the VSC state into the gmem ring; first bin size + size buffer: */
+	OUT_PKT4(ring, REG_A5XX_VSC_BIN_SIZE, 3);
+	OUT_RING(ring, A5XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) |
+			A5XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h));
+	OUT_RELOCW(ring, fd5_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS_LO/HI */
+
+	OUT_PKT4(ring, REG_A5XX_UNKNOWN_0BC5, 2);
+	OUT_RING(ring, 0x00000000);   /* UNKNOWN_0BC5 */
+	OUT_RING(ring, 0x00000000);   /* UNKNOWN_0BC6 */
+	/* one config dword (x/y/w/h) for each of the 16 pipes: */
+	OUT_PKT4(ring, REG_A5XX_VSC_PIPE_CONFIG_REG(0), 16);
+	for (i = 0; i < 16; i++) {
+		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+		OUT_RING(ring, A5XX_VSC_PIPE_CONFIG_REG_X(pipe->x) |
+				A5XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) |
+				A5XX_VSC_PIPE_CONFIG_REG_W(pipe->w) |
+				A5XX_VSC_PIPE_CONFIG_REG_H(pipe->h));
+	}
+	/* LO/HI address per pipe; bo is created on first use (NOTE: unchecked): */
+	OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_ADDRESS_LO(0), 32);
+	for (i = 0; i < 16; i++) {
+		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+		if (!pipe->bo) {
+			pipe->bo = fd_bo_new(ctx->dev, 0x20000,
+					DRM_FREEDRENO_GEM_TYPE_KMEM);
+		}
+		OUT_RELOCW(ring, pipe->bo, 0, 0, 0);     /* VSC_PIPE_DATA_ADDRESS[i].LO/HI */
+	}
+	/* length per pipe: the bo size minus 32: */
+	OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_LENGTH_REG(0), 16);
+	for (i = 0; i < 16; i++) {
+		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+		OUT_RING(ring, fd_bo_size(pipe->bo) - 32); /* VSC_PIPE_DATA_LENGTH[i] */
+	}
+}
+
+static void
+emit_binning_pass(struct fd_batch *batch)
+{
+	struct fd_context *ctx = batch->ctx;
+	struct fd_ringbuffer *ring = batch->gmem;
+	struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
+	/* bounds of the render area (x2/y2 are inclusive): */
+	uint32_t x1 = gmem->minx;
+	uint32_t y1 = gmem->miny;
+	uint32_t x2 = gmem->minx + gmem->width - 1;
+	uint32_t y2 = gmem->miny + gmem->height - 1;
+	/* Emit the binning pass into the gmem ring, in BINNING render mode: */
+	fd5_set_render_mode(batch->ctx, ring, BINNING);
+
+	OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1);
+	OUT_RING(ring, A5XX_RB_CNTL_WIDTH(gmem->bin_w) |
+			A5XX_RB_CNTL_HEIGHT(gmem->bin_h));
+
+	OUT_PKT4(ring, REG_A5XX_GRAS_SC_WINDOW_SCISSOR_TL, 2);
+	OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) |
+			A5XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1));
+	OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) |
+			A5XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2));
+
+	OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2);
+	OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(x1) |
+			A5XX_RB_RESOLVE_CNTL_1_Y(y1));
+	OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(x2) |
+			A5XX_RB_RESOLVE_CNTL_2_Y(y2));
+
+	update_vsc_pipe(batch);
+	/* set BINNING_PASS in VPC_MODE_CNTL (written back to 0 at the end): */
+	OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1);
+	OUT_RING(ring, A5XX_VPC_MODE_CNTL_BINNING_PASS);
+
+	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
+	OUT_RING(ring, UNK_2C);
+
+	OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1);
+	OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(0) |
+			A5XX_RB_WINDOW_OFFSET_Y(0));
+
+	/* emit IB to binning drawcmds: */
+	ctx->emit_ib(ring, batch->binning);
+
+	fd_reset_wfi(batch);
+
+	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
+	OUT_RING(ring, UNK_2D);
+	/* CACHE_FLUSH_TS event, with blit_mem as its address operand: */
+	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
+	OUT_RING(ring, CACHE_FLUSH_TS);
+	OUT_RELOCW(ring, fd5_context(ctx)->blit_mem, 0, 0, 0);  /* ADDR_LO/HI */
+	OUT_RING(ring, 0x00000000);
+
+	// TODO CP_COND_WRITE's for all the vsc buffers (check for overflow??)
+
+	fd_wfi(batch, ring);
+
+	OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1);
+	OUT_RING(ring, 0x0);
+}
+
 /* before first tile */
 static void
 fd5_emit_tile_init(struct fd_batch *batch)
 {
+	struct fd_context *ctx = batch->ctx;
 	struct fd_ringbuffer *ring = batch->gmem;
+	struct pipe_framebuffer_state *pfb = &batch->framebuffer;
 
 	fd5_emit_restore(batch, ring);
 
-	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-	OUT_RING(ring, UNK_26);
+	if (batch->lrz_clear)
+		ctx->emit_ib(ring, batch->lrz_clear);
+
+	fd5_emit_lrz_flush(ring);
+
+	OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1);
+	OUT_RING(ring, 0x00000080);   /* GRAS_CL_CNTL */
 
 	OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
 	OUT_RING(ring, 0x0);
@@ -243,9 +394,16 @@
 	OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1);
 	OUT_RING(ring, 0x7c13c080);   /* RB_CCU_CNTL */
 
-/*
-opcode: CP_PREEMPT_ENABLE_LOCAL (6a) (2 dwords)
- */
+	emit_zs(ring, pfb->zsbuf, &ctx->gmem);
+	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, &ctx->gmem);
+
+	if (use_hw_binning(batch)) {
+		emit_binning_pass(batch);
+		fd5_emit_lrz_flush(ring);
+		patch_draws(batch, USE_VISIBILITY);
+	} else {
+		patch_draws(batch, IGNORE_VISIBILITY);
+	}
 
 	fd5_set_render_mode(batch->ctx, ring, GMEM);
 }
@@ -254,6 +412,8 @@
 static void
 fd5_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile)
 {
+	struct fd_context *ctx = batch->ctx;
+	struct fd5_context *fd5_ctx = fd5_context(ctx);
 	struct fd_ringbuffer *ring = batch->gmem;
 
 	uint32_t x1 = tile->xoff;
@@ -273,6 +433,25 @@
 	OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(x2) |
 			A5XX_RB_RESOLVE_CNTL_2_Y(y2));
 
+	if (use_hw_binning(batch)) {
+		struct fd_vsc_pipe *pipe = &ctx->pipe[tile->p];
+
+		OUT_PKT7(ring, CP_WAIT_FOR_ME, 0);
+
+		OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
+		OUT_RING(ring, 0x0);
+
+		OUT_PKT7(ring, CP_SET_BIN_DATA5, 5);
+		OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) |
+				CP_SET_BIN_DATA5_0_VSC_N(tile->n));
+		OUT_RELOC(ring, pipe->bo, 0, 0, 0);      /* VSC_PIPE[p].DATA_ADDRESS */
+		OUT_RELOC(ring, fd5_ctx->vsc_size_mem,   /* VSC_SIZE_ADDRESS + (p * 4) */
+				(tile->p * 4), 0, 0);
+	} else {
+		OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
+		OUT_RING(ring, 0x1);
+	}
+
 	OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1);
 	OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(x1) |
 			A5XX_RB_WINDOW_OFFSET_Y(y1));
@@ -383,15 +562,10 @@
 	struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
 	struct pipe_framebuffer_state *pfb = &batch->framebuffer;
 
-	OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
-	OUT_RING(ring, 0x1);
-
 	OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1);
 	OUT_RING(ring, A5XX_RB_CNTL_WIDTH(gmem->bin_w) |
 			A5XX_RB_CNTL_HEIGHT(gmem->bin_h));
 
-	patch_draws(batch, IGNORE_VISIBILITY);
-
 	emit_zs(ring, pfb->zsbuf, gmem);
 	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem);
 
@@ -488,8 +662,7 @@
 	OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
 	OUT_RING(ring, 0x0);
 
-	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-	OUT_RING(ring, UNK_26);
+	fd5_emit_lrz_flush(ring);
 
 	fd5_cache_flush(batch, ring);
 	fd5_set_render_mode(batch->ctx, ring, BYPASS);
@@ -503,8 +676,7 @@
 
 	fd5_emit_restore(batch, ring);
 
-	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-	OUT_RING(ring, UNK_26);
+	fd5_emit_lrz_flush(ring);
 
 	OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
 	OUT_RING(ring, 0x0);
@@ -578,8 +750,7 @@
 	OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
 	OUT_RING(ring, 0x0);
 
-	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-	OUT_RING(ring, UNK_26);
+	fd5_emit_lrz_flush(ring);
 
 	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
 	OUT_RING(ring, UNK_1D);
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
index 890020f..aa4babd 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
@@ -84,42 +84,36 @@
 	delete_shader_stateobj(so);
 }
 
-static void
-emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
+void
+fd5_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
 {
 	const struct ir3_info *si = &so->info;
-	enum adreno_state_block sb;
-	enum adreno_state_src src;
+	enum a4xx_state_block sb = fd4_stage2shadersb(so->type);
+	enum a4xx_state_src src;
 	uint32_t i, sz, *bin;
 
-	if (so->type == SHADER_VERTEX) {
-		sb = SB_VERT_SHADER;
-	} else {
-		sb = SB_FRAG_SHADER;
-	}
-
 	if (fd_mesa_debug & FD_DBG_DIRECT) {
 		sz = si->sizedwords;
-		src = SS_DIRECT;
+		src = SS4_DIRECT;
 		bin = fd_bo_map(so->bo);
 	} else {
 		sz = 0;
-		src = 2;  // enums different on a5xx..
+		src = SS4_INDIRECT;
 		bin = NULL;
 	}
 
-	OUT_PKT7(ring, CP_LOAD_STATE, 3 + sz);
-	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
-			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-			CP_LOAD_STATE_0_NUM_UNIT(so->instrlen));
+	OUT_PKT7(ring, CP_LOAD_STATE4, 3 + sz);
+	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+			CP_LOAD_STATE4_0_STATE_SRC(src) |
+			CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE4_0_NUM_UNIT(so->instrlen));
 	if (bin) {
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER));
-		OUT_RING(ring, CP_LOAD_STATE_2_EXT_SRC_ADDR_HI(0));
+		OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) |
+				CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER));
+		OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
 	} else {
 		OUT_RELOC(ring, so->bo, 0,
-				CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER), 0);
+				CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0);
 	}
 
 	/* for how clever coverity is, it is sometimes rather dull, and
@@ -330,7 +324,8 @@
 }
 
 void
-fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
+fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
+				 struct fd5_emit *emit)
 {
 	struct stage s[MAX_STAGES];
 	uint32_t pos_regid, psize_regid, color_regid[8];
@@ -374,49 +369,54 @@
 	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
 	 */
 
-	OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CONTROL_REG, 5);
-	OUT_RING(ring, A5XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
-			A5XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff) |
-			COND(s[VS].v, A5XX_HLSQ_VS_CONTROL_REG_ENABLED));
-	OUT_RING(ring, A5XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
-			A5XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff) |
-			COND(s[FS].v, A5XX_HLSQ_FS_CONTROL_REG_ENABLED));
-	OUT_RING(ring, A5XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
-			A5XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff) |
-			COND(s[HS].v, A5XX_HLSQ_HS_CONTROL_REG_ENABLED));
-	OUT_RING(ring, A5XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
-			A5XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff) |
-			COND(s[DS].v, A5XX_HLSQ_DS_CONTROL_REG_ENABLED));
-	OUT_RING(ring, A5XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
-			A5XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff) |
-			COND(s[GS].v, A5XX_HLSQ_GS_CONTROL_REG_ENABLED));
+	OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CONFIG, 5);
+	OUT_RING(ring, A5XX_HLSQ_VS_CONFIG_CONSTOBJECTOFFSET(s[VS].constoff) |
+			A5XX_HLSQ_VS_CONFIG_SHADEROBJOFFSET(s[VS].instroff) |
+			COND(s[VS].v, A5XX_HLSQ_VS_CONFIG_ENABLED));
+	OUT_RING(ring, A5XX_HLSQ_FS_CONFIG_CONSTOBJECTOFFSET(s[FS].constoff) |
+			A5XX_HLSQ_FS_CONFIG_SHADEROBJOFFSET(s[FS].instroff) |
+			COND(s[FS].v, A5XX_HLSQ_FS_CONFIG_ENABLED));
+	OUT_RING(ring, A5XX_HLSQ_HS_CONFIG_CONSTOBJECTOFFSET(s[HS].constoff) |
+			A5XX_HLSQ_HS_CONFIG_SHADEROBJOFFSET(s[HS].instroff) |
+			COND(s[HS].v, A5XX_HLSQ_HS_CONFIG_ENABLED));
+	OUT_RING(ring, A5XX_HLSQ_DS_CONFIG_CONSTOBJECTOFFSET(s[DS].constoff) |
+			A5XX_HLSQ_DS_CONFIG_SHADEROBJOFFSET(s[DS].instroff) |
+			COND(s[DS].v, A5XX_HLSQ_DS_CONFIG_ENABLED));
+	OUT_RING(ring, A5XX_HLSQ_GS_CONFIG_CONSTOBJECTOFFSET(s[GS].constoff) |
+			A5XX_HLSQ_GS_CONFIG_SHADEROBJOFFSET(s[GS].instroff) |
+			COND(s[GS].v, A5XX_HLSQ_GS_CONFIG_ENABLED));
 
 	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONFIG, 1);
 	OUT_RING(ring, 0x00000000);
 
 	OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CNTL, 5);
-	OUT_RING(ring, A5XX_HLSQ_VS_CNTL_INSTRLEN(s[VS].instrlen));
-	OUT_RING(ring, A5XX_HLSQ_FS_CNTL_INSTRLEN(s[FS].instrlen));
-	OUT_RING(ring, A5XX_HLSQ_HS_CNTL_INSTRLEN(s[HS].instrlen));
-	OUT_RING(ring, A5XX_HLSQ_DS_CNTL_INSTRLEN(s[DS].instrlen));
-	OUT_RING(ring, A5XX_HLSQ_GS_CNTL_INSTRLEN(s[GS].instrlen));
+	OUT_RING(ring, A5XX_HLSQ_VS_CNTL_INSTRLEN(s[VS].instrlen) |
+			COND(s[VS].v && s[VS].v->has_ssbo, A5XX_HLSQ_VS_CNTL_SSBO_ENABLE));
+	OUT_RING(ring, A5XX_HLSQ_FS_CNTL_INSTRLEN(s[FS].instrlen) |
+			COND(s[FS].v && s[FS].v->has_ssbo, A5XX_HLSQ_FS_CNTL_SSBO_ENABLE));
+	OUT_RING(ring, A5XX_HLSQ_HS_CNTL_INSTRLEN(s[HS].instrlen) |
+			COND(s[HS].v && s[HS].v->has_ssbo, A5XX_HLSQ_HS_CNTL_SSBO_ENABLE));
+	OUT_RING(ring, A5XX_HLSQ_DS_CNTL_INSTRLEN(s[DS].instrlen) |
+			COND(s[DS].v && s[DS].v->has_ssbo, A5XX_HLSQ_DS_CNTL_SSBO_ENABLE));
+	OUT_RING(ring, A5XX_HLSQ_GS_CNTL_INSTRLEN(s[GS].instrlen) |
+			COND(s[GS].v && s[GS].v->has_ssbo, A5XX_HLSQ_GS_CNTL_SSBO_ENABLE));
 
-	OUT_PKT4(ring, REG_A5XX_SP_VS_CONTROL_REG, 5);
-	OUT_RING(ring, A5XX_SP_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
-			A5XX_SP_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff) |
-			COND(s[VS].v, A5XX_SP_VS_CONTROL_REG_ENABLED));
-	OUT_RING(ring, A5XX_SP_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
-			A5XX_SP_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff) |
-			COND(s[FS].v, A5XX_SP_FS_CONTROL_REG_ENABLED));
-	OUT_RING(ring, A5XX_SP_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
-			A5XX_SP_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff) |
-			COND(s[HS].v, A5XX_SP_HS_CONTROL_REG_ENABLED));
-	OUT_RING(ring, A5XX_SP_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
-			A5XX_SP_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff) |
-			COND(s[DS].v, A5XX_SP_DS_CONTROL_REG_ENABLED));
-	OUT_RING(ring, A5XX_SP_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
-			A5XX_SP_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff) |
-			COND(s[GS].v, A5XX_SP_GS_CONTROL_REG_ENABLED));
+	OUT_PKT4(ring, REG_A5XX_SP_VS_CONFIG, 5);
+	OUT_RING(ring, A5XX_SP_VS_CONFIG_CONSTOBJECTOFFSET(s[VS].constoff) |
+			A5XX_SP_VS_CONFIG_SHADEROBJOFFSET(s[VS].instroff) |
+			COND(s[VS].v, A5XX_SP_VS_CONFIG_ENABLED));
+	OUT_RING(ring, A5XX_SP_FS_CONFIG_CONSTOBJECTOFFSET(s[FS].constoff) |
+			A5XX_SP_FS_CONFIG_SHADEROBJOFFSET(s[FS].instroff) |
+			COND(s[FS].v, A5XX_SP_FS_CONFIG_ENABLED));
+	OUT_RING(ring, A5XX_SP_HS_CONFIG_CONSTOBJECTOFFSET(s[HS].constoff) |
+			A5XX_SP_HS_CONFIG_SHADEROBJOFFSET(s[HS].instroff) |
+			COND(s[HS].v, A5XX_SP_HS_CONFIG_ENABLED));
+	OUT_RING(ring, A5XX_SP_DS_CONFIG_CONSTOBJECTOFFSET(s[DS].constoff) |
+			A5XX_SP_DS_CONFIG_SHADEROBJOFFSET(s[DS].instroff) |
+			COND(s[DS].v, A5XX_SP_DS_CONFIG_ENABLED));
+	OUT_RING(ring, A5XX_SP_GS_CONFIG_CONSTOBJECTOFFSET(s[GS].constoff) |
+			A5XX_SP_GS_CONFIG_SHADEROBJOFFSET(s[GS].instroff) |
+			COND(s[GS].v, A5XX_SP_GS_CONFIG_ENABLED));
 
 	OUT_PKT4(ring, REG_A5XX_SP_CS_CONFIG, 1);
 	OUT_RING(ring, 0x00000000);
@@ -441,9 +441,9 @@
 	OUT_RING(ring, s[GS].constlen);    /* HLSQ_GS_CONSTLEN */
 	OUT_RING(ring, s[GS].instrlen);    /* HLSQ_GS_INSTRLEN */
 
-	OUT_PKT4(ring, REG_A5XX_HLSQ_CONTEXT_SWITCH_CS_SW_3, 2);
-	OUT_RING(ring, 0x00000000);   /* HLSQ_CONTEXT_SWITCH_CS_SW_3 */
-	OUT_RING(ring, 0x00000000);   /* HLSQ_CONTEXT_SWITCH_CS_SW_4 */
+	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONSTLEN, 2);
+	OUT_RING(ring, 0x00000000);        /* HLSQ_CS_CONSTLEN */
+	OUT_RING(ring, 0x00000000);        /* HLSQ_CS_INSTRLEN */
 
 	OUT_PKT4(ring, REG_A5XX_SP_VS_CTRL_REG0, 1);
 	OUT_RING(ring, A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) |
@@ -525,39 +525,35 @@
 	OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0);  /* SP_VS_OBJ_START_LO/HI */
 
 	if (s[VS].instrlen)
-		emit_shader(ring, s[VS].v);
+		fd5_emit_shader(ring, s[VS].v);
 
 	// TODO depending on other bits in this reg (if any) set somewhere else?
 	OUT_PKT4(ring, REG_A5XX_PC_PRIM_VTX_CNTL, 1);
 	OUT_RING(ring, COND(s[VS].v->writes_psize, A5XX_PC_PRIM_VTX_CNTL_PSIZE));
 
+	OUT_PKT4(ring, REG_A5XX_SP_PRIMITIVE_CNTL, 1);
+	OUT_RING(ring, A5XX_SP_PRIMITIVE_CNTL_VSOUT(l.cnt));
+
+	OUT_PKT4(ring, REG_A5XX_VPC_CNTL_0, 1);
+	OUT_RING(ring, A5XX_VPC_CNTL_0_STRIDE_IN_VPC(l.max_loc) |
+			COND(s[FS].v->total_in > 0, A5XX_VPC_CNTL_0_VARYING) |
+			COND(s[FS].v->frag_coord, A5XX_VPC_CNTL_0_VARYING) |
+			0x10000);    // XXX
+
+	fd5_context(ctx)->max_loc = l.max_loc;
+
 	if (emit->key.binning_pass) {
 		OUT_PKT4(ring, REG_A5XX_SP_FS_OBJ_START_LO, 2);
 		OUT_RING(ring, 0x00000000);    /* SP_FS_OBJ_START_LO */
 		OUT_RING(ring, 0x00000000);    /* SP_FS_OBJ_START_HI */
 	} else {
-		// TODO if some of these other bits depend on something other than
-		// program state we should probably move these next three regs:
-
-		OUT_PKT4(ring, REG_A5XX_SP_PRIMITIVE_CNTL, 1);
-		OUT_RING(ring, A5XX_SP_PRIMITIVE_CNTL_VSOUT(l.cnt));
-
-		OUT_PKT4(ring, REG_A5XX_VPC_CNTL_0, 1);
-		OUT_RING(ring, A5XX_VPC_CNTL_0_STRIDE_IN_VPC(l.max_loc) |
-				COND(s[FS].v->total_in > 0, A5XX_VPC_CNTL_0_VARYING) |
-				COND(s[FS].v->frag_coord, A5XX_VPC_CNTL_0_VARYING) |
-				0x10000);    // XXX
-
-		OUT_PKT4(ring, REG_A5XX_PC_PRIMITIVE_CNTL, 1);
-		OUT_RING(ring, A5XX_PC_PRIMITIVE_CNTL_STRIDE_IN_VPC(l.max_loc) |
-				0x400);      // XXX
-
 		OUT_PKT4(ring, REG_A5XX_SP_FS_OBJ_START_LO, 2);
 		OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0);  /* SP_FS_OBJ_START_LO/HI */
 	}
 
 	OUT_PKT4(ring, REG_A5XX_HLSQ_CONTROL_0_REG, 5);
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) |
+			A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE(TWO_QUADS) |
 			0x00000880);               /* XXX HLSQ_CONTROL_0 */
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD(63));
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
@@ -613,10 +609,12 @@
 					A5XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
 	}
 
-	if (emit->key.binning_pass) {
-		OUT_PKT4(ring, REG_A5XX_VPC_PACK, 1);
-		OUT_RING(ring, A5XX_VPC_PACK_NUMNONPOSVAR(0));
-	} else {
+
+	OUT_PKT4(ring, REG_A5XX_VPC_PACK, 1);
+	OUT_RING(ring, A5XX_VPC_PACK_NUMNONPOSVAR(s[FS].v->total_in) |
+			A5XX_VPC_PACK_PSIZELOC(psize_loc));
+
+	if (!emit->key.binning_pass) {
 		uint32_t vinterp[8], vpsrepl[8];
 
 		memset(vinterp, 0, sizeof(vinterp));
@@ -698,10 +696,6 @@
 			}
 		}
 
-		OUT_PKT4(ring, REG_A5XX_VPC_PACK, 1);
-		OUT_RING(ring, A5XX_VPC_PACK_NUMNONPOSVAR(s[FS].v->total_in) |
-				A5XX_VPC_PACK_PSIZELOC(psize_loc));
-
 		OUT_PKT4(ring, REG_A5XX_VPC_VARYING_INTERP_MODE(0), 8);
 		for (i = 0; i < 8; i++)
 			OUT_RING(ring, vinterp[i]);     /* VPC_VARYING_INTERP[i].MODE */
@@ -713,7 +707,7 @@
 
 	if (!emit->key.binning_pass)
 		if (s[FS].instrlen)
-			emit_shader(ring, s[FS].v);
+			fd5_emit_shader(ring, s[FS].v);
 
 	OUT_PKT4(ring, REG_A5XX_VFD_CONTROL_1, 5);
 	OUT_RING(ring, A5XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.h b/src/gallium/drivers/freedreno/a5xx/fd5_program.h
index adcd180..585263e 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_program.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.h
@@ -37,7 +37,10 @@
 
 struct fd5_emit;
 
-void fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit);
+void fd5_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so);
+
+void fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
+					  struct fd5_emit *emit);
 
 void fd5_prog_init(struct pipe_context *pctx);
 
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_query.c b/src/gallium/drivers/freedreno/a5xx/fd5_query.c
index 894c682..80b84ce 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_query.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_query.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
+ * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,9 +24,230 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
+/* NOTE: see https://github.com/freedreno/freedreno/wiki/A5xx-Queries */
+
+#include "freedreno_query_acc.h"
+#include "freedreno_resource.h"
+
+#include "fd5_context.h"
+#include "fd5_format.h"
 #include "fd5_query.h"
 
-void fd5_query_context_init(struct pipe_context *pctx)
+struct PACKED fd5_query_sample {
+	uint64_t start;    /* value written by the *_resume() cmdstream */
+	uint64_t result;   /* accumulated (stop - start), see the *_pause() cmdstream */
+	uint64_t stop;     /* value written by the *_pause() cmdstream */
+};
+
+#define query_sample(aq, field)                 \
+	fd_resource((aq)->prsc)->bo,                \
+	offsetof(struct fd5_query_sample, field),   \
+	0, 0  /* -> bo, offset of 'field', 0, 0: trailing args for OUT_RELOC()/OUT_RELOCW() */
+
+/*
+ * Occlusion Query:
+ *
+ * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
+ * interpret results
+ */
+
+static void
+occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
 {
-	/* TODO */
+	struct fd_ringbuffer *ring = batch->draw;
+	/* Resume sampling: set the COPY bit in RB_SAMPLE_COUNT_CONTROL.. */
+	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
+	OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);
+	/* ..with the 'start' field of this query's sample as the address: */
+	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
+	OUT_RELOCW(ring, query_sample(aq, start));
+	/* ZPASS_DONE event; presumably what triggers the counter write: */
+	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
+	OUT_RING(ring, ZPASS_DONE);
+	fd_reset_wfi(batch);
+	/* one more active samples-passed query on this context: */
+	fd5_context(batch->ctx)->samples_passed_queries++;
+}
+
+static void
+occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch)
+{
+	struct fd_ringbuffer *ring = batch->draw;
+	/* Pause sampling: first fill sample->stop with all-ones.. */
+	OUT_PKT7(ring, CP_MEM_WRITE, 4);
+	OUT_RELOCW(ring, query_sample(aq, stop));
+	OUT_RING(ring, 0xffffffff);
+	OUT_RING(ring, 0xffffffff);
+
+	OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);
+	/* ..then the same sequence as occlusion_resume(), targeting 'stop': */
+	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
+	OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);
+
+	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
+	OUT_RELOCW(ring, query_sample(aq, stop));
+
+	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
+	OUT_RING(ring, ZPASS_DONE);
+	fd_reset_wfi(batch);
+	/* presumably: wait until sample->stop no longer reads back as all-ones */
+	OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
+	OUT_RING(ring, 0x00000014);   // XXX
+	OUT_RELOC(ring, query_sample(aq, stop));
+	OUT_RING(ring, 0xffffffff);
+	OUT_RING(ring, 0xffffffff);
+	OUT_RING(ring, 0x00000010);   // XXX
+
+	/* result += stop - start: */
+	OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
+	OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
+			CP_MEM_TO_MEM_0_NEG_C);
+	OUT_RELOCW(ring, query_sample(aq, result));     /* dst */
+	OUT_RELOC(ring, query_sample(aq, result));      /* srcA */
+	OUT_RELOC(ring, query_sample(aq, stop));        /* srcB */
+	OUT_RELOC(ring, query_sample(aq, start));       /* srcC */
+	/* one fewer active samples-passed query on this context: */
+	fd5_context(batch->ctx)->samples_passed_queries--;
+}
+
+/* counter flavour: hand back the accumulated sample count unchanged */
+static void
+occlusion_counter_result(struct fd_context *ctx, void *buf,
+		union pipe_query_result *result)
+{
+	result->u64 = ((const struct fd5_query_sample *)buf)->result;
+}
+
+/* predicate flavour: true iff the accumulated sample count is non-zero */
+static void
+occlusion_predicate_result(struct fd_context *ctx, void *buf,
+		union pipe_query_result *result)
+{
+	result->b = (((const struct fd5_query_sample *)buf)->result != 0);
+}
+
+static const struct fd_acc_sample_provider occlusion_counter = {
+		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
+		.size = sizeof(struct fd5_query_sample),
+		.active = FD_STAGE_DRAW,
+		.pause = occlusion_pause,
+		.resume = occlusion_resume,
+		.result = occlusion_counter_result,     /* reports the raw count */
+};
+
+static const struct fd_acc_sample_provider occlusion_predicate = {
+		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
+		.size = sizeof(struct fd5_query_sample),
+		.active = FD_STAGE_DRAW,
+		.pause = occlusion_pause,
+		.resume = occlusion_resume,
+		.result = occlusion_predicate_result,   /* reports count != 0 */
+};
+
+/*
+ * Timestamp Queries:
+ */
+
+static void
+timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch)
+{
+	struct fd_ringbuffer *ring = batch->draw;
+	/* event write with the TIMESTAMP bit set, targeting sample->start: */
+	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
+	OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_AND_INV_EVENT) |
+			CP_EVENT_WRITE_0_TIMESTAMP);
+	OUT_RELOCW(ring, query_sample(aq, start));
+	OUT_RING(ring, 0x00000000);
+
+	fd_reset_wfi(batch);
+}
+
+static void
+timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch)
+{
+	struct fd_ringbuffer *ring = batch->draw;
+	/* same event write as in timestamp_resume(), targeting sample->stop: */
+	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
+	OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_AND_INV_EVENT) |
+			CP_EVENT_WRITE_0_TIMESTAMP);
+	OUT_RELOCW(ring, query_sample(aq, stop));
+	OUT_RING(ring, 0x00000000);
+
+	fd_reset_wfi(batch);
+	fd_wfi(batch, ring);
+
+	/* result += stop - start: */
+	OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
+	OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
+			CP_MEM_TO_MEM_0_NEG_C);
+	OUT_RELOCW(ring, query_sample(aq, result));     /* dst */
+	OUT_RELOC(ring, query_sample(aq, result));      /* srcA */
+	OUT_RELOC(ring, query_sample(aq, stop));        /* srcB */
+	OUT_RELOC(ring, query_sample(aq, start));       /* srcC */
+}
+
+static uint64_t
+ticks_to_ns(struct fd_context *ctx, uint64_t ts)
+{
+	/* Based on the 19.2MHz always-on rbbm timer (whole ns per tick).
+	 * 'ts' is 64b so that the 64b accumulated sample is not truncated
+	 * and the multiply cannot wrap at 32b.  TODO query rate from kernel..
+	 */
+	return ts * (1000000000 / 19200000);
+}
+
+/* elapsed time: the accumulated tick count, converted to nanoseconds */
+static void
+time_elapsed_accumulate_result(struct fd_context *ctx, void *buf,
+		union pipe_query_result *result)
+{
+	result->u64 = ticks_to_ns(ctx, ((struct fd5_query_sample *)buf)->result);
+}
+
+/* timestamp: the accumulated tick count, converted to nanoseconds */
+static void
+timestamp_accumulate_result(struct fd_context *ctx, void *buf,
+		union pipe_query_result *result)
+{
+	result->u64 = ticks_to_ns(ctx, ((struct fd5_query_sample *)buf)->result);
+}
+
+static const struct fd_acc_sample_provider time_elapsed = {
+		.query_type = PIPE_QUERY_TIME_ELAPSED,
+		.size = sizeof(struct fd5_query_sample),
+		.active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
+		.pause = timestamp_pause,
+		.resume = timestamp_resume,
+		.result = time_elapsed_accumulate_result,   /* ticks -> ns */
+};
+
+/* NOTE: timestamp query isn't going to give terribly sensible results
+ * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
+ * add in a binning pass, the results get even more nonsensical.  So
+ * we just return the timestamp on the first tile and hope that is
+ * kind of good enough.
+ */
+
+static const struct fd_acc_sample_provider timestamp = {
+		.query_type = PIPE_QUERY_TIMESTAMP,
+		.size = sizeof(struct fd5_query_sample),
+		.active = FD_STAGE_ALL,
+		.pause = timestamp_pause,
+		.resume = timestamp_resume,
+		.result = timestamp_accumulate_result,      /* ticks -> ns */
+};
+
+void
+fd5_query_context_init(struct pipe_context *pctx)
+{
+	struct fd_context *fdctx = fd_context(pctx);
+	/* queries are created and staged via the fd_acc_* helpers: */
+	fdctx->query_set_stage = fd_acc_query_set_stage;
+	fdctx->create_query = fd_acc_create_query;
+	/* occlusion providers: */
+	fd_acc_query_register_provider(pctx, &occlusion_counter);
+	fd_acc_query_register_provider(pctx, &occlusion_predicate);
+	/* time providers: */
+	fd_acc_query_register_provider(pctx, &time_elapsed);
+	fd_acc_query_register_provider(pctx, &timestamp);
 }
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c
index 822cbb9..2bbcbf2 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c
@@ -55,7 +55,6 @@
 		psize_max = cso->point_size;
 	}
 
-	so->gras_cl_clip_cntl = 0x80000; /* ??? */
 	so->gras_su_point_minmax =
 			A5XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) |
 			A5XX_GRAS_SU_POINT_MINMAX_MAX(psize_max);
@@ -69,13 +68,13 @@
 
 	so->gras_su_cntl =
 			A5XX_GRAS_SU_CNTL_LINEHALFWIDTH(cso->line_width/2.0);
-//	so->pc_prim_vtx_cntl2 =
-//		A5XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE(fd_polygon_mode(cso->fill_front)) |
-//		A5XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE(fd_polygon_mode(cso->fill_back));
+	so->pc_raster_cntl =
+		A5XX_PC_RASTER_CNTL_POLYMODE_FRONT_PTYPE(fd_polygon_mode(cso->fill_front)) |
+		A5XX_PC_RASTER_CNTL_POLYMODE_BACK_PTYPE(fd_polygon_mode(cso->fill_back));
 
-//	if (cso->fill_front != PIPE_POLYGON_MODE_FILL ||
-//		cso->fill_back != PIPE_POLYGON_MODE_FILL)
-//		so->pc_prim_vtx_cntl2 |= A5XX_PC_PRIM_VTX_CNTL2_POLYMODE_ENABLE;
+	if (cso->fill_front != PIPE_POLYGON_MODE_FILL ||
+		cso->fill_back != PIPE_POLYGON_MODE_FILL)
+		so->pc_raster_cntl |= A5XX_PC_RASTER_CNTL_POLYMODE_ENABLE;
 
 	if (cso->cull_face & PIPE_FACE_FRONT)
 		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_FRONT;
@@ -83,17 +82,17 @@
 		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_BACK;
 	if (!cso->front_ccw)
 		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_FRONT_CW;
-//	if (!cso->flatshade_first)
-//		so->pc_prim_vtx_cntl |= A5XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST;
-
 	if (cso->offset_tri)
 		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_POLY_OFFSET;
 
+	if (!cso->flatshade_first)
+		so->pc_primitive_cntl |= A5XX_PC_PRIMITIVE_CNTL_PROVOKING_VTX_LAST;
+
 //	if (!cso->depth_clip)
 //		so->gras_cl_clip_cntl |= A5XX_GRAS_CL_CLIP_CNTL_ZNEAR_CLIP_DISABLE |
 //			A5XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE;
-//	if (cso->clip_halfz)
-//		so->gras_cl_clip_cntl |= A5XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z;
+	if (cso->clip_halfz)
+		so->gras_cl_clip_cntl |= A5XX_GRAS_CL_CNTL_ZERO_GB_SCALE_Z;
 
 	return so;
 }
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.h b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.h
index 1c8771f..b597581 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.h
@@ -41,8 +41,8 @@
 
 	uint32_t gras_su_cntl;
 	uint32_t gras_cl_clip_cntl;
-	uint32_t pc_prim_vtx_cntl;
-	uint32_t pc_prim_vtx_cntl2;
+	uint32_t pc_primitive_cntl;
+	uint32_t pc_raster_cntl;
 };
 
 static inline struct fd5_rasterizer_stateobj *
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_texture.c b/src/gallium/drivers/freedreno/a5xx/fd5_texture.c
index 127cf33..87b69ea 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_texture.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_texture.c
@@ -123,7 +123,6 @@
 		A5XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, clamp_to_edge, &so->needs_border));
 
 	so->texsamp1 =
-		COND(miplinear, A5XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR) |
 		COND(!cso->seamless_cube_map, A5XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) |
 		COND(!cso->normalized_coords, A5XX_TEX_SAMP_1_UNNORM_COORDS);
 
@@ -212,8 +211,7 @@
 static bool
 use_astc_srgb_workaround(struct pipe_context *pctx, enum pipe_format format)
 {
-	return (fd_screen(pctx->screen)->gpu_id == 420) &&
-		(util_format_description(format)->layout == UTIL_FORMAT_LAYOUT_ASTC);
+	return false;  // TODO check if this is still needed on a5xx
 }
 
 static struct pipe_sampler_view *
@@ -223,7 +221,6 @@
 	struct fd5_pipe_sampler_view *so = CALLOC_STRUCT(fd5_pipe_sampler_view);
 	struct fd_resource *rsc = fd_resource(prsc);
 	unsigned lvl, layers;
-	uint32_t sz2 = 0;
 
 	if (!so)
 		return NULL;
@@ -300,8 +297,6 @@
 			A5XX_TEX_CONST_5_DEPTH(layers / 6);
 		break;
 	case PIPE_TEXTURE_3D:
-		while (lvl < cso->u.tex.last_level && sz2 != rsc->slices[lvl+1].size0)
-			sz2 = rsc->slices[++lvl].size0;
 		so->texconst3 =
 			A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->slices[lvl].size0);
 		so->texconst5 =
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_zsa.c b/src/gallium/drivers/freedreno/a5xx/fd5_zsa.c
index 7b2be93..495a4cc 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_zsa.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_zsa.c
@@ -45,6 +45,26 @@
 
 	so->base = *cso;
 
+	switch (cso->depth.func) {
+	case PIPE_FUNC_LESS:
+	case PIPE_FUNC_LEQUAL:
+		so->gras_lrz_cntl = A5XX_GRAS_LRZ_CNTL_ENABLE;
+		break;
+
+	case PIPE_FUNC_GREATER:
+	case PIPE_FUNC_GEQUAL:
+		so->gras_lrz_cntl = A5XX_GRAS_LRZ_CNTL_ENABLE | A5XX_GRAS_LRZ_CNTL_GREATER;
+		break;
+
+	default:
+		/* LRZ not enabled */
+		so->gras_lrz_cntl = 0;
+		break;
+	}
+
+	if (!(cso->stencil->enabled || cso->alpha.enabled || !cso->depth.writemask))
+		so->lrz_write = true;
+
 	so->rb_depth_cntl |=
 		A5XX_RB_DEPTH_CNTL_ZFUNC(cso->depth.func); /* maps 1:1 */
 
@@ -79,9 +99,9 @@
 				A5XX_RB_STENCIL_CONTROL_FAIL_BF(fd_stencil_op(bs->fail_op)) |
 				A5XX_RB_STENCIL_CONTROL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) |
 				A5XX_RB_STENCIL_CONTROL_ZFAIL_BF(fd_stencil_op(bs->zfail_op));
-//			so->rb_stencilrefmask_bf |=
-//				A5XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(bs->writemask) |
-//				A5XX_RB_STENCILREFMASK_BF_STENCILMASK(bs->valuemask);
+			so->rb_stencilrefmask_bf |=
+				A5XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(bs->writemask) |
+				A5XX_RB_STENCILREFMASK_BF_STENCILMASK(bs->valuemask);
 		}
 	}
 
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_zsa.h b/src/gallium/drivers/freedreno/a5xx/fd5_zsa.h
index 86bdd5f..c15ba1a 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_zsa.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_zsa.h
@@ -40,6 +40,9 @@
 	uint32_t rb_depth_cntl;
 	uint32_t rb_stencil_control;
 	uint32_t rb_stencilrefmask;
+	uint32_t rb_stencilrefmask_bf;
+	uint32_t gras_lrz_cntl;
+	bool lrz_write;
 };
 
 static inline struct fd5_zsa_stateobj *
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index 7fd527b..66a8450 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -8,17 +8,17 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2016-04-26 17:56:44)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2016-02-10 17:07:21)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32907 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  12025 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13324 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  31866 bytes, from 2017-06-02 15:50:23)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 111898 bytes, from 2017-05-30 19:25:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 142603 bytes, from 2017-06-06 17:02:32)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
 
-Copyright (C) 2013-2016 by the following authors:
+Copyright (C) 2013-2017 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 - Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
 
@@ -424,6 +424,35 @@
 #define REG_AXXX_CP_IB2_BUFSZ					0x0000045b
 
 #define REG_AXXX_CP_STAT					0x0000047f
+#define AXXX_CP_STAT_CP_BUSY					0x80000000
+#define AXXX_CP_STAT_VS_EVENT_FIFO_BUSY				0x40000000
+#define AXXX_CP_STAT_PS_EVENT_FIFO_BUSY				0x20000000
+#define AXXX_CP_STAT_CF_EVENT_FIFO_BUSY				0x10000000
+#define AXXX_CP_STAT_RB_EVENT_FIFO_BUSY				0x08000000
+#define AXXX_CP_STAT_ME_BUSY					0x04000000
+#define AXXX_CP_STAT_MIU_WR_C_BUSY				0x02000000
+#define AXXX_CP_STAT_CP_3D_BUSY					0x00800000
+#define AXXX_CP_STAT_CP_NRT_BUSY				0x00400000
+#define AXXX_CP_STAT_RBIU_SCRATCH_BUSY				0x00200000
+#define AXXX_CP_STAT_RCIU_ME_BUSY				0x00100000
+#define AXXX_CP_STAT_RCIU_PFP_BUSY				0x00080000
+#define AXXX_CP_STAT_MEQ_RING_BUSY				0x00040000
+#define AXXX_CP_STAT_PFP_BUSY					0x00020000
+#define AXXX_CP_STAT_ST_QUEUE_BUSY				0x00010000
+#define AXXX_CP_STAT_INDIRECT2_QUEUE_BUSY			0x00002000
+#define AXXX_CP_STAT_INDIRECTS_QUEUE_BUSY			0x00001000
+#define AXXX_CP_STAT_RING_QUEUE_BUSY				0x00000800
+#define AXXX_CP_STAT_CSF_BUSY					0x00000400
+#define AXXX_CP_STAT_CSF_ST_BUSY				0x00000200
+#define AXXX_CP_STAT_EVENT_BUSY					0x00000100
+#define AXXX_CP_STAT_CSF_INDIRECT2_BUSY				0x00000080
+#define AXXX_CP_STAT_CSF_INDIRECTS_BUSY				0x00000040
+#define AXXX_CP_STAT_CSF_RING_BUSY				0x00000020
+#define AXXX_CP_STAT_RCIU_BUSY					0x00000010
+#define AXXX_CP_STAT_RBIU_BUSY					0x00000008
+#define AXXX_CP_STAT_MIU_RD_RETURN_BUSY				0x00000004
+#define AXXX_CP_STAT_MIU_RD_REQ_BUSY				0x00000002
+#define AXXX_CP_STAT_MIU_WR_BUSY				0x00000001
 
 #define REG_AXXX_CP_SCRATCH_REG0				0x00000578
 
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index ca67878..9da798d 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -8,17 +8,17 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2016-04-26 17:56:44)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2016-02-10 17:07:21)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32907 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  12025 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13324 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  31866 bytes, from 2017-06-02 15:50:23)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 111898 bytes, from 2017-05-30 19:25:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 142603 bytes, from 2017-06-06 17:02:32)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
 
-Copyright (C) 2013-2016 by the following authors:
+Copyright (C) 2013-2017 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 - Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
 
@@ -76,7 +76,7 @@
 	UNK_1D = 29,
 	BLIT = 30,
 	UNK_25 = 37,
-	UNK_26 = 38,
+	LRZ_FLUSH = 38,
 	UNK_2C = 44,
 	UNK_2D = 45,
 };
@@ -142,11 +142,13 @@
 	CP_WAIT_IB_PFD_COMPLETE = 93,
 	CP_REG_RMW = 33,
 	CP_SET_BIN_DATA = 47,
+	CP_SET_BIN_DATA5 = 47,
 	CP_REG_TO_MEM = 62,
 	CP_MEM_WRITE = 61,
 	CP_MEM_WRITE_CNTR = 79,
 	CP_COND_EXEC = 68,
 	CP_COND_WRITE = 69,
+	CP_COND_WRITE5 = 69,
 	CP_EVENT_WRITE = 70,
 	CP_EVENT_WRITE_SHD = 88,
 	CP_EVENT_WRITE_CFL = 89,
@@ -173,6 +175,7 @@
 	CP_SET_PROTECTED_MODE = 95,
 	CP_BOOTSTRAP_UCODE = 111,
 	CP_LOAD_STATE = 48,
+	CP_LOAD_STATE4 = 48,
 	CP_COND_INDIRECT_BUFFER_PFE = 58,
 	CP_COND_INDIRECT_BUFFER_PFD = 50,
 	CP_INDIRECT_BUFFER_PFE = 63,
@@ -248,22 +251,61 @@
 	SS_INDIRECT_STM = 6,
 };
 
+enum a4xx_state_block {
+	SB4_VS_TEX = 0,
+	SB4_HS_TEX = 1,
+	SB4_DS_TEX = 2,
+	SB4_GS_TEX = 3,
+	SB4_FS_TEX = 4,
+	SB4_CS_TEX = 5,
+	SB4_VS_SHADER = 8,
+	SB4_HS_SHADER = 9,
+	SB4_DS_SHADER = 10,
+	SB4_GS_SHADER = 11,
+	SB4_FS_SHADER = 12,
+	SB4_CS_SHADER = 13,
+	SB4_SSBO = 14,
+	SB4_CS_SSBO = 15,
+};
+
+enum a4xx_state_type {
+	ST4_SHADER = 0,
+	ST4_CONSTANTS = 1,
+};
+
+enum a4xx_state_src {
+	SS4_DIRECT = 0,
+	SS4_INDIRECT = 2,
+};
+
 enum a4xx_index_size {
 	INDEX4_SIZE_8_BIT = 0,
 	INDEX4_SIZE_16_BIT = 1,
 	INDEX4_SIZE_32_BIT = 2,
 };
 
+enum cp_cond_function {
+	WRITE_ALWAYS = 0,
+	WRITE_LT = 1,
+	WRITE_LE = 2,
+	WRITE_EQ = 3,
+	WRITE_NE = 4,
+	WRITE_GE = 5,
+	WRITE_GT = 6,
+};
+
 enum render_mode_cmd {
 	BYPASS = 1,
 	BINNING = 2,
 	GMEM = 3,
 	BLIT2D = 5,
+	BLIT2DSCALE = 7,
 };
 
 enum cp_blit_cmd {
 	BLIT_OP_FILL = 0,
 	BLIT_OP_COPY = 1,
+	BLIT_OP_SCALE = 3,
 };
 
 #define REG_CP_LOAD_STATE_0					0x00000000
@@ -307,12 +349,53 @@
 	return ((val >> 2) << CP_LOAD_STATE_1_EXT_SRC_ADDR__SHIFT) & CP_LOAD_STATE_1_EXT_SRC_ADDR__MASK;
 }
 
-#define REG_CP_LOAD_STATE_2					0x00000002
-#define CP_LOAD_STATE_2_EXT_SRC_ADDR_HI__MASK			0xffffffff
-#define CP_LOAD_STATE_2_EXT_SRC_ADDR_HI__SHIFT			0
-static inline uint32_t CP_LOAD_STATE_2_EXT_SRC_ADDR_HI(uint32_t val)
+#define REG_CP_LOAD_STATE4_0					0x00000000
+#define CP_LOAD_STATE4_0_DST_OFF__MASK				0x0000ffff
+#define CP_LOAD_STATE4_0_DST_OFF__SHIFT				0
+static inline uint32_t CP_LOAD_STATE4_0_DST_OFF(uint32_t val)
 {
-	return ((val) << CP_LOAD_STATE_2_EXT_SRC_ADDR_HI__SHIFT) & CP_LOAD_STATE_2_EXT_SRC_ADDR_HI__MASK;
+	return ((val) << CP_LOAD_STATE4_0_DST_OFF__SHIFT) & CP_LOAD_STATE4_0_DST_OFF__MASK;
+}
+#define CP_LOAD_STATE4_0_STATE_SRC__MASK			0x00030000
+#define CP_LOAD_STATE4_0_STATE_SRC__SHIFT			16
+static inline uint32_t CP_LOAD_STATE4_0_STATE_SRC(enum a4xx_state_src val)
+{
+	return ((val) << CP_LOAD_STATE4_0_STATE_SRC__SHIFT) & CP_LOAD_STATE4_0_STATE_SRC__MASK;
+}
+#define CP_LOAD_STATE4_0_STATE_BLOCK__MASK			0x003c0000
+#define CP_LOAD_STATE4_0_STATE_BLOCK__SHIFT			18
+static inline uint32_t CP_LOAD_STATE4_0_STATE_BLOCK(enum a4xx_state_block val)
+{
+	return ((val) << CP_LOAD_STATE4_0_STATE_BLOCK__SHIFT) & CP_LOAD_STATE4_0_STATE_BLOCK__MASK;
+}
+#define CP_LOAD_STATE4_0_NUM_UNIT__MASK				0xffc00000
+#define CP_LOAD_STATE4_0_NUM_UNIT__SHIFT			22
+static inline uint32_t CP_LOAD_STATE4_0_NUM_UNIT(uint32_t val)
+{
+	return ((val) << CP_LOAD_STATE4_0_NUM_UNIT__SHIFT) & CP_LOAD_STATE4_0_NUM_UNIT__MASK;
+}
+
+#define REG_CP_LOAD_STATE4_1					0x00000001
+#define CP_LOAD_STATE4_1_STATE_TYPE__MASK			0x00000003
+#define CP_LOAD_STATE4_1_STATE_TYPE__SHIFT			0
+static inline uint32_t CP_LOAD_STATE4_1_STATE_TYPE(enum a4xx_state_type val)
+{
+	return ((val) << CP_LOAD_STATE4_1_STATE_TYPE__SHIFT) & CP_LOAD_STATE4_1_STATE_TYPE__MASK;
+}
+#define CP_LOAD_STATE4_1_EXT_SRC_ADDR__MASK			0xfffffffc
+#define CP_LOAD_STATE4_1_EXT_SRC_ADDR__SHIFT			2
+static inline uint32_t CP_LOAD_STATE4_1_EXT_SRC_ADDR(uint32_t val)
+{
+	assert(!(val & 0x3));
+	return ((val >> 2) << CP_LOAD_STATE4_1_EXT_SRC_ADDR__SHIFT) & CP_LOAD_STATE4_1_EXT_SRC_ADDR__MASK;
+}
+
+#define REG_CP_LOAD_STATE4_2					0x00000002
+#define CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI__MASK			0xffffffff
+#define CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI__SHIFT			0
+static inline uint32_t CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(uint32_t val)
+{
+	return ((val) << CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI__SHIFT) & CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI__MASK;
 }
 
 #define REG_CP_DRAW_INDX_0					0x00000000
@@ -581,6 +664,52 @@
 	return ((val) << CP_SET_BIN_DATA_1_BIN_SIZE_ADDRESS__SHIFT) & CP_SET_BIN_DATA_1_BIN_SIZE_ADDRESS__MASK;
 }
 
+#define REG_CP_SET_BIN_DATA5_0					0x00000000
+#define CP_SET_BIN_DATA5_0_VSC_SIZE__MASK			0x003f0000
+#define CP_SET_BIN_DATA5_0_VSC_SIZE__SHIFT			16
+static inline uint32_t CP_SET_BIN_DATA5_0_VSC_SIZE(uint32_t val)
+{
+	return ((val) << CP_SET_BIN_DATA5_0_VSC_SIZE__SHIFT) & CP_SET_BIN_DATA5_0_VSC_SIZE__MASK;
+}
+#define CP_SET_BIN_DATA5_0_VSC_N__MASK				0x07c00000
+#define CP_SET_BIN_DATA5_0_VSC_N__SHIFT				22
+static inline uint32_t CP_SET_BIN_DATA5_0_VSC_N(uint32_t val)
+{
+	return ((val) << CP_SET_BIN_DATA5_0_VSC_N__SHIFT) & CP_SET_BIN_DATA5_0_VSC_N__MASK;
+}
+
+#define REG_CP_SET_BIN_DATA5_1					0x00000001
+#define CP_SET_BIN_DATA5_1_BIN_DATA_ADDR_LO__MASK		0xffffffff
+#define CP_SET_BIN_DATA5_1_BIN_DATA_ADDR_LO__SHIFT		0
+static inline uint32_t CP_SET_BIN_DATA5_1_BIN_DATA_ADDR_LO(uint32_t val)
+{
+	return ((val) << CP_SET_BIN_DATA5_1_BIN_DATA_ADDR_LO__SHIFT) & CP_SET_BIN_DATA5_1_BIN_DATA_ADDR_LO__MASK;
+}
+
+#define REG_CP_SET_BIN_DATA5_2					0x00000002
+#define CP_SET_BIN_DATA5_2_BIN_DATA_ADDR_HI__MASK		0xffffffff
+#define CP_SET_BIN_DATA5_2_BIN_DATA_ADDR_HI__SHIFT		0
+static inline uint32_t CP_SET_BIN_DATA5_2_BIN_DATA_ADDR_HI(uint32_t val)
+{
+	return ((val) << CP_SET_BIN_DATA5_2_BIN_DATA_ADDR_HI__SHIFT) & CP_SET_BIN_DATA5_2_BIN_DATA_ADDR_HI__MASK;
+}
+
+#define REG_CP_SET_BIN_DATA5_3					0x00000003
+#define CP_SET_BIN_DATA5_3_BIN_SIZE_ADDRESS_LO__MASK		0xffffffff
+#define CP_SET_BIN_DATA5_3_BIN_SIZE_ADDRESS_LO__SHIFT		0
+static inline uint32_t CP_SET_BIN_DATA5_3_BIN_SIZE_ADDRESS_LO(uint32_t val)
+{
+	return ((val) << CP_SET_BIN_DATA5_3_BIN_SIZE_ADDRESS_LO__SHIFT) & CP_SET_BIN_DATA5_3_BIN_SIZE_ADDRESS_LO__MASK;
+}
+
+#define REG_CP_SET_BIN_DATA5_4					0x00000004
+#define CP_SET_BIN_DATA5_4_BIN_SIZE_ADDRESS_HI__MASK		0xffffffff
+#define CP_SET_BIN_DATA5_4_BIN_SIZE_ADDRESS_HI__SHIFT		0
+static inline uint32_t CP_SET_BIN_DATA5_4_BIN_SIZE_ADDRESS_HI(uint32_t val)
+{
+	return ((val) << CP_SET_BIN_DATA5_4_BIN_SIZE_ADDRESS_HI__SHIFT) & CP_SET_BIN_DATA5_4_BIN_SIZE_ADDRESS_HI__MASK;
+}
+
 #define REG_CP_REG_TO_MEM_0					0x00000000
 #define CP_REG_TO_MEM_0_REG__MASK				0x0000ffff
 #define CP_REG_TO_MEM_0_REG__SHIFT				0
@@ -605,6 +734,128 @@
 	return ((val) << CP_REG_TO_MEM_1_DEST__SHIFT) & CP_REG_TO_MEM_1_DEST__MASK;
 }
 
+#define REG_CP_MEM_TO_MEM_0					0x00000000
+#define CP_MEM_TO_MEM_0_NEG_A					0x00000001
+#define CP_MEM_TO_MEM_0_NEG_B					0x00000002
+#define CP_MEM_TO_MEM_0_NEG_C					0x00000004
+#define CP_MEM_TO_MEM_0_DOUBLE					0x20000000
+
+#define REG_CP_COND_WRITE_0					0x00000000
+#define CP_COND_WRITE_0_FUNCTION__MASK				0x00000007
+#define CP_COND_WRITE_0_FUNCTION__SHIFT				0
+static inline uint32_t CP_COND_WRITE_0_FUNCTION(enum cp_cond_function val)
+{
+	return ((val) << CP_COND_WRITE_0_FUNCTION__SHIFT) & CP_COND_WRITE_0_FUNCTION__MASK;
+}
+#define CP_COND_WRITE_0_POLL_MEMORY				0x00000010
+#define CP_COND_WRITE_0_WRITE_MEMORY				0x00000100
+
+#define REG_CP_COND_WRITE_1					0x00000001
+#define CP_COND_WRITE_1_POLL_ADDR__MASK				0xffffffff
+#define CP_COND_WRITE_1_POLL_ADDR__SHIFT			0
+static inline uint32_t CP_COND_WRITE_1_POLL_ADDR(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE_1_POLL_ADDR__SHIFT) & CP_COND_WRITE_1_POLL_ADDR__MASK;
+}
+
+#define REG_CP_COND_WRITE_2					0x00000002
+#define CP_COND_WRITE_2_REF__MASK				0xffffffff
+#define CP_COND_WRITE_2_REF__SHIFT				0
+static inline uint32_t CP_COND_WRITE_2_REF(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE_2_REF__SHIFT) & CP_COND_WRITE_2_REF__MASK;
+}
+
+#define REG_CP_COND_WRITE_3					0x00000003
+#define CP_COND_WRITE_3_MASK__MASK				0xffffffff
+#define CP_COND_WRITE_3_MASK__SHIFT				0
+static inline uint32_t CP_COND_WRITE_3_MASK(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE_3_MASK__SHIFT) & CP_COND_WRITE_3_MASK__MASK;
+}
+
+#define REG_CP_COND_WRITE_4					0x00000004
+#define CP_COND_WRITE_4_WRITE_ADDR__MASK			0xffffffff
+#define CP_COND_WRITE_4_WRITE_ADDR__SHIFT			0
+static inline uint32_t CP_COND_WRITE_4_WRITE_ADDR(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE_4_WRITE_ADDR__SHIFT) & CP_COND_WRITE_4_WRITE_ADDR__MASK;
+}
+
+#define REG_CP_COND_WRITE_5					0x00000005
+#define CP_COND_WRITE_5_WRITE_DATA__MASK			0xffffffff
+#define CP_COND_WRITE_5_WRITE_DATA__SHIFT			0
+static inline uint32_t CP_COND_WRITE_5_WRITE_DATA(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE_5_WRITE_DATA__SHIFT) & CP_COND_WRITE_5_WRITE_DATA__MASK;
+}
+
+#define REG_CP_COND_WRITE5_0					0x00000000
+#define CP_COND_WRITE5_0_FUNCTION__MASK				0x00000007
+#define CP_COND_WRITE5_0_FUNCTION__SHIFT			0
+static inline uint32_t CP_COND_WRITE5_0_FUNCTION(enum cp_cond_function val)
+{
+	return ((val) << CP_COND_WRITE5_0_FUNCTION__SHIFT) & CP_COND_WRITE5_0_FUNCTION__MASK;
+}
+#define CP_COND_WRITE5_0_POLL_MEMORY				0x00000010
+#define CP_COND_WRITE5_0_WRITE_MEMORY				0x00000100
+
+#define REG_CP_COND_WRITE5_1					0x00000001
+#define CP_COND_WRITE5_1_POLL_ADDR_LO__MASK			0xffffffff
+#define CP_COND_WRITE5_1_POLL_ADDR_LO__SHIFT			0
+static inline uint32_t CP_COND_WRITE5_1_POLL_ADDR_LO(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE5_1_POLL_ADDR_LO__SHIFT) & CP_COND_WRITE5_1_POLL_ADDR_LO__MASK;
+}
+
+#define REG_CP_COND_WRITE5_2					0x00000002
+#define CP_COND_WRITE5_2_POLL_ADDR_HI__MASK			0xffffffff
+#define CP_COND_WRITE5_2_POLL_ADDR_HI__SHIFT			0
+static inline uint32_t CP_COND_WRITE5_2_POLL_ADDR_HI(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE5_2_POLL_ADDR_HI__SHIFT) & CP_COND_WRITE5_2_POLL_ADDR_HI__MASK;
+}
+
+#define REG_CP_COND_WRITE5_3					0x00000003
+#define CP_COND_WRITE5_3_REF__MASK				0xffffffff
+#define CP_COND_WRITE5_3_REF__SHIFT				0
+static inline uint32_t CP_COND_WRITE5_3_REF(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE5_3_REF__SHIFT) & CP_COND_WRITE5_3_REF__MASK;
+}
+
+#define REG_CP_COND_WRITE5_4					0x00000004
+#define CP_COND_WRITE5_4_MASK__MASK				0xffffffff
+#define CP_COND_WRITE5_4_MASK__SHIFT				0
+static inline uint32_t CP_COND_WRITE5_4_MASK(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE5_4_MASK__SHIFT) & CP_COND_WRITE5_4_MASK__MASK;
+}
+
+#define REG_CP_COND_WRITE5_5					0x00000005
+#define CP_COND_WRITE5_5_WRITE_ADDR_LO__MASK			0xffffffff
+#define CP_COND_WRITE5_5_WRITE_ADDR_LO__SHIFT			0
+static inline uint32_t CP_COND_WRITE5_5_WRITE_ADDR_LO(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE5_5_WRITE_ADDR_LO__SHIFT) & CP_COND_WRITE5_5_WRITE_ADDR_LO__MASK;
+}
+
+#define REG_CP_COND_WRITE5_6					0x00000006
+#define CP_COND_WRITE5_6_WRITE_ADDR_HI__MASK			0xffffffff
+#define CP_COND_WRITE5_6_WRITE_ADDR_HI__SHIFT			0
+static inline uint32_t CP_COND_WRITE5_6_WRITE_ADDR_HI(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE5_6_WRITE_ADDR_HI__SHIFT) & CP_COND_WRITE5_6_WRITE_ADDR_HI__MASK;
+}
+
+#define REG_CP_COND_WRITE5_7					0x00000007
+#define CP_COND_WRITE5_7_WRITE_DATA__MASK			0xffffffff
+#define CP_COND_WRITE5_7_WRITE_DATA__SHIFT			0
+static inline uint32_t CP_COND_WRITE5_7_WRITE_DATA(uint32_t val)
+{
+	return ((val) << CP_COND_WRITE5_7_WRITE_DATA__SHIFT) & CP_COND_WRITE5_7_WRITE_DATA__MASK;
+}
+
 #define REG_CP_DISPATCH_COMPUTE_0				0x00000000
 
 #define REG_CP_DISPATCH_COMPUTE_1				0x00000001
@@ -656,6 +907,7 @@
 }
 
 #define REG_CP_SET_RENDER_MODE_3				0x00000003
+#define CP_SET_RENDER_MODE_3_VSC_ENABLE				0x00000008
 #define CP_SET_RENDER_MODE_3_GMEM_ENABLE			0x00000010
 
 #define REG_CP_SET_RENDER_MODE_4				0x00000004
@@ -684,6 +936,50 @@
 	return ((val) << CP_SET_RENDER_MODE_7_ADDR_1_HI__SHIFT) & CP_SET_RENDER_MODE_7_ADDR_1_HI__MASK;
 }
 
+#define REG_CP_COMPUTE_CHECKPOINT_0				0x00000000
+#define CP_COMPUTE_CHECKPOINT_0_ADDR_0_LO__MASK			0xffffffff
+#define CP_COMPUTE_CHECKPOINT_0_ADDR_0_LO__SHIFT		0
+static inline uint32_t CP_COMPUTE_CHECKPOINT_0_ADDR_0_LO(uint32_t val)
+{
+	return ((val) << CP_COMPUTE_CHECKPOINT_0_ADDR_0_LO__SHIFT) & CP_COMPUTE_CHECKPOINT_0_ADDR_0_LO__MASK;
+}
+
+#define REG_CP_COMPUTE_CHECKPOINT_1				0x00000001
+#define CP_COMPUTE_CHECKPOINT_1_ADDR_0_HI__MASK			0xffffffff
+#define CP_COMPUTE_CHECKPOINT_1_ADDR_0_HI__SHIFT		0
+static inline uint32_t CP_COMPUTE_CHECKPOINT_1_ADDR_0_HI(uint32_t val)
+{
+	return ((val) << CP_COMPUTE_CHECKPOINT_1_ADDR_0_HI__SHIFT) & CP_COMPUTE_CHECKPOINT_1_ADDR_0_HI__MASK;
+}
+
+#define REG_CP_COMPUTE_CHECKPOINT_2				0x00000002
+
+#define REG_CP_COMPUTE_CHECKPOINT_3				0x00000003
+
+#define REG_CP_COMPUTE_CHECKPOINT_4				0x00000004
+#define CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN__MASK		0xffffffff
+#define CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN__SHIFT		0
+static inline uint32_t CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN(uint32_t val)
+{
+	return ((val) << CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN__SHIFT) & CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN__MASK;
+}
+
+#define REG_CP_COMPUTE_CHECKPOINT_5				0x00000005
+#define CP_COMPUTE_CHECKPOINT_5_ADDR_1_LO__MASK			0xffffffff
+#define CP_COMPUTE_CHECKPOINT_5_ADDR_1_LO__SHIFT		0
+static inline uint32_t CP_COMPUTE_CHECKPOINT_5_ADDR_1_LO(uint32_t val)
+{
+	return ((val) << CP_COMPUTE_CHECKPOINT_5_ADDR_1_LO__SHIFT) & CP_COMPUTE_CHECKPOINT_5_ADDR_1_LO__MASK;
+}
+
+#define REG_CP_COMPUTE_CHECKPOINT_6				0x00000006
+#define CP_COMPUTE_CHECKPOINT_6_ADDR_1_HI__MASK			0xffffffff
+#define CP_COMPUTE_CHECKPOINT_6_ADDR_1_HI__SHIFT		0
+static inline uint32_t CP_COMPUTE_CHECKPOINT_6_ADDR_1_HI(uint32_t val)
+{
+	return ((val) << CP_COMPUTE_CHECKPOINT_6_ADDR_1_HI__SHIFT) & CP_COMPUTE_CHECKPOINT_6_ADDR_1_HI__MASK;
+}
+
 #define REG_CP_PERFCOUNTER_ACTION_0				0x00000000
 
 #define REG_CP_PERFCOUNTER_ACTION_1				0x00000001
@@ -709,6 +1005,7 @@
 {
 	return ((val) << CP_EVENT_WRITE_0_EVENT__SHIFT) & CP_EVENT_WRITE_0_EVENT__MASK;
 }
+#define CP_EVENT_WRITE_0_TIMESTAMP				0x40000000
 
 #define REG_CP_EVENT_WRITE_1					0x00000001
 #define CP_EVENT_WRITE_1_ADDR_0_LO__MASK			0xffffffff
@@ -792,5 +1089,31 @@
 	return ((val) << CP_BLIT_4_DST_Y2__SHIFT) & CP_BLIT_4_DST_Y2__MASK;
 }
 
+#define REG_CP_EXEC_CS_0					0x00000000
+
+#define REG_CP_EXEC_CS_1					0x00000001
+#define CP_EXEC_CS_1_NGROUPS_X__MASK				0xffffffff
+#define CP_EXEC_CS_1_NGROUPS_X__SHIFT				0
+static inline uint32_t CP_EXEC_CS_1_NGROUPS_X(uint32_t val)
+{
+	return ((val) << CP_EXEC_CS_1_NGROUPS_X__SHIFT) & CP_EXEC_CS_1_NGROUPS_X__MASK;
+}
+
+#define REG_CP_EXEC_CS_2					0x00000002
+#define CP_EXEC_CS_2_NGROUPS_Y__MASK				0xffffffff
+#define CP_EXEC_CS_2_NGROUPS_Y__SHIFT				0
+static inline uint32_t CP_EXEC_CS_2_NGROUPS_Y(uint32_t val)
+{
+	return ((val) << CP_EXEC_CS_2_NGROUPS_Y__SHIFT) & CP_EXEC_CS_2_NGROUPS_Y__MASK;
+}
+
+#define REG_CP_EXEC_CS_3					0x00000003
+#define CP_EXEC_CS_3_NGROUPS_Z__MASK				0xffffffff
+#define CP_EXEC_CS_3_NGROUPS_Z__SHIFT				0
+static inline uint32_t CP_EXEC_CS_3_NGROUPS_Z(uint32_t val)
+{
+	return ((val) << CP_EXEC_CS_3_NGROUPS_Z__SHIFT) & CP_EXEC_CS_3_NGROUPS_Z__MASK;
+}
+
 
 #endif /* ADRENO_PM4_XML */
diff --git a/src/gallium/drivers/freedreno/disasm.h b/src/gallium/drivers/freedreno/disasm.h
index e81dd1c..579dd50 100644
--- a/src/gallium/drivers/freedreno/disasm.h
+++ b/src/gallium/drivers/freedreno/disasm.h
@@ -26,8 +26,12 @@
 
 enum shader_t {
 	SHADER_VERTEX,
+	SHADER_TCS,
+	SHADER_TES,
+	SHADER_GEOM,
 	SHADER_FRAGMENT,
 	SHADER_COMPUTE,
+	SHADER_MAX,
 };
 
 /* bitmask of debug flags */
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c
index 5cd6a69..c2142b5 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.c
+++ b/src/gallium/drivers/freedreno/freedreno_batch.c
@@ -76,14 +76,14 @@
 	batch->max_scissor.minx = batch->max_scissor.miny = ~0;
 	batch->max_scissor.maxx = batch->max_scissor.maxy = 0;
 
-	util_dynarray_init(&batch->draw_patches);
+	util_dynarray_init(&batch->draw_patches, NULL);
 
 	if (is_a3xx(ctx->screen))
-		util_dynarray_init(&batch->rbrc_patches);
+		util_dynarray_init(&batch->rbrc_patches, NULL);
 
 	assert(batch->resources->entries == 0);
 
-	util_dynarray_init(&batch->samples);
+	util_dynarray_init(&batch->samples, NULL);
 }
 
 struct fd_batch *
@@ -118,6 +118,10 @@
 	fd_ringbuffer_del(batch->draw);
 	fd_ringbuffer_del(batch->binning);
 	fd_ringbuffer_del(batch->gmem);
+	if (batch->lrz_clear) {
+		fd_ringbuffer_del(batch->lrz_clear);
+		batch->lrz_clear = NULL;
+	}
 
 	util_dynarray_fini(&batch->draw_patches);
 
@@ -262,9 +266,9 @@
 	/* close out the draw cmds by making sure any active queries are
 	 * paused:
 	 */
-	fd_hw_query_set_stage(batch, batch->draw, FD_STAGE_NULL);
+	fd_batch_set_stage(batch, FD_STAGE_NULL);
 
-	batch->ctx->dirty = ~0;
+	fd_context_all_dirty(batch->ctx);
 	batch_flush_reset_dependencies(batch, true);
 
 	if (batch->ctx->screen->reorder) {
@@ -272,7 +276,7 @@
 		fd_batch_reference(&tmp, batch);
 
 		if (!util_queue_is_initialized(&batch->ctx->flush_queue))
-			util_queue_init(&batch->ctx->flush_queue, "flush_queue", 16, 1);
+			util_queue_init(&batch->ctx->flush_queue, "flush_queue", 16, 1, 0);
 
 		util_queue_add_job(&batch->ctx->flush_queue,
 				batch, &batch->flush_fence,
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h
index 095d214..d6a818a 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.h
+++ b/src/gallium/drivers/freedreno/freedreno_batch.h
@@ -154,6 +154,9 @@
 	/** tiling/gmem (IB0) cmdstream: */
 	struct fd_ringbuffer *gmem;
 
+	// TODO maybe more generically split out clear and clear_binning rings?
+	struct fd_ringbuffer *lrz_clear;
+
 	/**
 	 * hw query related state:
 	 */
diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 1a0a71c..1cf366b 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -121,7 +121,6 @@
 	fd_fence_ref(pctx->screen, &ctx->last_fence, NULL);
 
 	fd_prog_fini(pctx);
-	fd_hw_query_fini(pctx);
 
 	if (ctx->blitter)
 		util_blitter_destroy(ctx->blitter);
@@ -211,7 +210,7 @@
 			}});
 	ctx->solid_vbuf_state.vertexbuf.count = 1;
 	ctx->solid_vbuf_state.vertexbuf.vb[0].stride = 12;
-	ctx->solid_vbuf_state.vertexbuf.vb[0].buffer = ctx->solid_vbuf;
+	ctx->solid_vbuf_state.vertexbuf.vb[0].buffer.resource = ctx->solid_vbuf;
 
 	/* setup blit_vbuf_state: */
 	ctx->blit_vbuf_state.vtx = pctx->create_vertex_elements_state(
@@ -226,9 +225,9 @@
 			}});
 	ctx->blit_vbuf_state.vertexbuf.count = 2;
 	ctx->blit_vbuf_state.vertexbuf.vb[0].stride = 8;
-	ctx->blit_vbuf_state.vertexbuf.vb[0].buffer = ctx->blit_texcoord_vbuf;
+	ctx->blit_vbuf_state.vertexbuf.vb[0].buffer.resource = ctx->blit_texcoord_vbuf;
 	ctx->blit_vbuf_state.vertexbuf.vb[1].stride = 12;
-	ctx->blit_vbuf_state.vertexbuf.vb[1].buffer = ctx->solid_vbuf;
+	ctx->blit_vbuf_state.vertexbuf.vb[1].buffer.resource = ctx->solid_vbuf;
 }
 
 void
@@ -287,7 +286,6 @@
 	fd_query_context_init(pctx);
 	fd_texture_init(pctx);
 	fd_state_init(pctx);
-	fd_hw_query_init(pctx);
 
 	ctx->blitter = util_blitter_create(pctx);
 	if (!ctx->blitter)
@@ -297,6 +295,9 @@
 	if (!ctx->primconvert)
 		goto fail;
 
+	list_inithead(&ctx->hw_active_queries);
+	list_inithead(&ctx->acc_active_queries);
+
 	return pctx;
 
 fail:
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index cb33b8c..4472afb 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -72,6 +72,12 @@
 	uint32_t dirty_mask;
 };
 
+struct fd_shaderbuf_stateobj {
+	struct pipe_shader_buffer sb[PIPE_MAX_SHADER_BUFFERS];
+	uint32_t enabled_mask;
+	uint32_t dirty_mask;
+};
+
 struct fd_vertexbuf_stateobj {
 	struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS];
 	unsigned count;
@@ -107,6 +113,43 @@
 	struct fd_vertexbuf_stateobj vertexbuf;
 };
 
+/* global 3d pipeline dirty state: */
+enum fd_dirty_3d_state {
+	FD_DIRTY_BLEND       = BIT(0),
+	FD_DIRTY_RASTERIZER  = BIT(1),
+	FD_DIRTY_ZSA         = BIT(2),
+	FD_DIRTY_BLEND_COLOR = BIT(3),
+	FD_DIRTY_STENCIL_REF = BIT(4),
+	FD_DIRTY_SAMPLE_MASK = BIT(5),
+	FD_DIRTY_FRAMEBUFFER = BIT(6),
+	FD_DIRTY_STIPPLE     = BIT(7),
+	FD_DIRTY_VIEWPORT    = BIT(8),
+	FD_DIRTY_VTXSTATE    = BIT(9),
+	FD_DIRTY_VTXBUF      = BIT(10),
+
+	FD_DIRTY_SCISSOR     = BIT(12),
+	FD_DIRTY_STREAMOUT   = BIT(13),
+	FD_DIRTY_UCP         = BIT(14),
+	FD_DIRTY_BLEND_DUAL  = BIT(15),
+
+	/* These are a bit redundant with fd_dirty_shader_state, and possibly
+	 * should be removed.  (But OTOH kinda convenient in some places)
+	 */
+	FD_DIRTY_PROG        = BIT(16),
+	FD_DIRTY_CONST       = BIT(17),
+	FD_DIRTY_TEX         = BIT(18),
+
+	/* only used by a2xx.. possibly can be removed.. */
+	FD_DIRTY_TEXSTATE    = BIT(19),
+};
+
+/* per shader-stage dirty state: */
+enum fd_dirty_shader_state {
+	FD_DIRTY_SHADER_PROG  = BIT(0),
+	FD_DIRTY_SHADER_CONST = BIT(1),
+	FD_DIRTY_SHADER_TEX   = BIT(2),
+	FD_DIRTY_SHADER_SSBO  = BIT(3),
+};
 
 struct fd_context {
 	struct pipe_context base;
@@ -123,15 +166,26 @@
 	/* slab for pipe_transfer allocations: */
 	struct slab_child_pool transfer_pool;
 
+	/**
+	 * query related state:
+	 */
+	/*@{*/
 	/* slabs for fd_hw_sample and fd_hw_sample_period allocations: */
 	struct slab_mempool sample_pool;
 	struct slab_mempool sample_period_pool;
 
 	/* sample-providers for hw queries: */
-	const struct fd_hw_sample_provider *sample_providers[MAX_HW_SAMPLE_PROVIDERS];
+	const struct fd_hw_sample_provider *hw_sample_providers[MAX_HW_SAMPLE_PROVIDERS];
 
 	/* list of active queries: */
-	struct list_head active_queries;
+	struct list_head hw_active_queries;
+
+	/* sample-providers for accumulating hw queries: */
+	const struct fd_acc_sample_provider *acc_sample_providers[MAX_HW_SAMPLE_PROVIDERS];
+
+	/* list of active accumulating queries: */
+	struct list_head acc_active_queries;
+	/*@}*/
 
 	/* table with PIPE_PRIM_MAX entries mapping PIPE_PRIM_x to
 	 * DI_PT_x value to use for draw initiator.  There are some
@@ -192,44 +246,21 @@
 	 * means we'd always have to recalc tiles ever batch)
 	 */
 	struct fd_gmem_stateobj gmem;
-	struct fd_vsc_pipe      pipe[8];
+	struct fd_vsc_pipe      pipe[16];
 	struct fd_tile          tile[512];
 
 	/* which state objects need to be re-emit'd: */
-	enum {
-		FD_DIRTY_BLEND       = (1 <<  0),
-		FD_DIRTY_RASTERIZER  = (1 <<  1),
-		FD_DIRTY_ZSA         = (1 <<  2),
-		FD_DIRTY_FRAGTEX     = (1 <<  3),
-		FD_DIRTY_VERTTEX     = (1 <<  4),
-		FD_DIRTY_TEXSTATE    = (1 <<  5),
+	enum fd_dirty_3d_state dirty;
 
-		FD_SHADER_DIRTY_VP   = (1 <<  6),
-		FD_SHADER_DIRTY_FP   = (1 <<  7),
-		/* skip geom/tcs/tes/compute */
-		FD_DIRTY_PROG        = FD_SHADER_DIRTY_FP | FD_SHADER_DIRTY_VP,
+	/* per shader-stage dirty status: */
+	enum fd_dirty_shader_state dirty_shader[PIPE_SHADER_TYPES];
 
-		FD_DIRTY_BLEND_COLOR = (1 << 12),
-		FD_DIRTY_STENCIL_REF = (1 << 13),
-		FD_DIRTY_SAMPLE_MASK = (1 << 14),
-		FD_DIRTY_FRAMEBUFFER = (1 << 15),
-		FD_DIRTY_STIPPLE     = (1 << 16),
-		FD_DIRTY_VIEWPORT    = (1 << 17),
-		FD_DIRTY_CONSTBUF    = (1 << 18),
-		FD_DIRTY_VTXSTATE    = (1 << 19),
-		FD_DIRTY_VTXBUF      = (1 << 20),
-		FD_DIRTY_INDEXBUF    = (1 << 21),
-		FD_DIRTY_SCISSOR     = (1 << 22),
-		FD_DIRTY_STREAMOUT   = (1 << 23),
-		FD_DIRTY_UCP         = (1 << 24),
-		FD_DIRTY_BLEND_DUAL  = (1 << 25),
-	} dirty;
-
+	void *compute;
 	struct pipe_blend_state *blend;
 	struct pipe_rasterizer_state *rasterizer;
 	struct pipe_depth_stencil_alpha_state *zsa;
 
-	struct fd_texture_stateobj verttex, fragtex;
+	struct fd_texture_stateobj tex[PIPE_SHADER_TYPES];
 
 	struct fd_program_stateobj prog;
 
@@ -241,7 +272,7 @@
 	struct pipe_poly_stipple stipple;
 	struct pipe_viewport_state viewport;
 	struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
-	struct pipe_index_buffer indexbuf;
+	struct fd_shaderbuf_stateobj shaderbuf[PIPE_SHADER_TYPES];
 	struct fd_streamout_stateobj streamout;
 	struct pipe_clip_state ucp;
 
@@ -264,10 +295,14 @@
 	void (*emit_sysmem_fini)(struct fd_batch *batch);
 
 	/* draw: */
-	bool (*draw_vbo)(struct fd_context *ctx, const struct pipe_draw_info *info);
-	void (*clear)(struct fd_context *ctx, unsigned buffers,
+	bool (*draw_vbo)(struct fd_context *ctx, const struct pipe_draw_info *info,
+                         unsigned index_offset);
+	bool (*clear)(struct fd_context *ctx, unsigned buffers,
 			const union pipe_color_union *color, double depth, unsigned stencil);
 
+	/* compute: */
+	void (*launch_grid)(struct fd_context *ctx, const struct pipe_grid_info *info);
+
 	/* constant emit:  (note currently not used/needed for a2xx) */
 	void (*emit_const)(struct fd_ringbuffer *ring, enum shader_t type,
 			uint32_t regid, uint32_t offset, uint32_t sizedwords,
@@ -279,6 +314,13 @@
 	/* indirect-branch emit: */
 	void (*emit_ib)(struct fd_ringbuffer *ring, struct fd_ringbuffer *target);
 
+	/* query: */
+	struct fd_query * (*create_query)(struct fd_context *ctx, unsigned query_type);
+	void (*query_prepare)(struct fd_batch *batch, uint32_t num_tiles);
+	void (*query_prepare_tile)(struct fd_batch *batch, uint32_t n,
+			struct fd_ringbuffer *ring);
+	void (*query_set_stage)(struct fd_batch *batch, enum fd_render_stage stage);
+
 	/*
 	 * Common pre-cooked VBO state (used for a3xx and later):
 	 */
@@ -325,6 +367,31 @@
 	mtx_unlock(&ctx->screen->lock);
 }
 
+/* Flag every 3d state bit and every per-stage shader state bit as dirty: */
+static inline void
+fd_context_all_dirty(struct fd_context *ctx)
+{
+	for (unsigned stage = 0; stage < PIPE_SHADER_TYPES; stage++)
+		ctx->dirty_shader[stage] = ~0;
+	ctx->dirty = ~0;
+}
+
+static inline void
+fd_context_all_clean(struct fd_context *ctx)
+{
+	/* Clear the 3d dirty bits and the per-stage bits of every stage
+	 * except compute.  Compute state is not emitted during a normal
+	 * draw call, so its dirty bits are left untouched here.
+	 * (Marking compute dirty from _all_dirty() is harmless, but
+	 * the inverse is not true.)
+	 */
+	for (unsigned stage = 0; stage < PIPE_SHADER_TYPES; stage++) {
+		if (stage != PIPE_SHADER_COMPUTE)
+			ctx->dirty_shader[stage] = 0;
+	}
+	ctx->dirty = 0;
+}
+
 static inline struct pipe_scissor_state *
 fd_context_get_scissor(struct fd_context *ctx)
 {
@@ -339,6 +406,27 @@
 	return (1 << prim) & ctx->primtype_mask;
 }
 
+static inline void
+fd_batch_set_stage(struct fd_batch *batch, enum fd_render_stage stage)
+{
+	/* Internal blits (e.g. mipmap level generation) are issued through
+	 * the regular draw path by util_blitter_blit(), which will try to
+	 * set FD_STAGE_DRAW.  While the batch is in FD_STAGE_BLIT, ignore
+	 * every stage change other than FD_STAGE_NULL, so that queries
+	 * which should be paused during internal blits are not enabled:
+	 */
+	const bool in_blit = (batch->stage == FD_STAGE_BLIT);
+
+	if (in_blit && (stage != FD_STAGE_NULL))
+		return;
+
+	/* let the per-gen query code react before the stage is updated: */
+	if (batch->ctx->query_set_stage)
+		batch->ctx->query_set_stage(batch, stage);
+
+	batch->stage = stage;
+}
+
 void fd_context_setup_common_vbos(struct fd_context *ctx);
 void fd_context_cleanup_common_vbos(struct fd_context *ctx);
 
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index a3c35cb..f2ccfc5 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -37,6 +37,7 @@
 #include "freedreno_context.h"
 #include "freedreno_state.h"
 #include "freedreno_resource.h"
+#include "freedreno_query_acc.h"
 #include "freedreno_query_hw.h"
 #include "freedreno_util.h"
 
@@ -65,6 +66,11 @@
 	struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
 	unsigned i, prims, buffers = 0;
 
+	if (!info->count_from_stream_output && !info->indirect &&
+	    !info->primitive_restart &&
+	    !u_trim_pipe_prim(info->mode, (unsigned*)&info->count))
+		return;
+
 	/* if we supported transform feedback, we'd have to disable this: */
 	if (((scissor->maxx - scissor->minx) *
 			(scissor->maxy - scissor->miny)) == 0) {
@@ -79,23 +85,31 @@
 	if (!fd_supported_prim(ctx, info->mode)) {
 		if (ctx->streamout.num_targets > 0)
 			debug_error("stream-out with emulated prims");
-		util_primconvert_save_index_buffer(ctx->primconvert, &ctx->indexbuf);
 		util_primconvert_save_rasterizer_state(ctx->primconvert, ctx->rasterizer);
 		util_primconvert_draw_vbo(ctx->primconvert, info);
 		return;
 	}
 
 	/* Upload a user index buffer. */
-	struct pipe_index_buffer ibuffer_saved = {};
-	if (info->indexed && ctx->indexbuf.user_buffer &&
-	    !util_save_and_upload_index_buffer(pctx, info, &ctx->indexbuf,
-					       &ibuffer_saved)) {
-		return;
+	struct pipe_resource *indexbuf = NULL;
+	unsigned index_offset = 0;
+	struct pipe_draw_info new_info;
+	if (info->index_size) {
+		if (info->has_user_indices) {
+			if (!util_upload_index_buffer(pctx, info, &indexbuf, &index_offset))
+				return;
+			new_info = *info;
+			new_info.index.resource = indexbuf;
+			new_info.has_user_indices = false;
+			info = &new_info;
+		} else {
+			indexbuf = info->index.resource;
+		}
 	}
 
 	if (ctx->in_blit) {
 		fd_batch_reset(batch);
-		ctx->dirty = ~0;
+		fd_context_all_dirty(ctx);
 	}
 
 	batch->blit = ctx->in_blit;
@@ -104,7 +118,7 @@
 	/* NOTE: needs to be before resource_written(batch->query_buf), otherwise
 	 * query_buf may not be created yet.
 	 */
-	fd_hw_query_set_stage(batch, batch->draw, FD_STAGE_DRAW);
+	fd_batch_set_stage(batch, FD_STAGE_DRAW);
 
 	/*
 	 * Figure out the buffers/features we need:
@@ -145,6 +159,12 @@
 			batch->gmem_reason |= FD_GMEM_BLEND_ENABLED;
 	}
 
+	/* Mark SSBOs as being written.. we don't actually know which ones are
+	 * read vs written, so just assume the worst
+	 */
+	foreach_bit(i, ctx->shaderbuf[PIPE_SHADER_FRAGMENT].enabled_mask)
+		resource_written(batch, ctx->shaderbuf[PIPE_SHADER_FRAGMENT].sb[i].buffer);
+
 	foreach_bit(i, ctx->constbuf[PIPE_SHADER_VERTEX].enabled_mask)
 		resource_read(batch, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer);
 	foreach_bit(i, ctx->constbuf[PIPE_SHADER_FRAGMENT].enabled_mask)
@@ -152,18 +172,18 @@
 
 	/* Mark VBOs as being read */
 	foreach_bit(i, ctx->vtx.vertexbuf.enabled_mask) {
-		assert(!ctx->vtx.vertexbuf.vb[i].user_buffer);
-		resource_read(batch, ctx->vtx.vertexbuf.vb[i].buffer);
+		assert(!ctx->vtx.vertexbuf.vb[i].is_user_buffer);
+		resource_read(batch, ctx->vtx.vertexbuf.vb[i].buffer.resource);
 	}
 
 	/* Mark index buffer as being read */
-	resource_read(batch, ctx->indexbuf.buffer);
+	resource_read(batch, indexbuf);
 
 	/* Mark textures as being read */
-	foreach_bit(i, ctx->verttex.valid_textures)
-		resource_read(batch, ctx->verttex.textures[i]->texture);
-	foreach_bit(i, ctx->fragtex.valid_textures)
-		resource_read(batch, ctx->fragtex.textures[i]->texture);
+	foreach_bit(i, ctx->tex[PIPE_SHADER_VERTEX].valid_textures)
+		resource_read(batch, ctx->tex[PIPE_SHADER_VERTEX].textures[i]->texture);
+	foreach_bit(i, ctx->tex[PIPE_SHADER_FRAGMENT].valid_textures)
+		resource_read(batch, ctx->tex[PIPE_SHADER_FRAGMENT].textures[i]->texture);
 
 	/* Mark streamout buffers as being written.. */
 	for (i = 0; i < ctx->streamout.num_targets; i++)
@@ -172,6 +192,9 @@
 
 	resource_written(batch, batch->query_buf);
 
+	list_for_each_entry(struct fd_acc_query, aq, &ctx->acc_active_queries, node)
+		resource_written(batch, aq->prsc);
+
 	mtx_unlock(&ctx->screen->lock);
 
 	batch->num_draws++;
@@ -200,19 +223,19 @@
 		util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
 		util_format_short_name(pipe_surface_format(pfb->zsbuf)));
 
-	if (ctx->draw_vbo(ctx, info))
+	if (ctx->draw_vbo(ctx, info, index_offset))
 		batch->needs_flush = true;
 
 	for (i = 0; i < ctx->streamout.num_targets; i++)
 		ctx->streamout.offsets[i] += info->count;
 
 	if (fd_mesa_debug & FD_DBG_DDRAW)
-		ctx->dirty = 0xffffffff;
+		fd_context_all_dirty(ctx);
 
 	fd_batch_check_size(batch);
 
-	if (info->indexed && ibuffer_saved.user_buffer)
-           pctx->set_index_buffer(pctx, &ibuffer_saved);
+	if (info == &new_info)
+		pipe_resource_reference(&indexbuf, NULL);
 }
 
 /* Generic clear implementation (partially) using u_blitter: */
@@ -271,7 +294,7 @@
 		.max_index = 1,
 		.instance_count = 1,
 	};
-	ctx->draw_vbo(ctx, &info);
+	ctx->draw_vbo(ctx, &info, 0);
 
 	util_blitter_restore_constant_buffer_state(blitter);
 	util_blitter_restore_vertex_states(blitter);
@@ -307,7 +330,7 @@
 
 	if (ctx->in_blit) {
 		fd_batch_reset(batch);
-		ctx->dirty = ~0;
+		fd_context_all_dirty(ctx);
 	}
 
 	/* for bookkeeping about which buffers have been cleared (and thus
@@ -348,6 +371,9 @@
 
 	resource_written(batch, batch->query_buf);
 
+	list_for_each_entry(struct fd_acc_query, aq, &ctx->acc_active_queries, node)
+		resource_written(batch, aq->prsc);
+
 	mtx_unlock(&ctx->screen->lock);
 
 	DBG("%p: %x %ux%u depth=%f, stencil=%u (%s/%s)", batch, buffers,
@@ -358,26 +384,22 @@
 	/* if per-gen backend doesn't implement ctx->clear() generic
 	 * blitter clear:
 	 */
-	if (!ctx->clear) {
-		fd_blitter_clear(pctx, buffers, color, depth, stencil);
-		return;
+	bool fallback = true;
+
+	if (ctx->clear) {
+		fd_batch_set_stage(batch, FD_STAGE_CLEAR);
+
+		if (ctx->clear(ctx, buffers, color, depth, stencil)) {
+			if (fd_mesa_debug & FD_DBG_DCLEAR)
+				fd_context_all_dirty(ctx);
+
+			fallback = false;
+		}
 	}
 
-	fd_hw_query_set_stage(batch, batch->draw, FD_STAGE_CLEAR);
-
-	ctx->clear(ctx, buffers, color, depth, stencil);
-
-	ctx->dirty |= FD_DIRTY_ZSA |
-			FD_DIRTY_VIEWPORT |
-			FD_DIRTY_RASTERIZER |
-			FD_DIRTY_SAMPLE_MASK |
-			FD_DIRTY_PROG |
-			FD_DIRTY_CONSTBUF |
-			FD_DIRTY_BLEND |
-			FD_DIRTY_FRAMEBUFFER;
-
-	if (fd_mesa_debug & FD_DBG_DCLEAR)
-		ctx->dirty = 0xffffffff;
+	if (fallback) {
+		fd_blitter_clear(pctx, buffers, color, depth, stencil);
+	}
 }
 
 static void
@@ -399,6 +421,43 @@
 			buffers, depth, stencil, x, y, w, h);
 }
 
+static void
+fd_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	struct fd_batch *batch, *save_batch = NULL;
+	unsigned i;
+
+	batch = fd_batch_create(ctx);                  /* compute gets a batch of its own */
+	fd_batch_reference(&save_batch, ctx->batch);   /* remember the current batch.. */
+	fd_batch_reference(&ctx->batch, batch);        /* ..and make the new one current */
+
+	mtx_lock(&ctx->screen->lock);
+
+	/* Mark SSBOs as being written.. we don't actually know which ones are
+	 * read vs written, so just assume the worst
+	 */
+	foreach_bit(i, ctx->shaderbuf[PIPE_SHADER_COMPUTE].enabled_mask)
+		resource_written(batch, ctx->shaderbuf[PIPE_SHADER_COMPUTE].sb[i].buffer);
+
+	/* UBO's are read */
+	foreach_bit(i, ctx->constbuf[PIPE_SHADER_COMPUTE].enabled_mask)
+		resource_read(batch, ctx->constbuf[PIPE_SHADER_COMPUTE].cb[i].buffer);
+
+	/* Mark textures as being read */
+	foreach_bit(i, ctx->tex[PIPE_SHADER_COMPUTE].valid_textures)
+		resource_read(batch, ctx->tex[PIPE_SHADER_COMPUTE].textures[i]->texture);
+
+	mtx_unlock(&ctx->screen->lock);
+
+	ctx->launch_grid(ctx, info);   /* per-gen backend emits the actual grid launch */
+
+	fd_gmem_flush_compute(batch);
+	/* NOTE(review): if fd_batch_create() returns an owned reference, it is never dropped here -- confirm */
+	fd_batch_reference(&ctx->batch, save_batch);   /* restore the previous batch */
+	fd_batch_reference(&save_batch, NULL);
+}
+
 void
 fd_draw_init(struct pipe_context *pctx)
 {
@@ -406,4 +465,8 @@
 	pctx->clear = fd_clear;
 	pctx->clear_render_target = fd_clear_render_target;
 	pctx->clear_depth_stencil = fd_clear_depth_stencil;
+
+	if (has_compute(fd_screen(pctx->screen))) {
+		pctx->launch_grid = fd_launch_grid;
+	}
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.h b/src/gallium/drivers/freedreno/freedreno_draw.h
index 18a5037..b293f73 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.h
+++ b/src/gallium/drivers/freedreno/freedreno_draw.h
@@ -115,22 +115,21 @@
 fd_draw_emit(struct fd_batch *batch, struct fd_ringbuffer *ring,
 		enum pc_di_primtype primtype,
 		enum pc_di_vis_cull_mode vismode,
-		const struct pipe_draw_info *info)
+		const struct pipe_draw_info *info,
+		unsigned index_offset)
 {
 	struct pipe_resource *idx_buffer = NULL;
 	enum pc_di_index_size idx_type = INDEX_SIZE_IGN;
 	enum pc_di_src_sel src_sel;
 	uint32_t idx_size, idx_offset;
 
-	if (info->indexed) {
-		struct pipe_index_buffer *idx = &batch->ctx->indexbuf;
+	if (info->index_size) {
+		assert(!info->has_user_indices);
 
-		assert(!idx->user_buffer);
-
-		idx_buffer = idx->buffer;
-		idx_type = size2indextype(idx->index_size);
-		idx_size = idx->index_size * info->count;
-		idx_offset = idx->offset + (info->start * idx->index_size);
+		idx_buffer = info->index.resource;
+		idx_type = size2indextype(info->index_size);
+		idx_size = info->index_size * info->count;
+		idx_offset = index_offset + info->start * info->index_size;
 		src_sel = DI_SRC_SEL_DMA;
 	} else {
 		idx_buffer = NULL;
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index dc86192..0340071 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -112,6 +112,7 @@
 	struct pipe_framebuffer_state *pfb = &batch->framebuffer;
 	const uint32_t gmem_alignw = ctx->screen->gmem_alignw;
 	const uint32_t gmem_alignh = ctx->screen->gmem_alignh;
+	const unsigned npipes = ctx->screen->num_vsc_pipes;
 	const uint32_t gmem_size = ctx->screen->gmemsize_bytes;
 	uint32_t minx, miny, width, height;
 	uint32_t nbins_x = 1, nbins_y = 1;
@@ -121,7 +122,7 @@
 	uint32_t i, j, t, xoff, yoff;
 	uint32_t tpp_x, tpp_y;
 	bool has_zs = !!(batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
-	int tile_n[ARRAY_SIZE(ctx->pipe)];
+	int tile_n[npipes];
 
 	if (has_zs) {
 		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
@@ -224,7 +225,7 @@
 
 	/* configure pipes: */
 	xoff = yoff = 0;
-	for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
+	for (i = 0; i < npipes; i++) {
 		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
 
 		if (xoff >= nbins_x) {
@@ -244,7 +245,7 @@
 		xoff += tpp_x;
 	}
 
-	for (; i < ARRAY_SIZE(ctx->pipe); i++) {
+	for (; i < npipes; i++) {
 		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
 		pipe->x = pipe->y = pipe->w = pipe->h = 0;
 	}
@@ -335,7 +336,8 @@
 
 		ctx->emit_tile_renderprep(batch, tile);
 
-		fd_hw_query_prepare_tile(batch, i, batch->gmem);
+		if (ctx->query_prepare_tile)
+			ctx->query_prepare_tile(batch, i, batch->gmem);
 
 		/* emit IB to drawcmds: */
 		ctx->emit_ib(batch->gmem, batch->draw);
@@ -356,7 +358,8 @@
 
 	ctx->emit_sysmem_prep(batch);
 
-	fd_hw_query_prepare_tile(batch, 0, batch->gmem);
+	if (ctx->query_prepare_tile)
+		ctx->query_prepare_tile(batch, 0, batch->gmem);
 
 	/* emit IB to drawcmds: */
 	ctx->emit_ib(batch->gmem, batch->draw);
@@ -405,7 +408,8 @@
 			batch, pfb->width, pfb->height,
 			util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
 			util_format_short_name(pipe_surface_format(pfb->zsbuf)));
-		fd_hw_query_prepare(batch, 1);
+		if (ctx->query_prepare)
+			ctx->query_prepare(batch, 1);
 		render_sysmem(batch);
 		ctx->stats.batch_sysmem++;
 	} else {
@@ -415,7 +419,8 @@
 			batch, pfb->width, pfb->height, gmem->nbins_x, gmem->nbins_y,
 			util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
 			util_format_short_name(pipe_surface_format(pfb->zsbuf)));
-		fd_hw_query_prepare(batch, gmem->nbins_x * gmem->nbins_y);
+		if (ctx->query_prepare)
+			ctx->query_prepare(batch, gmem->nbins_x * gmem->nbins_y);
 		render_tiles(batch);
 		ctx->stats.batch_gmem++;
 	}
@@ -438,6 +443,13 @@
 	flush_ring(batch);
 }
 
+void
+fd_gmem_flush_compute(struct fd_batch *batch)
+{
+	render_sysmem(batch);   /* compute batches are emitted via the sysmem path, never tiled */
+	flush_ring(batch);      /* NOTE(review): presumably submits the ring, as in the other flush paths -- confirm */
+}
+
 /* tile needs restore if it isn't completely contained within the
  * cleared scissor:
  */
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h
index 07e13f5..f5276ce 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.h
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.h
@@ -64,6 +64,7 @@
 
 void fd_gmem_render_tiles(struct fd_batch *batch);
 void fd_gmem_render_noop(struct fd_batch *batch);
+void fd_gmem_flush_compute(struct fd_batch *batch);
 
 bool fd_gmem_needs_restore(struct fd_batch *batch, struct fd_tile *tile,
 		uint32_t buffers);
diff --git a/src/gallium/drivers/freedreno/freedreno_program.c b/src/gallium/drivers/freedreno/freedreno_program.c
index db6b258..0bb5d68 100644
--- a/src/gallium/drivers/freedreno/freedreno_program.c
+++ b/src/gallium/drivers/freedreno/freedreno_program.c
@@ -37,7 +37,8 @@
 {
 	struct fd_context *ctx = fd_context(pctx);
 	ctx->prog.fp = hwcso;
-	ctx->dirty |= FD_SHADER_DIRTY_FP;
+	ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG;
+	ctx->dirty |= FD_DIRTY_PROG;
 }
 
 static void
@@ -45,7 +46,8 @@
 {
 	struct fd_context *ctx = fd_context(pctx);
 	ctx->prog.vp = hwcso;
-	ctx->dirty |= FD_SHADER_DIRTY_VP;
+	ctx->dirty_shader[PIPE_SHADER_VERTEX] |= FD_DIRTY_SHADER_PROG;
+	ctx->dirty |= FD_DIRTY_PROG;
 }
 
 static const char *solid_fp =
diff --git a/src/gallium/drivers/freedreno/freedreno_query.c b/src/gallium/drivers/freedreno/freedreno_query.c
index 1e72c6d..0d7bc9f 100644
--- a/src/gallium/drivers/freedreno/freedreno_query.c
+++ b/src/gallium/drivers/freedreno/freedreno_query.c
@@ -46,8 +46,8 @@
 	struct fd_query *q;
 
 	q = fd_sw_create_query(ctx, query_type);
-	if (!q)
-		q = fd_hw_create_query(ctx, query_type);
+	if (!q && ctx->create_query)
+		q = ctx->create_query(ctx, query_type);
 
 	return (struct pipe_query *) q;
 }
@@ -63,14 +63,34 @@
 fd_begin_query(struct pipe_context *pctx, struct pipe_query *pq)
 {
 	struct fd_query *q = fd_query(pq);
-	return q->funcs->begin_query(fd_context(pctx), q);
+	boolean ret;
+
+	if (q->active)
+		return false;
+
+	ret = q->funcs->begin_query(fd_context(pctx), q);
+	q->active = ret;
+
+	return ret;
 }
 
 static bool
 fd_end_query(struct pipe_context *pctx, struct pipe_query *pq)
 {
 	struct fd_query *q = fd_query(pq);
+
+	/* there are a couple special cases, which don't have
+	 * a matching ->begin_query():
+	 */
+	if (skip_begin_query(q->type) && !q->active)
+		fd_begin_query(pctx, pq);
+
+	if (!q->active)
+		return false;
+
 	q->funcs->end_query(fd_context(pctx), q);
+	q->active = false;
+
 	return true;
 }
 
@@ -79,6 +99,12 @@
 		boolean wait, union pipe_query_result *result)
 {
 	struct fd_query *q = fd_query(pq);
+
+	if (q->active)
+		return false;
+
+	util_query_clear_result(result, q->type);
+
 	return q->funcs->get_query_result(fd_context(pctx), q, wait, result);
 }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_query.h b/src/gallium/drivers/freedreno/freedreno_query.h
index 1e4f45f..49a8680 100644
--- a/src/gallium/drivers/freedreno/freedreno_query.h
+++ b/src/gallium/drivers/freedreno/freedreno_query.h
@@ -77,4 +77,25 @@
 	}
 }
 
+/* Translate a PIPE_QUERY_x type into its slot in the sample-provider
+ * tables, or -1 if the query type has no sample provider slot:
+ */
+static inline
+int pidx(unsigned query_type)
+{
+	if (query_type == PIPE_QUERY_OCCLUSION_COUNTER)
+		return 0;
+	if (query_type == PIPE_QUERY_OCCLUSION_PREDICATE)
+		return 1;
+	/* TODO currently queries only emitted in main pass (not in binning
+	 * pass).. which is fine for occlusion query, but pretty much not
+	 * anything else.
+	 */
+	if (query_type == PIPE_QUERY_TIME_ELAPSED)
+		return 2;
+	if (query_type == PIPE_QUERY_TIMESTAMP)
+		return 3;
+	return -1;
+}
+
 #endif /* FREEDRENO_QUERY_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.c b/src/gallium/drivers/freedreno/freedreno_query_acc.c
new file mode 100644
index 0000000..96cee1a
--- /dev/null
+++ b/src/gallium/drivers/freedreno/freedreno_query_acc.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+
+#include "freedreno_query_acc.h"
+#include "freedreno_context.h"
+#include "freedreno_resource.h"
+#include "freedreno_util.h"
+
+
+static bool
+is_active(struct fd_acc_query *aq, enum fd_render_stage stage)
+{
+	return (aq->provider->active & stage) != 0;   /* is 'stage' one of the provider's stages? */
+}
+
+static void
+fd_acc_destroy_query(struct fd_context *ctx, struct fd_query *q)
+{
+	struct fd_acc_query *acc = fd_acc_query(q);
+
+	DBG("%p: active=%d", q, q->active);
+
+	/* unlink the query's list node, then drop its result buffer: */
+	list_del(&acc->node);
+	pipe_resource_reference(&acc->prsc, NULL);
+	free(acc);
+}
+
+static void
+realloc_query_bo(struct fd_context *ctx, struct fd_acc_query *aq)
+{
+	struct fd_resource *rsc;
+	void *map;
+
+	pipe_resource_reference(&aq->prsc, NULL);   /* drop the previous result buffer, if any */
+
+	aq->prsc = pipe_buffer_create(&ctx->screen->base,
+			PIPE_BIND_QUERY_BUFFER, 0, 0x1000);
+
+	/* don't assume the buffer is zero-initialized: */
+	rsc = fd_resource(aq->prsc);   /* NOTE(review): allocation failure is not checked here */
+
+	fd_bo_cpu_prep(rsc->bo, ctx->screen->pipe, DRM_FREEDRENO_PREP_WRITE);
+
+	map = fd_bo_map(rsc->bo);
+	memset(map, 0, aq->provider->size);   /* clears provider->size bytes of the 0x1000 byte buffer */
+	fd_bo_cpu_fini(rsc->bo);
+}
+
+static boolean
+fd_acc_begin_query(struct fd_context *ctx, struct fd_query *q)
+{
+	struct fd_acc_query *acc = fd_acc_query(q);
+	struct fd_batch *cur = ctx->batch;
+
+	DBG("%p: active=%d", q, q->active);
+
+	/* begin_query() discards any previous result, so start over
+	 * with a freshly allocated and zeroed result buffer: */
+	realloc_query_bo(ctx, acc);
+
+	/* resume right away if the current batch is in one of the query's stages: */
+	if (cur && is_active(acc, cur->stage))
+		acc->provider->resume(acc, cur);
+
+	/* the query must not be linked yet; put it on the ctx's active list: */
+	assert(list_empty(&acc->node));
+	list_addtail(&acc->node, &ctx->acc_active_queries);
+
+	return true;
+}
+
+static void
+fd_acc_end_query(struct fd_context *ctx, struct fd_query *q)
+{
+	struct fd_acc_query *acc = fd_acc_query(q);
+	struct fd_batch *cur = ctx->batch;
+
+	DBG("%p: active=%d", q, q->active);
+
+	/* pause the query if the current batch is in one of its stages: */
+	if (cur && is_active(acc, cur->stage))
+		acc->provider->pause(acc, cur);
+
+	/* unlink from the active list, leaving the node re-initialized: */
+	list_delinit(&acc->node);
+}
+
+static boolean
+fd_acc_get_query_result(struct fd_context *ctx, struct fd_query *q,
+		boolean wait, union pipe_query_result *result)
+{
+	struct fd_acc_query *aq = fd_acc_query(q);
+	const struct fd_acc_sample_provider *p = aq->provider;
+	struct fd_resource *rsc = fd_resource(aq->prsc);   /* NOTE(review): prsc is only allocated in begin_query -- confirm it cannot be NULL here */
+
+	DBG("%p: wait=%d, active=%d", q, wait, q->active);
+
+	assert(LIST_IS_EMPTY(&aq->node));   /* query must already be off the active list */
+
+	/* if !wait, then bail (returning false) when the result buffer is
+	 * still pending or cannot be prepared for reading without blocking:
+	 */
+	if (!wait) {
+		int ret;
+
+		if (pending(rsc, false)) {
+			/* piglit spec@arb_occlusion_query@occlusion_query_conform
+			 * test, and silly apps perhaps, get stuck in a loop trying
+			 * to get query result forever with wait==false..  we don't
+			 * want to flush unnecessarily but we also don't want to
+			 * spin forever:
+			 */
+			if (aq->no_wait_cnt++ > 5)
+				fd_batch_flush(rsc->write_batch, false);
+			return false;
+		}
+
+		ret = fd_bo_cpu_prep(rsc->bo, ctx->screen->pipe,
+				DRM_FREEDRENO_PREP_READ | DRM_FREEDRENO_PREP_NOSYNC);
+		if (ret)
+			return false;
+
+		fd_bo_cpu_fini(rsc->bo);
+	}
+
+	if (rsc->write_batch)
+		fd_batch_flush(rsc->write_batch, true);   /* flush the batch still writing the buffer */
+
+	/* get the result (this prep has no NOSYNC flag, unlike the !wait probe above): */
+	fd_bo_cpu_prep(rsc->bo, ctx->screen->pipe, DRM_FREEDRENO_PREP_READ);
+
+	void *ptr = fd_bo_map(rsc->bo);
+	p->result(ctx, ptr, result);   /* provider converts the raw buffer contents into *result */
+	fd_bo_cpu_fini(rsc->bo);
+
+	return true;
+}
+
+static const struct fd_query_funcs acc_query_funcs = {   /* installed on queries made by fd_acc_create_query() */
+		.destroy_query    = fd_acc_destroy_query,
+		.begin_query      = fd_acc_begin_query,
+		.end_query        = fd_acc_end_query,
+		.get_query_result = fd_acc_get_query_result,
+};
+
+struct fd_query *
+fd_acc_create_query(struct fd_context *ctx, unsigned query_type)
+{
+	const int slot = pidx(query_type);
+	struct fd_acc_query *acc;
+
+	/* only query types with a registered accumulating provider are handled: */
+	if ((slot < 0) || !ctx->acc_sample_providers[slot])
+		return NULL;
+
+	acc = CALLOC_STRUCT(fd_acc_query);
+	if (!acc)
+		return NULL;
+
+	DBG("%p: query_type=%u", acc, query_type);
+
+	acc->provider = ctx->acc_sample_providers[slot];
+
+	/* an empty (self-linked) node means "not on the active list": */
+	list_inithead(&acc->node);
+
+	acc->base.funcs = &acc_query_funcs;
+	acc->base.type = query_type;
+
+	return &acc->base;
+}
+
+void
+fd_acc_query_set_stage(struct fd_batch *batch, enum fd_render_stage stage)
+{
+	struct fd_acc_query *acc;
+	/* nothing to pause/resume unless the stage actually changes: */
+	if (stage == batch->stage)
+		return;
+
+	LIST_FOR_EACH_ENTRY(acc, &batch->ctx->acc_active_queries, node) {
+		const bool before = is_active(acc, batch->stage);
+		const bool after = is_active(acc, stage);
+
+		if (after && !before)
+			acc->provider->resume(acc, batch);
+		else if (before && !after)
+			acc->provider->pause(acc, batch);
+	}
+}
+
+void
+fd_acc_query_register_provider(struct pipe_context *pctx,
+		const struct fd_acc_sample_provider *provider)
+{
+	const int slot = pidx(provider->query_type);
+	struct fd_context *ctx = fd_context(pctx);
+
+	/* the type must map to a table slot, and the slot must still be free: */
+	assert((slot >= 0) && (slot < MAX_HW_SAMPLE_PROVIDERS));
+	assert(ctx->acc_sample_providers[slot] == NULL);
+	ctx->acc_sample_providers[slot] = provider;
+}
diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.h b/src/gallium/drivers/freedreno/freedreno_query_acc.h
new file mode 100644
index 0000000..f8dfabc
--- /dev/null
+++ b/src/gallium/drivers/freedreno/freedreno_query_acc.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef FREEDRENO_QUERY_ACC_H_
+#define FREEDRENO_QUERY_ACC_H_
+
+#include "util/list.h"
+
+#include "freedreno_query.h"
+#include "freedreno_context.h"
+
+
+/*
+ * Accumulated HW Queries:
+ *
+ * Unlike the original HW Queries in earlier adreno generations (see
+ * freedreno_query_hw.[ch]), later generations can accumulate the per-
+ * tile results of some (a4xx) or all (a5xx+?) queries in the cmdstream.
+ * But we still need to handle pausing/resuming the query across stage
+ * changes (in particular when switching between batches).
+ *
+ * fd_acc_sample_provider:
+ *   - one per accumulated query type, registered/implemented by gpu
+ *     generation specific code
+ *   - knows how to emit cmdstream to pause/resume a query instance
+ *
+ * fd_acc_query:
+ *   - one instance per query object
+ *   - each query object has its own result buffer, which may
+ *     span multiple batches, etc.
+ */
+
+
+struct fd_acc_query;
+
+struct fd_acc_sample_provider {
+	unsigned query_type;     /* PIPE_QUERY_x type; selects the provider table slot via pidx() */
+
+	/* stages applicable to the query type: */
+	enum fd_render_stage active;
+
+	unsigned size;           /* number of bytes of the result buffer zeroed at begin_query */
+	/* emit cmdstream to resume/pause a query instance in the given batch: */
+	void (*resume)(struct fd_acc_query *aq, struct fd_batch *batch);
+	void (*pause)(struct fd_acc_query *aq, struct fd_batch *batch);
+	/* convert the mapped result buffer (buf) into the gallium query result: */
+	void (*result)(struct fd_context *ctx, void *buf,
+			union pipe_query_result *result);
+};
+
+struct fd_acc_query {
+	struct fd_query base;    /* must stay the first member: fd_acc_query() casts from it */
+
+	const struct fd_acc_sample_provider *provider;
+
+	struct pipe_resource *prsc;   /* result buffer, reallocated on each begin_query */
+	unsigned offset;         /* NOTE(review): not referenced by freedreno_query_acc.c -- confirm use */
+
+	struct list_head node;   /* list-node in ctx->acc_active_queries */
+
+	int no_wait_cnt;         /* see fd_acc_get_query_result() */
+};
+
+static inline struct fd_acc_query *
+fd_acc_query(struct fd_query *q)
+{
+	return (struct fd_acc_query *)q;   /* downcast; relies on 'base' being the first member */
+}
+
+struct fd_query * fd_acc_create_query(struct fd_context *ctx, unsigned query_type);
+void fd_acc_query_set_stage(struct fd_batch *batch, enum fd_render_stage stage);
+void fd_acc_query_register_provider(struct pipe_context *pctx,
+		const struct fd_acc_sample_provider *provider);
+
+#endif /* FREEDRENO_QUERY_ACC_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.c b/src/gallium/drivers/freedreno/freedreno_query_hw.c
index 470826a..73c3691 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_hw.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_hw.c
@@ -40,26 +40,6 @@
 	struct list_head list;
 };
 
-/* maps query_type to sample provider idx: */
-static int pidx(unsigned query_type)
-{
-	switch (query_type) {
-	case PIPE_QUERY_OCCLUSION_COUNTER:
-		return 0;
-	case PIPE_QUERY_OCCLUSION_PREDICATE:
-		return 1;
-	/* TODO currently queries only emitted in main pass (not in binning pass)..
-	 * which is fine for occlusion query, but pretty much not anything else.
-	 */
-	case PIPE_QUERY_TIME_ELAPSED:
-		return 2;
-	case PIPE_QUERY_TIMESTAMP:
-		return 3;
-	default:
-		return -1;
-	}
-}
-
 static struct fd_hw_sample *
 get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring,
 		unsigned query_type)
@@ -72,7 +52,7 @@
 
 	if (!batch->sample_cache[idx]) {
 		struct fd_hw_sample *new_samp =
-			ctx->sample_providers[idx]->get_sample(batch, ring);
+			ctx->hw_sample_providers[idx]->get_sample(batch, ring);
 		fd_hw_sample_reference(ctx, &batch->sample_cache[idx], new_samp);
 		util_dynarray_append(&batch->samples, struct fd_hw_sample *, new_samp);
 		batch->needs_flush = true;
@@ -162,20 +142,15 @@
 
 	DBG("%p: active=%d", q, q->active);
 
-	if (q->active)
-		return false;
-
 	/* begin_query() should clear previous results: */
 	destroy_periods(ctx, hq);
 
 	if (batch && is_active(hq, batch->stage))
 		resume_query(batch, hq, batch->draw);
 
-	q->active = true;
-
 	/* add to active list: */
 	assert(list_empty(&hq->list));
-	list_addtail(&hq->list, &ctx->active_queries);
+	list_addtail(&hq->list, &ctx->hw_active_queries);
 
 	return true;
 }
@@ -186,22 +161,11 @@
 	struct fd_batch *batch = ctx->batch;
 	struct fd_hw_query *hq = fd_hw_query(q);
 
-	/* there are a couple special cases, which don't have
-	 * a matching ->begin_query():
-	 */
-	if (skip_begin_query(q->type) && !q->active) {
-		fd_hw_begin_query(ctx, q);
-	}
-
 	DBG("%p: active=%d", q, q->active);
 
-	if (!q->active)
-		return;
-
 	if (batch && is_active(hq, batch->stage))
 		pause_query(batch, hq, batch->draw);
 
-	q->active = false;
 	/* remove from active list: */
 	list_delinit(&hq->list);
 }
@@ -222,11 +186,6 @@
 
 	DBG("%p: wait=%d, active=%d", q, wait, q->active);
 
-	if (q->active)
-		return false;
-
-	util_query_clear_result(result, q->type);
-
 	if (LIST_IS_EMPTY(&hq->periods))
 		return true;
 
@@ -315,7 +274,7 @@
 	struct fd_query *q;
 	int idx = pidx(query_type);
 
-	if ((idx < 0) || !ctx->sample_providers[idx])
+	if ((idx < 0) || !ctx->hw_sample_providers[idx])
 		return NULL;
 
 	hq = CALLOC_STRUCT(fd_hw_query);
@@ -324,7 +283,7 @@
 
 	DBG("%p: query_type=%u", hq, query_type);
 
-	hq->provider = ctx->sample_providers[idx];
+	hq->provider = ctx->hw_sample_providers[idx];
 
 	list_inithead(&hq->periods);
 	list_inithead(&hq->list);
@@ -421,33 +380,21 @@
 }
 
 void
-fd_hw_query_set_stage(struct fd_batch *batch, struct fd_ringbuffer *ring,
-		enum fd_render_stage stage)
+fd_hw_query_set_stage(struct fd_batch *batch, enum fd_render_stage stage)
 {
-	/* special case: internal blits (like mipmap level generation)
-	 * go through normal draw path (via util_blitter_blit()).. but
-	 * we need to ignore the FD_STAGE_DRAW which will be set, so we
-	 * don't enable queries which should be paused during internal
-	 * blits:
-	 */
-	if ((batch->stage == FD_STAGE_BLIT) &&
-			(stage != FD_STAGE_NULL))
-		return;
-
 	if (stage != batch->stage) {
 		struct fd_hw_query *hq;
-		LIST_FOR_EACH_ENTRY(hq, &batch->ctx->active_queries, list) {
+		LIST_FOR_EACH_ENTRY(hq, &batch->ctx->hw_active_queries, list) {
 			bool was_active = is_active(hq, batch->stage);
 			bool now_active = is_active(hq, stage);
 
 			if (now_active && !was_active)
-				resume_query(batch, hq, ring);
+				resume_query(batch, hq, batch->draw);
 			else if (was_active && !now_active)
-				pause_query(batch, hq, ring);
+				pause_query(batch, hq, batch->draw);
 		}
 	}
 	clear_sample_cache(batch);
-	batch->stage = stage;
 }
 
 /* call the provider->enable() for all the hw queries that were active
@@ -460,9 +407,9 @@
 	struct fd_context *ctx = batch->ctx;
 	for (int idx = 0; idx < MAX_HW_SAMPLE_PROVIDERS; idx++) {
 		if (batch->active_providers & (1 << idx)) {
-			assert(ctx->sample_providers[idx]);
-			if (ctx->sample_providers[idx]->enable)
-				ctx->sample_providers[idx]->enable(ctx, ring);
+			assert(ctx->hw_sample_providers[idx]);
+			if (ctx->hw_sample_providers[idx]->enable)
+				ctx->hw_sample_providers[idx]->enable(ctx, ring);
 		}
 	}
 	batch->active_providers = 0;  /* clear it for next frame */
@@ -476,9 +423,9 @@
 	int idx = pidx(provider->query_type);
 
 	assert((0 <= idx) && (idx < MAX_HW_SAMPLE_PROVIDERS));
-	assert(!ctx->sample_providers[idx]);
+	assert(!ctx->hw_sample_providers[idx]);
 
-	ctx->sample_providers[idx] = provider;
+	ctx->hw_sample_providers[idx] = provider;
 }
 
 void
@@ -490,7 +437,6 @@
 			16);
 	slab_create(&ctx->sample_period_pool, sizeof(struct fd_hw_sample_period),
 			16);
-	list_inithead(&ctx->active_queries);
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.h b/src/gallium/drivers/freedreno/freedreno_query_hw.h
index abd8668..f283985 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_hw.h
+++ b/src/gallium/drivers/freedreno/freedreno_query_hw.h
@@ -146,8 +146,7 @@
 void fd_hw_query_prepare(struct fd_batch *batch, uint32_t num_tiles);
 void fd_hw_query_prepare_tile(struct fd_batch *batch, uint32_t n,
 		struct fd_ringbuffer *ring);
-void fd_hw_query_set_stage(struct fd_batch *batch,
-		struct fd_ringbuffer *ring, enum fd_render_stage stage);
+void fd_hw_query_set_stage(struct fd_batch *batch, enum fd_render_stage stage);
 void fd_hw_query_enable(struct fd_batch *batch, struct fd_ringbuffer *ring);
 void fd_hw_query_register_provider(struct pipe_context *pctx,
 		const struct fd_hw_sample_provider *provider);
diff --git a/src/gallium/drivers/freedreno/freedreno_query_sw.c b/src/gallium/drivers/freedreno/freedreno_query_sw.c
index 4af6a12..dfa8987 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_sw.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_sw.c
@@ -89,7 +89,6 @@
 fd_sw_begin_query(struct fd_context *ctx, struct fd_query *q)
 {
 	struct fd_sw_query *sq = fd_sw_query(q);
-	q->active = true;
 	sq->begin_value = read_counter(ctx, q->type);
 	if (is_rate_query(q))
 		sq->begin_time = os_time_get();
@@ -100,7 +99,6 @@
 fd_sw_end_query(struct fd_context *ctx, struct fd_query *q)
 {
 	struct fd_sw_query *sq = fd_sw_query(q);
-	q->active = false;
 	sq->end_value = read_counter(ctx, q->type);
 	if (is_rate_query(q))
 		sq->end_time = os_time_get();
@@ -112,11 +110,6 @@
 {
 	struct fd_sw_query *sq = fd_sw_query(q);
 
-	if (q->active)
-		return false;
-
-	util_query_clear_result(result, q->type);
-
 	result->u64 = sq->end_value - sq->begin_value;
 
 	if (is_rate_query(q)) {
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index a24f3f3..5aa90ce 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -51,39 +51,46 @@
 static void
 fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc)
 {
-	int i;
-
 	/* Go through the entire state and see if the resource is bound
 	 * anywhere. If it is, mark the relevant state as dirty. This is called on
 	 * realloc_bo.
 	 */
 
-	/* Constbufs */
-	for (i = 1; i < PIPE_MAX_CONSTANT_BUFFERS && !(ctx->dirty & FD_DIRTY_CONSTBUF); i++) {
-		if (ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer == prsc)
-			ctx->dirty |= FD_DIRTY_CONSTBUF;
-		if (ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer == prsc)
-			ctx->dirty |= FD_DIRTY_CONSTBUF;
-	}
-
 	/* VBOs */
-	for (i = 0; i < ctx->vtx.vertexbuf.count && !(ctx->dirty & FD_DIRTY_VTXBUF); i++) {
-		if (ctx->vtx.vertexbuf.vb[i].buffer == prsc)
+	for (unsigned i = 0; i < ctx->vtx.vertexbuf.count && !(ctx->dirty & FD_DIRTY_VTXBUF); i++) {
+		if (ctx->vtx.vertexbuf.vb[i].buffer.resource == prsc)
 			ctx->dirty |= FD_DIRTY_VTXBUF;
 	}
 
-	/* Index buffer */
-	if (ctx->indexbuf.buffer == prsc)
-		ctx->dirty |= FD_DIRTY_INDEXBUF;
+	/* per-shader-stage resources: */
+	for (unsigned stage = 0; stage < PIPE_SHADER_TYPES; stage++) {
+		/* Constbufs.. note that constbuf[0] is normal uniforms emitted in
+		 * cmdstream rather than by pointer..
+		 */
+		const unsigned num_ubos = util_last_bit(ctx->constbuf[stage].enabled_mask);
+		for (unsigned i = 1; i < num_ubos; i++) {
+			if (ctx->dirty_shader[stage] & FD_DIRTY_SHADER_CONST)
+				break;
+			if (ctx->constbuf[stage].cb[i].buffer == prsc)
+				ctx->dirty_shader[stage] |= FD_DIRTY_SHADER_CONST;
+		}
 
-	/* Textures */
-	for (i = 0; i < ctx->verttex.num_textures && !(ctx->dirty & FD_DIRTY_VERTTEX); i++) {
-		if (ctx->verttex.textures[i] && (ctx->verttex.textures[i]->texture == prsc))
-			ctx->dirty |= FD_DIRTY_VERTTEX;
-	}
-	for (i = 0; i < ctx->fragtex.num_textures && !(ctx->dirty & FD_DIRTY_FRAGTEX); i++) {
-		if (ctx->fragtex.textures[i] && (ctx->fragtex.textures[i]->texture == prsc))
-			ctx->dirty |= FD_DIRTY_FRAGTEX;
+		/* Textures */
+		for (unsigned i = 0; i < ctx->tex[stage].num_textures; i++) {
+			if (ctx->dirty_shader[stage] & FD_DIRTY_SHADER_TEX)
+				break;
+			if (ctx->tex[stage].textures[i] && (ctx->tex[stage].textures[i]->texture == prsc))
+				ctx->dirty_shader[stage] |= FD_DIRTY_SHADER_TEX;
+		}
+
+		/* SSBOs */
+		const unsigned num_ssbos = util_last_bit(ctx->shaderbuf[stage].enabled_mask);
+		for (unsigned i = 0; i < num_ssbos; i++) {
+			if (ctx->dirty_shader[stage] & FD_DIRTY_SHADER_SSBO)
+				break;
+			if (ctx->shaderbuf[stage].sb[i].buffer == prsc)
+				ctx->dirty_shader[stage] |= FD_DIRTY_SHADER_SSBO;
+		}
 	}
 }
 
@@ -102,7 +109,6 @@
 		fd_bo_del(rsc->bo);
 
 	rsc->bo = fd_bo_new(screen->dev, size, flags);
-	rsc->timestamp = 0;
 	util_range_set_empty(&rsc->valid_buffer_range);
 	fd_bc_invalidate_resource(rsc, true);
 }
@@ -196,7 +202,6 @@
 
 	/* TODO valid_buffer_range?? */
 	swap(rsc->bo,        shadow->bo);
-	swap(rsc->timestamp, shadow->timestamp);
 	swap(rsc->write_batch,   shadow->write_batch);
 
 	/* at this point, the newly created shadow buffer is not referenced
@@ -699,8 +704,9 @@
 setup_slices(struct fd_resource *rsc, uint32_t alignment, enum pipe_format format)
 {
 	struct pipe_resource *prsc = &rsc->base.b;
+	struct fd_screen *screen = fd_screen(prsc->screen);
 	enum util_format_layout layout = util_format_description(format)->layout;
-	uint32_t pitchalign = fd_screen(prsc->screen)->gmem_alignw;
+	uint32_t pitchalign = screen->gmem_alignw;
 	uint32_t level, size = 0;
 	uint32_t width = prsc->width0;
 	uint32_t height = prsc->height0;
@@ -710,6 +716,9 @@
 	 */
 	uint32_t layers_in_level = rsc->layer_first ? 1 : prsc->array_size;
 
+	if (is_a5xx(screen) && (rsc->base.b.target >= PIPE_TEXTURE_2D))
+		height = align(height, screen->gmem_alignh);
+
 	for (level = 0; level <= prsc->last_level; level++) {
 		struct fd_resource_slice *slice = fd_resource_slice(rsc, level);
 		uint32_t blocks;
@@ -776,6 +785,25 @@
 	realloc_bo(rsc, setup_slices(rsc, 1, prsc->format));
 }
 
+/* Returns true for the depth and depth+stencil formats listed below.  TODO common helper? */
+static bool
+has_depth(enum pipe_format format)
+{
+	switch (format) {
+	case PIPE_FORMAT_Z16_UNORM:
+	case PIPE_FORMAT_Z32_UNORM:
+	case PIPE_FORMAT_Z32_FLOAT:
+	case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+	case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+	case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+	case PIPE_FORMAT_Z24X8_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /**
  * Create a new texture object, using the given template info.
  */
@@ -783,6 +811,7 @@
 fd_resource_create(struct pipe_screen *pscreen,
 		const struct pipe_resource *tmpl)
 {
+	struct fd_screen *screen = fd_screen(pscreen);
 	struct fd_resource *rsc = CALLOC_STRUCT(fd_resource);
 	struct pipe_resource *prsc = &rsc->base.b;
 	enum pipe_format format = tmpl->format;
@@ -810,7 +839,7 @@
 
 	if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
 		format = PIPE_FORMAT_Z32_FLOAT;
-	else if (fd_screen(pscreen)->gpu_id < 400 &&
+	else if (screen->gpu_id < 400 &&
 			 util_format_description(format)->layout == UTIL_FORMAT_LAYOUT_RGTC)
 		format = PIPE_FORMAT_R8G8B8A8_UNORM;
 	rsc->internal_format = format;
@@ -818,8 +847,24 @@
 
 	assert(rsc->cpp);
 
+	// XXX probably need some extra work if we hit rsc shadowing path w/ lrz..
+	if (is_a5xx(screen) && (fd_mesa_debug & FD_DBG_LRZ) && has_depth(format)) {
+		const uint32_t flags = DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
+				DRM_FREEDRENO_GEM_TYPE_KMEM; /* TODO */
+		unsigned lrz_pitch  = align(DIV_ROUND_UP(tmpl->width0, 8), 32);
+		unsigned lrz_height = DIV_ROUND_UP(tmpl->height0, 8);
+		unsigned size = lrz_pitch * lrz_height * 2;
+
+		size += 0x1000; /* for GRAS_LRZ_FAST_CLEAR_BUFFER */
+
+		rsc->lrz_height = lrz_height;
+		rsc->lrz_width = lrz_pitch;
+		rsc->lrz_pitch = lrz_pitch;
+		rsc->lrz = fd_bo_new(screen->dev, size, flags);
+	}
+
 	alignment = slice_alignment(pscreen, tmpl);
-	if (is_a4xx(fd_screen(pscreen)) || is_a5xx(fd_screen(pscreen))) {
+	if (is_a4xx(screen) || is_a5xx(screen)) {
 		switch (tmpl->target) {
 		case PIPE_TEXTURE_3D:
 			rsc->layer_first = false;
@@ -1079,16 +1124,17 @@
 	util_blitter_save_framebuffer(ctx->blitter,
 			ctx->batch ? &ctx->batch->framebuffer : NULL);
 	util_blitter_save_fragment_sampler_states(ctx->blitter,
-			ctx->fragtex.num_samplers,
-			(void **)ctx->fragtex.samplers);
+			ctx->tex[PIPE_SHADER_FRAGMENT].num_samplers,
+			(void **)ctx->tex[PIPE_SHADER_FRAGMENT].samplers);
 	util_blitter_save_fragment_sampler_views(ctx->blitter,
-			ctx->fragtex.num_textures, ctx->fragtex.textures);
+			ctx->tex[PIPE_SHADER_FRAGMENT].num_textures,
+			ctx->tex[PIPE_SHADER_FRAGMENT].textures);
 	if (!render_cond)
 		util_blitter_save_render_condition(ctx->blitter,
 			ctx->cond_query, ctx->cond_cond, ctx->cond_mode);
 
 	if (ctx->batch)
-		fd_hw_query_set_stage(ctx->batch, ctx->batch->draw, stage);
+		fd_batch_set_stage(ctx->batch, stage);
 
 	ctx->in_blit = discard;
 }
@@ -1097,7 +1143,7 @@
 fd_blitter_pipe_end(struct fd_context *ctx)
 {
 	if (ctx->batch)
-		fd_hw_query_set_stage(ctx->batch, ctx->batch->draw, FD_STAGE_NULL);
+		fd_batch_set_stage(ctx->batch, FD_STAGE_NULL);
 	ctx->in_blit = false;
 }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h
index 60ba7e6..5bdb007 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.h
+++ b/src/gallium/drivers/freedreno/freedreno_resource.h
@@ -71,7 +71,6 @@
 	bool layer_first;        /* see above description */
 	uint32_t layer_size;
 	struct fd_resource_slice slices[MAX_MIP_LEVELS];
-	uint32_t timestamp;
 	/* buffer range that has been initialized */
 	struct util_range valid_buffer_range;
 
@@ -96,6 +95,15 @@
 	 * shadowed.
 	 */
 	uint32_t bc_batch_mask;
+
+	/*
+	 * LRZ
+	 */
+	bool lrz_valid : 1;
+	uint16_t lrz_width;  // for lrz clear, does this differ from lrz_pitch?
+	uint16_t lrz_height;
+	uint16_t lrz_pitch;
+	struct fd_bo *lrz;
 };
 
 static inline struct fd_resource *
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 9b3ca4d..a915d65 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -75,10 +75,10 @@
 		{"shaderdb",  FD_DBG_SHADERDB, "Enable shaderdb output"},
 		{"flush",     FD_DBG_FLUSH,  "Force flush after every draw"},
 		{"deqp",      FD_DBG_DEQP,   "Enable dEQP hacks"},
-		{"nir",       FD_DBG_NIR,    "Prefer NIR as native IR"},
 		{"inorder",   FD_DBG_INORDER,"Disable reordering for draws/blits"},
 		{"bstat",     FD_DBG_BSTAT,  "Print batch stats at context destroy"},
 		{"nogrow",    FD_DBG_NOGROW, "Disable \"growable\" cmdstream buffers, even if kernel supports it"},
+		{"lrz",       FD_DBG_LRZ,    "Enable experimental LRZ support (a5xx+)"},
 		DEBUG_NAMED_VALUE_END
 };
 
@@ -189,13 +189,15 @@
 	case PIPE_CAP_USER_CONSTANT_BUFFERS:
 		return is_a4xx(screen) ? 0 : 1;
 
+	case PIPE_CAP_COMPUTE:
+		return has_compute(screen);
+
 	case PIPE_CAP_SHADER_STENCIL_EXPORT:
 	case PIPE_CAP_TGSI_TEXCOORD:
 	case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 	case PIPE_CAP_TEXTURE_BARRIER:
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-	case PIPE_CAP_COMPUTE:
 	case PIPE_CAP_QUERY_MEMORY_INFO:
 	case PIPE_CAP_PCI_GROUP:
 	case PIPE_CAP_PCI_BUS:
@@ -215,10 +217,15 @@
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
-	case PIPE_CAP_DEPTH_CLIP_DISABLE:
 	case PIPE_CAP_CLIP_HALFZ:
 		return is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen);
 
+	case PIPE_CAP_DEPTH_CLIP_DISABLE:
+		return is_a3xx(screen) || is_a4xx(screen);
+
+	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+		return is_a5xx(screen);
+
 	case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
 		return 0;
 	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
@@ -250,6 +257,11 @@
 			return 120;
 		return is_ir3(screen) ? 140 : 120;
 
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+		if (is_a5xx(screen))
+			return 4;
+		return 0;
+
 	/* Unsupported features. */
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
 	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
@@ -267,7 +279,6 @@
 	case PIPE_CAP_MULTI_DRAW_INDIRECT:
 	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
-	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
@@ -282,7 +293,6 @@
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
 	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
 	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
-	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 	case PIPE_CAP_INVALIDATE_BUFFER:
 	case PIPE_CAP_GENERATE_MIPMAP:
 	case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
@@ -307,6 +317,10 @@
 	case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
 	case PIPE_CAP_TGSI_BALLOT:
 	case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+	case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+	case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+	case PIPE_CAP_POST_DEPTH_COVERAGE:
+	case PIPE_CAP_BINDLESS_TEXTURE:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
@@ -370,7 +384,7 @@
 	case PIPE_CAP_QUERY_TIMESTAMP:
 	case PIPE_CAP_QUERY_TIME_ELAPSED:
 		/* only a4xx, requires new enough kernel so we know max_freq: */
-		return (screen->max_freq > 0) && is_a4xx(screen);
+		return (screen->max_freq > 0) && (is_a4xx(screen) || is_a5xx(screen));
 
 	case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
 	case PIPE_CAP_MIN_TEXEL_OFFSET:
@@ -439,7 +453,7 @@
 
 static int
 fd_screen_get_shader_param(struct pipe_screen *pscreen,
-						   enum pipe_shader_type shader,
+		enum pipe_shader_type shader,
 		enum pipe_shader_cap param)
 {
 	struct fd_screen *screen = fd_screen(pscreen);
@@ -450,6 +464,9 @@
 	case PIPE_SHADER_VERTEX:
 		break;
 	case PIPE_SHADER_COMPUTE:
+		if (has_compute(screen))
+			break;
+		return 0;
 	case PIPE_SHADER_GEOMETRY:
 		/* maye we could emulate.. */
 		return 0;
@@ -510,22 +527,142 @@
 	case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
 		return 16;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
-		if ((fd_mesa_debug & FD_DBG_NIR) && is_ir3(screen))
+		if (is_ir3(screen))
 			return PIPE_SHADER_IR_NIR;
 		return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_SUPPORTED_IRS:
+		if (is_ir3(screen)) {
+			return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI);
+		} else {
+			return (1 << PIPE_SHADER_IR_TGSI);
+		}
 		return 0;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
-	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
 	case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+	case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+		return 0;
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+		if (is_a5xx(screen)) {
+			/* a5xx (and a4xx for that matter) has one state-block
+			 * for compute-shader SSBO's and another that is shared
+			 * by VS/HS/DS/GS/FS..  so to simplify things for now
+			 * just advertise SSBOs for FS and CS.  We could possibly
+			 * do what blob does, and partition the space for
+			 * VS/HS/DS/GS/FS.  The blob advertises:
+			 *
+			 *   GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS: 4
+			 *   GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS: 4
+			 *   GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS: 4
+			 *   GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS: 4
+			 *   GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS: 4
+			 *   GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS: 24
+			 *   GL_MAX_COMBINED_SHADER_STORAGE_BLOCKS: 24
+			 *
+			 * I think that way we could avoid having to patch shaders
+			 * for actual SSBO indexes by using a static partitioning.
+			 */
+			switch(shader)
+			{
+			case PIPE_SHADER_FRAGMENT:
+			case PIPE_SHADER_COMPUTE:
+				return 24;
+			default:
+				return 0;
+			}
+		}
+		return 0;
+	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+		/* probably should be same as MAX_SHADER_BUFFERS but not implemented yet */
 		return 0;
 	}
 	debug_printf("unknown shader param %d\n", param);
 	return 0;
 }
 
+/* get_compute_param hook: writes the cap value to ret (if non-NULL) and returns
+ * its size in bytes, 0 if unsupported.  TODO if a3xx/a4xx limits differ much,
+ * maybe move this into per-generation backend? */
+static int
+fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type,
+		enum pipe_compute_cap param, void *ret)
+{
+	struct fd_screen *screen = fd_screen(pscreen);
+	const char * const ir = "ir3";
+
+	if (!has_compute(screen))
+		return 0;
+
+	switch (param) {
+	case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+		if (ret) {
+			uint32_t *address_bits = ret;
+			address_bits[0] = 32;
+
+			if (is_a5xx(screen))
+				address_bits[0] = 64;
+		}
+		return 1 * sizeof(uint32_t);
+
+	case PIPE_COMPUTE_CAP_IR_TARGET:
+		if (ret)
+			sprintf(ret, "%s", ir);  /* never pass a non-literal as the format */
+		return (strlen(ir) + 1) * sizeof(char);  /* +1: sprintf() also writes the NUL */
+
+	case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+		if (ret) {
+			uint64_t *grid_dimension = ret;
+			grid_dimension[0] = 3;
+		}
+		return 1 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+		if (ret) {
+			uint64_t *grid_size = ret;
+			grid_size[0] = 65535;
+			grid_size[1] = 65535;
+			grid_size[2] = 65535;
+		}
+		return 3 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+		if (ret) {
+			uint64_t *block_size = ret;
+			block_size[0] = 1024;
+			block_size[1] = 1024;
+			block_size[2] = 64;
+		}
+		return 3 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+		if (ret) {
+			uint64_t *max_threads_per_block = ret;
+			*max_threads_per_block = 1024;
+		}
+		return sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+	case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+	case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
+	case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+		break;  /* not reported yet: falls through to "return 0" below */
+	case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+		if (ret) {
+			uint64_t *max = ret;
+			*max = 32768;
+		}
+		return sizeof(uint64_t);
+	case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+	case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+	case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+	case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+	case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+		break;  /* not reported yet: falls through to "return 0" below */
+	}
+
+	return 0;
+}
+
 static const void *
 fd_get_compiler_options(struct pipe_screen *pscreen,
 		enum pipe_shader_ir ir, unsigned shader)
@@ -533,7 +670,7 @@
 	struct fd_screen *screen = fd_screen(pscreen);
 
 	if (is_ir3(screen))
-		return ir3_get_compiler_options();
+		return ir3_get_compiler_options(screen->compiler);
 
 	return NULL;
 }
@@ -697,9 +834,11 @@
 	if (screen->gpu_id >= 500) {
 		screen->gmem_alignw = 64;
 		screen->gmem_alignh = 32;
+		screen->num_vsc_pipes = 16;
 	} else {
 		screen->gmem_alignw = 32;
 		screen->gmem_alignh = 32;
+		screen->num_vsc_pipes = 8;
 	}
 
 	/* NOTE: don't enable reordering on a2xx, since completely untested.
@@ -718,6 +857,7 @@
 	pscreen->get_param = fd_screen_get_param;
 	pscreen->get_paramf = fd_screen_get_paramf;
 	pscreen->get_shader_param = fd_screen_get_shader_param;
+	pscreen->get_compute_param = fd_get_compute_param;
 	pscreen->get_compiler_options = fd_get_compiler_options;
 
 	fd_resource_screen_init(pscreen);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index f2b1d8c..c5018da 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -38,6 +38,7 @@
 #include "os/os_thread.h"
 
 #include "freedreno_batch_cache.h"
+#include "freedreno_util.h"
 
 struct fd_bo;
 
@@ -65,6 +66,7 @@
 	uint32_t max_freq;
 	uint32_t max_rts;        /* max # of render targets */
 	uint32_t gmem_alignw, gmem_alignh;
+	uint32_t num_vsc_pipes;
 	bool has_timestamp;
 
 	void *compiler;          /* currently unused for a2xx */
@@ -127,4 +129,10 @@
 	return is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen);
 }
 
+static inline bool
+has_compute(struct fd_screen *screen)
+{
+	return is_a5xx(screen);
+}
+
 #endif /* FREEDRENO_SCREEN_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c
index 804d2b7..012e2b3 100644
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -109,7 +109,56 @@
 
 	so->enabled_mask |= 1 << index;
 	so->dirty_mask |= 1 << index;
-	ctx->dirty |= FD_DIRTY_CONSTBUF;
+	ctx->dirty_shader[shader] |= FD_DIRTY_SHADER_CONST;
+	ctx->dirty |= FD_DIRTY_CONST;
+}
+
+static void
+fd_set_shader_buffers(struct pipe_context *pctx,
+		enum pipe_shader_type shader,
+		unsigned start, unsigned count,
+		const struct pipe_shader_buffer *buffers)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	struct fd_shaderbuf_stateobj *so = &ctx->shaderbuf[shader];
+	unsigned mask = 0;
+
+	if (buffers) {
+		for (unsigned i = 0; i < count; i++) {
+			unsigned n = i + start;
+			struct pipe_shader_buffer *buf = &so->sb[n];
+
+			if ((buf->buffer == buffers[i].buffer) &&
+					(buf->buffer_offset == buffers[i].buffer_offset) &&
+					(buf->buffer_size == buffers[i].buffer_size))
+				continue;
+
+			mask |= BIT(n);
+
+			buf->buffer_offset = buffers[i].buffer_offset;
+			buf->buffer_size = buffers[i].buffer_size;
+			pipe_resource_reference(&buf->buffer, buffers[i].buffer);
+
+			if (buf->buffer)
+				so->enabled_mask |= BIT(n);
+			else
+				so->enabled_mask &= ~BIT(n);
+		}
+	} else {
+		mask = (BIT(count) - 1) << start;
+
+		for (unsigned i = 0; i < count; i++) {
+			unsigned n = i + start;
+			struct pipe_shader_buffer *buf = &so->sb[n];
+
+			pipe_resource_reference(&buf->buffer, NULL);
+		}
+
+		so->enabled_mask &= ~mask;
+	}
+
+	so->dirty_mask |= mask;
+	ctx->dirty_shader[shader] |= FD_DIRTY_SHADER_SSBO;
 }
 
 static void
@@ -125,13 +174,13 @@
 		fd_batch_reference(&old_batch, ctx->batch);
 
 		if (likely(old_batch))
-			fd_hw_query_set_stage(old_batch, old_batch->draw, FD_STAGE_NULL);
+			fd_batch_set_stage(old_batch, FD_STAGE_NULL);
 
 		batch = fd_batch_from_fb(&ctx->screen->batch_cache, ctx, framebuffer);
 		fd_batch_reference(&ctx->batch, NULL);
 		fd_reset_wfi(batch);
 		ctx->batch = batch;
-		ctx->dirty = ~0;
+		fd_context_all_dirty(ctx);
 
 		if (old_batch && old_batch->blit && !old_batch->back_blit) {
 			/* for blits, there is not really much point in hanging on
@@ -210,8 +259,8 @@
 	 */
 	if (ctx->screen->gpu_id < 300) {
 		for (i = 0; i < count; i++) {
-			bool new_enabled = vb && (vb[i].buffer || vb[i].user_buffer);
-			bool old_enabled = so->vb[i].buffer || so->vb[i].user_buffer;
+			bool new_enabled = vb && vb[i].buffer.resource;
+			bool old_enabled = so->vb[i].buffer.resource != NULL;
 			uint32_t new_stride = vb ? vb[i].stride : 0;
 			uint32_t old_stride = so->vb[i].stride;
 			if ((new_enabled != old_enabled) || (new_stride != old_stride)) {
@@ -228,24 +277,6 @@
 }
 
 static void
-fd_set_index_buffer(struct pipe_context *pctx,
-		const struct pipe_index_buffer *ib)
-{
-	struct fd_context *ctx = fd_context(pctx);
-
-	if (ib) {
-		pipe_resource_reference(&ctx->indexbuf.buffer, ib->buffer);
-		ctx->indexbuf.index_size = ib->index_size;
-		ctx->indexbuf.offset = ib->offset;
-		ctx->indexbuf.user_buffer = ib->user_buffer;
-	} else {
-		pipe_resource_reference(&ctx->indexbuf.buffer, NULL);
-	}
-
-	ctx->dirty |= FD_DIRTY_INDEXBUF;
-}
-
-static void
 fd_blend_state_bind(struct pipe_context *pctx, void *hwcso)
 {
 	struct fd_context *ctx = fd_context(pctx);
@@ -402,6 +433,32 @@
 	ctx->dirty |= FD_DIRTY_STREAMOUT;
 }
 
+static void
+fd_bind_compute_state(struct pipe_context *pctx, void *state)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->compute = state;
+	ctx->dirty_shader[PIPE_SHADER_COMPUTE] |= FD_DIRTY_SHADER_PROG;
+}
+
+static void
+fd_set_compute_resources(struct pipe_context *pctx,
+		unsigned start, unsigned count, struct pipe_surface **prscs)
+{
+	// TODO
+}
+
+static void
+fd_set_global_binding(struct pipe_context *pctx,
+		unsigned first, unsigned count, struct pipe_resource **prscs,
+		uint32_t **handles)
+{
+	/* TODO only used by clover.. seems to need us to return the actual
+	 * gpuaddr of the buffer.. which isn't really exposed to mesa atm.
+	 * How is this used?
+	 */
+}
+
 void
 fd_state_init(struct pipe_context *pctx)
 {
@@ -410,13 +467,13 @@
 	pctx->set_clip_state = fd_set_clip_state;
 	pctx->set_sample_mask = fd_set_sample_mask;
 	pctx->set_constant_buffer = fd_set_constant_buffer;
+	pctx->set_shader_buffers = fd_set_shader_buffers;
 	pctx->set_framebuffer_state = fd_set_framebuffer_state;
 	pctx->set_polygon_stipple = fd_set_polygon_stipple;
 	pctx->set_scissor_states = fd_set_scissor_states;
 	pctx->set_viewport_states = fd_set_viewport_states;
 
 	pctx->set_vertex_buffers = fd_set_vertex_buffers;
-	pctx->set_index_buffer = fd_set_index_buffer;
 
 	pctx->bind_blend_state = fd_blend_state_bind;
 	pctx->delete_blend_state = fd_blend_state_delete;
@@ -434,4 +491,10 @@
 	pctx->create_stream_output_target = fd_create_stream_output_target;
 	pctx->stream_output_target_destroy = fd_stream_output_target_destroy;
 	pctx->set_stream_output_targets = fd_set_stream_output_targets;
+
+	if (has_compute(fd_screen(pctx->screen))) {
+		pctx->bind_compute_state = fd_bind_compute_state;
+		pctx->set_compute_resources = fd_set_compute_resources;
+		pctx->set_global_binding = fd_set_global_binding;
+	}
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_texture.c b/src/gallium/drivers/freedreno/freedreno_texture.c
index 2d03892..1487f74 100644
--- a/src/gallium/drivers/freedreno/freedreno_texture.c
+++ b/src/gallium/drivers/freedreno/freedreno_texture.c
@@ -91,14 +91,9 @@
 {
 	struct fd_context *ctx = fd_context(pctx);
 
-	if (shader == PIPE_SHADER_FRAGMENT) {
-		bind_sampler_states(&ctx->fragtex, start, nr, hwcso);
-		ctx->dirty |= FD_DIRTY_FRAGTEX;
-	}
-	else if (shader == PIPE_SHADER_VERTEX) {
-		bind_sampler_states(&ctx->verttex, start, nr, hwcso);
-		ctx->dirty |= FD_DIRTY_VERTTEX;
-	}
+	bind_sampler_states(&ctx->tex[shader], start, nr, hwcso);
+	ctx->dirty_shader[shader] |= FD_DIRTY_SHADER_TEX;
+	ctx->dirty |= FD_DIRTY_TEX;
 }
 
 void
@@ -108,27 +103,9 @@
 {
 	struct fd_context *ctx = fd_context(pctx);
 
-	switch (shader) {
-	case PIPE_SHADER_FRAGMENT:
-		/* on a2xx, since there is a flat address space for textures/samplers,
-		 * a change in # of fragment textures/samplers will trigger patching
-		 * and re-emitting the vertex shader:
-		 *
-		 * (note: later gen's ignore FD_DIRTY_TEXSTATE so fine to set it)
-		 */
-		if (nr != ctx->fragtex.num_textures)
-			ctx->dirty |= FD_DIRTY_TEXSTATE;
-
-		set_sampler_views(&ctx->fragtex, start, nr, views);
-		ctx->dirty |= FD_DIRTY_FRAGTEX;
-		break;
-	case PIPE_SHADER_VERTEX:
-		set_sampler_views(&ctx->verttex, start, nr, views);
-		ctx->dirty |= FD_DIRTY_VERTTEX;
-		break;
-	default:
-		break;
-	}
+	set_sampler_views(&ctx->tex[shader], start, nr, views);
+	ctx->dirty_shader[shader] |= FD_DIRTY_SHADER_TEX;
+	ctx->dirty |= FD_DIRTY_TEX;
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index a9b38c9..14fcf1d 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -76,10 +76,10 @@
 #define FD_DBG_SHADERDB 0x0800
 #define FD_DBG_FLUSH    0x1000
 #define FD_DBG_DEQP     0x2000
-#define FD_DBG_NIR      0x4000
-#define FD_DBG_INORDER  0x8000
-#define FD_DBG_BSTAT   0x10000
-#define FD_DBG_NOGROW  0x20000
+#define FD_DBG_INORDER  0x4000
+#define FD_DBG_BSTAT    0x8000
+#define FD_DBG_NOGROW  0x10000
+#define FD_DBG_LRZ     0x20000
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
@@ -411,13 +411,8 @@
 	extern unsigned marker_cnt;
 //XXX	unsigned reg = REG_A5XX_CP_SCRATCH_REG(scratch_idx);
 	unsigned reg = 0x00000b78 + scratch_idx;
-	assert(reg != HW_QUERY_BASE_REG);
-	if (reg == HW_QUERY_BASE_REG)
-		return;
-	OUT_WFI5(ring);
 	OUT_PKT4(ring, reg, 1);
 	OUT_RING(ring, ++marker_cnt);
-	OUT_WFI5(ring);
 }
 
 /* helper to get numeric value from environment variable..  mostly
@@ -449,4 +444,27 @@
 #define foreach_bit(b, mask) \
 	for (uint32_t _m = (mask); _m && ({(b) = u_bit_scan(&_m); 1;});)
 
+
+#define BIT(bit) (1u << (bit))  /* single-bit mask; arg parenthesized so BIT(a | b) expands correctly */
+
+/*
+ * a4xx+ helpers:
+ */
+
+static inline enum a4xx_state_block
+fd4_stage2shadersb(enum shader_t type)
+{
+	switch (type) {
+	case SHADER_VERTEX:
+		return SB4_VS_SHADER;
+	case SHADER_FRAGMENT:
+		return SB4_FS_SHADER;
+	case SHADER_COMPUTE:
+		return SB4_CS_SHADER;
+	default:
+		unreachable("bad shader type");
+		return ~0;
+	}
+}
+
 #endif /* FREEDRENO_UTIL_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index e29d156..4685ed6 100644
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -159,16 +159,16 @@
 		break;
 	case OPC_BR:
 		printf(" %sp0.%c, #%d", cat0->inv ? "!" : "",
-				component[cat0->comp], cat0->a3xx.immed);
+				component[cat0->comp], cat0->a5xx.immed);
 		break;
 	case OPC_JUMP:
 	case OPC_CALL:
-		printf(" #%d", cat0->a3xx.immed);
+		printf(" #%d", cat0->a5xx.immed);
 		break;
 	}
 
-	if ((debug & PRINT_VERBOSE) && (cat0->a3xx.dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4))
-		printf("\t{0: %x,%x,%x,%x}", cat0->a3xx.dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4);
+	if ((debug & PRINT_VERBOSE) && (cat0->dummy2|cat0->dummy3|cat0->dummy4))
+		printf("\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4);
 }
 
 static void print_instr_cat1(instr_t *instr)
@@ -506,7 +506,6 @@
 	case OPC_STP:
 	case OPC_STI:
 	case OPC_STLW:
-	case OPC_STGB_4D_4:
 	case OPC_STIB:
 		dst.full  = true;
 		src1.full = type_size(cat6->type) == 32;
@@ -523,6 +522,18 @@
 	case OPC_PREFETCH:
 	case OPC_RESINFO:
 		break;
+	case OPC_LDGB:
+		printf(".%s", cat6->ldgb.typed ? "typed" : "untyped");
+		printf(".%dd", cat6->ldgb.d + 1);
+		printf(".%s", type[cat6->type]);
+		printf(".%d", cat6->ldgb.type_size + 1);
+		break;
+	case OPC_STGB:
+		printf(".%s", cat6->stgb.typed ? "typed" : "untyped");
+		printf(".%dd", cat6->stgb.d + 1);
+		printf(".%s", type[cat6->type]);
+		printf(".%d", cat6->stgb.type_size + 1);
+		break;
 	case OPC_ATOMIC_ADD:
 	case OPC_ATOMIC_SUB:
 	case OPC_ATOMIC_XCHG:
@@ -558,6 +569,7 @@
 		break;
 
 	case OPC_LDG:
+	case OPC_LDC:
 		ss = 'g';
 		break;
 	case OPC_LDP:
@@ -589,6 +601,61 @@
 		break;
 	}
 
+	if (_OPC(6, cat6->opc) == OPC_STGB) {
+		struct reginfo src3;
+
+		memset(&src3, 0, sizeof(src3));
+
+		src1.reg = (reg_t)(cat6->stgb.src1);
+		src2.reg = (reg_t)(cat6->stgb.src2);
+		src2.im  = cat6->stgb.src2_im;
+		src3.reg = (reg_t)(cat6->stgb.src3);
+		src3.im  = cat6->stgb.src3_im;
+		src3.full = true;
+
+		printf("g[%u], ", cat6->stgb.dst_ssbo);
+		print_src(&src1);
+		printf(", ");
+		print_src(&src2);
+		printf(", ");
+		print_src(&src3);
+
+		if (debug & PRINT_VERBOSE)
+			printf(" (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
+
+		return;
+	}
+
+	if ((_OPC(6, cat6->opc) == OPC_LDGB) || is_atomic(_OPC(6, cat6->opc))) {
+
+		src1.reg = (reg_t)(cat6->ldgb.src1);
+		src1.im  = cat6->ldgb.src1_im;
+		src2.reg = (reg_t)(cat6->ldgb.src2);
+		src2.im  = cat6->ldgb.src2_im;
+		dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+		print_src(&dst);
+		printf(", ");
+		printf("g[%u], ", cat6->ldgb.src_ssbo);
+		print_src(&src1);
+		printf(", ");
+		print_src(&src2);
+
+		if (is_atomic(_OPC(6, cat6->opc))) {
+			struct reginfo src3;
+			memset(&src3, 0, sizeof(src3));
+			src3.reg = (reg_t)(cat6->ldgb.src3);
+			src3.full = true;
+
+			printf(", ");
+			print_src(&src3);
+		}
+
+		if (debug & PRINT_VERBOSE)
+			printf(" (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+
+		return;
+	}
 	if (cat6->dst_off) {
 		dst.reg = (reg_t)(cat6->c.dst);
 		dstoff  = cat6->c.off;
@@ -806,10 +873,10 @@
 	OPC(6, OPC_ATOMIC_AND,     atomic.and),
 	OPC(6, OPC_ATOMIC_OR,      atomic.or),
 	OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
-	OPC(6, OPC_LDGB_TYPED_4D,    ldgb.typed.3d),
-	OPC(6, OPC_STGB_4D_4,    stgb.4d.4),
+	OPC(6, OPC_LDGB,         ldgb),
+	OPC(6, OPC_STGB,         stgb),
 	OPC(6, OPC_STIB,         stib),
-	OPC(6, OPC_LDC_4,        ldc.4),
+	OPC(6, OPC_LDC,          ldc),
 	OPC(6, OPC_LDLV,         ldlv),
 
 
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index 0d369b6..b429b3b 100644
--- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -189,10 +189,10 @@
 	OPC_ATOMIC_AND      = _OPC(6, 24),
 	OPC_ATOMIC_OR       = _OPC(6, 25),
 	OPC_ATOMIC_XOR      = _OPC(6, 26),
-	OPC_LDGB_TYPED_4D   = _OPC(6, 27),
-	OPC_STGB_4D_4       = _OPC(6, 28),
+	OPC_LDGB            = _OPC(6, 27),
+	OPC_STGB            = _OPC(6, 28),
 	OPC_STIB            = _OPC(6, 29),
-	OPC_LDC_4           = _OPC(6, 30),
+	OPC_LDC             = _OPC(6, 30),
 	OPC_LDLV            = _OPC(6, 31),
 
 	/* meta instructions (category -1): */
@@ -639,18 +639,63 @@
 
 	uint32_t dst      : 8;
 	uint32_t mustbe0  : 1;
-	uint32_t pad0     : 23;
+	uint32_t idx      : 8;
+	uint32_t pad0     : 15;
 } instr_cat6d_t;
 
-/* I think some of the other cat6 instructions use additional
- * sub-encodings..
+/* ldgb and atomics.. atomics use 3rd src and pad0=1, pad3=3.  For
+ * ldgb pad0=0, pad3=2
  */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t pad0     : 1;
+	uint32_t src3     : 8;
+	uint32_t d        : 2;
+	uint32_t typed    : 1;
+	uint32_t type_size : 2;
+	uint32_t src1     : 8;
+	uint32_t src1_im  : 1;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t mustbe0  : 1;
+	uint32_t src_ssbo : 8;
+	uint32_t pad2     : 3;  // type
+	uint32_t pad3     : 2;
+	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6ldgb_t;
+
+/* stgb, pad0=0, pad3=2
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t mustbe1  : 1;  // ???
+	uint32_t src1     : 8;
+	uint32_t d        : 2;
+	uint32_t typed    : 1;
+	uint32_t type_size : 2;
+	uint32_t pad0     : 9;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t src3     : 8;
+	uint32_t src3_im  : 1;
+	uint32_t dst_ssbo : 8;
+	uint32_t pad2     : 3;  // type
+	uint32_t pad3     : 2;
+	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6stgb_t;
 
 typedef union PACKED {
 	instr_cat6a_t a;
 	instr_cat6b_t b;
 	instr_cat6c_t c;
 	instr_cat6d_t d;
+	instr_cat6ldgb_t ldgb;
+	instr_cat6stgb_t stgb;
 	struct PACKED {
 		/* dword0: */
 		uint32_t src_off  : 1;
@@ -733,4 +778,24 @@
 	}
 }
 
+static inline bool is_atomic(opc_t opc)
+{
+	switch (opc) {
+	case OPC_ATOMIC_ADD:
+	case OPC_ATOMIC_SUB:
+	case OPC_ATOMIC_XCHG:
+	case OPC_ATOMIC_INC:
+	case OPC_ATOMIC_DEC:
+	case OPC_ATOMIC_CMPXCHG:
+	case OPC_ATOMIC_MIN:
+	case OPC_ATOMIC_MAX:
+	case OPC_ATOMIC_AND:
+	case OPC_ATOMIC_OR:
+	case OPC_ATOMIC_XOR:
+		return true;
+	default:
+		return false;
+	}
+}
+
 #endif /* INSTR_A3XX_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index c5a0302..d703f4e 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -106,7 +106,7 @@
 			info->max_const = MAX2(info->max_const, max);
 		} else if (val.num == 63) {
 			/* ignore writes to dummy register r63.x */
-		} else if ((max != REG_A0) && (max != REG_P0)) {
+		} else if (max < 48) {
 			if (reg->flags & IR3_REG_HALF) {
 				info->max_half_reg = MAX2(info->max_half_reg, max);
 			} else {
@@ -475,6 +475,13 @@
 	struct ir3_register *dst, *src1, *src2;
 	instr_cat6_t *cat6 = ptr;
 
+	cat6->type     = instr->cat6.type;
+	cat6->opc      = instr->opc;
+	cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+	cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat6->g        = !!(instr->flags & IR3_INSTR_G);
+	cat6->opc_cat  = 6;
+
 	/* the "dst" for a store instruction is (from the perspective
 	 * of data flow in the shader, ie. register use/def, etc) in
 	 * fact a register that is read by the instruction, rather
@@ -500,7 +507,65 @@
 	 * indicate to use the src_off encoding even if offset is zero
 	 * (but then what to do about dst_off?)
 	 */
-	if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) {
+	if ((instr->opc == OPC_LDGB) || is_atomic(instr->opc)) {
+		struct ir3_register *src3 = instr->regs[3];
+		instr_cat6ldgb_t *ldgb = ptr;
+
+		/* maybe these two bits both determine the instruction encoding? */
+		cat6->src_off = false;
+
+		ldgb->d = 4 - 1;      /* always .4d ? */
+		ldgb->typed = false;  /* TODO true for images */
+		ldgb->type_size = instr->cat6.iim_val - 1;
+
+		ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+		/* first src is src_ssbo: */
+		iassert(src1->flags & IR3_REG_IMMED);
+		ldgb->src_ssbo = src1->uim_val;
+
+		/* then next two are src1/src2: */
+		ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+		ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+		ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+		ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+		if (is_atomic(instr->opc)) {
+			struct ir3_register *src4 = instr->regs[4];
+			ldgb->src3 = reg(src4, info, instr->repeat, 0);
+			ldgb->pad0 = 0x1;
+			ldgb->pad3 = 0x3;
+		} else {
+			ldgb->pad0 = 0x0;
+			ldgb->pad3 = 0x2;
+		}
+
+		return 0;
+	} else if (instr->opc == OPC_STGB) {
+		struct ir3_register *src3 = instr->regs[4];
+		instr_cat6stgb_t *stgb = ptr;
+
+		/* maybe these two bits both determine the instruction encoding? */
+		cat6->src_off = true;
+		stgb->pad3 = 0x2;
+
+		stgb->d = 4 - 1;    /* always .4d ? */
+		stgb->typed = false;
+		stgb->type_size = instr->cat6.iim_val - 1;
+
+		/* first src is dst_ssbo: */
+		iassert(dst->flags & IR3_REG_IMMED);
+		stgb->dst_ssbo = dst->uim_val;
+
+		/* then src1/src2/src3: */
+		stgb->src1 = reg(src1, info, instr->repeat, 0);
+		stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+		stgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+		stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+		stgb->src3_im = !!(src3->flags & IR3_REG_IMMED);
+
+		return 0;
+	} else if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) {
 		instr_cat6a_t *cat6a = ptr;
 
 		cat6->src_off = true;
@@ -536,13 +601,6 @@
 		cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
 	}
 
-	cat6->type     = instr->cat6.type;
-	cat6->opc      = instr->opc;
-	cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-	cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
-	cat6->g        = !!(instr->flags & IR3_INSTR_G);
-	cat6->opc_cat  = 6;
-
 	return 0;
 }
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 480b27c..de7a2a8 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -58,8 +58,14 @@
 		IR3_REG_CONST  = 0x001,
 		IR3_REG_IMMED  = 0x002,
 		IR3_REG_HALF   = 0x004,
-		IR3_REG_RELATIV= 0x008,
-		IR3_REG_R      = 0x010,
+		/* high registers are used for some things in compute shaders,
+		 * for example.  Seems to be for things that are global to all
+		 * threads in a wave, so possibly these are global/shared by
+		 * all the threads in the wave?
+		 */
+		IR3_REG_HIGH   = 0x008,
+		IR3_REG_RELATIV= 0x010,
+		IR3_REG_R      = 0x020,
 		/* Most instructions, it seems, can do float abs/neg but not
 		 * integer.  The CP pass needs to know what is intended (int or
 		 * float) in order to do the right thing.  For this reason the
@@ -68,23 +74,23 @@
 		 * bitwise not, so split that out into a new flag to make it
 		 * more clear.
 		 */
-		IR3_REG_FNEG   = 0x020,
-		IR3_REG_FABS   = 0x040,
-		IR3_REG_SNEG   = 0x080,
-		IR3_REG_SABS   = 0x100,
-		IR3_REG_BNOT   = 0x200,
-		IR3_REG_EVEN   = 0x400,
-		IR3_REG_POS_INF= 0x800,
+		IR3_REG_FNEG   = 0x040,
+		IR3_REG_FABS   = 0x080,
+		IR3_REG_SNEG   = 0x100,
+		IR3_REG_SABS   = 0x200,
+		IR3_REG_BNOT   = 0x400,
+		IR3_REG_EVEN   = 0x800,
+		IR3_REG_POS_INF= 0x1000,
 		/* (ei) flag, end-input?  Set on last bary, presumably to signal
 		 * that the shader needs no more input:
 		 */
-		IR3_REG_EI     = 0x1000,
+		IR3_REG_EI     = 0x2000,
 		/* meta-flags, for intermediate stages of IR, ie.
 		 * before register assignment is done:
 		 */
-		IR3_REG_SSA    = 0x2000,   /* 'instr' is ptr to assigning instr */
-		IR3_REG_ARRAY  = 0x4000,
-		IR3_REG_PHI_SRC= 0x8000,   /* phi src, regs[0]->instr points to phi */
+		IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
+		IR3_REG_ARRAY  = 0x8000,
+		IR3_REG_PHI_SRC= 0x10000,  /* phi src, regs[0]->instr points to phi */
 
 	} flags;
 	union {
@@ -220,7 +226,7 @@
 			type_t type;
 			int src_offset;
 			int dst_offset;
-			int iim_val;
+			int iim_val;          /* for ldgb/stgb, # of components */
 		} cat6;
 		/* for meta-instructions, just used to hold extra data
 		 * before instruction scheduling, etc
@@ -337,6 +343,21 @@
 	return num;
 }
 
+/*
+ * Stupid/simple growable array implementation:
+ */
+#define DECLARE_ARRAY(type, name) \
+	unsigned name ## _count, name ## _sz; \
+	type * name;
+
+#define array_insert(ctx, arr, val) do { \
+		if (arr ## _count == arr ## _sz) { \
+			arr ## _sz = MAX2(2 * arr ## _sz, 16); \
+			arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
+		} \
+		arr[arr ##_count++] = val; \
+	} while (0)
+
 struct ir3 {
 	struct ir3_compiler *compiler;
 
@@ -350,8 +371,7 @@
 	 * threads in a group are killed before the last bary.f gets
 	 * a chance to signal end of input (ei).
 	 */
-	unsigned baryfs_count, baryfs_sz;
-	struct ir3_instruction **baryfs;
+	DECLARE_ARRAY(struct ir3_instruction *, baryfs);
 
 	/* Track all indirect instructions (read and write).  To avoid
 	 * deadlock scenario where an address register gets scheduled,
@@ -363,17 +383,15 @@
 	 * convenient list of instructions that reference some address
 	 * register simplifies this.
 	 */
-	unsigned indirects_count, indirects_sz;
-	struct ir3_instruction **indirects;
+	DECLARE_ARRAY(struct ir3_instruction *, indirects);
+
 	/* and same for instructions that consume predicate register: */
-	unsigned predicates_count, predicates_sz;
-	struct ir3_instruction **predicates;
+	DECLARE_ARRAY(struct ir3_instruction *, predicates);
 
 	/* Track texture sample instructions which need texture state
 	 * patched in (for astc-srgb workaround):
 	 */
-	unsigned astc_srgb_count, astc_srgb_sz;
-	struct ir3_instruction **astc_srgb;
+	DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
 
 	/* List of blocks: */
 	struct list_head block_list;
@@ -382,14 +400,14 @@
 	struct list_head array_list;
 };
 
-typedef struct nir_variable nir_variable;
+typedef struct nir_register nir_register;
 
 struct ir3_array {
 	struct list_head node;
 	unsigned length;
 	unsigned id;
 
-	nir_variable *var;
+	nir_register *r;
 
 	/* We track the last write and last access (read or write) to
 	 * setup dependencies on instructions that read or write the
@@ -432,8 +450,7 @@
 	/* Track instructions which do not write a register but other-
 	 * wise must not be discarded (such as kill, stg, etc)
 	 */
-	unsigned keeps_count, keeps_sz;
-	struct ir3_instruction **keeps;
+	DECLARE_ARRAY(struct ir3_instruction *, keeps);
 
 	/* used for per-pass extra block data.  Mainly used right
 	 * now in RA step to track livein/liveout.
@@ -596,6 +613,7 @@
 	 */
 	switch (instr->opc) {
 	case OPC_STG:
+	case OPC_STGB:
 	case OPC_STP:
 	case OPC_STL:
 	case OPC_STLW:
@@ -611,11 +629,12 @@
 {
 	switch (instr->opc) {
 	case OPC_LDG:
+	case OPC_LDGB:
 	case OPC_LDL:
 	case OPC_LDP:
 	case OPC_L2G:
 	case OPC_LDLW:
-	case OPC_LDC_4:
+	case OPC_LDC:
 	case OPC_LDLV:
 		/* probably some others too.. */
 		return true;
@@ -854,14 +873,6 @@
 	}
 }
 
-#define array_insert(ctx, arr, val) do { \
-		if (arr ## _count == arr ## _sz) { \
-			arr ## _sz = MAX2(2 * arr ## _sz, 16); \
-			arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
-		} \
-		arr[arr ##_count++] = val; \
-	} while (0)
-
 /* iterator for an instructions's sources (reg), also returns src #: */
 #define foreach_src_n(__srcreg, __n, __instr) \
 	if ((__instr)->regs_count) \
@@ -925,7 +936,7 @@
 		bool frag_coord, bool frag_face);
 
 /* legalize: */
-void ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary);
+void ir3_legalize(struct ir3 *ir, bool *has_samp, bool *has_ssbo, int *max_bary);
 
 /* ************************************************************************* */
 /* instruction helpers */
@@ -1019,6 +1030,24 @@
 	return instr;                                                        \
 }
 
+#define INSTR4(name)                                                     \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+		struct ir3_instruction *a, unsigned aflags,                      \
+		struct ir3_instruction *b, unsigned bflags,                      \
+		struct ir3_instruction *c, unsigned cflags,                      \
+		struct ir3_instruction *d, unsigned dflags)                      \
+{                                                                        \
+	struct ir3_instruction *instr =                                      \
+		ir3_instr_create2(block, OPC_##name, 5);                         \
+	ir3_reg_create(instr, 0, 0);   /* dst */                             \
+	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
+	ir3_reg_create(instr, 0, IR3_REG_SSA | bflags)->instr = b;           \
+	ir3_reg_create(instr, 0, IR3_REG_SSA | cflags)->instr = c;           \
+	ir3_reg_create(instr, 0, IR3_REG_SSA | dflags)->instr = d;           \
+	return instr;                                                        \
+}
+
 /* cat0 instructions: */
 INSTR0(BR);
 INSTR0(JUMP);
@@ -1136,6 +1165,19 @@
 INSTR2(LDLV)
 INSTR2(LDG)
 INSTR3(STG)
+INSTR3(LDGB);
+INSTR4(STGB);
+INSTR4(ATOMIC_ADD);
+INSTR4(ATOMIC_SUB);
+INSTR4(ATOMIC_XCHG);
+INSTR4(ATOMIC_INC);
+INSTR4(ATOMIC_DEC);
+INSTR4(ATOMIC_CMPXCHG);
+INSTR4(ATOMIC_MIN);
+INSTR4(ATOMIC_MAX);
+INSTR4(ATOMIC_AND);
+INSTR4(ATOMIC_OR);
+INSTR4(ATOMIC_XOR);
 
 /* ************************************************************************* */
 /* split this out or find some helper to use.. like main/bitset.h.. */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index ba1bfa8..fdec3f2 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -96,6 +96,8 @@
 	}
 }
 
+static struct ir3_compiler *compiler;
+
 static nir_shader *
 load_glsl(unsigned num_files, char* const* files, gl_shader_stage stage)
 {
@@ -109,9 +111,7 @@
 	if (!prog)
 		errx(1, "couldn't parse `%s'", files[0]);
 
-	nir_shader *nir = glsl_to_nir(prog, stage, ir3_get_compiler_options());
-
-	standalone_compiler_cleanup(prog);
+	nir_shader *nir = glsl_to_nir(prog, stage, ir3_get_compiler_options(compiler));
 
 	/* required NIR passes: */
 	/* TODO cmdline args for some of the conditional lowering passes? */
@@ -366,6 +366,8 @@
 
 	nir_shader *nir;
 
+	compiler = ir3_compiler_create(NULL, gpu_id);
+
 	if (s.from_tgsi) {
 		struct tgsi_token toks[65536];
 
@@ -392,7 +394,7 @@
 		return -1;
 	}
 
-	s.compiler = ir3_compiler_create(NULL, gpu_id);
+	s.compiler = compiler;
 	s.nir = ir3_optimize_nir(&s, nir, NULL);
 
 	v.key = key;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 22619e8..764aeb4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -71,6 +71,23 @@
 	/* For vertex shaders, keep track of the system values sources */
 	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
 
+	/* Compute shader inputs: */
+	struct ir3_instruction *local_invocation_id, *work_group_id;
+
+	/* For SSBOs and atomics, we need to preserve order, such
+	 * that reads don't overtake writes, and the order of writes
+	 * is preserved.  Atomics are considered as a write.
+	 *
+	 * To do this, we track last write and last access, in a
+	 * similar way to ir3_array.  But since we don't know whether
+	 * the same SSBO is bound to multiple slots, we simply
+	 * track this globally rather than per-SSBO.
+	 *
+	 * TODO should we track this per block instead?  I guess it
+	 * shouldn't matter much?
+	 */
+	struct ir3_instruction *last_write, *last_access;
+
 	/* mapping from nir_register to defining instruction: */
 	struct hash_table *def_ht;
 
@@ -81,8 +98,17 @@
 	 * duplicate instruction sequences (which our backend does not
 	 * try to clean up, since that should be done as the NIR stage)
 	 * we cache the address value generated for a given src value:
+	 *
+	 * Note that we have to cache these per alignment, since same
+	 * src used for an array of vec1 cannot be also used for an
+	 * array of vec4.
 	 */
-	struct hash_table *addr_ht;
+	struct hash_table *addr_ht[4];
+
+	/* last dst array, for indirect we need to insert a var-store.
+	 */
+	struct ir3_instruction **last_dst;
+	unsigned last_dst_n;
 
 	/* maps nir_block to ir3_block, mostly for the purposes of
 	 * figuring out the blocks successors
@@ -178,6 +204,11 @@
 		ctx->s = so->shader->nir;
 	}
 
+	/* this needs to be the last pass run, so do this here instead of
+	 * in ir3_optimize_nir():
+	 */
+	NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
+
 	if (fd_mesa_debug & FD_DBG_DISASM) {
 		DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
 			so->shader->id, so->id, so->type,
@@ -187,7 +218,7 @@
 	}
 
 	so->num_uniforms = ctx->s->num_uniforms;
-	so->num_ubos = ctx->s->info->num_ubos;
+	so->num_ubos = ctx->s->info.num_ubos;
 
 	/* Layout of constant registers, each section aligned to vec4.  Note
 	 * that pointer size (ubo, etc) changes depending on generation.
@@ -211,18 +242,24 @@
 
 	if (so->num_ubos > 0) {
 		so->constbase.ubo = constoff;
-		constoff += align(ctx->s->info->num_ubos * ptrsz, 4) / 4;
+		constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4;
 	}
 
+	unsigned num_driver_params = 0;
 	if (so->type == SHADER_VERTEX) {
-		so->constbase.driver_param = constoff;
-		constoff += align(IR3_DP_COUNT, 4) / 4;
+		num_driver_params = IR3_DP_VS_COUNT;
+	} else if (so->type == SHADER_COMPUTE) {
+		num_driver_params = IR3_DP_CS_COUNT;
+	}
 
-		if ((compiler->gpu_id < 500) &&
-				so->shader->stream_output.num_outputs > 0) {
-			so->constbase.tfbo = constoff;
-			constoff += align(PIPE_MAX_SO_BUFFERS * ptrsz, 4) / 4;
-		}
+	so->constbase.driver_param = constoff;
+	constoff += align(num_driver_params, 4) / 4;
+
+	if ((so->type == SHADER_VERTEX) &&
+			(compiler->gpu_id < 500) &&
+			so->shader->stream_output.num_outputs > 0) {
+		so->constbase.tfbo = constoff;
+		constoff += align(PIPE_MAX_SO_BUFFERS * ptrsz, 4) / 4;
 	}
 
 	so->constbase.immediate = constoff;
@@ -253,68 +290,180 @@
 }
 
 static void
-declare_var(struct ir3_compile *ctx, nir_variable *var)
+declare_array(struct ir3_compile *ctx, nir_register *reg)
 {
-	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
 	struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
 	arr->id = ++ctx->num_arrays;
-	arr->length = length;
-	arr->var = var;
+	/* NOTE: sometimes we get non array regs, for example for arrays of
+	 * length 1.  See fs-const-array-of-struct-of-array.shader_test.  So
+	 * treat a non-array as if it was an array of length 1.
+	 *
+	 * It would be nice if there was a nir pass to convert arrays of
+	 * length 1 to ssa.
+	 */
+	arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
+	compile_assert(ctx, arr->length > 0);
+	arr->r = reg;
 	list_addtail(&arr->node, &ctx->ir->array_list);
 }
 
 static struct ir3_array *
-get_var(struct ir3_compile *ctx, nir_variable *var)
+get_array(struct ir3_compile *ctx, nir_register *reg)
 {
 	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-		if (arr->var == var)
+		if (arr->r == reg)
 			return arr;
 	}
-	compile_error(ctx, "bogus var: %s\n", var->name);
+	compile_error(ctx, "bogus reg: %s\n", reg->name ? reg->name : "(null)"); /* name may be NULL */
 	return NULL;
 }
 
+/* relative (indirect) if address!=NULL */
+static struct ir3_instruction *
+create_array_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *address)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *mov;
+	struct ir3_register *src;
+
+	mov = ir3_instr_create(block, OPC_MOV);
+	mov->cat1.src_type = TYPE_U32;
+	mov->cat1.dst_type = TYPE_U32;
+	ir3_reg_create(mov, 0, 0);
+	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	src->instr = arr->last_write;
+	src->size  = arr->length;
+	src->array.id = arr->id;
+	src->array.offset = n;
+
+	if (address)
+		ir3_instr_set_address(mov, address);
+
+	arr->last_access = mov;
+
+	return mov;
+}
+
+/* relative (indirect) if address!=NULL */
+static struct ir3_instruction *
+create_array_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *src, struct ir3_instruction *address)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *mov;
+	struct ir3_register *dst;
+
+	mov = ir3_instr_create(block, OPC_MOV);
+	mov->cat1.src_type = TYPE_U32;
+	mov->cat1.dst_type = TYPE_U32;
+	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	dst->instr = arr->last_access;
+	dst->size  = arr->length;
+	dst->array.id = arr->id;
+	dst->array.offset = n;
+	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
+
+	if (address)
+		ir3_instr_set_address(mov, address);
+
+	arr->last_write = arr->last_access = mov;
+
+	return mov;
+}
+
 /* allocate a n element value array (to be populated by caller) and
  * insert in def_ht
  */
 static struct ir3_instruction **
-__get_dst(struct ir3_compile *ctx, void *key, unsigned n)
+get_dst_ssa(struct ir3_compile *ctx, nir_ssa_def *dst, unsigned n)
 {
 	struct ir3_instruction **value =
 		ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
-	_mesa_hash_table_insert(ctx->def_ht, key, value);
+	_mesa_hash_table_insert(ctx->def_ht, dst, value);
 	return value;
 }
 
 static struct ir3_instruction **
 get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
 {
-	compile_assert(ctx, dst->is_ssa);
+	struct ir3_instruction **value;
+
 	if (dst->is_ssa) {
-		return __get_dst(ctx, &dst->ssa, n);
+		value = get_dst_ssa(ctx, &dst->ssa, n);
 	} else {
-		return __get_dst(ctx, dst->reg.reg, n);
+		value = ralloc_array(ctx, struct ir3_instruction *, n);
 	}
+
+	/* NOTE: in non-ssa case, we don't really need to store last_dst
+	 * but this helps us catch cases where put_dst() call is forgotten
+	 */
+	compile_assert(ctx, !ctx->last_dst);
+	ctx->last_dst = value;
+	ctx->last_dst_n = n;
+
+	return value;
 }
 
-static struct ir3_instruction **
-get_dst_ssa(struct ir3_compile *ctx, nir_ssa_def *dst, unsigned n)
-{
-	return __get_dst(ctx, dst, n);
-}
+static struct ir3_instruction * get_addr(struct ir3_compile *ctx, struct ir3_instruction *src, int align);
 
 static struct ir3_instruction * const *
 get_src(struct ir3_compile *ctx, nir_src *src)
 {
-	struct hash_entry *entry;
-	compile_assert(ctx, src->is_ssa);
 	if (src->is_ssa) {
+		struct hash_entry *entry;
 		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
+		compile_assert(ctx, entry);
+		return entry->data;
 	} else {
-		entry = _mesa_hash_table_search(ctx->def_ht, src->reg.reg);
+		nir_register *reg = src->reg.reg;
+		struct ir3_array *arr = get_array(ctx, reg);
+		unsigned num_components = arr->r->num_components;
+		struct ir3_instruction *addr = NULL;
+		struct ir3_instruction **value =
+			ralloc_array(ctx, struct ir3_instruction *, num_components);
+
+		if (src->reg.indirect)
+			addr = get_addr(ctx, get_src(ctx, src->reg.indirect)[0],
+					reg->num_components);
+
+		for (unsigned i = 0; i < num_components; i++) {
+			unsigned n = src->reg.base_offset * reg->num_components + i;
+			compile_assert(ctx, n < arr->length);
+			value[i] = create_array_load(ctx, arr, n, addr);
+		}
+
+		return value;
 	}
-	compile_assert(ctx, entry);
-	return entry->data;
+}
+
+static void
+put_dst(struct ir3_compile *ctx, nir_dest *dst)
+{
+	if (!dst->is_ssa) {
+		nir_register *reg = dst->reg.reg;
+		struct ir3_array *arr = get_array(ctx, reg);
+		unsigned num_components = ctx->last_dst_n;
+		struct ir3_instruction *addr = NULL;
+
+		if (dst->reg.indirect)
+			addr = get_addr(ctx, get_src(ctx, dst->reg.indirect)[0],
+					reg->num_components);
+
+		for (unsigned i = 0; i < num_components; i++) {
+			unsigned n = dst->reg.base_offset * reg->num_components + i;
+			compile_assert(ctx, n < arr->length);
+			if (!ctx->last_dst[i])
+				continue;
+			create_array_store(ctx, arr, n, ctx->last_dst[i], addr);
+		}
+
+		ralloc_free(ctx->last_dst);
+	}
+	ctx->last_dst = NULL;
+	ctx->last_dst_n = 0;
 }
 
 static struct ir3_instruction *
@@ -332,7 +481,7 @@
 }
 
 static struct ir3_instruction *
-create_addr(struct ir3_block *block, struct ir3_instruction *src)
+create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
 {
 	struct ir3_instruction *instr, *immed;
 
@@ -342,12 +491,41 @@
 	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
 	instr->regs[0]->flags |= IR3_REG_HALF;
 
-	immed = create_immed(block, 2);
-	immed->regs[0]->flags |= IR3_REG_HALF;
+	switch(align){
+	case 1:
+		/* src *= 1: */
+		break;
+	case 2:
+		/* src *= 2 => src <<= 1: */
+		immed = create_immed(block, 1);
+		immed->regs[0]->flags |= IR3_REG_HALF;
 
-	instr = ir3_SHL_B(block, instr, 0, immed, 0);
-	instr->regs[0]->flags |= IR3_REG_HALF;
-	instr->regs[1]->flags |= IR3_REG_HALF;
+		instr = ir3_SHL_B(block, instr, 0, immed, 0);
+		instr->regs[0]->flags |= IR3_REG_HALF;
+		instr->regs[1]->flags |= IR3_REG_HALF;
+		break;
+	case 3:
+		/* src *= 3: */
+		immed = create_immed(block, 3);
+		immed->regs[0]->flags |= IR3_REG_HALF;
+
+		instr = ir3_MULL_U(block, instr, 0, immed, 0);
+		instr->regs[0]->flags |= IR3_REG_HALF;
+		instr->regs[1]->flags |= IR3_REG_HALF;
+		break;
+	case 4:
+		/* src *= 4 => src <<= 2: */
+		immed = create_immed(block, 2);
+		immed->regs[0]->flags |= IR3_REG_HALF;
+
+		instr = ir3_SHL_B(block, instr, 0, immed, 0);
+		instr->regs[0]->flags |= IR3_REG_HALF;
+		instr->regs[1]->flags |= IR3_REG_HALF;
+		break;
+	default:
+		unreachable("bad align");
+		return NULL;
+	}
 
 	instr = ir3_MOV(block, instr, TYPE_S16);
 	instr->regs[0]->num = regid(REG_A0, 0);
@@ -361,22 +539,25 @@
  * sequences for each use of a given NIR level src as address
  */
 static struct ir3_instruction *
-get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
+get_addr(struct ir3_compile *ctx, struct ir3_instruction *src, int align)
 {
 	struct ir3_instruction *addr;
+	unsigned idx = align - 1;
 
-	if (!ctx->addr_ht) {
-		ctx->addr_ht = _mesa_hash_table_create(ctx,
+	compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
+
+	if (!ctx->addr_ht[idx]) {
+		ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
 				_mesa_hash_pointer, _mesa_key_pointer_equal);
 	} else {
 		struct hash_entry *entry;
-		entry = _mesa_hash_table_search(ctx->addr_ht, src);
+		entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
 		if (entry)
 			return entry->data;
 	}
 
-	addr = create_addr(ctx->block, src);
-	_mesa_hash_table_insert(ctx->addr_ht, src, addr);
+	addr = create_addr(ctx->block, src, align);
+	_mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
 
 	return addr;
 }
@@ -430,7 +611,7 @@
 }
 
 static struct ir3_instruction *
-create_collect(struct ir3_block *block, struct ir3_instruction **arr,
+create_collect(struct ir3_block *block, struct ir3_instruction *const *arr,
 		unsigned arrsz)
 {
 	struct ir3_instruction *collect;
@@ -468,63 +649,8 @@
 	return mov;
 }
 
-/* relative (indirect) if address!=NULL */
 static struct ir3_instruction *
-create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
-		struct ir3_instruction *address)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *mov;
-	struct ir3_register *src;
-
-	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
-	ir3_reg_create(mov, 0, 0);
-	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
-			COND(address, IR3_REG_RELATIV));
-	src->instr = arr->last_write;
-	src->size  = arr->length;
-	src->array.id = arr->id;
-	src->array.offset = n;
-
-	if (address)
-		ir3_instr_set_address(mov, address);
-
-	arr->last_access = mov;
-
-	return mov;
-}
-
-/* relative (indirect) if address!=NULL */
-static struct ir3_instruction *
-create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
-		struct ir3_instruction *src, struct ir3_instruction *address)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *mov;
-	struct ir3_register *dst;
-
-	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
-	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
-			COND(address, IR3_REG_RELATIV));
-	dst->instr = arr->last_access;
-	dst->size  = arr->length;
-	dst->array.id = arr->id;
-	dst->array.offset = n;
-	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
-
-	ir3_instr_set_address(mov, address);
-
-	arr->last_write = arr->last_access = mov;
-
-	return mov;
-}
-
-static struct ir3_instruction *
-create_input(struct ir3_block *block, unsigned n)
+create_input_compmask(struct ir3_block *block, unsigned n, unsigned compmask)
 {
 	struct ir3_instruction *in;
 
@@ -532,10 +658,18 @@
 	in->inout.block = block;
 	ir3_reg_create(in, n, 0);
 
+	in->regs[0]->wrmask = compmask;
+
 	return in;
 }
 
 static struct ir3_instruction *
+create_input(struct ir3_block *block, unsigned n)
+{
+	return create_input_compmask(block, n, 0x1);
+}
+
+static struct ir3_instruction *
 create_frag_input(struct ir3_compile *ctx, bool use_ldlv)
 {
 	struct ir3_block *block = ctx->block;
@@ -682,8 +816,17 @@
 	const nir_op_info *info = &nir_op_infos[alu->op];
 	struct ir3_instruction **dst, *src[info->num_inputs];
 	struct ir3_block *b = ctx->block;
+	unsigned dst_sz, wrmask;
 
-	dst = get_dst(ctx, &alu->dest.dest, MAX2(info->output_size, 1));
+	if (alu->dest.dest.is_ssa) {
+		dst_sz = alu->dest.dest.ssa.num_components;
+		wrmask = (1 << dst_sz) - 1;
+	} else {
+		dst_sz = alu->dest.dest.reg.reg->num_components;
+		wrmask = alu->dest.write_mask;
+	}
+
+	dst = get_dst(ctx, &alu->dest.dest, dst_sz);
 
 	/* Vectors are special in that they have non-scalarized writemasks,
 	 * and just take the first swizzle channel for each argument in
@@ -705,9 +848,32 @@
 			dst[i] = ir3_MOV(b, src[i], TYPE_U32);
 		}
 
+		put_dst(ctx, &alu->dest.dest);
 		return;
 	}
 
+	/* We also get mov's with more than one component, so handle
+	 * those specially:
+	 */
+	if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) {
+		type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32;
+		nir_alu_src *asrc = &alu->src[0];
+		struct ir3_instruction *const *src0 = get_src(ctx, &asrc->src);
+
+		for (unsigned i = 0; i < dst_sz; i++) {
+			if (wrmask & (1 << i)) {
+				dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type);
+			} else {
+				dst[i] = NULL;
+			}
+		}
+
+		put_dst(ctx, &alu->dest.dest);
+		return;
+	}
+
+	compile_assert(ctx, alu->dest.dest.is_ssa);
+
 	/* General case: We can just grab the one used channel per src. */
 	for (int i = 0; i < info->num_inputs; i++) {
 		unsigned chan = ffs(alu->dest.write_mask) - 1;
@@ -734,12 +900,6 @@
 	case nir_op_u2f32:
 		dst[0] = ir3_COV(b, src[0], TYPE_U32, TYPE_F32);
 		break;
-	case nir_op_imov:
-		dst[0] = ir3_MOV(b, src[0], TYPE_S32);
-		break;
-	case nir_op_fmov:
-		dst[0] = ir3_MOV(b, src[0], TYPE_F32);
-		break;
 	case nir_op_f2b:
 		dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
 		dst[0]->cat2.condition = IR3_COND_NE;
@@ -985,6 +1145,8 @@
 				nir_op_infos[alu->op].name);
 		break;
 	}
+
+	put_dst(ctx, &alu->dest.dest);
 }
 
 /* handles direct/indirect UBO reads: */
@@ -1008,8 +1170,8 @@
 		base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz));
 		base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
 	} else {
-		base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
-		base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0));
+		base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0, 4));
+		base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0, 4));
 	}
 
 	/* note: on 32bit gpu's base_hi is ignored and DCE'd */
@@ -1059,84 +1221,169 @@
 	}
 }
 
-/* handles array reads: */
 static void
-emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
+mark_ssbo_read(struct ir3_compile *ctx, struct ir3_instruction *instr)
+{
+	instr->regs[0]->instr = ctx->last_write;
+	instr->regs[0]->flags |= IR3_REG_SSA;
+	ctx->last_access = instr;
+}
+
+static void
+mark_ssbo_write(struct ir3_compile *ctx, struct ir3_instruction *instr)
+{
+	instr->regs[0]->instr = ctx->last_access;
+	instr->regs[0]->flags |= IR3_REG_SSA;
+	ctx->last_write = ctx->last_access = instr;
+}
+
+static void
+emit_intrinsic_load_ssbo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		struct ir3_instruction **dst)
 {
-	nir_deref_var *dvar = intr->variables[0];
-	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array *arr = get_var(ctx, dvar->var);
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *ldgb, *src0, *src1, *offset;
+	nir_const_value *const_offset;
 
-	compile_assert(ctx, dvar->deref.child &&
-		(dvar->deref.child->deref_type == nir_deref_type_array));
+	/* can this be non-const buffer_index?  how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[0]);
+	compile_assert(ctx, const_offset);
 
-	switch (darr->deref_array_type) {
-	case nir_deref_array_type_direct:
-		/* direct access does not require anything special: */
-		for (int i = 0; i < intr->num_components; i++) {
-			unsigned n = darr->base_offset * 4 + i;
-			compile_assert(ctx, n < arr->length);
-			dst[i] = create_var_load(ctx, arr, n, NULL);
-		}
-		break;
-	case nir_deref_array_type_indirect: {
-		/* for indirect, we need to collect all the array elements: */
-		struct ir3_instruction *addr =
-				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
-		for (int i = 0; i < intr->num_components; i++) {
-			unsigned n = darr->base_offset * 4 + i;
-			compile_assert(ctx, n < arr->length);
-			dst[i] = create_var_load(ctx, arr, n, addr);
-		}
-		break;
-	}
-	default:
-		compile_error(ctx, "Unhandled load deref type: %u\n",
-				darr->deref_array_type);
-		break;
-	}
+	offset = get_src(ctx, &intr->src[1])[0];
+
+	/* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
+	src0 = create_collect(b, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
+	src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+
+	ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0,
+			src0, 0, src1, 0);
+	ldgb->regs[0]->wrmask = (1 << intr->num_components) - 1;
+	ldgb->cat6.iim_val = intr->num_components;
+	ldgb->cat6.type = TYPE_U32;
+	mark_ssbo_read(ctx, ldgb);
+
+	split_dest(b, dst, ldgb, 0, intr->num_components);
 }
 
-/* handles array writes: */
+/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
 static void
-emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic_store_ssbo(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
-	nir_deref_var *dvar = intr->variables[0];
-	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array *arr = get_var(ctx, dvar->var);
-	struct ir3_instruction *addr;
-	struct ir3_instruction * const *src;
-	unsigned wrmask = nir_intrinsic_write_mask(intr);
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *stgb, *src0, *src1, *src2, *offset;
+	nir_const_value *const_offset;
+	unsigned ncomp = ffs(~intr->const_index[0]) - 1;
 
-	compile_assert(ctx, dvar->deref.child &&
-		(dvar->deref.child->deref_type == nir_deref_type_array));
+	/* can this be non-const buffer_index?  how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[1]);
+	compile_assert(ctx, const_offset);
 
-	src = get_src(ctx, &intr->src[0]);
+	offset = get_src(ctx, &intr->src[2])[0];
 
-	switch (darr->deref_array_type) {
-	case nir_deref_array_type_direct:
-		addr = NULL;
-		break;
-	case nir_deref_array_type_indirect:
-		addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
-		break;
-	default:
-		compile_error(ctx, "Unhandled store deref type: %u\n",
-				darr->deref_array_type);
-		return;
-	}
+	/* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
+	 * nir already *= 4:
+	 */
+	src0 = create_collect(b, get_src(ctx, &intr->src[0]), ncomp);
+	src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+	src2 = create_collect(b, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
 
-	for (int i = 0; i < intr->num_components; i++) {
-		if (!(wrmask & (1 << i)))
-			continue;
-		unsigned n = darr->base_offset * 4 + i;
-		compile_assert(ctx, n < arr->length);
-		create_var_store(ctx, arr, n, src[i], addr);
-	}
+	stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0,
+			src0, 0, src1, 0, src2, 0);
+	stgb->cat6.iim_val = ncomp;
+	stgb->cat6.type = TYPE_U32;
+	mark_ssbo_write(ctx, stgb);
+
+	array_insert(b, b->keeps, stgb);
 }
 
-static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
+static struct ir3_instruction *
+emit_intrinsic_atomic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset;
+	nir_const_value *const_offset;
+	type_t type = TYPE_U32;
+
+	/* can this be non-const buffer_index?  how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[0]);
+	compile_assert(ctx, const_offset);
+	ssbo = create_immed(b, const_offset->u32[0]);
+
+	offset = get_src(ctx, &intr->src[1])[0];
+
+	/* src0 is data (or uvec2(data, compare))
+	 * src1 is offset
+	 * src2 is uvec2(offset*4, 0)
+	 *
+	 * Note that nir already multiplies the offset by four
+	 */
+	src0 = get_src(ctx, &intr->src[2])[0];
+	src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+	src2 = create_collect(b, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_ssbo_atomic_add:
+		atomic = ir3_ATOMIC_ADD(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_imin:
+		atomic = ir3_ATOMIC_MIN(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		type = TYPE_S32;
+		break;
+	case nir_intrinsic_ssbo_atomic_umin:
+		atomic = ir3_ATOMIC_MIN(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_imax:
+		atomic = ir3_ATOMIC_MAX(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		type = TYPE_S32;
+		break;
+	case nir_intrinsic_ssbo_atomic_umax:
+		atomic = ir3_ATOMIC_MAX(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_and:
+		atomic = ir3_ATOMIC_AND(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_or:
+		atomic = ir3_ATOMIC_OR(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_xor:
+		atomic = ir3_ATOMIC_XOR(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_exchange:
+		atomic = ir3_ATOMIC_XCHG(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_comp_swap:
+		/* for cmpxchg, src0 is [ui]vec2(data, compare): */
+		src0 = create_collect(b, (struct ir3_instruction*[]){
+			src0,
+			get_src(ctx, &intr->src[3])[0],
+		}, 2);
+		atomic = ir3_ATOMIC_CMPXCHG(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	default:
+		unreachable("boo");
+	}
+
+	atomic->cat6.iim_val = 1;
+	atomic->cat6.type = type;
+	mark_ssbo_write(ctx, atomic);
+
+	/* even if nothing consumes the result, we can't DCE the instruction: */
+	array_insert(b, b->keeps, atomic);
+
+	return atomic;
+}
+
+static void add_sysval_input_compmask(struct ir3_compile *ctx,
+		gl_system_value slot, unsigned compmask,
 		struct ir3_instruction *instr)
 {
 	struct ir3_shader_variant *so = ctx->so;
@@ -1145,7 +1392,7 @@
 
 	so->inputs[n].sysval = true;
 	so->inputs[n].slot = slot;
-	so->inputs[n].compmask = 1;
+	so->inputs[n].compmask = compmask;
 	so->inputs[n].regid = r;
 	so->inputs[n].interpolate = INTERP_MODE_FLAT;
 	so->total_in++;
@@ -1154,6 +1401,12 @@
 	ctx->ir->inputs[r] = instr;
 }
 
+static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
+		struct ir3_instruction *instr)
+{
+	add_sysval_input_compmask(ctx, slot, 0x1, instr);
+}
+
 static void
 emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
@@ -1185,7 +1438,7 @@
 			for (int i = 0; i < intr->num_components; i++) {
 				int n = idx * 4 + i;
 				dst[i] = create_uniform_indirect(ctx, n,
-						get_addr(ctx, src[0]));
+						get_addr(ctx, src[0], 4));
 			}
 			/* NOTE: if relative addressing is used, we set
 			 * constlen in the compiler (to worst-case value)
@@ -1211,7 +1464,7 @@
 			src = get_src(ctx, &intr->src[0]);
 			struct ir3_instruction *collect =
 					create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
-			struct ir3_instruction *addr = get_addr(ctx, src[0]);
+			struct ir3_instruction *addr = get_addr(ctx, src[0], 4);
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i;
 				dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
@@ -1219,11 +1472,28 @@
 			}
 		}
 		break;
-	case nir_intrinsic_load_var:
-		emit_intrinsic_load_var(ctx, intr, dst);
+	case nir_intrinsic_load_ssbo:
+		emit_intrinsic_load_ssbo(ctx, intr, dst);
 		break;
-	case nir_intrinsic_store_var:
-		emit_intrinsic_store_var(ctx, intr);
+	case nir_intrinsic_store_ssbo:
+		emit_intrinsic_store_ssbo(ctx, intr);
+		break;
+	case nir_intrinsic_ssbo_atomic_add:
+	case nir_intrinsic_ssbo_atomic_imin:
+	case nir_intrinsic_ssbo_atomic_umin:
+	case nir_intrinsic_ssbo_atomic_imax:
+	case nir_intrinsic_ssbo_atomic_umax:
+	case nir_intrinsic_ssbo_atomic_and:
+	case nir_intrinsic_ssbo_atomic_or:
+	case nir_intrinsic_ssbo_atomic_xor:
+	case nir_intrinsic_ssbo_atomic_exchange:
+	case nir_intrinsic_ssbo_atomic_comp_swap:
+		if (info->has_dest) {
+			compile_assert(ctx, intr->num_components == 1);
+			dst[0] = emit_intrinsic_atomic(ctx, intr);
+		} else {
+			emit_intrinsic_atomic(ctx, intr);
+		}
 		break;
 	case nir_intrinsic_store_output:
 		idx = nir_intrinsic_base(intr);
@@ -1276,14 +1546,33 @@
 			ctx->frag_face = create_input(b, 0);
 			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
 		}
-		/* for fragface, we always get -1 or 0, but that is inverse
-		 * of what nir expects (where ~0 is true).  Unfortunately
-		 * trying to widen from half to full in add.s seems to do a
-		 * non-sign-extending widen (resulting in something that
-		 * gets interpreted as float Inf??)
+		/* for fragface, we get -1 for back and 0 for front. However this is
+		 * the inverse of what nir expects (where ~0 is true).
 		 */
 		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
-		dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0);
+		dst[0] = ir3_NOT_B(b, dst[0], 0);
+		break;
+	case nir_intrinsic_load_local_invocation_id:
+		if (!ctx->local_invocation_id) {
+			ctx->local_invocation_id = create_input_compmask(b, 0, 0x7);
+			add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID,
+					0x7, ctx->local_invocation_id);
+		}
+		split_dest(b, dst, ctx->local_invocation_id, 0, 3);
+		break;
+	case nir_intrinsic_load_work_group_id:
+		if (!ctx->work_group_id) {
+			ctx->work_group_id = create_input_compmask(b, 0, 0x7);
+			add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID,
+					0x7, ctx->work_group_id);
+			ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH;
+		}
+		split_dest(b, dst, ctx->work_group_id, 0, 3);
+		break;
+	case nir_intrinsic_load_num_work_groups:
+		for (int i = 0; i < intr->num_components; i++) {
+			dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
+		}
 		break;
 	case nir_intrinsic_discard_if:
 	case nir_intrinsic_discard: {
@@ -1318,6 +1607,9 @@
 				nir_intrinsic_infos[intr->intrinsic].name);
 		break;
 	}
+
+	if (info->has_dest)
+		put_dst(ctx, &intr->dest);
 }
 
 static void
@@ -1602,6 +1894,8 @@
 							   factor, 0);
 		}
 	}
+
+	put_dst(ctx, &tex->dest);
 }
 
 static void
@@ -1625,6 +1919,8 @@
 	 */
 	if (ctx->levels_add_one)
 		dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
+
+	put_dst(ctx, &tex->dest);
 }
 
 static void
@@ -1667,6 +1963,8 @@
 			dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
 		}
 	}
+
+	put_dst(ctx, &tex->dest);
 }
 
 static void
@@ -1685,6 +1983,8 @@
 	phi->phi.nphi = nphi;
 
 	dst[0] = phi;
+
+	put_dst(ctx, &nphi->dest);
 }
 
 /* phi instructions are left partially constructed.  We don't resolve
@@ -1820,8 +2120,10 @@
 	list_addtail(&block->node, &ctx->ir->block_list);
 
 	/* re-emit addr register in each block if needed: */
-	_mesa_hash_table_destroy(ctx->addr_ht, NULL);
-	ctx->addr_ht = NULL;
+	for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) {
+		_mesa_hash_table_destroy(ctx->addr_ht[i], NULL);
+		ctx->addr_ht[i] = NULL;
+	}
 
 	nir_foreach_instr(instr, nblock) {
 		emit_instr(ctx, instr);
@@ -2190,6 +2492,11 @@
 	return drvloc;
 }
 
+static const unsigned max_sysvals[SHADER_MAX] = {
+	[SHADER_VERTEX]  = 16,
+	[SHADER_COMPUTE] = 16, // TODO how many do we actually need?
+};
+
 static void
 emit_instructions(struct ir3_compile *ctx)
 {
@@ -2199,11 +2506,9 @@
 	ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
 	noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
 
-	/* or vtx shaders, we need to leave room for sysvals:
+	/* we need to leave room for sysvals:
 	 */
-	if (ctx->so->type == SHADER_VERTEX) {
-		ninputs += 16;
-	}
+	ninputs += max_sysvals[ctx->so->type];
 
 	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
 
@@ -2212,9 +2517,7 @@
 	ctx->in_block = ctx->block;
 	list_addtail(&ctx->block->node, &ctx->ir->block_list);
 
-	if (ctx->so->type == SHADER_VERTEX) {
-		ctx->ir->ninputs -= 16;
-	}
+	ninputs -= max_sysvals[ctx->so->type];
 
 	/* for fragment shader, we have a single input register (usually
 	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
@@ -2239,17 +2542,15 @@
 		setup_output(ctx, var);
 	}
 
-	/* Setup global variables (which should only be arrays): */
-	nir_foreach_variable(var, &ctx->s->globals) {
-		declare_var(ctx, var);
+	/* Setup registers (which should only be arrays): */
+	nir_foreach_register(reg, &ctx->s->registers) {
+		declare_array(ctx, reg);
 	}
 
-	/* Setup local variables (which should only be arrays): */
 	/* NOTE: need to do something more clever when we support >1 fxn */
-	nir_foreach_variable(var, &fxn->locals) {
-		declare_var(ctx, var);
+	nir_foreach_register(reg, &fxn->registers) {
+		declare_array(ctx, reg);
 	}
-
 	/* And emit the body: */
 	ctx->impl = fxn;
 	emit_function(ctx, fxn);
@@ -2541,7 +2842,7 @@
 	/* We need to do legalize after (for frag shader's) the "bary.f"
 	 * offsets (inloc) have been assigned.
 	 */
-	ir3_legalize(ir, &so->has_samp, &max_bary);
+	ir3_legalize(ir, &so->has_samp, &so->has_ssbo, &max_bary);
 
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("AFTER LEGALIZE:\n");
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 7bb858d..8c907eb 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -193,6 +193,12 @@
 			 */
 			if (is_store(instr) && (n == 1))
 				return false;
+
+			/* disallow CP into anything but the SSBO slot argument for
+			 * atomics:
+			 */
+			if (is_atomic(instr->opc) && (n != 0))
+				return false;
 		}
 
 		break;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index 6acea01..fffa765 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -43,6 +43,7 @@
 
 struct ir3_legalize_ctx {
 	bool has_samp;
+	bool has_ssbo;
 	int max_bary;
 };
 
@@ -192,6 +193,9 @@
 				regmask_set(&needs_sy, n->regs[0]);
 		}
 
+		if ((n->opc == OPC_LDGB) || (n->opc == OPC_STGB) || is_atomic(n->opc))
+			ctx->has_ssbo = true;
+
 		/* both tex/sfu appear to not always immediately consume
 		 * their src register(s):
 		 */
@@ -388,7 +392,7 @@
 }
 
 void
-ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary)
+ir3_legalize(struct ir3 *ir, bool *has_samp, bool *has_ssbo, int *max_bary)
 {
 	struct ir3_legalize_ctx ctx = {
 			.max_bary = -1,
@@ -399,6 +403,7 @@
 	}
 
 	*has_samp = ctx.has_samp;
+	*has_ssbo = ctx.has_ssbo;
 	*max_bary = ctx.max_bary;
 
 	do {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
index 336fa95..d30543d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -52,6 +52,23 @@
 		.lower_extract_word = true,
 };
 
+static const nir_shader_compiler_options options_5xx = {
+		.lower_fpow = true,
+		.lower_fsat = true,
+		.lower_scmp = true,
+		.lower_flrp32 = true,
+		.lower_flrp64 = true,
+		.lower_ffract = true,
+		.lower_fmod32 = true,
+		.lower_fmod64 = true,
+		.lower_fdiv = true,
+		.fuse_ffma = true,
+		.native_integers = true,
+		.vertex_id_zero_based = false,
+		.lower_extract_byte = true,
+		.lower_extract_word = true,
+};
+
 struct nir_shader *
 ir3_tgsi_to_nir(const struct tgsi_token *tokens)
 {
@@ -59,8 +76,10 @@
 }
 
 const nir_shader_compiler_options *
-ir3_get_compiler_options(void)
+ir3_get_compiler_options(struct ir3_compiler *compiler)
 {
+	if (compiler->gpu_id >= 500)
+		return &options_5xx;
 	return &options;
 }
 
@@ -90,6 +109,7 @@
 		progress = false;
 
 		OPT_V(s, nir_lower_vars_to_ssa);
+		progress |= OPT(s, nir_opt_copy_prop_vars);
 		progress |= OPT(s, nir_lower_alu_to_scalar);
 		progress |= OPT(s, nir_lower_phis_to_scalar);
 
@@ -114,7 +134,6 @@
 	if (key) {
 		switch (shader->type) {
 		case SHADER_FRAGMENT:
-		case SHADER_COMPUTE:
 			tex_options.saturate_s = key->fsaturate_s;
 			tex_options.saturate_t = key->fsaturate_t;
 			tex_options.saturate_r = key->fsaturate_r;
@@ -124,6 +143,9 @@
 			tex_options.saturate_t = key->vsaturate_t;
 			tex_options.saturate_r = key->vsaturate_r;
 			break;
+		default:
+			/* TODO */
+			break;
 		}
 	}
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
index e1e9523..2e2e093 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
@@ -38,7 +38,7 @@
 bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
 
 struct nir_shader * ir3_tgsi_to_nir(const struct tgsi_token *tokens);
-const nir_shader_compiler_options * ir3_get_compiler_options(void);
+const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
 bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
 struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
 		const struct ir3_shader_key *key);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index f70c779..26c1508 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -95,60 +95,47 @@
 	1, 2, 3, 4,
 };
 #define half_class_count  ARRAY_SIZE(half_class_sizes)
-#define total_class_count (class_count + half_class_count)
+
+/* seems to just be used for compute shaders?  Seems like vec1 and vec3
+ * are sufficient (for now?)
+ */
+static const unsigned high_class_sizes[] = {
+	1, 3,
+};
+#define high_class_count ARRAY_SIZE(high_class_sizes)
+
+#define total_class_count (class_count + half_class_count + high_class_count)
 
 /* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
-#define NUM_REGS             (4 * 48)
+#define NUM_REGS             (4 * 48)  /* r0 to r47 */
+#define NUM_HIGH_REGS        (4 * 8)   /* r48 to r55 */
+#define FIRST_HIGH_REG       (4 * 48)
 /* Number of virtual regs in a given class: */
 #define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
 #define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
+#define HIGH_CLASS_REGS(i)   (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
+
+#define HALF_OFFSET          (class_count)
+#define HIGH_OFFSET          (class_count + half_class_count)
 
 /* register-set, created one time, used for all shaders: */
 struct ir3_ra_reg_set {
 	struct ra_regs *regs;
 	unsigned int classes[class_count];
 	unsigned int half_classes[half_class_count];
+	unsigned int high_classes[high_class_count];
 	/* maps flat virtual register space to base gpr: */
 	uint16_t *ra_reg_to_gpr;
 	/* maps cls,gpr to flat virtual register space: */
 	uint16_t **gpr_to_ra_reg;
 };
 
-/* One-time setup of RA register-set, which describes all the possible
- * "virtual" registers and their interferences.  Ie. double register
- * occupies (and conflicts with) two single registers, and so forth.
- * Since registers do not need to be aligned to their class size, they
- * can conflict with other registers in the same class too.  Ie:
- *
- *    Single (base) |  Double
- *    --------------+---------------
- *       R0         |  D0
- *       R1         |  D0 D1
- *       R2         |     D1 D2
- *       R3         |        D2
- *           .. and so on..
- *
- * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
- * really just four scalar registers.  Don't let that confuse you.)
- */
-struct ir3_ra_reg_set *
-ir3_ra_alloc_reg_set(void *memctx)
+static void
+build_q_values(unsigned int **q_values, unsigned off,
+		const unsigned *sizes, unsigned count)
 {
-	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
-	unsigned ra_reg_count, reg, first_half_reg;
-	unsigned int **q_values;
-
-	/* calculate # of regs across all classes: */
-	ra_reg_count = 0;
-	for (unsigned i = 0; i < class_count; i++)
-		ra_reg_count += CLASS_REGS(i);
-	for (unsigned i = 0; i < half_class_count; i++)
-		ra_reg_count += HALF_CLASS_REGS(i);
-
-	/* allocate and populate q_values: */
-	q_values = ralloc_array(set, unsigned *, total_class_count);
-	for (unsigned i = 0; i < class_count; i++) {
-		q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);
+	for (unsigned i = 0; i < count; i++) {
+		q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);
 
 		/* From register_allocate.c:
 		 *
@@ -175,19 +162,50 @@
 		 *
 		 * (Idea copied from brw_fs_reg_allocate.cpp)
 		 */
-		for (unsigned j = 0; j < class_count; j++)
-			q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
+		for (unsigned j = 0; j < count; j++)
+			q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
 	}
+}
 
-	for (unsigned i = class_count; i < total_class_count; i++) {
-		q_values[i] = ralloc_array(q_values, unsigned, total_class_count);
+/* One-time setup of RA register-set, which describes all the possible
+ * "virtual" registers and their interferences.  Ie. double register
+ * occupies (and conflicts with) two single registers, and so forth.
+ * Since registers do not need to be aligned to their class size, they
+ * can conflict with other registers in the same class too.  Ie:
+ *
+ *    Single (base) |  Double
+ *    --------------+---------------
+ *       R0         |  D0
+ *       R1         |  D0 D1
+ *       R2         |     D1 D2
+ *       R3         |        D2
+ *           .. and so on..
+ *
+ * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
+ * really just four scalar registers.  Don't let that confuse you.)
+ */
+struct ir3_ra_reg_set *
+ir3_ra_alloc_reg_set(void *memctx)
+{
+	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
+	unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
+	unsigned int **q_values;
 
-		/* see comment above: */
-		for (unsigned j = class_count; j < total_class_count; j++) {
-			q_values[i][j] = half_class_sizes[i - class_count] +
-					half_class_sizes[j - class_count] - 1;
-		}
-	}
+	/* calculate # of regs across all classes: */
+	ra_reg_count = 0;
+	for (unsigned i = 0; i < class_count; i++)
+		ra_reg_count += CLASS_REGS(i);
+	for (unsigned i = 0; i < half_class_count; i++)
+		ra_reg_count += HALF_CLASS_REGS(i);
+	for (unsigned i = 0; i < high_class_count; i++)
+		ra_reg_count += HIGH_CLASS_REGS(i);
+
+	/* allocate and populate q_values: */
+	q_values = ralloc_array(set, unsigned *, total_class_count);
+
+	build_q_values(q_values, 0, class_sizes, class_count);
+	build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
+	build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
 
 	/* allocate the reg-set.. */
 	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
@@ -215,18 +233,19 @@
 	}
 
 	first_half_reg = reg;
+	base = HALF_OFFSET;
 
 	for (unsigned i = 0; i < half_class_count; i++) {
 		set->half_classes[i] = ra_alloc_reg_class(set->regs);
 
-		set->gpr_to_ra_reg[class_count + i] =
-				ralloc_array(set, uint16_t, CLASS_REGS(i));
+		set->gpr_to_ra_reg[base + i] =
+				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
 
 		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
 			ra_class_add_reg(set->regs, set->half_classes[i], reg);
 
 			set->ra_reg_to_gpr[reg] = j;
-			set->gpr_to_ra_reg[class_count + i][j] = reg;
+			set->gpr_to_ra_reg[base + i][j] = reg;
 
 			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
 				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
@@ -235,6 +254,29 @@
 		}
 	}
 
+	first_high_reg = reg;
+	base = HIGH_OFFSET;
+
+	for (unsigned i = 0; i < high_class_count; i++) {
+		set->high_classes[i] = ra_alloc_reg_class(set->regs);
+
+		set->gpr_to_ra_reg[base + i] =
+				ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
+
+		for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
+			ra_class_add_reg(set->regs, set->high_classes[i], reg);
+
+			set->ra_reg_to_gpr[reg] = j;
+			set->gpr_to_ra_reg[base + i][j] = reg;
+
+			for (unsigned br = j; br < j + high_class_sizes[i]; br++)
+				ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
+
+			reg++;
+		}
+	}
+
+
 	ra_set_finalize(set->regs, q_values);
 
 	ralloc_free(q_values);
@@ -287,13 +329,23 @@
 	return !!(instr->regs[0]->flags & IR3_REG_HALF);
 }
 
-static int
-size_to_class(unsigned sz, bool half)
+static bool
+is_high(struct ir3_instruction *instr)
 {
-	if (half) {
+	return !!(instr->regs[0]->flags & IR3_REG_HIGH);
+}
+
+static int
+size_to_class(unsigned sz, bool half, bool high)
+{
+	if (high) {
+		for (unsigned i = 0; i < high_class_count; i++)
+			if (high_class_sizes[i] >= sz)
+				return i + HIGH_OFFSET;
+	} else if (half) {
 		for (unsigned i = 0; i < half_class_count; i++)
 			if (half_class_sizes[i] >= sz)
-				return i + class_count;
+				return i + HALF_OFFSET;
 	} else {
 		for (unsigned i = 0; i < class_count; i++)
 			if (class_sizes[i] >= sz)
@@ -497,7 +549,7 @@
 			id->defn = instr;
 		} else {
 			id->defn = get_definer(ctx, instr, &id->sz, &id->off);
-			id->cls = size_to_class(id->sz, is_half(id->defn));
+			id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn));
 		}
 	}
 }
@@ -710,9 +762,12 @@
 
 				def(name, id->defn);
 
-				if (is_half(id->defn)) {
+				if (is_high(id->defn)) {
 					ra_set_node_class(ctx->g, name,
-							ctx->set->half_classes[id->cls - class_count]);
+							ctx->set->high_classes[id->cls - HIGH_OFFSET]);
+				} else if (is_half(id->defn)) {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->half_classes[id->cls - HALF_OFFSET]);
 				} else {
 					ra_set_node_class(ctx->g, name,
 							ctx->set->classes[id->cls]);
@@ -981,6 +1036,9 @@
 
 		debug_assert(!(reg->flags & IR3_REG_RELATIV));
 
+		if (is_high(id->defn))
+			num += FIRST_HIGH_REG;
+
 		reg->num = num;
 		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
 
@@ -1029,7 +1087,7 @@
 		unsigned i = 0, j;
 		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
 			struct ir3_instruction *instr = ir->inputs[i];
-			int cls = size_to_class(1, true);
+			int cls = size_to_class(1, true, false);
 			unsigned name = __ra_name(ctx, cls, instr);
 			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index ffe1b04..a176f16 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -223,7 +223,6 @@
 	 */
 	switch (shader->type) {
 	case SHADER_FRAGMENT:
-	case SHADER_COMPUTE:
 		key.binning_pass = false;
 		if (key.has_per_samp) {
 			key.vsaturate_s = 0;
@@ -243,6 +242,9 @@
 			key.fastc_srgb = 0;
 		}
 		break;
+	default:
+		/* TODO */
+		break;
 	}
 
 	for (v = shader->variants; v; v = v->next)
@@ -289,6 +291,7 @@
 		/* we take ownership of the reference: */
 		nir = cso->ir.nir;
 	} else {
+		debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
 		if (fd_mesa_debug & FD_DBG_DISASM) {
 			DBG("dump tgsi: type=%d", shader->type);
 			tgsi_dump(cso->tokens, 0);
@@ -315,6 +318,43 @@
 	return shader;
 }
 
+/* Separate constructor for compute: annoyingly, pipe_compute_state
+ * (ir_type/prog) is not aligned with pipe_shader_state (type/tokens/ir).
+ */
+struct ir3_shader *
+ir3_shader_create_compute(struct ir3_compiler *compiler,
+		const struct pipe_compute_state *cso,
+		struct pipe_debug_callback *debug)
+{
+	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+
+	shader->compiler = compiler;
+	shader->id = ++shader->compiler->shader_count;
+	shader->type = SHADER_COMPUTE;
+
+	nir_shader *nir;
+	if (cso->ir_type == PIPE_SHADER_IR_NIR) {
+		/* we take ownership of the reference: */
+		nir = (nir_shader *)cso->prog;
+	} else {
+		debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
+		if (fd_mesa_debug & FD_DBG_DISASM) {
+			DBG("dump tgsi: type=%d", shader->type);
+			tgsi_dump(cso->prog, 0);
+		}
+		nir = ir3_tgsi_to_nir(cso->prog);
+	}
+
+	/* do first pass optimization, ignoring the key: */
+	shader->nir = ir3_optimize_nir(shader, nir, NULL);
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%d: type=%d", shader->id, shader->type);
+		nir_print_shader(shader->nir, stdout);
+	}
+
+	return shader;
+}
+
 static void dump_reg(const char *name, uint32_t r)
 {
 	if (r != regid(63,0))
@@ -418,7 +458,8 @@
 		}
 		debug_printf("\n");
 		break;
-	case SHADER_COMPUTE:
+	default:
+		/* TODO */
 		break;
 	}
 
@@ -462,7 +503,8 @@
 		if (so->frag_face)
 			debug_printf("; fragface: hr0.x\n");
 		break;
-	case SHADER_COMPUTE:
+	default:
+		/* TODO */
 		break;
 	}
 
@@ -472,7 +514,7 @@
 uint64_t
 ir3_shader_outputs(const struct ir3_shader *so)
 {
-	return so->nir->info->outputs_written;
+	return so->nir->info.outputs_written;
 }
 
 /* This has to reach into the fd_context a bit more than the rest of
@@ -655,23 +697,19 @@
 }
 
 void
-ir3_emit_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
-		struct fd_context *ctx, const struct pipe_draw_info *info, uint32_t dirty)
+ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_context *ctx, const struct pipe_draw_info *info)
 {
-	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) {
+	enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_VERTEX];
+
+	debug_assert(v->type == SHADER_VERTEX);
+
+	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
 		struct fd_constbuf_stateobj *constbuf;
 		bool shader_dirty;
 
-		if (v->type == SHADER_VERTEX) {
-			constbuf = &ctx->constbuf[PIPE_SHADER_VERTEX];
-			shader_dirty = !!(dirty & FD_SHADER_DIRTY_VP);
-		} else if (v->type == SHADER_FRAGMENT) {
-			constbuf = &ctx->constbuf[PIPE_SHADER_FRAGMENT];
-			shader_dirty = !!(dirty & FD_SHADER_DIRTY_FP);
-		} else {
-			unreachable("bad shader type");
-			return;
-		}
+		constbuf = &ctx->constbuf[PIPE_SHADER_VERTEX];
+		shader_dirty = !!(dirty & FD_DIRTY_SHADER_PROG);
 
 		emit_user_consts(ctx, v, ring, constbuf);
 		emit_ubos(ctx, v, ring, constbuf);
@@ -681,11 +719,11 @@
 
 	/* emit driver params every time: */
 	/* TODO skip emit if shader doesn't use driver params to avoid WFI.. */
-	if (info && (v->type == SHADER_VERTEX)) {
+	if (info) {
 		uint32_t offset = v->constbase.driver_param;
 		if (v->constlen > offset) {
-			uint32_t vertex_params[IR3_DP_COUNT] = {
-				[IR3_DP_VTXID_BASE] = info->indexed ?
+			uint32_t vertex_params[IR3_DP_VS_COUNT] = {
+				[IR3_DP_VTXID_BASE] = info->index_size ?
 						info->index_bias : info->start,
 				[IR3_DP_VTXCNT_MAX] = max_tf_vtx(ctx, v),
 			};
@@ -717,3 +755,61 @@
 		}
 	}
 }
+
+void
+ir3_emit_fs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_context *ctx)
+{
+	enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_FRAGMENT];
+
+	debug_assert(v->type == SHADER_FRAGMENT);
+
+	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
+		struct fd_constbuf_stateobj *constbuf;
+		bool shader_dirty;
+
+		constbuf = &ctx->constbuf[PIPE_SHADER_FRAGMENT];
+		shader_dirty = !!(dirty & FD_DIRTY_SHADER_PROG);
+
+		emit_user_consts(ctx, v, ring, constbuf);
+		emit_ubos(ctx, v, ring, constbuf);
+		if (shader_dirty)
+			emit_immediates(ctx, v, ring);
+	}
+}
+
+/* emit compute-shader consts: */
+void
+ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_context *ctx, const struct pipe_grid_info *info)
+{
+	enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE];
+
+	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
+		struct fd_constbuf_stateobj *constbuf;
+		bool shader_dirty;
+
+		constbuf = &ctx->constbuf[PIPE_SHADER_COMPUTE];
+		shader_dirty = !!(dirty & FD_DIRTY_SHADER_PROG);
+
+		emit_user_consts(ctx, v, ring, constbuf);
+		emit_ubos(ctx, v, ring, constbuf);
+		if (shader_dirty)
+			emit_immediates(ctx, v, ring);
+	}
+
+	/* emit compute-shader driver-params: */
+	uint32_t offset = v->constbase.driver_param;
+	if (v->constlen > offset) {
+		uint32_t compute_params[IR3_DP_CS_COUNT] = {
+			[IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
+			[IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
+			[IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
+			/* do we need work-group-size? */
+		};
+
+		fd_wfi(ctx->batch, ring);
+		ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0,
+				ARRAY_SIZE(compute_params), compute_params, NULL);
+	}
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 052a563..6c2af6d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -38,13 +38,20 @@
 
 /* driver param indices: */
 enum ir3_driver_param {
+	/* compute shader driver params: */
+	IR3_DP_NUM_WORK_GROUPS_X = 0,
+	IR3_DP_NUM_WORK_GROUPS_Y = 1,
+	IR3_DP_NUM_WORK_GROUPS_Z = 2,
+	IR3_DP_CS_COUNT   = 4,   /* must be aligned to vec4 */
+
+	/* vertex shader driver params: */
 	IR3_DP_VTXID_BASE = 0,
 	IR3_DP_VTXCNT_MAX = 1,
 	/* user-clip-plane components, up to 8x vec4's: */
 	IR3_DP_UCP0_X     = 4,
 	/* .... */
 	IR3_DP_UCP7_W     = 35,
-	IR3_DP_COUNT      = 36   /* must be aligned to vec4 */
+	IR3_DP_VS_COUNT   = 36   /* must be aligned to vec4 */
 };
 
 /* Configuration key used to identify a shader variant.. different
@@ -105,6 +112,57 @@
 	return a->global == b->global;
 }
 
+/* will the two keys produce different lowering for a fragment shader? */
+static inline bool
+ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+{
+	if (last_key->has_per_samp || key->has_per_samp) {
+		if ((last_key->fsaturate_s != key->fsaturate_s) ||
+				(last_key->fsaturate_t != key->fsaturate_t) ||
+				(last_key->fsaturate_r != key->fsaturate_r) ||
+				(last_key->fastc_srgb != key->fastc_srgb))
+			return true;
+	}
+
+	if (last_key->fclamp_color != key->fclamp_color)
+		return true;
+
+	if (last_key->color_two_side != key->color_two_side)
+		return true;
+
+	if (last_key->half_precision != key->half_precision)
+		return true;
+
+	if (last_key->rasterflat != key->rasterflat)
+		return true;
+
+	if (last_key->ucp_enables != key->ucp_enables)
+		return true;
+
+	return false;
+}
+
+/* will the two keys produce different lowering for a vertex shader? */
+static inline bool
+ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+{
+	if (last_key->has_per_samp || key->has_per_samp) {
+		if ((last_key->vsaturate_s != key->vsaturate_s) ||
+				(last_key->vsaturate_t != key->vsaturate_t) ||
+				(last_key->vsaturate_r != key->vsaturate_r) ||
+				(last_key->vastc_srgb != key->vastc_srgb))
+			return true;
+	}
+
+	if (last_key->vclamp_color != key->vclamp_color)
+		return true;
+
+	if (last_key->ucp_enables != key->ucp_enables)
+		return true;
+
+	return false;
+}
+
 struct ir3_shader_variant {
 	struct fd_bo *bo;
 
@@ -198,6 +256,9 @@
 	/* do we have one or more texture sample instructions: */
 	bool has_samp;
 
+	/* do we have one or more SSBO instructions: */
+	bool has_ssbo;
+
 	/* do we have kill instructions: */
 	bool has_kill;
 
@@ -259,6 +320,10 @@
 struct ir3_shader * ir3_shader_create(struct ir3_compiler *compiler,
 		const struct pipe_shader_state *cso, enum shader_t type,
 		struct pipe_debug_callback *debug);
+struct ir3_shader *
+ir3_shader_create_compute(struct ir3_compiler *compiler,
+		const struct pipe_compute_state *cso,
+		struct pipe_debug_callback *debug);
 void ir3_shader_destroy(struct ir3_shader *shader);
 struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key, struct pipe_debug_callback *debug);
@@ -267,8 +332,12 @@
 
 struct fd_ringbuffer;
 struct fd_context;
-void ir3_emit_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
-		struct fd_context *ctx, const struct pipe_draw_info *info, uint32_t dirty);
+void ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_context *ctx, const struct pipe_draw_info *info);
+void ir3_emit_fs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_context *ctx);
+void ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_context *ctx, const struct pipe_grid_info *info);
 
 static inline const char *
 ir3_shader_stage(struct ir3_shader *shader)
diff --git a/src/gallium/drivers/i915/Android.mk b/src/gallium/drivers/i915/Android.mk
index fece305..f009154 100644
--- a/src/gallium/drivers/i915/Android.mk
+++ b/src/gallium/drivers/i915/Android.mk
@@ -34,3 +34,8 @@
 
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_I915),)
+GALLIUM_TARGET_DRIVERS += i915
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_i915)
+endif
diff --git a/src/gallium/drivers/i915/Automake.inc b/src/gallium/drivers/i915/Automake.inc
index 8cb40f2..73ef810 100644
--- a/src/gallium/drivers/i915/Automake.inc
+++ b/src/gallium/drivers/i915/Automake.inc
@@ -5,7 +5,7 @@
 TARGET_LIB_DEPS += \
 	$(top_builddir)/src/gallium/winsys/i915/drm/libi915drm.la \
 	$(top_builddir)/src/gallium/drivers/i915/libi915.la \
-	$(INTEL_LIBS) \
+	$(I915_LIBS) \
 	$(LIBDRM_LIBS)
 
 endif
diff --git a/src/gallium/drivers/i915/i915_context.c b/src/gallium/drivers/i915/i915_context.c
index 6664cfc..8ea9440 100644
--- a/src/gallium/drivers/i915/i915_context.c
+++ b/src/gallium/drivers/i915/i915_context.c
@@ -37,6 +37,7 @@
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "util/u_upload_mgr.h"
 #include "pipe/p_screen.h"
 
@@ -57,6 +58,9 @@
    const void *mapped_indices = NULL;
    unsigned i;
 
+   if (!u_trim_pipe_prim(info->mode, (unsigned*)&info->count))
+      return;
+
    /*
     * Ack vs contants here, helps ipers a lot.
     */
@@ -69,22 +73,23 @@
     * Map vertex buffers
     */
    for (i = 0; i < i915->nr_vertex_buffers; i++) {
-      const void *buf = i915->vertex_buffers[i].user_buffer;
+      const void *buf = i915->vertex_buffers[i].is_user_buffer ?
+                           i915->vertex_buffers[i].buffer.user : NULL;
       if (!buf)
-            buf = i915_buffer(i915->vertex_buffers[i].buffer)->data;
+            buf = i915_buffer(i915->vertex_buffers[i].buffer.resource)->data;
       draw_set_mapped_vertex_buffer(draw, i, buf, ~0);
    }
 
    /*
     * Map index buffer, if present
     */
-   if (info->indexed) {
-      mapped_indices = i915->index_buffer.user_buffer;
+   if (info->index_size) {
+      mapped_indices = info->has_user_indices ? info->index.user : NULL;
       if (!mapped_indices)
-         mapped_indices = i915_buffer(i915->index_buffer.buffer)->data;
+         mapped_indices = i915_buffer(info->index.resource)->data;
       draw_set_indexes(draw,
-                       (ubyte *) mapped_indices + i915->index_buffer.offset,
-                       i915->index_buffer.index_size, ~0);
+                       (ubyte *) mapped_indices,
+                       info->index_size, ~0);
    }
 
    if (i915->constants[PIPE_SHADER_VERTEX])
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index ea13834..626a17f 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -249,7 +249,6 @@
    struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
    struct pipe_sampler_view *vertex_sampler_views[PIPE_MAX_SAMPLERS];
    struct pipe_viewport_state viewport;
-   struct pipe_index_buffer index_buffer;
 
    unsigned dirty;
 
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 139b5d8..4ad98e2 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -277,6 +277,7 @@
    case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
    case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
    case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
@@ -306,6 +307,9 @@
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
 
    case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 3747922..ddc2709 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -1060,17 +1060,6 @@
    FREE( velems );
 }
 
-static void i915_set_index_buffer(struct pipe_context *pipe,
-                                  const struct pipe_index_buffer *ib)
-{
-   struct i915_context *i915 = i915_context(pipe);
-
-   if (ib)
-      memcpy(&i915->index_buffer, ib, sizeof(i915->index_buffer));
-   else
-      memset(&i915->index_buffer, 0, sizeof(i915->index_buffer));
-}
-
 static void
 i915_set_sample_mask(struct pipe_context *pipe,
                      unsigned sample_mask)
@@ -1119,5 +1108,4 @@
    i915->base.sampler_view_destroy = i915_sampler_view_destroy;
    i915->base.set_viewport_states = i915_set_viewport_states;
    i915->base.set_vertex_buffers = i915_set_vertex_buffers;
-   i915->base.set_index_buffer = i915_set_index_buffer;
 }
diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
index dbfbc84..7809010 100644
--- a/src/gallium/drivers/i915/i915_state_derived.c
+++ b/src/gallium/drivers/i915/i915_state_derived.c
@@ -216,6 +216,23 @@
    if (I915_DBG_ON(DBG_ATOMS))
       i915_dump_dirty(i915, __FUNCTION__);
 
+   if (!i915->fs) {
+      i915->dirty &= ~(I915_NEW_FS_CONSTANTS | I915_NEW_FS);
+      i915->hardware_dirty &= ~(I915_HW_PROGRAM | I915_HW_CONSTANTS);
+   }
+
+   if (!i915->vs)
+      i915->dirty &= ~I915_NEW_VS;
+
+   if (!i915->blend)
+      i915->dirty &= ~I915_NEW_BLEND;
+
+   if (!i915->rasterizer)
+      i915->dirty &= ~I915_NEW_RASTERIZER;
+
+   if (!i915->depth_stencil)
+      i915->dirty &= ~I915_NEW_DEPTH_STENCIL;
+   
    for (i = 0; atoms[i]; i++)
       if (atoms[i]->dirty & i915->dirty)
          atoms[i]->update(i915);
diff --git a/src/gallium/drivers/i915/i915_state_dynamic.c b/src/gallium/drivers/i915/i915_state_dynamic.c
index 85b2721..434b09d 100644
--- a/src/gallium/drivers/i915/i915_state_dynamic.c
+++ b/src/gallium/drivers/i915/i915_state_dynamic.c
@@ -213,7 +213,8 @@
 
    /* I915_NEW_RASTERIZER
     */
-   st[1] |= i915->rasterizer->st;
+   if (i915->rasterizer)
+      st[1] |= i915->rasterizer->st;
 
    /* I915_NEW_STIPPLE
     */
diff --git a/src/gallium/drivers/i915/i915_state_immediate.c b/src/gallium/drivers/i915/i915_state_immediate.c
index b6007acd..14566a4 100644
--- a/src/gallium/drivers/i915/i915_state_immediate.c
+++ b/src/gallium/drivers/i915/i915_state_immediate.c
@@ -168,11 +168,13 @@
 
    /* I915_NEW_BLEND
     */
-   LIS6 |= i915->blend->LIS6;
+   if (i915->blend)
+      LIS6 |= i915->blend->LIS6;
 
    /* I915_NEW_DEPTH
     */
-   LIS6 |= i915->depth_stencil->depth_LIS6;
+   if (i915->depth_stencil)
+      LIS6 |= i915->depth_stencil->depth_LIS6;
 
    set_immediate(i915, I915_IMMEDIATE_S6, LIS6);
 }
diff --git a/src/gallium/drivers/i915/i915_state_static.c b/src/gallium/drivers/i915/i915_state_static.c
index 9a7ea22..88b418b 100644
--- a/src/gallium/drivers/i915/i915_state_static.c
+++ b/src/gallium/drivers/i915/i915_state_static.c
@@ -216,7 +216,7 @@
       zformat = translate_depth_format(depth_surface->format);
 
       if (is->is_i945 && tex->tiling != I915_TILE_NONE
-            && !i915->fs->info.writes_z)
+          && (i915->fs && !i915->fs->info.writes_z))
          early_z = CLASSIC_EARLY_DEPTH;
    } else
       zformat = 0;
diff --git a/src/gallium/drivers/i915/i915_surface.c b/src/gallium/drivers/i915/i915_surface.c
index 27b0d9e..57e90c6 100644
--- a/src/gallium/drivers/i915/i915_surface.c
+++ b/src/gallium/drivers/i915/i915_surface.c
@@ -105,7 +105,7 @@
       goto fallback;
 
    util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
-   util_blitter_default_src_texture(&src_templ, src, src_level);
+   util_blitter_default_src_texture(i915->blitter, &src_templ, src, src_level);
 
    if (!util_blitter_is_copy_supported(i915->blitter, dst, src))
       goto fallback;
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index fbbd22a..74d7a9e 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -4,7 +4,7 @@
 Import('*')
 
 if not env['llvm']:
-    print 'warning: LLVM disabled: not building llvmpipe'
+    print('warning: LLVM disabled: not building llvmpipe')
     Return()
 
 env = env.Clone()
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 9a1a7b9..613d60f 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -96,7 +96,7 @@
    }
 
    for (i = 0; i < llvmpipe->num_vertex_buffers; i++) {
-      pipe_resource_reference(&llvmpipe->vertex_buffer[i].buffer, NULL);
+      pipe_vertex_buffer_unreference(&llvmpipe->vertex_buffer[i]);
    }
 
    lp_delete_setup_variants(llvmpipe);
@@ -119,10 +119,10 @@
 
 
 static void
-llvmpipe_render_condition ( struct pipe_context *pipe,
-                            struct pipe_query *query,
-                            boolean condition,
-                            uint mode )
+llvmpipe_render_condition(struct pipe_context *pipe,
+                          struct pipe_query *query,
+                          boolean condition,
+                          enum pipe_render_cond_flag mode)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
 
@@ -227,6 +227,12 @@
 
    lp_reset_counters();
 
+   /* If llvmpipe_set_scissor_states() is never called, we still need to
+    * make sure that derived scissor state is computed.
+    * See https://bugs.freedesktop.org/show_bug.cgi?id=101709
+    */
+   llvmpipe->dirty |= LP_NEW_SCISSOR;
+
    return &llvmpipe->pipe;
 
  fail:
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index d4bd02d..54d98fd 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -81,7 +81,6 @@
 
    struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
-   struct pipe_index_buffer index_buffer;
 
    unsigned num_samplers[PIPE_SHADER_TYPES];
    unsigned num_sampler_views[PIPE_SHADER_TYPES];
@@ -149,7 +148,7 @@
 
    /** Conditional query object and mode */
    struct pipe_query *render_cond_query;
-   uint render_cond_mode;
+   enum pipe_render_cond_flag render_cond_mode;
    boolean render_cond_cond;
 
    /** The LLVMContext to use for LLVM related work */
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index 22ef5fc..2efe3ef 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -73,33 +73,30 @@
     * Map vertex buffers
     */
    for (i = 0; i < lp->num_vertex_buffers; i++) {
-      const void *buf = lp->vertex_buffer[i].user_buffer;
+      const void *buf = lp->vertex_buffer[i].is_user_buffer ?
+                           lp->vertex_buffer[i].buffer.user : NULL;
       size_t size = ~0;
       if (!buf) {
-         if (!lp->vertex_buffer[i].buffer) {
+         if (!lp->vertex_buffer[i].buffer.resource) {
             continue;
          }
-         buf = llvmpipe_resource_data(lp->vertex_buffer[i].buffer);
-         size = lp->vertex_buffer[i].buffer->width0;
+         buf = llvmpipe_resource_data(lp->vertex_buffer[i].buffer.resource);
+         size = lp->vertex_buffer[i].buffer.resource->width0;
       }
       draw_set_mapped_vertex_buffer(draw, i, buf, size);
    }
 
    /* Map index buffer, if present */
-   if (info->indexed) {
+   if (info->index_size) {
       unsigned available_space = ~0;
-      mapped_indices = lp->index_buffer.user_buffer;
+      mapped_indices = info->has_user_indices ? info->index.user : NULL;
       if (!mapped_indices) {
-         mapped_indices = llvmpipe_resource_data(lp->index_buffer.buffer);
-         if (lp->index_buffer.buffer->width0 > lp->index_buffer.offset)
-            available_space =
-               (lp->index_buffer.buffer->width0 - lp->index_buffer.offset);
-         else
-            available_space = 0;
+         mapped_indices = llvmpipe_resource_data(info->index.resource);
+         available_space = info->index.resource->width0;
       }
       draw_set_indexes(draw,
-                       (ubyte *) mapped_indices + lp->index_buffer.offset,
-                       lp->index_buffer.index_size, available_space);
+                       (ubyte *) mapped_indices,
+                       info->index_size, available_space);
    }
 
    for (i = 0; i < lp->num_so_targets; i++) {
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 85449ab..e98e30d 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -352,6 +352,10 @@
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 38d9138..32387ab 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -1347,6 +1347,10 @@
    
    setup->dirty = ~0;
 
+   /* Initialize empty default fb correctly, so the rect is empty */
+   setup->framebuffer.x1 = -1;
+   setup->framebuffer.y1 = -1;
+
    return setup;
 
 no_scenes:
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 9714691..4b55fd9 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -215,10 +215,11 @@
                         unsigned *tri_size);
 
 boolean
-lp_setup_bin_triangle( struct lp_setup_context *setup,
-                       struct lp_rast_triangle *tri,
-                       const struct u_rect *bbox,
-                       int nr_planes,
-                       unsigned scissor_index );
+lp_setup_bin_triangle(struct lp_setup_context *setup,
+                      struct lp_rast_triangle *tri,
+                      const struct u_rect *bboxorig,
+                      const struct u_rect *bbox,
+                      int nr_planes,
+                      unsigned scissor_index);
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 018130c..d0bac5e 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -288,7 +288,9 @@
    struct lp_rast_plane *plane;
    struct lp_line_info info;
    float width = MAX2(1.0, setup->line_width);
-   struct u_rect bbox;
+   const struct u_rect *scissor;
+   struct u_rect bbox, bboxpos;
+   boolean s_planes[4];
    unsigned tri_bytes;
    int x[4]; 
    int y[4];
@@ -579,10 +581,12 @@
       return TRUE;
    }
 
+   bboxpos = bbox;
+
    /* Can safely discard negative regions:
     */
-   bbox.x0 = MAX2(bbox.x0, 0);
-   bbox.y0 = MAX2(bbox.y0, 0);
+   bboxpos.x0 = MAX2(bboxpos.x0, 0);
+   bboxpos.y0 = MAX2(bboxpos.y0, 0);
 
    nr_planes = 4;
    /*
@@ -591,8 +595,8 @@
     */
    if (setup->scissor_test) {
       /* why not just use draw_regions */
-      boolean s_planes[4];
-      scissor_planes_needed(s_planes, &bbox, &setup->scissors[viewport_index]);
+      scissor = &setup->scissors[viewport_index];
+      scissor_planes_needed(s_planes, &bboxpos, scissor);
       nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
    }
 
@@ -718,11 +722,7 @@
     * (easier to evaluate) to ordinary planes.)
     */
    if (nr_planes > 4) {
-      /* why not just use draw_regions */
-      const struct u_rect *scissor = &setup->scissors[viewport_index];
       struct lp_rast_plane *plane_s = &plane[4];
-      boolean s_planes[4];
-      scissor_planes_needed(s_planes, &bbox, scissor);
 
       if (s_planes[0]) {
          plane_s->dcdx = -1 << 8;
@@ -755,7 +755,7 @@
       assert(plane_s == &plane[nr_planes]);
    }
 
-   return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, viewport_index);
+   return lp_setup_bin_triangle(setup, line, &bbox, &bboxpos, nr_planes, viewport_index);
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index ddb6f0e..8cb6b83 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -513,7 +513,7 @@
       plane[3].eo = 0;
    }
 
-   return lp_setup_bin_triangle(setup, point, &bbox, nr_planes, viewport_index);
+   return lp_setup_bin_triangle(setup, point, &bbox, &bbox, nr_planes, viewport_index);
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index a7a5d05..39755d6 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -273,7 +273,9 @@
    const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    struct lp_rast_triangle *tri;
    struct lp_rast_plane *plane;
-   struct u_rect bbox;
+   const struct u_rect *scissor;
+   struct u_rect bbox, bboxpos;
+   boolean s_planes[4];
    unsigned tri_bytes;
    int nr_planes = 3;
    unsigned viewport_index = 0;
@@ -332,12 +334,14 @@
       return TRUE;
    }
 
+   bboxpos = bbox;
+
    /* Can safely discard negative regions, but need to keep hold of
     * information about when the triangle extends past screen
     * boundaries.  See trimmed_box in lp_setup_bin_triangle().
     */
-   bbox.x0 = MAX2(bbox.x0, 0);
-   bbox.y0 = MAX2(bbox.y0, 0);
+   bboxpos.x0 = MAX2(bboxpos.x0, 0);
+   bboxpos.y0 = MAX2(bboxpos.y0, 0);
 
    nr_planes = 3;
    /*
@@ -346,8 +350,8 @@
     */
    if (setup->scissor_test) {
       /* why not just use draw_regions */
-      boolean s_planes[4];
-      scissor_planes_needed(s_planes, &bbox, &setup->scissors[viewport_index]);
+      scissor = &setup->scissors[viewport_index];
+      scissor_planes_needed(s_planes, &bboxpos, scissor);
       nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
    }
 
@@ -358,7 +362,7 @@
    if (!tri)
       return FALSE;
 
-#if 0
+#ifdef DEBUG
    tri->v[0][0] = v0[0][0];
    tri->v[1][0] = v1[0][0];
    tri->v[2][0] = v2[0][0];
@@ -680,10 +684,7 @@
     */
    if (nr_planes > 3) {
       /* why not just use draw_regions */
-      const struct u_rect *scissor = &setup->scissors[viewport_index];
       struct lp_rast_plane *plane_s = &plane[3];
-      boolean s_planes[4];
-      scissor_planes_needed(s_planes, &bbox, scissor);
 
       if (s_planes[0]) {
          plane_s->dcdx = -1 << 8;
@@ -716,7 +717,7 @@
       assert(plane_s == &plane[nr_planes]);
    }
 
-   return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index);
+   return lp_setup_bin_triangle(setup, tri, &bbox, &bboxpos, nr_planes, viewport_index);
 }
 
 /*
@@ -747,11 +748,12 @@
 
 
 boolean
-lp_setup_bin_triangle( struct lp_setup_context *setup,
-                       struct lp_rast_triangle *tri,
-                       const struct u_rect *bbox,
-                       int nr_planes,
-                       unsigned viewport_index )
+lp_setup_bin_triangle(struct lp_setup_context *setup,
+                      struct lp_rast_triangle *tri,
+                      const struct u_rect *bboxorig,
+                      const struct u_rect *bbox,
+                      int nr_planes,
+                      unsigned viewport_index)
 {
    struct lp_scene *scene = setup->scene;
    struct u_rect trimmed_box = *bbox;   
@@ -767,7 +769,16 @@
    int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) |
                  (bbox->y1 - (bbox->y0 & ~3)));
    int sz = floor_pot(max_sz);
-   boolean use_32bits = max_sz <= MAX_FIXED_LENGTH32;
+
+   /*
+    * NOTE: It is important to use the original bounding box
+    * which might contain negative values here, because whether
+    * the plane math may overflow with the 32bit rasterization
+    * functions depends on the original extent of the triangle.
+    */
+   int max_szorig = ((bboxorig->x1 - (bboxorig->x0 & ~3)) |
+                     (bboxorig->y1 - (bboxorig->y0 & ~3)));
+   boolean use_32bits = max_szorig <= MAX_FIXED_LENGTH32;
 
    /* Now apply scissor, etc to the bounding box.  Could do this
     * earlier, but it confuses the logic for tri-16 and would force
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index fa9d4fb..3e75d44 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -194,6 +194,7 @@
    /* This needs LP_NEW_RASTERIZER because of draw_prepare_shader_outputs(). */
    if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
                           LP_NEW_FS |
+                          LP_NEW_GS |
                           LP_NEW_VS))
       compute_vertex_info(llvmpipe);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vertex.c b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
index 1e93fd8..702ecf9 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_vertex.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
@@ -93,18 +93,6 @@
 }
 
 
-static void
-llvmpipe_set_index_buffer(struct pipe_context *pipe,
-                          const struct pipe_index_buffer *ib)
-{
-   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-
-   if (ib)
-      memcpy(&llvmpipe->index_buffer, ib, sizeof(llvmpipe->index_buffer));
-   else
-      memset(&llvmpipe->index_buffer, 0, sizeof(llvmpipe->index_buffer));
-}
-
 void
 llvmpipe_init_vertex_funcs(struct llvmpipe_context *llvmpipe)
 {
@@ -113,5 +101,4 @@
    llvmpipe->pipe.delete_vertex_elements_state = llvmpipe_delete_vertex_elements_state;
 
    llvmpipe->pipe.set_vertex_buffers = llvmpipe_set_vertex_buffers;
-   llvmpipe->pipe.set_index_buffer = llvmpipe_set_index_buffer;
 }
diff --git a/src/gallium/drivers/noop/noop_state.c b/src/gallium/drivers/noop/noop_state.c
index 32a54e9..80cfae8 100644
--- a/src/gallium/drivers/noop/noop_state.c
+++ b/src/gallium/drivers/noop/noop_state.c
@@ -76,7 +76,10 @@
 
    if (!sampler_view)
       return NULL;
+
    /* initialize base object */
+   *sampler_view = *state;
+   sampler_view->texture = NULL;
    pipe_resource_reference(&sampler_view->texture, texture);
    pipe_reference_init(&sampler_view->reference, 1);
    sampler_view->context = ctx;
@@ -188,11 +191,6 @@
    FREE(state);
 }
 
-static void noop_set_index_buffer(struct pipe_context *ctx,
-                                  const struct pipe_index_buffer *ib)
-{
-}
-
 static void noop_set_vertex_buffers(struct pipe_context *ctx,
                                     unsigned start_slot, unsigned count,
                                     const struct pipe_vertex_buffer *buffers)
@@ -298,7 +296,6 @@
    ctx->set_scissor_states = noop_set_scissor_states;
    ctx->set_stencil_ref = noop_set_stencil_ref;
    ctx->set_vertex_buffers = noop_set_vertex_buffers;
-   ctx->set_index_buffer = noop_set_index_buffer;
    ctx->set_viewport_states = noop_set_viewport_states;
    ctx->sampler_view_destroy = noop_sampler_view_destroy;
    ctx->surface_destroy = noop_surface_destroy;
diff --git a/src/gallium/drivers/nouveau/Android.mk b/src/gallium/drivers/nouveau/Android.mk
index 3b26b59..2de22e7 100644
--- a/src/gallium/drivers/nouveau/Android.mk
+++ b/src/gallium/drivers/nouveau/Android.mk
@@ -39,6 +39,11 @@
 LOCAL_SHARED_LIBRARIES := libdrm_nouveau
 LOCAL_MODULE := libmesa_pipe_nouveau
 
-LOCAL_C_INCLUDES := external/libcxx/include
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_NOUVEAU),)
+GALLIUM_TARGET_DRIVERS += nouveau
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_nouveau)
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index b67a1dd..b96f919 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -575,6 +575,7 @@
    encSize = 0;
    ipa = 0;
    mask = 0;
+   precise = 0;
 
    lanes = 0xf;
 
@@ -905,6 +906,9 @@
 
    tex.rIndirectSrc = -1;
    tex.sIndirectSrc = -1;
+
+   if (op == OP_TXF)
+      sType = TYPE_U32;
 }
 
 TexInstruction::~TexInstruction()
@@ -1214,8 +1218,8 @@
    PROG_TYPE_CASE(FRAGMENT, FRAGMENT);
    PROG_TYPE_CASE(COMPUTE, COMPUTE);
    default:
-      type = nv50_ir::Program::TYPE_COMPUTE;
-      break;
+      INFO_DBG(info->dbgFlags, VERBOSE, "unsupported program type %u\n", info->type);
+      return -1;
    }
    INFO_DBG(info->dbgFlags, VERBOSE, "translating program of type %u\n", type);
 
@@ -1224,25 +1228,21 @@
       return -1;
 
    nv50_ir::Program *prog = new nv50_ir::Program(type, targ);
-   if (!prog)
+   if (!prog) {
+      nv50_ir::Target::destroy(targ);
       return -1;
+   }
    prog->driver = info;
    prog->dbgFlags = info->dbgFlags;
    prog->optLevel = info->optLevel;
 
    switch (info->bin.sourceRep) {
-#if 0
-   case PIPE_IR_LLVM:
-   case PIPE_IR_GLSL:
-      return -1;
-   case PIPE_IR_SM4:
-      ret = prog->makeFromSM4(info) ? 0 : -2;
-      break;
-   case PIPE_IR_TGSI:
-#endif
-   default:
+   case PIPE_SHADER_IR_TGSI:
       ret = prog->makeFromTGSI(info) ? 0 : -2;
       break;
+   default:
+      ret = -1;
+      break;
    }
    if (ret < 0)
       goto out;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index de6c110..bc15992 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -884,6 +884,8 @@
    unsigned perPatch   : 1;
    unsigned exit       : 1; // terminate program after insn
    unsigned mask       : 4; // for vector ops
+   // prevent algebraic optimisations that aren't bit-for-bit identical
+   unsigned precise    : 1;
 
    int8_t postFactor; // MUL/DIV(if < 0) by 1 << postFactor
 
@@ -1253,7 +1255,6 @@
    inline void add(Value *rval, int& id) { allRValues.insert(rval, id); }
 
    bool makeFromTGSI(struct nv50_ir_prog_info *);
-   bool makeFromSM4(struct nv50_ir_prog_info *);
    bool convertToSSA();
    bool optimizeSSA(int level);
    bool optimizePostRA(int level);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index e7d840d..76f08b1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -54,11 +54,6 @@
    ubyte si; /* TGSI semantic index */
 };
 
-#define NV50_PROGRAM_IR_TGSI 0
-#define NV50_PROGRAM_IR_SM4  1
-#define NV50_PROGRAM_IR_GLSL 2
-#define NV50_PROGRAM_IR_LLVM 3
-
 #ifdef DEBUG
 # define NV50_IR_DEBUG_BASIC     (1 << 0)
 # define NV50_IR_DEBUG_VERBOSE   (2 << 0)
@@ -95,7 +90,7 @@
       uint32_t *code;
       uint32_t codeSize;
       uint32_t instructions;
-      uint8_t sourceRep;  /* NV50_PROGRAM_IR */
+      uint8_t sourceRep;  /* PIPE_SHADER_IR_* */
       const void *source;
       void *relocData;
       void *fixupData;
@@ -142,6 +137,7 @@
          unsigned numColourResults;
          bool writesDepth;
          bool earlyFragTests;
+         bool postDepthCoverage;
          bool separateFragData;
          bool usesDiscard;
          bool persampleInvocation;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 8b58df4..b1e9f94 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -1858,6 +1858,8 @@
 void
 CodeEmitterGM107::emitISCADD()
 {
+   assert(insn->src(1).get()->asImm());
+
    switch (insn->src(2).getFile()) {
    case FILE_GPR:
       emitInsn(0x5c180000);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 14c00bd..58594f0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -2006,6 +2006,7 @@
 void
 CodeEmitterNVC0::emitMOV(const Instruction *i)
 {
+   assert(!i->saturate);
    if (i->def(0).getFile() == FILE_PREDICATE) {
       if (i->src(0).getFile() == FILE_GPR) {
          code[0] = 0xfc01c003;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 617e9a3..aa45b81 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -305,6 +305,8 @@
    case TGSI_OPCODE_TXD:
    case TGSI_OPCODE_TXL:
    case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TXF:
+   case TGSI_OPCODE_TG4:
    case TGSI_OPCODE_TEX_LZ:
    case TGSI_OPCODE_TXF_LZ:
    case TGSI_OPCODE_LODQ:
@@ -343,6 +345,8 @@
       }
    }
       return mask;
+   case TGSI_OPCODE_TXQ:
+      return 1;
    case TGSI_OPCODE_XPD:
    {
       unsigned int x = 0;
@@ -1277,6 +1281,9 @@
    case TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL:
       info->prop.fp.earlyFragTests = prop->u[0].Data;
       break;
+   case TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE:
+      info->prop.fp.postDepthCoverage = prop->u[0].Data;
+      break;
    case TGSI_PROPERTY_MUL_ZERO_WINS:
       info->io.mul_zero_wins = prop->u[0].Data;
       break;
@@ -2156,6 +2163,7 @@
          /* Save the viewport index into a scratch register so that it can be
             exported at EMIT time */
          if (info->out[idx].sn == TGSI_SEMANTIC_VIEWPORT_INDEX &&
+             prog->getType() == Program::TYPE_GEOMETRY &&
              viewport != NULL)
             mkOp1(OP_MOV, TYPE_U32, viewport, val);
          else
@@ -3185,6 +3193,7 @@
          geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode());
          if (op == OP_MUL && dstTy == TYPE_F32)
             geni->dnz = info->io.mul_zero_wins;
+         geni->precise = insn->Instruction.Precise;
       }
       break;
    case TGSI_OPCODE_MAD:
@@ -3198,6 +3207,7 @@
          geni = mkOp3(op, dstTy, dst0[c], src0, src1, src2);
          if (dstTy == TYPE_F32)
             geni->dnz = info->io.mul_zero_wins;
+         geni->precise = insn->Instruction.Precise;
       }
       break;
    case TGSI_OPCODE_MOV:
@@ -4080,7 +4090,9 @@
          tmp[0] = fetchSrc(0, c);
          tmp[1] = fetchSrc(0, c + 1);
          mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
-         src1 = fetchSrc(1, c / 2);
+         // Theoretically src1 is a 64-bit value but in practice only the low
+         // bits matter. The IR expects this to be a 32-bit value.
+         src1 = fetchSrc(1, c);
          mkOp2(op, dstTy, dst, src0, src1);
          mkSplit(&dst0[c], 4, dst);
          c++;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 0fbf6b8..7e4e193 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -727,7 +727,9 @@
       // Leave PFETCH alone... we just folded its 2 args into 1.
       break;
    default:
-      i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
+      i->op = i->saturate ? OP_SAT : OP_MOV;
+      if (i->saturate)
+         unary(i, *i->getSrc(0)->asImm());
       break;
    }
    i->subOp = 0;
@@ -1509,6 +1511,17 @@
    default:
       return;
    }
+
+   // This can get left behind some of the optimizations which simplify
+   // saturatable values.
+   if (newi->op == OP_MOV && newi->saturate) {
+      ImmediateValue tmp;
+      newi->saturate = 0;
+      newi->op = OP_SAT;
+      if (newi->src(0).getImmediate(tmp))
+         unary(newi, tmp);
+   }
+
    if (newi->op != op)
       foldCount++;
 }
@@ -1677,7 +1690,8 @@
       return false;
 
    bool changed = false;
-   if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
+   // we can't optimize to MAD if the add is precise
+   if (!add->precise && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
       changed = tryADDToMADOrSAD(add, OP_MAD);
    if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
       changed = tryADDToMADOrSAD(add, OP_SAD);
@@ -1713,7 +1727,7 @@
       return false;
 
    if (src->getInsn()->saturate || src->getInsn()->postFactor ||
-       src->getInsn()->dnz)
+       src->getInsn()->dnz || src->getInsn()->precise)
       return false;
 
    if (toOp == OP_SAD) {
@@ -2649,7 +2663,7 @@
    Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
 
    for (; it; it = it->next) {
-      if (it->locked && insn->op != OP_LOAD)
+      if (it->locked && insn->op != OP_LOAD && insn->op != OP_VFETCH)
          continue;
       if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
           it->rel[0] != insn->getIndirect(0, 0) ||
@@ -2787,11 +2801,15 @@
    Record that;
    that.set(ldst);
 
-   if (this->fileIndex != that.fileIndex)
+   // This assumes that images/buffers can't overlap. They can.
+   // TODO: Plumb the restrict logic through, and only skip when it's a
+   // restrict situation, or there can implicitly be no writes.
+   if (this->fileIndex != that.fileIndex && this->rel[1] == that.rel[1])
       return false;
 
    if (this->rel[0] || that.rel[0])
       return this->base == that.base;
+
    return
       (this->offset < that.offset + that.size) &&
       (this->offset + this->size > that.offset);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 193628c..b33d7b4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -37,11 +37,9 @@
 #if __cplusplus >= 201103L
 using std::hash;
 using std::unordered_map;
-#elif !defined(ANDROID)
+#else
 using std::tr1::hash;
 using std::tr1::unordered_map;
-#else
-#error Android release before Lollipop is not supported!
 #endif
 
 #define MAX_REGISTER_FILE_SIZE 256
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
index e9d1057..afeca14 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
@@ -174,11 +174,15 @@
    virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const = 0;
 
    virtual void parseDriverInfo(const struct nv50_ir_prog_info *info) {
-      threads = info->prop.cp.numThreads[0] *
-         info->prop.cp.numThreads[1] *
-         info->prop.cp.numThreads[2];
-      if (threads == 0)
-         threads = info->target >= NVISA_GK104_CHIPSET ? 1024 : 512;
+      if (info->type == PIPE_SHADER_COMPUTE) {
+         threads = info->prop.cp.numThreads[0] *
+            info->prop.cp.numThreads[1] *
+            info->prop.cp.numThreads[2];
+         if (threads == 0)
+            threads = info->target >= NVISA_GK104_CHIPSET ? 1024 : 512;
+      } else {
+         threads = 32; // doesn't matter, just not too big.
+      }
    }
 
    virtual bool runLegalizePass(Program *, CGStage stage) const = 0;
diff --git a/src/gallium/drivers/nouveau/codegen/unordered_set.h b/src/gallium/drivers/nouveau/codegen/unordered_set.h
index 8ef6d46..0e2945f 100644
--- a/src/gallium/drivers/nouveau/codegen/unordered_set.h
+++ b/src/gallium/drivers/nouveau/codegen/unordered_set.h
@@ -1,7 +1,7 @@
 #ifndef __NV50_UNORDERED_SET_H__
 #define __NV50_UNORDERED_SET_H__
 
-#if (__cplusplus >= 201103L) || defined(ANDROID)
+#if (__cplusplus >= 201103L)
 #include <unordered_set>
 #else
 #include <tr1/unordered_set>
@@ -11,36 +11,8 @@
 
 #if __cplusplus >= 201103L
 using std::unordered_set;
-#elif !defined(ANDROID)
+#else
 using std::tr1::unordered_set;
-#else // Android release before lollipop
-using std::isfinite;
-typedef std::tr1::unordered_set<void *> voidptr_unordered_set;
-
-template <typename V>
-class unordered_set : public voidptr_unordered_set {
-  public:
-    typedef voidptr_unordered_set _base;
-    typedef _base::iterator _biterator;
-    typedef _base::const_iterator const_biterator;
-
-    class iterator : public _biterator {
-      public:
-        iterator(const _biterator & i) : _biterator(i) {}
-        V operator*() const { return reinterpret_cast<V>(*_biterator(*this)); }
-    };
-    class const_iterator : public const_biterator {
-      public:
-        const_iterator(const iterator & i) : const_biterator(i) {}
-        const_iterator(const const_biterator & i) : const_biterator(i) {}
-        const V operator*() const { return reinterpret_cast<const V>(*const_biterator(*this)); }
-    };
-
-    iterator begin() { return _base::begin(); }
-    iterator end() { return _base::end(); }
-    const_iterator begin() const { return _base::begin(); }
-    const_iterator end() const { return _base::end(); }
-};
 #endif
 
 } // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c
index d8009f5..3151a6f 100644
--- a/src/gallium/drivers/nouveau/nouveau_compiler.c
+++ b/src/gallium/drivers/nouveau/nouveau_compiler.c
@@ -109,7 +109,7 @@
 
    info.type = type;
    info.target = chipset;
-   info.bin.sourceRep = NV50_PROGRAM_IR_TGSI;
+   info.bin.sourceRep = PIPE_SHADER_IR_TGSI;
    info.bin.source = tokens;
 
    info.io.auxCBSlot = 15;
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 15cb965..13b76d7 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -158,7 +158,7 @@
       if (res != -1) {
          screen->disk_shader_cache =
             disk_cache_create(nouveau_screen_get_name(&screen->base),
-                              timestamp_str);
+                              timestamp_str, 0);
          free(timestamp_str);
       }
    }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c
index 4c16e0c..e137525 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c
@@ -115,7 +115,7 @@
 
    if (res->bind & PIPE_BIND_VERTEX_BUFFER) {
       for (i = 0; i < nv30->num_vtxbufs; ++i) {
-         if (nv30->vtxbuf[i].buffer == res) {
+         if (nv30->vtxbuf[i].buffer.resource == res) {
             nv30->dirty |= NV30_NEW_ARRAYS;
             nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VTXBUF);
             if (!--ref)
@@ -123,13 +123,6 @@
          }
       }
    }
-   if (res->bind & PIPE_BIND_INDEX_BUFFER) {
-      if (nv30->idxbuf.buffer == res) {
-         nouveau_bufctx_reset(nv30->bufctx, BUFCTX_IDXBUF);
-         if (!--ref)
-            return ref;
-      }
-   }
 
    if (res->bind & PIPE_BIND_SAMPLER_VIEW) {
       for (i = 0; i < nv30->fragprog.num_textures; ++i) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h
index 0ab2f95..1496b37 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h
@@ -110,7 +110,6 @@
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
-   struct pipe_index_buffer idxbuf;
    uint32_t vbo_fifo;
    uint32_t vbo_user;
    unsigned vbo_min_index;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index 10c9f56..4c587fc 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -419,25 +419,26 @@
    }
 
    for (i = 0; i < nv30->num_vtxbufs; i++) {
-      const void *map = nv30->vtxbuf[i].user_buffer;
+      const void *map = nv30->vtxbuf[i].is_user_buffer ?
+                           nv30->vtxbuf[i].buffer.user : NULL;
       if (!map) {
-         if (nv30->vtxbuf[i].buffer)
-            map = pipe_buffer_map(pipe, nv30->vtxbuf[i].buffer,
+         if (nv30->vtxbuf[i].buffer.resource)
+            map = pipe_buffer_map(pipe, nv30->vtxbuf[i].buffer.resource,
                                   PIPE_TRANSFER_UNSYNCHRONIZED |
                                   PIPE_TRANSFER_READ, &transfer[i]);
       }
       draw_set_mapped_vertex_buffer(draw, i, map, ~0);
    }
 
-   if (info->indexed) {
-      const void *map = nv30->idxbuf.user_buffer;
+   if (info->index_size) {
+      const void *map = info->has_user_indices ? info->index.user : NULL;
       if (!map)
-         map = pipe_buffer_map(pipe, nv30->idxbuf.buffer,
+         map = pipe_buffer_map(pipe, info->index.resource,
                                PIPE_TRANSFER_UNSYNCHRONIZED |
                                PIPE_TRANSFER_READ, &transferi);
       draw_set_indexes(draw,
-                       (ubyte *) map + nv30->idxbuf.offset,
-                       nv30->idxbuf.index_size, ~0);
+                       (ubyte *) map,
+                       info->index_size, ~0);
    } else {
       draw_set_indexes(draw, NULL, 0, 0);
    }
@@ -445,7 +446,7 @@
    draw_vbo(draw, info);
    draw_flush(draw);
 
-   if (info->indexed && transferi)
+   if (info->index_size && transferi)
       pipe_buffer_unmap(pipe, transferi);
    for (i = 0; i < nv30->num_vtxbufs; i++)
       if (transfer[i])
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_push.c b/src/gallium/drivers/nouveau/nv30/nv30_push.c
index 67ab050..fc8520b 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_push.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_push.c
@@ -199,7 +199,7 @@
 {
    struct push_context ctx;
    unsigned i, index_size;
-   bool apply_bias = info->indexed && info->index_bias;
+   bool apply_bias = info->index_size && info->index_bias;
 
    ctx.push = nv30->base.pushbuf;
    ctx.translate = nv30->vertex->translate;
@@ -209,9 +209,9 @@
    for (i = 0; i < nv30->num_vtxbufs; ++i) {
       uint8_t *data;
       struct pipe_vertex_buffer *vb = &nv30->vtxbuf[i];
-      struct nv04_resource *res = nv04_resource(vb->buffer);
+      struct nv04_resource *res = nv04_resource(vb->buffer.resource);
 
-      if (!vb->buffer && !vb->user_buffer) {
+      if (!vb->buffer.resource) {
          continue;
       }
 
@@ -224,18 +224,18 @@
       ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);
    }
 
-   if (info->indexed) {
-      if (nv30->idxbuf.buffer)
+   if (info->index_size) {
+      if (!info->has_user_indices)
          ctx.idxbuf = nouveau_resource_map_offset(&nv30->base,
-            nv04_resource(nv30->idxbuf.buffer), nv30->idxbuf.offset,
+            nv04_resource(info->index.resource), info->start * info->index_size,
             NOUVEAU_BO_RD);
       else
-         ctx.idxbuf = nv30->idxbuf.user_buffer;
+         ctx.idxbuf = info->index.user;
       if (!ctx.idxbuf) {
          nv30_state_release(nv30);
          return;
       }
-      index_size = nv30->idxbuf.index_size;
+      index_size = info->index_size;
       ctx.primitive_restart = info->primitive_restart;
       ctx.restart_index = info->restart_index;
    } else {
@@ -277,12 +277,12 @@
    BEGIN_NV04(ctx.push, NV30_3D(VERTEX_BEGIN_END), 1);
    PUSH_DATA (ctx.push, NV30_3D_VERTEX_BEGIN_END_STOP);
 
-   if (info->indexed)
-      nouveau_resource_unmap(nv04_resource(nv30->idxbuf.buffer));
+   if (info->index_size && !info->has_user_indices)
+      nouveau_resource_unmap(nv04_resource(info->index.resource));
 
    for (i = 0; i < nv30->num_vtxbufs; ++i) {
-      if (nv30->vtxbuf[i].buffer) {
-         nouveau_resource_unmap(nv04_resource(nv30->vtxbuf[i].buffer));
+      if (nv30->vtxbuf[i].buffer.resource) {
+         nouveau_resource_unmap(nv04_resource(nv30->vtxbuf[i].buffer.resource));
       }
    }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.c b/src/gallium/drivers/nouveau/nv30/nv30_resource.c
index 6238a23..ff34f6e 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_resource.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.c
@@ -39,15 +39,11 @@
 
    if (flags & PIPE_BARRIER_MAPPED_BUFFER) {
       for (i = 0; i < nv30->num_vtxbufs; ++i) {
-         if (!nv30->vtxbuf[i].buffer)
+         if (!nv30->vtxbuf[i].buffer.resource)
             continue;
-         if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
+         if (nv30->vtxbuf[i].buffer.resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
             nv30->base.vbo_dirty = true;
       }
-
-      if (nv30->idxbuf.buffer &&
-          nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-         nv30->base.vbo_dirty = true;
    }
 }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 24b6b60..a352ff5 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -96,6 +96,7 @@
    case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
    case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
       return 1;
    /* nv35 capabilities */
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
@@ -216,6 +217,9 @@
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -313,6 +317,7 @@
       case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
       case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+      case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
          return 0;
       default:
          debug_printf("unknown vertex shader param %d\n", param);
@@ -360,6 +365,7 @@
       case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
       case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+      case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
          return 0;
       default:
          debug_printf("unknown fragment shader param %d\n", param);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.c b/src/gallium/drivers/nouveau/nv30/nv30_state.c
index 16b668b..2a81225 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state.c
@@ -438,23 +438,6 @@
     nv30->dirty |= NV30_NEW_ARRAYS;
 }
 
-static void
-nv30_set_index_buffer(struct pipe_context *pipe,
-                      const struct pipe_index_buffer *ib)
-{
-    struct nv30_context *nv30 = nv30_context(pipe);
-
-    if (ib) {
-       pipe_resource_reference(&nv30->idxbuf.buffer, ib->buffer);
-       nv30->idxbuf.index_size = ib->index_size;
-       nv30->idxbuf.offset = ib->offset;
-       nv30->idxbuf.user_buffer = ib->user_buffer;
-    } else {
-       pipe_resource_reference(&nv30->idxbuf.buffer, NULL);
-       nv30->idxbuf.user_buffer = NULL;
-    }
-}
-
 void
 nv30_state_init(struct pipe_context *pipe)
 {
@@ -481,5 +464,4 @@
    pipe->set_viewport_states = nv30_set_viewport_states;
 
    pipe->set_vertex_buffers = nv30_set_vertex_buffers;
-   pipe->set_index_buffer = nv30_set_index_buffer;
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
index bc9b9a1..bb0a8a0 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
@@ -25,6 +25,7 @@
 
 #include "util/u_format.h"
 #include "util/u_inlines.h"
+#include "util/u_prim.h"
 #include "translate/translate.h"
 
 #include "nouveau_fence.h"
@@ -39,7 +40,7 @@
 {
    const unsigned nc = util_format_get_nr_components(ve->src_format);
    struct nouveau_pushbuf *push = nv30->base.pushbuf;
-   struct nv04_resource *res = nv04_resource(vb->buffer);
+   struct nv04_resource *res = nv04_resource(vb->buffer.resource);
    const struct util_format_description *desc =
       util_format_description(ve->src_format);
    const void *data;
@@ -101,12 +102,12 @@
 
    for (i = 0; i < nv30->num_vtxbufs; i++) {
       vb = &nv30->vtxbuf[i];
-      if (!vb->stride || !vb->buffer) /* NOTE: user_buffer not implemented */
+      if (!vb->stride || !vb->buffer.resource) /* NOTE: user_buffer not implemented */
          continue;
-      buf = nv04_resource(vb->buffer);
+      buf = nv04_resource(vb->buffer.resource);
 
       /* NOTE: user buffers with temporary storage count as mapped by GPU */
-      if (!nouveau_resource_mapped_by_gpu(vb->buffer)) {
+      if (!nouveau_resource_mapped_by_gpu(vb->buffer.resource)) {
          if (nv30->vbo_push_hint) {
             nv30->vbo_fifo = ~0;
             continue;
@@ -137,7 +138,7 @@
       struct pipe_vertex_element *ve = &nv30->vertex->pipe[i];
       const int b = ve->vertex_buffer_index;
       struct pipe_vertex_buffer *vb = &nv30->vtxbuf[b];
-      struct nv04_resource *buf = nv04_resource(vb->buffer);
+      struct nv04_resource *buf = nv04_resource(vb->buffer.resource);
 
       if (!(nv30->vbo_user & (1 << b)))
          continue;
@@ -172,7 +173,7 @@
       int i = ffs(vbo_user) - 1;
       vbo_user &= ~(1 << i);
 
-      nouveau_buffer_release_gpu_storage(nv04_resource(nv30->vtxbuf[i].buffer));
+      nouveau_buffer_release_gpu_storage(nv04_resource(nv30->vtxbuf[i].buffer.resource));
    }
 
    nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VTXTMP);
@@ -234,7 +235,7 @@
       vb = &nv30->vtxbuf[ve->vertex_buffer_index];
       user = (nv30->vbo_user & (1 << ve->vertex_buffer_index));
 
-      res = nv04_resource(vb->buffer);
+      res = nv04_resource(vb->buffer.resource);
 
       if (nv30->vbo_fifo || unlikely(vb->stride == 0)) {
          if (!nv30->vbo_fifo)
@@ -458,10 +459,11 @@
 
 static void
 nv30_draw_elements(struct nv30_context *nv30, bool shorten,
+                   const struct pipe_draw_info *info,
                    unsigned mode, unsigned start, unsigned count,
-                   unsigned instance_count, int32_t index_bias)
+                   unsigned instance_count, int32_t index_bias,
+                   unsigned index_size)
 {
-   const unsigned index_size = nv30->idxbuf.index_size;
    struct nouveau_pushbuf *push = nv30->base.pushbuf;
    struct nouveau_object *eng3d = nv30->screen->eng3d;
    unsigned prim = nv30_prim_gl(mode);
@@ -473,9 +475,9 @@
    }
 
    if (eng3d->oclass == NV40_3D_CLASS && index_size > 1 &&
-       nv30->idxbuf.buffer) {
-      struct nv04_resource *res = nv04_resource(nv30->idxbuf.buffer);
-      unsigned offset = nv30->idxbuf.offset;
+       !info->has_user_indices) {
+      struct nv04_resource *res = nv04_resource(info->index.resource);
+      unsigned offset = 0;
 
       assert(nouveau_resource_mapped_by_gpu(&res->base));
 
@@ -510,12 +512,12 @@
       PUSH_RESET(push, BUFCTX_IDXBUF);
    } else {
       const void *data;
-      if (nv30->idxbuf.buffer)
+      if (!info->has_user_indices)
          data = nouveau_resource_map_offset(&nv30->base,
-                                            nv04_resource(nv30->idxbuf.buffer),
-                                            nv30->idxbuf.offset, NOUVEAU_BO_RD);
+                                            nv04_resource(info->index.resource),
+                                            start * index_size, NOUVEAU_BO_RD);
       else
-         data = nv30->idxbuf.user_buffer;
+         data = info->index.user;
       if (!data)
          return;
 
@@ -550,11 +552,15 @@
    struct nouveau_pushbuf *push = nv30->base.pushbuf;
    int i;
 
+   if (!info->primitive_restart &&
+       !u_trim_pipe_prim(info->mode, (unsigned*)&info->count))
+      return;
+
    /* For picking only a few vertices from a large user buffer, push is better,
     * if index count is larger and we expect repeated vertices, suggest upload.
     */
    nv30->vbo_push_hint = /* the 64 is heuristic */
-      !(info->indexed &&
+      !(info->index_size &&
         ((info->max_index - info->min_index + 64) < info->count));
 
    nv30->vbo_min_index = info->min_index;
@@ -578,14 +584,14 @@
    }
 
    for (i = 0; i < nv30->num_vtxbufs && !nv30->base.vbo_dirty; ++i) {
-      if (!nv30->vtxbuf[i].buffer)
+      if (!nv30->vtxbuf[i].buffer.resource)
          continue;
-      if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+      if (nv30->vtxbuf[i].buffer.resource->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
          nv30->base.vbo_dirty = true;
    }
 
-   if (!nv30->base.vbo_dirty && nv30->idxbuf.buffer &&
-       nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+   if (!nv30->base.vbo_dirty && info->index_size && !info->has_user_indices &&
+       info->index.resource->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
       nv30->base.vbo_dirty = true;
 
    if (nv30->base.vbo_dirty) {
@@ -594,7 +600,7 @@
       nv30->base.vbo_dirty = false;
    }
 
-   if (!info->indexed) {
+   if (!info->index_size) {
       nv30_draw_arrays(nv30,
                        info->mode, info->start, info->count,
                        info->instance_count);
@@ -623,9 +629,9 @@
             shorten = false;
       }
 
-      nv30_draw_elements(nv30, shorten,
+      nv30_draw_elements(nv30, shorten, info,
                          info->mode, info->start, info->count,
-                         info->instance_count, info->index_bias);
+                         info->instance_count, info->index_bias, info->index_size);
    }
 
    nv30_state_release(nv30);
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
index 4924d21..278a8a4 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
@@ -5,6 +5,7 @@
 #include "util/u_dynarray.h"
 #include "util/u_inlines.h"
 #include "util/u_debug.h"
+#include "util/u_memory.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
@@ -1118,7 +1119,7 @@
       goto out_err;
 
    tgsi_parse_init(&parse, fp->pipe.tokens);
-   util_dynarray_init(&insns);
+   util_dynarray_init(&insns, NULL);
 
    while (!tgsi_parse_end_of_tokens(&parse)) {
       tgsi_parse_token(&parse);
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
index baea701..bec9975 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -4,6 +4,7 @@
 #include "pipe/p_state.h"
 #include "util/u_dynarray.h"
 #include "util/u_debug.h"
+#include "util/u_memory.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
@@ -997,7 +998,7 @@
       vpc->cvtx_idx = vpc->hpos_idx;
    }
 
-   util_dynarray_init(&insns);
+   util_dynarray_init(&insns, NULL);
 
    tgsi_parse_init(&parse, vp->pipe.tokens);
    while (!tgsi_parse_end_of_tokens(&parse)) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index bf768bc..6124343 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -62,16 +62,12 @@
 
    if (flags & PIPE_BARRIER_MAPPED_BUFFER) {
       for (i = 0; i < nv50->num_vtxbufs; ++i) {
-         if (!nv50->vtxbuf[i].buffer)
+         if (!nv50->vtxbuf[i].buffer.resource || nv50->vtxbuf[i].is_user_buffer) /* user pointers are not pipe_resources */
             continue;
-         if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
+         if (nv50->vtxbuf[i].buffer.resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
             nv50->base.vbo_dirty = true;
       }
 
-      if (nv50->idxbuf.buffer &&
-          nv50->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-         nv50->base.vbo_dirty = true;
-
       for (s = 0; s < 3 && !nv50->cb_dirty; ++s) {
          uint32_t valid = nv50->constbuf_valid[s];
 
@@ -144,9 +140,7 @@
 
    assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
    for (i = 0; i < nv50->num_vtxbufs; ++i)
-      pipe_resource_reference(&nv50->vtxbuf[i].buffer, NULL);
-
-   pipe_resource_reference(&nv50->idxbuf.buffer, NULL);
+      pipe_vertex_buffer_unreference(&nv50->vtxbuf[i]);
 
    for (s = 0; s < 3; ++s) {
       assert(nv50->num_textures[s] <= PIPE_MAX_SAMPLERS);
@@ -230,7 +224,7 @@
 
       assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
       for (i = 0; i < nv50->num_vtxbufs; ++i) {
-         if (nv50->vtxbuf[i].buffer == res) {
+         if (nv50->vtxbuf[i].buffer.resource == res) {
             nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
             nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX);
             if (!--ref)
@@ -238,14 +232,6 @@
          }
       }
 
-      if (nv50->idxbuf.buffer == res) {
-         /* Just rebind to the bufctx as there is no separate dirty bit */
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
-         BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(res), RD);
-         if (!--ref)
-            return ref;
-      }
-
       for (s = 0; s < 3; ++s) {
       assert(nv50->num_textures[s] <= PIPE_MAX_SAMPLERS);
       for (i = 0; i < nv50->num_textures[s]; ++i) {
@@ -391,7 +377,7 @@
 
    nv50->base.scratch.bo_size = 2 << 20;
 
-   util_dynarray_init(&nv50->global_residents);
+   util_dynarray_init(&nv50->global_residents, NULL);
 
    return pipe;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index cca44f5..224535a 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -143,7 +143,6 @@
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
    uint32_t vtxbufs_coherent;
-   struct pipe_index_buffer idxbuf;
    uint32_t vbo_fifo; /* bitmask of vertex elements to be pushed to FIFO */
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
    uint32_t vbo_constant; /* bitmask of user buffers with stride 0 */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index 76d06ae..92e73f8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -20,6 +20,8 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include "pipe/p_defines.h"
+
 #include "nv50/nv50_program.h"
 #include "nv50/nv50_context.h"
 
@@ -331,7 +333,7 @@
 
    info->type = prog->type;
    info->target = chipset;
-   info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
+   info->bin.sourceRep = PIPE_SHADER_IR_TGSI;
    info->bin.source = (void *)prog->pipe.tokens;
 
    info->io.auxCBSlot = 15;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_push.c b/src/gallium/drivers/nouveau/nv50/nv50_push.c
index 6a53ad0..bec2d42 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_push.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_push.c
@@ -244,7 +244,7 @@
    unsigned i, index_size;
    unsigned inst_count = info->instance_count;
    unsigned vert_count = info->count;
-   bool apply_bias = info->indexed && info->index_bias;
+   bool apply_bias = info->index_size && info->index_bias;
 
    ctx.push = nv50->base.pushbuf;
    ctx.translate = nv50->vertex->translate;
@@ -264,11 +264,11 @@
       const struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i];
       const uint8_t *data;
 
-      if (unlikely(vb->buffer))
+      if (unlikely(!vb->is_user_buffer))
          data = nouveau_resource_map_offset(&nv50->base,
-            nv04_resource(vb->buffer), vb->buffer_offset, NOUVEAU_BO_RD);
+            nv04_resource(vb->buffer.resource), vb->buffer_offset, NOUVEAU_BO_RD);
       else
-         data = vb->user_buffer;
+         data = vb->buffer.user;
 
       if (apply_bias && likely(!(nv50->vertex->instance_bufs & (1 << i))))
          data += (ptrdiff_t)info->index_bias * vb->stride;
@@ -276,17 +276,16 @@
       ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);
    }
 
-   if (info->indexed) {
-      if (nv50->idxbuf.buffer) {
+   if (info->index_size) {
+      if (!info->has_user_indices) {
          ctx.idxbuf = nouveau_resource_map_offset(&nv50->base,
-            nv04_resource(nv50->idxbuf.buffer), nv50->idxbuf.offset,
-            NOUVEAU_BO_RD);
+            nv04_resource(info->index.resource), 0, NOUVEAU_BO_RD);
       } else {
-         ctx.idxbuf = nv50->idxbuf.user_buffer;
+         ctx.idxbuf = info->index.user;
       }
       if (!ctx.idxbuf)
          return;
-      index_size = nv50->idxbuf.index_size;
+      index_size = info->index_size;
       ctx.primitive_restart = info->primitive_restart;
       ctx.restart_index = info->restart_index;
    } else {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index f691b47..78e11cf 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -200,6 +200,8 @@
    case PIPE_CAP_TGSI_MUL_ZERO_WINS:
    case PIPE_CAP_TGSI_TEX_TXF_LZ:
    case PIPE_CAP_TGSI_CLOCK:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -268,6 +270,8 @@
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -341,6 +345,8 @@
       return 0; /* please inline, or provide function declarations */
    case PIPE_SHADER_CAP_INTEGERS:
       return 1;
+   case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+      return 1;
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
       /* The chip could handle more sampler views than samplers */
    case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 6fa3d2c..a7d86b0 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -1060,7 +1060,7 @@
    for (i = 0; i < count; ++i) {
       unsigned dst_index = start_slot + i;
 
-      if (!vb[i].buffer && vb[i].user_buffer) {
+      if (vb[i].is_user_buffer) {
          nv50->vbo_user |= 1 << dst_index;
          if (!vb[i].stride)
             nv50->vbo_constant |= 1 << dst_index;
@@ -1071,8 +1071,8 @@
          nv50->vbo_user &= ~(1 << dst_index);
          nv50->vbo_constant &= ~(1 << dst_index);
 
-         if (vb[i].buffer &&
-             vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+         if (vb[i].buffer.resource &&
+             vb[i].buffer.resource->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
             nv50->vtxbufs_coherent |= (1 << dst_index);
          else
             nv50->vtxbufs_coherent &= ~(1 << dst_index);
@@ -1081,29 +1081,6 @@
 }
 
 static void
-nv50_set_index_buffer(struct pipe_context *pipe,
-                      const struct pipe_index_buffer *ib)
-{
-   struct nv50_context *nv50 = nv50_context(pipe);
-
-   if (nv50->idxbuf.buffer)
-      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
-
-   if (ib) {
-      pipe_resource_reference(&nv50->idxbuf.buffer, ib->buffer);
-      nv50->idxbuf.index_size = ib->index_size;
-      if (ib->buffer) {
-         nv50->idxbuf.offset = ib->offset;
-         BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(ib->buffer), RD);
-      } else {
-         nv50->idxbuf.user_buffer = ib->user_buffer;
-      }
-   } else {
-      pipe_resource_reference(&nv50->idxbuf.buffer, NULL);
-   }
-}
-
-static void
 nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
 {
    struct nv50_context *nv50 = nv50_context(pipe);
@@ -1341,7 +1318,6 @@
    pipe->bind_vertex_elements_state = nv50_vertex_state_bind;
 
    pipe->set_vertex_buffers = nv50_set_vertex_buffers;
-   pipe->set_index_buffer = nv50_set_index_buffer;
 
    pipe->create_stream_output_target = nv50_so_target_create;
    pipe->stream_output_target_destroy = nv50_so_target_destroy;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 227038e..ed04112 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -141,13 +141,13 @@
                   struct pipe_vertex_element *ve, unsigned attr)
 {
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
-   const void *data = (const uint8_t *)vb->user_buffer + ve->src_offset;
+   const void *data = (const uint8_t *)vb->buffer.user + ve->src_offset;
    float v[4];
    const unsigned nc = util_format_get_nr_components(ve->src_format);
    const struct util_format_description *desc =
       util_format_description(ve->src_format);
 
-   assert(vb->user_buffer);
+   assert(vb->is_user_buffer);
 
    if (desc->channel[0].pure_integer) {
       if (desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
@@ -200,7 +200,7 @@
    if (unlikely(nv50->vertex->instance_bufs & (1 << vbi))) {
       /* TODO: use min and max instance divisor to get a proper range */
       *base = 0;
-      *size = nv50->vtxbuf[vbi].buffer->width0;
+      *size = nv50->vtxbuf[vbi].buffer.resource->width0;
    } else {
       /* NOTE: if there are user buffers, we *must* have index bounds */
       assert(nv50->vb_elt_limit != ~0);
@@ -227,7 +227,7 @@
       nv50_user_vbuf_range(nv50, b, &base, &size);
 
       limits[b] = base + size - 1;
-      addrs[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer, base, size,
+      addrs[b] = nouveau_scratch_data(&nv50->base, vb->buffer.user, base, size,
                                       &bo);
       if (addrs[b])
          BCTX_REFN_bo(nv50->bufctx_3d, 3D_VERTEX_TMP, NOUVEAU_BO_GART |
@@ -266,7 +266,7 @@
          struct nouveau_bo *bo;
          const uint32_t bo_flags = NOUVEAU_BO_GART | NOUVEAU_BO_RD;
          written |= 1 << b;
-         address[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer,
+         address[b] = nouveau_scratch_data(&nv50->base, vb->buffer.user,
                                            base, size, &bo);
          if (address[b])
             BCTX_REFN_bo(nv50->bufctx_3d, 3D_VERTEX_TMP, bo_flags, bo);
@@ -317,8 +317,9 @@
       /* if vertex buffer was written by GPU - flush VBO cache */
       assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
       for (i = 0; i < nv50->num_vtxbufs; ++i) {
-         struct nv04_resource *buf = nv04_resource(nv50->vtxbuf[i].buffer);
-         if (buf && buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+         struct nv04_resource *buf = nv04_resource(nv50->vtxbuf[i].buffer.resource);
+         if (!nv50->vtxbuf[i].is_user_buffer &&
+             buf && buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
             buf->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
             nv50->base.vbo_dirty = true;
          }
@@ -386,12 +387,12 @@
          address = addrs[b] + ve->pipe.src_offset;
          limit = addrs[b] + limits[b];
       } else
-      if (!vb->buffer) {
+      if (!vb->buffer.resource) {
          BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 1);
          PUSH_DATA (push, 0);
          continue;
       } else {
-         struct nv04_resource *buf = nv04_resource(vb->buffer);
+         struct nv04_resource *buf = nv04_resource(vb->buffer.resource);
          if (!(refd & (1 << b))) {
             refd |= 1 << b;
             BCTX_REFN(nv50->bufctx_3d, 3D_VERTEX, buf, RD);
@@ -594,12 +595,13 @@
 
 static void
 nv50_draw_elements(struct nv50_context *nv50, bool shorten,
+                   const struct pipe_draw_info *info, /* carries the index buffer (resource or user pointer) */
                    unsigned mode, unsigned start, unsigned count,
-                   unsigned instance_count, int32_t index_bias)
+                   unsigned instance_count, int32_t index_bias,
+                   unsigned index_size)
 {
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    unsigned prim;
-   const unsigned index_size = nv50->idxbuf.index_size;
 
    prim = nv50_prim_gl(mode);
 
@@ -613,15 +615,15 @@
       nv50->state.index_bias = index_bias;
    }
 
-   if (nv50->idxbuf.buffer) {
-      struct nv04_resource *buf = nv04_resource(nv50->idxbuf.buffer);
+   if (!info->has_user_indices) {
+      struct nv04_resource *buf = nv04_resource(info->index.resource);
       unsigned pb_start;
       unsigned pb_bytes;
-      const unsigned base = (buf->offset + nv50->idxbuf.offset) & ~3;
+      const unsigned base = buf->offset & ~3;
 
-      start += ((buf->offset + nv50->idxbuf.offset) & 3) >> (index_size >> 1);
+      start += (buf->offset & 3) >> (index_size >> 1);
 
-      assert(nouveau_resource_mapped_by_gpu(nv50->idxbuf.buffer));
+      assert(nouveau_resource_mapped_by_gpu(info->index.resource));
 
       /* This shouldn't have to be here. The going theory is that the buffer
        * is being filled in by PGRAPH, and it's not done yet by the time it
@@ -674,7 +676,7 @@
          prim |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
       }
    } else {
-      const void *data = nv50->idxbuf.user_buffer;
+      const void *data = info->index.user;
 
       while (instance_count--) {
          BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
@@ -768,6 +770,9 @@
    bool tex_dirty = false;
    int s;
 
+   if (info->index_size && !info->has_user_indices)
+      BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(info->index.resource), RD);
+
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
    nv50->vb_elt_first = info->min_index + info->index_bias;
    nv50->vb_elt_limit = info->max_index - info->min_index;
@@ -778,7 +783,7 @@
     * if index count is larger and we expect repeated vertices, suggest upload.
     */
    nv50->vbo_push_hint = /* the 64 is heuristic */
-      !(info->indexed && ((nv50->vb_elt_limit + 64) < info->count));
+      !(info->index_size && ((nv50->vb_elt_limit + 64) < info->count));
 
    if (nv50->vbo_user && !(nv50->dirty_3d & (NV50_NEW_3D_ARRAYS | NV50_NEW_3D_VERTEX))) {
       if (!!nv50->vbo_fifo != nv50->vbo_push_hint)
@@ -832,9 +837,7 @@
 
    if (nv50->vbo_fifo) {
       nv50_push_vbo(nv50, info);
-      push->kick_notify = nv50_default_kick_notify;
-      nouveau_pushbuf_bufctx(push, NULL);
-      return;
+      goto cleanup;
    }
 
    if (nv50->state.instance_base != info->start_instance) {
@@ -852,7 +855,7 @@
       nv50->base.vbo_dirty = false;
    }
 
-   if (info->indexed) {
+   if (info->index_size) {
       bool shorten = info->max_index <= 65535;
 
       if (info->primitive_restart != nv50->state.prim_restart) {
@@ -877,9 +880,9 @@
             shorten = false;
       }
 
-      nv50_draw_elements(nv50, shorten,
+      nv50_draw_elements(nv50, shorten, info,
                          info->mode, info->start, info->count,
-                         info->instance_count, info->index_bias);
+                         info->instance_count, info->index_bias, info->index_size);
    } else
    if (unlikely(info->count_from_stream_output)) {
       nva0_draw_stream_output(nv50, info);
@@ -888,9 +891,13 @@
                        info->mode, info->start, info->count,
                        info->instance_count);
    }
+
+cleanup:
    push->kick_notify = nv50_default_kick_notify;
 
    nv50_release_user_vbufs(nv50);
 
    nouveau_pushbuf_bufctx(push, NULL);
+
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
 }
diff --git a/src/gallium/drivers/nouveau/nv_object.xml.h b/src/gallium/drivers/nouveau/nv_object.xml.h
index f200c9c..664bfae 100644
--- a/src/gallium/drivers/nouveau/nv_object.xml.h
+++ b/src/gallium/drivers/nouveau/nv_object.xml.h
@@ -203,8 +203,10 @@
 #define NVC8_COMPUTE_CLASS					0x000092c0
 #define NVE4_COMPUTE_CLASS					0x0000a0c0
 #define NVF0_COMPUTE_CLASS					0x0000a1c0
-#define GM107_COMPUTE_CLASS				0x0000b0c0
-#define GM200_COMPUTE_CLASS				0x0000b1c0
+#define GM107_COMPUTE_CLASS					0x0000b0c0
+#define GM200_COMPUTE_CLASS					0x0000b1c0
+#define GP100_COMPUTE_CLASS					0x0000c0c0
+#define GP104_COMPUTE_CLASS					0x0000c1c0
 #define NV84_CRYPT_CLASS					0x000074c1
 #define BLOB_NVC0_PCOPY1_CLASS					0x000090b8
 #define BLOB_NVC0_PCOPY0_CLASS					0x000090b5
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
index accde94..d7245fb 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
@@ -631,6 +631,8 @@
 #define NVC0_3D_UNK0F00__ESIZE					0x00000004
 #define NVC0_3D_UNK0F00__LEN					0x00000004
 
+#define NVC0_3D_POST_DEPTH_COVERAGE				0x00000f1c
+
 #define NVE4_3D_UNK0F20(i0)				       (0x00000f20 + 0x4*(i0))
 #define NVE4_3D_UNK0F20__ESIZE					0x00000004
 #define NVE4_3D_UNK0F20__LEN					0x00000005
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index d0f4da3..d5ef585 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -62,16 +62,12 @@
 
    if (flags & PIPE_BARRIER_MAPPED_BUFFER) {
       for (i = 0; i < nvc0->num_vtxbufs; ++i) {
-         if (!nvc0->vtxbuf[i].buffer)
+         if (!nvc0->vtxbuf[i].buffer.resource || nvc0->vtxbuf[i].is_user_buffer) /* user pointers are not pipe_resources */
             continue;
-         if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
+         if (nvc0->vtxbuf[i].buffer.resource->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
             nvc0->base.vbo_dirty = true;
       }
 
-      if (nvc0->idxbuf.buffer &&
-          nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-         nvc0->base.vbo_dirty = true;
-
       for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
          uint32_t valid = nvc0->constbuf_valid[s];
 
@@ -147,9 +143,7 @@
    util_unreference_framebuffer_state(&nvc0->framebuffer);
 
    for (i = 0; i < nvc0->num_vtxbufs; ++i)
-      pipe_resource_reference(&nvc0->vtxbuf[i].buffer, NULL);
-
-   pipe_resource_reference(&nvc0->idxbuf.buffer, NULL);
+      pipe_vertex_buffer_unreference(&nvc0->vtxbuf[i]);
 
    for (s = 0; s < 6; ++s) {
       for (i = 0; i < nvc0->num_textures[s]; ++i)
@@ -260,7 +254,7 @@
 
    if (res->target == PIPE_BUFFER) {
       for (i = 0; i < nvc0->num_vtxbufs; ++i) {
-         if (nvc0->vtxbuf[i].buffer == res) {
+         if (nvc0->vtxbuf[i].buffer.resource == res) {
             nvc0->dirty_3d |= NVC0_NEW_3D_ARRAYS;
             nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_VTX);
             if (!--ref)
@@ -268,13 +262,6 @@
          }
       }
 
-      if (nvc0->idxbuf.buffer == res) {
-         nvc0->dirty_3d |= NVC0_NEW_3D_IDXBUF;
-         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_IDX);
-         if (!--ref)
-            return ref;
-      }
-
       for (s = 0; s < 6; ++s) {
          for (i = 0; i < nvc0->num_textures[s]; ++i) {
             if (nvc0->textures[s][i] &&
@@ -474,7 +461,7 @@
 
    memset(nvc0->tex_handles, ~0, sizeof(nvc0->tex_handles));
 
-   util_dynarray_init(&nvc0->global_residents);
+   util_dynarray_init(&nvc0->global_residents, NULL);
 
    return pipe;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 79a5333..6f631b9 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -53,7 +53,7 @@
 #define NVC0_NEW_3D_TEXTURES     (1 << 19)
 #define NVC0_NEW_3D_SAMPLERS     (1 << 20)
 #define NVC0_NEW_3D_TFB_TARGETS  (1 << 21)
-#define NVC0_NEW_3D_IDXBUF       (1 << 22)
+
 #define NVC0_NEW_3D_SURFACES     (1 << 23)
 #define NVC0_NEW_3D_MIN_SAMPLES  (1 << 24)
 #define NVC0_NEW_3D_TESSFACTOR   (1 << 25)
@@ -193,7 +193,6 @@
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
    uint32_t vtxbufs_coherent;
-   struct pipe_index_buffer idxbuf;
    uint32_t constant_vbos;
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
    uint32_t vb_elt_first; /* from pipe_draw_info, for vertex upload */
@@ -312,6 +311,7 @@
 void nvc0_compprog_validate(struct nvc0_context *);
 
 void nvc0_tfb_validate(struct nvc0_context *);
+void nvc0_layer_validate(struct nvc0_context *);
 
 /* nvc0_state.c */
 extern void nvc0_init_state_functions(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 6cc5183..e43a8de 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -487,6 +487,7 @@
    fp->fp.early_z = info->prop.fp.earlyFragTests;
    fp->fp.sample_mask_in = info->prop.fp.usesSampleMaskIn;
    fp->fp.reads_framebuffer = info->prop.fp.readsFramebuffer;
+   fp->fp.post_depth_coverage = info->prop.fp.postDepthCoverage;
 
    /* Mark position xy and layer as read */
    if (fp->fp.reads_framebuffer)
@@ -567,7 +568,7 @@
 
    info->type = prog->type;
    info->target = chipset;
-   info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
+   info->bin.sourceRep = PIPE_SHADER_IR_TGSI;
    info->bin.source = (void *)prog->pipe.tokens;
 
 #ifdef DEBUG
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
index 421ca19..b73822e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -50,6 +50,7 @@
       bool force_persample_interp;
       bool flatshade;
       bool reads_framebuffer;
+      bool post_depth_coverage;
    } fp;
    struct {
       uint32_t tess_mode; /* ~0 if defined by the other stage */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index 0991af8..d8d82de 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -346,6 +346,7 @@
    case PIPE_QUERY_PIPELINE_STATISTICS:
       for (i = 0; i < 10; ++i)
          res64[i] = data64[i * 2] - data64[24 + i * 2];
+      result->pipeline_statistics.cs_invocations = 0;
       break;
    case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
       res32[0] = hq->data[1];
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index e3cbf55..8bbe403 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -27,9 +27,6 @@
 #include "util/u_format_s3tc.h"
 #include "pipe/p_screen.h"
 
-#include "vl/vl_decoder.h"
-#include "vl/vl_video_buffer.h"
-
 #include "nouveau_vp3_video.h"
 
 #include "nvc0/nvc0_context.h"
@@ -257,9 +254,10 @@
    case PIPE_CAP_INT64:
    case PIPE_CAP_TGSI_TEX_TXF_LZ:
    case PIPE_CAP_TGSI_CLOCK:
-      return 1;
    case PIPE_CAP_COMPUTE:
-      return (class_3d < GP100_3D_CLASS);
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+      return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
@@ -267,6 +265,9 @@
    case PIPE_CAP_TGSI_FS_FBFETCH:
       return class_3d >= NVE4_3D_CLASS; /* needs testing on fermi */
    case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
+   case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
+   case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
       return class_3d >= GM200_3D_CLASS;
    case PIPE_CAP_TGSI_BALLOT:
       return class_3d >= NVE4_3D_CLASS;
@@ -279,7 +280,6 @@
    case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
    case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
    case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
-   case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
    case PIPE_CAP_FAKE_SW_MSAA:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_VERTEXID_NOBASE:
@@ -300,7 +300,7 @@
    case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
    case PIPE_CAP_INT64_DIVMOD:
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
-   case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -396,6 +396,8 @@
       return 1;
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       return 1;
+   case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+      return 1;
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
    case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
@@ -695,9 +697,8 @@
    case 0x100:
    case 0x110:
    case 0x120:
-      return nve4_screen_compute_setup(screen, screen->base.pushbuf);
    case 0x130:
-      return 0;
+      return nve4_screen_compute_setup(screen, screen->base.pushbuf);
    default:
       return -1;
    }
@@ -917,6 +918,7 @@
    case 0x130:
       switch (dev->chipset) {
       case 0x130:
+      case 0x13b:
          obj_class = GP100_3D_CLASS;
          break;
       default:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 9e0211b..de0a02d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -55,6 +55,7 @@
    uint32_t uniform_buffer_bound[6];
    struct nvc0_transform_feedback_state *tfb;
    bool seamless_cube_map;
+   bool post_depth_coverage;
 };
 
 struct nvc0_screen {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index c644fe9..697bf49 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -146,6 +146,11 @@
       nvc0->state.early_z_forced = fp->fp.early_z;
       IMMED_NVC0(push, NVC0_3D(FORCE_EARLY_FRAGMENT_TESTS), fp->fp.early_z);
    }
+   if (fp->fp.post_depth_coverage != nvc0->state.post_depth_coverage) {
+      nvc0->state.post_depth_coverage = fp->fp.post_depth_coverage;
+      IMMED_NVC0(push, NVC0_3D(POST_DEPTH_COVERAGE),
+                 fp->fp.post_depth_coverage);
+   }
 
    BEGIN_NVC0(push, NVC0_3D(SP_SELECT(5)), 2);
    PUSH_DATA (push, 0x51);
@@ -220,18 +225,13 @@
 
    /* we allow GPs with no code for specifying stream output state only */
    if (gp && nvc0_program_validate(nvc0, gp) && gp->code_size) {
-      const bool gp_selects_layer = !!(gp->hdr[13] & (1 << 9));
-
       BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
       PUSH_DATA (push, 0x41);
       BEGIN_NVC0(push, NVC0_3D(SP_START_ID(4)), 1);
       PUSH_DATA (push, gp->code_base);
       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(4)), 1);
       PUSH_DATA (push, gp->num_gprs);
-      BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
-      PUSH_DATA (push, gp_selects_layer ? NVC0_3D_LAYER_USE_GP : 0);
    } else {
-      IMMED_NVC0(push, NVC0_3D(LAYER), 0);
       BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
       PUSH_DATA (push, 0x40);
    }
@@ -252,6 +252,24 @@
 }
 
 void
+nvc0_layer_validate(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   /* Inspect gmtyprog if bound, else tevlprog, else vertprog (may be NULL). */
+   struct nvc0_program *const stage =
+      nvc0->gmtyprog ? nvc0->gmtyprog :
+      nvc0->tevlprog ? nvc0->tevlprog : nvc0->vertprog;
+   uint32_t layer_mode = 0;
+
+   /* Bit 9 of header word 13 is treated as "this program selects the layer". */
+   if (stage && (stage->hdr[13] & (1 << 9)))
+      layer_mode = NVC0_3D_LAYER_USE_GP;
+
+   BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
+   PUSH_DATA (push, layer_mode);
+}
+
+void
 nvc0_tfb_validate(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index c51c9a7..99d45a2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -941,7 +941,7 @@
     for (i = 0; i < count; ++i) {
        unsigned dst_index = start_slot + i;
 
-       if (vb[i].user_buffer) {
+       if (vb[i].is_user_buffer) {
           nvc0->vbo_user |= 1 << dst_index;
           if (!vb[i].stride && nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
              nvc0->constant_vbos |= 1 << dst_index;
@@ -952,8 +952,8 @@
           nvc0->vbo_user &= ~(1 << dst_index);
           nvc0->constant_vbos &= ~(1 << dst_index);
 
-          if (vb[i].buffer &&
-              vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+          if (vb[i].buffer.resource &&
+              vb[i].buffer.resource->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
              nvc0->vtxbufs_coherent |= (1 << dst_index);
           else
              nvc0->vtxbufs_coherent &= ~(1 << dst_index);
@@ -962,31 +962,6 @@
 }
 
 static void
-nvc0_set_index_buffer(struct pipe_context *pipe,
-                      const struct pipe_index_buffer *ib)
-{
-    struct nvc0_context *nvc0 = nvc0_context(pipe);
-
-    if (nvc0->idxbuf.buffer)
-       nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_IDX);
-
-    if (ib) {
-       pipe_resource_reference(&nvc0->idxbuf.buffer, ib->buffer);
-       nvc0->idxbuf.index_size = ib->index_size;
-       if (ib->buffer) {
-          nvc0->idxbuf.offset = ib->offset;
-          nvc0->dirty_3d |= NVC0_NEW_3D_IDXBUF;
-       } else {
-          nvc0->idxbuf.user_buffer = ib->user_buffer;
-          nvc0->dirty_3d &= ~NVC0_NEW_3D_IDXBUF;
-       }
-    } else {
-       nvc0->dirty_3d &= ~NVC0_NEW_3D_IDXBUF;
-       pipe_resource_reference(&nvc0->idxbuf.buffer, NULL);
-    }
-}
-
-static void
 nvc0_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
 {
     struct nvc0_context *nvc0 = nvc0_context(pipe);
@@ -1426,7 +1401,6 @@
    pipe->bind_vertex_elements_state = nvc0_vertex_state_bind;
 
    pipe->set_vertex_buffers = nvc0_set_vertex_buffers;
-   pipe->set_index_buffer = nvc0_set_index_buffer;
 
    pipe->create_stream_output_target = nvc0_so_target_create;
    pipe->stream_output_target_destroy = nvc0_so_target_destroy;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 68fd730..37a6761 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -819,8 +819,6 @@
 
    if (!ctx_to->vertex)
       ctx_to->dirty_3d &= ~(NVC0_NEW_3D_VERTEX | NVC0_NEW_3D_ARRAYS);
-   if (!ctx_to->idxbuf.buffer)
-      ctx_to->dirty_3d &= ~NVC0_NEW_3D_IDXBUF;
 
    if (!ctx_to->vertprog)
       ctx_to->dirty_3d &= ~NVC0_NEW_3D_VERTPROG;
@@ -876,8 +874,10 @@
     { nvc0_vertex_arrays_validate, NVC0_NEW_3D_VERTEX | NVC0_NEW_3D_ARRAYS },
     { nvc0_validate_surfaces,      NVC0_NEW_3D_SURFACES },
     { nvc0_validate_buffers,       NVC0_NEW_3D_BUFFERS },
-    { nvc0_idxbuf_validate,        NVC0_NEW_3D_IDXBUF },
     { nvc0_tfb_validate,           NVC0_NEW_3D_TFB_TARGETS | NVC0_NEW_3D_GMTYPROG },
+    { nvc0_layer_validate,         NVC0_NEW_3D_VERTPROG |
+                                   NVC0_NEW_3D_TEVLPROG |
+                                   NVC0_NEW_3D_GMTYPROG },
     { nvc0_validate_driverconst,   NVC0_NEW_3D_DRIVERCONST },
 };
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
index 14fb53c..225b894 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -112,13 +112,27 @@
                         const struct nv50_m2mf_rect *src,
                         uint32_t nblocksx, uint32_t nblocksy)
 {
+   static const struct {
+      int cs;
+      int nc;
+   } cpbs[] = {
+      [ 1] = { 1, 1 },
+      [ 2] = { 1, 2 },
+      [ 3] = { 1, 3 },
+      [ 4] = { 1, 4 },
+      [ 6] = { 2, 3 },
+      [ 8] = { 2, 4 },
+      [ 9] = { 3, 3 },
+      [12] = { 3, 4 },
+      [16] = { 4, 4 },
+   };
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nouveau_bufctx *bctx = nvc0->bufctx;
    uint32_t exec;
    uint32_t src_base = src->base;
    uint32_t dst_base = dst->base;
-   const int cpp = dst->cpp;
 
+   assert(dst->cpp < ARRAY_SIZE(cpbs) && cpbs[dst->cpp].cs);
    assert(dst->cpp == src->cpp);
 
    nouveau_bufctx_refn(bctx, 0, dst->bo, dst->domain | NOUVEAU_BO_WR);
@@ -126,35 +140,45 @@
    nouveau_pushbuf_bufctx(push, bctx);
    nouveau_pushbuf_validate(push);
 
-   exec = 0x200 /* 2D_ENABLE */ | 0x6 /* UNK */;
+   exec = 0x400 /* REMAP_ENABLE */ | 0x200 /* 2D_ENABLE */ | 0x6 /* UNK */;
 
-   if (!nouveau_bo_memtype(dst->bo)) {
+   BEGIN_NVC0(push, SUBC_COPY(0x0708), 1);
+   PUSH_DATA (push, (cpbs[dst->cpp].nc - 1) << 24 |
+                    (cpbs[src->cpp].nc - 1) << 20 |
+                    (cpbs[src->cpp].cs - 1) << 16 |
+                    3 << 12 /* DST_W = SRC_W */ |
+                    2 <<  8 /* DST_Z = SRC_Z */ |
+                    1 <<  4 /* DST_Y = SRC_Y */ |
+                    0 <<  0 /* DST_X = SRC_X */);
+
+   if (nouveau_bo_memtype(dst->bo)) {
+      BEGIN_NVC0(push, SUBC_COPY(0x070c), 6);
+      PUSH_DATA (push, 0x1000 | dst->tile_mode);
+      PUSH_DATA (push, dst->width);
+      PUSH_DATA (push, dst->height);
+      PUSH_DATA (push, dst->depth);
+      PUSH_DATA (push, dst->z);
+      PUSH_DATA (push, (dst->y << 16) | dst->x);
+   } else {
       assert(!dst->z);
-      dst_base += dst->y * dst->pitch + dst->x * cpp;
+      dst_base += dst->y * dst->pitch + dst->x * dst->cpp;
       exec |= 0x100; /* DST_MODE_2D_LINEAR */
    }
-   if (!nouveau_bo_memtype(src->bo)) {
+
+   if (nouveau_bo_memtype(src->bo)) {
+      BEGIN_NVC0(push, SUBC_COPY(0x0728), 6);
+      PUSH_DATA (push, 0x1000 | src->tile_mode);
+      PUSH_DATA (push, src->width);
+      PUSH_DATA (push, src->height);
+      PUSH_DATA (push, src->depth);
+      PUSH_DATA (push, src->z);
+      PUSH_DATA (push, (src->y << 16) | src->x);
+   } else {
       assert(!src->z);
-      src_base += src->y * src->pitch + src->x * cpp;
+      src_base += src->y * src->pitch + src->x * src->cpp;
       exec |= 0x080; /* SRC_MODE_2D_LINEAR */
    }
 
-   BEGIN_NVC0(push, SUBC_COPY(0x070c), 6);
-   PUSH_DATA (push, 0x1000 | dst->tile_mode);
-   PUSH_DATA (push, dst->pitch);
-   PUSH_DATA (push, dst->height);
-   PUSH_DATA (push, dst->depth);
-   PUSH_DATA (push, dst->z);
-   PUSH_DATA (push, (dst->y << 16) | (dst->x * cpp));
-
-   BEGIN_NVC0(push, SUBC_COPY(0x0728), 6);
-   PUSH_DATA (push, 0x1000 | src->tile_mode);
-   PUSH_DATA (push, src->pitch);
-   PUSH_DATA (push, src->height);
-   PUSH_DATA (push, src->depth);
-   PUSH_DATA (push, src->z);
-   PUSH_DATA (push, (src->y << 16) | (src->x * cpp));
-
    BEGIN_NVC0(push, SUBC_COPY(0x0400), 8);
    PUSH_DATAh(push, src->bo->offset + src_base);
    PUSH_DATA (push, src->bo->offset + src_base);
@@ -162,7 +186,7 @@
    PUSH_DATA (push, dst->bo->offset + dst_base);
    PUSH_DATA (push, src->pitch);
    PUSH_DATA (push, dst->pitch);
-   PUSH_DATA (push, nblocksx * cpp);
+   PUSH_DATA (push, nblocksx);
    PUSH_DATA (push, nblocksy);
 
    BEGIN_NVC0(push, SUBC_COPY(0x0300), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 9a3eb06..63dcced 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -176,8 +176,8 @@
    uint32_t mode;
    const struct util_format_description *desc;
    void *dst;
-   const void *src = (const uint8_t *)vb->user_buffer + ve->src_offset;
-   assert(!vb->buffer);
+   const void *src = (const uint8_t *)vb->buffer.user + ve->src_offset;
+   assert(vb->is_user_buffer);
 
    desc = util_format_description(ve->src_format);
 
@@ -254,7 +254,7 @@
          struct nouveau_bo *bo;
          const uint32_t bo_flags = NOUVEAU_BO_RD | NOUVEAU_BO_GART;
          written |= 1 << b;
-         address[b] = nouveau_scratch_data(&nvc0->base, vb->user_buffer,
+         address[b] = nouveau_scratch_data(&nvc0->base, vb->buffer.user,
                                            base, size, &bo);
          if (bo)
             BCTX_REFN_bo(nvc0->bufctx_3d, 3D_VTX_TMP, bo_flags, bo);
@@ -289,7 +289,7 @@
 
       nvc0_user_vbuf_range(nvc0, b, &base, &size);
 
-      address = nouveau_scratch_data(&nvc0->base, nvc0->vtxbuf[b].user_buffer,
+      address = nouveau_scratch_data(&nvc0->base, nvc0->vtxbuf[b].buffer.user,
                                      base, size, &bo);
       if (bo)
          BCTX_REFN_bo(nvc0->bufctx_3d, 3D_VTX_TMP, bo_flags, bo);
@@ -346,9 +346,9 @@
          /* address/value set in nvc0_update_user_vbufs */
          continue;
       }
-      res = nv04_resource(vb->buffer);
+      res = nv04_resource(vb->buffer.resource);
       offset = ve->pipe.src_offset + vb->buffer_offset;
-      limit = vb->buffer->width0 - 1;
+      limit = vb->buffer.resource->width0 - 1;
 
       if (unlikely(ve->pipe.instance_divisor)) {
          BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 4);
@@ -395,12 +395,12 @@
          }
          /* address/value set in nvc0_update_user_vbufs_shared */
          continue;
-      } else if (!vb->buffer) {
+      } else if (!vb->buffer.resource) {
          /* there can be holes in the vertex buffer lists */
          IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(b)), 0);
          continue;
       }
-      buf = nv04_resource(vb->buffer);
+      buf = nv04_resource(vb->buffer.resource);
       offset = vb->buffer_offset;
       limit = buf->base.width0 - 1;
 
@@ -522,26 +522,6 @@
       nvc0_validate_vertex_buffers(nvc0);
 }
 
-void
-nvc0_idxbuf_validate(struct nvc0_context *nvc0)
-{
-   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   struct nv04_resource *buf = nv04_resource(nvc0->idxbuf.buffer);
-
-   assert(buf);
-   assert(nouveau_resource_mapped_by_gpu(&buf->base));
-
-   PUSH_SPACE(push, 6);
-   BEGIN_NVC0(push, NVC0_3D(INDEX_ARRAY_START_HIGH), 5);
-   PUSH_DATAh(push, buf->address + nvc0->idxbuf.offset);
-   PUSH_DATA (push, buf->address + nvc0->idxbuf.offset);
-   PUSH_DATAh(push, buf->address + buf->base.width0 - 1);
-   PUSH_DATA (push, buf->address + buf->base.width0 - 1);
-   PUSH_DATA (push, nvc0->idxbuf.index_size >> 1);
-
-   BCTX_REFN(nvc0->bufctx_3d, 3D_IDX, buf, RD);
-}
-
 #define NVC0_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
 
@@ -588,7 +568,7 @@
    unsigned prim;
 
    if (nvc0->state.index_bias) {
-      /* index_bias is implied 0 if !info->indexed (really ?) */
+      /* index_bias is implied 0 if !info->index_size (really ?) */
       /* TODO: can we deactivate it for the VERTEX_BUFFER_FIRST command ? */
       PUSH_SPACE(push, 2);
       IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 0);
@@ -711,12 +691,13 @@
 
 static void
 nvc0_draw_elements(struct nvc0_context *nvc0, bool shorten,
+                   const struct pipe_draw_info *info,
                    unsigned mode, unsigned start, unsigned count,
-                   unsigned instance_count, int32_t index_bias)
+                   unsigned instance_count, int32_t index_bias,
+                   unsigned index_size)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    unsigned prim;
-   const unsigned index_size = nvc0->idxbuf.index_size;
 
    prim = nvc0_prim_gl(mode);
 
@@ -729,7 +710,7 @@
       nvc0->state.index_bias = index_bias;
    }
 
-   if (nvc0->idxbuf.buffer) {
+   if (!info->has_user_indices) {
       PUSH_SPACE(push, 1);
       IMMED_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), prim);
       do {
@@ -745,7 +726,7 @@
       } while (instance_count);
       IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0);
    } else {
-      const void *data = nvc0->idxbuf.user_buffer;
+      const void *data = info->index.user;
 
       while (instance_count--) {
          PUSH_SPACE(push, 2);
@@ -818,10 +799,10 @@
 nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   struct nv04_resource *buf = nv04_resource(info->indirect);
-   struct nv04_resource *buf_count = nv04_resource(info->indirect_params);
-   unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
-   uint32_t offset = buf->offset + info->indirect_offset;
+   struct nv04_resource *buf = nv04_resource(info->indirect->buffer);
+   struct nv04_resource *buf_count = nv04_resource(info->indirect->indirect_draw_count);
+   unsigned size, macro, count = info->indirect->draw_count, drawid = info->drawid;
+   uint32_t offset = buf->offset + info->indirect->offset;
    struct nvc0_screen *screen = nvc0->screen;
 
    PUSH_SPACE(push, 7);
@@ -841,9 +822,9 @@
    BEGIN_NVC0(push, NVC0_3D(CB_POS), 1);
    PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
 
-   if (info->indexed) {
-      assert(nvc0->idxbuf.buffer);
-      assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
+   if (info->index_size) {
+      assert(!info->has_user_indices);
+      assert(nouveau_resource_mapped_by_gpu(info->index.resource));
       size = 5;
       if (buf_count)
          macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT;
@@ -851,7 +832,7 @@
          macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT;
    } else {
       if (nvc0->state.index_bias) {
-         /* index_bias is implied 0 if !info->indexed (really ?) */
+         /* index_bias is implied 0 if !info->index_size (really ?) */
          IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 0);
          IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
          nvc0->state.index_bias = 0;
@@ -870,7 +851,7 @@
     */
    while (count) {
       unsigned draws = count, pushes, i;
-      if (info->indirect_stride == size * 4) {
+      if (info->indirect->stride == size * 4) {
          draws = MIN2(draws, (NV04_PFIFO_MAX_PACKET_LEN - 4) / size);
          pushes = 1;
       } else {
@@ -890,20 +871,20 @@
       if (buf_count) {
          nouveau_pushbuf_data(push,
                               buf_count->bo,
-                              buf_count->offset + info->indirect_params_offset,
+                              buf_count->offset + info->indirect->indirect_draw_count_offset,
                               NVC0_IB_ENTRY_1_NO_PREFETCH | 4);
       }
       if (pushes == 1) {
          nouveau_pushbuf_data(push,
                               buf->bo, offset,
                               NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4 * draws));
-         offset += draws * info->indirect_stride;
+         offset += draws * info->indirect->stride;
       } else {
          for (i = 0; i < pushes; i++) {
             nouveau_pushbuf_data(push,
                                  buf->bo, offset,
                                  NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4));
-            offset += info->indirect_stride;
+            offset += info->indirect->stride;
          }
       }
       count -= draws;
@@ -950,7 +931,7 @@
     * if index count is larger and we expect repeated vertices, suggest upload.
     */
    nvc0->vbo_push_hint =
-      !info->indirect && info->indexed &&
+      !info->indirect && info->index_size &&
       (nvc0->vb_elt_limit >= (info->count * 2));
 
    /* Check whether we want to switch vertex-submission mode. */
@@ -974,6 +955,23 @@
       IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices);
    }
 
+   if (info->index_size && !info->has_user_indices) {
+      struct nv04_resource *buf = nv04_resource(info->index.resource);
+
+      assert(buf);
+      assert(nouveau_resource_mapped_by_gpu(&buf->base));
+
+      PUSH_SPACE(push, 6);
+      BEGIN_NVC0(push, NVC0_3D(INDEX_ARRAY_START_HIGH), 5);
+      PUSH_DATAh(push, buf->address);
+      PUSH_DATA (push, buf->address);
+      PUSH_DATAh(push, buf->address + buf->base.width0 - 1);
+      PUSH_DATA (push, buf->address + buf->base.width0 - 1);
+      PUSH_DATA (push, info->index_size >> 1);
+
+      BCTX_REFN(nvc0->bufctx_3d, 3D_IDX, buf, RD);
+   }
+
    nvc0_state_validate_3d(nvc0, ~0);
 
    if (nvc0->vertprog->vp.need_draw_parameters && !info->indirect) {
@@ -1029,9 +1027,7 @@
 
    if (nvc0->state.vbo_mode) {
       nvc0_push_vbo(nvc0, info);
-      push->kick_notify = nvc0_default_kick_notify;
-      nouveau_pushbuf_bufctx(push, NULL);
-      return;
+      goto cleanup;
    }
 
    /* space for base instance, flush, and prim restart */
@@ -1046,8 +1042,8 @@
 
    nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent;
 
-   if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer &&
-       nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+   if (!nvc0->base.vbo_dirty && info->index_size && !info->has_user_indices &&
+       info->index.resource->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
       nvc0->base.vbo_dirty = true;
 
    nvc0_update_prim_restart(nvc0, info->primitive_restart, info->restart_index);
@@ -1064,23 +1060,27 @@
    if (unlikely(info->count_from_stream_output)) {
       nvc0_draw_stream_output(nvc0, info);
    } else
-   if (info->indexed) {
+   if (info->index_size) {
       bool shorten = info->max_index <= 65535;
 
       if (info->primitive_restart && info->restart_index > 65535)
          shorten = false;
 
-      nvc0_draw_elements(nvc0, shorten,
+      nvc0_draw_elements(nvc0, shorten, info,
                          info->mode, info->start, info->count,
-                         info->instance_count, info->index_bias);
+                         info->instance_count, info->index_bias, info->index_size);
    } else {
       nvc0_draw_arrays(nvc0,
                        info->mode, info->start, info->count,
                        info->instance_count);
    }
+
+cleanup:
    push->kick_notify = nvc0_default_kick_notify;
 
    nvc0_release_user_vbufs(nvc0);
 
    nouveau_pushbuf_bufctx(push, NULL);
+
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_IDX);
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
index fd2bcbb..256e20d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -69,11 +69,11 @@
       const uint8_t *map;
       const struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[i];
 
-      if (likely(!vb->buffer))
-         map = (const uint8_t *)vb->user_buffer;
+      if (likely(vb->is_user_buffer))
+         map = (const uint8_t *)vb->buffer.user;
       else
          map = nouveau_resource_map_offset(&nvc0->base,
-            nv04_resource(vb->buffer), vb->buffer_offset, NOUVEAU_BO_RD);
+            nv04_resource(vb->buffer.resource), vb->buffer_offset, NOUVEAU_BO_RD);
 
       if (index_bias && !unlikely(nvc0->vertex->instance_bufs & (1 << i)))
          map += (intptr_t)index_bias * vb->stride;
@@ -83,14 +83,15 @@
 }
 
 static inline void
-nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0)
+nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0,
+                     const struct pipe_draw_info *info)
 {
-   if (nvc0->idxbuf.buffer) {
-      struct nv04_resource *buf = nv04_resource(nvc0->idxbuf.buffer);
-      ctx->idxbuf = nouveau_resource_map_offset(&nvc0->base,
-         buf, nvc0->idxbuf.offset, NOUVEAU_BO_RD);
+   if (!info->has_user_indices) {
+      struct nv04_resource *buf = nv04_resource(info->index.resource);
+      ctx->idxbuf = nouveau_resource_map_offset(
+            &nvc0->base, buf, 0, NOUVEAU_BO_RD);
    } else {
-      ctx->idxbuf = nvc0->idxbuf.user_buffer;
+      ctx->idxbuf = info->index.user;
    }
 }
 
@@ -101,16 +102,16 @@
    unsigned attr = nvc0->vertprog->vp.edgeflag;
    struct pipe_vertex_element *ve = &nvc0->vertex->element[attr].pipe;
    struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[ve->vertex_buffer_index];
-   struct nv04_resource *buf = nv04_resource(vb->buffer);
+   struct nv04_resource *buf = nv04_resource(vb->buffer.resource);
 
    ctx->edgeflag.stride = vb->stride;
    ctx->edgeflag.width = util_format_get_blocksize(ve->src_format);
-   if (buf) {
+   if (!vb->is_user_buffer) {
       unsigned offset = vb->buffer_offset + ve->src_offset;
       ctx->edgeflag.data = nouveau_resource_map_offset(&nvc0->base,
                            buf, offset, NOUVEAU_BO_RD);
    } else {
-      ctx->edgeflag.data = (const uint8_t *)vb->user_buffer + ve->src_offset;
+      ctx->edgeflag.data = (const uint8_t *)vb->buffer.user + ve->src_offset;
    }
 
    if (index_bias)
@@ -499,16 +500,16 @@
        */
       BEGIN_NVC0(ctx.push, NVC0_3D(PRIM_RESTART_ENABLE), 2);
       PUSH_DATA (ctx.push, 1);
-      PUSH_DATA (ctx.push, info->indexed ? 0xffffffff : info->restart_index);
+      PUSH_DATA (ctx.push, info->index_size ? 0xffffffff : info->restart_index);
    } else
    if (nvc0->state.prim_restart) {
       IMMED_NVC0(ctx.push, NVC0_3D(PRIM_RESTART_ENABLE), 0);
    }
    nvc0->state.prim_restart = info->primitive_restart;
 
-   if (info->indexed) {
-      nvc0_push_map_idxbuf(&ctx, nvc0);
-      index_size = nvc0->idxbuf.index_size;
+   if (info->index_size) {
+      nvc0_push_map_idxbuf(&ctx, nvc0, info);
+      index_size = info->index_size;
    } else {
       if (unlikely(info->count_from_stream_output)) {
          struct pipe_context *pipe = &nvc0->base.pipe;
@@ -583,10 +584,10 @@
       IMMED_NVC0(ctx.push, NVC0_3D(VERTEX_ARRAY_FETCH(1)), 0);
    }
 
-   if (info->indexed)
-      nouveau_resource_unmap(nv04_resource(nvc0->idxbuf.buffer));
+   if (info->index_size && !info->has_user_indices)
+      nouveau_resource_unmap(nv04_resource(info->index.resource));
    for (i = 0; i < nvc0->num_vtxbufs; ++i)
-      nouveau_resource_unmap(nv04_resource(nvc0->vtxbuf[i].buffer));
+      nouveau_resource_unmap(nv04_resource(nvc0->vtxbuf[i].buffer.resource));
 
    NOUVEAU_DRV_STAT(&nvc0->screen->base, draw_calls_fallback_count, 1);
 }
@@ -626,7 +627,7 @@
    uint64_t va;
    uint32_t *data;
    uint32_t format;
-   unsigned index_size = nvc0->idxbuf.index_size;
+   unsigned index_size = info->index_size;
    unsigned i;
    unsigned a = nvc0->vertex->num_elements;
 
@@ -639,11 +640,11 @@
                 bo);
    nouveau_pushbuf_validate(push);
 
-   if (info->indexed) {
+   if (info->index_size) {
       if (!info->index_bias) {
          memcpy(data, ctx->idxbuf, info->count * index_size);
       } else {
-         switch (nvc0->idxbuf.index_size) {
+         switch (info->index_size) {
          case 1:
             copy_indices_u8(data, ctx->idxbuf, info->index_bias, info->count);
             break;
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index 798761d..bc5d9e0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -29,6 +29,7 @@
 
 #ifdef DEBUG
 static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
+static void gp100_compute_dump_launch_desc(const struct gp100_cp_launch_desc *);
 #endif
 
 
@@ -57,6 +58,9 @@
    case 0x120:
       obj_class = GM200_COMPUTE_CLASS;
       break;
+   case 0x130:
+      obj_class = dev->chipset == 0x130 ? GP100_COMPUTE_CLASS : GP104_COMPUTE_CLASS;
+      break;
    default:
       NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
       return -1;
@@ -585,7 +589,45 @@
                               NVC0_CB_AUX_INFO(5), 1 << 11);
 }
 
-static inline struct nve4_cp_launch_desc *
+static void
+gp100_compute_setup_launch_desc(struct nvc0_context *nvc0,
+                                struct gp100_cp_launch_desc *desc,
+                                const struct pipe_grid_info *info)
+{
+   const struct nvc0_screen *screen = nvc0->screen;
+   const struct nvc0_program *cp = nvc0->compprog;
+
+   gp100_cp_launch_desc_init_default(desc); /* zero + fixed default words */
+
+   desc->entry = nvc0_program_symbol_offset(cp, info->pc);
+
+   desc->griddim_x = info->grid[0];
+   desc->griddim_y = info->grid[1];
+   desc->griddim_z = info->grid[2];
+   desc->blockdim_x = info->block[0];
+   desc->blockdim_y = info->block[1];
+   desc->blockdim_z = info->block[2];
+
+   desc->shared_size = align(cp->cp.smem_size, 0x100);
+   desc->local_size_p = (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10);
+   desc->local_size_n = 0;
+   desc->cstack_size = 0x800;
+
+   desc->gpr_alloc = cp->num_gprs;
+   desc->bar_alloc = cp->num_barriers;
+
+   // Only user uniforms (cb 0) and the driver constant buffer (cb 7) are
+   // bound through the launch descriptor; UBOs are attached to the driver cb
+   // to avoid the limitation of 8 CBs.
+   if (nvc0->constbuf[5][0].user || cp->parm_size) {
+      gp100_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
+                                  NVC0_CB_USR_INFO(5), 1 << 16);
+   }
+   gp100_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
+                               NVC0_CB_AUX_INFO(5), 1 << 11);
+}
+
+static inline void *
 nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
                                struct nouveau_bo **pbo, uint64_t *pgpuaddr)
 {
@@ -597,7 +639,28 @@
       ptr += adj;
       *pgpuaddr += adj;
    }
-   return (struct nve4_cp_launch_desc *)ptr;
+   return ptr;
+}
+
+static void
+nve4_upload_indirect_desc(struct nouveau_pushbuf *push,
+                          struct nv04_resource *res,  uint64_t gpuaddr,
+                          uint32_t length, uint32_t bo_offset)
+{
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, gpuaddr); /* upload destination, high then low word */
+   PUSH_DATA (push, gpuaddr);
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+   PUSH_DATA (push, length); /* in bytes: length / 4 words are sent below */
+   PUSH_DATA (push, 1);
+
+   nouveau_pushbuf_space(push, 32, 0, 1);
+   PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (length / 4)); /* exec + data */
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+   nouveau_pushbuf_data(push, res->bo, bo_offset, /* data comes from res->bo */
+                        NVC0_IB_ENTRY_1_NO_PREFETCH | length);
+}
 
 void
@@ -605,7 +668,7 @@
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   struct nve4_cp_launch_desc *desc;
+   void *desc;
    uint64_t desc_gpuaddr;
    struct nouveau_bo *desc_bo;
    int ret;
@@ -622,13 +685,20 @@
    if (ret)
       goto out;
 
-   nve4_compute_setup_launch_desc(nvc0, desc, info);
+   if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
+      gp100_compute_setup_launch_desc(nvc0, desc, info);
+   else
+      nve4_compute_setup_launch_desc(nvc0, desc, info);
 
    nve4_compute_upload_input(nvc0, info);
 
 #ifdef DEBUG
-   if (debug_get_num_option("NV50_PROG_DEBUG", 0))
-      nve4_compute_dump_launch_desc(desc);
+   if (debug_get_num_option("NV50_PROG_DEBUG", 0)) {
+      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
+         gp100_compute_dump_launch_desc(desc);
+      else
+         nve4_compute_dump_launch_desc(desc);
+   }
 #endif
 
    if (unlikely(info->indirect)) {
@@ -646,35 +716,17 @@
       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
       PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
 
-      /* overwrite griddim_x and griddim_y as two 32-bits integers even
-       * if griddim_y must be a 16-bits integer */
-      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-      PUSH_DATAh(push, desc_gpuaddr + 48);
-      PUSH_DATA (push, desc_gpuaddr + 48);
-      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
-      PUSH_DATA (push, 8);
-      PUSH_DATA (push, 1);
+      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS) {
+         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 12, offset);
+      } else {
+         /* overwrite griddim_x and griddim_y as two 32-bits integers even
+          * if griddim_y must be a 16-bits integer */
+         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 8, offset);
 
-      nouveau_pushbuf_space(push, 32, 0, 1);
-      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
-
-      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
-      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
-      nouveau_pushbuf_data(push, res->bo, offset,
-                           NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
-
-      /* overwrite the 16 high bits of griddim_y with griddim_z because
-       * we need (z << 16) | x */
-      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-      PUSH_DATAh(push, desc_gpuaddr + 54);
-      PUSH_DATA (push, desc_gpuaddr + 54);
-      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
-      PUSH_DATA (push, 4);
-      PUSH_DATA (push, 1);
-      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
-      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
-      nouveau_pushbuf_data(push, res->bo, offset + 8,
-                           NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
+         /* overwrite the 16 high bits of griddim_y with griddim_z because
+          * we need (z << 16) | x */
+         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 54, 4, offset + 8);
+      }
    }
 
    /* upload descriptor and flush */
@@ -831,6 +883,53 @@
                    i, address, size, valid ? "" : "  (invalid)");
    }
 }
+
+static void
+gp100_compute_dump_launch_desc(const struct gp100_cp_launch_desc *desc)
+{
+   const uint32_t *data = (const uint32_t *)desc;
+   unsigned i;
+   bool zero = false; /* true once "..." was printed for the current zero run */
+
+   debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
+
+   for (i = 0; i < sizeof(*desc); i += 4) { /* raw dump, one 32-bit word each */
+      if (data[i / 4]) {
+         debug_printf("[%x]: 0x%08x\n", i, data[i / 4]);
+         zero = false;
+      } else
+      if (!zero) {
+         debug_printf("...\n"); /* collapse a run of zero words into one line */
+         zero = true;
+      }
+   }
+
+   debug_printf("entry = 0x%x\n", desc->entry);
+   debug_printf("grid dimensions = %ux%ux%u\n",
+                desc->griddim_x, desc->griddim_y, desc->griddim_z);
+   debug_printf("block dimensions = %ux%ux%u\n",
+                desc->blockdim_x, desc->blockdim_y, desc->blockdim_z);
+   debug_printf("s[] size: 0x%x\n", desc->shared_size);
+   debug_printf("l[] size: -0x%x / +0x%x\n",
+                desc->local_size_n, desc->local_size_p);
+   debug_printf("stack size: 0x%x\n", desc->cstack_size);
+   debug_printf("barrier count: %u\n", desc->bar_alloc);
+   debug_printf("$r count: %u\n", desc->gpr_alloc);
+   debug_printf("linked tsc: %d\n", desc->linked_tsc);
+
+   for (i = 0; i < 8; ++i) {
+      uint64_t address;
+      uint32_t size = desc->cb[i].size_sh4 << 4; /* stored in 16-byte units */
+      bool valid = !!(desc->cb_mask & (1 << i));
+
+      address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l;
+
+      if (!valid && !address && !size) /* skip completely unused slots */
+         continue;
+      debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n",
+                   i, address, size, valid ? "" : "  (invalid)");
+   }
+}
 #endif
 
 #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
index 5fe58b9..7ff6935 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
@@ -18,8 +18,8 @@
    u16 griddim_z;
    u32 unk14[3];
    u16 shared_size; /* must be aligned to 0x100 */
-   u16 unk15;
-   u16 unk16;
+   u16 unk17;
+   u16 unk18;
    u16 blockdim_x;
    u16 blockdim_y;
    u16 blockdim_z;
@@ -45,6 +45,46 @@
    u32 unk48[16];
 };
 
+struct gp100_cp_launch_desc
+{
+   u32 unk0[8];
+   u32 entry; /* set from nvc0_program_symbol_offset() */
+   u32 unk9[2];
+   u32 unk11_0      : 30;
+   u32 linked_tsc   : 1;
+   u32 unk11_31     : 1;
+   u32 griddim_x    : 31;
+   u32 unk12        : 1;
+   u16 griddim_y;
+   u16 unk13;
+   u16 griddim_z;
+   u16 unk14;
+   u32 unk15[2];
+   u32 shared_size  : 18; /* the setup code aligns this to 0x100 */
+   u32 unk17        : 14;
+   u16 unk18;
+   u16 blockdim_x;
+   u16 blockdim_y;
+   u16 blockdim_z;
+   u32 cb_mask      : 8; /* bit i is set when cb[i] has been bound */
+   u32 unk20        : 24;
+   u32 unk21[8];
+   u32 local_size_p : 24;
+   u32 unk29        : 3;
+   u32 bar_alloc    : 5; /* filled from the program's num_barriers */
+   u32 local_size_n : 24;
+   u32 gpr_alloc    : 8; /* filled from the program's num_gprs */
+   u32 cstack_size  : 24;
+   u32 unk31        : 8;
+   struct {
+      u32 address_l; /* low 32 bits of the buffer address */
+      u32 address_h : 17; /* address bits 32 and up */
+      u32 reserved  : 2;
+      u32 size_sh4  : 13; /* size in 16-byte units, rounded up */
+   } cb[8];
+   u32 unk48[16];
+};
+
 static inline void
 nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc)
 {
@@ -73,6 +113,33 @@
    desc->cb_mask |= 1 << index;
 }
 
+static inline void
+gp100_cp_launch_desc_init_default(struct gp100_cp_launch_desc *desc)
+{
+   memset(desc, 0, sizeof(*desc)); /* every field not set below starts at 0 */
+
+   desc->unk0[4]  = 0x40; /* NOTE(review): fixed value, meaning not shown here */
+   desc->unk11_0  = 0x04014000; /* NOTE(review): likewise undocumented */
+}
+
+static inline void
+gp100_cp_launch_desc_set_cb(struct gp100_cp_launch_desc *desc,
+                            unsigned index,
+                            struct nouveau_bo *bo,
+                            uint32_t base, uint32_t size)
+{
+   uint64_t address = bo->offset + base;
+
+   assert(index < 8); /* the descriptor has exactly 8 cb slots */
+   assert(!(base & 0xff)); /* base must be a multiple of 0x100 */
+
+   desc->cb[index].address_l = address; /* truncated to the low 32 bits */
+   desc->cb[index].address_h = address >> 32;
+   desc->cb[index].size_sh4 = DIV_ROUND_UP(size, 16); /* 16-byte units */
+
+   desc->cb_mask |= 1 << index; /* flag the slot as bound */
+}
+
 struct nve4_mp_trap_info {
    u32 lock;
    u32 pc;
diff --git a/src/gallium/drivers/pl111/Android.mk b/src/gallium/drivers/pl111/Android.mk
new file mode 100644
index 0000000..00a123e
--- /dev/null
+++ b/src/gallium/drivers/pl111/Android.mk
@@ -0,0 +1,39 @@
+# Copyright (C) 2014 Emil Velikov <emil.l.velikov@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+# get C_SOURCES
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+	$(C_SOURCES)
+
+LOCAL_MODULE := libmesa_pipe_pl111
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_PL111),)
+GALLIUM_TARGET_DRIVERS += pl111
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_pl111)
+endif
diff --git a/src/gallium/drivers/pl111/Automake.inc b/src/gallium/drivers/pl111/Automake.inc
new file mode 100644
index 0000000..4ecd7de
--- /dev/null
+++ b/src/gallium/drivers/pl111/Automake.inc
@@ -0,0 +1,9 @@
+if HAVE_GALLIUM_PL111
+
+TARGET_DRIVERS += pl111
+TARGET_CPPFLAGS += -DGALLIUM_PL111
+TARGET_LIB_DEPS += \
+    $(top_builddir)/src/gallium/winsys/pl111/drm/libpl111drm.la \
+    $(LIBDRM_LIBS)
+
+endif
diff --git a/src/gallium/drivers/pl111/Makefile.am b/src/gallium/drivers/pl111/Makefile.am
new file mode 100644
index 0000000..b3e95ee
--- /dev/null
+++ b/src/gallium/drivers/pl111/Makefile.am
@@ -0,0 +1,8 @@
+include $(top_srcdir)/src/gallium/Automake.inc
+
+AM_CPPFLAGS = \
+	$(GALLIUM_CFLAGS)
+
+noinst_LTLIBRARIES = libpl111.la
+
+libpl111_la_SOURCES = $(C_SOURCES)
diff --git a/src/gallium/drivers/pl111/Makefile.sources b/src/gallium/drivers/pl111/Makefile.sources
new file mode 100644
index 0000000..2039675
--- /dev/null
+++ b/src/gallium/drivers/pl111/Makefile.sources
@@ -0,0 +1,2 @@
+C_SOURCES :=
+
diff --git a/src/gallium/drivers/r300/Android.mk b/src/gallium/drivers/r300/Android.mk
index e2939ac..7b1c105 100644
--- a/src/gallium/drivers/r300/Android.mk
+++ b/src/gallium/drivers/r300/Android.mk
@@ -41,3 +41,8 @@
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
 
+ifneq ($(HAVE_GALLIUM_R300),)
+GALLIUM_TARGET_DRIVERS += r300
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_radeon)
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index 434cf38..8fda727 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -567,7 +567,7 @@
      * colorbuffers. */
 
     util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
-    util_blitter_default_src_texture(&src_templ, src, src_level);
+    util_blitter_default_src_texture(r300->blitter, &src_templ, src, src_level);
 
     layout = util_format_description(dst_templ.format)->layout;
 
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index 02af5d7..6f4231d 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -63,7 +63,7 @@
     }
 
     /* Manually-created vertex buffers. */
-    pipe_resource_reference(&r300->dummy_vb.buffer, NULL);
+    pipe_vertex_buffer_unreference(&r300->dummy_vb);
     pb_reference(&r300->vbo, NULL);
 
     r300->context.delete_depth_stencil_alpha_state(&r300->context,
@@ -468,7 +468,7 @@
         vb.height0 = 1;
         vb.depth0 = 1;
 
-        r300->dummy_vb.buffer = screen->resource_create(screen, &vb);
+        r300->dummy_vb.buffer.resource = screen->resource_create(screen, &vb);
         r300->context.set_vertex_buffers(&r300->context, 0, 1, &r300->dummy_vb);
     }
 
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 264ace5..ce1fab4 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -591,7 +591,6 @@
 
     void *dsa_decompress_zmask;
 
-    struct pipe_index_buffer index_buffer;
     struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
     unsigned nr_vertex_buffers;
     struct u_upload_mgr *uploader;
@@ -733,7 +732,7 @@
 
 /* r300_render_translate.c */
 void r300_translate_index_buffer(struct r300_context *r300,
-                                 struct pipe_index_buffer *ib,
+                                 const struct pipe_draw_info *info,
                                  struct pipe_resource **out_index_buffer,
                                  unsigned *index_size, unsigned index_offset,
                                  unsigned *start, unsigned count);
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 671aa62..63f12de 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -966,7 +966,7 @@
         }
 
         for (i = 0; i < vertex_array_count; i++) {
-            buf = r300_resource(vbuf[velem[i].vertex_buffer_index].buffer);
+            buf = r300_resource(vbuf[velem[i].vertex_buffer_index].buffer.resource);
             OUT_CS_RELOC(buf);
         }
     } else {
@@ -1018,7 +1018,7 @@
         }
 
         for (i = 0; i < vertex_array_count; i++) {
-            buf = r300_resource(vbuf[velem[i].vertex_buffer_index].buffer);
+            buf = r300_resource(vbuf[velem[i].vertex_buffer_index].buffer.resource);
             OUT_CS_RELOC(buf);
         }
     }
@@ -1381,7 +1381,7 @@
         struct pipe_resource *buf;
 
         for (; vbuf != last; vbuf++) {
-            buf = vbuf->buffer;
+            buf = vbuf->buffer.resource;
             if (!buf)
                 continue;
 
diff --git a/src/gallium/drivers/r300/r300_public.h b/src/gallium/drivers/r300/r300_public.h
index 57a69cb..d230010 100644
--- a/src/gallium/drivers/r300/r300_public.h
+++ b/src/gallium/drivers/r300/r300_public.h
@@ -8,7 +8,7 @@
 
 struct radeon_winsys;
 
-struct pipe_screen* r300_screen_create(struct radeon_winsys *rws);
+struct pipe_screen* r300_screen_create(struct radeon_winsys *rws, unsigned flags);
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index ad0f489..8eca143 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -373,7 +373,7 @@
         /* Map the buffer. */
         if (!map[vbi]) {
             map[vbi] = (uint32_t*)r300->rws->buffer_map(
-                r300_resource(vbuf->buffer)->buf,
+                r300_resource(vbuf->buffer.resource)->buf,
                 r300->cs, PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED);
             map[vbi] += (vbuf->buffer_offset / 4) + stride[i] * info->start;
         }
@@ -501,7 +501,7 @@
     const uint8_t *ptr1;
     const uint16_t *ptr2;
     const uint32_t *ptr4;
-    unsigned index_size = r300->index_buffer.index_size;
+    unsigned index_size = info->index_size;
     unsigned i, count_dwords = index_size == 4 ? info->count :
                                                  (info->count + 1) / 2;
     CS_LOCALS(r300);
@@ -519,7 +519,7 @@
 
     switch (index_size) {
     case 1:
-        ptr1 = (uint8_t*)r300->index_buffer.user_buffer;
+        ptr1 = (uint8_t*)info->index.user;
         ptr1 += info->start;
 
         OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (info->count << 16) |
@@ -543,7 +543,7 @@
         break;
 
     case 2:
-        ptr2 = (uint16_t*)r300->index_buffer.user_buffer;
+        ptr2 = (uint16_t*)info->index.user;
         ptr2 += info->start;
 
         OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (info->count << 16) |
@@ -562,7 +562,7 @@
         break;
 
     case 4:
-        ptr4 = (uint32_t*)r300->index_buffer.user_buffer;
+        ptr4 = (uint32_t*)info->index.user;
         ptr4 += info->start;
 
         OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (info->count << 16) |
@@ -584,8 +584,9 @@
                                const struct pipe_draw_info *info,
                                int instance_id)
 {
-    struct pipe_resource *indexBuffer = r300->index_buffer.buffer;
-    unsigned indexSize = r300->index_buffer.index_size;
+    struct pipe_resource *indexBuffer =
+       info->has_user_indices ? NULL : info->index.resource;
+    unsigned indexSize = info->index_size;
     struct pipe_resource* orgIndexBuffer = indexBuffer;
     unsigned start = info->start;
     unsigned count = info->count;
@@ -600,7 +601,7 @@
                               &index_offset);
     }
 
-    r300_translate_index_buffer(r300, &r300->index_buffer, &indexBuffer,
+    r300_translate_index_buffer(r300, info, &indexBuffer,
                                 &indexSize, index_offset, &start, count);
 
     /* Fallback for misaligned ushort indices. */
@@ -621,10 +622,10 @@
                                      count, (uint8_t*)ptr);
         }
     } else {
-        if (r300->index_buffer.user_buffer)
+        if (info->has_user_indices)
             r300_upload_index_buffer(r300, &indexBuffer, indexSize,
                                      &start, count,
-                                     r300->index_buffer.user_buffer);
+                                     info->index.user);
     }
 
     /* 19 dwords for emit_draw_elements. Give up if the function fails. */
@@ -741,13 +742,13 @@
       unsigned size, max_count, value;
 
       /* We're not interested in constant and per-instance attribs. */
-      if (!vb->buffer ||
+      if (!vb->buffer.resource ||
           !vb->stride ||
           velems[i].instance_divisor) {
          continue;
       }
 
-      size = vb->buffer->width0;
+      size = vb->buffer.resource->width0;
 
       /* Subtract buffer_offset. */
       value = vb->buffer_offset;
@@ -792,7 +793,7 @@
     r300_update_derived_state(r300);
 
     /* Draw. */
-    if (info.indexed) {
+    if (info.index_size) {
         unsigned max_count = r300_max_vertex_count(r300);
 
         if (!max_count) {
@@ -807,11 +808,9 @@
         }
 
         info.max_index = max_count - 1;
-        info.start += r300->index_buffer.offset / r300->index_buffer.index_size;
 
         if (info.instance_count <= 1) {
-            if (info.count <= 8 &&
-                r300->index_buffer.user_buffer) {
+            if (info.count <= 8 && info.has_user_indices) {
                 r300_draw_elements_immediate(r300, &info);
             } else {
                 r300_draw_elements(r300, &info, -1);
@@ -847,6 +846,17 @@
         return;
     }
 
+    if (!u_trim_pipe_prim(info->mode, (unsigned*)&info->count))
+       return;
+
+    if (info->index_size) {
+        draw_set_indexes(r300->draw,
+                         info->has_user_indices ?
+                             info->index.user :
+                             r300_resource(info->index.resource)->malloced_buffer,
+                         info->index_size, ~0);
+    }
+
     r300_update_derived_state(r300);
 
     draw_vbo(r300->draw, info);
diff --git a/src/gallium/drivers/r300/r300_render_translate.c b/src/gallium/drivers/r300/r300_render_translate.c
index 7800f6e..7dc49d3 100644
--- a/src/gallium/drivers/r300/r300_render_translate.c
+++ b/src/gallium/drivers/r300/r300_render_translate.c
@@ -26,7 +26,7 @@
 
 
 void r300_translate_index_buffer(struct r300_context *r300,
-                                 struct pipe_index_buffer *ib,
+                                 const struct pipe_draw_info *info,
                                  struct pipe_resource **out_buffer,
                                  unsigned *index_size, unsigned index_offset,
                                  unsigned *start, unsigned count)
@@ -41,7 +41,7 @@
                        &out_offset, out_buffer, &ptr);
 
         util_shorten_ubyte_elts_to_userptr(
-                &r300->context, ib, PIPE_TRANSFER_UNSYNCHRONIZED, index_offset,
+                &r300->context, info, PIPE_TRANSFER_UNSYNCHRONIZED, index_offset,
                 *start, count, ptr);
 
         *index_size = 2;
@@ -54,7 +54,7 @@
             u_upload_alloc(r300->uploader, 0, count * 2, 4,
                            &out_offset, out_buffer, &ptr);
 
-            util_rebuild_ushort_elts_to_userptr(&r300->context, ib,
+            util_rebuild_ushort_elts_to_userptr(&r300->context, info,
                                                 PIPE_TRANSFER_UNSYNCHRONIZED,
                                                 index_offset, *start,
                                                 count, ptr);
@@ -69,7 +69,7 @@
             u_upload_alloc(r300->uploader, 0, count * 4, 4,
                            &out_offset, out_buffer, &ptr);
 
-            util_rebuild_uint_elts_to_userptr(&r300->context, ib,
+            util_rebuild_uint_elts_to_userptr(&r300->context, info,
                                               PIPE_TRANSFER_UNSYNCHRONIZED,
                                               index_offset, *start,
                                               count, ptr);
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index addfc79..5cdb248 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -113,6 +113,7 @@
         case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
         case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
         case PIPE_CAP_CLIP_HALFZ:
+        case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
             return 1;
 
         case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
@@ -238,6 +239,9 @@
         case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
         case PIPE_CAP_TGSI_BALLOT:
         case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+        case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+        case PIPE_CAP_POST_DEPTH_COVERAGE:
+        case PIPE_CAP_BINDLESS_TEXTURE:
             return 0;
 
         /* SWTCL-only features. */
@@ -352,6 +356,7 @@
         case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
         case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
         case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+        case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
@@ -412,6 +417,7 @@
         case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
         case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
         case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+        case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
@@ -718,7 +724,7 @@
     return rws->fence_wait(rws, fence, timeout);
 }
 
-struct pipe_screen* r300_screen_create(struct radeon_winsys *rws)
+struct pipe_screen* r300_screen_create(struct radeon_winsys *rws, unsigned flags)
 {
     struct r300_screen *r300screen = CALLOC_STRUCT(r300_screen);
 
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index 95ada57..4af1c46 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -111,7 +111,7 @@
 
                 /* We changed the buffer, now we need to bind it where the old one was bound. */
                 for (i = 0; i < r300->nr_vertex_buffers; i++) {
-                    if (r300->vertex_buffer[i].buffer == &rbuf->b.b) {
+                    if (r300->vertex_buffer[i].buffer.resource == &rbuf->b.b) {
                         r300->vertex_arrays_dirty = TRUE;
                         break;
                     }
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 709cbd1..c2b9937 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -1773,47 +1773,16 @@
         return;
 
     for (i = 0; i < count; i++) {
-        if (buffers[i].user_buffer) {
+        if (buffers[i].is_user_buffer) {
             draw_set_mapped_vertex_buffer(r300->draw, start_slot + i,
-                                          buffers[i].user_buffer, ~0);
-        } else if (buffers[i].buffer) {
+                                          buffers[i].buffer.user, ~0);
+        } else if (buffers[i].buffer.resource) {
             draw_set_mapped_vertex_buffer(r300->draw, start_slot + i,
-                                          r300_resource(buffers[i].buffer)->malloced_buffer, ~0);
+                                          r300_resource(buffers[i].buffer.resource)->malloced_buffer, ~0);
         }
     }
 }
 
-static void r300_set_index_buffer_hwtcl(struct pipe_context* pipe,
-                                        const struct pipe_index_buffer *ib)
-{
-    struct r300_context* r300 = r300_context(pipe);
-
-    if (ib) {
-        pipe_resource_reference(&r300->index_buffer.buffer, ib->buffer);
-        memcpy(&r300->index_buffer, ib, sizeof(*ib));
-    } else {
-        pipe_resource_reference(&r300->index_buffer.buffer, NULL);
-    }
-}
-
-static void r300_set_index_buffer_swtcl(struct pipe_context* pipe,
-                                        const struct pipe_index_buffer *ib)
-{
-    struct r300_context* r300 = r300_context(pipe);
-
-    if (ib) {
-        const void *buf = NULL;
-        if (ib->user_buffer) {
-            buf = ib->user_buffer;
-        } else if (ib->buffer) {
-            buf = r300_resource(ib->buffer)->malloced_buffer;
-        }
-        draw_set_indexes(r300->draw,
-                         (const ubyte *) buf + ib->offset,
-                         ib->index_size, ~0);
-    }
-}
-
 /* Initialize the PSC tables. */
 static void r300_vertex_psc(struct r300_vertex_element_state *velems)
 {
@@ -2125,10 +2094,8 @@
 
     if (r300->screen->caps.has_tcl) {
         r300->context.set_vertex_buffers = r300_set_vertex_buffers_hwtcl;
-        r300->context.set_index_buffer = r300_set_index_buffer_hwtcl;
     } else {
         r300->context.set_vertex_buffers = r300_set_vertex_buffers_swtcl;
-        r300->context.set_index_buffer = r300_set_index_buffer_swtcl;
     }
 
     r300->context.create_vertex_elements_state = r300_create_vertex_elements_state;
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index c202fbe..cdf9ccb 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -1119,7 +1119,7 @@
     /* Create the backing buffer if needed. */
     if (!tex->buf) {
         tex->buf = rws->buffer_create(rws, tex->tex.size_in_bytes, 2048,
-                                      tex->domain, RADEON_FLAG_HANDLE);
+                                      tex->domain, RADEON_FLAG_NO_SUBALLOC);
 
         if (!tex->buf) {
             goto fail;
diff --git a/src/gallium/drivers/r600/Android.mk b/src/gallium/drivers/r600/Android.mk
index 7be3614..1683cfa 100644
--- a/src/gallium/drivers/r600/Android.mk
+++ b/src/gallium/drivers/r600/Android.mk
@@ -30,12 +30,28 @@
 
 LOCAL_SRC_FILES := $(C_SOURCES) $(CXX_SOURCES)
 
-LOCAL_C_INCLUDES := $(MESA_TOP)/src/amd/common \
-	external/libcxx/include
+LOCAL_C_INCLUDES += $(MESA_TOP)/src/amd/common
 
-LOCAL_STATIC_LIBRARIES := libmesa_amd_common
 LOCAL_SHARED_LIBRARIES := libdrm_radeon
 LOCAL_MODULE := libmesa_pipe_r600
 
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+intermediates := $(call local-generated-sources-dir)
+
+LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, $(R600_GENERATED_FILES))
+
+$(intermediates)/egd_tables.h: $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.py $(MESA_TOP)/src/gallium/drivers/r600/evergreend.h
+	@mkdir -p $(dir $@)
+	@echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+	$(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.py $(MESA_TOP)/src/gallium/drivers/r600/evergreend.h > $@
+
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_R600),)
+GALLIUM_TARGET_DRIVERS += r600
+$(eval GALLIUM_LIBS += \
+	$(LOCAL_MODULE) \
+	libmesa_winsys_radeon)
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/drivers/r600/Automake.inc b/src/gallium/drivers/r600/Automake.inc
index fa45735..bb9f6ec 100644
--- a/src/gallium/drivers/r600/Automake.inc
+++ b/src/gallium/drivers/r600/Automake.inc
@@ -5,17 +5,12 @@
 TARGET_LIB_DEPS += \
 	$(top_builddir)/src/gallium/drivers/r600/libr600.la \
 	$(RADEON_LIBS) \
-	$(LIBDRM_LIBS)
+	$(LIBDRM_LIBS) \
+	$(LIBELF_LIBS)
 
 TARGET_RADEON_WINSYS = \
 	$(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la
 
 TARGET_RADEON_COMMON = \
 	$(top_builddir)/src/gallium/drivers/radeon/libradeon.la
-
-if HAVE_GALLIUM_LLVM
-TARGET_RADEON_COMMON += \
-	$(top_builddir)/src/amd/common/libamd_common.la
-endif
-
 endif
diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am
index 21762d8..36d240d 100644
--- a/src/gallium/drivers/r600/Makefile.am
+++ b/src/gallium/drivers/r600/Makefile.am
@@ -1,14 +1,21 @@
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
+egd_tables.h: $(srcdir)/egd_tables.py $(srcdir)/evergreend.h
+	$(AM_V_at)$(MKDIR_P) $(@D)
+	$(AM_V_GEN) $(PYTHON2) $(srcdir)/egd_tables.py $(srcdir)/evergreend.h > $@
+
+BUILT_SOURCES = $(R600_GENERATED_FILES)
 AM_CFLAGS = \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(RADEON_CFLAGS) \
+	$(LIBELF_CFLAGS) \
 	-I$(top_srcdir)/src/amd/common
 
 AM_CXXFLAGS = \
 	$(GALLIUM_DRIVER_CXXFLAGS) \
 	$(RADEON_CFLAGS) \
+	$(LIBELF_CFLAGS) \
 	-I$(top_srcdir)/src/amd/common
 
 noinst_LTLIBRARIES = libr600.la
@@ -30,6 +37,10 @@
 	-DHAVE_OPENCL
 endif
 
+CLEANFILES = \
+	egd_tables.h
+
 EXTRA_DIST = \
+	egd_tables.py \
 	sb/notes.markdown \
 	sb/sb_bc_fmt_def.inc
diff --git a/src/gallium/drivers/r600/Makefile.sources b/src/gallium/drivers/r600/Makefile.sources
index 8bf8083..2f20652 100644
--- a/src/gallium/drivers/r600/Makefile.sources
+++ b/src/gallium/drivers/r600/Makefile.sources
@@ -2,6 +2,7 @@
 	compute_memory_pool.c \
 	compute_memory_pool.h \
 	eg_asm.c \
+	eg_debug.c \
 	eg_sq.h \
 	evergreen_compute.c \
 	evergreen_compute.h \
@@ -64,3 +65,6 @@
 	sb/sb_shader.h \
 	sb/sb_ssa_builder.cpp \
 	sb/sb_valtable.cpp
+
+R600_GENERATED_FILES = \
+	egd_tables.h
\ No newline at end of file
diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index 46683c1..6840cf6 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -196,7 +196,12 @@
 
 int eg_bytecode_gds_build(struct r600_bytecode *bc, struct r600_bytecode_gds *gds, unsigned id)
 {
-	unsigned opcode = r600_isa_fetch_opcode(bc->isa->hw_class, gds->op) >> 8;
+	unsigned gds_op = (r600_isa_fetch_opcode(bc->isa->hw_class, gds->op) >> 8) & 0x3f;
+	unsigned opcode;
+	if (gds->op == FETCH_OP_TF_WRITE)
+		opcode = 5;
+	else
+		opcode = 4;
 	bc->bytecode[id++] = S_SQ_MEM_GDS_WORD0_MEM_INST(2) |
 		S_SQ_MEM_GDS_WORD0_MEM_OP(opcode) |
 		S_SQ_MEM_GDS_WORD0_SRC_GPR(gds->src_gpr) |
@@ -207,8 +212,12 @@
 
 	bc->bytecode[id++] = S_SQ_MEM_GDS_WORD1_DST_GPR(gds->dst_gpr) |
 		S_SQ_MEM_GDS_WORD1_DST_REL(gds->dst_rel) |
-		S_SQ_MEM_GDS_WORD1_GDS_OP(gds->gds_op) |
-		S_SQ_MEM_GDS_WORD1_SRC_GPR(gds->src_gpr2);
+		S_SQ_MEM_GDS_WORD1_GDS_OP(gds_op) |
+		S_SQ_MEM_GDS_WORD1_SRC_GPR(gds->src_gpr2) |
+		S_SQ_MEM_GDS_WORD1_UAV_INDEX_MODE(gds->uav_index_mode) |
+		S_SQ_MEM_GDS_WORD1_UAV_ID(gds->uav_id) |
+		S_SQ_MEM_GDS_WORD1_ALLOC_CONSUME(gds->alloc_consume) |
+		S_SQ_MEM_GDS_WORD1_BCAST_FIRST_REQ(gds->bcast_first_req);
 
 	bc->bytecode[id++] = S_SQ_MEM_GDS_WORD2_DST_SEL_X(gds->dst_sel_x) |
 		S_SQ_MEM_GDS_WORD2_DST_SEL_Y(gds->dst_sel_y) |
diff --git a/src/gallium/drivers/r600/eg_debug.c b/src/gallium/drivers/r600/eg_debug.c
new file mode 100644
index 0000000..32a4f23
--- /dev/null
+++ b/src/gallium/drivers/r600/eg_debug.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+#include "r600_pipe.h"
+#include "evergreend.h"
+
+#include "egd_tables.h"
+
+#define AC_IS_TRACE_POINT(x)            (((x) & 0xcafe0000) == 0xcafe0000)
+#define AC_GET_TRACE_POINT_ID(x)        ((x) & 0xffff)
+
+/* Parsed IBs are difficult to read without colors. Use "less -R file" to
+ * read them, or use "aha -b -f file" to convert them to html.
+ */
+#define COLOR_RESET	"\033[0m"
+#define COLOR_RED	"\033[31m"
+#define COLOR_GREEN	"\033[1;32m"
+#define COLOR_YELLOW	"\033[1;33m"
+#define COLOR_CYAN	"\033[1;36m"
+
+#define INDENT_PKT 8
+
+typedef void *(*ac_debug_addr_callback)(void *data, uint64_t addr);
+static void print_spaces(FILE *f, unsigned num)
+{
+	fprintf(f, "%*s", (int)num, ""); /* pad with num spaces; '*' takes an int width */
+}
+
+static void print_value(FILE *file, uint32_t value, int bits)
+{
+	const int hex_digits = bits / 4; /* zero-padded hex width passed to "%0*x" */
+
+	/* Guess whether the dword holds an integer or a float. */
+	if (value <= 9) {
+		fprintf(file, "%u\n", value);
+	} else if (value <= (1 << 15)) {
+		fprintf(file, "%u (0x%0*x)\n", value, hex_digits, value);
+	} else {
+		const float as_float = uif(value);
+		const bool one_decimal = fabs(as_float) < 100000 && as_float*10 == floor(as_float*10);
+		if (one_decimal)
+			fprintf(file, "%.1ff (0x%0*x)\n", as_float, hex_digits, value);
+		else /* Don't print more leading zeros than there are bits. */
+			fprintf(file, "0x%0*x\n", hex_digits, value);
+	}
+}
+
+static void print_named_value(FILE *file, const char *name, uint32_t value,
+			      int bits)
+{
+	/* Indented, highlighted "NAME <- " prefix, followed by the formatted value. */
+	fprintf(file, "%*s" COLOR_YELLOW "%s" COLOR_RESET " <- ", INDENT_PKT, "", name);
+	print_value(file, value, bits);
+}
+
+/* Print register "offset" with "value" to "file", decoded through the egd_* tables; only fields whose mask intersects "field_mask" are shown. */
+static void eg_dump_reg(FILE *file, unsigned offset, uint32_t value,
+			uint32_t field_mask)
+{
+	int r, f;
+
+	for (r = 0; r < ARRAY_SIZE(egd_reg_table); r++) {
+		const struct eg_reg *reg = &egd_reg_table[r];
+		const char *reg_name = egd_strings + reg->name_offset;
+
+		if (reg->offset == offset) {
+			bool first_field = true;
+
+			print_spaces(file, INDENT_PKT);
+			fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ",
+				reg_name);
+
+			if (!reg->num_fields) { /* no field table: print the whole dword */
+				print_value(file, value, 32);
+				return;
+			}
+
+			for (f = 0; f < reg->num_fields; f++) {
+				const struct eg_field *field = egd_fields_table + reg->fields_offset + f;
+				const int *values_offsets = egd_strings_offsets + field->values_offset;
+				uint32_t val = (value & field->mask) >>
+					       (ffs(field->mask) - 1); /* shift down to the mask's lowest set bit */
+
+				if (!(field->mask & field_mask))
+					continue;
+
+				/* Indent every field after the first so it lines up past "NAME <- ". */
+				if (!first_field)
+					print_spaces(file,
+						     INDENT_PKT + strlen(reg_name) + 4);
+
+				/* Print the field name, then its symbolic value when the table has one. */
+				fprintf(file, "%s = ", egd_strings + field->name_offset);
+
+				if (val < field->num_values && values_offsets[val] >= 0)
+					fprintf(file, "%s\n", egd_strings + values_offsets[val]);
+				else
+					print_value(file, val,
+						    util_bitcount(field->mask));
+
+				first_field = false;
+			}
+			return;
+		}
+	}
+
+	print_spaces(file, INDENT_PKT); /* no table entry matched: dump raw offset and value */
+	fprintf(file, COLOR_YELLOW "0x%05x" COLOR_RESET " <- 0x%08x\n", offset, value);
+}
+
+static void ac_parse_set_reg_packet(FILE *f, uint32_t *ib, unsigned count,
+				    unsigned reg_offset)
+{
+	/* ib[1] << 2 plus reg_offset is the first register; ib[2..] holds one value per register. */
+	unsigned reg = (ib[1] << 2) + reg_offset;
+	const uint32_t *val = ib + 2, *end = val + count;
+	for (; val != end; val++, reg += 4)
+		eg_dump_reg(f, reg, *val, ~0);
+}
+
+/* Print one type-3 packet starting at ib[0], then return the pointer just past
+ * it and lower *num_dw by its size; chip_class and addr_callback* are unused. */
+static uint32_t *ac_parse_packet3(FILE *f, uint32_t *ib, int *num_dw,
+				  int trace_id, enum chip_class chip_class,
+				  ac_debug_addr_callback addr_callback,
+				  void *addr_callback_data)
+{
+	unsigned count = PKT_COUNT_G(ib[0]);
+	unsigned op = PKT3_IT_OPCODE_G(ib[0]);
+	const char *predicate = PKT3_PREDICATE(ib[0]) ? "(predicate)" : "";
+	int i;
+
+	/* Print the name first. */
+	for (i = 0; i < ARRAY_SIZE(packet3_table); i++)
+		if (packet3_table[i].op == op)
+			break;
+	if (i < ARRAY_SIZE(packet3_table)) {
+		const char *name = egd_strings + packet3_table[i].name_offset;
+
+		if (op == PKT3_SET_CONTEXT_REG ||
+		    op == PKT3_SET_CONFIG_REG ||
+		    op == PKT3_SET_UCONFIG_REG ||
+		    op == PKT3_SET_SH_REG)
+			fprintf(f, COLOR_CYAN "%s%s" COLOR_RESET ":\n",
+				name, predicate);
+		else
+			fprintf(f, COLOR_GREEN "%s%s" COLOR_RESET ":\n",
+				name, predicate);
+	} else
+		fprintf(f, COLOR_RED "PKT3_UNKNOWN 0x%x%s" COLOR_RESET ":\n",
+			op, predicate);
+	/* Print the contents. */
+	switch (op) {
+	case PKT3_SET_CONTEXT_REG:
+		ac_parse_set_reg_packet(f, ib, count, EVERGREEN_CONTEXT_REG_OFFSET);
+		break;
+	case PKT3_SET_CONFIG_REG:
+		ac_parse_set_reg_packet(f, ib, count, EVERGREEN_CONFIG_REG_OFFSET);
+		break;
+	case PKT3_SURFACE_SYNC:
+		eg_dump_reg(f, R_0085F0_CP_COHER_CNTL, ib[1], ~0);
+		eg_dump_reg(f, R_0085F4_CP_COHER_SIZE, ib[2], ~0);
+		eg_dump_reg(f, R_0085F8_CP_COHER_BASE, ib[3], ~0);
+		print_named_value(f, "POLL_INTERVAL", ib[4], 16);
+		break;
+	case PKT3_EVENT_WRITE:
+		/* TODO dump VGT_EVENT_INITIATOR */
+#if 0
+		eg_dump_reg(f, R_028A90_VGT_EVENT_INITIATOR, ib[1],
+			    S_028A90_EVENT_TYPE(~0));
+#endif
+		print_named_value(f, "EVENT_INDEX", (ib[1] >> 8) & 0xf, 4);
+		print_named_value(f, "INV_L2", (ib[1] >> 20) & 0x1, 1);
+		if (count > 0) {
+			print_named_value(f, "ADDRESS_LO", ib[2], 32);
+			print_named_value(f, "ADDRESS_HI", ib[3], 16);
+		}
+		break;
+	case PKT3_DRAW_INDEX_AUTO:
+		eg_dump_reg(f, R_008970_VGT_NUM_INDICES, ib[1], ~0);
+		eg_dump_reg(f, R_0287F0_VGT_DRAW_INITIATOR, ib[2], ~0);
+		break;
+	case PKT3_DRAW_INDEX_2:
+		eg_dump_reg(f, R_028A78_VGT_DMA_MAX_SIZE, ib[1], ~0);
+		eg_dump_reg(f, R_0287E8_VGT_DMA_BASE, ib[2], ~0);
+		eg_dump_reg(f, R_0287E4_VGT_DMA_BASE_HI, ib[3], ~0);
+		eg_dump_reg(f, R_008970_VGT_NUM_INDICES, ib[4], ~0);
+		eg_dump_reg(f, R_0287F0_VGT_DRAW_INITIATOR, ib[5], ~0);
+		break;
+	case PKT3_INDEX_TYPE:
+		eg_dump_reg(f, R_028A7C_VGT_DMA_INDEX_TYPE, ib[1], ~0);
+		break;
+	case PKT3_NUM_INSTANCES:
+		eg_dump_reg(f, R_028A88_VGT_NUM_INSTANCES, ib[1], ~0);
+		break;
+	case PKT3_INDIRECT_BUFFER:
+		break;
+	case PKT3_PFP_SYNC_ME:
+		break;
+	case PKT3_NOP:
+		if (ib[0] == 0xffff1000) {
+			count = -1; /* One dword NOP: "count + 2" below wraps to 1. */
+			break;
+		} else if (count == 0 && AC_IS_TRACE_POINT(ib[1])) {
+			unsigned packet_id = AC_GET_TRACE_POINT_ID(ib[1]);
+
+			print_spaces(f, INDENT_PKT);
+			/* Reset the color here: the early break below prints nothing more. */
+			fprintf(f, COLOR_RED "Trace point ID: %u" COLOR_RESET "\n", packet_id);
+			if (trace_id == -1)
+				break; /* tracing was disabled */
+
+			print_spaces(f, INDENT_PKT);
+			if (packet_id < trace_id)
+				fprintf(f, COLOR_RED
+					"This trace point was reached by the CP."
+					COLOR_RESET "\n");
+			else if (packet_id == trace_id)
+				fprintf(f, COLOR_RED
+					"!!!!! This is the last trace point that "
+					"was reached by the CP !!!!!"
+					COLOR_RESET "\n");
+			else if (packet_id+1 == trace_id)
+				fprintf(f, COLOR_RED
+					"!!!!! This is the first trace point that "
+					"was NOT been reached by the CP !!!!!"
+					COLOR_RESET "\n");
+			else
+				fprintf(f, COLOR_RED
+					"!!!!! This trace point was NOT reached "
+					"by the CP !!!!!"
+					COLOR_RESET "\n");
+			break;
+		}
+		/* fall through, print all dwords */
+	default:
+		for (i = 0; i < count+1; i++) {
+			print_spaces(f, INDENT_PKT);
+			fprintf(f, "0x%08x\n", ib[1+i]);
+		}
+	}
+
+	ib += count + 2;
+	*num_dw -= count + 2;
+	return ib;
+}
+
+/**
+ * Parse and print an IB into a file.
+ *
+ * \param f		file
+ * \param ib		IB
+ * \param num_dw	size of the IB in dwords
+ * \param trace_id	the last trace ID that is known to have been reached
+ *			and executed by the CP, typically read from a buffer
+ * \param name		label printed in the begin/end banner lines
+ * \param chip_class	chip class
+ * \param addr_callback Get a mapped pointer of the IB at a given address. Can be NULL.
+ * \param addr_callback_data user data for addr_callback
+ */
+static void eg_parse_ib(FILE *f, uint32_t *ib, int num_dw, int trace_id,
+			const char *name, enum chip_class chip_class,
+			ac_debug_addr_callback addr_callback, void *addr_callback_data)
+{
+	fprintf(f, "------------------ %s begin ------------------\n", name);
+
+	while (num_dw > 0) {
+		unsigned type = PKT_TYPE_G(ib[0]);
+
+		switch (type) {
+		case 3:
+			ib = ac_parse_packet3(f, ib, &num_dw, trace_id,
+					      chip_class, addr_callback,
+					      addr_callback_data);
+			break;
+		case 2:
+			/* type-2 nop */
+			if (ib[0] == 0x80000000) {
+				fprintf(f, COLOR_GREEN "NOP (type 2)" COLOR_RESET "\n");
+				ib++;
+				num_dw--;
+				break;
+			}
+			/* fall through */
+		default:
+			/* Stop at the first packet type we cannot decode. */
+			fprintf(f, "Unknown packet type %u\n", type);
+			return;
+		}
+	}
+
+	fprintf(f, "------------------- %s end -------------------\n", name);
+	if (num_dw < 0) {
+		/* Report the overrun in the dump itself and keep the process alive. */
+		fprintf(f, "Packet ends after the end of IB.\n");
+	}
+	fprintf(f, "\n");
+}
+
+static void eg_dump_last_ib(struct r600_context *rctx, FILE *f)
+{
+	uint32_t *trace_map = NULL;
+	int reached_id = -1; /* stays -1 when no trace ID could be read */
+
+	/* Nothing to dump unless a gfx IB was saved. */
+	if (rctx->last_gfx.ib == NULL)
+		return;
+
+	/* The map is unsynchronized on purpose: the ddebug pipe is expected
+	 * to have waited for the context already, so the buffer should be
+	 * idle, and waiting on a hung GPU would be pointless.
+	 */
+	if (rctx->last_trace_buf != NULL)
+		trace_map = rctx->b.ws->buffer_map(rctx->last_trace_buf->buf, NULL,
+						   PIPE_TRANSFER_UNSYNCHRONIZED |
+						   PIPE_TRANSFER_READ);
+	if (trace_map != NULL)
+		reached_id = *trace_map;
+
+	eg_parse_ib(f, rctx->last_gfx.ib, rctx->last_gfx.num_dw,
+		    reached_id, "IB", rctx->b.chip_class,
+		    NULL, NULL);
+}
+
+
+void eg_dump_debug_state(struct pipe_context *ctx, FILE *f,
+			 unsigned flags)
+{
+	struct r600_context *r600 = (struct r600_context*)ctx;
+
+	/* "flags" is accepted but not consulted here. */
+	eg_dump_last_ib(r600, f);
+	fprintf(f, "Done.\n");
+
+	/* Drop the saved IB and trace buffer so they are dumped only once. */
+	radeon_clear_saved_cs(&r600->last_gfx);
+	r600_resource_reference(&r600->last_trace_buf, NULL);
+}
diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h
index f542a0c..935405b 100644
--- a/src/gallium/drivers/r600/eg_sq.h
+++ b/src/gallium/drivers/r600/eg_sq.h
@@ -176,6 +176,18 @@
 #define   G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(x)                    (((x) >> 30) & 0x3)
 #define   C_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE                       0x3FFFFFFF
 /* done */
+
+#define P_SQ_CF_ALLOC_EXPORT_WORD0_RAT
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_ID(x)                   (((x) & 0xF) << 0)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_ID(x)                   (((x) >> 0) & 0xF)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_ID                      0xFFFFFFF0
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INST(x)                 (((x) & 0x3F) << 4)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INST(x)                 (((x) >> 4) & 0x3F)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INST                    0xFFFFFC0F
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INDEX_MODE(x)           (((x) & 0x3) << 11)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INDEX_MODE(x)           (((x) >> 11) & 0x3)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INDEX_MODE              0xFFFFE7FF
+
 #define P_SQ_CF_ALLOC_EXPORT_WORD1
 #define   S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(x)                  (((unsigned)(x) & 0xF) << 16)
 #define   G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(x)                  (((x) >> 16) & 0xF)
@@ -528,6 +540,10 @@
 #define   S_SQ_MEM_GDS_WORD1_DST_REL(x)                              (((unsigned)(x) & 0x3) << 7)
 #define   S_SQ_MEM_GDS_WORD1_GDS_OP(x)                               (((unsigned)(x) & 0x3f) << 9)
 #define   S_SQ_MEM_GDS_WORD1_SRC_GPR(x)                              (((unsigned)(x) & 0x7f) << 16)
+#define   S_SQ_MEM_GDS_WORD1_UAV_INDEX_MODE(x)                       (((unsigned)(x) & 0x3) << 24)
+#define   S_SQ_MEM_GDS_WORD1_UAV_ID(x)                               (((unsigned)(x) & 0xf) << 26)
+#define   S_SQ_MEM_GDS_WORD1_ALLOC_CONSUME(x)                        (((unsigned)(x) & 0x1) << 30)
+#define   S_SQ_MEM_GDS_WORD1_BCAST_FIRST_REQ(x)                      (((unsigned)(x) & 0x1) << 31)
 
 #define P_SQ_MEM_GDS_WORD2
 #define   S_SQ_MEM_GDS_WORD2_DST_SEL_X(x)                            (((unsigned)(x) & 0x7) << 0)
diff --git a/src/gallium/drivers/r600/egd_tables.py b/src/gallium/drivers/r600/egd_tables.py
new file mode 100644
index 0000000..d7b78c7
--- /dev/null
+++ b/src/gallium/drivers/r600/egd_tables.py
@@ -0,0 +1,310 @@
+
+CopyRight = '''
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+'''
+
+import sys
+import re
+
+
+class StringTable:
+    """
+    Packs many strings into one big string and hands out offsets into it,
+    so users index the big string (avoiding relocations in the binary).
+    """
+    def __init__(self):
+        self.table = []    # (string, its offset, set of offsets handed out)
+        self.length = 0    # offset at which the next new string will start
+
+    def add(self, string):
+        # Reuse the tail of an already stored string when there is one.
+        for (text, base, used) in self.table:
+            if not text.endswith(string):
+                continue
+            idx = base + len(text) - len(string)
+            used.add(idx)
+            return idx
+
+        # Otherwise start a new entry; +1 for the \0 emitted after each string.
+        idx = self.length
+        self.table.append((string, idx, set([idx])))
+        self.length = idx + len(string) + 1
+        return idx
+
+    def emit(self, filp, name, static=True):
+        """
+        Write
+        [static] const char name[] = "...";
+        to filp.
+        """
+        qualifier = 'static ' if static else ''
+        lines = []
+        for (text, base, used) in self.table:
+            escaped = text.encode('string_escape')
+            offsets = ', '.join(str(idx) for idx in used)
+            lines.append('\t"%s\\0" /* %s */' % (escaped, offsets))
+        filp.write('%sconst char %s[] =\n%s;\n' % (
+            qualifier,
+            name,
+            '\n'.join(lines)
+        ))
+
+class IntTable:
+    """
+    A class for collecting multiple arrays of integers in a single big array
+    that is used by indexing (to avoid relocations in the resulting binary)
+    """
+    def __init__(self, typename):
+        self.typename = typename
+        self.table = []
+        self.idxs = set()
+
+    def add(self, array):
+        # We might get lucky and find the array somewhere in the existing data.
+        # A negative stop would make list.index() count from the end, so clamp.
+        stop = max(0, len(self.table) - len(array) + 1)
+        try:
+            idx = 0
+            while True:
+                idx = self.table.index(array[0], idx, stop)
+                for i in range(1, len(array)):
+                    if array[i] != self.table[idx + i]:
+                        break
+                else:
+                    self.idxs.add(idx)
+                    return idx
+                idx += 1
+        except ValueError:
+            pass
+
+        idx = len(self.table)
+        self.table += array
+        self.idxs.add(idx)
+        return idx
+
+    def emit(self, filp, name, static=True):
+        """
+        Write
+        [static] const typename name[] = { ... };
+        to filp.
+        """
+        idxs = sorted(self.idxs) + [len(self.table)]
+
+        fragments = [
+            ('\t/* %s */ %s' % (
+                idxs[i],
+                ' '.join((str(elt) + ',') for elt in self.table[idxs[i]:idxs[i+1]])
+            ))
+            for i in range(len(idxs) - 1)
+        ]
+
+        filp.write('%sconst %s %s[] = {\n%s\n};\n' % (
+            'static ' if static else '',
+            self.typename, name,
+            '\n'.join(fragments)
+        ))
+
+class Field:  # one S_* bit-field macro that belongs to a register
+    def __init__(self, reg, s_name):
+        self.s_name = s_name  # macro name as parsed (text before the '(')
+        self.name = strip_prefix(s_name)
+        self.values = []  # (V_* macro name, integer value) pairs appended by parse()
+        self.varname_values = '%s__%s__values' % (reg.r_name.lower(), self.name.lower())
+
+class Reg:  # one R_* register macro together with its fields
+    def __init__(self, r_name):
+        self.r_name = r_name  # full macro name; write_tables() prints it as the register offset
+        self.name = strip_prefix(r_name)
+        self.fields = []
+        self.own_fields = True  # parse() sets this to False when the fields are shared from another Reg
+
+
+def strip_prefix(s):
+    '''Drop a leading prefix shaped like ._.*_ (for instance R_001234_) and return the rest'''
+    return s[2:][s[2:].find('_') + 1:]
+
+def parse(filename, regs, packets):
+    """Collect the R_/S_/V_/PKT3_ '#define's of filename into regs and packets."""
+    with open(filename) as stream:  # closed again once every line is read
+        for line in stream:
+            if not line.startswith('#define '):
+                continue
+
+            line = line[8:].strip()
+
+            if line.startswith('R_'):
+                name = line.split()[0]
+
+                for it in regs:
+                    if it.r_name == name:
+                        reg = it
+                        break
+                else:
+                    reg = Reg(name)
+                    regs.append(reg)
+
+            elif line.startswith('S_'):
+                name = line[:line.find('(')]
+
+                for it in reg.fields:
+                    if it.s_name == name:
+                        field = it
+                        break
+                else:
+                    field = Field(reg, name)
+                    reg.fields.append(field)
+
+            elif line.startswith('V_'):
+                split = line.split()
+                name = split[0]
+                value = int(split[1], 0)
+
+                for (n,v) in field.values:
+                    if n == name:
+                        if v != value:
+                            sys.exit('Value mismatch: name = ' + name)
+
+                field.values.append((name, value))
+
+            elif line.startswith('PKT3_') and line.find('0x') != -1 and line.find('(') == -1:
+                packets.append(line.split()[0])
+
+    # Copy fields to indexed registers which have their fields only defined
+    # at register index 0.
+    # For example, copy fields from CB_COLOR0_INFO to CB_COLORn_INFO, n > 0.
+    match_number = re.compile('[0-9]+')
+    reg_dict = dict()
+
+    # Create a dict of registers with fields and '0' in their name
+    for reg in regs:
+        if len(reg.fields) and reg.name.find('0') != -1:
+            reg_dict[reg.name] = reg
+
+    # Assign fields
+    for reg in regs:
+        if not len(reg.fields):
+            reg0 = reg_dict.get(match_number.sub('0', reg.name))
+            if reg0 is not None:
+                reg.fields = reg0.fields
+                reg.fields_owner = reg0
+                reg.own_fields = False
+
+
+def write_tables(regs, packets):
+    """Print the generated C header (structs plus packet/field/reg tables) to stdout."""
+    strings = StringTable()
+    strings_offsets = IntTable("int")
+
+    print '/* This file is autogenerated by egd_tables.py from evergreend.h. Do not edit directly. */'
+    print
+    print CopyRight.strip()
+    print '''
+#ifndef EG_TABLES_H
+#define EG_TABLES_H
+
+struct eg_field {
+        unsigned name_offset;
+        unsigned mask;
+        unsigned num_values;
+        unsigned values_offset; /* offset into eg_strings_offsets */
+};
+
+struct eg_reg {
+        unsigned name_offset;
+        unsigned offset;
+        unsigned num_fields;
+        unsigned fields_offset;
+};
+
+struct eg_packet3 {
+        unsigned name_offset;
+        unsigned op;
+};
+'''
+
+    print 'static const struct eg_packet3 packet3_table[] = {'
+    for pkt in packets:
+        print '\t{%s, %s},' % (strings.add(pkt[5:]), pkt)
+    print '};'
+    print
+
+    print 'static const struct eg_field egd_fields_table[] = {'
+
+    fields_idx = 0
+    for reg in regs:
+        if len(reg.fields) and reg.own_fields:
+            print '\t/* %s */' % (fields_idx)
+
+            reg.fields_idx = fields_idx
+
+            for field in reg.fields:
+                if len(field.values):
+                    values_offsets = []
+                    for value in field.values:
+                        while value[1] >= len(values_offsets):
+                            values_offsets.append(-1)  # -1 fills value slots that have no name
+                        values_offsets[value[1]] = strings.add(strip_prefix(value[0]))
+                    print '\t{%s, %s(~0u), %s, %s},' % (
+                        strings.add(field.name), field.s_name,
+                        len(values_offsets), strings_offsets.add(values_offsets))
+                else:
+                    print '\t{%s, %s(~0u)},' % (strings.add(field.name), field.s_name)
+                fields_idx += 1
+
+    print '};'
+    print
+
+    print 'static const struct eg_reg egd_reg_table[] = {'
+    for reg in regs:
+        if len(reg.fields):
+            print '\t{%s, %s, %s, %s},' % (strings.add(reg.name), reg.r_name,
+                len(reg.fields), reg.fields_idx if reg.own_fields else reg.fields_owner.fields_idx)
+        else:
+            print '\t{%s, %s},' % (strings.add(reg.name), reg.r_name)
+    print '};'
+    print
+
+    strings.emit(sys.stdout, "egd_strings")
+
+    print
+
+    strings_offsets.emit(sys.stdout, "egd_strings_offsets")
+
+    print
+    print '#endif'
+
+
+def main():
+    """Parse every header named on the command line, then print the tables."""
+    regs, packets = [], []
+    for header in sys.argv[1:]:
+        parse(header, regs, packets)
+    write_tables(regs, packets)
+
+
+if __name__ == '__main__':
+    main()
+
+# kate: space-indent on; indent-width 4; replace-tabs on;
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index ca2081a..6e87539 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -24,9 +24,12 @@
  *      Adam Rak <adam.rak@streamnovation.com>
  */
 
+#ifdef HAVE_OPENCL
+#include <gelf.h>
+#include <libelf.h>
+#endif
 #include <stdio.h>
 #include <errno.h>
-#include "ac_binary.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 #include "pipe/p_context.h"
@@ -146,8 +149,8 @@
 	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
 	vb->stride = 1;
 	vb->buffer_offset = offset;
-	vb->buffer = buffer;
-	vb->user_buffer = NULL;
+	vb->buffer.resource = buffer;
+	vb->is_user_buffer = false;
 
 	/* The vertex instructions in the compute shaders use the texture cache,
 	 * so we need to invalidate it. */
@@ -179,6 +182,169 @@
 #define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
 
 #ifdef HAVE_OPENCL
+/* Collect the st_value of every defined global symbol of the symbol table
+ * into binary->global_symbol_offsets, kept sorted in ascending order. */
+static void parse_symbol_table(Elf_Data *symbol_table_data,
+				const GElf_Shdr *symbol_table_header,
+				struct ac_shader_binary *binary)
+{
+	GElf_Sym symbol;
+	unsigned sym, symbol_count;
+
+	/* A zero entry size would make the division below trap. */
+	if (!symbol_table_header->sh_entsize)
+		return;
+	symbol_count =
+		symbol_table_header->sh_size / symbol_table_header->sh_entsize;
+
+	/* Over-allocate: symbol_count covers all symbols while only global
+	 * ones are stored.  The loop below never reads past symbol_count. */
+	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
+	if (!binary->global_symbol_offsets)
+		return;
+
+	for (sym = 0; sym < symbol_count &&
+	     gelf_getsym(symbol_table_data, sym, &symbol); ++sym) {
+		unsigned i;
+		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
+		    symbol.st_shndx == 0 /* Undefined symbol */)
+			continue;
+		binary->global_symbol_offsets[binary->global_symbol_count] =
+					symbol.st_value;
+		/* Insertion sort: sink the new entry to its sorted position.
+		 * This list will usually be small. */
+		for (i = binary->global_symbol_count; i > 0; --i) {
+			uint64_t lhs = binary->global_symbol_offsets[i - 1];
+			uint64_t rhs = binary->global_symbol_offsets[i];
+			if (lhs < rhs)
+				break;
+			binary->global_symbol_offsets[i] = lhs;
+			binary->global_symbol_offsets[i - 1] = rhs;
+		}
+		++binary->global_symbol_count;
+	}
+}
+
+
+/* Build binary->relocs from .rel.text; the count is zeroed if that fails. */
+static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
+			unsigned symbol_sh_link, struct ac_shader_binary *binary)
+{
+	struct ac_shader_reloc *list = NULL;
+	unsigned i;
+
+	if (relocs && symbols && binary->reloc_count)
+		binary->relocs = list = CALLOC(binary->reloc_count, sizeof(*list));
+	if (!list) {
+		binary->reloc_count = 0;
+		return;
+	}
+	for (i = 0; i < binary->reloc_count; i++) {
+		GElf_Sym symbol;
+		GElf_Rel rel;
+		const char *symbol_name;
+		/* On a failed lookup leave this entry as CALLOC returned it. */
+		if (!gelf_getrel(relocs, i, &rel) ||
+		    !gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol) ||
+		    !(symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name)))
+			continue;
+		list[i].offset = rel.r_offset;
+		snprintf(list[i].name, sizeof(list[i].name), "%s", symbol_name);
+	}
+}
+
+/* Read the ELF image elf_data (elf_size bytes) and copy the sections handled
+ * below into binary.  Parsing stops at the first unreadable section header;
+ * what was gathered before that point is kept. */
+static void r600_elf_read(const char *elf_data, unsigned elf_size,
+		 struct ac_shader_binary *binary)
+{
+	char *elf_buffer;
+	Elf *elf;
+	Elf_Scn *section = NULL;
+	Elf_Data *symbols = NULL, *relocs = NULL;
+	size_t section_str_index = 0;
+	unsigned symbol_sh_link = 0;
+
+	/* One of the libelf implementations
+	 * (http://www.mr511.de/software/english.htm) requires calling
+	 * elf_version() before elf_memory(). */
+	elf_version(EV_CURRENT);
+	elf_buffer = MALLOC(elf_size);
+	if (!elf_buffer)
+		return;
+	memcpy(elf_buffer, elf_data, elf_size);
+	elf = elf_memory(elf_buffer, elf_size);
+	elf_getshdrstrndx(elf, &section_str_index);
+
+	while ((section = elf_nextscn(elf, section))) {
+		const char *name;
+		Elf_Data *section_data = NULL;
+		GElf_Shdr section_header;
+		if (gelf_getshdr(section, &section_header) != &section_header) {
+			fprintf(stderr, "Failed to read ELF section header\n");
+			/* break, not return: elf and elf_buffer are freed below */
+			break;
+		}
+		name = elf_strptr(elf, section_str_index, section_header.sh_name);
+		if (!strcmp(name, ".text")) {
+			section_data = elf_getdata(section, section_data);
+			binary->code_size = section_data->d_size;
+			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
+			memcpy(binary->code, section_data->d_buf, binary->code_size);
+		} else if (!strcmp(name, ".AMDGPU.config")) {
+			section_data = elf_getdata(section, section_data);
+			binary->config_size = section_data->d_size;
+			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
+			memcpy(binary->config, section_data->d_buf, binary->config_size);
+		} else if (!strcmp(name, ".AMDGPU.disasm")) {
+			/* Always read disassembly if it's available. */
+			section_data = elf_getdata(section, section_data);
+			binary->disasm_string = strndup(section_data->d_buf,
+							section_data->d_size);
+		} else if (!strncmp(name, ".rodata", 7)) {
+			section_data = elf_getdata(section, section_data);
+			binary->rodata_size = section_data->d_size;
+			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
+			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
+		} else if (!strncmp(name, ".symtab", 7)) {
+			symbols = elf_getdata(section, section_data);
+			symbol_sh_link = section_header.sh_link;
+			parse_symbol_table(symbols, &section_header, binary);
+		} else if (!strcmp(name, ".rel.text")) {
+			relocs = elf_getdata(section, section_data);
+			binary->reloc_count = section_header.sh_size /
+					section_header.sh_entsize;
+		}
+	}
+	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
+	if (elf)
+		elf_end(elf);
+	FREE(elf_buffer);
+
+	/* Cache the config size per symbol */
+	if (binary->global_symbol_count) {
+		binary->config_size_per_symbol =
+			binary->config_size / binary->global_symbol_count;
+	} else {
+		binary->global_symbol_count = 1;
+		binary->config_size_per_symbol = binary->config_size;
+	}
+}
+
+/* Return the config block for the symbol at symbol_offset, falling back to
+ * the start of the config section (also when there is no symbol list). */
+static const unsigned char *r600_shader_binary_config_start(
+	const struct ac_shader_binary *binary, uint64_t symbol_offset)
+{
+	unsigned i;
+	if (!binary->global_symbol_offsets)
+		return binary->config;
+	for (i = 0; i < binary->global_symbol_count; ++i)
+		if (binary->global_symbol_offsets[i] == symbol_offset)
+			return binary->config + i * binary->config_size_per_symbol;
+	return binary->config;
+}
 
 static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
 					   struct r600_bytecode *bc,
@@ -187,7 +353,7 @@
 {
        unsigned i;
        const unsigned char *config =
-               ac_shader_binary_config_start(binary, symbol_offset);
+               r600_shader_binary_config_start(binary, symbol_offset);
 
        for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
                unsigned reg =
@@ -251,7 +417,7 @@
 	header = cso->prog;
 	code = cso->prog + sizeof(struct pipe_llvm_program_header);
 	radeon_shader_binary_init(&shader->binary);
-	ac_elf_read(code, header->num_bytes, &shader->binary);
+	r600_elf_read(code, header->num_bytes, &shader->binary);
 	r600_create_shader(&shader->bc, &shader->binary, &use_kill);
 
 	/* Upload code + ROdata */
@@ -281,7 +447,9 @@
 	if (!shader)
 		return;
 
+#ifdef HAVE_OPENCL
 	radeon_shader_binary_clean(&shader->binary);
+#endif
 	r600_destroy_shader(&shader->bc);
 
 	/* TODO destroy shader->code_bo, shader->const_bo
@@ -442,6 +610,9 @@
 	radeon_emit(cs, info->grid[2]);
 	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
 	radeon_emit(cs, 1);
+
+	if (rctx->is_debug)
+		eg_trace_emit(rctx);
 }
 
 static void compute_emit_cs(struct r600_context *rctx,
@@ -867,10 +1038,9 @@
 	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
 
 	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
-						S_0286E8_TID_IN_GROUP_ENA
-						| S_0286E8_TGID_ENA
-						| S_0286E8_DISABLE_INDEX_PACK)
-						;
+			       S_0286E8_TID_IN_GROUP_ENA(1) |
+			       S_0286E8_TGID_ENA(1) |
+			       S_0286E8_DISABLE_INDEX_PACK(1));
 
 	/* The LOOP_CONST registers are an optimizations for loops that allows
 	 * you to store the initial counter, increment value, and maximum
diff --git a/src/gallium/drivers/r600/evergreen_compute_internal.h b/src/gallium/drivers/r600/evergreen_compute_internal.h
index 6f4be3e..32f53ad 100644
--- a/src/gallium/drivers/r600/evergreen_compute_internal.h
+++ b/src/gallium/drivers/r600/evergreen_compute_internal.h
@@ -21,10 +21,10 @@
  * Authors:
  *      Adam Rak <adam.rak@streamnovation.com>
  */
- 
 #ifndef EVERGREEN_COMPUTE_INTERNAL_H
 #define EVERGREEN_COMPUTE_INTERNAL_H
 
+#include "ac_binary.h"
 #include "r600_asm.h"
 #ifdef HAVE_OPENCL
 #include <llvm-c/Core.h>
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 5697da4..9595351 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1393,8 +1393,8 @@
 	}
 
 	/* use htile only for first level */
-	if (rtex->htile_buffer && !level) {
-		uint64_t va = rtex->htile_buffer->gpu_address;
+	if (rtex->htile_offset && !level) {
+		uint64_t va = rtex->resource.gpu_address + rtex->htile_offset;
 		surf->db_htile_data_base = va >> 8;
 		surf->db_htile_surface = S_028ABC_HTILE_WIDTH(1) |
 					 S_028ABC_HTILE_HEIGHT(1) |
@@ -1550,6 +1550,7 @@
 	r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
 
 	r600_set_sample_locations_constant_buffer(rctx);
+	rctx->framebuffer.do_update_surf_dirtiness = true;
 }
 
 static void evergreen_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
@@ -1875,7 +1876,7 @@
 		radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, a->rsurf->db_htile_surface);
 		radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, a->rsurf->db_preload_control);
 		radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base);
-		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer,
+		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, &rtex->resource,
 						  RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
 		radeon_emit(cs, reloc_idx);
@@ -1952,7 +1953,7 @@
 		unsigned buffer_index = u_bit_scan(&dirty_mask);
 
 		vb = &state->vb[buffer_index];
-		rbuffer = (struct r600_resource*)vb->buffer;
+		rbuffer = (struct r600_resource*)vb->buffer.resource;
 		assert(rbuffer);
 
 		va = rbuffer->gpu_address + vb->buffer_offset;
@@ -4072,3 +4073,32 @@
 	}
 	return true;
 }
+
+#define AC_ENCODE_TRACE_POINT(id)       (0xcafe0000 | ((id) & 0xffff))
+
+void eg_trace_emit(struct r600_context *rctx)
+{
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	unsigned reloc;
+	/* Emit a trace point into the gfx CS; does nothing before Evergreen. */
+	if (rctx->b.chip_class < EVERGREEN)
+		return;
+
+	/* This must be done after r600_need_cs_space. */
+	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+					  (struct r600_resource*)rctx->trace_buf, RADEON_USAGE_WRITE,
+					  RADEON_PRIO_CP_DMA);
+	/* Advance the id, then emit a MEM_WRITE packet carrying trace_buf's address and the new id. */
+	rctx->trace_id++;
+	radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rctx->trace_buf,
+			      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); /* NOTE(review): trace_buf was already added above with another usage/priority - confirm both calls are needed */
+	radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
+	radeon_emit(cs, rctx->trace_buf->gpu_address);
+	radeon_emit(cs, rctx->trace_buf->gpu_address >> 32 | MEM_WRITE_32_BITS | MEM_WRITE_CONFIRM);
+	radeon_emit(cs, rctx->trace_id);
+	radeon_emit(cs, 0);
+	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cs, reloc); /* value returned by the first radeon_add_to_buffer_list call */
+	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cs, AC_ENCODE_TRACE_POINT(rctx->trace_id)); /* 0xcafe0000 | low 16 bits of the id */
+}
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 0173ea3..2e54928 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -50,6 +50,8 @@
 #define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH       0x1f
 #define EVENT_TYPE_VGT_FLUSH                   0x24
 #define EVENT_TYPE_FLUSH_AND_INV_DB_META       0x2c
+#define EVENT_TYPE_CS_DONE                     0x2f
+#define EVENT_TYPE_PS_DONE                     0x30
 
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
@@ -59,6 +61,7 @@
 		 * 3 - SAMPLE_STREAMOUTSTAT*
 		 * 4 - *S_PARTIAL_FLUSH
 		 * 5 - TS events
+		 * 6 - EOS events
 		 */
 
 #define R600_TEXEL_PITCH_ALIGNMENT_MASK        0x7
@@ -87,6 +90,8 @@
 #define PKT3_WAIT_REG_MEM                      0x3C
 #define		WAIT_REG_MEM_EQUAL		3
 #define PKT3_MEM_WRITE                         0x3D
+#define		MEM_WRITE_CONFIRM		(1 << 17)
+#define		MEM_WRITE_32_BITS		(1 << 18)
 #define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_PFP_SYNC_ME		       0x42
 #define PKT3_SURFACE_SYNC                      0x43
@@ -94,6 +99,7 @@
 #define PKT3_COND_WRITE                        0x45
 #define PKT3_EVENT_WRITE                       0x46
 #define PKT3_EVENT_WRITE_EOP                   0x47
+#define PKT3_EVENT_WRITE_EOS                   0x48
 #define PKT3_ONE_REG_WRITE                     0x57
 #define PKT3_SET_CONFIG_REG                    0x68
 #define PKT3_SET_CONTEXT_REG                   0x69
@@ -162,6 +168,36 @@
 #define PKT3_CP_DMA_CMD_SAIC      (1 << 28)
 #define PKT3_CP_DMA_CMD_DAIC      (1 << 29)
 
+#define PKT3_SET_APPEND_CNT                    0x75
+/* 1. header
+ * 2. COMMAND
+ *  1:0 - SOURCE SEL
+ *  15:2 - Reserved
+ *  31:16 - WR_REG_OFFSET - context register to write source data to.
+ *          (one of R_02872C_GDS_APPEND_COUNT_0-11)
+ * 3. CONTROL
+ *  (for source == mem)
+ *  31:2 SRC_ADDRESS_LO
+ *  1:0 SWAP
+ *  (for source == GDS)
+ *  31:0 GDS offset
+ *  (for source == DATA)
+ *  31:0 DATA
+ *  (for source == REG)
+ *  31:0 REG
+ * 4. SRC_ADDRESS_HI[7:0]
+ * kernel driver 2.44 only supports SRC == MEM.
+ */
+#define PKT3_SET_APPEND_CNT_SRC_SELECT(x) ((x) << 0)
+/* source is from the data in CONTROL */
+#define PKT3_SAC_SRC_SEL_DATA 0x0
+/* source is from register */
+#define PKT3_SAC_SRC_SEL_REG  0x1
+/* source is from GDS offset in CONTROL */
+#define PKT3_SAC_SRC_SEL_GDS  0x2
+/* source is from memory address */
+#define PKT3_SAC_SRC_SEL_MEM  0x3
+
 /* Registers */
 #define R_0084FC_CP_STRMOUT_CNTL		     0x0084FC
 #define   S_0084FC_OFFSET_UPDATE_DONE(x)		(((unsigned)(x) & 0x1) << 0)
@@ -388,9 +424,9 @@
 #define   S_028C70_FAST_CLEAR(x)                       (((unsigned)(x) & 0x1) << 17)
 #define   G_028C70_FAST_CLEAR(x)                       (((x) >> 17) & 0x1)
 #define   C_028C70_FAST_CLEAR                          0xFFFDFFFF
-#define   S_028C70_COMPRESSION(x)                      (((unsigned)(x) & 0x3) << 18)
-#define   G_028C70_COMPRESSION(x)                      (((x) >> 18) & 0x3)
-#define   C_028C70_COMPRESSION                         0xFFF3FFFF
+#define   S_028C70_COMPRESSION(x)                      (((unsigned)(x) & 0x1) << 18)
+#define   G_028C70_COMPRESSION(x)                      (((x) >> 18) & 0x1)
+#define   C_028C70_COMPRESSION                         0xFFFBFFFF
 #define   S_028C70_BLEND_CLAMP(x)                      (((unsigned)(x) & 0x1) << 19)
 #define   G_028C70_BLEND_CLAMP(x)                      (((x) >> 19) & 0x1)
 #define   C_028C70_BLEND_CLAMP                         0xFFF7FFFF
@@ -849,6 +885,7 @@
 #define     V_02880C_EXPORT_DB_FOUR16                  0x01
 #define     V_02880C_EXPORT_DB_TWO                     0x02
 #define   S_02880C_ALPHA_TO_MASK_DISABLE(x)            (((unsigned)(x) & 0x1) << 12)
+#define   S_02880C_DEPTH_BEFORE_SHADER(x)              (((unsigned)(x) & 0x1) << 15)
 #define   S_02880C_CONSERVATIVE_Z_EXPORT(x)            (((unsigned)(x) & 0x03) << 16)
 #define   G_02880C_CONSERVATIVE_Z_EXPORT(x)            (((x) >> 16) & 0x03)
 #define   C_02880C_CONSERVATIVE_Z_EXPORT               0xFFFCFFFF
@@ -1078,6 +1115,11 @@
 #define   G_028208_BR_Y(x)                             (((x) >> 16) & 0x7FFF)
 #define   C_028208_BR_Y                                0x8000FFFF
 
+#define R_028A78_VGT_DMA_MAX_SIZE                    0x028A78
+#define R_028A7C_VGT_DMA_INDEX_TYPE                  0x028A7C
+#define R_028A88_VGT_NUM_INSTANCES                   0x028A88
+#define R_0287E4_VGT_DMA_BASE_HI                     0x0287E4
+#define R_0287E8_VGT_DMA_BASE                        0x0287E8
 #define R_0287F0_VGT_DRAW_INITIATOR                  0x0287F0
 #define   S_0287F0_SOURCE_SELECT(x)                    (((unsigned)(x) & 0x3) << 0)
 #define   G_0287F0_SOURCE_SELECT(x)                    (((x) >> 0) & 0x3)
@@ -1951,12 +1993,25 @@
 #define R_0286DC_SPI_FOG_CNTL                        0x000286DC
 #define R_0286E4_SPI_PS_IN_CONTROL_2                 0x000286E4
 #define R_0286E8_SPI_COMPUTE_INPUT_CNTL              0x000286E8
-#define   S_0286E8_TID_IN_GROUP_ENA                  1
-#define   S_0286E8_TGID_ENA                          2
-#define   S_0286E8_DISABLE_INDEX_PACK                4
+#define   S_0286E8_TID_IN_GROUP_ENA(x)                  (((unsigned)(x) & 0x1) << 0)
+#define   S_0286E8_TGID_ENA(x)                          (((unsigned)(x) & 0x1) << 1)
+#define   S_0286E8_DISABLE_INDEX_PACK(x)                (((unsigned)(x) & 0x1) << 2)
 #define R_028720_GDS_ADDR_BASE                       0x00028720
 #define R_028724_GDS_ADDR_SIZE                       0x00028724
 #define R_028728_GDS_ORDERED_WAVE_PER_SE             0x00028728
+#define R_02872C_GDS_APPEND_COUNT_0                  0x0002872C
+#define R_028730_GDS_APPEND_COUNT_1                  0x00028730
+#define R_028734_GDS_APPEND_COUNT_2                  0x00028734
+#define R_028738_GDS_APPEND_COUNT_3                  0x00028738
+#define R_02873C_GDS_APPEND_COUNT_4                  0x0002873C
+#define R_028740_GDS_APPEND_COUNT_5                  0x00028740
+#define R_028748_GDS_APPEND_COUNT_6                  0x00028744 /* NOTE(review): hex in name differs from offset */
+#define R_028744_GDS_APPEND_COUNT_7                  0x00028748 /* NOTE(review): hex in name differs from offset */
+#define R_028744_GDS_APPEND_COUNT_8                  0x0002874C /* NOTE(review): hex in name differs from offset */
+#define R_028744_GDS_APPEND_COUNT_9                  0x00028750 /* NOTE(review): hex in name differs from offset */
+#define R_028744_GDS_APPEND_COUNT_10                 0x00028754 /* NOTE(review): hex in name differs from offset */
+#define R_028744_GDS_APPEND_COUNT_11                 0x00028758 /* NOTE(review): hex in name differs from offset */
+
 #define R_028784_CB_BLEND1_CONTROL                   0x00028784
 #define R_028788_CB_BLEND2_CONTROL                   0x00028788
 #define R_02878C_CB_BLEND3_CONTROL                   0x0002878C
@@ -2021,10 +2076,24 @@
 #define R_0288EC_SQ_LDS_ALLOC_PS                     0x000288EC
 #define R_028900_SQ_ESGS_RING_ITEMSIZE               0x00028900
 #define R_028904_SQ_GSVS_RING_ITEMSIZE               0x00028904
+#define R_008C50_SQ_ESTMP_RING_BASE                  0x00008C50
 #define R_028908_SQ_ESTMP_RING_ITEMSIZE              0x00028908
+#define R_008C54_SQ_ESTMP_RING_SIZE                  0x00008C54
+#define R_008C58_SQ_GSTMP_RING_BASE                  0x00008C58
 #define R_02890C_SQ_GSTMP_RING_ITEMSIZE              0x0002890C
+#define R_008C5C_SQ_GSTMP_RING_SIZE                  0x00008C5C
+#define R_008C60_SQ_VSTMP_RING_BASE                  0x00008C60
 #define R_028910_SQ_VSTMP_RING_ITEMSIZE              0x00028910
+#define R_008C64_SQ_VSTMP_RING_SIZE                  0x00008C64
+#define R_008C68_SQ_PSTMP_RING_BASE                  0x00008C68
 #define R_028914_SQ_PSTMP_RING_ITEMSIZE              0x00028914
+#define R_008C6C_SQ_PSTMP_RING_SIZE                  0x00008C6C
+#define R_008E10_SQ_LSTMP_RING_BASE                  0x00008E10
+#define R_028830_SQ_LSTMP_RING_ITEMSIZE              0x00028830
+#define R_008E14_SQ_LSTMP_RING_SIZE                  0x00008E14
+#define R_008E18_SQ_HSTMP_RING_BASE                  0x00008E18
+#define R_028834_SQ_HSTMP_RING_ITEMSIZE              0x00028834
+#define R_008E1C_SQ_HSTMP_RING_SIZE                  0x00008E1C
 #define R_02891C_SQ_GS_VERT_ITEMSIZE                 0x0002891C
 #define R_028920_SQ_GS_VERT_ITEMSIZE_1               0x00028920
 #define R_028924_SQ_GS_VERT_ITEMSIZE_2               0x00028924
@@ -2208,6 +2277,18 @@
 #define   S_028B98_STREAM_1_BUFFER_EN(x)		(((unsigned)(x) & 0x0F) << 4)
 #define   S_028B98_STREAM_2_BUFFER_EN(x)		(((unsigned)(x) & 0x0F) << 8)
 #define   S_028B98_STREAM_3_BUFFER_EN(x)		(((unsigned)(x) & 0x0F) << 12)
+#define R_028B9C_CB_IMMED0_BASE                      0x00028B9C
+#define R_028BA0_CB_IMMED1_BASE                      0x00028BA0
+#define R_028BA4_CB_IMMED2_BASE                      0x00028BA4
+#define R_028BA4_CB_IMMED3_BASE                      0x00028BA8 /* NOTE(review): IMMED3..11 repeat the 028BA4 name prefix; the value column (0x28BA8..0x28BC8) is what differs per entry */
+#define R_028BA4_CB_IMMED4_BASE                      0x00028BAC
+#define R_028BA4_CB_IMMED5_BASE                      0x00028BB0
+#define R_028BA4_CB_IMMED6_BASE                      0x00028BB4
+#define R_028BA4_CB_IMMED7_BASE                      0x00028BB8
+#define R_028BA4_CB_IMMED8_BASE                      0x00028BBC
+#define R_028BA4_CB_IMMED9_BASE                      0x00028BC0
+#define R_028BA4_CB_IMMED10_BASE                     0x00028BC4
+#define R_028BA4_CB_IMMED11_BASE                     0x00028BC8
 #define R_028C00_PA_SC_LINE_CNTL                     0x00028C00
 #define   S_028C00_EXPAND_LINE_WIDTH(x)                (((unsigned)(x) & 0x1) << 9)
 #define   G_028C00_EXPAND_LINE_WIDTH(x)                (((x) >> 9) & 0x1)
@@ -2487,6 +2568,8 @@
 #define   S_0085F0_CR2_ACTION_ENA(x)                   (((unsigned)(x) & 0x1) << 31)
 #define   G_0085F0_CR2_ACTION_ENA(x)                   (((x) >> 31) & 0x1)
 #define   C_0085F0_CR2_ACTION_ENA                      0x7FFFFFFF
+#define R_0085F4_CP_COHER_SIZE                       0x0085F4
+#define R_0085F8_CP_COHER_BASE                       0x0085F8
 #define R_008970_VGT_NUM_INDICES                     0x008970
 
 #define R_03CFF0_SQ_VTX_BASE_VTX_LOC                    0x03CFF0
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 3dcbde0..9e00528 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -1332,11 +1332,13 @@
 static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc)
 {
 	return !((r600_isa_cf(bc->cf_last->op)->flags & CF_FETCH) &&
-			(bc->chip_class == CAYMAN ||
-			bc->cf_last->op != CF_OP_TEX));
+		 bc->cf_last->op != CF_OP_GDS &&
+		 (bc->chip_class == CAYMAN ||
+		  bc->cf_last->op != CF_OP_TEX));
 }
 
-int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
+static int r600_bytecode_add_vtx_internal(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx,
+					  bool use_tc)
 {
 	struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx();
 	int r;
@@ -1348,7 +1350,7 @@
 	/* Load index register if required */
 	if (bc->chip_class >= EVERGREEN) {
 		if (vtx->buffer_index_mode)
-			egcm_load_index_reg(bc, 0, false);
+			egcm_load_index_reg(bc, vtx->buffer_index_mode - 1, false);
 	}
 
 	/* cf can contains only alu or only vtx or only tex */
@@ -1363,9 +1365,14 @@
 		switch (bc->chip_class) {
 		case R600:
 		case R700:
-		case EVERGREEN:
 			bc->cf_last->op = CF_OP_VTX;
 			break;
+		case EVERGREEN:
+			if (use_tc)
+				bc->cf_last->op = CF_OP_TEX;
+			else
+				bc->cf_last->op = CF_OP_VTX;
+			break;
 		case CAYMAN:
 			bc->cf_last->op = CF_OP_TEX;
 			break;
@@ -1388,6 +1395,16 @@
 	return 0;
 }
 
+int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
+{
+	return r600_bytecode_add_vtx_internal(bc, vtx, false); /* use_tc = false: on EVERGREEN the clause op is set to CF_OP_VTX */
+}
+/* Variant of r600_bytecode_add_vtx() that passes use_tc = true; on EVERGREEN the clause op is then set to CF_OP_TEX. */
+int r600_bytecode_add_vtx_tc(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
+{
+	return r600_bytecode_add_vtx_internal(bc, vtx, true);
+}
+
 int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex)
 {
 	struct r600_bytecode_tex *ntex = r600_bytecode_tex();
@@ -1453,6 +1470,11 @@
 		return -ENOMEM;
 	memcpy(ngds, gds, sizeof(struct r600_bytecode_gds));
 
+	if (bc->chip_class >= EVERGREEN) {
+		if (gds->uav_index_mode)
+			egcm_load_index_reg(bc, gds->uav_index_mode - 1, false);
+	}
+
 	if (bc->cf_last == NULL ||
 	    bc->cf_last->op != CF_OP_GDS ||
 	    bc->force_add_cf) {
@@ -2129,7 +2151,8 @@
 						o += print_swizzle(7);
 				}
 
-				if (cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND)
+				if (cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND ||
+				    cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND)
 					o += fprintf(stderr, " R%d", cf->output.index_gpr);
 
 				o += print_indent(o, 67);
@@ -2320,6 +2343,11 @@
 			if (gds->op != FETCH_OP_TF_WRITE) {
 				o += fprintf(stderr, ", R%d.", gds->src_gpr2);
 			}
+			if (gds->alloc_consume) {
+				o += fprintf(stderr, " UAV: %d", gds->uav_id);
+				if (gds->uav_index_mode)
+					o += fprintf(stderr, "[%s]", index_mode[gds->uav_index_mode]);
+			}
 			fprintf(stderr, "\n");
 			id += 4;
 		}
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index b12913d..91fe260 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -121,7 +121,6 @@
 struct r600_bytecode_gds {
 	struct list_head		list;
 	unsigned			op;
-	unsigned                        gds_op;
 	unsigned			src_gpr;
 	unsigned			src_rel;
 	unsigned			src_sel_x;
@@ -134,6 +133,10 @@
 	unsigned			dst_sel_y;
 	unsigned			dst_sel_z;
 	unsigned			dst_sel_w;
+	unsigned			uav_index_mode;
+	unsigned                        uav_id;
+	unsigned                        alloc_consume;
+	unsigned                        bcast_first_req;
 };
 
 struct r600_bytecode_output {
@@ -268,6 +271,8 @@
 		const struct r600_bytecode_alu *alu);
 int r600_bytecode_add_vtx(struct r600_bytecode *bc,
 		const struct r600_bytecode_vtx *vtx);
+int r600_bytecode_add_vtx_tc(struct r600_bytecode *bc,
+			     const struct r600_bytecode_vtx *vtx);
 int r600_bytecode_add_tex(struct r600_bytecode *bc,
 		const struct r600_bytecode_tex *tex);
 int r600_bytecode_add_gds(struct r600_bytecode *bc,
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index c52492e..79505d5 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -444,7 +444,7 @@
 		 * disable fast clear for texture array.
 		 */
 		/* Only use htile for first level */
-		if (rtex->htile_buffer && !level &&
+		if (rtex->htile_offset && !level &&
                    fb->zsbuf->u.tex.first_layer == 0 &&
                    fb->zsbuf->u.tex.last_layer == util_max_layer(&rtex->resource.b.b, level)) {
 			if (rtex->depth_clear_value != depth) {
@@ -647,7 +647,7 @@
         src_heightFL = u_minify(src->height0, src_level);
 
 	util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
-	util_blitter_default_src_texture(&src_templ, src, src_level);
+	util_blitter_default_src_texture(rctx->blitter, &src_templ, src, src_level);
 
 	if (util_format_is_compressed(src->format) ||
 	    util_format_is_compressed(dst->format)) {
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 4511ce0..ca7f41d 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -274,17 +274,40 @@
 
 	r600_flush_emit(ctx);
 
+	if (ctx->trace_buf)
+		eg_trace_emit(ctx);
 	/* old kernels and userspace don't set SX_MISC, so we must reset it to 0 here */
 	if (ctx->b.chip_class == R600) {
 		radeon_set_context_reg(cs, R_028350_SX_MISC, 0);
 	}
 
+	if (ctx->is_debug) {
+		/* Save the IB for debug contexts. */
+		radeon_clear_saved_cs(&ctx->last_gfx);
+		radeon_save_cs(ws, cs, &ctx->last_gfx);
+		r600_resource_reference(&ctx->last_trace_buf, ctx->trace_buf);
+		r600_resource_reference(&ctx->trace_buf, NULL);
+	}
 	/* Flush the CS. */
 	ws->cs_flush(cs, flags, &ctx->b.last_gfx_fence);
 	if (fence)
 		ws->fence_reference(fence, ctx->b.last_gfx_fence);
 	ctx->b.num_gfx_cs_flushes++;
 
+	if (ctx->is_debug) {
+		if (!ws->fence_wait(ws, ctx->b.last_gfx_fence, 10000000)) {
+			const char *fname = getenv("R600_TRACE");
+			if (!fname)
+				exit(-1);
+			FILE *fl = fopen(fname, "w+");
+			if (fl) {
+				eg_dump_debug_state(&ctx->b.b, fl, 0);
+				fclose(fl);
+			} else
+				perror(fname);
+			exit(-1);
+		}
+	}
 	r600_begin_new_cs(ctx);
 }
 
@@ -292,6 +315,23 @@
 {
 	unsigned shader;
 
+	if (ctx->is_debug) {
+		uint32_t zero = 0;
+
+		/* Create a buffer used for writing trace IDs and initialize it to 0. */
+		assert(!ctx->trace_buf);
+		ctx->trace_buf = (struct r600_resource*)
+			pipe_buffer_create(ctx->b.b.screen, 0,
+					   PIPE_USAGE_STAGING, 4);
+		if (ctx->trace_buf)
+			pipe_buffer_write_nooverlap(&ctx->b.b, &ctx->trace_buf->b.b,
+						    0, sizeof(zero), &zero);
+		ctx->trace_id = 0;
+	}
+
+	if (ctx->trace_buf)
+		eg_trace_emit(ctx);
+
 	ctx->b.flags = 0;
 	ctx->b.gtt = 0;
 	ctx->b.vram = 0;
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 12bf551..f0ea409 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -111,6 +111,11 @@
 	FREE(rctx->start_compute_cs_cmd.buf);
 
 	r600_common_context_cleanup(&rctx->b);
+
+	r600_resource_reference(&rctx->trace_buf, NULL);
+	r600_resource_reference(&rctx->last_trace_buf, NULL);
+	radeon_clear_saved_cs(&rctx->last_gfx);
+
 	FREE(rctx);
 }
 
@@ -125,7 +130,8 @@
 		return NULL;
 
 	rctx->b.b.screen = screen;
-	rctx->b.b.priv = priv;
+	assert(!priv);
+	rctx->b.b.priv = NULL; /* for threaded_context_unwrap_sync */
 	rctx->b.b.destroy = r600_destroy_context;
 	rctx->b.set_atom_dirty = (void *)r600_set_atom_dirty;
 
@@ -137,7 +143,7 @@
 
 	r600_init_blit_functions(rctx);
 
-	if (rscreen->b.info.has_uvd) {
+	if (rscreen->b.info.has_hw_decode) {
 		rctx->b.b.create_video_codec = r600_uvd_create_decoder;
 		rctx->b.b.create_video_buffer = r600_video_buffer_create;
 	} else {
@@ -145,6 +151,8 @@
 		rctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
+	if (getenv("R600_TRACE"))
+		rctx->is_debug = true;
 	r600_init_common_state_functions(rctx);
 
 	switch (rctx->b.chip_class) {
@@ -286,6 +294,8 @@
 	case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
 	case PIPE_CAP_CLEAR_TEXTURE:
 	case PIPE_CAP_TGSI_MUL_ZERO_WINS:
+	case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+	case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
 		return 1;
 
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
@@ -386,6 +396,8 @@
 	case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
 	case PIPE_CAP_TGSI_BALLOT:
 	case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+	case PIPE_CAP_POST_DEPTH_COVERAGE:
+	case PIPE_CAP_BINDLESS_TEXTURE:
 		return 0;
 
 	case PIPE_CAP_DOUBLES:
@@ -581,6 +593,7 @@
 	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
 	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
 	case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+	case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
 		return 0;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		/* due to a bug in the shader compiler, some loops hang
@@ -619,7 +632,7 @@
 	return r600_resource_create_common(screen, templ);
 }
 
-struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
+struct pipe_screen *r600_screen_create(struct radeon_winsys *ws, unsigned flags)
 {
 	struct r600_screen *rscreen = CALLOC_STRUCT(r600_screen);
 
@@ -634,7 +647,7 @@
 	rscreen->b.b.get_shader_param = r600_get_shader_param;
 	rscreen->b.b.resource_create = r600_resource_create;
 
-	if (!r600_common_screen_init(&rscreen->b, ws)) {
+	if (!r600_common_screen_init(&rscreen->b, ws, flags)) {
 		FREE(rscreen);
 		return NULL;
 	}
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 7f1ecc2..c9294a7 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -189,6 +189,7 @@
 	bool cb0_is_integer;
 	bool is_msaa_resolve;
 	bool dual_src_blend;
+	bool do_update_surf_dirtiness;
 };
 
 struct r600_sample_mask {
@@ -508,9 +509,6 @@
 	 * the GPU addresses are updated. */
 	struct list_head		texture_buffers;
 
-	/* Index buffer. */
-	struct pipe_index_buffer	index_buffer;
-
 	/* Last draw state (-1 = unset). */
 	enum pipe_prim_type		last_primitive_type; /* Last primitive type used in draw_vbo. */
 	enum pipe_prim_type		current_rast_prim; /* primitive type after TES, GS */
@@ -526,6 +524,13 @@
 	struct r600_pipe_shader_selector *last_tcs;
 	unsigned last_num_tcs_input_cp;
 	unsigned lds_alloc;
+
+	/* Debug state. */
+	bool			is_debug;
+	struct radeon_saved_cs	last_gfx;
+	struct r600_resource	*last_trace_buf;
+	struct r600_resource	*trace_buf;
+	unsigned		trace_id;
 };
 
 static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
@@ -920,10 +925,6 @@
 /*
  * common helpers
  */
-static inline uint32_t S_FIXED(float value, uint32_t frac_bits)
-{
-	return value * (1 << frac_bits);
-}
 
 /* 12.4 fixed-point */
 static inline unsigned r600_pack_float_12p4(float x)
@@ -954,4 +955,8 @@
 #define     V_028A6C_OUTPRIM_TYPE_TRISTRIP             2
 
 unsigned r600_conv_prim_to_gs_out(unsigned mode);
+
+void eg_trace_emit(struct r600_context *rctx);
+void eg_dump_debug_state(struct pipe_context *ctx, FILE *f,
+			 unsigned flags);
 #endif
diff --git a/src/gallium/drivers/r600/r600_public.h b/src/gallium/drivers/r600/r600_public.h
index e4fe23a..2018440 100644
--- a/src/gallium/drivers/r600/r600_public.h
+++ b/src/gallium/drivers/r600/r600_public.h
@@ -25,6 +25,6 @@
 
 struct radeon_winsys;
 
-struct pipe_screen *r600_screen_create(struct radeon_winsys *ws);
+struct pipe_screen *r600_screen_create(struct radeon_winsys *ws, unsigned flags);
 
 #endif
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index bdaf28c..2eb8187 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -39,23 +39,23 @@
 #include <stdio.h>
 #include <errno.h>
 
-/* CAYMAN notes 
+/* CAYMAN notes
 Why CAYMAN got loops for lots of instructions is explained here.
 
 -These 8xx t-slot only ops are implemented in all vector slots.
 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
-These 8xx t-slot only opcodes become vector ops, with all four 
-slots expecting the arguments on sources a and b. Result is 
+These 8xx t-slot only opcodes become vector ops, with all four
+slots expecting the arguments on sources a and b. Result is
 broadcast to all channels.
 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
-These 8xx t-slot only opcodes become vector ops in the z, y, and 
+These 8xx t-slot only opcodes become vector ops in the z, y, and
 x slots.
 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
 SQRT_IEEE/_64
 SIN/COS
-The w slot may have an independent co-issued operation, or if the 
-result is required to be in the w slot, the opcode above may be 
+The w slot may have an independent co-issued operation, or if the
+result is required to be in the w slot, the opcode above may be
 issued in the w slot as well.
 The compiler must issue the source argument to slots z, y, and x
 */
@@ -1134,9 +1134,10 @@
 
 		if (enabled) {
 			int gpr = gpr_offset + num_regs++;
+			ctx->shader->nsys_inputs++;
 
 			// add to inputs, allocate a gpr
-			k = ctx->shader->ninput ++;
+			k = ctx->shader->ninput++;
 			ctx->shader->input[k].name = name;
 			ctx->shader->input[k].sid = 0;
 			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
@@ -2945,6 +2946,7 @@
 	shader->indirect_files = ctx.info.indirect_files;
 
 	shader->uses_doubles = ctx.info.uses_doubles;
+	shader->nsys_inputs = 0;
 
 	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
 	tgsi_parse_init(&ctx.parse, tokens);
@@ -3158,7 +3160,7 @@
 			goto out_err;
 		}
 	}
-	
+
 	shader->ring_item_sizes[0] = ctx.next_ring_offset;
 	shader->ring_item_sizes[1] = 0;
 	shader->ring_item_sizes[2] = 0;
@@ -4270,7 +4272,7 @@
 	int i, j, r;
 	struct r600_bytecode_alu alu;
 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
-	
+
 	for (i = 0 ; i < last_slot; i++) {
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 		alu.op = ctx->inst_info->op;
@@ -4797,7 +4799,7 @@
 					alu.last = 1;
 				} else
 					alu.dst.write = 0;
-				
+
 				r = r600_bytecode_add_alu(ctx->bc, &alu);
 				if (r)
 					return r;
@@ -5273,7 +5275,7 @@
 
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ALU_OP1_FLT_TO_UINT;
-		  
+
 			alu.dst.sel = tmp0;
 			alu.dst.chan = 0;
 			alu.dst.write = 1;
@@ -5344,7 +5346,7 @@
 			} else {
 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
 			}
-			
+
 			alu.last = 1;
 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
 				return r;
@@ -5610,7 +5612,7 @@
 			} else {
 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
 			}
-			
+
 			alu.src[1].sel = tmp0;
 			alu.src[1].chan = 2;
 
@@ -6322,12 +6324,10 @@
 	struct r600_bytecode_alu alu;
 	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
 	unsigned location;
-	int input;
+	const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;
 
 	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
 
-	input = inst->Src[0].Register.Index;
-
 	/* Interpolators have been marked for use already by allocate_system_value_inputs */
 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
 		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
@@ -7012,7 +7012,7 @@
 		r = r600_bytecode_add_alu(ctx->bc, &alu);
 		if (r)
 			return r;
-		/* write initial compare value into Z component 
+		/* write initial compare value into Z component
 		  - W src 0 for shadow cube
 		  - X src 1 for shadow cube array */
 		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
@@ -7090,7 +7090,7 @@
 				r = r600_bytecode_add_alu(ctx->bc, &alu);
 				if (r)
 					return r;
-					
+
 				r = r600_bytecode_add_tex(ctx->bc, &tex);
 				if (r)
 					return r;
@@ -7417,7 +7417,7 @@
 	/* does this shader want a num layers from TXQ for a cube array? */
 	if (has_txq_cube_array_z) {
 		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
-		
+
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 		alu.op = ALU_OP1_MOV;
 
@@ -8071,10 +8071,10 @@
 				alu.op = ALU_OP1_LOG_IEEE;
 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
 				r600_bytecode_src_set_abs(&alu.src[0]);
-			
+
 				alu.dst.sel = ctx->temp_reg;
 				alu.dst.chan = i;
-				if (i == 0) 
+				if (i == 0)
 					alu.dst.write = 1;
 				if (i == 2)
 					alu.last = 1;
@@ -8089,7 +8089,7 @@
 			alu.op = ALU_OP1_LOG_IEEE;
 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
 			r600_bytecode_src_set_abs(&alu.src[0]);
-			
+
 			alu.dst.sel = ctx->temp_reg;
 			alu.dst.chan = 0;
 			alu.dst.write = 1;
@@ -8130,10 +8130,10 @@
 					alu.dst.write = 1;
 				if (i == 2)
 					alu.last = 1;
-				
+
 				r = r600_bytecode_add_alu(ctx->bc, &alu);
 				if (r)
-					return r;	
+					return r;
 			}
 		} else {
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
@@ -8214,7 +8214,7 @@
 					alu.dst.write = 1;
 				if (i == 2)
 					alu.last = 1;
-				
+
 				r = r600_bytecode_add_alu(ctx->bc, &alu);
 				if (r)
 					return r;
@@ -8954,7 +8954,7 @@
 
 		alu.src[0].sel = ctx->temp_reg;
 		alu.src[0].chan = i;
-		
+
 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
 		if (i == lasti) {
 			alu.last = 1;
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index cfdb020..9032d50 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -49,11 +49,11 @@
 	int			spi_sid;
 	unsigned		interpolate;
 	unsigned		ij_index;
-	unsigned        interpolate_location; //  TGSI_INTERPOLATE_LOC_CENTER, CENTROID, SAMPLE
+	unsigned		interpolate_location; //  TGSI_INTERPOLATE_LOC_CENTER, CENTROID, SAMPLE
 	unsigned		lds_pos; /* for evergreen */
 	unsigned		back_color_input;
 	unsigned		write_mask;
-	int				ring_offset;
+	int			ring_offset;
 };
 
 struct r600_shader {
@@ -62,6 +62,7 @@
 	unsigned		ninput;
 	unsigned		noutput;
 	unsigned		nlds;
+	unsigned		nsys_inputs;
 	struct r600_shader_io	input[64];
 	struct r600_shader_io	output[64];
 	boolean			uses_kill;
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index d58c6f9..aa38381 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -197,6 +197,10 @@
 #define     EG_V_SQ_ALU_SRC_LDS_DIRECT_A                             0x000000DF
 #define     EG_V_SQ_ALU_SRC_LDS_DIRECT_B                             0x000000E0
 
+#define     EG_V_SQ_ALU_SRC_HW_WAVE_ID                               0x000000E7
+#define     EG_V_SQ_ALU_SRC_SIMD_ID                                  0x000000E8
+#define     EG_V_SQ_ALU_SRC_SE_ID                                    0x000000E9
+
 #define     V_SQ_ALU_SRC_0                                           0x000000F8
 #define     V_SQ_ALU_SRC_1                                           0x000000F9
 #define     V_SQ_ALU_SRC_1_INT                                       0x000000FA
@@ -493,4 +497,39 @@
 #define SQ_VTX_FETCH_INSTANCE_DATA 1
 #define SQ_VTX_FETCH_NO_INDEX_OFFSET 2
 
+/* EG RAT functions */
+#define       V_RAT_INST_NOP                                         0
+#define       V_RAT_INST_STORE_TYPED                                 1
+#define       V_RAT_INST_CMPXCHG_INT                                 4
+#define       V_RAT_INST_ADD                                         7
+#define       V_RAT_INST_SUB                                         8
+#define       V_RAT_INST_RSUB                                        9
+#define       V_RAT_INST_MIN_INT                                     10
+#define       V_RAT_INST_MIN_UINT                                    11
+#define       V_RAT_INST_MAX_INT                                     12
+#define       V_RAT_INST_MAX_UINT                                    13
+#define       V_RAT_INST_AND                                         14
+#define       V_RAT_INST_OR                                          15
+#define       V_RAT_INST_XOR                                         16
+#define       V_RAT_INST_INC_UINT                                    18
+#define       V_RAT_INST_DEC_UINT                                    19
+#define       V_RAT_INST_STORE_DWORD                                 20
+#define       V_RAT_INST_STORE_SHORT                                 21
+#define       V_RAT_INST_STORE_BYTE                                  22
+#define       V_RAT_INST_NOP_RTN                                     32
+#define       V_RAT_INST_XCHG_RTN                                    34
+#define       V_RAT_INST_CMPXCHG_INT_RTN                             36
+#define       V_RAT_INST_ADD_RTN                                     39
+#define       V_RAT_INST_SUB_RTN                                     40
+#define       V_RAT_INST_RSUB_RTN                                    41
+#define       V_RAT_INST_MIN_INT_RTN                                 42
+#define       V_RAT_INST_MIN_UINT_RTN                                43
+#define       V_RAT_INST_MAX_INT_RTN                                 44
+#define       V_RAT_INST_MAX_UINT_RTN                                45
+#define       V_RAT_INST_AND_RTN                                     46
+#define       V_RAT_INST_OR_RTN                                      47
+#define       V_RAT_INST_XOR_RTN                                     48
+#define       V_RAT_INST_INC_UINT_RTN                                50
+#define       V_RAT_INST_DEC_UINT_RTN                                51
+
 #endif
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 06100ab..dca8fe5 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1061,8 +1061,8 @@
 	surf->db_prefetch_limit = (rtex->surface.u.legacy.level[level].nblk_y / 8) - 1;
 
 	/* use htile only for first level */
-	if (rtex->htile_buffer && !level) {
-		surf->db_htile_data_base = 0;
+	if (rtex->htile_offset && !level) {
+		surf->db_htile_data_base = rtex->htile_offset >> 8;
 		surf->db_htile_surface = S_028D24_HTILE_WIDTH(1) |
 					 S_028D24_HTILE_HEIGHT(1) |
 					 S_028D24_FULL_CACHE(1);
@@ -1209,6 +1209,7 @@
 	r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
 
 	r600_set_sample_locations_constant_buffer(rctx);
+	rctx->framebuffer.do_update_surf_dirtiness = true;
 }
 
 static uint32_t sample_locs_2x[] = {
@@ -1542,7 +1543,7 @@
 		radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(rtex->depth_clear_value));
 		radeon_set_context_reg(cs, R_028D24_DB_HTILE_SURFACE, a->rsurf->db_htile_surface);
 		radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base);
-		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer,
+		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, &rtex->resource,
 						  RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
 		radeon_emit(cs, reloc_idx);
@@ -1657,7 +1658,7 @@
 		unsigned buffer_index = u_bit_scan(&dirty_mask);
 
 		vb = &rctx->vertex_buffer_state.vb[buffer_index];
-		rbuffer = (struct r600_resource*)vb->buffer;
+		rbuffer = (struct r600_resource*)vb->buffer.resource;
 		assert(rbuffer);
 
 		offset = vb->buffer_offset;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 5be49dc..4c97efa 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -99,6 +99,7 @@
 		       R600_CONTEXT_FLUSH_AND_INV_CB |
 		       R600_CONTEXT_FLUSH_AND_INV |
 		       R600_CONTEXT_WAIT_3D_IDLE;
+	rctx->framebuffer.do_update_surf_dirtiness = true;
 }
 
 static unsigned r600_conv_pipe_prim(unsigned prim)
@@ -522,20 +523,6 @@
 	FREE(shader);
 }
 
-static void r600_set_index_buffer(struct pipe_context *ctx,
-			   const struct pipe_index_buffer *ib)
-{
-	struct r600_context *rctx = (struct r600_context *)ctx;
-
-	if (ib) {
-		pipe_resource_reference(&rctx->index_buffer.buffer, ib->buffer);
-		memcpy(&rctx->index_buffer, ib, sizeof(*ib));
-		r600_context_add_resource_size(ctx, ib->buffer);
-	} else {
-		pipe_resource_reference(&rctx->index_buffer.buffer, NULL);
-	}
-}
-
 void r600_vertex_buffers_dirty(struct r600_context *rctx)
 {
 	if (rctx->vertex_buffer_state.dirty_mask) {
@@ -561,21 +548,21 @@
 	if (input) {
 		for (i = 0; i < count; i++) {
 			if (memcmp(&input[i], &vb[i], sizeof(struct pipe_vertex_buffer))) {
-				if (input[i].buffer) {
+				if (input[i].buffer.resource) {
 					vb[i].stride = input[i].stride;
 					vb[i].buffer_offset = input[i].buffer_offset;
-					pipe_resource_reference(&vb[i].buffer, input[i].buffer);
+					pipe_resource_reference(&vb[i].buffer.resource, input[i].buffer.resource);
 					new_buffer_mask |= 1 << i;
-					r600_context_add_resource_size(ctx, input[i].buffer);
+					r600_context_add_resource_size(ctx, input[i].buffer.resource);
 				} else {
-					pipe_resource_reference(&vb[i].buffer, NULL);
+					pipe_resource_reference(&vb[i].buffer.resource, NULL);
 					disable_mask |= 1 << i;
 				}
 			}
 		}
 	} else {
 		for (i = 0; i < count; i++) {
-			pipe_resource_reference(&vb[i].buffer, NULL);
+			pipe_resource_reference(&vb[i].buffer.resource, NULL);
 		}
 		disable_mask = ((1ull << count) - 1);
 	}
@@ -860,11 +847,11 @@
 			case TGSI_SEMANTIC_TESSOUTER:
 			case TGSI_SEMANTIC_PATCH:
 				sel->lds_patch_outputs_written_mask |=
-					1llu << r600_get_lds_unique_index(name, index);
+					1ull << r600_get_lds_unique_index(name, index);
 				break;
 			default:
 				sel->lds_outputs_written_mask |=
-					1llu << r600_get_lds_unique_index(name, index);
+					1ull << r600_get_lds_unique_index(name, index);
 			}
 		}
 		break;
@@ -1413,6 +1400,32 @@
 		ureg_create_shader_and_destroy(ureg, &rctx->b.b);
 }
 
+static void r600_update_compressed_resource_state(struct r600_context *rctx)
+{
+	unsigned i;
+	unsigned counter;
+	/* Step 1: if the screen's compressed_colortex_counter differs from the value cached in this context, refresh every stage's colortex mask. */
+	counter = p_atomic_read(&rctx->screen->b.compressed_colortex_counter);
+	if (counter != rctx->b.last_compressed_colortex_counter) {
+		rctx->b.last_compressed_colortex_counter = counter;
+
+		for (i = 0; i < PIPE_SHADER_TYPES; ++i) {
+			r600_update_compressed_colortex_mask(&rctx->samplers[i].views);
+		}
+	}
+
+	/* Step 2: for each shader stage, call the depth/color decompress helpers when the matching mask is non-zero. */
+	for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+		struct r600_samplerview_state *views = &rctx->samplers[i].views;
+		if (views->compressed_depthtex_mask) {
+			r600_decompress_depth_textures(rctx, views);
+		}
+		if (views->compressed_colortex_mask) {
+			r600_decompress_color_textures(rctx, views);
+		}
+	}
+}
+
 #define SELECT_SHADER_OR_FAIL(x) do {					\
 		r600_shader_select(ctx, rctx->x##_shader, &x##_dirty);	\
 		if (unlikely(!rctx->x##_shader->current))		\
@@ -1453,30 +1466,8 @@
 	bool need_buf_const;
 	struct r600_pipe_shader *clip_so_current = NULL;
 
-	if (!rctx->blitter->running) {
-		unsigned i;
-		unsigned counter;
-
-		counter = p_atomic_read(&rctx->screen->b.compressed_colortex_counter);
-		if (counter != rctx->b.last_compressed_colortex_counter) {
-			rctx->b.last_compressed_colortex_counter = counter;
-
-			for (i = 0; i < PIPE_SHADER_TYPES; ++i) {
-				r600_update_compressed_colortex_mask(&rctx->samplers[i].views);
-			}
-		}
-
-		/* Decompress textures if needed. */
-		for (i = 0; i < PIPE_SHADER_TYPES; i++) {
-			struct r600_samplerview_state *views = &rctx->samplers[i].views;
-			if (views->compressed_depthtex_mask) {
-				r600_decompress_depth_textures(rctx, views);
-			}
-			if (views->compressed_colortex_mask) {
-				r600_decompress_color_textures(rctx, views);
-			}
-		}
-	}
+	if (!rctx->blitter->running)
+		r600_update_compressed_resource_state(rctx);
 
 	SELECT_SHADER_OR_FAIL(ps);
 
@@ -1701,14 +1692,16 @@
 static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct pipe_index_buffer ib = {};
+	struct pipe_resource *indexbuf = info->has_user_indices ? NULL : info->index.resource;
 	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
+	bool has_user_indices = info->has_user_indices;
 	uint64_t mask;
-	unsigned num_patches, dirty_tex_counter;
+	unsigned num_patches, dirty_tex_counter, index_offset = 0;
+	unsigned index_size = info->index_size;
 	int index_bias;
 
-	if (!info->indirect && !info->count && (info->indexed || !info->count_from_stream_output)) {
+	if (!info->indirect && !info->count && (index_size || !info->count_from_stream_output)) {
 		return;
 	}
 
@@ -1732,6 +1725,7 @@
 	if (unlikely(dirty_tex_counter != rctx->b.last_dirty_tex_counter)) {
 		rctx->b.last_dirty_tex_counter = dirty_tex_counter;
 		r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
+		rctx->framebuffer.do_update_surf_dirtiness = true;
 	}
 
 	if (!r600_update_derived_state(rctx)) {
@@ -1745,18 +1739,11 @@
 		: (rctx->tes_shader)? rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]
 		: info->mode;
 
-	if (info->indexed) {
-		/* Initialize the index buffer struct. */
-		pipe_resource_reference(&ib.buffer, rctx->index_buffer.buffer);
-		ib.user_buffer = rctx->index_buffer.user_buffer;
-		ib.index_size = rctx->index_buffer.index_size;
-		ib.offset = rctx->index_buffer.offset;
-		if (!info->indirect) {
-			ib.offset += info->start * ib.index_size;
-		}
+	if (index_size) {
+		index_offset += info->start * index_size;
 
 		/* Translate 8-bit indices to 16-bit. */
-		if (unlikely(ib.index_size == 1)) {
+		if (unlikely(index_size == 1)) {
 			struct pipe_resource *out_buffer = NULL;
 			unsigned out_offset;
 			void *ptr;
@@ -1768,12 +1755,12 @@
 			}
 			else {
 				/* Have to get start/count from indirect buffer, slow path ahead... */
-				struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
+				struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect->buffer;
 				unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource,
 					PIPE_TRANSFER_READ);
 				if (data) {
-					data += info->indirect_offset / sizeof(unsigned);
-					start = data[2] * ib.index_size;
+					data += info->indirect->offset / sizeof(unsigned);
+					start = data[2] * index_size;
 					count = data[0];
 				}
 				else {
@@ -1784,19 +1771,16 @@
 
 			u_upload_alloc(ctx->stream_uploader, start, count * 2,
                                        256, &out_offset, &out_buffer, &ptr);
-			if (unlikely(!ptr)) {
-				pipe_resource_reference(&ib.buffer, NULL);
+			if (unlikely(!ptr))
 				return;
-			}
 
 			util_shorten_ubyte_elts_to_userptr(
-						&rctx->b.b, &ib, 0, 0, ib.offset + start, count, ptr);
+						&rctx->b.b, info, 0, 0, index_offset, count, ptr);
 
-			pipe_resource_reference(&ib.buffer, NULL);
-			ib.user_buffer = NULL;
-			ib.buffer = out_buffer;
-			ib.offset = out_offset;
-			ib.index_size = 2;
+			indexbuf = out_buffer;
+			index_offset = out_offset;
+			index_size = 2;
+			has_user_indices = false;
 		}
 
 		/* Upload the index buffer.
@@ -1804,13 +1788,14 @@
 		 * and the indices are emitted via PKT3_DRAW_INDEX_IMMD.
 		 * Indirect draws never use immediate indices.
 		 * Note: Instanced rendering in combination with immediate indices hangs. */
-		if (ib.user_buffer && (R600_BIG_ENDIAN || info->indirect ||
+		if (has_user_indices && (R600_BIG_ENDIAN || info->indirect ||
 						 info->instance_count > 1 ||
-						 info->count*ib.index_size > 20)) {
+						 info->count*index_size > 20)) {
+			indexbuf = NULL;
 			u_upload_data(ctx->stream_uploader, 0,
-                                      info->count * ib.index_size, 256,
-				      ib.user_buffer, &ib.offset, &ib.buffer);
-			ib.user_buffer = NULL;
+                                      info->count * index_size, 256,
+				      info->index.user, &index_offset, &indexbuf);
+			has_user_indices = false;
 		}
 		index_bias = info->index_bias;
 	} else {
@@ -1838,7 +1823,7 @@
 		evergreen_setup_tess_constants(rctx, info, &num_patches);
 
 	/* Emit states. */
-	r600_need_cs_space(rctx, ib.user_buffer ? 5 : 0, TRUE);
+	r600_need_cs_space(rctx, has_user_indices ? 5 : 0, TRUE);
 	r600_flush_emit(rctx);
 
 	mask = rctx->dirty_atoms;
@@ -1916,7 +1901,7 @@
 		radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
 		radeon_emit(cs, info->instance_count);
 	} else {
-		uint64_t va = r600_resource(info->indirect)->gpu_address;
+		uint64_t va = r600_resource(info->indirect->buffer)->gpu_address;
 		assert(rctx->b.chip_class >= EVERGREEN);
 
 		// Invalidate so non-indirect draw calls reset this state
@@ -1930,26 +1915,26 @@
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
 		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
-							  (struct r600_resource*)info->indirect,
+							  (struct r600_resource*)info->indirect->buffer,
 							  RADEON_USAGE_READ,
                                                           RADEON_PRIO_DRAW_INDIRECT));
 	}
 
-	if (info->indexed) {
+	if (index_size) {
 		radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
-		radeon_emit(cs, ib.index_size == 4 ?
+		radeon_emit(cs, index_size == 4 ?
 				(VGT_INDEX_32 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_32_BIT : 0)) :
 				(VGT_INDEX_16 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_16_BIT : 0)));
 
-		if (ib.user_buffer) {
-			unsigned size_bytes = info->count*ib.index_size;
+		if (has_user_indices) {
+			unsigned size_bytes = info->count*index_size;
 			unsigned size_dw = align(size_bytes, 4) / 4;
 			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, render_cond_bit));
 			radeon_emit(cs, info->count);
 			radeon_emit(cs, V_0287F0_DI_SRC_SEL_IMMEDIATE);
-			radeon_emit_array(cs, ib.user_buffer, size_dw);
+			radeon_emit_array(cs, info->index.user, size_dw);
 		} else {
-			uint64_t va = r600_resource(ib.buffer)->gpu_address + ib.offset;
+			uint64_t va = r600_resource(indexbuf)->gpu_address + index_offset;
 
 			if (likely(!info->indirect)) {
 				radeon_emit(cs, PKT3(PKT3_DRAW_INDEX, 3, render_cond_bit));
@@ -1959,12 +1944,12 @@
 				radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
 				radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
 				radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
-									  (struct r600_resource*)ib.buffer,
+									  (struct r600_resource*)indexbuf,
 									  RADEON_USAGE_READ,
                                                                           RADEON_PRIO_INDEX_BUFFER));
 			}
 			else {
-				uint32_t max_size = (ib.buffer->width0 - ib.offset) / ib.index_size;
+				uint32_t max_size = (indexbuf->width0 - index_offset) / index_size;
 
 				radeon_emit(cs, PKT3(EG_PKT3_INDEX_BASE, 1, 0));
 				radeon_emit(cs, va);
@@ -1972,7 +1957,7 @@
 
 				radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
 				radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
-									  (struct r600_resource*)ib.buffer,
+									  (struct r600_resource*)indexbuf,
 									  RADEON_USAGE_READ,
                                                                           RADEON_PRIO_INDEX_BUFFER));
 
@@ -1980,7 +1965,7 @@
 				radeon_emit(cs, max_size);
 
 				radeon_emit(cs, PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, render_cond_bit));
-				radeon_emit(cs, info->indirect_offset);
+				radeon_emit(cs, info->indirect->offset);
 				radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
 			}
 		}
@@ -2010,7 +1995,7 @@
 		}
 		else {
 			radeon_emit(cs, PKT3(EG_PKT3_DRAW_INDIRECT, 1, render_cond_bit));
-			radeon_emit(cs, info->indirect_offset);
+			radeon_emit(cs, info->indirect->offset);
 		}
 		radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
 				(info->count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0));
@@ -2034,32 +2019,39 @@
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT));
 	}
 
-	/* Set the depth buffer as dirty. */
-	if (rctx->framebuffer.state.zsbuf) {
-		struct pipe_surface *surf = rctx->framebuffer.state.zsbuf;
-		struct r600_texture *rtex = (struct r600_texture *)surf->texture;
+	if (rctx->trace_buf)
+		eg_trace_emit(rctx);
 
-		rtex->dirty_level_mask |= 1 << surf->u.tex.level;
-
-		if (rtex->surface.flags & RADEON_SURF_SBUFFER)
-			rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
-	}
-	if (rctx->framebuffer.compressed_cb_mask) {
-		struct pipe_surface *surf;
-		struct r600_texture *rtex;
-		unsigned mask = rctx->framebuffer.compressed_cb_mask;
-
-		do {
-			unsigned i = u_bit_scan(&mask);
-			surf = rctx->framebuffer.state.cbufs[i];
-			rtex = (struct r600_texture*)surf->texture;
+	if (rctx->framebuffer.do_update_surf_dirtiness) {
+		/* Set the depth buffer as dirty. */
+		if (rctx->framebuffer.state.zsbuf) {
+			struct pipe_surface *surf = rctx->framebuffer.state.zsbuf;
+			struct r600_texture *rtex = (struct r600_texture *)surf->texture;
 
 			rtex->dirty_level_mask |= 1 << surf->u.tex.level;
 
-		} while (mask);
+			if (rtex->surface.flags & RADEON_SURF_SBUFFER)
+				rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
+		}
+		if (rctx->framebuffer.compressed_cb_mask) {
+			struct pipe_surface *surf;
+			struct r600_texture *rtex;
+			unsigned mask = rctx->framebuffer.compressed_cb_mask;
+
+			do {
+				unsigned i = u_bit_scan(&mask);
+				surf = rctx->framebuffer.state.cbufs[i];
+				rtex = (struct r600_texture*)surf->texture;
+
+				rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+
+			} while (mask);
+		}
+		rctx->framebuffer.do_update_surf_dirtiness = false;
 	}
 
-	pipe_resource_reference(&ib.buffer, NULL);
+	if (index_size && indexbuf != info->index.resource)
+		pipe_resource_reference(&indexbuf, NULL);
 	rctx->b.num_draw_calls++;
 }
 
@@ -2833,7 +2825,7 @@
 	mask = rctx->vertex_buffer_state.enabled_mask;
 	while (mask) {
 		i = u_bit_scan(&mask);
-		if (rctx->vertex_buffer_state.vb[i].buffer == &rbuffer->b.b) {
+		if (rctx->vertex_buffer_state.vb[i].buffer.resource == &rbuffer->b.b) {
 			rctx->vertex_buffer_state.dirty_mask |= 1 << i;
 			r600_vertex_buffers_dirty(rctx);
 		}
@@ -2966,7 +2958,6 @@
 	rctx->b.b.set_sample_mask = r600_set_sample_mask;
 	rctx->b.b.set_stencil_ref = r600_set_pipe_stencil_ref;
 	rctx->b.b.set_vertex_buffers = r600_set_vertex_buffers;
-	rctx->b.b.set_index_buffer = r600_set_index_buffer;
 	rctx->b.b.set_sampler_views = r600_set_sampler_views;
 	rctx->b.b.sampler_view_destroy = r600_sampler_view_destroy;
 	rctx->b.b.texture_barrier = r600_texture_barrier;
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 75d64c1..0d04708 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -219,8 +219,12 @@
 #define R_008C4C_SQ_GSVS_RING_SIZE                   0x008C4C
 #define R_008C50_SQ_ESTMP_RING_BASE                  0x008C50
 #define R_008C54_SQ_ESTMP_RING_SIZE                  0x008C54
-#define R_008C50_SQ_GSTMP_RING_BASE                  0x008C58
-#define R_008C54_SQ_GSTMP_RING_SIZE                  0x008C5C
+#define R_008C58_SQ_GSTMP_RING_BASE                  0x008C58
+#define R_008C5C_SQ_GSTMP_RING_SIZE                  0x008C5C
+#define R_008C68_SQ_PSTMP_RING_BASE                  0x008C68
+#define R_008C6C_SQ_PSTMP_RING_SIZE                  0x008C6C
+#define R_008C60_SQ_VSTMP_RING_BASE                  0x008C60
+#define R_008C64_SQ_VSTMP_RING_SIZE                  0x008C64
 
 #define R_0088C8_VGT_GS_PER_ES                       0x0088C8
 #define R_0088CC_VGT_ES_PER_GS                       0x0088CC
@@ -3773,6 +3777,12 @@
 #define SQ_TEX_INST_SAMPLE_C_G_LB	0x1E
 #define SQ_TEX_INST_SAMPLE_C_G_LZ	0x1F
 
+#define EG_0802C_GRBM_GFX_INDEX          			0x802C
+#define   S_0802C_INSTANCE_INDEX(x)					  (((x) & 0xffff) << 0)
+#define   S_0802C_SE_INDEX(x)						  (((x) & 0x3fff) << 16)
+#define   S_0802C_INSTANCE_BROADCAST_WRITES(x)		  (((x) & 0x1) << 30)
+#define   S_0802C_SE_BROADCAST_WRITES(x)			  (((x) & 0x1) << 31)
+
 #define CM_R_028AA8_IA_MULTI_VGT_PARAM                0x028AA8
 #define   S_028AA8_PRIMGROUP_SIZE(x)                   (((unsigned)(x) & 0xFFFF) << 0)
 #define   G_028AA8_PRIMGROUP_SIZE(x)                   (((x) >> 0) & 0xFFFF)
diff --git a/src/gallium/drivers/r600/r700_sq.h b/src/gallium/drivers/r600/r700_sq.h
index d881012..81e0e7a 100644
--- a/src/gallium/drivers/r600/r700_sq.h
+++ b/src/gallium/drivers/r600/r700_sq.h
@@ -543,4 +543,34 @@
 #define   G_SQ_TEX_WORD2_SRC_SEL_W(x)                                (((x) >> 29) & 0x7)
 #define   C_SQ_TEX_WORD2_SRC_SEL_W                                   0x1FFFFFFF
 
+#define P_SQ_MEM_RD_WORD0
+#define   S_SQ_MEM_RD_WORD0_MEM_INST(x)                              (((x) & 0x1F) << 0)
+#define   S_SQ_MEM_RD_WORD0_ELEM_SIZE(x)                             (((x) & 0x3) << 5)
+#define   S_SQ_MEM_RD_WORD0_FETCH_WHOLE_QUAD(x)                      (((x) & 0x1) << 7)
+#define   S_SQ_MEM_RD_WORD0_MEM_OP(x)                                (((x) & 0x7) << 8)
+#define   S_SQ_MEM_RD_WORD0_UNCACHED(x)                              (((x) & 0x1) << 11)
+#define   S_SQ_MEM_RD_WORD0_INDEXED(x)                               (((x) & 0x1) << 12)
+#define   S_SQ_MEM_RD_WORD0_SRC_SEL_Y(x)                             (((x) & 0x3) << 13)
+#define   S_SQ_MEM_RD_WORD0_SRC_GPR(x)                               (((x) & 0x7F) << 16)
+#define   S_SQ_MEM_RD_WORD0_SRC_REL(x)                               (((x) & 0x1) << 23)
+#define   S_SQ_MEM_RD_WORD0_SRC_SEL_X(x)                             (((x) & 0x3) << 24)
+#define   S_SQ_MEM_RD_WORD0_BURST_COUNT(x)                           (((x) & 0xF) << 26)
+#define   S_SQ_MEM_RD_WORD0_LDS_REQ(x)                               (((x) & 0x1) << 30)
+#define   S_SQ_MEM_RD_WORD0_COALESCED_READ(x)                        (((x) & 0x1) << 31)
+#define P_SQ_MEM_RD_WORD1
+#define   S_SQ_MEM_RD_WORD1_DST_GPR(x)                               (((x) & 0x7f) << 0)
+#define   S_SQ_MEM_RD_WORD1_DST_REL(x)                               (((x) & 0x1) << 7)
+#define   S_SQ_MEM_RD_WORD1_DST_SEL_X(x)                             (((x) & 0x7) << 9)
+#define   S_SQ_MEM_RD_WORD1_DST_SEL_Y(x)                             (((x) & 0x7) << 12)
+#define   S_SQ_MEM_RD_WORD1_DST_SEL_Z(x)                             (((x) & 0x7) << 15)
+#define   S_SQ_MEM_RD_WORD1_DST_SEL_W(x)                             (((x) & 0x7) << 18)
+#define   S_SQ_MEM_RD_WORD1_DATA_FORMAT(x)                           (((x) & 0x3F) << 22)
+#define   S_SQ_MEM_RD_WORD1_NUM_FORMAT_ALL(x)                        (((x) & 0x3) << 28)
+#define   S_SQ_MEM_RD_WORD1_FORMAT_COMP_ALL(x)                       (((x) & 0x1) << 30)
+#define   S_SQ_MEM_RD_WORD1_SRF_MODE_ALL(x)                          (((x) & 0x1) << 31)
+#define P_SQ_MEM_RD_WORD2
+#define   S_SQ_MEM_RD_WORD2_ARRAY_BASE(x)                            (((x) & 0x1FFF) << 0)
+#define   S_SQ_MEM_RD_WORD2_ENDIAN_SWAP(x)                           (((x) & 0x3) << 16)
+#define   S_SQ_MEM_RD_WORD2_ARRAY_SIZE(x)                            (((x) & 0xFFF) << 20)
+
 #endif
diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h
index 2c662ac..fed041c 100644
--- a/src/gallium/drivers/r600/sb/sb_bc.h
+++ b/src/gallium/drivers/r600/sb/sb_bc.h
@@ -787,7 +787,7 @@
 	} \
 	unsigned get_##name() const { \
 		return (value>>(first_bit))&((1ull<<((last_bit)-(first_bit)+1))-1); \
-	} \
+	}
 
 #define BC_RSRVD(fmt, last_bit, first_bit)
 
diff --git a/src/gallium/drivers/r600/sb/sb_bc_fmt_def.inc b/src/gallium/drivers/r600/sb/sb_bc_fmt_def.inc
index 35ecee5..5e6fb25 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_fmt_def.inc
+++ b/src/gallium/drivers/r600/sb/sb_bc_fmt_def.inc
@@ -560,7 +560,7 @@
 BC_FIELD(MEM_GDS_WORD1,     UAV_INDEX_MODE,         U_IM,      25, 24)
 BC_FIELD(MEM_GDS_WORD1,     UAV_ID,                 U_ID,      29, 26)
 BC_FIELD(MEM_GDS_WORD1,     ALLOC_CONSUME,          AC,        30, 30)
-BC_FIELD(MEM_GDS_WORD1,     BCARD_FIRST_REQ,        BFR,       31, 31)
+BC_FIELD(MEM_GDS_WORD1,     BCAST_FIRST_REQ,        BFR,       31, 31)
 BC_FORMAT_END(MEM_GDS_WORD1)
 
 BC_FORMAT_BEGIN_HW(MEM_GDS_WORD2, EGCM)
diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp
index 5113b75..2fbec2f 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.cpp
+++ b/src/gallium/drivers/r600/sb/sb_sched.cpp
@@ -711,22 +711,24 @@
 }
 
 int post_scheduler::run() {
-	run_on(sh.root);
-	return 0;
+	return run_on(sh.root) ? 0 : 1;
 }
 
-void post_scheduler::run_on(container_node* n) {
-
+bool post_scheduler::run_on(container_node* n) {
+	int r = true;
 	for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
 		if (I->is_container()) {
 			if (I->subtype == NST_BB) {
 				bb_node* bb = static_cast<bb_node*>(*I);
-				schedule_bb(bb);
+				r = schedule_bb(bb);
 			} else {
-				run_on(static_cast<container_node*>(*I));
+				r = run_on(static_cast<container_node*>(*I));
 			}
+			if (!r)
+				break;
 		}
 	}
+	return r;
 }
 
 void post_scheduler::init_uc_val(container_node *c, value *v) {
@@ -758,7 +760,7 @@
 	return F == ucm.end() ? 0 : F->second;
 }
 
-void post_scheduler::schedule_bb(bb_node* bb) {
+bool post_scheduler::schedule_bb(bb_node* bb) {
 	PSC_DUMP(
 		sblog << "scheduling BB " << bb->id << "\n";
 		if (!pending.empty())
@@ -791,8 +793,10 @@
 
 		if (n->is_alu_clause()) {
 			n->remove();
-			process_alu(static_cast<container_node*>(n));
-			continue;
+			bool r = process_alu(static_cast<container_node*>(n));
+			if (r)
+				continue;
+			return false;
 		}
 
 		n->remove();
@@ -800,6 +804,7 @@
 	}
 
 	this->cur_bb = NULL;
+	return true;
 }
 
 void post_scheduler::init_regmap() {
@@ -933,10 +938,10 @@
 	cur_bb->push_front(c);
 }
 
-void post_scheduler::process_alu(container_node *c) {
+bool post_scheduler::process_alu(container_node *c) {
 
 	if (c->empty())
-		return;
+		return true;
 
 	ucm.clear();
 	alu.reset();
@@ -973,7 +978,7 @@
 		}
 	}
 
-	schedule_alu(c);
+	return schedule_alu(c);
 }
 
 void post_scheduler::update_local_interferences() {
@@ -1135,15 +1140,20 @@
 	emit_index_registers();
 }
 
-void post_scheduler::schedule_alu(container_node *c) {
+bool post_scheduler::schedule_alu(container_node *c) {
 
 	assert(!ready.empty() || !ready_copies.empty());
 
-	while (1) {
-
+	bool improving = true;
+	int last_pending = pending.count();
+	while (improving) {
 		prev_regmap = regmap;
-
 		if (!prepare_alu_group()) {
+
+			int new_pending = pending.count();
+			improving = (new_pending < last_pending) || (last_pending == 0);
+			last_pending = new_pending;
+
 			if (alu.current_idx[0] || alu.current_idx[1]) {
 				regmap = prev_regmap;
 				emit_clause();
@@ -1186,6 +1196,7 @@
 		dump::dump_op_list(&pending);
 		assert(!"unscheduled pending instructions");
 	}
+	return improving;
 }
 
 void post_scheduler::add_interferences(value *v, sb_bitset &rb, val_set &vs) {
diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h
index 05b428c..5a26634 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.h
+++ b/src/gallium/drivers/r600/sb/sb_sched.h
@@ -267,14 +267,14 @@
 		live(), ucm(), alu(sh),	regmap(), cleared_interf() {}
 
 	virtual int run();
-	void run_on(container_node *n);
-	void schedule_bb(bb_node *bb);
+	bool run_on(container_node *n);
+	bool schedule_bb(bb_node *bb);
 
 	void load_index_register(value *v, unsigned idx);
 	void process_fetch(container_node *c);
 
-	void process_alu(container_node *c);
-	void schedule_alu(container_node *c);
+	bool process_alu(container_node *c);
+	bool schedule_alu(container_node *c);
 	bool prepare_alu_group();
 
 	void release_op(node *n);
diff --git a/src/gallium/drivers/radeon/Android.mk b/src/gallium/drivers/radeon/Android.mk
index 2bddac8..c2d3a1c 100644
--- a/src/gallium/drivers/radeon/Android.mk
+++ b/src/gallium/drivers/radeon/Android.mk
@@ -30,12 +30,18 @@
 
 LOCAL_SRC_FILES := $(C_SOURCES)
 
-ifeq ($(MESA_ENABLE_LLVM),true)
-LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU   # instructs LLVM to declare LLVMInitializeAMDGPU* functions
-endif
-
 LOCAL_SHARED_LIBRARIES := libdrm_radeon
 LOCAL_MODULE := libmesa_pipe_radeon
 
+ifeq ($(MESA_ENABLE_LLVM),true)
+LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU   # instructs LLVM to declare LLVMInitializeAMDGPU* functions
+$(call mesa-build-with-llvm)
+endif
+
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_R600)$(HAVE_GALLIUM_RADEONSI),)
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE))
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/drivers/radeon/LLVM_REVISION.txt b/src/gallium/drivers/radeon/LLVM_REVISION.txt
deleted file mode 100644
index e840995..0000000
--- a/src/gallium/drivers/radeon/LLVM_REVISION.txt
+++ /dev/null
@@ -1 +0,0 @@
-@181269
diff --git a/src/gallium/drivers/radeon/Makefile.am b/src/gallium/drivers/radeon/Makefile.am
index 2be6af4..7f64b76 100644
--- a/src/gallium/drivers/radeon/Makefile.am
+++ b/src/gallium/drivers/radeon/Makefile.am
@@ -26,6 +26,3 @@
 	$(LLVM_LDFLAGS)
 
 endif
-
-EXTRA_DIST = \
-	LLVM_REVISION.txt
diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources
index 9dd4e1a..f4e817e 100644
--- a/src/gallium/drivers/radeon/Makefile.sources
+++ b/src/gallium/drivers/radeon/Makefile.sources
@@ -14,6 +14,8 @@
 	r600_viewport.c \
 	radeon_uvd.c \
 	radeon_uvd.h \
+	radeon_vcn_dec.c \
+	radeon_vcn_dec.h \
 	radeon_vce_40_2_2.c \
 	radeon_vce_50.c \
 	radeon_vce_52.c \
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index b2289e2..dd1c209 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -110,6 +110,8 @@
 	res->bo_size = size;
 	res->bo_alignment = alignment;
 	res->flags = 0;
+	res->texture_handle_allocated = false;
+	res->image_handle_allocated = false;
 
 	switch (res->b.b.usage) {
 	case PIPE_USAGE_STREAM:
@@ -130,7 +132,6 @@
 			res->flags |= RADEON_FLAG_GTT_WC;
 			break;
 		}
-		res->flags |= RADEON_FLAG_CPU_ACCESS;
 		/* fall through */
 	case PIPE_USAGE_DEFAULT:
 	case PIPE_USAGE_IMMUTABLE:
@@ -156,15 +157,12 @@
 		if (rscreen->info.drm_major == 2 &&
 		    rscreen->info.drm_minor < 40)
 			res->domains = RADEON_DOMAIN_GTT;
-		else if (res->domains & RADEON_DOMAIN_VRAM)
-			res->flags |= RADEON_FLAG_CPU_ACCESS;
 	}
 
 	/* Tiled textures are unmappable. Always put them in VRAM. */
 	if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
 	    res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) {
 		res->domains = RADEON_DOMAIN_VRAM;
-		res->flags &= ~RADEON_FLAG_CPU_ACCESS;
 		res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
 			 RADEON_FLAG_GTT_WC;
 	}
@@ -178,12 +176,17 @@
 	 */
 	if (!rscreen->info.has_dedicated_vram &&
 	    (rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) &&
-	    res->domains == RADEON_DOMAIN_VRAM)
+	    res->domains == RADEON_DOMAIN_VRAM) {
 		res->domains = RADEON_DOMAIN_VRAM_GTT;
+		res->flags &= ~RADEON_FLAG_NO_CPU_ACCESS; /* disallowed with VRAM_GTT */
+	}
 
 	if (rscreen->debug_flags & DBG_NO_WC)
 		res->flags &= ~RADEON_FLAG_GTT_WC;
 
+	if (res->b.b.bind & PIPE_BIND_SHARED)
+		res->flags |= RADEON_FLAG_NO_SUBALLOC;
+
 	/* Set expected VRAM and GART usage for the buffer. */
 	res->vram_usage = 0;
 	res->gart_usage = 0;
@@ -238,6 +241,7 @@
 {
 	struct r600_resource *rbuffer = r600_resource(buf);
 
+	threaded_resource_deinit(buf);
 	util_range_destroy(&rbuffer->valid_buffer_range);
 	pb_reference(&rbuffer->buf, NULL);
 	FREE(rbuffer);
@@ -248,7 +252,7 @@
 		       struct r600_resource *rbuffer)
 {
 	/* Shared buffers can't be reallocated. */
-	if (rbuffer->is_shared)
+	if (rbuffer->b.is_shared)
 		return false;
 
 	/* Sparse buffers can't be reallocated. */
@@ -258,7 +262,7 @@
 	/* In AMD_pinned_memory, the user pointer association only gets
 	 * broken when the buffer is explicitly re-allocated.
 	 */
-	if (rctx->ws->buffer_is_user_ptr(rbuffer->buf))
+	if (rbuffer->b.is_user_ptr)
 		return false;
 
 	/* Check if mapping this buffer would cause waiting for the GPU. */
@@ -272,6 +276,30 @@
 	return true;
 }
 
+/* Replace the storage of dst with src. */
+void r600_replace_buffer_storage(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 struct pipe_resource *src)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+	struct r600_resource *rdst = r600_resource(dst);
+	struct r600_resource *rsrc = r600_resource(src);
+	uint64_t old_gpu_address = rdst->gpu_address;
+
+	pb_reference(&rdst->buf, rsrc->buf);
+	rdst->gpu_address = rsrc->gpu_address;
+	rdst->b.b.bind = rsrc->b.b.bind;
+	rdst->flags = rsrc->flags;
+
+	assert(rdst->vram_usage == rsrc->vram_usage);
+	assert(rdst->gart_usage == rsrc->gart_usage);
+	assert(rdst->bo_size == rsrc->bo_size);
+	assert(rdst->bo_alignment == rsrc->bo_alignment);
+	assert(rdst->domains == rsrc->domains);
+
+	rctx->rebind_buffer(ctx, dst, old_gpu_address);
+}
+
 void r600_invalidate_resource(struct pipe_context *ctx,
 			      struct pipe_resource *resource)
 {
@@ -292,18 +320,24 @@
 				      unsigned offset)
 {
 	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
-	struct r600_transfer *transfer = slab_alloc(&rctx->pool_transfers);
+	struct r600_transfer *transfer;
 
-	transfer->transfer.resource = NULL;
-	pipe_resource_reference(&transfer->transfer.resource, resource);
-	transfer->transfer.level = 0;
-	transfer->transfer.usage = usage;
-	transfer->transfer.box = *box;
-	transfer->transfer.stride = 0;
-	transfer->transfer.layer_stride = 0;
+	if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+		transfer = slab_alloc(&rctx->pool_transfers_unsync);
+	else
+		transfer = slab_alloc(&rctx->pool_transfers);
+
+	transfer->b.b.resource = NULL;
+	pipe_resource_reference(&transfer->b.b.resource, resource);
+	transfer->b.b.level = 0;
+	transfer->b.b.usage = usage;
+	transfer->b.b.box = *box;
+	transfer->b.b.stride = 0;
+	transfer->b.b.layer_stride = 0;
+	transfer->b.staging = NULL;
 	transfer->offset = offset;
 	transfer->staging = staging;
-	*ptransfer = &transfer->transfer;
+	*ptransfer = &transfer->b.b;
 	return data;
 }
 
@@ -343,14 +377,15 @@
 	 *
 	 * So don't ever use staging buffers.
 	 */
-	if (rscreen->ws->buffer_is_user_ptr(rbuffer->buf))
+	if (rbuffer->b.is_user_ptr)
 		usage |= PIPE_TRANSFER_PERSISTENT;
 
 	/* See if the buffer range being mapped has never been initialized,
 	 * in which case it can be mapped unsynchronized. */
-	if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
+	if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+		       TC_TRANSFER_MAP_IGNORE_VALID_RANGE)) &&
 	    usage & PIPE_TRANSFER_WRITE &&
-	    !rbuffer->is_shared &&
+	    !rbuffer->b.is_shared &&
 	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
 		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 	}
@@ -362,7 +397,8 @@
 	}
 
 	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
-	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
+	    !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+		       TC_TRANSFER_MAP_NO_INVALIDATE))) {
 		assert(usage & PIPE_TRANSFER_WRITE);
 
 		if (r600_invalidate_buffer(rctx, rbuffer)) {
@@ -418,6 +454,7 @@
 		 (rbuffer->flags & RADEON_FLAG_SPARSE)) {
 		struct r600_resource *staging;
 
+		assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC));
 		staging = (struct r600_resource*) pipe_buffer_create(
 				ctx->screen, 0, PIPE_USAGE_STAGING,
 				box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT));
@@ -482,8 +519,10 @@
 				     struct pipe_transfer *transfer,
 				     const struct pipe_box *rel_box)
 {
-	if (transfer->usage & (PIPE_TRANSFER_WRITE |
-			       PIPE_TRANSFER_FLUSH_EXPLICIT)) {
+	unsigned required_usage = PIPE_TRANSFER_WRITE |
+				  PIPE_TRANSFER_FLUSH_EXPLICIT;
+
+	if ((transfer->usage & required_usage) == required_usage) {
 		struct pipe_box box;
 
 		u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
@@ -501,10 +540,12 @@
 	    !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
 		r600_buffer_do_flush_region(ctx, transfer, &transfer->box);
 
-	if (rtransfer->staging)
-		r600_resource_reference(&rtransfer->staging, NULL);
-
+	r600_resource_reference(&rtransfer->staging, NULL);
+	assert(rtransfer->b.staging == NULL); /* for threaded context only */
 	pipe_resource_reference(&transfer->resource, NULL);
+
+	/* Don't use pool_transfers_unsync. We are always in the driver
+	 * thread. */
 	slab_free(&rctx->pool_transfers, transfer);
 }
 
@@ -551,11 +592,13 @@
 	rbuffer->b.b.next = NULL;
 	pipe_reference_init(&rbuffer->b.b.reference, 1);
 	rbuffer->b.b.screen = screen;
+
 	rbuffer->b.vtbl = &r600_buffer_vtbl;
+	threaded_resource_init(&rbuffer->b.b);
+
 	rbuffer->buf = NULL;
 	rbuffer->bind_history = 0;
 	rbuffer->TC_L2_dirty = false;
-	rbuffer->is_shared = false;
 	util_range_init(&rbuffer->valid_buffer_range);
 	return rbuffer;
 }
@@ -569,8 +612,6 @@
 
 	r600_init_resource_fields(rscreen, rbuffer, templ->width0, alignment);
 
-	if (templ->bind & PIPE_BIND_SHARED)
-		rbuffer->flags |= RADEON_FLAG_HANDLE;
 	if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
 		rbuffer->flags |= RADEON_FLAG_SPARSE;
 
@@ -613,7 +654,9 @@
 
 	rbuffer->domains = RADEON_DOMAIN_GTT;
 	rbuffer->flags = 0;
+	rbuffer->b.is_user_ptr = true;
 	util_range_add(&rbuffer->valid_buffer_range, 0, templ->width0);
+	util_range_add(&rbuffer->b.valid_buffer_range, 0, templ->width0);
 
 	/* Convert a user pointer to a buffer. */
 	rbuffer->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0);
diff --git a/src/gallium/drivers/radeon/r600_gpu_load.c b/src/gallium/drivers/radeon/r600_gpu_load.c
index 3b45545..d8f7c3d 100644
--- a/src/gallium/drivers/radeon/r600_gpu_load.c
+++ b/src/gallium/drivers/radeon/r600_gpu_load.c
@@ -105,7 +105,7 @@
 	UPDATE_COUNTER(gui, GUI_ACTIVE);
 	gui_busy = GUI_ACTIVE(value);
 
-	if (rscreen->chip_class >= CIK) {
+	if (rscreen->chip_class == CIK || rscreen->chip_class == VI) {
 		/* SRBM_STATUS2 */
 		rscreen->ws->read_registers(rscreen->ws, SRBM_STATUS2, 1, &value);
 
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 3b49040..7212727 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -610,6 +610,7 @@
 			      unsigned context_flags)
 {
 	slab_create_child(&rctx->pool_transfers, &rscreen->pool_transfers);
+	slab_create_child(&rctx->pool_transfers_unsync, &rscreen->pool_transfers);
 
 	rctx->screen = rscreen;
 	rctx->ws = rscreen->ws;
@@ -636,6 +637,10 @@
 	else
 		rctx->b.buffer_subdata = r600_buffer_subdata;
 
+	/* Set a reasonable default to avoid a performance regression in r600
+	 * on stable branches. */
+	rctx->current_rast_prim = PIPE_PRIM_TRIANGLES;
+
 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
 		rctx->b.get_device_reset_status = r600_get_reset_status;
 		rctx->gpu_reset_counter =
@@ -671,7 +676,7 @@
 	if (!rctx->ctx)
 		return false;
 
-	if (rscreen->info.has_sdma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
+	if (rscreen->info.num_sdma_rings && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
 		rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
 						   r600_flush_dma_ring,
 						   rctx);
@@ -713,6 +718,7 @@
 		u_upload_destroy(rctx->b.const_uploader);
 
 	slab_destroy_child(&rctx->pool_transfers);
+	slab_destroy_child(&rctx->pool_transfers_unsync);
 
 	if (rctx->allocator_zeroed_memory) {
 		u_suballocator_destroy(rctx->allocator_zeroed_memory);
@@ -769,6 +775,7 @@
 	{ "norbplus", DBG_NO_RB_PLUS, "Disable RB+." },
 	{ "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction Scheduler." },
 	{ "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders compiled on demand" },
+	{ "ce", DBG_CE, "Force enable the constant engine" },
 	{ "noce", DBG_NO_CE, "Disable the constant engine"},
 	{ "unsafemath", DBG_UNSAFE_MATH, "Enable unsafe math shader optimizations" },
 	{ "nodccfb", DBG_NO_DCC_FB, "Disable separate DCC on the main framebuffer" },
@@ -786,7 +793,14 @@
 	return "AMD";
 }
 
-static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
+static const char *r600_get_marketing_name(struct radeon_winsys *ws)
+{
+	if (!ws->get_chip_name)
+		return NULL;
+	return ws->get_chip_name(ws);
+}
+
+static const char *r600_get_family_name(const struct r600_common_screen *rscreen)
 {
 	switch (rscreen->info.family) {
 	case CHIP_R600: return "AMD R600";
@@ -865,8 +879,9 @@
 #endif
 		if (res != -1) {
 			rscreen->disk_shader_cache =
-				disk_cache_create(r600_get_chip_name(rscreen),
-						  timestamp_str);
+				disk_cache_create(r600_get_family_name(rscreen),
+						  timestamp_str,
+						  rscreen->debug_flags);
 			free(timestamp_str);
 		}
 	}
@@ -888,17 +903,12 @@
 static float r600_get_paramf(struct pipe_screen* pscreen,
 			     enum pipe_capf param)
 {
-	struct r600_common_screen *rscreen = (struct r600_common_screen *)pscreen;
-
 	switch (param) {
 	case PIPE_CAPF_MAX_LINE_WIDTH:
 	case PIPE_CAPF_MAX_LINE_WIDTH_AA:
 	case PIPE_CAPF_MAX_POINT_WIDTH:
 	case PIPE_CAPF_MAX_POINT_WIDTH_AA:
-		if (rscreen->family >= CHIP_CEDAR)
-			return 16384.0f;
-		else
-			return 8192.0f;
+		return 8192.0f;
 	case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
 		return 16.0f;
 	case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
@@ -1002,10 +1012,10 @@
 	case CHIP_STONEY:
 		return "stoney";
 	case CHIP_POLARIS10:
-		return HAVE_LLVM >= 0x0309 ? "polaris10" : "carrizo";
+		return "polaris10";
 	case CHIP_POLARIS11:
 	case CHIP_POLARIS12: /* same as polaris11 */
-		return HAVE_LLVM >= 0x0309 ? "polaris11" : "carrizo";
+		return "polaris11";
 	case CHIP_VEGA10:
 	case CHIP_RAVEN:
 		return "gfx900";
@@ -1014,6 +1024,25 @@
 	}
 }
 
+static unsigned get_max_threads_per_block(struct r600_common_screen *screen,
+					  enum pipe_shader_ir ir_type)
+{
+	if (ir_type != PIPE_SHADER_IR_TGSI)
+		return 256;
+
+	/* Only 16 waves per thread-group on gfx9. */
+	if (screen->chip_class >= GFX9)
+		return 1024;
+
+	/* Up to 40 waves per thread-group on GCN < gfx9. Expose a nice
+	 * round number.
+	 */
+	if (screen->chip_class >= SI)
+		return 2048;
+
+	return 256;
+}
+
 static int r600_get_compute_param(struct pipe_screen *screen,
         enum pipe_shader_ir ir_type,
         enum pipe_compute_cap param,
@@ -1068,27 +1097,17 @@
 	case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
 		if (ret) {
 			uint64_t *block_size = ret;
-			if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
-			    ir_type == PIPE_SHADER_IR_TGSI) {
-				block_size[0] = 2048;
-				block_size[1] = 2048;
-				block_size[2] = 2048;
-			} else {
-				block_size[0] = 256;
-				block_size[1] = 256;
-				block_size[2] = 256;
-			}
+			unsigned threads_per_block = get_max_threads_per_block(rscreen, ir_type);
+			block_size[0] = threads_per_block;
+			block_size[1] = threads_per_block;
+			block_size[2] = threads_per_block;
 		}
 		return 3 * sizeof(uint64_t);
 
 	case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
 		if (ret) {
 			uint64_t *max_threads_per_block = ret;
-			if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
-			    ir_type == PIPE_SHADER_IR_TGSI)
-				*max_threads_per_block = 2048;
-			else
-				*max_threads_per_block = 256;
+			*max_threads_per_block = get_max_threads_per_block(rscreen, ir_type);
 		}
 		return sizeof(uint64_t);
 	case PIPE_COMPUTE_CAP_ADDRESS_BITS:
@@ -1176,7 +1195,7 @@
 	case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
 		if (ret) {
 			uint64_t *max_variable_threads_per_block = ret;
-			if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+			if (rscreen->chip_class >= SI &&
 			    ir_type == PIPE_SHADER_IR_TGSI)
 				*max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
 			else
@@ -1220,10 +1239,12 @@
 {
 	struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
 	struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
-	struct r600_common_context *rctx =
-		ctx ? (struct r600_common_context*)ctx : NULL;
+	struct r600_common_context *rctx;
 	int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
 
+	ctx = threaded_context_unwrap_sync(ctx);
+	rctx = ctx ? (struct r600_common_context*)ctx : NULL;
+
 	if (rfence->sdma) {
 		if (!rws->fence_wait(rws, rfence->sdma, timeout))
 			return false;
@@ -1310,12 +1331,19 @@
 }
 
 bool r600_common_screen_init(struct r600_common_screen *rscreen,
-			     struct radeon_winsys *ws)
+			     struct radeon_winsys *ws, unsigned flags)
 {
-	char llvm_string[32] = {}, kernel_version[128] = {};
+	char family_name[32] = {}, llvm_string[32] = {}, kernel_version[128] = {};
 	struct utsname uname_data;
+	const char *chip_name;
 
 	ws->query_info(ws, &rscreen->info);
+	rscreen->ws = ws;
+
+	if ((chip_name = r600_get_marketing_name(ws)))
+		snprintf(family_name, sizeof(family_name), "%s / ", r600_get_family_name(rscreen));
+	else
+		chip_name = r600_get_family_name(rscreen);
 
 	if (uname(&uname_data) == 0)
 		snprintf(kernel_version, sizeof(kernel_version),
@@ -1328,8 +1356,8 @@
 	}
 
 	snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string),
-		 "%s (DRM %i.%i.%i%s%s)",
-		 r600_get_chip_name(rscreen), rscreen->info.drm_major,
+		 "%s (%sDRM %i.%i.%i%s%s)",
+		 chip_name, family_name, rscreen->info.drm_major,
 		 rscreen->info.drm_minor, rscreen->info.drm_patchlevel,
 		 kernel_version, llvm_string);
 
@@ -1346,7 +1374,7 @@
 	rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory;
 	rscreen->b.query_memory_info = r600_query_memory_info;
 
-	if (rscreen->info.has_uvd) {
+	if (rscreen->info.has_hw_decode) {
 		rscreen->b.get_video_param = rvid_get_video_param;
 		rscreen->b.is_video_format_supported = rvid_is_format_supported;
 	} else {
@@ -1357,13 +1385,17 @@
 	r600_init_screen_texture_functions(rscreen);
 	r600_init_screen_query_functions(rscreen);
 
-	rscreen->ws = ws;
 	rscreen->family = rscreen->info.family;
 	rscreen->chip_class = rscreen->info.chip_class;
 	rscreen->debug_flags = debug_get_flags_option("R600_DEBUG", common_debug_options, 0);
 	rscreen->has_rbplus = false;
 	rscreen->rbplus_allowed = false;
 
+	/* Set the flag in debug_flags, so that the shader cache takes it
+	 * into account. */
+	if (flags & PIPE_SCREEN_ENABLE_CORRECT_TGSI_DERIVATIVES_AFTER_KILL)
+		rscreen->debug_flags |= DBG_FS_CORRECT_DERIVS_AFTER_KILL;
+
 	r600_disk_cache_create(rscreen);
 
 	slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64);
@@ -1382,7 +1414,7 @@
 	if (rscreen->debug_flags & DBG_INFO) {
 		printf("pci_id = 0x%x\n", rscreen->info.pci_id);
 		printf("family = %i (%s)\n", rscreen->info.family,
-		       r600_get_chip_name(rscreen));
+		       r600_get_family_name(rscreen));
 		printf("chip_class = %i\n", rscreen->info.chip_class);
 		printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024));
 		printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024));
@@ -1391,8 +1423,8 @@
 		       (int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024));
 		printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory);
 		printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2);
-		printf("has_sdma = %i\n", rscreen->info.has_sdma);
-		printf("has_uvd = %i\n", rscreen->info.has_uvd);
+		printf("num_sdma_rings = %i\n", rscreen->info.num_sdma_rings);
+		printf("has_hw_decode = %i\n", rscreen->info.has_hw_decode);
 		printf("me_fw_version = %i\n", rscreen->info.me_fw_version);
 		printf("pfp_fw_version = %i\n", rscreen->info.pfp_fw_version);
 		printf("ce_fw_version = %i\n", rscreen->info.ce_fw_version);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index fbd0ac7..baa1298 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -45,6 +45,7 @@
 #include "util/slab.h"
 #include "util/u_suballoc.h"
 #include "util/u_transfer.h"
+#include "util/u_threaded_context.h"
 
 #define ATI_VENDOR_ID 0x1002
 
@@ -64,12 +65,12 @@
 #define R600_PRIM_RECTANGLE_LIST	PIPE_PRIM_MAX
 
 /* Debug flags. */
-/* logging */
+/* logging and features */
 #define DBG_TEX			(1 << 0)
 /* gap - reuse */
 #define DBG_COMPUTE		(1 << 2)
 #define DBG_VM			(1 << 3)
-/* gap - reuse */
+#define DBG_CE			(1 << 4)
 /* shader logging */
 #define DBG_FS			(1 << 5)
 #define DBG_VS			(1 << 6)
@@ -84,32 +85,33 @@
 #define DBG_PREOPT_IR		(1 << 15)
 #define DBG_CHECK_IR		(1 << 16)
 #define DBG_NO_OPT_VARIANT	(1 << 17)
+#define DBG_FS_CORRECT_DERIVS_AFTER_KILL (1 << 18)
 /* gaps */
 #define DBG_TEST_DMA		(1 << 20)
 /* Bits 21-31 are reserved for the r600g driver. */
 /* features */
-#define DBG_NO_ASYNC_DMA	(1llu << 32)
-#define DBG_NO_HYPERZ		(1llu << 33)
-#define DBG_NO_DISCARD_RANGE	(1llu << 34)
-#define DBG_NO_2D_TILING	(1llu << 35)
-#define DBG_NO_TILING		(1llu << 36)
-#define DBG_SWITCH_ON_EOP	(1llu << 37)
-#define DBG_FORCE_DMA		(1llu << 38)
-#define DBG_PRECOMPILE		(1llu << 39)
-#define DBG_INFO		(1llu << 40)
-#define DBG_NO_WC		(1llu << 41)
-#define DBG_CHECK_VM		(1llu << 42)
-#define DBG_NO_DCC		(1llu << 43)
-#define DBG_NO_DCC_CLEAR	(1llu << 44)
-#define DBG_NO_RB_PLUS		(1llu << 45)
-#define DBG_SI_SCHED		(1llu << 46)
-#define DBG_MONOLITHIC_SHADERS	(1llu << 47)
-#define DBG_NO_CE		(1llu << 48)
-#define DBG_UNSAFE_MATH		(1llu << 49)
-#define DBG_NO_DCC_FB		(1llu << 50)
-#define DBG_TEST_VMFAULT_CP	(1llu << 51)
-#define DBG_TEST_VMFAULT_SDMA	(1llu << 52)
-#define DBG_TEST_VMFAULT_SHADER	(1llu << 53)
+#define DBG_NO_ASYNC_DMA	(1ull << 32)
+#define DBG_NO_HYPERZ		(1ull << 33)
+#define DBG_NO_DISCARD_RANGE	(1ull << 34)
+#define DBG_NO_2D_TILING	(1ull << 35)
+#define DBG_NO_TILING		(1ull << 36)
+#define DBG_SWITCH_ON_EOP	(1ull << 37)
+#define DBG_FORCE_DMA		(1ull << 38)
+#define DBG_PRECOMPILE		(1ull << 39)
+#define DBG_INFO		(1ull << 40)
+#define DBG_NO_WC		(1ull << 41)
+#define DBG_CHECK_VM		(1ull << 42)
+#define DBG_NO_DCC		(1ull << 43)
+#define DBG_NO_DCC_CLEAR	(1ull << 44)
+#define DBG_NO_RB_PLUS		(1ull << 45)
+#define DBG_SI_SCHED		(1ull << 46)
+#define DBG_MONOLITHIC_SHADERS	(1ull << 47)
+#define DBG_NO_CE		(1ull << 48)
+#define DBG_UNSAFE_MATH		(1ull << 49)
+#define DBG_NO_DCC_FB		(1ull << 50)
+#define DBG_TEST_VMFAULT_CP	(1ull << 51)
+#define DBG_TEST_VMFAULT_SDMA	(1ull << 52)
+#define DBG_TEST_VMFAULT_SHADER	(1ull << 53)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 #define R600_MAX_VIEWPORTS        16
@@ -140,7 +142,7 @@
  * at the moment.
  */
 struct r600_resource {
-	struct u_resource		b;
+	struct threaded_resource	b;
 
 	/* Winsys objects. */
 	struct pb_buffer		*buf;
@@ -179,12 +181,15 @@
 	bool				TC_L2_dirty;
 
 	/* Whether the resource has been exported via resource_get_handle. */
-	bool				is_shared;
 	unsigned			external_usage; /* PIPE_HANDLE_USAGE_* */
+
+	/* Whether this resource is referenced by bindless handles. */
+	bool				texture_handle_allocated;
+	bool				image_handle_allocated;
 };
 
 struct r600_transfer {
-	struct pipe_transfer		transfer;
+	struct threaded_transfer	b;
 	struct r600_resource		*staging;
 	unsigned			offset;
 };
@@ -232,12 +237,13 @@
 	unsigned			last_msaa_resolve_target_micro_mode;
 
 	/* Depth buffer compression and fast clear. */
-	struct r600_resource		*htile_buffer;
+	uint64_t			htile_offset;
 	bool				tc_compatible_htile;
 	bool				depth_cleared; /* if it was cleared at least once */
 	float				depth_clear_value;
 	bool				stencil_cleared; /* if it was cleared at least once */
 	uint8_t				stencil_clear_value;
+	bool				upgraded_depth; /* upgraded from unorm to Z32_FLOAT */
 
 	bool				non_disp_tiling; /* R600-Cayman only */
 
@@ -552,9 +558,12 @@
 	unsigned			gpu_reset_counter;
 	unsigned			last_dirty_tex_counter;
 	unsigned			last_compressed_colortex_counter;
+	unsigned			last_num_draw_calls;
 
+	struct threaded_context		*tc;
 	struct u_suballocator		*allocator_zeroed_memory;
 	struct slab_child_pool		pool_transfers;
+	struct slab_child_pool		pool_transfers_unsync; /* for threaded_context */
 
 	/* Current unaccounted memory usage. */
 	uint64_t			vram;
@@ -571,6 +580,7 @@
 
 	/* Additional context states. */
 	unsigned flags; /* flush flags */
+	enum pipe_prim_type		current_rast_prim; /* primitive type after TES, GS */
 
 	/* Queries. */
 	/* Maintain the list of active queries for pausing between IBs. */
@@ -580,6 +590,7 @@
 	unsigned			num_cs_dw_queries_suspend;
 	/* Misc stats. */
 	unsigned			num_draw_calls;
+	unsigned			num_prim_restart_calls;
 	unsigned			num_spill_draw_calls;
 	unsigned			num_compute_calls;
 	unsigned			num_spill_compute_calls;
@@ -588,9 +599,11 @@
 	unsigned			num_vs_flushes;
 	unsigned			num_ps_flushes;
 	unsigned			num_cs_flushes;
-	unsigned			num_fb_cache_flushes;
+	unsigned			num_cb_cache_flushes;
+	unsigned			num_db_cache_flushes;
 	unsigned			num_L2_invalidates;
 	unsigned			num_L2_writebacks;
+	unsigned			num_resident_handles;
 	uint64_t			num_alloc_tex_transfer_bytes;
 	unsigned			last_tex_ps_draw_ratio; /* for query */
 
@@ -664,6 +677,12 @@
 	 * the buffer is bound, including all resource descriptors. */
 	void (*invalidate_buffer)(struct pipe_context *ctx, struct pipe_resource *buf);
 
+	/* Update all resource bindings where the buffer is bound, including
+	 * all resource descriptors. This is invalidate_buffer without
+	 * the invalidation. */
+	void (*rebind_buffer)(struct pipe_context *ctx, struct pipe_resource *buf,
+			      uint64_t old_gpu_address);
+
 	/* Enable or disable occlusion queries. */
 	void (*set_occlusion_query_state)(struct pipe_context *ctx, bool enable);
 
@@ -681,7 +700,7 @@
 				enum ring_type ring);
 };
 
-/* r600_buffer.c */
+/* r600_buffer_common.c */
 bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
 				     struct pb_buffer *buf,
 				     enum radeon_bo_usage usage);
@@ -712,6 +731,9 @@
 void
 r600_invalidate_resource(struct pipe_context *ctx,
 			 struct pipe_resource *resource);
+void r600_replace_buffer_storage(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 struct pipe_resource *src);
 
 /* r600_common_pipe.c */
 void r600_gfx_write_event_eop(struct r600_common_context *ctx,
@@ -727,7 +749,7 @@
 			 enum blitter_attrib_type type,
 			 const union pipe_color_union *attrib);
 bool r600_common_screen_init(struct r600_common_screen *rscreen,
-			     struct radeon_winsys *ws);
+			     struct radeon_winsys *ws, unsigned flags);
 void r600_destroy_common_screen(struct r600_common_screen *rscreen);
 void r600_preflush_suspend_features(struct r600_common_context *ctx);
 void r600_postflush_resume_features(struct r600_common_context *ctx);
@@ -830,7 +852,7 @@
 void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 				   struct pipe_framebuffer_state *fb,
 				   struct r600_atom *fb_state,
-				   unsigned *buffers, unsigned *dirty_cbufs,
+				   unsigned *buffers, ubyte *dirty_cbufs,
 				   const union pipe_color_union *color);
 bool r600_texture_disable_dcc(struct r600_common_context *rctx,
 			      struct r600_texture *rtex);
@@ -987,4 +1009,9 @@
 	(((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) |	   \
 	 (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28))
 
+static inline int S_FIXED(float value, unsigned frac_bits)
+{
+	return (int)(value * (1 << frac_bits)); /* scale by 2^frac_bits, truncate */
+}
+
 #endif
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 0980eca..28c896a 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -71,8 +71,10 @@
 	case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
 	case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
 	case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
+	case R600_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
 	case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
 	case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
+	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
 	case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
 	case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
 	case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
@@ -97,6 +99,9 @@
 	case R600_QUERY_DRAW_CALLS:
 		query->begin_result = rctx->num_draw_calls;
 		break;
+	case R600_QUERY_PRIM_RESTART_CALLS:
+		query->begin_result = rctx->num_prim_restart_calls;
+		break;
 	case R600_QUERY_SPILL_DRAW_CALLS:
 		query->begin_result = rctx->num_spill_draw_calls;
 		break;
@@ -121,8 +126,11 @@
 	case R600_QUERY_NUM_CS_FLUSHES:
 		query->begin_result = rctx->num_cs_flushes;
 		break;
-	case R600_QUERY_NUM_FB_CACHE_FLUSHES:
-		query->begin_result = rctx->num_fb_cache_flushes;
+	case R600_QUERY_NUM_CB_CACHE_FLUSHES:
+		query->begin_result = rctx->num_cb_cache_flushes;
+		break;
+	case R600_QUERY_NUM_DB_CACHE_FLUSHES:
+		query->begin_result = rctx->num_db_cache_flushes;
 		break;
 	case R600_QUERY_NUM_L2_INVALIDATES:
 		query->begin_result = rctx->num_L2_invalidates;
@@ -130,6 +138,18 @@
 	case R600_QUERY_NUM_L2_WRITEBACKS:
 		query->begin_result = rctx->num_L2_writebacks;
 		break;
+	case R600_QUERY_NUM_RESIDENT_HANDLES:
+		query->begin_result = rctx->num_resident_handles;
+		break;
+	case R600_QUERY_TC_OFFLOADED_SLOTS:
+		query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
+		break;
+	case R600_QUERY_TC_DIRECT_SLOTS:
+		query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
+		break;
+	case R600_QUERY_TC_NUM_SYNCS:
+		query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0;
+		break;
 	case R600_QUERY_REQUESTED_VRAM:
 	case R600_QUERY_REQUESTED_GTT:
 	case R600_QUERY_MAPPED_VRAM:
@@ -148,16 +168,28 @@
 	case R600_QUERY_NUM_GFX_IBS:
 	case R600_QUERY_NUM_SDMA_IBS:
 	case R600_QUERY_NUM_BYTES_MOVED:
-	case R600_QUERY_NUM_EVICTIONS: {
+	case R600_QUERY_NUM_EVICTIONS:
+	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
 		break;
 	}
+	case R600_QUERY_GFX_BO_LIST_SIZE:
+		ws_id = winsys_id_from_type(query->b.type);
+		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
+		query->begin_time = rctx->ws->query_value(rctx->ws,
+							  RADEON_NUM_GFX_IBS);
+		break;
 	case R600_QUERY_CS_THREAD_BUSY:
 		ws_id = winsys_id_from_type(query->b.type);
 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
 		query->begin_time = os_time_get_nano();
 		break;
+	case R600_QUERY_GALLIUM_THREAD_BUSY:
+		query->begin_result =
+			rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
+		query->begin_time = os_time_get_nano();
+		break;
 	case R600_QUERY_GPU_LOAD:
 	case R600_QUERY_GPU_SHADERS_BUSY:
 	case R600_QUERY_GPU_TA_BUSY:
@@ -221,6 +253,9 @@
 	case R600_QUERY_DRAW_CALLS:
 		query->end_result = rctx->num_draw_calls;
 		break;
+	case R600_QUERY_PRIM_RESTART_CALLS:
+		query->end_result = rctx->num_prim_restart_calls;
+		break;
 	case R600_QUERY_SPILL_DRAW_CALLS:
 		query->end_result = rctx->num_spill_draw_calls;
 		break;
@@ -245,8 +280,11 @@
 	case R600_QUERY_NUM_CS_FLUSHES:
 		query->end_result = rctx->num_cs_flushes;
 		break;
-	case R600_QUERY_NUM_FB_CACHE_FLUSHES:
-		query->end_result = rctx->num_fb_cache_flushes;
+	case R600_QUERY_NUM_CB_CACHE_FLUSHES:
+		query->end_result = rctx->num_cb_cache_flushes;
+		break;
+	case R600_QUERY_NUM_DB_CACHE_FLUSHES:
+		query->end_result = rctx->num_db_cache_flushes;
 		break;
 	case R600_QUERY_NUM_L2_INVALIDATES:
 		query->end_result = rctx->num_L2_invalidates;
@@ -254,6 +292,18 @@
 	case R600_QUERY_NUM_L2_WRITEBACKS:
 		query->end_result = rctx->num_L2_writebacks;
 		break;
+	case R600_QUERY_NUM_RESIDENT_HANDLES:
+		query->end_result = rctx->num_resident_handles;
+		break;
+	case R600_QUERY_TC_OFFLOADED_SLOTS:
+		query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
+		break;
+	case R600_QUERY_TC_DIRECT_SLOTS:
+		query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
+		break;
+	case R600_QUERY_TC_NUM_SYNCS:
+		query->end_result = rctx->tc ? rctx->tc->num_syncs : 0;
+		break;
 	case R600_QUERY_REQUESTED_VRAM:
 	case R600_QUERY_REQUESTED_GTT:
 	case R600_QUERY_MAPPED_VRAM:
@@ -269,16 +319,28 @@
 	case R600_QUERY_NUM_GFX_IBS:
 	case R600_QUERY_NUM_SDMA_IBS:
 	case R600_QUERY_NUM_BYTES_MOVED:
-	case R600_QUERY_NUM_EVICTIONS: {
+	case R600_QUERY_NUM_EVICTIONS:
+	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
 		break;
 	}
+	case R600_QUERY_GFX_BO_LIST_SIZE:
+		ws_id = winsys_id_from_type(query->b.type);
+		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
+		query->end_time = rctx->ws->query_value(rctx->ws,
+							RADEON_NUM_GFX_IBS);
+		break;
 	case R600_QUERY_CS_THREAD_BUSY:
 		ws_id = winsys_id_from_type(query->b.type);
 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
 		query->end_time = os_time_get_nano();
 		break;
+	case R600_QUERY_GALLIUM_THREAD_BUSY:
+		query->end_result =
+			rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
+		query->end_time = os_time_get_nano();
+		break;
 	case R600_QUERY_GPU_LOAD:
 	case R600_QUERY_GPU_SHADERS_BUSY:
 	case R600_QUERY_GPU_TA_BUSY:
@@ -348,12 +410,19 @@
 		return true;
 	case PIPE_QUERY_GPU_FINISHED: {
 		struct pipe_screen *screen = rctx->b.screen;
-		result->b = screen->fence_finish(screen, &rctx->b, query->fence,
+		struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b;
+
+		result->b = screen->fence_finish(screen, ctx, query->fence,
 						 wait ? PIPE_TIMEOUT_INFINITE : 0);
 		return result->b;
 	}
 
+	case R600_QUERY_GFX_BO_LIST_SIZE:
+		result->u64 = (query->end_result - query->begin_result) /
+			      (query->end_time - query->begin_time);
+		return true;
 	case R600_QUERY_CS_THREAD_BUSY:
+	case R600_QUERY_GALLIUM_THREAD_BUSY:
 		result->u64 = (query->end_result - query->begin_result) * 100 /
 			      (query->end_time - query->begin_time);
 		return true;
@@ -663,8 +732,25 @@
 		radeon_emit(cs, va >> 32);
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
-		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
-					 0, 3, NULL, va, 0, 0);
+		if (ctx->chip_class >= SI) {
+			/* Write the timestamp from the CP not waiting for
+			 * outstanding draws (top-of-pipe).
+			 */
+			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+			radeon_emit(cs, COPY_DATA_COUNT_SEL |
+					COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
+					COPY_DATA_DST_SEL(COPY_DATA_MEM_ASYNC));
+			radeon_emit(cs, 0);
+			radeon_emit(cs, 0);
+			radeon_emit(cs, va);
+			radeon_emit(cs, va >> 32);
+		} else {
+			/* Write the timestamp after the last draw is done.
+			 * (bottom-of-pipe)
+			 */
+			r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
+						 0, 3, NULL, va, 0, 0);
+		}
 		break;
 	case PIPE_QUERY_PIPELINE_STATISTICS:
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -1206,12 +1292,16 @@
 	query->ops->clear_result(query, result);
 
 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+		unsigned usage = PIPE_TRANSFER_READ |
+				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
 		unsigned results_base = 0;
 		void *map;
 
-		map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf,
-						      PIPE_TRANSFER_READ |
-						      (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
+		if (rquery->b.flushed)
+			map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+		else
+			map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);
+
 		if (!map)
 			return false;
 
@@ -1283,6 +1373,7 @@
 		"IMM[1] UINT32 {1, 2, 4, 8}\n"
 		"IMM[2] UINT32 {16, 32, 64, 128}\n"
 		"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
+		"IMM[4] UINT32 {0, 0, 0, 0}\n"
 
 		"AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n"
 		"UIF TEMP[5]\n"
@@ -1382,7 +1473,7 @@
 					/* Convert to boolean */
 					"AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n"
 					"UIF TEMP[4]\n"
-						"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n"
+						"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
 						"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
 						"MOV TEMP[0].y, IMM[0].xxxx\n"
 					"ENDIF\n"
@@ -1761,6 +1852,7 @@
 	X("num-shaders-created",	NUM_SHADERS_CREATED,	UINT64, CUMULATIVE),
 	X("num-shader-cache-hits",	NUM_SHADER_CACHE_HITS,	UINT64, CUMULATIVE),
 	X("draw-calls",			DRAW_CALLS,		UINT64, AVERAGE),
+	X("prim-restart-calls",		PRIM_RESTART_CALLS,	UINT64, AVERAGE),
 	X("spill-draw-calls",		SPILL_DRAW_CALLS,	UINT64, AVERAGE),
 	X("compute-calls",		COMPUTE_CALLS,		UINT64, AVERAGE),
 	X("spill-compute-calls",	SPILL_COMPUTE_CALLS,	UINT64, AVERAGE),
@@ -1769,10 +1861,16 @@
 	X("num-vs-flushes",		NUM_VS_FLUSHES,		UINT64, AVERAGE),
 	X("num-ps-flushes",		NUM_PS_FLUSHES,		UINT64, AVERAGE),
 	X("num-cs-flushes",		NUM_CS_FLUSHES,		UINT64, AVERAGE),
-	X("num-fb-cache-flushes",	NUM_FB_CACHE_FLUSHES,	UINT64, AVERAGE),
+	X("num-CB-cache-flushes",	NUM_CB_CACHE_FLUSHES,	UINT64, AVERAGE),
+	X("num-DB-cache-flushes",	NUM_DB_CACHE_FLUSHES,	UINT64, AVERAGE),
 	X("num-L2-invalidates",		NUM_L2_INVALIDATES,	UINT64, AVERAGE),
 	X("num-L2-writebacks",		NUM_L2_WRITEBACKS,	UINT64, AVERAGE),
+	X("num-resident-handles",	NUM_RESIDENT_HANDLES,	UINT64, AVERAGE),
+	X("tc-offloaded-slots",		TC_OFFLOADED_SLOTS,     UINT64, AVERAGE),
+	X("tc-direct-slots",		TC_DIRECT_SLOTS,	UINT64, AVERAGE),
+	X("tc-num-syncs",		TC_NUM_SYNCS,		UINT64, AVERAGE),
 	X("CS-thread-busy",		CS_THREAD_BUSY,		UINT64, AVERAGE),
+	X("gallium-thread-busy",	GALLIUM_THREAD_BUSY,	UINT64, AVERAGE),
 	X("requested-VRAM",		REQUESTED_VRAM,		BYTES, AVERAGE),
 	X("requested-GTT",		REQUESTED_GTT,		BYTES, AVERAGE),
 	X("mapped-VRAM",		MAPPED_VRAM,		BYTES, AVERAGE),
@@ -1781,8 +1879,10 @@
 	X("num-mapped-buffers",		NUM_MAPPED_BUFFERS,	UINT64, AVERAGE),
 	X("num-GFX-IBs",		NUM_GFX_IBS,		UINT64, AVERAGE),
 	X("num-SDMA-IBs",		NUM_SDMA_IBS,		UINT64, AVERAGE),
+	X("GFX-BO-list-size",		GFX_BO_LIST_SIZE,	UINT64, AVERAGE),
 	X("num-bytes-moved",		NUM_BYTES_MOVED,	BYTES, CUMULATIVE),
 	X("num-evictions",		NUM_EVICTIONS,		UINT64, CUMULATIVE),
+	X("VRAM-CPU-page-faults",	NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
 	X("VRAM-usage",			VRAM_USAGE,		BYTES, AVERAGE),
 	X("VRAM-vis-usage",		VRAM_VIS_USAGE,		BYTES, AVERAGE),
 	X("GTT-usage",			GTT_USAGE,		BYTES, AVERAGE),
diff --git a/src/gallium/drivers/radeon/r600_query.h b/src/gallium/drivers/radeon/r600_query.h
index b9ab44c..1bbaa76 100644
--- a/src/gallium/drivers/radeon/r600_query.h
+++ b/src/gallium/drivers/radeon/r600_query.h
@@ -28,9 +28,7 @@
 #ifndef R600_QUERY_H
 #define R600_QUERY_H
 
-#include "pipe/p_defines.h"
-#include "pipe/p_state.h"
-#include "util/list.h"
+#include "util/u_threaded_context.h"
 
 struct pipe_context;
 struct pipe_query;
@@ -44,6 +42,7 @@
 
 enum {
 	R600_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
+	R600_QUERY_PRIM_RESTART_CALLS,
 	R600_QUERY_SPILL_DRAW_CALLS,
 	R600_QUERY_COMPUTE_CALLS,
 	R600_QUERY_SPILL_COMPUTE_CALLS,
@@ -52,10 +51,16 @@
 	R600_QUERY_NUM_VS_FLUSHES,
 	R600_QUERY_NUM_PS_FLUSHES,
 	R600_QUERY_NUM_CS_FLUSHES,
-	R600_QUERY_NUM_FB_CACHE_FLUSHES,
+	R600_QUERY_NUM_CB_CACHE_FLUSHES,
+	R600_QUERY_NUM_DB_CACHE_FLUSHES,
 	R600_QUERY_NUM_L2_INVALIDATES,
 	R600_QUERY_NUM_L2_WRITEBACKS,
+	R600_QUERY_NUM_RESIDENT_HANDLES,
+	R600_QUERY_TC_OFFLOADED_SLOTS,
+	R600_QUERY_TC_DIRECT_SLOTS,
+	R600_QUERY_TC_NUM_SYNCS,
 	R600_QUERY_CS_THREAD_BUSY,
+	R600_QUERY_GALLIUM_THREAD_BUSY,
 	R600_QUERY_REQUESTED_VRAM,
 	R600_QUERY_REQUESTED_GTT,
 	R600_QUERY_MAPPED_VRAM,
@@ -64,8 +69,10 @@
 	R600_QUERY_NUM_MAPPED_BUFFERS,
 	R600_QUERY_NUM_GFX_IBS,
 	R600_QUERY_NUM_SDMA_IBS,
+	R600_QUERY_GFX_BO_LIST_SIZE,
 	R600_QUERY_NUM_BYTES_MOVED,
 	R600_QUERY_NUM_EVICTIONS,
+	R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS,
 	R600_QUERY_VRAM_USAGE,
 	R600_QUERY_VRAM_VIS_USAGE,
 	R600_QUERY_GTT_USAGE,
@@ -128,6 +135,7 @@
 };
 
 struct r600_query {
+	struct threaded_query b;
 	struct r600_query_ops *ops;
 
 	/* The type of query */
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 4b20825..e4ae283 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -240,10 +240,7 @@
 		bpe = 4; /* stencil is allocated separately on evergreen */
 	} else {
 		bpe = util_format_get_blocksize(ptex->format);
-		/* align byte per element on dword */
-		if (bpe == 3) {
-			bpe = 4;
-		}
+		assert(util_is_power_of_two(bpe));
 	}
 
 	if (!is_flushed_depth && is_depth) {
@@ -389,7 +386,7 @@
 {
 	/* We can't disable DCC if it can be written by another process. */
 	return rtex->dcc_offset &&
-	       (!rtex->resource.is_shared ||
+	       (!rtex->resource.b.is_shared ||
 		!(rtex->resource.external_usage & PIPE_HANDLE_USAGE_WRITE));
 }
 
@@ -451,29 +448,34 @@
 	return r600_texture_discard_dcc(rscreen, rtex);
 }
 
-static void r600_degrade_tile_mode_to_linear(struct r600_common_context *rctx,
-					     struct r600_texture *rtex,
-					     bool invalidate_storage)
+static void r600_reallocate_texture_inplace(struct r600_common_context *rctx,
+					    struct r600_texture *rtex,
+					    unsigned new_bind_flag,
+					    bool invalidate_storage)
 {
 	struct pipe_screen *screen = rctx->b.screen;
 	struct r600_texture *new_tex;
 	struct pipe_resource templ = rtex->resource.b.b;
 	unsigned i;
 
-	templ.bind |= PIPE_BIND_LINEAR;
+	templ.bind |= new_bind_flag;
 
 	/* r600g doesn't react to dirty_tex_descriptor_counter */
 	if (rctx->chip_class < SI)
 		return;
 
-	if (rtex->resource.is_shared ||
-	    rtex->surface.is_linear)
+	if (rtex->resource.b.is_shared)
 		return;
 
-	/* This fails with MSAA, depth, and compressed textures. */
-	if (r600_choose_tiling(rctx->screen, &templ) !=
-	    RADEON_SURF_MODE_LINEAR_ALIGNED)
-		return;
+	if (new_bind_flag == PIPE_BIND_LINEAR) {
+		if (rtex->surface.is_linear)
+			return;
+
+		/* This fails with MSAA, depth, and compressed textures. */
+		if (r600_choose_tiling(rctx->screen, &templ) !=
+		    RADEON_SURF_MODE_LINEAR_ALIGNED)
+			return;
+	}
 
 	new_tex = (struct r600_texture*)screen->resource_create(screen, &templ);
 	if (!new_tex)
@@ -493,8 +495,10 @@
 		}
 	}
 
-	r600_texture_discard_cmask(rctx->screen, rtex);
-	r600_texture_discard_dcc(rctx->screen, rtex);
+	if (new_bind_flag == PIPE_BIND_LINEAR) {
+		r600_texture_discard_cmask(rctx->screen, rtex);
+		r600_texture_discard_dcc(rctx->screen, rtex);
+	}
 
 	/* Replace the structure fields of rtex. */
 	rtex->resource.b.b.bind = templ.bind;
@@ -507,16 +511,30 @@
 	rtex->resource.domains = new_tex->resource.domains;
 	rtex->resource.flags = new_tex->resource.flags;
 	rtex->size = new_tex->size;
+	rtex->db_render_format = new_tex->db_render_format;
+	rtex->db_compatible = new_tex->db_compatible;
+	rtex->can_sample_z = new_tex->can_sample_z;
+	rtex->can_sample_s = new_tex->can_sample_s;
 	rtex->surface = new_tex->surface;
-	rtex->non_disp_tiling = new_tex->non_disp_tiling;
+	rtex->fmask = new_tex->fmask;
+	rtex->cmask = new_tex->cmask;
 	rtex->cb_color_info = new_tex->cb_color_info;
-	rtex->cmask = new_tex->cmask; /* needed even without CMASK */
+	rtex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode;
+	rtex->htile_offset = new_tex->htile_offset;
+	rtex->tc_compatible_htile = new_tex->tc_compatible_htile;
+	rtex->depth_cleared = new_tex->depth_cleared;
+	rtex->stencil_cleared = new_tex->stencil_cleared;
+	rtex->non_disp_tiling = new_tex->non_disp_tiling;
+	rtex->dcc_gather_statistics = new_tex->dcc_gather_statistics;
+	rtex->framebuffers_bound = new_tex->framebuffers_bound;
 
-	assert(!rtex->htile_buffer);
-	assert(!rtex->cmask.size);
-	assert(!rtex->fmask.size);
-	assert(!rtex->dcc_offset);
-	assert(!rtex->is_depth);
+	if (new_bind_flag == PIPE_BIND_LINEAR) {
+		assert(!rtex->htile_offset);
+		assert(!rtex->cmask.size);
+		assert(!rtex->fmask.size);
+		assert(!rtex->dcc_offset);
+		assert(!rtex->is_depth);
+	}
 
 	r600_texture_reference(&new_tex, NULL);
 
@@ -530,22 +548,33 @@
                                        unsigned usage)
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
-	struct r600_common_context *rctx = (struct r600_common_context*)
-					   (ctx ? ctx : rscreen->aux_context);
+	struct r600_common_context *rctx;
 	struct r600_resource *res = (struct r600_resource*)resource;
 	struct r600_texture *rtex = (struct r600_texture*)resource;
 	struct radeon_bo_metadata metadata;
 	bool update_metadata = false;
 	unsigned stride, offset, slice_size;
 
-	/* This is not supported now, but it might be required for OpenCL
-	 * interop in the future.
-	 */
-	if (resource->target != PIPE_BUFFER &&
-	    (resource->nr_samples > 1 || rtex->is_depth))
-		return false;
+	ctx = threaded_context_unwrap_sync(ctx);
+	rctx = (struct r600_common_context*)(ctx ? ctx : rscreen->aux_context);
 
 	if (resource->target != PIPE_BUFFER) {
+		/* This is not supported now, but it might be required for OpenCL
+		 * interop in the future.
+		 */
+		if (resource->nr_samples > 1 || rtex->is_depth)
+			return false;
+
+		/* Move a suballocated texture into a non-suballocated allocation. */
+		if (rscreen->ws->buffer_is_suballocated(res->buf)) {
+			assert(!res->b.is_shared);
+			r600_reallocate_texture_inplace(rctx, rtex,
+							PIPE_BIND_SHARED, false);
+			rctx->b.flush(&rctx->b, NULL, 0);
+			assert(res->b.b.bind & PIPE_BIND_SHARED);
+			assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+		}
+
 		/* Since shader image stores don't support DCC on VI,
 		 * disable it for external clients that want write
 		 * access.
@@ -568,7 +597,7 @@
 		}
 
 		/* Set metadata. */
-		if (!res->is_shared || update_metadata) {
+		if (!res->b.is_shared || update_metadata) {
 			r600_texture_init_metadata(rscreen, rtex, &metadata);
 			if (rscreen->query_opaque_metadata)
 				rscreen->query_opaque_metadata(rscreen, rtex,
@@ -576,25 +605,7 @@
 
 			rscreen->ws->buffer_set_metadata(res->buf, &metadata);
 		}
-	}
 
-	if (res->is_shared) {
-		/* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
-		 * doesn't set it.
-		 */
-		res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
-		if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
-			res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
-	} else {
-		res->is_shared = true;
-		res->external_usage = usage;
-	}
-
-	if (res->b.b.target == PIPE_BUFFER) {
-		offset = 0;
-		stride = 0;
-		slice_size = 0;
-	} else {
 		if (rscreen->chip_class >= GFX9) {
 			offset = rtex->surface.u.gfx9.surf_offset;
 			stride = rtex->surface.u.gfx9.surf_pitch *
@@ -606,7 +617,51 @@
 				 rtex->surface.bpe;
 			slice_size = rtex->surface.u.legacy.level[0].slice_size;
 		}
+	} else {
+		/* Move a suballocated buffer into a non-suballocated allocation. */
+		if (rscreen->ws->buffer_is_suballocated(res->buf)) {
+			assert(!res->b.is_shared);
+
+			/* Allocate a new buffer with PIPE_BIND_SHARED. */
+			struct pipe_resource templ = res->b.b;
+			templ.bind |= PIPE_BIND_SHARED;
+
+			struct pipe_resource *newb =
+				screen->resource_create(screen, &templ);
+			if (!newb)
+				return false;
+
+			/* Copy the old buffer contents to the new one. */
+			struct pipe_box box;
+			u_box_1d(0, newb->width0, &box);
+			rctx->b.resource_copy_region(&rctx->b, newb, 0, 0, 0, 0,
+						     &res->b.b, 0, &box);
+			/* Move the new buffer storage to the old pipe_resource. */
+			r600_replace_buffer_storage(&rctx->b, &res->b.b, newb);
+			pipe_resource_reference(&newb, NULL);
+
+			assert(res->b.b.bind & PIPE_BIND_SHARED);
+			assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+		}
+
+		/* Buffers */
+		offset = 0;
+		stride = 0;
+		slice_size = 0;
 	}
+
+	if (res->b.is_shared) {
+		/* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
+		 * doesn't set it.
+		 */
+		res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+		if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+			res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+	} else {
+		res->b.is_shared = true;
+		res->external_usage = usage;
+	}
+
 	return rscreen->ws->buffer_get_handle(res->buf, stride, offset,
 					      slice_size, whandle);
 }
@@ -619,7 +674,6 @@
 
 	r600_texture_reference(&rtex->flushed_depth_texture, NULL);
 
-	r600_resource_reference(&rtex->htile_buffer, NULL);
 	if (rtex->cmask_buffer != &rtex->resource) {
 	    r600_resource_reference(&rtex->cmask_buffer, NULL);
 	}
@@ -936,33 +990,14 @@
 static void r600_texture_allocate_htile(struct r600_common_screen *rscreen,
 					struct r600_texture *rtex)
 {
-	uint32_t clear_value;
-
-	if (rscreen->chip_class >= GFX9 || rtex->tc_compatible_htile) {
-		clear_value = 0x0000030F;
-	} else {
+	if (rscreen->chip_class <= VI && !rtex->tc_compatible_htile)
 		r600_texture_get_htile_size(rscreen, rtex);
-		clear_value = 0;
-	}
 
 	if (!rtex->surface.htile_size)
 		return;
 
-	rtex->htile_buffer = (struct r600_resource*)
-		r600_aligned_buffer_create(&rscreen->b,
-					   R600_RESOURCE_FLAG_UNMAPPABLE,
-					   PIPE_USAGE_DEFAULT,
-					   rtex->surface.htile_size,
-					   rtex->surface.htile_alignment);
-	if (rtex->htile_buffer == NULL) {
-		/* this is not a fatal error as we can still keep rendering
-		 * without htile buffer */
-		R600_ERR("Failed to create buffer object for htile buffer.\n");
-	} else {
-		r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b,
-					 0, rtex->surface.htile_size,
-					 clear_value);
-	}
+	rtex->htile_offset = align(rtex->size, rtex->surface.htile_alignment);
+	rtex->size = rtex->htile_offset + rtex->surface.htile_size;
 }
 
 void r600_print_texture_info(struct r600_common_screen *rscreen,
@@ -1011,11 +1046,12 @@
 				rtex->surface.u.gfx9.cmask.pipe_aligned);
 		}
 
-		if (rtex->htile_buffer) {
-			fprintf(f, "  HTile: size=%u, alignment=%u, "
+		if (rtex->htile_offset) {
+			fprintf(f, "  HTile: offset=%"PRIu64", size=%"PRIu64", alignment=%u, "
 				"rb_aligned=%u, pipe_aligned=%u\n",
-				rtex->htile_buffer->b.b.width0,
-				rtex->htile_buffer->buf->alignment,
+				rtex->htile_offset,
+				rtex->surface.htile_size,
+				rtex->surface.htile_alignment,
 				rtex->surface.u.gfx9.htile.rb_aligned,
 				rtex->surface.u.gfx9.htile.pipe_aligned);
 		}
@@ -1058,10 +1094,11 @@
 			rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment,
 			rtex->cmask.slice_tile_max);
 
-	if (rtex->htile_buffer)
-		fprintf(f, "  HTile: size=%u, alignment=%u, TC_compatible = %u\n",
-			rtex->htile_buffer->b.b.width0,
-			rtex->htile_buffer->buf->alignment,
+	if (rtex->htile_offset)
+		fprintf(f, "  HTile: offset=%"PRIu64", size=%"PRIu64", "
+			"alignment=%u, TC_compatible = %u\n",
+			rtex->htile_offset, rtex->surface.htile_size,
+			rtex->surface.htile_alignment,
 			rtex->tc_compatible_htile);
 
 	if (rtex->dcc_offset) {
@@ -1150,8 +1187,11 @@
 		if (rscreen->chip_class >= GFX9 &&
 		    base->format == PIPE_FORMAT_Z16_UNORM)
 			rtex->db_render_format = base->format;
-		else
+		else {
 			rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+			rtex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
+					       base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
+		}
 	} else {
 		rtex->db_render_format = base->format;
 	}
@@ -1225,7 +1265,9 @@
 		r600_init_resource_fields(rscreen, resource, rtex->size,
 					  rtex->surface.surf_alignment);
 
-		resource->flags |= RADEON_FLAG_HANDLE;
+		/* Displayable surfaces are not suballocated. */
+		if (resource->b.b.bind & PIPE_BIND_SCANOUT)
+			resource->flags |= RADEON_FLAG_NO_SUBALLOC;
 
 		if (!r600_alloc_resource(rscreen, resource)) {
 			FREE(rtex);
@@ -1249,6 +1291,17 @@
 					 rtex->cmask.offset, rtex->cmask.size,
 					 0xCCCCCCCC);
 	}
+	if (rtex->htile_offset) {
+		uint32_t clear_value = 0;
+
+		if (rscreen->chip_class >= GFX9 || rtex->tc_compatible_htile)
+			clear_value = 0x0000030F;
+
+		r600_screen_clear_buffer(rscreen, &rtex->resource.b.b,
+					 rtex->htile_offset,
+					 rtex->surface.htile_size,
+					 clear_value);
+	}
 
 	/* Initialize DCC only if the texture is not being imported. */
 	if (!buf && rtex->dcc_offset) {
@@ -1285,6 +1338,8 @@
 {
 	const struct util_format_description *desc = util_format_description(templ->format);
 	bool force_tiling = templ->flags & R600_RESOURCE_FLAG_FORCE_TILING;
+	bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) &&
+				!(templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH);
 
 	/* MSAA resources must be 2D tiled. */
 	if (templ->nr_samples > 1)
@@ -1294,6 +1349,14 @@
 	if (templ->flags & R600_RESOURCE_FLAG_TRANSFER)
 		return RADEON_SURF_MODE_LINEAR_ALIGNED;
 
+	/* Avoid Z/S decompress blits by forcing TC-compatible HTILE on VI,
+	 * which requires 2D tiling.
+	 */
+	if (rscreen->chip_class == VI &&
+	    is_depth_stencil &&
+	    (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY))
+		return RADEON_SURF_MODE_2D;
+
 	/* r600g: force tiling on TEXTURE_2D and TEXTURE_3D compute resources. */
 	if (rscreen->chip_class >= R600 && rscreen->chip_class <= CAYMAN &&
 	    (templ->bind & PIPE_BIND_COMPUTE_RESOURCE) &&
@@ -1304,9 +1367,9 @@
 	/* Handle common candidates for the linear mode.
 	 * Compressed textures and DB surfaces must always be tiled.
 	 */
-	if (!force_tiling && !util_format_is_compressed(templ->format) &&
-	    (!util_format_is_depth_or_stencil(templ->format) ||
-	     templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH)) {
+	if (!force_tiling &&
+	    !is_depth_stencil &&
+	    !util_format_is_compressed(templ->format)) {
 		if (rscreen->debug_flags & DBG_NO_TILING)
 			return RADEON_SURF_MODE_LINEAR_ALIGNED;
 
@@ -1407,7 +1470,9 @@
 			array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
 
 		is_scanout = metadata.u.gfx9.swizzle_mode == 0 ||
-			     metadata.u.gfx9.swizzle_mode % 4 == 2;
+			      metadata.u.gfx9.swizzle_mode % 4 == 2;
+
+		surface.u.gfx9.surf.swizzle_mode = metadata.u.gfx9.swizzle_mode;
 	} else {
 		surface.u.legacy.pipe_config = metadata.u.legacy.pipe_config;
 		surface.u.legacy.bankw = metadata.u.legacy.bankw;
@@ -1436,17 +1501,12 @@
 	if (!rtex)
 		return NULL;
 
-	rtex->resource.is_shared = true;
+	rtex->resource.b.is_shared = true;
 	rtex->resource.external_usage = usage;
 
 	if (rscreen->apply_opaque_metadata)
 		rscreen->apply_opaque_metadata(rscreen, rtex, &metadata);
 
-	/* Validate that addrlib arrived at the same surface parameters. */
-	if (rscreen->chip_class >= GFX9) {
-		assert(metadata.u.gfx9.swizzle_mode == surface.u.gfx9.surf.swizzle_mode);
-	}
-
 	return &rtex->resource.b.b;
 }
 
@@ -1554,7 +1614,7 @@
 {
 	/* r600g doesn't react to dirty_tex_descriptor_counter */
 	return rscreen->chip_class >= SI &&
-		!rtex->resource.is_shared &&
+		!rtex->resource.b.is_shared &&
 		!(transfer_usage & PIPE_TRANSFER_READ) &&
 		rtex->resource.b.b.last_level == 0 &&
 		util_texrange_covers_whole_level(&rtex->resource.b.b, 0,
@@ -1616,8 +1676,9 @@
 				r600_can_invalidate_texture(rctx->screen, rtex,
 							    usage, box);
 
-			r600_degrade_tile_mode_to_linear(rctx, rtex,
-							 can_invalidate);
+			r600_reallocate_texture_inplace(rctx, rtex,
+							PIPE_BIND_LINEAR,
+							can_invalidate);
 		}
 
 		/* Tiled textures need to be converted into a linear texture for CPU
@@ -1652,10 +1713,10 @@
 	trans = CALLOC_STRUCT(r600_transfer);
 	if (!trans)
 		return NULL;
-	pipe_resource_reference(&trans->transfer.resource, texture);
-	trans->transfer.level = level;
-	trans->transfer.usage = usage;
-	trans->transfer.box = *box;
+	pipe_resource_reference(&trans->b.b.resource, texture);
+	trans->b.b.level = level;
+	trans->b.b.usage = usage;
+	trans->b.b.box = *box;
 
 	if (rtex->is_depth) {
 		struct r600_texture *staging_depth;
@@ -1697,8 +1758,8 @@
 
 			/* Just get the strides. */
 			r600_texture_get_offset(rctx->screen, staging_depth, level, NULL,
-						&trans->transfer.stride,
-						&trans->transfer.layer_stride);
+						&trans->b.b.stride,
+						&trans->b.b.layer_stride);
 		} else {
 			/* XXX: only readback the rectangle which is being mapped? */
 			/* XXX: when discard is true, no need to read back from depth texture */
@@ -1715,8 +1776,8 @@
 
 			offset = r600_texture_get_offset(rctx->screen, staging_depth,
 							 level, box,
-							 &trans->transfer.stride,
-							 &trans->transfer.layer_stride);
+							 &trans->b.b.stride,
+							 &trans->b.b.layer_stride);
 		}
 
 		trans->staging = (struct r600_resource*)staging_depth;
@@ -1741,8 +1802,8 @@
 
 		/* Just get the strides. */
 		r600_texture_get_offset(rctx->screen, staging, 0, NULL,
-					&trans->transfer.stride,
-					&trans->transfer.layer_stride);
+					&trans->b.b.stride,
+					&trans->b.b.layer_stride);
 
 		if (usage & PIPE_TRANSFER_READ)
 			r600_copy_to_staging_texture(ctx, trans);
@@ -1753,8 +1814,8 @@
 	} else {
 		/* the resource is mapped directly */
 		offset = r600_texture_get_offset(rctx->screen, rtex, level, box,
-						 &trans->transfer.stride,
-						 &trans->transfer.layer_stride);
+						 &trans->b.b.stride,
+						 &trans->b.b.layer_stride);
 		buf = &rtex->resource;
 	}
 
@@ -1764,7 +1825,7 @@
 		return NULL;
 	}
 
-	*ptransfer = &trans->transfer;
+	*ptransfer = &trans->b.b;
 	return map + offset;
 }
 
@@ -2266,7 +2327,7 @@
 	/* The intent is to use this with shared displayable back buffers,
 	 * but it's not strictly limited only to them.
 	 */
-	if (!tex->resource.is_shared ||
+	if (!tex->resource.b.is_shared ||
 	    !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
 	    tex->resource.b.b.target != PIPE_TEXTURE_2D ||
 	    tex->resource.b.b.last_level > 0 ||
@@ -2415,6 +2476,14 @@
 	bool main_value = false;
 	bool extra_value = false;
 	int extra_channel;
+
+	/* This is needed to get the correct DCC clear value for luminance formats.
+	 * 1) Get the linear format (because the next step can't handle L8_SRGB).
+	 * 2) Convert luminance to red. (the real hw format for luminance)
+	 */
+	surface_format = util_format_linear(surface_format);
+	surface_format = util_format_luminance_to_red(surface_format);
+
 	const struct util_format_description *desc = util_format_description(surface_format);
 
 	if (desc->block.bits == 128 &&
@@ -2433,7 +2502,8 @@
 
 	if (surface_format == PIPE_FORMAT_R11G11B10_FLOAT ||
 	    surface_format == PIPE_FORMAT_B5G6R5_UNORM ||
-	    surface_format == PIPE_FORMAT_B5G6R5_SRGB) {
+	    surface_format == PIPE_FORMAT_B5G6R5_SRGB ||
+	    util_format_is_alpha(surface_format)) {
 		extra_channel = -1;
 	} else if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
 		if(r600_translate_colorswap(surface_format, false) <= 1)
@@ -2533,7 +2603,7 @@
 static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen,
 					   struct r600_texture *rtex)
 {
-	if (rtex->resource.is_shared ||
+	if (rtex->resource.b.is_shared ||
 	    rtex->resource.b.b.nr_samples <= 1 ||
 	    rtex->surface.micro_tile_mode == rtex->last_msaa_resolve_target_micro_mode)
 		return;
@@ -2637,7 +2707,7 @@
 void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 				   struct pipe_framebuffer_state *fb,
 				   struct r600_atom *fb_state,
-				   unsigned *buffers, unsigned *dirty_cbufs,
+				   unsigned *buffers, ubyte *dirty_cbufs,
 				   const union pipe_color_union *color)
 {
 	int i;
@@ -2683,7 +2753,7 @@
 		 * because there is no way to communicate the clear color among
 		 * all clients
 		 */
-		if (tex->resource.is_shared &&
+		if (tex->resource.b.is_shared &&
 		    !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
 			continue;
 
@@ -2717,10 +2787,6 @@
 			uint32_t reset_value;
 			bool clear_words_needed;
 
-			/* TODO: fix DCC clear */
-			if (rctx->chip_class >= GFX9)
-				continue;
-
 			if (rctx->screen->debug_flags & DBG_NO_DCC_CLEAR)
 				continue;
 
diff --git a/src/gallium/drivers/radeon/r600_viewport.c b/src/gallium/drivers/radeon/r600_viewport.c
index 2de1382..ae64199 100644
--- a/src/gallium/drivers/radeon/r600_viewport.c
+++ b/src/gallium/drivers/radeon/r600_viewport.c
@@ -165,6 +165,7 @@
 	struct radeon_winsys_cs *cs = rctx->gfx.cs;
 	struct pipe_viewport_state vp;
 	float left, top, right, bottom, max_range, guardband_x, guardband_y;
+	float discard_x, discard_y;
 
 	/* Reconstruct the viewport transformation from the scissor. */
 	vp.translate[0] = (vp_as_scissor->minx + vp_as_scissor->maxx) / 2.0;
@@ -198,6 +199,22 @@
 	guardband_x = MIN2(-left, right);
 	guardband_y = MIN2(-top, bottom);
 
+	discard_x = 1.0;
+	discard_y = 1.0;
+
+	if (rctx->current_rast_prim < PIPE_PRIM_TRIANGLES) {
+		/* When rendering wide points or lines, we need to be more
+		 * conservative about when to discard them entirely. Since
+		 * point size can be determined by the VS output, we basically
+		 * disable discard completely here.
+		 *
+		 * TODO: This can hurt performance when rendering lines and
+		 * points with fixed size, and could be improved.
+		 */
+		discard_x = guardband_x;
+		discard_y = guardband_y;
+	}
+
 	/* If any of the GB registers is updated, all of them must be updated. */
 	if (rctx->chip_class >= CAYMAN)
 		radeon_set_context_reg_seq(cs, CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
@@ -205,9 +222,9 @@
 		radeon_set_context_reg_seq(cs, R600_R_028C0C_PA_CL_GB_VERT_CLIP_ADJ, 4);
 
 	radeon_emit(cs, fui(guardband_y)); /* R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */
-	radeon_emit(cs, fui(1.0));         /* R_028BEC_PA_CL_GB_VERT_DISC_ADJ */
+	radeon_emit(cs, fui(discard_y));   /* R_028BEC_PA_CL_GB_VERT_DISC_ADJ */
 	radeon_emit(cs, fui(guardband_x)); /* R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */
-	radeon_emit(cs, fui(1.0));         /* R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */
+	radeon_emit(cs, fui(discard_x));   /* R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */
 }
 
 static void r600_emit_scissors(struct r600_common_context *rctx, struct r600_atom *atom)
diff --git a/src/gallium/drivers/radeon/radeon_vcn_dec.c b/src/gallium/drivers/radeon/radeon_vcn_dec.c
new file mode 100644
index 0000000..bd93b84
--- /dev/null
+++ b/src/gallium/drivers/radeon/radeon_vcn_dec.c
@@ -0,0 +1,1308 @@
+/**************************************************************************
+ *
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "pipe/p_video_codec.h"
+
+#include "util/u_memory.h"
+#include "util/u_video.h"
+
+#include "vl/vl_mpeg12_decoder.h"
+
+#include "r600_pipe_common.h"
+#include "radeon_video.h"
+#include "radeon_vcn_dec.h"
+
+#define FB_BUFFER_OFFSET		0x1000
+#define FB_BUFFER_SIZE			2048
+#define IT_SCALING_TABLE_SIZE		992	/* matches the 96 + 384 + 384 + 128 bytes get_h265_msg() writes to dec->it */
+#define RDECODE_SESSION_CONTEXT_SIZE	(128 * 1024)
+
+#define RDECODE_GPCOM_VCPU_CMD		0x2070c
+#define RDECODE_GPCOM_VCPU_DATA0	0x20710
+#define RDECODE_GPCOM_VCPU_DATA1	0x20714
+#define RDECODE_ENGINE_CNTL		0x20718
+
+#define NUM_BUFFERS			4	/* length of msg_fb_it_buffers[] and bs_buffers[] in struct radeon_decoder */
+#define NUM_MPEG2_REFS			6	/* width of the frame-number window used by get_ref_pic_idx() */
+#define NUM_H264_REFS			17
+#define NUM_VC1_REFS			5
+
+struct radeon_decoder {
+	struct pipe_video_codec		base;	/* level, chroma_format, width, height, max_references are read from here */
+
+	unsigned			stream_handle;
+	unsigned			stream_type;
+	unsigned			frame_number;	/* bounds the reference index in get_ref_pic_idx() */
+
+	struct pipe_screen		*screen;	/* cast to r600_common_screen in get_h265_msg() to read the chip family */
+	struct radeon_winsys		*ws;
+	struct radeon_winsys_cs		*cs;
+
+	void				*msg;
+	uint32_t			*fb;
+	uint8_t				*it;	/* scaling lists are copied here by get_h264_msg() and get_h265_msg() */
+	void				*bs_ptr;
+
+	struct rvid_buffer		msg_fb_it_buffers[NUM_BUFFERS];
+	struct rvid_buffer		bs_buffers[NUM_BUFFERS];
+	struct rvid_buffer		dpb;
+	struct rvid_buffer		ctx;
+	struct rvid_buffer		sessionctx;
+
+	unsigned			bs_size;
+	unsigned			cur_buffer;	/* presumably indexes the NUM_BUFFERS arrays above -- TODO confirm */
+};
+
+static rvcn_dec_message_avc_t get_h264_msg(struct radeon_decoder *dec,
+		struct pipe_h264_picture_desc *pic)
+{
+	rvcn_dec_message_avc_t result;
+	/* Build the H.264 decode message from the picture parameters and dec->base. */
+	memset(&result, 0, sizeof(result));
+	switch (pic->base.profile) {
+	case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
+		result.profile = RDECODE_H264_PROFILE_BASELINE;
+		break;
+
+	case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
+		result.profile = RDECODE_H264_PROFILE_MAIN;
+		break;
+
+	case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
+		result.profile = RDECODE_H264_PROFILE_HIGH;
+		break;
+
+	default:
+		assert(0); /* only the three profiles above are handled */
+		break;
+	}
+
+	result.level = dec->base.level;
+	/* Each SPS flag is OR-ed into sps_info_flags at a fixed bit position. */
+	result.sps_info_flags = 0;
+	result.sps_info_flags |= pic->pps->sps->direct_8x8_inference_flag << 0;
+	result.sps_info_flags |= pic->pps->sps->mb_adaptive_frame_field_flag << 1;
+	result.sps_info_flags |= pic->pps->sps->frame_mbs_only_flag << 2;
+	result.sps_info_flags |= pic->pps->sps->delta_pic_order_always_zero_flag << 3;
+	result.sps_info_flags |= 1 << RDECODE_SPS_INFO_H264_EXTENSION_SUPPORT_FLAG_SHIFT;
+
+	result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8;
+	result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8;
+	result.log2_max_frame_num_minus4 = pic->pps->sps->log2_max_frame_num_minus4;
+	result.pic_order_cnt_type = pic->pps->sps->pic_order_cnt_type;
+	result.log2_max_pic_order_cnt_lsb_minus4 =
+		pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4;
+	/* Map the pipe chroma format to a 0..3 code; NONE keeps the 0 left by memset. */
+	switch (dec->base.chroma_format) {
+	case PIPE_VIDEO_CHROMA_FORMAT_NONE:
+		break;
+	case PIPE_VIDEO_CHROMA_FORMAT_400:
+		result.chroma_format = 0;
+		break;
+	case PIPE_VIDEO_CHROMA_FORMAT_420:
+		result.chroma_format = 1;
+		break;
+	case PIPE_VIDEO_CHROMA_FORMAT_422:
+		result.chroma_format = 2;
+		break;
+	case PIPE_VIDEO_CHROMA_FORMAT_444:
+		result.chroma_format = 3;
+		break;
+	}
+	/* Bit 5 is skipped after weighted_bipred_idc (bit 4), presumably because that field is two bits wide. */
+	result.pps_info_flags = 0;
+	result.pps_info_flags |= pic->pps->transform_8x8_mode_flag << 0;
+	result.pps_info_flags |= pic->pps->redundant_pic_cnt_present_flag << 1;
+	result.pps_info_flags |= pic->pps->constrained_intra_pred_flag << 2;
+	result.pps_info_flags |= pic->pps->deblocking_filter_control_present_flag << 3;
+	result.pps_info_flags |= pic->pps->weighted_bipred_idc << 4;
+	result.pps_info_flags |= pic->pps->weighted_pred_flag << 6;
+	result.pps_info_flags |= pic->pps->bottom_field_pic_order_in_frame_present_flag << 7;
+	result.pps_info_flags |= pic->pps->entropy_coding_mode_flag << 8;
+
+	result.num_slice_groups_minus1 = pic->pps->num_slice_groups_minus1;
+	result.slice_group_map_type = pic->pps->slice_group_map_type;
+	result.slice_group_change_rate_minus1 = pic->pps->slice_group_change_rate_minus1;
+	result.pic_init_qp_minus26 = pic->pps->pic_init_qp_minus26;
+	result.chroma_qp_index_offset = pic->pps->chroma_qp_index_offset;
+	result.second_chroma_qp_index_offset = pic->pps->second_chroma_qp_index_offset;
+
+	memcpy(result.scaling_list_4x4, pic->pps->ScalingList4x4, 6*16);
+	memcpy(result.scaling_list_8x8, pic->pps->ScalingList8x8, 2*64);
+	/* Mirror both scaling lists into dec->it: 4x4 at offset 0, 8x8 at offset 96. */
+	memcpy(dec->it, result.scaling_list_4x4, 6*16);
+	memcpy((dec->it + 96), result.scaling_list_8x8, 2*64);
+
+	result.num_ref_frames = pic->num_ref_frames;
+
+	result.num_ref_idx_l0_active_minus1 = pic->num_ref_idx_l0_active_minus1;
+	result.num_ref_idx_l1_active_minus1 = pic->num_ref_idx_l1_active_minus1;
+
+	result.frame_num = pic->frame_num;
+	memcpy(result.frame_num_list, pic->frame_num_list, 4*16);
+	result.curr_field_order_cnt_list[0] = pic->field_order_cnt[0];
+	result.curr_field_order_cnt_list[1] = pic->field_order_cnt[1];
+	memcpy(result.field_order_cnt_list, pic->field_order_cnt_list, 4*16*2);
+	/* The frame number is also used as the decoded picture index. */
+	result.decoded_pic_idx = pic->frame_num;
+
+	return result;
+}
+
+static void radeon_dec_destroy_associated_data(void *data)
+{
+	/* Nothing to free: get_h265_msg() stores an integer cast to a pointer, not an allocation. */
+}
+
+static rvcn_dec_message_hevc_t get_h265_msg(struct radeon_decoder *dec,
+					struct pipe_video_buffer *target,
+					struct pipe_h265_picture_desc *pic)
+{
+	rvcn_dec_message_hevc_t result;
+	unsigned i;
+	/* Build the HEVC decode message for "target" from the picture parameters. */
+	memset(&result, 0, sizeof(result));
+	result.sps_info_flags = 0;
+	result.sps_info_flags |= pic->pps->sps->scaling_list_enabled_flag << 0;
+	result.sps_info_flags |= pic->pps->sps->amp_enabled_flag << 1;
+	result.sps_info_flags |= pic->pps->sps->sample_adaptive_offset_enabled_flag << 2;
+	result.sps_info_flags |= pic->pps->sps->pcm_enabled_flag << 3;
+	result.sps_info_flags |= pic->pps->sps->pcm_loop_filter_disabled_flag << 4;
+	result.sps_info_flags |= pic->pps->sps->long_term_ref_pics_present_flag << 5;
+	result.sps_info_flags |= pic->pps->sps->sps_temporal_mvp_enabled_flag << 6;
+	result.sps_info_flags |= pic->pps->sps->strong_intra_smoothing_enabled_flag << 7;
+	result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8;
+	if (((struct r600_common_screen*)dec->screen)->family == CHIP_CARRIZO)
+		result.sps_info_flags |= 1 << 9;
+	if (pic->UseRefPicList == true)
+		result.sps_info_flags |= 1 << 10;
+
+	result.chroma_format = pic->pps->sps->chroma_format_idc;
+	result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8;
+	result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8;
+	result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4;
+	result.sps_max_dec_pic_buffering_minus1 = pic->pps->sps->sps_max_dec_pic_buffering_minus1;
+	result.log2_min_luma_coding_block_size_minus3 =
+		pic->pps->sps->log2_min_luma_coding_block_size_minus3;
+	result.log2_diff_max_min_luma_coding_block_size =
+		pic->pps->sps->log2_diff_max_min_luma_coding_block_size;
+	result.log2_min_transform_block_size_minus2 =
+		pic->pps->sps->log2_min_transform_block_size_minus2;
+	result.log2_diff_max_min_transform_block_size =
+		pic->pps->sps->log2_diff_max_min_transform_block_size;
+	result.max_transform_hierarchy_depth_inter =
+		pic->pps->sps->max_transform_hierarchy_depth_inter;
+	result.max_transform_hierarchy_depth_intra =
+		pic->pps->sps->max_transform_hierarchy_depth_intra;
+	result.pcm_sample_bit_depth_luma_minus1 = pic->pps->sps->pcm_sample_bit_depth_luma_minus1;
+	result.pcm_sample_bit_depth_chroma_minus1 =
+		pic->pps->sps->pcm_sample_bit_depth_chroma_minus1;
+	result.log2_min_pcm_luma_coding_block_size_minus3 =
+		pic->pps->sps->log2_min_pcm_luma_coding_block_size_minus3;
+	result.log2_diff_max_min_pcm_luma_coding_block_size =
+		pic->pps->sps->log2_diff_max_min_pcm_luma_coding_block_size;
+	result.num_short_term_ref_pic_sets = pic->pps->sps->num_short_term_ref_pic_sets;
+
+	result.pps_info_flags = 0;
+	result.pps_info_flags |= pic->pps->dependent_slice_segments_enabled_flag << 0;
+	result.pps_info_flags |= pic->pps->output_flag_present_flag << 1;
+	result.pps_info_flags |= pic->pps->sign_data_hiding_enabled_flag << 2;
+	result.pps_info_flags |= pic->pps->cabac_init_present_flag << 3;
+	result.pps_info_flags |= pic->pps->constrained_intra_pred_flag << 4;
+	result.pps_info_flags |= pic->pps->transform_skip_enabled_flag << 5;
+	result.pps_info_flags |= pic->pps->cu_qp_delta_enabled_flag << 6;
+	result.pps_info_flags |= pic->pps->pps_slice_chroma_qp_offsets_present_flag << 7;
+	result.pps_info_flags |= pic->pps->weighted_pred_flag << 8;
+	result.pps_info_flags |= pic->pps->weighted_bipred_flag << 9;
+	result.pps_info_flags |= pic->pps->transquant_bypass_enabled_flag << 10;
+	result.pps_info_flags |= pic->pps->tiles_enabled_flag << 11;
+	result.pps_info_flags |= pic->pps->entropy_coding_sync_enabled_flag << 12;
+	result.pps_info_flags |= pic->pps->uniform_spacing_flag << 13;
+	result.pps_info_flags |= pic->pps->loop_filter_across_tiles_enabled_flag << 14;
+	result.pps_info_flags |= pic->pps->pps_loop_filter_across_slices_enabled_flag << 15;
+	result.pps_info_flags |= pic->pps->deblocking_filter_override_enabled_flag << 16;
+	result.pps_info_flags |= pic->pps->pps_deblocking_filter_disabled_flag << 17;
+	result.pps_info_flags |= pic->pps->lists_modification_present_flag << 18;
+	result.pps_info_flags |= pic->pps->slice_segment_header_extension_present_flag << 19;
+
+	result.num_extra_slice_header_bits = pic->pps->num_extra_slice_header_bits;
+	result.num_long_term_ref_pic_sps = pic->pps->sps->num_long_term_ref_pics_sps;
+	result.num_ref_idx_l0_default_active_minus1 = pic->pps->num_ref_idx_l0_default_active_minus1;
+	result.num_ref_idx_l1_default_active_minus1 = pic->pps->num_ref_idx_l1_default_active_minus1;
+	result.pps_cb_qp_offset = pic->pps->pps_cb_qp_offset;
+	result.pps_cr_qp_offset = pic->pps->pps_cr_qp_offset;
+	result.pps_beta_offset_div2 = pic->pps->pps_beta_offset_div2;
+	result.pps_tc_offset_div2 = pic->pps->pps_tc_offset_div2;
+	result.diff_cu_qp_delta_depth = pic->pps->diff_cu_qp_delta_depth;
+	result.num_tile_columns_minus1 = pic->pps->num_tile_columns_minus1;
+	result.num_tile_rows_minus1 = pic->pps->num_tile_rows_minus1;
+	result.log2_parallel_merge_level_minus2 = pic->pps->log2_parallel_merge_level_minus2;
+	result.init_qp_minus26 = pic->pps->init_qp_minus26;
+
+	for (i = 0; i < 19; ++i)
+		result.column_width_minus1[i] = pic->pps->column_width_minus1[i];
+
+	for (i = 0; i < 21; ++i)
+		result.row_height_minus1[i] = pic->pps->row_height_minus1[i];
+
+	result.num_delta_pocs_ref_rps_idx = pic->NumDeltaPocsOfRefRpsIdx;
+	result.curr_idx = pic->CurrPicOrderCntVal;
+	result.curr_poc = pic->CurrPicOrderCntVal;
+	/* Tag the target with its POC; the loop below reads the same tag back from each reference. */
+	vl_video_buffer_set_associated_data(target, &dec->base,
+					    (void *)(uintptr_t)pic->CurrPicOrderCntVal,
+					    &radeon_dec_destroy_associated_data);
+
+	for (i = 0; i < 16; ++i) {
+		struct pipe_video_buffer *ref = pic->ref[i];
+		uintptr_t ref_pic = 0;
+
+		result.poc_list[i] = pic->PicOrderCntVal[i];
+		/* A missing reference is reported as 0x7F. */
+		if (ref)
+			ref_pic = (uintptr_t)vl_video_buffer_get_associated_data(ref, &dec->base);
+		else
+			ref_pic = 0x7F;
+		result.ref_pic_list[i] = ref_pic;
+	}
+	/* Preset all eight slots of each reference set to 0xFF, then overwrite the used ones. */
+	for (i = 0; i < 8; ++i) {
+		result.ref_pic_set_st_curr_before[i] = 0xFF;
+		result.ref_pic_set_st_curr_after[i] = 0xFF;
+		result.ref_pic_set_lt_curr[i] = 0xFF;
+	}
+	/* NOTE(review): the NumPoc* counts are not clamped to the 8 slots preset above -- confirm callers bound them. */
+	for (i = 0; i < pic->NumPocStCurrBefore; ++i)
+		result.ref_pic_set_st_curr_before[i] = pic->RefPicSetStCurrBefore[i];
+
+	for (i = 0; i < pic->NumPocStCurrAfter; ++i)
+		result.ref_pic_set_st_curr_after[i] = pic->RefPicSetStCurrAfter[i];
+
+	for (i = 0; i < pic->NumPocLtCurr; ++i)
+		result.ref_pic_set_lt_curr[i] = pic->RefPicSetLtCurr[i];
+
+	for (i = 0; i < 6; ++i)
+		result.ucScalingListDCCoefSizeID2[i] = pic->pps->sps->ScalingListDCCoeff16x16[i];
+
+	for (i = 0; i < 2; ++i)
+		result.ucScalingListDCCoefSizeID3[i] = pic->pps->sps->ScalingListDCCoeff32x32[i];
+	/* Layout written to dec->it: 4x4 at 0, 8x8 at 96, 16x16 at 480, 32x32 at 864 (992 bytes total). */
+	memcpy(dec->it, pic->pps->sps->ScalingList4x4, 6 * 16);
+	memcpy(dec->it + 96, pic->pps->sps->ScalingList8x8, 6 * 64);
+	memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64);
+	memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64);
+
+	for (i = 0 ; i < 2 ; i++) {
+		for (int j = 0 ; j < 15 ; j++)
+			result.direct_reflist[i][j] = pic->RefPicList[i][j];
+	}
+	/* HEVC_MAIN_10 profile with an NV12 target: fill in the 10-to-8 conversion fields. */
+	if ((pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) &&
+		(target->buffer_format == PIPE_FORMAT_NV12)) {
+		result.p010_mode = 0;
+		result.luma_10to8 = 5;
+		result.chroma_10to8 = 5;
+		result.hevc_reserved[0] = 4; /* sclr_luma10to8 */
+		result.hevc_reserved[1] = 4; /* sclr_chroma10to8 */
+	}
+
+	return result;
+}
+
+static unsigned calc_ctx_size_h265_main(struct radeon_decoder *dec)
+{
+	/* Context size in bytes, derived from the aligned picture size and reference count. */
+	const unsigned min_refs = (dec->base.width * dec->base.height >= 4096*2000) ? 8 : 17;
+	unsigned refs = dec->base.max_references + 1;
+	unsigned w = align(align(dec->base.width, VL_MACROBLOCK_WIDTH), 16);
+	unsigned h = align(align(dec->base.height, VL_MACROBLOCK_HEIGHT), 16);
+	unsigned cols = (w + 255) / 16;
+	unsigned rows = (h + 255) / 16;
+
+	/* Very large pictures need at least 8 references, everything else at least 17. */
+	if (refs < min_refs)
+		refs = min_refs;
+
+	return cols * rows * 16 * refs + 52 * 1024;
+}
+
+static unsigned calc_ctx_size_h265_main10(struct radeon_decoder *dec, struct pipe_h265_picture_desc *pic)
+{
+	/* Byte size of the decoder context buffer for the stream described by "pic":
+	 * per-CTB-row data for every reference plus the db_left_tile ctx and pixel areas. */
+	unsigned log2_ctb_size, width_in_ctb, height_in_ctb, num_16x16_block_per_ctb;
+	unsigned context_buffer_size_per_ctb_row, cm_buffer_size, max_mb_address, db_left_tile_pxl_size;
+	unsigned db_left_tile_ctx_size = 4096 / 16 * (32 + 16 * 4);
+	unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
+	unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
+	unsigned coeff_10bit = (pic->pps->sps->bit_depth_luma_minus8 ||
+			pic->pps->sps->bit_depth_chroma_minus8) ? 2 : 1; /* doubled above 8 bits */
+	unsigned max_references = dec->base.max_references + 1;
+
+	if (dec->base.width * dec->base.height >= 4096*2000)
+		max_references = MAX2(max_references, 8);
+	else
+		max_references = MAX2(max_references, 17);
+
+	/* CtbLog2SizeY = MinCbLog2SizeY + log2_diff_max_min: add the two logarithms, not
+	 * the block size in pixels, because the value is used as a shift count below. */
+	log2_ctb_size = pic->pps->sps->log2_min_luma_coding_block_size_minus3 + 3 +
+			pic->pps->sps->log2_diff_max_min_luma_coding_block_size;
+
+	width_in_ctb = (width + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size;
+	height_in_ctb = (height + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size;
+	num_16x16_block_per_ctb = ((1 << log2_ctb_size) >> 4) * ((1 << log2_ctb_size) >> 4);
+	context_buffer_size_per_ctb_row = align(width_in_ctb * num_16x16_block_per_ctb * 16, 256);
+	max_mb_address = (unsigned) ceil(height * 8 / 2048.0);
+
+	cm_buffer_size = max_references * context_buffer_size_per_ctb_row * height_in_ctb;
+	db_left_tile_pxl_size = coeff_10bit * (max_mb_address * 2 * 2048 + 1024);
+	return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size;
+}
+
+static rvcn_dec_message_vc1_t get_vc1_msg(struct pipe_vc1_picture_desc *pic)
+{
+	rvcn_dec_message_vc1_t result;
+	/* Build the VC-1 decode message: profile, level and the two packed flag words. */
+	memset(&result, 0, sizeof(result));
+	switch(pic->base.profile) {
+	case PIPE_VIDEO_PROFILE_VC1_SIMPLE:
+		result.profile = RDECODE_VC1_PROFILE_SIMPLE;
+		result.level = 1;
+		break;
+
+	case PIPE_VIDEO_PROFILE_VC1_MAIN:
+		result.profile = RDECODE_VC1_PROFILE_MAIN;
+		result.level = 2;
+		break;
+
+	case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
+		result.profile = RDECODE_VC1_PROFILE_ADVANCED;
+		result.level = 4;
+		break;
+
+	default:
+		assert(0);
+	}
+	/* Each field is OR-ed into sps_info_flags at a fixed bit position. */
+	result.sps_info_flags |= pic->postprocflag << 7;
+	result.sps_info_flags |= pic->pulldown << 6;
+	result.sps_info_flags |= pic->interlace << 5;
+	result.sps_info_flags |= pic->tfcntrflag << 4;
+	result.sps_info_flags |= pic->finterpflag << 3;
+	result.sps_info_flags |= pic->psf << 1;
+	/* Bit 31 is shifted as uint32_t: a narrow field promotes to int, where "1 << 31" overflows. */
+	result.pps_info_flags |= (uint32_t)pic->range_mapy_flag << 31;
+	result.pps_info_flags |= pic->range_mapy << 28;
+	result.pps_info_flags |= pic->range_mapuv_flag << 27;
+	result.pps_info_flags |= pic->range_mapuv << 24;
+	result.pps_info_flags |= pic->multires << 21;
+	result.pps_info_flags |= pic->maxbframes << 16;
+	result.pps_info_flags |= pic->overlap << 11;
+	result.pps_info_flags |= pic->quantizer << 9;
+	result.pps_info_flags |= pic->panscan_flag << 7;
+	result.pps_info_flags |= pic->refdist_flag << 6;
+	result.pps_info_flags |= pic->vstransform << 0;
+	/* These fields are only forwarded for profiles other than SIMPLE. */
+	if (pic->base.profile != PIPE_VIDEO_PROFILE_VC1_SIMPLE) {
+		result.pps_info_flags |= pic->syncmarker << 20;
+		result.pps_info_flags |= pic->rangered << 19;
+		result.pps_info_flags |= pic->loopfilter << 5;
+		result.pps_info_flags |= pic->fastuvmc << 4;
+		result.pps_info_flags |= pic->extended_mv << 3;
+		result.pps_info_flags |= pic->extended_dmv << 8;
+		result.pps_info_flags |= pic->dquant << 1;
+	}
+
+	result.chroma_format = 1;
+
+	return result;
+}
+
+static uint32_t get_ref_pic_idx(struct radeon_decoder *dec, struct pipe_video_buffer *ref)
+{
+	uint32_t oldest = MAX2(dec->frame_number, NUM_MPEG2_REFS) - NUM_MPEG2_REFS;
+	uint32_t newest = MAX2(dec->frame_number, 1) - 1;
+	uintptr_t stored, clamped;
+
+	/* without a reference buffer, fall back to the newest allowed index */
+	if (ref == NULL)
+		return newest;
+
+	/* the frame number is kept on the buffer as associated data */
+	stored = (uintptr_t)vl_video_buffer_get_associated_data(ref, &dec->base);
+	/* clamp it into [oldest, newest] */
+	clamped = MIN2(stored, newest);
+	return MAX2(clamped, oldest);
+}
+
+static rvcn_dec_message_mpeg2_vld_t get_mpeg2_msg(struct radeon_decoder *dec,
+				       struct pipe_mpeg12_picture_desc *pic)
+{
+	rvcn_dec_message_mpeg2_vld_t msg;
+	const int *scan_order;
+	unsigned k;
+
+	memset(&msg, 0, sizeof(msg));
+	/* index of the picture being decoded and of its two references */
+	msg.decoded_pic_idx = dec->frame_number;
+	msg.forward_ref_pic_idx = get_ref_pic_idx(dec, pic->ref[0]);
+	msg.backward_ref_pic_idx = get_ref_pic_idx(dec, pic->ref[1]);
+
+	/* both quantiser matrices are always flagged for loading and reordered through the scan table */
+	scan_order = pic->alternate_scan ? vl_zscan_alternate : vl_zscan_normal;
+	msg.load_intra_quantiser_matrix = 1;
+	msg.load_nonintra_quantiser_matrix = 1;
+	for (k = 0; k < 64; ++k) {
+		msg.intra_quantiser_matrix[k] = pic->intra_matrix[scan_order[k]];
+		msg.nonintra_quantiser_matrix[k] = pic->non_intra_matrix[scan_order[k]];
+	}
+
+	msg.profile_and_level_indication = 0;
+	msg.chroma_format = 0x1;
+	msg.picture_coding_type = pic->picture_coding_type;
+	msg.f_code[0][0] = pic->f_code[0][0] + 1; /* every f_code is passed on incremented by one */
+	msg.f_code[0][1] = pic->f_code[0][1] + 1;
+	msg.f_code[1][0] = pic->f_code[1][0] + 1;
+	msg.f_code[1][1] = pic->f_code[1][1] + 1;
+	msg.intra_dc_precision = pic->intra_dc_precision;
+	msg.pic_structure = pic->picture_structure;
+	msg.top_field_first = pic->top_field_first;
+	msg.frame_pred_frame_dct = pic->frame_pred_frame_dct;
+	msg.concealment_motion_vectors = pic->concealment_motion_vectors;
+	msg.q_scale_type = pic->q_scale_type;
+	msg.intra_vlc_format = pic->intra_vlc_format;
+	msg.alternate_scan = pic->alternate_scan;
+
+	return msg;
+}
+
+static rvcn_dec_message_mpeg4_asp_vld_t get_mpeg4_msg(struct radeon_decoder *dec,
+				       struct pipe_mpeg4_picture_desc *pic)
+{
+	rvcn_dec_message_mpeg4_asp_vld_t result;
+	unsigned i;
+	/* build the MPEG-4 codec message from the decoder state and the picture description */
+	memset(&result, 0, sizeof(result));
+	result.decoded_pic_idx = dec->frame_number;
+
+	result.forward_ref_pic_idx = get_ref_pic_idx(dec, pic->ref[0]);
+	result.backward_ref_pic_idx = get_ref_pic_idx(dec, pic->ref[1]);
+
+	result.variant_type = 0;
+	result.profile_and_level_indication = 0xF0;
+
+	result.video_object_layer_verid = 0x5;
+	result.video_object_layer_shape = 0x0;
+
+	result.video_object_layer_width = dec->base.width;
+	result.video_object_layer_height = dec->base.height;
+
+	result.vop_time_increment_resolution = pic->vop_time_increment_resolution;
+	/* each flag is a separate message field: store the value itself, not a bit position of a packed word */
+	result.short_video_header = pic->short_video_header;
+	result.interlaced = pic->interlaced;
+	result.load_intra_quant_mat = 1;
+	result.load_nonintra_quant_mat = 1;
+	result.quarter_sample = pic->quarter_sample;
+	result.complexity_estimation_disable = 1;
+	result.resync_marker_disable = pic->resync_marker_disable;
+	result.newpred_enable = 0;
+	result.reduced_resolution_vop_enable = 0;
+
+	result.quant_type = pic->quant_type;
+
+	for (i = 0; i < 64; ++i) {
+		result.intra_quant_mat[i] = pic->intra_matrix[vl_zscan_normal[i]];
+		result.nonintra_quant_mat[i] = pic->non_intra_matrix[vl_zscan_normal[i]];
+	}
+
+	return result;
+}
+
+static void rvcn_dec_message_create(struct radeon_decoder *dec)
+{
+	const unsigned hdr_bytes = sizeof(rvcn_dec_message_header_t);
+	rvcn_dec_message_header_t *header = dec->msg;
+	rvcn_dec_message_create_t *create = dec->msg + hdr_bytes;
+
+	memset(dec->msg, 0, hdr_bytes + sizeof(*create));
+	header->msg_type = RDECODE_MSG_CREATE;
+	header->stream_handle = dec->stream_handle;
+	header->header_size = hdr_bytes;
+	header->total_size = hdr_bytes + sizeof(*create);
+	header->num_buffers = 1;
+	header->status_report_feedback_number = 0;
+	/* the single index entry describes the create payload placed right behind the header */
+	header->index[0].message_id = RDECODE_MESSAGE_CREATE;
+	header->index[0].offset = hdr_bytes;
+	header->index[0].size = sizeof(*create);
+	header->index[0].filled = 0;
+
+	create->stream_type = dec->stream_type;
+	create->session_flags = 0;
+	create->width_in_samples = dec->base.width;
+	create->height_in_samples = dec->base.height;
+}
+
+static struct pb_buffer *rvcn_dec_message_decode(struct radeon_decoder *dec,
+					struct pipe_video_buffer *target,
+					struct pipe_picture_desc *picture)
+{
+	struct r600_texture *luma = (struct r600_texture *)
+				((struct vl_video_buffer *)target)->resources[0];
+	struct r600_texture *chroma = (struct r600_texture *)
+				((struct vl_video_buffer *)target)->resources[1];
+	rvcn_dec_message_header_t *header;
+	rvcn_dec_message_index_t *index;
+	rvcn_dec_message_decode_t *decode;
+	unsigned sizes = 0, offset_decode, offset_codec;
+	void *codec;
+	/* build the decode message in dec->msg; returns the luma buffer of target, or NULL for an unhandled codec */
+	header = dec->msg;
+	sizes += sizeof(rvcn_dec_message_header_t);
+	index = (void*)header + sizeof(rvcn_dec_message_header_t);
+	sizes += sizeof(rvcn_dec_message_index_t);
+	offset_decode = sizes;
+	decode = (void*)index + sizeof(rvcn_dec_message_index_t);
+	sizes += sizeof(rvcn_dec_message_decode_t);
+	offset_codec = sizes;
+	codec = (void*)decode + sizeof(rvcn_dec_message_decode_t);
+
+	memset(dec->msg, 0, sizes);
+	header->header_size = sizeof(rvcn_dec_message_header_t);
+	header->total_size = sizes;
+	header->num_buffers = 2;
+	header->msg_type = RDECODE_MSG_DECODE;
+	header->stream_handle = dec->stream_handle;
+	header->status_report_feedback_number = dec->frame_number;
+
+	header->index[0].message_id = RDECODE_MESSAGE_DECODE;
+	header->index[0].offset = offset_decode;
+	header->index[0].size = sizeof(rvcn_dec_message_decode_t);
+	header->index[0].filled = 0;
+
+	index->offset = offset_codec;
+	index->size = sizeof(rvcn_dec_message_avc_t); /* NOTE(review): AVC message size is used for every codec - confirm */
+	index->filled = 0;
+
+	decode->stream_type = dec->stream_type;
+	decode->decode_flags = 0x1;
+	decode->width_in_samples = dec->base.width;
+	decode->height_in_samples = dec->base.height;
+
+	decode->bsd_size = align(dec->bs_size, 128);
+	decode->dpb_size = dec->dpb.res->buf->size;
+	decode->dt_size =
+		((struct r600_resource *)((struct vl_video_buffer *)target)->resources[0])->buf->size +
+		((struct r600_resource *)((struct vl_video_buffer *)target)->resources[1])->buf->size;
+
+	decode->sct_size = 0;
+	decode->sc_coeff_size = 0;
+
+	decode->sw_ctxt_size = RDECODE_SESSION_CONTEXT_SIZE;
+	decode->db_pitch = align(dec->base.width, 32);
+	decode->db_surf_tile_config = 0;
+
+	decode->dt_pitch = luma->surface.u.gfx9.surf_pitch * luma->surface.bpe;
+	decode->dt_uv_pitch = decode->dt_pitch / 2;
+
+	decode->dt_tiling_mode = 0;
+	decode->dt_swizzle_mode = RDECODE_SW_MODE_LINEAR;
+	decode->dt_array_mode = RDECODE_ARRAY_MODE_LINEAR;
+	decode->dt_field_mode = ((struct vl_video_buffer *)target)->base.interlaced;
+	decode->dt_surf_tile_config = 0;
+	decode->dt_uv_surf_tile_config = 0;
+
+	decode->dt_luma_top_offset = luma->surface.u.gfx9.surf_offset;
+	decode->dt_chroma_top_offset = chroma->surface.u.gfx9.surf_offset;
+	if (decode->dt_field_mode) {
+		decode->dt_luma_bottom_offset = luma->surface.u.gfx9.surf_offset +
+				luma->surface.u.gfx9.surf_slice_size;
+		decode->dt_chroma_bottom_offset = chroma->surface.u.gfx9.surf_offset +
+				chroma->surface.u.gfx9.surf_slice_size;
+	} else {
+		decode->dt_luma_bottom_offset = decode->dt_luma_top_offset;
+		decode->dt_chroma_bottom_offset = decode->dt_chroma_top_offset;
+	}
+
+	switch (u_reduce_video_profile(picture->profile)) {
+	case PIPE_VIDEO_FORMAT_MPEG4_AVC: {
+		rvcn_dec_message_avc_t avc =
+			get_h264_msg(dec, (struct pipe_h264_picture_desc*)picture);
+		memcpy(codec, (void*)&avc, sizeof(rvcn_dec_message_avc_t));
+		index->message_id = RDECODE_MESSAGE_AVC;
+		break;
+	}
+	case PIPE_VIDEO_FORMAT_HEVC: {
+		rvcn_dec_message_hevc_t hevc =
+			get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture);
+
+		memcpy(codec, (void*)&hevc, sizeof(rvcn_dec_message_hevc_t));
+		index->message_id = RDECODE_MESSAGE_HEVC;
+		if (dec->ctx.res == NULL) { /* the HEVC context buffer is created on first use */
+			unsigned ctx_size;
+			if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+				ctx_size = calc_ctx_size_h265_main10(dec,
+					(struct pipe_h265_picture_desc*)picture);
+			else
+				ctx_size = calc_ctx_size_h265_main(dec);
+			if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT))
+				RVID_ERR("Can't allocated context buffer.\n");
+			else /* only clear a buffer that was actually created */
+				rvid_clear_buffer(dec->base.context, &dec->ctx);
+		}
+		break;
+	}
+	case PIPE_VIDEO_FORMAT_VC1: {
+		rvcn_dec_message_vc1_t vc1 = get_vc1_msg((struct pipe_vc1_picture_desc*)picture);
+
+		memcpy(codec, (void*)&vc1, sizeof(rvcn_dec_message_vc1_t));
+		if ((picture->profile == PIPE_VIDEO_PROFILE_VC1_SIMPLE) ||
+		    (picture->profile == PIPE_VIDEO_PROFILE_VC1_MAIN)) {
+			decode->width_in_samples = align(decode->width_in_samples, 16) / 16;
+			decode->height_in_samples = align(decode->height_in_samples, 16) / 16;
+		}
+		index->message_id = RDECODE_MESSAGE_VC1;
+		break;
+	}
+	case PIPE_VIDEO_FORMAT_MPEG12: {
+		rvcn_dec_message_mpeg2_vld_t mpeg2 =
+			get_mpeg2_msg(dec, (struct pipe_mpeg12_picture_desc*)picture);
+
+		memcpy(codec, (void*)&mpeg2, sizeof(rvcn_dec_message_mpeg2_vld_t));
+		index->message_id = RDECODE_MESSAGE_MPEG2_VLD;
+		break;
+	}
+	case PIPE_VIDEO_FORMAT_MPEG4: {
+		rvcn_dec_message_mpeg4_asp_vld_t mpeg4 =
+			get_mpeg4_msg(dec, (struct pipe_mpeg4_picture_desc*)picture);
+
+		memcpy(codec, (void*)&mpeg4, sizeof(rvcn_dec_message_mpeg4_asp_vld_t));
+		index->message_id = RDECODE_MESSAGE_MPEG4_ASP_VLD;
+		break;
+	}
+	default:
+		assert(0);
+		return NULL;
+	}
+
+	if (dec->ctx.res)
+		decode->hw_ctxt_size = dec->ctx.res->buf->size;
+
+	return luma->resource.buf;
+}
+
+static void rvcn_dec_message_destroy(struct radeon_decoder *dec)
+{
+	const unsigned hdr_bytes = sizeof(rvcn_dec_message_header_t);
+	rvcn_dec_message_header_t *header = dec->msg;
+	memset(header, 0, hdr_bytes);
+	/* total_size is the header size less one index entry; no buffers are attached */
+	header->total_size = hdr_bytes - sizeof(rvcn_dec_message_index_t);
+	header->header_size = hdr_bytes;
+	header->num_buffers = 0;
+	header->msg_type = RDECODE_MSG_DESTROY;
+	header->stream_handle = dec->stream_handle;
+	header->status_report_feedback_number = 0;
+}
+
+static void rvcn_dec_message_feedback(struct radeon_decoder *dec)
+{
+	rvcn_dec_feedback_header_t *fb_hdr = (void*)dec->fb;
+	/* header-only feedback message: total size equals header size, no buffers */
+	fb_hdr->num_buffers = 0;
+	fb_hdr->header_size = sizeof(*fb_hdr);
+	fb_hdr->total_size = sizeof(*fb_hdr);
+}
+
+/* flush the IB to the hardware with the given flags; returns the result of the winsys cs_flush call */
+static int flush(struct radeon_decoder *dec, unsigned flags)
+{
+	return dec->ws->cs_flush(dec->cs, flags, NULL);
+}
+
+/* add a new set register command to the IB: a PKT0 header for reg >> 2, followed by the value */
+static void set_reg(struct radeon_decoder *dec, unsigned reg, uint32_t val)
+{
+	radeon_emit(dec->cs, RDECODE_PKT0(reg >> 2, 0));
+	radeon_emit(dec->cs, val);
+}
+
+/* add buf to the CS and pass its address (plus off) and the command code through the GPCOM registers */
+static void send_cmd(struct radeon_decoder *dec, unsigned cmd,
+		     struct pb_buffer* buf, uint32_t off,
+		     enum radeon_bo_usage usage, enum radeon_bo_domain domain)
+{
+	uint64_t gpu_addr;
+
+	dec->ws->cs_add_buffer(dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
+			   domain, RADEON_PRIO_UVD);
+	gpu_addr = dec->ws->buffer_get_virtual_address(buf);
+	gpu_addr += off;
+	/* low 32 bits, high 32 bits, then the command shifted left by one */
+	set_reg(dec, RDECODE_GPCOM_VCPU_DATA0, gpu_addr);
+	set_reg(dec, RDECODE_GPCOM_VCPU_DATA1, gpu_addr >> 32);
+	set_reg(dec, RDECODE_GPCOM_VCPU_CMD, cmd << 1);
+}
+
+/* does the codec need an IT (scaling table) buffer? true for H.264 perf and H.265 streams */
+static bool have_it(struct radeon_decoder *dec)
+{
+	const unsigned codec = dec->stream_type;
+	return codec == RDECODE_CODEC_H264_PERF || codec == RDECODE_CODEC_H265;
+}
+
+/* map the current message/feedback/itscaling buffer and set the CPU pointers into it */
+static void map_msg_fb_it_buf(struct radeon_decoder *dec)
+{
+	/* buffer set selected by cur_buffer */
+	struct rvid_buffer *cur = &dec->msg_fb_it_buffers[dec->cur_buffer];
+	uint8_t *base;
+
+	/* map it for CPU writes */
+	base = dec->ws->buffer_map(cur->res->buf, dec->cs, PIPE_TRANSFER_WRITE);
+
+	/* the message starts at offset 0, the feedback area at FB_BUFFER_OFFSET */
+	dec->msg = base;
+	dec->fb = (uint32_t *)(base + FB_BUFFER_OFFSET);
+
+	/* the IT table follows the feedback area; dec->it is left untouched for other codecs */
+	if (!have_it(dec))
+		return;
+	dec->it = (uint8_t *)(base + FB_BUFFER_OFFSET + FB_BUFFER_SIZE);
+}
+
+/* unmap the current message buffer and send it, preceded by the session context, to the VCPU */
+static void send_msg_buf(struct radeon_decoder *dec)
+{
+	struct pb_buffer *msg_bo;
+
+	/* nothing to do unless both the message and the feedback pointer are mapped */
+	if (dec->msg == NULL || dec->fb == NULL)
+		return;
+
+	msg_bo = dec->msg_fb_it_buffers[dec->cur_buffer].res->buf;
+
+	/* drop the CPU mapping and forget the pointers into it */
+	dec->ws->buffer_unmap(msg_bo);
+	dec->it = NULL;
+	dec->fb = NULL;
+	dec->msg = NULL;
+
+	/* the session context buffer, when present, is sent first */
+	if (dec->sessionctx.res != NULL)
+		send_cmd(dec, RDECODE_CMD_SESSION_CONTEXT_BUFFER,
+			 dec->sessionctx.res->buf, 0, RADEON_USAGE_READWRITE,
+			 RADEON_DOMAIN_VRAM);
+
+	/* then the message buffer itself */
+	send_cmd(dec, RDECODE_CMD_MSG_BUFFER, msg_bo, 0,
+		 RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
+}
+
+/* cycle cur_buffer to the next set of buffers */
+static void next_buffer(struct radeon_decoder *dec)
+{
+	/* step forward by one and wrap around at NUM_BUFFERS */
+	dec->cur_buffer = (dec->cur_buffer + 1) % NUM_BUFFERS;
+}
+
+static unsigned calc_ctx_size_h264_perf(struct radeon_decoder *dec)
+{
+	// size of the context buffer allocated for RDECODE_CODEC_H264_PERF streams
+	unsigned mb_cols, mb_rows, frame_mbs, level_mbs, dpb_frames;
+	unsigned refs = dec->base.max_references + 1;
+	unsigned w = align(dec->base.width, VL_MACROBLOCK_WIDTH);
+	unsigned h = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
+
+	// picture size in 16 pixel units, row count rounded up to an even number
+	mb_cols = w / VL_MACROBLOCK_WIDTH;
+	mb_rows = align(h / VL_MACROBLOCK_HEIGHT, 2);
+	frame_mbs = mb_cols * mb_rows;
+
+	// macroblock budget selected by the stream level
+	switch (dec->base.level) {
+	case 30:
+		level_mbs = 8100;
+		break;
+	case 31:
+		level_mbs = 18000;
+		break;
+	case 32:
+		level_mbs = 20480;
+		break;
+	case 41:
+		level_mbs = 32768;
+		break;
+	case 42:
+		level_mbs = 34816;
+		break;
+	case 50:
+		level_mbs = 110400;
+		break;
+	case 51:
+	default:
+		level_mbs = 184320;
+		break;
+	}
+
+	// frames that fit into the budget, plus one
+	dpb_frames = level_mbs / frame_mbs + 1;
+	refs = MAX2(MIN2(NUM_H264_REFS, dpb_frames), refs);
+
+	// per reference: 192 bytes for each macroblock, rounded up to 256 bytes
+	return refs * align(mb_cols * mb_rows * 192, 256);
+}
+
+/* calculate size of the reference picture buffer (dpb) for the decoder's profile, width, height and level */
+static unsigned calc_dpb_size(struct radeon_decoder *dec)
+{
+	unsigned width_in_mb, height_in_mb, image_size, dpb_size;
+
+	// always align them to MB size for dpb calculation
+	unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
+	unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
+
+	// always one more for currently decoded picture
+	unsigned max_references = dec->base.max_references + 1;
+
+	// aligned size of a single frame: width padded to 32, plus half again, rounded up to 1024 bytes
+	image_size = align(width, 32) * height;
+	image_size += image_size / 2;
+	image_size = align(image_size, 1024);
+
+	// picture width & height in 16 pixel units
+	width_in_mb = width / VL_MACROBLOCK_WIDTH;
+	height_in_mb = align(height / VL_MACROBLOCK_HEIGHT, 2);
+
+	switch (u_reduce_video_profile(dec->base.profile)) {
+	case PIPE_VIDEO_FORMAT_MPEG4_AVC: {
+		unsigned fs_in_mb = width_in_mb * height_in_mb;
+		unsigned num_dpb_buffer;
+		// per-level macroblock budget divided by the frame size in macroblocks
+		switch(dec->base.level) {
+		case 30:
+			num_dpb_buffer = 8100 / fs_in_mb;
+			break;
+		case 31:
+			num_dpb_buffer = 18000 / fs_in_mb;
+			break;
+		case 32:
+			num_dpb_buffer = 20480 / fs_in_mb;
+			break;
+		case 41:
+			num_dpb_buffer = 32768 / fs_in_mb;
+			break;
+		case 42:
+			num_dpb_buffer = 34816 / fs_in_mb;
+			break;
+		case 50:
+			num_dpb_buffer = 110400 / fs_in_mb;
+			break;
+		case 51:
+			num_dpb_buffer = 184320 / fs_in_mb;
+			break;
+		default:
+			num_dpb_buffer = 184320 / fs_in_mb;
+			break;
+		}
+		num_dpb_buffer++;
+		max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references);
+		dpb_size = image_size * max_references;
+		break;
+	}
+
+	case PIPE_VIDEO_FORMAT_HEVC:
+		if (dec->base.width * dec->base.height >= 4096*2000) // at least 8 references from this size on, 17 below it
+			max_references = MAX2(max_references, 8);
+		else
+			max_references = MAX2(max_references, 17);
+
+		width = align (width, 16);
+		height = align (height, 16);
+		if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) // main10 uses a 9/4 factor per frame instead of 3/2
+			dpb_size = align((align(width, 32) * height * 9) / 4, 256) * max_references;
+		else
+			dpb_size = align((align(width, 32) * height * 3) / 2, 256) * max_references;
+		break;
+
+	case PIPE_VIDEO_FORMAT_VC1:
+		// the firmware seems to always assume a minimum of ref frames
+		max_references = MAX2(NUM_VC1_REFS, max_references);
+
+		// reference picture buffer
+		dpb_size = image_size * max_references;
+
+		// CONTEXT_BUFFER
+		dpb_size += width_in_mb * height_in_mb * 128;
+
+		// IT surface buffer
+		dpb_size += width_in_mb * 64;
+
+		// DB surface buffer
+		dpb_size += width_in_mb * 128;
+
+		// BP
+		dpb_size += align(MAX2(width_in_mb, height_in_mb) * 7 * 16, 64);
+		break;
+
+	case PIPE_VIDEO_FORMAT_MPEG12:
+		// reference picture buffer, must be big enough for all frames
+		dpb_size = image_size * NUM_MPEG2_REFS;
+		break;
+
+	case PIPE_VIDEO_FORMAT_MPEG4:
+		// reference picture buffer
+		dpb_size = image_size * max_references;
+
+		// CM
+		dpb_size += width_in_mb * height_in_mb * 64;
+
+		// IT surface buffer
+		dpb_size += align(width_in_mb * height_in_mb * 32, 64);
+		// never smaller than 30 MiB
+		dpb_size = MAX2(dpb_size, 30 * 1024 * 1024);
+		break;
+
+	default:
+		// something is missing here
+		assert(0);
+
+		// at least use a sane default value
+		dpb_size = 32 * 1024 * 1024;
+		break;
+	}
+	return dpb_size;
+}
+
+/**
+ * destroy this video decoder: send the destroy message, then release the CS, all buffers and the decoder
+ */
+static void radeon_dec_destroy(struct pipe_video_codec *decoder)
+{
+	struct radeon_decoder *dec = (struct radeon_decoder*)decoder;
+	unsigned n;
+
+	assert(decoder);
+
+	/* queue the destroy message and flush it with flags 0 */
+	map_msg_fb_it_buf(dec);
+	rvcn_dec_message_destroy(dec);
+	send_msg_buf(dec);
+	flush(dec, 0);
+
+	dec->ws->cs_destroy(dec->cs);
+
+	for (n = 0; n < NUM_BUFFERS; ++n) {
+		rvid_destroy_buffer(&dec->msg_fb_it_buffers[n]);
+		rvid_destroy_buffer(&dec->bs_buffers[n]);
+	}
+
+	rvid_destroy_buffer(&dec->dpb);
+	rvid_destroy_buffer(&dec->ctx);
+	rvid_destroy_buffer(&dec->sessionctx);
+
+	FREE(dec);
+}
+
+/**
+ * start decoding of a new frame: number the target and map the current bitstream buffer
+ */
+static void radeon_dec_begin_frame(struct pipe_video_codec *decoder,
+			     struct pipe_video_buffer *target,
+			     struct pipe_picture_desc *picture)
+{
+	struct radeon_decoder *dec = (struct radeon_decoder*)decoder;
+	struct rvid_buffer *bs_buf;
+	uintptr_t frame_no;
+
+	assert(decoder);
+	/* attach the new frame number to the target as associated data */
+	frame_no = ++dec->frame_number;
+	vl_video_buffer_set_associated_data(target, decoder, (void *)frame_no,
+					    &radeon_dec_destroy_associated_data);
+	/* restart bitstream accumulation at the beginning of the current bitstream buffer */
+	bs_buf = &dec->bs_buffers[dec->cur_buffer];
+	dec->bs_size = 0;
+	dec->bs_ptr = dec->ws->buffer_map(bs_buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE);
+}
+
+/**
+ * decode a macroblock: unsupported here, every argument is ignored and the call asserts
+ */
+static void radeon_dec_decode_macroblock(struct pipe_video_codec *decoder,
+				   struct pipe_video_buffer *target,
+				   struct pipe_picture_desc *picture,
+				   const struct pipe_macroblock *macroblocks,
+				   unsigned num_macroblocks)
+{
+	/* not supported (yet) */
+	assert(0);
+}
+
+/**
+ * decode a bitstream: append the given buffers to the mapped bitstream buffer, growing it when needed
+ */
+static void radeon_dec_decode_bitstream(struct pipe_video_codec *decoder,
+				  struct pipe_video_buffer *target,
+				  struct pipe_picture_desc *picture,
+				  unsigned num_buffers,
+				  const void * const *buffers,
+				  const unsigned *sizes)
+{
+	struct radeon_decoder *dec = (struct radeon_decoder*)decoder;
+	unsigned i;
+
+	assert(decoder);
+
+	if (!dec->bs_ptr)
+		return;
+
+	for (i = 0; i < num_buffers; ++i) {
+		struct rvid_buffer *buf = &dec->bs_buffers[dec->cur_buffer];
+		unsigned new_size = dec->bs_size + sizes[i];
+
+		if (new_size > buf->res->buf->size) {
+			dec->ws->buffer_unmap(buf->res->buf);
+			if (!rvid_resize_buffer(dec->screen, dec->cs, buf, new_size)) {
+				RVID_ERR("Can't resize bitstream buffer!");
+				dec->bs_ptr = NULL; /* already unmapped: keep end_frame from writing through a stale pointer */
+				return;
+			}
+			dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
+							  PIPE_TRANSFER_WRITE);
+			if (!dec->bs_ptr)
+				return;
+			/* continue writing behind the data that was copied so far */
+			dec->bs_ptr += dec->bs_size;
+		}
+
+		memcpy(dec->bs_ptr, buffers[i], sizes[i]);
+		dec->bs_size += sizes[i];
+		dec->bs_ptr += sizes[i];
+	}
+}
+
+/**
+ * end decoding of the current frame: pad the bitstream, build the messages and submit all buffers
+ */
+static void radeon_dec_end_frame(struct pipe_video_codec *decoder,
+			   struct pipe_video_buffer *target,
+			   struct pipe_picture_desc *picture)
+{
+	struct radeon_decoder *dec = (struct radeon_decoder*)decoder;
+	struct pb_buffer *dt;
+	struct rvid_buffer *msg_fb_it_buf, *bs_buf;
+
+	assert(decoder);
+	/* nothing is submitted when the bitstream buffer is not mapped */
+	if (!dec->bs_ptr)
+		return;
+
+	msg_fb_it_buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
+	bs_buf = &dec->bs_buffers[dec->cur_buffer];
+	/* zero the bytes up to the next multiple of 128, then drop the CPU mapping */
+	memset(dec->bs_ptr, 0, align(dec->bs_size, 128) - dec->bs_size);
+	dec->ws->buffer_unmap(bs_buf->res->buf);
+	/* build the decode and feedback messages in the message buffer and queue it */
+	map_msg_fb_it_buf(dec);
+	dt = rvcn_dec_message_decode(dec, target, picture); /* NOTE(review): NULL for an unhandled codec; send_cmd below does not check */
+	rvcn_dec_message_feedback(dec);
+	send_msg_buf(dec);
+	/* queue the buffers used for this frame */
+	send_cmd(dec, RDECODE_CMD_DPB_BUFFER, dec->dpb.res->buf, 0,
+		 RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
+	if (dec->ctx.res)
+		send_cmd(dec, RDECODE_CMD_CONTEXT_BUFFER, dec->ctx.res->buf, 0,
+			RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
+	send_cmd(dec, RDECODE_CMD_BITSTREAM_BUFFER, bs_buf->res->buf,
+		 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
+	send_cmd(dec, RDECODE_CMD_DECODING_TARGET_BUFFER, dt, 0,
+		 RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM);
+	send_cmd(dec, RDECODE_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->buf,
+		 FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
+	if (have_it(dec))
+		send_cmd(dec, RDECODE_CMD_IT_SCALING_TABLE_BUFFER, msg_fb_it_buf->res->buf,
+			 FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
+	set_reg(dec, RDECODE_ENGINE_CNTL, 1);
+	/* flush with RADEON_FLUSH_ASYNC and move on to the next buffer set */
+	flush(dec, RADEON_FLUSH_ASYNC);
+	next_buffer(dec);
+}
+
+/**
+ * flush hook of the codec interface: empty here, radeon_dec_end_frame already flushes every frame
+ */
+static void radeon_dec_flush(struct pipe_video_codec *decoder)
+{
+}
+
+/**
+ * create a HW decoder for the given template; returns NULL when an allocation or the create message fails
+ */
+struct pipe_video_codec *radeon_create_decoder(struct pipe_context *context,
+					     const struct pipe_video_codec *templ)
+{
+	struct radeon_winsys* ws = ((struct r600_common_context *)context)->ws;
+	struct r600_common_context *rctx = (struct r600_common_context*)context;
+	unsigned width = templ->width, height = templ->height;
+	unsigned dpb_size, bs_buf_size, stream_type = 0;
+	struct radeon_decoder *dec;
+	int r, i;
+	/* pick the stream type; MPEG-4 and H.264 sizes are aligned to whole macroblocks */
+	switch(u_reduce_video_profile(templ->profile)) {
+	case PIPE_VIDEO_FORMAT_MPEG12:
+		if (templ->entrypoint > PIPE_VIDEO_ENTRYPOINT_BITSTREAM) /* anything past the bitstream entrypoint goes to the vl decoder */
+			return vl_create_mpeg12_decoder(context, templ);
+		stream_type = RDECODE_CODEC_MPEG2_VLD;
+		break;
+	case PIPE_VIDEO_FORMAT_MPEG4:
+		width = align(width, VL_MACROBLOCK_WIDTH);
+		height = align(height, VL_MACROBLOCK_HEIGHT);
+		stream_type = RDECODE_CODEC_MPEG4;
+		break;
+	case PIPE_VIDEO_FORMAT_VC1:
+		stream_type = RDECODE_CODEC_VC1;
+		break;
+	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
+		width = align(width, VL_MACROBLOCK_WIDTH);
+		height = align(height, VL_MACROBLOCK_HEIGHT);
+		stream_type = RDECODE_CODEC_H264_PERF;
+		break;
+	case PIPE_VIDEO_FORMAT_HEVC:
+		stream_type = RDECODE_CODEC_H265;
+		break;
+	default:
+		assert(0); /* NOTE(review): if the assert does not stop execution, creation continues with stream_type 0 */
+		break;
+	}
+
+	dec = CALLOC_STRUCT(radeon_decoder);
+
+	if (!dec)
+		return NULL;
+
+	dec->base = *templ;
+	dec->base.context = context;
+	dec->base.width = width;
+	dec->base.height = height;
+
+	dec->base.destroy = radeon_dec_destroy;
+	dec->base.begin_frame = radeon_dec_begin_frame;
+	dec->base.decode_macroblock = radeon_dec_decode_macroblock;
+	dec->base.decode_bitstream = radeon_dec_decode_bitstream;
+	dec->base.end_frame = radeon_dec_end_frame;
+	dec->base.flush = radeon_dec_flush;
+
+	dec->stream_type = stream_type;
+	dec->stream_handle = rvid_alloc_stream_handle();
+	dec->screen = context->screen;
+	dec->ws = ws;
+	dec->cs = ws->cs_create(rctx->ctx, RING_VCN_DEC, NULL, NULL);
+	if (!dec->cs) {
+		RVID_ERR("Can't get command submission context.\n");
+		goto error;
+	}
+	/* bitstream buffers start at 2 bytes per pixel; radeon_dec_decode_bitstream grows them on demand */
+	bs_buf_size = width * height * (512 / (16 * 16));
+	for (i = 0; i < NUM_BUFFERS; ++i) {
+		unsigned msg_fb_it_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE;
+		if (have_it(dec))
+			msg_fb_it_size += IT_SCALING_TABLE_SIZE;
+		if (!rvid_create_buffer(dec->screen, &dec->msg_fb_it_buffers[i],
+					msg_fb_it_size, PIPE_USAGE_STAGING)) {
+			RVID_ERR("Can't allocated message buffers.\n");
+			goto error;
+		}
+
+		if (!rvid_create_buffer(dec->screen, &dec->bs_buffers[i],
+					bs_buf_size, PIPE_USAGE_STAGING)) {
+			RVID_ERR("Can't allocated bitstream buffers.\n");
+			goto error;
+		}
+
+		rvid_clear_buffer(context, &dec->msg_fb_it_buffers[i]);
+		rvid_clear_buffer(context, &dec->bs_buffers[i]);
+	}
+
+	dpb_size = calc_dpb_size(dec);
+
+	if (!rvid_create_buffer(dec->screen, &dec->dpb, dpb_size, PIPE_USAGE_DEFAULT)) {
+		RVID_ERR("Can't allocated dpb.\n");
+		goto error;
+	}
+
+	rvid_clear_buffer(context, &dec->dpb);
+	/* H.264 gets its context buffer here; HEVC allocates it in rvcn_dec_message_decode */
+	if (dec->stream_type == RDECODE_CODEC_H264_PERF) {
+		unsigned ctx_size = calc_ctx_size_h264_perf(dec);
+		if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) {
+			RVID_ERR("Can't allocated context buffer.\n");
+			goto error;
+		}
+		rvid_clear_buffer(context, &dec->ctx);
+	}
+
+	if (!rvid_create_buffer(dec->screen, &dec->sessionctx,
+				RDECODE_SESSION_CONTEXT_SIZE,
+				PIPE_USAGE_DEFAULT)) {
+		RVID_ERR("Can't allocated session ctx.\n");
+		goto error;
+	}
+	rvid_clear_buffer(context, &dec->sessionctx);
+	/* send the create message; a failing flush aborts the creation */
+	map_msg_fb_it_buf(dec);
+	rvcn_dec_message_create(dec);
+	send_msg_buf(dec);
+	r = flush(dec, 0);
+	if (r)
+		goto error;
+
+	next_buffer(dec);
+
+	return &dec->base;
+
+error:
+	if (dec->cs) dec->ws->cs_destroy(dec->cs);
+
+	for (i = 0; i < NUM_BUFFERS; ++i) {
+		rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]);
+		rvid_destroy_buffer(&dec->bs_buffers[i]);
+	}
+
+	rvid_destroy_buffer(&dec->dpb);
+	rvid_destroy_buffer(&dec->ctx);
+	rvid_destroy_buffer(&dec->sessionctx);
+
+	FREE(dec);
+
+	return NULL;
+}
diff --git a/src/gallium/drivers/radeon/radeon_vcn_dec.h b/src/gallium/drivers/radeon/radeon_vcn_dec.h
new file mode 100644
index 0000000..accffef
--- /dev/null
+++ b/src/gallium/drivers/radeon/radeon_vcn_dec.h
@@ -0,0 +1,506 @@
+/**************************************************************************
+ *
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef _RADEON_VCN_DEC_H
+#define _RADEON_VCN_DEC_H
+
+#define RDECODE_PKT_TYPE_S(x)			(((unsigned)(x)	& 0x3) << 30) /* set: low 2 bits of x into bits 31:30 */
+#define RDECODE_PKT_TYPE_G(x)			(((x) >> 30) & 0x3) /* get: bits 31:30 */
+#define RDECODE_PKT_TYPE_C			0x3FFFFFFF /* AND-mask keeping every bit except 31:30 */
+#define RDECODE_PKT_COUNT_S(x)			(((unsigned)(x) & 0x3FFF) << 16) /* set: low 14 bits of x into bits 29:16 */
+#define RDECODE_PKT_COUNT_G(x)			(((x) >> 16) & 0x3FFF) /* get: bits 29:16 */
+#define RDECODE_PKT_COUNT_C			0xC000FFFF /* AND-mask keeping every bit except 29:16 */
+#define RDECODE_PKT0_BASE_INDEX_S(x)		(((unsigned)(x)	& 0xFFFF) << 0) /* set: low 16 bits of x into bits 15:0 */
+#define RDECODE_PKT0_BASE_INDEX_G(x)		(((x) >> 0) & 0xFFFF) /* get: bits 15:0 */
+#define RDECODE_PKT0_BASE_INDEX_C		0xFFFF0000 /* AND-mask keeping every bit except 15:0 */
+#define RDECODE_PKT0(index, count)		(RDECODE_PKT_TYPE_S(0) | \
+						RDECODE_PKT0_BASE_INDEX_S(index) | \
+						RDECODE_PKT_COUNT_S(count)) /* header word: type 0 | base index | count */
+
+#define RDECODE_PKT2()				(RDECODE_PKT_TYPE_S(2)) /* header word with only the type field set, to 2 */
+
+#define RDECODE_CMD_MSG_BUFFER				0x00000000 /* RDECODE_CMD_*: NOTE(review): presumably ids for the buffer kinds passed to the decoder - confirm */
+#define RDECODE_CMD_DPB_BUFFER				0x00000001
+#define RDECODE_CMD_DECODING_TARGET_BUFFER		0x00000002
+#define RDECODE_CMD_FEEDBACK_BUFFER			0x00000003
+#define RDECODE_CMD_SESSION_CONTEXT_BUFFER		0x00000005
+#define RDECODE_CMD_BITSTREAM_BUFFER			0x00000100
+#define RDECODE_CMD_IT_SCALING_TABLE_BUFFER		0x00000204
+#define RDECODE_CMD_CONTEXT_BUFFER			0x00000206
+
+#define RDECODE_MSG_CREATE				0x00000000 /* RDECODE_MSG_*: numbered separately from RDECODE_MESSAGE_* below (CREATE is 0 here, 1 there) */
+#define RDECODE_MSG_DECODE				0x00000001
+#define RDECODE_MSG_DESTROY				0x00000002
+
+#define RDECODE_CODEC_H264				0x00000000 /* RDECODE_CODEC_*: values the decoder's stream_type is compared against */
+#define RDECODE_CODEC_VC1				0x00000001
+#define RDECODE_CODEC_MPEG2_VLD 			0x00000003
+#define RDECODE_CODEC_MPEG4				0x00000004
+#define RDECODE_CODEC_H264_PERF 			0x00000007
+#define RDECODE_CODEC_H265				0x00000010
+
+#define RDECODE_ARRAY_MODE_LINEAR			0x00000000
+#define RDECODE_ARRAY_MODE_MACRO_LINEAR_MICRO_TILED	0x00000001
+#define RDECODE_ARRAY_MODE_1D_THIN			0x00000002
+#define RDECODE_ARRAY_MODE_2D_THIN			0x00000004 /* same value as RDECODE_ARRAY_MODE_MACRO_TILED_MICRO_LINEAR */
+#define RDECODE_ARRAY_MODE_MACRO_TILED_MICRO_LINEAR	0x00000004
+#define RDECODE_ARRAY_MODE_MACRO_TILED_MICRO_TILED	0x00000005
+
+#define RDECODE_H264_PROFILE_BASELINE			0x00000000
+#define RDECODE_H264_PROFILE_MAIN			0x00000001
+#define RDECODE_H264_PROFILE_HIGH			0x00000002
+#define RDECODE_H264_PROFILE_STEREO_HIGH		0x00000003
+#define RDECODE_H264_PROFILE_MVC			0x00000004
+
+#define RDECODE_VC1_PROFILE_SIMPLE			0x00000000
+#define RDECODE_VC1_PROFILE_MAIN			0x00000001
+#define RDECODE_VC1_PROFILE_ADVANCED			0x00000002
+
+#define RDECODE_SW_MODE_LINEAR				0x00000000 /* NOTE(review): this group presumably feeds the *_swizzle_mode fields of rvcn_dec_message_decode_t - confirm */
+#define RDECODE_256B_S					0x00000001
+#define RDECODE_256B_D					0x00000002
+#define RDECODE_4KB_S					0x00000005
+#define RDECODE_4KB_D					0x00000006
+#define RDECODE_64KB_S					0x00000009
+#define RDECODE_64KB_D					0x0000000A
+#define RDECODE_4KB_S_X 				0x00000015
+#define RDECODE_4KB_D_X 				0x00000016
+#define RDECODE_64KB_S_X				0x00000019
+#define RDECODE_64KB_D_X				0x0000001A
+
+#define RDECODE_MESSAGE_NOT_SUPPORTED			0x00000000 /* RDECODE_MESSAGE_*: NOTE(review): presumably values for rvcn_dec_message_index_t.message_id - confirm */
+#define RDECODE_MESSAGE_CREATE				0x00000001
+#define RDECODE_MESSAGE_DECODE				0x00000002
+#define RDECODE_MESSAGE_AVC				0x00000006
+#define RDECODE_MESSAGE_VC1				0x00000007
+#define RDECODE_MESSAGE_MPEG2_VLD			0x0000000A
+#define RDECODE_MESSAGE_MPEG4_ASP_VLD			0x0000000B
+#define RDECODE_MESSAGE_HEVC				0x0000000D
+
+#define RDECODE_FEEDBACK_PROFILING			0x00000001
+
+#define RDECODE_SPS_INFO_H264_EXTENSION_SUPPORT_FLAG_SHIFT	7 /* a bit position; presumably within rvcn_dec_message_avc_t.sps_info_flags - confirm */
+
+typedef struct rvcn_dec_message_index_s { /* element type of rvcn_dec_message_header_t.index[]: four unsigned ints */
+	unsigned int	message_id; /* NOTE(review): presumably one of RDECODE_MESSAGE_* - confirm */
+	unsigned int	offset; /* NOTE(review): base and units are not visible in this header */
+	unsigned int	size;
+	unsigned int	filled;
+} rvcn_dec_message_index_t;
+
+typedef struct rvcn_dec_message_header_s { /* six unsigned ints followed by an array of index entries */
+	unsigned int	header_size;
+	unsigned int	total_size;
+	unsigned int	num_buffers;
+	unsigned int	msg_type; /* NOTE(review): presumably one of RDECODE_MSG_* - confirm */
+	unsigned int	stream_handle;
+	unsigned int	status_report_feedback_number;
+
+	rvcn_dec_message_index_t	index[1]; /* declared with one entry; NOTE(review): num_buffers presumably gives the real count - confirm */
+} rvcn_dec_message_header_t;
+
+typedef struct rvcn_dec_message_create_s { /* NOTE(review): presumably the payload indexed as RDECODE_MESSAGE_CREATE - confirm */
+	unsigned int	stream_type; /* NOTE(review): presumably one of RDECODE_CODEC_* - confirm */
+	unsigned int	session_flags;
+	unsigned int	width_in_samples;
+	unsigned int	height_in_samples;
+} rvcn_dec_message_create_t;
+
+typedef struct rvcn_dec_message_decode_s { /* starts with the same four fields as rvcn_dec_message_create_t, except decode_flags replaces session_flags */
+	unsigned int	stream_type; /* NOTE(review): presumably one of RDECODE_CODEC_* - confirm */
+	unsigned int	decode_flags;
+	unsigned int	width_in_samples;
+	unsigned int	height_in_samples;
+
+	unsigned int	bsd_size; /* the *_size fields below: NOTE(review): units are not visible in this header */
+	unsigned int	dpb_size;
+	unsigned int	dt_size;
+	unsigned int	sct_size;
+	unsigned int	sc_coeff_size;
+	unsigned int	hw_ctxt_size;
+	unsigned int	sw_ctxt_size;
+	unsigned int	pic_param_size;
+	unsigned int	mb_cntl_size;
+	unsigned int	reserved0[4];
+	unsigned int	decode_buffer_flags;
+
+	unsigned int	db_pitch; /* db_* group: seven fields sharing one prefix */
+	unsigned int	db_aligned_height;
+	unsigned int	db_tiling_mode;
+	unsigned int	db_swizzle_mode;
+	unsigned int	db_array_mode;
+	unsigned int	db_field_mode;
+	unsigned int	db_surf_tile_config;
+
+	unsigned int	dt_pitch; /* dt_* group: fifteen fields sharing one prefix */
+	unsigned int	dt_uv_pitch;
+	unsigned int	dt_tiling_mode;
+	unsigned int	dt_swizzle_mode;
+	unsigned int	dt_array_mode;
+	unsigned int	dt_field_mode;
+	unsigned int	dt_out_format;
+	unsigned int	dt_surf_tile_config;
+	unsigned int	dt_uv_surf_tile_config;
+	unsigned int	dt_luma_top_offset;
+	unsigned int	dt_luma_bottom_offset;
+	unsigned int	dt_chroma_top_offset;
+	unsigned int	dt_chroma_bottom_offset;
+	unsigned int	dt_chromaV_top_offset;
+	unsigned int	dt_chromaV_bottom_offset;
+
+	unsigned char	dpbRefArraySlice[16];
+	unsigned char	dpbCurArraySlice;
+	unsigned char	dpbReserved[3]; /* with the 17 bytes above this makes 20 bytes, a multiple of 4 */
+} rvcn_dec_message_decode_t;
+
+typedef struct { /* 66 unsigned shorts: two leading fields, then four (count, 15-entry id list) pairs */
+	unsigned short	viewOrderIndex;
+	unsigned short	viewId;
+	unsigned short	numOfAnchorRefsInL0; /* NOTE(review): presumably the number of valid entries in the list that follows - confirm */
+	unsigned short	viewIdOfAnchorRefsInL0[15];
+	unsigned short	numOfAnchorRefsInL1;
+	unsigned short	viewIdOfAnchorRefsInL1[15];
+	unsigned short	numOfNonAnchorRefsInL0;
+	unsigned short	viewIdOfNonAnchorRefsInL0[15];
+	unsigned short	numOfNonAnchorRefsInL1;
+	unsigned short	viewIdOfNonAnchorRefsInL1[15];
+} radeon_mvcElement_t; /* used as the element type of rvcn_dec_message_avc_t.mvc.mvcElements */
+
+typedef struct rvcn_dec_message_avc_s { /* NOTE(review): presumably the payload indexed as RDECODE_MESSAGE_AVC - confirm */
+	unsigned int	profile; /* NOTE(review): presumably one of RDECODE_H264_PROFILE_* - confirm */
+	unsigned int	level;
+
+	unsigned int	sps_info_flags;
+	unsigned int	pps_info_flags;
+	unsigned char	chroma_format; /* byte-sized fields below are declared in groups of four */
+	unsigned char	bit_depth_luma_minus8;
+	unsigned char	bit_depth_chroma_minus8;
+	unsigned char	log2_max_frame_num_minus4;
+
+	unsigned char	pic_order_cnt_type;
+	unsigned char	log2_max_pic_order_cnt_lsb_minus4;
+	unsigned char	num_ref_frames;
+	unsigned char	reserved_8bit; /* fourth byte of this group */
+
+	signed char	pic_init_qp_minus26;
+	signed char	pic_init_qs_minus26;
+	signed char	chroma_qp_index_offset;
+	signed char	second_chroma_qp_index_offset;
+
+	unsigned char	num_slice_groups_minus1;
+	unsigned char	slice_group_map_type;
+	unsigned char	num_ref_idx_l0_active_minus1;
+	unsigned char	num_ref_idx_l1_active_minus1;
+
+	unsigned short	slice_group_change_rate_minus1;
+	unsigned short	reserved_16bit_1; /* second short of this two-short group */
+
+	unsigned char	scaling_list_4x4[6][16]; /* 96 bytes */
+	unsigned char	scaling_list_8x8[2][64]; /* 128 bytes */
+
+	unsigned int	frame_num;
+	unsigned int	frame_num_list[16];
+	int		curr_field_order_cnt_list[2];
+	int		field_order_cnt_list[16][2];
+
+	unsigned int	decoded_pic_idx;
+	unsigned int	curr_pic_ref_frame_num;
+	unsigned char	ref_frame_list[16];
+
+	unsigned int	reserved[122];
+
+	struct {
+		unsigned int	numViews;
+		unsigned int	viewId0;
+		radeon_mvcElement_t	mvcElements[1]; /* declared with one element; NOTE(review): relation to numViews is not visible here */
+	} mvc;
+
+} rvcn_dec_message_avc_t;
+
+typedef struct rvcn_dec_message_vc1_s { /* NOTE(review): presumably the payload indexed as RDECODE_MESSAGE_VC1 - confirm */
+	unsigned int	profile; /* NOTE(review): presumably one of RDECODE_VC1_PROFILE_* - confirm */
+	unsigned int	level;
+	unsigned int	sps_info_flags;
+	unsigned int	pps_info_flags;
+	unsigned int	pic_structure;
+	unsigned int	chroma_format;
+	unsigned short	decoded_pic_idx; /* the four *_idx fields are 16-bit, unlike the 32-bit indices in the MPEG-2/MPEG-4 structs */
+	unsigned short	deblocked_pic_idx;
+	unsigned short	forward_ref_idx;
+	unsigned short	backward_ref_idx;
+	unsigned int	cached_frame_flag;
+} rvcn_dec_message_vc1_t;
+
+typedef struct rvcn_dec_message_mpeg2_vld_s { /* NOTE(review): presumably the payload indexed as RDECODE_MESSAGE_MPEG2_VLD - confirm */
+	unsigned int	decoded_pic_idx;
+	unsigned int	forward_ref_pic_idx;
+	unsigned int	backward_ref_pic_idx;
+
+	unsigned char	load_intra_quantiser_matrix;
+	unsigned char	load_nonintra_quantiser_matrix;
+	unsigned char	reserved_quantiser_alignement[2]; /* with the two load_* bytes above: a 4-byte group ahead of the matrices */
+	unsigned char	intra_quantiser_matrix[64];
+	unsigned char	nonintra_quantiser_matrix[64];
+
+	unsigned char	profile_and_level_indication;
+	unsigned char	chroma_format;
+
+	unsigned char	picture_coding_type;
+
+	unsigned char	reserved_1; /* fourth byte after the three single-byte fields above */
+
+	unsigned char	f_code[2][2];
+	unsigned char	intra_dc_precision;
+	unsigned char	pic_structure;
+	unsigned char	top_field_first;
+	unsigned char	frame_pred_frame_dct;
+	unsigned char	concealment_motion_vectors;
+	unsigned char	q_scale_type;
+	unsigned char	intra_vlc_format;
+	unsigned char	alternate_scan;
+} rvcn_dec_message_mpeg2_vld_t;
+
+typedef struct rvcn_dec_message_mpeg4_asp_vld_s { /* NOTE(review): presumably the payload indexed as RDECODE_MESSAGE_MPEG4_ASP_VLD - confirm */
+	unsigned int	decoded_pic_idx;
+	unsigned int	forward_ref_pic_idx;
+	unsigned int	backward_ref_pic_idx;
+
+	unsigned int	variant_type;
+	unsigned char	profile_and_level_indication;
+
+	unsigned char	video_object_layer_verid;
+	unsigned char	video_object_layer_shape;
+
+	unsigned char	reserved_1; /* fourth byte after the three single-byte fields above */
+
+	unsigned short	video_object_layer_width;
+	unsigned short	video_object_layer_height;
+
+	unsigned short	vop_time_increment_resolution;
+
+	unsigned short	reserved_2; /* second short of this two-short group */
+
+	struct {
+		unsigned int	short_video_header :1;
+		unsigned int	obmc_disable :1;
+		unsigned int	interlaced :1;
+		unsigned int	load_intra_quant_mat :1;
+		unsigned int	load_nonintra_quant_mat :1;
+		unsigned int	quarter_sample :1;
+		unsigned int	complexity_estimation_disable :1;
+		unsigned int	resync_marker_disable :1;
+		unsigned int	data_partitioned :1;
+		unsigned int	reversible_vlc :1;
+		unsigned int	newpred_enable :1;
+		unsigned int	reduced_resolution_vop_enable :1;
+		unsigned int	scalability :1;
+		unsigned int	is_object_layer_identifier :1;
+		unsigned int	fixed_vop_rate :1;
+		unsigned int	newpred_segment_type :1;
+		unsigned int	reserved_bits :16;
+	}; /* anonymous member: 16 one-bit flags + 16 reserved bits = 32 bits */
+
+	unsigned char	quant_type;
+	unsigned char	reserved_3[3]; /* with quant_type: a 4-byte group ahead of the matrices */
+	unsigned char	intra_quant_mat[64];
+	unsigned char	nonintra_quant_mat[64];
+
+	struct {
+		unsigned char	sprite_enable;
+
+		unsigned char	reserved_4[3]; /* with sprite_enable: a 4-byte group */
+
+		unsigned short	sprite_width;
+		unsigned short	sprite_height;
+		short		sprite_left_coordinate; /* the two coordinates are signed, unlike width/height */
+		short		sprite_top_coordinate;
+
+		unsigned char	no_of_sprite_warping_points;
+		unsigned char	sprite_warping_accuracy;
+		unsigned char	sprite_brightness_change;
+		unsigned char	low_latency_sprite_enable;
+	} sprite_config;
+
+	struct {
+		struct {
+			unsigned int	check_skip :1;
+			unsigned int	switch_rounding :1;
+			unsigned int	t311 :1;
+			unsigned int	reserved_bits :29;
+		}; /* anonymous member: 3 one-bit flags + 29 reserved bits = 32 bits */
+
+		unsigned char	vol_mode;
+
+		unsigned char	reserved_5[3]; /* with vol_mode: a 4-byte group */
+	} divx_311_config;
+
+	struct {
+		unsigned char	vop_data_present;
+		unsigned char	vop_coding_type;
+		unsigned char	vop_quant;
+		unsigned char	vop_coded;
+		unsigned char	vop_rounding_type;
+		unsigned char	intra_dc_vlc_thr;
+		unsigned char	top_field_first;
+		unsigned char	alternate_vertical_scan_flag;
+		unsigned char	vop_fcode_forward;
+		unsigned char	vop_fcode_backward; /* tenth single-byte field; NOTE(review): no explicit reserved bytes before TRB, so any padding is compiler-inserted */
+		unsigned int	TRB[2];
+		unsigned int	TRD[2];
+	} vop;
+
+} rvcn_dec_message_mpeg4_asp_vld_t;
+
+typedef struct rvcn_dec_message_hevc_s { /* NOTE(review): presumably the payload indexed as RDECODE_MESSAGE_HEVC - confirm */
+	unsigned int	sps_info_flags;
+	unsigned int	pps_info_flags;
+	unsigned char	chroma_format; /* byte-sized fields below are declared in groups of four */
+	unsigned char	bit_depth_luma_minus8;
+	unsigned char	bit_depth_chroma_minus8;
+	unsigned char	log2_max_pic_order_cnt_lsb_minus4;
+
+	unsigned char	sps_max_dec_pic_buffering_minus1;
+	unsigned char	log2_min_luma_coding_block_size_minus3;
+	unsigned char	log2_diff_max_min_luma_coding_block_size;
+	unsigned char	log2_min_transform_block_size_minus2;
+
+	unsigned char	log2_diff_max_min_transform_block_size;
+	unsigned char	max_transform_hierarchy_depth_inter;
+	unsigned char	max_transform_hierarchy_depth_intra;
+	unsigned char	pcm_sample_bit_depth_luma_minus1;
+
+	unsigned char	pcm_sample_bit_depth_chroma_minus1;
+	unsigned char	log2_min_pcm_luma_coding_block_size_minus3;
+	unsigned char	log2_diff_max_min_pcm_luma_coding_block_size;
+	unsigned char	num_extra_slice_header_bits;
+
+	unsigned char	num_short_term_ref_pic_sets;
+	unsigned char	num_long_term_ref_pic_sps;
+	unsigned char	num_ref_idx_l0_default_active_minus1;
+	unsigned char	num_ref_idx_l1_default_active_minus1;
+
+	signed char	pps_cb_qp_offset;
+	signed char	pps_cr_qp_offset;
+	signed char	pps_beta_offset_div2;
+	signed char	pps_tc_offset_div2;
+
+	unsigned char	diff_cu_qp_delta_depth;
+	unsigned char	num_tile_columns_minus1;
+	unsigned char	num_tile_rows_minus1;
+	unsigned char	log2_parallel_merge_level_minus2;
+
+	unsigned short	column_width_minus1[19];
+	unsigned short	row_height_minus1[21]; /* 19 + 21 = 40 unsigned shorts across the two arrays */
+
+	signed char	init_qp_minus26;
+	unsigned char	num_delta_pocs_ref_rps_idx;
+	unsigned char	curr_idx;
+	unsigned char	reserved[1]; /* fourth byte of this group, ahead of the int curr_poc */
+	int		curr_poc;
+	unsigned char	ref_pic_list[16];
+	int		poc_list[16];
+	unsigned char	ref_pic_set_st_curr_before[8];
+	unsigned char	ref_pic_set_st_curr_after[8];
+	unsigned char	ref_pic_set_lt_curr[8];
+
+	unsigned char	ucScalingListDCCoefSizeID2[6];
+	unsigned char	ucScalingListDCCoefSizeID3[2];
+
+	unsigned char	highestTid;
+	unsigned char	isNonRef;
+
+	unsigned char	p010_mode;
+	unsigned char	msb_mode;
+	unsigned char	luma_10to8;
+	unsigned char	chroma_10to8;
+
+	unsigned char	hevc_reserved[2];
+
+	unsigned char	direct_reflist[2][15]; /* 30 bytes; last member of the struct */
+} rvcn_dec_message_hevc_t;
+
+typedef struct rvcn_dec_feature_index_s { /* element type of rvcn_dec_feedback_header_t.index[]; same four-field shape as rvcn_dec_message_index_t */
+	unsigned int	feature_id; /* takes the place of message_id; NOTE(review): presumably RDECODE_FEEDBACK_* - confirm */
+	unsigned int	offset;
+	unsigned int	size;
+	unsigned int	filled;
+} rvcn_dec_feature_index_t;
+
+typedef struct rvcn_dec_feedback_header_s { /* seven unsigned ints followed by an array of feature index entries */
+	unsigned int	header_size;
+	unsigned int	total_size;
+	unsigned int	num_buffers;
+	unsigned int	status_report_feedback_number; /* same field name as in rvcn_dec_message_header_t */
+	unsigned int	status;
+	unsigned int	value;
+	unsigned int	errorBits;
+	rvcn_dec_feature_index_t	index[1]; /* declared with one entry; NOTE(review): num_buffers presumably gives the real count - confirm */
+} rvcn_dec_feedback_header_t;
+
+typedef struct rvcn_dec_feedback_profiling_s { /* NOTE(review): presumably the feedback payload identified by RDECODE_FEEDBACK_PROFILING - confirm */
+	unsigned int	size; /* NOTE(review): presumably the size of this structure - confirm */
+
+	unsigned int	decodingTime; /* NOTE(review): units of the timing and counter fields are not visible in this header */
+	unsigned int	decodePlusOverhead;
+	unsigned int	masterTimerHits;
+	unsigned int	uvdLBSIREWaitCount;
+
+	unsigned int	avgMPCMemLatency;
+	unsigned int	maxMPCMemLatency;
+	unsigned int	uvdMPCLumaHits;
+	unsigned int	uvdMPCLumaHitPend;
+	unsigned int	uvdMPCLumaSearch;
+	unsigned int	uvdMPCChromaHits;
+	unsigned int	uvdMPCChromaHitPend;
+	unsigned int	uvdMPCChromaSearch;
+
+	unsigned int	uvdLMIPerfCountLo;
+	unsigned int	uvdLMIPerfCountHi;
+	unsigned int	uvdLMIAvgLatCntrEnvHit;
+	unsigned int	uvdLMILatCntr;
+
+	unsigned int	frameCRC0;
+	unsigned int	frameCRC1;
+	unsigned int	frameCRC2;
+	unsigned int	frameCRC3;
+
+	unsigned int	uvdLMIPerfMonCtrl;
+	unsigned int	uvdLMILatCtrl;
+	unsigned int	uvdMPCCntl;
+	unsigned int	reserved0[4];
+	unsigned int	decoderID;
+	unsigned int	codec; /* NOTE(review): presumably one of RDECODE_CODEC_* - confirm */
+
+	unsigned int	dmaHwCrc32Enable;
+	unsigned int	dmaHwCrc32Value;
+	unsigned int	dmaHwCrc32Value2;
+} rvcn_dec_feedback_profiling_t;
+
+struct pipe_video_codec *radeon_create_decoder(struct pipe_context *context,
+		const struct pipe_video_codec *templat); /* NOTE(review): defined outside this header; presumably returns NULL on failure - confirm */
+
+#endif
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index c7ad7f7..2910b2d 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -182,8 +182,11 @@
 
 			for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.legacy.level); ++j)
 				surfaces[i]->u.legacy.level[j].offset += off;
-		} else
+		} else {
 			surfaces[i]->u.gfx9.surf_offset += off;
+			for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.gfx9.offset); ++j)
+				surfaces[i]->u.gfx9.offset[j] += off;
+		}
 
 		off += surfaces[i]->surf_size;
 	}
@@ -203,7 +206,8 @@
 	/* TODO: 2D tiling workaround */
 	alignment *= 2;
 
-	pb = ws->buffer_create(ws, size, alignment, RADEON_DOMAIN_VRAM, 0);
+	pb = ws->buffer_create(ws, size, alignment, RADEON_DOMAIN_VRAM,
+			       RADEON_FLAG_GTT_WC);
 	if (!pb)
 		return;
 
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 2e287c6..351edcd 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -28,7 +28,8 @@
 
 #include "pipebuffer/pb_buffer.h"
 
-#include "amd/common/amd_family.h"
+#include "amd/common/ac_gpu_info.h"
+#include "amd/common/ac_surface.h"
 
 #define RADEON_FLUSH_ASYNC		(1 << 0)
 #define RADEON_FLUSH_END_OF_FRAME       (1 << 1)
@@ -50,10 +51,9 @@
 
 enum radeon_bo_flag { /* bitfield */
     RADEON_FLAG_GTT_WC =        (1 << 0),
-    RADEON_FLAG_CPU_ACCESS =    (1 << 1),
-    RADEON_FLAG_NO_CPU_ACCESS = (1 << 2),
-    RADEON_FLAG_HANDLE =        (1 << 3), /* the buffer must not be suballocated */
-    RADEON_FLAG_SPARSE =        (1 << 4),
+    RADEON_FLAG_NO_CPU_ACCESS = (1 << 1),
+    RADEON_FLAG_NO_SUBALLOC =   (1 << 2),
+    RADEON_FLAG_SPARSE =        (1 << 3),
 };
 
 enum radeon_bo_usage { /* bitfield */
@@ -75,6 +75,8 @@
     RING_DMA,
     RING_UVD,
     RING_VCE,
+    RING_UVD_ENC,
+    RING_VCN_DEC,
     RING_LAST,
 };
 
@@ -88,8 +90,10 @@
     RADEON_TIMESTAMP,
     RADEON_NUM_GFX_IBS,
     RADEON_NUM_SDMA_IBS,
+    RADEON_GFX_BO_LIST_COUNTER, /* number of BOs submitted in gfx IBs */
     RADEON_NUM_BYTES_MOVED,
     RADEON_NUM_EVICTIONS,
+    RADEON_NUM_VRAM_CPU_PAGE_FAULTS,
     RADEON_VRAM_USAGE,
     RADEON_VRAM_VIS_USAGE,
     RADEON_GTT_USAGE,
@@ -175,66 +179,6 @@
     uint64_t                      used_gart;
 };
 
-struct radeon_info {
-    /* PCI info: domain:bus:dev:func */
-    uint32_t                    pci_domain;
-    uint32_t                    pci_bus;
-    uint32_t                    pci_dev;
-    uint32_t                    pci_func;
-
-    /* Device info. */
-    uint32_t                    pci_id;
-    enum radeon_family          family;
-    enum chip_class             chip_class;
-    uint32_t                    gart_page_size;
-    uint64_t                    gart_size;
-    uint64_t                    vram_size;
-    uint64_t                    vram_vis_size;
-    uint64_t                    max_alloc_size;
-    uint32_t                    min_alloc_size;
-    bool                        has_dedicated_vram;
-    bool                        has_virtual_memory;
-    bool                        gfx_ib_pad_with_type2;
-    bool                        has_sdma;
-    bool                        has_uvd;
-    uint32_t                    uvd_fw_version;
-    uint32_t                    vce_fw_version;
-    uint32_t                    me_fw_version;
-    uint32_t                    pfp_fw_version;
-    uint32_t                    ce_fw_version;
-    uint32_t                    vce_harvest_config;
-    uint32_t                    clock_crystal_freq;
-    uint32_t                    tcc_cache_line_size;
-
-    /* Kernel info. */
-    uint32_t                    drm_major; /* version */
-    uint32_t                    drm_minor;
-    uint32_t                    drm_patchlevel;
-    bool                        has_userptr;
-
-    /* Shader cores. */
-    uint32_t                    r600_max_quad_pipes; /* wave size / 16 */
-    uint32_t                    max_shader_clock;
-    uint32_t                    num_good_compute_units;
-    uint32_t                    max_se; /* shader engines */
-    uint32_t                    max_sh_per_se; /* shader arrays per shader engine */
-
-    /* Render backends (color + depth blocks). */
-    uint32_t                    r300_num_gb_pipes;
-    uint32_t                    r300_num_z_pipes;
-    uint32_t                    r600_gb_backend_map; /* R600 harvest config */
-    bool                        r600_gb_backend_map_valid;
-    uint32_t                    r600_num_banks;
-    uint32_t                    num_render_backends;
-    uint32_t                    num_tile_pipes; /* pipe count from PIPE_CONFIG */
-    uint32_t                    pipe_interleave_bytes;
-    uint32_t                    enabled_rb_mask; /* GCN harvest config */
-
-    /* Tile modes. */
-    uint32_t                    si_tile_mode_array[32];
-    uint32_t                    cik_macrotile_mode_array[16];
-};
-
 /* Tiling info for display code, DRI sharing, and other data. */
 struct radeon_bo_metadata {
     /* Tiling flags describing the texture layout for display code
@@ -273,153 +217,6 @@
     RADEON_FID_R300_CMASK_ACCESS,
 };
 
-#define RADEON_SURF_MAX_LEVELS                  15
-
-enum radeon_surf_mode {
-    RADEON_SURF_MODE_LINEAR_ALIGNED = 1,
-    RADEON_SURF_MODE_1D = 2,
-    RADEON_SURF_MODE_2D = 3,
-};
-
-/* These are defined exactly like GB_TILE_MODEn.MICRO_TILE_MODE_NEW. */
-enum radeon_micro_mode {
-    RADEON_MICRO_MODE_DISPLAY = 0,
-    RADEON_MICRO_MODE_THIN = 1,
-    RADEON_MICRO_MODE_DEPTH = 2,
-    RADEON_MICRO_MODE_ROTATED = 3,
-};
-
-/* the first 16 bits are reserved for libdrm_radeon, don't use them */
-#define RADEON_SURF_SCANOUT                     (1 << 16)
-#define RADEON_SURF_ZBUFFER                     (1 << 17)
-#define RADEON_SURF_SBUFFER                     (1 << 18)
-#define RADEON_SURF_Z_OR_SBUFFER                (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
-/* bits 19 and 20 are reserved for libdrm_radeon, don't use them */
-#define RADEON_SURF_FMASK                       (1 << 21)
-#define RADEON_SURF_DISABLE_DCC                 (1 << 22)
-#define RADEON_SURF_TC_COMPATIBLE_HTILE         (1 << 23)
-#define RADEON_SURF_IMPORTED                    (1 << 24)
-#define RADEON_SURF_OPTIMIZE_FOR_SPACE          (1 << 25)
-
-struct legacy_surf_level {
-    uint64_t                    offset;
-    uint64_t                    slice_size;
-    uint64_t                    dcc_offset;
-    uint64_t                    dcc_fast_clear_size;
-    uint16_t                    nblk_x;
-    uint16_t                    nblk_y;
-    enum radeon_surf_mode       mode;
-};
-
-struct legacy_surf_layout {
-    unsigned                    bankw:4;  /* max 8 */
-    unsigned                    bankh:4;  /* max 8 */
-    unsigned                    mtilea:4; /* max 8 */
-    unsigned                    tile_split:13;         /* max 4K */
-    unsigned                    stencil_tile_split:13; /* max 4K */
-    unsigned                    pipe_config:5;      /* max 17 */
-    unsigned                    num_banks:5;        /* max 16 */
-    unsigned                    macro_tile_index:4; /* max 15 */
-
-    /* Whether the depth miptree or stencil miptree as used by the DB are
-     * adjusted from their TC compatible form to ensure depth/stencil
-     * compatibility. If either is true, the corresponding plane cannot be
-     * sampled from.
-     */
-    unsigned                    depth_adjusted:1;
-    unsigned                    stencil_adjusted:1;
-
-    struct legacy_surf_level    level[RADEON_SURF_MAX_LEVELS];
-    struct legacy_surf_level    stencil_level[RADEON_SURF_MAX_LEVELS];
-    uint8_t                     tiling_index[RADEON_SURF_MAX_LEVELS];
-    uint8_t                     stencil_tiling_index[RADEON_SURF_MAX_LEVELS];
-};
-
-/* Same as addrlib - AddrResourceType. */
-enum gfx9_resource_type {
-    RADEON_RESOURCE_1D = 0,
-    RADEON_RESOURCE_2D,
-    RADEON_RESOURCE_3D,
-};
-
-struct gfx9_surf_flags {
-    uint16_t                    swizzle_mode; /* tile mode */
-    uint16_t                    epitch; /* (pitch - 1) or (height - 1) */
-};
-
-struct gfx9_surf_meta_flags {
-    unsigned                    rb_aligned:1;   /* optimal for RBs */
-    unsigned                    pipe_aligned:1; /* optimal for TC */
-};
-
-struct gfx9_surf_layout {
-    struct gfx9_surf_flags      surf;    /* color or depth surface */
-    struct gfx9_surf_flags      fmask;   /* not added to surf_size */
-    struct gfx9_surf_flags      stencil; /* added to surf_size, use stencil_offset */
-
-    struct gfx9_surf_meta_flags dcc;   /* metadata of color */
-    struct gfx9_surf_meta_flags htile; /* metadata of depth and stencil */
-    struct gfx9_surf_meta_flags cmask; /* metadata of fmask */
-
-    enum gfx9_resource_type     resource_type; /* 1D, 2D or 3D */
-    uint64_t                    surf_offset; /* 0 unless imported with an offset */
-    /* The size of the 2D plane containing all mipmap levels. */
-    uint64_t                    surf_slice_size;
-    uint16_t                    surf_pitch; /* in blocks */
-    uint16_t                    surf_height;
-    /* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */
-    uint32_t                    offset[RADEON_SURF_MAX_LEVELS];
-
-    uint16_t                    dcc_pitch_max;  /* (mip chain pitch - 1) */
-
-    uint64_t                    stencil_offset; /* separate stencil */
-    uint64_t                    fmask_size;
-    uint64_t                    cmask_size;
-
-    uint32_t                    fmask_alignment;
-    uint32_t                    cmask_alignment;
-};
-
-struct radeon_surf {
-    /* Format properties. */
-    unsigned                    blk_w:4;
-    unsigned                    blk_h:4;
-    unsigned                    bpe:5;
-    /* Number of mipmap levels where DCC is enabled starting from level 0.
-     * Non-zero levels may be disabled due to alignment constraints, but not
-     * the first level.
-     */
-    unsigned                    num_dcc_levels:4;
-    unsigned                    is_linear:1;
-    /* Displayable, thin, depth, rotated. AKA D,S,Z,R swizzle modes. */
-    unsigned                    micro_tile_mode:3;
-    uint32_t                    flags;
-
-    /* These are return values. Some of them can be set by the caller, but
-     * they will be treated as hints (e.g. bankw, bankh) and might be
-     * changed by the calculator.
-     */
-    uint64_t                    surf_size;
-    uint64_t                    dcc_size;
-    uint64_t                    htile_size;
-
-    uint32_t                    surf_alignment;
-    uint32_t                    dcc_alignment;
-    uint32_t                    htile_alignment;
-
-    union {
-        /* R600-VI return values.
-         *
-         * Some of them can be set by the caller if certain parameters are
-         * desirable. The allocator will try to obey them.
-         */
-        struct legacy_surf_layout legacy;
-
-        /* GFX9+ return values. */
-        struct gfx9_surf_layout gfx9;
-    } u;
-};
-
 struct radeon_bo_list_item {
     uint64_t bo_size;
     uint64_t vm_address;
@@ -563,6 +360,9 @@
      */
     bool (*buffer_is_user_ptr)(struct pb_buffer *buf);
 
+    /** Whether the buffer was suballocated. */
+    bool (*buffer_is_suballocated)(struct pb_buffer *buf);
+
     /**
      * Get a winsys handle from a winsys buffer. The internal structure
      * of the handle is platform-specific and only a winsys should access it.
@@ -840,6 +640,8 @@
 
     bool (*read_registers)(struct radeon_winsys *ws, unsigned reg_offset,
                            unsigned num_registers, uint32_t *out);
+
+    const char* (*get_chip_name)(struct radeon_winsys *ws);
 };
 
 static inline bool radeon_emitted(struct radeon_winsys_cs *cs, unsigned num_dw)
@@ -859,4 +661,95 @@
     cs->current.cdw += count;
 }
 
+enum radeon_heap { /* heap indices; domain/flags noted per entry are what the radeon_*_from_heap helpers return */
+    RADEON_HEAP_VRAM_NO_CPU_ACCESS, /* domain VRAM, flags GTT_WC | NO_CPU_ACCESS */
+    RADEON_HEAP_VRAM, /* domain VRAM, flags GTT_WC */
+    RADEON_HEAP_VRAM_GTT, /* combined heaps: domain VRAM_GTT, flags GTT_WC */
+    RADEON_HEAP_GTT_WC, /* domain GTT, flags GTT_WC */
+    RADEON_HEAP_GTT, /* domain GTT, no flags */
+    RADEON_MAX_SLAB_HEAPS, /* count of the heaps listed above, not a heap itself */
+    RADEON_MAX_CACHED_HEAPS = RADEON_MAX_SLAB_HEAPS, /* alias: same value as the slab heap count */
+};
+
+static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap heap)
+{
+    switch (heap) { /* translate a heap index into the buffer domain it uses */
+    case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
+    case RADEON_HEAP_VRAM:
+        return RADEON_DOMAIN_VRAM;
+    case RADEON_HEAP_VRAM_GTT:
+        return RADEON_DOMAIN_VRAM_GTT;
+    case RADEON_HEAP_GTT_WC:
+    case RADEON_HEAP_GTT:
+        return RADEON_DOMAIN_GTT;
+    default: /* any other value, including RADEON_MAX_SLAB_HEAPS, is treated as a caller error */
+        assert(0);
+        return (enum radeon_bo_domain)0; /* reached only if the assert does not abort */
+    }
+}
+
+static inline unsigned radeon_flags_from_heap(enum radeon_heap heap)
+{
+    switch (heap) { /* translate a heap index into its RADEON_FLAG_* bitmask */
+    case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
+        return RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS;
+    case RADEON_HEAP_VRAM: /* every heap that includes VRAM carries GTT_WC, as does GTT_WC itself */
+    case RADEON_HEAP_VRAM_GTT:
+    case RADEON_HEAP_GTT_WC:
+        return RADEON_FLAG_GTT_WC;
+    case RADEON_HEAP_GTT:
+    default: /* unlike radeon_domain_from_heap, unknown values do not assert; they get the GTT result */
+        return 0;
+    }
+}
+
+/* Map a heap to its pb_cache bucket; buckets are chosen to minimize pb_cache
+ * misses. The result must be between 0 and 3 inclusive (unknown heaps get 3).
+ */
+static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
+{
+    switch (heap) {
+    case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
+        return 0;
+    case RADEON_HEAP_VRAM: /* VRAM and VRAM_GTT share a bucket */
+    case RADEON_HEAP_VRAM_GTT:
+        return 1;
+    case RADEON_HEAP_GTT_WC:
+        return 2;
+    case RADEON_HEAP_GTT:
+    default: /* no assert here: out-of-range values share the GTT bucket */
+        return 3;
+    }
+}
+
+/* Return the enum radeon_heap index for (domain, flags) for winsys allocators, or -1 on failure. */
+static inline int radeon_get_heap_index(enum radeon_bo_domain domain,
+                                        enum radeon_bo_flag flags)
+{
+    /* VRAM implies WC (write combining) */
+    assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
+    /* NO_CPU_ACCESS implies VRAM only. */
+    assert(!(flags & RADEON_FLAG_NO_CPU_ACCESS) || domain == RADEON_DOMAIN_VRAM);
+
+    /* Unsupported flags: NO_SUBALLOC, SPARSE. */
+    if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS))
+        return -1; /* any flag bit other than GTT_WC / NO_CPU_ACCESS means no heap */
+
+    switch (domain) { /* only these three exact domain values map to a heap */
+    case RADEON_DOMAIN_VRAM:
+        if (flags & RADEON_FLAG_NO_CPU_ACCESS)
+            return RADEON_HEAP_VRAM_NO_CPU_ACCESS;
+        else
+            return RADEON_HEAP_VRAM;
+    case RADEON_DOMAIN_VRAM_GTT:
+        return RADEON_HEAP_VRAM_GTT;
+    case RADEON_DOMAIN_GTT:
+        if (flags & RADEON_FLAG_GTT_WC)
+            return RADEON_HEAP_GTT_WC;
+        else
+            return RADEON_HEAP_GTT;
+    }
+    return -1; /* domain matched none of the cases above */
+}
+
 #endif
diff --git a/src/gallium/drivers/radeonsi/Android.mk b/src/gallium/drivers/radeonsi/Android.mk
index b6f9e26..faf3880 100644
--- a/src/gallium/drivers/radeonsi/Android.mk
+++ b/src/gallium/drivers/radeonsi/Android.mk
@@ -38,8 +38,22 @@
 	$(MESA_TOP)/src/amd/common \
 	$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,)/common
 
+LOCAL_STATIC_LIBRARIES := libmesa_amd_common
+
 LOCAL_SHARED_LIBRARIES := libdrm_radeon
 LOCAL_MODULE := libmesa_pipe_radeonsi
 
+$(call mesa-build-with-llvm)
+
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_RADEONSI),)
+GALLIUM_TARGET_DRIVERS += radeonsi
+$(eval GALLIUM_LIBS += \
+	$(LOCAL_MODULE) \
+	$(LOCAL_STATIC_LIBRARIES) \
+	libmesa_winsys_radeon \
+	libmesa_winsys_amdgpu)
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 15ae977..626fe6f 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -2,6 +2,7 @@
 	cik_sdma.c \
 	si_blit.c \
 	si_compute.c \
+	si_compute.h \
 	si_cp_dma.c \
 	si_debug.c \
 	si_descriptors.c \
@@ -17,6 +18,7 @@
 	si_shader.h \
 	si_shader_internal.h \
 	si_shader_tgsi_alu.c \
+	si_shader_tgsi_mem.c \
 	si_shader_tgsi_setup.c \
 	si_state.c \
 	si_state_draw.c \
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 998288d..caa4c3c 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -22,6 +22,7 @@
  */
 
 #include "si_pipe.h"
+#include "si_compute.h"
 #include "util/u_format.h"
 #include "util/u_surface.h"
 
@@ -120,6 +121,8 @@
 
 	assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled);
 
+	sctx->decompression_enabled = true;
+
 	while (level_mask) {
 		unsigned level = u_bit_scan(&level_mask);
 
@@ -163,6 +166,7 @@
 			fully_copied_levels |= 1u << level;
 	}
 
+	sctx->decompression_enabled = false;
 	sctx->dbcb_depth_copy_enabled = false;
 	sctx->dbcb_stencil_copy_enabled = false;
 	si_mark_atom_dirty(sctx, &sctx->db_render_state);
@@ -218,6 +222,8 @@
 
 	surf_tmpl.format = texture->resource.b.b.format;
 
+	sctx->decompression_enabled = true;
+
 	while (level_mask) {
 		unsigned level = u_bit_scan(&level_mask);
 
@@ -255,6 +261,7 @@
 	if (planes & PIPE_MASK_S)
 		texture->stencil_dirty_level_mask &= ~fully_decompressed_mask;
 
+	sctx->decompression_enabled = false;
 	sctx->db_flush_depth_inplace = false;
 	sctx->db_flush_stencil_inplace = false;
 	si_mark_atom_dirty(sctx, &sctx->db_render_state);
@@ -298,11 +305,11 @@
 }
 
 static void
-si_flush_depth_texture(struct si_context *sctx,
-		       struct r600_texture *tex,
-		       unsigned required_planes,
-		       unsigned first_level, unsigned last_level,
-		       unsigned first_layer, unsigned last_layer)
+si_decompress_depth(struct si_context *sctx,
+		    struct r600_texture *tex,
+		    unsigned required_planes,
+		    unsigned first_level, unsigned last_level,
+		    unsigned first_layer, unsigned last_layer)
 {
 	unsigned inplace_planes = 0;
 	unsigned copy_planes = 0;
@@ -331,8 +338,6 @@
 		}
 	}
 
-	assert(!tex->tc_compatible_htile || levels_z == 0);
-
 	/* We may have to allocate the flushed texture here when called from
 	 * si_decompress_subresource.
 	 */
@@ -369,19 +374,47 @@
 	}
 
 	if (inplace_planes) {
-		si_blit_decompress_zs_in_place(
-			sctx, tex,
-			levels_z, levels_s,
-			first_layer, last_layer);
+		if (!tex->tc_compatible_htile) {
+			si_blit_decompress_zs_in_place(
+						sctx, tex,
+						levels_z, levels_s,
+						first_layer, last_layer);
+		}
+
+		/* Only in-place decompression needs to flush DB caches, or
+		 * when we don't decompress but TC-compatible planes are dirty.
+		 */
+		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
+				 SI_CONTEXT_INV_GLOBAL_L2 |
+				 SI_CONTEXT_INV_VMEM_L1;
+
+		/* If we flush DB caches for TC-compatible depth, the dirty
+		 * state becomes 0 for the whole mipmap tree and all planes.
+		 * (there is nothing else to flush)
+		 */
+		if (tex->tc_compatible_htile) {
+			if (r600_can_sample_zs(tex, false))
+				tex->dirty_level_mask = 0;
+			if (r600_can_sample_zs(tex, true))
+				tex->stencil_dirty_level_mask = 0;
+		}
+	}
+	/* set_framebuffer_state takes care of coherency for single-sample.
+	 * The DB->CB copy uses CB for the final writes.
+	 */
+	if (copy_planes && tex->resource.b.b.nr_samples > 1) {
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+				 SI_CONTEXT_INV_GLOBAL_L2 |
+				 SI_CONTEXT_FLUSH_AND_INV_CB;
 	}
 }
 
 static void
-si_flush_depth_textures(struct si_context *sctx,
-			struct si_textures_info *textures)
+si_decompress_sampler_depth_textures(struct si_context *sctx,
+				     struct si_textures_info *textures)
 {
 	unsigned i;
-	unsigned mask = textures->depth_texture_mask;
+	unsigned mask = textures->needs_depth_decompress_mask;
 
 	while (mask) {
 		struct pipe_sampler_view *view;
@@ -397,11 +430,10 @@
 		tex = (struct r600_texture *)view->texture;
 		assert(tex->db_compatible);
 
-		si_flush_depth_texture(
-				sctx, tex,
-				sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
-				view->u.tex.first_level, view->u.tex.last_level,
-				0, util_max_layer(&tex->resource.b.b, view->u.tex.first_level));
+		si_decompress_depth(sctx, tex,
+				    sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
+				    view->u.tex.first_level, view->u.tex.last_level,
+				    0, util_max_layer(&tex->resource.b.b, view->u.tex.first_level));
 	}
 }
 
@@ -431,11 +463,13 @@
 				level_mask &= ~(1 << i);
 		}
 	} else if (rtex->fmask.size) {
-		custom_blend = sctx->custom_blend_decompress;
+		custom_blend = sctx->custom_blend_fmask_decompress;
 	} else {
-		custom_blend = sctx->custom_blend_fastclear;
+		custom_blend = sctx->custom_blend_eliminate_fastclear;
 	}
 
+	sctx->decompression_enabled = true;
+
 	while (level_mask) {
 		unsigned level = u_bit_scan(&level_mask);
 
@@ -453,10 +487,19 @@
 			surf_tmpl.u.tex.last_layer = layer;
 			cbsurf = ctx->create_surface(ctx, &rtex->resource.b.b, &surf_tmpl);
 
+			/* Required before and after FMASK and DCC_DECOMPRESS. */
+			if (custom_blend == sctx->custom_blend_fmask_decompress ||
+			    custom_blend == sctx->custom_blend_dcc_decompress)
+				sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
 			si_blitter_begin(ctx, SI_DECOMPRESS);
 			util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend);
 			si_blitter_end(ctx);
 
+			if (custom_blend == sctx->custom_blend_fmask_decompress ||
+			    custom_blend == sctx->custom_blend_dcc_decompress)
+				sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
 			pipe_surface_reference(&cbsurf, NULL);
 		}
 
@@ -466,6 +509,12 @@
 			rtex->dirty_level_mask &= ~(1 << level);
 		}
 	}
+
+	sctx->decompression_enabled = false;
+
+	sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
+			 SI_CONTEXT_INV_VMEM_L1;
 }
 
 static void
@@ -486,7 +535,7 @@
 				     struct si_textures_info *textures)
 {
 	unsigned i;
-	unsigned mask = textures->compressed_colortex_mask;
+	unsigned mask = textures->needs_color_decompress_mask;
 
 	while (mask) {
 		struct pipe_sampler_view *view;
@@ -509,7 +558,7 @@
 				   struct si_images_info *images)
 {
 	unsigned i;
-	unsigned mask = images->compressed_colortex_mask;
+	unsigned mask = images->needs_color_decompress_mask;
 
 	while (mask) {
 		const struct pipe_image_view *view;
@@ -611,6 +660,48 @@
 	}
 }
 
+static void si_check_render_feedback_resident_textures(struct si_context *sctx)
+{
+	util_dynarray_foreach(&sctx->resident_tex_handles,
+			      struct si_texture_handle *, tex_handle) {
+		struct pipe_sampler_view *view;
+		struct r600_texture *tex;
+
+		view = (*tex_handle)->view;
+		if (view->texture->target == PIPE_BUFFER)
+			continue;
+
+		tex = (struct r600_texture *)view->texture;
+
+		si_check_render_feedback_texture(sctx, tex,
+						 view->u.tex.first_level,
+						 view->u.tex.last_level,
+						 view->u.tex.first_layer,
+						 view->u.tex.last_layer);
+	}
+}
+
+static void si_check_render_feedback_resident_images(struct si_context *sctx)
+{
+	util_dynarray_foreach(&sctx->resident_img_handles,
+			      struct si_image_handle *, img_handle) {
+		struct pipe_image_view *view;
+		struct r600_texture *tex;
+
+		view = &(*img_handle)->view;
+		if (view->resource->target == PIPE_BUFFER)
+			continue;
+
+		tex = (struct r600_texture *)view->resource;
+
+		si_check_render_feedback_texture(sctx, tex,
+						 view->u.tex.level,
+						 view->u.tex.level,
+						 view->u.tex.first_layer,
+						 view->u.tex.last_layer);
+	}
+}
+
 static void si_check_render_feedback(struct si_context *sctx)
 {
 
@@ -621,9 +712,49 @@
 		si_check_render_feedback_images(sctx, &sctx->images[i]);
 		si_check_render_feedback_textures(sctx, &sctx->samplers[i]);
 	}
+
+	si_check_render_feedback_resident_images(sctx);
+	si_check_render_feedback_resident_textures(sctx);
+
 	sctx->need_check_render_feedback = false;
 }
 
+static void si_decompress_resident_textures(struct si_context *sctx)
+{
+	util_dynarray_foreach(&sctx->resident_tex_needs_color_decompress,
+			      struct si_texture_handle *, tex_handle) {
+		struct pipe_sampler_view *view = (*tex_handle)->view;
+		struct r600_texture *tex = (struct r600_texture *)view->texture;
+
+		si_decompress_color_texture(sctx, tex, view->u.tex.first_level,
+					    view->u.tex.last_level);
+	}
+
+	util_dynarray_foreach(&sctx->resident_tex_needs_depth_decompress,
+			      struct si_texture_handle *, tex_handle) {
+		struct pipe_sampler_view *view = (*tex_handle)->view;
+		struct si_sampler_view *sview = (struct si_sampler_view *)view;
+		struct r600_texture *tex = (struct r600_texture *)view->texture;
+
+		si_decompress_depth(sctx, tex,
+			sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
+			view->u.tex.first_level, view->u.tex.last_level,
+			0, util_max_layer(&tex->resource.b.b, view->u.tex.first_level));
+	}
+}
+
+static void si_decompress_resident_images(struct si_context *sctx)
+{
+	util_dynarray_foreach(&sctx->resident_img_needs_color_decompress,
+			      struct si_image_handle *, img_handle) {
+		struct pipe_image_view *view = &(*img_handle)->view;
+		struct r600_texture *tex = (struct r600_texture *)view->resource;
+
+		si_decompress_color_texture(sctx, tex, view->u.tex.level,
+					    view->u.tex.level);
+	}
+}
+
 static void si_decompress_textures(struct si_context *sctx, unsigned shader_mask)
 {
 	unsigned compressed_colortex_counter, mask;
@@ -635,25 +766,37 @@
 	compressed_colortex_counter = p_atomic_read(&sctx->screen->b.compressed_colortex_counter);
 	if (compressed_colortex_counter != sctx->b.last_compressed_colortex_counter) {
 		sctx->b.last_compressed_colortex_counter = compressed_colortex_counter;
-		si_update_compressed_colortex_masks(sctx);
+		si_update_needs_color_decompress_masks(sctx);
 	}
 
 	/* Decompress color & depth textures if needed. */
-	mask = sctx->compressed_tex_shader_mask & shader_mask;
+	mask = sctx->shader_needs_decompress_mask & shader_mask;
 	while (mask) {
 		unsigned i = u_bit_scan(&mask);
 
-		if (sctx->samplers[i].depth_texture_mask) {
-			si_flush_depth_textures(sctx, &sctx->samplers[i]);
+		if (sctx->samplers[i].needs_depth_decompress_mask) {
+			si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]);
 		}
-		if (sctx->samplers[i].compressed_colortex_mask) {
+		if (sctx->samplers[i].needs_color_decompress_mask) {
 			si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]);
 		}
-		if (sctx->images[i].compressed_colortex_mask) {
+		if (sctx->images[i].needs_color_decompress_mask) {
 			si_decompress_image_color_textures(sctx, &sctx->images[i]);
 		}
 	}
 
+	if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) {
+		if (sctx->uses_bindless_samplers)
+			si_decompress_resident_textures(sctx);
+		if (sctx->uses_bindless_images)
+			si_decompress_resident_images(sctx);
+	} else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) {
+		if (sctx->cs_shader_state.program->uses_bindless_samplers)
+			si_decompress_resident_textures(sctx);
+		if (sctx->cs_shader_state.program->uses_bindless_images)
+			si_decompress_resident_images(sctx);
+	}
+
 	si_check_render_feedback(sctx);
 }
 
@@ -706,7 +849,7 @@
 		}
 	}
 
-	if (zstex && zstex->htile_buffer &&
+	if (zstex && zstex->htile_offset &&
 	    zsbuf->u.tex.level == 0 &&
 	    zsbuf->u.tex.first_layer == 0 &&
 	    zsbuf->u.tex.last_layer == util_max_layer(&zstex->resource.b.b, 0)) {
@@ -819,10 +962,32 @@
 		if (!(rtex->surface.flags & RADEON_SURF_SBUFFER))
 			planes &= ~PIPE_MASK_S;
 
-		si_flush_depth_texture(sctx, rtex, planes,
-				       level, level,
-				       first_layer, last_layer);
+		/* If we've rendered into the framebuffer and it's a blitting
+		 * source, make sure the decompression pass is invoked
+		 * by dirtying the framebuffer.
+		 */
+		if (sctx->framebuffer.state.zsbuf &&
+		    sctx->framebuffer.state.zsbuf->u.tex.level == level &&
+		    sctx->framebuffer.state.zsbuf->texture == tex)
+			si_update_fb_dirtiness_after_rendering(sctx);
+
+		si_decompress_depth(sctx, rtex, planes,
+				    level, level,
+				    first_layer, last_layer);
 	} else if (rtex->fmask.size || rtex->cmask.size || rtex->dcc_offset) {
+		/* If we've rendered into the framebuffer and it's a blitting
+		 * source, make sure the decompression pass is invoked
+		 * by dirtying the framebuffer.
+		 */
+		for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+			if (sctx->framebuffer.state.cbufs[i] &&
+			    sctx->framebuffer.state.cbufs[i]->u.tex.level == level &&
+			    sctx->framebuffer.state.cbufs[i]->texture == tex) {
+				si_update_fb_dirtiness_after_rendering(sctx);
+				break;
+			}
+		}
+
 		si_blit_decompress_color(ctx, rtex, level, level,
 					 first_layer, last_layer, false);
 	}
@@ -875,7 +1040,7 @@
 	src_height0 = src->height0;
 
 	util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
-	util_blitter_default_src_texture(&src_templ, src, src_level);
+	util_blitter_default_src_texture(sctx->blitter, &src_templ, src, src_level);
 
 	if (util_format_is_compressed(src->format) ||
 	    util_format_is_compressed(dst->format)) {
@@ -953,6 +1118,43 @@
 		}
 	}
 
+	/* SNORM8 blitting has precision issues on some chips. Use the SINT
+	 * equivalent instead, which doesn't force DCC decompression.
+	 * Note that some chips avoid this issue by using SDMA.
+	 */
+	if (util_format_is_snorm8(dst_templ.format)) {
+		switch (dst_templ.format) {
+		case PIPE_FORMAT_R8_SNORM:
+			dst_templ.format = src_templ.format = PIPE_FORMAT_R8_SINT;
+			break;
+		case PIPE_FORMAT_R8G8_SNORM:
+			dst_templ.format = src_templ.format = PIPE_FORMAT_R8G8_SINT;
+			break;
+		case PIPE_FORMAT_R8G8B8X8_SNORM:
+			dst_templ.format = src_templ.format = PIPE_FORMAT_R8G8B8X8_SINT;
+			break;
+		case PIPE_FORMAT_R8G8B8A8_SNORM:
+		/* There are no SINT variants for ABGR and XBGR, so we have to use RGBA. */
+		case PIPE_FORMAT_A8B8G8R8_SNORM:
+		case PIPE_FORMAT_X8B8G8R8_SNORM:
+			dst_templ.format = src_templ.format = PIPE_FORMAT_R8G8B8A8_SINT;
+			break;
+		case PIPE_FORMAT_A8_SNORM:
+			dst_templ.format = src_templ.format = PIPE_FORMAT_A8_SINT;
+			break;
+		case PIPE_FORMAT_L8_SNORM:
+			dst_templ.format = src_templ.format = PIPE_FORMAT_L8_SINT;
+			break;
+		case PIPE_FORMAT_L8A8_SNORM:
+			dst_templ.format = src_templ.format = PIPE_FORMAT_L8A8_SINT;
+			break;
+		case PIPE_FORMAT_I8_SNORM:
+			dst_templ.format = src_templ.format = PIPE_FORMAT_I8_SINT;
+			break;
+		default:; /* other SNORM8 formats are left unchanged */
+		}
+	}
+
 	vi_disable_dcc_if_incompatible_format(&sctx->b, dst, dst_level,
 					      dst_templ.format);
 	vi_disable_dcc_if_incompatible_format(&sctx->b, src, src_level,
@@ -983,6 +1185,29 @@
 	pipe_sampler_view_reference(&src_view, NULL);
 }
 
+static void si_do_CB_resolve(struct si_context *sctx,
+			     const struct pipe_blit_info *info,
+			     struct pipe_resource *dst,
+			     unsigned dst_level, unsigned dst_z,
+			     enum pipe_format format)
+{
+	/* Required before and after CB_RESOLVE. */
+	sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+	si_blitter_begin(&sctx->b.b, SI_COLOR_RESOLVE |
+			 (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
+	util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z,
+					  info->src.resource, info->src.box.z,
+					  ~0, sctx->custom_blend_resolve,
+					  format);
+	si_blitter_end(&sctx->b.b);
+
+	/* Flush caches for possible texturing. */
+	sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
+			 SI_CONTEXT_INV_VMEM_L1;
+}
+
 static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
 				     const struct pipe_blit_info *info)
 {
@@ -993,7 +1218,6 @@
 	unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level);
 	unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level);
 	enum pipe_format format = info->src.format;
-	unsigned sample_mask = ~0;
 	struct pipe_resource *tmp, templ;
 	struct pipe_blit_info blit;
 
@@ -1060,15 +1284,8 @@
 		}
 
 		/* Resolve directly from src to dst. */
-		si_blitter_begin(ctx, SI_COLOR_RESOLVE |
-				 (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
-		util_blitter_custom_resolve_color(sctx->blitter,
-						  info->dst.resource, info->dst.level,
-						  info->dst.box.z,
-						  info->src.resource, info->src.box.z,
-						  sample_mask, sctx->custom_blend_resolve,
-						  format);
-		si_blitter_end(ctx);
+		si_do_CB_resolve(sctx, info, info->dst.resource,
+				 info->dst.level, info->dst.box.z, format);
 		return true;
 	}
 
@@ -1102,13 +1319,7 @@
 	assert(src->surface.micro_tile_mode == rtmp->surface.micro_tile_mode);
 
 	/* resolve */
-	si_blitter_begin(ctx, SI_COLOR_RESOLVE |
-			 (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
-	util_blitter_custom_resolve_color(sctx->blitter, tmp, 0, 0,
-					  info->src.resource, info->src.box.z,
-					  sample_mask, sctx->custom_blend_resolve,
-					  format);
-	si_blitter_end(ctx);
+	si_do_CB_resolve(sctx, info, tmp, 0, 0, format);
 
 	/* blit */
 	blit = *info;
@@ -1200,11 +1411,15 @@
 	rtex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1,
 						     last_level - base_level);
 
+	sctx->generate_mipmap_for_depth = rtex->is_depth;
+
 	si_blitter_begin(ctx, SI_BLIT | SI_DISABLE_RENDER_COND);
 	util_blitter_generate_mipmap(sctx->blitter, tex, format,
 				     base_level, last_level,
 				     first_layer, last_layer);
 	si_blitter_end(ctx);
+
+	sctx->generate_mipmap_for_depth = false;
 	return true;
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 33ebe2e..bf44ae0 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -29,27 +29,9 @@
 #include "amd_kernel_code_t.h"
 #include "radeon/r600_cs.h"
 #include "si_pipe.h"
+#include "si_compute.h"
 #include "sid.h"
 
-#define MAX_GLOBAL_BUFFERS 22
-
-struct si_compute {
-	struct si_screen *screen;
-	struct tgsi_token *tokens;
-	struct util_queue_fence ready;
-	struct si_compiler_ctx_state compiler_ctx_state;
-
-	unsigned ir_type;
-	unsigned local_size;
-	unsigned private_size;
-	unsigned input_size;
-	struct si_shader shader;
-
-	struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
-	unsigned use_code_object_v2 : 1;
-	unsigned variable_group_size : 1;
-};
-
 struct dispatch_packet {
 	uint16_t header;
 	uint16_t setup;
@@ -113,18 +95,29 @@
 
 	memset(&sel, 0, sizeof(sel));
 
+	sel.screen = program->screen;
 	tgsi_scan_shader(program->tokens, &sel.info);
 	sel.tokens = program->tokens;
 	sel.type = PIPE_SHADER_COMPUTE;
 	sel.local_size = program->local_size;
+	si_get_active_slot_masks(&sel.info,
+				 &program->active_const_and_shader_buffers,
+				 &program->active_samplers_and_images);
 
 	program->shader.selector = &sel;
 	program->shader.is_monolithic = true;
+	program->uses_grid_size = sel.info.uses_grid_size;
+	program->uses_block_size = sel.info.uses_block_size;
+	program->uses_bindless_samplers = sel.info.uses_bindless_samplers;
+	program->uses_bindless_images = sel.info.uses_bindless_images;
 
 	if (si_shader_create(program->screen, tm, &program->shader, debug)) {
 		program->shader.compilation_failed = true;
 	} else {
 		bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
+		unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS +
+				      (sel.info.uses_grid_size ? 3 : 0) +
+				      (sel.info.uses_block_size ? 3 : 0);
 
 		shader->config.rsrc1 =
 			S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
@@ -133,10 +126,13 @@
 			S_00B848_FLOAT_MODE(shader->config.float_mode);
 
 		shader->config.rsrc2 =
-			S_00B84C_USER_SGPR(SI_CS_NUM_USER_SGPR) |
+			S_00B84C_USER_SGPR(user_sgprs) |
 			S_00B84C_SCRATCH_EN(scratch_enabled) |
-			S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) |
-			S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2) |
+			S_00B84C_TGID_X_EN(sel.info.uses_block_id[0]) |
+			S_00B84C_TGID_Y_EN(sel.info.uses_block_id[1]) |
+			S_00B84C_TGID_Z_EN(sel.info.uses_block_id[2]) |
+			S_00B84C_TIDIG_COMP_CNT(sel.info.uses_thread_id[2] ? 2 :
+						sel.info.uses_thread_id[1] ? 1 : 0) |
 			S_00B84C_LDS_SIZE(shader->config.lds_size);
 
 		program->variable_group_size =
@@ -214,7 +210,24 @@
 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	sctx->cs_shader_state.program = (struct si_compute*)state;
+	struct si_compute *program = (struct si_compute*)state;
+
+	sctx->cs_shader_state.program = program;
+	if (!program)
+		return;
+
+	/* Wait because we need active slot usage masks. */
+	if (program->ir_type == PIPE_SHADER_IR_TGSI)
+		util_queue_fence_wait(&program->ready);
+
+	si_set_active_descriptors(sctx,
+				  SI_DESCS_FIRST_COMPUTE +
+				  SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+				  program->active_const_and_shader_buffers);
+	si_set_active_descriptors(sctx,
+				  SI_DESCS_FIRST_COMPUTE +
+				  SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+				  program->active_samplers_and_images);
 }
 
 static void si_set_global_binding(
@@ -252,11 +265,6 @@
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint64_t bc_va;
 
-	radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
-	radeon_emit(cs, 0);
-	radeon_emit(cs, 0);
-	radeon_emit(cs, 0);
-
 	radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
 	/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
 	radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
@@ -331,7 +339,7 @@
 	if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
 		uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
 
-		si_shader_apply_scratch_relocs(sctx, shader, config, scratch_va);
+		si_shader_apply_scratch_relocs(shader, scratch_va);
 
 		if (si_shader_binary_upload(sctx->screen, shader))
 			return false;
@@ -650,36 +658,43 @@
 static void si_setup_tgsi_grid(struct si_context *sctx,
                                 const struct pipe_grid_info *info)
 {
+	struct si_compute *program = sctx->cs_shader_state.program;
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
-	                          4 * SI_SGPR_GRID_SIZE;
+				 4 * SI_NUM_RESOURCE_SGPRS;
+	unsigned block_size_reg = grid_size_reg +
+				  /* 12 bytes = 3 dwords. */
+				  12 * program->uses_grid_size;
 
 	if (info->indirect) {
-		uint64_t base_va = r600_resource(info->indirect)->gpu_address;
-		uint64_t va = base_va + info->indirect_offset;
-		int i;
+		if (program->uses_grid_size) {
+			uint64_t base_va = r600_resource(info->indirect)->gpu_address;
+			uint64_t va = base_va + info->indirect_offset;
+			int i;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-		                 (struct r600_resource *)info->indirect,
-		                 RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					 (struct r600_resource *)info->indirect,
+					 RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 
-		for (i = 0; i < 3; ++i) {
-			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
-					COPY_DATA_DST_SEL(COPY_DATA_REG));
-			radeon_emit(cs, (va +  4 * i));
-			radeon_emit(cs, (va + 4 * i) >> 32);
-			radeon_emit(cs, (grid_size_reg >> 2) + i);
-			radeon_emit(cs, 0);
+			for (i = 0; i < 3; ++i) {
+				radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+				radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+						COPY_DATA_DST_SEL(COPY_DATA_REG));
+				radeon_emit(cs, (va + 4 * i));
+				radeon_emit(cs, (va + 4 * i) >> 32);
+				radeon_emit(cs, (grid_size_reg >> 2) + i);
+				radeon_emit(cs, 0);
+			}
 		}
 	} else {
-		struct si_compute *program = sctx->cs_shader_state.program;
-
-		radeon_set_sh_reg_seq(cs, grid_size_reg, program->variable_group_size ? 6 : 3);
-		radeon_emit(cs, info->grid[0]);
-		radeon_emit(cs, info->grid[1]);
-		radeon_emit(cs, info->grid[2]);
-		if (program->variable_group_size) {
+		if (program->uses_grid_size) {
+			radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
+			radeon_emit(cs, info->grid[0]);
+			radeon_emit(cs, info->grid[1]);
+			radeon_emit(cs, info->grid[2]);
+		}
+		if (program->variable_group_size && program->uses_block_size) {
+			radeon_set_sh_reg_seq(cs, block_size_reg, 3);
 			radeon_emit(cs, info->block[0]);
 			radeon_emit(cs, info->block[1]);
 			radeon_emit(cs, info->block[2]);
@@ -703,6 +718,13 @@
 	radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
 	radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
 
+	unsigned dispatch_initiator =
+		S_00B800_COMPUTE_SHADER_EN(1) |
+		S_00B800_FORCE_START_AT_000(1) |
+		/* If the KMD allows it (there is a KMD hw register for it),
+		 * allow launching waves out-of-order. (same as Vulkan) */
+		S_00B800_ORDER_MODE(sctx->b.chip_class >= CIK);
+
 	if (info->indirect) {
 		uint64_t base_va = r600_resource(info->indirect)->gpu_address;
 
@@ -719,14 +741,14 @@
 		radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) |
 		                PKT3_SHADER_TYPE_S(1));
 		radeon_emit(cs, info->indirect_offset);
-		radeon_emit(cs, 1);
+		radeon_emit(cs, dispatch_initiator);
 	} else {
 		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) |
 		                PKT3_SHADER_TYPE_S(1));
 		radeon_emit(cs, info->grid[0]);
 		radeon_emit(cs, info->grid[1]);
 		radeon_emit(cs, info->grid[2]);
-		radeon_emit(cs, 1);
+		radeon_emit(cs, dispatch_initiator);
 	}
 }
 
@@ -755,11 +777,13 @@
 		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 				 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
-	if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-		util_queue_fence_wait(&program->ready);
+	if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+	    program->shader.compilation_failed)
+		return;
 
-		if (program->shader.compilation_failed)
-			return;
+	if (sctx->b.last_num_draw_calls != sctx->b.num_draw_calls) {
+		si_update_fb_dirtiness_after_rendering(sctx);
+		sctx->b.last_num_draw_calls = sctx->b.num_draw_calls;
 	}
 
 	si_decompress_compute_textures(sctx);
@@ -771,8 +795,9 @@
 	if (info->indirect) {
 		r600_context_add_resource_size(ctx, info->indirect);
 
-		/* The hw doesn't read the indirect buffer via TC L2. */
-		if (r600_resource(info->indirect)->TC_L2_dirty) {
+		/* Indirect buffers use TC L2 on GFX9, but not older hw. */
+		if (sctx->b.chip_class <= VI &&
+		    r600_resource(info->indirect)->TC_L2_dirty) {
 			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 			r600_resource(info->indirect)->TC_L2_dirty = false;
 		}
@@ -845,7 +870,8 @@
 	}
 
 	if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-		util_queue_fence_wait(&program->ready);
+		util_queue_drop_job(&sctx->screen->shader_compiler_queue,
+				    &program->ready);
 		util_queue_fence_destroy(&program->ready);
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_compute.h b/src/gallium/drivers/radeonsi/si_compute.h
new file mode 100644
index 0000000..268817b
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_compute.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SI_COMPUTE_H
+#define SI_COMPUTE_H
+
+#include "si_shader.h"
+
+#define MAX_GLOBAL_BUFFERS 22
+
+struct si_compute {
+	struct si_screen *screen;
+	struct tgsi_token *tokens;
+	struct util_queue_fence ready;
+	struct si_compiler_ctx_state compiler_ctx_state;
+
+	/* bitmasks of used descriptor slots */
+	uint32_t active_const_and_shader_buffers;
+	uint64_t active_samplers_and_images;
+
+	unsigned ir_type;
+	unsigned local_size;
+	unsigned private_size;
+	unsigned input_size;
+	struct si_shader shader;
+
+	struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
+	unsigned use_code_object_v2 : 1;
+	unsigned variable_group_size : 1;
+	unsigned uses_grid_size:1;
+	unsigned uses_block_size:1;
+	unsigned uses_bindless_samplers:1;
+	unsigned uses_bindless_images:1;
+};
+
+#endif /* SI_COMPUTE_H */
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 9505d62..e42f260 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -142,8 +142,11 @@
 
 static unsigned get_tc_l2_flag(struct si_context *sctx, enum r600_coherency coher)
 {
-	return coher == R600_COHERENCY_SHADER &&
-	       sctx->b.chip_class >= CIK ? CP_DMA_USE_L2 : 0;
+	if ((sctx->b.chip_class >= GFX9 && coher == R600_COHERENCY_CB_META) ||
+	    (sctx->b.chip_class >= CIK && coher == R600_COHERENCY_SHADER))
+		return CP_DMA_USE_L2;
+
+	return 0;
 }
 
 static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
@@ -212,7 +215,7 @@
 	if (!size)
 		return;
 
-	dma_clear_size = size & ~3llu;
+	dma_clear_size = size & ~3ull;
 
 	/* Mark the buffer range of destination as valid (initialized),
 	 * so that transfer_map knows it should wait for the GPU when mapping
diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index 9d0c0c5..06dea61 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -25,6 +25,7 @@
  */
 
 #include "si_pipe.h"
+#include "si_compute.h"
 #include "sid.h"
 #include "gfx9d.h"
 #include "sid_tables.h"
@@ -35,18 +36,33 @@
 DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL)
 
 static void si_dump_shader(struct si_screen *sscreen,
-			   struct si_shader_ctx_state *state, FILE *f)
+			   enum pipe_shader_type processor,
+			   const struct si_shader *shader, FILE *f)
 {
-	struct si_shader *current = state->current;
+	if (shader->shader_log)
+		fwrite(shader->shader_log, shader->shader_log_size, 1, f);
+	else
+		si_shader_dump(sscreen, shader, NULL, processor, f, false);
+}
+
+static void si_dump_gfx_shader(struct si_screen *sscreen,
+			       const struct si_shader_ctx_state *state, FILE *f)
+{
+	const struct si_shader *current = state->current;
 
 	if (!state->cso || !current)
 		return;
 
-	if (current->shader_log)
-		fwrite(current->shader_log, current->shader_log_size, 1, f);
-	else
-		si_shader_dump(sscreen, state->current, NULL,
-			       state->cso->info.processor, f, false);
+	si_dump_shader(sscreen, state->cso->info.processor, current, f);
+}
+
+static void si_dump_compute_shader(struct si_screen *sscreen,
+				   const struct si_cs_shader_state *state, FILE *f)
+{
+	if (!state->program || state->program != state->emitted_program)
+		return;
+
+	si_dump_shader(sscreen, PIPE_SHADER_COMPUTE, &state->program->shader, f);
 }
 
 /**
@@ -328,7 +344,7 @@
 
 		/* Print the usage. */
 		for (j = 0; j < 64; j++) {
-			if (!(saved->bo_list[i].priority_usage & (1llu << j)))
+			if (!(saved->bo_list[i].priority_usage & (1ull << j)))
 				continue;
 
 			fprintf(f, "%s%s", !hit ? "" : ", ", priority_to_string(j));
@@ -364,27 +380,29 @@
 	}
 }
 
+typedef unsigned (*slot_remap_func)(unsigned); /* maps a slot index to its element position in a descriptor list */
+
 static void si_dump_descriptor_list(struct si_descriptors *desc,
 				    const char *shader_name,
 				    const char *elem_name,
+				    unsigned element_dw_size,
 				    unsigned num_elements,
+				    slot_remap_func slot_remap,
 				    FILE *f)
 {
 	unsigned i, j;
-	uint32_t *cpu_list = desc->list;
-	uint32_t *gpu_list = desc->gpu_list;
-	const char *list_note = "GPU list";
-
-	if (!gpu_list) {
-		gpu_list = cpu_list;
-		list_note = "CPU list";
-	}
 
 	for (i = 0; i < num_elements; i++) {
+		unsigned dw_offset = slot_remap(i) * element_dw_size;
+		uint32_t *gpu_ptr = desc->gpu_list ? desc->gpu_list : desc->list;
+		const char *list_note = desc->gpu_list ? "GPU list" : "CPU list";
+		uint32_t *cpu_list = desc->list + dw_offset;
+		uint32_t *gpu_list = gpu_ptr + dw_offset;
+
 		fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n",
 			shader_name, elem_name, i, list_note);
 
-		switch (desc->element_dw_size) {
+		switch (element_dw_size) {
 		case 4:
 			for (j = 0; j < 4; j++)
 				ac_dump_reg(f, R_008F00_SQ_BUF_RSRC_WORD0 + j*4,
@@ -428,46 +446,85 @@
 		}
 
 		fprintf(f, "\n");
-		gpu_list += desc->element_dw_size;
-		cpu_list += desc->element_dw_size;
 	}
 }
 
+static unsigned si_identity(unsigned slot)
+{
+	return slot; /* no remapping: the list position is the slot index itself */
+}
+
 static void si_dump_descriptors(struct si_context *sctx,
-				struct si_shader_ctx_state *state,
-				FILE *f)
+				enum pipe_shader_type processor,
+				const struct tgsi_shader_info *info, FILE *f)
+{
+	struct si_descriptors *descs =
+		&sctx->descriptors[SI_DESCS_FIRST_SHADER +
+				   processor * SI_NUM_SHADER_DESCS];
+	static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"};
+	const char *name = shader_name[processor];
+	unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers;
+	unsigned enabled_images;
+
+	if (info) {
+		enabled_constbuf = info->const_buffers_declared;
+		enabled_shaderbuf = info->shader_buffers_declared;
+		enabled_samplers = info->samplers_declared;
+		enabled_images = info->images_declared;
+	} else {
+		enabled_constbuf = sctx->const_and_shader_buffers[processor].enabled_mask >>
+				   SI_NUM_SHADER_BUFFERS;
+		enabled_shaderbuf = sctx->const_and_shader_buffers[processor].enabled_mask &
+				    u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
+		enabled_shaderbuf = util_bitreverse(enabled_shaderbuf) >>
+				    (32 - SI_NUM_SHADER_BUFFERS);
+		enabled_samplers = sctx->samplers[processor].views.enabled_mask;
+		enabled_images = sctx->images[processor].enabled_mask;
+	}
+
+	if (processor == PIPE_SHADER_VERTEX) {
+		assert(info); /* only CS may not have an info struct */
+
+		si_dump_descriptor_list(&sctx->vertex_buffers, name,
+					" - Vertex buffer", 4, info->num_inputs,
+					si_identity, f);
+	}
+
+	si_dump_descriptor_list(&descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS],
+				name, " - Constant buffer", 4,
+				util_last_bit(enabled_constbuf),
+				si_get_constbuf_slot, f);
+	si_dump_descriptor_list(&descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS],
+				name, " - Shader buffer", 4,
+				util_last_bit(enabled_shaderbuf),
+				si_get_shaderbuf_slot, f);
+	si_dump_descriptor_list(&descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES],
+				name, " - Sampler", 16,
+				util_last_bit(enabled_samplers),
+				si_get_sampler_slot, f);
+	si_dump_descriptor_list(&descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES],
+				name, " - Image", 8,
+				util_last_bit(enabled_images),
+				si_get_image_slot, f);
+}
+
+static void si_dump_gfx_descriptors(struct si_context *sctx,
+				    const struct si_shader_ctx_state *state,
+				    FILE *f)
 {
 	if (!state->cso || !state->current)
 		return;
 
-	unsigned type = state->cso->type;
-	const struct tgsi_shader_info *info = &state->cso->info;
-	struct si_descriptors *descs =
-		&sctx->descriptors[SI_DESCS_FIRST_SHADER +
-				   type * SI_NUM_SHADER_DESCS];
-	static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"};
+	si_dump_descriptors(sctx, state->cso->type, &state->cso->info, f);
+}
 
-	static const char *elem_name[] = {
-		" - Constant buffer",
-		" - Shader buffer",
-		" - Sampler",
-		" - Image",
-	};
-	unsigned num_elements[] = {
-		util_last_bit(info->const_buffers_declared),
-		util_last_bit(info->shader_buffers_declared),
-		util_last_bit(info->samplers_declared),
-		util_last_bit(info->images_declared),
-	};
+static void si_dump_compute_descriptors(struct si_context *sctx, FILE *f)
+{
+	if (!sctx->cs_shader_state.program ||
+	    sctx->cs_shader_state.program != sctx->cs_shader_state.emitted_program)
+		return;
 
-	if (type == PIPE_SHADER_VERTEX) {
-		si_dump_descriptor_list(&sctx->vertex_buffers, shader_name[type],
-					" - Vertex buffer", info->num_inputs, f);
-	}
-
-	for (unsigned i = 0; i < SI_NUM_SHADER_DESCS; ++i, ++descs)
-		si_dump_descriptor_list(descs, shader_name[type], elem_name[i],
-					num_elements[i], f);
+	si_dump_descriptors(sctx, PIPE_SHADER_COMPUTE, NULL, f);
 }
 
 struct si_shader_inst {
@@ -639,6 +696,14 @@
 		si_add_split_disasm(shader->prolog->binary.disasm_string,
 				    start_addr, &num_inst, instructions);
 	}
+	if (shader->previous_stage) {
+		si_add_split_disasm(shader->previous_stage->binary.disasm_string,
+				    start_addr, &num_inst, instructions);
+	}
+	if (shader->prolog2) {
+		si_add_split_disasm(shader->prolog2->binary.disasm_string,
+				    start_addr, &num_inst, instructions);
+	}
 	si_add_split_disasm(shader->binary.disasm_string,
 			    start_addr, &num_inst, instructions);
 	if (shader->epilog) {
@@ -745,11 +810,12 @@
 		si_dump_framebuffer(sctx, f);
 
 	if (flags & PIPE_DUMP_CURRENT_SHADERS) {
-		si_dump_shader(sctx->screen, &sctx->vs_shader, f);
-		si_dump_shader(sctx->screen, &sctx->tcs_shader, f);
-		si_dump_shader(sctx->screen, &sctx->tes_shader, f);
-		si_dump_shader(sctx->screen, &sctx->gs_shader, f);
-		si_dump_shader(sctx->screen, &sctx->ps_shader, f);
+		si_dump_gfx_shader(sctx->screen, &sctx->vs_shader, f);
+		si_dump_gfx_shader(sctx->screen, &sctx->tcs_shader, f);
+		si_dump_gfx_shader(sctx->screen, &sctx->tes_shader, f);
+		si_dump_gfx_shader(sctx->screen, &sctx->gs_shader, f);
+		si_dump_gfx_shader(sctx->screen, &sctx->ps_shader, f);
+		si_dump_compute_shader(sctx->screen, &sctx->cs_shader_state, f);
 
 		if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) {
 			si_dump_annotated_shaders(sctx, f);
@@ -758,12 +824,14 @@
 		}
 
 		si_dump_descriptor_list(&sctx->descriptors[SI_DESCS_RW_BUFFERS],
-					"", "RW buffers", SI_NUM_RW_BUFFERS, f);
-		si_dump_descriptors(sctx, &sctx->vs_shader, f);
-		si_dump_descriptors(sctx, &sctx->tcs_shader, f);
-		si_dump_descriptors(sctx, &sctx->tes_shader, f);
-		si_dump_descriptors(sctx, &sctx->gs_shader, f);
-		si_dump_descriptors(sctx, &sctx->ps_shader, f);
+					"", "RW buffers", 4, SI_NUM_RW_BUFFERS,
+					si_identity, f);
+		si_dump_gfx_descriptors(sctx, &sctx->vs_shader, f);
+		si_dump_gfx_descriptors(sctx, &sctx->tcs_shader, f);
+		si_dump_gfx_descriptors(sctx, &sctx->tes_shader, f);
+		si_dump_gfx_descriptors(sctx, &sctx->gs_shader, f);
+		si_dump_gfx_descriptors(sctx, &sctx->ps_shader, f);
+		si_dump_compute_descriptors(sctx, f);
 	}
 
 	if (flags & PIPE_DUMP_LAST_COMMAND_BUFFER) {
@@ -798,7 +866,7 @@
 	fprintf(f, "SDMA Dump Done.\n");
 }
 
-static bool si_vm_fault_occured(struct si_context *sctx, uint32_t *out_addr)
+static bool si_vm_fault_occured(struct si_context *sctx, uint64_t *out_addr)
 {
 	char line[2000];
 	unsigned sec, usec;
@@ -826,7 +894,7 @@
 			}
 			continue;
 		}
-		timestamp = sec * 1000000llu + usec;
+		timestamp = sec * 1000000ull + usec;
 
 		/* If just updating the timestamp. */
 		if (!out_addr)
@@ -853,18 +921,35 @@
 		}
 		msg++;
 
+		const char *header_line, *addr_line_prefix, *addr_line_format;
+
+		if (sctx->b.chip_class >= GFX9) {
+			/* Match this:
+			 * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0)
+			 * ..:   at page 0x0000000219f8f000 from 27
+			 * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C
+			 */
+			header_line = "VMC page fault";
+			addr_line_prefix = "   at page";
+			addr_line_format = "%"PRIx64;
+		} else {
+			header_line = "GPU fault detected:";
+			addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR";
+			addr_line_format = "%"PRIX64;
+		}
+
 		switch (progress) {
 		case 0:
-			if (strstr(msg, "GPU fault detected:"))
+			if (strstr(msg, header_line))
 				progress = 1;
 			break;
 		case 1:
-			msg = strstr(msg, "VM_CONTEXT1_PROTECTION_FAULT_ADDR");
+			msg = strstr(msg, addr_line_prefix);
 			if (msg) {
 				msg = strstr(msg, "0x");
 				if (msg) {
 					msg += 2;
-					if (sscanf(msg, "%X", out_addr) == 1)
+					if (sscanf(msg, addr_line_format, out_addr) == 1)
 						fault = true;
 				}
 			}
@@ -887,7 +972,7 @@
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct pipe_screen *screen = sctx->b.b.screen;
 	FILE *f;
-	uint32_t addr;
+	uint64_t addr;
 	char cmd_line[4096];
 
 	if (!si_vm_fault_occured(sctx, &addr))
@@ -903,7 +988,7 @@
 	fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen));
 	fprintf(f, "Device vendor: %s\n", screen->get_device_vendor(screen));
 	fprintf(f, "Device name: %s\n\n", screen->get_name(screen));
-	fprintf(f, "Failing VM page: 0x%08x\n\n", addr);
+	fprintf(f, "Failing VM page: 0x%08"PRIx64"\n\n", addr);
 
 	if (sctx->apitrace_call_number)
 		fprintf(f, "Last apitrace call: %u\n\n",
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index bd73fcc..463a7f7 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -60,6 +60,7 @@
 #include "sid.h"
 #include "gfx9d.h"
 
+#include "util/hash_table.h"
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
@@ -95,37 +96,45 @@
 	 * descriptor */
 };
 
-static void si_init_descriptors(struct si_descriptors *desc,
-				unsigned shader_userdata_index,
-				unsigned element_dw_size,
-				unsigned num_elements,
-				const uint32_t *null_descriptor,
-				unsigned *ce_offset)
+static void si_init_descriptor_list(uint32_t *desc_list,
+				    unsigned element_dw_size,
+				    unsigned num_elements,
+				    const uint32_t *null_descriptor)
 {
 	int i;
 
-	assert(num_elements <= sizeof(desc->dirty_mask)*8);
-
-	desc->list = CALLOC(num_elements, element_dw_size * 4);
-	desc->element_dw_size = element_dw_size;
-	desc->num_elements = num_elements;
-	desc->dirty_mask = num_elements == 32 ? ~0u : (1u << num_elements) - 1;
-	desc->shader_userdata_offset = shader_userdata_index * 4;
-
-	if (ce_offset) {
-		desc->uses_ce = true;
-		desc->ce_offset = *ce_offset;
-
-		/* make sure that ce_offset stays 32 byte aligned */
-		*ce_offset += align(element_dw_size * num_elements * 4, 32);
-	}
-
 	/* Initialize the array to NULL descriptors if the element size is 8. */
 	if (null_descriptor) {
 		assert(element_dw_size % 8 == 0);
 		for (i = 0; i < num_elements * element_dw_size / 8; i++)
-			memcpy(desc->list + i * 8, null_descriptor,
-			       8 * 4);
+			memcpy(desc_list + i * 8, null_descriptor, 8 * 4);
+	}
+}
+
+static void si_init_descriptors(struct si_context *sctx,
+				struct si_descriptors *desc,
+				unsigned shader_userdata_index,
+				unsigned element_dw_size,
+				unsigned num_elements,
+				unsigned first_ce_slot,
+				unsigned num_ce_slots,
+				unsigned *ce_offset)
+{
+	assert(num_elements <= sizeof(desc->dirty_mask)*8);
+
+	desc->list = CALLOC(num_elements, element_dw_size * 4);
+	desc->element_dw_size = element_dw_size;
+	desc->num_elements = num_elements;
+	desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0;
+	desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0;
+	desc->dirty_mask = u_bit_consecutive64(0, num_elements);
+	desc->shader_userdata_offset = shader_userdata_index * 4;
+
+	if (desc->num_ce_slots) {
+		desc->uses_ce = true;
+		desc->ce_offset = *ce_offset;
+
+		*ce_offset += element_dw_size * desc->num_ce_slots * 4;
 	}
 }
 
@@ -136,12 +145,14 @@
 }
 
 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
-			 unsigned *out_offset, struct r600_resource **out_buf) {
+			 unsigned *out_offset, struct r600_resource **out_buf)
+{
 	uint64_t va;
 
 	u_suballocator_alloc(sctx->ce_suballocator, size,
-			     sctx->screen->b.info.tcc_cache_line_size,
-			     out_offset, (struct pipe_resource**)out_buf);
+			     si_optimal_tcc_alignment(sctx, size),
+			     out_offset,
+			     (struct pipe_resource**)out_buf);
 	if (!out_buf)
 			return false;
 
@@ -160,38 +171,36 @@
 	return true;
 }
 
-static void si_ce_reinitialize_descriptors(struct si_context *sctx,
-                                           struct si_descriptors *desc)
+void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx)
 {
-	if (desc->buffer) {
-		struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
-		unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
-		uint64_t va = buffer->gpu_address + desc->buffer_offset;
-		struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
-
-		if (!ib)
-			ib = sctx->ce_ib;
-
-		list_size = align(list_size, 32);
-
-		radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
-		radeon_emit(ib, va);
-		radeon_emit(ib, va >> 32);
-		radeon_emit(ib, list_size / 4);
-		radeon_emit(ib, desc->ce_offset);
-
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
-		                    RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
-	}
-	desc->ce_ram_dirty = false;
+	bool success = si_ce_upload(sctx, 0, sctx->total_ce_ram_allocated,
+				    &sctx->ce_ram_saved_offset,
+				    &sctx->ce_ram_saved_buffer);
+	(void)success;
+	assert(success);
 }
 
-void si_ce_reinitialize_all_descriptors(struct si_context *sctx)
+void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx)
 {
-	int i;
+	if (!sctx->ce_ram_saved_buffer)
+		return;
 
-	for (i = 0; i < SI_NUM_DESCS; ++i)
-		si_ce_reinitialize_descriptors(sctx, &sctx->descriptors[i]);
+	struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
+	if (!ib)
+		ib = sctx->ce_ib;
+
+	uint64_t va = sctx->ce_ram_saved_buffer->gpu_address +
+		      sctx->ce_ram_saved_offset;
+
+	radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
+	radeon_emit(ib, va);
+	radeon_emit(ib, va >> 32);
+	radeon_emit(ib, sctx->total_ce_ram_allocated / 4);
+	radeon_emit(ib, 0);
+
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+				  sctx->ce_ram_saved_buffer,
+				  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 }
 
 void si_ce_enable_loads(struct radeon_winsys_cs *ib)
@@ -206,21 +215,27 @@
 				  struct si_descriptors *desc,
 				  struct r600_atom * atom)
 {
-	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+	unsigned slot_size = desc->element_dw_size * 4;
+	unsigned first_slot_offset = desc->first_active_slot * slot_size;
+	unsigned upload_size = desc->num_active_slots * slot_size;
 
-	if (!desc->dirty_mask)
+	/* Skip the upload if no shader is using the descriptors. dirty_mask
+	 * will stay dirty and the descriptors will be uploaded when there is
+	 * a shader using them.
+	 */
+	if (!upload_size)
 		return true;
 
-	if (sctx->ce_ib && desc->uses_ce) {
-		uint32_t const* list = (uint32_t const*)desc->list;
+	if (desc->uses_ce) {
+		const uint32_t *list = desc->list +
+				       desc->first_ce_slot * desc->element_dw_size;
+		uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) &
+				u_bit_consecutive64(0, desc->num_ce_slots);
 
-		if (desc->ce_ram_dirty)
-			si_ce_reinitialize_descriptors(sctx, desc);
 
-		while(desc->dirty_mask) {
+		while (mask) {
 			int begin, count;
-			u_bit_scan_consecutive_range(&desc->dirty_mask, &begin,
-						     &count);
+			u_bit_scan_consecutive_range64(&mask, &begin, &count);
 
 			begin *= desc->element_dw_size;
 			count *= desc->element_dw_size;
@@ -231,25 +246,34 @@
 			radeon_emit_array(sctx->ce_ib, list + begin, count);
 		}
 
-		if (!si_ce_upload(sctx, desc->ce_offset, list_size,
-		                           &desc->buffer_offset, &desc->buffer))
+		if (!si_ce_upload(sctx,
+				  desc->ce_offset +
+				  (first_slot_offset - desc->first_ce_slot * slot_size),
+				  upload_size, (unsigned*)&desc->buffer_offset,
+				  &desc->buffer))
 			return false;
 	} else {
-		void *ptr;
+		uint32_t *ptr;
 
-		u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
-			       sctx->screen->b.info.tcc_cache_line_size,
-			       &desc->buffer_offset,
-			       (struct pipe_resource**)&desc->buffer, &ptr);
+		u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
+			       si_optimal_tcc_alignment(sctx, upload_size),
+			       (unsigned*)&desc->buffer_offset,
+			       (struct pipe_resource**)&desc->buffer,
+			       (void**)&ptr);
 		if (!desc->buffer)
 			return false; /* skip the draw call */
 
-		util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
-		desc->gpu_list = ptr;
+		util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset,
+					upload_size);
+		desc->gpu_list = ptr - first_slot_offset / 4;
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
 	                            RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 	}
+
+	/* The shader pointer should point to slot 0. */
+	desc->buffer_offset -= first_slot_offset;
+
 	desc->dirty_mask = 0;
 
 	if (atom)
@@ -261,8 +285,6 @@
 static void
 si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
 {
-	desc->ce_ram_dirty = true;
-
 	if (!desc->buffer)
 		return;
 
@@ -273,16 +295,16 @@
 /* SAMPLER VIEWS */
 
 static unsigned
-si_sampler_descriptors_idx(unsigned shader)
+si_sampler_and_image_descriptors_idx(unsigned shader)
 {
 	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
-	       SI_SHADER_DESCS_SAMPLERS;
+	       SI_SHADER_DESCS_SAMPLERS_AND_IMAGES;
 }
 
 static struct si_descriptors *
-si_sampler_descriptors(struct si_context *sctx, unsigned shader)
+si_sampler_and_image_descriptors(struct si_context *sctx, unsigned shader)
 {
-	return &sctx->descriptors[si_sampler_descriptors_idx(shader)];
+	return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)];
 }
 
 static void si_release_sampler_views(struct si_sampler_views *views)
@@ -331,14 +353,6 @@
 						    rtex->dcc_separate_buffer, usage,
 						    RADEON_PRIO_DCC, check_mem);
 	}
-
-	if (rtex->htile_buffer &&
-	    rtex->tc_compatible_htile &&
-	    !is_stencil_sampler) {
-		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
-						    rtex->htile_buffer, usage,
-						    RADEON_PRIO_HTILE, check_mem);
-	}
 }
 
 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
@@ -418,8 +432,8 @@
 
 			if (sscreen->b.chip_class <= VI)
 				meta_va += base_level_info->dcc_offset;
-		} else if (tex->tc_compatible_htile && !is_stencil) {
-			meta_va = tex->htile_buffer->gpu_address;
+		} else if (tex->tc_compatible_htile) {
+			meta_va = tex->resource.gpu_address + tex->htile_offset;
 		}
 
 		if (meta_va) {
@@ -467,6 +481,59 @@
 	}
 }
 
+/* Write the 16-dword slot at "desc": view state in [0:7], then FMASK state in
+ * [8:15], or else a null texture in [8:11] and "sstate" (if set) in [12:15]. */
+static void si_set_sampler_view_desc(struct si_context *sctx,
+				     struct si_sampler_view *sview,
+				     struct si_sampler_state *sstate,
+				     uint32_t *desc)
+{
+	struct pipe_sampler_view *view = &sview->base;
+	struct r600_texture *rtex = (struct r600_texture *)view->texture;
+	bool is_buffer;
+
+	assert(rtex); /* views with texture == NULL aren't supported */
+	is_buffer = rtex->resource.b.b.target == PIPE_BUFFER;
+
+	if (unlikely(!is_buffer && sview->dcc_incompatible)) {
+		if (vi_dcc_enabled(rtex, view->u.tex.first_level))
+			if (!r600_texture_disable_dcc(&sctx->b, rtex))
+				sctx->b.decompress_dcc(&sctx->b.b, rtex);
+
+		sview->dcc_incompatible = false;
+	}
+
+	memcpy(desc, sview->state, 8*4);
+
+	if (is_buffer) {
+		si_set_buf_desc_address(&rtex->resource,
+					sview->base.u.buf.offset, desc + 4);
+	} else {
+		bool is_separate_stencil = rtex->db_compatible &&
+					   sview->is_stencil_sampler;
+
+		si_set_mutable_tex_desc_fields(sctx->screen, rtex,
+					       sview->base_level_info, sview->base_level,
+					       sview->base.u.tex.first_level,
+					       sview->block_width, is_separate_stencil, desc);
+	}
+
+	if (!is_buffer && rtex->fmask.size) {
+		memcpy(desc + 8, sview->fmask_state, 8*4);
+	} else {
+		/* Disable FMASK and bind sampler state in [12:15]. */
+		memcpy(desc + 8, null_texture_descriptor, 4*4);
+
+		if (sstate) {
+			if (!is_buffer && rtex->upgraded_depth &&
+			    !sview->is_stencil_sampler)
+				memcpy(desc + 12, sstate->upgraded_depth_val, 4*4);
+			else
+				memcpy(desc + 12, sstate->val, 4*4);
+		}
+	}
+}
+
 static void si_set_sampler_view(struct si_context *sctx,
 				unsigned shader,
 				unsigned slot, struct pipe_sampler_view *view,
@@ -474,61 +541,23 @@
 {
 	struct si_sampler_views *views = &sctx->samplers[shader].views;
 	struct si_sampler_view *rview = (struct si_sampler_view*)view;
-	struct si_descriptors *descs = si_sampler_descriptors(sctx, shader);
-	uint32_t *desc = descs->list + slot * 16;
+	struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
+	unsigned desc_slot = si_get_sampler_slot(slot);
+	uint32_t *desc = descs->list + desc_slot * 16;
 
 	if (views->views[slot] == view && !disallow_early_out)
 		return;
 
 	if (view) {
 		struct r600_texture *rtex = (struct r600_texture *)view->texture;
-		bool is_buffer = rtex->resource.b.b.target == PIPE_BUFFER;
 
-		if (unlikely(!is_buffer && rview->dcc_incompatible)) {
-			if (vi_dcc_enabled(rtex, view->u.tex.first_level))
-				if (!r600_texture_disable_dcc(&sctx->b, rtex))
-					sctx->b.decompress_dcc(&sctx->b.b, rtex);
+		si_set_sampler_view_desc(sctx, rview,
+					 views->sampler_states[slot], desc);
 
-			rview->dcc_incompatible = false;
-		}
-
-		assert(rtex); /* views with texture == NULL aren't supported */
-		pipe_sampler_view_reference(&views->views[slot], view);
-		memcpy(desc, rview->state, 8*4);
-
-		if (is_buffer) {
+		if (rtex->resource.b.b.target == PIPE_BUFFER)
 			rtex->resource.bind_history |= PIPE_BIND_SAMPLER_VIEW;
 
-			si_set_buf_desc_address(&rtex->resource,
-						view->u.buf.offset,
-						desc + 4);
-		} else {
-			bool is_separate_stencil =
-				rtex->db_compatible &&
-				rview->is_stencil_sampler;
-
-			si_set_mutable_tex_desc_fields(sctx->screen, rtex,
-						       rview->base_level_info,
-						       rview->base_level,
-						       rview->base.u.tex.first_level,
-						       rview->block_width,
-						       is_separate_stencil,
-						       desc);
-		}
-
-		if (!is_buffer && rtex->fmask.size) {
-			memcpy(desc + 8,
-			       rview->fmask_state, 8*4);
-		} else {
-			/* Disable FMASK and bind sampler state in [12:15]. */
-			memcpy(desc + 8,
-			       null_texture_descriptor, 4*4);
-
-			if (views->sampler_states[slot])
-				memcpy(desc + 12,
-				       views->sampler_states[slot]->val, 4*4);
-		}
-
+		pipe_sampler_view_reference(&views->views[slot], view);
 		views->enabled_mask |= 1u << slot;
 
 		/* Since this can flush, it must be done after enabled_mask is
@@ -549,36 +578,39 @@
 		views->enabled_mask &= ~(1u << slot);
 	}
 
-	descs->dirty_mask |= 1u << slot;
-	sctx->descriptors_dirty |= 1u << si_sampler_descriptors_idx(shader);
+	descs->dirty_mask |= 1ull << desc_slot;
+	sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
 }
 
-static bool is_compressed_colortex(struct r600_texture *rtex)
+static bool color_needs_decompression(struct r600_texture *rtex)
 {
 	return rtex->fmask.size ||
 	       (rtex->dirty_level_mask &&
 		(rtex->cmask.size || rtex->dcc_offset));
 }
 
-static bool depth_needs_decompression(struct r600_texture *rtex,
-				      struct si_sampler_view *sview)
+static bool depth_needs_decompression(struct r600_texture *rtex)
 {
-	return rtex->db_compatible &&
-	       (!rtex->tc_compatible_htile || sview->is_stencil_sampler);
+	/* If the depth/stencil texture is TC-compatible, no decompression
+	 * will be done. The decompression function will only flush DB caches
+	 * to make it coherent with shaders. That's necessary because the driver
+	 * doesn't flush DB caches in any other case.
+	 */
+	return rtex->db_compatible;
 }
 
-static void si_update_compressed_tex_shader_mask(struct si_context *sctx,
-						 unsigned shader)
+static void si_update_shader_needs_decompress_mask(struct si_context *sctx,
+						   unsigned shader)
 {
 	struct si_textures_info *samplers = &sctx->samplers[shader];
 	unsigned shader_bit = 1 << shader;
 
-	if (samplers->depth_texture_mask ||
-	    samplers->compressed_colortex_mask ||
-	    sctx->images[shader].compressed_colortex_mask)
-		sctx->compressed_tex_shader_mask |= shader_bit;
+	if (samplers->needs_depth_decompress_mask ||
+	    samplers->needs_color_decompress_mask ||
+	    sctx->images[shader].needs_color_decompress_mask)
+		sctx->shader_needs_decompress_mask |= shader_bit;
 	else
-		sctx->compressed_tex_shader_mask &= ~shader_bit;
+		sctx->shader_needs_decompress_mask &= ~shader_bit;
 }
 
 static void si_set_sampler_views(struct pipe_context *ctx,
@@ -597,8 +629,8 @@
 		unsigned slot = start + i;
 
 		if (!views || !views[i]) {
-			samplers->depth_texture_mask &= ~(1u << slot);
-			samplers->compressed_colortex_mask &= ~(1u << slot);
+			samplers->needs_depth_decompress_mask &= ~(1u << slot);
+			samplers->needs_color_decompress_mask &= ~(1u << slot);
 			si_set_sampler_view(sctx, shader, slot, NULL, false);
 			continue;
 		}
@@ -608,33 +640,32 @@
 		if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
 			struct r600_texture *rtex =
 				(struct r600_texture*)views[i]->texture;
-			struct si_sampler_view *rview = (struct si_sampler_view *)views[i];
 
-			if (depth_needs_decompression(rtex, rview)) {
-				samplers->depth_texture_mask |= 1u << slot;
+			if (depth_needs_decompression(rtex)) {
+				samplers->needs_depth_decompress_mask |= 1u << slot;
 			} else {
-				samplers->depth_texture_mask &= ~(1u << slot);
+				samplers->needs_depth_decompress_mask &= ~(1u << slot);
 			}
-			if (is_compressed_colortex(rtex)) {
-				samplers->compressed_colortex_mask |= 1u << slot;
+			if (color_needs_decompression(rtex)) {
+				samplers->needs_color_decompress_mask |= 1u << slot;
 			} else {
-				samplers->compressed_colortex_mask &= ~(1u << slot);
+				samplers->needs_color_decompress_mask &= ~(1u << slot);
 			}
 
 			if (rtex->dcc_offset &&
 			    p_atomic_read(&rtex->framebuffers_bound))
 				sctx->need_check_render_feedback = true;
 		} else {
-			samplers->depth_texture_mask &= ~(1u << slot);
-			samplers->compressed_colortex_mask &= ~(1u << slot);
+			samplers->needs_depth_decompress_mask &= ~(1u << slot);
+			samplers->needs_color_decompress_mask &= ~(1u << slot);
 		}
 	}
 
-	si_update_compressed_tex_shader_mask(sctx, shader);
+	si_update_shader_needs_decompress_mask(sctx, shader);
 }
 
 static void
-si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers)
+si_samplers_update_needs_color_decompress_mask(struct si_textures_info *samplers)
 {
 	unsigned mask = samplers->views.enabled_mask;
 
@@ -645,10 +676,10 @@
 		if (res && res->target != PIPE_BUFFER) {
 			struct r600_texture *rtex = (struct r600_texture *)res;
 
-			if (is_compressed_colortex(rtex)) {
-				samplers->compressed_colortex_mask |= 1u << i;
+			if (color_needs_decompression(rtex)) {
+				samplers->needs_color_decompress_mask |= 1u << i;
 			} else {
-				samplers->compressed_colortex_mask &= ~(1u << i);
+				samplers->needs_color_decompress_mask &= ~(1u << i);
 			}
 		}
 	}
@@ -656,19 +687,6 @@
 
 /* IMAGE VIEWS */
 
-static unsigned
-si_image_descriptors_idx(unsigned shader)
-{
-	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
-	       SI_SHADER_DESCS_IMAGES;
-}
-
-static struct si_descriptors*
-si_image_descriptors(struct si_context *sctx, unsigned shader)
-{
-	return &sctx->descriptors[si_image_descriptors_idx(shader)];
-}
-
 static void
 si_release_image_views(struct si_images_info *images)
 {
@@ -704,15 +722,17 @@
 	struct si_images_info *images = &ctx->images[shader];
 
 	if (images->enabled_mask & (1u << slot)) {
-		struct si_descriptors *descs = si_image_descriptors(ctx, shader);
+		struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
+		unsigned desc_slot = si_get_image_slot(slot);
 
 		pipe_resource_reference(&images->views[slot].resource, NULL);
-		images->compressed_colortex_mask &= ~(1 << slot);
+		images->needs_color_decompress_mask &= ~(1 << slot);
 
-		memcpy(descs->list + slot*8, null_image_descriptor, 8*4);
+		memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4);
 		images->enabled_mask &= ~(1u << slot);
-		descs->dirty_mask |= 1u << slot;
-		ctx->descriptors_dirty |= 1u << si_image_descriptors_idx(shader);
+		/* two 8-dword images share one 16-dword slot */
+		descs->dirty_mask |= 1u << (desc_slot / 2);
+		ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
 	}
 }
 
@@ -728,27 +748,16 @@
 		       view->u.buf.offset + view->u.buf.size);
 }
 
-static void si_set_shader_image(struct si_context *ctx,
-				unsigned shader,
-				unsigned slot, const struct pipe_image_view *view,
-				bool skip_decompress)
+static void si_set_shader_image_desc(struct si_context *ctx,
+				     const struct pipe_image_view *view,
+				     bool skip_decompress,
+				     uint32_t *desc)
 {
 	struct si_screen *screen = ctx->screen;
-	struct si_images_info *images = &ctx->images[shader];
-	struct si_descriptors *descs = si_image_descriptors(ctx, shader);
 	struct r600_resource *res;
-	uint32_t *desc = descs->list + slot * 8;
-
-	if (!view || !view->resource) {
-		si_disable_shader_image(ctx, shader, slot);
-		return;
-	}
 
 	res = (struct r600_resource *)view->resource;
 
-	if (&images->views[slot] != view)
-		util_copy_image_view(&images->views[slot], view);
-
 	if (res->b.b.target == PIPE_BUFFER) {
 		if (view->access & PIPE_IMAGE_ACCESS_WRITE)
 			si_mark_image_range_valid(view);
@@ -758,9 +767,6 @@
 					  view->u.buf.offset,
 					  view->u.buf.size, desc);
 		si_set_buf_desc_address(res, view->u.buf.offset, desc + 4);
-
-		images->compressed_colortex_mask &= ~(1 << slot);
-		res->bind_history |= PIPE_BIND_SHADER_IMAGE;
 	} else {
 		static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
 		struct r600_texture *tex = (struct r600_texture *)res;
@@ -778,22 +784,10 @@
 			 * The decompression is relatively cheap if the surface
 			 * has been decompressed already.
 			 */
-			if (r600_texture_disable_dcc(&ctx->b, tex))
-				uses_dcc = false;
-			else
+			if (!r600_texture_disable_dcc(&ctx->b, tex))
 				ctx->b.decompress_dcc(&ctx->b.b, tex);
 		}
 
-		if (is_compressed_colortex(tex)) {
-			images->compressed_colortex_mask |= 1 << slot;
-		} else {
-			images->compressed_colortex_mask &= ~(1 << slot);
-		}
-
-		if (uses_dcc &&
-		    p_atomic_read(&tex->framebuffers_bound))
-			ctx->need_check_render_feedback = true;
-
 		if (ctx->b.chip_class >= GFX9) {
 			/* Always set the base address. The swizzle modes don't
 			 * allow setting mipmap level offsets as the base.
@@ -829,14 +823,59 @@
 					       util_format_get_blockwidth(view->format),
 					       false, desc);
 	}
+}
+
+static void si_set_shader_image(struct si_context *ctx,
+				unsigned shader,
+				unsigned slot, const struct pipe_image_view *view,
+				bool skip_decompress)
+{
+	struct si_images_info *images = &ctx->images[shader];
+	struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
+	struct r600_resource *res;
+	unsigned desc_slot = si_get_image_slot(slot);
+	uint32_t *desc = descs->list + desc_slot * 8;
+
+	if (!view || !view->resource) {
+		si_disable_shader_image(ctx, shader, slot);
+		return;
+	}
+
+	res = (struct r600_resource *)view->resource;
+
+	if (&images->views[slot] != view)
+		util_copy_image_view(&images->views[slot], view);
+
+	si_set_shader_image_desc(ctx, view, skip_decompress, desc);
+
+	if (res->b.b.target == PIPE_BUFFER) {
+		images->needs_color_decompress_mask &= ~(1 << slot);
+		res->bind_history |= PIPE_BIND_SHADER_IMAGE;
+	} else {
+		struct r600_texture *tex = (struct r600_texture *)res;
+		unsigned level = view->u.tex.level;
+
+		if (color_needs_decompression(tex)) {
+			images->needs_color_decompress_mask |= 1 << slot;
+		} else {
+			images->needs_color_decompress_mask &= ~(1 << slot);
+		}
+
+		if (vi_dcc_enabled(tex, level) &&
+		    p_atomic_read(&tex->framebuffers_bound))
+			ctx->need_check_render_feedback = true;
+	}
 
 	images->enabled_mask |= 1u << slot;
-	descs->dirty_mask |= 1u << slot;
-	ctx->descriptors_dirty |= 1u << si_image_descriptors_idx(shader);
+	/* two 8-dword images share one 16-dword slot */
+	descs->dirty_mask |= 1u << (desc_slot / 2);
+	ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
 
 	/* Since this can flush, it must be done after enabled_mask is updated. */
 	si_sampler_view_add_buffer(ctx, &res->b.b,
-				   RADEON_USAGE_READWRITE, false, true);
+				   (view->access & PIPE_IMAGE_ACCESS_WRITE) ?
+				   RADEON_USAGE_READWRITE : RADEON_USAGE_READ,
+				   false, true);
 }
 
 static void
@@ -863,11 +902,11 @@
 			si_set_shader_image(ctx, shader, slot, NULL, false);
 	}
 
-	si_update_compressed_tex_shader_mask(ctx, shader);
+	si_update_shader_needs_decompress_mask(ctx, shader);
 }
 
 static void
-si_images_update_compressed_colortex_mask(struct si_images_info *images)
+si_images_update_needs_color_decompress_mask(struct si_images_info *images)
 {
 	unsigned mask = images->enabled_mask;
 
@@ -878,10 +917,10 @@
 		if (res && res->target != PIPE_BUFFER) {
 			struct r600_texture *rtex = (struct r600_texture *)res;
 
-			if (is_compressed_colortex(rtex)) {
-				images->compressed_colortex_mask |= 1 << i;
+			if (color_needs_decompression(rtex)) {
+				images->needs_color_decompress_mask |= 1 << i;
 			} else {
-				images->compressed_colortex_mask &= ~(1 << i);
+				images->needs_color_decompress_mask &= ~(1 << i);
 			}
 		}
 	}
@@ -895,7 +934,7 @@
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_textures_info *samplers = &sctx->samplers[shader];
-	struct si_descriptors *desc = si_sampler_descriptors(sctx, shader);
+	struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader);
 	struct si_sampler_state **sstates = (struct si_sampler_state**)states;
 	int i;
 
@@ -904,6 +943,7 @@
 
 	for (i = 0; i < count; i++) {
 		unsigned slot = start + i;
+		unsigned desc_slot = si_get_sampler_slot(slot);
 
 		if (!sstates[i] ||
 		    sstates[i] == samplers->views.sampler_states[slot])
@@ -917,34 +957,53 @@
 		/* If FMASK is bound, don't overwrite it.
 		 * The sampler state will be set after FMASK is unbound.
 		 */
-		if (samplers->views.views[slot] &&
-		    samplers->views.views[slot]->texture &&
-		    samplers->views.views[slot]->texture->target != PIPE_BUFFER &&
-		    ((struct r600_texture*)samplers->views.views[slot]->texture)->fmask.size)
+		struct si_sampler_view *sview =
+			(struct si_sampler_view *)samplers->views.views[slot];
+
+		struct r600_texture *tex = NULL;
+
+		if (sview && sview->base.texture &&
+		    sview->base.texture->target != PIPE_BUFFER)
+			tex = (struct r600_texture *)sview->base.texture;
+
+		if (tex && tex->fmask.size)
 			continue;
 
-		memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
-		desc->dirty_mask |= 1u << slot;
-		sctx->descriptors_dirty |= 1u << si_sampler_descriptors_idx(shader);
+		if (tex && tex->upgraded_depth && !sview->is_stencil_sampler)
+			memcpy(desc->list + desc_slot * 16 + 12,
+			       sstates[i]->upgraded_depth_val, 4*4);
+		else
+			memcpy(desc->list + desc_slot * 16 + 12,
+			       sstates[i]->val, 4*4);
+		desc->dirty_mask |= 1ull << desc_slot;
+
+		sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
 	}
 }
 
 /* BUFFER RESOURCES */
 
-static void si_init_buffer_resources(struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_context *sctx,
+				     struct si_buffer_resources *buffers,
 				     struct si_descriptors *descs,
 				     unsigned num_buffers,
+				     unsigned first_ce_slot,
+				     unsigned num_ce_slots,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
+				     enum radeon_bo_usage shader_usage_constbuf,
 				     enum radeon_bo_priority priority,
+				     enum radeon_bo_priority priority_constbuf,
 				     unsigned *ce_offset)
 {
 	buffers->shader_usage = shader_usage;
+	buffers->shader_usage_constbuf = shader_usage_constbuf;
 	buffers->priority = priority;
+	buffers->priority_constbuf = priority_constbuf;
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
-	si_init_descriptors(descs, shader_userdata_index, 4,
-			    num_buffers, NULL, ce_offset);
+	si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers,
+			    first_ce_slot, num_ce_slots, ce_offset);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers,
@@ -969,8 +1028,11 @@
 		int i = u_bit_scan(&mask);
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-				      (struct r600_resource*)buffers->buffers[i],
-				      buffers->shader_usage, buffers->priority);
+			r600_resource(buffers->buffers[i]),
+			i < SI_NUM_SHADER_BUFFERS ? buffers->shader_usage :
+						    buffers->shader_usage_constbuf,
+			i < SI_NUM_SHADER_BUFFERS ? buffers->priority :
+						    buffers->priority_constbuf);
 	}
 }
 
@@ -1004,15 +1066,15 @@
 	int i;
 
 	for (i = 0; i < count; i++) {
-		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
+		int vb = sctx->vertex_elements->vertex_buffer_index[i];
 
 		if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
 			continue;
-		if (!sctx->vertex_buffer[vb].buffer)
+		if (!sctx->vertex_buffer[vb].buffer.resource)
 			continue;
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
+				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer.resource,
 				      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 	}
 
@@ -1025,7 +1087,7 @@
 
 bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 {
-	struct si_vertex_element *velems = sctx->vertex_elements;
+	struct si_vertex_elements *velems = sctx->vertex_elements;
 	struct si_descriptors *desc = &sctx->vertex_buffers;
 	unsigned i, count;
 	unsigned desc_list_byte_size;
@@ -1051,7 +1113,7 @@
 	u_upload_alloc(sctx->b.b.const_uploader, 0,
 		       desc_list_byte_size,
 		       si_optimal_tcc_alignment(sctx, desc_list_byte_size),
-		       &desc->buffer_offset,
+		       (unsigned*)&desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
 	if (!desc->buffer)
 		return false;
@@ -1063,21 +1125,20 @@
 	assert(count <= SI_MAX_ATTRIBS);
 
 	for (i = 0; i < count; i++) {
-		struct pipe_vertex_element *ve = &velems->elements[i];
 		struct pipe_vertex_buffer *vb;
 		struct r600_resource *rbuffer;
 		unsigned offset;
-		unsigned vbo_index = ve->vertex_buffer_index;
+		unsigned vbo_index = velems->vertex_buffer_index[i];
 		uint32_t *desc = &ptr[i*4];
 
 		vb = &sctx->vertex_buffer[vbo_index];
-		rbuffer = (struct r600_resource*)vb->buffer;
+		rbuffer = (struct r600_resource*)vb->buffer.resource;
 		if (!rbuffer) {
 			memset(desc, 0, 16);
 			continue;
 		}
 
-		offset = vb->buffer_offset + ve->src_offset;
+		offset = vb->buffer_offset + velems->src_offset[i];
 		va = rbuffer->gpu_address + offset;
 
 		/* Fill in T# buffer resource description */
@@ -1087,18 +1148,18 @@
 
 		if (sctx->b.chip_class != VI && vb->stride) {
 			/* Round up by rounding down and adding 1 */
-			desc[2] = (vb->buffer->width0 - offset -
+			desc[2] = (vb->buffer.resource->width0 - offset -
 				   velems->format_size[i]) /
 				  vb->stride + 1;
 		} else {
-			desc[2] = vb->buffer->width0 - offset;
+			desc[2] = vb->buffer.resource->width0 - offset;
 		}
 
 		desc[3] = velems->rsrc_word3[i];
 
 		if (first_vb_use_mask & (1 << i)) {
 			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-					      (struct r600_resource*)vb->buffer,
+					      (struct r600_resource*)vb->buffer.resource,
 					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 		}
 	}
@@ -1119,16 +1180,16 @@
 /* CONSTANT BUFFERS */
 
 static unsigned
-si_const_buffer_descriptors_idx(unsigned shader)
+si_const_and_shader_buffer_descriptors_idx(unsigned shader)
 {
 	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
-	       SI_SHADER_DESCS_CONST_BUFFERS;
+	       SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS;
 }
 
 static struct si_descriptors *
-si_const_buffer_descriptors(struct si_context *sctx, unsigned shader)
+si_const_and_shader_buffer_descriptors(struct si_context *sctx, unsigned shader)
 {
-	return &sctx->descriptors[si_const_buffer_descriptors_idx(shader)];
+	return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)];
 }
 
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
@@ -1199,8 +1260,8 @@
 		buffers->buffers[slot] = buffer;
 		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
 						    (struct r600_resource*)buffer,
-						    buffers->shader_usage,
-						    buffers->priority, true);
+						    buffers->shader_usage_constbuf,
+						    buffers->priority_constbuf, true);
 		buffers->enabled_mask |= 1u << slot;
 	} else {
 		/* Clear the descriptor. */
@@ -1228,8 +1289,9 @@
 	if (shader >= SI_NUM_SHADERS)
 		return;
 
-	si_set_constant_buffer(sctx, &sctx->const_buffers[shader],
-			       si_const_buffer_descriptors_idx(shader),
+	slot = si_get_constbuf_slot(slot);
+	si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader],
+			       si_const_and_shader_buffer_descriptors_idx(shader),
 			       slot, input);
 }
 
@@ -1238,35 +1300,22 @@
 {
 	cbuf->user_buffer = NULL;
 	si_get_buffer_from_descriptors(
-		&sctx->const_buffers[shader],
-		si_const_buffer_descriptors(sctx, shader),
-		slot, &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size);
+		&sctx->const_and_shader_buffers[shader],
+		si_const_and_shader_buffer_descriptors(sctx, shader),
+		si_get_constbuf_slot(slot),
+		&cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size);
 }
 
 /* SHADER BUFFERS */
 
-static unsigned
-si_shader_buffer_descriptors_idx(enum pipe_shader_type shader)
-{
-	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
-	       SI_SHADER_DESCS_SHADER_BUFFERS;
-}
-
-static struct si_descriptors *
-si_shader_buffer_descriptors(struct si_context *sctx,
-				  enum pipe_shader_type shader)
-{
-	return &sctx->descriptors[si_shader_buffer_descriptors_idx(shader)];
-}
-
 static void si_set_shader_buffers(struct pipe_context *ctx,
 				  enum pipe_shader_type shader,
 				  unsigned start_slot, unsigned count,
 				  const struct pipe_shader_buffer *sbuffers)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_buffer_resources *buffers = &sctx->shader_buffers[shader];
-	struct si_descriptors *descs = si_shader_buffer_descriptors(sctx, shader);
+	struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
+	struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader);
 	unsigned i;
 
 	assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
@@ -1274,7 +1323,7 @@
 	for (i = 0; i < count; ++i) {
 		const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
 		struct r600_resource *buf;
-		unsigned slot = start_slot + i;
+		unsigned slot = si_get_shaderbuf_slot(start_slot + i);
 		uint32_t *desc = descs->list + slot * 4;
 		uint64_t va;
 
@@ -1284,7 +1333,7 @@
 			buffers->enabled_mask &= ~(1u << slot);
 			descs->dirty_mask |= 1u << slot;
 			sctx->descriptors_dirty |=
-				1u << si_shader_buffer_descriptors_idx(shader);
+				1u << si_const_and_shader_buffer_descriptors_idx(shader);
 			continue;
 		}
 
@@ -1311,7 +1360,7 @@
 		buffers->enabled_mask |= 1u << slot;
 		descs->dirty_mask |= 1u << slot;
 		sctx->descriptors_dirty |=
-			1u << si_shader_buffer_descriptors_idx(shader);
+			1u << si_const_and_shader_buffer_descriptors_idx(shader);
 
 		util_range_add(&buf->valid_buffer_range, sbuffer->buffer_offset,
 			       sbuffer->buffer_offset + sbuffer->buffer_size);
@@ -1323,12 +1372,13 @@
 			   uint start_slot, uint count,
 			   struct pipe_shader_buffer *sbuf)
 {
-	struct si_buffer_resources *buffers = &sctx->shader_buffers[shader];
-	struct si_descriptors *descs = si_shader_buffer_descriptors(sctx, shader);
+	struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
+	struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader);
 
 	for (unsigned i = 0; i < count; ++i) {
 		si_get_buffer_from_descriptors(
-			buffers, descs, start_slot + i,
+			buffers, descs,
+			si_get_shaderbuf_slot(start_slot + i),
 			&sbuf[i].buffer, &sbuf[i].buffer_offset,
 			&sbuf[i].buffer_size);
 	}
@@ -1584,17 +1634,59 @@
 
 /* TEXTURE METADATA ENABLE/DISABLE */
 
+static void
+si_resident_handles_update_needs_color_decompress(struct si_context *sctx)
+{
+	util_dynarray_clear(&sctx->resident_tex_needs_color_decompress);
+	util_dynarray_clear(&sctx->resident_img_needs_color_decompress);
+	/* Both lists were cleared above; re-add the handles that still qualify. */
+	util_dynarray_foreach(&sctx->resident_tex_handles,
+			      struct si_texture_handle *, tex_handle) {
+		struct pipe_resource *res = (*tex_handle)->view->texture;
+		struct r600_texture *rtex;
+
+		if (!res || res->target == PIPE_BUFFER)
+			continue; /* only non-buffer resources are considered */
+
+		rtex = (struct r600_texture *)res;
+		if (!color_needs_decompression(rtex))
+			continue;
+
+		util_dynarray_append(&sctx->resident_tex_needs_color_decompress,
+				     struct si_texture_handle *, *tex_handle);
+	}
+
+	util_dynarray_foreach(&sctx->resident_img_handles,
+			      struct si_image_handle *, img_handle) {
+		struct pipe_image_view *view = &(*img_handle)->view;
+		struct pipe_resource *res = view->resource;
+		struct r600_texture *rtex;
+
+		if (!res || res->target == PIPE_BUFFER)
+			continue; /* same filtering as for the texture handles */
+
+		rtex = (struct r600_texture *)res;
+		if (!color_needs_decompression(rtex))
+			continue;
+
+		util_dynarray_append(&sctx->resident_img_needs_color_decompress,
+				     struct si_image_handle *, *img_handle);
+	}
+}
+
 /* CMASK can be enabled (for fast clear) and disabled (for texture export)
  * while the texture is bound, possibly by a different context. In that case,
- * call this function to update compressed_colortex_masks.
+ * call this function to update needs_*_decompress_masks.
  */
-void si_update_compressed_colortex_masks(struct si_context *sctx)
+void si_update_needs_color_decompress_masks(struct si_context *sctx)
 {
 	for (int i = 0; i < SI_NUM_SHADERS; ++i) {
-		si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]);
-		si_images_update_compressed_colortex_mask(&sctx->images[i]);
-		si_update_compressed_tex_shader_mask(sctx, i);
+		si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]);
+		si_images_update_needs_color_decompress_mask(&sctx->images[i]);
+		si_update_shader_needs_decompress_mask(sctx, i);
 	}
+
+	si_resident_handles_update_needs_color_decompress(sctx);
 }
 
 /* BUFFER DISCARD/INVALIDATION */
@@ -1603,11 +1695,14 @@
 static void si_reset_buffer_resources(struct si_context *sctx,
 				      struct si_buffer_resources *buffers,
 				      unsigned descriptors_idx,
+				      unsigned slot_mask,
 				      struct pipe_resource *buf,
-				      uint64_t old_va)
+				      uint64_t old_va,
+				      enum radeon_bo_usage usage,
+				      enum radeon_bo_priority priority)
 {
 	struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
-	unsigned mask = buffers->enabled_mask;
+	unsigned mask = buffers->enabled_mask & slot_mask;
 
 	while (mask) {
 		unsigned i = u_bit_scan(&mask);
@@ -1620,31 +1715,20 @@
 
 			radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
 							    (struct r600_resource *)buf,
-							    buffers->shader_usage,
-							    buffers->priority, true);
+							    usage, priority, true);
 		}
 	}
 }
 
-/* Reallocate a buffer a update all resource bindings where the buffer is
- * bound.
- *
- * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
- * idle by discarding its contents. Apps usually tell us when to do this using
- * map_buffer flags, for example.
- */
-static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
+static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf,
+			     uint64_t old_va)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct r600_resource *rbuffer = r600_resource(buf);
 	unsigned i, shader;
-	uint64_t old_va = rbuffer->gpu_address;
 	unsigned num_elems = sctx->vertex_elements ?
 				       sctx->vertex_elements->count : 0;
 
-	/* Reallocate the buffer in the same pipe_resource. */
-	r600_alloc_resource(&sctx->screen->b, rbuffer);
-
 	/* We changed the buffer, now we need to bind it where the old one
 	 * was bound. This consists of 2 things:
 	 *   1) Updating the resource descriptor and dirtying it.
@@ -1654,14 +1738,14 @@
 	/* Vertex buffers. */
 	if (rbuffer->bind_history & PIPE_BIND_VERTEX_BUFFER) {
 		for (i = 0; i < num_elems; i++) {
-			int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
+			int vb = sctx->vertex_elements->vertex_buffer_index[i];
 
 			if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
 				continue;
-			if (!sctx->vertex_buffer[vb].buffer)
+			if (!sctx->vertex_buffer[vb].buffer.resource)
 				continue;
 
-			if (sctx->vertex_buffer[vb].buffer == buf) {
+			if (sctx->vertex_buffer[vb].buffer.resource == buf) {
 				sctx->vertex_buffers_dirty = true;
 				break;
 			}
@@ -1700,16 +1784,22 @@
 	/* Constant and shader buffers. */
 	if (rbuffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
 		for (shader = 0; shader < SI_NUM_SHADERS; shader++)
-			si_reset_buffer_resources(sctx, &sctx->const_buffers[shader],
-						  si_const_buffer_descriptors_idx(shader),
-						  buf, old_va);
+			si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
+						  si_const_and_shader_buffer_descriptors_idx(shader),
+						  u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS),
+						  buf, old_va,
+						  sctx->const_and_shader_buffers[shader].shader_usage_constbuf,
+						  sctx->const_and_shader_buffers[shader].priority_constbuf);
 	}
 
 	if (rbuffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
 		for (shader = 0; shader < SI_NUM_SHADERS; shader++)
-			si_reset_buffer_resources(sctx, &sctx->shader_buffers[shader],
-						  si_shader_buffer_descriptors_idx(shader),
-						  buf, old_va);
+			si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
+						  si_const_and_shader_buffer_descriptors_idx(shader),
+						  u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS),
+						  buf, old_va,
+						  sctx->const_and_shader_buffers[shader].shader_usage,
+						  sctx->const_and_shader_buffers[shader].priority);
 	}
 
 	if (rbuffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
@@ -1717,19 +1807,21 @@
 		for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 			struct si_sampler_views *views = &sctx->samplers[shader].views;
 			struct si_descriptors *descs =
-				si_sampler_descriptors(sctx, shader);
+				si_sampler_and_image_descriptors(sctx, shader);
 			unsigned mask = views->enabled_mask;
 
 			while (mask) {
 				unsigned i = u_bit_scan(&mask);
 				if (views->views[i]->texture == buf) {
+					unsigned desc_slot = si_get_sampler_slot(i);
+
 					si_desc_reset_buffer_offset(ctx,
 								    descs->list +
-								    i * 16 + 4,
+								    desc_slot * 16 + 4,
 								    old_va, buf);
-					descs->dirty_mask |= 1u << i;
+					descs->dirty_mask |= 1ull << desc_slot;
 					sctx->descriptors_dirty |=
-						1u << si_sampler_descriptors_idx(shader);
+						1u << si_sampler_and_image_descriptors_idx(shader);
 
 					radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
 									    rbuffer, RADEON_USAGE_READ,
@@ -1745,22 +1837,25 @@
 		for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
 			struct si_images_info *images = &sctx->images[shader];
 			struct si_descriptors *descs =
-				si_image_descriptors(sctx, shader);
+				si_sampler_and_image_descriptors(sctx, shader);
 			unsigned mask = images->enabled_mask;
 
 			while (mask) {
 				unsigned i = u_bit_scan(&mask);
 
 				if (images->views[i].resource == buf) {
+					unsigned desc_slot = si_get_image_slot(i);
+
 					if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
 						si_mark_image_range_valid(&images->views[i]);
 
 					si_desc_reset_buffer_offset(
-						ctx, descs->list + i * 8 + 4,
+						ctx, descs->list + desc_slot * 8 + 4,
 						old_va, buf);
-					descs->dirty_mask |= 1u << i;
+					/* two 8-byte images share one 16-byte slot */
+					descs->dirty_mask |= 1u << (desc_slot / 2);
 					sctx->descriptors_dirty |=
-						1u << si_image_descriptors_idx(shader);
+						1u << si_sampler_and_image_descriptors_idx(shader);
 
 					radeon_add_to_buffer_list_check_mem(
 						&sctx->b, &sctx->b.gfx, rbuffer,
@@ -1770,6 +1865,184 @@
 			}
 		}
 	}
+
+	/* Bindless texture handles */
+	if (rbuffer->texture_handle_allocated) {
+		util_dynarray_foreach(&sctx->resident_tex_handles,
+				      struct si_texture_handle *, tex_handle) {
+			struct pipe_sampler_view *view = (*tex_handle)->view;
+			struct si_bindless_descriptor *desc = (*tex_handle)->desc;
+
+			if (view->texture == buf) {
+				si_set_buf_desc_address(rbuffer,
+							view->u.buf.offset,
+							&desc->desc_list[4]);
+				desc->dirty = true;
+				sctx->bindless_descriptors_dirty = true;
+
+				radeon_add_to_buffer_list_check_mem(
+					&sctx->b, &sctx->b.gfx, rbuffer,
+					RADEON_USAGE_READ,
+					RADEON_PRIO_SAMPLER_BUFFER, true);
+			}
+		}
+	}
+
+	/* Bindless image handles */
+	if (rbuffer->image_handle_allocated) {
+		util_dynarray_foreach(&sctx->resident_img_handles,
+				      struct si_image_handle *, img_handle) {
+			struct pipe_image_view *view = &(*img_handle)->view;
+			struct si_bindless_descriptor *desc = (*img_handle)->desc;
+
+			if (view->resource == buf) {
+				if (view->access & PIPE_IMAGE_ACCESS_WRITE)
+					si_mark_image_range_valid(view);
+
+				si_set_buf_desc_address(rbuffer,
+							view->u.buf.offset,
+							&desc->desc_list[4]);
+				desc->dirty = true;
+				sctx->bindless_descriptors_dirty = true;
+
+				radeon_add_to_buffer_list_check_mem(
+					&sctx->b, &sctx->b.gfx, rbuffer,
+					RADEON_USAGE_READWRITE,
+					RADEON_PRIO_SAMPLER_BUFFER, true);
+			}
+		}
+	}
+}
+
+/* Reallocate a buffer and update all resource bindings where the buffer is
+ * bound.
+ *
+ * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
+ * idle by discarding its contents. Apps usually tell us when to do this using
+ * map_buffer flags, for example.
+ */
+static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct r600_resource *rbuffer = r600_resource(buf);
+	uint64_t old_va = rbuffer->gpu_address; /* read before reallocating */
+
+	/* Reallocate the buffer in the same pipe_resource. */
+	r600_alloc_resource(&sctx->screen->b, rbuffer);
+
+	si_rebind_buffer(ctx, buf, old_va); /* gets the pre-reallocation address */
+}
+
+static void si_upload_bindless_descriptor(struct si_context *sctx,
+					  struct si_bindless_descriptor *desc)
+{
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+	uint64_t va = desc->buffer->gpu_address + desc->offset;
+	unsigned num_dwords = sizeof(desc->desc_list) / 4; /* size in 4-byte units */
+	/* WRITE_DATA packet: control word, va, va >> 32, then all of desc_list. */
+	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + num_dwords, 0));
+	radeon_emit(cs, S_370_DST_SEL(V_370_TC_L2) |
+		    S_370_WR_CONFIRM(1) |
+		    S_370_ENGINE_SEL(V_370_ME));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit_array(cs, desc->desc_list, num_dwords);
+}
+
+static void si_upload_bindless_descriptors(struct si_context *sctx)
+{
+	if (!sctx->bindless_descriptors_dirty)
+		return; /* no resident descriptor has been flagged dirty */
+
+	/* Wait for graphics/compute to be idle before updating the resident
+	 * descriptors directly in memory, in case the GPU is using them.
+	 */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+			 SI_CONTEXT_CS_PARTIAL_FLUSH;
+	si_emit_cache_flush(sctx);
+
+	util_dynarray_foreach(&sctx->resident_tex_handles,
+			      struct si_texture_handle *, tex_handle) {
+		struct si_bindless_descriptor *desc = (*tex_handle)->desc;
+
+		if (!desc->dirty)
+			continue; /* only dirty descriptors are re-uploaded */
+
+		si_upload_bindless_descriptor(sctx, desc);
+		desc->dirty = false;
+	}
+
+	util_dynarray_foreach(&sctx->resident_img_handles,
+			      struct si_image_handle *, img_handle) {
+		struct si_bindless_descriptor *desc = (*img_handle)->desc;
+
+		if (!desc->dirty)
+			continue; /* only dirty descriptors are re-uploaded */
+
+		si_upload_bindless_descriptor(sctx, desc);
+		desc->dirty = false;
+	}
+
+	/* Invalidate L1 because it doesn't know that L2 changed. */
+	sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1;
+	si_emit_cache_flush(sctx);
+
+	sctx->bindless_descriptors_dirty = false; /* both handle lists were processed */
+}
+
+/* Update mutable image descriptor fields of one resident texture handle. */
+static void si_update_resident_texture_descriptor(struct si_context *sctx,
+						  struct si_texture_handle *tex_handle)
+{
+	struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view;
+	struct si_bindless_descriptor *desc = tex_handle->desc;
+	uint32_t desc_list[16]; /* snapshot of the descriptor before the rewrite */
+
+	if (sview->base.texture->target == PIPE_BUFFER)
+		return; /* buffer views are not touched by this function */
+
+	memcpy(desc_list, desc->desc_list, sizeof(desc_list));
+	si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate,
+				 &desc->desc_list[0]);
+	/* Flag the descriptor dirty only if the rewrite changed its contents. */
+	if (memcmp(desc_list, desc->desc_list, sizeof(desc_list))) {
+		desc->dirty = true;
+		sctx->bindless_descriptors_dirty = true;
+	}
+}
+
+static void si_update_resident_image_descriptor(struct si_context *sctx,
+						struct si_image_handle *img_handle)
+{
+	struct si_bindless_descriptor *desc = img_handle->desc;
+	struct pipe_image_view *view = &img_handle->view;
+	uint32_t desc_list[16]; /* snapshot of the descriptor before the rewrite */
+
+	if (view->resource->target == PIPE_BUFFER)
+		return; /* buffer images are not touched by this function */
+
+	memcpy(desc_list, desc->desc_list, sizeof(desc_list));
+	si_set_shader_image_desc(sctx, view, true, &desc->desc_list[0]);
+	/* Flag the descriptor dirty only if the rewrite changed its contents. */
+	if (memcmp(desc_list, desc->desc_list, sizeof(desc_list))) {
+		desc->dirty = true;
+		sctx->bindless_descriptors_dirty = true;
+	}
+}
+
+static void si_update_all_resident_texture_descriptors(struct si_context *sctx)
+{
+	util_dynarray_foreach(&sctx->resident_tex_handles,
+			      struct si_texture_handle *, tex_handle) {
+		si_update_resident_texture_descriptor(sctx, *tex_handle);
+	}
+
+	util_dynarray_foreach(&sctx->resident_img_handles,
+			      struct si_image_handle *, img_handle) {
+		si_update_resident_image_descriptor(sctx, *img_handle);
+	}
+	/* Texture and image handles were both refreshed; upload any dirty ones. */
+	si_upload_bindless_descriptors(sctx);
 }
 
 /* Update mutable image descriptor fields of all bound textures. */
@@ -1810,8 +2083,10 @@
 					    samplers->views[i], true);
 		}
 
-		si_update_compressed_tex_shader_mask(sctx, shader);
+		si_update_shader_needs_decompress_mask(sctx, shader);
 	}
+
+	si_update_all_resident_texture_descriptors(sctx);
 }
 
 /* SHADER USER DATA */
@@ -1901,7 +2176,8 @@
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint64_t va;
 
-	assert(desc->buffer);
+	if (!desc->buffer)
+		return; /* the pointer is not used by current shaders */
 
 	va = desc->buffer->gpu_address +
 	     desc->buffer_offset;
@@ -1926,18 +2202,25 @@
 				       R_00B030_SPI_SHADER_USER_DATA_PS_0);
 		si_emit_shader_pointer(sctx, descs,
 				       R_00B130_SPI_SHADER_USER_DATA_VS_0);
-		si_emit_shader_pointer(sctx, descs,
-				       R_00B330_SPI_SHADER_USER_DATA_ES_0);
 
-		/* GFX9 merged LS-HS and ES-GS. Only set RW_BUFFERS for ES and LS. */
 		if (sctx->b.chip_class >= GFX9) {
+			/* GFX9 merged LS-HS and ES-GS.
+			 * Set RW_BUFFERS in the special registers, so that
+			 * it's preloaded into s[0:1] instead of s[8:9].
+			 */
 			si_emit_shader_pointer(sctx, descs,
-					       R_00B430_SPI_SHADER_USER_DATA_LS_0);
+					       R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS);
+			si_emit_shader_pointer(sctx, descs,
+					       R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS);
 		} else {
 			si_emit_shader_pointer(sctx, descs,
 					       R_00B230_SPI_SHADER_USER_DATA_GS_0);
 			si_emit_shader_pointer(sctx, descs,
+					       R_00B330_SPI_SHADER_USER_DATA_ES_0);
+			si_emit_shader_pointer(sctx, descs,
 					       R_00B430_SPI_SHADER_USER_DATA_HS_0);
+			si_emit_shader_pointer(sctx, descs,
+					       R_00B530_SPI_SHADER_USER_DATA_LS_0);
 		}
 	}
 
@@ -1979,55 +2262,623 @@
 	sctx->shader_pointers_dirty &= ~compute_mask;
 }
 
+/* BINDLESS */
+
+struct si_bindless_descriptor_slab
+{
+	struct pb_slab base; /* first member: slab_free casts a pb_slab * to this type */
+	struct r600_resource *buffer; /* buffer that every entry of this slab points into */
+	struct si_bindless_descriptor *entries; /* array of base.num_entries descriptors */
+};
+
+bool si_bindless_descriptor_can_reclaim_slab(void *priv,
+					     struct pb_slab_entry *entry)
+{
+	/* Always refuse: no bindless descriptor is reclaimed for now because
+	 * the GPU might still be using it. This should be improved later on.
+	 */
+	return false;
+}
+
+struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap,
+						  unsigned entry_size,
+						  unsigned group_index)
+{
+	struct si_context *sctx = priv;
+	struct si_screen *sscreen = sctx->screen;
+	struct si_bindless_descriptor_slab *slab;
+	/* Build one slab: a buffer split into entry_size-byte entries, all free. */
+	slab = CALLOC_STRUCT(si_bindless_descriptor_slab);
+	if (!slab)
+		return NULL;
+
+	/* Create a 64 KB buffer in VRAM, i.e. 1024 descriptors of 64 bytes each. */
+	slab->buffer = (struct r600_resource *)
+		pipe_buffer_create(&sscreen->b.b, 0,
+				   PIPE_USAGE_DEFAULT, 64 * 1024);
+	if (!slab->buffer)
+		goto fail;
+
+	slab->base.num_entries = slab->buffer->bo_size / entry_size;
+	slab->base.num_free = slab->base.num_entries;
+	slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
+	if (!slab->entries)
+		goto fail_buffer;
+
+	LIST_INITHEAD(&slab->base.free);
+
+	for (unsigned i = 0; i < slab->base.num_entries; ++i) {
+		struct si_bindless_descriptor *desc = &slab->entries[i];
+
+		desc->entry.slab = &slab->base;
+		desc->entry.group_index = group_index;
+		desc->buffer = slab->buffer;
+		desc->offset = i * entry_size; /* byte offset of entry i in the buffer */
+
+		LIST_ADDTAIL(&desc->entry.head, &slab->base.free);
+	}
+
+	/* Add the slab's buffer to the per-context list of descriptor buffers. */
+	util_dynarray_append(&sctx->bindless_descriptors,
+			    struct r600_resource *, slab->buffer);
+
+	return &slab->base;
+
+fail_buffer:
+	r600_resource_reference(&slab->buffer, NULL);
+fail:
+	FREE(slab);
+	return NULL;
+}
+
+void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab)
+{
+	struct si_context *sctx = priv;
+	struct si_bindless_descriptor_slab *slab =
+		(struct si_bindless_descriptor_slab *)pslab;
+
+	/* Remove the slab's buffer from the per-context list of descriptor buffers. */
+	util_dynarray_delete_unordered(&sctx->bindless_descriptors,
+				       struct r600_resource *, slab->buffer);
+
+	r600_resource_reference(&slab->buffer, NULL); /* drop the slab's buffer reference */
+	FREE(slab->entries);
+	FREE(slab);
+}
+
+static struct si_bindless_descriptor *
+si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
+			      unsigned size)
+{
+	struct si_screen *sscreen = sctx->screen;
+	struct si_bindless_descriptor *desc;
+	struct pb_slab_entry *entry;
+	void *ptr;
+
+	/* Sub-allocate the bindless descriptor from a slab to avoid dealing
+	 * with a ton of buffers and for reducing the winsys overhead.
+	 */
+	entry = pb_slab_alloc(&sctx->bindless_descriptor_slabs, 64, 0);
+	if (!entry)
+		return NULL;
+
+	desc = NULL;
+	desc = container_of(entry, desc, entry);
+
+	/* Upload the descriptor directly in VRAM. Slabs are currently never
+	 * reclaimed, so the map needs no synchronization. */
+	ptr = sscreen->b.ws->buffer_map(desc->buffer->buf, NULL,
+					PIPE_TRANSFER_WRITE |
+					PIPE_TRANSFER_UNSYNCHRONIZED);
+	if (!ptr) {
+		pb_slab_free(&sctx->bindless_descriptor_slabs, entry);
+		return NULL; /* mapping failed: give the entry back, report failure */
+	}
+	util_memcpy_cpu_to_le32((uint8_t *)ptr + desc->offset, desc_list, size);
+
+	/* Keep a CPU copy of the initial descriptor; buffer invalidation
+	 * may need to know the previous address. */
+	memcpy(desc->desc_list, desc_list, sizeof(desc->desc_list));
+	return desc;
+}
+
+static void si_invalidate_bindless_buf_desc(struct si_context *sctx,
+					    struct si_bindless_descriptor *desc,
+					    struct pipe_resource *resource,
+					    uint64_t offset)
+{
+	struct r600_resource *buf = r600_resource(resource);
+	uint32_t *desc_list = desc->desc_list + 4; /* dwords 4.. of the saved copy */
+	uint64_t old_desc_va;
+	/* Refresh the saved buffer descriptor if the buffer address has changed. */
+	assert(resource->target == PIPE_BUFFER);
+
+	/* Retrieve the old buffer addr from the descriptor. */
+	old_desc_va  = desc_list[0];
+	old_desc_va |= ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc_list[1]) << 32);
+
+	if (old_desc_va != buf->gpu_address + offset) {
+		/* The buffer has been invalidated while the handle wasn't
+		 * resident: update the saved descriptor and set both dirty flags.
+		 */
+		si_set_buf_desc_address(buf, offset, &desc_list[0]);
+
+		desc->dirty = true;
+		sctx->bindless_descriptors_dirty = true;
+	}
+}
+
+static uint64_t si_create_texture_handle(struct pipe_context *ctx,
+					 struct pipe_sampler_view *view,
+					 const struct pipe_sampler_state *state)
+{
+	struct si_sampler_view *sview = (struct si_sampler_view *)view;
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_texture_handle *tex_handle;
+	struct si_sampler_state *sstate;
+	uint32_t desc_list[16];
+	uint64_t handle;
+	/* Returns the new 64-bit texture handle, or 0 on failure. */
+	tex_handle = CALLOC_STRUCT(si_texture_handle);
+	if (!tex_handle)
+		return 0;
+
+	memset(desc_list, 0, sizeof(desc_list));
+	si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor);
+
+	sstate = ctx->create_sampler_state(ctx, state);
+	if (!sstate) {
+		FREE(tex_handle);
+		return 0;
+	}
+	/* Only a copy of the sampler state is kept; the temporary is deleted. */
+	si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]);
+	memcpy(&tex_handle->sstate, sstate, sizeof(*sstate));
+	ctx->delete_sampler_state(ctx, sstate);
+
+	tex_handle->desc = si_create_bindless_descriptor(sctx, desc_list,
+							 sizeof(desc_list));
+	if (!tex_handle->desc) {
+		FREE(tex_handle);
+		return 0;
+	}
+	/* The handle is the GPU address of the uploaded descriptor. */
+	handle = tex_handle->desc->buffer->gpu_address +
+		 tex_handle->desc->offset;
+	/* Go through uintptr_t: a direct uint64_t -> pointer cast warns on 32-bit. */
+	if (!_mesa_hash_table_insert(sctx->tex_handles,
+				     (void *)(uintptr_t)handle, tex_handle)) {
+		pb_slab_free(&sctx->bindless_descriptor_slabs,
+			     &tex_handle->desc->entry);
+		FREE(tex_handle);
+		return 0;
+	}
+
+	pipe_sampler_view_reference(&tex_handle->view, view);
+
+	r600_resource(sview->base.texture)->texture_handle_allocated = true;
+
+	return handle;
+}
+
+static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_texture_handle *tex_handle;
+	struct hash_entry *entry;
+	/* Destroy a texture handle; unknown handles are ignored. Key via uintptr_t. */
+	entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle);
+	if (!entry)
+		return;
+
+	tex_handle = (struct si_texture_handle *)entry->data;
+
+	pipe_sampler_view_reference(&tex_handle->view, NULL);
+	_mesa_hash_table_remove(sctx->tex_handles, entry);
+	pb_slab_free(&sctx->bindless_descriptor_slabs,
+		     &tex_handle->desc->entry);
+	FREE(tex_handle);
+}
+
+static void si_make_texture_handle_resident(struct pipe_context *ctx,
+					    uint64_t handle, bool resident)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_texture_handle *tex_handle;
+	struct si_sampler_view *sview;
+	struct hash_entry *entry;
+	/* Toggle residency of a texture handle; unknown handles are ignored. */
+	entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle);
+	if (!entry)
+		return;
+
+	tex_handle = (struct si_texture_handle *)entry->data;
+	sview = (struct si_sampler_view *)tex_handle->view;
+
+	if (resident) {
+		if (sview->base.texture->target != PIPE_BUFFER) {
+			struct r600_texture *rtex =
+				(struct r600_texture *)sview->base.texture;
+
+			if (depth_needs_decompression(rtex)) {
+				util_dynarray_append(
+					&sctx->resident_tex_needs_depth_decompress,
+					struct si_texture_handle *,
+					tex_handle);
+			}
+
+			if (color_needs_decompression(rtex)) {
+				util_dynarray_append(
+					&sctx->resident_tex_needs_color_decompress,
+					struct si_texture_handle *,
+					tex_handle);
+			}
+
+			if (rtex->dcc_offset &&
+			    p_atomic_read(&rtex->framebuffers_bound))
+				sctx->need_check_render_feedback = true;
+
+			/* Re-upload the descriptor if it has been updated
+			 * while it wasn't resident.
+			 */
+			si_update_resident_texture_descriptor(sctx, tex_handle);
+			if (tex_handle->desc->dirty)
+				sctx->bindless_descriptors_dirty = true;
+		} else {
+			si_invalidate_bindless_buf_desc(sctx, tex_handle->desc,
+							sview->base.texture,
+							sview->base.u.buf.offset);
+		}
+
+		/* Add the texture handle to the per-context list. */
+		util_dynarray_append(&sctx->resident_tex_handles,
+				     struct si_texture_handle *, tex_handle);
+
+		/* Add the buffers to the current CS in case si_begin_new_cs()
+		 * is not going to be called.
+		 */
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  tex_handle->desc->buffer,
+					  RADEON_USAGE_READWRITE,
+					  RADEON_PRIO_DESCRIPTORS);
+
+		si_sampler_view_add_buffer(sctx, sview->base.texture,
+					   RADEON_USAGE_READ,
+					   sview->is_stencil_sampler, false);
+	} else {
+		/* Remove the texture handle from the per-context list. */
+		util_dynarray_delete_unordered(&sctx->resident_tex_handles,
+					       struct si_texture_handle *,
+					       tex_handle);
+
+		if (sview->base.texture->target != PIPE_BUFFER) {
+			util_dynarray_delete_unordered(
+				&sctx->resident_tex_needs_depth_decompress,
+				struct si_texture_handle *, tex_handle);
+
+			util_dynarray_delete_unordered(
+				&sctx->resident_tex_needs_color_decompress,
+				struct si_texture_handle *, tex_handle);
+		}
+	}
+}
+
+static uint64_t si_create_image_handle(struct pipe_context *ctx,
+				       const struct pipe_image_view *view)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_image_handle *img_handle;
+	uint32_t desc_list[16];
+	uint64_t handle;
+	/* Returns the new 64-bit image handle, or 0 on failure or a NULL view. */
+	if (!view || !view->resource)
+		return 0;
+
+	img_handle = CALLOC_STRUCT(si_image_handle);
+	if (!img_handle)
+		return 0;
+
+	memset(desc_list, 0, sizeof(desc_list));
+	si_init_descriptor_list(&desc_list[0], 8, 1, null_image_descriptor);
+
+	si_set_shader_image_desc(sctx, view, false, &desc_list[0]);
+
+	img_handle->desc = si_create_bindless_descriptor(sctx, desc_list,
+							 sizeof(desc_list));
+	if (!img_handle->desc) {
+		FREE(img_handle);
+		return 0;
+	}
+	/* The handle is the GPU address of the uploaded descriptor. */
+	handle = img_handle->desc->buffer->gpu_address +
+		 img_handle->desc->offset;
+	/* Go through uintptr_t: a direct uint64_t -> pointer cast warns on 32-bit. */
+	if (!_mesa_hash_table_insert(sctx->img_handles,
+				     (void *)(uintptr_t)handle, img_handle)) {
+		pb_slab_free(&sctx->bindless_descriptor_slabs,
+			     &img_handle->desc->entry);
+		FREE(img_handle);
+		return 0;
+	}
+
+	util_copy_image_view(&img_handle->view, view);
+
+	r600_resource(view->resource)->image_handle_allocated = true;
+
+	return handle;
+}
+
+static void si_delete_image_handle(struct pipe_context *ctx, uint64_t handle)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_image_handle *img_handle;
+	struct hash_entry *entry;
+	/* Destroy an image handle; unknown handles are ignored. Key via uintptr_t. */
+	entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle);
+	if (!entry)
+		return;
+
+	img_handle = (struct si_image_handle *)entry->data;
+
+	util_copy_image_view(&img_handle->view, NULL);
+	_mesa_hash_table_remove(sctx->img_handles, entry);
+	pb_slab_free(&sctx->bindless_descriptor_slabs,
+		     &img_handle->desc->entry);
+	FREE(img_handle);
+}
+
+static void si_make_image_handle_resident(struct pipe_context *ctx,
+					  uint64_t handle, unsigned access,
+					  bool resident)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_image_handle *img_handle;
+	struct pipe_image_view *view;
+	struct r600_resource *res;
+	struct hash_entry *entry;
+	/* Toggle residency of an image handle; unknown handles are ignored. */
+	entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle);
+	if (!entry)
+		return;
+
+	img_handle = (struct si_image_handle *)entry->data;
+	view = &img_handle->view;
+	res = (struct r600_resource *)view->resource;
+
+	if (resident) {
+		if (res->b.b.target != PIPE_BUFFER) {
+			struct r600_texture *rtex = (struct r600_texture *)res;
+			unsigned level = view->u.tex.level;
+
+			if (color_needs_decompression(rtex)) {
+				util_dynarray_append(
+					&sctx->resident_img_needs_color_decompress,
+					struct si_image_handle *,
+					img_handle);
+			}
+
+			if (vi_dcc_enabled(rtex, level) &&
+			    p_atomic_read(&rtex->framebuffers_bound))
+				sctx->need_check_render_feedback = true;
+
+			/* Re-upload the descriptor if it has been updated
+			 * while it wasn't resident.
+			 */
+			si_update_resident_image_descriptor(sctx, img_handle);
+			if (img_handle->desc->dirty)
+				sctx->bindless_descriptors_dirty = true;
+
+		} else {
+			si_invalidate_bindless_buf_desc(sctx, img_handle->desc,
+							view->resource,
+							view->u.buf.offset);
+		}
+
+		/* Add the image handle to the per-context list. */
+		util_dynarray_append(&sctx->resident_img_handles,
+				     struct si_image_handle *, img_handle);
+
+		/* Add the buffers to the current CS in case si_begin_new_cs()
+		 * is not going to be called.
+		 */
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  img_handle->desc->buffer,
+					  RADEON_USAGE_READWRITE,
+					  RADEON_PRIO_DESCRIPTORS);
+
+		si_sampler_view_add_buffer(sctx, view->resource,
+					   (access & PIPE_IMAGE_ACCESS_WRITE) ?
+					   RADEON_USAGE_READWRITE :
+					   RADEON_USAGE_READ, false, false);
+	} else {
+		/* Remove the image handle from the per-context list. */
+		util_dynarray_delete_unordered(&sctx->resident_img_handles,
+					       struct si_image_handle *,
+					       img_handle);
+
+		if (res->b.b.target != PIPE_BUFFER) {
+			util_dynarray_delete_unordered(
+				&sctx->resident_img_needs_color_decompress,
+				struct si_image_handle *,
+				img_handle);
+		}
+	}
+}
+
+
+void si_all_resident_buffers_begin_new_cs(struct si_context *sctx)
+{
+	unsigned num_resident_tex_handles, num_resident_img_handles;
+	/* Re-add the buffers behind all resident handles to the gfx buffer list. */
+	num_resident_tex_handles = sctx->resident_tex_handles.size /
+				   sizeof(struct si_texture_handle *);
+	num_resident_img_handles = sctx->resident_img_handles.size /
+				   sizeof(struct si_image_handle *);
+
+	/* Nothing to do when no texture or image handle is resident: the
+	 * bindless descriptor buffers are not added either. */
+	if (!num_resident_tex_handles && !num_resident_img_handles)
+		return;
+
+	/* Add all bindless descriptor buffers. */
+	util_dynarray_foreach(&sctx->bindless_descriptors,
+			      struct r600_resource *, desc) {
+
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *desc,
+					  RADEON_USAGE_READWRITE,
+					  RADEON_PRIO_DESCRIPTORS);
+	}
+
+	/* Add all resident texture handles. */
+	util_dynarray_foreach(&sctx->resident_tex_handles,
+			      struct si_texture_handle *, tex_handle) {
+		struct si_sampler_view *sview =
+			(struct si_sampler_view *)(*tex_handle)->view;
+
+		si_sampler_view_add_buffer(sctx, sview->base.texture,
+					   RADEON_USAGE_READ,
+					   sview->is_stencil_sampler, false);
+	}
+
+	/* Add all resident image handles (always as read-write here). */
+	util_dynarray_foreach(&sctx->resident_img_handles,
+			      struct si_image_handle *, img_handle) {
+		struct pipe_image_view *view = &(*img_handle)->view;
+
+		si_sampler_view_add_buffer(sctx, view->resource,
+					   RADEON_USAGE_READWRITE,
+					   false, false);
+	}
+
+	sctx->b.num_resident_handles += num_resident_tex_handles +
+					num_resident_img_handles;
+}
+
 /* INIT/DEINIT/UPLOAD */
 
+/* GFX9 has only 4KB of CE, while previous chips had 32KB. In order
+ * to make CE RAM as useful as possible, this defines limits
+ * for the number of slots that can be in CE RAM on GFX9. If a shader
+ * is using more, descriptors will be uploaded to memory directly and
+ * CE won't be used.
+ *
+ * These numbers are based on shader-db.
+ */
+static const unsigned gfx9_max_ce_samplers[SI_NUM_SHADERS] = { /* read-only table */
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 1,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 24,
+	[PIPE_SHADER_COMPUTE] = 16,
+};
+static const unsigned gfx9_max_ce_images[SI_NUM_SHADERS] = {
+	/* these must be even due to slot alignment */
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 2,
+	[PIPE_SHADER_COMPUTE] = 8,
+};
+static const unsigned gfx9_max_ce_const_buffers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 9,
+	[PIPE_SHADER_TESS_CTRL] = 3,
+	[PIPE_SHADER_TESS_EVAL] = 5,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 8,
+	[PIPE_SHADER_COMPUTE] = 6,
+};
+static const unsigned gfx9_max_ce_shader_buffers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 12,
+	[PIPE_SHADER_COMPUTE] = 13,
+};
+
 void si_init_all_descriptors(struct si_context *sctx)
 {
 	int i;
 	unsigned ce_offset = 0;
 
+	STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0);
+	STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
+
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		/* GFX9 has only 4KB of CE, while previous chips had 32KB.
-		 * Rarely used descriptors don't use CE RAM.
-		 */
-		bool big_ce = sctx->b.chip_class <= VI;
-		bool images_use_ce = big_ce;
-		bool shaderbufs_use_ce = big_ce ||
-					 i == PIPE_SHADER_COMPUTE;
-		bool samplers_use_ce = big_ce ||
-				       i == PIPE_SHADER_FRAGMENT;
+		bool gfx9_tcs = false;
+		bool gfx9_gs = false;
+		unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
+		unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
 
-		si_init_buffer_resources(&sctx->const_buffers[i],
-					 si_const_buffer_descriptors(sctx, i),
-					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
-					 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER,
+		unsigned first_sampler_ce_slot = 0;
+		unsigned num_sampler_ce_slots = num_sampler_slots;
+
+		unsigned first_buffer_ce_slot = 0;
+		unsigned num_buffer_ce_slots = num_buffer_slots;
+
+		/* Adjust CE slot ranges based on GFX9 CE RAM limits. */
+		if (sctx->b.chip_class >= GFX9) {
+			gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
+			gfx9_gs = i == PIPE_SHADER_GEOMETRY;
+
+			first_sampler_ce_slot =
+				si_get_image_slot(gfx9_max_ce_images[i] - 1) / 2;
+			num_sampler_ce_slots = gfx9_max_ce_images[i] / 2 +
+					       gfx9_max_ce_samplers[i];
+
+			first_buffer_ce_slot =
+				si_get_shaderbuf_slot(gfx9_max_ce_shader_buffers[i] - 1);
+			num_buffer_ce_slots = gfx9_max_ce_shader_buffers[i] +
+					      gfx9_max_ce_const_buffers[i];
+		}
+
+		si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i],
+					 si_const_and_shader_buffer_descriptors(sctx, i),
+					 num_buffer_slots,
+					 first_buffer_ce_slot, num_buffer_ce_slots,
+					 gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
+					 gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
+						   SI_SGPR_CONST_AND_SHADER_BUFFERS,
+					 RADEON_USAGE_READWRITE,
+					 RADEON_USAGE_READ,
+					 RADEON_PRIO_SHADER_RW_BUFFER,
+					 RADEON_PRIO_CONST_BUFFER,
 					 &ce_offset);
-		si_init_buffer_resources(&sctx->shader_buffers[i],
-					 si_shader_buffer_descriptors(sctx, i),
-					 SI_NUM_SHADER_BUFFERS, SI_SGPR_SHADER_BUFFERS,
-					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER,
-					 shaderbufs_use_ce ? &ce_offset : NULL);
 
-		si_init_descriptors(si_sampler_descriptors(sctx, i),
-				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
-				    null_texture_descriptor,
-				    samplers_use_ce ? &ce_offset : NULL);
+		struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
+		si_init_descriptors(sctx, desc,
+				    gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
+				    gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
+					      SI_SGPR_SAMPLERS_AND_IMAGES,
+				    16, num_sampler_slots,
+				    first_sampler_ce_slot, num_sampler_ce_slots,
+				    &ce_offset);
 
-		si_init_descriptors(si_image_descriptors(sctx, i),
-				    SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
-				    null_image_descriptor,
-				    images_use_ce ? &ce_offset : NULL);
+		int j;
+		for (j = 0; j < SI_NUM_IMAGES; j++)
+			memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
+		for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++)
+			memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
 	}
 
-	si_init_buffer_resources(&sctx->rw_buffers,
+	si_init_buffer_resources(sctx, &sctx->rw_buffers,
 				 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
-				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
-				 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS,
+				 SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS,
+				 SI_SGPR_RW_BUFFERS,
+				 /* The second set of usage/priority is used by
+				  * const buffers in RW buffer slots. */
+				 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
+				 RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
 				 &ce_offset);
-	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-			    4, SI_NUM_VERTEX_BUFFERS, NULL, NULL);
+	sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
+
+	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
+			    4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL);
 
 	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
+	sctx->total_ce_ram_allocated = ce_offset;
 
 	if (sctx->b.chip_class >= GFX9)
 		assert(ce_offset <= 4096);
@@ -2042,7 +2893,14 @@
 	sctx->b.b.set_shader_buffers = si_set_shader_buffers;
 	sctx->b.b.set_sampler_views = si_set_sampler_views;
 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
+	sctx->b.b.create_texture_handle = si_create_texture_handle;
+	sctx->b.b.delete_texture_handle = si_delete_texture_handle;
+	sctx->b.b.make_texture_handle_resident = si_make_texture_handle_resident;
+	sctx->b.b.create_image_handle = si_create_image_handle;
+	sctx->b.b.delete_image_handle = si_delete_image_handle;
+	sctx->b.b.make_image_handle_resident = si_make_image_handle_resident;
 	sctx->b.invalidate_buffer = si_invalidate_buffer;
+	sctx->b.rebind_buffer = si_rebind_buffer;
 
 	/* Shader user data. */
 	si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata,
@@ -2082,6 +2940,9 @@
 	}
 
 	sctx->descriptors_dirty &= ~mask;
+
+	si_upload_bindless_descriptors(sctx);
+
 	return true;
 }
 
@@ -2106,6 +2967,8 @@
 
 	sctx->descriptors_dirty &= ~mask;
 
+	si_upload_bindless_descriptors(sctx);
+
 	return true;
 }
 
@@ -2114,15 +2977,15 @@
 	int i;
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		si_release_buffer_resources(&sctx->const_buffers[i],
-					    si_const_buffer_descriptors(sctx, i));
-		si_release_buffer_resources(&sctx->shader_buffers[i],
-					    si_shader_buffer_descriptors(sctx, i));
+		si_release_buffer_resources(&sctx->const_and_shader_buffers[i],
+					    si_const_and_shader_buffer_descriptors(sctx, i));
 		si_release_sampler_views(&sctx->samplers[i].views);
 		si_release_image_views(&sctx->images[i]);
 	}
 	si_release_buffer_resources(&sctx->rw_buffers,
 				    &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
+	for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++)
+		pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]);
 
 	for (i = 0; i < SI_NUM_DESCS; ++i)
 		si_release_descriptors(&sctx->descriptors[i]);
@@ -2134,8 +2997,7 @@
 	int i;
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
-		si_buffer_resources_begin_new_cs(sctx, &sctx->shader_buffers[i]);
+		si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]);
 		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
 		si_image_views_begin_new_cs(sctx, &sctx->images[i]);
 	}
@@ -2147,3 +3009,61 @@
 
 	si_shader_userdata_begin_new_cs(sctx);
 }
+
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+			       uint64_t new_active_mask)
+{
+	struct si_descriptors *desc = &sctx->descriptors[desc_idx];
+	/* Set the active slot range of one descriptor array from a bitmask. */
+	/* Ignore no-op updates and updates that disable all slots. */
+	if (!new_active_mask ||
+	    new_active_mask == u_bit_consecutive64(desc->first_active_slot,
+						   desc->num_active_slots))
+		return;
+
+	int first, count;
+	u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
+	assert(new_active_mask == 0); /* the mask must be one consecutive range */
+
+	/* Upload/dump descriptors if the new range extends past the old one. */
+	if (first < desc->first_active_slot ||
+	    first + count > desc->first_active_slot + desc->num_active_slots)
+		sctx->descriptors_dirty |= 1u << desc_idx;
+
+	/* CE is used only when the CE slot range covers the whole active range. */
+	bool used_ce = desc->uses_ce;
+	desc->uses_ce = desc->first_ce_slot <= first &&
+			desc->first_ce_slot + desc->num_ce_slots >= first + count;
+
+	if (desc->uses_ce != used_ce) {
+		/* Upload or dump descriptors if we're disabling or enabling CE,
+		 * respectively. */
+		sctx->descriptors_dirty |= 1u << desc_idx;
+
+		/* If we're enabling CE, re-upload all descriptors to CE RAM.
+		 * When CE was disabled, uploads to CE RAM stopped.
+		 */
+		if (desc->uses_ce) {
+			desc->dirty_mask |=
+				u_bit_consecutive64(desc->first_ce_slot,
+						    desc->num_ce_slots);
+		}
+	}
+
+	desc->first_active_slot = first;
+	desc->num_active_slots = count;
+}
+
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+					  struct si_shader_selector *sel)
+{
+	if (!sel)
+		return;
+	/* Apply the selector's active-slot masks to both of its descriptor arrays. */
+	si_set_active_descriptors(sctx,
+		si_const_and_shader_buffer_descriptors_idx(sel->type),
+		sel->active_const_and_shader_buffers);
+	si_set_active_descriptors(sctx,
+		si_sampler_and_image_descriptors_idx(sel->type),
+		sel->active_samplers_and_images);
+}
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index e15f6a9..345825a 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -32,21 +32,21 @@
 	/* Ensure we have enough space to start a new range in a hole */
 	assert(element_size >= 3);
 
-	/* 5 dwords for possible load to reinitialize when we have no preamble
-	 * IB + 5 dwords for write to L2 + 3 bytes for every range written to
-	 * CE RAM.
+	/* 5 dwords for write to L2 + 3 bytes for the packet header of
+	 * every disjoint range written to CE RAM.
 	 */
-	return 5 + 5 + 3 + count * element_size;
+	return 5 + (3 * count / 2) + count * element_size;
 }
 
 static unsigned si_ce_needed_cs_space(void)
 {
 	unsigned space = 0;
 
-	space += si_descriptor_list_cs_space(SI_NUM_CONST_BUFFERS, 4);
-	space += si_descriptor_list_cs_space(SI_NUM_SHADER_BUFFERS, 4);
-	space += si_descriptor_list_cs_space(SI_NUM_SAMPLERS, 16);
-	space += si_descriptor_list_cs_space(SI_NUM_IMAGES, 8);
+	space += si_descriptor_list_cs_space(SI_NUM_SHADER_BUFFERS +
+					     SI_NUM_CONST_BUFFERS, 4);
+	/* two 8-byte images share one 16-byte slot */
+	space += si_descriptor_list_cs_space(SI_NUM_IMAGES / 2 +
+					     SI_NUM_SAMPLERS, 16);
 	space *= SI_NUM_SHADERS;
 
 	space += si_descriptor_list_cs_space(SI_NUM_RW_BUFFERS, 4);
@@ -123,6 +123,10 @@
 
 	ctx->gfx_flush_in_progress = true;
 
+	/* This CE dump should be done in parallel with the last draw. */
+	if (ctx->ce_ib)
+		si_ce_save_all_descriptors_at_ib_end(ctx);
+
 	r600_preflush_suspend_features(&ctx->b);
 
 	ctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
@@ -207,8 +211,8 @@
 	else if (ctx->ce_ib)
 		si_ce_enable_loads(ctx->ce_ib);
 
-	if (ctx->ce_preamble_ib)
-		si_ce_reinitialize_all_descriptors(ctx);
+	if (ctx->ce_ib)
+		si_ce_restore_all_descriptors_at_ib_start(ctx);
 
 	if (ctx->b.chip_class >= CIK)
 		si_mark_atom_dirty(ctx, &ctx->prefetch_L2);
@@ -231,6 +235,7 @@
 	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
 	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 	si_all_descriptors_begin_new_cs(ctx);
+	si_all_resident_buffers_begin_new_cs(ctx);
 
 	ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
 	ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 836844e..71cae15 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -27,6 +27,7 @@
 #include "sid.h"
 
 #include "radeon/radeon_uvd.h"
+#include "util/hash_table.h"
 #include "util/u_memory.h"
 #include "util/u_suballoc.h"
 #include "util/u_tests.h"
@@ -45,13 +46,15 @@
 	 * properly.
 	 */
 	struct pipe_framebuffer_state fb = {};
-	context->set_framebuffer_state(context, &fb);
+	if (context->set_framebuffer_state)
+		context->set_framebuffer_state(context, &fb);
 
 	si_release_all_descriptors(sctx);
 
 	if (sctx->ce_suballocator)
 		u_suballocator_destroy(sctx->ce_suballocator);
 
+	r600_resource_reference(&sctx->ce_ram_saved_buffer, NULL);
 	pipe_resource_reference(&sctx->esgs_ring, NULL);
 	pipe_resource_reference(&sctx->gsvs_ring, NULL);
 	pipe_resource_reference(&sctx->tf_ring, NULL);
@@ -61,6 +64,7 @@
 	free(sctx->border_color_table);
 	r600_resource_reference(&sctx->scratch_buffer, NULL);
 	r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
+	r600_resource_reference(&sctx->wait_mem_scratch, NULL);
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
 	if (sctx->init_config_gs_rings)
@@ -74,10 +78,10 @@
 		sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush);
 	if (sctx->custom_blend_resolve)
 		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve);
-	if (sctx->custom_blend_decompress)
-		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress);
-	if (sctx->custom_blend_fastclear)
-		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear);
+	if (sctx->custom_blend_fmask_decompress)
+		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fmask_decompress);
+	if (sctx->custom_blend_eliminate_fastclear)
+		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_eliminate_fastclear);
 	if (sctx->custom_blend_dcc_decompress)
 		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_dcc_decompress);
 
@@ -92,6 +96,17 @@
 	r600_resource_reference(&sctx->last_trace_buf, NULL);
 	radeon_clear_saved_cs(&sctx->last_gfx);
 
+	pb_slabs_deinit(&sctx->bindless_descriptor_slabs);
+	util_dynarray_fini(&sctx->bindless_descriptors);
+
+	_mesa_hash_table_destroy(sctx->tex_handles, NULL);
+	_mesa_hash_table_destroy(sctx->img_handles, NULL);
+
+	util_dynarray_fini(&sctx->resident_tex_handles);
+	util_dynarray_fini(&sctx->resident_img_handles);
+	util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
+	util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
+	util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
 	FREE(sctx);
 }
 
@@ -126,11 +141,12 @@
 	char features[256];
 
 	snprintf(features, sizeof(features),
-		 "+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s",
+		 "+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s%s",
 		 sscreen->b.chip_class >= GFX9 ? ",+xnack" : ",-xnack",
+		 sscreen->llvm_has_working_vgpr_indexing ? "" : ",-promote-alloca",
 		 sscreen->b.debug_flags & DBG_SI_SCHED ? ",+si-scheduler" : "");
 
-	return LLVMCreateTargetMachine(si_llvm_get_amdgpu_target(triple), triple,
+	return LLVMCreateTargetMachine(ac_get_llvm_target(triple), triple,
 				       r600_get_llvm_processor_name(sscreen->b.family),
 				       features,
 				       LLVMCodeGenLevelDefault,
@@ -139,7 +155,7 @@
 }
 
 static struct pipe_context *si_create_context(struct pipe_screen *screen,
-                                              void *priv, unsigned flags)
+                                              unsigned flags)
 {
 	struct si_context *sctx = CALLOC_STRUCT(si_context);
 	struct si_screen* sscreen = (struct si_screen *)screen;
@@ -149,14 +165,11 @@
 	if (!sctx)
 		return NULL;
 
-	if (sscreen->b.debug_flags & DBG_CHECK_VM)
-		flags |= PIPE_CONTEXT_DEBUG;
-
 	if (flags & PIPE_CONTEXT_DEBUG)
 		sscreen->record_llvm_ir = true; /* racy but not critical */
 
 	sctx->b.b.screen = screen; /* this must be set first */
-	sctx->b.b.priv = priv;
+	sctx->b.b.priv = NULL;
 	sctx->b.b.destroy = si_destroy_context;
 	sctx->b.b.emit_string_marker = si_emit_string_marker;
 	sctx->b.set_atom_dirty = (void *)si_set_atom_dirty;
@@ -174,7 +187,7 @@
 	si_init_cp_dma_functions(sctx);
 	si_init_debug_functions(sctx);
 
-	if (sscreen->b.info.has_uvd) {
+	if (sscreen->b.info.has_hw_decode) {
 		sctx->b.b.create_video_codec = si_uvd_create_decoder;
 		sctx->b.b.create_video_buffer = si_video_buffer_create;
 	} else {
@@ -185,15 +198,24 @@
 	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX,
 				       si_context_gfx_flush, sctx);
 
-	/* SI + AMDGPU + CE = GPU hang */
-	if (!(sscreen->b.debug_flags & DBG_NO_CE) && ws->cs_add_const_ib &&
-	    sscreen->b.chip_class != SI &&
-	    /* These can't use CE due to a power gating bug in the kernel. */
-	    sscreen->b.family != CHIP_CARRIZO &&
-	    sscreen->b.family != CHIP_STONEY &&
-	    /* Some CE bug is causing green screen corruption w/ MPV video
-	     * playback and occasional corruption w/ 3D. */
-	    sscreen->b.chip_class != GFX9) {
+	bool enable_ce = sscreen->b.chip_class != SI && /* SI hangs */
+			 /* These can't use CE due to a power gating bug in the kernel. */
+			 sscreen->b.family != CHIP_CARRIZO &&
+			 sscreen->b.family != CHIP_STONEY;
+
+	/* CE is currently disabled by default because it makes s_load latency
+	 * worse: the CE IB doesn't run in lockstep with DE.
+	 * Remove the assignment below once that performance issue is resolved.
+	 */
+	enable_ce = false;
+
+	/* Apply CE overrides. */
+	if (sscreen->b.debug_flags & DBG_NO_CE)
+		enable_ce = false;
+	else if (sscreen->b.debug_flags & DBG_CE)
+		enable_ce = true;
+
+	if (ws->cs_add_const_ib && enable_ce) {
 		sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs);
 		if (!sctx->ce_ib)
 			goto fail;
@@ -258,6 +280,23 @@
 	/* these must be last */
 	si_begin_new_cs(sctx);
 
+	if (sctx->b.chip_class >= GFX9) {
+		sctx->wait_mem_scratch = (struct r600_resource*)
+			pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4);
+		if (!sctx->wait_mem_scratch)
+			goto fail;
+
+		/* Initialize the memory. */
+		struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+		radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
+			    S_370_WR_CONFIRM(1) |
+			    S_370_ENGINE_SEL(V_370_ME));
+		radeon_emit(cs, sctx->wait_mem_scratch->gpu_address);
+		radeon_emit(cs, sctx->wait_mem_scratch->gpu_address >> 32);
+		radeon_emit(cs, sctx->wait_mem_number);
+	}
+
 	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads
 	 * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
 	if (sctx->b.chip_class == CIK) {
@@ -279,6 +318,8 @@
 
 		si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS,
 				 &sctx->null_const_buf);
+		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
+				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
 				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
@@ -314,6 +355,27 @@
 
 	sctx->tm = si_create_llvm_target_machine(sscreen);
 
+	/* Create a slab allocator for all bindless descriptors. */
+	if (!pb_slabs_init(&sctx->bindless_descriptor_slabs, 6, 6, 1, sctx,
+			   si_bindless_descriptor_can_reclaim_slab,
+			   si_bindless_descriptor_slab_alloc,
+			   si_bindless_descriptor_slab_free))
+		goto fail;
+
+	util_dynarray_init(&sctx->bindless_descriptors, NULL);
+
+	/* Bindless handles. */
+	sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+						    _mesa_key_pointer_equal);
+	sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+						    _mesa_key_pointer_equal);
+
+	util_dynarray_init(&sctx->resident_tex_handles, NULL);
+	util_dynarray_init(&sctx->resident_img_handles, NULL);
+	util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL);
+	util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL);
+	util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL);
+
 	return &sctx->b.b;
 fail:
 	fprintf(stderr, "radeonsi: Failed to create a context.\n");
@@ -321,6 +383,42 @@
 	return NULL;
 }
 
+static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen,
+						   void *priv, unsigned flags)
+{
+	struct si_screen *sscreen = (struct si_screen *)screen;
+	struct pipe_context *ctx;
+
+	if (sscreen->b.debug_flags & DBG_CHECK_VM)
+		flags |= PIPE_CONTEXT_DEBUG;
+
+	ctx = si_create_context(screen, flags);
+
+	if (!(flags & PIPE_CONTEXT_PREFER_THREADED))
+		return ctx;
+
+	/* Clover (compute-only) is unsupported.
+	 *
+	 * Since the threaded context creates shader states from the non-driver
+	 * thread, asynchronous compilation is required for create_{shader}_-
+	 * state not to use pipe_context. Debug contexts (ddebug) disable
+	 * asynchronous compilation, so don't use the threaded context with
+	 * those.
+	 */
+	if (flags & (PIPE_CONTEXT_COMPUTE_ONLY | PIPE_CONTEXT_DEBUG))
+		return ctx;
+
+	/* When shaders are logged to stderr, asynchronous compilation is
+	 * disabled too. */
+	if (sscreen->b.debug_flags & (DBG_VS | DBG_TCS | DBG_TES | DBG_GS |
+				      DBG_PS | DBG_CS))
+		return ctx;
+
+	return threaded_context_create(ctx, &sscreen->b.pool_transfers,
+				       r600_replace_buffer_storage,
+				       &((struct si_context*)ctx)->b.tc);
+}
+
 /*
  * pipe_screen
  */
@@ -328,8 +426,7 @@
 {
 	/* Old kernels disallowed some register writes for SI
 	 * that are used for indirect dispatches. */
-	return HAVE_LLVM >= 0x309 &&
-	       (sscreen->b.chip_class >= CIK ||
+	return (sscreen->b.chip_class >= CIK ||
 		sscreen->b.info.drm_major == 3 ||
 		(sscreen->b.info.drm_major == 2 &&
 		 sscreen->b.info.drm_minor >= 45));
@@ -423,12 +520,17 @@
 	case PIPE_CAP_DOUBLES:
 	case PIPE_CAP_TGSI_TEX_TXF_LZ:
 	case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+	case PIPE_CAP_BINDLESS_TEXTURE:
+	case PIPE_CAP_QUERY_TIMESTAMP:
+	case PIPE_CAP_QUERY_TIME_ELAPSED:
 		return 1;
 
 	case PIPE_CAP_INT64:
 	case PIPE_CAP_INT64_DIVMOD:
 	case PIPE_CAP_TGSI_CLOCK:
-		return HAVE_LLVM >= 0x0309;
+	case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+	case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+		return 1;
 
 	case PIPE_CAP_TGSI_VOTE:
 		return HAVE_LLVM >= 0x0400;
@@ -459,17 +561,13 @@
 	case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
 	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
 	case PIPE_CAP_MAX_VERTEX_STREAMS:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 4;
 
-	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
-		return HAVE_LLVM >= 0x0309 ? 4 : 0;
-
 	case PIPE_CAP_GLSL_FEATURE_LEVEL:
-		if (sscreen->b.chip_class >= GFX9)
-			return 140;
 		if (si_have_tgsi_compute(sscreen))
 			return 450;
-		return HAVE_LLVM >= 0x0309 ? 420 : 410;
+		return 420;
 
 	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
 		return MIN2(sscreen->b.info.max_alloc_size, INT_MAX);
@@ -484,6 +582,9 @@
 			sscreen->b.info.drm_minor < 50);
 
 	case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
+		/* TODO: GFX9 hangs. */
+		if (sscreen->b.chip_class >= GFX9)
+			return 0;
 		/* Disable on SI due to VM faults in CP DMA. Enable once these
 		 * faults are mitigated in software.
 		 */
@@ -508,6 +609,7 @@
 	case PIPE_CAP_TGSI_MUL_ZERO_WINS:
 	case PIPE_CAP_UMA:
 	case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
+	case PIPE_CAP_POST_DEPTH_COVERAGE:
 		return 0;
 
 	case PIPE_CAP_QUERY_BUFFER_OBJECT:
@@ -557,11 +659,6 @@
 	case PIPE_CAP_MAX_RENDER_TARGETS:
 		return 8;
 
-	/* Timer queries, present when the clock frequency is non zero. */
-	case PIPE_CAP_QUERY_TIMESTAMP:
-	case PIPE_CAP_QUERY_TIME_ELAPSED:
-		return sscreen->b.info.clock_crystal_freq != 0;
-
  	case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
 	case PIPE_CAP_MIN_TEXEL_OFFSET:
 		return -32;
@@ -601,12 +698,9 @@
 	{
 	case PIPE_SHADER_FRAGMENT:
 	case PIPE_SHADER_VERTEX:
-		break;
 	case PIPE_SHADER_GEOMETRY:
 	case PIPE_SHADER_TESS_CTRL:
 	case PIPE_SHADER_TESS_EVAL:
-		if (sscreen->b.chip_class >= GFX9)
-			return 0;
 		break;
 	case PIPE_SHADER_COMPUTE:
 		switch (param) {
@@ -662,9 +756,9 @@
 	case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
 		return SI_NUM_SAMPLERS;
 	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-		return HAVE_LLVM >= 0x0309 ? SI_NUM_SHADER_BUFFERS : 0;
+		return SI_NUM_SHADER_BUFFERS;
 	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
-		return HAVE_LLVM >= 0x0309 ? SI_NUM_IMAGES : 0;
+		return SI_NUM_IMAGES;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
@@ -675,19 +769,27 @@
 	/* Supported boolean features. */
 	case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
-	case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
 	case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
 	case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
 	case PIPE_SHADER_CAP_INTEGERS:
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+	case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
 		return 1;
 
 	case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
-		/* TODO: Indirection of geometry shader input dimension is not
-		 * handled yet
-		 */
-		return shader != PIPE_SHADER_GEOMETRY;
+		/* TODO: Indirect indexing of GS inputs is unimplemented. */
+		return shader != PIPE_SHADER_GEOMETRY &&
+		       (sscreen->llvm_has_working_vgpr_indexing ||
+			/* TCS and TES load inputs directly from LDS or
+			 * offchip memory, so indirect indexing is trivial. */
+			shader == PIPE_SHADER_TESS_CTRL ||
+			shader == PIPE_SHADER_TESS_EVAL);
+
+	case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+		return sscreen->llvm_has_working_vgpr_indexing ||
+		       /* TCS stores outputs directly to memory. */
+		       shader == PIPE_SHADER_TESS_CTRL;
 
 	/* Unsupported boolean features. */
 	case PIPE_SHADER_CAP_SUBROUTINES:
@@ -704,7 +806,6 @@
 	struct si_screen *sscreen = (struct si_screen *)pscreen;
 	struct si_shader_part *parts[] = {
 		sscreen->vs_prologs,
-		sscreen->vs_epilogs,
 		sscreen->tcs_epilogs,
 		sscreen->gs_prologs,
 		sscreen->ps_prologs,
@@ -716,11 +817,16 @@
 		return;
 
 	util_queue_destroy(&sscreen->shader_compiler_queue);
+	util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
 
 	for (i = 0; i < ARRAY_SIZE(sscreen->tm); i++)
 		if (sscreen->tm[i])
 			LLVMDisposeTargetMachine(sscreen->tm[i]);
 
+	for (i = 0; i < ARRAY_SIZE(sscreen->tm_low_priority); i++)
+		if (sscreen->tm_low_priority[i])
+			LLVMDisposeTargetMachine(sscreen->tm_low_priority[i]);
+
 	/* Free shader parts. */
 	for (i = 0; i < ARRAY_SIZE(parts); i++) {
 		while (parts[i]) {
@@ -831,17 +937,18 @@
 	exit(0);
 }
 
-struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
+struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
+					   unsigned flags)
 {
 	struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
-	unsigned num_cpus, num_compiler_threads, i;
+	unsigned num_threads, num_compiler_threads, num_compiler_threads_lowprio, i;
 
 	if (!sscreen) {
 		return NULL;
 	}
 
 	/* Set functions first. */
-	sscreen->b.b.context_create = si_create_context;
+	sscreen->b.b.context_create = si_pipe_create_context;
 	sscreen->b.b.destroy = si_destroy_screen;
 	sscreen->b.b.get_param = si_get_param;
 	sscreen->b.b.get_shader_param = si_get_shader_param;
@@ -849,24 +956,40 @@
 
 	si_init_screen_state_functions(sscreen);
 
-	if (!r600_common_screen_init(&sscreen->b, ws) ||
+	if (!r600_common_screen_init(&sscreen->b, ws, flags) ||
 	    !si_init_gs_info(sscreen) ||
 	    !si_init_shader_cache(sscreen)) {
 		FREE(sscreen);
 		return NULL;
 	}
 
-	/* Only enable as many threads as we have target machines and CPUs. */
-	num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	num_compiler_threads = MIN2(num_cpus, ARRAY_SIZE(sscreen->tm));
+	/* Only enable as many threads as we have target machines, but at most
+	 * the number of CPUs - 1 if there is more than one.
+	 */
+	num_threads = sysconf(_SC_NPROCESSORS_ONLN);
+	num_threads = MAX2(1, num_threads - 1);
+	num_compiler_threads = MIN2(num_threads, ARRAY_SIZE(sscreen->tm));
+	num_compiler_threads_lowprio =
+		MIN2(num_threads, ARRAY_SIZE(sscreen->tm_low_priority));
 
 	if (!util_queue_init(&sscreen->shader_compiler_queue, "si_shader",
-			     32, num_compiler_threads)) {
+			     32, num_compiler_threads,
+			     UTIL_QUEUE_INIT_RESIZE_IF_FULL)) {
 		si_destroy_shader_cache(sscreen);
 		FREE(sscreen);
 		return NULL;
 	}
 
+	if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
+			     "si_shader_low",
+			     32, num_compiler_threads_lowprio,
+			     UTIL_QUEUE_INIT_RESIZE_IF_FULL |
+			     UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
+	       si_destroy_shader_cache(sscreen);
+	       FREE(sscreen);
+	       return NULL;
+	}
+
 	si_handle_env_var_force_family(sscreen);
 
 	if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
@@ -891,16 +1014,19 @@
 		 sscreen->b.info.pfp_fw_version >= 211 &&
 		 sscreen->b.info.me_fw_version >= 173) ||
 		(sscreen->b.chip_class == SI &&
-		 sscreen->b.info.pfp_fw_version >= 121 &&
-		 sscreen->b.info.me_fw_version >= 87);
+		 sscreen->b.info.pfp_fw_version >= 79 &&
+		 sscreen->b.info.me_fw_version >= 142);
 
-	sscreen->has_ds_bpermute = HAVE_LLVM >= 0x0309 &&
-				   sscreen->b.chip_class >= VI;
-
+	sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
 	sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
 					    sscreen->b.family <= CHIP_POLARIS12) ||
 					   sscreen->b.family == CHIP_VEGA10 ||
 					   sscreen->b.family == CHIP_RAVEN;
+	/* While it would be nice not to have this flag, we are constrained
+	 * by the reality that LLVM 5.0 doesn't have working VGPR indexing
+	 * on GFX9.
+	 */
+	sscreen->llvm_has_working_vgpr_indexing = sscreen->b.chip_class <= VI;
 
 	sscreen->b.has_cp_dma = true;
 	sscreen->b.has_streamout = true;
@@ -923,8 +1049,10 @@
 		(sscreen->b.debug_flags & DBG_MONOLITHIC_SHADERS) != 0;
 
 	sscreen->b.barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 |
-					    SI_CONTEXT_INV_VMEM_L1 |
-					    SI_CONTEXT_INV_GLOBAL_L2;
+					    SI_CONTEXT_INV_VMEM_L1;
+	if (sscreen->b.chip_class <= VI)
+		sscreen->b.barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2;
+
 	sscreen->b.barrier_flags.compute_to_L2 = SI_CONTEXT_CS_PARTIAL_FLUSH;
 
 	if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
@@ -932,9 +1060,11 @@
 
 	for (i = 0; i < num_compiler_threads; i++)
 		sscreen->tm[i] = si_create_llvm_target_machine(sscreen);
+	for (i = 0; i < num_compiler_threads_lowprio; i++)
+		sscreen->tm_low_priority[i] = si_create_llvm_target_machine(sscreen);
 
 	/* Create the auxiliary context. This must be done last. */
-	sscreen->b.aux_context = sscreen->b.b.context_create(&sscreen->b.b, NULL, 0);
+	sscreen->b.aux_context = si_create_context(&sscreen->b.b, 0);
 
 	if (sscreen->b.debug_flags & DBG_TEST_DMA)
 		r600_test_dma(&sscreen->b);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 529e1e3..710286f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -28,6 +28,8 @@
 
 #include "si_shader.h"
 
+#include "util/u_dynarray.h"
+
 #ifdef PIPE_ARCH_BIG_ENDIAN
 #define SI_BIG_ENDIAN 1
 #else
@@ -57,7 +59,8 @@
 #define SI_CONTEXT_WRITEBACK_GLOBAL_L2	(R600_CONTEXT_PRIVATE_FLAG << 4)
 /* gaps */
 /* Framebuffer caches. */
-#define SI_CONTEXT_FLUSH_AND_INV_DB	(R600_CONTEXT_PRIVATE_FLAG << 7)
+#define SI_CONTEXT_FLUSH_AND_INV_DB	(R600_CONTEXT_PRIVATE_FLAG << 6)
+#define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 7)
 #define SI_CONTEXT_FLUSH_AND_INV_CB	(R600_CONTEXT_PRIVATE_FLAG << 8)
 /* Engine synchronization. */
 #define SI_CONTEXT_VS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 9)
@@ -67,6 +70,7 @@
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC	(R600_CONTEXT_PRIVATE_FLAG << 13)
 
 #define SI_MAX_BORDER_COLORS	4096
+#define SIX_BITS		0x3F
 
 struct si_compute;
 struct hash_table;
@@ -80,6 +84,7 @@
 	bool				has_draw_indirect_multi;
 	bool				has_ds_bpermute;
 	bool				has_msaa_sample_loc_bug;
+	bool				llvm_has_working_vgpr_indexing;
 
 	/* Whether shaders are monolithic (1-part) or separate (3-part). */
 	bool				use_monolithic_shaders;
@@ -87,7 +92,6 @@
 
 	mtx_t			shader_parts_mutex;
 	struct si_shader_part		*vs_prologs;
-	struct si_shader_part		*vs_epilogs;
 	struct si_shader_part		*tcs_epilogs;
 	struct si_shader_part		*gs_prologs;
 	struct si_shader_part		*ps_prologs;
@@ -110,7 +114,15 @@
 
 	/* Shader compiler queue for multithreaded compilation. */
 	struct util_queue		shader_compiler_queue;
-	LLVMTargetMachineRef		tm[4]; /* used by the queue only */
+	/* Use at most 3 normal compiler threads on quadcore and better.
+	 * Hyperthreaded CPUs report the number of threads, but we want
+	 * the number of cores. */
+	LLVMTargetMachineRef		tm[3]; /* used by the queue only */
+
+	struct util_queue		shader_compiler_queue_low_priority;
+	/* Use at most 2 low priority threads on quadcore and better.
+	 * We want to minimize the impact on multithreaded Mesa. */
+	LLVMTargetMachineRef		tm_low_priority[2]; /* at most 2 threads */
 };
 
 struct si_blend_color {
@@ -125,8 +137,8 @@
 	uint32_t			state[8];
 	uint32_t			fmask_state[8];
 	const struct legacy_surf_level	*base_level_info;
-	unsigned			base_level;
-	unsigned			block_width;
+	ubyte				base_level;
+	ubyte				block_width;
 	bool is_stencil_sampler;
 	bool dcc_incompatible;
 };
@@ -138,6 +150,7 @@
 	unsigned			magic;
 #endif
 	uint32_t			val[4];
+	uint32_t			upgraded_depth_val[4];
 };
 
 struct si_cs_shader_state {
@@ -150,33 +163,32 @@
 
 struct si_textures_info {
 	struct si_sampler_views		views;
-	uint32_t			depth_texture_mask; /* which textures are depth */
-	uint32_t			compressed_colortex_mask;
+	uint32_t			needs_depth_decompress_mask;
+	uint32_t			needs_color_decompress_mask;
 };
 
 struct si_images_info {
 	struct pipe_image_view		views[SI_NUM_IMAGES];
-	uint32_t			compressed_colortex_mask;
+	uint32_t			needs_color_decompress_mask;
 	unsigned			enabled_mask;
 };
 
 struct si_framebuffer {
 	struct r600_atom		atom;
 	struct pipe_framebuffer_state	state;
-	unsigned			nr_samples;
-	unsigned			log_samples;
-	unsigned			compressed_cb_mask;
 	unsigned			colorbuf_enabled_4bit;
 	unsigned			spi_shader_col_format;
 	unsigned			spi_shader_col_format_alpha;
 	unsigned			spi_shader_col_format_blend;
 	unsigned			spi_shader_col_format_blend_alpha;
-	unsigned			color_is_int8;
-	unsigned			color_is_int10;
-	unsigned			dirty_cbufs;
+	ubyte				nr_samples:5; /* at most 16xAA */
+	ubyte				log_samples:3; /* at most 4 = 16xAA */
+	ubyte				compressed_cb_mask;
+	ubyte				color_is_int8;
+	ubyte				color_is_int10;
+	ubyte				dirty_cbufs;
 	bool				dirty_zsbuf;
 	bool				any_dst_linear;
-	bool				do_update_surf_dirtiness;
 };
 
 struct si_clip_state {
@@ -218,32 +230,59 @@
 		unsigned count_from_stream_output:1;
 		unsigned line_stipple_enabled:1;
 		unsigned uses_tess:1;
-		unsigned tcs_tes_uses_prim_id:1;
+		unsigned tess_uses_prim_id:1;
 		unsigned uses_gs:1;
 		unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS;
 	} u;
 	uint32_t index;
 };
 
+struct si_bindless_descriptor
+{
+	struct pb_slab_entry		entry;
+	struct r600_resource		*buffer;
+	unsigned			offset;
+	uint32_t			desc_list[16];
+	bool				dirty;
+};
+
+struct si_texture_handle
+{
+	struct si_bindless_descriptor	*desc;
+	struct pipe_sampler_view	*view;
+	struct si_sampler_state		sstate;
+};
+
+struct si_image_handle
+{
+	struct si_bindless_descriptor	*desc;
+	struct pipe_image_view		view;
+};
+
 struct si_context {
 	struct r600_common_context	b;
 	struct blitter_context		*blitter;
 	void				*custom_dsa_flush;
 	void				*custom_blend_resolve;
-	void				*custom_blend_decompress;
-	void				*custom_blend_fastclear;
+	void				*custom_blend_fmask_decompress;
+	void				*custom_blend_eliminate_fastclear;
 	void				*custom_blend_dcc_decompress;
 	struct si_screen		*screen;
+	LLVMTargetMachineRef		tm; /* only non-threaded compilation */
+	struct si_shader_ctx_state	fixed_func_tcs_shader;
+	struct r600_resource		*wait_mem_scratch;
+	unsigned			wait_mem_number;
 
 	struct radeon_winsys_cs		*ce_ib;
 	struct radeon_winsys_cs		*ce_preamble_ib;
-	bool				ce_need_synchronization;
+	struct r600_resource		*ce_ram_saved_buffer;
 	struct u_suballocator		*ce_suballocator;
+	unsigned			ce_ram_saved_offset;
+	uint16_t			total_ce_ram_allocated;
+	bool				ce_need_synchronization:1;
 
-	struct si_shader_ctx_state	fixed_func_tcs_shader;
-	LLVMTargetMachineRef		tm; /* only non-threaded compilation */
-	bool				gfx_flush_in_progress;
-	bool				compute_is_busy;
+	bool				gfx_flush_in_progress:1;
+	bool				compute_is_busy:1;
 
 	/* Atoms (direct states). */
 	union si_state_atoms		atoms;
@@ -284,7 +323,7 @@
 	struct si_cs_shader_state	cs_shader_state;
 
 	/* shader information */
-	struct si_vertex_element	*vertex_elements;
+	struct si_vertex_elements	*vertex_elements;
 	unsigned			sprite_coord_enable;
 	bool				flatshade;
 	bool				do_update_shaders;
@@ -294,10 +333,9 @@
 	struct si_descriptors		descriptors[SI_NUM_DESCS];
 	unsigned			descriptors_dirty;
 	unsigned			shader_pointers_dirty;
-	unsigned			compressed_tex_shader_mask;
+	unsigned			shader_needs_decompress_mask;
 	struct si_buffer_resources	rw_buffers;
-	struct si_buffer_resources	const_buffers[SI_NUM_SHADERS];
-	struct si_buffer_resources	shader_buffers[SI_NUM_SHADERS];
+	struct si_buffer_resources	const_and_shader_buffers[SI_NUM_SHADERS];
 	struct si_textures_info		samplers[SI_NUM_SHADERS];
 	struct si_images_info		images[SI_NUM_SHADERS];
 
@@ -315,7 +353,6 @@
 	/* Vertex and index buffers. */
 	bool				vertex_buffers_dirty;
 	bool				vertex_buffer_pointer_dirty;
-	struct pipe_index_buffer	index_buffer;
 	struct pipe_vertex_buffer	vertex_buffer[SI_NUM_VERTEX_BUFFERS];
 
 	/* MSAA config state. */
@@ -323,19 +360,21 @@
 	bool				smoothing_enabled;
 
 	/* DB render state. */
-	bool			dbcb_depth_copy_enabled;
-	bool			dbcb_stencil_copy_enabled;
-	unsigned		dbcb_copy_sample;
-	bool			db_flush_depth_inplace;
-	bool			db_flush_stencil_inplace;
-	bool			db_depth_clear;
-	bool			db_depth_disable_expclear;
-	bool			db_stencil_clear;
-	bool			db_stencil_disable_expclear;
 	unsigned		ps_db_shader_control;
-	bool			occlusion_queries_disabled;
+	unsigned		dbcb_copy_sample;
+	bool			dbcb_depth_copy_enabled:1;
+	bool			dbcb_stencil_copy_enabled:1;
+	bool			db_flush_depth_inplace:1;
+	bool			db_flush_stencil_inplace:1;
+	bool			db_depth_clear:1;
+	bool			db_depth_disable_expclear:1;
+	bool			db_stencil_clear:1;
+	bool			db_stencil_disable_expclear:1;
+	bool			occlusion_queries_disabled:1;
+	bool			generate_mipmap_for_depth:1;
 
 	/* Emitted draw state. */
+	bool			gs_tri_strip_adj_fix:1;
 	int			last_index_size;
 	int			last_base_vertex;
 	int			last_start_instance;
@@ -350,8 +389,6 @@
 	unsigned		last_sc_line_stipple;
 	unsigned		current_vs_state;
 	unsigned		last_vs_state;
-	enum pipe_prim_type	current_rast_prim; /* primitive type after TES, GS */
-	bool			gs_tri_strip_adj_fix;
 
 	/* Scratch buffer */
 	struct r600_atom	scratch_state;
@@ -362,7 +399,8 @@
 	struct r600_resource	*compute_scratch_buffer;
 
 	/* Emitted derived tessellation state. */
-	struct si_shader	*last_ls; /* local shader (VS) */
+	/* Local shader (VS), or HS if LS-HS are merged. */
+	struct si_shader	*last_ls;
 	struct si_shader_selector *last_tcs;
 	int			last_num_tcs_input_cp;
 	int			last_tes_sh_base;
@@ -380,10 +418,35 @@
 
 	/* Other state */
 	bool need_check_render_feedback;
+	bool			decompression_enabled;
 
 	/* Precomputed IA_MULTI_VGT_PARAM */
 	union si_vgt_param_key  ia_multi_vgt_param_key;
 	unsigned		ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES];
+
+	/* Slab allocator for bindless descriptors. */
+	struct pb_slabs		bindless_descriptor_slabs;
+
+	/* Bindless descriptors. */
+	struct util_dynarray	bindless_descriptors;
+	bool			bindless_descriptors_dirty;
+
+	/* Allocated bindless handles */
+	struct hash_table	*tex_handles;
+	struct hash_table	*img_handles;
+
+	/* Resident bindless handles */
+	struct util_dynarray	resident_tex_handles;
+	struct util_dynarray	resident_img_handles;
+
+	/* Resident bindless handles which need decompression */
+	struct util_dynarray	resident_tex_needs_color_decompress;
+	struct util_dynarray	resident_img_needs_color_decompress;
+	struct util_dynarray	resident_tex_needs_depth_decompress;
+
+	/* Bindless state */
+	bool			uses_bindless_samplers;
+	bool			uses_bindless_images;
 };
 
 /* cik_sdma.c */
@@ -487,36 +550,30 @@
 	si_set_atom_dirty(sctx, atom, true);
 }
 
-static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
+static inline struct si_shader_ctx_state *si_get_vs(struct si_context *sctx)
 {
 	if (sctx->gs_shader.cso)
-		return &sctx->gs_shader.cso->info;
-	else if (sctx->tes_shader.cso)
-		return &sctx->tes_shader.cso->info;
-	else if (sctx->vs_shader.cso)
-		return &sctx->vs_shader.cso->info;
-	else
-		return NULL;
+		return &sctx->gs_shader;
+	if (sctx->tes_shader.cso)
+		return &sctx->tes_shader;
+
+	return &sctx->vs_shader;
+}
+
+static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
+{
+	struct si_shader_ctx_state *vs = si_get_vs(sctx);
+
+	return vs->cso ? &vs->cso->info : NULL;
 }
 
 static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
 {
-	if (sctx->gs_shader.current)
+	if (sctx->gs_shader.cso)
 		return sctx->gs_shader.cso->gs_copy_shader;
-	else if (sctx->tes_shader.current)
-		return sctx->tes_shader.current;
-	else
-		return sctx->vs_shader.current;
-}
 
-static inline bool si_vs_exports_prim_id(struct si_shader *shader)
-{
-	if (shader->selector->type == PIPE_SHADER_VERTEX)
-		return shader->key.part.vs.epilog.export_prim_id;
-	else if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
-		return shader->key.part.tes.epilog.export_prim_id;
-	else
-		return false;
+	struct si_shader_ctx_state *vs = si_get_vs(sctx);
+	return vs->current ? vs->current : NULL;
 }
 
 static inline unsigned
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index bf923ec..1ae1861 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -45,8 +45,7 @@
 	unsigned count;
 	count = state->ndw - state->last_pm4 - 2;
 	state->pm4[state->last_pm4] =
-		PKT3(state->last_opcode, count, predicate)
-		   | PKT3_SHADER_TYPE_S(state->compute_pkt);
+		PKT3(state->last_opcode, count, predicate);
 
 	assert(state->ndw <= SI_PM4_MAX_DW);
 }
@@ -110,12 +109,6 @@
 	state->ndw = 0;
 }
 
-void si_pm4_free_state_simple(struct si_pm4_state *state)
-{
-	si_pm4_clear_state(state);
-	FREE(state);
-}
-
 void si_pm4_free_state(struct si_context *sctx,
 		       struct si_pm4_state *state,
 		       unsigned idx)
@@ -127,7 +120,8 @@
 		sctx->emitted.array[idx] = NULL;
 	}
 
-	si_pm4_free_state_simple(state);
+	si_pm4_clear_state(state);
+	FREE(state);
 }
 
 void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h
index 106abe1..6301f20 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_pm4.h
@@ -30,11 +30,10 @@
 #include "radeon/radeon_winsys.h"
 
 #define SI_PM4_MAX_DW		176
-#define SI_PM4_MAX_BO		1
+#define SI_PM4_MAX_BO		3
 
 // forward defines
 struct si_context;
-enum chip_class;
 
 struct si_pm4_state
 {
@@ -55,8 +54,6 @@
 	struct r600_resource	*bo[SI_PM4_MAX_BO];
 	enum radeon_bo_usage	bo_usage[SI_PM4_MAX_BO];
 	enum radeon_bo_priority	bo_priority[SI_PM4_MAX_BO];
-
-	bool compute_pkt;
 };
 
 void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode);
@@ -72,7 +69,6 @@
 				   struct si_pm4_state *state);
 
 void si_pm4_clear_state(struct si_pm4_state *state);
-void si_pm4_free_state_simple(struct si_pm4_state *state);
 void si_pm4_free_state(struct si_context *sctx,
 		       struct si_pm4_state *state,
 		       unsigned idx);
diff --git a/src/gallium/drivers/radeonsi/si_public.h b/src/gallium/drivers/radeonsi/si_public.h
index 7cf36c8..13b1731 100644
--- a/src/gallium/drivers/radeonsi/si_public.h
+++ b/src/gallium/drivers/radeonsi/si_public.h
@@ -25,6 +25,7 @@
 
 struct radeon_winsys;
 
-struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws);
+struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
+					   unsigned flags);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ff18272..68a9515 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -41,6 +41,7 @@
 
 #include "ac_binary.h"
 #include "ac_llvm_util.h"
+#include "ac_exp_param.h"
 #include "si_shader_internal.h"
 #include "si_pipe.h"
 #include "sid.h"
@@ -62,22 +63,19 @@
 
 static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       struct si_screen *sscreen,
-			       struct si_shader *shader,
 			       LLVMTargetMachineRef tm);
 
 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
 				 struct lp_build_tgsi_context *bld_base,
 				 struct lp_build_emit_data *emit_data);
 
-static void si_dump_shader_key(unsigned shader, struct si_shader_key *key,
+static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
 			       FILE *f);
 
 static unsigned llvm_get_type_size(LLVMTypeRef type);
 
 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 					union si_shader_part_key *key);
-static void si_build_vs_epilog_function(struct si_shader_context *ctx,
-					union si_shader_part_key *key);
 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
 					 union si_shader_part_key *key);
 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
@@ -85,21 +83,49 @@
 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
 					union si_shader_part_key *key);
 
-/* Ideally pass the sample mask input to the PS epilog as v13, which
+/* Ideally pass the sample mask input to the PS epilog as v14, which
  * is its usual location, so that the shader doesn't have to add v_mov.
  */
-#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
-
-/* The VS location of the PrimitiveID input is the same in the epilog,
- * so that the main shader part doesn't have to move it.
- */
-#define VS_EPILOG_PRIMID_LOC 2
+#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
 
 enum {
 	CONST_ADDR_SPACE = 2,
 	LOCAL_ADDR_SPACE = 3,
 };
 
+static bool is_merged_shader(struct si_shader *shader)
+{
+	if (shader->selector->screen->b.chip_class <= VI)
+		return false;
+
+	return shader->key.as_ls ||
+	       shader->key.as_es ||
+	       shader->selector->type == PIPE_SHADER_TESS_CTRL ||
+	       shader->selector->type == PIPE_SHADER_GEOMETRY;
+}
+
+/**
+ * Returns a unique index for a per-patch semantic name and index. The index
+ * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
+ * can be calculated.
+ */
+unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
+{
+	switch (semantic_name) {
+	case TGSI_SEMANTIC_TESSOUTER:
+		return 0;
+	case TGSI_SEMANTIC_TESSINNER:
+		return 1;
+	case TGSI_SEMANTIC_PATCH:
+		assert(index < 30);
+		return 2 + index;
+
+	default:
+		assert(!"invalid semantic name");
+		return 0;
+	}
+}
+
 /**
  * Returns a unique index for a semantic name and index. The index must be
  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
@@ -110,48 +136,38 @@
 	switch (semantic_name) {
 	case TGSI_SEMANTIC_POSITION:
 		return 0;
-	case TGSI_SEMANTIC_PSIZE:
-		return 1;
-	case TGSI_SEMANTIC_CLIPDIST:
-		assert(index <= 1);
-		return 2 + index;
 	case TGSI_SEMANTIC_GENERIC:
-		if (index <= 63-4)
-			return 4 + index;
+		/* Some shader stages use the highest used IO index
+		 * to determine the size to allocate for inputs/outputs
+		 * (in LDS, tess and GS rings), so GENERIC should be placed right
+		 * after POSITION to make that size as small as possible.
+		 */
+		if (index < SI_MAX_IO_GENERIC)
+			return 1 + index;
 
 		assert(!"invalid generic index");
 		return 0;
-
-	/* patch indices are completely separate and thus start from 0 */
-	case TGSI_SEMANTIC_TESSOUTER:
-		return 0;
-	case TGSI_SEMANTIC_TESSINNER:
-		return 1;
-	case TGSI_SEMANTIC_PATCH:
-		return 2 + index;
-
-	default:
-		assert(!"invalid semantic name");
-		return 0;
-	}
-}
-
-unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
-{
-	switch (name) {
+	case TGSI_SEMANTIC_PSIZE:
+		return SI_MAX_IO_GENERIC + 1;
+	case TGSI_SEMANTIC_CLIPDIST:
+		assert(index <= 1);
+		return SI_MAX_IO_GENERIC + 2 + index;
 	case TGSI_SEMANTIC_FOG:
-		return 0;
+		return SI_MAX_IO_GENERIC + 4;
 	case TGSI_SEMANTIC_LAYER:
-		return 1;
+		return SI_MAX_IO_GENERIC + 5;
 	case TGSI_SEMANTIC_VIEWPORT_INDEX:
-		return 2;
+		return SI_MAX_IO_GENERIC + 6;
 	case TGSI_SEMANTIC_PRIMID:
-		return 3;
+		return SI_MAX_IO_GENERIC + 7;
 	case TGSI_SEMANTIC_COLOR: /* these alias */
 	case TGSI_SEMANTIC_BCOLOR:
-		return 4 + index;
+		assert(index < 2);
+		return SI_MAX_IO_GENERIC + 8 + index;
 	case TGSI_SEMANTIC_TEXCOORD:
-		return 6 + index;
+		assert(index < 8);
+		assert(SI_MAX_IO_GENERIC + 10 + index < 64);
+		return SI_MAX_IO_GENERIC + 10 + index;
 	default:
 		assert(!"invalid semantic name");
 		return 0;
@@ -190,7 +206,7 @@
 {
 	switch (ctx->type) {
 	case PIPE_SHADER_TESS_CTRL:
-		return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
+		return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
 
 	case PIPE_SHADER_TESS_EVAL:
 		return LLVMGetParam(ctx->main_fn,
@@ -226,20 +242,13 @@
 static LLVMValueRef
 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 {
-	if (ctx->type == PIPE_SHADER_VERTEX)
-		return unpack_param(ctx, SI_PARAM_VS_STATE_BITS, 8, 13);
-	else if (ctx->type == PIPE_SHADER_TESS_CTRL)
-		return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 8, 13);
-	else {
-		assert(0);
-		return NULL;
-	}
+	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
 }
 
 static LLVMValueRef
 get_tcs_out_patch_stride(struct si_shader_context *ctx)
 {
-	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
+	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
 }
 
 static LLVMValueRef
@@ -247,7 +256,7 @@
 {
 	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 				unpack_param(ctx,
-					     SI_PARAM_TCS_OUT_OFFSETS,
+					     ctx->param_tcs_out_lds_offsets,
 					     0, 16),
 				4);
 }
@@ -257,7 +266,7 @@
 {
 	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 				unpack_param(ctx,
-					     SI_PARAM_TCS_OUT_OFFSETS,
+					     ctx->param_tcs_out_lds_offsets,
 					     16, 16),
 				4);
 }
@@ -303,7 +312,7 @@
 
 static LLVMValueRef get_instance_index_for_fetch(
 	struct si_shader_context *ctx,
-	unsigned param_start_instance, unsigned divisor)
+	unsigned param_start_instance, LLVMValueRef divisor)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 
@@ -311,9 +320,8 @@
 					   ctx->param_instance_id);
 
 	/* The division must be done before START_INSTANCE is added. */
-	if (divisor > 1)
-		result = LLVMBuildUDiv(gallivm->builder, result,
-				LLVMConstInt(ctx->i32, divisor, 0), "");
+	if (divisor != ctx->i32_1)
+		result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
 
 	return LLVMBuildAdd(gallivm->builder, result,
 			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
@@ -354,7 +362,7 @@
 	LLVMValueRef input[3];
 
 	/* Load the T list */
-	t_list_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_VERTEX_BUFFERS);
+	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
 
 	t_offset = LLVMConstInt(ctx->i32, input_index, 0);
 
@@ -364,7 +372,7 @@
 				    ctx->param_vertex_index0 +
 				    input_index);
 
-	fix_fetch = ctx->shader->key.mono.vs.fix_fetch[input_index];
+	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
 
 	/* Do multiple loads for special formats. */
 	switch (fix_fetch) {
@@ -554,13 +562,13 @@
 				    ctx->param_vs_prim_id);
 	case PIPE_SHADER_TESS_CTRL:
 		return LLVMGetParam(ctx->main_fn,
-				    SI_PARAM_PATCH_ID);
+				    ctx->param_tcs_patch_id);
 	case PIPE_SHADER_TESS_EVAL:
 		return LLVMGetParam(ctx->main_fn,
 				    ctx->param_tes_patch_id);
 	case PIPE_SHADER_GEOMETRY:
 		return LLVMGetParam(ctx->main_fn,
-				    SI_PARAM_PRIMITIVE_ID);
+				    ctx->param_gs_prim_id);
 	default:
 		assert(0);
 		return ctx->i32_0;
@@ -589,19 +597,12 @@
  * Like get_indirect_index, but restricts the return value to a (possibly
  * undefined) value inside [0..num).
  */
-static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
-					       const struct tgsi_ind_register *ind,
-					       int rel_index, unsigned num)
+LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
+					   const struct tgsi_ind_register *ind,
+					   int rel_index, unsigned num)
 {
 	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
 
-	/* LLVM 3.8: If indirect resource indexing is used:
-	 * - SI & CIK hang
-	 * - VI crashes
-	 */
-	if (HAVE_LLVM == 0x0308)
-		return LLVMGetUndef(ctx->i32);
-
 	return si_llvm_bound_index(ctx, result, num);
 }
 
@@ -680,10 +681,15 @@
 				    LLVMBuildMul(gallivm->builder, ind_index,
 						 LLVMConstInt(ctx->i32, 4, 0), ""), "");
 
-		param = si_shader_io_get_unique_index(name[first], index[first]);
+		param = reg.Register.Dimension ?
+			si_shader_io_get_unique_index(name[first], index[first]) :
+			si_shader_io_get_unique_index_patch(name[first], index[first]);
 	} else {
-		param = si_shader_io_get_unique_index(name[reg.Register.Index],
-						      index[reg.Register.Index]);
+		param = reg.Register.Dimension ?
+			si_shader_io_get_unique_index(name[reg.Register.Index],
+						      index[reg.Register.Index]) :
+			si_shader_io_get_unique_index_patch(name[reg.Register.Index],
+							    index[reg.Register.Index]);
 	}
 
 	/* Add the base address of the element. */
@@ -718,8 +724,8 @@
 	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 	LLVMValueRef param_stride, constant16;
 
-	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
-	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
+	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
+	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
 	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
 	                              num_patches, "");
 
@@ -745,7 +751,7 @@
 
 	if (!vertex_index) {
 		LLVMValueRef patch_data_offset =
-		           unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);
+		           unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
 
 		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 		                         patch_data_offset, "");
@@ -805,8 +811,9 @@
 		param_index = ctx->i32_0;
 	}
 
-	param_index_base = si_shader_io_get_unique_index(name[param_base],
-	                                                 index[param_base]);
+	param_index_base = reg.Register.Dimension ?
+		si_shader_io_get_unique_index(name[param_base], index[param_base]) :
+		si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);
 
 	param_index = LLVMBuildAdd(gallivm->builder, param_index,
 	                           LLVMConstInt(ctx->i32, param_index_base, 0),
@@ -819,7 +826,7 @@
 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
                                 enum tgsi_opcode_type type, unsigned swizzle,
                                 LLVMValueRef buffer, LLVMValueRef offset,
-                                LLVMValueRef base, bool readonly_memory)
+                                LLVMValueRef base, bool can_speculate)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = &ctx->gallivm;
@@ -829,14 +836,14 @@
 
 	if (swizzle == ~0) {
 		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-					     0, 1, 0, readonly_memory);
+					     0, 1, 0, can_speculate, false);
 
 		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 	}
 
 	if (!tgsi_type_is_64bit(type)) {
 		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-					     0, 1, 0, readonly_memory);
+					     0, 1, 0, can_speculate, false);
 
 		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 		return LLVMBuildExtractElement(gallivm->builder, value,
@@ -844,10 +851,10 @@
 	}
 
 	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-	                          swizzle * 4, 1, 0, readonly_memory);
+	                          swizzle * 4, 1, 0, can_speculate, false);
 
 	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-	                           swizzle * 4 + 4, 1, 0, readonly_memory);
+	                           swizzle * 4 + 4, 1, 0, can_speculate, false);
 
 	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 }
@@ -901,20 +908,44 @@
  * \param value		value to store
  */
 static void lds_store(struct lp_build_tgsi_context *bld_base,
-		      unsigned swizzle, LLVMValueRef dw_addr,
+		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
 		      LLVMValueRef value)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = &ctx->gallivm;
 
 	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
-			    LLVMConstInt(ctx->i32, swizzle, 0));
+			    LLVMConstInt(ctx->i32, dw_offset_imm, 0));
 
 	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
 	ac_build_indexed_store(&ctx->ac, ctx->lds,
 			       dw_addr, value);
 }
 
+static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
+						  unsigned param)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+
+	LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
+	addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
+	addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");
+
+	uint64_t desc2 = 0xffffffff;
+	uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+			 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+		         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+		         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+			 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+		         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+	LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);
+
+	LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
+	desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
+	desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
+	return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
+}
+
 static LLVMValueRef fetch_input_tcs(
 	struct lp_build_tgsi_context *bld_base,
 	const struct tgsi_full_src_register *reg,
@@ -923,7 +954,7 @@
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	LLVMValueRef dw_addr, stride;
 
-	stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 24, 8);
+	stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 	dw_addr = get_tcs_in_current_patch_offset(ctx);
 	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 
@@ -939,7 +970,7 @@
 	LLVMValueRef dw_addr, stride;
 
 	if (reg->Register.Dimension) {
-		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
+		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
 		dw_addr = get_tcs_out_current_patch_offset(ctx);
 		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 	} else {
@@ -956,14 +987,11 @@
 	enum tgsi_opcode_type type, unsigned swizzle)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
-	LLVMValueRef rw_buffers, buffer, base, addr;
+	LLVMValueRef buffer, base, addr;
 
-	rw_buffers = LLVMGetParam(ctx->main_fn,
-				  SI_PARAM_RW_BUFFERS);
-	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
+	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
 
-	base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
+	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
 
 	return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
@@ -980,7 +1008,7 @@
 	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
 	unsigned chan_index;
 	LLVMValueRef dw_addr, stride;
-	LLVMValueRef rw_buffers, buffer, base, buf_addr;
+	LLVMValueRef buffer, base, buf_addr;
 	LLVMValueRef values[4];
 	bool skip_lds_store;
 	bool is_tess_factor = false;
@@ -995,7 +1023,7 @@
 	}
 
 	if (reg->Register.Dimension) {
-		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
+		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
 		dw_addr = get_tcs_out_current_patch_offset(ctx);
 		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
 		skip_lds_store = !sh_info->reads_pervertex_outputs;
@@ -1016,12 +1044,9 @@
 		}
 	}
 
-	rw_buffers = LLVMGetParam(ctx->main_fn,
-				  SI_PARAM_RW_BUFFERS);
-	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
+	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
 
-	base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
+	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
 
 
@@ -1064,7 +1089,6 @@
 	struct lp_build_context *uint =	&ctx->bld_base.uint_bld;
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	LLVMValueRef vtx_offset, soffset;
-	unsigned vtx_offset_param;
 	struct tgsi_shader_info *info = &shader->selector->info;
 	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
@@ -1077,6 +1101,36 @@
 	if (!reg->Register.Dimension)
 		return NULL;
 
+	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
+
+	/* GFX9 has the ESGS ring in LDS. */
+	if (ctx->screen->b.chip_class >= GFX9) {
+		unsigned index = reg->Dimension.Index;
+
+		switch (index / 2) {
+		case 0:
+			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
+						  index % 2 ? 16 : 0, 16);
+			break;
+		case 1:
+			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
+						  index % 2 ? 16 : 0, 16);
+			break;
+		case 2:
+			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
+						  index % 2 ? 16 : 0, 16);
+			break;
+		default:
+			assert(0);
+			return NULL;
+		}
+
+		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
+					  LLVMConstInt(ctx->i32, param * 4, 0), "");
+		return lds_load(bld_base, type, swizzle, vtx_offset);
+	}
+
+	/* GFX6: input load from the ESGS ring in memory. */
 	if (swizzle == ~0) {
 		LLVMValueRef values[TGSI_NUM_CHANNELS];
 		unsigned chan;
@@ -1087,31 +1141,30 @@
 					      TGSI_NUM_CHANNELS);
 	}
 
-	/* Get the vertex offset parameter */
-	vtx_offset_param = reg->Dimension.Index;
+	/* Get the vertex offset parameter on GFX6. */
+	unsigned vtx_offset_param = reg->Dimension.Index;
 	if (vtx_offset_param < 2) {
-		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
+		vtx_offset_param += ctx->param_gs_vtx0_offset;
 	} else {
 		assert(vtx_offset_param < 6);
-		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
+		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
 	}
 	vtx_offset = lp_build_mul_imm(uint,
 				      LLVMGetParam(ctx->main_fn,
 						   vtx_offset_param),
 				      4);
 
-	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
 	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
 
 	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
-				     vtx_offset, soffset, 0, 1, 0, true);
+				     vtx_offset, soffset, 0, 1, 0, true, false);
 	if (tgsi_type_is_64bit(type)) {
 		LLVMValueRef value2;
 		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
 
 		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
 					      ctx->i32_0, vtx_offset, soffset,
-					      0, 1, 0, true);
+					      0, 1, 0, true, false);
 		return si_llvm_emit_fetch_64bit(bld_base, type,
 						value, value2);
 	}
@@ -1149,6 +1202,24 @@
 	}
 }
 
+static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
+				       unsigned attr_index, unsigned chan,
+				       LLVMValueRef prim_mask,
+				       LLVMValueRef i, LLVMValueRef j)
+{
+	if (i || j) {
+		return ac_build_fs_interp(&ctx->ac,
+					  LLVMConstInt(ctx->i32, chan, 0),
+					  LLVMConstInt(ctx->i32, attr_index, 0),
+					  prim_mask, i, j);
+	}
+	return ac_build_fs_interp_mov(&ctx->ac,
+				      LLVMConstInt(ctx->i32, 2, 0), /* P0 */
+				      LLVMConstInt(ctx->i32, chan, 0),
+				      LLVMConstInt(ctx->i32, attr_index, 0),
+				      prim_mask);
+}
+
 /**
  * Interpolate a fragment shader input.
  *
@@ -1175,9 +1246,7 @@
 			    LLVMValueRef result[4])
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMValueRef attr_number;
-	LLVMValueRef i, j;
-
+	LLVMValueRef i = NULL, j = NULL;
 	unsigned chan;
 
 	/* fs.constant returns the param from the middle vertex, so it's not
@@ -1195,8 +1264,6 @@
 	 */
 	bool interp = interp_param != NULL;
 
-	attr_number = LLVMConstInt(ctx->i32, input_index, 0);
-
 	if (interp) {
 		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
 						LLVMVectorType(ctx->f32, 2), "");
@@ -1210,7 +1277,6 @@
 	if (semantic_name == TGSI_SEMANTIC_COLOR &&
 	    ctx->shader->key.part.ps.prolog.color_two_side) {
 		LLVMValueRef is_face_positive;
-		LLVMValueRef back_attr_number;
 
 		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
 		 * otherwise it's at offset "num_inputs".
@@ -1219,30 +1285,18 @@
 		if (semantic_index == 1 && colors_read_mask & 0xf)
 			back_attr_offset += 1;
 
-		back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);
-
 		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
 						 face, ctx->i32_0, "");
 
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
 			LLVMValueRef front, back;
 
-			if (interp) {
-				front = ac_build_fs_interp(&ctx->ac, llvm_chan,
-							attr_number, prim_mask,
-							i, j);
-				back = ac_build_fs_interp(&ctx->ac, llvm_chan,
-							back_attr_number, prim_mask,
-							i, j);
-			} else {
-				front = ac_build_fs_interp_mov(&ctx->ac,
-					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-					llvm_chan, attr_number, prim_mask);
-				back = ac_build_fs_interp_mov(&ctx->ac,
-					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-					llvm_chan, back_attr_number, prim_mask);
-			}
+			front = si_build_fs_interp(ctx,
+						   input_index, chan,
+						   prim_mask, i, j);
+			back = si_build_fs_interp(ctx,
+						  back_attr_offset, chan,
+						  prim_mask, i, j);
 
 			result[chan] = LLVMBuildSelect(gallivm->builder,
 						is_face_positive,
@@ -1251,29 +1305,16 @@
 						"");
 		}
 	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
-		if (interp) {
-			result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
-						       attr_number, prim_mask, i, j);
-		} else {
-			result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
-							   LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-							   attr_number, prim_mask);
-		}
+		result[0] = si_build_fs_interp(ctx, input_index,
+					       0, prim_mask, i, j);
 		result[1] =
 		result[2] = LLVMConstReal(ctx->f32, 0.0f);
 		result[3] = LLVMConstReal(ctx->f32, 1.0f);
 	} else {
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
-
-			if (interp) {
-				result[chan] = ac_build_fs_interp(&ctx->ac,
-					llvm_chan, attr_number, prim_mask, i, j);
-			} else {
-				result[chan] = ac_build_fs_interp_mov(&ctx->ac,
-					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-					llvm_chan, attr_number, prim_mask);
-			}
+			result[chan] = si_build_fs_interp(ctx,
+							  input_index, chan,
+							  prim_mask, i, j);
 		}
 	}
 }
@@ -1313,13 +1354,8 @@
 		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
 	}
 
-	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
-	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
-	    ctx->shader->key.part.ps.prolog.flatshade_colors)
-		interp_param = NULL; /* load the constant color */
-
 	interp_fs_input(ctx, input_index, decl->Semantic.Name,
-			decl->Semantic.Index, shader->selector->info.num_inputs,
+			decl->Semantic.Index, 0, /* this param is unused */
 			shader->selector->info.colors_read, interp_param,
 			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
 			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
@@ -1339,12 +1375,8 @@
 				      LLVMValueRef resource,
 				      LLVMValueRef offset)
 {
-	LLVMBuilderRef builder = ctx->gallivm.builder;
-	LLVMValueRef args[2] = {resource, offset};
-
-	return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
-				  LP_FUNC_ATTR_READNONE |
-				  LP_FUNC_ATTR_LEGACY);
+	return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
+				    0, 0, 0, true, true);
 }
 
 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
@@ -1352,7 +1384,7 @@
 	struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS);
+	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
 	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
 	LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
 
@@ -1391,7 +1423,7 @@
 				     LLVMGetParam(ctx->main_fn,
 						  ctx->param_vertex_id),
 				     LLVMGetParam(ctx->main_fn,
-						  SI_PARAM_BASE_VERTEX), "");
+						  ctx->param_base_vertex), "");
 		break;
 
 	case TGSI_SEMANTIC_VERTEXID_NOBASE:
@@ -1406,34 +1438,32 @@
 		 * (for direct draws) or the CP (for indirect draws) is the
 		 * first vertex ID, but GLSL expects 0 to be returned.
 		 */
-		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, SI_PARAM_VS_STATE_BITS);
+		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
 		LLVMValueRef indexed;
 
 		indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
 		indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
 
 		value = LLVMBuildSelect(gallivm->builder, indexed,
-					LLVMGetParam(ctx->main_fn, SI_PARAM_BASE_VERTEX),
+					LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
 					ctx->i32_0, "");
 		break;
 	}
 
 	case TGSI_SEMANTIC_BASEINSTANCE:
-		value = LLVMGetParam(ctx->main_fn,
-				     SI_PARAM_START_INSTANCE);
+		value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
 		break;
 
 	case TGSI_SEMANTIC_DRAWID:
-		value = LLVMGetParam(ctx->main_fn,
-				     SI_PARAM_DRAWID);
+		value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
 		break;
 
 	case TGSI_SEMANTIC_INVOCATIONID:
 		if (ctx->type == PIPE_SHADER_TESS_CTRL)
-			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
 		else if (ctx->type == PIPE_SHADER_GEOMETRY)
 			value = LLVMGetParam(ctx->main_fn,
-					     SI_PARAM_GS_INSTANCE_ID);
+					     ctx->param_gs_instance_id);
 		else
 			assert(!"INVOCATIONID not implemented");
 		break;
@@ -1503,9 +1533,9 @@
 
 	case TGSI_SEMANTIC_VERTICESIN:
 		if (ctx->type == PIPE_SHADER_TESS_CTRL)
-			value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
+			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
 		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
-			value = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 7);
+			value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
 		else
 			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
 		break;
@@ -1513,15 +1543,12 @@
 	case TGSI_SEMANTIC_TESSINNER:
 	case TGSI_SEMANTIC_TESSOUTER:
 	{
-		LLVMValueRef rw_buffers, buffer, base, addr;
-		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
+		LLVMValueRef buffer, base, addr;
+		int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);
 
-		rw_buffers = LLVMGetParam(ctx->main_fn,
-					SI_PARAM_RW_BUFFERS);
-		buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-		        LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
+		buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
 
-		base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
+		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
 		                          LLVMConstInt(ctx->i32, param, 0));
 
@@ -1538,7 +1565,7 @@
 		int i, offset;
 
 		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
-		buf = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS);
+		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
 		buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
 		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
 
@@ -1554,7 +1581,7 @@
 		break;
 
 	case TGSI_SEMANTIC_GRID_SIZE:
-		value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE);
+		value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
 		break;
 
 	case TGSI_SEMANTIC_BLOCK_SIZE:
@@ -1575,31 +1602,37 @@
 
 			value = lp_build_gather_values(gallivm, values, 3);
 		} else {
-			value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE);
+			value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
 		}
 		break;
 	}
 
 	case TGSI_SEMANTIC_BLOCK_ID:
-		value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID);
+	{
+		LLVMValueRef values[3];
+
+		for (int i = 0; i < 3; i++) {
+			values[i] = ctx->i32_0;
+			if (ctx->param_block_id[i] >= 0) {
+				values[i] = LLVMGetParam(ctx->main_fn,
+							 ctx->param_block_id[i]);
+			}
+		}
+		value = lp_build_gather_values(gallivm, values, 3);
 		break;
+	}
 
 	case TGSI_SEMANTIC_THREAD_ID:
-		value = LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID);
+		value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
 		break;
 
 	case TGSI_SEMANTIC_HELPER_INVOCATION:
-		if (HAVE_LLVM >= 0x0309) {
-			value = lp_build_intrinsic(gallivm->builder,
-						   "llvm.amdgcn.ps.live",
-						   ctx->i1, NULL, 0,
-						   LP_FUNC_ATTR_READNONE);
-			value = LLVMBuildNot(gallivm->builder, value, "");
-			value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
-		} else {
-			assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
-			return;
-		}
+		value = lp_build_intrinsic(gallivm->builder,
+					   "llvm.amdgcn.ps.live",
+					   ctx->i1, NULL, 0,
+					   LP_FUNC_ATTR_READNONE);
+		value = LLVMBuildNot(gallivm->builder, value, "");
+		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
 		break;
 
 	case TGSI_SEMANTIC_SUBGROUP_SIZE:
@@ -1675,10 +1708,10 @@
 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
 {
 	LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
-					     SI_PARAM_CONST_BUFFERS);
+					     ctx->param_const_and_shader_buffers);
 
 	return ac_build_indexed_load_const(&ctx->ac, list_ptr,
-					LLVMConstInt(ctx->i32, i, 0));
+			LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
 }
 
 static LLVMValueRef fetch_constant(
@@ -1708,11 +1741,13 @@
 	idx = reg->Register.Index * 4 + swizzle;
 
 	if (reg->Register.Dimension && reg->Dimension.Indirect) {
-		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_CONST_BUFFERS);
+		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
 		LLVMValueRef index;
-		index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
-						   reg->Dimension.Index,
-						   SI_NUM_CONST_BUFFERS);
+		index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
+						      reg->Dimension.Index,
+						      ctx->num_const_buffers);
+		index = LLVMBuildAdd(ctx->gallivm.builder, index,
+				     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
 		bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
 	} else
 		bufp = load_const_buffer_desc(ctx, buf);
@@ -2014,7 +2049,7 @@
 	unsigned chan;
 	unsigned const_chan;
 	LLVMValueRef base_elt;
-	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS);
+	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
 	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
 						   SI_VS_CONST_CLIP_PLANES, 0);
 	LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
@@ -2170,7 +2205,7 @@
 		LLVMValueRef so_write_offset[4] = {};
 		LLVMValueRef so_buffers[4];
 		LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
-						    SI_PARAM_RW_BUFFERS);
+						    ctx->param_rw_buffers);
 
 		for (i = 0; i < 4; i++) {
 			if (!so->stride[i])
@@ -2207,6 +2242,62 @@
 	lp_build_endif(&if_ctx);
 }
 
+static void si_export_param(struct si_shader_context *ctx, unsigned index,
+			    LLVMValueRef *values)
+{
+	struct ac_export_args args;
+
+	si_llvm_init_export_args(&ctx->bld_base, values,
+				 V_008DFC_SQ_EXP_PARAM + index, &args);
+	ac_build_export(&ctx->ac, &args);
+}
+
+static void si_build_param_exports(struct si_shader_context *ctx,
+				   struct si_shader_output_values *outputs,
+			           unsigned noutput)
+{
+	struct si_shader *shader = ctx->shader;
+	unsigned param_count = 0;
+
+	for (unsigned i = 0; i < noutput; i++) {
+		unsigned semantic_name = outputs[i].semantic_name;
+		unsigned semantic_index = outputs[i].semantic_index;
+
+		if (outputs[i].vertex_stream[0] != 0 &&
+		    outputs[i].vertex_stream[1] != 0 &&
+		    outputs[i].vertex_stream[2] != 0 &&
+		    outputs[i].vertex_stream[3] != 0)
+			continue;
+
+		switch (semantic_name) {
+		case TGSI_SEMANTIC_LAYER:
+		case TGSI_SEMANTIC_VIEWPORT_INDEX:
+		case TGSI_SEMANTIC_CLIPDIST:
+		case TGSI_SEMANTIC_COLOR:
+		case TGSI_SEMANTIC_BCOLOR:
+		case TGSI_SEMANTIC_PRIMID:
+		case TGSI_SEMANTIC_FOG:
+		case TGSI_SEMANTIC_TEXCOORD:
+		case TGSI_SEMANTIC_GENERIC:
+			break;
+		default:
+			continue;
+		}
+
+		if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
+		     semantic_index < SI_MAX_IO_GENERIC) &&
+		    shader->key.opt.kill_outputs &
+		    (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
+			continue;
+
+		si_export_param(ctx, param_count, outputs[i].values);
+
+		assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+		shader->info.vs_output_param_offset[i] = param_count++;
+	}
+
+	shader->info.nr_param_exports = param_count;
+}
 
 /* Generate export instructions for hardware VS shader stage */
 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
@@ -2216,114 +2307,47 @@
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct si_shader *shader = ctx->shader;
 	struct lp_build_context *base = &bld_base->base;
-	struct ac_export_args args, pos_args[4] = {};
+	struct ac_export_args pos_args[4] = {};
 	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
-	unsigned semantic_name, semantic_index;
-	unsigned target;
-	unsigned param_count = 0;
 	unsigned pos_idx;
 	int i;
 
+	/* Build position exports. */
 	for (i = 0; i < noutput; i++) {
-		semantic_name = outputs[i].semantic_name;
-		semantic_index = outputs[i].semantic_index;
-		bool export_param = true;
-
-		switch (semantic_name) {
-		case TGSI_SEMANTIC_POSITION: /* ignore these */
-		case TGSI_SEMANTIC_PSIZE:
-		case TGSI_SEMANTIC_CLIPVERTEX:
-		case TGSI_SEMANTIC_EDGEFLAG:
+		switch (outputs[i].semantic_name) {
+		case TGSI_SEMANTIC_POSITION:
+			si_llvm_init_export_args(bld_base, outputs[i].values,
+						 V_008DFC_SQ_EXP_POS, &pos_args[0]);
 			break;
-		case TGSI_SEMANTIC_GENERIC:
-		case TGSI_SEMANTIC_CLIPDIST:
-			if (shader->key.opt.hw_vs.kill_outputs &
-			    (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
-				export_param = false;
-			break;
-		default:
-			if (shader->key.opt.hw_vs.kill_outputs2 &
-			    (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
-				export_param = false;
-			break;
-		}
-
-		if (outputs[i].vertex_stream[0] != 0 &&
-		    outputs[i].vertex_stream[1] != 0 &&
-		    outputs[i].vertex_stream[2] != 0 &&
-		    outputs[i].vertex_stream[3] != 0)
-			export_param = false;
-
-handle_semantic:
-		/* Select the correct target */
-		switch(semantic_name) {
 		case TGSI_SEMANTIC_PSIZE:
 			psize_value = outputs[i].values[0];
-			continue;
-		case TGSI_SEMANTIC_EDGEFLAG:
-			edgeflag_value = outputs[i].values[0];
-			continue;
+			break;
 		case TGSI_SEMANTIC_LAYER:
 			layer_value = outputs[i].values[0];
-			semantic_name = TGSI_SEMANTIC_GENERIC;
-			goto handle_semantic;
+			break;
 		case TGSI_SEMANTIC_VIEWPORT_INDEX:
 			viewport_index_value = outputs[i].values[0];
-			semantic_name = TGSI_SEMANTIC_GENERIC;
-			goto handle_semantic;
-		case TGSI_SEMANTIC_POSITION:
-			target = V_008DFC_SQ_EXP_POS;
+			break;
+		case TGSI_SEMANTIC_EDGEFLAG:
+			edgeflag_value = outputs[i].values[0];
 			break;
 		case TGSI_SEMANTIC_CLIPDIST:
-			if (shader->key.opt.hw_vs.clip_disable) {
-				semantic_name = TGSI_SEMANTIC_GENERIC;
-				goto handle_semantic;
+			if (!shader->key.opt.clip_disable) {
+				unsigned index = 2 + outputs[i].semantic_index;
+				si_llvm_init_export_args(bld_base, outputs[i].values,
+							 V_008DFC_SQ_EXP_POS + index,
+							 &pos_args[index]);
 			}
-			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
 			break;
 		case TGSI_SEMANTIC_CLIPVERTEX:
-			if (shader->key.opt.hw_vs.clip_disable)
-				continue;
-			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
-			continue;
-		case TGSI_SEMANTIC_COLOR:
-		case TGSI_SEMANTIC_BCOLOR:
-		case TGSI_SEMANTIC_PRIMID:
-		case TGSI_SEMANTIC_FOG:
-		case TGSI_SEMANTIC_TEXCOORD:
-		case TGSI_SEMANTIC_GENERIC:
-			if (!export_param)
-				continue;
-			target = V_008DFC_SQ_EXP_PARAM + param_count;
-			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
-			shader->info.vs_output_param_offset[i] = param_count;
-			param_count++;
+			if (!shader->key.opt.clip_disable) {
+				si_llvm_emit_clipvertex(bld_base, pos_args,
+							outputs[i].values);
+			}
 			break;
-		default:
-			target = 0;
-			fprintf(stderr,
-				"Warning: SI unhandled vs output type:%d\n",
-				semantic_name);
-		}
-
-		si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
-
-		if (target >= V_008DFC_SQ_EXP_POS &&
-		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
-			memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
-			       &args, sizeof(args));
-		} else {
-			ac_build_export(&ctx->ac, &args);
-		}
-
-		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
-			semantic_name = TGSI_SEMANTIC_GENERIC;
-			goto handle_semantic;
 		}
 	}
 
-	shader->info.nr_param_exports = param_count;
-
 	/* We need to add the position output manually if it's missing. */
 	if (!pos_args[0].out[0]) {
 		pos_args[0].enabled_channels = 0xf; /* writemask */
@@ -2422,6 +2446,9 @@
 
 		ac_build_export(&ctx->ac, &pos_args[i]);
 	}
+
+	/* Build parameter exports. */
+	si_build_param_exports(ctx, outputs, noutput);
 }
 
 /**
@@ -2432,25 +2459,21 @@
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
+	LLVMValueRef invocation_id, buffer, buffer_offset;
 	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
 	uint64_t inputs;
 
-	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
+	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
+	buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 
-	rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS);
-	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-	                LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
-
-	buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
-
-	lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 24, 8);
+	lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
 	                                 lds_vertex_stride, "");
 	lds_base = get_tcs_in_current_patch_offset(ctx);
 	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
 
-	inputs = ctx->shader->key.mono.tcs.inputs_to_copy;
+	inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
 	while (inputs) {
 		unsigned i = u_bit_scan64(&inputs);
 
@@ -2481,8 +2504,8 @@
 	struct si_shader *shader = ctx->shader;
 	unsigned tess_inner_index, tess_outer_index;
 	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
-	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4];
-	unsigned stride, outer_comps, inner_comps, i;
+	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
+	unsigned stride, outer_comps, inner_comps, i, offset;
 	struct lp_build_if_state if_ctx, inner_if_ctx;
 
 	si_llvm_emit_barrier(NULL, bld_base, NULL);
@@ -2522,8 +2545,8 @@
 	/* Load tess_inner and tess_outer from LDS.
 	 * Any invocation can write them, so we can't get them from a temporary.
 	 */
-	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
-	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
+	tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+	tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
 
 	lds_base = tcs_out_current_patch_data_offset;
 	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
@@ -2563,14 +2586,11 @@
 		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
 
 	/* Get the buffer. */
-	rw_buffers = LLVMGetParam(ctx->main_fn,
-				  SI_PARAM_RW_BUFFERS);
-	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_FACTOR, 0));
+	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
 
 	/* Get the offset. */
 	tf_base = LLVMGetParam(ctx->main_fn,
-			       SI_PARAM_TESS_FACTOR_OFFSET);
+			       ctx->param_tcs_factor_offset);
 	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
 				  LLVMConstInt(ctx->i32, 4 * stride, 0), "");
 
@@ -2579,21 +2599,26 @@
 				  rel_patch_id, ctx->i32_0, ""));
 
 	/* Store the dynamic HS control word. */
-	ac_build_buffer_store_dword(&ctx->ac, buffer,
-				    LLVMConstInt(ctx->i32, 0x80000000, 0),
-				    1, ctx->i32_0, tf_base,
-				    0, 1, 0, true, false);
+	offset = 0;
+	if (ctx->screen->b.chip_class <= VI) {
+		ac_build_buffer_store_dword(&ctx->ac, buffer,
+					    LLVMConstInt(ctx->i32, 0x80000000, 0),
+					    1, ctx->i32_0, tf_base,
+					    offset, 1, 0, true, false);
+		offset += 4;
+	}
 
 	lp_build_endif(&inner_if_ctx);
 
 	/* Store the tessellation factors. */
 	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
 				    MIN2(stride, 4), byteoffset, tf_base,
-				    4, 1, 0, true, false);
+				    offset, 1, 0, true, false);
+	offset += 16;
 	if (vec1)
 		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
 					    stride - 4, byteoffset, tf_base,
-					    20, 1, 0, true, false);
+					    offset, 1, 0, true, false);
 
 	/* Store the tess factors into the offchip buffer if TES reads them. */
 	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
@@ -2601,11 +2626,10 @@
 		LLVMValueRef tf_inner_offset;
 		unsigned param_outer, param_inner;
 
-		buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-				LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
-		base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
+		buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
+		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 
-		param_outer = si_shader_io_get_unique_index(
+		param_outer = si_shader_io_get_unique_index_patch(
 				      TGSI_SEMANTIC_TESSOUTER, 0);
 		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
 					LLVMConstInt(ctx->i32, param_outer, 0));
@@ -2617,7 +2641,7 @@
 					    outer_comps, tf_outer_offset,
 					    base, 0, 1, 0, true, false);
 		if (inner_comps) {
-			param_inner = si_shader_io_get_unique_index(
+			param_inner = si_shader_io_get_unique_index_patch(
 					      TGSI_SEMANTIC_TESSINNER, 0);
 			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
 					LLVMConstInt(ctx->i32, param_inner, 0));
@@ -2633,62 +2657,182 @@
 	lp_build_endif(&if_ctx);
 }
 
+static LLVMValueRef
+si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
+		    unsigned param, unsigned return_index)
+{
+	return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
+				    LLVMGetParam(ctx->main_fn, param),
+				    return_index, "");
+}
+
+static LLVMValueRef
+si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
+			  unsigned param, unsigned return_index)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
+
+	return LLVMBuildInsertValue(builder, ret,
+				    LLVMBuildBitCast(builder, p, ctx->f32, ""),
+				    return_index, "");
+}
+
+static LLVMValueRef
+si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
+			     unsigned param, unsigned return_index)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	LLVMValueRef ptr, lo, hi;
+
+	ptr = LLVMGetParam(ctx->main_fn, param);
+	ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
+	ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
+	lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
+	hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
+	ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
+	return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
+}
+
 /* This only writes the tessellation factor levels. */
 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
+	LLVMBuilderRef builder = ctx->gallivm.builder;
 	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
-	LLVMValueRef offchip_soffset, offchip_layout;
 
 	si_copy_tcs_inputs(bld_base);
 
 	rel_patch_id = get_rel_patch_id(ctx);
-	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
 	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
 
+	if (ctx->screen->b.chip_class >= GFX9) {
+		LLVMBasicBlockRef blocks[2] = {
+			LLVMGetInsertBlock(builder),
+			ctx->merged_wrap_if_state.entry_block
+		};
+		LLVMValueRef values[2];
+
+		lp_build_endif(&ctx->merged_wrap_if_state);
+
+		values[0] = rel_patch_id;
+		values[1] = LLVMGetUndef(ctx->i32);
+		rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
+
+		values[0] = tf_lds_offset;
+		values[1] = LLVMGetUndef(ctx->i32);
+		tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
+
+		values[0] = invocation_id;
+		values[1] = ctx->i32_1; /* cause the epilog to skip threads */
+		invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
+	}
+
 	/* Return epilog parameters from this function. */
-	LLVMBuilderRef builder = ctx->gallivm.builder;
 	LLVMValueRef ret = ctx->return_value;
-	LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
 	unsigned vgpr;
 
-	/* RW_BUFFERS pointer */
-	rw_buffers = LLVMGetParam(ctx->main_fn,
-				  SI_PARAM_RW_BUFFERS);
-	rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
-	rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
-	rw0 = LLVMBuildExtractElement(builder, rw_buffers,
-				      ctx->i32_0, "");
-	rw1 = LLVMBuildExtractElement(builder, rw_buffers,
-				      ctx->i32_1, "");
-	ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
-	ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
-
-	/* Tess offchip and factor buffer soffset are after user SGPRs. */
-	offchip_layout = LLVMGetParam(ctx->main_fn,
-				      SI_PARAM_TCS_OFFCHIP_LAYOUT);
-	offchip_soffset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
-	tf_soffset = LLVMGetParam(ctx->main_fn,
-				  SI_PARAM_TESS_FACTOR_OFFSET);
-	ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
-				   SI_SGPR_TCS_OFFCHIP_LAYOUT, "");
-	ret = LLVMBuildInsertValue(builder, ret, offchip_soffset,
-				   SI_TCS_NUM_USER_SGPR, "");
-	ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
-				   SI_TCS_NUM_USER_SGPR + 1, "");
+	if (ctx->screen->b.chip_class >= GFX9) {
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
+					  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
+					  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
+					  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
+		/* Tess offchip and tess factor offsets are at the beginning. */
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
+		vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
+	} else {
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
+					  GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
+					  GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
+					  GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
+		/* Tess offchip and tess factor offsets are after user SGPRs. */
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
+					  GFX6_TCS_NUM_USER_SGPR);
+		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
+					  GFX6_TCS_NUM_USER_SGPR + 1);
+		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
+	}
 
 	/* VGPRs */
 	rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
 	invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
 	tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
 
-	vgpr = SI_TCS_NUM_USER_SGPR + 2;
 	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
 	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
 	ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
 	ctx->return_value = ret;
 }
 
+/* Pass TCS inputs from LS to TCS on GFX9. */
+static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
+{
+	LLVMValueRef ret = ctx->return_value;
+
+	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+
+	ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
+				  8 + SI_SGPR_VS_STATE_BITS);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
+				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
+				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
+				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
+				  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
+				  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
+
+	unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
+	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
+					   8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
+	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
+					   8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);
+
+	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
+	ret = si_insert_input_ret_float(ctx, ret,
+					ctx->param_tcs_patch_id, vgpr++);
+	ret = si_insert_input_ret_float(ctx, ret,
+					ctx->param_tcs_rel_ids, vgpr++);
+	ctx->return_value = ret;
+}
+
+/* Pass GS inputs from ES to GS on GFX9. */
+static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
+{
+	LLVMValueRef ret = ctx->return_value;
+
+	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
+	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
+
+	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+
+	unsigned desc_param = ctx->param_vs_state_bits + 1;
+	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
+					   8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
+	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
+					   8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);
+
+	unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
+	for (unsigned i = 0; i < 5; i++) {
+		unsigned param = ctx->param_gs_vtx01_offset + i;
+		ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
+	}
+	ctx->return_value = ret;
+}
+
 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -2699,7 +2843,7 @@
 	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
 					      ctx->param_rel_auto_id);
 	LLVMValueRef vertex_dw_stride =
-		unpack_param(ctx, SI_PARAM_VS_STATE_BITS, 24, 8);
+		unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
 						 vertex_dw_stride, "");
 
@@ -2738,6 +2882,9 @@
 				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
 		}
 	}
+
+	if (ctx->screen->b.chip_class >= GFX9)
+		si_set_ls_return_value_for_tcs(ctx);
 }
 
 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -2748,31 +2895,60 @@
 	struct tgsi_shader_info *info = &es->selector->info;
 	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
 					    ctx->param_es2gs_offset);
+	LLVMValueRef lds_base = NULL;
 	unsigned chan;
 	int i;
 
+	if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
+		unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
+		LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
+		LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
+		vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
+					 LLVMBuildMul(gallivm->builder, wave_idx,
+						      LLVMConstInt(ctx->i32, 64, false), ""), "");
+		lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
+					LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
+	}
+
 	for (i = 0; i < info->num_outputs; i++) {
 		LLVMValueRef *out_ptr = ctx->outputs[i];
-		int param_index;
+		int param;
 
 		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
 		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
 			continue;
 
-		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
-							    info->output_semantic_index[i]);
+		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
+						      info->output_semantic_index[i]);
 
 		for (chan = 0; chan < 4; chan++) {
 			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
 			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
 
+			/* GFX9 has the ESGS ring in LDS. */
+			if (ctx->screen->b.chip_class >= GFX9) {
+				lds_store(bld_base, param * 4 + chan, lds_base, out_val);
+				continue;
+			}
+
 			ac_build_buffer_store_dword(&ctx->ac,
 						    ctx->esgs_ring,
 						    out_val, 1, NULL, soffset,
-						    (4 * param_index + chan) * 4,
+						    (4 * param + chan) * 4,
 						    1, 1, true, true);
 		}
 	}
+
+	if (ctx->screen->b.chip_class >= GFX9)
+		si_set_es_return_value_for_gs(ctx);
+}
+
+static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
+{
+	if (ctx->screen->b.chip_class >= GFX9)
+		return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
+	else
+		return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
 }
 
 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -2780,7 +2956,10 @@
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 
 	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
-			 LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID));
+			 si_get_gs_wave_id(ctx));
+
+	if (ctx->screen->b.chip_class >= GFX9)
+		lp_build_endif(&ctx->merged_wrap_if_state);
 }
 
 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -2815,7 +2994,7 @@
 			if (!cond) {
 				/* The state is in the first bit of the user SGPR. */
 				cond = LLVMGetParam(ctx->main_fn,
-						    SI_PARAM_VS_STATE_BITS);
+						    ctx->param_vs_state_bits);
 				cond = LLVMBuildTrunc(gallivm->builder, cond,
 						      ctx->i1, "");
 				lp_build_if(&if_ctx, gallivm, cond);
@@ -2845,19 +3024,25 @@
 			outputs[i].vertex_stream[j] =
 				(info->output_streams[i] >> (2 * j)) & 3;
 		}
-
 	}
 
-	/* Return the primitive ID from the LLVM function. */
-	ctx->return_value =
-		LLVMBuildInsertValue(gallivm->builder,
-				     ctx->return_value,
-				     bitcast(bld_base, TGSI_TYPE_FLOAT,
-					     get_primitive_id(bld_base, 0)),
-				     VS_EPILOG_PRIMID_LOC, "");
-
 	if (ctx->shader->selector->so.num_outputs)
 		si_llvm_emit_streamout(ctx, outputs, i, 0);
+
+	/* Export PrimitiveID. */
+	if (ctx->shader->key.mono.u.vs_export_prim_id) {
+		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
+		outputs[i].semantic_index = 0;
+		outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+					       get_primitive_id(bld_base, 0));
+		for (j = 1; j < 4; j++)
+			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
+
+		memset(outputs[i].vertex_stream, 0,
+		       sizeof(outputs[i].vertex_stream));
+		i++;
+	}
+
 	si_llvm_export_vs(bld_base, outputs, i);
 	FREE(outputs);
 }
@@ -3075,6 +3260,9 @@
 	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
 	LLVMValueRef ret;
 
+	if (ctx->postponed_kill)
+		ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
+
 	/* Read the output values. */
 	for (i = 0; i < info->num_outputs; i++) {
 		unsigned semantic_name = info->output_semantic_name[i];
@@ -3143,44 +3331,6 @@
 	ctx->return_value = ret;
 }
 
-/**
- * Given a v8i32 resource descriptor for a buffer, extract the size of the
- * buffer in number of elements and return it as an i32.
- */
-static LLVMValueRef get_buffer_size(
-	struct lp_build_tgsi_context *bld_base,
-	LLVMValueRef descriptor)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef size =
-		LLVMBuildExtractElement(builder, descriptor,
-					LLVMConstInt(ctx->i32, 2, 0), "");
-
-	if (ctx->screen->b.chip_class == VI) {
-		/* On VI, the descriptor contains the size in bytes,
-		 * but TXQ must return the size in elements.
-		 * The stride is always non-zero for resources using TXQ.
-		 */
-		LLVMValueRef stride =
-			LLVMBuildExtractElement(builder, descriptor,
-						ctx->i32_1, "");
-		stride = LLVMBuildLShr(builder, stride,
-				       LLVMConstInt(ctx->i32, 16, 0), "");
-		stride = LLVMBuildAnd(builder, stride,
-				      LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
-
-		size = LLVMBuildUDiv(builder, size, stride, "");
-	}
-
-	return size;
-}
-
-static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
-				struct lp_build_tgsi_context *bld_base,
-				struct lp_build_emit_data *emit_data);
-
 /* Prevent optimizations (at least of memory accesses) across the current
  * point in the program by emitting empty inline assembly that is marked as
  * having side effects.
@@ -3222,12 +3372,7 @@
 	}
 }
 
-/* Combine these with & instead of |. */
-#define NOOP_WAITCNT 0xf7f
-#define LGKM_CNT 0x07f
-#define VM_CNT 0xf70
-
-static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
+void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
@@ -3260,7 +3405,7 @@
 		waitcnt &= LGKM_CNT;
 
 	if (waitcnt != NOOP_WAITCNT)
-		emit_waitcnt(ctx, waitcnt);
+		si_emit_waitcnt(ctx, waitcnt);
 }
 
 static void clock_emit(
@@ -3282,1765 +3427,12 @@
 		LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
 }
 
-static LLVMValueRef
-shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
-			 const struct tgsi_full_src_register *reg)
-{
-	LLVMValueRef index;
-	LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
-					     SI_PARAM_SHADER_BUFFERS);
-
-	if (!reg->Register.Indirect)
-		index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
-	else
-		index = get_bounded_indirect_index(ctx, &reg->Indirect,
-						   reg->Register.Index,
-						   SI_NUM_SHADER_BUFFERS);
-
-	return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
-}
-
-static bool tgsi_is_array_sampler(unsigned target)
-{
-	return target == TGSI_TEXTURE_1D_ARRAY ||
-	       target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
-	       target == TGSI_TEXTURE_2D_ARRAY ||
-	       target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
-	       target == TGSI_TEXTURE_CUBE_ARRAY ||
-	       target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
-	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
-}
-
-static bool tgsi_is_array_image(unsigned target)
-{
-	return target == TGSI_TEXTURE_3D ||
-	       target == TGSI_TEXTURE_CUBE ||
-	       target == TGSI_TEXTURE_1D_ARRAY ||
-	       target == TGSI_TEXTURE_2D_ARRAY ||
-	       target == TGSI_TEXTURE_CUBE_ARRAY ||
-	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
-}
-
-/**
- * Given a 256-bit resource descriptor, force the DCC enable bit to off.
- *
- * At least on Tonga, executing image stores on images with DCC enabled and
- * non-trivial can eventually lead to lockups. This can occur when an
- * application binds an image as read-only but then uses a shader that writes
- * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
- * program termination) in this case, but it doesn't cost much to be a bit
- * nicer: disabling DCC in the shader still leads to undefined results but
- * avoids the lockup.
- */
-static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
-				  LLVMValueRef rsrc)
-{
-	if (ctx->screen->b.chip_class <= CIK) {
-		return rsrc;
-	} else {
-		LLVMBuilderRef builder = ctx->gallivm.builder;
-		LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
-		LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
-		LLVMValueRef tmp;
-
-		tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
-		tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
-		return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
-	}
-}
-
-static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
+LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
 {
 	return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
 			       CONST_ADDR_SPACE);
 }
 
-static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
-				    LLVMValueRef list, LLVMValueRef index,
-				    unsigned target)
-{
-	LLVMBuilderRef builder = ctx->gallivm.builder;
-
-	if (target == TGSI_TEXTURE_BUFFER) {
-		index = LLVMBuildMul(builder, index,
-				     LLVMConstInt(ctx->i32, 2, 0), "");
-		index = LLVMBuildAdd(builder, index,
-				     ctx->i32_1, "");
-		list = LLVMBuildPointerCast(builder, list,
-					    const_array(ctx->v4i32, 0), "");
-	}
-
-	return ac_build_indexed_load_const(&ctx->ac, list, index);
-}
-
-/**
- * Load the resource descriptor for \p image.
- */
-static void
-image_fetch_rsrc(
-	struct lp_build_tgsi_context *bld_base,
-	const struct tgsi_full_src_register *image,
-	bool is_store, unsigned target,
-	LLVMValueRef *rsrc)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
-					     SI_PARAM_IMAGES);
-	LLVMValueRef index;
-	bool dcc_off = is_store;
-
-	assert(image->Register.File == TGSI_FILE_IMAGE);
-
-	if (!image->Register.Indirect) {
-		const struct tgsi_shader_info *info = bld_base->info;
-		unsigned images_writemask = info->images_store |
-					    info->images_atomic;
-
-		index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
-
-		if (images_writemask & (1 << image->Register.Index))
-			dcc_off = true;
-	} else {
-		/* From the GL_ARB_shader_image_load_store extension spec:
-		 *
-		 *    If a shader performs an image load, store, or atomic
-		 *    operation using an image variable declared as an array,
-		 *    and if the index used to select an individual element is
-		 *    negative or greater than or equal to the size of the
-		 *    array, the results of the operation are undefined but may
-		 *    not lead to termination.
-		 */
-		index = get_bounded_indirect_index(ctx, &image->Indirect,
-						   image->Register.Index,
-						   SI_NUM_IMAGES);
-	}
-
-	*rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
-	if (dcc_off && target != TGSI_TEXTURE_BUFFER)
-		*rsrc = force_dcc_off(ctx, *rsrc);
-}
-
-static LLVMValueRef image_fetch_coords(
-		struct lp_build_tgsi_context *bld_base,
-		const struct tgsi_full_instruction *inst,
-		unsigned src, LLVMValueRef desc)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	unsigned target = inst->Memory.Texture;
-	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
-	LLVMValueRef coords[4];
-	LLVMValueRef tmp;
-	int chan;
-
-	for (chan = 0; chan < num_coords; ++chan) {
-		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
-		tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-		coords[chan] = tmp;
-	}
-
-	if (ctx->screen->b.chip_class >= GFX9) {
-		/* 1D textures are allocated and used as 2D on GFX9. */
-		if (target == TGSI_TEXTURE_1D) {
-			coords[1] = ctx->i32_0;
-			num_coords++;
-		} else if (target == TGSI_TEXTURE_1D_ARRAY) {
-			coords[2] = coords[1];
-			coords[1] = ctx->i32_0;
-			num_coords++;
-		} else if (target == TGSI_TEXTURE_2D) {
-			/* The hw can't bind a slice of a 3D image as a 2D
-			 * image, because it ignores BASE_ARRAY if the target
-			 * is 3D. The workaround is to read BASE_ARRAY and set
-			 * it as the 3rd address operand for all 2D images.
-			 */
-			LLVMValueRef first_layer, const5, mask;
-
-			const5 = LLVMConstInt(ctx->i32, 5, 0);
-			mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
-			first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
-			first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
-
-			coords[2] = first_layer;
-			num_coords++;
-		}
-	}
-
-	if (num_coords == 1)
-		return coords[0];
-
-	if (num_coords == 3) {
-		/* LLVM has difficulties lowering 3-element vectors. */
-		coords[3] = bld_base->uint_bld.undef;
-		num_coords = 4;
-	}
-
-	return lp_build_gather_values(gallivm, coords, num_coords);
-}
-
-/**
- * Append the extra mode bits that are used by image load and store.
- */
-static void image_append_args(
-		struct si_shader_context *ctx,
-		struct lp_build_emit_data * emit_data,
-		unsigned target,
-		bool atomic,
-		bool force_glc)
-{
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
-	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
-	LLVMValueRef r128 = i1false;
-	LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
-	LLVMValueRef glc =
-		force_glc ||
-		inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
-		i1true : i1false;
-	LLVMValueRef slc = i1false;
-	LLVMValueRef lwe = i1false;
-
-	if (atomic || (HAVE_LLVM <= 0x0309)) {
-		emit_data->args[emit_data->arg_count++] = r128;
-		emit_data->args[emit_data->arg_count++] = da;
-		if (!atomic) {
-			emit_data->args[emit_data->arg_count++] = glc;
-		}
-		emit_data->args[emit_data->arg_count++] = slc;
-		return;
-	}
-
-	/* HAVE_LLVM >= 0x0400 */
-	emit_data->args[emit_data->arg_count++] = glc;
-	emit_data->args[emit_data->arg_count++] = slc;
-	emit_data->args[emit_data->arg_count++] = lwe;
-	emit_data->args[emit_data->arg_count++] = da;
-}
-
-/**
- * Append the resource and indexing arguments for buffer intrinsics.
- *
- * \param rsrc the v4i32 buffer resource
- * \param index index into the buffer (stride-based)
- * \param offset byte offset into the buffer
- */
-static void buffer_append_args(
-		struct si_shader_context *ctx,
-		struct lp_build_emit_data *emit_data,
-		LLVMValueRef rsrc,
-		LLVMValueRef index,
-		LLVMValueRef offset,
-		bool atomic,
-		bool force_glc)
-{
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
-	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
-
-	emit_data->args[emit_data->arg_count++] = rsrc;
-	emit_data->args[emit_data->arg_count++] = index; /* vindex */
-	emit_data->args[emit_data->arg_count++] = offset; /* voffset */
-	if (!atomic) {
-		emit_data->args[emit_data->arg_count++] =
-			force_glc ||
-			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
-			i1true : i1false; /* glc */
-	}
-	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
-}
-
-static void load_fetch_args(
-		struct lp_build_tgsi_context * bld_base,
-		struct lp_build_emit_data * emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	unsigned target = inst->Memory.Texture;
-	LLVMValueRef rsrc;
-
-	emit_data->dst_type = ctx->v4f32;
-
-	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-		LLVMBuilderRef builder = gallivm->builder;
-		LLVMValueRef offset;
-		LLVMValueRef tmp;
-
-		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
-
-		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
-		offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-
-		buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
-				   offset, false, false);
-	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
-		LLVMValueRef coords;
-
-		image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
-		coords = image_fetch_coords(bld_base, inst, 1, rsrc);
-
-		if (target == TGSI_TEXTURE_BUFFER) {
-			buffer_append_args(ctx, emit_data, rsrc, coords,
-					   ctx->i32_0, false, false);
-		} else {
-			emit_data->args[0] = coords;
-			emit_data->args[1] = rsrc;
-			emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
-			emit_data->arg_count = 3;
-
-			image_append_args(ctx, emit_data, target, false, false);
-		}
-	}
-}
-
-static unsigned get_load_intr_attribs(bool readonly_memory)
-{
-	/* READNONE means writes can't affect it, while READONLY means that
-	 * writes can affect it. */
-	return readonly_memory && HAVE_LLVM >= 0x0400 ?
-				 LP_FUNC_ATTR_READNONE :
-				 LP_FUNC_ATTR_READONLY;
-}
-
-static unsigned get_store_intr_attribs(bool writeonly_memory)
-{
-	return writeonly_memory && HAVE_LLVM >= 0x0400 ?
-				  LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
-				  LP_FUNC_ATTR_WRITEONLY;
-}
-
-static void load_emit_buffer(struct si_shader_context *ctx,
-			     struct lp_build_emit_data *emit_data,
-			     bool readonly_memory)
-{
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	uint writemask = inst->Dst[0].Register.WriteMask;
-	uint count = util_last_bit(writemask);
-	const char *intrinsic_name;
-	LLVMTypeRef dst_type;
-
-	switch (count) {
-	case 1:
-		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
-		dst_type = ctx->f32;
-		break;
-	case 2:
-		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
-		dst_type = LLVMVectorType(ctx->f32, 2);
-		break;
-	default: // 3 & 4
-		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
-		dst_type = ctx->v4f32;
-		count = 4;
-	}
-
-	emit_data->output[emit_data->chan] = lp_build_intrinsic(
-			builder, intrinsic_name, dst_type,
-			emit_data->args, emit_data->arg_count,
-			get_load_intr_attribs(readonly_memory));
-}
-
-static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
-                                   const struct tgsi_full_instruction *inst,
-                                   LLVMTypeRef type, int arg)
-{
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef offset, ptr;
-	int addr_space;
-
-	offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
-	offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
-
-	ptr = ctx->shared_memory;
-	ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
-	addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
-	ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
-
-	return ptr;
-}
-
-static void load_emit_memory(
-		struct si_shader_context *ctx,
-		struct lp_build_emit_data *emit_data)
-{
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	unsigned writemask = inst->Dst[0].Register.WriteMask;
-	LLVMValueRef channels[4], ptr, derived_ptr, index;
-	int chan;
-
-	ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
-
-	for (chan = 0; chan < 4; ++chan) {
-		if (!(writemask & (1 << chan))) {
-			channels[chan] = LLVMGetUndef(ctx->f32);
-			continue;
-		}
-
-		index = LLVMConstInt(ctx->i32, chan, 0);
-		derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
-		channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
-	}
-	emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
-}
-
-/**
- * Return true if the memory accessed by a LOAD or STORE instruction is
- * read-only or write-only, respectively.
- *
- * \param shader_buffers_reverse_access_mask
- *	For LOAD, set this to (store | atomic) slot usage in the shader.
- *	For STORE, set this to (load | atomic) slot usage in the shader.
- * \param images_reverse_access_mask  Same as above, but for images.
- */
-static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
-				  const struct tgsi_shader_info *info,
-				  unsigned shader_buffers_reverse_access_mask,
-				  unsigned images_reverse_access_mask)
-{
-	/* RESTRICT means NOALIAS.
-	 * If there are no writes, we can assume the accessed memory is read-only.
-	 * If there are no reads, we can assume the accessed memory is write-only.
-	 */
-	if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
-		unsigned reverse_access_mask;
-
-		if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-			reverse_access_mask = shader_buffers_reverse_access_mask;
-		} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-			reverse_access_mask = info->images_buffers &
-					      images_reverse_access_mask;
-		} else {
-			reverse_access_mask = ~info->images_buffers &
-					      images_reverse_access_mask;
-		}
-
-		if (inst->Src[0].Register.Indirect) {
-			if (!reverse_access_mask)
-				return true;
-		} else {
-			if (!(reverse_access_mask &
-			      (1u << inst->Src[0].Register.Index)))
-				return true;
-		}
-	}
-
-	/* If there are no buffer writes (for both shader buffers & image
-	 * buffers), it implies that buffer memory is read-only.
-	 * If there are no buffer reads (for both shader buffers & image
-	 * buffers), it implies that buffer memory is write-only.
-	 *
-	 * Same for the case when there are no writes/reads for non-buffer
-	 * images.
-	 */
-	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
-	    (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
-	     inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
-		if (!shader_buffers_reverse_access_mask &&
-		    !(info->images_buffers & images_reverse_access_mask))
-			return true;
-	} else {
-		if (!(~info->images_buffers & images_reverse_access_mask))
-			return true;
-	}
-	return false;
-}
-
-static void load_emit(
-		const struct lp_build_tgsi_action *action,
-		struct lp_build_tgsi_context *bld_base,
-		struct lp_build_emit_data *emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
-	char intrinsic_name[64];
-	bool readonly_memory = false;
-
-	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
-		load_emit_memory(ctx, emit_data);
-		return;
-	}
-
-	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
-		emit_waitcnt(ctx, VM_CNT);
-
-	readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
-			  is_oneway_access_only(inst, info,
-						info->shader_buffers_store |
-						info->shader_buffers_atomic,
-						info->images_store |
-						info->images_atomic);
-
-	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-		load_emit_buffer(ctx, emit_data, readonly_memory);
-		return;
-	}
-
-	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-		emit_data->output[emit_data->chan] =
-			lp_build_intrinsic(
-				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
-				emit_data->args, emit_data->arg_count,
-				get_load_intr_attribs(readonly_memory));
-	} else {
-		ac_get_image_intr_name("llvm.amdgcn.image.load",
-				       emit_data->dst_type,		/* vdata */
-				       LLVMTypeOf(emit_data->args[0]), /* coords */
-				       LLVMTypeOf(emit_data->args[1]), /* rsrc */
-				       intrinsic_name, sizeof(intrinsic_name));
-
-		emit_data->output[emit_data->chan] =
-			lp_build_intrinsic(
-				builder, intrinsic_name, emit_data->dst_type,
-				emit_data->args, emit_data->arg_count,
-				get_load_intr_attribs(readonly_memory));
-	}
-}
-
-static void store_fetch_args(
-		struct lp_build_tgsi_context * bld_base,
-		struct lp_build_emit_data * emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	struct tgsi_full_src_register memory;
-	LLVMValueRef chans[4];
-	LLVMValueRef data;
-	LLVMValueRef rsrc;
-	unsigned chan;
-
-	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
-
-	for (chan = 0; chan < 4; ++chan) {
-		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
-	}
-	data = lp_build_gather_values(gallivm, chans, 4);
-
-	emit_data->args[emit_data->arg_count++] = data;
-
-	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
-
-	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
-		LLVMValueRef offset;
-		LLVMValueRef tmp;
-
-		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
-
-		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
-		offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-
-		buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
-				   offset, false, false);
-	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
-		unsigned target = inst->Memory.Texture;
-		LLVMValueRef coords;
-
-		/* 8bit/16bit TC L1 write corruption bug on SI.
-		 * All store opcodes not aligned to a dword are affected.
-		 *
-		 * The only way to get unaligned stores in radeonsi is through
-		 * shader images.
-		 */
-		bool force_glc = ctx->screen->b.chip_class == SI;
-
-		image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
-		coords = image_fetch_coords(bld_base, inst, 0, rsrc);
-
-		if (target == TGSI_TEXTURE_BUFFER) {
-			buffer_append_args(ctx, emit_data, rsrc, coords,
-					   ctx->i32_0, false, force_glc);
-		} else {
-			emit_data->args[1] = coords;
-			emit_data->args[2] = rsrc;
-			emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
-			emit_data->arg_count = 4;
-
-			image_append_args(ctx, emit_data, target, false, force_glc);
-		}
-	}
-}
-
-static void store_emit_buffer(
-		struct si_shader_context *ctx,
-		struct lp_build_emit_data *emit_data,
-		bool writeonly_memory)
-{
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef base_data = emit_data->args[0];
-	LLVMValueRef base_offset = emit_data->args[3];
-	unsigned writemask = inst->Dst[0].Register.WriteMask;
-
-	while (writemask) {
-		int start, count;
-		const char *intrinsic_name;
-		LLVMValueRef data;
-		LLVMValueRef offset;
-		LLVMValueRef tmp;
-
-		u_bit_scan_consecutive_range(&writemask, &start, &count);
-
-		/* Due to an LLVM limitation, split 3-element writes
-		 * into a 2-element and a 1-element write. */
-		if (count == 3) {
-			writemask |= 1 << (start + 2);
-			count = 2;
-		}
-
-		if (count == 4) {
-			data = base_data;
-			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
-		} else if (count == 2) {
-			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
-
-			tmp = LLVMBuildExtractElement(
-				builder, base_data,
-				LLVMConstInt(ctx->i32, start, 0), "");
-			data = LLVMBuildInsertElement(
-				builder, LLVMGetUndef(v2f32), tmp,
-				ctx->i32_0, "");
-
-			tmp = LLVMBuildExtractElement(
-				builder, base_data,
-				LLVMConstInt(ctx->i32, start + 1, 0), "");
-			data = LLVMBuildInsertElement(
-				builder, data, tmp, ctx->i32_1, "");
-
-			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
-		} else {
-			assert(count == 1);
-			data = LLVMBuildExtractElement(
-				builder, base_data,
-				LLVMConstInt(ctx->i32, start, 0), "");
-			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
-		}
-
-		offset = base_offset;
-		if (start != 0) {
-			offset = LLVMBuildAdd(
-				builder, offset,
-				LLVMConstInt(ctx->i32, start * 4, 0), "");
-		}
-
-		emit_data->args[0] = data;
-		emit_data->args[3] = offset;
-
-		lp_build_intrinsic(
-			builder, intrinsic_name, emit_data->dst_type,
-			emit_data->args, emit_data->arg_count,
-			get_store_intr_attribs(writeonly_memory));
-	}
-}
-
-static void store_emit_memory(
-		struct si_shader_context *ctx,
-		struct lp_build_emit_data *emit_data)
-{
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	unsigned writemask = inst->Dst[0].Register.WriteMask;
-	LLVMValueRef ptr, derived_ptr, data, index;
-	int chan;
-
-	ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
-
-	for (chan = 0; chan < 4; ++chan) {
-		if (!(writemask & (1 << chan))) {
-			continue;
-		}
-		data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
-		index = LLVMConstInt(ctx->i32, chan, 0);
-		derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
-		LLVMBuildStore(builder, data, derived_ptr);
-	}
-}
-
-static void store_emit(
-		const struct lp_build_tgsi_action *action,
-		struct lp_build_tgsi_context *bld_base,
-		struct lp_build_emit_data *emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
-	unsigned target = inst->Memory.Texture;
-	char intrinsic_name[64];
-	bool writeonly_memory = false;
-
-	if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
-		store_emit_memory(ctx, emit_data);
-		return;
-	}
-
-	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
-		emit_waitcnt(ctx, VM_CNT);
-
-	writeonly_memory = is_oneway_access_only(inst, info,
-						 info->shader_buffers_load |
-						 info->shader_buffers_atomic,
-						 info->images_load |
-						 info->images_atomic);
-
-	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
-		store_emit_buffer(ctx, emit_data, writeonly_memory);
-		return;
-	}
-
-	if (target == TGSI_TEXTURE_BUFFER) {
-		emit_data->output[emit_data->chan] = lp_build_intrinsic(
-			builder, "llvm.amdgcn.buffer.store.format.v4f32",
-			emit_data->dst_type, emit_data->args,
-			emit_data->arg_count,
-			get_store_intr_attribs(writeonly_memory));
-	} else {
-		ac_get_image_intr_name("llvm.amdgcn.image.store",
-				       LLVMTypeOf(emit_data->args[0]), /* vdata */
-				       LLVMTypeOf(emit_data->args[1]), /* coords */
-				       LLVMTypeOf(emit_data->args[2]), /* rsrc */
-				       intrinsic_name, sizeof(intrinsic_name));
-
-		emit_data->output[emit_data->chan] =
-			lp_build_intrinsic(
-				builder, intrinsic_name, emit_data->dst_type,
-				emit_data->args, emit_data->arg_count,
-				get_store_intr_attribs(writeonly_memory));
-	}
-}
-
-static void atomic_fetch_args(
-		struct lp_build_tgsi_context * bld_base,
-		struct lp_build_emit_data * emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	LLVMValueRef data1, data2;
-	LLVMValueRef rsrc;
-	LLVMValueRef tmp;
-
-	emit_data->dst_type = ctx->f32;
-
-	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
-	data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-
-	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
-		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
-		data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-	}
-
-	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
-	 * of arguments, which is reversed relative to TGSI (and GLSL)
-	 */
-	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
-		emit_data->args[emit_data->arg_count++] = data2;
-	emit_data->args[emit_data->arg_count++] = data1;
-
-	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-		LLVMValueRef offset;
-
-		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
-
-		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
-		offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-
-		buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
-				   offset, true, false);
-	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
-		unsigned target = inst->Memory.Texture;
-		LLVMValueRef coords;
-
-		image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
-		coords = image_fetch_coords(bld_base, inst, 1, rsrc);
-
-		if (target == TGSI_TEXTURE_BUFFER) {
-			buffer_append_args(ctx, emit_data, rsrc, coords,
-					   ctx->i32_0, true, false);
-		} else {
-			emit_data->args[emit_data->arg_count++] = coords;
-			emit_data->args[emit_data->arg_count++] = rsrc;
-
-			image_append_args(ctx, emit_data, target, true, false);
-		}
-	}
-}
-
-static void atomic_emit_memory(struct si_shader_context *ctx,
-                               struct lp_build_emit_data *emit_data) {
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	LLVMValueRef ptr, result, arg;
-
-	ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
-
-	arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
-	arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
-
-	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
-		LLVMValueRef new_data;
-		new_data = lp_build_emit_fetch(&ctx->bld_base,
-		                               inst, 3, 0);
-
-		new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
-
-#if HAVE_LLVM >= 0x309
-		result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
-		                       LLVMAtomicOrderingSequentiallyConsistent,
-		                       LLVMAtomicOrderingSequentiallyConsistent,
-		                       false);
-#endif
-
-		result = LLVMBuildExtractValue(builder, result, 0, "");
-	} else {
-		LLVMAtomicRMWBinOp op;
-
-		switch(inst->Instruction.Opcode) {
-			case TGSI_OPCODE_ATOMUADD:
-				op = LLVMAtomicRMWBinOpAdd;
-				break;
-			case TGSI_OPCODE_ATOMXCHG:
-				op = LLVMAtomicRMWBinOpXchg;
-				break;
-			case TGSI_OPCODE_ATOMAND:
-				op = LLVMAtomicRMWBinOpAnd;
-				break;
-			case TGSI_OPCODE_ATOMOR:
-				op = LLVMAtomicRMWBinOpOr;
-				break;
-			case TGSI_OPCODE_ATOMXOR:
-				op = LLVMAtomicRMWBinOpXor;
-				break;
-			case TGSI_OPCODE_ATOMUMIN:
-				op = LLVMAtomicRMWBinOpUMin;
-				break;
-			case TGSI_OPCODE_ATOMUMAX:
-				op = LLVMAtomicRMWBinOpUMax;
-				break;
-			case TGSI_OPCODE_ATOMIMIN:
-				op = LLVMAtomicRMWBinOpMin;
-				break;
-			case TGSI_OPCODE_ATOMIMAX:
-				op = LLVMAtomicRMWBinOpMax;
-				break;
-			default:
-				unreachable("unknown atomic opcode");
-		}
-
-		result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
-		                       LLVMAtomicOrderingSequentiallyConsistent,
-		                       false);
-	}
-	emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
-}
-
-static void atomic_emit(
-		const struct lp_build_tgsi_action *action,
-		struct lp_build_tgsi_context *bld_base,
-		struct lp_build_emit_data *emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	char intrinsic_name[40];
-	LLVMValueRef tmp;
-
-	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
-		atomic_emit_memory(ctx, emit_data);
-		return;
-	}
-
-	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
-	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-		snprintf(intrinsic_name, sizeof(intrinsic_name),
-			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
-	} else {
-		LLVMValueRef coords;
-		char coords_type[8];
-
-		if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
-			coords = emit_data->args[2];
-		else
-			coords = emit_data->args[1];
-
-		ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
-		snprintf(intrinsic_name, sizeof(intrinsic_name),
-			 "llvm.amdgcn.image.atomic.%s.%s",
-			 action->intr_name, coords_type);
-	}
-
-	tmp = lp_build_intrinsic(
-		builder, intrinsic_name, ctx->i32,
-		emit_data->args, emit_data->arg_count, 0);
-	emit_data->output[emit_data->chan] =
-		LLVMBuildBitCast(builder, tmp, ctx->f32, "");
-}
-
-static void set_tex_fetch_args(struct si_shader_context *ctx,
-			       struct lp_build_emit_data *emit_data,
-			       unsigned target,
-			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
-			       LLVMValueRef *param, unsigned count,
-			       unsigned dmask)
-{
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	struct ac_image_args args = {};
-
-	/* Pad to power of two vector */
-	while (count < util_next_power_of_two(count))
-		param[count++] = LLVMGetUndef(ctx->i32);
-
-	if (count > 1)
-		args.addr = lp_build_gather_values(gallivm, param, count);
-	else
-		args.addr = param[0];
-
-	args.resource = res_ptr;
-	args.sampler = samp_ptr;
-	args.dmask = dmask;
-	args.unorm = target == TGSI_TEXTURE_RECT ||
-		     target == TGSI_TEXTURE_SHADOWRECT;
-	args.da = tgsi_is_array_sampler(target);
-
-	/* Ugly, but we seem to have no other choice right now. */
-	STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
-	memcpy(emit_data->args, &args, sizeof(args));
-}
-
-static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
-				unsigned target, LLVMValueRef out)
-{
-	LLVMBuilderRef builder = ctx->gallivm.builder;
-
-	/* 1D textures are allocated and used as 2D on GFX9. */
-        if (ctx->screen->b.chip_class >= GFX9 &&
-	    (target == TGSI_TEXTURE_1D_ARRAY ||
-	     target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
-		LLVMValueRef layers =
-			LLVMBuildExtractElement(builder, out,
-						LLVMConstInt(ctx->i32, 2, 0), "");
-		out = LLVMBuildInsertElement(builder, out, layers,
-					     ctx->i32_1, "");
-	}
-
-	/* Divide the number of layers by 6 to get the number of cubes. */
-	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
-	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-		LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
-
-		LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
-		z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
-
-		out = LLVMBuildInsertElement(builder, out, z, imm2, "");
-	}
-	return out;
-}
-
-static void resq_fetch_args(
-		struct lp_build_tgsi_context * bld_base,
-		struct lp_build_emit_data * emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	const struct tgsi_full_src_register *reg = &inst->Src[0];
-
-	emit_data->dst_type = ctx->v4i32;
-
-	if (reg->Register.File == TGSI_FILE_BUFFER) {
-		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
-		emit_data->arg_count = 1;
-	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-		image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
-				 &emit_data->args[0]);
-		emit_data->arg_count = 1;
-	} else {
-		LLVMValueRef res_ptr;
-		unsigned image_target;
-
-		if (inst->Memory.Texture == TGSI_TEXTURE_3D)
-			image_target = TGSI_TEXTURE_2D_ARRAY;
-		else
-			image_target = inst->Memory.Texture;
-
-		image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
-				 &res_ptr);
-		set_tex_fetch_args(ctx, emit_data, image_target,
-				   res_ptr, NULL, &ctx->i32_0, 1,
-				   0xf);
-	}
-}
-
-static void resq_emit(
-		const struct lp_build_tgsi_action *action,
-		struct lp_build_tgsi_context *bld_base,
-		struct lp_build_emit_data *emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	LLVMValueRef out;
-
-	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-		out = LLVMBuildExtractElement(builder, emit_data->args[0],
-					      LLVMConstInt(ctx->i32, 2, 0), "");
-	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-		out = get_buffer_size(bld_base, emit_data->args[0]);
-	} else {
-		struct ac_image_args args;
-
-		memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
-		args.opcode = ac_image_get_resinfo;
-		out = ac_build_image_opcode(&ctx->ac, &args);
-
-		out = fix_resinfo(ctx, inst->Memory.Texture, out);
-	}
-
-	emit_data->output[emit_data->chan] = out;
-}
-
-static const struct lp_build_tgsi_action tex_action;
-
-enum desc_type {
-	DESC_IMAGE,
-	DESC_BUFFER,
-	DESC_FMASK,
-	DESC_SAMPLER,
-};
-
-/**
- * Load an image view, fmask view. or sampler state descriptor.
- */
-static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
-				      LLVMValueRef list, LLVMValueRef index,
-				      enum desc_type type)
-{
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-
-	switch (type) {
-	case DESC_IMAGE:
-		/* The image is at [0:7]. */
-		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
-		break;
-	case DESC_BUFFER:
-		/* The buffer is in [4:7]. */
-		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
-		index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
-		list = LLVMBuildPointerCast(builder, list,
-					    const_array(ctx->v4i32, 0), "");
-		break;
-	case DESC_FMASK:
-		/* The FMASK is at [8:15]. */
-		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
-		index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
-		break;
-	case DESC_SAMPLER:
-		/* The sampler state is at [12:15]. */
-		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
-		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
-		list = LLVMBuildPointerCast(builder, list,
-					    const_array(ctx->v4i32, 0), "");
-		break;
-	}
-
-	return ac_build_indexed_load_const(&ctx->ac, list, index);
-}
-
-/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
- *
- * SI-CI:
- *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
- *   filtering manually. The driver sets img7 to a mask clearing
- *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
- *     s_and_b32 samp0, samp0, img7
- *
- * VI:
- *   The ANISO_OVERRIDE sampler field enables this fix in TA.
- */
-static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
-					   LLVMValueRef res, LLVMValueRef samp)
-{
-	LLVMBuilderRef builder = ctx->gallivm.builder;
-	LLVMValueRef img7, samp0;
-
-	if (ctx->screen->b.chip_class >= VI)
-		return samp;
-
-	img7 = LLVMBuildExtractElement(builder, res,
-				       LLVMConstInt(ctx->i32, 7, 0), "");
-	samp0 = LLVMBuildExtractElement(builder, samp,
-					ctx->i32_0, "");
-	samp0 = LLVMBuildAnd(builder, samp0, img7, "");
-	return LLVMBuildInsertElement(builder, samp, samp0,
-				      ctx->i32_0, "");
-}
-
-static void tex_fetch_ptrs(
-	struct lp_build_tgsi_context *bld_base,
-	struct lp_build_emit_data *emit_data,
-	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	LLVMValueRef list = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLERS);
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	const struct tgsi_full_src_register *reg;
-	unsigned target = inst->Texture.Texture;
-	unsigned sampler_src;
-	LLVMValueRef index;
-
-	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
-	reg = &emit_data->inst->Src[sampler_src];
-
-	if (reg->Register.Indirect) {
-		index = get_bounded_indirect_index(ctx,
-						   &reg->Indirect,
-						   reg->Register.Index,
-						   SI_NUM_SAMPLERS);
-	} else {
-		index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
-	}
-
-	if (target == TGSI_TEXTURE_BUFFER)
-		*res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
-	else
-		*res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
-
-	if (samp_ptr)
-		*samp_ptr = NULL;
-	if (fmask_ptr)
-		*fmask_ptr = NULL;
-
-	if (target == TGSI_TEXTURE_2D_MSAA ||
-	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
-		if (fmask_ptr)
-			*fmask_ptr = load_sampler_desc(ctx, list, index,
-						       DESC_FMASK);
-	} else if (target != TGSI_TEXTURE_BUFFER) {
-		if (samp_ptr) {
-			*samp_ptr = load_sampler_desc(ctx, list, index,
-						      DESC_SAMPLER);
-			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
-		}
-	}
-}
-
-static void txq_fetch_args(
-	struct lp_build_tgsi_context *bld_base,
-	struct lp_build_emit_data *emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	unsigned target = inst->Texture.Texture;
-	LLVMValueRef res_ptr;
-	LLVMValueRef address;
-
-	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
-
-	if (target == TGSI_TEXTURE_BUFFER) {
-		/* Read the size from the buffer descriptor directly. */
-		emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
-		return;
-	}
-
-	/* Textures - set the mip level. */
-	address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
-
-	set_tex_fetch_args(ctx, emit_data, target, res_ptr,
-			   NULL, &address, 1, 0xf);
-}
-
-static void txq_emit(const struct lp_build_tgsi_action *action,
-		     struct lp_build_tgsi_context *bld_base,
-		     struct lp_build_emit_data *emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct ac_image_args args;
-	unsigned target = emit_data->inst->Texture.Texture;
-
-	if (target == TGSI_TEXTURE_BUFFER) {
-		/* Just return the buffer size. */
-		emit_data->output[emit_data->chan] = emit_data->args[0];
-		return;
-	}
-
-	memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
-
-	args.opcode = ac_image_get_resinfo;
-	LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
-
-	emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
-}
-
-static void tex_fetch_args(
-	struct lp_build_tgsi_context *bld_base,
-	struct lp_build_emit_data *emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	unsigned opcode = inst->Instruction.Opcode;
-	unsigned target = inst->Texture.Texture;
-	LLVMValueRef coords[5], derivs[6];
-	LLVMValueRef address[16];
-	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
-	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
-	unsigned count = 0;
-	unsigned chan;
-	unsigned num_deriv_channels = 0;
-	bool has_offset = inst->Texture.NumOffsets > 0;
-	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
-	unsigned dmask = 0xf;
-
-	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
-
-	if (target == TGSI_TEXTURE_BUFFER) {
-		emit_data->dst_type = ctx->v4f32;
-		emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
-						      ctx->v16i8, "");
-		emit_data->args[1] = ctx->i32_0;
-		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
-		emit_data->arg_count = 3;
-		return;
-	}
-
-	/* Fetch and project texture coordinates */
-	coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-	for (chan = 0; chan < 3; chan++ ) {
-		coords[chan] = lp_build_emit_fetch(bld_base,
-						   emit_data->inst, 0,
-						   chan);
-		if (opcode == TGSI_OPCODE_TXP)
-			coords[chan] = lp_build_emit_llvm_binary(bld_base,
-								 TGSI_OPCODE_DIV,
-								 coords[chan],
-								 coords[3]);
-	}
-
-	if (opcode == TGSI_OPCODE_TXP)
-		coords[3] = bld_base->base.one;
-
-	/* Pack offsets. */
-	if (has_offset &&
-	    opcode != TGSI_OPCODE_TXF &&
-	    opcode != TGSI_OPCODE_TXF_LZ) {
-		/* The offsets are six-bit signed integers packed like this:
-		 *   X=[5:0], Y=[13:8], and Z=[21:16].
-		 */
-		LLVMValueRef offset[3], pack;
-
-		assert(inst->Texture.NumOffsets == 1);
-
-		for (chan = 0; chan < 3; chan++) {
-			offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
-								     emit_data->inst, 0, chan);
-			offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
-						    LLVMConstInt(ctx->i32, 0x3f, 0), "");
-			if (chan)
-				offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
-							    LLVMConstInt(ctx->i32, chan*8, 0), "");
-		}
-
-		pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
-		pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
-		address[count++] = pack;
-	}
-
-	/* Pack LOD bias value */
-	if (opcode == TGSI_OPCODE_TXB)
-		address[count++] = coords[3];
-	if (opcode == TGSI_OPCODE_TXB2)
-		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
-
-	/* Pack depth comparison value */
-	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
-		LLVMValueRef z;
-
-		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-			z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
-		} else {
-			assert(ref_pos >= 0);
-			z = coords[ref_pos];
-		}
-
-		/* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
-		 * so the depth comparison value isn't clamped for Z16 and
-		 * Z24 anymore. Do it manually here.
-		 *
-		 * It's unnecessary if the original texture format was
-		 * Z32_FLOAT, but we don't know that here.
-		 */
-		if (ctx->screen->b.chip_class == VI)
-			z = ac_build_clamp(&ctx->ac, z);
-
-		address[count++] = z;
-	}
-
-	/* Pack user derivatives */
-	if (opcode == TGSI_OPCODE_TXD) {
-		int param, num_src_deriv_channels, num_dst_deriv_channels;
-
-		switch (target) {
-		case TGSI_TEXTURE_3D:
-			num_src_deriv_channels = 3;
-			num_dst_deriv_channels = 3;
-			num_deriv_channels = 3;
-			break;
-		case TGSI_TEXTURE_2D:
-		case TGSI_TEXTURE_SHADOW2D:
-		case TGSI_TEXTURE_RECT:
-		case TGSI_TEXTURE_SHADOWRECT:
-		case TGSI_TEXTURE_2D_ARRAY:
-		case TGSI_TEXTURE_SHADOW2D_ARRAY:
-			num_src_deriv_channels = 2;
-			num_dst_deriv_channels = 2;
-			num_deriv_channels = 2;
-			break;
-		case TGSI_TEXTURE_CUBE:
-		case TGSI_TEXTURE_SHADOWCUBE:
-		case TGSI_TEXTURE_CUBE_ARRAY:
-		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
-			/* Cube derivatives will be converted to 2D. */
-			num_src_deriv_channels = 3;
-			num_dst_deriv_channels = 3;
-			num_deriv_channels = 2;
-			break;
-		case TGSI_TEXTURE_1D:
-		case TGSI_TEXTURE_SHADOW1D:
-		case TGSI_TEXTURE_1D_ARRAY:
-		case TGSI_TEXTURE_SHADOW1D_ARRAY:
-			num_src_deriv_channels = 1;
-
-			/* 1D textures are allocated and used as 2D on GFX9. */
-			if (ctx->screen->b.chip_class >= GFX9) {
-				num_dst_deriv_channels = 2;
-				num_deriv_channels = 2;
-			} else {
-				num_dst_deriv_channels = 1;
-				num_deriv_channels = 1;
-			}
-			break;
-		default:
-			unreachable("invalid target");
-		}
-
-		for (param = 0; param < 2; param++) {
-			for (chan = 0; chan < num_src_deriv_channels; chan++)
-				derivs[param * num_dst_deriv_channels + chan] =
-					lp_build_emit_fetch(bld_base, inst, param+1, chan);
-
-			/* Fill in the rest with zeros. */
-			for (chan = num_src_deriv_channels;
-			     chan < num_dst_deriv_channels; chan++)
-				derivs[param * num_dst_deriv_channels + chan] =
-					bld_base->base.zero;
-		}
-	}
-
-	if (target == TGSI_TEXTURE_CUBE ||
-	    target == TGSI_TEXTURE_CUBE_ARRAY ||
-	    target == TGSI_TEXTURE_SHADOWCUBE ||
-	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
-		ac_prepare_cube_coords(&ctx->ac,
-				       opcode == TGSI_OPCODE_TXD,
-				       target == TGSI_TEXTURE_CUBE_ARRAY ||
-				       target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
-				       coords, derivs);
-
-	if (opcode == TGSI_OPCODE_TXD)
-		for (int i = 0; i < num_deriv_channels * 2; i++)
-			address[count++] = derivs[i];
-
-	/* Pack texture coordinates */
-	address[count++] = coords[0];
-	if (num_coords > 1)
-		address[count++] = coords[1];
-	if (num_coords > 2)
-		address[count++] = coords[2];
-
-	/* 1D textures are allocated and used as 2D on GFX9. */
-	if (ctx->screen->b.chip_class >= GFX9) {
-		LLVMValueRef filler;
-
-		/* Use 0.5, so that we don't sample the border color. */
-		if (opcode == TGSI_OPCODE_TXF)
-			filler = ctx->i32_0;
-		else
-			filler = LLVMConstReal(ctx->f32, 0.5);
-
-		if (target == TGSI_TEXTURE_1D ||
-		    target == TGSI_TEXTURE_SHADOW1D) {
-			address[count++] = filler;
-		} else if (target == TGSI_TEXTURE_1D_ARRAY ||
-			   target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
-			address[count] = address[count - 1];
-			address[count - 1] = filler;
-			count++;
-		}
-	}
-
-	/* Pack LOD or sample index */
-	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
-		address[count++] = coords[3];
-	else if (opcode == TGSI_OPCODE_TXL2)
-		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
-
-	if (count > 16) {
-		assert(!"Cannot handle more than 16 texture address parameters");
-		count = 16;
-	}
-
-	for (chan = 0; chan < count; chan++ ) {
-		address[chan] = LLVMBuildBitCast(gallivm->builder,
-						 address[chan], ctx->i32, "");
-	}
-
-	/* Adjust the sample index according to FMASK.
-	 *
-	 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
-	 * which is the identity mapping. Each nibble says which physical sample
-	 * should be fetched to get that sample.
-	 *
-	 * For example, 0x11111100 means there are only 2 samples stored and
-	 * the second sample covers 3/4 of the pixel. When reading samples 0
-	 * and 1, return physical sample 0 (determined by the first two 0s
-	 * in FMASK), otherwise return physical sample 1.
-	 *
-	 * The sample index should be adjusted as follows:
-	 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
-	 */
-	if (target == TGSI_TEXTURE_2D_MSAA ||
-	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
-		struct lp_build_emit_data txf_emit_data = *emit_data;
-		LLVMValueRef txf_address[4];
-		/* We only need .xy for non-arrays, and .xyz for arrays. */
-		unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
-		struct tgsi_full_instruction inst = {};
-
-		memcpy(txf_address, address, sizeof(txf_address));
-
-		/* Read FMASK using TXF_LZ. */
-		inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
-		inst.Texture.Texture = target;
-		txf_emit_data.inst = &inst;
-		txf_emit_data.chan = 0;
-		set_tex_fetch_args(ctx, &txf_emit_data,
-				   target, fmask_ptr, NULL,
-				   txf_address, txf_count, 0xf);
-		build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
-
-		/* Initialize some constants. */
-		LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
-		LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
-
-		/* Apply the formula. */
-		LLVMValueRef fmask =
-			LLVMBuildExtractElement(gallivm->builder,
-						txf_emit_data.output[0],
-						ctx->i32_0, "");
-
-		unsigned sample_chan = txf_count; /* the sample index is last */
-
-		LLVMValueRef sample_index4 =
-			LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
-
-		LLVMValueRef shifted_fmask =
-			LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
-
-		LLVMValueRef final_sample =
-			LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
-
-		/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
-		 * resource descriptor is 0 (invalid),
-		 */
-		LLVMValueRef fmask_desc =
-			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
-					 ctx->v8i32, "");
-
-		LLVMValueRef fmask_word1 =
-			LLVMBuildExtractElement(gallivm->builder, fmask_desc,
-						ctx->i32_1, "");
-
-		LLVMValueRef word1_is_nonzero =
-			LLVMBuildICmp(gallivm->builder, LLVMIntNE,
-				      fmask_word1, ctx->i32_0, "");
-
-		/* Replace the MSAA sample index. */
-		address[sample_chan] =
-			LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
-					final_sample, address[sample_chan], "");
-	}
-
-	if (opcode == TGSI_OPCODE_TXF ||
-	    opcode == TGSI_OPCODE_TXF_LZ) {
-		/* add tex offsets */
-		if (inst->Texture.NumOffsets) {
-			struct lp_build_context *uint_bld = &bld_base->uint_bld;
-			const struct tgsi_texture_offset *off = inst->TexOffsets;
-
-			assert(inst->Texture.NumOffsets == 1);
-
-			switch (target) {
-			case TGSI_TEXTURE_3D:
-				address[2] = lp_build_add(uint_bld, address[2],
-						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
-				/* fall through */
-			case TGSI_TEXTURE_2D:
-			case TGSI_TEXTURE_SHADOW2D:
-			case TGSI_TEXTURE_RECT:
-			case TGSI_TEXTURE_SHADOWRECT:
-			case TGSI_TEXTURE_2D_ARRAY:
-			case TGSI_TEXTURE_SHADOW2D_ARRAY:
-				address[1] =
-					lp_build_add(uint_bld, address[1],
-						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
-				/* fall through */
-			case TGSI_TEXTURE_1D:
-			case TGSI_TEXTURE_SHADOW1D:
-			case TGSI_TEXTURE_1D_ARRAY:
-			case TGSI_TEXTURE_SHADOW1D_ARRAY:
-				address[0] =
-					lp_build_add(uint_bld, address[0],
-						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
-				break;
-				/* texture offsets do not apply to other texture targets */
-			}
-		}
-	}
-
-	if (opcode == TGSI_OPCODE_TG4) {
-		unsigned gather_comp = 0;
-
-		/* DMASK was repurposed for GATHER4. 4 components are always
-		 * returned and DMASK works like a swizzle - it selects
-		 * the component to fetch. The only valid DMASK values are
-		 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
-		 * (red,red,red,red) etc.) The ISA document doesn't mention
-		 * this.
-		 */
-
-		/* Get the component index from src1.x for Gather4. */
-		if (!tgsi_is_shadow_target(target)) {
-			LLVMValueRef comp_imm;
-			struct tgsi_src_register src1 = inst->Src[1].Register;
-
-			assert(src1.File == TGSI_FILE_IMMEDIATE);
-
-			comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
-			gather_comp = LLVMConstIntGetZExtValue(comp_imm);
-			gather_comp = CLAMP(gather_comp, 0, 3);
-		}
-
-		dmask = 1 << gather_comp;
-	}
-
-	set_tex_fetch_args(ctx, emit_data, target, res_ptr,
-			   samp_ptr, address, count, dmask);
-}
-
-/* Gather4 should follow the same rules as bilinear filtering, but the hardware
- * incorrectly forces nearest filtering if the texture format is integer.
- * The only effect it has on Gather4, which always returns 4 texels for
- * bilinear filtering, is that the final coordinates are off by 0.5 of
- * the texel size.
- *
- * The workaround is to subtract 0.5 from the unnormalized coordinates,
- * or (0.5 / size) from the normalized coordinates.
- */
-static void si_lower_gather4_integer(struct si_shader_context *ctx,
-				     struct ac_image_args *args,
-				     unsigned target)
-{
-	LLVMBuilderRef builder = ctx->gallivm.builder;
-	LLVMValueRef coord = args->addr;
-	LLVMValueRef half_texel[2];
-	/* Texture coordinates start after:
-	 *   {offset, bias, z-compare, derivatives}
-	 * Only the offset and z-compare can occur here.
-	 */
-	unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
-	int c;
-
-	if (target == TGSI_TEXTURE_RECT ||
-	    target == TGSI_TEXTURE_SHADOWRECT) {
-		half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
-	} else {
-		struct tgsi_full_instruction txq_inst = {};
-		struct lp_build_emit_data txq_emit_data = {};
-
-		/* Query the texture size. */
-		txq_inst.Texture.Texture = target;
-		txq_emit_data.inst = &txq_inst;
-		txq_emit_data.dst_type = ctx->v4i32;
-		set_tex_fetch_args(ctx, &txq_emit_data, target,
-				   args->resource, NULL, &ctx->i32_0,
-				   1, 0xf);
-		txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
-
-		/* Compute -0.5 / size. */
-		for (c = 0; c < 2; c++) {
-			half_texel[c] =
-				LLVMBuildExtractElement(builder, txq_emit_data.output[0],
-							LLVMConstInt(ctx->i32, c, 0), "");
-			half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
-			half_texel[c] =
-				lp_build_emit_llvm_unary(&ctx->bld_base,
-							 TGSI_OPCODE_RCP, half_texel[c]);
-			half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
-						      LLVMConstReal(ctx->f32, -0.5), "");
-		}
-	}
-
-	for (c = 0; c < 2; c++) {
-		LLVMValueRef tmp;
-		LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
-
-		tmp = LLVMBuildExtractElement(builder, coord, index, "");
-		tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
-		tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
-		tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-		coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
-	}
-
-	args->addr = coord;
-}
-
-static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
-				struct lp_build_tgsi_context *bld_base,
-				struct lp_build_emit_data *emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	const struct tgsi_full_instruction *inst = emit_data->inst;
-	struct ac_image_args args;
-	unsigned opcode = inst->Instruction.Opcode;
-	unsigned target = inst->Texture.Texture;
-
-	if (target == TGSI_TEXTURE_BUFFER) {
-		emit_data->output[emit_data->chan] =
-			ac_build_buffer_load_format(&ctx->ac,
-						    emit_data->args[0],
-						    emit_data->args[2],
-						    emit_data->args[1],
-						    true);
-		return;
-	}
-
-	memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
-
-	args.opcode = ac_image_sample;
-	args.compare = tgsi_is_shadow_target(target);
-	args.offset = inst->Texture.NumOffsets > 0;
-
-	switch (opcode) {
-	case TGSI_OPCODE_TXF:
-	case TGSI_OPCODE_TXF_LZ:
-		args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
-			      target == TGSI_TEXTURE_2D_MSAA ||
-			      target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
-				      ac_image_load : ac_image_load_mip;
-		args.compare = false;
-		args.offset = false;
-		break;
-	case TGSI_OPCODE_LODQ:
-		args.opcode = ac_image_get_lod;
-		args.compare = false;
-		args.offset = false;
-		break;
-	case TGSI_OPCODE_TEX:
-	case TGSI_OPCODE_TEX2:
-	case TGSI_OPCODE_TXP:
-		if (ctx->type != PIPE_SHADER_FRAGMENT)
-			args.level_zero = true;
-		break;
-	case TGSI_OPCODE_TEX_LZ:
-		args.level_zero = true;
-		break;
-	case TGSI_OPCODE_TXB:
-	case TGSI_OPCODE_TXB2:
-		assert(ctx->type == PIPE_SHADER_FRAGMENT);
-		args.bias = true;
-		break;
-	case TGSI_OPCODE_TXL:
-	case TGSI_OPCODE_TXL2:
-		args.lod = true;
-		break;
-	case TGSI_OPCODE_TXD:
-		args.deriv = true;
-		break;
-	case TGSI_OPCODE_TG4:
-		args.opcode = ac_image_gather4;
-		args.level_zero = true;
-		break;
-	default:
-		assert(0);
-		return;
-	}
-
-	/* The hardware needs special lowering for Gather4 with integer formats. */
-	if (ctx->screen->b.chip_class <= VI &&
-	    opcode == TGSI_OPCODE_TG4) {
-		struct tgsi_shader_info *info = &ctx->shader->selector->info;
-		/* This will also work with non-constant indexing because of how
-		 * glsl_to_tgsi works and we intent to preserve that behavior.
-		 */
-		const unsigned src_idx = 2;
-		unsigned sampler = inst->Src[src_idx].Register.Index;
-
-		assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
-
-		if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
-		    info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
-			si_lower_gather4_integer(ctx, &args, target);
-	}
-
-	emit_data->output[emit_data->chan] =
-		ac_build_image_opcode(&ctx->ac, &args);
-}
-
-static void si_llvm_emit_txqs(
-	const struct lp_build_tgsi_action *action,
-	struct lp_build_tgsi_context *bld_base,
-	struct lp_build_emit_data *emit_data)
-{
-	struct si_shader_context *ctx = si_shader_context(bld_base);
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef res, samples;
-	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
-
-	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
-
-
-	/* Read the samples from the descriptor directly. */
-	res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
-	samples = LLVMBuildExtractElement(
-		builder, res,
-		LLVMConstInt(ctx->i32, 3, 0), "");
-	samples = LLVMBuildLShr(builder, samples,
-				LLVMConstInt(ctx->i32, 16, 0), "");
-	samples = LLVMBuildAnd(builder, samples,
-			       LLVMConstInt(ctx->i32, 0xf, 0), "");
-	samples = LLVMBuildShl(builder, ctx->i32_1,
-			       samples, "");
-
-	emit_data->output[emit_data->chan] = samples;
-}
-
 static void si_llvm_emit_ddxy(
 	const struct lp_build_tgsi_action *action,
 	struct lp_build_tgsi_context *bld_base,
@@ -5065,7 +3457,7 @@
 
 	val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
 	val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
-			    mask, idx, ctx->lds, val);
+			    mask, idx, val);
 	emit_data->output[emit_data->chan] = val;
 }
 
@@ -5144,18 +3536,41 @@
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct si_shader *shader = ctx->shader;
 	struct gallivm_state *gallivm = &ctx->gallivm;
+	const struct tgsi_shader_info *info = &shader->selector->info;
 	LLVMValueRef interp_param;
 	const struct tgsi_full_instruction *inst = emit_data->inst;
-	int input_index = inst->Src[0].Register.Index;
+	const struct tgsi_full_src_register *input = &inst->Src[0];
+	int input_base, input_array_size;
 	int chan;
 	int i;
-	LLVMValueRef attr_number;
-	LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
+	LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
+	LLVMValueRef array_idx;
 	int interp_param_idx;
-	unsigned interp = shader->selector->info.input_interpolate[input_index];
+	unsigned interp;
 	unsigned location;
 
-	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
+	assert(input->Register.File == TGSI_FILE_INPUT);
+
+	if (input->Register.Indirect) {
+		unsigned array_id = input->Indirect.ArrayID;
+
+		if (array_id) {
+			input_base = info->input_array_first[array_id];
+			input_array_size = info->input_array_last[array_id] - input_base + 1;
+		} else {
+			input_base = inst->Src[0].Register.Index;
+			input_array_size = info->num_inputs - input_base;
+		}
+
+		array_idx = get_indirect_index(ctx, &input->Indirect,
+					       input->Register.Index - input_base);
+	} else {
+		input_base = inst->Src[0].Register.Index;
+		input_array_size = 1;
+		array_idx = ctx->i32_0;
+	}
+
+	interp = shader->selector->info.input_interpolate[input_base];
 
 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
 	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
@@ -5171,8 +3586,6 @@
 	else
 		interp_param = NULL;
 
-	attr_number = LLVMConstInt(ctx->i32, input_index, 0);
-
 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
 	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
 		LLVMValueRef ij_out[2];
@@ -5211,28 +3624,35 @@
 		interp_param = lp_build_gather_values(gallivm, ij_out, 2);
 	}
 
+	if (interp_param) {
+		interp_param = LLVMBuildBitCast(gallivm->builder,
+			interp_param, LLVMVectorType(ctx->f32, 2), "");
+	}
+
 	for (chan = 0; chan < 4; chan++) {
-		LLVMValueRef llvm_chan;
-		unsigned schan;
+		LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
+		unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
 
-		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
-		llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
+		for (unsigned idx = 0; idx < input_array_size; ++idx) {
+			LLVMValueRef v, i = NULL, j = NULL;
 
-		if (interp_param) {
-			interp_param = LLVMBuildBitCast(gallivm->builder,
-				interp_param, LLVMVectorType(ctx->f32, 2), "");
-			LLVMValueRef i = LLVMBuildExtractElement(
-				gallivm->builder, interp_param, ctx->i32_0, "");
-			LLVMValueRef j = LLVMBuildExtractElement(
-				gallivm->builder, interp_param, ctx->i32_1, "");
-			emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
-				llvm_chan, attr_number, params,
-				i, j);
-		} else {
-			emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
-				LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-				llvm_chan, attr_number, params);
+			if (interp_param) {
+				interp_param = LLVMBuildBitCast(gallivm->builder,
+					interp_param, LLVMVectorType(ctx->f32, 2), "");
+				i = LLVMBuildExtractElement(
+					gallivm->builder, interp_param, ctx->i32_0, "");
+				j = LLVMBuildExtractElement(
+					gallivm->builder, interp_param, ctx->i32_1, "");
+			}
+			v = si_build_fs_interp(ctx, input_base + idx, schan,
+					       prim_mask, i, j);
+
+			gather = LLVMBuildInsertElement(gallivm->builder,
+				gather, v, LLVMConstInt(ctx->i32, idx, false), "");
 		}
+
+		emit_data->output[chan] = LLVMBuildExtractElement(
+			gallivm->builder, gather, array_idx, "");
 	}
 }
 
@@ -5402,7 +3822,7 @@
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	struct lp_build_if_state if_state;
 	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
-					    SI_PARAM_GS2VS_OFFSET);
+					    ctx->param_gs2vs_offset);
 	LLVMValueRef gs_next_vertex;
 	LLVMValueRef can_emit, kill;
 	unsigned chan, offset;
@@ -5474,7 +3894,7 @@
 
 	/* Signal vertex emission */
 	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
-			 LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID));
+			 si_get_gs_wave_id(ctx));
 	if (!use_kill)
 		lp_build_endif(&if_state);
 }
@@ -5491,7 +3911,7 @@
 	/* Signal primitive cut */
 	stream = si_llvm_get_stream(bld_base, emit_data);
 	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
-			 LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID));
+			 si_get_gs_wave_id(ctx));
 }
 
 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
@@ -5505,24 +3925,17 @@
 	 * The real barrier instruction isn’t needed, because an entire patch
 	 * always fits into a single wave.
 	 */
-	if (HAVE_LLVM >= 0x0309 &&
-	    ctx->screen->b.chip_class == SI &&
+	if (ctx->screen->b.chip_class == SI &&
 	    ctx->type == PIPE_SHADER_TESS_CTRL) {
-		emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
+		si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
 		return;
 	}
 
 	lp_build_intrinsic(gallivm->builder,
-			   HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
-					       : "llvm.AMDGPU.barrier.local",
+			   "llvm.amdgcn.s.barrier",
 			   ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
 }
 
-static const struct lp_build_tgsi_action tex_action = {
-	.fetch_args = tex_fetch_args,
-	.emit = build_tex_intrinsic,
-};
-
 static const struct lp_build_tgsi_action interp_action = {
 	.fetch_args = interp_fetch_args,
 	.emit = build_interp_intrinsic,
@@ -5532,13 +3945,12 @@
 			       const char *name,
 			       LLVMTypeRef *returns, unsigned num_returns,
 			       LLVMTypeRef *params, unsigned num_params,
-			       int last_sgpr)
+			       int last_sgpr, unsigned max_workgroup_size)
 {
 	int i;
 
 	si_llvm_create_func(ctx, name, returns, num_returns,
 			    params, num_params);
-	si_llvm_shader_type(ctx->main_fn, ctx->type);
 	ctx->return_value = LLVMGetUndef(ctx->return_type);
 
 	for (i = 0; i <= last_sgpr; ++i) {
@@ -5559,6 +3971,10 @@
 			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
 	}
 
+	if (max_workgroup_size) {
+		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
+				      max_workgroup_size);
+	}
 	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
 					   "no-signed-zeros-fp-math",
 					   "true");
@@ -5628,18 +4044,34 @@
 	}
 }
 
-static void declare_tess_lds(struct si_shader_context *ctx)
+static void declare_lds_as_pointer(struct si_shader_context *ctx)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 
 	unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
 	ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
 		LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
-		"tess_lds");
+		"lds");
 }
 
-static unsigned si_get_max_workgroup_size(struct si_shader *shader)
+static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
 {
+	switch (shader->selector->type) {
+	case PIPE_SHADER_TESS_CTRL:
+		/* Return this so that LLVM doesn't remove s_barrier
+		 * instructions on chips where we use s_barrier. */
+		return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
+
+	case PIPE_SHADER_GEOMETRY:
+		return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
+
+	case PIPE_SHADER_COMPUTE:
+		break; /* see below */
+
+	default:
+		return 0;
+	}
+
 	const unsigned *properties = shader->selector->info.properties;
 	unsigned max_work_group_size =
 	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
@@ -5655,42 +4087,117 @@
 	return max_work_group_size;
 }
 
+static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
+					    LLVMTypeRef *params,
+					    unsigned *num_params,
+					    bool assign_params)
+{
+	params[(*num_params)++] = si_const_array(ctx->v4i32,
+						 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS);
+	params[(*num_params)++] = si_const_array(ctx->v8i32,
+						 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2);
+
+	if (assign_params) {
+		ctx->param_const_and_shader_buffers = *num_params - 2;
+		ctx->param_samplers_and_images = *num_params - 1;
+	}
+}
+
+static void declare_default_desc_pointers(struct si_shader_context *ctx,
+					  LLVMTypeRef *params,
+				          unsigned *num_params)
+{
+	params[ctx->param_rw_buffers = (*num_params)++] =
+		si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
+	declare_per_stage_desc_pointers(ctx, params, num_params, true);
+}
+
+static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
+					    LLVMTypeRef *params,
+					    unsigned *num_params)
+{
+	params[ctx->param_vertex_buffers = (*num_params)++] =
+		si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS);
+	params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
+	params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
+	params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
+	params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
+}
+
+static void declare_vs_input_vgprs(struct si_shader_context *ctx,
+				   LLVMTypeRef *params, unsigned *num_params,
+				   unsigned *num_prolog_vgprs)
+{
+	struct si_shader *shader = ctx->shader;
+
+	params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
+	if (shader->key.as_ls) {
+		params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
+		params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
+	} else {
+		params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
+		params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
+	}
+	params[(*num_params)++] = ctx->i32; /* unused */
+
+	if (!shader->is_gs_copy_shader) {
+		/* Vertex load indices. */
+		ctx->param_vertex_index0 = (*num_params);
+		for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
+			params[(*num_params)++] = ctx->i32;
+		*num_prolog_vgprs += shader->selector->info.num_inputs;
+	}
+}
+
+static void declare_tes_input_vgprs(struct si_shader_context *ctx,
+				    LLVMTypeRef *params, unsigned *num_params)
+{
+	params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
+	params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
+	params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
+	params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
+}
+
+enum {
+	/* Convenient merged shader definitions. */
+	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
+	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
+};
+
 static void create_function(struct si_shader_context *ctx)
 {
 	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	struct si_shader *shader = ctx->shader;
-	LLVMTypeRef params[SI_NUM_PARAMS + SI_MAX_ATTRIBS], v3i32;
+	LLVMTypeRef params[100]; /* just make it large enough */
 	LLVMTypeRef returns[16+32*4];
-	unsigned i, last_sgpr, num_params, num_return_sgprs;
+	unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
 	unsigned num_returns = 0;
 	unsigned num_prolog_vgprs = 0;
+	unsigned type = ctx->type;
 
-	v3i32 = LLVMVectorType(ctx->i32, 3);
+	/* Set MERGED shaders. */
+	if (ctx->screen->b.chip_class >= GFX9) {
+		if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
+			type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
+		else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
+			type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
+	}
 
-	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
-	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
-	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
-	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
-	params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
+	LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
 
-	switch (ctx->type) {
+	switch (type) {
 	case PIPE_SHADER_VERTEX:
-		params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_MAX_ATTRIBS);
-		params[SI_PARAM_BASE_VERTEX] = ctx->i32;
-		params[SI_PARAM_START_INSTANCE] = ctx->i32;
-		params[SI_PARAM_DRAWID] = ctx->i32;
-		params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
-		num_params = SI_PARAM_VS_STATE_BITS+1;
+		declare_default_desc_pointers(ctx, params, &num_params);
+		declare_vs_specific_input_sgprs(ctx, params, &num_params);
 
 		if (shader->key.as_es) {
 			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
 		} else if (shader->key.as_ls) {
 			/* no extra parameters */
 		} else {
-			if (shader->is_gs_copy_shader) {
-				num_params = SI_PARAM_RW_BUFFERS+1;
-			}
+			if (shader->is_gs_copy_shader)
+				num_params = ctx->param_rw_buffers + 1;
 
 			/* The locations of the other parameters are assigned dynamically. */
 			declare_streamout_params(ctx, &shader->selector->so,
@@ -5700,97 +4207,187 @@
 		last_sgpr = num_params-1;
 
 		/* VGPRs */
-		params[ctx->param_vertex_id = num_params++] = ctx->i32;
-		params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
-		params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
-		params[ctx->param_instance_id = num_params++] = ctx->i32;
-
-		if (!shader->is_gs_copy_shader) {
-			/* Vertex load indices. */
-			ctx->param_vertex_index0 = num_params;
-
-			for (i = 0; i < shader->selector->info.num_inputs; i++)
-				params[num_params++] = ctx->i32;
-
-			num_prolog_vgprs += shader->selector->info.num_inputs;
-
-			/* PrimitiveID output. */
-			if (!shader->key.as_es && !shader->key.as_ls)
-				for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
-					returns[num_returns++] = ctx->f32;
-		}
+		declare_vs_input_vgprs(ctx, params, &num_params,
+				       &num_prolog_vgprs);
 		break;
 
-	case PIPE_SHADER_TESS_CTRL:
-		params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
-		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
-		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
-		params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
-		params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
-		params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
-		last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
+	case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
+		declare_default_desc_pointers(ctx, params, &num_params);
+		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
+		params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
+		params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
+		params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
+		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
+		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
+		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
+		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
+		last_sgpr = num_params - 1;
 
 		/* VGPRs */
-		params[SI_PARAM_PATCH_ID] = ctx->i32;
-		params[SI_PARAM_REL_IDS] = ctx->i32;
-		num_params = SI_PARAM_REL_IDS+1;
+		params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
+		params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
 
-		/* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
+		/* param_tcs_offchip_offset and param_tcs_factor_offset are
 		 * placed after the user SGPRs.
 		 */
-		for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
+		for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
 			returns[num_returns++] = ctx->i32; /* SGPRs */
-
 		for (i = 0; i < 3; i++)
 			returns[num_returns++] = ctx->f32; /* VGPRs */
 		break;
 
+	case SI_SHADER_MERGED_VERTEX_TESSCTRL:
+		/* Merged stages have 8 system SGPRs at the beginning. */
+		params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
+			si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
+		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
+		params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
+		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
+		params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32; /* unused */
+		params[num_params++] = ctx->i32; /* unused */
+
+		params[num_params++] = ctx->i32; /* unused */
+		params[num_params++] = ctx->i32; /* unused */
+		declare_per_stage_desc_pointers(ctx, params, &num_params,
+						ctx->type == PIPE_SHADER_VERTEX);
+		declare_vs_specific_input_sgprs(ctx, params, &num_params);
+
+		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
+		params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
+		params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
+		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
+		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32; /* unused */
+
+		declare_per_stage_desc_pointers(ctx, params, &num_params,
+						ctx->type == PIPE_SHADER_TESS_CTRL);
+		last_sgpr = num_params - 1;
+
+		/* VGPRs (first TCS, then VS) */
+		params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
+		params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
+
+		if (ctx->type == PIPE_SHADER_VERTEX) {
+			declare_vs_input_vgprs(ctx, params, &num_params,
+					       &num_prolog_vgprs);
+
+			/* LS return values are inputs to the TCS main shader part. */
+			for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
+				returns[num_returns++] = ctx->i32; /* SGPRs */
+			for (i = 0; i < 2; i++)
+				returns[num_returns++] = ctx->f32; /* VGPRs */
+		} else {
+			/* TCS return values are inputs to the TCS epilog.
+			 *
+			 * param_tcs_offchip_offset, param_tcs_factor_offset,
+			 * param_tcs_offchip_layout, and param_rw_buffers
+			 * should be passed to the epilog.
+			 */
+			for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
+				returns[num_returns++] = ctx->i32; /* SGPRs */
+			for (i = 0; i < 3; i++)
+				returns[num_returns++] = ctx->f32; /* VGPRs */
+		}
+		break;
+
+	case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
+		/* Merged stages have 8 system SGPRs at the beginning. */
+		params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
+			si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
+		params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
+		params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
+		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
+		params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
+		params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
+
+		params[num_params++] = ctx->i32; /* unused */
+		params[num_params++] = ctx->i32; /* unused */
+		declare_per_stage_desc_pointers(ctx, params, &num_params,
+						(ctx->type == PIPE_SHADER_VERTEX ||
+						 ctx->type == PIPE_SHADER_TESS_EVAL));
+		if (ctx->type == PIPE_SHADER_VERTEX) {
+			declare_vs_specific_input_sgprs(ctx, params, &num_params);
+		} else {
+			/* TESS_EVAL (and also GEOMETRY):
+			 * Declare as many input SGPRs as the VS has. */
+			params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
+			params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
+			params[num_params++] = ctx->i32; /* unused */
+			params[num_params++] = ctx->i32; /* unused */
+			params[num_params++] = ctx->i32; /* unused */
+			params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
+		}
+
+		declare_per_stage_desc_pointers(ctx, params, &num_params,
+						ctx->type == PIPE_SHADER_GEOMETRY);
+		last_sgpr = num_params - 1;
+
+		/* VGPRs (first GS, then VS/TES) */
+		params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
+		params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
+		params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
+		params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
+		params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;
+
+		if (ctx->type == PIPE_SHADER_VERTEX) {
+			declare_vs_input_vgprs(ctx, params, &num_params,
+					       &num_prolog_vgprs);
+		} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
+			declare_tes_input_vgprs(ctx, params, &num_params);
+		}
+
+		if (ctx->type == PIPE_SHADER_VERTEX ||
+		    ctx->type == PIPE_SHADER_TESS_EVAL) {
+			/* ES return values are inputs to GS. */
+			for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
+				returns[num_returns++] = ctx->i32; /* SGPRs */
+			for (i = 0; i < 5; i++)
+				returns[num_returns++] = ctx->f32; /* VGPRs */
+		}
+		break;
+
 	case PIPE_SHADER_TESS_EVAL:
-		params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
-		num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
+		declare_default_desc_pointers(ctx, params, &num_params);
+		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
+		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
 
 		if (shader->key.as_es) {
-			params[ctx->param_oc_lds = num_params++] = ctx->i32;
+			params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
 			params[num_params++] = ctx->i32;
 			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
 		} else {
 			params[num_params++] = ctx->i32;
 			declare_streamout_params(ctx, &shader->selector->so,
 						 params, ctx->i32, &num_params);
-			params[ctx->param_oc_lds = num_params++] = ctx->i32;
+			params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
 		}
 		last_sgpr = num_params - 1;
 
 		/* VGPRs */
-		params[ctx->param_tes_u = num_params++] = ctx->f32;
-		params[ctx->param_tes_v = num_params++] = ctx->f32;
-		params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
-		params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
-
-		/* PrimitiveID output. */
-		if (!shader->key.as_es)
-			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
-				returns[num_returns++] = ctx->f32;
+		declare_tes_input_vgprs(ctx, params, &num_params);
 		break;
 
 	case PIPE_SHADER_GEOMETRY:
-		params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
-		params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
-		last_sgpr = SI_PARAM_GS_WAVE_ID;
+		declare_default_desc_pointers(ctx, params, &num_params);
+		params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
+		params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
+		last_sgpr = num_params - 1;
 
 		/* VGPRs */
-		params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
-		params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
-		params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
-		params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
-		params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
-		params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
-		params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
-		params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
-		num_params = SI_PARAM_GS_INSTANCE_ID+1;
+		params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
+		params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
+		params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
+		params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
+		params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
+		params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
+		params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
+		params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
 		break;
 
 	case PIPE_SHADER_FRAGMENT:
+		declare_default_desc_pointers(ctx, params, &num_params);
 		params[SI_PARAM_ALPHA_REF] = ctx->f32;
 		params[SI_PARAM_PRIM_MASK] = ctx->i32;
 		last_sgpr = SI_PARAM_PRIM_MASK;
@@ -5809,6 +4406,7 @@
 		params[SI_PARAM_FRONT_FACE] = ctx->i32;
 		shader->info.face_vgpr_index = 20;
 		params[SI_PARAM_ANCILLARY] = ctx->i32;
+		shader->info.ancillary_vgpr_index = 21;
 		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
 		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
 		num_params = SI_PARAM_POS_FIXED_PT+1;
@@ -5846,13 +4444,20 @@
 		break;
 
 	case PIPE_SHADER_COMPUTE:
-		params[SI_PARAM_GRID_SIZE] = v3i32;
-		params[SI_PARAM_BLOCK_SIZE] = v3i32;
-		params[SI_PARAM_BLOCK_ID] = v3i32;
-		last_sgpr = SI_PARAM_BLOCK_ID;
+		declare_default_desc_pointers(ctx, params, &num_params);
+		if (shader->selector->info.uses_grid_size)
+			params[ctx->param_grid_size = num_params++] = v3i32;
+		if (shader->selector->info.uses_block_size)
+			params[ctx->param_block_size = num_params++] = v3i32;
 
-		params[SI_PARAM_THREAD_ID] = v3i32;
-		num_params = SI_PARAM_THREAD_ID + 1;
+		for (i = 0; i < 3; i++) {
+			ctx->param_block_id[i] = -1;
+			if (shader->selector->info.uses_block_id[i])
+				params[ctx->param_block_id[i] = num_params++] = ctx->i32;
+		}
+		last_sgpr = num_params - 1;
+
+		params[ctx->param_thread_id = num_params++] = v3i32;
 		break;
 	default:
 		assert(0 && "unimplemented shader");
@@ -5862,7 +4467,8 @@
 	assert(num_params <= ARRAY_SIZE(params));
 
 	si_create_function(ctx, "main", returns, num_returns, params,
-			   num_params, last_sgpr);
+			   num_params, last_sgpr,
+			   si_get_max_workgroup_size(shader));
 
 	/* Reserve register locations for VGPR inputs the PS prolog may need. */
 	if (ctx->type == PIPE_SHADER_FRAGMENT &&
@@ -5876,11 +4482,8 @@
 				      S_0286D0_LINEAR_CENTER_ENA(1) |
 				      S_0286D0_LINEAR_CENTROID_ENA(1) |
 				      S_0286D0_FRONT_FACE_ENA(1) |
+				      S_0286D0_ANCILLARY_ENA(1) |
 				      S_0286D0_POS_FIXED_PT_ENA(1));
-	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
-		si_llvm_add_attribute(ctx->main_fn,
-				      "amdgpu-max-work-group-size",
-				      si_get_max_workgroup_size(shader));
 	}
 
 	shader->info.num_input_sgprs = 0;
@@ -5895,23 +4498,13 @@
 	assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
 	shader->info.num_input_vgprs -= num_prolog_vgprs;
 
-	if (!ctx->screen->has_ds_bpermute &&
-	    bld_base->info &&
-	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
-	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
-	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
-	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
-	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
-	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
-		ctx->lds =
-			LLVMAddGlobalInAddressSpace(gallivm->module,
-						    LLVMArrayType(ctx->i32, 64),
-						    "ddxy_lds",
-						    LOCAL_ADDR_SPACE);
-
-	if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.as_ls) ||
-	    ctx->type == PIPE_SHADER_TESS_CTRL)
-		declare_tess_lds(ctx);
+	if (shader->key.as_ls ||
+	    ctx->type == PIPE_SHADER_TESS_CTRL ||
+	    /* GFX9 has the ESGS ring buffer in LDS. */
+	    (ctx->screen->b.chip_class >= GFX9 &&
+	     (shader->key.as_es ||
+	      ctx->type == PIPE_SHADER_GEOMETRY)))
+		declare_lds_as_pointer(ctx);
 }
 
 /**
@@ -5924,13 +4517,10 @@
 	LLVMBuilderRef builder = gallivm->builder;
 
 	LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
-					    SI_PARAM_RW_BUFFERS);
+					    ctx->param_rw_buffers);
 
-	if ((ctx->type == PIPE_SHADER_VERTEX &&
-	     ctx->shader->key.as_es) ||
-	    (ctx->type == PIPE_SHADER_TESS_EVAL &&
-	     ctx->shader->key.as_es) ||
-	    ctx->type == PIPE_SHADER_GEOMETRY) {
+	if (ctx->screen->b.chip_class <= VI &&
+	    (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
 		unsigned ring =
 			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
 							     : SI_ES_RING_ESGS;
@@ -6011,7 +4601,6 @@
 					     S_008F0C_ADD_TID_ENABLE(1),
 					     0),
 				LLVMConstInt(ctx->i32, 3, 0), "");
-			ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");
 
 			ctx->gsvs_ring[stream] = ring;
 		}
@@ -6084,6 +4673,7 @@
 		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
 		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
 		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
+		case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
 		case R_00B848_COMPUTE_PGM_RSRC1:
 			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
 			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
@@ -6134,24 +4724,16 @@
 		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
 }
 
-void si_shader_apply_scratch_relocs(struct si_context *sctx,
-			struct si_shader *shader,
-			struct si_shader_config *config,
-			uint64_t scratch_va)
+void si_shader_apply_scratch_relocs(struct si_shader *shader,
+				    uint64_t scratch_va)
 {
 	unsigned i;
 	uint32_t scratch_rsrc_dword0 = scratch_va;
 	uint32_t scratch_rsrc_dword1 =
 		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
 
-	/* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
-	 * correctly.
-	 */
-	if (HAVE_LLVM >= 0x0309)
-		scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
-	else
-		scratch_rsrc_dword1 |=
-			S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
+	/* Enable scratch coalescing. */
+	scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
 
 	for (i = 0 ; i < shader->binary.reloc_count; i++) {
 		const struct ac_shader_reloc *reloc =
@@ -6166,12 +4748,16 @@
 	}
 }
 
-static unsigned si_get_shader_binary_size(struct si_shader *shader)
+static unsigned si_get_shader_binary_size(const struct si_shader *shader)
 {
 	unsigned size = shader->binary.code_size;
 
 	if (shader->prolog)
 		size += shader->prolog->binary.code_size;
+	if (shader->previous_stage)
+		size += shader->previous_stage->binary.code_size;
+	if (shader->prolog2)
+		size += shader->prolog2->binary.code_size;
 	if (shader->epilog)
 		size += shader->epilog->binary.code_size;
 	return size;
@@ -6181,6 +4767,10 @@
 {
 	const struct ac_shader_binary *prolog =
 		shader->prolog ? &shader->prolog->binary : NULL;
+	const struct ac_shader_binary *previous_stage =
+		shader->previous_stage ? &shader->previous_stage->binary : NULL;
+	const struct ac_shader_binary *prolog2 =
+		shader->prolog2 ? &shader->prolog2->binary : NULL;
 	const struct ac_shader_binary *epilog =
 		shader->epilog ? &shader->epilog->binary : NULL;
 	const struct ac_shader_binary *mainb = &shader->binary;
@@ -6189,15 +4779,12 @@
 	unsigned char *ptr;
 
 	assert(!prolog || !prolog->rodata_size);
-	assert((!prolog && !epilog) || !mainb->rodata_size);
+	assert(!previous_stage || !previous_stage->rodata_size);
+	assert(!prolog2 || !prolog2->rodata_size);
+	assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
+	       !mainb->rodata_size);
 	assert(!epilog || !epilog->rodata_size);
 
-	/* GFX9 can fetch at most 128 bytes past the end of the shader.
-	 * Prevent VM faults.
-	 */
-	if (sscreen->b.chip_class >= GFX9)
-		bo_size += 128;
-
 	r600_resource_reference(&shader->bo, NULL);
 	shader->bo = (struct r600_resource*)
 		     pipe_buffer_create(&sscreen->b.b, 0,
@@ -6208,20 +4795,31 @@
 
 	/* Upload. */
 	ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
-					PIPE_TRANSFER_READ_WRITE);
+					PIPE_TRANSFER_READ_WRITE |
+					PIPE_TRANSFER_UNSYNCHRONIZED);
 
+	/* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
+	 * endian-independent. */
 	if (prolog) {
-		util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
+		memcpy(ptr, prolog->code, prolog->code_size);
 		ptr += prolog->code_size;
 	}
+	if (previous_stage) {
+		memcpy(ptr, previous_stage->code, previous_stage->code_size);
+		ptr += previous_stage->code_size;
+	}
+	if (prolog2) {
+		memcpy(ptr, prolog2->code, prolog2->code_size);
+		ptr += prolog2->code_size;
+	}
 
-	util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
+	memcpy(ptr, mainb->code, mainb->code_size);
 	ptr += mainb->code_size;
 
 	if (epilog)
-		util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
+		memcpy(ptr, epilog->code, epilog->code_size);
 	else if (mainb->rodata_size > 0)
-		util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
+		memcpy(ptr, mainb->rodata, mainb->rodata_size);
 
 	sscreen->b.ws->buffer_unmap(shader->bo->buf);
 	return 0;
@@ -6276,13 +4874,13 @@
 }
 
 static void si_shader_dump_stats(struct si_screen *sscreen,
-				 struct si_shader *shader,
+				 const struct si_shader *shader,
 			         struct pipe_debug_callback *debug,
 			         unsigned processor,
 				 FILE *file,
 				 bool check_debug_option)
 {
-	struct si_shader_config *conf = &shader->config;
+	const struct si_shader_config *conf = &shader->config;
 	unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
 	unsigned code_size = si_get_shader_binary_size(shader);
 	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
@@ -6368,7 +4966,7 @@
 			   conf->spilled_vgprs, conf->private_mem_vgprs);
 }
 
-const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
+const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
 {
 	switch (processor) {
 	case PIPE_SHADER_VERTEX:
@@ -6399,13 +4997,13 @@
 	}
 }
 
-void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
 		    struct pipe_debug_callback *debug, unsigned processor,
 		    FILE *file, bool check_debug_option)
 {
 	if (!check_debug_option ||
 	    r600_can_dump_shader(&sscreen->b, processor))
-		si_dump_shader_key(processor, &shader->key, file);
+		si_dump_shader_key(processor, shader, file);
 
 	if (!check_debug_option && shader->binary.llvm_ir_string) {
 		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
@@ -6421,6 +5019,12 @@
 		if (shader->prolog)
 			si_shader_dump_disassembly(&shader->prolog->binary,
 						   debug, "prolog", file);
+		if (shader->previous_stage)
+			si_shader_dump_disassembly(&shader->previous_stage->binary,
+						   debug, "previous stage", file);
+		if (shader->prolog2)
+			si_shader_dump_disassembly(&shader->prolog2->binary,
+						   debug, "prolog2", file);
 
 		si_shader_dump_disassembly(&shader->binary, debug, "main", file);
 
@@ -6434,14 +5038,14 @@
 			     check_debug_option);
 }
 
-int si_compile_llvm(struct si_screen *sscreen,
-		    struct ac_shader_binary *binary,
-		    struct si_shader_config *conf,
-		    LLVMTargetMachineRef tm,
-		    LLVMModuleRef mod,
-		    struct pipe_debug_callback *debug,
-		    unsigned processor,
-		    const char *name)
+static int si_compile_llvm(struct si_screen *sscreen,
+			   struct ac_shader_binary *binary,
+			   struct si_shader_config *conf,
+			   LLVMTargetMachineRef tm,
+			   LLVMModuleRef mod,
+			   struct pipe_debug_callback *debug,
+			   unsigned processor,
+			   const char *name)
 {
 	int r = 0;
 	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
@@ -6544,7 +5148,8 @@
 	shader->selector = gs_selector;
 	shader->is_gs_copy_shader = true;
 
-	si_init_shader_ctx(&ctx, sscreen, shader, tm);
+	si_init_shader_ctx(&ctx, sscreen, tm);
+	ctx.shader = shader;
 	ctx.type = PIPE_SHADER_VERTEX;
 
 	builder = gallivm->builder;
@@ -6613,7 +5218,8 @@
 					ac_build_buffer_load(&ctx.ac,
 							     ctx.gsvs_ring[0], 1,
 							     ctx.i32_0, voffset,
-							     soffset, 0, 1, 1, true);
+							     soffset, 0, 1, 1,
+							     true, false);
 			}
 		}
 
@@ -6634,13 +5240,8 @@
 
 	LLVMBuildRetVoid(gallivm->builder);
 
-	/* Dump LLVM IR before any optimization passes */
-	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
-	    r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
-		ac_dump_module(ctx.gallivm.module);
-
-	si_llvm_finalize_module(&ctx,
-		r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
+	ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
+	si_llvm_optimize_module(&ctx);
 
 	r = si_compile_llvm(sscreen, &ctx.shader->binary,
 			    &ctx.shader->config, ctx.tm,
@@ -6666,41 +5267,62 @@
 	return shader;
 }
 
-static void si_dump_shader_key(unsigned shader, struct si_shader_key *key,
+static void si_dump_shader_key_vs(const struct si_shader_key *key,
+				  const struct si_vs_prolog_bits *prolog,
+				  const char *prefix, FILE *f)
+{
+	fprintf(f, "  %s.instance_divisor_is_one = %u\n",
+		prefix, prolog->instance_divisor_is_one);
+	fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
+		prefix, prolog->instance_divisor_is_fetched);
+
+	fprintf(f, "  mono.vs.fix_fetch = {");
+	for (int i = 0; i < SI_MAX_ATTRIBS; i++)
+		fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
+	fprintf(f, "}\n");
+}
+
+static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
 			       FILE *f)
 {
-	int i;
+	const struct si_shader_key *key = &shader->key;
 
 	fprintf(f, "SHADER KEY\n");
 
-	switch (shader) {
+	switch (processor) {
 	case PIPE_SHADER_VERTEX:
-		fprintf(f, "  part.vs.prolog.instance_divisors = {");
-		for (i = 0; i < ARRAY_SIZE(key->part.vs.prolog.instance_divisors); i++)
-			fprintf(f, !i ? "%u" : ", %u",
-				key->part.vs.prolog.instance_divisors[i]);
-		fprintf(f, "}\n");
-		fprintf(f, "  part.vs.epilog.export_prim_id = %u\n", key->part.vs.epilog.export_prim_id);
+		si_dump_shader_key_vs(key, &key->part.vs.prolog,
+				      "part.vs.prolog", f);
 		fprintf(f, "  as_es = %u\n", key->as_es);
 		fprintf(f, "  as_ls = %u\n", key->as_ls);
-
-		fprintf(f, "  mono.vs.fix_fetch = {");
-		for (i = 0; i < SI_MAX_ATTRIBS; i++)
-			fprintf(f, !i ? "%u" : ", %u", key->mono.vs.fix_fetch[i]);
-		fprintf(f, "}\n");
+		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
+			key->mono.u.vs_export_prim_id);
 		break;
 
 	case PIPE_SHADER_TESS_CTRL:
+		if (shader->selector->screen->b.chip_class >= GFX9) {
+			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
+					      "part.tcs.ls_prolog", f);
+		}
 		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
-		fprintf(f, "  mono.tcs.inputs_to_copy = 0x%"PRIx64"\n", key->mono.tcs.inputs_to_copy);
+		fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
 		break;
 
 	case PIPE_SHADER_TESS_EVAL:
-		fprintf(f, "  part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
 		fprintf(f, "  as_es = %u\n", key->as_es);
+		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
+			key->mono.u.vs_export_prim_id);
 		break;
 
 	case PIPE_SHADER_GEOMETRY:
+		if (shader->is_gs_copy_shader)
+			break;
+
+		if (shader->selector->screen->b.chip_class >= GFX9 &&
+		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
+			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
+					      "part.gs.vs_prolog", f);
+		}
 		fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
 		break;
 
@@ -6731,27 +5353,22 @@
 		assert(0);
 	}
 
-	if ((shader == PIPE_SHADER_GEOMETRY ||
-	     shader == PIPE_SHADER_TESS_EVAL ||
-	     shader == PIPE_SHADER_VERTEX) &&
+	if ((processor == PIPE_SHADER_GEOMETRY ||
+	     processor == PIPE_SHADER_TESS_EVAL ||
+	     processor == PIPE_SHADER_VERTEX) &&
 	    !key->as_es && !key->as_ls) {
-		fprintf(f, "  opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
-		fprintf(f, "  opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
-		fprintf(f, "  opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
+		fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
+		fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
 	}
 }
 
 static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       struct si_screen *sscreen,
-			       struct si_shader *shader,
 			       LLVMTargetMachineRef tm)
 {
 	struct lp_build_tgsi_context *bld_base;
-	struct lp_build_tgsi_action tmpl = {};
 
-	si_llvm_context_init(ctx, sscreen, shader, tm,
-		(shader && shader->selector) ? &shader->selector->info : NULL,
-		(shader && shader->selector) ? shader->selector->tokens : NULL);
+	si_llvm_context_init(ctx, sscreen, tm);
 
 	bld_base = &ctx->bld_base;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
@@ -6760,53 +5377,6 @@
 	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
 	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
 
-	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
-	bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
-	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
-
-	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
-	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
-	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
-
-	tmpl.fetch_args = atomic_fetch_args;
-	tmpl.emit = atomic_emit;
-	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
-	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
-	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
-	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
-	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
-	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
-	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
-	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
-	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
-	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
-	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
-
 	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
 
 	bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
@@ -6831,161 +5401,22 @@
 	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
 }
 
-#define EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
-#define EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
-
-/* Return true if the PARAM export has been eliminated. */
-static bool si_eliminate_const_output(struct si_shader_context *ctx,
-				      LLVMValueRef inst, unsigned offset)
-{
-	struct si_shader *shader = ctx->shader;
-	unsigned num_outputs = shader->selector->info.num_outputs;
-	unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
-	bool is_zero[4] = {}, is_one[4] = {};
-
-	for (i = 0; i < 4; i++) {
-		LLVMBool loses_info;
-		LLVMValueRef p = LLVMGetOperand(inst, EXP_OUT0 + i);
-
-		/* It's a constant expression. Undef outputs are eliminated too. */
-		if (LLVMIsUndef(p)) {
-			is_zero[i] = true;
-			is_one[i] = true;
-		} else if (LLVMIsAConstantFP(p)) {
-			double a = LLVMConstRealGetDouble(p, &loses_info);
-
-			if (a == 0)
-				is_zero[i] = true;
-			else if (a == 1)
-				is_one[i] = true;
-			else
-				return false; /* other constant */
-		} else
-			return false;
-	}
-
-	/* Only certain combinations of 0 and 1 can be eliminated. */
-	if (is_zero[0] && is_zero[1] && is_zero[2])
-		default_val = is_zero[3] ? 0 : 1;
-	else if (is_one[0] && is_one[1] && is_one[2])
-		default_val = is_zero[3] ? 2 : 3;
-	else
-		return false;
-
-	/* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
-	LLVMInstructionEraseFromParent(inst);
-
-	/* Change OFFSET to DEFAULT_VAL. */
-	for (i = 0; i < num_outputs; i++) {
-		if (shader->info.vs_output_param_offset[i] == offset) {
-			shader->info.vs_output_param_offset[i] =
-				EXP_PARAM_DEFAULT_VAL_0000 + default_val;
-			break;
-		}
-	}
-	return true;
-}
-
-struct si_vs_exports {
-	unsigned num;
-	unsigned offset[SI_MAX_VS_OUTPUTS];
-	LLVMValueRef inst[SI_MAX_VS_OUTPUTS];
-};
-
-static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
+static void si_optimize_vs_outputs(struct si_shader_context *ctx)
 {
 	struct si_shader *shader = ctx->shader;
 	struct tgsi_shader_info *info = &shader->selector->info;
-	LLVMBasicBlockRef bb;
-	struct si_vs_exports exports;
-	bool removed_any = false;
 
-	exports.num = 0;
-
-	if (ctx->type == PIPE_SHADER_FRAGMENT ||
-	    ctx->type == PIPE_SHADER_COMPUTE ||
-	    shader->key.as_es ||
-	    shader->key.as_ls)
+	if ((ctx->type != PIPE_SHADER_VERTEX &&
+	     ctx->type != PIPE_SHADER_TESS_EVAL) ||
+	    shader->key.as_ls ||
+	    shader->key.as_es)
 		return;
 
-	/* Process all LLVM instructions. */
-	bb = LLVMGetFirstBasicBlock(ctx->main_fn);
-	while (bb) {
-		LLVMValueRef inst = LLVMGetFirstInstruction(bb);
-
-		while (inst) {
-			LLVMValueRef cur = inst;
-			inst = LLVMGetNextInstruction(inst);
-
-			if (LLVMGetInstructionOpcode(cur) != LLVMCall)
-				continue;
-
-			LLVMValueRef callee = lp_get_called_value(cur);
-
-			if (!lp_is_function(callee))
-				continue;
-
-			const char *name = LLVMGetValueName(callee);
-			unsigned num_args = LLVMCountParams(callee);
-
-			/* Check if this is an export instruction. */
-			if ((num_args != 9 && num_args != 8) ||
-			    (strcmp(name, "llvm.SI.export") &&
-			     strcmp(name, "llvm.amdgcn.exp.f32")))
-				continue;
-
-			LLVMValueRef arg = LLVMGetOperand(cur, EXP_TARGET);
-			unsigned target = LLVMConstIntGetZExtValue(arg);
-
-			if (target < V_008DFC_SQ_EXP_PARAM)
-				continue;
-
-			target -= V_008DFC_SQ_EXP_PARAM;
-
-			/* Eliminate constant value PARAM exports. */
-			if (si_eliminate_const_output(ctx, cur, target)) {
-				removed_any = true;
-			} else {
-				exports.offset[exports.num] = target;
-				exports.inst[exports.num] = cur;
-				exports.num++;
-			}
-		}
-		bb = LLVMGetNextBasicBlock(bb);
-	}
-
-	/* Remove holes in export memory due to removed PARAM exports.
-	 * This is done by renumbering all PARAM exports.
-	 */
-	if (removed_any) {
-		ubyte current_offset[SI_MAX_VS_OUTPUTS];
-		unsigned new_count = 0;
-		unsigned out, i;
-
-		/* Make a copy of the offsets. We need the old version while
-		 * we are modifying some of them. */
-		assert(sizeof(current_offset) ==
-		       sizeof(shader->info.vs_output_param_offset));
-		memcpy(current_offset, shader->info.vs_output_param_offset,
-		       sizeof(current_offset));
-
-		for (i = 0; i < exports.num; i++) {
-			unsigned offset = exports.offset[i];
-
-			for (out = 0; out < info->num_outputs; out++) {
-				if (current_offset[out] != offset)
-					continue;
-
-				LLVMSetOperand(exports.inst[i], EXP_TARGET,
-					       LLVMConstInt(ctx->i32,
-							    V_008DFC_SQ_EXP_PARAM + new_count, 0));
-				shader->info.vs_output_param_offset[out] = new_count;
-				new_count++;
-				break;
-			}
-		}
-		shader->info.nr_param_exports = new_count;
-	}
+	ac_optimize_vs_outputs(&ctx->ac,
+			       ctx->main_fn,
+			       shader->info.vs_output_param_offset,
+			       info->num_outputs,
+			       &shader->info.nr_param_exports);
 }
 
 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
@@ -7014,9 +5445,30 @@
 	}
 }
 
-static bool si_compile_tgsi_main(struct si_shader_context *ctx,
-				 struct si_shader *shader)
+static void si_init_exec_full_mask(struct si_shader_context *ctx)
 {
+	LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+	lp_build_intrinsic(ctx->gallivm.builder,
+			   "llvm.amdgcn.init.exec", ctx->voidt,
+			   &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
+}
+
+static void si_init_exec_from_input(struct si_shader_context *ctx,
+				    unsigned param, unsigned bitoffset)
+{
+	LLVMValueRef args[] = {
+		LLVMGetParam(ctx->main_fn, param),
+		LLVMConstInt(ctx->i32, bitoffset, 0),
+	};
+	lp_build_intrinsic(ctx->gallivm.builder,
+			   "llvm.amdgcn.init.exec.from.input",
+			   ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
+}
+
+static bool si_compile_tgsi_main(struct si_shader_context *ctx,
+				 bool is_monolithic)
+{
+	struct si_shader *shader = ctx->shader;
 	struct si_shader_selector *sel = shader->selector;
 	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
 
@@ -7062,6 +5514,45 @@
 	create_function(ctx);
 	preload_ring_buffers(ctx);
 
+	/* For GFX9 merged shaders:
+	 * - Set EXEC for the first shader. If the prolog is present, set
+	 *   EXEC there instead.
+	 * - Add a barrier before the second shader.
+	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
+	 *   an if-statement. This is required for correctness in geometry
+	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
+	 *   GS_CUT messages.
+	 *
+	 * For monolithic merged shaders, the first shader is wrapped in an
+	 * if-block together with its prolog in si_build_wrapper_function.
+	 */
+	if (ctx->screen->b.chip_class >= GFX9) {
+		if (!is_monolithic &&
+		    sel->info.num_instructions > 1 && /* not empty shader */
+		    (shader->key.as_es || shader->key.as_ls) &&
+		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
+		     (ctx->type == PIPE_SHADER_VERTEX &&
+		      !sel->vs_needs_prolog))) {
+			si_init_exec_from_input(ctx,
+						ctx->param_merged_wave_info, 0);
+		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
+			   ctx->type == PIPE_SHADER_GEOMETRY) {
+			if (!is_monolithic)
+				si_init_exec_full_mask(ctx);
+
+			/* The barrier must execute for all shaders in a
+			 * threadgroup.
+			 */
+			si_llvm_emit_barrier(NULL, bld_base, NULL);
+
+			LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
+			LLVMValueRef ena =
+				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+					    ac_get_thread_id(&ctx->ac), num_threads, "");
+			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
+		}
+	}
+
 	if (ctx->type == PIPE_SHADER_GEOMETRY) {
 		int i;
 		for (i = 0; i < 4; i++) {
@@ -7071,6 +5562,12 @@
 		}
 	}
 
+	if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
+	    ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
+		/* This is initialized to 0.0 = not kill. */
+		ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
+	}
+
 	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
 		fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
 		return false;
@@ -7083,43 +5580,38 @@
 /**
  * Compute the VS prolog key, which contains all the information needed to
  * build the VS prolog function, and set shader->info bits where needed.
+ *
+ * \param info             Shader info of the vertex shader.
+ * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
+ * \param prolog_key       VS prolog state bits; copied into key->vs_prolog.states.
+ * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
+ * \param key              Output shader part key.
  */
-static void si_get_vs_prolog_key(struct si_shader *shader,
+static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
+				 unsigned num_input_sgprs,
+				 const struct si_vs_prolog_bits *prolog_key,
+				 struct si_shader *shader_out,
 				 union si_shader_part_key *key)
 {
-	struct tgsi_shader_info *info = &shader->selector->info;
-
 	memset(key, 0, sizeof(*key));
-	key->vs_prolog.states = shader->key.part.vs.prolog;
-	key->vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+	key->vs_prolog.states = *prolog_key;
+	key->vs_prolog.num_input_sgprs = num_input_sgprs;
 	key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+	key->vs_prolog.as_ls = shader_out->key.as_ls;
 
-	/* Set the instanceID flag. */
-	for (unsigned i = 0; i < info->num_inputs; i++)
-		if (key->vs_prolog.states.instance_divisors[i])
-			shader->info.uses_instanceid = true;
-}
-
-/**
- * Compute the VS epilog key, which contains all the information needed to
- * build the VS epilog function, and set the PrimitiveID output offset.
- */
-static void si_get_vs_epilog_key(struct si_shader *shader,
-				 struct si_vs_epilog_bits *states,
-				 union si_shader_part_key *key)
-{
-	memset(key, 0, sizeof(*key));
-	key->vs_epilog.states = *states;
-
-	/* Set up the PrimitiveID output. */
-	if (shader->key.part.vs.epilog.export_prim_id) {
-		unsigned index = shader->selector->info.num_outputs;
-		unsigned offset = shader->info.nr_param_exports++;
-
-		key->vs_epilog.prim_id_param_offset = offset;
-		assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
-		shader->info.vs_output_param_offset[index] = offset;
+	if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
+		key->vs_prolog.as_ls = 1;
+		key->vs_prolog.num_merged_next_stage_vgprs = 2;
+	} else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
+		key->vs_prolog.num_merged_next_stage_vgprs = 5;
 	}
+
+	/* Enable loading the InstanceID VGPR. */
+	uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
+
+	if ((key->vs_prolog.states.instance_divisor_is_one |
+	     key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
+		shader_out->info.uses_instanceid = true;
 }
 
 /**
@@ -7145,6 +5637,7 @@
 		 key->ps_prolog.states.force_linear_center_interp ||
 		 key->ps_prolog.states.bc_optimize_for_persp ||
 		 key->ps_prolog.states.bc_optimize_for_linear);
+	key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
 
 	if (info->colors_read) {
 		unsigned *color = shader->selector->color_attr_index;
@@ -7254,7 +5747,8 @@
 	       key->ps_prolog.states.force_linear_center_interp ||
 	       key->ps_prolog.states.bc_optimize_for_persp ||
 	       key->ps_prolog.states.bc_optimize_for_linear ||
-	       key->ps_prolog.states.poly_stipple;
+	       key->ps_prolog.states.poly_stipple ||
+	       key->ps_prolog.states.samplemask_log_ps_iter;
 }
 
 /**
@@ -7280,14 +5774,21 @@
 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
 					union si_shader_part_key *key)
 {
-	const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2;
-	const unsigned num_vgprs = 8;
+	unsigned num_sgprs, num_vgprs;
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
-	LLVMTypeRef params[32];
-	LLVMTypeRef returns[32];
+	LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
+	LLVMTypeRef returns[48];
 	LLVMValueRef func, ret;
 
+	if (ctx->screen->b.chip_class >= GFX9) {
+		num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
+		num_vgprs = 5; /* ES inputs are not needed by GS */
+	} else {
+		num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+		num_vgprs = 8;
+	}
+
 	for (unsigned i = 0; i < num_sgprs; ++i) {
 		params[i] = ctx->i32;
 		returns[i] = ctx->i32;
@@ -7300,9 +5801,16 @@
 
 	/* Create the function. */
 	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
-			   params, num_sgprs + num_vgprs, num_sgprs - 1);
+			   params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
 	func = ctx->main_fn;
 
+	/* Set the full EXEC mask for the prolog, because we are only fiddling
+	 * with registers here. The main shader part will set the correct EXEC
+	 * mask.
+	 */
+	if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
+		si_init_exec_full_mask(ctx);
+
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
 	 * but it will prevent the compiler from overwriting them unintentionally.
 	 */
@@ -7319,7 +5827,7 @@
 
 	if (key->gs_prolog.states.tri_strip_adj_fix) {
 		/* Remap the input vertices for every other primitive. */
-		const unsigned vtx_params[6] = {
+		const unsigned gfx6_vtx_params[6] = {
 			num_sgprs,
 			num_sgprs + 1,
 			num_sgprs + 3,
@@ -7327,18 +5835,53 @@
 			num_sgprs + 5,
 			num_sgprs + 6
 		};
+		const unsigned gfx9_vtx_params[3] = {
+			num_sgprs,
+			num_sgprs + 1,
+			num_sgprs + 4,
+		};
+		LLVMValueRef vtx_in[6], vtx_out[6];
 		LLVMValueRef prim_id, rotate;
 
+		if (ctx->screen->b.chip_class >= GFX9) {
+			for (unsigned i = 0; i < 3; i++) {
+				vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
+				vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
+			}
+		} else {
+			for (unsigned i = 0; i < 6; i++)
+				vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
+		}
+
 		prim_id = LLVMGetParam(func, num_sgprs + 2);
 		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
 
 		for (unsigned i = 0; i < 6; ++i) {
-			LLVMValueRef base, rotated, actual;
-			base = LLVMGetParam(func, vtx_params[i]);
-			rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]);
-			actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
-			actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
-			ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], "");
+			LLVMValueRef base, rotated;
+			base = vtx_in[i];
+			rotated = vtx_in[(i + 4) % 6];
+			vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
+		}
+
+		if (ctx->screen->b.chip_class >= GFX9) {
+			for (unsigned i = 0; i < 3; i++) {
+				LLVMValueRef hi, out;
+
+				hi = LLVMBuildShl(builder, vtx_out[i*2+1],
+						  LLVMConstInt(ctx->i32, 16, 0), "");
+				out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
+				out = LLVMBuildBitCast(builder, out, ctx->f32, "");
+				ret = LLVMBuildInsertValue(builder, ret, out,
+							   gfx9_vtx_params[i], "");
+			}
+		} else {
+			for (unsigned i = 0; i < 6; i++) {
+				LLVMValueRef out;
+
+				out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
+				ret = LLVMBuildInsertValue(builder, ret, out,
+							   gfx6_vtx_params[i], "");
+			}
 		}
 	}
 
@@ -7352,20 +5895,25 @@
 static void si_build_wrapper_function(struct si_shader_context *ctx,
 				      LLVMValueRef *parts,
 				      unsigned num_parts,
-				      unsigned main_part)
+				      unsigned main_part,
+				      unsigned next_shader_first_part)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	LLVMBuilderRef builder = ctx->gallivm.builder;
-	/* PS epilog has one arg per color component */
-	LLVMTypeRef param_types[48];
-	LLVMValueRef out[48];
+	/* PS epilog has one arg per color component; gfx9 merged shader
+	 * prologs need to forward 32 user SGPRs.
+	 */
+	LLVMTypeRef param_types[64];
+	LLVMValueRef initial[64], out[64];
 	LLVMTypeRef function_type;
 	unsigned num_params;
-	unsigned num_out;
+	unsigned num_out, initial_num_out;
 	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
+	MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
 	unsigned num_sgprs, num_vgprs;
 	unsigned last_sgpr_param;
 	unsigned gprs;
+	struct lp_build_if_state if_state;
 
 	for (unsigned i = 0; i < num_parts; ++i) {
 		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
@@ -7415,7 +5963,12 @@
 		gprs += size;
 	}
 
-	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
+	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
+			   last_sgpr_param,
+			   si_get_max_workgroup_size(ctx->shader));
+
+	if (is_merged_shader(ctx->shader))
+		si_init_exec_full_mask(ctx);
 
 	/* Record the arguments of the function as if they were an output of
 	 * a previous part.
@@ -7453,6 +6006,10 @@
 			num_out_sgpr = num_out;
 	}
 
+	memcpy(initial, out, sizeof(out));
+	initial_num_out = num_out;
+	initial_num_out_sgpr = num_out_sgpr;
+
 	/* Now chain the parts. */
 	for (unsigned part = 0; part < num_parts; ++part) {
 		LLVMValueRef in[48];
@@ -7463,6 +6020,18 @@
 		num_params = LLVMCountParams(parts[part]);
 		assert(num_params <= ARRAY_SIZE(param_types));
 
+		/* Merged shaders are executed conditionally depending
+		 * on the number of enabled threads passed in the input SGPRs. */
+		if (is_merged_shader(ctx->shader) && part == 0) {
+			LLVMValueRef ena, count = initial[3];
+
+			count = LLVMBuildAnd(builder, count,
+					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
+			ena = LLVMBuildICmp(builder, LLVMIntULT,
+					    ac_get_thread_id(&ctx->ac), count, "");
+			lp_build_if(&if_state, &ctx->gallivm, ena);
+		}
+
 		/* Derive arguments for the next part from outputs of the
 		 * previous one.
 		 */
@@ -7510,9 +6079,27 @@
 		}
 
 		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
-		ret_type = LLVMTypeOf(ret);
+
+		if (is_merged_shader(ctx->shader) &&
+		    part + 1 == next_shader_first_part) {
+			lp_build_endif(&if_state);
+
+			/* The second half of the merged shader should use
+			 * the inputs from the toplevel (wrapper) function,
+			 * not the return value from the last call.
+			 *
+			 * That's because the last call was executed
+			 * conditionally, so we can't consume it in the
+			 * main block.
+			 */
+			memcpy(out, initial, sizeof(initial));
+			num_out = initial_num_out;
+			num_out_sgpr = initial_num_out_sgpr;
+			continue;
+		}
 
 		/* Extract the returned GPRs. */
+		ret_type = LLVMTypeOf(ret);
 		num_out = 0;
 		num_out_sgpr = 0;
 
@@ -7525,6 +6112,7 @@
 				LLVMValueRef val =
 					LLVMBuildExtractValue(builder, ret, i, "");
 
+				assert(num_out < ARRAY_SIZE(out));
 				out[num_out++] = val;
 
 				if (LLVMTypeOf(val) == ctx->i32) {
@@ -7556,83 +6144,176 @@
 		si_dump_streamout(&sel->so);
 	}
 
-	si_init_shader_ctx(&ctx, sscreen, shader, tm);
+	si_init_shader_ctx(&ctx, sscreen, tm);
+	si_llvm_context_set_tgsi(&ctx, shader);
 	ctx.separate_prolog = !is_monolithic;
 
-	memset(shader->info.vs_output_param_offset, EXP_PARAM_UNDEFINED,
+	memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
 	       sizeof(shader->info.vs_output_param_offset));
 
 	shader->info.uses_instanceid = sel->info.uses_instanceid;
 
 	ctx.load_system_value = declare_system_value;
 
-	if (!si_compile_tgsi_main(&ctx, shader)) {
+	if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
 		si_llvm_dispose(&ctx);
 		return -1;
 	}
 
 	if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
-		LLVMValueRef parts[3];
-		bool need_prolog;
-		bool need_epilog;
+		LLVMValueRef parts[2];
+		bool need_prolog = sel->vs_needs_prolog;
 
-		need_prolog = sel->vs_needs_prolog;
-		need_epilog = !shader->key.as_es && !shader->key.as_ls;
-
-		parts[need_prolog ? 1 : 0] = ctx.main_fn;
+		parts[1] = ctx.main_fn;
 
 		if (need_prolog) {
 			union si_shader_part_key prolog_key;
-			si_get_vs_prolog_key(shader, &prolog_key);
+			si_get_vs_prolog_key(&sel->info,
+					     shader->info.num_input_sgprs,
+					     &shader->key.part.vs.prolog,
+					     shader, &prolog_key);
 			si_build_vs_prolog_function(&ctx, &prolog_key);
 			parts[0] = ctx.main_fn;
 		}
 
-		if (need_epilog) {
-			union si_shader_part_key epilog_key;
-			si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
-			si_build_vs_epilog_function(&ctx, &epilog_key);
-			parts[need_prolog ? 2 : 1] = ctx.main_fn;
-		}
-
-		si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
-					  need_prolog ? 1 : 0);
+		si_build_wrapper_function(&ctx, parts + !need_prolog,
+					  1 + need_prolog, need_prolog, 0);
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
-		LLVMValueRef parts[2];
-		union si_shader_part_key epilog_key;
+		if (sscreen->b.chip_class >= GFX9) {
+			struct si_shader_selector *ls = shader->key.part.tcs.ls;
+			LLVMValueRef parts[4];
 
-		parts[0] = ctx.main_fn;
+			/* TCS main part */
+			parts[2] = ctx.main_fn;
 
-		memset(&epilog_key, 0, sizeof(epilog_key));
-		epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
-		si_build_tcs_epilog_function(&ctx, &epilog_key);
-		parts[1] = ctx.main_fn;
+			/* TCS epilog */
+			union si_shader_part_key tcs_epilog_key;
+			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
+			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+			si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
+			parts[3] = ctx.main_fn;
 
-		si_build_wrapper_function(&ctx, parts, 2, 0);
-	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
-		   !shader->key.as_es) {
-		LLVMValueRef parts[2];
-		union si_shader_part_key epilog_key;
+			/* VS prolog */
+			if (ls->vs_needs_prolog) {
+				union si_shader_part_key vs_prolog_key;
+				si_get_vs_prolog_key(&ls->info,
+						     shader->info.num_input_sgprs,
+						     &shader->key.part.tcs.ls_prolog,
+						     shader, &vs_prolog_key);
+				vs_prolog_key.vs_prolog.is_monolithic = true;
+				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
+				parts[0] = ctx.main_fn;
+			}
 
-		parts[0] = ctx.main_fn;
+			/* VS as LS main part */
+			struct si_shader shader_ls = {};
+			shader_ls.selector = ls;
+			shader_ls.key.as_ls = 1;
+			shader_ls.key.mono = shader->key.mono;
+			shader_ls.key.opt = shader->key.opt;
+			si_llvm_context_set_tgsi(&ctx, &shader_ls);
 
-		si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
-		si_build_vs_epilog_function(&ctx, &epilog_key);
-		parts[1] = ctx.main_fn;
+			if (!si_compile_tgsi_main(&ctx, true)) {
+				si_llvm_dispose(&ctx);
+				return -1;
+			}
+			shader->info.uses_instanceid |= ls->info.uses_instanceid;
+			parts[1] = ctx.main_fn;
 
-		si_build_wrapper_function(&ctx, parts, 2, 0);
+			/* Reset the shader context. */
+			ctx.shader = shader;
+			ctx.type = PIPE_SHADER_TESS_CTRL;
+
+			si_build_wrapper_function(&ctx,
+						  parts + !ls->vs_needs_prolog,
+						  4 - !ls->vs_needs_prolog, 0,
+						  ls->vs_needs_prolog ? 2 : 1);
+		} else {
+			LLVMValueRef parts[2];
+			union si_shader_part_key epilog_key;
+
+			parts[0] = ctx.main_fn;
+
+			memset(&epilog_key, 0, sizeof(epilog_key));
+			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+			si_build_tcs_epilog_function(&ctx, &epilog_key);
+			parts[1] = ctx.main_fn;
+
+			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+		}
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
-		LLVMValueRef parts[2];
-		union si_shader_part_key prolog_key;
+		if (ctx.screen->b.chip_class >= GFX9) {
+			struct si_shader_selector *es = shader->key.part.gs.es;
+			LLVMValueRef es_prolog = NULL;
+			LLVMValueRef es_main = NULL;
+			LLVMValueRef gs_prolog = NULL;
+			LLVMValueRef gs_main = ctx.main_fn;
 
-		parts[1] = ctx.main_fn;
+			/* GS prolog */
+			union si_shader_part_key gs_prolog_key;
+			memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
+			gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+			gs_prolog_key.gs_prolog.is_monolithic = true;
+			si_build_gs_prolog_function(&ctx, &gs_prolog_key);
+			gs_prolog = ctx.main_fn;
 
-		memset(&prolog_key, 0, sizeof(prolog_key));
-		prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
-		si_build_gs_prolog_function(&ctx, &prolog_key);
-		parts[0] = ctx.main_fn;
+			/* ES prolog */
+			if (es->vs_needs_prolog) {
+				union si_shader_part_key vs_prolog_key;
+				si_get_vs_prolog_key(&es->info,
+						     shader->info.num_input_sgprs,
+						     &shader->key.part.tcs.ls_prolog,
+						     shader, &vs_prolog_key);
+				vs_prolog_key.vs_prolog.is_monolithic = true;
+				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
+				es_prolog = ctx.main_fn;
+			}
 
-		si_build_wrapper_function(&ctx, parts, 2, 1);
+			/* ES main part */
+			struct si_shader shader_es = {};
+			shader_es.selector = es;
+			shader_es.key.as_es = 1;
+			shader_es.key.mono = shader->key.mono;
+			shader_es.key.opt = shader->key.opt;
+			si_llvm_context_set_tgsi(&ctx, &shader_es);
+
+			if (!si_compile_tgsi_main(&ctx, true)) {
+				si_llvm_dispose(&ctx);
+				return -1;
+			}
+			shader->info.uses_instanceid |= es->info.uses_instanceid;
+			es_main = ctx.main_fn;
+
+			/* Reset the shader context. */
+			ctx.shader = shader;
+			ctx.type = PIPE_SHADER_GEOMETRY;
+
+			/* Prepare the array of shader parts. */
+			LLVMValueRef parts[4];
+			unsigned num_parts = 0, main_part, next_first_part;
+
+			if (es_prolog)
+				parts[num_parts++] = es_prolog;
+
+			parts[main_part = num_parts++] = es_main;
+			parts[next_first_part = num_parts++] = gs_prolog;
+			parts[num_parts++] = gs_main;
+
+			si_build_wrapper_function(&ctx, parts, num_parts,
+						  main_part, next_first_part);
+		} else {
+			LLVMValueRef parts[2];
+			union si_shader_part_key prolog_key;
+
+			parts[1] = ctx.main_fn;
+
+			memset(&prolog_key, 0, sizeof(prolog_key));
+			prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+			si_build_gs_prolog_function(&ctx, &prolog_key);
+			parts[0] = ctx.main_fn;
+
+			si_build_wrapper_function(&ctx, parts, 2, 1, 0);
+		}
 	} else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
 		LLVMValueRef parts[3];
 		union si_shader_part_key prolog_key;
@@ -7653,19 +6334,14 @@
 		si_build_ps_epilog_function(&ctx, &epilog_key);
 		parts[need_prolog ? 2 : 1] = ctx.main_fn;
 
-		si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
+		si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
+					  need_prolog ? 1 : 0, 0);
 	}
 
-	/* Dump LLVM IR before any optimization passes */
-	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
-	    r600_can_dump_shader(&sscreen->b, ctx.type))
-		LLVMDumpModule(ctx.gallivm.module);
-
-	si_llvm_finalize_module(&ctx,
-				    r600_extra_shader_checks(&sscreen->b, ctx.type));
+	si_llvm_optimize_module(&ctx);
 
 	/* Post-optimization transformations and analysis. */
-	si_eliminate_const_vs_outputs(&ctx);
+	si_optimize_vs_outputs(&ctx);
 
 	if ((debug && debug->debug_message) ||
 	    r600_can_dump_shader(&sscreen->b, ctx.type))
@@ -7712,13 +6388,14 @@
 	}
 
 	/* Add the scratch offset to input SGPRs. */
-	if (shader->config.scratch_bytes_per_wave)
+	if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
 		shader->info.num_input_sgprs += 1; /* scratch byte offset */
 
 	/* Calculate the number of fragment input VGPRs. */
 	if (ctx.type == PIPE_SHADER_FRAGMENT) {
 		shader->info.num_input_vgprs = 0;
 		shader->info.face_vgpr_index = -1;
+		shader->info.ancillary_vgpr_index = -1;
 
 		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
 			shader->info.num_input_vgprs += 2;
@@ -7748,8 +6425,10 @@
 			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
 			shader->info.num_input_vgprs += 1;
 		}
-		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
+		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) {
+			shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs;
 			shader->info.num_input_vgprs += 1;
+		}
 		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
 			shader->info.num_input_vgprs += 1;
 		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
@@ -7804,7 +6483,8 @@
 	struct si_shader_context ctx;
 	struct gallivm_state *gallivm = &ctx.gallivm;
 
-	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
+	si_init_shader_ctx(&ctx, sscreen, tm);
+	ctx.shader = &shader;
 	ctx.type = type;
 
 	switch (type) {
@@ -7830,8 +6510,7 @@
 	build(&ctx, key);
 
 	/* Compile. */
-	si_llvm_finalize_module(&ctx,
-		r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
+	si_llvm_optimize_module(&ctx);
 
 	if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
 			    gallivm->module, debug, ctx.type, name)) {
@@ -7849,6 +6528,21 @@
 	return result;
 }
 
+static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
+{
+	struct gallivm_state *gallivm = &ctx->gallivm;
+	LLVMValueRef ptr[2], list;
+
+	/* Get the pointer to rw buffers. */
+	ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
+	ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
+	list = lp_build_gather_values(gallivm, ptr, 2);
+	list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
+	list = LLVMBuildIntToPtr(gallivm->builder, list,
+				 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
+	return list;
+}
+
 /**
  * Build the vertex shader prolog function.
  *
@@ -7872,15 +6566,19 @@
 	LLVMTypeRef *params, *returns;
 	LLVMValueRef ret, func;
 	int last_sgpr, num_params, num_returns, i;
+	unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
+				 key->vs_prolog.num_merged_next_stage_vgprs;
+	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
+				      num_input_vgprs;
+	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
 
-	ctx->param_vertex_id = key->vs_prolog.num_input_sgprs;
-	ctx->param_instance_id = key->vs_prolog.num_input_sgprs + 3;
+	ctx->param_vertex_id = first_vs_vgpr;
+	ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
 
 	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
-	params = alloca((key->vs_prolog.num_input_sgprs + 4) *
-			sizeof(LLVMTypeRef));
-	returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
-			  key->vs_prolog.last_input + 1) *
+	params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
+	returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
 			 sizeof(LLVMTypeRef));
 	num_params = 0;
 	num_returns = 0;
@@ -7893,8 +6591,8 @@
 	}
 	last_sgpr = num_params - 1;
 
-	/* 4 preloaded VGPRs (outputs must be floats) */
-	for (i = 0; i < 4; i++) {
+	/* Preloaded VGPRs (outputs must be floats) */
+	for (i = 0; i < num_input_vgprs; i++) {
 		params[num_params++] = ctx->i32;
 		returns[num_returns++] = ctx->f32;
 	}
@@ -7905,9 +6603,13 @@
 
 	/* Create the function. */
 	si_create_function(ctx, "vs_prolog", returns, num_returns, params,
-			   num_params, last_sgpr);
+			   num_params, last_sgpr, 0);
 	func = ctx->main_fn;
 
+	if (key->vs_prolog.num_merged_next_stage_vgprs &&
+	    !key->vs_prolog.is_monolithic)
+		si_init_exec_from_input(ctx, 3, 0);
+
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
 	 * but it will prevent the compiler from overwriting them unintentionally.
 	 */
@@ -7916,27 +6618,51 @@
 		LLVMValueRef p = LLVMGetParam(func, i);
 		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
 	}
-	for (i = num_params - 4; i < num_params; i++) {
+	for (; i < num_params; i++) {
 		LLVMValueRef p = LLVMGetParam(func, i);
 		p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
 		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
 	}
 
 	/* Compute vertex load indices from instance divisors. */
+	LLVMValueRef instance_divisor_constbuf = NULL;
+
+	if (key->vs_prolog.states.instance_divisor_is_fetched) {
+		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
+		LLVMValueRef buf_index =
+			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
+		instance_divisor_constbuf =
+			ac_build_indexed_load_const(&ctx->ac, list, buf_index);
+	}
+
 	for (i = 0; i <= key->vs_prolog.last_input; i++) {
-		unsigned divisor = key->vs_prolog.states.instance_divisors[i];
+		bool divisor_is_one =
+			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
+		bool divisor_is_fetched =
+			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
 		LLVMValueRef index;
 
-		if (divisor) {
+		if (divisor_is_one || divisor_is_fetched) {
+			LLVMValueRef divisor = ctx->i32_1;
+
+			if (divisor_is_fetched) {
+				divisor = buffer_load_const(ctx, instance_divisor_constbuf,
+							    LLVMConstInt(ctx->i32, i * 4, 0));
+				divisor = LLVMBuildBitCast(gallivm->builder, divisor,
+							   ctx->i32, "");
+			}
+
 			/* InstanceID / Divisor + StartInstance */
 			index = get_instance_index_for_fetch(ctx,
+							     user_sgpr_base +
 							     SI_SGPR_START_INSTANCE,
 							     divisor);
 		} else {
 			/* VertexID + BaseVertex */
 			index = LLVMBuildAdd(gallivm->builder,
 					     LLVMGetParam(func, ctx->param_vertex_id),
-					     LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
+					     LLVMGetParam(func, user_sgpr_base +
+								SI_SGPR_BASE_VERTEX), "");
 		}
 
 		index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
@@ -7947,76 +6673,30 @@
 	si_llvm_build_ret(ctx, ret);
 }
 
-/**
- * Build the vertex shader epilog function. This is also used by the tessellation
- * evaluation shader compiled as VS.
- *
- * The input is PrimitiveID.
- *
- * If PrimitiveID is required by the pixel shader, export it.
- * Otherwise, do nothing.
- */
-static void si_build_vs_epilog_function(struct si_shader_context *ctx,
-					union si_shader_part_key *key)
-{
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
-	LLVMTypeRef params[5];
-	int num_params, i;
-
-	/* Declare input VGPRs. */
-	num_params = key->vs_epilog.states.export_prim_id ?
-			   (VS_EPILOG_PRIMID_LOC + 1) : 0;
-	assert(num_params <= ARRAY_SIZE(params));
-
-	for (i = 0; i < num_params; i++)
-		params[i] = ctx->f32;
-
-	/* Create the function. */
-	si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
-
-	/* Emit exports. */
-	if (key->vs_epilog.states.export_prim_id) {
-		struct lp_build_context *base = &bld_base->base;
-		struct ac_export_args args;
-
-		args.enabled_channels = 0x1; /* enabled channels */
-		args.valid_mask = 0; /* whether the EXEC mask is valid */
-		args.done = 0; /* DONE bit */
-		args.target = V_008DFC_SQ_EXP_PARAM +
-			      key->vs_epilog.prim_id_param_offset;
-		args.compr = 0; /* COMPR flag (0 = 32-bit export) */
-		args.out[0] = LLVMGetParam(ctx->main_fn,
-				       VS_EPILOG_PRIMID_LOC); /* X */
-		args.out[1] = base->undef; /* Y */
-		args.out[2] = base->undef; /* Z */
-		args.out[3] = base->undef; /* W */
-
-		ac_build_export(&ctx->ac, &args);
-	}
-
-	LLVMBuildRetVoid(gallivm->builder);
-}
-
-/**
- * Create & compile a vertex shader epilog. This a helper used by VS and TES.
- */
-static bool si_get_vs_epilog(struct si_screen *sscreen,
+static bool si_get_vs_prolog(struct si_screen *sscreen,
 			     LLVMTargetMachineRef tm,
-		             struct si_shader *shader,
-		             struct pipe_debug_callback *debug,
-			     struct si_vs_epilog_bits *states)
+			     struct si_shader *shader,
+			     struct pipe_debug_callback *debug,
+			     struct si_shader *main_part,
+			     const struct si_vs_prolog_bits *key)
 {
-	union si_shader_part_key epilog_key;
+	struct si_shader_selector *vs = main_part->selector;
 
-	si_get_vs_epilog_key(shader, states, &epilog_key);
+	/* The prolog is a no-op if there are no inputs. */
+	if (!vs->vs_needs_prolog)
+		return true;
 
-	shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
-					    PIPE_SHADER_VERTEX, true,
-					    &epilog_key, tm, debug,
-					    si_build_vs_epilog_function,
-					    "Vertex Shader Epilog");
-	return shader->epilog != NULL;
+	/* Get the prolog. */
+	union si_shader_part_key prolog_key;
+	si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
+			     key, shader, &prolog_key);
+
+	shader->prolog =
+		si_get_shader_part(sscreen, &sscreen->vs_prologs,
+				   PIPE_SHADER_VERTEX, true, &prolog_key, tm,
+				   debug, si_build_vs_prolog_function,
+				   "Vertex Shader Prolog");
+	return shader->prolog != NULL;
 }
 
 /**
@@ -8027,45 +6707,8 @@
 				      struct si_shader *shader,
 				      struct pipe_debug_callback *debug)
 {
-	if (shader->selector->vs_needs_prolog) {
-		union si_shader_part_key prolog_key;
-
-		/* Get the prolog. */
-		si_get_vs_prolog_key(shader, &prolog_key);
-
-		shader->prolog =
-			si_get_shader_part(sscreen, &sscreen->vs_prologs,
-					   PIPE_SHADER_VERTEX, true,
-					   &prolog_key, tm, debug,
-					   si_build_vs_prolog_function,
-					   "Vertex Shader Prolog");
-		if (!shader->prolog)
-			return false;
-	}
-
-	/* Get the epilog. */
-	if (!shader->key.as_es && !shader->key.as_ls &&
-	    !si_get_vs_epilog(sscreen, tm, shader, debug,
-			      &shader->key.part.vs.epilog))
-		return false;
-
-	return true;
-}
-
-/**
- * Select and compile (or reuse) TES parts (epilog).
- */
-static bool si_shader_select_tes_parts(struct si_screen *sscreen,
-				       LLVMTargetMachineRef tm,
-				       struct si_shader *shader,
-				       struct pipe_debug_callback *debug)
-{
-	if (shader->key.as_es)
-		return true;
-
-	/* TES compiled as VS. */
-	return si_get_vs_epilog(sscreen, tm, shader, debug,
-				&shader->key.part.tes.epilog);
+	return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
+				&shader->key.part.vs.prolog);
 }
 
 /**
@@ -8077,32 +6720,54 @@
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
-	LLVMTypeRef params[16];
+	LLVMTypeRef params[32];
 	LLVMValueRef func;
-	int last_sgpr, num_params;
+	int last_sgpr, num_params = 0;
 
-	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
-	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
-	params[SI_PARAM_CONST_BUFFERS] = ctx->i64;
-	params[SI_PARAM_SAMPLERS] = ctx->i64;
-	params[SI_PARAM_IMAGES] = ctx->i64;
-	params[SI_PARAM_SHADER_BUFFERS] = ctx->i64;
-	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
-	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
-	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
-	params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
-	params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
-	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
-	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
-	num_params = last_sgpr + 1;
+	if (ctx->screen->b.chip_class >= GFX9) {
+		params[num_params++] = ctx->i64;
+		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32; /* wave info */
+		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[num_params++] = ctx->i64;
+		params[num_params++] = ctx->i64;
+		params[num_params++] = ctx->i64;
+		params[num_params++] = ctx->i64;
+		params[num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
+		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
+	} else {
+		params[num_params++] = ctx->i64;
+		params[num_params++] = ctx->i64;
+		params[num_params++] = ctx->i64;
+		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[num_params++] = ctx->i32;
+		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
+		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
+		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
+		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
+	}
+	last_sgpr = num_params - 1;
 
 	params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
 	params[num_params++] = ctx->i32; /* invocation ID within the patch */
 	params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
 
 	/* Create the function. */
-	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
-	declare_tess_lds(ctx);
+	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
+			   ctx->screen->b.chip_class >= CIK ? 128 : 64);
+	declare_lds_as_pointer(ctx);
 	func = ctx->main_fn;
 
 	si_write_tess_factors(bld_base,
@@ -8121,9 +6786,19 @@
 				       struct si_shader *shader,
 				       struct pipe_debug_callback *debug)
 {
-	union si_shader_part_key epilog_key;
+	if (sscreen->b.chip_class >= GFX9) {
+		struct si_shader *ls_main_part =
+			shader->key.part.tcs.ls->main_shader_part_ls;
+
+		if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
+				      &shader->key.part.tcs.ls_prolog))
+			return false;
+
+		shader->previous_stage = ls_main_part;
+	}
 
 	/* Get the epilog. */
+	union si_shader_part_key epilog_key;
 	memset(&epilog_key, 0, sizeof(epilog_key));
 	epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
 
@@ -8143,20 +6818,31 @@
 				      struct si_shader *shader,
 				      struct pipe_debug_callback *debug)
 {
-	union si_shader_part_key prolog_key;
+	if (sscreen->b.chip_class >= GFX9) {
+		struct si_shader *es_main_part =
+			shader->key.part.gs.es->main_shader_part_es;
+
+		if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
+		    !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
+				      &shader->key.part.gs.vs_prolog))
+			return false;
+
+		shader->previous_stage = es_main_part;
+	}
 
 	if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
 		return true;
 
+	union si_shader_part_key prolog_key;
 	memset(&prolog_key, 0, sizeof(prolog_key));
 	prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
 
-	shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
+	shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
 					    PIPE_SHADER_GEOMETRY, true,
 					    &prolog_key, tm, debug,
 					    si_build_gs_prolog_function,
 					    "Geometry Shader Prolog");
-	return shader->prolog != NULL;
+	return shader->prolog2 != NULL;
 }
 
 /**
@@ -8201,7 +6887,7 @@
 
 	/* Create the function. */
 	si_create_function(ctx, "ps_prolog", params, num_returns, params,
-			   num_params, last_sgpr);
+			   num_params, last_sgpr, 0);
 	func = ctx->main_fn;
 
 	/* Copy inputs to outputs. This should be no-op, as the registers match,
@@ -8218,15 +6904,7 @@
 		/* POS_FIXED_PT is always last. */
 		unsigned pos = key->ps_prolog.num_input_sgprs +
 			       key->ps_prolog.num_input_vgprs - 1;
-		LLVMValueRef ptr[2], list;
-
-		/* Get the pointer to rw buffers. */
-		ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
-		ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
-		list = lp_build_gather_values(gallivm, ptr, 2);
-		list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
-		list = LLVMBuildIntToPtr(gallivm->builder, list,
-					  const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");
+		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
 
 		si_llvm_emit_polygon_stipple(ctx, list, pos);
 	}
@@ -8394,6 +7072,54 @@
 		}
 	}
 
+	/* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
+	 * says:
+	 *
+	 *    "When per-sample shading is active due to the use of a fragment
+	 *     input qualified by sample or due to the use of the gl_SampleID
+	 *     or gl_SamplePosition variables, only the bit for the current
+	 *     sample is set in gl_SampleMaskIn. When state specifies multiple
+	 *     fragment shader invocations for a given fragment, the sample
+	 *     mask for any single fragment shader invocation may specify a
+	 *     subset of the covered samples for the fragment. In this case,
+	 *     the bit corresponding to each covered sample will be set in
+	 *     exactly one fragment shader invocation."
+	 *
+	 * The samplemask loaded by hardware is always the coverage of the
+	 * entire pixel/fragment, so mask bits out based on the sample ID.
+	 */
+	if (key->ps_prolog.states.samplemask_log_ps_iter) {
+		/* The bit pattern matches that used by fixed function fragment
+		 * processing. */
+		static const uint16_t ps_iter_masks[] = {
+			0xffff, /* not used */
+			0x5555,
+			0x1111,
+			0x0101,
+			0x0001,
+		};
+		assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
+
+		uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
+		unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs +
+					  key->ps_prolog.ancillary_vgpr_index;
+		LLVMValueRef sampleid = unpack_param(ctx, ancillary_vgpr, 8, 4);
+		LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1);
+
+		samplemask = LLVMBuildBitCast(gallivm->builder, samplemask, ctx->i32, "");
+		samplemask = LLVMBuildAnd(
+			gallivm->builder,
+			samplemask,
+			LLVMBuildShl(gallivm->builder,
+				     LLVMConstInt(ctx->i32, ps_iter_mask, false),
+				     sampleid, ""),
+			"");
+		samplemask = LLVMBuildBitCast(gallivm->builder, samplemask, ctx->f32, "");
+
+		ret = LLVMBuildInsertValue(gallivm->builder, ret, samplemask,
+					   ancillary_vgpr + 1, "");
+	}
+
 	/* Tell LLVM to insert WQM instruction sequence when needed. */
 	if (key->ps_prolog.wqm) {
 		LLVMAddTargetDependentFunctionAttr(func,
@@ -8414,15 +7140,14 @@
 	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
 	LLVMTypeRef params[16+8*4+3];
 	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
-	int last_sgpr, num_params, i;
+	int last_sgpr, num_params = 0, i;
 	struct si_ps_exports exp = {};
 
 	/* Declare input SGPRs. */
-	params[SI_PARAM_RW_BUFFERS] = ctx->i64;
-	params[SI_PARAM_CONST_BUFFERS] = ctx->i64;
-	params[SI_PARAM_SAMPLERS] = ctx->i64;
-	params[SI_PARAM_IMAGES] = ctx->i64;
-	params[SI_PARAM_SHADER_BUFFERS] = ctx->i64;
+	params[ctx->param_rw_buffers = num_params++] = ctx->i64;
+	params[ctx->param_const_and_shader_buffers = num_params++] = ctx->i64;
+	params[ctx->param_samplers_and_images = num_params++] = ctx->i64;
+	assert(num_params == SI_PARAM_ALPHA_REF);
 	params[SI_PARAM_ALPHA_REF] = ctx->f32;
 	last_sgpr = SI_PARAM_ALPHA_REF;
 
@@ -8442,7 +7167,8 @@
 		params[i] = ctx->f32;
 
 	/* Create the function. */
-	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
+	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
+			   last_sgpr, 0);
 	/* Disable elimination of unused inputs. */
 	si_llvm_add_attribute(ctx->main_fn,
 				  "InitialPSInputAddr", 0xffffff);
@@ -8462,7 +7188,7 @@
 		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
 			/* Just set this if any of the colorbuffers are enabled. */
 			if (spi_format &
-			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
+			    ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
 				last_color_export = 0;
 		} else {
 			for (i = 0; i < 8; i++)
@@ -8591,6 +7317,12 @@
 		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
 	}
 
+	/* Samplemask fixup requires the sample ID. */
+	if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
+		shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
+		assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
+	}
+
 	/* The sample mask input is always enabled, because the API shader always
 	 * passes it through to the epilog. Disable it here if it's unused.
 	 */
@@ -8668,6 +7400,7 @@
 		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
 		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
 		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
+		shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
 		memcpy(shader->info.vs_output_param_offset,
 		       mainp->info.vs_output_param_offset,
 		       sizeof(mainp->info.vs_output_param_offset));
@@ -8686,8 +7419,6 @@
 				return -1;
 			break;
 		case PIPE_SHADER_TESS_EVAL:
-			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
-				return -1;
 			break;
 		case PIPE_SHADER_GEOMETRY:
 			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
@@ -8712,6 +7443,32 @@
 			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
 							shader->prolog->config.num_vgprs);
 		}
+		if (shader->previous_stage) {
+			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+							shader->previous_stage->config.num_sgprs);
+			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+							shader->previous_stage->config.num_vgprs);
+			shader->config.spilled_sgprs =
+				MAX2(shader->config.spilled_sgprs,
+				     shader->previous_stage->config.spilled_sgprs);
+			shader->config.spilled_vgprs =
+				MAX2(shader->config.spilled_vgprs,
+				     shader->previous_stage->config.spilled_vgprs);
+			shader->config.private_mem_vgprs =
+				MAX2(shader->config.private_mem_vgprs,
+				     shader->previous_stage->config.private_mem_vgprs);
+			shader->config.scratch_bytes_per_wave =
+				MAX2(shader->config.scratch_bytes_per_wave,
+				     shader->previous_stage->config.scratch_bytes_per_wave);
+			shader->info.uses_instanceid |=
+				shader->previous_stage->info.uses_instanceid;
+		}
+		if (shader->prolog2) {
+			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+							shader->prolog2->config.num_sgprs);
+			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+							shader->prolog2->config.num_vgprs);
+		}
 		if (shader->epilog) {
 			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
 							shader->epilog->config.num_sgprs);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 2aac424..0d08513 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -26,6 +26,73 @@
  *      Christian König <christian.koenig@amd.com>
  */
 
+/* The compiler middle-end architecture: Explaining (non-)monolithic shaders
+ * -------------------------------------------------------------------------
+ *
+ * Typically, there is a one-to-one correspondence between API and HW shaders,
+ * that is, for every API shader, there is exactly one shader binary in
+ * the driver.
+ *
+ * The problem with that is that we also have to emulate some API states
+ * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
+ * to deal with it are:
+ * - each shader has multiple variants for each combination of emulated states,
+ *   and the variants are compiled on demand, possibly relying on a shader
+ *   cache for good performance
+ * - patch shaders at the binary level
+ *
+ * This driver uses something completely different. The emulated states are
+ * usually implemented at the beginning or end of shaders. Therefore, we can
+ * split the shader into 3 parts:
+ * - prolog part (shader code dependent on states)
+ * - main part (the API shader)
+ * - epilog part (shader code dependent on states)
+ *
+ * Each part is compiled as a separate shader and the final binaries are
+ * concatenated. This type of shader is called non-monolithic, because it
+ * consists of multiple independent binaries. Creating a new shader variant
+ * is therefore only a concatenation of shader parts (binaries) and doesn't
+ * involve any compilation. The main shader parts are the only parts that are
+ * compiled when applications create shader objects. The prolog and epilog
+ * parts are compiled on the first use and saved, so that their binaries can
+ * be reused by many other shaders.
+ *
+ * One of the roles of the prolog part is to compute vertex buffer addresses
+ * for vertex shaders. A few of the roles of the epilog part are color buffer
+ * format conversions in pixel shaders that we have to do manually, and writing
+ * tessellation factors in tessellation control shaders. The prolog and epilog
+ * have many other important responsibilities in various shader stages.
+ * They don't just "emulate legacy stuff".
+ *
+ * Monolithic shaders are shaders where the parts are combined before LLVM
+ * compilation, and the whole thing is compiled and optimized as one unit with
+ * one binary on the output. The result is the same as the non-monolithic
+ * shader, but the final code can be better, because LLVM can optimize across
+ * all shader parts. Monolithic shaders aren't usually used except for these
+ * special cases:
+ *
+ * 1) Some rarely-used states require modification of the main shader part
+ *    itself, and in such cases, only the monolithic shader variant is
+ *    compiled, and that's always done on the first use.
+ *
+ * 2) When we do cross-stage optimizations for separate shader objects and
+ *    e.g. eliminate unused shader varyings, the resulting optimized shader
+ *    variants are always compiled as monolithic shaders, and always
+ *    asynchronously (i.e. not stalling ongoing rendering). We call them
+ *    "optimized monolithic" shaders. The important property here is that
+ *    the non-monolithic unoptimized shader variant is always available for use
+ *    when the asynchronous compilation of the optimized shader is not done
+ *    yet.
+ *
+ * Starting with GFX9 chips, some shader stages are merged, and the number of
+ * shader parts per shader increased. The complete new list of shader parts is:
+ * - 1st shader: prolog part
+ * - 1st shader: main part
+ * - 2nd shader: prolog part
+ * - 2nd shader: main part
+ * - 2nd shader: epilog part
+ */
+
 /* How linking shader inputs and outputs between vertex, tessellation, and
  * geometry shaders works.
  *
@@ -78,18 +145,22 @@
 
 #define SI_MAX_VS_OUTPUTS	40
 
+/* Shader IO unique indices are supported for TGSI_SEMANTIC_GENERIC with an
+ * index smaller than this.
+ */
+#define SI_MAX_IO_GENERIC       46
+
 /* SGPR user data indices */
 enum {
+	/* GFX9 merged shaders have RW_BUFFERS among the first 8 system SGPRs,
+	 * and these two are used for other purposes.
+	 */
 	SI_SGPR_RW_BUFFERS,  /* rings (& stream-out, VS only) */
 	SI_SGPR_RW_BUFFERS_HI,
-	SI_SGPR_CONST_BUFFERS,
-	SI_SGPR_CONST_BUFFERS_HI,
-	SI_SGPR_SAMPLERS,  /* images & sampler states interleaved */
-	SI_SGPR_SAMPLERS_HI,
-	SI_SGPR_IMAGES,
-	SI_SGPR_IMAGES_HI,
-	SI_SGPR_SHADER_BUFFERS,
-	SI_SGPR_SHADER_BUFFERS_HI,
+	SI_SGPR_CONST_AND_SHADER_BUFFERS,
+	SI_SGPR_CONST_AND_SHADER_BUFFERS_HI,
+	SI_SGPR_SAMPLERS_AND_IMAGES,
+	SI_SGPR_SAMPLERS_AND_IMAGES_HI,
 	SI_NUM_RESOURCE_SGPRS,
 
 	/* all VS variants */
@@ -101,91 +172,52 @@
 	SI_SGPR_VS_STATE_BITS,
 	SI_VS_NUM_USER_SGPR,
 
-	/* both TCS and TES */
-	SI_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
+	/* TES */
+	SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
+	SI_SGPR_TES_OFFCHIP_ADDR_BASE64K,
 	SI_TES_NUM_USER_SGPR,
 
-	/* TCS only */
-	SI_SGPR_TCS_OUT_OFFSETS = SI_TES_NUM_USER_SGPR,
-	SI_SGPR_TCS_OUT_LAYOUT,
-	SI_SGPR_TCS_IN_LAYOUT,
-	SI_TCS_NUM_USER_SGPR,
+	/* GFX6-8: TCS only */
+	GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
+	GFX6_SGPR_TCS_OUT_OFFSETS,
+	GFX6_SGPR_TCS_OUT_LAYOUT,
+	GFX6_SGPR_TCS_IN_LAYOUT,
+	GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K,
+	GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K,
+	GFX6_TCS_NUM_USER_SGPR,
+
+	/* GFX9: Merged LS-HS (VS-TCS) only. */
+	GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
+	GFX9_SGPR_TCS_OUT_OFFSETS,
+	GFX9_SGPR_TCS_OUT_LAYOUT,
+	GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K,
+	GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K,
+	GFX9_SGPR_unused_to_align_the_next_pointer,
+	GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS,
+	GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS_HI,
+	GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES,
+	GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES_HI,
+	GFX9_TCS_NUM_USER_SGPR,
+
+	/* GFX9: Merged ES-GS (VS-GS or TES-GS). */
+	GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS = SI_VS_NUM_USER_SGPR,
+	GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS_HI,
+	GFX9_SGPR_GS_SAMPLERS_AND_IMAGES,
+	GFX9_SGPR_GS_SAMPLERS_AND_IMAGES_HI,
+	GFX9_GS_NUM_USER_SGPR,
 
 	/* GS limits */
-	SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
+	GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
 	SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1,
 
 	/* PS only */
 	SI_SGPR_ALPHA_REF	= SI_NUM_RESOURCE_SGPRS,
 	SI_PS_NUM_USER_SGPR,
-
-	/* CS only */
-	SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS,
-	SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3,
-	SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3
 };
 
 /* LLVM function parameter indices */
 enum {
-	SI_PARAM_RW_BUFFERS,
-	SI_PARAM_CONST_BUFFERS,
-	SI_PARAM_SAMPLERS,
-	SI_PARAM_IMAGES,
-	SI_PARAM_SHADER_BUFFERS,
-	SI_NUM_RESOURCE_PARAMS,
-
-	/* VS only parameters */
-	SI_PARAM_VERTEX_BUFFERS	= SI_NUM_RESOURCE_PARAMS,
-	SI_PARAM_BASE_VERTEX,
-	SI_PARAM_START_INSTANCE,
-	SI_PARAM_DRAWID,
-	SI_PARAM_VS_STATE_BITS,
-
-	/* Layout of TCS outputs in the offchip buffer
-	 *   [0:8] = the number of patches per threadgroup.
-	 *   [9:15] = the number of output vertices per patch.
-	 *   [16:31] = the offset of per patch attributes in the buffer in bytes.
-	 */
-	SI_PARAM_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_PARAMS, /* for TCS & TES */
-
-	/* TCS only parameters. */
-
-	/* Offsets where TCS outputs and TCS patch outputs live in LDS:
-	 *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
-	 *   [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32* + 32*32
-	 */
-	SI_PARAM_TCS_OUT_OFFSETS,
-
-	/* Layout of TCS outputs / TES inputs:
-	 *   [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4
-	 *   [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4
-	 *   [26:31] = gl_PatchVerticesIn, max = 32
-	 */
-	SI_PARAM_TCS_OUT_LAYOUT,
-
-	/* Layout of LS outputs / TCS inputs
-	 *   [8:20] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4
-	 *   [24:31] = stride between vertices in dwords = num_inputs * 4, max = 32*4
-	 * (same layout as SI_PARAM_VS_STATE_BITS)
-	 */
-	SI_PARAM_TCS_IN_LAYOUT,
-
-	SI_PARAM_TCS_OC_LDS,
-	SI_PARAM_TESS_FACTOR_OFFSET,
-	SI_PARAM_PATCH_ID,
-	SI_PARAM_REL_IDS,
-
-	/* GS only parameters */
-	SI_PARAM_GS2VS_OFFSET = SI_NUM_RESOURCE_PARAMS,
-	SI_PARAM_GS_WAVE_ID,
-	SI_PARAM_VTX0_OFFSET,
-	SI_PARAM_VTX1_OFFSET,
-	SI_PARAM_PRIMITIVE_ID,
-	SI_PARAM_VTX2_OFFSET,
-	SI_PARAM_VTX3_OFFSET,
-	SI_PARAM_VTX4_OFFSET,
-	SI_PARAM_VTX5_OFFSET,
-	SI_PARAM_GS_INSTANCE_ID,
+	SI_NUM_RESOURCE_PARAMS = 3,
 
 	/* PS only parameters */
 	SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
@@ -207,12 +239,6 @@
 	SI_PARAM_SAMPLE_COVERAGE,
 	SI_PARAM_POS_FIXED_PT,
 
-	/* CS only parameters */
-	SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS,
-	SI_PARAM_BLOCK_SIZE,
-	SI_PARAM_BLOCK_ID,
-	SI_PARAM_THREAD_ID,
-
 	SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
 };
 
@@ -275,6 +301,7 @@
  * binaries for one TGSI program. This can be shared by multiple contexts.
  */
 struct si_shader_selector {
+	struct pipe_reference	reference;
 	struct si_screen	*screen;
 	struct util_queue_fence ready;
 	struct si_compiler_ctx_state compiler_ctx_state;
@@ -299,6 +326,9 @@
 	/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
 	unsigned	type;
 	bool		vs_needs_prolog;
+	unsigned	pa_cl_vs_out_cntl;
+	ubyte		clipdist_mask;
+	ubyte		culldist_mask;
 
 	/* GS parameters. */
 	unsigned	esgs_itemsize;
@@ -309,6 +339,7 @@
 	unsigned	max_gs_stream; /* count - 1 */
 	unsigned	gsvs_vertex_size;
 	unsigned	max_gsvs_emit_size;
+	unsigned	enabled_streamout_buffer_mask;
 
 	/* PS parameters. */
 	unsigned	color_attr_index[2];
@@ -322,11 +353,13 @@
 	unsigned local_size;
 
 	uint64_t	outputs_written;	/* "get_unique_index" bits */
-	uint32_t	patch_outputs_written;	/* "get_unique_index" bits */
-	uint32_t	outputs_written2;	/* "get_unique_index2" bits */
+	uint32_t	patch_outputs_written;	/* "get_unique_index_patch" bits */
 
 	uint64_t	inputs_read;		/* "get_unique_index" bits */
-	uint32_t	inputs_read2;		/* "get_unique_index2" bits */
+
+	/* bitmasks of used descriptor slots */
+	uint32_t	active_const_and_shader_buffers;
+	uint64_t	active_samplers_and_images;
 };
 
 /* Valid shader configurations:
@@ -334,20 +367,32 @@
  * API shaders       VS | TCS | TES | GS |pass| PS
  * are compiled as:     |     |     |    |thru|
  *                      |     |     |    |    |
- * Only VS & PS:     VS | --  | --  | -- | -- | PS
- * With GS:          ES | --  | --  | GS | VS | PS
- * With Tessel.:     LS | HS  | VS  | -- | -- | PS
- * With both:        LS | HS  | ES  | GS | VS | PS
+ * Only VS & PS:     VS |     |     |    |    | PS
+ * GFX6 - with GS:   ES |     |     | GS | VS | PS
+ *      - with tess: LS | HS  | VS  |    |    | PS
+ *      - with both: LS | HS  | ES  | GS | VS | PS
+ * GFX9 - with GS:   -> |     |     | GS | VS | PS
+ *      - with tess: -> | HS  | VS  |    |    | PS
+ *      - with both: -> | HS  | ->  | GS | VS | PS
+ *
+ * -> = merged with the next stage
  */
 
+/* Use byte alignment for all following structure members for optimal
+ * shader key memory footprint.
+ */
+#pragma pack(push, 1)
+
 /* Common VS bits between the shader key and the prolog key. */
 struct si_vs_prolog_bits {
-	unsigned	instance_divisors[SI_MAX_ATTRIBS];
-};
-
-/* Common VS bits between the shader key and the epilog key. */
-struct si_vs_epilog_bits {
-	unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
+	/* - If neither "is_one" nor "is_fetched" has a bit set, the instance
+	 *   divisor is 0.
+	 * - If "is_one" has a bit set, the instance divisor is 1.
+	 * - If "is_fetched" has a bit set, the instance divisor will be loaded
+	 *   from the constant buffer.
+	 */
+	uint16_t	instance_divisor_is_one;     /* bitmask of inputs */
+	uint16_t	instance_divisor_is_fetched; /* bitmask of inputs */
 };
 
 /* Common TCS bits between the shader key and the epilog key. */
@@ -371,6 +416,7 @@
 	unsigned	force_linear_center_interp:1;
 	unsigned	bc_optimize_for_persp:1;
 	unsigned	bc_optimize_for_linear:1;
+	unsigned	samplemask_log_ps_iter:3;
 };
 
 /* Common PS bits between the shader key and the epilog key. */
@@ -388,27 +434,31 @@
 union si_shader_part_key {
 	struct {
 		struct si_vs_prolog_bits states;
-		unsigned	num_input_sgprs:5;
+		unsigned	num_input_sgprs:6;
+		/* For merged stages such as LS-HS, HS input VGPRs are first. */
+		unsigned	num_merged_next_stage_vgprs:3;
 		unsigned	last_input:4;
+		unsigned	as_ls:1;
+		/* Prologs for monolithic shaders shouldn't set EXEC. */
+		unsigned	is_monolithic:1;
 	} vs_prolog;
 	struct {
-		struct si_vs_epilog_bits states;
-		unsigned	prim_id_param_offset:5;
-	} vs_epilog;
-	struct {
 		struct si_tcs_epilog_bits states;
 	} tcs_epilog;
 	struct {
 		struct si_gs_prolog_bits states;
+		/* Prologs of monolithic shaders shouldn't set EXEC. */
+		unsigned	is_monolithic:1;
 	} gs_prolog;
 	struct {
 		struct si_ps_prolog_bits states;
-		unsigned	num_input_sgprs:5;
+		unsigned	num_input_sgprs:6;
 		unsigned	num_input_vgprs:5;
 		/* Color interpolation and two-side color selection. */
 		unsigned	colors_read:8; /* color input components read */
 		unsigned	num_interp_inputs:5; /* BCOLOR is at this location */
 		unsigned	face_vgpr_index:5;
+		unsigned	ancillary_vgpr_index:5;
 		unsigned	wqm:1;
 		char		color_attr_index[2];
 		char		color_interp_vgpr_index[2]; /* -1 == constant */
@@ -427,15 +477,15 @@
 	union {
 		struct {
 			struct si_vs_prolog_bits prolog;
-			struct si_vs_epilog_bits epilog;
 		} vs;
 		struct {
+			struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
+			struct si_shader_selector *ls;   /* for merged LS-HS */
 			struct si_tcs_epilog_bits epilog;
 		} tcs; /* tessellation control shader */
 		struct {
-			struct si_vs_epilog_bits epilog; /* same as VS */
-		} tes; /* tessellation evaluation shader */
-		struct {
+			struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
+			struct si_shader_selector *es;   /* for merged ES-GS */
 			struct si_gs_prolog_bits prolog;
 		} gs;
 		struct {
@@ -451,26 +501,36 @@
 	unsigned as_ls:1; /* local shader, which precedes TCS */
 
 	/* Flags for monolithic compilation only. */
-	union {
-		struct {
-			/* One byte for every input: SI_FIX_FETCH_* enums. */
-			uint8_t		fix_fetch[SI_MAX_ATTRIBS];
-		} vs;
-		struct {
-			uint64_t	inputs_to_copy; /* for fixed-func TCS */
-		} tcs;
+	struct {
+		/* One byte for every input: SI_FIX_FETCH_* enums. */
+		uint8_t		vs_fix_fetch[SI_MAX_ATTRIBS];
+
+		union {
+			uint64_t	ff_tcs_inputs_to_copy; /* for fixed-func TCS */
+			/* When PS needs PrimID and GS is disabled. */
+			unsigned	vs_export_prim_id:1;
+		} u;
 	} mono;
 
 	/* Optimization flags for asynchronous compilation only. */
-	union {
-		struct {
-			uint64_t	kill_outputs; /* "get_unique_index" bits */
-			uint32_t	kill_outputs2; /* "get_unique_index2" bits */
-			unsigned	clip_disable:1;
-		} hw_vs; /* HW VS (it can be VS, TES, GS) */
+	struct {
+		/* For HW VS (it can be VS, TES, GS) */
+		uint64_t	kill_outputs; /* "get_unique_index" bits */
+		unsigned	clip_disable:1;
+
+		/* For shaders where monolithic variants have better code.
+		 *
+		 * This is a flag that has no effect on code generation,
+		 * but forces monolithic shaders to be used as soon as
+		 * possible, because it's in the "opt" group.
+		 */
+		unsigned	prefer_mono:1;
 	} opt;
 };
 
+/* Restore the pack alignment to default. */
+#pragma pack(pop)
+
 struct si_shader_config {
 	unsigned			num_sgprs;
 	unsigned			num_vgprs;
@@ -486,24 +546,13 @@
 	unsigned			rsrc2;
 };
 
-enum {
-	/* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
-	EXP_PARAM_OFFSET_0 = 0,
-	EXP_PARAM_OFFSET_31 = 31,
-	/* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
-	EXP_PARAM_DEFAULT_VAL_0000 = 64,
-	EXP_PARAM_DEFAULT_VAL_0001,
-	EXP_PARAM_DEFAULT_VAL_1110,
-	EXP_PARAM_DEFAULT_VAL_1111,
-	EXP_PARAM_UNDEFINED = 255,
-};
-
 /* GCN-specific shader info. */
 struct si_shader_info {
 	ubyte			vs_output_param_offset[SI_MAX_VS_OUTPUTS];
 	ubyte			num_input_sgprs;
 	ubyte			num_input_vgprs;
-	char			face_vgpr_index;
+	signed char		face_vgpr_index;
+	signed char		ancillary_vgpr_index;
 	bool			uses_instanceid;
 	ubyte			nr_pos_exports;
 	ubyte			nr_param_exports;
@@ -513,9 +562,12 @@
 	struct si_compiler_ctx_state	compiler_ctx_state;
 
 	struct si_shader_selector	*selector;
+	struct si_shader_selector	*previous_stage_sel; /* for refcounting */
 	struct si_shader		*next_variant;
 
 	struct si_shader_part		*prolog;
+	struct si_shader		*previous_stage; /* for GFX9 */
+	struct si_shader_part		*prolog2;
 	struct si_shader_part		*epilog;
 
 	struct si_pm4_state		*pm4;
@@ -562,33 +614,23 @@
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		     struct si_shader *shader,
 		     struct pipe_debug_callback *debug);
-int si_compile_llvm(struct si_screen *sscreen,
-		    struct ac_shader_binary *binary,
-		    struct si_shader_config *conf,
-		    LLVMTargetMachineRef tm,
-		    LLVMModuleRef mod,
-		    struct pipe_debug_callback *debug,
-		    unsigned processor,
-		    const char *name);
 void si_shader_destroy(struct si_shader *shader);
+unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
-unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
 		    struct pipe_debug_callback *debug, unsigned processor,
 		    FILE *f, bool check_debug_option);
 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
 				      unsigned *lds_size);
-void si_shader_apply_scratch_relocs(struct si_context *sctx,
-			struct si_shader *shader,
-			struct si_shader_config *config,
-			uint64_t scratch_va);
+void si_shader_apply_scratch_relocs(struct si_shader *shader,
+				    uint64_t scratch_va);
 void si_shader_binary_read_config(struct ac_shader_binary *binary,
 				  struct si_shader_config *conf,
 				  unsigned symbol_offset);
 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
 				    bool writes_samplemask);
-const char *si_get_shader_name(struct si_shader *shader, unsigned processor);
+const char *si_get_shader_name(const struct si_shader *shader, unsigned processor);
 
 /* Inline helpers. */
 
@@ -604,4 +646,16 @@
 	return &sel->main_shader_part;
 }
 
+static inline bool
+si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
+{	/* A NULL selector reports no bindless sampler use. */
+	return selector && selector->info.uses_bindless_samplers;
+}
+
+static inline bool
+si_shader_uses_bindless_images(struct si_shader_selector *selector)
+{	/* A NULL selector reports no bindless image use. */
+	return selector && selector->info.uses_bindless_images;
+}
+
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index fd7deec..6b98bca 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -25,6 +25,7 @@
 #define SI_SHADER_PRIVATE_H
 
 #include "si_shader.h"
+#include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_tgsi.h"
 #include "tgsi/tgsi_parse.h"
@@ -57,6 +58,12 @@
 
 	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
 
+	/* For clamping the non-constant index in resource indexing: */
+	unsigned num_const_buffers;
+	unsigned num_shader_buffers;
+	unsigned num_images;
+	unsigned num_samplers;
+
 	/* Whether the prolog will be compiled separately. */
 	bool separate_prolog;
 
@@ -99,6 +106,8 @@
 	unsigned flow_depth;
 	unsigned flow_depth_max;
 
+	struct lp_build_if_state merged_wrap_if_state;
+
 	struct tgsi_array_info *temp_arrays;
 	LLVMValueRef *temp_array_allocas;
 
@@ -107,20 +116,97 @@
 	LLVMValueRef main_fn;
 	LLVMTypeRef return_type;
 
-	int param_streamout_config;
-	int param_streamout_write_index;
-	int param_streamout_offset[4];
+	/* Parameter indices for LLVMGetParam. */
+	int param_rw_buffers;
+	int param_const_and_shader_buffers;
+	int param_samplers_and_images;
+	/* Common inputs for merged shaders. */
+	int param_merged_wave_info;
+	int param_merged_scratch_offset;
+	/* API VS */
+	int param_vertex_buffers;
+	int param_base_vertex;
+	int param_start_instance;
+	int param_draw_id;
 	int param_vertex_id;
 	int param_rel_auto_id;
 	int param_vs_prim_id;
 	int param_instance_id;
 	int param_vertex_index0;
+	/* VS states and layout of LS outputs / TCS inputs at the end
+	 *   [0] = clamp vertex color
+	 *   [1] = indexed
+	 *   [8:20] = stride between patches in DW = num_inputs * num_vertices * 4
+	 *            max = 32*32*4 + 32*4
+	 *   [24:31] = stride between vertices in DW = num_inputs * 4
+	 *             max = 32*4
+	 */
+	int param_vs_state_bits;
+	/* HW VS */
+	int param_streamout_config;
+	int param_streamout_write_index;
+	int param_streamout_offset[4];
+
+	/* API TCS & TES */
+	/* Layout of TCS outputs in the offchip buffer
+	 * # 6 bits
+	 *   [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
+	 * # 6 bits
+	 *   [6:11] = the number of output vertices per patch, max = 32
+	 * # 20 bits
+	 *   [12:31] = the offset of per patch attributes in the buffer in bytes.
+	 *             max = NUM_PATCHES*32*32*16
+	 */
+	int param_tcs_offchip_layout;
+
+	/* API TCS */
+	/* Offsets where TCS outputs and TCS patch outputs live in LDS:
+	 *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
+	 *   [16:31] = TCS output patch0 offset for per-patch / 16
+	 *             max = (NUM_PATCHES + 1) * 32*32
+	 */
+	int param_tcs_out_lds_offsets;
+	/* Layout of TCS outputs / TES inputs:
+	 *   [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
+	 *            max = 32*32*4 + 32*4
+	 *   [13:20] = stride between output vertices in DW = num_inputs * 4
+	 *             max = 32*4
+	 *   [26:31] = gl_PatchVerticesIn, max = 32
+	 */
+	int param_tcs_out_lds_layout;
+	int param_tcs_offchip_addr_base64k;
+	int param_tcs_factor_addr_base64k;
+	int param_tcs_offchip_offset;
+	int param_tcs_factor_offset;
+	int param_tcs_patch_id;
+	int param_tcs_rel_ids;
+
+	/* API TES */
 	int param_tes_u;
 	int param_tes_v;
 	int param_tes_rel_patch_id;
 	int param_tes_patch_id;
+	/* HW ES */
 	int param_es2gs_offset;
-	int param_oc_lds;
+	/* API GS */
+	int param_gs2vs_offset;
+	int param_gs_wave_id; /* GFX6 */
+	int param_gs_vtx0_offset; /* in dwords (GFX6) */
+	int param_gs_vtx1_offset; /* in dwords (GFX6) */
+	int param_gs_prim_id;
+	int param_gs_vtx2_offset; /* in dwords (GFX6) */
+	int param_gs_vtx3_offset; /* in dwords (GFX6) */
+	int param_gs_vtx4_offset; /* in dwords (GFX6) */
+	int param_gs_vtx5_offset; /* in dwords (GFX6) */
+	int param_gs_instance_id;
+	int param_gs_vtx01_offset; /* in dwords (GFX9) */
+	int param_gs_vtx23_offset; /* in dwords (GFX9) */
+	int param_gs_vtx45_offset; /* in dwords (GFX9) */
+	/* CS */
+	int param_grid_size;
+	int param_block_size;
+	int param_block_id[3];
+	int param_thread_id;
 
 	LLVMTargetMachineRef tm;
 
@@ -134,6 +220,7 @@
 
 	LLVMValueRef lds;
 	LLVMValueRef gs_next_vertex[4];
+	LLVMValueRef postponed_kill;
 	LLVMValueRef return_value;
 
 	LLVMTypeRef voidt;
@@ -143,7 +230,6 @@
 	LLVMTypeRef i64;
 	LLVMTypeRef i128;
 	LLVMTypeRef f32;
-	LLVMTypeRef v16i8;
 	LLVMTypeRef v2i32;
 	LLVMTypeRef v4i32;
 	LLVMTypeRef v4f32;
@@ -162,9 +248,6 @@
 }
 
 void si_llvm_add_attribute(LLVMValueRef F, const char *name, int value);
-void si_llvm_shader_type(LLVMValueRef F, unsigned type);
-
-LLVMTargetRef si_llvm_get_amdgpu_target(const char *triple);
 
 unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
 			 LLVMTargetMachineRef tm,
@@ -182,10 +265,9 @@
 
 void si_llvm_context_init(struct si_shader_context *ctx,
 			  struct si_screen *sscreen,
-			  struct si_shader *shader,
-			  LLVMTargetMachineRef tm,
-			  const struct tgsi_shader_info *info,
-			  const struct tgsi_token *tokens);
+			  LLVMTargetMachineRef tm);
+void si_llvm_context_set_tgsi(struct si_shader_context *ctx,
+			      struct si_shader *shader);
 
 void si_llvm_create_func(struct si_shader_context *ctx,
 			 const char *name,
@@ -194,8 +276,7 @@
 
 void si_llvm_dispose(struct si_shader_context *ctx);
 
-void si_llvm_finalize_module(struct si_shader_context *ctx,
-			     bool run_verifier);
+void si_llvm_optimize_module(struct si_shader_context *ctx);
 
 LLVMValueRef si_llvm_emit_fetch_64bit(struct lp_build_tgsi_context *bld_base,
 				      enum tgsi_opcode_type type,
@@ -212,6 +293,20 @@
 			const struct tgsi_opcode_info *info,
 			LLVMValueRef dst[4]);
 
+/* Combine these with & instead of |. */
+#define NOOP_WAITCNT 0xf7f
+#define LGKM_CNT 0x07f
+#define VM_CNT 0xf70
+
+void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16);
+
+LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
+					   const struct tgsi_ind_register *ind,
+					   int rel_index, unsigned num);
+
+LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements);
+
 void si_shader_context_init_alu(struct lp_build_tgsi_context *bld_base);
+void si_shader_context_init_mem(struct si_shader_context *ctx);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c
index 1e2d75d..12f8de4 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c
@@ -60,6 +60,27 @@
 		     struct lp_build_emit_data *emit_data)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+
+	if (ctx->postponed_kill) {
+		if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_KILL_IF) {
+			LLVMValueRef val;
+
+			/* Take the minimum kill value. This is the same as OR
+			 * between 2 kill values. If the value is negative,
+			 * the pixel will be killed.
+			 */
+			val = LLVMBuildLoad(builder, ctx->postponed_kill, "");
+			val = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
+							val, emit_data->args[0]);
+			LLVMBuildStore(builder, val, ctx->postponed_kill);
+		} else {
+			LLVMBuildStore(builder,
+				       LLVMConstReal(ctx->f32, -1),
+				       ctx->postponed_kill);
+		}
+		return;
+	}
 
 	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_KILL_IF)
 		ac_build_kill(&ctx->ac, emit_data->args[0]);
@@ -701,8 +722,7 @@
 			      emit_data->args[0], emit_data->args[1], "");
 
 	/* Use v_rcp_f32 instead of precise division. */
-	if (HAVE_LLVM >= 0x0309 &&
-	    !LLVMIsConstant(emit_data->output[emit_data->chan]))
+	if (!LLVMIsConstant(emit_data->output[emit_data->chan]))
 		LLVMSetMetadata(emit_data->output[emit_data->chan],
 				ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
 }
@@ -748,8 +768,7 @@
 	bld_base->op_actions[TGSI_OPCODE_DSLT].emit = emit_dcmp;
 	bld_base->op_actions[TGSI_OPCODE_DSNE].emit = emit_dcmp;
 	bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_DRSQ].intr_name =
-		HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.rsq.f64" : "llvm.AMDGPU.rsq.f64";
+	bld_base->op_actions[TGSI_OPCODE_DRSQ].intr_name = "llvm.amdgcn.rsq.f64";
 	bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_DSQRT].intr_name = "llvm.sqrt.f64";
 	bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem;
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
new file mode 100644
index 0000000..36020be
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -0,0 +1,2049 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_shader_internal.h"
+#include "si_pipe.h"
+#include "sid.h"
+#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_gather.h"
+#include "gallivm/lp_bld_intr.h"
+#include "tgsi/tgsi_build.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
+				struct lp_build_tgsi_context *bld_base,
+				struct lp_build_emit_data *emit_data);
+
+static const struct lp_build_tgsi_action tex_action;
+
+enum desc_type { /* NOTE(review): descriptor kinds; users are not in this excerpt */
+	DESC_IMAGE,
+	DESC_BUFFER,
+	DESC_FMASK,
+	DESC_SAMPLER,
+};
+
+/**
+ * Given a buffer resource descriptor (a vector of i32 dwords), extract the
+ * size of the buffer in number of elements and return it as an i32.
+ */
+static LLVMValueRef get_buffer_size(
+	struct lp_build_tgsi_context *bld_base,
+	LLVMValueRef descriptor)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = &ctx->gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef size = /* dword 2 of the descriptor */
+		LLVMBuildExtractElement(builder, descriptor,
+					LLVMConstInt(ctx->i32, 2, 0), "");
+
+	if (ctx->screen->b.chip_class == VI) {
+		/* On VI, the descriptor contains the size in bytes,
+		 * but TXQ must return the size in elements, hence the division.
+		 * The stride is always non-zero for resources using TXQ.
+		 */
+		LLVMValueRef stride = /* ends up as bits [16:29] of dword 1 */
+			LLVMBuildExtractElement(builder, descriptor,
+						ctx->i32_1, "");
+		stride = LLVMBuildLShr(builder, stride,
+				       LLVMConstInt(ctx->i32, 16, 0), "");
+		stride = LLVMBuildAnd(builder, stride,
+				      LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
+
+		size = LLVMBuildUDiv(builder, size, stride, "");
+	}
+
+	return size;
+}
+
+static LLVMValueRef
+shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
+			 const struct tgsi_full_src_register *reg)
+{	/* Load the descriptor of the shader buffer that \p reg names. */
+	LLVMValueRef list = LLVMGetParam(ctx->main_fn,
+					 ctx->param_const_and_shader_buffers);
+	LLVMValueRef slot;
+
+	if (reg->Register.Indirect) { /* bounded index, counted down from the last slot */
+		LLVMValueRef last =
+			LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0);
+
+		slot = si_get_bounded_indirect_index(ctx, &reg->Indirect,
+						     reg->Register.Index,
+						     ctx->num_shader_buffers);
+		slot = LLVMBuildSub(ctx->gallivm.builder, last, slot, "");
+	} else {
+		slot = LLVMConstInt(ctx->i32,
+				    si_get_shaderbuf_slot(reg->Register.Index), 0);
+	}
+	return ac_build_indexed_load_const(&ctx->ac, list, slot);
+}
+
+static bool tgsi_is_array_sampler(unsigned target)
+{
+	if (target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
+	    target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
+	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
+		return true; /* shadow flavours of the array targets */
+	return target == TGSI_TEXTURE_1D_ARRAY || target == TGSI_TEXTURE_2D_ARRAY ||
+	       target == TGSI_TEXTURE_CUBE_ARRAY ||
+	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
+}
+
+static bool tgsi_is_array_image(unsigned target)
+{
+	if (target == TGSI_TEXTURE_3D || target == TGSI_TEXTURE_CUBE)
+		return true; /* not *_ARRAY targets, but treated as arrays here */
+	return target == TGSI_TEXTURE_1D_ARRAY ||
+	       target == TGSI_TEXTURE_2D_ARRAY ||
+	       target == TGSI_TEXTURE_CUBE_ARRAY ||
+	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
+}
+
+/**
+ * Given a 256-bit resource descriptor, force the DCC enable bit to off.
+ *
+ * At least on Tonga, executing image stores on images with DCC enabled and
+ * non-trivial can eventually lead to lockups. This can occur when an
+ * application binds an image as read-only but then uses a shader that writes
+ * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
+ * program termination) in this case, but it doesn't cost much to be a bit
+ * nicer: disabling DCC in the shader still leads to undefined results but
+ * avoids the lockup.
+ */
+static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
+				  LLVMValueRef rsrc)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	LLVMValueRef dword6_idx, keep_mask, dword6;
+
+	if (ctx->screen->b.chip_class <= CIK)
+		return rsrc; /* descriptors up to CIK are passed through as-is */
+
+	/* AND dword 6 with C_008F28_COMPRESSION_EN to clear the enable bit. */
+	dword6_idx = LLVMConstInt(ctx->i32, 6, 0);
+	keep_mask = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
+	dword6 = LLVMBuildExtractElement(builder, rsrc, dword6_idx, "");
+	dword6 = LLVMBuildAnd(builder, dword6, keep_mask, "");
+	return LLVMBuildInsertElement(builder, rsrc, dword6, dword6_idx, "");
+}
+
+static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
+				    LLVMValueRef list, LLVMValueRef index,
+				    unsigned target)
+{
+	LLVMBuilderRef b = ctx->gallivm.builder;
+	LLVMValueRef two;
+
+	if (target != TGSI_TEXTURE_BUFFER)
+		return ac_build_indexed_load_const(&ctx->ac, list, index);
+
+	/* Buffer targets view the list as v4i32 elements and read element
+	 * 2 * index + 1. */
+	two = LLVMConstInt(ctx->i32, 2, 0);
+	index = LLVMBuildAdd(b, LLVMBuildMul(b, index, two, ""), ctx->i32_1, "");
+	list = LLVMBuildPointerCast(b, list, si_const_array(ctx->v4i32, 0), "");
+	return ac_build_indexed_load_const(&ctx->ac, list, index);
+}
+
+/**
+ * Load the resource descriptor for \p image into \p rsrc.  A register outside
+ * TGSI_FILE_IMAGE is fetched as the 64-bit address of its descriptor.  DCC is
+ * forced off on non-buffer targets for stores and shader-written image slots.
+ */
+static void
+image_fetch_rsrc(
+	struct lp_build_tgsi_context *bld_base,
+	const struct tgsi_full_src_register *image,
+	bool is_store, unsigned target,
+	LLVMValueRef *rsrc)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
+					     ctx->param_samplers_and_images);
+	LLVMValueRef index;
+	bool dcc_off = is_store;
+
+	if (!image->Register.Indirect) {
+		const struct tgsi_shader_info *info = bld_base->info;
+		unsigned images_writemask = info->images_store |
+					    info->images_atomic;
+
+		index = LLVMConstInt(ctx->i32,
+				     si_get_image_slot(image->Register.Index), 0);
+		/* Non-image files are not slots; their index can exceed 31. */
+		if (image->Register.File == TGSI_FILE_IMAGE &&
+		    (images_writemask & (1u << image->Register.Index)))
+			dcc_off = true;
+	} else {
+		/* From the GL_ARB_shader_image_load_store extension spec:
+		 *
+		 *    If a shader performs an image load, store, or atomic
+		 *    operation using an image variable declared as an array,
+		 *    and if the index used to select an individual element is
+		 *    negative or greater than or equal to the size of the
+		 *    array, the results of the operation are undefined but may
+		 *    not lead to termination.
+		 */
+		index = si_get_bounded_indirect_index(ctx, &image->Indirect,
+						      image->Register.Index,
+						      ctx->num_images);
+		index = LLVMBuildSub(ctx->gallivm.builder,
+				     LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
+				     index, "");
+	}
+
+	if (image->Register.File != TGSI_FILE_IMAGE) {
+		LLVMBuilderRef builder = ctx->gallivm.builder;
+		LLVMValueRef ptr = lp_build_emit_fetch_src(bld_base, image,
+							   TGSI_TYPE_UNSIGNED64, 0);
+		rsrc_ptr = LLVMBuildIntToPtr(builder, ptr,
+					     si_const_array(ctx->v8i32, 0), "");
+		index = LLVMConstInt(ctx->i32, 0, 0);
+	}
+
+	*rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
+	if (dcc_off && target != TGSI_TEXTURE_BUFFER)
+		*rsrc = force_dcc_off(ctx, *rsrc);
+}
+
+static LLVMValueRef image_fetch_coords(
+		struct lp_build_tgsi_context *bld_base,
+		const struct tgsi_full_instruction *inst,
+		unsigned src, LLVMValueRef desc)
+{	/* Return the i32 coordinates of operand src: a scalar if one, else a vector. */
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = &ctx->gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	unsigned target = inst->Memory.Texture;
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
+	LLVMValueRef coords[4];
+	LLVMValueRef tmp;
+	int chan;
+
+	for (chan = 0; chan < num_coords; ++chan) {
+		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
+		tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
+		coords[chan] = tmp;
+	}
+
+	if (ctx->screen->b.chip_class >= GFX9) {
+		/* 1D textures are allocated and used as 2D on GFX9. */
+		if (target == TGSI_TEXTURE_1D) {
+			coords[1] = ctx->i32_0;
+			num_coords++;
+		} else if (target == TGSI_TEXTURE_1D_ARRAY) {
+			coords[2] = coords[1]; /* second coord moves up; slot 1 becomes 0 */
+			coords[1] = ctx->i32_0;
+			num_coords++;
+		} else if (target == TGSI_TEXTURE_2D) {
+			/* The hw can't bind a slice of a 3D image as a 2D
+			 * image, because it ignores BASE_ARRAY if the target
+			 * is 3D. The workaround is to read BASE_ARRAY and set
+			 * it as the 3rd address operand for all 2D images.
+			 */
+			LLVMValueRef first_layer, const5, mask;
+
+			const5 = LLVMConstInt(ctx->i32, 5, 0);
+			mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
+			first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
+			first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
+
+			coords[2] = first_layer; /* dword 5 of desc masked to BASE_ARRAY */
+			num_coords++;
+		}
+	}
+
+	if (num_coords == 1) /* a single coordinate is returned unwrapped */
+		return coords[0];
+
+	if (num_coords == 3) {
+		/* LLVM has difficulties lowering 3-element vectors. */
+		coords[3] = bld_base->uint_bld.undef;
+		num_coords = 4;
+	}
+
+	return lp_build_gather_values(gallivm, coords, num_coords);
+}
+
+/**
+ * Append the mode-bit operands of an image intrinsic; see below for the order.
+ */
+static void image_append_args(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data * emit_data,
+		unsigned target,
+		bool atomic,
+		bool force_glc)
+{
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
+	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
+	LLVMValueRef r128 = i1false;
+	LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
+	LLVMValueRef glc = /* forced, or requested by a COHERENT/VOLATILE qualifier */
+		force_glc ||
+		inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
+		i1true : i1false;
+	LLVMValueRef slc = i1false;
+	LLVMValueRef lwe = i1false;
+
+	if (atomic || (HAVE_LLVM <= 0x0309)) { /* order: r128, da, [glc], slc */
+		emit_data->args[emit_data->arg_count++] = r128;
+		emit_data->args[emit_data->arg_count++] = da;
+		if (!atomic) { /* glc is not appended for atomics */
+			emit_data->args[emit_data->arg_count++] = glc;
+		}
+		emit_data->args[emit_data->arg_count++] = slc;
+		return;
+	}
+
+	/* HAVE_LLVM >= 0x0400, non-atomic: the order is glc, slc, lwe, da. */
+	emit_data->args[emit_data->arg_count++] = glc;
+	emit_data->args[emit_data->arg_count++] = slc;
+	emit_data->args[emit_data->arg_count++] = lwe;
+	emit_data->args[emit_data->arg_count++] = da;
+}
+
+/**
+ * Append rsrc, vindex, voffset, glc (unless \p atomic) and a false slc to args.
+ * \param rsrc the v4i32 buffer resource
+ * \param index index into the buffer (stride-based)
+ * \param offset byte offset into the buffer
+ * \param force_glc set glc even without a COHERENT/VOLATILE qualifier
+ */
+static void buffer_append_args(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data *emit_data,
+		LLVMValueRef rsrc,
+		LLVMValueRef index,
+		LLVMValueRef offset,
+		bool atomic,
+		bool force_glc)
+{
+	const unsigned glc_mask = TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE;
+	LLVMValueRef *args = emit_data->args;
+
+	args[emit_data->arg_count++] = rsrc;
+	args[emit_data->arg_count++] = index; /* vindex */
+	args[emit_data->arg_count++] = offset; /* voffset */
+	if (!atomic) {
+		/* glc is only appended for non-atomic accesses. */
+		bool glc = force_glc ||
+			   (emit_data->inst->Memory.Qualifier & glc_mask);
+
+		args[emit_data->arg_count++] = LLVMConstInt(ctx->i1, glc, 0);
+	}
+	args[emit_data->arg_count++] = LLVMConstInt(ctx->i1, 0, 0); /* slc */
+}
+
+static void load_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{	/* Collect the intrinsic operands of a LOAD from a buffer or an image. */
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = &ctx->gallivm;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef rsrc;
+
+	emit_data->dst_type = ctx->v4f32;
+
+	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
+		LLVMBuilderRef builder = gallivm->builder;
+		LLVMValueRef offset;
+		LLVMValueRef tmp;
+
+		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
+
+		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0); /* operand 1, channel 0 */
+		offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
+
+		buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
+				   offset, false, false); /* vindex 0, operand as offset */
+	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE ||
+		   tgsi_is_bindless_image_file(inst->Src[0].Register.File)) {
+		LLVMValueRef coords;
+
+		image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
+		coords = image_fetch_coords(bld_base, inst, 1, rsrc);
+
+		if (target == TGSI_TEXTURE_BUFFER) { /* coords as vindex, offset 0 */
+			buffer_append_args(ctx, emit_data, rsrc, coords,
+					   ctx->i32_0, false, false);
+		} else {
+			emit_data->args[0] = coords;
+			emit_data->args[1] = rsrc;
+			emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
+			emit_data->arg_count = 3;
+
+			image_append_args(ctx, emit_data, target, false, false);
+		}
+	}
+}
+
+static unsigned get_load_intr_attribs(bool can_speculate)
+{
+	/* READONLY loads may observe stores; READNONE (LLVM >= 4.0 only)
+	 * promises that no store can change the loaded value. */
+	if (HAVE_LLVM >= 0x0400 && can_speculate)
+		return LP_FUNC_ATTR_READNONE;
+	return LP_FUNC_ATTR_READONLY;
+}
+
+static unsigned get_store_intr_attribs(bool writeonly_memory)
+{
+	if (HAVE_LLVM >= 0x0400 && writeonly_memory)
+		return LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY;
+	return LP_FUNC_ATTR_WRITEONLY; /* also the only choice before LLVM 4.0 */
+}
+
+static void load_emit_buffer(struct si_shader_context *ctx,
+			     struct lp_build_emit_data *emit_data,
+			     bool can_speculate)
+{	/* Emit a shader-buffer load from the args built by buffer_append_args. */
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	uint writemask = inst->Dst[0].Register.WriteMask;
+	uint count = util_last_bit(writemask); /* channels up to the last written one */
+	LLVMValueRef *args = emit_data->args; /* rsrc, vindex, voffset, glc, slc */
+
+	/* Don't use SMEM for shader buffer loads, because LLVM doesn't
+	 * select SMEM for SI.load.const with a non-constant offset, and
+	 * constant offsets practically don't exist with shader buffers.
+	 *
+	 * Also, SI.load.const doesn't use inst_offset when it's lowered
+	 * to VMEM, so we just end up with more VALU instructions in the end
+	 * and no benefit.
+	 *
+	 * TODO: Remove this line once LLVM can select SMEM with a non-constant
+	 *       offset, and can derive inst_offset when VMEM is selected.
+	 *       After that, si_memory_barrier should invalidate sL1 for shader
+	 *       buffers.
+	 */
+
+	assert(LLVMConstIntGetZExtValue(args[1]) == 0); /* vindex */
+	emit_data->output[emit_data->chan] =
+		ac_build_buffer_load(&ctx->ac, args[0], count, NULL,
+				     args[2], NULL, 0,
+				     LLVMConstIntGetZExtValue(args[3]),
+				     LLVMConstIntGetZExtValue(args[4]),
+				     can_speculate, false);
+}
+
+static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
+                                   const struct tgsi_full_instruction *inst,
+                                   LLVMTypeRef type, int arg)
+{
+	LLVMBuilderRef b = ctx->gallivm.builder;
+	LLVMValueRef ofs, addr;
+	LLVMTypeRef dst_type;
+
+	/* Channel 0 of operand \p arg is the offset into ctx->shared_memory. */
+	ofs = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
+	ofs = LLVMBuildBitCast(b, ofs, ctx->i32, "");
+	addr = LLVMBuildGEP(b, ctx->shared_memory, &ofs, 1, "");
+
+	/* Retype the address as \p type*, staying in the same address space. */
+	dst_type = LLVMPointerType(type,
+				   LLVMGetPointerAddressSpace(LLVMTypeOf(addr)));
+
+	return LLVMBuildBitCast(b, addr, dst_type, "");
+}
+
+static void load_emit_memory(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data *emit_data)
+{
+	const unsigned mask = emit_data->inst->Dst[0].Register.WriteMask;
+	LLVMBuilderRef b = ctx->gallivm.builder;
+	LLVMValueRef base = get_memory_ptr(ctx, emit_data->inst, ctx->f32, 1);
+	LLVMValueRef comps[4];
+	unsigned i;
+
+	/* Load only the written channels; the others stay undefined. */
+	for (i = 0; i < 4; ++i) {
+		if (mask & (1 << i)) {
+			LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
+			LLVMValueRef elem = LLVMBuildGEP(b, base, &idx, 1, "");
+
+			comps[i] = LLVMBuildLoad(b, elem, "");
+		} else {
+			comps[i] = LLVMGetUndef(ctx->f32);
+		}
+	}
+
+	emit_data->output[emit_data->chan] =
+		lp_build_gather_values(&ctx->gallivm, comps, 4);
+}
+
+/**
+ * Return true if the memory accessed by a LOAD or STORE instruction is
+ * read-only or write-only, respectively, as far as this shader is concerned.
+ * \param info  Shader info; its images_buffers mask splits the image mask
+ *	into buffer-target slots and all other image slots.
+ * \param shader_buffers_reverse_access_mask  For LOAD, the (store | atomic)
+ *	slot usage in the shader; for STORE, the (load | atomic) slot usage.
+ * \param images_reverse_access_mask  Same as above, but for images.
+ */
+static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
+				  const struct tgsi_shader_info *info,
+				  unsigned shader_buffers_reverse_access_mask,
+				  unsigned images_reverse_access_mask)
+{
+	/* RESTRICT means NOALIAS.
+	 * If there are no writes, we can assume the accessed memory is read-only.
+	 * If there are no reads, we can assume the accessed memory is write-only.
+	 */
+	if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
+		unsigned reverse_access_mask; /* opposite-direction users of this resource kind */
+
+		if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
+			reverse_access_mask = shader_buffers_reverse_access_mask;
+		} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
+			reverse_access_mask = info->images_buffers &
+					      images_reverse_access_mask;
+		} else {
+			reverse_access_mask = ~info->images_buffers &
+					      images_reverse_access_mask;
+		}
+
+		if (inst->Src[0].Register.Indirect) { /* any slot of that kind may be hit */
+			if (!reverse_access_mask)
+				return true;
+		} else { /* only the addressed slot matters */
+			if (!(reverse_access_mask &
+			      (1u << inst->Src[0].Register.Index)))
+				return true;
+		}
+	}
+
+	/* If there are no buffer writes (for both shader buffers & image
+	 * buffers), it implies that buffer memory is read-only.
+	 * If there are no buffer reads (for both shader buffers & image
+	 * buffers), it implies that buffer memory is write-only.
+	 *
+	 * Same for the case when there are no writes/reads for non-buffer
+	 * images.
+	 */
+	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
+	    (inst->Memory.Texture == TGSI_TEXTURE_BUFFER &&
+	     (inst->Src[0].Register.File == TGSI_FILE_IMAGE ||
+	      tgsi_is_bindless_image_file(inst->Src[0].Register.File)))) {
+		if (!shader_buffers_reverse_access_mask &&
+		    !(info->images_buffers & images_reverse_access_mask))
+			return true;
+	} else {
+		if (!(~info->images_buffers & images_reverse_access_mask))
+			return true;
+	}
+	return false;
+}
+
+static void load_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{	/* Emit a LOAD, dispatching on the file/target of source operand 0. */
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = &ctx->gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
+	char intrinsic_name[64];
+	bool can_speculate = false; /* set below: non-volatile and never written */
+
+	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
+		load_emit_memory(ctx, emit_data);
+		return;
+	}
+
+	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
+		si_emit_waitcnt(ctx, VM_CNT);
+
+	can_speculate = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
+			  is_oneway_access_only(inst, info,
+						info->shader_buffers_store |
+						info->shader_buffers_atomic,
+						info->images_store |
+						info->images_atomic);
+
+	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
+		load_emit_buffer(ctx, emit_data, can_speculate);
+		return;
+	}
+
+	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { /* buffer-target image */
+		emit_data->output[emit_data->chan] =
+			lp_build_intrinsic(
+				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
+				emit_data->args, emit_data->arg_count,
+				get_load_intr_attribs(can_speculate));
+	} else { /* other images: the intrinsic name is built from the operand types */
+		ac_get_image_intr_name("llvm.amdgcn.image.load",
+				       emit_data->dst_type,		/* vdata */
+				       LLVMTypeOf(emit_data->args[0]), /* coords */
+				       LLVMTypeOf(emit_data->args[1]), /* rsrc */
+				       intrinsic_name, sizeof(intrinsic_name));
+
+		emit_data->output[emit_data->chan] =
+			lp_build_intrinsic(
+				builder, intrinsic_name, emit_data->dst_type,
+				emit_data->args, emit_data->arg_count,
+				get_load_intr_attribs(can_speculate));
+	}
+}
+
+static void store_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{	/* Collect the intrinsic operands of a STORE to a buffer or an image. */
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = &ctx->gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	struct tgsi_full_src_register memory;
+	LLVMValueRef chans[4];
+	LLVMValueRef data;
+	LLVMValueRef rsrc;
+	unsigned chan;
+
+	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
+
+	for (chan = 0; chan < 4; ++chan) { /* all four channels of operand 1 */
+		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
+	}
+	data = lp_build_gather_values(gallivm, chans, 4);
+
+	emit_data->args[emit_data->arg_count++] = data; /* the value to store goes first */
+
+	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]); /* the destination names the resource */
+
+	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
+		LLVMValueRef offset;
+		LLVMValueRef tmp;
+
+		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
+
+		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0); /* operand 0, channel 0 */
+		offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
+
+		buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
+				   offset, false, false);
+	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE ||
+		   tgsi_is_bindless_image_file(inst->Dst[0].Register.File)) {
+		unsigned target = inst->Memory.Texture;
+		LLVMValueRef coords;
+
+		/* 8bit/16bit TC L1 write corruption bug on SI.
+		 * All store opcodes not aligned to a dword are affected.
+		 *
+		 * The only way to get unaligned stores in radeonsi is through
+		 * shader images.
+		 */
+		bool force_glc = ctx->screen->b.chip_class == SI;
+
+		image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
+		coords = image_fetch_coords(bld_base, inst, 0, rsrc);
+
+		if (target == TGSI_TEXTURE_BUFFER) { /* coords as vindex, offset 0 */
+			buffer_append_args(ctx, emit_data, rsrc, coords,
+					   ctx->i32_0, false, force_glc);
+		} else {
+			emit_data->args[1] = coords;
+			emit_data->args[2] = rsrc;
+			emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
+			emit_data->arg_count = 4;
+
+			image_append_args(ctx, emit_data, target, false, force_glc);
+		}
+	}
+}
+
+/* Emit a buffer TGSI STORE, one buffer.store intrinsic per run of
+ * consecutive channels enabled in the destination writemask. */
+static void store_emit_buffer(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data *emit_data,
+		bool writeonly_memory)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	const LLVMValueRef vec4 = emit_data->args[0];
+	const LLVMValueRef first_offset = emit_data->args[3];
+	unsigned pending = emit_data->inst->Dst[0].Register.WriteMask;
+
+	while (pending) {
+		const char *name;
+		LLVMValueRef payload, elem, offset;
+		int first, len;
+
+		u_bit_scan_consecutive_range(&pending, &first, &len);
+
+		/* LLVM cannot write 3 elements at once: store 2 of them
+		 * now and put the third channel back into the mask. */
+		if (len == 3) {
+			pending |= 1 << (first + 2);
+			len = 2;
+		}
+
+		switch (len) {
+		case 4:
+			payload = vec4;
+			name = "llvm.amdgcn.buffer.store.v4f32";
+			break;
+		case 2:
+			/* Repack channels first and first+1 as a 2-vector. */
+			elem = LLVMBuildExtractElement(
+				builder, vec4,
+				LLVMConstInt(ctx->i32, first, 0), "");
+			payload = LLVMBuildInsertElement(
+				builder, LLVMGetUndef(LLVMVectorType(ctx->f32, 2)),
+				elem, ctx->i32_0, "");
+			elem = LLVMBuildExtractElement(
+				builder, vec4,
+				LLVMConstInt(ctx->i32, first + 1, 0), "");
+			payload = LLVMBuildInsertElement(
+				builder, payload, elem, ctx->i32_1, "");
+			name = "llvm.amdgcn.buffer.store.v2f32";
+			break;
+		default:
+			assert(len == 1);
+			payload = LLVMBuildExtractElement(
+				builder, vec4,
+				LLVMConstInt(ctx->i32, first, 0), "");
+			name = "llvm.amdgcn.buffer.store.f32";
+			break;
+		}
+
+		/* Channel `first` lives 4 * first past the base offset. */
+		offset = first_offset;
+		if (first != 0) {
+			offset = LLVMBuildAdd(
+				builder, offset,
+				LLVMConstInt(ctx->i32, first * 4, 0), "");
+		}
+
+		emit_data->args[0] = payload;
+		emit_data->args[3] = offset;
+
+		lp_build_intrinsic(
+			builder, name, emit_data->dst_type,
+			emit_data->args, emit_data->arg_count,
+			get_store_intr_attribs(writeonly_memory));
+	}
+}
+
+/* Emit a TGSI STORE to TGSI_FILE_MEMORY as one scalar store per
+ * channel enabled in the destination writemask. */
+static void store_emit_memory(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data *emit_data)
+{
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	const unsigned enabled = inst->Dst[0].Register.WriteMask;
+	LLVMValueRef base = get_memory_ptr(ctx, inst, ctx->f32, 0);
+
+	for (int c = 0; c < 4; c++) {
+		LLVMValueRef value, slot, elem_ptr;
+
+		if (enabled & (1 << c)) {
+			/* Channel c of src1 lands c elements past the base. */
+			value = lp_build_emit_fetch(&ctx->bld_base, inst, 1, c);
+			slot = LLVMConstInt(ctx->i32, c, 0);
+			elem_ptr = LLVMBuildGEP(builder, base, &slot, 1, "");
+			LLVMBuildStore(builder, value, elem_ptr);
+		}
+	}
+}
+
+/* Emit a TGSI STORE, dispatching on the destination file and image target. */
+static void store_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
+	const unsigned dst_file = inst->Dst[0].Register.File;
+	const char *name;
+	char image_name[64];
+	bool writeonly_memory;
+
+	if (dst_file == TGSI_FILE_MEMORY) {
+		store_emit_memory(ctx, emit_data);
+		return;
+	}
+
+	/* Volatile accesses first emit a wait on VM_CNT. */
+	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
+		si_emit_waitcnt(ctx, VM_CNT);
+
+	/* The masks passed are buffers/images that are loaded or used atomically. */
+	writeonly_memory = is_oneway_access_only(inst, info,
+						 info->shader_buffers_load |
+						 info->shader_buffers_atomic,
+						 info->images_load |
+						 info->images_atomic);
+
+	if (dst_file == TGSI_FILE_BUFFER) {
+		store_emit_buffer(ctx, emit_data, writeonly_memory);
+		return;
+	}
+
+	/* Images: buffer targets use buffer.store.format, others image.store. */
+	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
+		name = "llvm.amdgcn.buffer.store.format.v4f32";
+	} else {
+		ac_get_image_intr_name("llvm.amdgcn.image.store",
+				       LLVMTypeOf(emit_data->args[0]), /* vdata */
+				       LLVMTypeOf(emit_data->args[1]), /* coords */
+				       LLVMTypeOf(emit_data->args[2]), /* rsrc */
+				       image_name, sizeof(image_name));
+		name = image_name;
+	}
+
+	emit_data->output[emit_data->chan] =
+		lp_build_intrinsic(ctx->gallivm.builder, name,
+				   emit_data->dst_type, emit_data->args,
+				   emit_data->arg_count,
+				   get_store_intr_attribs(writeonly_memory));
+}
+
+/* Gather the intrinsic arguments of a TGSI atomic into emit_data->args. */
+static void atomic_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const bool is_cas = inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS;
+	const unsigned src_file = inst->Src[0].Register.File;
+	LLVMValueRef src2, rsrc;
+
+	emit_data->dst_type = ctx->f32;
+
+	src2 = lp_build_emit_fetch(bld_base, inst, 2, 0);
+	src2 = LLVMBuildBitCast(builder, src2, ctx->i32, "");
+
+	/* llvm.amdgcn.image/buffer.atomic.cmpswap take their operands in
+	 * hardware order, the reverse of TGSI (and GLSL): src3 goes first. */
+	if (is_cas) {
+		LLVMValueRef src3 = lp_build_emit_fetch(bld_base, inst, 3, 0);
+
+		src3 = LLVMBuildBitCast(builder, src3, ctx->i32, "");
+		emit_data->args[emit_data->arg_count++] = src3;
+	}
+	emit_data->args[emit_data->arg_count++] = src2;
+
+	if (src_file == TGSI_FILE_BUFFER) {
+		LLVMValueRef offset;
+
+		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
+
+		offset = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
+
+		buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
+				   offset, true, false);
+		return;
+	}
+
+	if (src_file == TGSI_FILE_IMAGE ||
+	    tgsi_is_bindless_image_file(src_file)) {
+		const unsigned target = inst->Memory.Texture;
+		LLVMValueRef coords;
+
+		image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
+		coords = image_fetch_coords(bld_base, inst, 1, rsrc);
+
+		if (target != TGSI_TEXTURE_BUFFER) {
+			emit_data->args[emit_data->arg_count++] = coords;
+			emit_data->args[emit_data->arg_count++] = rsrc;
+			image_append_args(ctx, emit_data, target, true, false);
+		} else {
+			buffer_append_args(ctx, emit_data, rsrc, coords,
+					   ctx->i32_0, true, false);
+		}
+	}
+}
+
+/* Emit a TGSI atomic on TGSI_FILE_MEMORY using the native LLVM
+ * cmpxchg / atomicrmw instructions rather than an intrinsic. */
+static void atomic_emit_memory(struct si_shader_context *ctx,
+                               struct lp_build_emit_data *emit_data) {
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const unsigned opcode = inst->Instruction.Opcode;
+	LLVMValueRef addr, operand, src3, old_value;
+
+	addr = get_memory_ptr(ctx, inst, ctx->i32, 1);
+
+	operand = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
+	operand = LLVMBuildBitCast(builder, operand, ctx->i32, "");
+
+	if (opcode == TGSI_OPCODE_ATOMCAS) {
+		src3 = lp_build_emit_fetch(&ctx->bld_base, inst, 3, 0);
+		src3 = LLVMBuildBitCast(builder, src3, ctx->i32, "");
+		old_value = LLVMBuildAtomicCmpXchg(builder, addr, operand, src3,
+				LLVMAtomicOrderingSequentiallyConsistent,
+				LLVMAtomicOrderingSequentiallyConsistent,
+				false);
+		/* Member 0 of the cmpxchg result is what gets returned. */
+		old_value = LLVMBuildExtractValue(builder, old_value, 0, "");
+	} else {
+		LLVMAtomicRMWBinOp rmw_op;
+
+		switch (opcode) {
+		case TGSI_OPCODE_ATOMUADD:
+			rmw_op = LLVMAtomicRMWBinOpAdd;
+			break;
+		case TGSI_OPCODE_ATOMXCHG:
+			rmw_op = LLVMAtomicRMWBinOpXchg;
+			break;
+		case TGSI_OPCODE_ATOMAND:
+			rmw_op = LLVMAtomicRMWBinOpAnd;
+			break;
+		case TGSI_OPCODE_ATOMOR:
+			rmw_op = LLVMAtomicRMWBinOpOr;
+			break;
+		case TGSI_OPCODE_ATOMXOR:
+			rmw_op = LLVMAtomicRMWBinOpXor;
+			break;
+		case TGSI_OPCODE_ATOMUMIN:
+			rmw_op = LLVMAtomicRMWBinOpUMin;
+			break;
+		case TGSI_OPCODE_ATOMUMAX:
+			rmw_op = LLVMAtomicRMWBinOpUMax;
+			break;
+		case TGSI_OPCODE_ATOMIMIN:
+			rmw_op = LLVMAtomicRMWBinOpMin;
+			break;
+		case TGSI_OPCODE_ATOMIMAX:
+			rmw_op = LLVMAtomicRMWBinOpMax;
+			break;
+		default:
+			unreachable("unknown atomic opcode");
+		}
+
+		old_value = LLVMBuildAtomicRMW(builder, rmw_op, addr, operand,
+				LLVMAtomicOrderingSequentiallyConsistent,
+				false);
+	}
+
+	emit_data->output[emit_data->chan] =
+		LLVMBuildBitCast(builder, old_value, emit_data->dst_type, "");
+}
+
+/* Emit a TGSI atomic; buffers and images use llvm.amdgcn.*.atomic.* calls. */
+static void atomic_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const unsigned src_file = inst->Src[0].Register.File;
+	char name[40];
+	LLVMValueRef result;
+
+	if (src_file == TGSI_FILE_MEMORY) {
+		atomic_emit_memory(ctx, emit_data);
+		return;
+	}
+
+	if (src_file != TGSI_FILE_BUFFER &&
+	    inst->Memory.Texture != TGSI_TEXTURE_BUFFER) {
+		/* The image intrinsic name carries the coordinate type;
+		 * ATOMCAS keeps its coordinates one slot later. */
+		const bool is_cas =
+			inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS;
+		LLVMValueRef coords = emit_data->args[is_cas ? 2 : 1];
+		char coords_type[8];
+
+		ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type,
+					    sizeof(coords_type));
+		snprintf(name, sizeof(name), "llvm.amdgcn.image.atomic.%s.%s",
+			 action->intr_name, coords_type);
+	} else {
+		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.atomic.%s",
+			 action->intr_name);
+	}
+
+	/* The intrinsic returns an i32; hand it back as f32. */
+	result = lp_build_intrinsic(builder, name, ctx->i32,
+				    emit_data->args, emit_data->arg_count, 0);
+	emit_data->output[emit_data->chan] =
+		LLVMBuildBitCast(builder, result, ctx->f32, "");
+}
+
+/* Fill an ac_image_args from the address components, descriptors and
+ * dmask, and stash it byte-wise in emit_data->args. */
+static void set_tex_fetch_args(struct si_shader_context *ctx,
+			       struct lp_build_emit_data *emit_data,
+			       unsigned target,
+			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
+			       LLVMValueRef *param, unsigned count,
+			       unsigned dmask)
+{
+	struct ac_image_args args = {};
+
+	/* Pad to a power-of-two vector with undef (this writes into the
+	 * caller's param array). */
+	for (; count < util_next_power_of_two(count); count++)
+		param[count] = LLVMGetUndef(ctx->i32);
+
+	args.addr = count > 1 ?
+		lp_build_gather_values(&ctx->gallivm, param, count) : param[0];
+
+	args.resource = res_ptr;
+	args.sampler = samp_ptr;
+	args.dmask = dmask;
+	args.unorm = target == TGSI_TEXTURE_RECT ||
+		     target == TGSI_TEXTURE_SHADOWRECT;
+	args.da = tgsi_is_array_sampler(target);
+
+	/* Ugly, but we seem to have no other choice right now. */
+	STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
+	memcpy(emit_data->args, &args, sizeof(args));
+}
+
+/* Post-process the vector returned by a resinfo query for array targets. */
+static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
+				unsigned target, LLVMValueRef out)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	LLVMValueRef idx2 = LLVMConstInt(ctx->i32, 2, 0);
+	const bool is_1d_array = target == TGSI_TEXTURE_1D_ARRAY ||
+				 target == TGSI_TEXTURE_SHADOW1D_ARRAY;
+	const bool is_cube_array = target == TGSI_TEXTURE_CUBE_ARRAY ||
+				   target == TGSI_TEXTURE_SHADOWCUBE_ARRAY;
+
+	/* 1D textures are allocated and used as 2D on GFX9, so the
+	 * layer count is copied from element 2 to element 1. */
+	if (is_1d_array && ctx->screen->b.chip_class >= GFX9) {
+		LLVMValueRef layers =
+			LLVMBuildExtractElement(builder, out, idx2, "");
+		out = LLVMBuildInsertElement(builder, out, layers,
+					     ctx->i32_1, "");
+	}
+
+	/* Divide the number of layers by 6 to get the number of cubes. */
+	if (is_cube_array) {
+		LLVMValueRef z = LLVMBuildExtractElement(builder, out, idx2, "");
+		z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
+		out = LLVMBuildInsertElement(builder, out, z, idx2, "");
+	}
+	return out;
+}
+
+/* Prepare the arguments that resq_emit consumes. */
+static void resq_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const struct tgsi_full_src_register *src = &inst->Src[0];
+	const unsigned target = inst->Memory.Texture;
+	LLVMValueRef image_desc;
+
+	emit_data->dst_type = ctx->v4i32;
+
+	if (src->Register.File == TGSI_FILE_BUFFER) {
+		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, src);
+		emit_data->arg_count = 1;
+		return;
+	}
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		image_fetch_rsrc(bld_base, src, false, target,
+				 &emit_data->args[0]);
+		emit_data->arg_count = 1;
+		return;
+	}
+
+	/* Other images are queried with a constant 0 as the only address
+	 * component; 3D is handed to the helper as a 2D array target. */
+	image_fetch_rsrc(bld_base, src, false, target, &image_desc);
+	set_tex_fetch_args(ctx, emit_data,
+			   target == TGSI_TEXTURE_3D ? TGSI_TEXTURE_2D_ARRAY : target,
+			   image_desc, NULL, &ctx->i32_0, 1, 0xf);
+}
+
+/* Emit the resource query whose arguments resq_fetch_args prepared. */
+static void resq_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const unsigned target = inst->Memory.Texture;
+	LLVMValueRef size;
+
+	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
+		/* Shader buffers: element 2 of the descriptor. */
+		size = LLVMBuildExtractElement(ctx->gallivm.builder,
+					       emit_data->args[0],
+					       LLVMConstInt(ctx->i32, 2, 0), "");
+	} else if (target == TGSI_TEXTURE_BUFFER) {
+		size = get_buffer_size(bld_base, emit_data->args[0]);
+	} else {
+		struct ac_image_args args;
+
+		memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
+		args.opcode = ac_image_get_resinfo;
+		size = fix_resinfo(ctx, target,
+				   ac_build_image_opcode(&ctx->ac, &args));
+	}
+	emit_data->output[emit_data->chan] = size;
+}
+
+/** Load an image view, fmask view, or sampler state descriptor. */
+static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
+				      LLVMValueRef list, LLVMValueRef index,
+				      enum desc_type type)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	LLVMValueRef scale = NULL, bias = NULL;
+	bool view_as_v4i32 = false;
+
+	switch (type) {
+	case DESC_IMAGE:	/* The image is at [0:7]. */
+		scale = LLVMConstInt(ctx->i32, 2, 0);
+		break;
+	case DESC_BUFFER:	/* The buffer is in [4:7]. */
+		scale = LLVMConstInt(ctx->i32, 4, 0);
+		bias = ctx->i32_1;
+		view_as_v4i32 = true;
+		break;
+	case DESC_FMASK:	/* The FMASK is at [8:15]. */
+		scale = LLVMConstInt(ctx->i32, 2, 0);
+		bias = ctx->i32_1;
+		break;
+	case DESC_SAMPLER:	/* The sampler state is at [12:15]. */
+		scale = LLVMConstInt(ctx->i32, 4, 0);
+		bias = LLVMConstInt(ctx->i32, 3, 0);
+		view_as_v4i32 = true;
+		break;
+	}
+
+	if (scale)
+		index = LLVMBuildMul(builder, index, scale, "");
+	if (bias)
+		index = LLVMBuildAdd(builder, index, bias, "");
+	if (view_as_v4i32)
+		list = LLVMBuildPointerCast(builder, list,
+					    si_const_array(ctx->v4i32, 0), "");
+	return ac_build_indexed_load_const(&ctx->ac, list, index);
+}
+
+/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
+ *
+ * SI-CI:
+ *   The shader must disable anisotropic filtering manually in that
+ *   case. The driver stores in img7 a mask that clears MAX_ANISO_RATIO
+ *   when BASE_LEVEL == LAST_LEVEL, and the shader applies it:
+ *     s_and_b32 samp0, samp0, img7
+ *
+ * VI:
+ *   The ANISO_OVERRIDE sampler field enables this fix in TA.
+ */
+static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
+					   LLVMValueRef res, LLVMValueRef samp)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	LLVMValueRef aniso_mask, word0;
+
+	/* VI and newer chips return the sampler untouched. */
+	if (ctx->screen->b.chip_class >= VI)
+		return samp;
+
+	aniso_mask = LLVMBuildExtractElement(builder, res,
+					     LLVMConstInt(ctx->i32, 7, 0), "");
+	word0 = LLVMBuildExtractElement(builder, samp, ctx->i32_0, "");
+	word0 = LLVMBuildAnd(builder, word0, aniso_mask, "");
+
+	return LLVMBuildInsertElement(builder, samp, word0, ctx->i32_0, "");
+}
+
+/* Load the resource, sampler and FMASK descriptors of a texture
+ * instruction. samp_ptr and fmask_ptr may be NULL; when given, each is
+ * set to NULL unless the target uses that descriptor. */
+static void tex_fetch_ptrs(
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data,
+	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	/* The sampler operand is taken from the last source register. */
+	const struct tgsi_full_src_register *sampler =
+		&inst->Src[inst->Instruction.NumSrcRegs - 1];
+	const unsigned target = inst->Texture.Texture;
+	const bool is_msaa = target == TGSI_TEXTURE_2D_MSAA ||
+			     target == TGSI_TEXTURE_2D_ARRAY_MSAA;
+	LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
+	LLVMValueRef slot;
+
+	/* Pick the descriptor slot for a direct or an indirect index. */
+	if (!sampler->Register.Indirect) {
+		slot = LLVMConstInt(ctx->i32,
+				    si_get_sampler_slot(sampler->Register.Index), 0);
+	} else {
+		slot = si_get_bounded_indirect_index(ctx, &sampler->Indirect,
+						     sampler->Register.Index,
+						     ctx->num_samplers);
+		slot = LLVMBuildAdd(builder, slot,
+				    LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), "");
+	}
+
+	/* Anything but TGSI_FILE_SAMPLER: the 64-bit source value is used
+	 * as the address of the descriptor, replacing list and slot. */
+	if (sampler->Register.File != TGSI_FILE_SAMPLER) {
+		LLVMValueRef handle =
+			lp_build_emit_fetch_src(bld_base, sampler,
+						TGSI_TYPE_UNSIGNED64, 0);
+
+		list = LLVMBuildIntToPtr(builder, handle,
+					 si_const_array(ctx->v8i32, 0), "");
+		slot = LLVMConstInt(ctx->i32, 0, 0);
+	}
+
+	*res_ptr = load_sampler_desc(ctx, list, slot,
+				     target == TGSI_TEXTURE_BUFFER ?
+				     DESC_BUFFER : DESC_IMAGE);
+
+	if (samp_ptr)
+		*samp_ptr = NULL;
+	if (fmask_ptr)
+		*fmask_ptr = NULL;
+
+	if (is_msaa) {
+		/* MSAA targets get an FMASK but no sampler state. */
+		if (fmask_ptr)
+			*fmask_ptr = load_sampler_desc(ctx, list, slot,
+						       DESC_FMASK);
+	} else if (target != TGSI_TEXTURE_BUFFER && samp_ptr) {
+		*samp_ptr = load_sampler_desc(ctx, list, slot, DESC_SAMPLER);
+		*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
+	}
+}
+
+/* Prepare the arguments that txq_emit consumes. */
+static void txq_fetch_args(
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	const unsigned target = emit_data->inst->Texture.Texture;
+	LLVMValueRef desc, mip_level;
+
+	/* Only the resource descriptor is needed for a size query. */
+	tex_fetch_ptrs(bld_base, emit_data, &desc, NULL, NULL);
+
+	if (target != TGSI_TEXTURE_BUFFER) {
+		/* Textures - the mip level comes from src0.x. */
+		mip_level = lp_build_emit_fetch(bld_base, emit_data->inst,
+						0, TGSI_CHAN_X);
+		set_tex_fetch_args(ctx, emit_data, target, desc,
+				   NULL, &mip_level, 1, 0xf);
+		return;
+	}
+
+	/* Buffers - read the size from the descriptor directly. */
+	emit_data->args[0] = get_buffer_size(bld_base, desc);
+}
+
+/* Emit the size query whose arguments txq_fetch_args prepared. */
+static void txq_emit(const struct lp_build_tgsi_action *action,
+		     struct lp_build_tgsi_context *bld_base,
+		     struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	const unsigned target = emit_data->inst->Texture.Texture;
+	LLVMValueRef size;
+
+	if (target != TGSI_TEXTURE_BUFFER) {
+		struct ac_image_args args;
+
+		memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
+		args.opcode = ac_image_get_resinfo;
+		size = fix_resinfo(ctx, target,
+				   ac_build_image_opcode(&ctx->ac, &args));
+	} else {
+		size = emit_data->args[0]; /* already the buffer size */
+	}
+	emit_data->output[emit_data->chan] = size;
+}
+
+/* Build the texture address (offsets, bias, compare value, derivatives,
+ * coordinates, LOD/sample index) and hand it to set_tex_fetch_args. */
+static void tex_fetch_args(
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = &ctx->gallivm;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	unsigned opcode = inst->Instruction.Opcode;
+	unsigned target = inst->Texture.Texture;
+	LLVMValueRef coords[5], derivs[6];
+	LLVMValueRef address[16];
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
+	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
+	unsigned count = 0;
+	unsigned chan;
+	unsigned num_deriv_channels = 0;
+	bool has_offset = inst->Texture.NumOffsets > 0;
+	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
+	unsigned dmask = 0xf;
+
+	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		emit_data->dst_type = ctx->v4f32;
+		emit_data->args[0] = res_ptr;
+		emit_data->args[1] = ctx->i32_0;
+		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
+		emit_data->arg_count = 3;
+		return;
+	}
+
+	/* Fetch and project texture coordinates */
+	coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
+	for (chan = 0; chan < 3; chan++ ) {
+		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
+		if (opcode == TGSI_OPCODE_TXP)
+			coords[chan] = lp_build_emit_llvm_binary(bld_base,
+					TGSI_OPCODE_DIV, coords[chan], coords[3]);
+	}
+
+	if (opcode == TGSI_OPCODE_TXP)
+		coords[3] = bld_base->base.one;
+
+	/* Pack offsets. */
+	if (has_offset &&
+	    opcode != TGSI_OPCODE_TXF &&
+	    opcode != TGSI_OPCODE_TXF_LZ) {
+		/* The offsets are six-bit signed integers packed like this:
+		 *   X=[5:0], Y=[13:8], and Z=[21:16].
+		 */
+		LLVMValueRef offset[3], pack;
+
+		assert(inst->Texture.NumOffsets == 1);
+
+		for (chan = 0; chan < 3; chan++) {
+			offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
+								     emit_data->inst, 0, chan);
+			offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
+						    LLVMConstInt(ctx->i32, 0x3f, 0), "");
+			if (chan)
+				offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
+							    LLVMConstInt(ctx->i32, chan*8, 0), "");
+		}
+
+		pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
+		pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
+		address[count++] = pack;
+	}
+
+	/* Pack LOD bias value */
+	if (opcode == TGSI_OPCODE_TXB)
+		address[count++] = coords[3];
+	if (opcode == TGSI_OPCODE_TXB2)
+		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
+
+	/* Pack depth comparison value */
+	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
+		LLVMValueRef z;
+
+		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
+			z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
+		} else {
+			assert(ref_pos >= 0);
+			z = coords[ref_pos];
+		}
+
+		/* Section 8.23.1 (Depth Texture Comparison Mode) of the
+		 * OpenGL 4.5 spec says:
+		 *
+		 *    "If the texture’s internal format indicates a fixed-point
+		 *     depth texture, then D_t and D_ref are clamped to the
+		 *     range [0, 1]; otherwise no clamping is performed."
+		 *
+		 * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
+		 * so the depth comparison value isn't clamped for Z16 and
+		 * Z24 anymore. Do it manually here.
+		 */
+		if (ctx->screen->b.chip_class >= VI) {
+			LLVMValueRef upgraded;
+			LLVMValueRef clamped;
+			upgraded = LLVMBuildExtractElement(gallivm->builder, samp_ptr,
+							   LLVMConstInt(ctx->i32, 3, false), "");
+			upgraded = LLVMBuildLShr(gallivm->builder, upgraded,
+						 LLVMConstInt(ctx->i32, 29, false), "");
+			upgraded = LLVMBuildTrunc(gallivm->builder, upgraded, ctx->i1, "");
+			clamped = ac_build_clamp(&ctx->ac, z);
+			z = LLVMBuildSelect(gallivm->builder, upgraded, clamped, z, "");
+		}
+
+		address[count++] = z;
+	}
+
+	/* Pack user derivatives */
+	if (opcode == TGSI_OPCODE_TXD) {
+		int param, num_src_deriv_channels, num_dst_deriv_channels;
+
+		switch (target) {
+		case TGSI_TEXTURE_3D:
+			num_src_deriv_channels = 3;
+			num_dst_deriv_channels = 3;
+			num_deriv_channels = 3;
+			break;
+		case TGSI_TEXTURE_2D:
+		case TGSI_TEXTURE_SHADOW2D:
+		case TGSI_TEXTURE_RECT:
+		case TGSI_TEXTURE_SHADOWRECT:
+		case TGSI_TEXTURE_2D_ARRAY:
+		case TGSI_TEXTURE_SHADOW2D_ARRAY:
+			num_src_deriv_channels = 2;
+			num_dst_deriv_channels = 2;
+			num_deriv_channels = 2;
+			break;
+		case TGSI_TEXTURE_CUBE:
+		case TGSI_TEXTURE_SHADOWCUBE:
+		case TGSI_TEXTURE_CUBE_ARRAY:
+		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+			/* Cube derivatives will be converted to 2D. */
+			num_src_deriv_channels = 3;
+			num_dst_deriv_channels = 3;
+			num_deriv_channels = 2;
+			break;
+		case TGSI_TEXTURE_1D:
+		case TGSI_TEXTURE_SHADOW1D:
+		case TGSI_TEXTURE_1D_ARRAY:
+		case TGSI_TEXTURE_SHADOW1D_ARRAY:
+			num_src_deriv_channels = 1;
+
+			/* 1D textures are allocated and used as 2D on GFX9. */
+			if (ctx->screen->b.chip_class >= GFX9) {
+				num_dst_deriv_channels = 2;
+				num_deriv_channels = 2;
+			} else {
+				num_dst_deriv_channels = 1;
+				num_deriv_channels = 1;
+			}
+			break;
+		default:
+			unreachable("invalid target");
+		}
+
+		for (param = 0; param < 2; param++) {
+			for (chan = 0; chan < num_src_deriv_channels; chan++)
+				derivs[param * num_dst_deriv_channels + chan] =
+					lp_build_emit_fetch(bld_base, inst, param+1, chan);
+
+			/* Fill in the rest with zeros. */
+			for (chan = num_src_deriv_channels;
+			     chan < num_dst_deriv_channels; chan++)
+				derivs[param * num_dst_deriv_channels + chan] =
+					bld_base->base.zero;
+		}
+	}
+
+	if (target == TGSI_TEXTURE_CUBE ||
+	    target == TGSI_TEXTURE_CUBE_ARRAY ||
+	    target == TGSI_TEXTURE_SHADOWCUBE ||
+	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
+		ac_prepare_cube_coords(&ctx->ac,
+				       opcode == TGSI_OPCODE_TXD,
+				       target == TGSI_TEXTURE_CUBE_ARRAY ||
+				       target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
+				       opcode == TGSI_OPCODE_LODQ,
+				       coords, derivs);
+	} else if (tgsi_is_array_sampler(target) &&
+		   opcode != TGSI_OPCODE_TXF &&
+		   opcode != TGSI_OPCODE_TXF_LZ &&
+		   ctx->screen->b.chip_class <= VI) {
+		/* Round the layer: it is in .y for both 1D array targets
+		 * (SHADOW1D_ARRAY keeps its reference in .z), else in .z. */
+		unsigned array_coord = (target == TGSI_TEXTURE_1D_ARRAY ||
+					target == TGSI_TEXTURE_SHADOW1D_ARRAY) ? 1 : 2;
+		coords[array_coord] = ac_build_intrinsic(&ctx->ac, "llvm.rint.f32",
+				ctx->f32, &coords[array_coord], 1, 0);
+	}
+
+	if (opcode == TGSI_OPCODE_TXD)
+		for (unsigned i = 0; i < num_deriv_channels * 2; i++)
+			address[count++] = derivs[i];
+
+	/* Pack texture coordinates */
+	address[count++] = coords[0];
+	if (num_coords > 1)
+		address[count++] = coords[1];
+	if (num_coords > 2)
+		address[count++] = coords[2];
+
+	/* 1D textures are allocated and used as 2D on GFX9. */
+	if (ctx->screen->b.chip_class >= GFX9) {
+		LLVMValueRef filler;
+
+		/* Use 0.5, so that we don't sample the border color. */
+		if (opcode == TGSI_OPCODE_TXF ||
+		    opcode == TGSI_OPCODE_TXF_LZ)
+			filler = ctx->i32_0;
+		else
+			filler = LLVMConstReal(ctx->f32, 0.5);
+
+		if (target == TGSI_TEXTURE_1D ||
+		    target == TGSI_TEXTURE_SHADOW1D) {
+			address[count++] = filler;
+		} else if (target == TGSI_TEXTURE_1D_ARRAY ||
+			   target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
+			address[count] = address[count - 1];
+			address[count - 1] = filler;
+			count++;
+		}
+	}
+
+	/* Pack LOD or sample index */
+	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
+		address[count++] = coords[3];
+	else if (opcode == TGSI_OPCODE_TXL2)
+		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
+
+	if (count > 16) {
+		assert(!"Cannot handle more than 16 texture address parameters");
+		count = 16;
+	}
+
+	for (chan = 0; chan < count; chan++ ) {
+		address[chan] = LLVMBuildBitCast(gallivm->builder,
+						 address[chan], ctx->i32, "");
+	}
+
+	/* Adjust the sample index according to FMASK.
+	 *
+	 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
+	 * which is the identity mapping. Each nibble says which physical sample
+	 * should be fetched to get that sample.
+	 *
+	 * For example, 0x11111100 means there are only 2 samples stored and
+	 * the second sample covers 3/4 of the pixel. When reading samples 0
+	 * and 1, return physical sample 0 (determined by the first two 0s
+	 * in FMASK), otherwise return physical sample 1.
+	 *
+	 * The sample index should be adjusted as follows:
+	 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
+	 */
+	if (target == TGSI_TEXTURE_2D_MSAA ||
+	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+		struct lp_build_emit_data txf_emit_data = *emit_data;
+		LLVMValueRef txf_address[4];
+		/* We only need .xy for non-arrays, and .xyz for arrays. */
+		unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
+		struct tgsi_full_instruction txf_inst = {};
+
+		memcpy(txf_address, address, sizeof(txf_address));
+
+		/* Read FMASK using TXF_LZ. */
+		txf_inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
+		txf_inst.Texture.Texture = target;
+		txf_emit_data.inst = &txf_inst;
+		txf_emit_data.chan = 0;
+		set_tex_fetch_args(ctx, &txf_emit_data,
+				   target, fmask_ptr, NULL,
+				   txf_address, txf_count, 0xf);
+		build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
+
+		/* Initialize some constants. */
+		LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
+		LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
+
+		/* Apply the formula. */
+		LLVMValueRef fmask =
+			LLVMBuildExtractElement(gallivm->builder,
+						txf_emit_data.output[0],
+						ctx->i32_0, "");
+
+		unsigned sample_chan = txf_count; /* the sample index is last */
+
+		LLVMValueRef sample_index4 =
+			LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
+
+		LLVMValueRef shifted_fmask =
+			LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
+
+		LLVMValueRef final_sample =
+			LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
+
+		/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
+		 * resource descriptor is 0 (invalid).
+		 */
+		LLVMValueRef fmask_desc =
+			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
+					 ctx->v8i32, "");
+
+		LLVMValueRef fmask_word1 =
+			LLVMBuildExtractElement(gallivm->builder, fmask_desc,
+						ctx->i32_1, "");
+
+		LLVMValueRef word1_is_nonzero =
+			LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+				      fmask_word1, ctx->i32_0, "");
+
+		/* Replace the MSAA sample index. */
+		address[sample_chan] =
+			LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
+					final_sample, address[sample_chan], "");
+	}
+
+	if (opcode == TGSI_OPCODE_TXF ||
+	    opcode == TGSI_OPCODE_TXF_LZ) {
+		/* add tex offsets */
+		if (inst->Texture.NumOffsets) {
+			struct lp_build_context *uint_bld = &bld_base->uint_bld;
+			const struct tgsi_texture_offset *off = inst->TexOffsets;
+
+			assert(inst->Texture.NumOffsets == 1);
+
+			switch (target) {
+			case TGSI_TEXTURE_3D:
+				address[2] = lp_build_add(uint_bld, address[2],
+						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
+				/* fall through */
+			case TGSI_TEXTURE_2D:
+			case TGSI_TEXTURE_SHADOW2D:
+			case TGSI_TEXTURE_RECT:
+			case TGSI_TEXTURE_SHADOWRECT:
+			case TGSI_TEXTURE_2D_ARRAY:
+			case TGSI_TEXTURE_SHADOW2D_ARRAY:
+				address[1] =
+					lp_build_add(uint_bld, address[1],
+						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
+				/* fall through */
+			case TGSI_TEXTURE_1D:
+			case TGSI_TEXTURE_SHADOW1D:
+			case TGSI_TEXTURE_1D_ARRAY:
+			case TGSI_TEXTURE_SHADOW1D_ARRAY:
+				address[0] =
+					lp_build_add(uint_bld, address[0],
+						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
+				break;
+				/* texture offsets do not apply to other texture targets */
+			}
+		}
+	}
+
+	if (opcode == TGSI_OPCODE_TG4) {
+		unsigned gather_comp = 0;
+
+		/* DMASK was repurposed for GATHER4. 4 components are always
+		 * returned and DMASK works like a swizzle - it selects
+		 * the component to fetch. The only valid DMASK values are
+		 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+		 * (red,red,red,red) etc.) The ISA document doesn't mention
+		 * this.
+		 */
+
+		/* Get the component index from src1.x for Gather4. */
+		if (!tgsi_is_shadow_target(target)) {
+			LLVMValueRef comp_imm;
+			struct tgsi_src_register src1 = inst->Src[1].Register;
+
+			assert(src1.File == TGSI_FILE_IMMEDIATE);
+
+			comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
+			gather_comp = LLVMConstIntGetZExtValue(comp_imm);
+			gather_comp = CLAMP(gather_comp, 0, 3);
+		}
+
+		dmask = 1 << gather_comp;
+	}
+
+	set_tex_fetch_args(ctx, emit_data, target, res_ptr,
+			   samp_ptr, address, count, dmask);
+}
+
+/* Gather4 should follow the same rules as bilinear filtering, but the hardware
+ * incorrectly forces nearest filtering if the texture format is integer.
+ * The only effect it has on Gather4, which always returns 4 texels for
+ * bilinear filtering, is that the final coordinates are off by 0.5 of
+ * the texel size.
+ *
+ * The workaround is to subtract 0.5 from the unnormalized coordinates,
+ * or (0.5 / size) from the normalized coordinates.
+ *
+ * However, cube textures with 8_8_8_8 data formats require a different
+ * workaround of overriding the num format to USCALED/SSCALED. This would lose
+ * precision in 32-bit data formats, so it needs to be applied dynamically at
+ * runtime. In this case, return an i1 value that indicates whether the
+ * descriptor was overridden (and hence a fixup of the sampler result is needed).
+ */
+static LLVMValueRef
+si_lower_gather4_integer(struct si_shader_context *ctx,
+			 struct ac_image_args *args,
+			 unsigned target,
+			 enum tgsi_return_type return_type)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	LLVMValueRef wa_8888 = NULL; /* i1 condition; stays NULL for non-cube targets */
+	LLVMValueRef coord = args->addr;
+	LLVMValueRef half_texel[2]; /* per-axis value added to the first two coordinates */
+	/* Texture coordinates start after:
+	 *   {offset, bias, z-compare, derivatives}
+	 * Only the offset and z-compare can occur here.
+	 */
+	unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
+	int c;
+
+	assert(return_type == TGSI_RETURN_TYPE_SINT ||
+	       return_type == TGSI_RETURN_TYPE_UINT);
+
+	if (target == TGSI_TEXTURE_CUBE ||
+	    target == TGSI_TEXTURE_CUBE_ARRAY) {
+		LLVMValueRef formats;
+		LLVMValueRef data_format;
+		LLVMValueRef wa_formats;
+
+		formats = LLVMBuildExtractElement(builder, args->resource, ctx->i32_1, "");
+		/* Isolate the 6-bit field at bits [25:20] of descriptor dword 1. */
+		data_format = LLVMBuildLShr(builder, formats,
+					    LLVMConstInt(ctx->i32, 20, false), "");
+		data_format = LLVMBuildAnd(builder, data_format,
+					   LLVMConstInt(ctx->i32, (1u << 6) - 1, false), "");
+		wa_8888 = LLVMBuildICmp(
+			builder, LLVMIntEQ, data_format,
+			LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false),
+			"");
+		/* AND dword 1 with C_008F14_NUM_FORMAT_GFX6, then OR in USCALED/SSCALED. */
+		uint32_t wa_num_format =
+			return_type == TGSI_RETURN_TYPE_UINT ?
+			S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_USCALED) :
+			S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_SSCALED);
+		wa_formats = LLVMBuildAnd(builder, formats,
+					  LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT_GFX6, false),
+					  "");
+		wa_formats = LLVMBuildOr(builder, wa_formats,
+					LLVMConstInt(ctx->i32, wa_num_format, false), "");
+		/* Use the patched dword only when the data format is 8_8_8_8. */
+		formats = LLVMBuildSelect(builder, wa_8888, wa_formats, formats, "");
+		args->resource = LLVMBuildInsertElement(
+			builder, args->resource, formats, ctx->i32_1, "");
+	}
+
+	if (target == TGSI_TEXTURE_RECT ||
+	    target == TGSI_TEXTURE_SHADOWRECT) {
+		assert(!wa_8888); /* only set for the cube targets above */
+		half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
+	} else {
+		struct tgsi_full_instruction txq_inst = {};
+		struct lp_build_emit_data txq_emit_data = {};
+		struct lp_build_if_state if_ctx;
+
+		if (wa_8888) {
+			/* Skip the texture size query entirely if we don't need it. */
+			lp_build_if(&if_ctx, &ctx->gallivm, LLVMBuildNot(builder, wa_8888, ""));
+		}
+
+		/* Query the texture size. */
+		txq_inst.Texture.Texture = target;
+		txq_emit_data.inst = &txq_inst;
+		txq_emit_data.dst_type = ctx->v4i32;
+		set_tex_fetch_args(ctx, &txq_emit_data, target,
+				   args->resource, NULL, &ctx->i32_0,
+				   1, 0xf);
+		txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
+
+		/* Compute -0.5 / size. */
+		for (c = 0; c < 2; c++) {
+			half_texel[c] =
+				LLVMBuildExtractElement(builder, txq_emit_data.output[0],
+							LLVMConstInt(ctx->i32, c, 0), "");
+			half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
+			half_texel[c] =
+				lp_build_emit_llvm_unary(&ctx->bld_base,
+							 TGSI_OPCODE_RCP, half_texel[c]);
+			half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
+						      LLVMConstReal(ctx->f32, -0.5), "");
+		}
+
+		if (wa_8888) {
+			lp_build_endif(&if_ctx);
+			/* Merge: the computed value from the "if" body, 0.0 when it was skipped. */
+			LLVMBasicBlockRef bb[2] = { if_ctx.true_block, if_ctx.entry_block };
+
+			for (c = 0; c < 2; c++) {
+				LLVMValueRef values[2] = { half_texel[c], ctx->ac.f32_0 };
+				half_texel[c] = ac_build_phi(&ctx->ac, ctx->f32, 2,
+							     values, bb);
+			}
+		}
+	}
+	/* Add half_texel to the first two coordinates; they are bitcast to f32 and back to i32. */
+	for (c = 0; c < 2; c++) {
+		LLVMValueRef tmp;
+		LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
+
+		tmp = LLVMBuildExtractElement(builder, coord, index, "");
+		tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
+		tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
+		tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
+		coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
+	}
+
+	args->addr = coord;
+
+	return wa_8888; /* NULL unless the target is a cube or cube array */
+}
+
+/* Post-gather fixup for the cube 8_8_8_8 integer workaround: for every channel
+ * of "result", select between the raw value and its float->int conversion
+ * (bitcast back to f32), depending on the i1 condition "wa".
+ */
+static LLVMValueRef
+si_fix_gather4_integer_result(struct si_shader_context *ctx,
+			   LLVMValueRef result,
+			   enum tgsi_return_type return_type,
+			   LLVMValueRef wa)
+{
+	LLVMBuilderRef b = ctx->gallivm.builder;
+	const bool is_unsigned = return_type == TGSI_RETURN_TYPE_UINT;
+	unsigned i = 0;
+
+	assert(is_unsigned || return_type == TGSI_RETURN_TYPE_SINT);
+
+	while (i < 4) {
+		LLVMValueRef idx = LLVMConstInt(ctx->i32, i, false);
+		LLVMValueRef raw = LLVMBuildExtractElement(b, result, idx, "");
+		LLVMValueRef as_int;
+
+		/* Convert with the signedness matching the return type. */
+		as_int = is_unsigned ? LLVMBuildFPToUI(b, raw, ctx->i32, "")
+				     : LLVMBuildFPToSI(b, raw, ctx->i32, "");
+		as_int = LLVMBuildBitCast(b, as_int, ctx->f32, "");
+
+		raw = LLVMBuildSelect(b, wa, as_int, raw, "");
+		result = LLVMBuildInsertElement(b, result, raw, idx, "");
+		i++;
+	}
+
+	return result;
+}
+
+/* Emit callback of tex_action: builds the fetch and stores it in emit_data->output. */
+static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
+				struct lp_build_tgsi_context *bld_base,
+				struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	struct ac_image_args args;
+	unsigned opcode = inst->Instruction.Opcode;
+	unsigned target = inst->Texture.Texture;
+
+	if (target == TGSI_TEXTURE_BUFFER) { /* buffer textures skip the image path */
+		emit_data->output[emit_data->chan] =
+			ac_build_buffer_load_format(&ctx->ac,
+						    emit_data->args[0],
+						    emit_data->args[2],
+						    emit_data->args[1],
+						    true);
+		return;
+	}
+
+	memcpy(&args, emit_data->args, sizeof(args)); /* ugly: args[] is reinterpreted as ac_image_args */
+	/* Defaults; the per-opcode switch below overrides them where needed. */
+	args.opcode = ac_image_sample;
+	args.compare = tgsi_is_shadow_target(target);
+	args.offset = inst->Texture.NumOffsets > 0;
+
+	switch (opcode) {
+	case TGSI_OPCODE_TXF:
+	case TGSI_OPCODE_TXF_LZ:
+		args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
+			      target == TGSI_TEXTURE_2D_MSAA ||
+			      target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
+				      ac_image_load : ac_image_load_mip;
+		args.compare = false;
+		args.offset = false;
+		break;
+	case TGSI_OPCODE_LODQ:
+		args.opcode = ac_image_get_lod;
+		args.compare = false;
+		args.offset = false;
+		break;
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TEX2:
+	case TGSI_OPCODE_TXP:
+		if (ctx->type != PIPE_SHADER_FRAGMENT)
+			args.level_zero = true;
+		break;
+	case TGSI_OPCODE_TEX_LZ:
+		args.level_zero = true;
+		break;
+	case TGSI_OPCODE_TXB:
+	case TGSI_OPCODE_TXB2:
+		assert(ctx->type == PIPE_SHADER_FRAGMENT);
+		args.bias = true;
+		break;
+	case TGSI_OPCODE_TXL:
+	case TGSI_OPCODE_TXL2:
+		args.lod = true;
+		break;
+	case TGSI_OPCODE_TXD:
+		args.deriv = true;
+		break;
+	case TGSI_OPCODE_TG4:
+		args.opcode = ac_image_gather4;
+		args.level_zero = true;
+		break;
+	default:
+		assert(0);
+		return;
+	}
+
+	/* The hardware needs special lowering for Gather4 with integer formats. */
+	LLVMValueRef gather4_int_result_workaround = NULL;
+	if (ctx->screen->b.chip_class <= VI &&
+	    opcode == TGSI_OPCODE_TG4) {
+		assert(inst->Texture.ReturnType != TGSI_RETURN_TYPE_UNKNOWN);
+
+		if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
+		    inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
+			gather4_int_result_workaround =
+				si_lower_gather4_integer(ctx, &args, target,
+							 inst->Texture.ReturnType);
+		}
+	}
+
+	LLVMValueRef result =
+		ac_build_image_opcode(&ctx->ac, &args);
+	/* Non-NULL only when si_lower_gather4_integer returned a condition value. */
+	if (gather4_int_result_workaround) {
+		result = si_fix_gather4_integer_result(ctx, result,
+						       inst->Texture.ReturnType,
+						       gather4_int_result_workaround);
+	}
+
+	emit_data->output[emit_data->chan] = result;
+}
+
+/* TXQS: compute the sample count from the resource descriptor. */
+static void si_llvm_emit_txqs(
+	const struct lp_build_tgsi_action *action,
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	LLVMBuilderRef b = ctx->gallivm.builder;
+	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
+	LLVMValueRef dword3_index = LLVMConstInt(ctx->i32, 3, 0);
+	LLVMValueRef field_shift = LLVMConstInt(ctx->i32, 16, 0);
+	LLVMValueRef field_mask = LLVMConstInt(ctx->i32, 0xf, 0);
+	LLVMValueRef desc, log2_samples;
+
+	/* Only the resource descriptor is used; the other two are ignored. */
+	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
+
+	/* Take the 4-bit field at bits [19:16] of dword 3 and return
+	 * 1 << field as the number of samples.
+	 */
+	desc = LLVMBuildBitCast(b, res_ptr, ctx->v8i32, "");
+	log2_samples = LLVMBuildExtractElement(b, desc, dword3_index, "");
+	log2_samples = LLVMBuildLShr(b, log2_samples, field_shift, "");
+	log2_samples = LLVMBuildAnd(b, log2_samples, field_mask, "");
+
+	emit_data->output[emit_data->chan] =
+		LLVMBuildShl(b, ctx->i32_1, log2_samples, "");
+}
+
+static const struct lp_build_tgsi_action tex_action = { /* callbacks shared by the texture opcodes */
+	.fetch_args = tex_fetch_args, /* fills emit_data via set_tex_fetch_args */
+	.emit = build_tex_intrinsic, /* stores the result in emit_data->output */
+};
+
+/**
+ * Register the fetch_args/emit callbacks of every TGSI memory opcode,
+ * including the texture opcodes, in ctx->bld_base.op_actions.
+ */
+void si_shader_context_init_mem(struct si_shader_context *ctx)
+{
+	/* Opcodes that take tex_action unchanged. */
+	static const unsigned tex_opcodes[] = {
+		TGSI_OPCODE_TEX, TGSI_OPCODE_TEX_LZ, TGSI_OPCODE_TEX2,
+		TGSI_OPCODE_TXB, TGSI_OPCODE_TXB2, TGSI_OPCODE_TXD,
+		TGSI_OPCODE_TXF, TGSI_OPCODE_TXF_LZ,
+		TGSI_OPCODE_TXL, TGSI_OPCODE_TXL2, TGSI_OPCODE_TXP,
+		TGSI_OPCODE_TG4, TGSI_OPCODE_LODQ,
+	};
+	/* Atomics share one template and differ only in intr_name. */
+	static const struct {
+		unsigned opcode;
+		const char *intr_name;
+	} atomics[] = {
+		{ TGSI_OPCODE_ATOMUADD, "add" },
+		{ TGSI_OPCODE_ATOMXCHG, "swap" },
+		{ TGSI_OPCODE_ATOMCAS, "cmpswap" },
+		{ TGSI_OPCODE_ATOMAND, "and" },
+		{ TGSI_OPCODE_ATOMOR, "or" },
+		{ TGSI_OPCODE_ATOMXOR, "xor" },
+		{ TGSI_OPCODE_ATOMUMIN, "umin" },
+		{ TGSI_OPCODE_ATOMUMAX, "umax" },
+		{ TGSI_OPCODE_ATOMIMIN, "smin" },
+		{ TGSI_OPCODE_ATOMIMAX, "smax" },
+	};
+	struct lp_build_tgsi_action *actions = ctx->bld_base.op_actions;
+	struct lp_build_tgsi_action tmpl = {};
+	unsigned i;
+
+	for (i = 0; i < sizeof(tex_opcodes) / sizeof(tex_opcodes[0]); i++)
+		actions[tex_opcodes[i]] = tex_action;
+
+	/* The query opcodes have dedicated callbacks. */
+	actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
+	actions[TGSI_OPCODE_TXQ].emit = txq_emit;
+	actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
+
+	/* Loads, stores and resource queries. */
+	actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
+	actions[TGSI_OPCODE_LOAD].emit = load_emit;
+	actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
+	actions[TGSI_OPCODE_STORE].emit = store_emit;
+	actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
+	actions[TGSI_OPCODE_RESQ].emit = resq_emit;
+
+	tmpl.fetch_args = atomic_fetch_args;
+	tmpl.emit = atomic_emit;
+	for (i = 0; i < sizeof(atomics) / sizeof(atomics[0]); i++) {
+		actions[atomics[i].opcode] = tmpl;
+		actions[atomics[i].opcode].intr_name = atomics[i].intr_name;
+	}
+}
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
index 7218d2d..1ba4ed2 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
@@ -40,7 +40,6 @@
 #include <stdio.h>
 #include <llvm-c/Transforms/IPO.h>
 #include <llvm-c/Transforms/Scalar.h>
-#include <llvm-c/Support.h>
 
 /* Data for if/else/endif and bgnloop/endloop control flow structures.
  */
@@ -50,25 +49,12 @@
 	LLVMBasicBlockRef loop_entry_block;
 };
 
-#define CPU_STRING_LEN 30
-#define FS_STRING_LEN 30
-#define TRIPLE_STRING_LEN 7
-
-/**
- * Shader types for the LLVM backend.
- */
-enum si_llvm_shader_type {
-	RADEON_LLVM_SHADER_PS = 0,
-	RADEON_LLVM_SHADER_VS = 1,
-	RADEON_LLVM_SHADER_GS = 2,
-	RADEON_LLVM_SHADER_CS = 3,
-};
-
 enum si_llvm_calling_convention {
 	RADEON_LLVM_AMDGPU_VS = 87,
 	RADEON_LLVM_AMDGPU_GS = 88,
 	RADEON_LLVM_AMDGPU_PS = 89,
 	RADEON_LLVM_AMDGPU_CS = 90,
+	RADEON_LLVM_AMDGPU_HS = 93,
 };
 
 void si_llvm_add_attribute(LLVMValueRef F, const char *name, int value)
@@ -79,87 +65,6 @@
 	LLVMAddTargetDependentFunctionAttr(F, name, str);
 }
 
-/**
- * Set the shader type we want to compile
- *
- * @param type shader type to set
- */
-void si_llvm_shader_type(LLVMValueRef F, unsigned type)
-{
-	enum si_llvm_shader_type llvm_type;
-	enum si_llvm_calling_convention calling_conv;
-
-	switch (type) {
-	case PIPE_SHADER_VERTEX:
-	case PIPE_SHADER_TESS_CTRL:
-	case PIPE_SHADER_TESS_EVAL:
-		llvm_type = RADEON_LLVM_SHADER_VS;
-		calling_conv = RADEON_LLVM_AMDGPU_VS;
-		break;
-	case PIPE_SHADER_GEOMETRY:
-		llvm_type = RADEON_LLVM_SHADER_GS;
-		calling_conv = RADEON_LLVM_AMDGPU_GS;
-		break;
-	case PIPE_SHADER_FRAGMENT:
-		llvm_type = RADEON_LLVM_SHADER_PS;
-		calling_conv = RADEON_LLVM_AMDGPU_PS;
-		break;
-	case PIPE_SHADER_COMPUTE:
-		llvm_type = RADEON_LLVM_SHADER_CS;
-		calling_conv = RADEON_LLVM_AMDGPU_CS;
-		break;
-	default:
-		unreachable("Unhandle shader type");
-	}
-
-	if (HAVE_LLVM >= 0x309)
-		LLVMSetFunctionCallConv(F, calling_conv);
-	else
-		si_llvm_add_attribute(F, "ShaderType", llvm_type);
-}
-
-static void init_amdgpu_target()
-{
-	gallivm_init_llvm_targets();
-	LLVMInitializeAMDGPUTargetInfo();
-	LLVMInitializeAMDGPUTarget();
-	LLVMInitializeAMDGPUTargetMC();
-	LLVMInitializeAMDGPUAsmPrinter();
-
-	/* For inline assembly. */
-	LLVMInitializeAMDGPUAsmParser();
-
-	if (HAVE_LLVM >= 0x0400) {
-		/*
-		 * Workaround for bug in llvm 4.0 that causes image intrinsics
-		 * to disappear.
-		 * https://reviews.llvm.org/D26348
-		 */
-		const char *argv[2] = {"mesa", "-simplifycfg-sink-common=false"};
-		LLVMParseCommandLineOptions(2, argv, NULL);
-	}
-}
-
-static once_flag init_amdgpu_target_once_flag = ONCE_FLAG_INIT;
-
-LLVMTargetRef si_llvm_get_amdgpu_target(const char *triple)
-{
-	LLVMTargetRef target = NULL;
-	char *err_message = NULL;
-
-	call_once(&init_amdgpu_target_once_flag, init_amdgpu_target);
-
-	if (LLVMGetTargetFromTriple(triple, &target, &err_message)) {
-		fprintf(stderr, "Cannot find target for triple %s ", triple);
-		if (err_message) {
-			fprintf(stderr, "%s\n", err_message);
-		}
-		LLVMDisposeMessage(err_message);
-		return NULL;
-	}
-	return target;
-}
-
 struct si_llvm_diagnostics {
 	struct pipe_debug_callback *debug;
 	unsigned retval;
@@ -243,7 +148,10 @@
 	buffer_size = LLVMGetBufferSize(out_buffer);
 	buffer_data = LLVMGetBufferStart(out_buffer);
 
-	ac_elf_read(buffer_data, buffer_size, binary);
+	if (!ac_elf_read(buffer_data, buffer_size, binary)) {
+		fprintf(stderr, "radeonsi: cannot read an ELF shader binary\n");
+		diag.retval = 1;
+	}
 
 	/* Clean up */
 	LLVMDisposeMemoryBuffer(out_buffer);
@@ -848,13 +756,10 @@
 			 * FIXME: We shouldn't need to have the non-alloca
 			 * code path for arrays. LLVM should be smart enough to
 			 * promote allocas into registers when profitable.
-			 *
-			 * LLVM 3.8 crashes with this.
 			 */
-			if ((HAVE_LLVM >= 0x0309 && array_size > 16) ||
-			    /* TODO: VGPR indexing is buggy on GFX9. */
-			    ctx->screen->b.chip_class == GFX9) {
-				array_alloca = LLVMBuildAlloca(builder,
+			if (array_size > 16 ||
+			    !ctx->screen->llvm_has_working_vgpr_indexing) {
+				array_alloca = lp_build_alloca_undef(&ctx->gallivm,
 					LLVMArrayType(ctx->f32,
 						      array_size), "array");
 				ctx->temp_array_allocas[id] = array_alloca;
@@ -1256,10 +1161,7 @@
 
 void si_llvm_context_init(struct si_shader_context *ctx,
 			  struct si_screen *sscreen,
-			  struct si_shader *shader,
-			  LLVMTargetMachineRef tm,
-			  const struct tgsi_shader_info *info,
-			  const struct tgsi_token *tokens)
+			  LLVMTargetMachineRef tm)
 {
 	struct lp_type type;
 
@@ -1269,23 +1171,19 @@
 	 * helper functions in the gallivm module.
 	 */
 	memset(ctx, 0, sizeof(*ctx));
-	ctx->shader = shader;
 	ctx->screen = sscreen;
 	ctx->tm = tm;
-	ctx->type = info ? info->processor : -1;
 
 	ctx->gallivm.context = LLVMContextCreate();
 	ctx->gallivm.module = LLVMModuleCreateWithNameInContext("tgsi",
 						ctx->gallivm.context);
 	LLVMSetTarget(ctx->gallivm.module, "amdgcn--");
 
-#if HAVE_LLVM >= 0x0309
 	LLVMTargetDataRef data_layout = LLVMCreateTargetDataLayout(tm);
 	char *data_layout_str = LLVMCopyStringRepOfTargetData(data_layout);
 	LLVMSetDataLayout(ctx->gallivm.module, data_layout_str);
 	LLVMDisposeTargetData(data_layout);
 	LLVMDisposeMessage(data_layout_str);
-#endif
 
 	bool unsafe_fpmath = (sscreen->b.debug_flags & DBG_UNSAFE_MATH) != 0;
 	enum lp_float_mode float_mode =
@@ -1295,30 +1193,12 @@
 	ctx->gallivm.builder = lp_create_builder(ctx->gallivm.context,
 						 float_mode);
 
-	ac_llvm_context_init(&ctx->ac, ctx->gallivm.context);
+	ac_llvm_context_init(&ctx->ac, ctx->gallivm.context, sscreen->b.chip_class);
 	ctx->ac.module = ctx->gallivm.module;
 	ctx->ac.builder = ctx->gallivm.builder;
 
 	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
 
-	bld_base->info = info;
-
-	if (info && info->array_max[TGSI_FILE_TEMPORARY] > 0) {
-		int size = info->array_max[TGSI_FILE_TEMPORARY];
-
-		ctx->temp_arrays = CALLOC(size, sizeof(ctx->temp_arrays[0]));
-		ctx->temp_array_allocas = CALLOC(size, sizeof(ctx->temp_array_allocas[0]));
-
-		if (tokens)
-			tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, size,
-					 ctx->temp_arrays);
-	}
-
-	if (info && info->file_max[TGSI_FILE_IMMEDIATE] >= 0) {
-		int size = info->file_max[TGSI_FILE_IMMEDIATE] + 1;
-		ctx->imms = MALLOC(size * TGSI_NUM_CHANNELS * sizeof(LLVMValueRef));
-	}
-
 	type.floating = true;
 	type.fixed = false;
 	type.sign = true;
@@ -1335,17 +1215,10 @@
 	lp_build_context_init(&ctx->bld_base.int64_bld, &ctx->gallivm, lp_int_type(type));
 
 	bld_base->soa = 1;
-	bld_base->emit_store = si_llvm_emit_store;
 	bld_base->emit_swizzle = emit_swizzle;
 	bld_base->emit_declaration = emit_declaration;
 	bld_base->emit_immediate = emit_immediate;
 
-	bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = si_llvm_emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = si_llvm_emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = si_llvm_emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = si_llvm_emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = fetch_system_value;
-
 	/* metadata allowing 2.5 ULP */
 	ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->gallivm.context,
 						       "fpmath", 6);
@@ -1363,6 +1236,7 @@
 	bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit;
 
 	si_shader_context_init_alu(&ctx->bld_base);
+	si_shader_context_init_mem(ctx);
 
 	ctx->voidt = LLVMVoidTypeInContext(ctx->gallivm.context);
 	ctx->i1 = LLVMInt1TypeInContext(ctx->gallivm.context);
@@ -1371,7 +1245,6 @@
 	ctx->i64 = LLVMInt64TypeInContext(ctx->gallivm.context);
 	ctx->i128 = LLVMIntTypeInContext(ctx->gallivm.context, 128);
 	ctx->f32 = LLVMFloatTypeInContext(ctx->gallivm.context);
-	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
 	ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
 	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
 	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
@@ -1381,6 +1254,72 @@
 	ctx->i32_1 = LLVMConstInt(ctx->i32, 1, 0);
 }
 
+/* Bind "shader" (may be NULL) to the context and rebuild the per-shader
+ * state from its selector. Can be called repeatedly to switch shaders. */
+void si_llvm_context_set_tgsi(struct si_shader_context *ctx,
+			      struct si_shader *shader)
+{
+	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
+	struct si_shader_selector *sel = shader ? shader->selector : NULL;
+	const struct tgsi_shader_info *info = sel ? &sel->info : NULL;
+	const struct tgsi_token *tokens = sel ? sel->tokens : NULL;
+
+	ctx->shader = shader;
+	ctx->type = info ? info->processor : -1;
+	bld_base->info = info;
+
+	/* Release everything that belonged to the previous shader. */
+	FREE(ctx->temp_arrays);
+	FREE(ctx->temp_array_allocas);
+	FREE(ctx->imms);
+	FREE(ctx->temps);
+	ctx->temp_arrays = NULL;
+	ctx->temp_array_allocas = NULL;
+	ctx->imms = NULL;
+	ctx->temps = NULL;
+	ctx->imms_num = 0;
+	ctx->temps_count = 0;
+
+	/* Nothing more to set up without both TGSI info and tokens. */
+	if (!(info && tokens))
+		return;
+
+	if (info->array_max[TGSI_FILE_TEMPORARY] > 0) {
+		int num_arrays = info->array_max[TGSI_FILE_TEMPORARY];
+
+		ctx->temp_arrays = CALLOC(num_arrays,
+					  sizeof(ctx->temp_arrays[0]));
+		ctx->temp_array_allocas = CALLOC(num_arrays,
+						 sizeof(ctx->temp_array_allocas[0]));
+		tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, num_arrays,
+				 ctx->temp_arrays);
+	}
+
+	if (info->file_max[TGSI_FILE_IMMEDIATE] >= 0) {
+		int num_imms = info->file_max[TGSI_FILE_IMMEDIATE] + 1;
+
+		ctx->imms = MALLOC(num_imms * TGSI_NUM_CHANNELS *
+				   sizeof(LLVMValueRef));
+	}
+
+	/* Reset the translation state so the new shader starts clean. */
+	memset(ctx->outputs, 0, sizeof(ctx->outputs));
+	bld_base->num_instructions = 0;
+	bld_base->pc = 0;
+
+	bld_base->emit_store = si_llvm_emit_store;
+	bld_base->emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = fetch_system_value;
+	bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = si_llvm_emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = si_llvm_emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = si_llvm_emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = si_llvm_emit_fetch;
+
+	ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
+	ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);
+	ctx->num_samplers = util_last_bit(info->samplers_declared);
+	ctx->num_images = util_last_bit(info->images_declared);
+}
+
 void si_llvm_create_func(struct si_shader_context *ctx,
 			 const char *name,
 			 LLVMTypeRef *return_types, unsigned num_return_elems,
@@ -1388,6 +1327,8 @@
 {
 	LLVMTypeRef main_fn_type, ret_type;
 	LLVMBasicBlockRef main_fn_body;
+	enum si_llvm_calling_convention call_conv;
+	unsigned real_shader_type;
 
 	if (num_return_elems)
 		ret_type = LLVMStructTypeInContext(ctx->gallivm.context,
@@ -1403,22 +1344,60 @@
 	main_fn_body = LLVMAppendBasicBlockInContext(ctx->gallivm.context,
 			ctx->main_fn, "main_body");
 	LLVMPositionBuilderAtEnd(ctx->gallivm.builder, main_fn_body);
+
+	real_shader_type = ctx->type;
+
+	/* LS is merged into HS (TCS), and ES is merged into GS. */
+	if (ctx->screen->b.chip_class >= GFX9) {
+		if (ctx->shader->key.as_ls)
+			real_shader_type = PIPE_SHADER_TESS_CTRL;
+		else if (ctx->shader->key.as_es)
+			real_shader_type = PIPE_SHADER_GEOMETRY;
+	}
+
+	switch (real_shader_type) {
+	case PIPE_SHADER_VERTEX:
+	case PIPE_SHADER_TESS_EVAL:
+		call_conv = RADEON_LLVM_AMDGPU_VS;
+		break;
+	case PIPE_SHADER_TESS_CTRL:
+		call_conv = HAVE_LLVM >= 0x0500 ? RADEON_LLVM_AMDGPU_HS :
+						  RADEON_LLVM_AMDGPU_VS;
+		break;
+	case PIPE_SHADER_GEOMETRY:
+		call_conv = RADEON_LLVM_AMDGPU_GS;
+		break;
+	case PIPE_SHADER_FRAGMENT:
+		call_conv = RADEON_LLVM_AMDGPU_PS;
+		break;
+	case PIPE_SHADER_COMPUTE:
+		call_conv = RADEON_LLVM_AMDGPU_CS;
+		break;
+	default:
+		unreachable("Unhandle shader type");
+	}
+
+	LLVMSetFunctionCallConv(ctx->main_fn, call_conv);
 }
 
-void si_llvm_finalize_module(struct si_shader_context *ctx,
-			     bool run_verifier)
+void si_llvm_optimize_module(struct si_shader_context *ctx)
 {
 	struct gallivm_state *gallivm = &ctx->gallivm;
 	const char *triple = LLVMGetTarget(gallivm->module);
 	LLVMTargetLibraryInfoRef target_library_info;
 
+	/* Dump LLVM IR before any optimization passes */
+	if (ctx->screen->b.debug_flags & DBG_PREOPT_IR &&
+	    r600_can_dump_shader(&ctx->screen->b, ctx->type))
+		LLVMDumpModule(ctx->gallivm.module);
+
 	/* Create the pass manager */
 	gallivm->passmgr = LLVMCreatePassManager();
 
 	target_library_info = gallivm_create_target_library_info(triple);
 	LLVMAddTargetLibraryInfo(target_library_info, gallivm->passmgr);
 
-	if (run_verifier)
+	if (r600_extra_shader_checks(&ctx->screen->b, ctx->type))
 		LLVMAddVerifierPass(gallivm->passmgr);
 
 	LLVMAddAlwaysInlinerPass(gallivm->passmgr);
@@ -1431,6 +1410,10 @@
 	LLVMAddLICMPass(gallivm->passmgr);
 	LLVMAddAggressiveDCEPass(gallivm->passmgr);
 	LLVMAddCFGSimplificationPass(gallivm->passmgr);
+#if HAVE_LLVM >= 0x0400
+	/* This is recommended by the instruction combining pass. */
+	LLVMAddEarlyCSEMemSSAPass(gallivm->passmgr);
+#endif
 	LLVMAddInstructionCombiningPass(gallivm->passmgr);
 
 	/* Run the pass */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index a98f5c7..84e06b0 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -74,11 +74,6 @@
 	}
 }
 
-static uint32_t S_FIXED(float value, uint32_t frac_bits)
-{
-	return value * (1 << frac_bits);
-}
-
 /* 12.4 fixed-point */
 static unsigned si_pack_float_12p4(float x)
 {
@@ -603,9 +598,27 @@
 static void si_bind_blend_state(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	si_pm4_bind_state(sctx, blend, (struct si_state_blend *)state);
-	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
-	sctx->do_update_shaders = true;
+	struct si_state_blend *old_blend = sctx->queued.named.blend;
+	struct si_state_blend *blend = (struct si_state_blend *)state;
+
+	if (!state)
+		return;
+
+	if (!old_blend ||
+	     old_blend->cb_target_mask != blend->cb_target_mask ||
+	     old_blend->dual_src_blend != blend->dual_src_blend)
+		si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+
+	si_pm4_bind_state(sctx, blend, state);
+
+	if (!old_blend ||
+	    old_blend->cb_target_mask != blend->cb_target_mask ||
+	    old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
+	    old_blend->alpha_to_one != blend->alpha_to_one ||
+	    old_blend->dual_src_blend != blend->dual_src_blend ||
+	    old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+	    old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
+		sctx->do_update_shaders = true;
 }
 
 static void si_delete_blend_state(struct pipe_context *ctx, void *state)
@@ -619,9 +632,6 @@
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
-	if (memcmp(&sctx->blend_color.state, state, sizeof(*state)) == 0)
-		return;
-
 	sctx->blend_color.state = *state;
 	si_mark_atom_dirty(sctx, &sctx->blend_color.atom);
 }
@@ -666,24 +676,21 @@
 	radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4);
 }
 
-#define SIX_BITS 0x3F
-
 static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *vs = si_get_vs_state(sctx);
-	struct tgsi_shader_info *info = si_get_vs_info(sctx);
+	struct si_shader_selector *vs_sel = vs->selector;
+	struct tgsi_shader_info *info = &vs_sel->info;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	unsigned window_space =
 	   info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-	unsigned clipdist_mask =
-		info->writes_clipvertex ? SIX_BITS : info->clipdist_writemask;
+	unsigned clipdist_mask = vs_sel->clipdist_mask;
 	unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
-	unsigned culldist_mask = info->culldist_writemask << info->num_written_clipdistance;
+	unsigned culldist_mask = vs_sel->culldist_mask;
 	unsigned total_mask;
-	bool misc_vec_ena;
 
-	if (vs->key.opt.hw_vs.clip_disable) {
+	if (vs->key.opt.clip_disable) {
 		assert(!info->culldist_writemask);
 		clipdist_mask = 0;
 		culldist_mask = 0;
@@ -699,27 +706,15 @@
 	clipdist_mask &= rs->clip_plane_enable;
 	culldist_mask |= clipdist_mask;
 
-	misc_vec_ena = info->writes_psize || info->writes_edgeflag ||
-		       info->writes_layer || info->writes_viewport_index;
-
 	radeon_set_context_reg(cs, R_02881C_PA_CL_VS_OUT_CNTL,
-		S_02881C_USE_VTX_POINT_SIZE(info->writes_psize) |
-		S_02881C_USE_VTX_EDGE_FLAG(info->writes_edgeflag) |
-		S_02881C_USE_VTX_RENDER_TARGET_INDX(info->writes_layer) |
-	        S_02881C_USE_VTX_VIEWPORT_INDX(info->writes_viewport_index) |
+		vs_sel->pa_cl_vs_out_cntl |
 		S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
 		S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
-		S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
-		S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
 		clipdist_mask | (culldist_mask << 8));
 	radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,
 		rs->pa_cl_clip_cntl |
 		ucp_mask |
 		S_028810_CLIP_DISABLE(window_space));
-
-	/* reuse needs to be set off if we write oViewport */
-	radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF,
-			       S_028AB4_REUSE_OFF(info->writes_viewport_index));
 }
 
 /*
@@ -836,8 +831,8 @@
 			S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min/2)) |
 			S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max/2)));
 
-	tmp = (unsigned)state->line_width * 8;
-	si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, S_028A08_WIDTH(tmp));
+	si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL,
+		       S_028A08_WIDTH(si_pack_float_12p4(state->line_width/2)));
 	si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0,
 		       S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
 		       S_028A48_MSAA_ENABLE(state->multisample ||
@@ -864,6 +859,15 @@
 		S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
 		S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)));
 
+	if (!rs->uses_poly_offset)
+		return rs;
+
+	rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state));
+	if (!rs->pm4_poly_offset) {
+		FREE(rs);
+		return NULL;
+	}
+
 	/* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
 	for (i = 0; i < 3; i++) {
 		struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i];
@@ -933,19 +937,39 @@
 	si_pm4_bind_state(sctx, rasterizer, rs);
 	si_update_poly_offset_state(sctx);
 
-	si_mark_atom_dirty(sctx, &sctx->clip_regs);
+	if (!old_rs ||
+	    old_rs->clip_plane_enable != rs->clip_plane_enable ||
+	    old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
+		si_mark_atom_dirty(sctx, &sctx->clip_regs);
+
 	sctx->ia_multi_vgt_param_key.u.line_stipple_enabled =
 		rs->line_stipple_enable;
-	sctx->do_update_shaders = true;
+
+	if (!old_rs ||
+	    old_rs->clip_plane_enable != rs->clip_plane_enable ||
+	    old_rs->rasterizer_discard != rs->rasterizer_discard ||
+	    old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
+	    old_rs->flatshade != rs->flatshade ||
+	    old_rs->two_side != rs->two_side ||
+	    old_rs->multisample_enable != rs->multisample_enable ||
+	    old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
+	    old_rs->poly_smooth != rs->poly_smooth ||
+	    old_rs->line_smooth != rs->line_smooth ||
+	    old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
+	    old_rs->force_persample_interp != rs->force_persample_interp)
+		sctx->do_update_shaders = true;
 }
 
 static void si_delete_rs_state(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
 
 	if (sctx->queued.named.rasterizer == state)
 		si_pm4_bind_state(sctx, poly_offset, NULL);
-	si_pm4_delete_state(sctx, rasterizer, (struct si_state_rasterizer *)state);
+
+	FREE(rs->pm4_poly_offset);
+	si_pm4_delete_state(sctx, rasterizer, rs);
 }
 
 /*
@@ -1062,7 +1086,8 @@
 	}
 
 	si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
-	si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
+	if (state->stencil[0].enabled)
+		si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
 	if (state->depth.bounds_test) {
 		si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min));
 		si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
@@ -1074,6 +1099,7 @@
 static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
 {
         struct si_context *sctx = (struct si_context *)ctx;
+	struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
         struct si_state_dsa *dsa = state;
 
         if (!state)
@@ -1086,7 +1112,9 @@
 		sctx->stencil_ref.dsa_part = dsa->stencil_ref;
 		si_mark_atom_dirty(sctx, &sctx->stencil_ref.atom);
 	}
-	sctx->do_update_shaders = true;
+
+	if (!old_dsa || old_dsa->alpha_func != dsa->alpha_func)
+		sctx->do_update_shaders = true;
 }
 
 static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
@@ -2290,7 +2318,7 @@
 				      S_02801C_Y_MAX(rtex->resource.b.b.height0 - 1);
 
 		/* Only use HTILE for the first level. */
-		if (rtex->htile_buffer && !level) {
+		if (rtex->htile_offset && !level) {
 			z_info |= S_028038_TILE_SURFACE_ENABLE(1) |
 				  S_028038_ALLOW_EXPCLEAR(1);
 
@@ -2316,7 +2344,8 @@
 				s_info |= S_02803C_TILE_STENCIL_DISABLE(1);
 			}
 
-			surf->db_htile_data_base = rtex->htile_buffer->gpu_address >> 8;
+			surf->db_htile_data_base = (rtex->resource.gpu_address +
+						    rtex->htile_offset) >> 8;
 			surf->db_htile_surface = S_028ABC_FULL_CACHE(1) |
 						 S_028ABC_PIPE_ALIGNED(rtex->surface.u.gfx9.htile.pipe_aligned) |
 						 S_028ABC_RB_ALIGNED(rtex->surface.u.gfx9.htile.rb_aligned);
@@ -2368,7 +2397,7 @@
 								levelinfo->nblk_y) / 64 - 1);
 
 		/* Only use HTILE for the first level. */
-		if (rtex->htile_buffer && !level) {
+		if (rtex->htile_offset && !level) {
 			z_info |= S_028040_TILE_SURFACE_ENABLE(1) |
 				  S_028040_ALLOW_EXPCLEAR(1);
 
@@ -2394,7 +2423,8 @@
 				s_info |= S_028044_TILE_STENCIL_DISABLE(1);
 			}
 
-			surf->db_htile_data_base = rtex->htile_buffer->gpu_address >> 8;
+			surf->db_htile_data_base = (rtex->resource.gpu_address +
+						    rtex->htile_offset) >> 8;
 			surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
 
 			if (rtex->tc_compatible_htile) {
@@ -2416,6 +2446,38 @@
 	surf->depth_initialized = true;
 }
 
+void si_update_fb_dirtiness_after_rendering(struct si_context *sctx)
+{
+	if (sctx->decompression_enabled)
+		return;
+
+	if (sctx->framebuffer.state.zsbuf) {
+		struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
+		struct r600_texture *rtex = (struct r600_texture *)surf->texture;
+
+		rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+
+		if (rtex->surface.flags & RADEON_SURF_SBUFFER)
+			rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
+	}
+	if (sctx->framebuffer.compressed_cb_mask) {
+		struct pipe_surface *surf;
+		struct r600_texture *rtex;
+		unsigned mask = sctx->framebuffer.compressed_cb_mask;
+
+		do {
+			unsigned i = u_bit_scan(&mask);
+			surf = sctx->framebuffer.state.cbufs[i];
+			rtex = (struct r600_texture*)surf->texture;
+
+			if (rtex->fmask.size)
+				rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+			if (rtex->dcc_gather_statistics)
+				rtex->separate_dcc_dirty = true;
+		} while (mask);
+	}
+}
+
 static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *state)
 {
 	for (int i = 0; i < state->nr_cbufs; ++i) {
@@ -2443,6 +2505,8 @@
 	bool unbound = false;
 	int i;
 
+	si_update_fb_dirtiness_after_rendering(sctx);
+
 	for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
 		if (!sctx->framebuffer.state.cbufs[i])
 			continue;
@@ -2484,14 +2548,46 @@
 	 * the only client not using TC that can change textures is
 	 * the framebuffer.
 	 *
-	 * Flush all CB and DB caches here because all buffers can be used
-	 * for write by both TC (with shader image stores) and CB/DB.
+	 * Wait for compute shaders because of possible transitions:
+	 * - FB write -> shader read
+	 * - shader write -> FB read
+	 *
+	 * DB caches are flushed on demand (using si_decompress_textures).
+	 *
+	 * When MSAA is enabled, CB and TC caches are flushed on demand
+	 * (after FMASK decompression). Shader write -> FB read transitions
+	 * cannot happen for MSAA textures, because MSAA shader images are
+	 * not supported.
+	 *
+	 * Only flush and wait for CB if there is actually a bound color buffer.
 	 */
-	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-			 SI_CONTEXT_INV_GLOBAL_L2 |
-			 SI_CONTEXT_FLUSH_AND_INV_CB |
-			 SI_CONTEXT_FLUSH_AND_INV_DB |
-			 SI_CONTEXT_CS_PARTIAL_FLUSH;
+	if (sctx->framebuffer.nr_samples <= 1 &&
+	    sctx->framebuffer.state.nr_cbufs) {
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+				 SI_CONTEXT_INV_GLOBAL_L2 |
+				 SI_CONTEXT_FLUSH_AND_INV_CB;
+	}
+	sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+	/* u_blitter doesn't invoke depth decompression when it does multiple
+	 * blits in a row, but the only case when it matters for DB is when
+	 * doing generate_mipmap. So here we flush DB manually between
+	 * individual generate_mipmap blits.
+	 * Note that lower mipmap levels aren't compressed.
+	 */
+	if (sctx->generate_mipmap_for_depth) {
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+				 SI_CONTEXT_INV_GLOBAL_L2 |
+				 SI_CONTEXT_FLUSH_AND_INV_DB;
+	} else if (sctx->b.chip_class == GFX9) {
+		/* It appears that DB metadata "leaks" in a sequence of:
+		 *  - depth clear
+		 *  - DCC decompress for shader image writes (with DB disabled)
+		 *  - render with DEPTH_BEFORE_SHADER=1
+		 * Flushing DB metadata works around the problem.
+		 */
+		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
+	}
 
 	/* Take the maximum of the old and new count. If the new count is lower,
 	 * dirtying is needed to disable the unbound colorbuffers.
@@ -2609,9 +2705,14 @@
 		si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
 	}
 
-	sctx->need_check_render_feedback = true;
 	sctx->do_update_shaders = true;
-	sctx->framebuffer.do_update_surf_dirtiness = true;
+
+	if (!sctx->decompression_enabled) {
+		/* Prevent texture decompression when the framebuffer state
+		 * change comes from the decompression passes themselves.
+		 */
+		sctx->need_check_render_feedback = true;
+	}
 }
 
 static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom)
@@ -2783,12 +2884,6 @@
 					      RADEON_PRIO_DEPTH_BUFFER_MSAA :
 					      RADEON_PRIO_DEPTH_BUFFER);
 
-		if (zb->db_htile_data_base) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-					      rtex->htile_buffer, RADEON_USAGE_READWRITE,
-					      RADEON_PRIO_HTILE);
-		}
-
 		if (sctx->b.chip_class >= GFX9) {
 			radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
 			radeon_emit(cs, zb->db_htile_data_base);	/* DB_HTILE_DATA_BASE */
@@ -3069,14 +3164,13 @@
 			   uint32_t *fmask_state)
 {
 	struct pipe_resource *res = &tex->resource.b.b;
-	const struct util_format_description *base_desc, *desc;
+	const struct util_format_description *desc;
 	unsigned char swizzle[4];
 	int first_non_void;
 	unsigned num_format, data_format, type;
 	uint64_t va;
 
 	desc = util_format_description(pipe_format);
-	base_desc = util_format_description(res->format);
 
 	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
 		const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
@@ -3177,14 +3271,11 @@
 		data_format = 0;
 	}
 
-	/* Enable clamping for UNORM depth formats promoted to Z32F. */
+	/* S8 with Z32 HTILE needs a special format. */
 	if (screen->b.chip_class >= GFX9 &&
-	    util_format_has_depth(desc) &&
-	    num_format == V_008F14_IMG_NUM_FORMAT_FLOAT &&
-	    util_get_depth_format_type(base_desc) != UTIL_FORMAT_TYPE_FLOAT) {
-		/* NUM_FORMAT=FLOAT and DATA_FORMAT=24_8 means "clamp to [0,1]". */
-		data_format = V_008F14_IMG_DATA_FORMAT_24_8;
-	}
+	    pipe_format == PIPE_FORMAT_S8_UINT &&
+	    tex->tc_compatible_htile)
+		data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
 
 	if (!sampler &&
 	    (res->target == PIPE_TEXTURE_CUBE ||
@@ -3522,16 +3613,54 @@
 		 wrap == PIPE_TEX_WRAP_MIRROR_CLAMP));
 }
 
-static bool sampler_state_needs_border_color(const struct pipe_sampler_state *state)
+static uint32_t si_translate_border_color(struct si_context *sctx,
+					  const struct pipe_sampler_state *state,
+					  const union pipe_color_union *color)
 {
 	bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
 			     state->mag_img_filter != PIPE_TEX_FILTER_NEAREST;
 
-	return (state->border_color.ui[0] || state->border_color.ui[1] ||
-		state->border_color.ui[2] || state->border_color.ui[3]) &&
-	       (wrap_mode_uses_border_color(state->wrap_s, linear_filter) ||
-		wrap_mode_uses_border_color(state->wrap_t, linear_filter) ||
-		wrap_mode_uses_border_color(state->wrap_r, linear_filter));
+	if ((color->f[0] == 0 && color->f[1] == 0 &&
+	     color->f[2] == 0 && color->f[3] == 0) ||
+	    (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) &&
+	     !wrap_mode_uses_border_color(state->wrap_t, linear_filter) &&
+	     !wrap_mode_uses_border_color(state->wrap_r, linear_filter)))
+		return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
+
+	if (color->f[0] == 0 && color->f[1] == 0 &&
+	    color->f[2] == 0 && color->f[3] == 1)
+		return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK);
+	if (color->f[0] == 1 && color->f[1] == 1 &&
+	    color->f[2] == 1 && color->f[3] == 1)
+		return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE);
+
+	int i;
+
+	/* Check if the border has been uploaded already. */
+	for (i = 0; i < sctx->border_color_count; i++)
+		if (memcmp(&sctx->border_color_table[i], color,
+			   sizeof(*color)) == 0)
+			break;
+
+	if (i >= SI_MAX_BORDER_COLORS) {
+		/* Getting 4096 unique border colors is very unlikely. */
+		fprintf(stderr, "radeonsi: The border color table is full. "
+			"Any new border colors will be just black. "
+			"Please file a bug.\n");
+		return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
+	}
+
+	if (i == sctx->border_color_count) {
+		/* Upload a new border color. */
+		memcpy(&sctx->border_color_table[i], color,
+		       sizeof(*color));
+		util_memcpy_cpu_to_le32(&sctx->border_color_map[i],
+					color, sizeof(*color));
+		sctx->border_color_count++;
+	}
+
+	return S_008F3C_BORDER_COLOR_PTR(i) |
+	       S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER);
 }
 
 static void *si_create_sampler_state(struct pipe_context *ctx,
@@ -3540,64 +3669,15 @@
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct r600_common_screen *rscreen = sctx->b.screen;
 	struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
-	unsigned border_color_type, border_color_index = 0;
 	unsigned max_aniso = rscreen->force_aniso >= 0 ? rscreen->force_aniso
 						       : state->max_anisotropy;
 	unsigned max_aniso_ratio = r600_tex_aniso_filter(max_aniso);
+	union pipe_color_union clamped_border_color;
 
 	if (!rstate) {
 		return NULL;
 	}
 
-	if (!sampler_state_needs_border_color(state))
-		border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK;
-	else if (state->border_color.f[0] == 0 &&
-		 state->border_color.f[1] == 0 &&
-		 state->border_color.f[2] == 0 &&
-		 state->border_color.f[3] == 0)
-		border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK;
-	else if (state->border_color.f[0] == 0 &&
-		 state->border_color.f[1] == 0 &&
-		 state->border_color.f[2] == 0 &&
-		 state->border_color.f[3] == 1)
-		border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK;
-	else if (state->border_color.f[0] == 1 &&
-		 state->border_color.f[1] == 1 &&
-		 state->border_color.f[2] == 1 &&
-		 state->border_color.f[3] == 1)
-		border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE;
-	else {
-		int i;
-
-		border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER;
-
-		/* Check if the border has been uploaded already. */
-		for (i = 0; i < sctx->border_color_count; i++)
-			if (memcmp(&sctx->border_color_table[i], &state->border_color,
-				   sizeof(state->border_color)) == 0)
-				break;
-
-		if (i >= SI_MAX_BORDER_COLORS) {
-			/* Getting 4096 unique border colors is very unlikely. */
-			fprintf(stderr, "radeonsi: The border color table is full. "
-				"Any new border colors will be just black. "
-				"Please file a bug.\n");
-			border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK;
-		} else {
-			if (i == sctx->border_color_count) {
-				/* Upload a new border color. */
-				memcpy(&sctx->border_color_table[i], &state->border_color,
-				       sizeof(state->border_color));
-				util_memcpy_cpu_to_le32(&sctx->border_color_map[i],
-							&state->border_color,
-							sizeof(state->border_color));
-				sctx->border_color_count++;
-			}
-
-			border_color_index = i;
-		}
-	}
-
 #ifdef DEBUG
 	rstate->magic = SI_SAMPLER_STATE_MAGIC;
 #endif
@@ -3618,12 +3698,28 @@
 			  S_008F38_XY_MAG_FILTER(eg_tex_filter(state->mag_img_filter, max_aniso)) |
 			  S_008F38_XY_MIN_FILTER(eg_tex_filter(state->min_img_filter, max_aniso)) |
 			  S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) |
-			  S_008F38_MIP_POINT_PRECLAMP(1) |
+			  S_008F38_MIP_POINT_PRECLAMP(0) |
 			  S_008F38_DISABLE_LSB_CEIL(sctx->b.chip_class <= VI) |
 			  S_008F38_FILTER_PREC_FIX(1) |
 			  S_008F38_ANISO_OVERRIDE(sctx->b.chip_class >= VI));
-	rstate->val[3] = S_008F3C_BORDER_COLOR_PTR(border_color_index) |
-			 S_008F3C_BORDER_COLOR_TYPE(border_color_type);
+	rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color);
+
+	/* Create sampler resource for upgraded depth textures. */
+	memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val));
+
+	for (unsigned i = 0; i < 4; ++i) {
+		/* Use channel 0 on purpose, so that we can use OPAQUE_WHITE
+		 * when the border color is 1.0. */
+		clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1);
+	}
+
+	if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0)
+		rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1);
+	else
+		rstate->upgraded_depth_val[3] =
+			si_translate_border_color(sctx, state, &clamped_border_color) |
+			S_008F3C_UPGRADED_DEPTH(1);
+
 	return rstate;
 }
 
@@ -3675,7 +3771,7 @@
 				       const struct pipe_vertex_element *elements)
 {
 	struct si_screen *sscreen = (struct si_screen*)ctx->screen;
-	struct si_vertex_element *v = CALLOC_STRUCT(si_vertex_element);
+	struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
 	bool used[SI_NUM_VERTEX_BUFFERS] = {};
 	int i;
 
@@ -3699,6 +3795,16 @@
 			return NULL;
 		}
 
+		if (elements[i].instance_divisor) {
+			v->uses_instance_divisors = true;
+			v->instance_divisors[i] = elements[i].instance_divisor;
+
+			if (v->instance_divisors[i] == 1)
+				v->instance_divisor_is_one |= 1u << i;
+			else
+				v->instance_divisor_is_fetched |= 1u << i;
+		}
+
 		if (!used[vbo_index]) {
 			v->first_vb_use_mask |= 1 << i;
 			used[vbo_index] = true;
@@ -3712,6 +3818,8 @@
 		memcpy(swizzle, desc->swizzle, sizeof(swizzle));
 
 		v->format_size[i] = desc->block.bits / 8;
+		v->src_offset[i] = elements[i].src_offset;
+		v->vertex_buffer_index[i] = vbo_index;
 
 		/* The hardware always treats the 2-bit alpha channel as
 		 * unsigned, so a shader workaround is needed. The affected
@@ -3804,19 +3912,35 @@
 				   S_008F0C_NUM_FORMAT(num_format) |
 				   S_008F0C_DATA_FORMAT(data_format);
 	}
-	memcpy(v->elements, elements, sizeof(struct pipe_vertex_element) * count);
-
 	return v;
 }
 
 static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_vertex_element *v = (struct si_vertex_element*)state;
+	struct si_vertex_elements *old = sctx->vertex_elements;
+	struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
 	sctx->vertex_elements = v;
 	sctx->vertex_buffers_dirty = true;
-	sctx->do_update_shaders = true;
+
+	if (v &&
+	    (!old ||
+	     old->count != v->count ||
+	     old->uses_instance_divisors != v->uses_instance_divisors ||
+	     v->uses_instance_divisors || /* we don't check which divisors changed */
+	     memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
+		sctx->do_update_shaders = true;
+
+	if (v && v->instance_divisor_is_fetched) {
+		struct pipe_constant_buffer cb;
+
+		cb.buffer = NULL;
+		cb.user_buffer = v->instance_divisors;
+		cb.buffer_offset = 0;
+		cb.buffer_size = sizeof(uint32_t) * v->count;
+		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
+	}
 }
 
 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
@@ -3842,59 +3966,23 @@
 		for (i = 0; i < count; i++) {
 			const struct pipe_vertex_buffer *src = buffers + i;
 			struct pipe_vertex_buffer *dsti = dst + i;
+			struct pipe_resource *buf = src->buffer.resource;
 
-			if (unlikely(src->user_buffer)) {
-				/* Zero-stride attribs only. */
-				assert(src->stride == 0);
-
-				/* Assume that the user_buffer comes from
-				 * gl_current_attrib, which implies it has
-				 * 4 * 8 bytes (for dvec4 attributes).
-				 *
-				 * Use const_uploader to upload into VRAM directly.
-				 */
-				u_upload_data(sctx->b.b.const_uploader, 0, 32, 32,
-					      src->user_buffer,
-					      &dsti->buffer_offset,
-					      &dsti->buffer);
-				dsti->stride = 0;
-			} else {
-				struct pipe_resource *buf = src->buffer;
-
-				pipe_resource_reference(&dsti->buffer, buf);
-				dsti->buffer_offset = src->buffer_offset;
-				dsti->stride = src->stride;
-				r600_context_add_resource_size(ctx, buf);
-				if (buf)
-					r600_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
-			}
+			pipe_resource_reference(&dsti->buffer.resource, buf);
+			dsti->buffer_offset = src->buffer_offset;
+			dsti->stride = src->stride;
+			r600_context_add_resource_size(ctx, buf);
+			if (buf)
+				r600_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
 		}
 	} else {
 		for (i = 0; i < count; i++) {
-			pipe_resource_reference(&dst[i].buffer, NULL);
+			pipe_resource_reference(&dst[i].buffer.resource, NULL);
 		}
 	}
 	sctx->vertex_buffers_dirty = true;
 }
 
-static void si_set_index_buffer(struct pipe_context *ctx,
-				const struct pipe_index_buffer *ib)
-{
-	struct si_context *sctx = (struct si_context *)ctx;
-
-	if (ib) {
-		struct pipe_resource *buf = ib->buffer;
-
-		pipe_resource_reference(&sctx->index_buffer.buffer, buf);
-	        memcpy(&sctx->index_buffer, ib, sizeof(*ib));
-		r600_context_add_resource_size(ctx, buf);
-		if (buf)
-			r600_resource(buf)->bind_history |= PIPE_BIND_INDEX_BUFFER;
-	} else {
-		pipe_resource_reference(&sctx->index_buffer.buffer, NULL);
-	}
-}
-
 /*
  * Misc
  */
@@ -3926,10 +4014,15 @@
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
-	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-			 SI_CONTEXT_INV_GLOBAL_L2 |
-			 SI_CONTEXT_FLUSH_AND_INV_CB;
-	sctx->framebuffer.do_update_surf_dirtiness = true;
+	si_update_fb_dirtiness_after_rendering(sctx);
+
+	/* Multisample surfaces are flushed in si_decompress_textures. */
+	if (sctx->framebuffer.nr_samples <= 1 &&
+	    sctx->framebuffer.state.nr_cbufs) {
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+				 SI_CONTEXT_INV_GLOBAL_L2 |
+				 SI_CONTEXT_FLUSH_AND_INV_CB;
+	}
 }
 
 /* This only ensures coherency for shader image/buffer stores. */
@@ -3966,12 +4059,19 @@
 			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 	}
 
-	if (flags & PIPE_BARRIER_FRAMEBUFFER)
+	/* MSAA color, any depth and any stencil are flushed in
+	 * si_decompress_textures when needed.
+	 */
+	if (flags & PIPE_BARRIER_FRAMEBUFFER &&
+	    sctx->framebuffer.nr_samples <= 1 &&
+	    sctx->framebuffer.state.nr_cbufs) {
 		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-				 SI_CONTEXT_FLUSH_AND_INV_DB;
+				 SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+	}
 
-	if (flags & (PIPE_BARRIER_FRAMEBUFFER |
-		     PIPE_BARRIER_INDIRECT_BUFFER))
+	/* Indirect buffers use TC L2 on GFX9, but not older hw. */
+	if (sctx->screen->b.chip_class <= VI &&
+	    flags & PIPE_BARRIER_INDIRECT_BUFFER)
 		sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 }
 
@@ -4027,8 +4127,8 @@
 
 	sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx);
 	sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE);
-	sctx->custom_blend_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS);
-	sctx->custom_blend_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR);
+	sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS);
+	sctx->custom_blend_eliminate_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR);
 	sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS);
 
 	sctx->b.b.set_clip_state = si_set_clip_state;
@@ -4049,7 +4149,6 @@
 	sctx->b.b.bind_vertex_elements_state = si_bind_vertex_elements;
 	sctx->b.b.delete_vertex_elements_state = si_delete_vertex_element;
 	sctx->b.b.set_vertex_buffers = si_set_vertex_buffers;
-	sctx->b.b.set_index_buffer = si_set_index_buffer;
 
 	sctx->b.b.texture_barrier = si_texture_barrier;
 	sctx->b.b.memory_barrier = si_memory_barrier;
@@ -4331,14 +4430,19 @@
 	si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
 
 	/* FIXME calculate these values somehow ??? */
-	si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
-	si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
+	if (sctx->b.chip_class <= VI) {
+		si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
+		si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
+	}
 	si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
 
 	si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
 	si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
 
 	si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
+	si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
+	if (sctx->b.chip_class >= GFX9)
+		si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0);
 	si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
 	if (sctx->b.chip_class < CIK)
 		si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
@@ -4482,20 +4586,20 @@
 	}
 
 	if (sctx->b.chip_class >= CIK) {
-		/* If this is 0, Bonaire can hang even if GS isn't being used.
-		 * Other chips are unaffected. These are suboptimal values,
-		 * but we don't use on-chip GS.
-		 */
-		si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
-			       S_028A44_ES_VERTS_PER_SUBGRP(64) |
-			       S_028A44_GS_PRIMS_PER_SUBGRP(4));
-
 		if (sctx->b.chip_class >= GFX9) {
 			si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_CU_EN(0xffff));
 		} else {
 			si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
 			si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
 			si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff));
+
+			/* If this is 0, Bonaire can hang even if GS isn't being used.
+			 * Other chips are unaffected. These are suboptimal values,
+			 * but we don't use on-chip GS.
+			 */
+			si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
+				       S_028A44_ES_VERTS_PER_SUBGRP(64) |
+				       S_028A44_GS_PRIMS_PER_SUBGRP(4));
 		}
 		si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 6257299..acc8fb7 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -30,6 +30,8 @@
 #include "si_pm4.h"
 #include "radeon/r600_pipe_common.h"
 
+#include "pipebuffer/pb_slab.h"
+
 #define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL+1)
 #define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE+1)
 
@@ -42,6 +44,7 @@
 
 struct si_screen;
 struct si_shader;
+struct si_shader_selector;
 
 struct si_state_blend {
 	struct si_pm4_state	pm4;
@@ -59,25 +62,25 @@
 struct si_state_rasterizer {
 	struct si_pm4_state	pm4;
 	/* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
-	struct si_pm4_state	pm4_poly_offset[3];
-	bool			flatshade;
-	bool			two_side;
-	bool			multisample_enable;
-	bool			force_persample_interp;
-	bool			line_stipple_enable;
-	unsigned		sprite_coord_enable;
+	struct si_pm4_state	*pm4_poly_offset;
 	unsigned		pa_sc_line_stipple;
 	unsigned		pa_cl_clip_cntl;
-	unsigned		clip_plane_enable;
-	bool			poly_stipple_enable;
-	bool			line_smooth;
-	bool			poly_smooth;
-	bool			uses_poly_offset;
-	bool			clamp_fragment_color;
-	bool			clamp_vertex_color;
-	bool			rasterizer_discard;
-	bool			scissor_enable;
-	bool			clip_halfz;
+	unsigned		sprite_coord_enable:8;
+	unsigned		clip_plane_enable:8;
+	unsigned		flatshade:1;
+	unsigned		two_side:1;
+	unsigned		multisample_enable:1;
+	unsigned		force_persample_interp:1;
+	unsigned		line_stipple_enable:1;
+	unsigned		poly_stipple_enable:1;
+	unsigned		line_smooth:1;
+	unsigned		poly_smooth:1;
+	unsigned		uses_poly_offset:1;
+	unsigned		clamp_fragment_color:1;
+	unsigned		clamp_vertex_color:1;
+	unsigned		rasterizer_discard:1;
+	unsigned		scissor_enable:1;
+	unsigned		clip_halfz:1;
 };
 
 struct si_dsa_stencil_ref_part {
@@ -97,17 +100,23 @@
 	struct si_dsa_stencil_ref_part	dsa_part;
 };
 
-struct si_vertex_element
+struct si_vertex_elements
 {
-	unsigned			count;
-	unsigned			first_vb_use_mask;
-	/* Vertex buffer descriptor list size aligned for optimal prefetch. */
-	unsigned			desc_list_byte_size;
-
-	uint8_t				fix_fetch[SI_MAX_ATTRIBS];
+	uint32_t			instance_divisors[SI_MAX_ATTRIBS];
 	uint32_t			rsrc_word3[SI_MAX_ATTRIBS];
-	uint32_t			format_size[SI_MAX_ATTRIBS];
-	struct pipe_vertex_element	elements[SI_MAX_ATTRIBS];
+	uint16_t			src_offset[SI_MAX_ATTRIBS];
+	uint8_t				fix_fetch[SI_MAX_ATTRIBS];
+	uint8_t				format_size[SI_MAX_ATTRIBS];
+	uint8_t				vertex_buffer_index[SI_MAX_ATTRIBS];
+
+	uint8_t				count;
+	bool				uses_instance_divisors;
+
+	uint16_t			first_vb_use_mask;
+	/* Vertex buffer descriptor list size aligned for optimal prefetch. */
+	uint16_t			desc_list_byte_size;
+	uint16_t			instance_divisor_is_one; /* bitmask of inputs */
+	uint16_t			instance_divisor_is_fetched;  /* bitmask of inputs */
 };
 
 union si_state {
@@ -164,9 +173,6 @@
 
 /* Private read-write buffer slots. */
 enum {
-	SI_HS_RING_TESS_FACTOR,
-	SI_HS_RING_TESS_OFFCHIP,
-
 	SI_ES_RING_ESGS,
 	SI_GS_RING_ESGS,
 
@@ -178,6 +184,7 @@
 	SI_VS_STREAMOUT_BUF3,
 
 	SI_HS_CONST_DEFAULT_TESS_LEVELS,
+	SI_VS_CONST_INSTANCE_DIVISORS,
 	SI_VS_CONST_CLIP_PLANES,
 	SI_PS_CONST_POLY_STIPPLE,
 	SI_PS_CONST_SAMPLE_POSITIONS,
@@ -197,11 +204,11 @@
  *  21 - compute const buffers
  *   ...
  */
-#define SI_SHADER_DESCS_CONST_BUFFERS  0
-#define SI_SHADER_DESCS_SHADER_BUFFERS 1
-#define SI_SHADER_DESCS_SAMPLERS       2
-#define SI_SHADER_DESCS_IMAGES         3
-#define SI_NUM_SHADER_DESCS            4
+enum {
+	SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+	SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+	SI_NUM_SHADER_DESCS,
+};
 
 #define SI_DESCS_RW_BUFFERS            0
 #define SI_DESCS_FIRST_SHADER          1
@@ -218,30 +225,42 @@
 	uint32_t *list;
 	/* The list in mapped GPU memory. */
 	uint32_t *gpu_list;
-	/* The size of one descriptor. */
-	unsigned element_dw_size;
-	/* The maximum number of descriptors. */
-	unsigned num_elements;
+	/* Slots that have been changed and need to be uploaded. */
+	uint64_t dirty_mask;
 
 	/* The buffer where the descriptors have been uploaded. */
 	struct r600_resource *buffer;
-	unsigned buffer_offset;
+	int buffer_offset; /* can be negative if not using lower slots */
+
+	/* The size of one descriptor. */
+	ubyte element_dw_size;
+	/* The maximum number of descriptors. */
+	ubyte num_elements;
 
 	/* Offset in CE RAM */
-	unsigned ce_offset;
+	uint16_t ce_offset;
 
-	/* elements of the list that are changed and need to be uploaded */
-	unsigned dirty_mask;
+	/* Slots allocated in CE RAM. If we get active slots outside of this
+	 * range, direct uploads to memory will be used instead. This basically
+	 * governs switching between onchip (CE) and offchip (upload) modes.
+	 */
+	ubyte first_ce_slot;
+	ubyte num_ce_slots;
+
+	/* Slots that are used by currently-bound shaders.
+	 * With CE: It determines which slots are dumped to L2.
+	 *          It doesn't skip uploads to CE RAM.
+	 * Without CE: It determines which slots are uploaded.
+	 */
+	ubyte first_active_slot;
+	ubyte num_active_slots;
 
 	/* Whether CE is used to upload this descriptor array. */
 	bool uses_ce;
-	/* Whether the CE ram is dirty and needs to be reinitialized entirely
-	 * before we can do partial updates. */
-	bool ce_ram_dirty;
 
-	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
-	 * array will be stored. */
-	unsigned shader_userdata_offset;
+	/* The SGPR index where the 64-bit pointer to the descriptor array will
+	 * be stored. */
+	ubyte shader_userdata_offset;
 };
 
 struct si_sampler_views {
@@ -253,10 +272,13 @@
 };
 
 struct si_buffer_resources {
-	enum radeon_bo_usage		shader_usage; /* READ, WRITE, or READWRITE */
-	enum radeon_bo_priority		priority;
 	struct pipe_resource		**buffers; /* this has num_buffers elements */
 
+	enum radeon_bo_usage		shader_usage:4; /* READ, WRITE, or READWRITE */
+	enum radeon_bo_usage		shader_usage_constbuf:4;
+	enum radeon_bo_priority		priority:6;
+	enum radeon_bo_priority		priority_constbuf:6;
+
 	/* The i-th bit is set if that element is enabled (non-NULL resource). */
 	unsigned			enabled_mask;
 };
@@ -283,7 +305,8 @@
 	} while(0)
 
 /* si_descriptors.c */
-void si_ce_reinitialize_all_descriptors(struct si_context *sctx);
+void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx);
+void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx);
 void si_ce_enable_loads(struct radeon_winsys_cs *ib);
 void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
 				    struct r600_texture *tex,
@@ -308,16 +331,28 @@
 bool si_upload_compute_shader_descriptors(struct si_context *sctx);
 void si_release_all_descriptors(struct si_context *sctx);
 void si_all_descriptors_begin_new_cs(struct si_context *sctx);
+void si_all_resident_buffers_begin_new_cs(struct si_context *sctx);
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
 			    const uint8_t *ptr, unsigned size, uint32_t *const_offset);
 void si_update_all_texture_descriptors(struct si_context *sctx);
 void si_shader_change_notify(struct si_context *sctx);
-void si_update_compressed_colortex_masks(struct si_context *sctx);
+void si_update_needs_color_decompress_masks(struct si_context *sctx);
 void si_emit_graphics_shader_userdata(struct si_context *sctx,
                                       struct r600_atom *atom);
 void si_emit_compute_shader_userdata(struct si_context *sctx);
 void si_set_rw_buffer(struct si_context *sctx,
 		      uint slot, const struct pipe_constant_buffer *input);
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+			       uint64_t new_active_mask);
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+					  struct si_shader_selector *sel);
+bool si_bindless_descriptor_can_reclaim_slab(void *priv,
+					     struct pb_slab_entry *entry);
+struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap,
+						  unsigned entry_size,
+						  unsigned group_index);
+void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab);
+
 /* si_state.c */
 struct si_shader_selector;
 
@@ -349,6 +384,7 @@
 			      const struct pipe_sampler_view *state,
 			      unsigned width0, unsigned height0,
 			      unsigned force_level);
+void si_update_fb_dirtiness_after_rendering(struct si_context *sctx);
 
 /* si_state_shader.c */
 bool si_update_shaders(struct si_context *sctx);
@@ -356,6 +392,9 @@
 bool si_init_shader_cache(struct si_screen *sscreen);
 void si_destroy_shader_cache(struct si_screen *sscreen);
 void si_init_shader_selector_async(void *job, int thread_index);
+void si_get_active_slot_masks(const struct tgsi_shader_info *info,
+			      uint32_t *const_and_shader_buffers,
+			      uint64_t *samplers_and_images);
 
 /* si_state_draw.c */
 void si_init_ia_multi_vgt_param_table(struct si_context *sctx);
@@ -375,4 +414,28 @@
 		return rtex->surface.u.legacy.tiling_index[level];
 }
 
+static inline unsigned si_get_constbuf_slot(unsigned slot)
+{
+	/* Constant buffer N is placed after the shader buffers: slots [16..31], ascending. */
+	return SI_NUM_SHADER_BUFFERS + slot;
+}
+
+static inline unsigned si_get_shaderbuf_slot(unsigned slot)
+{
+	/* Shader buffer N counts down from SI_NUM_SHADER_BUFFERS - 1: slots [15..0], descending. */
+	return SI_NUM_SHADER_BUFFERS - 1 - slot;
+}
+
+static inline unsigned si_get_sampler_slot(unsigned slot)
+{
+	/* Sampler N starts at offset SI_NUM_IMAGES / 2: slots [8..39], ascending. */
+	return SI_NUM_IMAGES / 2 + slot;
+}
+
+static inline unsigned si_get_image_slot(unsigned slot)
+{
+	/* Image N counts down from SI_NUM_IMAGES - 1: image slots [15..0] (sampler slots [7..0]), descending. */
+	return SI_NUM_IMAGES - 1 - slot;
+}
+
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index a62b1aa..55bf104 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -96,12 +96,13 @@
 				       unsigned *num_patches)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-	struct si_shader_ctx_state *ls = &sctx->vs_shader;
+	struct si_shader *ls_current;
+	struct si_shader_selector *ls;
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES. */
 	struct si_shader_selector *tcs =
 		sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
-	unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tcs_tes_uses_prim_id;
+	unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
 	bool has_primid_instancing_bug = sctx->b.chip_class == SI &&
 					 sctx->b.screen->info.max_se == 1;
 	unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
@@ -110,11 +111,24 @@
 	unsigned num_tcs_patch_outputs;
 	unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
 	unsigned input_patch_size, output_patch_size, output_patch0_offset;
-	unsigned perpatch_output_offset, lds_size, ls_rsrc2;
+	unsigned perpatch_output_offset, lds_size;
 	unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
 	unsigned offchip_layout, hardware_lds_size, ls_hs_config;
 
-	if (sctx->last_ls == ls->current &&
+	/* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
+	if (sctx->b.chip_class >= GFX9) {
+		if (sctx->tcs_shader.cso)
+			ls_current = sctx->tcs_shader.current;
+		else
+			ls_current = sctx->fixed_func_tcs_shader.current;
+
+		ls = ls_current->key.part.tcs.ls;
+	} else {
+		ls_current = sctx->vs_shader.current;
+		ls = sctx->vs_shader.cso;
+	}
+
+	if (sctx->last_ls == ls_current &&
 	    sctx->last_tcs == tcs &&
 	    sctx->last_tes_sh_base == tes_sh_base &&
 	    sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
@@ -124,7 +138,7 @@
 		return;
 	}
 
-	sctx->last_ls = ls->current;
+	sctx->last_ls = ls_current;
 	sctx->last_tcs = tcs;
 	sctx->last_tes_sh_base = tes_sh_base;
 	sctx->last_num_tcs_input_cp = num_tcs_input_cp;
@@ -132,7 +146,7 @@
 
 	/* This calculates how shader inputs and outputs among VS, TCS, and TES
 	 * are laid out in LDS. */
-	num_tcs_inputs = util_last_bit64(ls->cso->outputs_written);
+	num_tcs_inputs = util_last_bit64(ls->outputs_written);
 
 	if (sctx->tcs_shader.cso) {
 		num_tcs_outputs = util_last_bit64(tcs->outputs_written);
@@ -180,8 +194,14 @@
 	 */
 	*num_patches = MIN2(*num_patches, 40);
 
-	/* SI bug workaround - limit LS-HS threadgroups to only one wave. */
-	if (sctx->b.chip_class == SI) {
+	if (sctx->b.chip_class == SI ||
+	    /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
+	     * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
+	    (sctx->b.chip_class == GFX9 &&
+	     num_tcs_input_cp > num_tcs_output_cp)) {
+		/* SI bug workaround, related to power management. Limit LS-HS
+		 * threadgroups to only one wave.
+		 */
 		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
 		*num_patches = MIN2(*num_patches, one_wave);
 	}
@@ -204,27 +224,6 @@
 	output_patch0_offset = input_patch_size * *num_patches;
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
-	lds_size = output_patch0_offset + output_patch_size * *num_patches;
-	ls_rsrc2 = ls->current->config.rsrc2;
-
-	if (sctx->b.chip_class >= CIK) {
-		assert(lds_size <= 65536);
-		lds_size = align(lds_size, 512) / 512;
-	} else {
-		assert(lds_size <= 32768);
-		lds_size = align(lds_size, 256) / 256;
-	}
-	si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
-	ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
-
-	/* Due to a hw bug, RSRC2_LS must be written twice with another
-	 * LS register written in between. */
-	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
-		radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
-	radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-	radeon_emit(cs, ls->current->config.rsrc1);
-	radeon_emit(cs, ls_rsrc2);
-
 	/* Compute userdata SGPRs. */
 	assert(((input_vertex_size / 4) & ~0xff) == 0);
 	assert(((output_vertex_size / 4) & ~0xff) == 0);
@@ -241,25 +240,66 @@
 			 ((output_vertex_size / 4) << 13);
 	tcs_out_offsets = (output_patch0_offset / 16) |
 			  ((perpatch_output_offset / 16) << 16);
-	offchip_layout = (pervertex_output_patch_size * *num_patches << 16) |
-			 (num_tcs_output_cp << 9) | *num_patches;
+	offchip_layout = *num_patches |
+			 (num_tcs_output_cp << 6) |
+			 (pervertex_output_patch_size * *num_patches << 12);
 
-	/* Set them for LS. */
+	/* Compute the LDS size. */
+	lds_size = output_patch0_offset + output_patch_size * *num_patches;
+
+	if (sctx->b.chip_class >= CIK) {
+		assert(lds_size <= 65536);
+		lds_size = align(lds_size, 512) / 512;
+	} else {
+		assert(lds_size <= 32768);
+		lds_size = align(lds_size, 256) / 256;
+	}
+
+	/* Set SI_SGPR_VS_STATE_BITS. */
 	sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE &
 				  C_VS_STATE_LS_OUT_VERTEX_SIZE;
 	sctx->current_vs_state |= tcs_in_layout;
 
-	/* Set them for TCS. */
-	radeon_set_sh_reg_seq(cs,
-		R_00B430_SPI_SHADER_USER_DATA_HS_0 + SI_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
-	radeon_emit(cs, offchip_layout);
-	radeon_emit(cs, tcs_out_offsets);
-	radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
-	radeon_emit(cs, tcs_in_layout);
+	if (sctx->b.chip_class >= GFX9) {
+		unsigned hs_rsrc2 = ls_current->config.rsrc2 |
+				    S_00B42C_LDS_SIZE(lds_size);
 
-	/* Set them for TES. */
-	radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TCS_OFFCHIP_LAYOUT * 4, 1);
+		radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);
+
+		/* Set userdata SGPRs for merged LS-HS. */
+		radeon_set_sh_reg_seq(cs,
+				      R_00B430_SPI_SHADER_USER_DATA_LS_0 +
+				      GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
+		radeon_emit(cs, offchip_layout);
+		radeon_emit(cs, tcs_out_offsets);
+		radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
+	} else {
+		unsigned ls_rsrc2 = ls_current->config.rsrc2;
+
+		si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
+		ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
+
+		/* Due to a hw bug, RSRC2_LS must be written twice with another
+		 * LS register written in between. */
+		if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
+			radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
+		radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
+		radeon_emit(cs, ls_current->config.rsrc1);
+		radeon_emit(cs, ls_rsrc2);
+
+		/* Set userdata SGPRs for TCS. */
+		radeon_set_sh_reg_seq(cs,
+			R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
+		radeon_emit(cs, offchip_layout);
+		radeon_emit(cs, tcs_out_offsets);
+		radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
+		radeon_emit(cs, tcs_in_layout);
+	}
+
+	/* Set userdata SGPRs for TES. */
+	radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
 	radeon_emit(cs, offchip_layout);
+	radeon_emit(cs, r600_resource(sctx->tess_offchip_ring)->gpu_address >> 16);
 
 	ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) |
 		       S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
@@ -301,7 +341,7 @@
 
 	if (key->u.uses_tess) {
 		/* SWITCH_ON_EOI must be set if PrimID is used. */
-		if (key->u.tcs_tes_uses_prim_id)
+		if (key->u.tess_uses_prim_id)
 			ia_switch_on_eoi = true;
 
 		/* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
@@ -432,7 +472,7 @@
 		key.u.count_from_stream_output = count_from_so;
 		key.u.line_stipple_enabled = line_stipple;
 		key.u.uses_tess = uses_tess;
-		key.u.tcs_tes_uses_prim_id = tess_uses_primid;
+		key.u.tess_uses_prim_id = tess_uses_primid;
 		key.u.uses_gs = uses_gs;
 
 		sctx->ia_multi_vgt_param[key.index] =
@@ -471,7 +511,8 @@
 
 	if (sctx->gs_shader.cso) {
 		/* GS requirement. */
-		if (SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
+		if (sctx->b.chip_class <= VI &&
+		    SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
 			ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
 
 		/* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
@@ -494,7 +535,7 @@
 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-	enum pipe_prim_type rast_prim = sctx->current_rast_prim;
+	enum pipe_prim_type rast_prim = sctx->b.current_rast_prim;
 	struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
 
 	/* Skip this if not rendering lines. */
@@ -524,7 +565,7 @@
 			     const struct pipe_draw_info *info)
 {
 	sctx->current_vs_state &= C_VS_STATE_INDEXED;
-	sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->indexed);
+	sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);
 
 	if (sctx->current_vs_state != sctx->last_vs_state) {
 		struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
@@ -544,7 +585,7 @@
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
-	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
+	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->b.current_rast_prim);
 	unsigned ia_multi_vgt_param;
 
 	ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
@@ -597,8 +638,11 @@
 
 static void si_emit_draw_packets(struct si_context *sctx,
 				 const struct pipe_draw_info *info,
-				 const struct pipe_index_buffer *ib)
+				 struct pipe_resource *indexbuf,
+				 unsigned index_size,
+				 unsigned index_offset)
 {
+	struct pipe_draw_indirect_info *indirect = info->indirect;
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
 	bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
@@ -629,12 +673,12 @@
 	}
 
 	/* draw packet */
-	if (info->indexed) {
-		if (ib->index_size != sctx->last_index_size) {
+	if (index_size) {
+		if (index_size != sctx->last_index_size) {
 			unsigned index_type;
 
 			/* index type */
-			switch (ib->index_size) {
+			switch (index_size) {
 			case 1:
 				index_type = V_028A7C_VGT_INDEX_8;
 				break;
@@ -661,15 +705,15 @@
 				radeon_emit(cs, index_type);
 			}
 
-			sctx->last_index_size = ib->index_size;
+			sctx->last_index_size = index_size;
 		}
 
-		index_max_size = (ib->buffer->width0 - ib->offset) /
-				  ib->index_size;
-		index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;
+		index_max_size = (indexbuf->width0 - index_offset) /
+				  index_size;
+		index_va = r600_resource(indexbuf)->gpu_address + index_offset;
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-				      (struct r600_resource *)ib->buffer,
+				      (struct r600_resource *)indexbuf,
 				      RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
 	} else {
 		/* On CI and later, non-indexed draws overwrite VGT_INDEX_TYPE,
@@ -679,8 +723,8 @@
 			sctx->last_index_size = -1;
 	}
 
-	if (info->indirect) {
-		uint64_t indirect_va = r600_resource(info->indirect)->gpu_address;
+	if (indirect) {
+		uint64_t indirect_va = r600_resource(indirect->buffer)->gpu_address;
 
 		assert(indirect_va % 8 == 0);
 
@@ -692,15 +736,15 @@
 		radeon_emit(cs, indirect_va >> 32);
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-				      (struct r600_resource *)info->indirect,
+				      (struct r600_resource *)indirect->buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 
-		unsigned di_src_sel = info->indexed ? V_0287F0_DI_SRC_SEL_DMA
+		unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA
 						    : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
 
-		assert(info->indirect_offset % 4 == 0);
+		assert(indirect->offset % 4 == 0);
 
-		if (info->indexed) {
+		if (index_size) {
 			radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
 			radeon_emit(cs, index_va);
 			radeon_emit(cs, index_va >> 32);
@@ -710,40 +754,40 @@
 		}
 
 		if (!sctx->screen->has_draw_indirect_multi) {
-			radeon_emit(cs, PKT3(info->indexed ? PKT3_DRAW_INDEX_INDIRECT
+			radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT
 							   : PKT3_DRAW_INDIRECT,
 					     3, render_cond_bit));
-			radeon_emit(cs, info->indirect_offset);
+			radeon_emit(cs, indirect->offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, di_src_sel);
 		} else {
 			uint64_t count_va = 0;
 
-			if (info->indirect_params) {
+			if (indirect->indirect_draw_count) {
 				struct r600_resource *params_buf =
-					(struct r600_resource *)info->indirect_params;
+					(struct r600_resource *)indirect->indirect_draw_count;
 
 				radeon_add_to_buffer_list(
 					&sctx->b, &sctx->b.gfx, params_buf,
 					RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 
-				count_va = params_buf->gpu_address + info->indirect_params_offset;
+				count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset;
 			}
 
-			radeon_emit(cs, PKT3(info->indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
+			radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
 							     PKT3_DRAW_INDIRECT_MULTI,
 					     8, render_cond_bit));
-			radeon_emit(cs, info->indirect_offset);
+			radeon_emit(cs, indirect->offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) |
 					S_2C3_DRAW_INDEX_ENABLE(1) |
-					S_2C3_COUNT_INDIRECT_ENABLE(!!info->indirect_params));
-			radeon_emit(cs, info->indirect_count);
+					S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count));
+			radeon_emit(cs, indirect->draw_count);
 			radeon_emit(cs, count_va);
 			radeon_emit(cs, count_va >> 32);
-			radeon_emit(cs, info->indirect_stride);
+			radeon_emit(cs, indirect->stride);
 			radeon_emit(cs, di_src_sel);
 		}
 	} else {
@@ -753,7 +797,7 @@
 		radeon_emit(cs, info->instance_count);
 
 		/* Base vertex and start instance. */
-		base_vertex = info->indexed ? info->index_bias : info->start;
+		base_vertex = index_size ? info->index_bias : info->start;
 
 		if (base_vertex != sctx->last_base_vertex ||
 		    sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
@@ -771,8 +815,8 @@
 			sctx->last_sh_base_reg = sh_base_reg;
 		}
 
-		if (info->indexed) {
-			index_va += info->start * ib->index_size;
+		if (index_size) {
+			index_va += info->start * index_size;
 
 			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
 			radeon_emit(cs, index_max_size);
@@ -821,9 +865,10 @@
 	uint32_t flush_cb_db = rctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
 					      SI_CONTEXT_FLUSH_AND_INV_DB);
 
-	if (rctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
-			   SI_CONTEXT_FLUSH_AND_INV_DB))
-		sctx->b.num_fb_cache_flushes++;
+	if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB)
+		sctx->b.num_cb_cache_flushes++;
+	if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+		sctx->b.num_db_cache_flushes++;
 
 	/* SI has a bug that it always flushes ICACHE and KCACHE if either
 	 * bit is set. An alternative way is to write SQC_CACHES, but that
@@ -865,7 +910,8 @@
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
 	}
-	if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
+	if (rctx->flags & (SI_CONTEXT_FLUSH_AND_INV_DB |
+			   SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
 		/* Flush HTILE. SURFACE_SYNC will wait for idle. */
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
@@ -913,9 +959,8 @@
 	 * wait for idle on GFX9. We have to use a TS event.
 	 */
 	if (sctx->b.chip_class >= GFX9 && flush_cb_db) {
-		struct r600_resource *rbuf = NULL;
 		uint64_t va;
-		unsigned offset = 0, tc_flags, cb_db_event;
+		unsigned tc_flags, cb_db_event;
 
 		/* Set the CB/DB flush event. */
 		switch (flush_cb_db) {
@@ -931,38 +976,33 @@
 		}
 
 		/* TC    | TC_WB         = invalidate L2 data
-		 * TC_MD | TC_WB         = invalidate L2 metadata
+		 * TC_MD | TC_WB         = invalidate L2 metadata (DCC, etc.)
 		 * TC    | TC_WB | TC_MD = invalidate L2 data & metadata
-		 *
-		 * The metadata cache must always be invalidated for coherency
-		 * between CB/DB and shaders. (metadata = HTILE, CMASK, DCC)
-		 *
-		 * TC must be invalidated on GFX9 only if the CB/DB surface is
-		 * not pipe-aligned. If the surface is RB-aligned, it might not
-		 * strictly be pipe-aligned since RB alignment takes precendence.
 		 */
-		tc_flags = EVENT_TC_WB_ACTION_ENA |
-			   EVENT_TC_MD_ACTION_ENA;
+		tc_flags = 0;
 
 		/* Ideally flush TC together with CB/DB. */
 		if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
 			tc_flags |= EVENT_TC_ACTION_ENA |
+				    EVENT_TC_WB_ACTION_ENA |
 				    EVENT_TCL1_ACTION_ENA;
 
 			/* Clear the flags. */
 			rctx->flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 |
 					 SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
 					 SI_CONTEXT_INV_VMEM_L1);
+			sctx->b.num_L2_invalidates++;
 		}
 
-		/* Allocate memory for the fence. */
-		u_suballocator_alloc(rctx->allocator_zeroed_memory, 4, 4,
-				     &offset, (struct pipe_resource**)&rbuf);
-		va = rbuf->gpu_address + offset;
+		/* Do the flush (enqueue the event and wait for it). */
+		va = sctx->wait_mem_scratch->gpu_address;
+		sctx->wait_mem_number++;
 
 		r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, 1,
-					 rbuf, va, 0, 1);
-		r600_gfx_wait_fence(rctx, va, 1, 0xffffffff);
+					 sctx->wait_mem_scratch, va,
+					 sctx->wait_mem_number - 1,
+					 sctx->wait_mem_number);
+		r600_gfx_wait_fence(rctx, va, sctx->wait_mem_number, 0xffffffff);
 	}
 
 	/* Make sure ME is idle (it executes most packets) before continuing.
@@ -1044,17 +1084,19 @@
 				    const struct pipe_draw_info *info,
 				    unsigned *start, unsigned *count)
 {
-	if (info->indirect) {
+	struct pipe_draw_indirect_info *indirect = info->indirect;
+
+	if (indirect) {
 		unsigned indirect_count;
 		struct pipe_transfer *transfer;
 		unsigned begin, end;
 		unsigned map_size;
 		unsigned *data;
 
-		if (info->indirect_params) {
+		if (indirect->indirect_draw_count) {
 			data = pipe_buffer_map_range(&sctx->b.b,
-					info->indirect_params,
-					info->indirect_params_offset,
+					indirect->indirect_draw_count,
+					indirect->indirect_draw_count_offset,
 					sizeof(unsigned),
 					PIPE_TRANSFER_READ, &transfer);
 
@@ -1062,7 +1104,7 @@
 
 			pipe_buffer_unmap(&sctx->b.b, transfer);
 		} else {
-			indirect_count = info->indirect_count;
+			indirect_count = indirect->draw_count;
 		}
 
 		if (!indirect_count) {
@@ -1070,9 +1112,9 @@
 			return;
 		}
 
-		map_size = (indirect_count - 1) * info->indirect_stride + 3 * sizeof(unsigned);
-		data = pipe_buffer_map_range(&sctx->b.b, info->indirect,
-					     info->indirect_offset, map_size,
+		map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned);
+		data = pipe_buffer_map_range(&sctx->b.b, indirect->buffer,
+					     indirect->offset, map_size,
 					     PIPE_TRANSFER_READ, &transfer);
 
 		begin = UINT_MAX;
@@ -1087,7 +1129,7 @@
 				end = MAX2(end, start + count);
 			}
 
-			data += info->indirect_stride / sizeof(unsigned);
+			data += indirect->stride / sizeof(unsigned);
 		}
 
 		pipe_buffer_unmap(&sctx->b.b, transfer);
@@ -1129,11 +1171,12 @@
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-	const struct pipe_index_buffer *ib = &sctx->index_buffer;
-	struct pipe_index_buffer ib_tmp; /* for index buffer uploads only */
+	struct pipe_resource *indexbuf = info->index.resource;
 	unsigned mask, dirty_tex_counter;
 	enum pipe_prim_type rast_prim;
 	unsigned num_patches = 0;
+	unsigned index_size = info->index_size;
+	unsigned index_offset = info->indirect ? info->start * index_size : 0;
 
 	if (likely(!info->indirect)) {
 		/* SI-CI treat instance_count==0 as instance_count==1. There is
@@ -1145,7 +1188,7 @@
 
 		/* Handle count == 0. */
 		if (unlikely(!info->count &&
-			     (info->indexed || !info->count_from_stream_output)))
+			     (index_size || !info->count_from_stream_output)))
 			return;
 	}
 
@@ -1169,7 +1212,6 @@
 		sctx->framebuffer.dirty_cbufs |=
 			((1 << sctx->framebuffer.state.nr_cbufs) - 1);
 		sctx->framebuffer.dirty_zsbuf = true;
-		sctx->framebuffer.do_update_surf_dirtiness = true;
 		si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
 		si_update_all_texture_descriptors(sctx);
 	}
@@ -1183,13 +1225,23 @@
 	 * current_rast_prim for this draw_vbo call. */
 	if (sctx->gs_shader.cso)
 		rast_prim = sctx->gs_shader.cso->gs_output_prim;
-	else if (sctx->tes_shader.cso)
-		rast_prim = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
-	else
+	else if (sctx->tes_shader.cso) {
+		if (sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_POINT_MODE])
+			rast_prim = PIPE_PRIM_POINTS;
+		else
+			rast_prim = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+	} else
 		rast_prim = info->mode;
 
-	if (rast_prim != sctx->current_rast_prim) {
-		sctx->current_rast_prim = rast_prim;
+	if (rast_prim != sctx->b.current_rast_prim) {
+		bool old_is_poly = sctx->b.current_rast_prim >= PIPE_PRIM_TRIANGLES;
+		bool new_is_poly = rast_prim >= PIPE_PRIM_TRIANGLES;
+		if (old_is_poly != new_is_poly) {
+			sctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
+			si_set_atom_dirty(sctx, &sctx->b.scissors.atom, true);
+		}
+
+		sctx->b.current_rast_prim = rast_prim;
 		sctx->do_update_shaders = true;
 	}
 
@@ -1217,74 +1269,76 @@
 	if (!si_upload_graphics_shader_descriptors(sctx))
 		return;
 
-	ib_tmp.buffer = NULL;
-
-	if (info->indexed) {
+	if (index_size) {
 		/* Translate or upload, if needed. */
 		/* 8-bit indices are supported on VI. */
-		if (sctx->b.chip_class <= CIK && ib->index_size == 1) {
-			unsigned start, count, start_offset, size;
+		if (sctx->b.chip_class <= CIK && index_size == 1) {
+			unsigned start, count, start_offset, size, offset;
 			void *ptr;
 
 			si_get_draw_start_count(sctx, info, &start, &count);
 			start_offset = start * 2;
 			size = count * 2;
 
+			indexbuf = NULL;
 			u_upload_alloc(ctx->stream_uploader, start_offset,
 				       size,
 				       si_optimal_tcc_alignment(sctx, size),
-				       &ib_tmp.offset, &ib_tmp.buffer, &ptr);
-			if (!ib_tmp.buffer)
+				       &offset, &indexbuf, &ptr);
+			if (!indexbuf)
 				return;
 
-			util_shorten_ubyte_elts_to_userptr(&sctx->b.b, ib, 0, 0,
-							   ib->offset + start,
+			util_shorten_ubyte_elts_to_userptr(&sctx->b.b, info, 0, 0,
+							   index_offset + start,
 							   count, ptr);
 
 			/* info->start will be added by the drawing code */
-			ib_tmp.offset -= start_offset;
-			ib_tmp.index_size = 2;
-			ib = &ib_tmp;
-		} else if (ib->user_buffer && !ib->buffer) {
+			index_offset = offset - start_offset;
+			index_size = 2;
+		} else if (info->has_user_indices) {
 			unsigned start_offset;
 
 			assert(!info->indirect);
-			start_offset = info->start * ib->index_size;
+			start_offset = info->start * index_size;
 
+			indexbuf = NULL;
 			u_upload_data(ctx->stream_uploader, start_offset,
-				      info->count * ib->index_size,
+				      info->count * index_size,
 				      sctx->screen->b.info.tcc_cache_line_size,
-				      (char*)ib->user_buffer + start_offset,
-				      &ib_tmp.offset, &ib_tmp.buffer);
-			if (!ib_tmp.buffer)
+				      (char*)info->index.user + start_offset,
+				      &index_offset, &indexbuf);
+			if (!indexbuf)
 				return;
 
 			/* info->start will be added by the drawing code */
-			ib_tmp.offset -= start_offset;
-			ib_tmp.index_size = ib->index_size;
-			ib = &ib_tmp;
+			index_offset -= start_offset;
 		} else if (sctx->b.chip_class <= CIK &&
-			   r600_resource(ib->buffer)->TC_L2_dirty) {
+			   r600_resource(indexbuf)->TC_L2_dirty) {
 			/* VI reads index buffers through TC L2, so it doesn't
 			 * need this. */
 			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-			r600_resource(ib->buffer)->TC_L2_dirty = false;
+			r600_resource(indexbuf)->TC_L2_dirty = false;
 		}
 	}
 
 	if (info->indirect) {
+		struct pipe_draw_indirect_info *indirect = info->indirect;
+
 		/* Add the buffer size for memory checking in need_cs_space. */
-		r600_context_add_resource_size(ctx, info->indirect);
+		r600_context_add_resource_size(ctx, indirect->buffer);
 
-		if (r600_resource(info->indirect)->TC_L2_dirty) {
-			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-			r600_resource(info->indirect)->TC_L2_dirty = false;
-		}
+		/* Indirect buffers use TC L2 on GFX9, but not older hw. */
+		if (sctx->b.chip_class <= VI) {
+			if (r600_resource(indirect->buffer)->TC_L2_dirty) {
+				sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+				r600_resource(indirect->buffer)->TC_L2_dirty = false;
+			}
 
-		if (info->indirect_params &&
-		    r600_resource(info->indirect_params)->TC_L2_dirty) {
-			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-			r600_resource(info->indirect_params)->TC_L2_dirty = false;
+			if (indirect->indirect_draw_count &&
+			    r600_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
+				sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+				r600_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
+			}
 		}
 	}
 
@@ -1337,7 +1391,7 @@
 	si_emit_draw_registers(sctx, info, num_patches);
 
 	si_ce_pre_draw_synchronization(sctx);
-	si_emit_draw_packets(sctx, info, ib);
+	si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
 	si_ce_post_draw_synchronization(sctx);
 
 	if (sctx->trace_buf)
@@ -1352,41 +1406,13 @@
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
 	}
 
-	if (sctx->framebuffer.do_update_surf_dirtiness) {
-		/* Set the depth buffer as dirty. */
-		if (sctx->framebuffer.state.zsbuf) {
-			struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
-			struct r600_texture *rtex = (struct r600_texture *)surf->texture;
-
-			if (!rtex->tc_compatible_htile)
-				rtex->dirty_level_mask |= 1 << surf->u.tex.level;
-
-			if (rtex->surface.flags & RADEON_SURF_SBUFFER)
-				rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
-		}
-		if (sctx->framebuffer.compressed_cb_mask) {
-			struct pipe_surface *surf;
-			struct r600_texture *rtex;
-			unsigned mask = sctx->framebuffer.compressed_cb_mask;
-
-			do {
-				unsigned i = u_bit_scan(&mask);
-				surf = sctx->framebuffer.state.cbufs[i];
-				rtex = (struct r600_texture*)surf->texture;
-
-				if (rtex->fmask.size)
-					rtex->dirty_level_mask |= 1 << surf->u.tex.level;
-				if (rtex->dcc_gather_statistics)
-					rtex->separate_dcc_dirty = true;
-			} while (mask);
-		}
-		sctx->framebuffer.do_update_surf_dirtiness = false;
-	}
-
-	pipe_resource_reference(&ib_tmp.buffer, NULL);
 	sctx->b.num_draw_calls++;
+	if (info->primitive_restart)
+		sctx->b.num_prim_restart_calls++;
 	if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
 		sctx->b.num_spill_draw_calls++;
+	if (index_size && indexbuf != info->index.resource)
+		pipe_resource_reference(&indexbuf, NULL);
 }
 
 void si_trace_emit(struct si_context *sctx)
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index f241390..63fd3ff 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -39,6 +39,7 @@
 
 #include "util/disk_cache.h"
 #include "util/mesa-sha1.h"
+#include "ac_exp_param.h"
 
 /* SHADER_CACHE */
 
@@ -323,10 +324,10 @@
 /* SHADER STATES */
 
 static void si_set_tesseval_regs(struct si_screen *sscreen,
-				 struct si_shader *shader,
+				 struct si_shader_selector *tes,
 				 struct si_pm4_state *pm4)
 {
-	struct tgsi_shader_info *info = &shader->selector->info;
+	struct tgsi_shader_info *info = &tes->info;
 	unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
 	unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
 	bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
@@ -400,26 +401,29 @@
  *     VS as ES | ES -> GS -> VS             | 30
  *    TES as VS | LS -> HS -> VS             | 14 or 30
  *    TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30
+ *
+ * If "shader" is NULL, it is assumed to be neither an LS nor a GS copy shader.
  */
 static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen,
+					 struct si_shader_selector *sel,
 					 struct si_shader *shader,
 					 struct si_pm4_state *pm4)
 {
-	unsigned type = shader->selector->type;
+	unsigned type = sel->type;
 
 	if (sscreen->b.family < CHIP_POLARIS10)
 		return;
 
 	/* VS as VS, or VS as ES: */
 	if ((type == PIPE_SHADER_VERTEX &&
-	     !shader->key.as_ls &&
-	     !shader->is_gs_copy_shader) ||
+	     (!shader ||
+	      (!shader->key.as_ls && !shader->is_gs_copy_shader))) ||
 	    /* TES as VS, or TES as ES: */
 	    type == PIPE_SHADER_TESS_EVAL) {
 		unsigned vtx_reuse_depth = 30;
 
 		if (type == PIPE_SHADER_TESS_EVAL &&
-		    shader->selector->info.properties[TGSI_PROPERTY_TES_SPACING] ==
+		    sel->info.properties[TGSI_PROPERTY_TES_SPACING] ==
 		    PIPE_TESS_SPACING_FRACTIONAL_ODD)
 			vtx_reuse_depth = 14;
 
@@ -454,8 +458,10 @@
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
 	/* We need at least 2 components for LS.
-	 * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
-	vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
+	 * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
+	 * StepRate0 is set to 1, so that VGPR3 doesn't have to be loaded.
+	 */
+	vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1;
 
 	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
@@ -473,6 +479,7 @@
 {
 	struct si_pm4_state *pm4;
 	uint64_t va;
+	unsigned ls_vgpr_comp_cnt = 0;
 
 	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
@@ -481,17 +488,41 @@
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
-	si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
-	si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
+	if (sscreen->b.chip_class >= GFX9) {
+		si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
+		si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, va >> 40);
+
+		/* We need at least 2 components for LS.
+		 * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
+		 * StepRate0 is set to 1, so that VGPR3 doesn't have to be loaded.
+		 */
+		ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1;
+
+		shader->config.rsrc2 =
+			S_00B42C_USER_SGPR(GFX9_TCS_NUM_USER_SGPR) |
+			S_00B42C_USER_SGPR_MSB(GFX9_TCS_NUM_USER_SGPR >> 5) |
+			S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+	} else {
+		si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
+		si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
+
+		shader->config.rsrc2 =
+			S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) |
+			S_00B42C_OC_LDS_EN(1) |
+			S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+	}
+
 	si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
 		       S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) |
 		       S_00B428_DX10_CLAMP(1) |
-		       S_00B428_FLOAT_MODE(shader->config.float_mode));
-	si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
-		       S_00B42C_USER_SGPR(SI_TCS_NUM_USER_SGPR) |
-		       S_00B42C_OC_LDS_EN(sscreen->b.chip_class <= VI) |
-		       S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+		       S_00B428_FLOAT_MODE(shader->config.float_mode) |
+		       S_00B428_LS_VGPR_COMP_CNT(ls_vgpr_comp_cnt));
+
+	if (sscreen->b.chip_class <= VI) {
+		si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
+			       shader->config.rsrc2);
+	}
 }
 
 static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
@@ -512,10 +543,11 @@
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
 	if (shader->selector->type == PIPE_SHADER_VERTEX) {
-		vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
+		/* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
+		vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0;
 		num_user_sgprs = SI_VS_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
-		vgpr_comp_cnt = 3; /* all components are needed for TES */
+		vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
 		num_user_sgprs = SI_TES_NUM_USER_SGPR;
 	} else
 		unreachable("invalid shader selector type");
@@ -538,9 +570,9 @@
 		       S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 
 	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
-		si_set_tesseval_regs(sscreen, shader, pm4);
+		si_set_tesseval_regs(sscreen, shader->selector, pm4);
 
-	polaris_set_vgt_vertex_reuse(sscreen, shader, pm4);
+	polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
 }
 
 /**
@@ -549,6 +581,7 @@
  */
 static uint32_t si_vgt_gs_mode(struct si_shader_selector *sel)
 {
+	enum chip_class chip_class = sel->screen->b.chip_class;
 	unsigned gs_max_vert_out = sel->gs_max_out_vertices;
 	unsigned cut_mode;
 
@@ -565,11 +598,120 @@
 
 	return S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
 	       S_028A40_CUT_MODE(cut_mode)|
-	       S_028A40_ES_WRITE_OPTIMIZE(1) |
-	       S_028A40_GS_WRITE_OPTIMIZE(1);
+	       S_028A40_ES_WRITE_OPTIMIZE(chip_class <= VI) |
+	       S_028A40_GS_WRITE_OPTIMIZE(1) |
+	       S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
 }
 
-static void si_shader_gs(struct si_shader *shader)
+struct gfx9_gs_info {
+	unsigned es_verts_per_subgroup;
+	unsigned gs_prims_per_subgroup;
+	unsigned gs_inst_prims_in_subgroup;
+	unsigned max_prims_per_subgroup;
+	unsigned lds_size;
+};
+
+static void gfx9_get_gs_info(struct si_shader_selector *es,
+				   struct si_shader_selector *gs,
+				   struct gfx9_gs_info *out)
+{
+	unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1);
+	unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
+	bool uses_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY &&
+			      input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
+
+	/* All these are in dwords: */
+	/* We can't allow using the whole LDS, because GS waves compete with
+	 * other shader stages for LDS space. */
+	const unsigned max_lds_size = 8 * 1024;
+	const unsigned esgs_itemsize = es->esgs_itemsize / 4;
+	unsigned esgs_lds_size;
+
+	/* All these are per subgroup: */
+	const unsigned max_out_prims = 32 * 1024;
+	const unsigned max_es_verts = 255;
+	const unsigned ideal_gs_prims = 64;
+	unsigned max_gs_prims, gs_prims;
+	unsigned min_es_verts, es_verts, worst_case_es_verts;
+
+	assert(gs_num_invocations <= 32); /* GL maximum */
+
+	if (uses_adjacency || gs_num_invocations > 1)
+		max_gs_prims = 127 / gs_num_invocations;
+	else
+		max_gs_prims = 255;
+
+	/* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
+	 * Make sure we don't go over the maximum value.
+	 */
+	if (gs->gs_max_out_vertices > 0) {
+		max_gs_prims = MIN2(max_gs_prims,
+				    max_out_prims /
+				    (gs->gs_max_out_vertices * gs_num_invocations));
+	}
+	assert(max_gs_prims > 0);
+
+	/* If the primitive has adjacency, halve the number of vertices
+	 * that will be reused in multiple primitives.
+	 */
+	min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1);
+
+	gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
+	worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+	/* Compute ESGS LDS size based on the worst case number of ES vertices
+	 * needed to create the target number of GS prims per subgroup.
+	 */
+	esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+
+	/* If total LDS usage is too big, refactor partitions based on ratio
+	 * of ESGS item sizes.
+	 */
+	if (esgs_lds_size > max_lds_size) {
+		/* Our target GS Prims Per Subgroup was too large. Calculate
+		 * the maximum number of GS Prims Per Subgroup that will fit
+		 * into LDS, capped by the maximum that the hardware can support.
+		 */
+		gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)),
+				max_gs_prims);
+		assert(gs_prims > 0);
+		worst_case_es_verts = MIN2(min_es_verts * gs_prims,
+					   max_es_verts);
+
+		esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+		assert(esgs_lds_size <= max_lds_size);
+	}
+
+	/* Now calculate remaining ESGS information. */
+	if (esgs_lds_size)
+		es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
+	else
+		es_verts = max_es_verts;
+
+	/* Vertices for adjacency primitives are not always reused, so restore
+	 * it for ES_VERTS_PER_SUBGRP.
+	 */
+	min_es_verts = gs->gs_input_verts_per_prim;
+
+	/* For normal primitives, the VGT only checks if they are past the ES
+	 * verts per subgroup after allocating a full GS primitive and if they
+	 * are, kick off a new subgroup.  But if those additional ES verts are
+	 * unique (e.g. not reused) we need to make sure there is enough LDS
+	 * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
+	 */
+	es_verts -= min_es_verts - 1;
+
+	out->es_verts_per_subgroup = es_verts;
+	out->gs_prims_per_subgroup = gs_prims;
+	out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
+	out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup *
+				      gs->gs_max_out_vertices;
+	out->lds_size = align(esgs_lds_size, 128) / 128;
+
+	assert(out->max_prims_per_subgroup <= max_out_prims);
+}
+
+static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 {
 	struct si_shader_selector *sel = shader->selector;
 	const ubyte *num_components = sel->info.num_stream_output_components;
@@ -598,7 +740,7 @@
 	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
 	assert(offset < (1 << 15));
 
-	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, shader->selector->gs_max_out_vertices);
+	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, sel->gs_max_out_vertices);
 
 	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]);
 	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0);
@@ -611,17 +753,79 @@
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-	si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
-	si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
 
-	si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-		       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
-		       S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
-		       S_00B228_DX10_CLAMP(1) |
-		       S_00B228_FLOAT_MODE(shader->config.float_mode));
-	si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
-		       S_00B22C_USER_SGPR(SI_GS_NUM_USER_SGPR) |
-		       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+	if (sscreen->b.chip_class >= GFX9) {
+		unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
+		unsigned es_type = shader->key.part.gs.es->type;
+		unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
+		struct gfx9_gs_info gs_info;
+
+		if (es_type == PIPE_SHADER_VERTEX)
+			/* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
+			es_vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0;
+		else if (es_type == PIPE_SHADER_TESS_EVAL)
+			es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
+		else
+			unreachable("invalid shader selector type");
+
+		/* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
+		 * VGPR[0:4] are always loaded.
+		 */
+		if (sel->info.uses_invocationid)
+			gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
+		else if (sel->info.uses_primid)
+			gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
+		else if (input_prim >= PIPE_PRIM_TRIANGLES)
+			gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
+		else
+			gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+
+		gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info);
+
+		si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
+		si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, va >> 40);
+
+		si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
+			       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
+			       S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
+			       S_00B228_DX10_CLAMP(1) |
+			       S_00B228_FLOAT_MODE(shader->config.float_mode) |
+			       S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
+		si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+			       S_00B22C_USER_SGPR(GFX9_GS_NUM_USER_SGPR) |
+			       S_00B22C_USER_SGPR_MSB(GFX9_GS_NUM_USER_SGPR >> 5) |
+			       S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+			       S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
+			       S_00B22C_LDS_SIZE(gs_info.lds_size) |
+			       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+
+		si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
+			       S_028A44_ES_VERTS_PER_SUBGRP(gs_info.es_verts_per_subgroup) |
+			       S_028A44_GS_PRIMS_PER_SUBGRP(gs_info.gs_prims_per_subgroup) |
+			       S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_info.gs_inst_prims_in_subgroup));
+		si_pm4_set_reg(pm4, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+			       S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info.max_prims_per_subgroup));
+		si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+			       shader->key.part.gs.es->esgs_itemsize / 4);
+
+		if (es_type == PIPE_SHADER_TESS_EVAL)
+			si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
+
+		polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es,
+					     NULL, pm4);
+	} else {
+		si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
+		si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
+
+		si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
+			       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
+			       S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
+			       S_00B228_DX10_CLAMP(1) |
+			       S_00B228_FLOAT_MODE(shader->config.float_mode));
+		si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+			       S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
+			       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+	}
 }
 
 /**
@@ -634,14 +838,15 @@
 static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                          struct si_shader_selector *gs)
 {
+	const struct tgsi_shader_info *info = &shader->selector->info;
 	struct si_pm4_state *pm4;
 	unsigned num_user_sgprs;
 	unsigned nparams, vgpr_comp_cnt;
 	uint64_t va;
 	unsigned oc_lds_en;
 	unsigned window_space =
-	   shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-	bool enable_prim_id = si_vs_exports_prim_id(shader);
+	   info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+	bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid;
 
 	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
@@ -672,6 +877,12 @@
 		si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0);
 	}
 
+	if (sscreen->b.chip_class <= VI) {
+		/* Reuse needs to be set off if we write oViewport. */
+		si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF,
+			       S_028AB4_REUSE_OFF(info->writes_viewport_index));
+	}
+
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
@@ -679,10 +890,14 @@
 		vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
 		num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_VERTEX) {
-		vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
+		/* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID)
+		 * If PrimID is disabled, InstanceID / StepRate1 is loaded instead.
+		 * StepRate0 is set to 1, so that VGPR3 doesn't have to be loaded.
+		 */
+		vgpr_comp_cnt = enable_prim_id ? 2 : (shader->info.uses_instanceid ? 1 : 0);
 		num_user_sgprs = SI_VS_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
-		vgpr_comp_cnt = 3; /* all components are needed for TES */
+		vgpr_comp_cnt = enable_prim_id ? 3 : 2;
 		num_user_sgprs = SI_TES_NUM_USER_SGPR;
 	} else
 		unreachable("invalid shader selector type");
@@ -734,9 +949,9 @@
 			       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
 
 	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
-		si_set_tesseval_regs(sscreen, shader, pm4);
+		si_set_tesseval_regs(sscreen, shader->selector, pm4);
 
-	polaris_set_vgt_vertex_reuse(sscreen, shader, pm4);
+	polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
 }
 
 static unsigned si_get_ps_num_interp(struct si_shader *ps)
@@ -954,7 +1169,7 @@
 			si_shader_vs(sscreen, shader, NULL);
 		break;
 	case PIPE_SHADER_GEOMETRY:
-		si_shader_gs(shader);
+		si_shader_gs(sscreen, shader);
 		break;
 	case PIPE_SHADER_FRAGMENT:
 		si_shader_ps(shader);
@@ -973,13 +1188,36 @@
 	return PIPE_FUNC_ALWAYS;
 }
 
+static void si_shader_selector_key_vs(struct si_context *sctx,
+				      struct si_shader_selector *vs,
+				      struct si_shader_key *key,
+				      struct si_vs_prolog_bits *prolog_key)
+{
+	if (!sctx->vertex_elements)
+		return;
+
+	prolog_key->instance_divisor_is_one =
+		sctx->vertex_elements->instance_divisor_is_one;
+	prolog_key->instance_divisor_is_fetched =
+		sctx->vertex_elements->instance_divisor_is_fetched;
+
+	/* Prefer a monolithic shader to allow scheduling divisions around
+	 * VBO loads. */
+	if (prolog_key->instance_divisor_is_fetched)
+		key->opt.prefer_mono = 1;
+
+	unsigned count = MIN2(vs->info.num_inputs,
+			      sctx->vertex_elements->count);
+	memcpy(key->mono.vs_fix_fetch, sctx->vertex_elements->fix_fetch, count);
+}
+
 static void si_shader_selector_key_hw_vs(struct si_context *sctx,
 					 struct si_shader_selector *vs,
 					 struct si_shader_key *key)
 {
 	struct si_shader_selector *ps = sctx->ps_shader.cso;
 
-	key->opt.hw_vs.clip_disable =
+	key->opt.clip_disable =
 		sctx->queued.named.rasterizer->clip_plane_enable == 0 &&
 		(vs->info.clipdist_writemask ||
 		 vs->info.writes_clipvertex) &&
@@ -1007,22 +1245,19 @@
 
 	/* Find out which VS outputs aren't used by the PS. */
 	uint64_t outputs_written = vs->outputs_written;
-	uint32_t outputs_written2 = vs->outputs_written2;
 	uint64_t inputs_read = 0;
-	uint32_t inputs_read2 = 0;
 
-	outputs_written &= ~0x3; /* ignore POSITION, PSIZE */
+	/* ignore POSITION, PSIZE */
+	outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0) |
+			     (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0))));
 
 	if (!ps_disabled) {
 		inputs_read = ps->inputs_read;
-		inputs_read2 = ps->inputs_read2;
 	}
 
 	uint64_t linked = outputs_written & inputs_read;
-	uint32_t linked2 = outputs_written2 & inputs_read2;
 
-	key->opt.hw_vs.kill_outputs = ~linked & outputs_written;
-	key->opt.hw_vs.kill_outputs2 = ~linked2 & outputs_written2;
+	key->opt.kill_outputs = ~linked & outputs_written;
 }
 
 /* Compute the key for the hw shader variant */
@@ -1031,22 +1266,13 @@
 					  struct si_shader_key *key)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	unsigned i;
 
 	memset(key, 0, sizeof(*key));
 
 	switch (sel->type) {
 	case PIPE_SHADER_VERTEX:
-		if (sctx->vertex_elements) {
-			unsigned count = MIN2(sel->info.num_inputs,
-					      sctx->vertex_elements->count);
-			for (i = 0; i < count; ++i)
-				key->part.vs.prolog.instance_divisors[i] =
-					sctx->vertex_elements->elements[i].instance_divisor;
+		si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog);
 
-			memcpy(key->mono.vs.fix_fetch,
-			       sctx->vertex_elements->fix_fetch, count);
-		}
 		if (sctx->tes_shader.cso)
 			key->as_ls = 1;
 		else if (sctx->gs_shader.cso)
@@ -1055,17 +1281,23 @@
 			si_shader_selector_key_hw_vs(sctx, sel, key);
 
 			if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-				key->part.vs.epilog.export_prim_id = 1;
+				key->mono.u.vs_export_prim_id = 1;
 		}
 		break;
 	case PIPE_SHADER_TESS_CTRL:
+		if (sctx->b.chip_class >= GFX9) {
+			si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
+						  key, &key->part.tcs.ls_prolog);
+			key->part.tcs.ls = sctx->vs_shader.cso;
+		}
+
 		key->part.tcs.epilog.prim_mode =
 			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 		key->part.tcs.epilog.tes_reads_tess_factors =
 			sctx->tes_shader.cso->info.reads_tess_factors;
 
 		if (sel == sctx->fixed_func_tcs_shader.cso)
-			key->mono.tcs.inputs_to_copy = sctx->vs_shader.cso->outputs_written;
+			key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written;
 		break;
 	case PIPE_SHADER_TESS_EVAL:
 		if (sctx->gs_shader.cso)
@@ -1074,10 +1306,38 @@
 			si_shader_selector_key_hw_vs(sctx, sel, key);
 
 			if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-				key->part.tes.epilog.export_prim_id = 1;
+				key->mono.u.vs_export_prim_id = 1;
 		}
 		break;
 	case PIPE_SHADER_GEOMETRY:
+		if (sctx->b.chip_class >= GFX9) {
+			if (sctx->tes_shader.cso) {
+				key->part.gs.es = sctx->tes_shader.cso;
+			} else {
+				si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
+							  key, &key->part.gs.vs_prolog);
+				key->part.gs.es = sctx->vs_shader.cso;
+			}
+
+			/* Merged ES-GS can have unbalanced wave usage.
+			 *
+			 * ES threads are per-vertex, while GS threads are
+			 * per-primitive. So without any amplification, there
+			 * are fewer GS threads than ES threads, which can result
+			 * in empty (no-op) GS waves. With too much amplification,
+			 * there are more GS threads than ES threads, which
+			 * can result in empty (no-op) ES waves.
+			 *
+			 * Non-monolithic shaders are implemented by setting EXEC
+			 * at the beginning of shader parts, and don't jump to
+			 * the end if EXEC is 0.
+			 *
+			 * Monolithic shaders use conditional blocks, so they can
+			 * jump and skip empty waves of ES or GS. So set this to
+			 * always use optimized variants, which are monolithic.
+			 */
+			key->opt.prefer_mono = 1;
+		}
 		key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
 		break;
 	case PIPE_SHADER_FRAGMENT: {
@@ -1135,10 +1395,10 @@
 		}
 
 		if (rs) {
-			bool is_poly = (sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES &&
-					sctx->current_rast_prim <= PIPE_PRIM_POLYGON) ||
-				       sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY;
-			bool is_line = !is_poly && sctx->current_rast_prim != PIPE_PRIM_POINTS;
+			bool is_poly = (sctx->b.current_rast_prim >= PIPE_PRIM_TRIANGLES &&
+					sctx->b.current_rast_prim <= PIPE_PRIM_POLYGON) ||
+				       sctx->b.current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY;
+			bool is_line = !is_poly && sctx->b.current_rast_prim != PIPE_PRIM_POINTS;
 
 			key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
 			key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
@@ -1154,6 +1414,12 @@
 							     sctx->framebuffer.nr_samples <= 1;
 			key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
 
+			if (sctx->ps_iter_samples > 1 &&
+			    sel->info.reads_samplemask) {
+				key->part.ps.prolog.samplemask_log_ps_iter =
+					util_logbase2(util_next_power_of_two(sctx->ps_iter_samples));
+			}
+
 			if (rs->force_persample_interp &&
 			    rs->multisample_enable &&
 			    sctx->framebuffer.nr_samples > 1 &&
@@ -1194,11 +1460,15 @@
 	default:
 		assert(0);
 	}
+
+	if (unlikely(sctx->screen->b.debug_flags & DBG_NO_OPT_VARIANT))
+		memset(&key->opt, 0, sizeof(key->opt));
 }
 
-static void si_build_shader_variant(void *job, int thread_index)
+static void si_build_shader_variant(struct si_shader *shader,
+				    int thread_index,
+				    bool low_priority)
 {
-	struct si_shader *shader = (struct si_shader *)job;
 	struct si_shader_selector *sel = shader->selector;
 	struct si_screen *sscreen = sel->screen;
 	LLVMTargetMachineRef tm;
@@ -1206,11 +1476,17 @@
 	int r;
 
 	if (thread_index >= 0) {
-		assert(thread_index < ARRAY_SIZE(sscreen->tm));
-		tm = sscreen->tm[thread_index];
+		if (low_priority) {
+			assert(thread_index < ARRAY_SIZE(sscreen->tm_low_priority));
+			tm = sscreen->tm_low_priority[thread_index];
+		} else {
+			assert(thread_index < ARRAY_SIZE(sscreen->tm));
+			tm = sscreen->tm[thread_index];
+		}
 		if (!debug->async)
 			debug = NULL;
 	} else {
+		assert(!low_priority);
 		tm = shader->compiler_ctx_state.tm;
 	}
 
@@ -1234,6 +1510,58 @@
 	si_shader_init_pm4_state(sscreen, shader);
 }
 
+static void si_build_shader_variant_low_priority(void *job, int thread_index)
+{
+	struct si_shader *shader = (struct si_shader *)job;
+
+	assert(thread_index >= 0);
+
+	si_build_shader_variant(shader, thread_index, true);
+}
+
+static const struct si_shader_key zeroed;
+
+static bool si_check_missing_main_part(struct si_screen *sscreen,
+				       struct si_shader_selector *sel,
+				       struct si_compiler_ctx_state *compiler_state,
+				       struct si_shader_key *key)
+{
+	struct si_shader **mainp = si_get_main_shader_part(sel, key);
+
+	if (!*mainp) {
+		struct si_shader *main_part = CALLOC_STRUCT(si_shader);
+
+		if (!main_part)
+			return false;
+
+		main_part->selector = sel;
+		main_part->key.as_es = key->as_es;
+		main_part->key.as_ls = key->as_ls;
+
+		if (si_compile_tgsi_shader(sscreen, compiler_state->tm,
+					   main_part, false,
+					   &compiler_state->debug) != 0) {
+			FREE(main_part);
+			return false;
+		}
+		*mainp = main_part;
+	}
+	return true;
+}
+
+static void si_destroy_shader_selector(struct si_context *sctx,
+				       struct si_shader_selector *sel);
+
+static void si_shader_selector_reference(struct si_context *sctx,
+					 struct si_shader_selector **dst,
+					 struct si_shader_selector *src)
+{
+	if (pipe_reference(&(*dst)->reference, &src->reference))
+		si_destroy_shader_selector(sctx, *dst);
+
+	*dst = src;
+}
+
 /* Select the hw shader variant depending on the current state. */
 static int si_shader_select_with_key(struct si_screen *sscreen,
 				     struct si_shader_ctx_state *state,
@@ -1241,15 +1569,11 @@
 				     struct si_shader_key *key,
 				     int thread_index)
 {
-	static const struct si_shader_key zeroed;
 	struct si_shader_selector *sel = state->cso;
+	struct si_shader_selector *previous_stage_sel = NULL;
 	struct si_shader *current = state->current;
 	struct si_shader *iter, *shader = NULL;
 
-	if (unlikely(sscreen->b.debug_flags & DBG_NO_OPT_VARIANT)) {
-		memset(&key->opt, 0, sizeof(key->opt));
-	}
-
 again:
 	/* Check if we don't need to change anything.
 	 * This path is also used for most shaders that don't need multiple
@@ -1310,37 +1634,69 @@
 	shader->key = *key;
 	shader->compiler_ctx_state = *compiler_state;
 
+	/* If this is a merged shader, get the first shader's selector. */
+	if (sscreen->b.chip_class >= GFX9) {
+		if (sel->type == PIPE_SHADER_TESS_CTRL)
+			previous_stage_sel = key->part.tcs.ls;
+		else if (sel->type == PIPE_SHADER_GEOMETRY)
+			previous_stage_sel = key->part.gs.es;
+
+		/* We need to wait for the previous shader. */
+		if (previous_stage_sel && thread_index < 0)
+			util_queue_fence_wait(&previous_stage_sel->ready);
+	}
+
 	/* Compile the main shader part if it doesn't exist. This can happen
 	 * if the initial guess was wrong. */
-	struct si_shader **mainp = si_get_main_shader_part(sel, key);
 	bool is_pure_monolithic =
 		sscreen->use_monolithic_shaders ||
 		memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0;
 
-	if (!*mainp && !is_pure_monolithic) {
-		struct si_shader *main_part = CALLOC_STRUCT(si_shader);
+	if (!is_pure_monolithic) {
+		bool ok;
 
-		if (!main_part) {
+		/* Make sure the main shader part is present. This is needed
+		 * for shaders that can be compiled as VS, LS, or ES, and only
+		 * one of them is compiled at creation.
+		 *
+		 * For merged shaders, check that the starting shader's main
+		 * part is present.
+		 */
+		if (previous_stage_sel) {
+			struct si_shader_key shader1_key = zeroed;
+
+			if (sel->type == PIPE_SHADER_TESS_CTRL)
+				shader1_key.as_ls = 1;
+			else if (sel->type == PIPE_SHADER_GEOMETRY)
+				shader1_key.as_es = 1;
+			else
+				assert(0);
+
+			mtx_lock(&previous_stage_sel->mutex);
+			ok = si_check_missing_main_part(sscreen,
+							previous_stage_sel,
+							compiler_state, &shader1_key);
+			mtx_unlock(&previous_stage_sel->mutex);
+		} else {
+			ok = si_check_missing_main_part(sscreen, sel,
+							compiler_state, key);
+		}
+		if (!ok) {
 			FREE(shader);
 			mtx_unlock(&sel->mutex);
 			return -ENOMEM; /* skip the draw call */
 		}
-
-		main_part->selector = sel;
-		main_part->key.as_es = key->as_es;
-		main_part->key.as_ls = key->as_ls;
-
-		if (si_compile_tgsi_shader(sscreen, compiler_state->tm,
-					   main_part, false,
-					   &compiler_state->debug) != 0) {
-			FREE(main_part);
-			FREE(shader);
-			mtx_unlock(&sel->mutex);
-			return -ENOMEM; /* skip the draw call */
-		}
-		*mainp = main_part;
 	}
 
+	/* Keep the reference to the 1st shader of merged shaders, so that
+	 * Gallium can't destroy it before we destroy the 2nd shader.
+	 *
+	 * Set sctx = NULL, because it's unused if we're not releasing
+	 * the shader, and we don't have any sctx here.
+	 */
+	si_shader_selector_reference(NULL, &shader->previous_stage_sel,
+				     previous_stage_sel);
+
 	/* Monolithic-only shaders don't make a distinction between optimized
 	 * and unoptimized. */
 	shader->is_monolithic =
@@ -1366,9 +1722,9 @@
 	    !is_pure_monolithic &&
 	    thread_index < 0) {
 		/* Compile it asynchronously. */
-		util_queue_add_job(&sscreen->shader_compiler_queue,
+		util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,
 				   shader, &shader->optimized_ready,
-				   si_build_shader_variant, NULL);
+				   si_build_shader_variant_low_priority, NULL);
 
 		/* Use the default (unoptimized) shader for now. */
 		memset(&key->opt, 0, sizeof(key->opt));
@@ -1377,7 +1733,7 @@
 	}
 
 	assert(!shader->is_optimized);
-	si_build_shader_variant(shader, thread_index);
+	si_build_shader_variant(shader, thread_index, false);
 
 	if (!shader->compilation_failed)
 		state->current = shader;
@@ -1515,7 +1871,7 @@
 			for (i = 0; i < sel->info.num_outputs; i++) {
 				unsigned offset = shader->info.vs_output_param_offset[i];
 
-				if (offset <= EXP_PARAM_OFFSET_31)
+				if (offset <= AC_EXP_PARAM_OFFSET_31)
 					continue;
 
 				unsigned name = sel->info.output_semantic_name[i];
@@ -1525,10 +1881,10 @@
 				switch (name) {
 				case TGSI_SEMANTIC_GENERIC:
 					/* don't process indices the function can't handle */
-					if (index >= 60)
+					if (index >= SI_MAX_IO_GENERIC)
 						break;
 					/* fall through */
-				case TGSI_SEMANTIC_CLIPDIST:
+				default:
 					id = si_shader_io_get_unique_index(name, index);
 					sel->outputs_written &= ~(1ull << id);
 					break;
@@ -1537,9 +1893,6 @@
 				case TGSI_SEMANTIC_CLIPVERTEX:
 				case TGSI_SEMANTIC_EDGEFLAG:
 					break;
-				default:
-					id = si_shader_io_get_unique_index2(name, index);
-					sel->outputs_written2 &= ~(1u << id);
 				}
 			}
 		}
@@ -1593,6 +1946,30 @@
 	}
 }
 
+/* Return descriptor slot usage masks from the given shader info. */
+void si_get_active_slot_masks(const struct tgsi_shader_info *info,
+			      uint32_t *const_and_shader_buffers,
+			      uint64_t *samplers_and_images)
+{
+	unsigned start, num_shaderbufs, num_constbufs, num_images, num_samplers;
+
+	num_shaderbufs = util_last_bit(info->shader_buffers_declared);
+	num_constbufs = util_last_bit(info->const_buffers_declared);
+	/* two 8-byte images share one 16-byte slot */
+	num_images = align(util_last_bit(info->images_declared), 2);
+	num_samplers = util_last_bit(info->samplers_declared);
+
+	/* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */
+	start = si_get_shaderbuf_slot(num_shaderbufs - 1);
+	*const_and_shader_buffers =
+		u_bit_consecutive(start, num_shaderbufs + num_constbufs);
+
+	/* The layout is: image[last] ... image[0], sampler[0] ... sampler[last] */
+	start = si_get_image_slot(num_images - 1) / 2;
+	*samplers_and_images =
+		u_bit_consecutive64(start, num_images / 2 + num_samplers);
+}
+
 static void *si_create_shader_selector(struct pipe_context *ctx,
 				       const struct pipe_shader_state *state)
 {
@@ -1604,6 +1981,7 @@
 	if (!sel)
 		return NULL;
 
+	pipe_reference_init(&sel->reference, 1);
 	sel->screen = sscreen;
 	sel->compiler_ctx_state.tm = sctx->tm;
 	sel->compiler_ctx_state.debug = sctx->b.debug;
@@ -1618,6 +1996,16 @@
 	tgsi_scan_shader(state->tokens, &sel->info);
 	sel->type = sel->info.processor;
 	p_atomic_inc(&sscreen->b.num_shaders_created);
+	si_get_active_slot_masks(&sel->info,
+				 &sel->active_const_and_shader_buffers,
+				 &sel->active_samplers_and_images);
+
+	/* Record which streamout buffers are enabled. */
+	for (i = 0; i < sel->so.num_outputs; i++) {
+		sel->enabled_streamout_buffer_mask |=
+			(1 << sel->so.output[i].output_buffer) <<
+			(sel->so.output[i].stream * 4);
+	}
 
 	/* The prolog is a no-op if there are no inputs. */
 	sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX &&
@@ -1662,8 +2050,8 @@
 	case PIPE_SHADER_TESS_CTRL:
 		/* Always reserve space for these. */
 		sel->patch_outputs_written |=
-			(1llu << si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0)) |
-			(1llu << si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0));
+			(1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) |
+			(1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0));
 		/* fall through */
 	case PIPE_SHADER_VERTEX:
 	case PIPE_SHADER_TESS_EVAL:
@@ -1676,29 +2064,30 @@
 			case TGSI_SEMANTIC_TESSOUTER:
 			case TGSI_SEMANTIC_PATCH:
 				sel->patch_outputs_written |=
-					1llu << si_shader_io_get_unique_index(name, index);
+					1ull << si_shader_io_get_unique_index_patch(name, index);
 				break;
 
 			case TGSI_SEMANTIC_GENERIC:
 				/* don't process indices the function can't handle */
-				if (index >= 60)
+				if (index >= SI_MAX_IO_GENERIC)
 					break;
 				/* fall through */
-			case TGSI_SEMANTIC_POSITION:
-			case TGSI_SEMANTIC_PSIZE:
-			case TGSI_SEMANTIC_CLIPDIST:
+			default:
 				sel->outputs_written |=
-					1llu << si_shader_io_get_unique_index(name, index);
+					1ull << si_shader_io_get_unique_index(name, index);
 				break;
 			case TGSI_SEMANTIC_CLIPVERTEX: /* ignore these */
 			case TGSI_SEMANTIC_EDGEFLAG:
 				break;
-			default:
-				sel->outputs_written2 |=
-					1u << si_shader_io_get_unique_index2(name, index);
 			}
 		}
 		sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
+
+		/* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
+		 * conflicts, i.e. each vertex will start at a different bank.
+		 */
+		if (sctx->b.chip_class >= GFX9)
+			sel->esgs_itemsize += 4;
 		break;
 
 	case PIPE_SHADER_FRAGMENT:
@@ -1707,16 +2096,17 @@
 			unsigned index = sel->info.input_semantic_index[i];
 
 			switch (name) {
-			case TGSI_SEMANTIC_CLIPDIST:
 			case TGSI_SEMANTIC_GENERIC:
+				/* don't process indices the function can't handle */
+				if (index >= SI_MAX_IO_GENERIC)
+					break;
+				/* fall through */
+			default:
 				sel->inputs_read |=
-					1llu << si_shader_io_get_unique_index(name, index);
+					1ull << si_shader_io_get_unique_index(name, index);
 				break;
 			case TGSI_SEMANTIC_PCOORD: /* ignore this */
 				break;
-			default:
-				sel->inputs_read2 |=
-					1u << si_shader_io_get_unique_index2(name, index);
 			}
 		}
 
@@ -1733,6 +2123,22 @@
 		break;
 	}
 
+	/* PA_CL_VS_OUT_CNTL and the clip/cull distance masks */
+	bool misc_vec_ena =
+		sel->info.writes_psize || sel->info.writes_edgeflag ||
+		sel->info.writes_layer || sel->info.writes_viewport_index;
+	sel->pa_cl_vs_out_cntl =
+		S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) |
+		S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag) |
+		S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) |
+		S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) |
+		S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
+		S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena);
+	sel->clipdist_mask = sel->info.writes_clipvertex ?
+				     SIX_BITS : sel->info.clipdist_writemask;
+	sel->culldist_mask = sel->info.culldist_writemask <<
+			     sel->info.num_written_clipdistance;
+
 	/* DB_SHADER_CONTROL */
 	sel->db_shader_control =
 		S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
@@ -1798,9 +2204,60 @@
 	return sel;
 }
 
+static void si_update_streamout_state(struct si_context *sctx)
+{
+	struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso;
+
+	if (!shader_with_so)
+		return;
+
+	sctx->b.streamout.enabled_stream_buffers_mask =
+		shader_with_so->enabled_streamout_buffer_mask;
+	sctx->b.streamout.stride_in_dw = shader_with_so->so.stride;
+}
+
+static void si_update_clip_regs(struct si_context *sctx,
+				struct si_shader_selector *old_hw_vs,
+				struct si_shader *old_hw_vs_variant,
+				struct si_shader_selector *next_hw_vs,
+				struct si_shader *next_hw_vs_variant)
+{
+	if (next_hw_vs &&
+	    (!old_hw_vs ||
+	     old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] !=
+	     next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] ||
+	     old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl ||
+	     old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask ||
+	     old_hw_vs->culldist_mask != next_hw_vs->culldist_mask ||
+	     !old_hw_vs_variant ||
+	     !next_hw_vs_variant ||
+	     old_hw_vs_variant->key.opt.clip_disable !=
+	     next_hw_vs_variant->key.opt.clip_disable))
+		si_mark_atom_dirty(sctx, &sctx->clip_regs);
+}
+
+static void si_update_common_shader_state(struct si_context *sctx)
+{
+	sctx->uses_bindless_samplers =
+		si_shader_uses_bindless_samplers(sctx->vs_shader.cso)  ||
+		si_shader_uses_bindless_samplers(sctx->gs_shader.cso)  ||
+		si_shader_uses_bindless_samplers(sctx->ps_shader.cso)  ||
+		si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) ||
+		si_shader_uses_bindless_samplers(sctx->tes_shader.cso);
+	sctx->uses_bindless_images =
+		si_shader_uses_bindless_images(sctx->vs_shader.cso)  ||
+		si_shader_uses_bindless_images(sctx->gs_shader.cso)  ||
+		si_shader_uses_bindless_images(sctx->ps_shader.cso)  ||
+		si_shader_uses_bindless_images(sctx->tcs_shader.cso) ||
+		si_shader_uses_bindless_images(sctx->tes_shader.cso);
+	sctx->do_update_shaders = true;
+}
+
 static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
+	struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
 	struct si_shader_selector *sel = state;
 
 	if (sctx->vs_shader.cso == sel)
@@ -1808,14 +2265,18 @@
 
 	sctx->vs_shader.cso = sel;
 	sctx->vs_shader.current = sel ? sel->first_variant : NULL;
-	sctx->do_update_shaders = true;
-	si_mark_atom_dirty(sctx, &sctx->clip_regs);
+
+	si_update_common_shader_state(sctx);
 	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+	si_set_active_descriptors_for_shader(sctx, sel);
+	si_update_streamout_state(sctx);
+	si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
+			    si_get_vs(sctx)->cso, si_get_vs_state(sctx));
 }
 
-static void si_update_tcs_tes_uses_prim_id(struct si_context *sctx)
+static void si_update_tess_uses_prim_id(struct si_context *sctx)
 {
-	sctx->ia_multi_vgt_param_key.u.tcs_tes_uses_prim_id =
+	sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
 		(sctx->tes_shader.cso &&
 		 sctx->tes_shader.cso->info.uses_primid) ||
 		(sctx->tcs_shader.cso &&
@@ -1829,6 +2290,8 @@
 static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
+	struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
 	struct si_shader_selector *sel = state;
 	bool enable_changed = !!sctx->gs_shader.cso != !!sel;
 
@@ -1838,16 +2301,20 @@
 	sctx->gs_shader.cso = sel;
 	sctx->gs_shader.current = sel ? sel->first_variant : NULL;
 	sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
-	sctx->do_update_shaders = true;
-	si_mark_atom_dirty(sctx, &sctx->clip_regs);
+
+	si_update_common_shader_state(sctx);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
 	if (enable_changed) {
 		si_shader_change_notify(sctx);
 		if (sctx->ia_multi_vgt_param_key.u.uses_tess)
-			si_update_tcs_tes_uses_prim_id(sctx);
+			si_update_tess_uses_prim_id(sctx);
 	}
 	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+	si_set_active_descriptors_for_shader(sctx, sel);
+	si_update_streamout_state(sctx);
+	si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
+			    si_get_vs(sctx)->cso, si_get_vs_state(sctx));
 }
 
 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
@@ -1861,16 +2328,21 @@
 
 	sctx->tcs_shader.cso = sel;
 	sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
-	si_update_tcs_tes_uses_prim_id(sctx);
-	sctx->do_update_shaders = true;
+	si_update_tess_uses_prim_id(sctx);
+
+	si_update_common_shader_state(sctx);
 
 	if (enable_changed)
 		sctx->last_tcs = NULL; /* invalidate derived tess state */
+
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
+	struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
 	struct si_shader_selector *sel = state;
 	bool enable_changed = !!sctx->tes_shader.cso != !!sel;
 
@@ -1880,9 +2352,9 @@
 	sctx->tes_shader.cso = sel;
 	sctx->tes_shader.current = sel ? sel->first_variant : NULL;
 	sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
-	si_update_tcs_tes_uses_prim_id(sctx);
-	sctx->do_update_shaders = true;
-	si_mark_atom_dirty(sctx, &sctx->clip_regs);
+	si_update_tess_uses_prim_id(sctx);
+
+	si_update_common_shader_state(sctx);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
 	if (enable_changed) {
@@ -1890,29 +2362,42 @@
 		sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
 	}
 	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+	si_set_active_descriptors_for_shader(sctx, sel);
+	si_update_streamout_state(sctx);
+	si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
+			    si_get_vs(sctx)->cso, si_get_vs_state(sctx));
 }
 
 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *old_sel = sctx->ps_shader.cso;
 	struct si_shader_selector *sel = state;
 
 	/* skip if supplied shader is one already in use */
-	if (sctx->ps_shader.cso == sel)
+	if (old_sel == sel)
 		return;
 
 	sctx->ps_shader.cso = sel;
 	sctx->ps_shader.current = sel ? sel->first_variant : NULL;
-	sctx->do_update_shaders = true;
-	if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
-		si_update_tcs_tes_uses_prim_id(sctx);
-	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+
+	si_update_common_shader_state(sctx);
+	if (sel) {
+		if (sctx->ia_multi_vgt_param_key.u.uses_tess)
+			si_update_tess_uses_prim_id(sctx);
+
+		if (!old_sel ||
+		    old_sel->info.colors_written != sel->info.colors_written)
+			si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+	}
+	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
 	if (shader->is_optimized) {
-		util_queue_fence_wait(&shader->optimized_ready);
+		util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
+				    &shader->optimized_ready);
 		util_queue_fence_destroy(&shader->optimized_ready);
 	}
 
@@ -1952,14 +2437,14 @@
 		}
 	}
 
+	si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
 	si_shader_destroy(shader);
 	free(shader);
 }
 
-static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
+static void si_destroy_shader_selector(struct si_context *sctx,
+				       struct si_shader_selector *sel)
 {
-	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_shader_selector *sel = (struct si_shader_selector *)state;
 	struct si_shader *p = sel->first_variant, *c;
 	struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
 		[PIPE_SHADER_VERTEX] = &sctx->vs_shader,
@@ -1969,7 +2454,7 @@
 		[PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
 	};
 
-	util_queue_fence_wait(&sel->ready);
+	util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready);
 
 	if (current_shader[sel->type]->cso == sel) {
 		current_shader[sel->type]->cso = NULL;
@@ -1997,6 +2482,14 @@
 	free(sel);
 }
 
+static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = (struct si_shader_selector *)state;
+
+	si_shader_selector_reference(sctx, &sel, NULL);
+}
+
 static unsigned si_get_ps_input_cntl(struct si_context *sctx,
 				     struct si_shader *vs, unsigned name,
 				     unsigned index, unsigned interpolate)
@@ -2019,18 +2512,18 @@
 		    index == vsinfo->output_semantic_index[j]) {
 			offset = vs->info.vs_output_param_offset[j];
 
-			if (offset <= EXP_PARAM_OFFSET_31) {
+			if (offset <= AC_EXP_PARAM_OFFSET_31) {
 				/* The input is loaded from parameter memory. */
 				ps_input_cntl |= S_028644_OFFSET(offset);
 			} else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
-				if (offset == EXP_PARAM_UNDEFINED) {
+				if (offset == AC_EXP_PARAM_UNDEFINED) {
 					/* This can happen with depth-only rendering. */
 					offset = 0;
 				} else {
 					/* The input is a DEFAULT_VAL constant. */
-					assert(offset >= EXP_PARAM_DEFAULT_VAL_0000 &&
-					       offset <= EXP_PARAM_DEFAULT_VAL_1111);
-					offset -= EXP_PARAM_DEFAULT_VAL_0000;
+					assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
+					       offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
+					offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
 				}
 
 				ps_input_cntl = S_028644_OFFSET(0x20) |
@@ -2252,6 +2745,22 @@
 	return true;
 }
 
+static void si_shader_lock(struct si_shader *shader)
+{
+	mtx_lock(&shader->selector->mutex);
+	if (shader->previous_stage_sel) {
+		assert(shader->previous_stage_sel != shader->selector);
+		mtx_lock(&shader->previous_stage_sel->mutex);
+	}
+}
+
+static void si_shader_unlock(struct si_shader *shader)
+{
+	if (shader->previous_stage_sel)
+		mtx_unlock(&shader->previous_stage_sel->mutex);
+	mtx_unlock(&shader->selector->mutex);
+}
+
 /**
  * @returns 1 if \p sel has been updated to use a new scratch buffer
  *          0 if not
@@ -2270,25 +2779,40 @@
 	if (shader->config.scratch_bytes_per_wave == 0)
 		return 0;
 
+	/* Prevent race conditions when updating:
+	 * - si_shader::scratch_bo
+	 * - si_shader::binary::code
+	 * - si_shader::previous_stage::binary::code.
+	 */
+	si_shader_lock(shader);
+
 	/* This shader is already configured to use the current
 	 * scratch buffer. */
-	if (shader->scratch_bo == sctx->scratch_buffer)
+	if (shader->scratch_bo == sctx->scratch_buffer) {
+		si_shader_unlock(shader);
 		return 0;
+	}
 
 	assert(sctx->scratch_buffer);
 
-	si_shader_apply_scratch_relocs(sctx, shader, &shader->config, scratch_va);
+	if (shader->previous_stage)
+		si_shader_apply_scratch_relocs(shader->previous_stage, scratch_va);
+
+	si_shader_apply_scratch_relocs(shader, scratch_va);
 
 	/* Replace the shader bo with a new bo that has the relocs applied. */
 	r = si_shader_binary_upload(sctx->screen, shader);
-	if (r)
+	if (r) {
+		si_shader_unlock(shader);
 		return r;
+	}
 
 	/* Update the shader state to use the new shader bo. */
 	si_shader_init_pm4_state(sctx->screen, shader);
 
 	r600_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
 
+	si_shader_unlock(shader);
 	return 1;
 }
 
@@ -2302,6 +2826,15 @@
 	return shader ? shader->config.scratch_bytes_per_wave : 0;
 }
 
+static struct si_shader *si_get_tcs_current(struct si_context *sctx)
+{
+	if (!sctx->tes_shader.cso)
+		return NULL; /* tessellation disabled */
+
+	return sctx->tcs_shader.cso ? sctx->tcs_shader.current :
+				      sctx->fixed_func_tcs_shader.current;
+}
+
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
 {
 	unsigned bytes = 0;
@@ -2309,11 +2842,71 @@
 	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
 	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
 	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader.current));
 	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
+
+	if (sctx->tes_shader.cso) {
+		struct si_shader *tcs = si_get_tcs_current(sctx);
+
+		bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(tcs));
+	}
 	return bytes;
 }
 
+static bool si_update_scratch_relocs(struct si_context *sctx)
+{
+	struct si_shader *tcs = si_get_tcs_current(sctx);
+	int r;
+
+	/* Update the shaders, so that they are using the latest scratch.
+	 * The scratch buffer may have been changed since these shaders were
+	 * last used, so we still need to try to update them, even if they
+	 * require scratch buffers smaller than the current size.
+	 */
+	r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
+	if (r < 0)
+		return false;
+	if (r == 1)
+		si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+	r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
+	if (r < 0)
+		return false;
+	if (r == 1)
+		si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+
+	r = si_update_scratch_buffer(sctx, tcs);
+	if (r < 0)
+		return false;
+	if (r == 1)
+		si_pm4_bind_state(sctx, hs, tcs->pm4);
+
+	/* VS can be bound as LS, ES, or VS. */
+	r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
+	if (r < 0)
+		return false;
+	if (r == 1) {
+		if (sctx->tes_shader.current)
+			si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+		else if (sctx->gs_shader.current)
+			si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
+		else
+			si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+	}
+
+	/* TES can be bound as ES or VS. */
+	r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
+	if (r < 0)
+		return false;
+	if (r == 1) {
+		if (sctx->gs_shader.current)
+			si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
+		else
+			si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+	}
+
+	return true;
+}
+
 static bool si_update_spi_tmpring_size(struct si_context *sctx)
 {
 	unsigned current_scratch_buffer_size =
@@ -2323,7 +2916,6 @@
 	unsigned scratch_needed_size = scratch_bytes_per_wave *
 		sctx->scratch_waves;
 	unsigned spi_tmpring_size;
-	int r;
 
 	if (scratch_needed_size > 0) {
 		if (scratch_needed_size > current_scratch_buffer_size) {
@@ -2343,52 +2935,8 @@
 						       &sctx->scratch_buffer->b.b);
 		}
 
-		/* Update the shaders, so they are using the latest scratch.  The
-		 * scratch buffer may have been changed since these shaders were
-		 * last used, so we still need to try to update them, even if
-		 * they require scratch buffers smaller than the current size.
-		 */
-		r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
-		if (r < 0)
+		if (!si_update_scratch_relocs(sctx))
 			return false;
-		if (r == 1)
-			si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
-
-		r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
-		if (r < 0)
-			return false;
-		if (r == 1)
-			si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
-
-		r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
-		if (r < 0)
-			return false;
-		if (r == 1)
-			si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
-
-		/* VS can be bound as LS, ES, or VS. */
-		r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
-		if (r < 0)
-			return false;
-		if (r == 1) {
-			if (sctx->tes_shader.current)
-				si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
-			else if (sctx->gs_shader.current)
-				si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
-			else
-				si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
-		}
-
-		/* TES can be bound as ES or VS. */
-		r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
-		if (r < 0)
-			return false;
-		if (r == 1) {
-			if (sctx->gs_shader.current)
-				si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
-			else
-				si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
-		}
 	}
 
 	/* The LLVM shader backend should be reporting aligned scratch_sizes. */
@@ -2409,7 +2957,10 @@
 	bool double_offchip_buffers = sctx->b.chip_class >= CIK &&
 				      sctx->b.family != CHIP_CARRIZO &&
 				      sctx->b.family != CHIP_STONEY;
-	unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
+	/* This must be one less than the maximum number due to a hw limitation.
+	 * Various hardware bugs in SI, CIK, and GFX9 need this.
+	 */
+	unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;
 	unsigned max_offchip_buffers = max_offchip_buffers_per_se *
 				       sctx->screen->b.info.max_se;
 	unsigned offchip_granularity;
@@ -2426,26 +2977,15 @@
 		break;
 	}
 
-	switch (sctx->b.chip_class) {
-	case SI:
-		max_offchip_buffers = MIN2(max_offchip_buffers, 126);
-		break;
-	case CIK:
-	case VI:
-	case GFX9:
-		max_offchip_buffers = MIN2(max_offchip_buffers, 508);
-		break;
-	default:
-		assert(0);
-		return;
-	}
-
 	assert(!sctx->tf_ring);
+	/* Use 64K alignment for both rings, so that we can pass the address
+	 * to shaders as one SGPR containing bits [16:47].
+	 */
 	sctx->tf_ring = r600_aligned_buffer_create(sctx->b.b.screen,
 						   R600_RESOURCE_FLAG_UNMAPPABLE,
 						   PIPE_USAGE_DEFAULT,
 						   32768 * sctx->screen->b.info.max_se,
-						   256);
+						   64 * 1024);
 	if (!sctx->tf_ring)
 		return;
 
@@ -2457,12 +2997,22 @@
 					   PIPE_USAGE_DEFAULT,
 					   max_offchip_buffers *
 					   sctx->screen->tess_offchip_block_dw_size * 4,
-					   256);
+					   64 * 1024);
 	if (!sctx->tess_offchip_ring)
 		return;
 
 	si_init_config_add_vgt_flush(sctx);
 
+	uint64_t offchip_va = r600_resource(sctx->tess_offchip_ring)->gpu_address;
+	uint64_t factor_va = r600_resource(sctx->tf_ring)->gpu_address;
+	assert((offchip_va & 0xffff) == 0);
+	assert((factor_va & 0xffff) == 0);
+
+	si_pm4_add_bo(sctx->init_config, r600_resource(sctx->tess_offchip_ring),
+		      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
+	si_pm4_add_bo(sctx->init_config, r600_resource(sctx->tf_ring),
+		      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
+
 	/* Append these registers to the init config state. */
 	if (sctx->b.chip_class >= CIK) {
 		if (sctx->b.chip_class >= VI)
@@ -2471,10 +3021,10 @@
 		si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE,
 			       S_030938_SIZE(sctx->tf_ring->width0 / 4));
 		si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE,
-			       r600_resource(sctx->tf_ring)->gpu_address >> 8);
+			       factor_va >> 8);
 		if (sctx->b.chip_class >= GFX9)
 			si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI,
-				       r600_resource(sctx->tf_ring)->gpu_address >> 40);
+				       factor_va >> 40);
 		si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM,
 		             S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
 		             S_03093C_OFFCHIP_GRANULARITY(offchip_granularity));
@@ -2483,24 +3033,37 @@
 		si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE,
 			       S_008988_SIZE(sctx->tf_ring->width0 / 4));
 		si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE,
-			       r600_resource(sctx->tf_ring)->gpu_address >> 8);
+			       factor_va >> 8);
 		si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
 		               S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers));
 	}
 
+	if (sctx->b.chip_class >= GFX9) {
+		si_pm4_set_reg(sctx->init_config,
+			       R_00B430_SPI_SHADER_USER_DATA_LS_0 +
+			       GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K * 4,
+			       offchip_va >> 16);
+		si_pm4_set_reg(sctx->init_config,
+			       R_00B430_SPI_SHADER_USER_DATA_LS_0 +
+			       GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K * 4,
+			       factor_va >> 16);
+	} else {
+		si_pm4_set_reg(sctx->init_config,
+			       R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+			       GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K * 4,
+			       offchip_va >> 16);
+		si_pm4_set_reg(sctx->init_config,
+			       R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+			       GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K * 4,
+			       factor_va >> 16);
+	}
+
 	/* Flush the context to re-emit the init_config state.
 	 * This is done only once in a lifetime of a context.
 	 */
 	si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
 	sctx->b.initial_gfx_cs_size = 0; /* force flush */
 	si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
-
-	si_set_ring_buffer(&sctx->b.b, SI_HS_RING_TESS_FACTOR, sctx->tf_ring,
-			   0, sctx->tf_ring->width0, false, false, 0, 0, 0);
-
-	si_set_ring_buffer(&sctx->b.b, SI_HS_RING_TESS_OFFCHIP,
-	                   sctx->tess_offchip_ring, 0,
-	                   sctx->tess_offchip_ring->width0, false, false, 0, 0, 0);
 }
 
 /**
@@ -2571,25 +3134,16 @@
 	si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
 }
 
-static void si_update_so(struct si_context *sctx, struct si_shader_selector *shader)
-{
-	struct pipe_stream_output_info *so = &shader->so;
-	uint32_t enabled_stream_buffers_mask = 0;
-	int i;
-
-	for (i = 0; i < so->num_outputs; i++)
-		enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << (so->output[i].stream * 4);
-	sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask;
-	sctx->b.streamout.stride_in_dw = shader->so.stride;
-}
-
 bool si_update_shaders(struct si_context *sctx)
 {
 	struct pipe_context *ctx = (struct pipe_context*)sctx;
 	struct si_compiler_ctx_state compiler_state;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	struct si_shader *old_vs = si_get_vs_state(sctx);
-	bool old_clip_disable = old_vs ? old_vs->key.opt.hw_vs.clip_disable : false;
+	bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false;
+	struct si_shader *old_ps = sctx->ps_shader.current;
+	unsigned old_spi_shader_col_format =
+		old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
 	int r;
 
 	compiler_state.tm = sctx->tm;
@@ -2650,7 +3204,6 @@
 			if (r)
 				return false;
 			si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
-			si_update_so(sctx, sctx->tes_shader.cso);
 		}
 	} else if (sctx->gs_shader.cso) {
 		if (sctx->b.chip_class <= VI) {
@@ -2670,8 +3223,6 @@
 		if (r)
 			return false;
 		si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
-		si_update_so(sctx, sctx->vs_shader.cso);
-
 		si_pm4_bind_state(sctx, ls, NULL);
 		si_pm4_bind_state(sctx, hs, NULL);
 	}
@@ -2683,7 +3234,6 @@
 			return false;
 		si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
 		si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4);
-		si_update_so(sctx, sctx->gs_shader.cso);
 
 		if (!si_update_gs_ring_buffers(sctx))
 			return false;
@@ -2695,7 +3245,7 @@
 
 	si_update_vgt_shader_config(sctx);
 
-	if (old_clip_disable != si_get_vs_state(sctx)->key.opt.hw_vs.clip_disable)
+	if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable)
 		si_mark_atom_dirty(sctx, &sctx->clip_regs);
 
 	if (sctx->ps_shader.cso) {
@@ -2718,7 +3268,11 @@
 			si_mark_atom_dirty(sctx, &sctx->spi_map);
 		}
 
-		if (sctx->screen->b.rbplus_allowed && si_pm4_state_changed(sctx, ps))
+		if (sctx->screen->b.rbplus_allowed &&
+		    si_pm4_state_changed(sctx, ps) &&
+		    (!old_ps ||
+		     old_spi_shader_col_format !=
+		     sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format))
 			si_mark_atom_dirty(sctx, &sctx->cb_render_state);
 
 		if (sctx->ps_db_shader_control != db_shader_control) {
diff --git a/src/gallium/drivers/radeonsi/si_uvd.c b/src/gallium/drivers/radeonsi/si_uvd.c
index 53a7ce9..d17a665 100644
--- a/src/gallium/drivers/radeonsi/si_uvd.c
+++ b/src/gallium/drivers/radeonsi/si_uvd.c
@@ -35,6 +35,7 @@
 #include "radeon/radeon_video.h"
 #include "radeon/radeon_uvd.h"
 #include "radeon/radeon_vce.h"
+#include "radeon/radeon_vcn_dec.h"
 
 /**
  * creates an video buffer with an UVD compatible memory layout
@@ -156,9 +157,11 @@
 					       const struct pipe_video_codec *templ)
 {
 	struct si_context *ctx = (struct si_context *)context;
+	bool vcn = (ctx->b.family == CHIP_RAVEN) ? true : false;
 
         if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE)
                 return rvce_create_encoder(context, templ, ctx->b.ws, si_vce_get_buffer);
 
-	return ruvd_create_decoder(context, templ, si_uvd_set_dtb);
+	return (vcn) ? 	radeon_create_decoder(context, templ) :
+		ruvd_create_decoder(context, templ, si_uvd_set_dtb);
 }
diff --git a/src/gallium/drivers/rbug/rbug_context.c b/src/gallium/drivers/rbug/rbug_context.c
index 5d00207..e1f3c4f 100644
--- a/src/gallium/drivers/rbug/rbug_context.c
+++ b/src/gallium/drivers/rbug/rbug_context.c
@@ -778,8 +778,11 @@
 
    if (num_buffers && _buffers) {
       memcpy(unwrapped_buffers, _buffers, num_buffers * sizeof(*_buffers));
-      for (i = 0; i < num_buffers; i++)
-         unwrapped_buffers[i].buffer = rbug_resource_unwrap(_buffers[i].buffer);
+      for (i = 0; i < num_buffers; i++) {
+         if (!_buffers[i].is_user_buffer)
+            unwrapped_buffers[i].buffer.resource =
+               rbug_resource_unwrap(_buffers[i].buffer.resource);
+      }
       buffers = unwrapped_buffers;
    }
 
@@ -791,25 +794,6 @@
 }
 
 static void
-rbug_set_index_buffer(struct pipe_context *_pipe,
-                      const struct pipe_index_buffer *_ib)
-{
-   struct rbug_context *rb_pipe = rbug_context(_pipe);
-   struct pipe_context *pipe = rb_pipe->pipe;
-   struct pipe_index_buffer unwrapped_ib, *ib = NULL;
-
-   if (_ib) {
-      unwrapped_ib = *_ib;
-      unwrapped_ib.buffer = rbug_resource_unwrap(_ib->buffer);
-      ib = &unwrapped_ib;
-   }
-
-   mtx_lock(&rb_pipe->call_mutex);
-   pipe->set_index_buffer(pipe, ib);
-   mtx_unlock(&rb_pipe->call_mutex);
-}
-
-static void
 rbug_set_sample_mask(struct pipe_context *_pipe,
                      unsigned sample_mask)
 {
@@ -1257,7 +1241,6 @@
    rb_pipe->base.set_viewport_states = rbug_set_viewport_states;
    rb_pipe->base.set_sampler_views = rbug_set_sampler_views;
    rb_pipe->base.set_vertex_buffers = rbug_set_vertex_buffers;
-   rb_pipe->base.set_index_buffer = rbug_set_index_buffer;
    rb_pipe->base.set_sample_mask = rbug_set_sample_mask;
    rb_pipe->base.create_stream_output_target = rbug_create_stream_output_target;
    rb_pipe->base.stream_output_target_destroy = rbug_stream_output_target_destroy;
diff --git a/src/gallium/drivers/softpipe/Android.mk b/src/gallium/drivers/softpipe/Android.mk
index 5d3a93b..29cc317 100644
--- a/src/gallium/drivers/softpipe/Android.mk
+++ b/src/gallium/drivers/softpipe/Android.mk
@@ -35,3 +35,8 @@
 
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_SOFTPIPE),)
+GALLIUM_TARGET_DRIVERS += swrast
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_sw_dri)
+endif
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 0597301..48f87e6 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -114,7 +114,7 @@
    }
 
    for (i = 0; i < softpipe->num_vertex_buffers; i++) {
-      pipe_resource_reference(&softpipe->vertex_buffer[i].buffer, NULL);
+      pipe_vertex_buffer_unreference(&softpipe->vertex_buffer[i]);
    }
 
    tgsi_exec_machine_destroy(softpipe->fs_machine);
@@ -178,10 +178,10 @@
 
 
 static void
-softpipe_render_condition( struct pipe_context *pipe,
-                           struct pipe_query *query,
-                           boolean condition,
-                           uint mode )
+softpipe_render_condition(struct pipe_context *pipe,
+                          struct pipe_query *query,
+                          boolean condition,
+                          enum pipe_render_cond_flag mode)
 {
    struct softpipe_context *softpipe = softpipe_context( pipe );
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index a57f587..7c42403 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -88,7 +88,6 @@
    struct pipe_shader_buffer buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS];
    struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
-   struct pipe_index_buffer index_buffer;
    struct pipe_resource *mapped_vs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS];
    struct pipe_resource *mapped_gs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS];
 
@@ -149,11 +148,9 @@
    /** Derived from scissor and surface bounds: */
    struct pipe_scissor_state cliprect[PIPE_MAX_VIEWPORTS];
 
-   unsigned line_stipple_counter;
-
    /** Conditional query object and mode */
    struct pipe_query *render_cond_query;
-   uint render_cond_mode;
+   enum pipe_render_cond_flag render_cond_mode;
    boolean render_cond_cond;
 
    /** Polygon stipple items */
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 03fcf64..6363701 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -82,34 +82,31 @@
 
    /* Map vertex buffers */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
-      const void *buf = sp->vertex_buffer[i].user_buffer;
+      const void *buf = sp->vertex_buffer[i].is_user_buffer ?
+                           sp->vertex_buffer[i].buffer.user : NULL;
       size_t size = ~0;
       if (!buf) {
-         if (!sp->vertex_buffer[i].buffer) {
+         if (!sp->vertex_buffer[i].buffer.resource) {
             continue;
          }
-         buf = softpipe_resource_data(sp->vertex_buffer[i].buffer);
-         size = sp->vertex_buffer[i].buffer->width0;
+         buf = softpipe_resource_data(sp->vertex_buffer[i].buffer.resource);
+         size = sp->vertex_buffer[i].buffer.resource->width0;
       }
       draw_set_mapped_vertex_buffer(draw, i, buf, size);
    }
 
    /* Map index buffer, if present */
-   if (info->indexed) {
+   if (info->index_size) {
       unsigned available_space = ~0;
-      mapped_indices = sp->index_buffer.user_buffer;
+      mapped_indices = info->has_user_indices ? info->index.user : NULL;
       if (!mapped_indices) {
-         mapped_indices = softpipe_resource_data(sp->index_buffer.buffer);
-         if (sp->index_buffer.buffer->width0 > sp->index_buffer.offset)
-            available_space =
-               (sp->index_buffer.buffer->width0 - sp->index_buffer.offset);
-         else
-            available_space = 0;
+         mapped_indices = softpipe_resource_data(info->index.resource);
+         available_space = info->index.resource->width0;
       }
 
       draw_set_indexes(draw,
-                       (ubyte *) mapped_indices + sp->index_buffer.offset,
-                       sp->index_buffer.index_size, available_space);
+                       (ubyte *) mapped_indices,
+                       info->index_size, available_space);
    }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index f4c44ef..5c96a14 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -302,6 +302,10 @@
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 4;
diff --git a/src/gallium/drivers/softpipe/sp_state_vertex.c b/src/gallium/drivers/softpipe/sp_state_vertex.c
index 48c8d2c..a7a8736 100644
--- a/src/gallium/drivers/softpipe/sp_state_vertex.c
+++ b/src/gallium/drivers/softpipe/sp_state_vertex.c
@@ -97,19 +97,6 @@
 }
 
 
-static void
-softpipe_set_index_buffer(struct pipe_context *pipe,
-                          const struct pipe_index_buffer *ib)
-{
-   struct softpipe_context *softpipe = softpipe_context(pipe);
-
-   if (ib)
-      memcpy(&softpipe->index_buffer, ib, sizeof(softpipe->index_buffer));
-   else
-      memset(&softpipe->index_buffer, 0, sizeof(softpipe->index_buffer));
-}
-
-
 void
 softpipe_init_vertex_funcs(struct pipe_context *pipe)
 {
@@ -118,5 +105,4 @@
    pipe->delete_vertex_elements_state = softpipe_delete_vertex_elements_state;
 
    pipe->set_vertex_buffers = softpipe_set_vertex_buffers;
-   pipe->set_index_buffer = softpipe_set_index_buffer;
 }
diff --git a/src/gallium/drivers/svga/Android.mk b/src/gallium/drivers/svga/Android.mk
index 7d23fd5..edb69bf 100644
--- a/src/gallium/drivers/svga/Android.mk
+++ b/src/gallium/drivers/svga/Android.mk
@@ -34,5 +34,12 @@
 
 LOCAL_MODULE := libmesa_pipe_svga
 
+LOCAL_STATIC_LIBRARIES += libmesa_git_sha1
+
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_VMWGFX),)
+GALLIUM_TARGET_DRIVERS += vmwgfx
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_svga)
+endif
diff --git a/src/gallium/drivers/svga/Makefile.sources b/src/gallium/drivers/svga/Makefile.sources
index 229d286..72024cf 100644
--- a/src/gallium/drivers/svga/Makefile.sources
+++ b/src/gallium/drivers/svga/Makefile.sources
@@ -15,6 +15,8 @@
 	svga_hw_reg.h \
 	svga_link.c \
 	svga_link.h \
+	svga_msg.c \
+	svga_msg.h \
 	svga_mksstats.h \
 	svga_pipe_blend.c \
 	svga_pipe_blit.c \
diff --git a/src/gallium/drivers/svga/SConscript b/src/gallium/drivers/svga/SConscript
index 2d60ceb..9c4806c 100644
--- a/src/gallium/drivers/svga/SConscript
+++ b/src/gallium/drivers/svga/SConscript
@@ -5,7 +5,7 @@
 env.MSVC2013Compat()
 
 if env['suncc']:
-	print 'warning: not building svga'
+	print('warning: not building svga')
 	Return()
 
 env.Append(CPPDEFINES = [
diff --git a/src/gallium/drivers/svga/include/svga3d_surfacedefs.h b/src/gallium/drivers/svga/include/svga3d_surfacedefs.h
index efa358b..89baff3 100644
--- a/src/gallium/drivers/svga/include/svga3d_surfacedefs.h
+++ b/src/gallium/drivers/svga/include/svga3d_surfacedefs.h
@@ -486,12 +486,12 @@
       64, {{0}, {8}, {32}, {0}},
       {{0}, {32}, {0}, {0}}},
 
-   {SVGA3D_R32_FLOAT_X8X24_TYPELESS, SVGA3DBLOCKDESC_R_FP,
+   {SVGA3D_R32_FLOAT_X8X24, SVGA3DBLOCKDESC_R_FP,
       {1, 1, 1},  8, 8,
       64, {{0}, {0}, {32}, {0}},
       {{0}, {0}, {0}, {0}}},
 
-   {SVGA3D_X32_TYPELESS_G8X24_UINT, SVGA3DBLOCKDESC_GREEN,
+   {SVGA3D_X32_G8X24_UINT, SVGA3DBLOCKDESC_GREEN,
       {1, 1, 1},  8, 8,
       64, {{0}, {8}, {0}, {0}},
       {{0}, {32}, {0}, {0}}},
@@ -581,12 +581,12 @@
       32, {{0}, {8}, {24}, {0}},
       {{0}, {24}, {0}, {0}}},
 
-   {SVGA3D_R24_UNORM_X8_TYPELESS, SVGA3DBLOCKDESC_RED,
+   {SVGA3D_R24_UNORM_X8, SVGA3DBLOCKDESC_RED,
       {1, 1, 1},  4, 4,
       32, {{0}, {0}, {24}, {0}},
       {{0}, {0}, {0}, {0}}},
 
-   {SVGA3D_X24_TYPELESS_G8_UINT, SVGA3DBLOCKDESC_GREEN,
+   {SVGA3D_X24_G8_UINT, SVGA3DBLOCKDESC_GREEN,
       {1, 1, 1},  4, 4,
       32, {{0}, {8}, {0}, {0}},
       {{0}, {24}, {0}, {0}}},
diff --git a/src/gallium/drivers/svga/include/svga3d_types.h b/src/gallium/drivers/svga/include/svga3d_types.h
index de711c3..ddd9e35 100644
--- a/src/gallium/drivers/svga/include/svga3d_types.h
+++ b/src/gallium/drivers/svga/include/svga3d_types.h
@@ -204,8 +204,8 @@
    SVGA3D_R32G32_SINT                  = 59,
    SVGA3D_R32G8X24_TYPELESS            = 60,
    SVGA3D_D32_FLOAT_S8X24_UINT         = 61,
-   SVGA3D_R32_FLOAT_X8X24_TYPELESS     = 62,
-   SVGA3D_X32_TYPELESS_G8X24_UINT      = 63,
+   SVGA3D_R32_FLOAT_X8X24              = 62,
+   SVGA3D_X32_G8X24_UINT               = 63,
    SVGA3D_R10G10B10A2_TYPELESS         = 64,
    SVGA3D_R10G10B10A2_UINT             = 65,
    SVGA3D_R11G11B10_FLOAT              = 66,
@@ -223,8 +223,8 @@
    SVGA3D_R32_SINT                     = 78,
    SVGA3D_R24G8_TYPELESS               = 79,
    SVGA3D_D24_UNORM_S8_UINT            = 80,
-   SVGA3D_R24_UNORM_X8_TYPELESS        = 81,
-   SVGA3D_X24_TYPELESS_G8_UINT         = 82,
+   SVGA3D_R24_UNORM_X8                 = 81,
+   SVGA3D_X24_G8_UINT                  = 82,
    SVGA3D_R8G8_TYPELESS                = 83,
    SVGA3D_R8G8_UNORM                   = 84,
    SVGA3D_R8G8_UINT                    = 85,
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index a6f3b34..8640da9 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -130,6 +130,8 @@
    struct svga_context *svga = NULL;
    enum pipe_error ret;
 
+   SVGA_STATS_TIME_PUSH(svgascreen->sws, SVGA_STATS_TIME_CREATECONTEXT);
+
    svga = CALLOC_STRUCT(svga_context);
    if (!svga)
       goto cleanup;
@@ -241,6 +243,8 @@
    memset(&svga->state.hw_clear, 0xcd, sizeof(svga->state.hw_clear));
    memset(&svga->state.hw_clear.framebuffer, 0x0,
           sizeof(svga->state.hw_clear.framebuffer));
+   svga->state.hw_clear.num_rendertargets = 0;
+   svga->state.hw_clear.dsv = NULL;
 
    memset(&svga->state.hw_draw, 0xcd, sizeof(svga->state.hw_draw));
    memset(&svga->state.hw_draw.views, 0x0, sizeof(svga->state.hw_draw.views));
@@ -251,8 +255,7 @@
    memset(svga->state.hw_draw.sampler_views, 0,
           sizeof(svga->state.hw_draw.sampler_views));
    svga->state.hw_draw.num_views = 0;
-   svga->state.hw_draw.num_rendertargets = 0;
-   svga->state.hw_draw.dsv = NULL;
+   svga->state.hw_draw.num_backed_views = 0;
    svga->state.hw_draw.rasterizer_discard = FALSE;
 
    /* Initialize the shader pointers */
@@ -297,7 +300,7 @@
    svga->pred.query_id = SVGA3D_INVALID_ID;
    svga->disable_rasterizer = FALSE;
 
-   return &svga->pipe;
+   goto done;
 
 cleanup:
    svga_destroy_swtnl(svga);
@@ -324,7 +327,10 @@
    util_bitmask_destroy(svga->stream_output_id_bm);
    util_bitmask_destroy(svga->query_id_bm);
    FREE(svga);
-   return NULL;
+
+done:
+   SVGA_STATS_TIME_POP(svgascreen->sws);
+   return svga ? &svga->pipe:NULL;
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 3789a46..d0306c0 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -72,6 +72,7 @@
    SVGA_QUERY_NUM_STATE_OBJECTS,
    SVGA_QUERY_NUM_SURFACE_VIEWS,
    SVGA_QUERY_NUM_GENERATE_MIPMAP,
+   SVGA_QUERY_NUM_FAILED_ALLOCATIONS,
 
 /*SVGA_QUERY_MAX has to be last because it is size of an array*/
    SVGA_QUERY_MAX
@@ -268,7 +269,6 @@
    struct svga_geometry_shader *gs;      /* derived GS */
 
    struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS];
-   struct pipe_index_buffer ib;
    /** Constant buffers for each shader.
     * The size should probably always match with that of
     * svga_shader_emitter_v10.num_shader_consts.
@@ -324,6 +324,11 @@
 
    struct pipe_framebuffer_state framebuffer;
    struct svga_prescale prescale;
+
+   /* VGPU10 state */
+   unsigned num_rendertargets;
+   struct pipe_surface *rtv[SVGA3D_MAX_RENDER_TARGETS];
+   struct pipe_surface *dsv;
 };
 
 struct svga_hw_view_state
@@ -343,9 +348,12 @@
    unsigned rs[SVGA3D_RS_MAX];
    /** VGPU9 texture sampler and bindings state */
    unsigned ts[SVGA3D_PIXEL_SAMPLERREG_MAX][SVGA3D_TS_MAX];
+
    /** VGPU9 texture views */
    unsigned num_views;
+   unsigned num_backed_views; /* views with backing copy of texture */
    struct svga_hw_view_state views[PIPE_MAX_SAMPLERS];
+
    /** VGPU9 constant buffer values */
    float cb[PIPE_SHADER_TYPES][SVGA3D_CONSTREG_MAX][4];
 
@@ -393,10 +401,6 @@
    struct pipe_sampler_view
       *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS];
 
-   unsigned num_rendertargets;
-   struct pipe_surface *rtv[SVGA3D_MAX_RENDER_TARGETS];
-   struct pipe_surface *dsv;
-
    /* used for rebinding */
    unsigned default_constbuf_size[PIPE_SHADER_TYPES];
 
@@ -675,6 +679,9 @@
 svga_context_create(struct pipe_screen *screen,
                     void *priv, unsigned flags);
 
+void svga_toggle_render_condition(struct svga_context *svga,
+                                  boolean render_condition_enabled,
+                                  boolean on);
 
 /***********************************************************************
  * Inline conversion functions.  These are better-typed than the
diff --git a/src/gallium/drivers/svga/svga_debug.h b/src/gallium/drivers/svga/svga_debug.h
index 039f79d..3686cc6 100644
--- a/src/gallium/drivers/svga/svga_debug.h
+++ b/src/gallium/drivers/svga/svga_debug.h
@@ -45,6 +45,7 @@
 #define DEBUG_QUERY        0x4000
 #define DEBUG_CACHE        0x8000
 #define DEBUG_STREAMOUT    0x10000
+#define DEBUG_SAMPLERS     0x20000
 
 #ifdef DEBUG
 extern int SVGA_DEBUG;
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 988267b..5919bd3 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -38,6 +38,7 @@
 #include "svga_resource.h"
 #include "svga_resource_buffer.h"
 #include "svga_resource_texture.h"
+#include "svga_sampler_view.h"
 #include "svga_shader.h"
 #include "svga_surface.h"
 #include "svga_winsys.h"
@@ -74,7 +75,7 @@
    }
 
    for (i = 0; i < hwtnl->cmd.vbuf_count; i++)
-      pipe_resource_reference(&hwtnl->cmd.vbufs[i].buffer, NULL);
+      pipe_vertex_buffer_unreference(&hwtnl->cmd.vbufs[i]);
 
    for (i = 0; i < hwtnl->cmd.prim_count; i++)
       pipe_resource_reference(&hwtnl->cmd.prim_ib[i], NULL);
@@ -134,8 +135,21 @@
 svga_hwtnl_vertex_buffers(struct svga_hwtnl *hwtnl,
                           unsigned count, struct pipe_vertex_buffer *buffers)
 {
-   util_set_vertex_buffers_count(hwtnl->cmd.vbufs,
-                                 &hwtnl->cmd.vbuf_count, buffers, 0, count);
+   struct pipe_vertex_buffer *dst = hwtnl->cmd.vbufs;
+   const struct pipe_vertex_buffer *src = buffers;
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      pipe_vertex_buffer_reference(&dst[i], &src[i]);
+   }
+
+   /* release old buffer references */
+   for ( ; i < hwtnl->cmd.vbuf_count; i++) {
+      pipe_vertex_buffer_unreference(&dst[i]);
+      /* don't bother zeroing stride/offset fields */
+   }
+
+   hwtnl->cmd.vbuf_count = count;
 }
 
 
@@ -158,7 +172,7 @@
    }
 
    for (i = 0; i < hwtnl->cmd.vbuf_count; ++i) {
-      if (hwtnl->cmd.vbufs[i].buffer == buffer) {
+      if (hwtnl->cmd.vbufs[i].buffer.resource == buffer) {
          return TRUE;
       }
    }
@@ -186,9 +200,28 @@
    SVGA3dPrimitiveRange *prim;
    unsigned i;
 
+   /* Re-validate those sampler views with backing copy
+    * of texture whose original copy has been updated.
+    * This is done here at draw time because the texture binding might not
+    * have modified, hence validation is not triggered at state update time,
+    * and yet the texture might have been updated in another context, so
+    * we need to re-validate the sampler view in order to update the backing
+    * copy of the updated texture.
+    */
+   if (svga->state.hw_draw.num_backed_views) {
+      for (i = 0; i < svga->state.hw_draw.num_views; i++) {
+         struct svga_hw_view_state *view = &svga->state.hw_draw.views[i];
+         struct svga_texture *tex = svga_texture(view->texture);
+         struct svga_sampler_view *sv = view->v;
+         if (sv && tex && sv->handle != tex->handle && sv->age < tex->age)
+            svga_validate_sampler_view(svga, view->v);
+      }
+   }
+
    for (i = 0; i < hwtnl->cmd.vdecl_count; i++) {
       unsigned j = hwtnl->cmd.vdecl_buffer_index[i];
-      handle = svga_buffer_handle(svga, hwtnl->cmd.vbufs[j].buffer);
+      handle = svga_buffer_handle(svga, hwtnl->cmd.vbufs[j].buffer.resource,
+                                  PIPE_BIND_VERTEX_BUFFER);
       if (!handle)
          return PIPE_ERROR_OUT_OF_MEMORY;
 
@@ -197,7 +230,8 @@
 
    for (i = 0; i < hwtnl->cmd.prim_count; i++) {
       if (hwtnl->cmd.prim_ib[i]) {
-         handle = svga_buffer_handle(svga, hwtnl->cmd.prim_ib[i]);
+         handle = svga_buffer_handle(svga, hwtnl->cmd.prim_ib[i],
+                                     PIPE_BIND_INDEX_BUFFER);
          if (!handle)
             return PIPE_ERROR_OUT_OF_MEMORY;
       }
@@ -331,7 +365,8 @@
 
          if (sv) {
             if (sv->base.texture->target == PIPE_BUFFER) {
-               surfaces[i] = svga_buffer_handle(svga, sv->base.texture);
+               surfaces[i] = svga_buffer_handle(svga, sv->base.texture,
+                                                PIPE_BIND_SAMPLER_VIEW);
             }
             else {
                surfaces[i] = svga_texture(sv->base.texture)->handle;
@@ -408,7 +443,8 @@
          unsigned i = u_bit_scan(&enabled_constbufs);
          buffer = svga_buffer(svga->curr.constbufs[shader][i].buffer);
          if (buffer) {
-            handle = svga_buffer_handle(svga, &buffer->b.b);
+            handle = svga_buffer_handle(svga, &buffer->b.b,
+                                        PIPE_BIND_CONSTANT_BUFFER);
 
             if (svga->rebind.flags.constbufs) {
                ret = svga->swc->resource_rebind(svga->swc,
@@ -509,11 +545,12 @@
 
    /* Get handle for each referenced vertex buffer */
    for (i = 0; i < vbuf_count; i++) {
-      struct svga_buffer *sbuf = svga_buffer(hwtnl->cmd.vbufs[i].buffer);
+      struct svga_buffer *sbuf = svga_buffer(hwtnl->cmd.vbufs[i].buffer.resource);
 
       if (sbuf) {
+         vbuffer_handles[i] = svga_buffer_handle(svga, &sbuf->b.b,
+                                                 PIPE_BIND_VERTEX_BUFFER);
          assert(sbuf->key.flags & SVGA3D_SURFACE_BIND_VERTEX_BUFFER);
-         vbuffer_handles[i] = svga_buffer_handle(svga, &sbuf->b.b);
          if (vbuffer_handles[i] == NULL)
             return PIPE_ERROR_OUT_OF_MEMORY;
          vbuffers[i] = &sbuf->b.b;
@@ -537,7 +574,7 @@
       assert(sbuf->key.flags & SVGA3D_SURFACE_BIND_INDEX_BUFFER);
       (void) sbuf; /* silence unused var warning */
 
-      ib_handle = svga_buffer_handle(svga, ib);
+      ib_handle = svga_buffer_handle(svga, ib, PIPE_BIND_INDEX_BUFFER);
       if (!ib_handle)
          return PIPE_ERROR_OUT_OF_MEMORY;
    }
@@ -583,6 +620,16 @@
           */
          num_vbuffers = MAX2(vbuf_count, svga->state.hw_draw.num_vbuffers);
 
+         /* Zero-out the old buffers we want to unbind (the number of loop
+          * iterations here is typically very small, and often zero.)
+          */
+         for (i = vbuf_count; i < num_vbuffers; i++) {
+            vbuffer_attrs[i].sid = 0;
+            vbuffer_attrs[i].stride = 0;
+            vbuffer_attrs[i].offset = 0;
+            vbuffer_handles[i] = NULL;
+         }
+
          if (num_vbuffers > 0) {
 
             ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, num_vbuffers,
@@ -773,7 +820,7 @@
    for (i = 0; i < hwtnl->cmd.vdecl_count; i++) {
       unsigned j = hwtnl->cmd.vdecl_buffer_index[i];
       const struct pipe_vertex_buffer *vb = &hwtnl->cmd.vbufs[j];
-      unsigned size = vb->buffer ? vb->buffer->width0 : 0;
+      unsigned size = vb->buffer.resource ? vb->buffer.resource->width0 : 0;
       unsigned offset = hwtnl->cmd.vdecl[i].array.offset;
       unsigned stride = hwtnl->cmd.vdecl[i].array.stride;
       int index_bias = (int) range->indexBias + hwtnl->index_bias;
diff --git a/src/gallium/drivers/svga/svga_format.c b/src/gallium/drivers/svga/svga_format.c
index 2969259..95dd04d 100644
--- a/src/gallium/drivers/svga/svga_format.c
+++ b/src/gallium/drivers/svga/svga_format.c
@@ -43,6 +43,11 @@
    unsigned flags;
 };
 
+struct format_compat_entry
+{
+   enum pipe_format pformat;
+   const SVGA3dSurfaceFormat *compat_format;
+};
 
 static const struct vgpu10_format_entry format_conversion_table[] =
 {
@@ -376,6 +381,29 @@
 }
 
 
+/**
+ * Translate a gallium scanout format to a svga format valid
+ * for screen target surface.
+ */
+static SVGA3dSurfaceFormat
+svga_translate_screen_target_format_vgpu10(enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return SVGA3D_B8G8R8A8_UNORM;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return SVGA3D_B8G8R8X8_UNORM;
+   case PIPE_FORMAT_B5G6R5_UNORM:
+      return SVGA3D_R5G6B5;
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
+      return SVGA3D_A1R5G5B5;
+   default:
+      debug_printf("Invalid format %s specified for screen target\n",
+                   svga_format_name(format));
+      return SVGA3D_FORMAT_INVALID;
+   }
+}
+
 /*
  * Translate from gallium format to SVGA3D format.
  */
@@ -388,6 +416,9 @@
       if (bind & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER)) {
          return format_conversion_table[format].vertex_format;
       }
+      else if (bind & PIPE_BIND_SCANOUT) {
+         return svga_translate_screen_target_format_vgpu10(format);
+      }
       else {
          return format_conversion_table[format].pixel_format;
       }
@@ -499,10 +530,10 @@
  * avoid querying the host.  In particular, depth/stencil formats which
  * can be rendered to and sampled from.  For example, the gallium format
  * PIPE_FORMAT_Z24_UNORM_S8_UINT is converted to SVGA3D_D24_UNORM_S8_UINT
- * for rendering but converted to SVGA3D_R24_UNORM_X8_TYPELESS for sampling.
+ * for rendering but converted to SVGA3D_R24_UNORM_X8 for sampling.
  * If we want to query if a format supports both rendering and sampling the
  * host will tell us no for SVGA3D_D24_UNORM_S8_UINT, SVGA3D_D16_UNORM and
- * SVGA3D_R24_UNORM_X8_TYPELESS.  So we override the host query for those
+ * SVGA3D_R24_UNORM_X8.  So we override the host query for those
  * formats and report that both can do rendering and sampling.
  */
 static const struct format_cap format_cap_table[] = {
@@ -869,179 +900,116 @@
       "SVGA3D_UYVY",
       SVGA3D_UYVY,
       SVGA3D_DEVCAP_SURFACEFMT_UYVY,
-      0, 0, 0,
-      0
+      0, 0, 0, 0
    },
    {
       "SVGA3D_YUY2",
       SVGA3D_YUY2,
       SVGA3D_DEVCAP_SURFACEFMT_YUY2,
-      0, 0, 0,
-      0
+      0, 0, 0, 0
    },
    {
       "SVGA3D_NV12",
       SVGA3D_NV12,
       SVGA3D_DEVCAP_SURFACEFMT_NV12,
-      0, 0, 0,
-      0
+      0, 0, 0, 0
    },
    {
       "SVGA3D_AYUV",
       SVGA3D_AYUV,
       SVGA3D_DEVCAP_SURFACEFMT_AYUV,
-      0, 0, 0,
-      0
+      0, 0, 0, 0
    },
    {
       "SVGA3D_R32G32B32A32_TYPELESS",
       SVGA3D_R32G32B32A32_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R32G32B32A32_TYPELESS,
-      1, 1, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 16, 0
    },
    {
       "SVGA3D_R32G32B32A32_UINT",
       SVGA3D_R32G32B32A32_UINT,
       SVGA3D_DEVCAP_DXFMT_R32G32B32A32_UINT,
-      1, 1, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 16, 0
    },
    {
       "SVGA3D_R32G32B32A32_SINT",
       SVGA3D_R32G32B32A32_SINT,
       SVGA3D_DEVCAP_DXFMT_R32G32B32A32_SINT,
-      1, 1, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 16, 0
    },
    {
       "SVGA3D_R32G32B32_TYPELESS",
       SVGA3D_R32G32B32_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R32G32B32_TYPELESS,
-      1, 1, 12,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 12, 0
    },
    {
       "SVGA3D_R32G32B32_FLOAT",
       SVGA3D_R32G32B32_FLOAT,
       SVGA3D_DEVCAP_DXFMT_R32G32B32_FLOAT,
-      1, 1, 12,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 12, 0
    },
    {
       "SVGA3D_R32G32B32_UINT",
       SVGA3D_R32G32B32_UINT,
       SVGA3D_DEVCAP_DXFMT_R32G32B32_UINT,
-      1, 1, 12,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 12, 0
    },
    {
       "SVGA3D_R32G32B32_SINT",
       SVGA3D_R32G32B32_SINT,
       SVGA3D_DEVCAP_DXFMT_R32G32B32_SINT,
-      1, 1, 12,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 12, 0
    },
    {
       "SVGA3D_R16G16B16A16_TYPELESS",
       SVGA3D_R16G16B16A16_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R16G16B16A16_TYPELESS,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 8, 0
    },
    {
       "SVGA3D_R16G16B16A16_UINT",
       SVGA3D_R16G16B16A16_UINT,
       SVGA3D_DEVCAP_DXFMT_R16G16B16A16_UINT,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 8, 0
    },
    {
       "SVGA3D_R16G16B16A16_SNORM",
       SVGA3D_R16G16B16A16_SNORM,
       SVGA3D_DEVCAP_DXFMT_R16G16B16A16_SNORM,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 8, 0
    },
    {
       "SVGA3D_R16G16B16A16_SINT",
       SVGA3D_R16G16B16A16_SINT,
       SVGA3D_DEVCAP_DXFMT_R16G16B16A16_SINT,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 8, 0
    },
    {
       "SVGA3D_R32G32_TYPELESS",
       SVGA3D_R32G32_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R32G32_TYPELESS,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 8, 0
    },
    {
       "SVGA3D_R32G32_UINT",
       SVGA3D_R32G32_UINT,
       SVGA3D_DEVCAP_DXFMT_R32G32_UINT,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 8, 0
    },
    {
       "SVGA3D_R32G32_SINT",
       SVGA3D_R32G32_SINT,
       SVGA3D_DEVCAP_DXFMT_R32G32_SINT,
       1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      0
    },
    {
       "SVGA3D_R32G8X24_TYPELESS",
       SVGA3D_R32G8X24_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R32G8X24_TYPELESS,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_ZSTENCIL
+      1, 1, 8, 0
    },
    {
       /* Special case: no devcap / report sampler and depth/stencil ability
@@ -1058,8 +1026,8 @@
    {
       /* Special case: no devcap / report sampler and depth/stencil ability
        */
-      "SVGA3D_R32_FLOAT_X8X24_TYPELESS",
-      SVGA3D_R32_FLOAT_X8X24_TYPELESS,
+      "SVGA3D_R32_FLOAT_X8X24",
+      SVGA3D_R32_FLOAT_X8X24,
       0, /*SVGA3D_DEVCAP_DXFMT_R32_FLOAT_X8X24_TYPELESS*/
       1, 1, 8,
       SVGA3DFORMAT_OP_TEXTURE |
@@ -1068,135 +1036,82 @@
       SVGA3DFORMAT_OP_ZSTENCIL
    },
    {
-      "SVGA3D_X32_TYPELESS_G8X24_UINT",
-      SVGA3D_X32_TYPELESS_G8X24_UINT,
+      "SVGA3D_X32_G8X24_UINT",
+      SVGA3D_X32_G8X24_UINT,
       SVGA3D_DEVCAP_DXFMT_X32_TYPELESS_G8X24_UINT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R10G10B10A2_TYPELESS",
       SVGA3D_R10G10B10A2_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R10G10B10A2_TYPELESS,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R10G10B10A2_UINT",
       SVGA3D_R10G10B10A2_UINT,
       SVGA3D_DEVCAP_DXFMT_R10G10B10A2_UINT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R11G11B10_FLOAT",
       SVGA3D_R11G11B10_FLOAT,
       SVGA3D_DEVCAP_DXFMT_R11G11B10_FLOAT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R8G8B8A8_TYPELESS",
       SVGA3D_R8G8B8A8_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R8G8B8A8_TYPELESS,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R8G8B8A8_UNORM",
       SVGA3D_R8G8B8A8_UNORM,
       SVGA3D_DEVCAP_DXFMT_R8G8B8A8_UNORM,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R8G8B8A8_UNORM_SRGB",
       SVGA3D_R8G8B8A8_UNORM_SRGB,
       SVGA3D_DEVCAP_DXFMT_R8G8B8A8_UNORM_SRGB,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R8G8B8A8_UINT",
       SVGA3D_R8G8B8A8_UINT,
       SVGA3D_DEVCAP_DXFMT_R8G8B8A8_UINT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
       },
    {
       "SVGA3D_R8G8B8A8_SINT",
       SVGA3D_R8G8B8A8_SINT,
       SVGA3D_DEVCAP_DXFMT_R8G8B8A8_SINT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R16G16_TYPELESS",
       SVGA3D_R16G16_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R16G16_TYPELESS,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R16G16_UINT",
       SVGA3D_R16G16_UINT,
       SVGA3D_DEVCAP_DXFMT_R16G16_UINT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R16G16_SINT",
       SVGA3D_R16G16_SINT,
       SVGA3D_DEVCAP_DXFMT_R16G16_SINT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R32_TYPELESS",
       SVGA3D_R32_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R32_TYPELESS,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_ZSTENCIL |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       /* Special case: no devcap / report sampler and depth/stencil ability
@@ -1214,31 +1129,19 @@
       "SVGA3D_R32_UINT",
       SVGA3D_R32_UINT,
       SVGA3D_DEVCAP_DXFMT_R32_UINT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R32_SINT",
       SVGA3D_R32_SINT,
       SVGA3D_DEVCAP_DXFMT_R32_SINT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R24G8_TYPELESS",
       SVGA3D_R24G8_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R24G8_TYPELESS,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_ZSTENCIL
+      1, 1, 4, 0
    },
    {
       /* Special case: no devcap / report sampler and depth/stencil ability
@@ -1255,8 +1158,8 @@
    {
       /* Special case: no devcap / report sampler and depth/stencil ability
        */
-      "SVGA3D_R24_UNORM_X8_TYPELESS",
-      SVGA3D_R24_UNORM_X8_TYPELESS,
+      "SVGA3D_R24_UNORM_X8",
+      SVGA3D_R24_UNORM_X8,
       0, /*SVGA3D_DEVCAP_DXFMT_R24_UNORM_X8_TYPELESS*/
       1, 1, 4,
       SVGA3DFORMAT_OP_TEXTURE |
@@ -1265,155 +1168,94 @@
       SVGA3DFORMAT_OP_ZSTENCIL
    },
    {
-      "SVGA3D_X24_TYPELESS_G8_UINT",
-      SVGA3D_X24_TYPELESS_G8_UINT,
+      "SVGA3D_X24_G8_UINT",
+      SVGA3D_X24_G8_UINT,
       SVGA3D_DEVCAP_DXFMT_X24_TYPELESS_G8_UINT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_ZSTENCIL
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R8G8_TYPELESS",
       SVGA3D_R8G8_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R8G8_TYPELESS,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R8G8_UNORM",
       SVGA3D_R8G8_UNORM,
       SVGA3D_DEVCAP_DXFMT_R8G8_UNORM,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R8G8_UINT",
       SVGA3D_R8G8_UINT,
       SVGA3D_DEVCAP_DXFMT_R8G8_UINT,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R8G8_SINT",
       SVGA3D_R8G8_SINT,
       SVGA3D_DEVCAP_DXFMT_R8G8_SINT,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R16_TYPELESS",
       SVGA3D_R16_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R16_TYPELESS,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_ZSTENCIL |
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R16_UNORM",
       SVGA3D_R16_UNORM,
       SVGA3D_DEVCAP_DXFMT_R16_UNORM,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R16_UINT",
       SVGA3D_R16_UINT,
       SVGA3D_DEVCAP_DXFMT_R16_UINT,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R16_SNORM",
       SVGA3D_R16_SNORM,
       SVGA3D_DEVCAP_DXFMT_R16_SNORM,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R16_SINT",
       SVGA3D_R16_SINT,
       SVGA3D_DEVCAP_DXFMT_R16_SINT,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R8_TYPELESS",
       SVGA3D_R8_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_R8_TYPELESS,
-      1, 1, 1,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 1, 0
    },
    {
       "SVGA3D_R8_UNORM",
       SVGA3D_R8_UNORM,
       SVGA3D_DEVCAP_DXFMT_R8_UNORM,
-      1, 1, 1,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 1, 0
    },
    {
       "SVGA3D_R8_UINT",
       SVGA3D_R8_UINT,
       SVGA3D_DEVCAP_DXFMT_R8_UINT,
-      1, 1, 1,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 1, 0
    },
    {
       "SVGA3D_R8_SNORM",
       SVGA3D_R8_SNORM,
       SVGA3D_DEVCAP_DXFMT_R8_SNORM,
-      1, 1, 1,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 1, 0
    },
    {
       "SVGA3D_R8_SINT",
       SVGA3D_R8_SINT,
       SVGA3D_DEVCAP_DXFMT_R8_SINT,
-      1, 1, 1,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 1, 0
    },
    {
       "SVGA3D_P8",
@@ -1423,11 +1265,7 @@
       "SVGA3D_R9G9B9E5_SHAREDEXP",
       SVGA3D_R9G9B9E5_SHAREDEXP,
       SVGA3D_DEVCAP_DXFMT_R9G9B9E5_SHAREDEXP,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R8G8_B8G8_UNORM",
@@ -1441,56 +1279,42 @@
       "SVGA3D_BC1_TYPELESS",
       SVGA3D_BC1_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_BC1_TYPELESS,
-      4, 4, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 8, 0
    },
    {
       "SVGA3D_BC1_UNORM_SRGB",
       SVGA3D_BC1_UNORM_SRGB,
       SVGA3D_DEVCAP_DXFMT_BC1_UNORM_SRGB,
-      4, 4, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 8, 0
    },
    {
       "SVGA3D_BC2_TYPELESS",
       SVGA3D_BC2_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_BC2_TYPELESS,
-      4, 4, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 16, 0
    },
    {
       "SVGA3D_BC2_UNORM_SRGB",
       SVGA3D_BC2_UNORM_SRGB,
       SVGA3D_DEVCAP_DXFMT_BC2_UNORM_SRGB,
-      4, 4, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 16, 0
    },
    {
       "SVGA3D_BC3_TYPELESS",
       SVGA3D_BC3_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_BC3_TYPELESS,
-      4, 4, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 16, 0
    },
    {
       "SVGA3D_BC3_UNORM_SRGB",
       SVGA3D_BC3_UNORM_SRGB,
-      4, 4, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 16, 0
    },
    {
       "SVGA3D_BC4_TYPELESS",
       SVGA3D_BC4_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_BC4_TYPELESS,
-      4, 4, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 8, 0
    },
    {
       "SVGA3D_ATI1",
@@ -1500,17 +1324,13 @@
       "SVGA3D_BC4_SNORM",
       SVGA3D_BC4_SNORM,
       SVGA3D_DEVCAP_DXFMT_BC4_SNORM,
-      4, 4, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 8, 0
    },
    {
       "SVGA3D_BC5_TYPELESS",
       SVGA3D_BC5_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_BC5_TYPELESS,
-      4, 4, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 16, 0
    },
    {
       "SVGA3D_ATI2",
@@ -1520,9 +1340,7 @@
       "SVGA3D_BC5_SNORM",
       SVGA3D_BC5_SNORM,
       SVGA3D_DEVCAP_DXFMT_BC5_SNORM,
-      4, 4, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 16, 0
    },
    {
       "SVGA3D_R10G10B10_XR_BIAS_A2_UNORM",
@@ -1532,65 +1350,43 @@
       "SVGA3D_B8G8R8A8_TYPELESS",
       SVGA3D_B8G8R8A8_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_B8G8R8A8_TYPELESS,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_B8G8R8A8_UNORM_SRGB",
       SVGA3D_B8G8R8A8_UNORM_SRGB,
       SVGA3D_DEVCAP_DXFMT_B8G8R8A8_UNORM_SRGB,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_B8G8R8X8_TYPELESS",
       SVGA3D_B8G8R8X8_TYPELESS,
       SVGA3D_DEVCAP_DXFMT_B8G8R8X8_TYPELESS,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_B8G8R8X8_UNORM_SRGB",
       SVGA3D_B8G8R8X8_UNORM_SRGB,
       SVGA3D_DEVCAP_DXFMT_B8G8R8X8_UNORM_SRGB,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_Z_DF16",
       SVGA3D_Z_DF16,
       SVGA3D_DEVCAP_SURFACEFMT_Z_DF16,
-      1, 1, 2,
-      0
+      1, 1, 2, 0
    },
    {
       "SVGA3D_Z_DF24",
       SVGA3D_Z_DF24,
       SVGA3D_DEVCAP_SURFACEFMT_Z_DF24,
-      1, 1, 4,
-      0
+      1, 1, 4, 0
    },
    {
       "SVGA3D_Z_D24S8_INT",
       SVGA3D_Z_D24S8_INT,
       SVGA3D_DEVCAP_SURFACEFMT_Z_D24S8_INT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_ZSTENCIL
+      1, 1, 4, 0
    },
    {
       "SVGA3D_YV12",
@@ -1600,121 +1396,73 @@
       "SVGA3D_R32G32B32A32_FLOAT",
       SVGA3D_R32G32B32A32_FLOAT,
       SVGA3D_DEVCAP_DXFMT_R32G32B32A32_FLOAT,
-      1, 1, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 16, 0
    },
    {
       "SVGA3D_R16G16B16A16_FLOAT",
       SVGA3D_R16G16B16A16_FLOAT,
       SVGA3D_DEVCAP_DXFMT_R16G16B16A16_FLOAT,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 8, 0
    },
    {
       "SVGA3D_R16G16B16A16_UNORM",
       SVGA3D_R16G16B16A16_UNORM,
       SVGA3D_DEVCAP_DXFMT_R16G16B16A16_UNORM,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 8, 0
    },
    {
       "SVGA3D_R32G32_FLOAT",
       SVGA3D_R32G32_FLOAT,
       SVGA3D_DEVCAP_DXFMT_R32G32_FLOAT,
-      1, 1, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 8, 0
    },
    {
       "SVGA3D_R10G10B10A2_UNORM",
       SVGA3D_R10G10B10A2_UNORM,
       SVGA3D_DEVCAP_DXFMT_R10G10B10A2_UNORM,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R8G8B8A8_SNORM",
       SVGA3D_R8G8B8A8_SNORM,
       SVGA3D_DEVCAP_DXFMT_R8G8B8A8_SNORM,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R16G16_FLOAT",
       SVGA3D_R16G16_FLOAT,
       SVGA3D_DEVCAP_DXFMT_R16G16_FLOAT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R16G16_UNORM",
       SVGA3D_R16G16_UNORM,
       SVGA3D_DEVCAP_DXFMT_R16G16_UNORM,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R16G16_SNORM",
       SVGA3D_R16G16_SNORM,
       SVGA3D_DEVCAP_DXFMT_R16G16_SNORM,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R32_FLOAT",
       SVGA3D_R32_FLOAT,
       SVGA3D_DEVCAP_DXFMT_R32_FLOAT,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_R8G8_SNORM",
       SVGA3D_R8G8_SNORM,
       SVGA3D_DEVCAP_DXFMT_R8G8_SNORM,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_R16_FLOAT",
       SVGA3D_R16_FLOAT,
       SVGA3D_DEVCAP_DXFMT_R16_FLOAT,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_D16_UNORM",
@@ -1730,97 +1478,85 @@
       "SVGA3D_A8_UNORM",
       SVGA3D_A8_UNORM,
       SVGA3D_DEVCAP_DXFMT_A8_UNORM,
-      1, 1, 1,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 1, 0
    },
    {
       "SVGA3D_BC1_UNORM",
       SVGA3D_BC1_UNORM,
       SVGA3D_DEVCAP_DXFMT_BC1_UNORM,
-      4, 4, 8,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 8, 0
    },
    {
       "SVGA3D_BC2_UNORM",
       SVGA3D_BC2_UNORM,
       SVGA3D_DEVCAP_DXFMT_BC2_UNORM,
-      4, 4, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 16, 0
    },
    {
       "SVGA3D_BC3_UNORM",
       SVGA3D_BC3_UNORM,
       SVGA3D_DEVCAP_DXFMT_BC3_UNORM,
-      4, 4, 16,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE
+      4, 4, 16, 0
    },
    {
       "SVGA3D_B5G6R5_UNORM",
       SVGA3D_B5G6R5_UNORM,
       SVGA3D_DEVCAP_DXFMT_B5G6R5_UNORM,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_DISPLAYMODE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_B5G5R5A1_UNORM",
       SVGA3D_B5G5R5A1_UNORM,
       SVGA3D_DEVCAP_DXFMT_B5G5R5A1_UNORM,
-      1, 1, 2,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_DISPLAYMODE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 2, 0
    },
    {
       "SVGA3D_B8G8R8A8_UNORM",
       SVGA3D_B8G8R8A8_UNORM,
       SVGA3D_DEVCAP_DXFMT_B8G8R8A8_UNORM,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_B8G8R8X8_UNORM",
       SVGA3D_B8G8R8X8_UNORM,
       SVGA3D_DEVCAP_DXFMT_B8G8R8X8_UNORM,
-      1, 1, 4,
-      SVGA3DFORMAT_OP_TEXTURE |
-      SVGA3DFORMAT_OP_CUBETEXTURE |
-      SVGA3DFORMAT_OP_VOLUMETEXTURE |
-      SVGA3DFORMAT_OP_DISPLAYMODE |
-      SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET
+      1, 1, 4, 0
    },
    {
       "SVGA3D_BC4_UNORM",
      SVGA3D_BC4_UNORM,
      SVGA3D_DEVCAP_DXFMT_BC4_UNORM,
-     4, 4, 8,
-     SVGA3DFORMAT_OP_TEXTURE |
-     SVGA3DFORMAT_OP_CUBETEXTURE
+     4, 4, 8, 0
    },
    {
       "SVGA3D_BC5_UNORM",
      SVGA3D_BC5_UNORM,
      SVGA3D_DEVCAP_DXFMT_BC5_UNORM,
-     4, 4, 16,
-     SVGA3DFORMAT_OP_TEXTURE |
-     SVGA3DFORMAT_OP_CUBETEXTURE
+     4, 4, 16, 0
    }
 };
 
+static const SVGA3dSurfaceFormat compat_x8r8g8b8[] = {
+   SVGA3D_X8R8G8B8, SVGA3D_A8R8G8B8, SVGA3D_B8G8R8X8_UNORM,
+   SVGA3D_B8G8R8A8_UNORM, 0
+};
+static const SVGA3dSurfaceFormat compat_r8[] = {
+   SVGA3D_R8_UNORM, SVGA3D_NV12, SVGA3D_YV12, 0
+};
+static const SVGA3dSurfaceFormat compat_g8r8[] = {
+   SVGA3D_R8G8_UNORM, SVGA3D_NV12, 0
+};
+static const SVGA3dSurfaceFormat compat_r5g6b5[] = {
+   SVGA3D_R5G6B5, SVGA3D_B5G6R5_UNORM, 0
+};
+
+static const struct format_compat_entry format_compats[] = {
+   {PIPE_FORMAT_B8G8R8X8_UNORM, compat_x8r8g8b8},
+   {PIPE_FORMAT_B8G8R8A8_UNORM, compat_x8r8g8b8},
+   {PIPE_FORMAT_R8_UNORM, compat_r8},
+   {PIPE_FORMAT_R8G8_UNORM, compat_g8r8},
+   {PIPE_FORMAT_B5G6R5_UNORM, compat_r5g6b5}
+};
 
 /**
  * Debug only:
@@ -1875,6 +1611,8 @@
    assert(entry->format == format);
 
    if (entry->devcap && sws->get_cap(sws, entry->devcap, &result)) {
+      assert(format < SVGA3D_UYVY || entry->defaultOperations == 0);
+
       /* Explicitly advertised format */
       if (entry->devcap > SVGA3D_DEVCAP_DX) {
          /* Translate DX/VGPU10 format cap to VGPU9 cap */
@@ -2089,8 +1827,8 @@
       return SVGA3D_R32G32_TYPELESS;
    case SVGA3D_D32_FLOAT_S8X24_UINT:
       return SVGA3D_R32G8X24_TYPELESS;
-   case SVGA3D_X32_TYPELESS_G8X24_UINT:
-      return SVGA3D_R32_FLOAT_X8X24_TYPELESS;
+   case SVGA3D_X32_G8X24_UINT:
+      return SVGA3D_R32_FLOAT_X8X24;
    case SVGA3D_R10G10B10A2_UINT:
    case SVGA3D_R10G10B10A2_UNORM:
       return SVGA3D_R10G10B10A2_TYPELESS;
@@ -2113,8 +1851,8 @@
       return SVGA3D_R32_TYPELESS;
    case SVGA3D_D24_UNORM_S8_UINT:
       return SVGA3D_R24G8_TYPELESS;
-   case SVGA3D_X24_TYPELESS_G8_UINT:
-      return SVGA3D_R24_UNORM_X8_TYPELESS;
+   case SVGA3D_X24_G8_UINT:
+      return SVGA3D_R24_UNORM_X8;
    case SVGA3D_R8G8_UNORM:
    case SVGA3D_R8G8_SNORM:
    case SVGA3D_R8G8_UINT:
@@ -2181,11 +1919,11 @@
    case SVGA3D_D16_UNORM:
       return SVGA3D_R16_UNORM;
    case SVGA3D_D24_UNORM_S8_UINT:
-      return SVGA3D_R24_UNORM_X8_TYPELESS;
+      return SVGA3D_R24_UNORM_X8;
    case SVGA3D_D32_FLOAT:
       return SVGA3D_R32_FLOAT;
    case SVGA3D_D32_FLOAT_S8X24_UINT:
-      return SVGA3D_R32_FLOAT_X8X24_TYPELESS;
+      return SVGA3D_R32_FLOAT_X8X24;
    default:
       return format;
    }
@@ -2241,3 +1979,77 @@
       return false;
    }
 }
+
+
+/**
+ * \brief Can we import a surface with a given SVGA3D format as a texture?
+ *
+ * \param[in] ss  pointer to the svga screen.
+ * \param[in] pformat  pipe format of the local texture.
+ * \param[in] sformat  svga3d format of the imported surface.
+ * \param[in] bind  bind flags of the imported texture.
+ * \param[in] verbose  Print out incompatibilities in debug mode.
+ */
+bool
+svga_format_is_shareable(const struct svga_screen *ss,
+                         enum pipe_format pformat,
+                         SVGA3dSurfaceFormat sformat,
+                         unsigned bind,
+                         bool verbose)
+{
+   SVGA3dSurfaceFormat default_format =
+      svga_translate_format(ss, pformat, bind);
+   int i;
+
+   if (default_format == SVGA3D_FORMAT_INVALID)
+      return false;
+   if (default_format == sformat)
+      return true;
+
+   for (i = 0; i < ARRAY_SIZE(format_compats); ++i) {
+      if (format_compats[i].pformat == pformat) {
+         const SVGA3dSurfaceFormat *compat_format =
+            format_compats[i].compat_format;
+         while (*compat_format != 0) {
+            if (*compat_format == sformat)
+               return true;
+            compat_format++;
+         }
+      }
+   }
+
+   if (verbose) {
+      debug_printf("Incompatible imported surface format.\n");
+      debug_printf("Texture format: \"%s\". Imported format: \"%s\".\n",
+                   svga_format_name(default_format),
+                   svga_format_name(sformat));
+   }
+
+   return false;
+}
+
+
+/**
+  * Return the sRGB format which corresponds to the given (linear) format.
+  * If there's no such sRGB format, return the format as-is.
+  */
+SVGA3dSurfaceFormat
+svga_linear_to_srgb(SVGA3dSurfaceFormat format)
+{
+   switch (format) {
+   case SVGA3D_R8G8B8A8_UNORM:
+      return SVGA3D_R8G8B8A8_UNORM_SRGB;
+   case SVGA3D_BC1_UNORM:
+      return SVGA3D_BC1_UNORM_SRGB;
+   case SVGA3D_BC2_UNORM:
+      return SVGA3D_BC2_UNORM_SRGB;
+   case SVGA3D_BC3_UNORM:
+      return SVGA3D_BC3_UNORM_SRGB;
+   case SVGA3D_B8G8R8A8_UNORM:
+      return SVGA3D_B8G8R8A8_UNORM_SRGB;
+   case SVGA3D_B8G8R8X8_UNORM:
+      return SVGA3D_B8G8R8X8_UNORM_SRGB;
+   default:
+      return format;
+   }
+}
diff --git a/src/gallium/drivers/svga/svga_format.h b/src/gallium/drivers/svga/svga_format.h
index e8af40a..55d89ed 100644
--- a/src/gallium/drivers/svga/svga_format.h
+++ b/src/gallium/drivers/svga/svga_format.h
@@ -111,4 +111,13 @@
 bool
 svga_format_is_typeless(SVGA3dSurfaceFormat format);
 
+bool
+svga_format_is_shareable(const struct svga_screen *ss,
+                         enum pipe_format pformat,
+                         SVGA3dSurfaceFormat sformat,
+                         unsigned bind,
+                         bool verbose);
+
+SVGA3dSurfaceFormat
+svga_linear_to_srgb(SVGA3dSurfaceFormat format);
 #endif /* SVGA_FORMAT_H_ */
diff --git a/src/gallium/drivers/svga/svga_link.c b/src/gallium/drivers/svga/svga_link.c
index 5bc7f61..9c1df0c 100644
--- a/src/gallium/drivers/svga/svga_link.c
+++ b/src/gallium/drivers/svga/svga_link.c
@@ -62,7 +62,7 @@
    free_slot = outshader_info->num_outputs + 1;
 
    for (i = 0; i < inshader_info->num_inputs; i++) {
-      unsigned sem_name = inshader_info->input_semantic_name[i];
+      enum tgsi_semantic sem_name = inshader_info->input_semantic_name[i];
       unsigned sem_index = inshader_info->input_semantic_index[i];
       unsigned j;
       /**
diff --git a/src/gallium/drivers/svga/svga_msg.c b/src/gallium/drivers/svga/svga_msg.c
new file mode 100755
index 0000000..e0346de
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_msg.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright © 2016 VMware, Inc., Palo Alto, CA., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "util/u_math.h" /* for MAX2/MIN2 */
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+#include "pipe/p_defines.h"
+#include "svga_msg.h"
+
+
+#define MESSAGE_STATUS_SUCCESS  0x0001
+#define MESSAGE_STATUS_DORECV   0x0002
+#define MESSAGE_STATUS_CPT      0x0010
+#define MESSAGE_STATUS_HB       0x0080
+
+#define RPCI_PROTOCOL_NUM       0x49435052
+#define GUESTMSG_FLAG_COOKIE    0x80000000
+
+#define RETRIES                 3
+
+#define VMW_HYPERVISOR_MAGIC    0x564D5868
+#define VMW_HYPERVISOR_PORT     0x5658
+#define VMW_HYPERVISOR_HB_PORT  0x5659
+
+#define VMW_PORT_CMD_MSG        30
+#define VMW_PORT_CMD_HB_MSG     0
+#define VMW_PORT_CMD_OPEN_CHANNEL  (MSG_TYPE_OPEN << 16 | VMW_PORT_CMD_MSG)
+#define VMW_PORT_CMD_CLOSE_CHANNEL (MSG_TYPE_CLOSE << 16 | VMW_PORT_CMD_MSG)
+#define VMW_PORT_CMD_SENDSIZE   (MSG_TYPE_SENDSIZE << 16 | VMW_PORT_CMD_MSG)
+#define VMW_PORT_CMD_RECVSIZE   (MSG_TYPE_RECVSIZE << 16 | VMW_PORT_CMD_MSG)
+#define VMW_PORT_CMD_RECVSTATUS (MSG_TYPE_RECVSTATUS << 16 | VMW_PORT_CMD_MSG)
+
+#define HIGH_WORD(X) ((X & 0xFFFF0000) >> 16)
+
+
+#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION > 502)
+
+/**
+ * Hypervisor-specific bi-directional communication channel.  Should never
+ * execute on bare metal hardware.  The caller must make sure to check for
+ * supported hypervisor before using these macros.
+ *
+ * The last two parameters are both input and output and must be initialized.
+ *
+ * @cmd: [IN] Message Cmd
+ * @in_bx: [IN] Message Len, through BX
+ * @in_si: [IN] Input argument through SI, set to 0 if not used
+ * @in_di: [IN] Input argument through DI, set to 0 if not used
+ * @port_num: [IN] port number + [channel id]
+ * @magic: [IN] hypervisor magic value
+ * @ax: [OUT] value of AX register
+ * @bx: [OUT] e.g. status from an HB message status command
+ * @cx: [OUT] e.g. status from a non-HB message status command
+ * @dx: [OUT] e.g. channel id
+ * @si:  [OUT]
+ * @di:  [OUT]
+ */
+#define VMW_PORT(cmd, in_bx, in_si, in_di, \
+         port_num, magic,                  \
+         ax, bx, cx, dx, si, di)           \
+({                                         \
+   asm volatile ("inl %%dx, %%eax;" :      \
+      "=a"(ax),                            \
+      "=b"(bx),                            \
+      "=c"(cx),                            \
+      "=d"(dx),                            \
+      "=S"(si),                            \
+      "=D"(di) :                           \
+      "a"(magic),                          \
+      "b"(in_bx),                          \
+      "c"(cmd),                            \
+      "d"(port_num),                       \
+      "S"(in_si),                          \
+      "D"(in_di) :                         \
+      "memory");                           \
+})
+
+
+
+/**
+ * Hypervisor-specific bi-directional communication channel.  Should never
+ * execute on bare metal hardware.  The caller must make sure to check for
+ * supported hypervisor before using these macros.
+ *
+ * @cmd: [IN] Message Cmd
+ * @in_cx: [IN] Message Len, through CX
+ * @in_si: [IN] Input argument through SI, set to 0 if not used
+ * @in_di: [IN] Input argument through DI, set to 0 if not used
+ * @port_num: [IN] port number + [channel id]
+ * @magic: [IN] hypervisor magic value
+ * @bp:  [IN] value loaded into the BP register around the string instruction
+ * @ax: [OUT] value of AX register
+ * @bx: [OUT] e.g. status from an HB message status command
+ * @cx: [OUT] e.g. status from a non-HB message status command
+ * @dx: [OUT] e.g. channel id
+ * @si:  [OUT]
+ * @di:  [OUT]
+ */
+#if defined(PIPE_ARCH_X86_64)
+
+typedef uint64_t VMW_REG;
+
+#define VMW_PORT_HB_OUT(cmd, in_cx, in_si, in_di, \
+         port_num, magic, bp,                     \
+         ax, bx, cx, dx, si, di)                  \
+({                                                \
+   asm volatile ("push %%rbp;"                    \
+      "movq %12, %%rbp;"                          \
+      "rep outsb;"                                \
+      "pop %%rbp;" :                              \
+      "=a"(ax),                                   \
+      "=b"(bx),                                   \
+      "=c"(cx),                                   \
+      "=d"(dx),                                   \
+      "=S"(si),                                   \
+      "=D"(di) :                                  \
+      "a"(magic),                                 \
+      "b"(cmd),                                   \
+      "c"(in_cx),                                 \
+      "d"(port_num),                              \
+      "S"(in_si),                                 \
+      "D"(in_di),                                 \
+      "r"(bp) :                                   \
+      "memory", "cc");                            \
+})
+
+#define VMW_PORT_HB_IN(cmd, in_cx, in_si, in_di,  \
+         port_num, magic, bp,                     \
+         ax, bx, cx, dx, si, di)                  \
+({                                                \
+   asm volatile ("push %%rbp;"                    \
+      "movq %12, %%rbp;"                          \
+      "rep insb;"                                 \
+      "pop %%rbp" :                               \
+      "=a"(ax),                                   \
+      "=b"(bx),                                   \
+      "=c"(cx),                                   \
+      "=d"(dx),                                   \
+      "=S"(si),                                   \
+      "=D"(di) :                                  \
+      "a"(magic),                                 \
+      "b"(cmd),                                   \
+      "c"(in_cx),                                 \
+      "d"(port_num),                              \
+      "S"(in_si),                                 \
+      "D"(in_di),                                 \
+      "r"(bp) :                                   \
+      "memory", "cc");                            \
+})
+
+#else
+
+typedef uint32_t VMW_REG;
+
+/* In the 32-bit version of this macro, we use an "m" (memory) constraint
+ * for bp because there are no registers left to hold it.
+ */
+#define VMW_PORT_HB_OUT(cmd, in_cx, in_si, in_di, \
+         port_num, magic, bp,                     \
+         ax, bx, cx, dx, si, di)                  \
+({                                                \
+   asm volatile ("push %%ebp;"                    \
+      "mov %12, %%ebp;"                           \
+      "rep outsb;"                                \
+      "pop %%ebp;" :                              \
+      "=a"(ax),                                   \
+      "=b"(bx),                                   \
+      "=c"(cx),                                   \
+      "=d"(dx),                                   \
+      "=S"(si),                                   \
+      "=D"(di) :                                  \
+      "a"(magic),                                 \
+      "b"(cmd),                                   \
+      "c"(in_cx),                                 \
+      "d"(port_num),                              \
+      "S"(in_si),                                 \
+      "D"(in_di),                                 \
+      "m"(bp) :                                   \
+      "memory", "cc");                            \
+})
+
+
+#define VMW_PORT_HB_IN(cmd, in_cx, in_si, in_di,  \
+         port_num, magic, bp,                     \
+         ax, bx, cx, dx, si, di)                  \
+({                                                \
+   asm volatile ("push %%ebp;"                    \
+      "mov %12, %%ebp;"                           \
+      "rep insb;"                                 \
+      "pop %%ebp" :                               \
+      "=a"(ax),                                   \
+      "=b"(bx),                                   \
+      "=c"(cx),                                   \
+      "=d"(dx),                                   \
+      "=S"(si),                                   \
+      "=D"(di) :                                  \
+      "a"(magic),                                 \
+      "b"(cmd),                                   \
+      "c"(in_cx),                                 \
+      "d"(port_num),                              \
+      "S"(in_si),                                 \
+      "D"(in_di),                                 \
+      "m"(bp) :                                   \
+      "memory", "cc");                            \
+})
+
+#endif
+
+#else
+
+#define MSG_NOT_IMPLEMENTED 1
+
+/* not implemented */
+
+typedef uint32_t VMW_REG;
+
+
+#define VMW_PORT(cmd, in_bx, in_si, in_di, \
+         port_num, magic,                  \
+         ax, bx, cx, dx, si, di)           \
+         (void) in_bx;                     \
+         (void) ax; (void) bx; (void) cx;  \
+         (void) dx; (void) si; (void) di;
+
+#define VMW_PORT_HB_OUT(cmd, in_cx, in_si, in_di, \
+         port_num, magic, bp,                     \
+         ax, bx, cx, dx, si, di)                  \
+         (void) in_cx; (void) bp;                 \
+         (void) ax; (void) bx; (void) cx;         \
+         (void) dx; (void) si; (void) di;
+			
+
+#define VMW_PORT_HB_IN(cmd, in_cx, in_si, in_di,  \
+         port_num, magic, bp,                     \
+         ax, bx, cx, dx, si, di)                  \
+         (void) bp;                               \
+         (void) ax; (void) bx; (void) cx;         \
+         (void) dx; (void) si; (void) di;
+
+#endif /* #if PIPE_CC_GCC */
+
+
+enum rpc_msg_type {
+   MSG_TYPE_OPEN,
+   MSG_TYPE_SENDSIZE,
+   MSG_TYPE_SENDPAYLOAD,
+   MSG_TYPE_RECVSIZE,
+   MSG_TYPE_RECVPAYLOAD,
+   MSG_TYPE_RECVSTATUS,
+   MSG_TYPE_CLOSE,
+};
+
+struct rpc_channel {
+   uint16_t channel_id;
+   uint32_t cookie_high;
+   uint32_t cookie_low;
+};
+
+
+
+/**
+ * svga_open_channel
+ *
+ * @channel: RPC channel
+ * @protocol: protocol number to request; OR'ed with GUESTMSG_FLAG_COOKIE
+ *
+ * Returns: PIPE_OK on success, PIPE_ERROR otherwise
+ */
+static enum pipe_error
+svga_open_channel(struct rpc_channel *channel, unsigned protocol)
+{
+   VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si = 0, di = 0;
+
+   VMW_PORT(VMW_PORT_CMD_OPEN_CHANNEL,
+      (protocol | GUESTMSG_FLAG_COOKIE), si, di,
+      VMW_HYPERVISOR_PORT,
+      VMW_HYPERVISOR_MAGIC,
+      ax, bx, cx, dx, si, di);
+
+   if ((HIGH_WORD(cx) & MESSAGE_STATUS_SUCCESS) == 0)
+      return PIPE_ERROR;
+
+   channel->channel_id = HIGH_WORD(dx);
+   channel->cookie_high = si;
+   channel->cookie_low = di;
+
+   return PIPE_OK;
+}
+
+
+
+/**
+ * svga_close_channel
+ *
+ * @channel: RPC channel
+ *
+ * Returns: PIPE_OK on success, PIPE_ERROR otherwise
+ */
+static enum pipe_error
+svga_close_channel(struct rpc_channel *channel)
+{
+   VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si, di;
+
+   /* Set up additional parameters */
+   si = channel->cookie_high;
+   di = channel->cookie_low;
+
+   VMW_PORT(VMW_PORT_CMD_CLOSE_CHANNEL,
+      0, si, di,
+      (VMW_HYPERVISOR_PORT | (channel->channel_id << 16)),
+      VMW_HYPERVISOR_MAGIC,
+      ax, bx, cx, dx, si, di);
+
+   if ((HIGH_WORD(cx) & MESSAGE_STATUS_SUCCESS) == 0)
+      return PIPE_ERROR;
+
+   return PIPE_OK;
+}
+
+
+
+/**
+ * svga_send_msg: Sends a message to the host
+ *
+ * @channel: RPC channel
+ * @msg: NULL terminated string
+ *
+ * Returns: PIPE_OK on success
+ */
+static enum pipe_error
+svga_send_msg(struct rpc_channel *channel, const char *msg)
+{
+   VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si, di, bp;
+   size_t msg_len = strlen(msg);
+   int retries = 0;
+
+
+   while (retries < RETRIES) {
+      retries++;
+
+      /* Set up additional parameters */
+      si = channel->cookie_high;
+      di = channel->cookie_low;
+
+      VMW_PORT(VMW_PORT_CMD_SENDSIZE,
+         msg_len, si, di,
+         VMW_HYPERVISOR_PORT | (channel->channel_id << 16),
+         VMW_HYPERVISOR_MAGIC,
+         ax, bx, cx, dx, si, di);
+
+      if ((HIGH_WORD(cx) & MESSAGE_STATUS_SUCCESS) == 0 ||
+          (HIGH_WORD(cx) & MESSAGE_STATUS_HB) == 0) {
+         /* Expected success + high-bandwidth. Give up. */
+         return PIPE_ERROR;
+      }
+
+      /* Send msg */
+      si = (uintptr_t) msg;
+      di = channel->cookie_low;
+      bp = channel->cookie_high;
+
+      VMW_PORT_HB_OUT(
+         (MESSAGE_STATUS_SUCCESS << 16) | VMW_PORT_CMD_HB_MSG,
+         msg_len, si, di,
+         VMW_HYPERVISOR_HB_PORT | (channel->channel_id << 16),
+         VMW_HYPERVISOR_MAGIC, bp,
+         ax, bx, cx, dx, si, di);
+
+      if ((HIGH_WORD(bx) & MESSAGE_STATUS_SUCCESS) != 0) {
+         return PIPE_OK;
+      } else if ((HIGH_WORD(bx) & MESSAGE_STATUS_CPT) != 0) {
+         /* A checkpoint occurred. Retry. */
+         continue;
+      } else {
+         break;
+      }
+   }
+
+   return PIPE_ERROR;
+}
+
+
+
+/**
+ * svga_host_log: Sends a log message to the host
+ *
+ * @log: NULL terminated string (a NULL pointer is accepted and ignored)
+ *
+ * Returns: PIPE_OK on success or if there is nothing to do, else an error
+ */
+enum pipe_error
+svga_host_log(const char *log)
+{
+   struct rpc_channel channel;
+   char *msg;
+   int msg_len;
+   enum pipe_error ret = PIPE_OK;
+
+#ifdef MSG_NOT_IMPLEMENTED
+   return ret;
+#endif
+
+   if (!log)
+      return ret;
+
+   msg_len = strlen(log) + strlen("log ") + 1;
+   msg = CALLOC(1, msg_len);
+   if (msg == NULL) {
+      debug_printf("Cannot allocate memory for log message\n");
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+   util_sprintf(msg, "log %s", log);
+
+   /* An opened channel must be closed even when the send fails. */
+   ret = svga_open_channel(&channel, RPCI_PROTOCOL_NUM);
+   if (ret == PIPE_OK) {
+      ret = svga_send_msg(&channel, msg);
+      if (svga_close_channel(&channel) != PIPE_OK)
+         ret = PIPE_ERROR;
+   }
+   if (ret != PIPE_OK)
+      debug_printf("Failed to send log\n");
+
+   FREE(msg);
+   return ret;
+}
+
diff --git a/src/gallium/drivers/svga/svga_msg.h b/src/gallium/drivers/svga/svga_msg.h
new file mode 100644
index 0000000..9132ba7
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_msg.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright © 2016 VMware, Inc., Palo Alto, CA., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Based on code from vmware.c and vmmouse.c.
+ * Author:
+ *   Sinclair Yeh <syeh@vmware.com>
+ */
+#ifndef _SVGA_MSG_H
+#define _SVGA_MSG_H
+
+/**
+ * svga_host_log: Sends a log message to the host
+ *
+ * @log: NULL terminated string
+ *
+ * Returns: PIPE_OK on success
+ */
+enum pipe_error svga_host_log(const char *log);
+
+#endif
+
diff --git a/src/gallium/drivers/svga/svga_pipe_blit.c b/src/gallium/drivers/svga/svga_pipe_blit.c
index 23f5946..e98113c 100644
--- a/src/gallium/drivers/svga/svga_pipe_blit.c
+++ b/src/gallium/drivers/svga/svga_pipe_blit.c
@@ -1,5 +1,5 @@
 /**********************************************************
- * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ * Copyright 2008-2017 VMware, Inc.  All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
@@ -39,6 +39,38 @@
 
 
 /**
+ * Build a struct pipe_blit_info object from the arguments used by the
+ * pipe::resource_copy_region() function.
+ */
+static void
+build_blit_info(struct pipe_resource *dst_tex,
+                unsigned dst_level,
+                unsigned dst_x,
+                unsigned dst_y,
+                unsigned dst_z,
+                struct pipe_resource *src_tex,
+                unsigned src_level,
+                const struct pipe_box *src_box,
+                struct pipe_blit_info *blit)
+{
+   memset(blit, 0, sizeof(*blit));
+
+   blit->src.format = src_tex->format;
+   blit->dst.format = dst_tex->format;
+
+   blit->mask = util_format_get_mask(blit->dst.format);
+   blit->filter = PIPE_TEX_FILTER_NEAREST;
+   blit->src.resource = src_tex;
+   blit->src.level = src_level;
+   blit->dst.resource = dst_tex;
+   blit->dst.level = dst_level;
+   blit->src.box = *src_box;
+   u_box_3d(dst_x, dst_y, dst_z, src_box->width, src_box->height,
+            src_box->depth, &blit->dst.box);
+}
+
+
+/**
  * Copy an image between textures with the vgpu10 CopyRegion command.
  */
 static void
@@ -58,6 +90,8 @@
    stex = svga_texture(src_tex);
    dtex = svga_texture(dst_tex);
 
+   svga_surfaces_flush(svga);
+
    box.x = dst_x;
    box.y = dst_y;
    box.z = dst_z;
@@ -91,6 +125,26 @@
 
 
 /**
+ * Fall back to the copy region utility which uses map/memcpy for the copy
+ */
+static void
+copy_region_fallback(struct svga_context *svga, 
+                     struct pipe_resource *dst_tex, unsigned dst_level,
+                     unsigned dstx, unsigned dsty, unsigned dstz,
+                     struct pipe_resource *src_tex, unsigned src_level,
+                     const struct pipe_box *src_box)
+{
+   struct svga_winsys_screen *sws = svga_screen(svga->pipe.screen)->sws;
+
+   SVGA_STATS_TIME_PUSH(sws, SVGA_STATS_TIME_COPYREGIONFALLBACK);
+   util_resource_copy_region(&svga->pipe, dst_tex, dst_level, dstx,
+                             dsty, dstz, src_tex, src_level, src_box);
+   SVGA_STATS_TIME_POP(sws);
+   (void) sws;
+}
+
+
+/**
  * For some texture types, we need to move the z (slice) coordinate
  * to the layer value.  For example, to select the z=3 slice of a 2D ARRAY
  * texture, we need to use layer=3 and set z=0.
@@ -112,119 +166,15 @@
 }
 
 
-static void
-svga_resource_copy_region(struct pipe_context *pipe,
-                          struct pipe_resource *dst_tex,
-                          unsigned dst_level,
-                          unsigned dstx, unsigned dsty, unsigned dstz,
-                          struct pipe_resource *src_tex,
-                          unsigned src_level,
-                          const struct pipe_box *src_box)
-{
-   struct svga_context *svga = svga_context(pipe);
-   struct svga_texture *stex, *dtex;
-   unsigned dst_face_layer, dst_z, src_face_layer, src_z;
-
-   /* Emit buffered drawing commands, and any back copies.
-    */
-   svga_surfaces_flush( svga );
-
-   if (dst_tex->target == PIPE_BUFFER && src_tex->target == PIPE_BUFFER) {
-      /* can't copy within the same buffer, unfortunately */
-      if (svga_have_vgpu10(svga) && src_tex != dst_tex) {
-         enum pipe_error ret;
-         struct svga_winsys_surface *src_surf;
-         struct svga_winsys_surface *dst_surf;
-         struct svga_buffer *dbuffer = svga_buffer(dst_tex);
-
-         src_surf = svga_buffer_handle(svga, src_tex);
-         dst_surf = svga_buffer_handle(svga, dst_tex);
-
-         ret = SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf, dst_surf,
-                                        src_box->x, dstx, src_box->width);
-         if (ret != PIPE_OK) {
-            svga_context_flush(svga, NULL);
-            ret = SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf, dst_surf,
-                                           src_box->x, dstx, src_box->width);
-            assert(ret == PIPE_OK);
-         }
-
-         dbuffer->dirty = TRUE;
-      }
-      else {
-         /* use map/memcpy fallback */
-         util_resource_copy_region(pipe, dst_tex, dst_level, dstx,
-                                   dsty, dstz, src_tex, src_level, src_box);
-      }
-      return;
-   }
-
-   stex = svga_texture(src_tex);
-   dtex = svga_texture(dst_tex);
-
-   adjust_z_layer(src_tex->target, src_box->z, &src_face_layer, &src_z);
-   adjust_z_layer(dst_tex->target, dstz, &dst_face_layer, &dst_z);
-
-   if (svga_have_vgpu10(svga)) {
-      /* vgpu10 */
-      if (util_format_is_compressed(src_tex->format) ==
-          util_format_is_compressed(dst_tex->format) &&
-          stex->handle != dtex->handle &&
-          svga_resource_type(src_tex->target) ==
-          svga_resource_type(dst_tex->target) &&
-          stex->b.b.nr_samples == dtex->b.b.nr_samples) {
-         copy_region_vgpu10(svga,
-                            src_tex,
-                            src_box->x, src_box->y, src_z,
-                            src_level, src_face_layer,
-                            dst_tex,
-                            dstx, dsty, dst_z,
-                            dst_level, dst_face_layer,
-                            src_box->width, src_box->height, src_box->depth);
-      }
-      else {
-         util_resource_copy_region(pipe, dst_tex, dst_level, dstx, dsty, dstz,
-                                   src_tex, src_level, src_box);
-      }
-   }
-   else {
-      /* vgpu9 */
-      if (src_tex->format == dst_tex->format) {
-         svga_texture_copy_handle(svga,
-                                  stex->handle,
-                                  src_box->x, src_box->y, src_z,
-                                  src_level, src_face_layer,
-                                  dtex->handle,
-                                  dstx, dsty, dst_z,
-                                   dst_level, dst_face_layer,
-                                  src_box->width, src_box->height,
-                                  src_box->depth);
-      }
-      else {
-         util_resource_copy_region(pipe, dst_tex, dst_level, dstx, dsty, dstz,
-                                   src_tex, src_level, src_box);
-      }
-   }
-
-   /* Mark the destination image as being defined */
-   svga_define_texture_level(dtex, dst_face_layer, dst_level);
-}
-
-
 /**
- * Are the given pipe formats compatible, in terms of vgpu10's
+ * Are the given SVGA3D formats compatible, in terms of vgpu10's
  * PredCopyRegion() command?
  */
 static bool
 formats_compatible(const struct svga_screen *ss,
-                   enum pipe_format src_fmt,
-                   enum pipe_format dst_fmt)
+                   SVGA3dSurfaceFormat src_svga_fmt,
+                   SVGA3dSurfaceFormat dst_svga_fmt)
 {
-   SVGA3dSurfaceFormat src_svga_fmt, dst_svga_fmt;
-
-   src_svga_fmt = svga_translate_format(ss, src_fmt, PIPE_BIND_SAMPLER_VIEW);
-   dst_svga_fmt = svga_translate_format(ss, dst_fmt, PIPE_BIND_SAMPLER_VIEW);
-
    src_svga_fmt = svga_typeless_format(src_svga_fmt);
    dst_svga_fmt = svga_typeless_format(dst_svga_fmt);
 
@@ -233,6 +183,123 @@
 
 
 /**
+ * Check whether the blending is enabled or not
+ */
+static bool
+is_blending_enabled(struct svga_context *svga,
+                    const struct pipe_blit_info *blit)
+{
+   bool blend_enable = false;
+   int i;
+   if (svga->curr.blend) {
+      if (svga->curr.blend->independent_blend_enable) {
+         for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+            /* A cbufs[] slot may be NULL (nothing bound); guard the deref. */
+            struct pipe_surface *cbuf = svga->curr.framebuffer.cbufs[i];
+            if (cbuf && cbuf->texture == blit->dst.resource) {
+               blend_enable = svga->curr.blend->rt[i].blend_enable;
+               break;
+            }
+         }
+      }
+      else {
+         if (svga->curr.blend->rt[0].blend_enable)
+            blend_enable = true;
+      }
+   }
+   return blend_enable;
+}
+
+
+/**
+ * If GL_FRAMEBUFFER_SRGB is enabled, then output colorspace is
+ * expected to be sRGB if blending is not enabled.
+ * If GL_FRAMEBUFFER_SRGB is disabled, then we can use
+ * copy_region_vgpu10()
+ * Following table basically tells when copy_region_vgpu10 can be
+ * used if GL_FRAMEBUFFER_SRGB is enabled.
+ * ______________________________________________________________
+ *  | src fmt     | dst_fmt   | blending  |Can use       |
+ *  |             |           |           |copy_region   |
+ * ______________________________________________________________
+ *  | linear      | linear    |   N       |     Y        |
+ *  | linear      | linear    |   Y       |     Y        |
+ *  | linear      | sRGB      |   N       |     N        |
+ *  | linear      | sRGB      |   Y       |     Y        |
+ *  | sRGB        | linear    |   N       |     N        |
+ *  | sRGB        | linear    |   Y       |     N        |
+ *  | sRGB        | sRGB      |   N       |     Y        |
+ *  | sRGB        | sRGB      |   Y       |     N        |
+ * ______________________________________________________________
+ *
+ */
+static bool
+check_blending_and_srgb_cond(struct svga_context *svga,
+                             const struct pipe_blit_info *blit)
+{
+   enum pipe_format sFmt = blit->src.format;
+   enum pipe_format dFmt = blit->dst.format;
+
+   if (is_blending_enabled(svga, blit)) {
+      if (!util_format_is_srgb(blit->src.format))
+         return true;
+   }
+   else {
+      if (util_format_is_srgb(sFmt) && util_format_is_srgb(dFmt))
+         return true;
+      else if (!util_format_is_srgb(sFmt)){
+         if (!util_format_is_srgb(dFmt))
+            return true;
+         else {
+           /**
+            * State tracker converts all sRGB src blit format
+            * to linear if GL_FRAMEBUFFER_SRGB is disabled.
+            * So if src resource format is sRGB and
+            * blit format is linear then it means,
+            * GL_FRAMEBUFFER_SRGB is disabled. In this case also
+            * we can use copy_region_vgpu10().
+            */
+
+            if (util_format_is_srgb(blit->src.resource->format))
+               return true;
+         }
+      }
+   }
+   return false;
+}
+
+/**
+ * Do common checks for svga surface copy.
+ */
+static bool
+can_blit_via_svga_copy_region(struct svga_context *svga,
+                              const struct pipe_blit_info *blit_info)
+{
+   struct pipe_blit_info local_blit = *blit_info;
+
+   /* First basic checks to catch incompatibilities in new or locally unchecked
+    * struct pipe_blit_info members but bypass the format check here.
+    * Also since util_can_blit_via_copy_region() requires a dimension match,
+    * PIPE_TEX_FILTER_LINEAR is equivalent to PIPE_TEX_FILTER_NEAREST.
+    */
+   local_blit.dst.format = local_blit.src.format;
+   if (local_blit.filter == PIPE_TEX_FILTER_LINEAR)
+      local_blit.filter = PIPE_TEX_FILTER_NEAREST;
+   if (!util_can_blit_via_copy_region(&local_blit, TRUE))
+      return false;
+
+   /* For depth+stencil formats, copy with mask != PIPE_MASK_ZS is not
+    * supported
+    */
+   if (util_format_is_depth_and_stencil(blit_info->src.format) &&
+      blit_info->mask != (PIPE_MASK_ZS))
+     return false;
+
+   return check_blending_and_srgb_cond(svga, blit_info);
+}
+
+
+/**
  * The state tracker implements some resource copies with blits (for
  * GL_ARB_copy_image).  This function checks if we should really do the blit
  * with a VGPU10 CopyRegion command or software fallback (for incompatible
@@ -244,45 +311,130 @@
 {
    struct svga_texture *dtex, *stex;
 
-   if (!svga_have_vgpu10(svga))
+   /* can't copy between different resource types */
+   if (svga_resource_type(blit_info->src.resource->target) !=
+       svga_resource_type(blit_info->dst.resource->target))
       return false;
 
    stex = svga_texture(blit_info->src.resource);
    dtex = svga_texture(blit_info->dst.resource);
 
-   /* can't copy within one resource */
+   if (!svga_have_vgpu10(svga))
+      return false;
+
    if (stex->handle == dtex->handle)
       return false;
 
+   return formats_compatible(svga_screen(svga->pipe.screen),
+                             stex->key.format,
+                             dtex->key.format);
+}
+
+
+/**
+ * Check whether we can blit using the surface_copy command.
+ */
+static bool
+can_blit_via_surface_copy(struct svga_context *svga,
+                          const struct pipe_blit_info *blit_info)
+{
+   struct svga_texture *dtex, *stex;
+
+   /* Mimic the format tests in util_can_blit_via_copy_region(), but
+    * skip the other tests that have already been performed.
+    */
+   if (blit_info->src.format != blit_info->dst.format) {
+      const struct util_format_description *src_desc, *dst_desc;
+
+      src_desc = util_format_description(blit_info->src.resource->format);
+      dst_desc = util_format_description(blit_info->dst.resource->format);
+
+      if (blit_info->src.resource->format != blit_info->src.format ||
+          blit_info->dst.resource->format != blit_info->dst.format ||
+          !util_is_format_compatible(src_desc, dst_desc))
+         return false;
+   }
+
+   if (svga->render_condition && blit_info->render_condition_enable)
+      return false;
+
    /* can't copy between different resource types */
    if (svga_resource_type(blit_info->src.resource->target) !=
        svga_resource_type(blit_info->dst.resource->target))
       return false;
 
-   /* check that the blit src/dst regions are same size, no flipping, etc. */
-   if (blit_info->src.box.width != blit_info->dst.box.width ||
-       blit_info->src.box.height != blit_info->dst.box.height)
+   stex = svga_texture(blit_info->src.resource);
+   dtex = svga_texture(blit_info->dst.resource);
+
+   if (stex->handle == dtex->handle)
       return false;
 
-   /* check that sample counts are the same */
-   if (stex->b.b.nr_samples != dtex->b.b.nr_samples)
-      return false;
-
-   /* For depth+stencil formats, copy with mask != PIPE_MASK_ZS is not
-    * supported
+   /*
+    * This is what we've been using before, but it can probably be
+    * relaxed. The device checks are less stringent.
     */
-   if (util_format_is_depth_and_stencil(blit_info->src.format) &&
-      blit_info->mask != (PIPE_MASK_ZS))
-     return false;
+   return (stex->b.b.format == dtex->b.b.format);
+}
 
-   if (blit_info->alpha_blend ||
-       (svga->render_condition && blit_info->render_condition_enable) ||
-       blit_info->scissor_enable)
+
+/**
+ * Try region copy using one of the region copy commands
+ */
+static bool
+try_copy_region(struct svga_context *svga,
+                const struct pipe_blit_info *blit)
+{
+   unsigned src_face, src_z, dst_face, dst_z;
+
+   if (!can_blit_via_svga_copy_region(svga, blit))
       return false;
 
-   return formats_compatible(svga_screen(svga->pipe.screen),
-                             blit_info->src.resource->format,
-                             blit_info->dst.resource->format);
+   adjust_z_layer(blit->src.resource->target, blit->src.box.z,
+                  &src_face, &src_z);
+
+   adjust_z_layer(blit->dst.resource->target, blit->dst.box.z,
+                  &dst_face, &dst_z);
+
+   if (can_blit_via_copy_region_vgpu10(svga, blit)) {
+      svga_toggle_render_condition(svga, blit->render_condition_enable, FALSE);
+
+      copy_region_vgpu10(svga,
+                         blit->src.resource,
+                         blit->src.box.x, blit->src.box.y, src_z,
+                         blit->src.level, src_face,
+                         blit->dst.resource,
+                         blit->dst.box.x, blit->dst.box.y, dst_z,
+                         blit->dst.level, dst_face,
+                         blit->src.box.width, blit->src.box.height,
+                         blit->src.box.depth);
+
+      svga_toggle_render_condition(svga, blit->render_condition_enable, TRUE);
+
+      return true;
+   }
+
+   if (can_blit_via_surface_copy(svga, blit)) {
+      struct svga_texture *stex = svga_texture(blit->src.resource);
+      struct svga_texture *dtex = svga_texture(blit->dst.resource);
+
+      svga_surfaces_flush(svga);
+
+      svga_texture_copy_handle(svga,
+                               stex->handle,
+                               blit->src.box.x, blit->src.box.y, src_z,
+                               blit->src.level, src_face,
+                               dtex->handle,
+                               blit->dst.box.x, blit->dst.box.y, dst_z,
+                               blit->dst.level, dst_face,
+                               blit->src.box.width, blit->src.box.height,
+                               blit->src.box.depth);
+
+      svga_define_texture_level(dtex, dst_face, blit->dst.level);
+      svga_set_texture_rendered_to(dtex, dst_face, blit->dst.level);
+      return true;
+   }
+
+   return false;
 }
 
 
@@ -311,58 +463,30 @@
 }
 
 
-static void
-svga_blit(struct pipe_context *pipe,
-          const struct pipe_blit_info *blit_info)
+/**
+ * Try issuing a quad blit.
+ */
+static bool
+try_blit(struct svga_context *svga, const struct pipe_blit_info *blit_info)
 {
-   struct svga_context *svga = svga_context(pipe);
-   struct pipe_blit_info blit = *blit_info;
-   struct pipe_resource *src = blit.src.resource;
-   struct pipe_resource *dst = blit.dst.resource;
+   struct svga_winsys_screen *sws = svga_screen(svga->pipe.screen)->sws;
+   struct pipe_resource *src = blit_info->src.resource;
+   struct pipe_resource *dst = blit_info->dst.resource;
    struct pipe_resource *newSrc = NULL;
    struct pipe_resource *newDst = NULL;
    bool can_create_src_view;
    bool can_create_dst_view;
+   bool ret = true;
+   struct pipe_blit_info blit = *blit_info;
 
-   if (!svga_have_vgpu10(svga) &&
-       blit.src.resource->nr_samples > 1 &&
-       blit.dst.resource->nr_samples <= 1 &&
-       !util_format_is_depth_or_stencil(blit.src.resource->format) &&
-       !util_format_is_pure_integer(blit.src.resource->format)) {
-      debug_printf("svga: color resolve unimplemented\n");
-      return;
-   }
-
-   if (can_blit_via_copy_region_vgpu10(svga, blit_info)) {
-      unsigned src_face, src_z, dst_face, dst_z;
-
-      adjust_z_layer(blit.src.resource->target, blit.src.box.z,
-                     &src_face, &src_z);
-
-      adjust_z_layer(blit.dst.resource->target, blit.dst.box.z,
-                     &dst_face, &dst_z);
-
-      copy_region_vgpu10(svga,
-                         blit.src.resource,
-                         blit.src.box.x, blit.src.box.y, src_z,
-                         blit.src.level, src_face,
-                         blit.dst.resource,
-                         blit.dst.box.x, blit.dst.box.y, dst_z,
-                         blit.dst.level, dst_face,
-                         blit.src.box.width, blit.src.box.height,
-                         blit.src.box.depth);
-      return;
-   }
-
-   if (util_can_blit_via_copy_region(blit_info, TRUE) ||
-       util_can_blit_via_copy_region(blit_info, FALSE)) {
-      util_resource_copy_region(pipe, blit.dst.resource,
-                                blit.dst.level,
-                                blit.dst.box.x, blit.dst.box.y,
-                                blit.dst.box.z, blit.src.resource,
-                                blit.src.level, &blit.src.box);
-      return; /* done */
-   }
+   SVGA_STATS_TIME_PUSH(sws, SVGA_STATS_TIME_BLITBLITTER);
+   
+   /**
+    * If format is srgb and blend is enabled then color values need
+    * to be converted into linear format.
+    */
+   if (is_blending_enabled(svga, &blit))
+      blit.src.format = util_format_linear(blit.src.format);
 
    /* Check if we can create shader resource view and
     * render target view for the quad blitter to work
@@ -377,12 +501,19 @@
 
    if ((blit.mask & PIPE_MASK_S) ||
        ((!can_create_dst_view || !can_create_src_view)
-        && !svga_have_vgpu10(svga)) ||
-       !util_blitter_is_blit_supported(svga->blitter, &blit)) {
+        && !svga_have_vgpu10(svga))) {
+      /* Can't do stencil blits with textured quad blitter */
+      debug_warn_once("using software stencil blit");
+      ret = false;
+      goto done;
+   }
+
+   if (!util_blitter_is_blit_supported(svga->blitter, &blit)) {
       debug_printf("svga: blit unsupported %s -> %s\n",
                    util_format_short_name(blit.src.resource->format),
                    util_format_short_name(blit.dst.resource->format));
-      return;
+      ret = false;
+      goto done;
    }
 
    /* XXX turn off occlusion and streamout queries */
@@ -409,12 +540,10 @@
    util_blitter_save_fragment_sampler_views(svga->blitter,
                      svga->curr.num_sampler_views[PIPE_SHADER_FRAGMENT],
                      svga->curr.sampler_views[PIPE_SHADER_FRAGMENT]);
-   /*util_blitter_save_render_condition(svga->blitter, svga->render_cond_query,
-                                      svga->render_cond_cond, svga->render_cond_mode);*/
 
    if (!can_create_src_view) {
       struct pipe_resource template;
-      unsigned src_face, src_z;
+      struct pipe_blit_info copy_region_blit;
 
       /**
        * If the source blit format is not compatible with the source resource
@@ -429,30 +558,31 @@
       newSrc = svga_texture_create(svga->pipe.screen, &template);
       if (newSrc == NULL) {
          debug_printf("svga_blit: fails to create temporary src\n");
-         return;
+         ret = false;
+         goto done;
       }
 
-      /* Copy from original resource to the temporary resource */
-      adjust_z_layer(blit.src.resource->target, blit.src.box.z,
-                     &src_face, &src_z);
-
-      copy_region_vgpu10(svga,
-                         blit.src.resource,
-                         blit.src.box.x, blit.src.box.y, src_z,
-                         blit.src.level, src_face,
-                         newSrc,
-                         blit.src.box.x, blit.src.box.y, src_z,
-                         blit.src.level, src_face,
-                         blit.src.box.width, blit.src.box.height,
-                         blit.src.box.depth);
+      /* increment the mksStats for blitter with extra copy */
+      SVGA_STATS_COUNT_INC(sws, SVGA_STATS_COUNT_BLITBLITTERCOPY);
+      build_blit_info(newSrc,
+                      blit.src.level, blit.src.box.x,
+                      blit.src.box.y, blit.src.box.z,
+                      blit.src.resource,
+                      blit.src.level, &blit.src.box,
+                      &copy_region_blit);
+      if (!try_copy_region(svga, &copy_region_blit)) {
+         debug_printf("svga: Source blit format conversion failed.\n");
+         ret = false;
+         goto done;
+      }
 
       blit.src.resource = newSrc;
    }
-   
+
    if (!can_create_dst_view) {
       struct pipe_resource template;
 
-      /**
+      /*
        * If the destination blit format is not compatible with the destination
        * resource format, we will not be able to create a render target view.
        * In order to avoid falling back to software blit, we'll create
@@ -465,47 +595,197 @@
       newDst = svga_texture_create(svga->pipe.screen, &template);
       if (newDst == NULL) {
          debug_printf("svga_blit: fails to create temporary dst\n");
-         return;
+         ret = false;
+         goto done;
       }
 
       blit.dst.resource = newDst;
    }
 
+   svga_toggle_render_condition(svga, blit.render_condition_enable, FALSE);
+
    util_blitter_blit(svga->blitter, &blit);
 
+   svga_toggle_render_condition(svga, blit.render_condition_enable, TRUE);
+
    if (blit.dst.resource != dst) {
-      unsigned dst_face, dst_z;
+      struct pipe_blit_info copy_region_blit;
 
-      adjust_z_layer(blit.dst.resource->target, blit.dst.box.z,
-                     &dst_face, &dst_z);
+      /* increment the mksStats for blitter with extra copy */
+      SVGA_STATS_COUNT_INC(sws, SVGA_STATS_COUNT_BLITBLITTERCOPY);
 
-      /**
+      /*
        * A temporary resource was created for the blit, we need to
        * copy from the temporary resource back to the original destination.
        */
-      copy_region_vgpu10(svga,
-                         blit.dst.resource,
-                         blit.dst.box.x, blit.dst.box.y, dst_z,
-                         blit.dst.level, dst_face,
-                         dst,
-                         blit.dst.box.x, blit.dst.box.y, dst_z,
-                         blit.dst.level, dst_face,
-                         blit.dst.box.width, blit.dst.box.height,
-                         blit.dst.box.depth);
-
-      /* unreference the temporary resource */
-      pipe_resource_reference(&newDst, NULL);
-      blit.dst.resource = dst;
+      build_blit_info(dst,
+                      blit.dst.level, blit.dst.box.x,
+                      blit.dst.box.y, blit.dst.box.z,
+                      newDst,
+                      blit.dst.level, &blit.dst.box,
+                      &copy_region_blit);
+      if (!try_copy_region(svga, &copy_region_blit)) {
+         debug_printf("svga: Destination blit format conversion failed.\n");
+         ret = false;
+         goto done;
+      }
    }
 
-   if (blit.src.resource != src) {
-      /* unreference the temporary resource */
-      pipe_resource_reference(&newSrc, NULL);
-      blit.src.resource = src;
-   }
+done:
+   /* unreference the temporary resources if needed */
+   pipe_resource_reference(&newDst, NULL);
+   pipe_resource_reference(&newSrc, NULL);
+
+   SVGA_STATS_TIME_POP(sws);  /* SVGA_STATS_TIME_BLITBLITTER */
+   (void) sws;
+
+   return ret;
 }
 
 
+/**
+ * Try a CPU copy_region fallback; returns true only if the copy was done.
+ */
+static bool
+try_cpu_copy_region(struct svga_context *svga,
+                    const struct pipe_blit_info *blit)
+{
+   if (util_can_blit_via_copy_region(blit, TRUE) ||
+       util_can_blit_via_copy_region(blit, FALSE)) {
+
+      if (svga->render_condition && blit->render_condition_enable) {
+         debug_warning("CPU copy_region doesn't support "
+                       "conditional rendering.\n");
+         return false;
+      }
+      /* Forward the blit's dst position and src box to the fallback copy. */
+      copy_region_fallback(svga, blit->dst.resource,
+                           blit->dst.level,
+                           blit->dst.box.x, blit->dst.box.y,
+                           blit->dst.box.z, blit->src.resource,
+                           blit->src.level, &blit->src.box);
+      return true;
+   }
+   /* util_can_blit_via_copy_region() rejected the blit in both modes. */
+   return false;
+}
+
+
+/**
+ * The pipe::blit member: tries each blit path in turn until one succeeds.
+ */
+static void
+svga_blit(struct pipe_context *pipe,
+          const struct pipe_blit_info *blit)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_winsys_screen *sws = svga_screen(pipe->screen)->sws;
+   /* Checked before the stats timer is pushed, so no pop is needed here. */
+   if (!svga_have_vgpu10(svga) &&
+       blit->src.resource->nr_samples > 1 &&
+       blit->dst.resource->nr_samples <= 1 &&
+       !util_format_is_depth_or_stencil(blit->src.resource->format) &&
+       !util_format_is_pure_integer(blit->src.resource->format)) {
+      debug_printf("svga: color resolve unimplemented\n");
+      return;
+   }
+
+   SVGA_STATS_TIME_PUSH(sws, SVGA_STATS_TIME_BLIT);
+   /* Order: try_copy_region(), then try_blit(), then the CPU copy. */
+   if (try_copy_region(svga, blit))
+      goto done;
+
+   if (try_blit(svga, blit))
+      goto done;
+
+   if (!try_cpu_copy_region(svga, blit))
+      debug_printf("svga: Blit failed.\n");
+   /* Common exit: balances the SVGA_STATS_TIME_PUSH above. */
+done:
+   SVGA_STATS_TIME_POP(sws);  /* SVGA_STATS_TIME_BLIT */
+   (void) sws; /* NOTE(review): presumably avoids an unused warning; confirm */
+}
+
+
+/**
+ * The pipe::resource_copy_region member (buffer and texture copies).
+ */
+static void
+svga_resource_copy_region(struct pipe_context *pipe,
+                          struct pipe_resource *dst_tex,
+                          unsigned dst_level,
+                          unsigned dstx, unsigned dsty, unsigned dstz,
+                          struct pipe_resource *src_tex,
+                          unsigned src_level,
+                          const struct pipe_box *src_box)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_winsys_screen *sws = svga_screen(svga->pipe.screen)->sws;
+
+   SVGA_STATS_TIME_PUSH(sws, SVGA_STATS_TIME_COPYREGION);
+   /* Buffer-to-buffer copies are handled separately from everything else. */
+   if (dst_tex->target == PIPE_BUFFER && src_tex->target == PIPE_BUFFER) {
+      /* can't copy within the same buffer, unfortunately */
+      if (svga_have_vgpu10(svga) && src_tex != dst_tex) {
+         enum pipe_error ret;
+         struct svga_winsys_surface *src_surf;
+         struct svga_winsys_surface *dst_surf;
+         struct svga_buffer *dbuffer = svga_buffer(dst_tex);
+         struct svga_buffer *sbuffer = svga_buffer(src_tex);
+         /* Look up each surface using the buffer's own bind flags. */
+         src_surf = svga_buffer_handle(svga, src_tex, sbuffer->bind_flags);
+         dst_surf = svga_buffer_handle(svga, dst_tex, dbuffer->bind_flags);
+         /* Emit the copy; on any error flush the context once and retry. */
+         ret = SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf, dst_surf,
+                                        src_box->x, dstx, src_box->width);
+         if (ret != PIPE_OK) {
+            svga_context_flush(svga, NULL);
+            ret = SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf, dst_surf,
+                                           src_box->x, dstx, src_box->width);
+            assert(ret == PIPE_OK);
+         }
+         /* NOTE(review): presumably flags dst contents as changed; confirm. */
+         dbuffer->dirty = TRUE;
+      }
+      else {
+         /* use map/memcpy fallback */
+         copy_region_fallback(svga, dst_tex, dst_level, dstx,
+                              dsty, dstz, src_tex, src_level, src_box);
+      }
+   } else {
+      struct pipe_blit_info blit;
+      /* Describe the copy as a pipe_blit_info so the blit helpers apply. */
+      build_blit_info(dst_tex, dst_level, dstx, dsty, dstz,
+                      src_tex, src_level, src_box, &blit);
+
+      if (try_copy_region(svga, &blit))
+         goto done;
+
+      /* Blits are format-converting which is not what we want, so perform a
+       * strict format-check.
+       * FIXME: Need to figure out why srgb blits (tf2) and
+       * 3D blits (piglit) are broken here. Perhaps we set up the
+       * struct pipe_blit_info incorrectly.
+       */
+      if (src_tex->format == dst_tex->format &&
+          !util_format_is_srgb(src_tex->format) &&
+          svga_resource_type(src_tex->target) != SVGA3D_RESOURCE_TEXTURE3D &&
+          try_blit(svga, &blit))
+         goto done;
+      /* Neither try_copy_region() nor try_blit() handled the copy. */
+      copy_region_fallback(svga, dst_tex, dst_level, dstx, dsty, dstz,
+                           src_tex, src_level, src_box);
+   }
+
+done:
+   SVGA_STATS_TIME_POP(sws);  /* SVGA_STATS_TIME_COPYREGION */
+   (void) sws; /* NOTE(review): presumably avoids an unused warning; confirm */
+}
+
+
+/**
+ * The pipe::flush_resource member.
+ */
 static void
 svga_flush_resource(struct pipe_context *pipe,
                     struct pipe_resource *resource)
@@ -513,6 +793,9 @@
 }
 
 
+/**
+ * Setup the pipe blit, resource_copy_region and flush_resource members.
+ */
 void
 svga_init_blit_functions(struct svga_context *svga)
 {
diff --git a/src/gallium/drivers/svga/svga_pipe_clear.c b/src/gallium/drivers/svga/svga_pipe_clear.c
index 56db713..e234ef5 100644
--- a/src/gallium/drivers/svga/svga_pipe_clear.c
+++ b/src/gallium/drivers/svga/svga_pipe_clear.c
@@ -504,44 +504,6 @@
                                     dstx, dsty, width, height);
 }
 
-/**
- * \brief Toggle conditional rendering if already enabled
- *
- * \param svga[in]  The svga context
- * \param render_condition_enabled[in]  Whether to ignore requests to turn
- * conditional rendering off
- * \param on[in]  Whether to turn conditional rendering on or off
- */
-static void
-svga_toggle_render_condition(struct svga_context *svga,
-                             boolean render_condition_enabled,
-                             boolean on)
-{
-   SVGA3dQueryId query_id;
-   enum pipe_error ret;
-
-   if (render_condition_enabled ||
-       svga->pred.query_id == SVGA3D_INVALID_ID) {
-      return;
-   }
-
-   /*
-    * If we get here, it means that the system supports
-    * conditional rendering since svga->pred.query_id has already been
-    * modified for this context and thus support has already been
-    * verified.
-    */
-   query_id = on ? svga->pred.query_id : SVGA3D_INVALID_ID;
-
-   ret = SVGA3D_vgpu10_SetPredication(svga->swc, query_id,
-                                      (uint32) svga->pred.cond);
-   if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
-      svga_context_flush(svga, NULL);
-      ret = SVGA3D_vgpu10_SetPredication(svga->swc, query_id,
-                                         (uint32) svga->pred.cond);
-      assert(ret == PIPE_OK);
-   }
-}
 
 /**
  * \brief Clear render target pipe callback
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
index eaf4681..fea9da1 100644
--- a/src/gallium/drivers/svga/svga_pipe_draw.c
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -171,13 +171,13 @@
 need_fallback_prim_restart(const struct svga_context *svga,
                            const struct pipe_draw_info *info)
 {
-   if (info->primitive_restart && info->indexed) {
+   if (info->primitive_restart && info->index_size) {
       if (!svga_have_vgpu10(svga))
          return TRUE;
       else if (!svga->state.sw.need_swtnl) {
-         if (svga->curr.ib.index_size == 1)
+         if (info->index_size == 1)
             return TRUE; /* no device support for 1-byte indexes */
-         else if (svga->curr.ib.index_size == 2)
+         else if (info->index_size == 2)
             return info->restart_index != 0xffff;
          else
             return info->restart_index != 0xffffffff;
@@ -196,6 +196,8 @@
    unsigned count = info->count;
    enum pipe_error ret = 0;
    boolean needed_swtnl;
+   struct pipe_resource *indexbuf =
+      info->has_user_indices ? NULL : info->index.resource;
 
    SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_DRAWVBO);
 
@@ -206,11 +208,10 @@
       goto done;
 
    /* Upload a user index buffer. */
-   struct pipe_index_buffer ibuffer_saved = {0};
-   if (info->indexed && svga->curr.ib.user_buffer &&
-       !util_save_and_upload_index_buffer(pipe, info, &svga->curr.ib,
-                                          &ibuffer_saved)) {
-      return;
+   unsigned index_offset = 0;
+   if (info->index_size && info->has_user_indices &&
+       !util_upload_index_buffer(pipe, info, &indexbuf, &index_offset)) {
+      goto done;
    }
 
    /*
@@ -229,7 +230,7 @@
 
    if (need_fallback_prim_restart(svga, info)) {
       enum pipe_error r;
-      r = util_draw_vbo_without_prim_restart(pipe, &svga->curr.ib, info);
+      r = util_draw_vbo_without_prim_restart(pipe, info);
       assert(r == PIPE_OK);
       (void) r;
       goto done;
@@ -258,18 +259,18 @@
 
       /* Avoid leaking the previous hwtnl bias to swtnl */
       svga_hwtnl_set_index_bias( svga->hwtnl, 0 );
-      ret = svga_swtnl_draw_vbo( svga, info );
+      ret = svga_swtnl_draw_vbo(svga, info, indexbuf, index_offset);
    }
    else {
-      if (info->indexed && svga->curr.ib.buffer) {
+      if (info->index_size && indexbuf) {
          unsigned offset;
 
-         assert(svga->curr.ib.offset % svga->curr.ib.index_size == 0);
-         offset = svga->curr.ib.offset / svga->curr.ib.index_size;
+         assert(index_offset % info->index_size == 0);
+         offset = index_offset / info->index_size;
 
          ret = retry_draw_range_elements( svga,
-                                          svga->curr.ib.buffer,
-                                          svga->curr.ib.index_size,
+                                          indexbuf,
+                                          info->index_size,
                                           info->index_bias,
                                           info->min_index,
                                           info->max_index,
@@ -296,9 +297,8 @@
    }
 
 done:
-   if (info->indexed && ibuffer_saved.user_buffer)
-      pipe->set_index_buffer(pipe, &ibuffer_saved);
-
+   if (info->index_size && info->index.resource != indexbuf)
+      pipe_resource_reference(&indexbuf, NULL);
    SVGA_STATS_TIME_POP(svga_sws(svga));
 }
 
diff --git a/src/gallium/drivers/svga/svga_pipe_flush.c b/src/gallium/drivers/svga/svga_pipe_flush.c
index 8e0af12..85ec34f 100644
--- a/src/gallium/drivers/svga/svga_pipe_flush.c
+++ b/src/gallium/drivers/svga/svga_pipe_flush.c
@@ -42,6 +42,9 @@
     */
    svga_surfaces_flush( svga );
 
+   if (flags & PIPE_FLUSH_FENCE_FD)
+      svga->swc->hints |= SVGA_HINT_FLAG_EXPORT_FENCE_FD;
+
    /* Flush command queue.
     */
    svga_context_flush(svga, fence);
@@ -71,7 +74,45 @@
 }
 
 
+/**
+ * svga_create_fence_fd
+ *
+ * Wraps an SVGA fence around an imported file descriptor.  This
+ * fd represents a fence from another process/device.  The fence created
+ * here can then be fed into fence_server_sync() so SVGA can synchronize
+ * with an external process.
+ */
+static void
+svga_create_fence_fd(struct pipe_context *pipe,
+                     struct pipe_fence_handle **fence,
+                     int fd)
+{
+   struct svga_winsys_screen *sws = svga_winsys_screen(pipe->screen);
+   /* All of the work is delegated to the winsys fence_create_fd hook. */
+   sws->fence_create_fd(sws, fence, fd);
+}
+
+
+/**
+ * svga_fence_server_sync
+ *
+ * This function imports a fence from another process/device into the current
+ * software context so that SVGA can synchronize with it.
+ */
+static void
+svga_fence_server_sync(struct pipe_context *pipe,
+                       struct pipe_fence_handle *fence)
+{
+   struct svga_winsys_screen *sws = svga_winsys_screen(pipe->screen);
+   struct svga_context *svga = svga_context(pipe);
+   /* The winsys hook is handed this context's swc->imported_fence_fd. */
+   sws->fence_server_sync(sws, &svga->swc->imported_fence_fd, fence);
+}
+
+
 void svga_init_flush_functions( struct svga_context *svga )
 {
    svga->pipe.flush = svga_flush;
+   svga->pipe.create_fence_fd = svga_create_fence_fd;
+   svga->pipe.fence_server_sync = svga_fence_server_sync;
 }
diff --git a/src/gallium/drivers/svga/svga_pipe_misc.c b/src/gallium/drivers/svga/svga_pipe_misc.c
index e5d3741..04707f6 100644
--- a/src/gallium/drivers/svga/svga_pipe_misc.c
+++ b/src/gallium/drivers/svga/svga_pipe_misc.c
@@ -150,15 +150,6 @@
 
    util_copy_framebuffer_state(dst, fb);
 
-   /* Set the rendered-to flags */
-   for (i = 0; i < dst->nr_cbufs; i++) {
-      struct pipe_surface *s = dst->cbufs[i];
-      if (s) {
-         struct svga_texture *t = svga_texture(s->texture);
-         svga_set_texture_rendered_to(t, s->u.tex.first_layer, s->u.tex.level);
-      }
-   }
-
    if (svga->curr.framebuffer.zsbuf) {
       switch (svga->curr.framebuffer.zsbuf->format) {
       case PIPE_FORMAT_Z16_UNORM:
@@ -180,13 +171,6 @@
          svga->curr.depthscale = 0.0f;
          break;
       }
-
-      /* Set rendered-to flag */
-      {
-         struct pipe_surface *s = dst->zsbuf;
-         struct svga_texture *t = svga_texture(s->texture);
-         svga_set_texture_rendered_to(t, s->u.tex.first_layer, s->u.tex.level);
-      }
    }
    else {
       svga->curr.depthscale = 0.0f;
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index 06c0c81..0490a4a 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -191,7 +191,8 @@
    if (state == SVGA3D_QUERYSTATE_PENDING) {
       if (!wait)
          return FALSE;
-      sws->fence_finish(sws, sq->fence, SVGA_FENCE_FLAG_QUERY);
+      sws->fence_finish(sws, sq->fence, PIPE_TIMEOUT_INFINITE,
+                        SVGA_FENCE_FLAG_QUERY);
       state = sq->queryResult->state;
    }
 
@@ -651,7 +652,8 @@
        queryState == SVGA3D_QUERYSTATE_NEW) {
       if (!wait)
          return FALSE;
-      sws->fence_finish(sws, sq->fence, SVGA_FENCE_FLAG_QUERY);
+      sws->fence_finish(sws, sq->fence, PIPE_TIMEOUT_INFINITE,
+                        SVGA_FENCE_FLAG_QUERY);
       sws->query_get_result(sws, sq->gb_query, sq->offset, &queryState, result, resultLen);
    }
 
@@ -747,6 +749,7 @@
    case SVGA_QUERY_NUM_BUFFER_UPLOADS:
    case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
    case SVGA_QUERY_NUM_CONST_UPDATES:
+   case SVGA_QUERY_NUM_FAILED_ALLOCATIONS:
       break;
    case SVGA_QUERY_FLUSH_TIME:
    case SVGA_QUERY_MAP_BUFFER_TIME:
@@ -826,6 +829,7 @@
    case SVGA_QUERY_NUM_BUFFER_UPLOADS:
    case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
    case SVGA_QUERY_NUM_CONST_UPDATES:
+   case SVGA_QUERY_NUM_FAILED_ALLOCATIONS:
       /* nothing */
       break;
    default:
@@ -937,6 +941,7 @@
    case SVGA_QUERY_NUM_STATE_OBJECTS:
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_GENERATE_MIPMAP:
+   case SVGA_QUERY_NUM_FAILED_ALLOCATIONS:
       /* nothing */
       break;
    default:
@@ -1049,6 +1054,7 @@
    case SVGA_QUERY_NUM_STATE_OBJECTS:
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_GENERATE_MIPMAP:
+   case SVGA_QUERY_NUM_FAILED_ALLOCATIONS:
       /* nothing */
       break;
    default:
@@ -1182,6 +1188,9 @@
    case SVGA_QUERY_NUM_GENERATE_MIPMAP:
       vresult->u64 = svga->hud.num_generate_mipmap;
       break;
+   case SVGA_QUERY_NUM_FAILED_ALLOCATIONS:
+      vresult->u64 = svgascreen->hud.num_failed_allocations;
+      break;
    default:
       assert(!"unexpected query type in svga_get_query_result");
    }
@@ -1223,7 +1232,8 @@
 
       if ((mode == PIPE_RENDER_COND_WAIT ||
            mode == PIPE_RENDER_COND_BY_REGION_WAIT) && sq->fence) {
-         sws->fence_finish(sws, sq->fence, SVGA_FENCE_FLAG_QUERY);
+         sws->fence_finish(sws, sq->fence, PIPE_TIMEOUT_INFINITE,
+                           SVGA_FENCE_FLAG_QUERY);
       }
    }
    /*
@@ -1272,6 +1282,46 @@
 }
 
 
+/**
+ * \brief Toggle conditional rendering if already enabled
+ *
+ * \param[in] svga  The svga context
+ * \param[in] render_condition_enabled  If TRUE the call is a no-op and the
+ * current predication state is left untouched
+ * \param[in] on  Whether to turn conditional rendering on or off
+ */
+void
+svga_toggle_render_condition(struct svga_context *svga,
+                             boolean render_condition_enabled,
+                             boolean on)
+{
+   SVGA3dQueryId query_id;
+   enum pipe_error ret;
+   /* Also a no-op when the context has no predication query set. */
+   if (render_condition_enabled ||
+       svga->pred.query_id == SVGA3D_INVALID_ID) {
+      return;
+   }
+
+   /*
+    * If we get here, it means that the system supports
+    * conditional rendering since svga->pred.query_id has already been
+    * modified for this context and thus support has already been
+    * verified.
+    */
+   query_id = on ? svga->pred.query_id : SVGA3D_INVALID_ID;
+   /* Only PIPE_ERROR_OUT_OF_MEMORY triggers the flush-and-retry below. */
+   ret = SVGA3D_vgpu10_SetPredication(svga->swc, query_id,
+                                      (uint32) svga->pred.cond);
+   if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_vgpu10_SetPredication(svga->swc, query_id,
+                                         (uint32) svga->pred.cond);
+      assert(ret == PIPE_OK);
+   }
+}
+
+
 void
 svga_init_query_functions(struct svga_context *svga)
 {
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 0c53c1d..2e98eb4 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -272,7 +272,8 @@
       define_sampler_state_object(svga, cso, sampler);
    }
 
-   SVGA_DBG(DEBUG_VIEWS, "min %u, view(min %u, max %u) lod, mipfilter %s\n",
+   SVGA_DBG(DEBUG_SAMPLERS,
+            "New sampler: min %u, view(min %u, max %u) lod, mipfilter %s\n",
             cso->min_lod, cso->view_min_lod, cso->view_max_lod,
             cso->mipfilter == SVGA3D_TEX_FILTER_NONE ? "SVGA3D_TEX_FILTER_NONE" : "SOMETHING");
 
diff --git a/src/gallium/drivers/svga/svga_pipe_streamout.c b/src/gallium/drivers/svga/svga_pipe_streamout.c
index 1318b55..0c6c034 100644
--- a/src/gallium/drivers/svga/svga_pipe_streamout.c
+++ b/src/gallium/drivers/svga/svga_pipe_streamout.c
@@ -92,7 +92,8 @@
    for (i = 0; i < info->num_outputs; i++) {
       unsigned reg_idx = info->output[i].register_index;
       unsigned buf_idx = info->output[i].output_buffer;
-      const unsigned sem_name = shader->info.output_semantic_name[reg_idx];
+      const enum tgsi_semantic sem_name =
+         shader->info.output_semantic_name[reg_idx];
 
       assert(buf_idx <= PIPE_MAX_SO_BUFFERS);
 
@@ -157,7 +158,6 @@
 svga_set_stream_output(struct svga_context *svga,
                        struct svga_stream_output *streamout)
 {
-   enum pipe_error ret = PIPE_OK;
    unsigned id = streamout ? streamout->id : SVGA3D_INVALID_ID;
 
    if (!svga_have_vgpu10(svga)) {
@@ -168,17 +168,15 @@
             streamout, id);
 
    if (svga->current_so != streamout) {
-      /* Save current SO state */
-      svga->current_so = streamout;
-
-      ret = SVGA3D_vgpu10_SetStreamOutput(svga->swc, id);
+      enum pipe_error ret = SVGA3D_vgpu10_SetStreamOutput(svga->swc, id);
       if (ret != PIPE_OK) {
-         svga_context_flush(svga, NULL);
-         ret = SVGA3D_vgpu10_SetStreamOutput(svga->swc, id);
+         return ret;
       }
+
+      svga->current_so = streamout;
    }
 
-   return ret;
+   return PIPE_OK;
 }
 
 void
@@ -276,13 +274,14 @@
    for (i = 0; i < num_targets; i++) {
       struct svga_stream_output_target *sot
          = svga_stream_output_target(targets[i]);
-      struct svga_buffer *sbuf = svga_buffer(sot->base.buffer);
       unsigned size;
 
-      assert(sbuf->key.flags & SVGA3D_SURFACE_BIND_STREAM_OUTPUT);
-      (void) sbuf;
+      svga->so_surfaces[i] = svga_buffer_handle(svga, sot->base.buffer,
+                                                PIPE_BIND_STREAM_OUTPUT);
 
-      svga->so_surfaces[i] = svga_buffer_handle(svga, sot->base.buffer);
+      assert(svga_buffer(sot->base.buffer)->key.flags
+             & SVGA3D_SURFACE_BIND_STREAM_OUTPUT);
+
       svga->so_targets[i] = &sot->base;
       soBindings[i].offset = sot->base.buffer_offset;
 
diff --git a/src/gallium/drivers/svga/svga_pipe_vertex.c b/src/gallium/drivers/svga/svga_pipe_vertex.c
index 4b3f5d8..8fbe8a1 100644
--- a/src/gallium/drivers/svga/svga_pipe_vertex.c
+++ b/src/gallium/drivers/svga/svga_pipe_vertex.c
@@ -40,9 +40,10 @@
 #include "svga_screen.h"
 
 
-static void svga_set_vertex_buffers(struct pipe_context *pipe,
-                                    unsigned start_slot, unsigned count,
-                                    const struct pipe_vertex_buffer *buffers)
+static void
+svga_set_vertex_buffers(struct pipe_context *pipe,
+                        unsigned start_slot, unsigned count,
+                        const struct pipe_vertex_buffer *buffers)
 {
    struct svga_context *svga = svga_context(pipe);
 
@@ -54,15 +55,6 @@
 }
 
 
-static void svga_set_index_buffer(struct pipe_context *pipe,
-                                  const struct pipe_index_buffer *ib)
-{
-   struct svga_context *svga = svga_context(pipe);
-
-   util_set_index_buffer(&svga->curr.ib, ib);
-}
-
-
 /**
  * Does the given vertex attrib format need range adjustment in the VS?
  * Range adjustment scales and biases values from [0,1] to [-1,1].
@@ -323,12 +315,14 @@
    svga->hud.num_vertexelement_objects--;
 }
 
-void svga_cleanup_vertex_state( struct svga_context *svga )
+
+void
+svga_cleanup_vertex_state(struct svga_context *svga)
 {
    unsigned i;
-   
+
    for (i = 0 ; i < svga->curr.num_vertex_buffers; i++)
-      pipe_resource_reference(&svga->curr.vb[i].buffer, NULL);
+      pipe_vertex_buffer_unreference(&svga->curr.vb[i]);
 
    pipe_resource_reference(&svga->state.hw_draw.ib, NULL);
 
@@ -337,10 +331,10 @@
 }
 
 
-void svga_init_vertex_functions( struct svga_context *svga )
+void
+svga_init_vertex_functions(struct svga_context *svga)
 {
    svga->pipe.set_vertex_buffers = svga_set_vertex_buffers;
-   svga->pipe.set_index_buffer = svga_set_index_buffer;
    svga->pipe.create_vertex_elements_state = svga_create_vertex_elements_state;
    svga->pipe.bind_vertex_elements_state = svga_bind_vertex_elements_state;
    svga->pipe.delete_vertex_elements_state = svga_delete_vertex_elements_state;
diff --git a/src/gallium/drivers/svga/svga_resource.c b/src/gallium/drivers/svga/svga_resource.c
index 6a297a2..874cfa0 100644
--- a/src/gallium/drivers/svga/svga_resource.c
+++ b/src/gallium/drivers/svga/svga_resource.c
@@ -33,14 +33,27 @@
 #include "svga_format.h"
 
 
+/**
+ * This is the primary driver entrypoint for allocating graphics memory
+ * (vertex/index/constant buffers, textures, etc).  Failures are counted.
+ */
 static struct pipe_resource *
 svga_resource_create(struct pipe_screen *screen,
                      const struct pipe_resource *template)
 {
+   struct pipe_resource *r;
+   /* PIPE_BUFFER targets and all other targets use separate creators. */
    if (template->target == PIPE_BUFFER)
-      return svga_buffer_create(screen, template);
+      r = svga_buffer_create(screen, template);
    else
-      return svga_texture_create(screen, template);
+      r = svga_texture_create(screen, template);
+   /* A NULL result bumps the screen's hud.num_failed_allocations counter. */
+   if (!r) {
+      struct svga_screen *svgascreen = svga_screen(screen);
+      svgascreen->hud.num_failed_allocations++;
+   }
+
+   return r;
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
index 7808903..e9d31de 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -114,7 +114,7 @@
       assert(svga_have_vgpu10(svga));
 
       if (!sbuf->user) {
-         (void) svga_buffer_handle(svga, resource);
+         (void) svga_buffer_handle(svga, resource, sbuf->bind_flags);
       }
 
       if (sbuf->dma.pending > 0) {
@@ -224,7 +224,7 @@
    }
 
    if (!sbuf->swbuf && !svga_buffer_has_hw_storage(sbuf)) {
-      if (svga_buffer_create_hw_storage(ss, sbuf) != PIPE_OK) {
+      if (svga_buffer_create_hw_storage(ss, sbuf, sbuf->bind_flags) != PIPE_OK) {
          /*
           * We can't create a hardware buffer big enough, so create a malloc
           * buffer instead.
@@ -320,6 +320,9 @@
    }
 
    if (svga_buffer_has_hw_storage(sbuf)) {
+      /* Note: we may wind up flushing here and unmapping other buffers
+       * which leads to recursively locking ss->swc_mutex.
+       */
       svga_buffer_hw_storage_unmap(svga, sbuf);
    }
 
@@ -396,6 +399,7 @@
 {
    struct svga_screen *ss = svga_screen(screen);
    struct svga_buffer *sbuf;
+   unsigned bind_flags;
 
    SVGA_STATS_TIME_PUSH(ss->sws, SVGA_STATS_TIME_CREATEBUFFER);
 
@@ -407,39 +411,46 @@
    sbuf->b.vtbl = &svga_buffer_vtbl;
    pipe_reference_init(&sbuf->b.b.reference, 1);
    sbuf->b.b.screen = screen;
-   sbuf->bind_flags = template->bind;
+   bind_flags = template->bind;
 
-   if (template->bind & PIPE_BIND_CONSTANT_BUFFER) {
+   LIST_INITHEAD(&sbuf->surfaces);
+
+   if (bind_flags & PIPE_BIND_CONSTANT_BUFFER) {
       /* Constant buffers can only have the PIPE_BIND_CONSTANT_BUFFER
        * flag set.
        */
       if (ss->sws->have_vgpu10) {
-         sbuf->bind_flags = PIPE_BIND_CONSTANT_BUFFER;
-
-         /* Constant buffer size needs to be in multiples of 16. */
-         sbuf->b.b.width0 = align(sbuf->b.b.width0, 16);
+         bind_flags = PIPE_BIND_CONSTANT_BUFFER;
       }
    }
 
-   if (svga_buffer_needs_hw_storage(template->bind)) {
+   /* Although the svga device only requires constant buffer sizes to be
+    * multiples of 16, in order to allow bind_flags promotion,
+    * we mandate that all buffer sizes be multiples of 16.
+    */
+   sbuf->b.b.width0 = align(sbuf->b.b.width0, 16);
 
-      /* If the buffer will be used for vertex/index/stream data, set all
-       * the flags so that the buffer will be accepted for all those uses.
+   if (svga_buffer_needs_hw_storage(bind_flags)) {
+
+      /* If the buffer is not used for constant buffer, set
+       * the vertex/index bind flags as well so that the buffer will be
+       * accepted for those uses.
        * Note that the PIPE_BIND_ flags we get from the state tracker are
        * just a hint about how the buffer may be used.  And OpenGL buffer
        * object may be used for many different things.
+       * Also note that we do not unconditionally set the streamout
+       * bind flag since a streamout buffer is an output buffer and
+       * setting the flag might have performance implications.
        */
       if (!(template->bind & PIPE_BIND_CONSTANT_BUFFER)) {
-         /* Not a constant buffer.  The buffer may be used for vertex data,
-          * indexes or stream-out.
+         /* Not a constant buffer.  The buffer may be used for vertex data
+          * or indexes.
           */
-         sbuf->bind_flags |= (PIPE_BIND_VERTEX_BUFFER |
-                              PIPE_BIND_INDEX_BUFFER);
-         if (ss->sws->have_vgpu10)
-            sbuf->bind_flags |= PIPE_BIND_STREAM_OUTPUT;
+         bind_flags |= (PIPE_BIND_VERTEX_BUFFER |
+                        PIPE_BIND_INDEX_BUFFER);
       }
 
-      if (svga_buffer_create_host_surface(ss, sbuf) != PIPE_OK)
+      if (svga_buffer_create_host_surface(ss, sbuf, bind_flags) != PIPE_OK)
          goto error2;
    }
    else {
@@ -451,6 +462,7 @@
    debug_reference(&sbuf->b.b.reference,
                    (debug_reference_descriptor)debug_describe_resource, 0);
 
+   sbuf->bind_flags = bind_flags;
    sbuf->size = util_resource_size(&sbuf->b.b);
    ss->hud.total_resource_bytes += sbuf->size;
 
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h
index 05025e9..db53341 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.h
+++ b/src/gallium/drivers/svga/svga_resource_buffer.h
@@ -59,6 +59,18 @@
 struct svga_3d_update_gb_image;
 
 /**
+ * This structure describes the bind flags and cache key associated
+ * with the host surface.
+ */
+struct svga_buffer_surface
+{
+   struct list_head list;   /* presumably a node of svga_buffer::surfaces */
+   unsigned bind_flags;     /* bind flags associated with the host surface */
+   struct svga_host_surface_cache_key key;   /* cache key of the surface */
+   struct svga_winsys_surface *handle;   /* winsys surface handle */
+};
+
+/**
  * SVGA pipe buffer.
  */
 struct svga_buffer 
@@ -101,6 +113,12 @@
    struct svga_winsys_surface *handle;
 
    /**
+    * List of surfaces created for this buffer resource to support
+    * incompatible bind flags.
+    */
+   struct list_head surfaces;
+
+   /**
     * Information about ongoing and past map operations.
     */
    struct {
@@ -325,7 +343,8 @@
  */
 struct svga_winsys_surface *
 svga_buffer_handle(struct svga_context *svga,
-                   struct pipe_resource *buf);
+                   struct pipe_resource *buf,
+                   unsigned tobind_flags);
 
 void
 svga_context_flush_buffers(struct svga_context *svga);
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 9d93b48..104cb6d 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -113,13 +113,14 @@
  */
 enum pipe_error
 svga_buffer_create_hw_storage(struct svga_screen *ss,
-                              struct svga_buffer *sbuf)
+                              struct svga_buffer *sbuf,
+                              unsigned bind_flags)
 {
    assert(!sbuf->user);
 
    if (ss->sws->have_gb_objects) {
       assert(sbuf->handle || !sbuf->dma.pending);
-      return svga_buffer_create_host_surface(ss, sbuf);
+      return svga_buffer_create_host_surface(ss, sbuf, bind_flags);
    }
    if (!sbuf->hwbuf) {
       struct svga_winsys_screen *sws = ss->sws;
@@ -138,11 +139,17 @@
 }
 
 
-
+/**
+ * Allocate graphics memory for vertex/index/constant/etc buffer (not
+ * textures).
+ */
 enum pipe_error
 svga_buffer_create_host_surface(struct svga_screen *ss,
-                                struct svga_buffer *sbuf)
+                                struct svga_buffer *sbuf,
+                                unsigned bind_flags)
 {
+   enum pipe_error ret = PIPE_OK;
+
    assert(!sbuf->user);
 
    if (!sbuf->handle) {
@@ -151,24 +158,24 @@
       sbuf->key.flags = 0;
 
       sbuf->key.format = SVGA3D_BUFFER;
-      if (sbuf->bind_flags & PIPE_BIND_VERTEX_BUFFER) {
+      if (bind_flags & PIPE_BIND_VERTEX_BUFFER) {
          sbuf->key.flags |= SVGA3D_SURFACE_HINT_VERTEXBUFFER;
          sbuf->key.flags |= SVGA3D_SURFACE_BIND_VERTEX_BUFFER;
       }
-      if (sbuf->bind_flags & PIPE_BIND_INDEX_BUFFER) {
+      if (bind_flags & PIPE_BIND_INDEX_BUFFER) {
          sbuf->key.flags |= SVGA3D_SURFACE_HINT_INDEXBUFFER;
          sbuf->key.flags |= SVGA3D_SURFACE_BIND_INDEX_BUFFER;
       }
-      if (sbuf->bind_flags & PIPE_BIND_CONSTANT_BUFFER)
+      if (bind_flags & PIPE_BIND_CONSTANT_BUFFER)
          sbuf->key.flags |= SVGA3D_SURFACE_BIND_CONSTANT_BUFFER;
 
-      if (sbuf->bind_flags & PIPE_BIND_STREAM_OUTPUT)
+      if (bind_flags & PIPE_BIND_STREAM_OUTPUT)
          sbuf->key.flags |= SVGA3D_SURFACE_BIND_STREAM_OUTPUT;
 
-      if (sbuf->bind_flags & PIPE_BIND_SAMPLER_VIEW)
+      if (bind_flags & PIPE_BIND_SAMPLER_VIEW)
          sbuf->key.flags |= SVGA3D_SURFACE_BIND_SHADER_RESOURCE;
 
-      if (!sbuf->bind_flags && sbuf->b.b.usage == PIPE_USAGE_STAGING) {
+      if (!bind_flags && sbuf->b.b.usage == PIPE_USAGE_STAGING) {
          /* This surface is to be used with the
           * SVGA3D_CMD_DX_TRANSFER_FROM_BUFFER command, and no other
           * bind flags are allowed to be set for this surface.
@@ -188,7 +195,7 @@
       SVGA_DBG(DEBUG_DMA, "surface_create for buffer sz %d\n",
                sbuf->b.b.width0);
 
-      sbuf->handle = svga_screen_surface_create(ss, sbuf->b.b.bind,
+      sbuf->handle = svga_screen_surface_create(ss, bind_flags,
                                                 sbuf->b.b.usage,
                                                 &validated, &sbuf->key);
       if (!sbuf->handle)
@@ -202,20 +209,220 @@
 
       SVGA_DBG(DEBUG_DMA, "   --> got sid %p sz %d (buffer)\n",
                sbuf->handle, sbuf->b.b.width0);
+
+      /* Add the new surface to the buffer surface list */
+      ret = svga_buffer_add_host_surface(sbuf, sbuf->handle, &sbuf->key,
+                                         bind_flags);
    }
 
+   return ret;
+}
+
+
+/**
+ * Replaces sbuf's current host surface with a new one using bind_flags.
+ */
+enum pipe_error
+svga_buffer_recreate_host_surface(struct svga_context *svga,
+                                  struct svga_buffer *sbuf,
+                                  unsigned bind_flags)
+{
+   enum pipe_error ret = PIPE_OK;
+   struct svga_winsys_surface *old_handle = sbuf->handle; /* not destroyed here */
+
+   assert(sbuf->bind_flags != bind_flags);
+   assert(old_handle);
+
+   sbuf->handle = NULL; /* cleared so that a new surface gets created below */
+
+   /* Create a new resource with the requested bind_flags */
+   ret = svga_buffer_create_host_surface(svga_screen(svga->pipe.screen),
+                                         sbuf, bind_flags);
+   if (ret == PIPE_OK) {
+      /* Copy the old surface contents into the newly created surface */
+      assert(sbuf->handle);
+      ret = SVGA3D_vgpu10_BufferCopy(svga->swc, old_handle, sbuf->handle,
+                                     0, 0, sbuf->b.b.width0);
+      if (ret != PIPE_OK) {
+         svga_context_flush(svga, NULL); /* flush, then retry the copy once */
+         ret = SVGA3D_vgpu10_BufferCopy(svga->swc, old_handle, sbuf->handle,
+                                        0, 0, sbuf->b.b.width0);
+         assert(ret == PIPE_OK);
+      }
+   }
+
+   /* NOTE(review): runs even when ret != PIPE_OK - confirm this is intended */
+   sbuf->bind_flags = bind_flags;
+
+   return ret;
+}
+
+
+/**
+ * Returns TRUE if a surface with bind_flags can also serve tobind_flags.
+ */
+static boolean
+compatible_bind_flags(unsigned bind_flags,
+                      unsigned tobind_flags)
+{
+   if ((bind_flags & tobind_flags) == tobind_flags)
+      return TRUE;   /* every requested flag is already present */
+   else if ((bind_flags|tobind_flags) & PIPE_BIND_CONSTANT_BUFFER)
+      return FALSE;  /* a constant buffer flag on either side blocks merging */
+   else
+      return TRUE;   /* the two flag sets may be combined */
+}
+
+
+/**
+ * Returns the first surface on sbuf's surface list that either already has
+ * the requested bind flags or whose bind flags can be promoted to include
+ * them; returns NULL when the list holds no such surface.
+ */
+static struct svga_buffer_surface *
+svga_buffer_get_host_surface(struct svga_buffer *sbuf,
+                             unsigned bind_flags)
+{
+   struct svga_buffer_surface *bufsurf;
+
+   LIST_FOR_EACH_ENTRY(bufsurf, &sbuf->surfaces, list) {
+      if (compatible_bind_flags(bufsurf->bind_flags, bind_flags))
+         return bufsurf;
+   }
+   return NULL;
+}
+
+
+/**
+ * Records handle, key and bind_flags in a new entry on sbuf's surface list.
+ */
+enum pipe_error
+svga_buffer_add_host_surface(struct svga_buffer *sbuf,
+                             struct svga_winsys_surface *handle,
+                             struct svga_host_surface_cache_key *key,
+                             unsigned bind_flags)
+{
+   struct svga_buffer_surface *bufsurf;
+
+   bufsurf = CALLOC_STRUCT(svga_buffer_surface);
+   if (!bufsurf)
+      return PIPE_ERROR_OUT_OF_MEMORY; /* nothing was added to the list */
+
+   bufsurf->bind_flags = bind_flags;
+   bufsurf->handle = handle;
+   bufsurf->key = *key; /* the key is copied by value */
+
+   /* link the new entry into the buffer's surface list */
+   LIST_ADD(&bufsurf->list, &sbuf->surfaces);
+
    return PIPE_OK;
 }
 
 
+/**
+ * Makes bufsurf the current host surface of this buffer resource.
+ */
+void
+svga_buffer_bind_host_surface(struct svga_context *svga,
+                              struct svga_buffer *sbuf,
+                              struct svga_buffer_surface *bufsurf)
+{
+   enum pipe_error ret;
+
+   /* Both the surface to bind and the current surface must exist */
+   assert(bufsurf->handle);
+   assert(sbuf->handle);
+
+   /* When the current surface has the stream output bind flag, copy its
+    * contents into the surface that is about to become current.
+    */
+   if (sbuf->bind_flags & PIPE_BIND_STREAM_OUTPUT) {
+      ret = SVGA3D_vgpu10_BufferCopy(svga->swc, sbuf->handle, bufsurf->handle,
+                                     0, 0, sbuf->b.b.width0);
+      if (ret != PIPE_OK) {
+         svga_context_flush(svga, NULL); /* flush, then retry the copy once */
+         ret = SVGA3D_vgpu10_BufferCopy(svga->swc, sbuf->handle, bufsurf->handle,
+                                        0, 0, sbuf->b.b.width0);
+         assert(ret == PIPE_OK);
+      }
+   }
+
+   /* Adopt the surface's handle, cache key and bind flags as current */
+   sbuf->handle = bufsurf->handle;
+   sbuf->key = bufsurf->key;
+   sbuf->bind_flags = bufsurf->bind_flags;
+}
+
+
+/**
+ * Prepare a host surface that can be used as indicated by tobind_flags.
+ * If an existing host surface already has the requested bind flags, it
+ * becomes the current surface. If an existing surface lacks them but its
+ * bind flags can be combined with the new ones, a new surface is created
+ * with the combined bind flags and the old one is destroyed. Otherwise a
+ * separate surface is created for the incompatible bind flags.
+ * For example, if a stream output buffer is reused as a constant buffer,
+ * two surfaces will exist, one for stream output and one for the constant
+ * buffer, because a constant buffer surface cannot be bound as a stream
+ * output surface.
+ */
+enum pipe_error
+svga_buffer_validate_host_surface(struct svga_context *svga,
+                                  struct svga_buffer *sbuf,
+                                  unsigned tobind_flags)
+{
+   struct svga_buffer_surface *bufsurf;
+   enum pipe_error ret = PIPE_OK;
+
+   /* Flush any pending upload first */
+   svga_buffer_upload_flush(svga, sbuf);
+
+   /* First check the buffer's surface list to see if there is already
+    * a surface that has the requested bind flags, or a surface with
+    * compatible bind flags that can be promoted.
+    */
+   bufsurf = svga_buffer_get_host_surface(sbuf, tobind_flags);
+
+   if (bufsurf) {
+      if ((bufsurf->bind_flags & tobind_flags) == tobind_flags) {
+         /* there is a surface with the requested bind flags */
+         svga_buffer_bind_host_surface(svga, sbuf, bufsurf);
+      } else {
+
+         /* Recreate a host surface with the combined bind flags */
+         ret = svga_buffer_recreate_host_surface(svga, sbuf,
+                                                 bufsurf->bind_flags |
+                                                 tobind_flags);
+
+         /* NOTE(review): the old surface is destroyed without checking ret */
+         svga_screen_surface_destroy(svga_screen(sbuf->b.b.screen),
+                                     &bufsurf->key, &bufsurf->handle);
+
+         LIST_DEL(&bufsurf->list);
+         FREE(bufsurf);
+      }
+   } else {
+      /* Need to create a new surface if the bind flags are incompatible,
+       * such as constant buffer surface & stream output surface.
+       */
+      ret = svga_buffer_recreate_host_surface(svga, sbuf,
+                                              tobind_flags);
+   }
+   return ret;
+}
+
+
 void
 svga_buffer_destroy_host_surface(struct svga_screen *ss,
                                  struct svga_buffer *sbuf)
 {
-   if (sbuf->handle) {
+   struct svga_buffer_surface *bufsurf, *next;
+   /* Destroy and free every host surface recorded on the buffer's list. */
+   LIST_FOR_EACH_ENTRY_SAFE(bufsurf, next, &sbuf->surfaces, list) {
       SVGA_DBG(DEBUG_DMA, " ungrab sid %p sz %d\n",
-               sbuf->handle, sbuf->b.b.width0);
-      svga_screen_surface_destroy(ss, &sbuf->key, &sbuf->handle);
+               bufsurf->handle, sbuf->b.b.width0);
+      svga_screen_surface_destroy(ss, &bufsurf->key, &bufsurf->handle);
+      FREE(bufsurf); /* NOTE(review): not unlinked; sbuf->handle is not reset */
    }
 }
 
@@ -228,7 +435,7 @@
  */
 static enum pipe_error
 svga_buffer_upload_gb_command(struct svga_context *svga,
-			      struct svga_buffer *sbuf)
+                              struct svga_buffer *sbuf)
 {
    struct svga_winsys_context *swc = svga->swc;
    SVGA3dCmdUpdateGBImage *update_cmd;
@@ -256,11 +463,12 @@
                                           SVGA_3D_CMD_INVALIDATE_GB_IMAGE,
                                           total_commands_size, 1 + numBoxes);
       if (!invalidate_cmd)
-	 return PIPE_ERROR_OUT_OF_MEMORY;
+         return PIPE_ERROR_OUT_OF_MEMORY;
 
       cicmd = container_of(invalidate_cmd, cicmd, body);
       cicmd->header.size = sizeof(*invalidate_cmd);
-      swc->surface_relocation(swc, &invalidate_cmd->image.sid, NULL, sbuf->handle,
+      swc->surface_relocation(swc, &invalidate_cmd->image.sid, NULL,
+                              sbuf->handle,
                               (SVGA_RELOC_WRITE |
                                SVGA_RELOC_INTERNAL |
                                SVGA_RELOC_DMA));
@@ -284,7 +492,7 @@
                                       SVGA_3D_CMD_UPDATE_GB_IMAGE,
                                       total_commands_size, numBoxes);
       if (!update_cmd)
-	 return PIPE_ERROR_OUT_OF_MEMORY;
+         return PIPE_ERROR_OUT_OF_MEMORY;
 
       /* The whole_update_command is a SVGA3dCmdHeader plus the
        * SVGA3dCmdUpdateGBImage command.
@@ -295,7 +503,7 @@
    /* Init the first UPDATE_GB_IMAGE command */
    whole_update_cmd->header.size = sizeof(*update_cmd);
    swc->surface_relocation(swc, &update_cmd->image.sid, NULL, sbuf->handle,
-			   SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
+                           SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
    update_cmd->image.face = 0;
    update_cmd->image.mipmap = 0;
 
@@ -428,8 +636,7 @@
  * with the final ranges.
  */
 void
-svga_buffer_upload_flush(struct svga_context *svga,
-			 struct svga_buffer *sbuf)
+svga_buffer_upload_flush(struct svga_context *svga, struct svga_buffer *sbuf)
 {
    unsigned i;
    struct pipe_resource *dummy;
@@ -623,7 +830,8 @@
  * Copy the contents of the malloc buffer to a hardware buffer.
  */
 static enum pipe_error
-svga_buffer_update_hw(struct svga_context *svga, struct svga_buffer *sbuf)
+svga_buffer_update_hw(struct svga_context *svga, struct svga_buffer *sbuf,
+                      unsigned bind_flags)
 {
    assert(!sbuf->user);
    if (!svga_buffer_has_hw_storage(sbuf)) {
@@ -637,7 +845,8 @@
       if (!sbuf->swbuf)
          return PIPE_ERROR;
 
-      ret = svga_buffer_create_hw_storage(svga_screen(sbuf->b.b.screen), sbuf);
+      ret = svga_buffer_create_hw_storage(svga_screen(sbuf->b.b.screen), sbuf,
+                                          bind_flags);
       if (ret != PIPE_OK)
          return ret;
 
@@ -646,7 +855,7 @@
       assert(map);
       assert(!retry);
       if (!map) {
-	 mtx_unlock(&ss->swc_mutex);
+         mtx_unlock(&ss->swc_mutex);
          svga_buffer_destroy_hw_storage(ss, sbuf);
          return PIPE_ERROR;
       }
@@ -769,7 +978,8 @@
  * if there are mapped ranges and the data is currently in a malloc'ed buffer.
  */
 struct svga_winsys_surface *
-svga_buffer_handle(struct svga_context *svga, struct pipe_resource *buf)
+svga_buffer_handle(struct svga_context *svga, struct pipe_resource *buf,
+                   unsigned tobind_flags)
 {
    struct pipe_screen *screen = svga->pipe.screen;
    struct svga_screen *ss = svga_screen(screen);
@@ -783,15 +993,30 @@
 
    assert(!sbuf->user);
 
-   if (!sbuf->handle) {
+   if (sbuf->handle) {
+      if ((sbuf->bind_flags & tobind_flags) != tobind_flags) {
+         /* If the allocated resource's bind flags do not include the
+          * requested bind flags, validate the host surface.
+          */
+         ret = svga_buffer_validate_host_surface(svga, sbuf, tobind_flags);
+         if (ret != PIPE_OK)
+            return NULL;
+      }
+   } else {
+      if (!sbuf->bind_flags) {
+         sbuf->bind_flags = tobind_flags;
+      }
+
+      assert((sbuf->bind_flags & tobind_flags) == tobind_flags);
+
       /* This call will set sbuf->handle */
       if (svga_have_gb_objects(svga)) {
-	 ret = svga_buffer_update_hw(svga, sbuf);
+         ret = svga_buffer_update_hw(svga, sbuf, sbuf->bind_flags);
       } else {
-	 ret = svga_buffer_create_host_surface(ss, sbuf);
+         ret = svga_buffer_create_host_surface(ss, sbuf, sbuf->bind_flags);
       }
       if (ret != PIPE_OK)
-	 return NULL;
+         return NULL;
    }
 
    assert(sbuf->handle);
@@ -801,7 +1026,7 @@
          /* No pending DMA/update commands yet. */
 
          /* Migrate the data from swbuf -> hwbuf if necessary */
-         ret = svga_buffer_update_hw(svga, sbuf);
+         ret = svga_buffer_update_hw(svga, sbuf, sbuf->bind_flags);
          if (ret == PIPE_OK) {
             /* Emit DMA or UpdateGBImage commands */
             ret = svga_buffer_upload_command(svga, sbuf);
@@ -847,7 +1072,6 @@
 }
 
 
-
 void
 svga_context_flush_buffers(struct svga_context *svga)
 {
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.h b/src/gallium/drivers/svga/svga_resource_buffer_upload.h
index 13d8f3e..c2d749b 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.h
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.h
@@ -38,7 +38,8 @@
 
 enum pipe_error
 svga_buffer_create_hw_storage(struct svga_screen *ss,
-                              struct svga_buffer *sbuf);
+                              struct svga_buffer *sbuf,
+                              unsigned bind_flags);
 
 void
 svga_buffer_destroy_hw_storage(struct svga_screen *ss,
@@ -46,7 +47,29 @@
 
 enum pipe_error
 svga_buffer_create_host_surface(struct svga_screen *ss,
-                                struct svga_buffer *sbuf);
+                                struct svga_buffer *sbuf,
+                                unsigned bind_flags);
+
+enum pipe_error
+svga_buffer_recreate_host_surface(struct svga_context *svga,
+                                  struct svga_buffer *sbuf,
+                                  unsigned bind_flags);
+
+enum pipe_error
+svga_buffer_add_host_surface(struct svga_buffer *sbuf,
+                             struct svga_winsys_surface *handle,
+                             struct svga_host_surface_cache_key *key,
+                             unsigned bind_flags);
+
+void
+svga_buffer_bind_host_surface(struct svga_context *svga,
+                             struct svga_buffer *sbuf,
+                             struct svga_buffer_surface *bufsurf);
+
+enum pipe_error
+svga_buffer_validate_host_surface(struct svga_context *svga,
+                                  struct svga_buffer *sbuf,
+                                  unsigned bind_flags);
 
 void
 svga_buffer_destroy_host_surface(struct svga_screen *ss,
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index 11a8749..5b82e9e 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -128,7 +128,7 @@
 
       if (transfer == SVGA3D_READ_HOST_VRAM) {
          svga_context_flush(svga, &fence);
-         sws->fence_finish(sws, fence, 0);
+         sws->fence_finish(sws, fence, PIPE_TIMEOUT_INFINITE, 0);
          sws->fence_reference(sws, &fence, NULL);
       }
    }
@@ -187,7 +187,7 @@
 
          if (transfer == SVGA3D_READ_HOST_VRAM) {
             svga_context_flush(svga, &fence);
-            sws->fence_finish(sws, fence, 0);
+            sws->fence_finish(sws, fence, PIPE_TIMEOUT_INFINITE, 0);
 
             hw = sws->buffer_map(sws, st->hwbuf, PIPE_TRANSFER_READ);
             assert(hw);
@@ -238,6 +238,10 @@
    SVGA_DBG(DEBUG_DMA, "unref sid %p (texture)\n", tex->handle);
    svga_screen_surface_destroy(ss, &tex->key, &tex->handle);
 
+   /* Destroy the backed surface handle if it exists */
+   if (tex->backed_handle)
+      svga_screen_surface_destroy(ss, &tex->backed_key, &tex->backed_handle);
+      
    ss->hud.total_resource_bytes -= tex->size;
 
    FREE(tex->defined);
@@ -912,6 +916,39 @@
       goto fail_notex;
    }
 
+   /* Verify the number of mipmap levels isn't impossibly large.  For example,
+    * if the base 2D image is 16x16, we can't have 8 mipmap levels.
+    * The state tracker should never ask us to create a resource with invalid
+    * parameters.
+    */
+   {
+      unsigned max_dim = template->width0;
+
+      switch (template->target) {
+      case PIPE_TEXTURE_1D:
+      case PIPE_TEXTURE_1D_ARRAY:
+         /* only width0 is considered for 1D targets */
+         break;
+      case PIPE_TEXTURE_2D:
+      case PIPE_TEXTURE_CUBE:
+      case PIPE_TEXTURE_CUBE_ARRAY:
+      case PIPE_TEXTURE_2D_ARRAY:
+         max_dim = MAX2(max_dim, template->height0);
+         break;
+      case PIPE_TEXTURE_3D:
+         max_dim = MAX3(max_dim, template->height0, template->depth0);
+         break;
+      case PIPE_TEXTURE_RECT:
+      case PIPE_BUFFER:
+         assert(template->last_level == 0);
+         /* with last_level == 0 the assertion below checks 1 <= max_dim */
+         break;
+      default:
+         debug_printf("Unexpected texture target type\n");
+      }
+      assert(1 << template->last_level <= max_dim); /* 2^last_level <= largest dim */
+   }
+
    tex = CALLOC_STRUCT(svga_texture);
    if (!tex) {
       goto fail_notex;
@@ -1120,6 +1157,9 @@
    tex->can_use_upload = svga_texture_transfer_map_can_upload(svgascreen,
                                                               &tex->b.b);
 
+   /* Initialize the backing resource cache */
+   tex->backed_handle = NULL;
+
    svgascreen->hud.total_resource_bytes += tex->size;
    svgascreen->hud.num_resources++;
 
@@ -1143,8 +1183,8 @@
 
 struct pipe_resource *
 svga_texture_from_handle(struct pipe_screen *screen,
-			 const struct pipe_resource *template,
-			 struct winsys_handle *whandle)
+                         const struct pipe_resource *template,
+                         struct winsys_handle *whandle)
 {
    struct svga_winsys_screen *sws = svga_winsys_screen(screen);
    struct svga_screen *ss = svga_screen(screen);
@@ -1166,42 +1206,18 @@
    if (!srf)
       return NULL;
 
-   if (svga_translate_format(svga_screen(screen), template->format,
-                             template->bind) != format) {
-      unsigned f1 = svga_translate_format(svga_screen(screen),
-                                          template->format, template->bind);
-      unsigned f2 = format;
-
-      /* It's okay for XRGB and ARGB or depth with/out stencil to get mixed up.
-       */
-      if (f1 == SVGA3D_B8G8R8A8_UNORM)
-         f1 = SVGA3D_A8R8G8B8;
-      if (f1 == SVGA3D_B8G8R8X8_UNORM)
-         f1 = SVGA3D_X8R8G8B8;
-
-      if ( !( (f1 == f2) ||
-              (f1 == SVGA3D_X8R8G8B8 && f2 == SVGA3D_A8R8G8B8) ||
-              (f1 == SVGA3D_X8R8G8B8 && f2 == SVGA3D_B8G8R8X8_UNORM) ||
-              (f1 == SVGA3D_A8R8G8B8 && f2 == SVGA3D_X8R8G8B8) ||
-              (f1 == SVGA3D_A8R8G8B8 && f2 == SVGA3D_B8G8R8A8_UNORM) ||
-              (f1 == SVGA3D_Z_D24X8 && f2 == SVGA3D_Z_D24S8) ||
-              (f1 == SVGA3D_Z_DF24 && f2 == SVGA3D_Z_D24S8_INT) ) ) {
-         debug_printf("%s wrong format %s != %s\n", __FUNCTION__,
-                      svga_format_name(f1), svga_format_name(f2));
-         return NULL;
-      }
-   }
+   if (!svga_format_is_shareable(ss, template->format, format,
+                                 template->bind, true))
+      goto out_unref;
 
    tex = CALLOC_STRUCT(svga_texture);
    if (!tex)
-      return NULL;
+      goto out_unref;
 
    tex->defined = CALLOC(template->depth0 * template->array_size,
                          sizeof(tex->defined[0]));
-   if (!tex->defined) {
-      FREE(tex);
-      return NULL;
-   }
+   if (!tex->defined)
+      goto out_no_defined;
 
    tex->b.b = *template;
    tex->b.vtbl = &svga_texture_vtbl;
@@ -1216,11 +1232,11 @@
 
    tex->rendered_to = CALLOC(1, sizeof(tex->rendered_to[0]));
    if (!tex->rendered_to)
-      goto fail;
+      goto out_no_rendered_to;
 
    tex->dirty = CALLOC(1, sizeof(tex->dirty[0]));
    if (!tex->dirty)
-      goto fail;
+      goto out_no_dirty;
 
    tex->imported = TRUE;
 
@@ -1228,14 +1244,14 @@
 
    return &tex->b.b;
 
-fail:
-   if (tex->defined)
-      FREE(tex->defined);
-   if (tex->rendered_to)
-      FREE(tex->rendered_to);
-   if (tex->dirty)
-      FREE(tex->dirty);
+out_no_dirty:
+   FREE(tex->rendered_to);
+out_no_rendered_to:
+   FREE(tex->defined);
+out_no_defined:
    FREE(tex);
+out_unref:
+   sws->surface_reference(sws, &srf, NULL);
    return NULL;
 }
 
@@ -1470,7 +1486,7 @@
    /* unmap the texture upload buffer */
    u_upload_unmap(svga->tex_upload);
 
-   srcsurf = svga_buffer_handle(svga, st->upload.buf);
+   srcsurf = svga_buffer_handle(svga, st->upload.buf, 0);
    dstsurf = svga_texture(texture)->handle;
    assert(dstsurf);
 
@@ -1506,3 +1522,28 @@
 
    pipe_resource_reference(&st->upload.buf, NULL);
 }
+
+/**
+ * Does the device format backing this surface have an
+ * alpha channel?
+ *
+ * \param[in] texture  The texture whose format we're querying
+ * \return TRUE if the format has an alpha channel, FALSE otherwise
+ *
+ * For locally created textures, the device (svga) format is typically
+ * identical to svga_format(texture->format), and we can use the gallium
+ * format tests to determine whether the device format has an alpha channel
+ * or not. However, for textures backed by imported svga surfaces that is
+ * not always true, and we have to look at the SVGA3D utilities.
+ */
+boolean
+svga_texture_device_format_has_alpha(struct pipe_resource *texture)
+{
+   /* the svga_texture() call below is invalid for PIPE_BUFFER resources */
+   assert(texture->target != PIPE_BUFFER);
+
+   enum svga3d_block_desc block_desc =
+      svga3dsurface_get_desc(svga_texture(texture)->key.format)->block_desc;
+
+   return !!(block_desc & SVGA3DBLOCKDESC_ALPHA); /* normalize to 0 or 1 */
+}
diff --git a/src/gallium/drivers/svga/svga_resource_texture.h b/src/gallium/drivers/svga/svga_resource_texture.h
index 9f7b0c6..fe52738 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.h
+++ b/src/gallium/drivers/svga/svga_resource_texture.h
@@ -104,6 +104,16 @@
     *  Set if the level is marked as dirty.
     */ 
    ushort *dirty;
+
+   /**
+    * A cached backing host side surface to be used if this texture is being
+    * used for rendering and sampling at the same time.
+    * Currently we only cache one handle. If needed, we can extend this to
+    * support multiple handles.
+    */
+   struct svga_host_surface_cache_key backed_key;
+   struct svga_winsys_surface *backed_handle;
+   unsigned backed_age;
 };
 
 
@@ -303,4 +313,7 @@
 svga_texture_transfer_unmap_upload(struct svga_context *svga,
                                    struct svga_transfer *st);
 
+boolean
+svga_texture_device_format_has_alpha(struct pipe_resource *texture);
+
 #endif /* SVGA_TEXTURE_H */
diff --git a/src/gallium/drivers/svga/svga_sampler_view.c b/src/gallium/drivers/svga/svga_sampler_view.c
index ee4ef3c..80a1b92 100644
--- a/src/gallium/drivers/svga/svga_sampler_view.c
+++ b/src/gallium/drivers/svga/svga_sampler_view.c
@@ -151,7 +151,7 @@
                                           flags, format,
                                           min_lod,
                                           max_lod - min_lod + 1,
-                                          -1, 1, -1,
+                                          -1, 1, -1, FALSE,
                                           &sv->key);
 
    if (!sv->handle) {
diff --git a/src/gallium/drivers/svga/svga_sampler_view.h b/src/gallium/drivers/svga/svga_sampler_view.h
index 7521a82..b6f5489 100644
--- a/src/gallium/drivers/svga/svga_sampler_view.h
+++ b/src/gallium/drivers/svga/svga_sampler_view.h
@@ -100,8 +100,8 @@
 }
 
 boolean
-svga_check_sampler_view_resource_collision(struct svga_context *svga,
-                                           struct svga_winsys_surface *res,
+svga_check_sampler_view_resource_collision(const struct svga_context *svga,
+                                           const struct svga_winsys_surface *res,
                                            enum pipe_shader_type shader);
 
 boolean
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 07f3346..77223c9 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -23,16 +23,20 @@
  *
  **********************************************************/
 
+#include "git_sha1.h" /* For MESA_GIT_SHA1 */
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
 #include "util/u_string.h"
 #include "util/u_math.h"
 
+#include "os/os_process.h"
+
 #include "svga_winsys.h"
 #include "svga_public.h"
 #include "svga_context.h"
 #include "svga_format.h"
+#include "svga_msg.h"
 #include "svga_screen.h"
 #include "svga_tgsi.h"
 #include "svga_resource_texture.h"
@@ -45,6 +49,10 @@
 /* NOTE: this constant may get moved into a svga3d*.h header file */
 #define SVGA3D_DX_MAX_RESOURCE_SIZE (128 * 1024 * 1024)
 
+#ifndef MESA_GIT_SHA1
+#define MESA_GIT_SHA1 "(unknown git revision)"
+#endif
+
 #ifdef DEBUG
 int SVGA_DEBUG = 0;
 
@@ -65,6 +73,7 @@
    { "cache",       DEBUG_CACHE, NULL },
    { "streamout",   DEBUG_STREAMOUT, NULL },
    { "query",       DEBUG_QUERY, NULL },
+   { "samplers",    DEBUG_SAMPLERS, NULL },
    DEBUG_NAMED_VALUE_END
 };
 #endif
@@ -328,6 +337,9 @@
    case PIPE_CAP_GENERATE_MIPMAP:
       return sws->have_generate_mipmap_cmd;
 
+   case PIPE_CAP_NATIVE_FENCE_FD:
+      return sws->have_fence_fd;
+
    /* Unsupported features */
    case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
    case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -371,7 +383,6 @@
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
    case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
-   case PIPE_CAP_NATIVE_FENCE_FD:
       return 0;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 64;
@@ -435,6 +446,10 @@
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
    }
 
@@ -514,6 +529,7 @@
       case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
       case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+      case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -574,6 +590,7 @@
       case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
       case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+      case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -668,6 +685,7 @@
    case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
    case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
    case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+   case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
@@ -838,7 +856,7 @@
       SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s fence_ptr %p\n",
                __FUNCTION__, fence);
 
-      retVal = sws->fence_finish(sws, fence, 0) == 0;
+      retVal = sws->fence_finish(sws, fence, timeout, 0) == 0;
    }
 
    SVGA_STATS_TIME_POP(sws);
@@ -848,6 +866,16 @@
 
 
 static int
+svga_fence_get_fd(struct pipe_screen *screen,
+                  struct pipe_fence_handle *fence)
+{
+   struct svga_winsys_screen *sws = svga_screen(screen)->sws;
+
+   return sws->fence_get_fd(sws, fence, TRUE); /* delegate to the winsys hook */
+}
+
+
+static int
 svga_get_driver_query_info(struct pipe_screen *screen,
                            unsigned index,
                            struct pipe_driver_query_info *info)
@@ -903,6 +931,8 @@
             PIPE_DRIVER_QUERY_TYPE_UINT64),
       QUERY("num-generate-mipmap", SVGA_QUERY_NUM_GENERATE_MIPMAP,
             PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-failed-allocations", SVGA_QUERY_NUM_FAILED_ALLOCATIONS,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
    };
 #undef QUERY
 
@@ -918,6 +948,35 @@
 
 
 static void
+init_logging(struct pipe_screen *screen)
+{
+   static const char *log_prefix = "Mesa: ";
+   char host_log[1000];
+
+   /* Send identification lines to the host log: first the driver name */
+   util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix),
+                 "%s%s", log_prefix, svga_get_name(screen));
+   svga_host_log(host_log);
+
+   util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix),
+                 "%s%s (%s)", log_prefix, PACKAGE_VERSION, MESA_GIT_SHA1);
+   svga_host_log(host_log); /* package version and git revision */
+
+   /* If the SVGA_EXTRA_LOGGING env var is set, log the process's command
+    * line (program name and arguments).
+    */
+   if (debug_get_bool_option("SVGA_EXTRA_LOGGING", FALSE)) {
+      char cmdline[1000];
+      if (os_get_command_line(cmdline, sizeof(cmdline))) {
+         util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix),
+                       "%s%s", log_prefix, cmdline);
+         svga_host_log(host_log);
+      }
+   }
+}
+
+
+static void
 svga_destroy_screen( struct pipe_screen *screen )
 {
    struct svga_screen *svgascreen = svga_screen(screen);
@@ -977,6 +1036,8 @@
    screen->context_create = svga_context_create;
    screen->fence_reference = svga_fence_reference;
    screen->fence_finish = svga_fence_finish;
+   screen->fence_get_fd = svga_fence_get_fd;
+
    screen->get_driver_query_info = svga_get_driver_query_info;
    svgascreen->sws = sws;
 
@@ -988,6 +1049,13 @@
       svgascreen->hw_version = SVGA3D_HWVERSION_WS65_B1;
    }
 
+   if (svgascreen->hw_version < SVGA3D_HWVERSION_WS8_B1) {
+      /* older than WS8_B1: too old for 3D acceleration, bail out via error2 */
+      debug_printf("Hardware version 0x%x is too old for accelerated 3D\n",
+                   svgascreen->hw_version);
+      goto error2;
+   }
+
    /*
     * The D16, D24X8, and D24S8 formats always do an implicit shadow compare
     * when sampled from, where as the DF16, DF24, and D24S8_INT do not.  So
@@ -1048,6 +1116,11 @@
             get_uint_cap(sws, SVGA3D_DEVCAP_MULTISAMPLE_MASKABLESAMPLES, 0);
       }
 
+      /* We only support 4x, 8x, 16x MSAA */
+      svgascreen->ms_samples &= ((1 << (4-1)) |
+                                 (1 << (8-1)) |
+                                 (1 << (16-1)));
+
       /* Maximum number of constant buffers */
       svgascreen->max_const_buffers =
          get_uint_cap(sws, SVGA3D_DEVCAP_DX_MAX_CONSTANT_BUFFERS, 1);
@@ -1110,10 +1183,12 @@
    }
 
    (void) mtx_init(&svgascreen->tex_mutex, mtx_plain);
-   (void) mtx_init(&svgascreen->swc_mutex, mtx_plain);
+   (void) mtx_init(&svgascreen->swc_mutex, mtx_recursive);
 
    svga_screen_cache_init(svgascreen);
 
+   init_logging(screen);
+
    return screen;
 error2:
    FREE(svgascreen);
diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h
index 68834a6..12b9346 100644
--- a/src/gallium/drivers/svga/svga_screen.h
+++ b/src/gallium/drivers/svga/svga_screen.h
@@ -86,6 +86,7 @@
       /** Memory used by all resources (buffers and surfaces) */
       uint64_t total_resource_bytes;
       uint64_t num_resources;
+      uint64_t num_failed_allocations;
    } hud;
 };
 
diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c
index 86c9798..5cec435 100644
--- a/src/gallium/drivers/svga/svga_screen_cache.c
+++ b/src/gallium/drivers/svga/svga_screen_cache.c
@@ -476,6 +476,7 @@
             key->cachable);
 
    if (cachable) {
+      /* Try to re-cycle a previously freed, cached surface */
       if (key->format == SVGA3D_BUFFER) {
          SVGA3dSurfaceFlags hint_flag;
 
@@ -536,6 +537,7 @@
    }
 
    if (!handle) {
+      /* Unable to recycle surface, allocate a new one */
       unsigned usage = 0;
 
       if (!key->cachable)
diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c
index 55f7922..9e2b657 100644
--- a/src/gallium/drivers/svga/svga_shader.c
+++ b/src/gallium/drivers/svga/svga_shader.c
@@ -25,10 +25,12 @@
 
 #include "util/u_bitmask.h"
 #include "util/u_memory.h"
+#include "util/u_format.h"
 #include "svga_context.h"
 #include "svga_cmd.h"
 #include "svga_format.h"
 #include "svga_shader.h"
+#include "svga_resource_texture.h"
 
 
 /**
@@ -160,6 +162,25 @@
    return remap_table[generic_index];
 }
 
+static const enum pipe_swizzle copy_alpha[PIPE_SWIZZLE_MAX] = {
+   PIPE_SWIZZLE_X,      /* Table indexed by a sampler view's swizzle value  */
+   PIPE_SWIZZLE_Y,      /* (see swizzle_tab[view->swizzle_r] etc.).         */
+   PIPE_SWIZZLE_Z,      /* Presumably an identity mapping, i.e. the enum    */
+   PIPE_SWIZZLE_W,      /* order is X,Y,Z,W,0,1,NONE -- TODO confirm.       */
+   PIPE_SWIZZLE_0,
+   PIPE_SWIZZLE_1,
+   PIPE_SWIZZLE_NONE
+};
+
+static const enum pipe_swizzle set_alpha[PIPE_SWIZZLE_MAX] = {
+   PIPE_SWIZZLE_X,      /* Same layout as copy_alpha, except that the       */
+   PIPE_SWIZZLE_Y,      /* fourth entry (where copy_alpha has W) yields     */
+   PIPE_SWIZZLE_Z,      /* PIPE_SWIZZLE_1: used for non-alpha views of      */
+   PIPE_SWIZZLE_1,      /* surfaces whose device format has alpha.          */
+   PIPE_SWIZZLE_0,
+   PIPE_SWIZZLE_1,
+   PIPE_SWIZZLE_NONE
+};
 
 /**
  * Initialize the shader-neutral fields of svga_compile_key from context
@@ -177,13 +198,13 @@
    /* In case the number of samplers and sampler_views doesn't match,
-    * loop over the lower of the two counts.
+    * loop over the higher of the two counts.
     */
-   key->num_textures = MIN2(svga->curr.num_sampler_views[shader],
+   key->num_textures = MAX2(svga->curr.num_sampler_views[shader],
                             svga->curr.num_samplers[shader]);
 
    for (i = 0; i < key->num_textures; i++) {
       struct pipe_sampler_view *view = svga->curr.sampler_views[shader][i];
       const struct svga_sampler_state *sampler = svga->curr.sampler[shader][i];
-      if (view && sampler) {
+      if (view) {
          assert(view->texture);
          assert(view->texture->target < (1 << 4)); /* texture_target:4 */
 
@@ -202,17 +223,35 @@
             }
          }
 
+         /* If we have a non-alpha view into an svga3d surface with an
+          * alpha channel, then explicitly set the alpha channel to 1
+          * when sampling. Note that we need to check the
+          * actual device format to cover also imported surface cases.
+          */
+         const enum pipe_swizzle *swizzle_tab =
+            (view->texture->target != PIPE_BUFFER &&
+             !util_format_has_alpha(view->format) &&
+             svga_texture_device_format_has_alpha(view->texture)) ?
+            set_alpha : copy_alpha;
+
+         key->tex[i].swizzle_r = swizzle_tab[view->swizzle_r];
+         key->tex[i].swizzle_g = swizzle_tab[view->swizzle_g];
+         key->tex[i].swizzle_b = swizzle_tab[view->swizzle_b];
+         key->tex[i].swizzle_a = swizzle_tab[view->swizzle_a];
+      }
+
+      if (sampler) {
          if (!sampler->normalized_coords) {
             assert(idx < (1 << 5));  /* width_height_idx:5 bitfield */
             key->tex[i].width_height_idx = idx++;
             key->tex[i].unnormalized = TRUE;
             ++key->num_unnormalized_coords;
-         }
 
-         key->tex[i].swizzle_r = view->swizzle_r;
-         key->tex[i].swizzle_g = view->swizzle_g;
-         key->tex[i].swizzle_b = view->swizzle_b;
-         key->tex[i].swizzle_a = view->swizzle_a;
+            if (sampler->magfilter == SVGA3D_TEX_FILTER_NEAREST ||
+                sampler->minfilter == SVGA3D_TEX_FILTER_NEAREST) {
+                key->tex[i].texel_bias = TRUE;
+            }
+         }
       }
    }
 }
diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h
index ec116c0..a594d12 100644
--- a/src/gallium/drivers/svga/svga_shader.h
+++ b/src/gallium/drivers/svga/svga_shader.h
@@ -97,6 +97,7 @@
       unsigned compare_mode:1;
       unsigned compare_func:3;
       unsigned unnormalized:1;
+      unsigned texel_bias:1;
       unsigned width_height_idx:5; /**< texture unit */
       unsigned is_array:1;
       unsigned sprite_texgen:1;
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index 31e4be0..2a61e8e 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -72,9 +72,9 @@
    unsigned count = 0;
 
    for (i = 0; i < variant->key.num_textures; i++) {
-      struct pipe_sampler_view *sv = svga->curr.sampler_views[shader][i];
+      const struct pipe_sampler_view *sv = svga->curr.sampler_views[shader][i];
       if (sv) {
-         struct pipe_resource *tex = sv->texture;
+         const struct pipe_resource *tex = sv->texture;
          /* Scaling factors needed for handling unnormalized texture coordinates
           * for texture rectangles.
           */
@@ -150,7 +150,7 @@
 static unsigned
 svga_get_pt_sprite_constants(struct svga_context *svga, float **dest)
 {
-   struct svga_screen *screen = svga_screen(svga->pipe.screen);
+   const struct svga_screen *screen = svga_screen(svga->pipe.screen);
    float *dst = *dest;
 
    dst[0] = 1.0 / (svga->curr.viewport.scale[0] * 2);
@@ -591,7 +591,8 @@
       /* we must unmap the buffer before getting the winsys handle */
       u_upload_unmap(svga->const0_upload);
 
-      dst_handle = svga_buffer_handle(svga, dst_buffer);
+      dst_handle = svga_buffer_handle(svga, dst_buffer,
+                                      PIPE_BIND_CONSTANT_BUFFER);
       if (!dst_handle) {
          pipe_resource_reference(&dst_buffer, NULL);
          return PIPE_ERROR_OUT_OF_MEMORY;
@@ -660,7 +661,8 @@
       struct svga_winsys_surface *handle;
 
       if (buffer) {
-         handle = svga_buffer_handle(svga, &buffer->b.b);
+         handle = svga_buffer_handle(svga, &buffer->b.b,
+                                     PIPE_BIND_CONSTANT_BUFFER);
          enabled_constbufs |= 1 << index;
       }
       else {
diff --git a/src/gallium/drivers/svga/svga_state_framebuffer.c b/src/gallium/drivers/svga/svga_state_framebuffer.c
index ee767bd..c52b7ee 100644
--- a/src/gallium/drivers/svga/svga_state_framebuffer.c
+++ b/src/gallium/drivers/svga/svga_state_framebuffer.c
@@ -34,6 +34,7 @@
 #include "svga_debug.h"
 #include "svga_screen.h"
 #include "svga_surface.h"
+#include "svga_resource_texture.h"
 
 
 /*
@@ -82,6 +83,13 @@
 
          pipe_surface_reference(&hw->cbufs[i], curr->cbufs[i]);
       }
+
+      /* Set the rendered-to flag */
+      struct pipe_surface *s = curr->cbufs[i];
+      if (s) {
+         svga_set_texture_rendered_to(svga_texture(s->texture),
+                                      s->u.tex.first_layer, s->u.tex.level);
+      }
    }
 
    if ((curr->zsbuf != hw->zsbuf) || (reemit && hw->zsbuf)) {
@@ -107,6 +115,13 @@
       }
 
       pipe_surface_reference(&hw->zsbuf, curr->zsbuf);
+
+      /* Set the rendered-to flag */
+      struct pipe_surface *s = curr->zsbuf;
+      if (s) {
+         svga_set_texture_rendered_to(svga_texture(s->texture),
+                                      s->u.tex.first_layer, s->u.tex.level);
+      }
    }
 
    return PIPE_OK;
@@ -195,14 +210,19 @@
     */
    for (i = 0; i < num_color; i++) {
       if (curr->cbufs[i]) {
-         rtv[i] = svga_validate_surface_view(svga,
-                                             svga_surface(curr->cbufs[i]));
+         struct pipe_surface *s = curr->cbufs[i];
+
+         rtv[i] = svga_validate_surface_view(svga, svga_surface(s));
          if (rtv[i] == NULL) {
             return PIPE_ERROR_OUT_OF_MEMORY;
          }
 
          assert(svga_surface(rtv[i])->view_id != SVGA3D_INVALID_ID);
          last_rtv = i;
+
+         /* Set the rendered-to flag */
+         svga_set_texture_rendered_to(svga_texture(s->texture),
+                                      s->u.tex.first_layer, s->u.tex.level);
       }
       else {
          rtv[i] = NULL;
@@ -211,19 +231,25 @@
 
    /* Setup depth stencil view */
    if (curr->zsbuf) {
+      struct pipe_surface *s = curr->zsbuf;
+
       dsv = svga_validate_surface_view(svga, svga_surface(curr->zsbuf));
       if (!dsv) {
          return PIPE_ERROR_OUT_OF_MEMORY;
       }
+
+      /* Set the rendered-to flag */
+      svga_set_texture_rendered_to(svga_texture(s->texture),
+                                      s->u.tex.first_layer, s->u.tex.level);
    }
    else {
       dsv = NULL;
    }
 
    /* avoid emitting redundant SetRenderTargets command */
-   if ((num_color != svga->state.hw_draw.num_rendertargets) ||
-       (dsv != svga->state.hw_draw.dsv) ||
-       memcmp(rtv, svga->state.hw_draw.rtv, num_color * sizeof(rtv[0]))) {
+   if ((num_color != svga->state.hw_clear.num_rendertargets) ||
+       (dsv != svga->state.hw_clear.dsv) ||
+       memcmp(rtv, svga->state.hw_clear.rtv, num_color * sizeof(rtv[0]))) {
 
       ret = SVGA3D_vgpu10_SetRenderTargets(svga->swc, num_color, rtv, dsv);
       if (ret != PIPE_OK)
@@ -232,9 +258,9 @@
       /* number of render targets sent to the device, not including trailing
        * unbound render targets.
        */
-      svga->state.hw_draw.num_rendertargets = last_rtv + 1;
-      svga->state.hw_draw.dsv = dsv;
-      memcpy(svga->state.hw_draw.rtv, rtv, num_color * sizeof(rtv[0]));
+      svga->state.hw_clear.num_rendertargets = last_rtv + 1;
+      svga->state.hw_clear.dsv = dsv;
+      memcpy(svga->state.hw_clear.rtv, rtv, num_color * sizeof(rtv[0]));
     
       for (i = 0; i < ss->max_color_buffers; i++) {
          if (hw->cbufs[i] != curr->cbufs[i]) {
@@ -309,7 +335,7 @@
 enum pipe_error
 svga_rebind_framebuffer_bindings(struct svga_context *svga)
 {
-   struct svga_hw_draw_state *hw = &svga->state.hw_draw;
+   struct svga_hw_clear_state *hw = &svga->state.hw_clear;
    unsigned i;
    enum pipe_error ret;
 
diff --git a/src/gallium/drivers/svga/svga_state_gs.c b/src/gallium/drivers/svga/svga_state_gs.c
index 2174638..19f0887 100644
--- a/src/gallium/drivers/svga/svga_state_gs.c
+++ b/src/gallium/drivers/svga/svga_state_gs.c
@@ -201,11 +201,17 @@
     * it instead of the one from the vertex shader.
     */
    if (svga_have_gs_streamout(svga)) {
-      svga_set_stream_output(svga, gs->base.stream_output);
+      ret = svga_set_stream_output(svga, gs->base.stream_output);
+      if (ret != PIPE_OK) {
+         goto done;
+      }
    }
    else if (!svga_have_vs_streamout(svga)) {
       /* turn off stream out */
-      svga_set_stream_output(svga, NULL);
+      ret = svga_set_stream_output(svga, NULL);
+      if (ret != PIPE_OK) {
+         goto done;
+      }
    }
 
    /* SVGA_NEW_NEED_SWTNL */
diff --git a/src/gallium/drivers/svga/svga_state_sampler.c b/src/gallium/drivers/svga/svga_state_sampler.c
index 445afcc..c361dba 100644
--- a/src/gallium/drivers/svga/svga_state_sampler.c
+++ b/src/gallium/drivers/svga/svga_state_sampler.c
@@ -44,7 +44,7 @@
 #include "svga_shader.h"
 #include "svga_state.h"
 #include "svga_surface.h"
-
+#include "svga3d_surfacedefs.h"
 
 /** Get resource handle for a texture or buffer */
 static inline struct svga_winsys_surface *
@@ -64,8 +64,8 @@
  * any of the resources bound to any of the currently bound sampler views.
  */
 boolean
-svga_check_sampler_view_resource_collision(struct svga_context *svga,
-                                           struct svga_winsys_surface *res,
+svga_check_sampler_view_resource_collision(const struct svga_context *svga,
+                                           const struct svga_winsys_surface *res,
                                            enum pipe_shader_type shader)
 {
    struct pipe_screen *screen = svga->pipe.screen;
@@ -141,11 +141,11 @@
        * create a BGRA view (and vice versa).
        */
       if (viewFormat == PIPE_FORMAT_B8G8R8X8_UNORM &&
-          texture->format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+          svga_texture_device_format_has_alpha(texture)) {
          viewFormat = PIPE_FORMAT_B8G8R8A8_UNORM;
       }
       else if (viewFormat == PIPE_FORMAT_B8G8R8A8_UNORM &&
-          texture->format == PIPE_FORMAT_B8G8R8X8_UNORM) {
+               !svga_texture_device_format_has_alpha(texture)) {
          viewFormat = PIPE_FORMAT_B8G8R8X8_UNORM;
       }
 
diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c
index 43b45e5..455b173 100644
--- a/src/gallium/drivers/svga/svga_state_tss.c
+++ b/src/gallium/drivers/svga/svga_state_tss.c
@@ -28,6 +28,7 @@
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 
+#include "svga_resource_texture.h"
 #include "svga_sampler_view.h"
 #include "svga_winsys.h"
 #include "svga_context.h"
@@ -175,6 +176,8 @@
                             &queue);
    }
 
+   svga->state.hw_draw.num_backed_views = 0;
+
    if (queue.bind_count) {
       SVGA3dTextureState *ts;
 
@@ -185,12 +188,19 @@
 
       for (i = 0; i < queue.bind_count; i++) {
          struct svga_winsys_surface *handle;
+         struct svga_hw_view_state *view = queue.bind[i].view;
 
          ts[i].stage = queue.bind[i].unit;
          ts[i].name = SVGA3D_TS_BIND_TEXTURE;
 
-         if (queue.bind[i].view->v) {
-            handle = queue.bind[i].view->v->handle;
+         if (view->v) {
+            handle = view->v->handle;
+
+            /* Keep track of number of views with a backing copy
+             * of texture.
+             */
+            if (handle != svga_texture(view->texture)->handle)
+               svga->state.hw_draw.num_backed_views++;
          }
          else {
             handle = NULL;
diff --git a/src/gallium/drivers/svga/svga_state_vdecl.c b/src/gallium/drivers/svga/svga_state_vdecl.c
index e1b6a1c..fd6a238 100644
--- a/src/gallium/drivers/svga/svga_state_vdecl.c
+++ b/src/gallium/drivers/svga/svga_state_vdecl.c
@@ -73,10 +73,10 @@
       unsigned int offset = vb->buffer_offset + ve[i].src_offset;
       unsigned tmp_neg_bias = 0;
 
-      if (!vb->buffer)
+      if (!vb->buffer.resource)
          continue;
 
-      buffer = svga_buffer(vb->buffer);
+      buffer = svga_buffer(vb->buffer.resource);
       if (buffer->uploaded.start > offset) {
          tmp_neg_bias = buffer->uploaded.start - offset;
          if (vb->stride)
@@ -91,10 +91,10 @@
       unsigned usage, index;
       struct svga_buffer *buffer;
 
-      if (!vb->buffer)
+      if (!vb->buffer.resource)
          continue;
 
-      buffer = svga_buffer(vb->buffer);
+      buffer = svga_buffer(vb->buffer.resource);
       svga_generate_vdecl_semantics( i, &usage, &index );
 
       /* SVGA_NEW_VELEMENT
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 325ef3e..a0ab868 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -353,11 +353,14 @@
       /* No GS stream out */
       if (svga_have_vs_streamout(svga)) {
          /* Set VS stream out */
-         svga_set_stream_output(svga, vs->base.stream_output);
+         ret = svga_set_stream_output(svga, vs->base.stream_output);
       }
       else {
          /* turn off stream out */
-         svga_set_stream_output(svga, NULL);
+         ret = svga_set_stream_output(svga, NULL);
+      }
+      if (ret != PIPE_OK) {
+         goto done;
       }
    }
 
diff --git a/src/gallium/drivers/svga/svga_surface.c b/src/gallium/drivers/svga/svga_surface.c
index 331a4cd..d7c9850 100644
--- a/src/gallium/drivers/svga/svga_surface.c
+++ b/src/gallium/drivers/svga/svga_surface.c
@@ -158,16 +158,20 @@
                           int layer_pick,
                           unsigned num_layers,
                           int zslice_pick,
+                          boolean cacheable,
                           struct svga_host_surface_cache_key *key) /* OUT */
 {
    struct svga_screen *ss = svga_screen(svga->pipe.screen);
-   struct svga_winsys_surface *handle;
+   struct svga_winsys_surface *handle = NULL;
    boolean validated;
+   boolean needCopyResource;
 
    SVGA_DBG(DEBUG_PERF,
             "svga: Create surface view: layer %d zslice %d mips %d..%d\n",
             layer_pick, zslice_pick, start_mip, start_mip+num_mip-1);
 
+   SVGA_STATS_TIME_PUSH(ss->sws, SVGA_STATS_TIME_EMULATESURFACEVIEW);
+
    key->flags = flags;
    key->format = format;
    key->numMipLevels = num_mip;
@@ -195,15 +199,28 @@
 
    if (key->format == SVGA3D_FORMAT_INVALID) {
       key->cachable = 0;
-      return NULL;
+      goto done;
    }
 
-   SVGA_DBG(DEBUG_DMA, "surface_create for texture view\n");
-   handle = svga_screen_surface_create(ss, bind_flags, PIPE_USAGE_DEFAULT,
-                                       &validated, key);
+   if (cacheable && tex->backed_handle &&
+       memcmp(key, &tex->backed_key, sizeof *key) == 0) {
+      handle = tex->backed_handle;
+      needCopyResource = tex->backed_age < tex->age;
+   } else {
+      SVGA_DBG(DEBUG_DMA, "surface_create for texture view\n");
+      handle = svga_screen_surface_create(ss, bind_flags, PIPE_USAGE_DEFAULT,
+                                          &validated, key);
+      needCopyResource = TRUE;
+
+      if (cacheable && !tex->backed_handle) {
+         tex->backed_handle = handle;
+         memcpy(&tex->backed_key, key, sizeof *key);
+      }
+   }
+
    if (!handle) {
       key->cachable = 0;
-      return NULL;
+      goto done;
    }
 
    SVGA_DBG(DEBUG_DMA, " --> got sid %p (texture view)\n", handle);
@@ -211,10 +228,16 @@
    if (layer_pick < 0)
       layer_pick = 0;
 
-   svga_texture_copy_handle_resource(svga, tex, handle,
-                                     key->numMipLevels,
-                                     key->numFaces * key->arraySize,
-                                     zslice_pick, start_mip, layer_pick);
+   if (needCopyResource) {
+      svga_texture_copy_handle_resource(svga, tex, handle,
+                                        key->numMipLevels,
+                                        key->numFaces * key->arraySize,
+                                        zslice_pick, start_mip, layer_pick);
+      tex->backed_age = tex->age;
+   }
+
+done:
+   SVGA_STATS_TIME_POP(ss->sws);
 
    return handle;
 }
@@ -287,15 +310,23 @@
       bind = PIPE_BIND_RENDER_TARGET;
    }
 
-   if (tex->imported)
+   if (tex->imported) {
+      /* imported resource (a window) */
       format = tex->key.format;
-   else
+      if (util_format_is_srgb(surf_tmpl->format)) {
+         /* sRGB rendering to window */
+         format = svga_linear_to_srgb(format);
+      }
+   }
+   else {
       format = svga_translate_format(ss, surf_tmpl->format, bind);
+   }
 
    assert(format != SVGA3D_FORMAT_INVALID);
 
    if (view) {
-      SVGA_DBG(DEBUG_VIEWS, "svga: Surface view: yes %p, level %u layer %u z %u, %p\n",
+      SVGA_DBG(DEBUG_VIEWS,
+               "New backed surface view: resource %p, level %u layer %u z %u, %p\n",
                pt, surf_tmpl->u.tex.level, layer, zslice, s);
 
       if (svga_have_vgpu10(svga)) {
@@ -327,7 +358,8 @@
       s->handle = svga_texture_view_surface(svga, tex, bind, flags,
                                             tex->key.format,
                                             surf_tmpl->u.tex.level, 1,
-                                            layer, nlayers, zslice, &s->key);
+                                            layer, nlayers, zslice,
+                                            TRUE, &s->key);
       if (!s->handle) {
          FREE(s);
          goto done;
@@ -339,7 +371,7 @@
       s->real_zslice = 0;
    } else {
       SVGA_DBG(DEBUG_VIEWS,
-               "svga: Surface view: no %p, level %u, layer %u, z %u, %p\n",
+               "New surface view: resource %p, level %u, layer %u, z %u, %p\n",
                pt, surf_tmpl->u.tex.level, layer, zslice, s);
 
       memset(&s->key, 0, sizeof s->key);
@@ -400,25 +432,28 @@
 {
    struct svga_texture *tex = svga_texture(s->base.texture);
 
-   SVGA_STATS_TIME_PUSH(svga_sws(svga),
-                        SVGA_STATS_TIME_CREATEBACKEDSURFACEVIEW);
-
    if (!s->backed) {
       struct pipe_surface *backed_view;
 
+      SVGA_STATS_TIME_PUSH(svga_sws(svga),
+                           SVGA_STATS_TIME_CREATEBACKEDSURFACEVIEW);
+
       backed_view = svga_create_surface_view(&svga->pipe,
                                              &tex->b.b,
                                              &s->base,
                                              TRUE);
       if (!backed_view)
-         return NULL;
+         goto done;
 
       s->backed = svga_surface(backed_view);
+
+      SVGA_STATS_TIME_POP(svga_sws(svga));
    }
-   else {
+   else if (s->backed->age < tex->age) {
       /*
        * There is already an existing backing surface, but we still need to
-       * sync the handles.
+       * sync the backing resource if the original resource has been modified
+       * since the last copy.
        */
       struct svga_surface *bs = s->backed;
       unsigned int layer, zslice;
@@ -444,9 +479,9 @@
    }
 
    svga_mark_surface_dirty(&s->backed->base);
+   s->backed->age = tex->age;
 
-   SVGA_STATS_TIME_POP(svga_sws(svga));
-
+done:
    return s->backed;
 }
 
@@ -556,7 +591,6 @@
       }
    }
    
-done:
    SVGA_STATS_TIME_POP(svga_sws(svga));
 
    return s ? &s->base : NULL;
@@ -582,7 +616,10 @@
       s->backed = NULL;
    }
 
-   if (s->handle != t->handle) {
+   /* Destroy the surface handle if this is a backed handle and
+    * it is not being cached in the texture.
+    */
+   if (s->handle != t->handle && s->handle != t->backed_handle) {
       SVGA_DBG(DEBUG_DMA, "unref sid %p (tex surface)\n", s->handle);
       svga_screen_surface_destroy(ss, &s->key, &s->handle);
    }
@@ -646,8 +683,11 @@
 
    /* Increment the view_age and texture age for this surface's mipmap
     * level so that any sampler views into the texture are re-validated too.
+    * Note: we age the texture for backed surface view only when the
+    *       backed surface is propagated to the original surface.
     */
-   svga_age_texture_view(tex, surf->u.tex.level);
+   if (s->handle == tex->handle)
+      svga_age_texture_view(tex, surf->u.tex.level);
 }
 
 
@@ -655,13 +695,27 @@
 svga_mark_surfaces_dirty(struct svga_context *svga)
 {
    unsigned i;
+   struct svga_hw_clear_state *hw = &svga->state.hw_clear;
 
-   for (i = 0; i < svga->curr.framebuffer.nr_cbufs; i++) {
-      if (svga->curr.framebuffer.cbufs[i])
-         svga_mark_surface_dirty(svga->curr.framebuffer.cbufs[i]);
+   if (svga_have_vgpu10(svga)) {
+
+      /* For VGPU10, mark the dirty bit in the rendertarget/depth stencil view surface.
+       * This surface can be the backed surface.
+       */
+      for (i = 0; i < hw->num_rendertargets; i++) {
+         if (hw->rtv[i])
+            svga_mark_surface_dirty(hw->rtv[i]);
+      }
+      if (hw->dsv)
+         svga_mark_surface_dirty(hw->dsv);
+   } else {
+      for (i = 0; i < svga->curr.framebuffer.nr_cbufs; i++) {
+         if (svga->curr.framebuffer.cbufs[i])
+            svga_mark_surface_dirty(svga->curr.framebuffer.cbufs[i]);
+      }
+      if (svga->curr.framebuffer.zsbuf)
+         svga_mark_surface_dirty(svga->curr.framebuffer.zsbuf);
    }
-   if (svga->curr.framebuffer.zsbuf)
-      svga_mark_surface_dirty(svga->curr.framebuffer.zsbuf);
 }
 
 
@@ -714,8 +768,8 @@
       }
 
       SVGA_DBG(DEBUG_VIEWS,
-               "svga: Surface propagate: tex %p, level %u, from %p\n",
-               tex, surf->u.tex.level, surf);
+               "Propagate surface %p to resource %p, level %u\n",
+               surf, tex, surf->u.tex.level);
       for (i = 0; i < nlayers; i++) {
          svga_texture_copy_handle(svga,
                                   s->handle, 0, 0, 0, s->real_level,
@@ -727,6 +781,16 @@
                                   1);
          svga_define_texture_level(tex, layer + i, surf->u.tex.level);
       }
+
+      /* Sync the surface view age with the texture age */
+      s->age = tex->age;
+
+      /* If this backed surface is cached in the texture,
+       * update the backed age as well.
+       */
+      if (tex->backed_handle == s->handle) {
+         tex->backed_age = tex->age;
+      }
    }
 
    SVGA_STATS_TIME_POP(ss->sws);
@@ -750,15 +814,15 @@
     * not the svga->curr.framebuffer surfaces, because it's the former
     * surfaces which may be backing surface views (the actual render targets).
     */
-   for (i = 0; i < svga->state.hw_draw.num_rendertargets; i++) {
-      struct pipe_surface *s = svga->state.hw_draw.rtv[i];
+   for (i = 0; i < svga->state.hw_clear.num_rendertargets; i++) {
+      struct pipe_surface *s = svga->state.hw_clear.rtv[i];
       if (s) {
          svga_propagate_surface(svga, s, FALSE);
       }
    }
 
-   if (svga->state.hw_draw.dsv) {
-      svga_propagate_surface(svga, svga->state.hw_draw.dsv, FALSE);
+   if (svga->state.hw_clear.dsv) {
+      svga_propagate_surface(svga, svga->state.hw_clear.dsv, FALSE);
    }
 }
 
diff --git a/src/gallium/drivers/svga/svga_surface.h b/src/gallium/drivers/svga/svga_surface.h
index 7cbb767..8df1006 100644
--- a/src/gallium/drivers/svga/svga_surface.h
+++ b/src/gallium/drivers/svga/svga_surface.h
@@ -72,6 +72,9 @@
     * original surface is the shader resource.
     */
    struct svga_surface *backed;
+   unsigned age;                   /* timestamp when the backed resource is
+                                    * synced with the original resource.
+                                    */
 };
 
 
@@ -99,6 +102,7 @@
                           int layer_pick,
                           unsigned num_layers,
                           int zslice_pick,
+                          boolean cacheable,
                           struct svga_host_surface_cache_key *key); /* OUT */
 
 
diff --git a/src/gallium/drivers/svga/svga_swtnl.h b/src/gallium/drivers/svga/svga_swtnl.h
index fc094e5..0661b71 100644
--- a/src/gallium/drivers/svga/svga_swtnl.h
+++ b/src/gallium/drivers/svga/svga_swtnl.h
@@ -39,7 +39,9 @@
 
 enum pipe_error
 svga_swtnl_draw_vbo(struct svga_context *svga,
-                    const struct pipe_draw_info *info);
+                    const struct pipe_draw_info *info,
+                    struct pipe_resource *indexbuf,
+                    unsigned index_offset);
 
 
 #endif
diff --git a/src/gallium/drivers/svga/svga_swtnl_backend.c b/src/gallium/drivers/svga/svga_swtnl_backend.c
index 576fd85..26f8107 100644
--- a/src/gallium/drivers/svga/svga_swtnl_backend.c
+++ b/src/gallium/drivers/svga/svga_swtnl_backend.c
@@ -45,7 +45,7 @@
 
 
 static const struct vertex_info *
-svga_vbuf_render_get_vertex_info( struct vbuf_render *render )
+svga_vbuf_render_get_vertex_info(struct vbuf_render *render)
 {
    struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
    struct svga_context *svga = svga_render->svga;
@@ -57,9 +57,9 @@
 
 
 static boolean
-svga_vbuf_render_allocate_vertices( struct vbuf_render *render,
-                                    ushort vertex_size,
-                                    ushort nr_vertices )
+svga_vbuf_render_allocate_vertices(struct vbuf_render *render,
+                                   ushort vertex_size,
+                                   ushort nr_vertices)
 {
    struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
    struct svga_context *svga = svga_render->svga;
@@ -79,7 +79,8 @@
       new_ibuf = new_vbuf = TRUE;
    svga->swtnl.new_vbuf = FALSE;
 
-   if (svga_render->vbuf_size < svga_render->vbuf_offset + svga_render->vbuf_used + size)
+   if (svga_render->vbuf_size
+       < svga_render->vbuf_offset + svga_render->vbuf_used + size)
       new_vbuf = TRUE;
 
    if (new_vbuf)
@@ -93,7 +94,7 @@
                                              PIPE_BIND_VERTEX_BUFFER,
                                              PIPE_USAGE_STREAM,
                                              svga_render->vbuf_size);
-      if(!svga_render->vbuf) {
+      if (!svga_render->vbuf) {
          svga_context_flush(svga, NULL);
          assert(!svga_render->vbuf);
          svga_render->vbuf = pipe_buffer_create(screen,
@@ -121,8 +122,9 @@
    return TRUE;
 }
 
+
 static void *
-svga_vbuf_render_map_vertices( struct vbuf_render *render )
+svga_vbuf_render_map_vertices(struct vbuf_render *render)
 {
    struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
    struct svga_context *svga = svga_render->svga;
@@ -158,10 +160,11 @@
    return retPtr;
 }
 
+
 static void
-svga_vbuf_render_unmap_vertices( struct vbuf_render *render,
-                                 ushort min_index,
-                                 ushort max_index )
+svga_vbuf_render_unmap_vertices(struct vbuf_render *render,
+                                ushort min_index,
+                                ushort max_index)
 {
    struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
    struct svga_context *svga = svga_render->svga;
@@ -186,8 +189,8 @@
    }
 
    pipe_buffer_flush_mapped_range(&svga->pipe,
-				  svga_render->vbuf_transfer,
-				  offset, length);
+                                  svga_render->vbuf_transfer,
+                                  offset, length);
    pipe_buffer_unmap(&svga->pipe, svga_render->vbuf_transfer);
    svga_render->min_index = min_index;
    svga_render->max_index = max_index;
@@ -196,16 +199,18 @@
    SVGA_STATS_TIME_POP(svga_sws(svga));
 }
 
+
 static void
-svga_vbuf_render_set_primitive( struct vbuf_render *render,
-                                enum pipe_prim_type prim )
+svga_vbuf_render_set_primitive(struct vbuf_render *render,
+                               enum pipe_prim_type prim)
 {
    struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
    svga_render->prim = prim;
 }
 
+
 static void
-svga_vbuf_submit_state( struct svga_vbuf_render *svga_render )
+svga_vbuf_submit_state(struct svga_vbuf_render *svga_render)
 {
    struct svga_context *svga = svga_render->svga;
    SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
@@ -245,10 +250,10 @@
    /* Specify the vertex buffer (there's only ever one) */
    {
       struct pipe_vertex_buffer vb;
-      vb.buffer = svga_render->vbuf;
+      vb.is_user_buffer = false;
+      vb.buffer.resource = svga_render->vbuf;
       vb.buffer_offset = svga_render->vdecl_offset;
       vb.stride = vdecl[0].array.stride;
-      vb.user_buffer = NULL;
       svga_hwtnl_vertex_buffers(svga->hwtnl, 1, &vb);
    }
 
@@ -260,10 +265,10 @@
       svga_hwtnl_set_fillmode(svga->hwtnl, PIPE_POLYGON_MODE_FILL);
    }
    else {
-      svga_hwtnl_set_flatshade( svga->hwtnl,
+      svga_hwtnl_set_flatshade(svga->hwtnl,
                                 svga->curr.rast->templ.flatshade ||
                                 svga->state.hw_draw.fs->uses_flat_interp,
-                                svga->curr.rast->templ.flatshade_first );
+                                svga->curr.rast->templ.flatshade_first);
 
       svga_hwtnl_set_fillmode(svga->hwtnl, svga->curr.rast->hw_fillmode);
    }
@@ -272,13 +277,15 @@
    SVGA_STATS_TIME_POP(svga_sws(svga));
 }
 
+
 static void
-svga_vbuf_render_draw_arrays( struct vbuf_render *render,
-                              unsigned start, uint nr )
+svga_vbuf_render_draw_arrays(struct vbuf_render *render,
+                              unsigned start, uint nr)
 {
    struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
    struct svga_context *svga = svga_render->svga;
-   unsigned bias = (svga_render->vbuf_offset - svga_render->vdecl_offset) / svga_render->vertex_size;
+   unsigned bias = (svga_render->vbuf_offset - svga_render->vdecl_offset)
+      / svga_render->vertex_size;
    enum pipe_error ret = PIPE_OK;
    /* instancing will already have been resolved at this point by 'draw' */
    const unsigned start_instance = 0;
@@ -293,10 +300,10 @@
     * altered some of our state behind our backs.  Testcase:
     * redbook/polys.c
     */
-   svga_update_state_retry( svga, SVGA_STATE_HW_DRAW );
+   svga_update_state_retry(svga, SVGA_STATE_HW_DRAW);
 
-   ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim, start + bias, nr,
-                                start_instance, instance_count);
+   ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim, start + bias,
+                                 nr, start_instance, instance_count);
    if (ret != PIPE_OK) {
       svga_context_flush(svga, NULL);
       ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim,
@@ -310,22 +317,24 @@
 
 
 static void
-svga_vbuf_render_draw_elements( struct vbuf_render *render,
+svga_vbuf_render_draw_elements(struct vbuf_render *render,
                                 const ushort *indices,
                                 uint nr_indices)
 {
    struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
    struct svga_context *svga = svga_render->svga;
    struct pipe_screen *screen = svga->pipe.screen;
-   int bias = (svga_render->vbuf_offset - svga_render->vdecl_offset) / svga_render->vertex_size;
+   int bias = (svga_render->vbuf_offset - svga_render->vdecl_offset)
+      / svga_render->vertex_size;
    boolean ret;
    size_t size = 2 * nr_indices;
    /* instancing will already have been resolved at this point by 'draw' */
    const unsigned start_instance = 0;
    const unsigned instance_count = 1;
 
-   assert(( svga_render->vbuf_offset - svga_render->vdecl_offset) % svga_render->vertex_size == 0);
-   
+   assert((svga_render->vbuf_offset - svga_render->vdecl_offset)
+          % svga_render->vertex_size == 0);
+
    SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_VBUFDRAWELEMENTS);
 
    if (svga_render->ibuf_size < svga_render->ibuf_offset + size)
@@ -341,7 +350,8 @@
    }
 
    pipe_buffer_write_nooverlap(&svga->pipe, svga_render->ibuf,
-			       svga_render->ibuf_offset, 2 * nr_indices, indices);
+                               svga_render->ibuf_offset, 2 * nr_indices,
+                               indices);
 
    /* off to hardware */
    svga_vbuf_submit_state(svga_render);
@@ -350,7 +360,7 @@
     * altered some of our state behind our backs.  Testcase:
     * redbook/polys.c
     */
-   svga_update_state_retry( svga, SVGA_STATE_HW_DRAW );
+   svga_update_state_retry(svga, SVGA_STATE_HW_DRAW);
 
    ret = svga_hwtnl_draw_range_elements(svga->hwtnl,
                                         svga_render->ibuf,
@@ -359,9 +369,10 @@
                                         svga_render->min_index,
                                         svga_render->max_index,
                                         svga_render->prim,
-                                        svga_render->ibuf_offset / 2, nr_indices,
+                                        svga_render->ibuf_offset / 2,
+                                        nr_indices,
                                         start_instance, instance_count);
-   if(ret != PIPE_OK) {
+   if (ret != PIPE_OK) {
       svga_context_flush(svga, NULL);
       ret = svga_hwtnl_draw_range_elements(svga->hwtnl,
                                            svga_render->ibuf,
@@ -383,14 +394,14 @@
 
 
 static void
-svga_vbuf_render_release_vertices( struct vbuf_render *render )
+svga_vbuf_render_release_vertices(struct vbuf_render *render)
 {
 
 }
 
 
 static void
-svga_vbuf_render_destroy( struct vbuf_render *render )
+svga_vbuf_render_destroy(struct vbuf_render *render)
 {
    struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
 
@@ -404,7 +415,7 @@
  * Create a new primitive render.
  */
 struct vbuf_render *
-svga_vbuf_render_create( struct svga_context *svga )
+svga_vbuf_render_create(struct svga_context *svga)
 {
    struct svga_vbuf_render *svga_render = CALLOC_STRUCT(svga_vbuf_render);
 
diff --git a/src/gallium/drivers/svga/svga_swtnl_draw.c b/src/gallium/drivers/svga/svga_swtnl_draw.c
index 24b4f5c..3db7a4b 100644
--- a/src/gallium/drivers/svga/svga_swtnl_draw.c
+++ b/src/gallium/drivers/svga/svga_swtnl_draw.c
@@ -38,7 +38,9 @@
 
 enum pipe_error
 svga_swtnl_draw_vbo(struct svga_context *svga,
-                    const struct pipe_draw_info *info)
+                    const struct pipe_draw_info *info,
+                    struct pipe_resource *indexbuf,
+                    unsigned index_offset)
 {
    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = { 0 };
    struct pipe_transfer *ib_transfer = NULL;
@@ -70,9 +72,9 @@
     * Map vertex buffers
     */
    for (i = 0; i < svga->curr.num_vertex_buffers; i++) {
-      if (svga->curr.vb[i].buffer) {
+      if (svga->curr.vb[i].buffer.resource) {
          map = pipe_buffer_map(&svga->pipe,
-                               svga->curr.vb[i].buffer,
+                               svga->curr.vb[i].buffer.resource,
                                PIPE_TRANSFER_READ,
                                &vb_transfer[i]);
 
@@ -83,13 +85,14 @@
 
    /* Map index buffer, if present */
    map = NULL;
-   if (info->indexed && svga->curr.ib.buffer) {
-      map = pipe_buffer_map(&svga->pipe, svga->curr.ib.buffer,
+   if (info->index_size && indexbuf) {
+      map = pipe_buffer_map(&svga->pipe, indexbuf,
                             PIPE_TRANSFER_READ,
                             &ib_transfer);
+      map = (ubyte *) map + index_offset;
       draw_set_indexes(draw,
-                       (const ubyte *) map + svga->curr.ib.offset,
-                       svga->curr.ib.index_size, ~0);
+                       (const ubyte *) map,
+                       info->index_size, ~0);
    }
 
    /* Map constant buffers */
@@ -120,7 +123,7 @@
     * unmap vertex/index buffers
     */
    for (i = 0; i < svga->curr.num_vertex_buffers; i++) {
-      if (svga->curr.vb[i].buffer) {
+      if (svga->curr.vb[i].buffer.resource) {
          pipe_buffer_unmap(&svga->pipe, vb_transfer[i]);
          draw_set_mapped_vertex_buffer(draw, i, NULL, 0);
       }
@@ -146,9 +149,8 @@
 }
 
 
-
-
-boolean svga_init_swtnl( struct svga_context *svga )
+boolean
+svga_init_swtnl(struct svga_context *svga)
 {
    struct svga_screen *screen = svga_screen(svga->pipe.screen);
 
@@ -164,8 +166,8 @@
       goto fail;
 
 
-   draw_set_rasterize_stage(svga->swtnl.draw, 
-                            draw_vbuf_stage( svga->swtnl.draw, svga->swtnl.backend ));
+   draw_set_rasterize_stage(svga->swtnl.draw,
+                 draw_vbuf_stage(svga->swtnl.draw, svga->swtnl.backend));
 
    draw_set_render(svga->swtnl.draw, svga->swtnl.backend);
 
@@ -201,16 +203,17 @@
       util_blitter_destroy(svga->blitter);
 
    if (svga->swtnl.backend)
-      svga->swtnl.backend->destroy( svga->swtnl.backend );
+      svga->swtnl.backend->destroy(svga->swtnl.backend);
 
    if (svga->swtnl.draw)
-      draw_destroy( svga->swtnl.draw );
+      draw_destroy(svga->swtnl.draw);
 
    return FALSE;
 }
 
 
-void svga_destroy_swtnl( struct svga_context *svga )
+void
+svga_destroy_swtnl(struct svga_context *svga)
 {
-   draw_destroy( svga->swtnl.draw );
+   draw_destroy(svga->swtnl.draw);
 }
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
index 71faf3a..06a9be8 100644
--- a/src/gallium/drivers/svga/svga_swtnl_state.c
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -48,7 +48,8 @@
 #define SVGA_TRIANGLE_ADJ_Y -0.5f
 
 
-static void set_draw_viewport( struct svga_context *svga )
+static void
+set_draw_viewport(struct svga_context *svga)
 {
    struct pipe_viewport_state vp = svga->curr.viewport;
    float adjx = 0.0f;
@@ -97,39 +98,38 @@
 }
 
 static enum pipe_error
-update_swtnl_draw( struct svga_context *svga,
-                   unsigned dirty )
+update_swtnl_draw(struct svga_context *svga, unsigned dirty)
 {
    SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_SWTNLUPDATEDRAW);
 
-   draw_flush( svga->swtnl.draw );
+   draw_flush(svga->swtnl.draw);
 
-   if (dirty & SVGA_NEW_VS) 
+   if (dirty & SVGA_NEW_VS)
       draw_bind_vertex_shader(svga->swtnl.draw,
                               svga->curr.vs->draw_shader);
 
-   if (dirty & SVGA_NEW_FS) 
+   if (dirty & SVGA_NEW_FS)
       draw_bind_fragment_shader(svga->swtnl.draw,
                                 svga->curr.fs->draw_shader);
 
    if (dirty & SVGA_NEW_VBUFFER)
       draw_set_vertex_buffers(svga->swtnl.draw, 0,
-                              svga->curr.num_vertex_buffers, 
+                              svga->curr.num_vertex_buffers,
                               svga->curr.vb);
 
    if (dirty & SVGA_NEW_VELEMENT)
-      draw_set_vertex_elements(svga->swtnl.draw, 
-                               svga->curr.velems->count, 
-                               svga->curr.velems->velem );
+      draw_set_vertex_elements(svga->swtnl.draw,
+                               svga->curr.velems->count,
+                               svga->curr.velems->velem);
 
    if (dirty & SVGA_NEW_CLIP)
-      draw_set_clip_state(svga->swtnl.draw, 
+      draw_set_clip_state(svga->swtnl.draw,
                           &svga->curr.clip);
 
    if (dirty & (SVGA_NEW_VIEWPORT |
-                SVGA_NEW_REDUCED_PRIMITIVE | 
+                SVGA_NEW_REDUCED_PRIMITIVE |
                 SVGA_NEW_RAST))
-      set_draw_viewport( svga );
+      set_draw_viewport(svga);
 
    if (dirty & SVGA_NEW_RAST)
       draw_set_rasterizer_state(svga->swtnl.draw,
@@ -142,7 +142,7 @@
     * format for no bound depth (PIPE_FORMAT_NONE).
     */
    if (dirty & SVGA_NEW_FRAME_BUFFER)
-      draw_set_zs_format(svga->swtnl.draw, 
+      draw_set_zs_format(svga->swtnl.draw,
          (svga->curr.framebuffer.zsbuf) ?
              svga->curr.framebuffer.zsbuf->format : PIPE_FORMAT_NONE);
 
@@ -211,7 +211,8 @@
    ret = SVGA3D_vgpu10_DefineElementLayout(svga->swc, num_decls, id, elements);
    if (ret != PIPE_OK) {
       svga_context_flush(svga, NULL);
-      ret = SVGA3D_vgpu10_DefineElementLayout(svga->swc, num_decls, id, elements);
+      ret = SVGA3D_vgpu10_DefineElementLayout(svga->swc, num_decls,
+                                              id, elements);
       assert(ret == PIPE_OK);
    }
 
@@ -220,7 +221,7 @@
 
 
 enum pipe_error
-svga_swtnl_update_vdecl( struct svga_context *svga )
+svga_swtnl_update_vdecl(struct svga_context *svga)
 {
    struct svga_vbuf_render *svga_render = svga_vbuf_render(svga->swtnl.backend);
    struct draw_context *draw = svga->swtnl.draw;
@@ -253,7 +254,7 @@
    nr_decls++;
 
    for (i = 0; i < fs->base.info.num_inputs; i++) {
-      const unsigned sem_name = fs->base.info.input_semantic_name[i];
+      const enum tgsi_semantic sem_name = fs->base.info.input_semantic_name[i];
       const unsigned sem_index = fs->base.info.input_semantic_index[i];
 
       src = draw_find_shader_output(draw, sem_name, sem_index);
@@ -364,10 +365,9 @@
 
 
 static enum pipe_error
-update_swtnl_vdecl( struct svga_context *svga,
-                    unsigned dirty )
+update_swtnl_vdecl(struct svga_context *svga, unsigned dirty)
 {
-   return svga_swtnl_update_vdecl( svga );
+   return svga_swtnl_update_vdecl(svga);
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index 3131444..d9b76c2 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -706,7 +706,7 @@
  */
 static unsigned
 get_temp_array_id(const struct svga_shader_emitter_v10 *emit,
-                  unsigned file, unsigned index)
+                  enum tgsi_file_type file, unsigned index)
 {
    if (file == TGSI_FILE_TEMPORARY) {
       return emit->temp_map[index].arrayId;
@@ -723,7 +723,7 @@
  */
 static unsigned
 remap_temp_index(const struct svga_shader_emitter_v10 *emit,
-                 unsigned file, unsigned index)
+                 enum tgsi_file_type file, unsigned index)
 {
    if (file == TGSI_FILE_TEMPORARY) {
       return emit->temp_map[index].index;
@@ -741,7 +741,7 @@
 static VGPU10OperandToken0
 setup_operand0_indexing(struct svga_shader_emitter_v10 *emit,
                         VGPU10OperandToken0 operand0,
-                        unsigned file,
+                        enum tgsi_file_type file,
                         boolean indirect, boolean index2D,
                         unsigned tempArrayID)
 {
@@ -849,9 +849,9 @@
 emit_dst_register(struct svga_shader_emitter_v10 *emit,
                   const struct tgsi_full_dst_register *reg)
 {
-   unsigned file = reg->Register.File;
+   enum tgsi_file_type file = reg->Register.File;
    unsigned index = reg->Register.Index;
-   const unsigned sem_name = emit->info.output_semantic_name[index];
+   const enum tgsi_semantic sem_name = emit->info.output_semantic_name[index];
    const unsigned sem_index = emit->info.output_semantic_index[index];
    unsigned writemask = reg->Register.WriteMask;
    const unsigned indirect = reg->Register.Indirect;
@@ -967,7 +967,7 @@
 emit_src_register(struct svga_shader_emitter_v10 *emit,
                   const struct tgsi_full_src_register *reg)
 {
-   unsigned file = reg->Register.File;
+   enum tgsi_file_type file = reg->Register.File;
    unsigned index = reg->Register.Index;
    const unsigned indirect = reg->Register.Indirect;
    const unsigned tempArrayId = get_temp_array_id(emit, file, index);
@@ -1030,6 +1030,9 @@
    operand0.value = operand1.value = 0;
 
    if (is_prim_id) {
+      /* NOTE: we should be using VGPU10_OPERAND_1_COMPONENT here, but
+       * our virtual GPU accepts this as-is.
+       */
       operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
       operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID;
    }
@@ -1361,7 +1364,7 @@
  * Create a tgsi_full_src_register.
  */
 static struct tgsi_full_src_register
-make_src_reg(unsigned file, unsigned index)
+make_src_reg(enum tgsi_file_type file, unsigned index)
 {
    struct tgsi_full_src_register reg;
 
@@ -1410,7 +1413,7 @@
  * Create a tgsi_full_dst_register.
  */
 static struct tgsi_full_dst_register
-make_dst_reg(unsigned file, unsigned index)
+make_dst_reg(enum tgsi_file_type file, unsigned index)
 {
    struct tgsi_full_dst_register reg;
 
@@ -1467,7 +1470,7 @@
 
 /** Return the named swizzle term from the src register */
 static inline unsigned
-get_swizzle(const struct tgsi_full_src_register *reg, unsigned term)
+get_swizzle(const struct tgsi_full_src_register *reg, enum tgsi_swizzle term)
 {
    switch (term) {
    case TGSI_SWIZZLE_X:
@@ -1490,8 +1493,8 @@
  */
 static struct tgsi_full_src_register
 swizzle_src(const struct tgsi_full_src_register *reg,
-            unsigned swizzleX, unsigned swizzleY,
-            unsigned swizzleZ, unsigned swizzleW)
+            enum tgsi_swizzle swizzleX, enum tgsi_swizzle swizzleY,
+            enum tgsi_swizzle swizzleZ, enum tgsi_swizzle swizzleW)
 {
    struct tgsi_full_src_register swizzled = *reg;
    /* Note: we swizzle the current swizzle */
@@ -1508,7 +1511,7 @@
  * terms are the same.
  */
 static struct tgsi_full_src_register
-scalar_src(const struct tgsi_full_src_register *reg, unsigned swizzle)
+scalar_src(const struct tgsi_full_src_register *reg, enum tgsi_swizzle swizzle)
 {
    struct tgsi_full_src_register swizzled = *reg;
    /* Note: we swizzle the current swizzle */
@@ -1840,7 +1843,8 @@
  */
 static unsigned
 translate_interpolation(const struct svga_shader_emitter_v10 *emit,
-                        unsigned interp, unsigned interpolate_loc)
+                        enum tgsi_interpolate_mode interp,
+                        enum tgsi_interpolate_loc interpolate_loc)
 {
    if (interp == TGSI_INTERPOLATE_COLOR) {
       interp = emit->key.fs.flatshade ?
@@ -2175,7 +2179,7 @@
  */
 static void
 emit_system_value_declaration(struct svga_shader_emitter_v10 *emit,
-                              unsigned semantic_name, unsigned index)
+                              enum tgsi_semantic semantic_name, unsigned index)
 {
    switch (semantic_name) {
    case TGSI_SEMANTIC_INSTANCEID:
@@ -2342,7 +2346,7 @@
    if (emit->unit == PIPE_SHADER_FRAGMENT) {
 
       for (i = 0; i < emit->linkage.num_inputs; i++) {
-         unsigned semantic_name = emit->info.input_semantic_name[i];
+         enum tgsi_semantic semantic_name = emit->info.input_semantic_name[i];
          unsigned usage_mask = emit->info.input_usage_mask[i];
          unsigned index = emit->linkage.input_map[i];
          unsigned type, interpolationMode, name;
@@ -2401,7 +2405,7 @@
    else if (emit->unit == PIPE_SHADER_GEOMETRY) {
 
       for (i = 0; i < emit->info.num_inputs; i++) {
-         unsigned semantic_name = emit->info.input_semantic_name[i];
+         enum tgsi_semantic semantic_name = emit->info.input_semantic_name[i];
          unsigned usage_mask = emit->info.input_usage_mask[i];
          unsigned index = emit->linkage.input_map[i];
          unsigned opcodeType, operandType;
@@ -2484,7 +2488,8 @@
 
    for (i = 0; i < emit->info.num_outputs; i++) {
       /*const unsigned usage_mask = emit->info.output_usage_mask[i];*/
-      const unsigned semantic_name = emit->info.output_semantic_name[i];
+      const enum tgsi_semantic semantic_name =
+         emit->info.output_semantic_name[i];
       const unsigned semantic_index = emit->info.output_semantic_index[i];
       unsigned index = i;
 
@@ -2848,7 +2853,11 @@
     */
    total_consts = emit->num_shader_consts[0];
 
-   /* Now, allocate constant slots for the "extra" constants */
+   /* Now, allocate constant slots for the "extra" constants.
+    * Note: it's critical that these extra constant locations
+    * exactly match what's emitted by the "extra" constants code
+    * in svga_state_constants.c
+    */
 
    /* Vertex position scale/translation */
    if (emit->vposition.need_prescale) {
@@ -2872,17 +2881,14 @@
       }
    }
 
-   /* Texcoord scale factors for RECT textures */
-   {
-      for (i = 0; i < emit->num_samplers; i++) {
-         if (emit->key.tex[i].unnormalized) {
-            emit->texcoord_scale_index[i] = total_consts++;
-         }
-      }
-   }
-
-   /* Texture buffer sizes */
    for (i = 0; i < emit->num_samplers; i++) {
+
+      /* Texcoord scale factors for RECT textures */
+      if (emit->key.tex[i].unnormalized) {
+         emit->texcoord_scale_index[i] = total_consts++;
+      }
+
+      /* Texture buffer sizes */
       if (emit->sampler_target[i] == TGSI_TEXTURE_BUFFER) {
          emit->texture_buffer_size_index[i] = total_consts++;
       }
@@ -2950,7 +2956,8 @@
  * Translate TGSI_TEXTURE_x to VGAPU10_RESOURCE_DIMENSION_x.
  */
 static unsigned
-tgsi_texture_to_resource_dimension(unsigned target, boolean is_array)
+tgsi_texture_to_resource_dimension(enum tgsi_texture_type target,
+                                   boolean is_array)
 {
    switch (target) {
    case TGSI_TEXTURE_BUFFER:
@@ -4844,9 +4851,24 @@
       struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
       struct tgsi_full_src_register scale_src = make_src_const_reg(scale_index);
 
-      /* MUL tmp, coord, const[] */
-      emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp_dst,
-                           coord, &scale_src, FALSE);
+      if (emit->key.tex[unit].texel_bias) {
+         /* To fix a texture coordinate rounding issue, a 0.0001 offset
+          * is added. This fixes piglit test fbo-blit-scaled-linear. */
+         struct tgsi_full_src_register offset =
+            make_immediate_reg_float(emit, 0.0001f);
+
+         /* ADD tmp, coord, offset */
+         emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &tmp_dst,
+                              coord, &offset, FALSE);
+         /* MUL tmp, tmp, scale */
+         emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp_dst,
+                              &tmp_src, &scale_src, FALSE);
+      }
+      else {
+         /* MUL tmp, coord, const[] */
+         emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp_dst,
+                              coord, &scale_src, FALSE);
+      }
       return tmp_src;
    }
    else {
@@ -4862,7 +4884,7 @@
  */
 static void
 emit_tex_compare_refcoord(struct svga_shader_emitter_v10 *emit,
-                          unsigned target,
+                          enum tgsi_texture_type target,
                           const struct tgsi_full_src_register *coord)
 {
    struct tgsi_full_src_register coord_src_ref;
@@ -4896,7 +4918,7 @@
    boolean swizzled;
    boolean shadow_compare;
    unsigned unit;
-   unsigned texture_target;  /**< TGSI_TEXTURE_x */
+   enum tgsi_texture_type texture_target;  /**< TGSI_TEXTURE_x */
    struct tgsi_full_src_register tmp_src;
    struct tgsi_full_dst_register tmp_dst;
    const struct tgsi_full_dst_register *inst_dst;
@@ -5041,6 +5063,7 @@
                      ((swz_g == PIPE_SWIZZLE_0) << 1) |
                      ((swz_b == PIPE_SWIZZLE_0) << 2) |
                      ((swz_a == PIPE_SWIZZLE_0) << 3));
+      writemask_0 &= swz->inst_dst->Register.WriteMask;
 
       if (writemask_0) {
          struct tgsi_full_src_register zero = int_tex ?
@@ -5059,6 +5082,7 @@
                      ((swz_g == PIPE_SWIZZLE_1) << 1) |
                      ((swz_b == PIPE_SWIZZLE_1) << 2) |
                      ((swz_a == PIPE_SWIZZLE_1) << 3));
+      writemask_1 &= swz->inst_dst->Register.WriteMask;
 
       if (writemask_1) {
          struct tgsi_full_src_register one = int_tex ?
@@ -5096,6 +5120,9 @@
    /* SAMPLE dst, coord(s0), resource, sampler */
    begin_emit_instruction(emit);
 
+   /* NOTE: for non-fragment shaders, we should use VGPU10_OPCODE_SAMPLE_L
+    * with LOD=0.  But our virtual GPU accepts this as-is.
+    */
    emit_sample_opcode(emit, VGPU10_OPCODE_SAMPLE,
                       inst->Instruction.Saturate, offsets);
    emit_dst_register(emit, get_tex_swizzle_dst(&swz_info));
@@ -5236,6 +5263,9 @@
    begin_emit_instruction(emit);
 
    if (tgsi_is_shadow_target(target))
+      /* NOTE: for non-fragment shaders, we should use
+       * VGPU10_OPCODE_SAMPLE_C_LZ, but our virtual GPU accepts this as-is.
+       */
       opcode = VGPU10_OPCODE_SAMPLE_C;
    else
       opcode = VGPU10_OPCODE_SAMPLE;
@@ -6245,15 +6275,17 @@
    emit->common_immediate_pos[n++] =
       alloc_immediate_float4(emit, 0.0f, 1.0f, 0.5f, -1.0f);
 
-   emit->common_immediate_pos[n++] =
-      alloc_immediate_float4(emit, 128.0f, -128.0f, 2.0f, 3.0f);
+   if (emit->info.opcode_count[TGSI_OPCODE_LIT] > 0) {
+      emit->common_immediate_pos[n++] =
+         alloc_immediate_float4(emit, 128.0f, -128.0f, 0.0f, 0.0f);
+   }
 
    emit->common_immediate_pos[n++] =
       alloc_immediate_int4(emit, 0, 1, 0, -1);
 
    if (emit->key.vs.attrib_puint_to_snorm) {
       emit->common_immediate_pos[n++] =
-         alloc_immediate_float4(emit, -2.0f, -2.0f, -2.0f, -1.66666f);
+         alloc_immediate_float4(emit, -2.0f, 2.0f, 3.0f, -1.66666f);
    }
 
    if (emit->key.vs.attrib_puint_to_uscaled) {
@@ -6269,6 +6301,17 @@
          alloc_immediate_int4(emit, 22, 30, 0, 0);
    }
 
+   unsigned i;
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      if (emit->key.tex[i].texel_bias) {
+         /* Replace a 0.0f if more immediate float values are needed */
+         emit->common_immediate_pos[n++] =
+            alloc_immediate_float4(emit, 0.0001f, 0.0f, 0.0f, 0.0f);
+         break;
+      }
+   }
+
    assert(n <= ARRAY_SIZE(emit->common_immediate_pos));
    emit->num_common_immediates = n;
 }
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index 77f9c14..e74d1ca 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -26,16 +26,15 @@
 /**
  * @file
  * VMware SVGA specific winsys interface.
- * 
+ *
  * @author Jose Fonseca <jfonseca@vmware.com>
- * 
+ *
  * Documentation taken from the VMware SVGA DDK.
  */
 
 #ifndef SVGA_WINSYS_H_
 #define SVGA_WINSYS_H_
 
-
 #include "svga_types.h"
 #include "svga_reg.h"
 #include "svga3d_reg.h"
@@ -87,7 +86,8 @@
 #define SVGA_QUERY_FLAG_SET        (1 << 0)
 #define SVGA_QUERY_FLAG_REF        (1 << 1)
 
-#define SVGA_HINT_FLAG_CAN_PRE_FLUSH (1 << 0)  /* Can preemptively flush */
+#define SVGA_HINT_FLAG_CAN_PRE_FLUSH   (1 << 0)  /* Can preemptively flush */
+#define SVGA_HINT_FLAG_EXPORT_FENCE_FD (1 << 1)  /* Export a Fence FD */
 
 /**
  * SVGA mks statistics info
@@ -101,6 +101,7 @@
 
 enum svga_stats_count {
    SVGA_STATS_COUNT_BLENDSTATE,
+   SVGA_STATS_COUNT_BLITBLITTERCOPY,
    SVGA_STATS_COUNT_DEPTHSTENCILSTATE,
    SVGA_STATS_COUNT_RASTERIZERSTATE,
    SVGA_STATS_COUNT_SAMPLER,
@@ -112,11 +113,16 @@
 };
 
 enum svga_stats_time {
+   SVGA_STATS_TIME_BLIT,
+   SVGA_STATS_TIME_BLITBLITTER,
+   SVGA_STATS_TIME_BLITFALLBACK,
    SVGA_STATS_TIME_BUFFERSFLUSH,
    SVGA_STATS_TIME_BUFFERTRANSFERMAP,
    SVGA_STATS_TIME_BUFFERTRANSFERUNMAP,
    SVGA_STATS_TIME_CONTEXTFINISH,
    SVGA_STATS_TIME_CONTEXTFLUSH,
+   SVGA_STATS_TIME_COPYREGION,
+   SVGA_STATS_TIME_COPYREGIONFALLBACK,
    SVGA_STATS_TIME_CREATEBACKEDSURFACEVIEW,
    SVGA_STATS_TIME_CREATEBUFFER,
    SVGA_STATS_TIME_CREATECONTEXT,
@@ -134,6 +140,7 @@
    SVGA_STATS_TIME_EMITFS,
    SVGA_STATS_TIME_EMITGS,
    SVGA_STATS_TIME_EMITVS,
+   SVGA_STATS_TIME_EMULATESURFACEVIEW,
    SVGA_STATS_TIME_FENCEFINISH,
    SVGA_STATS_TIME_GENERATEINDICES,
    SVGA_STATS_TIME_HWTNLDRAWARRAYS,
@@ -165,6 +172,7 @@
 
 #define SVGA_STATS_COUNT_NAMES                \
    SVGA_STATS_PREFIX "BlendState",            \
+   SVGA_STATS_PREFIX "BlitBlitterCopy",       \
    SVGA_STATS_PREFIX "DepthStencilState",     \
    SVGA_STATS_PREFIX "RasterizerState",       \
    SVGA_STATS_PREFIX "Sampler",               \
@@ -174,11 +182,16 @@
    SVGA_STATS_PREFIX "VertexElement"          \
 
 #define SVGA_STATS_TIME_NAMES                       \
+   SVGA_STATS_PREFIX "Blit",                        \
+   SVGA_STATS_PREFIX "BlitBlitter",                 \
+   SVGA_STATS_PREFIX "BlitFallback",                \
    SVGA_STATS_PREFIX "BuffersFlush",                \
    SVGA_STATS_PREFIX "BufferTransferMap",           \
    SVGA_STATS_PREFIX "BufferTransferUnmap",         \
    SVGA_STATS_PREFIX "ContextFinish",               \
    SVGA_STATS_PREFIX "ContextFlush",                \
+   SVGA_STATS_PREFIX "CopyRegion",                  \
+   SVGA_STATS_PREFIX "CopyRegionFallback",          \
    SVGA_STATS_PREFIX "CreateBackedSurfaceView",     \
    SVGA_STATS_PREFIX "CreateBuffer",                \
    SVGA_STATS_PREFIX "CreateContext",               \
@@ -196,6 +209,7 @@
    SVGA_STATS_PREFIX "EmitFS",                      \
    SVGA_STATS_PREFIX "EmitGS",                      \
    SVGA_STATS_PREFIX "EmitVS",                      \
+   SVGA_STATS_PREFIX "EmulateSurfaceView",          \
    SVGA_STATS_PREFIX "FenceFinish",                 \
    SVGA_STATS_PREFIX "GenerateIndices",             \
    SVGA_STATS_PREFIX "HWtnlDrawArrays",             \
@@ -220,7 +234,7 @@
    SVGA_STATS_PREFIX "VbufRenderMapVertices",       \
    SVGA_STATS_PREFIX "VbufRenderUnmapVertices",     \
    SVGA_STATS_PREFIX "VbufSubmitState"
-   
+
 
 /** Opaque surface handle */
 struct svga_winsys_surface;
@@ -238,10 +252,10 @@
    void
    (*destroy)(struct svga_winsys_context *swc);
 
-   void *       
-   (*reserve)(struct svga_winsys_context *swc, 
-	      uint32_t nr_bytes, uint32_t nr_relocs );
-   
+   void *
+   (*reserve)(struct svga_winsys_context *swc,
+              uint32_t nr_bytes, uint32_t nr_relocs );
+
    /**
     * Returns current size of command buffer, in bytes.
     */
@@ -250,51 +264,51 @@
 
    /**
     * Emit a relocation for a host surface.
-    * 
+    *
     * @param flags bitmask of SVGA_RELOC_* flags
-    * 
+    *
     * NOTE: Order of this call does matter. It should be the same order
     * as relocations appear in the command buffer.
     */
    void
-   (*surface_relocation)(struct svga_winsys_context *swc, 
-	                 uint32 *sid,
+   (*surface_relocation)(struct svga_winsys_context *swc,
+                         uint32 *sid,
                          uint32 *mobid,
-	                 struct svga_winsys_surface *surface,
-	                 unsigned flags);
-   
+                         struct svga_winsys_surface *surface,
+                         unsigned flags);
+
    /**
     * Emit a relocation for a guest memory region.
-    * 
+    *
     * @param flags bitmask of SVGA_RELOC_* flags
-    * 
+    *
     * NOTE: Order of this call does matter. It should be the same order
     * as relocations appear in the command buffer.
     */
    void
-   (*region_relocation)(struct svga_winsys_context *swc, 
-	                struct SVGAGuestPtr *ptr, 
-	                struct svga_winsys_buffer *buffer,
-	                uint32 offset,
+   (*region_relocation)(struct svga_winsys_context *swc,
+                        struct SVGAGuestPtr *ptr,
+                        struct svga_winsys_buffer *buffer,
+                        uint32 offset,
                         unsigned flags);
 
    /**
     * Emit a relocation for a guest-backed shader object.
-    * 
+    *
     * NOTE: Order of this call does matter. It should be the same order
     * as relocations appear in the command buffer.
     */
    void
-   (*shader_relocation)(struct svga_winsys_context *swc, 
-	                uint32 *shid,
-			uint32 *mobid,
-			uint32 *offset,
-	                struct svga_winsys_gb_shader *shader,
+   (*shader_relocation)(struct svga_winsys_context *swc,
+                        uint32 *shid,
+                        uint32 *mobid,
+                        uint32 *offset,
+                        struct svga_winsys_gb_shader *shader,
                         unsigned flags);
 
    /**
     * Emit a relocation for a guest-backed context.
-    * 
+    *
     * NOTE: Order of this call does matter. It should be the same order
     * as relocations appear in the command buffer.
     */
@@ -314,11 +328,11 @@
     */
    void
    (*mob_relocation)(struct svga_winsys_context *swc,
-		     SVGAMobId *id,
-		     uint32 *offset_into_mob,
-		     struct svga_winsys_buffer *buffer,
-		     uint32 offset,
-		     unsigned flags);
+                     SVGAMobId *id,
+                     uint32 *offset_into_mob,
+                     struct svga_winsys_buffer *buffer,
+                     uint32 offset,
+                     unsigned flags);
 
    /**
     * Emit a relocation for a guest-backed query object.
@@ -328,8 +342,8 @@
     */
    void
    (*query_relocation)(struct svga_winsys_context *swc,
-	               SVGAMobId *id,
-	               struct svga_winsys_gb_query *query);
+                       SVGAMobId *id,
+                       struct svga_winsys_gb_query *query);
 
    /**
     * Bind queries to context.
@@ -342,14 +356,14 @@
 
    void
    (*commit)(struct svga_winsys_context *swc);
-   
-   enum pipe_error
-   (*flush)(struct svga_winsys_context *swc, 
-	    struct pipe_fence_handle **pfence);
 
-   /** 
+   enum pipe_error
+   (*flush)(struct svga_winsys_context *swc,
+            struct pipe_fence_handle **pfence);
+
+   /**
     * Context ID used to fill in the commands
-    * 
+    *
     * Context IDs are arbitrary small non-negative integers,
     * global to the entire SVGA device.
     */
@@ -361,6 +375,11 @@
    uint32 hints;
 
    /**
+    * File descriptor for imported fence
+    */
+   int32 imported_fence_fd;
+
+   /**
     ** BEGIN new functions for guest-backed surfaces.
     **/
 
@@ -444,7 +463,7 @@
 {
    void
    (*destroy)(struct svga_winsys_screen *sws);
-   
+
    SVGA3dHardwareVersion
    (*get_hw_version)(struct svga_winsys_screen *sws);
 
@@ -452,7 +471,7 @@
    (*get_cap)(struct svga_winsys_screen *sws,
               SVGA3dDevCapIndex index,
               SVGA3dDevCapResult *result);
-   
+
    /**
     * Create a new context.
     *
@@ -465,8 +484,7 @@
     */
    struct svga_winsys_context *
    (*context_create)(struct svga_winsys_screen *sws);
-   
-   
+
    /**
     * This creates a "surface" object in the SVGA3D device.
     *
@@ -545,10 +563,10 @@
     * Reference a SVGA3D surface object. This allows sharing of a
     * surface between different objects.
     */
-   void 
+   void
    (*surface_reference)(struct svga_winsys_screen *sws,
-			struct svga_winsys_surface **pdst,
-			struct svga_winsys_surface *src);
+                        struct svga_winsys_surface **pdst,
+                        struct svga_winsys_surface *src);
 
    /**
     * Check if a resource (texture, buffer) of the given size
@@ -571,27 +589,27 @@
     * SSE instructions.
     */
    struct svga_winsys_buffer *
-   (*buffer_create)( struct svga_winsys_screen *sws, 
-	             unsigned alignment, 
-	             unsigned usage,
-	             unsigned size );
+   (*buffer_create)( struct svga_winsys_screen *sws,
+                     unsigned alignment,
+                     unsigned usage,
+                     unsigned size );
 
-   /** 
+   /**
     * Map the entire data store of a buffer object into the client's address.
     * usage is a bitmask of PIPE_TRANSFER_*
     */
    void *
-   (*buffer_map)( struct svga_winsys_screen *sws, 
-	          struct svga_winsys_buffer *buf,
-		  unsigned usage );
-   
-   void 
-   (*buffer_unmap)( struct svga_winsys_screen *sws, 
+   (*buffer_map)( struct svga_winsys_screen *sws,
+                  struct svga_winsys_buffer *buf,
+                  unsigned usage );
+
+   void
+   (*buffer_unmap)( struct svga_winsys_screen *sws,
                     struct svga_winsys_buffer *buf );
 
-   void 
+   void
    (*buffer_destroy)( struct svga_winsys_screen *sws,
-	              struct svga_winsys_buffer *buf );
+                      struct svga_winsys_buffer *buf );
 
 
    /**
@@ -613,13 +631,41 @@
 
    /**
     * Wait for the fence to finish.
+    * \param timeout in nanoseconds (may be PIPE_TIMEOUT_INFINITE).
+    *                0 to return immediately, if the API supports it.
     * \param flags  driver-specific meaning
     * \return zero on success.
     */
    int (*fence_finish)( struct svga_winsys_screen *sws,
                         struct pipe_fence_handle *fence,
+                        uint64_t timeout,
                         unsigned flag );
 
+   /**
+    * Get the file descriptor associated with the fence
+    * \param duplicate duplicate the fd before returning it
+    * \return zero on success.
+    */
+   int (*fence_get_fd)( struct svga_winsys_screen *sws,
+                        struct pipe_fence_handle *fence,
+                        boolean duplicate );
+
+   /**
+    * Create a fence using the given file descriptor
+    * Returns nothing; the new fence is presumably stored in *fence (confirm).
+    */
+   void (*fence_create_fd)( struct svga_winsys_screen *sws,
+                            struct pipe_fence_handle **fence,
+                            int32_t fd );
+
+   /**
+    * Accumulates fence FD from other devices into the current context
+    * \param context_fd FD the context will be waiting on
+    * \return zero on success
+    */
+   int (*fence_server_sync)( struct svga_winsys_screen *sws,
+                             int32_t *context_fd,
+                             struct pipe_fence_handle *fence );
 
    /**
     ** BEGIN new functions for guest-backed surfaces.
@@ -636,9 +682,9 @@
     */
    struct svga_winsys_gb_shader *
    (*shader_create)(struct svga_winsys_screen *sws,
-		    SVGA3dShaderType shaderType,
-		    const uint32 *bytecode,
-		    uint32 bytecodeLen);
+                    SVGA3dShaderType shaderType,
+                    const uint32 *bytecode,
+                    uint32 bytecodeLen);
 
    /**
     * Destroy a GB shader. It's safe to call this function even
@@ -646,7 +692,7 @@
     */
    void
    (*shader_destroy)(struct svga_winsys_screen *sws,
-		     struct svga_winsys_gb_shader *shader);
+                     struct svga_winsys_gb_shader *shader);
 
    /**
     * Create and define a GB query.
@@ -659,7 +705,7 @@
     */
    void
    (*query_destroy)(struct svga_winsys_screen *sws,
-		    struct svga_winsys_gb_query *query);
+                    struct svga_winsys_gb_query *query);
 
    /**
     * Initialize the query state of the query that resides in the slot
@@ -686,7 +732,7 @@
    /**
     * Increment a statistic counter
     */
-   void 
+   void
    (*stats_inc)(enum svga_stats_count);
 
    /**
@@ -711,6 +757,7 @@
    boolean have_generate_mipmap_cmd;
    boolean have_set_predication_cmd;
    boolean have_transfer_from_buffer_cmd;
+   boolean have_fence_fd;
 };
 
 
@@ -722,8 +769,8 @@
 
 struct pipe_resource *
 svga_screen_buffer_wrap_surface(struct pipe_screen *screen,
-				enum SVGA3dSurfaceFormat format,
-				struct svga_winsys_surface *srf);
+                                enum SVGA3dSurfaceFormat format,
+                                struct svga_winsys_surface *srf);
 
 struct svga_winsys_surface *
 svga_screen_buffer_get_winsys_surface(struct pipe_resource *buffer);
diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
index 6650abd..05fc3b3 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -26,7 +26,14 @@
 
 noinst_LTLIBRARIES = libmesaswr.la
 
-libmesaswr_la_SOURCES = $(LOADER_SOURCES)
+# gen_knobs.* included here to provide driver access to swr configuration
+libmesaswr_la_SOURCES = \
+	$(CXX_SOURCES) \
+	$(COMMON_CXX_SOURCES) \
+	$(JITTER_CXX_SOURCES) \
+	rasterizer/codegen/gen_knobs.cpp \
+	rasterizer/codegen/gen_knobs.h \
+	$(LOADER_SOURCES)
 
 COMMON_CXXFLAGS = \
 	-fno-strict-aliasing \
@@ -34,6 +41,7 @@
 	$(LLVM_CXXFLAGS) \
 	$(SWR_CXX11_CXXFLAGS) \
 	-I$(builddir)/rasterizer/codegen \
+	-I$(builddir)/rasterizer/core \
 	-I$(builddir)/rasterizer/jitter \
 	-I$(builddir)/rasterizer/archrast \
 	-I$(srcdir)/rasterizer \
@@ -42,12 +50,31 @@
 	-I$(srcdir)/rasterizer/jitter \
 	-I$(srcdir)/rasterizer/archrast
 
+# SWR_AVX_CXXFLAGS needed for intrinsic usage in swr api headers
+libmesaswr_la_CXXFLAGS = \
+	$(SWR_AVX_CXXFLAGS) \
+	$(COMMON_CXXFLAGS)
+
+if HAVE_SWR_AVX
+libmesaswr_la_CXXFLAGS += -DHAVE_SWR_AVX
+endif
+
+if HAVE_SWR_AVX2
+libmesaswr_la_CXXFLAGS += -DHAVE_SWR_AVX2
+endif
+
+if HAVE_SWR_KNL
+libmesaswr_la_CXXFLAGS += -DHAVE_SWR_KNL
+endif
+
+if HAVE_SWR_SKX
+libmesaswr_la_CXXFLAGS += -DHAVE_SWR_SKX
+endif
+
 COMMON_SOURCES = \
-	$(CXX_SOURCES) \
 	$(ARCHRAST_CXX_SOURCES) \
 	$(COMMON_CXX_SOURCES) \
 	$(CORE_CXX_SOURCES) \
-	$(JITTER_CXX_SOURCES) \
 	$(MEMORY_CXX_SOURCES) \
 	$(BUILT_SOURCES)
 
@@ -62,7 +89,16 @@
 	rasterizer/archrast/gen_ar_event.cpp \
 	rasterizer/archrast/gen_ar_eventhandler.hpp \
 	rasterizer/archrast/gen_ar_eventhandlerfile.hpp \
-	rasterizer/core/gen_BackendPixelRate0.cpp
+	rasterizer/core/backends/gen_BackendPixelRate0.cpp \
+	rasterizer/core/backends/gen_BackendPixelRate1.cpp \
+	rasterizer/core/backends/gen_BackendPixelRate2.cpp \
+	rasterizer/core/backends/gen_BackendPixelRate3.cpp \
+	rasterizer/core/backends/gen_BackendPixelRate.hpp \
+	rasterizer/core/backends/gen_rasterizer0.cpp \
+	rasterizer/core/backends/gen_rasterizer1.cpp \
+	rasterizer/core/backends/gen_rasterizer2.cpp \
+	rasterizer/core/backends/gen_rasterizer3.cpp \
+	rasterizer/core/backends/gen_rasterizer.hpp
 
 MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
@@ -140,38 +176,77 @@
 		--output rasterizer/archrast/gen_ar_eventhandlerfile.hpp \
 		--gen_eventhandlerfile_h
 
+rasterizer/core/backends/gen_BackendPixelRate0.cpp \
+rasterizer/core/backends/gen_BackendPixelRate1.cpp \
+rasterizer/core/backends/gen_BackendPixelRate2.cpp \
+rasterizer/core/backends/gen_BackendPixelRate3.cpp \
+rasterizer/core/backends/gen_BackendPixelRate.hpp: \
+backend.intermediate
+
 # 5 SWR_MULTISAMPLE_TYPE_COUNT
 # 2 SWR_MSAA_SAMPLE_PATTERN_COUNT
 # 3 SWR_INPUT_COVERAGE_COUNT
 # 2 centroid
 # 2 forcedSampleCount
 # 2 canEarlyZ
-rasterizer/core/gen_BackendPixelRate0.cpp: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp
+
+# use intermediate rule to tell make that all files can be
+# generated in one invocation of gen_backends.py (prevents
+# parallel make race condition)
+.INTERMEDIATE: backend.intermediate
+backend.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp rasterizer/codegen/templates/gen_header_init.hpp
 	$(MKDIR_GEN)
 	$(PYTHON_GEN) \
 		$(srcdir)/rasterizer/codegen/gen_backends.py \
-		--outdir rasterizer/core \
+		--outdir rasterizer/core/backends \
 		--dim 5 2 3 2 2 2 \
-		--split 0 \
-		--cpp
+		--numfiles 4 \
+		--cpp \
+		--hpp
 
-COMMON_LIBADD = \
-	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
-	$(top_builddir)/src/mesa/libmesagallium.la \
-	$(LLVM_LIBS)
+rasterizer/core/backends/gen_rasterizer0.cpp \
+rasterizer/core/backends/gen_rasterizer1.cpp \
+rasterizer/core/backends/gen_rasterizer2.cpp \
+rasterizer/core/backends/gen_rasterizer3.cpp \
+rasterizer/core/backends/gen_rasterizer.hpp: \
+rasterizer.intermediate
+
+# 5 SWR_MULTISAMPLE_TYPE_COUNT
+# 2 CenterPattern
+# 2 Conservative
+# 3 SWR_INPUT_COVERAGE_COUNT
+# 5 STATE_VALID_TRI_EDGE_COUNT
+# 2 RasterScissorEdges
+
+# use intermediate rule to tell make that all files can be
+# generated in one invocation of gen_backends.py (prevents
+# parallel make race condition)
+.INTERMEDIATE: rasterizer.intermediate
+rasterizer.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_rasterizer.cpp rasterizer/codegen/templates/gen_header_init.hpp
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) \
+		$(srcdir)/rasterizer/codegen/gen_backends.py \
+		--outdir rasterizer/core/backends \
+		--rast \
+		--dim 5 2 2 3 5 2 \
+		--numfiles 4 \
+		--cpp \
+		--hpp
 
 COMMON_LDFLAGS = \
 	-shared \
 	-module \
 	-no-undefined \
 	$(GC_SECTIONS) \
-	$(NO_UNDEFINED) \
-	$(LLVM_LDFLAGS)
+	$(LD_NO_UNDEFINED)
 
+lib_LTLIBRARIES =
 
-lib_LTLIBRARIES = libswrAVX.la libswrAVX2.la
+if HAVE_SWR_AVX
+lib_LTLIBRARIES += libswrAVX.la
 
 libswrAVX_la_CXXFLAGS = \
+	$(PTHREAD_CFLAGS) \
 	$(SWR_AVX_CXXFLAGS) \
 	-DKNOB_ARCH=KNOB_ARCH_AVX \
 	$(COMMON_CXXFLAGS)
@@ -180,12 +255,16 @@
 	$(COMMON_SOURCES)
 
 libswrAVX_la_LIBADD = \
-	$(COMMON_LIBADD)
+	$(PTHREAD_LIBS)
 
 libswrAVX_la_LDFLAGS = \
 	$(COMMON_LDFLAGS)
+endif
 
+if HAVE_SWR_AVX2
+lib_LTLIBRARIES += libswrAVX2.la
 libswrAVX2_la_CXXFLAGS = \
+	$(PTHREAD_CFLAGS) \
 	$(SWR_AVX2_CXXFLAGS) \
 	-DKNOB_ARCH=KNOB_ARCH_AVX2 \
 	$(COMMON_CXXFLAGS)
@@ -194,10 +273,49 @@
 	$(COMMON_SOURCES)
 
 libswrAVX2_la_LIBADD = \
-	$(COMMON_LIBADD)
+	$(PTHREAD_LIBS)
 
 libswrAVX2_la_LDFLAGS = \
 	$(COMMON_LDFLAGS)
+endif
+
+if HAVE_SWR_KNL
+lib_LTLIBRARIES += libswrKNL.la
+
+libswrKNL_la_CXXFLAGS = \
+	$(PTHREAD_CFLAGS) \
+	$(SWR_KNL_CXXFLAGS) \
+	-DKNOB_ARCH=KNOB_ARCH_AVX512 -DAVX512F_STRICT \
+	$(COMMON_CXXFLAGS)
+
+libswrKNL_la_SOURCES = \
+	$(COMMON_SOURCES)
+
+libswrKNL_la_LIBADD = \
+	$(PTHREAD_LIBS)
+
+libswrKNL_la_LDFLAGS = \
+	$(COMMON_LDFLAGS)
+endif
+
+if HAVE_SWR_SKX
+lib_LTLIBRARIES += libswrSKX.la
+
+libswrSKX_la_CXXFLAGS = \
+	$(PTHREAD_CFLAGS) \
+	$(SWR_SKX_CXXFLAGS) \
+	-DKNOB_ARCH=KNOB_ARCH_AVX512 \
+	$(COMMON_CXXFLAGS)
+
+libswrSKX_la_SOURCES = \
+	$(COMMON_SOURCES)
+
+libswrSKX_la_LIBADD = \
+	$(PTHREAD_LIBS)
+
+libswrSKX_la_LDFLAGS = \
+	$(COMMON_LDFLAGS)
+endif
 
 include $(top_srcdir)/install-gallium-links.mk
 
@@ -227,5 +345,7 @@
 	rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp \
 	rasterizer/codegen/templates/gen_backend.cpp \
 	rasterizer/codegen/templates/gen_builder.hpp \
+	rasterizer/codegen/templates/gen_header_init.hpp \
 	rasterizer/codegen/templates/gen_knobs.cpp \
-	rasterizer/codegen/templates/gen_llvm.hpp
+	rasterizer/codegen/templates/gen_llvm.hpp \
+	rasterizer/codegen/templates/gen_rasterizer.cpp
diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
index 1afb532..3c1118b 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -55,7 +55,9 @@
 COMMON_CXX_SOURCES := \
 	rasterizer/common/formats.cpp \
 	rasterizer/common/formats.h \
+	rasterizer/common/intrin.h \
 	rasterizer/common/isa.hpp \
+	rasterizer/common/os.cpp \
 	rasterizer/common/os.h \
 	rasterizer/common/rdtsc_buckets.cpp \
 	rasterizer/common/rdtsc_buckets.h \
@@ -63,6 +65,19 @@
 	rasterizer/common/rdtsc_buckets_shared.h \
 	rasterizer/common/simd16intrin.h \
 	rasterizer/common/simdintrin.h \
+	rasterizer/common/simdlib.hpp \
+	rasterizer/common/simdlib_128_avx.inl \
+	rasterizer/common/simdlib_128_avx2.inl \
+	rasterizer/common/simdlib_128_avx512.inl \
+	rasterizer/common/simdlib_256_avx.inl \
+	rasterizer/common/simdlib_256_avx2.inl \
+	rasterizer/common/simdlib_256_avx512.inl \
+	rasterizer/common/simdlib_512_avx512.inl \
+	rasterizer/common/simdlib_512_avx512_masks.inl \
+	rasterizer/common/simdlib_512_emu.inl \
+	rasterizer/common/simdlib_512_emu_masks.inl \
+	rasterizer/common/simdlib_interface.hpp \
+	rasterizer/common/simdlib_types.hpp \
 	rasterizer/common/swr_assert.cpp \
 	rasterizer/common/swr_assert.h
 
@@ -71,8 +86,13 @@
 	rasterizer/core/api.h \
 	rasterizer/core/arena.h \
 	rasterizer/core/backend.cpp \
+	rasterizer/core/backend_clear.cpp \
+	rasterizer/core/backend_sample.cpp \
+	rasterizer/core/backend_singlesample.cpp \
 	rasterizer/core/backend.h \
+	rasterizer/core/backend_impl.h \
 	rasterizer/core/binner.cpp \
+	rasterizer/core/binner.h \
 	rasterizer/core/blend.h \
 	rasterizer/core/clip.cpp \
 	rasterizer/core/clip.h \
@@ -83,20 +103,22 @@
 	rasterizer/core/format_conversion.h \
 	rasterizer/core/format_traits.h \
 	rasterizer/core/format_types.h \
+	rasterizer/core/format_utils.h \
 	rasterizer/core/frontend.cpp \
 	rasterizer/core/frontend.h \
 	rasterizer/core/knobs.h \
 	rasterizer/core/knobs_init.h \
-	rasterizer/core/multisample.cpp \
 	rasterizer/core/multisample.h \
 	rasterizer/core/pa_avx.cpp \
 	rasterizer/core/pa.h \
 	rasterizer/core/rasterizer.cpp \
 	rasterizer/core/rasterizer.h \
+	rasterizer/core/rasterizer_impl.h \
 	rasterizer/core/rdtsc_core.cpp \
 	rasterizer/core/rdtsc_core.h \
 	rasterizer/core/ringbuffer.h \
 	rasterizer/core/state.h \
+	rasterizer/core/state_funcs.h \
 	rasterizer/core/tessellator.h \
 	rasterizer/core/threads.cpp \
 	rasterizer/core/threads.h \
diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript
index cdb85e2..b40830b 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -8,12 +8,12 @@
     Return()
 
 if not env['llvm']:
-    print 'warning: LLVM disabled: not building swr'
+    print('warning: LLVM disabled: not building swr')
     env['swr'] = False
     Return()
 
 if env['LLVM_VERSION'] < distutils.version.LooseVersion('3.9'):
-    print "warning: swr requires LLVM >= 3.9: not building swr"
+    print("warning: swr requires LLVM >= 3.9: not building swr")
     env['swr'] = False
     Return()
 
@@ -28,13 +28,7 @@
 else:
     llvm_config = os.environ.get('LLVM_CONFIG', 'llvm-config')
     llvm_includedir = env.backtick('%s --includedir' % llvm_config).rstrip()
-    print "llvm include dir %s" % llvm_includedir
-
-# the loader is included in the mesa lib itself
-# All the remaining files are in loadable modules
-loadersource = env.ParseSourceList('Makefile.sources', [
-    'LOADER_SOURCES'
-])
+    print("llvm include dir %s" % llvm_includedir)
 
 if not env['msvc'] :
     env.Append(CCFLAGS = [
@@ -59,7 +53,7 @@
     source = '',
     command = python_cmd + ' $SCRIPT --output $TARGET --gen_h'
 )
-Depends('rasterizer/codegen/gen_knobs.cpp',
+Depends('rasterizer/codegen/gen_knobs.h',
         swrroot + 'rasterizer/codegen/templates/gen_knobs.cpp')
 
 env.CodeGenerate(
@@ -140,12 +134,44 @@
 # 2 centroid
 # 2 forcedSampleCount
 # 2 canEarlyZ
+backendPixelRateFileCount = 4
+backendPixelRateFilePat = "rasterizer/core/backends/gen_BackendPixelRate%s.cpp"
+# Materialize a real list: a lazy map() object cannot be reused by Depends() and built_sources.
+backendPixelRateFiles = [backendPixelRateFilePat % x for x in range(0, backendPixelRateFileCount)]
 env.CodeGenerate(
-    target = 'rasterizer/core/gen_BackendPixelRate0.cpp',
+    target = 'rasterizer/core/backends/gen_BackendPixelRate.hpp',
     script = swrroot + 'rasterizer/codegen/gen_backends.py',
     source = '',
-    command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core --dim 5 2 3 2 2 2 --split 0 --cpp'
-)
+    command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core/backends --dim 5 2 3 2 2 2 --numfiles ' + str(backendPixelRateFileCount) + ' --cpp --hpp'
+    )
+Depends(backendPixelRateFiles,
+        ['rasterizer/core/backends/gen_BackendPixelRate.hpp',
+         'rasterizer/archrast/gen_ar_event.hpp',
+         'rasterizer/codegen/gen_knobs.h']
+        )
+
+# 5 SWR_MULTISAMPLE_TYPE_COUNT
+# 2 CenterPattern
+# 2 Conservative
+# 3 SWR_INPUT_COVERAGE_COUNT
+# 5 STATE_VALID_TRI_EDGE_COUNT
+# 2 RasterScissorEdges
+genRasterizerFileCount = 4
+genRasterizerFilePat = "rasterizer/core/backends/gen_rasterizer%s.cpp"
+# Materialize a real list: a lazy map() object cannot be reused by Depends() and built_sources.
+genRasterizerFiles = [genRasterizerFilePat % x for x in range(0, genRasterizerFileCount)]
+env.CodeGenerate(
+    target = 'rasterizer/core/backends/gen_rasterizer.hpp',
+    script = swrroot + 'rasterizer/codegen/gen_backends.py',
+    source = '',
+    command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core/backends --rast --dim 5 2 2 3 5 2 --numfiles ' + str(genRasterizerFileCount) + ' --cpp --hpp'
+    )
+Depends(genRasterizerFiles,
+        ['rasterizer/core/backends/gen_rasterizer.hpp',
+         'rasterizer/archrast/gen_ar_event.hpp',
+         'rasterizer/codegen/gen_knobs.h']
+        )
+
 Depends('rasterizer/jitter/gen_state_llvm.h',
         swrroot + 'rasterizer/codegen/templates/gen_backend.cpp')
 
@@ -153,21 +179,18 @@
 built_sources = [
     'rasterizer/codegen/gen_knobs.cpp',
     'rasterizer/archrast/gen_ar_event.cpp',
-    'rasterizer/core/gen_BackendPixelRate0.cpp',
     ]
 
+built_sources += [backendPixelRateFiles, genRasterizerFiles]
+
 source = built_sources
 source += env.ParseSourceList(swrroot + 'Makefile.sources', [
-    'CXX_SOURCES',
     'ARCHRAST_CXX_SOURCES',
     'COMMON_CXX_SOURCES',
     'CORE_CXX_SOURCES',
-    'JITTER_CXX_SOURCES',
     'MEMORY_CXX_SOURCES'
 ])
 
-env.Prepend(LIBS = [ mesautil, mesa, gallium ])
-
 env.Prepend(CPPPATH = [
     '.',
     'rasterizer',
@@ -209,14 +232,25 @@
     )
 env.Alias('swrAVX2', swrAVX2)
 
+source = env.ParseSourceList(swrroot + 'Makefile.sources', [
+    'CXX_SOURCES',
+    'COMMON_CXX_SOURCES',
+    'JITTER_CXX_SOURCES',
+    'LOADER_SOURCES'
+])
+source += [
+    'rasterizer/codegen/gen_knobs.cpp',
+    'rasterizer/archrast/gen_ar_event.cpp',
+    ]
 
 # main SWR lib
-swr = env.ConvenienceLibrary(
+envSWR = envavx.Clone() # pick up the arch flag for intrinsic usage
+envSWR.Append(CPPDEFINES = ['HAVE_SWR_AVX', 'HAVE_SWR_AVX2'])
+swr = envSWR.ConvenienceLibrary(
     target = 'swr',
-    source = loadersource,
+    source = source,
     )
 
-
 # treat arch libs as dependencies, even though they are not linked
 # into swr, so we don't have to build them separately
 Depends(swr, ['swrAVX', 'swrAVX2'])
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
index d9e938a..414a04e 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
@@ -1,7 +1,7 @@
 # Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
+# copy of this software and associated documentation files (the 'Software'),
 # to deal in the Software without restriction, including without limitation
 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 # and/or sell copies of the Software, and to permit persons to whom the
@@ -11,7 +11,7 @@
 # paragraph) shall be included in all copies or substantial portions of the
 # Software.
 #
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ -31,23 +31,41 @@
 
 def main(args=sys.argv[1:]):
     thisDir = os.path.dirname(os.path.realpath(__file__))
-    parser = ArgumentParser("Generate files and initialization functions for all permutuations of BackendPixelRate.")
-    parser.add_argument('--dim', help="gBackendPixelRateTable array dimensions", nargs='+', type=int, required=True)
-    parser.add_argument('--outdir', help="output directory", nargs='?', type=str, default=thisDir)
-    parser.add_argument('--split', help="how many lines of initialization per file [0=no split]", nargs='?', type=int, default='512')
-    parser.add_argument('--cpp', help="Generate cpp file(s)", action='store_true', default=False)
-    parser.add_argument('--cmake', help="Generate cmake file", action='store_true', default=False)
+    parser = ArgumentParser('Generate files and initialization functions for all permutuations of BackendPixelRate.')
+    parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True)
+    parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir)
+    parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default='512')
+    parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default='0')
+    parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False)
+    parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False)
+    parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False)
+    parser.add_argument('--rast', help='Generate rasterizer functions instead of normal backend', action='store_true', default=False)
 
-    args = parser.parse_args(args);
+    args = parser.parse_args(args)
+
 
     class backendStrs :
         def __init__(self) :
             self.outFileName = 'gen_BackendPixelRate%s.cpp'
+            self.outHeaderName = 'gen_BackendPixelRate.hpp'
             self.functionTableName = 'gBackendPixelRateTable'
             self.funcInstanceHeader = ' = BackendPixelRate<SwrBackendTraits<'
             self.template = 'gen_backend.cpp'
+            self.hpp_template = 'gen_header_init.hpp'
             self.cmakeFileName = 'gen_backends.cmake'
             self.cmakeSrcVar = 'GEN_BACKEND_SOURCES'
+            self.tableName = 'BackendPixelRate'
+
+            if args.rast:
+                self.outFileName = 'gen_rasterizer%s.cpp'
+                self.outHeaderName = 'gen_rasterizer.hpp'
+                self.functionTableName = 'gRasterizerFuncs'
+                self.funcInstanceHeader = ' = RasterizeTriangle<RasterizerTraits<'
+                self.template = 'gen_rasterizer.cpp'
+                self.cmakeFileName = 'gen_rasterizer.cmake'
+                self.cmakeSrcVar = 'GEN_RASTERIZER_SOURCES'
+                self.tableName = 'RasterizerFuncs'
+
 
     backend = backendStrs()
 
@@ -77,6 +95,8 @@
         numFiles = 1
     else:
         numFiles = (len(output_list) + args.split - 1) // args.split
+    if (args.numfiles != 0):
+        numFiles = args.numfiles
     linesPerFile = (len(output_list) + numFiles - 1) // numFiles
     chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
 
@@ -87,7 +107,6 @@
 
         for fileNum in range(numFiles):
             filename = baseCppName % str(fileNum)
-            #print('Generating', filename)
             MakoTemplateWriter.to_file(
                 templateCpp,
                 baseCppName % str(fileNum),
@@ -95,11 +114,23 @@
                 fileNum=fileNum,
                 funcList=chunkedList[fileNum])
 
+    if args.hpp:
+        baseHppName = os.path.join(args.outdir, backend.outHeaderName)
+        templateHpp = os.path.join(thisDir, 'templates', backend.hpp_template)
+
+        MakoTemplateWriter.to_file(
+            templateHpp,
+            baseHppName,
+            cmdline=sys.argv,
+            numFiles=numFiles,
+            filename=backend.outHeaderName,
+            tableName=backend.tableName)
+
     # generate gen_backend.cmake file
     if args.cmake:
         templateCmake = os.path.join(thisDir, 'templates', 'gen_backend.cmake')
         cmakeFile = os.path.join(args.outdir, backend.cmakeFileName)
-        #print('Generating', cmakeFile)
+
         MakoTemplateWriter.to_file(
             templateCmake,
             cmakeFile,
@@ -108,8 +139,6 @@
             numFiles=numFiles,
             baseCppName='${RASTY_GEN_SRC_DIR}/backends/' + os.path.basename(baseCppName))
 
-    #print("Generated %d template instantiations in %d files" % (len(output_list), numFiles))
-
     return 0
 
 if __name__ == '__main__':
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
index 07b455a..7f53ec6 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
@@ -22,6 +22,7 @@
 # Python source
 from __future__ import print_function
 import os
+import errno
 import sys
 import argparse
 from mako.template import Template
@@ -62,6 +63,12 @@
         '''
             Write template data to a file
         '''
+        if not os.path.exists(os.path.dirname(output_filename)):
+            try:
+                os.makedirs(os.path.dirname(output_filename))
+            except OSError as err:
+                if err.errno != errno.EEXIST:
+                    raise
         with open(output_filename, 'w') as outfile:
             print(MakoTemplateWriter.to_string(template_filename, **kwargs), file=outfile)
 
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 2ed2b2f..8b91530 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -139,6 +139,14 @@
 
                     ignore = False
 
+                    # The following functions need to be ignored in openswr.
+                    # API change in llvm-5.0 breaks baked autogen files
+                    if (
+                        (func_name == 'CreateFence' or
+                         func_name == 'CreateAtomicCmpXchg' or
+                         func_name == 'CreateAtomicRMW')):
+                        ignore = True
+
                     # The following functions need to be ignored.
                     if (func_name == 'CreateInsertNUWNSWBinOp' or
                         func_name == 'CreateMaskedIntrinsic' or
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
index 4cabde3..94f3f9f 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
@@ -62,19 +62,21 @@
             llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)'
         elif type == '__m128i':
             llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), 4)'
-        elif type == 'SIMD8::vector_t':
+        elif type == 'SIMD256::Float':
             llvm_type = 'VectorType::get(Type::getFloatTy(ctx), 8)'
-        elif type == 'SIMD8::vectori_t':
+        elif type == 'SIMD256::Integer':
             llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), 8)'
-        elif type == 'SIMD16::vector_t':
+        elif type == 'SIMD512::Float':
             llvm_type = 'VectorType::get(Type::getFloatTy(ctx), 16)'
-        elif type == 'SIMD16::vectori_t':
+        elif type == 'SIMD512::Integer':
             llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), 16)'
         elif type == 'simdvector':
-            llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth), 4)'
-        elif type == 'SIMD8::attrib_t':
             llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), 8), 4)'
-        elif type == 'SIMD16::attrib_t':
+        elif type == 'simd16vector':
+            llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), 16), 4)'
+        elif type == 'SIMD256::Vec4':
+            llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), 8), 4)'
+        elif type == 'SIMD512::Vec4':
             llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), 16), 4)'
         else:
             llvm_type = 'Gen_%s(pJitMgr)' % type
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
index 0c39a77..09e3124 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
@@ -18,6 +18,7 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
+import sys
 
 # Python source
 KNOBS = [
@@ -129,7 +130,7 @@
 
     ['MAX_DRAWS_IN_FLIGHT', {
         'type'      : 'uint32_t',
-        'default'   : '128',
+        'default'   : '256',
         'desc'      : ['Maximum number of draws outstanding before API thread blocks.',
                        'This value MUST be evenly divisible into 2^32'],
         'category'  : 'perf',
@@ -137,7 +138,7 @@
 
     ['MAX_PRIMS_PER_DRAW', {
         'type'      : 'uint32_t',
-        'default'   : '2040',
+        'default'   : '49152',
         'desc'      : ['Maximum primitives in a single Draw().',
                        'Larger primitives are split into smaller Draw calls.',
                        'Should be a multiple of (3 * vectorWidth).'],
@@ -156,11 +157,25 @@
 
     ['DEBUG_OUTPUT_DIR', {
         'type'      : 'std::string',
-        'default'   : '/tmp/Rast/DebugOutput',
+        'default'   : r'%TEMP%\Rast\DebugOutput' if sys.platform == 'win32' else '/tmp/Rast/DebugOutput',
         'desc'      : ['Output directory for debug data.'],
         'category'  : 'debug',
     }],
 
+    ['JIT_ENABLE_CACHE', {
+        'type'      : 'bool',
+        'default'   : 'false',
+        'desc'      : ['Enables caching of compiled shaders'],
+        'category'  : 'debug',
+    }],
+
+    ['JIT_CACHE_DIR', {
+        'type'      : 'std::string',
+        'default'   : r'%TEMP%\SWR\JitCache' if sys.platform == 'win32' else '${HOME}/.swr/jitcache',
+        'desc'      : ['Cache directory for compiled shaders.'],
+        'category'  : 'debug',
+    }],
+
     ['TOSS_DRAW', {
         'type'      : 'bool',
         'default'   : 'false',
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
index 3a618a1..9017e8d 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
@@ -67,8 +67,9 @@
             char buf[255];
             // There could be multiple threads creating thread pools. We
             // want to make sure they are uniquly identified by adding in
-            // the creator's thread id into the filename.
-            sprintf(buf, "%s/ar_event%d_%d.bin", "/tmp", GetCurrentThreadId(), id);
+            // the creator's thread (process) id into the filename.
+            // Assumes a 1:1 thread:LWP mapping as in linux.
+            sprintf(buf, "%s/ar_event%d_%d.bin", "/tmp", GetCurrentProcessId(), id);
             mFilename = std::string(buf);
 #endif
         }
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
index 4eb4ad4..088b1cd 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
@@ -32,6 +32,7 @@
 //============================================================================
 
 #include "core/backend.h"
+#include "core/backend_impl.h"
 
 void InitBackendPixelRate${fileNum}()
 {
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
new file mode 100644
index 0000000..5625ef8
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
@@ -0,0 +1,43 @@
+//============================================================================
+// Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice (including the next
+// paragraph) shall be included in all copies or substantial portions of the
+// Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+// 
+// @file ${filename}
+// 
+// @brief auto-generated file
+// 
+// DO NOT EDIT
+//
+// Generation Command Line:
+//  ${'\n//    '.join(cmdline)}
+//
+//============================================================================
+
+%for num in range(numFiles):
+void Init${tableName}${num}();
+%endfor
+
+static INLINE void Init${tableName}()
+{
+    %for num in range(numFiles):
+    Init${tableName}${num}();
+    %endfor
+}
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
index 81e49da..06b93bd 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
@@ -34,14 +34,44 @@
 #pragma once
 #include <string>
 
-template <typename T>
-struct Knob
+struct KnobBase
 {
-    const   T&  Value() const               { return m_Value; }
-    const   T&  Value(const T& newValue)    { m_Value = newValue; return Value(); }
+private:
+    // Update the input string.
+    static void autoExpandEnvironmentVariables(std::string &text);
 
 protected:
-    Knob(const T& defaultValue) : m_Value(defaultValue) {}
+    // Leave input alone and return new string.
+    static std::string expandEnvironmentVariables(std::string const &input)
+    {
+        std::string text = input;
+        autoExpandEnvironmentVariables(text);
+        return text;
+    }
+
+    template <typename T>
+    static T expandEnvironmentVariables(T const &input)
+    {
+        return input;
+    }
+};
+
+template <typename T>
+struct Knob : KnobBase
+{
+public:
+    const   T&  Value() const               { return m_Value; }
+    const   T&  Value(T const &newValue)
+    {
+        m_Value = expandEnvironmentVariables(newValue);
+        return Value();
+    }
+
+protected:
+    Knob(T const &defaultValue) :
+        m_Value(expandEnvironmentVariables(defaultValue))
+    {
+    }
 
 private:
     T m_Value;
@@ -102,6 +132,61 @@
 % for inc in includes:
 #include <${inc}>
 % endfor
+#include <regex>
+#include <core/utils.h>
+
+//========================================================
+// Implementation
+//========================================================
+// Replaces unix style and win32 style environment variable references in 'text', in place, with GetEnv() results.
+void KnobBase::autoExpandEnvironmentVariables(std::string &text)
+{
+#if (__GNUC__) && (GCC_VERSION < 409000)
+    // <regex> isn't implemented prior to gcc-4.9.0, so scan by hand; unix style variable replacement first
+    size_t start;
+    while ((start = text.find("${'${'}")) != std::string::npos) {
+        size_t end = text.find("}", start); // search from the opener so an earlier stray '}' cannot make end < start
+        if (end == std::string::npos)
+            break;
+        const std::string var = GetEnv(text.substr(start + 2, end - start - 2));
+        text.replace(start, end - start + 1, var);
+    }
+    // win32 style variable replacement
+    while ((start = text.find("%")) != std::string::npos) {
+        size_t end = text.find("%", start + 1);
+        if (end == std::string::npos)
+            break;
+        const std::string var = GetEnv(text.substr(start + 1, end - start - 1));
+        text.replace(start, end - start + 1, var);
+    }
+#else
+    {
+        // unix style variable replacement
+        static std::regex env("\\$\\{([^}]+)\\}");
+        std::smatch match;
+        while (std::regex_search(text, match, env))
+        {
+            const std::string var = GetEnv(match[1].str());
+            // certain combinations of gcc/libstd++ have problems with this
+            // text.replace(match[0].first, match[0].second, var);
+            text.replace(match.prefix().length(), match[0].length(), var);
+        }
+    }
+    {
+        // win32 style variable replacement; the name excludes '%' so that each reference is matched on its own
+        static std::regex env("\\%([^%]+)\\%");
+        std::smatch match;
+        while (std::regex_search(text, match, env))
+        {
+            const std::string var = GetEnv(match[1].str());
+            // certain combinations of gcc/libstd++ have problems with this
+            // text.replace(match[0].first, match[0].second, var);
+            text.replace(match.prefix().length(), match[0].length(), var);
+        }
+    }
+#endif
+}
+
 
 //========================================================
 // Static Data Members
@@ -174,4 +259,4 @@
         return ' '*(max_len - name_len)
 
 
-%>
\ No newline at end of file
+%>
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
new file mode 100644
index 0000000..06c8762
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
@@ -0,0 +1,42 @@
+//============================================================================
+// Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice (including the next
+// paragraph) shall be included in all copies or substantial portions of the
+// Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+// 
+// @file gen_rasterizer${fileNum}.cpp
+// 
+// @brief auto-generated file
+// 
+// DO NOT EDIT
+//
+// Generation Command Line:
+//  ${'\n//    '.join(cmdline)}
+//
+//============================================================================
+
+#include "core/rasterizer.h"
+#include "core/rasterizer_impl.h"
+
+void InitRasterizerFuncs${fileNum}()
+{
+    %for func in funcList:
+    ${func}
+    %endfor
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
index 72020ee..263dec6 100644
--- a/src/gallium/drivers/swr/rasterizer/common/formats.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
@@ -20,7 +20,7 @@
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
-* @file gen_formats.cpp
+* @file formats.cpp
 *
 * @brief auto-generated file
 *
@@ -2729,26 +2729,16 @@
         { 0.0f, 0.0f, 0.0f, 0.0f },
         1, 1
     },
-    // R10G10B10_FLOAT_A2_UNORM (0xD5)
+    // padding (0xD5)
     {
-        "R10G10B10_FLOAT_A2_UNORM",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        nullptr,
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
+        0, 0, 0, false, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1
     },
-
     // R32_SINT (0xD6)
     {
         "R32_SINT",
@@ -5179,16 +5169,26 @@
         { 0.0f, 0.0f, 0.0f, 0.0f },
         1, 1
     },
-    // padding (0x180)
+    // DXT1_RGB_SRGB (0x180)
     {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
+        "DXT1_RGB_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 2, 3 }, // Swizzle
+        { 8, 8, 8, 8 }, // Bits per component
+        64, // Bits per element
+        8, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        false, // isLuminance
+        { true, false, false, false }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
     },
+
     // padding (0x181)
     {
         nullptr,
@@ -5449,16 +5449,26 @@
         { 0.0f, 0.0f, 0.0f, 0.0f },
         1, 1
     },
-    // padding (0x191)
+    // DXT1_RGB (0x191)
     {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
+        "DXT1_RGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 2, 3 }, // Swizzle
+        { 8, 8, 8, 8 }, // Bits per component
+        64, // Bits per element
+        8, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        false, // isLuminance
+        { true, false, false, false }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
     },
+
     // padding (0x192)
     {
         nullptr,
diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h
index 0056a56..f13f338 100644
--- a/src/gallium/drivers/swr/rasterizer/common/formats.h
+++ b/src/gallium/drivers/swr/rasterizer/common/formats.h
@@ -20,7 +20,7 @@
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 * 
-* @file gen_formats.h
+* @file formats.h
 * 
 * @brief auto-generated file
 * 
@@ -181,6 +181,7 @@
     L8_SINT                     = 0x153,
     I8_UINT                     = 0x154,
     I8_SINT                     = 0x155,
+    DXT1_RGB_SRGB               = 0x180,
     YCRCB_SWAPUVY               = 0x183,
     BC1_UNORM                   = 0x186,
     BC2_UNORM                   = 0x187,
@@ -191,6 +192,7 @@
     BC2_UNORM_SRGB              = 0x18C,
     BC3_UNORM_SRGB              = 0x18D,
     YCRCB_SWAPUV                = 0x18F,
+    DXT1_RGB                    = 0x191,
     R8G8B8_UNORM                = 0x193,
     R8G8B8_SNORM                = 0x194,
     R8G8B8_SSCALED              = 0x195,
diff --git a/src/gallium/drivers/swr/rasterizer/common/intrin.h b/src/gallium/drivers/swr/rasterizer/common/intrin.h
new file mode 100644
index 0000000..33d37e3
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/intrin.h
@@ -0,0 +1,117 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#ifndef __SWR_INTRIN_H__
+#define __SWR_INTRIN_H__
+
+#include "os.h"
+
+#define SIMD_ARCH KNOB_ARCH
+#include "simdlib_types.hpp"
+
+typedef SIMDImpl::SIMD128Impl::Float                      simd4scalar;
+typedef SIMDImpl::SIMD128Impl::Double                     simd4scalard;
+typedef SIMDImpl::SIMD128Impl::Integer                    simd4scalari;
+typedef SIMDImpl::SIMD128Impl::Vec4                       simd4vector;
+typedef SIMDImpl::SIMD128Impl::Mask                       simd4mask;
+
+typedef SIMDImpl::SIMD256Impl::Float                      simd8scalar;
+typedef SIMDImpl::SIMD256Impl::Double                     simd8scalard;
+typedef SIMDImpl::SIMD256Impl::Integer                    simd8scalari;
+typedef SIMDImpl::SIMD256Impl::Vec4                       simd8vector;
+typedef SIMDImpl::SIMD256Impl::Mask                       simd8mask;
+
+typedef SIMDImpl::SIMD512Impl::Float                      simd16scalar;
+typedef SIMDImpl::SIMD512Impl::Double                     simd16scalard;
+typedef SIMDImpl::SIMD512Impl::Integer                    simd16scalari;
+typedef SIMDImpl::SIMD512Impl::Vec4                       simd16vector;
+typedef SIMDImpl::SIMD512Impl::Mask                       simd16mask;
+
+#if KNOB_SIMD_WIDTH == 8 
+typedef simd8scalar     simdscalar;
+typedef simd8scalard    simdscalard;
+typedef simd8scalari    simdscalari;
+typedef simd8vector     simdvector;
+typedef simd8mask       simdmask;
+#else
+#error Unsupported vector width
+#endif
+
+// Deposits successive low bits of 'a' into the set-bit positions of 'mask' (the _pdep_u32 intrinsic, emulated below AVX2).
+INLINE
+UINT pdep_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
+    return _pdep_u32(a, mask);
+#else
+    UINT result = 0;
+    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
+    // using bsf instead of funky loop
+    DWORD maskIndex;
+    while (_BitScanForward(&maskIndex, mask))
+    {
+        // 1. isolate lowest set bit of mask (unsigned 1, so reaching bit 31 is not a signed overflow)
+        const UINT lowest = UINT(1) << maskIndex;
+
+        // 2. populate LSB from src: all ones when bit 0 of a is set, zero otherwise
+        const UINT LSB = UINT(0) - (a & UINT(1));
+
+        // 3. set the mask position in the result when the source bit is 1
+        result |= LSB & lowest;
+
+        // 4. clear lowest bit
+        mask &= ~lowest;
+
+        // 5. prepare for next iteration
+        a >>= 1;
+    }
+
+    return result;
+#endif
+}
+
+// Gathers the bits of 'a' selected by 'mask' into the contiguous low bits of the result (_pext_u32, emulated below AVX2).
+INLINE
+UINT pext_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
+    return _pext_u32(a, mask);
+#else
+    UINT result = 0;
+    DWORD maskIndex;
+    uint32_t currentBit = 0;
+    while (_BitScanForward(&maskIndex, mask))
+    {
+        // 1. isolate lowest set bit of mask (unsigned 1, so reaching bit 31 is not a signed overflow)
+        const UINT lowest = UINT(1) << maskIndex;
+
+        // 2. copy the selected bit of a into the next free result bit (kept unsigned for the shift)
+        result |= UINT((a & lowest) != 0) << currentBit++;
+        // 3. clear lowest bit
+        mask &= ~lowest;
+    }
+    return result;
+#endif
+}
+
+#endif//__SWR_INTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.cpp b/src/gallium/drivers/swr/rasterizer/common/os.cpp
new file mode 100644
index 0000000..27ad5e9
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/os.cpp
@@ -0,0 +1,153 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+
+#include "common/os.h"
+#include <vector>
+#include <sstream>
+
+#if defined(_WIN32)
+#include <shlobj.h>
+#endif // Windows
+
+#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
+#include <pthread.h>
+#endif // Linux
+
+
+
+#if defined(_WIN32)
+static const DWORD MS_VC_EXCEPTION = 0x406D1388;
+
+#pragma pack(push,8)  
+typedef struct tagTHREADNAME_INFO
+{
+    DWORD dwType; // Must be 0x1000.  
+    LPCSTR szName; // Pointer to name (in user addr space).  
+    DWORD dwThreadID; // Thread ID (-1=caller thread).  
+    DWORD dwFlags; // Reserved for future use, must be zero.  
+} THREADNAME_INFO;
+#pragma pack(pop)
+
+// Names the calling thread for an attached debugger by raising MS_VC_EXCEPTION.
+void LegacySetThreadName(const char* pThreadName)
+{
+    // Without a debugger nobody interprets the exception, so skip raising it.
+    if (!IsDebuggerPresent())
+    {
+        return;
+    }
+    THREADNAME_INFO threadInfo;
+    threadInfo.dwType     = 0x1000;
+    threadInfo.szName     = pThreadName;
+    threadInfo.dwThreadID = GetCurrentThreadId();
+    threadInfo.dwFlags    = 0;
+
+#pragma warning(push)
+#pragma warning(disable: 6320 6322)
+    __try {
+        RaiseException(MS_VC_EXCEPTION, 0, sizeof(threadInfo) / sizeof(ULONG_PTR), (ULONG_PTR*)&threadInfo);
+    }
+    __except (EXCEPTION_EXECUTE_HANDLER) {
+    }
+#pragma warning(pop)
+}
+#endif // _WIN32
+
+void SWR_API SetCurrentThreadName(const char* pThreadName) // names the calling thread; only the _WIN32 and Linux branches below do anything
+{
+#if defined(_WIN32)
+    // The SetThreadDescription API was brought in version 1607 of Windows 10,
+    // so it is looked up at run time instead of being linked against directly.
+    typedef HRESULT(WINAPI* PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription);
+    // The SetThreadDescription API works even if no debugger is attached.
+    auto pfnSetThreadDescription =
+        reinterpret_cast<PFNSetThreadDescription>(
+            GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription"));
+
+    if (!pfnSetThreadDescription)
+    {
+        // Not exported by Kernel32.dll: try KernelBase.dll
+        pfnSetThreadDescription =
+            reinterpret_cast<PFNSetThreadDescription>(
+                GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription"));
+    }
+
+    if (pfnSetThreadDescription)
+    {
+        std::string utf8Name = pThreadName;
+        std::wstring wideName;
+        wideName.resize(utf8Name.size() + 1); // one extra element for the terminator written by swprintf_s
+        swprintf_s(&(wideName.front()), wideName.size(), L"%S", utf8Name.c_str()); // widen the narrow name into wideName
+        HRESULT hr = pfnSetThreadDescription(GetCurrentThread(), wideName.c_str());
+        SWR_ASSERT(SUCCEEDED(hr), "Failed to set thread name to %s", pThreadName);
+
+        // Fall through - it seems like some debuggers only recognize the exception
+    }
+
+    // The exception based hack runs unconditionally, whether or not SetThreadDescription was found
+    LegacySetThreadName(pThreadName);
+#endif // _WIN32
+
+#if defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
+    pthread_setname_np(pthread_self(), pThreadName); // return value is not checked
+#endif // Linux
+}
+
+// Splits 'input' on 'splitToken' into out_segments (cleared first), dropping empty segments.
+static void SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken)
+{
+    out_segments.clear();
+    std::istringstream stream(input);
+    for (std::string piece; std::getline(stream, piece, splitToken); )
+    {
+        if (piece.empty())
+        {
+            continue;
+        }
+        out_segments.push_back(piece);
+    }
+}
+
+// Creates 'path' along with any missing parent directories; failures are not reported to the caller.
+void SWR_API CreateDirectoryPath(const std::string& path)
+{
+#if defined(_WIN32)
+    SHCreateDirectoryExA(nullptr, path.c_str(), nullptr);
+#endif // Windows
+
+#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
+    std::vector<std::string> pathSegments;
+    SplitString(pathSegments, path, '/');
+    std::string tmpPath;
+    for (auto const& segment : pathSegments)
+    {
+        // Only an absolute 'path' gets a leading '/'; a relative one must stay relative.
+        if (!tmpPath.empty() || path.front() == '/') { tmpPath.push_back('/'); }
+        tmpPath += segment;
+        int result = mkdir(tmpPath.c_str(), 0777);
+        if (result == -1 && errno != EEXIST)
+        {
+            break;
+        }
+    }
+#endif // Unix
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 0b8f273..dc90fca 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2017 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -30,6 +30,7 @@
 #if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
 
 #define SWR_API __cdecl
+#define SWR_VISIBLE  __declspec(dllexport)
 
 #ifndef NOMINMAX
 #define NOMINMAX
@@ -91,6 +92,7 @@
 #elif defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
 
 #define SWR_API
+#define SWR_VISIBLE __attribute__((visibility("default")))
 
 #include <stdlib.h>
 #include <string.h>
@@ -235,10 +237,6 @@
 #define sprintf_s sprintf
 #define strcpy_s(dst,size,src) strncpy(dst,src,size)
 #define GetCurrentProcessId getpid
-pid_t gettid(void);
-#define GetCurrentThreadId gettid
-
-#define CreateDirectory(name, pSecurity) mkdir(name, 0777)
 
 #define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
 #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
@@ -279,4 +277,12 @@
 #define ATTR_UNUSED
 #endif
 
+#define SWR_FUNC(_retType, _funcName, /* args */...)   \
+   typedef _retType (SWR_API * PFN##_funcName)(__VA_ARGS__); \
+  _retType SWR_API _funcName(__VA_ARGS__);
+
+// Defined in os.cpp
+void SWR_API SetCurrentThreadName(const char* pThreadName);
+void SWR_API CreateDirectoryPath(const std::string& path);
+
 #endif//__SWR_OS_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index 90c903d..a160ca2 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -27,773 +27,143 @@
 #if ENABLE_AVX512_SIMD16
 
 #if KNOB_SIMD16_WIDTH == 16
-
-#if ENABLE_AVX512_EMULATION
-struct simd16scalar
-{
-    __m256  lo;
-    __m256  hi;
-};
-struct simd16scalard
-{
-    __m256d lo;
-    __m256d hi;
-};
-struct simd16scalari
-{
-    __m256i lo;
-    __m256i hi;
-};
-typedef uint16_t simd16mask;
-
-#else
-typedef __m512 simd16scalar;
-typedef __m512d simd16scalard;
-typedef __m512i simd16scalari;
-typedef __mmask16 simd16mask;
-#endif//ENABLE_AVX512_EMULATION
+typedef SIMD512                             SIMD16;
 #else
 #error Unsupported vector width
 #endif//KNOB_SIMD16_WIDTH == 16
 
-#define _simd16_masklo(mask) ((mask) & 0xFF)
-#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
-#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
+#define _simd16_setzero_ps                  SIMD16::setzero_ps
+#define _simd16_setzero_si                  SIMD16::setzero_si
+#define _simd16_set1_ps                     SIMD16::set1_ps
+#define _simd16_set1_epi8                   SIMD16::set1_epi8
+#define _simd16_set1_epi32                  SIMD16::set1_epi32
+#define _simd16_set_ps                      SIMD16::set_ps
+#define _simd16_set_epi32                   SIMD16::set_epi32
+#define _simd16_load_ps                     SIMD16::load_ps
+#define _simd16_loadu_ps                    SIMD16::loadu_ps
+#if 1                                       
+#define _simd16_load1_ps                    SIMD16::broadcast_ss
+#endif                                      
+#define _simd16_load_si                     SIMD16::load_si
+#define _simd16_loadu_si                    SIMD16::loadu_si
+#define _simd16_broadcast_ss(m)             SIMD16::broadcast_ss((float const*)(m)) // (m) parenthesized so an expression argument is cast as a whole
+#define _simd16_store_ps                    SIMD16::store_ps
+#define _simd16_store_si                    SIMD16::store_si
+#define _simd16_extract_ps(a, imm8)         SIMD16::extract_ps<imm8>(a)
+#define _simd16_extract_si(a, imm8)         SIMD16::extract_si<imm8>(a)
+#define _simd16_insert_ps(a, b, imm8)       SIMD16::insert_ps<imm8>(a, b)
+#define _simd16_insert_si(a, b, imm8)       SIMD16::insert_si<imm8>(a, b)
+#define _simd16_maskstore_ps                SIMD16::maskstore_ps
+#define _simd16_blend_ps(a, b, mask)        SIMD16::blend_ps<mask>(a, b)
+#define _simd16_blendv_ps                   SIMD16::blendv_ps
+#define _simd16_blendv_epi32                SIMD16::blendv_epi32
+#define _simd16_mul_ps                      SIMD16::mul_ps
+#define _simd16_div_ps                      SIMD16::div_ps
+#define _simd16_add_ps                      SIMD16::add_ps
+#define _simd16_sub_ps                      SIMD16::sub_ps
+#define _simd16_rsqrt_ps                    SIMD16::rsqrt_ps
+#define _simd16_min_ps                      SIMD16::min_ps
+#define _simd16_max_ps                      SIMD16::max_ps
+#define _simd16_movemask_ps                 SIMD16::movemask_ps
+#define _simd16_movemask_pd                 SIMD16::movemask_pd
+#define _simd16_cvtps_epi32                 SIMD16::cvtps_epi32
+#define _simd16_cvttps_epi32                SIMD16::cvttps_epi32
+#define _simd16_cvtepi32_ps                 SIMD16::cvtepi32_ps
+#define _simd16_cmp_ps(a, b, comp)          SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
+#define _simd16_cmplt_ps                    SIMD16::cmplt_ps
+#define _simd16_cmpgt_ps                    SIMD16::cmpgt_ps
+#define _simd16_cmpneq_ps                   SIMD16::cmpneq_ps
+#define _simd16_cmpeq_ps                    SIMD16::cmpeq_ps
+#define _simd16_cmpge_ps                    SIMD16::cmpge_ps
+#define _simd16_cmple_ps                    SIMD16::cmple_ps
+#define _simd16_castsi_ps                   SIMD16::castsi_ps
+#define _simd16_castps_si                   SIMD16::castps_si
+#define _simd16_castsi_pd                   SIMD16::castsi_pd
+#define _simd16_castpd_si                   SIMD16::castpd_si
+#define _simd16_castpd_ps                   SIMD16::castpd_ps
+#define _simd16_castps_pd                   SIMD16::castps_pd
+#define _simd16_and_ps                      SIMD16::and_ps
+#define _simd16_andnot_ps                   SIMD16::andnot_ps
+#define _simd16_or_ps                       SIMD16::or_ps
+#define _simd16_xor_ps                      SIMD16::xor_ps
+#define _simd16_round_ps(a, mode)           SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
+#define _simd16_mul_epi32                   SIMD16::mul_epi32
+#define _simd16_mullo_epi32                 SIMD16::mullo_epi32
+#define _simd16_sub_epi32                   SIMD16::sub_epi32
+#define _simd16_sub_epi64                   SIMD16::sub_epi64
+#define _simd16_min_epi32                   SIMD16::min_epi32
+#define _simd16_max_epi32                   SIMD16::max_epi32
+#define _simd16_min_epu32                   SIMD16::min_epu32
+#define _simd16_max_epu32                   SIMD16::max_epu32
+#define _simd16_add_epi32                   SIMD16::add_epi32
+#define _simd16_and_si                      SIMD16::and_si
+#define _simd16_andnot_si                   SIMD16::andnot_si
+#define _simd16_or_si                       SIMD16::or_si
+#define _simd16_xor_si                      SIMD16::xor_si
+#define _simd16_cmpeq_epi32                 SIMD16::cmpeq_epi32
+#define _simd16_cmpgt_epi32                 SIMD16::cmpgt_epi32
+#define _simd16_cmplt_epi32                 SIMD16::cmplt_epi32
+#define _simd16_testz_ps                    SIMD16::testz_ps
+#define _simd16_unpacklo_ps                 SIMD16::unpacklo_ps
+#define _simd16_unpackhi_ps                 SIMD16::unpackhi_ps
+#define _simd16_unpacklo_pd                 SIMD16::unpacklo_pd
+#define _simd16_unpackhi_pd                 SIMD16::unpackhi_pd
+#define _simd16_unpacklo_epi8               SIMD16::unpacklo_epi8
+#define _simd16_unpackhi_epi8               SIMD16::unpackhi_epi8
+#define _simd16_unpacklo_epi16              SIMD16::unpacklo_epi16
+#define _simd16_unpackhi_epi16              SIMD16::unpackhi_epi16
+#define _simd16_unpacklo_epi32              SIMD16::unpacklo_epi32
+#define _simd16_unpackhi_epi32              SIMD16::unpackhi_epi32
+#define _simd16_unpacklo_epi64              SIMD16::unpacklo_epi64
+#define _simd16_unpackhi_epi64              SIMD16::unpackhi_epi64
+#define _simd16_slli_epi32(a, i)            SIMD16::slli_epi32<i>(a)
+#define _simd16_srli_epi32(a, i)            SIMD16::srli_epi32<i>(a)
+#define _simd16_srai_epi32(a, i)            SIMD16::srai_epi32<i>(a)
+#define _simd16_fmadd_ps                    SIMD16::fmadd_ps
+#define _simd16_fmsub_ps                    SIMD16::fmsub_ps
+#define _simd16_adds_epu8                   SIMD16::adds_epu8
+#define _simd16_subs_epu8                   SIMD16::subs_epu8
+#define _simd16_add_epi8                    SIMD16::add_epi8
+#define _simd16_shuffle_epi8                SIMD16::shuffle_epi8
 
-#if defined(_WIN32)
-#define SIMDAPI __vectorcall
-#else
-#define SIMDAPI
-#endif
+#define _simd16_i32gather_ps(m, index, scale)               SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index)
+#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
 
-OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
-{
-    simd16scalar  v[4];
-    struct
-    {
-        simd16scalar x, y, z, w;
-    };
+#define _simd16_abs_epi32                   SIMD16::abs_epi32
 
-    simd16scalar& operator[] (const int i) { return v[i]; }
-    const simd16scalar& operator[] (const int i) const { return v[i]; }
-};
+#define _simd16_cmpeq_epi64                 SIMD16::cmpeq_epi64
+#define _simd16_cmpgt_epi64                 SIMD16::cmpgt_epi64
+#define _simd16_cmpeq_epi16                 SIMD16::cmpeq_epi16
+#define _simd16_cmpgt_epi16                 SIMD16::cmpgt_epi16
+#define _simd16_cmpeq_epi8                  SIMD16::cmpeq_epi8
+#define _simd16_cmpgt_epi8                  SIMD16::cmpgt_epi8
 
-#if ENABLE_AVX512_EMULATION
-
-#define SIMD16_EMU_AVX512_0(type, func, intrin) \
-INLINE type SIMDAPI func()\
-{\
-    type result;\
-\
-    result.lo = intrin();\
-    result.hi = intrin();\
-\
-    return result;\
-}
-
-#define SIMD16_EMU_AVX512_1(type, func, intrin) \
-INLINE type SIMDAPI func(type a)\
-{\
-    type result;\
-\
-    result.lo = intrin(a.lo);\
-    result.hi = intrin(a.hi);\
-\
-    return result;\
-}
-
-#define SIMD16_EMU_AVX512_2(type, func, intrin) \
-INLINE type SIMDAPI func(type a, type b)\
-{\
-    type result;\
-\
-    result.lo = intrin(a.lo, b.lo);\
-    result.hi = intrin(a.hi, b.hi);\
-\
-    return result;\
-}
-
-#define SIMD16_EMU_AVX512_3(type, func, intrin) \
-INLINE type SIMDAPI func(type a, type b, type c)\
-{\
-    type result;\
-\
-    result.lo = intrin(a.lo, b.lo, c.lo);\
-    result.hi = intrin(a.hi, b.hi, c.hi);\
-\
-    return result;\
-}
-
-SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
-SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)
-
-INLINE simd16scalar SIMDAPI _simd16_set1_ps(float a)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_set1_ps(a);
-    result.hi = _mm256_set1_ps(a);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set1_epi8(char a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set1_epi8(a);
-    result.hi = _mm256_set1_epi8(a);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set1_epi32(int a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set1_epi32(a);
-    result.hi = _mm256_set1_epi32(a);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_load_ps(float const *m)
-{
-    simd16scalar result;
-
-    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
-
-    result.lo = _mm256_load_ps(m);
-    result.hi = _mm256_load_ps(n);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_loadu_ps(float const *m)
-{
-    simd16scalar result;
-
-    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
-
-    result.lo = _mm256_loadu_ps(m);
-    result.hi = _mm256_loadu_ps(n);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_load1_ps(float const *m)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_broadcast_ss(m);
-    result.hi = _mm256_broadcast_ss(m);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_load_si(simd16scalari const *m)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_load_si256(&m[0].lo);
-    result.hi = _mm256_load_si256(&m[0].hi);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_loadu_si(simd16scalari const *m)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_loadu_si256(&m[0].lo);
-    result.hi = _mm256_loadu_si256(&m[0].hi);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_broadcast_ss(float const *m)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_broadcast_ss(m);
-    result.hi = _mm256_broadcast_ss(m);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_broadcast_ps(__m128 const *m)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_broadcast_ps(m);
-    result.hi = _mm256_broadcast_ps(m);
-
-    return result;
-}
-
-INLINE void SIMDAPI _simd16_store_ps(float *m, simd16scalar a)
-{
-    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
-
-    _mm256_store_ps(m, a.lo);
-    _mm256_store_ps(n, a.hi);
-}
-
-INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
-{
-    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
-
-    _mm256_maskstore_ps(m, mask.lo, a.lo);
-    _mm256_maskstore_ps(n, mask.hi, a.hi);
-}
-
-INLINE void SIMDAPI _simd16_store_si(simd16scalari *m, simd16scalari a)
-{
-    _mm256_store_si256(&m[0].lo, a.lo);
-    _mm256_store_si256(&m[0].hi, a.hi);
-}
-
-INLINE simdscalar SIMDAPI _simd16_extract_ps(simd16scalar a, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        return a.lo;
-    case 1:
-        return a.hi;
-    }
-    return _simd_set1_ps(0.0f);
-}
-
-INLINE simdscalari SIMDAPI _simd16_extract_si(simd16scalari a, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        return a.lo;
-    case 1:
-        return a.hi;
-    }
-    return _simd_set1_epi32(0);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        a.lo = b;
-        break;
-    case 1:
-        a.hi = b;
-        break;
-    }
-    return a;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        a.lo = b;
-        break;
-    case 1:
-        a.hi = b;
-        break;
-    }
-    return a;
-}
-
-template <simd16mask mask>
-INLINE simd16scalar SIMDAPI _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
-    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));
-
-    return result;
-}
-
-#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
-
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
-    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
-    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));
-
-    return result;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)
-
-INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
-{
-    simdmask mask_lo = _mm256_movemask_ps(a.lo);
-    simdmask mask_hi = _mm256_movemask_ps(a.hi);
-
-    return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 8);
-}
-
-INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
-{
-    simdmask mask_lo = _mm256_movemask_pd(a.lo);
-    simdmask mask_hi = _mm256_movemask_pd(a.hi);
-
-    return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 4);
-}
-
-INLINE uint64_t SIMDAPI _simd16_movemask_epi8(simd16scalari a)
-{
-    uint32_t mask_lo = _simd_movemask_epi8(a.lo);
-    uint32_t mask_hi = _simd_movemask_epi8(a.hi);
-
-    return static_cast<uint64_t>(mask_lo) | (static_cast<uint64_t>(mask_hi) << 32);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtps_epi32(simd16scalar a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_cvtps_epi32(a.lo);
-    result.hi = _mm256_cvtps_epi32(a.hi);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvttps_epi32(simd16scalar a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_cvttps_epi32(a.lo);
-    result.hi = _mm256_cvttps_epi32(a.hi);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_cvtepi32_ps(simd16scalari a)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_cvtepi32_ps(a.lo);
-    result.hi = _mm256_cvtepi32_ps(a.hi);
-
-    return result;
-}
-
-template <int comp>
-INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
-    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);
-
-    return result;
-}
-
-#define _simd16_cmp_ps(a, b, comp)  _simd16_cmp_ps_temp<comp>(a, b)
-
-#define _simd16_cmplt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd16_cmpgt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd16_cmpneq_ps(a, b)     _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd16_cmpeq_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd16_cmpge_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd16_cmple_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LE_OQ)
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_xor_ps, _simd_xor_ps)
-
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)
-
-INLINE simd16scalar SIMDAPI _simd16_castsi_ps(simd16scalari a)
-{
-    return *reinterpret_cast<simd16scalar *>(&a);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_castps_si(simd16scalar a)
-{
-    return *reinterpret_cast<simd16scalari *>(&a);
-}
-
-INLINE simd16scalard SIMDAPI _simd16_castsi_pd(simd16scalari a)
-{
-    return *reinterpret_cast<simd16scalard *>(&a);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_castpd_si(simd16scalard a)
-{
-    return *reinterpret_cast<simd16scalari *>(&a);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_castpd_ps(simd16scalard a)
-{
-    return *reinterpret_cast<simd16scalar *>(&a);
-}
-
-INLINE simd16scalard SIMDAPI _simd16_castps_pd(simd16scalar a)
-{
-    return *reinterpret_cast<simd16scalard *>(&a);
-}
-
-template <int mode>
-INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_round_ps(a.lo, mode);
-    result.hi = _mm256_round_ps(a.hi, mode);
-
-    return result;
-}
-
-#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)
-
-INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
-{
-    int lo = _simd_testz_ps(a.lo, b.lo);
-    int hi = _simd_testz_ps(a.hi, b.hi);
-
-    return lo & hi;
-}
-
-#define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a)
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
-SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
-SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_slli_epi32_temp(simd16scalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_slli_epi32(a.lo, imm8);
-    result.hi = _simd_slli_epi32(a.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_srai_epi32_temp(simd16scalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_srai_epi32(a.lo, imm8);
-    result.hi = _simd_srai_epi32(a.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_srli_epi32_temp(simd16scalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_srli_epi32(a.lo, imm8);
-    result.hi = _simd_srli_epi32(a.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)
-
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
-{
-    simd16scalar result;
-
-    result.lo = _simd_i32gather_ps(m, index.lo, scale);
-    result.hi = _simd_i32gather_ps(m, index.hi, scale);
-
-    return result;
-}
-
-#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
-{
-    simd16scalar result;
-
-    result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
-    result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);
-
-    return result;
-}
-
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
-SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)
-
-INLINE simd16scalar SIMDAPI _simd16_permute_ps(simd16scalar a, simd16scalari i)
-{
-    simd16scalar result;
-
-    const simdscalari mask = _simd_set1_epi32(7);
-
-    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
-    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));
-
-    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
-    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));
-
-    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
-    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_permute_epi32(simd16scalari a, simd16scalari i)
-{
-    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
-}
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)
-
-template <int imm8>
-INLINE simd16scalar SIMDAPI _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
-    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
-    return result;
-}
-
-#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalard SIMDAPI _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
-{
-    simd16scalard result;
-
-    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
-    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
-    return result;
-}
-
-#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
-{
-    simd16scalari result;
-
-    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
-    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
-    return result;
-}
-
-#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalar SIMDAPI _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
-    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalard SIMDAPI _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
-{
-    simd16scalard result;
-
-    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
-    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));
-
-    return result;
-}
-
-#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
-}
-
-#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
-}
-
-#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi16(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi32(__m128i a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu8_epi32(a);
-    result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu16_epi32(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)
-
-INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
-{
-    return mask;
-}
-
-INLINE int SIMDAPI SIMDAPI _simd16_mask2int(simd16mask mask)
-{
-    return mask;
-}
-
-INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
-{
-    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
-}
+#define _simd16_permute_ps                  SIMD16::permute_ps
+#define _simd16_permute_epi32               SIMD16::permute_epi32
+#define _simd16_sllv_epi32                  SIMD16::sllv_epi32
+#define _simd16_srlv_epi32                  SIMD16::srlv_epi32
+#define _simd16_permute2f128_ps(a, b, i)    SIMD16::permute2f128_ps<i>(a, b)
+#define _simd16_permute2f128_pd(a, b, i)    SIMD16::permute2f128_pd<i>(a, b)
+#define _simd16_permute2f128_si(a, b, i)    SIMD16::permute2f128_si<i>(a, b)
+#define _simd16_shuffle_ps(a, b, i)         SIMD16::shuffle_ps<i>(a, b)
+#define _simd16_shuffle_pd(a, b, i)         SIMD16::shuffle_pd<i>(a, b)
+#define _simd16_shuffle_epi32(a, b, imm8)   SIMD16::shuffle_epi32<imm8>(a, b)
+#define _simd16_shuffle_epi64(a, b, imm8)   SIMD16::shuffle_epi64<imm8>(a, b)
+#define _simd16_cvtepu8_epi16               SIMD16::cvtepu8_epi16
+#define _simd16_cvtepu8_epi32               SIMD16::cvtepu8_epi32
+#define _simd16_cvtepu16_epi32              SIMD16::cvtepu16_epi32
+#define _simd16_cvtepu16_epi64              SIMD16::cvtepu16_epi64
+#define _simd16_cvtepu32_epi64              SIMD16::cvtepu32_epi64
+#define _simd16_packus_epi16                SIMD16::packus_epi16
+#define _simd16_packs_epi16                 SIMD16::packs_epi16
+#define _simd16_packus_epi32                SIMD16::packus_epi32
+#define _simd16_packs_epi32                 SIMD16::packs_epi32
+#define _simd16_cmplt_ps_mask               SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
+#define _simd16_int2mask(mask)              simd16mask(mask)
+#define _simd16_mask2int(mask)              int(mask)
 
 // convert bitmask to vector mask
-INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
+SIMDINLINE simd16scalar vMask16(int32_t mask)
 {
     simd16scalari temp = _simd16_set1_epi32(mask);
 
@@ -804,343 +174,6 @@
     return _simd16_castsi_ps(result);
 }
 
-#else
-
-INLINE simd16mask SIMDAPI _simd16_scalari2mask(simd16scalari mask)
-{
-    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
-}
-
-INLINE simd16mask SIMDAPI _simd16_scalard2mask(simd16scalard mask)
-{
-    return _mm512_cmpneq_epu64_mask(_mm512_castpd_si512(mask), _mm512_setzero_si512());
-}
-
-#define _simd16_setzero_ps      _mm512_setzero_ps
-#define _simd16_setzero_si      _mm512_setzero_si512
-#define _simd16_set1_ps         _mm512_set1_ps
-#define _simd16_set1_epi8       _mm512_set1_epi8
-#define _simd16_set1_epi32      _mm512_set1_epi32
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-#define _simd16_load_ps         _mm512_load_ps
-#define _simd16_loadu_ps        _mm512_loadu_ps
-#if 1
-#define _simd16_load1_ps        _simd16_broadcast_ss
-#endif
-#define _simd16_load_si         _mm512_load_si512
-#define _simd16_loadu_si        _mm512_loadu_si512
-#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
-#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
-#define _simd16_store_ps        _mm512_store_ps
-#define _simd16_store_si        _mm512_store_si512
-#define _simd16_extract_ps(a, imm8) _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(a), imm8))
-#define _simd16_extract_si      _mm512_extracti64x4_epi64
-#define _simd16_insert_ps(a, b, imm8)  _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castps_si512(a), _mm256_castps_si256(b), imm8))
-#define _simd16_insert_si       _mm512_inserti64x4
-
-INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
-{
-    simd16mask k = _simd16_scalari2mask(mask);
-
-    _mm512_mask_store_ps(m, k, a);
-}
-
-#define _simd16_blend_ps(a, b, mask)    _mm512_mask_blend_ps(mask, a, b)
-
-INLINE simd16scalar SIMDAPI _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
-{
-    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
-
-    return _mm512_mask_blend_ps(k, a, b);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
-{
-    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
-
-    return _mm512_mask_blend_epi32(k, a, b);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
-{
-    simd16mask k = _simd16_scalari2mask(mask);
-
-    return _mm512_mask_blend_epi32(k, a, b);
-}
-
-#define _simd16_mul_ps          _mm512_mul_ps
-#define _simd16_div_ps          _mm512_div_ps
-#define _simd16_add_ps          _mm512_add_ps
-#define _simd16_sub_ps          _mm512_sub_ps
-#define _simd16_rsqrt_ps        _mm512_rsqrt14_ps
-#define _simd16_min_ps          _mm512_min_ps
-#define _simd16_max_ps          _mm512_max_ps
-
-INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
-{
-    return  _simd16_scalari2mask(_mm512_castps_si512(a));
-}
-
-INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
-{
-    return  _simd16_scalard2mask(a);
-}
-
-#if 0
-INLINE int SIMDAPI _simd16_movemask_epi8(simd16scalari a)
-{
-    return  _simd16_scalar2mask(a);
-}
-#endif
-
-#define _simd16_cvtps_epi32     _mm512_cvtps_epi32
-#define _simd16_cvttps_epi32    _mm512_cvttps_epi32
-#define _simd16_cvtepi32_ps     _mm512_cvtepi32_ps
-
-template <int comp>
-INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);
-
-    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
-}
-
-#define _simd16_cmp_ps(a, b, comp)  _simd16_cmp_ps_temp<comp>(a, b)
-
-#define _simd16_cmplt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd16_cmpgt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd16_cmpneq_ps(a, b)     _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd16_cmpeq_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd16_cmpge_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd16_cmple_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LE_OQ)
-
-#define _simd16_castsi_ps           _mm512_castsi512_ps
-#define _simd16_castps_si           _mm512_castps_si512
-#define _simd16_castsi_pd           _mm512_castsi512_pd
-#define _simd16_castpd_si           _mm512_castpd_si512
-#define _simd16_castpd_ps           _mm512_castpd_ps
-#define _simd16_castps_pd           _mm512_castps_pd
-
-#define _simd16_and_ps              _mm512_and_ps
-#define _simd16_andnot_ps           _mm512_andnot_ps
-#define _simd16_or_ps               _mm512_or_ps
-#define _simd16_xor_ps              _mm512_xor_ps
-
-template <int mode>
-INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
-{
-    return _mm512_roundscale_ps(a, mode);
-}
-
-#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-
-#define _simd16_mul_epi32         _mm512_mul_epi32
-#define _simd16_mullo_epi32       _mm512_mullo_epi32
-#define _simd16_sub_epi32         _mm512_sub_epi32
-#define _simd16_sub_epi64         _mm512_sub_epi64
-#define _simd16_min_epi32         _mm512_min_epi32
-#define _simd16_max_epi32         _mm512_max_epi32
-#define _simd16_min_epu32         _mm512_min_epu32
-#define _simd16_max_epu32         _mm512_max_epu32
-#define _simd16_add_epi32         _mm512_add_epi32
-
-#define _simd16_and_si            _mm512_and_si512
-#define _simd16_andnot_si         _mm512_andnot_si512
-#define _simd16_or_si             _mm512_or_si512
-#define _simd16_xor_si            _mm512_xor_si512
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
-{
-    simd16mask k = _mm512_cmpeq_epi32_mask(a, b);
-
-    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
-{
-    simd16mask k = _mm512_cmpgt_epi32_mask(a, b);
-
-    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
-{
-    simd16mask k = _mm512_cmplt_epi32_mask(a, b);
-
-    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
-{
-    int lo = _simd_testz_ps(_simd16_extract_ps(a, 0), _simd16_extract_ps(b, 0));
-    int hi = _simd_testz_ps(_simd16_extract_ps(a, 1), _simd16_extract_ps(b, 1));
-
-    return lo & hi;
-}
-
-#define _simd16_unpacklo_ps       _mm512_unpacklo_ps
-#define _simd16_unpackhi_ps       _mm512_unpackhi_ps
-#define _simd16_unpacklo_pd       _mm512_unpacklo_pd
-#define _simd16_unpackhi_pd       _mm512_unpackhi_pd
-#define _simd16_unpacklo_epi8     _mm512_unpacklo_epi8
-#define _simd16_unpackhi_epi8     _mm512_unpackhi_epi8
-#define _simd16_unpacklo_epi16    _mm512_unpacklo_epi16
-#define _simd16_unpackhi_epi16    _mm512_unpackhi_epi16
-#define _simd16_unpacklo_epi32    _mm512_unpacklo_epi32
-#define _simd16_unpackhi_epi32    _mm512_unpackhi_epi32
-#define _simd16_unpacklo_epi64    _mm512_unpacklo_epi64
-#define _simd16_unpackhi_epi64    _mm512_unpackhi_epi64
-#define _simd16_slli_epi32        _mm512_slli_epi32
-#define _simd16_srli_epi32        _mm512_srli_epi32
-#define _simd16_srai_epi32        _mm512_srai_epi32
-#define _simd16_fmadd_ps          _mm512_fmadd_ps
-#define _simd16_fmsub_ps          _mm512_fmsub_ps
-#define _simd16_adds_epu8         _mm512_adds_epu8
-#define _simd16_subs_epu8         _mm512_subs_epu8
-#define _simd16_add_epi8          _mm512_add_epi8
-#define _simd16_shuffle_epi8      _mm512_shuffle_epi8
-
-#define _simd16_fmadd_ps          _mm512_fmadd_ps
-#define _simd16_fmsub_ps          _mm512_fmsub_ps
-
-#define _simd16_i32gather_ps(m, index, scale)               _mm512_i32gather_ps(index, m, scale)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
-{
-    __mmask16 k = _mm512_cmpneq_epi32_mask(mask, _mm512_setzero_si512());
-
-    return _mm512_mask_i32gather_ps(a, k, index, m, scale);
-}
-
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
-
-#define _simd16_abs_epi32         _mm512_abs_epi32
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
-{
-    __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);
-
-    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
-{
-    __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);
-
-    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
-{
-    __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);
-
-    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
-{
-    __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);
-
-    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
-{
-    __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);
-
-    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
-{
-    __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);
-
-    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-#define _simd16_permute_ps(a, i)        _mm512_permutexvar_ps(i, a)
-#define _simd16_permute_epi32(a, i)     _mm512_permutexvar_epi32(i, a)
-#define _simd16_sllv_epi32              _mm512_srlv_epi32
-#define _simd16_srlv_epi32              _mm512_sllv_epi32
-#define _simd16_permute2f128_ps         _mm512_shuffle_f32x4
-#define _simd16_permute2f128_pd         _mm512_shuffle_f64x2
-#define _simd16_permute2f128_si         _mm512_shuffle_i32x4
-#define _simd16_shuffle_ps              _mm512_shuffle_ps
-#define _simd16_shuffle_pd              _mm512_shuffle_pd
-#define _simd16_cvtepu8_epi16           _mm512_cvtepu8_epi16
-#define _simd16_cvtepu8_epi32           _mm512_cvtepu8_epi32
-#define _simd16_cvtepu16_epi32          _mm512_cvtepu16_epi32
-#define _simd16_packus_epi16            _mm512_packus_epi16
-#define _simd16_packs_epi16             _mm512_packs_epi16
-#define _simd16_packus_epi32            _mm512_packus_epi32
-#define _simd16_packs_epi32             _mm512_packs_epi32
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
-}
-
-#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
-}
-
-#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
-
-INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
-{
-    return _mm512_int2mask(mask);
-}
-
-INLINE int SIMDAPI _simd16_mask2int(simd16mask mask)
-{
-    return _mm512_mask2int(mask);
-}
-
-INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
-{
-    return _mm512_cmplt_ps_mask(a, b);
-}
-
-// convert bitmask to vector mask
-INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
-{
-    simd16scalari temp = _simd16_set1_epi32(mask);
-
-    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
-
-    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
-
-    return _simd16_castsi_ps(result);
-}
-
-#endif//ENABLE_AVX512_EMULATION
-
 #endif//ENABLE_AVX512_SIMD16
 
 #endif//__SWR_SIMD16INTRIN_H_
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 61c0c54..f95c109 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -24,751 +24,218 @@
 #ifndef __SWR_SIMDINTRIN_H__
 #define __SWR_SIMDINTRIN_H__
 
-#include "os.h"
-
-#include <cassert>
-
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <xmmintrin.h>
-
-#if KNOB_SIMD_WIDTH == 8 
-typedef __m256 simdscalar;
-typedef __m256i simdscalari;
-typedef uint8_t simdmask;
-#else
-#error Unsupported vector width
-#endif
-
-// simd vector
-OSALIGNSIMD(union) simdvector
-{
-    simdscalar  v[4];
-    struct
-    {
-        simdscalar x, y, z, w;
-    };
-
-    simdscalar& operator[] (const int i) { return v[i]; }
-    const simdscalar& operator[] (const int i) const { return v[i]; }
-};
+#include "common/intrin.h"
+#include "common/simdlib.hpp"
 
 #if KNOB_SIMD_WIDTH == 8
-#define _simd128_maskstore_ps _mm_maskstore_ps
-#define _simd_load_ps _mm256_load_ps
-#define _simd_load1_ps _mm256_broadcast_ss
-#define _simd_loadu_ps _mm256_loadu_ps
-#define _simd_setzero_ps _mm256_setzero_ps
-#define _simd_set1_ps   _mm256_set1_ps
-#define _simd_blend_ps  _mm256_blend_ps
-#define _simd_blendv_ps _mm256_blendv_ps
-#define _simd_store_ps _mm256_store_ps
-#define _simd_mul_ps _mm256_mul_ps
-#define _simd_add_ps _mm256_add_ps
-#define _simd_sub_ps _mm256_sub_ps
-#define _simd_rsqrt_ps _mm256_rsqrt_ps
-#define _simd_min_ps _mm256_min_ps
-#define _simd_max_ps _mm256_max_ps
-#define _simd_movemask_ps _mm256_movemask_ps
-#define _simd_cvtps_epi32 _mm256_cvtps_epi32
-#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-#define _simd_cvtepi32_ps _mm256_cvtepi32_ps
-#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
-#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
-#define _simd_and_ps _mm256_and_ps
-#define _simd_or_ps _mm256_or_ps
-
-#define _simd_rcp_ps _mm256_rcp_ps
-#define _simd_div_ps _mm256_div_ps
-#define _simd_castsi_ps _mm256_castsi256_ps
-#define _simd_andnot_ps _mm256_andnot_ps
-#define _simd_round_ps _mm256_round_ps
-#define _simd_castpd_ps _mm256_castpd_ps
-#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
-#define _simd_stream_ps _mm256_stream_ps
-
-#define _simd_load_sd _mm256_load_sd
-#define _simd_movemask_pd _mm256_movemask_pd
-#define _simd_castsi_pd _mm256_castsi256_pd
-
-// emulated integer simd
-#define SIMD_EMU_EPI(func, intrin) \
-INLINE \
-__m256i func(__m256i a, __m256i b)\
-{\
-    __m128i aHi = _mm256_extractf128_si256(a, 1);\
-    __m128i bHi = _mm256_extractf128_si256(b, 1);\
-    __m128i aLo = _mm256_castsi256_si128(a);\
-    __m128i bLo = _mm256_castsi256_si128(b);\
-\
-    __m128i subLo = intrin(aLo, bLo);\
-    __m128i subHi = intrin(aHi, bHi);\
-\
-    __m256i result = _mm256_castsi128_si256(subLo);\
-            result = _mm256_insertf128_si256(result, subHi, 1);\
-\
-    return result;\
-}
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-INLINE
-__m256 _simdemu_permute_ps(__m256 a, __m256i b)
-{
-    __m128 aHi = _mm256_extractf128_ps(a, 1);
-    __m128i bHi = _mm256_extractf128_si256(b, 1);
-    __m128 aLo = _mm256_castps256_ps128(a);
-    __m128i bLo = _mm256_castsi256_si128(b);
-
-    __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
-    __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
-    __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
-    __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
-
-    indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
-    resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
-    resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
-    __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
-
-    __m256 result = _mm256_castps128_ps256(blendLowRes);
-    result = _mm256_insertf128_ps(result, blendHiRes, 1);
-
-    return result;
-}
-
-INLINE
-__m256i _simdemu_permute_epi32(__m256i a, __m256i b)
-{
-    return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
-}
-
-INLINE
-__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
-{
-    int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
-    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
-    aHi = _mm_extract_epi32(vAHi, 0);
-    countHi = _mm_extract_epi32(vCountHi, 0);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
-    aLow = _mm_extract_epi32(vALow, 0);
-    countLow = _mm_extract_epi32(vCountLow, 0);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 0);
-
-    aHi = _mm_extract_epi32(vAHi, 1);
-    countHi = _mm_extract_epi32(vCountHi, 1);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
-    aLow = _mm_extract_epi32(vALow, 1);
-    countLow = _mm_extract_epi32(vCountLow, 1);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 1);
-
-    aHi = _mm_extract_epi32(vAHi, 2);
-    countHi = _mm_extract_epi32(vCountHi, 2);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
-    aLow = _mm_extract_epi32(vALow, 2);
-    countLow = _mm_extract_epi32(vCountLow, 2);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 2);
-
-    aHi = _mm_extract_epi32(vAHi, 3);
-    countHi = _mm_extract_epi32(vCountHi, 3);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
-    aLow = _mm_extract_epi32(vALow, 3);
-    countLow = _mm_extract_epi32(vCountLow, 3);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 3);
-
-    __m256i ret = _mm256_set1_epi32(0);
-    ret = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret = _mm256_insertf128_si256(ret, vALow, 0);
-    return ret;
-}
-
-
-INLINE
-__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
-{
-    int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
-    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
-    aHi = _mm_extract_epi32(vAHi, 0);
-    countHi = _mm_extract_epi32(vCountHi, 0);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
-    aLow = _mm_extract_epi32(vALow, 0);
-    countLow = _mm_extract_epi32(vCountLow, 0);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 0);
-
-    aHi = _mm_extract_epi32(vAHi, 1);
-    countHi = _mm_extract_epi32(vCountHi, 1);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
-    aLow = _mm_extract_epi32(vALow, 1);
-    countLow = _mm_extract_epi32(vCountLow, 1);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 1);
-
-    aHi = _mm_extract_epi32(vAHi, 2);
-    countHi = _mm_extract_epi32(vCountHi, 2);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
-    aLow = _mm_extract_epi32(vALow, 2);
-    countLow = _mm_extract_epi32(vCountLow, 2);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 2);
-
-    aHi = _mm_extract_epi32(vAHi, 3);
-    countHi = _mm_extract_epi32(vCountHi, 3);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
-    aLow = _mm_extract_epi32(vALow, 3);
-    countLow = _mm_extract_epi32(vCountLow, 3);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 3);
-
-    __m256i ret = _mm256_set1_epi32(0);
-    ret = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret = _mm256_insertf128_si256(ret, vALow, 0);
-    return ret;
-}
-
-#define _simd_mul_epi32 _simdemu_mul_epi32
-#define _simd_mullo_epi32 _simdemu_mullo_epi32
-#define _simd_sub_epi32 _simdemu_sub_epi32
-#define _simd_sub_epi64 _simdemu_sub_epi64
-#define _simd_min_epi32 _simdemu_min_epi32
-#define _simd_min_epu32 _simdemu_min_epu32
-#define _simd_max_epi32 _simdemu_max_epi32
-#define _simd_max_epu32 _simdemu_max_epu32
-#define _simd_add_epi32 _simdemu_add_epi32
-#define _simd_and_si _simdemu_and_si
-#define _simd_andnot_si _simdemu_andnot_si
-#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
-#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
-#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
-#define _simd_or_si _simdemu_or_si
-#define _simd_xor_si _simdemu_xor_si
-#define _simd_castps_si _mm256_castps_si256
-#define _simd_adds_epu8 _simdemu_adds_epu8
-#define _simd_subs_epu8 _simdemu_subs_epu8
-#define _simd_add_epi8 _simdemu_add_epi8
-#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
-#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
-#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
-#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
-#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
-#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
-#define _simd_movemask_epi8 _simdemu_movemask_epi8
-#define _simd_permute_ps _simdemu_permute_ps
-#define _simd_permute_epi32 _simdemu_permute_epi32
-#define _simd_srlv_epi32 _simdemu_srlv_epi32
-#define _simd_sllv_epi32 _simdemu_sllv_epi32
-
-SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
-SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
-SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
-SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
-SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
-SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
-SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
-SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
-SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
-SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
-SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
-SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
-SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
-SIMD_EMU_EPI(_simdemu_xor_si, _mm_xor_si128)
-SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
-SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
-SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
-SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
-SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
-SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
-SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)
-
-#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
-#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
-#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
-#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
-#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
-#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
-#define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
-#define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
-
-#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
-#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
-#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
-#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
-
-#define _simd128_fmadd_ps _mm_fmaddemu_ps
-#define _simd_fmadd_ps _mm_fmaddemu256_ps
-#define _simd_fmsub_ps _mm_fmsubemu256_ps
-#define _simd_shuffle_epi8 _simdemu_shuffle_epi8 
-SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
-
-INLINE
-__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
-{
-    __m128 res = _mm_mul_ps(a, b);
-    res = _mm_add_ps(res, c);
-    return res;
-}
-
-INLINE
-__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
-{
-    __m256 res = _mm256_mul_ps(a, b);
-    res = _mm256_add_ps(res, c);
-    return res;
-}
-
-INLINE
-__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
-{
-    __m256 res = _mm256_mul_ps(a, b);
-    res = _mm256_sub_ps(res, c);
-    return res;
-}
-
-INLINE
-__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
-{
-    uint32_t *pOffsets = (uint32_t*)&vOffsets;
-    simdscalar vResult;
-    float* pResult = (float*)&vResult;
-    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
-    {
-        uint32_t offset = pOffsets[i];
-        offset = offset * scale;
-        pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
-    }
-
-    return vResult;
-}
-
-INLINE
-__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
-{
-    uint32_t *pOffsets = (uint32_t*)&vOffsets;
-    simdscalar vResult = vSrc;
-    float* pResult = (float*)&vResult;
-    DWORD index;
-    uint32_t mask = _simd_movemask_ps(vMask);
-    while (_BitScanForward(&index, mask))
-    {
-        mask &= ~(1 << index);
-        uint32_t offset = pOffsets[index];
-        offset = offset * scale;
-        pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
-    }
-
-    return vResult;
-}
-
-INLINE
-__m256i _simd_abs_epi32(__m256i a)
-{
-        __m128i aHi = _mm256_extractf128_si256(a, 1);
-        __m128i aLo = _mm256_castsi256_si128(a);
-        __m128i absLo = _mm_abs_epi32(aLo);
-        __m128i absHi = _mm_abs_epi32(aHi);
-        __m256i result = _mm256_castsi128_si256(absLo);
-        result = _mm256_insertf128_si256(result, absHi, 1);
-        return result;
-}
-
-INLINE 
-int _simdemu_movemask_epi8(__m256i a)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    int resHi = _mm_movemask_epi8(aHi);
-    int resLo = _mm_movemask_epi8(aLo);
-
-    return (resHi << 16) | resLo;
-}
-
-INLINE
-__m256i _simd_cvtepu8_epi16(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu8_epi16(a);
-    __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu8_epi32(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu8_epi32(a);
-    __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu16_epi32(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu16_epi32(a);
-    __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packus_epi16(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packus_epi16(alo, blo);
-    __m128i resulthi = _mm_packus_epi16(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packs_epi16(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packs_epi16(alo, blo);
-    __m128i resulthi = _mm_packs_epi16(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packus_epi32(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packus_epi32(alo, blo);
-    __m128i resulthi = _mm_packus_epi32(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packs_epi32(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packs_epi32(alo, blo);
-    __m128i resulthi = _mm_packs_epi32(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
+typedef SIMD256                             SIMD;
 #else
+#error Unsupported vector width
+#endif//KNOB_SIMD_WIDTH == 8
 
-#define _simd_mul_epi32 _mm256_mul_epi32
-#define _simd_mullo_epi32 _mm256_mullo_epi32
-#define _simd_sub_epi32 _mm256_sub_epi32
-#define _simd_sub_epi64 _mm256_sub_epi64
-#define _simd_min_epi32 _mm256_min_epi32
-#define _simd_max_epi32 _mm256_max_epi32
-#define _simd_min_epu32 _mm256_min_epu32
-#define _simd_max_epu32 _mm256_max_epu32
-#define _simd_add_epi32 _mm256_add_epi32
-#define _simd_and_si _mm256_and_si256
-#define _simd_andnot_si _mm256_andnot_si256
-#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
-#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
-#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
-#define _simd_or_si _mm256_or_si256
-#define _simd_xor_si _mm256_xor_si256
-#define _simd_castps_si _mm256_castps_si256
 
-#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
-#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
-#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
-#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
-#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
-#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
-#define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
-#define _simd_unpackhi_epi64 _mm256_unpackhi_epi64
+#define _simd128_maskstore_ps               SIMD128::maskstore_ps
+#define _simd128_fmadd_ps                   SIMD128::fmadd_ps
 
-#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
-#define _simd_slli_epi32 _mm256_slli_epi32
-#define _simd_srai_epi32 _mm256_srai_epi32
-#define _simd_srli_epi32 _mm256_srli_epi32
-#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
-#define _simd128_fmadd_ps _mm_fmadd_ps
-#define _simd_fmadd_ps _mm256_fmadd_ps
-#define _simd_fmsub_ps _mm256_fmsub_ps
-#define _simd_shuffle_epi8 _mm256_shuffle_epi8 
-#define _simd_adds_epu8 _mm256_adds_epu8
-#define _simd_subs_epu8 _mm256_subs_epu8
-#define _simd_add_epi8 _mm256_add_epi8
-#define _simd_i32gather_ps _mm256_i32gather_ps
-#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
-#define _simd_abs_epi32 _mm256_abs_epi32
+#define _simd_load_ps                       SIMD::load_ps
+#define _simd_load1_ps                      SIMD::broadcast_ss
+#define _simd_loadu_ps                      SIMD::loadu_ps
+#define _simd_setzero_ps                    SIMD::setzero_ps
+#define _simd_set1_ps                       SIMD::set1_ps
+#define _simd_blend_ps(a, b, i)             SIMD::blend_ps<i>(a, b)
+#define _simd_blend_epi32(a, b, i)          SIMD::blend_epi32<i>(a, b)
+#define _simd_blendv_ps                     SIMD::blendv_ps
+#define _simd_store_ps                      SIMD::store_ps
+#define _simd_mul_ps                        SIMD::mul_ps
+#define _simd_add_ps                        SIMD::add_ps
+#define _simd_sub_ps                        SIMD::sub_ps
+#define _simd_rsqrt_ps                      SIMD::rsqrt_ps
+#define _simd_min_ps                        SIMD::min_ps
+#define _simd_max_ps                        SIMD::max_ps
+#define _simd_movemask_ps                   SIMD::movemask_ps
+#define _simd_cvtps_epi32                   SIMD::cvtps_epi32
+#define _simd_cvttps_epi32                  SIMD::cvttps_epi32
+#define _simd_cvtepi32_ps                   SIMD::cvtepi32_ps
+#define _simd_cmplt_ps                      SIMD::cmplt_ps
+#define _simd_cmpgt_ps                      SIMD::cmpgt_ps
+#define _simd_cmpneq_ps                     SIMD::cmpneq_ps
+#define _simd_cmpeq_ps                      SIMD::cmpeq_ps
+#define _simd_cmpge_ps                      SIMD::cmpge_ps
+#define _simd_cmple_ps                      SIMD::cmple_ps
+#define _simd_cmp_ps(a, b, imm)             SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
+#define _simd_and_ps                        SIMD::and_ps
+#define _simd_or_ps                         SIMD::or_ps
+#define _simd_rcp_ps                        SIMD::rcp_ps
+#define _simd_div_ps                        SIMD::div_ps
+#define _simd_castsi_ps                     SIMD::castsi_ps
+#define _simd_castps_pd                     SIMD::castps_pd
+#define _simd_castpd_ps                     SIMD::castpd_ps
+#define _simd_andnot_ps                     SIMD::andnot_ps
+#define _simd_round_ps(a, i)                SIMD::round_ps<SIMD::RoundMode(i)>(a)
+#define _simd_castpd_ps                     SIMD::castpd_ps
+#define _simd_broadcast_ps(a)               SIMD::broadcast_ps((SIMD128::Float const *)(a))
+#define _simd_stream_ps                     SIMD::stream_ps
 
-#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
-#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
-#define _simd_cmpgt_epi8  _mm256_cmpgt_epi8
-#define _simd_cmpeq_epi8  _mm256_cmpeq_epi8
-#define _simd_cmpgt_epi16  _mm256_cmpgt_epi16
-#define _simd_cmpeq_epi16  _mm256_cmpeq_epi16
-#define _simd_movemask_epi8 _mm256_movemask_epi8
-#define _simd_permute_ps _mm256_permutevar8x32_ps
-#define _simd_permute_epi32 _mm256_permutevar8x32_epi32
-#define _simd_srlv_epi32 _mm256_srlv_epi32
-#define _simd_sllv_epi32 _mm256_sllv_epi32
-#define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
-#define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
-#define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
-#define _simd_packus_epi16 _mm256_packus_epi16
-#define _simd_packs_epi16 _mm256_packs_epi16
-#define _simd_packus_epi32 _mm256_packus_epi32
-#define _simd_packs_epi32 _mm256_packs_epi32
+#define _simd_movemask_pd                   SIMD::movemask_pd
+#define _simd_castsi_pd                     SIMD::castsi_pd
 
-#endif
+#define _simd_mul_epi32                     SIMD::mul_epi32
+#define _simd_mullo_epi32                   SIMD::mullo_epi32
+#define _simd_sub_epi32                     SIMD::sub_epi32
+#define _simd_sub_epi64                     SIMD::sub_epi64
+#define _simd_min_epi32                     SIMD::min_epi32
+#define _simd_min_epu32                     SIMD::min_epu32
+#define _simd_max_epi32                     SIMD::max_epi32
+#define _simd_max_epu32                     SIMD::max_epu32
+#define _simd_add_epi32                     SIMD::add_epi32
+#define _simd_and_si                        SIMD::and_si
+#define _simd_andnot_si                     SIMD::andnot_si
+#define _simd_cmpeq_epi32                   SIMD::cmpeq_epi32
+#define _simd_cmplt_epi32                   SIMD::cmplt_epi32
+#define _simd_cmpgt_epi32                   SIMD::cmpgt_epi32
+#define _simd_or_si                         SIMD::or_si
+#define _simd_xor_si                        SIMD::xor_si
+#define _simd_castps_si                     SIMD::castps_si
+#define _simd_adds_epu8                     SIMD::adds_epu8
+#define _simd_subs_epu8                     SIMD::subs_epu8
+#define _simd_add_epi8                      SIMD::add_epi8
+#define _simd_cmpeq_epi64                   SIMD::cmpeq_epi64
+#define _simd_cmpgt_epi64                   SIMD::cmpgt_epi64
+#define _simd_cmpgt_epi8                    SIMD::cmpgt_epi8
+#define _simd_cmpeq_epi8                    SIMD::cmpeq_epi8
+#define _simd_cmpgt_epi16                   SIMD::cmpgt_epi16
+#define _simd_cmpeq_epi16                   SIMD::cmpeq_epi16
+#define _simd_movemask_epi8                 SIMD::movemask_epi8
+#define _simd_permute_ps                    SIMD::permute_ps
+#define _simd_permute_epi32                 SIMD::permute_epi32
+#define _simd_srlv_epi32                    SIMD::srlv_epi32
+#define _simd_sllv_epi32                    SIMD::sllv_epi32
 
-#define _simd_unpacklo_ps _mm256_unpacklo_ps
-#define _simd_unpackhi_ps _mm256_unpackhi_ps
-#define _simd_unpacklo_pd _mm256_unpacklo_pd
-#define _simd_unpackhi_pd _mm256_unpackhi_pd
-#define _simd_insertf128_ps _mm256_insertf128_ps
-#define _simd_insertf128_pd _mm256_insertf128_pd
-#define _simd_insertf128_si _mm256_insertf128_si256
-#define _simd_extractf128_ps _mm256_extractf128_ps
-#define _simd_extractf128_pd _mm256_extractf128_pd
-#define _simd_extractf128_si _mm256_extractf128_si256
-#define _simd_permute2f128_ps _mm256_permute2f128_ps
-#define _simd_permute2f128_pd _mm256_permute2f128_pd
-#define _simd_permute2f128_si _mm256_permute2f128_si256
-#define _simd_shuffle_ps _mm256_shuffle_ps
-#define _simd_shuffle_pd _mm256_shuffle_pd
-#define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
-#define _simd_shuffle_epi64(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
-#define _simd_set1_epi32 _mm256_set1_epi32
-#define _simd_set_epi32 _mm256_set_epi32
-#define _simd_set1_epi8 _mm256_set1_epi8
-#define _simd_setzero_si _mm256_setzero_si256
-#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-#define _simd_store_si _mm256_store_si256
-#define _simd_broadcast_ss _mm256_broadcast_ss
-#define _simd_maskstore_ps _mm256_maskstore_ps
-#define _simd_load_si _mm256_load_si256
-#define _simd_loadu_si _mm256_loadu_si256
-#define _simd_sub_ps _mm256_sub_ps
-#define _simd_testz_ps _mm256_testz_ps
-#define _simd_testz_si _mm256_testz_si256
-#define _simd_xor_ps _mm256_xor_ps
+#define _simd_unpacklo_epi8                 SIMD::unpacklo_epi8
+#define _simd_unpackhi_epi8                 SIMD::unpackhi_epi8
+#define _simd_unpacklo_epi16                SIMD::unpacklo_epi16
+#define _simd_unpackhi_epi16                SIMD::unpackhi_epi16
+#define _simd_unpacklo_epi32                SIMD::unpacklo_epi32
+#define _simd_unpackhi_epi32                SIMD::unpackhi_epi32
+#define _simd_unpacklo_epi64                SIMD::unpacklo_epi64
+#define _simd_unpackhi_epi64                SIMD::unpackhi_epi64
 
-INLINE
-simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
+#define _simd_slli_epi32(a,i)               SIMD::slli_epi32<i>(a)
+#define _simd_srai_epi32(a,i)               SIMD::srai_epi32<i>(a)
+#define _simd_srli_epi32(a,i)               SIMD::srli_epi32<i>(a)
+#define _simd_srlisi_ps(a,i)                SIMD::srlisi_ps<i>(a)
+
+#define _simd_fmadd_ps                      SIMD::fmadd_ps
+#define _simd_fmsub_ps                      SIMD::fmsub_ps
+#define _simd_shuffle_epi8                  SIMD::shuffle_epi8
+
+#define _simd_i32gather_ps(p, o, s)         SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
+#define _simd_mask_i32gather_ps(r, p, o, m, s) SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
+#define _simd_abs_epi32                     SIMD::abs_epi32
+
+#define _simd_cvtepu8_epi16                 SIMD::cvtepu8_epi16
+#define _simd_cvtepu8_epi32                 SIMD::cvtepu8_epi32
+#define _simd_cvtepu16_epi32                SIMD::cvtepu16_epi32
+#define _simd_cvtepu16_epi64                SIMD::cvtepu16_epi64
+#define _simd_cvtepu32_epi64                SIMD::cvtepu32_epi64
+
+#define _simd_packus_epi16                  SIMD::packus_epi16
+#define _simd_packs_epi16                   SIMD::packs_epi16
+#define _simd_packus_epi32                  SIMD::packus_epi32
+#define _simd_packs_epi32                   SIMD::packs_epi32
+
+#define _simd_unpacklo_ps                   SIMD::unpacklo_ps
+#define _simd_unpackhi_ps                   SIMD::unpackhi_ps
+#define _simd_unpacklo_pd                   SIMD::unpacklo_pd
+#define _simd_unpackhi_pd                   SIMD::unpackhi_pd
+#define _simd_insertf128_ps                 SIMD::insertf128_ps
+#define _simd_insertf128_pd                 SIMD::insertf128_pd
+#define _simd_insertf128_si(a, b, i)        SIMD::insertf128_si<i>(a, b)
+#define _simd_extractf128_ps(a, i)          SIMD::extractf128_ps<i>(a)
+#define _simd_extractf128_pd(a, i)          SIMD::extractf128_pd<i>(a)
+#define _simd_extractf128_si(a, i)          SIMD::extractf128_si<i>(a)
+#define _simd_permute2f128_ps(a, b, i)      SIMD::permute2f128_ps<i>(a, b)
+#define _simd_permute2f128_pd(a, b, i)      SIMD::permute2f128_pd<i>(a, b)
+#define _simd_permute2f128_si(a, b, i)      SIMD::permute2f128_si<i>(a, b)
+#define _simd_shuffle_ps(a, b, i)           SIMD::shuffle_ps<i>(a, b)
+#define _simd_shuffle_pd(a, b, i)           SIMD::shuffle_pd<i>(a, b)
+#define _simd_shuffle_epi32(a, b, imm8)     SIMD::shuffle_epi32<imm8>(a, b)
+#define _simd_shuffle_epi64(a, b, imm8)     SIMD::shuffle_epi64<imm8>(a, b)
+#define _simd_set1_epi32                    SIMD::set1_epi32
+#define _simd_set_epi32                     SIMD::set_epi32
+#define _simd_set_ps                        SIMD::set_ps
+#define _simd_set1_epi8                     SIMD::set1_epi8
+#define _simd_setzero_si                    SIMD::setzero_si
+#define _simd_cvttps_epi32                  SIMD::cvttps_epi32
+#define _simd_store_si                      SIMD::store_si
+#define _simd_broadcast_ss                  SIMD::broadcast_ss
+#define _simd_maskstore_ps                  SIMD::maskstore_ps
+#define _simd_load_si                       SIMD::load_si
+#define _simd_loadu_si                      SIMD::loadu_si
+#define _simd_sub_ps                        SIMD::sub_ps
+#define _simd_testz_ps                      SIMD::testz_ps
+#define _simd_testz_si                      SIMD::testz_si
+#define _simd_xor_ps                        SIMD::xor_ps
+
+#define _simd_loadu2_si                     SIMD::loadu2_si
+#define _simd_storeu2_si                    SIMD::storeu2_si
+
+#define _simd_blendv_epi32                  SIMD::blendv_epi32
+
+template<int mask> SIMDINLINE
+SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b)
 {
-    __m128i lo = _mm_loadu_si128(loaddr);
-    __m128i hi = _mm_loadu_si128(hiaddr);
-
-    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-}
-
-INLINE
-void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
-{
-    _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
-    _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
-}
-
-INLINE
-simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
-{
-    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
-}
-
-INLINE
-simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
-{
-    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
-}
-
-template<int mask>
-INLINE
-__m128i _simd_blend4_epi32(__m128i a, __m128i b)
-{
-    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), mask));
+    return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
 }
 
 // convert bitmask to vector mask
-INLINE
-simdscalar vMask(int32_t mask)
+SIMDINLINE
+SIMD256::Float vMask(int32_t mask)
 {
-    __m256i vec = _mm256_set1_epi32(mask);
-    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-    vec = _simd_and_si(vec, bit);
-    vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
-    return _simd_castsi_ps(vec);
+    SIMD256::Integer vec = SIMD256::set1_epi32(mask);
+    const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec = SIMD256::and_si(vec, bit);
+    vec = SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
+    return SIMD256::castsi_ps(vec);
 }
 
-INLINE
-simdscalari vMaski(int32_t mask)
+SIMDINLINE
+SIMD256::Integer vMaski(int32_t mask)
 {
-    __m256i vec = _mm256_set1_epi32(mask);
-    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-    vec = _simd_and_si(vec, bit);
-    return _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+    SIMD256::Integer vec = SIMD256::set1_epi32(mask);
+    const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec = SIMD256::and_si(vec, bit);
+    return SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
 }
 
-INLINE
+SIMDINLINE
 void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
 {
     OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
-    _mm256_store_ps(rArray, r);
-    _mm256_store_ps(sArray, s);
+    SIMD256::store_ps(rArray, r);
+    SIMD256::store_ps(sArray, s);
     rArray[rlane] = sArray[slane];
-    r = _mm256_load_ps(rArray);
+    r = SIMD256::load_ps(rArray);
 }
 
-INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    __m128i resHi = _mm_slli_epi32(aHi, i);
-    __m128i resLo = _mm_slli_epi32(aLo, i);
-
-    __m256i result = _mm256_castsi128_si256(resLo);
-            result = _mm256_insertf128_si256(result, resHi, 1);
-
-    return result;
-}
-
-INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    __m128i resHi = _mm_srai_epi32(aHi, i);
-    __m128i resLo = _mm_srai_epi32(aLo, i);
-
-    __m256i result = _mm256_castsi128_si256(resLo);
-            result = _mm256_insertf128_si256(result, resHi, 1);
-
-    return result;
-}
-
-INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    __m128i resHi = _mm_srli_epi32(aHi, i);
-    __m128i resLo = _mm_srli_epi32(aLo, i);
-
-    __m256i result = _mm256_castsi128_si256(resLo);
-    result = _mm256_insertf128_si256(result, resHi, 1);
-
-    return result;
-}
-
-INLINE
-void _simdvec_transpose(simdvector &v)
-{
-    SWR_INVALID("Need to implement 8 wide version");
-}
-
-#else
-#error Unsupported vector width
-#endif
-
 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
-INLINE
-void _simdvec_load_ps(simdvector& r, const float *p)
-{
-    r[0] = _simd_set1_ps(p[0]);
-    r[1] = _simd_set1_ps(p[1]);
-    r[2] = _simd_set1_ps(p[2]);
-    r[3] = _simd_set1_ps(p[3]);
-}
+#define _simdvec_load_ps SIMD::vec4_load1_ps
 
-INLINE
+SIMDINLINE
 void _simdvec_mov(simdvector& r, const simdscalar& s)
 {
-    r[0] = s;
-    r[1] = s;
-    r[2] = s;
-    r[3] = s;
+    SIMD::vec4_set1_vps(r, s);
 }
 
-INLINE
+SIMDINLINE
 void _simdvec_mov(simdvector& r, const simdvector& v)
 {
-    r[0] = v[0];
-    r[1] = v[1];
-    r[2] = v[2];
-    r[3] = v[3];
+    r = v;
 }
 
 #if 0
 // just move a lane from the source simdvector to dest simdvector
-INLINE
+SIMDINLINE
 void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
 {
     _simd_mov(r[0], rlane, s[0], slane);
@@ -778,330 +245,23 @@
 }
 
 #endif
-INLINE
-void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
-{
-    simdscalar tmp;
-    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
 
-    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
-
-    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-}
-
-INLINE
-void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
-{
-    simdscalar tmp;
-    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
-
-    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
-
-    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-
-    tmp = _simd_mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-}
-
-INLINE
-simdscalar _simdvec_rcp_length_ps(const simdvector& v)
-{
-    simdscalar length;
-    _simdvec_dp4_ps(length, v, v);
-    return _simd_rsqrt_ps(length);
-}
-
-INLINE
-void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
-{
-    simdscalar vecLength;
-    vecLength = _simdvec_rcp_length_ps(v);
-
-    r[0] = _simd_mul_ps(v[0], vecLength);
-    r[1] = _simd_mul_ps(v[1], vecLength);
-    r[2] = _simd_mul_ps(v[2], vecLength);
-    r[3] = _simd_mul_ps(v[3], vecLength);
-}
-
-INLINE
-void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
-{
-    r[0] = _simd_mul_ps(v[0], s);
-    r[1] = _simd_mul_ps(v[1], s);
-    r[2] = _simd_mul_ps(v[2], s);
-    r[3] = _simd_mul_ps(v[3], s);
-}
-
-INLINE
-void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
-{
-    r[0] = _simd_mul_ps(v0[0], v1[0]);
-    r[1] = _simd_mul_ps(v0[1], v1[1]);
-    r[2] = _simd_mul_ps(v0[2], v1[2]);
-    r[3] = _simd_mul_ps(v0[3], v1[3]);
-}
-
-INLINE
-void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
-{
-    r[0] = _simd_add_ps(v0[0], v1[0]);
-    r[1] = _simd_add_ps(v0[1], v1[1]);
-    r[2] = _simd_add_ps(v0[2], v1[2]);
-    r[3] = _simd_add_ps(v0[3], v1[3]);
-}
-
-INLINE
-void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
-{
-    r[0] = _simd_min_ps(v0[0], s);
-    r[1] = _simd_min_ps(v0[1], s);
-    r[2] = _simd_min_ps(v0[2], s);
-    r[3] = _simd_min_ps(v0[3], s);
-}
-
-INLINE
-void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
-{
-    r[0] = _simd_max_ps(v0[0], s);
-    r[1] = _simd_max_ps(v0[1], s);
-    r[2] = _simd_max_ps(v0[2], s);
-    r[3] = _simd_max_ps(v0[3], s);
-}
-
-// Matrix4x4 * Vector4
-//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
-//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
-//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
-//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
-INLINE
-void _simd_mat4x4_vec4_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[2] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[3] = r0;
-}
-
-// Matrix4x4 * Vector3 - Direction Vector where w = 0.
-//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
-//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
-//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
-//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
-INLINE
-void _simd_mat3x3_vec3_w0_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    result[2] = r0;
-
-    result[3] = _simd_setzero_ps();
-}
-
-// Matrix4x4 * Vector3 - Position vector where w = 1.
-//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
-//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
-//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
-//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
-INLINE
-void _simd_mat4x4_vec3_w1_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[2] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
-    result[3]   = _simd_add_ps(r0, m);          // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-}
-
-INLINE
-void _simd_mat4x3_vec3_w1_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[2] = r0;
-    result[3] = _simd_set1_ps(1.0f);
-}
+#define _simdvec_dp3_ps                 SIMD::vec4_dp3_ps
+#define _simdvec_dp4_ps                 SIMD::vec4_dp4_ps
+#define _simdvec_rcp_length_ps          SIMD::vec4_rcp_length_ps
+#define _simdvec_normalize_ps           SIMD::vec4_normalize_ps
+#define _simdvec_mul_ps                 SIMD::vec4_mul_ps
+#define _simdvec_add_ps                 SIMD::vec4_add_ps
+#define _simdvec_min_ps                 SIMD::vec4_min_ps
+#define _simdvec_max_ps                 SIMD::vec4_max_ps
+#define _simd_mat4x4_vec4_multiply      SIMD::mat4x4_vec4_multiply
+#define _simd_mat3x3_vec3_w0_multiply   SIMD::mat3x3_vec3_w0_multiply
+#define _simd_mat4x4_vec3_w1_multiply   SIMD::mat4x4_vec3_w1_multiply
+#define _simd_mat4x3_vec3_w1_multiply   SIMD::mat4x3_vec3_w1_multiply
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
+SIMDINLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
 {
     simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
     vOut = _simd_fmadd_ps(vB, vY, vOut);
@@ -1110,9 +270,9 @@
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
+SIMDINLINE simd4scalar vplaneps(simd4scalar vA, simd4scalar vB, simd4scalar vC, simd4scalar &vX, simd4scalar &vY)
 {
-    __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
+    simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
     vOut = _simd128_fmadd_ps(vB, vY, vOut);
     return vOut;
 }
@@ -1123,7 +283,7 @@
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
@@ -1143,7 +303,7 @@
 /// @brief Interpolates a single component (flat shade).
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
 
@@ -1158,90 +318,34 @@
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
+static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar vI, simd4scalar vJ, const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
     const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
 
-    __m128 vA = _mm_broadcast_ss(pInterpA);
-    __m128 vB = _mm_broadcast_ss(pInterpB);
-    __m128 vC = _mm_broadcast_ss(pInterpC);
+    simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
+    simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
+    simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
 
-    __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
-    vC = _mm_mul_ps(vk, vC);
+    simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
+    vC = SIMD128::mul_ps(vk, vC);
 
-    return vplaneps128(vA, vB, vC, vI, vJ);
+    return vplaneps(vA, vB, vC, vI, vJ);
 }
 
-static INLINE __m128 _simd128_abs_ps(__m128 a)
+static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar a)
 {
-    __m128i ai = _mm_castps_si128(a);
-    return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
+    simd4scalari ai = SIMD128::castps_si(a);
+    return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
 }
 
-static INLINE simdscalar _simd_abs_ps(simdscalar a)
+static SIMDINLINE simdscalar _simd_abs_ps(simdscalar a)
 {
     simdscalari ai = _simd_castps_si(a);
     return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
 }
 
-INLINE
-UINT pdep_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH >= KNOB_ARCH_AVX2
-    return _pdep_u32(a, mask);
-#else
-    UINT result = 0;
-
-    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html 
-    // using bsf instead of funky loop
-    DWORD maskIndex;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. populate LSB from src
-        const UINT LSB = (UINT)((int)(a << 31) >> 31);
-
-        // 3. copy bit from mask
-        result |= LSB & lowest;
-
-        // 4. clear lowest bit
-        mask &= ~lowest;
-
-        // 5. prepare for next iteration
-        a >>= 1;
-    }
-
-    return result;
-#endif
-}
-
-INLINE
-UINT pext_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH >= KNOB_ARCH_AVX2
-    return _pext_u32(a, mask);
-#else
-    UINT result = 0;
-    DWORD maskIndex;
-    uint32_t currentBit = 0;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. copy bit from mask
-        result |= ((a & lowest) > 0) << currentBit++;
-
-        // 3. clear lowest bit
-        mask &= ~lowest;
-    }
-    return result;
-#endif
-}
 
 #if ENABLE_AVX512_SIMD16
 #include "simd16intrin.h"
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
new file mode 100644
index 0000000..fb11132
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -0,0 +1,550 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+
+#include "simdlib_types.hpp"
+
+// For documentation, please see the following include...
+// #include "simdlib_interface.hpp"
+
+namespace SIMDImpl
+{
+    namespace SIMD128Impl
+    {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+        struct AVXImpl
+        {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_128_avx.inl"
+#undef __SIMD_LIB_AVX_HPP__
+        }; // struct AVXImpl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+        struct AVX2Impl : AVXImpl
+        {
+#define __SIMD_LIB_AVX2_HPP__
+#include "simdlib_128_avx2.inl"
+#undef __SIMD_LIB_AVX2_HPP__
+        }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+        struct AVX512Impl : AVX2Impl
+        {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_128_avx512.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+        }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+        struct Traits : SIMDImpl::Traits
+        {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+            using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+            using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+            using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+            using Float     = SIMD128Impl::Float;
+            using Double    = SIMD128Impl::Double;
+            using Integer   = SIMD128Impl::Integer;
+            using Vec4      = SIMD128Impl::Vec4;
+            using Mask      = SIMD128Impl::Mask;
+        };
+    } // ns SIMD128Impl
+
+    namespace SIMD256Impl
+    {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+        struct AVXImpl
+        {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_256_avx.inl"
+#undef __SIMD_LIB_AVX_HPP__
+        }; // struct AVXImpl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+        struct AVX2Impl : AVXImpl
+        {
+#define __SIMD_LIB_AVX2_HPP__
+#include "simdlib_256_avx2.inl"
+#undef __SIMD_LIB_AVX2_HPP__
+        }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+        struct AVX512Impl : AVX2Impl
+        {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_256_avx512.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+        }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+        struct Traits : SIMDImpl::Traits
+        {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+            using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+            using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+            using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+            using Float     = SIMD256Impl::Float;
+            using Double    = SIMD256Impl::Double;
+            using Integer   = SIMD256Impl::Integer;
+            using Vec4      = SIMD256Impl::Vec4;
+            using Mask      = SIMD256Impl::Mask;
+        };
+    } // ns SIMD256Impl
+
+    namespace SIMD512Impl
+    {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+        template<typename SIMD256T>
+        struct AVXImplBase
+        {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_512_emu.inl"
+#include "simdlib_512_emu_masks.inl"
+#undef __SIMD_LIB_AVX_HPP__
+        }; // struct AVXImplBase
+        using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+        using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+        struct AVX512Impl
+        {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_512_avx512.inl"
+#include "simdlib_512_avx512_masks.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+        }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+        struct Traits : SIMDImpl::Traits
+        {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+            using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+            using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+            using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+            using Float     = SIMD512Impl::Float;
+            using Double    = SIMD512Impl::Double;
+            using Integer   = SIMD512Impl::Integer;
+            using Vec4      = SIMD512Impl::Vec4;
+            using Mask      = SIMD512Impl::Mask;
+        };
+    } // ns SIMD512Impl
+} // ns SIMDImpl
+
+template <typename Traits>
+struct SIMDBase : Traits::IsaImpl
+{
+    using CompareType   = typename Traits::CompareType;
+    using ScaleFactor   = typename Traits::ScaleFactor;
+    using RoundMode     = typename Traits::RoundMode;
+    using SIMD          = typename Traits::IsaImpl;
+    using Float         = typename Traits::Float;
+    using Double        = typename Traits::Double;
+    using Integer       = typename Traits::Integer;
+    using Vec4          = typename Traits::Vec4;
+    using Mask          = typename Traits::Mask;
+
+    // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
+    static SIMDINLINE
+    void vec4_load1_ps(Vec4& r, const float *p)
+    {
+        r[0] = SIMD::set1_ps(p[0]);
+        r[1] = SIMD::set1_ps(p[1]);
+        r[2] = SIMD::set1_ps(p[2]);
+        r[3] = SIMD::set1_ps(p[3]);
+    }
+
+    static SIMDINLINE
+    void vec4_set1_vps(Vec4& r, Float s)      // Copies the SIMD value s into each of the four components of r.
+    {
+        r[0] = s;
+        r[1] = s;
+        r[2] = s;
+        r[3] = s;
+    }
+
+    static SIMDINLINE
+    Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)     // 3-component dot product; v0[3] and v1[3] are not read.
+    {
+        Float tmp, r;
+        r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+
+        tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
+
+        tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+
+        return r;
+    }
+
+    static SIMDINLINE
+    Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)     // 4-component dot product (x, y, z and w).
+    {
+        Float tmp, r;
+        r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+
+        tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
+
+        tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+
+        tmp = SIMD::mul_ps(v0[3], v1[3]);     // (v0.w*v1.w)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
+
+        return r;
+    }
+
+    static SIMDINLINE
+    Float vec4_rcp_length_ps(const Vec4& v)
+    {
+        Float length = vec4_dp4_ps(v, v);     // dot(v, v): despite the name this is the squared length, with w included
+        return SIMD::rsqrt_ps(length);        // presumably a per-lane reciprocal square root - confirm precision in the ISA impl
+    }
+
+    static SIMDINLINE
+    void vec4_normalize_ps(Vec4& r, const Vec4& v)
+    {
+        Float rcpLength = vec4_rcp_length_ps(v);      // derived from the 4-component dot product, so v.w contributes
+
+        r[0] = SIMD::mul_ps(v[0], rcpLength);
+        r[1] = SIMD::mul_ps(v[1], rcpLength);
+        r[2] = SIMD::mul_ps(v[2], rcpLength);
+        r[3] = SIMD::mul_ps(v[3], rcpLength);         // w is scaled as well, not passed through
+    }
+
+    static SIMDINLINE
+    void vec4_mul_ps(Vec4& r, const Vec4& v, Float s)
+    {
+        r[0] = SIMD::mul_ps(v[0], s);
+        r[1] = SIMD::mul_ps(v[1], s);
+        r[2] = SIMD::mul_ps(v[2], s);
+        r[3] = SIMD::mul_ps(v[3], s);
+    }
+
+    static SIMDINLINE
+    void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+    {
+        r[0] = SIMD::mul_ps(v0[0], v1[0]);
+        r[1] = SIMD::mul_ps(v0[1], v1[1]);
+        r[2] = SIMD::mul_ps(v0[2], v1[2]);
+        r[3] = SIMD::mul_ps(v0[3], v1[3]);
+    }
+
+    static SIMDINLINE
+    void vec4_add_ps(Vec4& r, const Vec4& v0, Float s)
+    {
+        r[0] = SIMD::add_ps(v0[0], s);
+        r[1] = SIMD::add_ps(v0[1], s);
+        r[2] = SIMD::add_ps(v0[2], s);
+        r[3] = SIMD::add_ps(v0[3], s);
+    }
+
+    static SIMDINLINE
+    void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+    {
+        r[0] = SIMD::add_ps(v0[0], v1[0]);
+        r[1] = SIMD::add_ps(v0[1], v1[1]);
+        r[2] = SIMD::add_ps(v0[2], v1[2]);
+        r[3] = SIMD::add_ps(v0[3], v1[3]);
+    }
+
+    static SIMDINLINE
+    void vec4_min_ps(Vec4& r, const Vec4& v0, Float s)
+    {
+        r[0] = SIMD::min_ps(v0[0], s);
+        r[1] = SIMD::min_ps(v0[1], s);
+        r[2] = SIMD::min_ps(v0[2], s);
+        r[3] = SIMD::min_ps(v0[3], s);
+    }
+
+    static SIMDINLINE
+    void vec4_max_ps(Vec4& r, const Vec4& v0, Float s)
+    {
+        r[0] = SIMD::max_ps(v0[0], s);
+        r[1] = SIMD::max_ps(v0[1], s);
+        r[2] = SIMD::max_ps(v0[2], s);
+        r[3] = SIMD::max_ps(v0[3], s);
+    }
+
+    // Matrix4x4 * Vector4 (pMatrix holds 16 floats, element [row][col] at pMatrix[row*4 + col])
+    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
+    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
+    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
+    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
+    static SIMDINLINE
+    void SIMDCALL mat4x4_vec4_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+        result[2] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+        result[3] = r0;
+    }
+
+    // Matrix3x3 * Vector3 - Direction Vector where w = 0. Uses columns 0-2 of rows 0-2 of a matrix stored with a row stride of 4 floats.
+    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
+    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
+    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
+    //   outVec.w = 0  (stored directly; matrix row 3 and column 3 are never loaded)
+    static SIMDINLINE
+    void SIMDCALL mat3x3_vec3_w0_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        result[2] = r0;
+
+        result[3] = SIMD::setzero_ps();           // w = 0 for a direction vector
+    }
+
+    // Matrix4x4 * Vector3 - Position vector where w = 1 (v[3] is not read; column 3 is added unscaled).
+    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
+    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
+    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
+    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
+    static SIMDINLINE
+    void SIMDCALL mat4x4_vec3_w1_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[2] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+        result[3] = SIMD::add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    }
+
+    static SIMDINLINE
+    void SIMDCALL mat4x3_vec3_w1_multiply(        // Rows 0-2 of a 4-float-stride matrix times (v.x, v.y, v.z, 1); result.w is set to 1.
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[2] = r0;
+        result[3] = SIMD::set1_ps(1.0f);          // w is forced to 1; matrix row 3 is never read
+    }
+}; // struct SIMDBase
+
+using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
+using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
+using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
new file mode 100644
index 0000000..5bcedf3
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
@@ -0,0 +1,545 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (1) implementation
+//============================================================================
+
+#define SIMD_WRAPPER_1(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return _mm_##op(a);\
+    }
+
+#define SIMD_WRAPPER_2(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm_##op(a, b);\
+    }
+
+#define SIMD_DWRAPPER_2(op)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm_##op(a, b);\
+    }
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm_##op(a, b, ImmT);\
+    }
+
+#define SIMD_DWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm_##op(a, b, ImmT);\
+    }
+
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm_##op(a, b, c);\
+    }
+
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1I_(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return intrin(a, ImmT);\
+    }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
+
+#define SIMD_IWRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return intrin(a, b);\
+    }
+
+#define SIMD_IWRAPPER_2(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm_##op(a, b);\
+    }
+
+#define SIMD_IFWRAPPER_2(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm_##op(a, b, ImmT);\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)    // return (a * b) + c
+{
+    return add_ps(mul_ps(a, b), c);
+}
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c)    // return (a * b) - c
+{
+    return sub_ps(mul_ps(a, b), c);
+}
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+    return _mm_round_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps);                             // return a & b       (float treated as int)
+SIMD_IWRAPPER_2_(and_si, _mm_and_si128);        // return a & b       (int)
+SIMD_WRAPPER_2(andnot_ps);                          // return (~a) & b    (float treated as int)
+SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128);  // return (~a) & b    (int)
+SIMD_WRAPPER_2(or_ps);                              // return a | b       (float treated as int)
+SIMD_IWRAPPER_2_(or_si, _mm_or_si128);          // return a | b       (int)
+SIMD_WRAPPER_2(xor_ps);                             // return a ^ b       (float treated as int)
+SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);        // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
+{
+    // Scalar emulation of the per-lane variable left shift (native in the AVX2 variant).
+    // Lanes are shifted as uint32_t, and a count of 32 or more produces 0,
+    // so neither a negative lane nor an oversized count is undefined behavior.
+    uint32_t a, count;
+
+    a = static_cast<uint32_t>(_mm_extract_epi32(vA, 0));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 0));
+    vA = _mm_insert_epi32(vA, static_cast<int>((count < 32) ? (a << count) : 0u), 0);
+
+    a = static_cast<uint32_t>(_mm_extract_epi32(vA, 1));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 1));
+    vA = _mm_insert_epi32(vA, static_cast<int>((count < 32) ? (a << count) : 0u), 1);
+
+    a = static_cast<uint32_t>(_mm_extract_epi32(vA, 2));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 2));
+    vA = _mm_insert_epi32(vA, static_cast<int>((count < 32) ? (a << count) : 0u), 2);
+
+    a = static_cast<uint32_t>(_mm_extract_epi32(vA, 3));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 3));
+    vA = _mm_insert_epi32(vA, static_cast<int>((count < 32) ? (a << count) : 0u), 3);
+
+    return vA;
+}
+
+SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
+
+template<int ImmT>                              // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
+{
+    // Scalar emulation of the per-lane variable logical right shift (native in the AVX2 variant).
+    // Lanes are shifted as uint32_t so zeros, not copies of the sign bit, are shifted in,
+    // and a count of 32 or more produces 0 instead of undefined behavior.
+    uint32_t a, count;
+
+    a = static_cast<uint32_t>(_mm_extract_epi32(vA, 0));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 0));
+    vA = _mm_insert_epi32(vA, static_cast<int>((count < 32) ? (a >> count) : 0u), 0);
+
+    a = static_cast<uint32_t>(_mm_extract_epi32(vA, 1));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 1));
+    vA = _mm_insert_epi32(vA, static_cast<int>((count < 32) ? (a >> count) : 0u), 1);
+
+    a = static_cast<uint32_t>(_mm_extract_epi32(vA, 2));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 2));
+    vA = _mm_insert_epi32(vA, static_cast<int>((count < 32) ? (a >> count) : 0u), 2);
+
+    a = static_cast<uint32_t>(_mm_extract_epi32(vA, 3));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 3));
+    vA = _mm_insert_epi32(vA, static_cast<int>((count < 32) ? (a >> count) : 0u), 3);
+
+    return vA;
+}
+
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
+{
+    return _mm_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
+{
+    return _mm_castps_si128(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
+{
+    return _mm_castsi128_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+{
+    return _mm_castps_pd(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
+{
+    return _mm_castsi128_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
+{
+    return _mm_cvtepi32_ps(a);
+}
+
+SIMD_IWRAPPER_1(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+{
+    return _mm_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return _mm_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+    return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+    return  0 != _mm_testz_ps(a, b);
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return  0 != _mm_testz_si128(a, b);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
+SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+{
+    return _mm_broadcast_ss(p);
+}
+
+SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (int)
+{
+    return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    return _mm_permutevar_ps(a, swiz);
+}
+
+SIMD_IWRAPPER_1I(shuffle_epi32);
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
+
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+
+//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
+static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
+{
+    return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    int32_t const *pOffsets = (int32_t const*)&idx;
+    Float vResult;
+    float* pResult = (float*)&vResult;
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        // Each index is signed and widened before scaling, so a negative index or a large product cannot wrap at 32 bits.
+        int64_t offset = static_cast<int64_t>(pOffsets[i]) * static_cast<int64_t>(ScaleT);
+        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return _mm_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return _mm_load_si128(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return _mm_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return _mm_lddqu_si128(&p->v);
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    int32_t const *pOffsets = (int32_t const*)&idx;
+    Float vResult = old;
+    float* pResult = (float*)&vResult;
+    DWORD index;
+    uint32_t umask = movemask_ps(mask);
+    while (_BitScanForward(&index, umask))
+    {
+        umask &= ~(1u << index);
+        // Each index is signed and widened before scaling, so a negative index or a large product cannot wrap at 32 bits.
+        int64_t offset = static_cast<int64_t>(pOffsets[index]) * static_cast<int64_t>(ScaleT);
+        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    _mm_maskstore_ps(p, mask, src);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    return static_cast<uint32_t>(_mm_movemask_epi8(a));
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+    return static_cast<uint32_t>(_mm_movemask_pd(a));
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+    return static_cast<uint32_t>(_mm_movemask_ps(a));
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return _mm_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return _mm_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return _mm_set1_ps(f);
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return _mm_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return _mm_setzero_si128();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm_store_si128(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
+{
+    _mm_storeu_si128(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    _mm_stream_ps(p, a);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
+{
+    return _mm_set_ps(in3, in2, in1, in0);
+}
+
+template <int ImmT>
+static SIMDINLINE float SIMDCALL extract_ps(Float a)
+{
+    // Move lane (ImmT & 3) into element 0 and read it back as a float; avoids punning an int through a float*.
+    return _mm_cvtss_f32(_mm_shuffle_ps(a, a, ImmT & 3));
+}
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2I
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
new file mode 100644
index 0000000..e8ee0b4
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
@@ -0,0 +1,68 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX2_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (2) implementation
+//
+// Since this implementation inherits from the AVX (1) implementation,
+// the only operations below are the ones that replace AVX (1) operations.
+// Only 2 shifts and 2 gathers were introduced with AVX 2
+// Also, add native support for FMA operations
+//============================================================================
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm_##op(a, b, c);\
+    }
+
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
+{
+    return _mm_sllv_epi32(vA, vB);
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
+{
+    return _mm_srlv_epi32(vA, vB);
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
+}
+
+#undef SIMD_WRAPPER_3
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
new file mode 100644
index 0000000..012f310
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
@@ -0,0 +1,408 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (512) implementation
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are the ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+private:
+    static SIMDINLINE __m512  __conv(Float r) { return _mm512_castps128_ps512(r.v); }
+    static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd128_pd512(r.v); }
+    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi128_si512(r.v); }
+    static SIMDINLINE Float   __conv(__m512 r) { return _mm512_castps512_ps128(r); }
+    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd128(r); }
+    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si128(r); }
+public:
+
+#define SIMD_WRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+    }
+#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_DWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf));     // return 1.0f / a
+SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf));   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1_32(abs_epi32);  // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32);  // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32);  // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32);  // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32);  // return a * b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+
+#endif
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2_32(mullo_epi32);
+SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
+
+#endif
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si,    and_epi32, __mmask16(0xf));    // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si,     or_epi32, __mmask16(0xf));     // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si,    xor_epi32, __mmask16(0xf));    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I_32(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_2_32(sllv_epi32);                // return a << b      (uint32)
+SIMD_IWRAPPER_1I_32(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I_32(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32)
+
+// use AVX2 version
+//SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+
+//-----------------------------------------------------------------------
+// Conversion operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff);    // return (int16)a    (uint8 --> int16)
+// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff);      // return (int32)a    (uint8 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff);     // return (int32)a    (uint16 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf);      // return (int64)a    (uint16 --> int64)
+// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf);      // return (int64)a    (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi8);    // return a > b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
+//
+//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+//{
+//    return cmpgt_epi32(b, a);
+//}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+#endif
+// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+//{
+//    return _mm256_permutevar8x32_ps(a, swiz);
+//}
+
+SIMD_IWRAPPER_1I_32(shuffle_epi32);
+//template<int ImmT>
+//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+//{
+//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+//}
+//SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2_32(unpackhi_epi32);
+SIMD_IWRAPPER_2_32(unpacklo_epi32);
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_16(unpackhi_epi16);
+SIMD_IWRAPPER_2_64(unpackhi_epi64);
+SIMD_IWRAPPER_2_8(unpackhi_epi8);
+SIMD_IWRAPPER_2_16(unpacklo_epi16);
+SIMD_IWRAPPER_2_64(unpacklo_epi64);
+SIMD_IWRAPPER_2_8(unpacklo_epi8);
+
+#endif
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return __conv(_mm512_mask_i32gather_ps(
+                    _mm512_setzero_ps(),
+                    __mmask16(0xf),
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    __mmask16 m = 0xf;  // start from the low four 32-bit lanes only
+    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
+                                _mm512_set1_epi32(0x80000000));  // keep lanes whose bit 31 (sign bit) is set
+    return __conv(_mm512_mask_i32gather_ps(
+                    __conv(old),    // lanes not selected by m keep their value from old
+                    m,
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+#if !defined(AVX512F_STRICT)
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    __mmask64 m = 0xffffull;
+    return static_cast<uint32_t>(
+        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+}
+
+#endif
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    __mmask16 m = 0xf;
+    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
+    _mm512_mask_storeu_ps(p, m, __conv(src));
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm512_mask_storeu_ps(p, __mmask16(0xf), __conv(a));
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
new file mode 100644
index 0000000..16eb521
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
@@ -0,0 +1,761 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+using SIMD128T = SIMD128Impl::AVXImpl;
+
+//============================================================================
+// SIMD256 AVX (1) implementation
+//============================================================================
+
+#define SIMD_WRAPPER_1(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return _mm256_##op(a);\
+    }
+
+#define SIMD_WRAPPER_2(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_DWRAPPER_2(op)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return  _mm256_##op(a, b, ImmT);\
+    }
+
+#define SIMD_DWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm256_##op(a, b, ImmT);\
+    }
+
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm256_##op(a, b, c);\
+    }
+
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_2(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_IFWRAPPER_2(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+#define SIMD_IFWRAPPER_2I(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
+    }
+
+#define SIMD_IWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##intrin(a, b, ImmT);\
+    }
+#define SIMD_IWRAPPER_2I(op)  SIMD_IWRAPPER_2I_(op, op)
+
+#define SIMD_IWRAPPER_3(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)   \
+    {\
+        return _mm256_##op(a, b, c);\
+    }
+
+// emulated integer simd
+#define SIMD_EMU_IWRAPPER_1(op) \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::op(a.v4[0]),\
+            SIMD128T::op(a.v4[1]),\
+        };\
+    }
+#define SIMD_EMU_IWRAPPER_1L(op, shift) \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a)\
+    {\
+        return Integer \
+        {\
+            SIMD128T::op(a.v4[0]), \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
+        };\
+    }\
+    static SIMDINLINE \
+    Integer SIMDCALL op(SIMD128Impl::Integer a)\
+    {\
+        return Integer \
+        {\
+            SIMD128T::op(a), \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
+        };\
+    }
+
+#define SIMD_EMU_IWRAPPER_1I(op) \
+    template <int ImmT> static SIMDINLINE \
+    Integer SIMDCALL op(Integer a)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::template op<ImmT>(a.v4[0]),\
+            SIMD128T::template op<ImmT>(a.v4[1]),\
+        };\
+    }
+
+#define SIMD_EMU_IWRAPPER_2(op) \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a, Integer b)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::op(a.v4[0], b.v4[0]),\
+            SIMD128T::op(a.v4[1], b.v4[1]),\
+        };\
+    }
+
+#define SIMD_EMU_IWRAPPER_2I(op) \
+    template <int ImmT> static SIMDINLINE \
+    Integer SIMDCALL op(Integer a, Integer b)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]), /* 128-bit half 0 of both operands */\
+            SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]), /* 128-bit half 1 of both operands */\
+        };\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
+{
+    return add_ps(mul_ps(a, b), c);
+}
+
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
+{
+    return sub_ps(mul_ps(a, b), c);
+}
+
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+    return _mm256_round_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_EMU_IWRAPPER_2(mullo_epi32);
+SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps);         // return a & b       (float treated as int)
+SIMD_EMU_IWRAPPER_2(and_si);    // return a & b       (int)
+SIMD_WRAPPER_2(andnot_ps);      // return (~a) & b    (float treated as int)
+SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
+SIMD_WRAPPER_2(or_ps);          // return a | b       (float treated as int)
+SIMD_EMU_IWRAPPER_2(or_si);     // return a | b       (int)
+SIMD_WRAPPER_2(xor_ps);         // return a ^ b       (float treated as int)
+SIMD_EMU_IWRAPPER_2(xor_si);    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_EMU_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vCount) // return a << b      (uint32), 0 when b > 31
+{
+    uint32_t aHi, aLow, countHi, countLow; // unsigned: well-defined shifts; negative counts compare as > 31
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0u; // scalar shift by >= 32 would be undefined
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);  // reassemble: shifted upper four lanes
+    ret = _mm256_insertf128_si256(ret, vALow, 0); // reassemble: shifted lower four lanes
+    return ret;
+}
+
+SIMD_EMU_IWRAPPER_1I(srai_epi32);   // return a >> ImmT   (int32)
+SIMD_EMU_IWRAPPER_1I(srli_epi32);   // return a >> ImmT   (uint32)
+SIMD_EMU_IWRAPPER_1I(srli_si);      // return a >> (ImmT*8) (uint)
+
+template<int ImmT>                              // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // return a >> b      (uint32), 0 when b > 31
+{
+    uint32_t aHi, aLow, countHi, countLow; // unsigned: logical (zero-filling) shift; negative counts compare as > 31
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0u; // scalar shift by >= 32 would be undefined
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);  // reassemble: shifted upper four lanes
+    ret = _mm256_insertf128_si256(ret, vALow, 0); // reassemble: shifted lower four lanes
+    return ret;
+}
+
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
+{
+    return _mm256_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
+{
+    return _mm256_castps_si256(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
+{
+    return _mm256_castsi256_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+{
+    return _mm256_castps_pd(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castpd_si(Double a)   // return *(Integer*)(&a)
+{
+    return _mm256_castpd_si256(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
+{
+    return _mm256_castsi256_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
+{
+    return _mm256_cvtepi32_ps(a);
+}
+
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);                  // return (int16)a    (uint8 --> int16)
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);                  // return (int32)a    (uint8 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8);                 // return (int32)a    (uint16 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4);                 // return (int64)a    (uint16 --> int64)
+SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8);                 // return (int64)a    (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+{
+    return _mm256_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return _mm256_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+    return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+SIMD_EMU_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_EMU_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+    return  0 != _mm256_testz_ps(a, b);
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return  0 != _mm256_testz_si256(a, b);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
+SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps);  // return ImmT ? b : a  (int32)
+SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+{
+    return _mm256_broadcast_ss(p);
+}
+
+SIMD_EMU_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_EMU_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_EMU_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_EMU_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+    Integer result;
+
+    // Ugly slow implementation: scalar lane-by-lane copy through pointer views of the vectors
+    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
+    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
+
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        pResult[i] = pA[0x7 & pSwiz[i]]; // only 8 lanes exist: use the low 3 index bits so the read stays inside a
+    }
+
+    return result;
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    Float result;
+
+    // Ugly slow implementation: scalar lane-by-lane copy through pointer views of the vectors
+    float const *pA = reinterpret_cast<float const*>(&a);
+    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+    float *pResult = reinterpret_cast<float *>(&result);
+
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        pResult[i] = pA[0x7 & pSwiz[i]]; // only 8 lanes exist: use the low 3 index bits so the read stays inside a
+    }
+
+    return result;
+}
+
+SIMD_WRAPPER_2I(permute2f128_ps);
+SIMD_DWRAPPER_2I(permute2f128_pd);
+SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
+
+
+SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+SIMD_EMU_IWRAPPER_2(shuffle_epi8);
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
+SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    uint32_t *pOffsets = (uint32_t*)&idx;
+    Float vResult;
+    float* pResult = (float*)&vResult;
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        uint32_t offset = pOffsets[i];
+        offset = offset * static_cast<uint32_t>(ScaleT);
+        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return _mm256_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return _mm256_load_si256(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return _mm256_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return _mm256_lddqu_si256(&p->v);
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    uint32_t *pOffsets = (uint32_t*)&idx;
+    Float vResult = old;
+    float* pResult = (float*)&vResult;
+    DWORD index;
+    uint32_t umask = movemask_ps(mask);
+    while (_BitScanForward(&index, umask))
+    {
+        umask &= ~(1 << index);
+        uint32_t offset = pOffsets[index];
+        offset = offset * static_cast<uint32_t>(ScaleT);
+        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    _mm256_maskstore_ps(p, mask, src);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    return SIMD128T::movemask_epi8(a.v4[0]) |
+           (SIMD128T::movemask_epi8(a.v4[1]) << 16);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+    return static_cast<uint32_t>(_mm256_movemask_pd(a));
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+    return static_cast<uint32_t>(_mm256_movemask_ps(a));
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return _mm256_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return _mm256_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return _mm256_set1_ps(f);
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return _mm256_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return _mm256_setzero_si256();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm256_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm256_store_si256(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    _mm256_stream_ps(p, a);
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p)
+{
+    return _mm256_broadcast_ps(&p->v);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double a)
+{
+    return _mm256_extractf128_pd(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Float  SIMDCALL extractf128_ps(Float a)
+{
+    return _mm256_extractf128_ps(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer a)
+{
+    return _mm256_extractf128_si256(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Double SIMDCALL insertf128_pd(Double a, SIMD128Impl::Double b)
+{
+    return _mm256_insertf128_pd(a, b, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL insertf128_ps(Float a, SIMD128Impl::Float b)
+{
+    return _mm256_insertf128_ps(a, b, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL insertf128_si(Integer a, SIMD128Impl::Integer b)
+{
+    return _mm256_insertf128_si256(a, b, ImmT);
+}
+
+#ifndef _mm256_set_m128i
+#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
+    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
+#endif
+
+#ifndef _mm256_loadu2_m128i
+#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
+                            /* SIMD128Impl::Integer const* */ loaddr) \
+    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
+#endif
+
+static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo)
+{
+    return _mm256_loadu2_m128i(&phi->v, &plo->v);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer src)
+{
+    _mm256_storeu2_m128i(&phi->v, &plo->v, src);
+}
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IFWRAPPER_2I
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_3
+#undef SIMD_EMU_IWRAPPER_1
+#undef SIMD_EMU_IWRAPPER_1I
+#undef SIMD_EMU_IWRAPPER_2
+#undef SIMD_EMU_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
new file mode 100644
index 0000000..0a81203
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
@@ -0,0 +1,234 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX2_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (2) implementation
+//
+// Since this implementation inherits from the AVX (1) implementation,
+// the only operations below are ones that replace AVX (1) operations.
+// Mostly these are integer operations that are no longer emulated with SSE
+//============================================================================
+
+#define SIMD_IWRAPPER_1(op)  /* emits: Integer op(Integer a), forwarding to _mm256_<op>(a) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1L(op)  /* emits: Integer op(Integer a), forwarding only the low 128 bits of a to _mm256_<op> */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(_mm256_castsi256_si128(a));\
+    }
+
+#define SIMD_IWRAPPER_1I(op)  /* emits: template<int ImmT> Integer op(Integer a), forwarding to _mm256_<op>(a, ImmT) */ \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(a, ImmT);\
+    }
+
+#define SIMD_IWRAPPER_1I_(op, intrin)  /* same shape as SIMD_IWRAPPER_1I, but the wrapper is named op and calls _mm256_<intrin> */ \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##intrin(a, ImmT);\
+    }
+
+#define SIMD_IWRAPPER_2_(op, intrin)  /* emits: Integer op(Integer a, Integer b), forwarding to _mm256_<intrin>(a, b) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##intrin(a, b);\
+    }
+
+#define SIMD_IWRAPPER_2(op)  /* emits: Integer op(Integer a, Integer b), forwarding to _mm256_<op>(a, b) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_IWRAPPER_2I(op)  /* emits: template<int ImmT> Integer op(Integer a, Integer b), forwarding to _mm256_<op>(a, b, ImmT) */ \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b, ImmT);\
+    }
+
+#define SIMD_IWRAPPER_2I(op)  /* NOTE(review): token-for-token repeat of the definition just above; redundant, candidate for removal */ \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b, ImmT);\
+    }
+
+//-----------------------------------------------------------------------
+// Floating point arithmetic operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)   // return (a * b) + c, via the fused multiply-add intrinsic
+{
+    return _mm256_fmadd_ps(a, b, c);
+}
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return (int64)lo32(a) * (int64)lo32(b) for each 64-bit element (int32 x int32 --> int64)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and return the low 32 bits of each intermediate integer.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si,    and_si256);     // return a & b       (bitwise, all 256 bits)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si256);  // return (~a) & b    (bitwise, all 256 bits; note that a is the inverted operand)
+SIMD_IWRAPPER_2_(or_si,     or_si256);      // return a | b       (bitwise, all 256 bits)
+SIMD_IWRAPPER_2_(xor_si,    xor_si256);     // return a ^ b       (bitwise, all 256 bits)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT   (uint32, zeros shifted in)
+SIMD_IWRAPPER_2(sllv_epi32);                // return a << b      (uint32, per-element shift count taken from b)
+SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32, sign bit shifted in)
+SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32, zeros shifted in)
+SIMD_IWRAPPER_2(srlv_epi32);                // return a >> b      (uint32, per-element shift count taken from b)
+SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8), applied independently to each 128-bit half (zeros shifted in)
+
+template<int ImmT>                          // srli_si<ImmT> on the raw bits of a Float: cast to int, byte-shift, cast back
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1L(cvtepu8_epi16);    // return (int16)a    (uint8 --> int16, zero-extended; input is the low 128 bits of a)
+SIMD_IWRAPPER_1L(cvtepu8_epi32);    // return (int32)a    (uint8 --> int32, zero-extended; input is the low 128 bits of a)
+SIMD_IWRAPPER_1L(cvtepu16_epi32);   // return (int32)a    (uint16 --> int32, zero-extended; input is the low 128 bits of a)
+SIMD_IWRAPPER_1L(cvtepu16_epi64);   // return (int64)a    (uint16 --> int64, zero-extended; input is the low 128 bits of a)
+SIMD_IWRAPPER_1L(cvtepu32_epi64);   // return (int64)a    (uint32 --> int64, zero-extended; input is the low 128 bits of a)
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2(cmpeq_epi8);    // return (a == b) ? all-ones : 0, per element (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16);   // return (a == b) ? all-ones : 0, per element (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32);   // return (a == b) ? all-ones : 0, per element (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64);   // return (a == b) ? all-ones : 0, per element (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);    // return (a > b) ? all-ones : 0, per element (signed int8)
+SIMD_IWRAPPER_2(cmpgt_epi16);   // return (a > b) ? all-ones : 0, per element (signed int16)
+SIMD_IWRAPPER_2(cmpgt_epi32);   // return (a > b) ? all-ones : 0, per element (signed int32)
+SIMD_IWRAPPER_2(cmpgt_epi64);   // return (a > b) ? all-ones : 0, per element (signed int64)
+
+static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+{
+    return cmpgt_epi32(b, a);   // a < b is evaluated as b > a by swapping the operands of cmpgt_epi32
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32); bit i of ImmT selects the source of 32-bit element i
+SIMD_IWRAPPER_2(packs_epi16);   // int16 --> int8, signed saturation.    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // int32 --> int16, signed saturation.   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // int16 --> uint8, unsigned saturation. See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // int32 --> uint16, unsigned saturation. See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);  // return a[b[i]] for each 32-bit lane i (int32); b holds the lane indices
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    return _mm256_permutevar8x32_ps(a, swiz);   // swiz supplies one source-lane index per destination lane
+}
+
+SIMD_IWRAPPER_1I(shuffle_epi32);    // shuffle the 32-bit elements of a within each 128-bit half, as selected by ImmT
+template<int ImmT>                  // shuffle 64-bit integers by running shuffle_pd<ImmT> on the same bits viewed as doubles
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+SIMD_IWRAPPER_2(shuffle_epi8);      // byte shuffle of a driven by the selectors in b. See documentation for _mm256_shuffle_epi8
+SIMD_IWRAPPER_2(unpackhi_epi16);    // interleave the high-half elements of each 128-bit lane of a and b (int16)
+SIMD_IWRAPPER_2(unpackhi_epi32);    // interleave the high-half elements of each 128-bit lane of a and b (int32)
+SIMD_IWRAPPER_2(unpackhi_epi64);    // interleave the high-half elements of each 128-bit lane of a and b (int64)
+SIMD_IWRAPPER_2(unpackhi_epi8);     // interleave the high-half elements of each 128-bit lane of a and b (int8)
+SIMD_IWRAPPER_2(unpacklo_epi16);    // interleave the low-half elements of each 128-bit lane of a and b (int16)
+SIMD_IWRAPPER_2(unpacklo_epi32);    // interleave the low-half elements of each 128-bit lane of a and b (int32)
+SIMD_IWRAPPER_2(unpacklo_epi64);    // interleave the low-half elements of each 128-bit lane of a and b (int64)
+SIMD_IWRAPPER_2(unpacklo_epi8);     // interleave the low-half elements of each 128-bit lane of a and b (int8)
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));   // ScaleT is handed to the intrinsic as its integer scale argument
+}
+
+// for each element: (mask & (1 << 31)) ? i32gather_ps<ScaleT>(p, idx) : old   (selection is by the sign bit of mask)
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    // g++ in debug mode needs the explicit .v member here instead of relying on the
+    // wrappers' implicit conversion operators. Only this intrinsic is affected; the cause is unknown.
+    return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)    // return a 32-bit mask, bit i = most significant bit of byte i of a
+{
+    return static_cast<uint32_t>(_mm256_movemask_epi8(a));      // the intrinsic returns int; reinterpret it as unsigned
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_IWRAPPER_1      // retire the wrapper-generator macros defined at the top of this file
+#undef SIMD_IWRAPPER_1L
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I     // NOTE(review): repeats the previous line; harmless, since #undef of an undefined name is a no-op
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
new file mode 100644
index 0000000..a8d2a4b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
@@ -0,0 +1,409 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (512) implementation
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+private:    // __conv: bridge between the 256-bit wrapper types and the 512-bit register types used by the AVX512 intrinsics below
+    static SIMDINLINE __m512  __conv(Float r) { return _mm512_castps256_ps512(r.v); }      // Float   --> __m512  (r occupies the low 256 bits)
+    static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd256_pd512(r.v); }     // Double  --> __m512d (r occupies the low 256 bits)
+    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi256_si512(r.v); }    // Integer --> __m512i (r occupies the low 256 bits)
+    static SIMDINLINE Float   __conv(__m512 r) { return _mm512_castps512_ps256(r); }       // __m512  --> Float   (keeps the low 256 bits)
+    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd256(r); }      // __m512d --> Double  (keeps the low 256 bits)
+    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si256(r); }      // __m512i --> Integer (keeps the low 256 bits)
+public:
+
+#define SIMD_WRAPPER_1_(op, intrin, mask)  /* emits: Float op(Float a) = low 256 bits of _mm512_maskz_<intrin>(mask, a) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xff))    // mask 0xff: only the low 8 float lanes are computed
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask)  /* as SIMD_WRAPPER_1_, with a template immediate ImmT appended to the call */ \
+    template<int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))  // mask 0xff: only the low 8 float lanes are computed
+
+#define SIMD_WRAPPER_2_(op, intrin, mask)  /* emits: Float op(Float a, Float b) = low 256 bits of _mm512_maskz_<intrin>(mask, a, b) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xff))    // mask 0xff: only the low 8 float lanes are computed
+
+#define SIMD_WRAPPER_2I(op)  /* two-operand Float wrapper with template immediate ImmT; the 8-lane mask 0xff is written inline */ \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask)  /* emits: Float op(Float a, Float b, Float c) = low 256 bits of _mm512_maskz_<intrin>(mask, a, b, c) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+    }
+#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xff))    // mask 0xff: only the low 8 float lanes are computed
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask)  /* emits: Double op(Double a) = low 256 bits of _mm512_maskz_<intrin>(mask, a) */ \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#if !defined(AVX512F_STRICT)    // the convenience form below is not available when AVX512F_STRICT is defined
+#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))    // mask 0xf: only the low 4 double lanes are computed
+#endif
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)  /* as SIMD_DWRAPPER_1_, with a template immediate ImmT appended to the call */ \
+    template<int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#if !defined(AVX512F_STRICT)    // the convenience form below is not available when AVX512F_STRICT is defined
+#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))  // mask 0xf: only the low 4 double lanes are computed
+#endif
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask)  /* emits: Double op(Double a, Double b) = low 256 bits of _mm512_maskz_<intrin>(mask, a, b) */ \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#if !defined(AVX512F_STRICT)    // the convenience form below is not available when AVX512F_STRICT is defined
+#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))    // mask 0xf: only the low 4 double lanes are computed
+#endif
+
+#define SIMD_DWRAPPER_2I(op)  /* two-operand Double wrapper with template immediate ImmT; the 4-lane mask 0xf is written inline */ \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask)  /* emits: Integer op(Integer a) = low 256 bits of _mm512_maskz_<intrin>(mask, a) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))               // 32-bit elements: 8 lanes
+#if !defined(AVX512F_STRICT)    // the 8/16/64-bit element forms are not available when AVX512F_STRICT is defined
+#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))      // 8-bit elements: 32 lanes
+#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))             // 16-bit elements: 16 lanes
+#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))                 // 64-bit elements: 4 lanes
+#endif
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)  /* as SIMD_IWRAPPER_1_, with a template immediate ImmT appended to the call */ \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))             // 32-bit elements: 8 lanes
+#if !defined(AVX512F_STRICT)    // the 8/16/64-bit element forms are not available when AVX512F_STRICT is defined
+#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))    // 8-bit elements: 32 lanes
+#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))           // 16-bit elements: 16 lanes
+#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))               // 64-bit elements: 4 lanes
+#endif
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask)  /* emits: Integer op(Integer a, Integer b) = low 256 bits of _mm512_maskz_<intrin>(mask, a, b) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))               // 32-bit elements: 8 lanes
+#if !defined(AVX512F_STRICT)    // the 8/16/64-bit element forms are not available when AVX512F_STRICT is defined
+#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))      // 8-bit elements: 32 lanes
+#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))             // 16-bit elements: 16 lanes
+#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))                 // 64-bit elements: 4 lanes
+#endif
+
+#define SIMD_IWRAPPER_2I(op)  /* two-operand Integer wrapper with template immediate ImmT; the mask 0xff is written inline */ \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xff));     // return approximately 1.0f / a        (rcp14 approximation, not an exact divide)
+SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff));   // return approximately 1.0f / sqrt(a)  (rsqrt14 approximation)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1_32(abs_epi32);  // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32);  // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32);  // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32);  // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32);  // return (int64)lo32(a) * (int64)lo32(b) for each 64-bit element (int32 x int32 --> int64)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+
+#endif
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and return the low 32 bits of each intermediate integer.
+SIMD_IWRAPPER_2_32(mullo_epi32);
+SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
+
+#endif
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si,    and_epi32, __mmask16(0xff));    // return a & b       (bitwise; issued as a masked 32-bit-element op over 8 lanes)
+SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b    (bitwise; note that a is the inverted operand)
+SIMD_IWRAPPER_2_(or_si,     or_epi32, __mmask16(0xff));     // return a | b       (bitwise; issued as a masked 32-bit-element op over 8 lanes)
+SIMD_IWRAPPER_2_(xor_si,    xor_epi32, __mmask16(0xff));    // return a ^ b       (bitwise; issued as a masked 32-bit-element op over 8 lanes)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I_32(slli_epi32);               // return a << ImmT   (uint32, zeros shifted in)
+SIMD_IWRAPPER_2_32(sllv_epi32);                // return a << b      (uint32, per-element shift count taken from b)
+SIMD_IWRAPPER_1I_32(srai_epi32);               // return a >> ImmT   (int32, sign bit shifted in)
+SIMD_IWRAPPER_1I_32(srli_epi32);               // return a >> ImmT   (uint32, zeros shifted in)
+SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32, per-element shift count taken from b)
+
+// use AVX2 version
+//SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+
+//-----------------------------------------------------------------------
+// Conversion operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff);    // return (int16)a    (uint8 --> int16)
+// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff);      // return (int32)a    (uint8 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff);     // return (int32)a    (uint16 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf);      // return (int64)a    (uint16 --> int64)
+// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf);      // return (int64)a    (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi8);    // return a > b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
+//
+//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+//{
+//    return cmpgt_epi32(b, a);
+//}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+#if !defined(AVX512F_STRICT)    // these rely on the 8/16-bit element wrappers, which exist only when AVX512F_STRICT is not defined
+
+SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8,   signed saturation.   See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16,  signed saturation.   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);    // int16 --> uint8,  unsigned saturation. See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32);   // int32 --> uint16, unsigned saturation. See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+#endif
+
+// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+//{
+//    return _mm256_permutevar8x32_ps(a, swiz);
+//}
+
+SIMD_IWRAPPER_1I_32(shuffle_epi32);    // shuffle the 32-bit elements of a within each 128-bit lane, as selected by ImmT
+//template<int ImmT>
+//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+//{
+//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+//}
+//SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2_32(unpackhi_epi32);    // interleave the high-half elements of each 128-bit lane of a and b (int32)
+SIMD_IWRAPPER_2_32(unpacklo_epi32);    // interleave the low-half elements of each 128-bit lane of a and b (int32)
+
+#if !defined(AVX512F_STRICT)    // 8/16/64-bit element wrappers exist only when AVX512F_STRICT is not defined
+
+SIMD_IWRAPPER_2_16(unpackhi_epi16);    // interleave the high-half elements of each 128-bit lane of a and b (int16)
+SIMD_IWRAPPER_2_64(unpackhi_epi64);    // interleave the high-half elements of each 128-bit lane of a and b (int64)
+SIMD_IWRAPPER_2_8(unpackhi_epi8);      // interleave the high-half elements of each 128-bit lane of a and b (int8)
+SIMD_IWRAPPER_2_16(unpacklo_epi16);    // interleave the low-half elements of each 128-bit lane of a and b (int16)
+SIMD_IWRAPPER_2_64(unpacklo_epi64);    // interleave the low-half elements of each 128-bit lane of a and b (int64)
+SIMD_IWRAPPER_2_8(unpacklo_epi8);      // interleave the low-half elements of each 128-bit lane of a and b (int8)
+
+#endif
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));   // masked load of 8 floats; issued with the unaligned-load intrinsic
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));    // masked load of 8 dwords; issued with the unaligned-load intrinsic
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));   // mask 0xff: read only the 8 floats that make up the 256-bit value
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));    // mask 0xff: read only the 8 dwords that make up the 256-bit value
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // per lane: load the float at byte address ((int8*)p) + (idx * ScaleT)
+{
+    const __m512    zeroes = _mm512_setzero_ps();   // value kept by the lanes the mask leaves out
+    const __mmask16 lanes  = __mmask16(0xff);       // only the low 8 lanes belong to the 256-bit result
+    const __m512i   offset = __conv(idx);
+    const __m512    loaded =
+        _mm512_mask_i32gather_ps(zeroes, lanes, offset, p, static_cast<int>(ScaleT));
+    return __conv(loaded);
+}
+
+// for each element: (mask & (1 << 31)) ? i32gather_ps<ScaleT>(p, idx) : old   (selection is by the sign bit of mask)
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    __mmask16 m = 0xff;                                             // start from the 8 lanes of the 256-bit value
+    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
+                                _mm512_set1_epi32(0x80000000));     // keep a lane only if bit 31 of its mask element is set
+    return __conv(_mm512_mask_i32gather_ps(
+                    __conv(old),                                    // pass-through value for lanes that are not gathered
+                    m,
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+#if !defined(AVX512F_STRICT)
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)    // return a 32-bit mask, bit i = most significant bit of byte i of a
+{
+    const __mmask64 lanes = 0xffffffffull;                      // test only the 32 bytes of the 256-bit value
+    const __mmask64 signs = _mm512_mask_test_epi8_mask(lanes, __conv(a), _mm512_set1_epi8(0x80));
+    return static_cast<uint32_t>(signs);
+}
+
+#endif
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)    // per lane: if bit 31 of mask is set, p[i] = src[i]
+{
+    const __m512i   signBit = _mm512_set1_epi32(0x80000000);
+    const __mmask16 active  = _mm512_mask_test_epi32_mask(__mmask16(0xff), __conv(mask), signBit);  // limited to the low 8 lanes
+    _mm512_mask_storeu_ps(p, active, __conv(src));
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm512_mask_storeu_ps(p, __mmask16(0xff), __conv(a));   // masked store of 8 floats; issued with the unaligned-store intrinsic
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));    // masked store of 8 dwords; issued with the unaligned-store intrinsic
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_WRAPPER_1_          // retire every wrapper-generator macro defined at the top of this file
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1          // this and the other AVX512F_STRICT-guarded names may be undefined here; #undef is then a no-op
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
new file mode 100644
index 0000000..7447d35
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -0,0 +1,707 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+// Plain GCC only (not clang, not ICC): provide compare-to-mask helpers on top of _mm512_cmp_p{s,d}_mask.
+#if defined(__GNUC__) && !defined( __clang__) && !defined(__INTEL_COMPILER)
+// gcc missing these intrinsics; each #ifndef only skips the fallback when the name already exists as a macro
+#ifndef _mm512_cmpneq_ps_mask
+#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
+#endif
+// single precision a < b, using predicate _CMP_LT_OS
+#ifndef _mm512_cmplt_ps_mask
+#define _mm512_cmplt_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_LT_OS)
+#endif
+// double precision a < b, using predicate _CMP_LT_OS
+#ifndef _mm512_cmplt_pd_mask
+#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
+#endif
+#endif
+
+//============================================================================
+// SIMD16 AVX512 (F) implementation
+//
+//  TODO: Optimize for KNL / KNH or for SKX??
+//      For now probably optimizing more for KNL as that's where
+//      immediate customers are.
+//============================================================================
+// Width constant for this implementation, and the 256-bit implementation that SIMD_EMU_IWRAPPER_2 calls once per v8 half.
+static const int TARGET_SIMD_WIDTH = 16;
+using SIMD256T = SIMD256Impl::AVX2Impl;
+// Generates: Float op(Float a). 'intrin' is the COMPLETE intrinsic name; no _mm512_ prefix is pasted on here.
+#define SIMD_WRAPPER_1_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return intrin(a);\
+    }
+// Generates: Float op(Float a) forwarding to the intrinsic named _mm512_<op>.
+#define SIMD_WRAPPER_1(op)  \
+    SIMD_WRAPPER_1_(op, _mm512_##op)
+// Generates: Float op(Float a, Float b). Unlike SIMD_WRAPPER_1_, 'intrin' is only the suffix pasted after _mm512_.
+#define SIMD_WRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
+// Generates: Float op(Float a, Float b) computed by the INTEGER intrinsic _mm512_<intrin> on the bit patterns of a and b.
+#define SIMD_WRAPPERI_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_castsi512_ps(_mm512_##intrin(\
+            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+    }
+// Generates: Double op(Double a, Double b) forwarding to _mm512_<op>.
+#define SIMD_DWRAPPER_2(op)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##op(a, b);\
+    }
+// Generates: template<int ImmT> Float op(Float a, Float b) -> _mm512_<intrin>(a, b, ImmT); ImmT is a compile-time immediate.
+#define SIMD_WRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)
+// Double-precision counterpart: template<int ImmT> Double op(Double a, Double b) -> _mm512_<intrin>(a, b, ImmT).
+#define SIMD_DWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)
+// Generates: Float op(Float a, Float b, Float c) forwarding to _mm512_<op>.
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm512_##op(a, b, c);\
+    }
+// Generates: Integer op(Integer a) -> _mm512_<op>(a). The _1_8 form right after it takes a SIMD256Impl::Integer source instead.
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+#define SIMD_IWRAPPER_1_8(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+// Generates: Integer op(SIMD128Impl::Integer a) -> _mm512_<op>(a), i.e. a SIMD128Impl source and a 512-bit Integer result.
+#define SIMD_IWRAPPER_1_4(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+// Generates: template<int ImmT> Integer op(Integer a) -> intrin(a, ImmT); 'intrin' is the COMPLETE intrinsic name.
+#define SIMD_IWRAPPER_1I_(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return intrin(a, ImmT);\
+    }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
+// Generates: Integer op(Integer a, Integer b) -> _mm512_<intrin>(a, b).
+#define SIMD_IWRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)
+// Generates: Integer op(Integer a, Integer b) -> cmp(a, b); 'cmp' is any callable expression, e.g. cmp_epi32<CompareTypeInt::EQ>.
+#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return cmp(a, b);\
+    }
+// Generates: Integer op(Integer a, Integer b) computed by the FLOAT intrinsic _mm512_<intrin> on the bit patterns of a and b.
+#define SIMD_IFWRAPPER_2(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+// Generates: template<int ImmT> Integer op(Integer a, Integer b) -> _mm512_<intrin>(a, b, ImmT).
+#define SIMD_IWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
+// Generates: Integer op(Integer a, Integer b) emulated as two SIMD256T::op calls, one on v8[0] and one on v8[1].
+#define SIMD_EMU_IWRAPPER_2(op) \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a, Integer b)\
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1]),\
+        };\
+    }
+// vmask: turn a k-mask into a vector mask; elements whose bit is set become all ones (-1), the others are zeroed.
+private:
+    static SIMDINLINE Integer vmask(__mmask8 m)     // one bit per 64-bit element
+    {
+        return _mm512_maskz_set1_epi64(m, -1LL);
+    }
+    static SIMDINLINE Integer vmask(__mmask16 m)    // one bit per 32-bit element
+    {
+        return _mm512_maskz_set1_epi32(m, -1);
+    }
+    static SIMDINLINE Integer vmask(__mmask32 m)    // one bit per 16-bit element
+    {
+        return _mm512_maskz_set1_epi16(m, -1);
+    }
+    static SIMDINLINE Integer vmask(__mmask64 m)    // one bit per 8-bit element
+    {
+        return _mm512_maskz_set1_epi8(m, -1);
+    }
+
+public:
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);       // return approximately 1.0f / a         (rcp14 estimate, not an exact divide)
+SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps);   // return approximately 1.0f / sqrt(a)   (rsqrt14 estimate)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+// round_ps<RMT>: round every float lane; the RoundMode value is passed unchanged as the roundscale immediate.
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+    return _mm512_roundscale_ps(a, static_cast<int>(RMT));
+}
+// ceil_ps / floor_ps: round_ps specialised with RoundMode::CEIL_NOEXC and RoundMode::FLOOR_NOEXC respectively.
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return (int64)lo32(a) * (int64)lo32(b) per 64-bit element (NOT a lane-wise int32 multiply; see mullo_epi32)
+
+                            // return (a * b) & 0xFFFFFFFF
+                            //
+                            // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+                            // and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si, and_si512);        // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si512);  // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si, or_si512);          // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si, xor_si512);        // return a ^ b       (int)
+// Float bitwise ops come in two builds, selected by AVX512F_STRICT.
+#if defined(AVX512F_STRICT)
+// Strict build: reinterpret as integer lanes, apply the _mm512_<op>_epi32 form, reinterpret back.
+SIMD_WRAPPERI_2_(and_ps, and_epi32);          // return a & b       (float treated as int)
+SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32);    // return (~a) & b    (float treated as int)
+SIMD_WRAPPERI_2_(or_ps, or_epi32);            // return a | b       (float treated as int)
+SIMD_WRAPPERI_2_(xor_ps, xor_epi32);          // return a ^ b       (float treated as int)
+// Both branches define the same four names with the same signatures.
+#else
+// Default build: call _mm512_<op>_ps directly (NOTE(review): these appear to need more than base AVX-512F - confirm).
+SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
+SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
+SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
+SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
+
+#endif
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_2(sllv_epi32);                // return a << b      (uint32, shift count taken per lane from b)
+SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2(srlv_epi32);                // return a >> b      (uint32, shift count taken per lane from b)
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
+{
+    return _mm512_castpd_ps(a);
+}
+// The cast* helpers here only reinterpret the 512 bits under another vector type; element values are not converted.
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
+{
+    return _mm512_castps_si512(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
+{
+    return _mm512_castsi512_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+{
+    return _mm512_castps_pd(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castpd_si(Double a)   // return *(Integer*)(&a)
+{
+    return _mm512_castpd_si512(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
+{
+    return _mm512_castsi512_ps(a);
+}
+// cvtepi32_ps, in contrast to the casts, converts values: each int32 lane becomes the corresponding float.
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
+{
+    return _mm512_cvtepi32_ps(a);
+}
+// Zero-extending conversions: the _1_8 wrappers take a SIMD256Impl::Integer source, the _1_4 wrappers a SIMD128Impl::Integer.
+SIMD_IWRAPPER_1_8(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1_4(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1_8(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1_4(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1_8(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32, rounded per the current rounding mode)
+{
+    return _mm512_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return _mm512_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)   // one result bit per float lane: a (CmpTypeT) b
+{
+    return _mm512_cmp_ps_mask(a, b, static_cast<int>(CmpTypeT));
+}
+
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+    // Legacy vector mask: compare into a k-mask, then widen each set bit to an all-ones 32-bit lane (clear bits give zero lanes).
+    const __mmask16 lanes = cmp_ps_mask<CmpTypeT>(a, b);
+    return castsi_ps(vmask(lanes));
+}
+// Named shortcuts for cmp_ps<>; every one uses the *_OQ member of CompareType, including cmpneq_ps (NEQ_OQ).
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
+{
+    // Compare 8-bit elements into a 64-bit k-mask, then expand it to a legacy vector mask.
+    const __mmask64 hits = _mm512_cmp_epi8_mask(a, b, static_cast<int>(CmpTypeT));
+    return vmask(hits);
+}
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
+{
+    // Compare 16-bit elements into a 32-bit k-mask, then expand it to a legacy vector mask.
+    const __mmask32 hits = _mm512_cmp_epi16_mask(a, b, static_cast<int>(CmpTypeT));
+    return vmask(hits);
+}
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
+{
+    // Compare 32-bit elements into a 16-bit k-mask, then expand it to a legacy vector mask.
+    const __mmask16 hits = _mm512_cmp_epi32_mask(a, b, static_cast<int>(CmpTypeT));
+    return vmask(hits);
+}
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
+{
+    // Compare 64-bit elements into an 8-bit k-mask, then expand it to a legacy vector mask.
+    const __mmask8 hits = _mm512_cmp_epi64_mask(a, b, static_cast<int>(CmpTypeT));
+    return vmask(hits);
+}
+// Fixed-predicate integer compares; each returns a vector whose elements are all ones where the relation holds and zero elsewhere.
+SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);    // return a == b (int8)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);   // return a == b (int16)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>);   // return a == b (int32)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>);   // return a == b (int64)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);    // return a > b (int8)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);   // return a > b (int16)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>);   // return a > b (int32)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>);   // return a > b (int64)
+SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // true when (a & b) == 0 in every 32-bit lane; NOTE(review): examines all bits, not just sign bits
+{
+    return static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))) == 0;
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // true when (a & b) == 0 in every 32-bit lane (int)
+{
+    return static_cast<int>(_mm512_test_epi32_mask(a, b)) == 0;
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+template <int ImmT>
+static SIMDINLINE Float blend_ps(Float a, Float b) // lane i = (bit i of ImmT) ? b : a  (float)
+{
+    return _mm512_mask_blend_ps(static_cast<__mmask16>(ImmT), a, b);
+}
+
+template <int ImmT>
+static SIMDINLINE Integer blend_epi32(Integer a, Integer b) // lane i = (bit i of ImmT) ? b : a  (int32); returns Integer, matching the intrinsic's __m512i result
+{
+    return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
+}
+
+static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a  (float)
+{
+    return _mm512_mask_blend_ps(static_cast<__mmask16>(movemask_ps(mask)), a, b);   // per-lane select bits come from movemask_ps(mask)
+}
+
+// Integer flavours of blendv_ps: operands are reinterpreted as float lanes, blended, and reinterpreted back.
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return blendv_epi32(a, b, castsi_ps(mask));     // reinterpret the mask, then reuse the Float-mask overload
+}
+// broadcast_ss reads one float through p and replicates it into every lane.
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+{
+    return _mm512_set1_ps(*p);
+}
+// extract_*<imm>: pull one 256-bit part out of a 512-bit vector; imm is forwarded as the intrinsic's selector immediate.
+template<int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+{
+    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+{
+    return _mm512_extractf64x4_pd(a, imm);
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+{
+    return _mm512_extracti64x4_epi64(a, imm);
+}
+// insert_*<imm>: return a copy of a with the 256-bit part selected by imm replaced by b.
+template<int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+{
+    return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
+}
+
+template<int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+{
+    return _mm512_insertf64x4(a, b, imm);
+}
+
+template<int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+{
+    return _mm512_inserti64x4(a, b, imm);
+}
+// Saturating packs: native _mm512_ intrinsics by default; with AVX512F_STRICT, emulated as two SIMD256T calls on the v8 halves.
+#if !defined(AVX512F_STRICT)
+SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32
+#else
+SIMD_EMU_IWRAPPER_2(packs_epi16)
+SIMD_EMU_IWRAPPER_2(packs_epi32)
+SIMD_EMU_IWRAPPER_2(packus_epi16)
+SIMD_EMU_IWRAPPER_2(packus_epi32)
+#endif
+// Full-width permutes. Note the argument order: these helpers take (data, indices), the intrinsics take (indices, data).
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+    return _mm512_permutexvar_epi32(swiz, a);
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    return _mm512_permutexvar_ps(swiz, a);
+}
+// permute2f128_* forward to the 128-bit-granular shuffle intrinsics; ImmT is passed through unchanged as their immediate.
+SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
+SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
+SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);
+// template<int ImmT> Integer shuffle_epi32(Integer a) -> _mm512_shuffle_epi32(a, ImmT)
+SIMD_IWRAPPER_1I(shuffle_epi32);
+// Element shuffles that forward directly to the same-named _mm512_ intrinsics.
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+// shuffle_epi64: shuffle_pd<ImmT> applied to the bit patterns of a and b (64-bit elements treated as doubles).
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+// Interleave (unpack) helpers. The epi32 forms are computed with the float unpack intrinsics on reinterpreted lanes.
+SIMD_IWRAPPER_2(unpackhi_epi16);
+
+// unpackhi_epi32 is written out by hand; it has the same shape SIMD_IFWRAPPER_2(unpackhi_epi32, unpackhi_ps) would generate.
+static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
+{
+    return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    // Native AVX-512F gather, the unmasked counterpart of the intrinsic
+    // used by mask_i32gather_ps in this file. No lane-by-lane scalar loop
+    // and no access to idx / the result through reinterpreted pointers.
+    //
+    // p      : base address
+    // idx    : one 32-bit index per lane; lane i is loaded from byte
+    //          offset idx[i] * ScaleT relative to p
+    // ScaleT : byte scale applied to every index
+    //
+    // Returns one float per lane, read from the addresses above.
+    return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
+}
+// Loads. load_ps / load_si use the aligned intrinsics; loadu_ps / loadu_si use the unaligned ones.
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return _mm512_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return _mm512_load_si512(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return _mm512_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return _mm512_loadu_si512(p);
+}
+
+// for each element: (mask & (1 << 31)) ? i32gather_ps<ScaleT>(p, idx) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    // k-mask = bit 31 of each lane (integer compare < 0); a float "!= 0" test would skip sign-bit-only lanes (-0.0f) and take 0x00000001.
+    __mmask16 k = _mm512_cmplt_epi32_mask(castps_si(mask), setzero_si());
+    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));     // unselected lanes keep their value from old
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)  // store the lanes of src whose mask sign bit is set; other floats at p are left untouched
+{
+    Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());   // lane selected when its int32 mask value is negative
+    _mm512_mask_storeu_ps(p, m, src);                       // unaligned form: p is not required to be 64-byte aligned
+}
+
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)     // bit i of the result is set when byte i of a is negative (int8)
+{
+    const __mmask64 negativeBytes = _mm512_cmplt_epi8_mask(a, setzero_si());
+    return static_cast<uint64_t>(negativeBytes);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)    // bit i of the result = sign bit of 64-bit lane i
+{
+    __mmask8 m = _mm512_cmp_epi64_mask(castpd_si(a), setzero_si(), _MM_CMPINT_LT);  // integer < 0, so all-ones (NaN) and -0.0 lanes count too
+    return static_cast<uint32_t>(m);
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)     // bit i of the result = sign bit of 32-bit lane i
+{
+    __mmask16 m = _mm512_cmplt_epi32_mask(castps_si(a), setzero_si());  // integer < 0, so all-ones (NaN) and -0.0f lanes count too
+    return static_cast<uint32_t>(m);
+}
+// Constant builders: set1_* replicate one scalar into every element, setzero_* return an all-zero vector of the named type.
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return _mm512_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return _mm512_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return _mm512_set1_ps(f);
+}
+
+static SIMDINLINE Double SIMDCALL setzero_pd()      // return 0 (double)
+{
+    return _mm512_setzero_pd();
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return _mm512_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return _mm512_setzero_si512();
+}
+// Stores. store_ps / store_si / stream_ps use the aligned intrinsics; storeu_si is the unaligned integer store.
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm512_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm512_store_si512(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
+{
+    _mm512_storeu_si512(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    _mm512_stream_ps(p, a);
+}
+// set_epi32 / set_ps: build a vector from scalars, passed in the same order as _mm512_set_* expects (i15 first, i0 last).
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return _mm512_set_epi32(
+        i15, i14, i13, i12, i11, i10, i9, i8,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+// 8-value overload: i7..i0 are forwarded as the last eight arguments, the eight preceding ones are zero.
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return set_epi32(
+        0, 0, 0, 0, 0, 0, 0, 0,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return _mm512_set_ps(
+        i15, i14, i13, i12, i11, i10, i9, i8,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+// 8-value overload: i7..i0 are forwarded as the last eight arguments, the eight preceding ones are zero.
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return set_ps(
+        0, 0, 0, 0, 0, 0, 0, 0,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)     // lane i = (bit i of mask) ? all ones : zero, for the low 16 bits of mask
+{
+    return castsi_ps(vmask(static_cast<__mmask16>(mask)));
+}
+#undef SIMD_WRAPPER_1_      // from here on: exactly one #undef per wrapper macro defined in this file, in definition order
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPERI_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_WRAPPER_2I_
+#undef SIMD_WRAPPER_2I
+#undef SIMD_DWRAPPER_2I_
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_4
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_CMP
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I_
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_EMU_IWRAPPER_2
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
new file mode 100644
index 0000000..3e36ce5
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
@@ -0,0 +1,27 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// Implement mask-enabled SIMD functions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
new file mode 100644
index 0000000..a45429f
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -0,0 +1,842 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX (1) implementation
+//============================================================================
+
+static const int TARGET_SIMD_WIDTH = 8;     // width of one emulation half; used below as the float-pointer stride and mask-bit shift between v8[0] and v8[1]
+using SIMD128T = SIMD128Impl::AVXImpl;      // 128-bit implementation; used below by cvtepu8_epi32 / cvtepu16_epi64 for srli_si
+// SIMD_WRAPPER_1(op): defines Float op(Float a) that applies SIMD256T::op to each half (a.v8[0], a.v8[1]).
+#define SIMD_WRAPPER_1(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::op(a.v8[0]),\
+            SIMD256T::op(a.v8[1]),\
+        };\
+    }
+// SIMD_WRAPPER_2(op): defines Float op(Float a, Float b) that applies SIMD256T::op to the matching halves of a and b.
+#define SIMD_WRAPPER_2(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1]),\
+        };\
+    }
+// SIMD_WRAPPER_2I(op): Float op<ImmT>(a, b); ImmT bits [7:0] go to the v8[0] call, bits [15:8] (ImmT >> TARGET_SIMD_WIDTH) to the v8[1] call.
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+        };\
+    }
+// SIMD_WRAPPER_2I_1(op): Float op<ImmT>(a, b); the same, unmodified ImmT is passed to both half-width calls.
+#define SIMD_WRAPPER_2I_1(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+        };\
+    }
+// SIMD_WRAPPER_3(op): defines Float op(Float a, Float b, Float c) that applies SIMD256T::op to the matching halves of a, b and c.
+#define SIMD_WRAPPER_3(op)  \
+        static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+        {\
+            return Float\
+            {\
+                SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+                SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+            };\
+        }
+// SIMD_IWRAPPER_1(op): defines Integer op(Integer a) that applies SIMD256T::op to each half (a.v8[0], a.v8[1]).
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0]),\
+            SIMD256T::op(a.v8[1]),\
+        };\
+    }
+// SIMD_IWRAPPER_2(op): defines Integer op(Integer a, Integer b) that applies SIMD256T::op to the matching halves of a and b.
+#define SIMD_IWRAPPER_2(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1]),\
+        };\
+    }
+// SIMD_IWRAPPER_2I(op): Integer op<ImmT>(a, b); ImmT bits [7:0] go to the v8[0] call, bits [15:8] (ImmT >> TARGET_SIMD_WIDTH) to the v8[1] call.
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+        };\
+    }
+// SIMD_IWRAPPER_2I_1(op): Integer op<ImmT>(a, b); the same, unmodified ImmT is passed to both half-width calls.
+#define SIMD_IWRAPPER_2I_1(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+        };\
+    }
+// SIMD_IWRAPPER_2I_2(op): Integer op<ImmT>(a, b); ImmT bits [3:0] go to the v8[0] call, bits [7:4] (ImmT >> 4) to the v8[1] call.
+#define SIMD_IWRAPPER_2I_2(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
+        };\
+    }
+// SIMD_IWRAPPER_3(op): defines Integer op(Integer a, Integer b, Integer c) that applies SIMD256T::op to the matching halves of a, b and c.
+#define SIMD_IWRAPPER_3(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+        };\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)      // forwards each half of a to SIMD256T::round_ps with rounding mode RMT
+{
+    return Float
+    {
+        SIMD256T::template round_ps<RMT>(a.v8[0]),
+        SIMD256T::template round_ps<RMT>(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }      // round_ps specialised to RoundMode::CEIL_NOEXC
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }    // round_ps specialised to RoundMode::FLOOR_NOEXC
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) -- NOTE(review): if SIMD256T::mul_epi32 mirrors _mm256_mul_epi32, only the low int32 of each 64-bit lane is multiplied and the result is int64; confirm
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps);     // return a & b       (float treated as int)
+SIMD_IWRAPPER_2(and_si);    // return a & b       (int)
+SIMD_WRAPPER_2(andnot_ps);  // return (~a) & b    (float treated as int)
+SIMD_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
+SIMD_WRAPPER_2(or_ps);      // return a | b       (float treated as int)
+SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
+SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
+SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a)      // return a << ImmT
+{
+    return Integer
+    {
+        SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
+        SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
+    };
+}
+
+SIMD_IWRAPPER_2(sllv_epi32);                                // return a << b      (uint32)
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a)      // return a >> ImmT   (int32)
+{
+    return Integer
+    {
+        SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
+        SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
+    };
+}
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a)      // return a >> ImmT   (uint32)
+{
+    return Integer
+    {
+        SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
+        SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
+    };
+}
+
+template<int ImmT>                                          // for each 128-bit lane:
+static SIMDINLINE Integer SIMDCALL srli_si(Integer a)         //  return a >> (ImmT*8) (uint)
+{
+    return Integer
+    {
+        SIMD256T::template srli_si<ImmT>(a.v8[0]),
+        SIMD256T::template srli_si<ImmT>(a.v8[1]),
+    };
+}
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)       // same as srli_si, but with Float cast to int
+{
+    return Float
+    {
+        SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
+        SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
+    };
+}
+
+SIMD_IWRAPPER_2(srlv_epi32);                                // return a >> b      (uint32)
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)              // return *(Float*)(&a)
+{
+    return Float
+    {
+        SIMD256T::castpd_ps(a.v8[0]),
+        SIMD256T::castpd_ps(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)              // return *(Integer*)(&a)
+{
+    return Integer
+    {
+        SIMD256T::castps_si(a.v8[0]),
+        SIMD256T::castps_si(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)              // return *(Double*)(&a)
+{
+    return Double
+    {
+        SIMD256T::castsi_pd(a.v8[0]),
+        SIMD256T::castsi_pd(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+{
+    return Double
+    {
+        SIMD256T::castps_pd(a.v8[0]),
+        SIMD256T::castps_pd(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)              // return *(Float*)(&a)
+{
+    return Float
+    {
+        SIMD256T::castsi_ps(a.v8[0]),
+        SIMD256T::castsi_ps(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a)            // return (float)a    (int32 --> float)
+{
+    return Float
+    {
+        SIMD256T::cvtepi32_ps(a.v8[0]),
+        SIMD256T::cvtepi32_ps(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a)          // return (int16)a    (uint8 --> int16)
+{
+    return Integer
+    {
+        SIMD256T::cvtepu8_epi16(a.v4[0]),   // source half v4[0] widens into result half v8[0]
+        SIMD256T::cvtepu8_epi16(a.v4[1]),   // source half v4[1] widens into result half v8[1]
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a)          // return (int32)a    (uint8 --> int32)
+{
+    return Integer
+	{
+        SIMD256T::cvtepu8_epi32(a.v4[0]),                                   // result v8[0]: converted from a.v4[0] as-is
+        SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),    // result v8[1]: a.v4[0] is first moved down by srli_si<8> (presumably 8 bytes); a.v4[1] is never read
+	};
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a)         // return (int32)a    (uint16 --> int32)
+{
+    return Integer
+    {
+        SIMD256T::cvtepu16_epi32(a.v4[0]),  // source half v4[0] widens into result half v8[0]
+        SIMD256T::cvtepu16_epi32(a.v4[1]),  // source half v4[1] widens into result half v8[1]
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a)         // return (int64)a    (uint16 --> int64)
+{
+    return Integer
+    {
+        SIMD256T::cvtepu16_epi64(a.v4[0]),                                  // result v8[0]: converted from a.v4[0] as-is
+        SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),   // result v8[1]: a.v4[0] is first moved down by srli_si<8> (presumably 8 bytes); a.v4[1] is never read
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a)         // return (int64)a    (uint32 --> int64)
+{
+    return Integer
+    {
+        SIMD256T::cvtepu32_epi64(a.v4[0]),  // source half v4[0] widens into result half v8[0]
+        SIMD256T::cvtepu32_epi64(a.v4[1]),  // source half v4[1] widens into result half v8[1]
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+{
+    return Integer
+    {
+        SIMD256T::cvtps_epi32(a.v8[0]),
+        SIMD256T::cvtps_epi32(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return Integer
+    {
+        SIMD256T::cvttps_epi32(a.v8[0]),    // must be the truncating (cvtt) conversion; cvtps_epi32 would round instead of truncating toward zero
+        SIMD256T::cvttps_epi32(a.v8[1]),
+    };
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+    return Float
+    {
+        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
+        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
+    };
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+template<CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)     // return the cmp_ps<CmpTypeT>(a, b) result collapsed by movemask_ps into a Mask
+{
+    return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
+}
+
+
+SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+    return  0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &     // bitwise AND of the two per-half results:
+                  SIMD256T::testz_ps(a.v8[1], b.v8[1]));     // true only when both halves report non-zero
+}
+
+static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return  0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &     // bitwise AND of the two per-half results:
+                  SIMD256T::testz_si(a.v8[1], b.v8[1]));     // yields 1 only when both halves report non-zero (returned as int, unlike testz_ps)
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
+SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
+SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return Integer
+    {
+        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
+        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return Integer
+    {
+        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
+        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)         // return *p (all elements in vector get same value)
+{
+    float f = *p;
+    return Float
+    {
+        SIMD256T::set1_ps(f),
+        SIMD256T::set1_ps(f),
+    };
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)      // return half 'imm' of a (0 -> a.v8[0], 1 -> a.v8[1])
+{
+    static_assert(imm == 0 || imm == 1, "Invalid control code");       // imm is a template constant, so an out-of-range index is rejected at compile time
+    return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)    // return half 'imm' of a (0 -> a.v8[0], 1 -> a.v8[1])
+{
+    static_assert(imm == 0 || imm == 1, "Invalid control code");       // imm is a template constant, so an out-of-range index is rejected at compile time
+    return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)  // return half 'imm' of a (0 -> a.v8[0], 1 -> a.v8[1])
+{
+    static_assert(imm == 0 || imm == 1, "Invalid control code");       // imm is a template constant, so an out-of-range index is rejected at compile time
+    return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)      // return a with half 'imm' (0 or 1) replaced by b
+{
+    static_assert(imm == 0 || imm == 1, "Invalid control code");               // imm is a template constant, so an out-of-range index is rejected at compile time
+    a.v8[imm] = b;                                                             // a is a by-value copy; the caller's vector is untouched
+    return a;
+}
+
+template<int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)   // return a with half 'imm' (0 or 1) replaced by b
+{
+    static_assert(imm == 0 || imm == 1, "Invalid control code");               // imm is a template constant, so an out-of-range index is rejected at compile time
+    a.v8[imm] = b;                                                             // a is a by-value copy; the caller's vector is untouched
+    return a;
+}
+
+template<int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)    // return a with half 'imm' (0 or 1) replaced by b
+{
+    static_assert(imm == 0 || imm == 1, "Invalid control code");                   // imm is a template constant, so an out-of-range index is rejected at compile time
+    a.v8[imm] = b;                                                                 // a is a by-value copy; the caller's vector is untouched
+    return a;
+}
+
+SIMD_IWRAPPER_2(packs_epi16);      // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);      // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);     // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);     // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+    // Scalar fallback: view the vectors as arrays of 32-bit lanes and copy one lane at a time.
+    Integer permuted;
+    uint32_t *dstLanes = reinterpret_cast<uint32_t *>(&permuted);
+    uint32_t const *srcLanes = reinterpret_cast<uint32_t const*>(&a);
+    uint32_t const *selectors = reinterpret_cast<uint32_t const*>(&swiz);
+
+    for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
+    {
+        uint32_t const srcIndex = 0xF & selectors[lane];    // only the low 4 selector bits are honoured
+        dstLanes[lane] = srcLanes[srcIndex];
+    }
+
+    return permuted;
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    // Scalar fallback: view the vectors as arrays of 32-bit lanes and copy one lane at a time.
+    Float permuted;
+    float *dstLanes = reinterpret_cast<float *>(&permuted);
+    float const *srcLanes = reinterpret_cast<float const*>(&a);
+    uint32_t const *selectors = reinterpret_cast<uint32_t const*>(&swiz);
+
+    for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
+    {
+        uint32_t const srcIndex = 0xF & selectors[lane];    // only the low 4 selector bits are honoured
+        dstLanes[lane] = srcLanes[srcIndex];
+    }
+
+    return permuted;
+}
+
+// All of the 512-bit permute2f128_XX intrinsics do the following:
+//
+//      SELECT4(src, control) {
+//          CASE(control[1:0])
+//              0: tmp[127:0] := src[127:0]
+//              1: tmp[127:0] := src[255:128]
+//              2: tmp[127:0] := src[383:256]
+//              3: tmp[127:0] := src[511:384]
+//          ESAC
+//          RETURN tmp[127:0]
+//      }
+//
+//      dst[127:0]   := SELECT4(a[511:0], imm8[1:0])
+//      dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+//      dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+//      dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+//      dst[MAX:512] := 0
+//
+// Since the 256-bit AVX instructions use a 4-bit control field (instead
+// of 2-bit for AVX512), we need to expand the control bits sent to the
+// AVX instructions for emulation.
+//
+template <int shuf>
+static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
+{
+    return Float
+    {
+        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),   // result v8[0], taken from a: shuf[1:0] -> control bits [1:0], shuf[3:2] -> control bits [5:4]
+        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),   // result v8[1], taken from b: shuf[5:4] -> control bits [1:0], shuf[7:6] -> control bits [5:4]
+    };
+}
+
+template <int shuf>
+static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
+{
+    return Double
+    {
+        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),   // result v8[0], taken from a: shuf[1:0] -> control bits [1:0], shuf[3:2] -> control bits [5:4]
+        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),   // result v8[1], taken from b: shuf[5:4] -> control bits [1:0], shuf[7:6] -> control bits [5:4]
+    };
+}
+
+template <int shuf>
+static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
+{
+    return Integer
+	{
+        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),   // result v8[0], taken from a: shuf[1:0] -> control bits [1:0], shuf[3:2] -> control bits [5:4]
+        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),   // result v8[1], taken from b: shuf[5:4] -> control bits [1:0], shuf[7:6] -> control bits [5:4]
+	};
+}
+
+SIMD_IWRAPPER_2I_1(shuffle_epi32);  // same ImmT is applied to both halves
+SIMD_IWRAPPER_2I_2(shuffle_epi64);  // ImmT is split: bits [3:0] -> v8[0] call, bits [7:4] -> v8[1] call
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_WRAPPER_2I_1(shuffle_pd);      // NOTE(review): same ImmT for both halves (shuffle_epi64 splits it) and Float-typed signature for a _pd op -- confirm intended
+SIMD_WRAPPER_2I_1(shuffle_ps);      // same ImmT is applied to both halves
+SIMD_IWRAPPER_2(unpackhi_epi16);
+SIMD_IWRAPPER_2(unpackhi_epi32);
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_WRAPPER_2(unpackhi_pd);        // NOTE(review): generated with the Float signature, not Double -- confirm intended
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IWRAPPER_2(unpacklo_epi32);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_WRAPPER_2(unpacklo_pd);        // NOTE(review): generated with the Float signature, not Double -- confirm intended
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return Float
+    {
+        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
+        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return Float
+    {
+        SIMD256T::load_ps(p),
+        SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return Integer
+    {
+        SIMD256T::load_si(&p->v8[0]),
+        SIMD256T::load_si(&p->v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return Float
+    {
+        SIMD256T::loadu_ps(p),
+        SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return Integer
+    {
+        SIMD256T::loadu_si(&p->v8[0]),
+        SIMD256T::loadu_si(&p->v8[1]),
+    };
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    return Float
+    {
+        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
+        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
+    };
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)    // forwards each half to SIMD256T::maskstore_ps
+{
+    SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);                              // first half: destination starts at p
+    SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);          // second half: destination starts TARGET_SIMD_WIDTH floats later
+}
+
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)      // merge the two per-half SIMD256T::movemask_epi8 results into one 64-bit mask
+{
+    uint64_t const lowerBits = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
+    uint64_t const upperBits = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1]));
+
+    return lowerBits | (upperBits << (TARGET_SIMD_WIDTH * 4));    // v8[1] bits start at bit 32
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)         // merge the two per-half SIMD256T::movemask_pd results into one mask
+{
+    uint32_t const lowerBits = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
+    uint32_t const upperBits = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1]));
+
+    return lowerBits | (upperBits << (TARGET_SIMD_WIDTH / 2));    // v8[1] bits start at bit 4
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)          // merge the two per-half SIMD256T::movemask_ps results into one mask
+{
+    uint32_t const lowerBits = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
+    uint32_t const upperBits = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1]));
+
+    return lowerBits | (upperBits << TARGET_SIMD_WIDTH);          // v8[1] bits start at bit 8
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return Integer
+    {
+        SIMD256T::set1_epi32(i),
+        SIMD256T::set1_epi32(i)
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return Integer
+    {
+        SIMD256T::set1_epi8(i),
+        SIMD256T::set1_epi8(i)
+    };
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return Float
+    {
+        SIMD256T::set1_ps(f),
+        SIMD256T::set1_ps(f)
+    };
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return Float
+    {
+        SIMD256T::setzero_ps(),
+        SIMD256T::setzero_ps()
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return Integer
+    {
+        SIMD256T::setzero_si(),
+        SIMD256T::setzero_si()
+    };
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    SIMD256T::store_ps(p, a.v8[0]);
+    SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    SIMD256T::store_si(&p->v8[0], a.v8[0]);
+    SIMD256T::store_si(&p->v8[1], a.v8[1]);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    SIMD256T::stream_ps(p, a.v8[0]);
+    SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(                                  // build a vector from 16 ints, listed i15 first and i0 last
+    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return Integer
+    {
+        SIMD256T::set_epi32(                                                   // i7..i0 populate the first half (v8[0])
+            i7, i6, i5, i4, i3, i2, i1, i0),
+        SIMD256T::set_epi32(                                                   // i15..i8 populate the second half (v8[1])
+            i15, i14, i13, i12, i11, i10, i9, i8)
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(                                  // 8-value overload: i7..i0 fill v8[0], v8[1] is set from zeros
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return set_epi32(
+        0, 0, 0, 0, 0, 0, 0, 0,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(                                       // build a vector from 16 floats, listed i15 first and i0 last
+    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return Float
+    {
+        SIMD256T::set_ps(                                                      // i7..i0 populate the first half (v8[0])
+            i7, i6, i5, i4, i3, i2, i1, i0),
+        SIMD256T::set_ps(                                                      // i15..i8 populate the second half (v8[1])
+            i15, i14, i13, i12, i11, i10, i9, i8)
+    };
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(                                       // 8-value overload: i7..i0 fill v8[0], v8[1] is set from zeros
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return set_ps(
+        0, 0, 0, 0, 0, 0, 0, 0,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)     // expand the low 16 bits of mask into a per-element vector mask (returned as Float)
+{
+    Integer vec = set1_epi32(mask);                         // every element starts as a copy of the whole bit mask
+    const Integer bit = set_epi32(                          // one distinct single-bit constant per element, 0x0001 (last argument) up to 0x8000 (first)
+        0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
+        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec = and_si(vec, bit);                                 // keep only each element's own bit
+    vec = cmplt_epi32(setzero_si(), vec);                   // 0 < vec: compare result marks elements whose bit was set
+    return castsi_ps(vec);
+}
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_2I_1
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_1
+#undef SIMD_IWRAPPER_3
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
new file mode 100644
index 0000000..bc5bff4
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
@@ -0,0 +1,28 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// no backwards compatibility for simd mask-enabled functions
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
new file mode 100644
index 0000000..df2df1b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
@@ -0,0 +1,428 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+#if 0
+//===========================================================================
+// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
+//===========================================================================
+struct SIMD256 // or SIMD4 or SIMD16
+{
+    //=======================================================================
+    // SIMD Types
+    //
+    // These typedefs are examples. The SIMD256 and SIMD16 implementations will
+    // use different base types with this same naming.
+    using Float     = __m256;  // Packed single-precision float vector
+    using Double    = __m256d; // Packed double-precision float vector
+    using Integer   = __m256i; // Packed integer vector (mutable element widths)
+    using Mask      = uint8_t; // Integer representing mask bits
+
+    //=======================================================================
+    // Standard interface
+    // (available in both SIMD256 and SIMD16 widths)
+    //=======================================================================
+
+    //-----------------------------------------------------------------------
+    // Single precision floating point arithmetic operations
+    //-----------------------------------------------------------------------
+    static Float    add_ps(Float a, Float b);               // return a + b
+    static Float    div_ps(Float a, Float b);               // return a / b
+    static Float    fmadd_ps(Float a, Float b, Float c);    // return (a * b) + c
+    static Float    fmsub_ps(Float a, Float b, Float c);    // return (a * b) - c
+    static Float    max_ps(Float a, Float b);               // return (a > b) ? a : b
+    static Float    min_ps(Float a, Float b);               // return (a < b) ? a : b
+    static Float    mul_ps(Float a, Float b);               // return a * b
+    static Float    rcp_ps(Float a);                        // return 1.0f / a
+    static Float    rsqrt_ps(Float a);                      // return 1.0f / sqrt(a)
+    static Float    sub_ps(Float a, Float b);               // return a - b
+
+    enum class RoundMode
+    {
+        TO_NEAREST_INT  = 0x00, // Round to nearest integer, ties to even (cf. _MM_FROUND_TO_NEAREST_INT)
+        TO_NEG_INF      = 0x01, // Round to negative infinity
+        TO_POS_INF      = 0x02, // Round to positive infinity
+        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
+        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
+
+        RAISE_EXC       = 0x00, // Raise exception on overflow
+        NO_EXC          = 0x08, // Suppress exceptions
+
+        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
+        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
+        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
+        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
+        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
+        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
+        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
+        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
+        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
+        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
+    };
+
+    // return round_func(a)
+    //
+    // round_func is chosen on the RMT template parameter.  See the documentation
+    // for the RoundMode enumeration above.
+    template <RoundMode RMT>
+    static Float    round_ps(Float a);                  // return round(a)
+
+
+    //-----------------------------------------------------------------------
+    // Integer (various width) arithmetic operations
+    //-----------------------------------------------------------------------
+    static Integer  abs_epi32(Integer a);               // return absolute_value(a) (int32)
+    static Integer  add_epi32(Integer a, Integer b);    // return a + b (int32)
+    static Integer  add_epi8(Integer a, Integer b);     // return a + b (int8)
+    static Integer  adds_epu8(Integer a, Integer b);    // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+    static Integer  max_epi32(Integer a, Integer b);    // return (a > b) ? a : b (int32)
+    static Integer  max_epu32(Integer a, Integer b);    // return (a > b) ? a : b (uint32)
+    static Integer  min_epi32(Integer a, Integer b);    // return (a < b) ? a : b (int32)
+    static Integer  min_epu32(Integer a, Integer b);    // return (a < b) ? a : b (uint32)
+    static Integer  mul_epi32(Integer a, Integer b);    // return a * b (int32)
+
+    // return (a * b) & 0xFFFFFFFF
+    //
+    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+    // and store the low 32 bits of the intermediate integers in dst.
+    static Integer  mullo_epi32(Integer a, Integer b);
+
+    static Integer  sub_epi32(Integer a, Integer b);    // return a - b (int32)
+    static Integer  sub_epi64(Integer a, Integer b);    // return a - b (int64)
+    static Integer  subs_epu8(Integer a, Integer b);    // return (b > a) ? 0 : (a - b) (uint8)
+
+    //-----------------------------------------------------------------------
+    // Logical operations
+    //-----------------------------------------------------------------------
+    static Float    and_ps(Float a, Float b);           // return a & b       (float treated as int)
+    static Integer  and_si(Integer a, Integer b);       // return a & b       (int)
+    static Float    andnot_ps(Float a, Float b);        // return (~a) & b    (float treated as int)
+    static Integer  andnot_si(Integer a, Integer b);    // return (~a) & b    (int)
+    static Float    or_ps(Float a, Float b);            // return a | b       (float treated as int)
+    static Integer  or_si(Integer a, Integer b);        // return a | b       (int)
+    static Float    xor_ps(Float a, Float b);           // return a ^ b       (float treated as int)
+    static Integer  xor_si(Integer a, Integer b);       // return a ^ b       (int)
+
+    //-----------------------------------------------------------------------
+    // Shift operations
+    //-----------------------------------------------------------------------
+    template<int ImmT>
+    static Integer  slli_epi32(Integer a);              // return a << ImmT
+    static Integer  sllv_epi32(Integer a, Integer b);   // return a << b
+    template<int ImmT>
+    static Integer  srai_epi32(Integer a);              // return a >> ImmT   (int32)
+    template<int ImmT>
+    static Integer  srli_epi32(Integer a);              // return a >> ImmT   (uint32)
+    template<int ImmT>                                  // for each 128-bit lane:
+    static Integer  srli_si(Integer a);                 //  return a >> (ImmT*8) (uint)
+    template<int ImmT>
+    static Float    srlisi_ps(Float a);                 // same as srli_si, but with Float cast to int
+    static Integer  srlv_epi32(Integer a, Integer b);   // return a >> b      (uint32)
+
+    //-----------------------------------------------------------------------
+    // Conversion operations
+    //-----------------------------------------------------------------------
+    static Float    castpd_ps(Double a);                // return *(Float*)(&a)
+    static Integer  castps_si(Float a);                 // return *(Integer*)(&a)
+    static Double   castsi_pd(Integer a);               // return *(Double*)(&a)
+    static Double   castps_pd(Float a);                 // return *(Double*)(&a)
+    static Float    castsi_ps(Integer a);               // return *(Float*)(&a)
+    static Float    cvtepi32_ps(Integer a);             // return (float)a    (int32 --> float)
+    static Integer  cvtepu8_epi16(Integer a);           // return (int16)a    (uint8 --> int16)
+    static Integer  cvtepu8_epi32(Integer a);           // return (int32)a    (uint8 --> int32)
+    static Integer  cvtepu16_epi32(Integer a);          // return (int32)a    (uint16 --> int32)
+    static Integer  cvtepu16_epi64(Integer a);          // return (int64)a    (uint16 --> int64)
+    static Integer  cvtepu32_epi64(Integer a);          // return (int64)a    (uint32 --> int64)
+    static Integer  cvtps_epi32(Float a);               // return (int32)a    (float --> int32)
+    static Integer  cvttps_epi32(Float a);              // return (int32)a    (rnd_to_zero(float) --> int32)
+
+    //-----------------------------------------------------------------------
+    // Comparison operations
+    //-----------------------------------------------------------------------
+
+    // Comparison types used with cmp_ps:
+    //   - ordered comparisons are always false if either operand is NaN
+    //   - unordered comparisons are always true if either operand is NaN
+    //   - signaling comparisons raise an exception if either operand is NaN
+    //   - non-signaling comparisons will never raise an exception
+    //
+    // Ordered:     return (a != NaN) && (b != NaN) && (a cmp b)
+    // Unordered:   return (a == NaN) || (b == NaN) || (a cmp b)
+    enum class CompareType
+    {
+        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
+        LT_OS      = 0x01, // Less-than (ordered, signaling)
+        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
+        UNORD_Q    = 0x03, // Unordered (nonsignaling)
+        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
+        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
+        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
+        ORD_Q      = 0x07, // Ordered (nonsignaling)
+        EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
+        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
+        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
+        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
+        NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
+        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
+        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
+        TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
+        EQ_OS      = 0x10, // Equal (ordered, signaling)
+        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
+        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
+        UNORD_S    = 0x13, // Unordered (signaling)
+        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
+        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
+        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
+        ORD_S      = 0x17, // Ordered (signaling)
+        EQ_US      = 0x18, // Equal (unordered, signaling)
+        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
+        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
+        FALSE_OS   = 0x1B, // False (ordered, signaling)
+        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
+        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
+        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
+        TRUE_US    = 0x1F, // True (unordered, signaling)
+    };
+
+    // return a (CmpTypeT) b (float)
+    //
+    // See documentation for CompareType above for valid values for CmpTypeT.
+    template<CompareType CmpTypeT>
+    static Float    cmp_ps(Float a, Float b);           // return a (CmpTypeT) b (see above)
+    static Float    cmpgt_ps(Float a, Float b);         // return cmp_ps<CompareType::GT_OQ>(a, b)
+    static Float    cmple_ps(Float a, Float b);         // return cmp_ps<CompareType::LE_OQ>(a, b)
+    static Float    cmplt_ps(Float a, Float b);         // return cmp_ps<CompareType::LT_OQ>(a, b)
+    static Float    cmpneq_ps(Float a, Float b);        // return cmp_ps<CompareType::NEQ_OQ>(a, b)
+    static Float    cmpeq_ps(Float a, Float b);         // return cmp_ps<CompareType::EQ_OQ>(a, b)
+    static Float    cmpge_ps(Float a, Float b);         // return cmp_ps<CompareType::GE_OQ>(a, b)
+    static Integer  cmpeq_epi8(Integer a, Integer b);   // return a == b (int8)
+    static Integer  cmpeq_epi16(Integer a, Integer b);  // return a == b (int16)
+    static Integer  cmpeq_epi32(Integer a, Integer b);  // return a == b (int32)
+    static Integer  cmpeq_epi64(Integer a, Integer b);  // return a == b (int64)
+    static Integer  cmpgt_epi8(Integer a, Integer b);   // return a > b (int8)
+    static Integer  cmpgt_epi16(Integer a, Integer b);  // return a > b (int16)
+    static Integer  cmpgt_epi32(Integer a, Integer b);  // return a > b (int32)
+    static Integer  cmpgt_epi64(Integer a, Integer b);  // return a > b (int64)
+    static Integer  cmplt_epi32(Integer a, Integer b);  // return a < b (int32)
+    static bool     testz_ps(Float a, Float b);         // return all_lanes_zero(a & b) ? 1 : 0 (float)
+    static bool     testz_si(Integer a, Integer b);     // return all_lanes_zero(a & b) ? 1 : 0 (int)
+
+    //-----------------------------------------------------------------------
+    // Blend / shuffle / permute operations
+    //-----------------------------------------------------------------------
+    template<int ImmT>
+    static Float    blend_ps(Float a, Float b);                     // return ImmT ? b : a  (float)
+    static Integer  blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
+    static Float    blendv_ps(Float a, Float b, Float mask);        // return mask ? b : a (float)
+    static Float    broadcast_ss(float const *p);                   // return *p (all elements in vector get same value)
+    static Integer  packs_epi16(Integer a, Integer b);              // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+    static Integer  packs_epi32(Integer a, Integer b);              // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+    static Integer  packus_epi16(Integer a, Integer b);             // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+    static Integer  packus_epi32(Integer a, Integer b);             // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+    static Integer  permute_epi32(Integer a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (int32)
+    static Float    permute_ps(Float a, Integer swiz);              // return a[swiz[i]] for each 32-bit lane i (float)
+    template<int SwizT>
+    static Integer  shuffle_epi32(Integer a, Integer b);
+    template<int SwizT>
+    static Integer  shuffle_epi64(Integer a, Integer b);
+    static Integer  shuffle_epi8(Integer a, Integer b);
+    template<int SwizT>
+    static Double   shuffle_pd(Double a, Double b);
+    template<int SwizT>
+    static Float    shuffle_ps(Float a, Float b);
+    static Integer  unpackhi_epi16(Integer a, Integer b);
+    static Integer  unpackhi_epi32(Integer a, Integer b);
+    static Integer  unpackhi_epi64(Integer a, Integer b);
+    static Integer  unpackhi_epi8(Integer a, Integer b);
+    static Double   unpackhi_pd(Double a, Double b);
+    static Float    unpackhi_ps(Float a, Float b);
+    static Integer  unpacklo_epi16(Integer a, Integer b);
+    static Integer  unpacklo_epi32(Integer a, Integer b);
+    static Integer  unpacklo_epi64(Integer a, Integer b);
+    static Integer  unpacklo_epi8(Integer a, Integer b);
+    static Double   unpacklo_pd(Double a, Double b);
+    static Float    unpacklo_ps(Float a, Float b);
+
+    //-----------------------------------------------------------------------
+    // Load / store operations
+    //-----------------------------------------------------------------------
+    enum class ScaleFactor
+    {
+        SF_1,   // No scaling
+        SF_2,   // Scale offset by 2
+        SF_4,   // Scale offset by 4
+        SF_8,   // Scale offset by 8
+    };
+
+    template<ScaleFactor ScaleT>
+    static Float    i32gather_ps(float const* p, Integer idx);  // return *(float*)(((int8*)p) + (idx * ScaleT))
+    static Float    load1_ps(float const *p);                   // return *p    (broadcast 1 value to all elements)
+    static Float    load_ps(float const *p);                    // return *p    (loads SIMD width elements from memory)
+    static Integer  load_si(Integer const *p);                  // return *p
+    static Float    loadu_ps(float const *p);                   // return *p    (same as load_ps but allows for unaligned mem)
+    static Integer  loadu_si(Integer const *p);                 // return *p    (same as load_si but allows for unaligned mem)
+
+    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+    template<ScaleFactor ScaleT>
+    static Float    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
+
+    static void     maskstore_ps(float *p, Integer mask, Float src);
+    static int      movemask_epi8(Integer a);
+    static int      movemask_pd(Double a);
+    static int      movemask_ps(Float a);
+    static Integer  set1_epi32(int i);                          // return i (all elements are same value)
+    static Integer  set1_epi8(char i);                          // return i (all elements are same value)
+    static Float    set1_ps(float f);                           // return f (all elements are same value)
+    static Float    setzero_ps();                               // return 0 (float)
+    static Integer  setzero_si();                               // return 0 (integer)
+    static void     store_ps(float *p, Float a);                // *p = a   (stores all elements contiguously in memory)
+    static void     store_si(Integer *p, Integer a);            // *p = a
+    static void     stream_ps(float *p, Float a);               // *p = a   (same as store_ps, but doesn't keep memory in cache)
+
+    //=======================================================================
+    // Legacy interface (available only in SIMD256 width)
+    //=======================================================================
+
+    static Float    broadcast_ps(__m128 const *p);
+    template<int ImmT>
+    static __m128d  extractf128_pd(Double a);
+    template<int ImmT>
+    static __m128   extractf128_ps(Float a);
+    template<int ImmT>
+    static __m128i  extractf128_si(Integer a);
+    template<int ImmT>
+    static Double   insertf128_pd(Double a, __m128d b);
+    template<int ImmT>
+    static Float    insertf128_ps(Float a, __m128 b);
+    template<int ImmT>
+    static Integer  insertf128_si(Integer a, __m128i b);
+    static Integer  loadu2_si(__m128i const* phi, __m128i const* plo);
+    template<int ImmT>
+    static Double   permute2f128_pd(Double a, Double b);
+    template<int ImmT>
+    static Float    permute2f128_ps(Float a, Float b);
+    template<int ImmT>
+    static Integer  permute2f128_si(Integer a, Integer b);
+    static Integer  set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
+    static void     storeu2_si(__m128i *phi, __m128i *plo, Integer src);
+
+    //=======================================================================
+    // Advanced masking interface (currently available only in SIMD16 width)
+    //=======================================================================
+
+
+    //=======================================================================
+    // Extended Utility Functions (common to SIMD256 and SIMD16)
+    //=======================================================================
+
+    //-----------------------------------------------------------------------
+    // Extended Types
+    //-----------------------------------------------------------------------
+
+    // Vec4, an SOA SIMD set of 4-dimensional vectors
+    union Vec4
+    {
+        Vec4() = default;
+        Vec4(Float in)
+        {
+            s.x = in;
+            s.y = in;
+            s.z = in;
+            s.w = in;
+        }
+        Vec4(Float x, Float y, Float z, Float w)
+        {
+            s.x = x;
+            s.y = y;
+            s.z = z;
+            s.w = w;
+        }
+
+        Float      v[4];
+        Integer    vi[4];
+        struct
+        {
+            Float  x;
+            Float  y;
+            Float  z;
+            Float  w;
+        } s;
+        Float& operator[] (const int i) { return v[i]; }
+        Float const & operator[] (const int i) const { return v[i]; }
+    };
+
+    //-----------------------------------------------------------------------
+    // Extended Functions
+    //-----------------------------------------------------------------------
+    static void     vec4_set1_ps(Vec4& r, const float *p);                  // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
+    static void     vec4_set1_vps(Vec4& r, Float s);                        // r[0] = s, r[1] = s, ...
+    static Float    vec4_dp3_ps(const Vec4& v0, const Vec4& v1);            // return dp3(v0, v1)
+    static Float    vec4_dp4_ps(const Vec4& v0, const Vec4& v1);            // return dp4(v0, v1)
+    static Float    vec4_rcp_length_ps(const Vec4& v);                      // return 1.0f / sqrt(dp4(v, v))
+    static void     vec4_normalize_ps(Vec4& r, const Vec4& v);              // r = v * rcp_length(v)
+    static void     vec4_mul_ps(Vec4& r, const Vec4& v, Float s);           // r = v * set1_vps(s)
+    static void     vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1);   // r = v0 * v1
+    static void     vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1);   // r = v0 + v1
+    static void     vec4_min_ps(Vec4& r, const Vec4& v0, Float s);          // r = (v0 < s) ? v0 : s
+    static void     vec4_max_ps(Vec4& r, const Vec4& v0, Float s);          // r = (v0 > s) ? v0 : s
+
+    // Matrix4x4 * Vector4
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
+    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
+    static void mat4x4_vec4_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+
+    // Matrix4x4 * Vector3 - Direction Vector where w = 0.
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
+    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
+    static void mat3x3_vec3_w0_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+
+    // Matrix4x4 * Vector3 - Position vector where w = 1.
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
+    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
+    static void mat4x4_vec3_w1_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+
+    // Matrix4x3 * Vector3 - Position vector where w = 1.
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
+    //   result.s.w = 1
+    static void mat4x3_vec3_w1_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+};
+#endif // #if 0
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
new file mode 100644
index 0000000..bc23867
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
@@ -0,0 +1,377 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+
+#if !defined(__cplusplus)
+#error C++ compilation required
+#endif
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdint.h>
+
+#define SIMD_ARCH_AVX       0
+#define SIMD_ARCH_AVX2      1
+#define SIMD_ARCH_AVX512    2
+
+#if !defined(SIMD_ARCH)
+#define SIMD_ARCH SIMD_ARCH_AVX
+#endif
+
+#if defined(_MSC_VER)
+#define SIMDCALL __vectorcall
+#define SIMDINLINE __forceinline
+#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
+#else
+#define SIMDCALL
+#define SIMDINLINE inline
+#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
+#endif
+
+// For documentation, please see the following include...
+// #include "simdlib_interface.hpp"
+
+namespace SIMDImpl
+{
+    enum class CompareType
+    {
+        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
+        LT_OS      = 0x01, // Less-than (ordered, signaling)
+        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
+        UNORD_Q    = 0x03, // Unordered (nonsignaling)
+        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
+        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
+        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
+        ORD_Q      = 0x07, // Ordered (nonsignaling)
+        EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
+        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
+        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
+        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
+        NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
+        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
+        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
+        TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
+        EQ_OS      = 0x10, // Equal (ordered, signaling)
+        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
+        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
+        UNORD_S    = 0x13, // Unordered (signaling)
+        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
+        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
+        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
+        ORD_S      = 0x17, // Ordered (signaling)
+        EQ_US      = 0x18, // Equal (unordered, signaling)
+        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
+        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
+        FALSE_OS   = 0x1B, // False (ordered, signaling)
+        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
+        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
+        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
+        TRUE_US    = 0x1F, // True (unordered, signaling)
+    };
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+    enum class CompareTypeInt
+    {
+        EQ  = _MM_CMPINT_EQ,    // Equal
+        LT  = _MM_CMPINT_LT,    // Less than
+        LE  = _MM_CMPINT_LE,    // Less than or Equal
+        NE  = _MM_CMPINT_NE,    // Not Equal
+        GE  = _MM_CMPINT_GE,    // Greater than or Equal
+        GT  = _MM_CMPINT_GT,    // Greater than
+    };
+#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
+
+    enum class ScaleFactor
+    {
+        SF_1 = 1,   // No scaling
+        SF_2 = 2,   // Scale offset by 2
+        SF_4 = 4,   // Scale offset by 4
+        SF_8 = 8,   // Scale offset by 8
+    };
+
+    enum class RoundMode
+    {
+        TO_NEAREST_INT  = 0x00, // Round to nearest integer, ties to even
+        TO_NEG_INF      = 0x01, // Round to negative infinity
+        TO_POS_INF      = 0x02, // Round to positive infinity
+        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
+        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
+        
+        RAISE_EXC       = 0x00, // Do not suppress exceptions (counterpart of NO_EXC)
+        NO_EXC          = 0x08, // Suppress exceptions
+        
+        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
+        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
+        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
+        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
+        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
+        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
+        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
+        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
+        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
+        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
+    };
+
+    struct Traits
+    {
+        using CompareType = SIMDImpl::CompareType;
+        using ScaleFactor = SIMDImpl::ScaleFactor;
+        using RoundMode   = SIMDImpl::RoundMode;
+    };
+
+    // Attribute, 4-dimensional attribute in SIMD SOA layout
+    template<typename Float, typename Integer, typename Double>
+    union Vec4
+    {
+        Float   v[4];
+        Integer vi[4];
+        Double  vd[4];
+        struct
+        {
+            Float  x;
+            Float  y;
+            Float  z;
+            Float  w;
+        };
+        SIMDINLINE Float& operator[] (const int i) { return v[i]; }
+        SIMDINLINE Float const & operator[] (const int i) const { return v[i]; }
+        SIMDINLINE Vec4& operator=(Vec4 const & in)
+        {
+            v[0] = in.v[0];
+            v[1] = in.v[1];
+            v[2] = in.v[2];
+            v[3] = in.v[3];
+            return *this;
+        }
+    };
+
+    namespace SIMD128Impl
+    {
+        union Float
+        {
+            SIMDINLINE Float() = default;
+            SIMDINLINE Float(__m128 in) : v(in) {}
+            SIMDINLINE Float& operator=(__m128 in) { v = in; return *this; }
+            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m128() const { return v; }
+
+            SIMDALIGN(__m128, 16) v;
+        };
+
+        union Integer
+        {
+            SIMDINLINE Integer() = default;
+            SIMDINLINE Integer(__m128i in) : v(in) {}
+            SIMDINLINE Integer& operator=(__m128i in) { v = in; return *this; }
+            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m128i() const { return v; }
+            SIMDALIGN(__m128i, 16) v;
+        };
+
+        union Double
+        {
+            SIMDINLINE Double() = default;
+            SIMDINLINE Double(__m128d in) : v(in) {}
+            SIMDINLINE Double& operator=(__m128d in) { v = in; return *this; }
+            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m128d() const { return v; }
+            SIMDALIGN(__m128d, 16) v;
+        };
+
+        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
+        using Mask = uint8_t;
+
+        static const uint32_t SIMD_WIDTH = 4;
+    } // ns SIMD128Impl
+
+    namespace SIMD256Impl
+    {
+        union Float
+        {
+            SIMDINLINE Float() = default;
+            SIMDINLINE Float(__m256 in) : v(in) {}
+            SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
+            {
+                v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
+            }
+            SIMDINLINE Float& operator=(__m256 in) { v = in; return *this; }
+            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m256() const { return v; }
+
+            SIMDALIGN(__m256, 32) v;
+            SIMD128Impl::Float v4[2];
+        };
+
+        union Integer
+        {
+            SIMDINLINE Integer() = default;
+            SIMDINLINE Integer(__m256i in) : v(in) {}
+            SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
+            {
+                v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
+            }
+            SIMDINLINE Integer& operator=(__m256i in) { v = in; return *this; }
+            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m256i() const { return v; }
+
+            SIMDALIGN(__m256i, 32) v;
+            SIMD128Impl::Integer v4[2];
+        };
+
+        union Double
+        {
+            SIMDINLINE Double() = default;
+            SIMDINLINE Double(__m256d in) : v(in) {}
+            SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
+            {
+                v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
+            }
+            SIMDINLINE Double& operator=(__m256d in) { v = in; return *this; }
+            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m256d() const { return v; }
+
+            SIMDALIGN(__m256d, 32) v;
+            SIMD128Impl::Double v4[2];
+        };
+
+        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
+        using Mask = uint8_t;
+
+        static const uint32_t SIMD_WIDTH = 8;
+    } // ns SIMD256Impl
+
+    namespace SIMD512Impl
+    {
+#if !defined(__AVX512F__)
+        // Define AVX512 types if not included via immintrin.h.
+        // All data members of these types are ONLY to be viewed
+        // in a debugger.  Do NOT access them via code!
+        union __m512
+        {
+        private:
+            float m512_f32[16];
+        };
+        struct __m512d
+        {
+        private:
+            double m512d_f64[8];
+        };
+
+        union __m512i
+        {
+        private:
+            int8_t              m512i_i8[64];
+            int16_t             m512i_i16[32];
+            int32_t             m512i_i32[16];
+            int64_t             m512i_i64[8];
+            uint8_t             m512i_u8[64];
+            uint16_t            m512i_u16[32];
+            uint32_t            m512i_u32[16];
+            uint64_t            m512i_u64[8];
+        };
+
+        using __mmask16 = uint16_t;
+#endif
+
+#if defined(__INTEL_COMPILER) || (SIMD_ARCH >= SIMD_ARCH_AVX512)
+#define SIMD_ALIGNMENT_BYTES 64
+#else
+#define SIMD_ALIGNMENT_BYTES 32
+#endif
+
+        union Float
+        {
+            SIMDINLINE Float() = default;
+            SIMDINLINE Float(__m512 in) : v(in) {}
+            SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
+            SIMDINLINE Float& operator=(__m512 in) { v = in; return *this; }
+            SIMDINLINE Float& operator=(Float const & in)
+            {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+                v = in.v;
+#else
+                v8[0] = in.v8[0];
+                v8[1] = in.v8[1];
+#endif
+                return *this;
+            }
+            SIMDINLINE operator __m512() const { return v; }
+
+            SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
+            SIMD256Impl::Float v8[2];
+        };
+
+        union Integer
+        {
+            SIMDINLINE Integer() = default;
+            SIMDINLINE Integer(__m512i in) : v(in) {}
+            SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
+            SIMDINLINE Integer& operator=(__m512i in) { v = in; return *this; }
+            SIMDINLINE Integer& operator=(Integer const & in)
+            {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+                v = in.v;
+#else
+                v8[0] = in.v8[0];
+                v8[1] = in.v8[1];
+#endif
+                return *this;
+            }
+
+            SIMDINLINE operator __m512i() const { return v; }
+
+            SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
+            SIMD256Impl::Integer v8[2];
+        };
+
+        union Double
+        {
+            SIMDINLINE Double() = default;
+            SIMDINLINE Double(__m512d in) : v(in) {}
+            SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
+            SIMDINLINE Double& operator=(__m512d in) { v = in; return *this; }
+            SIMDINLINE Double& operator=(Double const & in)
+            {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+                v = in.v;
+#else
+                v8[0] = in.v8[0];
+                v8[1] = in.v8[1];
+#endif
+                return *this;
+            }
+
+            SIMDINLINE operator __m512d() const { return v; }
+
+            SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
+            SIMD256Impl::Double v8[2];
+        };
+
+        typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
+        using Mask = __mmask16;
+
+        static const uint32_t SIMD_WIDTH = 16;
+
+#undef SIMD_ALIGNMENT_BYTES
+    } // ns SIMD512Impl
+} // ns SIMDImpl
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 2423aa7..ccb6dfb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -143,9 +143,6 @@
     // initialize hot tile manager
     pContext->pHotTileMgr = new HotTileMgr();
 
-    // initialize function pointer tables
-    InitClearTilesTable();
-
     // initialize callback functions
     pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
     pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
@@ -192,7 +189,7 @@
 
     if (IsDraw)
     {
-        InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE);
+        InterlockedIncrement(&pContext->drawsOutstandingFE);
     }
 
     _ReadWriteBarrier();
@@ -332,7 +329,6 @@
         pCurDrawContext->drawId = pContext->dcRing.GetHead();
 
         pCurDrawContext->cleanupState = true;
-
     }
     else
     {
@@ -592,12 +588,16 @@
     HANDLE hContext,
     PFN_CS_FUNC pfnCsFunc,
     uint32_t totalThreadsInGroup,
-    uint32_t totalSpillFillSize)
+    uint32_t totalSpillFillSize,
+    uint32_t scratchSpaceSizePerInstance,
+    uint32_t numInstances)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
     pState->pfnCsFunc = pfnCsFunc;
     pState->totalThreadsInGroup = totalThreadsInGroup;
     pState->totalSpillFillSize = totalSpillFillSize;
+    pState->scratchSpaceSize = scratchSpaceSizePerInstance;
+    pState->scratchSpaceNumInstances = numInstances;
 }
 
 void SwrSetTsState(
@@ -680,7 +680,7 @@
 // update guardband multipliers for the viewport
 void updateGuardbands(API_STATE *pState)
 {
-    uint32_t numGbs = pState->gsState.emitsRenderTargetArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+    uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
 
     for(uint32_t i = 0; i < numGbs; ++i)
     {
@@ -736,7 +736,7 @@
 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
 {
     API_STATE *pState = &pDC->pState->state;
-    uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+    uint32_t numScissors = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
     pState->scissorsTileAligned = true;
 
     for (uint32_t index = 0; index < numScissors; ++index)
@@ -782,10 +782,9 @@
     }
 }
 
+
 // templated backend function tables
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2];
+
 void SetupPipeline(DRAW_CONTEXT *pDC)
 {
     DRAW_STATE* pState = pDC->pState;
@@ -803,7 +802,7 @@
         const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
         const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
         const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
-        const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
+        const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
         SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
         
         // select backend function
@@ -836,7 +835,9 @@
             break;
         }
     }
-    
+
+    SWR_ASSERT(backendFuncs.pfnBackend);
+
     PFN_PROCESS_PRIMS pfnBinner;
 #if USE_SIMD16_FRONTEND
     PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
@@ -956,20 +957,31 @@
                                           (pState->state.depthStencilState.stencilTestEnable  ||
                                            pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
 
-    uint32_t numRTs = pState->state.psState.numRenderTargets;
-    pState->state.colorHottileEnable = 0;
+
+    uint32_t hotTileEnable = pState->state.psState.renderTargetMask;
+
+    // Disable hottile for surfaces with no writes
     if (psState.pfnPixelShader != nullptr)
     {
-        for (uint32_t rt = 0; rt < numRTs; ++rt)
+        DWORD rt;
+        uint32_t rtMask = pState->state.psState.renderTargetMask;
+        while (_BitScanForward(&rt, rtMask))
         {
-            pState->state.colorHottileEnable |=  
-                (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
+            rtMask &= ~(1 << rt);
+
+            if (pState->state.blendState.renderTarget[rt].writeDisableAlpha &&
+                pState->state.blendState.renderTarget[rt].writeDisableRed &&
+                pState->state.blendState.renderTarget[rt].writeDisableGreen &&
+                pState->state.blendState.renderTarget[rt].writeDisableBlue)
+            {
+                hotTileEnable &= ~(1 << rt);
+            }
         }
     }
 
+    pState->state.colorHottileEnable = hotTileEnable;
+
+
     // Setup depth quantization function
     if (pState->state.depthHottileEnable)
     {
@@ -1132,7 +1144,6 @@
         pState->rastState.cullMode = SWR_CULLMODE_NONE;
     }
 
-
     int draw = 0;
     while (remainingVerts)
     {
@@ -1173,7 +1184,6 @@
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
 
-
     AR_API_END(APIDraw, numVertices * numInstances);
 }
 
@@ -1275,7 +1285,6 @@
         pState->rastState.cullMode = SWR_CULLMODE_NONE;
     }
 
-
     while (remainingIndices)
     {
         uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
@@ -1321,7 +1330,6 @@
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
  
-
     AR_API_END(APIDrawIndexed, numIndices * numInstances);
 }
 
@@ -1637,3 +1645,74 @@
     pContext->frameCount++;
 }
 
+void InitSimLoadTilesTable();
+void InitSimStoreTilesTable();
+void InitSimClearTilesTable();
+
+void InitClearTilesTable();
+void InitBackendFuncTables();
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Initialize swr backend and memory internal tables
+void SwrInit()
+{
+    InitSimLoadTilesTable();
+    InitSimStoreTilesTable();
+    InitSimClearTilesTable();
+
+    InitClearTilesTable();
+    InitBackendFuncTables();
+    InitRasterizerFunctions();
+}
+
+void SwrGetInterface(SWR_INTERFACE &out_funcs)
+{
+    out_funcs.pfnSwrCreateContext = SwrCreateContext;
+    out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
+    out_funcs.pfnSwrSaveState = SwrSaveState;
+    out_funcs.pfnSwrRestoreState = SwrRestoreState;
+    out_funcs.pfnSwrSync = SwrSync;
+    out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle;
+    out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE;
+    out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers;
+    out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer;
+    out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc;
+    out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc;
+    out_funcs.pfnSwrSetSoState = SwrSetSoState;
+    out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers;
+    out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc;
+    out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState;
+    out_funcs.pfnSwrSetGsState = SwrSetGsState;
+    out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc;
+    out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc;
+    out_funcs.pfnSwrSetTsState = SwrSetTsState;
+    out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc;
+    out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc;
+    out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState;
+    out_funcs.pfnSwrSetBackendState = SwrSetBackendState;
+    out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState;
+    out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState;
+    out_funcs.pfnSwrSetBlendState = SwrSetBlendState;
+    out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc;
+    out_funcs.pfnSwrDraw = SwrDraw;
+    out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced;
+    out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed;
+    out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced;
+    out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles;
+    out_funcs.pfnSwrDiscardRect = SwrDiscardRect;
+    out_funcs.pfnSwrDispatch = SwrDispatch;
+    out_funcs.pfnSwrStoreTiles = SwrStoreTiles;
+    out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget;
+    out_funcs.pfnSwrSetRastState = SwrSetRastState;
+    out_funcs.pfnSwrSetViewports = SwrSetViewports;
+    out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects;
+    out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState;
+    out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory;
+    out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE;
+    out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE;
+    out_funcs.pfnSwrEndFrame = SwrEndFrame;
+    out_funcs.pfnSwrInit = SwrInit;
+    out_funcs.pfnSwrLoadHotTile = SwrLoadHotTile;
+    out_funcs.pfnSwrStoreHotTileToSurface = SwrStoreHotTileToSurface;
+    out_funcs.pfnSwrStoreHotTileClear = SwrStoreHotTileClear;
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index d0f29dd..236e0fc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -34,7 +34,7 @@
 #include <assert.h>
 #include <algorithm>
 
-#include "common/simdintrin.h"
+#include "common/intrin.h"
 #include "common/formats.h"
 #include "core/state.h"
 
@@ -220,13 +220,13 @@
 //////////////////////////////////////////////////////////////////////////
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
-HANDLE SWR_API SwrCreateContext(
+SWR_FUNC(HANDLE, SwrCreateContext,
     SWR_CREATECONTEXT_INFO* pCreateInfo);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Destroys SWR Context.
 /// @param hContext - Handle passed back from SwrCreateContext
-void SWR_API SwrDestroyContext(
+SWR_FUNC(void, SwrDestroyContext,
     HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
@@ -234,7 +234,7 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pOutputStateBlock - Memory block to receive API state data
 /// @param memSize - Size of memory pointed to by pOutputStateBlock
-void SWR_API SwrSaveState(
+SWR_FUNC(void, SwrSaveState,
     HANDLE hContext,
     void* pOutputStateBlock,
     size_t memSize);
@@ -244,7 +244,7 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pStateBlock - Memory block to read API state data from
 /// @param memSize - Size of memory pointed to by pStateBlock
-void SWR_API SwrRestoreState(
+SWR_FUNC(void, SwrRestoreState,
     HANDLE hContext,
     const void* pStateBlock,
     size_t memSize);
@@ -255,23 +255,23 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnFunc - pointer to callback function,
 /// @param userData - user data to pass back 
-void SWR_API SwrSync(
+SWR_FUNC(void, SwrSync,
     HANDLE hContext,
     PFN_CALLBACK_FUNC pfnFunc,
     uint64_t userData,
     uint64_t userData2,
-    uint64_t userData3 = 0);
+    uint64_t userData3);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Blocks until all rendering has been completed.
 /// @param hContext - Handle passed back from SwrCreateContext
-void SWR_API SwrWaitForIdle(
+SWR_FUNC(void, SwrWaitForIdle,
     HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Blocks until all FE rendering has been completed.
 /// @param hContext - Handle passed back from SwrCreateContext
-void SWR_API SwrWaitForIdleFE(
+SWR_FUNC(void, SwrWaitForIdleFE,
     HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
@@ -279,7 +279,7 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param numBuffers - Number of vertex buffer state descriptors.
 /// @param pVertexBuffers - Array of vertex buffer state descriptors.
-void SWR_API SwrSetVertexBuffers(
+SWR_FUNC(void, SwrSetVertexBuffers,
     HANDLE hContext,
     uint32_t numBuffers,
     const SWR_VERTEX_BUFFER_STATE* pVertexBuffers);
@@ -288,7 +288,7 @@
 /// @brief Set index buffer
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pIndexBuffer - Index buffer.
-void SWR_API SwrSetIndexBuffer(
+SWR_FUNC(void, SwrSetIndexBuffer,
     HANDLE hContext,
     const SWR_INDEX_BUFFER_STATE* pIndexBuffer);
 
@@ -296,7 +296,7 @@
 /// @brief Set fetch shader pointer.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnFetchFunc - Pointer to shader.
-void SWR_API SwrSetFetchFunc(
+SWR_FUNC(void, SwrSetFetchFunc,
     HANDLE hContext,
     PFN_FETCH_FUNC    pfnFetchFunc);
 
@@ -305,7 +305,7 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnSoFunc - Pointer to shader.
 /// @param streamIndex - specifies stream
-void SWR_API SwrSetSoFunc(
+SWR_FUNC(void, SwrSetSoFunc,
     HANDLE hContext,
     PFN_SO_FUNC    pfnSoFunc,
     uint32_t streamIndex);
@@ -314,7 +314,7 @@
 /// @brief Set streamout state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pSoState - Pointer to streamout state.
-void SWR_API SwrSetSoState(
+SWR_FUNC(void, SwrSetSoState,
     HANDLE hContext,
     SWR_STREAMOUT_STATE* pSoState);
 
@@ -323,7 +323,7 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pSoBuffer - Pointer to streamout buffer.
 /// @param slot - Slot to bind SO buffer to.
-void SWR_API SwrSetSoBuffers(
+SWR_FUNC(void, SwrSetSoBuffers,
     HANDLE hContext,
     SWR_STREAMOUT_BUFFER* pSoBuffer,
     uint32_t slot);
@@ -332,7 +332,7 @@
 /// @brief Set vertex shader pointer.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnVertexFunc - Pointer to shader.
-void SWR_API SwrSetVertexFunc(
+SWR_FUNC(void, SwrSetVertexFunc,
     HANDLE hContext,
     PFN_VERTEX_FUNC pfnVertexFunc);
 
@@ -340,7 +340,7 @@
 /// @brief Set frontend state.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state
-void SWR_API SwrSetFrontendState(
+SWR_FUNC(void, SwrSetFrontendState,
     HANDLE hContext,
     SWR_FRONTEND_STATE *pState);
 
@@ -348,7 +348,7 @@
 /// @brief Set geometry shader state.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state
-void SWR_API SwrSetGsState(
+SWR_FUNC(void, SwrSetGsState,
     HANDLE hContext,
     SWR_GS_STATE *pState);
 
@@ -356,7 +356,7 @@
 /// @brief Set geometry shader
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to geometry shader function
-void SWR_API SwrSetGsFunc(
+SWR_FUNC(void, SwrSetGsFunc,
     HANDLE hContext,
     PFN_GS_FUNC pfnGsFunc);
 
@@ -366,17 +366,22 @@
 /// @param pfnCsFunc - Pointer to compute shader function
 /// @param totalThreadsInGroup - product of thread group dimensions.
 /// @param totalSpillFillSize - size in bytes needed for spill/fill.
-void SWR_API SwrSetCsFunc(
+/// @param scratchSpaceSizePerInstance - size of the scratch space needed per simd instance
+/// @param numInstances - number of simd instances that are run per execution of the shader
+SWR_FUNC(void, SwrSetCsFunc,
     HANDLE hContext,
     PFN_CS_FUNC pfnCsFunc,
     uint32_t totalThreadsInGroup,
-    uint32_t totalSpillFillSize);
+    uint32_t totalSpillFillSize,
+    uint32_t scratchSpaceSizePerInstance,
+    uint32_t numInstances
+    );
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set tessellation state.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state
-void SWR_API SwrSetTsState(
+SWR_FUNC(void, SwrSetTsState,
     HANDLE hContext,
     SWR_TS_STATE *pState);
 
@@ -384,7 +389,7 @@
 /// @brief Set hull shader
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnFunc - Pointer to shader function
-void SWR_API SwrSetHsFunc(
+SWR_FUNC(void, SwrSetHsFunc,
     HANDLE hContext,
     PFN_HS_FUNC pfnFunc);
 
@@ -392,7 +397,7 @@
 /// @brief Set domain shader
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnFunc - Pointer to shader function
-void SWR_API SwrSetDsFunc(
+SWR_FUNC(void, SwrSetDsFunc,
     HANDLE hContext,
     PFN_DS_FUNC pfnFunc);
 
@@ -400,7 +405,7 @@
 /// @brief Set depth stencil state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-void SWR_API SwrSetDepthStencilState(
+SWR_FUNC(void, SwrSetDepthStencilState,
     HANDLE hContext,
     SWR_DEPTH_STENCIL_STATE *pState);
 
@@ -408,7 +413,7 @@
 /// @brief Set backend state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-void SWR_API SwrSetBackendState(
+SWR_FUNC(void, SwrSetBackendState,
     HANDLE hContext,
     SWR_BACKEND_STATE *pState);
 
@@ -416,7 +421,7 @@
 /// @brief Set depth bounds state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-void SWR_API SwrSetDepthBoundsState(
+SWR_FUNC(void, SwrSetDepthBoundsState,
     HANDLE hContext,
     SWR_DEPTH_BOUNDS_STATE *pState);
 
@@ -424,7 +429,7 @@
 /// @brief Set pixel shader state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-void SWR_API SwrSetPixelShaderState(
+SWR_FUNC(void, SwrSetPixelShaderState,
     HANDLE hContext,
     SWR_PS_STATE *pState);
 
@@ -432,7 +437,7 @@
 /// @brief Set blend state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-void SWR_API SwrSetBlendState(
+SWR_FUNC(void, SwrSetBlendState,
     HANDLE hContext,
     SWR_BLEND_STATE *pState);
 
@@ -441,7 +446,7 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param renderTarget - render target index
 /// @param pfnBlendFunc - function pointer
-void SWR_API SwrSetBlendFunc(
+SWR_FUNC(void, SwrSetBlendFunc,
     HANDLE hContext,
     uint32_t renderTarget,
     PFN_BLEND_JIT_FUNC pfnBlendFunc);
@@ -452,7 +457,7 @@
 /// @param topology - Specifies topology for draw.
 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
 /// @param primCount - Number of vertices.
-void SWR_API SwrDraw(
+SWR_FUNC(void, SwrDraw,
     HANDLE hContext,
     PRIMITIVE_TOPOLOGY topology,
     uint32_t startVertex,
@@ -466,7 +471,7 @@
 /// @param numInstances - How many instances to render.
 /// @param startVertex - Specifies start vertex for draw. (vertex data)
 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
-void SWR_API SwrDrawInstanced(
+SWR_FUNC(void, SwrDrawInstanced,
     HANDLE hContext,
     PRIMITIVE_TOPOLOGY topology,
     uint32_t numVertsPerInstance,
@@ -481,7 +486,7 @@
 /// @param numIndices - Number of indices to read sequentially from index buffer.
 /// @param indexOffset - Starting index into index buffer.
 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-void SWR_API SwrDrawIndexed(
+SWR_FUNC(void, SwrDrawIndexed,
     HANDLE hContext,
     PRIMITIVE_TOPOLOGY topology,
     uint32_t numIndices,
@@ -497,7 +502,7 @@
 /// @param indexOffset - Starting index into index buffer.
 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
-void SWR_API SwrDrawIndexedInstanced(
+SWR_FUNC(void, SwrDrawIndexedInstanced,
     HANDLE hContext,
     PRIMITIVE_TOPOLOGY topology,
     uint32_t numIndices,
@@ -512,7 +517,7 @@
 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate.  This will be expanded to
 ///                         be hottile size-aligned.
-void SWR_API SwrInvalidateTiles(
+SWR_FUNC(void, SwrInvalidateTiles,
     HANDLE hContext,
     uint32_t attachmentMask,
     const SWR_RECT& invalidateRect);
@@ -523,7 +528,7 @@
 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
 /// @param rect - The pixel-coordinate rectangle to discard.  Only fully-covered hottiles will be
 ///               discarded.
-void SWR_API SwrDiscardRect(
+SWR_FUNC(void, SwrDiscardRect,
     HANDLE hContext,
     uint32_t attachmentMask,
     const SWR_RECT& rect);
@@ -534,7 +539,7 @@
 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
-void SWR_API SwrDispatch(
+SWR_FUNC(void, SwrDispatch,
     HANDLE hContext,
     uint32_t threadGroupCountX,
     uint32_t threadGroupCountY,
@@ -549,7 +554,7 @@
 };
 
 /// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs.
-void SWR_API SwrStoreTiles(
+SWR_FUNC(void, SwrStoreTiles,
     HANDLE hContext,
     uint32_t attachmentMask,
     SWR_TILE_STATE postStoreTileState,
@@ -565,7 +570,7 @@
 /// @param z - depth value use for clearing depth buffer
 /// @param stencil - stencil value used for clearing stencil buffer
 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
-void SWR_API SwrClearRenderTarget(
+SWR_FUNC(void, SwrClearRenderTarget,
     HANDLE hContext,
     uint32_t attachmentMask,
     uint32_t renderTargetArrayIndex,
@@ -578,7 +583,7 @@
 /// @brief SwrSetRastState
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pRastState - New SWR_RASTSTATE used for SwrDraw* commands
-void SWR_API SwrSetRastState(
+SWR_FUNC(void, SwrSetRastState,
     HANDLE hContext,
     const SWR_RASTSTATE *pRastState);
 
@@ -588,7 +593,7 @@
 /// @param numViewports - number of viewports passed in
 /// @param pViewports - Specifies extents of viewport.
 /// @param pMatrices - If not specified then SWR computes a default one.
-void SWR_API SwrSetViewports(
+SWR_FUNC(void, SwrSetViewports,
     HANDLE hContext,
     uint32_t numViewports,
     const SWR_VIEWPORT* pViewports,
@@ -599,7 +604,7 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param numScissors - number of scissors passed in
 /// @param pScissors - array of scissors
-void SWR_API SwrSetScissorRects(
+SWR_FUNC(void, SwrSetScissorRects,
     HANDLE hContext,
     uint32_t numScissors,
     const SWR_RECT* pScissors);
@@ -612,7 +617,7 @@
 /// @note  Client needs to resend private state prior to each draw call.
 ///        Also, SWR is responsible for the private state memory.
 /// @param hContext - Handle passed back from SwrCreateContext
-VOID* SWR_API SwrGetPrivateContextState(
+SWR_FUNC(void*, SwrGetPrivateContextState,
     HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
@@ -623,7 +628,7 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param size - Size of allocation
 /// @param align - Alignment needed for allocation.
-VOID* SWR_API SwrAllocDrawContextMemory(
+SWR_FUNC(void*, SwrAllocDrawContextMemory,
     HANDLE hContext,
     uint32_t size,
     uint32_t align);
@@ -632,7 +637,7 @@
 /// @brief Enables stats counting
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param enable - If true then counts are incremented.
-void SWR_API SwrEnableStatsFE(
+SWR_FUNC(void, SwrEnableStatsFE,
     HANDLE hContext,
     bool enable);
 
@@ -640,15 +645,118 @@
 /// @brief Enables stats counting
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param enable - If true then counts are incremented.
-void SWR_API SwrEnableStatsBE(
+SWR_FUNC(void, SwrEnableStatsBE,
     HANDLE hContext,
     bool enable);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Mark end of frame - used for performance profiling
 /// @param hContext - Handle passed back from SwrCreateContext
-void SWR_API SwrEndFrame(
+SWR_FUNC(void, SwrEndFrame,
     HANDLE hContext);
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Initialize swr backend and memory internal tables
+SWR_FUNC(void, SwrInit);
+
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads a full hottile from a render surface
+/// @param pSrcSurface - Pointer to the source render surface state
+/// @param dstFormat - Format for hot tile.
+/// @param renderTargetIndex - Index to src render target
+/// @param x, y - Coordinates to raster tile.
+/// @param pDstHotTile - Pointer to Hot Tile
+SWR_FUNC(void, SwrLoadHotTile,
+    const SWR_SURFACE_STATE *pSrcSurface,
+    SWR_FORMAT dstFormat,
+    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
+    uint8_t *pDstHotTile);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Deswizzles and stores a full hottile to a render surface
+/// @param pDstSurface - Pointer to the destination render surface state
+/// @param srcFormat - Format for hot tile.
+/// @param renderTargetIndex - Index to destination render target
+/// @param x, y - Coordinates to raster tile.
+/// @param pSrcHotTile - Pointer to Hot Tile
+SWR_FUNC(void, SwrStoreHotTileToSurface,
+    SWR_SURFACE_STATE *pDstSurface,
+    SWR_FORMAT srcFormat,
+    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
+    uint8_t *pSrcHotTile);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Writes clear color to every pixel of a render surface
+/// @param pDstSurface - Pointer to the destination render surface state
+/// @param renderTargetIndex - Index to destination render target
+/// @param x, y - Coordinates to raster tile.
+/// @param pClearColor - Pointer to clear color
+SWR_FUNC(void, SwrStoreHotTileClear,
+         SWR_SURFACE_STATE *pDstSurface,
+         SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+         UINT x,
+         UINT y,
+         uint32_t renderTargetArrayIndex,
+         const float* pClearColor);
+
+struct SWR_INTERFACE
+{
+    PFNSwrCreateContext pfnSwrCreateContext;
+    PFNSwrDestroyContext pfnSwrDestroyContext;
+    PFNSwrSaveState pfnSwrSaveState;
+    PFNSwrRestoreState pfnSwrRestoreState;
+    PFNSwrSync pfnSwrSync;
+    PFNSwrWaitForIdle pfnSwrWaitForIdle;
+    PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE;
+    PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers;
+    PFNSwrSetIndexBuffer pfnSwrSetIndexBuffer;
+    PFNSwrSetFetchFunc pfnSwrSetFetchFunc;
+    PFNSwrSetSoFunc pfnSwrSetSoFunc;
+    PFNSwrSetSoState pfnSwrSetSoState;
+    PFNSwrSetSoBuffers pfnSwrSetSoBuffers;
+    PFNSwrSetVertexFunc pfnSwrSetVertexFunc;
+    PFNSwrSetFrontendState pfnSwrSetFrontendState;
+    PFNSwrSetGsState pfnSwrSetGsState;
+    PFNSwrSetGsFunc pfnSwrSetGsFunc;
+    PFNSwrSetCsFunc pfnSwrSetCsFunc;
+    PFNSwrSetTsState pfnSwrSetTsState;
+    PFNSwrSetHsFunc pfnSwrSetHsFunc;
+    PFNSwrSetDsFunc pfnSwrSetDsFunc;
+    PFNSwrSetDepthStencilState pfnSwrSetDepthStencilState;
+    PFNSwrSetBackendState pfnSwrSetBackendState;
+    PFNSwrSetDepthBoundsState pfnSwrSetDepthBoundsState;
+    PFNSwrSetPixelShaderState pfnSwrSetPixelShaderState;
+    PFNSwrSetBlendState pfnSwrSetBlendState;
+    PFNSwrSetBlendFunc pfnSwrSetBlendFunc;
+    PFNSwrDraw pfnSwrDraw;
+    PFNSwrDrawInstanced pfnSwrDrawInstanced;
+    PFNSwrDrawIndexed pfnSwrDrawIndexed;
+    PFNSwrDrawIndexedInstanced pfnSwrDrawIndexedInstanced;
+    PFNSwrInvalidateTiles pfnSwrInvalidateTiles;
+    PFNSwrDiscardRect pfnSwrDiscardRect;
+    PFNSwrDispatch pfnSwrDispatch;
+    PFNSwrStoreTiles pfnSwrStoreTiles;
+    PFNSwrClearRenderTarget pfnSwrClearRenderTarget;
+    PFNSwrSetRastState pfnSwrSetRastState;
+    PFNSwrSetViewports pfnSwrSetViewports;
+    PFNSwrSetScissorRects pfnSwrSetScissorRects;
+    PFNSwrGetPrivateContextState pfnSwrGetPrivateContextState;
+    PFNSwrAllocDrawContextMemory pfnSwrAllocDrawContextMemory;
+    PFNSwrEnableStatsFE pfnSwrEnableStatsFE;
+    PFNSwrEnableStatsBE pfnSwrEnableStatsBE;
+    PFNSwrEndFrame pfnSwrEndFrame;
+    PFNSwrInit pfnSwrInit;
+    PFNSwrLoadHotTile pfnSwrLoadHotTile;
+    PFNSwrStoreHotTileToSurface pfnSwrStoreHotTileToSurface;
+    PFNSwrStoreHotTileClear pfnSwrStoreHotTileClear;
+};
+
+extern "C" {
+typedef void (SWR_API * PFNSwrGetInterface)(SWR_INTERFACE &out_funcs);
+SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE &out_funcs);
+}
 
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 39f4802..fe11cdf 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -30,22 +30,21 @@
 #include <smmintrin.h>
 
 #include "backend.h"
+#include "backend_impl.h"
 #include "tilemgr.h"
 #include "memory/tilingtraits.h"
 #include "core/multisample.h"
+#include "backends/gen_BackendPixelRate.hpp"
 
 #include <algorithm>
 
-typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
-static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
-
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Process compute work.
 /// @param pDC - pointer to draw context (dispatch).
 /// @param workerId - The unique worker ID that is assigned to this thread.
 /// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -60,6 +59,12 @@
     {
         pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
     }
+    
+    size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances;
+    if (scratchSpaceSize && pScratchSpace == nullptr)
+    {
+        pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES);
+    }
 
     const API_STATE& state = GetApiState(pDC);
 
@@ -70,6 +75,8 @@
     csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
     csContext.pTGSM = pContext->ppScratch[workerId];
     csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
+    csContext.pScratchSpace = (uint8_t*)pScratchSpace;
+    csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize;
 
     state.pfnCsFunc(GetPrivateState(pDC), &csContext);
 
@@ -95,238 +102,6 @@
     SWR_ASSERT(x == 0 && y == 0);
 }
 
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
-{
-    auto lambda = [&](int32_t comp)
-    {
-        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
-
-        pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
-    };
-
-    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
-
-    for (uint32_t i = 0; i < numIter; ++i)
-    {
-        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
-    }
-}
-
-#if USE_8x2_TILE_BACKEND
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
-{
-    auto lambda = [&](int32_t comp)
-    {
-        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
-
-        pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
-    };
-
-    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
-
-    for (uint32_t i = 0; i < numIter; ++i)
-    {
-        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
-    }
-}
-
-#endif
-template<SWR_FORMAT format>
-INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
-{
-    // convert clear color to hottile format
-    // clear color is in RGBA float/uint32
-#if USE_8x2_TILE_BACKEND
-    simd16vector vClear;
-    for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
-    {
-        simd16scalar vComp;
-        vComp = _simd16_load1_ps((const float*)&clear[comp]);
-        if (FormatTraits<format>::isNormalized(comp))
-        {
-            vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
-            vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
-        }
-        vComp = FormatTraits<format>::pack(comp, vComp);
-        vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
-    }
-
-#else
-    simdvector vClear;
-    for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
-    {
-        simdscalar vComp;
-        vComp = _simd_load1_ps((const float*)&clear[comp]);
-        if (FormatTraits<format>::isNormalized(comp))
-        {
-            vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
-            vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
-        }
-        vComp = FormatTraits<format>::pack(comp, vComp);
-        vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
-    }
-
-#endif
-    uint32_t tileX, tileY;
-    MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
-
-    // Init to full macrotile
-    SWR_RECT clearTile =
-    {
-        KNOB_MACROTILE_X_DIM * int32_t(tileX),
-        KNOB_MACROTILE_Y_DIM * int32_t(tileY),
-        KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
-        KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
-    };
-
-    // intersect with clear rect
-    clearTile &= rect;
-
-    // translate to local hottile origin
-    clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
-
-    // Make maximums inclusive (needed for convert to raster tiles)
-    clearTile.xmax -= 1;
-    clearTile.ymax -= 1;
-
-    // convert to raster tiles
-    clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
-    clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
-    clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
-    clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
-
-    const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
-    // compute steps between raster tile samples / raster tiles / macro tile rows
-    const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
-    const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
-    const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
-    const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
-
-    HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
-    uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
-    uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
-
-    // loop over all raster tiles in the current hot tile
-    for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
-    {
-        uint8_t* pRasterTile = pRasterTileRow;
-        for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
-        {
-            for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
-            {
-                ClearRasterTile<format>(pRasterTile, vClear);
-                pRasterTile += rasterTileSampleStep;
-            }
-        }
-        pRasterTileRow += macroTileRowStep;
-    }
-
-    pHotTile->state = HOTTILE_DIRTY;
-}
-
-
-void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    if (KNOB_FAST_CLEAR)
-    {
-        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
-        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
-        uint32_t numSamples = GetNumSamples(sampleCount);
-
-        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
-
-        AR_BEGIN(BEClear, pDC->drawId);
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
-        {
-            unsigned long rt = 0;
-            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
-            while (_BitScanForward(&rt, mask))
-            {
-                mask &= ~(1 << rt);
-
-                HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
-
-                // All we want to do here is to mark the hot tile as being in a "needs clear" state.
-                pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
-                pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
-                pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
-                pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
-                pHotTile->state = HOTTILE_CLEAR;
-            }
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
-        {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
-            pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
-            pHotTile->state = HOTTILE_CLEAR;
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
-        {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
-
-            pHotTile->clearData[0] = pClear->clearStencil;
-            pHotTile->state = HOTTILE_CLEAR;
-        }
-
-        AR_END(BEClear, 1);
-    }
-    else
-    {
-        // Legacy clear
-        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
-        AR_BEGIN(BEClear, pDC->drawId);
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
-        {
-            DWORD clearData[4];
-            clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
-            clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
-            clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
-            clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
-
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            unsigned long rt = 0;
-            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
-            while (_BitScanForward(&rt, mask))
-            {
-                mask &= ~(1 << rt);
-
-                pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
-            }
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
-        {
-            DWORD clearData[4];
-            clearData[0] = *(DWORD*)&pClear->clearDepth;
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
-        {
-            DWORD clearData[4];
-            clearData[0] = pClear->clearStencil;
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
-
-            pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
-        }
-
-        AR_END(BEClear, 1);
-    }
-}
-
 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, 
     SWR_RENDERTARGET_ATTACHMENT attachment)
 {
@@ -360,7 +135,7 @@
         // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
         if (pHotTile->state == HOTTILE_CLEAR)
         {
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
+            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
             SWR_ASSERT(pfnClearTiles != nullptr);
 
             pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
@@ -378,7 +153,10 @@
 
         if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
         {
-            pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
+            if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED))
+            {
+                pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
+            }
         }
     }
     AR_END(BEStoreTiles, 1);
@@ -418,453 +196,6 @@
     }
 }
 
-#if KNOB_SIMD_WIDTH == 8
-const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
-const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
-const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#else
-#error Unsupported vector width
-#endif
-
-simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
-{
-    simdscalar vClipMask = _simd_setzero_ps();
-    uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
-
-    for (uint32_t i = 0; i < numClipDistance; ++i)
-    {
-        // pull triangle clip distance values from clip buffer
-        simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
-        simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
-        simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
-
-        // interpolate
-        simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
-        
-        // clip if interpolated clip distance is < 0 || NAN
-        simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
-
-        vClipMask = _simd_or_ps(vClipMask, vCull);
-    }
-
-    return _simd_movemask_ps(vClipMask);
-}
-
-template<typename T>
-void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    AR_BEGIN(BESingleSampleBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
-    SWR_PS_CONTEXT psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    AR_END(BESetup, 1);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-#if USE_8x2_TILE_BACKEND
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
-            simdmask coverageMask = work.coverageMask[0] & MASK;
-
-            if (coverageMask)
-            {
-                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                {
-                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                    const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
-
-                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
-                }
-
-                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-                {
-                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
-                    generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-                }
-
-                AR_BEGIN(BEBarycentric, pDC->drawId);
-
-                CalcPixelBarycentrics(coeffs, psContext);
-
-                CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-                // interpolate and quantize z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
-                AR_END(BEBarycentric, 1);
-
-                // interpolate user clip distance if available
-                if (state.rastState.clipDistanceMask)
-                {
-                    coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
-                }
-
-                simdscalar vCoverageMask = vMask(coverageMask);
-                simdscalar depthPassMask = vCoverageMask;
-                simdscalar stencilPassMask = vCoverageMask;
-
-                // Early-Z?
-                if (T::bCanEarlyZ)
-                {
-                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                     psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
-                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                    AR_END(BEEarlyDepthTest, 0);
-
-                    // early-exit if no pixels passed depth or earlyZ is forced on
-                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
-                    {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-
-                        if (!_simd_movemask_ps(depthPassMask))
-                        {
-                            goto Endtile;
-                        }
-                    }
-                }
-
-                psContext.sampleIndex = 0;
-                psContext.activeMask = _simd_castps_si(vCoverageMask);
-
-                // execute pixel shader
-                AR_BEGIN(BEPixelShader, pDC->drawId);
-                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
-                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                AR_END(BEPixelShader, 0);
-
-                vCoverageMask = _simd_castsi_ps(psContext.activeMask);
-
-                // late-Z
-                if (!T::bCanEarlyZ)
-                {
-                    AR_BEGIN(BELateDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                        psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
-                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                    AR_END(BELateDepthTest, 0);
-
-                    if (!_simd_movemask_ps(depthPassMask))
-                    {
-                        // need to call depth/stencil write for stencil write
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-                        goto Endtile;
-                    }
-                }
-
-                uint32_t statMask = _simd_movemask_ps(depthPassMask);
-                uint32_t statCount = _mm_popcnt_u32(statMask);
-                UPDATE_STAT_BE(DepthPassCount, statCount);
-
-                // output merger
-                AR_BEGIN(BEOutputMerger, pDC->drawId);
-#if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
-                OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
-#endif
-
-                // do final depth write after all pixel kills
-                if (!state.psState.forceEarlyZ)
-                {
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                        pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-                }
-                AR_END(BEOutputMerger, 0);
-            }
-
-Endtile:
-            AR_BEGIN(BEEndTile, pDC->drawId);
-
-            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-#if USE_8x2_TILE_BACKEND
-            if (useAlternateOffset)
-            {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-                {
-                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-#else
-            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-            {
-                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-            }
-#endif
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            AR_END(BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    AR_END(BESingleSampleBackend, 0);
-}
-
-template<typename T>
-void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    AR_BEGIN(BESampleRateBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
-    SWR_PS_CONTEXT psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    AR_END(BESetup, 0);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-#if USE_8x2_TILE_BACKEND
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
-            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-            {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-            }
-
-            AR_BEGIN(BEBarycentric, pDC->drawId);
-
-            CalcPixelBarycentrics(coeffs, psContext);
-
-            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-            AR_END(BEBarycentric, 0);
-
-            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
-            {
-                simdmask coverageMask = work.coverageMask[sample] & MASK;
-
-                if (coverageMask)
-                {
-                    // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                    {
-                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
-                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
-                    }
-
-                    AR_BEGIN(BEBarycentric, pDC->drawId);
-
-                    // calculate per sample positions
-                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
-                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
-
-                    CalcSampleBarycentrics(coeffs, psContext);
-
-                    // interpolate and quantize z
-                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
-                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
-                    AR_END(BEBarycentric, 0);
-
-                    // interpolate user clip distance if available
-                    if (state.rastState.clipDistanceMask)
-                    {
-                        coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
-                    }
-
-                    simdscalar vCoverageMask = vMask(coverageMask);
-                    simdscalar depthPassMask = vCoverageMask;
-                    simdscalar stencilPassMask = vCoverageMask;
-
-                    // Early-Z?
-                    if (T::bCanEarlyZ)
-                    {
-                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                              psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                        AR_END(BEEarlyDepthTest, 0);
-
-                        // early-exit if no samples passed depth or earlyZ is forced on.
-                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
-                        {
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-
-                            if (!_simd_movemask_ps(depthPassMask))
-                            {
-                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-                                continue;
-                            }
-                        }
-                    }
-
-                    psContext.sampleIndex = sample;
-                    psContext.activeMask = _simd_castps_si(vCoverageMask);
-
-                    // execute pixel shader
-                    AR_BEGIN(BEPixelShader, pDC->drawId);
-                    UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
-                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                    AR_END(BEPixelShader, 0);
-
-                    vCoverageMask = _simd_castsi_ps(psContext.activeMask);
-
-                    // late-Z
-                    if (!T::bCanEarlyZ)
-                    {
-                        AR_BEGIN(BELateDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                              psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                        AR_END(BELateDepthTest, 0);
-
-                        if (!_simd_movemask_ps(depthPassMask))
-                        {
-                            // need to call depth/stencil write for stencil write
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-
-                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-                            continue;
-                        }
-                    }
-
-                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
-                    uint32_t statCount = _mm_popcnt_u32(statMask);
-                    UPDATE_STAT_BE(DepthPassCount, statCount);
-
-                    // output merger
-                    AR_BEGIN(BEOutputMerger, pDC->drawId);
-#if USE_8x2_TILE_BACKEND
-                    OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
-                    OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
-#endif
-
-                    // do final depth write after all pixel kills
-                    if (!state.psState.forceEarlyZ)
-                    {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-                    }
-                    AR_END(BEOutputMerger, 0);
-                }
-                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-Endtile:
-            ATTR_UNUSED;
-
-            AR_BEGIN(BEEndTile, pDC->drawId);
-
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-#if USE_8x2_TILE_BACKEND
-            if (useAlternateOffset)
-            {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-                {
-                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-#else
-            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-            {
-                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-            }
-#endif
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            AR_END(BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    AR_END(BESampleRateBackend, 0);
-}
-// optimized backend flow with NULL PS
 template<uint32_t sampleCountT>
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
@@ -962,7 +293,7 @@
                     UPDATE_STAT_BE(DepthPassCount, statCount);
                 }
 
-Endtile:
+            Endtile:
                 ATTR_UNUSED;
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
@@ -979,17 +310,7 @@
     AR_END(BENullBackend, 0);
 }
 
-void InitClearTilesTable()
-{
-    memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
-
-    sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
-    sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
-    sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
-    sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
-    sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
-}
-
+PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
                                      [2] // centroid
@@ -1008,113 +329,10 @@
                                         [2] // canEarlyZ
                                         = {};
 
-// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct BEChooser
-{
-    // Last Arg Terminator
-    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
-    {
-        switch(tArg)
-        {
-        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
-        case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
-        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
-        default:
-            SWR_ASSERT(0 && "Invalid backend func\n");
-            return nullptr;
-            break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
-    {
-        switch(tArg)
-        {
-        case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
-        default:
-        SWR_ASSERT(0 && "Invalid sample pattern\n");
-        return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
-        break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
-    {
-        switch(tArg)
-        {
-        case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
-        default:
-        SWR_ASSERT(0 && "Invalid sample count\n");
-        return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
-        break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
-    {
-        if(tArg == true)
-        {
-            return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
-        }
-
-        return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
-    }
-};
-
-void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
-{
-    for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
-    {
-        for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
-        {
-            for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
-            {
-                table[inputCoverage][isCentroid][canEarlyZ] =
-                    BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
-                                         (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
-            }
-        }
-    }
-}
-
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
-{
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
-    {
-        for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
-        {
-            for(uint32_t centroid = 0; centroid < 2; centroid++)
-            {
-                for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
-                {
-                    table[sampleCount][inputCoverage][centroid][canEarlyZ] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage, 
-                                             (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
-                }
-            }
-        }
-    }
-}
-
-void InitBackendPixelRate0();
 void InitBackendFuncTables()
 {    
+    InitBackendPixelRate();
     InitBackendSingleFuncTable(gBackendSingleSample);
-    InitBackendPixelRate0();
     InitBackendSampleFuncTable(gBackendSampleRateTable);
 
     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index ade9afc..c8c37e6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -34,1031 +34,29 @@
 #include "depthstencil.h"
 #include "rdtsc_core.h"
 
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace);
 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
-void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
-void InitClearTilesTable();
-simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
-void InitBackendFuncTables();
-void InitCPSFuncTables();
-void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
 
+typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
+
+extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS];
+extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
+extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
+                                     [2]  // centroid
+                                     [2]; // canEarlyZ
 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
-                                              [2]   // isCenterPattern
-                                              [SWR_INPUT_COVERAGE_COUNT]
-                                              [2]  // centroid
-                                              [2]  // forcedSampleCount
-                                              [2]  // canEarlyZ
-                                              ;
-
-enum SWR_BACKEND_FUNCS
-{
-    SWR_BACKEND_SINGLE_SAMPLE,
-    SWR_BACKEND_MSAA_PIXEL_RATE,
-    SWR_BACKEND_MSAA_SAMPLE_RATE,
-    SWR_BACKEND_FUNCS_MAX,
-};
-
-#if KNOB_SIMD_WIDTH == 8
-extern const simdscalar vCenterOffsetsX;
-extern const simdscalar vCenterOffsetsY;
-extern const simdscalar vULOffsetsX;
-extern const simdscalar vULOffsetsY;
-#define MASK 0xff
-#endif
-
-INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-{
-    static const uint32_t RasterTileColorOffsets[16]
-    { 0,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
-    };
-    assert(sampleNum < 16);
-    return RasterTileColorOffsets[sampleNum];
-}
-
-INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-{
-    static const uint32_t RasterTileDepthOffsets[16]
-    { 0,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
-    };
-    assert(sampleNum < 16);
-    return RasterTileDepthOffsets[sampleNum];
-}
-
-INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-{
-    static const uint32_t RasterTileStencilOffsets[16]
-    { 0,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
-    };
-    assert(sampleNum < 16);
-    return RasterTileStencilOffsets[sampleNum];
-}
-
-template<typename T, uint32_t InputCoverage>
-struct generateInputCoverage
-{
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-    {
-        // will need to update for avx512
-        assert(KNOB_SIMD_WIDTH == 8);
-
-        simdscalari mask[2];
-        simdscalari sampleCoverage[2];
-        
-        if(T::bIsCenterPattern)
-        {
-            // center coverage is the same for all samples; just broadcast to the sample slots
-            uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
-            if(T::MultisampleT::numSamples == 1)
-            {
-                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
-            }
-            else if(T::MultisampleT::numSamples == 2)
-            {
-                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
-            }
-            else if(T::MultisampleT::numSamples == 4)
-            {
-                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
-            }
-            else if(T::MultisampleT::numSamples == 8)
-            {
-                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-            }
-            else if(T::MultisampleT::numSamples == 16)
-            {
-                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-                sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
-            }
-        }
-        else
-        {
-            __m256i src = _mm256_set1_epi32(0);
-            __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
-            if(T::MultisampleT::numSamples == 1)
-            {
-                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
-            }
-            else if(T::MultisampleT::numSamples == 2)
-            {
-                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-            }
-            else if(T::MultisampleT::numSamples == 4)
-            {
-                mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
-            }
-            else if(T::MultisampleT::numSamples == 8)
-            {
-                mask[0] = _mm256_set1_epi32(-1);
-            }
-            else if(T::MultisampleT::numSamples == 16)
-            {
-                mask[0] = _mm256_set1_epi32(-1);
-                mask[1] = _mm256_set1_epi32(-1);
-                index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
-            }
-
-            // gather coverage for samples 0-7
-            sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
-            if(T::MultisampleT::numSamples > 8)
-            {
-                // gather coverage for samples 8-15
-                sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
-            }
-        }
-
-        mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
-                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
-        // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
-        simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
-        simdscalari packedCoverage1;
-        if(T::MultisampleT::numSamples > 8)
-        {
-            // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
-            packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
-        }
-
-    #if (KNOB_ARCH == KNOB_ARCH_AVX)
-        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-        simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
-        simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-        packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
-        simdscalari packedSampleCoverage;
-        if(T::MultisampleT::numSamples > 8)
-        {
-            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-            hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
-            shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-            shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
-            packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
-            packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
-        }
-        else
-        {
-            packedSampleCoverage = packedCoverage0;
-        }
-    #else
-        simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
-        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-        packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
-        simdscalari packedSampleCoverage;
-        if(T::MultisampleT::numSamples > 8)
-        {
-            permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
-            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-            packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
-            // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
-            packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
-        }
-        else
-        {
-            packedSampleCoverage = packedCoverage0;
-        }
-    #endif
-
-        for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
-        {
-            // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
-            inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
-            if(!T::bForcedSampleCount)
-            {
-                // input coverage has to be anded with sample mask if MSAA isn't forced on
-                inputMask[i] &= sampleMask;
-            }
-
-            // shift to the next pixel in the 4x2
-            packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
-        }
-    }
-
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
-    {
-        uint32_t inputMask[KNOB_SIMD_WIDTH];
-        generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
-        inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
-    }
-
-};
-
-template<typename T>
-struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
-{
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
-    {
-        // will need to update for avx512
-        assert(KNOB_SIMD_WIDTH == 8);
-        simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
-        const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-        vec = _simd_and_si(vec, bit);
-        vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
-        vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
-        inputCoverage = _simd_castsi_ps(vec);
-    }
-
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-    {
-        uint32_t simdCoverage = (coverageMask[0] & MASK);
-        static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
-        for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
-        {
-            // set all samples to covered if conservative coverage mask is set for that pixel
-            inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
-        }
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Centroid behaves exactly as follows :
-// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to 
-//     have a sample location there).
-// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the 
-//     coverage with the SampleMask Rasterizer State.
-// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is 
-//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 
-//     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
-                            const uint64_t *const coverageMask, const uint32_t sampleMask,
-                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
-    uint32_t inputMask[KNOB_SIMD_WIDTH];
-    generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
-
-    // Case (2) - partially covered pixel
-
-    // scan for first covered sample per pixel in the 4x2 span
-    unsigned long sampleNum[KNOB_SIMD_WIDTH];
-    (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
-    (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
-    (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
-    (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
-    (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
-    (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
-    (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
-    (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
-
-    // look up and set the sample offsets from UL pixel corner for first covered sample 
-    __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
-                                    samplePos.X(sampleNum[6]),
-                                    samplePos.X(sampleNum[5]),
-                                    samplePos.X(sampleNum[4]),
-                                    samplePos.X(sampleNum[3]),
-                                    samplePos.X(sampleNum[2]),
-                                    samplePos.X(sampleNum[1]),
-                                    samplePos.X(sampleNum[0]));
-
-    __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
-                                    samplePos.Y(sampleNum[6]),
-                                    samplePos.Y(sampleNum[5]),
-                                    samplePos.Y(sampleNum[4]),
-                                    samplePos.Y(sampleNum[3]),
-                                    samplePos.Y(sampleNum[2]),
-                                    samplePos.Y(sampleNum[1]),
-                                    samplePos.Y(sampleNum[0]));
-    // add sample offset to UL pixel corner
-    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
-    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
-
-    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
-    static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
-    simdscalari vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
-    simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
-
-    static const simdscalari vZero = _simd_setzero_si();
-    const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
-    simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
-    simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
-    simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
-
-    simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
-
-    // set the centroid position based on results from above
-    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
-    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
-
-    // Case (3a) No samples covered and partial sample mask
-    simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
-    // sample mask should never be all 0's for this case, but handle it anyways
-    unsigned long firstCoveredSampleMaskSample = 0;
-    (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
-
-    simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
-
-    vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
-    vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
-
-    // blend in case 3a pixel locations
-    psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
-    psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
-}
-
-INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
-                                     const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
-    // evaluate I,J
-    psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
-    psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
-    psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
-    psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
-
-    // interpolate 1/w
-    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
-}
-
-INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
-{
-    const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
-    const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
-
-    return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
-}
-
-template<typename T>
-INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
-{
-    // RT has to be single sample if we're in forcedMSAA mode
-    if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
-    {
-        return 1;
-    }
-    // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
-    else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
-    {
-        return GetNumSamples(blendSampleCount);
-    }
-    // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
-    else
-    {
-        return T::MultisampleT::numSamples;
-    }
-}
-
-inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
-{
-    // broadcast scalars
-
-    coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
-    coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
-    coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
-
-    coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
-    coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
-    coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
-
-    coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
-    coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
-    coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
-
-    coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
-
-    coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
-    coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
-    coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
-}
-
-inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
-{
-    assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
-
-    if (pColorBuffer)
-    {
-        for (uint32_t index = 0; index < colorBufferCount; index += 1)
-        {
-            pColorBuffer[index] = renderBuffers.pColor[index];
-        }
-    }
-
-    if (pDepthBuffer)
-    {
-        *pDepthBuffer = renderBuffers.pDepth;
-    }
-
-    if (pStencilBuffer)
-    {
-        *pStencilBuffer = renderBuffers.pStencil;;
-    }
-}
-
-template<typename T>
-void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
-{
-    psContext->pAttribs = work.pAttribs;
-    psContext->pPerspAttribs = work.pPerspAttribs;
-    psContext->frontFace = work.triFlags.frontFacing;
-    psContext->primID = work.triFlags.primID;
-
-    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
-    psContext->I = work.I;
-    psContext->J = work.J;
-
-    psContext->recipDet = work.recipDet;
-    psContext->pRecipW = work.pRecipW;
-    psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
-    psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
-    psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
-    psContext->sampleIndex = 0;
-}
-
-template<typename T, bool IsSingleSample>
-void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
-                  const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
-{
-    if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
-    {
-        // for 1x case, centroid is pixel center
-        psContext->vX.centroid = psContext->vX.center;
-        psContext->vY.centroid = psContext->vY.center;
-        psContext->vI.centroid = psContext->vI.center;
-        psContext->vJ.centroid = psContext->vJ.center;
-        psContext->vOneOverW.centroid = psContext->vOneOverW.center;
-    }
-    else
-    {
-        if (T::bCentroidPos)
-        {
-            ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
-            if (T::bIsCenterPattern)
-            {
-                psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
-                psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
-            }
-            else
-            {
-                // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
-                CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
-            }
-
-            CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
-        }
-        else
-        {
-            psContext->vX.centroid = psContext->vX.sample;
-            psContext->vY.centroid = psContext->vY.sample;
-        }
-    }
-}
-
-template<typename T>
-struct PixelRateZTestLoop
-{
-    PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
-                       uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
-                       pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
-                       samplePos(state.rastState.samplePositions),
-                       clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
-
-    INLINE
-    uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, 
-                        const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
-    {
-        SWR_CONTEXT *pContext = pDC->pContext;
-
-        uint32_t statCount = 0;
-        simdscalar anyDepthSamplePassed = _simd_setzero_ps();
-        for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
-        {
-            const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
-            vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
-
-            if(!_simd_movemask_ps(vCoverageMask[sample]))
-            {
-                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
-                continue;
-            }
-
-            // offset depth/stencil buffers current sample
-            uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-            uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-            if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-            {
-                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
-                const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
-            }
-
-            AR_BEGIN(BEBarycentric, pDC->drawId);
-
-            // calculate per sample positions
-            psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
-            psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
-
-            // calc I & J per sample
-            CalcSampleBarycentrics(coeffs, psContext);
-
-            if(psState.writesODepth)
-            {
-                {
-                    // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
-                    vZ[sample] = psContext.vZ;
-                }
-            }
-            else
-            {
-                vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
-                vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
-            }
-
-            AR_END(BEBarycentric, 0);
-
-            ///@todo: perspective correct vs non-perspective correct clipping?
-            // if clip distances are enabled, we need to interpolate for each sample
-            if(clipDistanceMask)
-            {
-                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
-
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
-            }
-
-            // ZTest for this sample
-            ///@todo Need to uncomment out this bucket.
-            //AR_BEGIN(BEDepthBucket, pDC->drawId);
-            depthPassMask[sample] = vCoverageMask[sample];
-            stencilPassMask[sample] = vCoverageMask[sample];
-            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                     vZ[sample], pDepthSample, vCoverageMask[sample], 
-                                                     pStencilSample, &stencilPassMask[sample]);
-            //AR_END(BEDepthBucket, 0);
-
-            // early-exit if no pixels passed depth or earlyZ is forced on
-            if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
-            {
-                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
-                                  pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
-
-                if(!_simd_movemask_ps(depthPassMask[sample]))
-                {
-                    continue;
-                }
-            }
-            anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
-            uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
-            statCount += _mm_popcnt_u32(statMask);
-        }
-
-        activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
-        // return number of samples that passed depth and coverage
-        return statCount;
-    }
-
-    // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
-    simdscalar vZ[T::MultisampleT::numCoverageSamples];
-    simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
-    simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
-    simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
-
-private:
-    // functor inputs
-    DRAW_CONTEXT* pDC;
-    uint32_t workerId;
-
-    const SWR_TRIANGLE_DESC& work;
-    const BarycentricCoeffs& coeffs;
-    const API_STATE& state;
-    const SWR_PS_STATE& psState;
-    const SWR_MULTISAMPLE_POS& samplePos;
-    const uint8_t clipDistanceMask;
-    uint8_t*& pDepthBuffer;
-    uint8_t*& pStencilBuffer;
-};
-
-INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
-    // evaluate I,J
-    psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
-    psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
-    psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
-    psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
-
-    // interpolate 1/w
-    psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
-}
-
-INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
-    // evaluate I,J
-    psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
-    psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
-    psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
-    psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
-
-    // interpolate 1/w
-    psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
-}
-
-// Merge Output to 4x2 SIMD Tile Format
-INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
-{
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
-    simdvector blendOut;
-
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
-
-        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-
-        {
-            // pfnBlendFunc may not update all channels.  Initialize with PS output.
-            /// TODO: move this into the blend JIT.
-            blendOut = psContext.shaded[rt];
-
-            // Blend outputs and update coverage mask for alpha test
-            if(pfnBlendFunc[rt] != nullptr)
-            {
-                pfnBlendFunc[rt](
-                    pBlendState,
-                    psContext.shaded[rt],
-                    psContext.shaded[1],
-                    psContext.shaded[0].w,
-                    sample,
-                    pColorSample,
-                    blendOut,
-                    &psContext.oMask,
-                    (simdscalari*)&coverageMask);
-            }
-        }
-
-        // final write mask 
-        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
-        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
-
-        const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
-
-        // store with color mask
-        if(!pRTBlend->writeDisableRed)
-        {
-            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
-        }
-        if(!pRTBlend->writeDisableGreen)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
-        }
-        if(!pRTBlend->writeDisableBlue)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
-        }
-        if(!pRTBlend->writeDisableAlpha)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
-        }
-    }
-}
-
-#if USE_8x2_TILE_BACKEND
-// Merge Output to 8x2 SIMD16 Tile Format
-INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
-{
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
-
-    if (useAlternateOffset)
-    {
-        rasterTileColorOffset += sizeof(simdscalar);
-    }
-
-    simdvector blendSrc;
-    simdvector blendOut;
-
-    uint32_t colorBufferBit = 1;
-    for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
-    {
-        simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
-
-        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-
-        if (colorBufferBit & colorBufferEnableMask)
-        {
-            blendSrc[0] = pColorSample[0];
-            blendSrc[1] = pColorSample[2];
-            blendSrc[2] = pColorSample[4];
-            blendSrc[3] = pColorSample[6];
-        }
-
-        {
-            // pfnBlendFunc may not update all channels.  Initialize with PS output.
-            /// TODO: move this into the blend JIT.
-            blendOut = psContext.shaded[rt];
-
-            // Blend outputs and update coverage mask for alpha test
-            if(pfnBlendFunc[rt] != nullptr)
-            {
-                pfnBlendFunc[rt](
-                    pBlendState,
-                    psContext.shaded[rt],
-                    psContext.shaded[1],
-                    psContext.shaded[0].w,
-                    sample,
-                    reinterpret_cast<uint8_t *>(&blendSrc),
-                    blendOut,
-                    &psContext.oMask,
-                    reinterpret_cast<simdscalari *>(&coverageMask));
-            }
-        }
-
-        // final write mask 
-        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
-        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
-
-        // store with color mask
-        if (!pRTBlend->writeDisableRed)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
-        }
-        if (!pRTBlend->writeDisableGreen)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
-        }
-        if (!pRTBlend->writeDisableBlue)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
-        }
-        if (!pRTBlend->writeDisableAlpha)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
-        }
-    }
-}
-
-#endif
-
-template<typename T>
-void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
-    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
-
-
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    SWR_PS_CONTEXT psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
-    AR_END(BESetup, 0);
-
-    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-#if USE_8x2_TILE_BACKEND
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-#endif
-            simdscalar activeLanes;
-            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
-            activeLanes = vMask(work.anyCoveredSamples & MASK);
-
-            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-            {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-            }
-
-            AR_BEGIN(BEBarycentric, pDC->drawId);
-
-            CalcPixelBarycentrics(coeffs, psContext);
-
-            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-            AR_END(BEBarycentric, 0);
-
-            if(T::bForcedSampleCount)
-            {
-                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
-                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
-                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
-            }
-
-            // Early-Z?
-            if(T::bCanEarlyZ && !T::bForcedSampleCount)
-            {
-                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
-                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
-                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
-            }
-
-            // if we have no covered samples that passed depth at this point, go to next tile
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
-            if(state.psState.usesSourceDepth)
-            {
-                AR_BEGIN(BEBarycentric, pDC->drawId);
-                // interpolate and quantize z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-                AR_END(BEBarycentric, 0);
-            }
-
-            // pixels that are currently active
-            psContext.activeMask = _simd_castps_si(activeLanes);
-            psContext.oMask = T::MultisampleT::FullSampleMask();
-
-            // execute pixel shader
-            AR_BEGIN(BEPixelShader, pDC->drawId);
-            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
-            AR_END(BEPixelShader, 0);
-
-            // update active lanes to remove any discarded or oMask'd pixels
-            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
-            // late-Z
-            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
-            {
-                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
-                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
-                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
-            }
-
-            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
-            // output merger
-            // loop over all samples, broadcasting the results of the PS to all passing pixels
-            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
-            {
-                AR_BEGIN(BEOutputMerger, pDC->drawId);
-                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
-                uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
-                simdscalar coverageMask, depthMask;
-                if(T::bForcedSampleCount)
-                {
-                    coverageMask = depthMask = activeLanes;
-                }
-                else
-                {
-                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
-                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
-                    if(!_simd_movemask_ps(depthMask))
-                    {
-                        // stencil should already have been written in early/lateZ tests
-                        AR_END(BEOutputMerger, 0);
-                        continue;
-                    }
-                }
-                
-                // broadcast the results of the PS to all passing pixels
-#if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else // USE_8x2_TILE_BACKEND
-                OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
-#endif // USE_8x2_TILE_BACKEND
-
-                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
-                {
-                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
-                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
-                }
-                AR_END(BEOutputMerger, 0);
-            }
-Endtile:
-            AR_BEGIN(BEEndTile, pDC->drawId);
-
-            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
-            {
-                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-
-#if USE_8x2_TILE_BACKEND
-            if (useAlternateOffset)
-            {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-                {
-                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-#else
-            for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-            {
-                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-            }
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-#endif
-
-            AR_END(BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    AR_END(BEPixelRateBackend, 0);
-}
+                                       [2] // isCenterPattern
+                                       [SWR_INPUT_COVERAGE_COUNT]
+                                       [2] // centroid
+                                       [2] // forcedSampleCount
+                                       [2] // canEarlyZ
+                                       ;
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
+                                        [SWR_INPUT_COVERAGE_COUNT]
+                                        [2]  // centroid
+                                        [2]; // canEarlyZ
 
-template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
-         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
-    >
-struct SwrBackendTraits
-{
-    static const bool bIsCenterPattern = (isCenter == 1);
-    static const uint32_t InputCoverage = coverage;
-    static const bool bCentroidPos = (centroid == 1);
-    static const bool bForcedSampleCount = (forced == 1);
-    static const bool bCanEarlyZ = (canEarlyZ == 1);
-    typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
new file mode 100644
index 0000000..0ef54e2
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
@@ -0,0 +1,281 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend_clear.cpp
+*
+* @brief Backend clear handling: the clear work item and the per-format
+*        hot tile clear routines.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "backend.h"
+#include "backend_impl.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+template<SWR_FORMAT format>
+void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
+{
+    // Per component c: store value.v[c] at the cursor, then advance it by KNOB_SIMD_WIDTH * GetBPC(c) / 8 bytes.
+    auto storeComponent = [&](int32_t c)
+    {
+        FormatTraits<format>::storeSOA(c, pTileBuffer, value.v[c]);
+        pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(c) / 8);
+    };
+
+    // SIMD tiles per raster tile; UnrollerL presumably calls the lambda for components [0, numComps) - TODO confirm.
+    const uint32_t simdTileCount = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
+    for (uint32_t tile = 0; tile != simdTileCount; ++tile)
+    {
+        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(storeComponent);
+    }
+}
+
+#if USE_8x2_TILE_BACKEND
+template<SWR_FORMAT format>
+void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
+{
+    // Per component c: store value.v[c] at the cursor, then advance it by KNOB_SIMD16_WIDTH * GetBPC(c) / 8 bytes.
+    auto storeComponent = [&](int32_t c)
+    {
+        FormatTraits<format>::storeSOA(c, pTileBuffer, value.v[c]);
+        pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(c) / 8);
+    };
+
+    // SIMD16 tiles per raster tile; UnrollerL presumably calls the lambda for components [0, numComps) - TODO confirm.
+    const uint32_t simdTileCount = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
+    for (uint32_t tile = 0; tile != simdTileCount; ++tile)
+    {
+        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(storeComponent);
+    }
+}
+
+#endif
+template<SWR_FORMAT format>
+INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
+{
+    // Clears the part of attachment rt's hot tile for macrotile macroTile that rect covers, then marks that hot tile dirty.
+    // First build the per-component SIMD clear value; clear[] holds the RGBA clear color as float/uint32 bit patterns.
+#if USE_8x2_TILE_BACKEND
+    simd16vector vClear;
+    for (uint32_t c = 0; c < FormatTraits<format>::numComps; ++c)
+    {
+        // load1: presumably replicates the component's raw 32 bits across the register - TODO confirm
+        simd16scalar vChannel = _simd16_load1_ps((const float*)&clear[c]);
+        if (FormatTraits<format>::isNormalized(c))
+        {
+            vChannel = _simd16_mul_ps(vChannel, _simd16_set1_ps(FormatTraits<format>::fromFloat(c)));
+            vChannel = _simd16_castsi_ps(_simd16_cvtps_epi32(vChannel));
+        }
+        vChannel = FormatTraits<format>::pack(c, vChannel);
+        vClear.v[FormatTraits<format>::swizzle(c)] = vChannel;
+    }
+
+#else
+    simdvector vClear;
+    for (uint32_t c = 0; c < FormatTraits<format>::numComps; ++c)
+    {
+        // load1: presumably replicates the component's raw 32 bits across the register - TODO confirm
+        simdscalar vChannel = _simd_load1_ps((const float*)&clear[c]);
+        if (FormatTraits<format>::isNormalized(c))
+        {
+            vChannel = _simd_mul_ps(vChannel, _simd_set1_ps(FormatTraits<format>::fromFloat(c)));
+            vChannel = _simd_castsi_ps(_simd_cvtps_epi32(vChannel));
+        }
+        vChannel = FormatTraits<format>::pack(c, vChannel);
+        vClear.v[FormatTraits<format>::swizzle(c)] = vChannel;
+    }
+
+#endif
+    uint32_t macroX, macroY;
+    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
+
+    // Begin with the rectangle of the whole macrotile
+    SWR_RECT region =
+    {
+        int32_t(macroX) * KNOB_MACROTILE_X_DIM,
+        int32_t(macroY) * KNOB_MACROTILE_Y_DIM,
+        int32_t(macroX + 1) * KNOB_MACROTILE_X_DIM,
+        int32_t(macroY + 1) * KNOB_MACROTILE_Y_DIM,
+    };
+
+    // clip it against the requested clear rectangle
+    region &= rect;
+
+    // make the coordinates relative to this macrotile's origin
+    region.Translate(-int32_t(macroX) * KNOB_MACROTILE_X_DIM, -int32_t(macroY) * KNOB_MACROTILE_Y_DIM);
+
+    // switch the maximums from exclusive to inclusive before reducing to raster tile indices
+    --region.xmax;
+    --region.ymax;
+
+    // pixel coordinates -> raster tile indices
+    region.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
+    region.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
+    region.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
+    region.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
+
+    const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
+    // byte strides: one sample of a raster tile / one raster tile with all samples / one row of raster tiles
+    const uint32_t sampleStride = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
+    const uint32_t tileStride = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
+    const uint32_t tileRowStride = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * tileStride;
+    const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
+
+    HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
+    uint32_t firstTileOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, region.xmin, region.ymin)) * numSamples;
+    uint8_t* pRowStart = pHotTile->pBuffer + firstTileOffset;
+
+    // visit every raster tile inside the clipped region, clearing each of its samples in turn
+    for (int32_t tileRow = region.ymin; tileRow <= region.ymax; ++tileRow)
+    {
+        uint8_t* pCursor = pRowStart;
+        for (int32_t tileCol = region.xmin; tileCol <= region.xmax; ++tileCol)
+        {
+            for (int32_t s = 0; s < numSamples; ++s)
+            {
+                ClearRasterTile<format>(pCursor, vClear);
+                pCursor += sampleStride;
+            }
+        }
+        pRowStart += tileRowStride;
+    }
+
+    pHotTile->state = HOTTILE_DIRTY;
+}
+
+
+void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+{
+    // Backend work function for a clear of one macrotile; pUserData points to the CLEAR_DESC describing the clear.
+    SWR_CONTEXT *pContext = pDC->pContext;
+    if (KNOB_FAST_CLEAR)
+    {
+        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
+        uint32_t numSamples = GetNumSamples(sampleCount);
+
+        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
+
+        AR_BEGIN(BEClear, pDC->drawId);
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
+        {
+            unsigned long rt = 0;
+            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
+            while (_BitScanForward(&rt, mask))
+            {
+                mask &= ~(1 << rt); // drop the bit just found so the scan moves to the next attachment
+
+                HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
+
+                // Fast clear only records the clear color and flags the hot tile as "needs clear"; no tile contents are written here.
+                pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
+                pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
+                pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
+                pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
+                pHotTile->state = HOTTILE_CLEAR;
+            }
+        }
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
+        {
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
+            pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
+            pHotTile->state = HOTTILE_CLEAR;
+        }
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
+        {
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
+
+            pHotTile->clearData[0] = pClear->clearStencil;
+            pHotTile->state = HOTTILE_CLEAR;
+        }
+
+        AR_END(BEClear, 1);
+    }
+    else
+    {
+        // Legacy clear: call the per-format routine from gClearTilesTable for each requested attachment right away
+        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+        AR_BEGIN(BEClear, pDC->drawId);
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
+        {
+            DWORD clearData[4];
+            clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
+            clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
+            clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
+            clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
+
+            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
+            SWR_ASSERT(pfnClearTiles != nullptr);
+
+            unsigned long rt = 0;
+            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
+            while (_BitScanForward(&rt, mask))
+            {
+                mask &= ~(1 << rt); // drop the bit just found so the scan moves to the next attachment
+
+                pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+            }
+        }
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
+        {
+            DWORD clearData[4];
+            clearData[0] = *(DWORD*)&pClear->clearDepth;
+            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
+            SWR_ASSERT(pfnClearTiles != nullptr);
+
+            pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+        }
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
+        {
+            DWORD clearData[4];
+            clearData[0] = pClear->clearStencil;
+            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
+            SWR_ASSERT(pfnClearTiles != nullptr); // same guard as the color and depth paths before calling through the table
+            pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+        }
+
+        AR_END(BEClear, 1);
+    }
+}
+
+void InitClearTilesTable()
+{
+    memset(gClearTilesTable, 0, sizeof(gClearTilesTable));
+    // Zero every slot first, then register ClearMacroTile<format> for the formats listed here; other formats stay zero.
+    gClearTilesTable[R8G8B8A8_UNORM]        = ClearMacroTile<R8G8B8A8_UNORM>;
+    gClearTilesTable[B8G8R8A8_UNORM]        = ClearMacroTile<B8G8R8A8_UNORM>;
+    gClearTilesTable[R32_FLOAT]             = ClearMacroTile<R32_FLOAT>;
+    gClearTilesTable[R32G32B32A32_FLOAT]    = ClearMacroTile<R32G32B32A32_FLOAT>;
+    gClearTilesTable[R8_UINT]               = ClearMacroTile<R8_UINT>;
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
new file mode 100644
index 0000000..b6a86b5
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -0,0 +1,1081 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend_impl.h
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+*        operations.
+*
+******************************************************************************/
+#pragma once
+
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]); // dims presumably [input coverage][centroid][canEarlyZ] - TODO confirm
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]); // same shape as gBackendSampleRateTable: [sample type][input coverage][centroid][canEarlyZ]
+// Forward declaration; being static INLINE, each translation unit that calls it needs the definition too.
+static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
+
+
+enum SWR_BACKEND_FUNCS // backend function variants; by name: single-sample, MSAA pixel-rate, MSAA sample-rate
+{
+    SWR_BACKEND_SINGLE_SAMPLE,
+    SWR_BACKEND_MSAA_PIXEL_RATE,
+    SWR_BACKEND_MSAA_SAMPLE_RATE,
+    SWR_BACKEND_FUNCS_MAX, // equals the number of enumerators above (3)
+};
+
+#if KNOB_SIMD_WIDTH == 8
+static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; // per-lane x offsets at +0.5: lanes 0-3 span x 0-1, lanes 4-7 span x 2-3
+static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; // per-lane y offsets at +0.5: two rows, so the 8 lanes form two 2x2 groups side by side
+static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; // same lane layout at integer offsets (UL presumably means upper-left corner)
+static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; // same lane layout at integer offsets
+#define MASK 0xff // low 8 bits set, matching the 8 lanes of this SIMD width
+#endif
+
+static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
+{
+    // Returns a per-lane bitmask: a bit is set when any enabled clip distance interpolates to < 0 or NaN at (vI, vJ).
+    // pUserClipBuffer is consumed as three consecutive floats for every bit set in clipMask.
+    simdscalar vCulled = _simd_setzero_ps();
+    float* pCoeffs = pUserClipBuffer;
+
+    for (uint32_t remaining = _mm_popcnt_u32(clipMask); remaining != 0; --remaining, pCoeffs += 3)
+    {
+        // the three values of this clip distance, each broadcast to all lanes
+        const simdscalar vA = _simd_broadcast_ss(&pCoeffs[0]);
+        const simdscalar vB = _simd_broadcast_ss(&pCoeffs[1]);
+        const simdscalar vC = _simd_broadcast_ss(&pCoeffs[2]);
+
+        // interpolate the clip distance for each lane
+        const simdscalar vDistance = vplaneps(vA, vB, vC, vI, vJ);
+
+        // NLE_UQ(0, d) holds when !(0 <= d), i.e. d is negative or NaN; accumulate those lanes
+        vCulled = _simd_or_ps(vCulled, _simd_cmp_ps(_simd_setzero_ps(), vDistance, _CMP_NLE_UQ));
+    }
+
+    return _simd_movemask_ps(vCulled);
+}
+
+INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+{
+    // Returns the byte offset of sample sampleNum's data within a raster tile of the color hot tile.
+    // sampleStride is the byte size of one sample's raster tile in the color hot tile format.
+    static const uint32_t sampleStride = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8);
+
+    // Lookup of sampleStride * n for n = 0..15; the table has exactly 16 entries,
+    // so sampleNum must stay below 16 (checked by the assert below).
+    static const uint32_t offsetForSample[16]
+    {
+        sampleStride *  0, sampleStride *  1,
+        sampleStride *  2, sampleStride *  3,
+        sampleStride *  4, sampleStride *  5,
+        sampleStride *  6, sampleStride *  7,
+        sampleStride *  8, sampleStride *  9,
+        sampleStride * 10, sampleStride * 11,
+        sampleStride * 12, sampleStride * 13,
+        sampleStride * 14, sampleStride * 15,
+    };
+
+    assert(sampleNum < 16);
+    return offsetForSample[sampleNum];
+}
+
+INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+{
+    // Returns the byte offset of sample sampleNum's data within a raster tile of the depth hot tile.
+    // sampleStride is the byte size of one sample's raster tile in the depth hot tile format.
+    static const uint32_t sampleStride = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8);
+
+    // Lookup of sampleStride * n for n = 0..15; the table has exactly 16 entries,
+    // so sampleNum must stay below 16 (checked by the assert below).
+    static const uint32_t offsetForSample[16]
+    {
+        sampleStride *  0, sampleStride *  1,
+        sampleStride *  2, sampleStride *  3,
+        sampleStride *  4, sampleStride *  5,
+        sampleStride *  6, sampleStride *  7,
+        sampleStride *  8, sampleStride *  9,
+        sampleStride * 10, sampleStride * 11,
+        sampleStride * 12, sampleStride * 13,
+        sampleStride * 14, sampleStride * 15,
+    };
+
+    assert(sampleNum < 16);
+    return offsetForSample[sampleNum];
+}
+
+INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+{
+    // Returns the byte offset of sample sampleNum's data within a raster tile of the stencil hot tile.
+    // sampleStride is the byte size of one sample's raster tile in the stencil hot tile format.
+    static const uint32_t sampleStride = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8);
+
+    // Lookup of sampleStride * n for n = 0..15; the table has exactly 16 entries,
+    // so sampleNum must stay below 16 (checked by the assert below).
+    static const uint32_t offsetForSample[16]
+    {
+        sampleStride *  0, sampleStride *  1,
+        sampleStride *  2, sampleStride *  3,
+        sampleStride *  4, sampleStride *  5,
+        sampleStride *  6, sampleStride *  7,
+        sampleStride *  8, sampleStride *  9,
+        sampleStride * 10, sampleStride * 11,
+        sampleStride * 12, sampleStride * 13,
+        sampleStride * 14, sampleStride * 15,
+    };
+
+    assert(sampleNum < 16);
+    return offsetForSample[sampleNum];
+}
+
+template<typename T, uint32_t InputCoverage>
+struct generateInputCoverage
+{
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+    {
+        // will need to update for avx512
+        assert(KNOB_SIMD_WIDTH == 8);
+
+        simdscalari mask[2];
+        simdscalari sampleCoverage[2];
+        
+        if(T::bIsCenterPattern)
+        {
+            // center coverage is the same for all samples; just broadcast to the sample slots
+            uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+            if(T::MultisampleT::numSamples == 1)
+            {
+                sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 2)
+            {
+                sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 4)
+            {
+                sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 8)
+            {
+                sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 16)
+            {
+                sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
+                sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
+            }
+        }
+        else
+        {
+            simdscalari src = _simd_set1_epi32(0);
+            simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+            if(T::MultisampleT::numSamples == 1)
+            {
+                mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+            }
+            else if(T::MultisampleT::numSamples == 2)
+            {
+                mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+            }
+            else if(T::MultisampleT::numSamples == 4)
+            {
+                mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+            }
+            else if(T::MultisampleT::numSamples == 8)
+            {
+                mask[0] = _simd_set1_epi32(-1);
+            }
+            else if(T::MultisampleT::numSamples == 16)
+            {
+                mask[0] = _simd_set1_epi32(-1);
+                mask[1] = _simd_set1_epi32(-1);
+                index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+            }
+
+            // gather coverage for samples 0-7
+            sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+            if(T::MultisampleT::numSamples > 8)
+            {
+                // gather coverage for samples 8-15
+                sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+            }
+        }
+
+        mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+        // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+        simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+        simdscalari packedCoverage1;
+        if(T::MultisampleT::numSamples > 8)
+        {
+            // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+            packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+        }
+
+    #if (KNOB_ARCH == KNOB_ARCH_AVX)
+        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+        simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+        simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+        packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+        simdscalari packedSampleCoverage;
+        if(T::MultisampleT::numSamples > 8)
+        {
+            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+            hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+            shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+            shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+            packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+            packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+        }
+        else
+        {
+            packedSampleCoverage = packedCoverage0;
+        }
+    #else
+        simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+        packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+        simdscalari packedSampleCoverage;
+        if(T::MultisampleT::numSamples > 8)
+        {
+            permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+            packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+            // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+            packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
+        }
+        else
+        {
+            packedSampleCoverage = packedCoverage0;
+        }
+    #endif
+
+        for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+        {
+            // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+            inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
+
+            if(!T::bForcedSampleCount)
+            {
+                // input coverage has to be anded with sample mask if MSAA isn't forced on
+                inputMask[i] &= sampleMask;
+            }
+
+            // shift to the next pixel in the 4x2
+            packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+        }
+    }
+
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
+    {
+        uint32_t pixelMasks[KNOB_SIMD_WIDTH]; // filled by the array overload below: one sample-coverage bitmask per pixel
+        generateInputCoverage<T, T::InputCoverage>(coverageMask, pixelMasks, sampleMask);
+        inputCoverage = _simd_castsi_ps(_simd_set_epi32(pixelMasks[7], pixelMasks[6], pixelMasks[5], pixelMasks[4], pixelMasks[3], pixelMasks[2], pixelMasks[1], pixelMasks[0])); // bit-cast only, no int->float conversion
+    }
+
+};
+
+template<typename T>
+struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
+{
+    // SIMD overload: lane i of inputCoverage gets integer 1 when bit i of coverageMask[0] is set, 0 otherwise; sampleMask is not used
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
+    {
+        // the lane-bit constants below only describe 8 lanes; will need to update for avx512
+        assert(KNOB_SIMD_WIDTH == 8);
+        const simdscalari vLaneBit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+        const simdscalari vPixelBit = _simd_and_si(_simd_set1_epi32(coverageMask[0]), vLaneBit);
+        const simdscalari vIsCovered = _simd_cmplt_epi32(_simd_setzero_si(), vPixelBit);
+        inputCoverage = _simd_castsi_ps(_simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vIsCovered));
+    }
+    // Array overload: inputMask[i] gets the full sample mask when pixel i is flagged in coverageMask[0], 0 otherwise; sampleMask is not used
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+    {
+        const uint32_t pixelBits = (coverageMask[0] & MASK);
+        static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
+        for(int pixel = 0; pixel < KNOB_SIMD_WIDTH; pixel++)
+        {
+            // a pixel set in the conservative coverage mask reports every sample as covered
+            const bool pixelCovered = (((1 << pixel) & pixelBits) != 0);
+            inputMask[pixel] = pixelCovered ? FullCoverageMask : 0;
+        }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Centroid behaves exactly as follows :
+// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to 
+//     have a sample location there).
+// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the 
+//     coverage with the SampleMask Rasterizer State.
+// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is 
+//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 
+//     SampleMask Rasterizer State is the evaluation point. Otherwise (full SampleMask), the pixel center is the evaluation point.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T>
+INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
+                            const uint64_t *const coverageMask, const uint32_t sampleMask,
+                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    uint32_t inputMask[KNOB_SIMD_WIDTH];
+    generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
+    // inputMask[i] is now the covered-sample bitmask of pixel i; the results of this function go to psContext.vX/vY.centroid
+    // Case (2) - partially covered pixel
+    // (computed for every pixel first; cases (1), (3b) and (3a) are blended over it further down)
+    // scan for first covered sample per pixel in the 4x2 span
+    unsigned long sampleNum[KNOB_SIMD_WIDTH];
+    (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
+    (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
+    (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
+    (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
+    (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
+    (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
+    (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
+    (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
+
+    // look up and set the sample offsets from UL pixel corner for first covered sample 
+    simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
+                                    samplePos.X(sampleNum[6]),
+                                    samplePos.X(sampleNum[5]),
+                                    samplePos.X(sampleNum[4]),
+                                    samplePos.X(sampleNum[3]),
+                                    samplePos.X(sampleNum[2]),
+                                    samplePos.X(sampleNum[1]),
+                                    samplePos.X(sampleNum[0]));
+
+    simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
+                                    samplePos.Y(sampleNum[6]),
+                                    samplePos.Y(sampleNum[5]),
+                                    samplePos.Y(sampleNum[4]),
+                                    samplePos.Y(sampleNum[3]),
+                                    samplePos.Y(sampleNum[2]),
+                                    samplePos.Y(sampleNum[1]),
+                                    samplePos.Y(sampleNum[0]));
+    // add sample offset to UL pixel corner
+    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
+    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
+
+    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
+    static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
+    simdscalari vInputCoveragei =  _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+    simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+
+    static const simdscalari vZero = _simd_setzero_si();
+    const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+    simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+    simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+    simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+
+    simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+
+    // set the centroid position based on results from above: pixel center for cases (1)/(3b), first covered sample otherwise
+    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
+    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
+
+    // Case (3a) No samples covered and partial sample mask
+    simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+    // sample mask should never be all 0's for this case, but handle it anyways (falls back to sample 0)
+    unsigned long firstCoveredSampleMaskSample = 0;
+    (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
+
+    simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
+    // as in case (2), the looked-up sample position is an offset from the UL pixel corner, so the corner must be added back
+    vXSample = _simd_add_ps(vXSamplePosUL, _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample)));
+    vYSample = _simd_add_ps(vYSamplePosUL, _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample)));
+
+    // blend in case 3a pixel locations
+    psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
+    psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
+}
+
+INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
+                                     const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    // plane equations for I and J at the centroid position (vXSamplePosUL / vYSamplePosUL are not read here)
+    const simdscalar vIPlane = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
+    const simdscalar vJPlane = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
+    // scale both by the reciprocal determinant
+    psContext.vI.centroid = _simd_mul_ps(vIPlane, coeffs.vRecipDet);
+    psContext.vJ.centroid = _simd_mul_ps(vJPlane, coeffs.vRecipDet);
+    // interpolate 1/w from the centroid I,J
+    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
+}
+
+INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
+{
+    // a lane is accepted when minz <= z <= maxz; the lane results are packed into a bitmask
+    const simdscalar vAtOrAboveMin = _simd_cmpge_ps(z, _simd_set1_ps(minz));
+    const simdscalar vAtOrBelowMax = _simd_cmple_ps(z, _simd_set1_ps(maxz));
+    return _simd_movemask_ps(_simd_and_ps(vAtOrAboveMin, vAtOrBelowMax));
+}
+
+template<typename T>
+INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
+{
+    // normal MSAA mode: rasterizer and OM are running at the same sample count
+    if(!T::bForcedSampleCount)
+    {
+        return T::MultisampleT::numSamples;
+    }
+
+    // forced sample count above 1X: the RT has to be single sample
+    if(T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)
+    {
+        return 1;
+    }
+
+    // forced to single sample: run the OM at the sample count of the RT
+    return (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X) ? GetNumSamples(blendSampleCount)
+                                                                : T::MultisampleT::numSamples;
+}
+
+inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
+{
+    // broadcast every scalar coefficient of the triangle across a full SIMD register
+    BarycentricCoeffs &out = *coeffs;
+    out.vIa = _simd_broadcast_ss(&work.I[0]);
+    out.vIb = _simd_broadcast_ss(&work.I[1]);
+    out.vIc = _simd_broadcast_ss(&work.I[2]);
+
+    out.vJa = _simd_broadcast_ss(&work.J[0]);
+    out.vJb = _simd_broadcast_ss(&work.J[1]);
+    out.vJc = _simd_broadcast_ss(&work.J[2]);
+
+    out.vZa = _simd_broadcast_ss(&work.Z[0]);
+    out.vZb = _simd_broadcast_ss(&work.Z[1]);
+    out.vZc = _simd_broadcast_ss(&work.Z[2]);
+
+    out.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+    out.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
+    out.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
+    out.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+}
+
+inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers)
+{
+    // copy the color buffer pointer of every render target whose bit is set in colorHotTileMask
+    DWORD rtIndex;
+    while (_BitScanForward(&rtIndex, colorHotTileMask))
+    {
+        assert(rtIndex < SWR_NUM_RENDERTARGETS);
+        pColorBuffer[rtIndex] = renderBuffers.pColor[rtIndex];
+        colorHotTileMask &= ~(1 << rtIndex); // retire this bit so the scan moves on
+    }
+
+    // the depth and stencil outputs are optional
+    if (pDepthBuffer != nullptr)
+    {
+        *pDepthBuffer = renderBuffers.pDepth;
+    }
+    if (pStencilBuffer != nullptr)
+    {
+        *pStencilBuffer = renderBuffers.pStencil;
+    }
+}
+
+template<typename T>
+void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
+{
+    // per-triangle attribute pointers and flags
+    psContext->pAttribs = work.pAttribs;
+    psContext->pPerspAttribs = work.pPerspAttribs;
+    psContext->pRecipW = work.pRecipW;
+    psContext->frontFace = work.triFlags.frontFacing;
+    psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
+    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+    psContext->I = work.I;
+    psContext->J = work.J;
+    psContext->recipDet = work.recipDet;
+    // sample positions come from the caller-supplied pattern; the rasterizer sample count comes from the template traits
+    psContext->pSamplePosX = samplePos.X();
+    psContext->pSamplePosY = samplePos.Y();
+    psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
+    psContext->sampleIndex = 0;
+}
+
+template<typename T, bool IsSingleSample>
+void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
+                  const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
+{
+    // IsSingleSample is a separate flag because testing T::MultisampleT::numSamples == 1 doesn't cut it: the centroid positions are still different
+    if (IsSingleSample)
+    {
+        // for 1x case, centroid is pixel center
+        psContext->vX.centroid = psContext->vX.center;
+        psContext->vY.centroid = psContext->vY.center;
+        psContext->vI.centroid = psContext->vI.center;
+        psContext->vJ.centroid = psContext->vJ.center;
+        psContext->vOneOverW.centroid = psContext->vOneOverW.center;
+        return;
+    }
+
+    if (!T::bCentroidPos)
+    {
+        // no centroid position requested: reuse the current sample position (centroid I/J/1/w are left untouched)
+        psContext->vX.centroid = psContext->vX.sample;
+        psContext->vY.centroid = psContext->vY.sample;
+        return;
+    }
+
+    ///@todo: don't need to generate input coverage 2x if input coverage and centroid
+    if (T::bIsCenterPattern)
+    {
+        psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
+        psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
+    }
+    else
+    {
+        // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
+        CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
+    }
+
+    CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
+}
+
+template<typename T>
+struct PixelRateZTestLoop // functor: per-sample coverage, depth-bounds, user-clip and depth/stencil testing for one SIMD of pixels
+{
+    PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
+                       uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
+                       pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
+                       samplePos(state.rastState.samplePositions),
+                       clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
+    // Tests every coverage sample. On return activeLanes keeps only lanes where some sample passed depth; returns the total number of passing (lane, sample) pairs.
+    INLINE
+    uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, 
+                        const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
+    {
+        SWR_CONTEXT *pContext = pDC->pContext; // NOTE(review): not referenced by name below; presumably consumed by the AR_BEGIN/AR_END macros - confirm
+
+        uint32_t statCount = 0;
+        simdscalar anyDepthSamplePassed = _simd_setzero_ps();
+        for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+        {
+            const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample]; // view this sample's 64-bit coverage mask as bytes; currentSimdIn8x8 picks the byte
+            vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
+
+            if(!_simd_movemask_ps(vCoverageMask[sample]))
+            {
+                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
+                continue;
+            }
+
+            // offset depth/stencil buffers current sample
+            uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+            uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+            if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+            {
+                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); // depth currently stored in the buffer, not the incoming fragment depth
+
+                const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
+            }
+
+            AR_BEGIN(BEBarycentric, pDC->drawId);
+
+            // calculate per sample positions
+            psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
+            psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
+
+            // calc I & J per sample
+            CalcSampleBarycentrics(coeffs, psContext);
+
+            if(psState.writesODepth)
+            {
+                {
+                    // the PS wrote oDepth into psContext.vZ: that same value is used (and tested) for each sample
+                    vZ[sample] = psContext.vZ;
+                }
+            }
+            else
+            {
+                vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
+            }
+
+            AR_END(BEBarycentric, 0);
+
+            ///@todo: perspective correct vs non-perspective correct clipping?
+            // if clip distances are enabled, we need to interpolate for each sample
+            if(clipDistanceMask)
+            {
+                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
+
+                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
+            }
+
+            // ZTest for this sample; both pass masks start out as the sample's coverage
+            ///@todo Need to uncomment out this bucket.
+            //AR_BEGIN(BEDepthBucket, pDC->drawId);
+            depthPassMask[sample] = vCoverageMask[sample];
+            stencilPassMask[sample] = vCoverageMask[sample];
+            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                                                     vZ[sample], pDepthSample, vCoverageMask[sample], 
+                                                     pStencilSample, &stencilPassMask[sample]);
+            //AR_END(BEDepthBucket, 0);
+
+            // write depth/stencil right away when earlyZ is forced on or nothing passed depth; the sample is skipped only if nothing passed
+            if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
+            {
+                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
+                                  pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
+
+                if(!_simd_movemask_ps(depthPassMask[sample]))
+                {
+                    continue;
+                }
+            }
+            anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
+            uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
+            statCount += _mm_popcnt_u32(statMask);
+        }
+
+        activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
+        // return number of samples that passed depth and coverage
+        return statCount;
+    }
+
+    // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
+    simdscalar vZ[T::MultisampleT::numCoverageSamples];
+    simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
+    simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
+    simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
+
+private:
+    // functor inputs (references are bound in the constructor, so the referenced objects must outlive this functor)
+    DRAW_CONTEXT* pDC;
+    uint32_t workerId;
+
+    const SWR_TRIANGLE_DESC& work;
+    const BarycentricCoeffs& coeffs;
+    const API_STATE& state;
+    const SWR_PS_STATE& psState;
+    const SWR_MULTISAMPLE_POS& samplePos;
+    const uint8_t clipDistanceMask;
+    uint8_t*& pDepthBuffer;
+    uint8_t*& pStencilBuffer;
+};
+
+INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+    // barycentrics at the pixel center: each plane equation result is scaled by the reciprocal determinant
+    // I
+    psContext.vI.center = _simd_mul_ps(vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center), coeffs.vRecipDet);
+    // J
+    psContext.vJ.center = _simd_mul_ps(vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center), coeffs.vRecipDet);
+
+    // interpolate 1/w from the center I,J
+    psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
+}
+
+static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+    // plane equations for I and J at the current sample position
+    const simdscalar vIPlane = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
+    const simdscalar vJPlane = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
+    // scale both by the reciprocal determinant
+    psContext.vI.sample = _simd_mul_ps(vIPlane, coeffs.vRecipDet);
+    psContext.vJ.sample = _simd_mul_ps(vJPlane, coeffs.vRecipDet);
+    // interpolate 1/w from the sample I,J
+    psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
+}
+
+// Merge Output to 4x2 SIMD Tile Format: for each render target set in renderTargetMask, run the optional blend function and mask-store the result per channel
+INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
+    const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask)
+{
+    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+    const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
+    simdvector blendOut;
+
+    DWORD rt = 0;
+    while (_BitScanForward(&rt, renderTargetMask))
+    {
+        renderTargetMask &= ~(1 << rt); // retire this render target's bit so the scan moves on
+        uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
+
+        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+
+        {
+            // pfnBlendFunc may not update all channels.  Initialize with PS output.
+            /// TODO: move this into the blend JIT.
+            blendOut = psContext.shaded[rt];
+
+            // Blend outputs and update coverage mask for alpha test
+            if(pfnBlendFunc[rt] != nullptr)
+            {
+                pfnBlendFunc[rt](
+                    pBlendState,
+                    psContext.shaded[rt],
+                    psContext.shaded[1], // NOTE(review): always RT1's output regardless of rt; presumably the dual-source blend input - confirm
+                    psContext.shaded[0].w, // NOTE(review): always RT0's alpha regardless of rt - confirm against the blend JIT signature
+                    sample,
+                    pColorSample,
+                    blendOut,
+                    &psContext.oMask,
+                    (simdscalari*)&coverageMask);
+            }
+        }
+
+        // final write mask: coverage (which the blend function may have modified through the pointer above) ANDed with the depth pass mask
+        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
+
+        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
+        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+
+        const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); // byte stride between consecutive channel stores below
+
+        // store with color mask
+        if(!pRTBlend->writeDisableRed)
+        {
+            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
+        }
+        if(!pRTBlend->writeDisableGreen)
+        {
+            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
+        }
+        if(!pRTBlend->writeDisableBlue)
+        {
+            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
+        }
+        if(!pRTBlend->writeDisableAlpha)
+        {
+            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
+        }
+    }
+}
+
+#if USE_8x2_TILE_BACKEND
+// Merge Output to 8x2 SIMD16 Tile Format: same flow as the 4x2 merger, but channels sit two simdscalars apart and useAlternateOffset selects the second simdscalar
+INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
+    const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
+{
+    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+    uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
+
+    if (useAlternateOffset)
+    {
+        rasterTileColorOffset += sizeof(simdscalar); // step over one simdscalar to address the other half of each channel pair
+    }
+
+    simdvector blendSrc;
+    simdvector blendOut;
+
+    DWORD rt;
+    while (_BitScanForward(&rt, renderTargetMask))
+    {
+        renderTargetMask &= ~(1 << rt); // retire this render target's bit so the scan moves on
+
+        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+
+        simdscalar* pColorSample;
+        bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
+        if (hotTileEnable)
+        {
+            pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
+            blendSrc[0] = pColorSample[0]; // gather the four channels (stride of two simdscalars) into a contiguous copy for the blend function
+            blendSrc[1] = pColorSample[2];
+            blendSrc[2] = pColorSample[4];
+            blendSrc[3] = pColorSample[6];
+        }
+        else
+        {
+            pColorSample = nullptr; // never dereferenced: every channel store below is disabled in this case
+        }
+        // NOTE(review): when hotTileEnable is false, blendSrc is left uninitialized yet still handed to the blend function below - confirm that is harmless
+        {
+            // pfnBlendFunc may not update all channels.  Initialize with PS output.
+            /// TODO: move this into the blend JIT.
+            blendOut = psContext.shaded[rt];
+
+            // Blend outputs and update coverage mask for alpha test
+            if(pfnBlendFunc[rt] != nullptr)
+            {
+                pfnBlendFunc[rt](
+                    pBlendState,
+                    psContext.shaded[rt],
+                    psContext.shaded[1], // NOTE(review): always RT1's output regardless of rt; presumably the dual-source blend input - confirm
+                    psContext.shaded[0].w, // NOTE(review): always RT0's alpha regardless of rt - confirm against the blend JIT signature
+                    sample,
+                    reinterpret_cast<uint8_t *>(&blendSrc),
+                    blendOut,
+                    &psContext.oMask,
+                    reinterpret_cast<simdscalari *>(&coverageMask));
+            }
+        }
+
+        // final write mask: coverage (which the blend function may have modified through the pointer above) ANDed with the depth pass mask
+        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
+
+        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
+        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+
+        // store with color mask
+        if (!pRTBlend->writeDisableRed)
+        {
+            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
+        }
+        if (!pRTBlend->writeDisableGreen)
+        {
+            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
+        }
+        if (!pRTBlend->writeDisableBlue)
+        {
+            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
+        }
+        if (!pRTBlend->writeDisableAlpha)
+        {
+            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
+        }
+    }
+}
+
+#endif
+
+template<typename T>
+void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
+    // Pixel-rate backend: steps over the KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM tile at (x, y) one SIMD tile at a time, runs the pixel
+    // shader at most once per step and broadcasts its outputs to every output-merger sample. T supplies the compile-time traits.
+    SWR_CONTEXT *pContext = pDC->pContext; // not referenced directly below; presumably consumed by the AR_* macros - TODO confirm
+
+    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+
+    BarycentricCoeffs coeffs;
+    SetupBarycentricCoeffs(&coeffs, work);
+
+    SWR_PS_CONTEXT psContext;
+    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
+    SetupPixelShaderContext<T>(&psContext, samplePos, work);
+
+    uint8_t *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
+
+    AR_END(BESetup, 0);
+    // callable Z-test helper; its per-sample vCoverageMask/depthPassMask/stencilPassMask/vZ members are read by the output merger loop below
+    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
+
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); // Y advance applied after each outer-loop iteration
+
+    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x))); // X restarts from x for every row
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); // X advance applied after each inner-loop iteration
+
+        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+#if USE_8x2_TILE_BACKEND
+            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); // true when the SIMD_TILE_X_DIM bit of xx is set
+#endif
+            simdscalar activeLanes;
+            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; // nothing covered in this step: go straight to the end-of-tile bookkeeping
+            activeLanes = vMask(work.anyCoveredSamples & MASK);
+
+            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+            {
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+            }
+
+            AR_BEGIN(BEBarycentric, pDC->drawId);
+
+            CalcPixelBarycentrics(coeffs, psContext);
+
+            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+            AR_END(BEBarycentric, 0);
+
+            if(T::bForcedSampleCount)
+            {
+                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
+                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
+            }
+
+            // Early-Z? (only when the traits allow early Z and the sample count is not forced)
+            if(T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); // presumably updates activeLanes in place - TODO confirm
+                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+            }
+
+            // if we have no covered samples that passed depth at this point, go to next tile
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            if(state.psState.usesSourceDepth)
+            {
+                AR_BEGIN(BEBarycentric, pDC->drawId);
+                // interpolate and quantize z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+                AR_END(BEBarycentric, 0);
+            }
+
+            // pixels that are currently active
+            psContext.activeMask = _simd_castps_si(activeLanes);
+            psContext.oMask = T::MultisampleT::FullSampleMask();
+
+            // execute pixel shader
+            AR_BEGIN(BEPixelShader, pDC->drawId);
+            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+            AR_END(BEPixelShader, 0);
+
+            // update active lanes to remove any discarded or oMask'd pixels
+            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            // late-Z (taken when early Z was not possible and the sample count is not forced)
+            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
+                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+            }
+
+            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            // output merger
+            // loop over all samples, broadcasting the results of the PS to all passing pixels
+            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
+            {
+                AR_BEGIN(BEOutputMerger, pDC->drawId);
+                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
+                uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
+                simdscalar coverageMask, depthMask;
+                if(T::bForcedSampleCount)
+                {
+                    coverageMask = depthMask = activeLanes; // no Z test ran in this mode, so the active lanes stand in for both masks
+                }
+                else
+                {
+                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
+                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
+                    if(!_simd_movemask_ps(depthMask))
+                    {
+                        // stencil should already have been written in early/lateZ tests
+                        AR_END(BEOutputMerger, 0);
+                        continue;
+                    }
+                }
+                
+                // broadcast the results of the PS to all passing pixels
+#if USE_8x2_TILE_BACKEND
+                OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset);
+#else // USE_8x2_TILE_BACKEND
+                OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask);
+#endif // USE_8x2_TILE_BACKEND
+
+                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) // depth/stencil for this sample is written here unless early Z or sample count is forced
+                {
+                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+                }
+                AR_END(BEOutputMerger, 0);
+            }
+Endtile:
+            AR_BEGIN(BEEndTile, pDC->drawId);
+            // every path reaches this point: shift the bits of the step just handled out of each coverage mask, then advance the buffer pointers
+            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+            {
+                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+
+#if USE_8x2_TILE_BACKEND
+            if (useAlternateOffset) // color pointers move only on alternate steps, by two SIMD widths' worth of bytes
+            {
+                DWORD rt;
+                uint32_t rtMask = state.colorHottileEnable;
+                while (_BitScanForward(&rt, rtMask))
+                {
+                    rtMask &= ~(1 << rt); // clear the bit for the render target being handled
+                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                }
+            }
+#else
+            DWORD rt;
+            uint32_t rtMask = state.colorHottileEnable;
+            while (_BitScanForward(&rt, rtMask))
+            {
+                rtMask &= ~(1 << rt); // clear the bit for the render target being handled
+                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+#endif
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            AR_END(BEEndTile, 0);
+
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+        }
+
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+    }
+
+    AR_END(BEPixelRateBackend, 0);
+}
+
+template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
+         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
+    >
+struct SwrBackendTraits // compile-time backend configuration built from integer template arguments
+{
+    static const bool bIsCenterPattern = (isCenter == 1); // each bool flag is set only when its argument is exactly 1
+    static const uint32_t InputCoverage = coverage; // kept as a raw integer; the backends compare it with SWR_INPUT_COVERAGE_* values
+    static const bool bCentroidPos = (centroid == 1);
+    static const bool bForcedSampleCount = (forced == 1);
+    static const bool bCanEarlyZ = (canEarlyZ == 1);
+    typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT; // multisample traits for this sample count and pattern flag
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
new file mode 100644
index 0000000..d81352a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
@@ -0,0 +1,350 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend_sample.cpp
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+*        operations.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "backend.h"
+#include "backend_impl.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+template<typename T>
+void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    SWR_CONTEXT *pContext = pDC->pContext; // not referenced directly below; presumably consumed by the AR_* macros - TODO confirm
+    // Sample-rate backend: steps over the tile at (x, y) one SIMD tile at a time and, for each covered sample, runs its own depth/stencil test, pixel shader call and output merge.
+    AR_BEGIN(BESampleRateBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+
+    BarycentricCoeffs coeffs;
+    SetupBarycentricCoeffs(&coeffs, work);
+
+    SWR_PS_CONTEXT psContext;
+    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
+    SetupPixelShaderContext<T>(&psContext, samplePos, work);
+
+    uint8_t *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
+
+    AR_END(BESetup, 0);
+
+    psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); // Y advance applied after each outer-loop iteration
+
+    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); // X restarts from x for every row
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); // X advance applied after each inner-loop iteration
+
+        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+#if USE_8x2_TILE_BACKEND
+            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); // true when the SIMD_TILE_X_DIM bit of xx is set
+#endif
+            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+            {
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+            }
+
+            AR_BEGIN(BEBarycentric, pDC->drawId);
+
+            CalcPixelBarycentrics(coeffs, psContext);
+
+            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+            AR_END(BEBarycentric, 0);
+
+            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
+            {
+                simdmask coverageMask = work.coverageMask[sample] & MASK;
+
+                if (coverageMask)
+                {
+                    // offset depth/stencil buffers to the current sample
+                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+                    {
+                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); // depth currently stored in the buffer, not the interpolated z
+
+                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); // keep only lanes the depth-bounds helper accepts
+                    }
+
+                    AR_BEGIN(BEBarycentric, pDC->drawId);
+
+                    // calculate per sample positions
+                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
+                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
+
+                    CalcSampleBarycentrics(coeffs, psContext);
+
+                    // interpolate and quantize z
+                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
+                    AR_END(BEBarycentric, 0);
+
+                    // interpolate user clip distance if available
+                    if (state.rastState.clipDistanceMask)
+                    {
+                        coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    simdscalar vCoverageMask = vMask(coverageMask);
+                    simdscalar depthPassMask = vCoverageMask;
+                    simdscalar stencilPassMask = vCoverageMask;
+
+                    // Early-Z?
+                    if (T::bCanEarlyZ)
+                    {
+                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                        AR_END(BEEarlyDepthTest, 0);
+
+                        // early-exit if no samples passed depth or earlyZ is forced on.
+                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+                        {
+                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+
+                            if (!_simd_movemask_ps(depthPassMask))
+                            {
+                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); // 'continue' skips the shift at the loop bottom, so do it here
+                                continue;
+                            }
+                        }
+                    }
+
+                    psContext.sampleIndex = sample;
+                    psContext.activeMask = _simd_castps_si(vCoverageMask);
+
+                    // execute pixel shader
+                    AR_BEGIN(BEPixelShader, pDC->drawId);
+                    UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                    AR_END(BEPixelShader, 0);
+
+                    vCoverageMask = _simd_castsi_ps(psContext.activeMask); // re-read activeMask: the shader gets psContext by pointer and may have changed it
+
+                    // late-Z
+                    if (!T::bCanEarlyZ)
+                    {
+                        AR_BEGIN(BELateDepthTest, pDC->drawId);
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                        AR_END(BELateDepthTest, 0);
+
+                        if (!_simd_movemask_ps(depthPassMask))
+                        {
+                            // need to call depth/stencil write for stencil write
+                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+
+                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); // 'continue' skips the shift at the loop bottom, so do it here
+                            continue;
+                        }
+                    }
+
+                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                    uint32_t statCount = _mm_popcnt_u32(statMask);
+                    UPDATE_STAT_BE(DepthPassCount, statCount);
+
+                    // output merger
+                    AR_BEGIN(BEOutputMerger, pDC->drawId);
+#if USE_8x2_TILE_BACKEND
+                    OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
+#else
+                    OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
+#endif
+
+                    // do final depth write after all pixel kills
+                    if (!state.psState.forceEarlyZ)
+                    {
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                            pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+                    }
+                    AR_END(BEOutputMerger, 0);
+                }
+                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); // shift out the bits of the step just handled for this sample
+            }
+
+        Endtile:
+            ATTR_UNUSED; // no goto in this function targets Endtile; ATTR_UNUSED presumably silences the unused-label warning - TODO confirm
+
+            AR_BEGIN(BEEndTile, pDC->drawId);
+
+            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+#if USE_8x2_TILE_BACKEND
+            if (useAlternateOffset) // color pointers move only on alternate steps, by two SIMD widths' worth of bytes
+            {
+                DWORD rt;
+                uint32_t rtMask = state.colorHottileEnable;
+                while (_BitScanForward(&rt, rtMask))
+                {
+                    rtMask &= ~(1 << rt); // clear the bit for the render target being handled
+                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                }
+            }
+#else
+            DWORD rt;
+            uint32_t rtMask = state.colorHottileEnable;
+            while (_BitScanForward(&rt, rtMask))
+            {
+                rtMask &= ~(1 << rt); // clear the bit for the render target being handled
+                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+#endif
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            AR_END(BEEndTile, 0);
+
+            psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+        }
+
+        psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+    }
+
+    AR_END(BESampleRateBackend, 0);
+}
+
+// Recursive template used to auto-nest conditionals.  Converts dynamic (runtime) enum/bool function
+// arguments to static template arguments; each GetFunc overload appends one value to ArgsT and recurses.
+template <uint32_t... ArgsT>
+struct BEChooserSampleRate
+{
+    // Last Arg Terminator: ArgsT is complete; only SWR_BACKEND_MSAA_SAMPLE_RATE yields a function, every other value asserts and returns nullptr
+    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
+    {
+        switch (tArg)
+        {
+        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
+        case SWR_BACKEND_SINGLE_SAMPLE:
+        case SWR_BACKEND_MSAA_PIXEL_RATE:
+            SWR_ASSERT(0 && "Invalid backend func\n");
+            return nullptr;
+            break;
+        default:
+            SWR_ASSERT(0 && "Invalid backend func\n");
+            return nullptr;
+            break;
+        }
+    }
+
+    // Recursively parse args: append the input coverage mode to ArgsT (unknown values assert and fall back to NONE)
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
+    {
+        switch (tArg)
+        {
+        case SWR_INPUT_COVERAGE_NONE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
+        default:
+            SWR_ASSERT(0 && "Invalid sample pattern\n"); // NOTE(review): message says "sample pattern" although this switch is over SWR_INPUT_COVERAGE
+            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
+            break;
+        }
+    }
+
+    // Recursively parse args: append the sample count to ArgsT (unknown values assert and fall back to 1X)
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+    {
+        switch (tArg)
+        {
+        case SWR_MULTISAMPLE_1X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_2X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_4X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_8X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_16X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
+        default:
+            SWR_ASSERT(0 && "Invalid sample count\n");
+            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+            break;
+        }
+    }
+
+    // Recursively parse args: append a bool flag to ArgsT as 1 (true) or 0 (false)
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
+    {
+        if (tArg == true)
+        {
+            return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
+        }
+
+        return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
+    }
+};
+
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
+{ // Fills every table[sampleCount][inputCoverage][centroid][canEarlyZ] slot with the matching BackendSampleRate instantiation.
+    for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
+    {
+        for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
+        {
+            for (uint32_t centroid = 0; centroid < 2; centroid++)
+            {
+                for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+                {
+                    table[sampleCount][inputCoverage][centroid][canEarlyZ] =
+                        BEChooserSampleRate<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage, // sample count, isCenter = false, coverage,
+                        (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); // centroid, forced = false, canEarlyZ (SwrBackendTraits parameter order), then the backend kind
+                }
+            }
+        }
+    }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
new file mode 100644
index 0000000..34875d3
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
@@ -0,0 +1,326 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend_singlesample.cpp
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+*        operations.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "backend.h"
+#include "backend_impl.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+template<typename T>  // T supplies the compile-time InputCoverage and bCanEarlyZ traits read below
+void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+    // Shades one KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM tile at (x, y), one SIMD tile at a time: depth/stencil test, pixel shader, output merger.
+    AR_BEGIN(BESingleSampleBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+
+    BarycentricCoeffs coeffs;
+    SetupBarycentricCoeffs(&coeffs, work);
+
+    SWR_PS_CONTEXT psContext;
+    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
+    SetupPixelShaderContext<T>(&psContext, samplePos, work);
+
+    uint8_t *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
+
+    AR_END(BESetup, 1);
+
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+#if USE_8x2_TILE_BACKEND
+            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+#endif
+            simdmask coverageMask = work.coverageMask[0] & MASK;  // bits for the current SIMD tile; the mask is shifted down at Endtile
+
+            if (coverageMask)
+            {
+                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+                {
+                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                    const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
+
+                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+                }
+
+                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+                {
+                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                    generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+                }
+
+                AR_BEGIN(BEBarycentric, pDC->drawId);
+
+                CalcPixelBarycentrics(coeffs, psContext);
+
+                CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+                // interpolate z at the pixel centers from the plane coefficients, then quantize it
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
+                AR_END(BEBarycentric, 1);
+
+                // if any user clip distances are enabled, remove the pixels they flag from coverage
+                if (state.rastState.clipDistanceMask)
+                {
+                    coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
+                }
+
+                simdscalar vCoverageMask = vMask(coverageMask);
+                simdscalar depthPassMask = vCoverageMask;
+                simdscalar stencilPassMask = vCoverageMask;
+
+                // Early-Z: run the depth/stencil test before the pixel shader when the traits allow it
+                if (T::bCanEarlyZ)
+                {
+                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                                                     psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
+                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                    AR_END(BEEarlyDepthTest, 0);
+
+                    // write depth/stencil now if earlyZ is forced on or no pixel passed; only the latter skips shading
+                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+                    {
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+
+                        if (!_simd_movemask_ps(depthPassMask))
+                        {
+                            goto Endtile;
+                        }
+                    }
+                }
+
+                psContext.sampleIndex = 0;
+                psContext.activeMask = _simd_castps_si(vCoverageMask);
+
+                // execute pixel shader
+                AR_BEGIN(BEPixelShader, pDC->drawId);
+                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                AR_END(BEPixelShader, 0);
+
+                vCoverageMask = _simd_castsi_ps(psContext.activeMask);  // pick up the active mask as left by the shader
+
+                // late-Z: run the depth/stencil test after the shader, against the post-shader coverage
+                if (!T::bCanEarlyZ)
+                {
+                    AR_BEGIN(BELateDepthTest, pDC->drawId);
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                                                        psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
+                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                    AR_END(BELateDepthTest, 0);
+
+                    if (!_simd_movemask_ps(depthPassMask))
+                    {
+                        // need to call depth/stencil write for stencil write
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+                        goto Endtile;
+                    }
+                } else {
+                    // for early z, consolidate discards from shader
+                    // into depthPassMask
+                    depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
+                }
+
+                uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                uint32_t statCount = _mm_popcnt_u32(statMask);
+                UPDATE_STAT_BE(DepthPassCount, statCount);
+
+                // output merger
+                AR_BEGIN(BEOutputMerger, pDC->drawId);
+#if USE_8x2_TILE_BACKEND
+                OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
+#else
+                OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
+#endif
+
+                // do final depth write after all pixel kills
+                if (!state.psState.forceEarlyZ)
+                {
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                        pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+                }
+                AR_END(BEOutputMerger, 0);
+            }
+
+Endtile:
+            AR_BEGIN(BEEndTile, pDC->drawId);
+
+            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);  // expose the next SIMD tile's coverage bits
+            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+#if USE_8x2_TILE_BACKEND
+            if (useAlternateOffset)  // color pointers step by two SIMD widths, and only on tiles where this flag is set
+            {
+                DWORD rt;
+                uint32_t rtMask = state.colorHottileEnable;
+                while(_BitScanForward(&rt, rtMask))
+                {
+                    rtMask &= ~(1 << rt);
+                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                }
+            }
+#else
+            DWORD rt;
+            uint32_t rtMask = state.colorHottileEnable;
+            while (_BitScanForward(&rt, rtMask))
+            {
+                rtMask &= ~(1 << rt);
+                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+#endif
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            AR_END(BEEndTile, 0);
+
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+        }
+
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+    }
+
+    AR_END(BESingleSampleBackend, 0);
+}
+
+// Recursive chooser that converts runtime backend-selection arguments into the static
+// SwrBackendTraits<ArgsT...> pack.  Each GetFunc overload consumes its first argument, appends
+// the matching constant to ArgsT... and recurses; the SWR_BACKEND_FUNCS overload ends the chain.
+template <uint32_t... ArgsT>
+struct BEChooserSingleSample
+{
+    // Terminator: selects the BackendSingleSample instantiation for the accumulated ArgsT... .
+    // Any backend kind other than SWR_BACKEND_SINGLE_SAMPLE asserts and yields nullptr.
+    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
+    {
+        switch(tArg)
+        {
+        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>;
+        case SWR_BACKEND_MSAA_PIXEL_RATE:
+        case SWR_BACKEND_MSAA_SAMPLE_RATE:
+        default:
+            SWR_ASSERT(0 && "Invalid backend func\n");
+            return nullptr;
+        }
+    }
+
+    // Appends the input coverage mode; unknown values assert and fall back to
+    // SWR_INPUT_COVERAGE_NONE.
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
+    {
+        switch(tArg)
+        {
+        case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
+        case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...);
+        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...);
+        default:
+            SWR_ASSERT(0 && "Invalid input coverage\n");
+            return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
+        }
+    }
+
+    // Appends the sample count; unknown values assert and fall back to SWR_MULTISAMPLE_1X.
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+    {
+        switch(tArg)
+        {
+        case SWR_MULTISAMPLE_1X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_2X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_4X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_8X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
+        case SWR_MULTISAMPLE_16X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
+        default:
+            SWR_ASSERT(0 && "Invalid sample count\n");
+            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+        }
+    }
+
+    // Appends a boolean flag as 1 (true) or 0 (false).
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
+    {
+        if (tArg)
+        {
+            return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
+        }
+
+        return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
+    }
+};
+
+// Builds the single-sample backend lookup table, indexed [inputCoverage][isCentroid][canEarlyZ].
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
+{
+    for (uint32_t coverage = 0; coverage < SWR_INPUT_COVERAGE_COUNT; ++coverage)
+    {
+        for (uint32_t centroid = 0; centroid < 2; ++centroid)
+        {
+            for (uint32_t earlyZ = 0; earlyZ < 2; ++earlyZ)
+            {
+                table[coverage][centroid][earlyZ] = BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false,
+                    static_cast<SWR_INPUT_COVERAGE>(coverage), centroid != 0, false, earlyZ != 0, SWR_BACKEND_SINGLE_SAMPLE);
+            }
+        }
+    }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 9d36f21..de6691b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -26,6 +26,7 @@
 *
 ******************************************************************************/
 
+#include "binner.h"
 #include "context.h"
 #include "frontend.h"
 #include "conservativeRast.h"
@@ -36,173 +37,14 @@
 
 // Function Prototype
 void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
 
 #if USE_SIMD16_FRONTEND
 void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
 #endif
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Offsets added to post-viewport vertex positions based on
-/// raster state.
-static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
-{
-    _simd_set1_ps(0.0f),    // SWR_PIXEL_LOCATION_CENTER
-    _simd_set1_ps(0.5f),    // SWR_PIXEL_LOCATION_UL
-};
-
-#if USE_SIMD16_FRONTEND
-static const simd16scalar g_pixelOffsets_simd16[SWR_PIXEL_LOCATION_UL + 1] =
-{
-    _simd16_set1_ps(0.0f),  // SWR_PIXEL_LOCATION_CENTER
-    _simd16_set1_ps(0.5f),  // SWR_PIXEL_LOCATION_UL
-};
-
-#endif
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert the X,Y coords of a triangle to the requested Fixed 
-/// Point precision from FP32.
-template <typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
-{
-    simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
-    return _simd_cvtps_epi32(vFixed);
-}
-
-#if USE_SIMD16_FRONTEND
-template <typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE simd16scalari fpToFixedPointVertical(const simd16scalar vIn)
-{
-    simd16scalar vFixed = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value));
-    return _simd16_cvtps_epi32(vFixed);
-}
-
-#endif
-//////////////////////////////////////////////////////////////////////////
-/// @brief Helper function to set the X,Y coords of a triangle to the 
-/// requested Fixed Point precision from FP32.
-/// @param tri: simdvector[3] of FP triangle verts
-/// @param vXi: fixed point X coords of tri verts
-/// @param vYi: fixed point Y coords of tri verts
-INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3])
-{
-    vXi[0] = fpToFixedPointVertical(tri[0].x);
-    vYi[0] = fpToFixedPointVertical(tri[0].y);
-    vXi[1] = fpToFixedPointVertical(tri[1].x);
-    vYi[1] = fpToFixedPointVertical(tri[1].y);
-    vXi[2] = fpToFixedPointVertical(tri[2].x);
-    vYi[2] = fpToFixedPointVertical(tri[2].y);
-}
-
-#if USE_SIMD16_FRONTEND
-INLINE static void FPToFixedPoint(const simd16vector * const tri, simd16scalari(&vXi)[3], simd16scalari(&vYi)[3])
-{
-    vXi[0] = fpToFixedPointVertical(tri[0].x);
-    vYi[0] = fpToFixedPointVertical(tri[0].y);
-    vXi[1] = fpToFixedPointVertical(tri[1].x);
-    vYi[1] = fpToFixedPointVertical(tri[1].y);
-    vXi[2] = fpToFixedPointVertical(tri[2].x);
-    vYi[2] = fpToFixedPointVertical(tri[2].y);
-}
-
-#endif
-//////////////////////////////////////////////////////////////////////////
-/// @brief Calculate bounding box for current triangle
-/// @tparam CT: ConservativeRastFETraits type
-/// @param vX: fixed point X position for triangle verts
-/// @param vY: fixed point Y position for triangle verts
-/// @param bbox: fixed point bbox
-/// *Note*: expects vX, vY to be in the correct precision for the type 
-/// of rasterization. This avoids unnecessary FP->fixed conversions.
-template <typename CT>
-INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
-{
-    simdscalari vMinX = vX[0];
-    vMinX = _simd_min_epi32(vMinX, vX[1]);
-    vMinX = _simd_min_epi32(vMinX, vX[2]);
-
-    simdscalari vMaxX = vX[0];
-    vMaxX = _simd_max_epi32(vMaxX, vX[1]);
-    vMaxX = _simd_max_epi32(vMaxX, vX[2]);
-
-    simdscalari vMinY = vY[0];
-    vMinY = _simd_min_epi32(vMinY, vY[1]);
-    vMinY = _simd_min_epi32(vMinY, vY[2]);
-
-    simdscalari vMaxY = vY[0];
-    vMaxY = _simd_max_epi32(vMaxY, vY[1]);
-    vMaxY = _simd_max_epi32(vMaxY, vY[2]);
-
-    bbox.xmin = vMinX;
-    bbox.xmax = vMaxX;
-    bbox.ymin = vMinY;
-    bbox.ymax = vMaxY;
-}
-
-#if USE_SIMD16_FRONTEND
-template <typename CT>
-INLINE void calcBoundingBoxIntVertical(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox)
-{
-    simd16scalari vMinX = vX[0];
-
-    vMinX = _simd16_min_epi32(vMinX, vX[1]);
-    vMinX = _simd16_min_epi32(vMinX, vX[2]);
-
-    simd16scalari vMaxX = vX[0];
-
-    vMaxX = _simd16_max_epi32(vMaxX, vX[1]);
-    vMaxX = _simd16_max_epi32(vMaxX, vX[2]);
-
-    simd16scalari vMinY = vY[0];
-
-    vMinY = _simd16_min_epi32(vMinY, vY[1]);
-    vMinY = _simd16_min_epi32(vMinY, vY[2]);
-
-    simd16scalari vMaxY = vY[0];
-
-    vMaxY = _simd16_max_epi32(vMaxY, vY[1]);
-    vMaxY = _simd16_max_epi32(vMaxY, vY[2]);
-
-    bbox.xmin = vMinX;
-    bbox.xmax = vMaxX;
-    bbox.ymin = vMinY;
-    bbox.ymax = vMaxY;
-}
-
-#endif
-//////////////////////////////////////////////////////////////////////////
-/// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
-/// Offsets BBox for conservative rast
-template <>
-INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
-{
-    // FE conservative rast traits
-    typedef FEConservativeRastT CT;
-
-    simdscalari vMinX = vX[0];
-    vMinX = _simd_min_epi32(vMinX, vX[1]);
-    vMinX = _simd_min_epi32(vMinX, vX[2]);
-
-    simdscalari vMaxX = vX[0];
-    vMaxX = _simd_max_epi32(vMaxX, vX[1]);
-    vMaxX = _simd_max_epi32(vMaxX, vX[2]);
-
-    simdscalari vMinY = vY[0];
-    vMinY = _simd_min_epi32(vMinY, vY[1]);
-    vMinY = _simd_min_epi32(vMinY, vY[2]);
-
-    simdscalari vMaxY = vY[0];
-    vMaxY = _simd_max_epi32(vMaxY, vY[1]);
-    vMaxY = _simd_max_epi32(vMaxY, vY[2]);
-
-    /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
-    /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
-    bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
-    bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
-    bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
-    bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
-}
-
-//////////////////////////////////////////////////////////////////////////
 /// @brief Processes attributes for the backend based on linkage mask and
 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
 /// @param pDC - Draw context
@@ -238,15 +80,15 @@
         if (IsSwizzledT::value)
         {
             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
-            inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
+            inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
 
         }
         else
         {
-            inputSlot = VERTEX_ATTRIB_START_SLOT + i;
+            inputSlot = backendState.vertexAttribOffset + i;
         }
 
-        __m128 attrib[3];    // triangle attribs (always 4 wide)
+        simd4scalar attrib[3];    // triangle attribs (always 4 wide)
         float* pAttribStart = pBuffer;
 
         if (HasConstantInterpT::value || IsDegenerate::value)
@@ -286,7 +128,7 @@
 
                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
                 {
-                    _mm_store_ps(pBuffer, attrib[vid]);
+                    SIMD128::store_ps(pBuffer, attrib[vid]);
                     pBuffer += 4;
                 }
             }
@@ -296,7 +138,7 @@
 
                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
                 {
-                    _mm_store_ps(pBuffer, attrib[i]);
+                    SIMD128::store_ps(pBuffer, attrib[i]);
                     pBuffer += 4;
                 }
             }
@@ -307,7 +149,7 @@
 
             for (uint32_t i = 0; i < NumVertsT::value; ++i)
             {
-                _mm_store_ps(pBuffer, attrib[i]);
+                SIMD128::store_ps(pBuffer, attrib[i]);
                 pBuffer += 4;
             }
         }
@@ -318,7 +160,7 @@
         // effect of the missing vertices in the triangle interpolation.
         for (uint32_t v = NumVertsT::value; v < 3; ++v)
         {
-            _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
+            SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
             pBuffer += 4;
         }
 
@@ -437,8 +279,7 @@
 {
     static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
         simd16scalari &scisXmin, simd16scalari &scisYmin,
-        simd16scalari &scisXmax, simd16scalari &scisYmax)
-    {
+        simd16scalari &scisXmax, simd16scalari &scisYmax) {
         scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
             pScissorsInFixedPoint[pViewportIndex[1]].xmin,
             pScissorsInFixedPoint[pViewportIndex[2]].xmin,
@@ -548,14 +389,14 @@
         uint32_t clipAttribSlot = clipSlot == 0 ?
             VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
 
-        __m128 primClipDist[3];
+        simd4scalar primClipDist[3];
         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
 
         float vertClipDist[NumVerts];
         for (uint32_t e = 0; e < NumVerts; ++e)
         {
             OSALIGNSIMD(float) aVertClipDist[4];
-            _mm_store_ps(aVertClipDist, primClipDist[e]);
+            SIMD128::store_ps(aVertClipDist, primClipDist[e]);
             vertClipDist[e] = aVertClipDist[clipComp];
         };
 
@@ -592,8 +433,7 @@
     uint32_t workerId,
     simdvector tri[3],
     uint32_t triMask,
-    simdscalari primID,
-    simdscalari viewportIdx)
+    simdscalari primID)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -602,13 +442,27 @@
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
     const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_GS_STATE& gsState = state.gsState;
     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 
     simdscalar vRecipW0 = _simd_set1_ps(1.0f);
     simdscalar vRecipW1 = _simd_set1_ps(1.0f);
     simdscalar vRecipW2 = _simd_set1_ps(1.0f);
 
+    // Read viewport array index if needed
+    simdscalari viewportIdx = _simd_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simdvector vpiAttrib[3];
+        pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
+
+        // OOB indices => forced to zero.
+        simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
+        simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd_and_si(vClearMask, vpai);
+    }
+
     if (feState.vpTransformDisable)
     {
         // RHW is passed in directly when VP transform is disabled
@@ -636,7 +490,7 @@
         tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
 
         // Viewport transform to screen space coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
         }
@@ -719,34 +573,6 @@
         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
     }
 
-    // Simple non-conformant wireframe mode, useful for debugging
-    if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
-    {
-        // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
-        simdvector line[2];
-        simdscalar recipW[2];
-        line[0] = tri[0];
-        line[1] = tri[1];
-        recipW[0] = vRecipW0;
-        recipW[1] = vRecipW1;
-        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
-
-        line[0] = tri[1];
-        line[1] = tri[2];
-        recipW[0] = vRecipW1;
-        recipW[1] = vRecipW2;
-        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
-
-        line[0] = tri[2];
-        line[1] = tri[0];
-        recipW[0] = vRecipW2;
-        recipW[1] = vRecipW0;
-        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
-
-        AR_END(FEBinTriangles, 1);
-        return;
-    }
-
     /// Note: these variable initializations must stay above any 'goto endBenTriangles'
     // compute per tri backface
     uint32_t frontFaceMask = frontWindingTris;
@@ -798,13 +624,14 @@
             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
     }
 
+    simdBBox bbox;
+
     if (!triMask)
     {
         goto endBinTriangles;
     }
 
     // Calc bounding box of triangles
-    simdBBox bbox;
     calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
 
     // determine if triangle falls between pixel centers and discard
@@ -846,24 +673,30 @@
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
     // Gather the AOS effective scissor rects based on the per-prim VP index.
     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-    if (state.gsState.emitsViewportArrayIndex)
     {
-        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
-    {
-        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
-    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+        // Make triangle bbox inclusive
+        bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
+        bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
+
+        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
+        bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+    }
 
     if (CT::IsConservativeT::value)
     {
@@ -884,9 +717,43 @@
         triMask = triMask & ~maskOutsideScissor;
     }
 
-    if (!triMask)
+endBinTriangles:
+
+    // Send surviving triangles to the line or point binner based on fill mode
+    if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
     {
-        goto endBinTriangles;
+        // Simple non-conformant wireframe mode, useful for debugging.
+        // Construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
+        simdvector line[2];
+        simdscalar recipW[2];
+        line[0] = tri[0];
+        line[1] = tri[1];
+        recipW[0] = vRecipW0;
+        recipW[1] = vRecipW1;
+        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+        line[0] = tri[1];
+        line[1] = tri[2];
+        recipW[0] = vRecipW1;
+        recipW[1] = vRecipW2;
+        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+        line[0] = tri[2];
+        line[1] = tri[0];
+        recipW[0] = vRecipW2;
+        recipW[1] = vRecipW0;
+        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+        AR_END(FEBinTriangles, 1);
+        return;
+    }
+    else if (rastState.fillMode == SWR_FILLMODE_POINT)
+    {
+        // Bin 3 points
+        BinPostSetupPoints(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
+        BinPostSetupPoints(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
+        BinPostSetupPoints(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
+        return;
     }
 
     // Convert triangle bbox to macrotile units.
@@ -903,7 +770,7 @@
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+    simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
     vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
     vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
     vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
@@ -911,12 +778,12 @@
 
     // store render target array index
     OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
-    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+    if (state.backendState.readRenderTargetArrayIndex)
     {
         simdvector vRtai[3];
-        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
+        pa.Assemble(VERTEX_SGV_SLOT, vRtai);
         simdscalari vRtaii;
-        vRtaii = _simd_castps_si(vRtai[0].x);
+        vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
         _simd_store_si((simdscalari*)aRTAI, vRtaii);
     }
     else
@@ -924,8 +791,6 @@
         _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
     }
 
-endBinTriangles:
-
     // scan remaining valid triangles and bin each separately
     while (_BitScanForward(&triIndex, triMask))
     {
@@ -959,7 +824,6 @@
         TRIANGLE_WORK_DESC &desc = work.desc.tri;
 
         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
-        desc.triFlags.primID = pPrimID[triIndex];
         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
         desc.triFlags.viewportIndex = pViewportIndex[triIndex];
 
@@ -975,10 +839,10 @@
         // store triangle vertex data
         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
 
-        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
 
         // store user clip distances
         if (rastState.clipDistanceMask)
@@ -1008,14 +872,13 @@
 
 #if USE_SIMD16_FRONTEND
 template <typename CT>
-void BinTriangles_simd16(
+void SIMDCALL BinTriangles_simd16(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,
     simd16vector tri[3],
     uint32_t triMask,
-    simd16scalari primID,
-    simd16scalari viewportIdx)
+    simd16scalari primID)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -1024,13 +887,26 @@
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
     const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_GS_STATE& gsState = state.gsState;
 
     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 
     simd16scalar vRecipW0 = _simd16_set1_ps(1.0f);
     simd16scalar vRecipW1 = _simd16_set1_ps(1.0f);
     simd16scalar vRecipW2 = _simd16_set1_ps(1.0f);
+    
+    simd16scalari viewportIdx = _simd16_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simd16vector vpiAttrib[3];
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+
+        // OOB indices => forced to zero.
+        simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
+        simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd16_and_si(vClearMask, vpai);
+    }
 
     if (feState.vpTransformDisable)
     {
@@ -1059,7 +935,7 @@
         tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2);
 
         // Viewport transform to screen space coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
         }
@@ -1144,34 +1020,6 @@
         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
     }
 
-    // Simple non-conformant wireframe mode, useful for debugging
-    if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
-    {
-        // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
-        simd16vector line[2];
-        simd16scalar recipW[2];
-        line[0] = tri[0];
-        line[1] = tri[1];
-        recipW[0] = vRecipW0;
-        recipW[1] = vRecipW1;
-        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
-
-        line[0] = tri[1];
-        line[1] = tri[2];
-        recipW[0] = vRecipW1;
-        recipW[1] = vRecipW2;
-        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
-
-        line[0] = tri[2];
-        line[1] = tri[0];
-        recipW[0] = vRecipW2;
-        recipW[1] = vRecipW0;
-        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
-
-        AR_END(FEBinTriangles, 1);
-        return;
-    }
-
     /// Note: these variable initializations must stay above any 'goto endBenTriangles'
     // compute per tri backface
     uint32_t frontFaceMask = frontWindingTris;
@@ -1228,13 +1076,14 @@
             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
     }
 
+    simd16BBox bbox;
+
     if (!triMask)
     {
         goto endBinTriangles;
     }
 
     // Calc bounding box of triangles
-    simd16BBox bbox;
     calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
 
     // determine if triangle falls between pixel centers and discard
@@ -1278,25 +1127,31 @@
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
     // Gather the AOS effective scissor rects based on the per-prim VP index.
     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
-    if (state.gsState.emitsViewportArrayIndex)
     {
-        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
-    {
-        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
 
-    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
-    bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
+
+        // Make triangle bbox inclusive
+        bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
+        bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
+
+        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
+        bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+    }
 
     if (CT::IsConservativeT::value)
     {
@@ -1317,9 +1172,43 @@
         triMask = triMask & ~maskOutsideScissor;
     }
 
-    if (!triMask)
+endBinTriangles:
+
+    // Send surviving triangles to the line or point binner based on fill mode
+    if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
     {
-        goto endBinTriangles;
+        // Simple non-conformant wireframe mode, useful for debugging
+        // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
+        simd16vector line[2];
+        simd16scalar recipW[2];
+        line[0] = tri[0];
+        line[1] = tri[1];
+        recipW[0] = vRecipW0;
+        recipW[1] = vRecipW1;
+        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+        line[0] = tri[1];
+        line[1] = tri[2];
+        recipW[0] = vRecipW1;
+        recipW[1] = vRecipW2;
+        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+        line[0] = tri[2];
+        line[1] = tri[0];
+        recipW[0] = vRecipW2;
+        recipW[1] = vRecipW0;
+        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+        AR_END(FEBinTriangles, 1);
+        return;
+    }
+    else if (rastState.fillMode == SWR_FILLMODE_POINT)
+    {
+        // Bin 3 points
+        BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
+        BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
+        BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
+        return;
     }
 
     // Convert triangle bbox to macrotile units.
@@ -1330,17 +1219,17 @@
 
     OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
 
-    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft),    bbox.xmin);
-    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight),   bbox.xmax);
-    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop),     bbox.ymin);
-    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom),  bbox.ymax);
+    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
+    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
+    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
+    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
 
     vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
     vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
@@ -1354,12 +1243,12 @@
 
     // store render target array index
     OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
-    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+    if (state.backendState.readRenderTargetArrayIndex)
     {
         simd16vector vRtai[3];
-        pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
         simd16scalari vRtaii;
-        vRtaii = _simd16_castps_si(vRtai[0].x);
+        vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
         _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
     }
     else
@@ -1367,8 +1256,6 @@
         _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
     }
 
-endBinTriangles:
-
 
     // scan remaining valid triangles and bin each separately
     while (_BitScanForward(&triIndex, triMask))
@@ -1403,7 +1290,6 @@
         TRIANGLE_WORK_DESC &desc = work.desc.tri;
 
         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
-        desc.triFlags.primID = pPrimID[triIndex];
         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
         desc.triFlags.viewportIndex = pViewportIndex[triIndex];
 
@@ -1494,18 +1380,11 @@
 
 #endif
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD points to the backend.  Only supports point size of 1
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains point position data for SIMDs worth of points.
-/// @param primID - Primitive ID for each point.
-void BinPoints(
+void BinPostSetupPoints(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,
-    simdvector prim[3],
+    simdvector prim[],
     uint32_t primMask,
     simdscalari primID,
     simdscalari viewportIdx)
@@ -1517,8 +1396,6 @@
     simdvector& primVerts = prim[0];
 
     const API_STATE& state = GetApiState(pDC);
-    const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_GS_STATE& gsState = state.gsState;
     const SWR_RASTSTATE& rastState = state.rastState;
     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 
@@ -1526,30 +1403,6 @@
     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
 
-    if (!feState.vpTransformDisable)
-    {
-        // perspective divide
-        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
-        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
-        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
-        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
-
-        // viewport transform to screen coords
-        if (state.gsState.emitsViewportArrayIndex)
-        {
-            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
-        }
-        else
-        {
-            viewportTransform<1>(&primVerts, state.vpMatrices);
-        }
-    }
-
-    // adjust for pixel center location
-    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
-    primVerts.x = _simd_add_ps(primVerts.x, offset);
-    primVerts.y = _simd_add_ps(primVerts.y, offset);
-
     // convert to fixed point
     simdscalari vXi, vYi;
     vXi = fpToFixedPointVertical(primVerts.x);
@@ -1599,11 +1452,11 @@
 
         // store render target array index
         OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
-        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+        if (state.backendState.readRenderTargetArrayIndex)
         {
             simdvector vRtai;
-            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
-            simdscalari vRtaii = _simd_castps_si(vRtai.x);
+            pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
+            simdscalari vRtaii = _simd_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
             _simd_store_si((simdscalari*)aRTAI, vRtaii);
         }
         else
@@ -1629,7 +1482,6 @@
 
             // points are always front facing
             desc.triFlags.frontFacing = 1;
-            desc.triFlags.primID = pPrimID[primIndex];
             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
 
@@ -1677,8 +1529,8 @@
         if (rastState.pointParam)
         {
             simdvector size[3];
-            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
-            vPointSize = size[0].x;
+            pa.Assemble(VERTEX_SGV_SLOT, size);
+            vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
         }
         else
         {
@@ -1700,24 +1552,26 @@
         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
         // Gather the AOS effective scissor rects based on the per-prim VP index.
         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-        if (state.gsState.emitsViewportArrayIndex)
         {
-            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-                scisXmin, scisYmin, scisXmax, scisYmax);
-        }
-        else // broadcast fast path for non-VPAI case.
-        {
-            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-        }
+            simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+            if (state.backendState.readViewportArrayIndex)
+            {
+                GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                    scisXmin, scisYmin, scisXmax, scisYmax);
+            }
+            else // broadcast fast path for non-VPAI case.
+            {
+                scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+                scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+                scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+                scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+            }
 
-        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
-        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
-        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
-        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+            bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+            bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+            bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+            bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+        }
 
         // Cull bloated points completely outside scissor
         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -1740,11 +1594,11 @@
 
         // store render target array index
         OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
-        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+        if (state.backendState.readRenderTargetArrayIndex)
         {
             simdvector vRtai[2];
-            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
-            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
+            pa.Assemble(VERTEX_SGV_SLOT, vRtai);
+            simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
             _simd_store_si((simdscalari*)aRTAI, vRtaii);
         }
         else
@@ -1779,7 +1633,6 @@
             TRIANGLE_WORK_DESC &desc = work.desc.tri;
 
             desc.triFlags.frontFacing = 1;
-            desc.triFlags.primID = pPrimID[primIndex];
             desc.triFlags.pointSize = aPointSize[primIndex];
             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
@@ -1837,12 +1690,82 @@
     AR_END(FEBinPoints, 1);
 }
 
-#if USE_SIMD16_FRONTEND
-void BinPoints_simd16(
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin SIMD points to the backend.  Only supports point size of 1
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Every thread has a unique id.
+/// @param prim - Contains point position data for a SIMD's worth of points.
+/// @param primID - Primitive ID for each point.
+void BinPoints(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,
-    simd16vector prim[3],
+    simdvector prim[3],
+    uint32_t primMask,
+    simdscalari primID)
+{
+    simdvector& primVerts = prim[0];
+
+    const API_STATE& state = GetApiState(pDC);
+    const SWR_FRONTEND_STATE& feState = state.frontendState;
+    const SWR_RASTSTATE& rastState = state.rastState;
+
+    // Read back viewport index if required
+    simdscalari viewportIdx = _simd_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simdvector vpiAttrib[1];
+        pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
+        simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+        // OOB indices => forced to zero.
+        vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
+        simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd_and_si(vClearMask, vpai);
+    }
+
+    if (!feState.vpTransformDisable)
+    {
+        // perspective divide
+        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
+        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
+        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
+        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
+
+        // viewport transform to screen coords
+        if (state.backendState.readViewportArrayIndex)
+        {
+            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
+        }
+        else
+        {
+            viewportTransform<1>(&primVerts, state.vpMatrices);
+        }
+    }
+
+    // adjust for pixel center location
+    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
+    primVerts.x = _simd_add_ps(primVerts.x, offset);
+    primVerts.y = _simd_add_ps(primVerts.y, offset);
+
+    BinPostSetupPoints(
+        pDC,
+        pa,
+        workerId,
+        prim,
+        primMask,
+        primID,
+        viewportIdx);
+}
+
+#if USE_SIMD16_FRONTEND
+void BinPostSetupPoints_simd16(
+    DRAW_CONTEXT *pDC,
+    PA_STATE& pa,
+    uint32_t workerId,
+    simd16vector prim[],
     uint32_t primMask,
     simd16scalari primID,
     simd16scalari viewportIdx)
@@ -1854,8 +1777,6 @@
     simd16vector& primVerts = prim[0];
 
     const API_STATE& state = GetApiState(pDC);
-    const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_GS_STATE& gsState = state.gsState;
     const SWR_RASTSTATE& rastState = state.rastState;
     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 
@@ -1863,31 +1784,6 @@
     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
 
-    if (!feState.vpTransformDisable)
-    {
-        // perspective divide
-        simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w);
-
-        primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0);
-        primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0);
-        primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0);
-
-        // viewport transform to screen coords
-        if (state.gsState.emitsViewportArrayIndex)
-        {
-            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
-        }
-        else
-        {
-            viewportTransform<1>(&primVerts, state.vpMatrices);
-        }
-    }
-
-    const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
-
-    primVerts.x = _simd16_add_ps(primVerts.x, offset);
-    primVerts.y = _simd16_add_ps(primVerts.y, offset);
-
     // convert to fixed point
     simd16scalari vXi, vYi;
 
@@ -1941,11 +1837,11 @@
 
         // store render target array index
         OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
-        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+        if (state.backendState.readRenderTargetArrayIndex)
         {
             simd16vector vRtai;
-            pa.Assemble_simd16(VERTEX_RTAI_SLOT, &vRtai);
-            simd16scalari vRtaii = _simd16_castps_si(vRtai.x);
+            pa.Assemble_simd16(VERTEX_SGV_SLOT, &vRtai);
+            simd16scalari vRtaii = _simd16_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
             _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
         }
         else
@@ -1971,7 +1867,6 @@
 
             // points are always front facing
             desc.triFlags.frontFacing = 1;
-            desc.triFlags.primID = pPrimID[primIndex];
             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
 
@@ -2021,8 +1916,8 @@
         if (rastState.pointParam)
         {
             simd16vector size[3];
-            pa.Assemble_simd16(VERTEX_POINT_SIZE_SLOT, size);
-            vPointSize = size[0].x;
+            pa.Assemble_simd16(VERTEX_SGV_SLOT, size);
+            vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
         }
         else
         {
@@ -2046,24 +1941,26 @@
         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
         // Gather the AOS effective scissor rects based on the per-prim VP index.
         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-        if (state.gsState.emitsViewportArrayIndex)
         {
-            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-                scisXmin, scisYmin, scisXmax, scisYmax);
-        }
-        else // broadcast fast path for non-VPAI case.
-        {
-            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-        }
+            simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+            if (state.backendState.readViewportArrayIndex)
+            {
+                GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                    scisXmin, scisYmin, scisXmax, scisYmax);
+            }
+            else // broadcast fast path for non-VPAI case.
+            {
+                scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+                scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+                scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+                scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+            }
 
-        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
-        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-        bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
-        bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+            bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+            bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+            bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+            bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+        }
 
         // Cull bloated points completely outside scissor
         simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -2087,11 +1984,11 @@
 
         // store render target array index
         OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
-        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+        if (state.backendState.readRenderTargetArrayIndex)
         {
             simd16vector vRtai[2];
-            pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
-            simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
+            pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
+            simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
             _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
         }
         else
@@ -2126,7 +2023,6 @@
             TRIANGLE_WORK_DESC &desc = work.desc.tri;
 
             desc.triFlags.frontFacing = 1;
-            desc.triFlags.primID = pPrimID[primIndex];
             desc.triFlags.pointSize = aPointSize[primIndex];
             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
@@ -2184,6 +2080,70 @@
     AR_END(FEBinPoints, 1);
 }
 
+void SIMDCALL BinPoints_simd16(     // SIMD16 point binning entry: viewport index readback, viewport transform, pixel offset, then BinPostSetupPoints_simd16
+    DRAW_CONTEXT *pDC,              // draw context; the API state is fetched from it through GetApiState
+    PA_STATE& pa,                   // primitive assembler; used here to assemble the SGV slot
+    uint32_t workerId,              // forwarded unchanged to BinPostSetupPoints_simd16
+    simd16vector prim[3],           // prim[0] is transformed in place, then the array is forwarded
+    uint32_t primMask,              // forwarded unchanged
+    simd16scalari primID)           // forwarded unchanged
+{
+    simd16vector& primVerts = prim[0];
+
+    const API_STATE& state = GetApiState(pDC);
+    const SWR_FRONTEND_STATE& feState = state.frontendState;
+    const SWR_RASTSTATE& rastState = state.rastState;
+
+    // Read back viewport index if required; every lane stays at index 0 otherwise
+    simd16scalari viewportIdx = _simd16_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simd16vector vpiAttrib[1];
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+
+        // OOB indices => forced to zero: negatives are clamped first, then indices >= KNOB_NUM_VIEWPORTS_SCISSORS are masked off.
+        simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);   // reinterpret the attribute bits as integers
+        vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);                        // negative index -> 0
+        simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);         // mask of lanes with vpai < viewport count
+        viewportIdx = _simd16_and_si(vClearMask, vpai);                              // lanes outside the mask become 0
+    }
+
+    if (!feState.vpTransformDisable)    // both the divide and the viewport transform are skipped when the flag is set
+    {
+        // perspective divide: x, y and z are scaled by 1/w; w itself is left as is
+        simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w);
+
+        primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0);
+        primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0);
+        primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0);
+
+        // viewport transform to screen coords; the per-lane index overload is only used when indices were read back
+        if (state.backendState.readViewportArrayIndex)
+        {
+            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
+        }
+        else
+        {
+            viewportTransform<1>(&primVerts, state.vpMatrices);
+        }
+    }
+
+    const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];    // offset selected by the raster state pixel location
+
+    primVerts.x = _simd16_add_ps(primVerts.x, offset);    // applied to x and y only, whether or not the transform above ran
+    primVerts.y = _simd16_add_ps(primVerts.y, offset);
+
+    BinPostSetupPoints_simd16(      // remaining binning work happens in the callee
+        pDC,
+        pa,
+        workerId,
+        prim,
+        primMask,
+        primID,
+        viewportIdx);
+}
+
 #endif
 //////////////////////////////////////////////////////////////////////////
 /// @brief Bin SIMD lines to the backend.
@@ -2209,7 +2169,6 @@
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
-    const SWR_GS_STATE& gsState = state.gsState;
 
     // Select attribute processor
     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
@@ -2218,6 +2177,8 @@
     simdscalar& vRecipW0 = recipW[0];
     simdscalar& vRecipW1 = recipW[1];
 
+    simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+
     // convert to fixed point
     simdscalari vXi[2], vYi[2];
     vXi[0] = fpToFixedPointVertical(prim[0].x);
@@ -2264,24 +2225,26 @@
     bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
 
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-    if (state.gsState.emitsViewportArrayIndex)
     {
-        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
-    {
-        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
-    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+    }
 
     // Cull prims completely outside scissor
     {
@@ -2311,7 +2274,6 @@
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
     vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
     vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
     vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
@@ -2319,11 +2281,11 @@
 
     // store render target array index
     OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
-    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+    if (state.backendState.readRenderTargetArrayIndex)
     {
         simdvector vRtai[2];
-        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
-        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
+        pa.Assemble(VERTEX_SGV_SLOT, vRtai);
+        simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
         _simd_store_si((simdscalari*)aRTAI, vRtaii);
     }
     else
@@ -2344,7 +2306,6 @@
         TRIANGLE_WORK_DESC &desc = work.desc.tri;
 
         desc.triFlags.frontFacing = 1;
-        desc.triFlags.primID = pPrimID[primIndex];
         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
         desc.triFlags.viewportIndex = pViewportIndex[primIndex];
@@ -2361,10 +2322,10 @@
 
         // store line vertex data
         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
 
         // store user clip distances
         if (rastState.clipDistanceMask)
@@ -2413,8 +2374,6 @@
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
-    const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_GS_STATE& gsState = state.gsState;
 
     // Select attribute processor
     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
@@ -2470,25 +2429,27 @@
     bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
 
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
-    if (state.gsState.emitsViewportArrayIndex)
     {
-        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
-    {
-        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
 
-    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
-    bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
+
+        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+        bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+    }
 
     // Cull prims completely outside scissor
     {
@@ -2499,6 +2460,15 @@
         primMask = primMask & ~maskOutsideScissor;
     }
 
+    const simdscalar unused = _simd_setzero_ps();
+
+    // transpose verts needed for backend
+    /// @todo modify BE to take non-transformed verts
+    simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+
     if (!primMask)
     {
         goto endBinLines;
@@ -2517,15 +2487,6 @@
     _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop),     bbox.ymin);
     _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom),  bbox.ymax);
 
-    // transpose verts needed for backend
-    /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-
-    const simdscalar unused = _simd_setzero_ps();
-
     vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
     vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
     vTranspose3x8(vHorizZ[0], _simd16_extract_ps(prim[0].z, 0), _simd16_extract_ps(prim[1].z, 0), unused);
@@ -2538,11 +2499,11 @@
 
     // store render target array index
     OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
-    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+    if (state.backendState.readRenderTargetArrayIndex)
     {
         simd16vector vRtai[2];
-        pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
-        simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
+        simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
         _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
     }
     else
@@ -2563,7 +2524,6 @@
         TRIANGLE_WORK_DESC &desc = work.desc.tri;
 
         desc.triFlags.frontFacing = 1;
-        desc.triFlags.primID = pPrimID[primIndex];
         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
         desc.triFlags.viewportIndex = pViewportIndex[primIndex];
@@ -2636,8 +2596,7 @@
     uint32_t workerId,
     simdvector prim[],
     uint32_t primMask,
-    simdscalari primID,
-    simdscalari viewportIdx)
+    simdscalari primID)
 {
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -2645,6 +2604,20 @@
 
     simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
 
+    simdscalari viewportIdx = _simd_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simdvector vpiAttrib[2];
+        pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
+        simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
+
+        // OOB indices => forced to zero.
+        simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd_and_si(vClearMask, vpai);
+    }
+
     if (!feState.vpTransformDisable)
     {
         // perspective divide
@@ -2661,7 +2634,7 @@
         prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
 
         // viewport transform to screen coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
         }
@@ -2691,28 +2664,34 @@
 }
 
 #if USE_SIMD16_FRONTEND
-void BinLines_simd16(
+void SIMDCALL BinLines_simd16(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,
     simd16vector prim[3],
     uint32_t primMask,
-    simd16scalari primID,
-    simd16scalari viewportIdx)
+    simd16scalari primID)
 {
-    SWR_CONTEXT *pContext = pDC->pContext;
-
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
     const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_GS_STATE& gsState = state.gsState;
-
-    // Select attribute processor
-    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
-        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
 
     simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) };
 
+    simd16scalari viewportIdx = _simd16_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simd16vector vpiAttrib[2];
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+
+        // OOB indices => forced to zero.
+        simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
+        simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd16_and_si(vClearMask, vpai);
+    }
+
     if (!feState.vpTransformDisable)
     {
         // perspective divide
@@ -2729,7 +2708,7 @@
         prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]);
 
         // viewport transform to screen coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h
new file mode 100644
index 0000000..875e0b7
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.h
@@ -0,0 +1,224 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file binner.h
+*
+* @brief Declaration for the macrotile binner
+*
+******************************************************************************/
+#pragma once
+#include "state.h"
+#include "conservativeRast.h"
+#include "utils.h"
+//////////////////////////////////////////////////////////////////////////
+/// @brief Offsets added to post-viewport vertex positions based on
+/// raster state. One broadcast value per SWR_PIXEL_LOCATION entry; the table is sized to cover indices up to SWR_PIXEL_LOCATION_UL.
+static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
+{
+    _simd_set1_ps(0.0f),    // SWR_PIXEL_LOCATION_CENTER: positions are left unshifted
+    _simd_set1_ps(0.5f),    // SWR_PIXEL_LOCATION_UL: positions are shifted by half a pixel
+};
+
+#if USE_SIMD16_FRONTEND
+static const simd16scalar g_pixelOffsets_simd16[SWR_PIXEL_LOCATION_UL + 1] =   // 16-wide copy of the same two values, looked up with rastState.pixelLocation
+{
+    _simd16_set1_ps(0.0f),  // SWR_PIXEL_LOCATION_CENTER
+    _simd16_set1_ps(0.5f),  // SWR_PIXEL_LOCATION_UL
+};
+
+#endif
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert one SIMD register of FP32 coordinates to the fixed point format described by PT
+/// (FixedPointTraits<Fixed_16_8> unless overridden): each lane is scaled by PT::ScaleT::value, then converted to a 32-bit integer.
+template <typename PT = FixedPointTraits<Fixed_16_8>>
+INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
+{
+    simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));    // move the fractional bits above the binary point
+    return _simd_cvtps_epi32(vFixed);                                          // float -> int32 per lane
+}
+
+#if USE_SIMD16_FRONTEND
+template <typename PT = FixedPointTraits<Fixed_16_8>>    // SIMD16 overload: same scale-then-convert steps on a simd16scalar
+INLINE simd16scalari fpToFixedPointVertical(const simd16scalar vIn)
+{
+    simd16scalar vFixed = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value));    // scale each lane by PT::ScaleT::value
+    return _simd16_cvtps_epi32(vFixed);                                              // float -> int32 per lane
+}
+
+#endif
+//////////////////////////////////////////////////////////////////////////
+/// @brief Helper function to convert the X,Y coords of the three triangle verts from FP32
+/// to fixed point, using fpToFixedPointVertical with its default traits (FixedPointTraits<Fixed_16_8>).
+/// @param tri: simdvector[3] of FP triangle verts (only .x and .y are read)
+/// @param vXi: (output) fixed point X coords of tri verts
+/// @param vYi: (output) fixed point Y coords of tri verts
+INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3])
+{
+    vXi[0] = fpToFixedPointVertical(tri[0].x);
+    vYi[0] = fpToFixedPointVertical(tri[0].y);
+    vXi[1] = fpToFixedPointVertical(tri[1].x);
+    vYi[1] = fpToFixedPointVertical(tri[1].y);
+    vXi[2] = fpToFixedPointVertical(tri[2].x);
+    vYi[2] = fpToFixedPointVertical(tri[2].y);
+}
+
+#if USE_SIMD16_FRONTEND
+INLINE static void FPToFixedPoint(const simd16vector * const tri, simd16scalari(&vXi)[3], simd16scalari(&vYi)[3])    // SIMD16 overload; vXi and vYi are outputs
+{
+    vXi[0] = fpToFixedPointVertical(tri[0].x);    // default traits apply: FixedPointTraits<Fixed_16_8>
+    vYi[0] = fpToFixedPointVertical(tri[0].y);
+    vXi[1] = fpToFixedPointVertical(tri[1].x);
+    vYi[1] = fpToFixedPointVertical(tri[1].y);
+    vXi[2] = fpToFixedPointVertical(tri[2].x);
+    vYi[2] = fpToFixedPointVertical(tri[2].y);
+}
+
+#endif
+//////////////////////////////////////////////////////////////////////////
+/// @brief Calculate bounding box for current triangle (tri itself is not read here; only vX and vY are)
+/// @tparam CT: ConservativeRastFETraits type (not referenced by this generic version)
+/// @param vX: fixed point X position for triangle verts
+/// @param vY: fixed point Y position for triangle verts
+/// @param bbox: (output) fixed point bbox, the per-lane min and max of the three verts
+/// *Note*: expects vX, vY to be in the correct precision for the type
+/// of rasterization. This avoids unnecessary FP->fixed conversions.
+template <typename CT>
+INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
+{
+    simdscalari vMinX = vX[0];
+    vMinX = _simd_min_epi32(vMinX, vX[1]);
+    vMinX = _simd_min_epi32(vMinX, vX[2]);
+
+    simdscalari vMaxX = vX[0];
+    vMaxX = _simd_max_epi32(vMaxX, vX[1]);
+    vMaxX = _simd_max_epi32(vMaxX, vX[2]);
+
+    simdscalari vMinY = vY[0];
+    vMinY = _simd_min_epi32(vMinY, vY[1]);
+    vMinY = _simd_min_epi32(vMinY, vY[2]);
+
+    simdscalari vMaxY = vY[0];
+    vMaxY = _simd_max_epi32(vMaxY, vY[1]);
+    vMaxY = _simd_max_epi32(vMaxY, vY[2]);
+
+    bbox.xmin = vMinX;    // stored without any expansion
+    bbox.xmax = vMaxX;
+    bbox.ymin = vMinY;
+    bbox.ymax = vMaxY;
+}
+
+#if USE_SIMD16_FRONTEND
+template <typename CT>    // SIMD16 overload of the generic bounding box; CT and tri are not referenced in the body
+INLINE void calcBoundingBoxIntVertical(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox)
+{
+    simd16scalari vMinX = vX[0];    // per-lane minimum X over the three verts
+
+    vMinX = _simd16_min_epi32(vMinX, vX[1]);
+    vMinX = _simd16_min_epi32(vMinX, vX[2]);
+
+    simd16scalari vMaxX = vX[0];    // per-lane maximum X
+
+    vMaxX = _simd16_max_epi32(vMaxX, vX[1]);
+    vMaxX = _simd16_max_epi32(vMaxX, vX[2]);
+
+    simd16scalari vMinY = vY[0];    // per-lane minimum Y
+
+    vMinY = _simd16_min_epi32(vMinY, vY[1]);
+    vMinY = _simd16_min_epi32(vMinY, vY[2]);
+
+    simd16scalari vMaxY = vY[0];    // per-lane maximum Y
+
+    vMaxY = _simd16_max_epi32(vMaxY, vY[1]);
+    vMaxY = _simd16_max_epi32(vMaxY, vY[2]);
+
+    bbox.xmin = vMinX;    // stored without any expansion
+    bbox.xmax = vMaxX;
+    bbox.ymin = vMinY;
+    bbox.ymax = vMaxY;
+}
+
+#endif
+//////////////////////////////////////////////////////////////////////////
+/// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
+/// Offsets BBox for conservative rast: the per-lane min/max box is grown by CT::BoundingBoxOffsetT::value on every side.
+template <>
+INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
+{
+    // FE conservative rast traits
+    typedef FEConservativeRastT CT;
+
+    simdscalari vMinX = vX[0];
+    vMinX = _simd_min_epi32(vMinX, vX[1]);
+    vMinX = _simd_min_epi32(vMinX, vX[2]);
+
+    simdscalari vMaxX = vX[0];
+    vMaxX = _simd_max_epi32(vMaxX, vX[1]);
+    vMaxX = _simd_max_epi32(vMaxX, vX[2]);
+
+    simdscalari vMinY = vY[0];
+    vMinY = _simd_min_epi32(vMinY, vY[1]);
+    vMinY = _simd_min_epi32(vMinY, vY[2]);
+
+    simdscalari vMaxY = vY[0];
+    vMaxY = _simd_max_epi32(vMaxY, vY[1]);
+    vMaxY = _simd_max_epi32(vMaxY, vY[2]);
+
+    /// Conservative rasterization calls for the bounding box to be expanded by 1/512 before snapping to 16.8.
+    /// This code instead grows the box by CT::BoundingBoxOffsetT::value (1/256 according to the original note); coverage will be correctly handled in the rasterizer.
+    bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
+    bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
+    bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
+    bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
+}
+
+#if USE_SIMD16_FRONTEND
+template <>    // SIMD16 form of the FEConservativeRastT specialization: min/max box grown by CT::BoundingBoxOffsetT::value on every side
+INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox)
+{
+    // FE conservative rast traits
+    typedef FEConservativeRastT CT;
+
+    simd16scalari vMinX = vX[0];
+    vMinX = _simd16_min_epi32(vMinX, vX[1]);
+    vMinX = _simd16_min_epi32(vMinX, vX[2]);
+
+    simd16scalari vMaxX = vX[0];
+    vMaxX = _simd16_max_epi32(vMaxX, vX[1]);
+    vMaxX = _simd16_max_epi32(vMaxX, vX[2]);
+
+    simd16scalari vMinY = vY[0];
+    vMinY = _simd16_min_epi32(vMinY, vY[1]);
+    vMinY = _simd16_min_epi32(vMinY, vY[2]);
+
+    simd16scalari vMaxY = vY[0];
+    vMaxY = _simd16_max_epi32(vMaxY, vY[1]);
+    vMaxY = _simd16_max_epi32(vMaxY, vY[2]);
+
+    /// Conservative rasterization calls for the bounding box to be expanded by 1/512 before snapping to 16.8.
+    /// This code instead grows the box by CT::BoundingBoxOffsetT::value (1/256 according to the original note); coverage will be correctly handled in the rasterizer.
+    bbox.xmin = _simd16_sub_epi32(vMinX, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
+    bbox.xmax = _simd16_add_epi32(vMaxX, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
+    bbox.ymin = _simd16_sub_epi32(vMinY, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
+    bbox.ymax = _simd16_add_epi32(vMaxY, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
+}
+
+#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index 6a5bf6c..bf542f1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -160,35 +160,35 @@
     return i;
 }
 
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipTriangles, pDC->drawId);
     Clipper<3> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
     AR_END(FEClipTriangles, 1);
 }
 
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipLines, pDC->drawId);
     Clipper<2> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
     AR_END(FEClipLines, 1);
 }
 
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipPoints, pDC->drawId);
     Clipper<1> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
     AR_END(FEClipPoints, 1);
 }
 
 #if USE_SIMD16_FRONTEND
-void ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipTriangles, pDC->drawId);
@@ -198,12 +198,12 @@
     Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);
 
     pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
 
     AR_END(FEClipTriangles, 1);
 }
 
-void ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipLines, pDC->drawId);
@@ -213,12 +213,12 @@
     Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);
 
     pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
 
     AR_END(FEClipLines, 1);
 }
 
-void ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipPoints, pDC->drawId);
@@ -228,7 +228,7 @@
     Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);
 
     pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
 
     AR_END(FEClipPoints, 1);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 28042d5..36c8402 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -459,7 +459,7 @@
 
 #endif
     // clip SIMD primitives
-    void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx)
+    void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
     {
         // input/output vertex store for clipper
         simdvertex vertices[7]; // maximum 7 verts generated per triangle
@@ -489,7 +489,7 @@
             // Compute absolute attrib slot in vertex array
             uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
             maxSlot = std::max<int32_t>(maxSlot, mapSlot);
-            uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
+            uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
 
             pa.Assemble(inputSlot, tmpVector);
 
@@ -559,7 +559,6 @@
         
         uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
         uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
-        uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
 
         const simdscalari vOffsets = _mm256_set_epi32(
             0 * sizeof(simdvertex),  // unused lane
@@ -610,7 +609,7 @@
             uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
 
 #if USE_SIMD16_FRONTEND
-            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - use dx11_clipping_03-09 failures to check for existence of bug
+            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
             static const float *dummy = reinterpret_cast<const float *>(pBase);
 #endif
 
@@ -626,10 +625,10 @@
             }
 
             // transpose attribs
-            pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
+            pBase = (uint8_t*)(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
             {
-                uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
+                uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
                 for (uint32_t c = 0; c < 4; ++c)
                 {
 #if USE_SIMD16_FRONTEND
@@ -673,7 +672,7 @@
                 }
             }
 
-            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
+            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
 
             while (clipPa.GetNextStreamOutput())
             {
@@ -697,7 +696,7 @@
                         }
 
                         clipPa.useAlternateOffset = false;
-                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
+                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
                     }
 #else
                     simdvector attrib[NumVertsPerPrim];
@@ -705,7 +704,7 @@
                     if (assemble)
                     {
                         static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
-                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
+                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
                     }
 #endif
                 } while (clipPa.NextPrim());
@@ -717,7 +716,7 @@
     }
     
 #if USE_SIMD16_FRONTEND
-    void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId, const simd16scalari& vViewportIdx)
+    void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId)
     {
         // input/output vertex store for clipper
         simd16vertex vertices[7]; // maximum 7 verts generated per triangle
@@ -747,7 +746,7 @@
             // Compute absolute attrib slot in vertex array
             uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
             maxSlot = std::max<int32_t>(maxSlot, mapSlot);
-            uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
+            uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
 
             pa.Assemble_simd16(inputSlot, tmpVector);
 
@@ -817,7 +816,6 @@
 
         uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
         uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
-        uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
 
         const simdscalari vOffsets = _simd_set_epi32(
             0 * sizeof(simd16vertex),   // unused lane
@@ -867,7 +865,7 @@
             uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
 
 #if 0
-            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - use dx11_clipping_03-09 failures to check for existence of bug
+            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
             static const float *dummy = reinterpret_cast<const float *>(pBase);
 #endif
 
@@ -879,10 +877,10 @@
             }
 
             // transpose attribs
-            pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
+            pBase = (uint8_t*)(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
             {
-                uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
+                uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
                 for (uint32_t c = 0; c < 4; ++c)
                 {
                     simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
@@ -914,7 +912,7 @@
                 }
             }
 
-            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
+            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
 
             while (clipPa.GetNextStreamOutput())
             {
@@ -928,7 +926,7 @@
                         static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff };
 
                         clipPa.useAlternateOffset = false;
-                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]), _simd16_set1_epi32(pViewportIdx[inputPrim]));
+                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]));
                     }
 
                 } while (clipPa.NextPrim());
@@ -945,10 +943,11 @@
 
 #endif
     // execute the clipper stage
-    void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
+    void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
     {
-        SWR_ASSERT(pa.pDC != nullptr);
-        SWR_CONTEXT* pContext = pa.pDC->pContext;
+        SWR_ASSERT(this->pDC != nullptr);
+        SWR_CONTEXT* pContext = this->pDC->pContext;
+        const API_STATE& apiState = this->pDC->pState->state;
 
         // set up binner based on PA state
         PFN_PROCESS_PRIMS pfnBinner;
@@ -965,13 +964,27 @@
             pfnBinner = BinLines;
             break;
         default:
-            pfnBinner = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
+            pfnBinner = GetBinTrianglesFunc((apiState.rastState.conservativeRast > 0));
             break;
         };
 
         // update clipper invocations pipeline stat
         uint32_t numInvoc = _mm_popcnt_u32(primMask);
         UPDATE_STAT_FE(CInvocations, numInvoc);
+        
+        // Read back viewport index if required
+        simdscalari viewportIdx = _simd_set1_epi32(0);
+        if (state.backendState.readViewportArrayIndex)
+        {
+            simdvector vpiAttrib[NumVertsPerPrim];
+            pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
+            simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+            // OOB indices => forced to zero.
+            simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+            simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
+            viewportIdx = _simd_and_si(vClearMask, vpai);
+        }
 
         ComputeClipCodes(prim, viewportIdx);
 
@@ -1000,7 +1013,7 @@
             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
             // we have to clip tris, execute the clipper, which will also
             // call the binner
-            ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
+            ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
             AR_END(FEGuardbandClip, 1);
         }
         else if (validMask)
@@ -1009,12 +1022,12 @@
             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 
             // forward valid prims directly to binner
-            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
+            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
         }
     }
 
 #if USE_SIMD16_FRONTEND
-    void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+    void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId)
     {
         SWR_ASSERT(pa.pDC != nullptr);
         SWR_CONTEXT* pContext = pa.pDC->pContext;
@@ -1042,6 +1055,19 @@
         uint32_t numInvoc = _mm_popcnt_u32(primMask);
         UPDATE_STAT_FE(CInvocations, numInvoc);
 
+        // Read back viewport index if required
+        simd16scalari viewportIdx = _simd16_set1_epi32(0);
+        if (state.backendState.readViewportArrayIndex)
+        {
+            simd16vector vpiAttrib[NumVertsPerPrim];
+            pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+
+            // OOB indices => forced to zero.
+            simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+            simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+            simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
+            viewportIdx = _simd16_and_si(vClearMask, vpai);
+        }
         ComputeClipCodes(prim, viewportIdx);
 
         // cull prims with NAN coords
@@ -1069,7 +1095,7 @@
             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
             // we have to clip tris, execute the clipper, which will also
             // call the binner
-            ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId, viewportIdx);
+            ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
             AR_END(FEGuardbandClip, 1);
         }
         else if (validMask)
@@ -1078,7 +1104,7 @@
             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 
             // forward valid prims directly to binner
-            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
+            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
         }
     }
 
@@ -1154,7 +1180,7 @@
     {
         simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
         simd16scalar vSrc = _simd16_setzero_ps();
-        return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, _simd16_castps_si(vMask), 1);
+        return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
     }
 
 #endif
@@ -1204,6 +1230,8 @@
         uint32_t numInAttribs,          // number of attributes per vertex.
         float *pOutVerts)               // array of output positions. We'll write our new intersection point at i*4.
     {
+        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
+
         // compute interpolation factor
         simdscalar t;
         switch (ClippingPlane)
@@ -1237,7 +1265,7 @@
         // interpolate attributes and store
         for (uint32_t a = 0; a < numInAttribs; ++a)
         {
-            uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+            uint32_t attribSlot = vertexAttribOffset + a;
             for (uint32_t c = 0; c < 4; ++c)
             {
                 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
@@ -1286,6 +1314,8 @@
         uint32_t numInAttribs,          // number of attributes per vertex.
         float *pOutVerts)               // array of output positions. We'll write our new intersection point at i*4.
     {
+        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
+
         // compute interpolation factor
         simd16scalar t;
         switch (ClippingPlane)
@@ -1319,7 +1349,7 @@
         // interpolate attributes and store
         for (uint32_t a = 0; a < numInAttribs; ++a)
         {
-            uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+            uint32_t attribSlot = vertexAttribOffset + a;
             for (uint32_t c = 0; c < 4; ++c)
             {
                 simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
@@ -1395,6 +1425,8 @@
     template<SWR_CLIPCODES ClippingPlane>
     simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
     {
+        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
+
         simdscalari vCurIndex = _simd_setzero_si();
         simdscalari vOutIndex = _simd_setzero_si();
         simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
@@ -1435,7 +1467,7 @@
                 // store attribs
                 for (uint32_t a = 0; a < numInAttribs; ++a)
                 {
-                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+                    uint32_t attribSlot = vertexAttribOffset + a;
                     for (uint32_t c = 0; c < 4; ++c)
                     {
                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
@@ -1489,6 +1521,8 @@
     template<SWR_CLIPCODES ClippingPlane>
     simd16scalari ClipTriToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
     {
+        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
+
         simd16scalari vCurIndex = _simd16_setzero_si();
         simd16scalari vOutIndex = _simd16_setzero_si();
         simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));
@@ -1529,7 +1563,7 @@
                 // store attribs
                 for (uint32_t a = 0; a < numInAttribs; ++a)
                 {
-                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+                    uint32_t attribSlot = vertexAttribOffset + a;
                     for (uint32_t c = 0; c < 4; ++c)
                     {
                         simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
@@ -1583,6 +1617,8 @@
     template<SWR_CLIPCODES ClippingPlane>
     simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
     {
+        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
+
         simdscalari vCurIndex = _simd_setzero_si();
         simdscalari vOutIndex = _simd_setzero_si();
         simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
@@ -1620,7 +1656,7 @@
                 // interpolate attributes and store
                 for (uint32_t a = 0; a < numInAttribs; ++a)
                 {
-                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+                    uint32_t attribSlot = vertexAttribOffset + a;
                     for (uint32_t c = 0; c < 4; ++c)
                     {
                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
@@ -1653,7 +1689,7 @@
                 // interpolate attributes and store
                 for (uint32_t a = 0; a < numInAttribs; ++a)
                 {
-                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+                    uint32_t attribSlot = vertexAttribOffset + a;
                     for (uint32_t c = 0; c < 4; ++c)
                     {
                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
@@ -1673,6 +1709,8 @@
     template<SWR_CLIPCODES ClippingPlane>
     simd16scalari ClipLineToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
     {
+        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
+
         simd16scalari vCurIndex = _simd16_setzero_si();
         simd16scalari vOutIndex = _simd16_setzero_si();
         simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));
@@ -1710,7 +1748,7 @@
                 // interpolate attributes and store
                 for (uint32_t a = 0; a < numInAttribs; ++a)
                 {
-                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+                    uint32_t attribSlot = vertexAttribOffset + a;
                     for (uint32_t c = 0; c < 4; ++c)
                     {
                         simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
@@ -1743,7 +1781,7 @@
                 // interpolate attributes and store
                 for (uint32_t a = 0; a < numInAttribs; ++a)
                 {
-                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+                    uint32_t attribSlot = vertexAttribOffset + a;
                     for (uint32_t c = 0; c < 4; ++c)
                     {
                         simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
@@ -1853,12 +1891,12 @@
 
 
 // pipeline stage functions
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
 #if USE_SIMD16_FRONTEND
-void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
-void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
-void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 94085e5..5f32f6f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -39,7 +39,7 @@
 #include "core/arena.h"
 #include "core/fifo.hpp"
 #include "core/knobs.h"
-#include "common/simdintrin.h"
+#include "common/intrin.h"
 #include "core/threads.h"
 #include "ringbuffer.h"
 #include "archrast/archrast.h"
@@ -62,7 +62,6 @@
     uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
     uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
     float pointSize;
-    uint32_t primID;
     uint32_t renderTargetArrayIndex;
     uint32_t viewportIndex;
 };
@@ -215,12 +214,12 @@
 
 // function signature for pipeline stages that execute after primitive assembly
 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], 
-    uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+    uint32_t primMask, simdscalari primID);
 
 #if ENABLE_AVX512_SIMD16
 // function signature for pipeline stages that execute after primitive assembly
-typedef void(SIMDAPI *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
-    uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
+    uint32_t primMask, simd16scalari primID);
 
 #endif
 OSALIGNLINE(struct) API_STATE
@@ -245,6 +244,8 @@
     PFN_CS_FUNC             pfnCsFunc;
     uint32_t                totalThreadsInGroup;
     uint32_t                totalSpillFillSize;
+    uint32_t                scratchSpaceSize;
+    uint32_t                scratchSpaceNumInstances;
 
     // FE - Frontend State
     SWR_FRONTEND_STATE      frontendState;
@@ -408,16 +409,14 @@
     bool            dependent;      // Backend work is dependent on all previous BE
     bool            isCompute;      // Is this DC a compute context?
     bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
-    volatile bool   doneFE;         // Is FE work done for this draw?
 
     FE_WORK         FeWork;
 
+    volatile OSALIGNLINE(bool)       doneFE;         // Is FE work done for this draw?
     volatile OSALIGNLINE(uint32_t)   FeLock;
-    volatile int32_t    threadsDone;
+    volatile OSALIGNLINE(uint32_t)   threadsDone;
 
     SYNC_DESC       retireCallback; // Call this func when this DC is retired.
-
-
 };
 
 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
@@ -504,9 +503,9 @@
     // Scratch space for workers.
     uint8_t** ppScratch;
 
-    volatile int32_t  drawsOutstandingFE;
+    volatile OSALIGNLINE(uint32_t)  drawsOutstandingFE;
 
-    CachingAllocator cachingArenaAllocator;
+    OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
     uint32_t frameCount;
 
     uint32_t lastFrameChecked;
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index 7e07e6a..49ba71f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -109,8 +109,8 @@
 
         auto lambda = [&](int32_t i)
         {
-            __m256 vSrc = _simd_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
-            _simd_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
+            __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
+            _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
         };
             
         const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
index 087657b..4e642f8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -202,7 +202,7 @@
 /// @param pSrc - source data in SOA form
 /// @param dst - output data in SOA form
 template<SWR_FORMAT SrcFormat>
-INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
+INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst)
 {
     // fast path for float32
     if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
@@ -247,7 +247,7 @@
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
 template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Clamp(simd16scalar vComp, uint32_t Component)
 {
     if (FormatTraits<Format>::isNormalized(Component))
     {
@@ -293,7 +293,7 @@
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
 template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Normalize(simd16scalar vComp, uint32_t Component)
 {
     if (FormatTraits<Format>::isNormalized(Component))
     {
@@ -309,7 +309,7 @@
 /// @param src - source data in SOA form
 /// @param dst - output data in SOA form
 template<SWR_FORMAT DstFormat>
-INLINE void SIMDAPI StoreSOA(const simd16vector &src, uint8_t *pDst)
+INLINE void SIMDCALL StoreSOA(const simd16vector &src, uint8_t *pDst)
 {
     // fast path for float32
     if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
index 6c42804..1721aa4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_traits.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
@@ -20,7 +20,7 @@
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
-* @file gen_format_traits.h
+* @file format_traits.h
 *
 * @brief Format Traits.  auto-generated file
 *
@@ -30,7 +30,7 @@
 #pragma once
 
 #include "format_types.h"
-#include "utils.h"
+#include "format_utils.h"
 
 //////////////////////////////////////////////////////////////////////////
 /// FormatSwizzle - Component swizzle selects
@@ -2862,6 +2862,28 @@
 };
 
 //////////////////////////////////////////////////////////////////////////
+/// FormatTraits<DXT1_RGB_SRGB> - Format traits specialization for DXT1_RGB_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<DXT1_RGB_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8>                  FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
 /// FormatTraits<YCRCB_SWAPUVY> - Format traits specialization for YCRCB_SWAPUVY
 //////////////////////////////////////////////////////////////////////////
 template<> struct FormatTraits<YCRCB_SWAPUVY> :
@@ -3082,6 +3104,28 @@
 };
 
 //////////////////////////////////////////////////////////////////////////
+/// FormatTraits<DXT1_RGB> - Format traits specialization for DXT1_RGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<DXT1_RGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8>                  FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8_UNORM> - Format traits specialization for R8G8B8_UNORM
 //////////////////////////////////////////////////////////////////////////
 template<> struct FormatTraits<R8G8B8_UNORM> :
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index 5f21c96..43053b6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -28,6 +28,7 @@
 #pragma once
 
 #include "utils.h"
+#include "common/simdintrin.h"
 
 //////////////////////////////////////////////////////////////////////////
 /// PackTraits - Helpers for packing / unpacking same pixel sizes
@@ -42,7 +43,7 @@
     static simdscalar pack(simdscalar &in) = delete;
 #if ENABLE_AVX512_SIMD16
     static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) = delete;
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) = delete;
     static simd16scalar unpack(simd16scalar &in) = delete;
     static simd16scalar pack(simd16scalar &in) = delete;
 #endif
@@ -62,7 +63,7 @@
     static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
 #if ENABLE_AVX512_SIMD16
     static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) { return; }
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) { return; }
     static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
     static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
 #endif
@@ -108,7 +109,7 @@
 
         __m256i result = _mm256_castsi128_si256(resLo);
         result = _mm256_insertf128_si256(result, resHi, 1);
-        return _mm256_castsi256_ps(result);
+        return simdscalar{ _mm256_castsi256_ps(result) };
 #else
         return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 #endif
@@ -143,7 +144,7 @@
         return result;
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         // store simd16 bytes
         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@@ -151,7 +152,8 @@
 
     static simd16scalar unpack(simd16scalar &in)
     {
-        simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
+        simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
+        simd16scalari result = _simd16_cvtepu8_epi32(tmp);
 
         return _simd16_castsi_ps(result);
     }
@@ -258,7 +260,7 @@
         return result;
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         // store simd16 bytes
         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@@ -266,7 +268,8 @@
 
     static simd16scalar unpack(simd16scalar &in)
     {
-        simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
+        simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
+        simd16scalari result = _simd16_cvtepu8_epi32(tmp);
 
         return _simd16_castsi_ps(result);
     }
@@ -369,7 +372,7 @@
         return result;
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
     }
@@ -468,7 +471,7 @@
         return result;
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
     }
@@ -513,7 +516,7 @@
         return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
     }
@@ -811,7 +814,7 @@
 
 #if ENABLE_AVX512_SIMD16
 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
-inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
+inline static simd16scalar SIMDCALL fastpow(simd16scalar value)
 {
     static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
         * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
@@ -833,7 +836,7 @@
     return result;
 }
 
-inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
+inline static simd16scalar SIMDCALL pow512_4(simd16scalar arg)
 {
     // 5/12 is too small, so compute the 4th root of 20/12 instead.
     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
@@ -854,7 +857,7 @@
     return xavg;
 }
 
-inline static simd16scalar SIMDAPI powf_wrapper(const simd16scalar base, float exp)
+inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar base, float exp)
 {
     const float *f = reinterpret_cast<const float *>(&base);
 
@@ -1110,53 +1113,53 @@
 };
 
 //////////////////////////////////////////////////////////////////////////
+/// FormatIntType - Calculate base integer type for pixel components based
+///                 on total number of bits.  Components can be smaller
+///                 than this type, but the entire pixel must not be
+///                 any smaller than this type.
+//////////////////////////////////////////////////////////////////////////
+template <uint32_t bits, bool bits8 = bits <= 8, bool bits16 = bits <= 16>
+struct FormatIntType
+{
+    typedef uint32_t TYPE;  // primary template: with the default flags this is chosen when bits > 16
+};
+
+template <uint32_t bits>
+struct FormatIntType<bits, true, true>
+{
+    typedef uint8_t TYPE;   // chosen when bits <= 8
+};
+
+template <uint32_t bits>
+struct FormatIntType<bits, false, true>
+{
+    typedef uint16_t TYPE;  // chosen when 8 < bits <= 16
+};
+
+//////////////////////////////////////////////////////////////////////////
 /// Format1 - Bitfield for single component formats.
 //////////////////////////////////////////////////////////////////////////
 template<uint32_t x>
-struct Format1
+union Format1
 {
-    union
+    typedef typename FormatIntType<x>::TYPE TYPE;
+    struct
     {
-        uint32_t r : x;
-
-        ///@ The following are here to provide full template needed in Formats.
-        uint32_t g : x;
-        uint32_t b : x;
-        uint32_t a : x;
+        TYPE r : x;
     };
-};
 
-//////////////////////////////////////////////////////////////////////////
-/// Format1 - Bitfield for single component formats - 8 bit specialization
-//////////////////////////////////////////////////////////////////////////
-template<>
-struct Format1<8>
-{
-    union
+    ///@ The following are here to provide full template needed in Formats.
+    struct
     {
-        uint8_t r;
-
-        ///@ The following are here to provide full template needed in Formats.
-        uint8_t g;
-        uint8_t b;
-        uint8_t a;
+        TYPE g : x;
     };
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Format1 - Bitfield for single component formats - 16 bit specialization
-//////////////////////////////////////////////////////////////////////////
-template<>
-struct Format1<16>
-{
-    union
+    struct 
     {
-        uint16_t r;
-
-        ///@ The following are here to provide full template needed in Formats.
-        uint16_t g;
-        uint16_t b;
-        uint16_t a;
+        TYPE b : x;
+    };
+    struct  
+    {
+        TYPE a : x;
     };
 };
 
@@ -1166,35 +1169,18 @@
 template<uint32_t x, uint32_t y>
 union Format2
 {
-    struct
-    {
-        uint32_t r : x;
-        uint32_t g : y;
-    };
-    struct
-    {
-        ///@ The following are here to provide full template needed in Formats.
-        uint32_t b : x;
-        uint32_t a : y;
-    };
-};
+    typedef typename FormatIntType<x + y>::TYPE TYPE;
 
-//////////////////////////////////////////////////////////////////////////
-/// Format2 - Bitfield for 2 component formats - 16 bit specialization
-//////////////////////////////////////////////////////////////////////////
-template<>
-union Format2<8,8>
-{
     struct
     {
-        uint16_t r : 8;
-        uint16_t g : 8;
+        TYPE r : x;
+        TYPE g : y;
     };
     struct
     {
         ///@ The following are here to provide full template needed in Formats.
-        uint16_t b : 8;
-        uint16_t a : 8;
+        TYPE b : x;
+        TYPE a : y;
     };
 };
 
@@ -1204,28 +1190,15 @@
 template<uint32_t x, uint32_t y, uint32_t z>
 union Format3
 {
-    struct
-    {
-        uint32_t r : x;
-        uint32_t g : y;
-        uint32_t b : z;
-    };
-    uint32_t a;  ///@note This is here to provide full template needed in Formats.
-};
+    typedef typename FormatIntType<x + y + z>::TYPE TYPE;
 
-//////////////////////////////////////////////////////////////////////////
-/// Format3 - Bitfield for 3 component formats - 16 bit specialization
-//////////////////////////////////////////////////////////////////////////
-template<>
-union Format3<5,6,5>
-{
     struct
     {
-        uint16_t r : 5;
-        uint16_t g : 6;
-        uint16_t b : 5;
+        TYPE r : x;
+        TYPE g : y;
+        TYPE b : z;
     };
-    uint16_t a;  ///@note This is here to provide full template needed in Formats.
+    TYPE a;  ///@note This is here to provide full template needed in Formats.
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -1234,34 +1207,12 @@
 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
 struct Format4
 {
-    uint32_t r : x;
-    uint32_t g : y;
-    uint32_t b : z;
-    uint32_t a : w;
-};
+    typedef typename FormatIntType<x + y + z + w>::TYPE TYPE;
 
-//////////////////////////////////////////////////////////////////////////
-/// Format4 - Bitfield for 4 component formats - 16 bit specialization
-//////////////////////////////////////////////////////////////////////////
-template<>
-struct Format4<5,5,5,1>
-{
-    uint16_t r : 5;
-    uint16_t g : 5;
-    uint16_t b : 5;
-    uint16_t a : 1;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Format4 - Bitfield for 4 component formats - 16 bit specialization
-//////////////////////////////////////////////////////////////////////////
-template<>
-struct Format4<4,4,4,4>
-{
-    uint16_t r : 4;
-    uint16_t g : 4;
-    uint16_t b : 4;
-    uint16_t a : 4;
+    TYPE r : x;
+    TYPE g : y;
+    TYPE b : z;
+    TYPE a : w;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -1461,7 +1412,7 @@
         return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
     }
 
-    INLINE static void SIMDAPI storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
+    INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
     {
         switch (comp)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h b/src/gallium/drivers/swr/rasterizer/core/format_utils.h
new file mode 100644
index 0000000..576f14b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/format_utils.h
@@ -0,0 +1,882 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file format_utils.h
+*
+* @brief Utilities used by SWR core related to pixel formats.
+*
+******************************************************************************/
+#pragma once
+
+#include "core/utils.h"
+#include "common/simdintrin.h"
+
+INLINE
+void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
+{   // Rewrites the four float rows in place. NOTE(review): this is a 4x4 transpose provided SIMD128::unpack* follow SSE unpack semantics - confirm.
+    simd4scalari row0i = SIMD128::castps_si(row0);     // the shuffles below operate on the integer view of each row
+    simd4scalari row1i = SIMD128::castps_si(row1);
+    simd4scalari row2i = SIMD128::castps_si(row2);
+    simd4scalari row3i = SIMD128::castps_si(row3);
+
+    simd4scalari vTemp = row2i;                         // keep the original row2 for the high unpack
+    row2i = SIMD128::unpacklo_epi32(row2i, row3i);      // low 32-bit interleave of rows 2 and 3
+    vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);      // high 32-bit interleave of rows 2 and 3
+
+    row3i = row0i;                                      // row3 was consumed above; reuse it to hold the original row0
+    row0i = SIMD128::unpacklo_epi32(row0i, row1i);      // low 32-bit interleave of rows 0 and 1
+    row3i = SIMD128::unpackhi_epi32(row3i, row1i);      // high 32-bit interleave of rows 0 and 1
+
+    row1i = row0i;                                      // copy so both 64-bit halves of the low interleave can be used
+    row0i = SIMD128::unpacklo_epi64(row0i, row2i);
+    row1i = SIMD128::unpackhi_epi64(row1i, row2i);
+
+    row2i = row3i;                                      // copy so both 64-bit halves of the high interleave can be used
+    row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
+    row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
+
+    row0 = SIMD128::castsi_ps(row0i);                   // write the results back through the reference parameters
+    row1 = SIMD128::castsi_ps(row1i);
+    row2 = SIMD128::castsi_ps(row2i);
+    row3 = SIMD128::castsi_ps(row3i);
+}
+
+INLINE
+void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
+{   // Integer overload: same shuffle sequence as the float version, applied directly to the rows in place.
+    simd4scalari vTemp = row2;                          // keep the original row2 for the high unpack
+    row2 = SIMD128::unpacklo_epi32(row2, row3);         // low 32-bit interleave of rows 2 and 3
+    vTemp = SIMD128::unpackhi_epi32(vTemp, row3);       // high 32-bit interleave of rows 2 and 3
+
+    row3 = row0;                                        // row3 was consumed above; reuse it to hold the original row0
+    row0 = SIMD128::unpacklo_epi32(row0, row1);         // low 32-bit interleave of rows 0 and 1
+    row3 = SIMD128::unpackhi_epi32(row3, row1);         // high 32-bit interleave of rows 0 and 1
+
+    row1 = row0;                                        // copy so both 64-bit halves of the low interleave can be used
+    row0 = SIMD128::unpacklo_epi64(row0, row2);
+    row1 = SIMD128::unpackhi_epi64(row1, row2);
+
+    row2 = row3;                                        // copy so both 64-bit halves of the high interleave can be used
+    row2 = SIMD128::unpacklo_epi64(row2, vTemp);
+    row3 = SIMD128::unpackhi_epi64(row3, vTemp);
+}
+
+#if KNOB_SIMD_WIDTH == 8
+INLINE
+void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
+{   // Three 8-wide sources (x, y, z) in, eight 4-wide vectors out; the fourth (w) lane of every output comes from _simd_setzero_ps().
+    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
+    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps());     //y0w0y1w1 y4w4y5w5 (w lanes are zero)
+    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
+    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5
+
+    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                             //x2z2x3z3 x6z6x7z7
+    r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps());                //y2w2y3w3 y6w6y7w7 (w lanes are zero)
+    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
+    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7
+
+    vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);                      // half 0 of each temporary -> elements 0..3
+    vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
+    vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
+    vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
+
+    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);                      // half 1 of each temporary -> elements 4..7
+    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
+    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
+    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
+}
+
+INLINE
+void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
+{   // Four 8-wide sources (x, y, z, w) in, eight 4-wide xyzw vectors out.
+    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
+    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
+    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);  //x0y0z0w0 x4y4z4w4
+    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);  //x1y1z1w1 x5y5z5w5
+
+    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                 //x2z2x3z3 x6z6x7z7
+    r1rx = _simd_unpackhi_ps(vSrc1, vSrc3);                 //y2w2y3w3 y6w6y7w7
+    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
+    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7
+
+    vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);          // half 0 of each temporary -> elements 0..3
+    vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
+    vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
+    vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
+
+    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);          // half 1 of each temporary -> elements 4..7
+    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
+    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
+    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
+}
+
+#if ENABLE_AVX512_SIMD16
+INLINE
+void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
+{   // Four 16-wide sources in, four 16-wide interleaved vectors out (permute each source, then two rounds of unpack).
+    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
+
+    simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
+    simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
+    simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
+    simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
+
+    simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2); // r/b interleave, low part
+    simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3); // g/a interleave, low part
+    simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2); // r/b interleave, high part
+    simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3); // g/a interleave, high part
+
+    dst[0] = _simd16_unpacklo_ps(rblo, galo);            // NOTE(review): rgba for elements 0..3 if the wrappers follow AVX-512 per-128-bit unpack semantics - confirm
+    dst[1] = _simd16_unpackhi_ps(rblo, galo);            // presumably elements 4..7
+    dst[2] = _simd16_unpacklo_ps(rbhi, gahi);            // presumably elements 8..B
+    dst[3] = _simd16_unpackhi_ps(rbhi, gahi);            // presumably elements C..F
+}
+
+#endif
+INLINE
+void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
+{   // Three stages: pairwise unpack, 4-element shuffle, then 128-bit half selection into vDst. NOTE(review): by its name this is an 8x8 float transpose - confirm.
+    simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);                    // stage 1: interleave adjacent input pairs (0/1, 2/3, 4/5, 6/7)
+    simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
+    simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
+    simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
+    simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
+    simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
+    simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
+    simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
+    simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));    // stage 2: combine pair results from inputs 0..3 (tt0..tt3) and 4..7 (tt4..tt7)
+    simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
+    simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
+    simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
+    simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
+    simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
+    simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
+    simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
+    vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);                    // stage 3: 0x20 - presumably the low 128-bit halves of both operands (verify against wrapper)
+    vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
+    vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
+    vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
+    vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);                    // 0x31 - presumably the high 128-bit halves of both operands
+    vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
+    vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
+    vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
+}
+
+INLINE
+void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
+{   // Integer-input overload: casts every source to simdscalar and forwards to the float overload; vDst stays simdscalar.
+    vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3), 
+        _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// TransposeSingleComponent - straight copy; bpp is the element size in bits
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t bpp>
+struct TransposeSingleComponent
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Pass-thru for single component: the data is copied unchanged.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);    // byte count = bpp bits * KNOB_SIMD_WIDTH elements / 8
+    }
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);  // same copy, sized by KNOB_SIMD16_WIDTH elements
+    }
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8_8_8 - four 8-bit components per element
+//////////////////////////////////////////////////////////////////////////
+struct Transpose8_8_8_8
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
+
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH <= KNOB_ARCH_AVX
+        simd4scalari c0c1 = src.v4[0];                                                          // rrrrrrrrgggggggg
+        simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1));  // bbbbbbbbaaaaaaaa
+        simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
+        simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
+        simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
+        simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
+        simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23);                                       // rgbargbargbargba (elements 0..3)
+        simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23);                                       // rgbargbargbargba (elements 4..7)
+        SIMD128::store_si((simd4scalari*)pDst, c0123lo);
+        SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);                                         // second 16 bytes of the output
+#else
+        simdscalari dst01 = _simd_shuffle_epi8(src,                                                     // NOTE(review): 0x80 selector bytes presumably yield zero (pshufb semantics) - confirm
+            _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
+        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);                                  // swap the two 128-bit halves of src
+        dst23 = _simd_shuffle_epi8(dst23,
+            _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
+        simdscalari dst = _simd_or_si(dst01, dst23);                                                    // merge the two shuffled results
+        _simd_store_si((simdscalari*)pDst, dst);
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
+        simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
+        simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
+        simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
+
+        simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);                           // widen each byte to its own 32-bit lane (presumably zero-extended)
+        simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
+        simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
+        simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
+
+        simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);                          // g -> bits 8..15
+        simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);                          // b -> bits 16..23
+        simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);                          // a -> bits 24..31
+
+        simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
+
+        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);             // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
+    }
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8_8 - placeholder; both entry points are deleted
+//////////////////////////////////////////////////////////////////////////
+struct Transpose8_8_8
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 8_8_8 data; declared deleted, so any call fails to compile.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // 16-wide variant is deleted as well
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8 - two 8-bit components per element
+//////////////////////////////////////////////////////////////////////////
+struct Transpose8_8
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
+
+        simd4scalari rg = src.v4[0];           // rrrrrrrr gggggggg
+        simd4scalari g = SIMD128::unpackhi_epi64(rg, rg);             // gggggggg gggggggg
+        rg = SIMD128::unpacklo_epi8(rg, g);                           // rgrgrgrg rgrgrgrg
+        SIMD128::store_si((simd4scalari*)pDst, rg);
+#else
+#error Unsupported vector width
+#endif
+    }
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
+        simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
+
+        simdscalari cvt0 = _simd_cvtepu8_epi16(src0);                               // widen each byte to a 16-bit element (presumably zero-extended)
+        simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
+
+        simdscalari shl1 = _simd_slli_epi32(cvt1, 8);                               // g -> high byte of each 16-bit element; the 32-bit shift relies on the top byte of every 16-bit element being zero
+
+        simdscalari dst = _simd_or_si(cvt0, shl1);
+
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);                 // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
+    }
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose32_32_32_32 - four 32-bit components per element
+//////////////////////////////////////////////////////////////////////////
+struct Transpose32_32_32_32
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalar src0 = _simd_load_ps((const float*)pSrc);          // one 8-float load per component
+        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
+        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
+        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
+
+        simd4scalar vDst[8];
+        vTranspose4x8(vDst, src0, src1, src2, src3);                  // one 4-float vector per element
+        SIMD128::store_ps((float*)pDst, vDst[0]);
+        SIMD128::store_ps((float*)pDst+4, vDst[1]);
+        SIMD128::store_ps((float*)pDst+8, vDst[2]);
+        SIMD128::store_ps((float*)pDst+12, vDst[3]);
+        SIMD128::store_ps((float*)pDst+16, vDst[4]);
+        SIMD128::store_ps((float*)pDst+20, vDst[5]);
+        SIMD128::store_ps((float*)pDst+24, vDst[6]);
+        SIMD128::store_ps((float*)pDst+28, vDst[7]);
+#else
+#error Unsupported vector width
+#endif
+    }
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));       // one 16-float load per component
+        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
+        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
+        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
+
+        simd16scalar dst[4];
+
+        vTranspose4x16(dst, src0, src1, src2, src3);
+
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);                   // 64 floats written in total
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
+    }
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose32_32_32 - three 32-bit components in, four floats written per element
+//////////////////////////////////////////////////////////////////////////
+struct Transpose32_32_32
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data; each output element is 4 floats with a zero 4th lane.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalar src0 = _simd_load_ps((const float*)pSrc);          // only three components are read
+        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
+        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
+
+        simd4scalar vDst[8];
+        vTranspose3x8(vDst, src0, src1, src2);                        // the callee supplies zero for the missing 4th component
+        SIMD128::store_ps((float*)pDst, vDst[0]);                     // stores advance 4 floats per element, so 32 floats are written
+        SIMD128::store_ps((float*)pDst + 4, vDst[1]);
+        SIMD128::store_ps((float*)pDst + 8, vDst[2]);
+        SIMD128::store_ps((float*)pDst + 12, vDst[3]);
+        SIMD128::store_ps((float*)pDst + 16, vDst[4]);
+        SIMD128::store_ps((float*)pDst + 20, vDst[5]);
+        SIMD128::store_ps((float*)pDst + 24, vDst[6]);
+        SIMD128::store_ps((float*)pDst + 28, vDst[7]);
+#else
+#error Unsupported vector width
+#endif
+    }
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
+        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
+        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
+        simd16scalar src3 = _simd16_setzero_ps();                                         // zero stands in for the missing 4th component
+
+        simd16scalar dst[4];
+
+        vTranspose4x16(dst, src0, src1, src2, src3);
+
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);                   // 64 floats written in total
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
+    }
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose32_32 - two 32-bit components per element
+//////////////////////////////////////////////////////////////////////////
+struct Transpose32_32
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        const float* pfSrc = (const float*)pSrc;
+        simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);             // first component, elements 0..3
+        simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);             // first component, elements 4..7
+        simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);             // second component, elements 0..3
+        simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);            // second component, elements 4..7
+
+        simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);      // interleave the two components, low part
+        simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);      // interleave the two components, high part
+        simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
+        simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
+
+        float* pfDst = (float*)pDst;
+        SIMD128::store_ps(pfDst + 0, dst0);                           // 16 floats written in total
+        SIMD128::store_ps(pfDst + 4, dst1);
+        SIMD128::store_ps(pfDst + 8, dst2);
+        SIMD128::store_ps(pfDst + 12, dst3);
+#else
+#error Unsupported vector width
+#endif
+    }
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));                 // rrrrrrrrrrrrrrrr
+        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);            // gggggggggggggggg
+
+        simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                                        // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
+        simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                                        // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
+
+        simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
+        simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
+
+        simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
+        simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
+
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);                               // rgrgrgrgrgrgrgrg
+        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);                               // rgrgrgrgrgrgrgrg
+    }
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose16_16_16_16 - four 16-bit components per element
+//////////////////////////////////////////////////////////////////////////
+struct Transpose16_16_16_16
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);                           // first vector: r in half 0, g in half 1
+        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));   // second vector: b in half 0, a in half 1
+
+        simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
+        simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
+        simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
+        simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
+
+        simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);     // 16-bit interleave of r and g, low part
+        simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);     // 16-bit interleave of r and g, high part
+        simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);     // 16-bit interleave of b and a, low part
+        simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);     // 16-bit interleave of b and a, high part
+
+        simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);        // 32-bit interleave of the rg and ba pairs
+        simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
+        simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
+        simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
+
+        SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);           // four 128-bit stores
+        SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
+        SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
+        SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
+#else
+#error Unsupported vector width
+#endif
+    }
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
+        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
+        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
+        simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3);          // aaaaaaaaaaaaaaaa
+
+        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
+        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
+        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
+        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
+
+        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rgba0 rgba1 rgba8 rgba9
+        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rgba2 rgba3 rgbaA rgbaB
+        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rgba4 rgba5 rgbaC rgbaD
+        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rgba6 rgba7 rgbaE rgbaF
+
+        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rgba0 rgba1 rgba2 rgba3
+        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rgba4 rgba5 rgba6 rgba7
+        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rgba8 rgba9 rgbaA rgbaB
+        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rgbaC rgbaD rgbaE rgbaF
+
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
+    }
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose16_16_16
+//////////////////////////////////////////////////////////////////////////
+struct Transpose16_16_16
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data; the fourth output component is filled with zero.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
+
+        simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
+        simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
+        simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));  // third plane follows the r/g vector
+        simd4scalari src_a = SIMD128::setzero_si();                                                 // no fourth plane in the source: use zero
+
+        simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
+        simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
+        simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
+        simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
+
+        simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
+        simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
+        simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
+        simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
+
+        SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
+        SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
+        SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
+        SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
+#else
+#error Unsupported vector width
+#endif
+    }
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
+        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
+        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
+        simdscalari src3 = _simd_setzero_si();                                                      // 0000000000000000 (no fourth plane in the source; 'a' below is zero)
+
+        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
+        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
+        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
+        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
+
+        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rgba0 rgba1 rgba8 rgba9
+        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rgba2 rgba3 rgbaA rgbaB
+        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rgba4 rgba5 rgbaC rgbaD
+        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rgba6 rgba7 rgbaE rgbaF
+
+        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rgba0 rgba1 rgba2 rgba3
+        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rgba4 rgba5 rgba6 rgba7
+        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rgba8 rgba9 rgbaA rgbaB
+        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rgbaC rgbaD rgbaE rgbaF
+
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
+    }
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose16_16
+//////////////////////////////////////////////////////////////////////////
+struct Transpose16_16
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 16_16 data by interleaving the 16-bit elements of the two component planes.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalar src = _simd_load_ps((const float*)pSrc);             // one vector holding both component planes
+
+        simd4scalar comp0 = _simd_extractf128_ps(src, 0);               // low half: first component
+        simd4scalar comp1 = _simd_extractf128_ps(src, 1);               // high half: second component
+
+        simd4scalari comp0i = SIMD128::castps_si(comp0);                // reinterpret as integers for the 16-bit unpacks
+        simd4scalari comp1i = SIMD128::castps_si(comp1);
+
+        simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);   // interleaved pairs from the low elements
+        simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);   // interleaved pairs from the high elements
+
+        SIMD128::store_si((simd4scalari*)pDst, resLo);
+        SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
+#else
+#error Unsupported vector width
+#endif
+    }
+#if ENABLE_AVX512_SIMD16
+
+    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+    {
+        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
+        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
+
+        simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
+        simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
+
+        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20);     // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
+        simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31);     // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
+
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgrgrgrgrgrgrgrg
+        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgrgrgrgrgrgrgrg
+    }
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose24_8
+//////////////////////////////////////////////////////////////////////////
+struct Transpose24_8
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 24_8 data; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose32_8_24
+//////////////////////////////////////////////////////////////////////////
+struct Transpose32_8_24
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 32_8_24 data; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose4_4_4_4
+//////////////////////////////////////////////////////////////////////////
+struct Transpose4_4_4_4
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 4_4_4_4 data; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose5_6_5
+//////////////////////////////////////////////////////////////////////////
+struct Transpose5_6_5
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 5_6_5 data; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose9_9_9_5
+//////////////////////////////////////////////////////////////////////////
+struct Transpose9_9_9_5
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 9_9_9_5 data; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose5_5_5_1
+//////////////////////////////////////////////////////////////////////////
+struct Transpose5_5_5_1
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 5_5_5_1 data; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose1_5_5_5
+//////////////////////////////////////////////////////////////////////////
+struct Transpose1_5_5_5
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 1_5_5_5 data; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;  // NOTE(review): no Transpose_16 is declared here, unlike the sibling structs
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose10_10_10_2
+//////////////////////////////////////////////////////////////////////////
+struct Transpose10_10_10_2
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 10_10_10_2 data; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose11_11_10
+//////////////////////////////////////////////////////////////////////////
+struct Transpose11_11_10
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion for packed 11_11_10 data; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose64
+//////////////////////////////////////////////////////////////////////////
+struct Transpose64
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion entry point; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose64_64
+//////////////////////////////////////////////////////////////////////////
+struct Transpose64_64
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion entry point; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose64_64_64
+//////////////////////////////////////////////////////////////////////////
+struct Transpose64_64_64
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion entry point; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose64_64_64_64
+//////////////////////////////////////////////////////////////////////////
+struct Transpose64_64_64_64
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief SOA to AOS conversion entry point; declared deleted, so any call is rejected at compile time.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+#if ENABLE_AVX512_SIMD16
+
+    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;  // SIMD16 variant, likewise deleted
+#endif
+};
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 8cf234c..8796878 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -495,9 +495,6 @@
     PA_STATE& pa,
     uint32_t workerId,
     uint32_t* pPrimData,
-#if USE_SIMD16_FRONTEND
-    uint32_t numPrims_simd8,
-#endif
     uint32_t streamIndex)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
@@ -510,7 +507,7 @@
     uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
 
     // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
-    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
+    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
 
     SWR_STREAMOUT_CONTEXT soContext = { 0 };
 
@@ -520,11 +517,7 @@
         soContext.pBuffer[i] = &state.soBuffer[i];
     }
 
-#if USE_SIMD16_FRONTEND
-    uint32_t numPrims = numPrims_simd8;
-#else
     uint32_t numPrims = pa.NumPrims();
-#endif
 
     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
     {
@@ -534,8 +527,8 @@
         // Write all entries into primitive data buffer for SOS.
         while (_BitScanForward(&slot, soMask))
         {
-            __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
-            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
+            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
+            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
             pa.AssembleSingle(paSlot, primIndex, attrib);
 
             // Attribute offset is relative offset from start of vertex.
@@ -551,6 +544,7 @@
 
                 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
             }
+
             soMask &= ~(1 << slot);
         }
 
@@ -618,13 +612,13 @@
 ///
 /// attribCount will limit the vector copies to those attribs specified
 ///
-/// note: the stride between vertexes is determinded by KNOB_NUM_ATTRIBUTES
+/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
 ///
 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
 {
     SWR_ASSERT(vertex);
     SWR_ASSERT(vertex_simd16);
-    SWR_ASSERT(attribCount <= KNOB_NUM_ATTRIBUTES);
+    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
 
     simd16vertex temp;
 
@@ -709,17 +703,13 @@
             }
             curInputByte >>= 2;
         }
-        
+
         *pCutBuffer++ = outByte;
     }
 }
 
 THREAD SWR_GS_CONTEXT tlsGsContext;
 
-#if USE_SIMD16_FRONTEND
-THREAD simd16vertex tempVertex_simd16[128];
-
-#endif
 template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
 struct GsBufferInfo
 {
@@ -797,20 +787,20 @@
     tlsGsContext.PrimitiveID = primID;
 
     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
-    simdvector attrib[MAX_ATTRIBUTES];
+    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
 
     // assemble all attributes for the input primitive
     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
     {
-        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
+        uint32_t attribSlot = pState->vertexAttribOffset + slot;
         pa.Assemble(attribSlot, attrib);
 
         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
         {
-            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
+            tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];
         }
     }
-    
+
     // assemble position
     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
@@ -818,7 +808,11 @@
         tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
     }
 
+#if USE_SIMD16_FRONTEND
+    const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
+#else
     const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
+#endif
 
     // record valid prims from the frontend to avoid over binning the newly generated
     // prims from the GS
@@ -890,7 +884,7 @@
 
             uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
             uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
-            
+
             uint32_t numAttribs = state.feNumAttributes;
 
             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
@@ -922,22 +916,10 @@
                 }
 
 #if USE_SIMD16_FRONTEND
-                // TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex
-
-                SWR_ASSERT(numEmittedVerts <= 256);
-
-                PackPairsOfSimdVertexIntoSimd16Vertex(
-                    tempVertex_simd16,
-                    reinterpret_cast<const simdvertex *>(pBase),
-                    numEmittedVerts,
-                    KNOB_NUM_ATTRIBUTES);
-
-#endif
-#if USE_SIMD16_FRONTEND
-                PA_STATE_CUT gsPa(pDC, reinterpret_cast<uint8_t *>(tempVertex_simd16), numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
 
 #else
-                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
 
 #endif
                 while (gsPa.GetNextStreamOutput())
@@ -959,95 +941,22 @@
 
                             if (HasStreamOutT::value)
                             {
-#if USE_SIMD16_FRONTEND
-                                const uint32_t numPrims = gsPa.NumPrims();
-                                const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
-                                const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
-
+#if ENABLE_AVX512_SIMD16
                                 gsPa.useAlternateOffset = false;
-                                StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_lo, stream);
-
-                                if (numPrims_hi)
-                                {
-                                    gsPa.useAlternateOffset = true;
-                                    StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_hi, stream);
-                                }
-#else
-                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
 #endif
+                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                             }
 
                             if (HasRastT::value && state.soState.streamToRasterizer == stream)
                             {
 #if USE_SIMD16_FRONTEND
-                                simd16scalari vPrimId;
-                                // pull primitiveID from the GS output if available
-                                if (state.gsState.emitsPrimitiveID)
-                                {
-                                    simd16vector primIdAttrib[3];
-                                    gsPa.Assemble_simd16(VERTEX_PRIMID_SLOT, primIdAttrib);
-                                    vPrimId = _simd16_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
-                                }
-                                else
-                                {
-                                    vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
-                                }
-
-                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
-                                simd16scalari vViewPortIdx;
-                                if (state.gsState.emitsViewportArrayIndex)
-                                {
-                                    simd16vector vpiAttrib[3];
-                                    gsPa.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
-
-                                    // OOB indices => forced to zero.
-                                    simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simd16scalari vClearMask = _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib[0].x), vNumViewports);
-                                    vpiAttrib[0].x = _simd16_and_ps(_simd16_castsi_ps(vClearMask), vpiAttrib[0].x);
-
-                                    vViewPortIdx = _simd16_castps_si(vpiAttrib[0].x);
-                                }
-                                else
-                                {
-                                    vViewPortIdx = _simd16_set1_epi32(0);
-                                }
+                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
 
                                 gsPa.useAlternateOffset = false;
-                                pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+                                pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId);
 #else
-                                simdscalari vPrimId;
-                                // pull primitiveID from the GS output if available
-                                if (state.gsState.emitsPrimitiveID)
-                                {
-                                    simdvector primIdAttrib[3];
-                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
-                                    vPrimId = _simd_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
-                                }
-                                else
-                                {
-                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
-                                }
-
-                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
-                                simdscalari vViewPortIdx;
-                                if (state.gsState.emitsViewportArrayIndex)
-                                {
-                                    simdvector vpiAttrib[3];
-                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
-
-                                    // OOB indices => forced to zero.
-                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
-                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
-
-                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
-                                }
-                                else
-                                {
-                                    vViewPortIdx = _simd_set1_epi32(0);
-                                }
-
-                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
+                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
 #endif
                             }
                         }
@@ -1224,12 +1133,12 @@
     // assemble all attributes for the input primitives
     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
     {
-        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
+        uint32_t attribSlot = tsState.vertexAttribOffset + slot;
         pa.Assemble(attribSlot, simdattrib);
 
         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
         {
-            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
+            hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
         }
     }
 
@@ -1333,6 +1242,7 @@
             dsContext.pOutputData,
             dsContext.vectorStride,
 #endif
+            SWR_VTX_NUM_SLOTS,
             tsState.numDsOutputAttribs,
             tsData.ppIndices,
             tsData.NumPrimitives,
@@ -1345,10 +1255,6 @@
             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
 
-            const uint32_t primMask = GenMask(numPrims);
-            const uint32_t primMask_lo = primMask & 255;
-            const uint32_t primMask_hi = (primMask >> 8) & 255;
-
             const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
             const simdscalari primID_hi = _simd16_extract_si(primID, 1);
@@ -1375,25 +1281,18 @@
             {
                 if (HasStreamOutT::value)
                 {
-#if USE_SIMD16_FRONTEND
+#if ENABLE_AVX512_SIMD16
                     tessPa.useAlternateOffset = false;
-                    StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_lo, 0);
-
-                    if (numPrims_hi)
-                    {
-                        tessPa.useAlternateOffset = true;
-                        StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_hi, 0);
-                    }
-#else
-                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
 #endif
+                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                 }
 
                 if (HasRastT::value)
                 {
-                    simdvector      prim[3]; // Only deal with triangles, lines, or points
 #if USE_SIMD16_FRONTEND
-                    simd16vector    prim_simd16[3];
+                    simd16vector    prim_simd16[3]; // Only deal with triangles, lines, or points
+#else
+                    simdvector      prim[3];        // Only deal with triangles, lines, or points
 #endif
                     AR_BEGIN(FEPAAssemble, pDC->drawId);
                     bool assemble =
@@ -1408,10 +1307,10 @@
                     SWR_ASSERT(pfnClipFunc);
 #if USE_SIMD16_FRONTEND
                     tessPa.useAlternateOffset = false;
-                    pfnClipFunc(pDC, tessPa, workerId, prim_simd16, primMask, primID, _simd16_set1_epi32(0));
+                    pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID);
 #else
                     pfnClipFunc(pDC, tessPa, workerId, prim,
-                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
+                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
 #endif
                 }
             }
@@ -1421,9 +1320,21 @@
         } // while (tessPa.HasWork())
     } // for (uint32_t p = 0; p < numPrims; ++p)
 
+#if USE_SIMD16_FRONTEND
+    if (gt_pTessellationThreadData->pDSOutput != nullptr)
+    {
+        AlignedFree(gt_pTessellationThreadData->pDSOutput);
+        gt_pTessellationThreadData->pDSOutput = nullptr;
+    }
+    gt_pTessellationThreadData->numDSOutputVectors = 0;
+
+#endif
     TSDestroyCtx(tsCtx);
 }
 
+THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;   // vertex storage passed to PA_FACTORY; freed and reallocated whenever a draw needs more room (presumably per-thread via THREAD - confirm)
+THREAD uint32_t gVertexStoreSize = 0;                   // current capacity of pVertexStore in bytes; 0 until the first allocation
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief FE handler for SwrDraw.
 /// @tparam IsIndexedT - Is indexed drawing enabled
@@ -1531,8 +1442,36 @@
         pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
     }
 
+    const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
+#if USE_SIMD16_FRONTEND
+    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
+#else
+    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
+#endif
+
+    SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
+
+    // Compute storage requirements for vertex store
+    // TODO: allocation needs to be rethought for better cut support
+    uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
+    uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
+
+    // grow the vertex store for the PA as necessary
+    if (gVertexStoreSize < vertexStoreSize)
+    {
+        if (pVertexStore != nullptr)
+        {
+            AlignedFree(pVertexStore);
+        }
+
+        pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
+        gVertexStoreSize = vertexStoreSize;
+
+        SWR_ASSERT(pVertexStore != nullptr);
+    }
+
     // choose primitive assembler
-    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
+    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, numVerts, state.frontendState.vsVertexSize);
     PA_STATE& pa = paFactory.GetPA();
 
 #if USE_SIMD16_FRONTEND
@@ -1690,10 +1629,6 @@
                             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
 
-                            const uint32_t primMask = GenMask(numPrims);
-                            const uint32_t primMask_lo = primMask & 255;
-                            const uint32_t primMask_hi = (primMask >> 8) & 255;
-
                             const simd16scalari primID = pa.GetPrimID(work.startPrimID);
                             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
                             const simdscalari primID_hi = _simd16_extract_si(primID, 1);
@@ -1725,19 +1660,8 @@
                                 // If streamout is enabled then stream vertices out to memory.
                                 if (HasStreamOutT::value)
                                 {
-#if 1
                                     pa.useAlternateOffset = false;
-                                    StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_lo, 0);
-
-                                    if (numPrims_hi)
-                                    {
-                                        pa.useAlternateOffset = true;
-                                        StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0);
-                                    }
-#else
-                                    pa.useAlternateOffset = false;  // StreamOut() is SIMD16-compatible..
                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
-#endif
                                 }
 
                                 if (HasRastT::value)
@@ -1745,7 +1669,7 @@
                                     SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
 
                                     pa.useAlternateOffset = false;
-                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, primMask, primID, _simd16_setzero_si());
+                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID);
                                 }
                             }
                         }
@@ -1770,11 +1694,7 @@
     }
 
 #else
-    simdvertex          vin;
     SWR_VS_CONTEXT      vsContext;
-
-    vsContext.pVin = &vin;
-
     SWR_FETCH_CONTEXT   fetchInfo = { 0 };
 
     fetchInfo.pStreams = &state.vertexBuffers[0];
@@ -1830,6 +1750,7 @@
             }
 
             simdvertex& vout = pa.GetNextVsOutput();
+            vsContext.pVin = &vout;
             vsContext.pVout = &vout;
 
             if (i < endVertex)
@@ -1837,7 +1758,7 @@
 
                 // 1. Execute FS/VS for a single SIMD.
                 AR_BEGIN(FEFetchShader, pDC->drawId);
-                state.pfnFetchFunc(fetchInfo, vin);
+                state.pfnFetchFunc(fetchInfo, vout);
                 AR_END(FEFetchShader, 0);
 
                 // forward fetch generated vertex IDs to the vertex shader
@@ -1910,7 +1831,7 @@
                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);
 
                                     pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
-                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
+                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
                                 }
                             }
                         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index a9c36b4..3d7b26d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -28,16 +28,9 @@
 ******************************************************************************/
 #pragma once
 #include "context.h"
+#include "common/simdintrin.h"
 #include <type_traits>
 
-#if ENABLE_AVX512_SIMD16
-// TODO: this belongs in state.h alongside the simdvector definition, but there is a llvm codegen issue
-struct simd16vertex
-{
-    simd16vector    attrib[KNOB_NUM_ATTRIBUTES];
-};
-
-#endif
 // Calculates the A and B coefficients for the 3 edges of the triangle
 // 
 // maths for edge equations:
@@ -71,21 +64,6 @@
 }
 
 INLINE
-void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3])
-{
-    // generate edge equations
-    // A = y0 - y1
-    // B = x1 - x0
-    vA[0] = _simd_sub_ps(vY[0], vY[1]);
-    vA[1] = _simd_sub_ps(vY[1], vY[2]);
-    vA[2] = _simd_sub_ps(vY[2], vY[0]);
-
-    vB[0] = _simd_sub_ps(vX[1], vX[0]);
-    vB[1] = _simd_sub_ps(vX[2], vX[1]);
-    vB[2] = _simd_sub_ps(vX[0], vX[2]);
-}
-
-INLINE
 void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB)
 {
     // generate edge equations
@@ -170,6 +148,7 @@
 void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
 {
     // refer to calcDeterminantInt comment for calculation explanation
+
     // A1*B2
     simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]);     // 0 0 1 1 4 4 5 5
     simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]);     // 2 2 3 3 6 6 7 7
@@ -194,8 +173,10 @@
     simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
     simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
 
-    // shuffle 0 1 4 5 -> 0 1 2 3
+    // shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3
     simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20);
+
+    // shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7
     simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31);
 
     pvDet[0] = vResultLo;
@@ -207,57 +188,38 @@
 void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet)
 {
     // refer to calcDeterminantInt comment for calculation explanation
+
     // A1*B2
+    simd16scalari vA1_lo = _simd16_unpacklo_epi32(vA[1], vA[1]);                // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
+    simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]);                // X 2 X 3 X 6 X 7 X A X B X E X F
 
-#if 1
-    // TODO: get the native SIMD16 version working..
+    simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
+    simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]);
 
-    simdscalari vA_lo[3];
-    simdscalari vA_hi[3];
-    simdscalari vB_lo[3];
-    simdscalari vB_hi[3];
-
-    for (uint32_t i = 0; i < 3; i += 1)
-    {
-        vA_lo[i] = _simd16_extract_si(vA[i], 0);
-        vA_hi[i] = _simd16_extract_si(vA[i], 1);
-        vB_lo[i] = _simd16_extract_si(vB[i], 0);
-        vB_hi[i] = _simd16_extract_si(vB[i], 1);
-    }
-
-    calcDeterminantIntVertical(vA_lo, vB_lo, reinterpret_cast<simdscalari *>(&pvDet[0]));
-    calcDeterminantIntVertical(vA_hi, vB_hi, reinterpret_cast<simdscalari *>(&pvDet[1]));
-#else
-    simd16scalari vA1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 8 8 9 9 C C D D
-    simd16scalari vA1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 A A B B E E F F
-
-    simd16scalari vB2Lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
-    simd16scalari vB2Hi = _simd16_unpackhi_epi32(vB[2], vB[2]);
-
-    simd16scalari vA1B2Lo = _simd16_mul_epi32(vA1Lo, vB2Lo);    // 0 1 4 5 8 9 C D
-    simd16scalari vA1B2Hi = _simd16_mul_epi32(vA1Hi, vB2Hi);    // 2 3 6 7 A B E F
+    simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo);                 // 0 1 4 5 8 9 C D (64b)
+    simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi);                 // 2 3 6 7 A B E F
 
     // B1*A2
-    simd16scalari vA2Lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
-    simd16scalari vA2Hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
+    simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
+    simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
 
-    simd16scalari vB1Lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
-    simd16scalari vB1Hi = _simd16_unpackhi_epi32(vB[1], vB[1]);
+    simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
+    simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]);
 
-    simd16scalari vA2B1Lo = _simd16_mul_epi32(vA2Lo, vB1Lo);
-    simd16scalari vA2B1Hi = _simd16_mul_epi32(vA2Hi, vB1Hi);
+    simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo);
+    simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi);
 
     // A1*B2 - A2*B1
-    simd16scalari detLo = _simd16_sub_epi64(vA1B2Lo, vA2B1Lo);
-    simd16scalari detHi = _simd16_sub_epi64(vA1B2Hi, vA2B1Hi);
+    simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo);               // 0 1 4 5 8 9 C D (64b)
+    simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi);               // 2 3 6 7 A B E F
 
-    // shuffle 0 1 4 5 -> 0 1 2 3
-    simd16scalari vResultLo = _simd16_permute2f128_si(detLo, detHi, 0x20);
-    simd16scalari vResultHi = _simd16_permute2f128_si(detLo, detHi, 0x31);
+    // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
+    simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44);       // 0 1 4 5 2 3 6 7 (64b)
+    simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE);       // 8 9 C D A B E F
 
-    pvDet[0] = vResultLo;
-    pvDet[1] = vResultHi;
-#endif
+    // (3, 1, 2, 0) = 11 01 10 00 = 0xD8
+    pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8);                   // 0 1 2 3 4 5 6 7 (64b)
+    pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8);                   // 8 9 A B C D E F
 }
 
 #endif
@@ -271,19 +233,6 @@
     vC  = _mm_sub_ps(vC, vCy);
 }
 
-INLINE
-void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix)
-{
-    vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00));
-    vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30));
-
-    vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11));
-    vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31));
-
-    vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22));
-    vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32));
-}
-
 template<uint32_t NumVerts>
 INLINE
 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
@@ -439,10 +388,10 @@
 #endif
 
 struct PA_STATE_BASE;  // forward decl
-void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
-void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
+void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
 #if USE_SIMD16_FRONTEND
-void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
-void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
+void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h
index 7928f5d..7ad6fe3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -39,7 +39,7 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 #define ENABLE_AVX512_SIMD16    1
-#define USE_8x2_TILE_BACKEND    0
+#define USE_8x2_TILE_BACKEND    1
 #define USE_SIMD16_FRONTEND     0
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -97,9 +97,6 @@
 // Maximum supported number of active vertex buffer streams
 #define KNOB_NUM_STREAMS                    32
 
-// Maximum supported number of attributes per vertex
-#define KNOB_NUM_ATTRIBUTES                 39
-
 // Maximum supported active viewports and scissors
 #define KNOB_NUM_VIEWPORTS_SCISSORS         16
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.cpp b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp
deleted file mode 100644
index 8b20f7a..0000000
--- a/src/gallium/drivers/swr/rasterizer/core/multisample.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file multisample.cpp
-*
-******************************************************************************/
-
-#include "multisample.h"
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosXi[1];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosYi[1];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi[2];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi[2];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi[4];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi[4];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi[8];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi[8];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi[16];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi[16];
-
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX[1];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY[1];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosX[2];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosY[2];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosX[4];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosY[4];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosX[8];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosY[8];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosX[16];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosY[16];
diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h
index 19a5a80..2ca8c1b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/multisample.h
+++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h
@@ -34,14 +34,6 @@
 typedef std::integral_constant<int, 1> SingleSampleT;
 
 INLINE
-uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount)
-{
-    static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_COUNT] {1, 2, 4, 8, 16};
-    assert(sampleCount < SWR_MULTISAMPLE_TYPE_COUNT);
-    return sampleCountLUT[sampleCount];
-}
-
-INLINE
 SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples)
 {
     switch(numSamples)
@@ -302,4 +294,4 @@
         }
     }
     return !bIsStandard;
-}
\ No newline at end of file
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index 781c094..d2e6109 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -67,9 +67,10 @@
     typedef         simdscalari         SIMDSCALARI;
 
 #endif
-    DRAW_CONTEXT *pDC{ nullptr };              // draw context
-    uint8_t* pStreamBase{ nullptr };           // vertex stream
-    uint32_t streamSizeInVerts{ 0 };     // total size of the input stream in verts
+    DRAW_CONTEXT *pDC{ nullptr };       // draw context
+    uint8_t* pStreamBase{ nullptr };    // vertex stream
+    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
+    uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units
 
     // The topology the binner will use. In some cases the FE changes the topology from the api state.
     PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
@@ -79,8 +80,8 @@
 
 #endif
     PA_STATE() {}
-    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
-        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
+    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
+        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {} // stores the arguments verbatim; in_vertexStride is in simdvector units
 
     virtual bool HasWork() = 0;
     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
@@ -91,7 +92,7 @@
 #if ENABLE_AVX512_SIMD16
     virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
 #endif
-    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
+    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
     virtual bool NextPrim() = 0;
     virtual SIMDVERTEX& GetNextVsOutput() = 0;
     virtual bool GetNextStreamOutput() = 0;
@@ -119,8 +120,6 @@
 // cuts
 struct PA_STATE_OPT : public PA_STATE
 {
-    SIMDVERTEX leadingVertex;            // For tri-fan
-
     uint32_t numPrims{ 0 };              // Total number of primitives for draw.
     uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
 
@@ -128,7 +127,7 @@
 
     uint32_t cur{ 0 };                   // index to current VS output.
     uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
-    uint32_t first{ 0 };                 // index to first VS output. Used for trifan.
+    const uint32_t first{ 0 };           // index to first VS output. Used for tri fan and line loop.
 
     uint32_t counter{ 0 };               // state counter
     bool reset{ false };                 // reset state
@@ -136,11 +135,11 @@
     uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
     SIMDSCALARI primID;
 
-    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
+    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 #if ENABLE_AVX512_SIMD16
-    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]);
+    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
     PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
 #if ENABLE_AVX512_SIMD16
@@ -166,7 +165,7 @@
     
     PA_STATE_OPT() {}
     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
-        bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
+        uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
 
     bool HasWork()
     {
@@ -175,15 +174,19 @@
 
     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
     {
-        simdvertex* pVertex = (simdvertex*)pStreamBase;
-        return pVertex[index].attrib[slot];
+        SWR_ASSERT(slot < vertexStride);                                // slot must fall inside a single vertex
+        uint32_t offset = index * vertexStride + slot;                  // vertex 'index' begins index * vertexStride simdvectors into the stream
+        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];    // the stream is addressed as one flat simdvector array
+        return vertexSlot;
     }
 
 #if ENABLE_AVX512_SIMD16
     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
     {
-        simd16vertex* pVertex = (simd16vertex*)pStreamBase;
-        return pVertex[index].attrib[slot];
+        SWR_ASSERT(slot < vertexStride);                                    // slot must fall inside a single vertex
+        uint32_t offset = index * vertexStride + slot;                      // vertex 'index' begins index * vertexStride simd16vectors into the stream
+        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];    // the stream is addressed as one flat simd16vector array
+        return vertexSlot;
     }
 
 #endif
@@ -202,7 +205,7 @@
 
 #endif
     // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
-    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) // dispatches through pfnPaSingleFunc, passing this PA as the state argument
     {
         return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
     }
@@ -245,13 +248,28 @@
 
     SIMDVERTEX& GetNextVsOutput()
     {
-        // increment cur and prev indices
-        const uint32_t numSimdVerts = this->streamSizeInVerts / SIMD_WIDTH;
-        this->prev = this->cur;  // prev is undefined for first state.
-        this->cur = this->counter % numSimdVerts;
+        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH; // number of SIMD-wide vertex slots the stream holds
 
-        SIMDVERTEX* pVertex = (SIMDVERTEX*)pStreamBase;
-        return pVertex[this->cur];
+        // increment cur and prev indices
+        if (counter < numSimdVerts) // unused slots remain: take slot number 'counter'
+        {
+            // prev undefined for first state
+            prev = cur;
+            cur = counter;
+        }
+        else
+        {
+            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
+            uint32_t temp = prev;
+
+            prev = cur;
+            cur = temp;
+        }
+
+        SWR_ASSERT(cur < numSimdVerts);
+        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride]; // slot 'cur' starts cur * vertexStride SIMDVECTORs into the stream
+
+        return *(SIMDVERTEX*)pVertex; // hand the slot back viewed as a SIMDVERTEX
     }
 
     SIMDMASK& GetNextVsIndices()
@@ -313,11 +331,13 @@
 
 #endif
         this->pfnPaFunc = this->pfnPaFuncReset;
+#if ENABLE_AVX512_SIMD16
+        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
+#endif
         this->numPrimsComplete = 0;
         this->numSimdPrims = 0;
         this->cur = 0;
         this->prev = 0;
-        this->first = 0;
         this->counter = 0;
         this->reset = false;
     }
@@ -409,9 +429,9 @@
     PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
 
     PA_STATE_CUT() {}
-    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, SIMDMASK* in_pIndices, uint32_t in_numVerts,
+    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
         uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
-        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
+        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
     {
         numVerts = in_streamSizeInVerts;
         numAttribs = in_numAttribs;
@@ -466,7 +486,9 @@
         uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
         this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
         this->needOffsets = true;
-        return ((SIMDVERTEX*)pStreamBase)[vertexIndex];
+        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
+
+        return *(SIMDVERTEX*)pVertex;
     }
 
     SIMDMASK& GetNextVsIndices()
@@ -621,16 +643,17 @@
     {
         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
         {
+            uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
             SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
 
             // step to simdvertex batch
             const uint32_t simdShift = SIMD_WIDTH_LOG2;
 #if USE_SIMD16_FRONTEND
             SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
-            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(sizeof(SIMDVERTEX)));
+            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
 #else
             SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
-            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(SIMDVERTEX)));
+            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
 #endif
 
             // step to index
@@ -645,7 +668,7 @@
         }
     }
 
-    bool Assemble(uint32_t slot, simdvector verts[])
+    bool Assemble(uint32_t slot, simdvector *verts)
     {
         // process any outstanding verts
         ProcessVerts();
@@ -741,7 +764,7 @@
     }
 
 #endif
-    void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
+    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
     {
         // move to slot
         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
@@ -1115,12 +1138,13 @@
         DRAW_CONTEXT *in_pDC,
         const SIMDSCALAR* in_pVertData,
         uint32_t in_attributeStrideInVectors,
+        uint32_t in_vertexStride,
         uint32_t in_numAttributes,
         uint32_t* (&in_ppIndices)[3],
         uint32_t in_numPrims,
         PRIMITIVE_TOPOLOGY in_binTopology) :
 
-        PA_STATE(in_pDC, nullptr, 0),
+        PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
         m_pVertexData(in_pVertData),
         m_attributeStrideInVectors(in_attributeStrideInVectors),
         m_numAttributes(in_numAttributes),
@@ -1226,7 +1250,7 @@
                     _simd16_setzero_ps(),
                     pBase,
                     indices,
-                    mask,
+                    _simd16_castsi_ps(mask),
                     4 /* gcc doesn't like sizeof(float) */);
 
                 verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
@@ -1236,7 +1260,7 @@
                     pBase,
                     indices,
                     _simd_castsi_ps(mask),
-                    4 /* gcc doesn't like sizeof(float) */);
+                    4); // gcc doesn't like sizeof(float)
 #endif
                 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
             }
@@ -1275,7 +1299,7 @@
                     _simd16_setzero_ps(),
                     pBase,
                     indices,
-                    mask,
+                    _simd16_castsi_ps(mask),
                     4 /* gcc doesn't like sizeof(float) */);
 #else
                 simdscalar temp = _simd_mask_i32gather_ps(
@@ -1294,7 +1318,7 @@
     }
 
 #endif
-    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
     {
         SWR_ASSERT(slot < m_numAttributes);
         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
@@ -1390,7 +1414,7 @@
 template <typename IsIndexedT, typename IsCutIndexEnabledT>
 struct PA_FACTORY
 {
-    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
+    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
     {
 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
         const API_STATE& state = GetApiState(pDC);
@@ -1406,15 +1430,15 @@
             memset(&indexStore, 0, sizeof(indexStore));
             uint32_t numAttribs = state.feNumAttributes;
 
-            new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH,
-                &this->indexStore[0], numVerts, numAttribs, state.topology, false);
+            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
+                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
             cutPA = true;
         }
         else
 #endif
         {
             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
-            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH, false);
+            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
             cutPA = false;
         }
 
@@ -1436,10 +1460,10 @@
 
     PA_STATE_OPT paOpt;
     PA_STATE_CUT paCut;
+
     bool cutPA{ false };
 
     PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
 
-    PA_STATE::SIMDVERTEX    vertexStore[MAX_NUM_VERTS_PER_PRIM];
     PA_STATE::SIMDMASK      indexStore[MAX_NUM_VERTS_PER_PRIM];
 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
index 6a24963..e53389b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
@@ -34,103 +34,103 @@
 
 #if (KNOB_SIMD_WIDTH == 8)
 
-INLINE __m128 swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 }
 
-INLINE __m128 swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 }
 
-INLINE __m128 swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 }
 
-INLINE __m128 swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 }
 
-INLINE __m128 swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 }
 
-INLINE __m128 swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 }
 
-INLINE __m128 swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 }
 
-INLINE __m128 swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 }
 
-INLINE __m128 swizzleLane0(const simdvector &v)
+INLINE simd4scalar swizzleLane0(const simdvector &v)
 {
     return swizzleLane0(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane1(const simdvector &v)
+INLINE simd4scalar swizzleLane1(const simdvector &v)
 {
     return swizzleLane1(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane2(const simdvector &v)
+INLINE simd4scalar swizzleLane2(const simdvector &v)
 {
     return swizzleLane2(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane3(const simdvector &v)
+INLINE simd4scalar swizzleLane3(const simdvector &v)
 {
     return swizzleLane3(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane4(const simdvector &v)
+INLINE simd4scalar swizzleLane4(const simdvector &v)
 {
     return swizzleLane4(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane5(const simdvector &v)
+INLINE simd4scalar swizzleLane5(const simdvector &v)
 {
     return swizzleLane5(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane6(const simdvector &v)
+INLINE simd4scalar swizzleLane6(const simdvector &v)
 {
     return swizzleLane6(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane7(const simdvector &v)
+INLINE simd4scalar swizzleLane7(const simdvector &v)
 {
     return swizzleLane7(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
+INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
 {
     switch (lane)
     {
@@ -156,87 +156,87 @@
 }
 
 #if ENABLE_AVX512_SIMD16
-INLINE __m128 swizzleLane0(const simd16vector &v)
+INLINE simd4scalar swizzleLane0(const simd16vector &v)
 {
     return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane1(const simd16vector &v)
+INLINE simd4scalar swizzleLane1(const simd16vector &v)
 {
     return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane2(const simd16vector &v)
+INLINE simd4scalar swizzleLane2(const simd16vector &v)
 {
     return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane3(const simd16vector &v)
+INLINE simd4scalar swizzleLane3(const simd16vector &v)
 {
     return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane4(const simd16vector &v)
+INLINE simd4scalar swizzleLane4(const simd16vector &v)
 {
     return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane5(const simd16vector &v)
+INLINE simd4scalar swizzleLane5(const simd16vector &v)
 {
     return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane6(const simd16vector &v)
+INLINE simd4scalar swizzleLane6(const simd16vector &v)
 {
     return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane7(const simd16vector &v)
+INLINE simd4scalar swizzleLane7(const simd16vector &v)
 {
     return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane8(const simd16vector &v)
+INLINE simd4scalar swizzleLane8(const simd16vector &v)
 {
     return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLane9(const simd16vector &v)
+INLINE simd4scalar swizzleLane9(const simd16vector &v)
 {
     return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneA(const simd16vector &v)
+INLINE simd4scalar swizzleLaneA(const simd16vector &v)
 {
     return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneB(const simd16vector &v)
+INLINE simd4scalar swizzleLaneB(const simd16vector &v)
 {
     return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneC(const simd16vector &v)
+INLINE simd4scalar swizzleLaneC(const simd16vector &v)
 {
     return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneD(const simd16vector &v)
+INLINE simd4scalar swizzleLaneD(const simd16vector &v)
 {
     return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneE(const simd16vector &v)
+INLINE simd4scalar swizzleLaneE(const simd16vector &v)
 {
     return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneF(const simd16vector &v)
+INLINE simd4scalar swizzleLaneF(const simd16vector &v)
 {
     return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneN(const simd16vector &v, int lane)
+INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
 {
     switch (lane)
     {
@@ -286,7 +286,7 @@
 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -294,7 +294,7 @@
 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -302,7 +302,7 @@
 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -310,7 +310,7 @@
 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -318,7 +318,7 @@
 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -326,7 +326,7 @@
 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -334,13 +334,13 @@
 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 #if ENABLE_AVX512_SIMD16
 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -350,10 +350,10 @@
 bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 template <uint32_t TotalControlPoints>
-void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
@@ -788,7 +788,7 @@
 }
 
 #endif
-void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@@ -1057,7 +1057,7 @@
 }
 
 #endif
-void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
@@ -1213,10 +1213,6 @@
 
 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
-    // store off leading vertex for attributes
-    PA_STATE_OPT::SIMDVERTEX* pVertex = (PA_STATE_OPT::SIMDVERTEX*)pa.pStreamBase;
-    pa.leadingVertex = pVertex[pa.cur];
-
     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
     return false;    // Not enough vertices to assemble 8 triangles.
 }
@@ -1228,11 +1224,7 @@
     simdvector a;
     simdvector b;
 
-#if 1
     const simd16vector &leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
-#else
-    const simd16vector &leadvert_16 = pa.leadingVertex.attrib[slot];
-#endif
 
     if (!pa.useAlternateOffset)
     {
@@ -1260,10 +1252,9 @@
     }
 
 #else
-    simdvector &leadVert = pa.leadingVertex.attrib[slot];
-
-    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
-    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
+    const simdvector &leadVert = PaGetSimdVector(pa, pa.first, slot);
+    const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
+    const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
 
 #endif
     simdscalar s;
@@ -1301,23 +1292,7 @@
 
 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
-#if USE_SIMD16_FRONTEND
-#if 1
     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
-#else
-    const simd16vector &a = pa.leadingVertex.attrib[slot];
-#endif
-#else
-    simd16vector a;
-
-    {
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_insert_ps(_simd16_setzero_ps(), pa.leadingVertex.attrib[slot][i], 0);
-        }
-    }
-
-#endif
     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
     const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
@@ -1350,14 +1325,10 @@
 }
 
 #endif
-void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-#if 1
     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
-#else
-    const simd16vector &a = pa.leadingVertex.attrib[slot];
-#endif
     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
     const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
@@ -1393,7 +1364,7 @@
         verts[2] = swizzleLaneN(c, primIndex - 14);
     }
 #else
-    const simdvector &a = pa.leadingVertex.attrib[slot];
+    const simdvector &a = PaGetSimdVector(pa, pa.first, slot);
     const simdvector &b = PaGetSimdVector(pa, pa.prev, slot);
     const simdvector &c = PaGetSimdVector(pa, pa.cur, slot);
 
@@ -1520,7 +1491,7 @@
 }
 
 #endif
-void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@@ -1770,7 +1741,7 @@
 }
 
 #endif
-void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
     PaLineStripSingle0(pa, slot, primIndex, verts);
 
@@ -1884,11 +1855,11 @@
 }
 
 #endif
-void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
+    const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
 
     if (pa.useAlternateOffset)
     {
@@ -1963,8 +1934,8 @@
         break;
     }
 #else
-    const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
-    const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
+    const simdvector &a = PaGetSimdVector(pa, 0, slot);
+    const simdvector &b = PaGetSimdVector(pa, 1, slot);
 
     switch (primIndex)
     {
@@ -2104,7 +2075,7 @@
 }
 
 #endif
-void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
@@ -2228,9 +2199,8 @@
 {
 #if USE_SIMD16_FRONTEND
     simdvector a;
-    simdvector b;
 
-    const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
 
     if (!pa.useAlternateOffset)
     {
@@ -2248,7 +2218,7 @@
     }
 
 #else
-    simdvector &a = PaGetSimdVector(pa, pa.cur, slot);
+    simdvector &a = PaGetSimdVector(pa, 0, slot);
 
 #endif
     verts[0] = a;  // points only have 1 vertex.
@@ -2269,10 +2239,10 @@
 }
 
 #endif
-void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
 
     if (pa.useAlternateOffset)
     {
@@ -2281,7 +2251,7 @@
 
     verts[0] = swizzleLaneN(a, primIndex);
 #else
-    const simdvector &a = PaGetSimdVector(pa, pa.cur, slot);
+    const simdvector &a = PaGetSimdVector(pa, 0, slot);
 
     verts[0] = swizzleLaneN(a, primIndex);
 #endif
@@ -2559,7 +2529,7 @@
     PA_STATE_OPT& pa,
     uint32_t slot,
     uint32_t primIndex,
-    __m128 verts[])
+    simd4scalar verts[])
 {
     // We have 12 simdscalars contained within 3 simdvectors which
     // hold at least 8 triangles worth of data. We want to assemble a single
@@ -2618,7 +2588,8 @@
 }
 
 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, 
-    bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), 
+    uint32_t in_vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : 
+    PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), 
     cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
 {
     const API_STATE& state = GetApiState(pDC);
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index af54779..a3ff557 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -30,974 +30,35 @@
 #include <algorithm>
 
 #include "rasterizer.h"
+#include "backends/gen_rasterizer.hpp"
 #include "rdtsc_core.h"
 #include "backend.h"
 #include "utils.h"
 #include "frontend.h"
 #include "tilemgr.h"
 #include "memory/tilingtraits.h"
+#include "rasterizer_impl.h"
 
-template <uint32_t numSamples = 1>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
-template <typename RT>
-void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
-template <typename RT>
-void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
+PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
 
-#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
-const __m256d gMaskToVecpd[] =
-{
-    MASKTOVEC(0, 0, 0, 0),
-    MASKTOVEC(0, 0, 0, 1),
-    MASKTOVEC(0, 0, 1, 0),
-    MASKTOVEC(0, 0, 1, 1),
-    MASKTOVEC(0, 1, 0, 0),
-    MASKTOVEC(0, 1, 0, 1),
-    MASKTOVEC(0, 1, 1, 0),
-    MASKTOVEC(0, 1, 1, 1),
-    MASKTOVEC(1, 0, 0, 0),
-    MASKTOVEC(1, 0, 0, 1),
-    MASKTOVEC(1, 0, 1, 0),
-    MASKTOVEC(1, 0, 1, 1),
-    MASKTOVEC(1, 1, 0, 0),
-    MASKTOVEC(1, 1, 0, 1),
-    MASKTOVEC(1, 1, 1, 0),
-    MASKTOVEC(1, 1, 1, 1),
-};
-
-struct POS
-{
-    int32_t x, y;
-};
-
-struct EDGE
-{
-    double a, b;                // a, b edge coefficients in fix8
-    double stepQuadX;           // step to adjacent horizontal quad in fix16
-    double stepQuadY;           // step to adjacent vertical quad in fix16
-    double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
-    double stepRasterTileY;     // step to adjacent vertical raster tile in fix16
-
-    __m256d vQuadOffsets;       // offsets for 4 samples of a quad
-    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief rasterize a raster tile partially covered by the triangle
-/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
-/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C)
-/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
-///        Used to step between quads when sweeping over the raster tile.
-template<uint32_t NumEdges, typename EdgeMaskT>
-INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
-{
-    uint64_t coverageMask = 0;
-
-    __m256d vEdges[NumEdges];
-    __m256d vStepX[NumEdges];
-    __m256d vStepY[NumEdges];
-
-    for (uint32_t e = 0; e < NumEdges; ++e)
-    {
-        // Step to the pixel sample locations of the 1st quad
-        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
-
-        // compute step to next quad (mul by 2 in x and y direction)
-        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
-        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
-    }
-
-    // fast unrolled version for 8x8 tile
-#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
-    int edgeMask[NumEdges];
-    uint64_t mask;
-
-    auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
-    auto update_lambda = [&](int e){mask &= edgeMask[e];};
-    auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
-    auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
-    auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
-
-// evaluate which pixels in the quad are covered
-#define EVAL \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
-
-    // update coverage mask
-    // if edge 0 is degenerate and will be skipped; init the mask
-#define UPDATE_MASK(bit) \
-            if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
-                mask = 0xf;\
-            }\
-            else{\
-                mask = edgeMask[0]; \
-            }\
-            UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
-            coverageMask |= (mask << bit);
-
-    // step in the +x direction to the next quad 
-#define INCX \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
-
-    // step in the +y direction to the next quad 
-#define INCY \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
-
-    // step in the -x direction to the next quad 
-#define DECX \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
-
-    // sweep 2x2 quad back and forth through the raster tile, 
-    // computing coverage masks for the entire tile
-
-    // raster tile
-    // 0  1  2  3  4  5  6  7 
-    // x  x
-    // x  x ------------------>  
-    //                   x  x  |
-    // <-----------------x  x  V
-    // ..
-
-    // row 0
-    EVAL;
-    UPDATE_MASK(0);
-    INCX;
-    EVAL;
-    UPDATE_MASK(4);
-    INCX;
-    EVAL;
-    UPDATE_MASK(8);
-    INCX;
-    EVAL;
-    UPDATE_MASK(12);
-    INCY;
-
-    //row 1
-    EVAL;
-    UPDATE_MASK(28);
-    DECX;
-    EVAL;
-    UPDATE_MASK(24);
-    DECX;
-    EVAL;
-    UPDATE_MASK(20);
-    DECX;
-    EVAL;
-    UPDATE_MASK(16);
-    INCY;
-
-    // row 2
-    EVAL;
-    UPDATE_MASK(32);
-    INCX;
-    EVAL;
-    UPDATE_MASK(36);
-    INCX;
-    EVAL;
-    UPDATE_MASK(40);
-    INCX;
-    EVAL;
-    UPDATE_MASK(44);
-    INCY;
-
-    // row 3
-    EVAL;
-    UPDATE_MASK(60);
-    DECX;
-    EVAL;
-    UPDATE_MASK(56);
-    DECX;
-    EVAL;
-    UPDATE_MASK(52);
-    DECX;
-    EVAL;
-    UPDATE_MASK(48);
-#else
-    uint32_t bit = 0;
-    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
-    {
-        __m256d vStartOfRowEdge[NumEdges];
-        for (uint32_t e = 0; e < NumEdges; ++e)
-        {
-            vStartOfRowEdge[e] = vEdges[e];
-        }
-
-        for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
-        {
-            int edgeMask[NumEdges];
-            for (uint32_t e = 0; e < NumEdges; ++e)
-            {
-                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
-            }
-
-            uint64_t mask = edgeMask[0];
-            for (uint32_t e = 1; e < NumEdges; ++e)
-            {
-                mask &= edgeMask[e];
-            }
-            coverageMask |= (mask << bit);
-
-            // step to the next pixel in the x
-            for (uint32_t e = 0; e < NumEdges; ++e)
-            {
-                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
-            }
-            bit+=4;
-        }
-
-        // step to the next row
-        for (uint32_t e = 0; e < NumEdges; ++e)
-        {
-            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
-        }
-    }
-#endif
-    return coverageMask;
-
-}
-// Top left rule:
-// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
-// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
-// Top left: a sample is in if it is a top or left edge.
-// Out: !(horizontal && above) = !horizontal && below
-// Out: !horizontal && left = !(!horizontal && left) = horizontal and right 
-INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) 
-{
-    // if vA < 0, vC--
-    // if vA == 0 && vB < 0, vC--
-
-    __m256d vEdgeOut = vEdge;
-    __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
-
-    // if vA < 0 (line is not horizontal and below)
-    int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
-
-    // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
-    __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
-    int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
-    msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
-
-    // if either of these are true and we're on the line (edge == 0), bump it outside the line
-    vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief calculates difference in precision between the result of manh
-/// calculation and the edge precision, based on compile time trait values
-template<typename RT>
-constexpr int64_t ManhToEdgePrecisionAdjust()
-{
-    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
-                  "Inadequate precision of result of manh calculation ");
-    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct adjustEdgeConservative
-/// @brief Primary template definition used for partially specializing 
-/// the adjustEdgeConservative function. This struct should never
-/// be instantiated.
-/// @tparam RT: rasterizer traits
-/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
-template <typename RT, typename ConservativeEdgeOffsetT>
-struct adjustEdgeConservative
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs calculations to adjust each edge of a triangle away
-    /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-    /// direction. 
-    ///
-    /// Uncertainty regions arise from fixed point rounding, which
-    /// can snap a vertex +/- by min fixed point value.
-    /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
-    /// This allows the rasterizer to test for coverage only at the pixel center, 
-    /// instead of having to test individual pixel corners for conservative coverage
-    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
-    {
-        // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away 
-        // from the pixel center (in the direction of the edge normal A/B)
-
-        // edge = Ax + Bx + C - (manh/e)
-        // manh = manhattan distance = abs(A) + abs(B)
-        // e = absolute rounding error from snapping from float to fixed point precision
-
-        // 'fixed point' multiply (in double to be avx1 friendly) 
-        // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
-        __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
-        __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
-                                     _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
-
-        static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
-                      "Inadequate precision of result of manh calculation ");
-
-        // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
-        // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
-        manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
-
-        // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
-        // this allows the rasterizer to do a single conservative coverage test to see if the primitive
-        // intersects the pixel at all
-        vEdge = _mm256_sub_pd(vEdge, manh);
-    };
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief adjustEdgeConservative specialization where no edge offset is needed
-template <typename RT>
-struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
-{
-    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief calculates the distance a degenerate BBox needs to be adjusted 
-/// for conservative rast based on compile time trait values
-template<typename RT>
-constexpr int64_t ConservativeScissorOffset()
-{
-    static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
-    // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
-    typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
-    // 1/2 pixel edge offset + conservative offset - degenerateTriangle
-    return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Performs calculations to adjust each a vector of evaluated edges out
-/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction. 
-template <typename RT>
-INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
-{
-    int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
-    int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
-    vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Performs calculations to adjust each a scalar evaluated edge out
-/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction. 
-template <typename RT, typename OffsetT>
-INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
-{
-    int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
-    int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
-    return (Edge - manh);
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Perform any needed adjustments to evaluated triangle edges
-template <typename RT, typename EdgeOffsetT>
-struct adjustEdgesFix16
-{
-    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
-    {
-        static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
-                      "Edge equation expected to be in x.16 fixed point");
-
-        static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
-
-        // need to apply any edge offsets before applying the top-left rule
-        adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
-
-        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Perform top left adjustments to evaluated triangle edges
-template <typename RT>
-struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
-{
-    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
-    {
-        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
-    }
-};
-
-// max(abs(dz/dx), abs(dz,dy)
-INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
-{
-    /*
-    // evaluate i,j at (0,0)
-    float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
-    float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
-
-    // evaluate i,j at (1,0)
-    float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
-    float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
-
-    // compute dz/dx
-    float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
-    float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
-    float dzdx = abs(d10 - d00);
-
-    // evaluate i,j at (0,1)
-    float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
-    float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
-
-    float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
-    float dzdy = abs(d01 - d00);
-    */
-
-    // optimized version of above
-    float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
-    float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
-
-    return std::max(dzdx, dzdy);
-}
-
-INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
-{
-    if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
-    {
-        return (1.0f / (1 << 24));
-    }
-    else if (pState->depthFormat == R16_UNORM)
-    {
-        return (1.0f / (1 << 16));
-    }
-    else
-    {
-        SWR_ASSERT(pState->depthFormat == R32_FLOAT);
-
-        // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
-        float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
-        uint32_t zMaxInt = *(uint32_t*)&zMax;
-        zMaxInt &= 0x7f800000;
-        zMax = *(float*)&zMaxInt;
-
-        return zMax * (1.0f / (1 << 23));
-    }
-}
-
-INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
-{
-    if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
-    {
-        return 0.0f;
-    }
-
-    float scale = pState->slopeScaledDepthBias;
-    if (scale != 0.0f)
-    {
-        scale *= ComputeMaxDepthSlope(pTri);
-    }
-
-    float bias = pState->depthBias;
-    if (!pState->depthBiasPreAdjusted)
-    {
-        bias *= ComputeBiasFactor(pState, pTri, z);
-    }
-    bias += scale;
-
-    if (pState->depthBiasClamp > 0.0f)
-    {
-        bias = std::min(bias, pState->depthBiasClamp);
-    }
-    else if (pState->depthBiasClamp < 0.0f)
-    {
-        bias = std::max(bias, pState->depthBiasClamp);
-    }
-
-    return bias;
-}
-
-// Prevent DCE by writing coverage mask from rasterizer to volatile
-#if KNOB_ENABLE_TOSS_POINTS
-__declspec(thread) volatile uint64_t gToss;
-#endif
-
-static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
-// try to avoid _chkstk insertions; make this thread local
-static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
-
-INLINE
-void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
-{
-    edge.a = a;
-    edge.b = b;
-
-    // compute constant steps to adjacent quads
-    edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
-    edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
-
-    // compute constant steps to adjacent raster tiles
-    edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
-    edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
-
-    // compute quad offsets
-    const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
-    const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
-
-    __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
-    __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
-    edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
-
-    // compute raster tile offsets
-    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
-    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
-
-    __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
-    __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
-    edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
-}
-
-INLINE
-void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
-{
-    ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary template definition used for partially specializing 
-/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel 
-/// corner to sample position, and test for coverage
-/// @tparam sampleCount: multisample count
-template <typename NumSamplesT>
-INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
-                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
-{
-    __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
-    // evaluate edge equations at the tile multisample bounding box
-    vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
-    vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
-    vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
-    mask0 = _mm256_movemask_pd(vSampleBboxTest0);
-    mask1 = _mm256_movemask_pd(vSampleBboxTest1);
-    mask2 = _mm256_movemask_pd(vSampleBboxTest2);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
-/// when only rasterizing a single coverage test point
-template <>
-INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
-                                           int32_t &mask0, int32_t &mask1, int32_t &mask2)
-{
-    mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
-    mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
-    mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct ComputeScissorEdges
-/// @brief Primary template definition. Allows the function to be generically
-/// called. When paired with below specializations, will result in an empty 
-/// inlined function if scissor is not enabled
-/// @tparam RasterScissorEdgesT: is scissor enabled?
-/// @tparam IsConservativeT: is conservative rast enabled?
-/// @tparam RT: rasterizer traits
-template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
-struct ComputeScissorEdges
-{
-    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 
-                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial 
-/// specialization. Instantiated when conservative rast and scissor are enabled
-template <typename RT>
-struct ComputeScissorEdges<std::true_type, std::true_type, RT>
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, 
-    /// evaluate edge equations and offset them away from pixel center.
-    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
-                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
-    {
-        // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
-        SWR_RECT scissor;
-        scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
-        scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
-        scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
-        scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
-
-        POS topLeft{scissor.xmin, scissor.ymin};
-        POS bottomLeft{scissor.xmin, scissor.ymax};
-        POS topRight{scissor.xmax, scissor.ymin};
-        POS bottomRight{scissor.xmax, scissor.ymax};
-
-        // construct 4 scissor edges in ccw direction
-        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
-        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
-        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
-        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
-
-        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
-        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
-        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
-        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
-
-        // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
-        adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
-        adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
-        adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
-        adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
-
-        // Upper left rule for scissor
-        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
-        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial 
-/// specialization. Instantiated when scissor is enabled and conservative rast
-/// is disabled.
-template <typename RT>
-struct ComputeScissorEdges<std::true_type, std::false_type, RT>
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Compute scissor edge vectors and evaluate edge equations
-    INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
-                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
-    {
-        const SWR_RECT &scissor = scissorBBox;
-        POS topLeft{scissor.xmin, scissor.ymin};
-        POS bottomLeft{scissor.xmin, scissor.ymax};
-        POS topRight{scissor.xmax, scissor.ymin};
-        POS bottomRight{scissor.xmax, scissor.ymax};
-
-        // construct 4 scissor edges in ccw direction
-        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
-        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
-        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
-        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
-
-        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
-        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
-        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
-        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
-
-        // Upper left rule for scissor
-        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
-        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for TrivialRejectTest. Should
-/// never be called, but TemplateUnroller instantiates a few unused values,
-/// so it calls a runtime assert instead of a static_assert.
-template <typename ValidEdgeMaskT>
-INLINE bool TrivialRejectTest(const int, const int, const int)
-{
-    SWR_INVALID("Primary templated function should never be called");
-    return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
-/// and edge 1 for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
-{
-    return (!(mask0 && mask1)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
-/// and edge 2 for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
-{
-    return (!(mask0 && mask2)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
-/// and edge 2 for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
-{
-    return (!(mask1 && mask2)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
-/// primitive edges for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
-{
-    return (!(mask0 && mask1 && mask2)) ? true : false;;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
-/// point, so return false and rasterize against conservative BBox
-template <>
-INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
-{
-    return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for TrivialAcceptTest. Always returns
-/// false, since it will only be called for degenerate tris, and as such 
-/// will never cover the entire raster tile
-template <typename ScissorEnableT>
-INLINE bool TrivialAcceptTest(const int, const int, const int)
-{
-    return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
-/// edge masks for a fully covered raster tile
-template <>
-INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
-{
-    return ((mask0 & mask1 & mask2) == 0xf);
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for GenerateSVInnerCoverage. Results
-/// in an empty function call if SVInnerCoverage isn't requested
-template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
-struct GenerateSVInnerCoverage
-{
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of GenerateSVInnerCoverage where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated 
-/// edge values from OuterConservative to InnerConservative and rasterizes.
-template <typename RT>
-struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
-{
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
-    {
-        SWR_CONTEXT *pContext = pDC->pContext;
-
-        double startQuadEdgesAdj[RT::NumEdgesT::value];
-        for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-        {
-            startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
-        }
-
-        // not trivial accept or reject, must rasterize full tile
-        AR_BEGIN(BERasterizePartial, pDC->drawId);
-        innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
-        AR_END(BERasterizePartial, 0);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
-/// in an empty function call if SVInnerCoverage isn't requested
-template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
-struct UpdateEdgeMasksInnerConservative
-{
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
-                                           const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges 
-/// evaluated at raster tile corners to inner conservative position and 
-/// updates edge masks
-template <typename RT>
-struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
-{
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
-                                           const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
-    {
-        __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
-
-        // instead of keeping 2 copies of evaluated edges around, just compensate for the outer 
-        // conservative evaluated edge when adjusting the edge in for inner conservative tests
-        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
-        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
-        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
-
-        UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage 
-/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot 
-/// cover an entire raster tile, set mask0 to 0 to force it down the
-/// rastierizePartialTile path
-template <typename RT, typename ValidEdgeMaskT>
-struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
-{
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
-                                   const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
-    {
-        // set one mask to zero to force the triangle down the rastierizePartialTile path
-        mask0 = 0;
-    }
-};
-
-template <typename RT>
-void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
+void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
-    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
+    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
 #if KNOB_ENABLE_TOSS_POINTS
     if (KNOB_TOSS_BIN_TRIS)
     {
         return;
     }
 #endif
-    AR_BEGIN(BERasterizeTriangle, pDC->drawId);
-    AR_BEGIN(BETriangleSetup, pDC->drawId);
+
+    // bloat line to two tris and call the triangle rasterizer twice
+    AR_BEGIN(BERasterizeLine, pDC->drawId);
 
     const API_STATE &state = GetApiState(pDC);
     const SWR_RASTSTATE &rastState = state.rastState;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
 
-    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
-    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
-
-    __m128 vX, vY, vZ, vRecipW;
-    
-    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
-    // eg: vX = [x0 x1 x2 dc]
-    vX = _mm_load_ps(workDesc.pTriBuffer);
-    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
-    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
-    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
-
-    // convert to fixed point
-    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
-    __m128i vXi = fpToFixedPoint(vX);
-    __m128i vYi = fpToFixedPoint(vY);
-
-    // quantize floating point position to fixed point precision
-    // to prevent attribute creep around the triangle vertices
-    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
-    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
-
-    // triangle setup - A and B edge equation coefs
-    __m128 vA, vB;
-    triangleSetupAB(vX, vY, vA, vB);
-
-    __m128i vAi, vBi;
-    triangleSetupABInt(vXi, vYi, vAi, vBi);
-    
-    // determinant
-    float det = calcDeterminantInt(vAi, vBi);
-
-    // Verts in Pixel Coordinate Space at this point
-    // Det > 0 = CW winding order 
-    // Convert CW triangles to CCW
-    if (det > 0.0)
-    {
-        vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
-        vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
-        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
-        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
-        det = -det;
-    }
-
-    __m128 vC;
-    // Finish triangle setup - C edge coef
-    triangleSetupC(vX, vY, vA, vB, vC);
-
-    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
-    {
-        // If we have degenerate edge(s) to rasterize, set I and J coefs 
-        // to 0 for constant interpolation of attributes
-        triDesc.I[0] = 0.0f;
-        triDesc.I[1] = 0.0f;
-        triDesc.I[2] = 0.0f;
-        triDesc.J[0] = 0.0f;
-        triDesc.J[1] = 0.0f;
-        triDesc.J[2] = 0.0f;
-
-        // Degenerate triangles have no area
-        triDesc.recipDet = 0.0f;
-    }
-    else
-    {
-        // only extract coefs for 2 of the barycentrics; the 3rd can be 
-        // determined from the barycentric equation:
-        // i + j + k = 1 <=> k = 1 - j - i
-        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
-        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
-        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
-        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
-        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
-        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
-
-        // compute recipDet, used to calculate barycentric i and j in the backend
-        triDesc.recipDet = 1.0f/det;
-    }
-
-    OSALIGNSIMD(float) oneOverW[4];
-    _mm_store_ps(oneOverW, vRecipW);
-    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
-    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
-    triDesc.OneOverW[2] = oneOverW[2];
-
-    // calculate perspective correct coefs per vertex attrib 
-    float* pPerspAttribs = perspAttribsTLS;
-    float* pAttribs = workDesc.pAttribs;
-    triDesc.pPerspAttribs = pPerspAttribs;
-    triDesc.pAttribs = pAttribs;
-    float *pRecipW = workDesc.pTriBuffer + 12;
-    triDesc.pRecipW = pRecipW;
-    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
-    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
-    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
-    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
-    {
-        __m128 attribA = _mm_load_ps(pAttribs);
-        __m128 attribB = _mm_load_ps(pAttribs+=4);
-        __m128 attribC = _mm_load_ps(pAttribs+=4);
-        pAttribs+=4;
-
-        attribA = _mm_mul_ps(attribA, vOneOverWV0);
-        attribB = _mm_mul_ps(attribB, vOneOverWV1);
-        attribC = _mm_mul_ps(attribC, vOneOverWV2);
-
-        _mm_store_ps(pPerspAttribs, attribA);
-        _mm_store_ps(pPerspAttribs+=4, attribB);
-        _mm_store_ps(pPerspAttribs+=4, attribC);
-        pPerspAttribs+=4;
-    }
-
-    // compute bary Z
-    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
-    OSALIGNSIMD(float) a[4];
-    _mm_store_ps(a, vZ);
-    triDesc.Z[0] = a[0] - a[2];
-    triDesc.Z[1] = a[1] - a[2];
-    triDesc.Z[2] = a[2];
-        
-    // add depth bias
-    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
-
-    // Calc bounding box of triangle
-    OSALIGNSIMD(SWR_RECT) bbox;
-    calcBoundingBoxInt(vXi, vYi, bbox);
-
-    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
-
-    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
-    {
-        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
-        bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
-        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
-                   "Conservative rast degenerate handling requires a valid scissor rect");
-    }
-
-    // Intersect with scissor/viewport
-    OSALIGNSIMD(SWR_RECT) intersect;
-    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
-    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
-    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
-    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
-
-    triDesc.triFlags = workDesc.triFlags;
-
-    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
+    // macrotile dimensioning
     uint32_t macroX, macroY;
     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
     int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
@@ -1005,292 +66,251 @@
     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
 
-    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
-    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
-    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
-    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
+    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
 
-    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
+    // create a copy of the triangle buffer to write our adjusted vertices to
+    OSALIGNSIMD(float) newTriBuffer[4 * 4];
+    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
+    newWorkDesc.pTriBuffer = &newTriBuffer[0];
 
-    AR_END(BETriangleSetup, 0);
+    // create a copy of the attrib buffer to write our adjusted attribs to
+    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
+    newWorkDesc.pAttribs = &newAttribBuffer[0];
 
-    // update triangle desc
-    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t numTilesX = maxTileX - minTileX + 1;
-    uint32_t numTilesY = maxTileY - minTileY + 1;
+    const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
+    const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
 
-    if (numTilesX == 0 || numTilesY == 0) 
+    __m128 vX, vY, vZ, vRecipW;
+
+    vX = _mm_load_ps(workDesc.pTriBuffer);
+    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
+    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
+    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
+
+    // triangle 0
+    // v0,v1 -> v0,v0,v1
+    __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
+    __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
+    __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
+    __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
+
+    __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
+    __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
+    if (workDesc.triFlags.yMajor)
     {
-        RDTSC_EVENT(BEEmptyTriangle, 1, 0);
-        AR_END(BERasterizeTriangle, 1);
-        return;
+        vXa = _mm_add_ps(vAdjust, vXa);
+    }
+    else
+    {
+        vYa = _mm_add_ps(vAdjust, vYa);
     }
 
-    AR_BEGIN(BEStepSetup, pDC->drawId);
+    // Store triangle description for rasterizer
+    _mm_store_ps((float*)&newTriBuffer[0], vXa);
+    _mm_store_ps((float*)&newTriBuffer[4], vYa);
+    _mm_store_ps((float*)&newTriBuffer[8], vZa);
+    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
 
-    // Step to pixel center of top-left pixel of the triangle bbox
-    // Align intersect bbox (top/left) to raster tile's (top/left).
-    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
-    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
-
-    // convenience typedef
-    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
-
-    // single sample rasterization evaluates edges at pixel center,
-    // multisample evaluates edges UL pixel corner and steps to each sample position
-    if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+    // binner bins 3 edges for lines as v0, v1, v1
+    // tri0 needs v0, v0, v1
+    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
     {
-        // Add 0.5, in fixed point, to offset to pixel center
-        x += (FIXED_POINT_SCALE / 2);
-        y += (FIXED_POINT_SCALE / 2);
+        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
+        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
+
+        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
+        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
+        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
     }
 
-    __m128i vTopLeftX = _mm_set1_epi32(x);
-    __m128i vTopLeftY = _mm_set1_epi32(y);
-
-    // evaluate edge equations at top-left pixel using 64bit math
-    // 
-    // line = Ax + By + C
-    // solving for C:
-    // C = -Ax - By
-    // we know x0 and y0 are on the line; plug them in:
-    // C = -Ax0 - By0
-    // plug C back into line equation:
-    // line = Ax - By - Ax0 - By0
-    // line = A(x - x0) + B(y - y0)
-    // dX = (x-x0), dY = (y-y0)
-    // so all this simplifies to 
-    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
-
-    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
-    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
-
-    // evaluate A(dx) and B(dY) for all points
-    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
-    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
-    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
-    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
-
-    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
-    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
-    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
-
-    // apply any edge adjustments(top-left, crast, etc)
-    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
-
-    // broadcast respective edge results to all lanes
-    double* pEdge = (double*)&vEdge;
-    __m256d vEdgeFix16[7];
-    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
-    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
-    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
-
-    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
-    _mm_store_si128((__m128i*)aAi, vAi);
-    _mm_store_si128((__m128i*)aBi, vBi);
-    EDGE rastEdges[RT::NumEdgesT::value];
-
-    // Compute and store triangle edge data
-    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
-    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
-    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
-
-    // Compute and store triangle edge data if scissor needs to rasterized
-    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
-                       (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
-
-    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
-    // used to for testing if entire raster tile is inside a triangle
-    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+    // Store user clip distances for triangle 0
+    float newClipBuffer[3 * 8];
+    uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
+    if (numClipDist)
     {
-        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
-    }
+        newWorkDesc.pUserClipBuffer = newClipBuffer;
 
-    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
-    // step sample positions to the raster tile bbox of multisample points
-    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
-    //                             |      |
-    //                             |      |
-    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
-    __m256d vEdgeTileBbox[3];
-    if (NumCoverageSamplesT::value > 1)
-    {
-        const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
-        const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
-        const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
-
-        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
-        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
-
-        // step edge equation tests from Tile
-        // used to for testing if entire raster tile is inside a triangle
-        for (uint32_t e = 0; e < 3; ++e)
+        float* pOldBuffer = workDesc.pUserClipBuffer;
+        float* pNewBuffer = newClipBuffer;
+        for (uint32_t i = 0; i < numClipDist; ++i)
         {
-            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
-            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
-            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+            // read barycentric coeffs from binner
+            float a = *(pOldBuffer++);
+            float b = *(pOldBuffer++);
 
-            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
-            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
+            // reconstruct original clip distance at vertices
+            float c0 = a + b;
+            float c1 = b;
+
+            // construct triangle barycentrics
+            *(pNewBuffer++) = c0 - c1;
+            *(pNewBuffer++) = c0 - c1;
+            *(pNewBuffer++) = c1;
         }
     }
 
-    AR_END(BEStepSetup, 0);
+    // setup triangle rasterizer function
+    PFN_WORK_FUNC pfnTriRast;
+    // conservative rast not supported for points/lines
+    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
+        SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
 
-    uint32_t tY = minTileY;
-    uint32_t tX = minTileX;
-    uint32_t maxY = maxTileY;
-    uint32_t maxX = maxTileX;
+    // make sure this macrotile intersects the triangle
+    __m128i vXai = fpToFixedPoint(vXa);
+    __m128i vYai = fpToFixedPoint(vYa);
+    OSALIGNSIMD(SWR_RECT) bboxA;
+    calcBoundingBoxInt(vXai, vYai, bboxA);
 
-    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
-    GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
-    currentRenderBufferRow = renderBuffers;
+    if (!(bboxA.xmin > macroBoxRight ||
+        bboxA.xmin > scissorInFixedPoint.xmax ||
+        bboxA.xmax - 1 < macroBoxLeft ||
+        bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
+        bboxA.ymin > macroBoxBottom ||
+        bboxA.ymin > scissorInFixedPoint.ymax ||
+        bboxA.ymax - 1 < macroBoxTop ||
+        bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
+        // rasterize triangle
+        pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
+    }
 
-    // rasterize and generate coverage masks per sample
-    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
+    // triangle 1
+    // v0,v1 -> v1,v1,v0
+    vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
+    vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
+    vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
+    vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
+
+    vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
+    if (workDesc.triFlags.yMajor)
     {
-        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
-        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        vXa = _mm_add_ps(vAdjust, vXa);
+    }
+    else
+    {
+        vYa = _mm_add_ps(vAdjust, vYa);
+    }
+
+    // Store triangle description for rasterizer
+    _mm_store_ps((float*)&newTriBuffer[0], vXa);
+    _mm_store_ps((float*)&newTriBuffer[4], vYa);
+    _mm_store_ps((float*)&newTriBuffer[8], vZa);
+    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
+
+    // binner bins 3 edges for lines as v0, v1, v1
+    // tri1 needs v1, v1, v0
+    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
+    {
+        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
+        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
+
+        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
+        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
+        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
+    }
+
+    // store user clip distance for triangle 1
+    if (numClipDist)
+    {
+        float* pOldBuffer = workDesc.pUserClipBuffer;
+        float* pNewBuffer = newClipBuffer;
+        for (uint32_t i = 0; i < numClipDist; ++i)
         {
-            vStartOfRowEdge[e] = vEdgeFix16[e];
+            // read barycentric coeffs from binner
+            float a = *(pOldBuffer++);
+            float b = *(pOldBuffer++);
+
+            // reconstruct original clip distance at vertices
+            float c0 = a + b;
+            float c1 = b;
+
+            // construct triangle barycentrics
+            *(pNewBuffer++) = c1 - c0;
+            *(pNewBuffer++) = c1 - c0;
+            *(pNewBuffer++) = c0;
         }
+    }
 
-        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
-        {
-            triDesc.anyCoveredSamples = 0;
+    vXai = fpToFixedPoint(vXa);
+    vYai = fpToFixedPoint(vYa);
+    calcBoundingBoxInt(vXai, vYai, bboxA);
 
-            // is the corner of the edge outside of the raster tile? (vEdge < 0)
-            int mask0, mask1, mask2;
-            UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
+    if (!(bboxA.xmin > macroBoxRight ||
+        bboxA.xmin > scissorInFixedPoint.xmax ||
+        bboxA.xmax - 1 < macroBoxLeft ||
+        bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
+        bboxA.ymin > macroBoxBottom ||
+        bboxA.ymin > scissorInFixedPoint.ymax ||
+        bboxA.ymax - 1 < macroBoxTop ||
+        bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
+        // rasterize triangle
+        pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
+    }
 
-            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
-            {
-                // trivial reject, at least one edge has all 4 corners of raster tile outside
-                bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
+    AR_END(BERasterizeLine, 1);
+}
 
-                if (!trivialReject)
-                {
-                    // trivial accept mask
-                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
-
-                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
-                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
-                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
-
-                    // @todo Make this a bit smarter to allow use of trivial accept when:
-                    //   1) scissor/vp intersection rect is raster tile aligned
-                    //   2) raster tile is entirely within scissor/vp intersection rect
-                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
-                    {
-                        // trivial accept, all 4 corners of all 3 edges are negative 
-                        // i.e. raster tile completely inside triangle
-                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
-                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
-                        {
-                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
-                        }
-                        RDTSC_EVENT(BETrivialAccept, 1, 0);
-                    }
-                    else
-                    {
-                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
-                        if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
-                        {
-                            // should get optimized out for single sample case (global value numbering or copy propagation)
-                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-                            {
-                                vEdgeAtSample[e] = vEdgeFix16[e];
-                            }
-                        }
-                        else
-                        {
-                            const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
-                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
-                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
-                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
-                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
-
-                            // step edge equation tests from UL tile corner to pixel sample position
-                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-                            {
-                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
-                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
-                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
-                            }
-                        }
-
-                        double startQuadEdges[RT::NumEdgesT::value];
-                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-                        {
-                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
-                        }
-
-                        // not trivial accept or reject, must rasterize full tile
-                        AR_BEGIN(BERasterizePartial, pDC->drawId);
-                        triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
-                        AR_END(BERasterizePartial, 0);
-
-                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
-                        
-                        // Output SV InnerCoverage, if needed
-                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
-                    }
-                }
-                else
-                {
-                    // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
-                    if(NumCoverageSamplesT::value > 1)
-                    {
-                        triDesc.coverageMask[sampleNum] = 0;
-                    }
-                    RDTSC_EVENT(BETrivialReject, 1, 0);
-                }
-            }
+void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
 
 #if KNOB_ENABLE_TOSS_POINTS
-            if(KNOB_TOSS_RS)
-            {
-                gToss = triDesc.coverageMask[0];
-            }
-            else
-#endif
-            if(triDesc.anyCoveredSamples)
-            {
-                // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
-                // copy conservative coverage result to all samples
-                if(RT::IsConservativeT::value)
-                {
-                    auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
-                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
-                }
-
-                AR_BEGIN(BEPixelBackend, pDC->drawId);
-                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
-                AR_END(BEPixelBackend, 0);
-            }
-
-            // step to the next tile in X
-            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-            {
-                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
-            }
-            StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
-        }
-
-        // step to the next tile in Y
-        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-        {
-            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
-        }
-        StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
+    if (KNOB_TOSS_BIN_TRIS)
+    {
+        return;
     }
+#endif
 
-    AR_END(BERasterizeTriangle, 1);
+    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
+    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+    // map x,y relative offsets from start of raster tile to bit position in 
+    // coverage mask for the point
+    static const uint32_t coverageMap[8][8] = {
+        { 0, 1, 4, 5, 8, 9, 12, 13 },
+        { 2, 3, 6, 7, 10, 11, 14, 15 },
+        { 16, 17, 20, 21, 24, 25, 28, 29 },
+        { 18, 19, 22, 23, 26, 27, 30, 31 },
+        { 32, 33, 36, 37, 40, 41, 44, 45 },
+        { 34, 35, 38, 39, 42, 43, 46, 47 },
+        { 48, 49, 52, 53, 56, 57, 60, 61 },
+        { 50, 51, 54, 55, 58, 59, 62, 63 }
+    };
+
+    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
+
+    // pull point information from triangle buffer
+    // @todo use structs for readability
+    uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
+    uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
+    float z = *(workDesc.pTriBuffer + 2);
+
+    // construct triangle descriptor for point
+    // no interpolation, set up i,j for constant interpolation of z and attribs
+    // @todo implement an optimized backend that doesn't require triangle information
+
+    // compute coverage mask from x,y packed into the coverageMask flag
+    // mask indices by the maximum valid index for x/y of coveragemap.
+    uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
+    uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
+    // todo: multisample points?
+    triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
+
+    // no persp divide needed for points
+    triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
+    triDesc.triFlags = workDesc.triFlags;
+    triDesc.recipDet = 1.0f;
+    triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
+    triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
+    triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
+    triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
+
+    RenderOutputBuffers renderBuffers;
+    GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 
+        renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+
+    AR_BEGIN(BEPixelBackend, pDC->drawId);
+    backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
+    AR_END(BEPixelBackend, 0);
 }
 
 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
@@ -1312,7 +332,7 @@
     newWorkDesc.pTriBuffer = &newTriBuffer[0];
 
     // create a copy of the attrib buffer to write our adjusted attribs to
-    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
+    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
     newWorkDesc.pAttribs = &newAttribBuffer[0];
 
     newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
@@ -1337,13 +357,13 @@
     *pBuf++ = upperY;
     pBuf++;
     _mm_store_ps(pBuf, _mm_set1_ps(z));
-    _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f));
+    _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
 
     // setup triangle rasterizer function
     PFN_WORK_FUNC pfnTriRast;
     // conservative rast not supported for points/lines
-    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, 
-                                   SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
+    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
+        SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
 
     // overwrite texcoords for point sprites
     if (isPointSpriteTexCoordEnabled)
@@ -1421,383 +441,27 @@
     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
 }
 
-void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
+void InitRasterizerFunctions()
 {
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-#if KNOB_ENABLE_TOSS_POINTS
-    if (KNOB_TOSS_BIN_TRIS)
-    {
-        return;
-    }
-#endif
-
-    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
-
-    // map x,y relative offsets from start of raster tile to bit position in 
-    // coverage mask for the point
-    static const uint32_t coverageMap[8][8] = {
-        { 0, 1, 4, 5, 8, 9, 12, 13 },
-        { 2, 3, 6, 7, 10, 11, 14, 15 },
-        { 16, 17, 20, 21, 24, 25, 28, 29 },
-        { 18, 19, 22, 23, 26, 27, 30, 31 },
-        { 32, 33, 36, 37, 40, 41, 44, 45 },
-        { 34, 35, 38, 39, 42, 43, 46, 47 },
-        { 48, 49, 52, 53, 56, 57, 60, 61 },
-        { 50, 51, 54, 55, 58, 59, 62, 63 }
-    };
-
-    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
-
-    // pull point information from triangle buffer
-    // @todo use structs for readability
-    uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
-    uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
-    float z = *(workDesc.pTriBuffer + 2);
-
-    // construct triangle descriptor for point
-    // no interpolation, set up i,j for constant interpolation of z and attribs
-    // @todo implement an optimized backend that doesn't require triangle information
-
-    // compute coverage mask from x,y packed into the coverageMask flag
-    // mask indices by the maximum valid index for x/y of coveragemap.
-    uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
-    uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
-    // todo: multisample points?
-    triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
-
-    // no persp divide needed for points
-    triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
-    triDesc.triFlags = workDesc.triFlags;
-    triDesc.recipDet = 1.0f;
-    triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
-    triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
-    triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
-    triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
-
-    RenderOutputBuffers renderBuffers;
-    GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 
-        renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
-
-    AR_BEGIN(BEPixelBackend, pDC->drawId);
-    backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
-    AR_END(BEPixelBackend, 0);
+    InitRasterizerFuncs();
 }
 
-// Get pointers to hot tile memory for color RT, depth, stencil
-template <uint32_t numSamples>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
-{
-    const API_STATE& state = GetApiState(pDC);
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    uint32_t mx, my;
-    MacroTileMgr::getTileIndices(macroID, mx, my);
-    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
-    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
-
-    // compute tile offset for active hottile buffers
-    const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
-    uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
-    offset*=numSamples;
-
-    unsigned long rtSlot = 0;
-    uint32_t colorHottileEnableMask = state.colorHottileEnable;
-    while(_BitScanForward(&rtSlot, colorHottileEnableMask))
-    {
-        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, 
-            numSamples, renderTargetArrayIndex);
-        pColor->state = HOTTILE_DIRTY;
-        renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
-        
-        colorHottileEnableMask &= ~(1 << rtSlot);
-    }
-    if(state.depthHottileEnable)
-    {
-        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
-        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
-        offset*=numSamples;
-        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, 
-            numSamples, renderTargetArrayIndex);
-        pDepth->state = HOTTILE_DIRTY;
-        SWR_ASSERT(pDepth->pBuffer != nullptr);
-        renderBuffers.pDepth = pDepth->pBuffer + offset;
-    }
-    if(state.stencilHottileEnable)
-    {
-        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
-        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
-        offset*=numSamples;
-        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, 
-            numSamples, renderTargetArrayIndex);
-        pStencil->state = HOTTILE_DIRTY;
-        SWR_ASSERT(pStencil->pBuffer != nullptr);
-        renderBuffers.pStencil = pStencil->pBuffer + offset;
-    }
-}
-
-template <typename RT>
-INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
-{
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        buffers.pColor[rt] += RT::colorRasterTileStep;
-    }
-    
-    buffers.pDepth += RT::depthRasterTileStep;
-    buffers.pStencil += RT::stencilRasterTileStep;
-}
-
-template <typename RT>
-INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
-{
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
-        buffers.pColor[rt] = startBufferRow.pColor[rt];
-    }
-    startBufferRow.pDepth += RT::depthRasterTileRowStep;
-    buffers.pDepth = startBufferRow.pDepth;
-
-    startBufferRow.pStencil += RT::stencilRasterTileRowStep;
-    buffers.pStencil = startBufferRow.pStencil;
-}
-
-void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
-#if KNOB_ENABLE_TOSS_POINTS
-    if (KNOB_TOSS_BIN_TRIS)
-    {
-        return;
-    }
-#endif
-
-    // bloat line to two tris and call the triangle rasterizer twice
-    AR_BEGIN(BERasterizeLine, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-    const SWR_RASTSTATE &rastState = state.rastState;
-
-    // macrotile dimensioning
-    uint32_t macroX, macroY;
-    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
-    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
-    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
-    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
-    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
-
-    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
-
-    // create a copy of the triangle buffer to write our adjusted vertices to
-    OSALIGNSIMD(float) newTriBuffer[4 * 4];
-    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
-    newWorkDesc.pTriBuffer = &newTriBuffer[0];
-
-    // create a copy of the attrib buffer to write our adjusted attribs to
-    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
-    newWorkDesc.pAttribs = &newAttribBuffer[0];
-
-    const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
-    const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
-
-    __m128 vX, vY, vZ, vRecipW;
-
-    vX = _mm_load_ps(workDesc.pTriBuffer);
-    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
-    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
-    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
-
-    // triangle 0
-    // v0,v1 -> v0,v0,v1
-    __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
-    __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
-    __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
-    __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
-
-    __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
-    __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
-    if (workDesc.triFlags.yMajor)
-    {
-        vXa = _mm_add_ps(vAdjust, vXa);
-    }
-    else
-    {
-        vYa = _mm_add_ps(vAdjust, vYa);
-    }
-
-    // Store triangle description for rasterizer
-    _mm_store_ps((float*)&newTriBuffer[0], vXa);
-    _mm_store_ps((float*)&newTriBuffer[4], vYa);
-    _mm_store_ps((float*)&newTriBuffer[8], vZa);
-    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
-
-    // binner bins 3 edges for lines as v0, v1, v1
-    // tri0 needs v0, v0, v1
-    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
-    {
-        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
-        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
-
-        _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
-        _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
-        _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
-    }
-
-    // Store user clip distances for triangle 0
-    float newClipBuffer[3 * 8];
-    uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
-    if (numClipDist)
-    {
-        newWorkDesc.pUserClipBuffer = newClipBuffer;
-
-        float* pOldBuffer = workDesc.pUserClipBuffer;
-        float* pNewBuffer = newClipBuffer;
-        for (uint32_t i = 0; i < numClipDist; ++i)
-        {
-            // read barycentric coeffs from binner
-            float a = *(pOldBuffer++);
-            float b = *(pOldBuffer++);
-
-            // reconstruct original clip distance at vertices
-            float c0 = a + b;
-            float c1 = b;
-
-            // construct triangle barycentrics
-            *(pNewBuffer++) = c0 - c1;
-            *(pNewBuffer++) = c0 - c1;
-            *(pNewBuffer++) = c1;
-        }
-    }
-
-    // setup triangle rasterizer function
-    PFN_WORK_FUNC pfnTriRast;
-    // conservative rast not supported for points/lines
-    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, 
-                                   SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
-
-    // make sure this macrotile intersects the triangle
-    __m128i vXai = fpToFixedPoint(vXa);
-    __m128i vYai = fpToFixedPoint(vYa);
-    OSALIGNSIMD(SWR_RECT) bboxA;
-    calcBoundingBoxInt(vXai, vYai, bboxA);
-
-    if (!(bboxA.xmin > macroBoxRight ||
-          bboxA.xmin > scissorInFixedPoint.xmax ||
-          bboxA.xmax - 1 < macroBoxLeft ||
-          bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
-          bboxA.ymin > macroBoxBottom ||
-          bboxA.ymin > scissorInFixedPoint.ymax ||
-          bboxA.ymax - 1 < macroBoxTop ||
-          bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
-        // rasterize triangle
-        pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-    }
-
-    // triangle 1
-    // v0,v1 -> v1,v1,v0
-    vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
-    vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
-    vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
-    vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
-
-    vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
-    if (workDesc.triFlags.yMajor)
-    {
-        vXa = _mm_add_ps(vAdjust, vXa);
-    }
-    else
-    {
-        vYa = _mm_add_ps(vAdjust, vYa);
-    }
-
-    // Store triangle description for rasterizer
-    _mm_store_ps((float*)&newTriBuffer[0], vXa);
-    _mm_store_ps((float*)&newTriBuffer[4], vYa);
-    _mm_store_ps((float*)&newTriBuffer[8], vZa);
-    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
-
-    // binner bins 3 edges for lines as v0, v1, v1
-    // tri1 needs v1, v1, v0
-    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
-    {
-        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
-        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
-
-        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
-        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
-        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
-    }
-
-    // store user clip distance for triangle 1
-    if (numClipDist)
-    {
-        float* pOldBuffer = workDesc.pUserClipBuffer;
-        float* pNewBuffer = newClipBuffer;
-        for (uint32_t i = 0; i < numClipDist; ++i)
-        {
-            // read barycentric coeffs from binner
-            float a = *(pOldBuffer++);
-            float b = *(pOldBuffer++);
-
-            // reconstruct original clip distance at vertices
-            float c0 = a + b;
-            float c1 = b;
-
-            // construct triangle barycentrics
-            *(pNewBuffer++) = c1 - c0;
-            *(pNewBuffer++) = c1 - c0;
-            *(pNewBuffer++) = c0;
-        }
-    }
-
-    vXai = fpToFixedPoint(vXa);
-    vYai = fpToFixedPoint(vYa);
-    calcBoundingBoxInt(vXai, vYai, bboxA);
-
-    if (!(bboxA.xmin > macroBoxRight ||
-          bboxA.xmin > scissorInFixedPoint.xmax ||
-          bboxA.xmax - 1 < macroBoxLeft ||
-          bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
-          bboxA.ymin > macroBoxBottom ||
-          bboxA.ymin > scissorInFixedPoint.ymax ||
-          bboxA.ymax - 1 < macroBoxTop ||
-          bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
-        // rasterize triangle
-        pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-    }
-
-    AR_END(BERasterizeLine, 1);
-}
-
-struct RasterizerChooser
-{
-    typedef PFN_WORK_FUNC FuncType;
-
-    template <typename... ArgsB>
-    static FuncType GetFunc()
-    {
-        return RasterizeTriangle<RasterizerTraits<ArgsB...>>;
-    }
-};
-
 // Selector for correct templated RasterizeTriangle function
 PFN_WORK_FUNC GetRasterizerFunc(
-    uint32_t numSamples,
+    SWR_MULTISAMPLE_COUNT numSamples,
     bool IsCenter,
     bool IsConservative,
-    uint32_t InputCoverage,
+    SWR_INPUT_COVERAGE InputCoverage,
     uint32_t EdgeEnable,
     bool RasterizeScissorEdges
 )
 {
-    return TemplateArgUnroller<RasterizerChooser>::GetFunc(
-        IntArg<SWR_MULTISAMPLE_1X,SWR_MULTISAMPLE_TYPE_COUNT-1>{numSamples},
-        IsCenter,
-        IsConservative,
-        IntArg<SWR_INPUT_COVERAGE_NONE, SWR_INPUT_COVERAGE_COUNT-1>{InputCoverage},
-        IntArg<0, STATE_VALID_TRI_EDGE_COUNT-1>{EdgeEnable},
-        RasterizeScissorEdges);
+    SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
+    SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
+    SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
+
+    PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges];
+    SWR_ASSERT(func);
+
+    return func;
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
index e99920a..414d0f0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
@@ -35,6 +35,7 @@
 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void InitRasterizerFunctions();
 
 INLINE
 __m128i fpToFixedPoint(const __m128 vIn)
@@ -43,15 +44,6 @@
     return _mm_cvtps_epi32(vFixed);
 }
 
-// Selector for correct templated RasterizeTriangle function
-PFN_WORK_FUNC GetRasterizerFunc(
-    uint32_t numSamples,
-    bool IsCenter,
-    bool IsConservative,
-    uint32_t InputCoverage,
-    uint32_t EdgeEnable,
-    bool RasterizeScissorEdges);
-
 enum TriEdgesStates
 {
     STATE_NO_VALID_EDGES = 0,
@@ -72,6 +64,15 @@
     VALID_TRI_EDGE_COUNT,
 };
 
+// Selector for correct templated RasterizeTriangle function
+PFN_WORK_FUNC GetRasterizerFunc(
+    SWR_MULTISAMPLE_COUNT numSamples,
+    bool IsCenter,
+    bool IsConservative,
+    SWR_INPUT_COVERAGE InputCoverage,
+    uint32_t EdgeEnable,
+    bool RasterizeScissorEdges);
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief ValidTriEdges convenience typedefs used for templated function 
 /// specialization supported Fixed Point precisions
@@ -173,7 +174,7 @@
 /// (only used with conservative rasterization)
 /// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor?
 template <typename NumSamplesT, typename CenterPatternT, typename ConservativeT, typename InputCoverageT, typename EdgeEnableT, typename RasterScissorEdgesT>
-struct RasterizerTraits final : public ConservativeRastBETraits<ConservativeT, InputCoverageT>,
+struct _RasterizerTraits : public ConservativeRastBETraits<ConservativeT, InputCoverageT>,
                                 public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT>
 {
     typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value), CenterPatternT::value> MT;
@@ -197,3 +198,13 @@
     static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep};
     static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep};
 };
+
+template <uint32_t NumSamplesT, uint32_t CenterPatternT, uint32_t ConservativeT, uint32_t InputCoverageT, uint32_t EdgeEnableT, uint32_t RasterScissorEdgesT>
+struct RasterizerTraits final : public _RasterizerTraits <
+    std::integral_constant<uint32_t, NumSamplesT>,
+    std::integral_constant<bool, CenterPatternT != 0>,
+    std::integral_constant<bool, ConservativeT != 0>,
+    std::integral_constant<uint32_t, InputCoverageT>,
+    std::integral_constant<uint32_t, EdgeEnableT>,
+    std::integral_constant<bool, RasterScissorEdgesT != 0> >
+{};
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
new file mode 100644
index 0000000..081e4dd
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
@@ -0,0 +1,1380 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rasterizer_impl.h
+*
+* @brief Implementation for the rasterizer.
+*
+******************************************************************************/
+
+#include <vector>
+#include <algorithm>
+
+#include "rasterizer.h"
+#include "rdtsc_core.h"
+#include "backend.h"
+#include "utils.h"
+#include "frontend.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+
+extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
+
+template <uint32_t numSamples = 1>
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
+template <typename RT>
+void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
+template <typename RT>
+void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
+
+#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
+static const __m256d gMaskToVecpd[] =
+{
+    MASKTOVEC(0, 0, 0, 0),
+    MASKTOVEC(0, 0, 0, 1),
+    MASKTOVEC(0, 0, 1, 0),
+    MASKTOVEC(0, 0, 1, 1),
+    MASKTOVEC(0, 1, 0, 0),
+    MASKTOVEC(0, 1, 0, 1),
+    MASKTOVEC(0, 1, 1, 0),
+    MASKTOVEC(0, 1, 1, 1),
+    MASKTOVEC(1, 0, 0, 0),
+    MASKTOVEC(1, 0, 0, 1),
+    MASKTOVEC(1, 0, 1, 0),
+    MASKTOVEC(1, 0, 1, 1),
+    MASKTOVEC(1, 1, 0, 0),
+    MASKTOVEC(1, 1, 0, 1),
+    MASKTOVEC(1, 1, 1, 0),
+    MASKTOVEC(1, 1, 1, 1),
+};
+
+struct POS
+{
+    int32_t x, y;
+};
+
+struct EDGE
+{
+    double a, b;                // a, b edge coefficients in fix8
+    double stepQuadX;           // step to adjacent horizontal quad in fix16
+    double stepQuadY;           // step to adjacent vertical quad in fix16
+    double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
+    double stepRasterTileY;     // step to adjacent vertical raster tile in fix16
+
+    __m256d vQuadOffsets;       // offsets for 4 samples of a quad
+    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief rasterize a raster tile partially covered by the triangle
+/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
+/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + By + C)
+/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
+///        Used to step between quads when sweeping over the raster tile.
+template<uint32_t NumEdges, typename EdgeMaskT>
+INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
+{
+    uint64_t coverageMask = 0;
+
+    __m256d vEdges[NumEdges];
+    __m256d vStepX[NumEdges];
+    __m256d vStepY[NumEdges];
+
+    for (uint32_t e = 0; e < NumEdges; ++e)
+    {
+        // Step to the pixel sample locations of the 1st quad
+        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
+
+        // compute step to next quad (mul by 2 in x and y direction)
+        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
+        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
+    }
+
+    // fast unrolled version for 8x8 tile
+#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
+    int edgeMask[NumEdges];
+    uint64_t mask;
+
+    auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
+    auto update_lambda = [&](int e){mask &= edgeMask[e];};
+    auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
+    auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
+    auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
+
+// evaluate which pixels in the quad are covered
+#define EVAL \
+            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
+
+    // update coverage mask
+    // if edge 0 is degenerate and will be skipped; init the mask
+#define UPDATE_MASK(bit) \
+            if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
+                mask = 0xf;\
+            }\
+            else{\
+                mask = edgeMask[0]; \
+            }\
+            UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
+            coverageMask |= (mask << bit);
+
+    // step in the +x direction to the next quad 
+#define INCX \
+            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
+
+    // step in the +y direction to the next quad 
+#define INCY \
+            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
+
+    // step in the -x direction to the next quad 
+#define DECX \
+            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
+
+    // sweep 2x2 quad back and forth through the raster tile, 
+    // computing coverage masks for the entire tile
+
+    // raster tile
+    // 0  1  2  3  4  5  6  7 
+    // x  x
+    // x  x ------------------>  
+    //                   x  x  |
+    // <-----------------x  x  V
+    // ..
+
+    // row 0
+    EVAL;
+    UPDATE_MASK(0);
+    INCX;
+    EVAL;
+    UPDATE_MASK(4);
+    INCX;
+    EVAL;
+    UPDATE_MASK(8);
+    INCX;
+    EVAL;
+    UPDATE_MASK(12);
+    INCY;
+
+    //row 1
+    EVAL;
+    UPDATE_MASK(28);
+    DECX;
+    EVAL;
+    UPDATE_MASK(24);
+    DECX;
+    EVAL;
+    UPDATE_MASK(20);
+    DECX;
+    EVAL;
+    UPDATE_MASK(16);
+    INCY;
+
+    // row 2
+    EVAL;
+    UPDATE_MASK(32);
+    INCX;
+    EVAL;
+    UPDATE_MASK(36);
+    INCX;
+    EVAL;
+    UPDATE_MASK(40);
+    INCX;
+    EVAL;
+    UPDATE_MASK(44);
+    INCY;
+
+    // row 3
+    EVAL;
+    UPDATE_MASK(60);
+    DECX;
+    EVAL;
+    UPDATE_MASK(56);
+    DECX;
+    EVAL;
+    UPDATE_MASK(52);
+    DECX;
+    EVAL;
+    UPDATE_MASK(48);
+#else
+    uint32_t bit = 0;
+    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
+    {
+        __m256d vStartOfRowEdge[NumEdges];
+        for (uint32_t e = 0; e < NumEdges; ++e)
+        {
+            vStartOfRowEdge[e] = vEdges[e];
+        }
+
+        for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
+        {
+            int edgeMask[NumEdges];
+            for (uint32_t e = 0; e < NumEdges; ++e)
+            {
+                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
+            }
+
+            uint64_t mask = edgeMask[0];
+            for (uint32_t e = 1; e < NumEdges; ++e)
+            {
+                mask &= edgeMask[e];
+            }
+            coverageMask |= (mask << bit);
+
+            // step to the next quad in x
+            for (uint32_t e = 0; e < NumEdges; ++e)
+            {
+                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
+            }
+            bit+=4;
+        }
+
+        // step to the next row
+        for (uint32_t e = 0; e < NumEdges; ++e)
+        {
+            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
+        }
+    }
+#endif
+    return coverageMask;
+
+}
+// Top left rule:
+// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
+// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
+// Top left: a sample is in if it is a top or left edge.
+// Out (not top): !(horizontal && above) = !horizontal || below
+// Out (not left): !(!horizontal && left) = horizontal || right
+INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) 
+{
+    // if vA < 0, vC--
+    // if vA == 0 && vB < 0, vC--
+
+    __m256d vEdgeOut = vEdge;
+    __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
+
+    // if vA < 0 (line is not horizontal and below)
+    int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
+
+    // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
+    __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
+    int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
+    msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
+
+    // if either of these are true and we're on the line (edge == 0), bump it outside the line
+    vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief calculates difference in precision between the result of manh
+/// calculation and the edge precision, based on compile time trait values
+template<typename RT>
+constexpr int64_t ManhToEdgePrecisionAdjust()
+{
+    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
+                  "Inadequate precision of result of manh calculation ");
+    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @struct adjustEdgeConservative
+/// @brief Primary template definition used for partially specializing 
+/// the adjustEdgeConservative function. This struct should never
+/// be instantiated.
+/// @tparam RT: rasterizer traits
+/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
+template <typename RT, typename ConservativeEdgeOffsetT>
+struct adjustEdgeConservative
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs calculations to adjust each edge of a triangle away
+    /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+    /// direction. 
+    ///
+    /// Uncertainty regions arise from fixed point rounding, which
+    /// can snap a vertex +/- by min fixed point value.
+    /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
+    /// This allows the rasterizer to test for coverage only at the pixel center, 
+    /// instead of having to test individual pixel corners for conservative coverage
+    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    {
+        // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away 
+        // from the pixel center (in the direction of the edge normal A/B)
+
+        // edge = Ax + By + C - (manh/e)
+        // manh = manhattan distance = abs(A) + abs(B)
+        // e = absolute rounding error from snapping from float to fixed point precision
+
+        // 'fixed point' multiply (in double to be avx1 friendly) 
+        // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
+        __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
+        __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
+                                     _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
+
+        static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
+                      "Inadequate precision of result of manh calculation ");
+
+        // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
+        // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
+        manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
+
+        // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
+        // this allows the rasterizer to do a single conservative coverage test to see if the primitive
+        // intersects the pixel at all
+        vEdge = _mm256_sub_pd(vEdge, manh);
+    };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief adjustEdgeConservative specialization where no edge offset is needed
+template <typename RT>
+struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
+{
+    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief calculates the distance a degenerate BBox needs to be adjusted 
+/// for conservative rast based on compile time trait values
+template<typename RT>
+constexpr int64_t ConservativeScissorOffset()
+{
+    static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
+    // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
+    typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
+    // 1/2 pixel edge offset + conservative offset - degenerateTriangle
+    return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Performs calculations to adjust a vector of evaluated edges out
+/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+/// direction. 
+template <typename RT>
+INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
+{
+    int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
+    int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
+    vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Performs calculations to adjust a scalar evaluated edge out
+/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+/// direction. 
+template <typename RT, typename OffsetT>
+INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
+{
+    int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
+    int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
+    return (Edge - manh);
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Perform any needed adjustments to evaluated triangle edges
+template <typename RT, typename EdgeOffsetT>
+struct adjustEdgesFix16
+{
+    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    {
+        static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
+                      "Edge equation expected to be in x.16 fixed point");
+
+        static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
+
+        // need to apply any edge offsets before applying the top-left rule
+        adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
+
+        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Perform top left adjustments to evaluated triangle edges
+template <typename RT>
+struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
+{
+    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    {
+        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
+    }
+};
+
+// max(abs(dz/dx), abs(dz/dy))
+INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
+{
+    /*
+    // evaluate i,j at (0,0)
+    float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
+    float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
+
+    // evaluate i,j at (1,0)
+    float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
+    float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
+
+    // compute dz/dx
+    float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
+    float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
+    float dzdx = abs(d10 - d00);
+
+    // evaluate i,j at (0,1)
+    float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
+    float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
+
+    float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
+    float dzdy = abs(d01 - d00);
+    */
+
+    // optimized version of above
+    float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
+    float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
+
+    return std::max(dzdx, dzdy);
+}
+
+INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
+{
+    if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
+    {
+        return (1.0f / (1 << 24));
+    }
+    else if (pState->depthFormat == R16_UNORM)
+    {
+        return (1.0f / (1 << 16));
+    }
+    else
+    {
+        SWR_ASSERT(pState->depthFormat == R32_FLOAT);
+
+        // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23)
+        float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
+        uint32_t zMaxInt = *(uint32_t*)&zMax;
+        zMaxInt &= 0x7f800000;
+        zMax = *(float*)&zMaxInt;
+
+        return zMax * (1.0f / (1 << 23));
+    }
+}
+
+INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
+{
+    if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
+    {
+        return 0.0f;
+    }
+
+    float scale = pState->slopeScaledDepthBias;
+    if (scale != 0.0f)
+    {
+        scale *= ComputeMaxDepthSlope(pTri);
+    }
+
+    float bias = pState->depthBias;
+    if (!pState->depthBiasPreAdjusted)
+    {
+        bias *= ComputeBiasFactor(pState, pTri, z);
+    }
+    bias += scale;
+
+    if (pState->depthBiasClamp > 0.0f)
+    {
+        bias = std::min(bias, pState->depthBiasClamp);
+    }
+    else if (pState->depthBiasClamp < 0.0f)
+    {
+        bias = std::max(bias, pState->depthBiasClamp);
+    }
+
+    return bias;
+}
+
+// Prevent DCE by writing coverage mask from rasterizer to volatile
+#if KNOB_ENABLE_TOSS_POINTS
+__declspec(thread) volatile uint64_t gToss;
+#endif
+
+static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
+// try to avoid _chkstk insertions; make this thread local
+static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
+
+INLINE // Fills `edge` from the integer A/B edge coefficients: stores them and precomputes the quad / raster tile stepping data
+void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
+{
+    edge.a = a;
+    edge.b = b;
+
+    // compute constant steps to adjacent quads (2 * FIXED_POINT_SCALE apart); products are formed in 64-bit integers before conversion to double
+    edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
+    edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
+
+    // compute constant steps to adjacent raster tiles (KNOB_TILE_X_DIM / KNOB_TILE_Y_DIM pixels apart, in fixed point)
+    edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
+    edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
+
+    // compute quad offsets; _mm256_set_pd lists lanes high-to-low, so lanes 0..3 hold the (x, y) pixel offsets (0,0), (1,0), (0,1), (1,1)
+    const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
+    const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
+
+    __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
+    __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
+    edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
+
+    // compute raster tile offsets; lanes 0..3 hold the offsets (0,0), (W-1,0), (0,H-1), (W-1,H-1) with W/H = KNOB_TILE_X_DIM/KNOB_TILE_Y_DIM
+    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
+    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
+
+    __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
+    __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
+    edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
+}
+
+INLINE // Overload taking the two end points of an edge; derives the A/B coefficients from them and forwards to the integer overload
+void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
+{
+    ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); // a = p0.y - p1.y, b = p1.x - p0.x
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary template for UpdateEdgeMasks. Adds the per-edge tile
+/// sample bounding box offsets to the three evaluated edges and extracts
+/// the sign bits of the sums into the corresponding edge masks
+/// @tparam NumSamplesT: multisample count
+template <typename NumSamplesT>
+INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
+                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
+{
+    // evaluate every edge equation at the tile multisample bounding box before any mask is written
+    const __m256d vBboxEdge0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
+    const __m256d vBboxEdge1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
+    const __m256d vBboxEdge2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
+    // one mask bit per lane, taken from the sign bit of that lane
+    mask0 = _mm256_movemask_pd(vBboxEdge0);
+    mask1 = _mm256_movemask_pd(vBboxEdge1);
+    mask2 = _mm256_movemask_pd(vBboxEdge2);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
+/// when only rasterizing a single coverage test point. The tile bbox argument is ignored; the masks are the sign bits of the evaluated edges themselves
+template <>
+INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
+                                           int32_t &mask0, int32_t &mask1, int32_t &mask2)
+{
+    mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
+    mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
+    mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @struct ComputeScissorEdges
+/// @brief Primary template definition. Allows the constructor to be invoked
+/// generically; its body is empty and every argument is ignored. The partial
+/// specializations below (RasterScissorEdgesT == std::true_type) do the actual work
+/// @tparam RasterScissorEdgesT: is scissor enabled?
+/// @tparam IsConservativeT: is conservative rast enabled?
+/// @tparam RT: rasterizer traits
+template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
+struct ComputeScissorEdges
+{
+    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 
+                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial 
+/// specialization. Instantiated when conservative rast and scissor are enabled
+template <typename RT>
+struct ComputeScissorEdges<std::true_type, std::true_type, RT>
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Intersect tri bbox with scissor, compute scissor edge vectors (rastEdges[3..6]),
+    /// evaluate the edge equations at (x, y) into vEdgeFix16[3..6] and offset them away from pixel center.
+    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
+                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
+    {
+        // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
+        SWR_RECT scissor;
+        scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
+        scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
+        scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
+        scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
+
+        POS topLeft{scissor.xmin, scissor.ymin};
+        POS bottomLeft{scissor.xmin, scissor.ymax};
+        POS topRight{scissor.xmax, scissor.ymin};
+        POS bottomRight{scissor.xmax, scissor.ymax};
+
+        // construct 4 scissor edges in ccw direction: 3 = left, 4 = bottom, 5 = right, 6 = top
+        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
+        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
+        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
+        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
+        // evaluate each scissor edge at (x, y), relative to the corner the edge starts from, and broadcast to all lanes
+        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
+        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
+        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
+        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
+
+        // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
+        adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
+        adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
+        adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
+        adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
+
+        // Upper left rule for scissor: the left (3) and top (6) edges are biased by -1
+        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
+        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial 
+/// specialization. Instantiated when scissor is enabled and conservative rast
+/// is disabled.
+template <typename RT>
+struct ComputeScissorEdges<std::true_type, std::false_type, RT>
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Compute scissor edge vectors (rastEdges[3..6]) and evaluate the edge equations at (x, y) into vEdgeFix16[3..6]; the triangle bbox argument is ignored
+    INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
+                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
+    {
+        const SWR_RECT &scissor = scissorBBox;
+        POS topLeft{scissor.xmin, scissor.ymin};
+        POS bottomLeft{scissor.xmin, scissor.ymax};
+        POS topRight{scissor.xmax, scissor.ymin};
+        POS bottomRight{scissor.xmax, scissor.ymax};
+
+        // construct 4 scissor edges in ccw direction: 3 = left, 4 = bottom, 5 = right, 6 = top
+        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
+        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
+        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
+        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
+        // evaluate each scissor edge at (x, y), relative to the corner the edge starts from, and broadcast to all lanes
+        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
+        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
+        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
+        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
+
+        // Upper left rule for scissor: the left (3) and top (6) edges are biased by -1
+        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
+        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for TrivialRejectTest. Should
+/// never be called, but TemplateUnroller instantiates a few unused values,
+/// so it calls a runtime assert instead of a static_assert. All three mask arguments are ignored.
+template <typename ValidEdgeMaskT>
+INLINE bool TrivialRejectTest(const int, const int, const int)
+{
+    SWR_INVALID("Primary templated function should never be called");
+    return false; // value returned after SWR_INVALID has reported the unexpected call
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief TrivialRejectTest specialization for E0E1ValidT: only the masks of
+/// edge 0 and edge 1 are examined; rejects when either of them is zero
+template <>
+INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
+{
+    return (mask0 == 0) || (mask1 == 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief TrivialRejectTest specialization for E0E2ValidT: only the masks of
+/// edge 0 and edge 2 are examined; rejects when either of them is zero
+template <>
+INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
+{
+    return (mask0 == 0) || (mask2 == 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief TrivialRejectTest specialization for E1E2ValidT: only the masks of
+/// edge 1 and edge 2 are examined; rejects when either of them is zero
+template <>
+INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
+{
+    return (mask1 == 0) || (mask2 == 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief TrivialRejectTest specialization for AllEdgesValidT: all three
+/// primitive edge masks are examined; rejects when any one of them is zero
+template <>
+INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
+{
+    return (mask0 == 0) || (mask1 == 0) || (mask2 == 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
+/// point, so return false and rasterize against conservative BBox. All three mask arguments are ignored.
+template <>
+INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
+{
+    return false; // never trivially rejected
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for TrivialAcceptTest, selected for any ScissorEnableT
+/// without an explicit specialization. Always returns false and ignores all three masks, since it
+/// will only be called for degenerate tris, and as such will never cover the entire raster tile
+template <typename ScissorEnableT>
+INLINE bool TrivialAcceptTest(const int, const int, const int)
+{
+    return false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief std::false_type (ScissorEnableT) specialization for TrivialAcceptTest. Test all
+/// edge masks for a fully covered raster tile: true only when the AND of the three masks equals 0xf
+template <>
+INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
+{
+    return ((mask0 & mask1 & mask2) == 0xf);
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary template for GenerateSVInnerCoverage. Results
+/// in an empty constructor call if SVInnerCoverage isn't requested; every argument is ignored and the coverage mask reference is left untouched
+template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
+struct GenerateSVInnerCoverage
+{
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of GenerateSVInnerCoverage where all edges
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated 
+/// edge values from OuterConservative to InnerConservative and rasterizes; the result is written to innerCoverageMask.
+template <typename RT>
+struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
+{
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
+    {
+        SWR_CONTEXT *pContext = pDC->pContext; // NOTE(review): not referenced by name below; presumably consumed (with workerId) by the AR_BEGIN/AR_END macros -- confirm
+        // build an adjusted value for every start-quad edge through adjustScalarEdge
+        double startQuadEdgesAdj[RT::NumEdgesT::value];
+        for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        {
+            startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
+        }
+
+        // not trivial accept or reject, must rasterize full tile against the adjusted edge values
+        AR_BEGIN(BERasterizePartial, pDC->drawId);
+        innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
+        AR_END(BERasterizePartial, 0);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary template for UpdateEdgeMasksInnerConservative. Results
+/// in an empty constructor call if SVInnerCoverage isn't requested; every argument is ignored and the three masks are left untouched
+template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
+struct UpdateEdgeMasksInnerConservative
+{
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
+                                           const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges 
+/// evaluated at raster tile corners to inner conservative position and 
+/// updates edge masks
+template <typename RT>
+struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
+{
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
+                                           const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
+    {
+        __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
+        // the adjustment below is applied to this local copy of the first three evaluated edges
+        // instead of keeping 2 copies of evaluated edges around, just compensate for the outer 
+        // conservative evaluated edge when adjusting the edge in for inner conservative tests
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
+        // recompute mask0..mask2 from the adjusted copies
+        UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage 
+/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot 
+/// cover an entire raster tile, set mask0 to 0 to force it down the
+/// rasterizePartialTile path. The other two masks and all remaining arguments are ignored.
+template <typename RT, typename ValidEdgeMaskT>
+struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
+{
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
+                                   const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
+    {
+        // set one mask to zero to force the triangle down the rasterizePartialTile path
+        mask0 = 0;
+    }
+};
+
+template <typename RT>
+void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
+#if KNOB_ENABLE_TOSS_POINTS
+    if (KNOB_TOSS_BIN_TRIS)
+    {
+        return;
+    }
+#endif
+    AR_BEGIN(BERasterizeTriangle, pDC->drawId);
+    AR_BEGIN(BETriangleSetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+    const SWR_RASTSTATE &rastState = state.rastState;
+    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
+    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
+
+    __m128 vX, vY, vZ, vRecipW;
+    
+    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
+    // eg: vX = [x0 x1 x2 dc]
+    vX = _mm_load_ps(workDesc.pTriBuffer);
+    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
+    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
+    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
+
+    // convert to fixed point
+    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
+    __m128i vXi = fpToFixedPoint(vX);
+    __m128i vYi = fpToFixedPoint(vY);
+
+    // quantize floating point position to fixed point precision
+    // to prevent attribute creep around the triangle vertices
+    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
+    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
+
+    // triangle setup - A and B edge equation coefs
+    __m128 vA, vB;
+    triangleSetupAB(vX, vY, vA, vB);
+
+    __m128i vAi, vBi;
+    triangleSetupABInt(vXi, vYi, vAi, vBi);
+    
+    // determinant
+    float det = calcDeterminantInt(vAi, vBi);
+
+    // Verts in Pixel Coordinate Space at this point
+    // Det > 0 = CW winding order 
+    // Convert CW triangles to CCW
+    if (det > 0.0)
+    {
+        vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
+        vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
+        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
+        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
+        det = -det;
+    }
+
+    __m128 vC;
+    // Finish triangle setup - C edge coef
+    triangleSetupC(vX, vY, vA, vB, vC);
+
+    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+    {
+        // If we have degenerate edge(s) to rasterize, set I and J coefs 
+        // to 0 for constant interpolation of attributes
+        triDesc.I[0] = 0.0f;
+        triDesc.I[1] = 0.0f;
+        triDesc.I[2] = 0.0f;
+        triDesc.J[0] = 0.0f;
+        triDesc.J[1] = 0.0f;
+        triDesc.J[2] = 0.0f;
+
+        // Degenerate triangles have no area
+        triDesc.recipDet = 0.0f;
+    }
+    else
+    {
+        // only extract coefs for 2 of the barycentrics; the 3rd can be 
+        // determined from the barycentric equation:
+        // i + j + k = 1 <=> k = 1 - j - i
+        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
+        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
+        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
+        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
+        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
+        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
+
+        // compute recipDet, used to calculate barycentric i and j in the backend
+        triDesc.recipDet = 1.0f/det;
+    }
+
+    OSALIGNSIMD(float) oneOverW[4];
+    _mm_store_ps(oneOverW, vRecipW);
+    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
+    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
+    triDesc.OneOverW[2] = oneOverW[2];
+
+    // calculate perspective correct coefs per vertex attrib 
+    float* pPerspAttribs = perspAttribsTLS;
+    float* pAttribs = workDesc.pAttribs;
+    triDesc.pPerspAttribs = pPerspAttribs;
+    triDesc.pAttribs = pAttribs;
+    float *pRecipW = workDesc.pTriBuffer + 12;
+    triDesc.pRecipW = pRecipW;
+    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
+    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
+    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
+    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
+    {
+        __m128 attribA = _mm_load_ps(pAttribs);
+        __m128 attribB = _mm_load_ps(pAttribs+=4);
+        __m128 attribC = _mm_load_ps(pAttribs+=4);
+        pAttribs+=4;
+
+        attribA = _mm_mul_ps(attribA, vOneOverWV0);
+        attribB = _mm_mul_ps(attribB, vOneOverWV1);
+        attribC = _mm_mul_ps(attribC, vOneOverWV2);
+
+        _mm_store_ps(pPerspAttribs, attribA);
+        _mm_store_ps(pPerspAttribs+=4, attribB);
+        _mm_store_ps(pPerspAttribs+=4, attribC);
+        pPerspAttribs+=4;
+    }
+
+    // compute bary Z
+    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
+    OSALIGNSIMD(float) a[4];
+    _mm_store_ps(a, vZ);
+    triDesc.Z[0] = a[0] - a[2];
+    triDesc.Z[1] = a[1] - a[2];
+    triDesc.Z[2] = a[2];
+        
+    // add depth bias
+    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
+
+    // Calc bounding box of triangle
+    OSALIGNSIMD(SWR_RECT) bbox;
+    calcBoundingBoxInt(vXi, vYi, bbox);
+
+    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+
+    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+    {
+        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
+        bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
+        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
+                   "Conservative rast degenerate handling requires a valid scissor rect");
+    }
+
+    // Intersect with scissor/viewport
+    OSALIGNSIMD(SWR_RECT) intersect;
+    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
+    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
+    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
+    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
+
+    triDesc.triFlags = workDesc.triFlags;
+
+    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
+    uint32_t macroX, macroY;
+    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
+    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
+    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
+    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
+    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
+
+    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
+    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
+    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
+    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
+
+    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
+
+    AR_END(BETriangleSetup, 0);
+
+    // update triangle desc
+    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t numTilesX = maxTileX - minTileX + 1;
+    uint32_t numTilesY = maxTileY - minTileY + 1;
+
+    if (numTilesX == 0 || numTilesY == 0) 
+    {
+        RDTSC_EVENT(BEEmptyTriangle, 1, 0);
+        AR_END(BERasterizeTriangle, 1);
+        return;
+    }
+
+    AR_BEGIN(BEStepSetup, pDC->drawId);
+
+    // Step to pixel center of top-left pixel of the triangle bbox
+    // Align intersect bbox (top/left) to raster tile's (top/left).
+    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
+    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
+
+    // convenience typedef
+    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
+
+    // single sample rasterization evaluates edges at pixel center,
+    // multisample evaluates edges UL pixel corner and steps to each sample position
+    if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+    {
+        // Add 0.5, in fixed point, to offset to pixel center
+        x += (FIXED_POINT_SCALE / 2);
+        y += (FIXED_POINT_SCALE / 2);
+    }
+
+    __m128i vTopLeftX = _mm_set1_epi32(x);
+    __m128i vTopLeftY = _mm_set1_epi32(y);
+
+    // evaluate edge equations at top-left pixel using 64bit math
+    // 
+    // line = Ax + By + C
+    // solving for C:
+    // C = -Ax - By
+    // we know x0 and y0 are on the line; plug them in:
+    // C = -Ax0 - By0
+    // plug C back into line equation:
+    // line = Ax + By - Ax0 - By0
+    // line = A(x - x0) + B(y - y0)
+    // dX = (x-x0), dY = (y-y0)
+    // so all this simplifies to 
+    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
+
+    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
+    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
+
+    // evaluate A(dx) and B(dY) for all points
+    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
+    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
+    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
+    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
+
+    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
+    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
+    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
+
+    // apply any edge adjustments(top-left, crast, etc)
+    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
+
+    // broadcast respective edge results to all lanes
+    double* pEdge = (double*)&vEdge;
+    __m256d vEdgeFix16[7];
+    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
+    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
+    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
+
+    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
+    _mm_store_si128((__m128i*)aAi, vAi);
+    _mm_store_si128((__m128i*)aBi, vBi);
+    EDGE rastEdges[RT::NumEdgesT::value];
+
+    // Compute and store triangle edge data
+    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
+    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
+    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
+
+    // Compute and store scissor edge data if the scissor needs to be rasterized
+    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
+                       (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
+
+    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
+    // used for testing whether the entire raster tile is inside a triangle
+    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+    {
+        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
+    }
+
+    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
+    // step sample positions to the raster tile bbox of multisample points
+    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
+    //                             |      |
+    //                             |      |
+    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
+    __m256d vEdgeTileBbox[3];
+    if (NumCoverageSamplesT::value > 1)
+    {
+        const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
+        const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
+        const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
+
+        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
+        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
+
+        // step edge equation tests from Tile
+        // used for testing whether the entire raster tile is inside a triangle
+        for (uint32_t e = 0; e < 3; ++e)
+        {
+            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
+            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
+            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+
+            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
+            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
+        }
+    }
+
+    AR_END(BEStepSetup, 0);
+
+    uint32_t tY = minTileY;
+    uint32_t tX = minTileX;
+    uint32_t maxY = maxTileY;
+    uint32_t maxX = maxTileX;
+
+    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
+    GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+    currentRenderBufferRow = renderBuffers;
+
+    // rasterize and generate coverage masks per sample
+    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
+    {
+        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
+        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        {
+            vStartOfRowEdge[e] = vEdgeFix16[e];
+        }
+
+        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
+        {
+            triDesc.anyCoveredSamples = 0;
+
+            // is the corner of the edge outside of the raster tile? (vEdge < 0)
+            int mask0, mask1, mask2;
+            UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
+
+            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
+            {
+                // trivial reject, at least one edge has all 4 corners of raster tile outside
+                bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
+
+                if (!trivialReject)
+                {
+                    // trivial accept mask
+                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
+
+                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
+                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
+                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
+
+                    // @todo Make this a bit smarter to allow use of trivial accept when:
+                    //   1) scissor/vp intersection rect is raster tile aligned
+                    //   2) raster tile is entirely within scissor/vp intersection rect
+                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
+                    {
+                        // trivial accept, all 4 corners of all 3 edges are negative 
+                        // i.e. raster tile completely inside triangle
+                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
+                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
+                        {
+                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
+                        }
+                        RDTSC_EVENT(BETrivialAccept, 1, 0);
+                    }
+                    else
+                    {
+                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
+                        if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+                        {
+                            // should get optimized out for single sample case (global value numbering or copy propagation)
+                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+                            {
+                                vEdgeAtSample[e] = vEdgeFix16[e];
+                            }
+                        }
+                        else
+                        {
+                            const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
+                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
+                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
+                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
+                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
+
+                            // step edge equation tests from UL tile corner to pixel sample position
+                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+                            {
+                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
+                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
+                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
+                            }
+                        }
+
+                        double startQuadEdges[RT::NumEdgesT::value];
+                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+                        {
+                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
+                        }
+
+                        // not trivial accept or reject, must rasterize full tile
+                        AR_BEGIN(BERasterizePartial, pDC->drawId);
+                        triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
+                        AR_END(BERasterizePartial, 0);
+
+                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
+                        
+                        // Output SV InnerCoverage, if needed
+                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
+                    }
+                }
+                else
+                {
+                    // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
+                    if(NumCoverageSamplesT::value > 1)
+                    {
+                        triDesc.coverageMask[sampleNum] = 0;
+                    }
+                    RDTSC_EVENT(BETrivialReject, 1, 0);
+                }
+            }
+
+#if KNOB_ENABLE_TOSS_POINTS
+            if(KNOB_TOSS_RS)
+            {
+                gToss = triDesc.coverageMask[0];
+            }
+            else
+#endif
+            if(triDesc.anyCoveredSamples)
+            {
+                // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
+                // copy conservative coverage result to all samples
+                if(RT::IsConservativeT::value)
+                {
+                    auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
+                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
+                }
+
+                AR_BEGIN(BEPixelBackend, pDC->drawId);
+                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
+                AR_END(BEPixelBackend, 0);
+            }
+
+            // step to the next tile in X
+            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+            {
+                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
+            }
+            StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
+        }
+
+        // step to the next tile in Y
+        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        {
+            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
+        }
+        StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
+    }
+
+    AR_END(BERasterizeTriangle, 1);
+}
+
+// Get pointers to hot tile memory for color RT, depth, stencil: fetches each enabled hot tile of macrotile `macroID`, marks it dirty, and stores in renderBuffers the address of raster tile (tileX, tileY) within it
+template <uint32_t numSamples>
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
+{
+    const API_STATE& state = GetApiState(pDC);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    uint32_t mx, my;
+    MacroTileMgr::getTileIndices(macroID, mx, my);
+    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;    // rebase the tile coordinates so they are relative to this macrotile
+    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
+
+    // compute tile offset for active hottile buffers (color format; depth and stencil recompute their own below)
+    const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;    // bytes per macrotile row
+    uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
+    offset*=numSamples;     // scale the single-sample offset by the sample count
+
+    unsigned long rtSlot = 0;
+    uint32_t colorHottileEnableMask = state.colorHottileEnable;     // local copy; bits are cleared as slots are processed
+    while(_BitScanForward(&rtSlot, colorHottileEnableMask))
+    {
+        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, 
+            numSamples, renderTargetArrayIndex);    // NOTE(review): the literal `true` is presumably the "create" flag - confirm against GetHotTile
+        pColor->state = HOTTILE_DIRTY;
+        renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;    // NOTE(review): unlike depth/stencil, pBuffer is not asserted non-null here
+        // clear the bit of the slot just handled so the scan moves on to the next enabled render target
+        colorHottileEnableMask &= ~(1 << rtSlot);
+    }
+    if(state.depthHottileEnable)
+    {
+        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;    // shadows the color pitch above
+        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);    // shadows the color offset above
+        offset*=numSamples;
+        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, 
+            numSamples, renderTargetArrayIndex);
+        pDepth->state = HOTTILE_DIRTY;
+        SWR_ASSERT(pDepth->pBuffer != nullptr);
+        renderBuffers.pDepth = pDepth->pBuffer + offset;
+    }
+    if(state.stencilHottileEnable)
+    {
+        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;  // shadows the color pitch above
+        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);  // shadows the color offset above
+        offset*=numSamples;
+        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, 
+            numSamples, renderTargetArrayIndex);
+        pStencil->state = HOTTILE_DIRTY;
+        SWR_ASSERT(pStencil->pBuffer != nullptr);
+        renderBuffers.pStencil = pStencil->pBuffer + offset;
+    }
+}
+// Advance every output pointer in `buffers` by one raster tile in X, using the per-surface step constants supplied by RT
+template <typename RT>
+INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
+{
+    DWORD rt = 0;
+    while (_BitScanForward(&rt, colorHotTileMask))      // one iteration per set bit, i.e. per enabled color render target
+    {
+        colorHotTileMask &= ~(1 << rt);                 // mask is a by-value copy; clear the bit just visited
+        buffers.pColor[rt] += RT::colorRasterTileStep;
+    }
+    // depth and stencil pointers are stepped unconditionally; they do not depend on colorHotTileMask
+    buffers.pDepth += RT::depthRasterTileStep;
+    buffers.pStencil += RT::stencilRasterTileStep;
+}
+// Advance the row-start pointers in `startBufferRow` by one raster tile row and reset `buffers` to the start of that new row
+template <typename RT>
+INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
+{
+    DWORD rt = 0;
+    while (_BitScanForward(&rt, colorHotTileMask))      // one iteration per set bit, i.e. per enabled color render target
+    {
+        colorHotTileMask &= ~(1 << rt);                 // mask is a by-value copy; clear the bit just visited
+        startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
+        buffers.pColor[rt] = startBufferRow.pColor[rt]; // current pointer restarts at the beginning of the new row
+    }
+    startBufferRow.pDepth += RT::depthRasterTileRowStep;    // depth and stencil are stepped unconditionally
+    buffers.pDepth = startBufferRow.pDepth;
+
+    startBufferRow.pStencil += RT::stencilRasterTileRowStep;
+    buffers.pStencil = startBufferRow.pStencil;
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 396c19e5..7af3f82 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -28,7 +28,7 @@
 #pragma once
 
 #include "common/formats.h"
-#include "common/simdintrin.h"
+#include "common/intrin.h"
 #include <functional>
 #include <algorithm>
 
@@ -175,25 +175,36 @@
 /////////////////////////////////////////////////////////////////////////
 /// simdvertex
 /// @brief Defines a vertex element that holds all the data for SIMD vertices.
-///        Contains position in clip space, hardcoded to attribute 0,
-///        space for up to 32 attributes, as well as any SGV values generated
-///        by the pipeline
+///        Contains space for position, SGV, and 32 generic attributes
 /////////////////////////////////////////////////////////////////////////
-#define VERTEX_POSITION_SLOT 0
-#define VERTEX_ATTRIB_START_SLOT 1
-#define VERTEX_ATTRIB_END_SLOT 32
-#define VERTEX_RTAI_SLOT 33         // GS writes RenderTargetArrayIndex here
-#define VERTEX_PRIMID_SLOT 34       // GS writes PrimId here
-#define VERTEX_CLIPCULL_DIST_LO_SLOT 35 // VS writes lower 4 clip/cull dist
-#define VERTEX_CLIPCULL_DIST_HI_SLOT 36 // VS writes upper 4 clip/cull dist
-#define VERTEX_POINT_SIZE_SLOT 37       // VS writes point size here
-#define VERTEX_VIEWPORT_ARRAY_INDEX_SLOT 38
+enum SWR_VTX_SLOTS // slot indices into the attrib[] arrays of the vertex structs
+{
+    VERTEX_SGV_SLOT                 = 0,        // slot 0; the indented *_COMP values are presumably component indices within it - confirm
+        VERTEX_SGV_RTAI_COMP        = 0,        // presumably render target array index (inferred from the name)
+        VERTEX_SGV_VAI_COMP         = 1,        // presumably viewport array index (inferred from the name)
+        VERTEX_SGV_POINT_SIZE_COMP  = 2,        // presumably point size (inferred from the name)
+    VERTEX_POSITION_SLOT            = 1,
+    VERTEX_POSITION_END_SLOT        = 1,        // last position slot; every following slot is defined relative to it
+    VERTEX_CLIPCULL_DIST_LO_SLOT    = (1 + VERTEX_POSITION_END_SLOT), // = 2; VS writes lower 4 clip/cull dist
+    VERTEX_CLIPCULL_DIST_HI_SLOT    = (2 + VERTEX_POSITION_END_SLOT), // = 3; VS writes upper 4 clip/cull dist
+    VERTEX_ATTRIB_START_SLOT        = (3 + VERTEX_POSITION_END_SLOT), // = 4; first generic attribute slot
+    VERTEX_ATTRIB_END_SLOT          = (34 + VERTEX_POSITION_END_SLOT), // = 35; last generic attribute slot (4..35 is 32 slots)
+    SWR_VTX_NUM_SLOTS               = (1 + VERTEX_ATTRIB_END_SLOT) // = 36; total slot count, used to size attrib[]
+};
+
 // SoAoSoA
 struct simdvertex
 {
-    simdvector    attrib[KNOB_NUM_ATTRIBUTES];
+    simdvector      attrib[SWR_VTX_NUM_SLOTS];
 };
 
+#if ENABLE_AVX512_SIMD16
+struct simd16vertex
+{
+    simd16vector    attrib[SWR_VTX_NUM_SLOTS];
+};
+
+#endif
 //////////////////////////////////////////////////////////////////////////
 /// SWR_VS_CONTEXT
 /// @brief Input to vertex shader
@@ -226,7 +237,7 @@
 
 struct ScalarCPoint
 {
-    ScalarAttrib attrib[KNOB_NUM_ATTRIBUTES];
+    ScalarAttrib attrib[SWR_VTX_NUM_SLOTS];
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -329,11 +340,10 @@
     simdvector shaded[SWR_NUM_RENDERTARGETS];
                                 // OUT: result color per rendertarget
 
-    uint32_t frontFace;         // IN: front- 1, back- 0
-    uint32_t primID;            // IN: primitive ID
-    uint32_t sampleIndex;       // IN: sampleIndex
-
-    uint32_t rasterizerSampleCount; // IN: sample count used by the rasterizer
+    uint32_t frontFace;                 // IN: front- 1, back- 0
+    uint32_t sampleIndex;               // IN: sampleIndex
+    uint32_t renderTargetArrayIndex;    // IN: render target array index from GS
+    uint32_t rasterizerSampleCount;     // IN: sample count used by the rasterizer
 
     uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; // IN: Pointers to render target hottiles
 };
@@ -372,6 +382,11 @@
     uint8_t* pTGSM;  // Thread Group Shared Memory pointer.
 
     uint8_t* pSpillFillBuffer;  // Spill/fill buffer for barrier support
+
+    uint8_t* pScratchSpace;     // Pointer to scratch space buffer used by the shader, shader is responsible
+                                // for subdividing scratch space per instance/simd
+
+    uint32_t scratchSpacePerSimd; // Scratch space per work item x SIMD_WIDTH
 };
 
 // enums
@@ -658,6 +673,9 @@
     // Number of attributes, including position, per vertex that are streamed out.
     // This should match number of bits in stream mask.
     uint32_t streamNumEntries[MAX_SO_STREAMS];
+
+    // Offset to the start of the attributes of the input vertices, in simdvector units
+    uint32_t vertexAttribOffset[MAX_SO_STREAMS];
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -695,15 +713,6 @@
     // instance count
     uint32_t instanceCount;
 
-    // geometry shader emits renderTargetArrayIndex
-    bool emitsRenderTargetArrayIndex;
-
-    // geometry shader emits PrimitiveID
-    bool emitsPrimitiveID;
-
-    // geometry shader emits ViewportArrayIndex
-    bool emitsViewportArrayIndex;
-
     // if true, geometry shader emits a single stream, with separate cut buffer.
     // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer
     // to map vertices to streams
@@ -712,6 +721,9 @@
     // when single stream is enabled, singleStreamID dictates which stream is being output.
     // field ignored if isSingleStream is false
     uint32_t singleStreamID;
+
+    // Offset to the start of the attributes of the input vertices, in simdvector units
+    uint32_t vertexAttribOffset;
 };
 
 
@@ -767,6 +779,9 @@
     uint32_t                numHsInputAttribs;
     uint32_t                numHsOutputAttribs;
     uint32_t                numDsOutputAttribs;
+
+    // Offset to the start of the attributes of the input vertices, in simdvector units
+    uint32_t vertexAttribOffset;
 };
 
 // output merger state
@@ -789,6 +804,13 @@
     SWR_MULTISAMPLE_TYPE_COUNT
 };
 
+INLINE uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount) // @llvm_func_start
+{   // Returns the sample count for a SWR_MULTISAMPLE_COUNT enumerator by table lookup
+    static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_COUNT] {1, 2, 4, 8, 16};     // indexed by the enumerator's integer value
+    assert(sampleCount < SWR_MULTISAMPLE_TYPE_COUNT);   // debug-only guard: with NDEBUG an out-of-range value is not caught
+    return sampleCountLUT[sampleCount];
+} // @llvm_func_end
+
 struct SWR_BLEND_STATE
 {
     // constant blend factor color in RGBA float
@@ -844,6 +866,10 @@
         uint32_t bits;
     } provokingVertex;
     uint32_t topologyProvokingVertex; // provoking vertex for the draw topology
+
+    // Size of a vertex in simdvector units. Should be sized to the 
+    // maximum of the input/output of the vertex shader.
+    uint32_t vsVertexSize;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -938,43 +964,13 @@
     INLINE const __m128i& TileSampleOffsetsX() const { return tileSampleOffsetsX; }; // @llvm_func
     INLINE const __m128i& TileSampleOffsetsY() const { return tileSampleOffsetsY; }; // @llvm_func
     
-    INLINE void PrecalcSampleData(int numSamples)   // @llvm_func_start
-    {                                                                      
-        for(int i = 0; i < numSamples; i++)
-        {
-            _vXi[i] = _mm_set1_epi32(_xi[i]);
-            _vYi[i] = _mm_set1_epi32(_yi[i]);
-            _vX[i] = _simd_set1_ps(_x[i]);
-            _vY[i] = _simd_set1_ps(_y[i]);
-        }
-        // precalculate the raster tile BB for the rasterizer.
-        CalcTileSampleOffsets(numSamples);                                 
-    } // @llvm_func_end
-
+    INLINE void PrecalcSampleData(int numSamples); //@llvm_func
 
 private:
     template <typename MaskT>
-    INLINE __m128i expandThenBlend4(uint32_t* min, uint32_t* max) // @llvm_func_start
-    {
-        __m128i vMin = _mm_set1_epi32(*min);
-        __m128i vMax = _mm_set1_epi32(*max);
-        return _simd_blend4_epi32<MaskT::value>(vMin, vMax);
-    }  // @llvm_func_end
+    INLINE __m128i expandThenBlend4(uint32_t* min, uint32_t* max); // @llvm_func
+    INLINE void CalcTileSampleOffsets(int numSamples);   // @llvm_func
 
-    INLINE void CalcTileSampleOffsets(int numSamples)   // @llvm_func_start
-    {
-        auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]);
-        auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]);
-        using xMask = std::integral_constant<int, 0xA>;
-        // BR(max),    BL(min),    UR(max),    UL(min)
-        tileSampleOffsetsX = expandThenBlend4<xMask>(minXi, maxXi);
-
-        auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]);
-        auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]);
-        using yMask = std::integral_constant<int, 0xC>;
-        // BR(max),    BL(min),    UR(max),    UL(min)
-        tileSampleOffsetsY = expandThenBlend4<yMask>(minYi, maxYi);
-    };  // @llvm_func_end
     // scalar sample values
     uint32_t _xi[SWR_MAX_NUM_MULTISAMPLES];
     uint32_t _yi[SWR_MAX_NUM_MULTISAMPLES];
@@ -987,8 +983,7 @@
     simdscalar _vX[SWR_MAX_NUM_MULTISAMPLES];
     simdscalar _vY[SWR_MAX_NUM_MULTISAMPLES];
     __m128i tileSampleOffsetsX;
-    __m128i tileSampleOffsetsY;    
-
+    __m128i tileSampleOffsetsY;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -1029,6 +1024,7 @@
     uint8_t clipDistanceMask;
 };
 
+
 enum SWR_CONSTANT_SOURCE
 {
     SWR_CONSTANT_SOURCE_CONST_0000,
@@ -1057,6 +1053,12 @@
                                         // setting up attributes for the backend, otherwise
                                         // all attributes up to numAttributes will be sent
     SWR_ATTRIB_SWIZZLE swizzleMap[32];
+
+    bool readRenderTargetArrayIndex;    // Forward render target array index from last FE stage to the backend
+    bool readViewportArrayIndex;        // Read viewport array index from last FE stage during binning
+    
+    // Offset to the start of the attributes of the input vertices, in simdvector units
+    uint32_t vertexAttribOffset;
 };
 
 
@@ -1137,11 +1139,12 @@
     uint32_t writesODepth           : 1;    // pixel shader writes to depth
     uint32_t usesSourceDepth        : 1;    // pixel shader reads depth
     uint32_t shadingRate            : 2;    // shading per pixel / sample / coarse pixel
-    uint32_t numRenderTargets       : 4;    // number of render target outputs in use (0-8)
     uint32_t posOffset              : 2;    // type of offset (none, sample, centroid) to add to pixel position
     uint32_t barycentricsMask       : 3;    // which type(s) of barycentric coords does the PS interpolate attributes with
     uint32_t usesUAV                : 1;    // pixel shader accesses UAV 
     uint32_t forceEarlyZ            : 1;    // force execution of early depth/stencil test
+
+    uint8_t renderTargetMask;               // Mask of render targets written
 };
 
 // depth bounds state
diff --git a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h
new file mode 100644
index 0000000..eaf0094
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h
@@ -0,0 +1,68 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file state_funcs.h
+*
+* @brief Definitions for API state - complex function implementation.
+*
+******************************************************************************/
+#pragma once
+
+#include "core/state.h"
+#include "common/simdintrin.h"
+
+// Broadcast *min and *max across all four 32-bit lanes, then blend the two vectors using the compile-time mask MaskT::value
+template <typename MaskT>
+INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* max)
+{
+    __m128i vMin = _mm_set1_epi32(*min);    // both pointers are dereferenced unconditionally; callers must pass valid addresses
+    __m128i vMax = _mm_set1_epi32(*max);
+    return _simd_blend4_epi32<MaskT::value>(vMin, vMax);    // presumably a set mask bit picks the vMax lane - confirm against _simd_blend4_epi32
+}
+// Broadcast the scalar positions of the first numSamples samples into their vector copies, then recompute the tile sample offsets
+INLINE void SWR_MULTISAMPLE_POS::PrecalcSampleData(int numSamples)
+{
+    for(int i = 0; i < numSamples; i++)     // no bounds check here: numSamples must not exceed the array sizes
+    {
+        _vXi[i] = _mm_set1_epi32(_xi[i]);   // _xi/_yi: broadcast to all 32-bit integer lanes
+        _vYi[i] = _mm_set1_epi32(_yi[i]);
+        _vX[i] = _simd_set1_ps(_x[i]);      // _x/_y: broadcast to all float lanes of a simdscalar
+        _vY[i] = _simd_set1_ps(_y[i]);
+    }
+    // precalculate the raster tile BB for the rasterizer.
+    CalcTileSampleOffsets(numSamples);                                 
+}
+// Find the min/max integer sample offsets among the first numSamples samples and expand them into tileSampleOffsetsX/Y (one lane per tile corner)
+INLINE void SWR_MULTISAMPLE_POS::CalcTileSampleOffsets(int numSamples)
+{
+    auto minXi = std::min_element(std::begin(_xi), std::begin(_xi) + numSamples);
+    auto maxXi = std::max_element(std::begin(_xi), std::begin(_xi) + numSamples);
+    using xMask = std::integral_constant<int, 0xA>;     // bits 3 and 1 set
+    // BR(max),    BL(min),    UR(max),    UL(min)      (listed from lane 3 down to lane 0)
+    tileSampleOffsetsX = expandThenBlend4<xMask>(minXi, maxXi);
+
+    auto minYi = std::min_element(std::begin(_yi), std::begin(_yi) + numSamples);
+    auto maxYi = std::max_element(std::begin(_yi), std::begin(_yi) + numSamples);
+    using yMask = std::integral_constant<int, 0xC>;     // bits 3 and 2 set
+    // BR(max),    BL(max),    UR(min),    UL(min)      (same lane order as X; bottom corners take the max)
+    tileSampleOffsetsY = expandThenBlend4<yMask>(minYi, maxYi);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 4c22317..b704d23 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -393,7 +393,7 @@
 // inlined-only version
 INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 {
-    int32_t result = InterlockedDecrement((volatile LONG*)&pDC->threadsDone);
+    int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
     SWR_ASSERT(result >= 0);
 
     AR_FLUSH(pDC->drawId);
@@ -639,7 +639,7 @@
     _mm_mfence();
     pDC->doneFE = true;
 
-    InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
+    InterlockedDecrement(&pContext->drawsOutstandingFE);
 }
 
 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
@@ -726,10 +726,11 @@
         if (queue.getNumQueued() > 0)
         {
             void* pSpillFillBuffer = nullptr;
+            void* pScratchSpace = nullptr;
             uint32_t threadGroupId = 0;
             while (queue.getWork(threadGroupId))
             {
-                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer);
+                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
                 queue.finishedWork();
             }
 
@@ -747,7 +748,20 @@
     uint32_t threadId = pThreadData->threadId;
     uint32_t workerId = pThreadData->workerId;
 
-    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); 
+    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
+
+    {
+        char threadName[64];
+        sprintf_s(threadName,
+#if defined(_WIN32)
+                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
+#else
+                  // linux pthread name limited to 16 chars (including \0)
+                  "w%03d-n%d-c%03d-t%d",
+#endif
+            workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId);
+        SetCurrentThreadName(threadName);
+    }
 
     RDTSC_INIT(threadId);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index eb60eb4..a6c54ab 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -100,7 +100,7 @@
         {
             uint32_t size = numSamples * mHotTileSize[attachment];
             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
             hotTile.state = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
@@ -124,7 +124,7 @@
 
             uint32_t size = numSamples * mHotTileSize[attachment];
             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
             hotTile.state = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
         }
@@ -194,7 +194,7 @@
         if (create)
         {
             uint32_t size = numSamples * mHotTileSize[attachment];
-            hotTile.pBuffer = (uint8_t*)AlignedMalloc(size, KNOB_SIMD_WIDTH * 4);
+            hotTile.pBuffer = (uint8_t*)AlignedMalloc(size, 64);
             hotTile.state = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
             hotTile.renderTargetArrayIndex = 0;
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index bfff339..8f1cd21 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -151,7 +151,7 @@
     OSALIGNLINE(volatile LONG) mWorkItemsConsumed { 0 };
 };
 
-typedef void(*PFN_DISPATCH)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
+typedef void(*PFN_DISPATCH)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace);
 
 //////////////////////////////////////////////////////////////////////////
 /// DispatchQueue - work queue for dispatch
@@ -231,10 +231,10 @@
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Dispatches a unit of work
-    void dispatch(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
+    void dispatch(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
     {
         SWR_ASSERT(mPfnDispatch != nullptr);
-        mPfnDispatch(pDC, workerId, threadGroupId, pSpillFillBuffer);
+        mPfnDispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
     }
 
     void* mpTaskData{ nullptr };        // The API thread will set this up and the callback task function will interpet this.
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index 660a63f..2884007 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -31,50 +31,10 @@
 #include <type_traits>
 #include <algorithm>
 #include "common/os.h"
-#include "common/simdintrin.h"
+#include "common/intrin.h"
 #include "common/swr_assert.h"
 #include "core/api.h"
 
-#if defined(_WIN64) || defined(__x86_64__)
-#define _MM_INSERT_EPI64 _mm_insert_epi64
-#define _MM_EXTRACT_EPI64 _mm_extract_epi64
-#else
-INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
-{
-    OSALIGNLINE(uint32_t) elems[4];
-    _mm_store_si128((__m128i*)elems, a);
-    if (ndx == 0)
-    {
-        uint64_t foo = elems[0];
-        foo |= (uint64_t)elems[1] << 32;
-        return foo;
-    } 
-    else
-    {
-        uint64_t foo = elems[2];
-        foo |= (uint64_t)elems[3] << 32;
-        return foo;
-    }
-}
-
-INLINE __m128i  _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
-{
-    OSALIGNLINE(int64_t) elems[2];
-    _mm_store_si128((__m128i*)elems, a);
-    if (ndx == 0)
-    {
-        elems[0] = b;
-    }
-    else
-    {
-        elems[1] = b;
-    }
-    __m128i out;
-    out = _mm_load_si128((const __m128i*)elems);
-    return out;
-}
-#endif
-
 struct simdBBox
 {
     simdscalari ymin;
@@ -91,857 +51,8 @@
     simd16scalari xmin;
     simd16scalari xmax;
 };
-
-#endif
-INLINE
-void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
-{
-    __m128i row0i = _mm_castps_si128(row0);
-    __m128i row1i = _mm_castps_si128(row1);
-    __m128i row2i = _mm_castps_si128(row2);
-    __m128i row3i = _mm_castps_si128(row3);
-
-    __m128i vTemp = row2i;
-    row2i = _mm_unpacklo_epi32(row2i, row3i);
-    vTemp = _mm_unpackhi_epi32(vTemp, row3i);
-
-    row3i = row0i;
-    row0i = _mm_unpacklo_epi32(row0i, row1i);
-    row3i = _mm_unpackhi_epi32(row3i, row1i);
-
-    row1i = row0i;
-    row0i = _mm_unpacklo_epi64(row0i, row2i);
-    row1i = _mm_unpackhi_epi64(row1i, row2i);
-
-    row2i = row3i;
-    row2i = _mm_unpacklo_epi64(row2i, vTemp);
-    row3i = _mm_unpackhi_epi64(row3i, vTemp);
-
-    row0 = _mm_castsi128_ps(row0i);
-    row1 = _mm_castsi128_ps(row1i);
-    row2 = _mm_castsi128_ps(row2i);
-    row3 = _mm_castsi128_ps(row3i);
-}
-
-INLINE
-void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
-{
-    __m128i vTemp = row2;
-    row2 = _mm_unpacklo_epi32(row2, row3);
-    vTemp = _mm_unpackhi_epi32(vTemp, row3);
-
-    row3 = row0;
-    row0 = _mm_unpacklo_epi32(row0, row1);
-    row3 = _mm_unpackhi_epi32(row3, row1);
-
-    row1 = row0;
-    row0 = _mm_unpacklo_epi64(row0, row2);
-    row1 = _mm_unpackhi_epi64(row1, row2);
-
-    row2 = row3;
-    row2 = _mm_unpacklo_epi64(row2, vTemp);
-    row3 = _mm_unpackhi_epi64(row3, vTemp);
-}
-
-#if KNOB_SIMD_WIDTH == 8
-INLINE
-void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
-{
-    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
-    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps());     //y0w0y1w1 y4w4y5w5
-    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
-    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5
-
-    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                             //x2z2x3z3 x6z6x7z7
-    r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps());                //y2w2y3w3 y6w6yw77
-    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
-    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7
-
-    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
-    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
-    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
-    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
-
-    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
-    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
-    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
-    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
-}
-
-INLINE
-void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
-{
-    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
-    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
-    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);  //x0y0z0w0 x4y4z4w4
-    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);  //x1y1z1w1 x5y5z5w5
-
-    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                 //x2z2x3z3 x6z6x7z7
-    r1rx = _simd_unpackhi_ps(vSrc1, vSrc3);                 //y2w2y3w3 y6w6yw77
-    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
-    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7
-
-    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
-    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
-    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
-    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
-
-    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
-    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
-    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
-    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
-}
-
-#if ENABLE_AVX512_SIMD16
-INLINE
-void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
-{
-    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
-
-    simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
-    simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
-    simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
-    simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
-
-    simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
-    simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
-    simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
-    simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
-
-    dst[0] = _simd16_unpacklo_ps(rblo, galo);
-    dst[1] = _simd16_unpackhi_ps(rblo, galo);
-    dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
-    dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
-}
-
-#endif
-INLINE
-void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
-{
-    simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
-    simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
-    simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
-    simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
-    simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
-    simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
-    simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
-    simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
-    simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
-    simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
-    simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
-    simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
-    vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
-    vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
-    vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
-    vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
-    vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
-    vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
-    vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
-    vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
-}
-
-INLINE
-void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
-{
-    vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3), 
-        _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
-}
 #endif
 
-//////////////////////////////////////////////////////////////////////////
-/// TranposeSingleComponent
-//////////////////////////////////////////////////////////////////////////
-template<uint32_t bpp>
-struct TransposeSingleComponent
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Pass-thru for single component.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
-    }
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
-    }
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose8_8_8_8
-//////////////////////////////////////////////////////////////////////////
-struct Transpose8_8_8_8
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
-
-#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH <= KNOB_ARCH_AVX
-        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
-        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
-        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
-        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
-        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
-        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
-        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
-        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
-        _mm_store_si128((__m128i*)pDst, c0123lo);
-        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
-#else
-        simdscalari dst01 = _simd_shuffle_epi8(src,
-            _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
-        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
-        dst23 = _simd_shuffle_epi8(dst23,
-            _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
-        simdscalari dst = _simd_or_si(dst01, dst23);
-        _simd_store_si((simdscalari*)pDst, dst);
-#endif
-#else
-#error Unsupported vector width
-#endif
-    }
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
-        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
-        __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
-        __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
-
-        simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
-        simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
-        simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
-        simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
-
-        simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
-        simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
-        simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
-
-        simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
-
-        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);             // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
-    }
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose8_8_8
-//////////////////////////////////////////////////////////////////////////
-struct Transpose8_8_8
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose8_8
-//////////////////////////////////////////////////////////////////////////
-struct Transpose8_8
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
-
-        __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
-        __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
-        rg = _mm_unpacklo_epi8(rg, g);
-        _mm_store_si128((__m128i*)pDst, rg);
-#else
-#error Unsupported vector width
-#endif
-    }
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
-        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
-
-        simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
-        simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
-
-        simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
-
-        simdscalari dst = _simd_or_si(cvt0, shl1);
-
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);                 // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
-    }
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose32_32_32_32
-//////////////////////////////////////////////////////////////////////////
-struct Transpose32_32_32_32
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalar src0 = _simd_load_ps((const float*)pSrc);
-        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
-        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
-        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
-
-        __m128 vDst[8];
-        vTranspose4x8(vDst, src0, src1, src2, src3);
-        _mm_store_ps((float*)pDst, vDst[0]);
-        _mm_store_ps((float*)pDst+4, vDst[1]);
-        _mm_store_ps((float*)pDst+8, vDst[2]);
-        _mm_store_ps((float*)pDst+12, vDst[3]);
-        _mm_store_ps((float*)pDst+16, vDst[4]);
-        _mm_store_ps((float*)pDst+20, vDst[5]);
-        _mm_store_ps((float*)pDst+24, vDst[6]);
-        _mm_store_ps((float*)pDst+28, vDst[7]);
-#else
-#error Unsupported vector width
-#endif
-    }
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
-        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
-        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
-        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
-
-        simd16scalar dst[4];
-
-        vTranspose4x16(dst, src0, src1, src2, src3);
-
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
-    }
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose32_32_32
-//////////////////////////////////////////////////////////////////////////
-struct Transpose32_32_32
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalar src0 = _simd_load_ps((const float*)pSrc);
-        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
-        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
-
-        __m128 vDst[8];
-        vTranspose3x8(vDst, src0, src1, src2);
-        _mm_store_ps((float*)pDst, vDst[0]);
-        _mm_store_ps((float*)pDst + 4, vDst[1]);
-        _mm_store_ps((float*)pDst + 8, vDst[2]);
-        _mm_store_ps((float*)pDst + 12, vDst[3]);
-        _mm_store_ps((float*)pDst + 16, vDst[4]);
-        _mm_store_ps((float*)pDst + 20, vDst[5]);
-        _mm_store_ps((float*)pDst + 24, vDst[6]);
-        _mm_store_ps((float*)pDst + 28, vDst[7]);
-#else
-#error Unsupported vector width
-#endif
-    }
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
-        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
-        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
-        simd16scalar src3 = _simd16_setzero_ps();
-
-        simd16scalar dst[4];
-
-        vTranspose4x16(dst, src0, src1, src2, src3);
-
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
-    }
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose32_32
-//////////////////////////////////////////////////////////////////////////
-struct Transpose32_32
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        const float* pfSrc = (const float*)pSrc;
-        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
-        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
-        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
-        __m128 src_g1 = _mm_load_ps(pfSrc + 12);
-
-        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
-        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
-        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
-        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
-
-        float* pfDst = (float*)pDst;
-        _mm_store_ps(pfDst + 0, dst0);
-        _mm_store_ps(pfDst + 4, dst1);
-        _mm_store_ps(pfDst + 8, dst2);
-        _mm_store_ps(pfDst + 12, dst3);
-#else
-#error Unsupported vector width
-#endif
-    }
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));                 // rrrrrrrrrrrrrrrr
-        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);            // gggggggggggggggg
-
-        simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                                        // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
-        simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                                        // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
-
-        simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
-        simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
-
-        simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
-        simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
-
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);                               // rgrgrgrgrgrgrgrg
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);                               // rgrgrgrgrgrgrgrg
-    }
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose16_16_16_16
-//////////////////////////////////////////////////////////////////////////
-struct Transpose16_16_16_16
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
-        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
-
-        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
-        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
-        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
-        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
-
-        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
-        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
-        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
-        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
-
-        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
-        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
-        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
-        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
-
-        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
-        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
-        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
-        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
-#else
-#error Unsupported vector width
-#endif
-    }
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
-        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
-        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
-        simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3);          // aaaaaaaaaaaaaaaa
-
-        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
-        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
-        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
-        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
-
-        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
-        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
-        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
-        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
-
-        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
-        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
-        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
-        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
-
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
-    }
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose16_16_16
-//////////////////////////////////////////////////////////////////////////
-struct Transpose16_16_16
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
-
-        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
-        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
-        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
-        __m128i src_a = _mm_undefined_si128();
-
-        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
-        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
-        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
-        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
-
-        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
-        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
-        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
-        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
-
-        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
-        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
-        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
-        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
-#else
-#error Unsupported vector width
-#endif
-    }
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
-        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
-        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
-        simdscalari src3 = _simd_setzero_si();                                                      // aaaaaaaaaaaaaaaa
-
-        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
-        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
-        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
-        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
-
-        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
-        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
-        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
-        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
-
-        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
-        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
-        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
-        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
-
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
-    }
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose16_16
-//////////////////////////////////////////////////////////////////////////
-struct Transpose16_16
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalar src = _simd_load_ps((const float*)pSrc);
-
-        __m128 comp0 = _mm256_castps256_ps128(src);
-        __m128 comp1 = _mm256_extractf128_ps(src, 1);
-
-        __m128i comp0i = _mm_castps_si128(comp0);
-        __m128i comp1i = _mm_castps_si128(comp1);
-
-        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
-        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
-
-        _mm_store_si128((__m128i*)pDst, resLo);
-        _mm_store_si128((__m128i*)pDst + 1, resHi);
-#else
-#error Unsupported vector width
-#endif
-    }
-#if ENABLE_AVX512_SIMD16
-
-    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
-        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
-
-        simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
-        simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
-
-        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20);     // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
-        simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31);     // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
-
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgrgrgrgrgrgrgrg
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgrgrgrgrgrgrgrg
-    }
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose24_8
-//////////////////////////////////////////////////////////////////////////
-struct Transpose24_8
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose32_8_24
-//////////////////////////////////////////////////////////////////////////
-struct Transpose32_8_24
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose4_4_4_4
-//////////////////////////////////////////////////////////////////////////
-struct Transpose4_4_4_4
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose5_6_5
-//////////////////////////////////////////////////////////////////////////
-struct Transpose5_6_5
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose9_9_9_5
-//////////////////////////////////////////////////////////////////////////
-struct Transpose9_9_9_5
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose5_5_5_1
-//////////////////////////////////////////////////////////////////////////
-struct Transpose5_5_5_1
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose1_5_5_5
-//////////////////////////////////////////////////////////////////////////
-struct Transpose1_5_5_5
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose10_10_10_2
-//////////////////////////////////////////////////////////////////////////
-struct Transpose10_10_10_2
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose11_11_10
-//////////////////////////////////////////////////////////////////////////
-struct Transpose11_11_10
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose64
-//////////////////////////////////////////////////////////////////////////
-struct Transpose64
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose64_64
-//////////////////////////////////////////////////////////////////////////
-struct Transpose64_64
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose64_64_64
-//////////////////////////////////////////////////////////////////////////
-struct Transpose64_64_64
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose64_64_64_64
-//////////////////////////////////////////////////////////////////////////
-struct Transpose64_64_64_64
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
-    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
-};
 
 // helper function to unroll loops
 template<int Begin, int End, int Step = 1>
@@ -1029,7 +140,7 @@
 INLINE
 static bool IsPow2(T value)
 {
-    return value == (value & (0 - value));
+    return value == (value & (T(0) - value));
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1224,4 +335,31 @@
     }
 };
 
+//////////////////////////////////////////////////////////////////////////
+/// Helpers used to get / set environment variable
+//////////////////////////////////////////////////////////////////////////
+static INLINE std::string GetEnv(const std::string& variableName)
+{
+#if defined(_WIN32)
+    std::string result;
+    const DWORD requiredChars = GetEnvironmentVariableA(variableName.c_str(), nullptr, 0);
+    if (requiredChars == 0) { return result; } // zero length reported: hand back the empty string
+    result.resize(requiredChars - 1); // requiredChars counts the terminating null, std::string::resize() does not
+    GetEnvironmentVariableA(variableName.c_str(), &result[0], requiredChars);
+    return result;
+#else
+    // getenv() yields NULL for an unset variable; map that to an empty string.
+    const char *pValue = getenv(variableName.c_str());
+    return (pValue != nullptr) ? std::string(pValue) : std::string();
+#endif
+}
+
+static INLINE void SetEnv(const std::string& variableName, const std::string& value)
+{ // Sets variableName to value through the platform API; the API's result is not reported to the caller.
+#if defined(_WIN32)
+    SetEnvironmentVariableA(variableName.c_str(), value.c_str());
+#else
+    setenv(variableName.c_str(), value.c_str(), true); // non-zero third argument: replace any existing value
+#endif
+}
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 5d8ad27..60289ca 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -46,6 +46,15 @@
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+#if HAVE_LLVM < 0x400
+#include "llvm/Bitcode/ReaderWriter.h"
+#else
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#endif
 
 #if LLVM_USE_INTEL_JITEVENTS
 #include "llvm/ExecutionEngine/JITEventListener.h"
@@ -71,6 +80,11 @@
 #define JITTER_OUTPUT_DIR SWR_OUTPUT_DIR "\\Jitter"
 #endif // _WIN32
 
+#if defined(__APPLE) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
+#include <pwd.h>
+#include <sys/stat.h>
+#endif
+
 
 using namespace llvm;
 using namespace SwrJit;
@@ -101,9 +115,7 @@
     mCore = std::string(core);
     std::transform(mCore.begin(), mCore.end(), mCore.begin(), ::tolower);
 
-    std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
-    fnName << mJitNumber++;
-    std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext));
+    std::unique_ptr<Module> newModule(new Module("", mContext));
     mpCurrentModule = newModule.get();
 
     StringRef hostCPUName;
@@ -123,6 +135,12 @@
         .setMCPU(hostCPUName)
         .create();
 
+    if (KNOB_JIT_ENABLE_CACHE)
+    {
+        mCache.SetCpu(hostCPUName);
+        mpExec->setObjectCache(&mCache);
+    }
+
 #if LLVM_USE_INTEL_JITEVENTS
     JITEventListener *vTune = JITEventListener::createIntelJITEventListener();
     mpExec->RegisterJITEventListener(vTune);
@@ -159,9 +177,9 @@
 #if defined(_WIN32)
     if (KNOB_DUMP_SHADER_IR)
     {
-        CreateDirectory(INTEL_OUTPUT_DIR, NULL);
-        CreateDirectory(SWR_OUTPUT_DIR, NULL);
-        CreateDirectory(JITTER_OUTPUT_DIR, NULL);
+        CreateDirectoryPath(INTEL_OUTPUT_DIR);
+        CreateDirectoryPath(SWR_OUTPUT_DIR);
+        CreateDirectoryPath(JITTER_OUTPUT_DIR);
     }
 #endif
 }
@@ -172,9 +190,7 @@
 {
     SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!");
     
-    std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
-    fnName << mJitNumber++;
-    std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext));
+    std::unique_ptr<Module> newModule(new Module("", mContext));
     mpCurrentModule = newModule.get();
 #if defined(_WIN32)
     // Needed for MCJIT on windows
@@ -204,7 +220,7 @@
         const char* pBaseName = strrchr(procname, '\\');
         std::stringstream outDir;
         outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
-        CreateDirectory(outDir.str().c_str(), NULL);
+        CreateDirectoryPath(outDir.str().c_str());
 #endif
 
         std::error_code EC;
@@ -242,7 +258,7 @@
         const char* pBaseName = strrchr(procname, '\\');
         std::stringstream outDir;
         outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
-        CreateDirectory(outDir.str().c_str(), NULL);
+        CreateDirectoryPath(outDir.str().c_str());
 #endif
 
         std::error_code EC;
@@ -264,10 +280,10 @@
 #endif
         fd.flush();
 
-        raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text);
-        WriteGraph(fd_cfg, (const Function*)f);
+        //raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text);
+        //WriteGraph(fd_cfg, (const Function*)f);
 
-        fd_cfg.flush();
+        //fd_cfg.flush();
     }
 }
 
@@ -293,3 +309,194 @@
         }
     }
 }
+
+//////////////////////////////////////////////////////////////////////////
+/// JitCache
+//////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////
+/// JitCacheFileHeader - fixed-size record stored ahead of a cached object buffer
+//////////////////////////////////////////////////////////////////////////
+struct JitCacheFileHeader
+{
+    void Init(uint32_t llCRC, uint32_t objCRC, const std::string& moduleID, const std::string& cpu, uint64_t bufferSize)
+    { // Populates every field; moduleID and cpu are truncated to JC_STR_MAX_LEN - 1 characters.
+        m_MagicNumber = JC_MAGIC_NUMBER;
+        m_BufferSize = bufferSize;
+        m_llCRC = llCRC;
+        m_platformKey = JC_PLATFORM_KEY;
+        m_objCRC = objCRC;
+        strncpy(m_ModuleID, moduleID.c_str(), JC_STR_MAX_LEN - 1);
+        m_ModuleID[JC_STR_MAX_LEN - 1] = 0; // strncpy does not terminate when the source reaches the limit
+        strncpy(m_Cpu, cpu.c_str(), JC_STR_MAX_LEN - 1);
+        m_Cpu[JC_STR_MAX_LEN - 1] = 0;
+    }
+    // Returns true only if magic number, IR CRC, platform key, module ID and CPU name all match.
+    bool IsValid(uint32_t llCRC, const std::string& moduleID, const std::string& cpu)
+    {
+        if ((m_MagicNumber != JC_MAGIC_NUMBER) ||
+            (m_llCRC != llCRC) ||
+            (m_platformKey != JC_PLATFORM_KEY))
+        {
+            return false;
+        }
+
+        m_ModuleID[JC_STR_MAX_LEN - 1] = 0; // force termination before comparing; not const for this reason
+        if (strncmp(moduleID.c_str(), m_ModuleID, JC_STR_MAX_LEN - 1))
+        {
+            return false;
+        }
+
+        m_Cpu[JC_STR_MAX_LEN - 1] = 0; // same defensive termination for the CPU name
+        if (strncmp(cpu.c_str(), m_Cpu, JC_STR_MAX_LEN - 1))
+        {
+            return false;
+        }
+
+        return true;
+    }
+    // Accessors for the stored object size and object CRC (the 32-bit CRC is widened to 64 bits).
+    uint64_t GetBufferSize() const { return m_BufferSize; }
+    uint64_t GetBufferCRC() const { return m_objCRC; }
+
+private:
+    static const uint64_t   JC_MAGIC_NUMBER = 0xfedcba9876543211ULL;
+    static const size_t     JC_STR_MAX_LEN = 32;
+    static const uint32_t   JC_PLATFORM_KEY = // LLVM major/minor/patch version plus a bit for pointers wider than 32 bits
+        (LLVM_VERSION_MAJOR << 24)  |
+        (LLVM_VERSION_MINOR << 16)  |
+        (LLVM_VERSION_PATCH << 8)   |
+        ((sizeof(void*) > sizeof(uint32_t)) ? 1 : 0);
+
+    uint64_t m_MagicNumber;             // must equal JC_MAGIC_NUMBER
+    uint64_t m_BufferSize;              // size passed to Init() for the object buffer
+    uint32_t m_llCRC;                   // CRC passed to Init() as llCRC
+    uint32_t m_platformKey;             // must equal JC_PLATFORM_KEY
+    uint32_t m_objCRC;                  // CRC passed to Init() as objCRC
+    char m_ModuleID[JC_STR_MAX_LEN];    // null-terminated, possibly truncated module identifier
+    char m_Cpu[JC_STR_MAX_LEN];         // null-terminated, possibly truncated CPU name
+};
+
+static inline uint32_t ComputeModuleCRC(const llvm::Module* M)
+{
+    // Serialize the module to in-memory bitcode and checksum the resulting bytes.
+    std::string serialized;
+    raw_string_ostream serializer(serialized);
+
+    llvm::WriteBitcodeToFile(M, serializer);
+
+    // str() flushes any buffered output into 'serialized' before handing it back.
+    const std::string& bytes = serializer.str();
+    return ComputeCRC(0, bytes.data(), bytes.size());
+}
+
+/// constructor - picks the cache directory, expanding a leading "~/" in KNOB_JIT_CACHE_DIR
+JitCache::JitCache()
+{
+    mCacheDir = KNOB_JIT_CACHE_DIR; // used verbatim unless a home directory can be substituted below
+#if defined(__APPLE) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
+    if (strncmp(KNOB_JIT_CACHE_DIR.c_str(), "~/", 2) == 0) {
+        const char *homedir = getenv("HOME");
+        // getpwuid() returns NULL when the uid has no passwd entry, so it must be checked.
+        const struct passwd *pw = homedir ? nullptr : getpwuid(getuid());
+        if (pw && pw->pw_dir) homedir = pw->pw_dir;
+        if (homedir) {
+            mCacheDir = homedir;
+            mCacheDir += (KNOB_JIT_CACHE_DIR.c_str() + 1);
+        }
+    }
+#endif
+}
+
+/// notifyObjectCompiled - Writes the object compiled for Module M to the cache directory (best effort).
+void JitCache::notifyObjectCompiled(const llvm::Module *M, llvm::MemoryBufferRef Obj)
+{
+    const std::string& moduleID = M->getModuleIdentifier();
+    if (!moduleID.length())
+    {
+        return;
+    }
+
+    if (!llvm::sys::fs::exists(mCacheDir.str()) &&
+        llvm::sys::fs::create_directories(mCacheDir.str()))
+    {
+        SWR_INVALID("Unable to create directory: %s", mCacheDir.c_str());
+        return;
+    }
+
+    llvm::SmallString<MAX_PATH> filePath = mCacheDir;
+    llvm::sys::path::append(filePath, moduleID);
+    // If the open fails, skip caching: raw_fd_ostream reports unhandled write errors as fatal when destroyed.
+    std::error_code err;
+    llvm::raw_fd_ostream fileObj(filePath.c_str(), err, llvm::sys::fs::F_None);
+    if (err) { return; }
+
+    uint32_t objcrc = ComputeCRC(0, Obj.getBufferStart(), Obj.getBufferSize());
+    JitCacheFileHeader header;
+    header.Init(mCurrentModuleCRC, objcrc, moduleID, mCpu, Obj.getBufferSize());
+
+    fileObj.write((const char*)&header, sizeof(header));
+    fileObj << Obj.getBuffer();
+    fileObj.flush();
+}
+
+/// Returns a newly allocated MemoryBuffer holding the cached object for Module M,
+/// or nullptr when no valid cache entry exists. Also records M's bitcode CRC in
+/// mCurrentModuleCRC, which notifyObjectCompiled() stores in the file header.
+std::unique_ptr<llvm::MemoryBuffer> JitCache::getObject(const llvm::Module* M)
+{
+    const std::string& moduleID = M->getModuleIdentifier();
+    mCurrentModuleCRC = ComputeModuleCRC(M);
+
+    if (!moduleID.length())
+    {
+        return nullptr;
+    }
+
+    if (!llvm::sys::fs::exists(mCacheDir))
+    {
+        return nullptr;
+    }
+
+    llvm::SmallString<MAX_PATH> filePath = mCacheDir;
+    llvm::sys::path::append(filePath, moduleID);
+
+    FILE* fpIn = fopen(filePath.c_str(), "rb");
+    if (!fpIn)
+    {
+        return nullptr;
+    }
+
+    std::unique_ptr<llvm::MemoryBuffer> pBuf = nullptr;
+    do
+    {
+        JitCacheFileHeader header;
+        if (!fread(&header, sizeof(header), 1, fpIn))
+        {
+            break;
+        }
+
+        if (!header.IsValid(mCurrentModuleCRC, moduleID, mCpu))
+        {
+            break;
+        }
+        // The size comes from the file, so the allocation may fail; check it before reading into the buffer.
+        pBuf = llvm::MemoryBuffer::getNewUninitMemBuffer(size_t(header.GetBufferSize()));
+        if (!pBuf || !fread(const_cast<char*>(pBuf->getBufferStart()), header.GetBufferSize(), 1, fpIn))
+        {
+            pBuf = nullptr;
+            break;
+        }
+
+        if (header.GetBufferCRC() != ComputeCRC(0, pBuf->getBufferStart(), pBuf->getBufferSize()))
+        {
+            SWR_TRACE("Invalid object cache file, ignoring: %s", filePath.c_str());
+            pBuf = nullptr;
+            break;
+        }
+    } while (0);
+
+    fclose(fpIn);
+
+    return pBuf;
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index 97d9312..68377e7 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -44,6 +44,7 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/ExecutionEngine/ObjectCache.h"
 
 #include "llvm/Config/llvm-config.h"
 #ifndef LLVM_VERSION_MAJOR
@@ -78,6 +79,8 @@
 #include "common/os.h"
 #include "common/isa.hpp"
 
+#include <mutex>
+
 #pragma pop_macro("DEBUG")
 
 //////////////////////////////////////////////////////////////////////////
@@ -133,6 +136,31 @@
 {
 };
 
+//////////////////////////////////////////////////////////////////////////
+/// JitCache - llvm::ObjectCache implementation that keeps compiled objects in files
+//////////////////////////////////////////////////////////////////////////
+class JitCache : public llvm::ObjectCache
+{
+public:
+    /// constructor - determines the cache directory from KNOB_JIT_CACHE_DIR
+    JitCache();
+    virtual ~JitCache() {}
+    /// Sets the CPU name that is stored in, and validated against, cache file headers.
+    void SetCpu(const llvm::StringRef& cpu) { mCpu = cpu.str(); }
+
+    /// notifyObjectCompiled - Provides a pointer to compiled code for Module M.
+    virtual void notifyObjectCompiled(const llvm::Module *M, llvm::MemoryBufferRef Obj);
+
+    /// Returns a pointer to a newly allocated MemoryBuffer that contains the
+    /// object which corresponds with Module M, or 0 if an object is not
+    /// available.
+    virtual std::unique_ptr<llvm::MemoryBuffer> getObject(const llvm::Module* M);
+
+private:
+    std::string mCpu;                       ///< CPU name given to SetCpu()
+    llvm::SmallString<MAX_PATH> mCacheDir;  ///< directory that holds the cache files
+    uint32_t mCurrentModuleCRC;             ///< set by getObject(), read by notifyObjectCompiled()
+};
 
 //////////////////////////////////////////////////////////////////////////
 /// JitManager
@@ -145,6 +173,7 @@
     JitLLVMContext          mContext;   ///< LLVM compiler
     llvm::IRBuilder<>       mBuilder;   ///< LLVM IR Builder
     llvm::ExecutionEngine*  mpExec;
+    JitCache                mCache;
 
     // Need to be rebuilt after a JIT and before building new IR
     llvm::Module* mpCurrentModule;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 5daeea9..4278840 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -514,10 +514,8 @@
 
     Function* Create(const BLEND_COMPILE_STATE& state)
     {
-        static std::size_t jitNum = 0;
-
-        std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
-        fnName << jitNum++;
+        std::stringstream fnName("BlendShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+        fnName << ComputeCRC(0, &state, sizeof(state));
 
         // blend function signature
         //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
@@ -536,6 +534,7 @@
 
         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
         Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+        blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
 
         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 3b86895..6a33ec2 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -47,6 +47,7 @@
         mVoidTy = Type::getVoidTy(pJitMgr->mContext);
         mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
         mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
+        mFP32PtrTy = PointerType::get(mFP32Ty, 0);
         mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
         mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
         mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 703f332..8210e49 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -56,6 +56,7 @@
         Type*                mIntPtrTy;
         Type*                mFP16Ty;
         Type*                mFP32Ty;
+        Type*                mFP32PtrTy;
         Type*                mDoubleTy;
         Type*                mInt8PtrTy;
         Type*                mInt16PtrTy;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 09b69c7..fbb4948 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -277,6 +277,22 @@
         return GEPA(ptr, indices);
     }
 
+    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
+    {
+        // Copy the brace-list into a vector and defer to the IN_BOUNDS_GEP overload that accepts it.
+        std::vector<Value*> gepIndices(indexList.begin(), indexList.end());
+
+        return IN_BOUNDS_GEP(ptr, gepIndices);
+    }
+
+    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
+    {
+        // Convert each literal index with C(), then defer to the vector-based IN_BOUNDS_GEP overload.
+        std::vector<Value*> gepIndices;
+        for (const uint32_t index : indexList) { gepIndices.push_back(C(index)); }
+        return IN_BOUNDS_GEP(ptr, gepIndices);
+    }
+
     LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
     {
         std::vector<Value*> valIndices;
@@ -1377,7 +1393,17 @@
         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
                               pFunc->getEntryBlock().begin());
         Value* pAlloca = ALLOCA(pType);
-        IRB()->restoreIP(saveIP);
+        if (saveIP.isSet()) IRB()->restoreIP(saveIP);
+        return pAlloca;
+    }
+
+    Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
+    {
+        const auto previousIP = IRB()->saveIP(); // remember the builder position so it can be put back
+        auto& entryBlock = pFunc->getEntryBlock();
+        IRB()->SetInsertPoint(&entryBlock, entryBlock.begin());
+        Value* pArrayAlloca = ALLOCA(pType, pArraySize);
+        if (previousIP.isSet()) { IRB()->restoreIP(previousIP); } // nothing to restore when none was set
+        return pArrayAlloca;
+    }
 
@@ -1649,4 +1675,45 @@
         }
     }
 
+    // Computes a byte size for an LLVM type (struct, array, integer, float, half or double).
+    uint32_t Builder::GetTypeSize(Type* pType)
+    {
+        if (pType->isStructTy())
+        {
+            uint32_t numElems = pType->getStructNumElements();
+            Type* pElemTy = pType->getStructElementType(0); // only element 0 is inspected
+            return numElems * GetTypeSize(pElemTy); // i.e. every member is taken to have element 0's size
+        }
+
+        if (pType->isArrayTy())
+        {
+            uint32_t numElems = pType->getArrayNumElements();
+            Type* pElemTy = pType->getArrayElementType();
+            return numElems * GetTypeSize(pElemTy);
+        }
+
+        if (pType->isIntegerTy())
+        {
+            uint32_t bitSize = pType->getIntegerBitWidth();
+            return bitSize / 8; // integer division: widths below 8 bits yield 0
+        }
+
+        if (pType->isFloatTy())
+        {
+            return 4;
+        }
+
+        if (pType->isHalfTy())
+        {
+            return 2;
+        }
+
+        if (pType->isDoubleTy())
+        {
+            return 8;
+        }
+
+        SWR_ASSERT(false, "Unimplemented type."); // any other type kind is not handled by this function
+        return 0;
+    }
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index aea39c5..662574d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -68,6 +68,9 @@
 
 Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
 Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
+Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
+Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
+
 CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args);
 CallInst *CALL(Value *Callee) { return CALLA(Callee); }
 CallInst *CALL(Value *Callee, Value* arg);
@@ -159,8 +162,11 @@
 void RDTSC_STOP(Value* pBucketMgr, Value* pId);
 
 Value* CreateEntryAlloca(Function* pFunc, Type* pType);
+Value* CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize);
 
 // Static stack allocations for scatter operations
 Value* pScatterStackSrc{ nullptr };
 Value* pScatterStackOffsets{ nullptr };
 
+
+uint32_t GetTypeSize(Type* pType);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 8fc31ae..ae5cd47 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -89,14 +89,14 @@
 
 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
 {
-    static std::size_t fetchNum = 0;
-
-    std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
-    fnName << fetchNum++;
+    std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
 
     Function*    fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
     BasicBlock*    entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
 
+    fetch->getParent()->setModuleIdentifier(fetch->getName());
+
     IRB()->SetInsertPoint(entry);
 
     auto    argitr = fetch->arg_begin();
@@ -590,64 +590,27 @@
 // gather for odd component size formats
 // gather SIMD full pixels per lane then shift/mask to move each component to their
 // own vector
-void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4])
+void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
 {
     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
 
     // only works if pixel size is <= 32bits
     SWR_ASSERT(info.bpp <= 32);
 
-    Value* gather = VUNDEF_I();
+	Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask, C((char)1));
 
-    // assign defaults
     for (uint32_t comp = 0; comp < 4; ++comp)
     {
-        result[comp] = VIMMED1((int)info.defaults[comp]);
+        pResult[comp] = VIMMED1((int)info.defaults[comp]);
     }
 
-    // load the proper amount of data based on component size
-    PointerType* pLoadTy = nullptr;
-    switch (info.bpp)
-    {
-    case 8: pLoadTy = Type::getInt8PtrTy(JM()->mContext); break;
-    case 16: pLoadTy = Type::getInt16PtrTy(JM()->mContext); break;
-    case 24:
-    case 32: pLoadTy = Type::getInt32PtrTy(JM()->mContext); break;
-    default: SWR_INVALID("Invalid bpp: %d", info.bpp);
-    }
-
-    // allocate temporary memory for masked off lanes
-    Value* pTmp = ALLOCA(pLoadTy->getElementType());
-
-    // gather SIMD pixels
-    for (uint32_t e = 0; e < JM()->mVWidth; ++e)
-    {
-        Value* pElemOffset = VEXTRACT(offsets, C(e));
-        Value* pLoad = GEP(pBase, pElemOffset);
-        Value* pLaneMask = VEXTRACT(pMask, C(e));
-
-        pLoad = POINTER_CAST(pLoad, pLoadTy);
-
-        // mask in tmp pointer for disabled lanes
-        pLoad = SELECT(pLaneMask, pLoad, pTmp);
-
-        // load pixel
-        Value *val = LOAD(pLoad);
-
-        // zero extend to 32bit integer
-        val = INT_CAST(val, mInt32Ty, false);
-
-        // store in simd lane
-        gather = VINSERT(gather, val, C(e));
-    }
-
-    UnpackComponents(format, gather, result);
+    UnpackComponents(format, pGather, pResult);
 
     // cast to fp32
-    result[0] = BITCAST(result[0], mSimdFP32Ty);
-    result[1] = BITCAST(result[1], mSimdFP32Ty);
-    result[2] = BITCAST(result[2], mSimdFP32Ty);
-    result[3] = BITCAST(result[3], mSimdFP32Ty);
+    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
+    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
+    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
+    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
 }
 
 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
@@ -860,7 +823,7 @@
         if (IsOddFormat((SWR_FORMAT)ied.Format))
         {
             Value* pResults[4];
-            CreateGatherOddFormats((SWR_FORMAT)ied.Format, pMask, pStreamBase, vOffsets, pResults);
+            CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
 
             for (uint32_t c = 0; c < 4; ++c)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
index d5cec70..4f456af 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
@@ -95,7 +95,7 @@
 struct FETCH_COMPILE_STATE
 {
     uint32_t numAttribs{ 0 };
-    INPUT_ELEMENT_DESC layout[KNOB_NUM_ATTRIBUTES];
+    INPUT_ELEMENT_DESC layout[SWR_VTX_NUM_SLOTS];
     SWR_FORMAT indexType;
     uint32_t cutIndex{ 0xffffffff };
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
index b072eb3..9f69669 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
@@ -57,6 +57,7 @@
 struct JIT_COMPILE_INPUT
 {
     SWR_SHADER_TYPE type;
+    uint32_t        crc;
 
     const void* pIR;        ///< Pointer to LLVM IR text.
     size_t irLength;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 2c19321..fb80fe2 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -241,7 +241,7 @@
 
             // increment stream and output buffer pointers
             // stream verts are always 32*4 dwords apart
-            pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4));
+            pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4));
 
             // output buffers offset using pitch in buffer state
             for (uint32_t b : activeSOBuffers)
@@ -263,10 +263,8 @@
 
     Function* Create(const STREAMOUT_COMPILE_STATE& state)
     {
-        static std::size_t soNum = 0;
-
-        std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
-        fnName << soNum++;
+        std::stringstream fnName("SO_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+        fnName << ComputeCRC(0, &state, sizeof(state));
 
         // SO function signature
         // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
@@ -278,6 +276,8 @@
         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
         Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
 
+        soFunc->getParent()->setModuleIdentifier(soFunc->getName());
+
         // create return basic block
         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
         BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
index ee13f55..98bf28b 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -152,7 +152,7 @@
 /// @param renderTargetIndex - Index to destination render target
 /// @param x, y - Coordinates to raster tile.
 /// @param pClearColor - Pointer to clear color
-void StoreHotTileClear(
+void SwrStoreHotTileClear(
     SWR_SURFACE_STATE *pDstSurface,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
     UINT x,
@@ -283,7 +283,7 @@
     sStoreTilesClearColorTable[B10G10R10A2_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreClear; \
     sStoreTilesClearColorTable[B10G10R10A2_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreClear; \
     sStoreTilesClearColorTable[R8G8B8_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SINT>::StoreClear; \
+    sStoreTilesClearColorTable[R8G8B8_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SINT>::StoreClear;
 
 //////////////////////////////////////////////////////////////////////////
 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
@@ -293,7 +293,7 @@
     sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT>::StoreClear; \
     sStoreTilesClearDepthTable[R32_FLOAT_X8X24_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::StoreClear; \
     sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreClear; \
-    sStoreTilesClearDepthTable[R16_UNORM] = StoreMacroTileClear<R32_FLOAT, R16_UNORM>::StoreClear; \
+    sStoreTilesClearDepthTable[R16_UNORM] = StoreMacroTileClear<R32_FLOAT, R16_UNORM>::StoreClear;
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Sets up tables for ClearTile
diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
index 7d8b32a..9dbc16a 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
@@ -53,7 +53,7 @@
 /// @param renderTargetIndex - Index to src render target
 /// @param x, y - Coordinates to raster tile.
 /// @param pDstHotTile - Pointer to Hot Tile
-void LoadHotTile(
+void SwrLoadHotTile(
     const SWR_SURFACE_STATE *pSrcSurface,
     SWR_FORMAT dstFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
index 446bca4..9c20669 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
@@ -58,7 +58,7 @@
 /// @param renderTargetIndex - Index to destination render target
 /// @param x, y - Coordinates to raster tile.
 /// @param pSrcHotTile - Pointer to Hot Tile
-void StoreHotTileToSurface(
+void SwrStoreHotTileToSurface(
     SWR_SURFACE_STATE *pDstSurface,
     SWR_FORMAT srcFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
index 12a5f3d..c3d14e9 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
@@ -199,15 +199,15 @@
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
     {
         // Each 4-pixel row is 16-bytes
-        __m128i *pZRow01 = (__m128i*)pSrc;
-        __m128i vQuad00 = _mm_load_si128(pZRow01);
-        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
+        simd4scalari *pZRow01 = (simd4scalari*)pSrc;
+        simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
+        simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
 
-        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
-        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
+        simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
+        simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
 
-        _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
-        _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
+        SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
+        SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
     }
 };
 
@@ -218,20 +218,20 @@
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
     {
         // 4 x 16 bytes = 64 bytes, 16 pixels
-        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
 
-        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
 
         // Unswizzle from SWR-Z order
-        __m128i quad0 = _mm_load_si128(&pSrc128[0]);                        // 0 1 2 3
-        __m128i quad1 = _mm_load_si128(&pSrc128[1]);                        // 4 5 6 7
-        __m128i quad2 = _mm_load_si128(&pSrc128[2]);                        // 8 9 A B
-        __m128i quad3 = _mm_load_si128(&pSrc128[3]);                        // C D E F
+        simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]);                        // 0 1 2 3
+        simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]);                        // 4 5 6 7
+        simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]);                        // 8 9 A B
+        simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]);                        // C D E F
 
-        _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1));   // 0 1 4 5
-        _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1));   // 2 3 6 7
-        _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3));   // 8 9 C D
-        _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3));   // A B E F
+        SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1));   // 0 1 4 5
+        SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1));   // 2 3 6 7
+        SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3));   // 8 9 C D
+        SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3));   // A B E F
     }
 };
 
@@ -251,10 +251,10 @@
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
     {
         // Each 4-pixel row is 32 bytes.
-        const __m128i* pPixSrc = (const __m128i*)pSrc;
+        const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
 
         // order of pointers match SWR-Z layout
-        __m128i** pvDsts = (__m128i**)&ppDsts[0];
+        simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
         *pvDsts[0] = pPixSrc[0];
         *pvDsts[1] = pPixSrc[1];
         *pvDsts[2] = pPixSrc[2];
@@ -269,9 +269,9 @@
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
     {
         // 8 x 16 bytes = 128 bytes, 16 pixels
-        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
 
-        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
 
         // order of pointers match SWR-Z layout
         *ppDsts128[0] = pSrc128[0];     // 0 1
@@ -301,10 +301,10 @@
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
     {
         // Each 4-pixel row is 64 bytes.
-        const __m128i* pPixSrc = (const __m128i*)pSrc;
+        const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
 
         // Unswizzle from SWR-Z order
-        __m128i** pvDsts = (__m128i**)&ppDsts[0];
+        simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
         *pvDsts[0] = pPixSrc[0];
         *pvDsts[1] = pPixSrc[2];
         *pvDsts[2] = pPixSrc[1];
@@ -323,9 +323,9 @@
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
     {
         // 16 x 16 bytes = 256 bytes, 16 pixels
-        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
 
-        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
 
         for (uint32_t i = 0; i < 16; i += 4)
         {
@@ -563,8 +563,8 @@
         temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
 
         // merge/store data into destination but don't overwrite the X8 bits
-        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
-        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
+        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
+        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
 
         simd16scalari dest = _simd16_setzero_si();
 
@@ -575,8 +575,8 @@
 
         dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
 
-        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
-        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
+        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
+        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
 #else
         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
 
@@ -593,25 +593,25 @@
 
         // Store data into destination but don't overwrite the X8 bits
         // Each 4-pixel row is 16-bytes
-        __m128i *pZRow01 = (__m128i*)aosTile;
-        __m128i vQuad00 = _mm_load_si128(pZRow01);
-        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
+        simd4scalari *pZRow01 = (simd4scalari*)aosTile;
+        simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
+        simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
 
-        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
-        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
+        simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
+        simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
 
-        __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
-        __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
+        simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
+        simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
 
-        __m128i vMask = _mm_set1_epi32(0xFFFFFF);
+        simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
 
-        vDst0 = _mm_andnot_si128(vMask, vDst0);
-        vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
-        vDst1 = _mm_andnot_si128(vMask, vDst1);
-        vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
+        vDst0 = SIMD128::andnot_si(vMask, vDst0);
+        vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
+        vDst1 = SIMD128::andnot_si(vMask, vDst1);
+        vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
 
-        _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
-        _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
+        SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
+        SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
 #endif
     }
 };
@@ -683,8 +683,8 @@
     // store 8x2 memory order:
     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
-    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
-    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
+    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
 }
 
 #endif
@@ -736,15 +736,15 @@
 
     // splitting into two sets of 4 wide integer vector types
     // because AVX doesn't have instructions to support this operation at 8 wide
-    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
-    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
-    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
-    __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
+    simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+    simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+    simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+    simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
 
-    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
-    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
-    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
-    __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
+    simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+    simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+    simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+    simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
 
     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
@@ -753,18 +753,18 @@
     srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
     srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
 
-    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
-    srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
+    srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
+    srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
 
-    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
-    srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
+    srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
+    srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
 
-    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
-    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
+    srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
+    srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
 
     // unpack into rows that get the tiling order correct
-    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
-    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+    simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
+    simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
 
     simdscalari final = _mm256_castsi128_si256(vRow00);
     final = _mm256_insertf128_si256(final, vRow10, 1);
@@ -785,7 +785,7 @@
     final = _mm256_permute4x64_epi64(final, 0xD8);
 #endif
 
-    _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
+    _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
 }
 
 #if USE_8x2_TILE_BACKEND
@@ -848,8 +848,8 @@
     // store 8x2 memory order:
     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
-    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
-    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
+    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
 }
 
 #endif
@@ -894,29 +894,29 @@
 
     // splitting into two sets of 4 wide integer vector types
     // because AVX doesn't have instructions to support this operation at 8 wide
-    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
-    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
-    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+    simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+    simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+    simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
 
-    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
-    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
-    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+    simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+    simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+    simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
 
     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
 
-    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
+    srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
 
-    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
+    srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
 
-    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
-    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
+    srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
+    srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
 
     // unpack into rows that get the tiling order correct
-    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
-    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+    simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
+    simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
 
     simdscalari final = _mm256_castsi128_si256(vRow00);
     final = _mm256_insertf128_si256(final, vRow10, 1);
@@ -936,7 +936,7 @@
 
 #endif
 
-    _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
+    _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
 }
 
 template<>
diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
index 226d7dc..c2a87d8 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
@@ -28,7 +28,7 @@
 #pragma once
 
 #include "core/state.h"
-#include "common/simdintrin.h"
+#include "common/intrin.h"
 
 template<SWR_TILE_MODE mode, int>
 struct TilingTraits
diff --git a/src/gallium/drivers/swr/swr_clear.cpp b/src/gallium/drivers/swr/swr_clear.cpp
index 53f4e02..233432e 100644
--- a/src/gallium/drivers/swr/swr_clear.cpp
+++ b/src/gallium/drivers/swr/swr_clear.cpp
@@ -68,11 +68,19 @@
    ((union pipe_color_union *)color)->f[3] = 1.0; /* cast off your const'd-ness */
 #endif
 
+   SWR_RECT clear_rect;
+   /* If enabled, clear to scissor; otherwise clear full surface */
+   if (ctx->rasterizer && ctx->rasterizer->scissor) {
+      clear_rect = ctx->swr_scissor;
+   } else {
+      clear_rect = {0, 0, (int32_t)fb->width, (int32_t)fb->height};
+   }
+
    for (unsigned i = 0; i < layers; ++i) {
       swr_update_draw_context(ctx);
-      SwrClearRenderTarget(ctx->swrContext, clearMask, i,
-                           color->f, depth, stencil,
-                           ctx->swr_scissor);
+      ctx->api.pfnSwrClearRenderTarget(ctx->swrContext, clearMask, i,
+                                       color->f, depth, stencil,
+                                       clear_rect);
 
       // Mask out the attachments that are out of layers.
       if (fb->zsbuf &&
diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
index 4b7a321..c058870 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -311,8 +311,8 @@
    }
 
    if (ctx->active_queries) {
-      SwrEnableStatsFE(ctx->swrContext, FALSE);
-      SwrEnableStatsBE(ctx->swrContext, FALSE);
+      ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, FALSE);
+      ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, FALSE);
    }
 
    util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vertex_buffer);
@@ -349,8 +349,8 @@
    util_blitter_blit(ctx->blitter, &info);
 
    if (ctx->active_queries) {
-      SwrEnableStatsFE(ctx->swrContext, TRUE);
-      SwrEnableStatsBE(ctx->swrContext, TRUE);
+      ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, TRUE);
+      ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, TRUE);
    }
 }
 
@@ -383,10 +383,10 @@
 
    /* Idle core after destroying buffer resources, but before deleting
     * context.  Destroying resources has potentially called StoreTiles.*/
-   SwrWaitForIdle(ctx->swrContext);
+   ctx->api.pfnSwrWaitForIdle(ctx->swrContext);
 
    if (ctx->swrContext)
-      SwrDestroyContext(ctx->swrContext);
+      ctx->api.pfnSwrDestroyContext(ctx->swrContext);
 
    delete ctx->blendJIT;
 
@@ -422,7 +422,7 @@
    if (!pDC)
       return;
 
-   struct swr_query_result *pqr = (struct swr_query_result *)pDC->pStats;
+   struct swr_query_result *pqr = pDC->pStats;
 
    SWR_STATS *pSwrStats = &pqr->core;
 
@@ -439,7 +439,7 @@
    if (!pDC)
       return;
 
-   struct swr_query_result *pqr = (struct swr_query_result *)pDC->pStats;
+   struct swr_query_result *pqr = pDC->pStats;
 
    SWR_STATS_FE *pSwrStats = &pqr->coreFE;
    p_atomic_add(&pSwrStats->IaVertices, pStats->IaVertices);
@@ -467,6 +467,9 @@
       AlignedMalloc(sizeof(struct swr_context), KNOB_SIMD_BYTES);
    memset(ctx, 0, sizeof(struct swr_context));
 
+   swr_screen(p_screen)->pfnSwrGetInterface(ctx->api);
+   ctx->swrDC.pAPI = &ctx->api;
+
    ctx->blendJIT =
       new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>;
 
@@ -478,12 +481,9 @@
    createInfo.pfnClearTile = swr_StoreHotTileClear;
    createInfo.pfnUpdateStats = swr_UpdateStats;
    createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE;
-   ctx->swrContext = SwrCreateContext(&createInfo);
+   ctx->swrContext = ctx->api.pfnSwrCreateContext(&createInfo);
 
-   /* Init Load/Store/ClearTiles Tables */
-   swr_InitMemoryModule();
-
-   InitBackendFuncTables();
+   ctx->api.pfnSwrInit();
 
    if (ctx->swrContext == NULL)
       goto fail;
diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h
index 4de20c1..8bed78f 100644
--- a/src/gallium/drivers/swr/swr_context.h
+++ b/src/gallium/drivers/swr/swr_context.h
@@ -51,6 +51,7 @@
 #define SWR_NEW_FRAMEBUFFER (1 << 15)
 #define SWR_NEW_CLIP (1 << 16)
 #define SWR_NEW_SO (1 << 17)
+#define SWR_LARGE_CLIENT_DRAW (1<<18) // Indicates client draw will block
 
 namespace std
 {
@@ -101,7 +102,8 @@
    uint32_t polyStipple[32];
 
    SWR_SURFACE_STATE renderTargets[SWR_NUM_ATTACHMENTS];
-   void *pStats;
+   struct swr_query_result *pStats; // @llvm_struct
+   SWR_INTERFACE *pAPI; // @llvm_struct - Needed for the swr_memory callbacks
 };
 
 /* gen_llvm_types FINI */
@@ -137,7 +139,6 @@
 
    struct pipe_viewport_state viewport;
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
-   struct pipe_index_buffer index_buffer;
 
    struct blitter_context *blitter;
 
@@ -170,6 +171,8 @@
    struct swr_draw_context swrDC;
 
    unsigned dirty; /**< Mask of SWR_NEW_x flags */
+
+   SWR_INTERFACE api;
 };
 
 static INLINE struct swr_context *
@@ -183,7 +186,7 @@
       struct swr_query_result *pqr = nullptr)
 {
    swr_draw_context *pDC =
-      (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext);
+      (swr_draw_context *)ctx->api.pfnSwrGetPrivateContextState(ctx->swrContext);
    if (pqr)
       ctx->swrDC.pStats = pqr;
    memcpy(pDC, &ctx->swrDC, sizeof(swr_draw_context));
diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp
index c43f4a5..df1c11a 100644
--- a/src/gallium/drivers/swr/swr_draw.cpp
+++ b/src/gallium/drivers/swr/swr_draw.cpp
@@ -39,6 +39,11 @@
 {
    struct swr_context *ctx = swr_context(pipe);
 
+   if (!info->count_from_stream_output && !info->indirect &&
+       !info->primitive_restart &&
+       !u_trim_pipe_prim(info->mode, (unsigned*)&info->count))
+      return;
+
    if (!swr_check_render_cond(pipe))
       return;
 
@@ -76,8 +81,11 @@
                offsets[output_buffer] = so->output[i].dst_offset;
             }
 
+            unsigned attrib_slot = so->output[i].register_index;
+            attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs);
+
             state.stream.decl[num].bufferIndex = output_buffer;
-            state.stream.decl[num].attribSlot = so->output[i].register_index - 1;
+            state.stream.decl[num].attribSlot = attrib_slot;
             state.stream.decl[num].componentMask =
                ((1 << so->output[i].num_components) - 1)
                << so->output[i].start_component;
@@ -95,7 +103,7 @@
          assert(ctx->vs->soFunc[info->mode] && "Error: SoShader = NULL");
       }
 
-      SwrSetSoFunc(ctx->swrContext, ctx->vs->soFunc[info->mode], 0);
+      ctx->api.pfnSwrSetSoFunc(ctx->swrContext, ctx->vs->soFunc[info->mode], 0);
    }
 
    struct swr_vertex_element_state *velems = ctx->velems;
@@ -118,11 +126,43 @@
       velems->map.insert(std::make_pair(key, velems->fsFunc));
    }
 
-   SwrSetFetchFunc(ctx->swrContext, velems->fsFunc);
+   ctx->api.pfnSwrSetFetchFunc(ctx->swrContext, velems->fsFunc);
 
    /* Set up frontend state
     * XXX setup provokingVertex & topologyProvokingVertex */
    SWR_FRONTEND_STATE feState = {0};
+
+   // feState.vsVertexSize seeds the PA size that is used as an interface
+   // between all the shader stages, so it has to be large enough to
+   // incorporate all interfaces between stages
+
+   // max of gs and vs num_outputs
+   feState.vsVertexSize = ctx->vs->info.base.num_outputs;
+   if (ctx->gs &&
+       ctx->gs->info.base.num_outputs > feState.vsVertexSize) {
+      feState.vsVertexSize = ctx->gs->info.base.num_outputs;
+   }
+
+   if (ctx->vs->info.base.num_outputs) {
+      // gs does not adjust for position in SGV slot at input from vs
+      if (!ctx->gs)
+         feState.vsVertexSize--;
+   }
+
+   // other (non-SGV) slots start at VERTEX_ATTRIB_START_SLOT
+   feState.vsVertexSize += VERTEX_ATTRIB_START_SLOT;
+
+   // The PA in the clipper does not handle BE vertex sizes that differ
+   // from FE, so increase the vertex size only in the cases that need it
+
+   // primid needs a slot
+   if (ctx->fs->info.base.uses_primid)
+      feState.vsVertexSize++;
+   // sprite coord enable
+   if (ctx->rasterizer->sprite_coord_enable)
+      feState.vsVertexSize++;
+
+
    if (ctx->rasterizer->flatshade_first) {
       feState.provokingVertex = {1, 0, 0};
    } else {
@@ -160,23 +200,32 @@
    }
 
    feState.bEnableCutIndex = info->primitive_restart;
-   SwrSetFrontendState(ctx->swrContext, &feState);
+   ctx->api.pfnSwrSetFrontendState(ctx->swrContext, &feState);
 
-   if (info->indexed)
-      SwrDrawIndexedInstanced(ctx->swrContext,
-                              swr_convert_prim_topology(info->mode),
-                              info->count,
-                              info->instance_count,
-                              info->start,
-                              info->index_bias,
-                              info->start_instance);
+   if (info->index_size)
+      ctx->api.pfnSwrDrawIndexedInstanced(ctx->swrContext,
+                                          swr_convert_prim_topology(info->mode),
+                                          info->count,
+                                          info->instance_count,
+                                          info->start,
+                                          info->index_bias,
+                                          info->start_instance);
    else
-      SwrDrawInstanced(ctx->swrContext,
-                       swr_convert_prim_topology(info->mode),
-                       info->count,
-                       info->instance_count,
-                       info->start,
-                       info->start_instance);
+      ctx->api.pfnSwrDrawInstanced(ctx->swrContext,
+                                   swr_convert_prim_topology(info->mode),
+                                   info->count,
+                                   info->instance_count,
+                                   info->start,
+                                   info->start_instance);
+
+   /* On a large client-buffer draw, the client buffer is used directly,
+    * without a copy.  Block until the draw is finished.
+    * VMD is an example application that benefits from this. */
+   if (ctx->dirty & SWR_LARGE_CLIENT_DRAW) {
+      struct swr_screen *screen = swr_screen(pipe->screen);
+      swr_fence_submit(ctx, screen->flush_fence);
+      swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0);
+   }
 }
 
 
@@ -210,6 +259,25 @@
    swr_fence_reference(pipe->screen, &fence, NULL);
 }
 
+/*
+ * Invalidate an attachment's tiles so they can be reloaded when needed.
+ */
+void
+swr_invalidate_render_target(struct pipe_context *pipe,
+                             uint32_t attachment,
+                             uint16_t width, uint16_t height)
+{
+   struct swr_context *ctx = swr_context(pipe);
+
+   /* build the rect from the passed-in width and height */
+   swr_update_draw_context(ctx);
+   SWR_RECT full_rect =
+      {0, 0, (int32_t)width, (int32_t)height};
+   ctx->api.pfnSwrInvalidateTiles(ctx->swrContext,
+                                  1 << attachment,
+                                  full_rect);
+}
+
 
 /*
  * Store SWR HotTiles back to renderTarget surface.
@@ -230,10 +298,10 @@
          {0, 0,
           (int32_t)u_minify(renderTarget->width, renderTarget->lod),
           (int32_t)u_minify(renderTarget->height, renderTarget->lod)};
-      SwrStoreTiles(ctx->swrContext,
-                    1 << attachment,
-                    post_tile_state,
-                    full_rect);
+      ctx->api.pfnSwrStoreTiles(ctx->swrContext,
+                                1 << attachment,
+                                post_tile_state,
+                                full_rect);
    }
 }
 
diff --git a/src/gallium/drivers/swr/swr_fence.cpp b/src/gallium/drivers/swr/swr_fence.cpp
index c73bbbf..abf1d0c 100644
--- a/src/gallium/drivers/swr/swr_fence.cpp
+++ b/src/gallium/drivers/swr/swr_fence.cpp
@@ -59,7 +59,7 @@
 
    fence->write++;
    fence->pending = TRUE;
-   SwrSync(ctx->swrContext, swr_fence_cb, (uint64_t)fence, fence->write, 0);
+   ctx->api.pfnSwrSync(ctx->swrContext, swr_fence_cb, (uint64_t)fence, fence->write, 0);
 }
 
 /*
diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp
index 4d71a67..e205fe2 100644
--- a/src/gallium/drivers/swr/swr_loader.cpp
+++ b/src/gallium/drivers/swr/swr_loader.cpp
@@ -24,32 +24,65 @@
 #include "util/u_cpu_detect.h"
 #include "util/u_dl.h"
 #include "swr_public.h"
-
-#include "pipe/p_screen.h"
+#include "swr_screen.h"
 
 #include <stdio.h>
 
-typedef pipe_screen *(*screen_create_proc)(struct sw_winsys *winsys);
-
 struct pipe_screen *
 swr_create_screen(struct sw_winsys *winsys)
 {
-   char filename[256];
+   char filename[256] = { 0 };
    fprintf(stderr, "SWR detected ");
 
    util_dl_library *pLibrary = nullptr;
 
    util_cpu_detect();
-   if (util_cpu_caps.has_avx2) {
-      fprintf(stderr, "AVX2\n");
-      sprintf(filename, "%s%s%s", UTIL_DL_PREFIX, "swrAVX2", UTIL_DL_EXT);
-   } else if (util_cpu_caps.has_avx) {
-      fprintf(stderr, "AVX\n");
-      sprintf(filename, "%s%s%s", UTIL_DL_PREFIX, "swrAVX", UTIL_DL_EXT);
-   } else {
-      fprintf(stderr, "no AVX/AVX2 support.  Aborting!\n");
-      exit(-1);
+
+   if (!strlen(filename) &&
+       util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) {
+#if HAVE_SWR_KNL
+      fprintf(stderr, "KNL ");
+      sprintf(filename, "%s%s%s", UTIL_DL_PREFIX, "swrKNL", UTIL_DL_EXT);
+#else
+      fprintf(stderr, "KNL (not built) ");
+#endif
    }
+
+   if (!strlen(filename) &&
+       util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512bw) {
+#if HAVE_SWR_SKX
+      fprintf(stderr, "SKX ");
+      sprintf(filename, "%s%s%s", UTIL_DL_PREFIX, "swrSKX", UTIL_DL_EXT);
+#else
+      fprintf(stderr, "SKX (not built) ");
+#endif
+   }
+
+   if (!strlen(filename) && util_cpu_caps.has_avx2) {
+#if HAVE_SWR_AVX2
+      fprintf(stderr, "AVX2 ");
+      sprintf(filename, "%s%s%s", UTIL_DL_PREFIX, "swrAVX2", UTIL_DL_EXT);
+#else
+      fprintf(stderr, "AVX2 (not built) ");
+#endif
+   }
+
+   if (!strlen(filename) && util_cpu_caps.has_avx) {
+#if HAVE_SWR_AVX
+      fprintf(stderr, "AVX ");
+      sprintf(filename, "%s%s%s", UTIL_DL_PREFIX, "swrAVX", UTIL_DL_EXT);
+#else
+      fprintf(stderr, "AVX (not built) ");
+#endif
+   }
+
+   if (!strlen(filename)) {
+      fprintf(stderr, "- no appropriate swr architecture library.  Aborting!\n");
+      exit(-1);
+   } else {
+      fprintf(stderr, "\n");
+   }
+
    pLibrary = util_dl_open(filename);
 
    if (!pLibrary) {
@@ -57,16 +90,17 @@
       exit(-1);
    }
 
-   util_dl_proc pScreenProc = util_dl_get_proc_address(pLibrary, "swr_create_screen_internal");
+   util_dl_proc pApiProc = util_dl_get_proc_address(pLibrary, "SwrGetInterface");
 
-   if (!pScreenProc) {
+   if (!pApiProc) {
       fprintf(stderr, "SWR library search failure: %s\n", util_dl_error());
       exit(-1);
    }
 
-   screen_create_proc pScreenCreate = (screen_create_proc)pScreenProc;
+   struct pipe_screen *screen = swr_create_screen_internal(winsys);
+   swr_screen(screen)->pfnSwrGetInterface = (PFNSwrGetInterface)pApiProc;
 
-   return pScreenCreate(winsys);
+   return screen;
 }
 
 
diff --git a/src/gallium/drivers/swr/swr_memory.h b/src/gallium/drivers/swr/swr_memory.h
index b8ce27f..fc55616 100644
--- a/src/gallium/drivers/swr/swr_memory.h
+++ b/src/gallium/drivers/swr/swr_memory.h
@@ -23,28 +23,6 @@
 
 #pragma once
 
-void LoadHotTile(
-    const SWR_SURFACE_STATE *pSrcSurface,
-    SWR_FORMAT dstFormat,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    UINT x, UINT y, uint32_t renderTargetArrayIndex,
-    uint8_t *pDstHotTile);
-
-void StoreHotTileToSurface(
-    SWR_SURFACE_STATE *pDstSurface,
-    SWR_FORMAT srcFormat,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    UINT x, UINT y, uint32_t renderTargetArrayIndex,
-    uint8_t *pSrcHotTile);
-
-void StoreHotTileClear(
-    SWR_SURFACE_STATE *pDstSurface,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    UINT x,
-    UINT y,
-    uint32_t renderTargetArrayIndex,
-    const float* pClearColor);
-
 INLINE void
 swr_LoadHotTile(HANDLE hPrivateContext,
                 SWR_FORMAT dstFormat,
@@ -56,7 +34,7 @@
    swr_draw_context *pDC = (swr_draw_context*)hPrivateContext;
    SWR_SURFACE_STATE *pSrcSurface = &pDC->renderTargets[renderTargetIndex];
 
-   LoadHotTile(pSrcSurface, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile);
+   pDC->pAPI->pfnSwrLoadHotTile(pSrcSurface, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile);
 }
 
 INLINE void
@@ -70,7 +48,7 @@
    swr_draw_context *pDC = (swr_draw_context*)hPrivateContext;
    SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex];
 
-   StoreHotTileToSurface(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile);
+   pDC->pAPI->pfnSwrStoreHotTileToSurface(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile);
 }
 
 INLINE void
@@ -85,17 +63,5 @@
    swr_draw_context *pDC = (swr_draw_context*)hPrivateContext;
    SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex];
 
-   StoreHotTileClear(pDstSurface, renderTargetIndex, x, y, renderTargetArrayIndex, pClearColor);
-}
-
-void InitSimLoadTilesTable();
-void InitSimStoreTilesTable();
-void InitSimClearTilesTable();
-
-/* Init Load/Store/ClearTiles Tables */
-INLINE void swr_InitMemoryModule()
-{
-   InitSimLoadTilesTable();
-   InitSimStoreTilesTable();
-   InitSimClearTilesTable();
+   pDC->pAPI->pfnSwrStoreHotTileClear(pDstSurface, renderTargetIndex, x, y, renderTargetArrayIndex, pClearColor);
 }
diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp
index e097790..4c14c52 100644
--- a/src/gallium/drivers/swr/swr_query.cpp
+++ b/src/gallium/drivers/swr/swr_query.cpp
@@ -180,8 +180,8 @@
 
       /* Only change stat collection if there are no active queries */
       if (ctx->active_queries == 0) {
-         SwrEnableStatsFE(ctx->swrContext, TRUE);
-         SwrEnableStatsBE(ctx->swrContext, TRUE);
+         ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, TRUE);
+         ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, TRUE);
       }
       ctx->active_queries++;
       break;
@@ -217,8 +217,8 @@
       /* Only change stat collection if there are no active queries */
       ctx->active_queries--;
       if (ctx->active_queries == 0) {
-         SwrEnableStatsFE(ctx->swrContext, FALSE);
-         SwrEnableStatsBE(ctx->swrContext, FALSE);
+         ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, FALSE);
+         ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, FALSE);
       }
 
       break;
diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h
index ae9954c..4effd46 100644
--- a/src/gallium/drivers/swr/swr_resource.h
+++ b/src/gallium/drivers/swr/swr_resource.h
@@ -96,6 +96,10 @@
 }
 
 
+void swr_invalidate_render_target(struct pipe_context *pipe,
+                                  uint32_t attachment,
+                                  uint16_t width, uint16_t height);
+
 void swr_store_render_target(struct pipe_context *pipe,
                              uint32_t attachment,
                              enum SWR_TILE_STATE post_tile_state);
diff --git a/src/gallium/drivers/swr/swr_scratch.cpp b/src/gallium/drivers/swr/swr_scratch.cpp
index db095de..d298a48 100644
--- a/src/gallium/drivers/swr/swr_scratch.cpp
+++ b/src/gallium/drivers/swr/swr_scratch.cpp
@@ -28,6 +28,7 @@
 #include "swr_fence_work.h"
 #include "api.h"
 
+#define SCRATCH_SINGLE_ALLOCATION_LIMIT 2048
 
 void *
 swr_copy_to_scratch_space(struct swr_context *ctx,
@@ -39,9 +40,9 @@
    assert(space);
    assert(size);
 
-   if (size >= 2048) { /* XXX TODO create KNOB_ for this */
+   if (size >= SCRATCH_SINGLE_ALLOCATION_LIMIT) {
       /* Use per draw SwrAllocDrawContextMemory for larger copies */
-      ptr = SwrAllocDrawContextMemory(ctx->swrContext, size, 4);
+      ptr = ctx->api.pfnSwrAllocDrawContextMemory(ctx->swrContext, size, 4);
    } else {
       /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. */
       unsigned int max_size_in_flight = size * KNOB_MAX_DRAWS_IN_FLIGHT;
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp
index b684e41..952ae0c 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -46,11 +46,6 @@
 #include <stdio.h>
 #include <map>
 
-/* MSVC case instensitive compare */
-#if defined(PIPE_CC_MSVC)
-   #define strcasecmp lstrcmpiA
-#endif
-
 /*
  * Max texture sizes
  * XXX Check max texture size values against core and sampler.
@@ -61,6 +56,9 @@
 #define SWR_MAX_TEXTURE_CUBE_LEVELS 14  /* 8K x 8K for now */
 #define SWR_MAX_TEXTURE_ARRAY_LAYERS 512 /* 8K x 512 / 8K x 8K x 512 */
 
+/* Default max client_copy_limit */
+#define SWR_CLIENT_COPY_LIMIT 32768
+
 /* Flag indicates creation of alternate surface, to prevent recursive loop
  * in resource creation when msaa_force_enable is set. */
 #define SWR_RESOURCE_FLAG_ALT_SURFACE (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
@@ -337,6 +335,10 @@
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -1012,11 +1014,12 @@
    struct sw_winsys *winsys = screen->winsys;
    struct swr_resource *spr = swr_resource(resource);
    struct pipe_context *pipe = screen->pipe;
+   struct swr_context *ctx = swr_context(pipe);
 
    if (pipe) {
       swr_fence_finish(p_screen, NULL, screen->flush_fence, 0);
       swr_resource_unused(resource);
-      SwrEndFrame(swr_context(pipe)->swrContext);
+      ctx->api.pfnSwrEndFrame(ctx->swrContext);
    }
 
    /* Multisample resolved into resolve_target at flush with store_resource */
@@ -1058,46 +1061,19 @@
    FREE(screen);
 }
 
-PUBLIC
-struct pipe_screen *
-swr_create_screen_internal(struct sw_winsys *winsys)
+
+static void
+swr_validate_env_options(struct swr_screen *screen)
 {
-   struct swr_screen *screen = CALLOC_STRUCT(swr_screen);
-
-   if (!screen)
-      return NULL;
-
-   if (!getenv("KNOB_MAX_PRIMS_PER_DRAW")) {
-      g_GlobalKnobs.MAX_PRIMS_PER_DRAW.Value(49152);
-   }
-
-   if (!lp_build_init()) {
-      FREE(screen);
-      return NULL;
-   }
-
-   screen->winsys = winsys;
-   screen->base.get_name = swr_get_name;
-   screen->base.get_vendor = swr_get_vendor;
-   screen->base.is_format_supported = swr_is_format_supported;
-   screen->base.context_create = swr_create_context;
-   screen->base.can_create_resource = swr_can_create_resource;
-
-   screen->base.destroy = swr_destroy_screen;
-   screen->base.get_param = swr_get_param;
-   screen->base.get_shader_param = swr_get_shader_param;
-   screen->base.get_paramf = swr_get_paramf;
-
-   screen->base.resource_create = swr_resource_create;
-   screen->base.resource_destroy = swr_resource_destroy;
-
-   screen->base.flush_frontbuffer = swr_flush_frontbuffer;
-
-   screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, KNOB_ARCH_STR, "swr");
-
-   swr_fence_init(&screen->base);
-
-   util_format_s3tc_init();
+   /* The client_copy_limit sets a maximum on the amount of user-buffer memory
+    * copied to scratch space on a draw.  Past this, the draw will access
+    * user-buffer directly and then block.  This is faster than queuing many
+    * large client draws. */
+   screen->client_copy_limit = SWR_CLIENT_COPY_LIMIT;
+   int client_copy_limit =
+      debug_get_num_option("SWR_CLIENT_COPY_LIMIT", SWR_CLIENT_COPY_LIMIT);
+   if (client_copy_limit > 0)
+      screen->client_copy_limit = client_copy_limit;
 
    /* XXX msaa under development, disable by default for now */
    screen->msaa_max_count = 0; /* was SWR_MAX_NUM_MULTISAMPLES; */
@@ -1125,6 +1101,48 @@
          "SWR_MSAA_FORCE_ENABLE", false);
    if (screen->msaa_force_enable)
       fprintf(stderr, "SWR_MSAA_FORCE_ENABLE: true\n");
+}
+
+
+PUBLIC
+struct pipe_screen *
+swr_create_screen_internal(struct sw_winsys *winsys)
+{
+   struct swr_screen *screen = CALLOC_STRUCT(swr_screen);
+
+   if (!screen)
+      return NULL;
+
+   if (!lp_build_init()) {
+      FREE(screen);
+      return NULL;
+   }
+
+   screen->winsys = winsys;
+   screen->base.get_name = swr_get_name;
+   screen->base.get_vendor = swr_get_vendor;
+   screen->base.is_format_supported = swr_is_format_supported;
+   screen->base.context_create = swr_create_context;
+   screen->base.can_create_resource = swr_can_create_resource;
+
+   screen->base.destroy = swr_destroy_screen;
+   screen->base.get_param = swr_get_param;
+   screen->base.get_shader_param = swr_get_shader_param;
+   screen->base.get_paramf = swr_get_paramf;
+
+   screen->base.resource_create = swr_resource_create;
+   screen->base.resource_destroy = swr_resource_destroy;
+
+   screen->base.flush_frontbuffer = swr_flush_frontbuffer;
+
+   // Pass "" as the architecture so that it is determined at run time
+   screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, "", "swr");
+
+   swr_fence_init(&screen->base);
+
+   util_format_s3tc_init();
+
+   swr_validate_env_options(screen);
 
    return &screen->base;
 }
diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h
index dc1bb47..a11ea9f 100644
--- a/src/gallium/drivers/swr/swr_screen.h
+++ b/src/gallium/drivers/swr/swr_screen.h
@@ -43,10 +43,14 @@
 
    struct sw_winsys *winsys;
 
+   /* Configurable environment settings */
    boolean msaa_force_enable;
    uint8_t msaa_max_count;
+   uint32_t client_copy_limit;
 
    HANDLE hJitMgr;
+
+   PFNSwrGetInterface pfnSwrGetInterface;
 };
 
 static INLINE struct swr_screen *
diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp
index d8f5512..0a81eaa 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -47,12 +47,6 @@
 #include "swr_state.h"
 #include "swr_screen.h"
 
-#if HAVE_LLVM < 0x0500
-namespace llvm {
-typedef AttributeSet AttributeList;
-}
-#endif
-
 using namespace SwrJit;
 using namespace llvm;
 
@@ -226,6 +220,9 @@
       gallivm_free_ir(gallivm);
    }
 
+   void WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput,
+                unsigned slot, unsigned channel);
+
    struct gallivm_state *gallivm;
    PFN_VERTEX_FUNC CompileVS(struct swr_context *ctx, swr_jit_vs_key &key);
    PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_fs_key &key);
@@ -373,8 +370,13 @@
 
     IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
 
+#if USE_SIMD16_FRONTEND
+    const uint32_t simdVertexStride = sizeof(simdvertex) * 2;
+    const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / (mVWidth * 2);
+#else
     const uint32_t simdVertexStride = sizeof(simdvertex);
-    const uint32_t numSimdBatches = (pGS->maxNumVerts + 7) / 8;
+    const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / mVWidth;
+#endif
     const uint32_t inputPrimStride = numSimdBatches * simdVertexStride;
 
     Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream });
@@ -391,36 +393,71 @@
           inputPrimStride * 6,
           inputPrimStride * 7 } );
 
-    Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), 3);
-    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), 7);
+#if USE_SIMD16_FRONTEND
+    const uint32_t simdShift = log2(mVWidth * 2);
+    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1);
+#else
+    const uint32_t simdShift = log2(mVWidth);
+    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1);
+#endif
+    Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift);
 
     for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {
        uint32_t attribSlot = attrib;
-       if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE)
-          attribSlot = VERTEX_POINT_SIZE_SLOT;
-       else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PRIMID)
-          attribSlot = VERTEX_PRIMID_SLOT;
-       else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER)
-          attribSlot = VERTEX_RTAI_SLOT;
+       uint32_t sgvChannel = 0;
+       if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) {
+          attribSlot = VERTEX_SGV_SLOT;
+          sgvChannel = VERTEX_SGV_POINT_SIZE_COMP;
+       } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER) {
+          attribSlot = VERTEX_SGV_SLOT;
+          sgvChannel = VERTEX_SGV_RTAI_COMP;
+       } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) {
+          attribSlot = VERTEX_POSITION_SLOT;
+       } else {
+          attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
+          if (iface->info->writes_position) {
+             attribSlot--;
+          }
+       }
 
+#if USE_SIMD16_FRONTEND
+       Value *vOffsetsAttrib =
+          ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex) * 2)));
+       vOffsetsAttrib =
+          ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2)));
+#else
        Value *vOffsetsAttrib =
           ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex))));
        vOffsetsAttrib =
           ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector))));
+#endif
        vOffsetsAttrib =
           ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float))));
 
        for (uint32_t channel = 0; channel < 4; ++channel) {
-          Value *vData = LOAD(unwrap(outputs[attrib][channel]));
           Value *vPtrs = GEP(pStream, vOffsetsAttrib);
+          Value *vData;
 
-          vPtrs = BITCAST(vPtrs,
-                          VectorType::get(PointerType::get(mFP32Ty, 0), 8));
+          if (attribSlot == VERTEX_SGV_SLOT)
+             vData = LOAD(unwrap(outputs[attrib][0]));
+          else
+             vData = LOAD(unwrap(outputs[attrib][channel]));
 
-          MASKED_SCATTER(vData, vPtrs, 32, vMask1);
+          if (attribSlot != VERTEX_SGV_SLOT ||
+              sgvChannel == channel) {
+             vPtrs = BITCAST(vPtrs,
+                             VectorType::get(PointerType::get(mFP32Ty, 0), 8));
 
+             MASKED_SCATTER(vData, vPtrs, 32, vMask1);
+          }
+
+#if USE_SIMD16_FRONTEND
+          vOffsetsAttrib =
+             ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2));
+#else
           vOffsetsAttrib =
              ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar)));
+#endif
        }
     }
 }
@@ -513,14 +550,12 @@
    pGS->maxNumVerts = info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
    pGS->instanceCount = info->properties[TGSI_PROPERTY_GS_INVOCATIONS];
 
-   pGS->emitsRenderTargetArrayIndex = info->writes_layer;
-   pGS->emitsPrimitiveID = info->writes_primid;
-   pGS->emitsViewportArrayIndex = info->writes_viewport_index;
-
    // XXX: single stream for now...
    pGS->isSingleStream = true;
    pGS->singleStreamID = 0;
 
+   pGS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize
+
    struct swr_geometry_shader *gs = ctx->gs;
 
    LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
@@ -530,8 +565,6 @@
 
    AttrBuilder attrBuilder;
    attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
-   AttributeList attrSet = AttributeList::get(
-      JM()->mContext, AttributeList::FunctionIndex, attrBuilder);
 
    std::vector<Type *> gsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
                               PointerType::get(Gen_SWR_GS_CONTEXT(JM()), 0)};
@@ -543,7 +576,13 @@
                                      GlobalValue::ExternalLinkage,
                                      "GS",
                                      JM()->mpCurrentModule);
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrSet);
+#if HAVE_LLVM < 0x0500
+   AttributeSet attrSet = AttributeSet::get(
+      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
+   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
+#else
+   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
+#endif
 
    BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
    IRB()->SetInsertPoint(block);
@@ -576,8 +615,15 @@
       ubyte semantic_name = info->input_semantic_name[slot];
       ubyte semantic_idx = info->input_semantic_index[slot];
 
-      unsigned vs_slot =
-         locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base) + 1;
+      unsigned vs_slot = locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base);
+
+      vs_slot += VERTEX_ATTRIB_START_SLOT;
+
+      if (ctx->vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION)
+         vs_slot--;
+
+      if (semantic_name == TGSI_SEMANTIC_POSITION)
+         vs_slot = VERTEX_POSITION_SLOT;
 
       STORE(C(vs_slot), vtxAttribMap, {0, slot});
       mapConstants.push_back(C(vs_slot));
@@ -657,6 +703,23 @@
    return func;
 }
 
+void
+BuilderSWR::WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, unsigned slot, unsigned channel)
+{
+#if USE_SIMD16_FRONTEND
+   // interleave the simdvertex components into the dest simd16vertex
+   //   slot16offset = slot8offset * 2
+   //   comp16offset = comp8offset * 2 + alternateOffset
+
+   Value *offset = LOAD(pVsContext, { 0, SWR_VS_CONTEXT_AlternateOffset });
+   Value *pOut = GEP(pVtxOutput, { C(0), C(0), C(slot * 2), offset } );
+   STORE(pVal, pOut, {channel * 2});
+#else
+   Value *pOut = GEP(pVtxOutput, {0, 0, slot});
+   STORE(pVal, pOut, {0, channel});
+#endif
+}
+
 PFN_VERTEX_FUNC
 BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
 {
@@ -669,8 +732,6 @@
 
    AttrBuilder attrBuilder;
    attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
-   AttributeList attrSet = AttributeList::get(
-      JM()->mContext, AttributeList::FunctionIndex, attrBuilder);
 
    std::vector<Type *> vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
                               PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)};
@@ -682,7 +743,13 @@
                                      GlobalValue::ExternalLinkage,
                                      "VS",
                                      JM()->mpCurrentModule);
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrSet);
+#if HAVE_LLVM < 0x0500
+   AttributeSet attrSet = AttributeSet::get(
+      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
+   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
+#else
+   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
+#endif
 
    BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
    IRB()->SetInsertPoint(block);
@@ -747,12 +814,25 @@
          if (!outputs[attrib][channel])
             continue;
 
-         Value *val = LOAD(unwrap(outputs[attrib][channel]));
+         Value *val;
+         uint32_t outSlot;
 
-         uint32_t outSlot = attrib;
-         if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE)
-            outSlot = VERTEX_POINT_SIZE_SLOT;
-         STORE(val, vtxOutput, {0, 0, outSlot, channel});
+         if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) {
+            if (channel != VERTEX_SGV_POINT_SIZE_COMP)
+               continue;
+            val = LOAD(unwrap(outputs[attrib][0]));
+            outSlot = VERTEX_SGV_SLOT;
+         } else if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) {
+            val = LOAD(unwrap(outputs[attrib][channel]));
+            outSlot = VERTEX_POSITION_SLOT;
+         } else {
+            val = LOAD(unwrap(outputs[attrib][channel]));
+            outSlot = VERTEX_ATTRIB_START_SLOT + attrib;
+            if (swr_vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION)
+               outSlot--;
+         }
+
+         WriteVS(val, pVsCtx, vtxOutput, outSlot, channel);
       }
    }
 
@@ -762,8 +842,8 @@
 
       unsigned cv = 0;
       if (swr_vs->info.base.writes_clipvertex) {
-         cv = 1 + locate_linkage(TGSI_SEMANTIC_CLIPVERTEX, 0,
-                                 &swr_vs->info.base);
+         cv = locate_linkage(TGSI_SEMANTIC_CLIPVERTEX, 0,
+                             &swr_vs->info.base);
       } else {
          for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
             if (swr_vs->info.base.output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
@@ -782,14 +862,14 @@
          // clip distance overrides user clip planes
          if ((swr_vs->info.base.clipdist_writemask & clip_mask & (1 << val)) ||
              ((swr_vs->info.base.culldist_writemask << swr_vs->info.base.num_written_clipdistance) & (1 << val))) {
-            unsigned cv = 1 + locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 0 : 1,
-                                             &swr_vs->info.base);
+            unsigned cv = locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 0 : 1,
+                                         &swr_vs->info.base);
             if (val < 4) {
                LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val], "");
-               STORE(unwrap(dist), vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_LO_SLOT, val});
+               WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val);
             } else {
                LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val - 4], "");
-               STORE(unwrap(dist), vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4});
+               WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4);
             }
             continue;
          }
@@ -807,9 +887,9 @@
                                       FMUL(unwrap(cw), VBROADCAST(pw)))));
 
          if (val < 4)
-            STORE(dist, vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_LO_SLOT, val});
+            WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val);
          else
-            STORE(dist, vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4});
+            WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4);
       }
    }
 
@@ -846,13 +926,40 @@
    return func;
 }
 
+unsigned
+swr_so_adjust_attrib(unsigned in_attrib,
+                     swr_vertex_shader *swr_vs)
+{
+   ubyte semantic_name;
+   unsigned attrib;
+
+   attrib = in_attrib + VERTEX_ATTRIB_START_SLOT;
+
+   if (swr_vs) {
+      semantic_name = swr_vs->info.base.output_semantic_name[in_attrib];
+      if (semantic_name == TGSI_SEMANTIC_POSITION) {
+         attrib = VERTEX_POSITION_SLOT;
+      } else if (semantic_name == TGSI_SEMANTIC_PSIZE) {
+         attrib = VERTEX_SGV_SLOT;
+      } else if (semantic_name == TGSI_SEMANTIC_LAYER) {
+         attrib = VERTEX_SGV_SLOT;
+      } else {
+         if (swr_vs->info.base.writes_position) {
+               attrib--;
+         }
+      }
+   }
+
+   return attrib;
+}
+
 static unsigned
 locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info)
 {
    for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
       if ((info->output_semantic_name[i] == name)
           && (info->output_semantic_index[i] == index)) {
-         return i - 1; // position is not part of the linkage
+         return i;
       }
    }
 
@@ -880,8 +987,6 @@
 
    AttrBuilder attrBuilder;
    attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
-   AttributeList attrSet = AttributeList::get(
-      JM()->mContext, AttributeList::FunctionIndex, attrBuilder);
 
    std::vector<Type *> fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
                               PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)};
@@ -892,7 +997,13 @@
                                      GlobalValue::ExternalLinkage,
                                      "FS",
                                      JM()->mpCurrentModule);
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrSet);
+#if HAVE_LLVM < 0x0500
+   AttributeSet attrSet = AttributeSet::get(
+      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
+   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
+#else
+   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
+#endif
 
    BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
    IRB()->SetInsertPoint(block);
@@ -994,23 +1105,23 @@
          inputs[attrib][3] =
             wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center}, "vOneOverW"));
          continue;
-      } else if (semantic_name == TGSI_SEMANTIC_PRIMID) {
-         Value *primID = LOAD(pPS, {0, SWR_PS_CONTEXT_primID}, "primID");
-         inputs[attrib][0] = wrap(VECTOR_SPLAT(JM()->mVWidth, primID));
-         inputs[attrib][1] = wrap(VIMMED1(0));
-         inputs[attrib][2] = wrap(VIMMED1(0));
-         inputs[attrib][3] = wrap(VIMMED1(0));
-         continue;
       }
 
       unsigned linkedAttrib =
-         locate_linkage(semantic_name, semantic_idx, pPrevShader);
+         locate_linkage(semantic_name, semantic_idx, pPrevShader) - 1;
 
-      if (semantic_name == TGSI_SEMANTIC_GENERIC &&
+      uint32_t extraAttribs = 0;
+      if (semantic_name == TGSI_SEMANTIC_PRIMID && !ctx->gs) {
+         /* non-gs generated primID - need to grab from swizzleMap override */
+         linkedAttrib = pPrevShader->num_outputs - 1;
+         swr_fs->constantMask |= 1 << linkedAttrib;
+         extraAttribs++;
+      } else if (semantic_name == TGSI_SEMANTIC_GENERIC &&
           key.sprite_coord_enable & (1 << semantic_idx)) {
          /* we add an extra attrib to the backendState in swr_update_derived. */
-         linkedAttrib = pPrevShader->num_outputs - 1;
+         linkedAttrib = pPrevShader->num_outputs + extraAttribs - 1;
          swr_fs->pointSpriteMask |= (1 << linkedAttrib);
+         extraAttribs++;
       } else if (linkedAttrib == 0xFFFFFFFF) {
          inputs[attrib][0] = wrap(VIMMED1(0.0f));
          inputs[attrib][1] = wrap(VIMMED1(0.0f));
@@ -1033,7 +1144,7 @@
       Value *offset = NULL;
       if (semantic_name == TGSI_SEMANTIC_COLOR && key.light_twoside) {
          bcolorAttrib = locate_linkage(
-               TGSI_SEMANTIC_BCOLOR, semantic_idx, pPrevShader);
+               TGSI_SEMANTIC_BCOLOR, semantic_idx, pPrevShader) - 1;
          /* Neither front nor back colors were available. Nothing to load. */
          if (bcolorAttrib == 0xFFFFFFFF && linkedAttrib == 0xFFFFFFFF)
             continue;
diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h
index 1ab6846..6468874 100644
--- a/src/gallium/drivers/swr/swr_shader.h
+++ b/src/gallium/drivers/swr/swr_shader.h
@@ -30,6 +30,9 @@
 struct swr_jit_vs_key;
 struct swr_jit_gs_key;
 
+unsigned swr_so_adjust_attrib(unsigned in_attrib,
+                              swr_vertex_shader *swr_vs);
+
 PFN_VERTEX_FUNC
 swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key);
 
diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp
index 56b1374..47ab445 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -31,6 +31,7 @@
 #include "jit_api.h"
 #include "gen_state_llvm.h"
 #include "core/multisample.h"
+#include "core/state_funcs.h"
 
 #include "gallivm/lp_bld_tgsi.h"
 #include "util/u_format.h"
@@ -344,8 +345,10 @@
       // soState.streamToRasterizer not used
 
       for (uint32_t i = 0; i < stream_output->num_outputs; i++) {
+         unsigned attrib_slot = stream_output->output[i].register_index;
+         attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs);
          swr_vs->soState.streamMasks[stream_output->output[i].stream] |=
-            1 << (stream_output->output[i].register_index - 1);
+            (1 << attrib_slot);
       }
       for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
         swr_vs->soState.streamNumEntries[i] =
@@ -495,6 +498,7 @@
    assert(num_elements <= PIPE_MAX_ATTRIBS);
    velems = new swr_vertex_element_state;
    if (velems) {
+      memset(&velems->fsState, 0, sizeof(velems->fsState));
       velems->fsState.bVertexIDOffsetEnable = true;
       velems->fsState.numAttribs = num_elements;
       for (unsigned i = 0; i < num_elements; i++) {
@@ -591,20 +595,6 @@
 
 
 static void
-swr_set_index_buffer(struct pipe_context *pipe,
-                     const struct pipe_index_buffer *ib)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (ib)
-      memcpy(&ctx->index_buffer, ib, sizeof(ctx->index_buffer));
-   else
-      memset(&ctx->index_buffer, 0, sizeof(ctx->index_buffer));
-
-   ctx->dirty |= SWR_NEW_VERTEX;
-}
-
-static void
 swr_set_polygon_stipple(struct pipe_context *pipe,
                         const struct pipe_poly_stipple *stipple)
 {
@@ -744,15 +734,14 @@
    /* VBO vertex buffers */
    for (uint32_t i = 0; i < ctx->num_vertex_buffers; i++) {
       struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i];
-      if (!vb->user_buffer)
-         swr_resource_read(vb->buffer);
+      if (!vb->is_user_buffer)
+         swr_resource_read(vb->buffer.resource);
    }
 
    /* VBO index buffer */
-   if (p_draw_info && p_draw_info->indexed) {
-      struct pipe_index_buffer *ib = &ctx->index_buffer;
-      if (!ib->user_buffer)
-         swr_resource_read(ib->buffer);
+   if (p_draw_info && p_draw_info->index_size) {
+      if (!p_draw_info->has_user_indices)
+         swr_resource_read(p_draw_info->index.resource);
    }
 
    /* transform feedback buffers */
@@ -948,6 +937,11 @@
        * INVALID so they are reloaded from surface. */
       swr_store_render_target(&ctx->pipe, attachment, SWR_TILE_INVALID);
       need_fence = true;
+   } else {
+      /* if no previous attachment, invalidate tiles that may be marked
+       * RESOLVED because of an old attachment */
+      swr_invalidate_render_target(&ctx->pipe, attachment, sf->width, sf->height);
+      /* no need to set fence here */
    }
 
    /* Make new attachment */
@@ -1153,6 +1147,10 @@
          rastState->slopeScaledDepthBias = 0;
          rastState->depthBiasClamp = 0;
       }
+
+      /* translate polygon mode, at least for the front==back case */
+      rastState->fillMode = swr_convert_fill_mode(rasterizer->fill_front);
+
       struct pipe_surface *zb = fb->zsbuf;
       if (zb && swr_resource(zb->texture)->has_depth)
          rastState->depthFormat = swr_resource(zb->texture)->swr.format;
@@ -1168,12 +1166,12 @@
       rastState->cullDistanceMask =
          ctx->vs->info.base.culldist_writemask << ctx->vs->info.base.num_written_clipdistance;
 
-      SwrSetRastState(ctx->swrContext, rastState);
+      ctx->api.pfnSwrSetRastState(ctx->swrContext, rastState);
    }
 
    /* Scissor */
    if (ctx->dirty & SWR_NEW_SCISSOR) {
-      SwrSetScissorRects(ctx->swrContext, 1, &ctx->swr_scissor);
+      ctx->api.pfnSwrSetScissorRects(ctx->swrContext, 1, &ctx->swr_scissor);
    }
 
    /* Viewport */
@@ -1213,38 +1211,22 @@
       vp->width = std::min(vp->width, (float)fb->width - vp->x);
       vp->height = std::min(vp->height, (float)fb->height - vp->y);
 
-      SwrSetViewports(ctx->swrContext, 1, vp, vpm);
+      ctx->api.pfnSwrSetViewports(ctx->swrContext, 1, vp, vpm);
    }
 
-   /* Set vertex & index buffers */
-   /* (using draw info if called by swr_draw_vbo) */
-   if (ctx->dirty & SWR_NEW_VERTEX) {
-      uint32_t scratch_total;
-      uint8_t *scratch = NULL;
+   /* Set vertex & index buffers
+    * (using draw info if called by swr_draw_vbo)
+    * If indexed draw, revalidate since index buffer comes from
+    * pipe_draw_info.
+    */
+   if (ctx->dirty & SWR_NEW_VERTEX ||
+      (p_draw_info && p_draw_info->index_size)) {
 
       /* If being called by swr_draw_vbo, copy draw details */
       struct pipe_draw_info info = {0};
       if (p_draw_info)
          info = *p_draw_info;
 
-      /* We must get all the scratch space in one go */
-      scratch_total = 0;
-      for (UINT i = 0; i < ctx->num_vertex_buffers; i++) {
-         struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i];
-
-         if (!vb->user_buffer)
-            continue;
-
-         uint32_t elems, base, size;
-         swr_user_vbuf_range(&info, ctx->velems, vb, i, &elems, &base, &size);
-         scratch_total += AlignUp(size, 4);
-      }
-
-      if (scratch_total) {
-         scratch = (uint8_t *)swr_copy_to_scratch_space(
-               ctx, &ctx->scratch->vertex_buffer, NULL, scratch_total);
-      }
-
       /* vertex buffers */
       SWR_VERTEX_BUFFER_STATE swrVertexBuffers[PIPE_MAX_ATTRIBS];
       for (UINT i = 0; i < ctx->num_vertex_buffers; i++) {
@@ -1254,16 +1236,27 @@
          struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i];
 
          pitch = vb->stride;
-         if (!vb->user_buffer) {
-            /* VBO
-             * size is based on buffer->width0 rather than info.max_index
-             * to prevent having to validate VBO on each draw */
-            size = vb->buffer->width0;
-            elems = size / pitch;
-            partial_inbounds = size % pitch;
-            min_vertex_index = 0;
+         if (!vb->is_user_buffer) {
+            /* VBO */
+            if (!pitch) {
+               /* If pitch=0 (ie vb->stride), buffer contains a single
+                * constant attribute.  Use the stream_pitch which was
+                * calculated during creation of vertex_elements_state for the
+                * size of the attribute. */
+               size = ctx->velems->stream_pitch[i];
+               elems = 1;
+               partial_inbounds = 0;
+               min_vertex_index = 0;
+            } else {
+               /* size is based on buffer->width0 rather than info.max_index
+                * to prevent having to validate VBO on each draw. */
+               size = vb->buffer.resource->width0;
+               elems = size / pitch;
+               partial_inbounds = size % pitch;
+               min_vertex_index = 0;
+            }
 
-            p_data = swr_resource_data(vb->buffer) + vb->buffer_offset;
+            p_data = swr_resource_data(vb->buffer.resource) + vb->buffer_offset;
          } else {
             /* Client buffer
              * client memory is one-time use, re-trigger SWR_NEW_VERTEX to
@@ -1275,13 +1268,20 @@
             partial_inbounds = 0;
             min_vertex_index = info.min_index;
 
-            /* Copy only needed vertices to scratch space */
             size = AlignUp(size, 4);
-            const void *ptr = (const uint8_t *) vb->user_buffer + base;
-            memcpy(scratch, ptr, size);
-            ptr = scratch;
-            scratch += size;
-            p_data = (const uint8_t *)ptr - base;
+            /* If size of client memory copy is too large, don't copy. The
+             * draw will access user-buffer directly and then block.  This is
+             * faster than queuing many large client draws. */
+            if (size >= screen->client_copy_limit) {
+               post_update_dirty_flags |= SWR_LARGE_CLIENT_DRAW;
+               p_data = (const uint8_t *) vb->buffer.user;
+            } else {
+               /* Copy only needed vertices to scratch space */
+               const void *ptr = (const uint8_t *) vb->buffer.user + base;
+               ptr = (uint8_t *)swr_copy_to_scratch_space(
+                     ctx, &ctx->scratch->vertex_buffer, ptr, size);
+               p_data = (const uint8_t *)ptr - base;
+            }
          }
 
          swrVertexBuffers[i] = {0};
@@ -1294,25 +1294,24 @@
          swrVertexBuffers[i].partialInboundsSize = partial_inbounds;
       }
 
-      SwrSetVertexBuffers(
+      ctx->api.pfnSwrSetVertexBuffers(
          ctx->swrContext, ctx->num_vertex_buffers, swrVertexBuffers);
 
       /* index buffer, if required (info passed in by swr_draw_vbo) */
       SWR_FORMAT index_type = R32_UINT; /* Default for non-indexed draws */
-      if (info.indexed) {
+      if (info.index_size) {
          const uint8_t *p_data;
          uint32_t size, pitch;
-         struct pipe_index_buffer *ib = &ctx->index_buffer;
 
-         pitch = ib->index_size ? ib->index_size : sizeof(uint32_t);
+         pitch = info.index_size ? info.index_size : sizeof(uint32_t);
          index_type = swr_convert_index_type(pitch);
 
-         if (!ib->user_buffer) {
+         if (!info.has_user_indices) {
             /* VBO
              * size is based on buffer->width0 rather than info.count
              * to prevent having to validate VBO on each draw */
-            size = ib->buffer->width0;
-            p_data = swr_resource_data(ib->buffer) + ib->offset;
+            size = info.index.resource->width0;
+            p_data = swr_resource_data(info.index.resource);
          } else {
             /* Client buffer
              * client memory is one-time use, re-trigger SWR_NEW_VERTEX to
@@ -1321,20 +1320,27 @@
 
             size = info.count * pitch;
             size = AlignUp(size, 4);
-
-            /* Copy indices to scratch space */
-            const void *ptr = ib->user_buffer;
-            ptr = swr_copy_to_scratch_space(
-               ctx, &ctx->scratch->index_buffer, ptr, size);
-            p_data = (const uint8_t *)ptr;
+            /* If size of client memory copy is too large, don't copy. The
+             * draw will access user-buffer directly and then block.  This is
+             * faster than queuing many large client draws. */
+            if (size >= screen->client_copy_limit) {
+               post_update_dirty_flags |= SWR_LARGE_CLIENT_DRAW;
+               p_data = (const uint8_t *) info.index.user;
+            } else {
+               /* Copy indices to scratch space */
+               const void *ptr = info.index.user;
+               ptr = swr_copy_to_scratch_space(
+                     ctx, &ctx->scratch->index_buffer, ptr, size);
+               p_data = (const uint8_t *)ptr;
+            }
          }
 
          SWR_INDEX_BUFFER_STATE swrIndexBuffer;
-         swrIndexBuffer.format = swr_convert_index_type(ib->index_size);
+         swrIndexBuffer.format = swr_convert_index_type(info.index_size);
          swrIndexBuffer.pIndices = p_data;
          swrIndexBuffer.size = size;
 
-         SwrSetIndexBuffer(ctx->swrContext, &swrIndexBuffer);
+         ctx->api.pfnSwrSetIndexBuffer(ctx->swrContext, &swrIndexBuffer);
       }
 
       struct swr_vertex_element_state *velems = ctx->velems;
@@ -1359,7 +1365,7 @@
          } else {
             func = swr_compile_gs(ctx, key);
          }
-         SwrSetGsFunc(ctx->swrContext, func);
+         ctx->api.pfnSwrSetGsFunc(ctx->swrContext, func);
 
          /* JIT sampler state */
          if (ctx->dirty & SWR_NEW_SAMPLER) {
@@ -1377,11 +1383,11 @@
                                      ctx->swrDC.texturesGS);
          }
 
-         SwrSetGsState(ctx->swrContext, &ctx->gs->gsState);
+         ctx->api.pfnSwrSetGsState(ctx->swrContext, &ctx->gs->gsState);
       } else {
          SWR_GS_STATE state = { 0 };
-         SwrSetGsState(ctx->swrContext, &state);
-         SwrSetGsFunc(ctx->swrContext, NULL);
+         ctx->api.pfnSwrSetGsState(ctx->swrContext, &state);
+         ctx->api.pfnSwrSetGsFunc(ctx->swrContext, NULL);
       }
    }
 
@@ -1400,7 +1406,7 @@
       } else {
          func = swr_compile_vs(ctx, key);
       }
-      SwrSetVertexFunc(ctx->swrContext, func);
+      ctx->api.pfnSwrSetVertexFunc(ctx->swrContext, func);
 
       /* JIT sampler state */
       if (ctx->dirty & SWR_NEW_SAMPLER) {
@@ -1423,7 +1429,9 @@
    /* and points, since we rasterize them as triangles, too */
    /* Has to be before fragment shader, since it sets SWR_NEW_FS */
    if (p_draw_info) {
-      bool new_prim_is_poly = (u_reduced_prim(p_draw_info->mode) == PIPE_PRIM_TRIANGLES);
+      bool new_prim_is_poly =
+         (u_reduced_prim(p_draw_info->mode) == PIPE_PRIM_TRIANGLES) &&
+         (ctx->derived.rastState.fillMode == SWR_FILLMODE_SOLID);
       if (new_prim_is_poly != ctx->poly_stipple.prim_is_poly) {
          ctx->dirty |= SWR_NEW_FS;
          ctx->poly_stipple.prim_is_poly = new_prim_is_poly;
@@ -1454,7 +1462,7 @@
       psState.writesODepth = ctx->fs->info.base.writes_z;
       psState.usesSourceDepth = ctx->fs->info.base.reads_z;
       psState.shadingRate = SWR_SHADING_RATE_PIXEL;
-      psState.numRenderTargets = ctx->framebuffer.nr_cbufs;
+      psState.renderTargetMask = (1 << ctx->framebuffer.nr_cbufs) - 1;
       psState.posOffset = SWR_PS_POSITION_SAMPLE_NONE;
       uint32_t barycentricsMask = 0;
 #if 0
@@ -1486,7 +1494,7 @@
       psState.barycentricsMask = barycentricsMask;
       psState.usesUAV = false; // XXX
       psState.forceEarlyZ = false;
-      SwrSetPixelShaderState(ctx->swrContext, &psState);
+      ctx->api.pfnSwrSetPixelShaderState(ctx->swrContext, &psState);
 
       /* JIT sampler state */
       if (ctx->dirty & (SWR_NEW_SAMPLER |
@@ -1575,12 +1583,12 @@
       depthStencilState.depthTestEnable = depth->enabled;
       depthStencilState.depthTestFunc = swr_convert_depth_func(depth->func);
       depthStencilState.depthWriteEnable = depth->writemask;
-      SwrSetDepthStencilState(ctx->swrContext, &depthStencilState);
+      ctx->api.pfnSwrSetDepthStencilState(ctx->swrContext, &depthStencilState);
 
       depthBoundsState.depthBoundsTestEnable = depth->bounds_test;
       depthBoundsState.depthBoundsTestMinValue = depth->bounds_min;
       depthBoundsState.depthBoundsTestMaxValue = depth->bounds_max;
-      SwrSetDepthBoundsState(ctx->swrContext, &depthBoundsState);
+      ctx->api.pfnSwrSetDepthBoundsState(ctx->swrContext, &depthBoundsState);
    }
 
    /* Blend State */
@@ -1609,7 +1617,7 @@
          blendState.renderTarget[0].writeDisableGreen = 1;
          blendState.renderTarget[0].writeDisableBlue = 1;
          blendState.renderTarget[0].writeDisableAlpha = 1;
-         SwrSetBlendFunc(ctx->swrContext, 0, NULL);
+         ctx->api.pfnSwrSetBlendFunc(ctx->swrContext, 0, NULL);
       }
       else
          for (int target = 0;
@@ -1641,7 +1649,7 @@
             if (compileState.blendState.blendEnable == false &&
                 compileState.blendState.logicOpEnable == false &&
                 ctx->depth_stencil->alpha.enabled == 0) {
-               SwrSetBlendFunc(ctx->swrContext, target, NULL);
+               ctx->api.pfnSwrSetBlendFunc(ctx->swrContext, target, NULL);
                continue;
             }
 
@@ -1677,10 +1685,10 @@
 
                ctx->blendJIT->insert(std::make_pair(compileState, func));
             }
-            SwrSetBlendFunc(ctx->swrContext, target, func);
+            ctx->api.pfnSwrSetBlendFunc(ctx->swrContext, target, func);
          }
 
-      SwrSetBlendState(ctx->swrContext, &blendState);
+      ctx->api.pfnSwrSetBlendState(ctx->swrContext, &blendState);
    }
 
    if (ctx->dirty & SWR_NEW_STIPPLE) {
@@ -1690,7 +1698,7 @@
    if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_SO | SWR_NEW_RASTERIZER)) {
       ctx->vs->soState.rasterizerDisable =
          ctx->rasterizer->rasterizer_discard;
-      SwrSetSoState(ctx->swrContext, &ctx->vs->soState);
+      ctx->api.pfnSwrSetSoState(ctx->swrContext, &ctx->vs->soState);
 
       pipe_stream_output_info *stream_output = &ctx->vs->pipe.stream_output;
 
@@ -1706,7 +1714,7 @@
          buffer.pitch = stream_output->stride[i];
          buffer.streamOffset = 0;
 
-         SwrSetSoBuffers(ctx->swrContext, &buffer, i);
+         ctx->api.pfnSwrSetSoBuffers(ctx->swrContext, &buffer, i);
       }
    }
 
@@ -1724,9 +1732,24 @@
 
    // set up backend state
    SWR_BACKEND_STATE backendState = {0};
-   backendState.numAttributes =
-      ((ctx->gs ? ctx->gs->info.base.num_outputs : ctx->vs->info.base.num_outputs) - 1) +
-      (ctx->rasterizer->sprite_coord_enable ? 1 : 0);
+   if (ctx->gs) {
+      backendState.numAttributes = ctx->gs->info.base.num_outputs - 1;
+   } else {
+      backendState.numAttributes = ctx->vs->info.base.num_outputs - 1;
+      if (ctx->fs->info.base.uses_primid) {
+         backendState.numAttributes++;
+         backendState.swizzleEnable = true;
+         for (unsigned i = 0; i < sizeof(backendState.numComponents); i++) {
+            backendState.swizzleMap[i].sourceAttrib = i;
+         }
+         backendState.swizzleMap[ctx->vs->info.base.num_outputs - 1].constantSource =
+            SWR_CONSTANT_SOURCE_PRIM_ID;
+         backendState.swizzleMap[ctx->vs->info.base.num_outputs - 1].componentOverrideMask = 1;
+      }
+   }
+   if (ctx->rasterizer->sprite_coord_enable)
+      backendState.numAttributes++;
+
    backendState.numAttributes = std::min((size_t)backendState.numAttributes,
                                          sizeof(backendState.numComponents));
    for (unsigned i = 0; i < backendState.numAttributes; i++)
@@ -1735,7 +1758,15 @@
       (ctx->rasterizer->flatshade ? ctx->fs->flatConstantMask : 0);
    backendState.pointSpriteTexCoordMask = ctx->fs->pointSpriteMask;
 
-   SwrSetBackendState(ctx->swrContext, &backendState);
+   struct tgsi_shader_info *pLastFE =
+      ctx->gs ?
+      &ctx->gs->info.base :
+      &ctx->vs->info.base;
+   backendState.readRenderTargetArrayIndex = pLastFE->writes_layer;
+   backendState.readViewportArrayIndex = pLastFE->writes_viewport_index;
+   backendState.vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize
+
+   ctx->api.pfnSwrSetBackendState(ctx->swrContext, &backendState);
 
    /* Ensure that any in-progress attachment change StoreTiles finish */
    if (swr_is_fence_pending(screen->flush_fence))
@@ -1846,7 +1877,6 @@
    pipe->delete_vertex_elements_state = swr_delete_vertex_elements_state;
 
    pipe->set_vertex_buffers = swr_set_vertex_buffers;
-   pipe->set_index_buffer = swr_set_index_buffer;
 
    pipe->set_polygon_stipple = swr_set_polygon_stipple;
    pipe->set_clip_state = swr_set_clip_state;
diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h
index 9a8c4e1..7940a96 100644
--- a/src/gallium/drivers/swr/swr_state.h
+++ b/src/gallium/drivers/swr/swr_state.h
@@ -376,4 +376,24 @@
       return TOP_UNKNOWN;
    }
 };
+
+/*
+ * convert mesa PIPE_POLYGON_MODE_X to SWR enum SWR_FILLMODE
+ */
+static INLINE enum SWR_FILLMODE
+swr_convert_fill_mode(const unsigned mode)
+{
+   /* Map each gallium polygon mode onto its SWR fill-mode counterpart. */
+   if (mode == PIPE_POLYGON_MODE_FILL)
+      return SWR_FILLMODE_SOLID;
+   if (mode == PIPE_POLYGON_MODE_LINE)
+      return SWR_FILLMODE_WIREFRAME;
+   if (mode == PIPE_POLYGON_MODE_POINT)
+      return SWR_FILLMODE_POINT;
+
+   /* Unrecognized mode: trip the assert, otherwise fall back to solid. */
+   assert(0 && "Unknown fillmode");
+   return SWR_FILLMODE_SOLID; // at least do something sensible
+}
+
 #endif
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index eafee7f..6d918d4 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -1034,24 +1034,6 @@
 }
 
 
-static void
-trace_context_set_index_buffer(struct pipe_context *_pipe,
-                               const struct pipe_index_buffer *ib)
-{
-   struct trace_context *tr_ctx = trace_context(_pipe);
-   struct pipe_context *pipe = tr_ctx->pipe;
-
-   trace_dump_call_begin("pipe_context", "set_index_buffer");
-
-   trace_dump_arg(ptr, pipe);
-   trace_dump_arg(index_buffer, ib);
-
-   pipe->set_index_buffer(pipe, ib);
-
-   trace_dump_call_end();
-}
-
-
 static struct pipe_stream_output_target *
 trace_context_create_stream_output_target(struct pipe_context *_pipe,
                                           struct pipe_resource *res,
@@ -1722,6 +1704,112 @@
    trace_dump_call_end();
 }
 
+static uint64_t trace_context_create_texture_handle(struct pipe_context *_pipe,
+                                                    struct pipe_sampler_view *view,
+                                                    const struct pipe_sampler_state *state)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+   uint64_t handle;
+   /* Record the call, then forward it to the wrapped context (tr_ctx->pipe). */
+   trace_dump_call_begin("pipe_context", "create_texture_handle");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, view);
+   trace_dump_arg_begin("state"); /* explicit begin/end, so dump the struct directly */
+   trace_dump_sampler_state(state);
+   trace_dump_arg_end();
+
+   handle = pipe->create_texture_handle(pipe, view, state);
+   /* The returned handle is dumped as the call's return value. */
+   trace_dump_ret(uint, handle);
+   trace_dump_call_end();
+
+   return handle;
+}
+
+static void trace_context_delete_texture_handle(struct pipe_context *_pipe,
+                                                uint64_t handle)
+{
+   /* Fetch the context that the traced call is forwarded to. */
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "delete_texture_handle");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, handle);
+   trace_dump_call_end();
+   /* The call record is closed above; the forwarded call follows it. */
+   pipe->delete_texture_handle(pipe, handle);
+}
+
+static void trace_context_make_texture_handle_resident(struct pipe_context *_pipe,
+                                                       uint64_t handle,
+                                                       bool resident)
+{
+   /* Fetch the context that the traced call is forwarded to. */
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "make_texture_handle_resident");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, handle);
+   trace_dump_arg(bool, resident);
+   trace_dump_call_end();
+   /* The call record is closed above; the forwarded call follows it. */
+   pipe->make_texture_handle_resident(pipe, handle, resident);
+}
+
+static uint64_t trace_context_create_image_handle(struct pipe_context *_pipe,
+                                                  const struct pipe_image_view *image)
+{
+   /* Fetch the context that the traced call is forwarded to. */
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   uint64_t handle;
+
+   trace_dump_call_begin("pipe_context", "create_image_handle");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg_begin("image");
+   trace_dump_image_view(image);
+   trace_dump_arg_end();
+
+   handle = pipe->create_image_handle(pipe, image);
+   /* The returned handle is dumped as the call's return value. */
+   trace_dump_ret(uint, handle);
+   trace_dump_call_end();
+
+   return handle;
+}
+
+static void trace_context_delete_image_handle(struct pipe_context *_pipe,
+                                              uint64_t handle)
+{
+   /* Fetch the context that the traced call is forwarded to. */
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "delete_image_handle");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, handle);
+   trace_dump_call_end();
+   /* The call record is closed above; the forwarded call follows it. */
+   pipe->delete_image_handle(pipe, handle);
+}
+
+static void trace_context_make_image_handle_resident(struct pipe_context *_pipe,
+                                                    uint64_t handle,
+                                                    unsigned access,
+                                                    bool resident)
+{
+   /* Fetch the context that the traced call is forwarded to. */
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "make_image_handle_resident");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, handle);
+   trace_dump_arg(uint, access);
+   trace_dump_arg(bool, resident);
+   trace_dump_call_end();
+   /* The call record is closed above; the forwarded call follows it. */
+   pipe->make_image_handle_resident(pipe, handle, access, resident);
+}
+
 struct pipe_context *
 trace_context_create(struct trace_screen *tr_scr,
                      struct pipe_context *pipe)
@@ -1804,7 +1892,6 @@
    TR_CTX_INIT(create_surface);
    TR_CTX_INIT(surface_destroy);
    TR_CTX_INIT(set_vertex_buffers);
-   TR_CTX_INIT(set_index_buffer);
    TR_CTX_INIT(create_stream_output_target);
    TR_CTX_INIT(stream_output_target_destroy);
    TR_CTX_INIT(set_stream_output_targets);
@@ -1824,6 +1911,12 @@
    TR_CTX_INIT(set_shader_buffers);
    TR_CTX_INIT(launch_grid);
    TR_CTX_INIT(set_shader_images);
+   TR_CTX_INIT(create_texture_handle);
+   TR_CTX_INIT(delete_texture_handle);
+   TR_CTX_INIT(make_texture_handle_resident);
+   TR_CTX_INIT(create_image_handle);
+   TR_CTX_INIT(delete_image_handle);
+   TR_CTX_INIT(make_image_handle_resident);
 
    TR_CTX_INIT(transfer_map);
    TR_CTX_INIT(transfer_unmap);
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index 4c6f6d6..41f7faf 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -653,30 +653,9 @@
    trace_dump_struct_begin("pipe_vertex_buffer");
 
    trace_dump_member(uint, state, stride);
+   trace_dump_member(bool, state, is_user_buffer);
    trace_dump_member(uint, state, buffer_offset);
-   trace_dump_member(ptr, state, buffer);
-   trace_dump_member(ptr, state, user_buffer);
-
-   trace_dump_struct_end();
-}
-
-
-void trace_dump_index_buffer(const struct pipe_index_buffer *state)
-{
-   if (!trace_dumping_enabled_locked())
-      return;
-
-   if (!state) {
-      trace_dump_null();
-      return;
-   }
-
-   trace_dump_struct_begin("pipe_index_buffer");
-
-   trace_dump_member(uint, state, index_size);
-   trace_dump_member(uint, state, offset);
-   trace_dump_member(ptr, state, buffer);
-   trace_dump_member(ptr, state, user_buffer);
+   trace_dump_member(ptr, state, buffer.resource);
 
    trace_dump_struct_end();
 }
@@ -792,7 +771,8 @@
 
    trace_dump_struct_begin("pipe_draw_info");
 
-   trace_dump_member(bool, state, indexed);
+   trace_dump_member(uint, state, index_size);
+   trace_dump_member(uint, state, has_user_indices);
 
    trace_dump_member(uint, state, mode);
    trace_dump_member(uint, state, start);
@@ -810,10 +790,19 @@
    trace_dump_member(bool, state, primitive_restart);
    trace_dump_member(uint, state, restart_index);
 
+   trace_dump_member(ptr, state, index.resource);
    trace_dump_member(ptr, state, count_from_stream_output);
 
-   trace_dump_member(ptr, state, indirect);
-   trace_dump_member(uint, state, indirect_offset);
+   if (!state->indirect) {
+      trace_dump_member(ptr, state, indirect);
+   } else {
+      trace_dump_member(uint, state, indirect->offset);
+      trace_dump_member(uint, state, indirect->stride);
+      trace_dump_member(uint, state, indirect->draw_count);
+      trace_dump_member(uint, state, indirect->indirect_draw_count_offset);
+      trace_dump_member(ptr, state, indirect->buffer);
+      trace_dump_member(ptr, state, indirect->indirect_draw_count);
+   }
 
    trace_dump_struct_end();
 }
diff --git a/src/gallium/drivers/trace/tr_dump_state.h b/src/gallium/drivers/trace/tr_dump_state.h
index fd2bc50..baff025 100644
--- a/src/gallium/drivers/trace/tr_dump_state.h
+++ b/src/gallium/drivers/trace/tr_dump_state.h
@@ -74,8 +74,6 @@
 
 void trace_dump_vertex_buffer(const struct pipe_vertex_buffer *state);
 
-void trace_dump_index_buffer(const struct pipe_index_buffer *state);
-
 void trace_dump_vertex_element(const struct pipe_vertex_element *state);
 
 void trace_dump_constant_buffer(const struct pipe_constant_buffer *state);
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index 0fa8d0f..e56434c 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -391,13 +391,10 @@
    struct trace_screen *tr_scr = trace_screen(_screen);
    struct pipe_screen *screen = tr_scr->screen;
 
-   trace_dump_call_begin("pipe_screen", "resource_destroy");
-
-   trace_dump_arg(ptr, screen);
-   trace_dump_arg(ptr, resource);
-
-   trace_dump_call_end();
-
+   /* Don't trace this, because due to the lack of pipe_resource wrapping,
+    * we can get this call from inside of driver calls, which would try
+    * to lock an already-locked mutex.
+    */
    screen->resource_destroy(screen, resource);
 }
 
diff --git a/src/gallium/drivers/vc4/Android.mk b/src/gallium/drivers/vc4/Android.mk
index fdc0674..34b957a 100644
--- a/src/gallium/drivers/vc4/Android.mk
+++ b/src/gallium/drivers/vc4/Android.mk
@@ -25,16 +25,28 @@
 
 include $(CLEAR_VARS)
 
-LOCAL_CFLAGS_arm := -DVC4_BUILD_NEON
-
 LOCAL_SRC_FILES := \
 	$(C_SOURCES)
 
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_SRC_FILES += $(NEON_C_SOURCES)
+endif
+
 LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H)
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/include/drm-uapi
 
 # We need libmesa_nir to get NIR's generated include directories.
-LOCAL_STATIC_LIBRARIES := libmesa_nir
+LOCAL_STATIC_LIBRARIES := \
+	libmesa_nir \
+	libmesa_broadcom_genxml
+
 LOCAL_MODULE := libmesa_pipe_vc4
 
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_VC4),)
+GALLIUM_TARGET_DRIVERS += vc4
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_vc4)
+endif
diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index b361a0c..46f3577 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -28,8 +28,8 @@
 
 AM_CFLAGS = \
 	-I$(top_builddir)/src/compiler/nir \
+	-I$(top_srcdir)/include/drm-uapi \
 	$(LIBDRM_CFLAGS) \
-	$(VC4_CFLAGS) \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(SIM_CFLAGS) \
 	$(VALGRIND_CFLAGS) \
@@ -38,13 +38,15 @@
 noinst_LTLIBRARIES = libvc4.la
 
 libvc4_la_SOURCES = $(C_SOURCES)
-libvc4_la_LIBADD = $(SIM_LIB) $(VC4_LIBS)
-libvc4_la_LDFLAGS = $(SIM_LDFLAGS)
+libvc4_la_LIBADD = $(SIM_LIB)
 
+if HAVE_ARM_ASM
 noinst_LTLIBRARIES += libvc4_neon.la
 libvc4_la_LIBADD += libvc4_neon.la
+libvc4_neon_la_SOURCES = $(NEON_C_SOURCES)
+libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -mfpu=neon
+endif
 
-libvc4_neon_la_SOURCES = vc4_tiling_lt.c
-libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -DVC4_BUILD_NEON
+libvc4_la_LDFLAGS = $(SIM_LDFLAGS)
 
 EXTRA_DIST = kernel/README
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 10de343..76dea70 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -10,6 +10,7 @@
 	vc4_bufmgr.h \
 	vc4_cl.c \
 	vc4_cl_dump.c \
+	vc4_cl_dump.h \
 	vc4_cl.h \
 	vc4_context.c \
 	vc4_context.h \
@@ -59,3 +60,5 @@
 	vc4_tiling.h \
 	vc4_uniforms.c \
 	$()
+
+NEON_C_SOURCES := vc4_tiling_lt_neon.c
diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
index 1e05656..0e4ab5b 100644
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -212,14 +212,16 @@
         if (vc4_tile_blit(pctx, blit_info))
                 return;
 
-        if (util_try_blit_via_copy_region(pctx, &info)) {
-                return; /* done */
-        }
-
         if (info.mask & PIPE_MASK_S) {
-                fprintf(stderr, "cannot blit stencil, skipping\n");
+                if (util_try_blit_via_copy_region(pctx, &info))
+                        return;
+
                 info.mask &= ~PIPE_MASK_S;
+                fprintf(stderr, "cannot blit stencil, skipping\n");
         }
 
-        vc4_render_blit(pctx, &info);
+        if (vc4_render_blit(pctx, &info))
+                return;
+
+        fprintf(stderr, "Unsupported blit\n");
 }
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 12af7f8..0653f88 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -123,6 +123,8 @@
 struct vc4_bo *
 vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
 {
+        bool cleared_and_retried = false;
+        struct drm_vc4_create_bo create;
         struct vc4_bo *bo;
         int ret;
 
@@ -149,12 +151,8 @@
         bo->private = true;
 
  retry:
-        ;
-
-        bool cleared_and_retried = false;
-        struct drm_vc4_create_bo create = {
-                .size = size
-        };
+        memset(&create, 0, sizeof(create));
+        create.size = size;
 
         ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_CREATE_BO, &create);
         bo->handle = create.handle;
diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c
index 3557837..508281a 100644
--- a/src/gallium/drivers/vc4/vc4_cl.c
+++ b/src/gallium/drivers/vc4/vc4_cl.c
@@ -26,11 +26,12 @@
 #include "vc4_context.h"
 
 void
-vc4_init_cl(void *mem_ctx, struct vc4_cl *cl)
+vc4_init_cl(struct vc4_job *job, struct vc4_cl *cl)
 {
-        cl->base = rzalloc_size(mem_ctx, 1); /* TODO: don't use rzalloc */
+        cl->base = rzalloc_size(job, 1); /* TODO: don't use rzalloc */
         cl->next = cl->base;
         cl->size = 0;
+        cl->job = job;
 }
 
 void
diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 74bf8cf..8df9dbf 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -29,10 +29,9 @@
 #include "util/u_math.h"
 #include "util/macros.h"
 
-#include "kernel/vc4_packet.h"
-
 struct vc4_bo;
 struct vc4_job;
+struct vc4_cl;
 
 /**
  * Undefined structure, used for typechecking that you're passing the pointers
@@ -40,19 +39,35 @@
  */
 struct vc4_cl_out;
 
+/** A reference to a BO used in the CL packing functions */
+struct vc4_cl_reloc {
+        struct vc4_bo *bo;
+        uint32_t offset;
+};
+
+static inline void cl_pack_emit_reloc(struct vc4_cl *cl, const struct vc4_cl_reloc *);
+
+#define __gen_user_data struct vc4_cl
+#define __gen_address_type struct vc4_cl_reloc
+#define __gen_address_offset(reloc) ((reloc)->offset)
+#define __gen_emit_reloc cl_pack_emit_reloc
+
+#include "kernel/vc4_packet.h"
+#include "broadcom/cle/v3d_packet_v21_pack.h"
+
 struct vc4_cl {
         void *base;
+        struct vc4_job *job;
         struct vc4_cl_out *next;
         struct vc4_cl_out *reloc_next;
         uint32_t size;
-#ifdef DEBUG
+#ifndef NDEBUG
         uint32_t reloc_count;
 #endif
 };
 
-void vc4_init_cl(void *mem_ctx, struct vc4_cl *cl);
+void vc4_init_cl(struct vc4_job *job, struct vc4_cl *cl);
 void vc4_reset_cl(struct vc4_cl *cl);
-void vc4_dump_cl(void *cl, uint32_t size, bool is_render);
 uint32_t vc4_gem_hindex(struct vc4_job *job, struct vc4_bo *bo);
 
 struct PACKED unaligned_16 { uint16_t x; };
@@ -148,8 +163,8 @@
 cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n)
 {
         assert(n == 1 || n == 2);
-#ifdef DEBUG
         assert(cl->reloc_count == 0);
+#ifndef NDEBUG
         cl->reloc_count = n;
 #endif
 
@@ -162,8 +177,8 @@
 static inline struct vc4_cl_out *
 cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n)
 {
-#ifdef DEBUG
         assert(cl->reloc_count == 0);
+#ifndef NDEBUG
         cl->reloc_count = n;
 #endif
         cl->reloc_next = cl->next;
@@ -181,7 +196,7 @@
         *(uint32_t *)cl->reloc_next = vc4_gem_hindex(job, bo);
         cl_advance(&cl->reloc_next, 4);
 
-#ifdef DEBUG
+#ifndef NDEBUG
         cl->reloc_count--;
 #endif
 
@@ -196,13 +211,95 @@
         *(uint32_t *)cl->reloc_next = vc4_gem_hindex(job, bo);
         cl_advance(&cl->reloc_next, 4);
 
-#ifdef DEBUG
+#ifndef NDEBUG
         cl->reloc_count--;
 #endif
 
         cl_aligned_u32(cl_out, offset);
 }
 
+/**
+ * Reference to a BO with its associated offset, used in the pack process.
+ */
+static inline struct vc4_cl_reloc
+cl_address(struct vc4_bo *bo, uint32_t offset)
+{
+        struct vc4_cl_reloc reloc = {
+                .bo = bo,
+                .offset = offset,
+        };
+        return reloc;
+}
+
 void cl_ensure_space(struct vc4_cl *cl, uint32_t size);
 
+#define cl_packet_header(packet) V3D21_ ## packet ## _header
+#define cl_packet_length(packet) V3D21_ ## packet ## _length
+#define cl_packet_pack(packet)   V3D21_ ## packet ## _pack
+#define cl_packet_struct(packet)   V3D21_ ## packet
+
+static inline void *
+cl_get_emit_space(struct vc4_cl_out **cl, size_t size)
+{
+        void *addr = *cl;
+        cl_advance(cl, size);
+        return addr;
+}
+
+/* Macro for setting up an emit of a CL struct.  A temporary unpacked struct
+ * is created, whose fields you set in the body that follows, e.g.:
+ *
+ * cl_emit(bcl, FLAT_SHADE_FLAGS, flags) {
+ *     flags.flat_shade_flags = 1 << 2;
+ * }
+ *
+ * or default values only can be emitted with just:
+ *
+ * cl_emit(bcl, FLAT_SHADE_FLAGS, flags);
+ *
+ * The trick here is that we make a for loop that will execute the body
+ * (either the block or the ';' after the macro invocation) exactly once.
+ * The struct is packed into cl_packet_length() bytes of the CL by the loop's
+ * increment expression, i.e. only after the body has filled in its fields.
+ */
+#define cl_emit(cl, packet, name)                                \
+        for (struct cl_packet_struct(packet) name = {            \
+                cl_packet_header(packet)                         \
+        },                                                       \
+        *_loop_terminate = &name;                                \
+        __builtin_expect(_loop_terminate != NULL, 1);            \
+        ({                                                       \
+                struct vc4_cl_out *cl_out = cl_start(cl);        \
+                cl_packet_pack(packet)(cl, (uint8_t *)cl_out, &name); \
+                VG(VALGRIND_CHECK_MEM_IS_DEFINED(cl_out,         \
+                                                 cl_packet_length(packet))); \
+                cl_advance(&cl_out, cl_packet_length(packet));   \
+                cl_end(cl, cl_out);                              \
+                _loop_terminate = NULL;                          \
+        }))                                                      \
+
+#define cl_emit_prepacked(cl, packet) do {                       \
+        memcpy((cl)->next, (packet), sizeof(*(packet)));         \
+        cl_advance(&(cl)->next, sizeof(*(packet)));              \
+} while (0)
+
+/**
+ * Helper function called by the XML-generated pack functions for filling in
+ * an address field in shader records.
+ *
+ * Relocations for shader recs and texturing involve the packet (or uniforms
+ * stream) being preceded by the handles to the BOs, and the offset within the
+ * BO being in the stream (the output of this function).
+ */
+static inline void
+cl_pack_emit_reloc(struct vc4_cl *cl, const struct vc4_cl_reloc *reloc)
+{
+        *(uint32_t *)cl->reloc_next = vc4_gem_hindex(cl->job, reloc->bo);
+        cl_advance(&cl->reloc_next, 4);
+
+#ifndef NDEBUG
+        cl->reloc_count--;
+#endif
+}
+
 #endif /* VC4_CL_H */
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index a719f27..b14cf38 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -24,7 +24,16 @@
 #include "util/u_math.h"
 #include "util/u_prim.h"
 #include "util/macros.h"
-#include "vc4_context.h"
+#include "vc4_cl_dump.h"
+#include "kernel/vc4_packet.h"
+
+#define __gen_user_data void
+#define __gen_address_type uint32_t
+#define __gen_address_offset(reloc) (*reloc)
+#define __gen_emit_reloc(cl, reloc)
+#define __gen_unpack_address __gen_unpack_uint
+
+#include "broadcom/cle/v3d_packet_v21_pack.h"
 
 #define dump_VC4_PACKET_LINE_WIDTH dump_float
 #define dump_VC4_PACKET_POINT_SIZE dump_float
@@ -187,15 +196,16 @@
 dump_VC4_PACKET_GL_ARRAY_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offset)
 {
         uint8_t *b = cl + offset;
-        uint32_t *count = cl + offset + 1;
-        uint32_t *start = cl + offset + 5;
+
+        struct V3D21_VERTEX_ARRAY_PRIMITIVES values;
+        V3D21_VERTEX_ARRAY_PRIMITIVES_unpack(cl + offset - 1, &values);
 
         fprintf(stderr, "0x%08x 0x%08x:      0x%02x %s\n",
-                offset, hw_offset, b[0], u_prim_name(b[0] & 0x7));
+                offset, hw_offset, b[0], u_prim_name(values.primitive_mode));
         fprintf(stderr, "0x%08x 0x%08x:      %d verts\n",
-                offset + 1, hw_offset + 1, *count);
+                offset + 1, hw_offset + 1, values.length);
         fprintf(stderr, "0x%08x 0x%08x:      0x%08x start\n",
-                offset + 5, hw_offset + 5, *start);
+                offset + 5, hw_offset + 5, values.index_of_first_vertex);
 }
 
 static void
@@ -223,10 +233,15 @@
 {
         uint32_t *scale = cl + offset;
 
+        struct V3D21_CLIPPER_XY_SCALING values;
+        V3D21_CLIPPER_XY_SCALING_unpack(cl + offset - 1, &values);
+
         fprintf(stderr, "0x%08x 0x%08x:      %f, %f (%f, %f, 0x%08x, 0x%08x)\n",
                 offset, hw_offset,
-                uif(scale[0]) / 16.0, uif(scale[1]) / 16.0,
-                uif(scale[0]), uif(scale[1]),
+                values.viewport_half_width_in_1_16th_of_pixel / 16.0,
+                values.viewport_half_height_in_1_16th_of_pixel / 16.0,
+                values.viewport_half_width_in_1_16th_of_pixel,
+                values.viewport_half_height_in_1_16th_of_pixel,
                 scale[0], scale[1]);
 }
 
@@ -236,9 +251,13 @@
         uint32_t *translate = cl + offset;
         uint32_t *scale = cl + offset + 8;
 
+        struct V3D21_CLIPPER_Z_SCALE_AND_OFFSET values;
+        V3D21_CLIPPER_Z_SCALE_AND_OFFSET_unpack(cl + offset - 1, &values);
+
         fprintf(stderr, "0x%08x 0x%08x:      %f, %f (0x%08x, 0x%08x)\n",
                 offset, hw_offset,
-                uif(translate[0]), uif(translate[1]),
+                values.viewport_z_scale_zc_to_zs,
+                values.viewport_z_offset_zc_to_zs,
                 translate[0], translate[1]);
 
         fprintf(stderr, "0x%08x 0x%08x:      %f, %f (0x%08x, 0x%08x)\n",
@@ -250,28 +269,26 @@
 static void
 dump_VC4_PACKET_TILE_BINNING_MODE_CONFIG(void *cl, uint32_t offset, uint32_t hw_offset)
 {
-        uint32_t *tile_alloc_addr = cl + offset;
-        uint32_t *tile_alloc_size = cl + offset + 4;
-        uint32_t *tile_state_addr = cl + offset + 8;
-        uint8_t *bin_x = cl + offset + 12;
-        uint8_t *bin_y = cl + offset + 13;
         uint8_t *flags = cl + offset + 14;
 
+        struct V3D21_TILE_BINNING_MODE_CONFIGURATION values;
+        V3D21_TILE_BINNING_MODE_CONFIGURATION_unpack(cl + offset - 1, &values);
+
         fprintf(stderr, "0x%08x 0x%08x:       tile alloc addr 0x%08x\n",
                 offset, hw_offset,
-                *tile_alloc_addr);
+                values.tile_allocation_memory_address);
 
         fprintf(stderr, "0x%08x 0x%08x:       tile alloc size %db\n",
                 offset + 4, hw_offset + 4,
-                *tile_alloc_size);
+                values.tile_allocation_memory_size);
 
         fprintf(stderr, "0x%08x 0x%08x:       tile state addr 0x%08x\n",
                 offset + 8, hw_offset + 8,
-                *tile_state_addr);
+                values.tile_state_data_array_address);
 
         fprintf(stderr, "0x%08x 0x%08x:       tiles (%d, %d)\n",
                 offset + 12, hw_offset + 12,
-                *bin_x, *bin_y);
+                values.width_in_tiles, values.height_in_tiles);
 
         fprintf(stderr, "0x%08x 0x%08x:       flags 0x%02x\n",
                 offset + 14, hw_offset + 14,
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.h b/src/gallium/drivers/vc4/vc4_cl_dump.h
new file mode 100644
index 0000000..760ab8d
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC4_CL_DUMP_H
+#define VC4_CL_DUMP_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+void vc4_dump_cl(void *cl, uint32_t size, bool is_render);
+
+#endif
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 6bd2424..99ec7e5 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -67,7 +67,7 @@
 #define VC4_DIRTY_CONSTBUF      (1 << 13)
 #define VC4_DIRTY_VTXSTATE      (1 << 14)
 #define VC4_DIRTY_VTXBUF        (1 << 15)
-#define VC4_DIRTY_INDEXBUF      (1 << 16)
+
 #define VC4_DIRTY_SCISSOR       (1 << 17)
 #define VC4_DIRTY_FLAT_SHADE_FLAGS (1 << 18)
 #define VC4_DIRTY_PRIM_MODE     (1 << 19)
@@ -84,6 +84,13 @@
         uint32_t texture_p0;
         uint32_t texture_p1;
         bool force_first_level;
+        /**
+         * Resource containing the actual texture that will be sampled.
+         *
+         * We may need to rebase the .base.texture resource to work around the
+         * lack of GL_TEXTURE_BASE_LEVEL, or to upload the texture as tiled.
+         */
+        struct pipe_resource *texture;
 };
 
 struct vc4_sampler_state {
@@ -377,7 +384,6 @@
         struct pipe_viewport_state viewport;
         struct vc4_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
         struct vc4_vertexbuf_stateobj vertexbuf;
-        struct pipe_index_buffer indexbuf;
         /** @} */
 };
 
@@ -385,27 +391,20 @@
         struct pipe_rasterizer_state base;
 
         /* VC4_CONFIGURATION_BITS */
-        uint8_t config_bits[3];
+        uint8_t config_bits[V3D21_CONFIGURATION_BITS_length];
 
-        float point_size;
-
-        /**
-         * Half-float (1/8/7 bits) value of polygon offset units for
-         * VC4_PACKET_DEPTH_OFFSET
-         */
-        uint16_t offset_units;
-        /**
-         * Half-float (1/8/7 bits) value of polygon offset scale for
-         * VC4_PACKET_DEPTH_OFFSET
-         */
-        uint16_t offset_factor;
+        struct PACKED {
+                uint8_t depth_offset[V3D21_DEPTH_OFFSET_length];
+                uint8_t point_size[V3D21_POINT_SIZE_length];
+                uint8_t line_width[V3D21_LINE_WIDTH_length];
+        } packed;
 };
 
 struct vc4_depth_stencil_alpha_state {
         struct pipe_depth_stencil_alpha_state base;
 
         /* VC4_CONFIGURATION_BITS */
-        uint8_t config_bits[3];
+        uint8_t config_bits[V3D21_CONFIGURATION_BITS_length];
 
         /** Uniforms for stencil state.
          *
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index ebd0802..8000697 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -81,39 +81,32 @@
 
         vc4_get_draw_cl_space(job, 0);
 
-        struct vc4_cl_out *bcl = cl_start(&job->bcl);
-        //   Tile state data is 48 bytes per tile, I think it can be thrown away
-        //   as soon as binning is finished.
-        cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG);
-        cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */
-        cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */
-        cl_u32(&bcl, 0); /* tile state addr, filled by kernel */
-        cl_u8(&bcl, job->draw_tiles_x);
-        cl_u8(&bcl, job->draw_tiles_y);
-        /* Other flags are filled by kernel. */
-        cl_u8(&bcl, job->msaa ? VC4_BIN_CONFIG_MS_MODE_4X : 0);
+        cl_emit(&job->bcl, TILE_BINNING_MODE_CONFIGURATION, bin) {
+                bin.width_in_tiles = job->draw_tiles_x;
+                bin.height_in_tiles = job->draw_tiles_y;
+                bin.multisample_mode_4x = job->msaa;
+        }
 
         /* START_TILE_BINNING resets the statechange counters in the hardware,
          * which are what is used when a primitive is binned to a tile to
          * figure out what new state packets need to be written to that tile's
          * command list.
          */
-        cl_u8(&bcl, VC4_PACKET_START_TILE_BINNING);
+        cl_emit(&job->bcl, START_TILE_BINNING, start);
 
         /* Reset the current compressed primitives format.  This gets modified
          * by VC4_PACKET_GL_INDEXED_PRIMITIVE and
          * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
          * of every tile.
          */
-        cl_u8(&bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT);
-        cl_u8(&bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX |
-                     VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES));
+        cl_emit(&job->bcl, PRIMITIVE_LIST_FORMAT, list) {
+                list.data_type = _16_BIT_INDEX;
+                list.primitive_type = TRIANGLES_LIST;
+        }
 
         job->needs_flush = true;
         job->draw_width = vc4->framebuffer.width;
         job->draw_height = vc4->framebuffer.height;
-
-        cl_end(&job->bcl, bcl);
 }
 
 static void
@@ -123,12 +116,13 @@
         struct vc4_context *vc4 = vc4_context(pctx);
 
         for (int i = 0; i < stage_tex->num_textures; i++) {
-                struct pipe_sampler_view *view = stage_tex->textures[i];
+                struct vc4_sampler_view *view =
+                        vc4_sampler_view(stage_tex->textures[i]);
                 if (!view)
                         continue;
-                struct vc4_resource *rsc = vc4_resource(view->texture);
-                if (rsc->shadow_parent)
-                        vc4_update_shadow_baselevel_texture(pctx, view);
+
+                if (view->texture != view->base.texture)
+                        vc4_update_shadow_baselevel_texture(pctx, &view->base);
 
                 vc4_flush_jobs_writing_resource(vc4, view->texture);
         }
@@ -149,44 +143,49 @@
          * we emit a dummy read.
          */
         uint32_t num_elements_emit = MAX2(vtx->num_elements, 1);
+
         /* Emit the shader record. */
-        struct vc4_cl_out *shader_rec =
-                cl_start_shader_reloc(&job->shader_rec, 3 + num_elements_emit);
-        /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
-        cl_u16(&shader_rec,
-               VC4_SHADER_FLAG_ENABLE_CLIPPING |
-               (vc4->prog.fs->fs_threaded ?
-                0 : VC4_SHADER_FLAG_FS_SINGLE_THREAD) |
-               ((info->mode == PIPE_PRIM_POINTS &&
-                 vc4->rasterizer->base.point_size_per_vertex) ?
-                VC4_SHADER_FLAG_VS_POINT_SIZE : 0));
+        cl_start_shader_reloc(&job->shader_rec, 3 + num_elements_emit);
 
-        /* VC4_DIRTY_COMPILED_FS */
-        cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */
-        cl_u8(&shader_rec, vc4->prog.fs->num_inputs);
-        cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.fs->bo, 0);
-        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
+        cl_emit(&job->shader_rec, SHADER_RECORD, rec) {
+                rec.enable_clipping = true;
 
-        /* VC4_DIRTY_COMPILED_VS */
-        cl_u16(&shader_rec, 0); /* vs num uniforms */
-        cl_u8(&shader_rec, vc4->prog.vs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]);
-        cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.vs->bo, 0);
-        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
+                /* VC4_DIRTY_COMPILED_FS */
+                rec.fragment_shader_is_single_threaded =
+                        !vc4->prog.fs->fs_threaded;
 
-        /* VC4_DIRTY_COMPILED_CS */
-        cl_u16(&shader_rec, 0); /* cs num uniforms */
-        cl_u8(&shader_rec, vc4->prog.cs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]);
-        cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.cs->bo, 0);
-        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
+                /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
+                rec.point_size_included_in_shaded_vertex_data =
+                         (info->mode == PIPE_PRIM_POINTS &&
+                          vc4->rasterizer->base.point_size_per_vertex);
+
+                /* VC4_DIRTY_COMPILED_FS */
+                rec.fragment_shader_number_of_varyings =
+                        vc4->prog.fs->num_inputs;
+                rec.fragment_shader_code_address =
+                        cl_address(vc4->prog.fs->bo, 0);
+
+                rec.coordinate_shader_attribute_array_select_bits =
+                         vc4->prog.cs->vattrs_live;
+                rec.coordinate_shader_total_attributes_size =
+                         vc4->prog.cs->vattr_offsets[8];
+                rec.coordinate_shader_code_address =
+                        cl_address(vc4->prog.cs->bo, 0);
+
+                rec.vertex_shader_attribute_array_select_bits =
+                         vc4->prog.vs->vattrs_live;
+                rec.vertex_shader_total_attributes_size =
+                         vc4->prog.vs->vattr_offsets[8];
+                rec.vertex_shader_code_address =
+                        cl_address(vc4->prog.vs->bo, 0);
+        }
 
         uint32_t max_index = 0xffff;
         for (int i = 0; i < vtx->num_elements; i++) {
                 struct pipe_vertex_element *elem = &vtx->pipe[i];
                 struct pipe_vertex_buffer *vb =
                         &vertexbuf->vb[elem->vertex_buffer_index];
-                struct vc4_resource *rsc = vc4_resource(vb->buffer);
+                struct vc4_resource *rsc = vc4_resource(vb->buffer.resource);
                 /* not vc4->dirty tracked: vc4->last_index_bias */
                 uint32_t offset = (vb->buffer_offset +
                                    elem->src_offset +
@@ -196,11 +195,15 @@
                 uint32_t elem_size =
                         util_format_get_blocksize(elem->src_format);
 
-                cl_reloc(job, &job->shader_rec, &shader_rec, rsc->bo, offset);
-                cl_u8(&shader_rec, elem_size - 1);
-                cl_u8(&shader_rec, vb->stride);
-                cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]);
-                cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]);
+                cl_emit(&job->shader_rec, ATTRIBUTE_RECORD, attr) {
+                        attr.address = cl_address(rsc->bo, offset);
+                        attr.number_of_bytes_minus_1 = elem_size - 1;
+                        attr.stride = vb->stride;
+                        attr.coordinate_shader_vpm_offset =
+                                vc4->prog.cs->vattr_offsets[i];
+                        attr.vertex_shader_vpm_offset =
+                                vc4->prog.vs->vattr_offsets[i];
+                }
 
                 if (vb->stride > 0) {
                         max_index = MIN2(max_index,
@@ -211,24 +214,28 @@
         if (vtx->num_elements == 0) {
                 assert(num_elements_emit == 1);
                 struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO");
-                cl_reloc(job, &job->shader_rec, &shader_rec, bo, 0);
-                cl_u8(&shader_rec, 16 - 1); /* element size */
-                cl_u8(&shader_rec, 0); /* stride */
-                cl_u8(&shader_rec, 0); /* VS VPM offset */
-                cl_u8(&shader_rec, 0); /* CS VPM offset */
-                vc4_bo_unreference(&bo);
-        }
-        cl_end(&job->shader_rec, shader_rec);
 
-        struct vc4_cl_out *bcl = cl_start(&job->bcl);
-        /* the actual draw call. */
-        cl_u8(&bcl, VC4_PACKET_GL_SHADER_STATE);
-        assert(vtx->num_elements <= 8);
-        /* Note that number of attributes == 0 in the packet means 8
-         * attributes.  This field also contains the offset into shader_rec.
-         */
-        cl_u32(&bcl, num_elements_emit & 0x7);
-        cl_end(&job->bcl, bcl);
+                cl_emit(&job->shader_rec, ATTRIBUTE_RECORD, attr) {
+                        attr.address = cl_address(bo, 0);
+                        attr.number_of_bytes_minus_1 = 16 - 1;
+                        attr.stride = 0;
+                        attr.coordinate_shader_vpm_offset = 0;
+                        attr.vertex_shader_vpm_offset = 0;
+                }
+
+                /* Drop our allocation reference so the scratch BO isn't leaked. */
+                vc4_bo_unreference(&bo);
+        }
+
+        cl_emit(&job->bcl, GL_SHADER_STATE, shader_state) {
+                /* Note that number of attributes == 0 in the packet means 8
+                 * attributes.  This field also contains the offset into
+                 * shader_rec.
+                 */
+                assert(vtx->num_elements <= 8);
+                shader_state.number_of_attribute_arrays =
+                        num_elements_emit & 0x7;
+        }
 
         vc4_write_uniforms(vc4, vc4->prog.fs,
                            &vc4->constbuf[PIPE_SHADER_FRAGMENT],
@@ -283,8 +287,12 @@
 {
         struct vc4_context *vc4 = vc4_context(pctx);
 
+        if (!info->count_from_stream_output && !info->indirect &&
+            !info->primitive_restart &&
+            !u_trim_pipe_prim(info->mode, (unsigned*)&info->count))
+                return;
+
         if (info->mode >= PIPE_PRIM_QUADS) {
-                util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf);
                 util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base);
                 util_primconvert_draw_vbo(vc4->primconvert, info);
                 perf_debug("Fallback conversion for %d %s vertices\n",
@@ -334,28 +342,29 @@
         /* Note that the primitive type fields match with OpenGL/gallium
          * definitions, up to but not including QUADS.
          */
-        struct vc4_cl_out *bcl = cl_start(&job->bcl);
-        if (info->indexed) {
-                uint32_t offset = vc4->indexbuf.offset;
-                uint32_t index_size = vc4->indexbuf.index_size;
+        if (info->index_size) {
+                uint32_t index_size = info->index_size;
+                uint32_t offset = info->start * index_size;
                 struct pipe_resource *prsc;
-                if (vc4->indexbuf.index_size == 4) {
-                        prsc = vc4_get_shadow_index_buffer(pctx, &vc4->indexbuf,
+                if (info->index_size == 4) {
+                        prsc = vc4_get_shadow_index_buffer(pctx, info,
+                                                           offset,
                                                            info->count, &offset);
                         index_size = 2;
                 } else {
-                        if (vc4->indexbuf.user_buffer) {
+                        if (info->has_user_indices) {
                                 prsc = NULL;
                                 u_upload_data(vc4->uploader, 0,
                                               info->count * index_size, 4,
-                                              vc4->indexbuf.user_buffer,
+                                              info->index.user,
                                               &offset, &prsc);
                         } else {
-                                prsc = vc4->indexbuf.buffer;
+                                prsc = info->index.resource;
                         }
                 }
                 struct vc4_resource *rsc = vc4_resource(prsc);
 
+                struct vc4_cl_out *bcl = cl_start(&job->bcl);
                 cl_start_reloc(&job->bcl, &bcl, 1);
                 cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
                 cl_u8(&bcl,
@@ -366,9 +375,10 @@
                 cl_u32(&bcl, info->count);
                 cl_reloc(job, &job->bcl, &bcl, rsc->bo, offset);
                 cl_u32(&bcl, vc4->max_index);
+                cl_end(&job->bcl, bcl);
                 job->draw_calls_queued++;
 
-                if (vc4->indexbuf.index_size == 4 || vc4->indexbuf.user_buffer)
+                if (info->index_size == 4 || info->has_user_indices)
                         pipe_resource_reference(&prsc, NULL);
         } else {
                 uint32_t count = info->count;
@@ -392,10 +402,8 @@
                          * plus whatever remainder.
                          */
                         if (extra_index_bias) {
-                                cl_end(&job->bcl, bcl);
                                 vc4_emit_gl_shader_state(vc4, info,
                                                          extra_index_bias);
-                                bcl = cl_start(&job->bcl);
                         }
 
                         if (start + count > max_verts) {
@@ -431,10 +439,11 @@
                                 }
                         }
 
-                        cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
-                        cl_u8(&bcl, info->mode);
-                        cl_u32(&bcl, this_count);
-                        cl_u32(&bcl, start);
+                        cl_emit(&job->bcl, VERTEX_ARRAY_PRIMITIVES, array) {
+                                array.primitive_mode = info->mode;
+                                array.length = this_count;
+                                array.index_of_first_vertex = start;
+                        }
                         job->draw_calls_queued++;
 
                         count -= step;
@@ -442,7 +451,6 @@
                         start = 0;
                 }
         }
-        cl_end(&job->bcl, bcl);
 
         /* We shouldn't have tripped the HW_2116 bug with the GFXH-515
          * workaround.
@@ -495,6 +503,37 @@
         struct vc4_context *vc4 = vc4_context(pctx);
         struct vc4_job *job = vc4_get_job_for_fbo(vc4);
 
+        if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
+                struct vc4_resource *rsc =
+                        vc4_resource(vc4->framebuffer.zsbuf->texture);
+                unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL;
+
+                /* Clearing ZS will clear both Z and stencil, so if we're
+                 * trying to clear just one then we need to draw a quad to do
+                 * it instead.  We need to do this before setting up
+                 * tile-based clears in vc4->job, because the blitter may
+                 * submit the current job.
+                 */
+                if ((zsclear == PIPE_CLEAR_DEPTH ||
+                     zsclear == PIPE_CLEAR_STENCIL) &&
+                    (rsc->initialized_buffers & ~(zsclear | job->cleared)) &&
+                    util_format_is_depth_and_stencil(vc4->framebuffer.zsbuf->format)) {
+                        perf_debug("Partial clear of Z+stencil buffer, "
+                                   "drawing a quad instead of fast clearing\n");
+                        vc4_blitter_save(vc4);
+                        util_blitter_clear(vc4->blitter,
+                                           vc4->framebuffer.width,
+                                           vc4->framebuffer.height,
+                                           1,
+                                           zsclear,
+                                           NULL, depth, stencil);
+                        buffers &= ~zsclear;
+                        if (!buffers)
+                                return;
+                        job = vc4_get_job_for_fbo(vc4);
+                }
+        }
+
         /* We can't flag new buffers for clearing once we've queued draws.  We
          * could avoid this by using the 3d engine to clear.
          */
@@ -530,29 +569,6 @@
         if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
                 struct vc4_resource *rsc =
                         vc4_resource(vc4->framebuffer.zsbuf->texture);
-                unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL;
-
-                /* Clearing ZS will clear both Z and stencil, so if we're
-                 * trying to clear just one then we need to draw a quad to do
-                 * it instead.
-                 */
-                if ((zsclear == PIPE_CLEAR_DEPTH ||
-                     zsclear == PIPE_CLEAR_STENCIL) &&
-                    (rsc->initialized_buffers & ~(zsclear | job->cleared)) &&
-                    util_format_is_depth_and_stencil(vc4->framebuffer.zsbuf->format)) {
-                        perf_debug("Partial clear of Z+stencil buffer, "
-                                   "drawing a quad instead of fast clearing\n");
-                        vc4_blitter_save(vc4);
-                        util_blitter_clear(vc4->blitter,
-                                           vc4->framebuffer.width,
-                                           vc4->framebuffer.height,
-                                           1,
-                                           zsclear,
-                                           NULL, depth, stencil);
-                        buffers &= ~zsclear;
-                        if (!buffers)
-                                return;
-                }
 
                 /* Though the depth buffer is stored with Z in the high 24,
                  * for this field we just need to store it in the low 24.
@@ -564,7 +580,7 @@
                 if (buffers & PIPE_CLEAR_STENCIL)
                         job->clear_stencil = stencil;
 
-                rsc->initialized_buffers |= zsclear;
+                rsc->initialized_buffers |= (buffers & PIPE_CLEAR_DEPTHSTENCIL);
         }
 
         job->draw_min_x = 0;
diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c
index b48d89a..d0a701f 100644
--- a/src/gallium/drivers/vc4/vc4_emit.c
+++ b/src/gallium/drivers/vc4/vc4_emit.c
@@ -29,7 +29,6 @@
         struct vc4_context *vc4 = vc4_context(pctx);
         struct vc4_job *job = vc4->job;
 
-        struct vc4_cl_out *bcl = cl_start(&job->bcl);
         if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT |
                           VC4_DIRTY_RASTERIZER)) {
                 float *vpscale = vc4->viewport.scale;
@@ -60,11 +59,12 @@
                         maxy = MIN2(vp_maxy, vc4->scissor.maxy);
                 }
 
-                cl_u8(&bcl, VC4_PACKET_CLIP_WINDOW);
-                cl_u16(&bcl, minx);
-                cl_u16(&bcl, miny);
-                cl_u16(&bcl, maxx - minx);
-                cl_u16(&bcl, maxy - miny);
+                cl_emit(&job->bcl, CLIP_WINDOW, clip) {
+                        clip.clip_window_left_pixel_coordinate = minx;
+                        clip.clip_window_bottom_pixel_coordinate = miny;
+                        clip.clip_window_height_in_pixels = maxy - miny;
+                        clip.clip_window_width_in_pixels = maxx - minx;
+                }
 
                 job->draw_min_x = MIN2(job->draw_min_x, minx);
                 job->draw_min_y = MIN2(job->draw_min_y, miny);
@@ -78,6 +78,7 @@
                 uint8_t ez_enable_mask_out = ~0;
                 uint8_t rasosm_mask_out = ~0;
 
+                struct vc4_cl_out *bcl = cl_start(&job->bcl);
                 /* HW-2905: If the RCL ends up doing a full-res load when
                  * multisampling, then early Z tracking may end up with values
                  * from the previous tile due to a HW bug.  Disable it to
@@ -110,39 +111,41 @@
                 cl_u8(&bcl,
                       (vc4->rasterizer->config_bits[2] |
                        vc4->zsa->config_bits[2]) & ez_enable_mask_out);
+                cl_end(&job->bcl, bcl);
         }
 
         if (vc4->dirty & VC4_DIRTY_RASTERIZER) {
-                cl_u8(&bcl, VC4_PACKET_DEPTH_OFFSET);
-                cl_u16(&bcl, vc4->rasterizer->offset_factor);
-                cl_u16(&bcl, vc4->rasterizer->offset_units);
-
-                cl_u8(&bcl, VC4_PACKET_POINT_SIZE);
-                cl_f(&bcl, vc4->rasterizer->point_size);
-
-                cl_u8(&bcl, VC4_PACKET_LINE_WIDTH);
-                cl_f(&bcl, vc4->rasterizer->base.line_width);
+                cl_emit_prepacked(&job->bcl, &vc4->rasterizer->packed);
         }
 
         if (vc4->dirty & VC4_DIRTY_VIEWPORT) {
-                cl_u8(&bcl, VC4_PACKET_CLIPPER_XY_SCALING);
-                cl_f(&bcl, vc4->viewport.scale[0] * 16.0f);
-                cl_f(&bcl, vc4->viewport.scale[1] * 16.0f);
+                cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+                        clip.viewport_half_width_in_1_16th_of_pixel =
+                                vc4->viewport.scale[0] * 16.0f;
+                        clip.viewport_half_height_in_1_16th_of_pixel =
+                                vc4->viewport.scale[1] * 16.0f;
+                }
 
-                cl_u8(&bcl, VC4_PACKET_CLIPPER_Z_SCALING);
-                cl_f(&bcl, vc4->viewport.translate[2]);
-                cl_f(&bcl, vc4->viewport.scale[2]);
+                cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+                        clip.viewport_z_offset_zc_to_zs =
+                                vc4->viewport.translate[2];
+                        clip.viewport_z_scale_zc_to_zs =
+                                vc4->viewport.scale[2];
+                }
 
-                cl_u8(&bcl, VC4_PACKET_VIEWPORT_OFFSET);
-                cl_u16(&bcl, 16 * vc4->viewport.translate[0]);
-                cl_u16(&bcl, 16 * vc4->viewport.translate[1]);
+                cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
+                        vp.viewport_centre_x_coordinate =
+                                16 * vc4->viewport.translate[0];
+                        vp.viewport_centre_y_coordinate =
+                                16 * vc4->viewport.translate[1];
+                }
         }
 
         if (vc4->dirty & VC4_DIRTY_FLAT_SHADE_FLAGS) {
-                cl_u8(&bcl, VC4_PACKET_FLAT_SHADE_FLAGS);
-                cl_u32(&bcl, vc4->rasterizer->base.flatshade ?
-                       vc4->prog.fs->color_inputs : 0);
+                cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
+                        if (vc4->rasterizer->base.flatshade)
+                                flags.flat_shading_flags =
+                                        vc4->prog.fs->color_inputs;
+                }
         }
-
-        cl_end(&job->bcl, bcl);
 }
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index d39472e..6a1d1a4 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -27,6 +27,7 @@
  */
 
 #include <xf86drm.h>
+#include "vc4_cl_dump.h"
 #include "vc4_context.h"
 #include "util/hash_table.h"
 
@@ -117,12 +118,17 @@
                 struct vc4_job *job = entry->data;
 
                 struct vc4_bo **referenced_bos = job->bo_pointers.base;
+                bool found = false;
                 for (int i = 0; i < cl_offset(&job->bo_handles) / 4; i++) {
                         if (referenced_bos[i] == rsc->bo) {
-                                vc4_job_submit(vc4, job);
-                                continue;
+                                found = true;
+                                break;
                         }
                 }
+                if (found) {
+                        vc4_job_submit(vc4, job);
+                        continue;
+                }
 
                 /* Also check for the Z/color buffers, since the references to
                  * those are only added immediately before submit.
@@ -377,13 +383,11 @@
                  * until the FLUSH completes.
                  */
                 cl_ensure_space(&job->bcl, 8);
-                struct vc4_cl_out *bcl = cl_start(&job->bcl);
-                cl_u8(&bcl, VC4_PACKET_INCREMENT_SEMAPHORE);
+                cl_emit(&job->bcl, INCREMENT_SEMAPHORE, incr);
                 /* The FLUSH caps all of our bin lists with a
                  * VC4_PACKET_RETURN.
                  */
-                cl_u8(&bcl, VC4_PACKET_FLUSH);
-                cl_end(&job->bcl, bcl);
+                cl_emit(&job->bcl, FLUSH, flush);
         }
         struct drm_vc4_submit_cl submit = {
                 .color_read.hindex = ~0,
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index 2ed89ea..a28ebb5 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -494,7 +494,7 @@
         discard->num_components = 1;
         discard->src[0] = nir_src_for_ssa(nir_inot(b, condition));
         nir_builder_instr_insert(b, &discard->instr);
-        c->s->info->fs.uses_discard = true;
+        c->s->info.fs.uses_discard = true;
 }
 
 static nir_ssa_def *
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 5936873..333206b 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1347,7 +1347,7 @@
         }
 
         uint32_t discard_cond = QPU_COND_ALWAYS;
-        if (c->s->info->fs.uses_discard) {
+        if (c->s->info.fs.uses_discard) {
                 qir_SF(c, c->discard);
                 discard_cond = QPU_COND_ZS;
         }
@@ -1708,8 +1708,7 @@
 static void
 ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr)
 {
-        nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
-        assert(const_offset->u32[0] == 0);
+        assert(nir_src_as_const_value(instr->src[0])->u32[0] == 0);
 
         /* Reads of the per-sample color need to be done in
          * order.
@@ -2158,7 +2157,7 @@
 static void
 nir_to_qir(struct vc4_compile *c)
 {
-        if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard)
+        if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard)
                 c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
 
         ntq_setup_inputs(c);
@@ -2583,7 +2582,7 @@
 
                 /* Note: the temporary clone in c->s has been freed. */
                 nir_shader *orig_shader = key->shader_state->base.ir.nir;
-                if (orig_shader->info->outputs_written & (1 << FRAG_RESULT_DEPTH))
+                if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
                         shader->disable_early_z = true;
         } else {
                 shader->num_inputs = c->num_inputs;
@@ -2763,11 +2762,11 @@
         vc4->dirty |= VC4_DIRTY_COMPILED_FS;
 
         if (vc4->rasterizer->base.flatshade &&
-            old_fs && vc4->prog.fs->color_inputs != old_fs->color_inputs) {
+            (!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) {
                 vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
         }
 
-        if (old_fs && vc4->prog.fs->fs_inputs != old_fs->fs_inputs)
+        if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs)
                 vc4->dirty |= VC4_DIRTY_FS_INPUTS;
 }
 
@@ -2877,6 +2876,7 @@
 
 static void
 delete_from_cache_if_matches(struct hash_table *ht,
+                             struct vc4_compiled_shader **last_compile,
                              struct hash_entry *entry,
                              struct vc4_uncompiled_shader *so)
 {
@@ -2886,6 +2886,10 @@
                 struct vc4_compiled_shader *shader = entry->data;
                 _mesa_hash_table_remove(ht, entry);
                 vc4_bo_unreference(&shader->bo);
+
+                if (shader == *last_compile)
+                        *last_compile = NULL;
+
                 ralloc_free(shader);
         }
 }
@@ -2897,10 +2901,14 @@
         struct vc4_uncompiled_shader *so = hwcso;
 
         struct hash_entry *entry;
-        hash_table_foreach(vc4->fs_cache, entry)
-                delete_from_cache_if_matches(vc4->fs_cache, entry, so);
-        hash_table_foreach(vc4->vs_cache, entry)
-                delete_from_cache_if_matches(vc4->vs_cache, entry, so);
+        hash_table_foreach(vc4->fs_cache, entry) {
+                delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs,
+                                             entry, so);
+        }
+        hash_table_foreach(vc4->vs_cache, entry) {
+                delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs,
+                                             entry, so);
+        }
 
         ralloc_free(so->base.ir.nir);
         free(so);
diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
index 9ecfe65..ad19f06 100644
--- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
@@ -66,7 +66,7 @@
 
         entry = _mesa_hash_table_search(ht, key);
         assert(entry);
-        entry->data--;
+        entry->data = (void *)(((uintptr_t) entry->data) - 1);
         if (entry->data == NULL)
                 _mesa_hash_table_remove(ht, entry);
 }
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index aaa3a04..cd9a498 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -204,9 +204,9 @@
 static void
 set_last_dst_pack(struct qblock *block, struct qinst *inst)
 {
-        bool had_pm = *last_inst(block) & QPU_PM;
-        bool had_ws = *last_inst(block) & QPU_WS;
-        uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);
+        MAYBE_UNUSED bool had_pm = *last_inst(block) & QPU_PM;
+        MAYBE_UNUSED bool had_ws = *last_inst(block) & QPU_WS;
+        MAYBE_UNUSED uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);
 
         if (!inst->dst.pack)
                 return;
@@ -419,7 +419,7 @@
                         break;
                 }
 
-                bool handled_qinst_cond = false;
+                MAYBE_UNUSED bool handled_qinst_cond = false;
 
                 switch (qinst->op) {
                 case QOP_RCP:
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 596f73d..853f7bb 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -29,21 +29,25 @@
 #include "util/u_surface.h"
 #include "util/u_upload_mgr.h"
 
+#include "drm_fourcc.h"
+#include "vc4_drm.h"
 #include "vc4_screen.h"
 #include "vc4_context.h"
 #include "vc4_resource.h"
 #include "vc4_tiling.h"
 
-static bool miptree_debug = false;
+#ifndef DRM_FORMAT_MOD_INVALID
+#define DRM_FORMAT_MOD_INVALID ((1ULL << 56) - 1)
+#endif
 
 static bool
 vc4_resource_bo_alloc(struct vc4_resource *rsc)
 {
-        struct pipe_resource *prsc = &rsc->base.b;
+        struct pipe_resource *prsc = &rsc->base;
         struct pipe_screen *pscreen = prsc->screen;
         struct vc4_bo *bo;
 
-        if (miptree_debug) {
+        if (vc4_debug & VC4_DEBUG_SURFACE) {
                 fprintf(stderr, "alloc %p: size %d + offset %d -> %d\n",
                         rsc,
                         rsc->slices[0].size,
@@ -258,10 +262,6 @@
                 ptrans->box.z = 0;
         }
 
-        /* Note that the current kernel implementation is synchronous, so no
-         * need to do syncing stuff here yet.
-         */
-
         if (usage & PIPE_TRANSFER_UNSYNCHRONIZED)
                 buf = vc4_bo_map_unsynchronized(rsc->bo);
         else
@@ -371,35 +371,70 @@
 vc4_resource_destroy(struct pipe_screen *pscreen,
                      struct pipe_resource *prsc)
 {
+        struct vc4_screen *screen = vc4_screen(pscreen);
         struct vc4_resource *rsc = vc4_resource(prsc);
-        pipe_resource_reference(&rsc->shadow_parent, NULL);
         vc4_bo_unreference(&rsc->bo);
+
+        if (rsc->scanout)
+                renderonly_scanout_destroy(rsc->scanout, screen->ro);
+
         free(rsc);
 }
 
 static boolean
 vc4_resource_get_handle(struct pipe_screen *pscreen,
+                        struct pipe_context *pctx,
                         struct pipe_resource *prsc,
-                        struct winsys_handle *handle)
+                        struct winsys_handle *whandle,
+                        unsigned usage)
 {
+        struct vc4_screen *screen = vc4_screen(pscreen);
         struct vc4_resource *rsc = vc4_resource(prsc);
 
-        return vc4_screen_bo_get_handle(pscreen, rsc->bo, rsc->slices[0].stride,
-                                        handle);
+        whandle->stride = rsc->slices[0].stride;
+        whandle->offset = 0;
+
+        /* If we're passing some reference to our BO out to some other part of
+         * the system, then we can't do any optimizations about only us being
+         * the ones seeing it (like BO caching or shadow update avoidance).
+         */
+        rsc->bo->private = false;
+
+        if (rsc->tiled)
+                whandle->modifier = DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED;
+        else
+                whandle->modifier = DRM_FORMAT_MOD_LINEAR;
+
+        switch (whandle->type) {
+        case DRM_API_HANDLE_TYPE_SHARED:
+                if (screen->ro) {
+                        /* This could probably be supported, assuming that a
+                         * control node was used for pl111.
+                         */
+                        fprintf(stderr, "flink unsupported with pl111\n");
+                        return FALSE;
+                }
+
+                return vc4_bo_flink(rsc->bo, &whandle->handle);
+        case DRM_API_HANDLE_TYPE_KMS:
+                if (screen->ro && renderonly_get_handle(rsc->scanout, whandle))
+                        return TRUE;
+                whandle->handle = rsc->bo->handle;
+                return TRUE;
+        case DRM_API_HANDLE_TYPE_FD:
+                /* FDs are cross-device, so we can export directly from vc4.
+                 */
+                whandle->handle = vc4_bo_get_dmabuf(rsc->bo);
+                return whandle->handle != -1;
+        }
+
+        return FALSE;
 }
 
-static const struct u_resource_vtbl vc4_resource_vtbl = {
-        .resource_get_handle      = vc4_resource_get_handle,
-        .resource_destroy         = vc4_resource_destroy,
-        .transfer_map             = vc4_resource_transfer_map,
-        .transfer_flush_region    = u_default_transfer_flush_region,
-        .transfer_unmap           = vc4_resource_transfer_unmap,
-};
-
 static void
-vc4_setup_slices(struct vc4_resource *rsc)
+vc4_setup_slices(struct vc4_resource *rsc, const char *caller)
 {
-        struct pipe_resource *prsc = &rsc->base.b;
+        struct pipe_resource *prsc = &rsc->base;
         uint32_t width = prsc->width0;
         uint32_t height = prsc->height0;
         if (prsc->format == PIPE_FORMAT_ETC1_RGB8) {
@@ -456,16 +491,16 @@
 
                 offset += slice->size;
 
-                if (miptree_debug) {
+                if (vc4_debug & VC4_DEBUG_SURFACE) {
                         static const char tiling_chars[] = {
                                 [VC4_TILING_FORMAT_LINEAR] = 'R',
                                 [VC4_TILING_FORMAT_LT] = 'L',
                                 [VC4_TILING_FORMAT_T] = 'T'
                         };
                         fprintf(stderr,
-                                "rsc setup %p (format %s: vc4 %d), %dx%d: "
+                                "rsc %s %p (format %s: vc4 %d), %dx%d: "
                                 "level %d (%c) -> %dx%d, stride %d@0x%08x\n",
-                                rsc,
+                                caller, rsc,
                                 util_format_short_name(prsc->format),
                                 rsc->vc4_format,
                                 prsc->width0, prsc->height0,
@@ -502,14 +537,13 @@
         struct vc4_resource *rsc = CALLOC_STRUCT(vc4_resource);
         if (!rsc)
                 return NULL;
-        struct pipe_resource *prsc = &rsc->base.b;
+        struct pipe_resource *prsc = &rsc->base;
 
         *prsc = *tmpl;
 
         pipe_reference_init(&prsc->reference, 1);
         prsc->screen = pscreen;
 
-        rsc->base.vtbl = &vc4_resource_vtbl;
         if (prsc->nr_samples <= 1)
                 rsc->cpp = util_format_get_blocksize(tmpl->format);
         else
@@ -538,56 +572,209 @@
         return format;
 }
 
-struct pipe_resource *
-vc4_resource_create(struct pipe_screen *pscreen,
-                    const struct pipe_resource *tmpl)
+static bool
+find_modifier(uint64_t needle, const uint64_t *haystack, int count)
 {
-        struct vc4_resource *rsc = vc4_resource_setup(pscreen, tmpl);
-        struct pipe_resource *prsc = &rsc->base.b;
+        int i;
 
-        /* We have to make shared be untiled, since we don't have any way to
-         * communicate metadata about tiling currently.
+        for (i = 0; i < count; i++) {
+                if (haystack[i] == needle)
+                        return true;
+        }
+
+        return false;
+}
+
+static struct pipe_resource *
+vc4_resource_create_with_modifiers(struct pipe_screen *pscreen,
+                                   const struct pipe_resource *tmpl,
+                                   const uint64_t *modifiers,
+                                   int count)
+{
+        struct vc4_screen *screen = vc4_screen(pscreen);
+        struct vc4_resource *rsc = vc4_resource_setup(pscreen, tmpl);
+        struct pipe_resource *prsc = &rsc->base;
+        bool linear_ok = find_modifier(DRM_FORMAT_MOD_LINEAR, modifiers, count);
+        /* Use a tiled layout if we can, for better 3D performance. */
+        bool should_tile = true;
+
+        /* VBOs/PBOs are untiled (and 1 height). */
+        if (tmpl->target == PIPE_BUFFER)
+                should_tile = false;
+
+        /* MSAA buffers are linear. */
+        if (tmpl->nr_samples > 1)
+                should_tile = false;
+
+        /* No tiling when we're sharing with another device (pl111). */
+        if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT))
+                should_tile = false;
+
+        /* Cursors are always linear, and the user can request linear as well.
          */
-        if (tmpl->target == PIPE_BUFFER ||
-            tmpl->nr_samples > 1 ||
-            (tmpl->bind & (PIPE_BIND_SCANOUT |
-                           PIPE_BIND_LINEAR |
-                           PIPE_BIND_SHARED |
-                           PIPE_BIND_CURSOR))) {
+        if (tmpl->bind & (PIPE_BIND_LINEAR | PIPE_BIND_CURSOR))
+                should_tile = false;
+
+        /* No shared objects with LT format -- the kernel only has T-format
+         * metadata.  LT objects are small enough it's not worth the trouble to
+         * give them metadata to tile.
+         */
+        if ((tmpl->bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT)) &&
+            vc4_size_is_lt(prsc->width0, prsc->height0, rsc->cpp))
+                should_tile = false;
+
+        /* If we're sharing or scanning out, we need the ioctl present to
+         * inform the kernel or the other side.
+         */
+        if ((tmpl->bind & (PIPE_BIND_SHARED |
+                           PIPE_BIND_SCANOUT)) && !screen->has_tiling_ioctl)
+                should_tile = false;
+
+        /* No user-specified modifier; determine our own. */
+        if (count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID) {
+                linear_ok = true;
+                rsc->tiled = should_tile;
+        } else if (should_tile &&
+                   find_modifier(DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED,
+                                 modifiers, count)) {
+                rsc->tiled = true;
+        } else if (linear_ok) {
                 rsc->tiled = false;
         } else {
-                rsc->tiled = true;
+                fprintf(stderr, "Unsupported modifier requested\n");
+                return NULL;
         }
 
         if (tmpl->target != PIPE_BUFFER)
                 rsc->vc4_format = get_resource_texture_format(prsc);
 
-        vc4_setup_slices(rsc);
+        vc4_setup_slices(rsc, "create");
         if (!vc4_resource_bo_alloc(rsc))
                 goto fail;
 
+        if (screen->has_tiling_ioctl) {
+                uint64_t modifier;
+                if (rsc->tiled)
+                        modifier = DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED;
+                else
+                        modifier = DRM_FORMAT_MOD_LINEAR;
+                struct drm_vc4_set_tiling set_tiling = {
+                        .handle = rsc->bo->handle,
+                        .modifier = modifier,
+                };
+                int ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_SET_TILING,
+                                    &set_tiling);
+                if (ret != 0)
+                        goto fail;
+        }
+
+        if (screen->ro && tmpl->bind & PIPE_BIND_SCANOUT) {
+                rsc->scanout =
+                        renderonly_scanout_for_resource(prsc, screen->ro, NULL);
+                if (!rsc->scanout)
+                        goto fail;
+        }
+
         return prsc;
 fail:
         vc4_resource_destroy(pscreen, prsc);
         return NULL;
 }
 
+struct pipe_resource *
+vc4_resource_create(struct pipe_screen *pscreen,
+                    const struct pipe_resource *tmpl)
+{
+        const uint64_t mod = DRM_FORMAT_MOD_INVALID;
+        return vc4_resource_create_with_modifiers(pscreen, tmpl, &mod, 1);
+}
+
 static struct pipe_resource *
 vc4_resource_from_handle(struct pipe_screen *pscreen,
                          const struct pipe_resource *tmpl,
-                         struct winsys_handle *handle,
+                         struct winsys_handle *whandle,
                          unsigned usage)
 {
+        struct vc4_screen *screen = vc4_screen(pscreen);
         struct vc4_resource *rsc = vc4_resource_setup(pscreen, tmpl);
-        struct pipe_resource *prsc = &rsc->base.b;
+        struct pipe_resource *prsc = &rsc->base;
         struct vc4_resource_slice *slice = &rsc->slices[0];
-        uint32_t expected_stride =
-            align(prsc->width0, vc4_utile_width(rsc->cpp)) * rsc->cpp;
 
         if (!rsc)
                 return NULL;
 
-        if (handle->stride != expected_stride) {
+        if (whandle->offset != 0) {
+                fprintf(stderr,
+                        "Attempt to import unsupported winsys offset %u\n",
+                        whandle->offset);
+                return NULL;
+        }
+
+        switch (whandle->type) {
+        case DRM_API_HANDLE_TYPE_SHARED:
+                rsc->bo = vc4_bo_open_name(screen,
+                                           whandle->handle, whandle->stride);
+                break;
+        case DRM_API_HANDLE_TYPE_FD:
+                rsc->bo = vc4_bo_open_dmabuf(screen,
+                                             whandle->handle, whandle->stride);
+                break;
+        default:
+                fprintf(stderr,
+                        "Attempt to import unsupported handle type %d\n",
+                        whandle->type);
+        }
+
+        if (!rsc->bo)
+                goto fail;
+
+        struct drm_vc4_get_tiling get_tiling = {
+                .handle = rsc->bo->handle,
+        };
+        int ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_TILING, &get_tiling);
+
+        if (ret != 0) {
+                whandle->modifier = DRM_FORMAT_MOD_LINEAR;
+        } else if (whandle->modifier == DRM_FORMAT_MOD_INVALID) {
+                whandle->modifier = get_tiling.modifier;
+        } else if (whandle->modifier != get_tiling.modifier) {
+                fprintf(stderr,
+                        "Modifier 0x%llx vs. tiling (0x%llx) mismatch\n",
+                        (long long)whandle->modifier, get_tiling.modifier);
+                goto fail;
+        }
+
+        switch (whandle->modifier) {
+        case DRM_FORMAT_MOD_LINEAR:
+                rsc->tiled = false;
+                break;
+        case DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED:
+                rsc->tiled = true;
+                break;
+        default:
+                fprintf(stderr,
+                        "Attempt to import unsupported modifier 0x%llx\n",
+                        (long long)whandle->modifier);
+                goto fail;
+        }
+
+        rsc->vc4_format = get_resource_texture_format(prsc);
+        vc4_setup_slices(rsc, "import");
+
+        if (screen->ro) {
+                /* Make sure that renderonly has a handle to our buffer in the
+                 * display's fd, so that a later renderonly_get_handle()
+                 * returns correct handles or GEM names.
+                 */
+                rsc->scanout =
+                        renderonly_create_gpu_import_for_resource(prsc,
+                                                                  screen->ro,
+                                                                  NULL);
+                if (!rsc->scanout)
+                        goto fail;
+        }
+
+        if (whandle->stride != slice->stride) {
                 static bool warned = false;
                 if (!warned) {
                         warned = true;
@@ -596,31 +783,12 @@
                                 "unsupported stride %d instead of %d\n",
                                 prsc->width0, prsc->height0,
                                 util_format_short_name(prsc->format),
-                                handle->stride,
-                                expected_stride);
+                                whandle->stride,
+                                slice->stride);
                 }
                 goto fail;
         }
 
-        rsc->tiled = false;
-        rsc->bo = vc4_screen_bo_from_handle(pscreen, handle);
-        if (!rsc->bo)
-                goto fail;
-
-        slice->stride = handle->stride;
-        slice->tiling = VC4_TILING_FORMAT_LINEAR;
-
-        rsc->vc4_format = get_resource_texture_format(prsc);
-
-        if (miptree_debug) {
-                fprintf(stderr,
-                        "rsc import %p (format %d), %dx%d: "
-                        "level 0 (R) -> stride %d@0x%08x\n",
-                        rsc, rsc->vc4_format,
-                        prsc->width0, prsc->height0,
-                        slice->stride, slice->offset);
-        }
-
         return prsc;
 
 fail:
@@ -847,8 +1015,6 @@
         uint32_t char_w = 140, char_h = 60;
         uint32_t char_w_per_tile = char_w / tiles_w - 1;
         uint32_t char_h_per_tile = char_h / tiles_h - 1;
-        uint32_t found_colors[10];
-        uint32_t num_found_colors = 0;
 
         fprintf(stderr, "Surface: %dx%d (%dx MSAA)\n",
                 psurf->width, psurf->height, psurf->texture->nr_samples);
@@ -886,10 +1052,6 @@
                         fprintf(stderr, "-");
                 fprintf(stderr, "\n");
         }
-
-        for (int i = 0; i < num_found_colors; i++) {
-                fprintf(stderr, "color %d: 0x%08x\n", i, found_colors[i]);
-        }
 }
 
 /** Debug routine to dump the contents of an 8888 surface to the console */
@@ -915,26 +1077,28 @@
 
 void
 vc4_update_shadow_baselevel_texture(struct pipe_context *pctx,
-                                    struct pipe_sampler_view *view)
+                                    struct pipe_sampler_view *pview)
 {
+        struct vc4_sampler_view *view = vc4_sampler_view(pview);
         struct vc4_resource *shadow = vc4_resource(view->texture);
-        struct vc4_resource *orig = vc4_resource(shadow->shadow_parent);
-        assert(orig);
+        struct vc4_resource *orig = vc4_resource(pview->texture);
+
+        assert(view->texture != pview->texture);
 
         if (shadow->writes == orig->writes && orig->bo->private)
                 return;
 
         perf_debug("Updating %dx%d@%d shadow texture due to %s\n",
-                   orig->base.b.width0, orig->base.b.height0,
-                   view->u.tex.first_level,
-                   view->u.tex.first_level ? "base level" : "raster layout");
+                   orig->base.width0, orig->base.height0,
+                   pview->u.tex.first_level,
+                   pview->u.tex.first_level ? "base level" : "raster layout");
 
-        for (int i = 0; i <= shadow->base.b.last_level; i++) {
-                unsigned width = u_minify(shadow->base.b.width0, i);
-                unsigned height = u_minify(shadow->base.b.height0, i);
+        for (int i = 0; i <= shadow->base.last_level; i++) {
+                unsigned width = u_minify(shadow->base.width0, i);
+                unsigned height = u_minify(shadow->base.height0, i);
                 struct pipe_blit_info info = {
                         .dst = {
-                                .resource = &shadow->base.b,
+                                .resource = &shadow->base,
                                 .level = i,
                                 .box = {
                                         .x = 0,
@@ -944,11 +1108,11 @@
                                         .height = height,
                                         .depth = 1,
                                 },
-                                .format = shadow->base.b.format,
+                                .format = shadow->base.format,
                         },
                         .src = {
-                                .resource = &orig->base.b,
-                                .level = view->u.tex.first_level + i,
+                                .resource = &orig->base,
+                                .level = pview->u.tex.first_level + i,
                                 .box = {
                                         .x = 0,
                                         .y = 0,
@@ -957,7 +1121,7 @@
                                         .height = height,
                                         .depth = 1,
                                 },
-                                .format = orig->base.b.format,
+                                .format = orig->base.format,
                         },
                         .mask = ~0,
                 };
@@ -980,12 +1144,13 @@
  */
 struct pipe_resource *
 vc4_get_shadow_index_buffer(struct pipe_context *pctx,
-                            const struct pipe_index_buffer *ib,
+                            const struct pipe_draw_info *info,
+                            uint32_t offset,
                             uint32_t count,
                             uint32_t *shadow_offset)
 {
         struct vc4_context *vc4 = vc4_context(pctx);
-        struct vc4_resource *orig = vc4_resource(ib->buffer);
+        struct vc4_resource *orig = vc4_resource(info->index.resource);
         perf_debug("Fallback conversion for %d uint indices\n", count);
 
         void *data;
@@ -996,11 +1161,11 @@
 
         struct pipe_transfer *src_transfer = NULL;
         const uint32_t *src;
-        if (ib->user_buffer) {
-                src = ib->user_buffer;
+        if (info->has_user_indices) {
+                src = info->index.user;
         } else {
-                src = pipe_buffer_map_range(pctx, &orig->base.b,
-                                            ib->offset,
+                src = pipe_buffer_map_range(pctx, &orig->base,
+                                            offset,
                                             count * 4,
                                             PIPE_TRANSFER_READ, &src_transfer);
         }
@@ -1020,18 +1185,34 @@
 void
 vc4_resource_screen_init(struct pipe_screen *pscreen)
 {
+        struct vc4_screen *screen = vc4_screen(pscreen);
+
         pscreen->resource_create = vc4_resource_create;
+        pscreen->resource_create_with_modifiers =
+                vc4_resource_create_with_modifiers;
         pscreen->resource_from_handle = vc4_resource_from_handle;
-        pscreen->resource_get_handle = u_resource_get_handle_vtbl;
         pscreen->resource_destroy = u_resource_destroy_vtbl;
+        pscreen->resource_get_handle = vc4_resource_get_handle;
+        pscreen->resource_destroy = vc4_resource_destroy;
+
+        /* Test if the kernel has GET_TILING; it will return -EINVAL if the
+         * ioctl does not exist, but -ENOENT if we pass an impossible handle.
+         * 0 cannot be a valid GEM object, so use that.
+         */
+        struct drm_vc4_get_tiling get_tiling = {
+                .handle = 0x0,
+        };
+        int ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_TILING, &get_tiling);
+        if (ret == -1 && errno == ENOENT)
+                screen->has_tiling_ioctl = true;
 }
 
 void
 vc4_resource_context_init(struct pipe_context *pctx)
 {
-        pctx->transfer_map = u_transfer_map_vtbl;
-        pctx->transfer_flush_region = u_transfer_flush_region_vtbl;
-        pctx->transfer_unmap = u_transfer_unmap_vtbl;
+        pctx->transfer_map = vc4_resource_transfer_map;
+        pctx->transfer_flush_region = u_default_transfer_flush_region;
+        pctx->transfer_unmap = vc4_resource_transfer_unmap;
         pctx->buffer_subdata = u_default_buffer_subdata;
         pctx->texture_subdata = u_default_texture_subdata;
         pctx->create_surface = vc4_create_surface;
diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h
index 27aa4e8..d4c491e 100644
--- a/src/gallium/drivers/vc4/vc4_resource.h
+++ b/src/gallium/drivers/vc4/vc4_resource.h
@@ -52,8 +52,9 @@
 };
 
 struct vc4_resource {
-        struct u_resource base;
+        struct pipe_resource base;
         struct vc4_bo *bo;
+        struct renderonly_scanout *scanout;
         struct vc4_resource_slice slices[VC4_MAX_MIP_LEVELS];
         uint32_t cube_map_stride;
         int cpp;
@@ -80,20 +81,6 @@
          * buffer) may get marked.
          */
         uint32_t initialized_buffers;
-
-        /**
-         * Resource containing the non-GL_TEXTURE_BASE_LEVEL-rebased texture
-         * contents, or the 4-byte index buffer.
-         *
-         * If the parent is set for an texture, then this resource is actually
-         * the texture contents just starting from the sampler_view's
-         * first_level.
-         *
-         * If the parent is set for an index index buffer, then this resource
-         * is actually a shadow containing a 2-byte index buffer starting from
-         * the ib's offset.
-         */
-        struct pipe_resource *shadow_parent;
 };
 
 static inline struct vc4_resource *
@@ -121,9 +108,10 @@
 void vc4_update_shadow_baselevel_texture(struct pipe_context *pctx,
                                          struct pipe_sampler_view *view);
 struct pipe_resource *vc4_get_shadow_index_buffer(struct pipe_context *pctx,
-                                                  const struct pipe_index_buffer *ib,
+                                                  const struct pipe_draw_info *info,
+                                                  uint32_t offset,
                                                   uint32_t count,
-                                                  uint32_t *offset);
+                                                  uint32_t *shadow_offset);
 void vc4_dump_surface(struct pipe_surface *psurf);
 
 #endif /* VC4_RESOURCE_H */
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 27d23dc..f3b47ca 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -27,6 +27,7 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h"
 
+#include "util/u_cpu_detect.h"
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "util/u_format.h"
@@ -34,6 +35,7 @@
 #include "util/ralloc.h"
 
 #include <xf86drm.h>
+#include "drm_fourcc.h"
 #include "vc4_drm.h"
 #include "vc4_screen.h"
 #include "vc4_context.h"
@@ -42,6 +44,8 @@
 static const struct debug_named_value debug_options[] = {
         { "cl",       VC4_DEBUG_CL,
           "Dump command list during creation" },
+        { "surf",       VC4_DEBUG_SURFACE,
+          "Dump surface layouts" },
         { "qpu",      VC4_DEBUG_QPU,
           "Dump generated QPU instructions" },
         { "qir",      VC4_DEBUG_QIR,
@@ -99,6 +103,7 @@
         util_hash_table_destroy(screen->bo_handles);
         vc4_bufmgr_destroy(pscreen);
         slab_destroy_parent(&screen->transfer_pool);
+        free(screen->ro);
 
 #if USE_VC4_SIMULATOR
         vc4_simulator_destroy(screen);
@@ -126,6 +131,7 @@
         case PIPE_CAP_TEXTURE_MULTISAMPLE:
         case PIPE_CAP_TEXTURE_SWIZZLE:
         case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
+        case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
                 return 1;
 
                 /* lying for GL 2.0 */
@@ -252,6 +258,9 @@
         case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
         case PIPE_CAP_TGSI_BALLOT:
         case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+	case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+        case PIPE_CAP_POST_DEPTH_COVERAGE:
+        case PIPE_CAP_BINDLESS_TEXTURE:
                 return 0;
 
                 /* Stream output. */
@@ -411,6 +420,7 @@
         case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
         case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
 	case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+        case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
                 return 0;
         default:
                 fprintf(stderr, "unknown shader param %d\n", param);
@@ -525,6 +535,34 @@
         return retval == usage;
 }
 
+static void
+vc4_screen_query_dmabuf_modifiers(struct pipe_screen *pscreen,
+                                  enum pipe_format format, int max,
+                                  uint64_t *modifiers,
+                                  unsigned int *external_only,
+                                  int *count)
+{
+        if (!modifiers) {
+                *count = 2;
+                return;
+        }
+
+        *count = MIN2(max, 2);
+
+        /* We support both modifiers (tiled and linear) for all sampler
+         * formats.
+         */
+        modifiers[0] = DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED;
+        if (external_only)
+                external_only[0] = false;
+        if (max < 2)
+                return;
+
+        modifiers[1] = DRM_FORMAT_MOD_LINEAR;
+        if (external_only)
+                external_only[1] = false;
+}
+
 #define PTR_TO_UINT(x) ((unsigned)((intptr_t)(x)))
 
 static unsigned handle_hash(void *key)
@@ -587,7 +625,7 @@
         uint32_t minor = (ident1.value >> 0) & 0xf;
         screen->v3d_ver = major * 10 + minor;
 
-        if (screen->v3d_ver != 21) {
+        if (screen->v3d_ver != 21 && screen->v3d_ver != 26) {
                 fprintf(stderr,
                         "V3D %d.%d not supported by this version of Mesa.\n",
                         screen->v3d_ver / 10,
@@ -599,7 +637,7 @@
 }
 
 struct pipe_screen *
-vc4_screen_create(int fd)
+vc4_screen_create(int fd, struct renderonly *ro)
 {
         struct vc4_screen *screen = rzalloc(NULL, struct vc4_screen);
         struct pipe_screen *pscreen;
@@ -614,6 +652,15 @@
         pscreen->is_format_supported = vc4_screen_is_format_supported;
 
         screen->fd = fd;
+        if (ro) {
+                screen->ro = renderonly_dup(ro);
+                if (!screen->ro) {
+                        fprintf(stderr, "Failed to dup renderonly object\n");
+                        ralloc_free(screen);
+                        return NULL;
+                }
+        }
+
         list_inithead(&screen->bo_cache.time_list);
         (void) mtx_init(&screen->bo_handles_mutex, mtx_plain);
         screen->bo_handles = util_hash_table_create(handle_hash, handle_compare);
@@ -628,6 +675,8 @@
         if (!vc4_get_chip_info(screen))
                 goto fail;
 
+        util_cpu_detect();
+
         slab_create_parent(&screen->transfer_pool, sizeof(struct vc4_transfer), 16);
 
         vc4_fence_init(screen);
@@ -646,6 +695,7 @@
         pscreen->get_vendor = vc4_screen_get_vendor;
         pscreen->get_device_vendor = vc4_screen_get_vendor;
         pscreen->get_compiler_options = vc4_screen_get_compiler_options;
+        pscreen->query_dmabuf_modifiers = vc4_screen_query_dmabuf_modifiers;
 
         return pscreen;
 
@@ -654,57 +704,3 @@
         ralloc_free(pscreen);
         return NULL;
 }
-
-boolean
-vc4_screen_bo_get_handle(struct pipe_screen *pscreen,
-                         struct vc4_bo *bo,
-                         unsigned stride,
-                         struct winsys_handle *whandle)
-{
-        whandle->stride = stride;
-
-        /* If we're passing some reference to our BO out to some other part of
-         * the system, then we can't do any optimizations about only us being
-         * the ones seeing it (like BO caching or shadow update avoidance).
-         */
-        bo->private = false;
-
-        switch (whandle->type) {
-        case DRM_API_HANDLE_TYPE_SHARED:
-                return vc4_bo_flink(bo, &whandle->handle);
-        case DRM_API_HANDLE_TYPE_KMS:
-                whandle->handle = bo->handle;
-                return TRUE;
-        case DRM_API_HANDLE_TYPE_FD:
-                whandle->handle = vc4_bo_get_dmabuf(bo);
-                return whandle->handle != -1;
-        }
-
-        return FALSE;
-}
-
-struct vc4_bo *
-vc4_screen_bo_from_handle(struct pipe_screen *pscreen,
-                          struct winsys_handle *whandle)
-{
-        struct vc4_screen *screen = vc4_screen(pscreen);
-
-        if (whandle->offset != 0) {
-                fprintf(stderr,
-                        "Attempt to import unsupported winsys offset %u\n",
-                        whandle->offset);
-                return NULL;
-        }
-
-        switch (whandle->type) {
-        case DRM_API_HANDLE_TYPE_SHARED:
-                return vc4_bo_open_name(screen, whandle->handle, whandle->stride);
-        case DRM_API_HANDLE_TYPE_FD:
-                return vc4_bo_open_dmabuf(screen, whandle->handle, whandle->stride);
-        default:
-                fprintf(stderr,
-                        "Attempt to import unsupported handle type %d\n",
-                        whandle->type);
-                return NULL;
-        }
-}
diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h
index 34d1538..8510821 100644
--- a/src/gallium/drivers/vc4/vc4_screen.h
+++ b/src/gallium/drivers/vc4/vc4_screen.h
@@ -25,6 +25,7 @@
 #define VC4_SCREEN_H
 
 #include "pipe/p_screen.h"
+#include "renderonly/renderonly.h"
 #include "os/os_thread.h"
 #include "state_tracker/drm_driver.h"
 #include "util/list.h"
@@ -47,6 +48,7 @@
 #define VC4_DEBUG_ALWAYS_SYNC  0x0100
 #define VC4_DEBUG_NIR       0x0200
 #define VC4_DEBUG_DUMP      0x0400
+#define VC4_DEBUG_SURFACE   0x0800
 
 #define VC4_MAX_MIP_LEVELS 12
 #define VC4_MAX_TEXTURE_SAMPLERS 16
@@ -55,6 +57,8 @@
 
 struct vc4_screen {
         struct pipe_screen base;
+        struct renderonly *ro;
+
         int fd;
 
         int v3d_ver;
@@ -91,6 +95,7 @@
         bool has_control_flow;
         bool has_etc1;
         bool has_threaded_fs;
+        bool has_tiling_ioctl;
 
         struct vc4_simulator_file *sim_file;
 };
@@ -101,14 +106,7 @@
         return (struct vc4_screen *)screen;
 }
 
-struct pipe_screen *vc4_screen_create(int fd);
-boolean vc4_screen_bo_get_handle(struct pipe_screen *pscreen,
-                                 struct vc4_bo *bo,
-                                 unsigned stride,
-                                 struct winsys_handle *whandle);
-struct vc4_bo *
-vc4_screen_bo_from_handle(struct pipe_screen *pscreen,
-                          struct winsys_handle *whandle);
+struct pipe_screen *vc4_screen_create(int fd, struct renderonly *ro);
 
 const void *
 vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 9565c49e..ff306f2 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -55,6 +55,7 @@
 #include "util/ralloc.h"
 
 #include "vc4_screen.h"
+#include "vc4_cl_dump.h"
 #include "vc4_context.h"
 #include "kernel/vc4_drv.h"
 #include "vc4_simulator_validate.h"
@@ -387,7 +388,7 @@
                         ctex->bo->size);
 #endif
 
-                for (int y = 0; y < ctex->base.b.height0; y++) {
+                for (int y = 0; y < ctex->base.height0; y++) {
                         memcpy(ctex->bo->map + y * sim_stride,
                                csim_bo->winsys_map + y * winsys_stride,
                                row_len);
@@ -448,7 +449,7 @@
         }
 
         if (ctex && csim_bo->winsys_map) {
-                for (int y = 0; y < ctex->base.b.height0; y++) {
+                for (int y = 0; y < ctex->base.height0; y++) {
                         memcpy(csim_bo->winsys_map + y * winsys_stride,
                                ctex->bo->map + y * sim_stride,
                                row_len);
@@ -652,6 +653,13 @@
                  */
                 return 0;
 
+        case DRM_IOCTL_VC4_GET_TILING:
+        case DRM_IOCTL_VC4_SET_TILING:
+                /* Disable these for now, since the sharing with i965 requires
+                 * linear buffers.
+                 */
+                return -1;
+
         case DRM_IOCTL_VC4_GET_PARAM:
                 return vc4_simulator_get_param_ioctl(fd, args);
 
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 2e00104..d6d4479 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -94,6 +94,9 @@
                             const struct pipe_rasterizer_state *cso)
 {
         struct vc4_rasterizer_state *so;
+        struct V3D21_DEPTH_OFFSET depth_offset = { V3D21_DEPTH_OFFSET_header };
+        struct V3D21_POINT_SIZE point_size = { V3D21_POINT_SIZE_header };
+        struct V3D21_LINE_WIDTH line_width = { V3D21_LINE_WIDTH_header };
 
         so = CALLOC_STRUCT(vc4_rasterizer_state);
         if (!so)
@@ -109,7 +112,9 @@
         /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
          * BCM21553).
          */
-        so->point_size = MAX2(cso->point_size, .125f);
+        point_size.point_size = MAX2(cso->point_size, .125f);
+
+        line_width.line_width = cso->line_width;
 
         if (cso->front_ccw)
                 so->config_bits[0] |= VC4_CONFIG_BITS_CW_PRIMITIVES;
@@ -117,13 +122,19 @@
         if (cso->offset_tri) {
                 so->config_bits[0] |= VC4_CONFIG_BITS_ENABLE_DEPTH_OFFSET;
 
-                so->offset_units = float_to_187_half(cso->offset_units);
-                so->offset_factor = float_to_187_half(cso->offset_scale);
+                depth_offset.depth_offset_units =
+                        float_to_187_half(cso->offset_units);
+                depth_offset.depth_offset_factor =
+                        float_to_187_half(cso->offset_scale);
         }
 
         if (cso->multisample)
                 so->config_bits[0] |= VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X;
 
+        V3D21_DEPTH_OFFSET_pack(NULL, so->packed.depth_offset, &depth_offset);
+        V3D21_POINT_SIZE_pack(NULL, so->packed.point_size, &point_size);
+        V3D21_LINE_WIDTH_pack(NULL, so->packed.line_width, &line_width);
+
         return so;
 }
 
@@ -302,24 +313,6 @@
 }
 
 static void
-vc4_set_index_buffer(struct pipe_context *pctx,
-                     const struct pipe_index_buffer *ib)
-{
-        struct vc4_context *vc4 = vc4_context(pctx);
-
-        if (ib) {
-                pipe_resource_reference(&vc4->indexbuf.buffer, ib->buffer);
-                vc4->indexbuf.index_size = ib->index_size;
-                vc4->indexbuf.offset = ib->offset;
-                vc4->indexbuf.user_buffer = ib->user_buffer;
-        } else {
-                pipe_resource_reference(&vc4->indexbuf.buffer, NULL);
-        }
-
-        vc4->dirty |= VC4_DIRTY_INDEXBUF;
-}
-
-static void
 vc4_blend_state_bind(struct pipe_context *pctx, void *hwcso)
 {
         struct vc4_context *vc4 = vc4_context(pctx);
@@ -563,6 +556,9 @@
         so->base = *cso;
 
         pipe_reference(NULL, &prsc->reference);
+        so->base.texture = prsc;
+        so->base.reference.count = 1;
+        so->base.context = pctx;
 
         /* There is no hardware level clamping, and the start address of a
          * texture may be misaligned, so in that case we have to copy to a
@@ -574,33 +570,36 @@
         if ((cso->u.tex.first_level &&
              (cso->u.tex.first_level != cso->u.tex.last_level)) ||
             rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R) {
-                struct vc4_resource *shadow_parent = vc4_resource(prsc);
-                struct pipe_resource tmpl = shadow_parent->base.b;
-                struct vc4_resource *clone;
+                struct vc4_resource *shadow_parent = rsc;
+                struct pipe_resource tmpl = *prsc;
 
                 tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
                 tmpl.width0 = u_minify(tmpl.width0, cso->u.tex.first_level);
                 tmpl.height0 = u_minify(tmpl.height0, cso->u.tex.first_level);
                 tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level;
 
+                /* Create the shadow texture.  The rest of the texture
+                 * parameter setup will use the shadow.
+                 */
                 prsc = vc4_resource_create(pctx->screen, &tmpl);
                 if (!prsc) {
                         free(so);
                         return NULL;
                 }
                 rsc = vc4_resource(prsc);
-                clone = vc4_resource(prsc);
-                clone->shadow_parent = &shadow_parent->base.b;
-                /* Flag it as needing update of the contents from the parent. */
-                clone->writes = shadow_parent->writes - 1;
 
-                assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R);
-        } else if (cso->u.tex.first_level) {
-                so->force_first_level = true;
+                /* Flag it as needing update of the contents from the parent. */
+                rsc->writes = shadow_parent->writes - 1;
+                assert(rsc->vc4_format != VC4_TEXTURE_TYPE_RGBA32R);
+
+                so->texture = prsc;
+        } else {
+                pipe_resource_reference(&so->texture, prsc);
+
+                if (cso->u.tex.first_level) {
+                        so->force_first_level = true;
+                }
         }
-        so->base.texture = prsc;
-        so->base.reference.count = 1;
-        so->base.context = pctx;
 
         so->texture_p0 =
                 (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
@@ -624,8 +623,10 @@
 
 static void
 vc4_sampler_view_destroy(struct pipe_context *pctx,
-                         struct pipe_sampler_view *view)
+                         struct pipe_sampler_view *pview)
 {
+        struct vc4_sampler_view *view = vc4_sampler_view(pview);
+        pipe_resource_reference(&pview->texture, NULL);
         pipe_resource_reference(&view->texture, NULL);
         free(view);
 }
@@ -670,7 +671,6 @@
         pctx->set_viewport_states = vc4_set_viewport_states;
 
         pctx->set_vertex_buffers = vc4_set_vertex_buffers;
-        pctx->set_index_buffer = vc4_set_index_buffer;
 
         pctx->create_blend_state = vc4_create_blend_state;
         pctx->bind_blend_state = vc4_blend_state_bind;
diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h
index ba1ad6f..66767e7 100644
--- a/src/gallium/drivers/vc4/vc4_tiling.h
+++ b/src/gallium/drivers/vc4/vc4_tiling.h
@@ -27,6 +27,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include "util/macros.h"
+#include "util/u_cpu_detect.h"
 
 /** Return the width in pixels of a 64-byte microtile. */
 static inline uint32_t
@@ -83,23 +84,20 @@
                            uint8_t tiling_format, int cpp,
                            const struct pipe_box *box);
 
-/* If we're building for ARMv7 (Pi 2+), assume it has NEON.  For Raspbian we
- * should extend this to have some runtime detection of being built for ARMv6
- * on a Pi 2+.
- */
-#if defined(__ARM_ARCH) && __ARM_ARCH == 7
-#define NEON_SUFFIX(x) x ## _neon
-#else
-#define NEON_SUFFIX(x) x ## _base
-#endif
-
 static inline void
 vc4_load_lt_image(void *dst, uint32_t dst_stride,
                   void *src, uint32_t src_stride,
                   int cpp, const struct pipe_box *box)
 {
-        NEON_SUFFIX(vc4_load_lt_image)(dst, dst_stride, src, src_stride,
+#ifdef USE_ARM_ASM
+        if (util_cpu_caps.has_neon) {
+                vc4_load_lt_image_neon(dst, dst_stride, src, src_stride,
                                        cpp, box);
+                return;
+        }
+#endif
+        vc4_load_lt_image_base(dst, dst_stride, src, src_stride,
+                               cpp, box);
 }
 
 static inline void
@@ -107,10 +105,16 @@
                    void *src, uint32_t src_stride,
                    int cpp, const struct pipe_box *box)
 {
-        NEON_SUFFIX(vc4_store_lt_image)(dst, dst_stride, src, src_stride,
+#ifdef USE_ARM_ASM
+        if (util_cpu_caps.has_neon) {
+                vc4_store_lt_image_neon(dst, dst_stride, src, src_stride,
                                         cpp, box);
-}
+                return;
+        }
+#endif
 
-#undef NEON_SUFFIX
+        vc4_store_lt_image_base(dst, dst_stride, src, src_stride,
+                                cpp, box);
+}
 
 #endif /* VC4_TILING_H */
diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c b/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c
new file mode 100644
index 0000000..7ba66ae
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* Wrapper file for building vc4_tiling_lt.c with the "build NEON assembly if
+ * possible" flag set, since Android.mk doesn't have a way to set CFLAGS for a
+ * single file.
+ */
+
+#define VC4_BUILD_NEON
+#include "vc4_tiling_lt.c"
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index 07781b8..b816934 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -35,7 +35,7 @@
 {
         struct vc4_sampler_view *sview =
                 vc4_sampler_view(texstate->textures[unit]);
-        struct vc4_resource *rsc = vc4_resource(sview->base.texture);
+        struct vc4_resource *rsc = vc4_resource(sview->texture);
 
         cl_reloc(job, &job->uniforms, uniforms, rsc->bo, sview->texture_p0);
 }
diff --git a/src/gallium/drivers/virgl/Android.mk b/src/gallium/drivers/virgl/Android.mk
index 7c1ba42..0067dfa 100644
--- a/src/gallium/drivers/virgl/Android.mk
+++ b/src/gallium/drivers/virgl/Android.mk
@@ -32,3 +32,8 @@
 
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_VIRGL),)
+GALLIUM_TARGET_DRIVERS += virtio_gpu
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_virgl libmesa_winsys_virgl_vtest)
+endif
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index 4b990a6..232d295 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -30,6 +30,7 @@
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_format.h"
+#include "util/u_prim.h"
 #include "util/u_transfer.h"
 #include "util/u_helpers.h"
 #include "util/slab.h"
@@ -123,18 +124,19 @@
    unsigned i;
 
    for (i = 0; i < vctx->num_vertex_buffers; i++) {
-      res = virgl_resource(vctx->vertex_buffer[i].buffer);
+      res = virgl_resource(vctx->vertex_buffer[i].buffer.resource);
       if (res)
          vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
    }
 }
 
-static void virgl_attach_res_index_buffer(struct virgl_context *vctx)
+static void virgl_attach_res_index_buffer(struct virgl_context *vctx,
+					  struct virgl_indexbuf *ib)
 {
    struct virgl_winsys *vws = virgl_screen(vctx->base.screen)->vws;
    struct virgl_resource *res;
 
-   res = virgl_resource(vctx->index_buffer.buffer);
+   res = virgl_resource(ib->buffer);
    if (res)
       vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
 }
@@ -182,7 +184,6 @@
       virgl_attach_res_sampler_views(vctx, shader_type);
       virgl_attach_res_uniform_buffers(vctx, shader_type);
    }
-   virgl_attach_res_index_buffer(vctx);
    virgl_attach_res_vertex_buffers(vctx);
    virgl_attach_res_so_targets(vctx);
 }
@@ -403,25 +404,12 @@
    virgl_encoder_set_blend_color(vctx, color);
 }
 
-static void virgl_set_index_buffer(struct pipe_context *ctx,
-                                  const struct pipe_index_buffer *ib)
-{
-   struct virgl_context *vctx = virgl_context(ctx);
-
-   if (ib) {
-      pipe_resource_reference(&vctx->index_buffer.buffer, ib->buffer);
-      memcpy(&vctx->index_buffer, ib, sizeof(*ib));
-   } else {
-      pipe_resource_reference(&vctx->index_buffer.buffer, NULL);
-   }
-}
-
 static void virgl_hw_set_index_buffer(struct pipe_context *ctx,
-                                     struct pipe_index_buffer *ib)
+                                     struct virgl_indexbuf *ib)
 {
    struct virgl_context *vctx = virgl_context(ctx);
    virgl_encoder_set_index_buffer(vctx, ib);
-   virgl_attach_res_index_buffer(vctx);
+   virgl_attach_res_index_buffer(vctx, ib);
 }
 
 static void virgl_set_constant_buffer(struct pipe_context *ctx,
@@ -589,19 +577,23 @@
 {
    struct virgl_context *vctx = virgl_context(ctx);
    struct virgl_screen *rs = virgl_screen(ctx->screen);
-   struct pipe_index_buffer ib = {};
+   struct virgl_indexbuf ib = {};
    struct pipe_draw_info info = *dinfo;
 
+   if (!dinfo->count_from_stream_output && !dinfo->indirect &&
+       !dinfo->primitive_restart &&
+       !u_trim_pipe_prim(dinfo->mode, (unsigned*)&dinfo->count))
+      return;
+
    if (!(rs->caps.caps.v1.prim_mask & (1 << dinfo->mode))) {
-      util_primconvert_save_index_buffer(vctx->primconvert, &vctx->index_buffer);
       util_primconvert_draw_vbo(vctx->primconvert, dinfo);
       return;
    }
-   if (info.indexed) {
-           pipe_resource_reference(&ib.buffer, vctx->index_buffer.buffer);
-           ib.user_buffer = vctx->index_buffer.user_buffer;
-           ib.index_size = vctx->index_buffer.index_size;
-           ib.offset = vctx->index_buffer.offset + info.start * ib.index_size;
+   if (info.index_size) {
+           pipe_resource_reference(&ib.buffer, info.has_user_indices ? NULL : info.index.resource);
+           ib.user_buffer = info.has_user_indices ? info.index.user : NULL;
+           ib.index_size = dinfo->index_size;
+           ib.offset = info.start * ib.index_size;
 
            if (ib.user_buffer) {
                    u_upload_data(vctx->uploader, 0, info.count * ib.index_size, 256,
@@ -614,7 +606,7 @@
 
    vctx->num_draws++;
    virgl_hw_set_vertex_buffers(ctx);
-   if (info.indexed)
+   if (info.index_size)
       virgl_hw_set_index_buffer(ctx, &ib);
 
    virgl_encoder_draw_vbo(vctx, &info);
@@ -899,7 +891,6 @@
    vctx->base.bind_vertex_elements_state = virgl_bind_vertex_elements_state;
    vctx->base.delete_vertex_elements_state = virgl_delete_vertex_elements_state;
    vctx->base.set_vertex_buffers = virgl_set_vertex_buffers;
-   vctx->base.set_index_buffer = virgl_set_index_buffer;
    vctx->base.set_constant_buffer = virgl_set_constant_buffer;
 
    vctx->base.create_vs_state = virgl_create_vs_state;
diff --git a/src/gallium/drivers/virgl/virgl_context.h b/src/gallium/drivers/virgl/virgl_context.h
index 597ed49..d8d4ccb 100644
--- a/src/gallium/drivers/virgl/virgl_context.h
+++ b/src/gallium/drivers/virgl/virgl_context.h
@@ -58,7 +58,6 @@
 
    struct slab_child_pool texture_transfer_pool;
 
-   struct pipe_index_buffer index_buffer;
    struct u_upload_mgr *uploader;
 
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
diff --git a/src/gallium/drivers/virgl/virgl_encode.c b/src/gallium/drivers/virgl/virgl_encode.c
index cbe8d19..ee68fe0 100644
--- a/src/gallium/drivers/virgl/virgl_encode.c
+++ b/src/gallium/drivers/virgl/virgl_encode.c
@@ -389,7 +389,7 @@
    int i;
    virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_VERTEX_BUFFERS, 0, VIRGL_SET_VERTEX_BUFFERS_SIZE(num_buffers)));
    for (i = 0; i < num_buffers; i++) {
-      struct virgl_resource *res = virgl_resource(buffers[i].buffer);
+      struct virgl_resource *res = virgl_resource(buffers[i].buffer.resource);
       virgl_encoder_write_dword(ctx->cbuf, buffers[i].stride);
       virgl_encoder_write_dword(ctx->cbuf, buffers[i].buffer_offset);
       virgl_encoder_write_res(ctx, res);
@@ -398,7 +398,7 @@
 }
 
 int virgl_encoder_set_index_buffer(struct virgl_context *ctx,
-                                  const struct pipe_index_buffer *ib)
+                                  const struct virgl_indexbuf *ib)
 {
    int length = VIRGL_SET_INDEX_BUFFER_SIZE(ib);
    struct virgl_resource *res = NULL;
@@ -421,7 +421,7 @@
    virgl_encoder_write_dword(ctx->cbuf, info->start);
    virgl_encoder_write_dword(ctx->cbuf, info->count);
    virgl_encoder_write_dword(ctx->cbuf, info->mode);
-   virgl_encoder_write_dword(ctx->cbuf, info->indexed);
+   virgl_encoder_write_dword(ctx->cbuf, !!info->index_size);
    virgl_encoder_write_dword(ctx->cbuf, info->instance_count);
    virgl_encoder_write_dword(ctx->cbuf, info->index_bias);
    virgl_encoder_write_dword(ctx->cbuf, info->start_instance);
diff --git a/src/gallium/drivers/virgl/virgl_encode.h b/src/gallium/drivers/virgl/virgl_encode.h
index 78d4194..02c032d 100644
--- a/src/gallium/drivers/virgl/virgl_encode.h
+++ b/src/gallium/drivers/virgl/virgl_encode.h
@@ -39,6 +39,13 @@
    uint32_t handle;
 };
 
+struct virgl_indexbuf {
+   unsigned offset;
+   unsigned index_size;  /**< size of an index, in bytes */
+   struct pipe_resource *buffer; /**< the actual buffer */
+   const void *user_buffer;  /**< pointer to a user buffer if buffer == NULL */
+};
+
 static inline struct virgl_surface *virgl_surface(struct pipe_surface *surf)
 {
    return (struct virgl_surface *)surf;
@@ -167,7 +174,7 @@
                                     uint32_t *handles);
 
 int virgl_encoder_set_index_buffer(struct virgl_context *ctx,
-                                  const struct pipe_index_buffer *ib);
+                                  const struct virgl_indexbuf *ib);
 
 uint32_t virgl_object_assign_handle(void);
 
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index ff500ee..5df0840 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -27,7 +27,6 @@
 #include "os/os_time.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
-#include "draw/draw_context.h"
 
 #include "tgsi/tgsi_exec.h"
 
@@ -262,6 +261,10 @@
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_DOUBLES:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+   case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_BINDLESS_TEXTURE:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
@@ -327,6 +330,7 @@
       case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
          return 4096 * sizeof(float[4]);
       case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+      case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
       default:
          return 0;
       }
diff --git a/src/gallium/drivers/virgl/virgl_tgsi.c b/src/gallium/drivers/virgl/virgl_tgsi.c
index 4a2271f..7ad1cbd 100644
--- a/src/gallium/drivers/virgl/virgl_tgsi.c
+++ b/src/gallium/drivers/virgl/virgl_tgsi.c
@@ -48,6 +48,15 @@
    }
 }
 
+static void
+virgl_tgsi_transform_instruction(struct tgsi_transform_context *ctx,
+				 struct tgsi_full_instruction *inst)
+{
+   if (inst->Instruction.Precise)
+      inst->Instruction.Precise = 0;
+   ctx->emit_instruction(ctx, inst);
+}
+
 struct tgsi_token *virgl_tgsi_transform(const struct tgsi_token *tokens_in)
 {
 
@@ -61,6 +70,7 @@
 
    memset(&transform, 0, sizeof(transform));
    transform.base.transform_property = virgl_tgsi_transform_property;
+   transform.base.transform_instruction = virgl_tgsi_transform_instruction;
    tgsi_transform_shader(tokens_in, new_tokens, newLen, &transform.base);
 
    return new_tokens;
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 0d7b014..2869517 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -93,17 +93,6 @@
 #endif
 #endif
 
-/* Forced function inlining */
-#ifndef ALWAYS_INLINE
-#  ifdef __GNUC__
-#    define ALWAYS_INLINE inline __attribute__((always_inline))
-#  elif defined(_MSC_VER)
-#    define ALWAYS_INLINE __forceinline
-#  else
-#    define ALWAYS_INLINE inline
-#  endif
-#endif
-
 
 /* XXX: Use standard `__func__` instead */
 #ifndef __FUNCTION__
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index 98c433f..3fa43ed 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -53,6 +53,7 @@
 
 #if defined(__GNUC__)
 #define PIPE_CC_GCC
+#define PIPE_CC_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 #endif
 
 /*
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 4d5535b..c2b1ad2 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -53,7 +53,6 @@
 struct pipe_fence_handle;
 struct pipe_framebuffer_state;
 struct pipe_image_view;
-struct pipe_index_buffer;
 struct pipe_query;
 struct pipe_poly_stipple;
 struct pipe_rasterizer_state;
@@ -354,9 +353,6 @@
                                unsigned num_buffers,
                                const struct pipe_vertex_buffer * );
 
-   void (*set_index_buffer)( struct pipe_context *pipe,
-                             const struct pipe_index_buffer * );
-
    /*@}*/
 
    /**
@@ -770,6 +766,65 @@
                               unsigned last_level,
                               unsigned first_layer,
                               unsigned last_layer);
+
+   /**
+    * Create a 64-bit texture handle.
+    *
+    * \param ctx        pipe context
+    * \param view       pipe sampler view object
+    * \param state      pipe sampler state template
+    * \return           a 64-bit texture handle if success, 0 otherwise
+    */
+   uint64_t (*create_texture_handle)(struct pipe_context *ctx,
+                                     struct pipe_sampler_view *view,
+                                     const struct pipe_sampler_state *state);
+
+   /**
+    * Delete a texture handle.
+    *
+    * \param ctx        pipe context
+    * \param handle     64-bit texture handle
+    */
+   void (*delete_texture_handle)(struct pipe_context *ctx, uint64_t handle);
+
+   /**
+    * Make a texture handle resident.
+    *
+    * \param ctx        pipe context
+    * \param handle     64-bit texture handle
+    * \param resident   TRUE for resident, FALSE otherwise
+    */
+   void (*make_texture_handle_resident)(struct pipe_context *ctx,
+                                        uint64_t handle, bool resident);
+
+   /**
+    * Create a 64-bit image handle.
+    *
+    * \param ctx        pipe context
+    * \param image      pipe image view template
+    * \return           a 64-bit image handle if success, 0 otherwise
+    */
+   uint64_t (*create_image_handle)(struct pipe_context *ctx,
+                                   const struct pipe_image_view *image);
+
+   /**
+    * Delete an image handle.
+    *
+    * \param ctx        pipe context
+    * \param handle     64-bit image handle
+    */
+   void (*delete_image_handle)(struct pipe_context *ctx, uint64_t handle);
+
+   /**
+    * Make an image handle resident.
+    *
+    * \param ctx        pipe context
+    * \param handle     64-bit image handle
+    * \param access     GL_READ_ONLY, GL_WRITE_ONLY or GL_READ_WRITE
+    * \param resident   TRUE for resident, FALSE otherwise
+    */
+   void (*make_image_handle_resident)(struct pipe_context *ctx, uint64_t handle,
+                                      unsigned access, bool resident);
 };
 
 
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index a389b11..2ccdf44 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -384,6 +384,20 @@
 #define PIPE_CONTEXT_ROBUST_BUFFER_ACCESS (1 << 2)
 
 /**
+ * Prefer threaded pipe_context. It also implies that video codec functions
+ * will not be used (they will be either no-ops or NULL when threading is
+ * enabled).
+ */
+#define PIPE_CONTEXT_PREFER_THREADED   (1 << 3)
+
+/**
+ * Implicit and explicit derivatives after KILL behave as if KILL didn't
+ * happen.
+ */
+#define PIPE_SCREEN_ENABLE_CORRECT_TGSI_DERIVATIVES_AFTER_KILL (1 << 0)
+
+
+/**
  * Flags for pipe_context::memory_barrier.
  */
 #define PIPE_BARRIER_MAPPED_BUFFER     (1 << 0)
@@ -762,6 +776,10 @@
    PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE,
    PIPE_CAP_TGSI_BALLOT,
    PIPE_CAP_TGSI_TES_LAYER_VIEWPORT,
+   PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX,
+   PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION,
+   PIPE_CAP_POST_DEPTH_COVERAGE,
+   PIPE_CAP_BINDLESS_TEXTURE,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
@@ -830,6 +848,7 @@
    PIPE_SHADER_CAP_SUPPORTED_IRS,
    PIPE_SHADER_CAP_MAX_SHADER_IMAGES,
    PIPE_SHADER_CAP_LOWER_IF_THRESHOLD,
+   PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS,
 };
 
 /**
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 8b4239c..65e954a 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -328,6 +328,35 @@
     * driver doesn't support an on-disk shader cache.
     */
    struct disk_cache *(*get_disk_shader_cache)(struct pipe_screen *screen);
+
+   /**
+    * Create a new texture object from the given template info, taking
+    * format modifiers into account. \p modifiers specifies a list of format
+    * modifier tokens, as defined in drm_fourcc.h. The driver then picks the
+    * best modifier among these and creates the resource. \p count must
+    * contain the size of \p modifiers array.
+    *
+    * Returns NULL if an entry in \p modifiers is unsupported by the driver,
+    * or if only DRM_FORMAT_MOD_INVALID is provided.
+    */
+   struct pipe_resource * (*resource_create_with_modifiers)(
+                           struct pipe_screen *,
+                           const struct pipe_resource *templat,
+                           const uint64_t *modifiers, int count);
+
+   /**
+    * Get supported modifiers for a format.
+    * If \p max is 0, the total number of supported modifiers for the supplied
+    * format is returned in \p count, with no modification to \p modifiers.
+    * Otherwise, \p modifiers is filled with up to \p max supported modifier
+    * codes, and \p count with the number of modifiers copied.
+    * The \p external_only array is used to return whether the format and
+    * modifier combination can only be used with an external texture target.
+    */
+   void (*query_dmabuf_modifiers)(struct pipe_screen *screen,
+                                  enum pipe_format format, int max,
+                                  uint64_t *modifiers,
+                                  unsigned int *external_only, int *count);
 };
 
 
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index a671121..aa0fb3e 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -233,6 +233,7 @@
    TGSI_RETURN_TYPE_SINT,
    TGSI_RETURN_TYPE_UINT,
    TGSI_RETURN_TYPE_FLOAT,
+   TGSI_RETURN_TYPE_UNKNOWN,
    TGSI_RETURN_TYPE_COUNT
 };
 
@@ -292,6 +293,7 @@
    TGSI_PROPERTY_NUM_CLIPDIST_ENABLED,
    TGSI_PROPERTY_NUM_CULLDIST_ENABLED,
    TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL,
+   TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE,
    TGSI_PROPERTY_NEXT_SHADER,
    TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH,
    TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT,
@@ -636,7 +638,8 @@
    unsigned Label      : 1;
    unsigned Texture    : 1;
    unsigned Memory     : 1;
-   unsigned Padding    : 2;
+   unsigned Precise    : 1;
+   unsigned Padding    : 1;
 };
 
 /*
@@ -694,7 +697,8 @@
 {
    unsigned Texture  : 8;    /* TGSI_TEXTURE_ */
    unsigned NumOffsets : 4;
-   unsigned Padding : 20;
+   unsigned ReturnType : 3; /* TGSI_RETURN_TYPE_x */
+   unsigned Padding : 17;
 };
 
 /* for texture offsets in GLSL and DirectX.
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index ce9ca34..15be8cb 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -534,7 +534,6 @@
 };
 
 
-
 /**
  * A vertex buffer.  Typically, all the vertex data/attributes for
  * drawing something will be in one buffer.  But it's also possible, for
@@ -542,10 +541,14 @@
  */
 struct pipe_vertex_buffer
 {
-   unsigned stride;    /**< stride to same attrib in next vertex, in bytes */
+   uint16_t stride;    /**< stride to same attrib in next vertex, in bytes */
+   bool is_user_buffer;
    unsigned buffer_offset;  /**< offset to start of data in buffer, in bytes */
-   struct pipe_resource *buffer;  /**< the actual buffer */
-   const void *user_buffer;  /**< pointer to a user buffer if buffer == NULL */
+
+   union {
+      struct pipe_resource *resource;  /**< the actual buffer */
+      const void *user;  /**< pointer to a user buffer */
+   } buffer;
 };
 
 
@@ -625,16 +628,37 @@
 };
 
 
-/**
- * An index buffer.  When an index buffer is bound, all indices to vertices
- * will be looked up in the buffer.
- */
-struct pipe_index_buffer
+struct pipe_draw_indirect_info
 {
-   unsigned index_size;  /**< size of an index, in bytes */
-   unsigned offset;  /**< offset to start of data in buffer, in bytes */
-   struct pipe_resource *buffer; /**< the actual buffer */
-   const void *user_buffer;  /**< pointer to a user buffer if buffer == NULL */
+   unsigned offset; /**< must be 4 byte aligned */
+   unsigned stride; /**< must be 4 byte aligned */
+   unsigned draw_count; /**< number of indirect draws */
+   unsigned indirect_draw_count_offset; /**< must be 4 byte aligned */
+
+   /* Indirect draw parameters resource is laid out as follows:
+    *
+    * if using indexed drawing:
+    *  struct {
+    *     uint32_t count;
+    *     uint32_t instance_count;
+    *     uint32_t start;
+    *     int32_t index_bias;
+    *     uint32_t start_instance;
+    *  };
+    * otherwise:
+    *  struct {
+    *     uint32_t count;
+    *     uint32_t instance_count;
+    *     uint32_t start;
+    *     uint32_t start_instance;
+    *  };
+    */
+   struct pipe_resource *buffer;
+
+   /* Indirect draw count resource: If not NULL, contains a 32-bit value which
+    * is to be used as the real draw_count.
+    */
+   struct pipe_resource *indirect_draw_count;
 };
 
 
@@ -643,12 +667,18 @@
  */
 struct pipe_draw_info
 {
-   boolean indexed;  /**< use index buffer */
+   ubyte index_size;  /**< if 0, the draw is not indexed. */
    enum pipe_prim_type mode:8;  /**< the mode of the primitive */
-   boolean primitive_restart;
+   unsigned primitive_restart:1;
+   unsigned has_user_indices:1; /**< if true, use index.user */
    ubyte vertices_per_patch; /**< the number of vertices per patch */
 
-   unsigned start;  /**< the index of the first vertex */
+   /**
+    * Direct draws: start is the index of the first vertex
+    * Non-indexed indirect draws: not used
+    * Indexed indirect draws: start is added to the indirect start.
+    */
+   unsigned start;
    unsigned count;  /**< number of vertices */
 
    unsigned start_instance; /**< first instance id */
@@ -668,40 +698,20 @@
     */
    unsigned restart_index;
 
-   unsigned indirect_offset; /**< must be 4 byte aligned */
-   unsigned indirect_stride; /**< must be 4 byte aligned */
-   unsigned indirect_count; /**< number of indirect draws */
-
-   unsigned indirect_params_offset; /**< must be 4 byte aligned */
-
    /* Pointers must be at the end for an optimal structure layout on 64-bit. */
 
-   /* Indirect draw parameters resource: If not NULL, most values are taken
-    * from this buffer instead, which is laid out as follows:
+   /**
+    * An index buffer.  When an index buffer is bound, all indices to vertices
+    * will be looked up from the buffer.
     *
-    * if indexed is TRUE:
-    *  struct {
-    *     uint32_t count;
-    *     uint32_t instance_count;
-    *     uint32_t start;
-    *     int32_t index_bias;
-    *     uint32_t start_instance;
-    *  };
-    * otherwise:
-    *  struct {
-    *     uint32_t count;
-    *     uint32_t instance_count;
-    *     uint32_t start;
-    *     uint32_t start_instance;
-    *  };
+    * If has_user_indices, use index.user, else use index.resource.
     */
-   struct pipe_resource *indirect;
+   union {
+      struct pipe_resource *resource;  /**< real buffer */
+      const void *user;  /**< pointer to a user buffer */
+   } index;
 
-   /* Indirect draw count resource: If not NULL, contains a 32-bit value which
-    * is to be used as the real indirect_count. In that case indirect_count
-    * becomes the maximum possible value.
-    */
-   struct pipe_resource *indirect_params;
+   struct pipe_draw_indirect_info *indirect; /**< Indirect draw. */
 
    /**
     * Stream output target. If not NULL, it's used to provide the 'count'
diff --git a/src/gallium/include/state_tracker/drm_driver.h b/src/gallium/include/state_tracker/drm_driver.h
index c80fb09..4383276 100644
--- a/src/gallium/include/state_tracker/drm_driver.h
+++ b/src/gallium/include/state_tracker/drm_driver.h
@@ -45,6 +45,12 @@
     * Output for texture_get_handle.
     */
    unsigned offset;
+
+   /**
+    * Input to resource_from_handle.
+    * Output from resource_get_handle.
+    */
+   uint64_t modifier;
 };
 
 
@@ -96,7 +102,7 @@
     * This function does any wrapping of the screen.
     * For example wrapping trace or rbug debugging drivers around it.
     */
-   struct pipe_screen* (*create_screen)(int drm_fd);
+   struct pipe_screen* (*create_screen)(int drm_fd, unsigned flags);
 
    /**
     * Return a configuration value.
diff --git a/src/gallium/include/state_tracker/st_api.h b/src/gallium/include/state_tracker/st_api.h
index 9d0eb3a..bc62a69 100644
--- a/src/gallium/include/state_tracker/st_api.h
+++ b/src/gallium/include/state_tracker/st_api.h
@@ -90,6 +90,7 @@
 #define ST_CONTEXT_FLAG_FORWARD_COMPATIBLE  (1 << 1)
 #define ST_CONTEXT_FLAG_ROBUST_ACCESS       (1 << 2)
 #define ST_CONTEXT_FLAG_RESET_NOTIFICATION_ENABLED (1 << 3)
+#define ST_CONTEXT_FLAG_NO_ERROR            (1 << 4)
 
 /**
  * Reasons that context creation might fail.
@@ -179,6 +180,7 @@
 struct pipe_context;
 struct pipe_resource;
 struct pipe_fence_handle;
+struct util_queue_monitoring;
 
 /**
  * Used in st_context_iface->get_resource_for_egl_image.
@@ -246,6 +248,7 @@
    unsigned force_glsl_version;
    boolean force_s3tc_enable;
    boolean allow_glsl_extension_directive_midshader;
+   boolean allow_glsl_builtin_variable_redeclaration;
    boolean allow_higher_compat_version;
    boolean glsl_zero_init;
    boolean force_glsl_abs_sqrt;
@@ -281,6 +284,7 @@
 };
 
 struct st_context_iface;
+struct st_manager;
 
 /**
  * Represent a windowing system drawable.
@@ -309,6 +313,16 @@
    int32_t stamp;
 
    /**
+    * Identifier that uniquely identifies the framebuffer interface object.
+    */
+   uint32_t ID;
+
+   /**
+    * The state tracker manager that manages this object.
+    */
+   struct st_manager *state_manager;
+
+   /**
     * Available for the state tracker manager to use.
     */
    void *st_manager_private;
@@ -368,6 +382,11 @@
    void *st_manager_private;
 
    /**
+    * The state tracker manager that manages this object.
+    */
+   struct st_manager *state_manager;
+
+   /**
     * The CSO context associated with this context in case we need to draw
     * something before swap buffers.
     */
@@ -473,7 +492,18 @@
     * Call the loader function setBackgroundContext. Called from the worker
     * thread.
     */
-   void (*set_background_context)(struct st_context_iface *stctxi);
+   void (*set_background_context)(struct st_context_iface *stctxi,
+                                  struct util_queue_monitoring *queue_info);
+
+   /**
+    * Destroy any private data used by the state tracker manager.
+    */
+   void (*destroy)(struct st_manager *smapi);
+
+   /**
+    * Available for the state tracker manager to use.
+    */
+   void *st_manager_private;
 };
 
 /**
@@ -543,6 +573,13 @@
     * Get the currently bound context in the calling thread.
     */
    struct st_context_iface *(*get_current)(struct st_api *stapi);
+
+   /**
+    * Notify the st manager that the framebuffer interface object
+    * is no longer valid.
+    */
+   void (*destroy_drawable)(struct st_api *stapi,
+                            struct st_framebuffer_iface *stfbi);
 };
 
 /**
diff --git a/src/gallium/state_trackers/clover/api/device.cpp b/src/gallium/state_trackers/clover/api/device.cpp
index a80ca46..0b33350 100644
--- a/src/gallium/state_trackers/clover/api/device.cpp
+++ b/src/gallium/state_trackers/clover/api/device.cpp
@@ -336,7 +336,7 @@
       break;
 
    case CL_DEVICE_HOST_UNIFIED_MEMORY:
-      buf.as_scalar<cl_bool>() = CL_TRUE;
+      buf.as_scalar<cl_bool>() = dev.has_unified_memory();
       break;
 
    case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
index 158c9aa..2ad9e49 100644
--- a/src/gallium/state_trackers/clover/core/device.cpp
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -42,7 +42,7 @@
 
 device::device(clover::platform &platform, pipe_loader_device *ldev) :
    platform(platform), ldev(ldev) {
-   pipe = pipe_loader_create_screen(ldev);
+   pipe = pipe_loader_create_screen(ldev, 0);
    if (!pipe || !pipe->get_param(pipe, PIPE_CAP_COMPUTE)) {
       if (pipe)
          pipe->destroy(pipe);
@@ -189,6 +189,11 @@
    return pipe->get_param(pipe, PIPE_CAP_DOUBLES);
 }
 
+bool
+device::has_unified_memory() const {
+   return pipe->get_param(pipe, PIPE_CAP_UMA);
+}
+
 std::vector<size_t>
 device::max_block_size() const {
    auto v = get_compute_param<uint64_t>(pipe, ir_format(),
diff --git a/src/gallium/state_trackers/clover/core/device.hpp b/src/gallium/state_trackers/clover/core/device.hpp
index 94a61d1..7b3353d 100644
--- a/src/gallium/state_trackers/clover/core/device.hpp
+++ b/src/gallium/state_trackers/clover/core/device.hpp
@@ -67,6 +67,7 @@
       cl_uint max_compute_units() const;
       bool image_support() const;
       bool has_doubles() const;
+      bool has_unified_memory() const;
 
       std::vector<size_t> max_block_size() const;
       cl_uint subgroup_size() const;
diff --git a/src/gallium/state_trackers/clover/llvm/codegen/common.cpp b/src/gallium/state_trackers/clover/llvm/codegen/common.cpp
index 0751834..ddf2083 100644
--- a/src/gallium/state_trackers/clover/llvm/codegen/common.cpp
+++ b/src/gallium/state_trackers/clover/llvm/codegen/common.cpp
@@ -70,7 +70,6 @@
    make_kernel_args(const Module &mod, const std::string &kernel_name,
                     const clang::CompilerInstance &c) {
       std::vector<module::argument> args;
-      const auto address_spaces = c.getTarget().getAddressSpaceMap();
       const Function &f = *mod.getFunction(kernel_name);
       ::llvm::DataLayout dl(&mod);
       const auto size_type =
@@ -128,8 +127,8 @@
                const unsigned address_space =
                   cast< ::llvm::PointerType>(actual_type)->getAddressSpace();
 
-               if (address_space == address_spaces[clang::LangAS::opencl_local
-                                                   - compat::lang_as_offset]) {
+               if (address_space == compat::target_address_space(
+                                  c.getTarget(), clang::LangAS::opencl_local)) {
                   args.emplace_back(module::argument::local, arg_api_size,
                                     target_size, target_align,
                                     module::argument::zero_ext);
diff --git a/src/gallium/state_trackers/clover/llvm/compat.hpp b/src/gallium/state_trackers/clover/llvm/compat.hpp
index cee51b9..1b4bc23 100644
--- a/src/gallium/state_trackers/clover/llvm/compat.hpp
+++ b/src/gallium/state_trackers/clover/llvm/compat.hpp
@@ -68,10 +68,20 @@
          typedef ::llvm::TargetLibraryInfo target_library_info;
 #endif
 
+         template<typename T, typename AS>
+         unsigned target_address_space(const T &target, const AS lang_as) {
+            const auto &map = target.getAddressSpaceMap();
 #if HAVE_LLVM >= 0x0500
-         const auto lang_as_offset = 0;
+            return map[static_cast<unsigned>(lang_as)];
 #else
-         const auto lang_as_offset = clang::LangAS::Offset;
+            return map[lang_as - clang::LangAS::Offset];
+#endif
+         }
+
+#if HAVE_LLVM >= 0x0500
+         const clang::InputKind ik_opencl = clang::InputKind::OpenCL;
+#else
+         const clang::InputKind ik_opencl = clang::IK_OpenCL;
 #endif
 
          inline void
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index deebef5..6412377 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -126,7 +126,7 @@
       c->getDiagnosticOpts().ShowCarets = false;
 
       compat::set_lang_defaults(c->getInvocation(), c->getLangOpts(),
-                                clang::IK_OpenCL, ::llvm::Triple(target.triple),
+                                compat::ik_opencl, ::llvm::Triple(target.triple),
                                 c->getPreprocessorOpts(),
                                 clang::LangStandard::lang_opencl11);
 
diff --git a/src/gallium/state_trackers/dri/Android.mk b/src/gallium/state_trackers/dri/Android.mk
index fd322a3..a867e50 100644
--- a/src/gallium/state_trackers/dri/Android.mk
+++ b/src/gallium/state_trackers/dri/Android.mk
@@ -27,7 +27,9 @@
 
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES := $(common_SOURCES)
+LOCAL_SRC_FILES := \
+	$(common_SOURCES) \
+	$(dri2_SOURCES)
 
 LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/mapi \
@@ -40,14 +42,10 @@
 LOCAL_STATIC_LIBRARIES := \
 	libmesa_dri_common
 
-ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
+ifneq ($(HAVE_GALLIUM_SOFTPIPE),)
 LOCAL_SRC_FILES += $(drisw_SOURCES)
 endif
 
-ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
-LOCAL_SRC_FILES += $(dri2_SOURCES)
-endif
-
 LOCAL_MODULE := libmesa_st_dri
 
 LOCAL_GENERATED_SOURCES := $(MESA_DRI_OPTIONS_H)
diff --git a/src/gallium/state_trackers/dri/Makefile.sources b/src/gallium/state_trackers/dri/Makefile.sources
index 52d60ac..46da886 100644
--- a/src/gallium/state_trackers/dri/Makefile.sources
+++ b/src/gallium/state_trackers/dri/Makefile.sources
@@ -3,6 +3,8 @@
 	dri_context.h \
 	dri_drawable.c \
 	dri_drawable.h \
+	dri_extensions.c \
+	dri_extensions.h \
 	dri_query_renderer.c \
 	dri_query_renderer.h \
 	dri_screen.c \
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index ed6004f..27477d4 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -29,7 +29,6 @@
  */
 
 #include <xf86drm.h>
-#include <dlfcn.h>
 #include <fcntl.h>
 #include "GL/mesa_glinterop.h"
 #include "util/u_memory.h"
@@ -49,9 +48,41 @@
 #include "dri_screen.h"
 #include "dri_context.h"
 #include "dri_drawable.h"
+#include "dri_extensions.h"
 #include "dri_query_renderer.h"
 #include "dri2_buffer.h"
 
+#ifndef DRM_FORMAT_MOD_INVALID
+#define DRM_FORMAT_MOD_INVALID ((1ULL<<56) - 1)
+#endif
+
+static const int fourcc_formats[] = {
+   __DRI_IMAGE_FOURCC_ARGB8888,
+   __DRI_IMAGE_FOURCC_ABGR8888,
+   __DRI_IMAGE_FOURCC_SARGB8888,
+   __DRI_IMAGE_FOURCC_XRGB8888,
+   __DRI_IMAGE_FOURCC_XBGR8888,
+   __DRI_IMAGE_FOURCC_ARGB1555,
+   __DRI_IMAGE_FOURCC_RGB565,
+   __DRI_IMAGE_FOURCC_R8,
+   __DRI_IMAGE_FOURCC_R16,
+   __DRI_IMAGE_FOURCC_GR88,
+   __DRI_IMAGE_FOURCC_GR1616,
+   __DRI_IMAGE_FOURCC_YUV410,
+   __DRI_IMAGE_FOURCC_YUV411,
+   __DRI_IMAGE_FOURCC_YUV420,
+   __DRI_IMAGE_FOURCC_YUV422,
+   __DRI_IMAGE_FOURCC_YUV444,
+   __DRI_IMAGE_FOURCC_YVU410,
+   __DRI_IMAGE_FOURCC_YVU411,
+   __DRI_IMAGE_FOURCC_YVU420,
+   __DRI_IMAGE_FOURCC_YVU422,
+   __DRI_IMAGE_FOURCC_YVU444,
+   __DRI_IMAGE_FOURCC_NV12,
+   __DRI_IMAGE_FOURCC_NV16,
+   __DRI_IMAGE_FOURCC_YUYV
+};
+
 static int convert_fourcc(int format, int *dri_components_p)
 {
    int dri_components;
@@ -155,6 +186,9 @@
    case __DRI_IMAGE_FORMAT_ARGB8888:
       pf = PIPE_FORMAT_BGRA8888_UNORM;
       break;
+   case __DRI_IMAGE_FORMAT_XBGR8888:
+      pf = PIPE_FORMAT_RGBX8888_UNORM;
+      break;
    case __DRI_IMAGE_FORMAT_ABGR8888:
       pf = PIPE_FORMAT_RGBA8888_UNORM;
       break;
@@ -172,6 +206,70 @@
    return pf;
 }
 
+static enum pipe_format fourcc_to_pipe_format(int fourcc)
+{
+   enum pipe_format pf;
+   /* Map a __DRI_IMAGE_FOURCC_* code to a pipe_format (NONE if unmapped). */
+   switch (fourcc) {
+   case __DRI_IMAGE_FOURCC_R8:
+      pf = PIPE_FORMAT_R8_UNORM;
+      break;
+   case __DRI_IMAGE_FOURCC_GR88:
+      pf = PIPE_FORMAT_RG88_UNORM;
+      break;
+   case __DRI_IMAGE_FOURCC_ARGB1555:
+      pf = PIPE_FORMAT_B5G5R5A1_UNORM;
+      break;
+   case __DRI_IMAGE_FOURCC_R16:
+      pf = PIPE_FORMAT_R16_UNORM;
+      break;
+   case __DRI_IMAGE_FOURCC_GR1616:
+      pf = PIPE_FORMAT_RG1616_UNORM;
+      break;
+   case __DRI_IMAGE_FOURCC_RGB565:
+      pf = PIPE_FORMAT_B5G6R5_UNORM;
+      break;
+   case __DRI_IMAGE_FOURCC_ARGB8888:
+      pf = PIPE_FORMAT_BGRA8888_UNORM;
+      break;
+   case __DRI_IMAGE_FOURCC_XRGB8888:
+      pf = PIPE_FORMAT_BGRX8888_UNORM;
+      break;
+   case __DRI_IMAGE_FOURCC_ABGR8888:
+      pf = PIPE_FORMAT_RGBA8888_UNORM;
+      break;
+   case __DRI_IMAGE_FOURCC_XBGR8888:
+      pf = PIPE_FORMAT_RGBX8888_UNORM;
+      break;
+   /* Remaining mapped codes; YUV420 and YVU420 share PIPE_FORMAT_YV12. */
+   case __DRI_IMAGE_FOURCC_NV12:
+      pf = PIPE_FORMAT_NV12;
+      break;
+   case __DRI_IMAGE_FOURCC_YUYV:
+      pf = PIPE_FORMAT_YUYV;
+      break;
+   case __DRI_IMAGE_FOURCC_YUV420:
+   case __DRI_IMAGE_FOURCC_YVU420:
+      pf = PIPE_FORMAT_YV12;
+      break;
+   /* Named but unmapped codes share the default: PIPE_FORMAT_NONE. */
+   case __DRI_IMAGE_FOURCC_SARGB8888:
+   case __DRI_IMAGE_FOURCC_YUV410:
+   case __DRI_IMAGE_FOURCC_YUV411:
+   case __DRI_IMAGE_FOURCC_YUV422:
+   case __DRI_IMAGE_FOURCC_YUV444:
+   case __DRI_IMAGE_FOURCC_NV16:
+   case __DRI_IMAGE_FOURCC_YVU410:
+   case __DRI_IMAGE_FOURCC_YVU411:
+   case __DRI_IMAGE_FOURCC_YVU422:
+   case __DRI_IMAGE_FOURCC_YVU444:
+   default:
+      pf = PIPE_FORMAT_NONE;
+   }
+
+   return pf;
+}
+
 /**
  * DRI2 flush extension.
  */
@@ -261,9 +359,11 @@
        */
       switch(format) {
       case PIPE_FORMAT_BGRA8888_UNORM:
+      case PIPE_FORMAT_RGBA8888_UNORM:
 	 depth = 32;
 	 break;
       case PIPE_FORMAT_BGRX8888_UNORM:
+      case PIPE_FORMAT_RGBX8888_UNORM:
 	 depth = 24;
 	 break;
       case PIPE_FORMAT_B5G6R5_UNORM:
@@ -339,6 +439,9 @@
       case PIPE_FORMAT_BGRA8888_UNORM:
          image_format = __DRI_IMAGE_FORMAT_ARGB8888;
          break;
+      case PIPE_FORMAT_RGBX8888_UNORM:
+         image_format = __DRI_IMAGE_FORMAT_XBGR8888;
+         break;
       case PIPE_FORMAT_RGBA8888_UNORM:
          image_format = __DRI_IMAGE_FORMAT_ABGR8888;
          break;
@@ -869,6 +972,7 @@
    memset(&whandle, 0, sizeof(whandle));
    whandle.type = DRM_API_HANDLE_TYPE_SHARED;
    whandle.handle = name;
+   whandle.modifier = DRM_FORMAT_MOD_INVALID;
 
    pf = dri2_format_to_pipe_format (format);
    if (pf == PIPE_FORMAT_NONE)
@@ -883,8 +987,8 @@
 static __DRIimage *
 dri2_create_image_from_fd(__DRIscreen *_screen,
                           int width, int height, int fourcc,
-                          int *fds, int num_fds, int *strides,
-                          int *offsets, unsigned *error,
+                          uint64_t modifier, int *fds, int num_fds,
+                          int *strides, int *offsets, unsigned *error,
                           int *dri_components, void *loaderPrivate)
 {
    struct winsys_handle whandles[3];
@@ -929,6 +1033,7 @@
       whandles[i].handle = (unsigned)fds[i];
       whandles[i].stride = (unsigned)strides[i];
       whandles[i].offset = (unsigned)offsets[i];
+      whandles[i].modifier = modifier;
    }
 
    if (fourcc == __DRI_IMAGE_FOURCC_YVU420) {
@@ -965,9 +1070,12 @@
 }
 
 static __DRIimage *
-dri2_create_image(__DRIscreen *_screen,
-                   int width, int height, int format,
-                   unsigned int use, void *loaderPrivate)
+dri2_create_image_common(__DRIscreen *_screen,
+                         int width, int height,
+                         int format, unsigned int use,
+                         const uint64_t *modifiers,
+                         const unsigned count,
+                         void *loaderPrivate)
 {
    struct dri_screen *screen = dri_screen(_screen);
    __DRIimage *img;
@@ -975,7 +1083,13 @@
    unsigned tex_usage;
    enum pipe_format pf;
 
+   /* createImageWithModifiers doesn't supply usage, and we should not get
+    * here with both modifiers and a usage flag.
+    */
+   assert(!(use && (modifiers != NULL)));
+
    tex_usage = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
+
    if (use & __DRI_IMAGE_USE_SCANOUT)
       tex_usage |= PIPE_BIND_SCANOUT;
    if (use & __DRI_IMAGE_USE_SHARE)
@@ -1006,7 +1120,16 @@
    templ.depth0 = 1;
    templ.array_size = 1;
 
-   img->texture = screen->base.screen->resource_create(screen->base.screen, &templ);
+   if (modifiers)
+      img->texture =
+         screen->base.screen
+            ->resource_create_with_modifiers(screen->base.screen,
+                                             &templ,
+                                             modifiers,
+                                             count);
+   else
+      img->texture =
+         screen->base.screen->resource_create(screen->base.screen, &templ);
    if (!img->texture) {
       FREE(img);
       return NULL;
@@ -1022,6 +1145,28 @@
    return img;
 }
 
+static __DRIimage *
+dri2_create_image(__DRIscreen *_screen,
+                   int width, int height, int format,
+                   unsigned int use, void *loaderPrivate)
+{  /* Usage-flag variant: forwards with modifiers == NULL and count == 0. */
+   return dri2_create_image_common(_screen, width, height, format, use,
+                                   NULL /* modifiers */, 0 /* count */,
+                                   loaderPrivate);
+}
+
+static __DRIimage *
+dri2_create_image_with_modifiers(__DRIscreen *dri_screen,
+                                 int width, int height, int format,
+                                 const uint64_t *modifiers,
+                                 const unsigned count,
+                                 void *loaderPrivate)
+{  /* Modifier variant: forwards the caller's list with use fixed at 0. */
+   return dri2_create_image_common(dri_screen, width, height, format,
+                                   0 /* use */, modifiers, count,
+                                   loaderPrivate);
+}
+
 static GLboolean
 dri2_query_image(__DRIimage *image, int attrib, int *value)
 {
@@ -1038,20 +1183,30 @@
    switch (attrib) {
    case __DRI_IMAGE_ATTRIB_STRIDE:
       whandle.type = DRM_API_HANDLE_TYPE_KMS;
-      image->texture->screen->resource_get_handle(image->texture->screen,
-            NULL, image->texture, &whandle, usage);
+      if (!image->texture->screen->resource_get_handle(image->texture->screen,
+            NULL, image->texture, &whandle, usage))
+         return GL_FALSE;
       *value = whandle.stride;
       return GL_TRUE;
+   case __DRI_IMAGE_ATTRIB_OFFSET:
+      whandle.type = DRM_API_HANDLE_TYPE_KMS;
+      if (!image->texture->screen->resource_get_handle(image->texture->screen,
+            NULL, image->texture, &whandle, usage))
+         return GL_FALSE;
+      *value = whandle.offset;
+      return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_HANDLE:
       whandle.type = DRM_API_HANDLE_TYPE_KMS;
-      image->texture->screen->resource_get_handle(image->texture->screen,
-         NULL, image->texture, &whandle, usage);
+      if (!image->texture->screen->resource_get_handle(image->texture->screen,
+         NULL, image->texture, &whandle, usage))
+         return GL_FALSE;
       *value = whandle.handle;
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_NAME:
       whandle.type = DRM_API_HANDLE_TYPE_SHARED;
-      image->texture->screen->resource_get_handle(image->texture->screen,
-         NULL, image->texture, &whandle, usage);
+      if (!image->texture->screen->resource_get_handle(image->texture->screen,
+         NULL, image->texture, &whandle, usage))
+         return GL_FALSE;
       *value = whandle.handle;
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_FD:
@@ -1082,6 +1237,26 @@
    case __DRI_IMAGE_ATTRIB_NUM_PLANES:
       *value = 1;
       return GL_TRUE;
+   case __DRI_IMAGE_ATTRIB_MODIFIER_UPPER:
+      whandle.type = DRM_API_HANDLE_TYPE_KMS;
+      whandle.modifier = DRM_FORMAT_MOD_INVALID;
+      if (!image->texture->screen->resource_get_handle(image->texture->screen,
+            NULL, image->texture, &whandle, usage))
+         return GL_FALSE;
+      if (whandle.modifier == DRM_FORMAT_MOD_INVALID)
+         return GL_FALSE;
+      *value = (whandle.modifier >> 32) & 0xffffffff;
+      return GL_TRUE;
+   case __DRI_IMAGE_ATTRIB_MODIFIER_LOWER:
+      whandle.type = DRM_API_HANDLE_TYPE_KMS;
+      whandle.modifier = DRM_FORMAT_MOD_INVALID;
+      if (!image->texture->screen->resource_get_handle(image->texture->screen,
+            NULL, image->texture, &whandle, usage))
+         return GL_FALSE;
+      if (whandle.modifier == DRM_FORMAT_MOD_INVALID)
+         return GL_FALSE;
+      *value = whandle.modifier & 0xffffffff;
+      return GL_TRUE;
    default:
       return GL_FALSE;
    }
@@ -1143,6 +1318,7 @@
    whandle.handle = names[0];
    whandle.stride = strides[0];
    whandle.offset = offsets[0];
+   whandle.modifier = DRM_FORMAT_MOD_INVALID;
 
    img = dri2_create_image_from_winsys(screen, width, height, format,
                                        1, &whandle, loaderPrivate);
@@ -1252,7 +1428,8 @@
    int dri_components;
 
    img = dri2_create_image_from_fd(screen, width, height, fourcc,
-                                   fds, num_fds, strides, offsets, NULL,
+                                   DRM_FORMAT_MOD_INVALID, fds, num_fds,
+                                   strides, offsets, NULL,
                                    &dri_components, loaderPrivate);
    if (img == NULL)
       return NULL;
@@ -1261,6 +1438,50 @@
    return img;
 }
 
+static boolean
+dri2_query_dma_buf_formats(__DRIscreen *_screen, int max, int *formats,
+                           int *count)
+{
+   struct dri_screen *screen = dri_screen(_screen);
+   struct pipe_screen *pscreen = screen->base.screen;
+   const unsigned bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
+   int i, j;
+   /* List fourccs whose pipe format passes is_format_supported() for bind. */
+   for (i = 0, j = 0; (i < ARRAY_SIZE(fourcc_formats)) &&
+         (j < max || max == 0); i++) { /* max == 0: count-only query */
+      if (pscreen->is_format_supported(pscreen,
+                                       fourcc_to_pipe_format(
+                                          fourcc_formats[i]),
+                                       screen->target,
+                                       0, bind)) {
+         if (j < max) /* never true when max == 0, so formats[] stays untouched */
+            formats[j] = fourcc_formats[i];
+         j++;
+      }
+   }
+   *count = j; /* matches found; at most max when max > 0 */
+   return true;
+}
+
+static boolean
+dri2_query_dma_buf_modifiers(__DRIscreen *_screen, int fourcc, int max,
+                             uint64_t *modifiers, unsigned int *external_only,
+                             int *count)
+{
+   struct dri_screen *screen = dri_screen(_screen);
+   struct pipe_screen *pscreen = screen->base.screen;
+   enum pipe_format format = fourcc_to_pipe_format(fourcc);
+   const unsigned usage = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
+   /* Needs the driver hook and a format that is_format_supported() accepts. */
+   if (pscreen->query_dmabuf_modifiers != NULL &&
+       pscreen->is_format_supported(pscreen, format, screen->target, 0, usage)) {
+      pscreen->query_dmabuf_modifiers(pscreen, format, max, modifiers,
+                                      external_only, count);
+      return true;
+   }
+   return false; /* outputs are left unwritten on this path */
+}
+
 static __DRIimage *
 dri2_from_dma_bufs(__DRIscreen *screen,
                    int width, int height, int fourcc,
@@ -1277,7 +1498,8 @@
    int dri_components;
 
    img = dri2_create_image_from_fd(screen, width, height, fourcc,
-                                   fds, num_fds, strides, offsets, error,
+                                   DRM_FORMAT_MOD_INVALID, fds, num_fds,
+                                   strides, offsets, error,
                                    &dri_components, loaderPrivate);
    if (img == NULL)
       return NULL;
@@ -1292,6 +1514,37 @@
    return img;
 }
 
+static __DRIimage *
+dri2_from_dma_bufs2(__DRIscreen *screen,
+                    int width, int height, int fourcc,
+                    uint64_t modifier, int *fds, int num_fds,
+                    int *strides, int *offsets,
+                    enum __DRIYUVColorSpace yuv_color_space,
+                    enum __DRISampleRange sample_range,
+                    enum __DRIChromaSiting horizontal_siting,
+                    enum __DRIChromaSiting vertical_siting,
+                    unsigned *error,
+                    void *loaderPrivate)
+{
+   __DRIimage *img;
+   int dri_components;
+   /* Same import helper as dri2_from_dma_bufs, with the caller's modifier. */
+   img = dri2_create_image_from_fd(screen, width, height, fourcc,
+                                   modifier, fds, num_fds, strides, offsets,
+                                   error, &dri_components, loaderPrivate);
+   if (img == NULL)
+      return NULL;
+   /* Copy the caller-supplied attributes and dri_components onto the image. */
+   img->yuv_color_space = yuv_color_space;
+   img->sample_range = sample_range;
+   img->horizontal_siting = horizontal_siting;
+   img->vertical_siting = vertical_siting;
+   img->dri_components = dri_components;
+   /* NOTE(review): error is written unconditionally; NULL is not tolerated. */
+   *error = __DRI_IMAGE_ERROR_SUCCESS;
+   return img;
+}
+
 static void
 dri2_blit_image(__DRIcontext *context, __DRIimage *dst, __DRIimage *src,
                 int dstx0, int dsty0, int dstwidth, int dstheight,
@@ -1395,7 +1648,7 @@
 
 /* The extension is modified during runtime if DRI_PRIME is detected */
 static __DRIimageExtension dri2ImageExtension = {
-    .base = { __DRI_IMAGE, 12 },
+    .base = { __DRI_IMAGE, 15 },
 
     .createImageFromName          = dri2_create_image_from_name,
     .createImageFromRenderbuffer  = dri2_create_image_from_renderbuffer,
@@ -1415,208 +1668,6 @@
     .unmapImage                   = dri2_unmap_image,
 };
 
-
-static bool
-dri2_is_opencl_interop_loaded_locked(struct dri_screen *screen)
-{
-   return screen->opencl_dri_event_add_ref &&
-          screen->opencl_dri_event_release &&
-          screen->opencl_dri_event_wait &&
-          screen->opencl_dri_event_get_fence;
-}
-
-static bool
-dri2_load_opencl_interop(struct dri_screen *screen)
-{
-#if defined(RTLD_DEFAULT)
-   bool success;
-
-   mtx_lock(&screen->opencl_func_mutex);
-
-   if (dri2_is_opencl_interop_loaded_locked(screen)) {
-      mtx_unlock(&screen->opencl_func_mutex);
-      return true;
-   }
-
-   screen->opencl_dri_event_add_ref =
-      dlsym(RTLD_DEFAULT, "opencl_dri_event_add_ref");
-   screen->opencl_dri_event_release =
-      dlsym(RTLD_DEFAULT, "opencl_dri_event_release");
-   screen->opencl_dri_event_wait =
-      dlsym(RTLD_DEFAULT, "opencl_dri_event_wait");
-   screen->opencl_dri_event_get_fence =
-      dlsym(RTLD_DEFAULT, "opencl_dri_event_get_fence");
-
-   success = dri2_is_opencl_interop_loaded_locked(screen);
-   mtx_unlock(&screen->opencl_func_mutex);
-   return success;
-#else
-   return false;
-#endif
-}
-
-struct dri2_fence {
-   struct dri_screen *driscreen;
-   struct pipe_fence_handle *pipe_fence;
-   void *cl_event;
-};
-
-static unsigned dri2_fence_get_caps(__DRIscreen *_screen)
-{
-   struct dri_screen *driscreen = dri_screen(_screen);
-   struct pipe_screen *screen = driscreen->base.screen;
-   unsigned caps = 0;
-
-   if (screen->get_param(screen, PIPE_CAP_NATIVE_FENCE_FD))
-      caps |= __DRI_FENCE_CAP_NATIVE_FD;
-
-   return caps;
-}
-
-static void *
-dri2_create_fence(__DRIcontext *_ctx)
-{
-   struct pipe_context *ctx = dri_context(_ctx)->st->pipe;
-   struct dri2_fence *fence = CALLOC_STRUCT(dri2_fence);
-
-   if (!fence)
-      return NULL;
-
-   ctx->flush(ctx, &fence->pipe_fence, 0);
-
-   if (!fence->pipe_fence) {
-      FREE(fence);
-      return NULL;
-   }
-
-   fence->driscreen = dri_screen(_ctx->driScreenPriv);
-   return fence;
-}
-
-static void *
-dri2_create_fence_fd(__DRIcontext *_ctx, int fd)
-{
-   struct pipe_context *ctx = dri_context(_ctx)->st->pipe;
-   struct dri2_fence *fence = CALLOC_STRUCT(dri2_fence);
-
-   if (fd == -1) {
-      /* exporting driver created fence, flush: */
-      ctx->flush(ctx, &fence->pipe_fence,
-                 PIPE_FLUSH_DEFERRED | PIPE_FLUSH_FENCE_FD);
-   } else {
-      /* importing a foreign fence fd: */
-      ctx->create_fence_fd(ctx, &fence->pipe_fence, fd);
-   }
-   if (!fence->pipe_fence) {
-      FREE(fence);
-      return NULL;
-   }
-
-   fence->driscreen = dri_screen(_ctx->driScreenPriv);
-   return fence;
-}
-
-static int
-dri2_get_fence_fd(__DRIscreen *_screen, void *_fence)
-{
-   struct dri_screen *driscreen = dri_screen(_screen);
-   struct pipe_screen *screen = driscreen->base.screen;
-   struct dri2_fence *fence = (struct dri2_fence*)_fence;
-
-   return screen->fence_get_fd(screen, fence->pipe_fence);
-}
-
-static void *
-dri2_get_fence_from_cl_event(__DRIscreen *_screen, intptr_t cl_event)
-{
-   struct dri_screen *driscreen = dri_screen(_screen);
-   struct dri2_fence *fence;
-
-   if (!dri2_load_opencl_interop(driscreen))
-      return NULL;
-
-   fence = CALLOC_STRUCT(dri2_fence);
-   if (!fence)
-      return NULL;
-
-   fence->cl_event = (void*)cl_event;
-
-   if (!driscreen->opencl_dri_event_add_ref(fence->cl_event)) {
-      free(fence);
-      return NULL;
-   }
-
-   fence->driscreen = driscreen;
-   return fence;
-}
-
-static void
-dri2_destroy_fence(__DRIscreen *_screen, void *_fence)
-{
-   struct dri_screen *driscreen = dri_screen(_screen);
-   struct pipe_screen *screen = driscreen->base.screen;
-   struct dri2_fence *fence = (struct dri2_fence*)_fence;
-
-   if (fence->pipe_fence)
-      screen->fence_reference(screen, &fence->pipe_fence, NULL);
-   else if (fence->cl_event)
-      driscreen->opencl_dri_event_release(fence->cl_event);
-   else
-      assert(0);
-
-   FREE(fence);
-}
-
-static GLboolean
-dri2_client_wait_sync(__DRIcontext *_ctx, void *_fence, unsigned flags,
-                      uint64_t timeout)
-{
-   struct dri2_fence *fence = (struct dri2_fence*)_fence;
-   struct dri_screen *driscreen = fence->driscreen;
-   struct pipe_screen *screen = driscreen->base.screen;
-
-   /* No need to flush. The context was flushed when the fence was created. */
-
-   if (fence->pipe_fence)
-      return screen->fence_finish(screen, NULL, fence->pipe_fence, timeout);
-   else if (fence->cl_event) {
-      struct pipe_fence_handle *pipe_fence =
-         driscreen->opencl_dri_event_get_fence(fence->cl_event);
-
-      if (pipe_fence)
-         return screen->fence_finish(screen, NULL, pipe_fence, timeout);
-      else
-         return driscreen->opencl_dri_event_wait(fence->cl_event, timeout);
-   }
-   else {
-      assert(0);
-      return false;
-   }
-}
-
-static void
-dri2_server_wait_sync(__DRIcontext *_ctx, void *_fence, unsigned flags)
-{
-   struct pipe_context *ctx = dri_context(_ctx)->st->pipe;
-   struct dri2_fence *fence = (struct dri2_fence*)_fence;
-
-   if (ctx->fence_server_sync)
-      ctx->fence_server_sync(ctx, fence->pipe_fence);
-}
-
-static __DRI2fenceExtension dri2FenceExtension = {
-   .base = { __DRI2_FENCE, 2 },
-
-   .create_fence = dri2_create_fence,
-   .get_fence_from_cl_event = dri2_get_fence_from_cl_event,
-   .destroy_fence = dri2_destroy_fence,
-   .client_wait_sync = dri2_client_wait_sync,
-   .server_wait_sync = dri2_server_wait_sync,
-   .get_capabilities = dri2_fence_get_caps,
-   .create_fence_fd = dri2_create_fence_fd,
-   .get_fence_fd = dri2_get_fence_fd,
-};
-
 static const __DRIrobustnessExtension dri2Robustness = {
    .base = { __DRI2_ROBUSTNESS, 1 }
 };
@@ -1880,6 +1931,69 @@
    .export_object = dri2_interop_export_object
 };
 
+/**
+ * \brief configQueryb hook: read boolean option \p var into \p val.
+ */
+static int
+dri2GalliumConfigQueryb(__DRIscreen *sPriv, const char *var,
+                        unsigned char *val)
+{
+   struct dri_screen *screen = dri_screen(sPriv);
+   /* Defer to dri2ConfigQueryExtension unless driCheckOption() accepts var. */
+   if (!driCheckOption(&screen->optionCache, var, DRI_BOOL))
+      return dri2ConfigQueryExtension.configQueryb(sPriv, var, val);
+
+   *val = driQueryOptionb(&screen->optionCache, var);
+
+   return 0; /* local lookup path; the fallback supplies its own result */
+}
+
+/**
+ * \brief configQueryi hook: read integer or enum option \p var into \p val.
+ */
+static int
+dri2GalliumConfigQueryi(__DRIscreen *sPriv, const char *var, int *val)
+{
+   struct dri_screen *screen = dri_screen(sPriv);
+   /* Defer only when var is accepted as neither DRI_INT nor DRI_ENUM. */
+   if (!driCheckOption(&screen->optionCache, var, DRI_INT) &&
+       !driCheckOption(&screen->optionCache, var, DRI_ENUM))
+      return dri2ConfigQueryExtension.configQueryi(sPriv, var, val);
+
+    *val = driQueryOptioni(&screen->optionCache, var);
+
+    return 0; /* local lookup path; the fallback supplies its own result */
+}
+
+/**
+ * \brief configQueryf hook: read float option \p var into \p val.
+ */
+static int
+dri2GalliumConfigQueryf(__DRIscreen *sPriv, const char *var, float *val)
+{
+   struct dri_screen *screen = dri_screen(sPriv);
+   /* Defer to dri2ConfigQueryExtension unless driCheckOption() accepts var. */
+   if (!driCheckOption(&screen->optionCache, var, DRI_FLOAT))
+      return dri2ConfigQueryExtension.configQueryf(sPriv, var, val);
+
+    *val = driQueryOptionf(&screen->optionCache, var);
+
+    return 0; /* local lookup path; the fallback supplies its own result */
+}
+
+/**
+ * \brief Gallium's __DRI2_CONFIG_QUERY extension table (version 1).
+ *
+ * Each hook tries screen->optionCache first, then dri2ConfigQueryExtension.
+ */
+static const __DRI2configQueryExtension dri2GalliumConfigQueryExtension = {
+   .base = { __DRI2_CONFIG_QUERY, 1 },
+
+   .configQueryb        = dri2GalliumConfigQueryb,
+   .configQueryi        = dri2GalliumConfigQueryi,
+   .configQueryf        = dri2GalliumConfigQueryf,
+};
+
 /*
  * Backend function init_screen.
  */
@@ -1889,10 +2003,11 @@
    &dri2FlushExtension.base,
    &dri2ImageExtension.base,
    &dri2RendererQueryExtension.base,
-   &dri2ConfigQueryExtension.base,
+   &dri2GalliumConfigQueryExtension.base,
    &dri2ThrottleExtension.base,
    &dri2FenceExtension.base,
    &dri2InteropExtension.base,
+   &dri2NoErrorExtension.base,
    NULL
 };
 
@@ -1901,11 +2016,12 @@
    &dri2FlushExtension.base,
    &dri2ImageExtension.base,
    &dri2RendererQueryExtension.base,
-   &dri2ConfigQueryExtension.base,
+   &dri2GalliumConfigQueryExtension.base,
    &dri2ThrottleExtension.base,
    &dri2FenceExtension.base,
    &dri2InteropExtension.base,
    &dri2Robustness.base,
+   &dri2NoErrorExtension.base,
    NULL
 };
 
@@ -1937,8 +2053,13 @@
    if (screen->fd < 0 || (fd = fcntl(screen->fd, F_DUPFD_CLOEXEC, 3)) < 0)
       goto free_screen;
 
-   if (pipe_loader_drm_probe_fd(&screen->dev, fd))
-      pscreen = pipe_loader_create_screen(screen->dev);
+
+   if (pipe_loader_drm_probe_fd(&screen->dev, fd)) {
+      unsigned flags =
+         dri_init_options_get_screen_flags(screen, screen->dev->driver_name);
+
+      pscreen = pipe_loader_create_screen(screen->dev, flags);
+   }
 
    if (!pscreen)
        goto release_pipe;
@@ -1951,6 +2072,10 @@
       screen->default_throttle_frames = throttle_ret->val.val_int;
    }
 
+   if (pscreen->resource_create_with_modifiers)
+      dri2ImageExtension.createImageWithModifiers =
+         dri2_create_image_with_modifiers;
+
    if (dmabuf_ret && dmabuf_ret->val.val_bool) {
       uint64_t cap;
 
@@ -1958,6 +2083,12 @@
           (cap & DRM_PRIME_CAP_IMPORT)) {
          dri2ImageExtension.createImageFromFds = dri2_from_fds;
          dri2ImageExtension.createImageFromDmaBufs = dri2_from_dma_bufs;
+         dri2ImageExtension.createImageFromDmaBufs2 = dri2_from_dma_bufs2;
+         if (pscreen->query_dmabuf_modifiers) {
+            dri2ImageExtension.queryDmaBufFormats = dri2_query_dma_buf_formats;
+            dri2ImageExtension.queryDmaBufModifiers =
+                                       dri2_query_dma_buf_modifiers;
+         }
       }
    }
 
@@ -1968,7 +2099,7 @@
    else
       sPriv->extensions = dri_screen_extensions;
 
-   configs = dri_init_screen_helper(screen, pscreen, screen->dev->driver_name);
+   configs = dri_init_screen_helper(screen, pscreen);
    if (!configs)
       goto destroy_screen;
 
@@ -2020,21 +2151,30 @@
    if (screen->fd < 0 || (fd = fcntl(screen->fd, F_DUPFD_CLOEXEC, 3)) < 0)
       goto free_screen;
 
+   unsigned flags = dri_init_options_get_screen_flags(screen, "swrast");
+
    if (pipe_loader_sw_probe_kms(&screen->dev, fd))
-      pscreen = pipe_loader_create_screen(screen->dev);
+      pscreen = pipe_loader_create_screen(screen->dev, flags);
 
    if (!pscreen)
        goto release_pipe;
 
+   if (pscreen->resource_create_with_modifiers)
+      dri2ImageExtension.createImageWithModifiers =
+         dri2_create_image_with_modifiers;
+
    if (drmGetCap(sPriv->fd, DRM_CAP_PRIME, &cap) == 0 &&
           (cap & DRM_PRIME_CAP_IMPORT)) {
       dri2ImageExtension.createImageFromFds = dri2_from_fds;
       dri2ImageExtension.createImageFromDmaBufs = dri2_from_dma_bufs;
+      dri2ImageExtension.createImageFromDmaBufs2 = dri2_from_dma_bufs2;
+      dri2ImageExtension.queryDmaBufFormats = dri2_query_dma_buf_formats;
+      dri2ImageExtension.queryDmaBufModifiers = dri2_query_dma_buf_modifiers;
    }
 
    sPriv->extensions = dri_screen_extensions;
 
-   configs = dri_init_screen_helper(screen, pscreen, "swrast");
+   configs = dri_init_screen_helper(screen, pscreen);
    if (!configs)
       goto destroy_screen;
 
@@ -2125,7 +2265,6 @@
     &driImageDriverExtension.base,
     &driDRI2Extension.base,
     &gallium_config_options.base,
-    &dri2FenceExtension.base,
     NULL
 };
 
diff --git a/src/gallium/state_trackers/dri/dri_context.c b/src/gallium/state_trackers/dri/dri_context.c
index 92d7984..8c3797e4 100644
--- a/src/gallium/state_trackers/dri/dri_context.c
+++ b/src/gallium/state_trackers/dri/dri_context.c
@@ -57,7 +57,10 @@
    struct st_context_attribs attribs;
    enum st_context_error ctx_err = 0;
    unsigned allowed_flags = __DRI_CTX_FLAG_DEBUG |
-                            __DRI_CTX_FLAG_FORWARD_COMPATIBLE;
+                            __DRI_CTX_FLAG_FORWARD_COMPATIBLE |
+                            __DRI_CTX_FLAG_NO_ERROR;
+   const __DRIbackgroundCallableExtension *backgroundCallable =
+      screen->sPriv->dri2.backgroundCallable;
 
    if (screen->has_reset_status_query)
       allowed_flags |= __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS;
@@ -104,6 +107,9 @@
    if (notify_reset)
       attribs.flags |= ST_CONTEXT_FLAG_RESET_NOTIFICATION_ENABLED;
 
+   if (flags & __DRI_CTX_FLAG_NO_ERROR)
+      attribs.flags |= ST_CONTEXT_FLAG_NO_ERROR;
+
    if (sharedContextPrivate) {
       st_share = ((struct dri_context *)sharedContextPrivate)->st;
    }
@@ -118,6 +124,9 @@
    ctx->cPriv = cPriv;
    ctx->sPriv = sPriv;
 
+   if (driQueryOptionb(&screen->optionCache, "mesa_no_error"))
+      attribs.flags |= ST_CONTEXT_FLAG_NO_ERROR;
+
    attribs.options = screen->options;
    dri_fill_st_visual(&attribs.visual, screen, visual);
    ctx->st = stapi->create_context(stapi, &screen->base, &attribs, &ctx_err,
@@ -158,10 +167,21 @@
 
    /* Do this last. */
    if (ctx->st->start_thread &&
-       /* the driver loader must implement this */
-       screen->sPriv->dri2.backgroundCallable &&
-       driQueryOptionb(&screen->optionCache, "mesa_glthread"))
-      ctx->st->start_thread(ctx->st);
+         driQueryOptionb(&screen->optionCache, "mesa_glthread")) {
+
+      if (backgroundCallable && backgroundCallable->base.version >= 2 &&
+            backgroundCallable->isThreadSafe) {
+
+         if (backgroundCallable->isThreadSafe(cPriv->loaderPrivate))
+            ctx->st->start_thread(ctx->st);
+         else
+            fprintf(stderr, "dri_create_context: glthread isn't thread safe "
+                  "- missing call XInitThreads\n");
+      } else {
+         fprintf(stderr, "dri_create_context: requested glthread but driver "
+               "is missing backgroundCallable V2 extension\n");
+      }
+   }
 
    *error = __DRI_CTX_ERROR_SUCCESS;
    return GL_TRUE;
diff --git a/src/gallium/state_trackers/dri/dri_drawable.c b/src/gallium/state_trackers/dri/dri_drawable.c
index 3c2e307..4176c1c 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/dri_drawable.c
@@ -38,6 +38,8 @@
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
 
+static uint32_t drifb_ID = 0; /* bumped via p_atomic_inc_return() for base.ID */
+
 static void
 swap_fences_unref(struct dri_drawable *draw);
 
@@ -97,10 +99,8 @@
       return TRUE;
 
    /* Set the window-system buffers for the state tracker. */
-   for (i = 0; i < count; i++) {
-      out[i] = NULL;
+   for (i = 0; i < count; i++)
       pipe_resource_reference(&out[i], textures[statts[i]]);
-   }
 
    return TRUE;
 }
@@ -155,6 +155,8 @@
 
    dPriv->driverPrivate = (void *)drawable;
    p_atomic_set(&drawable->base.stamp, 1);
+   drawable->base.ID = p_atomic_inc_return(&drifb_ID);
+   drawable->base.state_manager = &screen->base;
 
    return GL_TRUE;
 fail:
@@ -166,6 +168,8 @@
 dri_destroy_buffer(__DRIdrawable * dPriv)
 {
    struct dri_drawable *drawable = dri_drawable(dPriv);
+   struct dri_screen *screen = drawable->screen;
+   struct st_api *stapi = screen->st_api;
    int i;
 
    pipe_surface_reference(&drawable->drisw_surface, NULL);
@@ -177,6 +181,9 @@
 
    swap_fences_unref(drawable);
 
+   /* Notify the st manager that this drawable is no longer valid */
+   stapi->destroy_drawable(stapi, &drawable->base);
+
    FREE(drawable);
 }
 
diff --git a/src/gallium/state_trackers/dri/dri_extensions.c b/src/gallium/state_trackers/dri/dri_extensions.c
new file mode 100644
index 0000000..b3a2cb7
--- /dev/null
+++ b/src/gallium/state_trackers/dri/dri_extensions.c
@@ -0,0 +1,310 @@
+/*
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <dlfcn.h>
+#include "dri_context.h"
+#include "dri_screen.h"
+#include "pipe/p_screen.h"
+#include "util/u_memory.h"
+
+/**
+ * Report whether all four OpenCL interop entry points are resolved.
+ *
+ * The function pointers are read without taking a lock here; the caller in
+ * this file holds screen->opencl_func_mutex around the call.
+ */
+static bool
+dri2_is_opencl_interop_loaded_locked(struct dri_screen *screen)
+{
+   return screen->opencl_dri_event_add_ref &&
+          screen->opencl_dri_event_release &&
+          screen->opencl_dri_event_wait &&
+          screen->opencl_dri_event_get_fence;
+}
+
+/**
+ * Resolve the OpenCL interop entry points with dlsym(RTLD_DEFAULT, ...).
+ *
+ * The lookups and stores happen under screen->opencl_func_mutex and are
+ * skipped when all four pointers are already set.
+ *
+ * \return true if all four entry points are available; always false when
+ *         RTLD_DEFAULT is not defined.
+ */
+static bool
+dri2_load_opencl_interop(struct dri_screen *screen)
+{
+#if defined(RTLD_DEFAULT)
+   bool success;
+
+   mtx_lock(&screen->opencl_func_mutex);
+
+   if (dri2_is_opencl_interop_loaded_locked(screen)) {
+      mtx_unlock(&screen->opencl_func_mutex);
+      return true;
+   }
+
+   screen->opencl_dri_event_add_ref =
+      dlsym(RTLD_DEFAULT, "opencl_dri_event_add_ref");
+   screen->opencl_dri_event_release =
+      dlsym(RTLD_DEFAULT, "opencl_dri_event_release");
+   screen->opencl_dri_event_wait =
+      dlsym(RTLD_DEFAULT, "opencl_dri_event_wait");
+   screen->opencl_dri_event_get_fence =
+      dlsym(RTLD_DEFAULT, "opencl_dri_event_get_fence");
+
+   success = dri2_is_opencl_interop_loaded_locked(screen);
+   mtx_unlock(&screen->opencl_func_mutex);
+   return success;
+#else
+   return false;
+#endif
+}
+
+/**
+ * Fence object returned to the loader as an opaque pointer.
+ *
+ * The constructors in this file set either pipe_fence or cl_event, never
+ * both.
+ */
+struct dri2_fence {
+   struct dri_screen *driscreen;
+   struct pipe_fence_handle *pipe_fence;
+   void *cl_event;
+};
+
+/**
+ * Return the __DRI_FENCE_CAP_* bits supported by the screen.
+ *
+ * Only __DRI_FENCE_CAP_NATIVE_FD is reported, and only when the pipe screen
+ * advertises PIPE_CAP_NATIVE_FENCE_FD.
+ */
+static unsigned dri2_fence_get_caps(__DRIscreen *_screen)
+{
+   struct dri_screen *driscreen = dri_screen(_screen);
+   struct pipe_screen *screen = driscreen->base.screen;
+   unsigned caps = 0;
+
+   if (screen->get_param(screen, PIPE_CAP_NATIVE_FENCE_FD))
+      caps |= __DRI_FENCE_CAP_NATIVE_FD;
+
+   return caps;
+}
+
+/**
+ * Flush the context and wrap the resulting pipe fence.
+ *
+ * \return the new dri2_fence, or NULL if the allocation fails or the flush
+ *         produced no fence.
+ */
+static void *
+dri2_create_fence(__DRIcontext *_ctx)
+{
+   struct pipe_context *ctx = dri_context(_ctx)->st->pipe;
+   struct dri2_fence *fence = CALLOC_STRUCT(dri2_fence);
+
+   if (!fence)
+      return NULL;
+
+   ctx->flush(ctx, &fence->pipe_fence, 0);
+
+   if (!fence->pipe_fence) {
+      FREE(fence);
+      return NULL;
+   }
+
+   fence->driscreen = dri_screen(_ctx->driScreenPriv);
+   return fence;
+}
+
+/**
+ * Create a fence backed by a native fence fd.
+ *
+ * With fd == -1 the context is flushed with PIPE_FLUSH_FENCE_FD to create a
+ * fence for export; any other value is imported through create_fence_fd().
+ *
+ * \return the new dri2_fence, or NULL if the allocation fails or the driver
+ *         produced no fence.
+ */
+static void *
+dri2_create_fence_fd(__DRIcontext *_ctx, int fd)
+{
+   struct pipe_context *ctx = dri_context(_ctx)->st->pipe;
+   struct dri2_fence *fence = CALLOC_STRUCT(dri2_fence);
+
+   if (!fence)
+      return NULL;
+
+   if (fd == -1) {
+      /* exporting driver created fence, flush: */
+      ctx->flush(ctx, &fence->pipe_fence,
+                 PIPE_FLUSH_DEFERRED | PIPE_FLUSH_FENCE_FD);
+   } else {
+      /* importing a foreign fence fd: */
+      ctx->create_fence_fd(ctx, &fence->pipe_fence, fd);
+   }
+   if (!fence->pipe_fence) {
+      FREE(fence);
+      return NULL;
+   }
+
+   fence->driscreen = dri_screen(_ctx->driScreenPriv);
+   return fence;
+}
+
+/**
+ * Return whatever the pipe screen's fence_get_fd() reports for the fence's
+ * pipe fence.
+ */
+static int
+dri2_get_fence_fd(__DRIscreen *_screen, void *_fence)
+{
+   struct dri_screen *driscreen = dri_screen(_screen);
+   struct pipe_screen *screen = driscreen->base.screen;
+   struct dri2_fence *fence = (struct dri2_fence*)_fence;
+
+   return screen->fence_get_fd(screen, fence->pipe_fence);
+}
+
+/**
+ * Wrap an OpenCL event in a dri2_fence.
+ *
+ * A reference on the event is taken through opencl_dri_event_add_ref().
+ *
+ * \return the new dri2_fence, or NULL if the OpenCL interop entry points
+ *         cannot be loaded, the allocation fails, or add_ref fails.
+ */
+static void *
+dri2_get_fence_from_cl_event(__DRIscreen *_screen, intptr_t cl_event)
+{
+   struct dri_screen *driscreen = dri_screen(_screen);
+   struct dri2_fence *fence;
+
+   if (!dri2_load_opencl_interop(driscreen))
+      return NULL;
+
+   fence = CALLOC_STRUCT(dri2_fence);
+   if (!fence)
+      return NULL;
+
+   fence->cl_event = (void*)cl_event;
+
+   if (!driscreen->opencl_dri_event_add_ref(fence->cl_event)) {
+      FREE(fence);
+      return NULL;
+   }
+
+   fence->driscreen = driscreen;
+   return fence;
+}
+
+/**
+ * Release the pipe fence reference or the OpenCL event held by the fence,
+ * then free the wrapper.
+ */
+static void
+dri2_destroy_fence(__DRIscreen *_screen, void *_fence)
+{
+   struct dri_screen *driscreen = dri_screen(_screen);
+   struct pipe_screen *screen = driscreen->base.screen;
+   struct dri2_fence *fence = (struct dri2_fence*)_fence;
+
+   if (fence->pipe_fence)
+      screen->fence_reference(screen, &fence->pipe_fence, NULL);
+   else if (fence->cl_event)
+      driscreen->opencl_dri_event_release(fence->cl_event);
+   else
+      assert(0);
+
+   FREE(fence);
+}
+
+/**
+ * Wait on the fence, passing \p timeout to the underlying wait call.
+ *
+ * For a fence made from an OpenCL event, the pipe fence returned by
+ * opencl_dri_event_get_fence() is waited on when there is one; otherwise the
+ * wait is delegated to opencl_dri_event_wait().  \p _ctx and \p flags are
+ * not used.
+ *
+ * \return the result of the underlying wait call.
+ */
+static GLboolean
+dri2_client_wait_sync(__DRIcontext *_ctx, void *_fence, unsigned flags,
+                      uint64_t timeout)
+{
+   struct dri2_fence *fence = (struct dri2_fence*)_fence;
+   struct dri_screen *driscreen = fence->driscreen;
+   struct pipe_screen *screen = driscreen->base.screen;
+
+   /* No need to flush. The context was flushed when the fence was created. */
+
+   if (fence->pipe_fence)
+      return screen->fence_finish(screen, NULL, fence->pipe_fence, timeout);
+   else if (fence->cl_event) {
+      struct pipe_fence_handle *pipe_fence =
+         driscreen->opencl_dri_event_get_fence(fence->cl_event);
+
+      if (pipe_fence)
+         return screen->fence_finish(screen, NULL, pipe_fence, timeout);
+      else
+         return driscreen->opencl_dri_event_wait(fence->cl_event, timeout);
+   }
+   else {
+      assert(0);
+      return false;
+   }
+}
+
+/**
+ * Ask the context to wait on the fence's pipe fence when the driver provides
+ * fence_server_sync().  \p flags is not used.
+ *
+ * NOTE(review): fence->pipe_fence is passed unconditionally; for a fence made
+ * from an OpenCL event it looks like this would be NULL -- confirm that
+ * drivers cope with that.
+ */
+static void
+dri2_server_wait_sync(__DRIcontext *_ctx, void *_fence, unsigned flags)
+{
+   struct pipe_context *ctx = dri_context(_ctx)->st->pipe;
+   struct dri2_fence *fence = (struct dri2_fence*)_fence;
+
+   if (ctx->fence_server_sync)
+      ctx->fence_server_sync(ctx, fence->pipe_fence);
+}
+
+/** DRI2 fence extension, version 2, built from the functions above. */
+const __DRI2fenceExtension dri2FenceExtension = {
+   .base = { __DRI2_FENCE, 2 },
+
+   .create_fence = dri2_create_fence,
+   .get_fence_from_cl_event = dri2_get_fence_from_cl_event,
+   .destroy_fence = dri2_destroy_fence,
+   .client_wait_sync = dri2_client_wait_sync,
+   .server_wait_sync = dri2_server_wait_sync,
+   .get_capabilities = dri2_fence_get_caps,
+   .create_fence_fd = dri2_create_fence_fd,
+   .get_fence_fd = dri2_get_fence_fd,
+};
+
+/* vim: set sw=3 ts=8 sts=3 expandtab: */
diff --git a/src/gallium/state_trackers/dri/dri_extensions.h b/src/gallium/state_trackers/dri/dri_extensions.h
new file mode 100644
index 0000000..89b01cd
--- /dev/null
+++ b/src/gallium/state_trackers/dri/dri_extensions.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef DRI_EXTENSIONS_H
+#define DRI_EXTENSIONS_H
+
+extern const __DRI2fenceExtension dri2FenceExtension;
+
+#endif
+
+/* vim: set sw=3 ts=8 sts=3 expandtab: */
diff --git a/src/gallium/state_trackers/dri/dri_screen.c b/src/gallium/state_trackers/dri/dri_screen.c
index 998e8ef..6bd4790 100644
--- a/src/gallium/state_trackers/dri/dri_screen.c
+++ b/src/gallium/state_trackers/dri/dri_screen.c
@@ -56,6 +56,9 @@
    DRI_CONF_BEGIN
       DRI_CONF_SECTION_PERFORMANCE
          DRI_CONF_MESA_GLTHREAD("false")
+         DRI_CONF_MESA_NO_ERROR("false")
+         DRI_CONF_DISABLE_EXT_BUFFER_AGE("false")
+         DRI_CONF_DISABLE_OML_SYNC_CONTROL("false")
       DRI_CONF_SECTION_END
 
       DRI_CONF_SECTION_QUALITY
@@ -75,8 +78,10 @@
          DRI_CONF_DISABLE_SHADER_BIT_ENCODING("false")
          DRI_CONF_FORCE_GLSL_VERSION(0)
          DRI_CONF_ALLOW_GLSL_EXTENSION_DIRECTIVE_MIDSHADER("false")
+         DRI_CONF_ALLOW_GLSL_BUILTIN_VARIABLE_REDECLARATION("false")
          DRI_CONF_ALLOW_HIGHER_COMPAT_VERSION("false")
          DRI_CONF_FORCE_GLSL_ABS_SQRT("false")
+         DRI_CONF_GLSL_CORRECT_DERIVATIVES_AFTER_DISCARD("false")
       DRI_CONF_SECTION_END
 
       DRI_CONF_SECTION_MISCELLANEOUS
@@ -108,6 +113,8 @@
       driQueryOptionb(optionCache, "force_s3tc_enable");
    options->allow_glsl_extension_directive_midshader =
       driQueryOptionb(optionCache, "allow_glsl_extension_directive_midshader");
+   options->allow_glsl_builtin_variable_redeclaration =
+      driQueryOptionb(optionCache, "allow_glsl_builtin_variable_redeclaration");
    options->allow_higher_compat_version =
       driQueryOptionb(optionCache, "allow_higher_compat_version");
    options->glsl_zero_init = driQueryOptionb(optionCache, "glsl_zero_init");
@@ -126,6 +133,33 @@
       MESA_FORMAT_B8G8R8A8_SRGB,
       MESA_FORMAT_B8G8R8X8_SRGB,
       MESA_FORMAT_B5G6R5_UNORM,
+#ifdef ANDROID
+      /*
+       * To reduce the risk of breaking non-Android users in stable release
+       * let's keep these for Android alone until this is handled properly.
+       */
+
+      /* The 32-bit RGBA format must not precede the 32-bit BGRA format.
+       * Likewise for RGBX and BGRX.  Otherwise, the GLX client and the GLX
+       * server may disagree on which format the GLXFBConfig represents,
+       * resulting in swapped color channels.
+       *
+       * The problem, as of 2017-05-30:
+       * When matching a GLXFBConfig to a __DRIconfig, GLX ignores the channel
+       * order and chooses the first __DRIconfig with the expected channel
+       * sizes. Specifically, GLX compares the GLXFBConfig's and __DRIconfig's
+       * __DRI_ATTRIB_{CHANNEL}_SIZE but ignores __DRI_ATTRIB_{CHANNEL}_MASK.
+       *
+       * EGL does not suffer from this problem. It correctly compares the
+       * channel masks when matching EGLConfig to __DRIconfig.
+       */
+
+      /* Required by Android, for HAL_PIXEL_FORMAT_RGBA_8888. */
+      MESA_FORMAT_R8G8B8A8_UNORM,
+
+      /* Required by Android, for HAL_PIXEL_FORMAT_RGBX_8888. */
+      MESA_FORMAT_R8G8B8X8_UNORM,
+#endif
    };
    static const enum pipe_format pipe_formats[] = {
       PIPE_FORMAT_BGRA8888_UNORM,
@@ -133,6 +167,14 @@
       PIPE_FORMAT_BGRA8888_SRGB,
       PIPE_FORMAT_BGRX8888_SRGB,
       PIPE_FORMAT_B5G6R5_UNORM,
+#ifdef ANDROID
+      /*
+       * To reduce the risk of breaking non-Android users in stable release
+       * let's keep these for Android alone until this is handled properly.
+       */
+      PIPE_FORMAT_RGBA8888_UNORM,
+      PIPE_FORMAT_RGBX8888_UNORM,
+#endif
    };
    mesa_format format;
    __DRIconfig **configs = NULL;
@@ -269,19 +311,41 @@
    if (!mode)
       return;
 
-   if (mode->redBits == 8) {
-      if (mode->alphaBits == 8)
-         if (mode->sRGBCapable)
-            stvis->color_format = PIPE_FORMAT_BGRA8888_SRGB;
-         else
-            stvis->color_format = PIPE_FORMAT_BGRA8888_UNORM;
-      else
-         if (mode->sRGBCapable)
-            stvis->color_format = PIPE_FORMAT_BGRX8888_SRGB;
-         else
-            stvis->color_format = PIPE_FORMAT_BGRX8888_UNORM;
-   } else {
+   /* Deduce the color format. */
+   switch (mode->redMask) {
+   case 0x00FF0000:
+      if (mode->alphaMask) {
+         assert(mode->alphaMask == 0xFF000000);
+         stvis->color_format = mode->sRGBCapable ?
+                                  PIPE_FORMAT_BGRA8888_SRGB :
+                                  PIPE_FORMAT_BGRA8888_UNORM;
+      } else {
+         stvis->color_format = mode->sRGBCapable ?
+                                  PIPE_FORMAT_BGRX8888_SRGB :
+                                  PIPE_FORMAT_BGRX8888_UNORM;
+      }
+      break;
+
+   case 0x000000FF:
+      if (mode->alphaMask) {
+         assert(mode->alphaMask == 0xFF000000);
+         stvis->color_format = mode->sRGBCapable ?
+                                  PIPE_FORMAT_RGBA8888_SRGB :
+                                  PIPE_FORMAT_RGBA8888_UNORM;
+      } else {
+         stvis->color_format = mode->sRGBCapable ?
+                                  PIPE_FORMAT_RGBX8888_SRGB :
+                                  PIPE_FORMAT_RGBX8888_UNORM;
+      }
+      break;
+
+   case 0x0000F800:
       stvis->color_format = PIPE_FORMAT_B5G6R5_UNORM;
+      break;
+
+   default:
+      assert(!"unsupported visual: invalid red mask");
+      return;
    }
 
    if (mode->sampleBuffers) {
@@ -405,6 +469,9 @@
 void
 dri_destroy_screen_helper(struct dri_screen * screen)
 {
+   if (screen->base.destroy)
+      screen->base.destroy(&screen->base);
+
    if (screen->st_api && screen->st_api->destroy)
       screen->st_api->destroy(screen->st_api);
 
@@ -441,7 +508,8 @@
 }
 
 static void
-dri_set_background_context(struct st_context_iface *st)
+dri_set_background_context(struct st_context_iface *st,
+                           struct util_queue_monitoring *queue_info)
 {
    struct dri_context *ctx = (struct dri_context *)st->st_manager_private;
    const __DRIbackgroundCallableExtension *backgroundCallable =
@@ -453,12 +521,34 @@
     */
    assert(backgroundCallable);
    backgroundCallable->setBackgroundContext(ctx->cPriv->loaderPrivate);
+
+   if (ctx->hud)
+      hud_add_queue_for_monitoring(ctx->hud, queue_info);
+}
+
+unsigned
+dri_init_options_get_screen_flags(struct dri_screen *screen,
+                                  const char* driver_name)
+{
+   unsigned flags = 0;
+
+   driParseOptionInfo(&screen->optionCacheDefaults, gallium_config_options.xml);
+   driParseConfigFiles(&screen->optionCache,
+                       &screen->optionCacheDefaults,
+                       screen->sPriv->myNum,
+                       driver_name);
+   dri_fill_st_options(screen);
+
+   if (driQueryOptionb(&screen->optionCache,
+                       "glsl_correct_derivatives_after_discard"))
+      flags |= PIPE_SCREEN_ENABLE_CORRECT_TGSI_DERIVATIVES_AFTER_KILL;
+
+   return flags;
 }
 
 const __DRIconfig **
 dri_init_screen_helper(struct dri_screen *screen,
-                       struct pipe_screen *pscreen,
-                       const char* driver_name)
+                       struct pipe_screen *pscreen)
 {
    screen->base.screen = pscreen;
    screen->base.get_egl_image = dri_get_egl_image;
@@ -474,15 +564,6 @@
    else
       screen->target = PIPE_TEXTURE_RECT;
 
-   driParseOptionInfo(&screen->optionCacheDefaults, gallium_config_options.xml);
-
-   driParseConfigFiles(&screen->optionCache,
-                       &screen->optionCacheDefaults,
-                       screen->sPriv->myNum,
-                       driver_name);
-
-   dri_fill_st_options(screen);
-
    /* Handle force_s3tc_enable. */
    if (!util_format_s3tc_enabled && screen->options.force_s3tc_enable) {
       /* Ensure libtxc_dxtn has been loaded if available.
diff --git a/src/gallium/state_trackers/dri/dri_screen.h b/src/gallium/state_trackers/dri/dri_screen.h
index 7f5fd13..550bc51 100644
--- a/src/gallium/state_trackers/dri/dri_screen.h
+++ b/src/gallium/state_trackers/dri/dri_screen.h
@@ -137,10 +137,13 @@
 dri_fill_st_visual(struct st_visual *stvis, struct dri_screen *screen,
                    const struct gl_config *mode);
 
+unsigned
+dri_init_options_get_screen_flags(struct dri_screen *screen,
+                                  const char* driver_name);
+
 const __DRIconfig **
 dri_init_screen_helper(struct dri_screen *screen,
-                       struct pipe_screen *pscreen,
-                       const char* driver_name);
+                       struct pipe_screen *pscreen);
 
 void
 dri_destroy_screen_helper(struct dri_screen * screen);
diff --git a/src/gallium/state_trackers/dri/drisw.c b/src/gallium/state_trackers/dri/drisw.c
index b85a73c..ac40956 100644
--- a/src/gallium/state_trackers/dri/drisw.c
+++ b/src/gallium/state_trackers/dri/drisw.c
@@ -46,6 +46,7 @@
 #include "dri_screen.h"
 #include "dri_context.h"
 #include "dri_drawable.h"
+#include "dri_extensions.h"
 #include "dri_query_renderer.h"
 
 DEBUG_GET_ONCE_BOOL_OPTION(swrast_no_present, "SWRAST_NO_PRESENT", FALSE);
@@ -369,6 +370,8 @@
    &driTexBufferExtension.base,
    &dri2RendererQueryExtension.base,
    &dri2ConfigQueryExtension.base,
+   &dri2FenceExtension.base,
+   &dri2NoErrorExtension.base,
    NULL
 };
 
@@ -397,13 +400,15 @@
    sPriv->driverPrivate = (void *)screen;
    sPriv->extensions = drisw_screen_extensions;
 
+   unsigned flags = dri_init_options_get_screen_flags(screen, "swrast");
+
    if (pipe_loader_sw_probe_dri(&screen->dev, &drisw_lf))
-      pscreen = pipe_loader_create_screen(screen->dev);
+      pscreen = pipe_loader_create_screen(screen->dev, flags);
 
    if (!pscreen)
       goto fail;
 
-   configs = dri_init_screen_helper(screen, pscreen, "swrast");
+   configs = dri_init_screen_helper(screen, pscreen);
    if (!configs)
       goto fail;
 
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.c b/src/gallium/state_trackers/glx/xlib/xm_api.c
index 881dd44..828253b 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.c
@@ -181,6 +181,9 @@
     *    xmdpy->screen->destroy(xmdpy->screen);
     * }
     */
+
+   if (xmdpy->smapi->destroy)
+      xmdpy->smapi->destroy(xmdpy->smapi);
    free(xmdpy->smapi);
 
    XFree((char *) info);
@@ -595,6 +598,11 @@
           */
          b->ws.drawable = 0;
 
+         /* Notify the st manager that the associated framebuffer interface
+          * object is no longer valid.
+          */
+         stapi->destroy_drawable(stapi, buffer->stfb);
+
          /* XXX we should move the buffer to a delete-pending list and destroy
           * the buffer until it is no longer current.
           */
diff --git a/src/gallium/state_trackers/glx/xlib/xm_st.c b/src/gallium/state_trackers/glx/xlib/xm_st.c
index 9e30efa..946b5dc 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_st.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_st.c
@@ -245,10 +245,8 @@
       }
    }
 
-   for (i = 0; i < count; i++) {
-      out[i] = NULL;
+   for (i = 0; i < count; i++)
       pipe_resource_reference(&out[i], xstfb->textures[statts[i]]);
-   }
 
    return TRUE;
 }
@@ -273,6 +271,7 @@
    return ret;
 }
 
+static uint32_t xmesa_stfbi_ID = 0;
 
 struct st_framebuffer_iface *
 xmesa_create_st_framebuffer(XMesaDisplay xmdpy, XMesaBuffer b)
@@ -302,6 +301,8 @@
    stfbi->visual = &xstfb->stvis;
    stfbi->flush_front = xmesa_st_framebuffer_flush_front;
    stfbi->validate = xmesa_st_framebuffer_validate;
+   stfbi->ID = p_atomic_inc_return(&xmesa_stfbi_ID);
+   stfbi->state_manager = xmdpy->smapi;
    p_atomic_set(&stfbi->stamp, 1);
    stfbi->st_manager_private = (void *) xstfb;
 
diff --git a/src/gallium/state_trackers/hgl/hgl.c b/src/gallium/state_trackers/hgl/hgl.c
index 1b70281..bbc477a 100644
--- a/src/gallium/state_trackers/hgl/hgl.c
+++ b/src/gallium/state_trackers/hgl/hgl.c
@@ -193,10 +193,8 @@
 		//}
 	}
 
-	for (i = 0; i < count; i++) {
-		out[i] = NULL;
+	for (i = 0; i < count; i++)
 		pipe_resource_reference(&out[i], buffer->textures[statts[i]]);
-	}
 
 	return TRUE;
 }
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index 4943658..88df38c 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -2815,26 +2815,27 @@
 
     vtxbuf.stride = VertexStreamZeroStride;
     vtxbuf.buffer_offset = 0;
-    vtxbuf.buffer = NULL;
-    vtxbuf.user_buffer = pVertexStreamZeroData;
+    vtxbuf.is_user_buffer = true;
+    vtxbuf.buffer.user = pVertexStreamZeroData;
 
     if (!This->driver_caps.user_vbufs) {
+        vtxbuf.is_user_buffer = false;
+        vtxbuf.buffer.resource = NULL;
         u_upload_data(This->vertex_uploader,
                       0,
                       (prim_count_to_vertex_count(PrimitiveType, PrimitiveCount)) * VertexStreamZeroStride, /* XXX */
                       4,
-                      vtxbuf.user_buffer,
+                      pVertexStreamZeroData,
                       &vtxbuf.buffer_offset,
-                      &vtxbuf.buffer);
+                      &vtxbuf.buffer.resource);
         u_upload_unmap(This->vertex_uploader);
-        vtxbuf.user_buffer = NULL;
     }
 
     NineBeforeDraw(This);
     nine_context_draw_primitive_from_vtxbuf(This, PrimitiveType, PrimitiveCount, &vtxbuf);
     NineAfterDraw(This);
 
-    pipe_resource_reference(&vtxbuf.buffer, NULL);
+    pipe_vertex_buffer_unreference(&vtxbuf);
 
     NineDevice9_PauseRecording(This);
     NineDevice9_SetStreamSource(This, 0, NULL, 0, 0);
@@ -2855,7 +2856,6 @@
                                     UINT VertexStreamZeroStride )
 {
     struct pipe_vertex_buffer vbuf;
-    struct pipe_index_buffer ibuf;
 
     DBG("iface %p, PrimitiveType %u, MinVertexIndex %u, NumVertices %u "
         "PrimitiveCount %u, pIndexData %p, IndexDataFormat %u "
@@ -2872,38 +2872,38 @@
 
     vbuf.stride = VertexStreamZeroStride;
     vbuf.buffer_offset = 0;
-    vbuf.buffer = NULL;
-    vbuf.user_buffer = pVertexStreamZeroData;
+    vbuf.is_user_buffer = true;
+    vbuf.buffer.user = pVertexStreamZeroData;
 
-    ibuf.index_size = (IndexDataFormat == D3DFMT_INDEX16) ? 2 : 4;
-    ibuf.offset = 0;
-    ibuf.buffer = NULL;
-    ibuf.user_buffer = pIndexData;
+    unsigned index_size = (IndexDataFormat == D3DFMT_INDEX16) ? 2 : 4;
+    struct pipe_resource *ibuf = NULL;
 
     if (!This->driver_caps.user_vbufs) {
         const unsigned base = MinVertexIndex * VertexStreamZeroStride;
+        vbuf.is_user_buffer = false;
+        vbuf.buffer.resource = NULL;
         u_upload_data(This->vertex_uploader,
                       base,
                       NumVertices * VertexStreamZeroStride, /* XXX */
                       4,
-                      (const uint8_t *)vbuf.user_buffer + base,
+                      (const uint8_t *)pVertexStreamZeroData + base,
                       &vbuf.buffer_offset,
-                      &vbuf.buffer);
+                      &vbuf.buffer.resource);
         u_upload_unmap(This->vertex_uploader);
         /* Won't be used: */
         vbuf.buffer_offset -= base;
-        vbuf.user_buffer = NULL;
     }
+
+    unsigned index_offset = 0;
     if (This->csmt_active) {
         u_upload_data(This->pipe_secondary->stream_uploader,
                       0,
-                      (prim_count_to_vertex_count(PrimitiveType, PrimitiveCount)) * ibuf.index_size,
+                      (prim_count_to_vertex_count(PrimitiveType, PrimitiveCount)) * index_size,
                       4,
-                      ibuf.user_buffer,
-                      &ibuf.offset,
-                      &ibuf.buffer);
+                      pIndexData,
+                      &index_offset,
+                      &ibuf);
         u_upload_unmap(This->pipe_secondary->stream_uploader);
-        ibuf.user_buffer = NULL;
     }
 
     NineBeforeDraw(This);
@@ -2912,11 +2912,14 @@
                                                            NumVertices,
                                                            PrimitiveCount,
                                                            &vbuf,
-                                                           &ibuf);
+                                                           ibuf,
+                                                           ibuf ? NULL : (void*)pIndexData,
+                                                           index_offset,
+                                                           index_size);
     NineAfterDraw(This);
 
-    pipe_resource_reference(&vbuf.buffer, NULL);
-    pipe_resource_reference(&ibuf.buffer, NULL);
+    pipe_vertex_buffer_unreference(&vbuf);
+    pipe_resource_reference(&ibuf, NULL);
 
     NineDevice9_PauseRecording(This);
     NineDevice9_SetIndices(This, NULL);
@@ -3029,9 +3032,8 @@
     draw.restart_index = 0;
     draw.count_from_stream_output = NULL;
     draw.indirect = NULL;
-    draw.indirect_params = NULL;
     draw.instance_count = 1;
-    draw.indexed = FALSE;
+    draw.index_size = 0;
     draw.start = 0;
     draw.index_bias = 0;
     draw.min_index = 0;
diff --git a/src/gallium/state_trackers/nine/indexbuffer9.c b/src/gallium/state_trackers/nine/indexbuffer9.c
index cbd75fb..e73d29b 100644
--- a/src/gallium/state_trackers/nine/indexbuffer9.c
+++ b/src/gallium/state_trackers/nine/indexbuffer9.c
@@ -49,17 +49,13 @@
     if (FAILED(hr))
         return hr;
 
-    This->buffer.buffer = NULL;
-    This->buffer.offset = 0;
-
     switch (pDesc->Format) {
-    case D3DFMT_INDEX16: This->buffer.index_size = 2; break;
-    case D3DFMT_INDEX32: This->buffer.index_size = 4; break;
+    case D3DFMT_INDEX16: This->index_size = 2; break;
+    case D3DFMT_INDEX32: This->index_size = 4; break;
     default:
         user_assert(!"Invalid index format.", D3DERR_INVALIDCALL);
         break;
     }
-    This->buffer.user_buffer = NULL;
 
     pDesc->Type = D3DRTYPE_INDEXBUFFER;
     This->desc = *pDesc;
@@ -73,12 +69,11 @@
     NineBuffer9_dtor(&This->base);
 }
 
-const struct pipe_index_buffer *
-NineIndexBuffer9_GetBuffer( struct NineIndexBuffer9 *This )
+struct pipe_resource *
+NineIndexBuffer9_GetBuffer( struct NineIndexBuffer9 *This, unsigned *offset )
 {
     /* The resource may change */
-    This->buffer.buffer = NineBuffer9_GetResource(&This->base, &This->buffer.offset);
-    return &This->buffer;
+    return NineBuffer9_GetResource(&This->base, offset);
 }
 
 HRESULT NINE_WINAPI
diff --git a/src/gallium/state_trackers/nine/indexbuffer9.h b/src/gallium/state_trackers/nine/indexbuffer9.h
index e695082..e688488 100644
--- a/src/gallium/state_trackers/nine/indexbuffer9.h
+++ b/src/gallium/state_trackers/nine/indexbuffer9.h
@@ -29,7 +29,6 @@
 
 struct pipe_screen;
 struct pipe_context;
-struct pipe_index_buffer;
 struct pipe_transfer;
 struct NineDevice9;
 
@@ -38,7 +37,7 @@
     struct NineBuffer9 base;
 
     /* g3d stuff */
-    struct pipe_index_buffer buffer;
+    unsigned index_size;
 
     D3DINDEXBUFFER_DESC desc;
 };
@@ -63,8 +62,9 @@
 
 /*** Nine private ***/
 
-const struct pipe_index_buffer *
-NineIndexBuffer9_GetBuffer( struct NineIndexBuffer9 *This );
+struct pipe_resource *
+NineIndexBuffer9_GetBuffer( struct NineIndexBuffer9 *This,
+                            unsigned *offset );
 
 /*** Direct3D public ***/
 
diff --git a/src/gallium/state_trackers/nine/nine_csmt_helper.h b/src/gallium/state_trackers/nine/nine_csmt_helper.h
index dc46bbd..7286cc3 100644
--- a/src/gallium/state_trackers/nine/nine_csmt_helper.h
+++ b/src/gallium/state_trackers/nine/nine_csmt_helper.h
@@ -402,7 +402,18 @@
         ,\
         y
 
-#define ARG_BIND_BUF(x, y) \
+#define ARG_BIND_VBUF(x, y) \
+        x _##y ,\
+        memcpy(&args->_##y , y, sizeof(x)); \
+        args->_##y.buffer.resource = NULL; \
+        pipe_resource_reference(&args->_##y.buffer.resource, y->buffer.resource); ,\
+        x *y ,\
+        &args->_##y ,\
+        pipe_resource_reference(&args->_##y.buffer.resource, NULL); ,\
+        ,\
+        y
+
+#define ARG_BIND_IBUF(x, y) \
         x _##y ,\
         memcpy(&args->_##y , y, sizeof(x)); \
         args->_##y.buffer = NULL; \
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index 40fb6be..f405090 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -1879,7 +1879,7 @@
     struct ureg_dst tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
     src[0] = tx_src_param(tx, &tx->insn.src[0]);
     src[1] = tx_src_param(tx, &tx->insn.src[1]);
-    ureg_insn(tx->ureg, cmp_op, &tmp, 1, src, 2);
+    ureg_insn(tx->ureg, cmp_op, &tmp, 1, src, 2, 0);
     ureg_IF(tx->ureg, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), tx_cond(tx));
     return D3D_OK;
 }
@@ -1897,7 +1897,7 @@
     struct ureg_dst tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
     src[0] = tx_src_param(tx, &tx->insn.src[0]);
     src[1] = tx_src_param(tx, &tx->insn.src[1]);
-    ureg_insn(tx->ureg, cmp_op, &tmp, 1, src, 2);
+    ureg_insn(tx->ureg, cmp_op, &tmp, 1, src, 2, 0);
     ureg_IF(tx->ureg, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), tx_cond(tx));
     ureg_BRK(tx->ureg);
     tx_endcond(tx);
@@ -3029,7 +3029,7 @@
 
     ureg_insn(tx->ureg, tx->insn.info->opcode,
               dst, tx->insn.ndst,
-              src, tx->insn.nsrc);
+              src, tx->insn.nsrc, 0);
     return D3D_OK;
 }
 
diff --git a/src/gallium/state_trackers/nine/nine_state.c b/src/gallium/state_trackers/nine/nine_state.c
index 26c21f2..a9a41af 100644
--- a/src/gallium/state_trackers/nine/nine_state.c
+++ b/src/gallium/state_trackers/nine/nine_state.c
@@ -293,7 +293,7 @@
     if (!device->csmt_active)
         return device->context.pipe;
 
-    if (!pipe_thread_is_self(ctx->worker))
+    if (!u_thread_is_self(ctx->worker))
         nine_csmt_process(device);
 
     return device->context.pipe;
@@ -899,9 +899,9 @@
 
     if (context->dummy_vbo_bound_at >= 0) {
         if (!context->vbo_bound_done) {
-            dummy_vtxbuf.buffer = device->dummy_vbo;
+            dummy_vtxbuf.buffer.resource = device->dummy_vbo;
             dummy_vtxbuf.stride = 0;
-            dummy_vtxbuf.user_buffer = NULL;
+            dummy_vtxbuf.is_user_buffer = false;
             dummy_vtxbuf.buffer_offset = 0;
             pipe->set_vertex_buffers(pipe, context->dummy_vbo_bound_at,
                                      1, &dummy_vtxbuf);
@@ -912,7 +912,7 @@
 
     for (i = 0; mask; mask >>= 1, ++i) {
         if (mask & 1) {
-            if (context->vtxbuf[i].buffer)
+            if (context->vtxbuf[i].buffer.resource)
                 pipe->set_vertex_buffers(pipe, i, 1, &context->vtxbuf[i]);
             else
                 pipe->set_vertex_buffers(pipe, i, 1, NULL);
@@ -1100,17 +1100,6 @@
 }
 
 static inline void
-commit_index_buffer(struct NineDevice9 *device)
-{
-    struct nine_context *context = &device->context;
-    struct pipe_context *pipe = context->pipe;
-    if (context->idxbuf.buffer)
-        pipe->set_index_buffer(pipe, &context->idxbuf);
-    else
-        pipe->set_index_buffer(pipe, NULL);
-}
-
-static inline void
 commit_vs_constants(struct NineDevice9 *device)
 {
     struct nine_context *context = &device->context;
@@ -1235,8 +1224,6 @@
             update_viewport(device);
         if (group & (NINE_STATE_VDECL | NINE_STATE_VS | NINE_STATE_STREAMFREQ))
             update_vertex_elements(device);
-        if (group & NINE_STATE_IDXBUF)
-            commit_index_buffer(device);
     }
 
     if (likely(group & (NINE_STATE_FREQUENT | NINE_STATE_VS | NINE_STATE_PS | NINE_STATE_SWVP))) {
@@ -1526,7 +1513,7 @@
 
     context->vtxbuf[i].stride = Stride;
     context->vtxbuf[i].buffer_offset = OffsetInBytes;
-    pipe_resource_reference(&context->vtxbuf[i].buffer, res);
+    pipe_resource_reference(&context->vtxbuf[i].buffer.resource, res);
 
     context->changed.vtxbuf |= 1 << StreamNumber;
 }
@@ -1575,10 +1562,9 @@
 {
     struct nine_context *context = &device->context;
 
-    context->idxbuf.index_size = IndexSize;
-    context->idxbuf.offset = OffsetInBytes;
-    pipe_resource_reference(&context->idxbuf.buffer, res);
-    context->idxbuf.user_buffer = NULL;
+    context->index_size = IndexSize;
+    context->index_offset = OffsetInBytes;
+    pipe_resource_reference(&context->idxbuf, res);
 
     context->changed.group |= NINE_STATE_IDXBUF;
 }
@@ -1587,16 +1573,13 @@
 nine_context_set_indices(struct NineDevice9 *device,
                          struct NineIndexBuffer9 *idxbuf)
 {
-    const struct pipe_index_buffer *pipe_idxbuf;
     struct pipe_resource *res = NULL;
     UINT IndexSize = 0;
-    UINT OffsetInBytes = 0;
+    unsigned OffsetInBytes = 0;
 
     if (idxbuf) {
-        pipe_idxbuf = NineIndexBuffer9_GetBuffer(idxbuf);
-        IndexSize = pipe_idxbuf->index_size;
-        res = pipe_idxbuf->buffer;
-        OffsetInBytes = pipe_idxbuf->offset;
+        res = NineIndexBuffer9_GetBuffer(idxbuf, &OffsetInBytes);
+        IndexSize = idxbuf->index_size;
     }
 
     nine_context_set_indices_apply(device, res, IndexSize, OffsetInBytes);
@@ -2556,10 +2539,10 @@
     if (dev->context.stream_instancedata_mask & dev->context.stream_usage_mask)
         info->instance_count = MAX2(dev->context.stream_freq[0] & 0x7FFFFF, 1);
     info->primitive_restart = FALSE;
+    info->has_user_indices = FALSE;
     info->restart_index = 0;
     info->count_from_stream_output = NULL;
     info->indirect = NULL;
-    info->indirect_params = NULL;
 }
 
 CSMT_ITEM_NO_WAIT(nine_context_draw_primitive,
@@ -2573,22 +2556,23 @@
     nine_update_state(device);
 
     init_draw_info(&info, device, PrimitiveType, PrimitiveCount);
-    info.indexed = FALSE;
+    info.index_size = 0;
     info.start = StartVertex;
     info.index_bias = 0;
     info.min_index = info.start;
     info.max_index = info.count - 1;
+    info.index.resource = NULL;
 
     context->pipe->draw_vbo(context->pipe, &info);
 }
 
 CSMT_ITEM_NO_WAIT(nine_context_draw_indexed_primitive,
                   ARG_VAL(D3DPRIMITIVETYPE, PrimitiveType),
-                   ARG_VAL(INT, BaseVertexIndex),
-                   ARG_VAL(UINT, MinVertexIndex),
-                   ARG_VAL(UINT, NumVertices),
-                   ARG_VAL(UINT, StartIndex),
-                   ARG_VAL(UINT, PrimitiveCount))
+                  ARG_VAL(INT, BaseVertexIndex),
+                  ARG_VAL(UINT, MinVertexIndex),
+                  ARG_VAL(UINT, NumVertices),
+                  ARG_VAL(UINT, StartIndex),
+                  ARG_VAL(UINT, PrimitiveCount))
 {
     struct nine_context *context = &device->context;
     struct pipe_draw_info info;
@@ -2596,12 +2580,13 @@
     nine_update_state(device);
 
     init_draw_info(&info, device, PrimitiveType, PrimitiveCount);
-    info.indexed = TRUE;
-    info.start = StartIndex;
+    info.index_size = context->index_size;
+    info.start = context->index_offset / context->index_size + StartIndex;
     info.index_bias = BaseVertexIndex;
     /* These don't include index bias: */
     info.min_index = MinVertexIndex;
     info.max_index = MinVertexIndex + NumVertices - 1;
+    info.index.resource = context->idxbuf;
 
     context->pipe->draw_vbo(context->pipe, &info);
 }
@@ -2609,7 +2594,7 @@
 CSMT_ITEM_NO_WAIT(nine_context_draw_primitive_from_vtxbuf,
                   ARG_VAL(D3DPRIMITIVETYPE, PrimitiveType),
                   ARG_VAL(UINT, PrimitiveCount),
-                  ARG_BIND_BUF(struct pipe_vertex_buffer, vtxbuf))
+                  ARG_BIND_VBUF(struct pipe_vertex_buffer, vtxbuf))
 {
     struct nine_context *context = &device->context;
     struct pipe_draw_info info;
@@ -2617,11 +2602,12 @@
     nine_update_state(device);
 
     init_draw_info(&info, device, PrimitiveType, PrimitiveCount);
-    info.indexed = FALSE;
+    info.index_size = 0;
     info.start = 0;
     info.index_bias = 0;
     info.min_index = 0;
     info.max_index = info.count - 1;
+    info.index.resource = NULL;
 
     context->pipe->set_vertex_buffers(context->pipe, 0, 1, vtxbuf);
 
@@ -2633,8 +2619,11 @@
                   ARG_VAL(UINT, MinVertexIndex),
                   ARG_VAL(UINT, NumVertices),
                   ARG_VAL(UINT, PrimitiveCount),
-                  ARG_BIND_BUF(struct pipe_vertex_buffer, vbuf),
-                  ARG_BIND_BUF(struct pipe_index_buffer, ibuf))
+                  ARG_BIND_VBUF(struct pipe_vertex_buffer, vbuf),
+                  ARG_BIND_RES(struct pipe_resource, ibuf),
+                  ARG_VAL(void *, user_ibuf),
+                  ARG_VAL(UINT, index_offset),
+                  ARG_VAL(UINT, index_size))
 {
     struct nine_context *context = &device->context;
     struct pipe_draw_info info;
@@ -2642,13 +2631,18 @@
     nine_update_state(device);
 
     init_draw_info(&info, device, PrimitiveType, PrimitiveCount);
-    info.indexed = TRUE;
-    info.start = 0;
+    info.index_size = index_size;
+    info.start = index_offset / info.index_size;
     info.index_bias = 0;
     info.min_index = MinVertexIndex;
     info.max_index = MinVertexIndex + NumVertices - 1;
+    info.has_user_indices = ibuf == NULL;
+    if (ibuf)
+        info.index.resource = ibuf;
+    else
+        info.index.user = user_ibuf;
+
     context->pipe->set_vertex_buffers(context->pipe, 0, 1, vbuf);
-    context->pipe->set_index_buffer(context->pipe, ibuf);
 
     context->pipe->draw_vbo(context->pipe, &info);
 }
@@ -3136,7 +3130,6 @@
     cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, 0, NULL);
 
     pipe->set_vertex_buffers(pipe, 0, device->caps.MaxStreams, NULL);
-    pipe->set_index_buffer(pipe, NULL);
 
     for (i = 0; i < ARRAY_SIZE(context->rt); ++i)
        nine_bind(&context->rt[i], NULL);
@@ -3145,8 +3138,8 @@
     nine_bind(&context->ps, NULL);
     nine_bind(&context->vdecl, NULL);
     for (i = 0; i < PIPE_MAX_ATTRIBS; ++i)
-        pipe_resource_reference(&context->vtxbuf[i].buffer, NULL);
-    pipe_resource_reference(&context->idxbuf.buffer, NULL);
+        pipe_vertex_buffer_unreference(&context->vtxbuf[i]);
+    pipe_resource_reference(&context->idxbuf, NULL);
 
     for (i = 0; i < NINE_MAX_SAMPLERS; ++i) {
         context->texture[i].enabled = FALSE;
@@ -3283,33 +3276,36 @@
                 unsigned offset;
                 struct pipe_resource *buf;
                 struct pipe_box box;
+                void *userbuf;
 
                 vtxbuf = state->vtxbuf[i];
-                vtxbuf.buffer = NineVertexBuffer9_GetResource(state->stream[i], &offset);
+                buf = NineVertexBuffer9_GetResource(state->stream[i], &offset);
 
-                DBG("Locking %p (offset %d, length %d)\n", vtxbuf.buffer,
+                DBG("Locking %p (offset %d, length %d)\n", buf,
                     vtxbuf.buffer_offset, num_vertices * vtxbuf.stride);
 
                 u_box_1d(vtxbuf.buffer_offset + offset + start_vertice * vtxbuf.stride,
                          num_vertices * vtxbuf.stride, &box);
-                buf = vtxbuf.buffer;
-                vtxbuf.user_buffer = pipe->transfer_map(pipe, buf, 0, PIPE_TRANSFER_READ, &box,
-                                                        &(sw_internal->transfers_so[i]));
-                vtxbuf.buffer = NULL;
+
+                userbuf = pipe->transfer_map(pipe, buf, 0, PIPE_TRANSFER_READ, &box,
+                                             &(sw_internal->transfers_so[i]));
+                vtxbuf.is_user_buffer = true;
+                vtxbuf.buffer.user = userbuf;
+
                 if (!device->driver_caps.user_sw_vbufs) {
+                    vtxbuf.buffer.resource = NULL;
+                    vtxbuf.is_user_buffer = false;
                     u_upload_data(device->pipe_sw->stream_uploader,
                                   0,
                                   box.width,
                                   16,
-                                  vtxbuf.user_buffer,
+                                  userbuf,
                                   &(vtxbuf.buffer_offset),
-                                  &(vtxbuf.buffer));
+                                  &(vtxbuf.buffer.resource));
                     u_upload_unmap(device->pipe_sw->stream_uploader);
-                    vtxbuf.user_buffer = NULL;
                 }
                 pipe_sw->set_vertex_buffers(pipe_sw, i, 1, &vtxbuf);
-                if (vtxbuf.buffer)
-                    pipe_resource_reference(&vtxbuf.buffer, NULL);
+                pipe_vertex_buffer_unreference(&vtxbuf);
             } else
                 pipe_sw->set_vertex_buffers(pipe_sw, i, 1, NULL);
         }
diff --git a/src/gallium/state_trackers/nine/nine_state.h b/src/gallium/state_trackers/nine/nine_state.h
index a487d8c..f5fd1ef 100644
--- a/src/gallium/state_trackers/nine/nine_state.h
+++ b/src/gallium/state_trackers/nine/nine_state.h
@@ -271,7 +271,9 @@
     uint32_t stream_instancedata_mask; /* derived from stream_freq */
     uint32_t stream_usage_mask; /* derived from VS and vdecl */
 
-    struct pipe_index_buffer idxbuf;
+    struct pipe_resource *idxbuf;
+    unsigned index_offset;
+    unsigned index_size;
 
     struct pipe_clip_state clip;
 
@@ -515,7 +517,10 @@
                                                        UINT NumVertices,
                                                        UINT PrimitiveCount,
                                                        struct pipe_vertex_buffer *vbuf,
-                                                       struct pipe_index_buffer *ibuf);
+                                                       struct pipe_resource *ibuf,
+                                                       void *user_ibuf,
+                                                       unsigned index_offset,
+						       unsigned index_size);
 
 void
 nine_context_resource_copy_region(struct NineDevice9 *device,
diff --git a/src/gallium/state_trackers/omx/entrypoint.c b/src/gallium/state_trackers/omx/entrypoint.c
index fa2074a..251cc7d 100644
--- a/src/gallium/state_trackers/omx/entrypoint.c
+++ b/src/gallium/state_trackers/omx/entrypoint.c
@@ -103,7 +103,9 @@
          if (!omx_display)
             goto error;
 
-         omx_screen = vl_dri2_screen_create(omx_display, 0);
+         omx_screen = vl_dri3_screen_create(omx_display, 0);
+         if (!omx_screen)
+            omx_screen = vl_dri2_screen_create(omx_display, 0);
          if (!omx_screen) {
             XCloseDisplay(omx_display);
             goto error;
diff --git a/src/gallium/state_trackers/omx/vid_dec.c b/src/gallium/state_trackers/omx/vid_dec.c
index 9a6efb8..313bc0a 100644
--- a/src/gallium/state_trackers/omx/vid_dec.c
+++ b/src/gallium/state_trackers/omx/vid_dec.c
@@ -178,7 +178,7 @@
       return OMX_ErrorInsufficientResources;
 
    screen = priv->screen->pscreen;
-   priv->pipe = screen->context_create(screen, priv->screen, 0);
+   priv->pipe = screen->context_create(screen, NULL, 0);
    if (!priv->pipe)
       return OMX_ErrorInsufficientResources;
 
diff --git a/src/gallium/state_trackers/omx/vid_dec.h b/src/gallium/state_trackers/omx/vid_dec.h
index 5a64857..7a10e75 100644
--- a/src/gallium/state_trackers/omx/vid_dec.h
+++ b/src/gallium/state_trackers/omx/vid_dec.h
@@ -45,7 +45,6 @@
 #include <bellagio/omx_base_video_port.h>
 
 #include "pipe/p_video_state.h"
-#include "state_tracker/drm_driver.h"
 #include "os/os_thread.h"
 #include "util/list.h"
 
diff --git a/src/gallium/state_trackers/omx/vid_enc.c b/src/gallium/state_trackers/omx/vid_enc.c
index 5274f64..1a4fb62 100644
--- a/src/gallium/state_trackers/omx/vid_enc.c
+++ b/src/gallium/state_trackers/omx/vid_enc.c
@@ -48,7 +48,6 @@
 
 #include "pipe/p_screen.h"
 #include "pipe/p_video_codec.h"
-#include "state_tracker/drm_driver.h"
 #include "util/u_memory.h"
 #include "vl/vl_video_buffer.h"
 
@@ -180,7 +179,7 @@
                                 PIPE_VIDEO_ENTRYPOINT_ENCODE, PIPE_VIDEO_CAP_SUPPORTED))
       return OMX_ErrorBadParameter;
 
-   priv->s_pipe = screen->context_create(screen, priv->screen, 0);
+   priv->s_pipe = screen->context_create(screen, NULL, 0);
    if (!priv->s_pipe)
       return OMX_ErrorInsufficientResources;
 
@@ -197,7 +196,7 @@
       return OMX_ErrorInsufficientResources;
    }
 
-   priv->t_pipe = screen->context_create(screen, priv->screen, 0);
+   priv->t_pipe = screen->context_create(screen, NULL, 0);
    if (!priv->t_pipe)
       return OMX_ErrorInsufficientResources;
 
diff --git a/src/gallium/state_trackers/osmesa/osmesa.c b/src/gallium/state_trackers/osmesa/osmesa.c
index 18f1b88..8326918 100644
--- a/src/gallium/state_trackers/osmesa/osmesa.c
+++ b/src/gallium/state_trackers/osmesa/osmesa.c
@@ -62,6 +62,7 @@
 #include "util/u_box.h"
 #include "util/u_debug.h"
 #include "util/u_format.h"
+#include "util/u_inlines.h"
 #include "util/u_memory.h"
 
 #include "postprocess/filters.h"
@@ -432,6 +433,7 @@
 
       templat.format = format;
       templat.bind = bind;
+      pipe_resource_reference(&out[i], NULL);
       out[i] = osbuffer->textures[statts[i]] =
          screen->resource_create(screen, &templat);
    }
@@ -439,6 +441,7 @@
    return TRUE;
 }
 
+static uint32_t osmesa_fb_ID = 0;
 
 static struct st_framebuffer_iface *
 osmesa_create_st_framebuffer(void)
@@ -448,6 +451,8 @@
       stfbi->flush_front = osmesa_st_framebuffer_flush_front;
       stfbi->validate = osmesa_st_framebuffer_validate;
       p_atomic_set(&stfbi->stamp, 1);
+      stfbi->ID = p_atomic_inc_return(&osmesa_fb_ID);
+      stfbi->state_manager = get_st_manager();
    }
    return stfbi;
 }
@@ -508,6 +513,13 @@
 static void
 osmesa_destroy_buffer(struct osmesa_buffer *osbuffer)
 {
+   struct st_api *api = get_st_api();
+
+   /* The framebuffer interface is freed just below, so first tell the
+    * state manager that it is no longer valid.
+    */
+   api->destroy_drawable(api, osbuffer->stfb);
+
    FREE(osbuffer->stfb);
    FREE(osbuffer);
 }
diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c
index fb5b20e..deaeb19 100644
--- a/src/gallium/state_trackers/va/buffer.c
+++ b/src/gallium/state_trackers/va/buffer.c
@@ -125,9 +125,15 @@
    }
 
    if (buf->derived_surface.resource) {
-      *pbuff = pipe_buffer_map(drv->pipe, buf->derived_surface.resource,
-                               PIPE_TRANSFER_WRITE,
-                               &buf->derived_surface.transfer);
+      struct pipe_resource *resource;
+      struct pipe_box box = {};
+
+      resource = buf->derived_surface.resource;
+      box.width = resource->width0;
+      box.height = resource->height0;
+      box.depth = resource->depth0;
+      *pbuff = drv->pipe->transfer_map(drv->pipe, resource, 0, PIPE_TRANSFER_WRITE,
+                                       &box, &buf->derived_surface.transfer);
       mtx_unlock(&drv->mutex);
 
       if (!buf->derived_surface.transfer || !*pbuff)
diff --git a/src/gallium/state_trackers/va/config.c b/src/gallium/state_trackers/va/config.c
index 68b9230..7d3bd64 100644
--- a/src/gallium/state_trackers/va/config.c
+++ b/src/gallium/state_trackers/va/config.c
@@ -29,6 +29,7 @@
 #include "pipe/p_screen.h"
 
 #include "util/u_video.h"
+#include "util/u_memory.h"
 
 #include "vl/vl_winsys.h"
 
diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c
index 27bc8bfe..186f5066 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -103,7 +103,6 @@
 VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
 {
    vlVaDriver *drv;
-   struct drm_state *drm_info;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
@@ -121,13 +120,11 @@
       drv->vscreen = vl_dri3_screen_create(ctx->native_dpy, ctx->x11_screen);
       if (!drv->vscreen)
          drv->vscreen = vl_dri2_screen_create(ctx->native_dpy, ctx->x11_screen);
-      if (!drv->vscreen)
-         goto error_screen;
       break;
    case VA_DISPLAY_WAYLAND:
    case VA_DISPLAY_DRM:
    case VA_DISPLAY_DRM_RENDERNODES: {
-      drm_info = (struct drm_state *) ctx->drm_state;
+      const struct drm_state *drm_info = (struct drm_state *) ctx->drm_state;
 
       if (!drm_info || drm_info->fd < 0) {
          FREE(drv);
@@ -135,8 +132,6 @@
       }
 
       drv->vscreen = vl_drm_screen_create(drm_info->fd);
-      if (!drv->vscreen)
-         goto error_screen;
       break;
    }
    default:
@@ -144,8 +139,11 @@
       return VA_STATUS_ERROR_INVALID_DISPLAY;
    }
 
+   if (!drv->vscreen)
+      goto error_screen;
+
    drv->pipe = drv->vscreen->pscreen->context_create(drv->vscreen->pscreen,
-                                                     drv->vscreen, 0);
+                                                     NULL, 0);
    if (!drv->pipe)
       goto error_pipe;
 
diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c
index f87de8e..86ae868 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -487,10 +487,16 @@
          ((format != PIPE_FORMAT_YV12) || (surf->buffer->buffer_format != PIPE_FORMAT_NV12)) &&
          ((format != PIPE_FORMAT_IYUV) || (surf->buffer->buffer_format != PIPE_FORMAT_NV12))) {
       struct pipe_video_buffer *tmp_buf;
       struct pipe_video_buffer templat = surf->templat;
 
+      /* Adjust a copy: surf->templat must keep describing surf->buffer
+       * if the allocation below fails. */
       templat.buffer_format = format;
+      if (format == PIPE_FORMAT_YUYV || format == PIPE_FORMAT_UYVY ||
+          format == PIPE_FORMAT_B8G8R8A8_UNORM || format == PIPE_FORMAT_B8G8R8X8_UNORM ||
+          format == PIPE_FORMAT_R8G8B8A8_UNORM || format == PIPE_FORMAT_R8G8B8X8_UNORM)
+         templat.interlaced = false;
       tmp_buf = drv->pipe->create_video_buffer(drv->pipe, &templat);
 
       if (!tmp_buf) {
          mtx_unlock(&drv->mutex);
@@ -499,7 +505,8 @@
 
       surf->buffer->destroy(surf->buffer);
       surf->buffer = tmp_buf;
-      surf->templat.buffer_format = format;
+      /* The new buffer is in place; record the template it was created from. */
+      surf->templat = templat;
    }
 
    views = surf->buffer->get_sampler_view_planes(surf->buffer);
diff --git a/src/gallium/state_trackers/va/picture_hevc.c b/src/gallium/state_trackers/va/picture_hevc.c
index 28743ee..e879259 100644
--- a/src/gallium/state_trackers/va/picture_hevc.c
+++ b/src/gallium/state_trackers/va/picture_hevc.c
@@ -25,6 +25,7 @@
  *
  **************************************************************************/
 
+#include "vl/vl_zscan.h"
 #include "va_private.h"
 
 void vlVaHandlePictureParameterBufferHEVC(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
@@ -179,14 +180,37 @@
 void vlVaHandleIQMatrixBufferHEVC(vlVaContext *context, vlVaBuffer *buf)
 {
    VAIQMatrixBufferHEVC *h265 = buf->data;
+   int list, pos;
 
-   assert(buf->size >= sizeof(VAIQMatrixBufferH264) && buf->num_elements == 1);
-   memcpy(&context->desc.h265.pps->sps->ScalingList4x4, h265->ScalingList4x4, 6 * 16);
-   memcpy(&context->desc.h265.pps->sps->ScalingList8x8, h265->ScalingList8x8, 6 * 64);
-   memcpy(&context->desc.h265.pps->sps->ScalingList16x16, h265->ScalingList16x16, 6 * 64);
-   memcpy(&context->desc.h265.pps->sps->ScalingList32x32, h265->ScalingList32x32, 2 * 64);
-   memcpy(&context->desc.h265.pps->sps->ScalingListDCCoeff16x16, h265->ScalingListDC16x16, 6);
-   memcpy(&context->desc.h265.pps->sps->ScalingListDCCoeff32x32, h265->ScalingListDC32x32, 2);
+   assert(buf->size >= sizeof(VAIQMatrixBufferHEVC) && buf->num_elements == 1);
+
+   /* Six lists each for 4x4, 8x8 and 16x16; every destination entry is
+    * read from the source through the matching up-right diagonal table. */
+   for (list = 0; list < 6; list++) {
+      for (pos = 0; pos < 16; pos++)
+         context->desc.h265.pps->sps->ScalingList4x4[list][pos] =
+            h265->ScalingList4x4[list][vl_zscan_h265_up_right_diagonal_16[pos]];
+
+      for (pos = 0; pos < 64; pos++) {
+         context->desc.h265.pps->sps->ScalingList8x8[list][pos] =
+            h265->ScalingList8x8[list][vl_zscan_h265_up_right_diagonal[pos]];
+         context->desc.h265.pps->sps->ScalingList16x16[list][pos] =
+            h265->ScalingList16x16[list][vl_zscan_h265_up_right_diagonal[pos]];
+      }
+
+      context->desc.h265.pps->sps->ScalingListDCCoeff16x16[list] =
+         h265->ScalingListDC16x16[list];
+   }
+
+   /* The 32x32 case has only two lists, so it is handled separately. */
+   for (list = 0; list < 2; list++) {
+      for (pos = 0; pos < 64; pos++)
+         context->desc.h265.pps->sps->ScalingList32x32[list][pos] =
+            h265->ScalingList32x32[list][vl_zscan_h265_up_right_diagonal[pos]];
+
+      context->desc.h265.pps->sps->ScalingListDCCoeff32x32[list] =
+         h265->ScalingListDC32x32[list];
+   }
 }
 
 void vlVaHandleSliceParameterBufferHEVC(vlVaContext *context, vlVaBuffer *buf)
diff --git a/src/gallium/state_trackers/va/postproc.c b/src/gallium/state_trackers/va/postproc.c
index 8467b0e..6349691 100644
--- a/src/gallium/state_trackers/va/postproc.c
+++ b/src/gallium/state_trackers/va/postproc.c
@@ -26,6 +26,7 @@
  **************************************************************************/
 
 #include "util/u_handle_table.h"
+#include "util/u_memory.h"
 
 #include "vl/vl_defines.h"
 #include "vl/vl_video_buffer.h"
@@ -34,7 +35,7 @@
 #include "va_private.h"
 
 static const VARectangle *
-vlVaRegionDefault(const VARectangle *region, struct pipe_video_buffer *buf,
+vlVaRegionDefault(const VARectangle *region, vlVaSurface *surf,
 		  VARectangle *def)
 {
    if (region)
@@ -42,8 +43,8 @@
 
    def->x = 0;
    def->y = 0;
-   def->width = buf->width;
-   def->height = buf->height;
+   def->width = surf->templat.width;
+   def->height = surf->templat.height;
 
    return def;
 }
@@ -229,7 +230,7 @@
    const VARectangle *src_region, *dst_region;
    VAProcPipelineParameterBuffer *param;
    struct pipe_video_buffer *src;
-   vlVaSurface *src_surface;
+   vlVaSurface *src_surface, *dst_surface;
    unsigned i;
 
    if (!drv || !context)
@@ -244,6 +245,8 @@
    param = buf->data;
 
    src_surface = handle_table_get(drv->htab, param->surface);
-   if (!src_surface || !src_surface->buffer)
+   dst_surface = handle_table_get(drv->htab, context->target_id);
+   /* Dereferenced below if output_region is NULL, so reject a failed lookup. */
+   if (!src_surface || !src_surface->buffer || !dst_surface)
       return VA_STATUS_ERROR_INVALID_SURFACE;
 
@@ -289,8 +292,8 @@
       }
    }
 
-   src_region = vlVaRegionDefault(param->surface_region, src_surface->buffer, &def_src_region);
-   dst_region = vlVaRegionDefault(param->output_region, context->target, &def_dst_region);
+   src_region = vlVaRegionDefault(param->surface_region, src_surface, &def_src_region);
+   dst_region = vlVaRegionDefault(param->output_region, dst_surface, &def_dst_region);
 
    if (context->target->buffer_format != PIPE_FORMAT_NV12 &&
        context->target->buffer_format != PIPE_FORMAT_P016)
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index c7d6ef7..f968e9e 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -749,7 +749,7 @@
          assert(0);
       }
 
-      util_dynarray_init(&surf->subpics);
+      util_dynarray_init(&surf->subpics, NULL);
       surfaces[i] = handle_table_add(drv->htab, surf);
       if (!surfaces[i]) {
          vaStatus = VA_STATUS_ERROR_ALLOCATION_FAILED;
diff --git a/src/gallium/state_trackers/vdpau/device.c b/src/gallium/state_trackers/vdpau/device.c
index eae9f04..c3f156f 100644
--- a/src/gallium/state_trackers/vdpau/device.c
+++ b/src/gallium/state_trackers/vdpau/device.c
@@ -72,7 +72,7 @@
    }
 
    pscreen = dev->vscreen->pscreen;
-   dev->context = pscreen->context_create(pscreen, dev->vscreen, 0);
+   dev->context = pscreen->context_create(pscreen, NULL, 0);
    if (!dev->context) {
       ret = VDP_STATUS_RESOURCES;
       goto no_context;
diff --git a/src/gallium/state_trackers/vdpau/surface.c b/src/gallium/state_trackers/vdpau/surface.c
index 884ae30..c678eb7 100644
--- a/src/gallium/state_trackers/vdpau/surface.c
+++ b/src/gallium/state_trackers/vdpau/surface.c
@@ -350,6 +350,8 @@
 
          /* adjust the template parameters */
          p_surf->templat.buffer_format = nformat;
+         if (nformat == PIPE_FORMAT_YUYV || nformat == PIPE_FORMAT_UYVY)
+            p_surf->templat.interlaced = false;
 
          /* and try to create the video buffer with the new format */
          p_surf->video_buffer = pipe->create_video_buffer(pipe, &p_surf->templat);
diff --git a/src/gallium/state_trackers/wgl/stw_context.c b/src/gallium/state_trackers/wgl/stw_context.c
index f3145a9..58fe3b0 100644
--- a/src/gallium/state_trackers/wgl/stw_context.c
+++ b/src/gallium/state_trackers/wgl/stw_context.c
@@ -170,7 +170,13 @@
       iPixelFormat = fb->iPixelFormat;
       stw_framebuffer_unlock(fb);
    } else {
-      return 0;
+      /* Applications should call SetPixelFormat before creating a context,
+       * but not all do, and the opengl32 runtime seems to use a default
+       * pixel format in some cases, so use that.
+       */
+      iPixelFormat = GetPixelFormat(hdc);
+      if (!iPixelFormat)
+         return 0;
    }
 
    pfi = stw_pixelformat_get_info( iPixelFormat );
diff --git a/src/gallium/state_trackers/wgl/stw_device.c b/src/gallium/state_trackers/wgl/stw_device.c
index 42a2f0e..b88e110 100644
--- a/src/gallium/state_trackers/wgl/stw_device.c
+++ b/src/gallium/state_trackers/wgl/stw_device.c
@@ -90,7 +90,7 @@
    debug_disable_error_message_boxes();
 
    debug_printf("%s\n", __FUNCTION__);
-   
+
    assert(!stw_dev);
 
    stw_tls_init();
@@ -101,7 +101,7 @@
 #ifdef DEBUG
    stw_dev->memdbg_no = debug_memory_begin();
 #endif
-   
+
    stw_dev->stw_winsys = stw_winsys;
 
    stw_dev->stapi = stw_st_create_api();
@@ -110,10 +110,10 @@
       goto error1;
 
    screen = stw_winsys->create_screen();
-   if(!screen)
+   if (!screen)
       goto error1;
 
-   if(stw_winsys->get_adapter_luid)
+   if (stw_winsys->get_adapter_luid)
       stw_winsys->get_adapter_luid(screen, &stw_dev->AdapterLuid);
 
    stw_dev->smapi->screen = screen;
@@ -178,7 +178,7 @@
 
    if (!stw_dev)
       return;
-   
+
    /*
     * Abort cleanup if there are still active contexts. In some situations
     * this DLL may be unloaded before the DLL that is using GL contexts is.
@@ -195,10 +195,13 @@
    handle_table_destroy(stw_dev->ctx_table);
 
    stw_framebuffer_cleanup();
-   
+
    DeleteCriticalSection(&stw_dev->fb_mutex);
    DeleteCriticalSection(&stw_dev->ctx_mutex);
-   
+
+   if (stw_dev->smapi->destroy)
+      stw_dev->smapi->destroy(stw_dev->smapi);
+
    FREE(stw_dev->smapi);
    stw_dev->stapi->destroy(stw_dev->stapi);
 
@@ -220,9 +223,7 @@
 
 
 void APIENTRY
-DrvSetCallbackProcs(
-   INT nProcs,
-   PROC *pProcs )
+DrvSetCallbackProcs(INT nProcs, PROC *pProcs)
 {
    size_t size;
 
@@ -237,8 +238,7 @@
 
 
 BOOL APIENTRY
-DrvValidateVersion(
-   ULONG ulVersion )
+DrvValidateVersion(ULONG ulVersion)
 {
    /* ulVersion is the version reported by the KMD:
     * - via D3DKMTQueryAdapterInfo(KMTQAITYPE_UMOPENGLINFO) on WDDM,
diff --git a/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c b/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
index 4ee4fcd..6281d5d 100644
--- a/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
@@ -27,9 +27,9 @@
 
 /**
  * @file
- * 
+ *
  * WGL_ARB_pixel_format extension implementation.
- * 
+ *
  * @sa http://www.opengl.org/registry/specs/ARB/wgl_pixel_format.txt
  */
 
@@ -49,11 +49,7 @@
 
 
 static boolean
-stw_query_attrib(
-   int iPixelFormat,
-   int iLayerPlane,
-   int attrib,
-   int *pvalue )
+stw_query_attrib(int iPixelFormat, int iLayerPlane, int attrib, int *pvalue)
 {
    uint count;
    const struct stw_pixelformat_info *pfi;
@@ -65,7 +61,7 @@
       return TRUE;
    }
 
-   pfi = stw_pixelformat_get_info( iPixelFormat );
+   pfi = stw_pixelformat_get_info(iPixelFormat);
    if (!pfi) {
       return FALSE;
    }
@@ -286,7 +282,6 @@
 };
 
 static const struct attrib_match_info attrib_match[] = {
-
    /* WGL_ARB_pixel_format */
    { WGL_DRAW_TO_WINDOW_ARB,      0, TRUE },
    { WGL_DRAW_TO_BITMAP_ARB,      0, TRUE },
@@ -334,12 +329,12 @@
    uint index;
 };
 
+
 static BOOL
-score_pixelformats(
-   struct stw_pixelformat_score *scores,
-   uint count,
-   int attribute,
-   int expected_value )
+score_pixelformats(struct stw_pixelformat_score *scores,
+                   uint count,
+                   int attribute,
+                   int expected_value)
 {
    uint i;
    const struct attrib_match_info *ami = NULL;
@@ -347,7 +342,7 @@
 
    /* Find out if a given attribute should be considered for score calculation.
     */
-   for (i = 0; i < sizeof( attrib_match ) / sizeof( attrib_match[0] ); i++) {
+   for (i = 0; i < ARRAY_SIZE(attrib_match); i++) {
       if (attrib_match[i].attribute == attribute) {
          ami = &attrib_match[i];
          break;
@@ -362,41 +357,40 @@
    for (index = 0; index < count; index++) {
       int actual_value;
 
-      if (!stw_query_attrib( index + 1, 0, attribute, &actual_value ))
+      if (!stw_query_attrib(index + 1, 0, attribute, &actual_value))
          return FALSE;
 
       if (ami->exact) {
-         /* For an exact match criteria, if the actual and expected values differ,
-          * the score is set to 0 points, effectively removing the pixelformat
-          * from a list of matching pixelformats.
+         /* For an exact match criteria, if the actual and expected values
+          * differ, the score is set to 0 points, effectively removing the
+          * pixelformat from a list of matching pixelformats.
           */
          if (actual_value != expected_value)
             scores[index].points = 0;
       }
       else {
-         /* For a minimum match criteria, if the actual value is smaller than the expected
-          * value, the pixelformat is rejected (score set to 0). However, if the actual
-          * value is bigger, the pixelformat is given a penalty to favour pixelformats that
-          * more closely match the expected values.
+         /* For a minimum match criteria, if the actual value is smaller than
+          * the expected value, the pixelformat is rejected (score set to
+          * 0). However, if the actual value is bigger, the pixelformat is
+          * given a penalty to favour pixelformats that more closely match the
+          * expected values.
           */
          if (actual_value < expected_value)
             scores[index].points = 0;
          else if (actual_value > expected_value)
-            scores[index].points -= (actual_value - expected_value) * ami->weight;
+            scores[index].points -= (actual_value - expected_value)
+               * ami->weight;
       }
    }
 
    return TRUE;
 }
 
+
 WINGDIAPI BOOL APIENTRY
-wglChoosePixelFormatARB(
-   HDC hdc,
-   const int *piAttribIList,
-   const FLOAT *pfAttribFList,
-   UINT nMaxFormats,
-   int *piFormats,
-   UINT *nNumFormats )
+wglChoosePixelFormatARB(HDC hdc, const int *piAttribIList,
+                        const FLOAT *pfAttribFList, UINT nMaxFormats,
+                        int *piFormats, UINT *nNumFormats)
 {
    uint count;
    struct stw_pixelformat_score *scores;
@@ -410,7 +404,8 @@
     * Set a score to 0 if there is a mismatch for an exact match criteria.
     */
    count = stw_pixelformat_get_extended_count();
-   scores = (struct stw_pixelformat_score *) MALLOC( count * sizeof( struct stw_pixelformat_score ) );
+   scores = (struct stw_pixelformat_score *)
+      MALLOC(count * sizeof(struct stw_pixelformat_score));
    if (scores == NULL)
       return FALSE;
    for (i = 0; i < count; i++) {
@@ -422,8 +417,9 @@
     */
    if (piAttribIList != NULL) {
       while (*piAttribIList != 0) {
-         if (!score_pixelformats( scores, count, piAttribIList[0], piAttribIList[1] )) {
-            FREE( scores );
+         if (!score_pixelformats(scores, count, piAttribIList[0],
+                                 piAttribIList[1])) {
+            FREE(scores);
             return FALSE;
          }
          piAttribIList += 2;
@@ -431,16 +427,17 @@
    }
    if (pfAttribFList != NULL) {
       while (*pfAttribFList != 0) {
-         if (!score_pixelformats( scores, count, (int) pfAttribFList[0], (int) pfAttribFList[1] )) {
-            FREE( scores );
+         if (!score_pixelformats(scores, count, (int) pfAttribFList[0],
+                                 (int) pfAttribFList[1])) {
+            FREE(scores);
             return FALSE;
          }
          pfAttribFList += 2;
       }
    }
 
-   /* Bubble-sort the resulting scores. Pixelformats with higher scores go first.
-    * TODO: Find out if there are any patent issues with it.
+   /* Bubble-sort the resulting scores. Pixelformats with higher scores go
+    * first.  TODO: Find out if there are any patent issues with it.
     */
    if (count > 1) {
       uint n = count;
@@ -467,26 +464,23 @@
     */
    for (i = 0; i < count; i++) {
       if (scores[i].points > 0) {
-	 piFormats[*nNumFormats] = scores[i].index + 1;
+         piFormats[*nNumFormats] = scores[i].index + 1;
          (*nNumFormats)++;
-	 if (*nNumFormats >= nMaxFormats) {
-	    break;
-	 }
+         if (*nNumFormats >= nMaxFormats) {
+            break;
+         }
       }
    }
 
-   FREE( scores );
+   FREE(scores);
    return TRUE;
 }
 
+
 WINGDIAPI BOOL APIENTRY
-wglGetPixelFormatAttribfvARB(
-   HDC hdc,
-   int iPixelFormat,
-   int iLayerPlane,
-   UINT nAttributes,
-   const int *piAttributes,
-   FLOAT *pfValues )
+wglGetPixelFormatAttribfvARB(HDC hdc, int iPixelFormat, int iLayerPlane,
+                             UINT nAttributes, const int *piAttributes,
+                             FLOAT *pfValues)
 {
    UINT i;
 
@@ -495,7 +489,8 @@
    for (i = 0; i < nAttributes; i++) {
       int value;
 
-      if (!stw_query_attrib( iPixelFormat, iLayerPlane, piAttributes[i], &value ))
+      if (!stw_query_attrib(iPixelFormat, iLayerPlane,
+                             piAttributes[i], &value))
          return FALSE;
       pfValues[i] = (FLOAT) value;
    }
@@ -503,21 +498,19 @@
    return TRUE;
 }
 
+
 WINGDIAPI BOOL APIENTRY
-wglGetPixelFormatAttribivARB(
-   HDC hdc,
-   int iPixelFormat,
-   int iLayerPlane,
-   UINT nAttributes,
-   const int *piAttributes,
-   int *piValues )
+wglGetPixelFormatAttribivARB(HDC hdc, int iPixelFormat, int iLayerPlane,
+                             UINT nAttributes, const int *piAttributes,
+                             int *piValues)
 {
    UINT i;
 
    (void) hdc;
 
    for (i = 0; i < nAttributes; i++) {
-      if (!stw_query_attrib( iPixelFormat, iLayerPlane, piAttributes[i], &piValues[i] ))
+      if (!stw_query_attrib(iPixelFormat, iLayerPlane,
+                            piAttributes[i], &piValues[i]))
          return FALSE;
    }
 
diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.c b/src/gallium/state_trackers/wgl/stw_framebuffer.c
index 321fbb6..06b5c8d 100644
--- a/src/gallium/state_trackers/wgl/stw_framebuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c
@@ -601,8 +601,11 @@
       int64_t min_swap_period =
          1.0e6 / stw_dev->refresh_rate * stw_dev->swap_interval;
 
-      /* if time since last swap is less than wait period, wait */
-      if (delta < min_swap_period) {
+      /* If time since last swap is less than wait period, wait.
+       * Note that it's possible for the delta to be negative because of
+       * rollover.  See https://bugs.freedesktop.org/show_bug.cgi?id=102241
+       */
+      if ((delta >= 0) && (delta < min_swap_period)) {
          float fudge = 1.75f;  /* emperical fudge factor */
          int64_t wait = (min_swap_period - delta) * fudge;
          os_time_sleep(wait);
diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c
index c924f76..833308d 100644
--- a/src/gallium/state_trackers/wgl/stw_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2008 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #include "pipe/p_format.h"
@@ -88,10 +88,10 @@
 
 static const struct stw_pf_color_info
 stw_pf_color_extended[] = {
-    { PIPE_FORMAT_R32G32B32A32_FLOAT, { 32,  32, 32,  32}, { 0,  32, 64, 96} }
+   { PIPE_FORMAT_R32G32B32A32_FLOAT, {32, 32, 32, 32}, {0, 32, 64, 96} }
 };
 
-static const struct stw_pf_depth_info 
+static const struct stw_pf_depth_info
 stw_pf_depth_stencil[] = {
    /* pure depth */
    { PIPE_FORMAT_Z32_UNORM,   {32, 0} },
@@ -104,14 +104,14 @@
 };
 
 
-static const boolean 
+static const boolean
 stw_pf_doublebuffer[] = {
    FALSE,
    TRUE,
 };
 
 
-const unsigned 
+const unsigned
 stw_pf_multisample[] = {
    0,
    4,
@@ -121,19 +121,18 @@
 
 
 static void
-stw_pixelformat_add(
-   struct stw_device *stw_dev,
-   boolean extended,
-   const struct stw_pf_color_info *color,
-   const struct stw_pf_depth_info *depth,
-   unsigned accum,
-   boolean doublebuffer,
-   unsigned samples )
+stw_pixelformat_add(struct stw_device *stw_dev,
+                    boolean extended,
+                    const struct stw_pf_color_info *color,
+                    const struct stw_pf_depth_info *depth,
+                    unsigned accum,
+                    boolean doublebuffer,
+                    unsigned samples)
 {
    struct stw_pixelformat_info *pfi;
-   
+
    assert(stw_dev->pixelformat_extended_count < STW_MAX_PIXELFORMATS);
-   if(stw_dev->pixelformat_extended_count >= STW_MAX_PIXELFORMATS)
+   if (stw_dev->pixelformat_extended_count >= STW_MAX_PIXELFORMATS)
       return;
 
    assert(util_format_get_component_bits(color->format, UTIL_FORMAT_COLORSPACE_RGB, 0) == color->bits.red);
@@ -142,16 +141,16 @@
    assert(util_format_get_component_bits(color->format, UTIL_FORMAT_COLORSPACE_RGB, 3) == color->bits.alpha);
    assert(util_format_get_component_bits(depth->format, UTIL_FORMAT_COLORSPACE_ZS, 0) == depth->bits.depth);
    assert(util_format_get_component_bits(depth->format, UTIL_FORMAT_COLORSPACE_ZS, 1) == depth->bits.stencil);
-   
+
    pfi = &stw_dev->pixelformats[stw_dev->pixelformat_extended_count];
-   
+
    memset(pfi, 0, sizeof *pfi);
-   
+
    pfi->pfd.nSize = sizeof pfi->pfd;
    pfi->pfd.nVersion = 1;
 
    pfi->pfd.dwFlags = PFD_SUPPORT_OPENGL;
-   
+
    /* TODO: also support non-native pixel formats */
    if (!extended) {
       pfi->pfd.dwFlags |= PFD_DRAW_TO_WINDOW;
@@ -162,10 +161,11 @@
 
    if (doublebuffer)
       pfi->pfd.dwFlags |= PFD_DOUBLEBUFFER | PFD_SWAP_EXCHANGE;
-   
+
    pfi->pfd.iPixelType = PFD_TYPE_RGBA;
 
-   pfi->pfd.cColorBits = color->bits.red + color->bits.green + color->bits.blue + color->bits.alpha;
+   pfi->pfd.cColorBits =
+      color->bits.red + color->bits.green + color->bits.blue + color->bits.alpha;
    pfi->pfd.cRedBits = color->bits.red;
    pfi->pfd.cRedShift = color->shift.red;
    pfi->pfd.cGreenBits = color->bits.green;
@@ -204,16 +204,16 @@
 
    pfi->stvis.samples = samples;
    pfi->stvis.render_buffer = ST_ATTACHMENT_INVALID;
-   
+
    /* WGL_ARB_render_texture */
    if (color->bits.alpha)
       pfi->bindToTextureRGBA = TRUE;
-   else
-      pfi->bindToTextureRGB = TRUE;
+
+   pfi->bindToTextureRGB = TRUE;
 
    ++stw_dev->pixelformat_extended_count;
-   
-   if(!extended) {
+
+   if (!extended) {
       ++stw_dev->pixelformat_count;
       assert(stw_dev->pixelformat_count == stw_dev->pixelformat_extended_count);
    }
@@ -225,8 +225,7 @@
  */
 static unsigned
 add_color_format_variants(const struct stw_pf_color_info *color_formats,
-                          unsigned num_color_formats,
-                          boolean extended)
+                          unsigned num_color_formats, boolean extended)
 {
    struct pipe_screen *screen = stw_dev->screen;
    unsigned cfmt, ms, db, ds, acc;
@@ -288,12 +287,12 @@
 
 
 void
-stw_pixelformat_init( void )
+stw_pixelformat_init(void)
 {
    unsigned num_formats;
 
-   assert( !stw_dev->pixelformat_count );
-   assert( !stw_dev->pixelformat_extended_count );
+   assert(!stw_dev->pixelformat_count);
+   assert(!stw_dev->pixelformat_extended_count);
 
    /* normal, displayable formats */
    num_formats = add_color_format_variants(stw_pf_color,
@@ -304,24 +303,27 @@
    add_color_format_variants(stw_pf_color_extended,
                              ARRAY_SIZE(stw_pf_color_extended), TRUE);
 
-   assert( stw_dev->pixelformat_count <= stw_dev->pixelformat_extended_count );
-   assert( stw_dev->pixelformat_extended_count <= STW_MAX_PIXELFORMATS );
+   assert(stw_dev->pixelformat_count <= stw_dev->pixelformat_extended_count);
+   assert(stw_dev->pixelformat_extended_count <= STW_MAX_PIXELFORMATS);
 }
 
+
 uint
-stw_pixelformat_get_count( void )
+stw_pixelformat_get_count(void)
 {
    return stw_dev->pixelformat_count;
 }
 
+
 uint
-stw_pixelformat_get_extended_count( void )
+stw_pixelformat_get_extended_count(void)
 {
    return stw_dev->pixelformat_extended_count;
 }
 
+
 const struct stw_pixelformat_info *
-stw_pixelformat_get_info( int iPixelFormat )
+stw_pixelformat_get_info(int iPixelFormat)
 {
    unsigned index;
 
@@ -339,11 +341,8 @@
 
 
 LONG APIENTRY
-DrvDescribePixelFormat(
-   HDC hdc,
-   INT iPixelFormat,
-   ULONG cjpfd,
-   PIXELFORMATDESCRIPTOR *ppfd )
+DrvDescribePixelFormat(HDC hdc, INT iPixelFormat, ULONG cjpfd,
+                       PIXELFORMATDESCRIPTOR *ppfd)
 {
    uint count;
    const struct stw_pixelformat_info *pfi;
@@ -357,70 +356,61 @@
 
    if (ppfd == NULL)
       return count;
-   if (cjpfd != sizeof( PIXELFORMATDESCRIPTOR ))
+
+   if (cjpfd != sizeof(PIXELFORMATDESCRIPTOR))
       return 0;
 
-   pfi = stw_pixelformat_get_info( iPixelFormat );
+   pfi = stw_pixelformat_get_info(iPixelFormat);
    if (!pfi) {
       return 0;
    }
-   
-   memcpy(ppfd, &pfi->pfd, sizeof( PIXELFORMATDESCRIPTOR ));
+
+   memcpy(ppfd, &pfi->pfd, sizeof(PIXELFORMATDESCRIPTOR));
 
    return count;
 }
 
+
 BOOL APIENTRY
-DrvDescribeLayerPlane(
-   HDC hdc,
-   INT iPixelFormat,
-   INT iLayerPlane,
-   UINT nBytes,
-   LPLAYERPLANEDESCRIPTOR plpd )
+DrvDescribeLayerPlane(HDC hdc, INT iPixelFormat, INT iLayerPlane,
+                      UINT nBytes, LPLAYERPLANEDESCRIPTOR plpd)
 {
    assert(0);
    return FALSE;
 }
 
+
 int APIENTRY
-DrvGetLayerPaletteEntries(
-   HDC hdc,
-   INT iLayerPlane,
-   INT iStart,
-   INT cEntries,
-   COLORREF *pcr )
+DrvGetLayerPaletteEntries(HDC hdc, INT iLayerPlane, INT iStart,
+                          INT cEntries, COLORREF *pcr)
 {
    assert(0);
    return 0;
 }
 
+
 int APIENTRY
-DrvSetLayerPaletteEntries(
-   HDC hdc,
-   INT iLayerPlane,
-   INT iStart,
-   INT cEntries,
-   CONST COLORREF *pcr )
+DrvSetLayerPaletteEntries(HDC hdc, INT iLayerPlane, INT iStart,
+                          INT cEntries, CONST COLORREF *pcr)
 {
    assert(0);
    return 0;
 }
 
+
 BOOL APIENTRY
-DrvRealizeLayerPalette(
-   HDC hdc,
-   INT iLayerPlane,
-   BOOL bRealize )
+DrvRealizeLayerPalette(HDC hdc, INT iLayerPlane, BOOL bRealize)
 {
    assert(0);
    return FALSE;
 }
 
+
 /* Only used by the wgl code, but have it here to avoid exporting the
  * pixelformat.h functionality.
  */
-int stw_pixelformat_choose( HDC hdc,
-                            CONST PIXELFORMATDESCRIPTOR *ppfd )
+int
+stw_pixelformat_choose(HDC hdc, CONST PIXELFORMATDESCRIPTOR *ppfd)
 {
    uint count;
    uint index;
@@ -435,7 +425,7 @@
 
    for (index = 1; index <= count; index++) {
       uint delta = 0;
-      const struct stw_pixelformat_info *pfi = stw_pixelformat_get_info( index );
+      const struct stw_pixelformat_info *pfi = stw_pixelformat_get_info(index);
 
       if (!(ppfd->dwFlags & PFD_DOUBLEBUFFER_DONTCARE) &&
           !!(ppfd->dwFlags & PFD_DOUBLEBUFFER) !=
diff --git a/src/gallium/state_trackers/wgl/stw_st.c b/src/gallium/state_trackers/wgl/stw_st.c
index 7806a2a..7cf18f0 100644
--- a/src/gallium/state_trackers/wgl/stw_st.c
+++ b/src/gallium/state_trackers/wgl/stw_st.c
@@ -46,7 +46,7 @@
    unsigned texture_mask;
 };
 
-
+static uint32_t stwfb_ID = 0;
 
 /**
  * Is the given mutex held by the calling thread?
@@ -161,10 +161,8 @@
       stwfb->fb->must_resize = FALSE;
    }
 
-   for (i = 0; i < count; i++) {
-      out[i] = NULL;
+   for (i = 0; i < count; i++)
       pipe_resource_reference(&out[i], stwfb->textures[statts[i]]);
-   }
 
    stw_framebuffer_unlock(stwfb->fb);
 
@@ -234,6 +232,8 @@
 
    stwfb->fb = fb;
    stwfb->stvis = fb->pfi->stvis;
+   stwfb->base.ID = p_atomic_inc_return(&stwfb_ID);
+   stwfb->base.state_manager = stw_dev->smapi;
 
    stwfb->base.visual = &stwfb->stvis;
    p_atomic_set(&stwfb->base.stamp, 1);
@@ -255,6 +255,11 @@
    for (i = 0; i < ST_ATTACHMENT_COUNT; i++)
       pipe_resource_reference(&stwfb->textures[i], NULL);
 
+   /* Notify the st manager that the framebuffer interface is no
+    * longer valid.
+    */
+   stw_dev->stapi->destroy_drawable(stw_dev->stapi, &stwfb->base);
+
    FREE(stwfb);
 }
 
diff --git a/src/gallium/state_trackers/wgl/stw_wgl.c b/src/gallium/state_trackers/wgl/stw_wgl.c
index de4b4f6..532a5ad 100644
--- a/src/gallium/state_trackers/wgl/stw_wgl.c
+++ b/src/gallium/state_trackers/wgl/stw_wgl.c
@@ -214,14 +214,7 @@
    DWORD count,
    DWORD listBase )
 {
-   (void) hdc;
-   (void) first;
-   (void) count;
-   (void) listBase;
-
-   assert( 0 );
-
-   return FALSE;
+   return wglUseFontBitmapsW(hdc, first, count, listBase);
 }
 
 WINGDIAPI BOOL APIENTRY
@@ -240,14 +233,54 @@
    DWORD count,
    DWORD listBase )
 {
-   (void) hdc;
-   (void) first;
-   (void) count;
-   (void) listBase;
+   GLYPHMETRICS gm;
+   MAT2 tra;
+   FIXED one, minus_one, zero;
+   void *buffer = NULL;
+   BOOL result = TRUE;
 
-   assert( 0 );
+   one.value = 1;
+   one.fract = 0;
+   minus_one.value = -1;
+   minus_one.fract = 0;
+   zero.value = 0;
+   zero.fract = 0;
 
-   return FALSE;
+   tra.eM11 = one;
+   tra.eM22 = minus_one;
+   tra.eM12 = tra.eM21 = zero;
+
+   for (int i = 0; i < count; i++) {
+      DWORD size = GetGlyphOutline(hdc, first + i, GGO_BITMAP, &gm, 0,
+                                   NULL, &tra);
+
+      glNewList(listBase + i, GL_COMPILE);
+
+      if (size != GDI_ERROR) {
+         if (size == 0) {
+            glBitmap(0, 0, -gm.gmptGlyphOrigin.x, gm.gmptGlyphOrigin.y,
+                     gm.gmCellIncX, gm.gmCellIncY, NULL);
+         }
+         else {
+            buffer = realloc(buffer, size);
+            size = GetGlyphOutline(hdc, first + i, GGO_BITMAP, &gm,
+                                   size, buffer, &tra);
+
+            glBitmap(gm.gmBlackBoxX, gm.gmBlackBoxY,
+                     -gm.gmptGlyphOrigin.x, gm.gmptGlyphOrigin.y,
+                     gm.gmCellIncX, gm.gmCellIncY, buffer);
+         }
+      }
+      else {
+         result = FALSE;
+      }
+
+      glEndList();
+   }
+
+   free(buffer);
+
+   return result;
 }
 
 WINGDIAPI BOOL APIENTRY
diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c
index 03a3abf..e5addcf 100644
--- a/src/gallium/state_trackers/xa/xa_tracker.c
+++ b/src/gallium/state_trackers/xa/xa_tracker.c
@@ -162,7 +162,7 @@
 	goto out_no_fd;
 
     if (pipe_loader_drm_probe_fd(&xa->dev, fd))
-	xa->screen = pipe_loader_create_screen(xa->dev);
+	xa->screen = pipe_loader_create_screen(xa->dev, 0);
 
     if (!xa->screen)
 	goto out_no_screen;
diff --git a/src/gallium/state_trackers/xvmc/context.c b/src/gallium/state_trackers/xvmc/context.c
index e9014c8..1ecff5e 100644
--- a/src/gallium/state_trackers/xvmc/context.c
+++ b/src/gallium/state_trackers/xvmc/context.c
@@ -229,7 +229,9 @@
       return BadAlloc;
 
    /* TODO: Reuse screen if process creates another context */
-   vscreen = vl_dri2_screen_create(dpy, scrn);
+   vscreen = vl_dri3_screen_create(dpy, scrn);
+   if (!vscreen)
+      vscreen = vl_dri2_screen_create(dpy, scrn);
 
    if (!vscreen) {
       XVMC_MSG(XVMC_ERR, "[XvMC] Could not create VL screen.\n");
@@ -237,7 +239,7 @@
       return BadAlloc;
    }
 
-   pipe = vscreen->pscreen->context_create(vscreen->pscreen, vscreen, 0);
+   pipe = vscreen->pscreen->context_create(vscreen->pscreen, NULL, 0);
    if (!pipe) {
       XVMC_MSG(XVMC_ERR, "[XvMC] Could not create VL context.\n");
       vscreen->destroy(vscreen);
diff --git a/src/gallium/state_trackers/xvmc/subpicture.c b/src/gallium/state_trackers/xvmc/subpicture.c
index 8b95a4e..bc26976 100644
--- a/src/gallium/state_trackers/xvmc/subpicture.c
+++ b/src/gallium/state_trackers/xvmc/subpicture.c
@@ -48,22 +48,43 @@
 #define FOURCC_AI44 0x34344941
 #define FOURCC_IA44 0x34344149
 
-static enum pipe_format XvIDToPipe(int xvimage_id)
+static enum pipe_format XvIDToPipe(struct pipe_screen *screen,
+                                   int xvimage_id)
 {
+   enum pipe_format ret;
+   assert(screen);
+
    switch (xvimage_id) {
-      case FOURCC_RGB:
-         return PIPE_FORMAT_B8G8R8X8_UNORM;
+   case FOURCC_RGB:
+      ret = PIPE_FORMAT_B8G8R8X8_UNORM;
+      break;
 
-      case FOURCC_AI44:
-         return PIPE_FORMAT_R4A4_UNORM;
+   case FOURCC_AI44:
+      ret = PIPE_FORMAT_R4A4_UNORM;
+      if (!screen->is_format_supported(
+                screen, ret, PIPE_TEXTURE_2D, 0, PIPE_BIND_SAMPLER_VIEW))
+         ret = PIPE_FORMAT_B4G4R4A4_UNORM;
+      break;
 
-      case FOURCC_IA44:
-         return PIPE_FORMAT_A4R4_UNORM;
+   case FOURCC_IA44:
+      ret = PIPE_FORMAT_A4R4_UNORM;
+      if (!screen->is_format_supported(
+                screen, ret, PIPE_TEXTURE_2D, 0, PIPE_BIND_SAMPLER_VIEW))
+         ret = PIPE_FORMAT_B4G4R4A4_UNORM;
+      break;
 
-      default:
-         XVMC_MSG(XVMC_ERR, "[XvMC] Unrecognized Xv image ID 0x%08X.\n", xvimage_id);
-         return PIPE_FORMAT_NONE;
+   default:
+      XVMC_MSG(XVMC_ERR, "[XvMC] Unrecognized Xv image ID 0x%08X.\n", xvimage_id);
+      return PIPE_FORMAT_NONE;
    }
+
+   if (!screen->is_format_supported(
+             screen, ret, PIPE_TEXTURE_2D, 0, PIPE_BIND_SAMPLER_VIEW)) {
+      XVMC_MSG(XVMC_ERR, "[XvMC] Unsupported 2D format %s for Xv image ID 0x%08X.\n", util_format_name(ret), xvimage_id);
+      ret = PIPE_FORMAT_NONE;
+   }
+   return ret;
+
 }
 
 static unsigned NumPaletteEntries4XvID(int xvimage_id)
@@ -82,29 +103,44 @@
    }
 }
 
-static int PipeToComponentOrder(enum pipe_format format, char *component_order)
+static int PipeToComponentOrder(struct pipe_screen *screen,
+                                enum pipe_format format,
+                                enum pipe_format *palette_format,
+                                char *component_order)
 {
+   assert(screen);
    assert(component_order);
+   assert(palette_format);
 
    switch (format) {
-      case PIPE_FORMAT_B8G8R8X8_UNORM:
-         return 0;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return 0;
 
-      case PIPE_FORMAT_A4R4_UNORM:
-      case PIPE_FORMAT_R4A4_UNORM:
-         component_order[0] = 'Y';
-         component_order[1] = 'U';
-         component_order[2] = 'V';
-         component_order[3] = 'A';
-         return 4;
+   case PIPE_FORMAT_A4R4_UNORM:
+   case PIPE_FORMAT_R4A4_UNORM:
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
+      *palette_format = PIPE_FORMAT_R8G8B8X8_UNORM;
+      component_order[0] = 'Y';
+      component_order[1] = 'U';
+      component_order[2] = 'V';
+      component_order[3] = 'A';
+      if (!screen->is_format_supported(
+                screen, *palette_format, PIPE_TEXTURE_1D, 0,
+                PIPE_BIND_SAMPLER_VIEW)) {
+         /* One of these formats better be supported... */
+         *palette_format = PIPE_FORMAT_B8G8R8X8_UNORM;
+         component_order[0] = 'V';
+         component_order[2] = 'Y';
+      }
+      return 4;
 
-      default:
-         XVMC_MSG(XVMC_ERR, "[XvMC] Unrecognized PIPE_FORMAT 0x%08X.\n", format);
-         component_order[0] = 0;
-         component_order[1] = 0;
-         component_order[2] = 0;
-         component_order[3] = 0;
-         return 0;
+   default:
+      XVMC_MSG(XVMC_ERR, "[XvMC] Unrecognized PIPE_FORMAT 0x%08X.\n", format);
+      component_order[0] = 0;
+      component_order[1] = 0;
+      component_order[2] = 0;
+      component_order[3] = 0;
+      return 0;
    }
 }
 
@@ -186,6 +222,41 @@
    pipe->transfer_unmap(pipe, transfer);
 }
 
+/* Expand 8-bit AI44/IA44 subpicture pixels into 16-bit texels of dst. */
+static void
+upload_sampler_convert(struct pipe_context *pipe, struct pipe_sampler_view *dst,
+                       const struct pipe_box *dst_box, const XvImage *image,
+                       unsigned src_x, unsigned src_y)
+{
+   struct pipe_transfer *transfer;
+   int i, j;
+   /* Unsigned bytes: shifting a plain (possibly signed) char would
+    * sign-extend and corrupt the swapped nibbles for values >= 0x80.
+    */
+   unsigned char *map, *src;
+
+   map = pipe->transfer_map(pipe, dst->texture, 0, PIPE_TRANSFER_WRITE,
+                            dst_box, &transfer);
+   if (!map)
+      return;
+
+   src = (unsigned char *)image->data + src_y * image->width + src_x;
+   if (image->id == FOURCC_AI44) {
+      /* Layout already matches; write each source byte to both bytes. */
+      for (i = 0; i < dst_box->height; i++, map += transfer->stride, src += image->width)
+         for (j = 0; j < dst_box->width; j++)
+            map[j * 2 + 0] = map[j * 2 + 1] = src[j];
+   } else {
+      assert(image->id == FOURCC_IA44);
+      /* Same duplication, but with the low and high nibbles swapped. */
+      for (i = 0; i < dst_box->height; i++, map += transfer->stride, src += image->width)
+         for (j = 0; j < dst_box->width; j++)
+            map[j * 2 + 0] = map[j * 2 + 1] = (src[j] >> 4) | (src[j] << 4);
+   }
+
+   pipe->transfer_unmap(pipe, transfer);
+}
+
 PUBLIC
 Status XvMCCreateSubpicture(Display *dpy, XvMCContext *context, XvMCSubpicture *subpicture,
                             unsigned short width, unsigned short height, int xvimage_id)
@@ -195,6 +266,7 @@
    struct pipe_context *pipe;
    struct pipe_resource tex_templ, *tex;
    struct pipe_sampler_view sampler_templ;
+   enum pipe_format palette_format;
    Status ret;
 
    XVMC_MSG(XVMC_TRACE, "[XvMC] Creating subpicture %p.\n", subpicture);
@@ -224,7 +296,7 @@
 
    memset(&tex_templ, 0, sizeof(tex_templ));
    tex_templ.target = PIPE_TEXTURE_2D;
-   tex_templ.format = XvIDToPipe(xvimage_id);
+   tex_templ.format = XvIDToPipe(pipe->screen, xvimage_id);
    tex_templ.last_level = 0;
    if (pipe->screen->get_video_param(pipe->screen,
                                      PIPE_VIDEO_PROFILE_UNKNOWN,
@@ -262,12 +334,14 @@
    subpicture->width = width;
    subpicture->height = height;
    subpicture->num_palette_entries = NumPaletteEntries4XvID(xvimage_id);
-   subpicture->entry_bytes = PipeToComponentOrder(tex_templ.format, subpicture->component_order);
+   subpicture->entry_bytes = PipeToComponentOrder(
+         pipe->screen, tex_templ.format, &palette_format,
+         subpicture->component_order);
    subpicture->privData = subpicture_priv;
 
    if (subpicture->num_palette_entries > 0) {
       tex_templ.target = PIPE_TEXTURE_1D;
-      tex_templ.format = PIPE_FORMAT_R8G8B8X8_UNORM;
+      tex_templ.format = palette_format;
       tex_templ.width0 = subpicture->num_palette_entries;
       tex_templ.height0 = 1;
       tex_templ.usage = PIPE_USAGE_DEFAULT;
@@ -366,8 +440,13 @@
 
    /* clipping should be done by upload_sampler and regardles what the documentation
    says image->pitches[0] doesn't seems to be in bytes, so don't use it */
-   src_stride = image->width * util_format_get_blocksize(subpicture_priv->sampler->texture->format);
-   upload_sampler(pipe, subpicture_priv->sampler, &dst_box, image->data, src_stride, srcx, srcy);
+   if ((image->id == FOURCC_IA44 || image->id == FOURCC_AI44) &&
+       subpicture_priv->sampler->texture->format == PIPE_FORMAT_B4G4R4A4_UNORM) {
+      upload_sampler_convert(pipe, subpicture_priv->sampler, &dst_box, image, srcx, srcy);
+   } else {
+      src_stride = image->width * util_format_get_blocksize(subpicture_priv->sampler->texture->format);
+      upload_sampler(pipe, subpicture_priv->sampler, &dst_box, image->data, src_stride, srcx, srcy);
+   }
 
    XVMC_MSG(XVMC_TRACE, "[XvMC] Subpicture %p composited.\n", subpicture);
 
diff --git a/src/gallium/targets/d3dadapter9/drm.c b/src/gallium/targets/d3dadapter9/drm.c
index 6163734..27c3fb1 100644
--- a/src/gallium/targets/d3dadapter9/drm.c
+++ b/src/gallium/targets/d3dadapter9/drm.c
@@ -204,7 +204,7 @@
 {
     struct d3dadapter9drm_context *ctx = CALLOC_STRUCT(d3dadapter9drm_context);
     HRESULT hr;
-    int different_device;
+    bool different_device;
     const struct drm_conf_ret *throttle_ret = NULL;
     const struct drm_conf_ret *dmabuf_ret = NULL;
     driOptionCache defaultInitOptions;
@@ -220,7 +220,7 @@
      * takes ownership of it. */
     fd = loader_get_user_preferred_fd(fd, &different_device);
     ctx->fd = fd;
-    ctx->base.linear_framebuffer = !!different_device;
+    ctx->base.linear_framebuffer = different_device;
 
     if (!pipe_loader_drm_probe_fd(&ctx->dev, fd)) {
         ERR("Failed to probe drm fd %d.\n", fd);
@@ -229,7 +229,7 @@
         return D3DERR_DRIVERINTERNALERROR;
     }
 
-    ctx->base.hal = pipe_loader_create_screen(ctx->dev);
+    ctx->base.hal = pipe_loader_create_screen(ctx->dev, 0);
     if (!ctx->base.hal) {
         ERR("Unable to load requested driver.\n");
         drm_destroy(&ctx->base);
@@ -271,7 +271,7 @@
     if (driCheckOption(&userInitOptions, "thread_submit", DRI_BOOL))
         ctx->base.thread_submit = driQueryOptionb(&userInitOptions, "thread_submit");
     else
-        ctx->base.thread_submit = !!different_device;
+        ctx->base.thread_submit = different_device;
 
     if (ctx->base.thread_submit && (throttling_value_user == -2 || throttling_value_user == 0)) {
         ctx->base.throttling_value = 0;
@@ -312,7 +312,7 @@
 
     /* wrap it to create a software screen that can share resources */
     if (pipe_loader_sw_probe_wrapped(&ctx->swdev, ctx->base.hal))
-        ctx->base.ref = pipe_loader_create_screen(ctx->swdev);
+        ctx->base.ref = pipe_loader_create_screen(ctx->swdev, 0);
 
     if (!ctx->base.ref) {
         ERR("Couldn't wrap drm screen to swrast screen. Software devices "
diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk
index 0a137a5..96b570e 100644
--- a/src/gallium/targets/dri/Android.mk
+++ b/src/gallium/targets/dri/Android.mk
@@ -38,60 +38,11 @@
 	libexpat \
 	libz
 
-ifneq ($(filter freedreno,$(MESA_GPU_DRIVERS)),)
-LOCAL_CFLAGS += -DGALLIUM_FREEDRENO
-gallium_DRIVERS += libmesa_winsys_freedreno libmesa_pipe_freedreno
-LOCAL_SHARED_LIBRARIES += libdrm_freedreno
-endif
-ifneq ($(filter i915g,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_winsys_i915 libmesa_pipe_i915
-LOCAL_SHARED_LIBRARIES += libdrm_intel
-LOCAL_CFLAGS += -DGALLIUM_I915
-endif
-ifneq ($(filter nouveau,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS +=  libmesa_winsys_nouveau libmesa_pipe_nouveau
-LOCAL_CFLAGS += -DGALLIUM_NOUVEAU
-LOCAL_SHARED_LIBRARIES += libdrm_nouveau
-endif
-ifneq ($(filter r%,$(MESA_GPU_DRIVERS)),)
-ifneq ($(filter r300g,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_r300
-LOCAL_CFLAGS += -DGALLIUM_R300
-endif
-ifneq ($(filter r600g,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_r600
-LOCAL_CFLAGS += -DGALLIUM_R600
-endif
-ifneq ($(filter radeonsi,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_radeonsi libmesa_winsys_amdgpu libmesa_amd_common
-LOCAL_SHARED_LIBRARIES += libLLVM libdrm_amdgpu
-LOCAL_CFLAGS += -DGALLIUM_RADEONSI
-endif
-gallium_DRIVERS += libmesa_winsys_radeon libmesa_pipe_radeon libmesa_amdgpu_addrlib
-LOCAL_SHARED_LIBRARIES += libdrm_radeon
-endif
-ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_softpipe libmesa_winsys_sw_dri
-LOCAL_CFLAGS += -DGALLIUM_SOFTPIPE
-endif
-ifneq ($(filter vc4,$(MESA_GPU_DRIVERS)),)
-LOCAL_CFLAGS += -DGALLIUM_VC4
-gallium_DRIVERS += libmesa_winsys_vc4 libmesa_pipe_vc4
-endif
-ifneq ($(filter virgl,$(MESA_GPU_DRIVERS)),)
-LOCAL_CFLAGS += -DGALLIUM_VIRGL
-gallium_DRIVERS += libmesa_winsys_virgl libmesa_winsys_virgl_vtest libmesa_pipe_virgl
-endif
-ifneq ($(filter vmwgfx,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_winsys_svga libmesa_pipe_svga
-LOCAL_CFLAGS += -DGALLIUM_VMWGFX
-endif
-ifneq ($(filter nouveau r600g,$(MESA_GPU_DRIVERS)),)
-LOCAL_SHARED_LIBRARIES += libc++
-endif
+$(foreach d, $(MESA_BUILD_GALLIUM), $(eval LOCAL_CFLAGS += $(patsubst HAVE_%,-D%,$(d))))
 
+# sort GALLIUM_LIBS to remove any duplicates
 LOCAL_WHOLE_STATIC_LIBRARIES := \
-	$(gallium_DRIVERS) \
+	$(sort $(GALLIUM_LIBS)) \
 	libmesa_st_dri \
 	libmesa_st_mesa \
 	libmesa_glsl \
@@ -104,16 +55,17 @@
 	libmesa_util \
 	libmesa_loader
 
-LOCAL_STATIC_LIBRARIES :=
+# sort GALLIUM_SHARED_LIBS to remove any duplicates
+LOCAL_SHARED_LIBRARIES += $(sort $(GALLIUM_SHARED_LIBS))
 
-ifeq ($(MESA_ENABLE_LLVM),true)
-LOCAL_STATIC_LIBRARIES += \
-	libLLVMR600CodeGen \
-	libLLVMR600Desc \
-	libLLVMR600Info \
-	libLLVMR600AsmPrinter \
-	libelf
-LOCAL_LDLIBS += -lgcc
+ifneq ($(filter 5 6 7, $(MESA_ANDROID_MAJOR_VERSION)),)
+LOCAL_POST_INSTALL_CMD := \
+	$(foreach l, lib $(if $(filter true,$(TARGET_IS_64_BIT)),lib64), \
+	  mkdir -p $(TARGET_OUT)/$(l)/$(MESA_DRI_MODULE_REL_PATH); \
+	  $(foreach d, $(GALLIUM_TARGET_DRIVERS), ln -sf gallium_dri.so $(TARGET_OUT)/$(l)/$(MESA_DRI_MODULE_REL_PATH)/$(d)_dri.so;) \
+	)
+else
+LOCAL_MODULE_SYMLINKS := $(foreach d, $(GALLIUM_TARGET_DRIVERS), $(d)_dri.so)
 endif
 
 include $(GALLIUM_COMMON_MK)
diff --git a/src/gallium/targets/dri/Makefile.am b/src/gallium/targets/dri/Makefile.am
index 8363406..2d2e1ae 100644
--- a/src/gallium/targets/dri/Makefile.am
+++ b/src/gallium/targets/dri/Makefile.am
@@ -1,9 +1,5 @@
 include $(top_srcdir)/src/gallium/Automake.inc
 
-if HAVE_SHARED_GLAPI
-SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
-endif
-
 AM_CFLAGS = \
 	-I$(top_srcdir)/src/mapi \
 	-I$(top_srcdir)/src/mesa \
@@ -55,7 +51,7 @@
 	$(top_builddir)/src/gallium/drivers/noop/libnoop.la \
 	$(top_builddir)/src/gallium/drivers/rbug/librbug.la \
 	$(top_builddir)/src/gallium/drivers/trace/libtrace.la \
-	$(SHARED_GLAPI_LIB) \
+	$(top_builddir)/src/mapi/shared-glapi/libglapi.la \
 	$(SELINUX_LIBS) \
 	$(EXPAT_LIBS) \
 	$(LIBDRM_LIBS) \
@@ -86,6 +82,7 @@
 include $(top_srcdir)/src/gallium/drivers/freedreno/Automake.inc
 
 include $(top_srcdir)/src/gallium/drivers/vc4/Automake.inc
+include $(top_srcdir)/src/gallium/drivers/pl111/Automake.inc
 
 include $(top_srcdir)/src/gallium/drivers/virgl/Automake.inc
 
diff --git a/src/gallium/targets/dri/SConscript b/src/gallium/targets/dri/SConscript
index d7a8cbd..f5c2818 100644
--- a/src/gallium/targets/dri/SConscript
+++ b/src/gallium/targets/dri/SConscript
@@ -3,7 +3,7 @@
 env = drienv.Clone()
 
 if env['suncc']:
-    print 'warning: not building dri-vmwgfx'
+    print('warning: not building dri-vmwgfx')
     Return()
 
 env.Append(CPPPATH = [
diff --git a/src/gallium/targets/dri/target.c b/src/gallium/targets/dri/target.c
index d24a61d..a831e35 100644
--- a/src/gallium/targets/dri/target.c
+++ b/src/gallium/targets/dri/target.c
@@ -73,6 +73,9 @@
 
 #if defined(GALLIUM_VC4)
 DEFINE_LOADER_DRM_ENTRYPOINT(vc4)
+#if defined(GALLIUM_PL111)
+DEFINE_LOADER_DRM_ENTRYPOINT(pl111)
+#endif
 #endif
 
 #if defined(GALLIUM_ETNAVIV)
diff --git a/src/gallium/targets/libgl-xlib/Makefile.am b/src/gallium/targets/libgl-xlib/Makefile.am
index 7853422..a29199f 100644
--- a/src/gallium/targets/libgl-xlib/Makefile.am
+++ b/src/gallium/targets/libgl-xlib/Makefile.am
@@ -25,7 +25,6 @@
 GL_TINY = $(MESA_MAJOR)$(MESA_MINOR)0$(MESA_TINY)
 
 if HAVE_SHARED_GLAPI
-SHARED_GLAPI_CFLAGS = -DGLX_SHARED_GLAPI
 SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
 endif
 
@@ -40,7 +39,6 @@
 	-I$(top_srcdir)/src/gallium/state_trackers/glx/xlib \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/gallium/winsys \
-	$(SHARED_GLAPI_CFLAGS) \
 	-DGALLIUM_SOFTPIPE \
 	-DGALLIUM_RBUG \
 	-DGALLIUM_TRACE
diff --git a/src/gallium/targets/osmesa/Makefile.am b/src/gallium/targets/osmesa/Makefile.am
index 6d340f1..2b4af57 100644
--- a/src/gallium/targets/osmesa/Makefile.am
+++ b/src/gallium/targets/osmesa/Makefile.am
@@ -66,7 +66,8 @@
 	$(top_builddir)/src/mapi/glapi/libglapi.la \
 	$(SHARED_GLAPI_LIB) \
 	$(OSMESA_LIB_DEPS) \
-	$(CLOCK_LIB)
+	$(CLOCK_LIB) \
+	$(LIBUNWIND_LIBS)
 
 if HAVE_GALLIUM_LLVM
 AM_CPPFLAGS += -DGALLIUM_LLVMPIPE
diff --git a/src/gallium/targets/pipe-loader/Makefile.am b/src/gallium/targets/pipe-loader/Makefile.am
index 5f629a2..6b11618 100644
--- a/src/gallium/targets/pipe-loader/Makefile.am
+++ b/src/gallium/targets/pipe-loader/Makefile.am
@@ -86,7 +86,7 @@
 	$(top_builddir)/src/gallium/winsys/i915/drm/libi915drm.la \
 	$(top_builddir)/src/gallium/drivers/i915/libi915.la \
 	$(LIBDRM_LIBS) \
-	$(INTEL_LIBS)
+	$(I915_LIBS)
 
 endif
 
@@ -129,13 +129,8 @@
 	$(top_builddir)/src/gallium/drivers/radeon/libradeon.la \
 	$(top_builddir)/src/gallium/drivers/r600/libr600.la \
 	$(LIBDRM_LIBS) \
-	$(RADEON_LIBS)
-
-if HAVE_GALLIUM_LLVM
-pipe_r600_la_LIBADD += \
-	$(top_builddir)/src/amd/common/libamd_common.la
-endif
-
+	$(RADEON_LIBS) \
+	$(LIBELF_LIBS)
 endif
 
 if HAVE_GALLIUM_RADEONSI
diff --git a/src/gallium/targets/pipe-loader/pipe_i915.c b/src/gallium/targets/pipe-loader/pipe_i915.c
index 2183dc3..43061b0 100644
--- a/src/gallium/targets/pipe-loader/pipe_i915.c
+++ b/src/gallium/targets/pipe-loader/pipe_i915.c
@@ -5,7 +5,7 @@
 #include "i915/i915_public.h"
 
 static struct pipe_screen *
-create_screen(int fd)
+create_screen(int fd, unsigned flags)
 {
    struct i915_winsys *iws;
    struct pipe_screen *screen;
diff --git a/src/gallium/targets/pipe-loader/pipe_msm.c b/src/gallium/targets/pipe-loader/pipe_msm.c
index 858b248..180e0f9 100644
--- a/src/gallium/targets/pipe-loader/pipe_msm.c
+++ b/src/gallium/targets/pipe-loader/pipe_msm.c
@@ -4,7 +4,7 @@
 #include "freedreno/drm/freedreno_drm_public.h"
 
 static struct pipe_screen *
-create_screen(int fd)
+create_screen(int fd, unsigned flags)
 {
    struct pipe_screen *screen;
 
diff --git a/src/gallium/targets/pipe-loader/pipe_nouveau.c b/src/gallium/targets/pipe-loader/pipe_nouveau.c
index d9c0c5d..de61f44 100644
--- a/src/gallium/targets/pipe-loader/pipe_nouveau.c
+++ b/src/gallium/targets/pipe-loader/pipe_nouveau.c
@@ -4,7 +4,7 @@
 #include "nouveau/drm/nouveau_drm_public.h"
 
 static struct pipe_screen *
-create_screen(int fd)
+create_screen(int fd, unsigned flags)
 {
    struct pipe_screen *screen;
 
diff --git a/src/gallium/targets/pipe-loader/pipe_r300.c b/src/gallium/targets/pipe-loader/pipe_r300.c
index dd5c0bd..da72859 100644
--- a/src/gallium/targets/pipe-loader/pipe_r300.c
+++ b/src/gallium/targets/pipe-loader/pipe_r300.c
@@ -5,11 +5,11 @@
 #include "r300/r300_public.h"
 
 static struct pipe_screen *
-create_screen(int fd)
+create_screen(int fd, unsigned flags)
 {
    struct radeon_winsys *sws;
 
-   sws = radeon_drm_winsys_create(fd, r300_screen_create);
+   sws = radeon_drm_winsys_create(fd, flags, r300_screen_create);
    return sws ? debug_screen_wrap(sws->screen) : NULL;
 }
 
diff --git a/src/gallium/targets/pipe-loader/pipe_r600.c b/src/gallium/targets/pipe-loader/pipe_r600.c
index 70760d0..dfe130a 100644
--- a/src/gallium/targets/pipe-loader/pipe_r600.c
+++ b/src/gallium/targets/pipe-loader/pipe_r600.c
@@ -5,11 +5,11 @@
 #include "r600/r600_public.h"
 
 static struct pipe_screen *
-create_screen(int fd)
+create_screen(int fd, unsigned flags)
 {
    struct radeon_winsys *rw;
 
-   rw = radeon_drm_winsys_create(fd, r600_screen_create);
+   rw = radeon_drm_winsys_create(fd, flags, r600_screen_create);
    return rw ? debug_screen_wrap(rw->screen) : NULL;
 }
 
diff --git a/src/gallium/targets/pipe-loader/pipe_radeonsi.c b/src/gallium/targets/pipe-loader/pipe_radeonsi.c
index 01b1d8a..1bbd97f 100644
--- a/src/gallium/targets/pipe-loader/pipe_radeonsi.c
+++ b/src/gallium/targets/pipe-loader/pipe_radeonsi.c
@@ -6,15 +6,15 @@
 #include "radeonsi/si_public.h"
 
 static struct pipe_screen *
-create_screen(int fd)
+create_screen(int fd, unsigned flags)
 {
    struct radeon_winsys *rw;
 
    /* First, try amdgpu. */
-   rw = amdgpu_winsys_create(fd, radeonsi_screen_create);
+   rw = amdgpu_winsys_create(fd, flags, radeonsi_screen_create);
 
    if (!rw)
-      rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+      rw = radeon_drm_winsys_create(fd, flags, radeonsi_screen_create);
 
    return rw ? debug_screen_wrap(rw->screen) : NULL;
 }
diff --git a/src/gallium/targets/pipe-loader/pipe_vmwgfx.c b/src/gallium/targets/pipe-loader/pipe_vmwgfx.c
index 7aa4421..6320831 100644
--- a/src/gallium/targets/pipe-loader/pipe_vmwgfx.c
+++ b/src/gallium/targets/pipe-loader/pipe_vmwgfx.c
@@ -5,7 +5,7 @@
 #include "svga/svga_public.h"
 
 static struct pipe_screen *
-create_screen(int fd)
+create_screen(int fd, unsigned flags)
 {
    struct svga_winsys_screen *sws;
    struct pipe_screen *screen;
diff --git a/src/gallium/tests/graw/fs-fragcoord.c b/src/gallium/tests/graw/fs-fragcoord.c
index 9b85cf7..cf7642c 100644
--- a/src/gallium/tests/graw/fs-fragcoord.c
+++ b/src/gallium/tests/graw/fs-fragcoord.c
@@ -68,7 +68,7 @@
 
    vbuf.stride = sizeof(struct vertex);
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(info.ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/fs-frontface.c b/src/gallium/tests/graw/fs-frontface.c
index a0c8a2d..32a13cb 100644
--- a/src/gallium/tests/graw/fs-frontface.c
+++ b/src/gallium/tests/graw/fs-frontface.c
@@ -90,7 +90,7 @@
 
    vbuf.stride = sizeof(struct vertex);
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(info.ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/fs-test.c b/src/gallium/tests/graw/fs-test.c
index e2e7ac8..d1ade1d 100644
--- a/src/gallium/tests/graw/fs-test.c
+++ b/src/gallium/tests/graw/fs-test.c
@@ -173,7 +173,7 @@
 
    vbuf.stride = sizeof( struct vertex );
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/fs-write-z.c b/src/gallium/tests/graw/fs-write-z.c
index eabae64..12267ed 100644
--- a/src/gallium/tests/graw/fs-write-z.c
+++ b/src/gallium/tests/graw/fs-write-z.c
@@ -94,7 +94,7 @@
 
    vbuf.stride = sizeof(struct vertex);
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(info.ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/gs-test.c b/src/gallium/tests/graw/gs-test.c
index 46042c6..dad3298 100644
--- a/src/gallium/tests/graw/gs-test.c
+++ b/src/gallium/tests/graw/gs-test.c
@@ -235,13 +235,13 @@
    vbuf.stride = sizeof( struct vertex );
    vbuf.buffer_offset = 0;
    if (draw_strip) {
-      vbuf.buffer = pipe_buffer_create_with_data(ctx,
+      vbuf.buffer.resource = pipe_buffer_create_with_data(ctx,
                                                  PIPE_BIND_VERTEX_BUFFER,
                                                  PIPE_USAGE_DEFAULT,
                                                  sizeof(vertices_strip),
                                                  vertices_strip);
    } else {
-      vbuf.buffer = pipe_buffer_create_with_data(ctx,
+      vbuf.buffer.resource = pipe_buffer_create_with_data(ctx,
                                                  PIPE_BIND_VERTEX_BUFFER,
                                                  PIPE_USAGE_DEFAULT,
                                                  sizeof(vertices),
diff --git a/src/gallium/tests/graw/occlusion-query.c b/src/gallium/tests/graw/occlusion-query.c
index d03934f..444b645 100644
--- a/src/gallium/tests/graw/occlusion-query.c
+++ b/src/gallium/tests/graw/occlusion-query.c
@@ -94,7 +94,7 @@
 
    vbuf.stride = sizeof(struct vertex);
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(info.ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               bytes,
diff --git a/src/gallium/tests/graw/quad-sample.c b/src/gallium/tests/graw/quad-sample.c
index 03f51fc..7917420 100644
--- a/src/gallium/tests/graw/quad-sample.c
+++ b/src/gallium/tests/graw/quad-sample.c
@@ -99,7 +99,7 @@
 
    vbuf.stride = sizeof( struct vertex );
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/quad-tex.c b/src/gallium/tests/graw/quad-tex.c
index 8a9d1b8..444f64e 100644
--- a/src/gallium/tests/graw/quad-tex.c
+++ b/src/gallium/tests/graw/quad-tex.c
@@ -57,7 +57,7 @@
 
    vbuf.stride = sizeof( struct vertex );
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(info.ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/shader-leak.c b/src/gallium/tests/graw/shader-leak.c
index dddb69c..fb4344c 100644
--- a/src/gallium/tests/graw/shader-leak.c
+++ b/src/gallium/tests/graw/shader-leak.c
@@ -89,7 +89,7 @@
 
    vbuf.stride = sizeof(struct vertex);
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/tex-srgb.c b/src/gallium/tests/graw/tex-srgb.c
index 9d3af94..503350a 100644
--- a/src/gallium/tests/graw/tex-srgb.c
+++ b/src/gallium/tests/graw/tex-srgb.c
@@ -73,7 +73,7 @@
 
    vbuf.stride = sizeof(struct vertex);
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(info.ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               num_verts * sizeof(struct vertex),
diff --git a/src/gallium/tests/graw/tex-swizzle.c b/src/gallium/tests/graw/tex-swizzle.c
index bc56a95..787f324 100644
--- a/src/gallium/tests/graw/tex-swizzle.c
+++ b/src/gallium/tests/graw/tex-swizzle.c
@@ -55,7 +55,7 @@
 
    vbuf.stride = sizeof(struct vertex);
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(info.ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/tri-gs.c b/src/gallium/tests/graw/tri-gs.c
index 6d9e41d..2ca36ce 100644
--- a/src/gallium/tests/graw/tri-gs.c
+++ b/src/gallium/tests/graw/tri-gs.c
@@ -90,7 +90,7 @@
 
    vbuf.stride = sizeof( struct vertex );
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/tri-instanced.c b/src/gallium/tests/graw/tri-instanced.c
index b1fa21d..6c6783c 100644
--- a/src/gallium/tests/graw/tri-instanced.c
+++ b/src/gallium/tests/graw/tri-instanced.c
@@ -104,7 +104,6 @@
 {
    struct pipe_vertex_element ve[3];
    struct pipe_vertex_buffer vbuf[2];
-   struct pipe_index_buffer ibuf;
    void *handle;
 
    memset(ve, 0, sizeof ve);
@@ -133,7 +132,7 @@
    /* vertex data */
    vbuf[0].stride = sizeof( struct vertex );
    vbuf[0].buffer_offset = 0;
-   vbuf[0].buffer = pipe_buffer_create_with_data(ctx,
+   vbuf[0].buffer.resource = pipe_buffer_create_with_data(ctx,
                                                  PIPE_BIND_VERTEX_BUFFER,
                                                  PIPE_USAGE_DEFAULT,
                                                  sizeof(vertices),
@@ -142,25 +141,13 @@
    /* instance data */
    vbuf[1].stride = sizeof( inst_data[0] );
    vbuf[1].buffer_offset = 0;
-   vbuf[1].buffer = pipe_buffer_create_with_data(ctx,
+   vbuf[1].buffer.resource = pipe_buffer_create_with_data(ctx,
                                                  PIPE_BIND_VERTEX_BUFFER,
                                                  PIPE_USAGE_DEFAULT,
                                                  sizeof(inst_data),
                                                  inst_data);
 
    ctx->set_vertex_buffers(ctx, 0, 2, vbuf);
-
-   /* index data */
-   ibuf.buffer = pipe_buffer_create_with_data(ctx,
-                                              PIPE_BIND_INDEX_BUFFER,
-                                              PIPE_USAGE_DEFAULT,
-                                              sizeof(indices),
-                                              indices);
-   ibuf.offset = 0;
-   ibuf.index_size = 2;
-
-   ctx->set_index_buffer(ctx, &ibuf);
-
 }
 
 static void set_vertex_shader( void )
@@ -203,16 +190,29 @@
 
    ctx->clear(ctx, PIPE_CLEAR_COLOR, &clear_color, 0, 0);
 
+
    util_draw_init_info(&info);
-   info.indexed = (draw_elements != 0);
+   info.index_size = draw_elements ? 2 : 0;
    info.mode = PIPE_PRIM_TRIANGLES;
    info.start = 0;
    info.count = 3;
    /* draw NUM_INST triangles */
    info.instance_count = NUM_INST;
 
+   /* index data */
+   if (info.index_size) {
+      info.index.resource =
+         pipe_buffer_create_with_data(ctx,
+                                      PIPE_BIND_INDEX_BUFFER,
+                                      PIPE_USAGE_DEFAULT,
+                                      sizeof(indices),
+                                      indices);
+   }
+
    ctx->draw_vbo(ctx, &info);
 
+   pipe_resource_reference(&info.index.resource, NULL);
+
    ctx->flush(ctx, NULL, 0);
 
    graw_save_surface_to_file(ctx, surf, NULL);
diff --git a/src/gallium/tests/graw/tri-large.c b/src/gallium/tests/graw/tri-large.c
index 4ccb7c5..1ca915a 100644
--- a/src/gallium/tests/graw/tri-large.c
+++ b/src/gallium/tests/graw/tri-large.c
@@ -59,7 +59,7 @@
 
    vbuf.stride = sizeof( struct vertex );
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(info.ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/tri.c b/src/gallium/tests/graw/tri.c
index 0968387..b62a2ab 100644
--- a/src/gallium/tests/graw/tri.c
+++ b/src/gallium/tests/graw/tri.c
@@ -56,7 +56,7 @@
 
    vbuf.stride = sizeof( struct vertex );
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(info.ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/graw/vs-test.c b/src/gallium/tests/graw/vs-test.c
index 14e63cd..e3b50ea 100644
--- a/src/gallium/tests/graw/vs-test.c
+++ b/src/gallium/tests/graw/vs-test.c
@@ -167,7 +167,7 @@
 
    vbuf.stride = sizeof( struct vertex );
    vbuf.buffer_offset = 0;
-   vbuf.buffer = pipe_buffer_create_with_data(ctx,
+   vbuf.buffer.resource = pipe_buffer_create_with_data(ctx,
                                               PIPE_BIND_VERTEX_BUFFER,
                                               PIPE_USAGE_DEFAULT,
                                               sizeof(vertices),
diff --git a/src/gallium/tests/trivial/compute.c b/src/gallium/tests/trivial/compute.c
index 443451e..a2e882c 100644
--- a/src/gallium/tests/trivial/compute.c
+++ b/src/gallium/tests/trivial/compute.c
@@ -76,7 +76,7 @@
         ret = pipe_loader_probe(&ctx->dev, 1);
         assert(ret);
 
-        ctx->screen = pipe_loader_create_screen(ctx->dev);
+        ctx->screen = pipe_loader_create_screen(ctx->dev, 0);
         assert(ctx->screen);
 
         ctx->pipe = ctx->screen->context_create(ctx->screen, NULL, 0);
diff --git a/src/gallium/tests/trivial/quad-tex.c b/src/gallium/tests/trivial/quad-tex.c
index 6e9957a..113cb92 100644
--- a/src/gallium/tests/trivial/quad-tex.c
+++ b/src/gallium/tests/trivial/quad-tex.c
@@ -96,7 +96,7 @@
 	assert(ret);
 
 	/* init a pipe screen */
-	p->screen = pipe_loader_create_screen(p->dev);
+	p->screen = pipe_loader_create_screen(p->dev, 0);
 	assert(p->screen);
 
 	/* create the pipe driver context and cso context */
@@ -273,7 +273,8 @@
 	p->fs = util_make_fragment_tex_shader(p->pipe, TGSI_TEXTURE_2D,
 	                                      TGSI_INTERPOLATE_LINEAR,
 	                                      TGSI_RETURN_TYPE_FLOAT,
-	                                      TGSI_RETURN_TYPE_FLOAT);
+	                                      TGSI_RETURN_TYPE_FLOAT, false,
+                                              false);
 }
 
 static void close_prog(struct program *p)
diff --git a/src/gallium/tests/trivial/tri.c b/src/gallium/tests/trivial/tri.c
index a203169..df02e96 100644
--- a/src/gallium/tests/trivial/tri.c
+++ b/src/gallium/tests/trivial/tri.c
@@ -91,7 +91,7 @@
 	assert(ret);
 
 	/* init a pipe screen */
-	p->screen = pipe_loader_create_screen(p->dev);
+	p->screen = pipe_loader_create_screen(p->dev, 0);
 	assert(p->screen);
 
 	/* create the pipe driver context and cso context */
diff --git a/src/gallium/tests/unit/u_format_test.c b/src/gallium/tests/unit/u_format_test.c
index 3145d13..69d6c7d 100644
--- a/src/gallium/tests/unit/u_format_test.c
+++ b/src/gallium/tests/unit/u_format_test.c
@@ -220,6 +220,11 @@
       }
    }
 
+   /* Ignore S3TC errors */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      success = TRUE;
+   }
+
    if (!success) {
       print_unpacked_rgba_float(format_desc, "FAILED: ", unpacked, " obtained\n");
       print_unpacked_rgba_doubl(format_desc, "        ", test->unpacked, " expected\n");
@@ -252,6 +257,11 @@
       }
    }
 
+   /* Ignore S3TC errors */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      success = TRUE;
+   }
+
    if (!success) {
       print_unpacked_rgba_float(format_desc, "FAILED: ", unpacked, " obtained\n");
       print_unpacked_rgba_doubl(format_desc, "        ", test->unpacked, " expected\n");
@@ -302,6 +312,11 @@
    if (util_is_double_nan(test->unpacked[0][0][0]))
       success = TRUE;
 
+   /* Ignore S3TC errors */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      success = TRUE;
+   }
+
    if (!success) {
       print_packed(format_desc, "FAILED: ", packed, " obtained\n");
       print_packed(format_desc, "        ", test->packed, " expected\n");
@@ -365,6 +380,11 @@
    if (util_is_double_nan(test->unpacked[0][0][0]))
       success = TRUE;
 
+   /* Ignore S3TC errors */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      success = TRUE;
+   }
+
    if (!success) {
       print_unpacked_rgba_8unorm(format_desc, "FAILED: ", unpacked, " obtained\n");
       print_unpacked_rgba_8unorm(format_desc, "        ", expected, " expected\n");
@@ -422,6 +442,11 @@
    if ((test->unpacked[0][0][0] * 255.0) != (int)(test->unpacked[0][0][0] * 255.0))
       success = TRUE;
 
+   /* Ignore S3TC errors */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      success = TRUE;
+   }
+
    if (!success) {
       print_packed(format_desc, "FAILED: ", packed, " obtained\n");
       print_packed(format_desc, "        ", test->packed, " expected\n");
diff --git a/src/gallium/winsys/amdgpu/drm/Android.mk b/src/gallium/winsys/amdgpu/drm/Android.mk
index 9030a83..a05304a 100644
--- a/src/gallium/winsys/amdgpu/drm/Android.mk
+++ b/src/gallium/winsys/amdgpu/drm/Android.mk
@@ -34,15 +34,17 @@
 	$(AMDGPU_CFLAGS) \
 	-DBRAHMA_BUILD=1
 
-LOCAL_C_INCLUDES := \
-	$(MESA_TOP)/src \
-	$(MESA_TOP)/src/amd \
-	$(MESA_TOP)/src/amd/addrlib/core \
-	$(MESA_TOP)/src/amd/addrlib/inc/chip/r800 \
-	$(MESA_TOP)/src/amd/addrlib/r800/chip
+LOCAL_STATIC_LIBRARIES := libmesa_amdgpu_addrlib
 
 LOCAL_SHARED_LIBRARIES := libdrm_amdgpu
 LOCAL_MODULE := libmesa_winsys_amdgpu
 
+$(call mesa-build-with-llvm)
+
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_RADEONSI),)
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) $(LOCAL_STATIC_LIBRARIES))
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/winsys/amdgpu/drm/Makefile.am b/src/gallium/winsys/amdgpu/drm/Makefile.am
index 543325c..0889591 100644
--- a/src/gallium/winsys/amdgpu/drm/Makefile.am
+++ b/src/gallium/winsys/amdgpu/drm/Makefile.am
@@ -10,5 +10,7 @@
 
 noinst_LTLIBRARIES = libamdgpuwinsys.la
 
-libamdgpuwinsys_la_LIBADD = $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la
+libamdgpuwinsys_la_LIBADD = \
+   $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la
+
 libamdgpuwinsys_la_SOURCES = $(C_SOURCES)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 6bdcce5..97bbe23 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -398,8 +398,6 @@
    if (initial_domain & RADEON_DOMAIN_GTT)
       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
 
-   if (flags & RADEON_FLAG_CPU_ACCESS)
-      request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
    if (flags & RADEON_FLAG_NO_CPU_ACCESS)
       request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
    if (flags & RADEON_FLAG_GTT_WC)
@@ -415,6 +413,8 @@
    }
 
    va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
+   if (size > ws->info.pte_fragment_size)
+	   alignment = MAX2(alignment, ws->info.pte_fragment_size);
    r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                              size + va_gap_size, alignment, 0, &va, &va_handle, 0);
    if (r)
@@ -495,33 +495,16 @@
 {
    struct amdgpu_winsys *ws = priv;
    struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
-   enum radeon_bo_domain domains;
-   enum radeon_bo_flag flags = 0;
+   enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
+   enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
    uint32_t base_id;
 
    if (!slab)
       return NULL;
 
-   if (heap & 1)
-      flags |= RADEON_FLAG_GTT_WC;
-   if (heap & 2)
-      flags |= RADEON_FLAG_CPU_ACCESS;
-
-   switch (heap >> 2) {
-   case 0:
-      domains = RADEON_DOMAIN_VRAM;
-      break;
-   default:
-   case 1:
-      domains = RADEON_DOMAIN_VRAM_GTT;
-      break;
-   case 2:
-      domains = RADEON_DOMAIN_GTT;
-      break;
-   }
-
+   unsigned slab_size = 1 << AMDGPU_SLAB_BO_SIZE_LOG2;
    slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
-                                                    64 * 1024, 64 * 1024,
+                                                    slab_size, slab_size,
                                                     domains, flags));
    if (!slab->buffer)
       goto fail;
@@ -688,7 +671,7 @@
 
       buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE,
                              bo->initial_domain,
-                             bo->u.sparse.flags | RADEON_FLAG_HANDLE);
+                             bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC);
       if (!buf) {
          FREE(best_backing->chunks);
          FREE(best_backing);
@@ -1153,34 +1136,22 @@
    struct amdgpu_winsys_bo *bo;
    unsigned usage = 0, pb_cache_bucket;
 
+   /* VRAM implies WC. This is not optional. */
+   assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
+
+   /* NO_CPU_ACCESS is valid with VRAM only. */
+   assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
+
    /* Sub-allocate small buffers from slabs. */
-   if (!(flags & (RADEON_FLAG_HANDLE | RADEON_FLAG_SPARSE)) &&
+   if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
        size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
        alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
       struct pb_slab_entry *entry;
-      unsigned heap = 0;
+      int heap = radeon_get_heap_index(domain, flags);
 
-      if (flags & RADEON_FLAG_GTT_WC)
-         heap |= 1;
-      if (flags & RADEON_FLAG_CPU_ACCESS)
-         heap |= 2;
-      if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS))
+      if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
          goto no_slab;
 
-      switch (domain) {
-      case RADEON_DOMAIN_VRAM:
-         heap |= 0 * 4;
-         break;
-      case RADEON_DOMAIN_VRAM_GTT:
-         heap |= 1 * 4;
-         break;
-      case RADEON_DOMAIN_GTT:
-         heap |= 2 * 4;
-         break;
-      default:
-         goto no_slab;
-      }
-
       entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
       if (!entry) {
          /* Clear the cache and try again. */
@@ -1202,7 +1173,6 @@
 
    if (flags & RADEON_FLAG_SPARSE) {
       assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
-      assert(!(flags & RADEON_FLAG_CPU_ACCESS));
 
       flags |= RADEON_FLAG_NO_CPU_ACCESS;
 
@@ -1210,7 +1180,7 @@
    }
 
    /* This flag is irrelevant for the cache. */
-   flags &= ~RADEON_FLAG_HANDLE;
+   flags &= ~RADEON_FLAG_NO_SUBALLOC;
 
    /* Align size to page size. This is the minimum alignment for normal
     * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
@@ -1219,22 +1189,11 @@
    size = align64(size, ws->info.gart_page_size);
    alignment = align(alignment, ws->info.gart_page_size);
 
-   /* Only set one usage bit each for domains and flags, or the cache manager
-    * might consider different sets of domains / flags compatible
-    */
-   if (domain == RADEON_DOMAIN_VRAM_GTT)
-      usage = 1 << 2;
-   else
-      usage = domain >> 1;
-   assert(flags < sizeof(usage) * 8 - 3);
-   usage |= 1 << (flags + 3);
+   int heap = radeon_get_heap_index(domain, flags);
+   assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
+   usage = 1 << heap; /* Only set one usage bit for each heap. */
 
-   /* Determine the pb_cache bucket for minimizing pb_cache misses. */
-   pb_cache_bucket = 0;
-   if (domain & RADEON_DOMAIN_VRAM) /* VRAM or VRAM+GTT */
-      pb_cache_bucket += 1;
-   if (flags == RADEON_FLAG_GTT_WC) /* WC */
-      pb_cache_bucket += 2;
+   pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
    assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
 
    /* Get a buffer from the cache. */
@@ -1363,10 +1322,9 @@
    enum amdgpu_bo_handle_type type;
    int r;
 
-   if (!bo->bo) {
-      offset += bo->va - bo->u.slab.real->va;
-      bo = bo->u.slab.real;
-   }
+   /* Don't allow exports of slab entries and sparse buffers. */
+   if (!bo->bo)
+      return false;
 
    bo->u.real.use_reusable_pool = false;
 
@@ -1453,6 +1411,13 @@
    return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL;
 }
 
+static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
+{
+   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
+
+   return !bo->bo && !bo->sparse;
+}
+
 static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
 {
    return ((struct amdgpu_winsys_bo*)buf)->va;
@@ -1469,6 +1434,7 @@
    ws->base.buffer_from_handle = amdgpu_bo_from_handle;
    ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
    ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
+   ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
    ws->base.buffer_get_handle = amdgpu_bo_get_handle;
    ws->base.buffer_commit = amdgpu_bo_sparse_commit;
    ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 6295c61..d266253 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -259,7 +259,8 @@
 static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
 {
    return cs->request.ip_type != AMDGPU_HW_IP_UVD &&
-          cs->request.ip_type != AMDGPU_HW_IP_VCE;
+          cs->request.ip_type != AMDGPU_HW_IP_VCE &&
+          cs->request.ip_type != AMDGPU_HW_IP_VCN_DEC;
 }
 
 static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs)
@@ -535,7 +536,7 @@
       buffer = &cs->sparse_buffers[index];
    }
 
-   buffer->u.real.priority_usage |= 1llu << priority;
+   buffer->u.real.priority_usage |= 1ull << priority;
    buffer->usage |= usage;
 
    cs->last_added_bo = bo;
@@ -580,8 +581,7 @@
 
    pb = ws->base.buffer_create(&ws->base, buffer_size,
                                ws->info.gart_page_size,
-                               RADEON_DOMAIN_GTT,
-                               RADEON_FLAG_CPU_ACCESS);
+                               RADEON_DOMAIN_GTT, 0);
    if (!pb)
       return false;
 
@@ -712,6 +712,10 @@
       cs->request.ip_type = AMDGPU_HW_IP_COMPUTE;
       break;
 
+   case RING_VCN_DEC:
+      cs->request.ip_type = AMDGPU_HW_IP_VCN_DEC;
+      break;
+
    default:
    case RING_GFX:
       cs->request.ip_type = AMDGPU_HW_IP_GFX;
@@ -1220,6 +1224,9 @@
          cs->flags[i] = (util_last_bit64(buffer->u.real.priority_usage) - 1) / 4;
       }
 
+      if (acs->ring_type == RING_GFX)
+         ws->gfx_bo_list_counter += cs->num_real_buffers;
+
       r = amdgpu_bo_list_create(ws->dev, cs->num_real_buffers,
                                 cs->handles, cs->flags,
                                 &cs->request.resources);
@@ -1330,6 +1337,10 @@
       while (rcs->current.cdw & 15)
          radeon_emit(rcs, 0x80000000); /* type2 nop packet */
       break;
+   case RING_VCN_DEC:
+      while (rcs->current.cdw & 15)
+         radeon_emit(rcs, 0x81ff); /* nop packet */
+      break;
    default:
       break;
    }
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_public.h b/src/gallium/winsys/amdgpu/drm/amdgpu_public.h
index ad133b2..3cb5a1b 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_public.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_public.h
@@ -32,9 +32,11 @@
 struct radeon_winsys;
 struct pipe_screen;
 
-typedef struct pipe_screen *(*radeon_screen_create_t)(struct radeon_winsys *);
+typedef struct pipe_screen *(*radeon_screen_create_t)(struct radeon_winsys *,
+						      unsigned);
 
 struct radeon_winsys *
-amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create);
+amdgpu_winsys_create(int fd, unsigned flags,
+		     radeon_screen_create_t screen_create);
 
 #endif
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
index 4d532e3..1a2b7c4 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
@@ -32,32 +32,8 @@
 #include "amdgpu_winsys.h"
 #include "util/u_format.h"
 
-#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND
-#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A
-#endif
-
-#ifndef CIASICIDGFXENGINE_ARCTICISLAND
-#define CIASICIDGFXENGINE_ARCTICISLAND 0x0000000D
-#endif
-
 static int amdgpu_surface_sanity(const struct pipe_resource *tex)
 {
-   /* all dimension must be at least 1 ! */
-   if (!tex->width0 || !tex->height0 || !tex->depth0 ||
-       !tex->array_size)
-      return -EINVAL;
-
-   switch (tex->nr_samples) {
-   case 0:
-   case 1:
-   case 2:
-   case 4:
-   case 8:
-      break;
-   default:
-      return -EINVAL;
-   }
-
    switch (tex->target) {
    case PIPE_TEXTURE_1D:
       if (tex->height0 > 1)
@@ -88,912 +64,39 @@
    return 0;
 }
 
-static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput)
-{
-   return malloc(pInput->sizeInBytes);
-}
-
-static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput)
-{
-   free(pInput->pVirtAddr);
-   return ADDR_OK;
-}
-
-ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
-{
-   ADDR_CREATE_INPUT addrCreateInput = {0};
-   ADDR_CREATE_OUTPUT addrCreateOutput = {0};
-   ADDR_REGISTER_VALUE regValue = {0};
-   ADDR_CREATE_FLAGS createFlags = {{0}};
-   ADDR_E_RETURNCODE addrRet;
-
-   addrCreateInput.size = sizeof(ADDR_CREATE_INPUT);
-   addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT);
-
-   regValue.gbAddrConfig = ws->amdinfo.gb_addr_cfg;
-   createFlags.value = 0;
-
-   if (ws->info.chip_class >= GFX9) {
-      addrCreateInput.chipEngine = CIASICIDGFXENGINE_ARCTICISLAND;
-      regValue.blockVarSizeLog2 = 0;
-   } else {
-      regValue.noOfBanks = ws->amdinfo.mc_arb_ramcfg & 0x3;
-      regValue.noOfRanks = (ws->amdinfo.mc_arb_ramcfg & 0x4) >> 2;
-
-      regValue.backendDisables = ws->amdinfo.enabled_rb_pipes_mask;
-      regValue.pTileConfig = ws->amdinfo.gb_tile_mode;
-      regValue.noOfEntries = ARRAY_SIZE(ws->amdinfo.gb_tile_mode);
-      if (ws->info.chip_class == SI) {
-         regValue.pMacroTileConfig = NULL;
-         regValue.noOfMacroEntries = 0;
-      } else {
-         regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode;
-         regValue.noOfMacroEntries = ARRAY_SIZE(ws->amdinfo.gb_macro_tile_mode);
-      }
-
-      createFlags.useTileIndex = 1;
-      createFlags.useHtileSliceAlign = 1;
-
-      addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
-      addrCreateInput.chipFamily = ws->family;
-      addrCreateInput.chipRevision = ws->rev_id;
-   }
-
-   addrCreateInput.chipFamily = ws->family;
-   addrCreateInput.chipRevision = ws->rev_id;
-   addrCreateInput.callbacks.allocSysMem = allocSysMem;
-   addrCreateInput.callbacks.freeSysMem = freeSysMem;
-   addrCreateInput.callbacks.debugPrint = 0;
-   addrCreateInput.createFlags = createFlags;
-   addrCreateInput.regValue = regValue;
-
-   addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput);
-   if (addrRet != ADDR_OK)
-      return NULL;
-
-   return addrCreateOutput.hLib;
-}
-
-static int gfx6_compute_level(struct amdgpu_winsys *ws,
-                              const struct pipe_resource *tex,
-                              struct radeon_surf *surf, bool is_stencil,
-                              unsigned level, bool compressed,
-                              ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
-                              ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
-                              ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
-                              ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
-                              ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
-                              ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
-{
-   struct legacy_surf_level *surf_level;
-   ADDR_E_RETURNCODE ret;
-
-   AddrSurfInfoIn->mipLevel = level;
-   AddrSurfInfoIn->width = u_minify(tex->width0, level);
-   AddrSurfInfoIn->height = u_minify(tex->height0, level);
-
-   if (tex->target == PIPE_TEXTURE_3D)
-      AddrSurfInfoIn->numSlices = u_minify(tex->depth0, level);
-   else if (tex->target == PIPE_TEXTURE_CUBE)
-      AddrSurfInfoIn->numSlices = 6;
-   else
-      AddrSurfInfoIn->numSlices = tex->array_size;
-
-   if (level > 0) {
-      /* Set the base level pitch. This is needed for calculation
-       * of non-zero levels. */
-      if (is_stencil)
-         AddrSurfInfoIn->basePitch = surf->u.legacy.stencil_level[0].nblk_x;
-      else
-         AddrSurfInfoIn->basePitch = surf->u.legacy.level[0].nblk_x;
-
-      /* Convert blocks to pixels for compressed formats. */
-      if (compressed)
-         AddrSurfInfoIn->basePitch *= surf->blk_w;
-   }
-
-   ret = AddrComputeSurfaceInfo(ws->addrlib,
-                                AddrSurfInfoIn,
-                                AddrSurfInfoOut);
-   if (ret != ADDR_OK) {
-      return ret;
-   }
-
-   surf_level = is_stencil ? &surf->u.legacy.stencil_level[level] : &surf->u.legacy.level[level];
-   surf_level->offset = align64(surf->surf_size, AddrSurfInfoOut->baseAlign);
-   surf_level->slice_size = AddrSurfInfoOut->sliceSize;
-   surf_level->nblk_x = AddrSurfInfoOut->pitch;
-   surf_level->nblk_y = AddrSurfInfoOut->height;
-
-   switch (AddrSurfInfoOut->tileMode) {
-   case ADDR_TM_LINEAR_ALIGNED:
-      surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
-      break;
-   case ADDR_TM_1D_TILED_THIN1:
-      surf_level->mode = RADEON_SURF_MODE_1D;
-      break;
-   case ADDR_TM_2D_TILED_THIN1:
-      surf_level->mode = RADEON_SURF_MODE_2D;
-      break;
-   default:
-      assert(0);
-   }
-
-   if (is_stencil)
-      surf->u.legacy.stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex;
-   else
-      surf->u.legacy.tiling_index[level] = AddrSurfInfoOut->tileIndex;
-
-   surf->surf_size = surf_level->offset + AddrSurfInfoOut->surfSize;
-
-   /* Clear DCC fields at the beginning. */
-   surf_level->dcc_offset = 0;
-
-   /* The previous level's flag tells us if we can use DCC for this level. */
-   if (AddrSurfInfoIn->flags.dccCompatible &&
-       (level == 0 || AddrDccOut->subLvlCompressible)) {
-      AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize;
-      AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
-      AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
-      AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex;
-      AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
-
-      ret = AddrComputeDccInfo(ws->addrlib,
-                               AddrDccIn,
-                               AddrDccOut);
-
-      if (ret == ADDR_OK) {
-         surf_level->dcc_offset = surf->dcc_size;
-         surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
-         surf->num_dcc_levels = level + 1;
-         surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
-         surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
-      }
-   }
-
-   /* TC-compatible HTILE. */
-   if (!is_stencil &&
-       AddrSurfInfoIn->flags.depth &&
-       AddrSurfInfoIn->flags.tcCompatible &&
-       surf_level->mode == RADEON_SURF_MODE_2D &&
-       level == 0) {
-      AddrHtileIn->flags.tcCompatible = 1;
-      AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
-      AddrHtileIn->height = AddrSurfInfoOut->height;
-      AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
-      AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
-      AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
-      AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
-      AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
-      AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
-
-      ret = AddrComputeHtileInfo(ws->addrlib,
-                                 AddrHtileIn,
-                                 AddrHtileOut);
-
-      if (ret == ADDR_OK) {
-         surf->htile_size = AddrHtileOut->htileBytes;
-         surf->htile_alignment = AddrHtileOut->baseAlign;
-      }
-   }
-
-   return 0;
-}
-
-#define   G_009910_MICRO_TILE_MODE(x)          (((x) >> 0) & 0x03)
-#define   G_009910_MICRO_TILE_MODE_NEW(x)      (((x) >> 22) & 0x07)
-
-static void gfx6_set_micro_tile_mode(struct radeon_surf *surf,
-                                     struct radeon_info *info)
-{
-   uint32_t tile_mode = info->si_tile_mode_array[surf->u.legacy.tiling_index[0]];
-
-   if (info->chip_class >= CIK)
-      surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode);
-   else
-      surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode);
-}
-
-static unsigned cik_get_macro_tile_index(struct radeon_surf *surf)
-{
-	unsigned index, tileb;
-
-	tileb = 8 * 8 * surf->bpe;
-	tileb = MIN2(surf->u.legacy.tile_split, tileb);
-
-	for (index = 0; tileb > 64; index++)
-		tileb >>= 1;
-
-	assert(index < 16);
-	return index;
-}
-
-static int gfx6_surface_init(struct radeon_winsys *rws,
-                             const struct pipe_resource *tex,
-                             unsigned flags, unsigned bpe,
-                             enum radeon_surf_mode mode,
-                             struct radeon_surf *surf)
+static int amdgpu_surface_init(struct radeon_winsys *rws,
+                               const struct pipe_resource *tex,
+                               unsigned flags, unsigned bpe,
+                               enum radeon_surf_mode mode,
+                               struct radeon_surf *surf)
 {
    struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
-   unsigned level;
-   bool compressed;
-   ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
-   ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
-   ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
-   ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
-   ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
-   ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
-   ADDR_TILEINFO AddrTileInfoIn = {0};
-   ADDR_TILEINFO AddrTileInfoOut = {0};
    int r;
 
    r = amdgpu_surface_sanity(tex);
    if (r)
       return r;
 
-   AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
-   AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
-   AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
-   AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
-   AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT);
-   AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT);
-   AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
-
    surf->blk_w = util_format_get_blockwidth(tex->format);
    surf->blk_h = util_format_get_blockheight(tex->format);
    surf->bpe = bpe;
    surf->flags = flags;
 
-   compressed = surf->blk_w == 4 && surf->blk_h == 4;
+   struct ac_surf_config config;
 
-   /* MSAA and FMASK require 2D tiling. */
-   if (tex->nr_samples > 1 ||
-       (flags & RADEON_SURF_FMASK))
-      mode = RADEON_SURF_MODE_2D;
+   config.info.width = tex->width0;
+   config.info.height = tex->height0;
+   config.info.depth = tex->depth0;
+   config.info.array_size = tex->array_size;
+   config.info.samples = tex->nr_samples;
+   config.info.levels = tex->last_level + 1;
+   config.is_3d = !!(tex->target == PIPE_TEXTURE_3D);
+   config.is_cube = !!(tex->target == PIPE_TEXTURE_CUBE);
 
-   /* DB doesn't support linear layouts. */
-   if (flags & (RADEON_SURF_Z_OR_SBUFFER) &&
-       mode < RADEON_SURF_MODE_1D)
-      mode = RADEON_SURF_MODE_1D;
-
-   /* Set the requested tiling mode. */
-   switch (mode) {
-   case RADEON_SURF_MODE_LINEAR_ALIGNED:
-      AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED;
-      break;
-   case RADEON_SURF_MODE_1D:
-      AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1;
-      break;
-   case RADEON_SURF_MODE_2D:
-      AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1;
-      break;
-   default:
-      assert(0);
-   }
-
-   /* The format must be set correctly for the allocation of compressed
-    * textures to work. In other cases, setting the bpp is sufficient. */
-   if (compressed) {
-      switch (bpe) {
-      case 8:
-         AddrSurfInfoIn.format = ADDR_FMT_BC1;
-         break;
-      case 16:
-         AddrSurfInfoIn.format = ADDR_FMT_BC3;
-         break;
-      default:
-         assert(0);
-      }
-   }
-   else {
-      AddrDccIn.bpp = AddrSurfInfoIn.bpp = bpe * 8;
-   }
-
-   AddrDccIn.numSamples = AddrSurfInfoIn.numSamples =
-      tex->nr_samples ? tex->nr_samples : 1;
-   AddrSurfInfoIn.tileIndex = -1;
-
-   /* Set the micro tile type. */
-   if (flags & RADEON_SURF_SCANOUT)
-      AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE;
-   else if (flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_FMASK))
-      AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
-   else
-      AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
-
-   AddrSurfInfoIn.flags.color = !(flags & RADEON_SURF_Z_OR_SBUFFER);
-   AddrSurfInfoIn.flags.depth = (flags & RADEON_SURF_ZBUFFER) != 0;
-   AddrSurfInfoIn.flags.cube = tex->target == PIPE_TEXTURE_CUBE;
-   AddrSurfInfoIn.flags.fmask = (flags & RADEON_SURF_FMASK) != 0;
-   AddrSurfInfoIn.flags.display = (flags & RADEON_SURF_SCANOUT) != 0;
-   AddrSurfInfoIn.flags.pow2Pad = tex->last_level > 0;
-   AddrSurfInfoIn.flags.tcCompatible = (flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
-
-   /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been
-    * requested, because TC-compatible HTILE requires 2D tiling.
-    */
-   AddrSurfInfoIn.flags.opt4Space = !AddrSurfInfoIn.flags.tcCompatible &&
-                                    !AddrSurfInfoIn.flags.fmask &&
-                                    tex->nr_samples <= 1 &&
-                                    (flags & RADEON_SURF_OPTIMIZE_FOR_SPACE);
-
-   /* DCC notes:
-    * - If we add MSAA support, keep in mind that CB can't decompress 8bpp
-    *   with samples >= 4.
-    * - Mipmapped array textures have low performance (discovered by a closed
-    *   driver team).
-    */
-   AddrSurfInfoIn.flags.dccCompatible = ws->info.chip_class >= VI &&
-                                        !(flags & RADEON_SURF_Z_OR_SBUFFER) &&
-                                        !(flags & RADEON_SURF_DISABLE_DCC) &&
-                                        !compressed && AddrDccIn.numSamples <= 1 &&
-                                        ((tex->array_size == 1 && tex->depth0 == 1) ||
-                                         tex->last_level == 0);
-
-   AddrSurfInfoIn.flags.noStencil = (flags & RADEON_SURF_SBUFFER) == 0;
-   AddrSurfInfoIn.flags.compressZ = AddrSurfInfoIn.flags.depth;
-
-   /* noStencil = 0 can result in a depth part that is incompatible with
-    * mipmapped texturing. So set noStencil = 1 when mipmaps are requested (in
-    * this case, we may end up setting stencil_adjusted).
-    *
-    * TODO: update addrlib to a newer version, remove this, and
-    * use flags.matchStencilTileCfg = 1 as an alternative fix.
-    */
-  if (tex->last_level > 0)
-      AddrSurfInfoIn.flags.noStencil = 1;
-
-   /* Set preferred macrotile parameters. This is usually required
-    * for shared resources. This is for 2D tiling only. */
-   if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 &&
-       surf->u.legacy.bankw && surf->u.legacy.bankh &&
-       surf->u.legacy.mtilea && surf->u.legacy.tile_split) {
-      assert(!(flags & RADEON_SURF_FMASK));
-
-      /* If any of these parameters are incorrect, the calculation
-       * will fail. */
-      AddrTileInfoIn.banks = surf->u.legacy.num_banks;
-      AddrTileInfoIn.bankWidth = surf->u.legacy.bankw;
-      AddrTileInfoIn.bankHeight = surf->u.legacy.bankh;
-      AddrTileInfoIn.macroAspectRatio = surf->u.legacy.mtilea;
-      AddrTileInfoIn.tileSplitBytes = surf->u.legacy.tile_split;
-      AddrTileInfoIn.pipeConfig = surf->u.legacy.pipe_config + 1; /* +1 compared to GB_TILE_MODE */
-      AddrSurfInfoIn.flags.opt4Space = 0;
-      AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn;
-
-      /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set
-       * the tile index, because we are expected to know it if
-       * we know the other parameters.
-       *
-       * This is something that can easily be fixed in Addrlib.
-       * For now, just figure it out here.
-       * Note that only 2D_TILE_THIN1 is handled here.
-       */
-      assert(!(flags & RADEON_SURF_Z_OR_SBUFFER));
-      assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1);
-
-      if (ws->info.chip_class == SI) {
-         if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) {
-            if (bpe == 2)
-               AddrSurfInfoIn.tileIndex = 11; /* 16bpp */
-            else
-               AddrSurfInfoIn.tileIndex = 12; /* 32bpp */
-         } else {
-            if (bpe == 1)
-               AddrSurfInfoIn.tileIndex = 14; /* 8bpp */
-            else if (bpe == 2)
-               AddrSurfInfoIn.tileIndex = 15; /* 16bpp */
-            else if (bpe == 4)
-               AddrSurfInfoIn.tileIndex = 16; /* 32bpp */
-            else
-               AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */
-         }
-      } else {
-         /* CIK - VI */
-         if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
-            AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
-         else
-            AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
-
-         /* Addrlib doesn't set this if tileIndex is forced like above. */
-         AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf);
-      }
-   }
-
-   surf->num_dcc_levels = 0;
-   surf->surf_size = 0;
-   surf->dcc_size = 0;
-   surf->dcc_alignment = 1;
-   surf->htile_size = 0;
-   surf->htile_alignment = 1;
-
-   /* Calculate texture layout information. */
-   for (level = 0; level <= tex->last_level; level++) {
-      r = gfx6_compute_level(ws, tex, surf, false, level, compressed,
-                             &AddrSurfInfoIn, &AddrSurfInfoOut,
-                             &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut);
-      if (r)
-         return r;
-
-      if (level == 0) {
-         surf->surf_alignment = AddrSurfInfoOut.baseAlign;
-         surf->u.legacy.pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1;
-         gfx6_set_micro_tile_mode(surf, &ws->info);
-
-         /* For 2D modes only. */
-         if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
-            surf->u.legacy.bankw = AddrSurfInfoOut.pTileInfo->bankWidth;
-            surf->u.legacy.bankh = AddrSurfInfoOut.pTileInfo->bankHeight;
-            surf->u.legacy.mtilea = AddrSurfInfoOut.pTileInfo->macroAspectRatio;
-            surf->u.legacy.tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes;
-            surf->u.legacy.num_banks = AddrSurfInfoOut.pTileInfo->banks;
-            surf->u.legacy.macro_tile_index = AddrSurfInfoOut.macroModeIndex;
-         } else {
-            surf->u.legacy.macro_tile_index = 0;
-         }
-      }
-   }
-
-   /* Calculate texture layout information for stencil. */
-   if (flags & RADEON_SURF_SBUFFER) {
-      AddrSurfInfoIn.bpp = 8;
-      AddrSurfInfoIn.flags.depth = 0;
-      AddrSurfInfoIn.flags.stencil = 1;
-      AddrSurfInfoIn.flags.tcCompatible = 0;
-      /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
-      AddrTileInfoIn.tileSplitBytes = surf->u.legacy.stencil_tile_split;
-
-      for (level = 0; level <= tex->last_level; level++) {
-         r = gfx6_compute_level(ws, tex, surf, true, level, compressed,
-                                &AddrSurfInfoIn, &AddrSurfInfoOut,
-                                &AddrDccIn, &AddrDccOut,
-                                NULL, NULL);
-         if (r)
-            return r;
-
-         /* DB uses the depth pitch for both stencil and depth. */
-         if (surf->u.legacy.stencil_level[level].nblk_x != surf->u.legacy.level[level].nblk_x)
-            surf->u.legacy.stencil_adjusted = true;
-
-         if (level == 0) {
-            /* For 2D modes only. */
-            if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
-               surf->u.legacy.stencil_tile_split =
-                     AddrSurfInfoOut.pTileInfo->tileSplitBytes;
-            }
-         }
-      }
-   }
-
-   /* Recalculate the whole DCC miptree size including disabled levels.
-    * This is what addrlib does, but calling addrlib would be a lot more
-    * complicated.
-    */
-   if (surf->dcc_size && tex->last_level > 0) {
-      surf->dcc_size = align64(surf->surf_size >> 8,
-                               ws->info.pipe_interleave_bytes *
-                               ws->info.num_tile_pipes);
-   }
-
-   /* Make sure HTILE covers the whole miptree, because the shader reads
-    * TC-compatible HTILE even for levels where it's disabled by DB.
-    */
-   if (surf->htile_size && tex->last_level)
-	   surf->htile_size *= 2;
-
-   surf->is_linear = surf->u.legacy.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
-   return 0;
-}
-
-/* This is only called when expecting a tiled layout. */
-static int
-gfx9_get_preferred_swizzle_mode(struct amdgpu_winsys *ws,
-                                ADDR2_COMPUTE_SURFACE_INFO_INPUT *in,
-                                bool is_fmask, AddrSwizzleMode *swizzle_mode)
-{
-   ADDR_E_RETURNCODE ret;
-   ADDR2_GET_PREFERRED_SURF_SETTING_INPUT sin = {0};
-   ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT sout = {0};
-
-   sin.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_INPUT);
-   sout.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT);
-
-   sin.flags = in->flags;
-   sin.resourceType = in->resourceType;
-   sin.format = in->format;
-   sin.resourceLoction = ADDR_RSRC_LOC_INVIS;
-   /* TODO: We could allow some of these: */
-   sin.forbiddenBlock.micro = 1; /* don't allow the 256B swizzle modes */
-   sin.forbiddenBlock.var = 1; /* don't allow the variable-sized swizzle modes */
-   sin.forbiddenBlock.linear = 1; /* don't allow linear swizzle modes */
-   sin.bpp = in->bpp;
-   sin.width = in->width;
-   sin.height = in->height;
-   sin.numSlices = in->numSlices;
-   sin.numMipLevels = in->numMipLevels;
-   sin.numSamples = in->numSamples;
-   sin.numFrags = in->numFrags;
-
-   if (is_fmask) {
-      sin.flags.color = 0;
-      sin.flags.fmask = 1;
-   }
-
-   ret = Addr2GetPreferredSurfaceSetting(ws->addrlib, &sin, &sout);
-   if (ret != ADDR_OK)
-      return ret;
-
-   *swizzle_mode = sout.swizzleMode;
-   return 0;
-}
-
-static int gfx9_compute_miptree(struct amdgpu_winsys *ws,
-                                struct radeon_surf *surf, bool compressed,
-                                ADDR2_COMPUTE_SURFACE_INFO_INPUT *in)
-{
-   ADDR2_MIP_INFO mip_info[RADEON_SURF_MAX_LEVELS] = {};
-   ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
-   ADDR_E_RETURNCODE ret;
-
-   out.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT);
-   out.pMipInfo = mip_info;
-
-   ret = Addr2ComputeSurfaceInfo(ws->addrlib, in, &out);
-   if (ret != ADDR_OK)
-      return ret;
-
-   if (in->flags.stencil) {
-      surf->u.gfx9.stencil.swizzle_mode = in->swizzleMode;
-      surf->u.gfx9.stencil.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 :
-                                                         out.mipChainPitch - 1;
-      surf->surf_alignment = MAX2(surf->surf_alignment, out.baseAlign);
-      surf->u.gfx9.stencil_offset = align(surf->surf_size, out.baseAlign);
-      surf->surf_size = surf->u.gfx9.stencil_offset + out.surfSize;
-      return 0;
-   }
-
-   surf->u.gfx9.surf.swizzle_mode = in->swizzleMode;
-   surf->u.gfx9.surf.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 :
-                                                   out.mipChainPitch - 1;
-
-   /* CMASK fast clear uses these even if FMASK isn't allocated.
-    * FMASK only supports the Z swizzle modes, whose numbers are multiples of 4.
-    */
-   surf->u.gfx9.fmask.swizzle_mode = surf->u.gfx9.surf.swizzle_mode & ~0x3;
-   surf->u.gfx9.fmask.epitch = surf->u.gfx9.surf.epitch;
-
-   surf->u.gfx9.surf_slice_size = out.sliceSize;
-   surf->u.gfx9.surf_pitch = out.pitch;
-   surf->u.gfx9.surf_height = out.height;
-   surf->surf_size = out.surfSize;
-   surf->surf_alignment = out.baseAlign;
-
-   if (in->swizzleMode == ADDR_SW_LINEAR) {
-      for (unsigned i = 0; i < in->numMipLevels; i++)
-         surf->u.gfx9.offset[i] = mip_info[i].offset;
-   }
-
-   if (in->flags.depth) {
-      assert(in->swizzleMode != ADDR_SW_LINEAR);
-
-      /* HTILE */
-      ADDR2_COMPUTE_HTILE_INFO_INPUT hin = {0};
-      ADDR2_COMPUTE_HTILE_INFO_OUTPUT hout = {0};
-
-      hin.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_INPUT);
-      hout.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_OUTPUT);
-
-      hin.hTileFlags.pipeAligned = 1;
-      hin.hTileFlags.rbAligned = 1;
-      hin.depthFlags = in->flags;
-      hin.swizzleMode = in->swizzleMode;
-      hin.unalignedWidth = in->width;
-      hin.unalignedHeight = in->height;
-      hin.numSlices = in->numSlices;
-      hin.numMipLevels = in->numMipLevels;
-
-      ret = Addr2ComputeHtileInfo(ws->addrlib, &hin, &hout);
-      if (ret != ADDR_OK)
-         return ret;
-
-      surf->u.gfx9.htile.rb_aligned = hin.hTileFlags.rbAligned;
-      surf->u.gfx9.htile.pipe_aligned = hin.hTileFlags.pipeAligned;
-      surf->htile_size = hout.htileBytes;
-      surf->htile_alignment = hout.baseAlign;
-   } else {
-      /* DCC */
-      if (!(surf->flags & RADEON_SURF_DISABLE_DCC) &&
-          !(surf->flags & RADEON_SURF_SCANOUT) &&
-          !compressed &&
-          in->swizzleMode != ADDR_SW_LINEAR &&
-          /* TODO: We could support DCC with MSAA. */
-          in->numSamples == 1) {
-         ADDR2_COMPUTE_DCCINFO_INPUT din = {0};
-         ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0};
-
-         din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT);
-         dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT);
-
-         din.dccKeyFlags.pipeAligned = 1;
-         din.dccKeyFlags.rbAligned = 1;
-         din.colorFlags = in->flags;
-         din.resourceType = in->resourceType;
-         din.swizzleMode = in->swizzleMode;
-         din.bpp = in->bpp;
-         din.unalignedWidth = in->width;
-         din.unalignedHeight = in->height;
-         din.numSlices = in->numSlices;
-         din.numFrags = in->numFrags;
-         din.numMipLevels = in->numMipLevels;
-         din.dataSurfaceSize = out.surfSize;
-
-         ret = Addr2ComputeDccInfo(ws->addrlib, &din, &dout);
-         if (ret != ADDR_OK)
-            return ret;
-
-         surf->u.gfx9.dcc.rb_aligned = din.dccKeyFlags.rbAligned;
-         surf->u.gfx9.dcc.pipe_aligned = din.dccKeyFlags.pipeAligned;
-         surf->u.gfx9.dcc_pitch_max = dout.pitch - 1;
-         surf->dcc_size = dout.dccRamSize;
-         surf->dcc_alignment = dout.dccRamBaseAlign;
-      }
-
-      /* FMASK */
-      if (in->numSamples > 1) {
-         ADDR2_COMPUTE_FMASK_INFO_INPUT fin = {0};
-         ADDR2_COMPUTE_FMASK_INFO_OUTPUT fout = {0};
-
-         fin.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_INPUT);
-         fout.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_OUTPUT);
-
-         ret = gfx9_get_preferred_swizzle_mode(ws, in, true, &fin.swizzleMode);
-         if (ret != ADDR_OK)
-            return ret;
-
-         fin.unalignedWidth = in->width;
-         fin.unalignedHeight = in->height;
-         fin.numSlices = in->numSlices;
-         fin.numSamples = in->numSamples;
-         fin.numFrags = in->numFrags;
-
-         ret = Addr2ComputeFmaskInfo(ws->addrlib, &fin, &fout);
-         if (ret != ADDR_OK)
-            return ret;
-
-         surf->u.gfx9.fmask.swizzle_mode = fin.swizzleMode;
-         surf->u.gfx9.fmask.epitch = fout.pitch - 1;
-         surf->u.gfx9.fmask_size = fout.fmaskBytes;
-         surf->u.gfx9.fmask_alignment = fout.baseAlign;
-      }
-
-      /* CMASK */
-      if (in->swizzleMode != ADDR_SW_LINEAR) {
-         ADDR2_COMPUTE_CMASK_INFO_INPUT cin = {0};
-         ADDR2_COMPUTE_CMASK_INFO_OUTPUT cout = {0};
-
-         cin.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_INPUT);
-         cout.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_OUTPUT);
-
-         cin.cMaskFlags.pipeAligned = 1;
-         cin.cMaskFlags.rbAligned = 1;
-         cin.colorFlags = in->flags;
-         cin.resourceType = in->resourceType;
-         cin.unalignedWidth = in->width;
-         cin.unalignedHeight = in->height;
-         cin.numSlices = in->numSlices;
-
-         if (in->numSamples > 1)
-            cin.swizzleMode = surf->u.gfx9.fmask.swizzle_mode;
-         else
-            cin.swizzleMode = in->swizzleMode;
-
-         ret = Addr2ComputeCmaskInfo(ws->addrlib, &cin, &cout);
-         if (ret != ADDR_OK)
-            return ret;
-
-         surf->u.gfx9.cmask.rb_aligned = cin.cMaskFlags.rbAligned;
-         surf->u.gfx9.cmask.pipe_aligned = cin.cMaskFlags.pipeAligned;
-         surf->u.gfx9.cmask_size = cout.cmaskBytes;
-         surf->u.gfx9.cmask_alignment = cout.baseAlign;
-      }
-   }
-
-   return 0;
-}
-
-static int gfx9_surface_init(struct radeon_winsys *rws,
-                             const struct pipe_resource *tex,
-                             unsigned flags, unsigned bpe,
-                             enum radeon_surf_mode mode,
-                             struct radeon_surf *surf)
-{
-   struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
-   bool compressed;
-   ADDR2_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
-   int r;
-
-   assert(!(flags & RADEON_SURF_FMASK));
-
-   r = amdgpu_surface_sanity(tex);
-   if (r)
-      return r;
-
-   AddrSurfInfoIn.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT);
-
-   surf->blk_w = util_format_get_blockwidth(tex->format);
-   surf->blk_h = util_format_get_blockheight(tex->format);
-   surf->bpe = bpe;
-   surf->flags = flags;
-
-   compressed = surf->blk_w == 4 && surf->blk_h == 4;
-
-   /* The format must be set correctly for the allocation of compressed
-    * textures to work. In other cases, setting the bpp is sufficient. */
-   if (compressed) {
-      switch (bpe) {
-      case 8:
-         AddrSurfInfoIn.format = ADDR_FMT_BC1;
-         break;
-      case 16:
-         AddrSurfInfoIn.format = ADDR_FMT_BC3;
-         break;
-      default:
-         assert(0);
-      }
-   } else {
-      AddrSurfInfoIn.bpp = bpe * 8;
-   }
-
-   AddrSurfInfoIn.flags.color = !(flags & RADEON_SURF_Z_OR_SBUFFER);
-   AddrSurfInfoIn.flags.depth = (flags & RADEON_SURF_ZBUFFER) != 0;
-   AddrSurfInfoIn.flags.display = (flags & RADEON_SURF_SCANOUT) != 0;
-   AddrSurfInfoIn.flags.texture = 1;
-   AddrSurfInfoIn.flags.opt4space = 1;
-
-   AddrSurfInfoIn.numMipLevels = tex->last_level + 1;
-   AddrSurfInfoIn.numSamples = tex->nr_samples ? tex->nr_samples : 1;
-   AddrSurfInfoIn.numFrags = AddrSurfInfoIn.numSamples;
-
-   switch (tex->target) {
-   /* GFX9 doesn't support 1D depth textures, so allocate all 1D textures
-    * as 2D to avoid having shader variants for 1D vs 2D, so all shaders
-    * must sample 1D textures as 2D. */
-   case PIPE_TEXTURE_1D:
-   case PIPE_TEXTURE_1D_ARRAY:
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_2D_ARRAY:
-   case PIPE_TEXTURE_RECT:
-   case PIPE_TEXTURE_CUBE:
-   case PIPE_TEXTURE_CUBE_ARRAY:
-   case PIPE_TEXTURE_3D:
-      if (tex->target == PIPE_TEXTURE_3D)
-         AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_3D;
-      else
-         AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_2D;
-
-      AddrSurfInfoIn.width = tex->width0;
-      AddrSurfInfoIn.height = tex->height0;
-
-      if (tex->target == PIPE_TEXTURE_3D)
-         AddrSurfInfoIn.numSlices = tex->depth0;
-      else if (tex->target == PIPE_TEXTURE_CUBE)
-         AddrSurfInfoIn.numSlices = 6;
-      else
-         AddrSurfInfoIn.numSlices = tex->array_size;
-
-      switch (mode) {
-      case RADEON_SURF_MODE_LINEAR_ALIGNED:
-         assert(tex->nr_samples <= 1);
-         assert(!(flags & RADEON_SURF_Z_OR_SBUFFER));
-         AddrSurfInfoIn.swizzleMode = ADDR_SW_LINEAR;
-         break;
-
-      case RADEON_SURF_MODE_1D:
-      case RADEON_SURF_MODE_2D:
-         r = gfx9_get_preferred_swizzle_mode(ws, &AddrSurfInfoIn, false,
-                                             &AddrSurfInfoIn.swizzleMode);
-         if (r)
-            return r;
-         break;
-
-      default:
-         assert(0);
-      }
-      break;
-
-   default:
-      assert(0);
-   }
-
-   surf->u.gfx9.resource_type = AddrSurfInfoIn.resourceType;
-
-   surf->surf_size = 0;
-   surf->dcc_size = 0;
-   surf->htile_size = 0;
-   surf->u.gfx9.surf_offset = 0;
-   surf->u.gfx9.stencil_offset = 0;
-   surf->u.gfx9.fmask_size = 0;
-   surf->u.gfx9.cmask_size = 0;
-
-   /* Calculate texture layout information. */
-   r = gfx9_compute_miptree(ws, surf, compressed, &AddrSurfInfoIn);
-   if (r)
-      return r;
-
-   /* Calculate texture layout information for stencil. */
-   if (flags & RADEON_SURF_SBUFFER) {
-      AddrSurfInfoIn.bpp = 8;
-      AddrSurfInfoIn.flags.depth = 0;
-      AddrSurfInfoIn.flags.stencil = 1;
-
-      r = gfx9_compute_miptree(ws, surf, compressed, &AddrSurfInfoIn);
-      if (r)
-         return r;
-   }
-
-   surf->is_linear = surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR;
-   surf->num_dcc_levels = surf->dcc_size ? tex->last_level + 1 : 0;
-
-   switch (surf->u.gfx9.surf.swizzle_mode) {
-   /* S = standard. */
-   case ADDR_SW_256B_S:
-   case ADDR_SW_4KB_S:
-   case ADDR_SW_64KB_S:
-   case ADDR_SW_VAR_S:
-   case ADDR_SW_64KB_S_T:
-   case ADDR_SW_4KB_S_X:
-   case ADDR_SW_64KB_S_X:
-   case ADDR_SW_VAR_S_X:
-      surf->micro_tile_mode = RADEON_MICRO_MODE_THIN;
-      break;
-
-   /* D = display. */
-   case ADDR_SW_LINEAR:
-   case ADDR_SW_256B_D:
-   case ADDR_SW_4KB_D:
-   case ADDR_SW_64KB_D:
-   case ADDR_SW_VAR_D:
-   case ADDR_SW_64KB_D_T:
-   case ADDR_SW_4KB_D_X:
-   case ADDR_SW_64KB_D_X:
-   case ADDR_SW_VAR_D_X:
-      surf->micro_tile_mode = RADEON_MICRO_MODE_DISPLAY;
-      break;
-
-   /* R = rotated. */
-   case ADDR_SW_256B_R:
-   case ADDR_SW_4KB_R:
-   case ADDR_SW_64KB_R:
-   case ADDR_SW_VAR_R:
-   case ADDR_SW_64KB_R_T:
-   case ADDR_SW_4KB_R_X:
-   case ADDR_SW_64KB_R_X:
-   case ADDR_SW_VAR_R_X:
-      surf->micro_tile_mode = RADEON_MICRO_MODE_ROTATED;
-      break;
-
-   /* Z = depth. */
-   case ADDR_SW_4KB_Z:
-   case ADDR_SW_64KB_Z:
-   case ADDR_SW_VAR_Z:
-   case ADDR_SW_64KB_Z_T:
-   case ADDR_SW_4KB_Z_X:
-   case ADDR_SW_64KB_Z_X:
-   case ADDR_SW_VAR_Z_X:
-      surf->micro_tile_mode = RADEON_MICRO_MODE_DEPTH;
-      break;
-
-   default:
-      assert(0);
-   }
-
-   return 0;
+   return ac_compute_surface(ws->addrlib, &ws->info, &config, mode, surf);
 }
 
 void amdgpu_surface_init_functions(struct amdgpu_winsys *ws)
 {
-   if (ws->info.chip_class >= GFX9)
-      ws->base.surface_init = gfx9_surface_init;
-   else
-      ws->base.surface_init = gfx6_surface_init;
+   ws->base.surface_init = amdgpu_surface_init;
 }
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 70319db..837c1e2 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -43,192 +43,18 @@
 #include "amd/common/sid.h"
 #include "amd/common/gfx9d.h"
 
-#define CIK_TILE_MODE_COLOR_2D			14
-
-#define CIK__GB_TILE_MODE__PIPE_CONFIG(x)        (((x) >> 6) & 0x1f)
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P2               0
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16          4
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16         5
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32         6
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32         7
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16    8
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16    9
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16    10
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16   11
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16   12
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32   13
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32   14
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16   16
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16  17
+#ifndef AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS
+#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS	0x1E
+#endif
 
 static struct util_hash_table *dev_tab = NULL;
 static mtx_t dev_tab_mutex = _MTX_INITIALIZER_NP;
 
-static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
-{
-   unsigned mode2d = info->gb_tile_mode[CIK_TILE_MODE_COLOR_2D];
-
-   switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) {
-   case CIK__PIPE_CONFIG__ADDR_SURF_P2:
-       return 2;
-   case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32:
-       return 4;
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32:
-       return 8;
-   case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
-       return 16;
-   default:
-       fprintf(stderr, "Invalid CIK pipe configuration, assuming P2\n");
-       assert(!"this should never occur");
-       return 2;
-   }
-}
-
 /* Helper function to do the ioctls needed for setup and init. */
 static bool do_winsys_init(struct amdgpu_winsys *ws, int fd)
 {
-   struct amdgpu_buffer_size_alignments alignment_info = {};
-   struct amdgpu_heap_info vram, vram_vis, gtt;
-   struct drm_amdgpu_info_hw_ip dma = {}, uvd = {}, vce = {};
-   uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0;
-   uint32_t unused_feature;
-   int r, i, j;
-   drmDevicePtr devinfo;
-
-   /* Get PCI info. */
-   r = drmGetDevice2(fd, 0, &devinfo);
-   if (r) {
-      fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n");
+   if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
       goto fail;
-   }
-   ws->info.pci_domain = devinfo->businfo.pci->domain;
-   ws->info.pci_bus = devinfo->businfo.pci->bus;
-   ws->info.pci_dev = devinfo->businfo.pci->dev;
-   ws->info.pci_func = devinfo->businfo.pci->func;
-   drmFreeDevice(&devinfo);
-
-   /* Query hardware and driver information. */
-   r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_buffer_size_alignment(ws->dev, &alignment_info);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM,
-                              AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED,
-                              &vram_vis);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_DMA, 0, &dma);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_UVD, 0, &uvd);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_ME, 0, 0,
-				     &ws->info.me_fw_version, &unused_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0,
-				     &ws->info.pfp_fw_version, &unused_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_CE, 0, 0,
-				     &ws->info.ce_fw_version, &unused_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_UVD, 0, 0,
-				     &uvd_version, &uvd_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_VCE, 0, &vce);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_VCE, 0, 0,
-				     &vce_version, &vce_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
-      goto fail;
-   }
-
-   /* Set chip identification. */
-   ws->info.pci_id = ws->amdinfo.asic_id; /* TODO: is this correct? */
-   ws->info.vce_harvest_config = ws->amdinfo.vce_harvest_config;
-
-   switch (ws->info.pci_id) {
-#define CHIPSET(pci_id, name, cfamily) case pci_id: ws->info.family = CHIP_##cfamily; break;
-#include "pci_ids/radeonsi_pci_ids.h"
-#undef CHIPSET
-
-   default:
-      fprintf(stderr, "amdgpu: Invalid PCI ID.\n");
-      goto fail;
-   }
-
-   if (ws->info.family >= CHIP_VEGA10)
-      ws->info.chip_class = GFX9;
-   else if (ws->info.family >= CHIP_TONGA)
-      ws->info.chip_class = VI;
-   else if (ws->info.family >= CHIP_BONAIRE)
-      ws->info.chip_class = CIK;
-   else if (ws->info.family >= CHIP_TAHITI)
-      ws->info.chip_class = SI;
-   else {
-      fprintf(stderr, "amdgpu: Unknown family.\n");
-      goto fail;
-   }
 
    /* LLVM 5.0 is required for GFX9. */
    if (ws->info.chip_class >= GFX9 && HAVE_LLVM < 0x0500) {
@@ -237,162 +63,17 @@
       goto fail;
    }
 
-   /* family and rev_id are for addrlib */
-   switch (ws->info.family) {
-   case CHIP_TAHITI:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_TAHITI_P_A0;
-      break;
-   case CHIP_PITCAIRN:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_PITCAIRN_PM_A0;
-      break;
-   case CHIP_VERDE:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_CAPEVERDE_M_A0;
-      break;
-   case CHIP_OLAND:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_OLAND_M_A0;
-      break;
-   case CHIP_HAINAN:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_HAINAN_V_A0;
-      break;
-   case CHIP_BONAIRE:
-      ws->family = FAMILY_CI;
-      ws->rev_id = CI_BONAIRE_M_A0;
-      break;
-   case CHIP_KAVERI:
-      ws->family = FAMILY_KV;
-      ws->rev_id = KV_SPECTRE_A0;
-      break;
-   case CHIP_KABINI:
-      ws->family = FAMILY_KV;
-      ws->rev_id = KB_KALINDI_A0;
-      break;
-   case CHIP_HAWAII:
-      ws->family = FAMILY_CI;
-      ws->rev_id = CI_HAWAII_P_A0;
-      break;
-   case CHIP_MULLINS:
-      ws->family = FAMILY_KV;
-      ws->rev_id = ML_GODAVARI_A0;
-      break;
-   case CHIP_TONGA:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_TONGA_P_A0;
-      break;
-   case CHIP_ICELAND:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_ICELAND_M_A0;
-      break;
-   case CHIP_CARRIZO:
-      ws->family = FAMILY_CZ;
-      ws->rev_id = CARRIZO_A0;
-      break;
-   case CHIP_STONEY:
-      ws->family = FAMILY_CZ;
-      ws->rev_id = STONEY_A0;
-      break;
-   case CHIP_FIJI:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_FIJI_P_A0;
-      break;
-   case CHIP_POLARIS10:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_POLARIS10_P_A0;
-      break;
-   case CHIP_POLARIS11:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_POLARIS11_M_A0;
-      break;
-   case CHIP_POLARIS12:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_POLARIS12_V_A0;
-      break;
-   case CHIP_VEGA10:
-      ws->family = FAMILY_AI;
-      ws->rev_id = AI_VEGA10_P_A0;
-      break;
-   case CHIP_RAVEN:
-      ws->family = FAMILY_RV;
-      ws->rev_id = RAVEN_A0;
-      break;
-   default:
-      fprintf(stderr, "amdgpu: Unknown family.\n");
-      goto fail;
-   }
-
-   ws->addrlib = amdgpu_addr_create(ws);
+   ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, NULL);
    if (!ws->addrlib) {
       fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
       goto fail;
    }
 
-   /* Set which chips have dedicated VRAM. */
-   ws->info.has_dedicated_vram =
-      !(ws->amdinfo.ids_flags & AMDGPU_IDS_FLAGS_FUSION);
-
-   /* Set hardware information. */
-   ws->info.gart_size = gtt.heap_size;
-   ws->info.vram_size = vram.heap_size;
-   ws->info.vram_vis_size = vram_vis.heap_size;
-   /* The kernel can split large buffers in VRAM but not in GTT, so large
-    * allocations can fail or cause buffer movement failures in the kernel.
-    */
-   ws->info.max_alloc_size = MIN2(ws->info.vram_size * 0.9, ws->info.gart_size * 0.7);
-   /* convert the shader clock from KHz to MHz */
-   ws->info.max_shader_clock = ws->amdinfo.max_engine_clk / 1000;
-   ws->info.max_se = ws->amdinfo.num_shader_engines;
-   ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine;
-   ws->info.has_uvd = uvd.available_rings != 0;
-   ws->info.uvd_fw_version =
-         uvd.available_rings ? uvd_version : 0;
-   ws->info.vce_fw_version =
-         vce.available_rings ? vce_version : 0;
-   ws->info.has_userptr = true;
-   ws->info.num_render_backends = ws->amdinfo.rb_pipes;
-   ws->info.clock_crystal_freq = ws->amdinfo.gpu_counter_freq;
-   ws->info.tcc_cache_line_size = 64; /* TC L2 line size on GCN */
-   if (ws->info.chip_class == GFX9) {
-      ws->info.num_tile_pipes = 1 << G_0098F8_NUM_PIPES(ws->amdinfo.gb_addr_cfg);
-      ws->info.pipe_interleave_bytes =
-         256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(ws->amdinfo.gb_addr_cfg);
-   } else {
-      ws->info.num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo);
-      ws->info.pipe_interleave_bytes =
-         256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(ws->amdinfo.gb_addr_cfg);
-   }
-   ws->info.has_virtual_memory = true;
-   ws->info.has_sdma = dma.available_rings != 0;
-
-   /* Get the number of good compute units. */
-   ws->info.num_good_compute_units = 0;
-   for (i = 0; i < ws->info.max_se; i++)
-      for (j = 0; j < ws->info.max_sh_per_se; j++)
-         ws->info.num_good_compute_units +=
-            util_bitcount(ws->amdinfo.cu_bitmap[i][j]);
-
-   memcpy(ws->info.si_tile_mode_array, ws->amdinfo.gb_tile_mode,
-          sizeof(ws->amdinfo.gb_tile_mode));
-   ws->info.enabled_rb_mask = ws->amdinfo.enabled_rb_pipes_mask;
-
-   memcpy(ws->info.cik_macrotile_mode_array, ws->amdinfo.gb_macro_tile_mode,
-          sizeof(ws->amdinfo.gb_macro_tile_mode));
-
-   ws->info.gart_page_size = alignment_info.size_remote;
-
-   if (ws->info.chip_class == SI)
-      ws->info.gfx_ib_pad_with_type2 = TRUE;
-
    ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL;
 
    return true;
 
 fail:
-   if (ws->addrlib)
-      AddrDestroy(ws->addrlib);
    amdgpu_device_deinitialize(ws->dev);
    ws->dev = NULL;
    return false;
@@ -459,12 +140,17 @@
       return ws->num_gfx_IBs;
    case RADEON_NUM_SDMA_IBS:
       return ws->num_sdma_IBs;
+   case RADEON_GFX_BO_LIST_COUNTER:
+      return ws->gfx_bo_list_counter;
    case RADEON_NUM_BYTES_MOVED:
       amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_BYTES_MOVED, 8, &retval);
       return retval;
    case RADEON_NUM_EVICTIONS:
       amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_EVICTIONS, 8, &retval);
       return retval;
+   case RADEON_NUM_VRAM_CPU_PAGE_FAULTS:
+      amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS, 8, &retval);
+      return retval;
    case RADEON_VRAM_USAGE:
       amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &heap);
       return heap.heap_usage;
@@ -537,8 +223,16 @@
    return destroy;
 }
 
+static const char* amdgpu_get_chip_name(struct radeon_winsys *ws)
+{
+   amdgpu_device_handle dev = ((struct amdgpu_winsys *)ws)->dev;
+   return amdgpu_get_marketing_name(dev);
+}
+
+
 PUBLIC struct radeon_winsys *
-amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
+amdgpu_winsys_create(int fd, unsigned flags,
+		     radeon_screen_create_t screen_create)
 {
    struct amdgpu_winsys *ws;
    drmVersionPtr version = drmGetVersion(fd);
@@ -593,7 +287,7 @@
 
    if (!pb_slabs_init(&ws->bo_slabs,
                       AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2,
-                      12, /* number of heaps (domain/flags combinations) */
+                      RADEON_MAX_SLAB_HEAPS,
                       ws,
                       amdgpu_bo_can_reclaim_slab,
                       amdgpu_bo_slab_alloc,
@@ -612,6 +306,7 @@
    ws->base.cs_request_feature = amdgpu_cs_request_feature;
    ws->base.query_value = amdgpu_query_value;
    ws->base.read_registers = amdgpu_read_registers;
+   ws->base.get_chip_name = amdgpu_get_chip_name;
 
    amdgpu_bo_init_functions(ws);
    amdgpu_cs_init_functions(ws);
@@ -621,7 +316,8 @@
    (void) mtx_init(&ws->global_bo_list_lock, mtx_plain);
    (void) mtx_init(&ws->bo_fence_lock, mtx_plain);
 
-   if (!util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1)) {
+   if (!util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1,
+                        UTIL_QUEUE_INIT_RESIZE_IF_FULL)) {
       amdgpu_winsys_destroy(&ws->base);
       mtx_unlock(&dev_tab_mutex);
       return NULL;
@@ -632,7 +328,7 @@
     *
     * Alternatively, we could create the screen based on "ws->gen"
     * and link all drivers into one binary blob. */
-   ws->base.screen = screen_create(&ws->base);
+   ws->base.screen = screen_create(&ws->base, flags);
    if (!ws->base.screen) {
       amdgpu_winsys_destroy(&ws->base);
       mtx_unlock(&dev_tab_mutex);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index a5154ff..7cd2f20 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -41,8 +41,9 @@
 
 struct amdgpu_cs;
 
-#define AMDGPU_SLAB_MIN_SIZE_LOG2 9
-#define AMDGPU_SLAB_MAX_SIZE_LOG2 14
+#define AMDGPU_SLAB_MIN_SIZE_LOG2   9  /* 512 bytes */
+#define AMDGPU_SLAB_MAX_SIZE_LOG2   16 /* 64 KB */
+#define AMDGPU_SLAB_BO_SIZE_LOG2    17 /* 128 KB */
 
 struct amdgpu_winsys {
    struct radeon_winsys base;
@@ -65,6 +66,7 @@
    uint64_t num_gfx_IBs;
    uint64_t num_sdma_IBs;
    uint64_t num_mapped_buffers;
+   uint64_t gfx_bo_list_counter;
 
    struct radeon_info info;
 
@@ -73,8 +75,6 @@
 
    struct amdgpu_gpu_info amdinfo;
    ADDR_HANDLE addrlib;
-   uint32_t rev_id;
-   unsigned family;
 
    bool check_vm;
 
@@ -91,6 +91,5 @@
 }
 
 void amdgpu_surface_init_functions(struct amdgpu_winsys *ws);
-ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws);
 
 #endif
diff --git a/src/gallium/winsys/etnaviv/drm/Android.mk b/src/gallium/winsys/etnaviv/drm/Android.mk
new file mode 100644
index 0000000..32091be
--- /dev/null
+++ b/src/gallium/winsys/etnaviv/drm/Android.mk
@@ -0,0 +1,33 @@
+# Copyright (C) 2016 Linaro, Ltd, Rob Herring <robh@kernel.org>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(C_SOURCES)
+
+LOCAL_SHARED_LIBRARIES := libdrm_etnaviv
+
+LOCAL_MODULE := libmesa_winsys_etnaviv
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/winsys/i915/drm/Android.mk b/src/gallium/winsys/i915/drm/Android.mk
index b38bd8d..bab3e85 100644
--- a/src/gallium/winsys/i915/drm/Android.mk
+++ b/src/gallium/winsys/i915/drm/Android.mk
@@ -35,3 +35,7 @@
 
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_I915),)
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/winsys/i915/drm/Makefile.am b/src/gallium/winsys/i915/drm/Makefile.am
index 82c477d..f7f1180 100644
--- a/src/gallium/winsys/i915/drm/Makefile.am
+++ b/src/gallium/winsys/i915/drm/Makefile.am
@@ -26,7 +26,7 @@
 AM_CFLAGS = \
 	-I$(top_srcdir)/src/gallium/drivers \
 	$(GALLIUM_WINSYS_CFLAGS) \
-	$(INTEL_CFLAGS)
+	$(I915_CFLAGS)
 
 noinst_LTLIBRARIES = libi915drm.la
 
diff --git a/src/gallium/winsys/imx/drm/Android.mk b/src/gallium/winsys/imx/drm/Android.mk
new file mode 100644
index 0000000..d001974
--- /dev/null
+++ b/src/gallium/winsys/imx/drm/Android.mk
@@ -0,0 +1,40 @@
+# Copyright (C) 2016 Linaro, Ltd, Rob Herring <robh@kernel.org>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(C_SOURCES)
+
+LOCAL_SHARED_LIBRARIES := libdrm_etnaviv
+
+LOCAL_MODULE := libmesa_winsys_imx
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+ifneq ($(HAVE_GALLIUM_IMX),)
+GALLIUM_TARGET_DRIVERS += imx-drm
+$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_etnaviv)
+$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES))
+endif
diff --git a/src/gallium/winsys/pl111/drm/Android.mk b/src/gallium/winsys/pl111/drm/Android.mk
new file mode 100644
index 0000000..16edd97
--- /dev/null
+++ b/src/gallium/winsys/pl111/drm/Android.mk
@@ -0,0 +1,33 @@
+# Copyright (C) 2014 Emil Velikov <emil.l.velikov@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+# get C_SOURCES
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(C_SOURCES)
+
+LOCAL_MODULE := libmesa_winsys_pl111
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/winsys/pl111/drm/Makefile.am b/src/gallium/winsys/pl111/drm/Makefile.am
new file mode 100644
index 0000000..34027dc
--- /dev/null
+++ b/src/gallium/winsys/pl111/drm/Makefile.am
@@ -0,0 +1,34 @@
+# Copyright © 2012 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+include Makefile.sources
+include $(top_srcdir)/src/gallium/Automake.inc
+
+AM_CFLAGS = \
+	-I$(top_srcdir)/src/gallium/drivers \
+	-I$(top_srcdir)/src/gallium/winsys \
+	$(GALLIUM_WINSYS_CFLAGS) \
+	$(LIBDRM_CFLAGS)
+
+noinst_LTLIBRARIES = libpl111drm.la
+
+libpl111drm_la_SOURCES = $(C_SOURCES)
diff --git a/src/gallium/winsys/pl111/drm/Makefile.sources b/src/gallium/winsys/pl111/drm/Makefile.sources
new file mode 100644
index 0000000..b4496e6
--- /dev/null
+++ b/src/gallium/winsys/pl111/drm/Makefile.sources
@@ -0,0 +1,3 @@
+C_SOURCES := \
+   pl111_drm_public.h \
+   pl111_drm_winsys.c
diff --git a/src/gallium/winsys/pl111/drm/pl111_drm_public.h b/src/gallium/winsys/pl111/drm/pl111_drm_public.h
new file mode 100644
index 0000000..f362b0f
--- /dev/null
+++ b/src/gallium/winsys/pl111/drm/pl111_drm_public.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2016 Christian Gmeiner <christian.gmeiner@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Christian Gmeiner <christian.gmeiner@gmail.com>
+ */
+
+#ifndef __PL111_DRM_PUBLIC_H__
+#define __PL111_DRM_PUBLIC_H__
+
+struct pipe_screen;
+
+struct pipe_screen *pl111_drm_screen_create(int fd);
+
+#endif /* __PL111_DRM_PUBLIC_H__ */
diff --git a/src/gallium/winsys/pl111/drm/pl111_drm_winsys.c b/src/gallium/winsys/pl111/drm/pl111_drm_winsys.c
new file mode 100644
index 0000000..ef7b080f
--- /dev/null
+++ b/src/gallium/winsys/pl111/drm/pl111_drm_winsys.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2016 Christian Gmeiner <christian.gmeiner@gmail.com>
+ * Copyright (C) 2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "pl111_drm_public.h"
+#include "vc4/drm/vc4_drm_public.h"
+#include "xf86drm.h"
+
+#include "pipe/p_screen.h"
+#include "renderonly/renderonly.h"
+
+struct pipe_screen *pl111_drm_screen_create(int fd)
+{
+   struct renderonly ro = {
+      /* Passes the vc4-allocated BO through to the pl111 DRM device using
+       * PRIME buffer sharing.  The VC4 BO must be linear, which the SCANOUT
+       * flag on allocation will have ensured.
+       */
+      .create_for_resource = renderonly_create_gpu_import_for_resource,
+      .kms_fd = fd,
+      .gpu_fd = drmOpenWithType("vc4", NULL, DRM_NODE_RENDER),
+   };
+
+   if (ro.gpu_fd < 0)
+      return NULL;
+
+   struct pipe_screen *screen = vc4_drm_screen_create_renderonly(&ro);
+   if (!screen)
+      close(ro.gpu_fd);
+
+   return screen;
+}
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 2700c6f..8027a5f 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -611,8 +611,6 @@
 
     if (flags & RADEON_FLAG_GTT_WC)
         args.flags |= RADEON_GEM_GTT_WC;
-    if (flags & RADEON_FLAG_CPU_ACCESS)
-        args.flags |= RADEON_GEM_CPU_ACCESS;
     if (flags & RADEON_FLAG_NO_CPU_ACCESS)
         args.flags |= RADEON_GEM_NO_CPU_ACCESS;
 
@@ -731,31 +729,13 @@
 {
     struct radeon_drm_winsys *ws = priv;
     struct radeon_slab *slab = CALLOC_STRUCT(radeon_slab);
-    enum radeon_bo_domain domains;
-    enum radeon_bo_flag flags = 0;
+    enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
+    enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
     unsigned base_hash;
 
     if (!slab)
         return NULL;
 
-    if (heap & 1)
-        flags |= RADEON_FLAG_GTT_WC;
-    if (heap & 2)
-        flags |= RADEON_FLAG_CPU_ACCESS;
-
-    switch (heap >> 2) {
-    case 0:
-        domains = RADEON_DOMAIN_VRAM;
-        break;
-    default:
-    case 1:
-        domains = RADEON_DOMAIN_VRAM_GTT;
-        break;
-    case 2:
-        domains = RADEON_DOMAIN_GTT;
-        break;
-    }
-
     slab->buffer = radeon_bo(radeon_winsys_bo_create(&ws->base,
                                                      64 * 1024, 64 * 1024,
                                                      domains, flags));
@@ -942,35 +922,24 @@
     if (size > UINT_MAX)
         return NULL;
 
+    /* VRAM implies WC. This is not optional. */
+    if (domain & RADEON_DOMAIN_VRAM)
+        flags |= RADEON_FLAG_GTT_WC;
+    /* NO_CPU_ACCESS is valid with VRAM only. */
+    if (domain != RADEON_DOMAIN_VRAM)
+        flags &= ~RADEON_FLAG_NO_CPU_ACCESS;
+
     /* Sub-allocate small buffers from slabs. */
-    if (!(flags & RADEON_FLAG_HANDLE) &&
+    if (!(flags & RADEON_FLAG_NO_SUBALLOC) &&
         size <= (1 << RADEON_SLAB_MAX_SIZE_LOG2) &&
         ws->info.has_virtual_memory &&
         alignment <= MAX2(1 << RADEON_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
         struct pb_slab_entry *entry;
-        unsigned heap = 0;
+        int heap = radeon_get_heap_index(domain, flags);
 
-        if (flags & RADEON_FLAG_GTT_WC)
-            heap |= 1;
-        if (flags & RADEON_FLAG_CPU_ACCESS)
-            heap |= 2;
-        if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS))
+        if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
             goto no_slab;
 
-        switch (domain) {
-        case RADEON_DOMAIN_VRAM:
-            heap |= 0 * 4;
-            break;
-        case RADEON_DOMAIN_VRAM_GTT:
-            heap |= 1 * 4;
-            break;
-        case RADEON_DOMAIN_GTT:
-            heap |= 2 * 4;
-            break;
-        default:
-            goto no_slab;
-        }
-
         entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
         if (!entry) {
             /* Clear the cache and try again. */
@@ -991,7 +960,7 @@
 no_slab:
 
     /* This flag is irrelevant for the cache. */
-    flags &= ~RADEON_FLAG_HANDLE;
+    flags &= ~RADEON_FLAG_NO_SUBALLOC;
 
     /* Align size to page size. This is the minimum alignment for normal
      * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
@@ -1000,22 +969,11 @@
     size = align(size, ws->info.gart_page_size);
     alignment = align(alignment, ws->info.gart_page_size);
 
-    /* Only set one usage bit each for domains and flags, or the cache manager
-     * might consider different sets of domains / flags compatible
-     */
-    if (domain == RADEON_DOMAIN_VRAM_GTT)
-        usage = 1 << 2;
-    else
-        usage = (unsigned)domain >> 1;
-    assert(flags < sizeof(usage) * 8 - 3);
-    usage |= 1 << (flags + 3);
+    int heap = radeon_get_heap_index(domain, flags);
+    assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
+    usage = 1 << heap; /* Only set one usage bit for each heap. */
 
-    /* Determine the pb_cache bucket for minimizing pb_cache misses. */
-    pb_cache_bucket = 0;
-    if (domain & RADEON_DOMAIN_VRAM) /* VRAM or VRAM+GTT */
-       pb_cache_bucket += 1;
-    if (flags == RADEON_FLAG_GTT_WC) /* WC */
-       pb_cache_bucket += 2;
+    pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
     assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
 
     bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
@@ -1290,10 +1248,9 @@
     struct radeon_bo *bo = radeon_bo(buffer);
     struct radeon_drm_winsys *ws = bo->rws;
 
-    if (!bo->handle) {
-        offset += bo->va - bo->u.slab.real->va;
-        bo = bo->u.slab.real;
-    }
+    /* Don't allow exports of slab entries. */
+    if (!bo->handle)
+        return false;
 
     memset(&flink, 0, sizeof(flink));
 
@@ -1333,6 +1290,11 @@
    return ((struct radeon_bo*)buf)->user_ptr != NULL;
 }
 
+static bool radeon_winsys_bo_is_suballocated(struct pb_buffer *buf)
+{
+   return !((struct radeon_bo*)buf)->handle; /* true if handle is 0 */
+}
+
 static uint64_t radeon_winsys_bo_va(struct pb_buffer *buf)
 {
     return ((struct radeon_bo*)buf)->va;
@@ -1359,6 +1321,7 @@
     ws->base.buffer_from_handle = radeon_winsys_bo_from_handle;
     ws->base.buffer_from_ptr = radeon_winsys_bo_from_ptr;
     ws->base.buffer_is_user_ptr = radeon_winsys_bo_is_user_ptr;
+    ws->base.buffer_is_suballocated = radeon_winsys_bo_is_suballocated;
     ws->base.buffer_get_handle = radeon_winsys_bo_get_handle;
     ws->base.buffer_get_virtual_address = radeon_winsys_bo_va;
     ws->base.buffer_get_reloc_offset = radeon_winsys_bo_get_reloc_offset;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index f59b539..1e7060e 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -367,7 +367,7 @@
     reloc->read_domains |= rd;
     reloc->write_domain |= wd;
     reloc->flags = MAX2(reloc->flags, priority);
-    cs->csc->relocs_bo[index].u.real.priority_usage |= 1llu << priority;
+    cs->csc->relocs_bo[index].u.real.priority_usage |= 1ull << priority;
 
     if (added_domains & RADEON_DOMAIN_VRAM)
         cs->base.used_vram += bo->base.size;
@@ -751,7 +751,7 @@
 
     /* Create a fence, which is a dummy BO. */
     fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
-                                       RADEON_DOMAIN_GTT, RADEON_FLAG_HANDLE);
+                                       RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
     if (!fence)
        return NULL;
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_public.h b/src/gallium/winsys/radeon/drm/radeon_drm_public.h
index dfcaaa4..2192aa6 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_public.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_public.h
@@ -6,9 +6,11 @@
 struct radeon_winsys;
 struct pipe_screen;
 
-typedef struct pipe_screen *(*radeon_screen_create_t)(struct radeon_winsys *);
+typedef struct pipe_screen *(*radeon_screen_create_t)(struct radeon_winsys *,
+						      unsigned);
 
 struct radeon_winsys *
-radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create);
+radeon_drm_winsys_create(int fd, unsigned flags,
+			 radeon_screen_create_t screen_create);
 
 #endif
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index b4b9e99..ad1db3c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -305,20 +305,20 @@
     }
 
     /* Check for dma */
-    ws->info.has_sdma = false;
+    ws->info.num_sdma_rings = 0;
     /* DMA is disabled on R700. There is IB corruption and hangs. */
     if (ws->info.chip_class >= EVERGREEN && ws->info.drm_minor >= 27) {
-        ws->info.has_sdma = true;
+        ws->info.num_sdma_rings = 1;
     }
 
     /* Check for UVD and VCE */
-    ws->info.has_uvd = false;
+    ws->info.has_hw_decode = false;
     ws->info.vce_fw_version = 0x00000000;
     if (ws->info.drm_minor >= 32) {
 	uint32_t value = RADEON_CS_RING_UVD;
         if (radeon_get_drm_value(ws->fd, RADEON_INFO_RING_WORKING,
                                  "UVD Ring working", &value))
-            ws->info.has_uvd = value;
+            ws->info.has_hw_decode = value;
 
         value = RADEON_CS_RING_VCE;
         if (radeon_get_drm_value(ws->fd, RADEON_INFO_RING_WORKING,
@@ -629,7 +629,9 @@
                              "num-bytes-moved", (uint32_t*)&retval);
         return retval;
     case RADEON_NUM_EVICTIONS:
+    case RADEON_NUM_VRAM_CPU_PAGE_FAULTS:
     case RADEON_VRAM_VIS_USAGE:
+    case RADEON_GFX_BO_LIST_COUNTER:
         return 0; /* unimplemented */
     case RADEON_VRAM_USAGE:
         radeon_get_drm_value(ws->fd, RADEON_INFO_VRAM_USAGE,
@@ -734,7 +736,8 @@
 }
 
 PUBLIC struct radeon_winsys *
-radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
+radeon_drm_winsys_create(int fd, unsigned flags,
+			 radeon_screen_create_t screen_create)
 {
     struct radeon_drm_winsys *ws;
 
@@ -773,7 +776,7 @@
          */
         if (!pb_slabs_init(&ws->bo_slabs,
                            RADEON_SLAB_MIN_SIZE_LOG2, RADEON_SLAB_MAX_SIZE_LOG2,
-                           12,
+                           RADEON_MAX_SLAB_HEAPS,
                            ws,
                            radeon_bo_can_reclaim_slab,
                            radeon_bo_slab_alloc,
@@ -822,14 +825,14 @@
     ws->info.gart_page_size = sysconf(_SC_PAGESIZE);
 
     if (ws->num_cpus > 1 && debug_get_option_thread())
-        util_queue_init(&ws->cs_queue, "radeon_cs", 8, 1);
+        util_queue_init(&ws->cs_queue, "radeon_cs", 8, 1, 0);
 
     /* Create the screen at the end. The winsys must be initialized
      * completely.
      *
      * Alternatively, we could create the screen based on "ws->gen"
      * and link all drivers into one binary blob. */
-    ws->base.screen = screen_create(&ws->base);
+    ws->base.screen = screen_create(&ws->base, flags);
     if (!ws->base.screen) {
         radeon_winsys_destroy(&ws->base);
         mtx_unlock(&fd_tab_mutex);
diff --git a/src/gallium/winsys/svga/drm/vmw_context.c b/src/gallium/winsys/svga/drm/vmw_context.c
index c306d98..c0ee833 100644
--- a/src/gallium/winsys/svga/drm/vmw_context.c
+++ b/src/gallium/winsys/svga/drm/vmw_context.c
@@ -236,11 +236,13 @@
 
       if (vswc->command.used || pfence != NULL)
          vmw_ioctl_command(vws,
-			   vswc->base.cid,
-			   0,
+                           vswc->base.cid,
+                           0,
                            vswc->command.buffer,
                            vswc->command.used,
-                           &fence);
+                           &fence,
+                           vswc->base.imported_fence_fd,
+                           vswc->base.hints);
 
       pb_validate_fence(vswc->validate, fence);
       mtx_lock(&vws->cs_mutex);
@@ -280,11 +282,17 @@
    debug_flush_flush(vswc->fctx);
 #endif
    swc->hints &= ~SVGA_HINT_FLAG_CAN_PRE_FLUSH;
+   swc->hints &= ~SVGA_HINT_FLAG_EXPORT_FENCE_FD;
    vswc->preemptive_flush = FALSE;
    vswc->seen_surfaces = 0;
    vswc->seen_regions = 0;
    vswc->seen_mobs = 0;
 
+   if (vswc->base.imported_fence_fd != -1) {
+      close(vswc->base.imported_fence_fd);
+      vswc->base.imported_fence_fd = -1;
+   }
+
    if(pfence)
       vmw_fence_reference(vswc->vws, pfence, fence);
 
@@ -823,6 +831,8 @@
    if (vswc->base.cid == -1)
       goto out_no_context;
 
+   vswc->base.imported_fence_fd = -1;
+
    vswc->base.have_gb_objects = sws->have_gb_objects;
 
    vswc->vws = vws;
diff --git a/src/gallium/winsys/svga/drm/vmw_fence.c b/src/gallium/winsys/svga/drm/vmw_fence.c
index edf205e..061f588 100644
--- a/src/gallium/winsys/svga/drm/vmw_fence.c
+++ b/src/gallium/winsys/svga/drm/vmw_fence.c
@@ -22,6 +22,8 @@
  * SOFTWARE.
  *
  **********************************************************/
+#include <libsync.h>
+
 #include "util/u_memory.h"
 #include "util/u_atomic.h"
 #include "util/list.h"
@@ -32,7 +34,7 @@
 #include "vmw_screen.h"
 #include "vmw_fence.h"
 
-struct vmw_fence_ops 
+struct vmw_fence_ops
 {
    /*
     * Immutable members.
@@ -58,6 +60,8 @@
    uint32_t mask;
    int32_t signalled;
    uint32_t seqno;
+   int32_t fence_fd;
+   boolean imported; /* TRUE if imported from another process */
 };
 
 /**
@@ -175,15 +179,16 @@
  * @fence_ops: The fence_ops manager to register with.
  * @handle: Handle identifying the kernel fence object.
  * @mask: Mask of flags that this fence object may signal.
+ * @fd: File descriptor to associate with the fence
  *
  * Returns NULL on failure.
  */
 struct pipe_fence_handle *
 vmw_fence_create(struct pb_fence_ops *fence_ops, uint32_t handle,
-                 uint32_t seqno, uint32_t mask)
+                 uint32_t seqno, uint32_t mask, int32_t fd)
 {
    struct vmw_fence *fence = CALLOC_STRUCT(vmw_fence);
-   struct vmw_fence_ops *ops = vmw_fence_ops(fence_ops);
+   struct vmw_fence_ops *ops = NULL;
 
    if (!fence)
       return NULL;
@@ -192,7 +197,20 @@
    fence->handle = handle;
    fence->mask = mask;
    fence->seqno = seqno;
+   fence->fence_fd = fd;
    p_atomic_set(&fence->signalled, 0);
+
+   /*
+    * If the fence was not created by our device, then we won't
+    * manage it with our ops
+    */
+   if (!fence_ops) {
+      fence->imported = true;
+      return (struct pipe_fence_handle *) fence;
+   }
+
+   ops = vmw_fence_ops(fence_ops);
+
    mtx_lock(&ops->mutex);
 
    if (vmw_fence_seq_is_signaled(seqno, ops->last_signaled, seqno)) {
@@ -210,6 +228,21 @@
 
 
 /**
+ * vmw_fence_destroy - Frees a vmw fence object.
+ *
+ * @vfence: Fence to free; its fence_fd is closed first unless it is -1.
+ */
+static
+void vmw_fence_destroy(struct vmw_fence *vfence)
+{
+   if (vfence->fence_fd != -1)
+      close(vfence->fence_fd);
+
+   FREE(vfence);
+}
+
+
+/**
  * vmw_fence_reference - Reference / unreference a vmw fence object.
  *
  * @vws: Pointer to the winsys screen.
@@ -227,13 +260,15 @@
       if (p_atomic_dec_zero(&vfence->refcount)) {
          struct vmw_fence_ops *ops = vmw_fence_ops(vws->fence_ops);
 
-	 vmw_ioctl_fence_unref(vws, vfence->handle);
+         if (!vfence->imported) {
+            vmw_ioctl_fence_unref(vws, vfence->handle);
 
-         mtx_lock(&ops->mutex);
-         LIST_DELINIT(&vfence->ops_list);
-         mtx_unlock(&ops->mutex);
+            mtx_lock(&ops->mutex);
+            LIST_DELINIT(&vfence->ops_list);
+            mtx_unlock(&ops->mutex);
+         }
 
-	 FREE(vfence);
+         vmw_fence_destroy(vfence);
       }
    }
 
@@ -300,6 +335,7 @@
  *
  * @vws: Pointer to the winsys screen.
  * @fence: Handle to the fence object.
+ * @timeout: Timeout in nanoseconds (scaled to ms for sync_wait() on imports).
  * @flag: Fence flags to wait for. If the fence object can't signal
  * a flag, it is assumed to be already signaled.
  *
@@ -308,6 +344,7 @@
 int
 vmw_fence_finish(struct vmw_winsys_screen *vws,
 		 struct pipe_fence_handle *fence,
+		 uint64_t timeout,
 		 unsigned flag)
 {
    struct vmw_fence *vfence;
@@ -319,6 +356,16 @@
       return 0;
 
    vfence = vmw_fence(fence);
+
+   if (vfence->imported) {
+      ret = sync_wait(vfence->fence_fd, timeout / 1000000);
+
+      if (!ret)
+         p_atomic_set(&vfence->signalled, 1);
+
+      return !!ret;
+   }
+
    old = p_atomic_read(&vfence->signalled);
    vflags &= ~vfence->mask;
 
@@ -339,6 +386,23 @@
    return ret;
 }
 
+/**
+ * vmw_fence_get_fd - Look up the file descriptor stored in a fence.
+ *
+ * @fence: Fence handle to query; may be NULL.
+ *
+ * Returns the fence's fence_fd member, or -1 when @fence is NULL.
+ */
+int
+vmw_fence_get_fd(struct pipe_fence_handle *fence)
+{
+   /* A missing fence has no descriptor to report. */
+   if (!fence)
+      return -1;
+
+   return vmw_fence(fence)->fence_fd;
+}
+
 
 /**
  * vmw_fence_ops_fence_reference - wrapper for the pb_fence_ops api.
@@ -383,7 +447,7 @@
 {
    struct vmw_winsys_screen *vws = vmw_fence_ops(ops)->vws;
 
-   return vmw_fence_finish(vws, fence, flag);
+   return vmw_fence_finish(vws, fence, PIPE_TIMEOUT_INFINITE, flag);
 }
 
 
diff --git a/src/gallium/winsys/svga/drm/vmw_fence.h b/src/gallium/winsys/svga/drm/vmw_fence.h
index f6381fe..aab558f 100644
--- a/src/gallium/winsys/svga/drm/vmw_fence.h
+++ b/src/gallium/winsys/svga/drm/vmw_fence.h
@@ -38,12 +38,17 @@
 
 struct pipe_fence_handle *
 vmw_fence_create(struct pb_fence_ops *fence_ops,
-		 uint32_t handle, uint32_t seqno, uint32_t mask);
+		 uint32_t handle, uint32_t seqno, uint32_t mask, int32_t fd);
 
 int
 vmw_fence_finish(struct vmw_winsys_screen *vws,
 		 struct pipe_fence_handle *fence,
+		 uint64_t timeout,
 		 unsigned flag);
+
+int
+vmw_fence_get_fd(struct pipe_fence_handle *fence);
+
 int
 vmw_fence_signalled(struct vmw_winsys_screen *vws,
 		    struct pipe_fence_handle *fence,
diff --git a/src/gallium/winsys/svga/drm/vmw_screen.h b/src/gallium/winsys/svga/drm/vmw_screen.h
index 0ef8e84..f21cabb 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen.h
+++ b/src/gallium/winsys/svga/drm/vmw_screen.h
@@ -162,11 +162,13 @@
 
 void
 vmw_ioctl_command(struct vmw_winsys_screen *vws,
-		  int32_t cid,
-		  uint32_t throttle_us,
-		  void *commands,
-		  uint32_t size,
-		  struct pipe_fence_handle **fence);
+                  int32_t cid,
+                  uint32_t throttle_us,
+                  void *commands,
+                  uint32_t size,
+                  struct pipe_fence_handle **fence,
+                  int32_t imported_fence_fd,
+                  uint32_t flags);
 
 struct vmw_region *
 vmw_ioctl_region_create(struct vmw_winsys_screen *vws, uint32_t size);
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
index 1740d1a..79f9d95 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
@@ -408,8 +408,9 @@
 
 void
 vmw_ioctl_command(struct vmw_winsys_screen *vws, int32_t cid,
-		  uint32_t throttle_us, void *commands, uint32_t size,
-		  struct pipe_fence_handle **pfence)
+                  uint32_t throttle_us, void *commands, uint32_t size,
+                  struct pipe_fence_handle **pfence, int32_t imported_fence_fd,
+                  uint32_t flags)
 {
    struct drm_vmw_execbuf_arg arg;
    struct drm_vmw_fence_rep rep;
@@ -439,6 +440,14 @@
    memset(&arg, 0, sizeof(arg));
    memset(&rep, 0, sizeof(rep));
 
+   if (flags & SVGA_HINT_FLAG_EXPORT_FENCE_FD) {
+      arg.flags |= DRM_VMW_EXECBUF_FLAG_EXPORT_FENCE_FD;
+   }
+
+   if (imported_fence_fd != -1) {
+      arg.flags |= DRM_VMW_EXECBUF_FLAG_IMPORT_FENCE_FD;
+   }
+
    rep.error = -EFAULT;
    if (pfence)
       arg.fence_rep = (unsigned long)&rep;
@@ -448,6 +457,10 @@
    arg.version = vws->ioctl.drm_execbuf_version;
    arg.context_handle = (vws->base.have_vgpu10 ? cid : SVGA3D_INVALID_ID);
 
+   /* Older DRM module requires this to be zero */
+   if (vws->base.have_fence_fd)
+      arg.imported_fence_fd = imported_fence_fd;
+
    /* In DRM_VMW_EXECBUF_VERSION 1, the drm_vmw_execbuf_arg structure ends with
     * the flags field. The structure size sent to drmCommandWrite must match
     * the drm_execbuf_version. Otherwise, an invalid value will be returned.
@@ -474,15 +487,20 @@
          vmw_fences_signal(vws->fence_ops, rep.passed_seqno, rep.seqno,
                            TRUE);
 
-	 *pfence = vmw_fence_create(vws->fence_ops, rep.handle,
-				    rep.seqno, rep.mask);
-	 if (*pfence == NULL) {
-	    /*
-	     * Fence creation failed. Need to sync.
-	     */
-	    (void) vmw_ioctl_fence_finish(vws, rep.handle, rep.mask);
-	    vmw_ioctl_fence_unref(vws, rep.handle);
-	 }
+         /* Older DRM module will set this to zero, but -1 is the proper FD
+          * to use for no Fence FD support */
+         if (!vws->base.have_fence_fd)
+            rep.fd = -1;
+
+         *pfence = vmw_fence_create(vws->fence_ops, rep.handle,
+                                    rep.seqno, rep.mask, rep.fd);
+         if (*pfence == NULL) {
+            /*
+             * Fence creation failed. Need to sync.
+             */
+            (void) vmw_ioctl_fence_finish(vws, rep.handle, rep.mask);
+            vmw_ioctl_fence_unref(vws, rep.handle);
+         }
       }
    }
 }
@@ -1033,6 +1051,10 @@
       vws->base.have_set_predication_cmd = TRUE;
    }
 
+   if (version->version_major == 2 && version->version_minor >= 14) {
+      vws->base.have_fence_fd = TRUE;
+   }
+
    free(cap_buffer);
    drmFreeVersion(version);
    vmw_printf("%s OK\n", __FUNCTION__);
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_svga.c b/src/gallium/winsys/svga/drm/vmw_screen_svga.c
index 31cbda9..7c80642 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_svga.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_svga.c
@@ -32,6 +32,7 @@
  * @author Jose Fonseca
  */
 
+#include <libsync.h>
 
 #include "svga_cmd.h"
 #include "svga3d_caps.h"
@@ -123,14 +124,44 @@
 static int
 vmw_svga_winsys_fence_finish(struct svga_winsys_screen *sws,
                              struct pipe_fence_handle *fence,
+                             uint64_t timeout,
                              unsigned flag)
 {
    struct vmw_winsys_screen *vws = vmw_winsys_screen(sws);
 
-   return vmw_fence_finish(vws, fence, flag);
+   return vmw_fence_finish(vws, fence, timeout, flag);
 }
 
 
+/* Return the fence's fd, or a dup() of it when @duplicate is set. */
+static int
+vmw_svga_winsys_fence_get_fd(struct svga_winsys_screen *sws,
+                             struct pipe_fence_handle *fence,
+                             boolean duplicate)
+{
+   const int raw_fd = vmw_fence_get_fd(fence);
+
+   return duplicate ? dup(raw_fd) : raw_fd;
+}
+
+/* NOTE(review): dup() result is unchecked and leaks if creation fails. */
+static void
+vmw_svga_winsys_fence_create_fd(struct svga_winsys_screen *sws,
+                                struct pipe_fence_handle **fence,
+                                int32_t fd)
+{
+   *fence = vmw_fence_create(NULL, 0, 0, 0, dup(fd));
+}
+
+static int
+vmw_svga_winsys_fence_server_sync(struct svga_winsys_screen *sws,
+                                  int32_t *context_fd,
+                                  struct pipe_fence_handle *fence)
+{
+   int fence_fd = sws->fence_get_fd(sws, fence, FALSE); /* duplicate=FALSE */
+   return sync_accumulate("vmwgfx", context_fd, fence_fd);
+}
+
 
 static struct svga_winsys_surface *
 vmw_svga_winsys_surface_create(struct svga_winsys_screen *sws,
@@ -200,22 +231,25 @@
                                                  surface->buf ? NULL :
 						 &desc.region);
 
-      if (surface->sid == SVGA3D_INVALID_ID && surface->buf) {
-
-         /*
-          * Kernel refused to allocate a surface for us.
-          * Perhaps something was wrong with our buffer?
-          * This is really a guard against future new size requirements
-          * on the backing buffers.
-          */
-         vmw_svga_winsys_buffer_destroy(sws, surface->buf);
-         surface->buf = NULL;
-         surface->sid = vmw_ioctl_gb_surface_create(vws, flags, format, usage,
-                                                    size, numLayers,
-                                                    numMipLevels, sampleCount,
-                                                    0, &desc.region);
-         if (surface->sid == SVGA3D_INVALID_ID)
+      if (surface->sid == SVGA3D_INVALID_ID) {
+         if (surface->buf == NULL) {
             goto no_sid;
+         } else {
+            /*
+             * Kernel refused to allocate a surface for us.
+             * Perhaps something was wrong with our buffer?
+             * This is really a guard against future new size requirements
+             * on the backing buffers.
+             */
+            vmw_svga_winsys_buffer_destroy(sws, surface->buf);
+            surface->buf = NULL;
+            surface->sid = vmw_ioctl_gb_surface_create(vws, flags, format, usage,
+                                                       size, numLayers,
+                                                       numMipLevels, sampleCount,
+                                                       0, &desc.region);
+            if (surface->sid == SVGA3D_INVALID_ID)
+               goto no_sid;
+         }
       }
 
       /*
@@ -431,6 +465,9 @@
    vws->base.shader_create = vmw_svga_winsys_shader_create;
    vws->base.shader_destroy = vmw_svga_winsys_shader_destroy;
    vws->base.fence_finish = vmw_svga_winsys_fence_finish;
+   vws->base.fence_get_fd = vmw_svga_winsys_fence_get_fd;
+   vws->base.fence_create_fd = vmw_svga_winsys_fence_create_fd;
+   vws->base.fence_server_sync = vmw_svga_winsys_fence_server_sync;
 
    vws->base.query_create = vmw_svga_winsys_query_create;
    vws->base.query_init = vmw_svga_winsys_query_init;
diff --git a/src/gallium/winsys/svga/drm/vmwgfx_drm.h b/src/gallium/winsys/svga/drm/vmwgfx_drm.h
index 807ec90..13d7794 100644
--- a/src/gallium/winsys/svga/drm/vmwgfx_drm.h
+++ b/src/gallium/winsys/svga/drm/vmwgfx_drm.h
@@ -294,13 +294,17 @@
  * @version: Allows expanding the execbuf ioctl parameters without breaking
  * backwards compatibility, since user-space will always tell the kernel
  * which version it uses.
- * @flags: Execbuf flags. None currently.
+ * @flags: Execbuf flags.
+ * @imported_fence_fd:  FD for a fence imported from another device
  *
  * Argument to the DRM_VMW_EXECBUF Ioctl.
  */
 
 #define DRM_VMW_EXECBUF_VERSION 2
 
+#define DRM_VMW_EXECBUF_FLAG_IMPORT_FENCE_FD (1 << 0)
+#define DRM_VMW_EXECBUF_FLAG_EXPORT_FENCE_FD (1 << 1)
+
 struct drm_vmw_execbuf_arg {
 	uint64_t commands;
 	uint32_t command_size;
@@ -309,7 +313,7 @@
 	uint32_t version;
 	uint32_t flags;
 	uint32_t context_handle;
-	uint32_t pad64;
+	int32_t imported_fence_fd;
 };
 
 /**
@@ -325,6 +329,7 @@
  * @passed_seqno: The highest seqno number processed by the hardware
  * so far. This can be used to mark user-space fence objects as signaled, and
  * to determine whether a fence seqno might be stale.
+ * @fd: FD associated with the fence, -1 if not exported
  * @error: This member should've been set to -EFAULT on submission.
  * The following actions should be take on completion:
  * error == -EFAULT: Fence communication failed. The host is synchronized.
@@ -342,7 +347,7 @@
 	uint32_t mask;
 	uint32_t seqno;
 	uint32_t passed_seqno;
-	uint32_t pad64;
+	int32_t fd;
 	int32_t error;
 };
 
diff --git a/src/gallium/winsys/vc4/drm/vc4_drm_public.h b/src/gallium/winsys/vc4/drm/vc4_drm_public.h
index f9d0585..102c148 100644
--- a/src/gallium/winsys/vc4/drm/vc4_drm_public.h
+++ b/src/gallium/winsys/vc4/drm/vc4_drm_public.h
@@ -25,7 +25,9 @@
 #define __VC4_DRM_PUBLIC_H__
 
 struct pipe_screen;
+struct renderonly;
 
 struct pipe_screen *vc4_drm_screen_create(int drmFD);
+struct pipe_screen *vc4_drm_screen_create_renderonly(struct renderonly *ro);
 
 #endif /* __VC4_DRM_PUBLIC_H__ */
diff --git a/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c b/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c
index 23fe8e7..b2ffa90 100644
--- a/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c
+++ b/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c
@@ -24,12 +24,18 @@
 #include <unistd.h>
 #include <fcntl.h>
 
+#include "renderonly/renderonly.h"
 #include "vc4_drm_public.h"
-
 #include "vc4/vc4_screen.h"
 
 struct pipe_screen *
 vc4_drm_screen_create(int fd)
 {
-	return vc4_screen_create(fcntl(fd, F_DUPFD_CLOEXEC, 3));
+   return vc4_screen_create(fcntl(fd, F_DUPFD_CLOEXEC, 3), NULL);
+}
+
+struct pipe_screen *
+vc4_drm_screen_create_renderonly(struct renderonly *ro)
+{
+   return vc4_screen_create(fcntl(ro->gpu_fd, F_DUPFD_CLOEXEC, 3), ro);
 }
diff --git a/src/gbm/Makefile.am b/src/gbm/Makefile.am
index 60b0924..de83960 100644
--- a/src/gbm/Makefile.am
+++ b/src/gbm/Makefile.am
@@ -5,6 +5,7 @@
 
 AM_CFLAGS = \
 	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/loader \
 	-I$(top_srcdir)/src/gbm/main \
 	$(DLOPEN_CFLAGS) \
diff --git a/src/gbm/Makefile.sources b/src/gbm/Makefile.sources
index 7a2d6a4..461025c 100644
--- a/src/gbm/Makefile.sources
+++ b/src/gbm/Makefile.sources
@@ -1,7 +1,6 @@
 gbm_core_FILES = \
 	main/backend.c \
 	main/backend.h \
-	main/common_drm.h \
 	main/gbm.c \
 	main/gbm.h \
 	main/gbmint.h
diff --git a/src/gbm/backends/dri/gbm_dri.c b/src/gbm/backends/dri/gbm_dri.c
index 8cca35e..1b2cc4c 100644
--- a/src/gbm/backends/dri/gbm_dri.c
+++ b/src/gbm/backends/dri/gbm_dri.c
@@ -48,6 +48,7 @@
 
 #include "gbmint.h"
 #include "loader.h"
+#include "util/macros.h"
 
 /* For importing wl_buffer */
 #if HAVE_WAYLAND_PLATFORM
@@ -256,7 +257,6 @@
    { __DRI2_FLUSH, 1, offsetof(struct gbm_dri_device, flush) },
    { __DRI_IMAGE, 1, offsetof(struct gbm_dri_device, image) },
    { __DRI2_FENCE, 1, offsetof(struct gbm_dri_device, fence), 1 },
-   { __DRI2_INTEROP, 1, offsetof(struct gbm_dri_device, interop), 1 },
    { NULL, 0, 0 }
 };
 
@@ -344,26 +344,17 @@
       len = next - p;
 #if GLX_USE_TLS
       snprintf(path, sizeof path,
-               "%.*s/tls/%s_dri.so", len, p, dri->base.driver_name);
+               "%.*s/tls/%s_dri.so", len, p, dri->driver_name);
       dri->driver = dlopen(path, RTLD_NOW | RTLD_GLOBAL);
 #endif
       if (dri->driver == NULL) {
          snprintf(path, sizeof path,
-                  "%.*s/%s_dri.so", len, p, dri->base.driver_name);
+                  "%.*s/%s_dri.so", len, p, dri->driver_name);
          dri->driver = dlopen(path, RTLD_NOW | RTLD_GLOBAL);
       }
       /* not need continue to loop all paths once the driver is found */
       if (dri->driver != NULL)
          break;
-
-#ifdef ANDROID
-      snprintf(path, sizeof path, "%.*s/gallium_dri.so", len, p);
-      dri->driver = dlopen(path, RTLD_NOW | RTLD_GLOBAL);
-      if (dri->driver == NULL)
-         sprintf("failed to open %s: %s\n", path, dlerror());
-      else
-         break;
-#endif
    }
 
    if (dri->driver == NULL) {
@@ -373,7 +364,7 @@
       return NULL;
    }
 
-   get_extensions_name = loader_get_extensions_name(dri->base.driver_name);
+   get_extensions_name = loader_get_extensions_name(dri->driver_name);
    if (get_extensions_name) {
       const __DRIextension **(*get_extensions)(void);
 
@@ -440,13 +431,13 @@
    const __DRIextension **extensions;
    int ret = 0;
 
-   dri->base.driver_name = driver_name;
-   if (dri->base.driver_name == NULL)
+   dri->driver_name = driver_name;
+   if (dri->driver_name == NULL)
       return -1;
 
    ret = dri_load_driver(dri);
    if (ret) {
-      fprintf(stderr, "failed to load driver: %s\n", dri->base.driver_name);
+      fprintf(stderr, "failed to load driver: %s\n", dri->driver_name);
       return ret;
    };
 
@@ -456,12 +447,12 @@
       return -1;
 
    if (dri->dri2->base.version >= 4) {
-      dri->screen = dri->dri2->createNewScreen2(0, dri->base.base.fd,
+      dri->screen = dri->dri2->createNewScreen2(0, dri->base.fd,
                                                 dri->loader_extensions,
                                                 dri->driver_extensions,
                                                 &dri->driver_configs, dri);
    } else {
-      dri->screen = dri->dri2->createNewScreen(0, dri->base.base.fd,
+      dri->screen = dri->dri2->createNewScreen(0, dri->base.fd,
                                                dri->loader_extensions,
                                                &dri->driver_configs, dri);
    }
@@ -490,8 +481,8 @@
 {
    int ret;
 
-   dri->base.driver_name = strdup("swrast");
-   if (dri->base.driver_name == NULL)
+   dri->driver_name = strdup("swrast");
+   if (dri->driver_name == NULL)
       return -1;
 
    ret = dri_load_driver_swrast(dri);
@@ -527,7 +518,7 @@
 {
    char *driver_name;
 
-   driver_name = loader_get_driver_for_fd(dri->base.base.fd);
+   driver_name = loader_get_driver_for_fd(dri->base.fd);
    if (!driver_name)
       return -1;
 
@@ -551,30 +542,100 @@
    return dri_screen_create_swrast(dri);
 }
 
+static const struct {
+   uint32_t gbm_format;
+   int dri_image_format;
+} gbm_to_dri_image_formats[] = {
+   { GBM_FORMAT_R8,          __DRI_IMAGE_FORMAT_R8          },
+   { GBM_FORMAT_GR88,        __DRI_IMAGE_FORMAT_GR88        },
+   { GBM_FORMAT_RGB565,      __DRI_IMAGE_FORMAT_RGB565      },
+   { GBM_FORMAT_XRGB8888,    __DRI_IMAGE_FORMAT_XRGB8888    },
+   { GBM_FORMAT_ARGB8888,    __DRI_IMAGE_FORMAT_ARGB8888    },
+   { GBM_FORMAT_XBGR8888,    __DRI_IMAGE_FORMAT_XBGR8888    },
+   { GBM_FORMAT_ABGR8888,    __DRI_IMAGE_FORMAT_ABGR8888    },
+   { GBM_FORMAT_XRGB2101010, __DRI_IMAGE_FORMAT_XRGB2101010 },
+   { GBM_FORMAT_ARGB2101010, __DRI_IMAGE_FORMAT_ARGB2101010 },
+};
+
+/* The two GBM_BO_FORMAT_[XA]RGB8888 formats alias the GBM_FORMAT_*
+ * formats of the same name. We want to accept them whenever someone
+ * has a GBM format, but never return them to the user. */
+static int
+gbm_format_canonicalize(uint32_t gbm_format)
+{
+   switch (gbm_format) {
+   case GBM_BO_FORMAT_XRGB8888:
+      return GBM_FORMAT_XRGB8888;
+   case GBM_BO_FORMAT_ARGB8888:
+      return GBM_FORMAT_ARGB8888;
+   default:
+      return gbm_format;
+   }
+}
+
+static int
+gbm_format_to_dri_format(uint32_t gbm_format)
+{
+   int i;
+
+   gbm_format = gbm_format_canonicalize(gbm_format);
+   for (i = 0; i < ARRAY_SIZE(gbm_to_dri_image_formats); i++) {
+      if (gbm_to_dri_image_formats[i].gbm_format == gbm_format)
+         return gbm_to_dri_image_formats[i].dri_image_format;
+   }
+
+   return 0;
+}
+
+static uint32_t
+gbm_dri_to_gbm_format(int dri_format)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(gbm_to_dri_image_formats); i++) {
+      if (gbm_to_dri_image_formats[i].dri_image_format == dri_format)
+         return gbm_to_dri_image_formats[i].gbm_format;
+   }
+
+   return 0;
+}
+
 static int
 gbm_dri_is_format_supported(struct gbm_device *gbm,
                             uint32_t format,
                             uint32_t usage)
 {
-   switch (format) {
-   case GBM_BO_FORMAT_XRGB8888:
-   case GBM_FORMAT_XBGR8888:
-   case GBM_FORMAT_XRGB8888:
-      break;
-   case GBM_BO_FORMAT_ARGB8888:
-   case GBM_FORMAT_ARGB8888:
-      if (usage & GBM_BO_USE_SCANOUT)
-         return 0;
-      break;
-   default:
+   struct gbm_dri_device *dri = gbm_dri_device(gbm);
+   int count;
+
+   if ((usage & GBM_BO_USE_CURSOR) && (usage & GBM_BO_USE_RENDERING))
       return 0;
+
+   format = gbm_format_canonicalize(format);
+   if (gbm_format_to_dri_format(format) == 0)
+      return 0;
+
+   /* If there is no query, fall back to the small table which was originally
+    * here. */
+   if (dri->image->base.version <= 15 || !dri->image->queryDmaBufModifiers) {
+      switch (format) {
+      case GBM_FORMAT_XRGB8888:
+      case GBM_FORMAT_ARGB8888:
+      case GBM_FORMAT_XBGR8888:
+         return 1;
+      default:
+         return 0;
+      }
    }
 
-   if (usage & GBM_BO_USE_CURSOR &&
-       usage & GBM_BO_USE_RENDERING)
+   /* Check if the driver returns any modifiers for this format; since linear
+    * is counted as a modifier, we will have at least one modifier for any
+    * supported format. */
+   if (!dri->image->queryDmaBufModifiers(dri->screen, format, 0, NULL, NULL,
+                                         &count))
       return 0;
 
-   return 1;
+   return (count > 0);
 }
 
 static int
@@ -790,41 +851,12 @@
       gbm_dri_bo_unmap_dumb(bo);
       memset(&arg, 0, sizeof(arg));
       arg.handle = bo->handle;
-      drmIoctl(dri->base.base.fd, DRM_IOCTL_MODE_DESTROY_DUMB, &arg);
+      drmIoctl(dri->base.fd, DRM_IOCTL_MODE_DESTROY_DUMB, &arg);
    }
 
    free(bo);
 }
 
-static uint32_t
-gbm_dri_to_gbm_format(uint32_t dri_format)
-{
-   uint32_t ret = 0;
-
-   switch (dri_format) {
-   case __DRI_IMAGE_FORMAT_RGB565:
-      ret = GBM_FORMAT_RGB565;
-      break;
-   case __DRI_IMAGE_FORMAT_XRGB8888:
-      ret = GBM_FORMAT_XRGB8888;
-      break;
-   case __DRI_IMAGE_FORMAT_ARGB8888:
-      ret = GBM_FORMAT_ARGB8888;
-      break;
-   case __DRI_IMAGE_FORMAT_XBGR8888:
-      ret = GBM_FORMAT_XBGR8888;
-      break;
-   case __DRI_IMAGE_FORMAT_ABGR8888:
-      ret = GBM_FORMAT_ABGR8888;
-      break;
-   default:
-      ret = 0;
-      break;
-   }
-
-   return ret;
-}
-
 static struct gbm_bo *
 gbm_dri_bo_import(struct gbm_device *gbm,
                   uint32_t type, void *buffer, uint32_t usage)
@@ -860,23 +892,9 @@
 
       image = dri->image->dupImage(wb->driver_buffer, NULL);
 
-      switch (wb->format) {
-      case WL_DRM_FORMAT_XRGB8888:
-         gbm_format = GBM_FORMAT_XRGB8888;
-         break;
-      case WL_DRM_FORMAT_ARGB8888:
-         gbm_format = GBM_FORMAT_ARGB8888;
-         break;
-      case WL_DRM_FORMAT_RGB565:
-         gbm_format = GBM_FORMAT_RGB565;
-         break;
-      case WL_DRM_FORMAT_YUYV:
-         gbm_format = GBM_FORMAT_YUYV;
-         break;
-      default:
-         dri->image->destroyImage(image);
-         return NULL;
-      }
+      /* GBM_FORMAT_* is identical to WL_DRM_FORMAT_*, so no conversion
+       * required. */
+      gbm_format = wb->format;
       break;
    }
 #endif
@@ -905,23 +923,17 @@
    {
       struct gbm_import_fd_data *fd_data = buffer;
       int stride = fd_data->stride, offset = 0;
-      int dri_format;
+      int fourcc;
 
-      switch (fd_data->format) {
-      case GBM_BO_FORMAT_XRGB8888:
-         dri_format = GBM_FORMAT_XRGB8888;
-         break;
-      case GBM_BO_FORMAT_ARGB8888:
-         dri_format = GBM_FORMAT_ARGB8888;
-         break;
-      default:
-         dri_format = fd_data->format;
-      }
+      /* GBM's GBM_FORMAT_* tokens are a strict superset of the DRI FourCC
+       * tokens accepted by createImageFromFds, except for not supporting
+       * the sARGB format. */
+      fourcc = gbm_format_canonicalize(fd_data->format);
 
       image = dri->image->createImageFromFds(dri->screen,
                                              fd_data->width,
                                              fd_data->height,
-                                             dri_format,
+                                             fourcc,
                                              &fd_data->fd, 1,
                                              &stride, &offset,
                                              NULL);
@@ -933,6 +945,42 @@
       break;
    }
 
+   case GBM_BO_IMPORT_FD_MODIFIER:
+   {
+      struct gbm_import_fd_modifier_data *fd_data = buffer;
+      unsigned int error;
+      int fourcc;
+
+      /* Import with modifier requires createImageFromDmaBufs2 */
+      if (dri->image == NULL || dri->image->base.version < 15 ||
+          dri->image->createImageFromDmaBufs2 == NULL) {
+         errno = ENOSYS;
+         return NULL;
+      }
+
+      /* GBM's GBM_FORMAT_* tokens are a strict superset of the DRI FourCC
+       * tokens accepted by createImageFromDmaBufs2, except for not supporting
+       * the sARGB format. */
+      fourcc = gbm_format_canonicalize(fd_data->format);
+
+      image = dri->image->createImageFromDmaBufs2(dri->screen, fd_data->width,
+                                                  fd_data->height, fourcc,
+                                                  fd_data->modifier,
+                                                  fd_data->fds,
+                                                  fd_data->num_fds,
+                                                  fd_data->strides,
+                                                  fd_data->offsets,
+                                                  0, 0, 0, 0,
+                                                  &error, NULL);
+      if (image == NULL) {
+         errno = ENOSYS;
+         return NULL;
+      }
+
+      gbm_format = fourcc;
+      break;
+   }
+
    default:
       errno = ENOSYS;
       return NULL;
@@ -959,44 +1007,19 @@
       return NULL;
    }
 
-   bo->base.base.gbm = gbm;
-   bo->base.base.format = gbm_format;
+   bo->base.gbm = gbm;
+   bo->base.format = gbm_format;
 
    dri->image->queryImage(bo->image, __DRI_IMAGE_ATTRIB_WIDTH,
-                          (int*)&bo->base.base.width);
+                          (int*)&bo->base.width);
    dri->image->queryImage(bo->image, __DRI_IMAGE_ATTRIB_HEIGHT,
-                          (int*)&bo->base.base.height);
+                          (int*)&bo->base.height);
    dri->image->queryImage(bo->image, __DRI_IMAGE_ATTRIB_STRIDE,
-                          (int*)&bo->base.base.stride);
+                          (int*)&bo->base.stride);
    dri->image->queryImage(bo->image, __DRI_IMAGE_ATTRIB_HANDLE,
-                          &bo->base.base.handle.s32);
+                          &bo->base.handle.s32);
 
-   return &bo->base.base;
-}
-
-static bool
-is_planar_format(uint32_t format)
-{
-   switch (format) {
-   case GBM_FORMAT_NV12:
-   case GBM_FORMAT_NV21:
-   case GBM_FORMAT_NV16:
-   case GBM_FORMAT_NV61:
-   case GBM_FORMAT_YUV410:
-   case GBM_FORMAT_YVU410:
-   case GBM_FORMAT_YUV411:
-   case GBM_FORMAT_YVU411:
-   case GBM_FORMAT_YUV420:
-   case GBM_FORMAT_YVU420:
-   case GBM_FORMAT_YUV422:
-   case GBM_FORMAT_YVU422:
-   case GBM_FORMAT_YUV444:
-   case GBM_FORMAT_YVU444:
-      return true;
-   default:
-      return false;
-   }
-
+   return &bo->base;
 }
 
 static struct gbm_bo *
@@ -1014,17 +1037,12 @@
    is_cursor = (usage & GBM_BO_USE_CURSOR) != 0 &&
       format == GBM_FORMAT_ARGB8888;
    is_scanout = (usage & GBM_BO_USE_SCANOUT) != 0 &&
-      format == GBM_FORMAT_XRGB8888;
+      (format == GBM_FORMAT_XRGB8888 || format == GBM_FORMAT_XBGR8888);
    if (!is_cursor && !is_scanout) {
       errno = EINVAL;
       return NULL;
    }
 
-   if (is_planar_format(format)) {
-      errno = EINVAL;
-      return NULL;
-   }
-
    bo = calloc(1, sizeof *bo);
    if (bo == NULL)
       return NULL;
@@ -1034,28 +1052,28 @@
    create_arg.width = width;
    create_arg.height = height;
 
-   ret = drmIoctl(dri->base.base.fd, DRM_IOCTL_MODE_CREATE_DUMB, &create_arg);
+   ret = drmIoctl(dri->base.fd, DRM_IOCTL_MODE_CREATE_DUMB, &create_arg);
    if (ret)
       goto free_bo;
 
-   bo->base.base.gbm = gbm;
-   bo->base.base.width = width;
-   bo->base.base.height = height;
-   bo->base.base.stride = create_arg.pitch;
-   bo->base.base.format = format;
-   bo->base.base.handle.u32 = create_arg.handle;
+   bo->base.gbm = gbm;
+   bo->base.width = width;
+   bo->base.height = height;
+   bo->base.stride = create_arg.pitch;
+   bo->base.format = format;
+   bo->base.handle.u32 = create_arg.handle;
    bo->handle = create_arg.handle;
    bo->size = create_arg.size;
 
    if (gbm_dri_bo_map_dumb(bo) == NULL)
       goto destroy_dumb;
 
-   return &bo->base.base;
+   return &bo->base;
 
 destroy_dumb:
    memset(&destroy_arg, 0, sizeof destroy_arg);
    destroy_arg.handle = create_arg.handle;
-   drmIoctl(dri->base.base.fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_arg);
+   drmIoctl(dri->base.fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_arg);
 free_bo:
    free(bo);
 
@@ -1079,6 +1097,8 @@
     */
    assert(!(usage && count));
 
+   format = gbm_format_canonicalize(format);
+
    if (usage & GBM_BO_USE_WRITE || dri->image == NULL)
       return create_dumb(gbm, width, height, format, usage);
 
@@ -1086,42 +1106,13 @@
    if (bo == NULL)
       return NULL;
 
-   bo->base.base.gbm = gbm;
-   bo->base.base.width = width;
-   bo->base.base.height = height;
-   bo->base.base.format = format;
+   bo->base.gbm = gbm;
+   bo->base.width = width;
+   bo->base.height = height;
+   bo->base.format = format;
 
-   switch (format) {
-   case GBM_FORMAT_R8:
-      dri_format = __DRI_IMAGE_FORMAT_R8;
-      break;
-   case GBM_FORMAT_GR88:
-      dri_format = __DRI_IMAGE_FORMAT_GR88;
-      break;
-   case GBM_FORMAT_RGB565:
-      dri_format = __DRI_IMAGE_FORMAT_RGB565;
-      break;
-   case GBM_FORMAT_XRGB8888:
-   case GBM_BO_FORMAT_XRGB8888:
-      dri_format = __DRI_IMAGE_FORMAT_XRGB8888;
-      break;
-   case GBM_FORMAT_ARGB8888:
-   case GBM_BO_FORMAT_ARGB8888:
-      dri_format = __DRI_IMAGE_FORMAT_ARGB8888;
-      break;
-   case GBM_FORMAT_ABGR8888:
-      dri_format = __DRI_IMAGE_FORMAT_ABGR8888;
-      break;
-   case GBM_FORMAT_XBGR8888:
-      dri_format = __DRI_IMAGE_FORMAT_XBGR8888;
-      break;
-   case GBM_FORMAT_ARGB2101010:
-      dri_format = __DRI_IMAGE_FORMAT_ARGB2101010;
-      break;
-   case GBM_FORMAT_XRGB2101010:
-      dri_format = __DRI_IMAGE_FORMAT_XRGB2101010;
-      break;
-   default:
+   dri_format = gbm_format_to_dri_format(format);
+   if (dri_format == 0) {
       errno = EINVAL;
       goto failed;
    }
@@ -1165,7 +1156,7 @@
 
       if (bo->image) {
          /* The client passed in a list of invalid modifiers */
-         assert(gbm_dri_bo_get_modifier(&bo->base.base) != DRM_FORMAT_MOD_INVALID);
+         assert(gbm_dri_bo_get_modifier(&bo->base) != DRM_FORMAT_MOD_INVALID);
       }
    } else {
       bo->image = dri->image->createImage(dri->screen, width, height,
@@ -1176,11 +1167,11 @@
       goto failed;
 
    dri->image->queryImage(bo->image, __DRI_IMAGE_ATTRIB_HANDLE,
-                          &bo->base.base.handle.s32);
+                          &bo->base.handle.s32);
    dri->image->queryImage(bo->image, __DRI_IMAGE_ATTRIB_STRIDE,
-                          (int *) &bo->base.base.stride);
+                          (int *) &bo->base.stride);
 
-   return &bo->base.base;
+   return &bo->base;
 
 failed:
    free(bo);
@@ -1198,8 +1189,8 @@
 
    /* If it's a dumb buffer, we already have a mapping */
    if (bo->map) {
-      *map_data = (char *)bo->map + (bo->base.base.stride * y) + (x * 4);
-      *stride = bo->base.base.stride;
+      *map_data = (char *)bo->map + (bo->base.stride * y) + (x * 4);
+      *stride = bo->base.stride;
       return *map_data;
    }
 
@@ -1289,7 +1280,7 @@
    surf->base.gbm = gbm;
    surf->base.width = width;
    surf->base.height = height;
-   surf->base.format = format;
+   surf->base.format = gbm_format_canonicalize(format);
    surf->base.flags = flags;
    if (!modifiers) {
       assert(!count);
@@ -1336,7 +1327,7 @@
       free((__DRIconfig *) dri->driver_configs[i]);
    free(dri->driver_configs);
    dlclose(dri->driver);
-   free(dri->base.driver_name);
+   free(dri->driver_name);
 
    free(dri);
 }
@@ -1351,26 +1342,25 @@
    if (!dri)
       return NULL;
 
-   dri->base.base.fd = fd;
-   dri->base.base.bo_create = gbm_dri_bo_create;
-   dri->base.base.bo_import = gbm_dri_bo_import;
-   dri->base.base.bo_map = gbm_dri_bo_map;
-   dri->base.base.bo_unmap = gbm_dri_bo_unmap;
-   dri->base.base.is_format_supported = gbm_dri_is_format_supported;
-   dri->base.base.bo_write = gbm_dri_bo_write;
-   dri->base.base.bo_get_fd = gbm_dri_bo_get_fd;
-   dri->base.base.bo_get_planes = gbm_dri_bo_get_planes;
-   dri->base.base.bo_get_handle = gbm_dri_bo_get_handle_for_plane;
-   dri->base.base.bo_get_stride = gbm_dri_bo_get_stride;
-   dri->base.base.bo_get_offset = gbm_dri_bo_get_offset;
-   dri->base.base.bo_get_modifier = gbm_dri_bo_get_modifier;
-   dri->base.base.bo_destroy = gbm_dri_bo_destroy;
-   dri->base.base.destroy = dri_destroy;
-   dri->base.base.surface_create = gbm_dri_surface_create;
-   dri->base.base.surface_destroy = gbm_dri_surface_destroy;
+   dri->base.fd = fd;
+   dri->base.bo_create = gbm_dri_bo_create;
+   dri->base.bo_import = gbm_dri_bo_import;
+   dri->base.bo_map = gbm_dri_bo_map;
+   dri->base.bo_unmap = gbm_dri_bo_unmap;
+   dri->base.is_format_supported = gbm_dri_is_format_supported;
+   dri->base.bo_write = gbm_dri_bo_write;
+   dri->base.bo_get_fd = gbm_dri_bo_get_fd;
+   dri->base.bo_get_planes = gbm_dri_bo_get_planes;
+   dri->base.bo_get_handle = gbm_dri_bo_get_handle_for_plane;
+   dri->base.bo_get_stride = gbm_dri_bo_get_stride;
+   dri->base.bo_get_offset = gbm_dri_bo_get_offset;
+   dri->base.bo_get_modifier = gbm_dri_bo_get_modifier;
+   dri->base.bo_destroy = gbm_dri_bo_destroy;
+   dri->base.destroy = dri_destroy;
+   dri->base.surface_create = gbm_dri_surface_create;
+   dri->base.surface_destroy = gbm_dri_surface_destroy;
 
-   dri->base.type = GBM_DRM_DRIVER_TYPE_DRI;
-   dri->base.base.name = "drm";
+   dri->base.name = "drm";
 
    mtx_init(&dri->mutex, mtx_plain);
 
@@ -1386,7 +1376,7 @@
    if (ret)
       goto err_dri;
 
-   return &dri->base.base;
+   return &dri->base;
 
 err_dri:
    free(dri);
diff --git a/src/gbm/backends/dri/gbm_driint.h b/src/gbm/backends/dri/gbm_driint.h
index 29a8ec2..db9038a 100644
--- a/src/gbm/backends/dri/gbm_driint.h
+++ b/src/gbm/backends/dri/gbm_driint.h
@@ -34,8 +34,6 @@
 #include "gbmint.h"
 #include "c11/threads.h"
 
-#include "common_drm.h"
-
 #include <GL/gl.h> /* dri_interface needs GL types */
 #include "GL/internal/dri_interface.h"
 
@@ -43,9 +41,10 @@
 struct gbm_dri_bo;
 
 struct gbm_dri_device {
-   struct gbm_drm_device base;
+   struct gbm_device base;
 
    void *driver;
+   char *driver_name; /* Name of the DRI module, without the _dri suffix */
 
    __DRIscreen *screen;
    __DRIcontext *context;
@@ -57,8 +56,6 @@
    const __DRIimageExtension  *image;
    const __DRIswrastExtension *swrast;
    const __DRI2flushExtension *flush;
-   const __DRIdri2LoaderExtension *loader;
-   const __DRI2interopExtension *interop;
 
    const __DRIconfig   **driver_configs;
    const __DRIextension **loader_extensions;
@@ -103,7 +100,7 @@
 };
 
 struct gbm_dri_bo {
-   struct gbm_drm_bo base;
+   struct gbm_bo base;
 
    __DRIimage *image;
 
@@ -151,12 +148,12 @@
    memset(&map_arg, 0, sizeof(map_arg));
    map_arg.handle = bo->handle;
 
-   ret = drmIoctl(bo->base.base.gbm->fd, DRM_IOCTL_MODE_MAP_DUMB, &map_arg);
+   ret = drmIoctl(bo->base.gbm->fd, DRM_IOCTL_MODE_MAP_DUMB, &map_arg);
    if (ret)
       return NULL;
 
    bo->map = mmap(0, bo->size, PROT_WRITE,
-                  MAP_SHARED, bo->base.base.gbm->fd, map_arg.offset);
+                  MAP_SHARED, bo->base.gbm->fd, map_arg.offset);
    if (bo->map == MAP_FAILED) {
       bo->map = NULL;
       return NULL;
diff --git a/src/gbm/main/common_drm.h b/src/gbm/main/common_drm.h
deleted file mode 100644
index d28c3f0..0000000
--- a/src/gbm/main/common_drm.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Benjamin Franzke <benjaminfranzke@googlemail.com>
- */
-
-#ifndef _COMMON_DRM_H_
-#define _COMMON_DRM_H_
-
-#include "gbmint.h"
-
-enum gbm_drm_driver_type {
-   GBM_DRM_DRIVER_TYPE_DRI,
-   GBM_DRM_DRIVER_TYPE_GALLIUM,
-};
-
-struct gbm_drm_device {
-   struct gbm_device base;
-   enum gbm_drm_driver_type type;
-   char *driver_name;
-};
-
-struct gbm_drm_bo {
-   struct gbm_bo base;
-};
-
-#endif
diff --git a/src/gbm/main/gbm.c b/src/gbm/main/gbm.c
index 79d78b7..1de14f8 100644
--- a/src/gbm/main/gbm.c
+++ b/src/gbm/main/gbm.c
@@ -186,7 +186,7 @@
  * The format of the pixels in the buffer.
  *
  * \param bo The buffer object
- * \return The format of buffer object, on of the GBM_FORMAT_* codes
+ * \return The format of buffer object, one of the GBM_FORMAT_* codes
  */
 GBM_EXPORT uint32_t
 gbm_bo_get_format(struct gbm_bo *bo)
@@ -426,8 +426,8 @@
  * independent of the foreign object.
  *
  * \param gbm The gbm device returned from gbm_create_device()
- * \param gbm The type of object we're importing
- * \param gbm Pointer to the external object
+ * \param type The type of object we're importing
+ * \param buffer Pointer to the external object
  * \param usage The union of the usage flags for this buffer
  *
  * \return A newly allocated buffer object that should be freed with
diff --git a/src/gbm/main/gbm.h b/src/gbm/main/gbm.h
index b52137e..879f003 100644
--- a/src/gbm/main/gbm.h
+++ b/src/gbm/main/gbm.h
@@ -77,6 +77,12 @@
    GBM_BO_FORMAT_ARGB8888
 };
 
+
+/**
+ * The FourCC format codes are taken from the drm_fourcc.h definition, and
+ * re-namespaced. New GBM formats must not be added, unless they are
+ * identical ports from drm_fourcc.
+ */
 #define __gbm_fourcc_code(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \
 			      ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24))
 
@@ -252,6 +258,7 @@
 #define GBM_BO_IMPORT_WL_BUFFER         0x5501
 #define GBM_BO_IMPORT_EGL_IMAGE         0x5502
 #define GBM_BO_IMPORT_FD                0x5503
+#define GBM_BO_IMPORT_FD_MODIFIER       0x5504
 
 struct gbm_import_fd_data {
    int fd;
@@ -261,6 +268,17 @@
    uint32_t format;
 };
 
+struct gbm_import_fd_modifier_data {
+   uint32_t width;
+   uint32_t height;
+   uint32_t format;
+   uint32_t num_fds;
+   int fds[4];
+   int strides[4];
+   int offsets[4];
+   uint64_t modifier;
+};
+
 struct gbm_bo *
 gbm_bo_import(struct gbm_device *gbm, uint32_t type,
               void *buffer, uint32_t usage);
diff --git a/src/glx/Makefile.am b/src/glx/Makefile.am
index 41e4939..b306bcc 100644
--- a/src/glx/Makefile.am
+++ b/src/glx/Makefile.am
@@ -20,7 +20,6 @@
 # IN THE SOFTWARE.
 
 if HAVE_SHARED_GLAPI
-SHARED_GLAPI_CFLAGS = -DGLX_SHARED_GLAPI
 SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
 endif
 
@@ -42,7 +41,6 @@
 	-I$(top_builddir)/src/mapi/glapi \
 	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
-	$(SHARED_GLAPI_CFLAGS) \
 	$(EXTRA_DEFINES_XF86VIDMODE) \
 	-D_REENTRANT \
 	-DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\" \
diff --git a/src/glx/SConscript b/src/glx/SConscript
index 9727803..ca94d79 100644
--- a/src/glx/SConscript
+++ b/src/glx/SConscript
@@ -40,9 +40,6 @@
     env.Append(CPPDEFINES = ['XF86VIDMODE'])
     env.PkgUseModules('XF86VIDMODE')
 
-if False: # XXX: SHARED_GLAPI
-    env.Append(CPPDEFINES = ['GLX_SHARED_GLAPI'])
-
 sources = [
     'clientattrib.c',
     'clientinfo.c',
diff --git a/src/glx/apple/Makefile.am b/src/glx/apple/Makefile.am
index ca74aa7..bfa18b1 100644
--- a/src/glx/apple/Makefile.am
+++ b/src/glx/apple/Makefile.am
@@ -12,7 +12,6 @@
 	-I$(top_builddir)/src/mapi/glapi \
 	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
-	$(SHARED_GLAPI_CFLAGS) \
 	$(DEFINES) \
 	$(X11_INCLUDES)
 
diff --git a/src/glx/dri2_glx.c b/src/glx/dri2_glx.c
index 145f44d..ae8cb11 100644
--- a/src/glx/dri2_glx.c
+++ b/src/glx/dri2_glx.c
@@ -953,6 +953,18 @@
    __glXSetCurrentContext(&pcp->base);
 }
 
+static GLboolean
+driIsThreadSafe(void *loaderPrivate)
+{
+   struct dri2_context *pcp = (struct dri2_context *) loaderPrivate;
+   /* Check Xlib is running in thread safe mode
+    *
+    * 'lock_fns' is the XLockDisplay function pointer of the X11 display 'dpy'.
+    * It wll be NULL if XInitThreads wasn't called.
+    */
+   return pcp->base.psc->dpy->lock_fns != NULL;
+}
+
 static const __DRIdri2LoaderExtension dri2LoaderExtension = {
    .base = { __DRI_DRI2_LOADER, 3 },
 
@@ -974,9 +986,10 @@
 };
 
 static const __DRIbackgroundCallableExtension driBackgroundCallable = {
-   .base = { __DRI_BACKGROUND_CALLABLE, 1 },
+   .base = { __DRI_BACKGROUND_CALLABLE, 2 },
 
    .setBackgroundContext    = driSetBackgroundContext,
+   .isThreadSafe            = driIsThreadSafe,
 };
 
 _X_HIDDEN void
@@ -1298,12 +1311,17 @@
    psp->getBufferAge = NULL;
 
    if (pdp->driMinor >= 2) {
+      unsigned char disable;
+
       psp->getDrawableMSC = dri2DrawableGetMSC;
       psp->waitForMSC = dri2WaitForMSC;
       psp->waitForSBC = dri2WaitForSBC;
       psp->setSwapInterval = dri2SetSwapInterval;
       psp->getSwapInterval = dri2GetSwapInterval;
-      __glXEnableDirectExtension(&psc->base, "GLX_OML_sync_control");
+      if (psc->config->configQueryb(psc->driScreen,
+                                    "glx_disable_oml_sync_control",
+                                    &disable) || !disable)
+         __glXEnableDirectExtension(&psc->base, "GLX_OML_sync_control");
    }
 
    /* DRI2 supports SubBuffer through DRI2CopyRegion, so it's always
diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c
index e7ad40a..5091606 100644
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -509,6 +509,15 @@
    __glXSetCurrentContext(&pcp->base);
 }
 
+static GLboolean
+dri_is_thread_safe(void *loaderPrivate)
+{
+   /* Unlike DRI2, DRI3 doesn't call GetBuffers/GetBuffersWithFormat
+    * during draw so we're safe here.
+    */
+   return true;
+}
+
 /* The image loader extension record for DRI3
  */
 static const __DRIimageLoaderExtension imageLoaderExtension = {
@@ -523,9 +532,10 @@
 };
 
 static const __DRIbackgroundCallableExtension driBackgroundCallable = {
-   .base = { __DRI_BACKGROUND_CALLABLE, 1 },
+   .base = { __DRI_BACKGROUND_CALLABLE, 2 },
 
    .setBackgroundContext = dri_set_background_context,
+   .isThreadSafe         = dri_is_thread_safe,
 };
 
 static const __DRIextension *loader_extensions[] = {
@@ -786,6 +796,7 @@
    struct glx_config *configs = NULL, *visuals = NULL;
    char *driverName, *deviceName, *tmp;
    int i;
+   unsigned char disable;
 
    psc = calloc(1, sizeof *psc);
    if (psc == NULL)
@@ -924,13 +935,19 @@
    psp->waitForSBC = dri3_wait_for_sbc;
    psp->setSwapInterval = dri3_set_swap_interval;
    psp->getSwapInterval = dri3_get_swap_interval;
-   __glXEnableDirectExtension(&psc->base, "GLX_OML_sync_control");
+   if (psc->config->configQueryb(psc->driScreen,
+                                 "glx_disable_oml_sync_control",
+                                 &disable) || !disable)
+      __glXEnableDirectExtension(&psc->base, "GLX_OML_sync_control");
 
    psp->copySubBuffer = dri3_copy_sub_buffer;
    __glXEnableDirectExtension(&psc->base, "GLX_MESA_copy_sub_buffer");
 
    psp->getBufferAge = dri3_get_buffer_age;
-   __glXEnableDirectExtension(&psc->base, "GLX_EXT_buffer_age");
+   if (psc->config->configQueryb(psc->driScreen,
+                                 "glx_disable_ext_buffer_age",
+                                 &disable) || !disable)
+      __glXEnableDirectExtension(&psc->base, "GLX_EXT_buffer_age");
 
    free(driverName);
    free(deviceName);
diff --git a/src/glx/dri3_priv.h b/src/glx/dri3_priv.h
index 0822377..1d3c03f 100644
--- a/src/glx/dri3_priv.h
+++ b/src/glx/dri3_priv.h
@@ -101,7 +101,7 @@
 
    void *driver;
    int fd;
-   int is_different_gpu;
+   bool is_different_gpu;
 
    int show_fps_interval;
 
diff --git a/src/glx/glxcmds.c b/src/glx/glxcmds.c
index a3af417..44992f1 100644
--- a/src/glx/glxcmds.c
+++ b/src/glx/glxcmds.c
@@ -820,7 +820,7 @@
 {
 #ifdef GLX_USE_APPLEGL
    struct glx_context * gc = __glXGetCurrentContext();
-   if(gc != &DummyContext && apple_glx_is_current_drawable(dpy, gc->driContext, drawable)) {
+   if(gc != &dummyContext && apple_glx_is_current_drawable(dpy, gc->driContext, drawable)) {
       apple_glx_swap_buffers(gc->driContext);
    } else {
       __glXSendError(dpy, GLXBadCurrentWindow, 0, X_GLXSwapBuffers, false);
@@ -1014,6 +1014,7 @@
 
    MATCH_MASK(drawableType);
    MATCH_MASK(renderType);
+   MATCH_DONT_CARE(sRGBCapable);
 
    /* There is a bug in a few of the XFree86 DDX drivers.  They contain
     * visuals with a "transparent type" of 0 when they really mean GLX_NONE.
diff --git a/src/glx/windows/Makefile.am b/src/glx/windows/Makefile.am
index 6de3cf2..f84288b 100644
--- a/src/glx/windows/Makefile.am
+++ b/src/glx/windows/Makefile.am
@@ -27,6 +27,5 @@
 	-I$(top_builddir)/src/mapi/glapi \
 	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
-	$(SHARED_GLAPI_CFLAGS) \
 	$(DEFINES) \
 	$(X11_INCLUDES)
diff --git a/src/intel/Android.genxml.mk b/src/intel/Android.genxml.mk
index 4b0746c..e4d8dd8 100644
--- a/src/intel/Android.genxml.mk
+++ b/src/intel/Android.genxml.mk
@@ -96,6 +96,11 @@
 $(intermediates)/genxml/gen9_pack.h: $(LOCAL_PATH)/genxml/gen9.xml $(LOCAL_PATH)/genxml/gen_pack_header.py
 	$(call header-gen)
 
+$(intermediates)/genxml/gen10_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py
+$(intermediates)/genxml/gen10_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen10.xml
+$(intermediates)/genxml/gen10_pack.h: $(LOCAL_PATH)/genxml/gen10.xml $(LOCAL_PATH)/genxml/gen_pack_header.py
+	$(call header-gen)
+
 $(intermediates)/genxml/genX_xml.h: $(addprefix $(MESA_TOP)/src/intel/,$(GENXML_XML_FILES)) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py
 	@mkdir -p $(dir $@)
 	@echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
diff --git a/src/intel/Android.isl.mk b/src/intel/Android.isl.mk
index 67e6d2d..516ac3a 100644
--- a/src/intel/Android.isl.mk
+++ b/src/intel/Android.isl.mk
@@ -161,6 +161,25 @@
 include $(BUILD_STATIC_LIBRARY)
 
 # ---------------------------------------
+# Build libmesa_isl_gen10
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_isl_gen10
+
+LOCAL_SRC_FILES := $(ISL_GEN10_FILES)
+
+LOCAL_CFLAGS := -DGEN_VERSIONx10=100
+
+LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES)
+
+LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+# ---------------------------------------
 # Build libmesa_isl
 # ---------------------------------------
 
@@ -187,6 +206,7 @@
 	libmesa_isl_gen75 \
 	libmesa_isl_gen8 \
 	libmesa_isl_gen9 \
+	libmesa_isl_gen10 \
 	libmesa_genxml
 
 # Autogenerated sources
diff --git a/src/intel/Android.vulkan.mk b/src/intel/Android.vulkan.mk
index 2bf56a4..398f2e7 100644
--- a/src/intel/Android.vulkan.mk
+++ b/src/intel/Android.vulkan.mk
@@ -31,7 +31,9 @@
 	$(MESA_TOP)/src/gallium/include \
 	$(MESA_TOP)/src/mesa \
 	$(MESA_TOP)/src/vulkan/wsi \
+	$(MESA_TOP)/src/vulkan/util \
 	$(MESA_TOP)/src/intel \
+	$(MESA_TOP)/include/drm-uapi \
 	$(MESA_TOP)/src/intel/vulkan
 
 # libmesa_anv_entrypoints with header and dummy.c
@@ -92,7 +94,7 @@
 
 LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml
 
-LOCAL_SHARED_LIBRARIES := libdrm_intel
+LOCAL_SHARED_LIBRARIES := libdrm
 
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
@@ -158,6 +160,26 @@
 include $(BUILD_STATIC_LIBRARY)
 
 #
+# libanv for gen10
+#
+
+include $(CLEAR_VARS)
+LOCAL_MODULE := libmesa_anv_gen10
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+LOCAL_SRC_FILES := $(VULKAN_GEN10_FILES)
+LOCAL_CFLAGS := -DGEN_VERSIONx10=100
+
+LOCAL_C_INCLUDES := $(ANV_INCLUDES)
+
+LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml
+
+LOCAL_SHARED_LIBRARIES := libdrm
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+#
 # libmesa_vulkan_common
 #
 
@@ -206,6 +228,8 @@
 LOCAL_MODULE := libvulkan_intel
 LOCAL_MODULE_CLASS := SHARED_LIBRARIES
 
+LOCAL_LDFLAGS += -Wl,--build-id=sha1
+
 LOCAL_SRC_FILES := \
 	$(VULKAN_GEM_FILES)
 
@@ -228,6 +252,7 @@
 	libmesa_anv_gen75 \
 	libmesa_anv_gen8 \
 	libmesa_anv_gen9 \
+	libmesa_anv_gen10 \
 	libmesa_intel_compiler \
 	libmesa_anv_entrypoints
 
diff --git a/src/intel/BUILD.gn b/src/intel/BUILD.gn
index 6f60d54..577f958 100644
--- a/src/intel/BUILD.gn
+++ b/src/intel/BUILD.gn
@@ -56,6 +56,7 @@
     ":isl_gen75",
     ":isl_gen8",
     ":isl_gen9",
+    ":isl_gen10",
     "$mesa_build_root/include:c_compat",
     "$mesa_build_root/src/util",
   ]
@@ -135,6 +136,23 @@
   ]
 }
 
+source_set("isl_gen10") {
+  configs += [ ":intel_config" ]
+
+  sources = [
+    "isl/isl_emit_depth_stencil.c",
+    "isl/isl_surface_state.c",
+  ]
+
+  defines = [ "GEN_VERSIONx10=100" ]
+
+  deps = [
+    ":genxml",
+    "$mesa_build_root/include:c_compat",
+    "$mesa_build_root/src/util",
+  ]
+}
+
 config("gen_public_config") {
   include_dirs = [ "$target_gen_dir" ]
 }
@@ -146,6 +164,7 @@
     "genxml:gen_pack7_header",
     "genxml:gen_pack8_header",
     "genxml:gen_pack9_header",
+    "genxml:gen_pack10_header",
     "genxml:gen_bits_header",
   ]
 }
diff --git a/src/intel/Makefile.am b/src/intel/Makefile.am
index 269d73d..02c625a 100644
--- a/src/intel/Makefile.am
+++ b/src/intel/Makefile.am
@@ -38,6 +38,7 @@
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/gallium/include \
 	$(VALGRIND_CFLAGS) \
+	$(LIBDRM_CFLAGS) \
 	$(DEFINES)
 
 AM_CFLAGS = \
diff --git a/src/intel/Makefile.isl.am b/src/intel/Makefile.isl.am
index ee2215d..31273af 100644
--- a/src/intel/Makefile.isl.am
+++ b/src/intel/Makefile.isl.am
@@ -27,6 +27,7 @@
 	isl/libisl-gen75.la                              \
 	isl/libisl-gen8.la                               \
 	isl/libisl-gen9.la                               \
+	isl/libisl-gen10.la                              \
 	$(NULL)
 
 noinst_LTLIBRARIES += $(ISL_GEN_LIBS) isl/libisl.la
@@ -55,6 +56,9 @@
 isl_libisl_gen9_la_SOURCES = $(ISL_GEN9_FILES)
 isl_libisl_gen9_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=90
 
+isl_libisl_gen10_la_SOURCES = $(ISL_GEN10_FILES)
+isl_libisl_gen10_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=100
+
 BUILT_SOURCES += $(ISL_GENERATED_FILES)
 
 isl/isl_format_layout.c: isl/gen_format_layout.py \
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 0d44661..2b3065e 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -3,10 +3,12 @@
 	blorp/blorp.h \
 	blorp/blorp_blit.c \
 	blorp/blorp_clear.c \
+	blorp/blorp_nir_builder.h \
 	blorp/blorp_genX_exec.h \
 	blorp/blorp_priv.h
 
 COMMON_FILES = \
+	common/gen_clflush.h \
 	common/gen_debug.c \
 	common/gen_debug.h \
 	common/gen_device_info.c \
@@ -23,6 +25,14 @@
 COMPILER_FILES = \
 	compiler/brw_cfg.cpp \
 	compiler/brw_cfg.h \
+	compiler/brw_clip.h \
+	compiler/brw_clip_line.c \
+	compiler/brw_clip_point.c \
+	compiler/brw_clip_tri.c \
+	compiler/brw_clip_unfilled.c \
+	compiler/brw_clip_util.c \
+	compiler/brw_compile_clip.c \
+	compiler/brw_compile_sf.c \
 	compiler/brw_compiler.c \
 	compiler/brw_compiler.h \
 	compiler/brw_dead_control_flow.cpp \
@@ -65,6 +75,7 @@
 	compiler/brw_nir.h \
 	compiler/brw_nir.c \
 	compiler/brw_nir_analyze_boolean_resolves.c \
+	compiler/brw_nir_analyze_ubo_ranges.c \
 	compiler/brw_nir_attribute_workarounds.c \
 	compiler/brw_nir_intrinsics.c \
 	compiler/brw_nir_opt_peephole_ffma.c \
@@ -117,7 +128,8 @@
 	genxml/gen7.xml \
 	genxml/gen75.xml \
 	genxml/gen8.xml \
-	genxml/gen9.xml
+	genxml/gen9.xml \
+	genxml/gen10.xml
 
 GENXML_GENERATED_PACK_FILES = \
 	genxml/gen4_pack.h \
@@ -127,7 +139,8 @@
 	genxml/gen7_pack.h \
 	genxml/gen75_pack.h \
 	genxml/gen8_pack.h \
-	genxml/gen9_pack.h
+	genxml/gen9_pack.h \
+	genxml/gen10_pack.h
 
 GENXML_GENERATED_FILES = \
 	$(GENXML_GENERATED_PACK_FILES) \
@@ -137,6 +150,7 @@
 ISL_FILES = \
 	isl/isl.c \
 	isl/isl.h \
+	isl/isl_drm.c \
 	isl/isl_format.c \
 	isl/isl_priv.h \
 	isl/isl_storage_image.c
@@ -179,6 +193,10 @@
 	isl/isl_emit_depth_stencil.c \
 	isl/isl_surface_state.c
 
+ISL_GEN10_FILES = \
+	isl/isl_emit_depth_stencil.c \
+	isl/isl_surface_state.c
+
 ISL_GENERATED_FILES = \
 	isl/isl_format_layout.c
 
@@ -197,11 +215,13 @@
 	vulkan/anv_nir.h \
 	vulkan/anv_nir_apply_pipeline_layout.c \
 	vulkan/anv_nir_lower_input_attachments.c \
+	vulkan/anv_nir_lower_multiview.c \
 	vulkan/anv_nir_lower_push_constants.c \
 	vulkan/anv_pass.c \
 	vulkan/anv_pipeline.c \
 	vulkan/anv_pipeline_cache.c \
 	vulkan/anv_private.h \
+	vulkan/anv_queue.c \
 	vulkan/anv_util.c \
 	vulkan/anv_wsi.c \
 	vulkan/vk_format_info.h
@@ -245,3 +265,7 @@
 VULKAN_GEN9_FILES := \
 	vulkan/gen8_cmd_buffer.c \
 	$(VULKAN_GENX_FILES)
+
+VULKAN_GEN10_FILES := \
+	vulkan/gen8_cmd_buffer.c \
+	$(VULKAN_GENX_FILES)
diff --git a/src/intel/Makefile.tools.am b/src/intel/Makefile.tools.am
index 576beea..8071220 100644
--- a/src/intel/Makefile.tools.am
+++ b/src/intel/Makefile.tools.am
@@ -26,13 +26,13 @@
 tools_aubinator_SOURCES = \
 	tools/aubinator.c \
 	tools/disasm.c \
-	tools/gen_disasm.h
+	tools/gen_disasm.h \
+	tools/intel_aub.h
 
 tools_aubinator_CFLAGS = \
 	$(AM_CFLAGS) \
 	$(EXPAT_CFLAGS) \
-	$(ZLIB_CFLAGS) \
-	$(INTEL_CFLAGS)
+	$(ZLIB_CFLAGS)
 
 tools_aubinator_LDADD = \
 	common/libintel_common.la \
@@ -47,11 +47,15 @@
 
 
 tools_aubinator_error_decode_SOURCES = \
-	tools/aubinator_error_decode.c
+	tools/aubinator_error_decode.c \
+	tools/disasm.c \
+	tools/gen_disasm.h
 
 tools_aubinator_error_decode_LDADD = \
 	common/libintel_common.la \
+	compiler/libintel_compiler.la \
 	$(top_builddir)/src/util/libmesautil.la \
+	$(PTHREAD_LIBS) \
 	$(EXPAT_LIBS) \
 	$(ZLIB_LIBS)
 
diff --git a/src/intel/Makefile.vulkan.am b/src/intel/Makefile.vulkan.am
index 878abf3..6550f68 100644
--- a/src/intel/Makefile.vulkan.am
+++ b/src/intel/Makefile.vulkan.am
@@ -72,7 +72,8 @@
 	vulkan/libanv-gen7.la \
 	vulkan/libanv-gen75.la \
 	vulkan/libanv-gen8.la \
-	vulkan/libanv-gen9.la
+	vulkan/libanv-gen9.la \
+	vulkan/libanv-gen10.la
 
 noinst_LTLIBRARIES += $(VULKAN_PER_GEN_LIBS)
 
@@ -83,6 +84,7 @@
 VULKAN_CPPFLAGS = \
 	-I$(top_srcdir)/src/compiler \
 	-I$(top_srcdir)/src/intel/compiler \
+	-I$(top_srcdir)/include/drm-uapi \
 	-I$(top_builddir)/src/intel/vulkan \
 	-I$(top_srcdir)/src/intel/vulkan \
 	-I$(top_srcdir)/src/vulkan/wsi \
@@ -107,6 +109,10 @@
 vulkan_libanv_gen9_la_CPPFLAGS = $(VULKAN_CPPFLAGS) -DGEN_VERSIONx10=90
 vulkan_libanv_gen9_la_SOURCES = $(VULKAN_GEN9_FILES)
 
+vulkan_libanv_gen10_la_CFLAGS = $(VULKAN_CFLAGS)
+vulkan_libanv_gen10_la_CPPFLAGS = $(VULKAN_CPPFLAGS) -DGEN_VERSIONx10=100
+vulkan_libanv_gen10_la_SOURCES = $(VULKAN_GEN10_FILES)
+
 VULKAN_SOURCES = \
 	$(VULKAN_GENERATED_FILES) \
 	$(VULKAN_FILES)
diff --git a/src/intel/blorp/blorp.c b/src/intel/blorp/blorp.c
index 0b2395d..a426a03 100644
--- a/src/intel/blorp/blorp.c
+++ b/src/intel/blorp/blorp.c
@@ -66,6 +66,10 @@
                             unsigned int level, unsigned int layer,
                             enum isl_format format, bool is_render_target)
 {
+   assert(level < surf->surf->levels);
+   assert(layer < MAX2(surf->surf->logical_level0_px.depth >> level,
+                       surf->surf->logical_level0_px.array_len));
+
    info->enabled = true;
 
    if (format == ISL_FORMAT_UNSUPPORTED)
@@ -81,11 +85,6 @@
        * map it as 8-bit BGRA.
        */
       format = ISL_FORMAT_B8G8R8A8_UNORM;
-   } else if (surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) {
-      assert(surf->surf->format == ISL_FORMAT_R8_UINT);
-      /* Prior to Broadwell, we can't render to R8_UINT */
-      if (blorp->isl_dev->info->gen < 8)
-         format = ISL_FORMAT_R8_UNORM;
    }
 
    info->surf = *surf->surf;
@@ -95,6 +94,9 @@
    if (info->aux_usage != ISL_AUX_USAGE_NONE) {
       info->aux_surf = *surf->aux_surf;
       info->aux_addr = surf->aux_addr;
+      assert(level < info->aux_surf.levels);
+      assert(layer < MAX2(info->aux_surf.logical_level0_px.depth >> level,
+                          info->aux_surf.logical_level0_px.array_len));
    }
 
    info->clear_color = surf->clear_color;
@@ -129,10 +131,10 @@
       info->z_offset = 0;
    }
 
-   /* Sandy Bridge has a limit of a maximum of 512 layers for layered
-    * rendering.
+   /* Sandy Bridge and earlier have a limit of a maximum of 512 layers for
+    * layered rendering.
     */
-   if (is_render_target && blorp->isl_dev->info->gen == 6)
+   if (is_render_target && blorp->isl_dev->info->gen <= 6)
       info->view.array_len = MIN2(info->view.array_len, 512);
 }
 
@@ -158,7 +160,7 @@
 const unsigned *
 blorp_compile_fs(struct blorp_context *blorp, void *mem_ctx,
                  struct nir_shader *nir,
-                 const struct brw_wm_prog_key *wm_key,
+                 struct brw_wm_prog_key *wm_key,
                  bool use_repclear,
                  struct brw_wm_prog_data *wm_prog_data,
                  unsigned *program_size)
@@ -182,6 +184,13 @@
    nir_remove_dead_variables(nir, nir_var_shader_in);
    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 
+   if (blorp->compiler->devinfo->gen < 6) {
+      if (nir->info.fs.uses_discard)
+         wm_key->iz_lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;
+
+      wm_key->input_slots_valid = nir->info.inputs_read | VARYING_BIT_POS;
+   }
+
    const unsigned *program =
       brw_compile_fs(compiler, blorp->driver_ctx, mem_ctx, wm_key,
                      wm_prog_data, nir, NULL, -1, -1, false, use_repclear,
@@ -204,12 +213,12 @@
    nir = brw_preprocess_nir(compiler, nir);
    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 
-   vs_prog_data->inputs_read = nir->info->inputs_read;
+   vs_prog_data->inputs_read = nir->info.inputs_read;
 
    brw_compute_vue_map(compiler->devinfo,
                        &vs_prog_data->base.vue_map,
-                       nir->info->outputs_written,
-                       nir->info->separate_shader);
+                       nir->info.outputs_written,
+                       nir->info.separate_shader);
 
    struct brw_vs_prog_key vs_key = { 0, };
 
@@ -221,61 +230,130 @@
    return program;
 }
 
+struct blorp_sf_key {
+   enum blorp_shader_type shader_type; /* Must be BLORP_SHADER_TYPE_GEN4_SF */
+
+   struct brw_sf_prog_key key;
+};
+
+bool
+blorp_ensure_sf_program(struct blorp_context *blorp,
+                        struct blorp_params *params)
+{
+   const struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
+   assert(params->wm_prog_data);
+
+   /* Gen6+ doesn't need a strips and fans program */
+   if (blorp->compiler->devinfo->gen >= 6)
+      return true;
+
+   struct blorp_sf_key key = {
+      .shader_type = BLORP_SHADER_TYPE_GEN4_SF,
+   };
+
+   /* Everything gets compacted in vertex setup, so we just need a
+    * pass-through for the correct number of input varyings.
+    */
+   const uint64_t slots_valid = VARYING_BIT_POS |
+      ((1ull << wm_prog_data->num_varying_inputs) - 1) << VARYING_SLOT_VAR0;
+
+   key.key.attrs = slots_valid;
+   key.key.primitive = BRW_SF_PRIM_TRIANGLES;
+   key.key.contains_flat_varying = wm_prog_data->contains_flat_varying;
+
+   STATIC_ASSERT(sizeof(key.key.interp_mode) ==
+                 sizeof(wm_prog_data->interp_mode));
+   memcpy(key.key.interp_mode, wm_prog_data->interp_mode,
+          sizeof(key.key.interp_mode));
+
+   if (blorp->lookup_shader(blorp, &key, sizeof(key),
+                            &params->sf_prog_kernel, &params->sf_prog_data))
+      return true;
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   const unsigned *program;
+   unsigned program_size;
+
+   struct brw_vue_map vue_map;
+   brw_compute_vue_map(blorp->compiler->devinfo, &vue_map, slots_valid, false);
+
+   struct brw_sf_prog_data prog_data_tmp;
+   program = brw_compile_sf(blorp->compiler, mem_ctx, &key.key,
+                            &prog_data_tmp, &vue_map, &program_size);
+
+   bool result =
+      blorp->upload_shader(blorp, &key, sizeof(key), program, program_size,
+                           (void *)&prog_data_tmp, sizeof(prog_data_tmp),
+                           &params->sf_prog_kernel, &params->sf_prog_data);
+
+   ralloc_free(mem_ctx);
+
+   return result;
+}
+
 void
-blorp_gen6_hiz_op(struct blorp_batch *batch,
-                  struct blorp_surf *surf, unsigned level, unsigned layer,
-                  enum blorp_hiz_op op)
+blorp_hiz_op(struct blorp_batch *batch, struct blorp_surf *surf,
+             uint32_t level, uint32_t start_layer, uint32_t num_layers,
+             enum blorp_hiz_op op)
 {
    struct blorp_params params;
    blorp_params_init(&params);
 
    params.hiz_op = op;
+   params.full_surface_hiz_op = true;
 
-   brw_blorp_surface_info_init(batch->blorp, &params.depth, surf, level, layer,
-                               surf->surf->format, true);
+   for (uint32_t a = 0; a < num_layers; a++) {
+      const uint32_t layer = start_layer + a;
 
-   /* Align the rectangle primitive to 8x4 pixels.
-    *
-    * During fast depth clears, the emitted rectangle primitive  must be
-    * aligned to 8x4 pixels.  From the Ivybridge PRM, Vol 2 Part 1 Section
-    * 11.5.3.1 Depth Buffer Clear (and the matching section in the Sandybridge
-    * PRM):
-    *     If Number of Multisamples is NUMSAMPLES_1, the rectangle must be
-    *     aligned to an 8x4 pixel block relative to the upper left corner
-    *     of the depth buffer [...]
-    *
-    * For hiz resolves, the rectangle must also be 8x4 aligned. Item
-    * WaHizAmbiguate8x4Aligned from the Haswell workarounds page and the
-    * Ivybridge simulator require the alignment.
-    *
-    * To be safe, let's just align the rect for all hiz operations and all
-    * hardware generations.
-    *
-    * However, for some miptree slices of a Z24 texture, emitting an 8x4
-    * aligned rectangle that covers the slice may clobber adjacent slices if
-    * we strictly adhered to the texture alignments specified in the PRM.  The
-    * Ivybridge PRM, Section "Alignment Unit Size", states that
-    * SURFACE_STATE.Surface_Horizontal_Alignment should be 4 for Z24 surfaces,
-    * not 8. But commit 1f112cc increased the alignment from 4 to 8, which
-    * prevents the clobbering.
-    */
-   params.x1 = minify(params.depth.surf.logical_level0_px.width,
-                      params.depth.view.base_level);
-   params.y1 = minify(params.depth.surf.logical_level0_px.height,
-                      params.depth.view.base_level);
-   params.x1 = ALIGN(params.x1, 8);
-   params.y1 = ALIGN(params.y1, 4);
+      brw_blorp_surface_info_init(batch->blorp, &params.depth, surf, level,
+                                  layer, surf->surf->format, true);
 
-   if (params.depth.view.base_level == 0) {
-      /* TODO: What about MSAA? */
-      params.depth.surf.logical_level0_px.width = params.x1;
-      params.depth.surf.logical_level0_px.height = params.y1;
+      /* Align the rectangle primitive to 8x4 pixels.
+       *
+       * During fast depth clears, the emitted rectangle primitive must be
+       * aligned to 8x4 pixels.  From the Ivybridge PRM, Vol 2 Part 1 Section
+       * 11.5.3.1 Depth Buffer Clear (and the matching section in the
+       * Sandybridge PRM):
+       *
+       *     If Number of Multisamples is NUMSAMPLES_1, the rectangle must be
+       *     aligned to an 8x4 pixel block relative to the upper left corner
+       *     of the depth buffer [...]
+       *
+       * For hiz resolves, the rectangle must also be 8x4 aligned. Item
+       * WaHizAmbiguate8x4Aligned from the Haswell workarounds page and the
+       * Ivybridge simulator require the alignment.
+       *
+       * To be safe, let's just align the rect for all hiz operations and all
+       * hardware generations.
+       *
+       * However, for some miptree slices of a Z24 texture, emitting an 8x4
+       * aligned rectangle that covers the slice may clobber adjacent slices
+       * if we strictly adhered to the texture alignments specified in the
+       * PRM.  The Ivybridge PRM, Section "Alignment Unit Size", states that
+       * SURFACE_STATE.Surface_Horizontal_Alignment should be 4 for Z24
+       * surfaces, not 8. But commit 1f112cc increased the alignment from 4 to
+       * 8, which prevents the clobbering.
+       */
+      params.x1 = minify(params.depth.surf.logical_level0_px.width,
+                         params.depth.view.base_level);
+      params.y1 = minify(params.depth.surf.logical_level0_px.height,
+                         params.depth.view.base_level);
+      params.x1 = ALIGN(params.x1, 8);
+      params.y1 = ALIGN(params.y1, 4);
+
+      if (params.depth.view.base_level == 0) {
+         /* TODO: What about MSAA? */
+         params.depth.surf.logical_level0_px.width = params.x1;
+         params.depth.surf.logical_level0_px.height = params.y1;
+      }
+
+      params.dst.surf.samples = params.depth.surf.samples;
+      params.dst.surf.logical_level0_px = params.depth.surf.logical_level0_px;
+      params.depth_format =
+         isl_format_get_depth_format(surf->surf->format, false);
+      params.num_samples = params.depth.surf.samples;
+
+      batch->blorp->exec(batch, &params);
    }
-
-   params.dst.surf.samples = params.depth.surf.samples;
-   params.dst.surf.logical_level0_px = params.depth.surf.logical_level0_px;
-   params.depth_format = isl_format_get_depth_format(surf->surf->format, false);
-   params.num_samples = params.depth.surf.samples;
-
-   batch->blorp->exec(batch, &params);
 }
diff --git a/src/intel/blorp/blorp.h b/src/intel/blorp/blorp.h
index eab75d7..d19920e 100644
--- a/src/intel/blorp/blorp.h
+++ b/src/intel/blorp/blorp.h
@@ -75,6 +75,9 @@
     * hardware.
     */
    BLORP_BATCH_NO_EMIT_DEPTH_STENCIL = (1 << 0),
+
+   /* This flag indicates that the blorp call should be predicated. */
+   BLORP_BATCH_PREDICATE_ENABLE      = (1 << 1),
 };
 
 struct blorp_batch {
@@ -191,6 +194,23 @@
                   enum isl_format format,
                   enum blorp_fast_clear_op resolve_op);
 
+/* Resolves subresources of the image subresource range specified in the
+ * binding table.
+ */
+void
+blorp_ccs_resolve_attachment(struct blorp_batch *batch,
+                             const uint32_t binding_table_offset,
+                             struct blorp_surf * const surf,
+                             const uint32_t level, const uint32_t num_layers,
+                             const enum isl_format format,
+                             const enum blorp_fast_clear_op resolve_op);
+
+void
+blorp_mcs_partial_resolve(struct blorp_batch *batch,
+                          struct blorp_surf *surf,
+                          enum isl_format format,
+                          uint32_t start_layer, uint32_t num_layers);
+
 /**
  * For an overview of the HiZ operations, see the following sections of the
  * Sandy Bridge PRM, Volume 1, Part2:
@@ -209,9 +229,9 @@
 };
 
 void
-blorp_gen6_hiz_op(struct blorp_batch *batch,
-                  struct blorp_surf *surf, unsigned level, unsigned layer,
-                  enum blorp_hiz_op op);
+blorp_hiz_op(struct blorp_batch *batch, struct blorp_surf *surf,
+             uint32_t level, uint32_t start_layer, uint32_t num_layers,
+             enum blorp_hiz_op op);
 
 #ifdef __cplusplus
 } /* end extern "C" */
diff --git a/src/intel/blorp/blorp_blit.c b/src/intel/blorp/blorp_blit.c
index 691564c..35008cb 100644
--- a/src/intel/blorp/blorp_blit.c
+++ b/src/intel/blorp/blorp_blit.c
@@ -55,6 +55,7 @@
    nir_variable *v_src_z;
    nir_variable *v_src_offset;
    nir_variable *v_dst_offset;
+   nir_variable *v_src_inv_size;
 
    /* gl_FragCoord */
    nir_variable *frag_coord;
@@ -79,6 +80,7 @@
    LOAD_INPUT(src_z, glsl_uint_type())
    LOAD_INPUT(src_offset, glsl_vector_type(GLSL_TYPE_UINT, 2))
    LOAD_INPUT(dst_offset, glsl_vector_type(GLSL_TYPE_UINT, 2))
+   LOAD_INPUT(src_inv_size, glsl_vector_type(GLSL_TYPE_FLOAT, 2))
 
 #undef LOAD_INPUT
 
@@ -133,7 +135,7 @@
    nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0),
                                   nir_channel(b, coord_transform, 2));
 
-   return nir_ffma(b, src_pos, mul, offset);
+   return nir_fadd(b, nir_fmul(b, src_pos, mul), offset);
 }
 
 static inline void
@@ -198,10 +200,18 @@
 
 static nir_ssa_def *
 blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v,
-              nir_ssa_def *pos, nir_alu_type dst_type)
+              const struct brw_blorp_blit_prog_key *key, nir_ssa_def *pos)
 {
+   if (key->need_src_offset)
+      pos = nir_fadd(b, pos, nir_i2f32(b, nir_load_var(b, v->v_src_offset)));
+
+   /* If the sampler requires normalized coordinates, we need to compensate. */
+   if (key->src_coords_normalized)
+      pos = nir_fmul(b, pos, nir_load_var(b, v->v_src_inv_size));
+
    nir_tex_instr *tex =
-      blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2, dst_type);
+      blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2,
+                                 key->texture_data_type);
 
    assert(pos->num_components == 2);
    tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
@@ -1185,10 +1195,11 @@
           * representing the four samples that maxe up a pixel.  So we need
           * to multiply our X and Y coordinates each by 2 and then add 1.
           */
-         src_pos = nir_ishl(&b, src_pos, nir_imm_int(&b, 1));
-         src_pos = nir_iadd(&b, src_pos, nir_imm_int(&b, 1));
-         src_pos = nir_i2f32(&b, src_pos);
-         color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
+         assert(key->src_coords_normalized);
+         src_pos = nir_fadd(&b,
+                            nir_i2f32(&b, src_pos),
+                            nir_imm_float(&b, 0.5f));
+         color = blorp_nir_tex(&b, &v, key, src_pos);
       } else {
          /* Gen7+ hardware doesn't automaticaly blend. */
          color = blorp_nir_manual_blend_average(&b, &v, src_pos, key->src_samples,
@@ -1200,7 +1211,7 @@
       color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v);
    } else {
       if (key->bilinear_filter) {
-         color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
+         color = blorp_nir_tex(&b, &v, key, src_pos);
       } else {
          /* We're going to use texelFetch, so we need integers */
          if (src_pos->num_components == 2) {
@@ -1301,7 +1312,7 @@
    struct brw_wm_prog_data prog_data;
 
    nir_shader *nir = brw_blorp_build_nir_shader(blorp, mem_ctx, prog_key);
-   nir->info->name = ralloc_strdup(nir, "BLORP-blit");
+   nir->info.name = ralloc_strdup(nir, "BLORP-blit");
 
    struct brw_wm_prog_key wm_key;
    brw_blorp_init_wm_prog_key(&wm_key);
@@ -1371,9 +1382,9 @@
    }
 }
 
-static void
-surf_convert_to_single_slice(const struct isl_device *isl_dev,
-                             struct brw_blorp_surface_info *info)
+void
+blorp_surf_convert_to_single_slice(const struct isl_device *isl_dev,
+                                   struct brw_blorp_surface_info *info)
 {
    bool ok UNUSED;
 
@@ -1394,42 +1405,25 @@
    else
       layer = info->view.base_array_layer;
 
-   uint32_t x_offset_sa, y_offset_sa;
-   isl_surf_get_image_offset_sa(&info->surf, info->view.base_level,
-                                layer, z, &x_offset_sa, &y_offset_sa);
-
    uint32_t byte_offset;
-   isl_tiling_get_intratile_offset_sa(isl_dev, info->surf.tiling,
-                                      info->surf.format, info->surf.row_pitch,
-                                      x_offset_sa, y_offset_sa,
-                                      &byte_offset,
-                                      &info->tile_x_sa, &info->tile_y_sa);
+   isl_surf_get_image_surf(isl_dev, &info->surf,
+                           info->view.base_level, layer, z,
+                           &info->surf,
+                           &byte_offset, &info->tile_x_sa, &info->tile_y_sa);
    info->addr.offset += byte_offset;
 
-   const uint32_t slice_width_px =
-      minify(info->surf.logical_level0_px.width, info->view.base_level);
-   const uint32_t slice_height_px =
-      minify(info->surf.logical_level0_px.height, info->view.base_level);
-
    uint32_t tile_x_px, tile_y_px;
    surf_get_intratile_offset_px(info, &tile_x_px, &tile_y_px);
 
-   struct isl_surf_init_info init_info = {
-      .dim = ISL_SURF_DIM_2D,
-      .format = info->surf.format,
-      .width = slice_width_px + tile_x_px,
-      .height = slice_height_px + tile_y_px,
-      .depth = 1,
-      .levels = 1,
-      .array_len = 1,
-      .samples = info->surf.samples,
-      .row_pitch = info->surf.row_pitch,
-      .usage = info->surf.usage,
-      .tiling_flags = 1 << info->surf.tiling,
-   };
-
-   ok = isl_surf_init_s(isl_dev, &info->surf, &init_info);
-   assert(ok);
+   /* Instead of using the X/Y Offset fields in RENDER_SURFACE_STATE, we place
+    * the image at the tile boundary and offset our sampling or rendering.
+    * For this reason, we need to grow the image by the offset to ensure that
+    * the hardware doesn't think we've gone past the edge.
+    */
+   info->surf.logical_level0_px.w += tile_x_px;
+   info->surf.logical_level0_px.h += tile_y_px;
+   info->surf.phys_level0_sa.w += info->tile_x_sa;
+   info->surf.phys_level0_sa.h += info->tile_y_sa;
 
    /* The view is also different now. */
    info->view.base_level = 0;
@@ -1446,7 +1440,7 @@
    assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED);
 
    /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
-   surf_convert_to_single_slice(isl_dev, info);
+   blorp_surf_convert_to_single_slice(isl_dev, info);
 
    info->surf.logical_level0_px = info->surf.phys_level0_sa;
    info->surf.samples = 1;
@@ -1460,7 +1454,7 @@
    assert(info->surf.tiling == ISL_TILING_W);
 
    /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
-   surf_convert_to_single_slice(isl_dev, info);
+   blorp_surf_convert_to_single_slice(isl_dev, info);
 
    /* On gen7+, we don't have interleaved multisampling for color render
     * targets so we have to fake it.
@@ -1551,10 +1545,11 @@
                        struct brw_blorp_surface_info *info,
                        uint32_t *x, uint32_t *width)
 {
-   surf_convert_to_single_slice(isl_dev, info);
+   blorp_surf_convert_to_single_slice(isl_dev, info);
 
    info->surf.logical_level0_px.width *= 3;
    info->surf.phys_level0_sa.width *= 3;
+   info->tile_x_sa *= 3;
    *x *= 3;
    *width *= 3;
 
@@ -1665,6 +1660,22 @@
                                    coords->y.dst0, coords->y.dst1,
                                    coords->y.mirror);
 
+
+   if (devinfo->gen == 4) {
+      /* The MinLOD and MinimumArrayElement don't work properly for cube maps.
+       * Convert them to a single slice on gen4.
+       */
+      if (params->dst.surf.usage & ISL_SURF_USAGE_CUBE_BIT) {
+         blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, &params->dst);
+         wm_prog_key->need_dst_offset = true;
+      }
+
+      if (params->src.surf.usage & ISL_SURF_USAGE_CUBE_BIT) {
+         blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, &params->src);
+         wm_prog_key->need_src_offset = true;
+      }
+   }
+
    if (devinfo->gen > 6 &&
        params->dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
       assert(params->dst.surf.samples > 1);
@@ -1804,6 +1815,19 @@
 
    params->num_samples = params->dst.surf.samples;
 
+   if ((wm_prog_key->bilinear_filter ||
+        (wm_prog_key->blend && !wm_prog_key->blit_scaled)) &&
+       batch->blorp->isl_dev->info->gen <= 6) {
+      /* Gen4-5 can't do non-normalized coords; the guard also covers gen6 */
+      wm_prog_key->src_coords_normalized = true;
+      params->wm_inputs.src_inv_size[0] =
+         1.0f / minify(params->src.surf.logical_level0_px.width,
+                       params->src.view.base_level);
+      params->wm_inputs.src_inv_size[1] =
+         1.0f / minify(params->src.surf.logical_level0_px.height,
+                       params->src.view.base_level);
+   }
+
    if (params->src.tile_x_sa || params->src.tile_y_sa) {
       assert(wm_prog_key->need_src_offset);
       surf_get_intratile_offset_px(&params->src,
@@ -1828,6 +1852,9 @@
    if (!brw_blorp_get_blit_kernel(batch->blorp, params, wm_prog_key))
       return 0;
 
+   if (!blorp_ensure_sf_program(batch->blorp, params))
+      return 0;
+
    unsigned result = 0;
    unsigned max_surface_size = get_max_surface_size(devinfo, params);
    if (params->src.surf.logical_level0_px.width > max_surface_size ||
@@ -1883,7 +1910,7 @@
    struct isl_extent2d px_size_sa;
    int adjust;
 
-   surf_convert_to_single_slice(dev, info);
+   blorp_surf_convert_to_single_slice(dev, info);
 
    px_size_sa = get_px_size_sa(&info->surf);
 
@@ -1893,7 +1920,7 @@
     */
    x_offset_sa = (uint32_t)*x0 * px_size_sa.w + info->tile_x_sa;
    y_offset_sa = (uint32_t)*y0 * px_size_sa.h + info->tile_y_sa;
-   isl_tiling_get_intratile_offset_sa(dev, info->surf.tiling,
+   isl_tiling_get_intratile_offset_sa(info->surf.tiling,
                                       info->surf.format, info->surf.row_pitch,
                                       x_offset_sa, y_offset_sa,
                                       &byte_offset,
@@ -2020,6 +2047,21 @@
    struct blorp_params params;
    blorp_params_init(&params);
 
+   /* We cannot handle combined depth and stencil. */
+   if (src_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT)
+      assert(src_surf->surf->format == ISL_FORMAT_R8_UINT);
+   if (dst_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT)
+      assert(dst_surf->surf->format == ISL_FORMAT_R8_UINT);
+
+   if (dst_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) {
+      assert(src_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT);
+      /* Prior to Broadwell, we can't render to R8_UINT */
+      if (batch->blorp->isl_dev->info->gen < 8) {
+         src_format = ISL_FORMAT_R8_UNORM;
+         dst_format = ISL_FORMAT_R8_UNORM;
+      }
+   }
+
    brw_blorp_surface_info_init(batch->blorp, &params.src, src_surf, src_level,
                                src_layer, src_format, false);
    brw_blorp_surface_info_init(batch->blorp, &params.dst, dst_surf, dst_level,
@@ -2047,8 +2089,9 @@
    wm_prog_key.y_scale = params.src.surf.samples / wm_prog_key.x_scale;
 
    if (filter == GL_LINEAR &&
-       params.src.surf.samples <= 1 && params.dst.surf.samples <= 1)
+       params.src.surf.samples <= 1 && params.dst.surf.samples <= 1) {
       wm_prog_key.bilinear_filter = true;
+   }
 
    if ((params.src.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) == 0 &&
        (params.src.surf.usage & ISL_SURF_USAGE_STENCIL_BIT) == 0 &&
@@ -2298,7 +2341,7 @@
     * ones with the same bpb) and divide x, y, width, and height by the
     * block size.
     */
-   surf_convert_to_single_slice(isl_dev, info);
+   blorp_surf_convert_to_single_slice(isl_dev, info);
 
    if (width || height) {
 #ifndef NDEBUG
@@ -2399,16 +2442,30 @@
    }
 
    if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) {
+      /* It's safe to do a blorp_copy between things which are sRGB with CCS_E
+       * enabled even though CCS_E doesn't technically do sRGB on SKL because
+       * we stomp everything to UINT anyway.  The one thing we have to be
+       * careful of is clear colors.  Because fast clear colors for sRGB on
+       * gen9 are encoded as the float values between format conversion and
+       * sRGB curve application, a given clear color float will convert to the
+       * same bits regardless of whether the format is UNORM or sRGB.
+       * Therefore, we can handle sRGB without any special cases.
+       */
+      UNUSED enum isl_format linear_src_format =
+         isl_format_srgb_to_linear(src_surf->surf->format);
       assert(isl_formats_are_ccs_e_compatible(batch->blorp->isl_dev->info,
-                                              src_surf->surf->format,
+                                              linear_src_format,
                                               params.src.view.format));
       params.src.clear_color =
          bitcast_color_value_to_uint(params.src.clear_color, src_fmtl);
    }
 
    if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) {
+      /* See above where we handle linear_src_format */
+      UNUSED enum isl_format linear_dst_format =
+         isl_format_srgb_to_linear(dst_surf->surf->format);
       assert(isl_formats_are_ccs_e_compatible(batch->blorp->isl_dev->info,
-                                              dst_surf->surf->format,
+                                              linear_dst_format,
                                               params.dst.view.format));
       params.dst.clear_color =
          bitcast_color_value_to_uint(params.dst.clear_color, dst_fmtl);
diff --git a/src/intel/blorp/blorp_clear.c b/src/intel/blorp/blorp_clear.c
index 4e834ba..0feebef 100644
--- a/src/intel/blorp/blorp_clear.c
+++ b/src/intel/blorp/blorp_clear.c
@@ -29,7 +29,7 @@
 #include "blorp_priv.h"
 #include "compiler/brw_eu_defines.h"
 
-#include "compiler/nir/nir_builder.h"
+#include "blorp_nir_builder.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BLORP
 
@@ -58,7 +58,7 @@
 
    nir_builder b;
    nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);
-   b.shader->info->name = ralloc_strdup(b.shader, "BLORP-clear");
+   b.shader->info.name = ralloc_strdup(b.shader, "BLORP-clear");
 
    nir_variable *v_color =
       BLORP_CREATE_NIR_INPUT(b.shader, clear_color, glsl_vec4_type());
@@ -120,7 +120,7 @@
 
    nir_builder b;
    nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_VERTEX, NULL);
-   b.shader->info->name = ralloc_strdup(b.shader, "BLORP-layer-offset-vs");
+   b.shader->info.name = ralloc_strdup(b.shader, "BLORP-layer-offset-vs");
 
    const struct glsl_type *uvec4_type = glsl_vector_type(GLSL_TYPE_UINT, 4);
 
@@ -308,6 +308,11 @@
                  uint32_t level, uint32_t start_layer, uint32_t num_layers,
                  uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1)
 {
+   /* Ensure that all layers undergoing the clear have an auxiliary buffer. */
+   assert(start_layer + num_layers <=
+          MAX2(surf->aux_surf->logical_level0_px.depth >> level,
+               surf->aux_surf->logical_level0_px.array_len));
+
    struct blorp_params params;
    blorp_params_init(&params);
    params.num_layers = num_layers;
@@ -366,11 +371,6 @@
    struct blorp_params params;
    blorp_params_init(&params);
 
-   params.x0 = x0;
-   params.y0 = y0;
-   params.x1 = x1;
-   params.y1 = y1;
-
    /* Manually apply the clear destination swizzle.  This way swizzled clears
     * will work for swizzles which we can't normally use for rendering and it
     * also ensures that they work on pre-Haswell hardware which can't swizlle
@@ -404,6 +404,10 @@
    if (surf->surf->tiling == ISL_TILING_LINEAR)
       use_simd16_replicated_data = false;
 
+   /* Replicated clears don't work yet before gen6 */
+   if (batch->blorp->isl_dev->info->gen < 6)
+      use_simd16_replicated_data = false;
+
    /* Constant color writes ignore everyting in blend and color calculator
     * state.  This is not documented.
     */
@@ -419,11 +423,35 @@
                                       use_simd16_replicated_data))
       return;
 
+   if (!blorp_ensure_sf_program(batch->blorp, &params))
+      return;
+
    while (num_layers > 0) {
       brw_blorp_surface_info_init(batch->blorp, &params.dst, surf, level,
                                   start_layer, format, true);
       params.dst.view.swizzle = swizzle;
 
+      params.x0 = x0;
+      params.y0 = y0;
+      params.x1 = x1;
+      params.y1 = y1;
+
+      /* The MinLOD and MinimumArrayElement don't work properly for cube maps.
+       * Convert them to a single slice on gen4.
+       */
+      if (batch->blorp->isl_dev->info->gen == 4 &&
+          (params.dst.surf.usage & ISL_SURF_USAGE_CUBE_BIT)) {
+         blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, &params.dst);
+
+         if (params.dst.tile_x_sa || params.dst.tile_y_sa) {
+            /* This is gen4 so there is no multisampling and sa == px. */
+            params.x0 += params.dst.tile_x_sa;
+            params.y0 += params.dst.tile_y_sa;
+            params.x1 += params.dst.tile_x_sa;
+            params.y1 += params.dst.tile_y_sa;
+         }
+      }
+
       params.num_samples = params.dst.surf.samples;
 
       /* We may be restricted on the number of layers we can bind at any one
@@ -456,6 +484,16 @@
    params.x1 = x1;
    params.y1 = y1;
 
+   if (ISL_DEV_GEN(batch->blorp->isl_dev) == 6) {
+      /* For some reason, Sandy Bridge gets occlusion queries wrong if we
+       * don't have a shader.  In particular, it records samples even though
+       * we disable statistics in 3DSTATE_WM.  Give it the usual clear shader
+       * to work around the issue.
+       */
+      if (!blorp_params_get_clear_kernel(batch->blorp, &params, false))
+         return;
+   }
+
    while (num_layers > 0) {
       params.num_layers = num_layers;
 
@@ -665,20 +703,16 @@
    batch->blorp->exec(batch, &params);
 }
 
-void
-blorp_ccs_resolve(struct blorp_batch *batch,
-                  struct blorp_surf *surf, uint32_t level, uint32_t layer,
-                  enum isl_format format,
-                  enum blorp_fast_clear_op resolve_op)
+static void
+prepare_ccs_resolve(struct blorp_batch * const batch,
+                    struct blorp_params * const params,
+                    const struct blorp_surf * const surf,
+                    const uint32_t level, const uint32_t layer,
+                    const enum isl_format format,
+                    const enum blorp_fast_clear_op resolve_op)
 {
-   struct blorp_params params;
-   blorp_params_init(&params);
-
-   /* Layered and mipmapped fast clear is only available from Gen8 onwards. */
-   assert(ISL_DEV_GEN(batch->blorp->isl_dev) >= 8 ||
-          (level == 0 && layer == 0));
-
-   brw_blorp_surface_info_init(batch->blorp, &params.dst, surf,
+   blorp_params_init(params);
+   brw_blorp_surface_info_init(batch->blorp, &params->dst, surf,
                                level, layer, format, true);
 
    /* From the Ivy Bridge PRM, Vol2 Part1 11.9 "Render Target Resolve":
@@ -691,7 +725,7 @@
     * multiply by 8 and 16. On Sky Lake, we multiply by 8.
     */
    const struct isl_format_layout *aux_fmtl =
-      isl_format_get_layout(params.dst.aux_surf.format);
+      isl_format_get_layout(params->dst.aux_surf.format);
    assert(aux_fmtl->txc == ISL_TXC_CCS);
 
    unsigned x_scaledown, y_scaledown;
@@ -705,11 +739,11 @@
       x_scaledown = aux_fmtl->bw / 2;
       y_scaledown = aux_fmtl->bh / 2;
    }
-   params.x0 = params.y0 = 0;
-   params.x1 = minify(params.dst.aux_surf.logical_level0_px.width, level);
-   params.y1 = minify(params.dst.aux_surf.logical_level0_px.height, level);
-   params.x1 = ALIGN(params.x1, x_scaledown) / x_scaledown;
-   params.y1 = ALIGN(params.y1, y_scaledown) / y_scaledown;
+   params->x0 = params->y0 = 0;
+   params->x1 = minify(params->dst.aux_surf.logical_level0_px.width, level);
+   params->y1 = minify(params->dst.aux_surf.logical_level0_px.height, level);
+   params->x1 = ALIGN(params->x1, x_scaledown) / x_scaledown;
+   params->y1 = ALIGN(params->y1, y_scaledown) / y_scaledown;
 
    if (batch->blorp->isl_dev->info->gen >= 9) {
       assert(resolve_op == BLORP_FAST_CLEAR_OP_RESOLVE_FULL ||
@@ -718,7 +752,7 @@
       /* Broadwell and earlier do not have a partial resolve */
       assert(resolve_op == BLORP_FAST_CLEAR_OP_RESOLVE_FULL);
    }
-   params.fast_clear_op = resolve_op;
+   params->fast_clear_op = resolve_op;
 
    /* Note: there is no need to initialize push constants because it doesn't
     * matter what data gets dispatched to the render target.  However, we must
@@ -726,7 +760,139 @@
     * color" message.
     */
 
-   if (!blorp_params_get_clear_kernel(batch->blorp, &params, true))
+   if (!blorp_params_get_clear_kernel(batch->blorp, params, true))
+      return;
+}
+
+void
+blorp_ccs_resolve(struct blorp_batch *batch,
+                  struct blorp_surf *surf, uint32_t level, uint32_t layer,
+                  enum isl_format format,
+                  enum blorp_fast_clear_op resolve_op)
+{
+   struct blorp_params params;
+
+   prepare_ccs_resolve(batch, &params, surf, level, layer, format, resolve_op);
+
+   batch->blorp->exec(batch, &params);
+}
+
+void
+blorp_ccs_resolve_attachment(struct blorp_batch *batch,
+                             const uint32_t binding_table_offset,
+                             struct blorp_surf * const surf,
+                             const uint32_t level, const uint32_t num_layers,
+                             const enum isl_format format,
+                             const enum blorp_fast_clear_op resolve_op)
+{
+   struct blorp_params params;
+
+   prepare_ccs_resolve(batch, &params, surf, level, 0, format, resolve_op);
+   params.use_pre_baked_binding_table = true;
+   params.pre_baked_binding_table_offset = binding_table_offset;
+   params.num_layers = num_layers;
+
+   batch->blorp->exec(batch, &params);
+}
+
+struct blorp_mcs_partial_resolve_key
+{
+   enum blorp_shader_type shader_type;
+   uint32_t num_samples;
+};
+
+static bool
+blorp_params_get_mcs_partial_resolve_kernel(struct blorp_context *blorp,
+                                            struct blorp_params *params)
+{
+   const struct blorp_mcs_partial_resolve_key blorp_key = {
+      .shader_type = BLORP_SHADER_TYPE_MCS_PARTIAL_RESOLVE,
+      .num_samples = params->num_samples,
+   };
+
+   if (blorp->lookup_shader(blorp, &blorp_key, sizeof(blorp_key),
+                            &params->wm_prog_kernel, &params->wm_prog_data))
+      return true;
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   nir_builder b;
+   nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);
+   b.shader->info.name = ralloc_strdup(b.shader, "BLORP-mcs-partial-resolve");
+
+   nir_variable *v_color =
+      BLORP_CREATE_NIR_INPUT(b.shader, clear_color, glsl_vec4_type());
+
+   nir_variable *frag_color =
+      nir_variable_create(b.shader, nir_var_shader_out,
+                          glsl_vec4_type(), "gl_FragColor");
+   frag_color->data.location = FRAG_RESULT_COLOR;
+
+   /* Do an MCS fetch and check if it is equal to the magic clear value */
+   nir_ssa_def *mcs =
+      blorp_nir_txf_ms_mcs(&b, nir_f2i32(&b, blorp_nir_frag_coord(&b)),
+                               nir_load_layer_id(&b));
+   nir_ssa_def *is_clear =
+      blorp_nir_mcs_is_clear_color(&b, mcs, blorp_key.num_samples);
+
+   /* If we aren't the clear value, discard. */
+   nir_intrinsic_instr *discard =
+      nir_intrinsic_instr_create(b.shader, nir_intrinsic_discard_if);
+   discard->src[0] = nir_src_for_ssa(nir_inot(&b, is_clear));
+   nir_builder_instr_insert(&b, &discard->instr);
+
+   nir_copy_var(&b, frag_color, v_color);
+
+   struct brw_wm_prog_key wm_key;
+   brw_blorp_init_wm_prog_key(&wm_key);
+   wm_key.tex.compressed_multisample_layout_mask = 1;
+   wm_key.tex.msaa_16 = blorp_key.num_samples == 16;
+   wm_key.multisample_fbo = true;
+
+   struct brw_wm_prog_data prog_data;
+   unsigned program_size;
+   const unsigned *program =
+      blorp_compile_fs(blorp, mem_ctx, b.shader, &wm_key, false,
+                       &prog_data, &program_size);
+
+   bool result =
+      blorp->upload_shader(blorp, &blorp_key, sizeof(blorp_key),
+                           program, program_size,
+                           &prog_data.base, sizeof(prog_data),
+                           &params->wm_prog_kernel, &params->wm_prog_data);
+
+   ralloc_free(mem_ctx);
+   return result;
+}
+
+void
+blorp_mcs_partial_resolve(struct blorp_batch *batch,
+                          struct blorp_surf *surf,
+                          enum isl_format format,
+                          uint32_t start_layer, uint32_t num_layers)
+{
+   struct blorp_params params;
+   blorp_params_init(&params);
+
+   assert(batch->blorp->isl_dev->info->gen >= 7);
+
+   params.x0 = 0;
+   params.y0 = 0;
+   params.x1 = surf->surf->logical_level0_px.width;
+   params.y1 = surf->surf->logical_level0_px.height;
+
+   brw_blorp_surface_info_init(batch->blorp, &params.src, surf, 0,
+                               start_layer, format, false);
+   brw_blorp_surface_info_init(batch->blorp, &params.dst, surf, 0,
+                               start_layer, format, true);
+
+   params.num_samples = params.dst.surf.samples;
+   params.num_layers = num_layers;
+
+   memcpy(&params.wm_inputs.clear_color,
+          surf->clear_color.f32, sizeof(float) * 4);
+
+   if (!blorp_params_get_mcs_partial_resolve_kernel(batch->blorp, &params))
       return;
 
    batch->blorp->exec(batch, &params);
diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h
index 6f0d31a..9353416 100644
--- a/src/intel/blorp/blorp_genX_exec.h
+++ b/src/intel/blorp/blorp_genX_exec.h
@@ -27,6 +27,7 @@
 #include "blorp_priv.h"
 #include "common/gen_device_info.h"
 #include "common/gen_sample_positions.h"
+#include "genxml/gen_macros.h"
 
 /**
  * This file provides the blorp pipeline setup and execution functionality.
@@ -59,6 +60,11 @@
 blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                           struct blorp_address *addr);
 
+#if GEN_GEN >= 8
+static struct blorp_address
+blorp_get_workaround_page(struct blorp_batch *batch);
+#endif
+
 static void
 blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
                           unsigned state_size, unsigned state_alignment,
@@ -73,12 +79,15 @@
                     struct blorp_address address, uint32_t delta);
 
 static void
-blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size);
+blorp_emit_urb_config(struct blorp_batch *batch,
+                      unsigned vs_entry_size, unsigned sf_entry_size);
+
+static void
+blorp_emit_pipeline(struct blorp_batch *batch,
+                    const struct blorp_params *params);
 
 /***** BEGIN blorp_exec implementation ******/
 
-#include "genxml/gen_macros.h"
-
 static uint64_t
 _blorp_combine_address(struct blorp_batch *batch, void *location,
                        struct blorp_address address, uint32_t delta)
@@ -180,7 +189,10 @@
    /* The URB size is expressed in units of 64 bytes (512 bits) */
    const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);
 
-   blorp_emit_urb_config(batch, vs_entry_size);
+   const unsigned sf_entry_size =
+      params->sf_prog_data ? params->sf_prog_data->urb_entry_size : 0;
+
+   blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
 }
 
 static void
@@ -256,32 +268,42 @@
    blorp_emit_vertex_data(batch, params, &vb[0].BufferStartingAddress, &size);
    vb[0].VertexBufferIndex = 0;
    vb[0].BufferPitch = 3 * sizeof(float);
+#if GEN_GEN >= 6
    vb[0].VertexBufferMOCS = batch->blorp->mocs.vb;
+#endif
 #if GEN_GEN >= 7
    vb[0].AddressModifyEnable = true;
 #endif
 #if GEN_GEN >= 8
    vb[0].BufferSize = size;
-#else
+#elif GEN_GEN >= 5
    vb[0].BufferAccessType = VERTEXDATA;
    vb[0].EndAddress = vb[0].BufferStartingAddress;
    vb[0].EndAddress.offset += size - 1;
+#elif GEN_GEN == 4
+   vb[0].BufferAccessType = VERTEXDATA;
+   vb[0].MaxIndex = 2;
 #endif
 
    blorp_emit_input_varying_data(batch, params,
                                  &vb[1].BufferStartingAddress, &size);
    vb[1].VertexBufferIndex = 1;
    vb[1].BufferPitch = 0;
+#if GEN_GEN >= 6
    vb[1].VertexBufferMOCS = batch->blorp->mocs.vb;
+#endif
 #if GEN_GEN >= 7
    vb[1].AddressModifyEnable = true;
 #endif
 #if GEN_GEN >= 8
    vb[1].BufferSize = size;
-#else
+#elif GEN_GEN >= 5
    vb[1].BufferAccessType = INSTANCEDATA;
    vb[1].EndAddress = vb[1].BufferStartingAddress;
    vb[1].EndAddress.offset += size - 1;
+#elif GEN_GEN == 4
+   vb[1].BufferAccessType = INSTANCEDATA;
+   vb[1].MaxIndex = 0;
 #endif
 
    const unsigned num_dwords = 1 + GENX(VERTEX_BUFFER_STATE_length) * 2;
@@ -301,7 +323,8 @@
 {
    const unsigned num_varyings =
       params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
-   const unsigned num_elements = 2 + num_varyings;
+   bool need_ndc = batch->blorp->compiler->devinfo->gen <= 5;
+   const unsigned num_elements = 2 + need_ndc + num_varyings;
 
    struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
    memset(ve, 0, num_elements * sizeof(*ve));
@@ -352,42 +375,84 @@
     *
     * See the vertex element setup below.
     */
-   ve[0].VertexBufferIndex = 1;
-   ve[0].Valid = true;
-   ve[0].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
-   ve[0].SourceElementOffset = 0;
-   ve[0].Component0Control = VFCOMP_STORE_SRC;
+   unsigned slot = 0;
 
-   /* From Gen8 onwards hardware is no more instructed to overwrite components
-    * using an element specifier. Instead one has separate 3DSTATE_VF_SGVS
-    * (System Generated Value Setup) state packet for it.
-    */
+   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
+      .VertexBufferIndex = 1,
+      .Valid = true,
+      .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+      .SourceElementOffset = 0,
+      .Component0Control = VFCOMP_STORE_SRC,
+
+      /* From Gen8 onwards hardware is no more instructed to overwrite
+       * components using an element specifier. Instead one has separate
+       * 3DSTATE_VF_SGVS (System Generated Value Setup) state packet for it.
+       */
 #if GEN_GEN >= 8
-   ve[0].Component1Control = VFCOMP_STORE_0;
+      .Component1Control = VFCOMP_STORE_0,
+#elif GEN_GEN >= 5
+      .Component1Control = VFCOMP_STORE_IID,
 #else
-   ve[0].Component1Control = VFCOMP_STORE_IID;
+      .Component1Control = VFCOMP_STORE_0,
 #endif
-   ve[0].Component2Control = VFCOMP_STORE_SRC;
-   ve[0].Component3Control = VFCOMP_STORE_SRC;
+      .Component2Control = VFCOMP_STORE_SRC,
+      .Component3Control = VFCOMP_STORE_SRC,
+#if GEN_GEN <= 5
+      .DestinationElementOffset = slot * 4,
+#endif
+   };
+   slot++;
 
-   ve[1].VertexBufferIndex = 0;
-   ve[1].Valid = true;
-   ve[1].SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT;
-   ve[1].SourceElementOffset = 0;
-   ve[1].Component0Control = VFCOMP_STORE_SRC;
-   ve[1].Component1Control = VFCOMP_STORE_SRC;
-   ve[1].Component2Control = VFCOMP_STORE_SRC;
-   ve[1].Component3Control = VFCOMP_STORE_1_FP;
+#if GEN_GEN <= 5
+   /* On Iron Lake and earlier, a normalized device coordinates version of the
+    * position goes right after the normal VUE header and before position.
+    * Since w == 1 for all of our coordinates, this is just a copy of the
+    * position.
+    */
+   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
+      .VertexBufferIndex = 0,
+      .Valid = true,
+      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
+      .SourceElementOffset = 0,
+      .Component0Control = VFCOMP_STORE_SRC,
+      .Component1Control = VFCOMP_STORE_SRC,
+      .Component2Control = VFCOMP_STORE_SRC,
+      .Component3Control = VFCOMP_STORE_1_FP,
+      .DestinationElementOffset = slot * 4,
+   };
+   slot++;
+#endif
+
+   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
+      .VertexBufferIndex = 0,
+      .Valid = true,
+      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
+      .SourceElementOffset = 0,
+      .Component0Control = VFCOMP_STORE_SRC,
+      .Component1Control = VFCOMP_STORE_SRC,
+      .Component2Control = VFCOMP_STORE_SRC,
+      .Component3Control = VFCOMP_STORE_1_FP,
+#if GEN_GEN <= 5
+      .DestinationElementOffset = slot * 4,
+#endif
+   };
+   slot++;
 
    for (unsigned i = 0; i < num_varyings; ++i) {
-      ve[i + 2].VertexBufferIndex = 1;
-      ve[i + 2].Valid = true;
-      ve[i + 2].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
-      ve[i + 2].SourceElementOffset = 16 + i * 4 * sizeof(float);
-      ve[i + 2].Component0Control = VFCOMP_STORE_SRC;
-      ve[i + 2].Component1Control = VFCOMP_STORE_SRC;
-      ve[i + 2].Component2Control = VFCOMP_STORE_SRC;
-      ve[i + 2].Component3Control = VFCOMP_STORE_SRC;
+      ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
+         .VertexBufferIndex = 1,
+         .Valid = true,
+         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+         .SourceElementOffset = 16 + i * 4 * sizeof(float),
+         .Component0Control = VFCOMP_STORE_SRC,
+         .Component1Control = VFCOMP_STORE_SRC,
+         .Component2Control = VFCOMP_STORE_SRC,
+         .Component3Control = VFCOMP_STORE_SRC,
+#if GEN_GEN <= 5
+         .DestinationElementOffset = slot * 4,
+#endif
+      };
+      slot++;
    }
 
    const unsigned num_dwords =
@@ -424,6 +489,79 @@
 #endif
 }
 
+/* 3DSTATE_VIEWPORT_STATE_POINTERS */
+static uint32_t
+blorp_emit_cc_viewport(struct blorp_batch *batch,
+                       const struct blorp_params *params)
+{
+   uint32_t cc_vp_offset;
+   blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
+      vp.MinimumDepth = 0.0;
+      vp.MaximumDepth = 1.0;
+   }
+
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
+      vsp.CCViewportPointer = cc_vp_offset;
+   }
+#elif GEN_GEN == 6
+   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
+      vsp.CCViewportStateChange = true;
+      vsp.PointertoCC_VIEWPORT = cc_vp_offset;
+   }
+#endif
+
+   return cc_vp_offset;
+}
+
+static uint32_t
+blorp_emit_sampler_state(struct blorp_batch *batch,
+                         const struct blorp_params *params)
+{
+   uint32_t offset;
+   blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
+      sampler.MipModeFilter = MIPFILTER_NONE;
+      sampler.MagModeFilter = MAPFILTER_LINEAR;
+      sampler.MinModeFilter = MAPFILTER_LINEAR;
+      sampler.MinLOD = 0;
+      sampler.MaxLOD = 0;
+      sampler.TCXAddressControlMode = TCM_CLAMP;
+      sampler.TCYAddressControlMode = TCM_CLAMP;
+      sampler.TCZAddressControlMode = TCM_CLAMP;
+      sampler.MaximumAnisotropy = RATIO21;
+      sampler.RAddressMinFilterRoundingEnable = true;
+      sampler.RAddressMagFilterRoundingEnable = true;
+      sampler.VAddressMinFilterRoundingEnable = true;
+      sampler.VAddressMagFilterRoundingEnable = true;
+      sampler.UAddressMinFilterRoundingEnable = true;
+      sampler.UAddressMagFilterRoundingEnable = true;
+#if GEN_GEN > 6
+      sampler.NonnormalizedCoordinateEnable = true;
+#endif
+   }
+
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
+      ssp.PointertoPSSamplerState = offset;
+   }
+#elif GEN_GEN == 6
+   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
+      ssp.VSSamplerStateChange = true;
+      ssp.GSSamplerStateChange = true;
+      ssp.PSSamplerStateChange = true;
+      ssp.PointertoPSSamplerState = offset;
+   }
+#endif
+
+   return offset;
+}
+
+/* What follows is the code for setting up a "pipeline" on Sandy Bridge and
+ * later hardware.  This file will be included by i965 for gen4-5 as well, so
+ * this code is guarded by GEN_GEN >= 6.
+ */
+#if GEN_GEN >= 6
+
 static void
 blorp_emit_vs_config(struct blorp_batch *batch,
                      const struct blorp_params *params)
@@ -432,7 +570,7 @@
 
    blorp_emit(batch, GENX(3DSTATE_VS), vs) {
       if (vs_prog_data) {
-         vs.FunctionEnable = true;
+         vs.Enable = true;
 
          vs.KernelStartPointer = params->vs_prog_kernel;
 
@@ -781,97 +919,39 @@
 #endif /* GEN_GEN */
 }
 
-static const uint32_t isl_to_gen_ds_surftype [] = {
-#if GEN_GEN >= 9
-   /* From the SKL PRM, "3DSTATE_DEPTH_STENCIL::SurfaceType":
-    *
-    *    "If depth/stencil is enabled with 1D render target, depth/stencil
-    *    surface type needs to be set to 2D surface type and height set to 1.
-    *    Depth will use (legacy) TileY and stencil will use TileW. For this
-    *    case only, the Surface Type of the depth buffer can be 2D while the
-    *    Surface Type of the render target(s) are 1D, representing an
-    *    exception to a programming note above.
-    */
-   [ISL_SURF_DIM_1D] = SURFTYPE_2D,
-#else
-   [ISL_SURF_DIM_1D] = SURFTYPE_1D,
-#endif
-   [ISL_SURF_DIM_2D] = SURFTYPE_2D,
-   [ISL_SURF_DIM_3D] = SURFTYPE_3D,
-};
-
-static void
-blorp_emit_depth_stencil_config(struct blorp_batch *batch,
-                                const struct blorp_params *params)
-{
-   const struct isl_device *isl_dev = batch->blorp->isl_dev;
-
-   uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
-   if (dw == NULL)
-      return;
-
-   struct isl_depth_stencil_hiz_emit_info info = {
-#if GEN_GEN >= 7
-      .mocs = 1, /* GEN7_MOCS_L3 */
-#else
-      .mocs = 0,
-#endif
-   };
-
-   if (params->depth.enabled) {
-      info.view = &params->depth.view;
-   } else if (params->stencil.enabled) {
-      info.view = &params->stencil.view;
-   }
-
-   if (params->depth.enabled) {
-      info.depth_surf = &params->depth.surf;
-
-      info.depth_address =
-         blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
-                          params->depth.addr, 0);
-
-      info.hiz_usage = params->depth.aux_usage;
-      if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
-         info.hiz_surf = &params->depth.aux_surf;
-
-         info.hiz_address =
-            blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
-                             params->depth.aux_addr, 0);
-
-         info.depth_clear_value = params->depth.clear_color.u32[0];
-      }
-   }
-
-   if (params->stencil.enabled) {
-      info.stencil_surf = &params->stencil.surf;
-
-      info.stencil_address =
-         blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
-                          params->stencil.addr, 0);
-   }
-
-   isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);
-}
-
 static uint32_t
 blorp_emit_blend_state(struct blorp_batch *batch,
                        const struct blorp_params *params)
 {
-   uint32_t offset;
-   blorp_emit_dynamic(batch, GENX(BLEND_STATE), blend, 64, &offset) {
-      for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
-         blend.Entry[i].PreBlendColorClampEnable = true;
-         blend.Entry[i].PostBlendColorClampEnable = true;
-         blend.Entry[i].ColorClampRange = COLORCLAMP_RTFORMAT;
+   struct GENX(BLEND_STATE) blend;
+   memset(&blend, 0, sizeof(blend));
 
-         blend.Entry[i].WriteDisableRed = params->color_write_disable[0];
-         blend.Entry[i].WriteDisableGreen = params->color_write_disable[1];
-         blend.Entry[i].WriteDisableBlue = params->color_write_disable[2];
-         blend.Entry[i].WriteDisableAlpha = params->color_write_disable[3];
-      }
+   uint32_t offset;
+   int size = GENX(BLEND_STATE_length) * 4;
+   size += GENX(BLEND_STATE_ENTRY_length) * 4 * params->num_draw_buffers;
+   uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
+   uint32_t *pos = state;
+
+   GENX(BLEND_STATE_pack)(NULL, pos, &blend);
+   pos += GENX(BLEND_STATE_length);
+
+   for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
+      struct GENX(BLEND_STATE_ENTRY) entry = {
+         .PreBlendColorClampEnable = true,
+         .PostBlendColorClampEnable = true,
+         .ColorClampRange = COLORCLAMP_RTFORMAT,
+
+         .WriteDisableRed = params->color_write_disable[0],
+         .WriteDisableGreen = params->color_write_disable[1],
+         .WriteDisableBlue = params->color_write_disable[2],
+         .WriteDisableAlpha = params->color_write_disable[3],
+      };
+      GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
+      pos += GENX(BLEND_STATE_ENTRY_length);
    }
 
+   blorp_flush_range(batch, state, size);
+
 #if GEN_GEN >= 7
    blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
       sp.BlendStatePointer = offset;
@@ -992,9 +1072,141 @@
 }
 
 static void
+blorp_emit_3dstate_multisample(struct blorp_batch *batch,
+                               const struct blorp_params *params)
+{
+   blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
+      ms.NumberofMultisamples       = __builtin_ffs(params->num_samples) - 1;
+
+#if GEN_GEN >= 8
+      /* The PRM says that this bit is valid only for DX9:
+       *
+       *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
+       *    should not have any effect by setting or not setting this bit.
+       */
+      ms.PixelPositionOffsetEnable  = false;
+#elif GEN_GEN >= 7
+
+      switch (params->num_samples) {
+      case 1:
+         GEN_SAMPLE_POS_1X(ms.Sample);
+         break;
+      case 2:
+         GEN_SAMPLE_POS_2X(ms.Sample);
+         break;
+      case 4:
+         GEN_SAMPLE_POS_4X(ms.Sample);
+         break;
+      case 8:
+         GEN_SAMPLE_POS_8X(ms.Sample);
+         break;
+      default:
+         break;
+      }
+#else
+      GEN_SAMPLE_POS_4X(ms.Sample);
+#endif
+      ms.PixelLocation              = CENTER;
+   }
+}
+
+static void
+blorp_emit_pipeline(struct blorp_batch *batch,
+                    const struct blorp_params *params)
+{
+   uint32_t blend_state_offset = 0;
+   uint32_t color_calc_state_offset;
+   uint32_t depth_stencil_state_offset;
+
+   emit_urb_config(batch, params);
+
+   if (params->wm_prog_data) {
+      blend_state_offset = blorp_emit_blend_state(batch, params);
+   }
+   color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
+   depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
+
+#if GEN_GEN == 6
+   /* 3DSTATE_CC_STATE_POINTERS
+    *
+    * The pointer offsets are relative to
+    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
+    *
+    * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
+    *
+    * The dynamic state emit helpers emit their own STATE_POINTERS packets on
+    * gen7+.  However, on gen6 and earlier, they're all lumped together in
+    * one CC_STATE_POINTERS packet so we have to emit that here.
+    */
+   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
+      cc.BLEND_STATEChange = true;
+      cc.ColorCalcStatePointerValid = true;
+      cc.DEPTH_STENCIL_STATEChange = true;
+      cc.PointertoBLEND_STATE = blend_state_offset;
+      cc.ColorCalcStatePointer = color_calc_state_offset;
+      cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
+   }
+#else
+   (void)blend_state_offset;
+   (void)color_calc_state_offset;
+   (void)depth_stencil_state_offset;
+#endif
+
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), DS);
+#endif
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
+
+   if (params->src.enabled)
+      blorp_emit_sampler_state(batch, params);
+
+   blorp_emit_3dstate_multisample(batch, params);
+
+   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
+      mask.SampleMask = (1 << params->num_samples) - 1;
+   }
+
+   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
+    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
+    *
+    *   [DevSNB] A pipeline flush must be programmed prior to a
+    *   3DSTATE_VS command that causes the VS Function Enable to
+    *   toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
+    *   command with CS stall bit set and a post sync operation.
+    *
+    * We've already done one at the start of the BLORP operation.
+    */
+   blorp_emit_vs_config(batch, params);
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_HS), hs);
+   blorp_emit(batch, GENX(3DSTATE_TE), te);
+   blorp_emit(batch, GENX(3DSTATE_DS), DS);
+   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
+#endif
+   blorp_emit(batch, GENX(3DSTATE_GS), gs);
+
+   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
+      clip.PerspectiveDivideDisable = true;
+   }
+
+   blorp_emit_sf_config(batch, params);
+   blorp_emit_ps_config(batch, params);
+
+   blorp_emit_cc_viewport(batch, params);
+}
+
+/******** This is the end of the pipeline setup code ********/
+
+#endif /* GEN_GEN >= 6 */
+
+static void
 blorp_emit_surface_state(struct blorp_batch *batch,
                          const struct brw_blorp_surface_info *surface,
                          void *state, uint32_t state_offset,
+                         const bool color_write_disables[4],
                          bool is_render_target)
 {
    const struct isl_device *isl_dev = batch->blorp->isl_dev;
@@ -1011,13 +1223,26 @@
    if (aux_usage == ISL_AUX_USAGE_HIZ)
       aux_usage = ISL_AUX_USAGE_NONE;
 
+   isl_channel_mask_t write_disable_mask = 0;
+   if (is_render_target && GEN_GEN <= 5) {
+      if (color_write_disables[0])
+         write_disable_mask |= ISL_CHANNEL_RED_BIT;
+      if (color_write_disables[1])
+         write_disable_mask |= ISL_CHANNEL_GREEN_BIT;
+      if (color_write_disables[2])
+         write_disable_mask |= ISL_CHANNEL_BLUE_BIT;
+      if (color_write_disables[3])
+         write_disable_mask |= ISL_CHANNEL_ALPHA_BIT;
+   }
+
    const uint32_t mocs =
       is_render_target ? batch->blorp->mocs.rb : batch->blorp->mocs.tex;
 
    isl_surf_fill_state(batch->blorp->isl_dev, state,
                        .surf = &surf, .view = &surface->view,
                        .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
-                       .mocs = mocs, .clear_color = surface->clear_color);
+                       .mocs = mocs, .clear_color = surface->clear_color,
+                       .write_disables = write_disable_mask);
 
    blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                        surface->addr, 0);
@@ -1050,7 +1275,9 @@
       .MinimumArrayElement = surface->view.base_array_layer,
       .Depth = surface->view.array_len - 1,
       .RenderTargetViewExtent = surface->view.array_len - 1,
+#if GEN_GEN >= 6
       .NumberofMultisamples = ffs(surface->surf.samples) - 1,
+#endif
 
 #if GEN_GEN >= 7
       .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,
@@ -1088,7 +1315,7 @@
          blorp_emit_surface_state(batch, &params->dst,
                                   surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                   surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
-                                  true);
+                                  params->color_write_disable, true);
       } else {
          assert(params->depth.enabled || params->stencil.enabled);
          const struct brw_blorp_surface_info *surface =
@@ -1100,7 +1327,8 @@
       if (params->src.enabled) {
          blorp_emit_surface_state(batch, &params->src,
                                   surface_maps[BLORP_TEXTURE_BT_INDEX],
-                                  surface_offsets[BLORP_TEXTURE_BT_INDEX], false);
+                                  surface_offsets[BLORP_TEXTURE_BT_INDEX],
+                                  NULL, false);
       }
    }
 
@@ -1113,91 +1341,98 @@
    blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
       bt.PointertoPSBindingTable = bind_offset;
    }
-#else
+#elif GEN_GEN >= 6
    blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
       bt.PSBindingTableChange = true;
       bt.PointertoPSBindingTable = bind_offset;
    }
+#else
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
+      bt.PointertoPSBindingTable = bind_offset;
+   }
 #endif
 }
 
 static void
-blorp_emit_sampler_state(struct blorp_batch *batch,
-                         const struct blorp_params *params)
+blorp_emit_depth_stencil_config(struct blorp_batch *batch,
+                                const struct blorp_params *params)
 {
-   uint32_t offset;
-   blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
-      sampler.MipModeFilter = MIPFILTER_NONE;
-      sampler.MagModeFilter = MAPFILTER_LINEAR;
-      sampler.MinModeFilter = MAPFILTER_LINEAR;
-      sampler.MinLOD = 0;
-      sampler.MaxLOD = 0;
-      sampler.TCXAddressControlMode = TCM_CLAMP;
-      sampler.TCYAddressControlMode = TCM_CLAMP;
-      sampler.TCZAddressControlMode = TCM_CLAMP;
-      sampler.MaximumAnisotropy = RATIO21;
-      sampler.RAddressMinFilterRoundingEnable = true;
-      sampler.RAddressMagFilterRoundingEnable = true;
-      sampler.VAddressMinFilterRoundingEnable = true;
-      sampler.VAddressMagFilterRoundingEnable = true;
-      sampler.UAddressMinFilterRoundingEnable = true;
-      sampler.UAddressMagFilterRoundingEnable = true;
-      sampler.NonnormalizedCoordinateEnable = true;
-   }
+   const struct isl_device *isl_dev = batch->blorp->isl_dev;
 
+   uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
+   if (dw == NULL)
+      return;
+
+   struct isl_depth_stencil_hiz_emit_info info = {
 #if GEN_GEN >= 7
-   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
-      ssp.PointertoPSSamplerState = offset;
-   }
+      .mocs = 1, /* GEN7_MOCS_L3 */
 #else
-   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
-      ssp.VSSamplerStateChange = true;
-      ssp.GSSamplerStateChange = true;
-      ssp.PSSamplerStateChange = true;
-      ssp.PointertoPSSamplerState = offset;
-   }
+      .mocs = 0,
 #endif
-}
+   };
 
-static void
-blorp_emit_3dstate_multisample(struct blorp_batch *batch,
-                               const struct blorp_params *params)
-{
-   blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
-      ms.NumberofMultisamples       = __builtin_ffs(params->num_samples) - 1;
+   if (params->depth.enabled) {
+      info.view = &params->depth.view;
+   } else if (params->stencil.enabled) {
+      info.view = &params->stencil.view;
+   }
 
-#if GEN_GEN >= 8
-      /* The PRM says that this bit is valid only for DX9:
-       *
-       *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
-       *    should not have any effect by setting or not setting this bit.
-       */
-      ms.PixelPositionOffsetEnable  = false;
-      ms.PixelLocation              = CENTER;
-#elif GEN_GEN >= 7
-      ms.PixelLocation              = PIXLOC_CENTER;
+   if (params->depth.enabled) {
+      info.depth_surf = &params->depth.surf;
 
-      switch (params->num_samples) {
-      case 1:
-         GEN_SAMPLE_POS_1X(ms.Sample);
-         break;
-      case 2:
-         GEN_SAMPLE_POS_2X(ms.Sample);
-         break;
-      case 4:
-         GEN_SAMPLE_POS_4X(ms.Sample);
-         break;
-      case 8:
-         GEN_SAMPLE_POS_8X(ms.Sample);
-         break;
-      default:
-         break;
+      info.depth_address =
+         blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
+                          params->depth.addr, 0);
+
+      info.hiz_usage = params->depth.aux_usage;
+      if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
+         info.hiz_surf = &params->depth.aux_surf;
+
+         struct blorp_address hiz_address = params->depth.aux_addr;
+#if GEN_GEN == 6
+         /* Sandy bridge hardware does not technically support mipmapped HiZ.
+          * However, we have a special layout that allows us to make it work
+          * anyway by manually offsetting to the specified miplevel.
+          */
+         assert(info.hiz_surf->dim_layout == ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ);
+         uint32_t offset_B;
+         isl_surf_get_image_offset_B_tile_sa(info.hiz_surf,
+                                             info.view->base_level, 0, 0,
+                                             &offset_B, NULL, NULL);
+         hiz_address.offset += offset_B;
+#endif
+
+         info.hiz_address =
+            blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
+                             hiz_address, 0);
+
+         info.depth_clear_value = params->depth.clear_color.f32[0];
       }
-#else
-      ms.PixelLocation              = PIXLOC_CENTER;
-      GEN_SAMPLE_POS_4X(ms.Sample);
-#endif
    }
+
+   if (params->stencil.enabled) {
+      info.stencil_surf = &params->stencil.surf;
+
+      struct blorp_address stencil_address = params->stencil.addr;
+#if GEN_GEN == 6
+      /* Sandy bridge hardware does not technically support mipmapped stencil.
+       * However, we have a special layout that allows us to make it work
+       * anyway by manually offsetting to the specified miplevel.
+       */
+      assert(info.stencil_surf->dim_layout == ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ);
+      uint32_t offset_B;
+      isl_surf_get_image_offset_B_tile_sa(info.stencil_surf,
+                                          info.view->base_level, 0, 0,
+                                          &offset_B, NULL, NULL);
+      stencil_address.offset += offset_B;
+#endif
+
+      info.stencil_address =
+         blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
+                          stencil_address, 0);
+   }
+
+   isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);
 }
 
 #if GEN_GEN >= 8
@@ -1235,11 +1470,14 @@
          hzp.StencilBufferClearEnable = params->stencil.enabled;
          hzp.DepthBufferClearEnable = params->depth.enabled;
          hzp.StencilClearValue = params->stencil_ref;
+         hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
          break;
       case BLORP_HIZ_OP_DEPTH_RESOLVE:
+         assert(params->full_surface_hiz_op);
          hzp.DepthBufferResolveEnable = true;
          break;
       case BLORP_HIZ_OP_HIZ_RESOLVE:
+         assert(params->full_surface_hiz_op);
          hzp.HierarchicalDepthBufferResolveEnable = true;
          break;
       case BLORP_HIZ_OP_NONE:
@@ -1264,49 +1502,15 @@
    /* PIPE_CONTROL w/ all bits clear except for “Post-Sync Operation” must set
     * to “Write Immediate Data” enabled.
     */
-   // This pipe control generates a store to address 0, which is bad.  The bspec say a
-   // CommandStreamerStallEnable should suffice here and experimentation shows this to be true.
-   // Upstream mesa may handle this differently.
    blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
-      pc.CommandStreamerStallEnable = true;
+      pc.PostSyncOperation = WriteImmediateData;
+      pc.Address = blorp_get_workaround_page(batch);
    }
 
    blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);
-
-   /* Perform depth clear specific flushing */
-   if (params->hiz_op == BLORP_HIZ_OP_DEPTH_CLEAR && params->depth.enabled) {
-      blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
-         pc.DepthStallEnable = true;
-         pc.DepthCacheFlushEnable = true;
-      }
-   }
 }
 #endif
 
-/* 3DSTATE_VIEWPORT_STATE_POINTERS */
-static void
-blorp_emit_viewport_state(struct blorp_batch *batch,
-                          const struct blorp_params *params)
-{
-   uint32_t cc_vp_offset;
-   blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
-      vp.MinimumDepth = 0.0;
-      vp.MaximumDepth = 1.0;
-   }
-
-#if GEN_GEN >= 7
-   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
-      vsp.CCViewportPointer = cc_vp_offset;
-   }
-#else
-   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
-      vsp.CCViewportStateChange = true;
-      vsp.PointertoCC_VIEWPORT = cc_vp_offset;
-   }
-#endif
-}
-
-
 /**
  * \brief Execute a blit or render pass operation.
  *
@@ -1319,10 +1523,6 @@
 static void
 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 {
-   uint32_t blend_state_offset = 0;
-   uint32_t color_calc_state_offset = 0;
-   uint32_t depth_stencil_state_offset;
-
 #if GEN_GEN >= 8
    if (params->hiz_op != BLORP_HIZ_OP_NONE) {
       blorp_emit_gen8_hiz_op(batch, params);
@@ -1333,93 +1533,19 @@
    blorp_emit_vertex_buffers(batch, params);
    blorp_emit_vertex_elements(batch, params);
 
-   emit_urb_config(batch, params);
-
-   if (params->wm_prog_data) {
-      blend_state_offset = blorp_emit_blend_state(batch, params);
-   }
-   color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
-   depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
-
-#if GEN_GEN <= 6
-   /* 3DSTATE_CC_STATE_POINTERS
-    *
-    * The pointer offsets are relative to
-    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
-    *
-    * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
-    *
-    * The dynamic state emit helpers emit their own STATE_POINTERS packets on
-    * gen7+.  However, on gen6 and earlier, they're all lumpped together in
-    * one CC_STATE_POINTERS packet so we have to emit that here.
-    */
-   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
-      cc.BLEND_STATEChange = true;
-      cc.COLOR_CALC_STATEChange = true;
-      cc.DEPTH_STENCIL_STATEChange = true;
-      cc.PointertoBLEND_STATE = blend_state_offset;
-      cc.PointertoCOLOR_CALC_STATE = color_calc_state_offset;
-      cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
-   }
-#else
-   (void)blend_state_offset;
-   (void)color_calc_state_offset;
-   (void)depth_stencil_state_offset;
-#endif
-
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
-#if GEN_GEN >= 7
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), DS);
-#endif
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
+   blorp_emit_pipeline(batch, params);
 
    blorp_emit_surface_states(batch, params);
 
-   if (params->src.enabled)
-      blorp_emit_sampler_state(batch, params);
-
-   blorp_emit_3dstate_multisample(batch, params);
-
-   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
-      mask.SampleMask = (1 << params->num_samples) - 1;
-   }
-
-   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
-    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
-    *
-    *   [DevSNB] A pipeline flush must be programmed prior to a
-    *   3DSTATE_VS command that causes the VS Function Enable to
-    *   toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
-    *   command with CS stall bit set and a post sync operation.
-    *
-    * We've already done one at the start of the BLORP operation.
-    */
-   blorp_emit_vs_config(batch, params);
-#if GEN_GEN >= 7
-   blorp_emit(batch, GENX(3DSTATE_HS), hs);
-   blorp_emit(batch, GENX(3DSTATE_TE), te);
-   blorp_emit(batch, GENX(3DSTATE_DS), DS);
-   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
-#endif
-   blorp_emit(batch, GENX(3DSTATE_GS), gs);
-
-   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
-      clip.PerspectiveDivideDisable = true;
-   }
-
-   blorp_emit_sf_config(batch, params);
-   blorp_emit_ps_config(batch, params);
-
-   blorp_emit_viewport_state(batch, params);
-
    if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
       blorp_emit_depth_stencil_config(batch, params);
 
    blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType = SEQUENTIAL;
       prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
+#if GEN_GEN >= 7
+      prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
+#endif
       prim.VertexCountPerInstance = 3;
       prim.InstanceCount = params->num_layers;
    }
diff --git a/src/intel/blorp/blorp_nir_builder.h b/src/intel/blorp/blorp_nir_builder.h
new file mode 100644
index 0000000..7f23abd
--- /dev/null
+++ b/src/intel/blorp/blorp_nir_builder.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BLORP_NIR_BUILDER_H
+#define BLORP_NIR_BUILDER_H
+
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * Declare a vec4 "gl_FragCoord" shader input at VARYING_SLOT_POS with an
+ * upper-left origin and return the SSA value loaded from it.
+ */
+static inline nir_ssa_def *
+blorp_nir_frag_coord(nir_builder *b)
+{
+   nir_variable *frag_coord =
+      nir_variable_create(b->shader, nir_var_shader_in,
+                          glsl_vec4_type(), "gl_FragCoord");
+
+   frag_coord->data.location = VARYING_SLOT_POS;
+   frag_coord->data.origin_upper_left = true;
+
+   return nir_load_var(b, frag_coord);
+}
+
+/**
+ * Emit a txf_ms_mcs texture instruction on texture/sampler index 0.
+ *
+ * \param xy_pos  Value whose first two channels supply the x/y coordinate.
+ * \param layer   Optional third coordinate.  When non-NULL the instruction
+ *                is flagged as an array access; when NULL it is not.
+ *
+ * \return The SSA destination of the inserted instruction.
+ */
+static inline nir_ssa_def *
+blorp_nir_txf_ms_mcs(nir_builder *b, nir_ssa_def *xy_pos, nir_ssa_def *layer)
+{
+   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
+   tex->op = nir_texop_txf_ms_mcs;
+   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
+   tex->dest_type = nir_type_int;
+
+   nir_ssa_def *coord;
+   if (layer) {
+      tex->is_array = true;
+      tex->coord_components = 3;
+      coord = nir_vec3(b, nir_channel(b, xy_pos, 0),
+                          nir_channel(b, xy_pos, 1),
+                          layer);
+   } else {
+      tex->is_array = false;
+      tex->coord_components = 2;
+      coord = nir_channels(b, xy_pos, 0x3);
+   }
+   tex->src[0].src_type = nir_tex_src_coord;
+   tex->src[0].src = nir_src_for_ssa(coord);
+
+   /* Blorp only has one texture and it's bound at unit 0 */
+   tex->texture_index = 0;
+   tex->sampler_index = 0;
+
+   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
+   nir_builder_instr_insert(b, &tex->instr);
+
+   return &tex->dest.ssa;
+}
+
+/**
+ * Build a comparison of an MCS value against the constant this code treats
+ * as "clear color" for the given sample count.
+ *
+ * \param mcs      MCS value; channel 0 is tested, plus channel 1 for 16x.
+ * \param samples  Sample count: 2, 4, 8 or 16.  Anything else is
+ *                 unreachable().
+ */
+static inline nir_ssa_def *
+blorp_nir_mcs_is_clear_color(nir_builder *b,
+                             nir_ssa_def *mcs,
+                             uint32_t samples)
+{
+   switch (samples) {
+   case 2:
+      /* Empirical evidence suggests that the value returned from the
+       * sampler is not always 0x3 for clear color so we need to mask it.
+       */
+      return nir_ieq(b, nir_iand(b, nir_channel(b, mcs, 0),
+                                    nir_imm_int(b, 0x3)),
+                    nir_imm_int(b, 0x3));
+
+   case 4:
+      return nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0xff));
+
+   case 8:
+      return nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, ~0));
+
+   case 16:
+      /* For 16x MSAA, the MCS is actually an ivec2 */
+      return nir_iand(b, nir_ieq(b, nir_channel(b, mcs, 0),
+                                    nir_imm_int(b, ~0)),
+                         nir_ieq(b, nir_channel(b, mcs, 1),
+                                    nir_imm_int(b, ~0)));
+
+   default:
+      unreachable("Invalid sample count");
+   }
+}
+
+#endif /* BLORP_NIR_BUILDER_H */
diff --git a/src/intel/blorp/blorp_priv.h b/src/intel/blorp/blorp_priv.h
index c61ab08..81bf8c6 100644
--- a/src/intel/blorp/blorp_priv.h
+++ b/src/intel/blorp/blorp_priv.h
@@ -71,6 +71,9 @@
                             const struct blorp_surf *surf,
                             unsigned int level, unsigned int layer,
                             enum isl_format format, bool is_render_target);
+void
+blorp_surf_convert_to_single_slice(const struct isl_device *isl_dev,
+                                   struct brw_blorp_surface_info *info);
 
 
 struct brw_blorp_coord_transform
@@ -124,6 +127,9 @@
    struct blorp_surf_offset src_offset;
    struct blorp_surf_offset dst_offset;
 
+   /* (1/width, 1/height) for the source surface */
+   float src_inv_size[2];
+
    /* Minimum layer setting works for all the textures types but texture_3d
     * for which the setting has no effect. Use the z-coordinate instead.
     */
@@ -179,6 +185,7 @@
    struct brw_blorp_surface_info src;
    struct brw_blorp_surface_info dst;
    enum blorp_hiz_op hiz_op;
+   bool full_surface_hiz_op;
    enum blorp_fast_clear_op fast_clear_op;
    bool color_write_disable[4];
    struct brw_blorp_wm_inputs wm_inputs;
@@ -188,6 +195,8 @@
    unsigned num_layers;
    uint32_t vs_prog_kernel;
    struct brw_vs_prog_data *vs_prog_data;
+   uint32_t sf_prog_kernel;
+   struct brw_sf_prog_data *sf_prog_data;
    uint32_t wm_prog_kernel;
    struct brw_wm_prog_data *wm_prog_data;
 
@@ -200,7 +209,9 @@
 enum blorp_shader_type {
    BLORP_SHADER_TYPE_BLIT,
    BLORP_SHADER_TYPE_CLEAR,
+   BLORP_SHADER_TYPE_MCS_PARTIAL_RESOLVE,
    BLORP_SHADER_TYPE_LAYER_OFFSET_VS,
+   BLORP_SHADER_TYPE_GEN4_SF,
 };
 
 struct brw_blorp_blit_prog_key
@@ -228,6 +239,9 @@
    /* Number of bits per channel in the source image. */
    uint8_t src_bpc;
 
+   /* True if the source requires normalized coordinates */
+   bool src_coords_normalized;
+
    /* Number of samples per pixel that have been configured in the render
     * target.
     */
@@ -321,7 +335,7 @@
 const unsigned *
 blorp_compile_fs(struct blorp_context *blorp, void *mem_ctx,
                  struct nir_shader *nir,
-                 const struct brw_wm_prog_key *wm_key,
+                 struct brw_wm_prog_key *wm_key,
                  bool use_repclear,
                  struct brw_wm_prog_data *wm_prog_data,
                  unsigned *program_size);
@@ -332,6 +346,10 @@
                  struct brw_vs_prog_data *vs_prog_data,
                  unsigned *program_size);
 
+bool
+blorp_ensure_sf_program(struct blorp_context *blorp,
+                        struct blorp_params *params);
+
 /** \} */
 
 #ifdef __cplusplus
diff --git a/src/intel/common/gen_clflush.h b/src/intel/common/gen_clflush.h
new file mode 100644
index 0000000..806564f
--- /dev/null
+++ b/src/intel/common/gen_clflush.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GEN_CLFLUSH_H
+#define GEN_CLFLUSH_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define CACHELINE_SIZE 64
+#define CACHELINE_MASK 63
+
+/**
+ * clflush every cacheline that overlaps [start, start + size).
+ *
+ * start is rounded down to a CACHELINE_SIZE boundary before flushing.  No
+ * fence is issued here; the two wrappers below add the mfence.
+ */
+static inline void
+gen_clflush_range(void *start, size_t size)
+{
+   char *p = (char *) (((uintptr_t) start) & ~CACHELINE_MASK);
+   char *end = (char*) start + size;
+
+   while (p < end) {
+      __builtin_ia32_clflush(p);
+      p += CACHELINE_SIZE;
+   }
+}
+
+/**
+ * Issue an mfence, then clflush the range.
+ */
+static inline void
+gen_flush_range(void *start, size_t size)
+{
+   __builtin_ia32_mfence();
+   gen_clflush_range(start, size);
+}
+
+/**
+ * clflush the range, clflush the cacheline holding its last byte a second
+ * time, then issue an mfence.  An empty range skips the second clflush.
+ */
+static inline void
+gen_invalidate_range(void *start, size_t size)
+{
+   gen_clflush_range(start, size);
+
+   /* Modern Atom CPUs (Baytrail+) have issues with clflush serialization,
+    * where mfence is not a sufficient synchronization barrier.  We must
+    * double clflush the last cacheline.  This guarantees it will be ordered
+    * after the preceding clflushes, and then the mfence guards against
+    * prefetches crossing the clflush boundary.
+    *
+    * See kernel commit 396f5d62d1a5fd99421855a08ffdef8edb43c76e
+    * ("drm: Restore double clflush on the last partial cacheline")
+    * and https://bugs.freedesktop.org/show_bug.cgi?id=92845.
+    *
+    * With size == 0 there is no last cacheline, and start + size - 1 would
+    * be the byte before start, outside the range, so skip the extra flush.
+    */
+   if (size > 0)
+      __builtin_ia32_clflush((char*) start + size - 1);
+   __builtin_ia32_mfence();
+}
+
+#endif
diff --git a/src/intel/common/gen_debug.c b/src/intel/common/gen_debug.c
index be6fcdb..b604d56 100644
--- a/src/intel/common/gen_debug.c
+++ b/src/intel/common/gen_debug.c
@@ -57,7 +57,6 @@
    { "vert",        DEBUG_VERTS },
    { "dri",         DEBUG_DRI },
    { "sf",          DEBUG_SF },
-   { "stats",       DEBUG_STATS },
    { "wm",          DEBUG_WM },
    { "urb",         DEBUG_URB },
    { "vs",          DEBUG_VS },
@@ -69,7 +68,6 @@
    { "optimizer",   DEBUG_OPTIMIZER },
    { "ann",         DEBUG_ANNOTATION },
    { "no8",         DEBUG_NO8 },
-   { "vec4",        DEBUG_VEC4VS },
    { "spill_fs",    DEBUG_SPILL_FS },
    { "spill_vec4",  DEBUG_SPILL_VEC4 },
    { "cs",          DEBUG_CS },
diff --git a/src/intel/common/gen_debug.h b/src/intel/common/gen_debug.h
index c0b74ea..d290303 100644
--- a/src/intel/common/gen_debug.h
+++ b/src/intel/common/gen_debug.h
@@ -57,7 +57,7 @@
 #define DEBUG_VERTS               (1ull << 13)
 #define DEBUG_DRI                 (1ull << 14)
 #define DEBUG_SF                  (1ull << 15)
-#define DEBUG_STATS               (1ull << 16)
+/* Hole - feel free to reuse      (1ull << 16) */
 #define DEBUG_WM                  (1ull << 17)
 #define DEBUG_URB                 (1ull << 18)
 #define DEBUG_VS                  (1ull << 19)
@@ -69,7 +69,7 @@
 #define DEBUG_OPTIMIZER           (1ull << 25)
 #define DEBUG_ANNOTATION          (1ull << 26)
 #define DEBUG_NO8                 (1ull << 27)
-#define DEBUG_VEC4VS              (1ull << 28)
+/* Hole - feel free to reuse      (1ull << 28) */
 #define DEBUG_SPILL_FS            (1ull << 29)
 #define DEBUG_SPILL_VEC4          (1ull << 30)
 #define DEBUG_CS                  (1ull << 31)
diff --git a/src/intel/common/gen_decoder.c b/src/intel/common/gen_decoder.c
index e3327d3..ba3a513 100644
--- a/src/intel/common/gen_decoder.c
+++ b/src/intel/common/gen_decoder.c
@@ -31,6 +31,7 @@
 #include <zlib.h>
 
 #include <util/macros.h>
+#include <util/ralloc.h>
 
 #include "gen_decoder.h"
 
@@ -38,6 +39,8 @@
 
 #define XML_BUFFER_SIZE 4096
 
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+
 #define MAKE_GEN(major, minor) ( ((major) << 8) | (minor) )
 
 struct gen_spec {
@@ -67,9 +70,6 @@
    struct gen_group *group;
    struct gen_enum *enoom;
 
-   int nfields;
-   struct gen_field *fields[128];
-
    int nvalues;
    struct gen_value *values[256];
 
@@ -178,8 +178,32 @@
    return fail_on_null(zalloc(s));
 }
 
+/* Parse the "count", "start" and "size" attributes of a <group> element.
+ * atts is a NULL-terminated list of name/value pairs; a count of 0 also
+ * sets *variable.  Outputs for attributes that are absent are not written.
+ */
+static void
+get_group_offset_count(const char **atts, uint32_t *offset, uint32_t *count,
+                       uint32_t *size, bool *variable)
+{
+   for (int i = 0; atts[i]; i += 2) {
+      if (strcmp(atts[i], "count") == 0) {
+         *count = strtoul(atts[i + 1], NULL, 0);
+         if (*count == 0)
+            *variable = true;
+      } else if (strcmp(atts[i], "start") == 0) {
+         *offset = strtoul(atts[i + 1], NULL, 0);
+      } else if (strcmp(atts[i], "size") == 0) {
+         *size = strtoul(atts[i + 1], NULL, 0);
+      }
+   }
+}
+
 static struct gen_group *
-create_group(struct parser_context *ctx, const char *name, const char **atts)
+create_group(struct parser_context *ctx,
+             const char *name,
+             const char **atts,
+             struct gen_group *parent)
 {
    struct gen_group *group;
 
@@ -190,6 +214,16 @@
    group->spec = ctx->spec;
    group->group_offset = 0;
    group->group_count = 0;
+   group->variable = false;
+
+   if (parent) {
+      group->parent = parent;
+      get_group_offset_count(atts,
+                             &group->group_offset,
+                             &group->group_count,
+                             &group->group_size,
+                             &group->variable);
+   }
 
    return group;
 }
@@ -209,22 +243,6 @@
 }
 
 static void
-get_group_offset_count(struct parser_context *ctx, const char *name,
-                       const char **atts, uint32_t *offset, uint32_t *count)
-{
-   char *p;
-   int i;
-
-   for (i = 0; atts[i]; i += 2) {
-      if (strcmp(atts[i], "count") == 0)
-         *count = strtoul(atts[i + 1], &p, 0);
-      else if (strcmp(atts[i], "start") == 0)
-         *offset = strtoul(atts[i + 1], &p, 0);
-   }
-   return;
-}
-
-static void
 get_register_offset(const char **atts, uint32_t *offset)
 {
    char *p;
@@ -328,11 +346,9 @@
       if (strcmp(atts[i], "name") == 0)
          field->name = xstrdup(atts[i + 1]);
       else if (strcmp(atts[i], "start") == 0)
-         field->start = ctx->group->group_offset+strtoul(atts[i + 1], &p, 0);
+         field->start = strtoul(atts[i + 1], &p, 0);
       else if (strcmp(atts[i], "end") == 0) {
-         field->end = ctx->group->group_offset+strtoul(atts[i + 1], &p, 0);
-         if (ctx->group->group_offset)
-            ctx->group->group_offset = field->end+1;
+         field->end = strtoul(atts[i + 1], &p, 0);
       } else if (strcmp(atts[i], "type") == 0)
          field->type = string_to_type(ctx, atts[i + 1]);
       else if (strcmp(atts[i], "default") == 0 &&
@@ -361,6 +377,21 @@
 }
 
 static void
+create_and_append_field(struct parser_context *ctx, const char **atts)
+{
+   /* Grow by doubling (minimum 2 slots) when full.  realloc goes through
+    * fail_on_null(), like this file's zalloc wrapper, not stored unchecked. */
+   if (ctx->group->nfields == ctx->group->fields_size) {
+      ctx->group->fields_size = MAX(ctx->group->fields_size * 2, 2);
+      ctx->group->fields =
+         fail_on_null(realloc(ctx->group->fields,
+                              sizeof(ctx->group->fields[0]) *
+                              ctx->group->fields_size));
+   }
+   ctx->group->fields[ctx->group->nfields++] = create_field(ctx, atts);
+}
+
+static void
 start_element(void *data, const char *element_name, const char **atts)
 {
    struct parser_context *ctx = data;
@@ -394,24 +425,27 @@
       ctx->spec->gen = MAKE_GEN(major, minor);
    } else if (strcmp(element_name, "instruction") == 0 ||
               strcmp(element_name, "struct") == 0) {
-      ctx->group = create_group(ctx, name, atts);
+      ctx->group = create_group(ctx, name, atts, NULL);
    } else if (strcmp(element_name, "register") == 0) {
-      ctx->group = create_group(ctx, name, atts);
+      ctx->group = create_group(ctx, name, atts, NULL);
       get_register_offset(atts, &ctx->group->register_offset);
    } else if (strcmp(element_name, "group") == 0) {
-      get_group_offset_count(ctx, name, atts, &ctx->group->group_offset,
-                             &ctx->group->group_count);
+      struct gen_group *previous_group = ctx->group;
+      while (previous_group->next)
+         previous_group = previous_group->next;
+
+      struct gen_group *group = create_group(ctx, "", atts, ctx->group);
+      previous_group->next = group;
+      ctx->group = group;
    } else if (strcmp(element_name, "field") == 0) {
-      do {
-         ctx->fields[ctx->nfields++] = create_field(ctx, atts);
-         if (ctx->group->group_count)
-            ctx->group->group_count--;
-      } while (ctx->group->group_count > 0);
+      create_and_append_field(ctx, atts);
    } else if (strcmp(element_name, "enum") == 0) {
       ctx->enoom = create_enum(ctx, name, atts);
    } else if (strcmp(element_name, "value") == 0) {
       ctx->values[ctx->nvalues++] = create_value(ctx, atts);
+      assert(ctx->nvalues < ARRAY_SIZE(ctx->values));
    }
+
 }
 
 static void
@@ -421,21 +455,16 @@
    struct gen_spec *spec = ctx->spec;
 
    if (strcmp(name, "instruction") == 0 ||
-      strcmp(name, "struct") == 0 ||
-      strcmp(name, "register") == 0) {
-      size_t size = ctx->nfields * sizeof(ctx->fields[0]);
+       strcmp(name, "struct") == 0 ||
+       strcmp(name, "register") == 0) {
       struct gen_group *group = ctx->group;
 
-      group->fields = xzalloc(size);
-      group->nfields = ctx->nfields;
-      memcpy(group->fields, ctx->fields, size);
-      ctx->nfields = 0;
-      ctx->group = NULL;
+      ctx->group = ctx->group->parent;
 
       for (int i = 0; i < group->nfields; i++) {
          if (group->fields[i]->start >= 16 &&
-            group->fields[i]->end <= 31 &&
-            group->fields[i]->has_default) {
+             group->fields[i]->end <= 31 &&
+             group->fields[i]->has_default) {
             group->opcode_mask |=
                mask(group->fields[i]->start % 32, group->fields[i]->end % 32);
             group->opcode |=
@@ -449,12 +478,15 @@
          spec->structs[spec->nstructs++] = group;
       else if (strcmp(name, "register") == 0)
          spec->registers[spec->nregisters++] = group;
+
+      assert(spec->ncommands < ARRAY_SIZE(spec->commands));
+      assert(spec->nstructs < ARRAY_SIZE(spec->structs));
+      assert(spec->nregisters < ARRAY_SIZE(spec->registers));
    } else if (strcmp(name, "group") == 0) {
-      ctx->group->group_offset = 0;
-      ctx->group->group_count = 0;
+      ctx->group = ctx->group->parent;
    } else if (strcmp(name, "field") == 0) {
-      assert(ctx->nfields > 0);
-      struct gen_field *field = ctx->fields[ctx->nfields - 1];
+      assert(ctx->group->nfields > 0);
+      struct gen_field *field = ctx->group->fields[ctx->group->nfields - 1];
       size_t size = ctx->nvalues * sizeof(ctx->values[0]);
       field->inline_enum.values = xzalloc(size);
       field->inline_enum.nvalues = ctx->nvalues;
@@ -636,11 +668,11 @@
    do {
       buf = XML_GetBuffer(ctx.parser, XML_BUFFER_SIZE);
       len = fread(buf, 1, XML_BUFFER_SIZE, input);
-      if (len < 0) {
+      if (ferror(input)) { /* len == 0 alone is plain EOF: final parse below */
          fprintf(stderr, "fread: %m\n");
-         fclose(input);
-         free(filename);
-         return NULL;
+         free(ctx.spec);
+         ctx.spec = NULL;
+         goto end;
       }
       if (XML_ParseBuffer(ctx.parser, len, len == 0) == 0) {
          fprintf(stderr,
@@ -648,12 +680,13 @@
                  XML_GetCurrentLineNumber(ctx.parser),
                  XML_GetCurrentColumnNumber(ctx.parser),
                  XML_ErrorString(XML_GetErrorCode(ctx.parser)));
-         fclose(input);
-         free(filename);
-         return NULL;
+         free(ctx.spec);
+         ctx.spec = NULL;
+         goto end;
       }
    } while (len > 0);
 
+ end:
    XML_ParserFree(ctx.parser);
 
    fclose(input);
@@ -690,12 +723,19 @@
       break;
    }
 
+   case 2: /* BLT */ {
+      return field(h, 0, 7) + 2;
+   }
+
    case 3: /* Render */ {
       uint32_t subtype = field(h, 27, 28);
       uint32_t opcode = field(h, 24, 26);
+      uint16_t whole_opcode = field(h, 16, 31);
       switch (subtype) {
       case 0:
-         if (opcode < 2)
+         if (whole_opcode == 0x6104 /* PIPELINE_SELECT_965 */)
+            return 1;
+         else if (opcode < 2)
             return field(h, 0, 7) + 2;
          else
             return -1;
@@ -713,7 +753,9 @@
             return -1;
       }
       case 3:
-         if (opcode < 4)
+         if (whole_opcode == 0x780b)
+            return 1;
+         else if (opcode < 4)
             return field(h, 0, 7) + 2;
          else
             return -1;
@@ -730,9 +772,10 @@
                         const uint32_t *p,
                         bool print_colors)
 {
+   memset(iter, 0, sizeof(*iter));
+
    iter->group = group;
    iter->p = p;
-   iter->i = 0;
    iter->print_colors = print_colors;
 }
 
@@ -747,6 +790,70 @@
    return NULL;
 }
 
+static bool
+iter_more_fields(const struct gen_field_iterator *iter)
+{  /* True while the current group still has fields left to visit. */
+   return iter->group->nfields > iter->field_iter;
+}
+
+static uint32_t
+iter_group_offset_bits(const struct gen_field_iterator *iter,
+                       uint32_t group_iter)
+{  /* Bit offset at which repetition number group_iter of the group starts. */
+   return group_iter * iter->group->group_size + iter->group->group_offset;
+}
+
+static bool
+iter_more_groups(const struct gen_field_iterator *iter)
+{
+   const struct gen_group *grp = iter->group;
+
+   if (!grp->variable)  /* fixed count: another repetition or a next group */
+      return (iter->group_iter + 1) < grp->group_count || grp->next != NULL;
+   /* variable count: the next repetition must start below length * 32 bits */
+   return iter_group_offset_bits(iter, iter->group_iter + 1) <
+          (gen_group_get_length(iter->group, iter->p) * 32);
+}
+
+static void
+iter_advance_group(struct gen_field_iterator *iter)
+{
+   /* Step to the next repetition of the current group (always possible for
+    * variable-length groups); once group_count is used up, switch to the
+    * following group instead. */
+   if (iter->group->variable ||
+       (iter->group_iter + 1) < iter->group->group_count) {
+      iter->group_iter++;
+   } else {
+      iter->group = iter->group->next;
+      iter->group_iter = 0;
+   }
+
+   iter->field_iter = 0;   /* restart at the first field */
+}
+
+static bool
+iter_advance_field(struct gen_field_iterator *iter)
+{
+   /* Skip exhausted groups; return false once nothing is left to visit. */
+   while (!iter_more_fields(iter)) {
+      if (!iter_more_groups(iter))
+         return false;
+      iter_advance_group(iter);
+   }
+
+   iter->field = iter->group->fields[iter->field_iter++];
+   if (iter->field->name)  /* snprintf always NUL-terminates; strncpy may not */
+      snprintf(iter->name, sizeof(iter->name), "%s", iter->field->name);
+   else
+      memset(iter->name, 0, sizeof(iter->name));
+   iter->dword = iter_group_offset_bits(iter, iter->group_iter) / 32 +
+      iter->field->start / 32;
+   iter->struct_desc = NULL;
+
+   return true;
+}
+
 bool
 gen_field_iterator_next(struct gen_field_iterator *iter)
 {
@@ -755,14 +862,9 @@
       float f;
    } v;
 
-   if (iter->i == iter->group->nfields)
+   if (!iter_advance_field(iter))
       return false;
 
-   iter->field = iter->group->fields[iter->i++];
-   iter->name = iter->field->name;
-   iter->dword = iter->field->start / 32;
-   iter->struct_desc = NULL;
-
    if ((iter->field->end - iter->field->start) > 32)
       v.qw = ((uint64_t) iter->p[iter->dword+1] << 32) | iter->p[iter->dword];
    else
@@ -827,6 +929,12 @@
    }
    }
 
+   if (strlen(iter->group->name) == 0) {
+      int length = strlen(iter->name);
+      snprintf(iter->name + length, sizeof(iter->name) - length,
+               "[%i]", iter->group_iter);
+   }
+
    if (enum_name) {
       int length = strlen(iter->value);
       snprintf(iter->value + length, sizeof(iter->value) - length,
@@ -875,7 +983,6 @@
          fprintf(outfile, "    %s: %s\n", iter.name, iter.value);
          if (iter.struct_desc) {
             uint64_t struct_offset = offset + 4 * iter.dword;
-            print_dword_header(outfile, &iter, struct_offset);
             gen_print_group(outfile, iter.struct_desc, struct_offset,
                             &p[iter.dword], color);
          }
diff --git a/src/intel/common/gen_decoder.h b/src/intel/common/gen_decoder.h
index 870bd7f..cfc9f2e 100644
--- a/src/intel/common/gen_decoder.h
+++ b/src/intel/common/gen_decoder.h
@@ -53,12 +53,15 @@
 
 struct gen_field_iterator {
    struct gen_group *group;
-   const char *name;
+   char name[128]; /**< copy of the field name; "[n]" appended for unnamed groups */
    char value[128];
    struct gen_group *struct_desc;
    const uint32_t *p;
    int dword; /**< current field starts at &p[dword] */
-   int i;
+
+   int field_iter; /**< index of the next field to visit in group->fields */
+   int group_iter; /**< index of the current repetition of the group */
+
    struct gen_field *field;
    bool print_colors;
 };
@@ -66,9 +69,17 @@
 struct gen_group {
    struct gen_spec *spec;
    char *name;
-   int nfields;
+
    struct gen_field **fields;
+   uint32_t nfields;
+   uint32_t fields_size;
+
    uint32_t group_offset, group_count;
+   uint32_t group_size;
+   bool variable;
+
+   struct gen_group *parent;
+   struct gen_group *next;
 
    uint32_t opcode_mask;
    uint32_t opcode;
diff --git a/src/intel/common/gen_device_info.c b/src/intel/common/gen_device_info.c
index e5c74c7..c0eb7c3 100644
--- a/src/intel/common/gen_device_info.c
+++ b/src/intel/common/gen_device_info.c
@@ -21,22 +21,26 @@
  * IN THE SOFTWARE.
  */
 
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "gen_device_info.h"
 #include "compiler/shader_enums.h"
+#include "util/macros.h"
 
 static const struct gen_device_info gen_device_info_i965 = {
    .gen = 4,
    .has_negative_rhw_bug = true,
    .num_slices = 1,
+   .num_subslices = { 1, },
+   .num_thread_per_eu = 4,
    .max_vs_threads = 16,
    .max_gs_threads = 2,
    .max_wm_threads = 8 * 4,
    .urb = {
       .size = 256,
    },
-   .timebase_scale = 80,
+   .timestamp_frequency = 12500000,
 };
 
 static const struct gen_device_info gen_device_info_g4x = {
@@ -46,13 +50,15 @@
    .has_surface_tile_offset = true,
    .is_g4x = true,
    .num_slices = 1,
+   .num_subslices = { 1, },
+   .num_thread_per_eu = 5,
    .max_vs_threads = 32,
    .max_gs_threads = 2,
    .max_wm_threads = 10 * 5,
    .urb = {
       .size = 384,
    },
-   .timebase_scale = 80,
+   .timestamp_frequency = 12500000,
 };
 
 static const struct gen_device_info gen_device_info_ilk = {
@@ -61,13 +67,15 @@
    .has_compr4 = true,
    .has_surface_tile_offset = true,
    .num_slices = 1,
+   .num_subslices = { 1, },
+   .num_thread_per_eu = 6,
    .max_vs_threads = 72,
    .max_gs_threads = 32,
    .max_wm_threads = 12 * 6,
    .urb = {
       .size = 1024,
    },
-   .timebase_scale = 80,
+   .timestamp_frequency = 12500000,
 };
 
 static const struct gen_device_info gen_device_info_snb_gt1 = {
@@ -79,6 +87,8 @@
    .has_surface_tile_offset = true,
    .needs_unlit_centroid_workaround = true,
    .num_slices = 1,
+   .num_subslices = { 1, },
+   .num_thread_per_eu = 6, /* Not confirmed */
    .max_vs_threads = 24,
    .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
    .max_wm_threads = 40,
@@ -92,7 +102,7 @@
          [MESA_SHADER_GEOMETRY] = 256,
       },
    },
-   .timebase_scale = 80,
+   .timestamp_frequency = 12500000,
 };
 
 static const struct gen_device_info gen_device_info_snb_gt2 = {
@@ -104,6 +114,8 @@
    .has_surface_tile_offset = true,
    .needs_unlit_centroid_workaround = true,
    .num_slices = 1,
+   .num_subslices = { 1, },
+   .num_thread_per_eu = 6, /* Not confirmed */
    .max_vs_threads = 60,
    .max_gs_threads = 60,
    .max_wm_threads = 80,
@@ -117,7 +129,7 @@
          [MESA_SHADER_GEOMETRY] = 256,
       },
    },
-   .timebase_scale = 80,
+   .timestamp_frequency = 12500000,
 };
 
 #define GEN7_FEATURES                               \
@@ -127,11 +139,13 @@
    .has_llc = true,                                 \
    .has_pln = true,                                 \
    .has_surface_tile_offset = true,                 \
-   .timebase_scale = 80
+   .timestamp_frequency = 12500000
 
 static const struct gen_device_info gen_device_info_ivb_gt1 = {
    GEN7_FEATURES, .is_ivybridge = true, .gt = 1,
    .num_slices = 1,
+   .num_subslices = { 1, },
+   .num_thread_per_eu = 6,
    .l3_banks = 2,
    .max_vs_threads = 36,
    .max_tcs_threads = 36,
@@ -157,6 +171,9 @@
 static const struct gen_device_info gen_device_info_ivb_gt2 = {
    GEN7_FEATURES, .is_ivybridge = true, .gt = 2,
    .num_slices = 1,
+   .num_subslices = { 1, },
+   .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
+                            * @max_wm_threads ... */
    .l3_banks = 4,
    .max_vs_threads = 128,
    .max_tcs_threads = 128,
@@ -182,6 +199,8 @@
 static const struct gen_device_info gen_device_info_byt = {
    GEN7_FEATURES, .is_baytrail = true, .gt = 1,
    .num_slices = 1,
+   .num_subslices = { 1, },
+   .num_thread_per_eu = 8,
    .l3_banks = 1,
    .has_llc = false,
    .max_vs_threads = 36,
@@ -214,6 +233,8 @@
 static const struct gen_device_info gen_device_info_hsw_gt1 = {
    HSW_FEATURES, .gt = 1,
    .num_slices = 1,
+   .num_subslices = { 1, },
+   .num_thread_per_eu = 7,
    .l3_banks = 2,
    .max_vs_threads = 70,
    .max_tcs_threads = 70,
@@ -239,6 +260,8 @@
 static const struct gen_device_info gen_device_info_hsw_gt2 = {
    HSW_FEATURES, .gt = 2,
    .num_slices = 1,
+   .num_subslices = { 2, },
+   .num_thread_per_eu = 7,
    .l3_banks = 4,
    .max_vs_threads = 280,
    .max_tcs_threads = 256,
@@ -264,6 +287,8 @@
 static const struct gen_device_info gen_device_info_hsw_gt3 = {
    HSW_FEATURES, .gt = 3,
    .num_slices = 2,
+   .num_subslices = { 2, },
+   .num_thread_per_eu = 7,
    .l3_banks = 8,
    .max_vs_threads = 280,
    .max_tcs_threads = 256,
@@ -300,11 +325,14 @@
    .max_tes_threads = 504,                          \
    .max_gs_threads = 504,                           \
    .max_wm_threads = 384,                           \
-   .timebase_scale = 80
+   .timestamp_frequency = 12500000
 
 static const struct gen_device_info gen_device_info_bdw_gt1 = {
    GEN8_FEATURES, .gt = 1,
+   .is_broadwell = true,
    .num_slices = 1,
+   .num_subslices = { 2, },
+   .num_thread_per_eu = 7,
    .l3_banks = 2,
    .max_cs_threads = 42,
    .urb = {
@@ -324,7 +352,10 @@
 
 static const struct gen_device_info gen_device_info_bdw_gt2 = {
    GEN8_FEATURES, .gt = 2,
+   .is_broadwell = true,
    .num_slices = 1,
+   .num_subslices = { 3, },
+   .num_thread_per_eu = 7,
    .l3_banks = 4,
    .max_cs_threads = 56,
    .urb = {
@@ -344,7 +375,10 @@
 
 static const struct gen_device_info gen_device_info_bdw_gt3 = {
    GEN8_FEATURES, .gt = 3,
+   .is_broadwell = true,
    .num_slices = 2,
+   .num_subslices = { 3, 3, },
+   .num_thread_per_eu = 7,
    .l3_banks = 8,
    .max_cs_threads = 56,
    .urb = {
@@ -366,6 +400,8 @@
    GEN8_FEATURES, .is_cherryview = 1, .gt = 1,
    .has_llc = false,
    .num_slices = 1,
+   .num_subslices = { 2, },
+   .num_thread_per_eu = 7,
    .l3_banks = 2,
    .max_vs_threads = 80,
    .max_tcs_threads = 80,
@@ -388,21 +424,14 @@
    }
 };
 
-#define GEN9_FEATURES                               \
+#define GEN9_HW_INFO                                \
    .gen = 9,                                        \
-   .has_hiz_and_separate_stencil = true,            \
-   .has_resource_streamer = true,                   \
-   .must_use_separate_stencil = true,               \
-   .has_llc = true,                                 \
-   .has_pln = true,                                 \
-   .supports_simd16_3src = true,                    \
-   .has_surface_tile_offset = true,                 \
    .max_vs_threads = 336,                           \
    .max_gs_threads = 336,                           \
    .max_tcs_threads = 336,                          \
    .max_tes_threads = 336,                          \
    .max_cs_threads = 56,                            \
-   .timebase_scale = 1000000000.0 / 12000000.0,     \
+   .timestamp_frequency = 12000000,                 \
    .urb = {                                         \
       .size = 384,                                  \
       .min_entries = {                              \
@@ -418,17 +447,18 @@
    }
 
 #define GEN9_LP_FEATURES                           \
-   GEN9_FEATURES,                                  \
-   .is_broxton = 1,                                \
+   GEN8_FEATURES,                                  \
+   GEN9_HW_INFO,                                   \
    .gt = 1,                                        \
    .has_llc = false,                               \
    .num_slices = 1,                                \
+   .num_thread_per_eu = 6,                         \
    .max_vs_threads = 112,                          \
    .max_tcs_threads = 112,                         \
    .max_tes_threads = 112,                         \
    .max_gs_threads = 112,                          \
    .max_cs_threads = 6 * 6,                        \
-   .timebase_scale = 1000000000.0 / 19200123.0,    \
+   .timestamp_frequency = 19200000,                \
    .urb = {                                        \
       .size = 192,                                 \
       .min_entries = {                             \
@@ -443,8 +473,13 @@
       },                                           \
    }
 
+#define GEN9_LP_FEATURES_3X6                       \
+   GEN9_LP_FEATURES,                               \
+   .num_subslices = { 3, }
+
 #define GEN9_LP_FEATURES_2X6                       \
    GEN9_LP_FEATURES,                               \
+   .num_subslices = { 2, },                        \
    .max_vs_threads = 56,                           \
    .max_tcs_threads = 56,                          \
    .max_tes_threads = 56,                          \
@@ -464,28 +499,41 @@
       },                                           \
    }
 
+#define GEN9_FEATURES                               \
+   GEN8_FEATURES,                                   \
+   GEN9_HW_INFO,                                    \
+   .num_thread_per_eu = 7
+
 static const struct gen_device_info gen_device_info_skl_gt1 = {
    GEN9_FEATURES, .gt = 1,
+   .is_skylake = true,
    .num_slices = 1,
+   .num_subslices = { 2, },
    .l3_banks = 2,
    .urb.size = 192,
 };
 
 static const struct gen_device_info gen_device_info_skl_gt2 = {
    GEN9_FEATURES, .gt = 2,
+   .is_skylake = true,
    .num_slices = 1,
+   .num_subslices = { 3, },
    .l3_banks = 4,
 };
 
 static const struct gen_device_info gen_device_info_skl_gt3 = {
    GEN9_FEATURES, .gt = 3,
+   .is_skylake = true,
    .num_slices = 2,
+   .num_subslices = { 3, 3, },
    .l3_banks = 8,
 };
 
 static const struct gen_device_info gen_device_info_skl_gt4 = {
    GEN9_FEATURES, .gt = 4,
+   .is_skylake = true,
    .num_slices = 3,
+   .num_subslices = { 3, 3, 3, },
    .l3_banks = 12,
    /* From the "L3 Allocation and Programming" documentation:
     *
@@ -499,12 +547,14 @@
 };
 
 static const struct gen_device_info gen_device_info_bxt = {
-   GEN9_LP_FEATURES,
+   GEN9_LP_FEATURES_3X6,
+   .is_broxton = true,
    .l3_banks = 2,
 };
 
 static const struct gen_device_info gen_device_info_bxt_2x6 = {
    GEN9_LP_FEATURES_2X6,
+   .is_broxton = true,
    .l3_banks = 1,
 };
 /*
@@ -520,6 +570,7 @@
    .max_cs_threads = 7 * 6,
    .urb.size = 192,
    .num_slices = 1,
+   .num_subslices = { 2, },
    .l3_banks = 2,
 };
 
@@ -530,6 +581,7 @@
 
    .max_cs_threads = 7 * 6,
    .num_slices = 1,
+   .num_subslices = { 3, },
    .l3_banks = 4,
 };
 
@@ -539,6 +591,7 @@
    .gt = 2,
 
    .num_slices = 1,
+   .num_subslices = { 3, },
    .l3_banks = 4,
 };
 
@@ -548,6 +601,7 @@
    .gt = 3,
 
    .num_slices = 2,
+   .num_subslices = { 3, 3, },
    .l3_banks = 8,
 };
 
@@ -568,17 +622,106 @@
     */
    .urb.size = 1008 / 3,
    .num_slices = 3,
+   .num_subslices = { 3, 3, 3, },
    .l3_banks = 12,
 };
 
 static const struct gen_device_info gen_device_info_glk = {
-   GEN9_LP_FEATURES,
+   GEN9_LP_FEATURES_3X6,
+   .is_geminilake = true,
    .l3_banks = 2,
 };
 
 /*TODO: Initialize l3_banks when we know the number. */
 static const struct gen_device_info gen_device_info_glk_2x6 = {
-   GEN9_LP_FEATURES_2X6
+   GEN9_LP_FEATURES_2X6,
+   .is_geminilake = true,
+};
+
+static const struct gen_device_info gen_device_info_cfl_gt1 = {
+   GEN9_FEATURES,
+   .is_coffeelake = true,
+   .gt = 1,
+
+   .num_slices = 1,
+   .num_subslices = { 2, },
+   .l3_banks = 2,
+};
+static const struct gen_device_info gen_device_info_cfl_gt2 = {
+   GEN9_FEATURES,
+   .is_coffeelake = true,
+   .gt = 2,
+
+   .num_slices = 1,
+   .num_subslices = { 3, },
+   .l3_banks = 4,
+};
+
+static const struct gen_device_info gen_device_info_cfl_gt3 = {
+   GEN9_FEATURES,
+   .is_coffeelake = true,
+   .gt = 3,
+
+   .num_slices = 2,
+   .num_subslices = { 3, 3, },
+   .l3_banks = 8,
+};
+
+#define GEN10_HW_INFO                               \
+   .gen = 10,                                       \
+   .num_thread_per_eu = 7,                          \
+   .max_vs_threads = 728,                           \
+   .max_gs_threads = 432,                           \
+   .max_tcs_threads = 432,                          \
+   .max_tes_threads = 624,                          \
+   .max_cs_threads = 56,                            \
+   .timestamp_frequency = 19200000,                 \
+   .urb = {                                         \
+      .size = 256,                                  \
+      .min_entries = {                              \
+         [MESA_SHADER_VERTEX]    = 64,              \
+         [MESA_SHADER_TESS_EVAL] = 34,              \
+      },                                            \
+      .max_entries = {                              \
+      [MESA_SHADER_VERTEX]       = 3936,            \
+      [MESA_SHADER_TESS_CTRL]    = 896,             \
+      [MESA_SHADER_TESS_EVAL]    = 2064,            \
+      [MESA_SHADER_GEOMETRY]     = 832,             \
+      },                                            \
+   }
+
+#define subslices(...) { __VA_ARGS__, } /* per-slice subslice counts initializer */
+
+#define GEN10_FEATURES(_gt, _slices, _subslices, _l3) \
+   GEN8_FEATURES,                                   \
+   GEN10_HW_INFO,                                   \
+   .gt = _gt,                                       \
+   .num_slices = _slices,                           \
+   .num_subslices = _subslices,                     \
+   .l3_banks = _l3
+
+static const struct gen_device_info gen_device_info_cnl_2x8 = {
+   /* GT0.5 */
+   GEN10_FEATURES(1, 1, subslices(2), 2),
+   .is_cannonlake = true,
+};
+
+static const struct gen_device_info gen_device_info_cnl_3x8 = {
+   /* GT1 */
+   GEN10_FEATURES(1, 1, subslices(3), 3),
+   .is_cannonlake = true,
+};
+
+static const struct gen_device_info gen_device_info_cnl_4x8 = {
+   /* GT 1.5 */
+   GEN10_FEATURES(1, 2, subslices(2, 2), 6),
+   .is_cannonlake = true,
+};
+
+static const struct gen_device_info gen_device_info_cnl_5x8 = {
+   /* GT2 */
+   GEN10_FEATURES(2, 2, subslices(3, 2), 6),
+   .is_cannonlake = true,
 };
 
 bool
@@ -613,6 +756,8 @@
                               * 4; /* effective subslices per slice */
    }
 
+   assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));
+
    return true;
 }
 
diff --git a/src/intel/common/gen_device_info.h b/src/intel/common/gen_device_info.h
index 6207630..ede4915 100644
--- a/src/intel/common/gen_device_info.h
+++ b/src/intel/common/gen_device_info.h
@@ -26,6 +26,7 @@
 #define GEN_DEVICE_INFO_H
 
 #include <stdbool.h>
+#include <stdint.h>
 
 /**
  * Intel hardware information and quirks
@@ -39,9 +40,14 @@
    bool is_ivybridge;
    bool is_baytrail;
    bool is_haswell;
+   bool is_broadwell;
    bool is_cherryview;
+   bool is_skylake;
    bool is_broxton;
    bool is_kabylake;
+   bool is_geminilake;
+   bool is_coffeelake;
+   bool is_cannonlake;
 
    bool has_hiz_and_separate_stencil;
    bool must_use_separate_stencil;
@@ -96,6 +102,17 @@
     * to change, so we program @max_cs_threads as the lower maximum.
     */
    unsigned num_slices;
+
+   /**
+    * Number of subslices for each slice (used to be uniform until CNL).
+    */
+   unsigned num_subslices[3];
+
+   /**
+    * Number of threads per eu, varies between 4 and 8 between generations.
+    */
+   unsigned num_thread_per_eu;
+
    unsigned l3_banks;
    unsigned max_vs_threads;   /**< Maximum Vertex Shader threads */
    unsigned max_tcs_threads;  /**< Maximum Hull Shader threads */
@@ -155,7 +172,7 @@
     * corresponded to 80 nanoseconds.
     *
     * Since Gen9 the numbers aren't so round, with a a frequency of 12MHz for
-    * SKL (or scale factor of 83.33333333) and a frequency of 19200123Hz for
+    * SKL (or scale factor of 83.33333333) and a frequency of 19200000Hz for
     * BXT.
     *
     * For simplicty to fit with the current code scaling by a single constant
@@ -170,11 +187,14 @@
     * E.g. with crude testing on my system using the 'correct' scale factor I'm
     * seeing a drift of ~2 milliseconds per second.
     */
-   double timebase_scale;
+   uint64_t timestamp_frequency;
 
    /** @} */
 };
 
+#define gen_device_info_is_9lp(devinfo) /* true for Broxton and Geminilake */ \
+   ((devinfo)->is_broxton || (devinfo)->is_geminilake)
+
 bool gen_get_device_info(int devid, struct gen_device_info *devinfo);
 const char *gen_get_device_name(int devid);
 
diff --git a/src/intel/common/gen_l3_config.c b/src/intel/common/gen_l3_config.c
index b67134f..aff13c0 100644
--- a/src/intel/common/gen_l3_config.c
+++ b/src/intel/common/gen_l3_config.c
@@ -116,6 +116,23 @@
 };
 
 /**
+ * CNL validated L3 configurations.  \sa ivb_l3_configs.
+ */
+static const struct gen_l3_config cnl_l3_configs[] = {
+   /* SLM URB ALL DC  RO  IS   C   T */
+   {{  0, 64, 64,  0,  0,  0,  0,  0 }},
+   {{  0, 64,  0, 16, 48,  0,  0,  0 }},
+   {{  0, 48,  0, 16, 64,  0,  0,  0 }},
+   {{  0, 32,  0,  0, 96,  0,  0,  0 }},
+   {{  0, 32, 96,  0,  0,  0,  0,  0 }},
+   {{  0, 32,  0, 16, 80,  0,  0,  0 }},
+   {{ 32, 16, 80,  0,  0,  0,  0,  0 }},
+   {{ 32, 16,  0, 64, 16,  0,  0,  0 }},
+   {{ 32,  0, 96,  0,  0,  0,  0,  0 }},
+   {{ 0 }}
+};
+
+/**
  * Return a zero-terminated array of validated L3 configurations for the
  * specified device.
  */
@@ -131,9 +148,12 @@
 
    case 9:
       if (devinfo->l3_banks == 1)
-	 return bxt_2x6_l3_configs;
+         return bxt_2x6_l3_configs;
       return chv_l3_configs;
 
+   case 10:
+      return cnl_l3_configs;
+
    default:
       unreachable("Not implemented");
    }
@@ -270,16 +290,11 @@
 static unsigned
 get_l3_way_size(const struct gen_device_info *devinfo)
 {
-   if (devinfo->is_baytrail)
-      return 2;
+   const unsigned way_size_per_bank =
+      devinfo->gen >= 9 && devinfo->l3_banks == 1 ? 4 : 2;
 
-   else if (devinfo->gt == 1 ||
-            devinfo->is_cherryview ||
-            devinfo->is_broxton)
-      return 4;
-
-   else
-      return 8 * devinfo->num_slices;
+   assert(devinfo->l3_banks);
+   return way_size_per_bank * devinfo->l3_banks;
 }
 
 /**
diff --git a/src/intel/compiler/BUILD.gn b/src/intel/compiler/BUILD.gn
index 6eeb0ab..fa3a7c8 100644
--- a/src/intel/compiler/BUILD.gn
+++ b/src/intel/compiler/BUILD.gn
@@ -24,7 +24,6 @@
 config("compiler_config") {
   include_dirs = [
     ".",
-    "$magma_build_root/third_party/libdrm",
     "$mesa_build_root/src/mesa",  # because brw_compiler.h includes main/mtypes.h
     "$mesa_build_root/src/gallium/auxiliary",  # because main/macros.h includes gallium/auxiliary/util/u_math.h
     "$mesa_build_root/src/gallium/include",  # because u_math.h includes pipe/p_compiler.h
@@ -54,6 +53,7 @@
     "brw_cfg.h",
     "brw_compiler.c",
     "brw_compiler.h",
+    "brw_compile_sf.c",
     "brw_dead_control_flow.cpp",
     "brw_dead_control_flow.h",
     "brw_disasm.c",
diff --git a/src/mesa/drivers/dri/i965/brw_clip.h b/src/intel/compiler/brw_clip.h
similarity index 85%
rename from src/mesa/drivers/dri/i965/brw_clip.h
rename to src/intel/compiler/brw_clip.h
index 9dc1f12..bdf7ee2 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.h
+++ b/src/intel/compiler/brw_clip.h
@@ -32,9 +32,8 @@
 #ifndef BRW_CLIP_H
 #define BRW_CLIP_H
 
-
-#include "brw_context.h"
-#include "compiler/brw_eu.h"
+#include "brw_compiler.h"
+#include "brw_eu.h"
 
 /* Initial 3 verts, plus at most 6 additional verts from intersections
  * with fixed planes, plus at most 8 additional verts from intersections
@@ -42,38 +41,6 @@
  */
 #define MAX_VERTS (3+6+8)
 
-/* Note that if unfilled primitives are being emitted, we have to fix
- * up polygon offset and flatshading at this point:
- */
-struct brw_clip_prog_key {
-   GLbitfield64 attrs;
-   bool contains_flat_varying;
-   bool contains_noperspective_varying;
-   const unsigned char *interp_mode;
-   GLuint primitive:4;
-   GLuint nr_userclip:4;
-   GLuint pv_first:1;
-   GLuint do_unfilled:1;
-   GLuint fill_cw:2;		/* includes cull information */
-   GLuint fill_ccw:2;		/* includes cull information */
-   GLuint offset_cw:1;
-   GLuint offset_ccw:1;
-   GLuint copy_bfc_cw:1;
-   GLuint copy_bfc_ccw:1;
-   GLuint clip_mode:3;
-
-   GLfloat offset_factor;
-   GLfloat offset_units;
-   GLfloat offset_clamp;
-};
-
-
-#define CLIP_LINE   0
-#define CLIP_POINT  1
-#define CLIP_FILL   2
-#define CLIP_CULL   3
-
-
 #define PRIM_MASK  (0x1f)
 
 struct brw_clip_compile {
@@ -192,4 +159,5 @@
              struct brw_reg pos );
 void brw_clip_ff_sync(struct brw_clip_compile *c);
 void brw_clip_init_ff_sync(struct brw_clip_compile *c);
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/intel/compiler/brw_clip_line.c
similarity index 98%
rename from src/mesa/drivers/dri/i965/brw_clip_line.c
rename to src/intel/compiler/brw_clip_line.c
index 788dc96..37f2266 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/intel/compiler/brw_clip_line.c
@@ -33,14 +33,8 @@
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
-#include "brw_defines.h"
-#include "brw_context.h"
 #include "brw_clip.h"
 
-
-
 static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
 {
    const struct gen_device_info *devinfo = c->func.devinfo;
diff --git a/src/mesa/drivers/dri/i965/brw_clip_point.c b/src/intel/compiler/brw_clip_point.c
similarity index 94%
rename from src/mesa/drivers/dri/i965/brw_clip_point.c
rename to src/intel/compiler/brw_clip_point.c
index bdbf969..ac8f315 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_point.c
+++ b/src/intel/compiler/brw_clip_point.c
@@ -33,10 +33,6 @@
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
-#include "brw_defines.h"
-#include "brw_context.h"
 #include "brw_clip.h"
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/intel/compiler/brw_clip_tri.c
similarity index 98%
rename from src/mesa/drivers/dri/i965/brw_clip_tri.c
rename to src/intel/compiler/brw_clip_tri.c
index d98e1cc..8ccf9e4 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/intel/compiler/brw_clip_tri.c
@@ -33,10 +33,6 @@
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
-#include "brw_defines.h"
-#include "brw_context.h"
 #include "brw_clip.h"
 
 static void release_tmps( struct brw_clip_compile *c )
@@ -652,8 +648,8 @@
    if (c->key.contains_flat_varying)
       brw_clip_tri_flat_shade(c);
 
-   if ((c->key.clip_mode == BRW_CLIPMODE_NORMAL) ||
-       (c->key.clip_mode == BRW_CLIPMODE_KERNEL_CLIP))
+   if ((c->key.clip_mode == BRW_CLIP_MODE_NORMAL) ||
+       (c->key.clip_mode == BRW_CLIP_MODE_KERNEL_CLIP))
       do_clip_tri(c);
    else
       maybe_do_clip_tri(c);
diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/intel/compiler/brw_clip_unfilled.c
similarity index 94%
rename from src/mesa/drivers/dri/i965/brw_clip_unfilled.c
rename to src/intel/compiler/brw_clip_unfilled.c
index 65ccf33..83f9447 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
+++ b/src/intel/compiler/brw_clip_unfilled.c
@@ -33,14 +33,9 @@
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
-#include "brw_defines.h"
-#include "brw_context.h"
 #include "brw_clip.h"
 
 
-
 /* This is performed against the original triangles, so no indirection
  * required:
 BZZZT!
@@ -99,10 +94,10 @@
    struct brw_codegen *p = &c->func;
    GLuint conditional;
 
-   assert (!(c->key.fill_ccw == CLIP_CULL &&
-	     c->key.fill_cw == CLIP_CULL));
+   assert (!(c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL &&
+	     c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL));
 
-   if (c->key.fill_ccw == CLIP_CULL)
+   if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL)
       conditional = BRW_CONDITIONAL_GE;
    else
       conditional = BRW_CONDITIONAL_L;
@@ -405,19 +400,19 @@
 			     bool do_offset )
 {
    switch (mode) {
-   case CLIP_FILL:
+   case BRW_CLIP_FILL_MODE_FILL:
       brw_clip_tri_emit_polygon(c);
       break;
 
-   case CLIP_LINE:
+   case BRW_CLIP_FILL_MODE_LINE:
       emit_lines(c, do_offset);
       break;
 
-   case CLIP_POINT:
+   case BRW_CLIP_FILL_MODE_POINT:
       emit_points(c, do_offset);
       break;
 
-   case CLIP_CULL:
+   case BRW_CLIP_FILL_MODE_CULL:
       unreachable("not reached");
    }
 }
@@ -431,8 +426,8 @@
    /* Direction culling has already been done.
     */
    if (c->key.fill_ccw != c->key.fill_cw &&
-       c->key.fill_ccw != CLIP_CULL &&
-       c->key.fill_cw != CLIP_CULL)
+       c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL &&
+       c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL)
    {
       brw_CMP(p,
 	      vec1(brw_null_reg()),
@@ -450,10 +445,10 @@
       }
       brw_ENDIF(p);
    }
-   else if (c->key.fill_cw != CLIP_CULL) {
+   else if (c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL) {
       emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
    }
-   else if (c->key.fill_ccw != CLIP_CULL) {
+   else if (c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL) {
       emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
    }
 }
@@ -480,8 +475,8 @@
 
    c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) ||
 			(c->key.fill_ccw != c->key.fill_cw) ||
-			c->key.fill_ccw == CLIP_CULL ||
-			c->key.fill_cw == CLIP_CULL ||
+			c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL ||
+			c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL ||
 			c->key.copy_bfc_cw ||
 			c->key.copy_bfc_ccw);
 
@@ -491,8 +486,8 @@
 
    assert(brw_clip_have_varying(c, VARYING_SLOT_EDGE));
 
-   if (c->key.fill_ccw == CLIP_CULL &&
-       c->key.fill_cw == CLIP_CULL) {
+   if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL &&
+       c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL) {
       brw_clip_kill_thread(c);
       return;
    }
@@ -504,8 +499,8 @@
    if (c->need_direction)
       compute_tri_direction(c);
 
-   if (c->key.fill_ccw == CLIP_CULL ||
-       c->key.fill_cw == CLIP_CULL)
+   if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL ||
+       c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL)
       cull_direction(c);
 
    if (c->key.offset_ccw ||
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/intel/compiler/brw_clip_util.c
similarity index 98%
rename from src/mesa/drivers/dri/i965/brw_clip_util.c
rename to src/intel/compiler/brw_clip_util.c
index e0fdd3d..e01fbc6 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/intel/compiler/brw_clip_util.c
@@ -34,15 +34,9 @@
 #include "main/enums.h"
 #include "program/program.h"
 
-#include "intel_batchbuffer.h"
-
-#include "brw_defines.h"
-#include "brw_context.h"
 #include "brw_clip.h"
 
 
-
-
 struct brw_reg get_tmp( struct brw_clip_compile *c )
 {
    struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0);
diff --git a/src/intel/compiler/brw_compile_clip.c b/src/intel/compiler/brw_compile_clip.c
new file mode 100644
index 0000000..83788e4
--- /dev/null
+++ b/src/intel/compiler/brw_compile_clip.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright © 2006 - 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_clip.h"
+
+#include "common/gen_debug.h"
+
+const unsigned *
+brw_compile_clip(const struct brw_compiler *compiler,
+                 void *mem_ctx,
+                 const struct brw_clip_prog_key *key,
+                 struct brw_clip_prog_data *prog_data,
+                 struct brw_vue_map *vue_map,
+                 unsigned *final_assembly_size)
+{
+   struct brw_clip_compile c;
+   memset(&c, 0, sizeof(c));
+
+   /* Begin the compilation:
+    */
+   brw_init_codegen(compiler->devinfo, &c.func, mem_ctx);
+
+   c.func.single_program_flow = 1;
+
+   c.key = *key;
+   c.vue_map = *vue_map;
+
+   /* nr_regs is the number of registers filled by reading data from the VUE.
+    * This program accesses the entire VUE, so nr_regs needs to be the size of
+    * the VUE (measured in pairs, since two slots are stored in each
+    * register).
+    */
+   c.nr_regs = (c.vue_map.num_slots + 1)/2;
+
+   c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
+
+   /* For some reason the thread is spawned with only 4 channels
+    * unmasked.
+    */
+   brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);
+
+   /* Would ideally have the option of producing a program which could
+    * do all three:
+    */
+   switch (key->primitive) {
+   case GL_TRIANGLES:
+      if (key->do_unfilled)
+	 brw_emit_unfilled_clip( &c );
+      else
+	 brw_emit_tri_clip( &c );
+      break;
+   case GL_LINES:
+      brw_emit_line_clip( &c );
+      break;
+   case GL_POINTS:
+      brw_emit_point_clip( &c );
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   brw_compact_instructions(&c.func, 0, 0, NULL);
+
+   *prog_data = c.prog_data;
+
+   const unsigned *program = brw_get_program(&c.func, final_assembly_size);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_CLIP)) {
+      fprintf(stderr, "clip:\n");
+      brw_disassemble(compiler->devinfo,
+                      program, 0, *final_assembly_size, stderr);
+      fprintf(stderr, "\n");
+   }
+
+   return program;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_sf_emit.c b/src/intel/compiler/brw_compile_sf.c
similarity index 78%
rename from src/mesa/drivers/dri/i965/brw_sf_emit.c
rename to src/intel/compiler/brw_compile_sf.c
index dc90503..91e8a6d 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_emit.c
+++ b/src/intel/compiler/brw_compile_sf.c
@@ -1,45 +1,73 @@
 /*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
+ * Copyright © 2006 - 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
+#include "brw_compiler.h"
+#include "brw_eu.h"
 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
+#include "common/gen_debug.h"
 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+struct brw_sf_compile {
+   struct brw_codegen func;
+   struct brw_sf_prog_key key;
+   struct brw_sf_prog_data prog_data;
 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
+   struct brw_reg pv;
+   struct brw_reg det;
+   struct brw_reg dx0;
+   struct brw_reg dx2;
+   struct brw_reg dy0;
+   struct brw_reg dy2;
 
+   /* z and 1/w passed in separately:
+    */
+   struct brw_reg z[3];
+   struct brw_reg inv_w[3];
 
-#include "main/macros.h"
-#include "main/enums.h"
+   /* The vertices:
+    */
+   struct brw_reg vert[3];
 
-#include "intel_batchbuffer.h"
+    /* Temporaries, allocated after last vertex reg.
+    */
+   struct brw_reg inv_det;
+   struct brw_reg a1_sub_a0;
+   struct brw_reg a2_sub_a0;
+   struct brw_reg tmp;
 
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_util.h"
-#include "brw_sf.h"
+   struct brw_reg m1Cx;
+   struct brw_reg m2Cy;
+   struct brw_reg m3C0;
 
+   GLuint nr_verts;
+   GLuint nr_attr_regs;
+   GLuint nr_setup_regs;
+   int urb_entry_read_offset;
+
+   /** The last known value of the f0.0 flag register. */
+   unsigned flag_value;
+
+   struct brw_vue_map vue_map;
+};
 
 /**
  * Determine the vue slot corresponding to the given half of the given register.
@@ -119,7 +147,7 @@
 
    /* Already done in clip program:
     */
-   if (c->key.primitive == SF_UNFILLED_TRIS)
+   if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
       return;
 
    /* If the vertex shader provides backface color, do the selection. The VS
@@ -195,7 +223,7 @@
 
    /* Already done in clip program:
     */
-   if (c->key.primitive == SF_UNFILLED_TRIS)
+   if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
       return;
 
    if (p->devinfo->gen == 5)
@@ -227,7 +255,7 @@
 
    /* Already done in clip program:
     */
-   if (c->key.primitive == SF_UNFILLED_TRIS)
+   if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
       return;
 
    if (p->devinfo->gen == 5)
@@ -410,7 +438,7 @@
    }
 }
 
-void brw_emit_tri_setup(struct brw_sf_compile *c, bool allocate)
+static void brw_emit_tri_setup(struct brw_sf_compile *c, bool allocate)
 {
    struct brw_codegen *p = &c->func;
    GLuint i;
@@ -499,7 +527,7 @@
 
 
 
-void brw_emit_line_setup(struct brw_sf_compile *c, bool allocate)
+static void brw_emit_line_setup(struct brw_sf_compile *c, bool allocate)
 {
    struct brw_codegen *p = &c->func;
    GLuint i;
@@ -539,7 +567,7 @@
 
 	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
 
- 	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
+	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
 	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
 
 	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
@@ -571,7 +599,7 @@
    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 }
 
-void brw_emit_point_sprite_setup(struct brw_sf_compile *c, bool allocate)
+static void brw_emit_point_sprite_setup(struct brw_sf_compile *c, bool allocate)
 {
    struct brw_codegen *p = &c->func;
    GLuint i;
@@ -663,7 +691,7 @@
 /* Points setup - several simplifications as all attributes are
  * constant across the face of the point (point sprites excluded!)
  */
-void brw_emit_point_setup(struct brw_sf_compile *c, bool allocate)
+static void brw_emit_point_setup(struct brw_sf_compile *c, bool allocate)
 {
    struct brw_codegen *p = &c->func;
    GLuint i;
@@ -722,7 +750,7 @@
    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 }
 
-void brw_emit_anyprim_setup( struct brw_sf_compile *c )
+static void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 {
    struct brw_codegen *p = &c->func;
    struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
@@ -771,6 +799,81 @@
    brw_emit_point_setup( c, false );
 }
 
+const unsigned *
+brw_compile_sf(const struct brw_compiler *compiler,
+               void *mem_ctx,
+               const struct brw_sf_prog_key *key,
+               struct brw_sf_prog_data *prog_data,
+               struct brw_vue_map *vue_map,
+               unsigned *final_assembly_size)
+{
+   struct brw_sf_compile c;
+   memset(&c, 0, sizeof(c));
 
+   /* Begin the compilation:
+    */
+   brw_init_codegen(compiler->devinfo, &c.func, mem_ctx);
 
+   c.key = *key;
+   c.vue_map = *vue_map;
+   if (c.key.do_point_coord) {
+      /*
+       * gl_PointCoord is an FS built-in variable rather than a VS one, so
+       * it is not included in the c.vue_map generated in the VS stage. Add
+       * it manually here so the SF shader generates the interpolation
+       * coefficient the FS needs.
+       */
+      c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots;
+      c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC;
+   }
+   c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
+   c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset;
+   c.nr_setup_regs = c.nr_attr_regs;
 
+   c.prog_data.urb_read_length = c.nr_attr_regs;
+   c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
+
+   /* Which primitive?  Or all three?
+    */
+   switch (key->primitive) {
+   case BRW_SF_PRIM_TRIANGLES:
+      c.nr_verts = 3;
+      brw_emit_tri_setup( &c, true );
+      break;
+   case BRW_SF_PRIM_LINES:
+      c.nr_verts = 2;
+      brw_emit_line_setup( &c, true );
+      break;
+   case BRW_SF_PRIM_POINTS:
+      c.nr_verts = 1;
+      if (key->do_point_sprite)
+	  brw_emit_point_sprite_setup( &c, true );
+      else
+	  brw_emit_point_setup( &c, true );
+      break;
+   case BRW_SF_PRIM_UNFILLED_TRIS:
+      c.nr_verts = 3;
+      brw_emit_anyprim_setup( &c );
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   /* FINISHME: SF programs use calculated jumps (i.e., JMPI with a register
+    * source). Compacting would be difficult.
+    */
+   /* brw_compact_instructions(&c.func, 0, 0, NULL); */
+
+   *prog_data = c.prog_data;
+
+   const unsigned *program = brw_get_program(&c.func, final_assembly_size);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_SF)) {
+      fprintf(stderr, "sf:\n");
+      brw_disassemble(compiler->devinfo,
+                      program, 0, *final_assembly_size, stderr);
+      fprintf(stderr, "\n");
+   }
+
+   return program;
+}
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index cd9473f..2f6af7d 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -57,6 +57,8 @@
    .lower_unpack_snorm_4x8 = true,
    .lower_unpack_unorm_2x16 = true,
    .lower_unpack_unorm_4x8 = true,
+   .lower_subgroup_masks = true,
+   .max_subgroup_size = 32,
    .max_unroll_iterations = 32,
 };
 
@@ -78,6 +80,7 @@
    .lower_unpack_unorm_2x16 = true,
    .lower_extract_byte = true,
    .lower_extract_word = true,
+   .lower_vote_trivial = true,
    .max_unroll_iterations = 32,
 };
 
@@ -96,6 +99,7 @@
    .lower_unpack_unorm_2x16 = true,
    .lower_extract_byte = true,
    .lower_extract_word = true,
+   .lower_vote_trivial = true,
    .max_unroll_iterations = 32,
 };
 
@@ -112,16 +116,22 @@
 
    compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false);
 
-   compiler->scalar_stage[MESA_SHADER_VERTEX] =
-      devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
-   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", true);
-   compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
-   compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", true);
-   compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
-   compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;
+   if (devinfo->gen >= 10) {
+      /* We don't support vec4 mode on Cannonlake. */
+      for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
+         compiler->scalar_stage[i] = true;
+   } else {
+      compiler->scalar_stage[MESA_SHADER_VERTEX] =
+         devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_VS", true);
+      compiler->scalar_stage[MESA_SHADER_TESS_CTRL] =
+         devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", true);
+      compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
+         devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
+      compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
+         devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", true);
+      compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
+      compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;
+   }
 
    /* We want the GLSL compiler to emit code that uses condition codes */
    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 9228413..bebd244 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -100,6 +100,12 @@
     * This can negatively impact performance.
     */
    bool precise_trig;
+
+   /**
+    * Is 3DSTATE_CONSTANT_*'s Constant Buffer 0 relative to Dynamic State
+    * Base Address?  (If not, it's a normal GPU address.)
+    */
+   bool constant_buffer_0_is_relative;
 };
 
 
@@ -168,6 +174,7 @@
    uint32_t y_u_v_image_mask;
    uint32_t y_uv_image_mask;
    uint32_t yx_xuxv_image_mask;
+   uint32_t xy_uxvx_image_mask;
 };
 
 /**
@@ -260,6 +267,68 @@
    struct brw_sampler_prog_key_data tex;
 };
 
+enum brw_sf_primitive {
+   BRW_SF_PRIM_POINTS = 0,
+   BRW_SF_PRIM_LINES = 1,
+   BRW_SF_PRIM_TRIANGLES = 2,
+   BRW_SF_PRIM_UNFILLED_TRIS = 3,
+};
+
+struct brw_sf_prog_key {
+   uint64_t attrs;
+   bool contains_flat_varying;
+   unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */
+   uint8_t point_sprite_coord_replace;
+   enum brw_sf_primitive primitive:2;
+   bool do_twoside_color:1;
+   bool frontface_ccw:1;
+   bool do_point_sprite:1;
+   bool do_point_coord:1;
+   bool sprite_origin_lower_left:1;
+   bool userclip_active:1;
+};
+
+enum brw_clip_mode {
+   BRW_CLIP_MODE_NORMAL             = 0,
+   BRW_CLIP_MODE_CLIP_ALL           = 1,
+   BRW_CLIP_MODE_CLIP_NON_REJECTED  = 2,
+   BRW_CLIP_MODE_REJECT_ALL         = 3,
+   BRW_CLIP_MODE_ACCEPT_ALL         = 4,
+   BRW_CLIP_MODE_KERNEL_CLIP        = 5,
+};
+
+enum brw_clip_fill_mode {
+   BRW_CLIP_FILL_MODE_LINE = 0,
+   BRW_CLIP_FILL_MODE_POINT = 1,
+   BRW_CLIP_FILL_MODE_FILL = 2,
+   BRW_CLIP_FILL_MODE_CULL = 3,
+};
+
+/* Note that if unfilled primitives are being emitted, we have to fix
+ * up polygon offset and flatshading at this point:
+ */
+struct brw_clip_prog_key {
+   uint64_t attrs;
+   bool contains_flat_varying;
+   bool contains_noperspective_varying;
+   unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */
+   unsigned primitive:4;
+   unsigned nr_userclip:4;
+   bool pv_first:1;
+   bool do_unfilled:1;
+   enum brw_clip_fill_mode fill_cw:2;  /* includes cull information */
+   enum brw_clip_fill_mode fill_ccw:2; /* includes cull information */
+   bool offset_cw:1;
+   bool offset_ccw:1;
+   bool copy_bfc_cw:1;
+   bool copy_bfc_ccw:1;
+   enum brw_clip_mode clip_mode:3;
+
+   float offset_factor;
+   float offset_units;
+   float offset_clamp;
+};
+
 /* A big lookup table is used to figure out which and how many
  * additional regs will inserted before the main payload in the WM
  * program execution.  These mainly relate to depth and stencil
@@ -399,6 +468,13 @@
  */
 #define BRW_SHADER_TIME_STRIDE 64
 
+struct brw_ubo_range
+{
+   uint16_t block;
+   uint8_t start;
+   uint8_t length;
+};
+
 struct brw_stage_prog_data {
    struct {
       /** size of our binding table. */
@@ -419,6 +495,8 @@
       /** @} */
    } binding_table;
 
+   struct brw_ubo_range ubo_ranges[4];
+
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
    unsigned nr_image_params;
@@ -458,6 +536,27 @@
       MAX2(prog_data->binding_table.size_bytes, (surf_index + 1) * 4);
 }
 
+enum brw_barycentric_mode {
+   BRW_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
+   BRW_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
+   BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
+   BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
+   BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
+   BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
+   BRW_BARYCENTRIC_MODE_COUNT              = 6
+};
+#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \
+   ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
+    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
+    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))
+
+enum brw_pixel_shader_computed_depth_mode {
+   BRW_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
+   BRW_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
+   BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
+   BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
+};
+
 /* Data about a particular attempt to compile a program.  Note that
  * there can be many of these, each in a different GL state
  * corresponding to a different brw_wm_prog_key struct, with different
@@ -850,6 +949,26 @@
    unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */];
 };
 
+struct brw_sf_prog_data {
+   uint32_t urb_read_length;
+   uint32_t total_grf;
+
+   /* Each vertex may have up to 12 attributes, 4 components each,
+    * except WPOS which requires only 2.  (11*4 + 2) == 44 ==> 11
+    * rows.
+    *
+    * Actually we use 4 for each, so call it 12 rows.
+    */
+   unsigned urb_entry_size;
+};
+
+struct brw_clip_prog_data {
+   uint32_t curb_read_length;	/* user planes? */
+   uint32_t clip_mode;
+   uint32_t urb_read_length;
+   uint32_t total_grf;
+};
+
 #define DEFINE_PROG_DATA_DOWNCAST(stage)                       \
 static inline struct brw_##stage##_prog_data *                 \
 brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \
@@ -940,6 +1059,38 @@
                char **error_str);
 
 /**
+ * Compile a strips and fans shader.
+ *
+ * This is a fixed-function shader determined entirely by the shader key and
+ * a VUE map.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_sf(const struct brw_compiler *compiler,
+               void *mem_ctx,
+               const struct brw_sf_prog_key *key,
+               struct brw_sf_prog_data *prog_data,
+               struct brw_vue_map *vue_map,
+               unsigned *final_assembly_size);
+
+/**
+ * Compile a clipper shader.
+ *
+ * This is a fixed-function shader determined entirely by the shader key and
+ * a VUE map.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_clip(const struct brw_compiler *compiler,
+                 void *mem_ctx,
+                 const struct brw_clip_prog_key *key,
+                 struct brw_clip_prog_data *prog_data,
+                 struct brw_vue_map *vue_map,
+                 unsigned *final_assembly_size);
+
+/**
  * Compile a fragment shader.
  *
  * Returns the final assembly and the program's size.
@@ -1021,7 +1172,7 @@
     * to do a full test run with brw_fs_test_dispatch_packing() hooked up to
     * the NIR front-end before changing this assertion.
     */
-   assert(devinfo->gen <= 9);
+   assert(devinfo->gen <= 10);
 
    switch (stage) {
    case MESA_SHADER_FRAGMENT: {
diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c
index 8b44736..ad41e35 100644
--- a/src/intel/compiler/brw_disasm.c
+++ b/src/intel/compiler/brw_disasm.c
@@ -719,7 +719,7 @@
 }
 
 static int
-dest(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+dest(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst)
 {
    unsigned elem_size = brw_element_size(devinfo, inst, dst);
    int err = 0;
@@ -776,7 +776,7 @@
 }
 
 static int
-dest_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+dest_3src(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst)
 {
    int err = 0;
    uint32_t reg_file;
@@ -942,7 +942,7 @@
 }
 
 static int
-src0_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+src0_3src(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst)
 {
    int err = 0;
    unsigned src0_subreg_nr = brw_inst_3src_src0_subreg_nr(devinfo, inst);
@@ -969,7 +969,7 @@
 }
 
 static int
-src1_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+src1_3src(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst)
 {
    int err = 0;
    unsigned src1_subreg_nr = brw_inst_3src_src1_subreg_nr(devinfo, inst);
@@ -997,7 +997,7 @@
 
 
 static int
-src2_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+src2_3src(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst)
 {
    int err = 0;
    unsigned src2_subreg_nr = brw_inst_3src_src2_subreg_nr(devinfo, inst);
@@ -1024,7 +1024,7 @@
 }
 
 static int
-imm(FILE *file, const struct gen_device_info *devinfo, unsigned type, brw_inst *inst)
+imm(FILE *file, const struct gen_device_info *devinfo, unsigned type, const brw_inst *inst)
 {
    switch (type) {
    case BRW_HW_REG_TYPE_UD:
@@ -1066,7 +1066,7 @@
 }
 
 static int
-src0(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+src0(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst)
 {
    if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
       return imm(file, devinfo, brw_inst_src0_reg_type(devinfo, inst), inst);
@@ -1122,7 +1122,7 @@
 }
 
 static int
-src1(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+src1(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst)
 {
    if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
       return imm(file, devinfo, brw_inst_src1_reg_type(devinfo, inst), inst);
@@ -1178,7 +1178,7 @@
 }
 
 static int
-qtr_ctrl(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+qtr_ctrl(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst)
 {
    int qtr_ctl = brw_inst_qtr_control(devinfo, inst);
    int exec_size = 1 << brw_inst_exec_size(devinfo, inst);
@@ -1225,7 +1225,7 @@
 
 int
 brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo,
-                     brw_inst *inst, bool is_compacted)
+                     const brw_inst *inst, bool is_compacted)
 {
    int err = 0;
    int space = 0;
diff --git a/src/intel/compiler/brw_eu.c b/src/intel/compiler/brw_eu.c
index 77400c1..0ef52e2 100644
--- a/src/intel/compiler/brw_eu.c
+++ b/src/intel/compiler/brw_eu.c
@@ -366,12 +366,12 @@
 
 void
 brw_disassemble(const struct gen_device_info *devinfo,
-                void *assembly, int start, int end, FILE *out)
+                const void *assembly, int start, int end, FILE *out)
 {
    bool dump_hex = (INTEL_DEBUG & DEBUG_HEX) != 0;
 
    for (int offset = start; offset < end;) {
-      brw_inst *insn = assembly + offset;
+      const brw_inst *insn = assembly + offset;
       brw_inst uncompacted;
       bool compacted = brw_inst_cmpt_control(devinfo, insn);
       if (0)
@@ -412,6 +412,7 @@
    GEN75 = (1 << 5),
    GEN8  = (1 << 6),
    GEN9  = (1 << 7),
+   GEN10  = (1 << 8),
    GEN_ALL = ~0
 };
 
@@ -688,6 +689,7 @@
    case 7: return devinfo->is_haswell ? GEN75 : GEN7;
    case 8: return GEN8;
    case 9: return GEN9;
+   case 10: return GEN10;
    default:
       unreachable("not reached");
    }
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index f422595..a3a9c63 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -115,9 +115,9 @@
 void brw_init_codegen(const struct gen_device_info *, struct brw_codegen *p,
 		      void *mem_ctx);
 int brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo,
-                         struct brw_inst *inst, bool is_compacted);
-void brw_disassemble(const struct gen_device_info *devinfo, void *assembly,
-                     int start, int end, FILE *out);
+                         const struct brw_inst *inst, bool is_compacted);
+void brw_disassemble(const struct gen_device_info *devinfo,
+                     const void *assembly, int start, int end, FILE *out);
 const unsigned *brw_get_program( struct brw_codegen *p, unsigned *sz );
 
 brw_inst *brw_next_insn(struct brw_codegen *p, unsigned opcode);
@@ -548,7 +548,8 @@
                                  brw_inst *orig, brw_inst *uncompacted);
 
 /* brw_eu_validate.c */
-bool brw_validate_instructions(const struct brw_codegen *p, int start_offset,
+bool brw_validate_instructions(const struct gen_device_info *devinfo,
+                               void *assembly, int start_offset, int end_offset,
                                struct annotation_info *annotation);
 
 static inline int
diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c
index b2af76d..740a395 100644
--- a/src/intel/compiler/brw_eu_compact.c
+++ b/src/intel/compiler/brw_eu_compact.c
@@ -1362,6 +1362,7 @@
    assert(gen8_src_index_table[ARRAY_SIZE(gen8_src_index_table) - 1] != 0);
 
    switch (devinfo->gen) {
+   case 10:
    case 9:
    case 8:
       control_index_table = gen8_control_index_table;
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 13a70f6..1af835d 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -72,32 +72,13 @@
 #define _3DPRIM_TRIFAN_NOSTIPPLE  0x16
 #define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); })
 
-enum brw_barycentric_mode {
-   BRW_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
-   BRW_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
-   BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
-   BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
-   BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
-   BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
-   BRW_BARYCENTRIC_MODE_COUNT              = 6
-};
-#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \
-   ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
-    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
-    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))
-
-enum brw_pixel_shader_computed_depth_mode {
-   BRW_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
-   BRW_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
-   BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
-   BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
-};
-
 /* Bitfields for the URB_WRITE message, DW2 of message header: */
 #define URB_WRITE_PRIM_END		0x1
 #define URB_WRITE_PRIM_START		0x2
 #define URB_WRITE_PRIM_TYPE_SHIFT	2
 
+#define BRW_SPRITE_POINT_ENABLE  16
+
 # define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT		0
 # define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID		1
 
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 231d6fd..fa59086 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -306,7 +306,7 @@
        reg.nr == BRW_ARF_ACCUMULATOR)
       assert(reg.swizzle == BRW_SWIZZLE_XYZW);
 
-   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
+   assert(reg.hstride < ARRAY_SIZE(hstride_for_reg));
    hstride = hstride_for_reg[reg.hstride];
 
    if (reg.vstride == 0xf) {
@@ -1376,7 +1376,7 @@
    struct brw_reg ip = brw_ip_reg();
    brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
 
-   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
+   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
    brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
    brw_inst_set_pred_control(devinfo, inst, predicate_control);
diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c
index f231ea0..e089c1f 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -434,18 +434,6 @@
     * In fact, checking it would weaken testing of the other rules.
     */
 
-   if (num_sources == 3)
-      return (struct string){};
-
-   if (exec_size == 1)
-      return (struct string){};
-
-   if (inst_is_send(devinfo, inst))
-      return (struct string){};
-
-   if (desc->ndst == 0)
-      return (struct string){};
-
    unsigned dst_stride = 1 << (brw_inst_dst_hstride(devinfo, inst) - 1);
    bool dst_type_is_byte =
       brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_NON_IMM_TYPE_B ||
@@ -633,7 +621,7 @@
       /* VertStride must be used to cross GRF register boundaries. This rule
        * implies that elements within a 'Width' cannot cross GRF boundaries.
        */
-      const uint64_t mask = (1 << element_size) - 1;
+      const uint64_t mask = (1ULL << element_size) - 1;
       unsigned rowbase = subreg;
 
       for (int y = 0; y < exec_size / width; y++) {
@@ -686,7 +674,7 @@
                    unsigned exec_size, unsigned element_size, unsigned subreg,
                    unsigned vstride, unsigned width, unsigned hstride)
 {
-   const uint64_t mask = (1 << element_size) - 1;
+   const uint64_t mask = (1ULL << element_size) - 1;
    unsigned rowbase = subreg;
    unsigned element = 0;
 
@@ -1042,17 +1030,23 @@
 }
 
 bool
-brw_validate_instructions(const struct brw_codegen *p, int start_offset,
+brw_validate_instructions(const struct gen_device_info *devinfo,
+                          void *assembly, int start_offset, int end_offset,
                           struct annotation_info *annotation)
 {
-   const struct gen_device_info *devinfo = p->devinfo;
-   const void *store = p->store;
    bool valid = true;
 
-   for (int src_offset = start_offset; src_offset < p->next_insn_offset;
-        src_offset += sizeof(brw_inst)) {
+   for (int src_offset = start_offset; src_offset < end_offset;) {
       struct string error_msg = { .str = NULL, .len = 0 };
-      const brw_inst *inst = store + src_offset;
+      const brw_inst *inst = assembly + src_offset;
+      bool is_compact = brw_inst_cmpt_control(devinfo, inst);
+      brw_inst uncompacted;
+
+      if (is_compact) {
+         brw_compact_inst *compacted = (void *)inst;
+         brw_uncompact_instruction(devinfo, &uncompacted, compacted);
+         inst = &uncompacted;
+      }
 
       if (is_unsupported_inst(devinfo, inst)) {
          ERROR("Instruction not supported on this Gen");
@@ -1069,6 +1063,12 @@
       }
       valid = valid && error_msg.len == 0;
       free(error_msg.str);
+
+      if (is_compact) {
+         src_offset += sizeof(brw_compact_inst);
+      } else {
+         src_offset += sizeof(brw_inst);
+      }
    }
 
    return valid;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 4dcdc1b..d9c7157 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -857,14 +857,29 @@
       const unsigned end = start + inst->exec_size;
       return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
    }
+
+   unsigned
+   bit_mask(unsigned n)
+   {
+      return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
+   }
+
+   unsigned
+   flag_mask(const fs_reg &r, unsigned sz)
+   {
+      if (r.file == ARF) {
+         const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
+         const unsigned end = start + sz;
+         return bit_mask(end) & ~bit_mask(start);
+      } else {
+         return 0;
+      }
+   }
 }
 
 unsigned
 fs_inst::flags_read(const gen_device_info *devinfo) const
 {
-   /* XXX - This doesn't consider explicit uses of the flag register as source
-    *       region.
-    */
    if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
        predicate == BRW_PREDICATE_ALIGN1_ALLV) {
       /* The vertical predication modes combine corresponding bits from
@@ -875,23 +890,24 @@
    } else if (predicate) {
       return flag_mask(this);
    } else {
-      return 0;
+      unsigned mask = 0;
+      for (int i = 0; i < sources; i++) {
+         mask |= flag_mask(src[i], size_read(i));
+      }
+      return mask;
    }
 }
 
 unsigned
 fs_inst::flags_written() const
 {
-   /* XXX - This doesn't consider explicit uses of the flag register as
-    *       destination region.
-    */
    if ((conditional_mod && (opcode != BRW_OPCODE_SEL &&
                             opcode != BRW_OPCODE_IF &&
                             opcode != BRW_OPCODE_WHILE)) ||
        opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
       return flag_mask(this);
    } else {
-      return 0;
+      return flag_mask(dst, size_written);
    }
 }
 
@@ -1383,7 +1399,16 @@
 void
 fs_visitor::assign_curb_setup()
 {
-   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
+   unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
+
+   unsigned ubo_push_length = 0;
+   unsigned ubo_push_start[4];
+   for (int i = 0; i < 4; i++) {
+      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
+      ubo_push_length += stage_prog_data->ubo_ranges[i].length;
+   }
+
+   prog_data->curb_read_length = uniform_push_length + ubo_push_length;
 
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
@@ -1391,7 +1416,11 @@
 	 if (inst->src[i].file == UNIFORM) {
             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
             int constant_nr;
-            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
+            if (inst->src[i].nr >= UBO_START) {
+               /* constant_nr is in 32-bit units, the rest are in bytes */
+               constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
+                             inst->src[i].offset / 4;
+            } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
                constant_nr = push_constant_loc[uniform_nr];
             } else {
                /* Section 5.11 of the OpenGL 4.1 spec says:
@@ -1433,7 +1462,7 @@
    int urb_next = 0;
    /* Figure out where each of the incoming setup attributes lands. */
    if (devinfo->gen >= 6) {
-      if (_mesa_bitcount_64(nir->info->inputs_read &
+      if (_mesa_bitcount_64(nir->info.inputs_read &
                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
           * first 16 varying inputs, so we can put them wherever we want.
@@ -1445,14 +1474,14 @@
           * a different vertex (or geometry) shader.
           */
          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
-            if (nir->info->inputs_read & BRW_FS_VARYING_INPUT_MASK &
+            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(i)) {
                prog_data->urb_setup[i] = urb_next++;
             }
          }
       } else {
          bool include_vue_header =
-            nir->info->inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+            nir->info.inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
 
          /* We have enough input varyings that the SF/SBE pipeline stage can't
           * arbitrarily rearrange them to suit our whim; we have to put them
@@ -1462,7 +1491,7 @@
          struct brw_vue_map prev_stage_vue_map;
          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                              key->input_slots_valid,
-                             nir->info->separate_shader);
+                             nir->info.separate_shader);
          int first_slot =
             include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
 
@@ -1471,7 +1500,7 @@
               slot++) {
             int varying = prev_stage_vue_map.slot_to_varying[slot];
             if (varying != BRW_VARYING_SLOT_PAD &&
-                (nir->info->inputs_read & BRW_FS_VARYING_INPUT_MASK &
+                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                  BITFIELD64_BIT(varying))) {
                prog_data->urb_setup[varying] = slot - first_slot;
             }
@@ -1504,7 +1533,7 @@
        *
        * See compile_sf_prog() for more info.
        */
-      if (nir->info->inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
+      if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
    }
 
@@ -1631,7 +1660,7 @@
    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
 
    first_non_payload_grf +=
-      8 * vue_prog_data->urb_read_length * nir->info->gs.vertices_in;
+      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       /* Rewrite all ATTR file references to GRFs. */
@@ -2062,6 +2091,20 @@
    stage_prog_data->nr_params = num_push_constants;
    stage_prog_data->nr_pull_params = num_pull_constants;
 
+   /* Now that we know how many regular uniforms we'll push, reduce the
+    * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
+    */
+   unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
+   for (int i = 0; i < 4; i++) {
+      struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+
+      if (push_length + range->length > 64)
+         range->length = 64 - push_length;
+
+      push_length += range->length;
+   }
+   assert(push_length <= 64);
+
    /* Up until now, the param[] array has been indexed by reg + offset
     * of UNIFORM registers.  Move pull constants into pull_param[] and
     * condense param[] to only contain the uniforms we chose to push.
@@ -2089,6 +2132,38 @@
          new_thread_local_id_index;
 }
 
+bool
+fs_visitor::get_pull_locs(const fs_reg &src,
+                          unsigned *out_surf_index,
+                          unsigned *out_pull_index)
+{
+   assert(src.file == UNIFORM);
+
+   if (src.nr >= UBO_START) {
+      const struct brw_ubo_range *range =
+         &prog_data->ubo_ranges[src.nr - UBO_START];
+
+      /* If this access is in our (reduced) range, use the push data. */
+      if (src.offset / 32 < range->length)
+         return false;
+
+      *out_surf_index = prog_data->binding_table.ubo_start + range->block;
+      *out_pull_index = (32 * range->start + src.offset) / 4;
+      return true;
+   }
+
+   const unsigned location = src.nr + src.offset / 4;
+
+   if (location < uniforms && pull_constant_loc[location] != -1) {
+      /* A regular uniform stored as a pull constant */
+      *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
+      *out_pull_index = pull_constant_loc[location];
+      return true;
+   }
+
+   return false;
+}
+
 /**
  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
@@ -2096,7 +2171,7 @@
 void
 fs_visitor::lower_constant_loads()
 {
-   const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+   unsigned index, pull_index;
 
    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
       /* Set up the annotation tracking for new generated instructions. */
@@ -2110,18 +2185,11 @@
          if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
             continue;
 
-         unsigned location = inst->src[i].nr + inst->src[i].offset / 4;
-         if (location >= uniforms)
-            continue; /* Out of bounds access */
-
-         int pull_index = pull_constant_loc[location];
-
-         if (pull_index == -1)
+         if (!get_pull_locs(inst->src[i], &index, &pull_index))
 	    continue;
 
          assert(inst->src[i].stride == 0);
 
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
          const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
          const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
@@ -2142,14 +2210,8 @@
       if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
           inst->src[0].file == UNIFORM) {
 
-         unsigned location = inst->src[0].nr + inst->src[0].offset / 4;
-         if (location >= uniforms)
-            continue; /* Out of bounds access */
-
-         int pull_index = pull_constant_loc[location];
-
-         if (pull_index == -1)
-	    continue;
+         if (!get_pull_locs(inst->src[0], &index, &pull_index))
+            continue;
 
          VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
                                     brw_imm_ud(index),
@@ -2445,7 +2507,7 @@
    if (stage != MESA_SHADER_FRAGMENT)
       return false;
 
-   if (devinfo->gen < 9 && !devinfo->is_cherryview)
+   if (devinfo->gen != 9 && !devinfo->is_cherryview)
       return false;
 
    /* FINISHME: It should be possible to implement this optimization when there
@@ -3349,7 +3411,7 @@
           * operation directly, but CHV/BXT cannot.
           */
          if (devinfo->gen >= 8 &&
-             !devinfo->is_cherryview && !devinfo->is_broxton)
+             !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo))
             continue;
 
          if (inst->src[1].file == IMM &&
@@ -4951,7 +5013,9 @@
 {
    return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
             (inst->components_read(i) == 1 &&
-             lbld.dispatch_width() <= inst->exec_size));
+             lbld.dispatch_width() <= inst->exec_size)) ||
+          (inst->flags_written() &
+           flag_mask(inst->src[i], type_sz(inst->src[i].type)));
 }
 
 /**
@@ -5456,7 +5520,7 @@
 
    /* R27: interpolated depth if uses source depth */
    prog_data->uses_src_depth =
-      (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
    if (prog_data->uses_src_depth) {
       payload.source_depth_reg = payload.num_regs;
       payload.num_regs++;
@@ -5468,7 +5532,7 @@
 
    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
    prog_data->uses_src_w =
-      (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
    if (prog_data->uses_src_w) {
       payload.source_w_reg = payload.num_regs;
       payload.num_regs++;
@@ -5480,7 +5544,7 @@
 
    /* R31: MSAA position offsets. */
    if (prog_data->persample_dispatch &&
-       (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
+       (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
       /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
        *
        *    "MSDISPMODE_PERSAMPLE is required in order to select
@@ -5497,7 +5561,7 @@
 
    /* R32: MSAA input coverage mask */
    prog_data->uses_sample_mask =
-      (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
+      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
    if (prog_data->uses_sample_mask) {
       assert(devinfo->gen >= 7);
       payload.sample_mask_in_reg = payload.num_regs;
@@ -5511,7 +5575,7 @@
    /* R34-: bary for 32-pixel. */
    /* R58-59: interp W for 32-pixel. */
 
-   if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
       source_depth_to_render_target = true;
    }
 }
@@ -5548,15 +5612,15 @@
     * Note that the GS reads <URB Read Length> HWords for every vertex - so we
     * have to multiply by VerticesIn to obtain the total storage requirement.
     */
-   if (8 * vue_prog_data->urb_read_length * nir->info->gs.vertices_in >
+   if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
        max_push_components || gs_prog_data->invocations > 1) {
       gs_prog_data->base.include_vue_handles = true;
 
       /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
-      payload.num_regs += nir->info->gs.vertices_in;
+      payload.num_regs += nir->info.gs.vertices_in;
 
       vue_prog_data->urb_read_length =
-         ROUND_DOWN_TO(max_push_components / nir->info->gs.vertices_in, 8) / 8;
+         ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
    }
 }
 
@@ -5657,7 +5721,7 @@
       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
          char filename[64];                                             \
          snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,              \
-                  stage_abbrev, dispatch_width, nir->info->name, iteration, pass_num); \
+                  stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
                                                                         \
          backend_shader::dump_instructions(filename);                   \
       }                                                                 \
@@ -5671,7 +5735,7 @@
    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
       char filename[64];
       snprintf(filename, 64, "%s%d-%s-00-00-start",
-               stage_abbrev, dispatch_width, nir->info->name);
+               stage_abbrev, dispatch_width, nir->info.name);
 
       backend_shader::dump_instructions(filename);
    }
@@ -5968,15 +6032,15 @@
    }
 
    /* Fix the disptach mask */
-   if (nir->info->tess.tcs_vertices_out % 8) {
+   if (nir->info.tess.tcs_vertices_out % 8) {
       bld.CMP(bld.null_reg_ud(), invocation_id,
-              brw_imm_ud(nir->info->tess.tcs_vertices_out), BRW_CONDITIONAL_L);
+              brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
       bld.IF(BRW_PREDICATE_NORMAL);
    }
 
    emit_nir_code();
 
-   if (nir->info->tess.tcs_vertices_out % 8) {
+   if (nir->info.tess.tcs_vertices_out % 8) {
       bld.emit(BRW_OPCODE_ENDIF);
    }
 
@@ -6096,6 +6160,31 @@
    return !failed;
 }
 
+/* From the SKL PRM, Volume 16, Workarounds:
+ *
+ *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
+ *              only header phases (R0-R2)
+ *
+ *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
+ *       have been header only.
+ *
+ * Instead of enabling push constants one can alternatively enable one of the
+ * inputs. Here one simply chooses "layer" which shouldn't impose much
+ * overhead.
+ */
+static void
+gen9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
+{
+   if (wm_prog_data->num_varying_inputs)
+      return;
+
+   if (wm_prog_data->base.curb_read_length)
+      return;
+
+   wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
+   wm_prog_data->num_varying_inputs = 1;
+}
+
 bool
 fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 {
@@ -6119,8 +6208,8 @@
          emit_shader_time_begin();
 
       calculate_urb_setup();
-      if (nir->info->inputs_read > 0 ||
-          (nir->info->outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
+      if (nir->info.inputs_read > 0 ||
+          (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
          if (devinfo->gen < 6)
             emit_interpolation_setup_gen4();
          else
@@ -6159,6 +6248,10 @@
       optimize();
 
       assign_curb_setup();
+
+      if (devinfo->gen >= 9)
+         gen9_ps_header_only_workaround(wm_prog_data);
+
       assign_urb_setup();
 
       fixup_3src_null_dest();
@@ -6284,8 +6377,8 @@
 static uint8_t
 computed_depth_mode(const nir_shader *shader)
 {
-   if (shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
-      switch (shader->info->fs.depth_layout) {
+   if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+      switch (shader->info.fs.depth_layout) {
       case FRAG_DEPTH_LAYOUT_NONE:
       case FRAG_DEPTH_LAYOUT_ANY:
          return BRW_PSCDEPTH_ON;
@@ -6465,25 +6558,25 @@
    /* key->alpha_test_func means simulating alpha testing via discards,
     * so the shader definitely kills pixels.
     */
-   prog_data->uses_kill = shader->info->fs.uses_discard ||
+   prog_data->uses_kill = shader->info.fs.uses_discard ||
       key->alpha_test_func;
    prog_data->uses_omask = key->multisample_fbo &&
-      shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
+      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
    prog_data->computed_depth_mode = computed_depth_mode(shader);
    prog_data->computed_stencil =
-      shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
+      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
 
    prog_data->persample_dispatch =
       key->multisample_fbo &&
       (key->persample_interp ||
-       (shader->info->system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+       (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
                                             SYSTEM_BIT_SAMPLE_POS)) ||
-       shader->info->fs.uses_sample_qualifier ||
-       shader->info->outputs_read);
+       shader->info.fs.uses_sample_qualifier ||
+       shader->info.outputs_read);
 
-   prog_data->early_fragment_tests = shader->info->fs.early_fragment_tests;
-   prog_data->post_depth_coverage = shader->info->fs.post_depth_coverage;
-   prog_data->inner_coverage = shader->info->fs.inner_coverage;
+   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
+   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
+   prog_data->inner_coverage = shader->info.fs.inner_coverage;
 
    prog_data->barycentric_interp_modes =
       brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
@@ -6566,9 +6659,9 @@
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
-                                     shader->info->label ?
-                                        shader->info->label : "unnamed",
-                                     shader->info->name));
+                                     shader->info.label ?
+                                        shader->info.label : "unnamed",
+                                     shader->info.name));
    }
 
    if (simd8_cfg) {
@@ -6700,12 +6793,12 @@
    brw_nir_lower_intrinsics(shader, &prog_data->base);
    shader = brw_postprocess_nir(shader, compiler, true);
 
-   prog_data->local_size[0] = shader->info->cs.local_size[0];
-   prog_data->local_size[1] = shader->info->cs.local_size[1];
-   prog_data->local_size[2] = shader->info->cs.local_size[2];
+   prog_data->local_size[0] = shader->info.cs.local_size[0];
+   prog_data->local_size[1] = shader->info.cs.local_size[1];
+   prog_data->local_size[2] = shader->info.cs.local_size[2];
    unsigned local_workgroup_size =
-      shader->info->cs.local_size[0] * shader->info->cs.local_size[1] *
-      shader->info->cs.local_size[2];
+      shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
+      shader->info.cs.local_size[2];
 
    unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
    unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
@@ -6795,9 +6888,9 @@
                   MESA_SHADER_COMPUTE);
    if (INTEL_DEBUG & DEBUG_CS) {
       char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
-                                   shader->info->label ? shader->info->label :
+                                   shader->info.label ? shader->info.label :
                                                         "unnamed",
-                                   shader->info->name);
+                                   shader->info.name);
       g.enable_debug(name);
    }
 
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index e230b5e..f1ba193 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -50,6 +50,8 @@
    return offset(reg, bld.dispatch_width(), delta);
 }
 
+#define UBO_START ((1 << 16) - 4)
+
 /**
  * The fragment shader front-end.
  *
@@ -125,6 +127,8 @@
    void split_virtual_grfs();
    bool compact_virtual_grfs();
    void assign_constant_locations();
+   bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index,
+                      unsigned *out_pull_index);
    void lower_constant_loads();
    void invalidate_live_intervals();
    void calculate_live_intervals();
@@ -175,7 +179,6 @@
    fs_reg *emit_samplepos_setup();
    fs_reg *emit_sampleid_setup();
    fs_reg *emit_samplemaskin_setup();
-   fs_reg *emit_vs_system_value(int location);
    void emit_interpolation_setup_gen4();
    void emit_interpolation_setup_gen6();
    void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 2d50c92..4625d69 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -142,6 +142,31 @@
                 scan_inst->opcode == BRW_OPCODE_CMPN)
                break;
 
+            /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
+             *
+             *    * Note that the [post condition signal] bits generated at
+             *      the output of a compute are before the .sat.
+             */
+            if (scan_inst->saturate)
+               break;
+
+            /* From the Sky Lake PRM, Vol 2a, "Multiply":
+             *
+             *    "When multiplying integer data types, if one of the sources
+             *     is a DW, the resulting full precision data is stored in
+             *     the accumulator. However, if the destination data type is
+             *     either W or DW, the low bits of the result are written to
+             *     the destination register and the remaining high bits are
+             *     discarded. This results in undefined Overflow and Sign
+             *     flags. Therefore, conditional modifiers and saturation
+             *     (.sat) cannot be used in this case."
+             *
+             * We just disallow cmod propagation on all integer multiplies.
+             */
+            if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
+                scan_inst->opcode == BRW_OPCODE_MUL)
+               break;
+
             /* Otherwise, try propagating the conditional. */
             enum brw_conditional_mod cond =
                inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index a7f95cc..0c14c03 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -397,7 +397,6 @@
       brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
 
       int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
-      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
       {
          /* Don't send AA data */
          fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
@@ -2167,10 +2166,16 @@
    annotation_finalize(&annotation, p->next_insn_offset);
 
 #ifndef NDEBUG
-   bool validated = brw_validate_instructions(p, start_offset, &annotation);
+   bool validated = brw_validate_instructions(devinfo, p->store,
+                                              start_offset,
+                                              p->next_insn_offset,
+                                              &annotation);
 #else
    if (unlikely(debug_flag))
-      brw_validate_instructions(p, start_offset, &annotation);
+      brw_validate_instructions(devinfo, p->store,
+                                start_offset,
+                                p->next_insn_offset,
+                                &annotation);
 #endif
 
    int before_size = p->next_insn_offset - start_offset;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 23cd4b7..d760946 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -89,39 +89,11 @@
          unreachable("should be lowered by lower_vertex_id().");
 
       case nir_intrinsic_load_vertex_id_zero_base:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
-         break;
-
       case nir_intrinsic_load_base_vertex:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
-         break;
-
       case nir_intrinsic_load_instance_id:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
-         break;
-
       case nir_intrinsic_load_base_instance:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
-         break;
-
       case nir_intrinsic_load_draw_id:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
-         break;
+         unreachable("should be lowered by brw_nir_lower_vs_inputs().");
 
       case nir_intrinsic_load_invocation_id:
          if (v->stage == MESA_SHADER_TESS_CTRL)
@@ -667,7 +639,7 @@
        */
       if (nir_dest_bit_size(instr->dest.dest) == 64 &&
           nir_src_bit_size(instr->src[0].src) == 32 &&
-          (devinfo->is_cherryview || devinfo->is_broxton)) {
+          (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
          fs_reg tmp = bld.vgrf(result.type, 1);
          tmp = subscript(tmp, op[0].type, 0);
          inst = bld.MOV(tmp, op[0]);
@@ -1804,7 +1776,7 @@
    assert(gs_compile->control_data_bits_per_vertex == 2);
 
    /* Must be a valid stream */
-   assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
+   assert(stream_id < MAX_VERTEX_STREAMS);
 
    /* Control data bits are initialized to 0 so we don't have to set any
     * bits when sending vertices to stream 0.
@@ -1853,7 +1825,7 @@
     * be recorded by transform feedback, we can simply discard all geometry
     * bound to these streams when transform feedback is disabled.
     */
-   if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
+   if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
       return;
 
    /* If we're outputting 32 control data bits or less, then we can wait
@@ -1942,27 +1914,15 @@
    nir_const_value *offset_const = nir_src_as_const_value(offset_src);
    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
 
-   /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
-    * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
-    * gl_PointSize is available as a GS input, however, so it must be that.
-    */
-   const bool is_point_size = (base_offset == 0);
-
    /* TODO: figure out push input layout for invocations == 1 */
    if (gs_prog_data->invocations == 1 &&
        offset_const != NULL && vertex_const != NULL &&
        4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
       int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
                        vertex_const->u32[0] * push_reg_count;
-      /* This input was pushed into registers. */
-      if (is_point_size) {
-         /* gl_PointSize comes in .w */
-         bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
-      } else {
-         for (unsigned i = 0; i < num_components; i++) {
-            bld.MOV(offset(dst, bld, i),
-                    fs_reg(ATTR, imm_offset + i + first_component, dst.type));
-         }
+      for (unsigned i = 0; i < num_components; i++) {
+         bld.MOV(offset(dst, bld, i),
+                 fs_reg(ATTR, imm_offset + i + first_component, dst.type));
       }
       return;
    }
@@ -2008,12 +1968,12 @@
 
          /* Use first_icp_handle as the base offset.  There is one register
           * of URB handles per vertex, so inform the register allocator that
-          * we might read up to nir->info->gs.vertices_in registers.
+          * we might read up to nir->info.gs.vertices_in registers.
           */
          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
                   fs_reg(icp_offset_bytes),
-                  brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
+                  brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
       }
    } else {
       assert(gs_prog_data->invocations > 1);
@@ -2039,12 +1999,12 @@
 
          /* Use first_icp_handle as the base offset.  There is one DWord
           * of URB handles per vertex, so inform the register allocator that
-          * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
+          * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
           */
          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
                   fs_reg(icp_offset_bytes),
-                  brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
+                  brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
                              REG_SIZE));
       }
    }
@@ -2132,14 +2092,6 @@
          }
       }
    }
-
-   if (is_point_size) {
-      /* Read the whole VUE header (because of alignment) and read .w. */
-      fs_reg tmp = bld.vgrf(dst.type, 4);
-      inst->dst = tmp;
-      inst->size_written = 4 * REG_SIZE;
-      bld.MOV(dst, offset(tmp, bld, 3));
-   }
 }
 
 fs_reg
@@ -2251,7 +2203,7 @@
    }
 
    case nir_intrinsic_load_input: {
-      fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
+      fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
       unsigned first_component = nir_intrinsic_component(instr);
       unsigned num_components = instr->num_components;
       enum brw_reg_type type = dest.type;
@@ -3796,7 +3748,7 @@
             (instr->num_components - 1) * type_sz(dest.type);
 
          bool supports_64bit_indirects =
-            !devinfo->is_cherryview && !devinfo->is_broxton;
+            !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
 
          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
             for (unsigned j = 0; j < instr->num_components; j++) {
@@ -3849,7 +3801,7 @@
           */
          brw_mark_surface_used(prog_data,
                                stage_prog_data->binding_table.ubo_start +
-                               nir->info->num_ubos - 1);
+                               nir->info.num_ubos - 1);
       }
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
@@ -3870,6 +3822,34 @@
           * and we have to split it if necessary.
           */
          const unsigned type_size = type_sz(dest.type);
+
+         /* See if we've selected this as a push constant candidate */
+         if (const_index) {
+            const unsigned ubo_block = const_index->u32[0];
+            const unsigned offset_256b = const_offset->u32[0] / 32;
+
+            fs_reg push_reg;
+            for (int i = 0; i < 4; i++) {
+               const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+               if (range->block == ubo_block &&
+                   offset_256b >= range->start &&
+                   offset_256b < range->start + range->length) {
+
+                  push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
+                  push_reg.offset = const_offset->u32[0] - 32 * range->start;
+                  break;
+               }
+            }
+
+            if (push_reg.file != BAD_FILE) {
+               for (unsigned i = 0; i < instr->num_components; i++) {
+                  bld.MOV(offset(dest, bld, i),
+                          byte_offset(push_reg, i * type_size));
+               }
+               break;
+            }
+         }
+
          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
@@ -3919,7 +3899,7 @@
           */
          brw_mark_surface_used(prog_data,
                                stage_prog_data->binding_table.ssbo_start +
-                               nir->info->num_ssbos - 1);
+                               nir->info.num_ssbos - 1);
       }
 
       fs_reg offset_reg;
@@ -3959,7 +3939,7 @@
 
          brw_mark_surface_used(prog_data,
                                stage_prog_data->binding_table.ssbo_start +
-                               nir->info->num_ssbos - 1);
+                               nir->info.num_ssbos - 1);
       }
 
       /* Value */
@@ -4123,7 +4103,11 @@
       break;
    }
 
-   case nir_intrinsic_load_channel_num: {
+   case nir_intrinsic_load_subgroup_size:
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
+      break;
+
+   case nir_intrinsic_load_subgroup_invocation: {
       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
       dest = retype(dest, BRW_REGISTER_TYPE_UD);
       const fs_builder allbld8 = bld.group(8, 0).exec_all();
@@ -4138,6 +4122,102 @@
       break;
    }
 
+   case nir_intrinsic_load_subgroup_eq_mask:
+   case nir_intrinsic_load_subgroup_ge_mask:
+   case nir_intrinsic_load_subgroup_gt_mask:
+   case nir_intrinsic_load_subgroup_le_mask:
+   case nir_intrinsic_load_subgroup_lt_mask:
+      unreachable("not reached");
+
+   case nir_intrinsic_vote_any: {
+      const fs_builder ubld = bld.exec_all();
+
+      /* The any/all predicates do not consider channel enables. To prevent
+       * dead channels from affecting the result, we initialize the flag with
+       * the identity value for the logical operation.
+       */
+      ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
+      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
+      bld.MOV(dest, brw_imm_d(-1));
+      set_predicate(dispatch_width == 8 ?
+                    BRW_PREDICATE_ALIGN1_ANY8H :
+                    BRW_PREDICATE_ALIGN1_ANY16H,
+                    bld.SEL(dest, dest, brw_imm_d(0)));
+      break;
+   }
+   case nir_intrinsic_vote_all: {
+      const fs_builder ubld = bld.exec_all();
+
+      /* The any/all predicates do not consider channel enables. To prevent
+       * dead channels from affecting the result, we initialize the flag with
+       * the identity value for the logical operation.
+       */
+      ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
+      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
+      bld.MOV(dest, brw_imm_d(-1));
+      set_predicate(dispatch_width == 8 ?
+                    BRW_PREDICATE_ALIGN1_ALL8H :
+                    BRW_PREDICATE_ALIGN1_ALL16H,
+                    bld.SEL(dest, dest, brw_imm_d(0)));
+      break;
+   }
+   case nir_intrinsic_vote_eq: {
+      fs_reg value = get_nir_src(instr->src[0]);
+      fs_reg uniformized = bld.emit_uniformize(value);
+      const fs_builder ubld = bld.exec_all();
+
+      /* The any/all predicates do not consider channel enables. To prevent
+       * dead channels from affecting the result, we initialize the flag with
+       * the identity value for the logical operation.
+       */
+      ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
+      bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
+      bld.MOV(dest, brw_imm_d(-1));
+      set_predicate(dispatch_width == 8 ?
+                    BRW_PREDICATE_ALIGN1_ALL8H :
+                    BRW_PREDICATE_ALIGN1_ALL16H,
+                    bld.SEL(dest, dest, brw_imm_d(0)));
+      break;
+   }
+
+   case nir_intrinsic_ballot: {
+      const fs_reg value = retype(get_nir_src(instr->src[0]),
+                                  BRW_REGISTER_TYPE_UD);
+      const struct brw_reg flag = retype(brw_flag_reg(0, 0),
+                                         BRW_REGISTER_TYPE_UD);
+
+      bld.exec_all().MOV(flag, brw_imm_ud(0u));
+      bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
+
+      if (instr->dest.ssa.bit_size > 32) {
+         dest.type = BRW_REGISTER_TYPE_UQ;
+      } else {
+         dest.type = BRW_REGISTER_TYPE_UD;
+      }
+      bld.MOV(dest, flag);
+      break;
+   }
+
+   case nir_intrinsic_read_invocation: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      const fs_reg invocation = get_nir_src(instr->src[1]);
+      fs_reg tmp = bld.vgrf(value.type);
+
+      bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
+                          component(invocation, 0));
+
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+              fs_reg(component(tmp, 0)));
+      break;
+   }
+
+   case nir_intrinsic_read_first_invocation: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+              bld.emit_uniformize(value));
+      break;
+   }
+
    default:
       unreachable("unknown intrinsic");
    }
@@ -4171,7 +4251,7 @@
        */
       brw_mark_surface_used(prog_data,
                             stage_prog_data->binding_table.ssbo_start +
-                            nir->info->num_ssbos - 1);
+                            nir->info.num_ssbos - 1);
    }
 
    fs_reg offset = get_nir_src(instr->src[1]);
diff --git a/src/intel/compiler/brw_fs_sel_peephole.cpp b/src/intel/compiler/brw_fs_sel_peephole.cpp
index 8cd897f..fd02792 100644
--- a/src/intel/compiler/brw_fs_sel_peephole.cpp
+++ b/src/intel/compiler/brw_fs_sel_peephole.cpp
@@ -68,7 +68,8 @@
 {
    int then_movs = 0;
    foreach_inst_in_block(fs_inst, inst, then_block) {
-      if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV)
+      if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
+          inst->flags_written())
          break;
 
       then_mov[then_movs] = inst;
@@ -77,7 +78,8 @@
 
    int else_movs = 0;
    foreach_inst_in_block(fs_inst, inst, else_block) {
-      if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV)
+      if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
+          inst->flags_written())
          break;
 
       else_mov[else_movs] = inst;
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index cea38d8..758c8bf 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -32,50 +32,6 @@
 
 using namespace brw;
 
-fs_reg *
-fs_visitor::emit_vs_system_value(int location)
-{
-   fs_reg *reg = new(this->mem_ctx)
-      fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read),
-             BRW_REGISTER_TYPE_D);
-   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
-
-   switch (location) {
-   case SYSTEM_VALUE_BASE_VERTEX:
-      reg->offset = 0;
-      vs_prog_data->uses_basevertex = true;
-      break;
-   case SYSTEM_VALUE_BASE_INSTANCE:
-      reg->offset = REG_SIZE;
-      vs_prog_data->uses_baseinstance = true;
-      break;
-   case SYSTEM_VALUE_VERTEX_ID:
-      unreachable("should have been lowered");
-   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
-      reg->offset = 2 * REG_SIZE;
-      vs_prog_data->uses_vertexid = true;
-      break;
-   case SYSTEM_VALUE_INSTANCE_ID:
-      reg->offset = 3 * REG_SIZE;
-      vs_prog_data->uses_instanceid = true;
-      break;
-   case SYSTEM_VALUE_DRAW_ID:
-      if (nir->info->system_values_read &
-          (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
-           BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
-           BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
-           BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID)))
-         reg->nr += 4;
-      reg->offset = 0;
-      vs_prog_data->uses_drawid = true;
-      break;
-   default:
-      unreachable("not reached");
-   }
-
-   return reg;
-}
-
 /* Sample from the MCS surface attached to this multisample texture. */
 fs_reg
 fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
@@ -414,13 +370,13 @@
    fs_reg src_depth, src_stencil;
 
    if (source_depth_to_render_target) {
-      if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+      if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
          src_depth = frag_depth;
       else
          src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
    }
 
-   if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
+   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
       src_stencil = frag_stencil;
 
    const fs_reg sources[] = {
@@ -459,7 +415,7 @@
       limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
    }
 
-   if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
+   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
       /* From the 'Render Target Write message' section of the docs:
        * "Output Stencil is not supported with SIMD16 Render Target Write
        * Messages."
diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
index a0b8fb6..5b2ce32 100644
--- a/src/intel/compiler/brw_inst.h
+++ b/src/intel/compiler/brw_inst.h
@@ -88,7 +88,7 @@
    } else {                                                                  \
       high = hi4;  low = lo4;                                                \
    }                                                                         \
-   assert(((int) high) != -1 && ((int) low) != -1);                          \
+   assert(((int) high) != -1 && ((int) low) != -1);
 
 /* A general macro for cases where the field has moved to several different
  * bit locations across generations.  GCC appears to combine cases where the
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 36ccdf3..ce21c01 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -97,49 +97,6 @@
 }
 
 static bool
-remap_vs_attrs(nir_block *block, shader_info *nir_info)
-{
-   nir_foreach_instr(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-      if (intrin->intrinsic == nir_intrinsic_load_input) {
-         /* Attributes come in a contiguous block, ordered by their
-          * gl_vert_attrib value.  That means we can compute the slot
-          * number for an attribute by masking out the enabled attributes
-          * before it and counting the bits.
-          */
-         int attr = intrin->const_index[0];
-         int slot = _mesa_bitcount_64(nir_info->inputs_read &
-                                      BITFIELD64_MASK(attr));
-         intrin->const_index[0] = 4 * slot;
-      }
-   }
-   return true;
-}
-
-static bool
-remap_inputs_with_vue_map(nir_block *block, const struct brw_vue_map *vue_map)
-{
-   nir_foreach_instr(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-      if (intrin->intrinsic == nir_intrinsic_load_input ||
-          intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
-         int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
-         assert(vue_slot != -1);
-         intrin->const_index[0] = vue_slot;
-      }
-   }
-   return true;
-}
-
-static bool
 remap_tess_levels(nir_builder *b, nir_intrinsic_instr *intr,
                   GLenum primitive_mode)
 {
@@ -199,8 +156,8 @@
                         const struct brw_vue_map *vue_map,
                         GLenum tes_primitive_mode)
 {
-   const bool is_passthrough_tcs = b->shader->info->name &&
-      strcmp(b->shader->info->name, "passthrough") == 0;
+   const bool is_passthrough_tcs = b->shader->info.name &&
+      strcmp(b->shader->info.name, "passthrough") == 0;
 
    nir_foreach_instr_safe(instr, block) {
       if (instr->type != nir_instr_type_intrinsic)
@@ -254,7 +211,6 @@
 
 void
 brw_nir_lower_vs_inputs(nir_shader *nir,
-                        bool is_scalar,
                         bool use_legacy_snorm_formula,
                         const uint8_t *vs_attrib_wa_flags)
 {
@@ -277,13 +233,100 @@
    brw_nir_apply_attribute_workarounds(nir, use_legacy_snorm_formula,
                                        vs_attrib_wa_flags);
 
-   if (is_scalar) {
-      /* Finally, translate VERT_ATTRIB_* values into the actual registers. */
+   /* The last step is to remap VERT_ATTRIB_* to actual registers */
 
-      nir_foreach_function(function, nir) {
-         if (function->impl) {
-            nir_foreach_block(block, function->impl) {
-               remap_vs_attrs(block, nir->info);
+   /* Whether or not we have any system generated values.  gl_DrawID is not
+    * included here as it lives in its own vec4.
+    */
+   const bool has_sgvs =
+      nir->info.system_values_read &
+      (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
+       BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
+       BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
+       BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID));
+
+   const unsigned num_inputs = _mesa_bitcount_64(nir->info.inputs_read);
+
+   nir_foreach_function(function, nir) {
+      if (!function->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, function->impl);
+
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+            switch (intrin->intrinsic) {
+            case nir_intrinsic_load_base_vertex:
+            case nir_intrinsic_load_base_instance:
+            case nir_intrinsic_load_vertex_id_zero_base:
+            case nir_intrinsic_load_instance_id:
+            case nir_intrinsic_load_draw_id: {
+               b.cursor = nir_after_instr(&intrin->instr);
+
+               /* gl_VertexID and friends are stored by the VF as the last
+                * vertex element.  We convert them to load_input intrinsics at
+                * the right location.
+                */
+               nir_intrinsic_instr *load =
+                  nir_intrinsic_instr_create(nir, nir_intrinsic_load_input);
+               load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+
+               nir_intrinsic_set_base(load, num_inputs);
+               switch (intrin->intrinsic) {
+               case nir_intrinsic_load_base_vertex:
+                  nir_intrinsic_set_component(load, 0);
+                  break;
+               case nir_intrinsic_load_base_instance:
+                  nir_intrinsic_set_component(load, 1);
+                  break;
+               case nir_intrinsic_load_vertex_id_zero_base:
+                  nir_intrinsic_set_component(load, 2);
+                  break;
+               case nir_intrinsic_load_instance_id:
+                  nir_intrinsic_set_component(load, 3);
+                  break;
+               case nir_intrinsic_load_draw_id:
+                  /* gl_DrawID is stored right after gl_VertexID and friends
+                   * if any of them exist.
+                   */
+                  nir_intrinsic_set_base(load, num_inputs + has_sgvs);
+                  nir_intrinsic_set_component(load, 0);
+                  break;
+               default:
+                  unreachable("Invalid system value intrinsic");
+               }
+
+               load->num_components = 1;
+               nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+               nir_builder_instr_insert(&b, &load->instr);
+
+               nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                        nir_src_for_ssa(&load->dest.ssa));
+               nir_instr_remove(&intrin->instr);
+               break;
+            }
+
+            case nir_intrinsic_load_input: {
+               /* Attributes come in a contiguous block, ordered by their
+                * gl_vert_attrib value.  That means we can compute the slot
+                * number for an attribute by masking out the enabled attributes
+                * before it and counting the bits.
+                */
+               int attr = nir_intrinsic_base(intrin);
+               int slot = _mesa_bitcount_64(nir->info.inputs_read &
+                                            BITFIELD64_MASK(attr));
+               nir_intrinsic_set_base(intrin, slot);
+               break;
+            }
+
+            default:
+               break; /* Nothing to do */
             }
          }
       }
@@ -291,7 +334,7 @@
 }
 
 void
-brw_nir_lower_vue_inputs(nir_shader *nir, bool is_scalar,
+brw_nir_lower_vue_inputs(nir_shader *nir,
                          const struct brw_vue_map *vue_map)
 {
    foreach_list_typed(nir_variable, var, node, &nir->inputs) {
@@ -301,16 +344,42 @@
    /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
    nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
 
-   if (is_scalar || nir->stage != MESA_SHADER_GEOMETRY) {
-      /* This pass needs actual constants */
-      nir_opt_constant_folding(nir);
+   /* This pass needs actual constants */
+   nir_opt_constant_folding(nir);
 
-      add_const_offset_to_base(nir, nir_var_shader_in);
+   add_const_offset_to_base(nir, nir_var_shader_in);
 
-      nir_foreach_function(function, nir) {
-         if (function->impl) {
-            nir_foreach_block(block, function->impl) {
-               remap_inputs_with_vue_map(block, vue_map);
+   nir_foreach_function(function, nir) {
+      if (!function->impl)
+         continue;
+
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+            if (intrin->intrinsic == nir_intrinsic_load_input ||
+                intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
+               /* Offset 0 is the VUE header, which contains
+                * VARYING_SLOT_LAYER [.y], VARYING_SLOT_VIEWPORT [.z], and
+                * VARYING_SLOT_PSIZ [.w].
+                */
+               int varying = nir_intrinsic_base(intrin);
+               int vue_slot;
+               switch (varying) {
+               case VARYING_SLOT_PSIZ:
+                  nir_intrinsic_set_base(intrin, 0);
+                  nir_intrinsic_set_component(intrin, 3);
+                  break;
+
+               default:
+                  vue_slot = vue_map->varying_to_slot[varying];
+                  assert(vue_slot != -1);
+                  nir_intrinsic_set_base(intrin, vue_slot);
+                  break;
+               }
             }
          }
       }
@@ -337,7 +406,7 @@
          nir_builder_init(&b, function->impl);
          nir_foreach_block(block, function->impl) {
             remap_patch_urb_offsets(block, &b, vue_map,
-                                    nir->info->tess.primitive_mode);
+                                    nir->info.tess.primitive_mode);
          }
       }
    }
@@ -484,6 +553,7 @@
       OPT(nir_opt_dce);
       OPT(nir_opt_cse);
       OPT(nir_opt_peephole_select, 0);
+      OPT(nir_opt_intrinsics);
       OPT(nir_opt_algebraic);
       OPT(nir_opt_constant_folding);
       OPT(nir_opt_dead_cf);
@@ -550,6 +620,7 @@
 
    OPT(nir_lower_tex, &tex_options);
    OPT(nir_normalize_cubemap_coords);
+   OPT(nir_lower_read_invocation_to_scalar);
 
    OPT(nir_lower_global_vars_to_local);
 
@@ -605,6 +676,12 @@
 
    UNUSED bool progress; /* Written by OPT */
 
+
+   do {
+      progress = false;
+      OPT(nir_opt_algebraic_before_ffma);
+   } while (progress);
+
    nir = nir_optimize(nir, compiler, is_scalar);
 
    if (devinfo->gen >= 6) {
@@ -695,6 +772,7 @@
    tex_options.lower_y_uv_external = key_tex->y_uv_image_mask;
    tex_options.lower_y_u_v_external = key_tex->y_u_v_image_mask;
    tex_options.lower_yx_xuxv_external = key_tex->yx_xuxv_image_mask;
+   tex_options.lower_xy_uxvx_external = key_tex->xy_uxvx_image_mask;
 
    if (nir_lower_tex(nir, &tex_options)) {
       nir_validate_shader(nir);
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index b96072c..560027c 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -98,10 +98,9 @@
 bool brw_nir_lower_intrinsics(nir_shader *nir,
                               struct brw_stage_prog_data *prog_data);
 void brw_nir_lower_vs_inputs(nir_shader *nir,
-                             bool is_scalar,
                              bool use_legacy_snorm_formula,
                              const uint8_t *vs_attrib_wa_flags);
-void brw_nir_lower_vue_inputs(nir_shader *nir, bool is_scalar,
+void brw_nir_lower_vue_inputs(nir_shader *nir,
                               const struct brw_vue_map *vue_map);
 void brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue);
 void brw_nir_lower_fs_inputs(nir_shader *nir,
@@ -143,6 +142,10 @@
 void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog,
                                 struct brw_stage_prog_data *stage_prog_data);
 
+void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
+                                nir_shader *nir,
+                                struct brw_ubo_range out_ranges[4]);
+
 bool brw_nir_opt_peephole_ffma(nir_shader *shader);
 
 #define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
diff --git a/src/intel/compiler/brw_nir_analyze_ubo_ranges.c b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c
new file mode 100644
index 0000000..097aa8e
--- /dev/null
+++ b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir.h"
+#include "util/u_dynarray.h"
+
+/**
+ * \file brw_nir_analyze_ubo_ranges.c
+ *
+ * This pass decides which portions of UBOs to upload as push constants,
+ * so shaders can access them as part of the thread payload, rather than
+ * having to issue expensive memory reads to pull the data.
+ *
+ * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
+ * buffers, in GRF (256-bit/32-byte) units.
+ *
+ * To do this, we examine NIR load_ubo intrinsics, recording the number of
+ * loads at each offset.  We track offsets at a 32-byte granularity, so even
+ * fields with a bit of padding between them tend to fall into contiguous
+ * ranges.  We build a list of these ranges, tracking their "cost" (number
+ * of registers required) and "benefit" (number of pull loads eliminated
+ * by pushing the range).  We then sort the list to obtain the four best
+ * ranges (most benefit for the least cost).
+ */
+
+struct ubo_range_entry
+{
+   struct brw_ubo_range range;
+   int benefit;
+};
+
+static int
+score(const struct ubo_range_entry *entry)
+{
+   return 2 * entry->benefit - entry->range.length;
+}
+
+/**
+ * Compares two UBO range entries, for a descending qsort().
+ *
+ * Ranks by score(), then by block index, then by starting offset.
+ */
+static int
+cmp_ubo_range_entry(const void *va, const void *vb)
+{
+   const struct ubo_range_entry *a = va;
+   const struct ubo_range_entry *b = vb;
+
+   /* Rank based on scores */
+   int delta = score(b) - score(a);
+
+   /* Then use the UBO block index as a tie-breaker */
+   if (delta == 0)
+      delta = b->range.block - a->range.block;
+
+   /* Finally use the UBO offset as a second tie-breaker */
+   if (delta == 0)
+      delta = b->range.start - a->range.start;
+
+   return delta;
+}
+
+struct ubo_block_info
+{
+   /* Each bit in the offsets bitfield represents a 32-byte section of data.
+    * If it's set to one, there is interesting UBO data at that offset.  If
+    * not, there's a "hole" - padding between data - or just nothing at all.
+    */
+   uint64_t offsets;
+   uint8_t uses[64];
+};
+
+struct ubo_analysis_state
+{
+   struct hash_table *blocks;
+   bool uses_regular_uniforms;
+};
+
+static struct ubo_block_info *
+get_block_info(struct ubo_analysis_state *state, int block)
+{
+   uint32_t hash = block + 1;
+   void *key = (void *) (uintptr_t) hash;
+
+   struct hash_entry *entry =
+      _mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
+
+   if (entry)
+      return (struct ubo_block_info *) entry->data;
+
+   struct ubo_block_info *info =
+      rzalloc(state->blocks, struct ubo_block_info);
+   _mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
+
+   return info;
+}
+
+static void
+analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
+{
+   nir_foreach_instr(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      if (intrin->intrinsic == nir_intrinsic_load_uniform)
+         state->uses_regular_uniforms = true;
+
+      if (intrin->intrinsic != nir_intrinsic_load_ubo)
+         continue;
+
+      nir_const_value *block_const = nir_src_as_const_value(intrin->src[0]);
+      nir_const_value *offset_const = nir_src_as_const_value(intrin->src[1]);
+
+      if (block_const && offset_const) {
+         const int block = block_const->u32[0];
+         const int offset = offset_const->u32[0] / 32;
+
+         /* Won't fit in our bitfield */
+         if (offset >= 64)
+            continue;
+
+         /* TODO: should we count uses in loops as higher benefit? */
+
+         struct ubo_block_info *info = get_block_info(state, block);
+         info->offsets |= 1ull << offset;
+         info->uses[offset]++;
+      }
+   }
+}
+
+/* Debug helper: prints a one-line summary of a UBO range entry to file. */
+static void
+print_ubo_entry(FILE *file, const struct ubo_range_entry *entry,
+                struct ubo_analysis_state *state)
+{
+   struct ubo_block_info *info = get_block_info(state, entry->range.block);
+   /* info->offsets is a uint64_t: cast to match %llx (%zx means size_t). */
+   fprintf(file, "block %2d, start %2d, length %2d, bits = %llx, "
+           "benefit %2d, cost %2d, score = %2d\n",
+           entry->range.block, entry->range.start, entry->range.length,
+           (unsigned long long) info->offsets, entry->benefit,
+           entry->range.length, score(entry));
+}
+
+void
+brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
+                           nir_shader *nir,
+                           struct brw_ubo_range out_ranges[4])
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+
+   if ((devinfo->gen <= 7 && !devinfo->is_haswell) ||
+       !compiler->scalar_stage[nir->stage]) {
+      memset(out_ranges, 0, 4 * sizeof(struct brw_ubo_range));
+      return;
+   }
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   struct ubo_analysis_state state = {
+      .uses_regular_uniforms = false,
+      .blocks =
+         _mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
+   };
+
+   /* Walk the IR, recording how many times each UBO block/offset is used. */
+   nir_foreach_function(function, nir) {
+      if (function->impl) {
+         nir_foreach_block(block, function->impl) {
+            analyze_ubos_block(&state, block);
+         }
+      }
+   }
+
+   /* Find ranges: a block, starting 32-byte offset, and length. */
+   struct util_dynarray ranges;
+   util_dynarray_init(&ranges, mem_ctx);
+
+   struct hash_entry *entry;
+   hash_table_foreach(state.blocks, entry) {
+      const int b = entry->hash - 1;
+      const struct ubo_block_info *info = entry->data;
+      uint64_t offsets = info->offsets;
+
+      /* Walk through the offsets bitfield, finding contiguous regions of
+       * set bits:
+       *
+       *   0000000001111111111111000000000000111111111111110000000011111100
+       *            ^^^^^^^^^^^^^            ^^^^^^^^^^^^^^        ^^^^^^
+       *
+       * Each of these will become a UBO range.
+       */
+      while (offsets != 0) {
+         /* Find the first 1 in the offsets bitfield.  This represents the
+          * start of a range of interesting UBO data.  Make it zero-indexed.
+          */
+         int first_bit = ffsll(offsets) - 1;
+
+         /* Find the first 0 bit in offsets beyond first_bit.  To find the
+          * first zero bit, we find the first 1 bit in the complement.  In
+          * order to ignore bits before first_bit, we mask off those bits.
+          */
+         int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
+
+         if (first_hole == -1) {
+            /* If we didn't find a hole, then set it to the end of the
+             * bitfield.  There are no more ranges to process.
+             */
+            first_hole = 64;
+            offsets = 0;
+         } else {
+            /* We've processed all bits before first_hole.  Mask them off. */
+            offsets &= ~((1ull << first_hole) - 1);
+         }
+
+         struct ubo_range_entry *entry =
+            util_dynarray_grow(&ranges, sizeof(struct ubo_range_entry));
+
+         entry->range.block = b;
+         entry->range.start = first_bit;
+         /* first_hole is one beyond the end, so we don't need to add 1 */
+         entry->range.length = first_hole - first_bit;
+         entry->benefit = 0;
+
+         for (int i = 0; i < entry->range.length; i++)
+            entry->benefit += info->uses[first_bit + i];
+      }
+   }
+
+   int nr_entries = ranges.size / sizeof(struct ubo_range_entry);
+
+   if (0) {
+      util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
+         print_ubo_entry(stderr, entry, &state);
+      }
+   }
+
+   /* TODO: Consider combining ranges.
+    *
+    * We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS.  If there are
+    * more ranges, and two are close by with only a small hole, it may be
+    * worth combining them.  The holes will waste register space, but the
+    * benefit of removing pulls may outweigh that cost.
+    */
+
+   /* Sort the list so the most beneficial ranges are at the front. */
+   qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
+         cmp_ubo_range_entry);
+
+   struct ubo_range_entry *entries = ranges.data;
+
+   /* Return the top 4 or so.  We drop by one if regular uniforms are in
+    * use, assuming one push buffer will be dedicated to those.  We may
+    * also only get 3 on Haswell if we can't write INSTPM.
+    *
+    * The backend may need to shrink these ranges to ensure that they
+    * don't exceed the maximum push constant limits.  It can simply drop
+    * the tail of the list, as that's the least valuable portion.  We
+    * unfortunately can't truncate it here, because we don't know what
+    * the backend is planning to do with regular uniforms.
+    */
+   const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) -
+                        state.uses_regular_uniforms;
+   nr_entries = MIN2(nr_entries, max_ubos);
+
+   for (int i = 0; i < nr_entries; i++) {
+      out_ranges[i] = entries[i].range;
+   }
+   for (int i = nr_entries; i < 4; i++) {
+      out_ranges[i].block = 0;
+      out_ranges[i].start = 0;
+      out_ranges[i].length = 0;
+   }
+
+   ralloc_free(ranges.mem_ctx);
+}
diff --git a/src/intel/compiler/brw_nir_intrinsics.c b/src/intel/compiler/brw_nir_intrinsics.c
index 901a1fb..abbbc6f 100644
--- a/src/intel/compiler/brw_nir_intrinsics.c
+++ b/src/intel/compiler/brw_nir_intrinsics.c
@@ -41,7 +41,7 @@
 {
    nir_builder *b = &state->builder;
    nir_shader *nir = state->nir;
-   const unsigned *sizes = nir->info->cs.local_size;
+   const unsigned *sizes = nir->info.cs.local_size;
    const unsigned group_size = sizes[0] * sizes[1] * sizes[2];
 
    /* Some programs have local_size dimensions so small that the thread local
@@ -88,10 +88,10 @@
          /* We construct the local invocation index from:
           *
           *    gl_LocalInvocationIndex =
-          *       cs_thread_local_id + channel_num;
+          *       cs_thread_local_id + subgroup_invocation;
           */
          nir_ssa_def *thread_local_id = read_thread_local_id(state);
-         nir_ssa_def *channel = nir_load_channel_num(b);
+         nir_ssa_def *channel = nir_load_subgroup_invocation(b);
          sysval = nir_iadd(b, channel, thread_local_id);
          break;
       }
@@ -111,7 +111,7 @@
           *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
           *       gl_WorkGroupSize.z;
           */
-         unsigned *size = nir->info->cs.local_size;
+         unsigned *size = nir->info.cs.local_size;
 
          nir_ssa_def *local_index = nir_load_local_invocation_index(b);
 
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index 304b4ec..53d0742 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -1168,8 +1168,8 @@
    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL];
 
    nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
-   nir->info->inputs_read = key->inputs_read;
-   nir->info->patch_inputs_read = key->patch_inputs_read;
+   nir->info.inputs_read = key->inputs_read;
+   nir->info.patch_inputs_read = key->patch_inputs_read;
 
    nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar);
    brw_nir_lower_tes_inputs(nir, input_vue_map);
@@ -1177,8 +1177,8 @@
    nir = brw_postprocess_nir(nir, compiler, is_scalar);
 
    brw_compute_vue_map(devinfo, &prog_data->base.vue_map,
-                       nir->info->outputs_written,
-                       nir->info->separate_shader);
+                       nir->info.outputs_written,
+                       nir->info.separate_shader);
 
    unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;
 
@@ -1190,13 +1190,21 @@
    }
 
    prog_data->base.clip_distance_mask =
-      ((1 << nir->info->clip_distance_array_size) - 1);
+      ((1 << nir->info.clip_distance_array_size) - 1);
    prog_data->base.cull_distance_mask =
-      ((1 << nir->info->cull_distance_array_size) - 1) <<
-      nir->info->clip_distance_array_size;
+      ((1 << nir->info.cull_distance_array_size) - 1) <<
+      nir->info.clip_distance_array_size;
 
    /* URB entry sizes are stored as a multiple of 64 bytes. */
    prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+
+   /* On Cannonlake software shall not program an allocation size that
+    * specifies a size that is a multiple of 3 64B (512-bit) cachelines.
+    */
+   if (devinfo->gen == 10 &&
+       prog_data->base.urb_entry_size % 3 == 0)
+      prog_data->base.urb_entry_size++;
+
    prog_data->base.urb_read_length = 0;
 
    STATIC_ASSERT(BRW_TESS_PARTITIONING_INTEGER == TESS_SPACING_EQUAL - 1);
@@ -1206,9 +1214,9 @@
                  TESS_SPACING_FRACTIONAL_EVEN - 1);
 
    prog_data->partitioning =
-      (enum brw_tess_partitioning) (nir->info->tess.spacing - 1);
+      (enum brw_tess_partitioning) (nir->info.tess.spacing - 1);
 
-   switch (nir->info->tess.primitive_mode) {
+   switch (nir->info.tess.primitive_mode) {
    case GL_QUADS:
       prog_data->domain = BRW_TESS_DOMAIN_QUAD;
       break;
@@ -1222,14 +1230,14 @@
       unreachable("invalid domain shader primitive mode");
    }
 
-   if (nir->info->tess.point_mode) {
+   if (nir->info.tess.point_mode) {
       prog_data->output_topology = BRW_TESS_OUTPUT_TOPOLOGY_POINT;
-   } else if (nir->info->tess.primitive_mode == GL_ISOLINES) {
+   } else if (nir->info.tess.primitive_mode == GL_ISOLINES) {
       prog_data->output_topology = BRW_TESS_OUTPUT_TOPOLOGY_LINE;
    } else {
       /* Hardware winding order is backwards from OpenGL */
       prog_data->output_topology =
-         nir->info->tess.ccw ? BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW
+         nir->info.tess.ccw ? BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW
                              : BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW;
    }
 
@@ -1259,9 +1267,9 @@
       if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
          g.enable_debug(ralloc_asprintf(mem_ctx,
                                         "%s tessellation evaluation shader %s",
-                                        nir->info->label ? nir->info->label
+                                        nir->info.label ? nir->info.label
                                                         : "unnamed",
-                                        nir->info->name));
+                                        nir->info.name));
       }
 
       g.generate_code(v.cfg, 8);
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index ff90580..410922c 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -985,7 +985,7 @@
     * affected, at least by the 64b restriction, since DepCtrl with double
     * precision instructions seems to produce GPU hangs in some cases.
     */
-   if (devinfo->gen == 8 || devinfo->is_broxton) {
+   if (devinfo->gen == 8 || gen_device_info_is_9lp(devinfo)) {
       if (inst->opcode == BRW_OPCODE_MUL &&
          IS_DWORD(inst->src[0]) &&
          IS_DWORD(inst->src[1]))
@@ -1730,103 +1730,26 @@
 }
 
 
-static inline struct brw_reg
-attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved)
-{
-   struct brw_reg reg;
-
-   unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type));
-   if (interleaved) {
-      reg = stride(brw_vecn_grf(width, attr / 2, (attr % 2) * 4), 0, width, 1);
-   } else {
-      reg = brw_vecn_grf(width, attr, 0);
-   }
-
-   reg.type = type;
-   return reg;
-}
-
-
-/**
- * Replace each register of type ATTR in this->instructions with a reference
- * to a fixed HW register.
- *
- * If interleaved is true, then each attribute takes up half a register, with
- * register N containing attribute 2*N in its first half and attribute 2*N+1
- * in its second half (this corresponds to the payload setup used by geometry
- * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
- * false, then each attribute takes up a whole register, with register N
- * containing attribute N (this corresponds to the payload setup used by
- * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
- */
-void
-vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
-                                          bool interleaved)
-{
-   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      for (int i = 0; i < 3; i++) {
-         if (inst->src[i].file != ATTR)
-            continue;
-
-         int grf = attribute_map[inst->src[i].nr +
-                                 inst->src[i].offset / REG_SIZE];
-         assert(inst->src[i].offset % REG_SIZE == 0);
-
-         /* All attributes used in the shader need to have been assigned a
-          * hardware register by the caller
-          */
-         assert(grf != 0);
-
-         struct brw_reg reg =
-            attribute_to_hw_reg(grf, inst->src[i].type, interleaved);
-         reg.swizzle = inst->src[i].swizzle;
-         if (inst->src[i].abs)
-            reg = brw_abs(reg);
-         if (inst->src[i].negate)
-            reg = negate(reg);
-
-         inst->src[i] = reg;
-      }
-   }
-}
-
 int
 vec4_vs_visitor::setup_attributes(int payload_reg)
 {
-   int nr_attributes;
-   int attribute_map[VERT_ATTRIB_MAX + 2];
-   memset(attribute_map, 0, sizeof(attribute_map));
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file == ATTR) {
+            assert(inst->src[i].offset % REG_SIZE == 0);
+            int grf = payload_reg + inst->src[i].nr +
+                      inst->src[i].offset / REG_SIZE;
 
-   nr_attributes = 0;
-   GLbitfield64 vs_inputs = vs_prog_data->inputs_read;
-   while (vs_inputs) {
-      GLuint first = ffsll(vs_inputs) - 1;
-      int needed_slots =
-         (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) ? 2 : 1;
-      for (int c = 0; c < needed_slots; c++) {
-         attribute_map[first + c] = payload_reg + nr_attributes;
-         nr_attributes++;
-         vs_inputs &= ~BITFIELD64_BIT(first + c);
+            struct brw_reg reg = brw_vec8_grf(grf, 0);
+            reg.swizzle = inst->src[i].swizzle;
+            reg.type = inst->src[i].type;
+            reg.abs = inst->src[i].abs;
+            reg.negate = inst->src[i].negate;
+            inst->src[i] = reg;
+         }
       }
    }
 
-   /* VertexID is stored by the VF as the last vertex element, but we
-    * don't represent it with a flag in inputs_read, so we call it
-    * VERT_ATTRIB_MAX.
-    */
-   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
-       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) {
-      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
-      nr_attributes++;
-   }
-
-   if (vs_prog_data->uses_drawid) {
-      attribute_map[VERT_ATTRIB_MAX + 1] = payload_reg + nr_attributes;
-      nr_attributes++;
-   }
-
-   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
-
    return payload_reg + vs_prog_data->nr_attribute_slots;
 }
 
@@ -1853,6 +1776,9 @@
       reg += ALIGN(uniforms, 2) / 2;
    }
 
+   for (int i = 0; i < 4; i++)
+      reg += stage_prog_data->ubo_ranges[i].length;
+
    stage_prog_data->nr_params = this->uniforms * 4;
 
    prog_data->base.curb_read_length =
@@ -2673,7 +2599,7 @@
       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
          char filename[64];                                            \
          snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass,              \
-                  stage_abbrev, nir->info->name, iteration, pass_num); \
+                  stage_abbrev, nir->info.name, iteration, pass_num); \
                                                                        \
          backend_shader::dump_instructions(filename);                  \
       }                                                                \
@@ -2686,7 +2612,7 @@
    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
       char filename[64];
       snprintf(filename, 64, "%s-%s-00-00-start",
-               stage_abbrev, nir->info->name);
+               stage_abbrev, nir->info.name);
 
       backend_shader::dump_instructions(filename);
    }
@@ -2824,25 +2750,44 @@
    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
    shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar);
-   brw_nir_lower_vs_inputs(shader, is_scalar,
-                           use_legacy_snorm_formula, key->gl_attrib_wa_flags);
-   brw_nir_lower_vue_outputs(shader, is_scalar);
-   shader = brw_postprocess_nir(shader, compiler, is_scalar);
 
    const unsigned *assembly = NULL;
 
+   if (prog_data->base.vue_map.varying_to_slot[VARYING_SLOT_EDGE] != -1) {
+      /* If the output VUE map contains VARYING_SLOT_EDGE then we need to copy
+       * the edge flag from VERT_ATTRIB_EDGEFLAG.  This will be done
+       * automatically by brw_vec4_visitor::emit_urb_slot but we need to
+       * ensure that prog_data->inputs_read is accurate.
+       *
+       * In order to make late NIR passes aware of the change, we actually
+       * whack shader->info.inputs_read instead.  This is safe because we just
+       * made a copy of the shader.
+       */
+      assert(!is_scalar);
+      assert(key->copy_edgeflag);
+      shader->info.inputs_read |= VERT_BIT_EDGEFLAG;
+   }
+
+   prog_data->inputs_read = shader->info.inputs_read;
+   prog_data->double_inputs_read = shader->info.double_inputs_read;
+
+   brw_nir_lower_vs_inputs(shader, use_legacy_snorm_formula,
+                           key->gl_attrib_wa_flags);
+   brw_nir_lower_vue_outputs(shader, is_scalar);
+   shader = brw_postprocess_nir(shader, compiler, is_scalar);
+
    prog_data->base.clip_distance_mask =
-      ((1 << shader->info->clip_distance_array_size) - 1);
+      ((1 << shader->info.clip_distance_array_size) - 1);
    prog_data->base.cull_distance_mask =
-      ((1 << shader->info->cull_distance_array_size) - 1) <<
-      shader->info->clip_distance_array_size;
+      ((1 << shader->info.cull_distance_array_size) - 1) <<
+      shader->info.clip_distance_array_size;
 
    unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read);
 
    /* gl_VertexID and gl_InstanceID are system values, but arrive via an
     * incoming vertex attribute.  So, add an extra slot.
     */
-   if (shader->info->system_values_read &
+   if (shader->info.system_values_read &
        (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
         BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
         BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
@@ -2850,14 +2795,31 @@
       nr_attribute_slots++;
    }
 
+   if (shader->info.system_values_read &
+       BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX))
+      prog_data->uses_basevertex = true;
+
+   if (shader->info.system_values_read &
+       BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE))
+      prog_data->uses_baseinstance = true;
+
+   if (shader->info.system_values_read &
+       BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE))
+      prog_data->uses_vertexid = true;
+
+   if (shader->info.system_values_read &
+       BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))
+      prog_data->uses_instanceid = true;
+
    /* gl_DrawID has its very own vec4 */
-   if (shader->info->system_values_read &
+   if (shader->info.system_values_read &
        BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
+      prog_data->uses_drawid = true;
       nr_attribute_slots++;
    }
 
    unsigned nr_attributes = nr_attribute_slots -
-      DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), 2);
+      DIV_ROUND_UP(_mesa_bitcount_64(shader->info.double_inputs_read), 2);
 
    /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
     * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
@@ -2880,10 +2842,17 @@
    const unsigned vue_entries =
       MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots);
 
-   if (compiler->devinfo->gen == 6)
+   if (compiler->devinfo->gen == 6) {
       prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
-   else
+   } else {
       prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
+      /* On Cannonlake software shall not program an allocation size that
+       * specifies a size that is a multiple of 3 64B (512-bit) cachelines.
+       */
+      if (compiler->devinfo->gen == 10 &&
+          prog_data->base.urb_entry_size % 3 == 0)
+         prog_data->base.urb_entry_size++;
+   }
 
    if (INTEL_DEBUG & DEBUG_VS) {
       fprintf(stderr, "VS Output ");
@@ -2911,9 +2880,9 @@
       if (INTEL_DEBUG & DEBUG_VS) {
          const char *debug_name =
             ralloc_asprintf(mem_ctx, "%s vertex shader %s",
-                            shader->info->label ? shader->info->label :
+                            shader->info.label ? shader->info.label :
                                "unnamed",
-                            shader->info->name);
+                            shader->info.name);
 
          g.enable_debug(debug_name);
       }
diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h
index 89adfaa..d828da0 100644
--- a/src/intel/compiler/brw_vec4.h
+++ b/src/intel/compiler/brw_vec4.h
@@ -332,8 +332,6 @@
 
    virtual void emit_nir_code();
    virtual void nir_setup_uniforms();
-   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
-   virtual void nir_setup_system_values();
    virtual void nir_emit_impl(nir_function_impl *impl);
    virtual void nir_emit_cf_list(exec_list *list);
    virtual void nir_emit_if(nir_if *if_stmt);
@@ -359,16 +357,11 @@
                        unsigned num_components = 4);
    src_reg get_indirect_offset(nir_intrinsic_instr *instr);
 
-   virtual dst_reg *make_reg_for_system_value(int location) = 0;
-
    dst_reg *nir_locals;
    dst_reg *nir_ssa_values;
-   dst_reg *nir_system_values;
 
 protected:
    void emit_vertex();
-   void lower_attributes_to_hw_regs(const int *attribute_map,
-                                    bool interleaved);
    void setup_payload_interference(struct ra_graph *g, int first_payload_node,
                                    int reg_node_count);
    virtual void setup_payload() = 0;
diff --git a/src/intel/compiler/brw_vec4_cmod_propagation.cpp b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
index 4454cdb..0d72d82 100644
--- a/src/intel/compiler/brw_vec4_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
@@ -129,6 +129,31 @@
                 scan_inst->opcode == BRW_OPCODE_CMPN)
                break;
 
+            /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
+             *
+             *    * Note that the [post condition signal] bits generated at
+             *      the output of a compute are before the .sat.
+             */
+            if (scan_inst->saturate)
+               break;
+
+            /* From the Sky Lake PRM, Vol 2a, "Multiply":
+             *
+             *    "When multiplying integer data types, if one of the sources
+             *    is a DW, the resulting full precision data is stored in
+             *    the accumulator. However, if the destination data type is
+             *    either W or DW, the low bits of the result are written to
+             *    the destination register and the remaining high bits are
+             *    discarded. This results in undefined Overflow and Sign
+             *    flags. Therefore, conditional modifiers and saturation
+             *    (.sat) cannot be used in this case."
+             *
+             * We just disallow cmod propagation on all integer multiplies.
+             */
+            if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
+                scan_inst->opcode == BRW_OPCODE_MUL)
+               break;
+
             /* Otherwise, try propagating the conditional. */
             enum brw_conditional_mod cond =
                inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 753b00c..334933d 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -2180,10 +2180,14 @@
    annotation_finalize(&annotation, p->next_insn_offset);
 
 #ifndef NDEBUG
-   bool validated = brw_validate_instructions(p, 0, &annotation);
+   bool validated = brw_validate_instructions(devinfo, p->store,
+                                              0, p->next_insn_offset,
+                                              &annotation);
 #else
    if (unlikely(debug_flag))
-      brw_validate_instructions(p, 0, &annotation);
+      brw_validate_instructions(devinfo, p->store,
+                                0, p->next_insn_offset,
+                                &annotation);
 #endif
 
    int before_size = p->next_insn_offset;
@@ -2192,8 +2196,8 @@
 
    if (unlikely(debug_flag)) {
       fprintf(stderr, "Native code for %s %s shader %s:\n",
-              nir->info->label ? nir->info->label : "unnamed",
-              _mesa_shader_stage_to_string(nir->stage), nir->info->name);
+              nir->info.label ? nir->info.label : "unnamed",
+              _mesa_shader_stage_to_string(nir->stage), nir->info.name);
 
       fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
                       "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
diff --git a/src/intel/compiler/brw_vec4_gs_nir.cpp b/src/intel/compiler/brw_vec4_gs_nir.cpp
index ed8c03b..77b753a 100644
--- a/src/intel/compiler/brw_vec4_gs_nir.cpp
+++ b/src/intel/compiler/brw_vec4_gs_nir.cpp
@@ -31,28 +31,6 @@
 }
 
 void
-vec4_gs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
-{
-   dst_reg *reg;
-
-   switch (instr->intrinsic) {
-   case nir_intrinsic_load_primitive_id:
-      /* We'll just read g1 directly; don't create a temporary. */
-      break;
-
-   case nir_intrinsic_load_invocation_id:
-      reg = &this->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
-      if (reg->file == BAD_FILE)
-         *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_INVOCATION_ID);
-      break;
-
-   default:
-      vec4_visitor::nir_setup_system_value_intrinsic(instr);
-   }
-
-}
-
-void
 vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 {
    dst_reg dest;
@@ -66,8 +44,10 @@
       nir_const_value *vertex = nir_src_as_const_value(instr->src[0]);
       nir_const_value *offset_reg = nir_src_as_const_value(instr->src[1]);
 
+      const unsigned input_array_stride = prog_data->urb_read_length * 2;
+
       if (nir_dest_bit_size(instr->dest) == 64) {
-         src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u32[0] +
+         src = src_reg(ATTR, input_array_stride * vertex->u32[0] +
                        instr->const_index[0] + offset_reg->u32[0],
                        glsl_type::dvec4_type);
 
@@ -85,15 +65,11 @@
          /* Make up a type...we have no way of knowing... */
          const glsl_type *const type = glsl_type::ivec(instr->num_components);
 
-         src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u32[0] +
+         src = src_reg(ATTR, input_array_stride * vertex->u32[0] +
                        instr->const_index[0] + offset_reg->u32[0],
                        type);
          src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
 
-         /* gl_PointSize is passed in the .w component of the VUE header */
-         if (instr->const_index[0] == VARYING_SLOT_PSIZ)
-            src.swizzle = BRW_SWIZZLE_WWWW;
-
          dest = get_nir_dest(instr->dest, src.type);
          dest.writemask = brw_writemask_for_size(instr->num_components);
          emit(MOV(dest, src));
@@ -130,11 +106,11 @@
       break;
 
    case nir_intrinsic_load_invocation_id: {
-      src_reg invocation_id =
-         src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]);
-      assert(invocation_id.file != BAD_FILE);
-      dest = get_nir_dest(instr->dest, invocation_id.type);
-      emit(MOV(dest, invocation_id));
+      dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      if (gs_prog_data->invocations > 1)
+         emit(GS_OPCODE_GET_INSTANCE_ID, dest);
+      else
+         emit(MOV(dest, brw_imm_ud(0)));
       break;
    }
 
diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp
index 5fcd028..a8e445c 100644
--- a/src/intel/compiler/brw_vec4_gs_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp
@@ -29,6 +29,7 @@
 
 #include "brw_vec4_gs_visitor.h"
 #include "gen6_gs_visitor.h"
+#include "brw_cfg.h"
 #include "brw_fs.h"
 #include "brw_nir.h"
 #include "common/gen_debug.h"
@@ -52,29 +53,36 @@
 }
 
 
-dst_reg *
-vec4_gs_visitor::make_reg_for_system_value(int location)
+static inline struct brw_reg
+attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved)
 {
-   dst_reg *reg = new(mem_ctx) dst_reg(this, glsl_type::int_type);
+   struct brw_reg reg;
 
-   switch (location) {
-   case SYSTEM_VALUE_INVOCATION_ID:
-      this->current_annotation = "initialize gl_InvocationID";
-      if (gs_prog_data->invocations > 1)
-         emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
-      else
-         emit(MOV(*reg, brw_imm_ud(0)));
-      break;
-   default:
-      unreachable("not reached");
+   unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type));
+   if (interleaved) {
+      reg = stride(brw_vecn_grf(width, attr / 2, (attr % 2) * 4), 0, width, 1);
+   } else {
+      reg = brw_vecn_grf(width, attr, 0);
    }
 
+   reg.type = type;
    return reg;
 }
 
-
+/**
+ * Replace each register of type ATTR in this->instructions with a reference
+ * to a fixed HW register.
+ *
+ * If attributes_per_reg is greater than 1 (interleaved), each attribute
+ * takes up half a register, with register N containing attribute 2*N in its
+ * first half and attribute 2*N+1 in its second half (this is the payload
+ * setup used by geometry shaders in "single" or "dual instanced" dispatch
+ * mode).  Otherwise each attribute takes up a whole register, with register
+ * N containing attribute N (this is the payload setup used by vertex
+ * shaders, and by geometry shaders in "dual object" dispatch mode).
+ */
 int
-vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
+vec4_gs_visitor::setup_varying_inputs(int payload_reg,
                                       int attributes_per_reg)
 {
    /* For geometry shaders there are N copies of the input attributes, where N
@@ -85,16 +93,28 @@
     * so the total number of input slots that will be delivered to the GS (and
     * thus the stride of the input arrays) is urb_read_length * 2.
     */
-   const unsigned num_input_vertices = nir->info->gs.vertices_in;
+   const unsigned num_input_vertices = nir->info.gs.vertices_in;
    assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
    unsigned input_array_stride = prog_data->urb_read_length * 2;
 
-   for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
-      int varying = c->input_vue_map.slot_to_varying[slot];
-      for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
-         attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
-            attributes_per_reg * payload_reg + input_array_stride * vertex +
-            slot;
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file != ATTR)
+            continue;
+
+         assert(inst->src[i].offset % REG_SIZE == 0);
+         int grf = payload_reg * attributes_per_reg +
+                   inst->src[i].nr + inst->src[i].offset / REG_SIZE;
+
+         struct brw_reg reg =
+            attribute_to_hw_reg(grf, inst->src[i].type, attributes_per_reg > 1);
+         reg.swizzle = inst->src[i].swizzle;
+         if (inst->src[i].abs)
+            reg = brw_abs(reg);
+         if (inst->src[i].negate)
+            reg = negate(reg);
+
+         inst->src[i] = reg;
       }
    }
 
@@ -103,25 +123,15 @@
    return payload_reg + regs_used;
 }
 
-
 void
 vec4_gs_visitor::setup_payload()
 {
-   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
-
    /* If we are in dual instanced or single mode, then attributes are going
     * to be interleaved, so one register contains two attribute slots.
     */
    int attributes_per_reg =
       prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
 
-   /* If a geometry shader tries to read from an input that wasn't written by
-    * the vertex shader, that produces undefined results, but it shouldn't
-    * crash anything.  So initialize attribute_map to zeros--that ensures that
-    * these undefined results are read from r0.
-    */
-   memset(attribute_map, 0, sizeof(attribute_map));
-
    int reg = 0;
 
    /* The payload always contains important data in r0, which contains
@@ -132,13 +142,11 @@
 
    /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
    if (gs_prog_data->include_primitive_id)
-      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
+      reg++;
 
    reg = setup_uniforms(reg);
 
-   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
-
-   lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);
+   reg = setup_varying_inputs(reg, attributes_per_reg);
 
    this->first_non_payload_grf = reg;
 }
@@ -414,7 +422,7 @@
    assert(c->control_data_bits_per_vertex == 2);
 
    /* Must be a valid stream */
-   assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
+   assert(stream_id < MAX_VERTEX_STREAMS);
 
    /* Control data bits are initialized to 0 so we don't have to set any
     * bits when sending vertices to stream 0.
@@ -455,7 +463,7 @@
     * be recorded by transform feedback, we can simply discard all geometry
     * bound to these streams when transform feedback is disabled.
     */
-   if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
+   if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
       return;
 
    /* If we're outputting 32 control data bits or less, then we can wait
@@ -628,32 +636,32 @@
     * For SSO pipelines, we use a fixed VUE map layout based on variable
     * locations, so we can rely on rendezvous-by-location making this work.
     */
-   GLbitfield64 inputs_read = shader->info->inputs_read;
+   GLbitfield64 inputs_read = shader->info.inputs_read;
    brw_compute_vue_map(compiler->devinfo,
                        &c.input_vue_map, inputs_read,
-                       shader->info->separate_shader);
+                       shader->info.separate_shader);
 
    shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar);
-   brw_nir_lower_vue_inputs(shader, is_scalar, &c.input_vue_map);
+   brw_nir_lower_vue_inputs(shader, &c.input_vue_map);
    brw_nir_lower_vue_outputs(shader, is_scalar);
    shader = brw_postprocess_nir(shader, compiler, is_scalar);
 
    prog_data->base.clip_distance_mask =
-      ((1 << shader->info->clip_distance_array_size) - 1);
+      ((1 << shader->info.clip_distance_array_size) - 1);
    prog_data->base.cull_distance_mask =
-      ((1 << shader->info->cull_distance_array_size) - 1) <<
-      shader->info->clip_distance_array_size;
+      ((1 << shader->info.cull_distance_array_size) - 1) <<
+      shader->info.clip_distance_array_size;
 
    prog_data->include_primitive_id =
-      (shader->info->system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID)) != 0;
+      (shader->info.system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID)) != 0;
 
-   prog_data->invocations = shader->info->gs.invocations;
+   prog_data->invocations = shader->info.gs.invocations;
 
    if (compiler->devinfo->gen >= 8)
       prog_data->static_vertex_count = nir_gs_count_vertices(shader);
 
    if (compiler->devinfo->gen >= 7) {
-      if (shader->info->gs.output_primitive == GL_POINTS) {
+      if (shader->info.gs.output_primitive == GL_POINTS) {
          /* When the output type is points, the geometry shader may output data
           * to multiple streams, and EndPrimitive() has no effect.  So we
           * configure the hardware to interpret the control data as stream ID.
@@ -678,14 +686,14 @@
           * EndPrimitive().
           */
          c.control_data_bits_per_vertex =
-            shader->info->gs.uses_end_primitive ? 1 : 0;
+            shader->info.gs.uses_end_primitive ? 1 : 0;
       }
    } else {
       /* There are no control data bits in gen6. */
       c.control_data_bits_per_vertex = 0;
    }
    c.control_data_header_size_bits =
-      shader->info->gs.vertices_out * c.control_data_bits_per_vertex;
+      shader->info.gs.vertices_out * c.control_data_bits_per_vertex;
 
    /* 1 HWORD = 32 bytes = 256 bits */
    prog_data->control_data_header_size_hwords =
@@ -780,7 +788,7 @@
    unsigned output_size_bytes;
    if (compiler->devinfo->gen >= 7) {
       output_size_bytes =
-         prog_data->output_vertex_size_hwords * 32 * shader->info->gs.vertices_out;
+         prog_data->output_vertex_size_hwords * 32 * shader->info.gs.vertices_out;
       output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
    } else {
       output_size_bytes = prog_data->output_vertex_size_hwords * 32;
@@ -809,16 +817,23 @@
    /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
     * a multiple of 128 bytes in gen6.
     */
-   if (compiler->devinfo->gen >= 7)
+   if (compiler->devinfo->gen >= 7) {
       prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
-   else
+      /* On Cannonlake software shall not program an allocation size that
+       * specifies a size that is a multiple of 3 64B (512-bit) cachelines.
+       */
+      if (compiler->devinfo->gen == 10 &&
+          prog_data->base.urb_entry_size % 3 == 0)
+         prog_data->base.urb_entry_size++;
+   } else {
       prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
+   }
 
-   assert(shader->info->gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
+   assert(shader->info.gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
    prog_data->output_topology =
-      gl_prim_to_hw_prim[shader->info->gs.output_primitive];
+      gl_prim_to_hw_prim[shader->info.gs.output_primitive];
 
-   prog_data->vertices_in = shader->info->gs.vertices_in;
+   prog_data->vertices_in = shader->info.gs.vertices_in;
 
    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).
@@ -847,9 +862,9 @@
                         false, MESA_SHADER_GEOMETRY);
          if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
             const char *label =
-               shader->info->label ? shader->info->label : "unnamed";
+               shader->info.label ? shader->info.label : "unnamed";
             char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s",
-                                         label, shader->info->name);
+                                         label, shader->info.name);
             g.enable_debug(name);
          }
          g.generate_code(v.cfg, 8);
@@ -897,6 +912,7 @@
             memcpy(prog_data->base.base.param, param,
                    sizeof(gl_constant_value*) * param_count);
             prog_data->base.base.nr_params = param_count;
+            prog_data->base.base.nr_pull_params = 0;
             ralloc_free(param);
          }
       }
diff --git a/src/intel/compiler/brw_vec4_gs_visitor.h b/src/intel/compiler/brw_vec4_gs_visitor.h
index 09221f9..c656559 100644
--- a/src/intel/compiler/brw_vec4_gs_visitor.h
+++ b/src/intel/compiler/brw_vec4_gs_visitor.h
@@ -50,10 +50,8 @@
                    int shader_time_index);
 
    virtual void nir_setup_inputs();
-   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(int location);
    virtual void setup_payload();
    virtual void emit_prolog();
    virtual void emit_thread_end();
@@ -64,8 +62,7 @@
    virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
 
 protected:
-   int setup_varying_inputs(int payload_reg, int *attribute_map,
-                            int attributes_per_reg);
+   int setup_varying_inputs(int payload_reg, int attributes_per_reg);
    void emit_control_data_bits();
    void set_stream_control_data_bits(unsigned stream_id);
 
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
index 80115ac..9bd1bba 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -37,8 +37,6 @@
    if (nir->num_uniforms > 0)
       nir_setup_uniforms();
 
-   nir_setup_system_values();
-
    /* get the main function and emit it */
    nir_foreach_function(function, nir) {
       assert(strcmp(function->name, "main") == 0);
@@ -48,81 +46,6 @@
 }
 
 void
-vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
-{
-   dst_reg *reg;
-
-   switch (instr->intrinsic) {
-   case nir_intrinsic_load_vertex_id:
-      unreachable("should be lowered by lower_vertex_id().");
-
-   case nir_intrinsic_load_vertex_id_zero_base:
-      reg = &nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
-      break;
-
-   case nir_intrinsic_load_base_vertex:
-      reg = &nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_VERTEX);
-      break;
-
-   case nir_intrinsic_load_instance_id:
-      reg = &nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_INSTANCE_ID);
-      break;
-
-   case nir_intrinsic_load_base_instance:
-      reg = &nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_INSTANCE);
-      break;
-
-   case nir_intrinsic_load_draw_id:
-      reg = &nir_system_values[SYSTEM_VALUE_DRAW_ID];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_DRAW_ID);
-      break;
-
-   default:
-      break;
-   }
-}
-
-static bool
-setup_system_values_block(nir_block *block, vec4_visitor *v)
-{
-   nir_foreach_instr(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-      v->nir_setup_system_value_intrinsic(intrin);
-   }
-
-   return true;
-}
-
-void
-vec4_visitor::nir_setup_system_values()
-{
-   nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX);
-   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
-      nir_system_values[i] = dst_reg();
-   }
-
-   nir_foreach_function(function, nir) {
-      assert(strcmp(function->name, "main") == 0);
-      assert(function->impl);
-      nir_foreach_block(block, function->impl) {
-         setup_system_values_block(block, this);
-      }
-   }
-}
-
-void
 vec4_visitor::nir_setup_uniforms()
 {
    uniforms = nir->num_uniforms / 16;
@@ -570,7 +493,7 @@
 
          brw_mark_surface_used(&prog_data->base,
                                prog_data->base.binding_table.ssbo_start +
-                               nir->info->num_ssbos - 1);
+                               nir->info.num_ssbos - 1);
       }
 
       /* Offset */
@@ -736,7 +659,7 @@
           */
          brw_mark_surface_used(&prog_data->base,
                                prog_data->base.binding_table.ssbo_start +
-                               nir->info->num_ssbos - 1);
+                               nir->info.num_ssbos - 1);
       }
 
       src_reg offset_reg;
@@ -826,14 +749,8 @@
    case nir_intrinsic_load_instance_id:
    case nir_intrinsic_load_base_instance:
    case nir_intrinsic_load_draw_id:
-   case nir_intrinsic_load_invocation_id: {
-      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
-      src_reg val = src_reg(nir_system_values[sv]);
-      assert(val.file != BAD_FILE);
-      dest = get_nir_dest(instr->dest, val.type);
-      emit(MOV(dest, val));
-      break;
-   }
+   case nir_intrinsic_load_invocation_id:
+      unreachable("should be lowered by brw_nir_lower_vs_inputs()");
 
    case nir_intrinsic_load_uniform: {
       /* Offsets are in bytes but they should always be multiples of 4 */
@@ -887,9 +804,17 @@
       break;
    }
 
-   case nir_intrinsic_atomic_counter_read:
    case nir_intrinsic_atomic_counter_inc:
-   case nir_intrinsic_atomic_counter_dec: {
+   case nir_intrinsic_atomic_counter_dec:
+   case nir_intrinsic_atomic_counter_read:
+   case nir_intrinsic_atomic_counter_add:
+   case nir_intrinsic_atomic_counter_min:
+   case nir_intrinsic_atomic_counter_max:
+   case nir_intrinsic_atomic_counter_and:
+   case nir_intrinsic_atomic_counter_or:
+   case nir_intrinsic_atomic_counter_xor:
+   case nir_intrinsic_atomic_counter_exchange:
+   case nir_intrinsic_atomic_counter_comp_swap: {
       unsigned surf_index = prog_data->base.binding_table.abo_start +
          (unsigned) instr->const_index[0];
       const vec4_builder bld =
@@ -955,7 +880,7 @@
           */
          brw_mark_surface_used(&prog_data->base,
                                prog_data->base.binding_table.ubo_start +
-                               nir->info->num_ubos - 1);
+                               nir->info.num_ubos - 1);
       }
 
       src_reg offset_reg;
@@ -1053,7 +978,7 @@
        */
       brw_mark_surface_used(&prog_data->base,
                             prog_data->base.binding_table.ssbo_start +
-                            nir->info->num_ssbos - 1);
+                            nir->info.num_ssbos - 1);
    }
 
    src_reg offset = get_nir_src(instr->src[1], 1);
@@ -2302,6 +2227,15 @@
       }
    }
 
+   /* TXS and TXL require a LOD but not everything we implement using those
+    * two opcodes provides one.  Provide a default LOD of 0.
+    */
+   if ((instr->op == nir_texop_txs ||
+        instr->op == nir_texop_txl) &&
+       lod.file == BAD_FILE) {
+      lod = brw_imm_ud(0u);
+   }
+
    if (instr->op == nir_texop_txf_ms ||
        instr->op == nir_texop_samples_identical) {
       assert(coord_type != NULL);
diff --git a/src/intel/compiler/brw_vec4_surface_builder.cpp b/src/intel/compiler/brw_vec4_surface_builder.cpp
index 00c94fe..75386cf 100644
--- a/src/intel/compiler/brw_vec4_surface_builder.cpp
+++ b/src/intel/compiler/brw_vec4_surface_builder.cpp
@@ -212,10 +212,15 @@
          const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
          const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD);
 
-         if (size >= 1)
-            bld.MOV(writemask(srcs, WRITEMASK_X), src0);
-         if (size >= 2)
-            bld.MOV(writemask(srcs, WRITEMASK_Y), src1);
+         if (size >= 1) {
+            bld.MOV(writemask(srcs, WRITEMASK_X),
+                    swizzle(src0, BRW_SWIZZLE_XXXX));
+         }
+
+         if (size >= 2) {
+            bld.MOV(writemask(srcs, WRITEMASK_Y),
+                    swizzle(src1, BRW_SWIZZLE_XXXX));
+         }
 
          return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC, src_reg(),
                           emit_insert(bld, addr, dims, has_simd4x2),
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp
index d4a647d..c4d9f89 100644
--- a/src/intel/compiler/brw_vec4_tcs.cpp
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -50,18 +50,6 @@
 
 
 void
-vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
-{
-}
-
-dst_reg *
-vec4_tcs_visitor::make_reg_for_system_value(int location)
-{
-   return NULL;
-}
-
-
-void
 vec4_tcs_visitor::setup_payload()
 {
    int reg = 0;
@@ -95,9 +83,9 @@
     * HS instance dispatched will only have its bottom half doing real
     * work, and so we need to disable the upper half:
     */
-   if (nir->info->tess.tcs_vertices_out % 2) {
+   if (nir->info.tess.tcs_vertices_out % 2) {
       emit(CMP(dst_null_d(), invocation_id,
-               brw_imm_ud(nir->info->tess.tcs_vertices_out),
+               brw_imm_ud(nir->info.tess.tcs_vertices_out),
                BRW_CONDITIONAL_L));
 
       /* Matching ENDIF is in emit_thread_end() */
@@ -112,7 +100,7 @@
    vec4_instruction *inst;
    current_annotation = "thread end";
 
-   if (nir->info->tess.tcs_vertices_out % 2) {
+   if (nir->info.tess.tcs_vertices_out % 2) {
       emit(BRW_OPCODE_ENDIF);
    }
 
@@ -402,18 +390,18 @@
    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
 
    nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
-   nir->info->outputs_written = key->outputs_written;
-   nir->info->patch_outputs_written = key->patch_outputs_written;
+   nir->info.outputs_written = key->outputs_written;
+   nir->info.patch_outputs_written = key->patch_outputs_written;
 
    struct brw_vue_map input_vue_map;
-   brw_compute_vue_map(devinfo, &input_vue_map, nir->info->inputs_read,
-                       nir->info->separate_shader);
+   brw_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
+                       nir->info.separate_shader);
    brw_compute_tess_vue_map(&vue_prog_data->vue_map,
-                            nir->info->outputs_written,
-                            nir->info->patch_outputs_written);
+                            nir->info.outputs_written,
+                            nir->info.patch_outputs_written);
 
    nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar);
-   brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map);
+   brw_nir_lower_vue_inputs(nir, &input_vue_map);
    brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
                              key->tes_primitive_mode);
    if (key->quads_workaround)
@@ -422,9 +410,9 @@
    nir = brw_postprocess_nir(nir, compiler, is_scalar);
 
    if (is_scalar)
-      prog_data->instances = DIV_ROUND_UP(nir->info->tess.tcs_vertices_out, 8);
+      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 8);
    else
-      prog_data->instances = DIV_ROUND_UP(nir->info->tess.tcs_vertices_out, 2);
+      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 2);
 
    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
@@ -443,7 +431,7 @@
    unsigned output_size_bytes = 0;
    /* Note that the patch header is counted in num_per_patch_slots. */
    output_size_bytes += num_per_patch_slots * 16;
-   output_size_bytes += nir->info->tess.tcs_vertices_out *
+   output_size_bytes += nir->info.tess.tcs_vertices_out *
                         num_per_vertex_slots * 16;
 
    assert(output_size_bytes >= 1);
@@ -453,6 +441,13 @@
    /* URB entry sizes are stored as a multiple of 64 bytes. */
    vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
 
+   /* On Cannonlake software shall not program an allocation size that
+    * specifies a size that is a multiple of 3 64B (512-bit) cachelines.
+    */
+   if (devinfo->gen == 10 &&
+       vue_prog_data->urb_entry_size % 3 == 0)
+      vue_prog_data->urb_entry_size++;
+
    /* HS does not use the usual payload pushing from URB to GRFs,
     * because we don't have enough registers for a full-size payload, and
     * the hardware is broken on Haswell anyway.
@@ -485,9 +480,9 @@
       if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
          g.enable_debug(ralloc_asprintf(mem_ctx,
                                         "%s tessellation control shader %s",
-                                        nir->info->label ? nir->info->label
+                                        nir->info.label ? nir->info.label
                                                         : "unnamed",
-                                        nir->info->name));
+                                        nir->info.name));
       }
 
       g.generate_code(v.cfg, 8);
diff --git a/src/intel/compiler/brw_vec4_tcs.h b/src/intel/compiler/brw_vec4_tcs.h
index 030eb5e..efa13ec 100644
--- a/src/intel/compiler/brw_vec4_tcs.h
+++ b/src/intel/compiler/brw_vec4_tcs.h
@@ -49,8 +49,6 @@
                     const struct brw_vue_map *input_vue_map);
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(int location);
-   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
    virtual void setup_payload();
    virtual void emit_prolog();
    virtual void emit_thread_end();
diff --git a/src/intel/compiler/brw_vec4_tes.cpp b/src/intel/compiler/brw_vec4_tes.cpp
index bcf9a87..35aff0f 100644
--- a/src/intel/compiler/brw_vec4_tes.cpp
+++ b/src/intel/compiler/brw_vec4_tes.cpp
@@ -45,26 +45,6 @@
 {
 }
 
-
-dst_reg *
-vec4_tes_visitor::make_reg_for_system_value(int location)
-{
-   return NULL;
-}
-
-void
-vec4_tes_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
-{
-   switch (instr->intrinsic) {
-   case nir_intrinsic_load_tess_level_outer:
-   case nir_intrinsic_load_tess_level_inner:
-      break;
-   default:
-      vec4_visitor::nir_setup_system_value_intrinsic(instr);
-   }
-}
-
-
 void
 vec4_tes_visitor::setup_payload()
 {
diff --git a/src/intel/compiler/brw_vec4_tes.h b/src/intel/compiler/brw_vec4_tes.h
index 31a28f3..7da3e5f 100644
--- a/src/intel/compiler/brw_vec4_tes.h
+++ b/src/intel/compiler/brw_vec4_tes.h
@@ -47,8 +47,6 @@
                    int shader_time_index);
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(int location);
-   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
    virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
 
    virtual void setup_payload();
diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp
index 262a084..22ee4dd 100644
--- a/src/intel/compiler/brw_vec4_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_visitor.cpp
@@ -1315,7 +1315,7 @@
       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
       break;
-   case VARYING_SLOT_EDGE:
+   case VARYING_SLOT_EDGE: {
       /* This is present when doing unfilled polygons.  We're supposed to copy
        * the edge flag from the user-provided vertex array
        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
@@ -1323,9 +1323,12 @@
        * determine which edges should be drawn as wireframe.
        */
       current_annotation = "edge flag";
-      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
+      int edge_attr = _mesa_bitcount_64(nir->info.inputs_read &
+                                        BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
+      emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
                                     glsl_type::float_type, WRITEMASK_XYZW))));
       break;
+   }
    case BRW_VARYING_SLOT_PAD:
       /* No need to write to this slot */
       break;
diff --git a/src/intel/compiler/brw_vec4_vs.h b/src/intel/compiler/brw_vec4_vs.h
index 8c346d7..cd07e0e 100644
--- a/src/intel/compiler/brw_vec4_vs.h
+++ b/src/intel/compiler/brw_vec4_vs.h
@@ -42,7 +42,6 @@
                    bool use_legacy_snorm_formula);
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(int location);
    virtual void setup_payload();
    virtual void emit_prolog();
    virtual void emit_thread_end();
diff --git a/src/intel/compiler/brw_vec4_vs_visitor.cpp b/src/intel/compiler/brw_vec4_vs_visitor.cpp
index 0cec779..ad7f067 100644
--- a/src/intel/compiler/brw_vec4_vs_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_vs_visitor.cpp
@@ -33,46 +33,6 @@
 }
 
 
-dst_reg *
-vec4_vs_visitor::make_reg_for_system_value(int location)
-{
-   /* VertexID is stored by the VF as the last vertex element, but
-    * we don't represent it with a flag in inputs_read, so we call
-    * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
-    */
-   dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
-
-   switch (location) {
-   case SYSTEM_VALUE_BASE_VERTEX:
-      reg->writemask = WRITEMASK_X;
-      vs_prog_data->uses_basevertex = true;
-      break;
-   case SYSTEM_VALUE_BASE_INSTANCE:
-      reg->writemask = WRITEMASK_Y;
-      vs_prog_data->uses_baseinstance = true;
-      break;
-   case SYSTEM_VALUE_VERTEX_ID:
-   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
-      reg->writemask = WRITEMASK_Z;
-      vs_prog_data->uses_vertexid = true;
-      break;
-   case SYSTEM_VALUE_INSTANCE_ID:
-      reg->writemask = WRITEMASK_W;
-      vs_prog_data->uses_instanceid = true;
-      break;
-   case SYSTEM_VALUE_DRAW_ID:
-      reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX + 1);
-      reg->writemask = WRITEMASK_X;
-      vs_prog_data->uses_drawid = true;
-      break;
-   default:
-      unreachable("not reached");
-   }
-
-   return reg;
-}
-
-
 void
 vec4_vs_visitor::emit_urb_write_header(int mrf)
 {
diff --git a/src/intel/compiler/brw_wm_iz.cpp b/src/intel/compiler/brw_wm_iz.cpp
index 11d4f76..fead165 100644
--- a/src/intel/compiler/brw_wm_iz.cpp
+++ b/src/intel/compiler/brw_wm_iz.cpp
@@ -142,7 +142,7 @@
    }
 
    prog_data->uses_src_depth =
-      (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
    if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
        kill_stats_promoted_workaround) {
       payload.source_depth_reg = reg;
diff --git a/src/intel/compiler/gen6_gs_visitor.cpp b/src/intel/compiler/gen6_gs_visitor.cpp
index 075bc4a..66c69fb 100644
--- a/src/intel/compiler/gen6_gs_visitor.cpp
+++ b/src/intel/compiler/gen6_gs_visitor.cpp
@@ -64,7 +64,7 @@
    this->vertex_output = src_reg(this,
                                  glsl_type::uint_type,
                                  (prog_data->vue_map.num_slots + 1) *
-                                 nir->info->gs.vertices_out);
+                                 nir->info.gs.vertices_out);
    this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
    emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
 
@@ -178,7 +178,7 @@
    dst_reg dst(this->vertex_output);
    dst.reladdr = ralloc(mem_ctx, src_reg);
    memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
-   if (nir->info->gs.output_primitive == GL_POINTS) {
+   if (nir->info.gs.output_primitive == GL_POINTS) {
       /* If we are outputting points, then every vertex has PrimStart and
        * PrimEnd set.
        */
@@ -207,7 +207,7 @@
    /* Calling EndPrimitive() is optional for point output. In this case we set
     * the PrimEnd flag when we process EmitVertex().
     */
-   if (nir->info->gs.output_primitive == GL_POINTS)
+   if (nir->info.gs.output_primitive == GL_POINTS)
       return;
 
    /* Otherwise we know that the last vertex we have processed was the last
@@ -219,7 +219,7 @@
     * comparison below (hence the num_output_vertices + 1 in the comparison
     * below).
     */
-   unsigned num_output_vertices = nir->info->gs.vertices_out;
+   unsigned num_output_vertices = nir->info.gs.vertices_out;
    emit(CMP(dst_null_ud(), this->vertex_count,
             brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
    vec4_instruction *inst = emit(CMP(dst_null_ud(),
@@ -323,7 +323,7 @@
     * first_vertex is not zero. This is only relevant for outputs other than
     * points because in the point case we set PrimEnd on all vertices.
     */
-   if (nir->info->gs.output_primitive != GL_POINTS) {
+   if (nir->info.gs.output_primitive != GL_POINTS) {
       emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
       emit(IF(BRW_PREDICATE_NORMAL));
       gs_end_primitive();
@@ -516,9 +516,7 @@
 
    reg = setup_uniforms(reg);
 
-   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
-
-   lower_attributes_to_hw_regs(attribute_map, true);
+   reg = setup_varying_inputs(reg, attributes_per_reg);
 
    this->first_non_payload_grf = reg;
 }
@@ -625,7 +623,7 @@
    emit(BRW_OPCODE_ENDIF);
 
    /* Write transform feedback data for all processed vertices. */
-   for (int i = 0; i < (int)nir->info->gs.vertices_out; i++) {
+   for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
       emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
       emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
                BRW_CONDITIONAL_L));
@@ -689,18 +687,7 @@
          emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
          memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
          data.type = output_reg[varying][0].type;
-
-         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
-          * same slot, so make sure we write the appropriate channel
-          */
-         if (varying == VARYING_SLOT_PSIZ)
-            data.swizzle = BRW_SWIZZLE_WWWW;
-         else if (varying == VARYING_SLOT_LAYER)
-            data.swizzle = BRW_SWIZZLE_YYYY;
-         else if (varying == VARYING_SLOT_VIEWPORT)
-            data.swizzle = BRW_SWIZZLE_ZZZZ;
-         else
-            data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
+         data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
 
          /* Write data */
          inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
diff --git a/src/intel/compiler/intel_asm_annotation.c b/src/intel/compiler/intel_asm_annotation.c
index 1f3b784..b07a545 100644
--- a/src/intel/compiler/intel_asm_annotation.c
+++ b/src/intel/compiler/intel_asm_annotation.c
@@ -27,6 +27,8 @@
 #include "intel_asm_annotation.h"
 #include "compiler/nir/nir.h"
 
+__attribute__((weak)) void nir_print_instr(const nir_instr *instr, FILE *fp) {} /* Weak no-op fallback: both arguments are ignored; a non-weak nir_print_instr, if linked in, takes precedence. NOTE(review): presumably lets this file link where the NIR printer is not built -- confirm. */
+
 void
 dump_assembly(void *assembly, int num_annotations, struct annotation *annotation,
               const struct gen_device_info *devinfo)
diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp
index 76652dc..09f4cc1 100644
--- a/src/intel/compiler/test_eu_validate.cpp
+++ b/src/intel/compiler/test_eu_validate.cpp
@@ -118,7 +118,8 @@
       annotation.ann[annotation.ann_count].offset = p->next_insn_offset;
    }
 
-   bool ret = brw_validate_instructions(p, 0, &annotation);
+   bool ret = brw_validate_instructions(p->devinfo, p->store, 0,
+                                        p->next_insn_offset, &annotation);
 
    if (print) {
       dump_assembly(p->store, annotation.ann_count, annotation.ann, p->devinfo);
diff --git a/src/intel/genxml/BUILD.gn b/src/intel/genxml/BUILD.gn
index 15384f8..d041bee 100644
--- a/src/intel/genxml/BUILD.gn
+++ b/src/intel/genxml/BUILD.gn
@@ -125,6 +125,32 @@
   ]
 }
 
+action("gen_pack10_header") {  # Build action declaring gen10_pack.h as its output and gen10.xml as its source.
+  output_name = "gen10_pack.h"  # File name of the generated header.
+  script_name = "gen_pack_header.py"  # Generator script; handed to the wrapper via args below.
+
+  script = "$mesa_build_root/scripts/gn_script_wrapper.py"  # NOTE(review): wrapper presumably runs script_name with the given python -- confirm.
+
+  outputs = [
+    "$target_gen_dir/$output_name",
+  ]
+
+  sources = [
+    "gen10.xml",
+  ]
+
+  inputs = [  # The generator script is listed as an input alongside the XML source.
+    "$script_name",
+  ]
+
+  args = [  # Order: python path, output file path, generator script path, XML input path.
+    "$magma_python_path",
+    rebase_path("$target_gen_dir") + "/$output_name",
+    rebase_path(".") + "/$script_name",
+    rebase_path(".") + "/gen10.xml",
+  ]
+}
+
 action("gen_bits_header") {
   output_name = "genX_bits.h"
   script_name = "gen_bits_header.py"
diff --git a/src/intel/genxml/gen10.xml b/src/intel/genxml/gen10.xml
new file mode 100644
index 0000000..a7ae49a
--- /dev/null
+++ b/src/intel/genxml/gen10.xml
@@ -0,0 +1,3755 @@
+<?xml version="1.0" ?>
+<genxml name="CNL" gen="10">
+  <enum name="3D_Prim_Topo_Type" prefix="3DPRIM">
+    <value name="POINTLIST" value="1"/>
+    <value name="LINELIST" value="2"/>
+    <value name="LINESTRIP" value="3"/>
+    <value name="TRILIST" value="4"/>
+    <value name="TRISTRIP" value="5"/>
+    <value name="TRIFAN" value="6"/>
+    <value name="QUADLIST" value="7"/>
+    <value name="QUADSTRIP" value="8"/>
+    <value name="LINELIST_ADJ" value="9"/>
+    <value name="LINESTRIP_ADJ" value="10"/>
+    <value name="TRILIST_ADJ" value="11"/>
+    <value name="TRISTRIP_ADJ" value="12"/>
+    <value name="TRISTRIP_REVERSE" value="13"/>
+    <value name="POLYGON" value="14"/>
+    <value name="RECTLIST" value="15"/>
+    <value name="LINELOOP" value="16"/>
+    <value name="POINTLIST_BF" value="17"/>
+    <value name="LINESTRIP_CONT" value="18"/>
+    <value name="LINESTRIP_BF" value="19"/>
+    <value name="LINESTRIP_CONT_BF" value="20"/>
+    <value name="TRIFAN_NOSTIPPLE" value="22"/>
+    <value name="PATCHLIST_1" value="32"/>
+    <value name="PATCHLIST_2" value="33"/>
+    <value name="PATCHLIST_3" value="34"/>
+    <value name="PATCHLIST_4" value="35"/>
+    <value name="PATCHLIST_5" value="36"/>
+    <value name="PATCHLIST_6" value="37"/>
+    <value name="PATCHLIST_7" value="38"/>
+    <value name="PATCHLIST_8" value="39"/>
+    <value name="PATCHLIST_9" value="40"/>
+    <value name="PATCHLIST_10" value="41"/>
+    <value name="PATCHLIST_11" value="42"/>
+    <value name="PATCHLIST_12" value="43"/>
+    <value name="PATCHLIST_13" value="44"/>
+    <value name="PATCHLIST_14" value="45"/>
+    <value name="PATCHLIST_15" value="46"/>
+    <value name="PATCHLIST_16" value="47"/>
+    <value name="PATCHLIST_17" value="48"/>
+    <value name="PATCHLIST_18" value="49"/>
+    <value name="PATCHLIST_19" value="50"/>
+    <value name="PATCHLIST_20" value="51"/>
+    <value name="PATCHLIST_21" value="52"/>
+    <value name="PATCHLIST_22" value="53"/>
+    <value name="PATCHLIST_23" value="54"/>
+    <value name="PATCHLIST_24" value="55"/>
+    <value name="PATCHLIST_25" value="56"/>
+    <value name="PATCHLIST_26" value="57"/>
+    <value name="PATCHLIST_27" value="58"/>
+    <value name="PATCHLIST_28" value="59"/>
+    <value name="PATCHLIST_29" value="60"/>
+    <value name="PATCHLIST_30" value="61"/>
+    <value name="PATCHLIST_31" value="62"/>
+    <value name="PATCHLIST_32" value="63"/>
+  </enum>
+
+  <enum name="3D_Vertex_Component_Control" prefix="VFCOMP">
+    <value name="NOSTORE" value="0"/>
+    <value name="STORE_SRC" value="1"/>
+    <value name="STORE_0" value="2"/>
+    <value name="STORE_1_FP" value="3"/>
+    <value name="STORE_1_INT" value="4"/>
+    <value name="STORE_PID" value="7"/>
+  </enum>
+
+  <enum name="COMPONENT_ENABLES" prefix="CE">
+    <value name="NONE" value="0"/>
+    <value name="X" value="1"/>
+    <value name="Y" value="2"/>
+    <value name="XY" value="3"/>
+    <value name="Z" value="4"/>
+    <value name="XZ" value="5"/>
+    <value name="YZ" value="6"/>
+    <value name="XYZ" value="7"/>
+    <value name="W" value="8"/>
+    <value name="XW" value="9"/>
+    <value name="YW" value="10"/>
+    <value name="XYW" value="11"/>
+    <value name="ZW" value="12"/>
+    <value name="XZW" value="13"/>
+    <value name="YZW" value="14"/>
+    <value name="XYZW" value="15"/>
+  </enum>
+
+  <enum name="Attribute_Component_Format" prefix="ACF">
+    <value name="disabled" value="0"/>
+    <value name=".xy" value="1"/>
+    <value name=".xyz" value="2"/>
+    <value name=".xyzw" value="3"/>
+  </enum>
+
+  <enum name="WRAP_SHORTEST_ENABLE" prefix="WSE">
+    <value name="X" value="1"/>
+    <value name="Y" value="2"/>
+    <value name="XY" value="3"/>
+    <value name="Z" value="4"/>
+    <value name="XZ" value="5"/>
+    <value name="YZ" value="6"/>
+    <value name="XYZ" value="7"/>
+    <value name="W" value="8"/>
+    <value name="XW" value="9"/>
+    <value name="YW" value="10"/>
+    <value name="XYW" value="11"/>
+    <value name="ZW" value="12"/>
+    <value name="XZW" value="13"/>
+    <value name="YZW" value="14"/>
+    <value name="XYZW" value="15"/>
+  </enum>
+
+  <enum name="3D_Stencil_Operation" prefix="STENCILOP">
+    <value name="KEEP" value="0"/>
+    <value name="ZERO" value="1"/>
+    <value name="REPLACE" value="2"/>
+    <value name="INCRSAT" value="3"/>
+    <value name="DECRSAT" value="4"/>
+    <value name="INCR" value="5"/>
+    <value name="DECR" value="6"/>
+    <value name="INVERT" value="7"/>
+  </enum>
+
+  <enum name="3D_Color_Buffer_Blend_Factor" prefix="BLENDFACTOR">
+    <value name="ONE" value="1"/>
+    <value name="SRC_COLOR" value="2"/>
+    <value name="SRC_ALPHA" value="3"/>
+    <value name="DST_ALPHA" value="4"/>
+    <value name="DST_COLOR" value="5"/>
+    <value name="SRC_ALPHA_SATURATE" value="6"/>
+    <value name="CONST_COLOR" value="7"/>
+    <value name="CONST_ALPHA" value="8"/>
+    <value name="SRC1_COLOR" value="9"/>
+    <value name="SRC1_ALPHA" value="10"/>
+    <value name="ZERO" value="17"/>
+    <value name="INV_SRC_COLOR" value="18"/>
+    <value name="INV_SRC_ALPHA" value="19"/>
+    <value name="INV_DST_ALPHA" value="20"/>
+    <value name="INV_DST_COLOR" value="21"/>
+    <value name="INV_CONST_COLOR" value="23"/>
+    <value name="INV_CONST_ALPHA" value="24"/>
+    <value name="INV_SRC1_COLOR" value="25"/>
+    <value name="INV_SRC1_ALPHA" value="26"/>
+  </enum>
+
+  <enum name="3D_Color_Buffer_Blend_Function" prefix="BLENDFUNCTION">
+    <value name="ADD" value="0"/>
+    <value name="SUBTRACT" value="1"/>
+    <value name="REVERSE_SUBTRACT" value="2"/>
+    <value name="MIN" value="3"/>
+    <value name="MAX" value="4"/>
+  </enum>
+
+  <enum name="3D_Compare_Function" prefix="COMPAREFUNCTION">
+    <value name="ALWAYS" value="0"/>
+    <value name="NEVER" value="1"/>
+    <value name="LESS" value="2"/>
+    <value name="EQUAL" value="3"/>
+    <value name="LEQUAL" value="4"/>
+    <value name="GREATER" value="5"/>
+    <value name="NOTEQUAL" value="6"/>
+    <value name="GEQUAL" value="7"/>
+  </enum>
+
+  <enum name="3D_Logic_Op_Function" prefix="LOGICOP">
+    <value name="CLEAR" value="0"/>
+    <value name="NOR" value="1"/>
+    <value name="AND_INVERTED" value="2"/>
+    <value name="COPY_INVERTED" value="3"/>
+    <value name="AND_REVERSE" value="4"/>
+    <value name="INVERT" value="5"/>
+    <value name="XOR" value="6"/>
+    <value name="NAND" value="7"/>
+    <value name="AND" value="8"/>
+    <value name="EQUIV" value="9"/>
+    <value name="NOOP" value="10"/>
+    <value name="OR_INVERTED" value="11"/>
+    <value name="COPY" value="12"/>
+    <value name="OR_REVERSE" value="13"/>
+    <value name="OR" value="14"/>
+    <value name="SET" value="15"/>
+  </enum>
+
+  <enum name="SURFACE_FORMAT" prefix="SF">
+    <value name="R32G32B32A32_FLOAT" value="0"/>
+    <value name="R32G32B32A32_SINT" value="1"/>
+    <value name="R32G32B32A32_UINT" value="2"/>
+    <value name="R32G32B32A32_UNORM" value="3"/>
+    <value name="R32G32B32A32_SNORM" value="4"/>
+    <value name="R64G64_FLOAT" value="5"/>
+    <value name="R32G32B32X32_FLOAT" value="6"/>
+    <value name="R32G32B32A32_SSCALED" value="7"/>
+    <value name="R32G32B32A32_USCALED" value="8"/>
+    <value name="R32G32B32A32_SFIXED" value="32"/>
+    <value name="R64G64_PASSTHRU" value="33"/>
+    <value name="R32G32B32_FLOAT" value="64"/>
+    <value name="R32G32B32_SINT" value="65"/>
+    <value name="R32G32B32_UINT" value="66"/>
+    <value name="R32G32B32_UNORM" value="67"/>
+    <value name="R32G32B32_SNORM" value="68"/>
+    <value name="R32G32B32_SSCALED" value="69"/>
+    <value name="R32G32B32_USCALED" value="70"/>
+    <value name="R32G32B32_SFIXED" value="80"/>
+    <value name="R16G16B16A16_UNORM" value="128"/>
+    <value name="R16G16B16A16_SNORM" value="129"/>
+    <value name="R16G16B16A16_SINT" value="130"/>
+    <value name="R16G16B16A16_UINT" value="131"/>
+    <value name="R16G16B16A16_FLOAT" value="132"/>
+    <value name="R32G32_FLOAT" value="133"/>
+    <value name="R32G32_SINT" value="134"/>
+    <value name="R32G32_UINT" value="135"/>
+    <value name="R32_FLOAT_X8X24_TYPELESS" value="136"/>
+    <value name="X32_TYPELESS_G8X24_UINT" value="137"/>
+    <value name="L32A32_FLOAT" value="138"/>
+    <value name="R32G32_UNORM" value="139"/>
+    <value name="R32G32_SNORM" value="140"/>
+    <value name="R64_FLOAT" value="141"/>
+    <value name="R16G16B16X16_UNORM" value="142"/>
+    <value name="R16G16B16X16_FLOAT" value="143"/>
+    <value name="A32X32_FLOAT" value="144"/>
+    <value name="L32X32_FLOAT" value="145"/>
+    <value name="I32X32_FLOAT" value="146"/>
+    <value name="R16G16B16A16_SSCALED" value="147"/>
+    <value name="R16G16B16A16_USCALED" value="148"/>
+    <value name="R32G32_SSCALED" value="149"/>
+    <value name="R32G32_USCALED" value="150"/>
+    <value name="R32G32_SFIXED" value="160"/>
+    <value name="R64_PASSTHRU" value="161"/>
+    <value name="B8G8R8A8_UNORM" value="192"/>
+    <value name="B8G8R8A8_UNORM_SRGB" value="193"/>
+    <value name="R10G10B10A2_UNORM" value="194"/>
+    <value name="R10G10B10A2_UNORM_SRGB" value="195"/>
+    <value name="R10G10B10A2_UINT" value="196"/>
+    <value name="R10G10B10_SNORM_A2_UNORM" value="197"/>
+    <value name="R8G8B8A8_UNORM" value="199"/>
+    <value name="R8G8B8A8_UNORM_SRGB" value="200"/>
+    <value name="R8G8B8A8_SNORM" value="201"/>
+    <value name="R8G8B8A8_SINT" value="202"/>
+    <value name="R8G8B8A8_UINT" value="203"/>
+    <value name="R16G16_UNORM" value="204"/>
+    <value name="R16G16_SNORM" value="205"/>
+    <value name="R16G16_SINT" value="206"/>
+    <value name="R16G16_UINT" value="207"/>
+    <value name="R16G16_FLOAT" value="208"/>
+    <value name="B10G10R10A2_UNORM" value="209"/>
+    <value name="B10G10R10A2_UNORM_SRGB" value="210"/>
+    <value name="R11G11B10_FLOAT" value="211"/>
+    <value name="R32_SINT" value="214"/>
+    <value name="R32_UINT" value="215"/>
+    <value name="R32_FLOAT" value="216"/>
+    <value name="R24_UNORM_X8_TYPELESS" value="217"/>
+    <value name="X24_TYPELESS_G8_UINT" value="218"/>
+    <value name="L32_UNORM" value="221"/>
+    <value name="A32_UNORM" value="222"/>
+    <value name="L16A16_UNORM" value="223"/>
+    <value name="I24X8_UNORM" value="224"/>
+    <value name="L24X8_UNORM" value="225"/>
+    <value name="A24X8_UNORM" value="226"/>
+    <value name="I32_FLOAT" value="227"/>
+    <value name="L32_FLOAT" value="228"/>
+    <value name="A32_FLOAT" value="229"/>
+    <value name="X8B8_UNORM_G8R8_SNORM" value="230"/>
+    <value name="A8X8_UNORM_G8R8_SNORM" value="231"/>
+    <value name="B8X8_UNORM_G8R8_SNORM" value="232"/>
+    <value name="B8G8R8X8_UNORM" value="233"/>
+    <value name="B8G8R8X8_UNORM_SRGB" value="234"/>
+    <value name="R8G8B8X8_UNORM" value="235"/>
+    <value name="R8G8B8X8_UNORM_SRGB" value="236"/>
+    <value name="R9G9B9E5_SHAREDEXP" value="237"/>
+    <value name="B10G10R10X2_UNORM" value="238"/>
+    <value name="L16A16_FLOAT" value="240"/>
+    <value name="R32_UNORM" value="241"/>
+    <value name="R32_SNORM" value="242"/>
+    <value name="R10G10B10X2_USCALED" value="243"/>
+    <value name="R8G8B8A8_SSCALED" value="244"/>
+    <value name="R8G8B8A8_USCALED" value="245"/>
+    <value name="R16G16_SSCALED" value="246"/>
+    <value name="R16G16_USCALED" value="247"/>
+    <value name="R32_SSCALED" value="248"/>
+    <value name="R32_USCALED" value="249"/>
+    <value name="B5G6R5_UNORM" value="256"/>
+    <value name="B5G6R5_UNORM_SRGB" value="257"/>
+    <value name="B5G5R5A1_UNORM" value="258"/>
+    <value name="B5G5R5A1_UNORM_SRGB" value="259"/>
+    <value name="B4G4R4A4_UNORM" value="260"/>
+    <value name="B4G4R4A4_UNORM_SRGB" value="261"/>
+    <value name="R8G8_UNORM" value="262"/>
+    <value name="R8G8_SNORM" value="263"/>
+    <value name="R8G8_SINT" value="264"/>
+    <value name="R8G8_UINT" value="265"/>
+    <value name="R16_UNORM" value="266"/>
+    <value name="R16_SNORM" value="267"/>
+    <value name="R16_SINT" value="268"/>
+    <value name="R16_UINT" value="269"/>
+    <value name="R16_FLOAT" value="270"/>
+    <value name="A8P8_UNORM_PALETTE0" value="271"/>
+    <value name="A8P8_UNORM_PALETTE1" value="272"/>
+    <value name="I16_UNORM" value="273"/>
+    <value name="L16_UNORM" value="274"/>
+    <value name="A16_UNORM" value="275"/>
+    <value name="L8A8_UNORM" value="276"/>
+    <value name="I16_FLOAT" value="277"/>
+    <value name="L16_FLOAT" value="278"/>
+    <value name="A16_FLOAT" value="279"/>
+    <value name="L8A8_UNORM_SRGB" value="280"/>
+    <value name="R5G5_SNORM_B6_UNORM" value="281"/>
+    <value name="B5G5R5X1_UNORM" value="282"/>
+    <value name="B5G5R5X1_UNORM_SRGB" value="283"/>
+    <value name="R8G8_SSCALED" value="284"/>
+    <value name="R8G8_USCALED" value="285"/>
+    <value name="R16_SSCALED" value="286"/>
+    <value name="R16_USCALED" value="287"/>
+    <value name="P8A8_UNORM_PALETTE0" value="290"/>
+    <value name="P8A8_UNORM_PALETTE1" value="291"/>
+    <value name="A1B5G5R5_UNORM" value="292"/>
+    <value name="A4B4G4R4_UNORM" value="293"/>
+    <value name="L8A8_UINT" value="294"/>
+    <value name="L8A8_SINT" value="295"/>
+    <value name="R8_UNORM" value="320"/>
+    <value name="R8_SNORM" value="321"/>
+    <value name="R8_SINT" value="322"/>
+    <value name="R8_UINT" value="323"/>
+    <value name="A8_UNORM" value="324"/>
+    <value name="I8_UNORM" value="325"/>
+    <value name="L8_UNORM" value="326"/>
+    <value name="P4A4_UNORM_PALETTE0" value="327"/>
+    <value name="A4P4_UNORM_PALETTE0" value="328"/>
+    <value name="R8_SSCALED" value="329"/>
+    <value name="R8_USCALED" value="330"/>
+    <value name="P8_UNORM_PALETTE0" value="331"/>
+    <value name="L8_UNORM_SRGB" value="332"/>
+    <value name="P8_UNORM_PALETTE1" value="333"/>
+    <value name="P4A4_UNORM_PALETTE1" value="334"/>
+    <value name="A4P4_UNORM_PALETTE1" value="335"/>
+    <value name="Y8_UNORM" value="336"/>
+    <value name="L8_UINT" value="338"/>
+    <value name="L8_SINT" value="339"/>
+    <value name="I8_UINT" value="340"/>
+    <value name="I8_SINT" value="341"/>
+    <value name="DXT1_RGB_SRGB" value="384"/>
+    <value name="R1_UNORM" value="385"/>
+    <value name="YCRCB_NORMAL" value="386"/>
+    <value name="YCRCB_SWAPUVY" value="387"/>
+    <value name="P2_UNORM_PALETTE0" value="388"/>
+    <value name="P2_UNORM_PALETTE1" value="389"/>
+    <value name="BC1_UNORM" value="390"/>
+    <value name="BC2_UNORM" value="391"/>
+    <value name="BC3_UNORM" value="392"/>
+    <value name="BC4_UNORM" value="393"/>
+    <value name="BC5_UNORM" value="394"/>
+    <value name="BC1_UNORM_SRGB" value="395"/>
+    <value name="BC2_UNORM_SRGB" value="396"/>
+    <value name="BC3_UNORM_SRGB" value="397"/>
+    <value name="MONO8" value="398"/>
+    <value name="YCRCB_SWAPUV" value="399"/>
+    <value name="YCRCB_SWAPY" value="400"/>
+    <value name="DXT1_RGB" value="401"/>
+    <value name="FXT1" value="402"/>
+    <value name="R8G8B8_UNORM" value="403"/>
+    <value name="R8G8B8_SNORM" value="404"/>
+    <value name="R8G8B8_SSCALED" value="405"/>
+    <value name="R8G8B8_USCALED" value="406"/>
+    <value name="R64G64B64A64_FLOAT" value="407"/>
+    <value name="R64G64B64_FLOAT" value="408"/>
+    <value name="BC4_SNORM" value="409"/>
+    <value name="BC5_SNORM" value="410"/>
+    <value name="R16G16B16_FLOAT" value="411"/>
+    <value name="R16G16B16_UNORM" value="412"/>
+    <value name="R16G16B16_SNORM" value="413"/>
+    <value name="R16G16B16_SSCALED" value="414"/>
+    <value name="R16G16B16_USCALED" value="415"/>
+    <value name="BC6H_SF16" value="417"/>
+    <value name="BC7_UNORM" value="418"/>
+    <value name="BC7_UNORM_SRGB" value="419"/>
+    <value name="BC6H_UF16" value="420"/>
+    <value name="PLANAR_420_8" value="421"/>
+    <value name="PLANAR_420_16" value="422"/>
+    <value name="R8G8B8_UNORM_SRGB" value="424"/>
+    <value name="ETC1_RGB8" value="425"/>
+    <value name="ETC2_RGB8" value="426"/>
+    <value name="EAC_R11" value="427"/>
+    <value name="EAC_RG11" value="428"/>
+    <value name="EAC_SIGNED_R11" value="429"/>
+    <value name="EAC_SIGNED_RG11" value="430"/>
+    <value name="ETC2_SRGB8" value="431"/>
+    <value name="R16G16B16_UINT" value="432"/>
+    <value name="R16G16B16_SINT" value="433"/>
+    <value name="R32_SFIXED" value="434"/>
+    <value name="R10G10B10A2_SNORM" value="435"/>
+    <value name="R10G10B10A2_USCALED" value="436"/>
+    <value name="R10G10B10A2_SSCALED" value="437"/>
+    <value name="R10G10B10A2_SINT" value="438"/>
+    <value name="B10G10R10A2_SNORM" value="439"/>
+    <value name="B10G10R10A2_USCALED" value="440"/>
+    <value name="B10G10R10A2_SSCALED" value="441"/>
+    <value name="B10G10R10A2_UINT" value="442"/>
+    <value name="B10G10R10A2_SINT" value="443"/>
+    <value name="R64G64B64A64_PASSTHRU" value="444"/>
+    <value name="R64G64B64_PASSTHRU" value="445"/>
+    <value name="ETC2_RGB8_PTA" value="448"/>
+    <value name="ETC2_SRGB8_PTA" value="449"/>
+    <value name="ETC2_EAC_RGBA8" value="450"/>
+    <value name="ETC2_EAC_SRGB8_A8" value="451"/>
+    <value name="R8G8B8_UINT" value="456"/>
+    <value name="R8G8B8_SINT" value="457"/>
+    <value name="RAW" value="511"/>
+  </enum>
+
+  <enum name="Shader Channel Select" prefix="SCS">
+    <value name="ZERO" value="0"/>
+    <value name="ONE" value="1"/>
+    <value name="RED" value="4"/>
+    <value name="GREEN" value="5"/>
+    <value name="BLUE" value="6"/>
+    <value name="ALPHA" value="7"/>
+  </enum>
+
+  <enum name="Texture Coordinate Mode" prefix="TCM">
+    <value name="WRAP" value="0"/>
+    <value name="MIRROR" value="1"/>
+    <value name="CLAMP" value="2"/>
+    <value name="CUBE" value="3"/>
+    <value name="CLAMP_BORDER" value="4"/>
+    <value name="MIRROR_ONCE" value="5"/>
+    <value name="HALF_BORDER" value="6"/>
+  </enum>
+
+  <struct name="3DSTATE_CONSTANT_BODY" length="10">
+    <group count="4" start="0" size="16">
+      <field name="Read Length" start="0" end="15" type="uint"/>
+    </group>
+    <group count="4" start="64" size="64">
+      <field name="Buffer" start="5" end="63" type="address"/>
+    </group>
+  </struct>
+
+  <struct name="BINDING_TABLE_EDIT_ENTRY" length="1">
+    <field name="Binding Table Index" start="16" end="23" type="uint"/>
+    <field name="Surface State Pointer" start="0" end="15" type="offset"/>
+  </struct>
+
+  <struct name="GATHER_CONSTANT_ENTRY" length="1">
+    <field name="Constant Buffer Offset" start="8" end="15" type="offset"/>
+    <field name="Channel Mask" start="4" end="7" type="uint"/>
+    <field name="Binding Table Index Offset" start="0" end="3" type="uint"/>
+  </struct>
+
+  <struct name="MEMORY_OBJECT_CONTROL_STATE" length="1">
+    <field name="Index to MOCS Tables" start="1" end="6" type="uint"/>
+  </struct>
+
+  <struct name="VERTEX_BUFFER_STATE" length="4">
+    <field name="Vertex Buffer Index" start="26" end="31" type="uint"/>
+    <field name="Memory Object Control State" start="16" end="22" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Vertex Buffer MOCS" start="16" end="22" type="uint"/>
+    <field name="Address Modify Enable" start="14" end="14" type="bool"/>
+    <field name="Null Vertex Buffer" start="13" end="13" type="bool"/>
+    <field name="Buffer Pitch" start="0" end="11" type="uint"/>
+    <field name="Buffer Starting Address" start="32" end="95" type="address"/>
+    <field name="Buffer Size" start="96" end="127" type="uint"/>
+  </struct>
+
+  <struct name="VERTEX_ELEMENT_STATE" length="2">
+    <field name="Vertex Buffer Index" start="26" end="31" type="uint"/>
+    <field name="Valid" start="25" end="25" type="bool"/>
+    <field name="Source Element Format" start="16" end="24" type="SURFACE_FORMAT"/>
+    <field name="Edge Flag Enable" start="15" end="15" type="bool"/>
+    <field name="Source Element Offset" start="0" end="11" type="uint"/>
+    <field name="Component 0 Control" start="60" end="62" type="3D_Vertex_Component_Control"/>
+    <field name="Component 1 Control" start="56" end="58" type="3D_Vertex_Component_Control"/>
+    <field name="Component 2 Control" start="52" end="54" type="3D_Vertex_Component_Control"/>
+    <field name="Component 3 Control" start="48" end="50" type="3D_Vertex_Component_Control"/>
+  </struct>
+
+  <struct name="SO_DECL" length="1">
+    <field name="Output Buffer Slot" start="12" end="13" type="uint"/>
+    <field name="Hole Flag" start="11" end="11" type="uint"/>
+    <field name="Register Index" start="4" end="9" type="uint"/>
+    <field name="Component Mask" start="0" end="3" type="uint" default="0"/>
+  </struct>
+
+  <struct name="SO_DECL_ENTRY" length="2">
+    <field name="Stream 3 Decl" start="48" end="63" type="SO_DECL"/>
+    <field name="Stream 2 Decl" start="32" end="47" type="SO_DECL"/>
+    <field name="Stream 1 Decl" start="16" end="31" type="SO_DECL"/>
+    <field name="Stream 0 Decl" start="0" end="15" type="SO_DECL"/>
+  </struct>
+
+  <struct name="SF_OUTPUT_ATTRIBUTE_DETAIL" length="1">
+    <field name="Component Override W" start="15" end="15" type="bool"/>
+    <field name="Component Override Z" start="14" end="14" type="bool"/>
+    <field name="Component Override Y" start="13" end="13" type="bool"/>
+    <field name="Component Override X" start="12" end="12" type="bool"/>
+    <field name="Swizzle Control Mode" start="11" end="11" type="uint"/>
+    <field name="Constant Source" start="9" end="10" type="uint">
+      <value name="CONST_0000" value="0"/>
+      <value name="CONST_0001_FLOAT" value="1"/>
+      <value name="CONST_1111_FLOAT" value="2"/>
+      <value name="PRIM_ID" value="3"/>
+    </field>
+    <field name="Swizzle Select" start="6" end="7" type="uint">
+      <value name="INPUTATTR" value="0"/>
+      <value name="INPUTATTR_FACING" value="1"/>
+      <value name="INPUTATTR_W" value="2"/>
+      <value name="INPUTATTR_FACING_W" value="3"/>
+    </field>
+    <field name="Source Attribute" start="0" end="4" type="uint"/>
+  </struct>
+
+  <struct name="SCISSOR_RECT" length="2">
+    <field name="Scissor Rectangle Y Min" start="16" end="31" type="uint"/>
+    <field name="Scissor Rectangle X Min" start="0" end="15" type="uint"/>
+    <field name="Scissor Rectangle Y Max" start="48" end="63" type="uint"/>
+    <field name="Scissor Rectangle X Max" start="32" end="47" type="uint"/>
+  </struct>
+
+  <struct name="SF_CLIP_VIEWPORT" length="16">
+    <field name="Viewport Matrix Element m00" start="0" end="31" type="float"/>
+    <field name="Viewport Matrix Element m11" start="32" end="63" type="float"/>
+    <field name="Viewport Matrix Element m22" start="64" end="95" type="float"/>
+    <field name="Viewport Matrix Element m30" start="96" end="127" type="float"/>
+    <field name="Viewport Matrix Element m31" start="128" end="159" type="float"/>
+    <field name="Viewport Matrix Element m32" start="160" end="191" type="float"/>
+    <field name="X Min Clip Guardband" start="256" end="287" type="float"/>
+    <field name="X Max Clip Guardband" start="288" end="319" type="float"/>
+    <field name="Y Min Clip Guardband" start="320" end="351" type="float"/>
+    <field name="Y Max Clip Guardband" start="352" end="383" type="float"/>
+    <field name="X Min ViewPort" start="384" end="415" type="float"/>
+    <field name="X Max ViewPort" start="416" end="447" type="float"/>
+    <field name="Y Min ViewPort" start="448" end="479" type="float"/>
+    <field name="Y Max ViewPort" start="480" end="511" type="float"/>
+  </struct>
+
+  <struct name="BLEND_STATE_ENTRY" length="2">
+    <field name="Logic Op Enable" start="63" end="63" type="bool"/>
+    <field name="Logic Op Function" start="59" end="62" type="3D_Logic_Op_Function"/>
+    <field name="Pre-Blend Source Only Clamp Enable" start="36" end="36" type="bool"/>
+    <field name="Color Clamp Range" start="34" end="35" type="uint">
+      <value name="COLORCLAMP_UNORM" value="0"/>
+      <value name="COLORCLAMP_SNORM" value="1"/>
+      <value name="COLORCLAMP_RTFORMAT" value="2"/>
+    </field>
+    <field name="Pre-Blend Color Clamp Enable" start="33" end="33" type="bool"/>
+    <field name="Post-Blend Color Clamp Enable" start="32" end="32" type="bool"/>
+    <field name="Color Buffer Blend Enable" start="31" end="31" type="bool"/>
+    <field name="Source Blend Factor" start="26" end="30" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Blend Factor" start="21" end="25" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Color Blend Function" start="18" end="20" type="3D_Color_Buffer_Blend_Function"/>
+    <field name="Source Alpha Blend Factor" start="13" end="17" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Alpha Blend Factor" start="8" end="12" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Alpha Blend Function" start="5" end="7" type="3D_Color_Buffer_Blend_Function"/>
+    <field name="Write Disable Alpha" start="3" end="3" type="bool"/>
+    <field name="Write Disable Red" start="2" end="2" type="bool"/>
+    <field name="Write Disable Green" start="1" end="1" type="bool"/>
+    <field name="Write Disable Blue" start="0" end="0" type="bool"/>
+  </struct>
+
+  <struct name="BLEND_STATE" length="1">
+    <field name="Alpha To Coverage Enable" start="31" end="31" type="bool"/>
+    <field name="Independent Alpha Blend Enable" start="30" end="30" type="bool"/>
+    <field name="Alpha To One Enable" start="29" end="29" type="bool"/>
+    <field name="Alpha To Coverage Dither Enable" start="28" end="28" type="bool"/>
+    <field name="Alpha Test Enable" start="27" end="27" type="bool"/>
+    <field name="Alpha Test Function" start="24" end="26" type="3D_Compare_Function"/>
+    <field name="Color Dither Enable" start="23" end="23" type="bool"/>
+    <field name="X Dither Offset" start="21" end="22" type="uint"/>
+    <field name="Y Dither Offset" start="19" end="20" type="uint"/>
+    <group count="0" start="32" size="64">
+      <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
+    </group>
+  </struct>
+
+  <struct name="CC_VIEWPORT" length="2">
+    <field name="Minimum Depth" start="0" end="31" type="float"/>
+    <field name="Maximum Depth" start="32" end="63" type="float"/>
+  </struct>
+
+  <struct name="COLOR_CALC_STATE" length="6">
+    <field name="Round Disable Function Disable" start="15" end="15" type="bool"/>
+    <field name="Alpha Test Format" start="0" end="0" type="uint">
+      <value name="ALPHATEST_UNORM8" value="0"/>
+      <value name="ALPHATEST_FLOAT32" value="1"/>
+    </field>
+    <field name="Alpha Reference Value As UNORM8" start="32" end="63" type="uint"/>
+    <field name="Alpha Reference Value As FLOAT32" start="32" end="63" type="float"/>
+    <field name="Blend Constant Color Red" start="64" end="95" type="float"/>
+    <field name="Blend Constant Color Green" start="96" end="127" type="float"/>
+    <field name="Blend Constant Color Blue" start="128" end="159" type="float"/>
+    <field name="Blend Constant Color Alpha" start="160" end="191" type="float"/>
+  </struct>
+
+  <struct name="EXECUTION_UNIT_EXTENDED_MESSAGE_DESCRIPTOR" length="1">
+    <field name="Extended Message Length" start="6" end="9" type="uint"/>
+    <field name="End Of Thread" start="5" end="5" type="uint">
+      <value name="No Termination" value="0"/>
+      <value name="EOT" value="1"/>
+    </field>
+    <field name="Target Function ID" start="0" end="3" type="uint"/>
+  </struct>
+
+  <struct name="INTERFACE_DESCRIPTOR_DATA" length="8">
+    <field name="Kernel Start Pointer" start="6" end="47" type="offset"/>
+    <field name="Thread Preemption disable" start="84" end="84" type="bool"/>
+    <field name="Denorm Mode" start="83" end="83" type="uint">
+      <value name="Ftz" value="0"/>
+      <value name="SetByKernel" value="1"/>
+    </field>
+    <field name="Single Program Flow" start="82" end="82" type="bool"/>
+    <field name="Thread Priority" start="81" end="81" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="80" end="80" type="uint">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="77" end="77" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="75" end="75" type="bool"/>
+    <field name="Software Exception Enable" start="71" end="71" type="bool"/>
+    <field name="Sampler State Pointer" start="101" end="127" type="offset"/>
+    <field name="Sampler Count" start="98" end="100" type="uint">
+      <value name="No samplers used" value="0"/>
+      <value name="Between 1 and 4 samplers used" value="1"/>
+      <value name="Between 5 and 8 samplers used" value="2"/>
+      <value name="Between 9 and 12 samplers used" value="3"/>
+      <value name="Between 13 and 16 samplers used" value="4"/>
+    </field>
+    <field name="Binding Table Pointer" start="133" end="143" type="offset"/>
+    <field name="Binding Table Entry Count" start="128" end="132" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="176" end="191" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="160" end="175" type="uint"/>
+    <field name="Rounding Mode" start="214" end="215" type="uint">
+      <value name="RTNE" value="0"/>
+      <value name="RU" value="1"/>
+      <value name="RD" value="2"/>
+      <value name="RTZ" value="3"/>
+    </field>
+    <field name="Barrier Enable" start="213" end="213" type="bool"/>
+    <field name="Shared Local Memory Size" start="208" end="212" type="uint">
+      <value name="Encodes 0K" value="0"/>
+      <value name="Encodes 1K" value="1"/>
+      <value name="Encodes 2K" value="2"/>
+      <value name="Encodes 4K" value="3"/>
+      <value name="Encodes 8K" value="4"/>
+      <value name="Encodes 16K" value="5"/>
+      <value name="Encodes 32K" value="6"/>
+      <value name="Encodes 64K" value="7"/>
+    </field>
+    <field name="Global Barrier Enable" start="207" end="207" type="bool"/>
+    <field name="Number of Threads in GPGPU Thread Group" start="192" end="201" type="uint"/>
+    <field name="Cross-Thread Constant Data Read Length" start="224" end="231" type="uint"/>
+  </struct>
+
+  <struct name="ROUNDINGPRECISIONTABLE_3_BITS" length="1">
+    <field name="Rounding Precision" start="0" end="2" type="uint">
+      <value name="+1/16" value="0"/>
+      <value name="+2/16" value="1"/>
+      <value name="+3/16" value="2"/>
+      <value name="+4/16" value="3"/>
+      <value name="+5/16" value="4"/>
+      <value name="+6/16" value="5"/>
+      <value name="+7/16" value="6"/>
+      <value name="+8/16" value="7"/>
+    </field>
+  </struct>
+
+  <struct name="PALETTE_ENTRY" length="1">
+    <field name="Alpha" start="24" end="31" type="uint"/>
+    <field name="Red" start="16" end="23" type="uint"/>
+    <field name="Green" start="8" end="15" type="uint"/>
+    <field name="Blue" start="0" end="7" type="uint"/>
+  </struct>
+
+  <struct name="BINDING_TABLE_STATE" length="1">
+    <field name="Surface State Pointer" start="6" end="31" type="offset"/>
+  </struct>
+
+  <struct name="RENDER_SURFACE_STATE" length="16">
+    <field name="Surface Type" start="29" end="31" type="uint">
+      <value name="SURFTYPE_1D" value="0"/>
+      <value name="SURFTYPE_2D" value="1"/>
+      <value name="SURFTYPE_3D" value="2"/>
+      <value name="SURFTYPE_CUBE" value="3"/>
+      <value name="SURFTYPE_BUFFER" value="4"/>
+      <value name="SURFTYPE_STRBUF" value="5"/>
+      <value name="SURFTYPE_NULL" value="7"/>
+    </field>
+    <field name="Surface Array" start="28" end="28" type="bool"/>
+    <field name="Surface Format" start="18" end="27" type="SURFACE_FORMAT"/>
+    <field name="Surface Vertical Alignment" start="16" end="17" type="uint">
+      <value name="VALIGN 4" value="1"/>
+      <value name="VALIGN 8" value="2"/>
+      <value name="VALIGN 16" value="3"/>
+    </field>
+    <field name="Surface Horizontal Alignment" start="14" end="15" type="uint">
+      <value name="HALIGN 4" value="1"/>
+      <value name="HALIGN 8" value="2"/>
+      <value name="HALIGN 16" value="3"/>
+    </field>
+    <field name="Tile Mode" start="12" end="13" type="uint">
+      <value name="LINEAR" value="0"/>
+      <value name="WMAJOR" value="1"/>
+      <value name="XMAJOR" value="2"/>
+      <value name="YMAJOR" value="3"/>
+    </field>
+    <field name="Vertical Line Stride" start="11" end="11" type="uint"/>
+    <field name="Vertical Line Stride Offset" start="10" end="10" type="uint"/>
+    <field name="Sampler L2 Bypass Mode Disable" start="9" end="9" type="bool"/>
+    <field name="Render Cache Read Write Mode" start="8" end="8" type="uint">
+      <value name="Write-Only Cache" value="0"/>
+      <value name="Read-Write Cache" value="1"/>
+    </field>
+    <field name="Media Boundary Pixel Mode" start="6" end="7" type="uint">
+      <value name="NORMAL_MODE" value="0"/>
+      <value name="PROGRESSIVE_FRAME" value="2"/>
+      <value name="INTERLACED_FRAME" value="3"/>
+    </field>
+    <field name="Cube Face Enable - Positive Z" start="0" end="0" type="bool"/>
+    <field name="Cube Face Enable - Negative Z" start="1" end="1" type="bool"/>
+    <field name="Cube Face Enable - Positive Y" start="2" end="2" type="bool"/>
+    <field name="Cube Face Enable - Negative Y" start="3" end="3" type="bool"/>
+    <field name="Cube Face Enable - Positive X" start="4" end="4" type="bool"/>
+    <field name="Cube Face Enable - Negative X" start="5" end="5" type="bool"/>
+    <field name="Memory Object Control State" start="56" end="62" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="MOCS" start="56" end="62" type="uint"/>
+    <field name="Base Mip Level" start="51" end="55" type="u4.1"/>
+    <field name="Surface QPitch" start="32" end="46" type="uint"/>
+    <field name="Height" start="80" end="93" type="uint"/>
+    <field name="Width" start="64" end="77" type="uint"/>
+    <field name="Depth" start="117" end="127" type="uint"/>
+    <field name="Tile Address Mapping Mode" start="116" end="116" type="uint">
+      <value name="Gen9" value="0"/>
+      <value name="Gen10+" value="1"/>
+    </field>
+    <field name="Surface Pitch" start="96" end="113" type="uint"/>
+    <field name="Force Non-Comparison Reduction Type" start="159" end="159" type="bool"/>
+    <field name="Render Target And Sample Unorm Rotation" start="157" end="158" type="uint">
+      <value name="0DEG" value="0"/>
+      <value name="90DEG" value="1"/>
+      <value name="180DEG" value="2"/>
+      <value name="270DEG" value="3"/>
+    </field>
+    <field name="Minimum Array Element" start="146" end="156" type="uint"/>
+    <field name="Render Target View Extent" start="135" end="145" type="uint"/>
+    <field name="Multisampled Surface Storage Format" start="134" end="134" type="uint">
+      <value name="MSFMT_MSS" value="0"/>
+      <value name="MSFMT_DEPTH_STENCIL" value="1"/>
+    </field>
+    <field name="Number of Multisamples" start="131" end="133" type="uint">
+      <value name="MULTISAMPLECOUNT_1" value="0"/>
+      <value name="MULTISAMPLECOUNT_2" value="1"/>
+      <value name="MULTISAMPLECOUNT_4" value="2"/>
+      <value name="MULTISAMPLECOUNT_8" value="3"/>
+      <value name="MULTISAMPLECOUNT_16" value="4"/>
+    </field>
+    <field name="Multisample Position Palette Index" start="128" end="130" type="uint"/>
+    <field name="X Offset" start="185" end="191" type="uint"/>
+    <field name="Y Offset" start="181" end="183" type="uint"/>
+    <field name="EWA Disable For Cube" start="180" end="180" type="bool"/>
+    <field name="Tiled Resource Mode" start="178" end="179" type="uint">
+      <value name="NONE" value="0"/>
+      <value name="4KB" value="1"/>
+      <value name="64KB" value="2"/>
+      <value name="TILEYF" value="1"/>
+      <value name="TILEYS" value="2"/>
+    </field>
+    <field name="Coherency Type" start="174" end="174" type="uint">
+      <value name="GPU coherent" value="0"/>
+      <value name="IA coherent" value="1"/>
+    </field>
+    <field name="Mip Tail Start LOD" start="168" end="171" type="uint"/>
+    <field name="Surface Min LOD" start="164" end="167" type="uint"/>
+    <field name="MIP Count / LOD" start="160" end="163" type="uint"/>
+    <field name="Auxiliary Surface QPitch" start="208" end="222" type="uint"/>
+    <field name="Auxiliary Surface Pitch" start="195" end="203" type="uint"/>
+    <field name="Auxiliary Surface Mode" start="192" end="194" type="uint">
+      <value name="AUX_NONE" value="0"/>
+      <value name="AUX_CCS_D" value="1"/>
+      <value name="AUX_APPEND" value="2"/>
+      <value name="AUX_HIZ" value="3"/>
+      <value name="AUX_CCS_E" value="5"/>
+    </field>
+    <field name="Separate UV Plane Enable" start="223" end="223" type="bool"/>
+    <field name="X Offset for U or UV Plane" start="208" end="221" type="uint"/>
+    <field name="Y Offset for U or UV Plane" start="192" end="205" type="uint"/>
+    <field name="Memory Compression Mode" start="255" end="255" type="uint">
+      <value name="Horizontal" value="0"/>
+      <value name="Vertical" value="1"/>
+    </field>
+    <field name="Memory Compression Enable" start="254" end="254" type="bool"/>
+    <field name="Shader Channel Select Red" start="249" end="251" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Green" start="246" end="248" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Blue" start="243" end="245" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Alpha" start="240" end="242" type="Shader Channel Select"/>
+    <field name="Resource Min LOD" start="224" end="235" type="u4.8"/>
+    <field name="Surface Base Address" start="256" end="319" type="address"/>
+    <field name="X Offset for V Plane" start="368" end="381" type="uint"/>
+    <field name="Y Offset for V Plane" start="352" end="365" type="uint"/>
+    <field name="Auxiliary Table Index for Media Compressed Surface" start="341" end="351" type="uint"/>
+    <field name="Auxiliary Surface Base Address" start="332" end="383" type="address"/>
+    <field name="Clear Value Address Enable" start="330" end="330" type="bool"/>
+    <field name="Quilt Height" start="325" end="329" type="uint"/>
+    <field name="Quilt Width" start="320" end="324" type="uint"/>
+    <field name="Red Clear Color" start="384" end="415" type="int"/>
+    <field name="Clear Color Address" start="390" end="415" type="address"/>
+    <field name="Clear Depth Address Low" start="390" end="415" type="address"/>
+    <field name="Green Clear Color" start="416" end="447" type="int"/>
+    <field name="Clear Color Address High" start="416" end="431" type="address"/>
+    <field name="Clear Depth Address High" start="416" end="431" type="address"/>
+    <field name="Blue Clear Color" start="448" end="479" type="int"/>
+    <field name="Alpha Clear Color" start="480" end="511" type="int"/>
+  </struct>
+
+  <struct name="SAMPLER_INDIRECT_STATE_BORDER_COLOR" length="4">
+    <field name="Border Color Red As S31" start="0" end="31" type="int"/>
+    <field name="Border Color Red As U32" start="0" end="31" type="uint"/>
+    <field name="Border Color Red As Float" start="0" end="31" type="float"/>
+    <field name="Border Color Alpha As U8" start="24" end="31" type="uint"/>
+    <field name="Border Color Blue As U8" start="16" end="23" type="uint"/>
+    <field name="Border Color Green As U8" start="8" end="15" type="uint"/>
+    <field name="Border Color Red As U8" start="0" end="7" type="uint"/>
+    <field name="Border Color Green As S31" start="32" end="63" type="int"/>
+    <field name="Border Color Green As U32" start="32" end="63" type="uint"/>
+    <field name="Border Color Green As Float" start="32" end="63" type="float"/>
+    <field name="Border Color Blue As S31" start="64" end="95" type="int"/>
+    <field name="Border Color Blue As U32" start="64" end="95" type="uint"/>
+    <field name="Border Color Blue As Float" start="64" end="95" type="float"/>
+    <field name="Border Color Alpha As S31" start="96" end="127" type="int"/>
+    <field name="Border Color Alpha As U32" start="96" end="127" type="uint"/>
+    <field name="Border Color Alpha As Float" start="96" end="127" type="float"/>
+  </struct>
+
+  <struct name="FILTER_COEFFICIENT" length="1">
+    <field name="Filter Coefficient" start="0" end="7" type="s1.6"/>
+  </struct>
+
+  <struct name="SAMPLER_BORDER_COLOR_STATE" length="4">
+    <field name="Border Color Float Red" start="0" end="31" type="float"/>
+    <field name="Border Color Float Green" start="32" end="63" type="float"/>
+    <field name="Border Color Float Blue" start="64" end="95" type="float"/>
+    <field name="Border Color Float Alpha" start="96" end="127" type="float"/>
+
+    <field name="Border Color 32bit Red" start="0" end="31" type="uint"/>
+    <field name="Border Color 32bit Green" start="32" end="63" type="uint"/>
+    <field name="Border Color 32bit Blue" start="64" end="95" type="uint"/>
+    <field name="Border Color 32bit Alpha" start="96" end="127" type="uint"/>
+  </struct>
+
+  <struct name="SAMPLER_STATE" length="4">
+    <field name="Sampler Disable" start="31" end="31" type="bool"/>
+    <field name="Texture Border Color Mode" start="29" end="29" type="uint">
+      <value name="DX10/OGL" value="0"/>
+      <value name="DX9" value="1"/>
+    </field>
+    <field name="LOD PreClamp Mode" start="27" end="28" type="uint" prefix="CLAMP_MODE">
+      <value name="NONE" value="0"/>
+      <value name="OGL" value="2"/>
+    </field>
+    <field name="Coarse LOD Quality Mode" start="22" end="26" type="uint"/>
+    <field name="Mip Mode Filter" start="20" end="21" type="uint" prefix="MIPFILTER">
+      <value name="NONE" value="0"/>
+      <value name="NEAREST" value="1"/>
+      <value name="LINEAR" value="3"/>
+    </field>
+    <field name="Mag Mode Filter" start="17" end="19" type="uint" prefix="MAPFILTER">
+      <value name="NEAREST" value="0"/>
+      <value name="LINEAR" value="1"/>
+      <value name="ANISOTROPIC" value="2"/>
+      <value name="MONO" value="6"/>
+    </field>
+    <field name="Min Mode Filter" start="14" end="16" type="uint" prefix="MAPFILTER">
+      <value name="NEAREST" value="0"/>
+      <value name="LINEAR" value="1"/>
+      <value name="ANISOTROPIC" value="2"/>
+      <value name="MONO" value="6"/>
+    </field>
+    <field name="Texture LOD Bias" start="1" end="13" type="s4.8"/>
+    <field name="Anisotropic Algorithm" start="0" end="0" type="uint">
+      <value name="LEGACY" value="0"/>
+      <value name="EWA Approximation" value="1"/>
+    </field>
+    <field name="Min LOD" start="52" end="63" type="u4.8"/>
+    <field name="Max LOD" start="40" end="51" type="u4.8"/>
+    <field name="ChromaKey Enable" start="39" end="39" type="bool"/>
+    <field name="ChromaKey Index" start="37" end="38" type="uint"/>
+    <field name="ChromaKey Mode" start="36" end="36" type="uint">
+      <value name="KEYFILTER_KILL_ON_ANY_MATCH" value="0"/>
+      <value name="KEYFILTER_REPLACE_BLACK" value="1"/>
+    </field>
+    <field name="Shadow Function" start="33" end="35" type="uint">
+      <value name="PREFILTEROP ALWAYS" value="0"/>
+      <value name="PREFILTEROP NEVER" value="1"/>
+      <value name="PREFILTEROP LESS" value="2"/>
+      <value name="PREFILTEROP EQUAL" value="3"/>
+      <value name="PREFILTEROP LEQUAL" value="4"/>
+      <value name="PREFILTEROP GREATER" value="5"/>
+      <value name="PREFILTEROP NOTEQUAL" value="6"/>
+      <value name="PREFILTEROP GEQUAL" value="7"/>
+    </field>
+    <field name="Cube Surface Control Mode" start="32" end="32" type="uint">
+      <value name="PROGRAMMED" value="0"/>
+      <value name="OVERRIDE" value="1"/>
+    </field>
+    <field name="Border Color Pointer" start="70" end="87" type="offset"/>
+    <field name="Force gather4 Behavior" start="69" end="69" type="bool"/>
+    <field name="LOD Clamp Magnification Mode" start="64" end="64" type="uint">
+      <value name="MIPNONE" value="0"/>
+      <value name="MIPFILTER" value="1"/>
+    </field>
+    <field name="Reduction Type" start="118" end="119" type="uint">
+      <value name="STD_FILTER" value="0"/>
+      <value name="COMPARISON" value="1"/>
+      <value name="MINIMUM" value="2"/>
+      <value name="MAXIMUM" value="3"/>
+    </field>
+    <field name="Maximum Anisotropy" start="115" end="117" type="uint">
+      <value name="RATIO 2:1" value="0"/>
+      <value name="RATIO 4:1" value="1"/>
+      <value name="RATIO 6:1" value="2"/>
+      <value name="RATIO 8:1" value="3"/>
+      <value name="RATIO 10:1" value="4"/>
+      <value name="RATIO 12:1" value="5"/>
+      <value name="RATIO 14:1" value="6"/>
+      <value name="RATIO 16:1" value="7"/>
+    </field>
+    <field name="R Address Min Filter Rounding Enable" start="109" end="109" type="bool"/>
+    <field name="R Address Mag Filter Rounding Enable" start="110" end="110" type="bool"/>
+    <field name="V Address Min Filter Rounding Enable" start="111" end="111" type="bool"/>
+    <field name="V Address Mag Filter Rounding Enable" start="112" end="112" type="bool"/>
+    <field name="U Address Min Filter Rounding Enable" start="113" end="113" type="bool"/>
+    <field name="U Address Mag Filter Rounding Enable" start="114" end="114" type="bool"/>
+    <field name="Trilinear Filter Quality" start="107" end="108" type="uint">
+      <value name="FULL" value="0"/>
+      <value name="HIGH" value="1"/>
+      <value name="MED" value="2"/>
+      <value name="LOW" value="3"/>
+    </field>
+    <field name="Non-normalized Coordinate Enable" start="106" end="106" type="bool"/>
+    <field name="Reduction Type Enable" start="105" end="105" type="bool"/>
+    <field name="TCX Address Control Mode" start="102" end="104" type="Texture Coordinate Mode"/>
+    <field name="TCY Address Control Mode" start="99" end="101" type="Texture Coordinate Mode"/>
+    <field name="TCZ Address Control Mode" start="96" end="98" type="Texture Coordinate Mode"/>
+  </struct>
+
+  <struct name="SAMPLER_STATE_8X8_AVS_COEFFICIENTS" length="8">
+    <field name="Table 0Y Filter Coefficient[n,1]" start="24" end="31" type="s1.6"/>
+    <field name="Table 0X Filter Coefficient[n,1]" start="16" end="23" type="s1.6"/>
+    <field name="Table 0Y Filter Coefficient[n,0]" start="8" end="15" type="s1.6"/>
+    <field name="Table 0X Filter Coefficient[n,0]" start="0" end="7" type="s1.6"/>
+    <field name="Table 0Y Filter Coefficient[n,3]" start="56" end="63" type="s1.6"/>
+    <field name="Table 0X Filter Coefficient[n,3]" start="48" end="55" type="s1.6"/>
+    <field name="Table 0Y Filter Coefficient[n,2]" start="40" end="47" type="s1.6"/>
+    <field name="Table 0X Filter Coefficient[n,2]" start="32" end="39" type="s1.6"/>
+    <field name="Table 0Y Filter Coefficient[n,5]" start="88" end="95" type="s1.6"/>
+    <field name="Table 0X Filter Coefficient[n,5]" start="80" end="87" type="s1.6"/>
+    <field name="Table 0Y Filter Coefficient[n,4]" start="72" end="79" type="s1.6"/>
+    <field name="Table 0X Filter Coefficient[n,4]" start="64" end="71" type="s1.6"/>
+    <field name="Table 0Y Filter Coefficient[n,7]" start="120" end="127" type="s1.6"/>
+    <field name="Table 0X Filter Coefficient[n,7]" start="112" end="119" type="s1.6"/>
+    <field name="Table 0Y Filter Coefficient[n,6]" start="104" end="111" type="s1.6"/>
+    <field name="Table 0X Filter Coefficient[n,6]" start="96" end="103" type="s1.6"/>
+    <field name="Table 1X Filter Coefficient[n,3]" start="152" end="159" type="s1.6"/>
+    <field name="Table 1X Filter Coefficient[n,2]" start="144" end="151" type="s1.6"/>
+    <field name="Table 1X Filter Coefficient[n,5]" start="168" end="175" type="s1.6"/>
+    <field name="Table 1X Filter Coefficient[n,4]" start="160" end="167" type="s1.6"/>
+    <field name="Table 1Y Filter Coefficient[n,3]" start="216" end="223" type="s1.6"/>
+    <field name="Table 1Y Filter Coefficient[n,2]" start="208" end="215" type="s1.6"/>
+    <field name="Table 1Y Filter Coefficient[n,5]" start="232" end="239" type="s1.6"/>
+    <field name="Table 1Y Filter Coefficient[n,4]" start="224" end="231" type="s1.6"/>
+  </struct>
+
+  <struct name="MI_MATH_ALU_INSTRUCTION" length="1">
+    <field name="ALU Opcode" start="20" end="31" type="uint" prefix="MI_ALU">
+      <value name="NOOP" value="0x000"/>
+      <value name="LOAD" value="0x080"/>
+      <value name="LOADINV" value="0x480"/>
+      <value name="LOAD0" value="0x081"/>
+      <value name="LOAD1" value="0x481"/>
+      <value name="ADD" value="0x100"/>
+      <value name="SUB" value="0x101"/>
+      <value name="AND" value="0x102"/>
+      <value name="OR" value="0x103"/>
+      <value name="XOR" value="0x104"/>
+      <value name="STORE" value="0x180"/>
+      <value name="STOREINV" value="0x580"/>
+    </field>
+    <field name="Operand 1" start="10" end="19" type="uint" prefix="MI_ALU">
+      <value name="REG0" value="0x00"/>
+      <value name="REG1" value="0x01"/>
+      <value name="REG2" value="0x02"/>
+      <value name="REG3" value="0x03"/>
+      <value name="REG4" value="0x04"/>
+      <value name="REG5" value="0x05"/>
+      <value name="REG6" value="0x06"/>
+      <value name="REG7" value="0x07"/>
+      <value name="REG8" value="0x08"/>
+      <value name="REG9" value="0x09"/>
+      <value name="REG10" value="0x0a"/>
+      <value name="REG11" value="0x0b"/>
+      <value name="REG12" value="0x0c"/>
+      <value name="REG13" value="0x0d"/>
+      <value name="REG14" value="0x0e"/>
+      <value name="REG15" value="0x0f"/>
+      <value name="SRCA" value="0x20"/>
+      <value name="SRCB" value="0x21"/>
+      <value name="ACCU" value="0x31"/>
+      <value name="ZF" value="0x32"/>
+      <value name="CF" value="0x33"/>
+    </field>
+    <field name="Operand 2" start="0" end="9" type="uint" prefix="MI_ALU">
+      <value name="REG0" value="0x00"/>
+      <value name="REG1" value="0x01"/>
+      <value name="REG2" value="0x02"/>
+      <value name="REG3" value="0x03"/>
+      <value name="REG4" value="0x04"/>
+      <value name="REG5" value="0x05"/>
+      <value name="REG6" value="0x06"/>
+      <value name="REG7" value="0x07"/>
+      <value name="REG8" value="0x08"/>
+      <value name="REG9" value="0x09"/>
+      <value name="REG10" value="0x0a"/>
+      <value name="REG11" value="0x0b"/>
+      <value name="REG12" value="0x0c"/>
+      <value name="REG13" value="0x0d"/>
+      <value name="REG14" value="0x0e"/>
+      <value name="REG15" value="0x0f"/>
+      <value name="SRCA" value="0x20"/>
+      <value name="SRCB" value="0x21"/>
+      <value name="ACCU" value="0x31"/>
+      <value name="ZF" value="0x32"/>
+      <value name="CF" value="0x33"/>
+    </field>
+  </struct>
+
+  <instruction name="3DPRIMITIVE" bias="2" length="7">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="3"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="Extended Parameters Present" start="11" end="11" type="uint"/>
+    <field name="Indirect Parameter Enable" start="10" end="10" type="bool"/>
+    <field name="UAV Coherency Required" start="9" end="9" type="bool"/>
+    <field name="Predicate Enable" start="8" end="8" type="bool"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="5"/>
+    <field name="End Offset Enable" start="41" end="41" type="bool"/>
+    <field name="Vertex Access Type" start="40" end="40" type="uint">
+      <value name="SEQUENTIAL" value="0"/>
+      <value name="RANDOM" value="1"/>
+    </field>
+    <field name="Primitive Topology Type" start="32" end="37" type="3D_Prim_Topo_Type"/>
+    <field name="Vertex Count Per Instance" start="64" end="95" type="uint"/>
+    <field name="Start Vertex Location" start="96" end="127" type="uint"/>
+    <field name="Instance Count" start="128" end="159" type="uint"/>
+    <field name="Start Instance Location" start="160" end="191" type="uint"/>
+    <field name="Base Vertex Location" start="192" end="223" type="int"/>
+    <field name="Extended Parameter 0" start="224" end="255" type="uint"/>
+    <field name="Extended Parameter 1" start="256" end="287" type="uint"/>
+    <field name="Extended Parameter 2" start="288" end="319" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_AA_LINE_PARAMETERS" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="10"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="AA Point Coverage Bias" start="56" end="63" type="u0.8"/>
+    <field name="AA Coverage Bias" start="48" end="55" type="u0.8"/>
+    <field name="AA Point Coverage Slope" start="40" end="47" type="u0.8"/>
+    <field name="AA Coverage Slope" start="32" end="39" type="u0.8"/>
+    <field name="AA Point Coverage EndCap Bias" start="88" end="95" type="u0.8"/>
+    <field name="AA Coverage EndCap Bias" start="80" end="87" type="u0.8"/>
+    <field name="AA Point Coverage EndCap Slope" start="72" end="79" type="u0.8"/>
+    <field name="AA Coverage EndCap Slope" start="64" end="71" type="u0.8"/>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_EDIT_DS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="70"/>
+    <field name="DWord Length" start="0" end="8" type="uint" default="0"/>
+    <field name="Binding Table Block Clear" start="48" end="63" type="uint"/>
+    <field name="Binding Table Edit Target" start="32" end="33" type="uint">
+      <value name="All Cores" value="3"/>
+      <value name="Core 1" value="2"/>
+      <value name="Core 0" value="1"/>
+    </field>
+    <group count="0" start="64" size="32">
+      <field name="Entry [n]" start="0" end="31" type="BINDING_TABLE_EDIT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_EDIT_GS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="68"/>
+    <field name="DWord Length" start="0" end="8" type="uint" default="0"/>
+    <field name="Binding Table Block Clear" start="48" end="63" type="uint"/>
+    <field name="Binding Table Edit Target" start="32" end="33" type="uint">
+      <value name="All Cores" value="3"/>
+      <value name="Core 1" value="2"/>
+      <value name="Core 0" value="1"/>
+    </field>
+    <group count="0" start="64" size="32">
+      <field name="Entry [n]" start="0" end="31" type="BINDING_TABLE_EDIT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_EDIT_HS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="69"/>
+    <field name="DWord Length" start="0" end="8" type="uint" default="0"/>
+    <field name="Binding Table Block Clear" start="48" end="63" type="uint"/>
+    <field name="Binding Table Edit Target" start="32" end="33" type="uint">
+      <value name="All Cores" value="3"/>
+      <value name="Core 1" value="2"/>
+      <value name="Core 0" value="1"/>
+    </field>
+    <group count="0" start="64" size="32">
+      <field name="Entry [n]" start="0" end="31" type="BINDING_TABLE_EDIT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_EDIT_PS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="71"/>
+    <field name="DWord Length" start="0" end="8" type="uint" default="0"/>
+    <field name="Binding Table Block Clear" start="48" end="63" type="uint"/>
+    <field name="Binding Table Edit Target" start="32" end="33" type="uint">
+      <value name="All Cores" value="3"/>
+      <value name="Core 1" value="2"/>
+      <value name="Core 0" value="1"/>
+    </field>
+    <group count="0" start="64" size="32">
+      <field name="Entry [n]" start="0" end="31" type="BINDING_TABLE_EDIT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_EDIT_VS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="67"/>
+    <field name="DWord Length" start="0" end="8" type="uint" default="0"/>
+    <field name="Binding Table Block Clear" start="48" end="63" type="uint"/>
+    <field name="Binding Table Edit Target" start="32" end="33" type="uint">
+      <value name="All Cores" value="3"/>
+      <value name="Core 1" value="2"/>
+      <value name="Core 0" value="1"/>
+    </field>
+    <group count="0" start="64" size="32">
+      <field name="Entry [n]" start="0" end="31" type="BINDING_TABLE_EDIT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_POINTERS_DS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="40"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to DS Binding Table" start="37" end="47" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_POINTERS_GS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="41"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to GS Binding Table" start="37" end="47" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_POINTERS_HS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="39"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to HS Binding Table" start="37" end="47" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_POINTERS_PS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="42"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to PS Binding Table" start="37" end="47" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_POINTERS_VS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="38"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to VS Binding Table" start="37" end="47" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_POOL_ALLOC" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="25"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Binding Table Pool Base Address" start="44" end="95" type="address"/>
+    <field name="Binding Table Pool Enable" start="43" end="43" type="uint"/>
+    <field name="Surface Object Control State" start="32" end="38" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Binding Table Pool Buffer Size" start="108" end="127" type="uint">
+      <value name="No Valid Data" value="0"/>
+    </field>
+  </instruction>
+
+  <instruction name="3DSTATE_BLEND_STATE_POINTERS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="36"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Blend State Pointer" start="38" end="63" type="offset"/>
+    <field name="Blend State Pointer Valid" start="32" end="32" type="bool"/>
+  </instruction>
+
+  <instruction name="3DSTATE_CC_STATE_POINTERS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="14"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Color Calc State Pointer" start="38" end="63" type="offset"/>
+    <field name="Color Calc State Pointer Valid" start="32" end="32" type="bool"/>
+  </instruction>
+
+  <instruction name="3DSTATE_CHROMA_KEY" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="4"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="ChromaKey Table Index" start="62" end="63" type="uint"/>
+    <field name="ChromaKey Low Value" start="64" end="95" type="uint"/>
+    <field name="ChromaKey High Value" start="96" end="127" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_CLEAR_PARAMS" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="4"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Depth Clear Value" start="32" end="63" type="float"/>
+    <field name="Depth Clear Value Valid" start="64" end="64" type="bool"/>
+  </instruction>
+
+  <instruction name="3DSTATE_CLIP" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="18"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Force User Clip Distance Cull Test Enable Bitmask" start="52" end="52" type="bool"/>
+    <field name="Vertex Sub Pixel Precision Select" start="51" end="51" type="uint">
+      <value name="8 Bit" value="0"/>
+      <value name="4 Bit" value="1"/>
+    </field>
+    <field name="Early Cull Enable" start="50" end="50" type="bool"/>
+    <field name="Force User Clip Distance Clip Test Enable Bitmask" start="49" end="49" type="bool"/>
+    <field name="Force Clip Mode" start="48" end="48" type="bool"/>
+    <field name="Statistics Enable" start="42" end="42" type="bool"/>
+    <field name="User Clip Distance Cull Test Enable Bitmask" start="32" end="39" type="uint"/>
+    <field name="Clip Enable" start="95" end="95" type="bool"/>
+    <field name="API Mode" start="94" end="94" type="uint">
+      <value name="APIMODE_OGL" value="0"/>
+      <value name="APIMODE_D3D" value="1"/>
+    </field>
+    <field name="Viewport XY Clip Test Enable" start="92" end="92" type="bool"/>
+    <field name="Guardband Clip Test Enable" start="90" end="90" type="bool"/>
+    <field name="User Clip Distance Clip Test Enable Bitmask" start="80" end="87" type="uint"/>
+    <field name="Clip Mode" start="77" end="79" type="uint">
+      <value name="CLIPMODE_NORMAL" value="0"/>
+      <value name="CLIPMODE_REJECT_ALL" value="3"/>
+      <value name="CLIPMODE_ACCEPT_ALL" value="4"/>
+    </field>
+    <field name="Perspective Divide Disable" start="73" end="73" type="bool"/>
+    <field name="Non-Perspective Barycentric Enable" start="72" end="72" type="bool"/>
+    <field name="Triangle Strip/List Provoking Vertex Select" start="68" end="69" type="uint"/>
+    <field name="Line Strip/List Provoking Vertex Select" start="66" end="67" type="uint"/>
+    <field name="Triangle Fan Provoking Vertex Select" start="64" end="65" type="uint"/>
+    <field name="Minimum Point Width" start="113" end="123" type="u8.3"/>
+    <field name="Maximum Point Width" start="102" end="112" type="u8.3"/>
+    <field name="Force Zero RTA Index Enable" start="101" end="101" type="bool"/>
+    <field name="Maximum VP Index" start="96" end="99" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_CONSTANT_DS" bias="2" length="11">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="26"/>
+    <field name="Constant Buffer Object Control State" start="8" end="14" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="9"/>
+    <field name="Constant Body" start="32" end="351" type="3DSTATE_CONSTANT_BODY"/>
+  </instruction>
+
+  <instruction name="3DSTATE_CONSTANT_GS" bias="2" length="11">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="22"/>
+    <field name="Constant Buffer Object Control State" start="8" end="14" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="9"/>
+    <field name="Constant Body" start="32" end="351" type="3DSTATE_CONSTANT_BODY"/>
+  </instruction>
+
+  <instruction name="3DSTATE_CONSTANT_HS" bias="2" length="11">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="25"/>
+    <field name="Constant Buffer Object Control State" start="8" end="14" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="9"/>
+    <field name="Constant Body" start="32" end="351" type="3DSTATE_CONSTANT_BODY"/>
+  </instruction>
+
+  <instruction name="3DSTATE_CONSTANT_PS" bias="2" length="11">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="23"/>
+    <field name="Disable Gather at Set Shader Hint" start="15" end="15" type="uint"/>
+    <field name="Constant Buffer Object Control State" start="8" end="14" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="9"/>
+    <field name="Constant Body" start="32" end="351" type="3DSTATE_CONSTANT_BODY"/>
+  </instruction>
+
+  <instruction name="3DSTATE_CONSTANT_VS" bias="2" length="11">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="21"/>
+    <field name="Constant Buffer Object Control State" start="8" end="14" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="9"/>
+    <field name="Constant Body" start="32" end="351" type="3DSTATE_CONSTANT_BODY"/>
+  </instruction>
+
+  <instruction name="3DSTATE_DEPTH_BUFFER" bias="2" length="8">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="5"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="6"/>
+    <field name="Surface Type" start="61" end="63" type="uint">
+      <value name="SURFTYPE_2D" value="1"/>
+      <value name="SURFTYPE_CUBE" value="3"/>
+      <value name="SURFTYPE_NULL" value="7"/>
+    </field>
+    <field name="Depth Write Enable" start="60" end="60" type="bool"/>
+    <field name="Stencil Write Enable" start="59" end="59" type="bool"/>
+    <field name="Hierarchical Depth Buffer Enable" start="54" end="54" type="bool"/>
+    <field name="Surface Format" start="50" end="52" type="uint">
+      <value name="D32_FLOAT" value="1"/>
+      <value name="D24_UNORM_X8_UINT" value="3"/>
+      <value name="D16_UNORM" value="5"/>
+    </field>
+    <field name="Surface Pitch" start="32" end="49" type="uint"/>
+    <field name="Surface Base Address" start="64" end="127" type="address"/>
+    <field name="Height" start="146" end="159" type="uint"/>
+    <field name="Width" start="132" end="145" type="uint"/>
+    <field name="LOD" start="128" end="131" type="uint"/>
+    <field name="Depth" start="181" end="191" type="uint"/>
+    <field name="Minimum Array Element" start="170" end="180" type="uint"/>
+    <field name="Depth Buffer Object Control State" start="160" end="166" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Depth Buffer MOCS" start="160" end="166" type="uint"/>
+    <field name="Tiled Resource Mode" start="222" end="223" type="uint">
+      <value name="NONE" value="0"/>
+      <value name="TILEYF" value="1"/>
+      <value name="TILEYS" value="2"/>
+    </field>
+    <field name="Mip Tail Start LOD" start="218" end="221" type="uint"/>
+    <field name="Render Target View Extent" start="245" end="255" type="uint"/>
+    <field name="Surface QPitch" start="224" end="238" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_DRAWING_RECTANGLE" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="Core Mode Select" start="14" end="15" type="uint">
+      <value name="Legacy" value="0"/>
+      <value name="Core 0 Enabled" value="1"/>
+      <value name="Core 1 Enabled" value="2"/>
+    </field>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Clipped Drawing Rectangle Y Min" start="48" end="63" type="uint"/>
+    <field name="Clipped Drawing Rectangle X Min" start="32" end="47" type="uint"/>
+    <field name="Clipped Drawing Rectangle Y Max" start="80" end="95" type="uint"/>
+    <field name="Clipped Drawing Rectangle X Max" start="64" end="79" type="uint"/>
+    <field name="Drawing Rectangle Origin Y" start="112" end="127" type="int"/>
+    <field name="Drawing Rectangle Origin X" start="96" end="111" type="int"/>
+  </instruction>
+
+  <instruction name="3DSTATE_DS" bias="2" length="11">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="29"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="9"/>
+    <field name="Kernel Start Pointer" start="38" end="95" type="offset"/>
+    <field name="Vector Mask Enable" start="126" end="126" type="bool"/>
+    <field name="Sampler Count" start="123" end="125" type="uint">
+      <value name="No Samplers" value="0"/>
+      <value name="1-4 Samplers" value="1"/>
+      <value name="5-8 Samplers" value="2"/>
+      <value name="9-12 Samplers" value="3"/>
+      <value name="13-16 Samplers" value="4"/>
+    </field>
+    <field name="Binding Table Entry Count" start="114" end="121" type="uint"/>
+    <field name="Thread Dispatch Priority" start="113" end="113" type="uint">
+      <value name="High" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="112" end="112" type="uint">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Accesses UAV" start="110" end="110" type="bool"/>
+    <field name="Illegal Opcode Exception Enable" start="109" end="109" type="bool"/>
+    <field name="Software Exception Enable" start="103" end="103" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="138" end="191" type="address"/>
+    <field name="Per-Thread Scratch Space" start="128" end="131" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="212" end="216" type="uint"/>
+    <field name="Patch URB Entry Read Length" start="203" end="209" type="uint"/>
+    <field name="Patch URB Entry Read Offset" start="196" end="201" type="uint"/>
+    <field name="Maximum Number of Threads" start="245" end="254" type="uint"/>
+    <field name="Statistics Enable" start="234" end="234" type="bool"/>
+    <field name="Dispatch Mode" start="227" end="228" type="uint" prefix="DISPATCH_MODE">
+      <value name="SIMD4X2" value="0"/>
+      <value name="SIMD8_SINGLE_PATCH" value="1"/>
+      <value name="SIMD8_SINGLE_OR_DUAL_PATCH" value="2"/>
+    </field>
+    <field name="Compute W Coordinate Enable" start="226" end="226" type="bool"/>
+    <field name="Cache Disable" start="225" end="225" type="bool"/>
+    <field name="Enable" start="224" end="224" type="bool"/>
+    <field name="Vertex URB Entry Output Read Offset" start="277" end="282" type="uint"/>
+    <field name="Vertex URB Entry Output Length" start="272" end="276" type="uint"/>
+    <field name="User Clip Distance Clip Test Enable Bitmask" start="264" end="271" type="uint"/>
+    <field name="User Clip Distance Cull Test Enable Bitmask" start="256" end="263" type="uint"/>
+    <field name="DUAL_PATCH Kernel Start Pointer" start="294" end="351" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_GATHER_CONSTANT_DS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="55"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Constant Buffer Valid" start="48" end="63" type="uint"/>
+    <field name="Constant Buffer Binding Table Block" start="44" end="47" type="uint"/>
+    <field name="Update Gather Table Only" start="33" end="33" type="uint">
+      <value name="Commit Gather" value="0"/>
+      <value name="Non-Commit Gather" value="1"/>
+    </field>
+    <field name="Gather Buffer Offset" start="70" end="86" type="offset"/>
+    <field name="Constant Buffer Dx9 Generate Stall" start="69" end="69" type="bool"/>
+    <field name="On-Die Table" start="67" end="67" type="uint">
+      <value name="Load" value="0"/>
+      <value name="Read" value="1"/>
+    </field>
+    <group count="0" start="96" size="32">
+      <field name="Entry_0" start="0" end="15" type="GATHER_CONSTANT_ENTRY"/>
+      <field name="Entry_1" start="16" end="31" type="GATHER_CONSTANT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_GATHER_CONSTANT_GS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="53"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Constant Buffer Valid" start="48" end="63" type="uint"/>
+    <field name="Constant Buffer Binding Table Block" start="44" end="47" type="uint"/>
+    <field name="Update Gather Table Only" start="33" end="33" type="uint">
+      <value name="Commit Gather" value="0"/>
+      <value name="Non-Commit Gather" value="1"/>
+    </field>
+    <field name="Gather Buffer Offset" start="70" end="86" type="offset"/>
+    <field name="Constant Buffer Dx9 Generate Stall" start="69" end="69" type="bool"/>
+    <field name="On-Die Table" start="67" end="67" type="uint">
+      <value name="Load" value="0"/>
+      <value name="Read" value="1"/>
+    </field>
+    <group count="0" start="96" size="32">
+      <field name="Entry_0" start="0" end="15" type="GATHER_CONSTANT_ENTRY"/>
+      <field name="Entry_1" start="16" end="31" type="GATHER_CONSTANT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_GATHER_CONSTANT_HS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="54"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Constant Buffer Valid" start="48" end="63" type="uint"/>
+    <field name="Constant Buffer Binding Table Block" start="44" end="47" type="uint"/>
+    <field name="Update Gather Table Only" start="33" end="33" type="uint">
+      <value name="Commit Gather" value="0"/>
+      <value name="Non-Commit Gather" value="1"/>
+    </field>
+    <field name="Gather Buffer Offset" start="70" end="86" type="offset"/>
+    <field name="Constant Buffer Dx9 Generate Stall" start="69" end="69" type="bool"/>
+    <field name="On-Die Table" start="67" end="67" type="uint">
+      <value name="Load" value="0"/>
+      <value name="Read" value="1"/>
+    </field>
+    <group count="0" start="96" size="32">
+      <field name="Entry_0" start="0" end="15" type="GATHER_CONSTANT_ENTRY"/>
+      <field name="Entry_1" start="16" end="31" type="GATHER_CONSTANT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_GATHER_CONSTANT_PS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="56"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Constant Buffer Valid" start="48" end="63" type="uint"/>
+    <field name="Constant Buffer Binding Table Block" start="44" end="47" type="uint"/>
+    <field name="Update Gather Table Only" start="33" end="33" type="uint">
+      <value name="Commit Gather" value="0"/>
+      <value name="Non-Commit Gather" value="1"/>
+    </field>
+    <field name="DX9 On-Die Register Read Enable" start="32" end="32" type="bool"/>
+    <field name="Gather Buffer Offset" start="70" end="86" type="offset"/>
+    <field name="Constant Buffer Dx9 Generate Stall" start="69" end="69" type="bool"/>
+    <field name="Constant Buffer Dx9 Enable" start="68" end="68" type="bool"/>
+    <field name="On-Die Table" start="67" end="67" type="uint">
+      <value name="Load" value="0"/>
+      <value name="Read" value="1"/>
+    </field>
+    <group count="0" start="96" size="32">
+      <field name="Entry_0" start="0" end="15" type="GATHER_CONSTANT_ENTRY"/>
+      <field name="Entry_1" start="16" end="31" type="GATHER_CONSTANT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_GATHER_CONSTANT_VS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="52"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Constant Buffer Valid" start="48" end="63" type="uint"/>
+    <field name="Constant Buffer Binding Table Block" start="44" end="47" type="uint"/>
+    <field name="Update Gather Table Only" start="33" end="33" type="uint">
+      <value name="Commit Gather" value="0"/>
+      <value name="Non-Commit Gather" value="1"/>
+    </field>
+    <field name="DX9 On-Die Register Read Enable" start="32" end="32" type="bool"/>
+    <field name="Gather Buffer Offset" start="70" end="86" type="offset"/>
+    <field name="Constant Buffer Dx9 Generate Stall" start="69" end="69" type="bool"/>
+    <field name="Constant Buffer Dx9 Enable" start="68" end="68" type="bool"/>
+    <field name="On-Die Table" start="67" end="67" type="uint">
+      <value name="Load" value="0"/>
+      <value name="Read" value="1"/>
+    </field>
+    <group count="0" start="96" size="32">
+      <field name="Entry_0" start="0" end="15" type="GATHER_CONSTANT_ENTRY"/>
+      <field name="Entry_1" start="16" end="31" type="GATHER_CONSTANT_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_GATHER_POOL_ALLOC" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="26"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Gather Pool Base Address" start="44" end="95" type="address"/>
+    <field name="Gather Pool Enable" start="43" end="43" type="bool"/>
+    <field name="Memory Object Control State" start="32" end="38" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Gather Pool Buffer Size" start="108" end="127" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_GS" bias="2" length="10">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="17"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="8"/>
+    <field name="Kernel Start Pointer" start="38" end="95" type="offset"/>
+    <field name="Single Program Flow" start="127" end="127" type="bool"/>
+    <field name="Vector Mask Enable" start="126" end="126" type="bool"/>
+    <field name="Sampler Count" start="123" end="125" type="uint">
+      <value name="No Samplers" value="0"/>
+      <value name="1-4 Samplers" value="1"/>
+      <value name="5-8 Samplers" value="2"/>
+      <value name="9-12 Samplers" value="3"/>
+      <value name="13-16 Samplers" value="4"/>
+    </field>
+    <field name="Binding Table Entry Count" start="114" end="121" type="uint"/>
+    <field name="Thread Dispatch Priority" start="113" end="113" type="uint">
+      <value name="High" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="112" end="112" type="uint">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="109" end="109" type="bool"/>
+    <field name="Accesses UAV" start="108" end="108" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="107" end="107" type="bool"/>
+    <field name="Software Exception Enable" start="103" end="103" type="bool"/>
+    <field name="Expected Vertex Count" start="96" end="101" type="uint"/>
+    <field name="Scratch Space Base Pointer" start="138" end="191" type="address"/>
+    <field name="Per-Thread Scratch Space" start="128" end="131" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data [5:4]" start="221" end="222" type="uint"/>
+    <field name="Output Vertex Size" start="215" end="220" type="uint"/>
+    <field name="Output Topology" start="209" end="214" type="3D_Prim_Topo_Type"/>
+    <field name="Vertex URB Entry Read Length" start="203" end="208" type="uint"/>
+    <field name="Include Vertex Handles" start="202" end="202" type="bool"/>
+    <field name="Vertex URB Entry Read Offset" start="196" end="201" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="192" end="195" type="uint"/>
+    <field name="Control Data Header Size" start="244" end="247" type="uint"/>
+    <field name="Instance Control" start="239" end="243" type="uint"/>
+    <field name="Default Stream Id" start="237" end="238" type="uint"/>
+    <field name="Dispatch Mode" start="235" end="236" type="uint" prefix="DISPATCH_MODE">
+      <value name="Dual Instance" value="1"/>
+      <value name="Dual Object" value="2"/>
+      <value name="SIMD8" value="3"/>
+    </field>
+    <field name="Statistics Enable" start="234" end="234" type="bool"/>
+    <field name="Invocations Increment Value" start="229" end="233" type="uint"/>
+    <field name="Include Primitive ID" start="228" end="228" type="bool"/>
+    <field name="Hint" start="227" end="227" type="uint"/>
+    <field name="Reorder Mode" start="226" end="226" type="uint">
+      <value name="LEADING" value="0"/>
+      <value name="TRAILING" value="1"/>
+    </field>
+    <field name="Discard Adjacency" start="225" end="225" type="bool"/>
+    <field name="Enable" start="224" end="224" type="bool"/>
+    <field name="Control Data Format" start="287" end="287" type="uint">
+      <value name="CUT" value="0"/>
+      <value name="SID" value="1"/>
+    </field>
+    <field name="Static Output" start="286" end="286" type="bool"/>
+    <field name="Static Output Vertex Count" start="272" end="282" type="uint"/>
+    <field name="Maximum Number of Threads" start="256" end="264" type="uint"/>
+    <field name="Vertex URB Entry Output Read Offset" start="309" end="314" type="uint"/>
+    <field name="Vertex URB Entry Output Length" start="304" end="308" type="uint"/>
+    <field name="User Clip Distance Clip Test Enable Bitmask" start="296" end="303" type="uint"/>
+    <field name="User Clip Distance Cull Test Enable Bitmask" start="288" end="295" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_HIER_DEPTH_BUFFER" bias="2" length="5">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="7"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="Hierarchical Depth Buffer Object Control State" start="57" end="63" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Hierarchical Depth Buffer MOCS" start="57" end="63" type="uint"/>
+    <field name="Surface Pitch" start="32" end="48" type="uint"/>
+    <field name="Surface Base Address" start="64" end="127" type="address"/>
+    <field name="Surface QPitch" start="128" end="142" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_HS" bias="2" length="9">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="27"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="7"/>
+    <field name="Sampler Count" start="59" end="61" type="uint">
+      <value name="No Samplers" value="0"/>
+      <value name="1-4 Samplers" value="1"/>
+      <value name="5-8 Samplers" value="2"/>
+      <value name="9-12 Samplers" value="3"/>
+      <value name="13-16 Samplers" value="4"/>
+    </field>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Dispatch Priority" start="49" end="49" type="uint">
+      <value name="High" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint">
+      <value name="IEEE-754" value="0"/>
+      <value name="alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Software Exception Enable" start="44" end="44" type="bool"/>
+    <field name="Enable" start="95" end="95" type="bool"/>
+    <field name="Statistics Enable" start="93" end="93" type="bool"/>
+    <field name="Maximum Number of Threads" start="72" end="80" type="uint"/>
+    <field name="Instance Count" start="64" end="67" type="uint"/>
+    <field name="Kernel Start Pointer" start="102" end="159" type="offset"/>
+    <field name="Scratch Space Base Pointer" start="170" end="223" type="address"/>
+    <field name="Per-Thread Scratch Space" start="160" end="163" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data [5]" start="252" end="252" type="uint"/>
+    <field name="Single Program Flow" start="251" end="251" type="bool"/>
+    <field name="Vector Mask Enable" start="250" end="250" type="bool"/>
+    <field name="Accesses UAV" start="249" end="249" type="bool"/>
+    <field name="Include Vertex Handles" start="248" end="248" type="bool"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="243" end="247" type="uint"/>
+    <field name="Dispatch Mode" start="241" end="242" type="uint" prefix="DISPATCH_MODE">
+      <value name="SINGLE_PATCH" value="0"/>
+      <value name="DUAL_PATCH" value="1"/>
+      <value name="8_PATCH" value="2"/>
+    </field>
+    <field name="Vertex URB Entry Read Length" start="235" end="240" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="228" end="233" type="uint"/>
+    <field name="Include Primitive ID" start="224" end="224" type="bool"/>
+  </instruction>
+
+  <instruction name="3DSTATE_INDEX_BUFFER" bias="2" length="5">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="10"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="Index Format" start="40" end="41" type="uint" prefix="INDEX">
+      <value name="BYTE" value="0"/>
+      <value name="WORD" value="1"/>
+      <value name="DWORD" value="2"/>
+    </field>
+    <field name="Memory Object Control State" start="32" end="38" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="IndexBufferMOCS" start="32" end="38" type="uint"/>
+    <field name="Buffer Starting Address" start="64" end="127" type="address"/>
+    <field name="Buffer Size" start="128" end="159" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_LINE_STIPPLE" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="8"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Modify Enable (Current Repeat Counter, Current Stipple Index)" start="63" end="63" type="bool"/>
+    <field name="Current Repeat Counter" start="53" end="61" type="uint"/>
+    <field name="Current Stipple Index" start="48" end="51" type="uint"/>
+    <field name="Line Stipple Pattern" start="32" end="47" type="uint"/>
+    <field name="Line Stipple Inverse Repeat Count" start="79" end="95" type="u1.16"/>
+    <field name="Line Stipple Repeat Count" start="64" end="72" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_MONOFILTER_SIZE" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="17"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Monochrome Filter Width" start="35" end="37" type="uint"/>
+    <field name="Monochrome Filter Height" start="32" end="34" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_MULTISAMPLE" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="13"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pixel Position Offset Enable" start="37" end="37" type="bool"/>
+    <field name="Pixel Location" start="36" end="36" type="uint">
+      <value name="CENTER" value="0"/>
+      <value name="UL_CORNER" value="1"/>
+    </field>
+    <field name="Number of Multisamples" start="33" end="35" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_POLY_STIPPLE_OFFSET" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="6"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Polygon Stipple X Offset" start="40" end="44" type="uint"/>
+    <field name="Polygon Stipple Y Offset" start="32" end="36" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_POLY_STIPPLE_PATTERN" bias="2" length="33">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="7"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="31"/>
+    <group count="32" start="32" size="32">
+      <field name="Pattern Row" start="0" end="31" type="uint"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_PS" bias="2" length="12">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="32"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="10"/>
+    <field name="Kernel Start Pointer 0" start="38" end="95" type="offset"/>
+    <field name="Single Program Flow" start="127" end="127" type="bool"/>
+    <field name="Vector Mask Enable" start="126" end="126" type="bool"/>
+    <field name="Sampler Count" start="123" end="125" type="uint">
+      <value name="No Samplers" value="0"/>
+      <value name="1-4 Samplers" value="1"/>
+      <value name="5-8 Samplers" value="2"/>
+      <value name="9-12 Samplers" value="3"/>
+      <value name="13-16 Samplers" value="4"/>
+    </field>
+    <field name="Single Precision Denormal Mode" start="122" end="122" type="uint">
+      <value name="Flushed to Zero" value="0"/>
+      <value name="Retained" value="1"/>
+    </field>
+    <field name="Binding Table Entry Count" start="114" end="121" type="uint"/>
+    <field name="Thread Dispatch Priority" start="113" end="113" type="uint">
+      <value name="High" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="112" end="112" type="uint">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Rounding Mode" start="110" end="111" type="uint">
+      <value name="RTNE" value="0"/>
+      <value name="RU" value="1"/>
+      <value name="RD" value="2"/>
+      <value name="RTZ" value="3"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="109" end="109" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="107" end="107" type="bool"/>
+    <field name="Software Exception Enable" start="103" end="103" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="138" end="191" type="address"/>
+    <field name="Per Thread Scratch Space" start="128" end="131" type="uint"/>
+    <field name="Maximum Number of Threads Per PSD" start="215" end="223" type="uint"/>
+    <field name="Push Constant Enable" start="203" end="203" type="bool"/>
+    <field name="Render Target Fast Clear Enable" start="200" end="200" type="bool"/>
+    <field name="Render Target Resolve Type" start="198" end="199" type="uint">
+      <value name="RESOLVE_DISABLED" value="0"/>
+      <value name="RESOLVE_PARTIAL" value="1"/>
+      <value name="FAST_CLEAR_0" value="2"/>
+      <value name="RESOLVE_FULL" value="3"/>
+    </field>
+    <field name="Position XY Offset Select" start="195" end="196" type="uint">
+      <value name="POSOFFSET_NONE" value="0"/>
+      <value name="POSOFFSET_CENTROID" value="2"/>
+      <value name="POSOFFSET_SAMPLE" value="3"/>
+    </field>
+    <field name="32 Pixel Dispatch Enable" start="194" end="194" type="bool"/>
+    <field name="16 Pixel Dispatch Enable" start="193" end="193" type="bool"/>
+    <field name="8 Pixel Dispatch Enable" start="192" end="192" type="bool"/>
+    <field name="Dispatch GRF Start Register For Constant/Setup Data 0" start="240" end="246" type="uint"/>
+    <field name="Dispatch GRF Start Register For Constant/Setup Data 1" start="232" end="238" type="uint"/>
+    <field name="Dispatch GRF Start Register For Constant/Setup Data 2" start="224" end="230" type="uint"/>
+    <field name="Kernel Start Pointer 1" start="262" end="319" type="offset"/>
+    <field name="Kernel Start Pointer 2" start="326" end="383" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_PS_BLEND" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="77"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Alpha To Coverage Enable" start="63" end="63" type="bool"/>
+    <field name="Has Writeable RT" start="62" end="62" type="bool"/>
+    <field name="Color Buffer Blend Enable" start="61" end="61" type="bool"/>
+    <field name="Source Alpha Blend Factor" start="56" end="60" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Alpha Blend Factor" start="51" end="55" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Source Blend Factor" start="46" end="50" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Blend Factor" start="41" end="45" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Alpha Test Enable" start="40" end="40" type="bool"/>
+    <field name="Independent Alpha Blend Enable" start="39" end="39" type="bool"/>
+  </instruction>
+
+  <instruction name="3DSTATE_PS_EXTRA" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="79"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pixel Shader Valid" start="63" end="63" type="bool"/>
+    <field name="Pixel Shader Does not write to RT" start="62" end="62" type="bool"/>
+    <field name="oMask Present to Render Target" start="61" end="61" type="bool"/>
+    <field name="Pixel Shader Kills Pixel" start="60" end="60" type="bool"/>
+    <field name="Pixel Shader Computed Depth Mode" start="58" end="59" type="uint">
+      <value name="PSCDEPTH_OFF" value="0"/>
+      <value name="PSCDEPTH_ON" value="1"/>
+      <value name="PSCDEPTH_ON_GE" value="2"/>
+      <value name="PSCDEPTH_ON_LE" value="3"/>
+    </field>
+    <field name="Force Computed Depth" start="57" end="57" type="bool"/>
+    <field name="Pixel Shader Uses Source Depth" start="56" end="56" type="bool"/>
+    <field name="Pixel Shader Uses Source W" start="55" end="55" type="bool"/>
+    <field name="Pixel Shader Requires Source Depth and/or W Plane Coefficients" start="53" end="53" type="bool"/>
+    <field name="Pixel Shader Requires Perspective Bary Plane Coefficients" start="52" end="52" type="bool"/>
+    <field name="Pixel Shader Requires Non-Perspective Bary Plane Coefficients" start="51" end="51" type="bool"/>
+    <field name="Pixel Shader Requires Subpixel Sample Offsets" start="50" end="50" type="bool"/>
+    <field name="Simple PS Hint" start="41" end="41" type="bool"/>
+    <field name="Attribute Enable" start="40" end="40" type="bool"/>
+    <field name="Pixel Shader Disables Alpha To Coverage" start="39" end="39" type="bool"/>
+    <field name="Pixel Shader Is Per Sample" start="38" end="38" type="bool"/>
+    <field name="Pixel Shader Computes Stencil" start="37" end="37" type="bool"/>
+    <field name="Pixel Shader Pulls Bary" start="35" end="35" type="bool"/>
+    <field name="Pixel Shader Has UAV" start="34" end="34" type="bool"/>
+    <field name="Input Coverage Mask State" start="32" end="33" type="uint" prefix="ICMS">
+      <value name="NONE" value="0"/>
+      <value name="NORMAL" value="1"/>
+      <value name="INNER_CONSERVATIVE" value="2"/>
+      <value name="DEPTH_COVERAGE" value="3"/>
+    </field>
+  </instruction>
+
+  <instruction name="3DSTATE_PUSH_CONSTANT_ALLOC_DS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="20"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Constant Buffer Offset" start="48" end="52" type="uint"/>
+    <field name="Constant Buffer Size" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_PUSH_CONSTANT_ALLOC_GS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="21"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Constant Buffer Offset" start="48" end="52" type="uint"/>
+    <field name="Constant Buffer Size" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_PUSH_CONSTANT_ALLOC_HS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="19"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Constant Buffer Offset" start="48" end="52" type="uint"/>
+    <field name="Constant Buffer Size" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_PUSH_CONSTANT_ALLOC_PS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="22"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Constant Buffer Offset" start="48" end="52" type="uint"/>
+    <field name="Constant Buffer Size" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_PUSH_CONSTANT_ALLOC_VS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="18"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Constant Buffer Offset" start="48" end="52" type="uint"/>
+    <field name="Constant Buffer Size" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_RASTER" bias="2" length="5">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="80"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="Viewport Z Far Clip Test Enable" start="58" end="58" type="bool"/>
+    <field name="Conservative Rasterization Enable" start="56" end="56" type="bool"/>
+    <field name="API Mode" start="54" end="55" type="uint">
+      <value name="DX9/OGL" value="0"/>
+      <value name="DX10.0" value="1"/>
+      <value name="DX10.1+" value="2"/>
+    </field>
+    <field name="Front Winding" start="53" end="53" type="uint">
+      <value name="Clockwise" value="0"/>
+      <value name="Counter Clockwise" value="1"/>
+    </field>
+    <field name="Forced Sample Count" start="50" end="52" type="uint" prefix="FSC">
+      <value name="NUMRASTSAMPLES_0" value="0"/>
+      <value name="NUMRASTSAMPLES_1" value="1"/>
+      <value name="NUMRASTSAMPLES_2" value="2"/>
+      <value name="NUMRASTSAMPLES_4" value="3"/>
+      <value name="NUMRASTSAMPLES_8" value="4"/>
+      <value name="NUMRASTSAMPLES_16" value="5"/>
+    </field>
+    <field name="Cull Mode" start="48" end="49" type="uint" prefix="CULLMODE">
+      <value name="BOTH" value="0"/>
+      <value name="NONE" value="1"/>
+      <value name="FRONT" value="2"/>
+      <value name="BACK" value="3"/>
+    </field>
+    <field name="Force Multisampling" start="46" end="46" type="uint"/>
+    <field name="Smooth Point Enable" start="45" end="45" type="bool"/>
+    <field name="DX Multisample Rasterization Enable" start="44" end="44" type="bool"/>
+    <field name="DX Multisample Rasterization Mode" start="42" end="43" type="uint">
+      <value name="MSRASTMODE_ OFF_PIXEL" value="0"/>
+      <value name="MSRASTMODE_ OFF_PATTERN" value="1"/>
+      <value name="MSRASTMODE_ ON_PIXEL" value="2"/>
+      <value name="MSRASTMODE_ ON_PATTERN" value="3"/>
+    </field>
+    <field name="Global Depth Offset Enable Solid" start="41" end="41" type="bool"/>
+    <field name="Global Depth Offset Enable Wireframe" start="40" end="40" type="bool"/>
+    <field name="Global Depth Offset Enable Point" start="39" end="39" type="bool"/>
+    <field name="Front Face Fill Mode" start="37" end="38" type="uint" prefix="FILL_MODE">
+      <value name="SOLID" value="0"/>
+      <value name="WIREFRAME" value="1"/>
+      <value name="POINT" value="2"/>
+    </field>
+    <field name="Back Face Fill Mode" start="35" end="36" type="uint" prefix="FILL_MODE">
+      <value name="SOLID" value="0"/>
+      <value name="WIREFRAME" value="1"/>
+      <value name="POINT" value="2"/>
+    </field>
+    <field name="Antialiasing Enable" start="34" end="34" type="bool"/>
+    <field name="Scissor Rectangle Enable" start="33" end="33" type="bool"/>
+    <field name="Viewport Z Near Clip Test Enable" start="32" end="32" type="bool"/>
+    <field name="Global Depth Offset Constant" start="64" end="95" type="float"/>
+    <field name="Global Depth Offset Scale" start="96" end="127" type="float"/>
+    <field name="Global Depth Offset Clamp" start="128" end="159" type="float"/>
+  </instruction>
+
+  <instruction name="3DSTATE_RS_CONSTANT_POINTER" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="84"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Shader Select" start="60" end="62" type="uint">
+      <value name="VS" value="0"/>
+      <value name="PS" value="4"/>
+    </field>
+    <field name="Operation Load or Store" start="44" end="44" type="uint" prefix="RS">
+      <value name="Store" value="0"/>
+      <value name="Load" value="1"/>
+    </field>
+    <field name="Global Constant Buffer Address" start="70" end="95" type="address"/>
+    <field name="Global Constant Buffer Address High" start="96" end="127" type="address"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SAMPLER_PALETTE_LOAD0" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="2"/>
+    <field name="DWord Length" start="0" end="7" type="uint"/>
+    <group count="0" start="32" size="32">
+      <field name="Entry" start="0" end="31" type="PALETTE_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_SAMPLER_PALETTE_LOAD1" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="12"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <group count="0" start="32" size="32">
+      <field name="Palette Alpha[0:N-1]" start="24" end="31" type="uint"/>
+      <field name="Palette Red[0:N-1]" start="16" end="23" type="uint"/>
+      <field name="Palette Green[0:N-1]" start="8" end="15" type="uint"/>
+      <field name="Palette Blue[0:N-1]" start="0" end="7" type="uint"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_SAMPLER_STATE_POINTERS_DS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="45"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to DS Sampler State" start="37" end="63" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SAMPLER_STATE_POINTERS_GS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="46"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to GS Sampler State" start="37" end="63" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SAMPLER_STATE_POINTERS_HS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="44"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to HS Sampler State" start="37" end="63" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SAMPLER_STATE_POINTERS_PS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="47"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to PS Sampler State" start="37" end="63" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SAMPLER_STATE_POINTERS_VS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="43"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Pointer to VS Sampler State" start="37" end="63" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SAMPLE_MASK" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="24"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Sample Mask" start="32" end="47" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SAMPLE_PATTERN" bias="2" length="9">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="28"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="7"/>
+    <field name="16x Sample3 X Offset" start="60" end="63" type="u0.4"/>
+    <field name="16x Sample3 Y Offset" start="56" end="59" type="u0.4"/>
+    <field name="16x Sample2 X Offset" start="52" end="55" type="u0.4"/>
+    <field name="16x Sample2 Y Offset" start="48" end="51" type="u0.4"/>
+    <field name="16x Sample1 X Offset" start="44" end="47" type="u0.4"/>
+    <field name="16x Sample1 Y Offset" start="40" end="43" type="u0.4"/>
+    <field name="16x Sample0 X Offset" start="36" end="39" type="u0.4"/>
+    <field name="16x Sample0 Y Offset" start="32" end="35" type="u0.4"/>
+    <field name="16x Sample7 X Offset" start="92" end="95" type="u0.4"/>
+    <field name="16x Sample7 Y Offset" start="88" end="91" type="u0.4"/>
+    <field name="16x Sample6 X Offset" start="84" end="87" type="u0.4"/>
+    <field name="16x Sample6 Y Offset" start="80" end="83" type="u0.4"/>
+    <field name="16x Sample5 X Offset" start="76" end="79" type="u0.4"/>
+    <field name="16x Sample5 Y Offset" start="72" end="75" type="u0.4"/>
+    <field name="16x Sample4 X Offset" start="68" end="71" type="u0.4"/>
+    <field name="16x Sample4 Y Offset" start="64" end="67" type="u0.4"/>
+    <field name="16x Sample11 X Offset" start="124" end="127" type="u0.4"/>
+    <field name="16x Sample11 Y Offset" start="120" end="123" type="u0.4"/>
+    <field name="16x Sample10 X Offset" start="116" end="119" type="u0.4"/>
+    <field name="16x Sample10 Y Offset" start="112" end="115" type="u0.4"/>
+    <field name="16x Sample9 X Offset" start="108" end="111" type="u0.4"/>
+    <field name="16x Sample9 Y Offset" start="104" end="107" type="u0.4"/>
+    <field name="16x Sample8 X Offset" start="100" end="103" type="u0.4"/>
+    <field name="16x Sample8 Y Offset" start="96" end="99" type="u0.4"/>
+    <field name="16x Sample15 X Offset" start="156" end="159" type="u0.4"/>
+    <field name="16x Sample15 Y Offset" start="152" end="155" type="u0.4"/>
+    <field name="16x Sample14 X Offset" start="148" end="151" type="u0.4"/>
+    <field name="16x Sample14 Y Offset" start="144" end="147" type="u0.4"/>
+    <field name="16x Sample13 X Offset" start="140" end="143" type="u0.4"/>
+    <field name="16x Sample13 Y Offset" start="136" end="139" type="u0.4"/>
+    <field name="16x Sample12 X Offset" start="132" end="135" type="u0.4"/>
+    <field name="16x Sample12 Y Offset" start="128" end="131" type="u0.4"/>
+    <field name="8x Sample7 X Offset" start="188" end="191" type="u0.4"/>
+    <field name="8x Sample7 Y Offset" start="184" end="187" type="u0.4"/>
+    <field name="8x Sample6 X Offset" start="180" end="183" type="u0.4"/>
+    <field name="8x Sample6 Y Offset" start="176" end="179" type="u0.4"/>
+    <field name="8x Sample5 X Offset" start="172" end="175" type="u0.4"/>
+    <field name="8x Sample5 Y Offset" start="168" end="171" type="u0.4"/>
+    <field name="8x Sample4 X Offset" start="164" end="167" type="u0.4"/>
+    <field name="8x Sample4 Y Offset" start="160" end="163" type="u0.4"/>
+    <field name="8x Sample3 X Offset" start="220" end="223" type="u0.4"/>
+    <field name="8x Sample3 Y Offset" start="216" end="219" type="u0.4"/>
+    <field name="8x Sample2 X Offset" start="212" end="215" type="u0.4"/>
+    <field name="8x Sample2 Y Offset" start="208" end="211" type="u0.4"/>
+    <field name="8x Sample1 X Offset" start="204" end="207" type="u0.4"/>
+    <field name="8x Sample1 Y Offset" start="200" end="203" type="u0.4"/>
+    <field name="8x Sample0 X Offset" start="196" end="199" type="u0.4"/>
+    <field name="8x Sample0 Y Offset" start="192" end="195" type="u0.4"/>
+    <field name="4x Sample3 X Offset" start="252" end="255" type="u0.4"/>
+    <field name="4x Sample3 Y Offset" start="248" end="251" type="u0.4"/>
+    <field name="4x Sample2 X Offset" start="244" end="247" type="u0.4"/>
+    <field name="4x Sample2 Y Offset" start="240" end="243" type="u0.4"/>
+    <field name="4x Sample1 X Offset" start="236" end="239" type="u0.4"/>
+    <field name="4x Sample1 Y Offset" start="232" end="235" type="u0.4"/>
+    <field name="4x Sample0 X Offset" start="228" end="231" type="u0.4"/>
+    <field name="4x Sample0 Y Offset" start="224" end="227" type="u0.4"/>
+    <field name="1x Sample0 X Offset" start="276" end="279" type="u0.4"/>
+    <field name="1x Sample0 Y Offset" start="272" end="275" type="u0.4"/>
+    <field name="2x Sample1 X Offset" start="268" end="271" type="u0.4"/>
+    <field name="2x Sample1 Y Offset" start="264" end="267" type="u0.4"/>
+    <field name="2x Sample0 X Offset" start="260" end="263" type="u0.4"/>
+    <field name="2x Sample0 Y Offset" start="256" end="259" type="u0.4"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SBE" bias="2" length="6">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="31"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
+    <field name="Force Vertex URB Entry Read Length" start="61" end="61" type="bool"/>
+    <field name="Force Vertex URB Entry Read Offset" start="60" end="60" type="bool"/>
+    <field name="Number of SF Output Attributes" start="54" end="59" type="uint"/>
+    <field name="Attribute Swizzle Enable" start="53" end="53" type="bool"/>
+    <field name="Point Sprite Texture Coordinate Origin" start="52" end="52" type="uint">
+      <value name="UPPERLEFT" value="0"/>
+      <value name="LOWERLEFT" value="1"/>
+    </field>
+    <field name="Primitive ID Override Component W" start="51" end="51" type="bool"/>
+    <field name="Primitive ID Override Component Z" start="50" end="50" type="bool"/>
+    <field name="Primitive ID Override Component Y" start="49" end="49" type="bool"/>
+    <field name="Primitive ID Override Component X" start="48" end="48" type="bool"/>
+    <field name="Vertex URB Entry Read Length" start="43" end="47" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="37" end="42" type="uint"/>
+    <field name="Primitive ID Override Attribute Select" start="32" end="36" type="uint"/>
+    <field name="Point Sprite Texture Coordinate Enable" start="64" end="95" type="uint"/>
+    <field name="Constant Interpolation Enable" start="96" end="127" type="uint"/>
+    <group count="32" start="128" size="2">
+      <field name="Attribute Active Component Format" start="0" end="1" type="uint" prefix="ACTIVE_COMPONENT">
+         <value name="DISABLED" value="0"/>
+         <value name="XY" value="1"/>
+         <value name="XYZ" value="2"/>
+         <value name="XYZW" value="3"/>
+      </field>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_SBE_SWIZ" bias="2" length="11">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="81"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="9"/>
+    <group count="16" start="32" size="16">
+      <field name="Attribute" start="0" end="15" type="SF_OUTPUT_ATTRIBUTE_DETAIL"/>
+    </group>
+    <group count="16" start="288" size="4">
+      <field name="Attribute Wrap Shortest Enables" start="0" end="3" type="uint"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_SCISSOR_STATE_POINTERS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="15"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Scissor Rect Pointer" start="37" end="63" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SF" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="19"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Line Width" start="44" end="61" type="u11.7"/>
+    <field name="Legacy Global Depth Bias Enable" start="43" end="43" type="bool"/>
+    <field name="Statistics Enable" start="42" end="42" type="bool"/>
+    <field name="Viewport Transform Enable" start="33" end="33" type="bool"/>
+    <field name="Line End Cap Antialiasing Region Width" start="80" end="81" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Last Pixel Enable" start="127" end="127" type="bool"/>
+    <field name="Triangle Strip/List Provoking Vertex Select" start="125" end="126" type="uint"/>
+    <field name="Line Strip/List Provoking Vertex Select" start="123" end="124" type="uint"/>
+    <field name="Triangle Fan Provoking Vertex Select" start="121" end="122" type="uint"/>
+    <field name="AA Line Distance Mode" start="110" end="110" type="uint">
+      <value name="AALINEDISTANCE_TRUE" value="1"/>
+    </field>
+    <field name="Smooth Point Enable" start="109" end="109" type="bool"/>
+    <field name="Vertex Sub Pixel Precision Select" start="108" end="108" type="uint"/>
+    <field name="Point Width Source" start="107" end="107" type="uint">
+      <value name="Vertex" value="0"/>
+      <value name="State" value="1"/>
+    </field>
+    <field name="Point Width" start="96" end="106" type="u8.3"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SO_BUFFER" bias="2" length="8">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="24"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="6"/>
+    <field name="SO Buffer Enable" start="63" end="63" type="bool"/>
+    <field name="SO Buffer Index" start="61" end="62" type="uint"/>
+    <field name="SO Buffer Object Control State" start="54" end="60" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="SO Buffer MOCS" start="54" end="60" type="uint"/>
+    <field name="Stream Offset Write Enable" start="53" end="53" type="bool"/>
+    <field name="Stream Output Buffer Offset Address Enable" start="52" end="52" type="bool"/>
+    <field name="Surface Base Address" start="66" end="111" type="address"/>
+    <field name="Surface Size" start="128" end="157" type="uint"/>
+    <field name="Stream Output Buffer Offset Address" start="162" end="207" type="address"/>
+    <field name="Stream Offset" start="224" end="255" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_SO_DECL_LIST" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="23"/>
+    <field name="DWord Length" start="0" end="8" type="uint"/>
+    <field name="Stream to Buffer Selects [3]" start="44" end="47" type="uint"/>
+    <field name="Stream to Buffer Selects [2]" start="40" end="43" type="uint"/>
+    <field name="Stream to Buffer Selects [1]" start="36" end="39" type="uint"/>
+    <field name="Stream to Buffer Selects [0]" start="32" end="35" type="uint"/>
+    <field name="Num Entries [3]" start="88" end="95" type="uint"/>
+    <field name="Num Entries [2]" start="80" end="87" type="uint"/>
+    <field name="Num Entries [1]" start="72" end="79" type="uint"/>
+    <field name="Num Entries [0]" start="64" end="71" type="uint"/>
+    <group count="0" start="96" size="64">
+      <field name="Entry" start="0" end="63" type="SO_DECL_ENTRY"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_STENCIL_BUFFER" bias="2" length="5">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="6"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="Stencil Buffer Enable" start="63" end="63" type="bool"/>
+    <field name="Stencil Buffer Object Control State" start="54" end="60" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Stencil Buffer MOCS" start="54" end="60" type="uint"/>
+    <field name="Surface Pitch" start="32" end="48" type="uint"/>
+    <field name="Surface Base Address" start="64" end="127" type="address"/>
+    <field name="Surface QPitch" start="128" end="142" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_STREAMOUT" bias="2" length="5">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="30"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="SO Function Enable" start="63" end="63" type="bool"/>
+    <field name="Rendering Disable" start="62" end="62" type="bool"/>
+    <field name="Render Stream Select" start="59" end="60" type="uint"/>
+    <field name="Reorder Mode" start="58" end="58" type="uint">
+      <value name="LEADING" value="0"/>
+      <value name="TRAILING" value="1"/>
+    </field>
+    <field name="SO Statistics Enable" start="57" end="57" type="bool"/>
+    <field name="Force Rendering" start="55" end="56" type="uint">
+      <value name="Resreved" value="1"/>
+      <value name="Force_Off" value="2"/>
+      <value name="Force_on" value="3"/>
+    </field>
+    <field name="Stream 3 Vertex Read Offset" start="93" end="93" type="uint"/>
+    <field name="Stream 3 Vertex Read Length" start="88" end="92" type="uint"/>
+    <field name="Stream 2 Vertex Read Offset" start="85" end="85" type="uint"/>
+    <field name="Stream 2 Vertex Read Length" start="80" end="84" type="uint"/>
+    <field name="Stream 1 Vertex Read Offset" start="77" end="77" type="uint"/>
+    <field name="Stream 1 Vertex Read Length" start="72" end="76" type="uint"/>
+    <field name="Stream 0 Vertex Read Offset" start="69" end="69" type="uint"/>
+    <field name="Stream 0 Vertex Read Length" start="64" end="68" type="uint"/>
+    <field name="Buffer 1 Surface Pitch" start="112" end="123" type="uint"/>
+    <field name="Buffer 0 Surface Pitch" start="96" end="107" type="uint"/>
+    <field name="Buffer 3 Surface Pitch" start="144" end="155" type="uint"/>
+    <field name="Buffer 2 Surface Pitch" start="128" end="139" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_TE" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="28"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Partitioning" start="44" end="45" type="uint">
+      <value name="INTEGER" value="0"/>
+      <value name="ODD_FRACTIONAL" value="1"/>
+      <value name="EVEN_FRACTIONAL" value="2"/>
+    </field>
+    <field name="Output Topology" start="40" end="41" type="uint" prefix="OUTPUT">
+      <value name="POINT" value="0"/>
+      <value name="LINE" value="1"/>
+      <value name="TRI_CW" value="2"/>
+      <value name="TRI_CCW" value="3"/>
+    </field>
+    <field name="TE Domain" start="36" end="37" type="uint">
+      <value name="QUAD" value="0"/>
+      <value name="TRI" value="1"/>
+      <value name="ISOLINE" value="2"/>
+    </field>
+    <field name="TE Mode" start="33" end="34" type="uint">
+      <value name="HW_TESS" value="0"/>
+    </field>
+    <field name="TE Enable" start="32" end="32" type="bool"/>
+    <field name="Maximum Tessellation Factor Odd" start="64" end="95" type="float"/>
+    <field name="Maximum Tessellation Factor Not Odd" start="96" end="127" type="float"/>
+  </instruction>
+
+  <instruction name="3DSTATE_URB_CLEAR" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="29"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="URB Clear Length" start="48" end="61" type="uint"/>
+    <field name="URB Address" start="32" end="46" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_URB_DS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="50"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="DS URB Starting Address" start="57" end="63" type="uint"/>
+    <field name="DS URB Entry Allocation Size" start="48" end="56" type="uint"/>
+    <field name="DS Number of URB Entries" start="32" end="47" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_URB_GS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="51"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="GS URB Starting Address" start="57" end="63" type="uint"/>
+    <field name="GS URB Entry Allocation Size" start="48" end="56" type="uint"/>
+    <field name="GS Number of URB Entries" start="32" end="47" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_URB_HS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="49"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="HS URB Starting Address" start="57" end="63" type="uint"/>
+    <field name="HS URB Entry Allocation Size" start="48" end="56" type="uint"/>
+    <field name="HS Number of URB Entries" start="32" end="47" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_URB_VS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="48"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="VS URB Starting Address" start="57" end="63" type="uint"/>
+    <field name="VS URB Entry Allocation Size" start="48" end="56" type="uint"/>
+    <field name="VS Number of URB Entries" start="32" end="47" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VERTEX_BUFFERS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="8"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <group count="0" start="32" size="128">
+      <field name="Vertex Buffer State" start="0" end="127" type="VERTEX_BUFFER_STATE"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_VERTEX_ELEMENTS" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="9"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <group count="0" start="32" size="64">
+      <field name="Element" start="0" end="63" type="VERTEX_ELEMENT_STATE"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_VF" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="12"/>
+    <field name="VertexID Offset Enable" start="11" end="11" type="bool"/>
+    <field name="Sequential Draw Cut Index Enable" start="10" end="10" type="bool"/>
+    <field name="Component Packing Enable" start="9" end="9" type="bool"/>
+    <field name="Indexed Draw Cut Index Enable" start="8" end="8" type="bool"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Cut Index" start="32" end="63" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VF_COMPONENT_PACKING" bias="2" length="5">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="85"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="Vertex Element 07 Enables" start="60" end="63" type="uint"/>
+    <field name="Vertex Element 06 Enables" start="56" end="59" type="uint"/>
+    <field name="Vertex Element 05 Enables" start="52" end="55" type="uint"/>
+    <field name="Vertex Element 04 Enables" start="48" end="51" type="uint"/>
+    <field name="Vertex Element 03 Enables" start="44" end="47" type="uint"/>
+    <field name="Vertex Element 02 Enables" start="40" end="43" type="uint"/>
+    <field name="Vertex Element 01 Enables" start="36" end="39" type="uint"/>
+    <field name="Vertex Element 00 Enables" start="32" end="35" type="uint"/>
+    <field name="Vertex Element 15 Enables" start="92" end="95" type="uint"/>
+    <field name="Vertex Element 14 Enables" start="88" end="91" type="uint"/>
+    <field name="Vertex Element 13 Enables" start="84" end="87" type="uint"/>
+    <field name="Vertex Element 12 Enables" start="80" end="83" type="uint"/>
+    <field name="Vertex Element 11 Enables" start="76" end="79" type="uint"/>
+    <field name="Vertex Element 10 Enables" start="72" end="75" type="uint"/>
+    <field name="Vertex Element 09 Enables" start="68" end="71" type="uint"/>
+    <field name="Vertex Element 08 Enables" start="64" end="67" type="uint"/>
+    <field name="Vertex Element 23 Enables" start="124" end="127" type="uint"/>
+    <field name="Vertex Element 22 Enables" start="120" end="123" type="uint"/>
+    <field name="Vertex Element 21 Enables" start="116" end="119" type="uint"/>
+    <field name="Vertex Element 20 Enables" start="112" end="115" type="uint"/>
+    <field name="Vertex Element 19 Enables" start="108" end="111" type="uint"/>
+    <field name="Vertex Element 18 Enables" start="104" end="107" type="uint"/>
+    <field name="Vertex Element 17 Enables" start="100" end="103" type="uint"/>
+    <field name="Vertex Element 16 Enables" start="96" end="99" type="uint"/>
+    <field name="Vertex Element 31 Enables" start="156" end="159" type="uint"/>
+    <field name="Vertex Element 30 Enables" start="152" end="155" type="uint"/>
+    <field name="Vertex Element 29 Enables" start="148" end="151" type="uint"/>
+    <field name="Vertex Element 28 Enables" start="144" end="147" type="uint"/>
+    <field name="Vertex Element 27 Enables" start="140" end="143" type="uint"/>
+    <field name="Vertex Element 26 Enables" start="136" end="139" type="uint"/>
+    <field name="Vertex Element 25 Enables" start="132" end="135" type="uint"/>
+    <field name="Vertex Element 24 Enables" start="128" end="131" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VF_INSTANCING" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="73"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Instancing Enable" start="40" end="40" type="bool"/>
+    <field name="Vertex Element Index" start="32" end="37" type="uint"/>
+    <field name="Instance Data Step Rate" start="64" end="95" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VF_SGVS" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="74"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="InstanceID Enable" start="63" end="63" type="bool"/>
+    <field name="InstanceID Component Number" start="61" end="62" type="uint">
+      <value name="COMP_0" value="0"/>
+      <value name="COMP_1" value="1"/>
+      <value name="COMP_2" value="2"/>
+      <value name="COMP_3" value="3"/>
+    </field>
+    <field name="InstanceID Element Offset" start="48" end="53" type="uint"/>
+    <field name="VertexID Enable" start="47" end="47" type="bool"/>
+    <field name="VertexID Component Number" start="45" end="46" type="uint">
+      <value name="COMP_0" value="0"/>
+      <value name="COMP_1" value="1"/>
+      <value name="COMP_2" value="2"/>
+      <value name="COMP_3" value="3"/>
+    </field>
+    <field name="VertexID Element Offset" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VF_SGVS_2" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="86"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="XP1 Enable" start="63" end="63" type="uint"/>
+    <field name="XP1 Component Number" start="61" end="62" type="uint">
+      <value name="COMP_0" value="0"/>
+      <value name="COMP_1" value="1"/>
+      <value name="COMP_2" value="2"/>
+      <value name="COMP_3" value="3"/>
+    </field>
+    <field name="XP1 Source Select" start="60" end="60" type="uint">
+      <value name="Starting Instance Location" value="1"/>
+      <value name="XP1_PARAMETER" value="0"/>
+    </field>
+    <field name="XP1 Element Offset" start="48" end="53" type="uint"/>
+    <field name="XP0 Enable" start="47" end="47" type="uint"/>
+    <field name="XP0 Component Number" start="45" end="46" type="uint">
+      <value name="COMP_0" value="0"/>
+      <value name="COMP_1" value="1"/>
+      <value name="COMP_2" value="2"/>
+      <value name="COMP_3" value="3"/>
+    </field>
+    <field name="XP0 Source Select" start="44" end="44" type="uint">
+      <value name="VERTEX_LOCATION" value="1"/>
+      <value name="XP0_PARAMETER" value="0"/>
+    </field>
+    <field name="XP0 Element Offset" start="32" end="37" type="uint"/>
+    <field name="XP2 Enable" start="79" end="79" type="uint"/>
+    <field name="XP2 Component Number" start="77" end="78" type="uint">
+      <value name="COMP_0" value="0"/>
+      <value name="COMP_1" value="1"/>
+      <value name="COMP_2" value="2"/>
+      <value name="COMP_3" value="3"/>
+    </field>
+    <field name="XP2 Element Offset" start="64" end="69" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VF_STATISTICS" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="1"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="11"/>
+    <field name="Statistics Enable" start="0" end="0" type="bool"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VF_TOPOLOGY" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="75"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Primitive Topology Type" start="32" end="37" type="3D_Prim_Topo_Type"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VIEWPORT_STATE_POINTERS_CC" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="35"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="CC Viewport Pointer" start="37" end="63" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="33"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="SF Clip Viewport Pointer" start="38" end="63" type="offset"/>
+  </instruction>
+
+  <instruction name="3DSTATE_VS" bias="2" length="9">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="16"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="7"/>
+    <field name="Kernel Start Pointer" start="38" end="95" type="offset"/>
+    <field name="Single Vertex Dispatch" start="127" end="127" type="bool"/>
+    <field name="Vector Mask Enable" start="126" end="126" type="bool"/>
+    <field name="Sampler Count" start="123" end="125" type="uint">
+      <value name="No Samplers" value="0"/>
+      <value name="1-4 Samplers" value="1"/>
+      <value name="5-8 Samplers" value="2"/>
+      <value name="9-12 Samplers" value="3"/>
+      <value name="13-16 Samplers" value="4"/>
+    </field>
+    <field name="Binding Table Entry Count" start="114" end="121" type="uint"/>
+    <field name="Thread Dispatch Priority" start="113" end="113" type="uint">
+      <value name="High" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="112" end="112" type="uint">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="109" end="109" type="bool"/>
+    <field name="Accesses UAV" start="108" end="108" type="bool"/>
+    <field name="Software Exception Enable" start="103" end="103" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="138" end="191" type="address"/>
+    <field name="Per-Thread Scratch Space" start="128" end="131" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="212" end="216" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="203" end="208" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="196" end="201" type="uint"/>
+    <field name="Maximum Number of Threads" start="246" end="255" type="uint"/>
+    <field name="Statistics Enable" start="234" end="234" type="bool"/>
+    <field name="SIMD8 Single Instance Dispatch Enable" start="233" end="233" type="bool"/>
+    <field name="SIMD8 Dispatch Enable" start="226" end="226" type="bool"/>
+    <field name="Vertex Cache Disable" start="225" end="225" type="bool"/>
+    <field name="Enable" start="224" end="224" type="bool"/>
+    <field name="Vertex URB Entry Output Read Offset" start="277" end="282" type="uint"/>
+    <field name="Vertex URB Entry Output Length" start="272" end="276" type="uint"/>
+    <field name="User Clip Distance Clip Test Enable Bitmask" start="264" end="271" type="uint"/>
+    <field name="User Clip Distance Cull Test Enable Bitmask" start="256" end="263" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_WM" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="20"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Statistics Enable" start="63" end="63" type="bool"/>
+    <field name="Legacy Depth Buffer Clear Enable" start="62" end="62" type="bool"/>
+    <field name="Legacy Depth Buffer Resolve Enable" start="60" end="60" type="bool"/>
+    <field name="Legacy Hierarchical Depth Buffer Resolve Enable" start="59" end="59" type="bool"/>
+    <field name="Legacy Diamond Line Rasterization" start="58" end="58" type="bool"/>
+    <field name="Early Depth/Stencil Control" start="53" end="54" type="uint">
+      <value name="EDSC_NORMAL" value="0"/>
+      <value name="EDSC_PSEXEC" value="1"/>
+      <value name="EDSC_PREPS" value="2"/>
+    </field>
+    <field name="Force Thread Dispatch Enable" start="51" end="52" type="uint">
+      <value name="ForceOff" value="1"/>
+      <value name="ForceON" value="2"/>
+    </field>
+    <field name="Position ZW Interpolation Mode" start="49" end="50" type="uint">
+      <value name="INTERP_PIXEL" value="0"/>
+      <value name="INTERP_CENTROID" value="2"/>
+      <value name="INTERP_SAMPLE" value="3"/>
+    </field>
+    <field name="Barycentric Interpolation Mode" start="43" end="48" type="uint">
+      <value name="BIM_PERSPECTIVE_PIXEL" value="1"/>
+      <value name="BIM_PERSPECTIVE_CENTROID" value ="2"/>
+      <value name="BIM_PERSPECTIVE_SAMPLE" value="4"/>
+      <value name="BIM_LINEAR_PIXEL" value="8"/>
+      <value name="BIM_LINEAR_CENTROID" value="16"/>
+      <value name="BIM_LINEAR_SAMPLE" value="32"/>
+    </field>
+    <field name="Line End Cap Antialiasing Region Width" start="40" end="41" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Line Antialiasing Region Width" start="38" end="39" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Polygon Stipple Enable" start="36" end="36" type="bool"/>
+    <field name="Line Stipple Enable" start="35" end="35" type="bool"/>
+    <field name="Point Rasterization Rule" start="34" end="34" type="uint">
+      <value name="RASTRULE_UPPER_LEFT" value="0"/>
+      <value name="RASTRULE_UPPER_RIGHT" value="1"/>
+    </field>
+    <field name="Force Kill Pixel Enable" start="32" end="33" type="uint">
+      <value name="ForceOff" value="1"/>
+      <value name="ForceON" value="2"/>
+    </field>
+  </instruction>
+
+  <instruction name="3DSTATE_WM_CHROMAKEY" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="76"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="ChromaKey Kill Enable" start="63" end="63" type="bool"/>
+  </instruction>
+
+  <instruction name="3DSTATE_WM_DEPTH_STENCIL" bias="2" length="4"> <!-- Fixed 4-dword packet; DWord Length default 2 = length 4 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="78"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Stencil Fail Op" start="61" end="63" type="3D_Stencil_Operation"/> <!-- DW1 (bits 32-63): stencil ops, compare functions and enable bits, listed from high bit to low bit -->
+    <field name="Stencil Pass Depth Fail Op" start="58" end="60" type="3D_Stencil_Operation"/>
+    <field name="Stencil Pass Depth Pass Op" start="55" end="57" type="3D_Stencil_Operation"/>
+    <field name="Backface Stencil Test Function" start="52" end="54" type="3D_Compare_Function"/>
+    <field name="Backface Stencil Fail Op" start="49" end="51" type="3D_Stencil_Operation"/>
+    <field name="Backface Stencil Pass Depth Fail Op" start="46" end="48" type="3D_Stencil_Operation"/>
+    <field name="Backface Stencil Pass Depth Pass Op" start="43" end="45" type="3D_Stencil_Operation"/>
+    <field name="Stencil Test Function" start="40" end="42" type="3D_Compare_Function"/>
+    <field name="Depth Test Function" start="37" end="39" type="3D_Compare_Function"/>
+    <field name="Double Sided Stencil Enable" start="36" end="36" type="bool"/>
+    <field name="Stencil Test Enable" start="35" end="35" type="bool"/>
+    <field name="Stencil Buffer Write Enable" start="34" end="34" type="bool"/>
+    <field name="Depth Test Enable" start="33" end="33" type="bool"/>
+    <field name="Depth Buffer Write Enable" start="32" end="32" type="bool"/>
+    <field name="Stencil Test Mask" start="88" end="95" type="uint"/> <!-- DW2 (bits 64-95): four 8-bit test/write masks -->
+    <field name="Stencil Write Mask" start="80" end="87" type="uint"/>
+    <field name="Backface Stencil Test Mask" start="72" end="79" type="uint"/>
+    <field name="Backface Stencil Write Mask" start="64" end="71" type="uint"/>
+    <field name="Stencil Reference Value" start="104" end="111" type="uint"/> <!-- DW3 (bits 96-127): two 8-bit reference values; bits 112-127 are not declared -->
+    <field name="Backface Stencil Reference Value" start="96" end="103" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_WM_HZ_OP" bias="2" length="5"> <!-- Fixed 5-dword packet; DWord Length default 3 = length 5 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="82"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="Stencil Buffer Clear Enable" start="63" end="63" type="bool"/> <!-- DW1 (bits 32-63): single-bit enables at bits 57-63, then two multi-bit values -->
+    <field name="Depth Buffer Clear Enable" start="62" end="62" type="bool"/>
+    <field name="Scissor Rectangle Enable" start="61" end="61" type="bool"/>
+    <field name="Depth Buffer Resolve Enable" start="60" end="60" type="bool"/>
+    <field name="Hierarchical Depth Buffer Resolve Enable" start="59" end="59" type="bool"/>
+    <field name="Pixel Position Offset Enable" start="58" end="58" type="bool"/>
+    <field name="Full Surface Depth and Stencil Clear" start="57" end="57" type="bool"/>
+    <field name="Stencil Clear Value" start="48" end="55" type="uint"/> <!-- 8 bits wide; bit 56 is not declared -->
+    <field name="Number of Multisamples" start="45" end="47" type="uint"/>
+    <field name="Clear Rectangle Y Min" start="80" end="95" type="uint"/> <!-- DW2 (bits 64-95): rectangle minimum, Y in the high half and X in the low half -->
+    <field name="Clear Rectangle X Min" start="64" end="79" type="uint"/>
+    <field name="Clear Rectangle Y Max" start="112" end="127" type="uint"/> <!-- DW3 (bits 96-127): rectangle maximum, same Y/X split -->
+    <field name="Clear Rectangle X Max" start="96" end="111" type="uint"/>
+    <field name="Sample Mask" start="128" end="143" type="uint"/> <!-- DW4 (bits 128-159): 16-bit mask in the low half -->
+  </instruction>
+
+  <instruction name="GPGPU_WALKER" bias="2" length="15"> <!-- Fixed 15-dword packet; DWord Length default 13 = length 15 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields plus two single-bit enables -->
+    <field name="Pipeline" start="27" end="28" type="uint" default="2"/>
+    <field name="Media Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="SubOpcode" start="16" end="23" type="uint" default="5"/>
+    <field name="Indirect Parameter Enable" start="10" end="10" type="bool"/>
+    <field name="Predicate Enable" start="8" end="8" type="bool"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="13"/>
+    <field name="Interface Descriptor Offset" start="32" end="37" type="uint"/> <!-- DW1 (bits 32-63) -->
+    <field name="Indirect Data Length" start="64" end="80" type="uint"/> <!-- DW2 (bits 64-95): 17-bit length -->
+    <field name="Indirect Data Start Address" start="102" end="127" type="offset"/> <!-- DW3 (bits 96-127): starts at bit 6 of the dword, low 6 bits are not declared -->
+    <field name="SIMD Size" start="158" end="159" type="uint"> <!-- DW4 (bits 128-159): SIMD size in the top two bits, thread counter maxima below -->
+      <value name="SIMD8" value="0"/>
+      <value name="SIMD16" value="1"/>
+      <value name="SIMD32" value="2"/>
+    </field>
+    <field name="Thread Depth Counter Maximum" start="144" end="149" type="uint"/>
+    <field name="Thread Height Counter Maximum" start="136" end="141" type="uint"/>
+    <field name="Thread Width Counter Maximum" start="128" end="133" type="uint"/>
+    <field name="Thread Group ID Starting X" start="160" end="191" type="uint"/> <!-- DW5; DW6 (bits 192-223) has no field declared -->
+    <field name="Thread Group ID X Dimension" start="224" end="255" type="uint"/> <!-- DW7 -->
+    <field name="Thread Group ID Starting Y" start="256" end="287" type="uint"/> <!-- DW8; DW9 (bits 288-319) has no field declared -->
+    <field name="Thread Group ID Y Dimension" start="320" end="351" type="uint"/> <!-- DW10 -->
+    <field name="Thread Group ID Starting/Resume Z" start="352" end="383" type="uint"/> <!-- DW11 -->
+    <field name="Thread Group ID Z Dimension" start="384" end="415" type="uint"/> <!-- DW12 -->
+    <field name="Right Execution Mask" start="416" end="447" type="uint"/> <!-- DW13 -->
+    <field name="Bottom Execution Mask" start="448" end="479" type="uint"/> <!-- DW14: last dword of the packet -->
+  </instruction>
+
+  <instruction name="MEDIA_CURBE_LOAD" bias="2" length="4"> <!-- Fixed 4-dword packet; DWord Length default 2 = length 4 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Pipeline" start="27" end="28" type="uint" default="2"/>
+    <field name="Media Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="SubOpcode" start="16" end="23" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="15" type="uint" default="2"/> <!-- 16 bits wide here; DW1 (bits 32-63) has no field declared -->
+    <field name="CURBE Total Data Length" start="64" end="80" type="uint"/> <!-- DW2 (bits 64-95): 17-bit length -->
+    <field name="CURBE Data Start Address" start="96" end="127" type="uint"/> <!-- DW3 (bits 96-127): full dword, typed uint rather than address/offset -->
+  </instruction>
+
+  <instruction name="MEDIA_INTERFACE_DESCRIPTOR_LOAD" bias="2" length="4"> <!-- Fixed 4-dword packet; DWord Length default 2 = length 4 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Pipeline" start="27" end="28" type="uint" default="2"/>
+    <field name="Media Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="SubOpcode" start="16" end="23" type="uint" default="2"/>
+    <field name="DWord Length" start="0" end="15" type="uint" default="2"/> <!-- 16 bits wide here; DW1 (bits 32-63) has no field declared -->
+    <field name="Interface Descriptor Total Length" start="64" end="80" type="uint"/> <!-- DW2 (bits 64-95): 17-bit length -->
+    <field name="Interface Descriptor Data Start Address" start="96" end="127" type="offset"/> <!-- DW3 (bits 96-127): full dword, typed offset -->
+  </instruction>
+
+  <instruction name="MEDIA_OBJECT" bias="2"> <!-- No length attribute: six declared dwords (DW0-DW5) followed by a count="0" Inline Data group. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Media Command Pipeline" start="27" end="28" type="uint" default="2"/>
+    <field name="Media Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="Media Command Sub-Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="14" type="uint" default="4"/> <!-- 15 bits wide; default 4 + bias 2 = the six dwords before the inline data -->
+    <field name="Interface Descriptor Offset" start="32" end="37" type="uint"/> <!-- DW1 (bits 32-63) -->
+    <field name="Children Present" start="95" end="95" type="bool"/> <!-- DW2 (bits 64-95): dispatch controls in the upper bits, Indirect Data Length in bits 64-80 -->
+    <field name="Slice Destination Select MSBs" start="89" end="90" type="uint"/>
+    <field name="Thread Synchronization" start="88" end="88" type="uint">
+      <value name="No thread synchronization" value="0"/>
+      <value name="Thread dispatch is synchronized by the 'spawn root thread' message" value="1"/>
+    </field>
+    <field name="Force Destination" start="86" end="86" type="uint"/>
+    <field name="Use Scoreboard" start="85" end="85" type="uint">
+      <value name="Not using scoreboard" value="0"/>
+      <value name="Using scoreboard" value="1"/>
+    </field>
+    <field name="Slice Destination Select" start="83" end="84" type="uint">
+      <value name="Slice 0" value="0"/>
+      <value name="Slice 1" value="1"/>
+      <value name="Slice 2" value="2"/>
+    </field>
+    <field name="SubSlice Destination Select" start="81" end="82" type="uint">
+      <value name="Subslice 3" value="3"/>
+      <value name="SubSlice 2" value="2"/>
+      <value name="SubSlice 1" value="1"/>
+      <value name="SubSlice 0" value="0"/>
+    </field>
+    <field name="Indirect Data Length" start="64" end="80" type="uint"/>
+    <field name="Indirect Data Start Address" start="96" end="127" type="address"/> <!-- DW3 (bits 96-127) -->
+    <field name="Scoredboard Y" start="144" end="152" type="uint"/> <!-- DW4 (bits 128-159). NOTE(review): spelled "Scoredboard" here but "Scoreboard Y" in MEDIA_OBJECT_GRPID; a rename may affect consumers of this name, confirm before fixing -->
+    <field name="Scoreboard X" start="128" end="136" type="uint"/>
+    <field name="Scoreboard Color" start="176" end="179" type="uint"/> <!-- DW5 (bits 160-191) -->
+    <field name="Scoreboard Mask" start="160" end="167" type="uint"/>
+    <group count="0" start="192" size="32"> <!-- Starts at DW6; count="0" presumably marks a variable number of entries, confirm against the XML consumer -->
+      <field name="Inline Data" start="0" end="31" type="uint"/>
+    </group>
+  </instruction>
+
+  <instruction name="MEDIA_OBJECT_GRPID" bias="2"> <!-- No length attribute: seven declared dwords (DW0-DW6) followed by a count="0" Inline Data group. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Media Command Pipeline" start="27" end="28" type="uint" default="2"/>
+    <field name="Media Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="Media Command Sub-Opcode" start="16" end="23" type="uint" default="6"/>
+    <field name="DWord Length" start="0" end="15" type="uint" default="5"/> <!-- 16 bits wide; default 5 + bias 2 = the seven dwords before the inline data -->
+    <field name="Interface Descriptor Offset" start="32" end="37" type="uint"/> <!-- DW1 (bits 32-63) -->
+    <field name="End of Thread Group" start="87" end="87" type="uint"/> <!-- DW2 (bits 64-95) -->
+    <field name="Use Scoreboard" start="85" end="85" type="uint">
+      <value name="Not using scoreboard" value="0"/>
+      <value name="Using scoreboard" value="1"/>
+    </field>
+    <field name="Indirect Data Length" start="64" end="80" type="uint"/>
+    <field name="Indirect Data Start Address" start="96" end="127" type="address"/> <!-- DW3 (bits 96-127) -->
+    <field name="Scoreboard Y" start="144" end="152" type="uint"/> <!-- DW4 (bits 128-159): Y at bits 16-24 of the dword, X at bits 0-8 -->
+    <field name="Scoreboard X" start="128" end="136" type="uint"/>
+    <field name="Scoreboard Color" start="176" end="179" type="uint"/> <!-- DW5 (bits 160-191) -->
+    <field name="Scoreboard Mask" start="160" end="167" type="uint"/>
+    <field name="GroupID" start="192" end="223" type="uint"/> <!-- DW6 (bits 192-223): full dword -->
+    <group count="0" start="224" size="32"> <!-- Starts at DW7; count="0" presumably marks a variable number of entries, confirm against the XML consumer -->
+      <field name="Inline Data" start="0" end="31" type="uint"/>
+    </group>
+  </instruction>
+
+  <instruction name="MEDIA_OBJECT_PRT" bias="2" length="16"> <!-- Fixed 16-dword packet; DWord Length default 14 = length 16 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Pipeline" start="27" end="28" type="uint" default="2"/>
+    <field name="Media Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="SubOpcode" start="16" end="23" type="uint" default="2"/>
+    <field name="DWord Length" start="0" end="14" type="uint" default="14"/>
+    <field name="Interface Descriptor Offset" start="32" end="37" type="uint"/> <!-- DW1 (bits 32-63) -->
+    <field name="Children Present" start="95" end="95" type="bool"/> <!-- DW2 (bits 64-95); DW3 (bits 96-127) has no field declared -->
+    <field name="PRT_Fence Needed" start="87" end="87" type="bool"/>
+    <field name="PRT_FenceType" start="86" end="86" type="uint">
+      <value name="Root thread queue" value="0"/>
+      <value name="VFE state flush" value="1"/>
+    </field>
+    <group count="12" start="128" size="32"> <!-- Twelve 32-bit entries occupying DW4 through DW15, which fills the declared length of 16 -->
+      <field name="Inline Data" start="0" end="31" type="uint"/>
+    </group>
+  </instruction>
+
+  <instruction name="MEDIA_OBJECT_WALKER" bias="2"> <!-- No length attribute: seventeen declared dwords (DW0-DW16) followed by a count="0" Inline Data group. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Pipeline" start="27" end="28" type="uint" default="2"/>
+    <field name="Media Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="SubOpcode" start="16" end="23" type="uint" default="3"/>
+    <field name="DWord Length" start="0" end="14" type="uint" default="15"/> <!-- 15 bits wide; default 15 + bias 2 = the seventeen dwords before the inline data -->
+    <field name="Interface Descriptor Offset" start="32" end="37" type="uint"/> <!-- DW1 (bits 32-63) -->
+    <field name="Thread Synchronization" start="88" end="88" type="uint"> <!-- DW2 (bits 64-95) -->
+      <value name="No thread synchronization" value="0"/>
+      <value name="Thread dispatch is synchronized by the 'spawn root thread' message" value="1"/>
+    </field>
+    <field name="Masked Dispatch" start="86" end="87" type="uint"/>
+    <field name="Use Scoreboard" start="85" end="85" type="uint">
+      <value name="Not using scoreboard" value="0"/>
+      <value name="Using scoreboard" value="1"/>
+    </field>
+    <field name="Indirect Data Length" start="64" end="80" type="uint"/>
+    <field name="Indirect Data Start Address" start="96" end="127" type="uint"/> <!-- DW3 (bits 96-127), typed uint here; DW4 (bits 128-159) has no field declared -->
+    <field name="Group ID Loop Select" start="168" end="191" type="uint"> <!-- DW5 (bits 160-191): 24-bit field, only values 0-5 are named -->
+      <value name="No_Groups" value="0"/>
+      <value name="Color_Groups" value="1"/>
+      <value name="InnerLocal_Groups" value="2"/>
+      <value name="MidLocal_Groups" value="3"/>
+      <value name="OuterLocal_Groups" value="4"/>
+      <value name="InnerGlobal_Groups" value="5"/>
+    </field>
+    <field name="Scoreboard Mask" start="160" end="167" type="uint"/>
+    <field name="Color Count Minus One" start="216" end="219" type="uint"/> <!-- DW6 (bits 192-223) -->
+    <field name="Middle Loop Extra Steps" start="208" end="212" type="uint"/>
+    <field name="Local Mid-Loop Unit Y" start="204" end="205" type="int"/> <!-- 2-bit signed fields -->
+    <field name="Mid-Loop Unit X" start="200" end="201" type="int"/>
+    <field name="Global Loop Exec Count" start="240" end="251" type="uint"/> <!-- DW7 (bits 224-255) -->
+    <field name="Local Loop Exec Count" start="224" end="235" type="uint"/>
+    <field name="Block Resolution Y" start="272" end="282" type="uint"/> <!-- DW8 (bits 256-287): Y in the high half, X in the low half; the same split repeats below -->
+    <field name="Block Resolution X" start="256" end="266" type="uint"/>
+    <field name="Local Start Y" start="304" end="314" type="uint"/> <!-- DW9 (bits 288-319); DW10 (bits 320-351) has no field declared -->
+    <field name="Local Start X" start="288" end="298" type="uint"/>
+    <field name="Local Outer Loop Stride Y" start="368" end="379" type="int"/> <!-- DW11 (bits 352-383): 12-bit signed -->
+    <field name="Local Outer Loop Stride X" start="352" end="363" type="int"/>
+    <field name="Local Inner Loop Unit Y" start="400" end="411" type="int"/> <!-- DW12 (bits 384-415): 12-bit signed -->
+    <field name="Local Inner Loop Unit X" start="384" end="395" type="int"/>
+    <field name="Global Resolution Y" start="432" end="442" type="uint"/> <!-- DW13 (bits 416-447) -->
+    <field name="Global Resolution X" start="416" end="426" type="uint"/>
+    <field name="Global Start Y" start="464" end="475" type="int"/> <!-- DW14 (bits 448-479): 12-bit signed -->
+    <field name="Global Start X" start="448" end="459" type="int"/>
+    <field name="Global Outer Loop Stride Y" start="496" end="507" type="int"/> <!-- DW15 (bits 480-511): 12-bit signed -->
+    <field name="Global Outer Loop Stride X" start="480" end="491" type="int"/>
+    <field name="Global Inner Loop Unit Y" start="528" end="539" type="int"/> <!-- DW16 (bits 512-543): 12-bit signed -->
+    <field name="Global Inner Loop Unit X" start="512" end="523" type="int"/>
+    <group count="0" start="544" size="32"> <!-- Starts at DW17; count="0" presumably marks a variable number of entries, confirm against the XML consumer -->
+      <field name="Inline Data" start="0" end="31" type="uint"/>
+    </group>
+  </instruction>
+
+  <instruction name="MEDIA_STATE_FLUSH" bias="2" length="2"> <!-- Fixed 2-dword packet; DWord Length default 0 = length 2 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Pipeline" start="27" end="28" type="uint" default="2"/>
+    <field name="Media Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="SubOpcode" start="16" end="23" type="uint" default="4"/>
+    <field name="DWord Length" start="0" end="15" type="uint" default="0"/>
+    <field name="Flush to GO" start="39" end="39" type="bool"/> <!-- DW1 (bits 32-63): only bits 32-39 are declared -->
+    <field name="Watermark Required" start="38" end="38" type="uint"/>
+    <field name="Interface Descriptor Offset" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="MEDIA_VFE_STATE" bias="2" length="9"> <!-- Fixed 9-dword packet; DWord Length default 7 = length 9 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="Pipeline" start="27" end="28" type="uint" default="2"/>
+    <field name="Media Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="SubOpcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="15" type="uint" default="7"/>
+    <field name="Scratch Space Base Pointer" start="42" end="79" type="address"/> <!-- Spans DW1 bit 10 through DW2 bit 15 (bits 42-79) -->
+    <field name="Stack Size" start="36" end="39" type="uint"/> <!-- DW1 low byte: two 4-bit fields -->
+    <field name="Per Thread Scratch Space" start="32" end="35" type="uint"/>
+    <field name="Maximum Number of Threads" start="112" end="127" type="uint"/> <!-- DW3 (bits 96-127) -->
+    <field name="Number of URB Entries" start="104" end="111" type="uint"/>
+    <field name="Reset Gateway Timer" start="103" end="103" type="uint">
+      <value name="Maintaining the existing timestamp state" value="0"/>
+      <value name="Resetting relative timer and latching the global timestamp" value="1"/>
+    </field>
+    <field name="Thread Dispatch Selection Policy" start="100" end="101" type="uint">
+      <value name="Legacy" value="0"/>
+      <value name="Prefer 1SS" value="1"/>
+      <value name="Prefer 2SS" value="2"/>
+      <value name="Load Balance" value="3"/>
+    </field>
+    <field name="SLM Bank Selection Policy" start="99" end="99" type="uint">
+      <value name="Legacy" value="0"/>
+      <value name="SLM Load Balance" value="1"/>
+    </field>
+    <field name="Slice Disable" start="128" end="129" type="uint"> <!-- DW4 (bits 128-159): 2-bit field, value 2 is not named -->
+      <value name="All Subslices Enabled" value="0"/>
+      <value name="Only Slice 0 Enabled" value="1"/>
+      <value name="Only Slice 0 Subslice 0 Enabled" value="3"/>
+    </field>
+    <field name="URB Entry Allocation Size" start="176" end="191" type="uint"/> <!-- DW5 (bits 160-191): two 16-bit sizes -->
+    <field name="CURBE Allocation Size" start="160" end="175" type="uint"/>
+    <field name="Scoreboard Enable" start="223" end="223" type="bool"/> <!-- DW6 (bits 192-223) -->
+    <field name="Scoreboard Type" start="222" end="222" type="uint">
+      <value name="Stalling Scoreboard" value="0"/>
+      <value name="Non-Stalling Scoreboard" value="1"/>
+    </field>
+    <field name="Number of Media Objects per Pre-Emption Checkpoint" start="200" end="207" type="uint"/>
+    <field name="Scoreboard Mask" start="192" end="199" type="uint"/>
+    <field name="Scoreboard 3 Delta Y" start="252" end="255" type="int"/> <!-- DW7 (bits 224-255): eight 4-bit signed deltas for scoreboards 0-3 -->
+    <field name="Scoreboard 3 Delta X" start="248" end="251" type="int"/>
+    <field name="Scoreboard 2 Delta Y" start="244" end="247" type="int"/>
+    <field name="Scoreboard 2 Delta X" start="240" end="243" type="int"/>
+    <field name="Scoreboard 1 Delta Y" start="236" end="239" type="int"/>
+    <field name="Scoreboard 1 Delta X" start="232" end="235" type="int"/>
+    <field name="Scoreboard 0 Delta Y" start="228" end="231" type="int"/>
+    <field name="Scoreboard 0 Delta X" start="224" end="227" type="int"/>
+    <field name="Scoreboard 7 Delta Y" start="284" end="287" type="int"/> <!-- DW8 (bits 256-287): eight 4-bit signed deltas for scoreboards 4-7 -->
+    <field name="Scoreboard 7 Delta X" start="280" end="283" type="int"/>
+    <field name="Scoreboard 6 Delta Y" start="276" end="279" type="int"/>
+    <field name="Scoreboard 6 Delta X" start="272" end="275" type="int"/>
+    <field name="Scoreboard 5 Delta Y" start="268" end="271" type="int"/>
+    <field name="Scoreboard 5 Delta X" start="264" end="267" type="int"/>
+    <field name="Scoreboard 4 Delta Y" start="260" end="263" type="int"/>
+    <field name="Scoreboard 4 Delta X" start="256" end="259" type="int"/>
+  </instruction>
+
+  <instruction name="MI_ARB_CHECK" bias="1" length="1"> <!-- Single-dword command: only the two header fields are declared, no DWord Length field. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="5"/>
+  </instruction>
+
+  <instruction name="MI_ATOMIC" bias="2" length="3"> <!-- Declared length 3; DWord Length default 1 = length 3 - bias 2. Operand fields below extend through DW10. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header plus control bits -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="47"/>
+    <field name="Memory Type" start="22" end="22" type="uint">
+      <value name="Per Process Graphics Address" value="0"/>
+      <value name="Global Graphics Address" value="1"/>
+    </field>
+    <field name="Post-Sync Operation" start="21" end="21" type="bool"/>
+    <field name="Data Size" start="19" end="20" type="uint">
+      <value name="DWORD" value="0"/>
+      <value name="QWORD" value="1"/>
+      <value name="OCTWORD" value="2"/>
+      <value name="RESERVED" value="3"/>
+    </field>
+    <field name="Inline Data" start="18" end="18" type="uint"/>
+    <field name="CS STALL" start="17" end="17" type="uint"/>
+    <field name="Return Data Control" start="16" end="16" type="uint"/>
+    <field name="ATOMIC OPCODE" start="8" end="15" type="uint"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Memory Address" start="34" end="79" type="address"/> <!-- Spans DW1 bit 2 through DW2 bit 15 (bits 34-79) -->
+    <field name="Operand1 Data Dword 0" start="96" end="127" type="uint"/> <!-- DW3. NOTE(review): DW3-DW10 lie past the declared length of 3; presumably only emitted when Inline Data is set, confirm -->
+    <field name="Operand2 Data Dword 0" start="128" end="159" type="uint"/> <!-- DW4: operand 1 and operand 2 dwords alternate from here -->
+    <field name="Operand1 Data Dword 1" start="160" end="191" type="uint"/>
+    <field name="Operand2 Data Dword 1" start="192" end="223" type="uint"/>
+    <field name="Operand1 Data Dword 2" start="224" end="255" type="uint"/>
+    <field name="Operand2 Data Dword 2" start="256" end="287" type="uint"/>
+    <field name="Operand1 Data Dword 3" start="288" end="319" type="uint"/>
+    <field name="Operand2 Data Dword 3" start="320" end="351" type="uint"/> <!-- DW10 -->
+  </instruction>
+
+  <instruction name="MI_BATCH_BUFFER_END" bias="1" length="1"> <!-- Single-dword command: header fields plus one flag at bit 0, no DWord Length field. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="10"/>
+    <field name="End Context" start="0" end="0" type="bool"/>
+  </instruction>
+
+  <instruction name="MI_BATCH_BUFFER_START" bias="2" length="3"> <!-- Fixed 3-dword packet; DWord Length default 1 = length 3 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header plus control bits -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="49"/>
+    <field name="Second Level Batch Buffer" start="22" end="22" type="uint">
+      <value name="First level batch" value="0"/>
+      <value name="Second level batch" value="1"/>
+    </field>
+    <field name="Add Offset Enable" start="16" end="16" type="bool"/>
+    <field name="Predication Enable" start="15" end="15" type="bool"/>
+    <field name="Resource Streamer Enable" start="10" end="10" type="bool"/>
+    <field name="Address Space Indicator" start="8" end="8" type="uint" prefix="ASI"> <!-- value names carry the prefix attribute "ASI" -->
+      <value name="GGTT" value="0"/>
+      <value name="PPGTT" value="1"/>
+    </field>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Batch Buffer Start Address" start="34" end="95" type="address"/> <!-- Spans DW1 bit 2 through the end of DW2 (bits 34-95) -->
+  </instruction>
+
+  <instruction name="MI_CLFLUSH" bias="2"> <!-- No length attribute: three declared dwords (DW0-DW2) followed by a count="0" group. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31) -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="39"/>
+    <field name="Use Global GTT" start="22" end="22" type="bool"/>
+    <field name="DWord Length" start="0" end="9" type="uint" default="1"/> <!-- 10 bits wide; default 1 + bias 2 = the three dwords before the group -->
+    <field name="Page Base Address" start="44" end="79" type="address"/> <!-- Spans DW1 bit 12 through DW2 bit 15 (bits 44-79) -->
+    <field name="Starting Cacheline Offset" start="38" end="43" type="uint"/> <!-- DW1 bits 6-11 -->
+    <group count="0" start="96" size="32"> <!-- Starts at DW3; count="0" presumably marks a variable number of entries, confirm against the XML consumer -->
+      <field name="DW Representing a Half Cache Line" start="0" end="31" type="uint"/>
+    </group>
+  </instruction>
+
+  <instruction name="MI_CONDITIONAL_BATCH_BUFFER_END" bias="2" length="4"> <!-- Fixed 4-dword packet; DWord Length default 2 = length 4 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header plus control bits -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="54"/>
+    <field name="Use Global GTT" start="22" end="22" type="bool"/>
+    <field name="Compare Semaphore" start="21" end="21" type="uint" default="0"/>
+    <field name="Compare Mask Mode" start="19" end="19" type="uint">
+      <value name="Compare Mask Mode Disabled" value="0"/>
+      <value name="Compare Mask Mode Enabled" value="1"/>
+    </field>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Compare Data Dword" start="32" end="63" type="uint"/> <!-- DW1 (bits 32-63): full dword -->
+    <field name="Compare Address" start="67" end="127" type="address"/> <!-- Spans DW2 bit 3 through the end of DW3 (bits 67-127) -->
+  </instruction>
+
+  <instruction name="MI_COPY_MEM_MEM" bias="2" length="5"> <!-- Fixed 5-dword packet; DWord Length default 3 = length 5 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header plus one GTT select bit per address -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="46"/>
+    <field name="Use Global GTT Source" start="22" end="22" type="bool"/>
+    <field name="Use Global GTT Destination" start="21" end="21" type="bool"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="Destination Memory Address" start="34" end="95" type="address"/> <!-- Spans DW1 bit 2 through the end of DW2 (bits 34-95) -->
+    <field name="Source Memory Address" start="98" end="159" type="address"/> <!-- Spans DW3 bit 2 through the end of DW4 (bits 98-159) -->
+  </instruction>
+
+  <instruction name="MI_DISPLAY_FLIP" bias="2" length="3"> <!-- Declared length 3; DWord Length default 1 = length 3 - bias 2. One field below sits in DW3. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header, async flag and plane select -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="20"/>
+    <field name="Async Flip Indicator" start="22" end="22" type="bool"/>
+    <field name="Display Plane Select" start="8" end="12" type="uint"> <!-- 5-bit field; value 3 is not named in this list -->
+      <value name="Display Plane 1" value="0"/>
+      <value name="Display Plane 2" value="1"/>
+      <value name="Display Plane 3" value="2"/>
+      <value name="Display Plane 4" value="4"/>
+      <value name="Display Plane 5" value="5"/>
+      <value name="Display Plane 6" value="6"/>
+      <value name="Display Plane 7" value="7"/>
+      <value name="Display Plane 8" value="8"/>
+      <value name="Display Plane 9" value="9"/>
+      <value name="Display Plane 10" value="10"/>
+      <value name="Display Plane 11" value="11"/>
+      <value name="Display Plane 12" value="12"/>
+    </field>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Stereoscopic 3D Mode" start="63" end="63" type="bool"/> <!-- DW1 (bits 32-63) -->
+    <field name="Display Buffer Pitch" start="38" end="47" type="uint"/>
+    <field name="Tile Parameter" start="32" end="34" type="uint"/> <!-- 3 bits wide, so typed uint: a bool can only carry 0 or 1 -->
+    <field name="Display Buffer Base Address" start="76" end="95" type="address"/> <!-- DW2 (bits 64-95): address starts at bit 12 of the dword -->
+    <field name="VRR Master Flip" start="75" end="75" type="uint"/>
+    <field name="Flip Type" start="64" end="65" type="uint">
+      <value name="Sync Flip" value="0"/>
+      <value name="Async Flip" value="1"/>
+      <value name="Stereo 3D Flip" value="2"/>
+    </field>
+    <field name="Left Eye Display Buffer Base Address" start="108" end="127" type="address"/> <!-- DW3 (bits 96-127). NOTE(review): past the declared length of 3; presumably only present for stereo flips, confirm -->
+  </instruction>
+
+  <instruction name="MI_FORCE_WAKEUP" bias="2" length="2"> <!-- Fixed 2-dword packet; DWord Length default 0 = length 2 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="29"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Mask Bits" start="48" end="63" type="uint"/> <!-- DW1 (bits 32-63): 16-bit mask in the high half, two single-bit fields at bits 0-1 -->
+    <field name="Force Render Awake" start="33" end="33" type="uint"/>
+    <field name="Force Media Awake" start="32" end="32" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_LOAD_REGISTER_IMM" bias="2" length="3"> <!-- Fixed 3-dword packet; DWord Length default 1 = length 3 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31) -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="34"/>
+    <field name="Byte Write Disables" start="8" end="11" type="uint"/> <!-- 4-bit field -->
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Register Offset" start="34" end="54" type="offset"/> <!-- DW1 (bits 32-63): occupies bits 2-22 of the dword -->
+    <field name="Data DWord" start="64" end="95" type="uint"/> <!-- DW2 (bits 64-95): full dword -->
+  </instruction>
+
+  <instruction name="MI_LOAD_REGISTER_MEM" bias="2" length="4"> <!-- Fixed 4-dword packet; DWord Length default 2 = length 4 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31) -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="41"/>
+    <field name="Use Global GTT" start="22" end="22" type="bool"/>
+    <field name="Async Mode Enable" start="21" end="21" type="bool"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Register Address" start="34" end="54" type="offset"/> <!-- DW1 (bits 32-63): occupies bits 2-22 of the dword -->
+    <field name="Memory Address" start="66" end="127" type="address"/> <!-- Spans DW2 bit 2 through the end of DW3 (bits 66-127) -->
+  </instruction>
+
+  <instruction name="MI_LOAD_REGISTER_REG" bias="2" length="3"> <!-- Fixed 3-dword packet; DWord Length default 1 = length 3 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="42"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Source Register Address" start="34" end="54" type="offset"/> <!-- DW1 (bits 32-63): occupies bits 2-22 of the dword -->
+    <field name="Destination Register Address" start="66" end="86" type="offset"/> <!-- DW2 (bits 64-95): same bit placement as the source -->
+  </instruction>
+
+  <instruction name="MI_LOAD_SCAN_LINES_EXCL" bias="2" length="2"> <!-- Fixed 2-dword packet; DWord Length default 0 = length 2 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31) -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="19"/>
+    <field name="Display (Plane) Select" start="19" end="21" type="uint"> <!-- 3-bit field; only values 0, 1 and 4 are named -->
+      <value name="Display Plane A" value="0"/>
+      <value name="Display Plane B" value="1"/>
+      <value name="Display Plane C" value="4"/>
+    </field>
+    <field name="DWord Length" start="0" end="5" type="uint" default="0"/> <!-- 6 bits wide here -->
+    <field name="Start Scan Line Number" start="48" end="60" type="uint"/> <!-- DW1 (bits 32-63): 13-bit start in the high half, 13-bit end in the low half -->
+    <field name="End Scan Line Number" start="32" end="44" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_LOAD_SCAN_LINES_INCL" bias="2" length="2"> <!-- Fixed 2-dword packet; DWord Length default 0 = length 2 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31) -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="18"/>
+    <field name="Display (Plane) Select" start="19" end="21" type="uint"> <!-- 3-bit field; only values 0, 1 and 4 are named -->
+      <value name="Display Plane 1 A" value="0"/>
+      <value name="Display Plane 1 B" value="1"/>
+      <value name="Display Plane 1 C" value="4"/>
+    </field>
+    <field name="Scan Line Event Done Forward" start="17" end="18" type="uint"/> <!-- 2 bits wide, so typed uint: a bool can only carry 0 or 1 -->
+    <field name="DWord Length" start="0" end="5" type="uint" default="0"/> <!-- 6 bits wide here -->
+    <field name="Start Scan Line Number" start="48" end="60" type="uint"/> <!-- DW1 (bits 32-63): 13-bit start in the high half, 13-bit end in the low half -->
+    <field name="End Scan Line Number" start="32" end="44" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_MATH" bias="2"> <!-- No length attribute: one header dword followed by a count="0" group of 32-bit entries. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="26"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <group count="0" start="32" size="32"> <!-- Starts at DW1; count="0" presumably marks a variable number of entries, confirm against the XML consumer -->
+      <field name="Instruction" start="0" end="31" type="MI_MATH_ALU_INSTRUCTION"/> <!-- each entry uses the MI_MATH_ALU_INSTRUCTION type, which is defined outside this block -->
+    </group>
+  </instruction>
+
+  <instruction name="MI_NOOP" bias="1" length="1"> <!-- Single-dword command: header fields, a write-enable bit and a 22-bit number; no DWord Length field. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="0"/>
+    <field name="Identification Number Register Write Enable" start="22" end="22" type="bool"/>
+    <field name="Identification Number" start="0" end="21" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_PREDICATE" bias="1" length="1"> <!-- Single-dword command: three 2-bit selectors in the low byte, no DWord Length field. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="12"/>
+    <field name="Load Operation" start="6" end="7" type="uint" prefix="LOAD"> <!-- value 1 is not named -->
+      <value name="KEEP" value="0"/>
+      <value name="LOAD" value="2"/>
+      <value name="LOADINV" value="3"/>
+    </field>
+    <field name="Combine Operation" start="3" end="4" type="uint" prefix="COMBINE"> <!-- all four values are named -->
+      <value name="SET" value="0"/>
+      <value name="AND" value="1"/>
+      <value name="OR" value="2"/>
+      <value name="XOR" value="3"/>
+    </field>
+    <field name="Compare Operation" start="0" end="1" type="uint" prefix="COMPARE"> <!-- only values 2 and 3 are named -->
+      <value name="SRCS_EQUAL" value="2"/>
+      <value name="DELTAS_EQUAL" value="3"/>
+    </field>
+  </instruction>
+
+  <instruction name="MI_REPORT_HEAD" bias="1" length="1"> <!-- Single-dword command: only the two header fields are declared, no DWord Length field. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="7"/>
+  </instruction>
+
+  <instruction name="MI_REPORT_PERF_COUNT" bias="2" length="4"> <!-- Fixed 4-dword packet; DWord Length default 2 = length 4 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="40"/>
+    <field name="DWord Length" start="0" end="5" type="uint" default="2"/> <!-- 6 bits wide here -->
+    <field name="Memory Address" start="38" end="95" type="address"/> <!-- Spans DW1 bit 6 through the end of DW2 (bits 38-95) -->
+    <field name="Core Mode Enable" start="36" end="36" type="uint"/> <!-- DW1 low bits share the dword with the address -->
+    <field name="Use Global GTT" start="32" end="32" type="bool"/>
+    <field name="Report ID" start="96" end="127" type="uint"/> <!-- DW3 (bits 96-127): full dword -->
+  </instruction>
+
+  <instruction name="MI_RS_CONTEXT" bias="1" length="1"> <!-- Single-dword command: header fields plus a one-bit selector at bit 0, no DWord Length field. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="15"/>
+    <field name="Resource Streamer Save" start="0" end="0" type="uint" prefix="RS"> <!-- value names carry the prefix attribute "RS" -->
+      <value name="Restore" value="0"/>
+      <value name="Save" value="1"/>
+    </field>
+  </instruction>
+
+  <instruction name="MI_RS_CONTROL" bias="1" length="1"> <!-- Single-dword command: header fields plus a one-bit selector at bit 0, no DWord Length field. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="6"/>
+    <field name="Resource Streamer Control" start="0" end="0" type="uint" prefix="RS"> <!-- value names carry the prefix attribute "RS" -->
+      <value name="Stop" value="0"/>
+      <value name="Start" value="1"/>
+    </field>
+  </instruction>
+
+  <instruction name="MI_RS_STORE_DATA_IMM" bias="2" length="4"> <!-- Fixed 4-dword packet; DWord Length default 2 = length 4 - bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- DW0 (bits 0-31): header fields with fixed defaults -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="43"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Destination Address" start="34" end="95" type="address"/> <!-- Spans DW1 bit 2 through the end of DW2 (bits 34-95) -->
+    <field name="Core Mode Enable" start="32" end="32" type="uint"/> <!-- DW1 bit 0, below the address -->
+    <field name="Data DWord 0" start="96" end="127" type="uint"/> <!-- DW3 (bits 96-127): full dword -->
+  </instruction>
+
+  <instruction name="MI_SEMAPHORE_SIGNAL" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="27"/>
+    <field name="Post-Sync Operation" start="21" end="21" type="bool"/>
+    <field name="Target Engine Select" start="15" end="17" type="uint">
+      <value name="RCS" value="0"/>
+      <value name="VCS0" value="1"/>
+      <value name="BCS" value="2"/>
+      <value name="VECS" value="3"/>
+      <value name="VCS1" value="4"/>
+    </field>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Target Context ID" start="32" end="63" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_SEMAPHORE_WAIT" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="28"/>
+    <field name="Memory Type" start="22" end="22" type="uint">
+      <value name="Per Process Graphics Address" value="0"/>
+      <value name="Global Graphics Address" value="1"/>
+    </field>
+    <field name="Register Poll Mode" start="16" end="16" type="bool"/>
+    <field name="Wait Mode" start="15" end="15" type="uint">
+      <value name="Polling Mode" value="1"/>
+      <value name="Signal Mode" value="0"/>
+    </field>
+    <field name="Compare Operation" start="12" end="14" type="uint" prefix="COMPARE">
+      <value name="SAD_GREATER_THAN_SDD" value="0"/>
+      <value name="SAD_GREATER_THAN_OR_EQUAL_SDD" value="1"/>
+      <value name="SAD_LESS_THAN_SDD" value="2"/>
+      <value name="SAD_LESS_THAN_OR_EQUAL_SDD" value="3"/>
+      <value name="SAD_EQUAL_SDD" value="4"/>
+      <value name="SAD_NOT_EQUAL_SDD" value="5"/>
+    </field>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Semaphore Data Dword" start="32" end="63" type="uint"/>
+    <field name="Semaphore Address" start="66" end="127" type="address"/>
+  </instruction>
+
+  <instruction name="MI_SET_CONTEXT" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="24"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Logical Context Address" start="44" end="63" type="address"/>
+    <field name="Reserved, Must be 1" start="40" end="40" type="uint"/>
+    <field name="Core Mode Enable" start="36" end="36" type="bool"/>
+    <field name="Resource Streamer State Save Enable" start="35" end="35" type="bool"/>
+    <field name="Resource Streamer State Restore Enable" start="34" end="34" type="bool"/>
+    <field name="Force Restore" start="33" end="33" type="uint"/>
+    <field name="Restore Inhibit" start="32" end="32" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_SET_PREDICATE" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="1"/>
+    <field name="PREDICATE ENABLE" start="0" end="3" type="uint">
+      <value name="NOOP Never" value="0"/>
+      <value name="NOOP on Result2 clear" value="1"/>
+      <value name="NOOP on Result2 set" value="2"/>
+      <value name="NOOP on Result clear" value="3"/>
+      <value name="NOOP on Result set" value="4"/>
+      <value name="NOOP Always" value="15"/>
+    </field>
+  </instruction>
+
+  <instruction name="MI_STORE_DATA_IMM" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="32"/>
+    <field name="Use Global GTT" start="22" end="22" type="bool"/>
+    <field name="Store Qword" start="21" end="21" type="uint"/>
+    <field name="DWord Length" start="0" end="9" type="uint" default="2"/>
+    <field name="Address" start="34" end="79" type="address"/>
+    <field name="Core Mode Enable" start="32" end="32" type="uint"/>
+    <field name="Immediate Data" start="96" end="159" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_STORE_DATA_INDEX" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="33"/>
+    <field name="Use Per-Process Hardware Status Page" start="21" end="21" type="uint"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Offset" start="34" end="43" type="uint"/>
+    <field name="Data DWord 0" start="64" end="95" type="uint"/>
+    <field name="Data DWord 1" start="96" end="127" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_STORE_REGISTER_MEM" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="36"/>
+    <field name="Use Global GTT" start="22" end="22" type="bool"/>
+    <field name="Predicate Enable" start="21" end="21" type="bool"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Register Address" start="34" end="54" type="offset"/>
+    <field name="Memory Address" start="66" end="127" type="address"/>
+  </instruction>
+
+  <instruction name="MI_SUSPEND_FLUSH" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="11"/>
+    <field name="Suspend Flush" start="0" end="0" type="bool"/>
+  </instruction>
+
+  <instruction name="MI_TOPOLOGY_FILTER" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="13"/>
+    <field name="Topology Filter Value" start="0" end="5" type="3D_Prim_Topo_Type"/>
+  </instruction>
+
+  <instruction name="MI_UPDATE_GTT" bias="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="35"/>
+    <field name="DWord Length" start="0" end="9" type="uint" default="0"/>
+    <field name="Entry Address" start="44" end="63" type="address"/>
+    <group count="0" start="64" size="64">
+      <field name="Entry Data" start="0" end="63" type="uint"/>
+    </group>
+  </instruction>
+
+  <instruction name="MI_USER_INTERRUPT" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="2"/>
+  </instruction>
+
+  <instruction name="MI_WAIT_FOR_EVENT" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="3"/>
+    <field name="Display Plane 1 C Vertical Blank Wait Enable" start="21" end="21" type="bool"/>
+    <field name="Display Plane 6 Flip Pending Wait Enable" start="20" end="20" type="bool"/>
+    <field name="Display Plane 12 Flip Pending Wait Enable" start="19" end="19" type="bool"/>
+    <field name="Display Plane 11 Flip Pending Wait Enable" start="18" end="18" type="bool"/>
+    <field name="Display Plane 10 Flip Pending Wait Enable" start="17" end="17" type="bool"/>
+    <field name="Display Plane 9 Flip Pending Wait Enable" start="16" end="16" type="bool"/>
+    <field name="Display Plane 3 Flip Pending Wait Enable" start="15" end="15" type="bool"/>
+    <field name="Display Plane 1 C Scan Line Wait Enable" start="14" end="14" type="bool"/>
+    <field name="Display Plane 1 B Vertical Blank Wait Enable" start="11" end="11" type="bool"/>
+    <field name="Display Plane 5 Flip Pending Wait Enable" start="10" end="10" type="bool"/>
+    <field name="Display Plane 2 Flip Pending Wait Enable" start="9" end="9" type="bool"/>
+    <field name="Display Plane 1 B Scan Line Wait Enable" start="8" end="8" type="bool"/>
+    <field name="Display Plane 8 Flip Pending Wait Enable" start="7" end="7" type="bool"/>
+    <field name="Display Plane 7 Flip Pending Wait Enable" start="6" end="6" type="bool"/>
+    <field name="Display Plane 1 A Vertical Blank Wait Enable" start="3" end="3" type="bool"/>
+    <field name="Display Plane 4 Flip Pending Wait Enable" start="2" end="2" type="bool"/>
+    <field name="Display Plane 1 Flip Pending Wait Enable" start="1" end="1" type="bool"/>
+    <field name="Display Plane 1 A Scan Line Wait Enable" start="0" end="0" type="bool"/>
+  </instruction>
+
+  <instruction name="PIPELINE_SELECT" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="1"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="4"/>
+    <field name="Mask Bits" start="8" end="15" type="uint"/>
+    <field name="Force Media Awake" start="5" end="5" type="bool"/>
+    <field name="Media Sampler DOP Clock Gate Enable" start="4" end="4" type="bool"/>
+    <field name="Pipeline Selection" start="0" end="1" type="uint">
+      <value name="3D" value="0"/>
+      <value name="Media" value="1"/>
+      <value name="GPGPU" value="2"/>
+    </field>
+  </instruction>
+
+  <instruction name="PIPE_CONTROL" bias="2" length="6">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="2"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
+    <field name="Flush LLC" start="58" end="58" type="bool"/>
+    <field name="Destination Address Type" start="56" end="56" type="uint" prefix="DAT">
+      <value name="PPGTT" value="0"/>
+      <value name="GGTT" value="1"/>
+    </field>
+    <field name="LRI Post Sync Operation" start="55" end="55" type="uint">
+      <value name="No LRI Operation" value="0"/>
+      <value name="MMIO Write Immediate Data" value="1"/>
+    </field>
+    <field name="Store Data Index" start="53" end="53" type="uint"/>
+    <field name="Command Streamer Stall Enable" start="52" end="52" type="bool"/>
+    <field name="Global Snapshot Count Reset" start="51" end="51" type="bool"/>
+    <field name="TLB Invalidate" start="50" end="50" type="bool"/>
+    <field name="PSD Sync Enable" start="49" end="49" type="bool"/>
+    <field name="Generic Media State Clear" start="48" end="48" type="bool"/>
+    <field name="Post Sync Operation" start="46" end="47" type="uint">
+      <value name="No Write" value="0"/>
+      <value name="Write Immediate Data" value="1"/>
+      <value name="Write PS Depth Count" value="2"/>
+      <value name="Write Timestamp" value="3"/>
+    </field>
+    <field name="Depth Stall Enable" start="45" end="45" type="bool"/>
+    <field name="Render Target Cache Flush Enable" start="44" end="44" type="bool"/>
+    <field name="Instruction Cache Invalidate Enable" start="43" end="43" type="bool"/>
+    <field name="Texture Cache Invalidation Enable" start="42" end="42" type="bool"/>
+    <field name="Indirect State Pointers Disable" start="41" end="41" type="bool"/>
+    <field name="Notify Enable" start="40" end="40" type="bool"/>
+    <field name="Pipe Control Flush Enable" start="39" end="39" type="bool"/>
+    <field name="DC Flush Enable" start="37" end="37" type="bool"/>
+    <field name="VF Cache Invalidation Enable" start="36" end="36" type="bool"/>
+    <field name="Constant Cache Invalidation Enable" start="35" end="35" type="bool"/>
+    <field name="State Cache Invalidation Enable" start="34" end="34" type="bool"/>
+    <field name="Stall At Pixel Scoreboard" start="33" end="33" type="bool"/>
+    <field name="Depth Cache Flush Enable" start="32" end="32" type="bool"/>
+    <field name="Address" start="66" end="111" type="address"/>
+    <field name="Immediate Data" start="128" end="191" type="uint"/>
+  </instruction>
+
+  <instruction name="STATE_BASE_ADDRESS" bias="2" length="22">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="20"/>
+    <field name="General State Base Address" start="44" end="95" type="address"/>
+    <field name="General State Memory Object Control State" start="36" end="42" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="General State Base Address Modify Enable" start="32" end="32" type="bool"/>
+    <field name="Stateless Data Port Access Memory Object Control State" start="112" end="118" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Surface State Base Address" start="140" end="191" type="address"/>
+    <field name="Surface State Memory Object Control State" start="132" end="138" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Surface State Base Address Modify Enable" start="128" end="128" type="bool"/>
+    <field name="Dynamic State Base Address" start="204" end="255" type="address"/>
+    <field name="Dynamic State Memory Object Control State" start="196" end="202" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Dynamic State Base Address Modify Enable" start="192" end="192" type="bool"/>
+    <field name="Indirect Object Base Address" start="268" end="319" type="address"/>
+    <field name="Indirect Object Memory Object Control State" start="260" end="266" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Indirect Object Base Address Modify Enable" start="256" end="256" type="bool"/>
+    <field name="Instruction Base Address" start="332" end="383" type="address"/>
+    <field name="Instruction Memory Object Control State" start="324" end="330" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Instruction Base Address Modify Enable" start="320" end="320" type="bool"/>
+    <field name="General State Buffer Size" start="396" end="415" type="uint"/>
+    <field name="General State Buffer Size Modify Enable" start="384" end="384" type="bool"/>
+    <field name="Dynamic State Buffer Size" start="428" end="447" type="uint"/>
+    <field name="Dynamic State Buffer Size Modify Enable" start="416" end="416" type="bool"/>
+    <field name="Indirect Object Buffer Size" start="460" end="479" type="uint"/>
+    <field name="Indirect Object Buffer Size Modify Enable" start="448" end="448" type="bool"/>
+    <field name="Instruction Buffer Size" start="492" end="511" type="uint"/>
+    <field name="Instruction Buffer size Modify Enable" start="480" end="480" type="bool"/>
+    <field name="Bindless Surface State Base Address" start="524" end="575" type="address"/>
+    <field name="Bindless Surface State Memory Object Control State" start="516" end="522" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Bindless Surface State Base Address Modify Enable" start="512" end="512" type="bool"/>
+    <field name="Bindless Surface State Size" start="588" end="607" type="uint"/>
+    <field name="Bindless Sampler State Base Address" start="620" end="671" type="address"/>
+    <field name="Bindless Sampler State Memory Object Control State" start="612" end="618" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="Bindless Sampler State Base Address Modify Enable" start="608" end="608" type="bool"/>
+    <field name="Bindless Sampler State Buffer Size" start="684" end="703" type="uint"/>
+  </instruction>
+
+  <instruction name="STATE_SIP" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="2"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="System Instruction Pointer" start="36" end="95" type="offset"/>
+  </instruction>
+
+  <register name="IA_VERTICES_COUNT" length="2" num="0x2310">
+    <field name="IA Vertices Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="IA_PRIMITIVES_COUNT" length="2" num="0x2318">
+    <field name="IA Primitives Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="VS_INVOCATION_COUNT" length="2" num="0x2320">
+    <field name="VS Invocation Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="HS_INVOCATION_COUNT" length="2" num="0x2300">
+    <field name="HS Invocation Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="DS_INVOCATION_COUNT" length="2" num="0x2308">
+    <field name="DS Invocation Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="GS_INVOCATION_COUNT" length="2" num="0x2328">
+    <field name="GS Invocation Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="GS_PRIMITIVES_COUNT" length="2" num="0x2330">
+    <field name="GS Primitives Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="CL_INVOCATION_COUNT" length="2" num="0x2338">
+    <field name="CL Invocation Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="CL_PRIMITIVES_COUNT" length="2" num="0x2340">
+    <field name="CL Primitives Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="PS_INVOCATION_COUNT" length="2" num="0x2348">
+    <field name="PS Invocation Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="CS_INVOCATION_COUNT" length="2" num="0x2290">
+    <field name="CS Invocation Count Report" start="0" end="63" type="uint"/>
+  </register>
+
+  <register name="BCS_INSTDONE" length="1" num="0x2206c">
+    <field name="Ring Enable" start="0" end="0" type="bool"/>
+    <field name="Blitter IDLE" start="1" end="1" type="bool" default="1"/>
+    <field name="GAB IDLE" start="2" end="2" type="bool" default="1"/>
+    <field name="BCS Done" start="3" end="3" type="bool" default="1"/>
+  </register>
+
+  <register name="INSTDONE_1" length="1" num="0x206c">
+    <field name="PRB0 Ring Enable" start="0" end="0" type="bool"/>
+    <field name="VFG Done" start="1" end="1" type="bool"/>
+    <field name="VS Done" start="2" end="2" type="bool"/>
+    <field name="HS Done" start="3" end="3" type="bool"/>
+    <field name="TE Done" start="4" end="4" type="bool"/>
+    <field name="DS Done" start="5" end="5" type="bool"/>
+    <field name="GS Done" start="6" end="6" type="bool"/>
+    <field name="SOL Done" start="7" end="7" type="bool"/>
+    <field name="CL Done" start="8" end="8" type="bool"/>
+    <field name="SF Done" start="9" end="9" type="bool"/>
+    <field name="TDG Done" start="12" end="12" type="bool"/>
+    <field name="URBM Done" start="13" end="13" type="bool"/>
+    <field name="SVG Done" start="14" end="14" type="bool"/>
+    <field name="GAFS Done" start="15" end="15" type="bool"/>
+    <field name="VFE Done" start="16" end="16" type="bool"/>
+    <field name="TSG Done" start="17" end="17" type="bool"/>
+    <field name="GAFM Done" start="18" end="18" type="bool"/>
+    <field name="GAM Done" start="19" end="19" type="bool"/>
+    <field name="SDE Done" start="22" end="22" type="bool"/>
+    <field name="RCCFBC CS Done" start="23" end="23" type="bool"/>
+  </register>
+
+  <register name="VCS_INSTDONE" length="1" num="0x1206c">
+    <field name="Ring Enable" start="0" end="0" type="bool"/>
+    <field name="USB Done" start="1" end="1" type="bool"/>
+    <field name="QRC Done" start="2" end="2" type="bool"/>
+    <field name="SEC Done" start="3" end="3" type="bool"/>
+    <field name="MPC Done" start="4" end="4" type="bool"/>
+    <field name="VFT Done" start="5" end="5" type="bool"/>
+    <field name="BSP Done" start="6" end="6" type="bool"/>
+    <field name="VLF Done" start="7" end="7" type="bool"/>
+    <field name="VOP Done" start="8" end="8" type="bool"/>
+    <field name="VMC Done" start="9" end="9" type="bool"/>
+    <field name="VIP Done" start="10" end="10" type="bool"/>
+    <field name="VIT Done" start="11" end="11" type="bool"/>
+    <field name="VDS Done" start="12" end="12" type="bool"/>
+    <field name="VMX Done" start="13" end="13" type="bool"/>
+    <field name="VCP Done" start="14" end="14" type="bool"/>
+    <field name="VCD Done" start="15" end="15" type="bool"/>
+    <field name="VAD Done" start="16" end="16" type="bool"/>
+    <field name="VMD Done" start="17" end="17" type="bool"/>
+    <field name="VIS Done" start="18" end="18" type="bool"/>
+    <field name="VAC Done" start="19" end="19" type="bool"/>
+    <field name="VAM Done" start="20" end="20" type="bool"/>
+    <field name="JPG Done" start="21" end="21" type="bool"/>
+    <field name="VBP Done" start="22" end="22" type="bool"/>
+    <field name="VHR Done" start="23" end="23" type="bool"/>
+    <field name="VCI Done" start="24" end="24" type="bool"/>
+    <field name="VCR Done" start="25" end="25" type="bool"/>
+    <field name="VIN Done" start="26" end="26" type="bool"/>
+    <field name="VPR Done" start="27" end="27" type="bool"/>
+    <field name="VTQ Done" start="28" end="28" type="bool"/>
+    <field name="Reserved" start="29" end="29" type="bool"/>
+    <field name="VCS Done" start="30" end="30" type="bool"/>
+    <field name="GAC Done" start="31" end="31" type="bool"/>
+  </register>
+
+  <register name="VECS_INSTDONE" length="1" num="0x1a06c">
+    <field name="Ring Enable" start="0" end="0" type="bool"/>
+    <field name="VECS Done" start="30" end="30" type="bool"/>
+    <field name="GAM Done" start="31" end="31" type="bool"/>
+  </register>
+
+  <register name="L3CNTLREG" length="1" num="0x7034">
+    <field name="SLM Enable" start="0" end="0" type="uint"/>
+    <field name="URB Allocation" start="1" end="7" type="uint"/>
+    <field name="RO Allocation" start="11" end="17" type="uint"/>
+    <field name="DC Allocation" start="18" end="24" type="uint"/>
+    <field name="All Allocation" start="25" end="31" type="uint"/>
+  </register>
+
+  <register name="SO_WRITE_OFFSET0" length="1" num="0x5280">
+    <field name="Write Offset" start="2" end="31" type="offset"/>
+  </register>
+
+  <register name="SO_WRITE_OFFSET1" length="1" num="0x5284">
+    <field name="Write Offset" start="2" end="31" type="offset"/>
+  </register>
+
+  <register name="SO_WRITE_OFFSET2" length="1" num="0x5288">
+    <field name="Write Offset" start="2" end="31" type="offset"/>
+  </register>
+
+  <register name="SO_WRITE_OFFSET3" length="1" num="0x528c">
+    <field name="Write Offset" start="2" end="31" type="offset"/>
+  </register>
+
+  <register name="CACHE_MODE_0" length="1" num="0x7000">
+    <field name="Null tile fix disable" start="0" end="0" type="bool"/>
+    <field name="Disable clock gating in the pixel backend" start="1" end="1" type="bool"/>
+    <field name="Hierarchical Z RAW Stall Optimization Disable" start="2" end="2" type="bool"/>
+    <field name="RCC Eviction Policy" start="4" end="4" type="bool"/>
+    <field name="STC PMA Optimization Enable" start="5" end="5" type="bool"/>
+    <field name="Sampler L2 Request Arbitration" start="6" end="7" type="uint">
+      <value name="Round Robin" value="0"/>
+      <value name="Fetch are Highest Priority" value="1"/>
+      <value name="Constants are Highest Priority" value="2"/>
+    </field>
+    <field name="Sampler L2 TLB Prefetch Enable" start="9" end="9" type="bool"/>
+    <field name="Sampler Set Remapping for 3D Disable" start="11" end="11" type="bool"/>
+    <field name="MSAA Compression Plane Number Threshold for eLLC" start="12" end="14" type="uint"/>
+    <field name="Sampler L2 Disable" start="15" end="15" type="bool"/>
+
+    <field name="Null tile fix disable Mask" start="16" end="16" type="bool"/>
+    <field name="Disable clock gating in the pixel backend Mask" start="17" end="17" type="bool"/>
+    <field name="Hierarchical Z RAW Stall Optimization Disable Mask" start="18" end="18" type="bool"/>
+    <field name="RCC Eviction Policy Mask" start="20" end="20" type="bool"/>
+    <field name="STC PMA Optimization Enable Mask" start="21" end="21" type="bool"/>
+    <field name="Sampler L2 Request Arbitration Mask" start="22" end="23" type="uint"/>
+    <field name="Sampler L2 TLB Prefetch Enable Mask" start="25" end="25" type="bool"/>
+    <field name="Sampler Set Remapping for 3D Disable Mask" start="27" end="27" type="bool"/>
+    <field name="MSAA Compression Plane Number Threshold for eLLC Mask" start="28" end="30" type="uint"/>
+    <field name="Sampler L2 Disable Mask" start="31" end="31" type="bool"/>
+  </register>
+
+  <register name="CACHE_MODE_1" length="1" num="0x7004">
+    <field name="Partial Resolve Disable In VC" start="1" end="1" type="bool"/>
+    <field name="RCZ PMA Promoted 2 Not-Promoted Allocation stall optimization Disable" start="3" end="3" type="bool"/>
+    <field name="MCS Cache Disable" start="5" end="5" type="bool"/>
+    <field name="MSC RAW Hazard Avoidance Bit" start="9" end="9" type="bool"/>
+    <field name="NP Early Z Fails Disable" start="13" end="13" type="uint"/>
+    <field name="Blend Optimization Fix Disable" start="14" end="14" type="bool"/>
+    <field name="Color Compression Disable" start="15" end="15" type="bool"/>
+
+    <field name="Partial Resolve Disable In VC Mask" start="17" end="17" type="bool"/>
+    <field name="RCZ PMA Promoted 2 Not-Promoted Allocation stall optimization Disable Mask" start="19" end="19" type="bool"/>
+    <field name="MCS Cache Disable Mask" start="21" end="21" type="bool"/>
+    <field name="MSC RAW Hazard Avoidance Bit Mask" start="25" end="25" type="bool"/>
+    <field name="NP Early Z Fails Disable Mask" start="29" end="29" type="bool"/>
+    <field name="Blend Optimization Fix Disable Mask" start="30" end="30" type="bool"/>
+    <field name="Color Compression Disable Mask" start="31" end="31" type="bool"/>
+  </register>
+
+</genxml>
diff --git a/src/intel/genxml/gen4.xml b/src/intel/genxml/gen4.xml
index 5ea15e7..6f6f1bf 100644
--- a/src/intel/genxml/gen4.xml
+++ b/src/intel/genxml/gen4.xml
@@ -9,6 +9,10 @@
     <value name="TRIFAN" value="6"/>
     <value name="QUADLIST" value="7"/>
     <value name="QUADSTRIP" value="8"/>
+    <value name="LINELIST_ADJ" value="9"/>
+    <value name="LINESTRIP_ADJ" value="10"/>
+    <value name="TRILIST_ADJ" value="11"/>
+    <value name="TRISTRIP_ADJ" value="12"/>
     <value name="TRISTRIP_REVERSE" value="13"/>
     <value name="POLYGON" value="14"/>
     <value name="RECTLIST" value="15"/>
@@ -27,6 +31,38 @@
     <value name="STORE_1_FP" value="3"/>
     <value name="STORE_1_INT" value="4"/>
     <value name="STORE_VID" value="5"/>
+    <value name="STORE_IID" value="6"/>
+    <value name="STORE_PID" value="7"/>
+  </enum>
+
+  <enum name="3D_Color_Buffer_Blend_Function" prefix="BLENDFUNCTION">
+    <value name="ADD" value="0"/>
+    <value name="SUBTRACT" value="1"/>
+    <value name="REVERSE_SUBTRACT" value="2"/>
+    <value name="MIN" value="3"/>
+    <value name="MAX" value="4"/>
+  </enum>
+
+  <enum name="3D_Color_Buffer_Blend_Factor" prefix="BLENDFACTOR">
+    <value name="ONE" value="1"/>
+    <value name="SRC_COLOR" value="2"/>
+    <value name="SRC_ALPHA" value="3"/>
+    <value name="DST_ALPHA" value="4"/>
+    <value name="DST_COLOR" value="5"/>
+    <value name="SRC_ALPHA_SATURATE" value="6"/>
+    <value name="CONST_COLOR" value="7"/>
+    <value name="CONST_ALPHA" value="8"/>
+    <value name="SRC1_COLOR" value="9"/>
+    <value name="SRC1_ALPHA" value="10"/>
+    <value name="ZERO" value="17"/>
+    <value name="INV_SRC_COLOR" value="18"/>
+    <value name="INV_SRC_ALPHA" value="19"/>
+    <value name="INV_DST_ALPHA" value="20"/>
+    <value name="INV_DST_COLOR" value="21"/>
+    <value name="INV_CONST_COLOR" value="23"/>
+    <value name="INV_CONST_ALPHA" value="24"/>
+    <value name="INV_SRC1_COLOR" value="25"/>
+    <value name="INV_SRC1_ALPHA" value="26"/>
   </enum>
 
   <enum name="3D_Compare_Function" prefix="COMPAREFUNCTION">
@@ -40,6 +76,36 @@
     <value name="GEQUAL" value="7"/>
   </enum>
 
+  <enum name="3D_Stencil_Operation" prefix="STENCILOP">
+    <value name="KEEP" value="0"/>
+    <value name="ZERO" value="1"/>
+    <value name="REPLACE" value="2"/>
+    <value name="INCRSAT" value="3"/>
+    <value name="DECRSAT" value="4"/>
+    <value name="INCR" value="5"/>
+    <value name="DECR" value="6"/>
+    <value name="INVERT" value="7"/>
+  </enum>
+
+  <enum name="3D_Logic_Op_Function" prefix="LOGICOP">
+    <value name="CLEAR" value="0"/>
+    <value name="NOR" value="1"/>
+    <value name="AND_INVERTED" value="2"/>
+    <value name="COPY_INVERTED" value="3"/>
+    <value name="AND_REVERSE" value="4"/>
+    <value name="INVERT" value="5"/>
+    <value name="XOR" value="6"/>
+    <value name="NAND" value="7"/>
+    <value name="AND" value="8"/>
+    <value name="EQUIV" value="9"/>
+    <value name="NOOP" value="10"/>
+    <value name="OR_INVERTED" value="11"/>
+    <value name="COPY" value="12"/>
+    <value name="OR_REVERSE" value="13"/>
+    <value name="OR" value="14"/>
+    <value name="SET" value="15"/>
+  </enum>
+
   <enum name="SURFACE_FORMAT" prefix="SF">
     <value name="R32G32B32A32_FLOAT" value="0"/>
     <value name="R32G32B32A32_SINT" value="1"/>
@@ -50,8 +116,6 @@
     <value name="R32G32B32X32_FLOAT" value="6"/>
     <value name="R32G32B32A32_SSCALED" value="7"/>
     <value name="R32G32B32A32_USCALED" value="8"/>
-    <value name="R32G32B32A32_SFIXED" value="32"/>
-    <value name="R64G64_PASSTHRU" value="33"/>
     <value name="R32G32B32_FLOAT" value="64"/>
     <value name="R32G32B32_SINT" value="65"/>
     <value name="R32G32B32_UINT" value="66"/>
@@ -83,8 +147,6 @@
     <value name="R16G16B16A16_USCALED" value="148"/>
     <value name="R32G32_SSCALED" value="149"/>
     <value name="R32G32_USCALED" value="150"/>
-    <value name="R32G32_SFIXED" value="160"/>
-    <value name="R64_PASSTHRU" value="161"/>
     <value name="B8G8R8A8_UNORM" value="192"/>
     <value name="B8G8R8A8_UNORM_SRGB" value="193"/>
     <value name="R10G10B10A2_UNORM" value="194"/>
@@ -264,7 +326,7 @@
     <value name="RAW" value="511"/>
   </enum>
 
-  <enum name="Texture Coordinate Mode" prefix="TCM">
+  <enum name="Texture_Coordinate_Mode" prefix="TCM">
     <value name="WRAP" value="0"/>
     <value name="MIRROR" value="1"/>
     <value name="CLAMP" value="2"/>
@@ -273,363 +335,102 @@
     <value name="MIRROR_ONCE" value="5"/>
   </enum>
 
-  <struct name="VS_STATE" length="7">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Thread Priority" start="49" end="49" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
-    <field name="MaskStack Exception Enable" start="43" end="43" type="bool"/>
-    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
-    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
-    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
-    <field name="Number of URB Entries" start="139" end="145" type="uint"/>
-    <field name="Statistics Enable" start="138" end="138" type="bool"/>
-    <!-- DWord 5 -->
-    <field name="Sampler State Pointer" start="165" end="191" type="address"/>
-    <field name="Sampler Count" start="160" end="162" type="uint"/>
-    <!-- DWord 6 -->
-    <field name="Vertex Cache Disable" start="193" end="193" type="bool"/>
-    <field name="VS Function Enable" start="192" end="192" type="bool"/>
-  </struct>
-
-  <struct name="GS_STATE" length="7">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
-    <field name="MaskStack Exception Enable" start="43" end="43" type="bool"/>
-    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
-    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Maximum Number of Threads" start="153" end="157" type="uint"/>
-    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
-    <field name="Number of URB Entries" start="139" end="145" type="uint"/>
-    <!-- DWord 5 -->
-    <field name="Sampler State Pointer" start="165" end="191" type="address"/>
-    <field name="Sampler Count" start="160" end="162" type="uint"/>
-    <!-- DWord 6 -->
-    <field name="Reorder Enable" start="222" end="222" type="bool"/>
-    <field name="Maximum VPIndex" start="192" end="195" type="uint"/>
-  </struct>
-
-  <struct name="CLIP_STATE" length="11">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Thread Priority" start="49" end="49" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
-    <field name="MaskStack Exception Enable" start="43" end="43" type="bool"/>
-    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
-    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Maximum Number of Threads" start="153" end="157" type="uint"/>
-    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
-    <field name="Number of URB Entries" start="139" end="145" type="uint"/>
-    <!-- DWord 5 -->
-    <field name="Vertex Position Space" start="189" end="189" type="uint"/>
-    <field name="Viewport XY ClipTest Enable" start="188" end="188" type="bool"/>
-    <field name="Viewport Z ClipTest Enable" start="187" end="187" type="bool"/>
-    <field name="Guardband ClipTest Enable" start="186" end="186" type="bool"/>
-    <field name="UserClipFlags MustClip Enable" start="184" end="184" type="bool"/>
-    <field name="UserClipFlags ClipTest Enable Bitmask" start="176" end="183" type="uint"/>
-    <field name="Clip Mode" start="173" end="175" type="uint">
-      <value name="CLIPMODE_NORMAL" value="0"/>
-      <value name="CLIPMODE_ALL" value="1"/>
-      <value name="CLIPMODE_CLIP_NON_REJECTED" value="2"/>
-      <value name="CLIPMODE_REJECT_ALL" value="3"/>
-      <value name="CLIPMODE_ACCEPT_ALL" value="4"/>
-      <value name="CLIPMODE_NORMAL_FFCLIP" value="5"/>
-    </field>
-
-    <!-- DWord 6 -->
-    <field name="Clipper Viewport State Pointer" start="197" end="223" type="offset"/>
-    <!-- DWord 7 -->
-    <field name="Screen Space Viewport X Min" start="224" end="255" type="uint"/>
-    <!-- DWord 8 -->
-    <field name="Screen Space Viewport X Max" start="256" end="287" type="uint"/>
-    <!-- DWord 9 -->
-    <field name="Screen Space Viewport Y Min" start="288" end="319" type="uint"/>
-    <!-- DWord 10 -->
-    <field name="Screen Space Viewport Y Max" start="320" end="351" type="uint"/>
-  </struct>
-
-  <struct name="SF_STATE" length="8">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Thread Priority" start="49" end="49" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
-    <field name="MaskStack Exception Enable" start="43" end="43" type="bool"/>
-    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
-    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
-    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
-    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
-    <!-- DWord 5 -->
-    <field name="Setup Viewport State Pointer" start="165" end="195" type="offset"/>
-    <field name="Viewport Transform Enable" start="161" end="161" type="uint"/>
-    <field name="Front Winding" start="160" end="160" type="uint">
-      <value name="FRONTWINDING_CW" value="0"/>
-      <value name="FRONTWINDING_CCW" value="1"/>
-    </field>
-    <!-- DWord 6 -->
-    <field name="Anti-aliasing Enable" start="223" end="223" type="bool"/>
-    <field name="Cull Mode" start="221" end="222" type="uint" prefix="CULLMODE">
-      <value name="BOTH" value="0"/>
-      <value name="NONE" value="1"/>
-      <value name="FRONT" value="2"/>
-      <value name="BACK" value="3"/>
-    </field>
-    <field name="Fast Scissor Clip Disable" start="220" end="220" type="bool"/>
-    <field name="Line Width" start="216" end="219" type="uint"/>
-    <field name="Line End Cap Antialiasing Region Width" start="214" end="215" type="uint"/>
-    <field name="Point Rasterization Rule" start="212" end="213" type="uint">
-      <value name="RASTRULE_UPPER_LEFT" value="0"/>
-      <value name="RASTRULE_UPPER_RIGHT" value="1"/>
-    </field>
-    <field name="Zero Pixel Triangle Filter Disable" start="211" end="211" type="bool"/>
-    <field name="2x2 Pixel Triangle Filter Disable" start="210" end="210" type="bool"/>
-    <field name="Scissor Rectangle Enable" start="209" end="209" type="bool"/>
-    <field name="Destination Origin Horizontal Bias" start="205" end="208" type="uint"/>
-    <field name="Destination Origin Vertical Bias" start="201" end="204" type="uint"/>
-
-    <!-- DWord 7 -->
-    <field name="Last Pixel Enable" start="255" end="255" type="bool"/>
-    <field name="Triangle Strip/List Provoking Vertex Select" start="253" end="254" type="uint"/>
-    <field name="Line Strip/List Provoking Vertex Select" start="251" end="252" type="uint"/>
-    <field name="Triangle Fan Provoking Vertex Select" start="249" end="250" type="uint"/>
-    <field name="AA Line Distance Mode" start="238" end="238" type="uint" prefix="AALINEDISTANCE">
-      <value name="MANHATTAN" value="0"/>
-      <value name="TRUE" value="1"/>
-    </field>
-    <field name="Sprite Point Enable" start="237" end="237" type="bool"/>
-    <field name="Vertex Sub Pixel Precision Select" start="236" end="236" type="uint"/>
-    <field name="Use Point Width State" start="235" end="235" type="uint"/>
-    <field name="Point Width" start="224" end="234" type="uint"/>
-  </struct>
-
-  <struct name="WM_STATE" length="7">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer[0]" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Thread Priority" start="49" end="49" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Depth Coefficient URB Read Offset" start="40" end="45" type="uint"/>
-    <field name="Illegal Opcode Exception Enable" start="36" end="36" type="bool"/>
-    <field name="MaskStack Exception Enable" start="34" end="34" type="bool"/>
-    <field name="Software Exception Enable" start="33" end="33" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Setup URB Entry Read Length" start="107" end="113" type="uint"/>
-    <field name="Setup URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Sampler State Pointer" start="133" end="159" type="address"/>
-    <field name="Sampler Count" start="130" end="132" type="uint"/>
-    <field name="Statistics Enable" start="128" end="128" type="bool"/>
-    <!-- DWord 5 -->
-    <field name="Maximum Number of Threads" start="185" end="191" type="uint"/>
-    <field name="Legacy Diamond Line Rasterization" start="183" end="183" type="bool"/>
-    <field name="Pixel Shader Kill Pixel" start="182" end="182" type="bool"/>
-    <field name="Pixel Shader Computed Depth" start="181" end="181" type="bool"/>
-    <field name="Pixel Shader Uses Source Depth" start="180" end="180" type="bool"/>
-    <field name="Thread Dispatch Enable" start="179" end="179" type="bool"/>
-    <field name="Early Depth Test Enable" start="178" end="178" type="bool"/>
-    <field name="Line End Cap Antialiasing Region Width" start="176" end="177" type="uint"/>
-    <field name="Line Antialiasing Region Width" start="174" end="175" type="uint"/>
-    <field name="Polygon Stipple Enable" start="173" end="173" type="bool"/>
-    <field name="Global Depth Offset Enable" start="172" end="172" type="bool"/>
-    <field name="Line Stipple Enable" start="171" end="171" type="bool"/>
-    <field name="Legacy Global Depth Bias Enable" start="170" end="170" type="bool"/>
-    <field name="32-Pixel Dispatch Enable" start="162" end="162" type="bool"/>
-    <field name="16-Pixel Dispatch Enable" start="161" end="161" type="bool"/>
-    <field name="8-Pixel Dispatch Enable" start="160" end="160" type="bool"/>
-    <!-- DWord 6 -->
-    <field name="Global Depth Offset Constant" start="192" end="223" type="float"/>
-    <!-- DWord 7 -->
-    <field name="Global Depth Offset Scale" start="224" end="255" type="float"/>
-  </struct>
-
-  <struct name="VERTEX_BUFFER_STATE" length="4">
-    <!-- DWord 0 -->
-    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/>
-    <field name="Buffer Access Type" start="26" end="26" type="uint">
-      <value name="VERTEXDATA" value="0"/>
-      <value name="INSTANCEDATA" value="1"/>
-    </field>
-    <field name="Buffer Pitch" start="0" end="10" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Buffer Starting Address" start="32" end="63" type="address"/>
-    <!-- DWord 2 -->
-    <field name="Max Index" start="64" end="95" type="uint"/>
-    <!-- DWord 3 -->
-  </struct>
-
-  <struct name="VERTEX_ELEMENT_STATE" length="2">
-    <!-- DWord 0 -->
-    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/>
-    <field name="Valid" start="26" end="26" type="uint"/>
-    <field name="Source Element Format" start="16" end="24" type="uint"/>
-    <field name="Source Element Offset" start="0" end="10" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Component 0 Control" start="60" end="62" type="uint"/>
-    <field name="Component 1 Control" start="56" end="58" type="uint"/>
-    <field name="Component 2 Control" start="52" end="54" type="uint"/>
-    <field name="Component 3 Control" start="48" end="50" type="uint"/>
-    <field name="Destination Element Offset" start="32" end="39" type="uint"/>
-  </struct>
-
-  <struct name="CLIP_VIEWPORT" length="4">
-    <field name="XMin Clip Guardband" start="0" end="31" type="uint"/>
-    <field name="XMax Clip Guardband" start="32" end="63" type="uint"/>
-    <field name="YMin Clip Guardband" start="64" end="95" type="uint"/>
-    <field name="YMax Clip Guardband" start="96" end="127" type="uint"/>
-  </struct>
-
-  <struct name="SF_VIEWPORT" length="8">
-    <!-- DWord 0 -->
-    <field name="Viewport Matrix Element m00" start="0" end="31" type="float"/>
-    <!-- DWord 1 -->
-    <field name="Viewport Matrix Element m11" start="32" end="63" type="float"/>
-    <!-- DWord 2 -->
-    <field name="Viewport Matrix Element m22" start="64" end="95" type="float"/>
-    <!-- DWord 3 -->
-    <field name="Viewport Matrix Element m30" start="96" end="127" type="float"/>
-    <!-- DWord 4 -->
-    <field name="Viewport Matrix Element m31" start="128" end="159" type="float"/>
-    <!-- DWord 5 -->
-    <field name="Viewport Matrix Element m32" start="160" end="191" type="float"/>
-    <!-- DWord 6 -->
-    <field name="Scissor Rectangle Y Min" start="208" end="223" type="uint"/>
-    <field name="Scissor Rectangle X Min" start="192" end="207" type="uint"/>
-    <!-- DWord 7 -->
-    <field name="Scissor Rectangle Y Max" start="224" end="239" type="uint"/>
-    <field name="Scissor Rectangle X Max" start="240" end="255" type="uint"/>
-  </struct>
-
   <struct name="CC_VIEWPORT" length="2">
     <field name="Minimum Depth" start="0" end="31" type="float"/>
     <field name="Maximum Depth" start="32" end="63" type="float"/>
   </struct>
 
-  <struct name="COLOR_CALC_STATE" length="6">
-    <!-- DWord 0 -->
+  <struct name="CLIP_STATE" length="11">
+    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="Clipper Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="GS Output Object Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="API Mode" start="190" end="190" type="uint" prefix="APIMODE">
+      <value name="OGL" value="0"/>
+      <value name="D3D" value="1"/>
+    </field>
+    <field name="Vertex Position Space" start="189" end="189" type="uint" prefix="VPOS">
+      <value name="NDCSPACE" value="0"/>
+      <value name="SCREENSPACE" value="1"/>
+    </field>
+    <field name="Viewport XY ClipTest Enable" start="188" end="188" type="bool"/>
+    <field name="Viewport Z ClipTest Enable" start="187" end="187" type="bool"/>
+    <field name="Guardband ClipTest Enable" start="186" end="186" type="bool"/>
+    <field name="UserClipFlags MustClip Enable" start="184" end="184" type="bool"/>
+    <field name="UserClipDistance ClipTest Enable Bitmask" start="176" end="183" type="uint"/>
+    <field name="Clip Mode" start="173" end="175" type="uint" prefix="CLIPMODE">
+      <value name="NORMAL" value="0"/>
+      <value name="ALL" value="1"/>
+      <value name="CLIP_NON_REJECTED" value="2"/>
+      <value name="REJECT_ALL" value="3"/>
+      <value name="ACCEPT_ALL" value="4"/>
+    </field>
+    <field name="Clipper Viewport State Pointer" start="197" end="223" type="address"/>
+    <field name="Screen Space Viewport X Min" start="224" end="255" type="float"/>
+    <field name="Screen Space Viewport X Max" start="256" end="287" type="float"/>
+    <field name="Screen Space Viewport Y Min" start="288" end="319" type="float"/>
+    <field name="Screen Space Viewport Y Max" start="320" end="351" type="float"/>
+  </struct>
+
+  <struct name="CLIP_VIEWPORT" length="4">
+    <field name="XMin Clip Guardband" start="0" end="31" type="float"/>
+    <field name="XMax Clip Guardband" start="32" end="63" type="float"/>
+    <field name="YMin Clip Guardband" start="64" end="95" type="float"/>
+    <field name="YMax Clip Guardband" start="96" end="127" type="float"/>
+  </struct>
+
+  <struct name="SCISSOR_RECT" length="2">
+    <field name="Scissor Rectangle Y Min" start="16" end="31" type="uint"/>
+    <field name="Scissor Rectangle X Min" start="0" end="15" type="uint"/>
+    <field name="Scissor Rectangle Y Max" start="48" end="63" type="uint"/>
+    <field name="Scissor Rectangle X Max" start="32" end="47" type="uint"/>
+  </struct>
+
+  <struct name="COLOR_CALC_STATE" length="8">
     <field name="Stencil Test Enable" start="31" end="31" type="bool"/>
     <field name="Stencil Test Function" start="28" end="30" type="3D_Compare_Function"/>
-    <field name="Stencil Fail Op" start="25" end="27" type="uint">
-      <value name="STENCILOP_KEEP" value="0"/>
-      <value name="STENCILOP_ZERO" value="1"/>
-      <value name="STENCILOP_REPLACE" value="2"/>
-      <value name="STENCILOP_INCRSAT" value="3"/>
-      <value name="STENCILOP_DECRSAT" value="4"/>
-      <value name="STENCILOP_INCR" value="5"/>
-      <value name="STENCILOP_DECR" value="6"/>
-      <value name="STENCILOP_INVERT" value="7"/>
-    </field>
-    <field name="Stencil Pass Depth Fail Op" start="22" end="24" type="uint"/>
-    <field name="Stencil Pass Depth Pass Op" start="19" end="21" type="uint"/>
+    <field name="Stencil Fail Op" start="25" end="27" type="3D_Stencil_Operation"/>
+    <field name="Stencil Pass Depth Fail Op" start="22" end="24" type="3D_Stencil_Operation"/>
+    <field name="Stencil Pass Depth Pass Op" start="19" end="21" type="3D_Stencil_Operation"/>
     <field name="Stencil Buffer Write Enable" start="18" end="18" type="bool"/>
     <field name="Double Sided Stencil Enable" start="15" end="15" type="bool"/>
-    <field name="BackFace Stencil Test Function" start="12" end="14" type="3D_Compare_Function"/>
-    <field name="Backface Stencil Fail Op" start="9" end="11" type="uint">
-      <value name="STENCILOP_KEEP" value="0"/>
-      <value name="STENCILOP_ZERO" value="1"/>
-      <value name="STENCILOP_REPLACE" value="2"/>
-      <value name="STENCILOP_INCRSAT" value="3"/>
-      <value name="STENCILOP_DECRSAT" value="4"/>
-      <value name="STENCILOP_INCR" value="5"/>
-      <value name="STENCILOP_DECR" value="6"/>
-      <value name="STENCILOP_INVERT" value="7"/>
-    </field>
-    <field name="Backface Stencil Pass Depth Fail Op" start="6" end="8" type="uint"/>
-    <field name="Backface Stencil Pass Depth Pass Op" start="3" end="5" type="uint"/>
-    <!-- DWord 1 -->
+    <field name="Backface Stencil Test Function" start="12" end="14" type="3D_Compare_Function"/>
+    <field name="Backface Stencil Fail Op" start="9" end="11" type="3D_Stencil_Operation"/>
+    <field name="Backface Stencil Pass Depth Fail Op" start="6" end="8" type="3D_Stencil_Operation"/>
+    <field name="Backface Stencil Pass Depth Pass Op" start="3" end="5" type="3D_Stencil_Operation"/>
     <field name="Stencil Reference Value" start="56" end="63" type="uint"/>
     <field name="Stencil Test Mask" start="48" end="55" type="uint"/>
     <field name="Stencil Write Mask" start="40" end="47" type="uint"/>
-    <field name="BackFace Stencil Reference Value" start="32" end="39" type="uint"/>
-    <!-- DWord 2 -->
+    <field name="Backface Stencil Reference Value" start="32" end="39" type="uint"/>
     <field name="Backface Stencil Test Mask" start="88" end="95" type="uint"/>
     <field name="Backface Stencil Write Mask" start="80" end="87" type="uint"/>
     <field name="Depth Test Enable" start="79" end="79" type="bool"/>
     <field name="Depth Test Function" start="76" end="78" type="3D_Compare_Function"/>
     <field name="Depth Buffer Write Enable" start="75" end="75" type="bool"/>
     <field name="Logic Op Enable" start="64" end="64" type="bool"/>
-    <!-- DWord 3 -->
     <field name="Alpha Test Format" start="111" end="111" type="uint">
       <value name="ALPHATEST_UNORM8" value="0"/>
       <value name="ALPHATEST_FLOAT32" value="1"/>
@@ -638,69 +439,17 @@
     <field name="Color Buffer Blend Enable" start="108" end="108" type="bool"/>
     <field name="Alpha Test Enable" start="107" end="107" type="bool"/>
     <field name="Alpha Test Function" start="104" end="106" type="3D_Compare_Function"/>
-    <!-- DWord 4 -->
-    <field name="Color Calculator Viewport State Pointer" start="133" end="159" type="address"/>
-    <!-- DWord 5 -->
+    <field name="CC Viewport State Pointer" start="133" end="159" type="address"/>
     <field name="Color Dither Enable" start="191" end="191" type="bool"/>
     <field name="Round Disable Function Disable" start="190" end="190" type="bool"/>
-    <field name="Logic Op Function" start="176" end="179" type="uint">
-      <value name="LOGICOP_CLEAR" value="0"/>
-      <value name="LOGICOP_NOR" value="1"/>
-      <value name="LOGICOP_AND_INVERTED" value="2"/>
-      <value name="LOGICOP_COPY_INVERTED" value="3"/>
-      <value name="LOGICOP_AND_REVERSE" value="4"/>
-      <value name="LOGICOP_INVERT" value="5"/>
-      <value name="LOGICOP_XOR" value="6"/>
-      <value name="LOGICOP_NAND" value="7"/>
-      <value name="LOGICOP_AND" value="8"/>
-      <value name="LOGICOP_EQUIV" value="9"/>
-      <value name="LOGICOP_NOOP" value="10"/>
-      <value name="LOGICOP_OR_INVERTED" value="11"/>
-      <value name="LOGICOP_COPY" value="12"/>
-      <value name="LOGICOP_OR_REVERSE" value="13"/>
-      <value name="LOGICOP_OR" value="14"/>
-      <value name="LOGICOP_SET" value="15"/>
-    </field>
+    <field name="Logic Op Function" start="176" end="179" type="3D_Logic_Op_Function"/>
     <field name="Statistics Enable" start="175" end="175" type="bool"/>
-    <field name="Alpha Blend Function" start="172" end="174" type="uint">
-      <value name="BLENDFUNCTION_ADD" value="0"/>
-      <value name="BLENDFUNCTION_SUBTRACT" value="1"/>
-      <value name="BLENDFUNCTION_REVERSE_SUBTRACT" value="2"/>
-      <value name="BLENDFUNCTION_MIN" value="3"/>
-      <value name="BLENDFUNCTION_MAX" value="4"/>
-    </field>
-    <field name="Source Alpha Blend Factor" start="167" end="171" type="uint">
-      <value name="BLENDFACTOR_ONE" value="1"/>
-      <value name="BLENDFACTOR_SRC_COLOR" value="2"/>
-      <value name="BLENDFACTOR_SRC_ALPHA" value="3"/>
-      <value name="BLENDFACTOR_DST_ALPHA" value="4"/>
-      <value name="BLENDFACTOR_DST_COLOR" value="5"/>
-      <value name="BLENDFACTOR_SRC_ALPHA_SATURATE" value="6"/>
-      <value name="BLENDFACTOR_CONST_COLOR" value="7"/>
-      <value name="BLENDFACTOR_CONST_ALPHA" value="8"/>
-      <value name="BLENDFACTOR_SRC1_COLOR" value="9"/>
-      <value name="BLENDFACTOR_SRC1_ALPHA" value="10"/>
-      <value name="BLENDFACTOR_ZERO" value="17"/>
-      <value name="BLENDFACTOR_INV_SRC_COLOR" value="18"/>
-      <value name="BLENDFACTOR_INV_SRC_ALPHA" value="19"/>
-      <value name="BLENDFACTOR_INV_DST_ALPHA" value="20"/>
-      <value name="BLENDFACTOR_INV_DST_COLOR" value="21"/>
-      <value name="BLENDFACTOR_INV_CONST_COLOR" value="23"/>
-      <value name="BLENDFACTOR_INV_CONST_ALPHA" value="24"/>
-      <value name="BLENDFACTOR_INV_SRC1_COLOR" value="25"/>
-      <value name="BLENDFACTOR_INV_SRC1_ALPHA" value="26"/>
-    </field>
-    <field name="Destination Alpha Blend Factor" start="162" end="166" type="uint"/>
-    <!-- DWord 6 -->
-    <field name="Color Blend Function" start="221" end="223" type="uint">
-      <value name="BLENDFUNCTION_ADD" value="0"/>
-      <value name="BLENDFUNCTION_SUBTRACT" value="1"/>
-      <value name="BLENDFUNCTION_REVERSE_SUBTRACT" value="2"/>
-      <value name="BLENDFUNCTION_MIN" value="3"/>
-      <value name="BLENDFUNCTION_MAX" value="4"/>
-    </field>
-    <field name="Source Blend Factor" start="216" end="220" type="uint"/>
-    <field name="Destination Blend Factor" start="211" end="215" type="uint"/>
+    <field name="Alpha Blend Function" start="172" end="174" type="3D_Color_Buffer_Blend_Function"/>
+    <field name="Source Alpha Blend Factor" start="167" end="171" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Alpha Blend Factor" start="162" end="166" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Color Blend Function" start="221" end="223" type="3D_Color_Buffer_Blend_Function"/>
+    <field name="Source Blend Factor" start="216" end="220" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Blend Factor" start="211" end="215" type="3D_Color_Buffer_Blend_Factor"/>
     <field name="X Dither Offset" start="209" end="210" type="uint"/>
     <field name="Y Dither Offset" start="207" end="208" type="uint"/>
     <field name="Color Clamp Range" start="194" end="195" type="uint">
@@ -710,11 +459,40 @@
     </field>
     <field name="Pre-Blend Color Clamp Enable" start="193" end="193" type="bool"/>
     <field name="Post-Blend Color Clamp Enable" start="192" end="192" type="bool"/>
-    <!-- DWord 7 -->
-    <field name="Alpha Reference Value" start="224" end="255" type="float"/>
+    <field name="Alpha Reference Value As UNORM8" start="224" end="255" type="uint"/>
+    <field name="Alpha Reference Value As FLOAT32" start="224" end="255" type="float"/>
   </struct>
 
-  <struct name="RENDER_SURFACE_STATE" length="6">
+  <struct name="GS_STATE" length="7">
+    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="Sampler State Pointer" start="165" end="191" type="address"/>
+    <field name="Sampler Count" start="160" end="162" type="uint"/>
+    <field name="Reorder Enable" start="222" end="222" type="bool"/>
+    <field name="Discard Adjacency" start="221" end="221" type="bool"/>
+    <field name="Maximum VPIndex" start="192" end="195" type="uint"/>
+  </struct>
+
+  <struct name="RENDER_SURFACE_STATE" length="5">
     <field name="Surface Type" start="29" end="31" type="uint">
       <value name="SURFTYPE_1D" value="0"/>
       <value name="SURFTYPE_2D" value="1"/>
@@ -765,12 +543,16 @@
     <field name="Render Target View Extent" start="136" end="144" type="uint"/>
   </struct>
 
+  <struct name="SAMPLER_BORDER_COLOR_STATE" length="12">
+    <field name="Border Color Red" start="0" end="31" type="float"/>
+    <field name="Border Color Green" start="32" end="63" type="float"/>
+    <field name="Border Color Blue" start="64" end="95" type="float"/>
+    <field name="Border Color Alpha" start="96" end="127" type="float"/>
+  </struct>
+
   <struct name="SAMPLER_STATE" length="4">
     <field name="Sampler Disable" start="31" end="31" type="bool"/>
-    <field name="LOD PreClamp Enable" start="28" end="28" type="uint" prefix="CLAMP_ENABLE">
-      <value name="D3D" value="0"/>
-      <value name="OGL" value="1"/>
-    </field>
+    <field name="LOD PreClamp Enable" start="28" end="28" type="bool"/>
     <field name="Base Mip Level" start="22" end="26" type="u4.1"/>
     <field name="Mip Mode Filter" start="20" end="21" type="uint" prefix="MIPFILTER">
       <value name="NONE" value="0"/>
@@ -801,17 +583,15 @@
       <value name="CUBECTRLMODE_PROGRAMMED" value="0"/>
       <value name="CUBECTRLMODE_OVERRIDE" value="1"/>
     </field>
-    <field name="TCX Address Control Mode" start="38" end="40" type="uint"/>
-    <field name="TCY Address Control Mode" start="35" end="37" type="uint"/>
-    <field name="TCZ Address Control Mode" start="32" end="34" type="uint"/>
+    <field name="TCX Address Control Mode" start="38" end="40" type="Texture_Coordinate_Mode"/>
+    <field name="TCY Address Control Mode" start="35" end="37" type="Texture_Coordinate_Mode"/>
+    <field name="TCZ Address Control Mode" start="32" end="34" type="Texture_Coordinate_Mode"/>
     <field name="Border Color Pointer" start="69" end="95" type="offset"/>
-    <field name="Monochrome Filter Height: Reserved" start="125" end="127" type="uint"/>
-    <field name="Monochrome Filter Width" start="122" end="124" type="uint"/>
     <field name="ChromaKey Enable" start="121" end="121" type="bool"/>
     <field name="ChromaKey Index" start="119" end="120" type="uint"/>
-    <field name="ChromaKey Mode" start="118" end="118" type="uint">
-      <value name="KEYFILTER_KILL_ON_ANY_MATCH" value="0"/>
-      <value name="KEYFILTER_REPLACE_BLACK" value="1"/>
+    <field name="ChromaKey Mode" start="118" end="118" type="uint" prefix="KEYFILTER">
+      <value name="KILL_ON_ANY_MATCH" value="0"/>
+      <value name="REPLACE_BLACK" value="1"/>
     </field>
     <field name="Maximum Anisotropy" start="115" end="117" type="uint">
       <value name="RATIO 2:1" value="0"/>
@@ -831,8 +611,213 @@
     <field name="U Address Mag Filter Rounding Enable" start="114" end="114" type="bool"/>
   </struct>
 
+  <struct name="SF_STATE" length="8">
+    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="Setup Viewport State Offset" start="165" end="191" type="address"/>
+    <field name="Viewport Transform Enable" start="161" end="161" type="bool"/>
+    <field name="Front Winding" start="160" end="160" type="uint">
+      <value name="FRONTWINDING_CW" value="0"/>
+      <value name="FRONTWINDING_CCW" value="1"/>
+    </field>
+    <field name="Anti-Aliasing Enable" start="223" end="223" type="bool"/>
+    <field name="Cull Mode" start="221" end="222" type="uint" prefix="CULLMODE">
+      <value name="BOTH" value="0"/>
+      <value name="NONE" value="1"/>
+      <value name="FRONT" value="2"/>
+      <value name="BACK" value="3"/>
+    </field>
+    <field name="Fast Scissor Clip Disable" start="220" end="220" type="bool"/>
+    <field name="Line Width" start="216" end="219" type="u3.1"/>
+    <field name="Line End Cap Antialiasing Region Width" start="214" end="215" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Point Rasterization Rule" start="212" end="213" type="uint">
+      <value name="RASTRULE_UPPER_LEFT" value="0"/>
+      <value name="RASTRULE_UPPER_RIGHT" value="1"/>
+    </field>
+    <field name="Zero Pixel Triangle Filter Disable" start="211" end="211" type="bool"/>
+    <field name="2x2 Pixel Triangle Filter Disable" start="210" end="210" type="bool"/>
+    <field name="Scissor Rectangle Enable" start="209" end="209" type="bool"/>
+    <field name="Destination Origin Horizontal Bias" start="205" end="208" type="u0.4"/>
+    <field name="Destination Origin Vertical Bias" start="201" end="204" type="u0.4"/>
+    <field name="Last Pixel Enable" start="255" end="255" type="bool"/>
+    <field name="Triangle Strip/List Provoking Vertex Select" start="253" end="254" type="uint">
+      <value name="Vertex 0" value="0"/>
+      <value name="Vertex 1" value="1"/>
+      <value name="Vertex 2" value="2"/>
+    </field>
+    <field name="Line Strip/List Provoking Vertex Select" start="251" end="252" type="uint">
+      <value name="Vertex 0" value="0"/>
+      <value name="Vertex 1" value="1"/>
+    </field>
+    <field name="Triangle Fan Provoking Vertex Select" start="249" end="250" type="uint">
+      <value name="Vertex 0" value="0"/>
+      <value name="Vertex 1" value="1"/>
+      <value name="Vertex 2" value="2"/>
+    </field>
+    <field name="Sprite Point Enable" start="237" end="237" type="bool"/>
+    <field name="Vertex Sub Pixel Precision Select" start="236" end="236" type="uint">
+      <value name="8 Sub-Pixel Precision Bits" value="0"/>
+      <value name="4 Sub-Pixel Precision Bits" value="1"/>
+    </field>
+    <field name="Point Width Source" start="235" end="235" type="uint">
+      <value name="Vertex" value="0"/>
+      <value name="State" value="1"/>
+    </field>
+    <field name="Point Width" start="224" end="234" type="u8.3"/>
+  </struct>
+
+  <struct name="SF_VIEWPORT" length="8">
+    <field name="Viewport Matrix Element m00" start="0" end="31" type="float"/>
+    <field name="Viewport Matrix Element m11" start="32" end="63" type="float"/>
+    <field name="Viewport Matrix Element m22" start="64" end="95" type="float"/>
+    <field name="Viewport Matrix Element m30" start="96" end="127" type="float"/>
+    <field name="Viewport Matrix Element m31" start="128" end="159" type="float"/>
+    <field name="Viewport Matrix Element m32" start="160" end="191" type="float"/>
+    <field name="Scissor Rectangle" start="192" end="255" type="SCISSOR_RECT"/>
+  </struct>
+
+  <struct name="VERTEX_BUFFER_STATE" length="4">
+    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/>
+    <field name="Buffer Access Type" start="26" end="26" type="uint">
+      <value name="VERTEXDATA" value="0"/>
+      <value name="INSTANCEDATA" value="1"/>
+    </field>
+    <field name="Buffer Pitch" start="0" end="10" type="uint"/>
+    <field name="Buffer Starting Address" start="32" end="63" type="address"/>
+    <field name="Max Index" start="64" end="95" type="uint"/>
+    <field name="Instance Data Step Rate" start="96" end="127" type="uint"/>
+  </struct>
+
+  <struct name="VERTEX_ELEMENT_STATE" length="2">
+    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/>
+    <field name="Valid" start="26" end="26" type="bool"/>
+    <field name="Source Element Format" start="16" end="24" type="SURFACE_FORMAT"/>
+    <field name="Source Element Offset" start="0" end="10" type="uint"/>
+    <field name="Destination Element Offset" start="32" end="39" type="uint"/>
+    <field name="Component 0 Control" start="60" end="62" type="3D_Vertex_Component_Control"/>
+    <field name="Component 1 Control" start="56" end="58" type="3D_Vertex_Component_Control"/>
+    <field name="Component 2 Control" start="52" end="54" type="3D_Vertex_Component_Control"/>
+    <field name="Component 3 Control" start="48" end="50" type="3D_Vertex_Component_Control"/>
+  </struct>
+
+  <struct name="VS_STATE" length="7">
+    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="Sampler State Pointer" start="165" end="191" type="address"/>
+    <field name="Sampler Count" start="160" end="162" type="uint"/>
+    <field name="Vertex Cache Disable" start="193" end="193" type="bool"/>
+    <field name="Enable" start="192" end="192" type="bool"/>
+  </struct>
+
+  <struct name="WM_STATE" length="8">
+    <field name="Kernel Start Pointer 0" start="6" end="31" type="address"/>
+    <field name="GRF Register Count 0" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="High" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Depth Coefficient URB Read Offset" start="40" end="45" type="uint"/>
+    <field name="Illegal Opcode Exception Enable" start="36" end="36" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="34" end="34" type="bool"/>
+    <field name="Software  Exception Enable" start="33" end="33" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Setup URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Setup URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For Constant/Setup Data 0" start="96" end="99" type="uint"/>
+    <field name="Sampler State Pointer" start="133" end="159" type="address"/>
+    <field name="Sampler Count" start="130" end="132" type="uint"/>
+    <field name="Statistics Enable" start="128" end="128" type="bool"/>
+    <field name="Maximum Number of Threads" start="185" end="191" type="uint"/>
+    <field name="Legacy Diamond Line Rasterization" start="183" end="183" type="bool"/>
+    <field name="Pixel Shader Kills Pixel" start="182" end="182" type="bool"/>
+    <field name="Pixel Shader Computed Depth" start="181" end="181" type="bool"/>
+    <field name="Pixel Shader Uses Source Depth" start="180" end="180" type="bool"/>
+    <field name="Thread Dispatch Enable" start="179" end="179" type="bool"/>
+    <field name="Early Depth Test Enable" start="178" end="178" type="bool"/>
+    <field name="Line End Cap Antialiasing Region Width" start="176" end="177" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Line Antialiasing Region Width" start="174" end="175" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Polygon Stipple Enable" start="173" end="173" type="bool"/>
+    <field name="Global Depth Offset Enable" start="172" end="172" type="bool"/>
+    <field name="Line Stipple Enable" start="171" end="171" type="bool"/>
+    <field name="Legacy Global Depth Bias Enable" start="170" end="170" type="bool"/>
+    <field name="32 Pixel Dispatch Enable" start="162" end="162" type="bool"/>
+    <field name="16 Pixel Dispatch Enable" start="161" end="161" type="bool"/>
+    <field name="8 Pixel Dispatch Enable" start="160" end="160" type="bool"/>
+    <field name="Global Depth Offset Constant" start="192" end="223" type="float"/>
+    <field name="Global Depth Offset Scale" start="224" end="255" type="float"/>
+  </struct>
+
   <instruction name="3DPRIMITIVE" bias="2" length="6">
-    <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="3"/>
@@ -841,98 +826,48 @@
       <value name="SEQUENTIAL" value="0"/>
       <value name="RANDOM" value="1"/>
     </field>
-    <field name="Primitive Topology Type" start="10" end="14" type="uint"/>
+    <field name="Primitive Topology Type" start="10" end="14" type="3D_Prim_Topo_Type"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
-    <!-- DWord 1 -->
-    <field name="Vertex Count" start="32" end="63" type="uint"/>
-    <!-- DWord 2 -->
+    <field name="Vertex Count Per Instance" start="32" end="63" type="uint"/>
     <field name="Start Vertex Location" start="64" end="95" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Reserved" start="96" end="127" type="mbo"/>
-    <!-- DWord 4: MBZ -->
-    <!-- DWord 5 -->
+    <field name="Instance Count" start="96" end="127" type="uint" default="1">
+      <value name="UNDEFINED" value="0"/>
+      <value name="'non-instanced' operation" value="1"/>
+    </field>
     <field name="Base Vertex Location" start="160" end="191" type="int"/>
   </instruction>
 
-  <instruction name="3DSTATE_PIPELINED_POINTERS" bias="2" length="7">
-    <!-- DWord 0 -->
-    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
-    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
-    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
-    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="5"/>
-    <!-- DWord 1 -->
-    <field name="Pointer to VS_STATE" start="37" end="63" type="address"/>
-    <!-- DWord 2 -->
-    <field name="Pointer to GS_STATE" start="69" end="95" type="address"/>
-    <field name="GS Enable" start="64" end="64" type="bool"/>
-    <!-- DWord 3 -->
-    <field name="Pointer to CLIP_STATE" start="101" end="127" type="address"/>
-    <field name="CLIP Enable" start="96" end="96" type="bool"/>
-    <!-- DWord 4 -->
-    <field name="Pointer to SF_STATE" start="133" end="159" type="address"/>
-    <!-- DWord 5 -->
-    <field name="Pointer to WM_STATE" start="165" end="191" type="address"/>
-    <!-- DWord 6 -->
-    <field name="Pointer to COLOR_CALC_STATE" start="198" end="223" type="address"/>
-  </instruction>
-
-  <instruction name="3DSTATE_AA_LINE_PARAMETERS" bias="2" length="3">
-    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
-    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
-    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
-    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="10"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
-    <field name="AA Coverage Bias" start="48" end="55" type="u0.8"/>
-    <field name="AA Coverage Slope" start="32" end="39" type="u0.8"/>
-    <field name="AA Coverage EndCap Bias" start="80" end="87" type="u0.8"/>
-    <field name="AA Coverage EndCap Slope" start="64" end="71" type="u0.8"/>
-  </instruction>
-
   <instruction name="3DSTATE_BINDING_TABLE_POINTERS" bias="2" length="6">
-    <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
-    <!-- DWord 1 -->
     <field name="Pointer to VS Binding Table" start="37" end="63" type="offset"/>
-    <!-- DWord 2 -->
     <field name="Pointer to GS Binding Table" start="69" end="95" type="offset"/>
-    <!-- DWord 3 -->
     <field name="Pointer to CLIP Binding Table" start="101" end="127" type="offset"/>
-    <!-- DWord 4 -->
     <field name="Pointer to SF Binding Table" start="133" end="159" type="offset"/>
-    <!-- DWord 5 -->
     <field name="Pointer to PS Binding Table" start="165" end="191" type="offset"/>
   </instruction>
 
   <instruction name="3DSTATE_CONSTANT_COLOR" bias="2" length="5">
-    <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
-    <!-- DWord 1 -->
     <field name="Blend Constant Color Red" start="32" end="63" type="float"/>
-    <!-- DWord 2 -->
     <field name="Blend Constant Color Green" start="64" end="95" type="float"/>
-    <!-- DWord 3 -->
     <field name="Blend Constant Color Blue" start="96" end="127" type="float"/>
-    <!-- DWord 4 -->
-    <field name="Blend Constant Color Alpha" start="128" end="160" type="float"/>
+    <field name="Blend Constant Color Alpha" start="128" end="159" type="float"/>
   </instruction>
 
   <instruction name="3DSTATE_DEPTH_BUFFER" bias="2" length="5">
-    <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="5"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="5"/>
-    <!-- DWord 1 -->
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
     <field name="Surface Type" start="61" end="63" type="uint">
       <value name="SURFTYPE_1D" value="0"/>
       <value name="SURFTYPE_2D" value="1"/>
@@ -958,9 +893,7 @@
       <value name="D16_UNORM" value="5"/>
     </field>
     <field name="Surface Pitch" start="32" end="48" type="uint"/>
-    <!-- DWord 2 -->
     <field name="Surface Base Address" start="64" end="95" type="address"/>
-    <!-- DWord 3 -->
     <field name="Height" start="115" end="127" type="uint">
       <value name="SURFTYPE_1D:  must be zero" value="0"/>
     </field>
@@ -970,7 +903,6 @@
       <value name="MIPLAYOUT_BELOW" value="0"/>
       <value name="MIPLAYOUT_RIGHT" value="1"/>
     </field>
-    <!-- DWord 4 -->
     <field name="Depth" start="149" end="159" type="uint">
       <value name="SURFTYPE_CUBE:  must be zero" value="0"/>
     </field>
@@ -992,8 +924,16 @@
     <field name="Drawing Rectangle Origin X" start="96" end="111" type="int"/>
   </instruction>
 
+  <instruction name="3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="9"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Global Depth Offset Clamp" start="32" end="63" type="float"/>
+  </instruction>
+
   <instruction name="3DSTATE_INDEX_BUFFER" bias="2" length="3">
-    <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
@@ -1005,9 +945,7 @@
       <value name="DWORD" value="2"/>
     </field>
     <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
-    <!-- DWord 1 -->
     <field name="Buffer Starting Address" start="32" end="63" type="address"/>
-    <!-- DWord 2 -->
     <field name="Buffer Ending Address" start="64" end="95" type="address"/>
   </instruction>
 
@@ -1017,14 +955,30 @@
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="8"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
-    <field name="Modify Enable (Current Repeat Counter, Current Stipple Index)" start="63" end="63" type="bool"/>
+    <field name="Modify Enable" start="63" end="63" type="bool"/>
     <field name="Current Repeat Counter" start="53" end="61" type="uint"/>
     <field name="Current Stipple Index" start="48" end="51" type="uint"/>
     <field name="Line Stipple Pattern" start="32" end="47" type="uint"/>
-    <field name="Line Stipple Inverse Repeat Count" start="79" end="95" type="u1.16"/>
+    <field name="Line Stipple Inverse Repeat Count" start="80" end="95" type="u1.13"/>
     <field name="Line Stipple Repeat Count" start="64" end="72" type="uint"/>
   </instruction>
 
+  <instruction name="3DSTATE_PIPELINED_POINTERS" bias="2" length="7">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="5"/>
+    <field name="Pointer to VS State" start="37" end="63" type="address"/>
+    <field name="Pointer to GS State" start="69" end="95" type="address"/>
+    <field name="GS Enable" start="64" end="64" type="bool"/>
+    <field name="Pointer to CLIP State" start="101" end="127" type="address"/>
+    <field name="Clip Enable" start="96" end="96" type="bool"/>
+    <field name="Pointer to SF State" start="133" end="159" type="address"/>
+    <field name="Pointer to WM State" start="165" end="191" type="address"/>
+    <field name="Pointer to Color Calc State" start="197" end="223" type="address"/>
+  </instruction>
+
   <instruction name="3DSTATE_POLY_STIPPLE_OFFSET" bias="2" length="2">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
@@ -1046,17 +1000,6 @@
     </group>
   </instruction>
 
-  <instruction name="3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" bias="2" length="2">
-    <!-- DWord 0 -->
-    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
-    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
-    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
-    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="9"/>
-    <field name="DWord Length" start="0" end="7" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Global Depth Offset Clamp" start="32" end="63" type="float"/>
-  </instruction>
-
   <instruction name="3DSTATE_VERTEX_BUFFERS" bias="2">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
@@ -1087,23 +1030,81 @@
     <field name="Statistics Enable" start="0" end="0" type="bool"/>
   </instruction>
 
-  <instruction name="PIPELINE_SELECT" bias="1" length="1">
+  <instruction name="CS_URB_STATE" bias="2" length="2">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
-    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
-    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="4"/>
-    <field name="Pipeline Selection" start="0" end="1" type="uint">
-      <value name="3D" value="0"/>
-      <value name="Media" value="1"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="URB Entry Allocation Size" start="36" end="40" type="uint"/>
+    <field name="Number of URB Entries" start="32" end="34" type="uint"/>
+  </instruction>
+
+  <instruction name="CONSTANT_BUFFER" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="2"/>
+    <field name="Valid" start="8" end="8" type="bool" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Buffer Starting Address" start="38" end="63" type="address"/>
+    <field name="Buffer Length" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_FLUSH" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="4"/>
+    <field name="Global Snapshot Count Reset" start="3" end="3" type="uint">
+      <value name="Don't Reset" value="0"/>
+      <value name="Reset" value="1"/>
+    </field>
+    <field name="Render Cache Flush Inhibit" start="2" end="2" type="uint">
+      <value name="Flush" value="0"/>
+      <value name="Don't Flush" value="1"/>
+    </field>
+    <field name="State/Instruction Cache Invalidate" start="1" end="1" type="uint">
+      <value name="Don't Invalidate" value="0"/>
+      <value name="Invalidate" value="1"/>
     </field>
   </instruction>
 
-  <instruction name="PIPE_CONTROL" bias="2" length="5">
-    <!-- DWord 0 -->
+  <instruction name="MI_LOAD_REGISTER_IMM" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="34"/>
+    <field name="Byte Write Disables" start="8" end="11" type="uint"/>
+    <field name="DWord Length" start="0" end="5" type="uint" default="1"/>
+    <field name="Register Offset" start="34" end="63" type="offset"/>
+    <field name="Data DWord" start="64" end="95" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_STORE_DATA_IMM" bias="2" length="5">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="32"/>
+    <field name="Memory Address Type" start="22" end="22" type="bool"/>
+    <field name="BitFieldName" start="21" end="21" type="uint"/>
+    <field name="DWord Length" start="0" end="5" type="uint" default="2"/>
+    <field name="Physical Start Address Extension" start="32" end="35" type="address"/>
+    <field name="Address" start="66" end="95" type="address"/>
+    <field name="Data DWord 0" start="96" end="127" type="uint"/>
+    <field name="Data DWord 1" start="128" end="159" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_STORE_REGISTER_MEM" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="36"/>
+    <field name="Use Global GTT" start="22" end="22" type="bool"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Physical Start Address Extension" start="60" end="63" type="address"/>
+    <field name="Register Address" start="34" end="54" type="offset"/>
+    <field name="Memory Address" start="66" end="95" type="address"/>
+  </instruction>
+
+  <instruction name="PIPE_CONTROL" bias="2" length="4">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="2"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
     <field name="Post Sync Operation" start="14" end="15" type="uint">
       <value name="No Write" value="0"/>
       <value name="Write Immediate Data" value="1"/>
@@ -1111,20 +1112,28 @@
       <value name="Write Timestamp" value="3"/>
     </field>
     <field name="Depth Stall Enable" start="13" end="13" type="bool"/>
-    <field name="Write Cache Flush Enable" start="12" end="12" type="bool"/>
-    <field name="Instruction/State Cache Flush Enable" start="11" end="11" type="bool"/>
+    <field name="Write Cache Flush" start="12" end="12" type="bool"/>
+    <field name="Instruction Cache Invalidate Enable" start="11" end="11" type="bool"/>
     <field name="Notify Enable" start="8" end="8" type="bool"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
-    <!-- DWord 1 -->
-    <field name="Destination Address" start="63" end="35" type="address"/>
+    <field name="Address" start="35" end="63" type="address"/>
     <field name="Destination Address Type" start="34" end="34" type="uint" prefix="DAT">
-      <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
     <field name="Immediate Data" start="64" end="127" type="uint"/>
   </instruction>
 
-  <instruction name="STATE_BASE_ADDRESS" bias="2" length="10">
+  <instruction name="PIPELINE_SELECT" bias="1" length="1"> <!-- single-dword packet: opcode fields plus a 1-bit pipeline selector, no DWord Length field -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="4"/>
+    <field name="Pipeline Selection" start="0" end="0" type="uint"> <!-- bit 0; named values listed below -->
+      <value name="3D" value="0"/>
+      <value name="Media" value="1"/>
+    </field>
+  </instruction>
+
+  <instruction name="STATE_BASE_ADDRESS" bias="2" length="6">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
@@ -1138,18 +1147,8 @@
     <field name="Indirect Object Base Address Modify Enable" start="96" end="96" type="bool"/>
     <field name="General State Access Upper Bound" start="140" end="159" type="address"/>
     <field name="General State Access Upper Bound Modify Enable" start="128" end="128" type="bool"/>
-    <field name="Indirect Object Access Upper Bound" start="172" end="191" type="address"/>
-    <field name="Indirect Object Access Upper Bound Modify Enable" start="160" end="160" type="bool"/>
-  </instruction>
-
-  <instruction name="STATE_PREFETCH" bias="2" length="2">
-    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
-    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
-    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
-    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="3"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
-    <field name="Prefetch Pointer" start="38" end="63" type="address"/>
-    <field name="Prefetch Count" start="32" end="34" type="uint"/>
+    <field name="Instruction Access Upper Bound" start="172" end="191" type="address"/>
+    <field name="Instruction Access Upper Bound Modify Enable" start="160" end="160" type="bool"/>
   </instruction>
 
   <instruction name="STATE_SIP" bias="2" length="2">
@@ -1160,4 +1159,117 @@
     <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
     <field name="System Instruction Pointer" start="36" end="63" type="offset"/>
   </instruction>
+
+  <instruction name="URB_FENCE" bias="2" length="3"> <!-- 3-dword packet layout: header with per-unit reallocation request bits, then two dwords of fence values -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="CS Unit URB Reallocation Request" start="13" end="13" type="bool"/>
+    <field name="VFE Unit URB Reallocation Request" start="12" end="12" type="bool"/>
+    <field name="SF Unit URB Reallocation Request" start="11" end="11" type="bool"/>
+    <field name="CLIP Unit URB Reallocation Request" start="10" end="10" type="bool"/>
+    <field name="GS Unit URB Reallocation Request" start="9" end="9" type="bool"/>
+    <field name="VS Unit URB Reallocation Request" start="8" end="8" type="bool"/>
+    <field name="CLIP Fence" start="52" end="61" type="uint"/> <!-- dword 1, 10 bits wide: a multi-bit value, so uint rather than bool -->
+    <field name="GS Fence" start="42" end="51" type="uint"/> <!-- dword 1, 10 bits wide -->
+    <field name="VS Fence" start="32" end="41" type="uint"/> <!-- dword 1, 10 bits wide -->
+    <field name="CS Fence" start="84" end="94" type="uint"/> <!-- dword 2, 11 bits wide -->
+    <field name="VFE Fence" start="74" end="83" type="uint"/> <!-- dword 2, 10 bits wide -->
+    <field name="SF Fence" start="64" end="73" type="uint"/> <!-- dword 2, 10 bits wide -->
+  </instruction>
+
+  <instruction name="XY_COLOR_BLT" bias="2" length="6"> <!-- 6-dword packet layout: header, control/pitch, X1/Y1, X2/Y2, destination base, solid color -->
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="80"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Clipping Enabled" start="62" end="62" type="bool"/> <!-- dword 1, bit 30 -->
+    <field name="Color Depth" start="56" end="57" type="uint" prefix="COLOR_DEPTH"> <!-- dword 1, bits 24-25 -->
+      <value name="8 bit" value="0"/>
+      <value name="565" value="1"/>
+      <value name="1555" value="2"/>
+      <value name="32 bit" value="3"/>
+    </field>
+    <field name="Raster Operation" start="48" end="55" type="int"/> <!-- dword 1, bits 16-23. NOTE(review): 8-bit field typed as signed int; confirm uint was not intended -->
+    <field name="Destination Pitch" start="32" end="47" type="int"/> <!-- dword 1, bits 0-15 -->
+    <field name="Destination Y1 Coordinate" start="80" end="95" type="int"/> <!-- dword 2, upper half -->
+    <field name="Destination X1 Coordinate" start="64" end="79" type="int"/> <!-- dword 2, lower half -->
+    <field name="Destination Y2 Coordinate" start="112" end="127" type="int"/> <!-- dword 3, upper half -->
+    <field name="Destination X2 Coordinate" start="96" end="111" type="int"/> <!-- dword 3, lower half -->
+    <field name="Destination Base Address" start="128" end="159" type="address"/> <!-- dword 4 -->
+    <field name="Solid Pattern Color" start="160" end="191" type="int"/> <!-- dword 5 -->
+  </instruction>
+
+  <instruction name="XY_SETUP_BLT" bias="2" length="8"> <!-- 8-dword packet layout: header, control/pitch, clip rectangle, destination base, background, foreground, pattern base -->
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="6"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Clipping Enabled" start="62" end="62" type="bool"/> <!-- dword 1, bit 30 -->
+    <field name="Mono Source Transparency Mode" start="61" end="61" type="bool"/> <!-- dword 1, bit 29 -->
+    <field name="Color Depth" start="56" end="57" type="uint" prefix="COLOR_DEPTH"> <!-- dword 1, bits 24-25 -->
+      <value name="8 bit" value="0"/>
+      <value name="565" value="1"/>
+      <value name="1555" value="2"/>
+      <value name="32 bit" value="3"/>
+    </field>
+    <field name="Raster Operation" start="48" end="55" type="int"/> <!-- dword 1, bits 16-23 -->
+    <field name="Destination Pitch" start="32" end="47" type="int"/> <!-- dword 1, bits 0-15 -->
+    <field name="ClipRect Y1 Coordinate" start="80" end="95" type="int"/> <!-- dword 2, upper half -->
+    <field name="ClipRect X1 Coordinate" start="64" end="79" type="int"/> <!-- dword 2, lower half -->
+    <field name="ClipRect Y2 Coordinate" start="112" end="127" type="int"/> <!-- dword 3, upper half -->
+    <field name="ClipRect X2 Coordinate" start="96" end="111" type="int"/> <!-- dword 3, lower half -->
+    <field name="Destination Base Address" start="128" end="159" type="address"/> <!-- dword 4 -->
+    <field name="Background Color" start="160" end="191" type="uint"/> <!-- dword 5 -->
+    <field name="Foreground Color" start="192" end="223" type="uint"/> <!-- dword 6 -->
+    <field name="Pattern Base Address" start="224" end="255" type="uint"/> <!-- dword 7. NOTE(review): typed uint while Destination Base Address uses type address; confirm which is intended -->
+  </instruction>
+
+  <instruction name="XY_SRC_COPY_BLT" bias="2" length="8"> <!-- 8-dword packet layout: header, control/pitch, destination rectangle, destination base, source X1/Y1, source pitch, source base -->
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="83"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="6"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Source Tiling Enable" start="15" end="15" type="bool"/>
+    <field name="Destination Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Clipping Enabled" start="62" end="62" type="bool"/> <!-- dword 1, bit 30 -->
+    <field name="Color Depth" start="56" end="57" type="uint" prefix="COLOR_DEPTH"> <!-- dword 1, bits 24-25 -->
+      <value name="8 bit" value="0"/>
+      <value name="565" value="1"/>
+      <value name="1555" value="2"/>
+      <value name="32 bit" value="3"/>
+    </field>
+    <field name="Raster Operation" start="48" end="55" type="int"/> <!-- dword 1, bits 16-23 -->
+    <field name="Destination Pitch" start="32" end="47" type="int"/> <!-- dword 1, bits 0-15 -->
+    <field name="Destination Y1 Coordinate" start="80" end="95" type="int"/> <!-- dword 2, upper half -->
+    <field name="Destination X1 Coordinate" start="64" end="79" type="int"/> <!-- dword 2, lower half -->
+    <field name="Destination Y2 Coordinate" start="112" end="127" type="int"/> <!-- dword 3, upper half -->
+    <field name="Destination X2 Coordinate" start="96" end="111" type="int"/> <!-- dword 3, lower half -->
+    <field name="Destination Base Address" start="128" end="159" type="address"/> <!-- dword 4 -->
+    <field name="Source Y1 Coordinate" start="176" end="191" type="int"/> <!-- dword 5, upper half -->
+    <field name="Source X1 Coordinate" start="160" end="175" type="int"/> <!-- dword 5, lower half -->
+    <field name="Source Pitch" start="192" end="207" type="int"/> <!-- dword 6, bits 0-15; bits 16-31 of this dword are not described here -->
+    <field name="Source Base Address" start="224" end="255" type="address"/> <!-- dword 7 -->
+  </instruction>
+
+  <instruction name="XY_TEXT_IMMEDIATE_BLT" bias="2" length="3"> <!-- 3-dword packet layout: header, X1/Y1, X2/Y2 -->
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="49"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Packing" start="16" end="16" type="uint"> <!-- bit 16; named values listed below -->
+      <value name="Bit Packed" value="0"/>
+      <value name="Byte Packed" value="1"/>
+    </field>
+    <field name="Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Destination Pitch" start="32" end="47" type="int"/> <!-- NOTE(review): occupies the same bits 32-47 as Destination X1 Coordinate below; one of the two looks wrong, confirm against the hardware docs -->
+    <field name="Destination Y1 Coordinate" start="48" end="63" type="int"/> <!-- dword 1, upper half -->
+    <field name="Destination X1 Coordinate" start="32" end="47" type="int"/> <!-- dword 1, lower half -->
+    <field name="Destination Y2 Coordinate" start="80" end="95" type="int"/> <!-- dword 2, upper half -->
+    <field name="Destination X2 Coordinate" start="64" end="79" type="int"/> <!-- dword 2, lower half -->
+  </instruction>
 </genxml>
diff --git a/src/intel/genxml/gen45.xml b/src/intel/genxml/gen45.xml
index 4582beb..7b2f769 100644
--- a/src/intel/genxml/gen45.xml
+++ b/src/intel/genxml/gen45.xml
@@ -1,5 +1,29 @@
 <?xml version="1.0" ?>
 <genxml name="CTG" gen="4.5">
+  <enum name="3D_Prim_Topo_Type" prefix="3DPRIM"> <!-- primitive topology codes; values run 1-20 plus 22 (0 and 21 are not assigned here) -->
+    <value name="POINTLIST" value="1"/>
+    <value name="LINELIST" value="2"/>
+    <value name="LINESTRIP" value="3"/>
+    <value name="TRILIST" value="4"/>
+    <value name="TRISTRIP" value="5"/>
+    <value name="TRIFAN" value="6"/>
+    <value name="QUADLIST" value="7"/>
+    <value name="QUADSTRIP" value="8"/>
+    <value name="LINELIST_ADJ" value="9"/>
+    <value name="LINESTRIP_ADJ" value="10"/>
+    <value name="TRILIST_ADJ" value="11"/>
+    <value name="TRISTRIP_ADJ" value="12"/>
+    <value name="TRISTRIP_REVERSE" value="13"/>
+    <value name="POLYGON" value="14"/>
+    <value name="RECTLIST" value="15"/>
+    <value name="LINELOOP" value="16"/>
+    <value name="POINTLIST_BF" value="17"/> <!-- stray space removed from the name so it matches the LINESTRIP_BF spelling -->
+    <value name="LINESTRIP_CONT" value="18"/>
+    <value name="LINESTRIP_BF" value="19"/>
+    <value name="LINESTRIP_CONT_BF" value="20"/>
+    <value name="TRIFAN_NOSTIPPLE" value="22"/>
+  </enum>
+
   <enum name="3D_Vertex_Component_Control" prefix="VFCOMP">
     <value name="NOSTORE" value="0"/>
     <value name="STORE_SRC" value="1"/>
@@ -11,6 +35,36 @@
     <value name="STORE_PID" value="7"/>
   </enum>
 
+  <enum name="3D_Color_Buffer_Blend_Function" prefix="BLENDFUNCTION"> <!-- five blend function codes, numbered 0-4 -->
+    <value name="ADD" value="0"/>
+    <value name="SUBTRACT" value="1"/>
+    <value name="REVERSE_SUBTRACT" value="2"/>
+    <value name="MIN" value="3"/>
+    <value name="MAX" value="4"/>
+  </enum>
+
+  <enum name="3D_Color_Buffer_Blend_Factor" prefix="BLENDFACTOR"> <!-- blend factor codes; each INV_ entry is its base entry plus 16, and ZERO (17) is ONE (1) plus 16 -->
+    <value name="ONE" value="1"/>
+    <value name="SRC_COLOR" value="2"/>
+    <value name="SRC_ALPHA" value="3"/>
+    <value name="DST_ALPHA" value="4"/>
+    <value name="DST_COLOR" value="5"/>
+    <value name="SRC_ALPHA_SATURATE" value="6"/> <!-- no inverse counterpart is listed: value 22 is unassigned -->
+    <value name="CONST_COLOR" value="7"/>
+    <value name="CONST_ALPHA" value="8"/>
+    <value name="SRC1_COLOR" value="9"/>
+    <value name="SRC1_ALPHA" value="10"/>
+    <value name="ZERO" value="17"/>
+    <value name="INV_SRC_COLOR" value="18"/>
+    <value name="INV_SRC_ALPHA" value="19"/>
+    <value name="INV_DST_ALPHA" value="20"/>
+    <value name="INV_DST_COLOR" value="21"/>
+    <value name="INV_CONST_COLOR" value="23"/>
+    <value name="INV_CONST_ALPHA" value="24"/>
+    <value name="INV_SRC1_COLOR" value="25"/>
+    <value name="INV_SRC1_ALPHA" value="26"/>
+  </enum>
+
   <enum name="3D_Compare_Function" prefix="COMPAREFUNCTION">
     <value name="ALWAYS" value="0"/>
     <value name="NEVER" value="1"/>
@@ -22,6 +76,36 @@
     <value name="GEQUAL" value="7"/>
   </enum>
 
+  <enum name="3D_Stencil_Operation" prefix="STENCILOP"> <!-- eight stencil operation codes, numbered 0-7 -->
+    <value name="KEEP" value="0"/>
+    <value name="ZERO" value="1"/>
+    <value name="REPLACE" value="2"/>
+    <value name="INCRSAT" value="3"/>
+    <value name="DECRSAT" value="4"/>
+    <value name="INCR" value="5"/>
+    <value name="DECR" value="6"/>
+    <value name="INVERT" value="7"/>
+  </enum>
+
+  <enum name="3D_Logic_Op_Function" prefix="LOGICOP"> <!-- sixteen logic op codes, numbered 0-15 with no gaps -->
+    <value name="CLEAR" value="0"/>
+    <value name="NOR" value="1"/>
+    <value name="AND_INVERTED" value="2"/>
+    <value name="COPY_INVERTED" value="3"/>
+    <value name="AND_REVERSE" value="4"/>
+    <value name="INVERT" value="5"/>
+    <value name="XOR" value="6"/>
+    <value name="NAND" value="7"/>
+    <value name="AND" value="8"/>
+    <value name="EQUIV" value="9"/>
+    <value name="NOOP" value="10"/>
+    <value name="OR_INVERTED" value="11"/>
+    <value name="COPY" value="12"/>
+    <value name="OR_REVERSE" value="13"/>
+    <value name="OR" value="14"/>
+    <value name="SET" value="15"/>
+  </enum>
+
   <enum name="SURFACE_FORMAT" prefix="SF">
     <value name="R32G32B32A32_FLOAT" value="0"/>
     <value name="R32G32B32A32_SINT" value="1"/>
@@ -32,8 +116,6 @@
     <value name="R32G32B32X32_FLOAT" value="6"/>
     <value name="R32G32B32A32_SSCALED" value="7"/>
     <value name="R32G32B32A32_USCALED" value="8"/>
-    <value name="R32G32B32A32_SFIXED" value="32"/>
-    <value name="R64G64_PASSTHRU" value="33"/>
     <value name="R32G32B32_FLOAT" value="64"/>
     <value name="R32G32B32_SINT" value="65"/>
     <value name="R32G32B32_UINT" value="66"/>
@@ -65,8 +147,6 @@
     <value name="R16G16B16A16_USCALED" value="148"/>
     <value name="R32G32_SSCALED" value="149"/>
     <value name="R32G32_USCALED" value="150"/>
-    <value name="R32G32_SFIXED" value="160"/>
-    <value name="R64_PASSTHRU" value="161"/>
     <value name="B8G8R8A8_UNORM" value="192"/>
     <value name="B8G8R8A8_UNORM_SRGB" value="193"/>
     <value name="R10G10B10A2_UNORM" value="194"/>
@@ -246,7 +326,7 @@
     <value name="RAW" value="511"/>
   </enum>
 
-  <enum name="Texture Coordinate Mode" prefix="TCM">
+  <enum name="Texture_Coordinate_Mode" prefix="TCM">
     <value name="WRAP" value="0"/>
     <value name="MIRROR" value="1"/>
     <value name="CLAMP" value="2"/>
@@ -255,378 +335,103 @@
     <value name="MIRROR_ONCE" value="5"/>
   </enum>
 
-  <struct name="VS_STATE" length="7">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Thread Priority" start="49" end="49" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
-    <field name="MaskStack Exception Enable" start="43" end="43" type="bool"/>
-    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
-    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
-    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
-    <field name="Number of URB Entries" start="139" end="145" type="uint"/>
-    <field name="Statistics Enable" start="138" end="138" type="bool"/>
-    <!-- DWord 5 -->
-    <field name="Sampler State Pointer" start="165" end="191" type="address"/>
-    <field name="Sampler Count" start="160" end="162" type="uint"/>
-    <!-- DWord 6 -->
-    <field name="Vertex Cache Disable" start="193" end="193" type="bool"/>
-    <field name="VS Function Enable" start="192" end="192" type="bool"/>
-  </struct>
-
-  <struct name="GS_STATE" length="7">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
-    <field name="MaskStack Exception Enable" start="43" end="43" type="bool"/>
-    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
-    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Maximum Number of Threads" start="153" end="157" type="uint"/>
-    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
-    <field name="Number of URB Entries" start="139" end="145" type="uint"/>
-    <field name="Statistics Enable" start="138" end="138" type="bool"/>
-    <!-- DWord 5 -->
-    <field name="Sampler State Pointer" start="165" end="191" type="address"/>
-    <field name="Sampler Count" start="160" end="162" type="uint"/>
-    <!-- DWord 6 -->
-    <field name="Reorder Enable" start="222" end="222" type="bool"/>
-    <field name="Discard Adjacency" start="221" end="221" type="bool"/>
-    <field name="Maximum VPIndex" start="192" end="195" type="uint"/>
-  </struct>
-
-  <struct name="CLIP_STATE" length="11">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Thread Priority" start="49" end="49" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
-    <field name="MaskStack Exception Enable" start="43" end="43" type="bool"/>
-    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
-    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Maximum Number of Threads" start="153" end="157" type="uint"/>
-    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
-    <field name="Number of URB Entries" start="139" end="145" type="uint"/>
-    <field name="Clipper Statistics Enable" start="138" end="138" type="bool"/>
-    <field name="GS Output Object Statistics Enable" start="137" end="137" type="bool"/>
-    <!-- DWord 5 -->
-    <field name="API Mode" start="190" end="190" type="uint">
-      <value name="APIMODE_OGL" value="0"/>
-      <value name="APIMODE_D3D" value="1"/>
-    </field>
-    <field name="Vertex Position Space" start="189" end="189" type="uint"/>
-    <field name="Viewport XY ClipTest Enable" start="188" end="188" type="bool"/>
-    <field name="Viewport Z ClipTest Enable" start="187" end="187" type="bool"/>
-    <field name="Guardband ClipTest Enable" start="186" end="186" type="bool"/>
-    <field name="Negative W ClipTest Enable" start="185" end="185" type="bool"/>
-    <field name="UserClipFlags MustClip Enable" start="184" end="184" type="bool"/>
-    <field name="UserClipFlags ClipTest Enable Bitmask" start="176" end="183" type="uint"/>
-    <field name="Clip Mode" start="173" end="175" type="uint">
-      <value name="CLIPMODE_NORMAL" value="0"/>
-      <value name="CLIPMODE_ALL" value="1"/>
-      <value name="CLIPMODE_CLIP_NON_REJECTED" value="2"/>
-      <value name="CLIPMODE_REJECT_ALL" value="3"/>
-      <value name="CLIPMODE_ACCEPT_ALL" value="4"/>
-      <value name="CLIPMODE_NORMAL_FFCLIP" value="5"/>
-    </field>
-
-    <!-- DWord 6 -->
-    <field name="Clipper Viewport State Pointer" start="197" end="223" type="offset"/>
-    <!-- DWord 7 -->
-    <field name="Screen Space Viewport X Min" start="224" end="255" type="uint"/>
-    <!-- DWord 8 -->
-    <field name="Screen Space Viewport X Max" start="256" end="287" type="uint"/>
-    <!-- DWord 9 -->
-    <field name="Screen Space Viewport Y Min" start="288" end="319" type="uint"/>
-    <!-- DWord 10 -->
-    <field name="Screen Space Viewport Y Max" start="320" end="351" type="uint"/>
-  </struct>
-
-  <struct name="SF_STATE" length="8">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Thread Priority" start="49" end="49" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
-    <field name="MaskStack Exception Enable" start="43" end="43" type="bool"/>
-    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
-    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
-    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
-    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
-    <field name="Statistics Enable" start="138" end="138" type="bool"/>
-    <!-- DWord 5 -->
-    <field name="Setup Viewport State Pointer" start="165" end="195" type="offset"/>
-    <field name="Viewport Transform Enable" start="161" end="161" type="uint"/>
-    <field name="Front Winding" start="160" end="160" type="uint">
-      <value name="FRONTWINDING_CW" value="0"/>
-      <value name="FRONTWINDING_CCW" value="1"/>
-    </field>
-    <!-- DWord 6 -->
-    <field name="Anti-aliasing Enable" start="223" end="223" type="bool"/>
-    <field name="Cull Mode" start="221" end="222" type="uint" prefix="CULLMODE">
-      <value name="BOTH" value="0"/>
-      <value name="NONE" value="1"/>
-      <value name="FRONT" value="2"/>
-      <value name="BACK" value="3"/>
-    </field>
-    <field name="Fast Scissor Clip Disable" start="220" end="220" type="bool"/>
-    <field name="Line Width" start="216" end="219" type="uint"/>
-    <field name="Line End Cap Antialiasing Region Width" start="214" end="215" type="uint"/>
-    <field name="Point Rasterization Rule" start="212" end="213" type="uint">
-      <value name="RASTRULE_UPPER_LEFT" value="0"/>
-      <value name="RASTRULE_UPPER_RIGHT" value="1"/>
-    </field>
-    <field name="Zero Pixel Triangle Filter Disable" start="211" end="211" type="bool"/>
-    <field name="2x2 Pixel Triangle Filter Disable" start="210" end="210" type="bool"/>
-    <field name="Scissor Rectangle Enable" start="209" end="209" type="bool"/>
-    <field name="Destination Origin Horizontal Bias" start="205" end="208" type="uint"/>
-    <field name="Destination Origin Vertical Bias" start="201" end="204" type="uint"/>
-
-    <!-- DWord 7 -->
-    <field name="Last Pixel Enable" start="255" end="255" type="bool"/>
-    <field name="Triangle Strip/List Provoking Vertex Select" start="253" end="254" type="uint"/>
-    <field name="Line Strip/List Provoking Vertex Select" start="251" end="252" type="uint"/>
-    <field name="Triangle Fan Provoking Vertex Select" start="249" end="250" type="uint"/>
-    <field name="AA Line Distance Mode" start="238" end="238" type="uint" prefix="AALINEDISTANCE">
-      <value name="MANHATTAN" value="0"/>
-      <value name="TRUE" value="1"/>
-    </field>
-    <field name="Sprite Point Enable" start="237" end="237" type="bool"/>
-    <field name="Vertex Sub Pixel Precision Select" start="236" end="236" type="uint"/>
-    <field name="Use Point Width State" start="235" end="235" type="uint"/>
-    <field name="Point Width" start="224" end="234" type="uint"/>
-  </struct>
-
-  <struct name="WM_STATE" length="7">
-    <!-- DWord 0 -->
-    <field name="Kernel Start Pointer[0]" start="6" end="31" type="address"/>
-    <field name="GRF Register Count" start="1" end="3" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Single Program Flow" start="63" end="63" type="bool"/>
-    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
-    <field name="Thread Priority" start="49" end="49" type="uint"/>
-    <field name="Floating Point Mode" start="48" end="48" type="uint">
-      <value name="IEEE-754" value="0"/>
-      <value name="Alternate" value="1"/>
-    </field>
-    <field name="Depth Coefficient URB Read Offset" start="40" end="45" type="uint"/>
-    <field name="Illegal Opcode Exception Enable" start="36" end="36" type="bool"/>
-    <field name="MaskStack Exception Enable" start="34" end="34" type="bool"/>
-    <field name="Software Exception Enable" start="33" end="33" type="bool"/>
-    <!-- DWord 2 -->
-    <field name="Scratch Space Base Pointer" start="74" end="95" type="offset"/>
-    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
-    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
-    <field name="Setup URB Entry Read Length" start="107" end="113" type="uint"/>
-    <field name="Setup URB Entry Read Offset" start="100" end="105" type="uint"/>
-    <field name="Dispatch GRF Start Register for URB Data" start="96" end="99" type="uint"/>
-    <!-- DWord 4 -->
-    <field name="Sampler State Pointer" start="133" end="159" type="address"/>
-    <field name="Sampler Count" start="130" end="132" type="uint"/>
-    <field name="Statistics Enable" start="128" end="128" type="bool"/>
-    <!-- DWord 5 -->
-    <field name="Maximum Number of Threads" start="185" end="191" type="uint"/>
-    <field name="Transposed URB Read Enable" start="184" end="184" type="bool"/>
-    <field name="Legacy Diamond Line Rasterization" start="183" end="183" type="bool"/>
-    <field name="Pixel Shader Kill Pixel" start="182" end="182" type="bool"/>
-    <field name="Pixel Shader Computed Depth" start="181" end="181" type="bool"/>
-    <field name="Pixel Shader Uses Source Depth" start="180" end="180" type="bool"/>
-    <field name="Thread Dispatch Enable" start="179" end="179" type="bool"/>
-    <field name="Early Depth Test Enable" start="178" end="178" type="bool"/>
-    <field name="Line End Cap Antialiasing Region Width" start="176" end="177" type="uint"/>
-    <field name="Line Antialiasing Region Width" start="174" end="175" type="uint"/>
-    <field name="Polygon Stipple Enable" start="173" end="173" type="bool"/>
-    <field name="Global Depth Offset Enable" start="172" end="172" type="bool"/>
-    <field name="Line Stipple Enable" start="171" end="171" type="bool"/>
-    <field name="Legacy Global Depth Bias Enable" start="170" end="170" type="bool"/>
-    <field name="Contiguous 64-Pixel Dispatch Enable" start="164" end="164" type="bool"/>
-    <field name="Contiguous 32-Pixel Dispatch Enable" start="163" end="163" type="bool"/>
-    <field name="32-Pixel Dispatch Enable" start="162" end="162" type="bool"/>
-    <field name="16-Pixel Dispatch Enable" start="161" end="161" type="bool"/>
-    <field name="8-Pixel Dispatch Enable" start="160" end="160" type="bool"/>
-    <!-- DWord 6 -->
-    <field name="Global Depth Offset Constant" start="192" end="223" type="float"/>
-    <!-- DWord 7 -->
-    <field name="Global Depth Offset Scale" start="224" end="255" type="float"/>
-  </struct>
-
-  <struct name="VERTEX_BUFFER_STATE" length="4">
-    <!-- DWord 0 -->
-    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/>
-    <field name="Buffer Access Type" start="26" end="26" type="uint">
-      <value name="VERTEXDATA" value="0"/>
-      <value name="INSTANCEDATA" value="1"/>
-    </field>
-    <field name="Vertex Fetch Invalidate" start="12" end="12" type="uint" default="0"/>
-    <field name="Buffer Pitch" start="0" end="11" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Buffer Starting Address" start="32" end="63" type="address"/>
-    <!-- DWord 2 -->
-    <field name="Max Index" start="64" end="95" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Instance Data Step Rate" start="96" end="127" type="uint"/>
-  </struct>
-
-  <struct name="VERTEX_ELEMENT_STATE" length="2">
-    <!-- DWord 0 -->
-    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/>
-    <field name="Valid" start="26" end="26" type="uint"/>
-    <field name="Source Element Format" start="16" end="24" type="uint"/>
-    <field name="Source Element Offset" start="0" end="10" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Component 0 Control" start="60" end="62" type="uint"/>
-    <field name="Component 1 Control" start="56" end="58" type="uint"/>
-    <field name="Component 2 Control" start="52" end="54" type="uint"/>
-    <field name="Component 3 Control" start="48" end="50" type="uint"/>
-    <field name="Destination Element Offset" start="32" end="39" type="uint"/>
-  </struct>
-
-  <struct name="CLIP_VIEWPORT" length="4">
-    <field name="XMin Clip Guardband" start="0" end="31" type="uint"/>
-    <field name="XMax Clip Guardband" start="32" end="63" type="uint"/>
-    <field name="YMin Clip Guardband" start="64" end="95" type="uint"/>
-    <field name="YMax Clip Guardband" start="96" end="127" type="uint"/>
-  </struct>
-
-  <struct name="SF_VIEWPORT" length="8">
-    <!-- DWord 0 -->
-    <field name="Viewport Matrix Element m00" start="0" end="31" type="float"/>
-    <!-- DWord 1 -->
-    <field name="Viewport Matrix Element m11" start="32" end="63" type="float"/>
-    <!-- DWord 2 -->
-    <field name="Viewport Matrix Element m22" start="64" end="95" type="float"/>
-    <!-- DWord 3 -->
-    <field name="Viewport Matrix Element m30" start="96" end="127" type="float"/>
-    <!-- DWord 4 -->
-    <field name="Viewport Matrix Element m31" start="128" end="159" type="float"/>
-    <!-- DWord 5 -->
-    <field name="Viewport Matrix Element m32" start="160" end="191" type="float"/>
-    <!-- DWord 6 -->
-    <field name="Scissor Rectangle Y Min" start="208" end="223" type="uint"/>
-    <field name="Scissor Rectangle X Min" start="192" end="207" type="uint"/>
-    <!-- DWord 7 -->
-    <field name="Scissor Rectangle Y Max" start="224" end="239" type="uint"/>
-    <field name="Scissor Rectangle X Max" start="240" end="255" type="uint"/>
-  </struct>
-
   <struct name="CC_VIEWPORT" length="2">
     <field name="Minimum Depth" start="0" end="31" type="float"/>
     <field name="Maximum Depth" start="32" end="63" type="float"/>
   </struct>
 
-  <struct name="COLOR_CALC_STATE" length="6">
-    <!-- DWord 0 -->
+  <struct name="CLIP_STATE" length="11">
+    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="Clipper Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="GS Output Object Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="API Mode" start="190" end="190" type="uint" prefix="APIMODE">
+      <value name="OGL" value="0"/>
+      <value name="D3D" value="1"/>
+    </field>
+    <field name="Vertex Position Space" start="189" end="189" type="uint" prefix="VPOS">
+      <value name="NDCSPACE" value="0"/>
+      <value name="SCREENSPACE" value="1"/>
+    </field>
+    <field name="Viewport XY ClipTest Enable" start="188" end="188" type="bool"/>
+    <field name="Viewport Z ClipTest Enable" start="187" end="187" type="bool"/>
+    <field name="Guardband ClipTest Enable" start="186" end="186" type="bool"/>
+    <field name="Negative W ClipTest Enable" start="185" end="185" type="bool"/>
+    <field name="UserClipFlags MustClip Enable" start="184" end="184" type="bool"/>
+    <field name="UserClipDistance ClipTest Enable Bitmask" start="176" end="183" type="uint"/>
+    <field name="Clip Mode" start="173" end="175" type="uint" prefix="CLIPMODE">
+      <value name="NORMAL" value="0"/>
+      <value name="ALL" value="1"/>
+      <value name="CLIP_NON_REJECTED" value="2"/>
+      <value name="REJECT_ALL" value="3"/>
+      <value name="ACCEPT_ALL" value="4"/>
+    </field>
+    <field name="Clipper Viewport State Pointer" start="197" end="223" type="address"/>
+    <field name="Screen Space Viewport X Min" start="224" end="255" type="float"/>
+    <field name="Screen Space Viewport X Max" start="256" end="287" type="float"/>
+    <field name="Screen Space Viewport Y Min" start="288" end="319" type="float"/>
+    <field name="Screen Space Viewport Y Max" start="320" end="351" type="float"/>
+  </struct>
+
+  <struct name="CLIP_VIEWPORT" length="4">
+    <field name="XMin Clip Guardband" start="0" end="31" type="float"/>
+    <field name="XMax Clip Guardband" start="32" end="63" type="float"/>
+    <field name="YMin Clip Guardband" start="64" end="95" type="float"/>
+    <field name="YMax Clip Guardband" start="96" end="127" type="float"/>
+  </struct>
+
+  <struct name="SCISSOR_RECT" length="2">
+    <field name="Scissor Rectangle Y Min" start="16" end="31" type="uint"/>
+    <field name="Scissor Rectangle X Min" start="0" end="15" type="uint"/>
+    <field name="Scissor Rectangle Y Max" start="48" end="63" type="uint"/>
+    <field name="Scissor Rectangle X Max" start="32" end="47" type="uint"/>
+  </struct>
+
+  <struct name="COLOR_CALC_STATE" length="8">
     <field name="Stencil Test Enable" start="31" end="31" type="bool"/>
     <field name="Stencil Test Function" start="28" end="30" type="3D_Compare_Function"/>
-    <field name="Stencil Fail Op" start="25" end="27" type="uint">
-      <value name="STENCILOP_KEEP" value="0"/>
-      <value name="STENCILOP_ZERO" value="1"/>
-      <value name="STENCILOP_REPLACE" value="2"/>
-      <value name="STENCILOP_INCRSAT" value="3"/>
-      <value name="STENCILOP_DECRSAT" value="4"/>
-      <value name="STENCILOP_INCR" value="5"/>
-      <value name="STENCILOP_DECR" value="6"/>
-      <value name="STENCILOP_INVERT" value="7"/>
-    </field>
-    <field name="Stencil Pass Depth Fail Op" start="22" end="24" type="uint"/>
-    <field name="Stencil Pass Depth Pass Op" start="19" end="21" type="uint"/>
+    <field name="Stencil Fail Op" start="25" end="27" type="3D_Stencil_Operation"/>
+    <field name="Stencil Pass Depth Fail Op" start="22" end="24" type="3D_Stencil_Operation"/>
+    <field name="Stencil Pass Depth Pass Op" start="19" end="21" type="3D_Stencil_Operation"/>
     <field name="Stencil Buffer Write Enable" start="18" end="18" type="bool"/>
     <field name="Double Sided Stencil Enable" start="15" end="15" type="bool"/>
-    <field name="BackFace Stencil Test Function" start="12" end="14" type="3D_Compare_Function"/>
-    <field name="Backface Stencil Fail Op" start="9" end="11" type="uint">
-      <value name="STENCILOP_KEEP" value="0"/>
-      <value name="STENCILOP_ZERO" value="1"/>
-      <value name="STENCILOP_REPLACE" value="2"/>
-      <value name="STENCILOP_INCRSAT" value="3"/>
-      <value name="STENCILOP_DECRSAT" value="4"/>
-      <value name="STENCILOP_INCR" value="5"/>
-      <value name="STENCILOP_DECR" value="6"/>
-      <value name="STENCILOP_INVERT" value="7"/>
-    </field>
-    <field name="Backface Stencil Pass Depth Fail Op" start="6" end="8" type="uint"/>
-    <field name="Backface Stencil Pass Depth Pass Op" start="3" end="5" type="uint"/>
-    <!-- DWord 1 -->
+    <field name="Backface Stencil Test Function" start="12" end="14" type="3D_Compare_Function"/>
+    <field name="Backface Stencil Fail Op" start="9" end="11" type="3D_Stencil_Operation"/>
+    <field name="Backface Stencil Pass Depth Fail Op" start="6" end="8" type="3D_Stencil_Operation"/>
+    <field name="Backface Stencil Pass Depth Pass Op" start="3" end="5" type="3D_Stencil_Operation"/>
     <field name="Stencil Reference Value" start="56" end="63" type="uint"/>
     <field name="Stencil Test Mask" start="48" end="55" type="uint"/>
     <field name="Stencil Write Mask" start="40" end="47" type="uint"/>
-    <field name="BackFace Stencil Reference Value" start="32" end="39" type="uint"/>
-    <!-- DWord 2 -->
+    <field name="Backface Stencil Reference Value" start="32" end="39" type="uint"/>
     <field name="Backface Stencil Test Mask" start="88" end="95" type="uint"/>
     <field name="Backface Stencil Write Mask" start="80" end="87" type="uint"/>
     <field name="Depth Test Enable" start="79" end="79" type="bool"/>
     <field name="Depth Test Function" start="76" end="78" type="3D_Compare_Function"/>
     <field name="Depth Buffer Write Enable" start="75" end="75" type="bool"/>
     <field name="Logic Op Enable" start="64" end="64" type="bool"/>
-    <!-- DWord 3 -->
     <field name="Alpha Test Format" start="111" end="111" type="uint">
       <value name="ALPHATEST_UNORM8" value="0"/>
       <value name="ALPHATEST_FLOAT32" value="1"/>
@@ -635,69 +440,17 @@
     <field name="Color Buffer Blend Enable" start="108" end="108" type="bool"/>
     <field name="Alpha Test Enable" start="107" end="107" type="bool"/>
     <field name="Alpha Test Function" start="104" end="106" type="3D_Compare_Function"/>
-    <!-- DWord 4 -->
-    <field name="Color Calculator Viewport State Pointer" start="133" end="159" type="address"/>
-    <!-- DWord 5 -->
+    <field name="CC Viewport State Pointer" start="133" end="159" type="address"/>
     <field name="Color Dither Enable" start="191" end="191" type="bool"/>
     <field name="Round Disable Function Disable" start="190" end="190" type="bool"/>
-    <field name="Logic Op Function" start="176" end="179" type="uint">
-      <value name="LOGICOP_CLEAR" value="0"/>
-      <value name="LOGICOP_NOR" value="1"/>
-      <value name="LOGICOP_AND_INVERTED" value="2"/>
-      <value name="LOGICOP_COPY_INVERTED" value="3"/>
-      <value name="LOGICOP_AND_REVERSE" value="4"/>
-      <value name="LOGICOP_INVERT" value="5"/>
-      <value name="LOGICOP_XOR" value="6"/>
-      <value name="LOGICOP_NAND" value="7"/>
-      <value name="LOGICOP_AND" value="8"/>
-      <value name="LOGICOP_EQUIV" value="9"/>
-      <value name="LOGICOP_NOOP" value="10"/>
-      <value name="LOGICOP_OR_INVERTED" value="11"/>
-      <value name="LOGICOP_COPY" value="12"/>
-      <value name="LOGICOP_OR_REVERSE" value="13"/>
-      <value name="LOGICOP_OR" value="14"/>
-      <value name="LOGICOP_SET" value="15"/>
-    </field>
+    <field name="Logic Op Function" start="176" end="179" type="3D_Logic_Op_Function"/>
     <field name="Statistics Enable" start="175" end="175" type="bool"/>
-    <field name="Alpha Blend Function" start="172" end="174" type="uint">
-      <value name="BLENDFUNCTION_ADD" value="0"/>
-      <value name="BLENDFUNCTION_SUBTRACT" value="1"/>
-      <value name="BLENDFUNCTION_REVERSE_SUBTRACT" value="2"/>
-      <value name="BLENDFUNCTION_MIN" value="3"/>
-      <value name="BLENDFUNCTION_MAX" value="4"/>
-    </field>
-    <field name="Source Alpha Blend Factor" start="167" end="171" type="uint">
-      <value name="BLENDFACTOR_ONE" value="1"/>
-      <value name="BLENDFACTOR_SRC_COLOR" value="2"/>
-      <value name="BLENDFACTOR_SRC_ALPHA" value="3"/>
-      <value name="BLENDFACTOR_DST_ALPHA" value="4"/>
-      <value name="BLENDFACTOR_DST_COLOR" value="5"/>
-      <value name="BLENDFACTOR_SRC_ALPHA_SATURATE" value="6"/>
-      <value name="BLENDFACTOR_CONST_COLOR" value="7"/>
-      <value name="BLENDFACTOR_CONST_ALPHA" value="8"/>
-      <value name="BLENDFACTOR_SRC1_COLOR" value="9"/>
-      <value name="BLENDFACTOR_SRC1_ALPHA" value="10"/>
-      <value name="BLENDFACTOR_ZERO" value="17"/>
-      <value name="BLENDFACTOR_INV_SRC_COLOR" value="18"/>
-      <value name="BLENDFACTOR_INV_SRC_ALPHA" value="19"/>
-      <value name="BLENDFACTOR_INV_DST_ALPHA" value="20"/>
-      <value name="BLENDFACTOR_INV_DST_COLOR" value="21"/>
-      <value name="BLENDFACTOR_INV_CONST_COLOR" value="23"/>
-      <value name="BLENDFACTOR_INV_CONST_ALPHA" value="24"/>
-      <value name="BLENDFACTOR_INV_SRC1_COLOR" value="25"/>
-      <value name="BLENDFACTOR_INV_SRC1_ALPHA" value="26"/>
-    </field>
-    <field name="Destination Alpha Blend Factor" start="162" end="166" type="uint"/>
-    <!-- DWord 6 -->
-    <field name="Color Blend Function" start="221" end="223" type="uint">
-      <value name="BLENDFUNCTION_ADD" value="0"/>
-      <value name="BLENDFUNCTION_SUBTRACT" value="1"/>
-      <value name="BLENDFUNCTION_REVERSE_SUBTRACT" value="2"/>
-      <value name="BLENDFUNCTION_MIN" value="3"/>
-      <value name="BLENDFUNCTION_MAX" value="4"/>
-    </field>
-    <field name="Source Blend Factor" start="216" end="220" type="uint"/>
-    <field name="Destination Blend Factor" start="211" end="215" type="uint"/>
+    <field name="Alpha Blend Function" start="172" end="174" type="3D_Color_Buffer_Blend_Function"/>
+    <field name="Source Alpha Blend Factor" start="167" end="171" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Alpha Blend Factor" start="162" end="166" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Color Blend Function" start="221" end="223" type="3D_Color_Buffer_Blend_Function"/>
+    <field name="Source Blend Factor" start="216" end="220" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Blend Factor" start="211" end="215" type="3D_Color_Buffer_Blend_Factor"/>
     <field name="X Dither Offset" start="209" end="210" type="uint"/>
     <field name="Y Dither Offset" start="207" end="208" type="uint"/>
     <field name="Color Clamp Range" start="194" end="195" type="uint">
@@ -707,8 +460,39 @@
     </field>
     <field name="Pre-Blend Color Clamp Enable" start="193" end="193" type="bool"/>
     <field name="Post-Blend Color Clamp Enable" start="192" end="192" type="bool"/>
-    <!-- DWord 7 -->
-    <field name="Alpha Reference Value" start="224" end="255" type="float"/>
+    <field name="Alpha Reference Value As UNORM8" start="224" end="255" type="uint"/>
+    <field name="Alpha Reference Value As FLOAT32" start="224" end="255" type="float"/>
+  </struct>
+
+  <struct name="GS_STATE" length="7">
+    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="GS Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="Rendering Enable" start="136" end="136" type="bool"/>
+    <field name="Sampler State Pointer" start="165" end="191" type="address"/>
+    <field name="Sampler Count" start="160" end="162" type="uint"/>
+    <field name="Reorder Enable" start="222" end="222" type="bool"/>
+    <field name="Discard Adjacency" start="221" end="221" type="bool"/>
+    <field name="Maximum VPIndex" start="192" end="195" type="uint"/>
   </struct>
 
   <struct name="RENDER_SURFACE_STATE" length="6">
@@ -766,16 +550,16 @@
     <field name="Y Offset" start="180" end="183" type="uint"/>
   </struct>
 
+  <struct name="SAMPLER_BORDER_COLOR_STATE" length="12">
+    <field name="Border Color Red" start="0" end="31" type="float"/>
+    <field name="Border Color Green" start="32" end="63" type="float"/>
+    <field name="Border Color Blue" start="64" end="95" type="float"/>
+    <field name="Border Color Alpha" start="96" end="127" type="float"/>
+  </struct>
+
   <struct name="SAMPLER_STATE" length="4">
     <field name="Sampler Disable" start="31" end="31" type="bool"/>
-    <field name="Texture Border Color Mode" start="29" end="29" type="uint">
-      <value name="DX10/OGL" value="0"/>
-      <value name="DX9" value="1"/>
-    </field>
-    <field name="LOD PreClamp Enable" start="28" end="28" type="uint" prefix="CLAMP_ENABLE">
-      <value name="D3D" value="0"/>
-      <value name="OGL" value="1"/>
-    </field>
+    <field name="LOD PreClamp Enable" start="28" end="28" type="bool"/>
     <field name="Base Mip Level" start="22" end="26" type="u4.1"/>
     <field name="Mip Mode Filter" start="20" end="21" type="uint" prefix="MIPFILTER">
       <value name="NONE" value="0"/>
@@ -806,17 +590,17 @@
       <value name="CUBECTRLMODE_PROGRAMMED" value="0"/>
       <value name="CUBECTRLMODE_OVERRIDE" value="1"/>
     </field>
-    <field name="TCX Address Control Mode" start="38" end="40" type="uint"/>
-    <field name="TCY Address Control Mode" start="35" end="37" type="uint"/>
-    <field name="TCZ Address Control Mode" start="32" end="34" type="uint"/>
+    <field name="TCX Address Control Mode" start="38" end="40" type="Texture_Coordinate_Mode"/>
+    <field name="TCY Address Control Mode" start="35" end="37" type="Texture_Coordinate_Mode"/>
+    <field name="TCZ Address Control Mode" start="32" end="34" type="Texture_Coordinate_Mode"/>
     <field name="Border Color Pointer" start="69" end="95" type="offset"/>
-    <field name="Monochrome Filter Height: Reserved" start="125" end="127" type="uint"/>
+    <field name="Monochrome Filter Height" start="125" end="127" type="uint"/>
     <field name="Monochrome Filter Width" start="122" end="124" type="uint"/>
     <field name="ChromaKey Enable" start="121" end="121" type="bool"/>
     <field name="ChromaKey Index" start="119" end="120" type="uint"/>
-    <field name="ChromaKey Mode" start="118" end="118" type="uint">
-      <value name="KEYFILTER_KILL_ON_ANY_MATCH" value="0"/>
-      <value name="KEYFILTER_REPLACE_BLACK" value="1"/>
+    <field name="ChromaKey Mode" start="118" end="118" type="uint" prefix="KEYFILTER">
+      <value name="KILL_ON_ANY_MATCH" value="0"/>
+      <value name="REPLACE_BLACK" value="1"/>
     </field>
     <field name="Maximum Anisotropy" start="115" end="117" type="uint">
       <value name="RATIO 2:1" value="0"/>
@@ -836,8 +620,221 @@
     <field name="U Address Mag Filter Rounding Enable" start="114" end="114" type="bool"/>
   </struct>
 
+  <struct name="SF_STATE" length="8">
+    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="Setup Viewport State Offset" start="165" end="191" type="address"/>
+    <field name="Viewport Transform Enable" start="161" end="161" type="bool"/>
+    <field name="Front Winding" start="160" end="160" type="uint">
+      <value name="FRONTWINDING_CW" value="0"/>
+      <value name="FRONTWINDING_CCW" value="1"/>
+    </field>
+    <field name="Anti-Aliasing Enable" start="223" end="223" type="bool"/>
+    <field name="Cull Mode" start="221" end="222" type="uint" prefix="CULLMODE">
+      <value name="BOTH" value="0"/>
+      <value name="NONE" value="1"/>
+      <value name="FRONT" value="2"/>
+      <value name="BACK" value="3"/>
+    </field>
+    <field name="Fast Scissor Clip Disable" start="220" end="220" type="bool"/>
+    <field name="Line Width" start="216" end="219" type="u3.1"/>
+    <field name="Line End Cap Antialiasing Region Width" start="214" end="215" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Point Rasterization Rule" start="212" end="213" type="uint">
+      <value name="RASTRULE_UPPER_LEFT" value="0"/>
+      <value name="RASTRULE_UPPER_RIGHT" value="1"/>
+    </field>
+    <field name="Zero Pixel Triangle Filter Disable" start="211" end="211" type="bool"/>
+    <field name="2x2 Pixel Triangle Filter Disable" start="210" end="210" type="bool"/>
+    <field name="Scissor Rectangle Enable" start="209" end="209" type="bool"/>
+    <field name="Destination Origin Horizontal Bias" start="205" end="208" type="u0.4"/>
+    <field name="Destination Origin Vertical Bias" start="201" end="204" type="u0.4"/>
+    <field name="Last Pixel Enable" start="255" end="255" type="bool"/>
+    <field name="Triangle Strip/List Provoking Vertex Select" start="253" end="254" type="uint">
+      <value name="Vertex 0" value="0"/>
+      <value name="Vertex 1" value="1"/>
+      <value name="Vertex 2" value="2"/>
+    </field>
+    <field name="Line Strip/List Provoking Vertex Select" start="251" end="252" type="uint">
+      <value name="Vertex 0" value="0"/>
+      <value name="Vertex 1" value="1"/>
+    </field>
+    <field name="Triangle Fan Provoking Vertex Select" start="249" end="250" type="uint">
+      <value name="Vertex 0" value="0"/>
+      <value name="Vertex 1" value="1"/>
+      <value name="Vertex 2" value="2"/>
+    </field>
+    <field name="AA Line Distance Mode" start="238" end="238" type="uint" prefix="AALINEDISTANCE">
+      <value name="MANHATTAN" value="0"/>
+      <value name="TRUE" value="1"/>
+    </field>
+    <field name="Sprite Point Enable" start="237" end="237" type="bool"/>
+    <field name="Vertex Sub Pixel Precision Select" start="236" end="236" type="uint">
+      <value name="8 Sub-Pixel Precision Bits" value="0"/>
+      <value name="4 Sub-Pixel Precision Bits" value="1"/>
+    </field>
+    <field name="Point Width Source" start="235" end="235" type="uint">
+      <value name="Vertex" value="0"/>
+      <value name="State" value="1"/>
+    </field>
+    <field name="Point Width" start="224" end="234" type="u8.3"/>
+  </struct>
+
+  <struct name="SF_VIEWPORT" length="8">
+    <field name="Viewport Matrix Element m00" start="0" end="31" type="float"/>
+    <field name="Viewport Matrix Element m11" start="32" end="63" type="float"/>
+    <field name="Viewport Matrix Element m22" start="64" end="95" type="float"/>
+    <field name="Viewport Matrix Element m30" start="96" end="127" type="float"/>
+    <field name="Viewport Matrix Element m31" start="128" end="159" type="float"/>
+    <field name="Viewport Matrix Element m32" start="160" end="191" type="float"/>
+    <field name="Scissor Rectangle" start="192" end="255" type="SCISSOR_RECT"/>
+  </struct>
+
+  <struct name="VERTEX_BUFFER_STATE" length="4">
+    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/>
+    <field name="Buffer Access Type" start="26" end="26" type="uint">
+      <value name="VERTEXDATA" value="0"/>
+      <value name="INSTANCEDATA" value="1"/>
+    </field>
+    <field name="Buffer Pitch" start="0" end="10" type="uint"/>
+    <field name="Buffer Starting Address" start="32" end="63" type="address"/>
+    <field name="Max Index" start="64" end="95" type="uint"/>
+    <field name="Instance Data Step Rate" start="96" end="127" type="uint"/>
+  </struct>
+
+  <struct name="VERTEX_ELEMENT_STATE" length="2">
+    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/>
+    <field name="Valid" start="26" end="26" type="bool"/>
+    <field name="Source Element Format" start="16" end="24" type="SURFACE_FORMAT"/>
+    <field name="Source Element Offset" start="0" end="10" type="uint"/>
+    <field name="Destination Element Offset" start="32" end="39" type="uint"/>
+    <field name="Component 0 Control" start="60" end="62" type="3D_Vertex_Component_Control"/>
+    <field name="Component 1 Control" start="56" end="58" type="3D_Vertex_Component_Control"/>
+    <field name="Component 2 Control" start="52" end="54" type="3D_Vertex_Component_Control"/>
+    <field name="Component 3 Control" start="48" end="50" type="3D_Vertex_Component_Control"/>
+  </struct>
+
+  <struct name="VS_STATE" length="7">
+    <field name="Kernel Start Pointer" start="6" end="31" type="address"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="Sampler State Pointer" start="165" end="191" type="address"/>
+    <field name="Sampler Count" start="160" end="162" type="uint"/>
+    <field name="Vertex Cache Disable" start="193" end="193" type="bool"/>
+    <field name="Enable" start="192" end="192" type="bool"/>
+  </struct>
+
+  <struct name="WM_STATE" length="8">
+    <field name="Kernel Start Pointer 0" start="6" end="31" type="address"/>
+    <field name="GRF Register Count 0" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal" value="0"/>
+      <value name="High" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Depth Coefficient URB Read Offset" start="40" end="45" type="uint"/>
+    <field name="Illegal Opcode Exception Enable" start="36" end="36" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="34" end="34" type="bool"/>
+    <field name="Software  Exception Enable" start="33" end="33" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Setup URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Setup URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For Constant/Setup Data 0" start="96" end="99" type="uint"/>
+    <field name="Sampler State Pointer" start="133" end="159" type="address"/>
+    <field name="Sampler Count" start="130" end="132" type="uint"/>
+    <field name="Statistics Enable" start="128" end="128" type="bool"/>
+    <field name="Maximum Number of Threads" start="185" end="191" type="uint"/>
+    <field name="Legacy Diamond Line Rasterization" start="183" end="183" type="bool"/>
+    <field name="Pixel Shader Kills Pixel" start="182" end="182" type="bool"/>
+    <field name="Pixel Shader Computed Depth" start="181" end="181" type="bool"/>
+    <field name="Pixel Shader Uses Source Depth" start="180" end="180" type="bool"/>
+    <field name="Thread Dispatch Enable" start="179" end="179" type="bool"/>
+    <field name="Early Depth Test Enable" start="178" end="178" type="bool"/>
+    <field name="Line End Cap Antialiasing Region Width" start="176" end="177" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Line Antialiasing Region Width" start="174" end="175" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Polygon Stipple Enable" start="173" end="173" type="bool"/>
+    <field name="Global Depth Offset Enable" start="172" end="172" type="bool"/>
+    <field name="Line Stipple Enable" start="171" end="171" type="bool"/>
+    <field name="Legacy Global Depth Bias Enable" start="170" end="170" type="bool"/>
+    <field name="Contiguous 64 Pixel Dispatch Enable" start="164" end="164" type="bool"/>
+    <field name="Contiguous 32 Pixel Dispatch Enable" start="163" end="163" type="bool"/>
+    <field name="32 Pixel Dispatch Enable" start="162" end="162" type="bool"/>
+    <field name="16 Pixel Dispatch Enable" start="161" end="161" type="bool"/>
+    <field name="8 Pixel Dispatch Enable" start="160" end="160" type="bool"/>
+    <field name="Global Depth Offset Constant" start="192" end="223" type="float"/>
+    <field name="Global Depth Offset Scale" start="224" end="255" type="float"/>
+  </struct>
+
   <instruction name="3DPRIMITIVE" bias="2" length="6">
-    <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="3"/>
@@ -846,47 +843,16 @@
       <value name="SEQUENTIAL" value="0"/>
       <value name="RANDOM" value="1"/>
     </field>
-    <field name="Primitive Topology Type" start="10" end="14" type="uint"/>
+    <field name="Primitive Topology Type" start="10" end="14" type="3D_Prim_Topo_Type"/>
     <field name="Indirect Vertex Count" start="9" end="9" type="uint"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
-    <!-- DWord 1 -->
     <field name="Vertex Count Per Instance" start="32" end="63" type="uint"/>
-    <!-- DWord 2 -->
     <field name="Start Vertex Location" start="64" end="95" type="uint"/>
-    <!-- DWord 3 -->
-    <field name="Instance Count" start="96" end="127" type="uint">
-      <value name="UNDEFINED" value="0"/>
-      <value name="'non-instanced' operation" value="1"/>
-    </field>
-    <!-- DWord 4 -->
+    <field name="Instance Count" start="96" end="127" type="uint"/>
     <field name="Start Instance Location" start="128" end="159" type="uint"/>
-    <!-- DWord 5 -->
     <field name="Base Vertex Location" start="160" end="191" type="int"/>
   </instruction>
 
-  <instruction name="3DSTATE_PIPELINED_POINTERS" bias="2" length="7">
-    <!-- DWord 0 -->
-    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
-    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
-    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
-    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="5"/>
-    <!-- DWord 1 -->
-    <field name="Pointer to VS_STATE" start="37" end="63" type="address"/>
-    <!-- DWord 2 -->
-    <field name="Pointer to GS_STATE" start="69" end="95" type="address"/>
-    <field name="GS Enable" start="64" end="64" type="bool"/>
-    <!-- DWord 3 -->
-    <field name="Pointer to CLIP_STATE" start="101" end="127" type="address"/>
-    <field name="CLIP Enable" start="96" end="96" type="bool"/>
-    <!-- DWord 4 -->
-    <field name="Pointer to SF_STATE" start="133" end="159" type="address"/>
-    <!-- DWord 5 -->
-    <field name="Pointer to WM_STATE" start="165" end="191" type="address"/>
-    <!-- DWord 6 -->
-    <field name="Pointer to COLOR_CALC_STATE" start="198" end="223" type="address"/>
-  </instruction>
-
   <instruction name="3DSTATE_AA_LINE_PARAMETERS" bias="2" length="3">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
@@ -900,49 +866,36 @@
   </instruction>
 
   <instruction name="3DSTATE_BINDING_TABLE_POINTERS" bias="2" length="6">
-    <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
-    <!-- DWord 1 -->
     <field name="Pointer to VS Binding Table" start="37" end="63" type="offset"/>
-    <!-- DWord 2 -->
     <field name="Pointer to GS Binding Table" start="69" end="95" type="offset"/>
-    <!-- DWord 3 -->
     <field name="Pointer to CLIP Binding Table" start="101" end="127" type="offset"/>
-    <!-- DWord 4 -->
     <field name="Pointer to SF Binding Table" start="133" end="159" type="offset"/>
-    <!-- DWord 5 -->
     <field name="Pointer to PS Binding Table" start="165" end="191" type="offset"/>
   </instruction>
 
   <instruction name="3DSTATE_CONSTANT_COLOR" bias="2" length="5">
-    <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
-    <!-- DWord 1 -->
     <field name="Blend Constant Color Red" start="32" end="63" type="float"/>
-    <!-- DWord 2 -->
     <field name="Blend Constant Color Green" start="64" end="95" type="float"/>
-    <!-- DWord 3 -->
     <field name="Blend Constant Color Blue" start="96" end="127" type="float"/>
-    <!-- DWord 4 -->
-    <field name="Blend Constant Color Alpha" start="128" end="160" type="float"/>
+    <field name="Blend Constant Color Alpha" start="128" end="159" type="float"/>
   </instruction>
 
-  <instruction name="3DSTATE_DEPTH_BUFFER" bias="2" length="7">
-    <!-- DWord 0 -->
+  <instruction name="3DSTATE_DEPTH_BUFFER" bias="2" length="6">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="5"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="5"/>
-    <!-- DWord 1 -->
+    <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
     <field name="Surface Type" start="61" end="63" type="uint">
       <value name="SURFTYPE_1D" value="0"/>
       <value name="SURFTYPE_2D" value="1"/>
@@ -968,9 +921,7 @@
       <value name="D16_UNORM" value="5"/>
     </field>
     <field name="Surface Pitch" start="32" end="48" type="uint"/>
-    <!-- DWord 2 -->
     <field name="Surface Base Address" start="64" end="95" type="address"/>
-    <!-- DWord 3 -->
     <field name="Height" start="115" end="127" type="uint">
       <value name="SURFTYPE_1D:  must be zero" value="0"/>
     </field>
@@ -980,13 +931,11 @@
       <value name="MIPLAYOUT_BELOW" value="0"/>
       <value name="MIPLAYOUT_RIGHT" value="1"/>
     </field>
-    <!-- DWord 4 -->
     <field name="Depth" start="149" end="159" type="uint">
       <value name="SURFTYPE_CUBE:  must be zero" value="0"/>
     </field>
     <field name="Minimum Array Element" start="138" end="148" type="uint"/>
     <field name="Render Target View Extent" start="129" end="137" type="uint"/>
-    <!-- DWord 5 -->
     <field name="Depth Coordinate Offset Y" start="176" end="191" type="int"/>
     <field name="Depth Coordinate Offset X" start="160" end="175" type="int"/>
   </instruction>
@@ -1005,8 +954,16 @@
     <field name="Drawing Rectangle Origin X" start="96" end="111" type="int"/>
   </instruction>
 
+  <instruction name="3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="9"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Global Depth Offset Clamp" start="32" end="63" type="float"/>
+  </instruction>
+
   <instruction name="3DSTATE_INDEX_BUFFER" bias="2" length="3">
-    <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
@@ -1018,9 +975,7 @@
       <value name="DWORD" value="2"/>
     </field>
     <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
-    <!-- DWord 1 -->
     <field name="Buffer Starting Address" start="32" end="63" type="address"/>
-    <!-- DWord 2 -->
     <field name="Buffer Ending Address" start="64" end="95" type="address"/>
   </instruction>
 
@@ -1034,10 +989,26 @@
     <field name="Current Repeat Counter" start="53" end="61" type="uint"/>
     <field name="Current Stipple Index" start="48" end="51" type="uint"/>
     <field name="Line Stipple Pattern" start="32" end="47" type="uint"/>
-    <field name="Line Stipple Inverse Repeat Count" start="79" end="95" type="u1.16"/>
+    <field name="Line Stipple Inverse Repeat Count" start="80" end="95" type="u1.13"/>
     <field name="Line Stipple Repeat Count" start="64" end="72" type="uint"/>
   </instruction>
 
+  <instruction name="3DSTATE_PIPELINED_POINTERS" bias="2" length="7">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="5"/>
+    <field name="Pointer to VS State" start="37" end="63" type="address"/>
+    <field name="Pointer to GS State" start="69" end="95" type="address"/>
+    <field name="GS Enable" start="64" end="64" type="bool"/>
+    <field name="Pointer to CLIP State" start="101" end="127" type="address"/>
+    <field name="Clip Enable" start="96" end="96" type="bool"/>
+    <field name="Pointer to SF State" start="133" end="159" type="address"/>
+    <field name="Pointer to WM State" start="165" end="191" type="address"/>
+    <field name="Pointer to Color Calc State" start="197" end="223" type="address"/>
+  </instruction>
+
   <instruction name="3DSTATE_POLY_STIPPLE_OFFSET" bias="2" length="2">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
@@ -1059,17 +1030,6 @@
     </group>
   </instruction>
 
-  <instruction name="3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" bias="2" length="2">
-    <!-- DWord 0 -->
-    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
-    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
-    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
-    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="9"/>
-    <field name="DWord Length" start="0" end="7" type="uint"/>
-    <!-- DWord 1 -->
-    <field name="Global Depth Offset Clamp" start="32" end="63" type="float"/>
-  </instruction>
-
   <instruction name="3DSTATE_VERTEX_BUFFERS" bias="2">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
@@ -1100,6 +1060,101 @@
     <field name="Statistics Enable" start="0" end="0" type="bool"/>
   </instruction>
 
+  <instruction name="CS_URB_STATE" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="URB Entry Allocation Size" start="36" end="40" type="uint"/>
+    <field name="Number of URB Entries" start="32" end="34" type="uint"/>
+  </instruction>
+
+  <instruction name="CONSTANT_BUFFER" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="2"/>
+    <field name="Valid" start="8" end="8" type="bool" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Buffer Starting Address" start="38" end="63" type="address"/>
+    <field name="Buffer Length" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_FLUSH" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="4"/>
+    <field name="Global Snapshot Count Reset" start="3" end="3" type="uint">
+      <value name="Don't Reset" value="0"/>
+      <value name="Reset" value="1"/>
+    </field>
+    <field name="Render Cache Flush Inhibit" start="2" end="2" type="uint">
+      <value name="Flush" value="0"/>
+      <value name="Don't Flush" value="1"/>
+    </field>
+    <field name="State/Instruction Cache Invalidate" start="1" end="1" type="uint">
+      <value name="Don't Invalidate" value="0"/>
+      <value name="Invalidate" value="1"/>
+    </field>
+  </instruction>
+
+  <instruction name="MI_LOAD_REGISTER_IMM" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="34"/>
+    <field name="Byte Write Disables" start="8" end="11" type="uint"/>
+    <field name="DWord Length" start="0" end="5" type="uint" default="1"/>
+    <field name="Register Offset" start="34" end="63" type="offset"/>
+    <field name="Data DWord" start="64" end="95" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_STORE_DATA_IMM" bias="2" length="5">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="32"/>
+    <field name="Memory Address Type" start="22" end="22" type="bool"/>
+    <field name="BitFieldName" start="21" end="21" type="uint"/>
+    <field name="DWord Length" start="0" end="5" type="uint" default="2"/>
+    <field name="Physical Start Address Extension" start="32" end="35" type="address"/>
+    <field name="Address" start="66" end="95" type="address"/>
+    <field name="Data DWord 0" start="96" end="127" type="uint"/>
+    <field name="Data DWord 1" start="128" end="159" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_STORE_REGISTER_MEM" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="36"/>
+    <field name="Use Global GTT" start="22" end="22" type="bool"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Physical Start Address Extension" start="60" end="63" type="address"/>
+    <field name="Register Address" start="34" end="54" type="offset"/>
+    <field name="Memory Address" start="66" end="95" type="address"/>
+  </instruction>
+
+  <instruction name="PIPE_CONTROL" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="2"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Post Sync Operation" start="14" end="15" type="uint">
+      <value name="No Write" value="0"/>
+      <value name="Write Immediate Data" value="1"/>
+      <value name="Write PS Depth Count" value="2"/>
+      <value name="Write Timestamp" value="3"/>
+    </field>
+    <field name="Depth Stall Enable" start="13" end="13" type="bool"/>
+    <field name="Write Cache Flush" start="12" end="12" type="bool"/>
+    <field name="Instruction Cache Invalidate Enable" start="11" end="11" type="bool"/>
+    <field name="Texture Cache Flush Enable" start="10" end="10" type="bool"/>
+    <field name="Indirect State Pointers Disable" start="9" end="9" type="bool"/>
+    <field name="Notify Enable" start="8" end="8" type="bool"/>
+    <field name="Address" start="35" end="63" type="address"/>
+    <field name="Destination Address Type" start="34" end="34" type="uint" prefix="DAT">
+      <value name="PGTT" value="0"/>
+      <value name="GGTT" value="1"/>
+    </field>
+    <field name="Immediate Data" start="64" end="127" type="uint"/>
+  </instruction>
+
   <instruction name="PIPELINE_SELECT" bias="1" length="1">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
@@ -1111,35 +1166,7 @@
     </field>
   </instruction>
 
-  <instruction name="PIPE_CONTROL" bias="2" length="5">
-    <!-- DWord 0 -->
-    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
-    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
-    <field name="3D Command Opcode" start="24" end="26" type="uint" default="2"/>
-    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
-    <field name="Post Sync Operation" start="14" end="15" type="uint">
-      <value name="No Write" value="0"/>
-      <value name="Write Immediate Data" value="1"/>
-      <value name="Write PS Depth Count" value="2"/>
-      <value name="Write Timestamp" value="3"/>
-    </field>
-    <field name="Depth Stall Enable" start="13" end="13" type="bool"/>
-    <field name="Write Cache Flush Enable" start="12" end="12" type="bool"/>
-    <field name="Instruction/State Cache Flush Enable" start="11" end="11" type="bool"/>
-    <field name="Texture Cache Flush Enable" start="10" end="10" type="bool"/>
-    <field name="Indirect State Pointers Disable" start="9" end="9" type="bool"/>
-    <field name="Notify Enable" start="8" end="8" type="bool"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
-    <!-- DWord 1 -->
-    <field name="Destination Address" start="63" end="35" type="address"/>
-    <field name="Destination Address Type" start="34" end="34" type="uint" prefix="DAT">
-      <value name="PPGTT" value="0"/>
-      <value name="GGTT" value="1"/>
-    </field>
-    <field name="Immediate Data" start="64" end="127" type="uint"/>
-  </instruction>
-
-  <instruction name="STATE_BASE_ADDRESS" bias="2" length="10">
+  <instruction name="STATE_BASE_ADDRESS" bias="2" length="6">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
@@ -1157,16 +1184,6 @@
     <field name="Indirect Object Access Upper Bound Modify Enable" start="160" end="160" type="bool"/>
   </instruction>
 
-  <instruction name="STATE_PREFETCH" bias="2" length="2">
-    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
-    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
-    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
-    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="3"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
-    <field name="Prefetch Pointer" start="38" end="63" type="address"/>
-    <field name="Prefetch Count" start="32" end="34" type="uint"/>
-  </instruction>
-
   <instruction name="STATE_SIP" bias="2" length="2">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
@@ -1175,4 +1192,117 @@
     <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
     <field name="System Instruction Pointer" start="36" end="63" type="offset"/>
   </instruction>
+
+  <instruction name="URB_FENCE" bias="2" length="3"> <!-- The "* Fence" fields are 10- or 11-bit values, so they are typed uint rather than bool. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="CS Unit URB Reallocation Request" start="13" end="13" type="bool"/>
+    <field name="VFE Unit URB Reallocation Request" start="12" end="12" type="bool"/>
+    <field name="SF Unit URB Reallocation Request" start="11" end="11" type="bool"/>
+    <field name="CLIP Unit URB Reallocation Request" start="10" end="10" type="bool"/>
+    <field name="GS Unit URB Reallocation Request" start="9" end="9" type="bool"/>
+    <field name="VS Unit URB Reallocation Request" start="8" end="8" type="bool"/>
+    <field name="CLIP Fence" start="52" end="61" type="uint"/>
+    <field name="GS Fence" start="42" end="51" type="uint"/>
+    <field name="VS Fence" start="32" end="41" type="uint"/>
+    <field name="CS Fence" start="84" end="94" type="uint"/>
+    <field name="VFE Fence" start="74" end="83" type="uint"/>
+    <field name="SF Fence" start="64" end="73" type="uint"/>
+  </instruction>
+
+  <instruction name="XY_COLOR_BLT" bias="2" length="6">
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="80"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Clipping Enabled" start="62" end="62" type="bool"/>
+    <field name="Color Depth" start="56" end="57" type="uint" prefix="COLOR_DEPTH">
+      <value name="8 bit" value="0"/>
+      <value name="565" value="1"/>
+      <value name="1555" value="2"/>
+      <value name="32 bit" value="3"/>
+    </field>
+    <field name="Raster Operation" start="48" end="55" type="int"/>
+    <field name="Destination Pitch" start="32" end="47" type="int"/>
+    <field name="Destination Y1 Coordinate" start="80" end="95" type="int"/>
+    <field name="Destination X1 Coordinate" start="64" end="79" type="int"/>
+    <field name="Destination Y2 Coordinate" start="112" end="127" type="int"/>
+    <field name="Destination X2 Coordinate" start="96" end="111" type="int"/>
+    <field name="Destination Base Address" start="128" end="159" type="address"/>
+    <field name="Solid Pattern Color" start="160" end="191" type="int"/>
+  </instruction>
+
+  <instruction name="XY_SETUP_BLT" bias="2" length="8">
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="6"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Clipping Enabled" start="62" end="62" type="bool"/>
+    <field name="Mono Source Transparency Mode" start="61" end="61" type="bool"/>
+    <field name="Color Depth" start="56" end="57" type="uint" prefix="COLOR_DEPTH">
+      <value name="8 bit" value="0"/>
+      <value name="565" value="1"/>
+      <value name="1555" value="2"/>
+      <value name="32 bit" value="3"/>
+    </field>
+    <field name="Raster Operation" start="48" end="55" type="int"/>
+    <field name="Destination Pitch" start="32" end="47" type="int"/>
+    <field name="ClipRect Y1 Coordinate" start="80" end="95" type="int"/>
+    <field name="ClipRect X1 Coordinate" start="64" end="79" type="int"/>
+    <field name="ClipRect Y2 Coordinate" start="112" end="127" type="int"/>
+    <field name="ClipRect X2 Coordinate" start="96" end="111" type="int"/>
+    <field name="Destination Base Address" start="128" end="159" type="address"/>
+    <field name="Background Color" start="160" end="191" type="uint"/>
+    <field name="Foreground Color" start="192" end="223" type="uint"/>
+    <field name="Pattern Base Address" start="224" end="255" type="uint"/>
+  </instruction>
+
+  <instruction name="XY_SRC_COPY_BLT" bias="2" length="8">
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="83"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="6"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Source Tiling Enable" start="15" end="15" type="bool"/>
+    <field name="Destination Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Clipping Enabled" start="62" end="62" type="bool"/>
+    <field name="Color Depth" start="56" end="57" type="uint" prefix="COLOR_DEPTH">
+      <value name="8 bit" value="0"/>
+      <value name="565" value="1"/>
+      <value name="1555" value="2"/>
+      <value name="32 bit" value="3"/>
+    </field>
+    <field name="Raster Operation" start="48" end="55" type="int"/>
+    <field name="Destination Pitch" start="32" end="47" type="int"/>
+    <field name="Destination Y1 Coordinate" start="80" end="95" type="int"/>
+    <field name="Destination X1 Coordinate" start="64" end="79" type="int"/>
+    <field name="Destination Y2 Coordinate" start="112" end="127" type="int"/>
+    <field name="Destination X2 Coordinate" start="96" end="111" type="int"/>
+    <field name="Destination Base Address" start="128" end="159" type="address"/>
+    <field name="Source Y1 Coordinate" start="176" end="191" type="int"/>
+    <field name="Source X1 Coordinate" start="160" end="175" type="int"/>
+    <field name="Source Pitch" start="192" end="207" type="int"/>
+    <field name="Source Base Address" start="224" end="255" type="address"/>
+  </instruction>
+
+  <instruction name="XY_TEXT_IMMEDIATE_BLT" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="49"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Packing" start="16" end="16" type="uint">
+      <value name="Bit Packed" value="0"/>
+      <value name="Byte Packed" value="1"/>
+    </field>
+    <field name="Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Destination Pitch" start="32" end="47" type="int"/>
+    <field name="Destination Y1 Coordinate" start="48" end="63" type="int"/>
+    <field name="Destination X1 Coordinate" start="32" end="47" type="int"/>
+    <field name="Destination Y2 Coordinate" start="80" end="95" type="int"/>
+    <field name="Destination X2 Coordinate" start="64" end="79" type="int"/>
+  </instruction>
 </genxml>
diff --git a/src/intel/genxml/gen5.xml b/src/intel/genxml/gen5.xml
index 97c1369..3608187 100644
--- a/src/intel/genxml/gen5.xml
+++ b/src/intel/genxml/gen5.xml
@@ -1,5 +1,603 @@
 <?xml version="1.0" ?>
 <genxml name="ILK" gen="5">
+  <enum name="3D_Prim_Topo_Type" prefix="3DPRIM">
+    <value name="POINTLIST" value="1"/>
+    <value name="LINELIST" value="2"/>
+    <value name="LINESTRIP" value="3"/>
+    <value name="TRILIST" value="4"/>
+    <value name="TRISTRIP" value="5"/>
+    <value name="TRIFAN" value="6"/>
+    <value name="QUADLIST" value="7"/>
+    <value name="QUADSTRIP" value="8"/>
+    <value name="LINELIST_ADJ" value="9"/>
+    <value name="LINESTRIP_ADJ" value="10"/>
+    <value name="TRILIST_ADJ" value="11"/>
+    <value name="TRISTRIP_ADJ" value="12"/>
+    <value name="TRISTRIP_REVERSE" value="13"/>
+    <value name="POLYGON" value="14"/>
+    <value name="RECTLIST" value="15"/>
+    <value name="LINELOOP" value="16"/>
+    <value name="POINTLIST_BF" value="17"/>
+    <value name="LINESTRIP_CONT" value="18"/>
+    <value name="LINESTRIP_BF" value="19"/>
+    <value name="LINESTRIP_CONT_BF" value="20"/>
+    <value name="TRIFAN_NOSTIPPLE" value="22"/>
+  </enum>
+
+  <enum name="3D_Vertex_Component_Control" prefix="VFCOMP">
+    <value name="NOSTORE" value="0"/>
+    <value name="STORE_SRC" value="1"/>
+    <value name="STORE_0" value="2"/>
+    <value name="STORE_1_FP" value="3"/>
+    <value name="STORE_1_INT" value="4"/>
+    <value name="STORE_VID" value="5"/>
+    <value name="STORE_IID" value="6"/>
+    <value name="STORE_PID" value="7"/>
+  </enum>
+
+  <enum name="3D_Color_Buffer_Blend_Function" prefix="BLENDFUNCTION">
+    <value name="ADD" value="0"/>
+    <value name="SUBTRACT" value="1"/>
+    <value name="REVERSE_SUBTRACT" value="2"/>
+    <value name="MIN" value="3"/>
+    <value name="MAX" value="4"/>
+  </enum>
+
+  <enum name="3D_Color_Buffer_Blend_Factor" prefix="BLENDFACTOR">
+    <value name="ONE" value="1"/>
+    <value name="SRC_COLOR" value="2"/>
+    <value name="SRC_ALPHA" value="3"/>
+    <value name="DST_ALPHA" value="4"/>
+    <value name="DST_COLOR" value="5"/>
+    <value name="SRC_ALPHA_SATURATE" value="6"/>
+    <value name="CONST_COLOR" value="7"/>
+    <value name="CONST_ALPHA" value="8"/>
+    <value name="SRC1_COLOR" value="9"/>
+    <value name="SRC1_ALPHA" value="10"/>
+    <value name="ZERO" value="17"/>
+    <value name="INV_SRC_COLOR" value="18"/>
+    <value name="INV_SRC_ALPHA" value="19"/>
+    <value name="INV_DST_ALPHA" value="20"/>
+    <value name="INV_DST_COLOR" value="21"/>
+    <value name="INV_CONST_COLOR" value="23"/>
+    <value name="INV_CONST_ALPHA" value="24"/>
+    <value name="INV_SRC1_COLOR" value="25"/>
+    <value name="INV_SRC1_ALPHA" value="26"/>
+  </enum>
+
+  <enum name="3D_Compare_Function" prefix="COMPAREFUNCTION">
+    <value name="ALWAYS" value="0"/>
+    <value name="NEVER" value="1"/>
+    <value name="LESS" value="2"/>
+    <value name="EQUAL" value="3"/>
+    <value name="LEQUAL" value="4"/>
+    <value name="GREATER" value="5"/>
+    <value name="NOTEQUAL" value="6"/>
+    <value name="GEQUAL" value="7"/>
+  </enum>
+
+  <enum name="3D_Stencil_Operation" prefix="STENCILOP">
+    <value name="KEEP" value="0"/>
+    <value name="ZERO" value="1"/>
+    <value name="REPLACE" value="2"/>
+    <value name="INCRSAT" value="3"/>
+    <value name="DECRSAT" value="4"/>
+    <value name="INCR" value="5"/>
+    <value name="DECR" value="6"/>
+    <value name="INVERT" value="7"/>
+  </enum>
+
+  <enum name="3D_Logic_Op_Function" prefix="LOGICOP">
+    <value name="CLEAR" value="0"/>
+    <value name="NOR" value="1"/>
+    <value name="AND_INVERTED" value="2"/>
+    <value name="COPY_INVERTED" value="3"/>
+    <value name="AND_REVERSE" value="4"/>
+    <value name="INVERT" value="5"/>
+    <value name="XOR" value="6"/>
+    <value name="NAND" value="7"/>
+    <value name="AND" value="8"/>
+    <value name="EQUIV" value="9"/>
+    <value name="NOOP" value="10"/>
+    <value name="OR_INVERTED" value="11"/>
+    <value name="COPY" value="12"/>
+    <value name="OR_REVERSE" value="13"/>
+    <value name="OR" value="14"/>
+    <value name="SET" value="15"/>
+  </enum>
+
+  <enum name="SURFACE_FORMAT" prefix="SF">
+    <value name="R32G32B32A32_FLOAT" value="0"/>
+    <value name="R32G32B32A32_SINT" value="1"/>
+    <value name="R32G32B32A32_UINT" value="2"/>
+    <value name="R32G32B32A32_UNORM" value="3"/>
+    <value name="R32G32B32A32_SNORM" value="4"/>
+    <value name="R64G64_FLOAT" value="5"/>
+    <value name="R32G32B32X32_FLOAT" value="6"/>
+    <value name="R32G32B32A32_SSCALED" value="7"/>
+    <value name="R32G32B32A32_USCALED" value="8"/>
+    <value name="R32G32B32A32_SFIXED" value="32"/>
+    <value name="R64G64_PASSTHRU" value="33"/>
+    <value name="R32G32B32_FLOAT" value="64"/>
+    <value name="R32G32B32_SINT" value="65"/>
+    <value name="R32G32B32_UINT" value="66"/>
+    <value name="R32G32B32_UNORM" value="67"/>
+    <value name="R32G32B32_SNORM" value="68"/>
+    <value name="R32G32B32_SSCALED" value="69"/>
+    <value name="R32G32B32_USCALED" value="70"/>
+    <value name="R32G32B32_SFIXED" value="80"/>
+    <value name="R16G16B16A16_UNORM" value="128"/>
+    <value name="R16G16B16A16_SNORM" value="129"/>
+    <value name="R16G16B16A16_SINT" value="130"/>
+    <value name="R16G16B16A16_UINT" value="131"/>
+    <value name="R16G16B16A16_FLOAT" value="132"/>
+    <value name="R32G32_FLOAT" value="133"/>
+    <value name="R32G32_SINT" value="134"/>
+    <value name="R32G32_UINT" value="135"/>
+    <value name="R32_FLOAT_X8X24_TYPELESS" value="136"/>
+    <value name="X32_TYPELESS_G8X24_UINT" value="137"/>
+    <value name="L32A32_FLOAT" value="138"/>
+    <value name="R32G32_UNORM" value="139"/>
+    <value name="R32G32_SNORM" value="140"/>
+    <value name="R64_FLOAT" value="141"/>
+    <value name="R16G16B16X16_UNORM" value="142"/>
+    <value name="R16G16B16X16_FLOAT" value="143"/>
+    <value name="A32X32_FLOAT" value="144"/>
+    <value name="L32X32_FLOAT" value="145"/>
+    <value name="I32X32_FLOAT" value="146"/>
+    <value name="R16G16B16A16_SSCALED" value="147"/>
+    <value name="R16G16B16A16_USCALED" value="148"/>
+    <value name="R32G32_SSCALED" value="149"/>
+    <value name="R32G32_USCALED" value="150"/>
+    <value name="R32G32_SFIXED" value="160"/>
+    <value name="R64_PASSTHRU" value="161"/>
+    <value name="B8G8R8A8_UNORM" value="192"/>
+    <value name="B8G8R8A8_UNORM_SRGB" value="193"/>
+    <value name="R10G10B10A2_UNORM" value="194"/>
+    <value name="R10G10B10A2_UNORM_SRGB" value="195"/>
+    <value name="R10G10B10A2_UINT" value="196"/>
+    <value name="R10G10B10_SNORM_A2_UNORM" value="197"/>
+    <value name="R8G8B8A8_UNORM" value="199"/>
+    <value name="R8G8B8A8_UNORM_SRGB" value="200"/>
+    <value name="R8G8B8A8_SNORM" value="201"/>
+    <value name="R8G8B8A8_SINT" value="202"/>
+    <value name="R8G8B8A8_UINT" value="203"/>
+    <value name="R16G16_UNORM" value="204"/>
+    <value name="R16G16_SNORM" value="205"/>
+    <value name="R16G16_SINT" value="206"/>
+    <value name="R16G16_UINT" value="207"/>
+    <value name="R16G16_FLOAT" value="208"/>
+    <value name="B10G10R10A2_UNORM" value="209"/>
+    <value name="B10G10R10A2_UNORM_SRGB" value="210"/>
+    <value name="R11G11B10_FLOAT" value="211"/>
+    <value name="R32_SINT" value="214"/>
+    <value name="R32_UINT" value="215"/>
+    <value name="R32_FLOAT" value="216"/>
+    <value name="R24_UNORM_X8_TYPELESS" value="217"/>
+    <value name="X24_TYPELESS_G8_UINT" value="218"/>
+    <value name="L32_UNORM" value="221"/>
+    <value name="A32_UNORM" value="222"/>
+    <value name="L16A16_UNORM" value="223"/>
+    <value name="I24X8_UNORM" value="224"/>
+    <value name="L24X8_UNORM" value="225"/>
+    <value name="A24X8_UNORM" value="226"/>
+    <value name="I32_FLOAT" value="227"/>
+    <value name="L32_FLOAT" value="228"/>
+    <value name="A32_FLOAT" value="229"/>
+    <value name="X8B8_UNORM_G8R8_SNORM" value="230"/>
+    <value name="A8X8_UNORM_G8R8_SNORM" value="231"/>
+    <value name="B8X8_UNORM_G8R8_SNORM" value="232"/>
+    <value name="B8G8R8X8_UNORM" value="233"/>
+    <value name="B8G8R8X8_UNORM_SRGB" value="234"/>
+    <value name="R8G8B8X8_UNORM" value="235"/>
+    <value name="R8G8B8X8_UNORM_SRGB" value="236"/>
+    <value name="R9G9B9E5_SHAREDEXP" value="237"/>
+    <value name="B10G10R10X2_UNORM" value="238"/>
+    <value name="L16A16_FLOAT" value="240"/>
+    <value name="R32_UNORM" value="241"/>
+    <value name="R32_SNORM" value="242"/>
+    <value name="R10G10B10X2_USCALED" value="243"/>
+    <value name="R8G8B8A8_SSCALED" value="244"/>
+    <value name="R8G8B8A8_USCALED" value="245"/>
+    <value name="R16G16_SSCALED" value="246"/>
+    <value name="R16G16_USCALED" value="247"/>
+    <value name="R32_SSCALED" value="248"/>
+    <value name="R32_USCALED" value="249"/>
+    <value name="B5G6R5_UNORM" value="256"/>
+    <value name="B5G6R5_UNORM_SRGB" value="257"/>
+    <value name="B5G5R5A1_UNORM" value="258"/>
+    <value name="B5G5R5A1_UNORM_SRGB" value="259"/>
+    <value name="B4G4R4A4_UNORM" value="260"/>
+    <value name="B4G4R4A4_UNORM_SRGB" value="261"/>
+    <value name="R8G8_UNORM" value="262"/>
+    <value name="R8G8_SNORM" value="263"/>
+    <value name="R8G8_SINT" value="264"/>
+    <value name="R8G8_UINT" value="265"/>
+    <value name="R16_UNORM" value="266"/>
+    <value name="R16_SNORM" value="267"/>
+    <value name="R16_SINT" value="268"/>
+    <value name="R16_UINT" value="269"/>
+    <value name="R16_FLOAT" value="270"/>
+    <value name="A8P8_UNORM_PALETTE0" value="271"/>
+    <value name="A8P8_UNORM_PALETTE1" value="272"/>
+    <value name="I16_UNORM" value="273"/>
+    <value name="L16_UNORM" value="274"/>
+    <value name="A16_UNORM" value="275"/>
+    <value name="L8A8_UNORM" value="276"/>
+    <value name="I16_FLOAT" value="277"/>
+    <value name="L16_FLOAT" value="278"/>
+    <value name="A16_FLOAT" value="279"/>
+    <value name="L8A8_UNORM_SRGB" value="280"/>
+    <value name="R5G5_SNORM_B6_UNORM" value="281"/>
+    <value name="B5G5R5X1_UNORM" value="282"/>
+    <value name="B5G5R5X1_UNORM_SRGB" value="283"/>
+    <value name="R8G8_SSCALED" value="284"/>
+    <value name="R8G8_USCALED" value="285"/>
+    <value name="R16_SSCALED" value="286"/>
+    <value name="R16_USCALED" value="287"/>
+    <value name="P8A8_UNORM_PALETTE0" value="290"/>
+    <value name="P8A8_UNORM_PALETTE1" value="291"/>
+    <value name="A1B5G5R5_UNORM" value="292"/>
+    <value name="A4B4G4R4_UNORM" value="293"/>
+    <value name="L8A8_UINT" value="294"/>
+    <value name="L8A8_SINT" value="295"/>
+    <value name="R8_UNORM" value="320"/>
+    <value name="R8_SNORM" value="321"/>
+    <value name="R8_SINT" value="322"/>
+    <value name="R8_UINT" value="323"/>
+    <value name="A8_UNORM" value="324"/>
+    <value name="I8_UNORM" value="325"/>
+    <value name="L8_UNORM" value="326"/>
+    <value name="P4A4_UNORM_PALETTE0" value="327"/>
+    <value name="A4P4_UNORM_PALETTE0" value="328"/>
+    <value name="R8_SSCALED" value="329"/>
+    <value name="R8_USCALED" value="330"/>
+    <value name="P8_UNORM_PALETTE0" value="331"/>
+    <value name="L8_UNORM_SRGB" value="332"/>
+    <value name="P8_UNORM_PALETTE1" value="333"/>
+    <value name="P4A4_UNORM_PALETTE1" value="334"/>
+    <value name="A4P4_UNORM_PALETTE1" value="335"/>
+    <value name="Y8_UNORM" value="336"/>
+    <value name="L8_UINT" value="338"/>
+    <value name="L8_SINT" value="339"/>
+    <value name="I8_UINT" value="340"/>
+    <value name="I8_SINT" value="341"/>
+    <value name="DXT1_RGB_SRGB" value="384"/>
+    <value name="R1_UNORM" value="385"/>
+    <value name="YCRCB_NORMAL" value="386"/>
+    <value name="YCRCB_SWAPUVY" value="387"/>
+    <value name="P2_UNORM_PALETTE0" value="388"/>
+    <value name="P2_UNORM_PALETTE1" value="389"/>
+    <value name="BC1_UNORM" value="390"/>
+    <value name="BC2_UNORM" value="391"/>
+    <value name="BC3_UNORM" value="392"/>
+    <value name="BC4_UNORM" value="393"/>
+    <value name="BC5_UNORM" value="394"/>
+    <value name="BC1_UNORM_SRGB" value="395"/>
+    <value name="BC2_UNORM_SRGB" value="396"/>
+    <value name="BC3_UNORM_SRGB" value="397"/>
+    <value name="MONO8" value="398"/>
+    <value name="YCRCB_SWAPUV" value="399"/>
+    <value name="YCRCB_SWAPY" value="400"/>
+    <value name="DXT1_RGB" value="401"/>
+    <value name="FXT1" value="402"/>
+    <value name="R8G8B8_UNORM" value="403"/>
+    <value name="R8G8B8_SNORM" value="404"/>
+    <value name="R8G8B8_SSCALED" value="405"/>
+    <value name="R8G8B8_USCALED" value="406"/>
+    <value name="R64G64B64A64_FLOAT" value="407"/>
+    <value name="R64G64B64_FLOAT" value="408"/>
+    <value name="BC4_SNORM" value="409"/>
+    <value name="BC5_SNORM" value="410"/>
+    <value name="R16G16B16_FLOAT" value="411"/>
+    <value name="R16G16B16_UNORM" value="412"/>
+    <value name="R16G16B16_SNORM" value="413"/>
+    <value name="R16G16B16_SSCALED" value="414"/>
+    <value name="R16G16B16_USCALED" value="415"/>
+    <value name="BC6H_SF16" value="417"/>
+    <value name="BC7_UNORM" value="418"/>
+    <value name="BC7_UNORM_SRGB" value="419"/>
+    <value name="BC6H_UF16" value="420"/>
+    <value name="PLANAR_420_8" value="421"/>
+    <value name="R8G8B8_UNORM_SRGB" value="424"/>
+    <value name="ETC1_RGB8" value="425"/>
+    <value name="ETC2_RGB8" value="426"/>
+    <value name="EAC_R11" value="427"/>
+    <value name="EAC_RG11" value="428"/>
+    <value name="EAC_SIGNED_R11" value="429"/>
+    <value name="EAC_SIGNED_RG11" value="430"/>
+    <value name="ETC2_SRGB8" value="431"/>
+    <value name="R16G16B16_UINT" value="432"/>
+    <value name="R16G16B16_SINT" value="433"/>
+    <value name="R32_SFIXED" value="434"/>
+    <value name="R10G10B10A2_SNORM" value="435"/>
+    <value name="R10G10B10A2_USCALED" value="436"/>
+    <value name="R10G10B10A2_SSCALED" value="437"/>
+    <value name="R10G10B10A2_SINT" value="438"/>
+    <value name="B10G10R10A2_SNORM" value="439"/>
+    <value name="B10G10R10A2_USCALED" value="440"/>
+    <value name="B10G10R10A2_SSCALED" value="441"/>
+    <value name="B10G10R10A2_UINT" value="442"/>
+    <value name="B10G10R10A2_SINT" value="443"/>
+    <value name="R64G64B64A64_PASSTHRU" value="444"/>
+    <value name="R64G64B64_PASSTHRU" value="445"/>
+    <value name="ETC2_RGB8_PTA" value="448"/>
+    <value name="ETC2_SRGB8_PTA" value="449"/>
+    <value name="ETC2_EAC_RGBA8" value="450"/>
+    <value name="ETC2_EAC_SRGB8_A8" value="451"/>
+    <value name="R8G8B8_UINT" value="456"/>
+    <value name="R8G8B8_SINT" value="457"/>
+    <value name="RAW" value="511"/>
+  </enum>
+
+  <enum name="Texture_Coordinate_Mode" prefix="TCM">
+    <value name="WRAP" value="0"/>
+    <value name="MIRROR" value="1"/>
+    <value name="CLAMP" value="2"/>
+    <value name="CUBE" value="3"/>
+    <value name="CLAMP_BORDER" value="4"/>
+    <value name="MIRROR_ONCE" value="5"/>
+  </enum>
+
+  <struct name="CC_VIEWPORT" length="2">
+    <field name="Minimum Depth" start="0" end="31" type="float"/>
+    <field name="Maximum Depth" start="32" end="63" type="float"/>
+  </struct>
+
+  <struct name="CLIP_STATE" length="11">
+    <field name="Kernel Start Pointer" start="6" end="31" type="offset"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="API Mode" start="190" end="190" type="uint" prefix="APIMODE">
+      <value name="OGL" value="0"/>
+      <value name="D3D" value="1"/>
+    </field>
+    <field name="Vertex Position Space" start="189" end="189" type="uint" prefix="VPOS">
+      <value name="NDCSPACE" value="0"/>
+      <value name="SCREENSPACE" value="1"/>
+    </field>
+    <field name="Viewport XY ClipTest Enable" start="188" end="188" type="bool"/>
+    <field name="Viewport Z ClipTest Enable" start="187" end="187" type="bool"/>
+    <field name="Guardband ClipTest Enable" start="186" end="186" type="bool"/>
+    <field name="Negative W ClipTest Enable" start="185" end="185" type="bool"/>
+    <field name="UserClipFlags MustClip Enable" start="184" end="184" type="bool"/>
+    <field name="UserClipDistance ClipTest Enable Bitmask" start="176" end="183" type="uint"/>
+    <field name="Clip Mode" start="173" end="175" type="uint" prefix="CLIPMODE">
+      <value name="NORMAL" value="0"/>
+      <value name="ALL" value="1"/>
+      <value name="CLIP_NON_REJECTED" value="2"/>
+      <value name="REJECT_ALL" value="3"/>
+      <value name="ACCEPT_ALL" value="4"/>
+    </field>
+    <field name="Clipper Viewport State Pointer" start="197" end="223" type="address"/>
+    <field name="Screen Space Viewport X Min" start="224" end="255" type="float"/>
+    <field name="Screen Space Viewport X Max" start="256" end="287" type="float"/>
+    <field name="Screen Space Viewport Y Min" start="288" end="319" type="float"/>
+    <field name="Screen Space Viewport Y Max" start="320" end="351" type="float"/>
+  </struct>
+
+  <struct name="CLIP_VIEWPORT" length="4">
+    <field name="XMin Clip Guardband" start="0" end="31" type="float"/>
+    <field name="XMax Clip Guardband" start="32" end="63" type="float"/>
+    <field name="YMin Clip Guardband" start="64" end="95" type="float"/>
+    <field name="YMax Clip Guardband" start="96" end="127" type="float"/>
+  </struct>
+
+  <struct name="SCISSOR_RECT" length="2">
+    <field name="Scissor Rectangle Y Min" start="16" end="31" type="uint"/>
+    <field name="Scissor Rectangle X Min" start="0" end="15" type="uint"/>
+    <field name="Scissor Rectangle Y Max" start="48" end="63" type="uint"/>
+    <field name="Scissor Rectangle X Max" start="32" end="47" type="uint"/>
+  </struct>
+
+  <struct name="COLOR_CALC_STATE" length="8">
+    <field name="Stencil Test Enable" start="31" end="31" type="bool"/>
+    <field name="Stencil Test Function" start="28" end="30" type="3D_Compare_Function"/>
+    <field name="Stencil Fail Op" start="25" end="27" type="3D_Stencil_Operation"/>
+    <field name="Stencil Pass Depth Fail Op" start="22" end="24" type="3D_Stencil_Operation"/>
+    <field name="Stencil Pass Depth Pass Op" start="19" end="21" type="3D_Stencil_Operation"/>
+    <field name="Stencil Buffer Write Enable" start="18" end="18" type="bool"/>
+    <field name="Double Sided Stencil Enable" start="15" end="15" type="bool"/>
+    <field name="Backface Stencil Test Function" start="12" end="14" type="3D_Compare_Function"/>
+    <field name="Backface Stencil Fail Op" start="9" end="11" type="3D_Stencil_Operation"/>
+    <field name="Backface Stencil Pass Depth Fail Op" start="6" end="8" type="3D_Stencil_Operation"/>
+    <field name="Backface Stencil Pass Depth Pass Op" start="3" end="5" type="3D_Stencil_Operation"/>
+    <field name="Stencil Reference Value" start="56" end="63" type="uint"/>
+    <field name="Stencil Test Mask" start="48" end="55" type="uint"/>
+    <field name="Stencil Write Mask" start="40" end="47" type="uint"/>
+    <field name="Backface Stencil Reference Value" start="32" end="39" type="uint"/>
+    <field name="Backface Stencil Test Mask" start="88" end="95" type="uint"/>
+    <field name="Backface Stencil Write Mask" start="80" end="87" type="uint"/>
+    <field name="Depth Test Enable" start="79" end="79" type="bool"/>
+    <field name="Depth Test Function" start="76" end="78" type="3D_Compare_Function"/>
+    <field name="Depth Buffer Write Enable" start="75" end="75" type="bool"/>
+    <field name="Logic Op Enable" start="64" end="64" type="bool"/>
+    <field name="Alpha Test Format" start="111" end="111" type="uint">
+      <value name="ALPHATEST_UNORM8" value="0"/>
+      <value name="ALPHATEST_FLOAT32" value="1"/>
+    </field>
+    <field name="Independent Alpha Blend Enable" start="109" end="109" type="bool"/>
+    <field name="Color Buffer Blend Enable" start="108" end="108" type="bool"/>
+    <field name="Alpha Test Enable" start="107" end="107" type="bool"/>
+    <field name="Alpha Test Function" start="104" end="106" type="3D_Compare_Function"/>
+    <field name="CC Viewport State Pointer" start="133" end="159" type="address"/>
+    <field name="Color Dither Enable" start="191" end="191" type="bool"/>
+    <field name="Round Disable Function Disable" start="190" end="190" type="bool"/>
+    <field name="Logic Op Function" start="176" end="179" type="3D_Logic_Op_Function"/>
+    <field name="Statistics Enable" start="175" end="175" type="bool"/>
+    <field name="Alpha Blend Function" start="172" end="174" type="3D_Color_Buffer_Blend_Function"/>
+    <field name="Source Alpha Blend Factor" start="167" end="171" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Alpha Blend Factor" start="162" end="166" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Color Blend Function" start="221" end="223" type="3D_Color_Buffer_Blend_Function"/>
+    <field name="Source Blend Factor" start="216" end="220" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="Destination Blend Factor" start="211" end="215" type="3D_Color_Buffer_Blend_Factor"/>
+    <field name="X Dither Offset" start="209" end="210" type="uint"/>
+    <field name="Y Dither Offset" start="207" end="208" type="uint"/>
+    <field name="Color Clamp Range" start="194" end="195" type="uint">
+      <value name="COLORCLAMP_UNORM" value="0"/>
+      <value name="COLORCLAMP_SNORM" value="1"/>
+      <value name="COLORCLAMP_RTFORMAT" value="2"/>
+    </field>
+    <field name="Pre-Blend Color Clamp Enable" start="193" end="193" type="bool"/>
+    <field name="Post-Blend Color Clamp Enable" start="192" end="192" type="bool"/>
+    <field name="Alpha Reference Value As UNORM8" start="224" end="255" type="uint"/>
+    <field name="Alpha Reference Value As FLOAT32" start="224" end="255" type="float"/>
+  </struct>
+
+  <struct name="GS_STATE" length="7">
+    <field name="Kernel Start Pointer" start="6" end="31" type="offset"/>
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/>
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/>
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/>
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/>
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="GS Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="SO Statistics Enable" start="137" end="137" type="bool"/>
+    <field name="Rendering Enabled" start="136" end="136" type="bool"/>
+    <field name="Sampler State Pointer" start="165" end="191" type="address"/>
+    <field name="Sampler Count" start="160" end="162" type="uint"/>
+    <field name="Reorder Enable" start="222" end="222" type="bool"/>
+    <field name="Maximum VPIndex" start="192" end="195" type="uint"/>
+  </struct>
+
+  <struct name="MEMORY_OBJECT_CONTROL_STATE" length="1">
+    <field name="Encrypted Data" start="3" end="3" type="bool"/>
+    <field name="Graphics Data Type (GFDT)" start="2" end="2" type="uint"/>
+    <field name="Cacheability Control" start="0" end="1" type="uint"/>
+  </struct>
+
+  <struct name="SAMPLER_BORDER_COLOR_STATE" length="12">
+    <field name="Border Color Unorm Red" start="0" end="7" type="uint"/>
+    <field name="Border Color Unorm Green" start="8" end="15" type="uint"/>
+    <field name="Border Color Unorm Blue" start="16" end="23" type="uint"/>
+    <field name="Border Color Unorm Alpha" start="24" end="31" type="uint"/>
+
+    <field name="Border Color Float Red" start="32" end="63" type="float"/>
+    <field name="Border Color Float Green" start="64" end="95" type="float"/>
+    <field name="Border Color Float Blue" start="96" end="127" type="float"/>
+    <field name="Border Color Float Alpha" start="128" end="159" type="float"/>
+
+    <field name="Border Color Float16 Red" start="160" end="175" type="uint"/>
+    <field name="Border Color Float16 Green" start="176" end="191" type="uint"/>
+    <field name="Border Color Float16 Blue" start="192" end="207" type="uint"/>
+    <field name="Border Color Float16 Alpha" start="208" end="223" type="uint"/>
+
+    <field name="Border Color Unorm16 Red" start="224" end="239" type="uint"/>
+    <field name="Border Color Unorm16 Green" start="240" end="255" type="uint"/>
+    <field name="Border Color Unorm16 Blue" start="256" end="271" type="uint"/>
+    <field name="Border Color Unorm16 Alpha" start="272" end="287" type="uint"/>
+
+    <field name="Border Color Snorm16 Red" start="288" end="303" type="int"/>
+    <field name="Border Color Snorm16 Green" start="304" end="319" type="int"/>
+    <field name="Border Color Snorm16 Blue" start="320" end="335" type="int"/>
+    <field name="Border Color Snorm16 Alpha" start="336" end="351" type="int"/>
+
+    <field name="Border Color Snorm8 Red" start="352" end="359" type="int"/>
+    <field name="Border Color Snorm8 Green" start="360" end="367" type="int"/>
+    <field name="Border Color Snorm8 Blue" start="368" end="375" type="int"/>
+    <field name="Border Color Snorm8 Alpha" start="376" end="383" type="int"/>
+  </struct>
+
+  <struct name="SAMPLER_STATE" length="4">
+    <field name="Sampler Disable" start="31" end="31" type="bool"/>
+    <field name="LOD PreClamp Enable" start="28" end="28" type="bool"/>
+    <field name="Base Mip Level" start="22" end="26" type="u4.1"/>
+    <field name="Mip Mode Filter" start="20" end="21" type="uint" prefix="MIPFILTER">
+      <value name="NONE" value="0"/>
+      <value name="NEAREST" value="1"/>
+      <value name="LINEAR" value="3"/>
+    </field>
+    <field name="Mag Mode Filter" start="17" end="19" type="uint" prefix="MAPFILTER">
+      <value name="NEAREST" value="0"/>
+      <value name="LINEAR" value="1"/>
+      <value name="ANISOTROPIC" value="2"/>
+      <value name="MONO" value="6"/>
+    </field>
+    <field name="Min Mode Filter" start="14" end="16" type="uint" prefix="MAPFILTER"/>
+    <field name="Texture LOD Bias" start="3" end="13" type="s4.6"/>
+    <field name="Shadow Function" start="0" end="2" type="uint">
+      <value name="PREFILTEROP_ALWAYS" value="0"/>
+      <value name="PREFILTEROP_NEVER" value="1"/>
+      <value name="PREFILTEROP_LESS" value="2"/>
+      <value name="PREFILTEROP_EQUAL" value="3"/>
+      <value name="PREFILTEROP_LEQUAL" value="4"/>
+      <value name="PREFILTEROP_GREATER" value="5"/>
+      <value name="PREFILTEROP_NOTEQUAL" value="6"/>
+      <value name="PREFILTEROP_GEQUAL" value="7"/>
+    </field>
+    <field name="Min LOD" start="54" end="63" type="u4.6"/>
+    <field name="Max LOD" start="44" end="53" type="u4.6"/>
+    <field name="Cube Surface Control Mode" start="41" end="41" type="uint">
+      <value name="CUBECTRLMODE_PROGRAMMED" value="0"/>
+      <value name="CUBECTRLMODE_OVERRIDE" value="1"/>
+    </field>
+    <field name="TCX Address Control Mode" start="38" end="40" type="Texture_Coordinate_Mode"/>
+    <field name="TCY Address Control Mode" start="35" end="37" type="Texture_Coordinate_Mode"/>
+    <field name="TCZ Address Control Mode" start="32" end="34" type="Texture_Coordinate_Mode"/>
+    <field name="Border Color Pointer" start="69" end="95" type="offset"/>
+    <field name="Monochrome Filter Height" start="125" end="127" type="uint"/>
+    <field name="Monochrome Filter Width" start="122" end="124" type="uint"/>
+    <field name="ChromaKey Enable" start="121" end="121" type="bool"/>
+    <field name="ChromaKey Index" start="119" end="120" type="uint"/>
+    <field name="ChromaKey Mode" start="118" end="118" type="uint" prefix="KEYFILTER">
+      <value name="KILL_ON_ANY_MATCH" value="0"/>
+      <value name="REPLACE_BLACK" value="1"/>
+    </field>
+    <field name="Maximum Anisotropy" start="115" end="117" type="uint">
+      <value name="RATIO 2:1" value="0"/>
+      <value name="RATIO 4:1" value="1"/>
+      <value name="RATIO 6:1" value="2"/>
+      <value name="RATIO 8:1" value="3"/>
+      <value name="RATIO 10:1" value="4"/>
+      <value name="RATIO 12:1" value="5"/>
+      <value name="RATIO 14:1" value="6"/>
+      <value name="RATIO 16:1" value="7"/>
+    </field>
+    <field name="R Address Min Filter Rounding Enable" start="109" end="109" type="bool"/>
+    <field name="R Address Mag Filter Rounding Enable" start="110" end="110" type="bool"/>
+    <field name="V Address Min Filter Rounding Enable" start="111" end="111" type="bool"/>
+    <field name="V Address Mag Filter Rounding Enable" start="112" end="112" type="bool"/>
+    <field name="U Address Min Filter Rounding Enable" start="113" end="113" type="bool"/>
+    <field name="U Address Mag Filter Rounding Enable" start="114" end="114" type="bool"/>
+  </struct>
+
   <struct name="RENDER_SURFACE_STATE" length="6">
     <field name="Surface Type" start="29" end="31" type="uint">
       <value name="SURFTYPE_1D" value="0"/>
@@ -27,20 +625,27 @@
       <value name="MIPLAYOUT_BELOW" value="0"/>
       <value name="MIPLAYOUT_RIGHT" value="1"/>
     </field>
+    <field name="Cube Map Corner Mode" start="9" end="9" type="uint">
+      <value name="CUBE_REPLICATE" value="0"/>
+      <value name="CUBE_AVERAGE" value="1"/>
+    </field>
     <field name="Render Cache Read Write Mode" start="8" end="8" type="uint">
       <value name="WRITE_ONLY" value="0"/>
       <value name="READ_WRITE" value="1"/>
     </field>
     <field name="Media Boundary Pixel Mode" start="6" end="7" type="uint">
       <value name="NORMAL_MODE" value="0"/>
-      <value name="PROGRESSIVE_FRAME" value="2"/>
-      <value name="INTERLACED_FRAME" value="3"/>
     </field>
     <field name="Cube Face Enables" start="0" end="5" type="uint"/>
     <field name="Surface Base Address" start="32" end="63" type="address"/>
     <field name="Height" start="83" end="95" type="uint"/>
     <field name="Width" start="70" end="82" type="uint"/>
     <field name="MIP Count / LOD" start="66" end="69" type="uint"/>
+    <field name="Render Target Rotation" start="64" end="65" type="uint">
+      <value name="RTROTATE_0DEG" value="0"/>
+      <value name="RTROTATE_90DEG" value="1"/>
+      <value name="RTROTATE_270DEG" value="3"/>
+    </field>
     <field name="Depth" start="117" end="127" type="uint"/>
     <field name="Surface Pitch" start="99" end="115" type="uint"/>
     <field name="Tiled Surface" start="97" end="97" type="uint"/>
@@ -55,14 +660,14 @@
     <field name="Y Offset" start="180" end="183" type="uint"/>
   </struct>
 
-  <instruction name="3DSTATE_DEPTH_BUFFER" bias="2" length="5">
-    <!-- The hardware docs incorrectly say length is 6. The actual length is 5. -->
+  <instruction name="3DSTATE_DEPTH_BUFFER" bias="2" length="6">
+    <!-- The hardware docs incorrectly say length is 7. The actual length is 6. -->
     <!-- DWord 0 -->
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="5"/>
-    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
     <!-- DWord 1 -->
     <field name="Surface Type" start="61" end="63" type="uint">
       <value name="SURFTYPE_1D" value="0"/>
@@ -110,4 +715,690 @@
     <!-- DWord 6 -->
     <!-- The hardware docs lie. There is no dword 6. -->
   </instruction>
+
+  <struct name="SF_STATE" length="8"> <!-- 8-dword state struct (bits 0..255). Bit positions are absolute from the start of the struct, so a field's dword index is start / 32. -->
+    <field name="Kernel Start Pointer" start="6" end="31" type="offset"/> <!-- DWord 0; field starts at bit 6, so the low 6 bits are not part of the pointer -->
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/> <!-- DWord 1 -->
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/> <!-- DWord 2; shares the dword with the 4-bit scratch size at bits 64..67 -->
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/> <!-- DWord 3 -->
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/> <!-- DWord 4 -->
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="Setup Viewport State Offset" start="165" end="191" type="address"/> <!-- DWord 5; starts at bit 5 of the dword -->
+    <field name="Viewport Transform Enable" start="161" end="161" type="bool"/>
+    <field name="Front Winding" start="160" end="160" type="uint">
+      <value name="FRONTWINDING_CW" value="0"/>
+      <value name="FRONTWINDING_CCW" value="1"/>
+    </field>
+    <field name="Anti-Aliasing Enable" start="223" end="223" type="bool"/> <!-- DWord 6 -->
+    <field name="Cull Mode" start="221" end="222" type="uint" prefix="CULLMODE">
+      <value name="BOTH" value="0"/>
+      <value name="NONE" value="1"/>
+      <value name="FRONT" value="2"/>
+      <value name="BACK" value="3"/>
+    </field>
+    <field name="Fast Scissor Clip Disable" start="220" end="220" type="bool"/>
+    <field name="Line Width" start="216" end="219" type="u3.1"/> <!-- 4-bit field, matching the 3+1 bits named by the u3.1 type -->
+    <field name="Line End Cap Antialiasing Region Width" start="214" end="215" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Point Rasterization Rule" start="212" end="213" type="uint"> <!-- 2-bit field; only encodings 0 and 1 are named here -->
+      <value name="RASTRULE_UPPER_LEFT" value="0"/>
+      <value name="RASTRULE_UPPER_RIGHT" value="1"/>
+    </field>
+    <field name="Zero Pixel Triangle Filter Disable" start="211" end="211" type="bool"/>
+    <field name="2x2 Pixel Triangle Filter Disable" start="210" end="210" type="bool"/>
+    <field name="Scissor Rectangle Enable" start="209" end="209" type="bool"/>
+    <field name="Destination Origin Horizontal Bias" start="205" end="208" type="u0.4"/>
+    <field name="Destination Origin Vertical Bias" start="201" end="204" type="u0.4"/>
+    <field name="Last Pixel Enable" start="255" end="255" type="bool"/> <!-- DWord 7 -->
+    <field name="Triangle Strip/List Provoking Vertex Select" start="253" end="254" type="uint"> <!-- 2-bit field; encodings 0..2 are named -->
+      <value name="Vertex 0" value="0"/>
+      <value name="Vertex 1" value="1"/>
+      <value name="Vertex 2" value="2"/>
+    </field>
+    <field name="Line Strip/List Provoking Vertex Select" start="251" end="252" type="uint"> <!-- 2-bit field; encodings 0..1 are named -->
+      <value name="Vertex 0" value="0"/>
+      <value name="Vertex 1" value="1"/>
+    </field>
+    <field name="Triangle Fan Provoking Vertex Select" start="249" end="250" type="uint">
+      <value name="Vertex 0" value="0"/>
+      <value name="Vertex 1" value="1"/>
+      <value name="Vertex 2" value="2"/>
+    </field>
+    <field name="AA Line Distance Mode" start="238" end="238" type="uint" prefix="AALINEDISTANCE">
+      <value name="MANHATTAN" value="0"/>
+      <value name="TRUE" value="1"/>
+    </field>
+    <field name="Sprite Point Enable" start="237" end="237" type="bool"/>
+    <field name="Vertex Sub Pixel Precision Select" start="236" end="236" type="uint">
+      <value name="8 Sub-Pixel Precision Bits" value="0"/>
+      <value name="4 Sub-Pixel Precision Bits" value="1"/>
+    </field>
+    <field name="Point Width Source" start="235" end="235" type="uint">
+      <value name="Vertex" value="0"/>
+      <value name="State" value="1"/>
+    </field>
+    <field name="Point Width" start="224" end="234" type="u8.3"/> <!-- 11-bit field, matching the 8+3 bits named by the u8.3 type -->
+  </struct>
+
+  <struct name="SF_VIEWPORT" length="8"> <!-- 8-dword struct: six 32-bit float matrix elements (DWords 0..5) followed by a 64-bit scissor rectangle (DWords 6..7). -->
+    <field name="Viewport Matrix Element m00" start="0" end="31" type="float"/> <!-- DWord 0 -->
+    <field name="Viewport Matrix Element m11" start="32" end="63" type="float"/> <!-- DWord 1 -->
+    <field name="Viewport Matrix Element m22" start="64" end="95" type="float"/> <!-- DWord 2 -->
+    <field name="Viewport Matrix Element m30" start="96" end="127" type="float"/> <!-- DWord 3 -->
+    <field name="Viewport Matrix Element m31" start="128" end="159" type="float"/> <!-- DWord 4 -->
+    <field name="Viewport Matrix Element m32" start="160" end="191" type="float"/> <!-- DWord 5 -->
+    <field name="Scissor Rectangle" start="192" end="255" type="SCISSOR_RECT"/> <!-- DWords 6..7; SCISSOR_RECT is a struct type not defined in this part of the file -->
+  </struct>
+
+  <struct name="VERTEX_BUFFER_STATE" length="4"> <!-- 4-dword (128-bit) struct; it is the element type of the repeating group in 3DSTATE_VERTEX_BUFFERS. -->
+    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/> <!-- DWord 0; 5-bit index -->
+    <field name="Buffer Access Type" start="26" end="26" type="uint">
+      <value name="VERTEXDATA" value="0"/>
+      <value name="INSTANCEDATA" value="1"/>
+    </field>
+    <field name="Null Vertex Buffer" start="13" end="13" type="bool"/>
+    <field name="Buffer Pitch" start="0" end="11" type="uint"/> <!-- 12-bit field -->
+    <field name="Buffer Starting Address" start="32" end="63" type="address"/> <!-- DWord 1 -->
+    <field name="End Address" start="64" end="95" type="address"/> <!-- DWord 2 -->
+    <field name="Instance Data Step Rate" start="96" end="127" type="uint"/> <!-- DWord 3 -->
+  </struct>
+
+  <struct name="VERTEX_ELEMENT_STATE" length="2"> <!-- 2-dword (64-bit) struct; it is the element type of the repeating group in 3DSTATE_VERTEX_ELEMENTS. -->
+    <field name="Vertex Buffer Index" start="27" end="31" type="uint"/> <!-- DWord 0 -->
+    <field name="Valid" start="26" end="26" type="bool"/>
+    <field name="Source Element Format" start="16" end="24" type="SURFACE_FORMAT"/> <!-- 9-bit field; SURFACE_FORMAT is not defined in this part of the file -->
+    <field name="Source Element Offset" start="0" end="10" type="uint"/>
+    <field name="Destination Element Offset" start="32" end="39" type="uint"/> <!-- DWord 1 -->
+    <field name="Component 0 Control" start="60" end="62" type="3D_Vertex_Component_Control"/> <!-- the four 3-bit component controls sit at descending bit positions: component 0 is highest -->
+    <field name="Component 1 Control" start="56" end="58" type="3D_Vertex_Component_Control"/>
+    <field name="Component 2 Control" start="52" end="54" type="3D_Vertex_Component_Control"/>
+    <field name="Component 3 Control" start="48" end="50" type="3D_Vertex_Component_Control"/>
+  </struct>
+
+  <struct name="VS_STATE" length="7"> <!-- 7-dword state struct (bits 0..223). DWords 0..3 use the same field names and bit positions as SF_STATE in this file. -->
+    <field name="Kernel Start Pointer" start="6" end="31" type="offset"/> <!-- DWord 0; starts at bit 6 -->
+    <field name="GRF Register Count" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/> <!-- DWord 1 -->
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint">
+      <value name="Normal Priority" value="0"/>
+      <value name="High Priority" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
+    <field name="Mask Stack Exception Enable" start="43" end="43" type="bool"/>
+    <field name="Software  Exception Enable" start="39" end="39" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/> <!-- DWord 2 -->
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/> <!-- DWord 3 -->
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Vertex URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Vertex URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For URB Data" start="96" end="99" type="uint"/>
+    <field name="Maximum Number of Threads" start="153" end="158" type="uint"/> <!-- DWord 4 -->
+    <field name="URB Entry Allocation Size" start="147" end="151" type="uint"/>
+    <field name="Number of URB Entries" start="139" end="146" type="uint"/>
+    <field name="Statistics Enable" start="138" end="138" type="bool"/>
+    <field name="Sampler State Pointer" start="165" end="191" type="address"/> <!-- DWord 5; starts at bit 5 of the dword, above the 3-bit sampler count -->
+    <field name="Sampler Count" start="160" end="162" type="uint"/>
+    <field name="Vertex Cache Disable" start="193" end="193" type="bool"/> <!-- DWord 6 -->
+    <field name="Enable" start="192" end="192" type="bool"/>
+  </struct>
+
+  <struct name="WM_STATE" length="11"> <!-- 11-dword state struct (bits 0..351). Describes four kernel slots: slot 0 in DWord 0 and slots 1..3 in DWords 8..10, each with a start pointer and GRF register count. -->
+    <field name="Kernel Start Pointer 0" start="6" end="31" type="offset"/> <!-- DWord 0 -->
+    <field name="GRF Register Count 0" start="1" end="3" type="uint"/>
+    <field name="Single Program Flow" start="63" end="63" type="bool"/> <!-- DWord 1 -->
+    <field name="Binding Table Entry Count" start="50" end="57" type="uint"/>
+    <field name="Thread Priority" start="49" end="49" type="uint"> <!-- only encoding 1 is named here; SF_STATE and VS_STATE name both 0 and 1 -->
+      <value name="High" value="1"/>
+    </field>
+    <field name="Floating Point Mode" start="48" end="48" type="uint" prefix="FLOATING_POINT_MODE">
+      <value name="IEEE-754" value="0"/>
+      <value name="Alternate" value="1"/>
+    </field>
+    <field name="Depth Coefficient URB Read Offset" start="40" end="45" type="uint"/>
+    <field name="Illegal Opcode Exception Enable" start="36" end="36" type="bool"/> <!-- exception enables sit at bits 36/34/33 here, unlike bits 45/43/39 in SF_STATE and VS_STATE -->
+    <field name="Mask Stack Exception Enable" start="34" end="34" type="bool"/>
+    <field name="Software  Exception Enable" start="33" end="33" type="bool"/>
+    <field name="Scratch Space Base Pointer" start="74" end="95" type="address"/> <!-- DWord 2 -->
+    <field name="Per-Thread Scratch Space" start="64" end="67" type="uint"/>
+    <field name="Constant URB Entry Read Length" start="121" end="126" type="uint"/> <!-- DWord 3 -->
+    <field name="Constant URB Entry Read Offset" start="114" end="119" type="uint"/>
+    <field name="Setup URB Entry Read Length" start="107" end="112" type="uint"/>
+    <field name="Setup URB Entry Read Offset" start="100" end="105" type="uint"/>
+    <field name="Dispatch GRF Start Register For Constant/Setup Data 0" start="96" end="99" type="uint"/>
+    <field name="Sampler State Pointer" start="133" end="159" type="address"/> <!-- DWord 4 -->
+    <field name="Sampler Count" start="130" end="132" type="uint"/>
+    <field name="Statistics Enable" start="128" end="128" type="bool"/>
+    <field name="Maximum Number of Threads" start="185" end="191" type="uint"/> <!-- DWord 5; 7-bit field -->
+    <field name="Legacy Diamond Line Rasterization" start="183" end="183" type="bool"/>
+    <field name="Pixel Shader Kills Pixel" start="182" end="182" type="bool"/>
+    <field name="Pixel Shader Computed Depth" start="181" end="181" type="bool"/>
+    <field name="Pixel Shader Uses Source Depth" start="180" end="180" type="bool"/>
+    <field name="Thread Dispatch Enable" start="179" end="179" type="bool"/>
+    <field name="Early Depth Test Enable" start="178" end="178" type="bool"/>
+    <field name="Line End Cap Antialiasing Region Width" start="176" end="177" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Line Antialiasing Region Width" start="174" end="175" type="uint">
+      <value name="0.5 pixels" value="0"/>
+      <value name="1.0 pixels" value="1"/>
+      <value name="2.0 pixels" value="2"/>
+      <value name="4.0 pixels" value="3"/>
+    </field>
+    <field name="Polygon Stipple Enable" start="173" end="173" type="bool"/>
+    <field name="Global Depth Offset Enable" start="172" end="172" type="bool"/>
+    <field name="Line Stipple Enable" start="171" end="171" type="bool"/>
+    <field name="Legacy Global Depth Bias Enable" start="170" end="170" type="bool"/>
+    <field name="Hierarchical Depth Buffer Resolve Enable" start="169" end="169" type="bool"/>
+    <field name="Depth Buffer Resolve Enable" start="168" end="168" type="bool"/>
+    <field name="Depth Buffer Clear" start="167" end="167" type="bool"/>
+    <field name="Fast Span Coverage Enable" start="166" end="166" type="bool"/>
+    <field name="Contiguous 64 Pixel Dispatch Enable" start="164" end="164" type="bool"/> <!-- bits 160..164 are five separate dispatch-enable bits; bit 165 has no field -->
+    <field name="Contiguous 32 Pixel Dispatch Enable" start="163" end="163" type="bool"/>
+    <field name="32 Pixel Dispatch Enable" start="162" end="162" type="bool"/>
+    <field name="16 Pixel Dispatch Enable" start="161" end="161" type="bool"/>
+    <field name="8 Pixel Dispatch Enable" start="160" end="160" type="bool"/>
+    <field name="Global Depth Offset Constant" start="192" end="223" type="float"/> <!-- DWord 6 -->
+    <field name="Global Depth Offset Scale" start="224" end="255" type="float"/> <!-- DWord 7 -->
+    <field name="Kernel Start Pointer 1" start="262" end="287" type="offset"/> <!-- DWord 8; same in-dword bit positions as kernel slot 0 -->
+    <field name="GRF Register Count 1" start="257" end="259" type="uint"/>
+    <field name="Kernel Start Pointer 2" start="294" end="319" type="offset"/> <!-- DWord 9 -->
+    <field name="GRF Register Count 2" start="289" end="291" type="uint"/>
+    <field name="Kernel Start Pointer 3" start="326" end="351" type="offset"/> <!-- DWord 10 -->
+    <field name="GRF Register Count 3" start="321" end="323" type="uint"/>
+  </struct>
+
+  <instruction name="3DPRIMITIVE" bias="2" length="6"> <!-- 6-dword command: header DWord 0 plus five 32-bit parameters. DWord Length default 4 equals length 6 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0: header fields carry fixed defaults -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="3"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="Vertex Access Type" start="15" end="15" type="uint">
+      <value name="SEQUENTIAL" value="0"/>
+      <value name="RANDOM" value="1"/>
+    </field>
+    <field name="Primitive Topology Type" start="10" end="14" type="3D_Prim_Topo_Type"/> <!-- 5-bit field; 3D_Prim_Topo_Type is not defined in this part of the file -->
+    <field name="Indirect Vertex Count" start="9" end="9" type="uint"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
+    <field name="Vertex Count Per Instance" start="32" end="63" type="uint"/> <!-- DWord 1 -->
+    <field name="Start Vertex Location" start="64" end="95" type="uint"/> <!-- DWord 2 -->
+    <field name="Instance Count" start="96" end="127" type="uint"/> <!-- DWord 3 -->
+    <field name="Start Instance Location" start="128" end="159" type="uint"/> <!-- DWord 4 -->
+    <field name="Base Vertex Location" start="160" end="191" type="int"/> <!-- DWord 5; the only signed parameter in this command -->
+  </instruction>
+
+  <instruction name="3DSTATE_AA_LINE_PARAMETERS" bias="2" length="3"> <!-- 3-dword command; DWord Length default 1 equals length 3 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="10"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="AA Coverage Bias" start="48" end="55" type="u0.8"/> <!-- DWord 1; both fields are 8-bit u0.8 values -->
+    <field name="AA Coverage Slope" start="32" end="39" type="u0.8"/>
+    <field name="AA Coverage EndCap Bias" start="80" end="87" type="u0.8"/> <!-- DWord 2; same in-dword layout as DWord 1 -->
+    <field name="AA Coverage EndCap Slope" start="64" end="71" type="u0.8"/>
+  </instruction>
+
+  <instruction name="3DSTATE_BINDING_TABLE_POINTERS" bias="2" length="6"> <!-- 6-dword command: header plus one binding-table offset per dword for VS, GS, CLIP, SF and PS. DWord Length default 4 equals length 6 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
+    <field name="Pointer to VS Binding Table" start="37" end="63" type="offset"/> <!-- DWord 1; every pointer field starts at bit 5 of its dword, so the low 5 bits are not part of it -->
+    <field name="Pointer to GS Binding Table" start="69" end="95" type="offset"/> <!-- DWord 2 -->
+    <field name="Pointer to CLIP Binding Table" start="101" end="127" type="offset"/> <!-- DWord 3 -->
+    <field name="Pointer to SF Binding Table" start="133" end="159" type="offset"/> <!-- DWord 4 -->
+    <field name="Pointer to PS Binding Table" start="165" end="191" type="offset"/> <!-- DWord 5 -->
+  </instruction>
+
+  <instruction name="3DSTATE_CLEAR_PARAMS" bias="2" length="2"> <!-- 2-dword command; DWord Length default 0 equals length 2 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="16"/>
+    <field name="Depth Clear Value Valid" start="15" end="15" type="bool"/> <!-- lives in the header dword, not next to the value it qualifies -->
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Depth Clear Value" start="32" end="63" type="uint"/> <!-- DWord 1; declared as a raw 32-bit uint -->
+  </instruction>
+
+  <instruction name="3DSTATE_CONSTANT_COLOR" bias="2" length="5"> <!-- 5-dword command: header plus four 32-bit float channels in R, G, B, A order. DWord Length default 3 equals length 5 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
+    <field name="Blend Constant Color Red" start="32" end="63" type="float"/> <!-- DWord 1 -->
+    <field name="Blend Constant Color Green" start="64" end="95" type="float"/> <!-- DWord 2 -->
+    <field name="Blend Constant Color Blue" start="96" end="127" type="float"/> <!-- DWord 3 -->
+    <field name="Blend Constant Color Alpha" start="128" end="159" type="float"/> <!-- DWord 4 -->
+  </instruction>
+
+  <instruction name="3DSTATE_DRAWING_RECTANGLE" bias="2" length="4"> <!-- 4-dword command; DWord Length default 2 equals length 4 minus bias 2. Each payload dword packs Y in the high 16 bits and X in the low 16 bits. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Clipped Drawing Rectangle Y Min" start="48" end="63" type="uint"/> <!-- DWord 1: unsigned minimum corner -->
+    <field name="Clipped Drawing Rectangle X Min" start="32" end="47" type="uint"/>
+    <field name="Clipped Drawing Rectangle Y Max" start="80" end="95" type="uint"/> <!-- DWord 2: unsigned maximum corner -->
+    <field name="Clipped Drawing Rectangle X Max" start="64" end="79" type="uint"/>
+    <field name="Drawing Rectangle Origin Y" start="112" end="127" type="int"/> <!-- DWord 3: origin is declared signed, unlike the min/max corners -->
+    <field name="Drawing Rectangle Origin X" start="96" end="111" type="int"/>
+  </instruction>
+
+  <instruction name="3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" bias="2" length="2"> <!-- 2-dword command carrying one float; DWord Length default 0 equals length 2 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="9"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Global Depth Offset Clamp" start="32" end="63" type="float"/> <!-- DWord 1 -->
+  </instruction>
+
+  <instruction name="3DSTATE_HIER_DEPTH_BUFFER" bias="2" length="3"> <!-- 3-dword command; DWord Length default 1 equals length 3 minus bias 2. Payload layout is identical to 3DSTATE_STENCIL_BUFFER in this file; only the sub opcode differs (15 here). -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="15"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Surface Pitch" start="32" end="48" type="uint"/> <!-- DWord 1; 17-bit field -->
+    <field name="Surface Base Address" start="64" end="95" type="address"/> <!-- DWord 2 -->
+  </instruction>
+
+  <instruction name="3DSTATE_INDEX_BUFFER" bias="2" length="3"> <!-- 3-dword command: header (which also holds format and cut-index control) plus start and end addresses. DWord Length default 1 equals length 3 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="10"/>
+    <field name="Cut Index Enable" start="10" end="10" type="bool"/>
+    <field name="Index Format" start="8" end="9" type="uint" prefix="INDEX"> <!-- 2-bit field; encodings 0..2 are named -->
+      <value name="BYTE" value="0"/>
+      <value name="WORD" value="1"/>
+      <value name="DWORD" value="2"/>
+    </field>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Buffer Starting Address" start="32" end="63" type="address"/> <!-- DWord 1 -->
+    <field name="Buffer Ending Address" start="64" end="95" type="address"/> <!-- DWord 2 -->
+  </instruction>
+
+  <instruction name="3DSTATE_LINE_STIPPLE" bias="2" length="3"> <!-- 3-dword command; DWord Length default 1 equals length 3 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="8"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Modify Enable" start="63" end="63" type="bool"/> <!-- DWord 1 -->
+    <field name="Current Repeat Counter" start="53" end="61" type="uint"/>
+    <field name="Current Stipple Index" start="48" end="51" type="uint"/>
+    <field name="Line Stipple Pattern" start="32" end="47" type="uint"/> <!-- 16-bit pattern in the low half of DWord 1 -->
+    <field name="Line Stipple Inverse Repeat Count" start="80" end="95" type="u1.13"/> <!-- DWord 2. NOTE(review): the field spans 16 bits while u1.13 names 14; confirm the intended fixed-point format against the hardware documentation -->
+    <field name="Line Stipple Repeat Count" start="64" end="72" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_POLY_STIPPLE_OFFSET" bias="2" length="2"> <!-- 2-dword command; DWord Length default 0 equals length 2 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="6"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Polygon Stipple X Offset" start="40" end="44" type="uint"/> <!-- DWord 1; both offsets are 5-bit fields (range 0..31) -->
+    <field name="Polygon Stipple Y Offset" start="32" end="36" type="uint"/>
+  </instruction>
+
+  <instruction name="3DSTATE_POLY_STIPPLE_PATTERN" bias="2" length="33"> <!-- 33-dword command: header plus 32 pattern rows. DWord Length default 31 equals length 33 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="7"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="31"/>
+    <group count="32" start="32" size="32"> <!-- 32 repetitions of a 32-bit element, beginning at bit 32 (DWord 1) -->
+      <field name="Pattern Row" start="0" end="31" type="uint"/> <!-- bit positions inside a group are relative to the element, not the command -->
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_PIPELINED_POINTERS" bias="2" length="7"> <!-- 7-dword command: header plus one state pointer per dword for VS, GS, CLIP, SF, WM and Color Calc. DWord Length default 5 equals length 7 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="5"/>
+    <field name="Pointer to VS State" start="37" end="63" type="address"/> <!-- DWord 1; every pointer starts at bit 5 of its dword -->
+    <field name="Pointer to GS State" start="69" end="95" type="address"/> <!-- DWord 2 -->
+    <field name="GS Enable" start="64" end="64" type="bool"/> <!-- bit 0 of DWord 2, below the GS pointer -->
+    <field name="Pointer to CLIP State" start="101" end="127" type="address"/> <!-- DWord 3 -->
+    <field name="Clip Enable" start="96" end="96" type="bool"/> <!-- bit 0 of DWord 3, below the CLIP pointer -->
+    <field name="Pointer to SF State" start="133" end="159" type="address"/> <!-- DWord 4 -->
+    <field name="Pointer to WM State" start="165" end="191" type="address"/> <!-- DWord 5 -->
+    <field name="Pointer to Color Calc State" start="197" end="223" type="address"/> <!-- DWord 6 -->
+  </instruction>
+
+  <instruction name="3DSTATE_STENCIL_BUFFER" bias="2" length="3"> <!-- 3-dword command; DWord Length default 1 equals length 3 minus bias 2. Payload layout is identical to 3DSTATE_HIER_DEPTH_BUFFER in this file; only the sub opcode differs (14 here). -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="14"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Surface Pitch" start="32" end="48" type="uint"/> <!-- DWord 1; 17-bit field -->
+    <field name="Surface Base Address" start="64" end="95" type="address"/> <!-- DWord 2 -->
+  </instruction>
+
+  <instruction name="3DSTATE_VERTEX_BUFFERS" bias="2"> <!-- No length attribute: the payload is a repeating group of 128-bit VERTEX_BUFFER_STATE entries after the header dword. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="8"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="3"/> <!-- default 3 matches header plus exactly one 4-dword entry (1 + 4 minus bias 2) -->
+    <group count="0" start="32" size="128"> <!-- presumably count 0 marks a variable number of entries; confirm against the generator -->
+      <field name="Vertex Buffer State" start="0" end="127" type="VERTEX_BUFFER_STATE"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_VERTEX_ELEMENTS" bias="2"> <!-- No length attribute: the payload is a repeating group of 64-bit VERTEX_ELEMENT_STATE entries after the header dword. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="9"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/> <!-- default 1 matches header plus exactly one 2-dword entry (1 + 2 minus bias 2) -->
+    <group count="0" start="32" size="64"> <!-- presumably count 0 marks a variable number of entries; confirm against the generator -->
+      <field name="Element" start="0" end="63" type="VERTEX_ELEMENT_STATE"/>
+    </group>
+  </instruction>
+
+  <instruction name="3DSTATE_VF_STATISTICS" bias="1" length="1"> <!-- Single-dword command with bias 1; it declares no DWord Length field. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="1"/> <!-- SubType default is 1 here, whereas the multi-dword 3DSTATE commands in this file use 3 -->
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="11"/>
+    <field name="Statistics Enable" start="0" end="0" type="bool"/> <!-- the only payload: bit 0 of the header dword -->
+  </instruction>
+
+  <instruction name="CS_URB_STATE" bias="2" length="2"> <!-- 2-dword command; DWord Length default 0 equals length 2 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/> <!-- SubType default is 0 here, not the 3 used by the 3DSTATE commands in this file -->
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="URB Entry Allocation Size" start="36" end="40" type="uint"/> <!-- DWord 1; 5-bit field -->
+    <field name="Number of URB Entries" start="32" end="34" type="uint"/> <!-- 3-bit field -->
+  </instruction>
+
+  <instruction name="CONSTANT_BUFFER" bias="2" length="2"> <!-- 2-dword command; DWord Length default 0 equals length 2 minus bias 2. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/> <!-- DWord 0 (header) -->
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="2"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="Valid" start="8" end="8" type="bool"/> <!-- bit 8 of the header dword, directly above DWord Length -->
+    <field name="Buffer Starting Address" start="38" end="63" type="address"/> <!-- DWord 1; address starts at bit 6 and shares the dword with the 6-bit length -->
+    <field name="Buffer Length" start="32" end="37" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_FLUSH" bias="1" length="1"> <!-- Single-dword MI command with bias 1; it has no DWord Length field and all controls are flag bits in the low bits of DWord 0. -->
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/> <!-- MI commands in this file use Command Type 0 and a 6-bit opcode at bits 23..28 -->
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="4"/>
+    <field name="Protected Memory Enable" start="6" end="6" type="bool"/>
+    <field name="Indirect State Pointers Disable" start="5" end="5" type="bool"/>
+    <field name="Generic Media State Clear" start="4" end="4" type="bool"/>
+    <field name="Global Snapshot Count Reset" start="3" end="3" type="uint">
+      <value name="Don't Reset" value="0"/>
+      <value name="Reset" value="1"/>
+    </field>
+    <field name="Render Cache Flush Inhibit" start="2" end="2" type="uint"> <!-- inhibit bit: 0 is named Flush, 1 is named Don't Flush -->
+      <value name="Flush" value="0"/>
+      <value name="Don't Flush" value="1"/>
+    </field>
+    <field name="State/Instruction Cache Invalidate" start="1" end="1" type="uint">
+      <value name="Don't Invalidate" value="0"/>
+      <value name="Invalidate" value="1"/>
+    </field>
+  </instruction>
+
+  <instruction name="MI_LOAD_REGISTER_IMM" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="34"/>
+    <field name="Byte Write Disables" start="8" end="11" type="uint"/>
+    <field name="DWord Length" start="0" end="5" type="uint" default="1"/>
+    <field name="Register Offset" start="34" end="63" type="offset"/>
+    <field name="Data DWord" start="64" end="95" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_STORE_DATA_IMM" bias="2" length="5">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="32"/>
+    <field name="Memory Address Type" start="22" end="22" type="bool"/>
+    <field name="DWord Length" start="0" end="5" type="uint" default="2"/>
+    <field name="Physical Start Address Extension" start="32" end="35" type="address"/>
+    <field name="Address" start="66" end="95" type="address"/>
+    <field name="Data DWord 0" start="96" end="127" type="uint"/>
+    <field name="Data DWord 1" start="128" end="159" type="uint"/>
+  </instruction>
+
+  <instruction name="MI_STORE_REGISTER_MEM" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="0"/>
+    <field name="MI Command Opcode" start="23" end="28" type="uint" default="36"/>
+    <field name="Use Global GTT" start="22" end="22" type="bool"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="Register Address" start="34" end="57" type="offset"/>
+    <field name="Memory Address" start="66" end="95" type="address"/>
+  </instruction>
+
+  <instruction name="PIPE_CONTROL" bias="2" length="4">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="2"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
+    <field name="Post Sync Operation" start="14" end="15" type="uint">
+      <value name="No Write" value="0"/>
+      <value name="Write Immediate Data" value="1"/>
+      <value name="Write PS Depth Count" value="2"/>
+      <value name="Write Timestamp" value="3"/>
+    </field>
+    <field name="Depth Stall Enable" start="13" end="13" type="bool"/>
+    <field name="Write Cache Flush" start="12" end="12" type="bool"/>
+    <field name="Instruction Cache Invalidate Enable" start="11" end="11" type="bool"/>
+    <field name="Texture Cache Flush Enable" start="10" end="10" type="bool"/>
+    <field name="Indirect State Pointers Disable" start="9" end="9" type="bool"/>
+    <field name="Notify Enable" start="8" end="8" type="bool"/>
+    <field name="Address" start="35" end="63" type="address"/>
+    <field name="Destination Address Type" start="34" end="34" type="uint" prefix="DAT">
+      <value name="PGTT" value="0"/>
+      <value name="GGTT" value="1"/>
+    </field>
+    <field name="Stall At Pixel Scoreboard" start="33" end="33" type="bool"/>
+    <field name="Depth Cache Flush Inhibit" start="32" end="32" type="uint">
+      <value name="Flushed" value="0"/>
+      <value name="Not Flushed" value="1"/>
+    </field>
+    <field name="Immediate Data" start="64" end="127" type="uint"/>
+  </instruction>
+
+  <instruction name="PIPELINE_SELECT" bias="1" length="1">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="4"/>
+    <field name="Pipeline Selection" start="0" end="1" type="uint">
+      <value name="3D" value="0"/>
+      <value name="Media" value="1"/>
+      <value name="GPGPU" value="2"/>
+    </field>
+  </instruction>
+
+  <instruction name="STATE_BASE_ADDRESS" bias="2" length="8">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="6"/>
+    <field name="General State Base Address" start="44" end="63" type="address"/>
+    <field name="General State Base Address Modify Enable" start="32" end="32" type="bool"/>
+    <field name="Surface State Base Address" start="76" end="95" type="address"/>
+    <field name="Surface State Base Address Modify Enable" start="64" end="64" type="bool"/>
+    <field name="Indirect Object Base Address" start="108" end="127" type="address"/>
+    <field name="Indirect Object Base Address Modify Enable" start="96" end="96" type="bool"/>
+    <field name="Instruction Base Address" start="140" end="159" type="address"/>
+    <field name="Instruction Base Address Modify Enable" start="128" end="128" type="bool"/>
+    <field name="General State Access Upper Bound" start="172" end="191" type="address"/>
+    <field name="General State Access Upper Bound Modify Enable" start="160" end="160" type="bool"/>
+    <field name="Indirect Object Access Upper Bound" start="204" end="223" type="address"/>
+    <field name="Indirect Object Access Upper Bound Modify Enable" start="192" end="192" type="bool"/>
+    <field name="Instruction Access Upper Bound" start="236" end="255" type="address"/>
+    <field name="Instruction Access Upper Bound Modify Enable" start="224" end="224" type="bool"/>
+  </instruction>
+
+  <instruction name="STATE_SIP" bias="2" length="2">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="1"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="2"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
+    <field name="System Instruction Pointer" start="36" end="63" type="offset"/>
+  </instruction>
+
+  <instruction name="URB_FENCE" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="3"/>
+    <field name="Command SubType" start="27" end="28" type="uint" default="0"/>
+    <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
+    <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="0"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="CS Unit URB Reallocation Request" start="13" end="13" type="bool"/>
+    <field name="VFE Unit URB Reallocation Request" start="12" end="12" type="bool"/>
+    <field name="SF Unit URB Reallocation Request" start="11" end="11" type="bool"/>
+    <field name="CLIP Unit URB Reallocation Request" start="10" end="10" type="bool"/>
+    <field name="GS Unit URB Reallocation Request" start="9" end="9" type="bool"/>
+    <field name="VS Unit URB Reallocation Request" start="8" end="8" type="bool"/>
+    <field name="CLIP Fence" start="52" end="61" type="uint"/>
+    <field name="GS Fence" start="42" end="51" type="uint"/>
+    <field name="VS Fence" start="32" end="41" type="uint"/>
+    <field name="CS Fence" start="84" end="94" type="uint"/>
+    <field name="VFE Fence" start="74" end="83" type="uint"/>
+    <field name="SF Fence" start="64" end="73" type="uint"/>
+  </instruction>
+
+  <instruction name="XY_COLOR_BLT" bias="2" length="6">
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="80"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="4"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Clipping Enabled" start="62" end="62" type="bool"/>
+    <field name="Color Depth" start="56" end="58" type="uint" prefix="COLOR_DEPTH">
+      <value name="8 bit" value="0"/>
+      <value name="565" value="1"/>
+      <value name="1555" value="2"/>
+      <value name="32 bit" value="3"/>
+    </field>
+    <field name="Raster Operation" start="48" end="55" type="int"/>
+    <field name="Destination Pitch" start="32" end="47" type="int"/>
+    <field name="Destination Y1 Coordinate" start="80" end="95" type="int"/>
+    <field name="Destination X1 Coordinate" start="64" end="79" type="int"/>
+    <field name="Destination Y2 Coordinate" start="112" end="127" type="int"/>
+    <field name="Destination X2 Coordinate" start="96" end="111" type="int"/>
+    <field name="Destination Base Address" start="128" end="159" type="address"/>
+    <field name="Solid Pattern Color" start="160" end="191" type="int"/>
+  </instruction>
+
+  <instruction name="XY_SETUP_BLT" bias="2" length="8">
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="1"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="6"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Clipping Enabled" start="62" end="62" type="bool"/>
+    <field name="Mono Source Transparency Mode" start="60" end="60" type="bool"/>
+    <field name="Color Depth" start="56" end="57" type="uint" prefix="COLOR_DEPTH">
+      <value name="8 bit" value="0"/>
+      <value name="565" value="1"/>
+      <value name="1555" value="2"/>
+      <value name="32 bit" value="3"/>
+    </field>
+    <field name="Raster Operation" start="48" end="55" type="int"/>
+    <field name="Destination Pitch" start="32" end="47" type="int"/>
+    <field name="ClipRect Y1 Coordinate" start="80" end="95" type="int"/>
+    <field name="ClipRect X1 Coordinate" start="64" end="79" type="int"/>
+    <field name="ClipRect Y2 Coordinate" start="112" end="127" type="int"/>
+    <field name="ClipRect X2 Coordinate" start="96" end="111" type="int"/>
+    <field name="Destination Base Address" start="128" end="159" type="address"/>
+    <field name="Background Color" start="160" end="191" type="uint"/>
+    <field name="Foreground Color" start="192" end="223" type="uint"/>
+    <field name="Pattern Base Address" start="224" end="255" type="uint"/>
+  </instruction>
+
+  <instruction name="XY_SRC_COPY_BLT" bias="2" length="8">
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="83"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="6"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Source Tiling Enable" start="15" end="15" type="bool"/>
+    <field name="Destination Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Clipping Enabled" start="62" end="62" type="bool"/>
+    <field name="Color Depth" start="56" end="58" type="uint" prefix="COLOR_DEPTH">
+      <value name="8 bit" value="0"/>
+      <value name="565" value="1"/>
+      <value name="1555" value="2"/>
+      <value name="32 bit" value="3"/>
+    </field>
+    <field name="Raster Operation" start="48" end="55" type="int"/>
+    <field name="Destination Pitch" start="32" end="47" type="int"/>
+    <field name="Destination Y1 Coordinate" start="80" end="95" type="int"/>
+    <field name="Destination X1 Coordinate" start="64" end="79" type="int"/>
+    <field name="Destination Y2 Coordinate" start="112" end="127" type="int"/>
+    <field name="Destination X2 Coordinate" start="96" end="111" type="int"/>
+    <field name="Destination Base Address" start="128" end="159" type="address"/>
+    <field name="Source Y1 Coordinate" start="176" end="191" type="int"/>
+    <field name="Source X1 Coordinate" start="160" end="175" type="int"/>
+    <field name="Source Pitch" start="192" end="207" type="int"/>
+    <field name="Source Base Address" start="224" end="255" type="address"/>
+  </instruction>
+
+  <instruction name="XY_TEXT_IMMEDIATE_BLT" bias="2" length="3">
+    <field name="Command Type" start="29" end="31" type="uint" default="2"/>
+    <field name="2D Command Opcode" start="22" end="28" type="uint" default="49"/>
+    <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
+    <field name="32bpp Byte Mask" start="20" end="21" type="uint"/>
+    <field name="Packing" start="16" end="16" type="uint">
+      <value name="Bit Packed" value="0"/>
+      <value name="Byte Packed" value="1"/>
+    </field>
+    <field name="Tiling Enable" start="11" end="11" type="bool"/>
+    <field name="Destination Pitch" start="32" end="47" type="int"/>
+    <field name="Destination Y1 Coordinate" start="48" end="63" type="int"/>
+    <field name="Destination X1 Coordinate" start="32" end="47" type="int"/>
+    <field name="Destination Y2 Coordinate" start="80" end="95" type="int"/>
+    <field name="Destination X2 Coordinate" start="64" end="79" type="int"/>
+  </instruction>
 </genxml>
diff --git a/src/intel/genxml/gen6.xml b/src/intel/genxml/gen6.xml
index 5083f07..8aa0335 100644
--- a/src/intel/genxml/gen6.xml
+++ b/src/intel/genxml/gen6.xml
@@ -394,10 +394,10 @@
   </struct>
 
   <struct name="CLIP_VIEWPORT" length="4">
-    <field name="XMin Clip Guardband" start="0" end="31" type="uint"/>
-    <field name="XMax Clip Guardband" start="32" end="63" type="uint"/>
-    <field name="YMin Clip Guardband" start="64" end="95" type="uint"/>
-    <field name="YMax Clip Guardband" start="96" end="127" type="uint"/>
+    <field name="XMin Clip Guardband" start="0" end="31" type="float"/>
+    <field name="XMax Clip Guardband" start="32" end="63" type="float"/>
+    <field name="YMin Clip Guardband" start="64" end="95" type="float"/>
+    <field name="YMax Clip Guardband" start="96" end="127" type="float"/>
   </struct>
 
   <struct name="SCISSOR_RECT" length="2">
@@ -452,8 +452,8 @@
     <field name="Post-Blend Color Clamp Enable" start="32" end="32" type="bool"/>
   </struct>
 
-  <struct name="BLEND_STATE" length="16">
-    <group count="8" start="0" size="64">
+  <struct name="BLEND_STATE" length="0">
+    <group count="0" start="0" size="64">
       <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
     </group>
   </struct>
@@ -684,9 +684,9 @@
       <value name="CUBECTRLMODE_PROGRAMMED" value="0"/>
       <value name="CUBECTRLMODE_OVERRIDE" value="1"/>
     </field>
-    <field name="TCX Address Control Mode" start="38" end="40" type="uint"/>
-    <field name="TCY Address Control Mode" start="35" end="37" type="uint"/>
-    <field name="TCZ Address Control Mode" start="32" end="34" type="uint"/>
+    <field name="TCX Address Control Mode" start="38" end="40" type="Texture Coordinate Mode"/>
+    <field name="TCY Address Control Mode" start="35" end="37" type="Texture Coordinate Mode"/>
+    <field name="TCZ Address Control Mode" start="32" end="34" type="Texture Coordinate Mode"/>
     <field name="Border Color Pointer" start="69" end="95" type="offset"/>
     <field name="Monochrome Filter Height: Reserved" start="125" end="127" type="uint"/>
     <field name="Monochrome Filter Width" start="122" end="124" type="uint"/>
@@ -773,8 +773,8 @@
     <field name="BLEND_STATE Change" start="32" end="32" type="bool"/>
     <field name="Pointer to DEPTH_STENCIL_STATE" start="70" end="95" type="offset"/>
     <field name="DEPTH_STENCIL_STATE Change" start="64" end="64" type="bool"/>
-    <field name="Pointer to COLOR_CALC_STATE" start="102" end="127" type="offset"/>
-    <field name="COLOR_CALC_STATE Change" start="96" end="96" type="bool"/>
+    <field name="Color Calc State Pointer" start="102" end="127" type="offset"/>
+    <field name="Color Calc State Pointer Valid" start="96" end="96" type="bool"/>
   </instruction>
 
   <instruction name="3DSTATE_CHROMA_KEY" bias="2" length="4">
@@ -806,7 +806,7 @@
     <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
     <field name="Statistics Enable" start="42" end="42" type="bool"/>
     <field name="User Clip Distance Cull Test Enable Bitmask" start="32" end="39" type="uint"/>
-    <field name="CLIP Enable" start="95" end="95" type="bool"/>
+    <field name="Clip Enable" start="95" end="95" type="bool"/>
     <field name="API Mode" start="94" end="94" type="uint">
       <value name="APIMODE_OGL" value="0"/>
       <value name="APIMODE_D3D" value="1"/>
@@ -1006,12 +1006,15 @@
     <field name="Statistics Enable" start="170" end="170" type="bool"/>
     <field name="SO Statistics Enable" start="169" end="169" type="bool"/>
     <field name="Rendering Enabled" start="168" end="168" type="uint"/>
-    <field name="Reorder Enable" start="222" end="222" type="bool"/>
+    <field name="Reorder Mode" start="222" end="222" type="uint">
+      <value name="LEADING" value="0"/>
+      <value name="TRAILING" value="1"/>
+    </field>
     <field name="Discard Adjacency" start="221" end="221" type="bool"/>
     <field name="SVBI Payload Enable" start="220" end="220" type="bool"/>
     <field name="SVBI Post-Increment Enable" start="219" end="219" type="bool"/>
     <field name="SVBI Post-Increment Value" start="208" end="217" type="uint"/>
-    <field name="Function Enable" start="207" end="207" type="bool"/>
+    <field name="Enable" start="207" end="207" type="bool"/>
   </instruction>
 
   <instruction name="3DSTATE_GS_SVB_INDEX" bias="2" length="7">
@@ -1044,6 +1047,7 @@
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="10"/>
     <field name="Memory Object Control State" start="12" end="15" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="IndexBufferMOCS" start="12" end="15" type="uint"/>
     <field name="Cut Index Enable" start="10" end="10" type="bool"/>
     <field name="Index Format" start="8" end="9" type="uint" prefix="INDEX">
       <value name="BYTE" value="0"/>
@@ -1065,7 +1069,7 @@
     <field name="Current Repeat Counter" start="53" end="61" type="uint"/>
     <field name="Current Stipple Index" start="48" end="51" type="uint"/>
     <field name="Line Stipple Pattern" start="32" end="47" type="uint"/>
-    <field name="Line Stipple Inverse Repeat Count" start="79" end="95" type="u1.16"/>
+    <field name="Line Stipple Inverse Repeat Count" start="80" end="95" type="u1.13"/>
     <field name="Line Stipple Repeat Count" start="64" end="72" type="uint"/>
   </instruction>
 
@@ -1086,8 +1090,8 @@
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="13"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
     <field name="Pixel Location" start="36" end="36" type="uint">
-      <value name="PIXLOC_CENTER" value="0"/>
-      <value name="PIXLOC_UL_CORNER" value="1"/>
+      <value name="CENTER" value="0"/>
+      <value name="UL_CORNER" value="1"/>
     </field>
     <field name="Number of Multisamples" start="33" end="35" type="uint">
       <value name="NUMSAMPLES_1" value="0"/>
@@ -1215,7 +1219,7 @@
       <value name="FRONTWINDING_CW" value="0"/>
       <value name="FRONTWINDING_CCW" value="1"/>
     </field>
-    <field name="Anti-aliasing Enable" start="127" end="127" type="bool"/>
+    <field name="Anti-Aliasing Enable" start="127" end="127" type="bool"/>
     <field name="Cull Mode" start="125" end="126" type="uint" prefix="CULLMODE">
       <value name="BOTH" value="0"/>
       <value name="NONE" value="1"/>
@@ -1230,7 +1234,12 @@
       <value name="4.0 pixels" value="3"/>
     </field>
     <field name="Scissor Rectangle Enable" start="107" end="107" type="bool"/>
-    <field name="Multisample Rasterization Mode" start="104" end="105" type="uint"/>
+    <field name="Multisample Rasterization Mode" start="104" end="105" type="uint">
+      <value name="MSRASTMODE_OFF_PIXEL" value="0"/>
+      <value name="MSRASTMODE_OFF_PATTERN" value="1"/>
+      <value name="MSRASTMODE_ON_PIXEL" value="2"/>
+      <value name="MSRASTMODE_ON_PATTERN" value="3"/>
+    </field>
     <field name="Last Pixel Enable" start="159" end="159" type="bool"/>
     <field name="Triangle Strip/List Provoking Vertex Select" start="157" end="158" type="uint">
       <value name="Vertex 0" value="0"/>
@@ -1253,9 +1262,9 @@
       <value name="8 Sub-Pixel Precision Bits" value="0"/>
       <value name="4 Sub-Pixel Precision Bits" value="1"/>
     </field>
-    <field name="Use Point Width State" start="139" end="139" type="uint">
-      <value name="Use Point Width on Vertex" value="0"/>
-      <value name="Use Point Width from State" value="1"/>
+    <field name="Point Width Source" start="139" end="139" type="uint">
+      <value name="Vertex" value="0"/>
+      <value name="State" value="1"/>
     </field>
     <field name="Point Width" start="128" end="138" type="u8.3"/>
     <field name="Global Depth Offset Constant" start="160" end="191" type="float"/>
@@ -1383,7 +1392,7 @@
     <field name="Maximum Number of Threads" start="185" end="191" type="uint"/>
     <field name="Statistics Enable" start="170" end="170" type="bool"/>
     <field name="Vertex Cache Disable" start="161" end="161" type="bool"/>
-    <field name="Function Enable" start="160" end="160" type="bool"/>
+    <field name="Enable" start="160" end="160" type="bool"/>
   </instruction>
 
   <instruction name="3DSTATE_WM" bias="2" length="9">
@@ -1392,7 +1401,7 @@
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="20"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="7"/>
-    <field name="Kernel Start Pointer[0]" start="38" end="63" type="offset"/>
+    <field name="Kernel Start Pointer 0" start="38" end="63" type="offset"/>
     <field name="Single Program Flow" start="95" end="95" type="bool"/>
     <field name="Vector Mask Enable" start="94" end="94" type="bool"/>
     <field name="Sampler Count" start="91" end="93" type="uint"/>
@@ -1400,9 +1409,9 @@
     <field name="Thread Priority" start="81" end="81" type="uint">
       <value name="High" value="1"/>
     </field>
-    <field name="Floating Point Mode" start="80" end="80" type="uint">
+    <field name="Floating Point Mode" start="80" end="80" type="uint" prefix="FLOATING_POINT_MODE">
       <value name="IEEE-745" value="0"/>
-      <value name="Alt" value="1"/>
+      <value name="Alternate" value="1"/>
     </field>
     <field name="Illegal Opcode Exception Enable" start="77" end="77" type="bool"/>
     <field name="MaskStack Exception Enable" start="75" end="75" type="bool"/>
@@ -1475,8 +1484,8 @@
       <value name="MSDISPMODE_PERSAMPLE" value="0"/>
       <value name="MSDISPMODE_PERPIXEL" value="1"/>
     </field>
-    <field name="Kernel Start Pointer[1]" start="230" end="255" type="offset"/>
-    <field name="Kernel Start Pointer[2]" start="262" end="287" type="offset"/>
+    <field name="Kernel Start Pointer 1" start="230" end="255" type="offset"/>
+    <field name="Kernel Start Pointer 2" start="262" end="287" type="offset"/>
   </instruction>
 
   <instruction name="MEDIA_CURBE_LOAD" bias="2" length="4">
diff --git a/src/intel/genxml/gen7.xml b/src/intel/genxml/gen7.xml
index ada8f74..993d102 100644
--- a/src/intel/genxml/gen7.xml
+++ b/src/intel/genxml/gen7.xml
@@ -378,15 +378,13 @@
   </struct>
 
   <struct name="3DSTATE_CONSTANT_BODY" length="6">
-    <field name="Constant Buffer 1 Read Length" start="16" end="31" type="uint"/>
-    <field name="Constant Buffer 0 Read Length" start="0" end="15" type="uint"/>
-    <field name="Constant Buffer 3 Read Length" start="48" end="63" type="uint"/>
-    <field name="Constant Buffer 2 Read Length" start="32" end="47" type="uint"/>
-    <field name="Pointer To Constant Buffer 0" start="69" end="95" type="address"/>
+    <group count="4" start="0" size="16">
+      <field name="Read Length" start="0" end="15" type="uint"/>
+    </group>
     <field name="Constant Buffer Object Control State" start="64" end="68" type="MEMORY_OBJECT_CONTROL_STATE"/>
-    <field name="Pointer To Constant Buffer 1" start="101" end="127" type="address"/>
-    <field name="Pointer To Constant Buffer 2" start="133" end="159" type="address"/>
-    <field name="Pointer To Constant Buffer 3" start="165" end="191" type="address"/>
+    <group count="4" start="64" size="32">
+      <field name="Buffer" start="5" end="31" type="address"/>
+    </group>
   </struct>
 
   <struct name="VERTEX_BUFFER_STATE" length="4">
@@ -507,8 +505,8 @@
     <field name="Post-Blend Color Clamp Enable" start="32" end="32" type="bool"/>
   </struct>
 
-  <struct name="BLEND_STATE" length="16">
-    <group count="8" start="0" size="64">
+  <struct name="BLEND_STATE" length="0">
+    <group count="0" start="0" size="64">
       <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
     </group>
   </struct>
@@ -786,9 +784,9 @@
       <value name="LOW" value="3"/>
     </field>
     <field name="Non-normalized Coordinate Enable" start="106" end="106" type="bool"/>
-    <field name="TCX Address Control Mode" start="102" end="104" type="uint"/>
-    <field name="TCY Address Control Mode" start="99" end="101" type="uint"/>
-    <field name="TCZ Address Control Mode" start="96" end="98" type="uint"/>
+    <field name="TCX Address Control Mode" start="102" end="104" type="Texture Coordinate Mode"/>
+    <field name="TCY Address Control Mode" start="99" end="101" type="Texture Coordinate Mode"/>
+    <field name="TCZ Address Control Mode" start="96" end="98" type="Texture Coordinate Mode"/>
   </struct>
 
   <instruction name="3DPRIMITIVE" bias="2" length="7">
@@ -1101,7 +1099,7 @@
     <field name="Statistics Enable" start="170" end="170" type="bool"/>
     <field name="Compute W Coordinate Enable" start="162" end="162" type="bool"/>
     <field name="DS Cache Disable" start="161" end="161" type="bool"/>
-    <field name="Function Enable" start="160" end="160" type="bool"/>
+    <field name="Enable" start="160" end="160" type="bool"/>
   </instruction>
 
   <instruction name="3DSTATE_GS" bias="2" length="7">
@@ -1157,9 +1155,12 @@
     <field name="GS Invocations Increment Value" start="165" end="169" type="uint"/>
     <field name="Include Primitive ID" start="164" end="164" type="bool"/>
     <field name="Hint" start="163" end="163" type="uint"/>
-    <field name="Reorder Enable" start="162" end="162" type="bool"/>
+    <field name="Reorder Mode" start="162" end="162" type="uint">
+      <value name="LEADING" value="0"/>
+      <value name="TRAILING" value="1"/>
+    </field>
     <field name="Discard Adjacency" start="161" end="161" type="bool"/>
-    <field name="Function Enable" start="160" end="160" type="bool"/>
+    <field name="Enable" start="160" end="160" type="bool"/>
     <field name="Semaphore Handle" start="192" end="203" type="offset"/>
   </instruction>
 
@@ -1196,7 +1197,7 @@
     <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
     <field name="Software Exception Enable" start="39" end="39" type="bool"/>
     <field name="Maximum Number of Threads" start="32" end="38" type="uint"/>
-    <field name="Function Enable" start="95" end="95" type="bool"/>
+    <field name="Enable" start="95" end="95" type="bool"/>
     <field name="Statistics Enable" start="93" end="93" type="bool"/>
     <field name="Instance Count" start="64" end="67" type="uint"/>
     <field name="Kernel Start Pointer" start="102" end="127" type="offset"/>
@@ -1217,6 +1218,7 @@
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="10"/>
     <field name="Memory Object Control State" start="12" end="15" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="IndexBufferMOCS" start="12" end="15" type="uint"/>
     <field name="Cut Index Enable" start="10" end="10" type="bool"/>
     <field name="Index Format" start="8" end="9" type="uint" prefix="INDEX">
       <value name="BYTE" value="0"/>
@@ -1259,8 +1261,8 @@
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="13"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
     <field name="Pixel Location" start="36" end="36" type="uint">
-      <value name="PIXLOC_CENTER" value="0"/>
-      <value name="PIXLOC_UL_CORNER" value="1"/>
+      <value name="CENTER" value="0"/>
+      <value name="UL_CORNER" value="1"/>
     </field>
     <field name="Number of Multisamples" start="33" end="35" type="uint">
       <value name="NUMSAMPLES_1" value="0"/>
@@ -1602,7 +1604,12 @@
       <value name="4.0 pixels" value="3"/>
     </field>
     <field name="Scissor Rectangle Enable" start="75" end="75" type="bool"/>
-    <field name="Multisample Rasterization Mode" start="72" end="73" type="uint"/>
+    <field name="Multisample Rasterization Mode" start="72" end="73" type="uint">
+      <value name="MSRASTMODE_OFF_PIXEL" value="0"/>
+      <value name="MSRASTMODE_OFF_PATTERN" value="1"/>
+      <value name="MSRASTMODE_ON_PIXEL" value="2"/>
+      <value name="MSRASTMODE_ON_PATTERN" value="3"/>
+    </field>
     <field name="Last Pixel Enable" start="127" end="127" type="bool"/>
     <field name="Triangle Strip/List Provoking Vertex Select" start="125" end="126" type="uint">
       <value name="Vertex 0" value="0"/>
@@ -1637,6 +1644,7 @@
     <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
     <field name="SO Buffer Index" start="61" end="62" type="uint"/>
     <field name="SO Buffer Object Control State" start="57" end="60" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="SO Buffer MOCS" start="57" end="60" type="uint"/>
     <field name="Surface Pitch" start="32" end="43" type="uint"/>
     <field name="Surface Base Address" start="66" end="95" type="address"/>
     <field name="Surface End Address" start="98" end="127" type="address"/>
@@ -1855,7 +1863,7 @@
     <field name="Maximum Number of Threads" start="185" end="191" type="uint"/>
     <field name="Statistics Enable" start="170" end="170" type="bool"/>
     <field name="Vertex Cache Disable" start="161" end="161" type="bool"/>
-    <field name="Function Enable" start="160" end="160" type="bool"/>
+    <field name="Enable" start="160" end="160" type="bool"/>
   </instruction>
 
   <instruction name="3DSTATE_WM" bias="2" length="3">
diff --git a/src/intel/genxml/gen75.xml b/src/intel/genxml/gen75.xml
index 16d2d74..105effa 100644
--- a/src/intel/genxml/gen75.xml
+++ b/src/intel/genxml/gen75.xml
@@ -362,6 +362,15 @@
     <value name="RAW" value="511"/>
   </enum>
 
+  <enum name="Shader Channel Select" prefix="SCS">
+    <value name="ZERO" value="0"/>
+    <value name="ONE" value="1"/>
+    <value name="RED" value="4"/>
+    <value name="GREEN" value="5"/>
+    <value name="BLUE" value="6"/>
+    <value name="ALPHA" value="7"/>
+  </enum>
+
   <enum name="Texture Coordinate Mode" prefix="TCM">
     <value name="WRAP" value="0"/>
     <value name="MIRROR" value="1"/>
@@ -377,15 +386,13 @@
   </struct>
 
   <struct name="3DSTATE_CONSTANT_BODY" length="6">
-    <field name="Constant Buffer 1 Read Length" start="16" end="31" type="uint"/>
-    <field name="Constant Buffer 0 Read Length" start="0" end="15" type="uint"/>
-    <field name="Constant Buffer 3 Read Length" start="48" end="63" type="uint"/>
-    <field name="Constant Buffer 2 Read Length" start="32" end="47" type="uint"/>
-    <field name="Pointer To Constant Buffer 0" start="69" end="95" type="address"/>
+    <group count="4" start="0" size="16">
+      <field name="Read Length" start="0" end="15" type="uint"/>
+    </group>
     <field name="Constant Buffer Object Control State" start="64" end="68" type="MEMORY_OBJECT_CONTROL_STATE"/>
-    <field name="Pointer To Constant Buffer 1" start="101" end="127" type="address"/>
-    <field name="Pointer To Constant Buffer 2" start="133" end="159" type="address"/>
-    <field name="Pointer To Constant Buffer 3" start="165" end="191" type="address"/>
+    <group count="4" start="64" size="32">
+      <field name="Buffer" start="5" end="31" type="address"/>
+    </group>
   </struct>
 
   <struct name="BINDING_TABLE_EDIT_ENTRY" length="1">
@@ -517,8 +524,8 @@
     <field name="Post-Blend Color Clamp Enable" start="32" end="32" type="bool"/>
   </struct>
 
-  <struct name="BLEND_STATE" length="16">
-    <group count="8" start="0" size="64">
+  <struct name="BLEND_STATE" length="0">
+    <group count="0" start="0" size="64">
       <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
     </group>
   </struct>
@@ -693,17 +700,10 @@
     <field name="Green Clear Color" start="254" end="254" type="uint" />
     <field name="Blue Clear Color" start="253" end="253" type="uint" />
     <field name="Alpha Clear Color" start="252" end="252" type="uint" />
-    <field name="Shader Channel Select Red" start="249" end="251" type="uint">
-      <value name="SCS_ZERO" value="0"/>
-      <value name="SCS_ONE" value="1"/>
-      <value name="SCS_RED" value="4"/>
-      <value name="SCS_GREEN" value="5"/>
-      <value name="SCS_BLUE" value="6"/>
-      <value name="SCS_ALPHA" value="7"/>
-    </field>
-    <field name="Shader Channel Select Green" start="246" end="248" type="uint"/>
-    <field name="Shader Channel Select Blue" start="243" end="245" type="uint"/>
-    <field name="Shader Channel Select Alpha" start="240" end="242" type="uint"/>
+    <field name="Shader Channel Select Red" start="249" end="251" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Green" start="246" end="248" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Blue" start="243" end="245" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Alpha" start="240" end="242" type="Shader Channel Select"/>
     <field name="Resource Min LOD" start="224" end="235" type="u4.8"/>
   </struct>
 
@@ -807,9 +807,72 @@
       <value name="LOW" value="3"/>
     </field>
     <field name="Non-normalized Coordinate Enable" start="106" end="106" type="bool"/>
-    <field name="TCX Address Control Mode" start="102" end="104" type="uint"/>
-    <field name="TCY Address Control Mode" start="99" end="101" type="uint"/>
-    <field name="TCZ Address Control Mode" start="96" end="98" type="uint"/>
+    <field name="TCX Address Control Mode" start="102" end="104" type="Texture Coordinate Mode"/>
+    <field name="TCY Address Control Mode" start="99" end="101" type="Texture Coordinate Mode"/>
+    <field name="TCZ Address Control Mode" start="96" end="98" type="Texture Coordinate Mode"/>
+  </struct>
+
+  <struct name="MI_MATH_ALU_INSTRUCTION" length="1">
+    <field name="ALU Opcode" start="20" end="31" type="uint" prefix="MI_ALU">
+      <value name="NOOP" value="0x000"/>
+      <value name="LOAD" value="0x080"/>
+      <value name="LOADINV" value="0x480"/>
+      <value name="LOAD0" value="0x081"/>
+      <value name="LOAD1" value="0x481"/>
+      <value name="ADD" value="0x100"/>
+      <value name="SUB" value="0x101"/>
+      <value name="AND" value="0x102"/>
+      <value name="OR" value="0x103"/>
+      <value name="XOR" value="0x104"/>
+      <value name="STORE" value="0x180"/>
+      <value name="STOREINV" value="0x580"/>
+    </field>
+    <field name="Operand 1" start="10" end="19" type="uint" prefix="MI_ALU">
+      <value name="REG0" value="0x00"/>
+      <value name="REG1" value="0x01"/>
+      <value name="REG2" value="0x02"/>
+      <value name="REG3" value="0x03"/>
+      <value name="REG4" value="0x04"/>
+      <value name="REG5" value="0x05"/>
+      <value name="REG6" value="0x06"/>
+      <value name="REG7" value="0x07"/>
+      <value name="REG8" value="0x08"/>
+      <value name="REG9" value="0x09"/>
+      <value name="REG10" value="0x0a"/>
+      <value name="REG11" value="0x0b"/>
+      <value name="REG12" value="0x0c"/>
+      <value name="REG13" value="0x0d"/>
+      <value name="REG14" value="0x0e"/>
+      <value name="REG15" value="0x0f"/>
+      <value name="SRCA" value="0x20"/>
+      <value name="SRCB" value="0x21"/>
+      <value name="ACCU" value="0x31"/>
+      <value name="ZF" value="0x32"/>
+      <value name="CF" value="0x33"/>
+    </field>
+    <field name="Operand 2" start="0" end="9" type="uint" prefix="MI_ALU">
+      <value name="REG0" value="0x00"/>
+      <value name="REG1" value="0x01"/>
+      <value name="REG2" value="0x02"/>
+      <value name="REG3" value="0x03"/>
+      <value name="REG4" value="0x04"/>
+      <value name="REG5" value="0x05"/>
+      <value name="REG6" value="0x06"/>
+      <value name="REG7" value="0x07"/>
+      <value name="REG8" value="0x08"/>
+      <value name="REG9" value="0x09"/>
+      <value name="REG10" value="0x0a"/>
+      <value name="REG11" value="0x0b"/>
+      <value name="REG12" value="0x0c"/>
+      <value name="REG13" value="0x0d"/>
+      <value name="REG14" value="0x0e"/>
+      <value name="REG15" value="0x0f"/>
+      <value name="SRCA" value="0x20"/>
+      <value name="SRCB" value="0x21"/>
+      <value name="ACCU" value="0x31"/>
+      <value name="ZF" value="0x32"/>
+      <value name="CF" value="0x33"/>
+    </field>
   </struct>
 
   <instruction name="3DPRIMITIVE" bias="2" length="7">
@@ -1229,7 +1292,7 @@
     <field name="Statistics Enable" start="170" end="170" type="bool"/>
     <field name="Compute W Coordinate Enable" start="162" end="162" type="bool"/>
     <field name="DS Cache Disable" start="161" end="161" type="bool"/>
-    <field name="Function Enable" start="160" end="160" type="bool"/>
+    <field name="Enable" start="160" end="160" type="bool"/>
   </instruction>
 
   <instruction name="3DSTATE_GATHER_CONSTANT_DS" bias="2">
@@ -1377,7 +1440,7 @@
       <value name="TRAILING" value="1"/>
     </field>
     <field name="Discard Adjacency" start="161" end="161" type="bool"/>
-    <field name="Function Enable" start="160" end="160" type="bool"/>
+    <field name="Enable" start="160" end="160" type="bool"/>
     <field name="Control Data Format" start="223" end="223" type="uint">
       <value name="GSCTL_CUT" value="0"/>
       <value name="GSCTL_SID" value="1"/>
@@ -1421,7 +1484,7 @@
     <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
     <field name="Software Exception Enable" start="44" end="44" type="bool"/>
     <field name="Maximum Number of Threads" start="32" end="39" type="uint"/>
-    <field name="Function Enable" start="95" end="95" type="bool"/>
+    <field name="Enable" start="95" end="95" type="bool"/>
     <field name="Statistics Enable" start="93" end="93" type="bool"/>
     <field name="Instance Count" start="64" end="67" type="uint"/>
     <field name="Kernel Start Pointer" start="102" end="127" type="offset"/>
@@ -1443,6 +1506,7 @@
     <field name="3D Command Opcode" start="24" end="26" type="uint" default="0"/>
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="10"/>
     <field name="Memory Object Control State" start="12" end="15" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="IndexBufferMOCS" start="12" end="15" type="uint"/>
     <field name="Index Format" start="8" end="9" type="uint" prefix="INDEX">
       <value name="BYTE" value="0"/>
       <value name="WORD" value="1"/>
@@ -1485,8 +1549,8 @@
     <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
     <field name="Multi Sample Enable" start="37" end="37" type="bool"/>
     <field name="Pixel Location" start="36" end="36" type="uint">
-      <value name="PIXLOC_CENTER" value="0"/>
-      <value name="PIXLOC_UL_CORNER" value="1"/>
+      <value name="CENTER" value="0"/>
+      <value name="UL_CORNER" value="1"/>
     </field>
     <field name="Number of Multisamples" start="33" end="35" type="uint">
       <value name="NUMSAMPLES_1" value="0"/>
@@ -1894,6 +1958,7 @@
     <field name="DWord Length" start="0" end="7" type="uint" default="2"/>
     <field name="SO Buffer Index" start="61" end="62" type="uint"/>
     <field name="SO Buffer Object Control State" start="57" end="60" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="SO Buffer MOCS" start="57" end="60" type="uint"/>
     <field name="Surface Pitch" start="32" end="43" type="uint"/>
     <field name="Surface Base Address" start="66" end="95" type="address"/>
     <field name="Surface End Address" start="98" end="127" type="address"/>
@@ -2128,7 +2193,7 @@
     <field name="Maximum Number of Threads" start="183" end="191" type="uint"/>
     <field name="Statistics Enable" start="170" end="170" type="bool"/>
     <field name="Vertex Cache Disable" start="161" end="161" type="bool"/>
-    <field name="Function Enable" start="160" end="160" type="bool"/>
+    <field name="Enable" start="160" end="160" type="bool"/>
   </instruction>
 
   <instruction name="3DSTATE_WM" bias="2" length="3">
@@ -2594,10 +2659,8 @@
     <field name="Command Type" start="29" end="31" type="uint" default="0"/>
     <field name="MI Command Opcode" start="23" end="28" type="uint" default="26"/>
     <field name="DWord Length" start="0" end="5" type="uint" default="0"/>
-    <field name="ALU INSTRUCTION 1" start="32" end="63" type="uint"/>
-    <field name="ALU INSTRUCTION 2" start="64" end="95" type="uint"/>
-    <group count="0" start="96" size="32">
-      <field name="ALU INSTRUCTION n" start="0" end="31" type="uint"/>
+    <group count="0" start="32" size="32">
+      <field name="Instruction" start="0" end="31" type="MI_MATH_ALU_INSTRUCTION"/>
     </group>
   </instruction>
 
diff --git a/src/intel/genxml/gen8.xml b/src/intel/genxml/gen8.xml
index 1390fe6..99c4aca 100644
--- a/src/intel/genxml/gen8.xml
+++ b/src/intel/genxml/gen8.xml
@@ -403,14 +403,12 @@
   </enum>
 
   <struct name="3DSTATE_CONSTANT_BODY" length="10">
-    <field name="Constant Buffer 1 Read Length" start="16" end="31" type="uint"/>
-    <field name="Constant Buffer 0 Read Length" start="0" end="15" type="uint"/>
-    <field name="Constant Buffer 3 Read Length" start="48" end="63" type="uint"/>
-    <field name="Constant Buffer 2 Read Length" start="32" end="47" type="uint"/>
-    <field name="Pointer To Constant Buffer 0" start="69" end="127" type="address"/>
-    <field name="Pointer To Constant Buffer 1" start="133" end="191" type="address"/>
-    <field name="Pointer To Constant Buffer 2" start="197" end="255" type="address"/>
-    <field name="Pointer To Constant Buffer 3" start="261" end="319" type="address"/>
+    <group count="4" start="0" size="16">
+      <field name="Read Length" start="0" end="15" type="uint"/>
+    </group>
+    <group count="4" start="64" size="64">
+      <field name="Buffer" start="5" end="63" type="address"/>
+    </group>
   </struct>
 
   <struct name="BINDING_TABLE_EDIT_ENTRY" length="1">
@@ -546,7 +544,7 @@
     <field name="Write Disable Blue" start="0" end="0" type="bool"/>
   </struct>
 
-  <struct name="BLEND_STATE" length="17">
+  <struct name="BLEND_STATE" length="1">
     <field name="Alpha To Coverage Enable" start="31" end="31" type="bool"/>
     <field name="Independent Alpha Blend Enable" start="30" end="30" type="bool"/>
     <field name="Alpha To One Enable" start="29" end="29" type="bool"/>
@@ -556,7 +554,7 @@
     <field name="Color Dither Enable" start="23" end="23" type="bool"/>
     <field name="X Dither Offset" start="21" end="22" type="uint"/>
     <field name="Y Dither Offset" start="19" end="20" type="uint"/>
-    <group count="8" start="32" size="64">
+    <group count="0" start="32" size="64">
       <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
     </group>
   </struct>
@@ -568,7 +566,7 @@
 
   <struct name="COLOR_CALC_STATE" length="6">
     <field name="Stencil Reference Value" start="24" end="31" type="uint"/>
-    <field name="BackFace Stencil Reference Value" start="16" end="23" type="uint"/>
+    <field name="Backface Stencil Reference Value" start="16" end="23" type="uint"/>
     <field name="Round Disable Function Disable" start="15" end="15" type="bool"/>
     <field name="Alpha Test Format" start="0" end="0" type="uint">
       <value name="ALPHATEST_UNORM8" value="0"/>
@@ -734,14 +732,14 @@
     <field name="Separate UV Plane Enable" start="223" end="223" type="bool"/>
     <field name="X Offset for U or UV Plane" start="208" end="221" type="uint"/>
     <field name="Y Offset for U or UV Plane" start="192" end="205" type="uint"/>
-    <field name="Red Clear Color" start="255" end="255" type="uint"/>
-    <field name="Green Clear Color" start="254" end="254" type="uint"/>
-    <field name="Blue Clear Color" start="253" end="253" type="uint"/>
-    <field name="Alpha Clear Color" start="252" end="252" type="uint"/>
-    <field name="Shader Channel Select Red" start="249" end="251" type="uint"/>
-    <field name="Shader Channel Select Green" start="246" end="248" type="uint"/>
-    <field name="Shader Channel Select Blue" start="243" end="245" type="uint"/>
-    <field name="Shader Channel Select Alpha" start="240" end="242" type="uint"/>
+    <field name="Red Clear Color" start="255" end="255" type="Clear Color"/>
+    <field name="Green Clear Color" start="254" end="254" type="Clear Color"/>
+    <field name="Blue Clear Color" start="253" end="253" type="Clear Color"/>
+    <field name="Alpha Clear Color" start="252" end="252" type="Clear Color"/>
+    <field name="Shader Channel Select Red" start="249" end="251" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Green" start="246" end="248" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Blue" start="243" end="245" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Alpha" start="240" end="242" type="Shader Channel Select"/>
     <field name="Resource Min LOD" start="224" end="235" type="u4.8"/>
     <field name="Surface Base Address" start="256" end="319" type="address"/>
     <field name="X Offset for V Plane" start="368" end="381" type="uint"/>
@@ -849,9 +847,9 @@
       <value name="LOW" value="3"/>
     </field>
     <field name="Non-normalized Coordinate Enable" start="106" end="106" type="bool"/>
-    <field name="TCX Address Control Mode" start="102" end="104" type="uint"/>
-    <field name="TCY Address Control Mode" start="99" end="101" type="uint"/>
-    <field name="TCZ Address Control Mode" start="96" end="98" type="uint"/>
+    <field name="TCX Address Control Mode" start="102" end="104" type="Texture Coordinate Mode"/>
+    <field name="TCY Address Control Mode" start="99" end="101" type="Texture Coordinate Mode"/>
+    <field name="TCZ Address Control Mode" start="96" end="98" type="Texture Coordinate Mode"/>
   </struct>
 
   <struct name="SAMPLER_STATE_8X8_AVS_COEFFICIENTS" length="8">
@@ -881,6 +879,69 @@
     <field name="Table 1Y Filter Coefficient[n,4]" start="224" end="231" type="s1.6"/>
   </struct>
 
+  <struct name="MI_MATH_ALU_INSTRUCTION" length="1">
+    <field name="ALU Opcode" start="20" end="31" type="uint" prefix="MI_ALU">
+      <value name="NOOP" value="0x000"/>
+      <value name="LOAD" value="0x080"/>
+      <value name="LOADINV" value="0x480"/>
+      <value name="LOAD0" value="0x081"/>
+      <value name="LOAD1" value="0x481"/>
+      <value name="ADD" value="0x100"/>
+      <value name="SUB" value="0x101"/>
+      <value name="AND" value="0x102"/>
+      <value name="OR" value="0x103"/>
+      <value name="XOR" value="0x104"/>
+      <value name="STORE" value="0x180"/>
+      <value name="STOREINV" value="0x580"/>
+    </field>
+    <field name="Operand 1" start="10" end="19" type="uint" prefix="MI_ALU">
+      <value name="REG0" value="0x00"/>
+      <value name="REG1" value="0x01"/>
+      <value name="REG2" value="0x02"/>
+      <value name="REG3" value="0x03"/>
+      <value name="REG4" value="0x04"/>
+      <value name="REG5" value="0x05"/>
+      <value name="REG6" value="0x06"/>
+      <value name="REG7" value="0x07"/>
+      <value name="REG8" value="0x08"/>
+      <value name="REG9" value="0x09"/>
+      <value name="REG10" value="0x0a"/>
+      <value name="REG11" value="0x0b"/>
+      <value name="REG12" value="0x0c"/>
+      <value name="REG13" value="0x0d"/>
+      <value name="REG14" value="0x0e"/>
+      <value name="REG15" value="0x0f"/>
+      <value name="SRCA" value="0x20"/>
+      <value name="SRCB" value="0x21"/>
+      <value name="ACCU" value="0x31"/>
+      <value name="ZF" value="0x32"/>
+      <value name="CF" value="0x33"/>
+    </field>
+    <field name="Operand 2" start="0" end="9" type="uint" prefix="MI_ALU">
+      <value name="REG0" value="0x00"/>
+      <value name="REG1" value="0x01"/>
+      <value name="REG2" value="0x02"/>
+      <value name="REG3" value="0x03"/>
+      <value name="REG4" value="0x04"/>
+      <value name="REG5" value="0x05"/>
+      <value name="REG6" value="0x06"/>
+      <value name="REG7" value="0x07"/>
+      <value name="REG8" value="0x08"/>
+      <value name="REG9" value="0x09"/>
+      <value name="REG10" value="0x0a"/>
+      <value name="REG11" value="0x0b"/>
+      <value name="REG12" value="0x0c"/>
+      <value name="REG13" value="0x0d"/>
+      <value name="REG14" value="0x0e"/>
+      <value name="REG15" value="0x0f"/>
+      <value name="SRCA" value="0x20"/>
+      <value name="SRCB" value="0x21"/>
+      <value name="ACCU" value="0x31"/>
+      <value name="ZF" value="0x32"/>
+      <value name="CF" value="0x33"/>
+    </field>
+  </struct>
+
   <instruction name="3DPRIMITIVE" bias="2" length="7">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
@@ -1287,7 +1348,7 @@
     </field>
     <field name="Compute W Coordinate Enable" start="226" end="226" type="bool"/>
     <field name="Cache Disable" start="225" end="225" type="bool"/>
-    <field name="Function Enable" start="224" end="224" type="bool"/>
+    <field name="Enable" start="224" end="224" type="bool"/>
     <field name="Vertex URB Entry Output Read Offset" start="277" end="282" type="uint"/>
     <field name="Vertex URB Entry Output Length" start="272" end="276" type="uint"/>
     <field name="User Clip Distance Clip Test Enable Bitmask" start="264" end="271" type="uint"/>
@@ -1443,7 +1504,7 @@
       <value name="TRAILING" value="1"/>
     </field>
     <field name="Discard Adjacency" start="225" end="225" type="bool"/>
-    <field name="Function Enable" start="224" end="224" type="bool"/>
+    <field name="Enable" start="224" end="224" type="bool"/>
     <field name="Control Data Format" start="287" end="287" type="uint">
       <value name="CUT" value="0"/>
       <value name="SID" value="1"/>
@@ -1492,7 +1553,7 @@
     </field>
     <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
     <field name="Software Exception Enable" start="44" end="44" type="bool"/>
-    <field name="Function Enable" start="95" end="95" type="bool"/>
+    <field name="Enable" start="95" end="95" type="bool"/>
     <field name="Statistics Enable" start="93" end="93" type="bool"/>
     <field name="Maximum Number of Threads" start="72" end="80" type="uint"/>
     <field name="Instance Count" start="64" end="67" type="uint"/>
@@ -1520,6 +1581,7 @@
       <value name="DWORD" value="2"/>
     </field>
     <field name="Memory Object Control State" start="32" end="38" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="IndexBufferMOCS" start="32" end="38" type="uint"/>
     <field name="Buffer Starting Address" start="64" end="127" type="address"/>
     <field name="Buffer Size" start="128" end="159" type="uint"/>
   </instruction>
@@ -1969,6 +2031,7 @@
     <field name="Legacy Global Depth Bias Enable" start="43" end="43" type="bool"/>
     <field name="Statistics Enable" start="42" end="42" type="bool"/>
     <field name="Viewport Transform Enable" start="33" end="33" type="bool"/>
+    <field name="CHV Line Width" start="44" end="61" type="u11.7"/>
     <field name="Line Width" start="82" end="91" type="u3.7"/>
     <field name="Line End Cap Antialiasing Region Width" start="80" end="81" type="uint">
       <value name="0.5 pixels" value="0"/>
@@ -2001,6 +2064,7 @@
     <field name="SO Buffer Enable" start="63" end="63" type="bool"/>
     <field name="SO Buffer Index" start="61" end="62" type="uint"/>
     <field name="SO Buffer Object Control State" start="54" end="60" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="SO Buffer MOCS" start="54" end="60" type="uint"/>
     <field name="Stream Offset Write Enable" start="53" end="53" type="bool"/>
     <field name="Stream Output Buffer Offset Address Enable" start="52" end="52" type="bool"/>
     <field name="Surface Base Address" start="66" end="111" type="address"/>
@@ -2288,7 +2352,7 @@
     <field name="Statistics Enable" start="234" end="234" type="bool"/>
     <field name="SIMD8 Dispatch Enable" start="226" end="226" type="bool"/>
     <field name="Vertex Cache Disable" start="225" end="225" type="bool"/>
-    <field name="Function Enable" start="224" end="224" type="bool"/>
+    <field name="Enable" start="224" end="224" type="bool"/>
     <field name="Vertex URB Entry Output Read Offset" start="277" end="282" type="uint"/>
     <field name="Vertex URB Entry Output Length" start="272" end="276" type="uint"/>
     <field name="User Clip Distance Clip Test Enable Bitmask" start="264" end="271" type="uint"/>
@@ -2822,10 +2886,8 @@
     <field name="Command Type" start="29" end="31" type="uint" default="0"/>
     <field name="MI Command Opcode" start="23" end="28" type="uint" default="26"/>
     <field name="DWord Length" start="0" end="5" type="uint" default="0"/>
-    <field name="ALU INSTRUCTION 1" start="32" end="63" type="uint"/>
-    <field name="ALU INSTRUCTION 2" start="64" end="95" type="uint"/>
-    <group count="0" start="96" size="32">
-      <field name="ALU INSTRUCTION n" start="0" end="31" type="uint"/>
+    <group count="0" start="32" size="32">
+      <field name="Instruction" start="0" end="31" type="MI_MATH_ALU_INSTRUCTION"/>
     </group>
   </instruction>
 
diff --git a/src/intel/genxml/gen9.xml b/src/intel/genxml/gen9.xml
index 4bf0fb6..1422463 100644
--- a/src/intel/genxml/gen9.xml
+++ b/src/intel/genxml/gen9.xml
@@ -424,14 +424,12 @@
   </enum>
 
   <struct name="3DSTATE_CONSTANT_BODY" length="10">
-    <field name="Constant Buffer 1 Read Length" start="16" end="31" type="uint"/>
-    <field name="Constant Buffer 0 Read Length" start="0" end="15" type="uint"/>
-    <field name="Constant Buffer 3 Read Length" start="48" end="63" type="uint"/>
-    <field name="Constant Buffer 2 Read Length" start="32" end="47" type="uint"/>
-    <field name="Pointer To Constant Buffer 0" start="69" end="127" type="address"/>
-    <field name="Pointer To Constant Buffer 1" start="133" end="191" type="address"/>
-    <field name="Pointer To Constant Buffer 2" start="197" end="255" type="address"/>
-    <field name="Pointer To Constant Buffer 3" start="261" end="319" type="address"/>
+    <group count="4" start="0" size="16">
+      <field name="Read Length" start="0" end="15" type="uint"/>
+    </group>
+    <group count="4" start="64" size="64">
+      <field name="Buffer" start="5" end="63" type="address"/>
+    </group>
   </struct>
 
   <struct name="BINDING_TABLE_EDIT_ENTRY" length="1">
@@ -555,7 +553,7 @@
     <field name="Write Disable Blue" start="0" end="0" type="bool"/>
   </struct>
 
-  <struct name="BLEND_STATE" length="17">
+  <struct name="BLEND_STATE" length="1">
     <field name="Alpha To Coverage Enable" start="31" end="31" type="bool"/>
     <field name="Independent Alpha Blend Enable" start="30" end="30" type="bool"/>
     <field name="Alpha To One Enable" start="29" end="29" type="bool"/>
@@ -565,7 +563,7 @@
     <field name="Color Dither Enable" start="23" end="23" type="bool"/>
     <field name="X Dither Offset" start="21" end="22" type="uint"/>
     <field name="Y Dither Offset" start="19" end="20" type="uint"/>
-    <group count="8" start="32" size="64">
+    <group count="0" start="32" size="64">
       <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
     </group>
   </struct>
@@ -782,10 +780,10 @@
       <value name="Vertical" value="1"/>
     </field>
     <field name="Memory Compression Enable" start="254" end="254" type="bool"/>
-    <field name="Shader Channel Select Red" start="249" end="251" type="uint"/>
-    <field name="Shader Channel Select Green" start="246" end="248" type="uint"/>
-    <field name="Shader Channel Select Blue" start="243" end="245" type="uint"/>
-    <field name="Shader Channel Select Alpha" start="240" end="242" type="uint"/>
+    <field name="Shader Channel Select Red" start="249" end="251" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Green" start="246" end="248" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Blue" start="243" end="245" type="Shader Channel Select"/>
+    <field name="Shader Channel Select Alpha" start="240" end="242" type="Shader Channel Select"/>
     <field name="Resource Min LOD" start="224" end="235" type="u4.8"/>
     <field name="Surface Base Address" start="256" end="319" type="address"/>
     <field name="X Offset for V Plane" start="368" end="381" type="uint"/>
@@ -907,9 +905,9 @@
     </field>
     <field name="Non-normalized Coordinate Enable" start="106" end="106" type="bool"/>
     <field name="Reduction Type Enable" start="105" end="105" type="bool"/>
-    <field name="TCX Address Control Mode" start="102" end="104" type="uint"/>
-    <field name="TCY Address Control Mode" start="99" end="101" type="uint"/>
-    <field name="TCZ Address Control Mode" start="96" end="98" type="uint"/>
+    <field name="TCX Address Control Mode" start="102" end="104" type="Texture Coordinate Mode"/>
+    <field name="TCY Address Control Mode" start="99" end="101" type="Texture Coordinate Mode"/>
+    <field name="TCZ Address Control Mode" start="96" end="98" type="Texture Coordinate Mode"/>
   </struct>
 
   <struct name="SAMPLER_STATE_8X8_AVS_COEFFICIENTS" length="8">
@@ -939,6 +937,69 @@
     <field name="Table 1Y Filter Coefficient[n,4]" start="224" end="231" type="s1.6"/>
   </struct>
 
+  <struct name="MI_MATH_ALU_INSTRUCTION" length="1">
+    <field name="ALU Opcode" start="20" end="31" type="uint" prefix="MI_ALU">
+      <value name="NOOP" value="0x000"/>
+      <value name="LOAD" value="0x080"/>
+      <value name="LOADINV" value="0x480"/>
+      <value name="LOAD0" value="0x081"/>
+      <value name="LOAD1" value="0x481"/>
+      <value name="ADD" value="0x100"/>
+      <value name="SUB" value="0x101"/>
+      <value name="AND" value="0x102"/>
+      <value name="OR" value="0x103"/>
+      <value name="XOR" value="0x104"/>
+      <value name="STORE" value="0x180"/>
+      <value name="STOREINV" value="0x580"/>
+    </field>
+    <field name="Operand 1" start="10" end="19" type="uint" prefix="MI_ALU">
+      <value name="REG0" value="0x00"/>
+      <value name="REG1" value="0x01"/>
+      <value name="REG2" value="0x02"/>
+      <value name="REG3" value="0x03"/>
+      <value name="REG4" value="0x04"/>
+      <value name="REG5" value="0x05"/>
+      <value name="REG6" value="0x06"/>
+      <value name="REG7" value="0x07"/>
+      <value name="REG8" value="0x08"/>
+      <value name="REG9" value="0x09"/>
+      <value name="REG10" value="0x0a"/>
+      <value name="REG11" value="0x0b"/>
+      <value name="REG12" value="0x0c"/>
+      <value name="REG13" value="0x0d"/>
+      <value name="REG14" value="0x0e"/>
+      <value name="REG15" value="0x0f"/>
+      <value name="SRCA" value="0x20"/>
+      <value name="SRCB" value="0x21"/>
+      <value name="ACCU" value="0x31"/>
+      <value name="ZF" value="0x32"/>
+      <value name="CF" value="0x33"/>
+    </field>
+    <field name="Operand 2" start="0" end="9" type="uint" prefix="MI_ALU">
+      <value name="REG0" value="0x00"/>
+      <value name="REG1" value="0x01"/>
+      <value name="REG2" value="0x02"/>
+      <value name="REG3" value="0x03"/>
+      <value name="REG4" value="0x04"/>
+      <value name="REG5" value="0x05"/>
+      <value name="REG6" value="0x06"/>
+      <value name="REG7" value="0x07"/>
+      <value name="REG8" value="0x08"/>
+      <value name="REG9" value="0x09"/>
+      <value name="REG10" value="0x0a"/>
+      <value name="REG11" value="0x0b"/>
+      <value name="REG12" value="0x0c"/>
+      <value name="REG13" value="0x0d"/>
+      <value name="REG14" value="0x0e"/>
+      <value name="REG15" value="0x0f"/>
+      <value name="SRCA" value="0x20"/>
+      <value name="SRCB" value="0x21"/>
+      <value name="ACCU" value="0x31"/>
+      <value name="ZF" value="0x32"/>
+      <value name="CF" value="0x33"/>
+    </field>
+  </struct>
+
   <instruction name="3DPRIMITIVE" bias="2" length="7">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
@@ -1349,7 +1410,7 @@
     </field>
     <field name="Compute W Coordinate Enable" start="226" end="226" type="bool"/>
     <field name="Cache Disable" start="225" end="225" type="bool"/>
-    <field name="Function Enable" start="224" end="224" type="bool"/>
+    <field name="Enable" start="224" end="224" type="bool"/>
     <field name="Vertex URB Entry Output Read Offset" start="277" end="282" type="uint"/>
     <field name="Vertex URB Entry Output Length" start="272" end="276" type="uint"/>
     <field name="User Clip Distance Clip Test Enable Bitmask" start="264" end="271" type="uint"/>
@@ -1548,7 +1609,7 @@
       <value name="TRAILING" value="1"/>
     </field>
     <field name="Discard Adjacency" start="225" end="225" type="bool"/>
-    <field name="Function Enable" start="224" end="224" type="bool"/>
+    <field name="Enable" start="224" end="224" type="bool"/>
     <field name="Control Data Format" start="287" end="287" type="uint">
       <value name="CUT" value="0"/>
       <value name="SID" value="1"/>
@@ -1598,7 +1659,7 @@
     </field>
     <field name="Illegal Opcode Exception Enable" start="45" end="45" type="bool"/>
     <field name="Software Exception Enable" start="44" end="44" type="bool"/>
-    <field name="Function Enable" start="95" end="95" type="bool"/>
+    <field name="Enable" start="95" end="95" type="bool"/>
     <field name="Statistics Enable" start="93" end="93" type="bool"/>
     <field name="Maximum Number of Threads" start="72" end="80" type="uint"/>
     <field name="Instance Count" start="64" end="67" type="uint"/>
@@ -1633,6 +1694,7 @@
       <value name="DWORD" value="2"/>
     </field>
     <field name="Memory Object Control State" start="32" end="38" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="IndexBufferMOCS" start="32" end="38" type="uint"/>
     <field name="Buffer Starting Address" start="64" end="127" type="address"/>
     <field name="Buffer Size" start="128" end="159" type="uint"/>
   </instruction>
@@ -2110,7 +2172,12 @@
     <field name="Point Sprite Texture Coordinate Enable" start="64" end="95" type="uint"/>
     <field name="Constant Interpolation Enable" start="96" end="127" type="uint"/>
     <group count="32" start="128" size="2">
-      <field name="Attribute Active Component Format" start="0" end="1" type="uint"/>
+      <field name="Attribute Active Component Format" start="0" end="1" type="uint" prefix="ACTIVE_COMPONENT">
+         <value name="DISABLED" value="0"/>
+         <value name="XY" value="1"/>
+         <value name="XYZ" value="2"/>
+         <value name="XYZW" value="3"/>
+      </field>
     </group>
   </instruction>
 
@@ -2178,6 +2245,7 @@
     <field name="SO Buffer Enable" start="63" end="63" type="bool"/>
     <field name="SO Buffer Index" start="61" end="62" type="uint"/>
     <field name="SO Buffer Object Control State" start="54" end="60" type="MEMORY_OBJECT_CONTROL_STATE"/>
+    <field name="SO Buffer MOCS" start="54" end="60" type="uint"/>
     <field name="Stream Offset Write Enable" start="53" end="53" type="bool"/>
     <field name="Stream Output Buffer Offset Address Enable" start="52" end="52" type="bool"/>
     <field name="Surface Base Address" start="66" end="111" type="address"/>
@@ -2516,7 +2584,7 @@
     <field name="Statistics Enable" start="234" end="234" type="bool"/>
     <field name="SIMD8 Dispatch Enable" start="226" end="226" type="bool"/>
     <field name="Vertex Cache Disable" start="225" end="225" type="bool"/>
-    <field name="Function Enable" start="224" end="224" type="bool"/>
+    <field name="Enable" start="224" end="224" type="bool"/>
     <field name="Vertex URB Entry Output Read Offset" start="277" end="282" type="uint"/>
     <field name="Vertex URB Entry Output Length" start="272" end="276" type="uint"/>
     <field name="User Clip Distance Clip Test Enable Bitmask" start="264" end="271" type="uint"/>
@@ -3103,10 +3171,8 @@
     <field name="Command Type" start="29" end="31" type="uint" default="0"/>
     <field name="MI Command Opcode" start="23" end="28" type="uint" default="26"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="0"/>
-    <field name="ALU INSTRUCTION 1" start="32" end="63" type="uint"/>
-    <field name="ALU INSTRUCTION 2" start="64" end="95" type="uint"/>
-    <group count="0" start="96" size="32">
-      <field name="ALU INSTRUCTION n" start="0" end="31" type="uint"/>
+    <group count="0" start="32" size="32">
+      <field name="Instruction" start="0" end="31" type="MI_MATH_ALU_INSTRUCTION"/>
     </group>
   </instruction>
 
@@ -3614,6 +3680,36 @@
     <field name="Sampler L2 Disable Mask" start="31" end="31" type="bool"/>
   </register>
 
+  <register name="CACHE_MODE_1" length="1" num="0x7004">
+    <field name="Partial Resolve Disable In VC" start="1" end="1" type="bool"/>
+    <field name="RCZ Read after expansion control fix 2" start="2" end="2" type="bool"/>
+    <field name="Depth Read Hit Write-Only Optimization Disable" start="3" end="3" type="bool"/>
+    <field name="Float Blend Optimization Enable" start="4" end="4" type="bool"/>
+    <field name="MCS Cache Disable" start="5" end="5" type="bool"/>
+    <field name="4X4 RCPFE-STC Optimization Disable" start="6" end="6" type="bool"/>
+    <field name="Sampler Cache Set XOR selection" start="7" end="8" type="uint"/>
+    <field name="MSC RAW Hazard Avoidance Bit" start="9" end="9" type="bool"/>
+    <field name="NP PMA Fix Enable" start="11" end="11" type="uint"/>
+    <field name="HIZ Eviction Policy" start="12" end="12" type="uint"/>
+    <field name="NP Early Z Fails Disable" start="13" end="13" type="uint"/>
+    <field name="Blend Optimization Fix Disable" start="14" end="14" type="bool"/>
+    <field name="Color Compression Disable" start="15" end="15" type="bool"/>
+
+    <field name="Partial Resolve Disable In VC Mask" start="17" end="17" type="bool"/>
+    <field name="RCZ Read after expansion control fix 2 Mask" start="18" end="18" type="bool"/>
+    <field name="Depth Read Hit Write-Only Optimization Disable Mask" start="19" end="19" type="bool"/>
+    <field name="Float Blend Optimization Enable Mask" start="20" end="20" type="bool"/>
+    <field name="MCS Cache Disable Mask" start="21" end="21" type="bool"/>
+    <field name="4X4 RCPFE-STC Optimization Disable Mask" start="22" end="22" type="bool"/>
+    <field name="Sampler Cache Set XOR selection Mask" start="23" end="24" type="uint"/>
+    <field name="MSC RAW Hazard Avoidance Bit Mask" start="25" end="25" type="bool"/>
+    <field name="NP PMA Fix Enable Mask" start="27" end="27" type="bool"/>
+    <field name="HIZ Eviction Policy Mask" start="28" end="28" type="bool"/>
+    <field name="NP Early Z Fails Disable Mask" start="29" end="29" type="bool"/>
+    <field name="Blend Optimization Fix Disable Mask" start="30" end="30" type="bool"/>
+    <field name="Color Compression Disable Mask" start="31" end="31" type="bool"/>
+  </register>
+
   <register name="GFX_ARB_ERROR_RPT" length="1" num="0x40a0">
     <field name="TLB Page Fault Error" start="0" end="0" type="bool"/>
     <field name="RSTRM PAVP Read Invalid" start="1" end="1" type="bool"/>
diff --git a/src/intel/genxml/genX_pack.h b/src/intel/genxml/genX_pack.h
index 2ec2226..187e75c 100644
--- a/src/intel/genxml/genX_pack.h
+++ b/src/intel/genxml/genX_pack.h
@@ -44,6 +44,8 @@
 #  include "genxml/gen8_pack.h"
 #elif (GEN_VERSIONx10 == 90)
 #  include "genxml/gen9_pack.h"
+#elif (GEN_VERSIONx10 == 100)
+#  include "genxml/gen10_pack.h"
 #else
 #  error "Need to add a pack header include for this gen"
 #endif
diff --git a/src/intel/genxml/gen_bits_header.py b/src/intel/genxml/gen_bits_header.py
index ac8ec4c..1b35040 100644
--- a/src/intel/genxml/gen_bits_header.py
+++ b/src/intel/genxml/gen_bits_header.py
@@ -80,6 +80,7 @@
 ${item.token_name}_${prop}(const struct gen_device_info *devinfo)
 {
    switch (devinfo->gen) {
+   case 10: return ${item.get_prop(prop, 10)};
    case 9: return ${item.get_prop(prop, 9)};
    case 8: return ${item.get_prop(prop, 8)};
    case 7:
@@ -167,10 +168,7 @@
 
     def __init__(self, z):
         # Convert potential "major.minor" string
-        z = float(z)
-        if z < 10:
-            z *= 10
-        self.tenx = int(z)
+        self.tenx = int(float(z) * 10)
 
     def __lt__(self, other):
         return self.tenx < other.tenx
diff --git a/src/intel/genxml/gen_macros.h b/src/intel/genxml/gen_macros.h
index b4941b9..a85c082 100644
--- a/src/intel/genxml/gen_macros.h
+++ b/src/intel/genxml/gen_macros.h
@@ -85,6 +85,9 @@
 #elif (GEN_VERSIONx10 == 90)
 #  define GENX(X) GEN9_##X
 #  define genX(x) gen9_##x
+#elif (GEN_VERSIONx10 == 100)
+#  define GENX(X) GEN10_##X
+#  define genX(x) gen10_##x
 #else
 #  error "Need to add prefixing macros for this gen"
 #endif
diff --git a/src/intel/genxml/gen_pack_header.py b/src/intel/genxml/gen_pack_header.py
index 2a70945..9021f00 100644
--- a/src/intel/genxml/gen_pack_header.py
+++ b/src/intel/genxml/gen_pack_header.py
@@ -3,10 +3,12 @@
 from __future__ import (
     absolute_import, division, print_function, unicode_literals
 )
+import ast
 import xml.parsers.expat
 import re
 import sys
 import copy
+import textwrap
 
 license =  """/*
  * Copyright (C) 2016 Intel Corporation
@@ -272,14 +274,14 @@
             return
         else:
             print("#error unhandled type: %s" % self.type)
+            return
 
         print("   %-36s %s%s;" % (type, self.name, dim))
 
+        prefix = ""
         if len(self.values) > 0 and self.default == None:
             if self.prefix:
                 prefix = self.prefix + "_"
-            else:
-                prefix = ""
 
         for value in self.values:
             print("#define %-40s %d" % (prefix + value.name, value.value))
@@ -346,7 +348,7 @@
                 dwords[index + 1] = dwords[index]
                 index = index + 1
 
-    def emit_pack_function(self, start):
+    def collect_dwords_and_length(self):
         dwords = {}
         self.collect_dwords(dwords, 0, "")
 
@@ -356,9 +358,14 @@
         # index we've seen plus one.
         if self.size > 0:
             length = self.size // 32
-        else:
+        elif dwords:
             length = max(dwords.keys()) + 1
+        else:
+            length = 0
 
+        return (dwords, length)
+
+    def emit_pack_function(self, dwords, length):
         for index in range(length):
             # Handle MBZ dwords
             if not index in dwords:
@@ -461,13 +468,13 @@
 
             if dw.size == 32:
                 if dw.address:
-                    print("   dw[%d] = __gen_combine_address(data, &dw[%d], values->%s, %s);" % (index, index, dw.address.name, v))
+                    print("   dw[%d] = __gen_combine_address(data, &dw[%d], values->%s, %s);" % (index, index, dw.address.name + field.dim, v))
                 continue
 
             if dw.address:
                 v_address = "v%d_address" % index
                 print("   const uint64_t %s =\n      __gen_combine_address(data, &dw[%d], values->%s, %s);" %
-                      (v_address, index, dw.address.name, v))
+                      (v_address, index, dw.address.name + field.dim, v))
                 v = v_address
 
             print("   dw[%d] = %s;" % (index, v))
@@ -476,7 +483,7 @@
 class Value(object):
     def __init__(self, attrs):
         self.name = safe_name(attrs["name"])
-        self.value = int(attrs["value"])
+        self.value = ast.literal_eval(attrs["value"])
 
 class Parser(object):
     def __init__(self):
@@ -572,13 +579,19 @@
 
     def emit_pack_function(self, name, group):
         name = self.gen_prefix(name)
-        print("static inline void\n%s_pack(__gen_user_data *data, void * restrict dst,\n%sconst struct %s * restrict values)\n{" %
-              (name, ' ' * (len(name) + 6), name))
+        print(textwrap.dedent("""\
+            static inline void
+            %s_pack(__attribute__((unused)) __gen_user_data *data,
+                  %s__attribute__((unused)) void * restrict dst,
+                  %s__attribute__((unused)) const struct %s * restrict values)
+            {""") % (name, ' ' * len(name), ' ' * len(name), name))
 
-        # Cast dst to make header C++ friendly
-        print("   uint32_t * restrict dw = (uint32_t * restrict) dst;")
+        (dwords, length) = group.collect_dwords_and_length()
+        if length:
+            # Cast dst to make header C++ friendly
+            print("   uint32_t * restrict dw = (uint32_t * restrict) dst;")
 
-        group.emit_pack_function(0)
+            group.emit_pack_function(dwords, length)
 
         print("}\n")
 
diff --git a/src/intel/isl/gen_format_layout.py b/src/intel/isl/gen_format_layout.py
index 53dbcb6..cdf8e13 100755
--- a/src/intel/isl/gen_format_layout.py
+++ b/src/intel/isl/gen_format_layout.py
@@ -37,64 +37,76 @@
 # Load the template, ensure that __future__.division is imported, and set the
 # bytes encoding to be utf-8. This last bit is important to getting simple
 # consistent behavior for python 3 when we get there.
-TEMPLATE = template.Template(
-    text=textwrap.dedent("""\
-        /* This file is autogenerated by gen_format_layout.py. DO NOT EDIT! */
+TEMPLATE = template.Template(future_imports=['division'],
+                             output_encoding='utf-8',
+                             text="""\
+/* This file is autogenerated by gen_format_layout.py. DO NOT EDIT! */
 
-        /*
-         * Copyright 2015 Intel Corporation
-         *
-         *  Permission is hereby granted, free of charge, to any person obtaining a
-         *  copy of this software and associated documentation files (the "Software"),
-         *  to deal in the Software without restriction, including without limitation
-         *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
-         *  and/or sell copies of the Software, and to permit persons to whom the
-         *  Software is furnished to do so, subject to the following conditions:
-         *
-         *  The above copyright notice and this permission notice (including the next
-         *  paragraph) shall be included in all copies or substantial portions of the
-         *  Software.
-         *
-         *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-         *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-         *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-         *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-         *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-         *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-         *  IN THE SOFTWARE.
-         */
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
 
-        #include "isl/isl.h"
+#include "isl/isl.h"
 
-        const struct isl_format_layout
-        isl_format_layouts[] = {
-        % for format in formats:
-          [ISL_FORMAT_${format.name}] = {
-            .format = ISL_FORMAT_${format.name},
-            .name = "ISL_FORMAT_${format.name}",
-            .bpb = ${format.bpb},
-            .bw = ${format.bw},
-            .bh = ${format.bh},
-            .bd = ${format.bd},
-            .channels = {
-            % for mask in ['r', 'g', 'b', 'a', 'l', 'i', 'p']:
-              <% channel = getattr(format, mask, None) %>\\
-              % if channel.type is not None:
-                .${mask} = { ISL_${channel.type}, ${channel.size} },
-              % else:
-                .${mask} = {},
-              % endif
-            % endfor
-            },
-            .colorspace = ISL_COLORSPACE_${format.colorspace},
-            .txc = ISL_TXC_${format.txc},
-          },
+const struct isl_format_layout
+isl_format_layouts[] = {
+% for format in formats:
+  [ISL_FORMAT_${format.name}] = {
+    .format = ISL_FORMAT_${format.name},
+    .name = "ISL_FORMAT_${format.name}",
+    .bpb = ${format.bpb},
+    .bw = ${format.bw},
+    .bh = ${format.bh},
+    .bd = ${format.bd},
+    .channels = {
+    % for mask in ['r', 'g', 'b', 'a', 'l', 'i', 'p']:
+      <% channel = getattr(format, mask, None) %>\\
+      % if channel.type is not None:
+        .${mask} = { ISL_${channel.type}, ${channel.size} },
+      % else:
+        .${mask} = {},
+      % endif
+    % endfor
+    },
+    .colorspace = ISL_COLORSPACE_${format.colorspace},
+    .txc = ISL_TXC_${format.txc},
+  },
 
-        % endfor
-        };
-    """),
-    future_imports=['division'],
-    output_encoding='utf-8')
+% endfor
+};
+
+enum isl_format
+isl_format_srgb_to_linear(enum isl_format format)
+{
+    switch (format) {
+% for srgb, rgb in srgb_to_linear_map:
+    case ISL_FORMAT_${srgb}:
+        return ISL_FORMAT_${rgb};
+%endfor
+    default:
+        return format;
+    }
+}
+""")
 
 
 class Channel(object):
@@ -173,6 +185,34 @@
             if line and not line[0].startswith('#'):
                 yield line
 
+def get_srgb_to_linear_map(formats):
+    """Yield (srgb_name, linear_name) pairs for every sRGB format.
+
+    The linear counterpart is found by string substitution on the format
+    name, trying each replacement in order and taking the first result that
+    names an existing format.  Asserts that every sRGB format has a match.
+    """
+    names = {f.name for f in formats}
+    replacements = [
+        ('_SRGB',   ''),
+        ('SRGB',    'RGB'),
+        ('U8SRGB',  'FLT16'),
+    ]
+
+    for fmt in formats:
+        if fmt.colorspace != 'SRGB':
+            continue
+
+        found = False
+        for old, new in replacements:
+            rgb_name = fmt.name.replace(old, new)
+            if rgb_name in names:
+                found = True
+                yield fmt.name, rgb_name
+                break
+
+        # We should have found a format name
+        assert found
 
 def main():
     """Main function."""
@@ -190,11 +230,14 @@
     # problem: Unicode can be rendered even if the shell calling this script
     # doesn't.
     with open(args.out, 'wb') as f:
+        formats = [Format(l) for l in reader(args.csv)]
         try:
             # This basically does lazy evaluation and initialization, which
             # saves on memory and startup overhead.
             f.write(TEMPLATE.render(
-                formats=(Format(l) for l in reader(args.csv))))
+                formats             = formats,
+                srgb_to_linear_map  = list(get_srgb_to_linear_map(formats)),
+            ))
         except Exception:
             # In the even there's an error this imports some helpers from mako
             # to print a useful stack trace and prints it, then exits with
diff --git a/src/intel/isl/isl.c b/src/intel/isl/isl.c
index 1a44a6d..a10f2c9 100644
--- a/src/intel/isl/isl.c
+++ b/src/intel/isl/isl.c
@@ -73,6 +73,15 @@
    dev->ss.size = RENDER_SURFACE_STATE_length(info) * 4;
    dev->ss.align = isl_align(dev->ss.size, 32);
 
+   dev->ss.clear_value_size =
+      isl_align(RENDER_SURFACE_STATE_RedClearColor_bits(info) +
+                RENDER_SURFACE_STATE_GreenClearColor_bits(info) +
+                RENDER_SURFACE_STATE_BlueClearColor_bits(info) +
+                RENDER_SURFACE_STATE_AlphaClearColor_bits(info), 32) / 8;
+
+   dev->ss.clear_value_offset =
+      RENDER_SURFACE_STATE_RedClearColor_start(info) / 32 * 4;
+
    assert(RENDER_SURFACE_STATE_SurfaceBaseAddress_start(info) % 8 == 0);
    dev->ss.addr_offset =
       RENDER_SURFACE_STATE_SurfaceBaseAddress_start(info) / 8;
@@ -84,17 +93,16 @@
    dev->ss.aux_addr_offset =
       (RENDER_SURFACE_STATE_AuxiliarySurfaceBaseAddress_start(info) & ~31) / 8;
 
-   dev->ds.size =
-      _3DSTATE_DEPTH_BUFFER_length(info) * 4 +
-      _3DSTATE_STENCIL_BUFFER_length(info) * 4 +
-      _3DSTATE_HIER_DEPTH_BUFFER_length(info) * 4 +
-      _3DSTATE_CLEAR_PARAMS_length(info) * 4;
-
+   dev->ds.size = _3DSTATE_DEPTH_BUFFER_length(info) * 4;
    assert(_3DSTATE_DEPTH_BUFFER_SurfaceBaseAddress_start(info) % 8 == 0);
    dev->ds.depth_offset =
       _3DSTATE_DEPTH_BUFFER_SurfaceBaseAddress_start(info) / 8;
 
-   if (info->has_hiz_and_separate_stencil) {
+   if (dev->use_separate_stencil) {
+      dev->ds.size += _3DSTATE_STENCIL_BUFFER_length(info) * 4 +
+                      _3DSTATE_HIER_DEPTH_BUFFER_length(info) * 4 +
+                      _3DSTATE_CLEAR_PARAMS_length(info) * 4;
+
       assert(_3DSTATE_STENCIL_BUFFER_SurfaceBaseAddress_start(info) % 8 == 0);
       dev->ds.stencil_offset =
          _3DSTATE_DEPTH_BUFFER_length(info) * 4 +
@@ -146,9 +154,8 @@
 /**
  * @param[out] info is written only on success
  */
-static bool
-isl_tiling_get_info(const struct isl_device *dev,
-                    enum isl_tiling tiling,
+static void
+isl_tiling_get_info(enum isl_tiling tiling,
                     uint32_t format_bpb,
                     struct isl_tile_info *tile_info)
 {
@@ -163,7 +170,8 @@
        */
       assert(tiling == ISL_TILING_X || tiling == ISL_TILING_Y0);
       assert(bs % 3 == 0 && isl_is_pow2(format_bpb / 3));
-      return isl_tiling_get_info(dev, tiling, format_bpb / 3, tile_info);
+      isl_tiling_get_info(tiling, format_bpb / 3, tile_info);
+      return;
    }
 
    switch (tiling) {
@@ -204,12 +212,6 @@
 
    case ISL_TILING_Yf:
    case ISL_TILING_Ys: {
-      if (ISL_DEV_GEN(dev) < 9)
-         return false;
-
-      if (!isl_is_pow2(bs))
-         return false;
-
       bool is_Ys = tiling == ISL_TILING_Ys;
 
       assert(bs > 0);
@@ -264,6 +266,31 @@
       .logical_extent_el = logical_el,
       .phys_extent_B = phys_B,
    };
+}
+
+bool
+isl_color_value_is_zero_one(union isl_color_value value,
+                            enum isl_format format)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+
+#define RETURN_FALSE_IF_NOT_0_1(c, i, field) \
+   if (fmtl->channels.c.bits && value.field[i] != 0 && value.field[i] != 1) \
+      return false
+
+   if (isl_format_has_int_channel(format)) {
+      RETURN_FALSE_IF_NOT_0_1(r, 0, u32);
+      RETURN_FALSE_IF_NOT_0_1(g, 1, u32);
+      RETURN_FALSE_IF_NOT_0_1(b, 2, u32);
+      RETURN_FALSE_IF_NOT_0_1(a, 3, u32);
+   } else {
+      RETURN_FALSE_IF_NOT_0_1(r, 0, f32);
+      RETURN_FALSE_IF_NOT_0_1(g, 1, f32);
+      RETURN_FALSE_IF_NOT_0_1(b, 2, f32);
+      RETURN_FALSE_IF_NOT_0_1(a, 3, f32);
+   }
+
+#undef RETURN_FALSE_IF_NOT_0_1
 
    return true;
 }
@@ -297,8 +324,7 @@
    if (ISL_DEV_GEN(dev) >= 6) {
       isl_gen6_filter_tiling(dev, info, &tiling_flags);
    } else {
-      isl_finishme("%s: gen%u", __func__, ISL_DEV_GEN(dev));
-      isl_gen6_filter_tiling(dev, info, &tiling_flags);
+      isl_gen4_filter_tiling(dev, info, &tiling_flags);
    }
 
    #define CHOOSE(__tiling) \
@@ -457,7 +483,6 @@
           *    the storage for LODs other than LOD 0 is not needed.
           */
          assert(info->levels == 1);
-         assert(phys_level0_sa->array_len == 1);
          return ISL_ARRAY_PITCH_SPAN_COMPACT;
       } else {
          if ((ISL_DEV_GEN(dev) == 5 || ISL_DEV_GEN(dev) == 6) &&
@@ -489,6 +514,12 @@
        * compact QPitch possible in order to conserve memory.
        */
       return ISL_ARRAY_PITCH_SPAN_COMPACT;
+
+   case ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ:
+      /* Each array image in the gen6 stencil or HiZ surface is compact in the
+       * sense that every LOD is a compact array of the same size as LOD0.
+       */
+      return ISL_ARRAY_PITCH_SPAN_COMPACT;
    }
 
    unreachable("bad isl_dim_layout");
@@ -520,10 +551,15 @@
       return;
    } else if (info->format == ISL_FORMAT_HIZ) {
       assert(ISL_DEV_GEN(dev) >= 6);
-      /* HiZ surfaces are always aligned to 16x8 pixels in the primary surface
-       * which works out to 2x2 HiZ elments.
-       */
-      *image_align_el = isl_extent3d(2, 2, 1);
+      if (ISL_DEV_GEN(dev) == 6) {
+         /* HiZ surfaces on Sandy Bridge are packed tightly. */
+         *image_align_el = isl_extent3d(1, 1, 1);
+      } else {
+         /* On gen7+, HiZ surfaces are always aligned to 16x8 pixels in the
+          * primary surface which works out to 2x2 HiZ elements.
+          */
+         *image_align_el = isl_extent3d(2, 2, 1);
+      }
       return;
    }
 
@@ -548,8 +584,14 @@
 static enum isl_dim_layout
 isl_surf_choose_dim_layout(const struct isl_device *dev,
                            enum isl_surf_dim logical_dim,
-                           enum isl_tiling tiling)
+                           enum isl_tiling tiling,
+                           isl_surf_usage_flags_t usage)
 {
+   /* Sandy Bridge needs a special layout for HiZ and stencil. */
+   if (ISL_DEV_GEN(dev) == 6 &&
+       (tiling == ISL_TILING_W || tiling == ISL_TILING_HIZ))
+      return ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ;
+
    if (ISL_DEV_GEN(dev) >= 9) {
       switch (logical_dim) {
       case ISL_SURF_DIM_1D:
@@ -579,6 +621,16 @@
       switch (logical_dim) {
       case ISL_SURF_DIM_1D:
       case ISL_SURF_DIM_2D:
+         /* From the G45 PRM Vol. 1a, "6.17.4.1 Hardware Cube Map Layout":
+          *
+          * The cube face textures are stored in the same way as 3D surfaces
+          * are stored (see section 6.17.5 for details).  For cube surfaces,
+          * however, the depth is equal to the number of faces (always 6) and
+          * is not reduced for each MIP.
+          */
+         if (ISL_DEV_GEN(dev) == 4 && (usage & ISL_SURF_USAGE_CUBE_BIT))
+            return ISL_DIM_LAYOUT_GEN4_3D;
+
          return ISL_DIM_LAYOUT_GEN4_2D;
       case ISL_SURF_DIM_3D:
          return ISL_DIM_LAYOUT_GEN4_3D;
@@ -618,6 +670,7 @@
 
       case ISL_DIM_LAYOUT_GEN9_1D:
       case ISL_DIM_LAYOUT_GEN4_2D:
+      case ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ:
          *phys_level0_sa = (struct isl_extent4d) {
             .w = isl_align_npot(info->width, fmtl->bw),
             .h = fmtl->bh,
@@ -629,7 +682,11 @@
       break;
 
    case ISL_SURF_DIM_2D:
-      assert(dim_layout == ISL_DIM_LAYOUT_GEN4_2D);
+      if (ISL_DEV_GEN(dev) == 4 && (info->usage & ISL_SURF_USAGE_CUBE_BIT))
+         assert(dim_layout == ISL_DIM_LAYOUT_GEN4_3D);
+      else
+         assert(dim_layout == ISL_DIM_LAYOUT_GEN4_2D ||
+                dim_layout == ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ);
 
       if (tiling == ISL_TILING_Ys && info->samples > 1)
          isl_finishme("%s:%s: multisample TileYs layout", __FILE__, __func__);
@@ -694,6 +751,7 @@
 
       switch (dim_layout) {
       case ISL_DIM_LAYOUT_GEN9_1D:
+      case ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ:
          unreachable("bad isl_dim_layout");
 
       case ISL_DIM_LAYOUT_GEN4_2D:
@@ -722,6 +780,108 @@
 }
 
 /**
+ * Calculate the pitch between physical array slices, in units of rows of
+ * surface elements.
+ */
+static uint32_t
+isl_calc_array_pitch_el_rows_gen4_2d(
+      const struct isl_device *dev,
+      const struct isl_surf_init_info *restrict info,
+      const struct isl_tile_info *tile_info,
+      const struct isl_extent3d *image_align_sa,
+      const struct isl_extent4d *phys_level0_sa,
+      enum isl_array_pitch_span array_pitch_span,
+      const struct isl_extent2d *phys_slice0_sa)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+   uint32_t pitch_sa_rows = 0;
+
+   switch (array_pitch_span) {
+   case ISL_ARRAY_PITCH_SPAN_COMPACT:
+      pitch_sa_rows = isl_align_npot(phys_slice0_sa->h, image_align_sa->h);
+      break;
+   case ISL_ARRAY_PITCH_SPAN_FULL: {
+      /* The QPitch equation is found in the Broadwell PRM >> Volume 5:
+       * Memory Views >> Common Surface Formats >> Surface Layout >> 2D
+       * Surfaces >> Surface Arrays.
+       */
+      uint32_t H0_sa = phys_level0_sa->h;
+      uint32_t H1_sa = isl_minify(H0_sa, 1);
+
+      uint32_t h0_sa = isl_align_npot(H0_sa, image_align_sa->h);
+      uint32_t h1_sa = isl_align_npot(H1_sa, image_align_sa->h);
+
+      uint32_t m;
+      if (ISL_DEV_GEN(dev) >= 7) {
+         /* The QPitch equation changed slightly in Ivybridge. */
+         m = 12;
+      } else {
+         m = 11;
+      }
+
+      pitch_sa_rows = h0_sa + h1_sa + (m * image_align_sa->h);
+
+      if (ISL_DEV_GEN(dev) == 6 && info->samples > 1 &&
+          (info->height % 4 == 1)) {
+         /* [SNB] Errata from the Sandy Bridge PRM >> Volume 4 Part 1:
+          * Graphics Core >> Section 7.18.3.7: Surface Arrays:
+          *
+          *    [SNB] Errata: Sampler MSAA Qpitch will be 4 greater than
+          *    the value calculated in the equation above , for every
+          *    other odd Surface Height starting from 1 i.e. 1,5,9,13.
+          *
+          * XXX(chadv): Is the errata a natural corollary of the physical
+          * layout of interleaved samples?
+          */
+         pitch_sa_rows += 4;
+      }
+
+      pitch_sa_rows = isl_align_npot(pitch_sa_rows, fmtl->bh);
+      } /* end case */
+      break;
+   }
+
+   assert(pitch_sa_rows % fmtl->bh == 0);
+   uint32_t pitch_el_rows = pitch_sa_rows / fmtl->bh;
+
+   if (ISL_DEV_GEN(dev) >= 9 && fmtl->txc == ISL_TXC_CCS) {
+      /*
+       * From the Sky Lake PRM Vol 7, "MCS Buffer for Render Target(s)" (p. 632):
+       *
+       *    "Mip-mapped and arrayed surfaces are supported with MCS buffer
+       *    layout with these alignments in the RT space: Horizontal
+       *    Alignment = 128 and Vertical Alignment = 64."
+       *
+       * From the Sky Lake PRM Vol. 2d, "RENDER_SURFACE_STATE" (p. 435):
+       *
+       *    "For non-multisampled render target's CCS auxiliary surface,
+       *    QPitch must be computed with Horizontal Alignment = 128 and
+       *    Surface Vertical Alignment = 256. These alignments are only for
+       *    CCS buffer and not for associated render target."
+       *
+       * The first restriction is already handled by isl_choose_image_alignment_el
+       * but the second restriction, which is an extension of the first, only
+       * applies to qpitch and must be applied here.
+       */
+      assert(fmtl->bh == 4);
+      pitch_el_rows = isl_align(pitch_el_rows, 256 / 4);
+   }
+
+   if (ISL_DEV_GEN(dev) >= 9 &&
+       info->dim == ISL_SURF_DIM_3D &&
+       tile_info->tiling != ISL_TILING_LINEAR) {
+      /* From the Skylake BSpec >> RENDER_SURFACE_STATE >> Surface QPitch:
+       *
+       *    Tile Mode != Linear: This field must be set to an integer multiple
+       *    of the tile height
+       */
+      pitch_el_rows = isl_align(pitch_el_rows, tile_info->logical_extent_el.height);
+   }
+
+   return pitch_el_rows;
+}
+
+/**
  * A variant of isl_calc_phys_slice0_extent_sa() specific to
  * ISL_DIM_LAYOUT_GEN4_2D.
  */
@@ -797,43 +957,158 @@
    };
 }
 
+static void
+isl_calc_phys_total_extent_el_gen4_2d(
+      const struct isl_device *dev,
+      const struct isl_surf_init_info *restrict info,
+      const struct isl_tile_info *tile_info,
+      enum isl_msaa_layout msaa_layout,
+      const struct isl_extent3d *image_align_sa,
+      const struct isl_extent4d *phys_level0_sa,
+      enum isl_array_pitch_span array_pitch_span,
+      uint32_t *array_pitch_el_rows,
+      struct isl_extent2d *total_extent_el)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+   /* GEN4_2D: derive array pitch and total extent from array slice 0. */
+   struct isl_extent2d phys_slice0_sa;
+   isl_calc_phys_slice0_extent_sa_gen4_2d(dev, info, msaa_layout,
+                                          image_align_sa, phys_level0_sa,
+                                          &phys_slice0_sa);
+   *array_pitch_el_rows =
+      isl_calc_array_pitch_el_rows_gen4_2d(dev, info, tile_info,
+                                           image_align_sa, phys_level0_sa,
+                                           array_pitch_span,
+                                           &phys_slice0_sa);
+   *total_extent_el = (struct isl_extent2d) { /* last slice is not padded */
+      .w = isl_assert_div(phys_slice0_sa.w, fmtl->bw),
+      .h = *array_pitch_el_rows * (phys_level0_sa->array_len - 1) +
+           isl_assert_div(phys_slice0_sa.h, fmtl->bh),
+   };
+}
+
 /**
  * A variant of isl_calc_phys_slice0_extent_sa() specific to
  * ISL_DIM_LAYOUT_GEN4_3D.
  */
 static void
-isl_calc_phys_slice0_extent_sa_gen4_3d(
+isl_calc_phys_total_extent_el_gen4_3d(
       const struct isl_device *dev,
       const struct isl_surf_init_info *restrict info,
       const struct isl_extent3d *image_align_sa,
       const struct isl_extent4d *phys_level0_sa,
-      struct isl_extent2d *phys_slice0_sa)
+      uint32_t *array_pitch_el_rows,
+      struct isl_extent2d *phys_total_el)
 {
-   assert(info->samples == 1);
-   assert(phys_level0_sa->array_len == 1);
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
 
-   uint32_t slice_w = 0;
-   uint32_t slice_h = 0;
+   assert(info->samples == 1);
+
+   if (info->dim != ISL_SURF_DIM_3D) {
+      /* From the G45 PRM Vol. 1a, "6.17.4.1 Hardware Cube Map Layout":
+       *
+       * The cube face textures are stored in the same way as 3D surfaces
+       * are stored (see section 6.17.5 for details).  For cube surfaces,
+       * however, the depth is equal to the number of faces (always 6) and
+       * is not reduced for each MIP.
+       */
+      assert(ISL_DEV_GEN(dev) == 4);
+      assert(info->usage & ISL_SURF_USAGE_CUBE_BIT);
+      assert(phys_level0_sa->array_len == 6);
+   } else {
+      assert(phys_level0_sa->array_len == 1);
+   }
+
+   uint32_t total_w = 0;
+   uint32_t total_h = 0;
 
    uint32_t W0 = phys_level0_sa->w;
    uint32_t H0 = phys_level0_sa->h;
    uint32_t D0 = phys_level0_sa->d;
+   uint32_t A0 = phys_level0_sa->a;
 
    for (uint32_t l = 0; l < info->levels; ++l) {
       uint32_t level_w = isl_align_npot(isl_minify(W0, l), image_align_sa->w);
       uint32_t level_h = isl_align_npot(isl_minify(H0, l), image_align_sa->h);
-      uint32_t level_d = isl_align_npot(isl_minify(D0, l), image_align_sa->d);
+      uint32_t level_d = info->dim == ISL_SURF_DIM_3D ? isl_minify(D0, l) : A0;
 
       uint32_t max_layers_horiz = MIN(level_d, 1u << l);
       uint32_t max_layers_vert = isl_align(level_d, 1u << l) / (1u << l);
 
-      slice_w = MAX(slice_w, level_w * max_layers_horiz);
-      slice_h += level_h * max_layers_vert;
+      total_w = MAX(total_w, level_w * max_layers_horiz);
+      total_h += level_h * max_layers_vert;
    }
 
-   *phys_slice0_sa = (struct isl_extent2d) {
-      .w = slice_w,
-      .h = slice_h,
+   /* GEN4_3D layouts have no real array pitch because each LOD packs a
+    * different number of layers horizontally and vertically.  Report the
+    * LOD0 pitch: aligned height in samples over the block height (rows).
+    */
+   *array_pitch_el_rows =
+      isl_align_npot(phys_level0_sa->h, image_align_sa->h) / fmtl->bh;
+   *phys_total_el = (struct isl_extent2d) {
+      .w = isl_assert_div(total_w, fmtl->bw),
+      .h = isl_assert_div(total_h, fmtl->bh),
+   };
+}
+
+/**
+ * ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ variant of isl_calc_phys_total_extent_el():
+ * sizes LOD0 as a top row and all remaining LODs side by side in a bottom row.
+ */
+static void
+isl_calc_phys_total_extent_el_gen6_stencil_hiz(
+      const struct isl_device *dev,
+      const struct isl_surf_init_info *restrict info,
+      const struct isl_tile_info *tile_info,
+      const struct isl_extent3d *image_align_sa,
+      const struct isl_extent4d *phys_level0_sa,
+      uint32_t *array_pitch_el_rows,
+      struct isl_extent2d *phys_total_el)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   const struct isl_extent2d tile_extent_sa = {
+      .w = tile_info->logical_extent_el.w * fmtl->bw,
+      .h = tile_info->logical_extent_el.h * fmtl->bh,
+   };
+   /* Tile size is a multiple of image alignment */
+   assert(tile_extent_sa.w % image_align_sa->w == 0);
+   assert(tile_extent_sa.h % image_align_sa->h == 0);
+
+   const uint32_t W0 = phys_level0_sa->w;
+   const uint32_t H0 = phys_level0_sa->h;
+
+   /* Each image has the same height as LOD0 because the hardware thinks
+    * everything is LOD0; H stacks phys_level0_sa->a such images vertically.
+    */
+   const uint32_t H = isl_align(H0, image_align_sa->h) * phys_level0_sa->a;
+
+   uint32_t total_top_w = 0;
+   uint32_t total_bottom_w = 0;
+   uint32_t total_h = 0;
+
+   for (uint32_t l = 0; l < info->levels; ++l) {
+      const uint32_t W = isl_minify(W0, l);
+
+      const uint32_t w = isl_align(W, tile_extent_sa.w);
+      const uint32_t h = isl_align(H, tile_extent_sa.h);
+
+      if (l == 0) {
+         total_top_w = w;
+         total_h = h;
+      } else if (l == 1) {
+         total_bottom_w = w;
+         total_h += h;
+      } else {
+         total_bottom_w += w; /* LOD2+ extend the bottom row */
+      }
+   }
+
+   *array_pitch_el_rows =
+      isl_assert_div(isl_align(H0, image_align_sa->h), fmtl->bh);
+   *phys_total_el = (struct isl_extent2d) {
+      .w = isl_assert_div(MAX(total_top_w, total_bottom_w), fmtl->bw),
+      .h = isl_assert_div(total_h, fmtl->bh),
    };
 }
 
@@ -842,16 +1117,17 @@
  * ISL_DIM_LAYOUT_GEN9_1D.
  */
 static void
-isl_calc_phys_slice0_extent_sa_gen9_1d(
+isl_calc_phys_total_extent_el_gen9_1d(
       const struct isl_device *dev,
       const struct isl_surf_init_info *restrict info,
       const struct isl_extent3d *image_align_sa,
       const struct isl_extent4d *phys_level0_sa,
-      struct isl_extent2d *phys_slice0_sa)
+      uint32_t *array_pitch_el_rows,
+      struct isl_extent2d *phys_total_el)
 {
    MAYBE_UNUSED const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
 
-   assert(phys_level0_sa->height == 1);
+   assert(phys_level0_sa->height / fmtl->bh == 1);
    assert(phys_level0_sa->depth == 1);
    assert(info->samples == 1);
    assert(image_align_sa->w >= fmtl->bw);
@@ -866,156 +1142,60 @@
       slice_w += w;
    }
 
-   *phys_slice0_sa = isl_extent2d(slice_w, 1);
+   *array_pitch_el_rows = 1;
+   *phys_total_el = (struct isl_extent2d) {
+      .w = isl_assert_div(slice_w, fmtl->bw),
+      .h = phys_level0_sa->array_len,
+   };
 }
 
 /**
- * Calculate the physical extent of the surface's first array slice, in units
- * of surface samples. If the surface is multi-leveled, then the result will
- * be aligned to \a image_align_sa.
+ * Calculate the two-dimensional total physical extent of the surface, in
+ * units of surface elements, and the array pitch in rows of elements.
  */
 static void
-isl_calc_phys_slice0_extent_sa(const struct isl_device *dev,
-                               const struct isl_surf_init_info *restrict info,
-                               enum isl_dim_layout dim_layout,
-                               enum isl_msaa_layout msaa_layout,
-                               const struct isl_extent3d *image_align_sa,
-                               const struct isl_extent4d *phys_level0_sa,
-                               struct isl_extent2d *phys_slice0_sa)
+isl_calc_phys_total_extent_el(const struct isl_device *dev,
+                              const struct isl_surf_init_info *restrict info,
+                              const struct isl_tile_info *tile_info,
+                              enum isl_dim_layout dim_layout,
+                              enum isl_msaa_layout msaa_layout,
+                              const struct isl_extent3d *image_align_sa,
+                              const struct isl_extent4d *phys_level0_sa,
+                              enum isl_array_pitch_span array_pitch_span,
+                              uint32_t *array_pitch_el_rows,
+                              struct isl_extent2d *total_extent_el)
 {
    switch (dim_layout) {
    case ISL_DIM_LAYOUT_GEN9_1D:
-      isl_calc_phys_slice0_extent_sa_gen9_1d(dev, info,
-                                             image_align_sa, phys_level0_sa,
-                                             phys_slice0_sa);
+      assert(array_pitch_span == ISL_ARRAY_PITCH_SPAN_COMPACT);
+      isl_calc_phys_total_extent_el_gen9_1d(dev, info,
+                                            image_align_sa, phys_level0_sa,
+                                            array_pitch_el_rows,
+                                            total_extent_el);
       return;
    case ISL_DIM_LAYOUT_GEN4_2D:
-      isl_calc_phys_slice0_extent_sa_gen4_2d(dev, info, msaa_layout,
-                                             image_align_sa, phys_level0_sa,
-                                             phys_slice0_sa);
+      isl_calc_phys_total_extent_el_gen4_2d(dev, info, tile_info, msaa_layout,
+                                            image_align_sa, phys_level0_sa,
+                                            array_pitch_span,
+                                            array_pitch_el_rows,
+                                            total_extent_el);
       return;
-   case ISL_DIM_LAYOUT_GEN4_3D:
-      isl_calc_phys_slice0_extent_sa_gen4_3d(dev, info, image_align_sa,
-                                             phys_level0_sa, phys_slice0_sa);
+   case ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ:
+      assert(array_pitch_span == ISL_ARRAY_PITCH_SPAN_COMPACT);
+      isl_calc_phys_total_extent_el_gen6_stencil_hiz(dev, info, tile_info,
+                                                     image_align_sa,
+                                                     phys_level0_sa,
+                                                     array_pitch_el_rows,
+                                                     total_extent_el);
       return;
-   }
-}
-
-/**
- * Calculate the pitch between physical array slices, in units of rows of
- * surface elements.
- */
-static uint32_t
-isl_calc_array_pitch_el_rows(const struct isl_device *dev,
-                             const struct isl_surf_init_info *restrict info,
-                             const struct isl_tile_info *tile_info,
-                             enum isl_dim_layout dim_layout,
-                             enum isl_array_pitch_span array_pitch_span,
-                             const struct isl_extent3d *image_align_sa,
-                             const struct isl_extent4d *phys_level0_sa,
-                             const struct isl_extent2d *phys_slice0_sa)
-{
-   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
-   uint32_t pitch_sa_rows = 0;
-
-   switch (dim_layout) {
-   case ISL_DIM_LAYOUT_GEN9_1D:
-      /* Each row is an array slice */
-      pitch_sa_rows = 1;
-      break;
-   case ISL_DIM_LAYOUT_GEN4_2D:
-      switch (array_pitch_span) {
-      case ISL_ARRAY_PITCH_SPAN_COMPACT:
-         pitch_sa_rows = isl_align_npot(phys_slice0_sa->h, image_align_sa->h);
-         break;
-      case ISL_ARRAY_PITCH_SPAN_FULL: {
-         /* The QPitch equation is found in the Broadwell PRM >> Volume 5:
-          * Memory Views >> Common Surface Formats >> Surface Layout >> 2D
-          * Surfaces >> Surface Arrays.
-          */
-         uint32_t H0_sa = phys_level0_sa->h;
-         uint32_t H1_sa = isl_minify(H0_sa, 1);
-
-         uint32_t h0_sa = isl_align_npot(H0_sa, image_align_sa->h);
-         uint32_t h1_sa = isl_align_npot(H1_sa, image_align_sa->h);
-
-         uint32_t m;
-         if (ISL_DEV_GEN(dev) >= 7) {
-            /* The QPitch equation changed slightly in Ivybridge. */
-            m = 12;
-         } else {
-            m = 11;
-         }
-
-         pitch_sa_rows = h0_sa + h1_sa + (m * image_align_sa->h);
-
-         if (ISL_DEV_GEN(dev) == 6 && info->samples > 1 &&
-             (info->height % 4 == 1)) {
-            /* [SNB] Errata from the Sandy Bridge PRM >> Volume 4 Part 1:
-             * Graphics Core >> Section 7.18.3.7: Surface Arrays:
-             *
-             *    [SNB] Errata: Sampler MSAA Qpitch will be 4 greater than
-             *    the value calculated in the equation above , for every
-             *    other odd Surface Height starting from 1 i.e. 1,5,9,13.
-             *
-             * XXX(chadv): Is the errata natural corollary of the physical
-             * layout of interleaved samples?
-             */
-            pitch_sa_rows += 4;
-         }
-
-         pitch_sa_rows = isl_align_npot(pitch_sa_rows, fmtl->bh);
-         } /* end case */
-         break;
-      }
-      break;
    case ISL_DIM_LAYOUT_GEN4_3D:
       assert(array_pitch_span == ISL_ARRAY_PITCH_SPAN_COMPACT);
-      pitch_sa_rows = isl_align_npot(phys_slice0_sa->h, image_align_sa->h);
-      break;
-   default:
-      unreachable("bad isl_dim_layout");
-      break;
+      isl_calc_phys_total_extent_el_gen4_3d(dev, info,
+                                            image_align_sa, phys_level0_sa,
+                                            array_pitch_el_rows,
+                                            total_extent_el);
+      return;
    }
-
-   assert(pitch_sa_rows % fmtl->bh == 0);
-   uint32_t pitch_el_rows = pitch_sa_rows / fmtl->bh;
-
-   if (ISL_DEV_GEN(dev) >= 9 && fmtl->txc == ISL_TXC_CCS) {
-      /*
-       * From the Sky Lake PRM Vol 7, "MCS Buffer for Render Target(s)" (p. 632):
-       *
-       *    "Mip-mapped and arrayed surfaces are supported with MCS buffer
-       *    layout with these alignments in the RT space: Horizontal
-       *    Alignment = 128 and Vertical Alignment = 64."
-       *
-       * From the Sky Lake PRM Vol. 2d, "RENDER_SURFACE_STATE" (p. 435):
-       *
-       *    "For non-multisampled render target's CCS auxiliary surface,
-       *    QPitch must be computed with Horizontal Alignment = 128 and
-       *    Surface Vertical Alignment = 256. These alignments are only for
-       *    CCS buffer and not for associated render target."
-       *
-       * The first restriction is already handled by isl_choose_image_alignment_el
-       * but the second restriction, which is an extension of the first, only
-       * applies to qpitch and must be applied here.
-       */
-      assert(fmtl->bh == 4);
-      pitch_el_rows = isl_align(pitch_el_rows, 256 / 4);
-   }
-
-   if (ISL_DEV_GEN(dev) >= 9 &&
-       info->dim == ISL_SURF_DIM_3D &&
-       tile_info->tiling != ISL_TILING_LINEAR) {
-      /* From the Skylake BSpec >> RENDER_SURFACE_STATE >> Surface QPitch:
-       *
-       *    Tile Mode != Linear: This field must be set to an integer multiple
-       *    of the tile height
-       */
-      pitch_el_rows = isl_align(pitch_el_rows, tile_info->logical_extent_el.height);
-   }
-
-   return pitch_el_rows;
 }
 
 static uint32_t
@@ -1056,33 +1236,29 @@
 static uint32_t
 isl_calc_linear_min_row_pitch(const struct isl_device *dev,
                               const struct isl_surf_init_info *info,
-                              const struct isl_extent2d *phys_slice0_sa,
+                              const struct isl_extent2d *phys_total_el,
                               uint32_t alignment)
 {
    const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
    const uint32_t bs = fmtl->bpb / 8;
 
-   assert(phys_slice0_sa->w % fmtl->bw == 0);
-
-   return isl_align_npot(bs * (phys_slice0_sa->w / fmtl->bw), alignment);
+   return isl_align_npot(bs * phys_total_el->w, alignment); /* row bytes */
 }
 
 static uint32_t
 isl_calc_tiled_min_row_pitch(const struct isl_device *dev,
                              const struct isl_surf_init_info *surf_info,
                              const struct isl_tile_info *tile_info,
-                             const struct isl_extent2d *phys_slice0_sa,
+                             const struct isl_extent2d *phys_total_el,
                              uint32_t alignment)
 {
    const struct isl_format_layout *fmtl = isl_format_get_layout(surf_info->format);
 
    assert(fmtl->bpb % tile_info->format_bpb == 0);
-   assert(phys_slice0_sa->w % fmtl->bw == 0);
 
    const uint32_t tile_el_scale = fmtl->bpb / tile_info->format_bpb;
-   const uint32_t total_w_el = phys_slice0_sa->width / fmtl->bw;
    const uint32_t total_w_tl =
-      isl_align_div(total_w_el * tile_el_scale,
+      isl_align_div(phys_total_el->w * tile_el_scale,
                     tile_info->logical_extent_el.width);
 
    assert(alignment == tile_info->phys_extent_B.width);
@@ -1093,15 +1269,15 @@
 isl_calc_min_row_pitch(const struct isl_device *dev,
                        const struct isl_surf_init_info *surf_info,
                        const struct isl_tile_info *tile_info,
-                       const struct isl_extent2d *phys_slice0_sa,
+                       const struct isl_extent2d *phys_total_el,
                        uint32_t alignment)
 {
    if (tile_info->tiling == ISL_TILING_LINEAR) {
-      return isl_calc_linear_min_row_pitch(dev, surf_info, phys_slice0_sa,
+      return isl_calc_linear_min_row_pitch(dev, surf_info, phys_total_el,
                                            alignment);
    } else {
       return isl_calc_tiled_min_row_pitch(dev, surf_info, tile_info,
-                                          phys_slice0_sa, alignment);
+                                          phys_total_el, alignment);
    }
 }
 
@@ -1126,14 +1302,28 @@
                    const struct isl_surf_init_info *surf_info,
                    const struct isl_tile_info *tile_info,
                    enum isl_dim_layout dim_layout,
-                   const struct isl_extent2d *phys_slice0_sa,
+                   const struct isl_extent2d *phys_total_el,
                    uint32_t *out_row_pitch)
 {
-   const uint32_t alignment =
+   uint32_t alignment =
       isl_calc_row_pitch_alignment(surf_info, tile_info);
 
+   /* If pitch isn't given and it can be chosen freely, align it by cache line
+    * allowing one to use blit engine on the surface.
+    */
+   if (surf_info->row_pitch == 0 && tile_info->tiling == ISL_TILING_LINEAR) {
+      /* From the Broadwell PRM docs for XY_SRC_COPY_BLT::SourceBaseAddress:
+       *
+       *    "Base address of the destination surface: X=0, Y=0. Lower 32bits
+       *    of the 48bit addressing. When Src Tiling is enabled (Bit_15
+       *    enabled), this address must be 4KB-aligned. When Tiling is not
+       *    enabled, this address should be CL (64byte) aligned."
+       */
+      alignment = MAX2(alignment, 64);
+   }
+
    const uint32_t min_row_pitch =
-      isl_calc_min_row_pitch(dev, surf_info, tile_info, phys_slice0_sa,
+      isl_calc_min_row_pitch(dev, surf_info, tile_info, phys_total_el,
                              alignment);
 
    uint32_t row_pitch = min_row_pitch;
@@ -1177,124 +1367,19 @@
        !pitch_in_range(row_pitch, _3DSTATE_HIER_DEPTH_BUFFER_SurfacePitch_bits(dev->info)))
       return false;
 
-   if (surf_info->usage & ISL_SURF_USAGE_STENCIL_BIT)
-      isl_finishme("validate row pitch of stencil surfaces");
+   const uint32_t stencil_pitch_bits = dev->use_separate_stencil ?
+      _3DSTATE_STENCIL_BUFFER_SurfacePitch_bits(dev->info) :
+      _3DSTATE_DEPTH_BUFFER_SurfacePitch_bits(dev->info);
+
+   if ((surf_info->usage & ISL_SURF_USAGE_STENCIL_BIT) &&
+       !pitch_in_range(row_pitch, stencil_pitch_bits))
+      return false;
 
  done:
    *out_row_pitch = row_pitch;
    return true;
 }
 
-/**
- * Calculate and apply any padding required for the surface.
- *
- * @param[inout] total_h_el is updated with the new height
- * @param[out] pad_bytes is overwritten with additional padding requirements.
- */
-static void
-isl_apply_surface_padding(const struct isl_device *dev,
-                          const struct isl_surf_init_info *restrict info,
-                          const struct isl_tile_info *tile_info,
-                          uint32_t *total_h_el,
-                          uint32_t *pad_bytes)
-{
-   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
-
-   *pad_bytes = 0;
-
-   /* From the Broadwell PRM >> Volume 5: Memory Views >> Common Surface
-    * Formats >> Surface Padding Requirements >> Render Target and Media
-    * Surfaces:
-    *
-    *   The data port accesses data (pixels) outside of the surface if they
-    *   are contained in the same cache request as pixels that are within the
-    *   surface. These pixels will not be returned by the requesting message,
-    *   however if these pixels lie outside of defined pages in the GTT,
-    *   a GTT error will result when the cache request is processed. In
-    *   order to avoid these GTT errors, “padding” at the bottom of the
-    *   surface is sometimes necessary.
-    *
-    * From the Broadwell PRM >> Volume 5: Memory Views >> Common Surface
-    * Formats >> Surface Padding Requirements >> Sampling Engine Surfaces:
-    *
-    *    ... Lots of padding requirements, all listed separately below.
-    */
-
-   /* We can safely ignore the first padding requirement, quoted below,
-    * because isl doesn't do buffers.
-    *
-    *    - [pre-BDW] For buffers, which have no inherent “height,” padding
-    *      requirements are different. A buffer must be padded to the next
-    *      multiple of 256 array elements, with an additional 16 bytes added
-    *      beyond that to account for the L1 cache line.
-    */
-
-   /*
-    *    - For compressed textures [...], padding at the bottom of the surface
-    *      is to an even compressed row.
-    */
-   if (isl_format_is_compressed(info->format))
-      *total_h_el = isl_align(*total_h_el, 2);
-
-   /*
-    *    - For cube surfaces, an additional two rows of padding are required
-    *      at the bottom of the surface.
-    */
-   if (info->usage & ISL_SURF_USAGE_CUBE_BIT)
-      *total_h_el += 2;
-
-   /*
-    *    - For packed YUV, 96 bpt, 48 bpt, and 24 bpt surface formats,
-    *      additional padding is required. These surfaces require an extra row
-    *      plus 16 bytes of padding at the bottom in addition to the general
-    *      padding requirements.
-    */
-   if (isl_format_is_yuv(info->format) &&
-       (fmtl->bpb == 96 || fmtl->bpb == 48|| fmtl->bpb == 24)) {
-      *total_h_el += 1;
-      *pad_bytes += 16;
-   }
-
-   /*
-    *    - For linear surfaces, additional padding of 64 bytes is required at
-    *      the bottom of the surface. This is in addition to the padding
-    *      required above.
-    */
-   if (tile_info->tiling == ISL_TILING_LINEAR)
-      *pad_bytes += 64;
-
-   /* The below text weakens, not strengthens, the padding requirements for
-    * linear surfaces. Therefore we can safely ignore it.
-    *
-    *    - [BDW+] For SURFTYPE_BUFFER, SURFTYPE_1D, and SURFTYPE_2D non-array,
-    *      non-MSAA, non-mip-mapped surfaces in linear memory, the only
-    *      padding requirement is to the next aligned 64-byte boundary beyond
-    *      the end of the surface. The rest of the padding requirements
-    *      documented above do not apply to these surfaces.
-    */
-
-   /*
-    *    - [SKL+] For SURFTYPE_2D and SURFTYPE_3D with linear mode and
-    *      height % 4 != 0, the surface must be padded with
-    *      4-(height % 4)*Surface Pitch # of bytes.
-    */
-   if (ISL_DEV_GEN(dev) >= 9 &&
-       tile_info->tiling == ISL_TILING_LINEAR &&
-       (info->dim == ISL_SURF_DIM_2D || info->dim == ISL_SURF_DIM_3D)) {
-      *total_h_el = isl_align(*total_h_el, 4);
-   }
-
-   /*
-    *    - [SKL+] For SURFTYPE_1D with linear mode, the surface must be padded
-    *      to 4 times the Surface Pitch # of bytes
-    */
-   if (ISL_DEV_GEN(dev) >= 9 &&
-       tile_info->tiling == ISL_TILING_LINEAR &&
-       info->dim == ISL_SURF_DIM_1D) {
-      *total_h_el += 4;
-   }
-}
-
 bool
 isl_surf_init_s(const struct isl_device *dev,
                 struct isl_surf *surf,
@@ -1314,11 +1399,10 @@
       return false;
 
    struct isl_tile_info tile_info;
-   if (!isl_tiling_get_info(dev, tiling, fmtl->bpb, &tile_info))
-      return false;
+   isl_tiling_get_info(tiling, fmtl->bpb, &tile_info);
 
    const enum isl_dim_layout dim_layout =
-      isl_surf_choose_dim_layout(dev, info->dim, tiling);
+      isl_surf_choose_dim_layout(dev, info->dim, tiling, info->usage);
 
    enum isl_msaa_layout msaa_layout;
    if (!isl_choose_msaa_layout(dev, info, tiling, &msaa_layout))
@@ -1340,32 +1424,23 @@
    enum isl_array_pitch_span array_pitch_span =
       isl_choose_array_pitch_span(dev, info, dim_layout, &phys_level0_sa);
 
-   struct isl_extent2d phys_slice0_sa;
-   isl_calc_phys_slice0_extent_sa(dev, info, dim_layout, msaa_layout,
-                                  &image_align_sa, &phys_level0_sa,
-                                  &phys_slice0_sa);
-   assert(phys_slice0_sa.w % fmtl->bw == 0);
-   assert(phys_slice0_sa.h % fmtl->bh == 0);
-
-   const uint32_t array_pitch_el_rows =
-      isl_calc_array_pitch_el_rows(dev, info, &tile_info, dim_layout,
-                                   array_pitch_span, &image_align_sa,
-                                   &phys_level0_sa, &phys_slice0_sa);
-
-   uint32_t total_h_el = phys_level0_sa.array_len * array_pitch_el_rows;
-
-   uint32_t pad_bytes;
-   isl_apply_surface_padding(dev, info, &tile_info, &total_h_el, &pad_bytes);
+   uint32_t array_pitch_el_rows;
+   struct isl_extent2d phys_total_el;
+   isl_calc_phys_total_extent_el(dev, info, &tile_info,
+                                 dim_layout, msaa_layout,
+                                 &image_align_sa, &phys_level0_sa,
+                                 array_pitch_span, &array_pitch_el_rows,
+                                 &phys_total_el);
 
    uint32_t row_pitch;
    if (!isl_calc_row_pitch(dev, info, &tile_info, dim_layout,
-                           &phys_slice0_sa, &row_pitch))
+                           &phys_total_el, &row_pitch))
       return false;
 
    uint32_t base_alignment;
    uint64_t size;
    if (tiling == ISL_TILING_LINEAR) {
-      size = row_pitch * total_h_el + pad_bytes;
+      size = (uint64_t) row_pitch * phys_total_el.h;
 
       /* From the Broadwell PRM Vol 2d, RENDER_SURFACE_STATE::SurfaceBaseAddress:
        *
@@ -1386,11 +1461,10 @@
       }
       base_alignment = isl_round_up_to_power_of_two(base_alignment);
    } else {
-      total_h_el += isl_align_div_npot(pad_bytes, row_pitch);
       const uint32_t total_h_tl =
-         isl_align_div(total_h_el, tile_info.logical_extent_el.height);
+         isl_align_div(phys_total_el.h, tile_info.logical_extent_el.height);
 
-      size = total_h_tl * tile_info.phys_extent_B.height * row_pitch;
+      size = (uint64_t) total_h_tl * tile_info.phys_extent_B.height * row_pitch;
 
       const uint32_t tile_size = tile_info.phys_extent_B.width *
                                  tile_info.phys_extent_B.height;
@@ -1447,12 +1521,11 @@
 }
 
 void
-isl_surf_get_tile_info(const struct isl_device *dev,
-                       const struct isl_surf *surf,
+isl_surf_get_tile_info(const struct isl_surf *surf,
                        struct isl_tile_info *tile_info)
 {
    const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format);
-   isl_tiling_get_info(dev, surf->tiling, fmtl->bpb, tile_info);
+   isl_tiling_get_info(surf->tiling, fmtl->bpb, tile_info); /* out param */
 }
 
 bool
@@ -1585,7 +1658,8 @@
 bool
 isl_surf_get_ccs_surf(const struct isl_device *dev,
                       const struct isl_surf *surf,
-                      struct isl_surf *ccs_surf)
+                      struct isl_surf *ccs_surf,
+                      uint32_t row_pitch)
 {
    assert(surf->samples == 1 && surf->msaa_layout == ISL_MSAA_LAYOUT_NONE);
    assert(ISL_DEV_GEN(dev) >= 7);
@@ -1593,9 +1667,30 @@
    if (surf->usage & ISL_SURF_USAGE_DISABLE_AUX_BIT)
       return false;
 
+   /* The PRM doesn't say this explicitly, but fast-clears don't appear to
+    * work for 3D textures until gen9 where the layout of 3D textures changes
+    * to match 2D array textures.
+    */
    if (ISL_DEV_GEN(dev) <= 8 && surf->dim != ISL_SURF_DIM_2D)
       return false;
 
+   /* From the HSW PRM Volume 7: 3D-Media-GPGPU, page 652 (Color Clear of
+    * Non-MultiSampler Render Target Restrictions):
+    *
+    *    "Support is for non-mip-mapped and non-array surface types only."
+    *
+    * This restriction is lifted on gen8+.  Technically, it may be possible to
+    * create a CCS for an arrayed or mipmapped image and only enable CCS_D
+    * when rendering to the base slice.  However, there is no documentation
+    * telling us what the hardware would do in that case or what it does if
+    * you walk off the base slice.  (Does it ignore CCS or does it start
+    * scribbling over random memory?)  We play it safe and just follow the
+    * docs and don't allow CCS_D for arrayed or mip-mapped surfaces.
+    */
+   if (ISL_DEV_GEN(dev) <= 7 &&
+       (surf->levels > 1 || surf->logical_level0_px.array_len > 1))
+      return false;
+
    if (isl_format_is_compressed(surf->format))
       return false;
 
@@ -1642,6 +1737,7 @@
                         .levels = surf->levels,
                         .array_len = surf->logical_level0_px.array_len,
                         .samples = 1,
+                        .row_pitch = row_pitch,
                         .usage = ISL_SURF_USAGE_CCS_BIT,
                         .tiling_flags = ISL_TILING_CCS_BIT);
 }
@@ -1676,6 +1772,9 @@
    case 9:
       isl_gen9_surf_fill_state_s(dev, state, info);
       break;
+   case 10:
+      isl_gen10_surf_fill_state_s(dev, state, info);
+      break;
    default:
       assert(!"Cannot fill surface state for this gen");
    }
@@ -1692,6 +1791,9 @@
    case 9:
       isl_gen9_buffer_fill_state_s(state, info);
       break;
+   case 10:
+      isl_gen10_buffer_fill_state_s(state, info);
+      break;
    default:
       assert(!"Cannot fill surface state for this gen");
    }
@@ -1745,6 +1847,9 @@
    case 9:
       isl_gen9_emit_depth_stencil_hiz_s(dev, batch, info);
       break;
+   case 10:
+      isl_gen10_emit_depth_stencil_hiz_s(dev, batch, info);
+      break;
    default:
       assert(!"Cannot fill surface state for this gen");
    }
@@ -1803,8 +1908,15 @@
                             uint32_t *y_offset_sa)
 {
    assert(level < surf->levels);
-   assert(logical_z_offset_px < isl_minify(surf->phys_level0_sa.depth, level));
-   assert(surf->phys_level0_sa.array_len == 1);
+   if (surf->dim == ISL_SURF_DIM_3D) {
+      assert(surf->phys_level0_sa.array_len == 1);
+      assert(logical_z_offset_px < isl_minify(surf->phys_level0_sa.depth, level));
+   } else {
+      assert(surf->dim == ISL_SURF_DIM_2D);
+      assert(surf->usage & ISL_SURF_USAGE_CUBE_BIT);
+      assert(surf->phys_level0_sa.array_len == 6);
+      assert(logical_z_offset_px < surf->phys_level0_sa.array_len);
+   }
 
    const struct isl_extent3d image_align_sa =
       isl_surf_get_image_alignment_sa(surf);
@@ -1812,13 +1924,16 @@
    const uint32_t W0 = surf->phys_level0_sa.width;
    const uint32_t H0 = surf->phys_level0_sa.height;
    const uint32_t D0 = surf->phys_level0_sa.depth;
+   const uint32_t AL = surf->phys_level0_sa.array_len;
 
    uint32_t x = 0;
    uint32_t y = 0;
 
    for (uint32_t l = 0; l < level; ++l) {
       const uint32_t level_h = isl_align_npot(isl_minify(H0, l), image_align_sa.h);
-      const uint32_t level_d = isl_align_npot(isl_minify(D0, l), image_align_sa.d);
+      const uint32_t level_d =
+         isl_align_npot(surf->dim == ISL_SURF_DIM_3D ? isl_minify(D0, l) : AL,
+                        image_align_sa.d);
       const uint32_t max_layers_vert = isl_align(level_d, 1u << l) / (1u << l);
 
       y += level_h * max_layers_vert;
@@ -1826,7 +1941,9 @@
 
    const uint32_t level_w = isl_align_npot(isl_minify(W0, level), image_align_sa.w);
    const uint32_t level_h = isl_align_npot(isl_minify(H0, level), image_align_sa.h);
-   const uint32_t level_d = isl_align_npot(isl_minify(D0, level), image_align_sa.d);
+   const uint32_t level_d =
+      isl_align_npot(surf->dim == ISL_SURF_DIM_3D ? isl_minify(D0, level) : AL,
+                     image_align_sa.d);
 
    const uint32_t max_layers_horiz = MIN(level_d, 1u << level);
 
@@ -1837,6 +1954,65 @@
    *y_offset_sa = y;
 }
 
+static void
+get_image_offset_sa_gen6_stencil_hiz(const struct isl_surf *surf,
+                                     uint32_t level,
+                                     uint32_t logical_array_layer,
+                                     uint32_t *x_offset_sa,
+                                     uint32_t *y_offset_sa)
+{ /* Sample offset of (level, layer) in ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ. */
+   assert(level < surf->levels);
+   assert(surf->logical_level0_px.depth == 1);
+   assert(logical_array_layer < surf->logical_level0_px.array_len);
+
+   const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format);
+
+   const struct isl_extent3d image_align_sa =
+      isl_surf_get_image_alignment_sa(surf);
+
+   struct isl_tile_info tile_info;
+   isl_tiling_get_info(surf->tiling, fmtl->bpb, &tile_info);
+   const struct isl_extent2d tile_extent_sa = { /* elements -> samples */
+      .w = tile_info.logical_extent_el.w * fmtl->bw,
+      .h = tile_info.logical_extent_el.h * fmtl->bh,
+   };
+   /* Tile size is a multiple of image alignment */
+   assert(tile_extent_sa.w % image_align_sa.w == 0);
+   assert(tile_extent_sa.h % image_align_sa.h == 0);
+
+   const uint32_t W0 = surf->phys_level0_sa.w;
+   const uint32_t H0 = surf->phys_level0_sa.h;
+
+   /* Each image has the same height as LOD0 because the hardware thinks
+    * everything is LOD0, so H is derived from H0 and is never minified.
+    */
+   const uint32_t H = isl_align(H0, image_align_sa.h);
+
+   /* Sanity check: the array pitch must equal the aligned LOD0 height */
+   if (surf->phys_level0_sa.array_len > 1)
+      assert(surf->array_pitch_el_rows == isl_assert_div(H, fmtl->bh));
+
+   uint32_t x = 0, y = 0; /* running offset of the requested LOD */
+   for (uint32_t l = 0; l < level; ++l) {
+      const uint32_t W = isl_minify(W0, l);
+
+      const uint32_t w = isl_align(W, tile_extent_sa.w);
+      const uint32_t h = isl_align(H * surf->phys_level0_sa.a,
+                                   tile_extent_sa.h);
+
+      if (l == 0) {
+         y += h; /* LOD0 (all layers) sits on top; later LODs start below */
+      } else {
+         x += w; /* LOD1 and above are placed side by side */
+      }
+   }
+
+   y += H * logical_array_layer; /* layers of a LOD are stacked vertically */
+
+   *x_offset_sa = x;
+   *y_offset_sa = y;
+}
+
 /**
  * A variant of isl_surf_get_image_offset_sa() specific to
  * ISL_DIM_LAYOUT_GEN9_1D.
@@ -1902,9 +2078,15 @@
                                   x_offset_sa, y_offset_sa);
       break;
    case ISL_DIM_LAYOUT_GEN4_3D:
-      get_image_offset_sa_gen4_3d(surf, level, logical_z_offset_px,
+      get_image_offset_sa_gen4_3d(surf, level, logical_array_layer +
+                                  logical_z_offset_px,
                                   x_offset_sa, y_offset_sa);
       break;
+   case ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ:
+      get_image_offset_sa_gen6_stencil_hiz(surf, level, logical_array_layer +
+                                           logical_z_offset_px,
+                                           x_offset_sa, y_offset_sa);
+      break;
 
    default:
       unreachable("not reached");
@@ -1938,9 +2120,88 @@
 }
 
 void
-isl_tiling_get_intratile_offset_el(const struct isl_device *dev,
-                                   enum isl_tiling tiling,
-                                   uint8_t bs,
+isl_surf_get_image_offset_B_tile_sa(const struct isl_surf *surf,
+                                    uint32_t level,
+                                    uint32_t logical_array_layer,
+                                    uint32_t logical_z_offset_px,
+                                    uint32_t *offset_B,
+                                    uint32_t *x_offset_sa,
+                                    uint32_t *y_offset_sa)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format);
+
+   uint32_t total_x_offset_el, total_y_offset_el;
+   isl_surf_get_image_offset_el(surf, level, logical_array_layer,
+                                logical_z_offset_px,
+                                &total_x_offset_el,
+                                &total_y_offset_el);
+
+   uint32_t x_offset_el, y_offset_el;
+   isl_tiling_get_intratile_offset_el(surf->tiling, fmtl->bpb,
+                                      surf->row_pitch,
+                                      total_x_offset_el,
+                                      total_y_offset_el,
+                                      offset_B,
+                                      &x_offset_el,
+                                      &y_offset_el);
+
+   if (x_offset_sa) {
+      *x_offset_sa = x_offset_el * fmtl->bw;
+   } else {
+      assert(x_offset_el == 0);
+   }
+
+   if (y_offset_sa) {
+      *y_offset_sa = y_offset_el * fmtl->bh;
+   } else {
+      assert(y_offset_el == 0);
+   }
+}
+
+void
+isl_surf_get_image_surf(const struct isl_device *dev,
+                        const struct isl_surf *surf,
+                        uint32_t level,
+                        uint32_t logical_array_layer,
+                        uint32_t logical_z_offset_px,
+                        struct isl_surf *image_surf,
+                        uint32_t *offset_B,
+                        uint32_t *x_offset_sa,
+                        uint32_t *y_offset_sa)
+{ /* Offsets are read from surf first: image_surf is allowed to be surf. */
+   isl_surf_get_image_offset_B_tile_sa(surf,
+                                       level,
+                                       logical_array_layer,
+                                       logical_z_offset_px,
+                                       offset_B,
+                                       x_offset_sa,
+                                       y_offset_sa);
+
+   /* The result holds only a single face, even for cube maps, so drop the
+    * cube usage flag if present.
+    */
+   const isl_surf_usage_flags_t usage =
+      surf->usage & (~ISL_SURF_USAGE_CUBE_BIT);
+
+   bool ok UNUSED; /* only read by the assert below */
+   ok = isl_surf_init(dev, image_surf,
+                      .dim = ISL_SURF_DIM_2D,
+                      .format = surf->format,
+                      .width = isl_minify(surf->logical_level0_px.w, level),
+                      .height = isl_minify(surf->logical_level0_px.h, level),
+                      .depth = 1,
+                      .levels = 1, /* one miplevel ... */
+                      .array_len = 1, /* ... and one array slice */
+                      .samples = surf->samples,
+                      .row_pitch = surf->row_pitch, /* keep the parent's pitch */
+                      .usage = usage,
+                      .tiling_flags = (1 << surf->tiling)); /* parent's tiling */
+   assert(ok);
+}
+
+void
+isl_tiling_get_intratile_offset_el(enum isl_tiling tiling,
+                                   uint32_t bpb,
                                    uint32_t row_pitch,
                                    uint32_t total_x_offset_el,
                                    uint32_t total_y_offset_el,
@@ -1949,17 +2210,16 @@
                                    uint32_t *y_offset_el)
 {
    if (tiling == ISL_TILING_LINEAR) {
+      assert(bpb % 8 == 0);
       *base_address_offset = total_y_offset_el * row_pitch +
-                             total_x_offset_el * bs;
+                             total_x_offset_el * (bpb / 8);
       *x_offset_el = 0;
       *y_offset_el = 0;
       return;
    }
 
-   const uint32_t bpb = bs * 8;
-
    struct isl_tile_info tile_info;
-   isl_tiling_get_info(dev, tiling, bpb, &tile_info);
+   isl_tiling_get_info(tiling, bpb, &tile_info);
 
    assert(row_pitch % tile_info.phys_extent_B.width == 0);
 
diff --git a/src/intel/isl/isl.h b/src/intel/isl/isl.h
index c9970bc..dafe952 100644
--- a/src/intel/isl/isl.h
+++ b/src/intel/isl/isl.h
@@ -353,6 +353,20 @@
    ISL_FORMAT_ASTC_LDR_2D_10X10_FLT16 =                        630,
    ISL_FORMAT_ASTC_LDR_2D_12X10_FLT16 =                        638,
    ISL_FORMAT_ASTC_LDR_2D_12X12_FLT16 =                        639,
+   ISL_FORMAT_ASTC_HDR_2D_4X4_FLT16 =                          832,
+   ISL_FORMAT_ASTC_HDR_2D_5X4_FLT16 =                          840,
+   ISL_FORMAT_ASTC_HDR_2D_5X5_FLT16 =                          841,
+   ISL_FORMAT_ASTC_HDR_2D_6X5_FLT16 =                          849,
+   ISL_FORMAT_ASTC_HDR_2D_6X6_FLT16 =                          850,
+   ISL_FORMAT_ASTC_HDR_2D_8X5_FLT16 =                          865,
+   ISL_FORMAT_ASTC_HDR_2D_8X6_FLT16 =                          866,
+   ISL_FORMAT_ASTC_HDR_2D_8X8_FLT16 =                          868,
+   ISL_FORMAT_ASTC_HDR_2D_10X5_FLT16 =                         881,
+   ISL_FORMAT_ASTC_HDR_2D_10X6_FLT16 =                         882,
+   ISL_FORMAT_ASTC_HDR_2D_10X8_FLT16 =                         884,
+   ISL_FORMAT_ASTC_HDR_2D_10X10_FLT16 =                        886,
+   ISL_FORMAT_ASTC_HDR_2D_12X10_FLT16 =                        894,
+   ISL_FORMAT_ASTC_HDR_2D_12X12_FLT16 =                        895,
 
    /* The formats that follow are internal to ISL and as such don't have an
     * explicit number.  We'll just let the C compiler assign it for us.  Any
@@ -514,6 +528,46 @@
    ISL_DIM_LAYOUT_GEN4_3D,
 
    /**
+    * Special layout used for HiZ and stencil on Sandy Bridge to work around
+    * the hardware's lack of mipmap support.  On gen6, HiZ and stencil buffers
+    * work the same as on gen7+ except that they don't technically support
+    * mipmapping.  That does not, however, stop us from doing it.  As far as
+    * Sandy Bridge hardware is concerned, HiZ and stencil always operate on a
+    * single miplevel 2D (possibly array) image.  The dimensions of that image
+    * are NOT minified.
+    *
+    * In order to implement HiZ and stencil on Sandy Bridge, we create one
+    * full-sized 2D (possibly array) image for every LOD with every image
+    * aligned to a page boundary.  When the surface is used with the stencil
+    * or HiZ hardware, we manually offset to the image for the given LOD.
+    *
+    * As a memory saving measure, we pretend that the width of each miplevel
+    * is minified and we place LOD1 and above below LOD0 but horizontally
+    * adjacent to each other.  When considered as full-sized images, LOD1 and
+    * above technically overlap.  However, since we only write to part of that
+    * image, the hardware will never notice the overlap.
+    *
+    * This layout looks something like this:
+    *
+    *   +---------+
+    *   |         |
+    *   |         |
+    *   +---------+
+    *   |         |
+    *   |         |
+    *   +---------+
+    *
+    *   +----+ +-+ .
+    *   |    | +-+
+    *   +----+
+    *
+    *   +----+ +-+ .
+    *   |    | +-+
+    *   +----+
+    */
+   ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ,
+
+   /**
     * For details, see the Skylake BSpec >> Memory Views >> Common Surface
     * Formats >> Surface Layout and Tiling >> » 1D Surfaces.
     */
@@ -546,6 +600,193 @@
    ISL_AUX_USAGE_CCS_E,
 };
 
+/**
+ * Enum for keeping track of the state of an auxiliary compressed surface.
+ *
+ * For any given auxiliary surface compression format (HiZ, CCS, or MCS), any
+ * given slice (lod + array layer) can be in one of the seven states described
+ * by this enum.  Draw and resolve operations may cause the slice to change
+ * from one state to another.  The seven valid states are:
+ *
+ *    1) Clear:  In this state, each block in the auxiliary surface contains a
+ *       magic value that indicates that the block is in the clear state.  If
+ *       a block is in the clear state, its values in the primary surface are
+ *       ignored and the color of the samples in the block is taken from either
+ *       the RENDER_SURFACE_STATE packet for color or 3DSTATE_CLEAR_PARAMS for
+ *       depth.  Since neither the primary surface nor the auxiliary surface
+ *       contains the clear value, the surface can be cleared to a different
+ *       color by simply changing the clear color without modifying either
+ *       surface.
+ *
+ *    2) Partial Clear:  In this state, each block in the auxiliary surface
+ *       contains either the magic clear or pass-through value.  See Clear and
+ *       Pass-through for more details.
+ *
+ *    3) Compressed w/ Clear:  In this state, neither the auxiliary surface
+ *       nor the primary surface has a complete representation of the data.
+ *       Instead, both surfaces must be used together or else rendering
+ *       corruption may occur.  Depending on the auxiliary compression format
+ *       and the data, any given block in the primary surface may contain all,
+ *       some, or none of the data required to reconstruct the actual sample
+ *       values.  Blocks may also be in the clear state (see Clear) and have
+ *       their value taken from outside the surface.
+ *
+ *    4) Compressed w/o Clear:  This state is identical to the state above
+ *       except that no blocks are in the clear state.  In this state, all of
+ *       the data required to reconstruct the final sample values is contained
+ *       in the auxiliary and primary surface and the clear value is not
+ *       considered.
+ *
+ *    5) Resolved:  In this state, the primary surface contains 100% of the
+ *       data.  The auxiliary surface is also valid so the surface can be
+ *       validly used with or without aux enabled.  The auxiliary surface may,
+ *       however, contain non-trivial data and any update to the primary
+ *       surface with aux disabled will cause the two to get out of sync.
+ *
+ *    6) Pass-through:  In this state, the primary surface contains 100% of the
+ *       data and every block in the auxiliary surface contains a magic value
+ *       which indicates that the auxiliary surface should be ignored and
+ *       only the primary surface should be considered.  Updating the primary
+ *       surface without aux works fine and can be done repeatedly in this
+ *       mode.  Writing to a surface in pass-through mode with aux enabled may
+ *       cause the auxiliary buffer to contain non-trivial data and no longer
+ *       be in the pass-through state.
+ *
+ *    7) Aux Invalid:  In this state, the primary surface contains 100% of the
+ *       data and the auxiliary surface is completely bogus.  Any attempt to
+ *       use the auxiliary surface is liable to result in rendering
+ *       corruption.  The only thing that one can do to re-enable aux once
+ *       this state is reached is to use an ambiguate pass to transition into
+ *       the pass-through state.
+ *
+ * Drawing with or without aux enabled may implicitly cause the surface to
+ * transition between these states.  There are also four types of auxiliary
+ * compression operations which cause an explicit transition:
+ *
+ *    1) Fast Clear:  This operation writes the magic "clear" value to the
+ *       auxiliary surface.  This operation will safely transition any slice
+ *       of a surface from any state to the clear state so long as the entire
+ *       slice is fast cleared at once.  A fast clear that only covers part of
+ *       a slice of a surface is called a partial fast clear.
+ *
+ *    2) Full Resolve:  This operation combines the auxiliary surface data
+ *       with the primary surface data and writes the result to the primary.
+ *       For HiZ, the docs call this a depth resolve.  For CCS, the hardware
+ *       full resolve operation does both a full resolve and an ambiguate so
+ *       it actually takes you all the way to the pass-through state.
+ *
+ *    3) Partial Resolve:  This operation considers blocks which are in the
+ *       "clear" state and writes the clear value directly into the primary or
+ *       auxiliary surface.  Once this operation completes, the surface is
+ *       still compressed but no longer references the clear color.  This
+ *       operation is only available for CCS.
+ *
+ *    4) Ambiguate:  This operation throws away the current auxiliary data and
+ *       replaces it with the magic pass-through value.  If an ambiguate
+ *       operation is performed when the primary surface does not contain 100%
+ *       of the data, data will be lost.  This operation is only implemented
+ *       in hardware for depth where it is called a HiZ resolve.
+ *
+ * Not all operations are valid or useful in all states.  The diagram below
+ * contains a complete description of the states and all valid and useful
+ * transitions except clear.
+ *
+ *   Draw w/ Aux
+ *   +----------+
+ *   |          |
+ *   |       +-------------+    Draw w/ Aux     +-------------+
+ *   +------>| Compressed  |<-------------------|    Clear    |
+ *           |  w/ Clear   |----->----+         |             |
+ *           +-------------+          |         +-------------+
+ *                  |  /|\            |            |   |
+ *                  |   |             |            |   |
+ *                  |   |             +------<-----+   |  Draw w/
+ *                  |   |             |                | Clear Only
+ *                  |   |      Full   |                |   +----------+
+ *          Partial |   |     Resolve |               \|/  |          |
+ *          Resolve |   |             |         +-------------+       |
+ *                  |   |             |         |   Partial   |<------+
+ *                  |   |             |         |    Clear    |<----------+
+ *                  |   |             |         +-------------+           |
+ *                  |   |             |                |                  |
+ *                  |   |             +------>---------+  Full            |
+ *                  |   |                              | Resolve          |
+ *   Draw w/ aux    |   |   Partial Fast Clear         |                  |
+ *   +----------+   |   +--------------------------+   |                  |
+ *   |          |  \|/                             |  \|/                 |
+ *   |       +-------------+    Full Resolve    +-------------+           |
+ *   +------>| Compressed  |------------------->|  Resolved   |           |
+ *           |  w/o Clear  |<-------------------|             |           |
+ *           +-------------+    Draw w/ Aux     +-------------+           |
+ *                 /|\                             |   |                  |
+ *                  |  Draw                        |   |  Draw            |
+ *                  | w/ Aux                       |   | w/o Aux          |
+ *                  |            Ambiguate         |   |                  |
+ *                  |   +--------------------------+   |                  |
+ *   Draw w/o Aux   |   |                              |   Draw w/o Aux   |
+ *   +----------+   |   |                              |   +----------+   |
+ *   |          |   |  \|/                            \|/  |          |   |
+ *   |       +-------------+     Ambiguate      +-------------+       |   |
+ *   +------>|    Pass-    |<-------------------|     Aux     |<------+   |
+ *   +------>|   through   |                    |   Invalid   |           |
+ *   |       +-------------+                    +-------------+           |
+ *   |          |   |                                                     |
+ *   +----------+   +-----------------------------------------------------+
+ *     Draw w/                       Partial Fast Clear
+ *    Clear Only
+ *
+ *
+ * While the above general theory applies to all forms of auxiliary
+ * compression on Intel hardware, not all states and operations are available
+ * on all compression types.  However, each of the auxiliary states and
+ * operations can be fairly easily mapped onto the above diagram:
+ *
+ * HiZ:     Hierarchical depth compression is capable of being in any of the
+ *          states above.  Hardware provides three HiZ operations: "Depth
+ *          Clear", "Depth Resolve", and "HiZ Resolve" which map to "Fast
+ *          Clear", "Full Resolve", and "Ambiguate" respectively.  The
+ *          hardware provides no HiZ partial resolve operation so the only way
+ *          to get into the "Compressed w/o Clear" state is to render with HiZ
+ *          when the surface is in the resolved or pass-through states.
+ *
+ * MCS:     Multisample compression is technically capable of being in any of
+ *          the states above except that most of them aren't useful.  Both the
+ *          render engine and the sampler support MCS compression and, apart
+ *          from clear color, MCS is format-unaware so we leave the surface
+ *          compressed 100% of the time.  The hardware provides no MCS
+ *          operations.
+ *
+ * CCS_D:   Single-sample fast-clears (also called CCS_D in ISL) are one of
+ *          the simplest forms of compression since they don't do anything
+ *          beyond clear color tracking.  They really only support three of
+ *          the seven states: Clear, Partial Clear, and Pass-through.  The
+ *          only CCS_D operation is "Resolve" which maps to a full resolve
+ *          followed by an ambiguate.
+ *
+ * CCS_E:   Single-sample render target compression (also called CCS_E in ISL)
+ *          is capable of being in almost all of the above states.  The only
+ *          exception is that it does not have separate resolved and pass-
+ *          through states.  Instead, the CCS_E full resolve operation does
+ *          both a resolve and an ambiguate so it goes directly into the
+ *          pass-through state.  CCS_E also provides fast clear and partial
+ *          resolve operations which work as described above.
+ *
+ *          While it is technically possible to perform a CCS_E ambiguate, it
+ *          is not provided by Sky Lake hardware so we choose to avoid the aux
+ *          invalid state.  If the aux invalid state were determined to be
+ *          useful, a CCS ambiguate could be done by carefully rendering to
+ *          the CCS and filling it with zeros.
+ */
+enum isl_aux_state {
+   ISL_AUX_STATE_CLEAR = 0,
+   ISL_AUX_STATE_PARTIAL_CLEAR,
+   ISL_AUX_STATE_COMPRESSED_CLEAR,
+   ISL_AUX_STATE_COMPRESSED_NO_CLEAR,
+   ISL_AUX_STATE_RESOLVED,
+   ISL_AUX_STATE_PASS_THROUGH,
+   ISL_AUX_STATE_AUX_INVALID,
+};
+
 /* TODO(chadv): Explain */
 enum isl_array_pitch_span {
    ISL_ARRAY_PITCH_SPAN_FULL,
@@ -576,6 +817,21 @@
 /** @} */
 
 /**
+ * @defgroup Channel Mask
+ *
+ * These #define values are chosen to match the values of
+ * RENDER_SURFACE_STATE::Color Buffer Component Write Disables
+ *
+ * @{
+ */
+typedef uint8_t isl_channel_mask_t;
+#define ISL_CHANNEL_BLUE_BIT  (1 << 0)
+#define ISL_CHANNEL_GREEN_BIT (1 << 1)
+#define ISL_CHANNEL_RED_BIT   (1 << 2)
+#define ISL_CHANNEL_ALPHA_BIT (1 << 3)
+/** @} */
+
+/**
  * @brief A channel select (also known as texture swizzle) value
  */
 enum isl_channel_select {
@@ -681,6 +937,10 @@
       uint8_t align;
       uint8_t addr_offset;
       uint8_t aux_addr_offset;
+
+      /* Rounded up to the nearest dword to simplify GPU memcpy operations. */
+      uint8_t clear_value_size;
+      uint8_t clear_value_offset;
    } ss;
 
    /**
@@ -794,6 +1054,25 @@
 };
 
 /**
+ * Metadata about a DRM format modifier.
+ */
+struct isl_drm_modifier_info {
+   uint64_t modifier;
+
+   /** Text name of the modifier */
+   const char *name;
+
+   /** ISL tiling implied by this modifier */
+   enum isl_tiling tiling;
+
+   /** ISL aux usage implied by this modifier */
+   enum isl_aux_usage aux_usage;
+
+   /** Whether or not this modifier supports clear color */
+   bool supports_clear_color;
+};
+
+/**
  * @brief Input to surface initialization
  *
  * @invariant width >= 1
@@ -995,6 +1274,11 @@
     */
    union isl_color_value clear_color;
 
+   /**
+    * Surface write disables for gen4-5
+    */
+   isl_channel_mask_t write_disables;
+
    /* Intra-tile offset */
    uint16_t x_offset_sa, y_offset_sa;
 };
@@ -1215,6 +1499,14 @@
 }
 
 static inline bool
+isl_format_is_srgb(enum isl_format fmt)
+{
+   return isl_format_layouts[fmt].colorspace == ISL_COLORSPACE_SRGB;
+}
+
+enum isl_format isl_format_srgb_to_linear(enum isl_format fmt);
+
+static inline bool
 isl_format_is_rgb(enum isl_format fmt)
 {
    return isl_format_layouts[fmt].channels.r.bits > 0 &&
@@ -1251,6 +1543,15 @@
    return (1u << tiling) & ISL_TILING_STD_Y_MASK;
 }
 
+uint32_t
+isl_tiling_to_i915_tiling(enum isl_tiling tiling);
+
+enum isl_tiling 
+isl_tiling_from_i915_tiling(uint32_t tiling);
+
+const struct isl_drm_modifier_info * ATTRIBUTE_CONST
+isl_drm_modifier_get_info(uint64_t modifier);
+
 struct isl_extent2d ATTRIBUTE_CONST
 isl_get_interleaved_msaa_px_size_sa(uint32_t samples);
 
@@ -1336,6 +1637,9 @@
    return e;
 }
 
+bool isl_color_value_is_zero_one(union isl_color_value value,
+                                 enum isl_format format);
+
 #define isl_surf_init(dev, surf, ...) \
    isl_surf_init_s((dev), (surf), \
                    &(struct isl_surf_init_info) {  __VA_ARGS__ });
@@ -1346,8 +1650,7 @@
                 const struct isl_surf_init_info *restrict info);
 
 void
-isl_surf_get_tile_info(const struct isl_device *dev,
-                       const struct isl_surf *surf,
+isl_surf_get_tile_info(const struct isl_surf *surf,
                        struct isl_tile_info *tile_info);
 
 bool
@@ -1363,7 +1666,8 @@
 bool
 isl_surf_get_ccs_surf(const struct isl_device *dev,
                       const struct isl_surf *surf,
-                      struct isl_surf *ccs_surf);
+                      struct isl_surf *ccs_surf,
+                      uint32_t row_pitch /**< Ignored if 0 */);
 
 #define isl_surf_fill_state(dev, state, ...) \
    isl_surf_fill_state_s((dev), (state), \
@@ -1517,6 +1821,50 @@
                              uint32_t *y_offset_el);
 
 /**
+ * Calculate the offset, in bytes and intratile surface samples, to a
+ * subimage in the surface.
+ *
+ * This is equivalent to calling isl_surf_get_image_offset_el, passing the
+ * result to isl_tiling_get_intratile_offset_el, and converting the tile
+ * offsets to samples.
+ *
+ * @invariant level < surface levels
+ * @invariant logical_array_layer < logical array length of surface
+ * @invariant logical_z_offset_px < logical depth of surface at level
+ */
+void
+isl_surf_get_image_offset_B_tile_sa(const struct isl_surf *surf,
+                                    uint32_t level,
+                                    uint32_t logical_array_layer,
+                                    uint32_t logical_z_offset_px,
+                                    uint32_t *offset_B,
+                                    uint32_t *x_offset_sa,
+                                    uint32_t *y_offset_sa);
+
+/**
+ * Create an isl_surf that represents a particular subimage in the surface.
+ *
+ * The newly created surface will have a single miplevel and array slice.  The
+ * surface lives at the returned byte and intratile offsets, in samples.
+ *
+ * It is safe to call this function with surf == image_surf.
+ *
+ * @invariant level < surface levels
+ * @invariant logical_array_layer < logical array length of surface
+ * @invariant logical_z_offset_px < logical depth of surface at level
+ */
+void
+isl_surf_get_image_surf(const struct isl_device *dev,
+                        const struct isl_surf *surf,
+                        uint32_t level,
+                        uint32_t logical_array_layer,
+                        uint32_t logical_z_offset_px,
+                        struct isl_surf *image_surf,
+                        uint32_t *offset_B,
+                        uint32_t *x_offset_sa,
+                        uint32_t *y_offset_sa);
+
+/**
  * @brief Calculate the intratile offsets to a surface.
  *
  * In @a base_address_offset return the offset from the base of the surface to
@@ -1527,9 +1875,8 @@
  * surface's tiling format.
  */
 void
-isl_tiling_get_intratile_offset_el(const struct isl_device *dev,
-                                   enum isl_tiling tiling,
-                                   uint8_t bs,
+isl_tiling_get_intratile_offset_el(enum isl_tiling tiling,
+                                   uint32_t bpb,
                                    uint32_t row_pitch,
                                    uint32_t total_x_offset_el,
                                    uint32_t total_y_offset_el,
@@ -1538,8 +1885,7 @@
                                    uint32_t *y_offset_el);
 
 static inline void
-isl_tiling_get_intratile_offset_sa(const struct isl_device *dev,
-                                   enum isl_tiling tiling,
+isl_tiling_get_intratile_offset_sa(enum isl_tiling tiling,
                                    enum isl_format format,
                                    uint32_t row_pitch,
                                    uint32_t total_x_offset_sa,
@@ -1550,8 +1896,6 @@
 {
    const struct isl_format_layout *fmtl = isl_format_get_layout(format);
 
-   assert(fmtl->bpb % 8 == 0);
-
    /* For computing the intratile offsets, we actually want a strange unit
     * which is samples for multisampled surfaces but elements for compressed
     * surfaces.
@@ -1561,7 +1905,7 @@
    const uint32_t total_x_offset = total_x_offset_sa / fmtl->bw;
    const uint32_t total_y_offset = total_y_offset_sa / fmtl->bh;
 
-   isl_tiling_get_intratile_offset_el(dev, tiling, fmtl->bpb / 8, row_pitch,
+   isl_tiling_get_intratile_offset_el(tiling, fmtl->bpb, row_pitch,
                                       total_x_offset, total_y_offset,
                                       base_address_offset,
                                       x_offset_sa, y_offset_sa);
diff --git a/src/intel/isl/isl_drm.c b/src/intel/isl/isl_drm.c
new file mode 100644
index 0000000..ef80e42
--- /dev/null
+++ b/src/intel/isl/isl_drm.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2017 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include <drm_fourcc.h>
+#include <i915_drm.h>
+
+#include "isl.h"
+#include "common/gen_device_info.h"
+
+uint32_t
+isl_tiling_to_i915_tiling(enum isl_tiling tiling)
+{ /* Map an ISL tiling onto an I915_TILING_* value. */
+   switch (tiling) {
+   case ISL_TILING_LINEAR:
+      return I915_TILING_NONE;
+
+   case ISL_TILING_X:
+      return I915_TILING_X;
+
+   case ISL_TILING_Y0:
+      return I915_TILING_Y;
+
+   case ISL_TILING_W: /* every remaining tiling is reported as NONE */
+   case ISL_TILING_Yf:
+   case ISL_TILING_Ys:
+   case ISL_TILING_HIZ:
+   case ISL_TILING_CCS:
+      return I915_TILING_NONE;
+   }
+
+   unreachable("Invalid ISL tiling"); /* no case above matched */
+}
+
+enum isl_tiling
+isl_tiling_from_i915_tiling(uint32_t tiling)
+{ /* Map an I915_TILING_* value onto the matching ISL tiling. */
+   switch (tiling) {
+   case I915_TILING_NONE:
+      return ISL_TILING_LINEAR;
+
+   case I915_TILING_X:
+      return ISL_TILING_X;
+
+   case I915_TILING_Y:
+      return ISL_TILING_Y0;
+   }
+
+   unreachable("Invalid i915 tiling"); /* any other value is unsupported */
+}
+
+static const struct isl_drm_modifier_info modifier_info[] = {
+   { /* Linear; members not named in an entry are zero-initialized. */
+      .modifier = DRM_FORMAT_MOD_NONE,
+      .name = "DRM_FORMAT_MOD_NONE",
+      .tiling = ISL_TILING_LINEAR,
+   },
+   { /* X-tiled */
+      .modifier = I915_FORMAT_MOD_X_TILED,
+      .name = "I915_FORMAT_MOD_X_TILED",
+      .tiling = ISL_TILING_X,
+   },
+   { /* Y-tiled */
+      .modifier = I915_FORMAT_MOD_Y_TILED,
+      .name = "I915_FORMAT_MOD_Y_TILED",
+      .tiling = ISL_TILING_Y0,
+   },
+};
+
+const struct isl_drm_modifier_info *
+isl_drm_modifier_get_info(uint64_t modifier)
+{ /* Look up the modifier_info entry for a DRM format modifier. */
+   for (unsigned i = 0; i < ARRAY_SIZE(modifier_info); i++) {
+      if (modifier_info[i].modifier == modifier)
+         return &modifier_info[i];
+   }
+
+   return NULL; /* modifier is not in the table */
+}
diff --git a/src/intel/isl/isl_emit_depth_stencil.c b/src/intel/isl/isl_emit_depth_stencil.c
index 41a01be..0d541fd 100644
--- a/src/intel/isl/isl_emit_depth_stencil.c
+++ b/src/intel/isl/isl_emit_depth_stencil.c
@@ -113,6 +113,16 @@
 #endif
    }
 
+#if GEN_GEN == 5 || GEN_GEN == 6
+   const bool separate_stencil =
+      info->stencil_surf && info->stencil_surf->format == ISL_FORMAT_R8_UINT;
+   if (separate_stencil || info->hiz_usage == ISL_AUX_USAGE_HIZ) {
+      assert(ISL_DEV_USE_SEPARATE_STENCIL(dev));
+      db.SeparateStencilBufferEnable = true;
+      db.HierarchicalDepthBufferEnable = true;
+   }
+#endif
+
 #if GEN_GEN >= 6
    struct GENX(3DSTATE_STENCIL_BUFFER) sb = {
       GENX(3DSTATE_STENCIL_BUFFER_header),
@@ -151,9 +161,6 @@
           info->hiz_usage == ISL_AUX_USAGE_HIZ);
    if (info->hiz_usage == ISL_AUX_USAGE_HIZ) {
       db.HierarchicalDepthBufferEnable = true;
-#if GEN_GEN == 5 || GEN_GEN == 6
-      db.SeparateStencilBufferEnable = true;
-#endif
 
       hiz.SurfaceBaseAddress = info->hiz_address;
       hiz.HierarchicalDepthBufferMOCS = info->mocs;
@@ -177,7 +184,26 @@
 #endif
 
       clear.DepthClearValueValid = true;
+#if GEN_GEN >= 8
       clear.DepthClearValue = info->depth_clear_value;
+#else
+      switch (info->depth_surf->format) {
+      case ISL_FORMAT_R32_FLOAT: {
+         union { float f; uint32_t u; } fu;
+         fu.f = info->depth_clear_value;
+         clear.DepthClearValue = fu.u;
+         break;
+      }
+      case ISL_FORMAT_R24_UNORM_X8_TYPELESS:
+         clear.DepthClearValue = info->depth_clear_value * ((1u << 24) - 1);
+         break;
+      case ISL_FORMAT_R16_UNORM:
+         clear.DepthClearValue = info->depth_clear_value * ((1u << 16) - 1);
+         break;
+      default:
+         unreachable("Invalid depth type");
+      }
+#endif
    }
 #endif /* GEN_GEN >= 6 */
 
diff --git a/src/intel/isl/isl_format.c b/src/intel/isl/isl_format.c
index 0452bf8..435b0d0 100644
--- a/src/intel/isl/isl_format.c
+++ b/src/intel/isl/isl_format.c
@@ -88,259 +88,273 @@
  * - Render Target Surface Types [SKL+]
  */
 static const struct surface_format_info format_info[] = {
-/* smpl filt shad CK  RT  AB  VB  SO  color TW  TR  ccs_e */
-   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x,   70, 90, 90,   R32G32B32A32_FLOAT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x,   70, 90, 90,   R32G32B32A32_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x,   70, 90, 90,   R32G32B32A32_UINT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32B32A32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32B32A32_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R64G64_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   R32G32B32X32_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32B32A32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32B32A32_USCALED)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R32G32B32A32_SFIXED)
-   SF( x,  x,  x,  x,  x,  x, 80,  x,  x,    x,  x,  x,   R64G64_PASSTHRU)
-   SF( Y, 50,  x,  x,  x,  x,  Y,  Y,  x,    x,  x,  x,   R32G32B32_FLOAT)
-   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x,    x,  x,  x,   R32G32B32_SINT)
-   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x,    x,  x,  x,   R32G32B32_UINT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32B32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32B32_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32B32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32B32_USCALED)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R32G32B32_SFIXED)
-   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x, 60,   70,  x, 90,   R16G16B16A16_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x,   70,  x, 90,   R16G16B16A16_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 90, 90,   R16G16B16A16_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 75, 90,   R16G16B16A16_UINT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x,   70, 90, 90,   R16G16B16A16_FLOAT)
-   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x,   70, 90, 90,   R32G32_FLOAT)
-   SF( Y, 70,  x,  x,  Y,  Y,  Y,  Y,  x,    x,  x,  x,   R32G32_FLOAT_LD)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x,   70, 90, 90,   R32G32_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x,   70, 90, 90,   R32G32_UINT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   R32_FLOAT_X8X24_TYPELESS)
-   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   X32_TYPELESS_G8X24_UINT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L32A32_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R64_FLOAT)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   R16G16B16X16_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x, 90,   R16G16B16X16_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A32X32_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L32X32_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   I32X32_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16G16B16A16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16G16B16A16_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32G32_USCALED)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R32G32_SFIXED)
-   SF( x,  x,  x,  x,  x,  x, 80,  x,  x,    x,  x,  x,   R64_PASSTHRU)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  Y,  x, 60,   70,  x, 90,   B8G8R8A8_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x,    x,  x,  x,   B8G8R8A8_UNORM_SRGB)
-/* smpl filt shad CK  RT  AB  VB  SO  color TW  TR  ccs_e */
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60,   70,  x,  x,   R10G10B10A2_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x, 60,    x,  x,  x,   R10G10B10A2_UNORM_SRGB)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70,  x,  x,   R10G10B10A2_UINT)
-   SF( Y,  Y,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R10G10B10_SNORM_A2_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60,   70,  x, 90,   R8G8B8A8_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60,    x,  x,  x,   R8G8B8A8_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x,   70,  x, 90,   R8G8B8A8_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 90, 90,   R8G8B8A8_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 75, 90,   R8G8B8A8_UINT)
-   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x,  x,   70,  x, 90,   R16G16_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x,   70,  x, 90,   R16G16_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 90, 90,   R16G16_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 75, 90,   R16G16_UINT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x,   70, 90, 90,   R16G16_FLOAT)
-   SF( Y,  Y,  x,  x,  Y,  Y, 75,  x, 60,   70,  x,  x,   B10G10R10A2_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60,    x,  x,  x,   B10G10R10A2_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x,   70,  x,  x,   R11G11B10_FLOAT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x,   70, 70, 90,   R32_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x,   70, 70, 90,   R32_UINT)
-   SF( Y, 50,  Y,  x,  Y,  Y,  Y,  Y,  x,   70, 70, 90,   R32_FLOAT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   R24_UNORM_X8_TYPELESS)
-   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   X24_TYPELESS_G8_UINT)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L16A16_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   I24X8_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L24X8_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A24X8_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   I32_FLOAT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L32_FLOAT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A32_FLOAT)
-   SF( Y,  Y,  x,  Y, 80, 80,  x,  x, 60,    x,  x, 90,   B8G8R8X8_UNORM)
-   SF( Y,  Y,  x,  x, 80, 80,  x,  x,  x,    x,  x,  x,   B8G8R8X8_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   R8G8B8X8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   R8G8B8X8_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   R9G9B9E5_SHAREDEXP)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   B10G10R10X2_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L16A16_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32_SNORM)
-/* smpl filt shad CK  RT  AB  VB  SO  color TW  TR  ccs_e */
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R10G10B10X2_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8G8B8A8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8G8B8A8_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16G16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16G16_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R32_USCALED)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x,   70,  x,  x,   B5G6R5_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x,    x,  x,  x,   B5G6R5_UNORM_SRGB)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x,   70,  x,  x,   B5G5R5A1_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x,    x,  x,  x,   B5G5R5A1_UNORM_SRGB)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x,   70,  x,  x,   B4G4R4A4_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x,    x,  x,  x,   B4G4R4A4_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x,   70,  x,  x,   R8G8_UNORM)
-   SF( Y,  Y,  x,  Y,  Y, 60,  Y,  x,  x,   70,  x,  x,   R8G8_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 90,  x,   R8G8_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 75,  x,   R8G8_UINT)
-   SF( Y,  Y,  Y,  x,  Y, 45,  Y,  x, 70,   70,  x,  x,   R16_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x,   70,  x,  x,   R16_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 90,  x,   R16_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 75,  x,   R16_UINT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x,   70, 90,  x,   R16_FLOAT)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A8P8_UNORM_PALETTE0)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A8P8_UNORM_PALETTE1)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   I16_UNORM)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L16_UNORM)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A16_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x,    x,  x,  x,   L8A8_UNORM)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   I16_FLOAT)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L16_FLOAT)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A16_FLOAT)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L8A8_UNORM_SRGB)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x,    x,  x,  x,   R5G5_SNORM_B6_UNORM)
-   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x,   70,  x,  x,   B5G5R5X1_UNORM)
-   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x,    x,  x,  x,   B5G5R5X1_UNORM_SRGB)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8G8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8G8_USCALED)
-/* smpl filt shad CK  RT  AB  VB  SO  color TW  TR  ccs_e */
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16_USCALED)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   P8A8_UNORM_PALETTE0)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   P8A8_UNORM_PALETTE1)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A1B5G5R5_UNORM)
+/*    smpl filt  shad  CK   RT   AB   VB   SO color TW   TR  ccs_e */
+   SF(  Y,  50,   x,   x,   Y,   Y,   Y,   Y,   x,  70,  90,  90,   R32G32B32A32_FLOAT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   Y,   x,  70,  90,  90,   R32G32B32A32_SINT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   Y,   x,  70,  90,  90,   R32G32B32A32_UINT)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32A32_UNORM)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32A32_SNORM)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R64G64_FLOAT)
+   SF(  Y,  50,   x,   x, 100, 100,   x,   x,   x,   x,   x, 100,   R32G32B32X32_FLOAT)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32A32_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32A32_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R32G32B32A32_SFIXED)
+   SF(  x,   x,   x,   x,   x,   x,  80,   x,   x,   x,   x,   x,   R64G64_PASSTHRU)
+   SF(  Y,  50,   x,   x,   x,   x,   Y,   Y,   x,   x,   x,   x,   R32G32B32_FLOAT)
+   SF(  Y,   x,   x,   x,   x,   x,   Y,   Y,   x,   x,   x,   x,   R32G32B32_SINT)
+   SF(  Y,   x,   x,   x,   x,   x,   Y,   Y,   x,   x,   x,   x,   R32G32B32_UINT)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32_UNORM)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32_SNORM)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R32G32B32_SFIXED)
+   SF(  Y,   Y,   x,   x,   Y,  45,   Y,   x,  60,  70,   x,  90,   R16G16B16A16_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,  90,   R16G16B16A16_SNORM)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,  90,   R16G16B16A16_SINT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,  90,   R16G16B16A16_UINT)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70,  90,  90,   R16G16B16A16_FLOAT)
+   SF(  Y,  50,   x,   x,   Y,   Y,   Y,   Y,   x,  70,  90,  90,   R32G32_FLOAT)
+   SF(  Y,  70,   x,   x,   Y,   Y,   Y,   Y,   x,   x,   x,   x,   R32G32_FLOAT_LD)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   Y,   x,  70,  90,  90,   R32G32_SINT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   Y,   x,  70,  90,  90,   R32G32_UINT)
+   SF(  Y,  50,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   R32_FLOAT_X8X24_TYPELESS)
+   SF(  Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   X32_TYPELESS_G8X24_UINT)
+   SF(  Y,  50,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L32A32_FLOAT)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32_UNORM)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32_SNORM)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R64_FLOAT)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   R16G16B16X16_UNORM)
+   SF(  Y,   Y,   x,   x,  90,  90,   x,   x,   x,   x,   x,  90,   R16G16B16X16_FLOAT)
+   SF(  Y,  50,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A32X32_FLOAT)
+   SF(  Y,  50,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L32X32_FLOAT)
+   SF(  Y,  50,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   I32X32_FLOAT)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16G16B16A16_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16G16B16A16_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R32G32_SFIXED)
+   SF(  x,   x,   x,   x,   x,   x,  80,   x,   x,   x,   x,   x,   R64_PASSTHRU)
+   SF(  Y,   Y,   x,   Y,   Y,   Y,   Y,   x,  60,  70,   x,  90,   B8G8R8A8_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   x,   x,   x,   x,   x, 100,   B8G8R8A8_UNORM_SRGB)
+/*    smpl filt  shad  CK   RT   AB   VB   SO color TW   TR  ccs_e */
+   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,  60,  70,   x, 100,   R10G10B10A2_UNORM)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,  60,   x,   x,   x,   R10G10B10A2_UNORM_SRGB)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,   x, 100,   R10G10B10A2_UINT)
+   SF(  Y,   Y,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R10G10B10_SNORM_A2_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,  60,  70,   x,  90,   R8G8B8A8_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   x,   x,  60,   x,   x, 100,   R8G8B8A8_UNORM_SRGB)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,  90,   R8G8B8A8_SNORM)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,  90,   R8G8B8A8_SINT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,  90,   R8G8B8A8_UINT)
+   SF(  Y,   Y,   x,   x,   Y,  45,   Y,   x,   x,  70,   x,  90,   R16G16_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,  90,   R16G16_SNORM)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,  90,   R16G16_SINT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,  90,   R16G16_UINT)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70,  90,  90,   R16G16_FLOAT)
+   SF(  Y,   Y,   x,   x,   Y,   Y,  75,   x,  60,  70,   x, 100,   B10G10R10A2_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   x,   x,  60,   x,   x, 100,   B10G10R10A2_UNORM_SRGB)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70,   x, 100,   R11G11B10_FLOAT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   Y,   x,  70,  70,  90,   R32_SINT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   Y,   x,  70,  70,  90,   R32_UINT)
+   SF(  Y,  50,   Y,   x,   Y,   Y,   Y,   Y,   x,  70,  70,  90,   R32_FLOAT)
+   SF(  Y,  50,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   R24_UNORM_X8_TYPELESS)
+   SF(  Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   X24_TYPELESS_G8_UINT)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L16A16_UNORM)
+   SF(  Y,  50,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   I24X8_UNORM)
+   SF(  Y,  50,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L24X8_UNORM)
+   SF(  Y,  50,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A24X8_UNORM)
+   SF(  Y,  50,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   I32_FLOAT)
+   SF(  Y,  50,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L32_FLOAT)
+   SF(  Y,  50,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A32_FLOAT)
+   SF(  Y,   Y,   x,   Y,  80,  80,   x,   x,  60,   x,   x,  90,   B8G8R8X8_UNORM)
+   SF(  Y,   Y,   x,   x,  80,  80,   x,   x,   x,   x,   x, 100,   B8G8R8X8_UNORM_SRGB)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   R8G8B8X8_UNORM)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   R8G8B8X8_UNORM_SRGB)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   R9G9B9E5_SHAREDEXP)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   B10G10R10X2_UNORM)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L16A16_FLOAT)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32_UNORM)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32_SNORM)
+/*    smpl filt  shad  CK   RT   AB   VB   SO color TW   TR  ccs_e */
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R10G10B10X2_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8G8B8A8_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8G8B8A8_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16G16_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16G16_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32_USCALED)
+   SF(  Y,   Y,   x,   Y,   Y,   Y,   x,   x,   x,  70,   x,   x,   B5G6R5_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   x,   x,   x,   x,   x,   x,   B5G6R5_UNORM_SRGB)
+   SF(  Y,   Y,   x,   Y,   Y,   Y,   x,   x,   x,  70,   x,   x,   B5G5R5A1_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   x,   x,   x,   x,   x,   x,   B5G5R5A1_UNORM_SRGB)
+   SF(  Y,   Y,   x,   Y,   Y,   Y,   x,   x,   x,  70,   x,   x,   B4G4R4A4_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   x,   x,   x,   x,   x,   x,   B4G4R4A4_UNORM_SRGB)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70,   x,   x,   R8G8_UNORM)
+   SF(  Y,   Y,   x,   Y,   Y,  60,   Y,   x,   x,  70,   x,   x,   R8G8_SNORM)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,   x,   R8G8_SINT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,   x,   R8G8_UINT)
+   SF(  Y,   Y,   Y,   x,   Y,  45,   Y,   x,  70,  70,   x,   x,   R16_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,   x,   R16_SNORM)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,   x,   R16_SINT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,   x,   R16_UINT)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70,  90,   x,   R16_FLOAT)
+   SF( 50,  50,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A8P8_UNORM_PALETTE0)
+   SF( 50,  50,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A8P8_UNORM_PALETTE1)
+   SF(  Y,   Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   I16_UNORM)
+   SF(  Y,   Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L16_UNORM)
+   SF(  Y,   Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A16_UNORM)
+   SF(  Y,   Y,   x,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   L8A8_UNORM)
+   SF(  Y,   Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   I16_FLOAT)
+   SF(  Y,   Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L16_FLOAT)
+   SF(  Y,   Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A16_FLOAT)
+   SF( 45,  45,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L8A8_UNORM_SRGB)
+   SF(  Y,   Y,   x,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   R5G5_SNORM_B6_UNORM)
+   SF(  x,   x,   x,   x,   Y,   Y,   x,   x,   x,  70,   x,   x,   B5G5R5X1_UNORM)
+   SF(  x,   x,   x,   x,   Y,   Y,   x,   x,   x,   x,   x,   x,   B5G5R5X1_UNORM_SRGB)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8G8_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8G8_USCALED)
+/*    smpl filt  shad  CK   RT   AB   VB   SO color TW   TR  ccs_e */
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16_USCALED)
+   SF( 50,  50,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   P8A8_UNORM_PALETTE0)
+   SF( 50,  50,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   P8A8_UNORM_PALETTE1)
+   SF(  x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A1B5G5R5_UNORM)
    /* According to the PRM, A4B4G4R4_UNORM isn't supported until Sky Lake
     * but empirical testing indicates that at least sampling works just fine
     * on Broadwell.
     */
-   SF(80, 80,  x,  x, 90,  x,  x,  x,  x,    x,  x,  x,   A4B4G4R4_UNORM)
-   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L8A8_UINT)
-   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L8A8_SINT)
-   SF( Y,  Y,  x, 45,  Y,  Y,  Y,  x,  x,   70,  x,  x,   R8_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x,   70,  x,  x,   R8_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 90,  x,   R8_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x,   70, 75,  x,   R8_UINT)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x,   70,  x,  x,   A8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   I8_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x,    x,  x,  x,   L8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   P4A4_UNORM_PALETTE0)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A4P4_UNORM_PALETTE0)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8_USCALED)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   P8_UNORM_PALETTE0)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L8_UNORM_SRGB)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   P8_UNORM_PALETTE1)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   P4A4_UNORM_PALETTE1)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   A4P4_UNORM_PALETTE1)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   Y8_UNORM)
-   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L8_UINT)
-   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   L8_SINT)
-   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   I8_UINT)
-   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   I8_SINT)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   DXT1_RGB_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   R1_UNORM)
-   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60,    x,  x,  x,   YCRCB_NORMAL)
-   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60,    x,  x,  x,   YCRCB_SWAPUVY)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   P2_UNORM_PALETTE0)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   P2_UNORM_PALETTE1)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x,    x,  x,  x,   BC1_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x,    x,  x,  x,   BC2_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x,    x,  x,  x,   BC3_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC4_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC5_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC1_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC2_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC3_UNORM_SRGB)
-   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   MONO8)
-   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60,    x,  x,  x,   YCRCB_SWAPUV)
-   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60,    x,  x,  x,   YCRCB_SWAPY)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   DXT1_RGB)
-/* smpl filt shad CK  RT  AB  VB  SO  color TW  TR  ccs_e */
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   FXT1)
-   SF(75, 75,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8G8B8_UNORM)
-   SF(75, 75,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8G8B8_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8G8B8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R8G8B8_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R64G64B64A64_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R64G64B64_FLOAT)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC4_SNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC5_SNORM)
-   SF(50, 50,  x,  x,  x,  x, 60,  x,  x,    x,  x,  x,   R16G16B16_FLOAT)
-   SF(75, 75,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16G16B16_UNORM)
-   SF(75, 75,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16G16B16_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16G16B16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x,    x,  x,  x,   R16G16B16_USCALED)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC6H_SF16)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC7_UNORM)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC7_UNORM_SRGB)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   BC6H_UF16)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   PLANAR_420_8)
-   SF(75, 75,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   R8G8B8_UNORM_SRGB)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ETC1_RGB8)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ETC2_RGB8)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   EAC_R11)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   EAC_RG11)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   EAC_SIGNED_R11)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   EAC_SIGNED_RG11)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ETC2_SRGB8)
-   SF(90,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R16G16B16_UINT)
-   SF(90,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R16G16B16_SINT)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R32_SFIXED)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R10G10B10A2_SNORM)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R10G10B10A2_USCALED)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R10G10B10A2_SSCALED)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R10G10B10A2_SINT)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   B10G10R10A2_SNORM)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   B10G10R10A2_USCALED)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   B10G10R10A2_SSCALED)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   B10G10R10A2_UINT)
-   SF( x,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   B10G10R10A2_SINT)
-   SF( x,  x,  x,  x,  x,  x, 80,  x,  x,    x,  x,  x,   R64G64B64A64_PASSTHRU)
-   SF( x,  x,  x,  x,  x,  x, 80,  x,  x,    x,  x,  x,   R64G64B64_PASSTHRU)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ETC2_RGB8_PTA)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ETC2_SRGB8_PTA)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ETC2_EAC_RGBA8)
-   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ETC2_EAC_SRGB8_A8)
-   SF(90,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R8G8B8_UINT)
-   SF(90,  x,  x,  x,  x,  x, 75,  x,  x,    x,  x,  x,   R8G8B8_SINT)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_4X4_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_5X4_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_5X5_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_6X5_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_6X6_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_8X5_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_8X6_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_8X8_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_10X5_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_10X6_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_10X8_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_10X10_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_12X10_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_12X12_FLT16)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_4X4_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_5X4_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_5X5_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_6X5_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_6X6_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_8X5_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_8X6_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_8X8_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_10X5_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_10X6_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_10X8_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_10X10_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_12X10_U8SRGB)
-   SF(90, 90,  x,  x,  x,  x,  x,  x,  x,    x,  x,  x,   ASTC_LDR_2D_12X12_U8SRGB)
+   SF( 80,  80,   x,   x,  90,   x,   x,   x,   x,   x,   x,   x,   A4B4G4R4_UNORM)
+   SF( 90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L8A8_UINT)
+   SF( 90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L8A8_SINT)
+   SF(  Y,   Y,   x,  45,   Y,   Y,   Y,   x,   x,  70,   x,   x,   R8_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,   x,   R8_SNORM)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,   x,   R8_SINT)
+   SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,   x,   R8_UINT)
+   SF(  Y,   Y,   x,   Y,   Y,   Y,   x,   x,   x,  70,   x,   x,   A8_UNORM)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   I8_UNORM)
+   SF(  Y,   Y,   x,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   L8_UNORM)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   P4A4_UNORM_PALETTE0)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A4P4_UNORM_PALETTE0)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8_USCALED)
+   SF( 45,  45,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   P8_UNORM_PALETTE0)
+   SF( 45,  45,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L8_UNORM_SRGB)
+   SF( 45,  45,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   P8_UNORM_PALETTE1)
+   SF( 45,  45,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   P4A4_UNORM_PALETTE1)
+   SF( 45,  45,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   A4P4_UNORM_PALETTE1)
+   SF(  x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   Y8_UNORM)
+   SF( 90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L8_UINT)
+   SF( 90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L8_SINT)
+   SF( 90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   I8_UINT)
+   SF( 90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   I8_SINT)
+   SF( 45,  45,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   DXT1_RGB_SRGB)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   R1_UNORM)
+   SF(  Y,   Y,   x,   Y,   Y,   x,   x,   x,  60,   x,   x,   x,   YCRCB_NORMAL)
+   SF(  Y,   Y,   x,   Y,   Y,   x,   x,   x,  60,   x,   x,   x,   YCRCB_SWAPUVY)
+   SF( 45,  45,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   P2_UNORM_PALETTE0)
+   SF( 45,  45,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   P2_UNORM_PALETTE1)
+   SF(  Y,   Y,   x,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   BC1_UNORM)
+   SF(  Y,   Y,   x,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   BC2_UNORM)
+   SF(  Y,   Y,   x,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   BC3_UNORM)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC4_UNORM)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC5_UNORM)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC1_UNORM_SRGB)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC2_UNORM_SRGB)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC3_UNORM_SRGB)
+   SF(  Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   MONO8)
+   SF(  Y,   Y,   x,   x,   Y,   x,   x,   x,  60,   x,   x,   x,   YCRCB_SWAPUV)
+   SF(  Y,   Y,   x,   x,   Y,   x,   x,   x,  60,   x,   x,   x,   YCRCB_SWAPY)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   DXT1_RGB)
+/*    smpl filt  shad  CK   RT   AB   VB   SO color TW   TR  ccs_e */
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   FXT1)
+   SF( 75,  75,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8G8B8_UNORM)
+   SF( 75,  75,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8G8B8_SNORM)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8G8B8_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R8G8B8_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R64G64B64A64_FLOAT)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R64G64B64_FLOAT)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC4_SNORM)
+   SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC5_SNORM)
+   SF( 50,  50,   x,   x,   x,   x,  60,   x,   x,   x,   x,   x,   R16G16B16_FLOAT)
+   SF( 75,  75,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16G16B16_UNORM)
+   SF( 75,  75,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16G16B16_SNORM)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16G16B16_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R16G16B16_USCALED)
+   SF( 70,  70,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC6H_SF16)
+   SF( 70,  70,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC7_UNORM)
+   SF( 70,  70,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC7_UNORM_SRGB)
+   SF( 70,  70,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   BC6H_UF16)
+   SF(  x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   PLANAR_420_8)
+   SF( 75,  75,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   R8G8B8_UNORM_SRGB)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ETC1_RGB8)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ETC2_RGB8)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   EAC_R11)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   EAC_RG11)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   EAC_SIGNED_R11)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   EAC_SIGNED_RG11)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ETC2_SRGB8)
+   SF( 90,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R16G16B16_UINT)
+   SF( 90,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R16G16B16_SINT)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R32_SFIXED)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R10G10B10A2_SNORM)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R10G10B10A2_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R10G10B10A2_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R10G10B10A2_SINT)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   B10G10R10A2_SNORM)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   B10G10R10A2_USCALED)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   B10G10R10A2_SSCALED)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   B10G10R10A2_UINT)
+   SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   B10G10R10A2_SINT)
+   SF(  x,   x,   x,   x,   x,   x,  80,   x,   x,   x,   x,   x,   R64G64B64A64_PASSTHRU)
+   SF(  x,   x,   x,   x,   x,   x,  80,   x,   x,   x,   x,   x,   R64G64B64_PASSTHRU)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ETC2_RGB8_PTA)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ETC2_SRGB8_PTA)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ETC2_EAC_RGBA8)
+   SF( 80,  80,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ETC2_EAC_SRGB8_A8)
+   SF( 90,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R8G8B8_UINT)
+   SF( 90,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R8G8B8_SINT)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_4X4_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_5X4_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_5X5_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_6X5_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_6X6_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_8X5_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_8X6_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_8X8_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_10X5_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_10X6_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_10X8_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_10X10_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_12X10_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_12X12_FLT16)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_4X4_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_5X4_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_5X5_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_6X5_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_6X6_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_8X5_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_8X6_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_8X8_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_10X5_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_10X6_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_10X8_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_10X10_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_12X10_U8SRGB)
+   SF( 90,  90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_LDR_2D_12X12_U8SRGB)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_4X4_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_5X4_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_5X5_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_6X5_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_6X6_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_8X5_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_8X6_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_8X8_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_10X5_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_10X6_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_10X8_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_10X10_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_12X10_FLT16)
+   SF(100, 100,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   ASTC_HDR_2D_12X12_FLT16)
 };
 #undef x
 #undef Y
@@ -387,10 +401,17 @@
          return true;
    } else if (devinfo->is_cherryview) {
       const struct isl_format_layout *fmtl = isl_format_get_layout(format);
-      /* Support for ASTC exists on Cherry View even though big-core
+      /* Support for ASTC LDR exists on Cherry View even though big-core
        * GPUs didn't get it until Skylake.
        */
       if (fmtl->txc == ISL_TXC_ASTC)
+         return format < ISL_FORMAT_ASTC_HDR_2D_4X4_FLT16;
+   } else if (gen_device_info_is_9lp(devinfo)) {
+      const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+      /* Support for ASTC HDR exists on Broxton even though big-core
+       * GPUs didn't get it until Cannonlake.
+       */
+      if (fmtl->txc == ISL_TXC_ASTC)
          return true;
    }
 
@@ -413,10 +434,17 @@
          return true;
    } else if (devinfo->is_cherryview) {
       const struct isl_format_layout *fmtl = isl_format_get_layout(format);
-      /* Support for ASTC exists on Cherry View even though big-core
+      /* Support for ASTC LDR exists on Cherry View even though big-core
        * GPUs didn't get it until Skylake.
        */
       if (fmtl->txc == ISL_TXC_ASTC)
+         return format < ISL_FORMAT_ASTC_HDR_2D_4X4_FLT16;
+   } else if (gen_device_info_is_9lp(devinfo)) {
+      const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+      /* Support for ASTC HDR exists on Broxton even though big-core
+       * GPUs didn't get it until Cannonlake.
+       */
+      if (fmtl->txc == ISL_TXC_ASTC)
          return true;
    }
 
@@ -526,16 +554,19 @@
     *       - any compressed texture format (BC*)
     *       - any YCRCB* format
     *
-    * The restriction on the format's size is removed on Broadwell.  Also,
-    * there is an exception for HiZ which we treat as a compressed format and
-    * is allowed to be multisampled on Broadwell and earlier.
+    * The restriction on the format's size is removed on Broadwell. Moreover,
+    * empirically it appears that even Ivy Bridge can handle multisampled
+    * surfaces with format sizes up to 128 bits (RGBA32F, RGBA32I, RGBA32UI).
+    *
+    * Also, there is an exception for HiZ which we treat as a compressed
+    * format and is allowed to be multisampled on Broadwell and earlier.
     */
    if (format == ISL_FORMAT_HIZ) {
       /* On SKL+, HiZ is always single-sampled even when the primary surface
        * is multisampled.  See also isl_surf_get_hiz_surf().
        */
       return devinfo->gen <= 8;
-   } else if (devinfo->gen < 8 && isl_format_get_layout(format)->bpb > 64) {
+   } else if (devinfo->gen < 7 && isl_format_get_layout(format)->bpb > 64) {
       return false;
    } else if (isl_format_is_compressed(format)) {
       return false;
diff --git a/src/intel/isl/isl_format_layout.csv b/src/intel/isl/isl_format_layout.csv
index f0f31c7..f340e30 100644
--- a/src/intel/isl/isl_format_layout.csv
+++ b/src/intel/isl/isl_format_layout.csv
@@ -314,6 +314,20 @@
 ASTC_LDR_2D_10X10_FLT16     , 128, 10, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
 ASTC_LDR_2D_12X10_FLT16     , 128, 12, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
 ASTC_LDR_2D_12X12_FLT16     , 128, 12, 12,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_4X4_FLT16       , 128,  4,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_5X4_FLT16       , 128,  5,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_5X5_FLT16       , 128,  5,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_6X5_FLT16       , 128,  6,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_6X6_FLT16       , 128,  6,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_8X5_FLT16       , 128,  8,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_8X6_FLT16       , 128,  8,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_8X8_FLT16       , 128,  8,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_10X5_FLT16      , 128, 10,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_10X6_FLT16      , 128, 10,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_10X8_FLT16      , 128, 10,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_10X10_FLT16     , 128, 10, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_12X10_FLT16     , 128, 12, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
+ASTC_HDR_2D_12X12_FLT16     , 128, 12, 12,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
 HIZ                         , 128,  8,  4,  1,     ,     ,     ,     ,     ,     ,    ,       ,   hiz
 MCS_2X                      ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,       ,   mcs
 MCS_4X                      ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,       ,   mcs
diff --git a/src/intel/isl/isl_gen4.c b/src/intel/isl/isl_gen4.c
index 9fed4541..14706c8 100644
--- a/src/intel/isl/isl_gen4.c
+++ b/src/intel/isl/isl_gen4.c
@@ -38,6 +38,66 @@
 }
 
+/**
+ * @brief Filter out tiling flags that are incompatible with the surface.
+ *
+ * Gen4-5 variant: only ever clears bits in @a flags, it never sets one.
+ *
+ * @param[in]     dev    device; only consulted by an assertion here
+ * @param[in]     info   surface parameters (usage, samples, format)
+ * @param[in,out] flags  candidate tilings, narrowed in place
+ */
 void
+isl_gen4_filter_tiling(const struct isl_device *dev,
+                       const struct isl_surf_init_info *restrict info,
+                       isl_tiling_flags_t *flags)
+{
+   /* Gen4-5 only support linear, X, and Y-tiling. */
+   *flags &= (ISL_TILING_LINEAR_BIT | ISL_TILING_X_BIT | ISL_TILING_Y0_BIT);
+
+   if (isl_surf_usage_is_depth_or_stencil(info->usage)) {
+      assert(!ISL_DEV_USE_SEPARATE_STENCIL(dev));
+
+      /* From the g35 PRM Vol. 2, 3DSTATE_DEPTH_BUFFER::Tile Walk:
+       *
+       *    "The Depth Buffer, if tiled, must use Y-Major tiling"
+       */
+      *flags &= (ISL_TILING_LINEAR_BIT | ISL_TILING_Y0_BIT);
+   }
+
+   if (info->usage & (ISL_SURF_USAGE_DISPLAY_ROTATE_90_BIT |
+                      ISL_SURF_USAGE_DISPLAY_ROTATE_180_BIT |
+                      ISL_SURF_USAGE_DISPLAY_ROTATE_270_BIT)) {
+      assert(info->usage & ISL_SURF_USAGE_DISPLAY_BIT);
+      isl_finishme("%s:%s: handle rotated display surfaces",
+                   __FILE__, __func__);
+   }
+
+   if (info->usage & (ISL_SURF_USAGE_DISPLAY_FLIP_X_BIT |
+                      ISL_SURF_USAGE_DISPLAY_FLIP_Y_BIT)) {
+      assert(info->usage & ISL_SURF_USAGE_DISPLAY_BIT);
+      isl_finishme("%s:%s: handle flipped display surfaces",
+                   __FILE__, __func__);
+   }
+
+   if (info->usage & ISL_SURF_USAGE_DISPLAY_BIT) {
+      /* Before Skylake, the display engine does not accept Y */
+      *flags &= (ISL_TILING_LINEAR_BIT | ISL_TILING_X_BIT);
+   }
+
+   assert(info->samples == 1);
+
+   /* From the g35 PRM, Volume 1, 11.5.5, "Per-Stream Tile Format Support":
+    *
+    *    "NOTE: 128BPE Format Color buffer ( render target ) MUST be either
+    *    TileX or Linear."
+    *
+    * This is required all the way up to Sandy Bridge.
+    */
+   if (isl_format_get_layout(info->format)->bpb >= 128)
+      *flags &= ~ISL_TILING_Y0_BIT;
+}
+
+void
 isl_gen4_choose_image_alignment_el(const struct isl_device *dev,
                                    const struct isl_surf_init_info *restrict info,
                                    enum isl_tiling tiling,
diff --git a/src/intel/isl/isl_gen4.h b/src/intel/isl/isl_gen4.h
index dc6102b..c04f7fb 100644
--- a/src/intel/isl/isl_gen4.h
+++ b/src/intel/isl/isl_gen4.h
@@ -37,6 +37,11 @@
                             enum isl_msaa_layout *msaa_layout);
 
 void
+isl_gen4_filter_tiling(const struct isl_device *dev,
+                       const struct isl_surf_init_info *restrict info,
+                       isl_tiling_flags_t *flags);
+
+void
 isl_gen4_choose_image_alignment_el(const struct isl_device *dev,
                                    const struct isl_surf_init_info *restrict info,
                                    enum isl_tiling tiling,
diff --git a/src/intel/isl/isl_gen6.c b/src/intel/isl/isl_gen6.c
index b746903..51f2100 100644
--- a/src/intel/isl/isl_gen6.c
+++ b/src/intel/isl/isl_gen6.c
@@ -88,6 +88,8 @@
     *    | format                 | halign | valign |
     *    +------------------------+--------+--------+
     *    | YUV 4:2:2 formats      |      4 |      * |
+    *    | BC1-5                  |      4 |      4 |
+    *    | FXT1                   |      8 |      4 |
     *    | uncompressed formats   |      4 |      * |
     *    +------------------------+--------+--------+
     *
@@ -110,38 +112,32 @@
     */
 
    if (isl_format_is_compressed(info->format)) {
+      /* Compressed formats have an alignment equal to their block size */
       *image_align_el = isl_extent3d(1, 1, 1);
       return;
    }
 
-   if (isl_format_is_yuv(info->format)) {
+   /* Separate stencil requires 4x2 alignment */
+   if (isl_surf_usage_is_stencil(info->usage) &&
+       info->format == ISL_FORMAT_R8_UINT) {
       *image_align_el = isl_extent3d(4, 2, 1);
       return;
    }
 
+   /* Depth or combined depth stencil surfaces require 4x4 alignment */
+   if (isl_surf_usage_is_depth_or_stencil(info->usage)) {
+      *image_align_el = isl_extent3d(4, 4, 1);
+      return;
+   }
+
    if (info->samples > 1) {
       *image_align_el = isl_extent3d(4, 4, 1);
       return;
    }
 
-   if (isl_surf_usage_is_depth_or_stencil(info->usage) &&
-       !ISL_DEV_USE_SEPARATE_STENCIL(dev)) {
-      /* interleaved depthstencil buffer */
-      *image_align_el = isl_extent3d(4, 4, 1);
-      return;
-   }
-
-   if (isl_surf_usage_is_depth(info->usage)) {
-      /* separate depth buffer */
-      *image_align_el = isl_extent3d(4, 4, 1);
-      return;
-   }
-
-   if (isl_surf_usage_is_stencil(info->usage)) {
-      /* separate stencil buffer */
-      *image_align_el = isl_extent3d(4, 2, 1);
-      return;
-   }
-
+   /* For everything else, 4x2 is always a valid alignment.  Since this is
+    * also the smallest alignment we can specify, we use 4x2 for everything
+    * else because it uses the least memory.
+    */
    *image_align_el = isl_extent3d(4, 2, 1);
 }
diff --git a/src/intel/isl/isl_gen7.c b/src/intel/isl/isl_gen7.c
index 8e6b441..24d411f 100644
--- a/src/intel/isl/isl_gen7.c
+++ b/src/intel/isl/isl_gen7.c
@@ -24,6 +24,25 @@
 #include "isl_gen7.h"
 #include "isl_priv.h"
 
+static bool
+gen7_format_needs_valign2(const struct isl_device *dev,
+                          enum isl_format format)
+{
+   assert(ISL_DEV_GEN(dev) == 7);
+
+   /* The Ivybridge PRM (2012-05-31), Volume 4, Part 1, Section 2.12.1,
+    * RENDER_SURFACE_STATE Surface Vertical Alignment, rules out VALIGN_4
+    * (field value 1) for the formats YCRCB_NORMAL (0x182), YCRCB_SWAPUVY
+    * (0x183), YCRCB_SWAPUV (0x18f) and YCRCB_SWAPY (0x190), and separately
+    * for the surface format R32G32B32_FLOAT.
+    */
+   if (isl_format_is_yuv(format))
+      return true;
+
+   /* Not YUV: R32G32B32_FLOAT is the only other format needing VALIGN_2. */
+   return format == ISL_FORMAT_R32G32B32_FLOAT;
+}
+
 bool
 isl_gen7_choose_msaa_layout(const struct isl_device *dev,
                             const struct isl_surf_init_info *info,
@@ -76,8 +95,13 @@
     * Note that the above SINT restrictions apply only to *MSRTs* (that is,
     * *multisampled* render targets). The restrictions seem to permit an MCS
     * if the render target is singlesampled.
+    *
+    * Moreover, empirically it appears that the hardware can render
+    * multisampled surfaces with RGBA8I, RGBA16I and RGBA32I.
     */
-   if (isl_format_has_sint_channel(info->format))
+
+   /* Multisampling requires vertical alignment of four. */
+   if (info->samples > 1 && gen7_format_needs_valign2(dev, info->format))
       return false;
 
    /* More obvious restrictions */
@@ -151,25 +175,6 @@
    return true;
 }
 
-static bool
-gen7_format_needs_valign2(const struct isl_device *dev,
-                          enum isl_format format)
-{
-   assert(ISL_DEV_GEN(dev) == 7);
-
-   /* From the Ivybridge PRM (2012-05-31), Volume 4, Part 1, Section 2.12.1,
-    * RENDER_SURFACE_STATE Surface Vertical Alignment:
-    *
-    *    - Value of 1 [VALIGN_4] is not supported for format YCRCB_NORMAL
-    *      (0x182), YCRCB_SWAPUVY (0x183), YCRCB_SWAPUV (0x18f), YCRCB_SWAPY
-    *      (0x190)
-    *
-    *    - VALIGN_4 is not supported for surface format R32G32B32_FLOAT.
-    */
-   return isl_format_is_yuv(format) ||
-          format == ISL_FORMAT_R32G32B32_FLOAT;
-}
-
 /**
  * @brief Filter out tiling flags that are incompatible with the surface.
  *
@@ -289,86 +294,6 @@
       *flags &= ~ISL_TILING_Y0_BIT;
 }
 
-/**
- * Choose horizontal subimage alignment, in units of surface elements.
- */
-static uint32_t
-gen7_choose_halign_el(const struct isl_device *dev,
-                      const struct isl_surf_init_info *restrict info)
-{
-   if (isl_format_is_compressed(info->format))
-      return 1;
-
-   /* From the Ivybridge PRM (2012-05-31), Volume 4, Part 1, Section 2.12.1,
-    * RENDER_SURFACE_STATE Surface Hoizontal Alignment:
-    *
-    *    - This field is intended to be set to HALIGN_8 only if the surface
-    *      was rendered as a depth buffer with Z16 format or a stencil buffer,
-    *      since these surfaces support only alignment of 8. Use of HALIGN_8
-    *      for other surfaces is supported, but uses more memory.
-    */
-   if (isl_surf_info_is_z16(info) ||
-       isl_surf_usage_is_stencil(info->usage))
-      return 8;
-
-   return 4;
-}
-
-/**
- * Choose vertical subimage alignment, in units of surface elements.
- */
-static uint32_t
-gen7_choose_valign_el(const struct isl_device *dev,
-                      const struct isl_surf_init_info *restrict info,
-                      enum isl_tiling tiling)
-{
-   MAYBE_UNUSED bool require_valign2 = false;
-   bool require_valign4 = false;
-
-   if (isl_format_is_compressed(info->format))
-      return 1;
-
-   if (gen7_format_needs_valign2(dev, info->format))
-      require_valign2 = true;
-
-   /* From the Ivybridge PRM, Volume 4, Part 1, Section 2.12.1:
-    * RENDER_SURFACE_STATE Surface Vertical Alignment:
-    *
-    *    - This field is intended to be set to VALIGN_4 if the surface was
-    *      rendered as a depth buffer, for a multisampled (4x) render target,
-    *      or for a multisampled (8x) render target, since these surfaces
-    *      support only alignment of 4.  Use of VALIGN_4 for other surfaces is
-    *      supported, but uses more memory.  This field must be set to
-    *      VALIGN_4 for all tiled Y Render Target surfaces.
-    *
-    */
-   if (isl_surf_usage_is_depth(info->usage) ||
-       info->samples > 1 ||
-       ((info->usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
-        tiling == ISL_TILING_Y0)) {
-      require_valign4 = true;
-   }
-
-   if (isl_surf_usage_is_stencil(info->usage)) {
-      /* The Ivybridge PRM states that the stencil buffer's vertical alignment
-       * is 8 [Ivybridge PRM, Volume 1, Part 1, Section 6.18.4.4 Alignment
-       * Unit Size]. valign=8 is outside the set of valid values of
-       * RENDER_SURFACE_STATE.SurfaceVerticalAlignment, but that's ok because
-       * a stencil buffer will never be used directly for texturing or
-       * rendering on gen7.
-       */
-      return 8;
-   }
-
-   assert(!require_valign2 || !require_valign4);
-
-   if (require_valign4)
-      return 4;
-
-   /* Prefer VALIGN_2 because it conserves memory. */
-   return 2;
-}
-
 void
 isl_gen7_choose_image_alignment_el(const struct isl_device *dev,
                                    const struct isl_surf_init_info *restrict info,
@@ -385,9 +310,80 @@
    /* IVB+ does not support combined depthstencil. */
    assert(!isl_surf_usage_is_depth_and_stencil(info->usage));
 
-   *image_align_el = (struct isl_extent3d) {
-      .w = gen7_choose_halign_el(dev, info),
-      .h = gen7_choose_valign_el(dev, info, tiling),
-      .d = 1,
-   };
+   /* From the Ivy Bridge PRM, Vol. 2, Part 2, Section 6.18.4.4,
+    * "Alignment unit size", the alignment parameters are summarized in the
+    * following table:
+    *
+    *     Surface Defined By | Surface Format  | Align Width | Align Height
+    *    --------------------+-----------------+-------------+--------------
+    *       DEPTH_BUFFER     |   D16_UNORM     |      8      |      4
+    *                        |     other       |      4      |      4
+    *    --------------------+-----------------+-------------+--------------
+    *       STENCIL_BUFFER   |      N/A        |      8      |      8
+    *    --------------------+-----------------+-------------+--------------
+    *       SURFACE_STATE    | BC*, ETC*, EAC* |      4      |      4
+    *                        |      FXT1       |      8      |      4
+    *                        |   all others    |   HALIGN    |   VALIGN
+    *    -------------------------------------------------------------------
+    */
+   if (isl_surf_usage_is_depth(info->usage)) {
+      *image_align_el = info->format == ISL_FORMAT_R16_UNORM ?
+                        isl_extent3d(8, 4, 1) : isl_extent3d(4, 4, 1);
+      return;
+   } else if (isl_surf_usage_is_stencil(info->usage)) {
+      *image_align_el = isl_extent3d(8, 8, 1);
+      return;
+   } else if (isl_format_is_compressed(info->format)) {
+      /* Compressed formats all have alignment equal to block size. */
+      *image_align_el = isl_extent3d(1, 1, 1);
+      return;
+   }
+
+   /* Everything after this point is in the "set by Surface Horizontal or
+    * Vertical Alignment" case.  Now it's just a matter of applying
+    * restrictions.
+    */
+
+   /* There are no restrictions on halign beyond what's given in the table
+    * above.  We set it to the minimum value of 4 because that uses the least
+    * memory.
+    */
+   const uint32_t halign = 4;
+
+   bool require_valign4 = false;
+
+   /* From the Ivybridge PRM, Volume 4, Part 1, Section 2.12.1:
+    * RENDER_SURFACE_STATE Surface Vertical Alignment:
+    *
+    *    * This field is intended to be set to VALIGN_4 if the surface was
+    *      rendered as a depth buffer,
+    *
+    *    * for a multisampled (4x) render target, or for a multisampled (8x)
+    *      render target, since these surfaces support only alignment of 4.
+    *
+    *    * This field must be set to VALIGN_4 for all tiled Y Render Target
+    *      surfaces
+    *
+    *    * Value of 1 is not supported for format YCRCB_NORMAL (0x182),
+    *      YCRCB_SWAPUVY (0x183), YCRCB_SWAPUV (0x18f), YCRCB_SWAPY (0x190)
+    *
+    *    * If Number of Multisamples is not MULTISAMPLECOUNT_1, this field
+    *      must be set to VALIGN_4.
+    *
+    * The first restriction is already handled by the table above and the
+    * second restriction is redundant with the fifth.
+    */
+   if (info->samples > 1)
+      require_valign4 = true;
+
+   if (tiling == ISL_TILING_Y0 &&
+       (info->usage & ISL_SURF_USAGE_RENDER_TARGET_BIT))
+      require_valign4 = true;
+
+   assert(!(require_valign4 && gen7_format_needs_valign2(dev, info->format)));
+
+   /* We default to VALIGN_2 because it uses the least memory. */
+   const uint32_t valign = require_valign4 ? 4 : 2;
+
+   *image_align_el = isl_extent3d(halign, valign, 1);
 }
diff --git a/src/intel/isl/isl_gen8.c b/src/intel/isl/isl_gen8.c
index 01500b8..2199b8d 100644
--- a/src/intel/isl/isl_gen8.c
+++ b/src/intel/isl/isl_gen8.c
@@ -87,132 +87,6 @@
    return true;
 }
 
-/**
- * Choose horizontal subimage alignment, in units of surface elements.
- */
-static uint32_t
-gen8_choose_halign_el(const struct isl_device *dev,
-                      const struct isl_surf_init_info *restrict info)
-{
-   /* From the Broadwell PRM, Volume 2d "Command Reference: Structures",
-    * RENDER_SURFACE_STATE Surface Horizontal Alignment, p326:
-    *
-    *    - This field is intended to be set to HALIGN_8 only if the surface
-    *      was rendered as a depth buffer with Z16 format or a stencil buffer.
-    *      In this case it must be set to HALIGN_8 since these surfaces
-    *      support only alignment of 8.  For Z32 formats it must be set to
-    *      HALIGN_4.
-    *
-    * From the Broadwell PRM, Volume 4, "Memory Views" p. 186, the alignment
-    * parameters are summarized in the following table:
-    *
-    *     Surface Defined By | Surface Format  | Align Width | Align Height
-    *    --------------------+-----------------+-------------+--------------
-    *       DEPTH_BUFFER     |   D16_UNORM     |      8      |      4
-    *                        |     other       |      4      |      4
-    *    --------------------+-----------------+-------------+--------------
-    *       STENCIL_BUFFER   |      N/A        |      8      |      8
-    *    --------------------+-----------------+-------------+--------------
-    *       SURFACE_STATE    | BC*, ETC*, EAC* |      4      |      4
-    *                        |      FXT1       |      8      |      4
-    *                        |   all others    |   HALIGN    |   VALIGN
-    *    -------------------------------------------------------------------
-    */
-   if (isl_surf_usage_is_depth(info->usage))
-      return info->format == ISL_FORMAT_R16_UNORM ? 8 : 4;
-
-   if (isl_surf_usage_is_stencil(info->usage))
-      return 8;
-
-   /* All compressed formats in the above table have an alignment equal to
-    * their compression block size.  This translates to an alignment in
-    * elements of 1.
-    */
-   if (isl_format_is_compressed(info->format))
-      return 1;
-
-   if (!(info->usage & ISL_SURF_USAGE_DISABLE_AUX_BIT)) {
-      /* From the Broadwell PRM, Volume 2d "Command Reference: Structures",
-       * RENDER_SURFACE_STATE Surface Horizontal Alignment, p326:
-       *
-       *    - When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E,
-       *      HALIGN 16 must be used.
-       *
-       * This case handles color surfaces that may own an auxiliary MCS, CCS_D,
-       * or CCS_E. Depth buffers, including those that own an auxiliary HiZ
-       * surface, are handled above and do not require HALIGN_16.
-       */
-      assert(!isl_surf_usage_is_depth(info->usage));
-      return 16;
-   }
-
-   /* XXX(chadv): I believe the hardware requires each image to be
-    * cache-aligned. If that's true, then defaulting to halign=4 is wrong for
-    * many formats. Depending on the format's block size, we may need to
-    * increase halign to 8.
-    */
-   return 4;
-}
-
-/**
- * Choose vertical subimage alignment, in units of surface elements.
- */
-static uint32_t
-gen8_choose_valign_el(const struct isl_device *dev,
-                      const struct isl_surf_init_info *restrict info)
-{
-   /* From the Broadwell PRM > Volume 2d: Command Reference: Structures
-    * > RENDER_SURFACE_STATE Surface Vertical Alignment (p325):
-    *
-    *    - For Sampling Engine and Render Target Surfaces: This field
-    *      specifies the vertical alignment requirement in elements for the
-    *      surface. [...] An element is defined as a pixel in uncompresed
-    *      surface formats, and as a compression block in compressed surface
-    *      formats. For MSFMT_DEPTH_STENCIL type multisampled surfaces, an
-    *      element is a sample.
-    *
-    *    - This field is intended to be set to VALIGN_4 if the surface was
-    *      rendered as a depth buffer, for a multisampled (4x) render target,
-    *      or for a multisampled (8x) render target, since these surfaces
-    *      support only alignment of 4. Use of VALIGN_4 for other surfaces is
-    *      supported, but increases memory usage.
-    *
-    *    - This field is intended to be set to VALIGN_8 only if the surface
-    *       was rendered as a stencil buffer, since stencil buffer surfaces
-    *       support only alignment of 8. If set to VALIGN_8, Surface Format
-    *       must be R8_UINT.
-    *
-    * From the Broadwell PRM, Volume 4, "Memory Views" p. 186, the alignment
-    * parameters are summarized in the following table:
-    *
-    *     Surface Defined By | Surface Format  | Align Width | Align Height
-    *    --------------------+-----------------+-------------+--------------
-    *       DEPTH_BUFFER     |   D16_UNORM     |      8      |      4
-    *                        |     other       |      4      |      4
-    *    --------------------+-----------------+-------------+--------------
-    *       STENCIL_BUFFER   |      N/A        |      8      |      8
-    *    --------------------+-----------------+-------------+--------------
-    *       SURFACE_STATE    | BC*, ETC*, EAC* |      4      |      4
-    *                        |      FXT1       |      8      |      4
-    *                        |   all others    |   HALIGN    |   VALIGN
-    *    -------------------------------------------------------------------
-    */
-   if (isl_surf_usage_is_depth(info->usage))
-      return 4;
-
-   if (isl_surf_usage_is_stencil(info->usage))
-      return 8;
-
-   /* All compressed formats in the above table have an alignment equal to
-    * their compression block size.  This translates to an alignment in
-    * elements of 1.
-    */
-   if (isl_format_is_compressed(info->format))
-      return 1;
-
-   return 4;
-}
-
 void
 isl_gen8_choose_image_alignment_el(const struct isl_device *dev,
                                    const struct isl_surf_init_info *restrict info,
@@ -239,30 +113,65 @@
       return;
    }
 
-   /* The below text from the Broadwell PRM provides some insight into the
-    * hardware's requirements for LOD alignment.  From the Broadwell PRM >>
-    * Volume 5: Memory Views >> Surface Layout >> 2D Surfaces:
+   /* From the Broadwell PRM, Volume 4, "Memory Views" p. 186, the alignment
+    * parameters are summarized in the following table:
     *
-    *    These [2D surfaces] must adhere to the following memory organization
-    *    rules:
-    *
-    *       - For non-compressed texture formats, each mipmap must start on an
-    *         even row within the monolithic rectangular area. For
-    *         1-texel-high mipmaps, this may require a row of padding below
-    *         the previous mipmap. This restriction does not apply to any
-    *         compressed texture formats; each subsequent (lower-res)
-    *         compressed mipmap is positioned directly below the previous
-    *         mipmap.
-    *
-    *       - Vertical alignment restrictions vary with memory tiling type:
-    *         1 DWord for linear, 16-byte (DQWord) for tiled. (Note that tiled
-    *         mipmaps are not required to start at the left edge of a tile
-    *         row.)
+    *     Surface Defined By | Surface Format  | Align Width | Align Height
+    *    --------------------+-----------------+-------------+--------------
+    *       DEPTH_BUFFER     |   D16_UNORM     |      8      |      4
+    *                        |     other       |      4      |      4
+    *    --------------------+-----------------+-------------+--------------
+    *       STENCIL_BUFFER   |      N/A        |      8      |      8
+    *    --------------------+-----------------+-------------+--------------
+    *       SURFACE_STATE    | BC*, ETC*, EAC* |      4      |      4
+    *                        |      FXT1       |      8      |      4
+    *                        |   all others    |   HALIGN    |   VALIGN
+    *    -------------------------------------------------------------------
+    */
+   if (isl_surf_usage_is_depth(info->usage)) {
+      *image_align_el = info->format == ISL_FORMAT_R16_UNORM ?
+                        isl_extent3d(8, 4, 1) : isl_extent3d(4, 4, 1);
+      return;
+   } else if (isl_surf_usage_is_stencil(info->usage)) {
+      *image_align_el = isl_extent3d(8, 8, 1);
+      return;
+   } else if (isl_format_is_compressed(info->format)) {
+      /* Compressed formats all have alignment equal to block size. */
+      *image_align_el = isl_extent3d(1, 1, 1);
+      return;
+   }
+
+   /* For all other formats, the alignment is determined by the horizontal and
+    * vertical alignment fields of RENDER_SURFACE_STATE.  There are a few
+    * restrictions, but we generally have a choice.
     */
 
-   *image_align_el = (struct isl_extent3d) {
-      .w = gen8_choose_halign_el(dev, info),
-      .h = gen8_choose_valign_el(dev, info),
-      .d = 1,
-   };
+   /* Vertical alignment is unrestricted so we choose the smallest allowed
+    * alignment because that will use the least memory.
+    */
+   const uint32_t valign = 4;
+
+   bool needs_halign16 = false;
+   if (!(info->usage & ISL_SURF_USAGE_DISABLE_AUX_BIT)) {
+      /* From the Broadwell PRM, Volume 2d "Command Reference: Structures",
+       * RENDER_SURFACE_STATE Surface Horizontal Alignment, p326:
+       *
+       *    - When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E,
+       *      HALIGN 16 must be used.
+       *
+       * This case handles color surfaces that may own an auxiliary MCS, CCS_D,
+       * or CCS_E. Depth buffers, including those that own an auxiliary HiZ
+       * surface, are handled above and do not require HALIGN_16.
+       */
+      needs_halign16 = true;
+   }
+
+   /* XXX(chadv): I believe the hardware requires each image to be
+    * cache-aligned. If that's true, then defaulting to halign=4 is wrong for
+    * many formats. Depending on the format's block size, we may need to
+    * increase halign to 8.
+    */
+   const uint32_t halign = needs_halign16 ? 16 : 4;
+
+   *image_align_el = isl_extent3d(halign, valign, 1);
 }
diff --git a/src/intel/isl/isl_priv.h b/src/intel/isl/isl_priv.h
index 3c4cc1e..525d8a2 100644
--- a/src/intel/isl/isl_priv.h
+++ b/src/intel/isl/isl_priv.h
@@ -80,6 +80,13 @@
    return ((n + a - 1) / a) * a;
 }
 
+static inline uintmax_t
+isl_assert_div(uintmax_t n, uintmax_t a)
+{
+   assert(n % a == 0); /* a must be non-zero and must divide n exactly */
+   return n / a; /* exact quotient; the remainder was asserted to be zero */
+}
+
 /**
  * Alignment must be a power of 2.
  */
@@ -178,6 +185,10 @@
                            const struct isl_surf_fill_state_info *restrict info);
 
 void
+isl_gen10_surf_fill_state_s(const struct isl_device *dev, void *state,
+                            const struct isl_surf_fill_state_info *restrict info);
+
+void
 isl_gen4_buffer_fill_state_s(void *state,
                              const struct isl_buffer_fill_state_info *restrict info);
 
@@ -206,6 +217,10 @@
                              const struct isl_buffer_fill_state_info *restrict info);
 
 void
+isl_gen10_buffer_fill_state_s(void *state,
+                              const struct isl_buffer_fill_state_info *restrict info);
+
+void
 isl_gen4_emit_depth_stencil_hiz_s(const struct isl_device *dev, void *batch,
                                   const struct isl_depth_stencil_hiz_emit_info *restrict info);
 
@@ -233,4 +248,8 @@
 isl_gen9_emit_depth_stencil_hiz_s(const struct isl_device *dev, void *batch,
                                   const struct isl_depth_stencil_hiz_emit_info *restrict info);
 
+void
+isl_gen10_emit_depth_stencil_hiz_s(const struct isl_device *dev, void *batch,
+                                   const struct isl_depth_stencil_hiz_emit_info *restrict info);
+
 #endif /* ISL_PRIV_H */
diff --git a/src/intel/isl/isl_storage_image.c b/src/intel/isl/isl_storage_image.c
index 4c56e78..a8aebce 100644
--- a/src/intel/isl/isl_storage_image.c
+++ b/src/intel/isl/isl_storage_image.c
@@ -226,8 +226,12 @@
                        view->base_array_layer;
    }
 
-   isl_surf_get_image_offset_el(surf, view->base_level, view->base_array_layer,
-                                0, &param->offset[0],  &param->offset[1]);
+   isl_surf_get_image_offset_el(surf, view->base_level,
+                                surf->dim == ISL_SURF_DIM_3D ?
+                                   0 : view->base_array_layer,
+                                surf->dim == ISL_SURF_DIM_3D ?
+                                   view->base_array_layer : 0,
+                                &param->offset[0],  &param->offset[1]);
 
    const int cpp = isl_format_get_layout(surf->format)->bpb / 8;
    param->stride[0] = cpp;
diff --git a/src/intel/isl/isl_surface_state.c b/src/intel/isl/isl_surface_state.c
index fa46469..e8bdb65 100644
--- a/src/intel/isl/isl_surface_state.c
+++ b/src/intel/isl/isl_surface_state.c
@@ -254,8 +254,32 @@
    if (info->surf->dim == ISL_SURF_DIM_1D)
       assert(!isl_format_is_compressed(info->view->format));
 
+   if (isl_format_is_compressed(info->surf->format)) {
+      /* You're not allowed to make a view of a compressed format with any
+       * format other than the surface format.  None of the userspace APIs
+       * allow for this directly and doing so would mess up a number of
+       * surface parameters such as Width, Height, and alignments.  Ideally,
+       * we'd like to assert that the two formats match.  However, we have an
+       * S3TC workaround that requires us to do reinterpretation.  So assert
+       * that they're at least the same bpb and block size.
+       */
+      MAYBE_UNUSED const struct isl_format_layout *surf_fmtl =
+         isl_format_get_layout(info->surf->format);
+      MAYBE_UNUSED const struct isl_format_layout *view_fmtl =
+         isl_format_get_layout(info->view->format);
+      assert(surf_fmtl->bpb == view_fmtl->bpb);
+      assert(surf_fmtl->bw == view_fmtl->bw);
+      assert(surf_fmtl->bh == view_fmtl->bh);
+   }
+
    s.SurfaceFormat = info->view->format;
 
+#if GEN_GEN <= 5
+   s.ColorBufferComponentWriteDisables = info->write_disables;
+#else
+   assert(info->write_disables == 0);
+#endif
+
 #if GEN_IS_HASWELL
    s.IntegerSurfaceFormat = isl_format_has_int_channel(s.SurfaceFormat);
 #endif
@@ -544,7 +568,7 @@
       assert(!(info->view->usage & ISL_SURF_USAGE_STORAGE_BIT));
 
       struct isl_tile_info tile_info;
-      isl_surf_get_tile_info(dev, info->aux_surf, &tile_info);
+      isl_surf_get_tile_info(info->aux_surf, &tile_info);
       uint32_t pitch_in_tiles =
          info->aux_surf->row_pitch / tile_info.phys_extent_B.width;
 
diff --git a/src/intel/isl/tests/isl_surf_get_image_offset_test.c b/src/intel/isl/tests/isl_surf_get_image_offset_test.c
index 1b3dc58..05fd79f 100644
--- a/src/intel/isl/tests/isl_surf_get_image_offset_test.c
+++ b/src/intel/isl/tests/isl_surf_get_image_offset_test.c
@@ -256,7 +256,6 @@
 
    t_assert_image_alignment_el(&surf, 4, 4, 1);
    t_assert_image_alignment_sa(&surf, 4, 4, 1);
-   t_assert(isl_surf_get_array_pitch_el_rows(&surf) == 74916);
    t_assert(isl_surf_get_array_pitch_sa_rows(&surf) ==
             isl_surf_get_array_pitch_el_rows(&surf));
 
diff --git a/src/intel/tools/aubinator.c b/src/intel/tools/aubinator.c
index f1bedd2..48d4456 100644
--- a/src/intel/tools/aubinator.c
+++ b/src/intel/tools/aubinator.c
@@ -99,6 +99,7 @@
 decode_group(struct gen_group *strct, const uint32_t *p, int starting_dword)
 {
    uint64_t offset = option_print_offsets ? (void *) p - gtt : 0;
+
    gen_print_group(outfile, strct, offset, p, option_color == COLOR_ALWAYS);
 }
 
@@ -523,6 +524,14 @@
 }
 
 static void
+handle_3dstate_sampler_state_pointers_gen6(struct gen_spec *spec, uint32_t *p)
+{
+   /* Dwords 1 through 3 are each handed to dump_samplers(), in order. */
+   for (int dw = 1; dw <= 3; dw++)
+      dump_samplers(spec, p[dw]);
+}
+
+static void
 handle_3dstate_viewport_state_pointers_cc(struct gen_spec *spec, uint32_t *p)
 {
    uint64_t start;
@@ -633,6 +642,8 @@
 #define _3DSTATE_SAMPLER_STATE_POINTERS_GS  0x782e0000
 #define _3DSTATE_SAMPLER_STATE_POINTERS_PS  0x782f0000
 
+#define _3DSTATE_SAMPLER_STATE_POINTERS     0x78020000
+
 #define _3DSTATE_VIEWPORT_STATE_POINTERS_CC 0x78230000
 #define _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP 0x78210000
 #define _3DSTATE_BLEND_STATE_POINTERS       0x78240000
@@ -669,6 +680,7 @@
    { _3DSTATE_SAMPLER_STATE_POINTERS_VS, handle_3dstate_sampler_state_pointers },
    { _3DSTATE_SAMPLER_STATE_POINTERS_GS, handle_3dstate_sampler_state_pointers },
    { _3DSTATE_SAMPLER_STATE_POINTERS_PS, handle_3dstate_sampler_state_pointers },
+   { _3DSTATE_SAMPLER_STATE_POINTERS, handle_3dstate_sampler_state_pointers_gen6 },
 
    { _3DSTATE_VIEWPORT_STATE_POINTERS_CC, handle_3dstate_viewport_state_pointers_cc },
    { _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, handle_3dstate_viewport_state_pointers_sf_clip },
@@ -892,6 +904,8 @@
       exit(EXIT_FAILURE);
    }
 
+   close(fd);
+
    file->cursor = file->map;
    file->end = file->map + sb.st_size / 4;
 
@@ -951,7 +965,8 @@
    { "bdw", MAKE_GEN(8, 0) },
    { "skl", MAKE_GEN(9, 0) },
    { "chv", MAKE_GEN(8, 0) },
-   { "bxt", MAKE_GEN(9, 0) }
+   { "bxt", MAKE_GEN(9, 0) },
+   { "cnl", MAKE_GEN(10, 0) },
 };
 
 enum {
@@ -1119,7 +1134,7 @@
            "Decode aub file contents from either FILE or the standard input.\n\n"
            "A valid --gen option must be provided.\n\n"
            "      --help          display this help and exit\n"
-           "      --gen=platform  decode for given platform (ivb, byt, hsw, bdw, chv, skl, kbl or bxt)\n"
+           "      --gen=platform  decode for given platform (ivb, byt, hsw, bdw, chv, skl, kbl, bxt or cnl)\n"
            "      --headers       decode only command headers\n"
            "      --color[=WHEN]  colorize the output; WHEN can be 'auto' (default\n"
            "                        if omitted), 'always', or 'never'\n"
@@ -1147,7 +1162,8 @@
       { "chv", 0x22B3 }, /* Intel(R) HD Graphics (Cherryview) */
       { "skl", 0x1912 }, /* Intel(R) HD Graphics 530 (Skylake GT2) */
       { "kbl", 0x591D }, /* Intel(R) Kabylake GT2 */
-      { "bxt", 0x0A84 }  /* Intel(R) HD Graphics (Broxton) */
+      { "bxt", 0x0A84 },  /* Intel(R) HD Graphics (Broxton) */
+      { "cnl", 0x5A52 },  /* Intel(R) HD Graphics (Cannonlake) */
    };
    const struct option aubinator_opts[] = {
       { "help",       no_argument,       (int *) &help,                 true },
diff --git a/src/intel/tools/aubinator_error_decode.c b/src/intel/tools/aubinator_error_decode.c
index 2e62369..636f56a 100644
--- a/src/intel/tools/aubinator_error_decode.c
+++ b/src/intel/tools/aubinator_error_decode.c
@@ -40,6 +40,7 @@
 
 #include "common/gen_decoder.h"
 #include "util/macros.h"
+#include "gen_disasm.h"
 
 #define CSI "\e["
 #define BLUE_HEADER  CSI "0;44m"
@@ -209,6 +210,18 @@
 #define CSI "\e["
 #define NORMAL       CSI "0m"
 
+struct program {
+   const char *type; /* human-readable label, e.g. "SIMD8 fragment shader" */
+   const char *command; /* name of the command that referenced the program */
+   uint64_t command_offset; /* offset at which decode() printed that command */
+   uint64_t instruction_base_address; /* base address in effect when recorded */
+   uint64_t ksp; /* value of the command's Kernel Start Pointer field */
+};
+
+#define MAX_NUM_PROGRAMS 4096 /* capacity of programs[] */
+static struct program programs[MAX_NUM_PROGRAMS]; /* appended to by decode() */
+static int num_programs = 0; /* number of entries of programs[] in use */
+
 static void decode(struct gen_spec *spec,
                    const char *buffer_name,
                    const char *ring_name,
@@ -219,6 +232,7 @@
    uint32_t *p, *end = (data + *count);
    int length;
    struct gen_group *inst;
+   uint64_t current_instruction_base_address = 0;
 
    for (p = data; p < end; p += length) {
       const char *color = option_full_decode ? BLUE_HEADER : NORMAL,
@@ -243,6 +257,135 @@
 
       gen_print_group(stdout, inst, offset, p,
                       option_color == COLOR_ALWAYS);
+
+      if (strcmp(inst->name, "MI_BATCH_BUFFER_END") == 0)
+         break;
+
+      if (strcmp(inst->name, "STATE_BASE_ADDRESS") == 0) {
+         struct gen_field_iterator iter;
+         gen_field_iterator_init(&iter, inst, p, false);
+
+         while (gen_field_iterator_next(&iter)) {
+            if (strcmp(iter.name, "Instruction Base Address") == 0) {
+               current_instruction_base_address = strtoull(iter.value, NULL, 16);
+            }
+         }
+      } else if (strcmp(inst->name,   "WM_STATE") == 0 ||
+                 strcmp(inst->name, "3DSTATE_PS") == 0 ||
+                 strcmp(inst->name, "3DSTATE_WM") == 0) {
+         struct gen_field_iterator iter;
+         gen_field_iterator_init(&iter, inst, p, false);
+         uint64_t ksp[3] = {0, 0, 0};
+         bool enabled[3] = {false, false, false};
+
+         while (gen_field_iterator_next(&iter)) {
+            if (strncmp(iter.name, "Kernel Start Pointer ",
+                        strlen("Kernel Start Pointer ")) == 0) {
+               int idx = iter.name[strlen("Kernel Start Pointer ")] - '0';
+               if (idx >= 0 && idx < 3) /* never index outside ksp[] */
+                  ksp[idx] = strtoull(iter.value, NULL, 16);
+            } else if (strcmp(iter.name, "8 Pixel Dispatch Enable") == 0) {
+               enabled[0] = strcmp(iter.value, "true") == 0;
+            } else if (strcmp(iter.name, "16 Pixel Dispatch Enable") == 0) {
+               enabled[1] = strcmp(iter.value, "true") == 0;
+            } else if (strcmp(iter.name, "32 Pixel Dispatch Enable") == 0) {
+               enabled[2] = strcmp(iter.value, "true") == 0;
+            }
+         }
+
+         /* FINISHME: Broken for multi-program WM_STATE,
+          * which Mesa does not use
+          */
+         if (enabled[0] + enabled[1] + enabled[2] == 1) {
+            const char *type = enabled[0] ? "SIMD8 fragment shader" :
+                               enabled[1] ? "SIMD16 fragment shader" :
+                               enabled[2] ? "SIMD32 fragment shader" : NULL;
+
+            programs[num_programs++] = (struct program) {
+               .type = type,
+               .command = inst->name,
+               .command_offset = offset,
+               .instruction_base_address = current_instruction_base_address,
+               .ksp = ksp[0],
+            };
+         } else {
+            if (enabled[0]) /* SIMD8 */ {
+               programs[num_programs++] = (struct program) {
+                  .type = "SIMD8 fragment shader",
+                  .command = inst->name,
+                  .command_offset = offset,
+                  .instruction_base_address = current_instruction_base_address,
+                  .ksp = ksp[0], /* SIMD8 shader is specified by ksp[0] */
+               };
+            }
+            if (enabled[1]) /* SIMD16 */ {
+               programs[num_programs++] = (struct program) {
+                  .type = "SIMD16 fragment shader",
+                  .command = inst->name,
+                  .command_offset = offset,
+                  .instruction_base_address = current_instruction_base_address,
+                  .ksp = ksp[2], /* SIMD16 shader is specified by ksp[2] */
+               };
+            }
+            if (enabled[2]) /* SIMD32 */ {
+               programs[num_programs++] = (struct program) {
+                  .type = "SIMD32 fragment shader",
+                  .command = inst->name,
+                  .command_offset = offset,
+                  .instruction_base_address = current_instruction_base_address,
+                  .ksp = ksp[1], /* SIMD32 shader is specified by ksp[1] */
+               };
+            }
+         }
+      } else if (strcmp(inst->name,   "VS_STATE") == 0 ||
+                 strcmp(inst->name,   "GS_STATE") == 0 ||
+                 strcmp(inst->name,   "SF_STATE") == 0 ||
+                 strcmp(inst->name, "CLIP_STATE") == 0 ||
+                 strcmp(inst->name, "3DSTATE_DS") == 0 ||
+                 strcmp(inst->name, "3DSTATE_HS") == 0 ||
+                 strcmp(inst->name, "3DSTATE_GS") == 0 ||
+                 strcmp(inst->name, "3DSTATE_VS") == 0) {
+         struct gen_field_iterator iter;
+         gen_field_iterator_init(&iter, inst, p, false);
+         uint64_t ksp = 0;
+         bool is_simd8 = false; /* vertex shaders on Gen8+ only */
+         bool is_enabled = true;
+
+         while (gen_field_iterator_next(&iter)) {
+            if (strcmp(iter.name, "Kernel Start Pointer") == 0) {
+               ksp = strtoull(iter.value, NULL, 16);
+            } else if (strcmp(iter.name, "SIMD8 Dispatch Enable") == 0) {
+               is_simd8 = strcmp(iter.value, "true") == 0;
+            } else if (strcmp(iter.name, "Dispatch Enable") == 0) {
+               is_simd8 = strcmp(iter.value, "SIMD8") == 0;
+            } else if (strcmp(iter.name, "Function Enable") == 0) {
+               is_enabled = strcmp(iter.value, "true") == 0;
+            }
+         }
+         /* HS = hull (tess control) stage, DS = domain (tess evaluation) stage. */
+         const char *type =
+            strcmp(inst->name,   "VS_STATE") == 0 ? "vertex shader" :
+            strcmp(inst->name,   "GS_STATE") == 0 ? "geometry shader" :
+            strcmp(inst->name,   "SF_STATE") == 0 ? "strips and fans shader" :
+            strcmp(inst->name, "CLIP_STATE") == 0 ? "clip shader" :
+            strcmp(inst->name, "3DSTATE_DS") == 0 ? "tessellation evaluation shader" :
+            strcmp(inst->name, "3DSTATE_HS") == 0 ? "tessellation control shader" :
+            strcmp(inst->name, "3DSTATE_VS") == 0 ? (is_simd8 ? "SIMD8 vertex shader" : "vec4 vertex shader") :
+            strcmp(inst->name, "3DSTATE_GS") == 0 ? (is_simd8 ? "SIMD8 geometry shader" : "vec4 geometry shader") :
+            NULL;
+
+         if (is_enabled) {
+            programs[num_programs++] = (struct program) {
+               .type = type,
+               .command = inst->name,
+               .command_offset = offset,
+               .instruction_base_address = current_instruction_base_address,
+               .ksp = ksp,
+            };
+         }
+      }
+
+      assert(num_programs < MAX_NUM_PROGRAMS);
    }
 }
 
@@ -345,6 +488,7 @@
    const char *buffer_name = "batch buffer";
    char *ring_name = NULL;
    struct gen_device_info devinfo;
+   struct gen_disasm *disasm = NULL;
 
    while (getline(&line, &line_size, file) > 0) {
       char *new_ring_name = NULL;
@@ -421,17 +565,51 @@
             buffer_name = "HW Context";
             continue;
          }
+
+         matched = sscanf(dashes, "--- user = 0x%08x %08x\n",
+                          &hi, &lo);
+         if (matched > 0) {
+            new_gtt_offset = hi;
+            if (matched == 2) {
+               new_gtt_offset <<= 32;
+               new_gtt_offset |= lo;
+            }
+
+            gtt_offset = new_gtt_offset;
+            free(ring_name);
+            ring_name = new_ring_name;
+            buffer_name = "user";
+            continue;
+         }
       }
 
       if (line[0] == ':' || line[0] == '~') {
          count = ascii85_decode(line+1, &data, line[0] == ':');
          if (count == 0) {
             fprintf(stderr, "ASCII85 decode failed.\n");
-            exit(1);
+            exit(EXIT_FAILURE);
          }
-         decode(spec,
-                buffer_name, ring_name,
-                gtt_offset, data, &count);
+
+         if (strcmp(buffer_name, "user") == 0) {
+            printf("Disassembly of programs in instruction buffer at "
+                   "0x%08"PRIx64":\n", gtt_offset);
+            for (int i = 0; i < num_programs; i++) {
+               if (programs[i].instruction_base_address == gtt_offset) {
+                    printf("\n%s (specified by %s at batch offset "
+                           "0x%08"PRIx64") at offset 0x%08"PRIx64"\n",
+                           programs[i].type,
+                           programs[i].command,
+                           programs[i].command_offset,
+                           programs[i].ksp);
+                    gen_disasm_disassemble(disasm, data, programs[i].ksp,
+                                           stdout);
+               }
+            }
+         } else {
+            decode(spec,
+                   buffer_name, ring_name,
+                   gtt_offset, data, &count);
+         }
          continue;
       }
 
@@ -457,9 +635,11 @@
          if (matched == 1) {
             if (!gen_get_device_info(reg, &devinfo)) {
                printf("Unable to identify devid=%x\n", reg);
-               return;
+               exit(EXIT_FAILURE);
             }
 
+            disasm = gen_disasm_create(reg);
+
             printf("Detected GEN%i chipset\n", devinfo.gen);
 
             if (xml_path == NULL)
@@ -538,7 +718,7 @@
          data = realloc(data, data_size * sizeof (uint32_t));
          if (data == NULL) {
             fprintf(stderr, "Out of memory.\n");
-            exit(1);
+            exit(EXIT_FAILURE);
          }
       }
 
@@ -549,6 +729,7 @@
           buffer_name, ring_name,
           gtt_offset, data, &count);
 
+   gen_disasm_destroy(disasm);
    free(data);
    free(line);
    free(ring_name);
@@ -643,7 +824,7 @@
 
    if (help || argc == 1) {
       print_help(argv[0], stderr);
-      exit(0);
+      exit(EXIT_SUCCESS);
    }
 
    if (optind >= argc) {
@@ -666,7 +847,7 @@
          }
       } else {
          read_data_file(stdin);
-         exit(0);
+         exit(EXIT_SUCCESS);
       }
    } else {
       path = argv[optind];
@@ -674,7 +855,7 @@
       if (error != 0) {
          fprintf(stderr, "Error opening %s: %s\n",
                  path, strerror(errno));
-         exit(1);
+         exit(EXIT_FAILURE);
       }
    }
 
@@ -693,8 +874,8 @@
       file = fopen(filename, "r");
       if (!file) {
          int minor;
+         free(filename);
          for (minor = 0; minor < 64; minor++) {
-            free(filename);
             ret = asprintf(&filename, "%s/%d/i915_error_state", path, minor);
             assert(ret > 0);
 
diff --git a/src/intel/tools/disasm.c b/src/intel/tools/disasm.c
index 62256d2..361885b 100644
--- a/src/intel/tools/disasm.c
+++ b/src/intel/tools/disasm.c
@@ -43,52 +43,67 @@
            opcode == BRW_OPCODE_SENDSC );
 }
 
-void
-gen_disasm_disassemble(struct gen_disasm *disasm, void *assembly,
-                       int start, FILE *out)
+static int
+gen_disasm_find_end(struct gen_disasm *disasm, void *assembly, int start)
 {
    struct gen_device_info *devinfo = &disasm->devinfo;
-   bool dump_hex = false;
    int offset = start;
 
    /* This loop exits when send-with-EOT or when opcode is 0 */
    while (true) {
       brw_inst *insn = assembly + offset;
-      brw_inst uncompacted;
-      bool compacted = brw_inst_cmpt_control(devinfo, insn);
-      if (0)
-         fprintf(out, "0x%08x: ", offset);
 
-      if (compacted) {
-         brw_compact_inst *compacted = (void *)insn;
-         if (dump_hex) {
-            fprintf(out, "0x%08x 0x%08x                       ",
-                   ((uint32_t *)insn)[1],
-                   ((uint32_t *)insn)[0]);
-         }
-
-         brw_uncompact_instruction(devinfo, &uncompacted, compacted);
-         insn = &uncompacted;
+      if (brw_inst_cmpt_control(devinfo, insn)) {
          offset += 8;
       } else {
-         if (dump_hex) {
-            fprintf(out, "0x%08x 0x%08x 0x%08x 0x%08x ",
-                   ((uint32_t *)insn)[3],
-                   ((uint32_t *)insn)[2],
-                   ((uint32_t *)insn)[1],
-                   ((uint32_t *)insn)[0]);
-         }
          offset += 16;
       }
 
-      brw_disassemble_inst(out, devinfo, insn, compacted);
-
       /* Simplistic, but efficient way to terminate disasm */
       uint32_t opcode = brw_inst_opcode(devinfo, insn);
       if (opcode == 0 || (is_send(opcode) && brw_inst_eot(devinfo, insn))) {
          break;
       }
    }
+
+   return offset;
+}
+
+/* Disassemble the program that starts at byte offset `start` of `assembly`,
+ * writing the listing and any validation error text to `out`.
+ */
+void
+gen_disasm_disassemble(struct gen_disasm *disasm, void *assembly,
+                       int start, FILE *out)
+{
+   struct gen_device_info *devinfo = &disasm->devinfo;
+   int end = gen_disasm_find_end(disasm, assembly, start);
+
+   /* Make a dummy annotation structure that brw_validate_instructions
+    * can work from.
+    */
+   struct annotation_info annotation_info = {
+      .ann_count = 1,
+      .ann_size = 2,
+   };
+   annotation_info.mem_ctx = ralloc_context(NULL);
+   annotation_info.ann = rzalloc_array(annotation_info.mem_ctx,
+                                       struct annotation,
+                                       annotation_info.ann_size);
+   annotation_info.ann[0].offset = start;
+   annotation_info.ann[1].offset = end;
+   brw_validate_instructions(devinfo, assembly, start, end, &annotation_info);
+   struct annotation *annotation = annotation_info.ann;
+
+   for (int i = 0; i < annotation_info.ann_count; i++) {
+      int start_offset = annotation[i].offset;
+      int end_offset = annotation[i + 1].offset;
+      brw_disassemble(devinfo, assembly, start_offset, end_offset, out);
+      if (annotation[i].error)
+         fputs(annotation[i].error, out);
+   }
+
+   ralloc_free(annotation_info.mem_ctx);
+}
 
 struct gen_disasm *
diff --git a/src/intel/tools/intel_aub.h b/src/intel/tools/intel_aub.h
new file mode 100644
index 0000000..5f0aba8
--- /dev/null
+++ b/src/intel/tools/intel_aub.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file intel_aub.h
+ *
+ * The AUB file is a file format used by Intel's internal simulation
+ * and other validation tools.  It can be used at various levels by a
+ * driver to input state to the simulated hardware or a replaying
+ * debugger.
+ *
+ * We choose to dump AUB files using the trace block format for ease
+ * of implementation -- dump out the blocks of memory as plain blobs
+ * and insert ring commands to execute the batchbuffer blob.
+ */
+
+#ifndef _INTEL_AUB_H
+#define _INTEL_AUB_H
+
+#define AUB_MI_NOOP			(0)
+#define AUB_MI_BATCH_BUFFER_START 	(0x31 << 23)
+#define AUB_PIPE_CONTROL		(0x7a000002)
+
+/* DW0: instruction type. */
+
+#define CMD_AUB			(7 << 29)
+
+#define CMD_AUB_HEADER		(CMD_AUB | (1 << 23) | (0x05 << 16))
+/* DW1 */
+# define AUB_HEADER_MAJOR_SHIFT		24
+# define AUB_HEADER_MINOR_SHIFT		16
+
+#define CMD_AUB_TRACE_HEADER_BLOCK (CMD_AUB | (1 << 23) | (0x41 << 16))
+#define CMD_AUB_DUMP_BMP           (CMD_AUB | (1 << 23) | (0x9e << 16))
+
+/* DW1 */
+#define AUB_TRACE_OPERATION_MASK	0x000000ff
+#define AUB_TRACE_OP_COMMENT		0x00000000
+#define AUB_TRACE_OP_DATA_WRITE		0x00000001
+#define AUB_TRACE_OP_COMMAND_WRITE	0x00000002
+#define AUB_TRACE_OP_MMIO_WRITE		0x00000003
+// operation = TRACE_DATA_WRITE, Type
+#define AUB_TRACE_TYPE_MASK		0x0000ff00
+#define AUB_TRACE_TYPE_NOTYPE		(0 << 8)
+#define AUB_TRACE_TYPE_BATCH		(1 << 8)
+#define AUB_TRACE_TYPE_VERTEX_BUFFER	(5 << 8)
+#define AUB_TRACE_TYPE_2D_MAP		(6 << 8)
+#define AUB_TRACE_TYPE_CUBE_MAP		(7 << 8)
+#define AUB_TRACE_TYPE_VOLUME_MAP	(9 << 8)
+#define AUB_TRACE_TYPE_1D_MAP		(10 << 8)
+#define AUB_TRACE_TYPE_CONSTANT_BUFFER	(11 << 8)
+#define AUB_TRACE_TYPE_CONSTANT_URB	(12 << 8)
+#define AUB_TRACE_TYPE_INDEX_BUFFER	(13 << 8)
+#define AUB_TRACE_TYPE_GENERAL		(14 << 8)
+#define AUB_TRACE_TYPE_SURFACE		(15 << 8)
+
+
+// operation = TRACE_COMMAND_WRITE, Type =
+#define AUB_TRACE_TYPE_RING_HWB		(1 << 8)
+#define AUB_TRACE_TYPE_RING_PRB0	(2 << 8)
+#define AUB_TRACE_TYPE_RING_PRB1	(3 << 8)
+#define AUB_TRACE_TYPE_RING_PRB2	(4 << 8)
+
+// Address space
+#define AUB_TRACE_ADDRESS_SPACE_MASK	0x00ff0000
+#define AUB_TRACE_MEMTYPE_GTT		(0 << 16)
+#define AUB_TRACE_MEMTYPE_LOCAL		(1 << 16)
+#define AUB_TRACE_MEMTYPE_NONLOCAL	(2 << 16)
+#define AUB_TRACE_MEMTYPE_PCI		(3 << 16)
+#define AUB_TRACE_MEMTYPE_GTT_ENTRY     (4 << 16)
+
+/* DW2 */
+
+/**
+ * aub_state_struct_type enum values are encoded with the top 16 bits
+ * representing the type to be delivered to the .aub file, and the bottom 16
+ * bits representing the subtype.  This macro performs the encoding.
+ */
+#define ENCODE_SS_TYPE(type, subtype) (((type) << 16) | (subtype))
+
+enum aub_state_struct_type {
+   AUB_TRACE_VS_STATE =			ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 1),
+   AUB_TRACE_GS_STATE =			ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 2),
+   AUB_TRACE_CLIP_STATE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 3),
+   AUB_TRACE_SF_STATE =			ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 4),
+   AUB_TRACE_WM_STATE =			ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 5),
+   AUB_TRACE_CC_STATE =			ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 6),
+   AUB_TRACE_CLIP_VP_STATE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 7),
+   AUB_TRACE_SF_VP_STATE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 8),
+   AUB_TRACE_CC_VP_STATE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0x9),
+   AUB_TRACE_SAMPLER_STATE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0xa),
+   AUB_TRACE_KERNEL_INSTRUCTIONS =	ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0xb),
+   AUB_TRACE_SCRATCH_SPACE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0xc),
+   AUB_TRACE_SAMPLER_DEFAULT_COLOR =	ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0xd),
+
+   AUB_TRACE_SCISSOR_STATE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0x15),
+   AUB_TRACE_BLEND_STATE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0x16),
+   AUB_TRACE_DEPTH_STENCIL_STATE =	ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0x17),
+
+   AUB_TRACE_VERTEX_BUFFER =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_VERTEX_BUFFER, 0),
+   AUB_TRACE_BINDING_TABLE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_SURFACE, 0x100),
+   AUB_TRACE_SURFACE_STATE =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_SURFACE, 0x200),
+   AUB_TRACE_VS_CONSTANTS =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_CONSTANT_BUFFER, 0),
+   AUB_TRACE_WM_CONSTANTS =		ENCODE_SS_TYPE(AUB_TRACE_TYPE_CONSTANT_BUFFER, 1),
+};
+
+#undef ENCODE_SS_TYPE
+
+/**
+ * Decode an aub_state_struct_type value: returns its top 16 bits, the type
+ * that should be stored in the .aub file.
+ */
+static inline uint32_t AUB_TRACE_TYPE(enum aub_state_struct_type ss_type)
+{
+   return (ss_type & 0xFFFF0000) >> 16;
+}
+
+/**
+ * Decode an aub_state_struct_type value: returns its bottom 16 bits, the
+ * subtype that should be stored in the .aub file.
+ */
+static inline uint32_t AUB_TRACE_SUBTYPE(enum aub_state_struct_type ss_type)
+{
+   return ss_type & 0xFFFF;
+}
+
+/* DW3: address */
+/* DW4: len */
+
+#endif /* _INTEL_AUB_H */
diff --git a/src/intel/vulkan/BUILD.gn b/src/intel/vulkan/BUILD.gn
index fdd002c..f108694 100644
--- a/src/intel/vulkan/BUILD.gn
+++ b/src/intel/vulkan/BUILD.gn
@@ -29,9 +29,10 @@
   include_dirs = [
     ".",
     "..",  # because  anv_private.h includes isl/isl.h
-    "$magma_build_root/third_party/libdrm",  # because anv_private.h includes i915_drm.h
+    "$mesa_build_root/include/drm-uapi",  # because anv_private.h includes i915_drm.h
     "$mesa_build_root/src/compiler",  # because anv_nir.h includes nir/nir.h
     "$mesa_build_root/src",  # because isl.h includes util/macros.h
+    "$mesa_build_root/src/vulkan/util",  # because anv_private.h includes vk_alloc.h
     "$root_gen_dir/third_party/mesa/src/intel/vulkan",
   ]
 
@@ -100,11 +101,13 @@
     "anv_nir_apply_pipeline_layout.c",
     "anv_nir_lower_push_constants.c",
     "anv_nir_lower_input_attachments.c",
+    "anv_nir_lower_multiview.c",
     "anv_pass.c",
     "anv_pipeline.c",
     "anv_pipeline_cache.c",
     "anv_platform.cc",
     "anv_private.h",
+    "anv_queue.c",
     "anv_util.c",
     "anv_wsi.c",
     "anv_wsi_magma.cc",
@@ -118,6 +121,7 @@
     ":gen75",
     ":gen8",
     ":gen9",
+    ":gen10",
   ]
 }
 
@@ -160,6 +164,10 @@
   defines = "GEN_VERSIONx10=90"
 }
 
+vulkan_gen8("gen10") {
+  defines = "GEN_VERSIONx10=100"
+}
+
 template("vulkan_gen7") {
   source_set(target_name) {
     defines = [ invoker.defines ]
diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
index 89897bc..7ac99d3 100644
--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -163,11 +163,22 @@
 }
 
 static void
-anv_free_list_push(union anv_free_list *list, void *map, int32_t offset)
+anv_free_list_push(union anv_free_list *list, void *map, int32_t offset,
+                   uint32_t size, uint32_t count)
 {
    union anv_free_list current, old, new;
    int32_t *next_ptr = map + offset;
 
+   /* If we're returning more than one chunk, we need to build a chain to add
+    * to the list.  Fortunately, we can do this without any atomics since we
+    * own everything in the chain right now.  `offset` is left pointing to the
+    * head of our chain list while `next_ptr` points to the tail.
+    */
+   for (uint32_t i = 1; i < count; i++) {
+      VG_NOACCESS_WRITE(next_ptr, offset + i * size);
+      next_ptr = map + offset + i * size;
+   }
+
    old = *list;
    do {
       current = old;
@@ -229,22 +240,19 @@
    } while (old != current);
 }
 
-static uint32_t
-anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state);
+static VkResult
+anv_block_pool_expand_range(struct anv_block_pool *pool,
+                            uint32_t center_bo_offset, uint32_t size);
 
 VkResult
 anv_block_pool_init(struct anv_block_pool *pool,
-                    struct anv_device *device, uint32_t block_size)
+                    struct anv_device *device,
+                    uint32_t initial_size)
 {
    VkResult result;
 
-   assert(util_is_power_of_two(block_size));
-
    pool->device = device;
    anv_bo_init(&pool->bo, 0, 0);
-   pool->block_size = block_size;
-   pool->free_list = ANV_FREE_LIST_EMPTY;
-   pool->back_free_list = ANV_FREE_LIST_EMPTY;
 
    // Start with a large (2GB) size, assuming that the kernel won't commit pages
    // until map+fault or commit.
@@ -263,11 +271,14 @@
    pool->back_state.next = 0;
    pool->back_state.end = 0;
 
-   /* Immediately grow the pool so we'll have a backing bo. */
-   pool->state.end = anv_block_pool_grow(pool, &pool->state);
+   result = anv_block_pool_expand_range(pool, 0, initial_size);
+   if (result != VK_SUCCESS)
+      goto fail_mmap_cleanups;
 
    return VK_SUCCESS;
 
+ fail_mmap_cleanups:
+   u_vector_finish(&pool->mmap_cleanups);
  fail_fd:
    return result;
 }
@@ -289,122 +300,22 @@
 
 #define PAGE_SIZE 4096
 
-/** Grows and re-centers the block pool.
- *
- * We grow the block pool in one or both directions in such a way that the
- * following conditions are met:
- *
- *  1) The size of the entire pool is always a power of two.
- *
- *  2) The pool only grows on both ends.  Neither end can get
- *     shortened.
- *
- *  3) At the end of the allocation, we have about twice as much space
- *     allocated for each end as we have used.  This way the pool doesn't
- *     grow too far in one direction or the other.
- *
- *  4) If the _alloc_back() has never been called, then the back portion of
- *     the pool retains a size of zero.  (This makes it easier for users of
- *     the block pool that only want a one-sided pool.)
- *
- *  5) We have enough space allocated for at least one more block in
- *     whichever side `state` points to.
- *
- *  6) The center of the pool is always aligned to both the block_size of
- *     the pool and a 4K CPU page.
- */
-static uint32_t
-anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state)
+static VkResult
+anv_block_pool_expand_range(struct anv_block_pool *pool,
+                            uint32_t center_bo_offset, uint32_t size)
 {
-   size_t size;
    void *map;
    struct anv_mmap_cleanup *cleanup;
 
-   pthread_mutex_lock(&pool->device->mutex);
-
-   assert(state == &pool->state || state == &pool->back_state);
-
-   /* Gather a little usage information on the pool.  Since we may have
-    * threadsd waiting in queue to get some storage while we resize, it's
-    * actually possible that total_used will be larger than old_size.  In
-    * particular, block_pool_alloc() increments state->next prior to
-    * calling block_pool_grow, so this ensures that we get enough space for
-    * which ever side tries to grow the pool.
-    *
-    * We align to a page size because it makes it easier to do our
-    * calculations later in such a way that we state page-aigned.
-    */
-   uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE);
-   uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE);
-   uint32_t total_used = front_used + back_used;
-
-   assert(state == &pool->state || back_used > 0);
-
-   size_t old_size = pool->bo.size;
-
-   if (old_size != 0 &&
-       back_used * 2 <= pool->center_bo_offset &&
-       front_used * 2 <= (old_size - pool->center_bo_offset)) {
-      /* If we're in this case then this isn't the firsta allocation and we
-       * already have enough space on both sides to hold double what we
-       * have allocated.  There's nothing for us to do.
-       */
-      goto done;
-   }
-
-   if (old_size == 0) {
-      /* This is the first allocation */
-      size = MAX2(32 * pool->block_size, PAGE_SIZE);
-   } else {
-      size = old_size * 2;
-   }
-
-   /* We can't have a block pool bigger than 1GB because we use signed
-    * 32-bit offsets in the free list and we don't want overflow.  We
-    * should never need a block pool bigger than 1GB anyway.
-    */
-   assert(size <= (1u << 31));
-
-   /* We compute a new center_bo_offset such that, when we double the size
-    * of the pool, we maintain the ratio of how much is used by each side.
-    * This way things should remain more-or-less balanced.
-    */
-   uint32_t center_bo_offset;
-   if (back_used == 0) {
-      /* If we're in this case then we have never called alloc_back().  In
-       * this case, we want keep the offset at 0 to make things as simple
-       * as possible for users that don't care about back allocations.
-       */
-      center_bo_offset = 0;
-   } else {
-      /* Try to "center" the allocation based on how much is currently in
-       * use on each side of the center line.
-       */
-      center_bo_offset = ((uint64_t)size * back_used) / total_used;
-
-      /* Align down to a multiple of both the block size and page size */
-      uint32_t granularity = MAX2(pool->block_size, PAGE_SIZE);
-      assert(util_is_power_of_two(granularity));
-      center_bo_offset &= ~(granularity - 1);
-
-      assert(center_bo_offset >= back_used);
-
-      /* Make sure we don't shrink the back end of the pool */
-      if (center_bo_offset < pool->back_state.end)
-         center_bo_offset = pool->back_state.end;
-
-      /* Make sure that we don't shrink the front end of the pool */
-      if (size - center_bo_offset < pool->state.end)
-         center_bo_offset = size - pool->state.end;
-   }
-
-   assert(center_bo_offset % pool->block_size == 0);
-   assert(center_bo_offset % PAGE_SIZE == 0);
-
    /* Assert that we only ever grow the pool */
    assert(center_bo_offset >= pool->back_state.end);
    assert(size - center_bo_offset >= pool->state.end);
 
+   /* Assert that we don't go outside the bounds of the memfd */
+   assert(center_bo_offset <= BLOCK_POOL_MEMFD_CENTER);
+   assert(size - center_bo_offset <=
+          BLOCK_POOL_MEMFD_SIZE - BLOCK_POOL_MEMFD_CENTER);
+
    /* Code below expects that map points to the start of the new used portion
     * as defined by the new center_bo_offset.  But we only map once for magma
     * so we use math to adjust map appropriately.
@@ -414,7 +325,8 @@
    } else {
       cleanup = u_vector_add(&pool->mmap_cleanups);
       if (!cleanup)
-         goto fail;
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
       *cleanup = ANV_MMAP_CLEANUP_INIT;
 
       map = anv_gem_mmap(pool->device, pool->bo.gem_handle, 0, BLOCK_POOL_MEMFD_SIZE, 0);
@@ -423,7 +335,7 @@
       cleanup->gem_handle = pool->bo.gem_handle;
 
       if (!map)
-         goto fail;
+         return vk_errorf(VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
 
       /* Pretend we mapped only the used portion */
       map = (uint8_t*)map + BLOCK_POOL_MEMFD_CENTER - center_bo_offset;
@@ -477,48 +389,166 @@
    pool->bo.map = map;
    pool->bo.start_offset = BLOCK_POOL_MEMFD_CENTER - center_bo_offset;
 
+   return VK_SUCCESS;
+}
+
+/** Grows and re-centers the block pool.
+ *
+ * We grow the block pool in one or both directions in such a way that the
+ * following conditions are met:
+ *
+ *  1) The size of the entire pool is always a power of two.
+ *
+ *  2) The pool only ever grows; neither end can get shortened.
+ *
+ *  3) At the end of the allocation, we have about twice as much space
+ *     allocated for each end as we have used.  This way the pool doesn't
+ *     grow too far in one direction or the other.
+ *
+ *  4) If the _alloc_back() has never been called, then the back portion of
+ *     the pool retains a size of zero.  (This makes it easier for users of
+ *     the block pool that only want a one-sided pool.)
+ *
+ *  5) We have enough space allocated for at least one more block in
+ *     whichever side `state` points to.
+ *
+ *  6) The center of the pool is always aligned to a 4K CPU page.
+ *
+ * Returns the new `end` for the side `state` points to, or 0 on failure.
+ */
+static uint32_t
+anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state)
+{
+   VkResult result = VK_SUCCESS;
+
+   pthread_mutex_lock(&pool->device->mutex);
+
+   assert(state == &pool->state || state == &pool->back_state);
+
+   /* Gather a little usage information on the pool.  Since we may have
+    * threads waiting in queue to get some storage while we resize, it's
+    * actually possible that total_used will be larger than old_size.  In
+    * particular, block_pool_alloc() increments state->next prior to
+    * calling block_pool_grow, so this ensures that we get enough space for
+    * which ever side tries to grow the pool.
+    *
+    * We align to a page size because it makes it easier to do our
+    * calculations later in such a way that we stay page-aligned.
+    */
+   uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE);
+   uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE);
+   uint32_t total_used = front_used + back_used;
+
+   assert(state == &pool->state || back_used > 0);
+
+   uint32_t old_size = pool->bo.size;
+
+   /* The block pool is always initialized to a nonzero size and this function
+    * is always called after initialization.
+    */
+   assert(old_size > 0);
+
+   /* The back_used and front_used may actually be smaller than the actual
+    * requirement because they are based on the next pointers which are
+    * updated prior to calling this function.
+    */
+   uint32_t back_required = MAX2(back_used, pool->center_bo_offset);
+   uint32_t front_required = MAX2(front_used, old_size - pool->center_bo_offset);
+
+   if (back_used * 2 <= back_required && front_used * 2 <= front_required) {
+      /* If we're in this case then this isn't the first allocation and we
+       * already have enough space on both sides to hold double what we
+       * have allocated.  There's nothing for us to do.
+       */
+      goto done;
+   }
+
+   uint32_t size = old_size * 2;
+   while (size < back_required + front_required)
+      size *= 2;
+
+   assert(size > pool->bo.size);
+
+   /* We compute a new center_bo_offset such that, when we double the size
+    * of the pool, we maintain the ratio of how much is used by each side.
+    * This way things should remain more-or-less balanced.
+    */
+   uint32_t center_bo_offset;
+   if (back_used == 0) {
+      /* If we're in this case then we have never called alloc_back().  In
+       * this case, we want keep the offset at 0 to make things as simple
+       * as possible for users that don't care about back allocations.
+       */
+      center_bo_offset = 0;
+   } else {
+      /* Try to "center" the allocation based on how much is currently in
+       * use on each side of the center line.
+       */
+      center_bo_offset = ((uint64_t)size * back_used) / total_used;
+
+      /* Align down to a multiple of the page size */
+      center_bo_offset &= ~(PAGE_SIZE - 1);
+
+      assert(center_bo_offset >= back_used);
+
+      /* Make sure we don't shrink the back end of the pool */
+      if (center_bo_offset < pool->back_state.end)
+         center_bo_offset = pool->back_state.end;
+
+      /* Make sure that we don't shrink the front end of the pool */
+      if (size - center_bo_offset < pool->state.end)
+         center_bo_offset = size - pool->state.end;
+   }
+
+   assert(center_bo_offset % PAGE_SIZE == 0);
+
+   result = anv_block_pool_expand_range(pool, center_bo_offset, size);
+
    if (pool->device->instance->physicalDevice.has_exec_async)
       pool->bo.flags |= EXEC_OBJECT_ASYNC;
 
 done:
    pthread_mutex_unlock(&pool->device->mutex);
 
-   /* Return the appropreate new size.  This function never actually
-    * updates state->next.  Instead, we let the caller do that because it
-    * needs to do so in order to maintain its concurrency model.
-    */
-   if (state == &pool->state) {
-      return pool->bo.size - pool->center_bo_offset;
+   if (result == VK_SUCCESS) {
+      /* Return the appropriate new size.  This function never actually
+       * updates state->next.  Instead, we let the caller do that because it
+       * needs to do so in order to maintain its concurrency model.
+       */
+      if (state == &pool->state) {
+         return pool->bo.size - pool->center_bo_offset;
+      } else {
+         assert(pool->center_bo_offset > 0);
+         return pool->center_bo_offset;
+      }
    } else {
-      assert(pool->center_bo_offset > 0);
-      return pool->center_bo_offset;
+      return 0;
    }
-
-fail:
-   pthread_mutex_unlock(&pool->device->mutex);
-
-   return 0;
 }
 
 static uint32_t
 anv_block_pool_alloc_new(struct anv_block_pool *pool,
-                         struct anv_block_state *pool_state)
+                         struct anv_block_state *pool_state,
+                         uint32_t block_size)
 {
    struct anv_block_state state, old, new;
 
    while (1) {
-      state.u64 = __sync_fetch_and_add(&pool_state->u64, pool->block_size);
-      if (state.next < state.end) {
+      state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
+      if (state.next + block_size <= state.end) {
          assert(pool->map);
          return state.next;
-      } else if (state.next == state.end) {
-         /* We allocated the first block outside the pool, we have to grow it.
-          * pool_state->next acts a mutex: threads who try to allocate now will
-          * get block indexes above the current limit and hit futex_wait
-          * below. */
-         new.next = state.next + pool->block_size;
-         new.end = anv_block_pool_grow(pool, pool_state);
-         assert(new.end >= new.next && new.end % pool->block_size == 0);
+      } else if (state.next <= state.end) {
+         /* We allocated the first block outside the pool so we have to grow
+          * the pool.  pool_state->next acts as a mutex: threads who try to
+          * allocate now will get block indexes above the current limit and
+          * hit futex_wait below.
+          */
+         new.next = state.next + block_size;
+         do {
+            new.end = anv_block_pool_grow(pool, pool_state);
+         } while (new.end < new.next);
+
          old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
          if (old.next != state.next)
             futex_wake(&pool_state->end, INT_MAX);
@@ -531,18 +561,10 @@
 }
 
 int32_t
-anv_block_pool_alloc(struct anv_block_pool *pool)
+anv_block_pool_alloc(struct anv_block_pool *pool,
+                     uint32_t block_size)
 {
-   int32_t offset;
-
-   /* Try free list first. */
-   if (anv_free_list_pop(&pool->free_list, &pool->map, &offset)) {
-      assert(offset >= 0);
-      assert(pool->map);
-      return offset;
-   }
-
-   return anv_block_pool_alloc_new(pool, &pool->state);
+   return anv_block_pool_alloc_new(pool, &pool->state, block_size);
 }
 
 /* Allocates a block out of the back of the block pool.
@@ -555,18 +577,11 @@
  * gymnastics with the block pool's BO when doing relocations.
  */
 int32_t
-anv_block_pool_alloc_back(struct anv_block_pool *pool)
+anv_block_pool_alloc_back(struct anv_block_pool *pool,
+                          uint32_t block_size)
 {
-   int32_t offset;
-
-   /* Try free list first. */
-   if (anv_free_list_pop(&pool->back_free_list, &pool->map, &offset)) {
-      assert(offset < 0);
-      assert(pool->map);
-      return offset;
-   }
-
-   offset = anv_block_pool_alloc_new(pool, &pool->back_state);
+   int32_t offset = anv_block_pool_alloc_new(pool, &pool->back_state,
+                                             block_size);
 
    /* The offset we get out of anv_block_pool_alloc_new() is actually the
     * number of bytes downwards from the middle to the end of the block.
@@ -574,56 +589,63 @@
     * start of the block.
     */
    assert(offset >= 0);
-   return -(offset + pool->block_size);
+   return -(offset + block_size);
+}
+
+VkResult
+anv_state_pool_init(struct anv_state_pool *pool,
+                    struct anv_device *device,
+                    uint32_t block_size)
+{
+   VkResult result = anv_block_pool_init(&pool->block_pool, device,
+                                         block_size * 16);
+   if (result != VK_SUCCESS)
+      return result;
+
+   assert(util_is_power_of_two(block_size));
+   pool->block_size = block_size;
+   pool->back_alloc_free_list = ANV_FREE_LIST_EMPTY;
+   for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
+      pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
+      pool->buckets[i].block.next = 0;
+      pool->buckets[i].block.end = 0;
+   }
+   VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
+
+   return VK_SUCCESS;
 }
 
 void
-anv_block_pool_free(struct anv_block_pool *pool, int32_t offset)
+anv_state_pool_finish(struct anv_state_pool *pool)
 {
-   if (offset < 0) {
-      anv_free_list_push(&pool->back_free_list, pool->map, offset);
-   } else {
-      anv_free_list_push(&pool->free_list, pool->map, offset);
-   }
-}
-
-static void
-anv_fixed_size_state_pool_init(struct anv_fixed_size_state_pool *pool,
-                               size_t state_size)
-{
-   /* At least a cache line and must divide the block size. */
-   assert(state_size >= 64 && util_is_power_of_two(state_size));
-
-   pool->state_size = state_size;
-   pool->free_list = ANV_FREE_LIST_EMPTY;
-   pool->block.next = 0;
-   pool->block.end = 0;
+   VG(VALGRIND_DESTROY_MEMPOOL(pool));
+   anv_block_pool_finish(&pool->block_pool);
 }
 
 static uint32_t
-anv_fixed_size_state_pool_alloc(struct anv_fixed_size_state_pool *pool,
-                                struct anv_block_pool *block_pool)
+anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
+                                    struct anv_block_pool *block_pool,
+                                    uint32_t state_size,
+                                    uint32_t block_size)
 {
-   int32_t offset;
    struct anv_block_state block, old, new;
+   uint32_t offset;
 
-   /* Try free list first. */
-   if (anv_free_list_pop(&pool->free_list, &block_pool->map, &offset)) {
-      assert(offset >= 0);
-      return offset;
-   }
+   /* If our state is large, we don't need any sub-allocation from a block.
+    * Instead, we just grab whole (potentially large) blocks.
+    */
+   if (state_size >= block_size)
+      return anv_block_pool_alloc(block_pool, state_size);
 
-   /* If free list was empty (or somebody raced us and took the items) we
-    * allocate a new item from the end of the block */
  restart:
-   block.u64 = __sync_fetch_and_add(&pool->block.u64, pool->state_size);
+   block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);
 
    if (block.next < block.end) {
       return block.next;
    } else if (block.next == block.end) {
-      offset = anv_block_pool_alloc(block_pool);
-      new.next = offset + pool->state_size;
-      new.end = offset + block_pool->block_size;
+      offset = anv_block_pool_alloc(block_pool, block_size);
+      new.next = offset + state_size;
+      new.end = offset + block_size;
       old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
       if (old.next != block.next)
          futex_wake(&pool->block.end, INT_MAX);
@@ -634,72 +656,178 @@
    }
 }
 
-static void
-anv_fixed_size_state_pool_free(struct anv_fixed_size_state_pool *pool,
-                               struct anv_block_pool *block_pool,
-                               uint32_t offset)
+static uint32_t
+anv_state_pool_get_bucket(uint32_t size)
 {
-   anv_free_list_push(&pool->free_list, block_pool->map, offset);
-}
-
-void
-anv_state_pool_init(struct anv_state_pool *pool,
-                    struct anv_block_pool *block_pool)
-{
-   pool->block_pool = block_pool;
-   for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
-      size_t size = 1 << (ANV_MIN_STATE_SIZE_LOG2 + i);
-      anv_fixed_size_state_pool_init(&pool->buckets[i], size);
-   }
-   VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
-}
-
-void
-anv_state_pool_finish(struct anv_state_pool *pool)
-{
-   VG(VALGRIND_DESTROY_MEMPOOL(pool));
-}
-
-struct anv_state
-anv_state_pool_alloc(struct anv_state_pool *pool, size_t size, size_t align)
-{
-   unsigned size_log2 = ilog2_round_up(size < align ? align : size);
+   unsigned size_log2 = ilog2_round_up(size);
    assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
    if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
       size_log2 = ANV_MIN_STATE_SIZE_LOG2;
-   unsigned bucket = size_log2 - ANV_MIN_STATE_SIZE_LOG2;
+   return size_log2 - ANV_MIN_STATE_SIZE_LOG2;
+}
+
+static uint32_t
+anv_state_pool_get_bucket_size(uint32_t bucket)
+{
+   /* Bucket sizes are powers of two, starting at 2^ANV_MIN_STATE_SIZE_LOG2. */
+   return 1 << (bucket + ANV_MIN_STATE_SIZE_LOG2);
+}
+
+static struct anv_state
+anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
+                           uint32_t size, uint32_t align)
+{
+   uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align));
 
    struct anv_state state;
-   state.alloc_size = 1 << size_log2;
-   state.offset = anv_fixed_size_state_pool_alloc(&pool->buckets[bucket],
-                                                  pool->block_pool);
-   state.map = pool->block_pool->map + state.offset;
+   state.alloc_size = anv_state_pool_get_bucket_size(bucket);
+
+   /* Try free list first. */
+   if (anv_free_list_pop(&pool->buckets[bucket].free_list,
+                         &pool->block_pool.map, &state.offset)) {
+      assert(state.offset >= 0);
+      goto done;
+   }
+
+   /* Try to grab a chunk from some larger bucket and split it up */
+   for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) {
+      int32_t chunk_offset;
+      if (anv_free_list_pop(&pool->buckets[b].free_list,
+                            &pool->block_pool.map, &chunk_offset)) {
+         unsigned chunk_size = anv_state_pool_get_bucket_size(b);
+
+         /* We've found a chunk that's larger than the requested state size.
+          * There are a couple of options as to what we do with it:
+          *
+          *    1) We could fully split the chunk into state.alloc_size sized
+          *       pieces.  However, this would mean that allocating a 16B
+          *       state could potentially split a 2MB chunk into 128K smaller
+          *       chunks.  This would lead to unnecessary fragmentation.
+          *
+          *    2) The classic "buddy allocator" method would have us split the
+          *       chunk in half and return one half.  Then we would split the
+          *       remaining half in half and return one half, and repeat as
+          *       needed until we get down to the size we want.  However, if
+          *       you are allocating a bunch of the same size state (which is
+          *       the common case), this means that every other allocation has
+          *       to go up a level and every fourth goes up two levels, etc.
+          *       This is not nearly as efficient as it could be if we did a
+          *       little more work up-front.
+          *
+          *    3) Split the difference between (1) and (2) by doing a
+          *       two-level split.  If it's bigger than some fixed block_size,
+          *       we split it into block_size sized chunks and return all but
+          *       one of them.  Then we split what remains into
+          *       state.alloc_size sized chunks and return all but one.
+          *
+          * We choose option (3).
+          */
+         if (chunk_size > pool->block_size &&
+             state.alloc_size < pool->block_size) {
+            assert(chunk_size % pool->block_size == 0);
+            /* We don't want to split giant chunks into tiny chunks.  Instead,
+             * break anything bigger than a block into block-sized chunks and
+             * then break it down into bucket-sized chunks from there.  Return
+             * all but the first block of the chunk to the block bucket.
+             */
+            const uint32_t block_bucket =
+               anv_state_pool_get_bucket(pool->block_size);
+            anv_free_list_push(&pool->buckets[block_bucket].free_list,
+                               pool->block_pool.map,
+                               chunk_offset + pool->block_size,
+                               pool->block_size,
+                               (chunk_size / pool->block_size) - 1);
+            chunk_size = pool->block_size;
+         }
+
+         assert(chunk_size % state.alloc_size == 0);
+         anv_free_list_push(&pool->buckets[bucket].free_list,
+                            pool->block_pool.map,
+                            chunk_offset + state.alloc_size,
+                            state.alloc_size,
+                            (chunk_size / state.alloc_size) - 1);
+
+         state.offset = chunk_offset;
+         goto done;
+      }
+   }
+
+   state.offset = anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
+                                                      &pool->block_pool,
+                                                      state.alloc_size,
+                                                      pool->block_size);
+
+done:
+   state.map = pool->block_pool.map + state.offset;
+   return state;
+}
+
+struct anv_state
+anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
+{
+   if (size == 0)
+      return ANV_STATE_NULL;
+
+   struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align);
    VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
    return state;
 }
 
+struct anv_state
+anv_state_pool_alloc_back(struct anv_state_pool *pool)
+{
+   struct anv_state state;
+   state.alloc_size = pool->block_size;
+
+   /* Prefer a recycled back block (those have negative offsets); otherwise
+    * request a new block_size block via anv_block_pool_alloc_back(). */
+   if (anv_free_list_pop(&pool->back_alloc_free_list,
+                         &pool->block_pool.map, &state.offset)) {
+      assert(state.offset < 0);
+   } else {
+      state.offset = anv_block_pool_alloc_back(&pool->block_pool,
+                                               pool->block_size);
+   }
+
+   state.map = pool->block_pool.map + state.offset;
+   VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, state.alloc_size));
+   return state;
+}
+
+static void
+anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
+{
+   assert(util_is_power_of_two(state.alloc_size));
+   unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);
+
+   if (state.offset < 0) {
+      assert(state.alloc_size == pool->block_size);
+      anv_free_list_push(&pool->back_alloc_free_list,
+                         pool->block_pool.map, state.offset,
+                         state.alloc_size, 1);
+   } else {
+      anv_free_list_push(&pool->buckets[bucket].free_list,
+                         pool->block_pool.map, state.offset,
+                         state.alloc_size, 1);
+   }
+}
+
 void
 anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
 {
-   assert(util_is_power_of_two(state.alloc_size));
-   unsigned size_log2 = ilog2_round_up(state.alloc_size);
-   assert(size_log2 >= ANV_MIN_STATE_SIZE_LOG2 &&
-          size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
-   unsigned bucket = size_log2 - ANV_MIN_STATE_SIZE_LOG2;
+   if (state.alloc_size == 0)
+      return;
 
    VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
-   anv_fixed_size_state_pool_free(&pool->buckets[bucket],
-                                  pool->block_pool, state.offset);
+   anv_state_pool_free_no_vg(pool, state);
 }
 
-#define NULL_BLOCK 1
 struct anv_state_stream_block {
+   struct anv_state block;
+
    /* The next block */
    struct anv_state_stream_block *next;
 
-   /* The offset into the block pool at which this block starts */
-   uint32_t offset;
-
 #ifdef HAVE_VALGRIND
    /* A pointer to the first user-allocated thing in this block.  This is
     * what valgrind sees as the start of the block.
@@ -713,16 +841,20 @@
  */
 void
 anv_state_stream_init(struct anv_state_stream *stream,
-                      struct anv_block_pool *block_pool)
+                      struct anv_state_pool *state_pool,
+                      uint32_t block_size)
 {
-   stream->block_pool = block_pool;
-   stream->block = NULL;
+   stream->state_pool = state_pool;
+   stream->block_size = block_size;
 
-   /* Ensure that next + whatever > end.  This way the first call to
+   stream->block = ANV_STATE_NULL;
+
+   stream->block_list = NULL;
+
+   /* Ensure that next + whatever > block_size.  This way the first call to
     * state_stream_alloc fetches a new block.
     */
-   stream->next = 1;
-   stream->end = 0;
+   stream->next = block_size;
 
    VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
 }
@@ -730,14 +862,12 @@
 void
 anv_state_stream_finish(struct anv_state_stream *stream)
 {
-   VG(const uint32_t block_size = stream->block_pool->block_size);
-
-   struct anv_state_stream_block *next = stream->block;
+   struct anv_state_stream_block *next = stream->block_list;
    while (next != NULL) {
       struct anv_state_stream_block sb = VG_NOACCESS_READ(next);
       VG(VALGRIND_MEMPOOL_FREE(stream, sb._vg_ptr));
-      VG(VALGRIND_MAKE_MEM_UNDEFINED(next, block_size));
-      anv_block_pool_free(stream->block_pool, sb.offset);
+      VG(VALGRIND_MAKE_MEM_UNDEFINED(next, stream->block_size));
+      anv_state_pool_free_no_vg(stream->state_pool, sb.block);
       next = sb.next;
    }
 
@@ -748,35 +878,44 @@
 anv_state_stream_alloc(struct anv_state_stream *stream,
                        uint32_t size, uint32_t alignment)
 {
-   struct anv_state_stream_block *sb = stream->block;
+   if (size == 0)
+      return ANV_STATE_NULL;
 
-   struct anv_state state;
+   assert(alignment <= PAGE_SIZE);
 
-   state.offset = align_u32(stream->next, alignment);
-   if (state.offset + size > stream->end) {
-      uint32_t block = anv_block_pool_alloc(stream->block_pool);
-      sb = stream->block_pool->map + block;
+   uint32_t offset = align_u32(stream->next, alignment);
+   if (offset + size > stream->block.alloc_size) {
+      uint32_t block_size = stream->block_size;
+      if (block_size < size)
+         block_size = round_to_power_of_two(size);
 
-      VG(VALGRIND_MAKE_MEM_UNDEFINED(sb, sizeof(*sb)));
-      sb->next = stream->block;
-      sb->offset = block;
-      VG(sb->_vg_ptr = NULL);
-      VG(VALGRIND_MAKE_MEM_NOACCESS(sb, stream->block_pool->block_size));
+      stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
+                                                 block_size, PAGE_SIZE);
 
-      stream->block = sb;
-      stream->start = block;
-      stream->next = block + sizeof(*sb);
-      stream->end = block + stream->block_pool->block_size;
+      struct anv_state_stream_block *sb = stream->block.map;
+      VG_NOACCESS_WRITE(&sb->block, stream->block);
+      VG_NOACCESS_WRITE(&sb->next, stream->block_list);
+      stream->block_list = sb;
+      VG(VG_NOACCESS_WRITE(&sb->_vg_ptr, NULL));
 
-      state.offset = align_u32(stream->next, alignment);
-      assert(state.offset + size <= stream->end);
+      VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, stream->block_size));
+
+      /* Reset back to the start plus space for the header */
+      stream->next = sizeof(*sb);
+
+      offset = align_u32(stream->next, alignment);
+      assert(offset + size <= stream->block.alloc_size);
    }
 
-   assert(state.offset > stream->start);
-   state.map = (void *)sb + (state.offset - stream->start);
+   struct anv_state state = stream->block;
+   state.offset += offset;
    state.alloc_size = size;
+   state.map += offset;
+
+   stream->next = offset + size;
 
 #ifdef HAVE_VALGRIND
+   struct anv_state_stream_block *sb = stream->block_list;
    void *vg_ptr = VG_NOACCESS_READ(&sb->_vg_ptr);
    if (vg_ptr == NULL) {
       vg_ptr = state.map;
@@ -792,8 +931,6 @@
    }
 #endif
 
-   stream->next = state.offset + size;
-
    return state;
 }
 
@@ -1122,7 +1259,7 @@
        */
       //NOTE: got import_size from anv_gem_fd_to_handle, above
       //off_t import_size = lseek(fd, 0, SEEK_END);
-      if (import_size == (off_t)-1 || import_size != size) {
+      if (import_size == (off_t)-1 || import_size < size) {
          anv_gem_close(device, gem_handle);
          pthread_mutex_unlock(&cache->mutex);
          return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index bedd337..eb4b9e7 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -148,9 +148,6 @@
    struct drm_i915_gem_relocation_entry *entry;
    int index;
 
-   const uint32_t domain =
-      (target_bo->flags & EXEC_OBJECT_WRITE) ? I915_GEM_DOMAIN_RENDER : 0;
-
    VkResult result = anv_reloc_list_grow(list, alloc, 1);
    if (result != VK_SUCCESS)
       return result;
@@ -163,8 +160,8 @@
    entry->delta = delta;
    entry->offset = offset;
    entry->presumed_offset = target_bo->offset;
-   entry->read_domains = domain;
-   entry->write_domain = domain;
+   entry->read_domains = 0;
+   entry->write_domain = 0;
    VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry)));
 
    return VK_SUCCESS;
@@ -453,9 +450,10 @@
 struct anv_address
 anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
 {
+   struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
    return (struct anv_address) {
-      .bo = &cmd_buffer->device->surface_state_block_pool.bo,
-      .offset = *(int32_t *)u_vector_head(&cmd_buffer->bt_blocks),
+      .bo = &cmd_buffer->device->surface_state_pool.block_pool.bo,
+      .offset = bt_block->offset,
    };
 }
 
@@ -621,23 +619,22 @@
 anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t entries, uint32_t *state_offset)
 {
-   struct anv_block_pool *block_pool =
-       &cmd_buffer->device->surface_state_block_pool;
-   int32_t *bt_block = u_vector_head(&cmd_buffer->bt_blocks);
+   struct anv_state_pool *state_pool = &cmd_buffer->device->surface_state_pool;
+   struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
    struct anv_state state;
 
    state.alloc_size = align_u32(entries * 4, 32);
 
-   if (cmd_buffer->bt_next + state.alloc_size > block_pool->block_size)
+   if (cmd_buffer->bt_next + state.alloc_size > state_pool->block_size)
       return (struct anv_state) { 0 };
 
    state.offset = cmd_buffer->bt_next;
-   state.map = block_pool->map + *bt_block + state.offset;
+   state.map = state_pool->block_pool.map + bt_block->offset + state.offset;
 
    cmd_buffer->bt_next += state.alloc_size;
 
-   assert(*bt_block < 0);
-   *state_offset = -(*bt_block);
+   assert(bt_block->offset < 0);
+   *state_offset = -bt_block->offset;
 
    return state;
 }
@@ -661,16 +658,15 @@
 VkResult
 anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
 {
-   struct anv_block_pool *block_pool =
-       &cmd_buffer->device->surface_state_block_pool;
+   struct anv_state_pool *state_pool = &cmd_buffer->device->surface_state_pool;
 
-   int32_t *offset = u_vector_add(&cmd_buffer->bt_blocks);
-   if (offset == NULL) {
+   struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
+   if (bt_block == NULL) {
       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
    }
 
-   *offset = anv_block_pool_alloc_back(block_pool);
+   *bt_block = anv_state_pool_alloc_back(state_pool);
    cmd_buffer->bt_next = 0;
 
    return VK_SUCCESS;
@@ -710,8 +706,10 @@
 
    *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
 
-   success = u_vector_init(&cmd_buffer->bt_blocks, sizeof(int32_t),
-                             8 * sizeof(int32_t));
+   /* u_vector requires power-of-two size elements */
+   unsigned pow2_state_size = util_next_power_of_two(sizeof(struct anv_state));
+   success = u_vector_init(&cmd_buffer->bt_block_states,
+                           pow2_state_size, 8 * pow2_state_size);
    if (!success)
       goto fail_seen_bbos;
 
@@ -728,7 +726,7 @@
    return VK_SUCCESS;
 
  fail_bt_blocks:
-   u_vector_finish(&cmd_buffer->bt_blocks);
+   u_vector_finish(&cmd_buffer->bt_block_states);
  fail_seen_bbos:
    u_vector_finish(&cmd_buffer->seen_bbos);
  fail_batch_bo:
@@ -740,12 +738,10 @@
 void
 anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
 {
-   int32_t *bt_block;
-   u_vector_foreach(bt_block, &cmd_buffer->bt_blocks) {
-      anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
-                          *bt_block);
-   }
-   u_vector_finish(&cmd_buffer->bt_blocks);
+   struct anv_state *bt_block;
+   u_vector_foreach(bt_block, &cmd_buffer->bt_block_states)
+      anv_state_pool_free(&cmd_buffer->device->surface_state_pool, *bt_block);
+   u_vector_finish(&cmd_buffer->bt_block_states);
 
    anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc);
 
@@ -774,12 +770,11 @@
                       &cmd_buffer->batch,
                       GEN8_MI_BATCH_BUFFER_START_length * 4);
 
-   while (u_vector_length(&cmd_buffer->bt_blocks) > 1) {
-      int32_t *bt_block = u_vector_remove(&cmd_buffer->bt_blocks);
-      anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
-                          *bt_block);
+   while (u_vector_length(&cmd_buffer->bt_block_states) > 1) {
+      struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
+      anv_state_pool_free(&cmd_buffer->device->surface_state_pool, *bt_block);
    }
-   assert(u_vector_length(&cmd_buffer->bt_blocks) == 1);
+   assert(u_vector_length(&cmd_buffer->bt_block_states) == 1);
    cmd_buffer->bt_next = 0;
 
    cmd_buffer->surface_relocs.num_relocs = 0;
@@ -982,6 +977,7 @@
 anv_execbuf_add_bo(struct anv_execbuf *exec,
                    struct anv_bo *bo,
                    struct anv_reloc_list *relocs,
+                   uint32_t extra_flags,
                    const VkAllocationCallbacks *alloc)
 {
    struct drm_i915_gem_exec_object2 *obj = NULL;
@@ -1036,7 +1032,7 @@
       obj->relocs_ptr = 0;
       obj->alignment = 0;
       obj->offset = bo->offset;
-      obj->flags = bo->flags;
+      obj->flags = bo->flags | extra_flags;
       obj->rsvd1 = bo->start_offset;
       obj->rsvd2 = bo->size;
    }
@@ -1050,9 +1046,15 @@
       obj->relocs_ptr = (uintptr_t) relocs->relocs;
 
       for (size_t i = 0; i < relocs->num_relocs; i++) {
+         VkResult result;
+
          /* A quick sanity check on relocations */
          assert(relocs->relocs[i].offset < bo->size);
-         anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL, alloc);
+         result = anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL,
+                                     extra_flags, alloc);
+
+         if (result != VK_SUCCESS)
+            return result;
       }
    }
 
@@ -1090,16 +1092,16 @@
    }
 
    if (flush && !device->info.has_llc)
-      anv_flush_range(p, reloc_size);
+      gen_flush_range(p, reloc_size);
 }
 
 static void
-adjust_relocations_from_state_pool(struct anv_block_pool *pool,
+adjust_relocations_from_state_pool(struct anv_state_pool *pool,
                                    struct anv_reloc_list *relocs,
                                    uint32_t last_pool_center_bo_offset)
 {
-   assert(last_pool_center_bo_offset <= pool->center_bo_offset);
-   uint32_t delta = pool->center_bo_offset - last_pool_center_bo_offset;
+   assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
+   uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;
 
    for (size_t i = 0; i < relocs->num_relocs; i++) {
       /* All of the relocations from this block pool to other BO's should
@@ -1112,13 +1114,13 @@
 }
 
 static void
-adjust_relocations_to_state_pool(struct anv_block_pool *pool,
+adjust_relocations_to_state_pool(struct anv_state_pool *pool,
                                  struct anv_bo *from_bo,
                                  struct anv_reloc_list *relocs,
                                  uint32_t last_pool_center_bo_offset)
 {
-   assert(last_pool_center_bo_offset <= pool->center_bo_offset);
-   uint32_t delta = pool->center_bo_offset - last_pool_center_bo_offset;
+   assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
+   uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;
 
    /* When we initially emit relocations into a block pool, we don't
     * actually know what the final center_bo_offset will be so we just emit
@@ -1127,7 +1129,7 @@
     * relocations that point to the pool bo with the correct offset.
     */
    for (size_t i = 0; i < relocs->num_relocs; i++) {
-      if (relocs->reloc_bos[i] == &pool->bo) {
+      if (relocs->reloc_bos[i] == &pool->block_pool.bo) {
          /* Adjust the delta value in the relocation to correctly
           * correspond to the new delta.  Initially, this value may have
           * been negative (if treated as unsigned), but we trust in
@@ -1141,7 +1143,8 @@
           * use by the GPU at the moment.
           */
          assert(relocs->relocs[i].offset < from_bo->size);
-         write_reloc(pool->device, from_bo->map + relocs->relocs[i].offset,
+         write_reloc(pool->block_pool.device,
+                     from_bo->map + relocs->relocs[i].offset,
                      relocs->relocs[i].presumed_offset +
                      relocs->relocs[i].delta, false);
       }
@@ -1231,7 +1234,7 @@
     * given time.  The only option is to always relocate them.
     */
    anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs,
-                        &cmd_buffer->device->surface_state_block_pool.bo,
+                        &cmd_buffer->device->surface_state_pool.block_pool.bo,
                         true /* always relocate surface states */);
 
    /* Since we own all of the batch buffers, we know what values are stored
@@ -1250,23 +1253,19 @@
    return true;
 }
 
-VkResult anv_cmd_buffer_execbuf(struct anv_device* device, struct anv_cmd_buffer* cmd_buffer,
-                                uint32_t wait_semaphore_count, anv_semaphore_t* wait_semaphores,
-                                uint32_t signal_semaphore_count,
-                                anv_semaphore_t* signal_semaphores)
+static VkResult
+setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
+                             struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_batch *batch = &cmd_buffer->batch;
-   struct anv_block_pool *ss_pool =
-      &cmd_buffer->device->surface_state_block_pool;
-
-   struct anv_execbuf execbuf;
-   anv_execbuf_init(&execbuf);
+   struct anv_state_pool *ss_pool =
+      &cmd_buffer->device->surface_state_pool;
 
    adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs,
                                       cmd_buffer->last_ss_pool_center);
-   VkResult result =
-      anv_execbuf_add_bo(&execbuf, &ss_pool->bo, &cmd_buffer->surface_relocs,
-                         &device->alloc);
+   VkResult result = anv_execbuf_add_bo(execbuf, &ss_pool->block_pool.bo,
+                                        &cmd_buffer->surface_relocs, 0,
+                                        &cmd_buffer->device->alloc);
    if (result != VK_SUCCESS)
       return result;
 
@@ -1278,8 +1277,8 @@
       adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
                                        cmd_buffer->last_ss_pool_center);
 
-      result = anv_execbuf_add_bo(&execbuf, &(*bbo)->bo, &(*bbo)->relocs,
-                                  &device->alloc);
+      result = anv_execbuf_add_bo(execbuf, &(*bbo)->bo, &(*bbo)->relocs, 0,
+                                  &cmd_buffer->device->alloc);
       if (result != VK_SUCCESS)
          return result;
    }
@@ -1288,7 +1287,7 @@
     * record the surface state pool center so future executions of the command
     * buffer can adjust correctly.
     */
-   cmd_buffer->last_ss_pool_center = ss_pool->center_bo_offset;
+   cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset;
 
    struct anv_batch_bo *first_batch_bo =
       list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
@@ -1298,19 +1297,19 @@
     * corresponding to the first batch_bo in the chain with the last
     * element in the list.
     */
-   if (first_batch_bo->bo.index != execbuf.bo_count - 1) {
+   if (first_batch_bo->bo.index != execbuf->bo_count - 1) {
       uint32_t idx = first_batch_bo->bo.index;
-      uint32_t last_idx = execbuf.bo_count - 1;
+      uint32_t last_idx = execbuf->bo_count - 1;
 
-      struct drm_i915_gem_exec_object2 tmp_obj = execbuf.objects[idx];
-      assert(execbuf.bos[idx] == &first_batch_bo->bo);
+      struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
+      assert(execbuf->bos[idx] == &first_batch_bo->bo);
 
-      execbuf.objects[idx] = execbuf.objects[last_idx];
-      execbuf.bos[idx] = execbuf.bos[last_idx];
-      execbuf.bos[idx]->index = idx;
+      execbuf->objects[idx] = execbuf->objects[last_idx];
+      execbuf->bos[idx] = execbuf->bos[last_idx];
+      execbuf->bos[idx]->index = idx;
 
-      execbuf.objects[last_idx] = tmp_obj;
-      execbuf.bos[last_idx] = &first_batch_bo->bo;
+      execbuf->objects[last_idx] = tmp_obj;
+      execbuf->bos[last_idx] = &first_batch_bo->bo;
       first_batch_bo->bo.index = last_idx;
    }
 
@@ -1331,9 +1330,9 @@
       }
    }
 
-   execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
-      .buffers_ptr = (uintptr_t) execbuf.objects,
-      .buffer_count = execbuf.bo_count,
+   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
+      .buffers_ptr = (uintptr_t) execbuf->objects,
+      .buffer_count = execbuf->bo_count,
       .batch_start_offset = 0,
       .batch_len = batch->next - batch->start,
       .cliprects_ptr = 0,
@@ -1346,7 +1345,7 @@
       .rsvd2 = 0,
    };
 
-   if (relocate_cmd_buffer(cmd_buffer, &execbuf)) {
+   if (relocate_cmd_buffer(cmd_buffer, execbuf)) {
       /* If we were able to successfully relocate everything, tell the kernel
        * that it can skip doing relocations. The requirement for using
        * NO_RELOC is:
@@ -1371,7 +1370,7 @@
        * the RENDER_SURFACE_STATE matches presumed_offset, so it should be
        * safe for the kernel to relocate them as needed.
        */
-      execbuf.execbuf.flags |= I915_EXEC_NO_RELOC;
+      execbuf->execbuf.flags |= I915_EXEC_NO_RELOC;
    } else {
       /* In the case where we fall back to doing kernel relocations, we need
        * to ensure that the relocation list is valid.  All relocations on the
@@ -1386,8 +1385,77 @@
          cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
    }
 
-   result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos, wait_semaphore_count, wait_semaphores,
-                               signal_semaphore_count, signal_semaphores);
+   return VK_SUCCESS;
+}
+
+/* Build an execbuf for one command buffer and hand it to the device.
+ *
+ * BO-backed wait semaphores are added to the execbuf with no extra flags and
+ * BO-backed signal semaphores with EXEC_OBJECT_WRITE; the semaphore arrays
+ * are also passed through to anv_device_execbuf().  Every early error return
+ * calls anv_execbuf_finish() first so that whatever anv_execbuf_add_bo()
+ * allocated so far is not leaked.
+ */
+VkResult
+anv_cmd_buffer_execbuf(struct anv_device *device,
+                       struct anv_cmd_buffer *cmd_buffer,
+                       const VkSemaphore *in_semaphores,
+                       uint32_t num_in_semaphores,
+                       const VkSemaphore *out_semaphores,
+                       uint32_t num_out_semaphores)
+{
+   struct anv_execbuf execbuf;
+   anv_execbuf_init(&execbuf);
+
+   VkResult result = VK_SUCCESS;
+   for (uint32_t i = 0; i < num_in_semaphores; i++) {
+      ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
+      //assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE);
+      struct anv_semaphore_impl *impl = semaphore->current;
+
+      switch (impl->type) {
+      case ANV_SEMAPHORE_TYPE_BO:
+         result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL,
+                                     0, &device->alloc);
+         if (result != VK_SUCCESS) {
+            anv_execbuf_finish(&execbuf, &device->alloc);
+            return result;
+         }
+         break;
+      default:
+         break;
+      }
+   }
+
+   for (uint32_t i = 0; i < num_out_semaphores; i++) {
+      ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);
+      //assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE);
+      struct anv_semaphore_impl *impl = semaphore->current;
+
+      switch (impl->type) {
+      case ANV_SEMAPHORE_TYPE_BO:
+         result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL,
+                                     EXEC_OBJECT_WRITE, &device->alloc);
+         if (result != VK_SUCCESS) {
+            anv_execbuf_finish(&execbuf, &device->alloc);
+            return result;
+         }
+         break;
+      default:
+         break;
+      }
+   }
+
+   result = setup_execbuf_for_cmd_buffer(&execbuf, cmd_buffer);
+   if (result != VK_SUCCESS) {
+      anv_execbuf_finish(&execbuf, &device->alloc);
+      return result;
+   }
+
+   // Fuchsia: pass semaphores down
+   result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos,
+                               num_in_semaphores, (struct anv_semaphore**)in_semaphores,
+                               num_out_semaphores, (struct anv_semaphore**)out_semaphores);
 
    anv_execbuf_finish(&execbuf, &device->alloc);
 
diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c
index 66505b3..79f5234 100644
--- a/src/intel/vulkan/anv_blorp.c
+++ b/src/intel/vulkan/anv_blorp.c
@@ -111,6 +111,9 @@
    case 9:
       device->blorp.exec = gen9_blorp_exec;
       break;
+   case 10:
+      device->blorp.exec = gen10_blorp_exec;
+      break;
    default:
       unreachable("Unknown hardware generation");
    }
@@ -530,8 +533,8 @@
    switch (size_B) {
    case 1:  return ISL_FORMAT_R8_UINT;
    case 2:  return ISL_FORMAT_R8G8_UINT;
-   case 4:  return ISL_FORMAT_R8G8B8A8_UINT;
-   case 8:  return ISL_FORMAT_R16G16B16A16_UINT;
+   case 4:  return ISL_FORMAT_R32_UINT;
+   case 8:  return ISL_FORMAT_R32G32_UINT;
    case 16: return ISL_FORMAT_R32G32B32A32_UINT;
    default:
       unreachable("Not a power-of-two format size");
@@ -551,20 +554,22 @@
     */
    enum isl_format format = isl_format_for_size(block_size);
 
+   UNUSED bool ok;
    struct isl_surf surf;
-   isl_surf_init(&device->isl_dev, &surf,
-                 .dim = ISL_SURF_DIM_2D,
-                 .format = format,
-                 .width = width,
-                 .height = height,
-                 .depth = 1,
-                 .levels = 1,
-                 .array_len = 1,
-                 .samples = 1,
-                 .usage = ISL_SURF_USAGE_TEXTURE_BIT |
-                          ISL_SURF_USAGE_RENDER_TARGET_BIT,
-                 .tiling_flags = ISL_TILING_LINEAR_BIT);
-   assert(surf.row_pitch == width * block_size);
+   ok = isl_surf_init(&device->isl_dev, &surf,
+                      .dim = ISL_SURF_DIM_2D,
+                      .format = format,
+                      .width = width,
+                      .height = height,
+                      .depth = 1,
+                      .levels = 1,
+                      .array_len = 1,
+                      .samples = 1,
+                      .row_pitch = width * block_size,
+                      .usage = ISL_SURF_USAGE_TEXTURE_BIT |
+                               ISL_SURF_USAGE_RENDER_TARGET_BIT,
+                      .tiling_flags = ISL_TILING_LINEAR_BIT);
+   assert(ok);
 
    struct blorp_surf src_blorp_surf = {
       .surf = &surf,
@@ -686,7 +691,7 @@
     * little data at the top to build its linked list.
     */
    const uint32_t max_update_size =
-      cmd_buffer->device->dynamic_state_block_pool.block_size - 64;
+      cmd_buffer->device->dynamic_state_pool.block_size - 64;
 
    assert(max_update_size < MAX_SURFACE_DIM * 4);
 
@@ -710,7 +715,7 @@
       bs = gcd_pow2_u64(bs, copy_size);
 
       do_buffer_copy(&batch,
-                     &cmd_buffer->device->dynamic_state_block_pool.bo,
+                     &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
                      tmp_data.offset,
                      dst_buffer->bo, dst_buffer->offset + dstOffset,
                      copy_size / bs, 1, bs);
@@ -999,6 +1004,25 @@
    union isl_color_value clear_color =
       vk_to_isl_color(attachment->clearValue.color);
 
+   /* If multiview is enabled we ignore baseArrayLayer and layerCount */
+   if (subpass->view_mask) {
+      uint32_t view_idx;
+      for_each_bit(view_idx, subpass->view_mask) {
+         for (uint32_t r = 0; r < rectCount; ++r) {
+            const VkOffset2D offset = pRects[r].rect.offset;
+            const VkExtent2D extent = pRects[r].rect.extent;
+            blorp_clear_attachments(batch, binding_table,
+                                    ISL_FORMAT_UNSUPPORTED, pass_att->samples,
+                                    view_idx, 1,
+                                    offset.x, offset.y,
+                                    offset.x + extent.width,
+                                    offset.y + extent.height,
+                                    true, clear_color, false, 0.0f, 0, 0);
+         }
+      }
+      return;
+   }
+
    for (uint32_t r = 0; r < rectCount; ++r) {
       const VkOffset2D offset = pRects[r].rect.offset;
       const VkExtent2D extent = pRects[r].rect.extent;
@@ -1047,6 +1071,28 @@
    if (result != VK_SUCCESS)
       return;
 
+   /* If multiview is enabled we ignore baseArrayLayer and layerCount */
+   if (subpass->view_mask) {
+      uint32_t view_idx;
+      for_each_bit(view_idx, subpass->view_mask) {
+         for (uint32_t r = 0; r < rectCount; ++r) {
+            const VkOffset2D offset = pRects[r].rect.offset;
+            const VkExtent2D extent = pRects[r].rect.extent;
+            VkClearDepthStencilValue value = attachment->clearValue.depthStencil;
+            blorp_clear_attachments(batch, binding_table,
+                                    depth_format, pass_att->samples,
+                                    view_idx, 1,
+                                    offset.x, offset.y,
+                                    offset.x + extent.width,
+                                    offset.y + extent.height,
+                                    false, color_value,
+                                    clear_depth, value.depth,
+                                    clear_stencil ? 0xff : 0, value.stencil);
+         }
+      }
+      return;
+   }
+
    for (uint32_t r = 0; r < rectCount; ++r) {
       const VkOffset2D offset = pRects[r].rect.offset;
       const VkExtent2D extent = pRects[r].rect.extent;
@@ -1282,6 +1328,22 @@
                                              clear_depth, clear_stencil,
                                              clear_att.clearValue.
                                                 depthStencil.stencil);
+
+            /* From the SKL PRM, Depth Buffer Clear:
+             *
+             * Depth Buffer Clear Workaround
+             * Depth buffer clear pass using any of the methods (WM_STATE,
+             * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a
+             * PIPE_CONTROL command with DEPTH_STALL bit and Depth FLUSH bits
+             * “set” before starting to render. DepthStall and DepthFlush are
+             * not needed between consecutive depth clear passes nor is it
+             * required if the depth-clear pass was done with “full_surf_clear”
+             * bit set in the 3DSTATE_WM_HZ_OP.
+             */
+            if (clear_depth) {
+               cmd_buffer->state.pending_pipe_bits |=
+                  ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_DEPTH_STALL_BIT;
+            }
          }
       }
 
@@ -1299,8 +1361,10 @@
 static void
 resolve_image(struct blorp_batch *batch,
               const struct anv_image *src_image,
+              enum isl_aux_usage src_aux_usage,
               uint32_t src_level, uint32_t src_layer,
               const struct anv_image *dst_image,
+              enum isl_aux_usage dst_aux_usage,
               uint32_t dst_level, uint32_t dst_layer,
               VkImageAspectFlags aspect_mask,
               uint32_t src_x, uint32_t src_y, uint32_t dst_x, uint32_t dst_y,
@@ -1317,9 +1381,9 @@
 
       struct blorp_surf src_surf, dst_surf;
       get_blorp_surf_for_anv_image(src_image, aspect,
-                                   src_image->aux_usage, &src_surf);
+                                   src_aux_usage, &src_surf);
       get_blorp_surf_for_anv_image(dst_image, aspect,
-                                   dst_image->aux_usage, &dst_surf);
+                                   dst_aux_usage, &dst_surf);
 
       blorp_blit(batch,
                  &src_surf, src_level, src_layer,
@@ -1359,9 +1423,11 @@
 
       for (uint32_t layer = 0; layer < layer_count; layer++) {
          resolve_image(&batch,
-                       src_image, pRegions[r].srcSubresource.mipLevel,
+                       src_image, src_image->aux_usage,
+                       pRegions[r].srcSubresource.mipLevel,
                        pRegions[r].srcSubresource.baseArrayLayer + layer,
-                       dst_image, pRegions[r].dstSubresource.mipLevel,
+                       dst_image, dst_image->aux_usage,
+                       pRegions[r].dstSubresource.mipLevel,
                        pRegions[r].dstSubresource.baseArrayLayer + layer,
                        pRegions[r].dstSubresource.aspectMask,
                        pRegions[r].srcOffset.x, pRegions[r].srcOffset.y,
@@ -1374,19 +1440,26 @@
 }
 
 void
-anv_image_ccs_clear(struct anv_cmd_buffer *cmd_buffer,
-                    const struct anv_image *image,
-                    const struct isl_view *view,
-                    const VkImageSubresourceRange *subresourceRange)
+anv_image_fast_clear(struct anv_cmd_buffer *cmd_buffer,
+                     const struct anv_image *image,
+                     const uint32_t base_level, const uint32_t level_count,
+                     const uint32_t base_layer, uint32_t layer_count)
 {
    assert(image->type == VK_IMAGE_TYPE_3D || image->extent.depth == 1);
 
+   if (image->type == VK_IMAGE_TYPE_3D) {
+      assert(base_layer == 0);
+      assert(layer_count == anv_minify(image->extent.depth, base_level));
+   }
+
    struct blorp_batch batch;
    blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
 
    struct blorp_surf surf;
    get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT,
-                                image->aux_usage, &surf);
+                                image->aux_usage == ISL_AUX_USAGE_NONE ?
+                                ISL_AUX_USAGE_CCS_D : image->aux_usage,
+                                &surf);
 
    /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
     *
@@ -1406,11 +1479,8 @@
    cmd_buffer->state.pending_pipe_bits |=
       ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT;
 
-   const uint32_t level_count =
-      view ? view->levels : anv_get_levelCount(image, subresourceRange);
    for (uint32_t l = 0; l < level_count; l++) {
-      const uint32_t level =
-         (view ? view->base_level : subresourceRange->baseMipLevel) + l;
+      const uint32_t level = base_level + l;
 
       const VkExtent3D extent = {
          .width = anv_minify(image->extent.width, level),
@@ -1418,21 +1488,13 @@
          .depth = anv_minify(image->extent.depth, level),
       };
 
-      /* Blorp likes to treat 2D_ARRAY and 3D the same. */
-      uint32_t blorp_base_layer, blorp_layer_count;
-      if (view) {
-         blorp_base_layer = view->base_array_layer;
-         blorp_layer_count = view->array_len;
-      } else if (image->type == VK_IMAGE_TYPE_3D) {
-         blorp_base_layer = 0;
-         blorp_layer_count = extent.depth;
-      } else {
-         blorp_base_layer = subresourceRange->baseArrayLayer;
-         blorp_layer_count = anv_get_layerCount(image, subresourceRange);
-      }
+      if (image->type == VK_IMAGE_TYPE_3D)
+         layer_count = extent.depth;
 
+      assert(level < anv_image_aux_levels(image));
+      assert(base_layer + layer_count <= anv_image_aux_layers(image, level));
       blorp_fast_clear(&batch, &surf, surf.surf->format,
-                       level, blorp_base_layer, blorp_layer_count,
+                       level, base_layer, layer_count,
                        0, 0, extent.width, extent.height);
    }
 
@@ -1440,161 +1502,16 @@
       ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT;
 }
 
-static void
-ccs_resolve_attachment(struct anv_cmd_buffer *cmd_buffer,
-                       struct blorp_batch *batch,
-                       uint32_t att)
-{
-   struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
-   struct anv_attachment_state *att_state =
-      &cmd_buffer->state.attachments[att];
-
-   if (att_state->aux_usage == ISL_AUX_USAGE_NONE ||
-       att_state->aux_usage == ISL_AUX_USAGE_MCS)
-      return; /* Nothing to resolve */
-
-   assert(att_state->aux_usage == ISL_AUX_USAGE_CCS_E ||
-          att_state->aux_usage == ISL_AUX_USAGE_CCS_D);
-
-   struct anv_render_pass *pass = cmd_buffer->state.pass;
-   const uint32_t subpass_idx = anv_get_subpass_id(&cmd_buffer->state);
-
-   /* Scan forward to see what all ways this attachment will be used.
-    * Ideally, we would like to resolve in the same subpass as the last write
-    * of a particular attachment.  That way we only resolve once but it's
-    * still hot in the cache.
-    */
-   bool found_draw = false;
-   enum anv_subpass_usage usage = 0;
-   for (uint32_t s = subpass_idx + 1; s < pass->subpass_count; s++) {
-      usage |= pass->attachments[att].subpass_usage[s];
-
-      if (usage & (ANV_SUBPASS_USAGE_DRAW | ANV_SUBPASS_USAGE_RESOLVE_DST)) {
-         /* We found another subpass that draws to this attachment.  We'll
-          * wait to resolve until then.
-          */
-         found_draw = true;
-         break;
-      }
-   }
-
-   struct anv_image_view *iview = fb->attachments[att];
-   const struct anv_image *image = iview->image;
-   assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
-
-   enum blorp_fast_clear_op resolve_op = BLORP_FAST_CLEAR_OP_NONE;
-   if (!found_draw) {
-      /* This is the last subpass that writes to this attachment so we need to
-       * resolve here.  Ideally, we would like to only resolve if the storeOp
-       * is set to VK_ATTACHMENT_STORE_OP_STORE.  However, we need to ensure
-       * that the CCS bits are set to "resolved" because there may be copy or
-       * blit operations (which may ignore CCS) between now and the next time
-       * we render and we need to ensure that anything they write will be
-       * respected in the next render.  Unfortunately, the hardware does not
-       * provide us with any sort of "invalidate" pass that sets the CCS to
-       * "resolved" without writing to the render target.
-       */
-      if (iview->image->aux_usage != ISL_AUX_USAGE_CCS_E) {
-         /* The image destination surface doesn't support compression outside
-          * the render pass.  We need a full resolve.
-          */
-         resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
-      } else if (att_state->fast_clear) {
-         /* We don't know what to do with clear colors outside the render
-          * pass.  We need a partial resolve. Only transparent black is
-          * built into the surface state object and thus no resolve is
-          * required for this case.
-          */
-         if (att_state->clear_value.color.uint32[0] ||
-             att_state->clear_value.color.uint32[1] ||
-             att_state->clear_value.color.uint32[2] ||
-             att_state->clear_value.color.uint32[3])
-            resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL;
-      } else {
-         /* The image "natively" supports all the compression we care about
-          * and we don't need to resolve at all.  If this is the case, we also
-          * don't need to resolve for any of the input attachment cases below.
-          */
-      }
-   } else if (usage & ANV_SUBPASS_USAGE_INPUT) {
-      /* Input attachments are clear-color aware so, at least on Sky Lake, we
-       * can frequently sample from them with no resolves at all.
-       */
-      if (att_state->aux_usage != att_state->input_aux_usage) {
-         assert(att_state->input_aux_usage == ISL_AUX_USAGE_NONE);
-         resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
-      } else if (!att_state->clear_color_is_zero_one) {
-         /* Sky Lake PRM, Vol. 2d, RENDER_SURFACE_STATE::Red Clear Color:
-          *
-          *    "If Number of Multisamples is MULTISAMPLECOUNT_1 AND if this RT
-          *    is fast cleared with non-0/1 clear value, this RT must be
-          *    partially resolved (refer to Partial Resolve operation) before
-          *    binding this surface to Sampler."
-          */
-         resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL;
-      }
-   }
-
-   if (resolve_op == BLORP_FAST_CLEAR_OP_NONE)
-      return;
-
-   struct blorp_surf surf;
-   get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT,
-                                att_state->aux_usage, &surf);
-   if (att_state->fast_clear)
-      surf.clear_color = vk_to_isl_color(att_state->clear_value.color);
-
-   /* From the Sky Lake PRM Vol. 7, "Render Target Resolve":
-    *
-    *    "When performing a render target resolve, PIPE_CONTROL with end of
-    *    pipe sync must be delivered."
-    *
-    * This comment is a bit cryptic and doesn't really tell you what's going
-    * or what's really needed.  It appears that fast clear ops are not
-    * properly synchronized with other drawing.  We need to use a PIPE_CONTROL
-    * to ensure that the contents of the previous draw hit the render target
-    * before we resolve and then use a second PIPE_CONTROL after the resolve
-    * to ensure that it is completed before any additional drawing occurs.
-    */
-   cmd_buffer->state.pending_pipe_bits |=
-      ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT;
-
-   for (uint32_t layer = 0; layer < fb->layers; layer++) {
-      blorp_ccs_resolve(batch, &surf,
-                        iview->isl.base_level,
-                        iview->isl.base_array_layer + layer,
-                        iview->isl.format, resolve_op);
-   }
-
-   cmd_buffer->state.pending_pipe_bits |=
-      ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT;
-
-   /* Once we've done any sort of resolve, we're no longer fast-cleared */
-   att_state->fast_clear = false;
-   if (att_state->aux_usage == ISL_AUX_USAGE_CCS_D)
-      att_state->aux_usage = ISL_AUX_USAGE_NONE;
-}
-
 void
 anv_cmd_buffer_resolve_subpass(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
    struct anv_subpass *subpass = cmd_buffer->state.subpass;
 
-
-   struct blorp_batch batch;
-   blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
-
-   for (uint32_t i = 0; i < subpass->color_count; ++i) {
-      const uint32_t att = subpass->color_attachments[i].attachment;
-      if (att == VK_ATTACHMENT_UNUSED)
-         continue;
-
-      assert(att < cmd_buffer->state.pass->attachment_count);
-      ccs_resolve_attachment(cmd_buffer, &batch, att);
-   }
-
    if (subpass->has_resolve) {
+      struct blorp_batch batch;
+      blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+
       /* We are about to do some MSAA resolves.  We need to flush so that the
        * result of writes to the MSAA color attachments show up in the sampler
        * when we blit to the single-sampled resolve target.
@@ -1627,25 +1544,29 @@
          struct anv_image_view *src_iview = fb->attachments[src_att];
          struct anv_image_view *dst_iview = fb->attachments[dst_att];
 
+         enum isl_aux_usage src_aux_usage =
+            cmd_buffer->state.attachments[src_att].aux_usage;
+         enum isl_aux_usage dst_aux_usage =
+            cmd_buffer->state.attachments[dst_att].aux_usage;
+
          const VkRect2D render_area = cmd_buffer->state.render_area;
 
          assert(src_iview->aspect_mask == dst_iview->aspect_mask);
-         resolve_image(&batch, src_iview->image,
+
+         resolve_image(&batch, src_iview->image, src_aux_usage,
                        src_iview->isl.base_level,
                        src_iview->isl.base_array_layer,
-                       dst_iview->image,
+                       dst_iview->image, dst_aux_usage,
                        dst_iview->isl.base_level,
                        dst_iview->isl.base_array_layer,
                        src_iview->aspect_mask,
                        render_area.offset.x, render_area.offset.y,
                        render_area.offset.x, render_area.offset.y,
                        render_area.extent.width, render_area.extent.height);
-
-         ccs_resolve_attachment(cmd_buffer, &batch, dst_att);
       }
-   }
 
-   blorp_batch_finish(&batch);
+      blorp_batch_finish(&batch);
+   }
 }
 
 void
@@ -1680,8 +1601,47 @@
    };
    surf.aux_usage = ISL_AUX_USAGE_HIZ;
 
-   surf.clear_color.u32[0] = (uint32_t) ANV_HZ_FC_VAL;
+   surf.clear_color.f32[0] = ANV_HZ_FC_VAL;
 
-   blorp_gen6_hiz_op(&batch, &surf, 0, 0, op);
+   blorp_hiz_op(&batch, &surf, 0, 0, 1, op);
+   blorp_batch_finish(&batch);
+}
+
+void
+anv_ccs_resolve(struct anv_cmd_buffer * const cmd_buffer,
+                const struct anv_state surface_state,
+                const struct anv_image * const image,
+                const uint8_t level, const uint32_t layer_count,
+                const enum blorp_fast_clear_op op)
+{
+   assert(cmd_buffer && image);
+
+   /* The resolved subresource range must have a CCS buffer. */
+   assert(level < anv_image_aux_levels(image));
+   assert(layer_count <= anv_image_aux_layers(image, level));
+   assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT && image->samples == 1);
+
+   /* Create a binding table for this surface state. */
+   uint32_t binding_table;
+   VkResult result =
+      binding_table_for_surface_state(cmd_buffer, surface_state,
+                                      &binding_table);
+   if (result != VK_SUCCESS)
+      return;
+
+   struct blorp_batch batch;
+   blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer,
+                    BLORP_BATCH_PREDICATE_ENABLE);
+
+   struct blorp_surf surf;
+   get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT,
+                                image->aux_usage == ISL_AUX_USAGE_CCS_E ?
+                                ISL_AUX_USAGE_CCS_E : ISL_AUX_USAGE_CCS_D,
+                                &surf);
+
+   blorp_ccs_resolve_attachment(&batch, binding_table, &surf, level,
+                                layer_count, image->color_surface.isl.format,
+                                op);
+
    blorp_batch_finish(&batch);
 }
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index c65eba2..5eec67c 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -120,11 +120,13 @@
    cmd_buffer->batch.status = VK_SUCCESS;
 
    memset(&state->descriptors, 0, sizeof(state->descriptors));
+   for (uint32_t i = 0; i < ARRAY_SIZE(state->push_descriptors); i++) {
+      vk_free(&cmd_buffer->pool->alloc, state->push_descriptors[i]);
+      state->push_descriptors[i] = NULL;
+   }
    for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
-      if (state->push_constants[i] != NULL) {
-         vk_free(&cmd_buffer->pool->alloc, state->push_constants[i]);
-         state->push_constants[i] = NULL;
-      }
+      vk_free(&cmd_buffer->pool->alloc, state->push_constants[i]);
+      state->push_constants[i] = NULL;
    }
    memset(state->binding_tables, 0, sizeof(state->binding_tables));
    memset(state->samplers, 0, sizeof(state->samplers));
@@ -148,10 +150,8 @@
    state->pma_fix_enabled = false;
    state->hiz_enabled = false;
 
-   if (state->attachments != NULL) {
-      vk_free(&cmd_buffer->pool->alloc, state->attachments);
-      state->attachments = NULL;
-   }
+   vk_free(&cmd_buffer->pool->alloc, state->attachments);
+   state->attachments = NULL;
 
    state->gen7.index_buffer = NULL;
 }
@@ -212,12 +212,12 @@
       goto fail;
 
    anv_state_stream_init(&cmd_buffer->surface_state_stream,
-                         &device->surface_state_block_pool);
+                         &device->surface_state_pool, 4096);
    anv_state_stream_init(&cmd_buffer->dynamic_state_stream,
-                         &device->dynamic_state_block_pool);
+                         &device->dynamic_state_pool, 16384);
 
-   memset(&cmd_buffer->state.push_descriptor, 0,
-          sizeof(cmd_buffer->state.push_descriptor));
+   memset(cmd_buffer->state.push_descriptors, 0,
+          sizeof(cmd_buffer->state.push_descriptors));
 
    if (pool) {
       list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
@@ -276,7 +276,8 @@
    anv_state_stream_finish(&cmd_buffer->surface_state_stream);
    anv_state_stream_finish(&cmd_buffer->dynamic_state_stream);
 
-   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
+   anv_cmd_state_reset(cmd_buffer);
+
    vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
 }
 
@@ -306,11 +307,11 @@
 
    anv_state_stream_finish(&cmd_buffer->surface_state_stream);
    anv_state_stream_init(&cmd_buffer->surface_state_stream,
-                         &cmd_buffer->device->surface_state_block_pool);
+                         &cmd_buffer->device->surface_state_pool, 4096);
 
    anv_state_stream_finish(&cmd_buffer->dynamic_state_stream);
    anv_state_stream_init(&cmd_buffer->dynamic_state_stream,
-                         &cmd_buffer->device->dynamic_state_block_pool);
+                         &cmd_buffer->device->dynamic_state_pool, 16384);
    return VK_SUCCESS;
 }
 
@@ -335,6 +336,8 @@
       return gen8_cmd_buffer_emit_state_base_address(cmd_buffer);
    case 9:
       return gen9_cmd_buffer_emit_state_base_address(cmd_buffer);
+   case 10:
+      return gen10_cmd_buffer_emit_state_base_address(cmd_buffer);
    default:
       unreachable("unsupported gen\n");
    }
@@ -564,7 +567,7 @@
    /* We have to defer setting up vertex buffer since we need the buffer
     * stride from the pipeline. */
 
-   assert(firstBinding + bindingCount < MAX_VBS);
+   assert(firstBinding + bindingCount <= MAX_VBS);
    for (uint32_t i = 0; i < bindingCount; i++) {
       vb[firstBinding + i].buffer = anv_buffer_from_handle(pBuffers[i]);
       vb[firstBinding + i].offset = pOffsets[i];
@@ -832,6 +835,26 @@
    return iview;
 }
 
+static VkResult
+anv_cmd_buffer_ensure_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
+                                          uint32_t set)
+{
+   struct anv_push_descriptor_set **push_set =
+      &cmd_buffer->state.push_descriptors[set];
+
+   if (*push_set == NULL) {
+      *push_set = vk_alloc(&cmd_buffer->pool->alloc,
+                           sizeof(struct anv_push_descriptor_set), 8,
+                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+      if (*push_set == NULL) {
+         anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
 void anv_CmdPushDescriptorSetKHR(
     VkCommandBuffer commandBuffer,
     VkPipelineBindPoint pipelineBindPoint,
@@ -849,12 +872,17 @@
 
    const struct anv_descriptor_set_layout *set_layout =
       layout->set[_set].layout;
-   struct anv_descriptor_set *set = &cmd_buffer->state.push_descriptor.set;
+
+   if (anv_cmd_buffer_ensure_push_descriptor_set(cmd_buffer, _set) != VK_SUCCESS)
+      return;
+   struct anv_push_descriptor_set *push_set =
+      cmd_buffer->state.push_descriptors[_set];
+   struct anv_descriptor_set *set = &push_set->set;
 
    set->layout = set_layout;
    set->size = anv_descriptor_set_layout_size(set_layout);
    set->buffer_count = set_layout->buffer_count;
-   set->buffer_views = cmd_buffer->state.push_descriptor.buffer_views;
+   set->buffer_views = push_set->buffer_views;
 
    /* Go through the user supplied descriptors. */
    for (uint32_t i = 0; i < descriptorWriteCount; i++) {
@@ -935,12 +963,17 @@
 
    const struct anv_descriptor_set_layout *set_layout =
       layout->set[_set].layout;
-   struct anv_descriptor_set *set = &cmd_buffer->state.push_descriptor.set;
+
+   if (anv_cmd_buffer_ensure_push_descriptor_set(cmd_buffer, _set) != VK_SUCCESS)
+      return;
+   struct anv_push_descriptor_set *push_set =
+      cmd_buffer->state.push_descriptors[_set];
+   struct anv_descriptor_set *set = &push_set->set;
 
    set->layout = set_layout;
    set->size = anv_descriptor_set_layout_size(set_layout);
    set->buffer_count = set_layout->buffer_count;
-   set->buffer_views = cmd_buffer->state.push_descriptor.buffer_views;
+   set->buffer_views = push_set->buffer_views;
 
    anv_descriptor_set_write_template(set,
                                      cmd_buffer->device,
diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c
index 4797c1e..8407798 100644
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -345,7 +345,7 @@
    pool->free_list = EMPTY;
 
    anv_state_stream_init(&pool->surface_state_stream,
-                         &device->surface_state_block_pool);
+                         &device->surface_state_pool, 4096);
    pool->surface_state_free_list = NULL;
 
    *pDescriptorPool = anv_descriptor_pool_to_handle(pool);
@@ -380,7 +380,7 @@
    pool->free_list = EMPTY;
    anv_state_stream_finish(&pool->surface_state_stream);
    anv_state_stream_init(&pool->surface_state_stream,
-                         &device->surface_state_block_pool);
+                         &device->surface_state_pool, 4096);
    pool->surface_state_free_list = NULL;
 
    return VK_SUCCESS;
@@ -615,12 +615,9 @@
 
    *desc = (struct anv_descriptor) {
       .type = type,
+      .layout = info->imageLayout,
       .image_view = image_view,
       .sampler = sampler,
-      .aux_usage = image_view == NULL ? ISL_AUX_USAGE_NONE :
-                   anv_layout_to_aux_usage(devinfo, image_view->image,
-                                           image_view->aspect_mask,
-                                           info->imageLayout),
    };
 }
 
@@ -767,7 +764,7 @@
 
    for (uint32_t i = 0; i < descriptorCopyCount; i++) {
       const VkCopyDescriptorSet *copy = &pDescriptorCopies[i];
-      ANV_FROM_HANDLE(anv_descriptor_set, src, copy->dstSet);
+      ANV_FROM_HANDLE(anv_descriptor_set, src, copy->srcSet);
       ANV_FROM_HANDLE(anv_descriptor_set, dst, copy->dstSet);
 
       const struct anv_descriptor_set_binding_layout *src_layout =
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 040f438..d4bfc2d 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -35,7 +35,7 @@
 #include "util/debug.h"
 #include "util/build_id.h"
 #include "util/mesa-sha1.h"
-#include "util/vk_util.h"
+#include "vk_util.h"
 
 #include "genxml/gen7_pack.h"
 
@@ -295,7 +295,7 @@
       fprintf(stderr, "WARNING: Ivy Bridge Vulkan support is incomplete\n");
    } else if (device->info.gen == 7 && device->info.is_baytrail) {
       fprintf(stderr, "WARNING: Bay Trail Vulkan support is incomplete\n");
-   } else if (device->info.gen >= 8) {
+   } else if (device->info.gen >= 8 && device->info.gen <= 9) {
       /* Broadwell, Cherryview, Skylake, Broxton, Kabylake is as fully
        * supported as anything */
    } else {
@@ -362,8 +362,9 @@
 
    if (device->info.is_cherryview &&
        device->subslice_total > 0 && device->eu_total > 0) {
-      /* Logical CS threads = EUs per subslice * 7 threads per EU */
-      uint32_t max_cs_threads = device->eu_total / device->subslice_total * 7;
+      /* Logical CS threads = EUs per subslice * num threads per EU */
+      uint32_t max_cs_threads =
+         device->eu_total / device->subslice_total * device->info.num_thread_per_eu;
 
       /* Fuse configurations may give more threads than expected, never less. */
       if (max_cs_threads > device->info.max_cs_threads)
@@ -410,13 +411,41 @@
 
 static const VkExtensionProperties global_extensions[] = {
    {
+      .extensionName = VK_GOOGLE_IMAGE_USAGE_SCANOUT_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+   {
       .extensionName = VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
       .specVersion = 1,
    },
    {
+      .extensionName = VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME,
+      .specVersion = 1,
+   },   
+   {
+       .extensionName = VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
+       .specVersion = 1,
+   },   
+   {
+      .extensionName = VK_KHR_GET_SURFACE_CAPABILITIES_2_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+   {
       .extensionName = VK_KHR_SURFACE_EXTENSION_NAME,
       .specVersion = 25,
    },
+#ifdef VK_USE_PLATFORM_MAGMA_KHR
+   {
+      .extensionName = VK_KHR_MAGMA_SURFACE_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+#endif
+#ifdef VK_USE_PLATFORM_WAYLAND_KHR
+   {
+      .extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME,
+      .specVersion = 6,
+   },
+#endif
 #ifdef VK_USE_PLATFORM_XCB_KHR
    {
       .extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME,
@@ -429,51 +458,11 @@
       .specVersion = 6,
    },
 #endif
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
-   {
-      .extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME,
-      .specVersion = 5,
-   },
-#endif
-#ifdef VK_USE_PLATFORM_MAGMA_KHR
-   {
-      .extensionName = VK_KHR_MAGMA_SURFACE_EXTENSION_NAME,
-      .specVersion = 1,
-   },
-#endif
-   {
-       .extensionName = VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
-       .specVersion = 1,
-   },
-   {
-      .extensionName = VK_GOOGLE_IMAGE_USAGE_SCANOUT_EXTENSION_NAME,
-      .specVersion = 1,
-   },
-   {
-      .extensionName = VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME,
-      .specVersion = 1,
-   },
 };
 
 static const VkExtensionProperties device_extensions[] = {
    {
-      .extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME,
-      .specVersion = 68,
-   },
-   {
-      .extensionName = VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME,
-      .specVersion = 1,
-   },
-   {
-      .extensionName = VK_KHR_MAINTENANCE1_EXTENSION_NAME,
-      .specVersion = 1,
-   },
-   {
-      .extensionName = VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME,
-      .specVersion = 1,
-   },
-   {
-      .extensionName = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME,
+      .extensionName = VK_KHR_DEDICATED_ALLOCATION_EXTENSION_NAME,
       .specVersion = 1,
    },
    {
@@ -481,10 +470,6 @@
       .specVersion = 1,
    },
    {
-      .extensionName = VK_KHR_INCREMENTAL_PRESENT_EXTENSION_NAME,
-      .specVersion = 1,
-   },
-   {
       .extensionName = VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
       .specVersion = 1,
    },
@@ -504,6 +489,42 @@
       .extensionName = VK_KHR_EXTERNAL_SEMAPHORE_FUCHSIA_EXTENSION_NAME,
       .specVersion = 1,
    },
+   {
+      .extensionName = VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+   {
+      .extensionName = VK_KHR_INCREMENTAL_PRESENT_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+   {
+      .extensionName = VK_KHR_MAINTENANCE1_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+   {
+      .extensionName = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+   {
+      .extensionName = VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+   {
+      .extensionName = VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+   {
+      .extensionName = VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME,
+      .specVersion = 1,
+   },
+   {
+      .extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME,
+      .specVersion = 68,
+   },
+   {
+      .extensionName = VK_KHR_VARIABLE_POINTERS_EXTENSION_NAME,
+      .specVersion = 1,
+   },
 };
 
 static void *
@@ -695,7 +716,7 @@
       .sampleRateShading                        = true,
       .dualSrcBlend                             = true,
       .logicOp                                  = true,
-      .multiDrawIndirect                        = false,
+      .multiDrawIndirect                        = true,
       .drawIndirectFirstInstance                = true,
       .depthClamp                               = true,
       .depthBiasClamp                           = true,
@@ -747,6 +768,22 @@
 
    vk_foreach_struct(ext, pFeatures->pNext) {
       switch (ext->sType) {
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES_KHX: {
+         VkPhysicalDeviceMultiviewFeaturesKHX *features =
+            (VkPhysicalDeviceMultiviewFeaturesKHX *)ext;
+         features->multiview = true;
+         features->multiviewGeometryShader = true;
+         features->multiviewTessellationShader = true;
+         break;
+      }
+
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES_KHR: {
+         VkPhysicalDeviceVariablePointerFeaturesKHR *features = (void *)ext;
+         features->variablePointersStorageBuffer = true;
+         features->variablePointers = false;
+         break;
+      }
+
       default:
          anv_debug_ignored_stype(ext->sType);
          break;
@@ -765,6 +802,9 @@
    const uint32_t max_raw_buffer_sz = devinfo->gen >= 7 ?
                                       (1ul << 30) : (1ul << 27);
 
+   const uint32_t max_samplers = (devinfo->gen >= 8 || devinfo->is_haswell) ?
+                                 128 : 16;
+
    VkSampleCountFlags sample_counts =
       isl_device_get_sample_counts(&pdevice->isl_dev);
 
@@ -783,13 +823,13 @@
       .bufferImageGranularity                   = 64, /* A cache line */
       .sparseAddressSpaceSize                   = 0,
       .maxBoundDescriptorSets                   = MAX_SETS,
-      .maxPerStageDescriptorSamplers            = 64,
+      .maxPerStageDescriptorSamplers            = max_samplers,
       .maxPerStageDescriptorUniformBuffers      = 64,
       .maxPerStageDescriptorStorageBuffers      = 64,
-      .maxPerStageDescriptorSampledImages       = 64,
+      .maxPerStageDescriptorSampledImages       = max_samplers,
       .maxPerStageDescriptorStorageImages       = 64,
       .maxPerStageDescriptorInputAttachments    = 64,
-      .maxPerStageResources                     = 128,
+      .maxPerStageResources                     = 250,
       .maxDescriptorSetSamplers                 = 256,
       .maxDescriptorSetUniformBuffers           = 256,
       .maxDescriptorSetUniformBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
@@ -865,7 +905,7 @@
       .storageImageSampleCounts                 = VK_SAMPLE_COUNT_1_BIT,
       .maxSampleMaskWords                       = 1,
       .timestampComputeAndGraphics              = false,
-      .timestampPeriod                          = devinfo->timebase_scale,
+      .timestampPeriod                          = 1000000000.0 / devinfo->timestamp_frequency,
       .maxClipDistances                         = 8,
       .maxCullDistances                         = 8,
       .maxCombinedClipAndCullDistances          = 8,
@@ -882,8 +922,8 @@
    };
 
    *pProperties = (VkPhysicalDeviceProperties) {
-      .apiVersion = VK_MAKE_VERSION(1, 0, 42),
-      .driverVersion = 1,
+      .apiVersion = VK_MAKE_VERSION(1, 0, 54),
+      .driverVersion = vk_get_driver_version(),
       .vendorID = 0x8086,
       .deviceID = pdevice->chipset_id,
       .deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
@@ -891,7 +931,8 @@
       .sparseProperties = {0}, /* Broadwell doesn't do sparse. */
    };
 
-   strcpy(pProperties->deviceName, pdevice->name);
+   snprintf(pProperties->deviceName, sizeof(pProperties->deviceName),
+            "%s", pdevice->name);
    memcpy(pProperties->pipelineCacheUUID,
           pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
 }
@@ -924,6 +965,14 @@
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES_KHX: {
+         VkPhysicalDeviceMultiviewPropertiesKHX *properties =
+            (VkPhysicalDeviceMultiviewPropertiesKHX *)ext;
+         properties->maxMultiviewViewCount = 16;
+         properties->maxMultiviewInstanceIndex = UINT32_MAX / 16;
+         break;
+      }
+
       default:
          anv_debug_ignored_stype(ext->sType);
          break;
@@ -1062,7 +1111,7 @@
    state = anv_state_pool_alloc(pool, size, align);
    memcpy(state.map, p, size);
 
-   anv_state_flush(pool->block_pool->device, state);
+   anv_state_flush(pool->block_pool.device, state);
 
    return state;
 }
@@ -1093,62 +1142,6 @@
                                                     border_colors);
 }
 
-VkResult
-anv_device_submit_simple_batch(struct anv_device *device,
-                               struct anv_batch *batch)
-{
-   struct drm_i915_gem_execbuffer2 execbuf;
-   struct drm_i915_gem_exec_object2 exec2_objects[1];
-   struct anv_bo bo, *exec_bos[1];
-   VkResult result = VK_SUCCESS;
-   uint32_t size;
-
-   /* Kernel driver requires 8 byte aligned batch length */
-   size = align_u32(batch->next - batch->start, 8);
-   result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size);
-   if (result != VK_SUCCESS)
-      return result;
-
-   memcpy(bo.map, batch->start, size);
-   if (!device->info.has_llc)
-      anv_flush_range(bo.map, size);
-
-   exec_bos[0] = &bo;
-   exec2_objects[0].handle = bo.gem_handle;
-   exec2_objects[0].relocation_count = 0;
-   exec2_objects[0].relocs_ptr = 0;
-   exec2_objects[0].alignment = 0;
-   exec2_objects[0].offset = bo.offset;
-   exec2_objects[0].flags = 0;
-   exec2_objects[0].rsvd1 = 0;
-   exec2_objects[0].rsvd2 = bo.size;
-
-   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
-   execbuf.buffer_count = 1;
-   execbuf.batch_start_offset = 0;
-   execbuf.batch_len = size;
-   execbuf.cliprects_ptr = 0;
-   execbuf.num_cliprects = 0;
-   execbuf.DR1 = 0;
-   execbuf.DR4 = 0;
-
-   execbuf.flags =
-      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
-   execbuf.rsvd1 = device->context_id;
-   execbuf.rsvd2 = 0;
-
-   result = anv_device_execbuf(device, &execbuf, exec_bos, 0, NULL, 0, NULL);
-   if (result != VK_SUCCESS)
-      goto fail;
-
-   result = anv_device_wait(device, &bo, INT64_MAX);
-
- fail:
-   anv_bo_pool_free(&device->batch_bo_pool, &bo);
-
-   return result;
-}
-
 VkResult anv_CreateDevice(
     VkPhysicalDevice                            physicalDevice,
     const VkDeviceCreateInfo*                   pCreateInfo,
@@ -1174,6 +1167,19 @@
          return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
    }
 
+   /* Check enabled features */
+   if (pCreateInfo->pEnabledFeatures) {
+      VkPhysicalDeviceFeatures supported_features;
+      anv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features);
+      VkBool32 *supported_feature = (VkBool32 *)&supported_features;
+      VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures;
+      unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
+      for (uint32_t i = 0; i < num_features; i++) {
+         if (enabled_feature[i] && !supported_feature[i])
+            return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+      }
+   }
+
    device = vk_alloc2(&physical_device->instance->alloc, pAllocator,
                        sizeof(*device), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -1249,30 +1255,18 @@
    if (result != VK_SUCCESS)
       goto fail_batch_bo_pool;
 
-   result = anv_block_pool_init(&device->dynamic_state_block_pool, device,
-                                16384);
+   result = anv_state_pool_init(&device->dynamic_state_pool, device, 16384);
    if (result != VK_SUCCESS)
       goto fail_bo_cache;
 
-   anv_state_pool_init(&device->dynamic_state_pool,
-                       &device->dynamic_state_block_pool);
-
-   result = anv_block_pool_init(&device->instruction_block_pool, device,
-                                1024 * 1024);
+   result = anv_state_pool_init(&device->instruction_state_pool, device, 16384);
    if (result != VK_SUCCESS)
       goto fail_dynamic_state_pool;
 
-   anv_state_pool_init(&device->instruction_state_pool,
-                       &device->instruction_block_pool);
-
-   result = anv_block_pool_init(&device->surface_state_block_pool, device,
-                                4096);
+   result = anv_state_pool_init(&device->surface_state_pool, device, 4096);
    if (result != VK_SUCCESS)
       goto fail_instruction_state_pool;
 
-   anv_state_pool_init(&device->surface_state_pool,
-                       &device->surface_state_block_pool);
-
    result = anv_bo_init_new(&device->workaround_bo, device, 1024);
    if (result != VK_SUCCESS)
       goto fail_surface_state_pool;
@@ -1294,6 +1288,9 @@
    case 9:
       result = gen9_init_device_state(device);
       break;
+   case 10:
+      result = gen10_init_device_state(device);
+      break;
    default:
       /* Shouldn't get here as we don't create physical devices for any other
        * gens. */
@@ -1317,13 +1314,10 @@
    anv_gem_close(device, device->workaround_bo.gem_handle);
  fail_surface_state_pool:
    anv_state_pool_finish(&device->surface_state_pool);
-   anv_block_pool_finish(&device->surface_state_block_pool);
  fail_instruction_state_pool:
    anv_state_pool_finish(&device->instruction_state_pool);
-   anv_block_pool_finish(&device->instruction_block_pool);
  fail_dynamic_state_pool:
    anv_state_pool_finish(&device->dynamic_state_pool);
-   anv_block_pool_finish(&device->dynamic_state_block_pool);
  fail_bo_cache:
    anv_bo_cache_finish(&device->bo_cache);
  fail_batch_bo_pool:
@@ -1367,11 +1361,8 @@
    anv_gem_close(device, device->workaround_bo.gem_handle);
 
    anv_state_pool_finish(&device->surface_state_pool);
-   anv_block_pool_finish(&device->surface_state_block_pool);
    anv_state_pool_finish(&device->instruction_state_pool);
-   anv_block_pool_finish(&device->instruction_block_pool);
    anv_state_pool_finish(&device->dynamic_state_pool);
-   anv_block_pool_finish(&device->dynamic_state_block_pool);
 
    anv_bo_cache_finish(&device->bo_cache);
 
@@ -1467,42 +1458,6 @@
    *pQueue = anv_queue_to_handle(&device->queue);
 }
 
-static void restore_temporary_semaphore_imports(struct anv_device* device,
-                                                struct anv_semaphore* semaphores[], uint32_t count)
-{
-   for (uint32_t i = 0; i < count; i++) {
-      if (semaphores[i]->original_platform_semaphore &&
-          semaphores[i]->current_platform_semaphore != semaphores[i]->original_platform_semaphore) {
-         anv_platform_destroy_semaphore(device, semaphores[i]->current_platform_semaphore);
-         semaphores[i]->current_platform_semaphore = semaphores[i]->original_platform_semaphore;
-      }
-   }
-}
-
-VkResult anv_device_execbuf(struct anv_device* device, struct drm_i915_gem_execbuffer2* execbuf,
-                            struct anv_bo** execbuf_bos, uint32_t wait_semaphore_count,
-                            anv_semaphore_t* wait_semaphores, uint32_t signal_semaphore_count,
-                            anv_semaphore_t* signal_semaphores)
-{
-   int ret = anv_gem_execbuffer(device, execbuf, wait_semaphore_count, wait_semaphores,
-                                signal_semaphore_count, signal_semaphores);
-
-   restore_temporary_semaphore_imports(device, wait_semaphores, wait_semaphore_count);
-   restore_temporary_semaphore_imports(device, signal_semaphores, signal_semaphore_count);
-
-   if (ret != 0) {
-      /* We don't know the real error. */
-      device->lost = true;
-      return vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
-   }
-
-   struct drm_i915_gem_exec_object2 *objects = (void *)execbuf->buffers_ptr;
-   for (uint32_t k = 0; k < execbuf->buffer_count; k++)
-      execbuf_bos[k]->offset = objects[k].offset;
-
-   return VK_SUCCESS;
-}
-
 VkResult
 anv_device_query_status(struct anv_device *device)
 {
@@ -1580,141 +1535,6 @@
    return anv_device_query_status(device);
 }
 
-VkResult anv_QueueSubmit(
-    VkQueue                                     _queue,
-    uint32_t                                    submitCount,
-    const VkSubmitInfo*                         pSubmits,
-    VkFence                                     _fence)
-{
-   ANV_FROM_HANDLE(anv_queue, queue, _queue);
-   ANV_FROM_HANDLE(anv_fence, fence, _fence);
-   struct anv_device *device = queue->device;
-
-   /* Query for device status prior to submitting.  Technically, we don't need
-    * to do this.  However, if we have a client that's submitting piles of
-    * garbage, we would rather break as early as possible to keep the GPU
-    * hanging contained.  If we don't check here, we'll either be waiting for
-    * the kernel to kick us or we'll have to wait until the client waits on a
-    * fence before we actually know whether or not we've hung.
-    */
-   VkResult result = anv_device_query_status(device);
-   if (result != VK_SUCCESS)
-      return result;
-
-   /* We lock around QueueSubmit for three main reasons:
-    *
-    *  1) When a block pool is resized, we create a new gem handle with a
-    *     different size and, in the case of surface states, possibly a
-    *     different center offset but we re-use the same anv_bo struct when
-    *     we do so.  If this happens in the middle of setting up an execbuf,
-    *     we could end up with our list of BOs out of sync with our list of
-    *     gem handles.
-    *
-    *  2) The algorithm we use for building the list of unique buffers isn't
-    *     thread-safe.  While the client is supposed to syncronize around
-    *     QueueSubmit, this would be extremely difficult to debug if it ever
-    *     came up in the wild due to a broken app.  It's better to play it
-    *     safe and just lock around QueueSubmit.
-    *
-    *  3)  The anv_cmd_buffer_execbuf function may perform relocations in
-    *      userspace.  Due to the fact that the surface state buffer is shared
-    *      between batches, we can't afford to have that happen from multiple
-    *      threads at the same time.  Even though the user is supposed to
-    *      ensure this doesn't happen, we play it safe as in (2) above.
-    *
-    * Since the only other things that ever take the device lock such as block
-    * pool resize only rarely happen, this will almost never be contended so
-    * taking a lock isn't really an expensive operation in this case.
-    */
-   pthread_mutex_lock(&device->mutex);
-
-   for (uint32_t i = 0; i < submitCount; i++) {
-      for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
-         ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
-                         pSubmits[i].pCommandBuffers[j]);
-         assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-         assert(!anv_batch_has_error(&cmd_buffer->batch));
-
-         uint32_t signal_semaphore_count = pSubmits[i].signalSemaphoreCount;
-         anv_semaphore_t* signal_semaphores = (anv_semaphore_t*)pSubmits[i].pSignalSemaphores;
-         anv_semaphore_t semaphore_array_with_fence[signal_semaphore_count + 1];
-
-         if (fence && i == submitCount - 1 && j == pSubmits[i].commandBufferCount - 1) {
-            memcpy(semaphore_array_with_fence, pSubmits[i].pSignalSemaphores,
-                   signal_semaphore_count * sizeof(anv_semaphore_t));
-            semaphore_array_with_fence[signal_semaphore_count] = fence->semaphore;
-            signal_semaphore_count++;
-            signal_semaphores = semaphore_array_with_fence;
-
-            assert(fence->state == ANV_FENCE_STATE_RESET);
-            fence->state = ANV_FENCE_STATE_SUBMITTED;
-            pthread_cond_broadcast(&device->queue_submit);
-
-            // Signal that fence has been handled so we don't execute the extra command buffer below
-            fence = NULL;
-         }
-
-         result = anv_cmd_buffer_execbuf(device, cmd_buffer, pSubmits[i].waitSemaphoreCount,
-                                         (anv_semaphore_t*)pSubmits[i].pWaitSemaphores,
-                                         signal_semaphore_count, signal_semaphores);
-         if (result != VK_SUCCESS)
-            goto out;
-      }
-   }
-
-   if (fence) {
-      struct anv_bo *fence_bo = &fence->bo;
-      result =
-          anv_device_execbuf(device, &fence->execbuf, &fence_bo, 0, NULL, 1, &fence->semaphore);
-      if (result != VK_SUCCESS)
-         goto out;
-
-      /* Update the fence and wake up any waiters */
-      assert(fence->state == ANV_FENCE_STATE_RESET);
-      fence->state = ANV_FENCE_STATE_SUBMITTED;
-      pthread_cond_broadcast(&device->queue_submit);
-   }
-
-out:
-   if (result != VK_SUCCESS) {
-      /* In the case that something has gone wrong we may end up with an
-       * inconsistent state from which it may not be trivial to recover.
-       * For example, we might have computed address relocations and
-       * any future attempt to re-submit this job will need to know about
-       * this and avoid computing relocation addresses again.
-       *
-       * To avoid this sort of issues, we assume that if something was
-       * wrong during submission we must already be in a really bad situation
-       * anyway (such us being out of memory) and return
-       * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
-       * submit the same job again to this device.
-       */
-      result = VK_ERROR_DEVICE_LOST;
-      device->lost = true;
-
-      /* If we return VK_ERROR_DEVICE LOST here, we need to ensure that
-       * vkWaitForFences() and vkGetFenceStatus() return a valid result
-       * (VK_SUCCESS or VK_ERROR_DEVICE_LOST) in a finite amount of time.
-       * Setting the fence status to SIGNALED ensures this will happen in
-       * any case.
-       */
-      if (fence)
-         fence->state = ANV_FENCE_STATE_SIGNALED;
-   }
-
-   pthread_mutex_unlock(&device->mutex);
-
-   return result;
-}
-
-VkResult anv_QueueWaitIdle(
-    VkQueue                                     _queue)
-{
-   ANV_FROM_HANDLE(anv_queue, queue, _queue);
-
-   return anv_DeviceWaitIdle(anv_device_to_handle(queue->device));
-}
-
 VkResult anv_DeviceWaitIdle(
     VkDevice                                    _device)
 {
@@ -1997,7 +1817,7 @@
       if (ranges[i].offset >= mem->map_size)
          continue;
 
-      anv_clflush_range(mem->map + ranges[i].offset,
+      gen_clflush_range(mem->map + ranges[i].offset,
                         MIN2(ranges[i].size, mem->map_size - ranges[i].offset));
    }
 }
@@ -2066,6 +1886,30 @@
    pMemoryRequirements->memoryTypeBits = memory_types;
 }
 
+void anv_GetBufferMemoryRequirements2KHR(
+    VkDevice                                    _device,
+    const VkBufferMemoryRequirementsInfo2KHR*   pInfo,
+    VkMemoryRequirements2KHR*                   pMemoryRequirements)
+{
+   anv_GetBufferMemoryRequirements(_device, pInfo->buffer,
+                                   &pMemoryRequirements->memoryRequirements);
+
+   vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+      switch (ext->sType) {
+      case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR: {
+         VkMemoryDedicatedRequirementsKHR *requirements = (void *)ext;
+         requirements->prefersDedicatedAllocation = VK_FALSE;
+         requirements->requiresDedicatedAllocation = VK_FALSE;
+         break;
+      }
+
+      default:
+         anv_debug_ignored_stype(ext->sType);
+         break;
+      }
+   }
+}
+
 void anv_GetImageMemoryRequirements(
     VkDevice                                    _device,
     VkImage                                     _image,
@@ -2091,6 +1935,30 @@
    pMemoryRequirements->memoryTypeBits = memory_types;
 }
 
+void anv_GetImageMemoryRequirements2KHR(
+    VkDevice                                    _device,
+    const VkImageMemoryRequirementsInfo2KHR*    pInfo,
+    VkMemoryRequirements2KHR*                   pMemoryRequirements)
+{
+   anv_GetImageMemoryRequirements(_device, pInfo->image,
+                                  &pMemoryRequirements->memoryRequirements);
+
+   vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+      switch (ext->sType) {
+      case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR: {
+         VkMemoryDedicatedRequirementsKHR *requirements = (void *)ext;
+         requirements->prefersDedicatedAllocation = VK_FALSE;
+         requirements->requiresDedicatedAllocation = VK_FALSE;
+         break;
+      }
+
+      default:
+         anv_debug_ignored_stype(ext->sType);
+         break;
+      }
+   }
+}
+
 void anv_GetImageSparseMemoryRequirements(
     VkDevice                                    device,
     VkImage                                     image,
@@ -2100,6 +1968,15 @@
    *pSparseMemoryRequirementCount = 0;
 }
 
+void anv_GetImageSparseMemoryRequirements2KHR(
+    VkDevice                                    device,
+    const VkImageSparseMemoryRequirementsInfo2KHR* pInfo,
+    uint32_t*                                   pSparseMemoryRequirementCount,
+    VkSparseImageMemoryRequirements2KHR*        pSparseMemoryRequirements)
+{
+   *pSparseMemoryRequirementCount = 0;
+}
+
 void anv_GetDeviceMemoryCommitment(
     VkDevice                                    device,
     VkDeviceMemory                              memory,
@@ -2141,370 +2018,6 @@
    return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
 }
 
-VkResult anv_CreateFence(
-    VkDevice                                    _device,
-    const VkFenceCreateInfo*                    pCreateInfo,
-    const VkAllocationCallbacks*                pAllocator,
-    VkFence*                                    pFence)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   struct anv_bo fence_bo;
-   struct anv_fence *fence;
-   struct anv_batch batch;
-   VkSemaphore semaphore;
-   VkResult result;
-
-   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);
-
-   result = anv_CreateSemaphore(_device, NULL, pAllocator, &semaphore);
-   if (result != VK_SUCCESS)
-      return result;
-
-   result = anv_bo_pool_alloc(&device->batch_bo_pool, &fence_bo, 4096);
-   if (result != VK_SUCCESS) {
-      anv_DestroySemaphore(_device, semaphore, pAllocator);
-      return result;
-   }
-
-   /* Fences are small.  Just store the CPU data structure in the BO. */
-   fence = fence_bo.map;
-   fence->bo = fence_bo;
-
-   /* Place the batch after the CPU data but on its own cache line. */
-   const uint32_t batch_offset = align_u32(sizeof(*fence), CACHELINE_SIZE);
-   batch.next = batch.start = fence->bo.map + batch_offset;
-   batch.end = fence->bo.map + fence->bo.size;
-   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
-   anv_batch_emit(&batch, GEN7_MI_NOOP, noop);
-
-   if (!device->info.has_llc) {
-      assert(((uintptr_t) batch.start & CACHELINE_MASK) == 0);
-      assert(batch.next - batch.start <= CACHELINE_SIZE);
-      __builtin_ia32_mfence();
-      __builtin_ia32_clflush(batch.start);
-   }
-
-   fence->exec2_objects[0].handle = fence->bo.gem_handle;
-   fence->exec2_objects[0].relocation_count = 0;
-   fence->exec2_objects[0].relocs_ptr = 0;
-   fence->exec2_objects[0].alignment = 0;
-   fence->exec2_objects[0].offset = fence->bo.offset;
-   fence->exec2_objects[0].flags = 0;
-   fence->exec2_objects[0].rsvd1 = 0;
-   fence->exec2_objects[0].rsvd2 = fence->bo.size;
-
-   fence->execbuf.buffers_ptr = (uintptr_t) fence->exec2_objects;
-   fence->execbuf.buffer_count = 1;
-   fence->execbuf.batch_start_offset = batch.start - fence->bo.map;
-   fence->execbuf.batch_len = batch.next - batch.start;
-   fence->execbuf.cliprects_ptr = 0;
-   fence->execbuf.num_cliprects = 0;
-   fence->execbuf.DR1 = 0;
-   fence->execbuf.DR4 = 0;
-
-   fence->execbuf.flags =
-      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
-   fence->execbuf.rsvd1 = device->context_id;
-   fence->execbuf.rsvd2 = 0;
-
-   if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
-      fence->state = ANV_FENCE_STATE_SIGNALED;
-   } else {
-      fence->state = ANV_FENCE_STATE_RESET;
-   }
-
-   fence->semaphore = (struct anv_semaphore*)semaphore;
-
-   *pFence = anv_fence_to_handle(fence);
-
-   return VK_SUCCESS;
-}
-
-void anv_DestroyFence(
-    VkDevice                                    _device,
-    VkFence                                     _fence,
-    const VkAllocationCallbacks*                pAllocator)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_fence, fence, _fence);
-
-   if (!fence)
-      return;
-
-   assert(fence->bo.map == fence);
-   anv_DestroySemaphore(_device, (VkSemaphore)fence->semaphore, pAllocator);
-
-   anv_bo_pool_free(&device->batch_bo_pool, &fence->bo);
-}
-
-VkResult anv_ResetFences(
-    VkDevice                                    _device,
-    uint32_t                                    fenceCount,
-    const VkFence*                              pFences)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-
-   for (uint32_t i = 0; i < fenceCount; i++) {
-      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-      fence->state = ANV_FENCE_STATE_RESET;
-      anv_platform_reset_semaphore(fence->semaphore->current_platform_semaphore);
-   }
-
-   return VK_SUCCESS;
-}
-
-VkResult anv_GetFenceStatus(
-    VkDevice                                    _device,
-    VkFence                                     _fence)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_fence, fence, _fence);
-   uint64_t t = 0;
-   int ret;
-
-   if (unlikely(device->lost))
-      return VK_ERROR_DEVICE_LOST;
-
-   switch (fence->state) {
-   case ANV_FENCE_STATE_RESET:
-      /* If it hasn't even been sent off to the GPU yet, it's not ready */
-      return VK_NOT_READY;
-
-   case ANV_FENCE_STATE_SIGNALED:
-      /* It's been signaled, return success */
-      return VK_SUCCESS;
-
-   case ANV_FENCE_STATE_SUBMITTED:
-      /* It's been submitted to the GPU but we don't know if it's done yet. */
-      ret = anv_platform_wait_semaphore(fence->semaphore->current_platform_semaphore, t);
-      switch (ret) {
-      case 0:
-         fence->state = ANV_FENCE_STATE_SIGNALED;
-         return VK_SUCCESS;
-      case -ETIME:
-         return VK_NOT_READY;
-      default:
-         /* We don't know the real error. */
-         device->lost = true;
-         return VK_ERROR_DEVICE_LOST;
-      }
-   default:
-      unreachable("Invalid fence status");
-   }
-}
-
-#define NSEC_PER_SEC 1000000000
-#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1)
-
-VkResult anv_WaitForFences(
-    VkDevice                                    _device,
-    uint32_t                                    fenceCount,
-    const VkFence*                              pFences,
-    VkBool32                                    waitAll,
-    uint64_t                                    _timeout)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   int ret;
-
-   if (unlikely(device->lost))
-      return VK_ERROR_DEVICE_LOST;
-
-   /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is supposed
-    * to block indefinitely timeouts <= 0.  Unfortunately, this was broken
-    * for a couple of kernel releases.  Since there's no way to know
-    * whether or not the kernel we're using is one of the broken ones, the
-    * best we can do is to clamp the timeout to INT64_MAX.  This limits the
-    * maximum timeout from 584 years to 292 years - likely not a big deal.
-    */
-   int64_t timeout = MIN2(_timeout, INT64_MAX);
-
-   VkResult result = VK_SUCCESS;
-   uint32_t pending_fences = fenceCount;
-   while (pending_fences) {
-      pending_fences = 0;
-      bool signaled_fences = false;
-      for (uint32_t i = 0; i < fenceCount; i++) {
-         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-         switch (fence->state) {
-         case ANV_FENCE_STATE_RESET:
-            /* This fence hasn't been submitted yet, we'll catch it the next
-             * time around.  Yes, this may mean we dead-loop but, short of
-             * lots of locking and a condition variable, there's not much that
-             * we can do about that.
-             */
-            pending_fences++;
-            continue;
-
-         case ANV_FENCE_STATE_SIGNALED:
-            /* This fence is not pending.  If waitAll isn't set, we can return
-             * early.  Otherwise, we have to keep going.
-             */
-            if (!waitAll) {
-               result = VK_SUCCESS;
-               goto done;
-            }
-            continue;
-
-         case ANV_FENCE_STATE_SUBMITTED:
-            /* These are the fences we really care about.  Go ahead and wait
-             * on it until we hit a timeout.
-             */
-            ret = anv_platform_wait_semaphore(fence->semaphore->current_platform_semaphore,
-                                              _timeout == UINT64_MAX ? UINT64_MAX
-                                                                     : _timeout / 1000000);
-            switch (ret) {
-            case 0:
-               fence->state = ANV_FENCE_STATE_SIGNALED;
-               signaled_fences = true;
-               if (!waitAll)
-                  goto done;
-               break;
-            case -ETIME:
-               result = VK_TIMEOUT;
-               goto done;
-            default:
-               /* We don't know the real error. */
-               device->lost = true;
-               goto done;
-            }
-         }
-      }
-
-      if (pending_fences && !signaled_fences) {
-         /* If we've hit this then someone decided to vkWaitForFences before
-          * they've actually submitted any of them to a queue.  This is a
-          * fairly pessimal case, so it's ok to lock here and use a standard
-          * pthreads condition variable.
-          */
-         pthread_mutex_lock(&device->mutex);
-
-         /* It's possible that some of the fences have changed state since the
-          * last time we checked.  Now that we have the lock, check for
-          * pending fences again and don't wait if it's changed.
-          */
-         uint32_t now_pending_fences = 0;
-         for (uint32_t i = 0; i < fenceCount; i++) {
-            ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-            if (fence->state == ANV_FENCE_STATE_RESET)
-               now_pending_fences++;
-         }
-         assert(now_pending_fences <= pending_fences);
-
-         if (now_pending_fences == pending_fences) {
-            struct timespec before;
-            clock_gettime(CLOCK_MONOTONIC, &before);
-
-            uint32_t abs_nsec = before.tv_nsec + timeout % NSEC_PER_SEC;
-            uint64_t abs_sec = before.tv_sec + (abs_nsec / NSEC_PER_SEC) +
-                               (timeout / NSEC_PER_SEC);
-            abs_nsec %= NSEC_PER_SEC;
-
-            /* Avoid roll-over in tv_sec on 32-bit systems if the user
-             * provided timeout is UINT64_MAX
-             */
-            struct timespec abstime;
-            abstime.tv_nsec = abs_nsec;
-            abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));
-
-            ret = pthread_cond_timedwait(&device->queue_submit,
-                                         &device->mutex, &abstime);
-            assert(ret != EINVAL);
-
-            struct timespec after;
-            clock_gettime(CLOCK_MONOTONIC, &after);
-            uint64_t time_elapsed =
-               ((uint64_t)after.tv_sec * NSEC_PER_SEC + after.tv_nsec) -
-               ((uint64_t)before.tv_sec * NSEC_PER_SEC + before.tv_nsec);
-
-            if (time_elapsed >= timeout) {
-               pthread_mutex_unlock(&device->mutex);
-               result = VK_TIMEOUT;
-               goto done;
-            }
-
-            timeout -= time_elapsed;
-         }
-
-         pthread_mutex_unlock(&device->mutex);
-      }
-   }
-
-done:
-   if (unlikely(device->lost))
-      return VK_ERROR_DEVICE_LOST;
-
-   return result;
-}
-
-// Queue semaphore functions
-
-VkResult anv_CreateSemaphore(VkDevice _device, const VkSemaphoreCreateInfo* pCreateInfo,
-                             const VkAllocationCallbacks* pAllocator, VkSemaphore* pSemaphore)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   struct anv_semaphore* semaphore;
-
-   anv_platform_semaphore_t platform_semaphore;
-   if (anv_platform_create_semaphore(device, &platform_semaphore) != 0)
-      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
-   semaphore = vk_alloc2(&device->alloc, pAllocator, sizeof(*semaphore), 8,
-                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-   if (!semaphore) {
-      anv_platform_destroy_semaphore(device, platform_semaphore);
-      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-   }
-
-   semaphore->original_platform_semaphore = platform_semaphore;
-   semaphore->current_platform_semaphore = platform_semaphore;
-
-   *pSemaphore = (VkSemaphore)semaphore;
-   return VK_SUCCESS;
-}
-
-void anv_DestroySemaphore(VkDevice _device, VkSemaphore vk_semaphore,
-                          const VkAllocationCallbacks* pAllocator)
-{
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_semaphore, semaphore, vk_semaphore);
-
-   if (!semaphore)
-      return;
-
-   if (semaphore->current_platform_semaphore &&
-       semaphore->current_platform_semaphore != semaphore->original_platform_semaphore)
-      anv_platform_destroy_semaphore(device, semaphore->current_platform_semaphore);
-
-   if (semaphore->original_platform_semaphore)
-      anv_platform_destroy_semaphore(device, semaphore->original_platform_semaphore);
-
-   vk_free2(&device->alloc, pAllocator, semaphore);
-}
-
-void anv_GetPhysicalDeviceExternalSemaphorePropertiesKHR(
-    VkPhysicalDevice physicalDevice,
-    const VkPhysicalDeviceExternalSemaphoreInfoKHR* pExternalSemaphoreInfo,
-    VkExternalSemaphorePropertiesKHR* pExternalSemaphoreProperties)
-{
-   pExternalSemaphoreProperties->sType = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR;
-   pExternalSemaphoreProperties->pNext = NULL;
-   pExternalSemaphoreProperties->compatibleHandleTypes = 0;
-   pExternalSemaphoreProperties->exportFromImportedHandleTypes =
-       VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR|
-       VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_FUCHSIA_FENCE_BIT_KHR;
-
-   switch (pExternalSemaphoreInfo->handleType) {
-   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR:
-   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_FUCHSIA_FENCE_BIT_KHR:
-      pExternalSemaphoreProperties->externalSemaphoreFeatures =
-          VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR |
-          VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT_KHR;
-      break;
-   default:
-      pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
-   }
-}
-
 // Event functions
 
 VkResult anv_CreateEvent(
diff --git a/src/intel/vulkan/anv_entrypoints_gen.py b/src/intel/vulkan/anv_entrypoints_gen.py
index fca5fd5..b6f95ff 100755
--- a/src/intel/vulkan/anv_entrypoints_gen.py
+++ b/src/intel/vulkan/anv_entrypoints_gen.py
@@ -38,19 +38,24 @@
 MAX_API_VERSION = 1.0
 
 SUPPORTED_EXTENSIONS = [
+    'VK_KHR_dedicated_allocation',
     'VK_KHR_descriptor_update_template',
     'VK_KHR_external_memory',
     'VK_KHR_external_memory_capabilities',
     'VK_KHR_external_memory_fd',
     'VK_KHR_external_memory_fuchsia',
+    'VK_KHR_get_memory_requirements2',
     'VK_KHR_get_physical_device_properties2',
+    'VK_KHR_get_surface_capabilities2',
     'VK_KHR_incremental_present',
     'VK_KHR_maintenance1',
     'VK_KHR_push_descriptor',
     'VK_KHR_sampler_mirror_clamp_to_edge',
     'VK_KHR_shader_draw_parameters',
+    'VK_KHR_storage_buffer_storage_class',
     'VK_KHR_surface',
     'VK_KHR_swapchain',
+    'VK_KHR_variable_pointers',
     'VK_KHR_wayland_surface',
     'VK_KHR_xcb_surface',
     'VK_KHR_xlib_surface',
@@ -59,7 +64,8 @@
     'VK_KHR_external_semaphore_capabilities',
     'VK_KHR_external_semaphore',
     'VK_KHR_external_semaphore_fd',
-    'VK_KHR_external_semaphore_fuchsia'
+    'VK_KHR_external_semaphore_fuchsia',
+    'VK_KHX_multiview'
 ]
 
 # We generate a static hash table for entry point lookup
@@ -98,6 +104,7 @@
       ${type_} gen75_${name}(${args});
       ${type_} gen8_${name}(${args});
       ${type_} gen9_${name}(${args});
+      ${type_} gen10_${name}(${args});
       % if guard is not None:
     #endif // ${guard}
       % endif
@@ -159,7 +166,7 @@
      * either pick the correct entry point.
      */
 
-    % for layer in ['anv', 'gen7', 'gen75', 'gen8', 'gen9']:
+    % for layer in ['anv', 'gen7', 'gen75', 'gen8', 'gen9', 'gen10']:
       % for type_, name, args, _, _, guard in entrypoints:
         % if guard is not None:
     #ifdef ${guard}
@@ -191,6 +198,10 @@
        assert(gen == 9 && !is_haswell);
 
        switch (gen) {
+       case 10:
+          if (gen10_layer.entrypoints[index])
+             return gen10_layer.entrypoints[index];
+          /* fall through */
        case 9:
           if (gen9_layer.entrypoints[index])
              return gen9_layer.entrypoints[index];
diff --git a/src/intel/vulkan/anv_formats.c b/src/intel/vulkan/anv_formats.c
index ca002ac..9f3eba9 100644
--- a/src/intel/vulkan/anv_formats.c
+++ b/src/intel/vulkan/anv_formats.c
@@ -23,8 +23,7 @@
 
 #include "anv_private.h"
 #include "vk_format_info.h"
-
-#include "util/vk_util.h"
+#include "vk_util.h"
 
 /*
  * gcc-4 and earlier don't allow compound literals where a constant
@@ -251,6 +250,15 @@
 
 #undef fmt
 
+static bool
+format_supported(VkFormat vk_format)
+{
+   if (vk_format >= ARRAY_SIZE(anv_formats))
+      return false;
+
+   return anv_formats[vk_format].isl_format != ISL_FORMAT_UNSUPPORTED;
+}
+
 /**
  * Exactly one bit must be set in \a aspect.
  */
@@ -258,10 +266,10 @@
 anv_get_format(const struct gen_device_info *devinfo, VkFormat vk_format,
                VkImageAspectFlags aspect, VkImageTiling tiling)
 {
-   struct anv_format format = anv_formats[vk_format];
+   if (!format_supported(vk_format))
+      return anv_formats[VK_FORMAT_UNDEFINED];
 
-   if (format.isl_format == ISL_FORMAT_UNSUPPORTED)
-      return format;
+   struct anv_format format = anv_formats[vk_format];
 
    if (aspect == VK_IMAGE_ASPECT_STENCIL_BIT) {
       assert(vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_STENCIL_BIT);
@@ -379,11 +387,6 @@
    if (format == ISL_FORMAT_R32_SINT || format == ISL_FORMAT_R32_UINT)
       flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT;
 
-   if (flags) {
-      flags |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR |
-               VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR;
-   }
-
    return flags;
 }
 
@@ -397,11 +400,12 @@
       gen += 5;
 
    VkFormatFeatureFlags linear = 0, tiled = 0, buffer = 0;
-   if (anv_formats[format].isl_format == ISL_FORMAT_UNSUPPORTED) {
+   if (!format_supported(format)) {
       /* Nothing to do here */
    } else if (vk_format_is_depth_or_stencil(format)) {
       tiled |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT;
-      if (physical_device->info.gen >= 8)
+      if (vk_format_aspects(format) == VK_IMAGE_ASPECT_DEPTH_BIT ||
+          physical_device->info.gen >= 8)
          tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;
 
       tiled |= VK_FORMAT_FEATURE_BLIT_SRC_BIT |
@@ -493,9 +497,8 @@
    uint32_t maxMipLevels;
    uint32_t maxArraySize;
    VkSampleCountFlags sampleCounts = VK_SAMPLE_COUNT_1_BIT;
-   VkImageTiling tiling = info->tiling;
-    
-   if (anv_formats[info->format].isl_format == ISL_FORMAT_UNSUPPORTED)
+
+   if (!format_supported(info->format))
       goto unsupported;
 
    anv_physical_device_get_format_properties(physical_device, info->format,
@@ -504,9 +507,9 @@
    /* Extract the VkFormatFeatureFlags that are relevant for the queried
     * tiling.
     */
-   if (tiling == VK_IMAGE_TILING_LINEAR) {
+   if (info->tiling == VK_IMAGE_TILING_LINEAR) {
       format_feature_flags = format_props.linearTilingFeatures;
-   } else if (tiling == VK_IMAGE_TILING_OPTIMAL) {
+   } else if (info->tiling == VK_IMAGE_TILING_OPTIMAL) {
       format_feature_flags = format_props.optimalTilingFeatures;
    } else {
       unreachable("bad VkImageTiling");
@@ -553,7 +556,7 @@
        goto unsupported;
    }
 
-   if (tiling == VK_IMAGE_TILING_OPTIMAL &&
+   if (info->tiling == VK_IMAGE_TILING_OPTIMAL &&
        info->type == VK_IMAGE_TYPE_2D &&
        (format_feature_flags & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
                                 VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c
index 185086f..ac47da41 100644
--- a/src/intel/vulkan/anv_gem.c
+++ b/src/intel/vulkan/anv_gem.c
@@ -48,7 +48,7 @@
  * Return gem handle, or 0 on failure. Gem handles are never 0.
  */
 uint32_t
-anv_gem_create(struct anv_device *device, size_t size)
+anv_gem_create(struct anv_device *device, uint64_t size)
 {
    struct drm_i915_gem_create gem_create = {
       .size = size,
@@ -74,7 +74,7 @@
 }
 
 /**
- * Wrapper around DRM_IOCTL_I915_GEM_MMAP.
+ * Wrapper around DRM_IOCTL_I915_GEM_MMAP. Returns MAP_FAILED on error.
  */
 void*
 anv_gem_mmap(struct anv_device *device, uint32_t gem_handle,
diff --git a/src/intel/vulkan/anv_gem_stubs.c b/src/intel/vulkan/anv_gem_stubs.c
index a63e96d..8d81eb5 100644
--- a/src/intel/vulkan/anv_gem_stubs.c
+++ b/src/intel/vulkan/anv_gem_stubs.c
@@ -34,7 +34,7 @@
 }
 
 uint32_t
-anv_gem_create(struct anv_device *device, size_t size)
+anv_gem_create(struct anv_device *device, uint64_t size)
 {
    int fd = memfd_create("fake bo", MFD_CLOEXEC);
    if (fd == -1)
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 67147b0..0b7322e 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -64,10 +64,15 @@
                      VkShaderStageFlags active_stages,
                      const unsigned entry_size[4]);
 
-void genX(cmd_buffer_gpu_memcpy)(struct anv_cmd_buffer *cmd_buffer,
-                                 struct anv_bo *dst, uint32_t dst_offset,
-                                 struct anv_bo *src, uint32_t src_offset,
-                                 uint32_t size);
+void genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
+                                struct anv_bo *dst, uint32_t dst_offset,
+                                struct anv_bo *src, uint32_t src_offset,
+                                uint32_t size);
+
+void genX(cmd_buffer_mi_memcpy)(struct anv_cmd_buffer *cmd_buffer,
+                                struct anv_bo *dst, uint32_t dst_offset,
+                                struct anv_bo *src, uint32_t src_offset,
+                                uint32_t size);
 
 void genX(blorp_exec)(struct blorp_batch *batch,
                       const struct blorp_params *params);
diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c
index bd9f76e..51f432c 100644
--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -37,7 +37,8 @@
  * Exactly one bit must be set in \a aspect.
  */
 static isl_surf_usage_flags_t
-choose_isl_surf_usage(VkImageUsageFlags vk_usage,
+choose_isl_surf_usage(VkImageCreateFlags vk_create_flags,
+                      VkImageUsageFlags vk_usage,
                       VkImageAspectFlags aspect)
 {
    isl_surf_usage_flags_t isl_usage = 0;
@@ -51,7 +52,7 @@
    if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
       isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
 
-   if (vk_usage & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)
+   if (vk_create_flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)
       isl_usage |= ISL_SURF_USAGE_CUBE_BIT;
 
    /* Even if we're only using it for transfer operations, clears to depth and
@@ -116,6 +117,82 @@
 }
 
 /**
+ * For color images that have an auxiliary surface, request allocation for an
+ * additional buffer that mainly stores fast-clear values. Use of this buffer
+ * allows us to access the image's subresources while being aware of their
+ * fast-clear values in non-trivial cases (e.g., outside of a render pass in
+ * which a fast clear has occurred).
+ *
+ * For the purpose of discoverability, the algorithm used to manage this buffer
+ * is described here. A clear value in this buffer is updated when a fast clear
+ * is performed on a subresource. One of two synchronization operations is
+ * performed in order for a following memory access to use the fast-clear
+ * value:
+ *    a) Copy the value from the buffer to the surface state object used for
+ *       reading. This is done implicitly when the value is the clear value
+ *       predetermined to be the default in other surface state objects. This
+ *       is currently only done explicitly for operation b) below.
+ *    b) Do a) and use the surface state object to resolve the subresource.
+ *       This is only done during layout transitions for decent performance.
+ *
+ * With the above scheme, we can fast-clear whenever the hardware allows except
+ * for two cases in which synchronization becomes impossible or undesirable:
+ *    * The subresource is in the GENERAL layout and is cleared to a value
+ *      other than the special default value.
+ *
+ *      Performing a synchronization operation in order to read from the
+ *      subresource is undesirable in this case. Firstly, b) is not an option
+ *      because a layout transition isn't required between a write and read of
+ *      an image in the GENERAL layout. Secondly, it's undesirable to do a)
+ *      explicitly because it would require large infrastructural changes. The
+ *      Vulkan API supports us in deciding not to optimize this layout by
+ *      stating that using this layout may cause suboptimal performance. NOTE:
+ *      the auxiliary buffer must always be enabled to support a) implicitly.
+ *
+ *
+ *    * For the given miplevel, only some of the layers are cleared at once.
+ *
+ *      If the user clears each layer to a different value, then tries to
+ *      render to multiple layers at once, we have no ability to perform a
+ *      synchronization operation in between. a) is not helpful because the
+ *      object can only hold one clear value. b) is not an option because a
+ *      layout transition isn't required in this case.
+ */
+static void
+add_fast_clear_state_buffer(struct anv_image *image,
+                            const struct anv_device *device)
+{
+   assert(image && device);
+   assert(image->aux_surface.isl.size > 0 &&
+          image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+
+   /* The offset to the buffer of clear values must be dword-aligned for GPU
+    * memcpy operations. It is located immediately after the auxiliary surface.
+    */
+
+   /* Tiled images are guaranteed to be 4K aligned, so the image alignment
+    * should also be dword-aligned.
+    */
+   assert(image->alignment % 4 == 0);
+
+   /* Auxiliary buffers should be a multiple of 4K, so the start of the clear
+    * values buffer should already be dword-aligned.
+    */
+   assert(image->aux_surface.isl.size % 4 == 0);
+
+   /* This buffer should be at the very end of the image. */
+   assert(image->size ==
+          image->aux_surface.offset + image->aux_surface.isl.size);
+
+   const unsigned entry_size = anv_fast_clear_state_entry_size(device);
+   /* There's no padding between entries, so ensure that they're always a
+    * multiple of 32 bits in order to enable GPU memcpy operations.
+    */
+   assert(entry_size % 4 == 0);
+   image->size += entry_size * anv_image_aux_levels(image);
+}
+
+/**
  * Initialize the anv_image::*_surface selected by \a aspect. Then update the
  * image's memory requirements (that is, the image's size and alignment).
  *
@@ -179,7 +256,7 @@
       .samples = vk_info->samples,
       .min_alignment = 0,
       .row_pitch = anv_info->stride,
-      .usage = choose_isl_surf_usage(image->usage, aspect),
+      .usage = choose_isl_surf_usage(vk_info->flags, image->usage, aspect),
       .tiling_flags = tiling_flags);
 
    /* isl_surf_init() will fail only if provided invalid input. Invalid input
@@ -222,9 +299,25 @@
       if (!unlikely(INTEL_DEBUG & DEBUG_NO_RBC)) {
          assert(image->aux_surface.isl.size == 0);
          ok = isl_surf_get_ccs_surf(&dev->isl_dev, &anv_surf->isl,
-                                    &image->aux_surface.isl);
+                                    &image->aux_surface.isl, 0);
          if (ok) {
+
+            /* Disable CCS when it is not useful (i.e., when you can't render
+             * to the image with CCS enabled).
+             */
+            if (!isl_format_supports_rendering(&dev->info, format)) {
+               /* While it may be technically possible to enable CCS for this
+                * image, we currently don't have things hooked up to get it
+                * working.
+                */
+               anv_perf_warn("This image format doesn't support rendering. "
+                             "Not allocating an CCS buffer.");
+               image->aux_surface.isl.size = 0;
+               return VK_SUCCESS;
+            }
+
             add_surface(image, &image->aux_surface);
+            add_fast_clear_state_buffer(image, dev);
 
             /* For images created without MUTABLE_FORMAT_BIT set, we know that
              * they will always be used with the original format.  In
@@ -248,6 +341,7 @@
                                  &image->aux_surface.isl);
       if (ok) {
          add_surface(image, &image->aux_surface);
+         add_fast_clear_state_buffer(image, dev);
          image->aux_usage = ISL_AUX_USAGE_MCS;
       }
    }
@@ -275,12 +369,11 @@
    anv_assert(pCreateInfo->extent.height > 0);
    anv_assert(pCreateInfo->extent.depth > 0);
 
-   image = vk_alloc2(&device->alloc, alloc, sizeof(*image), 8,
-                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   image = vk_zalloc2(&device->alloc, alloc, sizeof(*image), 8,
+                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (!image)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
-   memset(image, 0, sizeof(*image));
    image->type = pCreateInfo->imageType;
    image->extent = pCreateInfo->extent;
    image->vk_format = pCreateInfo->format;
@@ -579,7 +672,7 @@
    ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
    struct anv_image_view *iview;
 
-   iview = vk_alloc2(&device->alloc, pAllocator, sizeof(*iview), 8,
+   iview = vk_zalloc2(&device->alloc, pAllocator, sizeof(*iview), 8,
                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (iview == NULL)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -591,7 +684,8 @@
    assert(image->usage & (VK_IMAGE_USAGE_SAMPLED_BIT |
                           VK_IMAGE_USAGE_STORAGE_BIT |
                           VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
-                          VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT|
+                          VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT |
+                          VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT |
                           VK_IMAGE_USAGE_SCANOUT_BIT_GOOGLE));
 
    switch (image->type) {
@@ -665,54 +759,47 @@
    if (image->usage & VK_IMAGE_USAGE_SAMPLED_BIT ||
        (image->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT &&
         !(iview->aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT))) {
-      iview->sampler_surface_state = alloc_surface_state(device);
-      iview->no_aux_sampler_surface_state = alloc_surface_state(device);
+      iview->optimal_sampler_surface_state = alloc_surface_state(device);
+      iview->general_sampler_surface_state = alloc_surface_state(device);
 
-      /* Sampling is performed in one of two buffer configurations in anv: with
-       * an auxiliary buffer or without it. Sampler states aren't always needed
-       * for both configurations, but are currently created unconditionally for
-       * simplicity.
-       *
-       * TODO: Consider allocating each surface state only when necessary.
-       */
-
-      /* Create a sampler state with the optimal aux_usage for sampling. This
-       * may use the aux_buffer.
-       */
-      const enum isl_aux_usage surf_usage =
+      iview->general_sampler_aux_usage =
+         anv_layout_to_aux_usage(&device->info, image, iview->aspect_mask,
+                                 VK_IMAGE_LAYOUT_GENERAL);
+      iview->optimal_sampler_aux_usage =
          anv_layout_to_aux_usage(&device->info, image, iview->aspect_mask,
                                  VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
 
       /* If this is a HiZ buffer we can sample from with a programmable clear
        * value (SKL+), define the clear value to the optimal constant.
        */
-      const float red_clear_color = surf_usage == ISL_AUX_USAGE_HIZ &&
-                                    device->info.gen >= 9 ?
-                                    ANV_HZ_FC_VAL : 0.0f;
+      union isl_color_value clear_color = { .u32 = { 0, } };
+      if ((iview->aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) &&
+          device->info.gen >= 9)
+         clear_color.f32[0] = ANV_HZ_FC_VAL;
 
       struct isl_view view = iview->isl;
       view.usage |= ISL_SURF_USAGE_TEXTURE_BIT;
+
       isl_surf_fill_state(&device->isl_dev,
-                          iview->sampler_surface_state.map,
+                          iview->optimal_sampler_surface_state.map,
                           .surf = &surface->isl,
                           .view = &view,
-                          .clear_color.f32 = { red_clear_color,},
+                          .clear_color = clear_color,
                           .aux_surf = &image->aux_surface.isl,
-                          .aux_usage = surf_usage,
+                          .aux_usage = iview->optimal_sampler_aux_usage,
                           .mocs = device->default_mocs);
 
-      /* Create a sampler state that only uses the main buffer. */
       isl_surf_fill_state(&device->isl_dev,
-                          iview->no_aux_sampler_surface_state.map,
+                          iview->general_sampler_surface_state.map,
                           .surf = &surface->isl,
                           .view = &view,
+                          .clear_color = clear_color,
+                          .aux_surf = &image->aux_surface.isl,
+                          .aux_usage = iview->general_sampler_aux_usage,
                           .mocs = device->default_mocs);
 
-      anv_state_flush(device, iview->sampler_surface_state);
-      anv_state_flush(device, iview->no_aux_sampler_surface_state);
-   } else {
-      iview->sampler_surface_state.alloc_size = 0;
-      iview->no_aux_sampler_surface_state.alloc_size = 0;
+      anv_state_flush(device, iview->optimal_sampler_surface_state);
+      anv_state_flush(device, iview->general_sampler_surface_state);
    }
 
    /* NOTE: This one needs to go last since it may stomp isl_view.format */
@@ -763,9 +850,6 @@
 
       anv_state_flush(device, iview->storage_surface_state);
       anv_state_flush(device, iview->writeonly_storage_surface_state);
-   } else {
-      iview->storage_surface_state.alloc_size = 0;
-      iview->writeonly_storage_surface_state.alloc_size = 0;
    }
 
    *pView = anv_image_view_to_handle(iview);
@@ -783,14 +867,14 @@
    if (!iview)
       return;
 
-   if (iview->sampler_surface_state.alloc_size > 0) {
+   if (iview->optimal_sampler_surface_state.alloc_size > 0) {
       anv_state_pool_free(&device->surface_state_pool,
-                          iview->sampler_surface_state);
+                          iview->optimal_sampler_surface_state);
    }
 
-   if (iview->no_aux_sampler_surface_state.alloc_size > 0) {
+   if (iview->general_sampler_surface_state.alloc_size > 0) {
       anv_state_pool_free(&device->surface_state_pool,
-                          iview->no_aux_sampler_surface_state);
+                          iview->general_sampler_surface_state);
    }
 
    if (iview->storage_surface_state.alloc_size > 0) {
diff --git a/src/intel/vulkan/anv_magma.cc b/src/intel/vulkan/anv_magma.cc
index d31857d..886a82b 100644
--- a/src/intel/vulkan/anv_magma.cc
+++ b/src/intel/vulkan/anv_magma.cc
@@ -114,7 +114,7 @@
 /**
  * Returns 0, 1, or negative to indicate error
  */
-int anv_gem_busy(anv_device *device, anv_buffer_handle_t handle)
+int anv_gem_busy(anv_device* device, anv_buffer_handle_t handle)
 {
    // Magma doesn't have a means to poll buffer busy.
    // Upper layers should be changed to check semaphore signal status.
@@ -128,7 +128,7 @@
    return false;
 }
 
-int anv_gem_get_context_param(int fd, int context, uint32_t param, uint64_t *value)
+int anv_gem_get_context_param(int fd, int context, uint32_t param, uint64_t* value)
 {
    if (param == I915_CONTEXT_PARAM_GTT_SIZE) {
       // TODO(MA-311) - query for this
@@ -155,7 +155,8 @@
    magma_status_t status =
        magma_create_command_buffer(magma_connection(device), required_size, &cmd_buf_id);
    if (status != MAGMA_STATUS_OK)
-      return DRET_MSG(-1, "magma_alloc_command_buffer failed size 0x%" PRIx64 " : %d", required_size, status);
+      return DRET_MSG(-1, "magma_alloc_command_buffer failed size 0x%" PRIx64 " : %d",
+                      required_size, status);
 
    void* cmd_buf_data;
    status = magma_map(magma_connection(device), cmd_buf_id, &cmd_buf_data);
@@ -167,13 +168,13 @@
    std::vector<uint64_t> wait_semaphore_ids(wait_semaphore_count);
    for (uint32_t i = 0; i < wait_semaphore_count; i++) {
       wait_semaphore_ids[i] = magma_get_semaphore_id(
-          reinterpret_cast<magma_semaphore_t>(wait_semaphores[i]->current_platform_semaphore));
+          reinterpret_cast<magma_semaphore_t>(wait_semaphores[i]->current->platform_semaphore));
    }
 
    std::vector<uint64_t> signal_semaphore_ids(signal_semaphore_count);
    for (uint32_t i = 0; i < signal_semaphore_count; i++) {
       signal_semaphore_ids[i] = magma_get_semaphore_id(
-          reinterpret_cast<magma_semaphore_t>(signal_semaphores[i]->current_platform_semaphore));
+          reinterpret_cast<magma_semaphore_t>(signal_semaphores[i]->current->platform_semaphore));
    }
 
    if (!DrmCommandBuffer::Translate(execbuf, std::move(wait_semaphore_ids),
@@ -281,8 +282,7 @@
    return buffer;
 }
 
-int anv_gem_gpu_get_reset_stats(struct anv_device *device,
-                            uint32_t *active, uint32_t *pending)
+int anv_gem_gpu_get_reset_stats(struct anv_device* device, uint32_t* active, uint32_t* pending)
 {
    DLOG("anv_gem_gpu_get_reset_stats - STUB");
    *active = 0;
@@ -339,14 +339,18 @@
    struct anv_semaphore* semaphore = (struct anv_semaphore*)info->semaphore;
    assert(semaphore);
 
-   if (semaphore->current_platform_semaphore != semaphore->original_platform_semaphore)
-      anv_platform_destroy_semaphore(device, semaphore->current_platform_semaphore);
+   if (semaphore->temporary.platform_semaphore) {
+      anv_platform_destroy_semaphore(device, semaphore->temporary.platform_semaphore);
+      semaphore->temporary.platform_semaphore = 0;
+   }
 
-   semaphore->current_platform_semaphore = imported_semaphore;
-
-   if (permanent && semaphore->original_platform_semaphore) {
-      anv_platform_destroy_semaphore(device, semaphore->original_platform_semaphore);
-      semaphore->original_platform_semaphore = 0;
+   if (permanent) {
+      anv_platform_destroy_semaphore(device, semaphore->permanent.platform_semaphore);
+      semaphore->permanent.platform_semaphore = imported_semaphore;
+      semaphore->current = &semaphore->permanent;
+   } else {
+      semaphore->temporary.platform_semaphore = imported_semaphore;
+      semaphore->current = &semaphore->temporary;
    }
 
    return VK_SUCCESS;
@@ -363,7 +367,7 @@
       return VK_SUCCESS;
 
    anv_platform_semaphore_t semaphore =
-       ((struct anv_semaphore*)info->semaphore)->current_platform_semaphore;
+       ((struct anv_semaphore*)info->semaphore)->current->platform_semaphore;
 
    uint32_t handle;
    if (anv_platform_export_semaphore(device, semaphore, &handle) != 0)
diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h
index a929dd9..5b450b4 100644
--- a/src/intel/vulkan/anv_nir.h
+++ b/src/intel/vulkan/anv_nir.h
@@ -35,9 +35,8 @@
 
 void anv_nir_lower_push_constants(nir_shader *shader);
 
-void anv_nir_apply_dynamic_offsets(struct anv_pipeline *pipeline,
-                                   nir_shader *shader,
-                                   struct brw_stage_prog_data *prog_data);
+bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask);
+
 void anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline,
                                    nir_shader *shader,
                                    struct brw_stage_prog_data *prog_data,
diff --git a/src/intel/vulkan/anv_nir_lower_multiview.c b/src/intel/vulkan/anv_nir_lower_multiview.c
new file mode 100644
index 0000000..f40e111
--- /dev/null
+++ b/src/intel/vulkan/anv_nir_lower_multiview.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_nir.h"
+#include "nir/nir_builder.h"
+
+/**
+ * This file implements the lowering required for VK_KHR_multiview.  We
+ * implement multiview using instanced rendering.  The number of instances in
+ * each draw call is multiplied by the number of views in the subpass.  Then,
+ * in the shader, gl_InstanceId / view_count gives the instance id the shader
+ * asked for and gl_InstanceId % view_count is used to compute the ViewIndex.
+ */
+
+struct lower_multiview_state {
+   nir_builder builder;
+   /* Subpass view mask; each set bit is the index of an enabled view. */
+   uint32_t view_mask;
+   /* Lowered values, built on first request and reused; NULL until then. */
+   nir_ssa_def *instance_id;
+   nir_ssa_def *view_index;
+};
+
+static nir_ssa_def *
+build_instance_id(struct lower_multiview_state *state)
+{
+   nir_builder *b = &state->builder;
+
+   assert(b->shader->stage == MESA_SHADER_VERTEX);
+
+   /* Already emitted by an earlier request: hand back the cached value. */
+   if (state->instance_id != NULL)
+      return state->instance_id;
+
+   b->cursor = nir_before_block(nir_start_block(b->impl));
+
+   /* Multiview is drawn as extra instances, so the instance id the shader
+    * asked for is the raw instance id divided by the number of views.
+    */
+   nir_ssa_def *view_count = nir_imm_int(b, _mesa_bitcount(state->view_mask));
+   state->instance_id = nir_idiv(b, nir_load_instance_id(b), view_count);
+
+   return state->instance_id;
+}
+
+static nir_ssa_def *
+build_view_index(struct lower_multiview_state *state)
+{
+   if (state->view_index == NULL) {
+      nir_builder *b = &state->builder;
+
+      /* First request: emit the view index at the top of the start block. */
+      b->cursor = nir_before_block(nir_start_block(b->impl));
+      assert(state->view_mask != 0);
+      if (_mesa_bitcount(state->view_mask) == 1) {
+         /* Single view: the index is a constant, the position of its bit. */
+         state->view_index = nir_imm_int(b, ffs(state->view_mask) - 1);
+      } else if (state->builder.shader->stage == MESA_SHADER_VERTEX) {
+         /* The nibble table built below can only encode view indices 0-15. */
+         assert((state->view_mask & 0xffff0000) == 0);
+
+         /* With instancing, instance_id % view_count is the compacted view
+          * id, which then has to be converted to an actual view id.
+          */
+         nir_ssa_def *compacted =
+            nir_umod(b, nir_load_instance_id(b),
+                        nir_imm_int(b, _mesa_bitcount(state->view_mask)));
+
+         if (util_is_power_of_two(state->view_mask + 1)) {
+            /* Views 0..n-1 are all enabled, so compacted is the view index. */
+            state->view_index = compacted;
+         } else {
+            /* Now we define a map from compacted view index to the actual
+             * view index that's based on the view_mask.  The map is given by
+             * 16 nibbles, each of which is a value from 0 to 15.
+             */
+            uint64_t remap = 0;
+            uint32_t bit, i = 0;
+            for_each_bit(bit, state->view_mask) {
+               assert(bit < 16);
+               remap |= (uint64_t)bit << (i++ * 4);
+            }
+
+            nir_ssa_def *shift = nir_imul(b, compacted, nir_imm_int(b, 4));
+
+            /* The table may need more than 32 bits while the shifts below
+             * are 32-bit, so a wide table is shifted in two halves.
+             */
+            nir_ssa_def *shifted;
+            if (remap <= UINT32_MAX) {
+               shifted = nir_ushr(b, nir_imm_int(b, remap), shift);
+            } else {
+               nir_ssa_def *shifted_low =
+                  nir_ushr(b, nir_imm_int(b, remap), shift);
+               nir_ssa_def *shifted_high =
+                  nir_ushr(b, nir_imm_int(b, remap >> 32),
+                              nir_isub(b, shift, nir_imm_int(b, 32)));
+               shifted = nir_bcsel(b, nir_ilt(b, shift, nir_imm_int(b, 32)),
+                                      shifted_low, shifted_high);
+            }
+            state->view_index = nir_iand(b, shifted, nir_imm_int(b, 0xf));
+         }
+      } else {
+         const struct glsl_type *type = glsl_int_type();
+         if (b->shader->stage == MESA_SHADER_TESS_CTRL ||
+             b->shader->stage == MESA_SHADER_GEOMETRY)
+            type = glsl_array_type(type, 1);
+         /* Non-vertex stages read the view index from an input variable. */
+         nir_variable *idx_var =
+            nir_variable_create(b->shader, nir_var_shader_in,
+                                type, "view index");
+         idx_var->data.location = VARYING_SLOT_VIEW_INDEX;
+         if (b->shader->stage == MESA_SHADER_FRAGMENT)
+            idx_var->data.interpolation = INTERP_MODE_FLAT;
+
+         if (glsl_type_is_array(type)) {
+            nir_deref_var *deref = nir_deref_var_create(b->shader, idx_var);
+            nir_deref_array *arr = nir_deref_array_create(b->shader);
+            arr->deref.type = glsl_int_type();
+            arr->deref_array_type = nir_deref_array_type_direct;
+            arr->base_offset = 0;
+            deref->deref.child = &arr->deref;
+
+            state->view_index = nir_load_deref_var(b, deref);
+         } else {
+            state->view_index = nir_load_var(b, idx_var);
+         }
+      }
+   }
+
+   return state->view_index;
+}
+
+/**
+ * Lowers multiview for one shader: instance-id and view-index loads in the
+ * entrypoint are replaced and, except in fragment shaders, the view index
+ * is stored to the view-index and layer outputs.  Returns true when the
+ * shader was modified; a zero view_mask leaves it untouched.
+ */
+bool
+anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask)
+{
+   assert(shader->stage != MESA_SHADER_COMPUTE);
+
+   /* A zero view mask means there is nothing to lower. */
+   if (view_mask == 0)
+      return false;
+
+   /* Only the entrypoint is walked. */
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+
+   struct lower_multiview_state state = {
+      .view_mask = view_mask,
+   };
+   nir_builder_init(&state.builder, impl);
+   nir_builder *b = &state.builder;
+
+   /* Swap each instance-id / view-index load for the cached lowered value. */
+   bool progress = false;
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         const bool wants_instance_id =
+            intrin->intrinsic == nir_intrinsic_load_instance_id;
+         const bool wants_view_index =
+            intrin->intrinsic == nir_intrinsic_load_view_index;
+         if (!wants_instance_id && !wants_view_index)
+            continue;
+
+         assert(intrin->dest.is_ssa);
+         nir_ssa_def *lowered = wants_instance_id ?
+            build_instance_id(&state) : build_view_index(&state);
+
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(lowered));
+         nir_instr_remove(instr);
+         progress = true;
+      }
+   }
+
+   /* Only the vertex shader can derive the view index from the instance id,
+    * so each stage before the fragment shader hands it on as an output.
+    */
+   if (shader->stage != MESA_SHADER_FRAGMENT) {
+      nir_ssa_def *view = build_view_index(&state);
+
+      /* The stores are placed right after the instruction defining it. */
+      assert(view->parent_instr->block == nir_start_block(impl));
+      b->cursor = nir_after_instr(view->parent_instr);
+
+      nir_variable *view_out =
+         nir_variable_create(shader, nir_var_shader_out,
+                             glsl_int_type(), "view index");
+      view_out->data.location = VARYING_SLOT_VIEW_INDEX;
+      nir_store_var(b, view_out, view, 0x1);
+
+      nir_variable *layer_out =
+         nir_variable_create(shader, nir_var_shader_out,
+                             glsl_int_type(), "layer ID");
+      layer_out->data.location = VARYING_SLOT_LAYER;
+      nir_store_var(b, layer_out, view, 0x1);
+
+      progress = true;
+   }
+
+   if (!progress)
+      return false;
+
+   nir_metadata_preserve(impl, nir_metadata_block_index |
+                               nir_metadata_dominance);
+   return true;
+}
diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c
index dcd9aaf..a77e52b 100644
--- a/src/intel/vulkan/anv_pass.c
+++ b/src/intel/vulkan/anv_pass.c
@@ -23,6 +23,8 @@
 
 #include "anv_private.h"
 
+#include "vk_util.h"
+
 static unsigned
 num_subpass_attachments(const VkSubpassDescription *desc)
 {
@@ -32,6 +34,16 @@
           (desc->pDepthStencilAttachment != NULL);
 }
 
+static void
+init_first_subpass_layout(struct anv_render_pass_attachment * const att,
+                          const VkAttachmentReference att_ref)
+{
+   if (att->first_subpass_layout != VK_IMAGE_LAYOUT_UNDEFINED)
+      return; /* an earlier attachment reference already recorded it */
+   att->first_subpass_layout = att_ref.layout;
+   assert(att->first_subpass_layout != VK_IMAGE_LAYOUT_UNDEFINED);
+}
+
 VkResult anv_CreateRenderPass(
     VkDevice                                    _device,
     const VkRenderPassCreateInfo*               pCreateInfo,
@@ -61,10 +73,6 @@
    }
    anv_multialloc_add(&ma, &subpass_attachments, subpass_attachment_count);
 
-   enum anv_subpass_usage *subpass_usages;
-   anv_multialloc_add(&ma, &subpass_usages,
-                      pCreateInfo->subpassCount * pCreateInfo->attachmentCount);
-
    if (!anv_multialloc_alloc2(&ma, &device->alloc, pAllocator,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -89,8 +97,7 @@
       att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp;
       att->initial_layout = pCreateInfo->pAttachments[i].initialLayout;
       att->final_layout = pCreateInfo->pAttachments[i].finalLayout;
-      att->subpass_usage = subpass_usages;
-      subpass_usages += pass->subpass_count;
+      att->first_subpass_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    }
 
    bool has_color = false, has_depth = false, has_input = false;
@@ -102,6 +109,7 @@
       subpass->color_count = desc->colorAttachmentCount;
       subpass->attachment_count = num_subpass_attachments(desc);
       subpass->attachments = subpass_attachments;
+      subpass->view_mask = 0;
 
       if (desc->inputAttachmentCount > 0) {
          subpass->input_attachments = subpass_attachments;
@@ -113,9 +121,10 @@
             if (a != VK_ATTACHMENT_UNUSED) {
                has_input = true;
                pass->attachments[a].usage |= VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT;
-               pass->attachments[a].subpass_usage[i] |= ANV_SUBPASS_USAGE_INPUT;
                pass->attachments[a].last_subpass_idx = i;
 
+               init_first_subpass_layout(&pass->attachments[a],
+                                         desc->pInputAttachments[j]);
                if (desc->pDepthStencilAttachment &&
                    a == desc->pDepthStencilAttachment->attachment)
                   subpass->has_ds_self_dep = true;
@@ -133,8 +142,10 @@
             if (a != VK_ATTACHMENT_UNUSED) {
                has_color = true;
                pass->attachments[a].usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
-               pass->attachments[a].subpass_usage[i] |= ANV_SUBPASS_USAGE_DRAW;
                pass->attachments[a].last_subpass_idx = i;
+
+               init_first_subpass_layout(&pass->attachments[a],
+                                         desc->pColorAttachments[j]);
             }
          }
       }
@@ -153,12 +164,10 @@
                pass->attachments[color_att].usage |=
                   VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
                pass->attachments[a].usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
-
-               pass->attachments[color_att].subpass_usage[i] |=
-                  ANV_SUBPASS_USAGE_RESOLVE_SRC;
-               pass->attachments[a].subpass_usage[i] |=
-                  ANV_SUBPASS_USAGE_RESOLVE_DST;
                pass->attachments[a].last_subpass_idx = i;
+
+               init_first_subpass_layout(&pass->attachments[a],
+                                         desc->pResolveAttachments[j]);
             }
          }
       }
@@ -171,8 +180,10 @@
             has_depth = true;
             pass->attachments[a].usage |=
                VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
-            pass->attachments[a].subpass_usage[i] |= ANV_SUBPASS_USAGE_DRAW;
             pass->attachments[a].last_subpass_idx = i;
+
+            init_first_subpass_layout(&pass->attachments[a],
+                                      *desc->pDepthStencilAttachment);
          }
       } else {
          subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED;
@@ -261,6 +272,22 @@
          ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
    }
 
+   vk_foreach_struct(ext, pCreateInfo->pNext) {
+      switch (ext->sType) {
+      case VK_STRUCTURE_TYPE_RENDER_PASS_MULTIVIEW_CREATE_INFO_KHX: {
+         VkRenderPassMultiviewCreateInfoKHX *mv = (void *)ext;
+
+         for (uint32_t i = 0; i < mv->subpassCount; i++) {
+            pass->subpasses[i].view_mask = mv->pViewMasks[i];
+         }
+         break;
+      }
+
+      default:
+         anv_debug_ignored_stype(ext->sType);
+      }
+   }
+
    *pRenderPass = anv_render_pass_to_handle(pass);
 
    return VK_SUCCESS;
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 9d0dc69..8c4daa0 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -128,6 +128,8 @@
       .tessellation = true,
       .draw_parameters = true,
       .image_write_without_format = true,
+      .multiview = true,
+      .variable_pointers = true,
    };
 
    nir_function *entry_point =
@@ -169,20 +171,19 @@
    NIR_PASS_V(nir, nir_propagate_invariant);
    NIR_PASS_V(nir, nir_lower_io_to_temporaries,
               entry_point->impl, true, false);
-   NIR_PASS_V(nir, nir_lower_system_values);
 
    /* Vulkan uses the separate-shader linking model */
-   nir->info->separate_shader = true;
+   nir->info.separate_shader = true;
 
    nir = brw_preprocess_nir(compiler, nir);
 
+   NIR_PASS_V(nir, nir_lower_system_values);
+
    NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
 
    if (stage == MESA_SHADER_FRAGMENT)
       NIR_PASS_V(nir, anv_nir_lower_input_attachments);
 
-   nir_shader_gather_info(nir, entry_point->impl);
-
    return nir;
 }
 
@@ -281,7 +282,6 @@
                      struct brw_wm_prog_key *key)
 {
    const struct gen_device_info *devinfo = &pipeline->device->info;
-   ANV_FROM_HANDLE(anv_render_pass, render_pass, info->renderPass);
 
    memset(key, 0, sizeof(*key));
 
@@ -299,8 +299,7 @@
    /* XXX Vulkan doesn't appear to specify */
    key->clamp_fragment_color = false;
 
-   key->nr_color_regions =
-      render_pass->subpasses[info->subpass].color_count;
+   key->nr_color_regions = pipeline->subpass->color_count;
 
    key->replicate_alpha = key->nr_color_regions > 1 &&
                           info->pMultisampleState &&
@@ -331,6 +330,38 @@
    populate_sampler_prog_key(devinfo, &key->tex);
 }
 
+static void
+anv_pipeline_hash_shader(struct anv_pipeline *pipeline,
+                         struct anv_shader_module *module,
+                         const char *entrypoint,
+                         gl_shader_stage stage,
+                         const VkSpecializationInfo *spec_info,
+                         const void *key, size_t key_size,
+                         unsigned char *sha1_out)
+{
+   const struct anv_pipeline_layout *layout = pipeline->layout;
+   struct mesa_sha1 sha;
+
+   _mesa_sha1_init(&sha);  /* the feed order below defines the hash */
+   if (stage != MESA_SHADER_COMPUTE) {
+      const struct anv_subpass *subpass = pipeline->subpass;
+      _mesa_sha1_update(&sha, &subpass->view_mask, sizeof(subpass->view_mask));
+   }
+   if (layout != NULL)
+      _mesa_sha1_update(&sha, layout->sha1, sizeof(layout->sha1));
+   _mesa_sha1_update(&sha, module->sha1, sizeof(module->sha1));
+   _mesa_sha1_update(&sha, entrypoint, strlen(entrypoint));
+   _mesa_sha1_update(&sha, &stage, sizeof(stage));
+   if (spec_info != NULL) {
+      const size_t map_bytes =
+         spec_info->mapEntryCount * sizeof(spec_info->pMapEntries[0]);
+      _mesa_sha1_update(&sha, spec_info->pMapEntries, map_bytes);
+      _mesa_sha1_update(&sha, spec_info->pData, spec_info->dataSize);
+   }
+   _mesa_sha1_update(&sha, key, key_size);
+   _mesa_sha1_final(&sha, sha1_out);
+}
+
 static nir_shader *
 anv_pipeline_compile(struct anv_pipeline *pipeline,
                      struct anv_shader_module *module,
@@ -348,6 +379,11 @@
 
    NIR_PASS_V(nir, anv_nir_lower_push_constants);
 
+   if (stage != MESA_SHADER_COMPUTE)
+      NIR_PASS_V(nir, anv_nir_lower_multiview, pipeline->subpass->view_mask);
+
+   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+
    /* Figure out the number of parameters */
    prog_data->nr_params = 0;
 
@@ -359,8 +395,8 @@
       prog_data->nr_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float);
    }
 
-   if (nir->info->num_images > 0) {
-      prog_data->nr_params += nir->info->num_images * BRW_IMAGE_PARAM_SIZE;
+   if (nir->info.num_images > 0) {
+      prog_data->nr_params += nir->info.num_images * BRW_IMAGE_PARAM_SIZE;
       pipeline->needs_data_cache = true;
    }
 
@@ -368,7 +404,7 @@
       ((struct brw_cs_prog_data *)prog_data)->thread_local_id_index =
          prog_data->nr_params++; /* The CS Thread ID uniform */
 
-   if (nir->info->num_ssbos > 0)
+   if (nir->info.num_ssbos > 0)
       pipeline->needs_data_cache = true;
 
    if (prog_data->nr_params > 0) {
@@ -463,8 +499,9 @@
    populate_vs_prog_key(&pipeline->device->info, &key);
 
    if (cache) {
-      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint,
-                      pipeline->layout, spec_info);
+      anv_pipeline_hash_shader(pipeline, module, entrypoint,
+                               MESA_SHADER_VERTEX, spec_info,
+                               &key, sizeof(key), sha1);
       bin = anv_pipeline_cache_search(cache, sha1, 20);
    }
 
@@ -490,13 +527,10 @@
 
       ralloc_steal(mem_ctx, nir);
 
-      prog_data.inputs_read = nir->info->inputs_read;
-      prog_data.double_inputs_read = nir->info->double_inputs_read;
-
       brw_compute_vue_map(&pipeline->device->info,
                           &prog_data.base.vue_map,
-                          nir->info->outputs_written,
-                          nir->info->separate_shader);
+                          nir->info.outputs_written,
+                          nir->info.separate_shader);
 
       unsigned code_size;
       const unsigned *shader_code =
@@ -555,6 +589,10 @@
           tcs_info->tess.spacing == tes_info->tess.spacing);
    tes_info->tess.spacing |= tcs_info->tess.spacing;
 
+   assert(tcs_info->tess.primitive_mode == 0 ||
+          tes_info->tess.primitive_mode == 0 ||
+          tcs_info->tess.primitive_mode == tes_info->tess.primitive_mode);
+   tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode;
    tes_info->tess.ccw |= tcs_info->tess.ccw;
    tes_info->tess.point_mode |= tcs_info->tess.point_mode;
 }
@@ -587,10 +625,12 @@
    tcs_key.input_vertices = info->pTessellationState->patchControlPoints;
 
    if (cache) {
-      anv_hash_shader(tcs_sha1, &tcs_key, sizeof(tcs_key), tcs_module,
-                      tcs_entrypoint, pipeline->layout, tcs_spec_info);
-      anv_hash_shader(tes_sha1, &tes_key, sizeof(tes_key), tes_module,
-                      tes_entrypoint, pipeline->layout, tes_spec_info);
+      anv_pipeline_hash_shader(pipeline, tcs_module, tcs_entrypoint,
+                               MESA_SHADER_TESS_CTRL, tcs_spec_info,
+                               &tcs_key, sizeof(tcs_key), tcs_sha1);
+      anv_pipeline_hash_shader(pipeline, tes_module, tes_entrypoint,
+                               MESA_SHADER_TESS_EVAL, tes_spec_info,
+                               &tes_key, sizeof(tes_key), tes_sha1);
       memcpy(&tcs_sha1[20], tes_sha1, 20);
       memcpy(&tes_sha1[20], tcs_sha1, 20);
       tcs_bin = anv_pipeline_cache_search(cache, tcs_sha1, sizeof(tcs_sha1));
@@ -626,10 +666,10 @@
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
       nir_lower_tes_patch_vertices(tes_nir,
-                                   tcs_nir->info->tess.tcs_vertices_out);
+                                   tcs_nir->info.tess.tcs_vertices_out);
 
       /* Copy TCS info into the TES info */
-      merge_tess_info(tes_nir->info, tcs_nir->info);
+      merge_tess_info(&tes_nir->info, &tcs_nir->info);
 
       anv_fill_binding_table(&tcs_prog_data.base.base, 0);
       anv_fill_binding_table(&tes_prog_data.base.base, 0);
@@ -643,13 +683,13 @@
        * this comes from the SPIR-V, which is part of the hash used for the
        * pipeline cache.  So it should be safe.
        */
-      tcs_key.tes_primitive_mode = tes_nir->info->tess.primitive_mode;
-      tcs_key.outputs_written = tcs_nir->info->outputs_written;
-      tcs_key.patch_outputs_written = tcs_nir->info->patch_outputs_written;
+      tcs_key.tes_primitive_mode = tes_nir->info.tess.primitive_mode;
+      tcs_key.outputs_written = tcs_nir->info.outputs_written;
+      tcs_key.patch_outputs_written = tcs_nir->info.patch_outputs_written;
       tcs_key.quads_workaround =
          devinfo->gen < 9 &&
-         tes_nir->info->tess.primitive_mode == 7 /* GL_QUADS */ &&
-         tes_nir->info->tess.spacing == TESS_SPACING_EQUAL;
+         tes_nir->info.tess.primitive_mode == 7 /* GL_QUADS */ &&
+         tes_nir->info.tess.spacing == TESS_SPACING_EQUAL;
 
       tes_key.inputs_read = tcs_key.outputs_written;
       tes_key.patch_inputs_read = tcs_key.patch_outputs_written;
@@ -724,8 +764,9 @@
    populate_gs_prog_key(&pipeline->device->info, &key);
 
    if (cache) {
-      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint,
-                      pipeline->layout, spec_info);
+      anv_pipeline_hash_shader(pipeline, module, entrypoint,
+                               MESA_SHADER_GEOMETRY, spec_info,
+                               &key, sizeof(key), sha1);
       bin = anv_pipeline_cache_search(cache, sha1, 20);
    }
 
@@ -753,8 +794,8 @@
 
       brw_compute_vue_map(&pipeline->device->info,
                           &prog_data.base.vue_map,
-                          nir->info->outputs_written,
-                          nir->info->separate_shader);
+                          nir->info.outputs_written,
+                          nir->info.separate_shader);
 
       unsigned code_size;
       const unsigned *shader_code =
@@ -801,8 +842,9 @@
    populate_wm_prog_key(pipeline, info, &key);
 
    if (cache) {
-      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint,
-                      pipeline->layout, spec_info);
+      anv_pipeline_hash_shader(pipeline, module, entrypoint,
+                               MESA_SHADER_FRAGMENT, spec_info,
+                               &key, sizeof(key), sha1);
       bin = anv_pipeline_cache_search(cache, sha1, 20);
    }
 
@@ -923,8 +965,9 @@
    populate_cs_prog_key(&pipeline->device->info, &key);
 
    if (cache) {
-      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint,
-                      pipeline->layout, spec_info);
+      anv_pipeline_hash_shader(pipeline, module, entrypoint,
+                               MESA_SHADER_COMPUTE, spec_info,
+                               &key, sizeof(key), sha1);
       bin = anv_pipeline_cache_search(cache, sha1, 20);
    }
 
@@ -994,8 +1037,7 @@
                        const VkGraphicsPipelineCreateInfo *pCreateInfo)
 {
    anv_cmd_dirty_mask_t states = ANV_CMD_DIRTY_DYNAMIC_ALL;
-   ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
-   struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
+   struct anv_subpass *subpass = pipeline->subpass;
 
    pipeline->dynamic_state = default_dynamic_state;
 
@@ -1198,6 +1240,11 @@
       alloc = &device->alloc;
 
    pipeline->device = device;
+
+   ANV_FROM_HANDLE(anv_render_pass, render_pass, pCreateInfo->renderPass);
+   assert(pCreateInfo->subpass < render_pass->subpass_count);
+   pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass];
+
    pipeline->layout = anv_pipeline_layout_from_handle(pCreateInfo->layout);
 
    result = anv_reloc_list_init(&pipeline->batch_relocs, alloc);
@@ -1285,7 +1332,7 @@
       const VkVertexInputAttributeDescription *desc =
          &vi_info->pVertexAttributeDescriptions[i];
 
-      if (inputs_read & (1 << (VERT_ATTRIB_GENERIC0 + desc->location)))
+      if (inputs_read & (1ull << (VERT_ATTRIB_GENERIC0 + desc->location)))
          pipeline->vb_used |= 1 << desc->binding;
    }
 
diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c
index 3cfe3ec..c3a62f5 100644
--- a/src/intel/vulkan/anv_pipeline_cache.c
+++ b/src/intel/vulkan/anv_pipeline_cache.c
@@ -21,7 +21,6 @@
  * IN THE SOFTWARE.
  */
 
-#include "util/mesa-sha1.h"
 #include "util/hash_table.h"
 #include "util/debug.h"
 #include "anv_private.h"
@@ -198,32 +197,6 @@
    }
 }
 
-void
-anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
-                struct anv_shader_module *module,
-                const char *entrypoint,
-                const struct anv_pipeline_layout *pipeline_layout,
-                const VkSpecializationInfo *spec_info)
-{
-   struct mesa_sha1 ctx;
-
-   _mesa_sha1_init(&ctx);
-   _mesa_sha1_update(&ctx, key, key_size);
-   _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1));
-   _mesa_sha1_update(&ctx, entrypoint, strlen(entrypoint));
-   if (pipeline_layout) {
-      _mesa_sha1_update(&ctx, pipeline_layout->sha1,
-                        sizeof(pipeline_layout->sha1));
-   }
-   /* hash in shader stage, pipeline layout? */
-   if (spec_info) {
-      _mesa_sha1_update(&ctx, spec_info->pMapEntries,
-                        spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]);
-      _mesa_sha1_update(&ctx, spec_info->pData, spec_info->dataSize);
-   }
-   _mesa_sha1_final(&ctx, hash);
-}
-
 static struct anv_shader_bin *
 anv_pipeline_cache_search_locked(struct anv_pipeline_cache *cache,
                                  const void *key_data, uint32_t key_size)
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index a1fce64..4e87514 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -42,13 +42,15 @@
 #define VG(x)
 #endif
 
+#include "common/gen_clflush.h"
 #include "common/gen_device_info.h"
 #include "blorp/blorp.h"
 #include "compiler/brw_compiler.h"
 #include "util/macros.h"
 #include "util/list.h"
+#include "util/u_atomic.h"
 #include "util/u_vector.h"
-#include "util/vk_alloc.h"
+#include "vk_alloc.h"
 
 /* Pre-declarations needed for WSI entrypoints */
 struct wl_surface;
@@ -92,7 +94,7 @@
  */
 #define ANV_HZ_FC_VAL 1.0f
 
-#define MAX_VBS         31
+#define MAX_VBS         28
 #define MAX_SETS         8
 #define MAX_RTS          8
 #define MAX_VIEWPORTS   16
@@ -473,12 +475,8 @@
     */
    struct u_vector mmap_cleanups;
 
-   uint32_t block_size;
-
-   union anv_free_list free_list;
    struct anv_block_state state;
 
-   union anv_free_list back_free_list;
    struct anv_block_state back_state;
 };
 
@@ -502,8 +500,9 @@
    void *map;
 };
 
+#define ANV_STATE_NULL ((struct anv_state) { .alloc_size = 0 })
+
 struct anv_fixed_size_state_pool {
-   size_t state_size;
    union anv_free_list free_list;
    struct anv_block_state block;
 };
@@ -514,69 +513,58 @@
 #define ANV_STATE_BUCKETS (ANV_MAX_STATE_SIZE_LOG2 - ANV_MIN_STATE_SIZE_LOG2 + 1)
 
 struct anv_state_pool {
-   struct anv_block_pool *block_pool;
+   struct anv_block_pool block_pool;
+
+   /* The size of blocks which will be allocated from the block pool */
+   uint32_t block_size;
+
+   /** Free list for "back" allocations */
+   union anv_free_list back_alloc_free_list;
+
    struct anv_fixed_size_state_pool buckets[ANV_STATE_BUCKETS];
 };
 
 struct anv_state_stream_block;
 
 struct anv_state_stream {
-   struct anv_block_pool *block_pool;
+   struct anv_state_pool *state_pool;
 
-   /* The current working block */
-   struct anv_state_stream_block *block;
+   /* The size of blocks to allocate from the state pool */
+   uint32_t block_size;
 
-   /* Offset at which the current block starts */
-   uint32_t start;
-   /* Offset at which to allocate the next state */
+   /* Current block we're allocating from */
+   struct anv_state block;
+
+   /* Offset into the current block at which to allocate the next state */
    uint32_t next;
-   /* Offset at which the current block ends */
-   uint32_t end;
+
+   /* List of all blocks allocated from this pool */
+   struct anv_state_stream_block *block_list;
 };
 
-#define CACHELINE_SIZE 64
-#define CACHELINE_MASK 63
-
-static inline void
-anv_clflush_range(void *start, size_t size)
-{
-   uint8_t *p = (uint8_t *) (((uintptr_t) start) & ~CACHELINE_MASK);
-   uint8_t *end = (uint8_t*) start + size;
-
-   while (p < end) {
-      __builtin_ia32_clflush(p);
-      p += CACHELINE_SIZE;
-   }
-}
-
-static inline void
-anv_flush_range(void *start, size_t size)
-{
-   __builtin_ia32_mfence();
-   anv_clflush_range(start, size);
-}
-
-static inline void
-anv_invalidate_range(void *start, size_t size)
-{
-   anv_clflush_range(start, size);
-   __builtin_ia32_mfence();
-}
-
+/* The block_pool functions are exported for testing only.  The block pool
+ * should only be used via a state pool (see below).
+ */
 VkResult anv_block_pool_init(struct anv_block_pool *pool,
-                             struct anv_device *device, uint32_t block_size);
+                             struct anv_device *device,
+                             uint32_t initial_size);
 void anv_block_pool_finish(struct anv_block_pool *pool);
-int32_t anv_block_pool_alloc(struct anv_block_pool *pool);
-int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool);
-void anv_block_pool_free(struct anv_block_pool *pool, int32_t offset);
-void anv_state_pool_init(struct anv_state_pool *pool,
-                         struct anv_block_pool *block_pool);
+int32_t anv_block_pool_alloc(struct anv_block_pool *pool,
+                             uint32_t block_size);
+int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool,
+                                  uint32_t block_size);
+
+VkResult anv_state_pool_init(struct anv_state_pool *pool,
+                             struct anv_device *device,
+                             uint32_t block_size);
 void anv_state_pool_finish(struct anv_state_pool *pool);
 struct anv_state anv_state_pool_alloc(struct anv_state_pool *pool,
-                                      size_t state_size, size_t alignment);
+                                      uint32_t state_size, uint32_t alignment);
+struct anv_state anv_state_pool_alloc_back(struct anv_state_pool *pool);
 void anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state);
 void anv_state_stream_init(struct anv_state_stream *stream,
-                           struct anv_block_pool *block_pool);
+                           struct anv_state_pool *state_pool,
+                           uint32_t block_size);
 void anv_state_stream_finish(struct anv_state_stream *stream);
 struct anv_state anv_state_stream_alloc(struct anv_state_stream *stream,
                                         uint32_t size, uint32_t alignment);
@@ -762,13 +750,8 @@
 
     struct anv_bo_cache                         bo_cache;
 
-    struct anv_block_pool                       dynamic_state_block_pool;
     struct anv_state_pool                       dynamic_state_pool;
-
-    struct anv_block_pool                       instruction_block_pool;
     struct anv_state_pool                       instruction_state_pool;
-
-    struct anv_block_pool                       surface_state_block_pool;
     struct anv_state_pool                       surface_state_pool;
 
     struct anv_bo                               workaround_bo;
@@ -798,16 +781,12 @@
    if (device->info.has_llc)
       return;
 
-   anv_flush_range(state.map, state.alloc_size);
+   gen_flush_range(state.map, state.alloc_size);
 }
 
 typedef uintptr_t anv_platform_semaphore_t;
-    
-struct anv_semaphore {
-    anv_platform_semaphore_t current_platform_semaphore;
-    anv_platform_semaphore_t original_platform_semaphore;
-};
-    
+
+struct anv_semaphore;
 typedef struct anv_semaphore* anv_semaphore_t;
     
 void anv_device_init_blorp(struct anv_device *device);
@@ -828,7 +807,7 @@
 void* anv_gem_mmap(struct anv_device* device, anv_buffer_handle_t gem_handle, uint64_t offset,
                    uint64_t size, uint32_t flags);
 void anv_gem_munmap(struct anv_device* device, anv_buffer_handle_t gem_handle, void *p, uint64_t size);
-anv_buffer_handle_t anv_gem_create(struct anv_device* device, size_t size);
+anv_buffer_handle_t anv_gem_create(struct anv_device* device, uint64_t size);
 void anv_gem_close(struct anv_device* device, anv_buffer_handle_t gem_handle);
 uint32_t anv_gem_userptr(struct anv_device *device, void *mem, size_t size);
 int anv_gem_busy(struct anv_device *device, anv_buffer_handle_t gem_handle);
@@ -871,8 +850,8 @@
 VkResult anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size);
 
 struct anv_reloc_list {
-   size_t                                       num_relocs;
-   size_t                                       array_length;
+   uint32_t                                     num_relocs;
+   uint32_t                                     array_length;
    struct drm_i915_gem_relocation_entry *       relocs;
    struct anv_bo **                             reloc_bos;
 };
@@ -894,7 +873,7 @@
    struct anv_bo                                bo;
 
    /* Bytes actually consumed in this batch BO */
-   size_t                                       length;
+   uint32_t                                     length;
 
    struct anv_reloc_list                        relocs;
 };
@@ -1054,6 +1033,17 @@
       .IndextoMOCSTables                           = 1  \
    }
 
+/* Cannonlake MOCS defines are duplicates of Skylake MOCS defines. */
+#define GEN10_MOCS (struct GEN10_MEMORY_OBJECT_CONTROL_STATE) {  \
+      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */              \
+      .IndextoMOCSTables                           = 2         \
+   }
+
+#define GEN10_MOCS_PTE {                                 \
+      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */       \
+      .IndextoMOCSTables                           = 1  \
+   }
+
 struct anv_device_memory {
    struct anv_bo *                              bo;
    struct anv_memory_type *                     type;
@@ -1129,13 +1119,9 @@
 
    union {
       struct {
+         VkImageLayout layout;
          struct anv_image_view *image_view;
          struct anv_sampler *sampler;
-
-         /* Used to determine whether or not we need the surface state to have
-          * the auxiliary buffer enabled.
-          */
-         enum isl_aux_usage aux_usage;
       };
 
       struct {
@@ -1175,7 +1161,6 @@
    /* Put this field right behind anv_descriptor_set so it fills up the
     * descriptors[0] field. */
    struct anv_descriptor descriptors[MAX_PUSH_DESCRIPTORS];
-
    struct anv_buffer_view buffer_views[MAX_PUSH_DESCRIPTORS];
 };
 
@@ -1542,6 +1527,7 @@
    bool                                         fast_clear;
    VkClearValue                                 clear_value;
    bool                                         clear_color_is_zero_one;
+   bool                                         clear_color_is_zero;
 };
 
 /** State required while building cmd buffer */
@@ -1575,7 +1561,7 @@
    struct anv_dynamic_state                     dynamic;
    bool                                         need_query_wa;
 
-   struct anv_push_descriptor_set               push_descriptor;
+   struct anv_push_descriptor_set *             push_descriptors[MAX_SETS];
 
    /**
     * Whether or not the gen8 PMA fix is enabled.  We ensure that, at the top
@@ -1660,7 +1646,7 @@
     *
     * initialized by anv_cmd_buffer_init_batch_bo_chain()
     */
-   struct u_vector                            bt_blocks;
+   struct u_vector                              bt_block_states;
    uint32_t                                     bt_next;
 
    struct anv_reloc_list                        surface_relocs;
@@ -1688,9 +1674,10 @@
                                   struct anv_cmd_buffer *secondary);
 void anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer);
 VkResult anv_cmd_buffer_execbuf(struct anv_device* device, struct anv_cmd_buffer* cmd_buffer,
-                                uint32_t wait_semaphore_count, anv_semaphore_t* wait_semaphores,
-                                uint32_t signal_semaphore_count,
-                                anv_semaphore_t* signal_semaphores);
+                                const VkSemaphore *in_semaphores,
+                                uint32_t num_in_semaphores,
+                                const VkSemaphore *out_semaphores,
+                                uint32_t num_out_semaphores);
 
 VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer);
 
@@ -1779,6 +1766,45 @@
    struct anv_state                             state;
 };
 
+enum anv_semaphore_type {
+   ANV_SEMAPHORE_TYPE_NONE = 0,
+   ANV_SEMAPHORE_TYPE_DUMMY,
+   ANV_SEMAPHORE_TYPE_BO,
+};
+
+struct anv_semaphore_impl {
+   enum anv_semaphore_type type;
+
+   /* A BO representing this semaphore when type == ANV_SEMAPHORE_TYPE_BO.
+    * This BO will be added to the object list on any execbuf2 calls for
+    * which this semaphore is used as a wait or signal fence.  When used as
+    * a signal fence, the EXEC_OBJECT_WRITE flag will be set.
+    */
+   struct anv_bo *bo;
+   /* Platform semaphore handle; the code in anv_queue.c treats 0 as "none". */
+   anv_platform_semaphore_t platform_semaphore;
+};
+
+struct anv_semaphore {
+   /* Permanent semaphore state.  Every semaphore has some form of permanent
+    * state (type != ANV_SEMAPHORE_TYPE_NONE).  This may be a BO to fence on
+    * (for cross-process semaphores) or it could just be a dummy for use
+    * internally.
+    */
+   struct anv_semaphore_impl permanent;
+
+   /* Temporary semaphore state.  A semaphore *may* have temporary state.
+    * That state is added to the semaphore by an import operation and is reset
+    * back to ANV_SEMAPHORE_TYPE_NONE when the semaphore is waited on.  A
+    * semaphore with temporary state cannot be signaled because the semaphore
+    * must already be signaled before the temporary state can be exported from
+    * the semaphore in the other process and imported here.
+    */
+   struct anv_semaphore_impl temporary;
+   /* State currently in use; set back to &permanent when a temporary import is dropped. */
+   struct anv_semaphore_impl* current;
+};
+
 struct anv_shader_module {
    unsigned char                                sha1[20];
    uint32_t                                     size;
@@ -1791,12 +1817,6 @@
 #define ENUM_FROM_INT(type, val) (val)
 #endif
 
-void anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
-                     struct anv_shader_module *module,
-                     const char *entrypoint,
-                     const struct anv_pipeline_layout *pipeline_layout,
-                     const VkSpecializationInfo *spec_info);
-
 static inline gl_shader_stage
 vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage)
 {
@@ -1863,14 +1883,14 @@
 anv_shader_bin_ref(struct anv_shader_bin *shader)
 {
    assert(shader && shader->ref_cnt >= 1);
-   __sync_fetch_and_add(&shader->ref_cnt, 1);
+   p_atomic_inc(&shader->ref_cnt);
 }
 
 static inline void
 anv_shader_bin_unref(struct anv_device *device, struct anv_shader_bin *shader)
 {
    assert(shader && shader->ref_cnt >= 1);
-   if (__sync_fetch_and_add(&shader->ref_cnt, -1) == 1)
+   if (p_atomic_dec_zero(&shader->ref_cnt))
       anv_shader_bin_destroy(device, shader);
 }
 
@@ -1882,6 +1902,7 @@
    uint32_t                                     dynamic_state_mask;
    struct anv_dynamic_state                     dynamic_state;
 
+   struct anv_subpass *                         subpass;
    struct anv_pipeline_layout *                 layout;
 
    bool                                         needs_data_cache;
@@ -2082,6 +2103,52 @@
    struct anv_surface aux_surface;
 };
 
+/* Miplevel count of the image's auxiliary surface; 0 if that surface is empty. */
+static inline uint8_t
+anv_image_aux_levels(const struct anv_image * const image)
+{
+   assert(image);
+   return image->aux_surface.isl.size <= 0 ? 0 : image->aux_surface.isl.levels;
+}
+
+/* Layer count of the image's auxiliary surface at `miplevel`; 0 when the
+ * auxiliary surface has no such level.
+ */
+static inline uint32_t
+anv_image_aux_layers(const struct anv_image * const image,
+                     const uint8_t miplevel)
+{
+   assert(image);
+
+   /* The miplevel must exist in the main buffer. */
+   assert(miplevel < image->levels);
+
+   /* A level past the end of the aux surface carries no auxiliary data. */
+   if (miplevel >= anv_image_aux_levels(image))
+      return 0;
+
+   /* Larger of the aux array length and the aux depth shifted by miplevel. */
+   return MAX2(image->aux_surface.isl.logical_level0_px.array_len,
+               image->aux_surface.isl.logical_level0_px.depth >> miplevel);
+}
+
+/* Byte size of one fast-clear state entry: the clear value followed by one
+ * "needs resolve" dword.
+ */
+static inline unsigned
+anv_fast_clear_state_entry_size(const struct anv_device *device)
+{
+   enum { dword_bytes = 4 };
+
+   assert(device);
+   /* Layout: | clear value dword(s) | needs resolve dword |.  The clear value
+    * must be a whole number of dwords so that the trailing dword stays
+    * dword-aligned, which GPU memcpy operations rely on.
+    */
+   assert(device->isl_dev.ss.clear_value_size % dword_bytes == 0);
+   return device->isl_dev.ss.clear_value_size + dword_bytes;
+}
+
 /* Returns true if a HiZ-enabled depth buffer can be sampled from. */
 static inline bool
 anv_can_sample_with_hiz(const struct gen_device_info * const devinfo,
@@ -2098,12 +2165,18 @@
 anv_gen8_hiz_op_resolve(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_image *image,
                         enum blorp_hiz_op op);
+void
+anv_ccs_resolve(struct anv_cmd_buffer * const cmd_buffer,
+                const struct anv_state surface_state,
+                const struct anv_image * const image,
+                const uint8_t level, const uint32_t layer_count,
+                const enum blorp_fast_clear_op op);
 
 void
-anv_image_ccs_clear(struct anv_cmd_buffer *cmd_buffer,
-                    const struct anv_image *image,
-                    const struct isl_view *view,
-                    const VkImageSubresourceRange *subresourceRange);
+anv_image_fast_clear(struct anv_cmd_buffer *cmd_buffer,
+                     const struct anv_image *image,
+                     const uint32_t base_level, const uint32_t level_count,
+                     const uint32_t base_layer, uint32_t layer_count);
 
 enum isl_aux_usage
 anv_layout_to_aux_usage(const struct gen_device_info * const devinfo,
@@ -2138,14 +2211,19 @@
    VkFormat vk_format;
    VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */
 
-   /** RENDER_SURFACE_STATE when using image as a sampler surface. */
-   struct anv_state sampler_surface_state;
+   /**
+    * RENDER_SURFACE_STATE when using image as a sampler surface with an image
+    * layout of SHADER_READ_ONLY_OPTIMAL or DEPTH_STENCIL_READ_ONLY_OPTIMAL.
+    */
+   enum isl_aux_usage optimal_sampler_aux_usage;
+   struct anv_state optimal_sampler_surface_state;
 
    /**
-    * RENDER_SURFACE_STATE when using image as a sampler surface with the
-    * auxiliary buffer disabled.
+    * RENDER_SURFACE_STATE when using image as a sampler surface with an image
+    * layout of GENERAL.
     */
-   struct anv_state no_aux_sampler_surface_state;
+   enum isl_aux_usage general_sampler_aux_usage;
+   struct anv_state general_sampler_surface_state;
 
    /**
     * RENDER_SURFACE_STATE when using image as a storage image. Separate states
@@ -2218,13 +2296,6 @@
                                    uint32_t offset, uint32_t range,
                                    uint32_t stride);
 
-void anv_image_view_fill_image_param(struct anv_device *device,
-                                     struct anv_image_view *view,
-                                     struct brw_image_param *param);
-void anv_buffer_view_fill_image_param(struct anv_device *device,
-                                      struct anv_buffer_view *view,
-                                      struct brw_image_param *param);
-
 struct anv_sampler {
    uint32_t state[4];
 };
@@ -2254,6 +2325,8 @@
 
    VkAttachmentReference                        depth_stencil_attachment;
 
+   uint32_t                                     view_mask;
+
    /** Subpass has a depth/stencil self-dependency */
    bool                                         has_ds_self_dep;
 
@@ -2261,12 +2334,11 @@
    bool                                         has_resolve;
 };
 
-enum anv_subpass_usage {
-   ANV_SUBPASS_USAGE_DRAW =         (1 << 0),
-   ANV_SUBPASS_USAGE_INPUT =        (1 << 1),
-   ANV_SUBPASS_USAGE_RESOLVE_SRC =  (1 << 2),
-   ANV_SUBPASS_USAGE_RESOLVE_DST =  (1 << 3),
-};
+static inline unsigned
+anv_subpass_view_count(const struct anv_subpass *subpass)
+{
+   return MAX2(1, _mesa_bitcount(subpass->view_mask)); /* never less than 1 */
+}
 
 struct anv_render_pass_attachment {
    /* TODO: Consider using VkAttachmentDescription instead of storing each of
@@ -2280,9 +2352,7 @@
    VkAttachmentLoadOp                           stencil_load_op;
    VkImageLayout                                initial_layout;
    VkImageLayout                                final_layout;
-
-   /* An array, indexed by subpass id, of how the attachment will be used. */
-   enum anv_subpass_usage *                     subpass_usage;
+   VkImageLayout                                first_subpass_layout;
 
    /* The subpass id in which the attachment will be used last. */
    uint32_t                                     last_subpass_idx;
@@ -2291,7 +2361,6 @@
 struct anv_render_pass {
    uint32_t                                     attachment_count;
    uint32_t                                     subpass_count;
-   VkAttachmentReference *                      subpass_attachments;
    /* An array of subpass_count+1 flushes, one per subpass boundary */
    enum anv_pipe_bits *                         subpass_flushes;
    struct anv_render_pass_attachment *          attachments;
@@ -2399,8 +2468,8 @@
 ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, VkQueryPool)
 ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_render_pass, VkRenderPass)
 ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_sampler, VkSampler)
-ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_shader_module, VkShaderModule)
 ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_semaphore, VkSemaphore)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_shader_module, VkShaderModule)
 
 /* Gen-specific function declarations */
 #ifdef genX
@@ -2418,6 +2487,9 @@
 #  define genX(x) gen9_##x
 #  include "anv_genX.h"
 #  undef genX
+#  define genX(x) gen10_##x
+#  include "anv_genX.h"
+#  undef genX
 #endif
 
 #ifdef __cplusplus
diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c
new file mode 100644
index 0000000..f38953d
--- /dev/null
+++ b/src/intel/vulkan/anv_queue.c
@@ -0,0 +1,657 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * This file implements VkQueue, VkFence, and VkSemaphore
+ */
+
+#include "anv_private.h"
+#include "vk_util.h"
+
+#include "genxml/gen7_pack.h"
+
+static void restore_temporary_semaphore_imports(struct anv_device* device,
+                                                struct anv_semaphore* semaphores[], uint32_t count)
+{
+   for (uint32_t idx = 0; idx < count; idx++) {
+      struct anv_semaphore *sem = semaphores[idx];
+      if (!sem->temporary.platform_semaphore) continue;   /* nothing imported */
+      anv_platform_destroy_semaphore(device, sem->temporary.platform_semaphore);
+      sem->temporary.platform_semaphore = 0;
+      sem->current = &sem->permanent;   /* fall back to the permanent state */
+   }
+}
+
+/* Submit execbuf; on success copy the kernel's object offsets into execbuf_bos. */
+VkResult
+anv_device_execbuf(struct anv_device *device,
+                   struct drm_i915_gem_execbuffer2 *execbuf,
+                   struct anv_bo **execbuf_bos,
+                   uint32_t wait_semaphore_count, anv_semaphore_t* wait_semaphores,
+                   uint32_t signal_semaphore_count, anv_semaphore_t* signal_semaphores)
+{
+   VkResult result = VK_SUCCESS;
+   int ret = anv_gem_execbuffer(device, execbuf, wait_semaphore_count, wait_semaphores, signal_semaphore_count, signal_semaphores);
+   if (ret != 0) {
+      /* We don't know the real error.  Build the message before the cleanup
+       * below runs: "%m" formats errno, which that cleanup may overwrite. */
+      device->lost = true;
+      result = vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
+   }
+
+   restore_temporary_semaphore_imports(device, wait_semaphores, wait_semaphore_count);
+   restore_temporary_semaphore_imports(device, signal_semaphores, signal_semaphore_count);
+   if (result != VK_SUCCESS)
+      return result;
+
+   struct drm_i915_gem_exec_object2 *objects = (void *)(uintptr_t)execbuf->buffers_ptr;
+   for (uint32_t k = 0; k < execbuf->buffer_count; k++)
+      execbuf_bos[k]->offset = objects[k].offset;
+   return VK_SUCCESS;
+}
+
+/* Copy `batch` into a BO taken from the batch BO pool, submit that BO on its
+ * own and wait on it with anv_device_wait(); the BO is then given back.
+ */
+VkResult
+anv_device_submit_simple_batch(struct anv_device *device,
+                               struct anv_batch *batch)
+{
+   struct anv_bo bo;
+
+   /* Kernel driver requires 8 byte aligned batch length */
+   const uint32_t size = align_u32(batch->next - batch->start, 8);
+
+   VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size);
+   if (result != VK_SUCCESS)
+      return result;
+
+   memcpy(bo.map, batch->start, size);
+   if (!device->info.has_llc)
+      gen_flush_range(bo.map, size);
+
+   struct anv_bo *exec_bos[1] = { &bo };
+   struct drm_i915_gem_exec_object2 exec2_objects[1] = {
+      {
+         .handle = bo.gem_handle,
+         .relocation_count = 0,
+         .relocs_ptr = 0,
+         .alignment = 0,
+         .offset = bo.offset,
+         .flags = 0,
+         .rsvd1 = 0,
+         .rsvd2 = bo.size,
+      },
+   };
+   struct drm_i915_gem_execbuffer2 execbuf = {
+      .buffers_ptr = (uintptr_t) exec2_objects,
+      .buffer_count = 1,
+      .batch_start_offset = 0,
+      .batch_len = size,
+      .cliprects_ptr = 0,
+      .num_cliprects = 0,
+      .DR1 = 0,
+      .DR4 = 0,
+      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER,
+      .rsvd1 = device->context_id,
+      .rsvd2 = 0,
+   };
+
+   result = anv_device_execbuf(device, &execbuf, exec_bos, 0, NULL, 0, NULL);
+   if (result == VK_SUCCESS)
+      result = anv_device_wait(device, &bo, INT64_MAX);
+
+   anv_bo_pool_free(&device->batch_bo_pool, &bo);
+
+   return result;
+}
+
+/* vkQueueSubmit entry point.  Submits each command buffer in turn while
+ * holding the device mutex; a failed submission marks the device lost. */
+VkResult anv_QueueSubmit(
+    VkQueue                                     _queue,
+    uint32_t                                    submitCount,
+    const VkSubmitInfo*                         pSubmits,
+    VkFence                                     _fence)
+{
+   ANV_FROM_HANDLE(anv_queue, queue, _queue);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+   struct anv_device *device = queue->device;
+   bool fence_attached = false; /* set once the fence rides on the last batch */
+
+   /* Query for device status prior to submitting.  Technically, we don't need
+    * to do this.  However, if we have a client that's submitting piles of
+    * garbage, we would rather break as early as possible to keep the GPU
+    * hanging contained.  If we don't check here, we'll either be waiting for
+    * the kernel to kick us or we'll have to wait until the client waits on a
+    * fence before we actually know whether or not we've hung.
+    */
+   VkResult result = anv_device_query_status(device);
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* We lock around QueueSubmit for three main reasons:
+    *
+    *  1) When a block pool is resized, we create a new gem handle with a
+    *     different size and, in the case of surface states, possibly a
+    *     different center offset but we re-use the same anv_bo struct when
+    *     we do so.  If this happens in the middle of setting up an execbuf,
+    *     we could end up with our list of BOs out of sync with our list of
+    *     gem handles.
+    *
+    *  2) The algorithm we use for building the list of unique buffers isn't
+    *     thread-safe.  While the client is supposed to synchronize around
+    *     QueueSubmit, this would be extremely difficult to debug if it ever
+    *     came up in the wild due to a broken app.  It's better to play it
+    *     safe and just lock around QueueSubmit.
+    *
+    *  3)  The anv_cmd_buffer_execbuf function may perform relocations in
+    *      userspace.  Due to the fact that the surface state buffer is shared
+    *      between batches, we can't afford to have that happen from multiple
+    *      threads at the same time.  Even though the user is supposed to
+    *      ensure this doesn't happen, we play it safe as in (2) above.
+    *
+    * Since the only other things that ever take the device lock such as block
+    * pool resize only rarely happen, this will almost never be contended so
+    * taking a lock isn't really an expensive operation in this case.
+    */
+   pthread_mutex_lock(&device->mutex);
+
+   for (uint32_t i = 0; i < submitCount; i++) {
+      for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
+         ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
+                         pSubmits[i].pCommandBuffers[j]);
+         assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+         assert(!anv_batch_has_error(&cmd_buffer->batch));
+
+         const VkSemaphore *in_semaphores = NULL, *out_semaphores = NULL;
+         uint32_t num_in_semaphores = 0, num_out_semaphores = 0;
+         VkSemaphore semaphore_array_with_fence[pSubmits[i].signalSemaphoreCount + 1];
+
+         if (j == 0) {
+            /* Only the first batch gets the in semaphores */
+            in_semaphores = pSubmits[i].pWaitSemaphores;
+            num_in_semaphores = pSubmits[i].waitSemaphoreCount;
+         }
+
+         if (j == pSubmits[i].commandBufferCount - 1) {
+            /* Only the last batch gets the out semaphores */
+            out_semaphores = pSubmits[i].pSignalSemaphores;
+            num_out_semaphores = pSubmits[i].signalSemaphoreCount;
+
+            // Fuchsia: optimize fence
+            if (fence && i == submitCount - 1) {
+               for (uint32_t s = 0; s < num_out_semaphores; s++)
+                  semaphore_array_with_fence[s] = out_semaphores[s];
+               semaphore_array_with_fence[num_out_semaphores] = anv_semaphore_to_handle(fence->semaphore);
+               num_out_semaphores++;
+               out_semaphores = semaphore_array_with_fence;
+
+               assert(fence->state == ANV_FENCE_STATE_RESET);
+               fence->state = ANV_FENCE_STATE_SUBMITTED;
+               pthread_cond_broadcast(&device->queue_submit);
+               // Skip the extra fence-only submission below, but keep `fence` for the error path
+               fence_attached = true;
+            }
+         }
+
+         result = anv_cmd_buffer_execbuf(device, cmd_buffer,
+                                         in_semaphores, num_in_semaphores,
+                                         out_semaphores, num_out_semaphores);
+         if (result != VK_SUCCESS)
+            goto out;
+      }
+   }
+
+   if (fence && !fence_attached) {
+      struct anv_bo *fence_bo = &fence->bo;
+      result = anv_device_execbuf(device, &fence->execbuf, &fence_bo, 0, NULL, 1, &fence->semaphore);
+      if (result != VK_SUCCESS)
+         goto out;
+
+      /* Update the fence and wake up any waiters */
+      assert(fence->state == ANV_FENCE_STATE_RESET);
+      fence->state = ANV_FENCE_STATE_SUBMITTED;
+      pthread_cond_broadcast(&device->queue_submit);
+   }
+
+out:
+   if (result != VK_SUCCESS) {
+      /* In the case that something has gone wrong we may end up with an
+       * inconsistent state from which it may not be trivial to recover.
+       * For example, we might have computed address relocations and
+       * any future attempt to re-submit this job will need to know about
+       * this and avoid computing relocation addresses again.
+       *
+       * To avoid this sort of issues, we assume that if something was
+       * wrong during submission we must already be in a really bad situation
+       * anyway (such as being out of memory) and return
+       * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
+       * submit the same job again to this device.
+       */
+      result = vk_errorf(VK_ERROR_DEVICE_LOST, "vkQueueSubmit() failed");
+      device->lost = true;
+
+      /* If we return VK_ERROR_DEVICE_LOST here, we need to ensure that
+       * vkWaitForFences() and vkGetFenceStatus() return a valid result
+       * (VK_SUCCESS or VK_ERROR_DEVICE_LOST) in a finite amount of time.
+       * Setting the fence status to SIGNALED ensures this will happen in
+       * any case.
+       */
+      if (fence)
+         fence->state = ANV_FENCE_STATE_SIGNALED;
+   }
+
+   pthread_mutex_unlock(&device->mutex);
+
+   return result;
+}
+
+/* vkQueueWaitIdle entry point: delegates to anv_DeviceWaitIdle() on the queue's device. */
+VkResult anv_QueueWaitIdle(VkQueue _queue)
+{
+   ANV_FROM_HANDLE(anv_queue, queue, _queue);
+   VkDevice owner = anv_device_to_handle(queue->device);
+   return anv_DeviceWaitIdle(owner);
+}
+
+/* vkCreateFence entry point.  The anv_fence is stored at the start of a batch
+ * BO, followed by a small batch; the fence also owns a semaphore created here.
+ */
+VkResult anv_CreateFence(
+    VkDevice                                    _device,
+    const VkFenceCreateInfo*                    pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkFence*                                    pFence)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_bo fence_bo;
+   struct anv_fence *fence;
+   struct anv_batch batch;
+   VkSemaphore semaphore;
+   VkResult result;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);
+
+   const VkSemaphoreCreateInfo semaphore_create_info = {
+      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+   };
+
+   result = anv_CreateSemaphore(_device, &semaphore_create_info, pAllocator, &semaphore);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = anv_bo_pool_alloc(&device->batch_bo_pool, &fence_bo, 4096);
+   if (result != VK_SUCCESS) {
+      anv_DestroySemaphore(_device, semaphore, pAllocator); /* don't leak it */
+      return result;
+   }
+
+   /* Fences are small.  Just store the CPU data structure in the BO. */
+   fence = fence_bo.map;
+   fence->bo = fence_bo;
+
+   /* Place the batch after the CPU data but on its own cache line. */
+   const uint32_t batch_offset = align_u32(sizeof(*fence), CACHELINE_SIZE);
+   batch.next = batch.start = fence->bo.map + batch_offset;
+   batch.end = fence->bo.map + fence->bo.size;
+   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
+   anv_batch_emit(&batch, GEN7_MI_NOOP, noop);
+
+   if (!device->info.has_llc) {
+      assert(((uintptr_t) batch.start & CACHELINE_MASK) == 0);
+      assert(batch.next - batch.start <= CACHELINE_SIZE);
+      __builtin_ia32_mfence();
+      __builtin_ia32_clflush(batch.start);
+   }
+
+   fence->exec2_objects[0].handle = fence->bo.gem_handle;
+   fence->exec2_objects[0].relocation_count = 0;
+   fence->exec2_objects[0].relocs_ptr = 0;
+   fence->exec2_objects[0].alignment = 0;
+   fence->exec2_objects[0].offset = fence->bo.offset;
+   fence->exec2_objects[0].flags = 0;
+   fence->exec2_objects[0].rsvd1 = 0;
+   fence->exec2_objects[0].rsvd2 = fence->bo.size;
+
+   fence->execbuf.buffers_ptr = (uintptr_t) fence->exec2_objects;
+   fence->execbuf.buffer_count = 1;
+   fence->execbuf.batch_start_offset = batch.start - fence->bo.map;
+   fence->execbuf.batch_len = batch.next - batch.start;
+   fence->execbuf.cliprects_ptr = 0;
+   fence->execbuf.num_cliprects = 0;
+   fence->execbuf.DR1 = 0;
+   fence->execbuf.DR4 = 0;
+
+   fence->execbuf.flags =
+      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
+   fence->execbuf.rsvd1 = device->context_id;
+   fence->execbuf.rsvd2 = 0;
+
+   fence->state = (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) ?
+      ANV_FENCE_STATE_SIGNALED : ANV_FENCE_STATE_RESET;
+
+   fence->semaphore = (struct anv_semaphore*)semaphore;
+   *pFence = anv_fence_to_handle(fence);
+
+   return VK_SUCCESS;
+}
+
+/* vkDestroyFence entry point: destroys the fence's semaphore, then hands the
+ * BO that holds the fence back to the batch BO pool.  A null fence is a no-op.
+ */
+void anv_DestroyFence(
+    VkDevice                                    _device,
+    VkFence                                     _fence,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+   if (fence) {
+      assert(fence->bo.map == fence);
+      anv_DestroySemaphore(_device, (VkSemaphore)fence->semaphore, pAllocator);
+      anv_bo_pool_free(&device->batch_bo_pool, &fence->bo);
+   }
+}
+
+/* vkResetFences entry point: set each fence to RESET and reset its semaphore. */
+VkResult anv_ResetFences(
+    VkDevice                                    _device,
+    uint32_t                                    fenceCount,
+    const VkFence*                              pFences)
+{
+   for (uint32_t n = 0; n < fenceCount; n++) {
+      ANV_FROM_HANDLE(anv_fence, fence, pFences[n]);
+      fence->state = ANV_FENCE_STATE_RESET;
+      anv_platform_reset_semaphore(fence->semaphore->current->platform_semaphore);
+   }
+   return VK_SUCCESS;
+}
+
+/* vkGetFenceStatus entry point: report the fence state, polling its semaphore when submitted. */
+VkResult anv_GetFenceStatus(
+    VkDevice                                    _device,
+    VkFence                                     _fence)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+   const uint64_t poll_timeout = 0;   /* zero timeout handed to the wait below */
+
+   if (unlikely(device->lost))
+      return VK_ERROR_DEVICE_LOST;
+
+   switch (fence->state) {
+   case ANV_FENCE_STATE_RESET:
+      /* Never handed to the GPU, so it cannot be ready yet. */
+      return VK_NOT_READY;
+
+   case ANV_FENCE_STATE_SIGNALED:
+      /* Already known to be signaled. */
+      return VK_SUCCESS;
+
+   case ANV_FENCE_STATE_SUBMITTED: {
+      /* Submitted, but completion is unknown until the semaphore is checked. */
+      const int wait_ret = anv_platform_wait_semaphore(
+         fence->semaphore->current->platform_semaphore, poll_timeout);
+      if (wait_ret == 0) {
+         fence->state = ANV_FENCE_STATE_SIGNALED;
+         return VK_SUCCESS;
+      }
+      if (wait_ret == -ETIME)
+         return VK_NOT_READY;
+
+      /* We don't know the real error. */
+      device->lost = true;
+      return VK_ERROR_DEVICE_LOST;
+   }
+   default:
+      unreachable("Invalid fence status");
+   }
+}
+
+#define NSEC_PER_SEC 1000000000
+#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1)
+/* Waits until any or all (waitAll) of the fences signal, or the timeout expires. */
+VkResult anv_WaitForFences(
+    VkDevice                                    _device,
+    uint32_t                                    fenceCount,
+    const VkFence*                              pFences,
+    VkBool32                                    waitAll,
+    uint64_t                                    _timeout)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   int ret;
+
+   if (unlikely(device->lost))
+      return VK_ERROR_DEVICE_LOST;
+
+   /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is supposed
+    * to block indefinitely for timeouts <= 0.  Unfortunately, this was broken
+    * for a couple of kernel releases.  Since there's no way to know
+    * whether or not the kernel we're using is one of the broken ones, the
+    * best we can do is to clamp the timeout to INT64_MAX.  This limits the
+    * maximum timeout from 584 years to 292 years - likely not a big deal.
+    */
+   int64_t timeout = MIN2(_timeout, INT64_MAX);
+   /* NOTE(review): only the condvar wait below uses this clamped value. */
+   VkResult result = VK_SUCCESS;
+   uint32_t pending_fences = fenceCount;
+   while (pending_fences) {
+      pending_fences = 0;
+      bool signaled_fences = false;
+      for (uint32_t i = 0; i < fenceCount; i++) {
+         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+         switch (fence->state) {
+         case ANV_FENCE_STATE_RESET:
+            /* This fence hasn't been submitted yet, we'll catch it the next
+             * time around.  Yes, this may mean we dead-loop but, short of
+             * lots of locking and a condition variable, there's not much that
+             * we can do about that.
+             */
+            pending_fences++;
+            continue;
+
+         case ANV_FENCE_STATE_SIGNALED:
+            /* This fence is not pending.  If waitAll isn't set, we can return
+             * early.  Otherwise, we have to keep going.
+             */
+            if (!waitAll) {
+               result = VK_SUCCESS;
+               goto done;
+            }
+            continue;
+
+         case ANV_FENCE_STATE_SUBMITTED:
+            /* These are the fences we really care about.  Go ahead and wait
+             * on it until we hit a timeout.  NOTE(review): "/ 1000000" is
+             * presumably a ns -> ms conversion; confirm the wait's units. */
+            ret = anv_platform_wait_semaphore(fence->semaphore->current->platform_semaphore,
+                                              _timeout == UINT64_MAX ? UINT64_MAX
+                                                                     : _timeout / 1000000);
+            switch (ret) {
+            case 0:
+               fence->state = ANV_FENCE_STATE_SIGNALED;
+               signaled_fences = true;
+               if (!waitAll)
+                  goto done;
+               break;
+            case -ETIME:
+               result = VK_TIMEOUT;
+               goto done;
+            default:
+               /* We don't know the real error; "done" reports the lost device. */
+               device->lost = true;
+               goto done;
+            }
+         }
+      }
+
+      if (pending_fences && !signaled_fences) {
+         /* If we've hit this then someone decided to vkWaitForFences before
+          * they've actually submitted any of them to a queue.  This is a
+          * fairly pessimal case, so it's ok to lock here and use a standard
+          * pthreads condition variable.
+          */
+         pthread_mutex_lock(&device->mutex);
+
+         /* It's possible that some of the fences have changed state since the
+          * last time we checked.  Now that we have the lock, check for
+          * pending fences again and don't wait if it's changed.
+          */
+         uint32_t now_pending_fences = 0;
+         for (uint32_t i = 0; i < fenceCount; i++) {
+            ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+            if (fence->state == ANV_FENCE_STATE_RESET)
+               now_pending_fences++;
+         }
+         assert(now_pending_fences <= pending_fences);
+
+         if (now_pending_fences == pending_fences) {
+            struct timespec before;
+            clock_gettime(CLOCK_MONOTONIC, &before);
+            /* Absolute deadline = now + remaining timeout, nsec carried into sec. */
+            uint32_t abs_nsec = before.tv_nsec + timeout % NSEC_PER_SEC;
+            uint64_t abs_sec = before.tv_sec + (abs_nsec / NSEC_PER_SEC) +
+                               (timeout / NSEC_PER_SEC);
+            abs_nsec %= NSEC_PER_SEC;
+
+            /* Avoid roll-over in tv_sec on 32-bit systems if the user
+             * provided timeout is UINT64_MAX
+             */
+            struct timespec abstime;
+            abstime.tv_nsec = abs_nsec;
+            abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));
+
+            ret = pthread_cond_timedwait(&device->queue_submit,
+                                         &device->mutex, &abstime);
+            assert(ret != EINVAL);
+            /* Charge the time actually spent waiting against the remaining timeout. */
+            struct timespec after;
+            clock_gettime(CLOCK_MONOTONIC, &after);
+            uint64_t time_elapsed =
+               ((uint64_t)after.tv_sec * NSEC_PER_SEC + after.tv_nsec) -
+               ((uint64_t)before.tv_sec * NSEC_PER_SEC + before.tv_nsec);
+
+            if (time_elapsed >= timeout) {
+               pthread_mutex_unlock(&device->mutex);
+               result = VK_TIMEOUT;
+               goto done;
+            }
+
+            timeout -= time_elapsed;
+         }
+
+         pthread_mutex_unlock(&device->mutex);
+      }
+   }
+
+done:
+   if (unlikely(device->lost))
+      return VK_ERROR_DEVICE_LOST;
+
+   return result;
+}
+
+// Queue semaphore functions
+/* Creates a semaphore backed by a newly created platform semaphore. */
+VkResult anv_CreateSemaphore(
+    VkDevice                                    _device,
+    const VkSemaphoreCreateInfo*                pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSemaphore*                                pSemaphore)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_semaphore *semaphore;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);
+   /* Created first; destroyed again below if the host allocation fails. */
+   anv_platform_semaphore_t platform_semaphore;
+   if (anv_platform_create_semaphore(device, &platform_semaphore) != 0)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); /* any failure maps to host OOM */
+
+   semaphore = vk_alloc2(&device->alloc, pAllocator, sizeof(*semaphore), 8,
+                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (semaphore == NULL) {
+      anv_platform_destroy_semaphore(device, platform_semaphore);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   /* The DRM execbuffer ioctl always executes in-order, even between
+    * different rings. As such, a dummy no-op semaphore is a perfectly
+    * valid implementation.
+    */
+   semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DUMMY;
+   semaphore->temporary.type = ANV_SEMAPHORE_TYPE_NONE;
+   /* Only the permanent state gets a platform semaphore; it is also current. */
+   semaphore->permanent.platform_semaphore = platform_semaphore;
+   semaphore->temporary.platform_semaphore = 0;
+   semaphore->current = &semaphore->permanent;
+
+   *pSemaphore = anv_semaphore_to_handle(semaphore);
+
+   return VK_SUCCESS;
+}
+/* Destroys a semaphore together with the platform semaphores it holds. */
+void anv_DestroySemaphore(
+    VkDevice                                    _device,
+    VkSemaphore                                 _semaphore,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);
+
+   if (semaphore == NULL)
+      return;
+   /* A temporary platform semaphore is present only when its handle is non-zero. */
+   if (semaphore->temporary.platform_semaphore)
+      anv_platform_destroy_semaphore(device, semaphore->temporary.platform_semaphore);
+   /* The permanent platform semaphore is expected to always be present. */
+   assert(semaphore->permanent.platform_semaphore);
+   anv_platform_destroy_semaphore(device, semaphore->permanent.platform_semaphore);
+
+   vk_free2(&device->alloc, pAllocator, semaphore);
+}
+/* Reports external semaphore capabilities for the queried handle type. */
+void anv_GetPhysicalDeviceExternalSemaphorePropertiesKHR(
+    VkPhysicalDevice physicalDevice,
+    const VkPhysicalDeviceExternalSemaphoreInfoKHR* pExternalSemaphoreInfo,
+    VkExternalSemaphorePropertiesKHR* pExternalSemaphoreProperties)
+{
+   pExternalSemaphoreProperties->sType = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR;
+   pExternalSemaphoreProperties->pNext = NULL;
+   pExternalSemaphoreProperties->compatibleHandleTypes = 0;
+   pExternalSemaphoreProperties->exportFromImportedHandleTypes =
+       VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR|
+       VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_FUCHSIA_FENCE_BIT_KHR;
+   /* The fields above are filled the same way for every handle type. */
+   switch (pExternalSemaphoreInfo->handleType) {
+   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR:
+   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_FUCHSIA_FENCE_BIT_KHR:
+      pExternalSemaphoreProperties->externalSemaphoreFeatures =
+          VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR |
+          VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT_KHR;
+      break;
+   default: /* any other handle type: no import/export features */
+      pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
+   }
+}
diff --git a/src/intel/vulkan/anv_util.c b/src/intel/vulkan/anv_util.c
index ba91733..4b916e2 100644
--- a/src/intel/vulkan/anv_util.c
+++ b/src/intel/vulkan/anv_util.c
@@ -30,6 +30,7 @@
 
 #include "anv_private.h"
 #include "vk_enum_to_str.h"
+#include "util/debug.h"
 
 /** Log an error message.  */
 void anv_printflike(1, 2)
@@ -95,5 +96,9 @@
       fprintf(stderr, "%s:%d: %s\n", file, line, error_str);
    }
 
+   if (error == VK_ERROR_DEVICE_LOST &&
+       env_var_as_boolean("ANV_ABORT_ON_DEVICE_LOSS", false))
+      abort();
+
    return error;
 }
diff --git a/src/intel/vulkan/anv_wsi.c b/src/intel/vulkan/anv_wsi.c
index baa032e..83b94a9 100644
--- a/src/intel/vulkan/anv_wsi.c
+++ b/src/intel/vulkan/anv_wsi.c
@@ -24,7 +24,7 @@
 #include "anv_private.h"
 #include "wsi_common.h"
 #include "vk_format_info.h"
-#include "util/vk_util.h"
+#include "vk_util.h"
 
 #ifdef VK_USE_PLATFORM_WAYLAND_KHR
 static const struct wsi_callbacks wsi_cbs = {
@@ -139,6 +139,19 @@
    return iface->get_capabilities(surface, pSurfaceCapabilities);
 }
 
+VkResult anv_GetPhysicalDeviceSurfaceCapabilities2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    const VkPhysicalDeviceSurfaceInfo2KHR*      pSurfaceInfo,
+    VkSurfaceCapabilities2KHR*                  pSurfaceCapabilities)
+{
+   ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
+   ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pSurfaceInfo->surface);
+   struct wsi_interface *iface = device->wsi_device.wsi[surface->platform];
+   /* Delegate to the WSI backend selected by the surface's platform. */
+   return iface->get_capabilities2(surface, pSurfaceInfo->pNext,
+                                   pSurfaceCapabilities);
+}
+
 VkResult anv_GetPhysicalDeviceSurfaceFormatsKHR(
     VkPhysicalDevice                            physicalDevice,
     VkSurfaceKHR                                _surface,
@@ -153,6 +166,20 @@
                              pSurfaceFormats);
 }
 
+VkResult anv_GetPhysicalDeviceSurfaceFormats2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    const VkPhysicalDeviceSurfaceInfo2KHR*      pSurfaceInfo,
+    uint32_t*                                   pSurfaceFormatCount,
+    VkSurfaceFormat2KHR*                        pSurfaceFormats)
+{
+   ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
+   ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pSurfaceInfo->surface);
+   struct wsi_interface *iface = device->wsi_device.wsi[surface->platform];
+   /* Delegate to the WSI backend selected by the surface's platform. */
+   return iface->get_formats2(surface, &device->wsi_device, pSurfaceInfo->pNext,
+                              pSurfaceFormatCount, pSurfaceFormats);
+}
+
 VkResult anv_GetPhysicalDeviceSurfacePresentModesKHR(
     VkPhysicalDevice                            physicalDevice,
     VkSurfaceKHR                                _surface,
@@ -168,16 +195,16 @@
 }
 
 static VkResult
-x11_anv_wsi_image_create(VkDevice device_h,
-                         const VkSwapchainCreateInfoKHR *pCreateInfo,
-                         const VkAllocationCallbacks* pAllocator,
-                         bool different_gpu,
-                         bool linear,
-                         VkImage *image_p,
-                         VkDeviceMemory *memory_p,
-                         uint32_t *size,
-                         uint32_t *offset,
-                         uint32_t *row_pitch, int *fd_p)
+anv_wsi_image_create(VkDevice device_h,
+                     const VkSwapchainCreateInfoKHR *pCreateInfo,
+                     const VkAllocationCallbacks* pAllocator,
+                     bool different_gpu,
+                     bool linear,
+                     VkImage *image_p,
+                     VkDeviceMemory *memory_p,
+                     uint32_t *size,
+                     uint32_t *offset,
+                     uint32_t *row_pitch, int *fd_p)
 {
    struct anv_device *device = anv_device_from_handle(device_h);
    VkImage image_h;
@@ -275,10 +302,10 @@
 }
 
 static void
-x11_anv_wsi_image_free(VkDevice device,
-                       const VkAllocationCallbacks* pAllocator,
-                       VkImage image_h,
-                       VkDeviceMemory memory_h)
+anv_wsi_image_free(VkDevice device,
+                   const VkAllocationCallbacks* pAllocator,
+                   VkImage image_h,
+                   VkDeviceMemory memory_h)
 {
    anv_DestroyImage(device, image_h, pAllocator);
 
@@ -286,8 +313,8 @@
 }
 
 static const struct wsi_image_fns anv_wsi_image_fns = {
-   .create_wsi_image = x11_anv_wsi_image_create,
-   .free_wsi_image = x11_anv_wsi_image_free,
+   .create_wsi_image = anv_wsi_image_create,
+   .free_wsi_image = anv_wsi_image_free,
 };
 
 VkResult anv_CreateSwapchainKHR(
diff --git a/src/intel/vulkan/anv_wsi_magma.cc b/src/intel/vulkan/anv_wsi_magma.cc
index 904b2b8..1753676 100644
--- a/src/intel/vulkan/anv_wsi_magma.cc
+++ b/src/intel/vulkan/anv_wsi_magma.cc
@@ -146,5 +146,5 @@
 uintptr_t anv_wsi_magma_get_platform_semaphore(VkSemaphore vk_semaphore)
 {
    ANV_FROM_HANDLE(anv_semaphore, semaphore, vk_semaphore);
-   return semaphore->current_platform_semaphore;
+   return semaphore->current->platform_semaphore;
 }
diff --git a/src/intel/vulkan/dev_icd.json.in b/src/intel/vulkan/dev_icd.json.in
index e4e65fa..84ac3d4 100644
--- a/src/intel/vulkan/dev_icd.json.in
+++ b/src/intel/vulkan/dev_icd.json.in
@@ -2,6 +2,6 @@
     "file_format_version": "1.0.0",
     "ICD": {
         "library_path": "@build_libdir@/libvulkan_intel.so",
-        "api_version": "1.0.3"
+        "api_version": "1.0.54"
     }
 }
diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c
index c891a76..fb420d8 100644
--- a/src/intel/vulkan/gen8_cmd_buffer.c
+++ b/src/intel/vulkan/gen8_cmd_buffer.c
@@ -49,10 +49,10 @@
       struct GENX(SF_CLIP_VIEWPORT) sf_clip_viewport = {
          .ViewportMatrixElementm00 = vp->width / 2,
          .ViewportMatrixElementm11 = vp->height / 2,
-         .ViewportMatrixElementm22 = 1.0,
+         .ViewportMatrixElementm22 = vp->maxDepth - vp->minDepth,
          .ViewportMatrixElementm30 = vp->x + vp->width / 2,
          .ViewportMatrixElementm31 = vp->y + vp->height / 2,
-         .ViewportMatrixElementm32 = 0.0,
+         .ViewportMatrixElementm32 = vp->minDepth,
          .XMinClipGuardband = -1.0f,
          .XMaxClipGuardband = 1.0f,
          .YMinClipGuardband = -1.0f,
@@ -104,54 +104,6 @@
 }
 #endif
 
-static void
-__emit_genx_sf_state(struct anv_cmd_buffer *cmd_buffer)
-{
-      uint32_t sf_dw[GENX(3DSTATE_SF_length)];
-      struct GENX(3DSTATE_SF) sf = {
-         GENX(3DSTATE_SF_header),
-         .LineWidth = cmd_buffer->state.dynamic.line_width,
-      };
-      GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf);
-      /* FIXME: gen9.fs */
-      anv_batch_emit_merge(&cmd_buffer->batch, sf_dw,
-                           cmd_buffer->state.pipeline->gen8.sf);
-}
-
-void
-gen9_emit_sf_state(struct anv_cmd_buffer *cmd_buffer);
-
-#if GEN_GEN == 9
-
-void
-gen9_emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
-{
-   __emit_genx_sf_state(cmd_buffer);
-}
-
-#endif
-
-#if GEN_GEN == 8
-
-static void
-__emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
-{
-   if (cmd_buffer->device->info.is_cherryview)
-      gen9_emit_sf_state(cmd_buffer);
-   else
-      __emit_genx_sf_state(cmd_buffer);
-}
-
-#else
-
-static void
-__emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
-{
-   __emit_genx_sf_state(cmd_buffer);
-}
-
-#endif
-
 void
 genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
 {
@@ -431,7 +383,22 @@
 
    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
                                   ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)) {
-      __emit_sf_state(cmd_buffer);
+      uint32_t sf_dw[GENX(3DSTATE_SF_length)];
+      struct GENX(3DSTATE_SF) sf = {
+         GENX(3DSTATE_SF_header),
+      };
+#if GEN_GEN == 8 /* on gen8, Cherryview takes the width in CHVLineWidth instead */
+      if (cmd_buffer->device->info.is_cherryview) {
+         sf.CHVLineWidth = cmd_buffer->state.dynamic.line_width;
+      } else {
+         sf.LineWidth = cmd_buffer->state.dynamic.line_width;
+      }
+#else
+      sf.LineWidth = cmd_buffer->state.dynamic.line_width;
+#endif
+      GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf);
+      anv_batch_emit_merge(&cmd_buffer->batch, sf_dw,
+                           cmd_buffer->state.pipeline->gen8.sf);
    }
 
    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
@@ -467,7 +434,7 @@
          .BlendConstantColorBlue = cmd_buffer->state.dynamic.blend_constants[2],
          .BlendConstantColorAlpha = cmd_buffer->state.dynamic.blend_constants[3],
          .StencilReferenceValue = d->stencil_reference.front & 0xff,
-         .BackFaceStencilReferenceValue = d->stencil_reference.back & 0xff,
+         .BackfaceStencilReferenceValue = d->stencil_reference.back & 0xff,
       };
       GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc);
 
@@ -512,19 +479,19 @@
    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
       struct anv_state cc_state =
          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
-                                            GEN9_COLOR_CALC_STATE_length * 4,
+                                            GENX(COLOR_CALC_STATE_length) * 4,
                                             64);
-      struct GEN9_COLOR_CALC_STATE cc = {
+      struct GENX(COLOR_CALC_STATE) cc = {
          .BlendConstantColorRed = cmd_buffer->state.dynamic.blend_constants[0],
          .BlendConstantColorGreen = cmd_buffer->state.dynamic.blend_constants[1],
          .BlendConstantColorBlue = cmd_buffer->state.dynamic.blend_constants[2],
          .BlendConstantColorAlpha = cmd_buffer->state.dynamic.blend_constants[3],
       };
-      GEN9_COLOR_CALC_STATE_pack(NULL, cc_state.map, &cc);
+      GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc);
 
       anv_state_flush(cmd_buffer->device, cc_state);
 
-      anv_batch_emit(&cmd_buffer->batch, GEN9_3DSTATE_CC_STATE_POINTERS, ccp) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
          ccp.ColorCalcStatePointer = cc_state.offset;
          ccp.ColorCalcStatePointerValid = true;
       }
@@ -535,10 +502,10 @@
                                   ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
                                   ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
                                   ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
-      uint32_t dwords[GEN9_3DSTATE_WM_DEPTH_STENCIL_length];
+      uint32_t dwords[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
       struct anv_dynamic_state *d = &cmd_buffer->state.dynamic;
-      struct GEN9_3DSTATE_WM_DEPTH_STENCIL wm_depth_stencil = {
-         GEN9_3DSTATE_WM_DEPTH_STENCIL_header,
+      struct GENX(3DSTATE_WM_DEPTH_STENCIL) wm_depth_stencil = {
+         GENX(3DSTATE_WM_DEPTH_STENCIL_header),
 
          .StencilTestMask = d->stencil_compare_mask.front & 0xff,
          .StencilWriteMask = d->stencil_write_mask.front & 0xff,
@@ -553,7 +520,7 @@
             (d->stencil_write_mask.front || d->stencil_write_mask.back) &&
             pipeline->writes_stencil,
       };
-      GEN9_3DSTATE_WM_DEPTH_STENCIL_pack(NULL, dwords, &wm_depth_stencil);
+      GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dwords, &wm_depth_stencil);
 
       anv_batch_emit_merge(&cmd_buffer->batch, dwords,
                            pipeline->gen9.wm_depth_stencil);
@@ -642,7 +609,7 @@
       pc.DestinationAddressType  = DAT_PPGTT,
       pc.PostSyncOperation       = WriteImmediateData,
       pc.Address = (struct anv_address) {
-         &cmd_buffer->device->dynamic_state_block_pool.bo,
+         &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
          event->state.offset
       };
       pc.ImmediateData           = VK_EVENT_SET;
@@ -666,7 +633,7 @@
       pc.DestinationAddressType  = DAT_PPGTT;
       pc.PostSyncOperation       = WriteImmediateData;
       pc.Address = (struct anv_address) {
-         &cmd_buffer->device->dynamic_state_block_pool.bo,
+         &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
          event->state.offset
       };
       pc.ImmediateData           = VK_EVENT_RESET;
@@ -695,7 +662,7 @@
          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD,
          sem.SemaphoreDataDword  = VK_EVENT_SET,
          sem.SemaphoreAddress = (struct anv_address) {
-            &cmd_buffer->device->dynamic_state_block_pool.bo,
+            &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
             event->state.offset
          };
       }
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index 7f22b67..f041fc7 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -132,27 +132,42 @@
       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64);
 
    *addr = (struct blorp_address) {
-      .buffer = &cmd_buffer->device->dynamic_state_block_pool.bo,
+      .buffer = &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
       .offset = vb_state.offset,
    };
 
    return vb_state.map;
 }
 
+#if GEN_GEN >= 8
+static struct blorp_address
+blorp_get_workaround_page(struct blorp_batch *batch)
+{
+   struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+   /* Address of the device's workaround BO; unnamed fields are zero-initialized. */
+   return (struct blorp_address) {
+      .buffer = &cmd_buffer->device->workaround_bo,
+   };
+}
+#endif
+
 static void
 blorp_flush_range(struct blorp_batch *batch, void *start, size_t size)
 {
    struct anv_device *device = batch->blorp->driver_ctx;
    if (!device->info.has_llc)
-      anv_flush_range(start, size);
+      gen_flush_range(start, size);
 }
 
 static void
-blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size)
+blorp_emit_urb_config(struct blorp_batch *batch,
+                      unsigned vs_entry_size, unsigned sf_entry_size)
 {
    struct anv_device *device = batch->blorp->driver_ctx;
    struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
 
+   assert(sf_entry_size == 0);
+
    const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 };
 
    genX(emit_urb_setup)(device, &cmd_buffer->batch,
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 312a43e..f342717 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -26,6 +26,7 @@
 
 #include "anv_private.h"
 #include "vk_format_info.h"
+#include "vk_util.h"
 
 #include "common/gen_l3_config.h"
 #include "genxml/gen_macros.h"
@@ -50,6 +51,17 @@
    }
 }
 
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+static void /* emits one MI_LOAD_REGISTER_REG command into batch */
+emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src)
+{
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
+      lrr.SourceRegisterAddress        = src; /* note: dst precedes src in the args */
+      lrr.DestinationRegisterAddress   = dst;
+   }
+}
+#endif
+
 void
 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
 {
@@ -79,7 +91,7 @@
       sba.SurfaceStateBaseAddressModifyEnable = true;
 
       sba.DynamicStateBaseAddress =
-         (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
+         (struct anv_address) { &device->dynamic_state_pool.block_pool.bo, 0 };
       sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
       sba.DynamicStateBaseAddressModifyEnable = true;
 
@@ -88,7 +100,7 @@
       sba.IndirectObjectBaseAddressModifyEnable = true;
 
       sba.InstructionBaseAddress =
-         (struct anv_address) { &device->instruction_block_pool.bo, 0 };
+         (struct anv_address) { &device->instruction_state_pool.block_pool.bo, 0 };
       sba.InstructionMemoryObjectControlState = GENX(MOCS);
       sba.InstructionBaseAddressModifyEnable = true;
 
@@ -167,17 +179,20 @@
 }
 
 static void
-add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer,
-                      const struct anv_image_view *iview,
-                      enum isl_aux_usage aux_usage,
-                      struct anv_state state)
+add_image_relocs(struct anv_cmd_buffer * const cmd_buffer,
+                 const struct anv_image * const image,
+                 const VkImageAspectFlags aspect_mask,
+                 const enum isl_aux_usage aux_usage,
+                 const struct anv_state state)
 {
    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
+   const uint32_t surf_offset = image->offset +
+      anv_image_get_surface_for_aspect_mask(image, aspect_mask)->offset;
 
-   add_surface_state_reloc(cmd_buffer, state, iview->bo, iview->offset);
+   add_surface_state_reloc(cmd_buffer, state, image->bo, surf_offset);
 
    if (aux_usage != ISL_AUX_USAGE_NONE) {
-      uint32_t aux_offset = iview->offset + iview->image->aux_surface.offset;
+      uint32_t aux_offset = image->offset + image->aux_surface.offset;
 
       /* On gen7 and prior, the bottom 12 bits of the MCS base address are
        * used to store other information.  This should be ok, however, because
@@ -191,7 +206,7 @@
          anv_reloc_list_add(&cmd_buffer->surface_relocs,
                             &cmd_buffer->pool->alloc,
                             state.offset + isl_dev->ss.aux_addr_offset,
-                            iview->bo, aux_offset);
+                            image->bo, aux_offset);
       if (result != VK_SUCCESS)
          anv_batch_set_error(&cmd_buffer->batch, result);
    }
@@ -216,13 +231,19 @@
 }
 
 static void
-color_attachment_compute_aux_usage(struct anv_device *device,
-                                   struct anv_attachment_state *att_state,
-                                   struct anv_image_view *iview,
-                                   VkRect2D render_area,
+color_attachment_compute_aux_usage(struct anv_device * device,
+                                   struct anv_cmd_state * cmd_state,
+                                   uint32_t att, VkRect2D render_area,
                                    union isl_color_value *fast_clear_color)
 {
-   if (iview->image->aux_surface.isl.size == 0) {
+   struct anv_attachment_state *att_state = &cmd_state->attachments[att];
+   struct anv_image_view *iview = cmd_state->framebuffer->attachments[att];
+
+   if (iview->isl.base_array_layer >=
+       anv_image_aux_layers(iview->image, iview->isl.base_level)) {
+      /* There is no aux buffer which corresponds to the level and layer(s)
+       * being accessed.
+       */
       att_state->aux_usage = ISL_AUX_USAGE_NONE;
       att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
       att_state->fast_clear = false;
@@ -232,12 +253,49 @@
       att_state->input_aux_usage = ISL_AUX_USAGE_MCS;
       att_state->fast_clear = false;
       return;
+   } else if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E) {
+      att_state->aux_usage = ISL_AUX_USAGE_CCS_E;
+      att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E;
+   } else {
+      att_state->aux_usage = ISL_AUX_USAGE_CCS_D;
+      /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode:
+       *
+       *    "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D
+       *    setting is only allowed if Surface Format supported for Fast
+       *    Clear. In addition, if the surface is bound to the sampling
+       *    engine, Surface Format must be supported for Render Target
+       *    Compression for surfaces bound to the sampling engine."
+       *
+       * In other words, we can only sample from a fast-cleared image if it
+       * also supports color compression.
+       */
+      if (isl_format_supports_ccs_e(&device->info, iview->isl.format)) {
+         att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D;
+
+         /* While fast-clear resolves and partial resolves are fairly cheap in the
+          * case where you render to most of the pixels, full resolves are not
+          * because they potentially involve reading and writing the entire
+          * framebuffer.  If we can't texture with CCS_E, we should leave it off and
+          * limit ourselves to fast clears.
+          */
+         if (cmd_state->pass->attachments[att].first_subpass_layout ==
+             VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
+            anv_perf_warn("Not temporarily enabling CCS_E.");
+         }
+      } else {
+         att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
+      }
    }
 
    assert(iview->image->aux_surface.isl.usage & ISL_SURF_USAGE_CCS_BIT);
 
    att_state->clear_color_is_zero_one =
       color_is_zero_one(att_state->clear_value.color, iview->isl.format);
+   att_state->clear_color_is_zero =
+      att_state->clear_value.color.uint32[0] == 0 &&
+      att_state->clear_value.color.uint32[1] == 0 &&
+      att_state->clear_value.color.uint32[2] == 0 &&
+      att_state->clear_value.color.uint32[3] == 0;
 
    if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
       /* Start off assuming fast clears are possible */
@@ -253,21 +311,36 @@
           render_area.extent.height != iview->extent.height)
          att_state->fast_clear = false;
 
-      if (GEN_GEN <= 7) {
-         /* On gen7, we can't do multi-LOD or multi-layer fast-clears.  We
-          * technically can, but it comes with crazy restrictions that we
-          * don't want to deal with now.
-          */
-         if (iview->isl.base_level > 0 ||
-             iview->isl.base_array_layer > 0 ||
-             iview->isl.array_len > 1)
-            att_state->fast_clear = false;
-      }
-
       /* On Broadwell and earlier, we can only handle 0/1 clear colors */
       if (GEN_GEN <= 8 && !att_state->clear_color_is_zero_one)
          att_state->fast_clear = false;
 
+      /* We allow fast clears when all aux layers of the miplevel are targeted.
+       * See add_fast_clear_state_buffer() for more information. Also, because
+       * we only either do a fast clear or a normal clear and not both, this
+       * complies with the gen7 restriction of not fast-clearing multiple
+       * layers.
+       */
+      if (cmd_state->framebuffer->layers !=
+          anv_image_aux_layers(iview->image, iview->isl.base_level)) {
+         att_state->fast_clear = false;
+         if (GEN_GEN == 7) {
+            anv_perf_warn("Not fast-clearing the first layer in "
+                          "a multi-layer fast clear.");
+         }
+      }
+
+      /* We only allow fast clears in the GENERAL layout if the auxiliary
+       * buffer is always enabled and the fast-clear value is all 0's. See
+       * add_fast_clear_state_buffer() for more information.
+       */
+      if (cmd_state->pass->attachments[att].first_subpass_layout ==
+          VK_IMAGE_LAYOUT_GENERAL &&
+          (!att_state->clear_color_is_zero ||
+           iview->image->aux_usage == ISL_AUX_USAGE_NONE)) {
+         att_state->fast_clear = false;
+      }
+
       if (att_state->fast_clear) {
          memcpy(fast_clear_color->u32, att_state->clear_value.color.uint32,
                 sizeof(fast_clear_color->u32));
@@ -275,41 +348,6 @@
    } else {
       att_state->fast_clear = false;
    }
-
-   /**
-    * TODO: Consider using a heuristic to determine if temporarily enabling
-    * CCS_E for this image view would be beneficial.
-    *
-    * While fast-clear resolves and partial resolves are fairly cheap in the
-    * case where you render to most of the pixels, full resolves are not
-    * because they potentially involve reading and writing the entire
-    * framebuffer.  If we can't texture with CCS_E, we should leave it off and
-    * limit ourselves to fast clears.
-    */
-   if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E) {
-      att_state->aux_usage = ISL_AUX_USAGE_CCS_E;
-      att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E;
-   } else if (att_state->fast_clear) {
-      att_state->aux_usage = ISL_AUX_USAGE_CCS_D;
-      /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode:
-       *
-       *    "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D
-       *    setting is only allowed if Surface Format supported for Fast
-       *    Clear. In addition, if the surface is bound to the sampling
-       *    engine, Surface Format must be supported for Render Target
-       *    Compression for surfaces bound to the sampling engine."
-       *
-       * In other words, we can only sample from a fast-cleared image if it
-       * also supports color compression.
-       */
-      if (isl_format_supports_ccs_e(&device->info, iview->isl.format))
-         att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D;
-      else
-         att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
-   } else {
-      att_state->aux_usage = ISL_AUX_USAGE_NONE;
-      att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
-   }
 }
 
 static bool
@@ -369,29 +407,400 @@
       anv_gen8_hiz_op_resolve(cmd_buffer, image, hiz_op);
 }
 
+enum fast_clear_state_field {
+   FAST_CLEAR_STATE_FIELD_CLEAR_COLOR,
+   FAST_CLEAR_STATE_FIELD_NEEDS_RESOLVE,
+};
+
+static inline uint32_t
+get_fast_clear_state_offset(const struct anv_device *device,
+                            const struct anv_image *image,
+                            unsigned level, enum fast_clear_state_field field)
+{
+   assert(device && image);
+   assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+   assert(level < anv_image_aux_levels(image));
+   uint32_t offset = image->offset + image->aux_surface.offset +
+                     image->aux_surface.isl.size +
+                     anv_fast_clear_state_entry_size(device) * level;
+
+   switch (field) {
+   case FAST_CLEAR_STATE_FIELD_NEEDS_RESOLVE:
+      offset += device->isl_dev.ss.clear_value_size;
+      /* Fall-through */
+   case FAST_CLEAR_STATE_FIELD_CLEAR_COLOR:
+      break;
+   }
+
+   assert(offset < image->offset + image->size);
+   return offset;
+}
+
+#define MI_PREDICATE_SRC0  0x2400
+#define MI_PREDICATE_SRC1  0x2408
+
+/* Manages the state of a color image subresource to ensure resolves are
+ * performed properly.
+ */
+static void
+genX(set_image_needs_resolve)(struct anv_cmd_buffer *cmd_buffer,
+                        const struct anv_image *image,
+                        unsigned level, bool needs_resolve)
+{
+   assert(cmd_buffer && image);
+   assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+   assert(level < anv_image_aux_levels(image));
+
+   const uint32_t resolve_flag_offset =
+      get_fast_clear_state_offset(cmd_buffer->device, image, level,
+                                  FAST_CLEAR_STATE_FIELD_NEEDS_RESOLVE);
+
+   /* The HW docs say that there is no way to guarantee the completion of
+    * the following command. We use it nevertheless because it shows no
+    * issues in testing and is currently being used in the GL driver.
+    */
+   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
+      sdi.Address = (struct anv_address) { image->bo, resolve_flag_offset };
+      sdi.ImmediateData = needs_resolve;
+   }
+}
+
+static void
+genX(load_needs_resolve_predicate)(struct anv_cmd_buffer *cmd_buffer,
+                                   const struct anv_image *image,
+                                   unsigned level)
+{
+   assert(cmd_buffer && image);
+   assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+   assert(level < anv_image_aux_levels(image));
+
+   const uint32_t resolve_flag_offset =
+      get_fast_clear_state_offset(cmd_buffer->device, image, level,
+                                  FAST_CLEAR_STATE_FIELD_NEEDS_RESOLVE);
+
+   /* Make the pending predicated resolve a no-op if one is not needed.
+    * predicate = do_resolve = resolve_flag != 0;
+    */
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1    , 0);
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0    , 0);
+   emit_lrm(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4,
+            image->bo, resolve_flag_offset);
+   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOADINV;
+      mip.CombineOperation = COMBINE_SET;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
+}
+
+static void
+init_fast_clear_state_entry(struct anv_cmd_buffer *cmd_buffer,
+                            const struct anv_image *image,
+                            unsigned level)
+{
+   assert(cmd_buffer && image);
+   assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+   assert(level < anv_image_aux_levels(image));
+
+   /* The resolve flag should be updated to signify that fast-clear/compression
+    * data needs to be removed when leaving the undefined layout. Such data
+    * may need to be removed if it would cause accesses to the color buffer
+    * to return incorrect data. The fast clear data in CCS_D buffers should
+    * be removed because CCS_D isn't enabled all the time.
+    */
+   genX(set_image_needs_resolve)(cmd_buffer, image, level,
+                                 image->aux_usage == ISL_AUX_USAGE_NONE);
+
+   /* The fast clear value dword(s) will be copied into a surface state object.
+    * Ensure that the restrictions of the fields in the dword(s) are followed.
+    *
+    * CCS buffers on SKL+ can have any value set for the clear colors.
+    */
+   if (image->samples == 1 && GEN_GEN >= 9)
+      return;
+
+   /* Other combinations of auxiliary buffers and platforms require specific
+    * values in the clear value dword(s).
+    */
+   unsigned i = 0;
+   for (; i < cmd_buffer->device->isl_dev.ss.clear_value_size; i += 4) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
+         const uint32_t entry_offset =
+            get_fast_clear_state_offset(cmd_buffer->device, image, level,
+                                        FAST_CLEAR_STATE_FIELD_CLEAR_COLOR);
+         sdi.Address = (struct anv_address) { image->bo, entry_offset + i };
+
+         if (GEN_GEN >= 9) {
+            /* MCS buffers on SKL+ can only have 1/0 clear colors. */
+            assert(image->aux_usage == ISL_AUX_USAGE_MCS);
+            sdi.ImmediateData = 0;
+         } else if (GEN_VERSIONx10 >= 75) {
+            /* Pre-SKL, the dword containing the clear values also contains
+             * other fields, so we need to initialize those fields to match the
+             * values that would be in a color attachment.
+             */
+            assert(i == 0);
+            sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
+                                ISL_CHANNEL_SELECT_GREEN << 22 |
+                                ISL_CHANNEL_SELECT_BLUE  << 19 |
+                                ISL_CHANNEL_SELECT_ALPHA << 16;
+         }  else if (GEN_VERSIONx10 == 70) {
+            /* On IVB, the dword containing the clear values also contains
+             * other fields that must be zero or can be zero.
+             */
+            assert(i == 0);
+            sdi.ImmediateData = 0;
+         }
+      }
+   }
+}
+
+/* Copy the fast-clear value dword(s) between a surface state object and an
+ * image's fast clear state buffer.
+ */
+static void
+genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
+                             struct anv_state surface_state,
+                             const struct anv_image *image,
+                             unsigned level,
+                             bool copy_from_surface_state)
+{
+   assert(cmd_buffer && image);
+   assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+   assert(level < anv_image_aux_levels(image));
+
+   struct anv_bo *ss_bo =
+      &cmd_buffer->device->surface_state_pool.block_pool.bo;
+   uint32_t ss_clear_offset = surface_state.offset +
+      cmd_buffer->device->isl_dev.ss.clear_value_offset;
+   uint32_t entry_offset =
+      get_fast_clear_state_offset(cmd_buffer->device, image, level,
+                                  FAST_CLEAR_STATE_FIELD_CLEAR_COLOR);
+   unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
+
+   if (copy_from_surface_state) {
+      genX(cmd_buffer_mi_memcpy)(cmd_buffer, image->bo, entry_offset,
+                                 ss_bo, ss_clear_offset, copy_size);
+   } else {
+      genX(cmd_buffer_mi_memcpy)(cmd_buffer, ss_bo, ss_clear_offset,
+                                 image->bo, entry_offset, copy_size);
+
+      /* Updating a surface state object may require that the state cache be
+       * invalidated. From the SKL PRM, Shared Functions -> State -> State
+       * Caching:
+       *
+       *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
+       *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
+       *    modified [...], the L1 state cache must be invalidated to ensure
+       *    the new surface or sampler state is fetched from system memory.
+       *
+       * In testing, SKL doesn't actually seem to need this, but HSW does.
+       */
+      cmd_buffer->state.pending_pipe_bits |=
+         ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
+   }
+}
+
+/**
+ * @brief Transitions a color buffer from one layout to another.
+ *
+ * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
+ * more information.
+ *
+ * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
+ * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
+ *                    this represents the maximum layers to transition at each
+ *                    specified miplevel.
+ */
 static void
 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_image *image,
+                        const uint32_t base_level, uint32_t level_count,
+                        uint32_t base_layer, uint32_t layer_count,
                         VkImageLayout initial_layout,
-                        VkImageLayout final_layout,
-                        const struct isl_view *view,
-                        const VkImageSubresourceRange *subresourceRange)
+                        VkImageLayout final_layout)
 {
-   if (image->aux_usage != ISL_AUX_USAGE_CCS_E)
-      return;
+   /* Validate the inputs. */
+   assert(cmd_buffer);
+   assert(image && image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+   /* These values aren't supported for simplicity's sake. */
+   assert(level_count != VK_REMAINING_MIP_LEVELS &&
+          layer_count != VK_REMAINING_ARRAY_LAYERS);
+   /* Ensure the subresource range is valid. */
+   uint64_t last_level_num = base_level + level_count;
+   const uint32_t max_depth = anv_minify(image->extent.depth, base_level);
+   const uint32_t image_layers = MAX2(image->array_size, max_depth);
+   assert((uint64_t)base_layer + layer_count  <= image_layers);
+   assert(last_level_num <= image->levels);
+   /* The spec disallows these final layouts. */
+   assert(final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
+          final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED);
 
-   if (initial_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
-       initial_layout != VK_IMAGE_LAYOUT_PREINITIALIZED)
-      return;
-
-#if GEN_GEN >= 9
-   /* We're transitioning from an undefined layout so it doesn't really matter
-    * what data ends up in the color buffer.  We do, however, need to ensure
-    * that the CCS has valid data in it.  One easy way to do that is to
-    * fast-clear the specified range.
+   /* No work is necessary if the layout stays the same or if this subresource
+    * range lacks auxiliary data.
     */
-   anv_image_ccs_clear(cmd_buffer, image, view, subresourceRange);
-#endif
+   if (initial_layout == final_layout ||
+       base_layer >= anv_image_aux_layers(image, base_level))
+      return;
+
+   /* A transition of a 3D subresource works on all slices at a time. */
+   if (image->type == VK_IMAGE_TYPE_3D) {
+      base_layer = 0;
+      layer_count = anv_minify(image->extent.depth, base_level);
+   }
+
+   /* We're interested in the subresource range subset that has aux data. */
+   level_count = MIN2(level_count, anv_image_aux_levels(image) - base_level);
+   layer_count = MIN2(layer_count,
+                      anv_image_aux_layers(image, base_level) - base_layer);
+   last_level_num = base_level + level_count;
+
+   /* Record whether or not the layout is undefined. Pre-initialized images
+    * with auxiliary buffers have a non-linear layout and are thus undefined.
+    */
+   assert(image->tiling == VK_IMAGE_TILING_OPTIMAL);
+   const bool undef_layout = initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
+                             initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
+
+   /* Do preparatory work before the resolve operation or return early if no
+    * resolve is actually needed.
+    */
+   if (undef_layout) {
+      /* A subresource in the undefined layout may have been aliased and
+       * populated with any arrangement of bits. Therefore, we must initialize
+       * the related aux buffer and clear buffer entry with desirable values.
+       *
+       * Initialize the relevant clear buffer entries.
+       */
+      for (unsigned level = base_level; level < last_level_num; level++)
+         init_fast_clear_state_entry(cmd_buffer, image, level);
+
+      /* Initialize the aux buffers to enable correct rendering. This operation
+       * requires up to two steps: one to rid the aux buffer of data that may
+       * cause GPU hangs, and another to ensure that writes done without aux
+       * will be visible to reads done with aux.
+       *
+       * Having an aux buffer with invalid data is possible for CCS buffers
+       * SKL+ and for MCS buffers with certain sample counts (2x and 8x). One
+       * easy way to get to a valid state is to fast-clear the specified range.
+       *
+       * Even for MCS buffers that have sample counts that don't require
+       * certain bits to be reserved (4x and 16x), we're unsure if the hardware
+       * will be okay with the sample mappings given by the undefined buffer.
+       * We don't have any data to show that this is a problem, but we want to
+       * avoid causing difficult-to-debug problems.
+       */
+      if ((GEN_GEN >= 9 && image->samples == 1) || image->samples > 1) {
+         if (image->samples == 4 || image->samples == 16) {
+            anv_perf_warn("Doing a potentially unnecessary fast-clear to "
+                          "define an MCS buffer.");
+         }
+
+         anv_image_fast_clear(cmd_buffer, image, base_level, level_count,
+                              base_layer, layer_count);
+      }
+      /* At this point, some elements of the CCS buffer may have the fast-clear
+       * bit-arrangement. As the user writes to a subresource, we need to have
+       * the associated CCS elements enter the ambiguated state. This enables
+       * reads (implicit or explicit) to reflect the user-written data instead
+       * of the clear color. The only time such elements will not change their
+       * state as described above, is in a final layout that doesn't have CCS
+       * enabled. In this case, we must force the associated CCS buffers of the
+       * specified range to enter the ambiguated state in advance.
+       */
+      if (image->samples == 1 && image->aux_usage != ISL_AUX_USAGE_CCS_E &&
+          final_layout != VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
+         /* The CCS_D buffer may not be enabled in the final layout. Continue
+          * executing this function to perform a resolve.
+          */
+          anv_perf_warn("Performing an additional resolve for CCS_D layout "
+                        "transition. Consider always leaving it on or "
+                        "performing an ambiguation pass.");
+      } else {
+         /* Writes in the final layout will be aware of the auxiliary buffer.
+          * In addition, the clear buffer entries and the auxiliary buffers
+          * have been populated with values that will result in correct
+          * rendering.
+          */
+         return;
+      }
+   } else if (initial_layout != VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
+      /* Resolves are only necessary if the subresource may contain blocks
+       * fast-cleared to values unsupported in other layouts. This only occurs
+       * if the initial layout is COLOR_ATTACHMENT_OPTIMAL.
+       */
+      return;
+   } else if (image->samples > 1) {
+      /* MCS buffers don't need resolving. */
+      return;
+   }
+
+   /* Perform a resolve to synchronize data between the main and aux buffer.
+    * Before we begin, we must satisfy the cache flushing requirement specified
+    * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
+    *
+    *    Any transition from any value in {Clear, Render, Resolve} to a
+    *    different value in {Clear, Render, Resolve} requires end of pipe
+    *    synchronization.
+    *
+    * We perform a flush of the write cache before and after the clear and
+    * resolve operations to meet this requirement.
+    *
+    * Unlike other drawing, fast clear operations are not properly
+    * synchronized. The first PIPE_CONTROL here likely ensures that the
+    * contents of the previous render or clear hit the render target before we
+    * resolve and the second likely ensures that the resolve is complete before
+    * we do any more rendering or clearing.
+    */
+   cmd_buffer->state.pending_pipe_bits |=
+      ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT;
+
+   for (uint32_t level = base_level; level < last_level_num; level++) {
+
+      /* The number of layers changes at each 3D miplevel. */
+      if (image->type == VK_IMAGE_TYPE_3D) {
+         layer_count = MIN2(layer_count, anv_image_aux_layers(image, level));
+      }
+
+      genX(load_needs_resolve_predicate)(cmd_buffer, image, level);
+
+      /* Create a surface state with the right clear color and perform the
+       * resolve.
+       */
+      struct anv_state surface_state =
+         anv_cmd_buffer_alloc_surface_state(cmd_buffer);
+      isl_surf_fill_state(&cmd_buffer->device->isl_dev, surface_state.map,
+                          .surf = &image->color_surface.isl,
+                          .view = &(struct isl_view) {
+                              .usage = ISL_SURF_USAGE_RENDER_TARGET_BIT,
+                              .format = image->color_surface.isl.format,
+                              .swizzle = ISL_SWIZZLE_IDENTITY,
+                              .base_level = level,
+                              .levels = 1,
+                              .base_array_layer = base_layer,
+                              .array_len = layer_count,
+                           },
+                          .aux_surf = &image->aux_surface.isl,
+                          .aux_usage = image->aux_usage == ISL_AUX_USAGE_NONE ?
+                                       ISL_AUX_USAGE_CCS_D : image->aux_usage,
+                          .mocs = cmd_buffer->device->default_mocs);
+      add_image_relocs(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
+                       image->aux_usage == ISL_AUX_USAGE_CCS_E ?
+                       ISL_AUX_USAGE_CCS_E : ISL_AUX_USAGE_CCS_D,
+                       surface_state);
+      anv_state_flush(cmd_buffer->device, surface_state);
+      genX(copy_fast_clear_dwords)(cmd_buffer, surface_state, image, level,
+                                   false /* copy to ss */);
+      anv_ccs_resolve(cmd_buffer, surface_state, image, level, layer_count,
+                      image->aux_usage == ISL_AUX_USAGE_CCS_E ?
+                      BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL :
+                      BLORP_FAST_CLEAR_OP_RESOLVE_FULL);
+
+      genX(set_image_needs_resolve)(cmd_buffer, image, level, false);
+   }
+
+   cmd_buffer->state.pending_pipe_bits |=
+      ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT;
 }
 
 /**
@@ -407,19 +816,18 @@
 
    vk_free(&cmd_buffer->pool->alloc, state->attachments);
 
-   if (pass->attachment_count == 0) {
+   if (pass->attachment_count > 0) {
+      state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
+                                    pass->attachment_count *
+                                         sizeof(state->attachments[0]),
+                                    8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+      if (state->attachments == NULL) {
+         /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
+         return anv_batch_set_error(&cmd_buffer->batch,
+                                    VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
+   } else {
       state->attachments = NULL;
-      return VK_SUCCESS;
-   }
-
-   state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
-                                 pass->attachment_count *
-                                      sizeof(state->attachments[0]),
-                                 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-   if (state->attachments == NULL) {
-      /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
-      return anv_batch_set_error(&cmd_buffer->batch,
-                                 VK_ERROR_OUT_OF_HOST_MEMORY);
    }
 
    /* Reserve one for the NULL state. */
@@ -514,8 +922,7 @@
          union isl_color_value clear_color = { .u32 = { 0, } };
          if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
             color_attachment_compute_aux_usage(cmd_buffer->device,
-                                               &state->attachments[i],
-                                               iview, begin->renderArea,
+                                               state, i, begin->renderArea,
                                                &clear_color);
 
             struct isl_view view = iview->isl;
@@ -532,9 +939,9 @@
                                        ? cmd_buffer->device->uncached_mocs
                                        : cmd_buffer->device->default_mocs);
 
-            add_image_view_relocs(cmd_buffer, iview,
-                                  state->attachments[i].aux_usage,
-                                  state->attachments[i].color_rt_state);
+            add_image_relocs(cmd_buffer, iview->image, iview->aspect_mask,
+                             state->attachments[i].aux_usage,
+                             state->attachments[i].color_rt_state);
          } else {
             /* This field will be initialized after the first subpass
              * transition.
@@ -556,9 +963,9 @@
                                 .clear_color = clear_color,
                                 .mocs = cmd_buffer->device->default_mocs);
 
-            add_image_view_relocs(cmd_buffer, iview,
-                                  state->attachments[i].input_aux_usage,
-                                  state->attachments[i].input_att_state);
+            add_image_relocs(cmd_buffer, iview->image, iview->aspect_mask,
+                             state->attachments[i].input_aux_usage,
+                             state->attachments[i].input_att_state);
          }
       }
 
@@ -684,14 +1091,15 @@
           * copy the surface states for the current subpass into the storage
           * we allocated for them in BeginCommandBuffer.
           */
-         struct anv_bo *ss_bo = &primary->device->surface_state_block_pool.bo;
+         struct anv_bo *ss_bo =
+            &primary->device->surface_state_pool.block_pool.bo;
          struct anv_state src_state = primary->state.render_pass_states;
          struct anv_state dst_state = secondary->state.render_pass_states;
          assert(src_state.alloc_size == dst_state.alloc_size);
 
-         genX(cmd_buffer_gpu_memcpy)(primary, ss_bo, dst_state.offset,
-                                     ss_bo, src_state.offset,
-                                     src_state.alloc_size);
+         genX(cmd_buffer_so_memcpy)(primary, ss_bo, dst_state.offset,
+                                    ss_bo, src_state.offset,
+                                    src_state.alloc_size);
       }
 
       anv_cmd_buffer_add_secondary(primary, secondary);
@@ -976,19 +1384,21 @@
       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
       ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image);
-      if (pImageMemoryBarriers[i].subresourceRange.aspectMask &
-          VK_IMAGE_ASPECT_DEPTH_BIT) {
+      const VkImageSubresourceRange *range =
+         &pImageMemoryBarriers[i].subresourceRange;
+
+      if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
          transition_depth_buffer(cmd_buffer, image,
                                  pImageMemoryBarriers[i].oldLayout,
                                  pImageMemoryBarriers[i].newLayout);
-      }
-      if (pImageMemoryBarriers[i].subresourceRange.aspectMask &
-          VK_IMAGE_ASPECT_COLOR_BIT) {
+      } else if (range->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT) {
          transition_color_buffer(cmd_buffer, image,
+                                 range->baseMipLevel,
+                                 anv_get_levelCount(image, range),
+                                 range->baseArrayLayer,
+                                 anv_get_layerCount(image, range),
                                  pImageMemoryBarriers[i].oldLayout,
-                                 pImageMemoryBarriers[i].newLayout,
-                                 NULL,
-                                 &pImageMemoryBarriers[i].subresourceRange);
+                                 pImageMemoryBarriers[i].newLayout);
       }
    }
 
@@ -1176,26 +1586,39 @@
          continue;
 
       case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
-      case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
-         surface_state = desc->aux_usage == ISL_AUX_USAGE_NONE ?
-            desc->image_view->no_aux_sampler_surface_state :
-            desc->image_view->sampler_surface_state;
+      case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: {
+         enum isl_aux_usage aux_usage;
+         if (desc->layout == VK_IMAGE_LAYOUT_GENERAL) {
+            surface_state = desc->image_view->general_sampler_surface_state;
+            aux_usage = desc->image_view->general_sampler_aux_usage;
+         } else {
+            surface_state = desc->image_view->optimal_sampler_surface_state;
+            aux_usage = desc->image_view->optimal_sampler_aux_usage;
+         }
          assert(surface_state.alloc_size);
-         add_image_view_relocs(cmd_buffer, desc->image_view,
-                               desc->aux_usage, surface_state);
+         add_image_relocs(cmd_buffer, desc->image_view->image,
+                          desc->image_view->aspect_mask,
+                          aux_usage, surface_state);
          break;
+      }
       case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
          assert(stage == MESA_SHADER_FRAGMENT);
          if (desc->image_view->aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) {
             /* For depth and stencil input attachments, we treat it like any
              * old texture that a user may have bound.
              */
-            surface_state = desc->aux_usage == ISL_AUX_USAGE_NONE ?
-               desc->image_view->no_aux_sampler_surface_state :
-               desc->image_view->sampler_surface_state;
+            enum isl_aux_usage aux_usage;
+            if (desc->layout == VK_IMAGE_LAYOUT_GENERAL) {
+               surface_state = desc->image_view->general_sampler_surface_state;
+               aux_usage = desc->image_view->general_sampler_aux_usage;
+            } else {
+               surface_state = desc->image_view->optimal_sampler_surface_state;
+               aux_usage = desc->image_view->optimal_sampler_aux_usage;
+            }
             assert(surface_state.alloc_size);
-            add_image_view_relocs(cmd_buffer, desc->image_view,
-                                  desc->aux_usage, surface_state);
+            add_image_relocs(cmd_buffer, desc->image_view->image,
+                             desc->image_view->aspect_mask,
+                             aux_usage, surface_state);
          } else {
             /* For color input attachments, we create the surface state at
              * vkBeginRenderPass time so that we can include aux and clear
@@ -1213,9 +1636,9 @@
             ? desc->image_view->writeonly_storage_surface_state
             : desc->image_view->storage_surface_state;
          assert(surface_state.alloc_size);
-         add_image_view_relocs(cmd_buffer, desc->image_view,
-                               desc->image_view->image->aux_usage,
-                               surface_state);
+         add_image_relocs(cmd_buffer, desc->image_view->image,
+                          desc->image_view->aspect_mask,
+                          desc->image_view->image->aux_usage, surface_state);
 
          struct brw_image_param *image_param =
             &cmd_buffer->state.push_constants[stage]->images[image++];
@@ -1471,11 +1894,11 @@
             c._3DCommandSubOpcode = push_constant_opcodes[stage],
             c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
 #if GEN_GEN >= 9
-               .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
-               .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
+               .Buffer[2] = { &cmd_buffer->device->dynamic_state_pool.block_pool.bo, state.offset },
+               .ReadLength[2] = DIV_ROUND_UP(state.alloc_size, 32),
 #else
-               .PointerToConstantBuffer0 = { .offset = state.offset },
-               .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
+               .Buffer[0] = { .offset = state.offset },
+               .ReadLength[0] = DIV_ROUND_UP(state.alloc_size, 32),
 #endif
             };
          }
@@ -1521,7 +1944,12 @@
             .MemoryObjectControlState = GENX(MOCS),
 #else
             .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
-            .InstanceDataStepRate = 1,
+            /* Our implementation of VK_KHR_multiview uses instancing to draw
+             * the different views.  If the client asks for instancing, we
+             * need to use the Instance Data Step Rate to ensure that we
+             * repeat the client's per-instance data once for each view.
+             */
+            .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass),
             .VertexBufferMemoryObjectControlState = GENX(MOCS),
 #endif
 
@@ -1672,7 +2100,7 @@
    anv_state_flush(cmd_buffer->device, id_state);
 
    emit_base_vertex_instance_bo(cmd_buffer,
-      &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
+      &cmd_buffer->device->dynamic_state_pool.block_pool.bo, id_state.offset);
 }
 
 static void
@@ -1686,7 +2114,7 @@
    anv_state_flush(cmd_buffer->device, state);
 
    emit_vertex_bo(cmd_buffer,
-                  &cmd_buffer->device->dynamic_state_block_pool.bo,
+                  &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
                   state.offset, 4, ANV_DRAWID_VB_INDEX);
 }
 
@@ -1711,6 +2139,11 @@
    if (vs_prog_data->uses_drawid)
       emit_draw_index(cmd_buffer, 0);
 
+   /* Our implementation of VK_KHR_multiview uses instancing to draw the
+    * different views.  We need to multiply instanceCount by the view count.
+    */
+   instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
+
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType         = SEQUENTIAL;
       prim.PrimitiveTopologyType    = pipeline->topology;
@@ -1744,6 +2177,11 @@
    if (vs_prog_data->uses_drawid)
       emit_draw_index(cmd_buffer, 0);
 
+   /* Our implementation of VK_KHR_multiview uses instancing to draw the
+    * different views.  We need to multiply instanceCount by the view count.
+    */
+   instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
+
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType         = RANDOM;
       prim.PrimitiveTopologyType    = pipeline->topology;
@@ -1763,6 +2201,143 @@
 #define GEN7_3DPRIM_START_INSTANCE      0x243C
 #define GEN7_3DPRIM_BASE_VERTEX         0x2440
 
+/* MI_MATH only exists on Haswell+ */
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+
+/* Pack one MI_MATH ALU instruction (an opcode and two operands) into the
+ * single dword produced by GENX(MI_MATH_ALU_INSTRUCTION_pack).
+ */
+static uint32_t
+mi_alu(uint32_t opcode, uint32_t op1, uint32_t op2)
+{
+   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
+      .ALUOpcode = opcode,
+      .Operand1 = op1,
+      .Operand2 = op2,
+   };
+
+   uint32_t dw;
+   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);
+
+   return dw;
+}
+
+#define CS_GPR(n) (0x2600 + (n) * 8)
+
+/* Emit dwords to multiply GPR0 by N
+ *
+ * This is a binary shift-and-add: for every bit of N below its top set bit
+ * the running value is doubled and, if that bit is set, the original GPR0
+ * is added on top.  Output goes through a vk_outarray over (dw, dw_count);
+ * emit_mul_gpr0() calls this once with dw == NULL to size the command and
+ * once more to fill it in.
+ */
+static void
+build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
+{
+   VK_OUTARRAY_MAKE(out, dw, dw_count);
+
+#define append_alu(opcode, operand1, operand2) \
+   vk_outarray_append(&out, alu_dw) *alu_dw = mi_alu(opcode, operand1, operand2)
+
+   assert(N > 0);
+   unsigned top_bit = 31 - __builtin_clz(N);
+   for (int i = top_bit - 1; i >= 0; i--) {
+      /* We get our initial data in GPR0 and we write the final data out to
+       * GPR0 but we use GPR1 as our scratch register.
+       */
+      unsigned src_reg = i == top_bit - 1 ? MI_ALU_REG0 : MI_ALU_REG1;
+      unsigned dst_reg = i == 0 ? MI_ALU_REG0 : MI_ALU_REG1;
+
+      /* Shift the current value left by 1 */
+      append_alu(MI_ALU_LOAD, MI_ALU_SRCA, src_reg);
+      append_alu(MI_ALU_LOAD, MI_ALU_SRCB, src_reg);
+      append_alu(MI_ALU_ADD, 0, 0);
+
+      if (N & (1 << i)) {
+         /* Store ACCU to R1 and add R0 to R1 */
+         append_alu(MI_ALU_STORE, MI_ALU_REG1, MI_ALU_ACCU);
+         append_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
+         append_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
+         append_alu(MI_ALU_ADD, 0, 0);
+      }
+
+      append_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
+   }
+
+#undef append_alu
+}
+
+/* Emit a single MI_MATH command into batch that multiplies GPR0 by N. */
+static void
+emit_mul_gpr0(struct anv_batch *batch, uint32_t N)
+{
+   /* A first pass with a NULL output only counts the ALU dwords needed. */
+   uint32_t num_dwords;
+   build_alu_multiply_gpr0(NULL, &num_dwords, N);
+
+   /* The extra dword is the MI_MATH header that precedes the ALU dwords. */
+   uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH));
+   if (!dw) {
+      /* dw + 1 would be a bogus non-NULL pointer, so bail out rather than
+       * let the second pass write through it.
+       */
+      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return;
+   }
+
+   build_alu_multiply_gpr0(dw + 1, &num_dwords, N);
+}
+
+#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
+
+/* Load the 3DPRIMITIVE indirect-parameter registers from the draw record at
+ * buffer->offset + offset.
+ *
+ * Dwords are read in this order: vertex count, instance count, start
+ * vertex, then base vertex and start instance when indexed, or just start
+ * instance (with base vertex forced to 0) when not.
+ */
+static void
+load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
+                         struct anv_buffer *buffer, uint64_t offset,
+                         bool indexed)
+{
+   struct anv_batch *batch = &cmd_buffer->batch;
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+
+   /* Multiview is drawn with instancing, so the instance count read from
+    * the buffer has to be multiplied by the view count.
+    */
+   unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
+   if (view_count > 1) {
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+      emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
+      emit_mul_gpr0(batch, view_count);
+      emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
+#else
+      anv_finishme("Multiview + indirect draw requires MI_MATH\n"
+                   "MI_MATH is not supported on Ivy Bridge");
+      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+#endif
+   } else {
+      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+   }
+
+   emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+
+   if (indexed) {
+      emit_lrm(batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
+      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
+   } else {
+      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
+      emit_lri(batch, GEN7_3DPRIM_BASE_VERTEX, 0);
+   }
+}
+
 void genX(CmdDrawIndirect)(
     VkCommandBuffer                             commandBuffer,
     VkBuffer                                    _buffer,
@@ -1774,29 +2318,30 @@
    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
 
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
 
    genX(cmd_buffer_flush_state)(cmd_buffer);
 
-   if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
-      emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
-   if (vs_prog_data->uses_drawid)
-      emit_draw_index(cmd_buffer, 0);
+   for (uint32_t i = 0; i < drawCount; i++) {
+      struct anv_bo *bo = buffer->bo;
+      uint32_t bo_offset = buffer->offset + offset;
 
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
-   emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
+      if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
+         emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
+      if (vs_prog_data->uses_drawid)
+         emit_draw_index(cmd_buffer, i);
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
-      prim.IndirectParameterEnable  = true;
-      prim.VertexAccessType         = SEQUENTIAL;
-      prim.PrimitiveTopologyType    = pipeline->topology;
+      load_indirect_parameters(cmd_buffer, buffer, offset, false);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+         prim.IndirectParameterEnable  = true;
+         prim.VertexAccessType         = SEQUENTIAL;
+         prim.PrimitiveTopologyType    = pipeline->topology;
+      }
+
+      offset += stride;
    }
 }
 
@@ -1811,30 +2356,31 @@
    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
 
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
 
    genX(cmd_buffer_flush_state)(cmd_buffer);
 
-   /* TODO: We need to stomp base vertex to 0 somehow */
-   if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
-      emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
-   if (vs_prog_data->uses_drawid)
-      emit_draw_index(cmd_buffer, 0);
+   for (uint32_t i = 0; i < drawCount; i++) {
+      struct anv_bo *bo = buffer->bo;
+      uint32_t bo_offset = buffer->offset + offset;
 
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
+      /* TODO: We need to stomp base vertex to 0 somehow */
+      if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
+         emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
+      if (vs_prog_data->uses_drawid)
+         emit_draw_index(cmd_buffer, i);
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
-      prim.IndirectParameterEnable  = true;
-      prim.VertexAccessType         = RANDOM;
-      prim.PrimitiveTopologyType    = pipeline->topology;
+      load_indirect_parameters(cmd_buffer, buffer, offset, true);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+         prim.IndirectParameterEnable  = true;
+         prim.VertexAccessType         = RANDOM;
+         prim.PrimitiveTopologyType    = pipeline->topology;
+      }
+
+      offset += stride;
    }
 }
 
@@ -1989,7 +2535,7 @@
       anv_state_flush(cmd_buffer->device, state);
       cmd_buffer->state.num_workgroups_offset = state.offset;
       cmd_buffer->state.num_workgroups_bo =
-         &cmd_buffer->device->dynamic_state_block_pool.bo;
+         &cmd_buffer->device->dynamic_state_pool.block_pool.bo;
    }
 
    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
@@ -2013,9 +2559,6 @@
 #define GPGPU_DISPATCHDIMY 0x2504
 #define GPGPU_DISPATCHDIMZ 0x2508
 
-#define MI_PREDICATE_SRC0  0x2400
-#define MI_PREDICATE_SRC1  0x2408
-
 void genX(CmdDispatchIndirect)(
     VkCommandBuffer                             commandBuffer,
     VkBuffer                                    _buffer,
@@ -2326,14 +2869,12 @@
           * this is not the last use of the buffer. The layout should not have
           * changed from the first call and no transition is necessary.
           */
-         assert(att_ref->layout == att_state->current_layout);
+         assert(att_state->current_layout == att_ref->layout ||
+                att_state->current_layout ==
+                VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);
          continue;
       }
 
-      /* Get the appropriate target layout for this attachment. */
-      const VkImageLayout target_layout = subpass_end ?
-         att_desc->final_layout : att_ref->layout;
-
       /* The attachment index must be less than the number of attachments
        * within the framebuffer.
        */
@@ -2343,6 +2884,29 @@
          cmd_state->framebuffer->attachments[att_ref->attachment];
       const struct anv_image * const image = iview->image;
 
+      /* Get the appropriate target layout for this attachment. */
+      VkImageLayout target_layout;
+
+      /* A resolve is necessary before use as an input attachment if the clear
+       * color or auxiliary buffer usage isn't supported by the sampler.
+       */
+      const bool input_needs_resolve =
+            (att_state->fast_clear && !att_state->clear_color_is_zero_one) ||
+            att_state->input_aux_usage != att_state->aux_usage;
+      if (subpass_end) {
+         target_layout = att_desc->final_layout;
+      } else if (iview->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT &&
+                 !input_needs_resolve) {
+         /* Layout transitions before the final only help to enable sampling as
+          * an input attachment. If the input attachment supports sampling
+          * using the auxiliary surface, we can skip such transitions by making
+          * the target layout one that is CCS-aware.
+          */
+         target_layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+      } else {
+         target_layout = att_ref->layout;
+      }
+
       /* Perform the layout transition. */
       if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
          transition_depth_buffer(cmd_buffer, image,
@@ -2350,17 +2914,94 @@
          att_state->aux_usage =
             anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
                                     image->aspects, target_layout);
-      }
-      if (image->aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
+      } else if (image->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
          transition_color_buffer(cmd_buffer, image,
-                                 att_state->current_layout, target_layout,
-                                 &iview->isl, NULL);
+                                 iview->isl.base_level, 1,
+                                 iview->isl.base_array_layer,
+                                 iview->isl.array_len,
+                                 att_state->current_layout, target_layout);
       }
 
       att_state->current_layout = target_layout;
    }
 }
 
+/* Update the clear value dword(s) in surface state objects or the fast clear
+ * state buffer entry for the color attachments used in this subpass.
+ */
+static void
+cmd_buffer_subpass_sync_fast_clear_values(struct anv_cmd_buffer *cmd_buffer)
+{
+   assert(cmd_buffer && cmd_buffer->state.subpass);
+
+   const struct anv_cmd_state *state = &cmd_buffer->state;
+
+   /* Iterate through every color attachment used in this subpass. */
+   for (uint32_t i = 0; i < state->subpass->color_count; ++i) {
+
+      /* The attachment should be one of the attachments described in the
+       * render pass and used in the subpass.
+       */
+      const uint32_t a = state->subpass->color_attachments[i].attachment;
+      if (a == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      assert(a < state->pass->attachment_count);
+
+      /* Store some information regarding this attachment. */
+      const struct anv_attachment_state *att_state = &state->attachments[a];
+      const struct anv_image_view *iview = state->framebuffer->attachments[a];
+      const struct anv_render_pass_attachment *rp_att =
+         &state->pass->attachments[a];
+
+      if (att_state->aux_usage == ISL_AUX_USAGE_NONE)
+         continue;
+
+      /* The fast clear state entry must be updated if a fast clear is going to
+       * happen. The surface state must be updated if the clear value from a
+       * prior fast clear may be needed.
+       */
+      if (att_state->pending_clear_aspects && att_state->fast_clear) {
+         /* Update the fast clear state entry. */
+         genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color_rt_state,
+                                      iview->image, iview->isl.base_level,
+                                      true /* copy from ss */);
+
+         /* Fast-clears impact whether or not a resolve will be necessary. */
+         if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E &&
+             att_state->clear_color_is_zero) {
+            /* This image always has the auxiliary buffer enabled. We can mark
+             * the subresource as not needing a resolve because the clear color
+             * will match what's in every RENDER_SURFACE_STATE object when it's
+             * being used for sampling.
+             */
+            genX(set_image_needs_resolve)(cmd_buffer, iview->image,
+                                          iview->isl.base_level, false);
+         } else {
+            genX(set_image_needs_resolve)(cmd_buffer, iview->image,
+                                          iview->isl.base_level, true);
+         }
+      } else if (rp_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
+         /* The attachment may have been fast-cleared in a previous render
+          * pass and the value is needed now. Update the surface state(s).
+          *
+          * TODO: Do this only once per render pass instead of every subpass.
+          */
+         genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color_rt_state,
+                                      iview->image, iview->isl.base_level,
+                                      false /* copy to ss */);
+
+         if (need_input_attachment_state(rp_att) &&
+             att_state->input_aux_usage != ISL_AUX_USAGE_NONE) {
+            genX(copy_fast_clear_dwords)(cmd_buffer, att_state->input_att_state,
+                                         iview->image, iview->isl.base_level,
+                                         false /* copy to ss */);
+         }
+      }
+   }
+}
+
+
 static void
 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
                              struct anv_subpass *subpass)
@@ -2369,11 +3010,30 @@
 
    cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
 
+   /* Our implementation of VK_KHR_multiview uses instancing to draw the
+    * different views.  If the client asks for instancing, we need to use the
+    * Instance Data Step Rate to ensure that we repeat the client's
+    * per-instance data once for each view.  Since this bit is in
+    * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top
+    * of each subpass.
+    */
+   if (GEN_GEN == 7)
+      cmd_buffer->state.vb_dirty |= ~0;
+
    /* Perform transitions to the subpass layout before any writes have
     * occurred.
     */
    cmd_buffer_subpass_transition_layouts(cmd_buffer, false);
 
+   /* Update clear values *after* performing automatic layout transitions.
+    * This ensures that transitions from the UNDEFINED layout have had a chance
+    * to populate the clear value buffer with the correct values for the
+    * LOAD_OP_LOAD loadOp and that the fast-clears will update the buffer
+    * without the aforementioned layout transition overwriting the fast-clear
+    * value.
+    */
+   cmd_buffer_subpass_sync_fast_clear_values(cmd_buffer);
+
    cmd_buffer_emit_depth_stencil(cmd_buffer);
 
    anv_cmd_buffer_clear_subpass(cmd_buffer);
diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c
index 3cbc723..db723d4 100644
--- a/src/intel/vulkan/genX_gpu_memcpy.c
+++ b/src/intel/vulkan/genX_gpu_memcpy.c
@@ -52,10 +52,55 @@
 }
 
 void
-genX(cmd_buffer_gpu_memcpy)(struct anv_cmd_buffer *cmd_buffer,
-                            struct anv_bo *dst, uint32_t dst_offset,
-                            struct anv_bo *src, uint32_t src_offset,
-                            uint32_t size)
+genX(cmd_buffer_mi_memcpy)(struct anv_cmd_buffer *cmd_buffer,
+                           struct anv_bo *dst, uint32_t dst_offset,
+                           struct anv_bo *src, uint32_t src_offset,
+                           uint32_t size)
+{
+   /* Copies size bytes from src to dst one dword at a time using MI
+    * commands only: MI_COPY_MEM_MEM on gen8+, otherwise a load/store pair
+    * through a temporary register.  size and both offsets must be
+    * dword-aligned.
+    */
+
+   /* This memcpy operates in units of dwords. */
+   assert(size % 4 == 0);
+   assert(dst_offset % 4 == 0);
+   assert(src_offset % 4 == 0);
+
+   for (uint32_t i = 0; i < size; i += 4) {
+      const struct anv_address src_addr =
+         (struct anv_address) { src, src_offset + i};
+      const struct anv_address dst_addr =
+         (struct anv_address) { dst, dst_offset + i};
+#if GEN_GEN >= 8
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_COPY_MEM_MEM), cp) {
+         cp.DestinationMemoryAddress = dst_addr;
+         cp.SourceMemoryAddress = src_addr;
+      }
+#else
+      /* IVB does not have a general purpose register for command streamer
+       * commands. Therefore, we use an alternate temporary register.
+       */
+#define TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), load) {
+         load.RegisterAddress = TEMP_REG;
+         load.MemoryAddress = src_addr;
+      }
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), store) {
+         store.RegisterAddress = TEMP_REG;
+         store.MemoryAddress = dst_addr;
+      }
+#undef TEMP_REG
+#endif
+   }
+}
+
+void
+genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
+                           struct anv_bo *dst, uint32_t dst_offset,
+                           struct anv_bo *src, uint32_t src_offset,
+                           uint32_t size)
 {
    if (size == 0)
       return;
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 3fd1333..55db533 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -155,9 +155,12 @@
       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
          vfi.InstancingEnable = pipeline->instancing_enable[desc->binding];
          vfi.VertexElementIndex = slot;
-         /* Vulkan so far doesn't have an instance divisor, so
-          * this is always 1 (ignored if not instancing). */
-         vfi.InstanceDataStepRate = 1;
+         /* Our implementation of VK_KHX_multiview uses instancing to draw
+          * the different views.  If the client asks for instancing, we
+          * need to use the Instance Data Step Rate to ensure that we
+          * repeat the client's per-instance data once for each view.
+          */
+         vfi.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);
       }
 #endif
    }
@@ -549,6 +552,7 @@
    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_MULTISAMPLE), ms) {
       ms.NumberofMultisamples       = log2_samples;
 
+      ms.PixelLocation              = CENTER;
 #if GEN_GEN >= 8
       /* The PRM says that this bit is valid only for DX9:
        *
@@ -556,9 +560,7 @@
        *    should not have any effect by setting or not setting this bit.
        */
       ms.PixelPositionOffsetEnable  = false;
-      ms.PixelLocation              = CENTER;
 #else
-      ms.PixelLocation              = PIXLOC_CENTER;
 
       switch (samples) {
       case 1:
@@ -862,28 +864,14 @@
 {
    struct anv_device *device = pipeline->device;
 
-   const uint32_t num_dwords = GENX(BLEND_STATE_length);
-   pipeline->blend_state =
-      anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
 
    struct GENX(BLEND_STATE) blend_state = {
 #if GEN_GEN >= 8
       .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
       .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
-#else
-      /* Make sure it gets zeroed */
-      .Entry = { { 0, }, },
 #endif
    };
 
-   /* Default everything to disabled */
-   for (uint32_t i = 0; i < 8; i++) {
-      blend_state.Entry[i].WriteDisableAlpha = true;
-      blend_state.Entry[i].WriteDisableRed = true;
-      blend_state.Entry[i].WriteDisableGreen = true;
-      blend_state.Entry[i].WriteDisableBlue = true;
-   }
-
    uint32_t surface_count = 0;
    struct anv_pipeline_bind_map *map;
    if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
@@ -891,7 +879,17 @@
       surface_count = map->surface_count;
    }
 
+   const uint32_t num_dwords = GENX(BLEND_STATE_length) +
+      GENX(BLEND_STATE_ENTRY_length) * surface_count;
+   pipeline->blend_state =
+      anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
+
    bool has_writeable_rt = false;
+   uint32_t *state_pos = pipeline->blend_state.map;
+   state_pos += GENX(BLEND_STATE_length);
+#if GEN_GEN >= 8
+   struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };
+#endif
    for (unsigned i = 0; i < surface_count; i++) {
       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
 
@@ -902,14 +900,24 @@
       /* We can have at most 8 attachments */
       assert(i < 8);
 
-      if (info == NULL || binding->index >= info->attachmentCount)
+      if (info == NULL || binding->index >= info->attachmentCount) {
+         /* Default everything to disabled */
+         struct GENX(BLEND_STATE_ENTRY) entry = {
+            .WriteDisableAlpha = true,
+            .WriteDisableRed = true,
+            .WriteDisableGreen = true,
+            .WriteDisableBlue = true,
+         };
+         GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
+         state_pos += GENX(BLEND_STATE_ENTRY_length);
          continue;
+      }
 
       assert(binding->binding == 0);
       const VkPipelineColorBlendAttachmentState *a =
          &info->pAttachments[binding->index];
 
-      blend_state.Entry[i] = (struct GENX(BLEND_STATE_ENTRY)) {
+      struct GENX(BLEND_STATE_ENTRY) entry = {
 #if GEN_GEN < 8
          .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
          .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
@@ -938,7 +946,7 @@
 #if GEN_GEN >= 8
          blend_state.IndependentAlphaBlendEnable = true;
 #else
-         blend_state.Entry[i].IndependentAlphaBlendEnable = true;
+         entry.IndependentAlphaBlendEnable = true;
 #endif
       }
 
@@ -953,26 +961,31 @@
        */
       if (a->colorBlendOp == VK_BLEND_OP_MIN ||
           a->colorBlendOp == VK_BLEND_OP_MAX) {
-         blend_state.Entry[i].SourceBlendFactor = BLENDFACTOR_ONE;
-         blend_state.Entry[i].DestinationBlendFactor = BLENDFACTOR_ONE;
+         entry.SourceBlendFactor = BLENDFACTOR_ONE;
+         entry.DestinationBlendFactor = BLENDFACTOR_ONE;
       }
       if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
           a->alphaBlendOp == VK_BLEND_OP_MAX) {
-         blend_state.Entry[i].SourceAlphaBlendFactor = BLENDFACTOR_ONE;
-         blend_state.Entry[i].DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
+         entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;
+         entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
       }
+      GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
+      state_pos += GENX(BLEND_STATE_ENTRY_length);
+#if GEN_GEN >= 8
+      if (i == 0)
+         bs0 = entry;
+#endif
    }
 
 #if GEN_GEN >= 8
-   struct GENX(BLEND_STATE_ENTRY) *bs0 = &blend_state.Entry[0];
    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_BLEND), blend) {
       blend.AlphaToCoverageEnable         = blend_state.AlphaToCoverageEnable;
       blend.HasWriteableRT                = has_writeable_rt;
-      blend.ColorBufferBlendEnable        = bs0->ColorBufferBlendEnable;
-      blend.SourceAlphaBlendFactor        = bs0->SourceAlphaBlendFactor;
-      blend.DestinationAlphaBlendFactor   = bs0->DestinationAlphaBlendFactor;
-      blend.SourceBlendFactor             = bs0->SourceBlendFactor;
-      blend.DestinationBlendFactor        = bs0->DestinationBlendFactor;
+      blend.ColorBufferBlendEnable        = bs0.ColorBufferBlendEnable;
+      blend.SourceAlphaBlendFactor        = bs0.SourceAlphaBlendFactor;
+      blend.DestinationAlphaBlendFactor   = bs0.DestinationAlphaBlendFactor;
+      blend.SourceBlendFactor             = bs0.SourceBlendFactor;
+      blend.DestinationBlendFactor        = bs0.DestinationBlendFactor;
       blend.AlphaTestEnable               = false;
       blend.IndependentAlphaBlendEnable   =
          blend_state.IndependentAlphaBlendEnable;
@@ -1049,7 +1062,8 @@
       }
 #else
       clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
-         (wm_prog_data->barycentric_interp_modes & 0x38) != 0 : 0;
+         (wm_prog_data->barycentric_interp_modes &
+          BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0 : 0;
 #endif
    }
 }
@@ -1121,7 +1135,7 @@
    assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
 
    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VS), vs) {
-      vs.FunctionEnable       = true;
+      vs.Enable               = true;
       vs.StatisticsEnable     = true;
       vs.KernelStartPointer   = vs_bin->kernel.offset;
 #if GEN_GEN >= 8
@@ -1181,7 +1195,7 @@
    const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
 
    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_HS), hs) {
-      hs.FunctionEnable = true;
+      hs.Enable = true;
       hs.StatisticsEnable = true;
       hs.KernelStartPointer = tcs_bin->kernel.offset;
 
@@ -1211,7 +1225,7 @@
    }
 
    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_DS), ds) {
-      ds.FunctionEnable = true;
+      ds.Enable = true;
       ds.StatisticsEnable = true;
       ds.KernelStartPointer = tes_bin->kernel.offset;
 
@@ -1264,7 +1278,7 @@
    const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
 
    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_GS), gs) {
-      gs.FunctionEnable          = true;
+      gs.Enable                  = true;
       gs.StatisticsEnable        = true;
       gs.KernelStartPointer      = gs_bin->kernel.offset;
       gs.DispatchMode            = gs_prog_data->base.dispatch_mode;
@@ -1289,11 +1303,7 @@
       gs.ControlDataFormat       = gs_prog_data->control_data_format;
       gs.ControlDataHeaderSize   = gs_prog_data->control_data_header_size_hwords;
       gs.InstanceControl         = MAX2(gs_prog_data->invocations, 1) - 1;
-#if GEN_GEN >= 8 || GEN_IS_HASWELL
       gs.ReorderMode             = TRAILING;
-#else
-      gs.ReorderEnable           = true;
-#endif
 
 #if GEN_GEN >= 8
       gs.ExpectedVertexCount     = gs_prog_data->vertices_in;
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index d8070e4..52e4b48 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -507,35 +507,20 @@
 
 #if GEN_GEN > 7 || GEN_IS_HASWELL
 
-#define alu_opcode(v)   __gen_uint((v),  20, 31)
-#define alu_operand1(v) __gen_uint((v),  10, 19)
-#define alu_operand2(v) __gen_uint((v),   0,  9)
-#define alu(opcode, operand1, operand2) \
-   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)
+static inline uint32_t
+mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
+{
+   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
+      .ALUOpcode = opcode,
+      .Operand1 = operand1,
+      .Operand2 = operand2,
+   };
 
-#define OPCODE_NOOP      0x000
-#define OPCODE_LOAD      0x080
-#define OPCODE_LOADINV   0x480
-#define OPCODE_LOAD0     0x081
-#define OPCODE_LOAD1     0x481
-#define OPCODE_ADD       0x100
-#define OPCODE_SUB       0x101
-#define OPCODE_AND       0x102
-#define OPCODE_OR        0x103
-#define OPCODE_XOR       0x104
-#define OPCODE_STORE     0x180
-#define OPCODE_STOREINV  0x580
+   uint32_t dw;
+   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);
 
-#define OPERAND_R0   0x00
-#define OPERAND_R1   0x01
-#define OPERAND_R2   0x02
-#define OPERAND_R3   0x03
-#define OPERAND_R4   0x04
-#define OPERAND_SRCA 0x20
-#define OPERAND_SRCB 0x21
-#define OPERAND_ACCU 0x31
-#define OPERAND_ZF   0x32
-#define OPERAND_CF   0x33
+   return dw;
+}
 
 #define CS_GPR(n) (0x2600 + (n) * 8)
 
@@ -588,10 +573,15 @@
    emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);
 
    uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
-   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
-   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
-   dw[3] = alu(OPCODE_AND, 0, 0);
-   dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
+   if (!dw) {
+      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return;
+   }
+
+   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
+   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
+   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
+   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
 }
 
 /*
@@ -614,12 +604,17 @@
    for (int o = 0; o < outer_count; o++) {
       /* Submit one MI_MATH to shift left by 6 bits */
       uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
+      if (!dw) {
+         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
+         return;
+      }
+
       dw++;
       for (int i = 0; i < inner_count; i++, dw += 4) {
-         dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
-         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
-         dw[2] = alu(OPCODE_ADD, 0, 0);
-         dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
+         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
+         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
+         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
+         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
       }
    }
 }
@@ -682,10 +677,10 @@
       return;
    }
 
-   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
-   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
-   dw[3] = alu(OPCODE_SUB, 0, 0);
-   dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
+   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
+   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
+   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
+   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
 }
 
 void genX(CmdCopyQueryPoolResults)(
@@ -714,7 +709,7 @@
       slot_offset = (firstQuery + i) * pool->stride;
       switch (pool->type) {
       case VK_QUERY_TYPE_OCCLUSION:
-         compute_query_result(&cmd_buffer->batch, OPERAND_R2,
+         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                               &pool->bo, slot_offset + 8);
          gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                 flags, 0, CS_GPR(2));
@@ -726,7 +721,7 @@
          while (statistics) {
             uint32_t stat = u_bit_scan(&statistics);
 
-            compute_query_result(&cmd_buffer->batch, OPERAND_R0,
+            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                  &pool->bo, slot_offset + idx * 16 + 8);
 
             /* WaDividePSInvocationCountBy4:HSW,BDW */
diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c
index 62784aa..8e93195 100644
--- a/src/intel/vulkan/genX_state.c
+++ b/src/intel/vulkan/genX_state.c
@@ -53,6 +53,20 @@
       ps.PipelineSelection = _3D;
    }
 
+#if GEN_GEN == 9
+   uint32_t cache_mode_1;
+   anv_pack_struct(&cache_mode_1, GENX(CACHE_MODE_1),
+                   .FloatBlendOptimizationEnable = true,
+                   .FloatBlendOptimizationEnableMask = true,
+                   .PartialResolveDisableInVC = true,
+                   .PartialResolveDisableInVCMask = true);
+
+   anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+      lri.RegisterOffset = GENX(CACHE_MODE_1_num);
+      lri.DataDWord      = cache_mode_1;
+   }
+#endif
+
    anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
 
    anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
diff --git a/src/intel/vulkan/intel_icd.json.in b/src/intel/vulkan/intel_icd.json.in
index e959291..61db4bf 100644
--- a/src/intel/vulkan/intel_icd.json.in
+++ b/src/intel/vulkan/intel_icd.json.in
@@ -2,6 +2,6 @@
     "file_format_version": "1.0.0",
     "ICD": {
         "library_path": "@install_libdir@/libvulkan_intel.so",
-        "api_version": "1.0.3"
+        "api_version": "1.0.54"
     }
 }
diff --git a/src/intel/vulkan/tests/block_pool_no_free.c b/src/intel/vulkan/tests/block_pool_no_free.c
index b7acc5a..9d2d111 100644
--- a/src/intel/vulkan/tests/block_pool_no_free.c
+++ b/src/intel/vulkan/tests/block_pool_no_free.c
@@ -41,16 +41,18 @@
 static void *alloc_blocks(void *_job)
 {
    struct job *job = _job;
+   uint32_t job_id = job - jobs;
+   uint32_t block_size = 16 * ((job_id % 4) + 1);
    int32_t block, *data;
 
    for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) {
-      block = anv_block_pool_alloc(job->pool);
+      block = anv_block_pool_alloc(job->pool, block_size);
       data = job->pool->map + block;
       *data = block;
       assert(block >= 0);
       job->blocks[i] = block;
 
-      block = anv_block_pool_alloc_back(job->pool);
+      block = anv_block_pool_alloc_back(job->pool, block_size);
       data = job->pool->map + block;
       *data = block;
       assert(block < 0);
@@ -116,7 +118,7 @@
    assert(anv_gem_connect(&device) == 0);
 
    pthread_mutex_init(&device.mutex, NULL);
-   anv_block_pool_init(&pool, &device, 16);
+   anv_block_pool_init(&pool, &device, 4096);
 
    for (unsigned i = 0; i < NUM_THREADS; i++) {
       jobs[i].pool = &pool;
diff --git a/src/intel/vulkan/tests/state_pool.c b/src/intel/vulkan/tests/state_pool.c
index 8920b52..1b2abf5 100644
--- a/src/intel/vulkan/tests/state_pool.c
+++ b/src/intel/vulkan/tests/state_pool.c
@@ -38,15 +38,13 @@
    struct anv_device device = {
       .instance = &instance,
    };
-   struct anv_block_pool block_pool;
    struct anv_state_pool state_pool;
 
    assert(anv_gem_connect(&device) == 0);
    pthread_mutex_init(&device.mutex, NULL);
 
    for (unsigned i = 0; i < NUM_RUNS; i++) {
-      anv_block_pool_init(&block_pool, &device, 256);
-      anv_state_pool_init(&state_pool, &block_pool);
+      anv_state_pool_init(&state_pool, &device, 256);
 
       /* Grab one so a zero offset is impossible */
       anv_state_pool_alloc(&state_pool, 16, 16);
@@ -54,7 +52,6 @@
       run_state_pool_test(&state_pool);
 
       anv_state_pool_finish(&state_pool);
-      anv_block_pool_finish(&block_pool);
    }
 
    pthread_mutex_destroy(&device.mutex);
diff --git a/src/intel/vulkan/tests/state_pool_free_list_only.c b/src/intel/vulkan/tests/state_pool_free_list_only.c
index 2dd991a..c65466e 100644
--- a/src/intel/vulkan/tests/state_pool_free_list_only.c
+++ b/src/intel/vulkan/tests/state_pool_free_list_only.c
@@ -37,14 +37,12 @@
    struct anv_device device = {
       .instance = &instance,
    };
-   struct anv_block_pool block_pool;
    struct anv_state_pool state_pool;
 
    assert(anv_gem_connect(&device) == 0);
 
    pthread_mutex_init(&device.mutex, NULL);
-   anv_block_pool_init(&block_pool, &device, 4096);
-   anv_state_pool_init(&state_pool, &block_pool);
+   anv_state_pool_init(&state_pool, &device, 4096);
 
    /* Grab one so a zero offset is impossible */
    anv_state_pool_alloc(&state_pool, 16, 16);
@@ -68,7 +66,6 @@
    run_state_pool_test(&state_pool);
 
    anv_state_pool_finish(&state_pool);
-   anv_block_pool_finish(&block_pool);
    pthread_mutex_destroy(&device.mutex);
 
    anv_gem_disconnect(&device);
diff --git a/src/intel/vulkan/tests/state_pool_no_free.c b/src/intel/vulkan/tests/state_pool_no_free.c
index c1bf259..f9652bf 100644
--- a/src/intel/vulkan/tests/state_pool_no_free.c
+++ b/src/intel/vulkan/tests/state_pool_no_free.c
@@ -58,14 +58,12 @@
    struct anv_device device = {
       .instance = &instance,
    };
-   struct anv_block_pool block_pool;
    struct anv_state_pool state_pool;
 
    assert(anv_gem_connect(&device) == 0);
 
    pthread_mutex_init(&device.mutex, NULL);
-   anv_block_pool_init(&block_pool, &device, 64);
-   anv_state_pool_init(&state_pool, &block_pool);
+   anv_state_pool_init(&state_pool, &device, 64);
 
    pthread_barrier_init(&barrier, NULL, NUM_THREADS);
 
@@ -111,7 +109,6 @@
    }
 
    anv_state_pool_finish(&state_pool);
-   anv_block_pool_finish(&block_pool);
    pthread_mutex_destroy(&device.mutex);
 
    anv_gem_disconnect(&device);
diff --git a/src/loader/loader.c b/src/loader/loader.c
index 5541ccc..c3fc961 100644
--- a/src/loader/loader.c
+++ b/src/loader/loader.c
@@ -153,7 +153,7 @@
    return tag;
 }
 
-int loader_get_user_preferred_fd(int default_fd, int *different_device)
+int loader_get_user_preferred_fd(int default_fd, bool *different_device)
 {
 /* Arbitrary "maximum" value of drm devices. */
 #define MAX_DRM_DEVICES 32
@@ -171,7 +171,7 @@
 #endif
 
    if (prime == NULL) {
-      *different_device = 0;
+      *different_device = false;
       return default_fd;
    }
 
@@ -230,16 +230,16 @@
    return fd;
 
  err:
-   *different_device = 0;
+   *different_device = false;
 
    free(default_tag);
    free(prime);
    return default_fd;
 }
 #else
-int loader_get_user_preferred_fd(int default_fd, int *different_device)
+int loader_get_user_preferred_fd(int default_fd, bool *different_device)
 {
-   *different_device = 0;
+   *different_device = false;
    return default_fd;
 }
 #endif
diff --git a/src/loader/loader.h b/src/loader/loader.h
index 84314a4..3859b45 100644
--- a/src/loader/loader.h
+++ b/src/loader/loader.h
@@ -27,6 +27,8 @@
 #ifndef LOADER_H
 #define LOADER_H
 
+#include <stdbool.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -51,7 +53,7 @@
  */
 
 int
-loader_get_user_preferred_fd(int default_fd, int *different_device);
+loader_get_user_preferred_fd(int default_fd, bool *different_device);
 
 /* for logging.. keep this aligned with egllog.h so we can just use
  * _eglLog directly.
diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c
index 493a7f5..c2ae895 100644
--- a/src/loader/loader_dri3_helper.c
+++ b/src/loader/loader_dri3_helper.c
@@ -504,6 +504,7 @@
                                      x, y, width, height, __BLIT_FLAG_FLUSH);
    }
 
+   loader_dri3_swapbuffer_barrier(draw);
    dri3_fence_reset(draw->conn, back);
    dri3_copy_area(draw->conn,
                   dri3_back_buffer(draw)->pixmap,
@@ -595,6 +596,7 @@
                                   front->height,
                                   0, 0, front->width,
                                   front->height, __BLIT_FLAG_FLUSH);
+   loader_dri3_swapbuffer_barrier(draw);
    loader_dri3_copy_drawable(draw, draw->drawable, front->pixmap);
 }
 
@@ -640,7 +642,7 @@
 
    draw->vtable->flush_drawable(draw, flush_flags);
 
-   back = draw->buffers[LOADER_DRI3_BACK_ID(draw->cur_back)];
+   back = draw->buffers[dri3_find_back(draw)];
    if (draw->is_different_gpu && back) {
       /* Update the linear buffer before presenting the pixmap */
       draw->ext->image->blitImage(dri_context,
@@ -1258,6 +1260,7 @@
          }
          break;
       case loader_dri3_buffer_front:
+         loader_dri3_swapbuffer_barrier(draw);
          dri3_fence_reset(draw->conn, new_buffer);
          dri3_copy_area(draw->conn,
                         draw->drawable,
@@ -1431,3 +1434,18 @@
       free(geom_reply);
    }
 }
+
+
+/**
+ * Make sure the server has flushed all pending swap buffers to hardware
+ * for this drawable. Ideally we'd want to send an X protocol request to
+ * have the server block our connection until the swaps are complete. That
+ * would avoid the potential round-trip here.
+ */
+void
+loader_dri3_swapbuffer_barrier(struct loader_dri3_drawable *draw)
+{
+   int64_t ust, msc, sbc;
+
+   (void) loader_dri3_wait_for_sbc(draw, 0, &ust, &msc, &sbc);
+}
diff --git a/src/loader/loader_dri3_helper.h b/src/loader/loader_dri3_helper.h
index a865e46..659b63a 100644
--- a/src/loader/loader_dri3_helper.h
+++ b/src/loader/loader_dri3_helper.h
@@ -241,4 +241,7 @@
 
 void
 loader_dri3_update_drawable_geometry(struct loader_dri3_drawable *draw);
+
+void
+loader_dri3_swapbuffer_barrier(struct loader_dri3_drawable *draw);
 #endif
diff --git a/src/mapi/Android.mk b/src/mapi/Android.mk
index 4445218..552bab6 100644
--- a/src/mapi/Android.mk
+++ b/src/mapi/Android.mk
@@ -75,4 +75,4 @@
 $(mapi_abi_headers): $(mapi_abi_deps)
 	@mkdir -p $(dir $@)
 	@echo "target $(PRIVATE_PRINTER): $(PRIVATE_MODULE) <= $(PRIVATE_APIXML)"
-	$(hide) $(PRIVATE_SCRIPT) --printer $(PRIVATE_PRINTER) --mode lib $(PRIVATE_APIXML) > $@
+	$(hide) $(PRIVATE_SCRIPT) --printer $(PRIVATE_PRINTER) $(PRIVATE_APIXML) > $@
diff --git a/src/mapi/Makefile.am b/src/mapi/Makefile.am
index 3133462..83e32d2 100644
--- a/src/mapi/Makefile.am
+++ b/src/mapi/Makefile.am
@@ -93,7 +93,7 @@
 
 shared-glapi/glapi_mapi_tmp.h : glapi/gen/gl_and_es_API.xml $(glapi_gen_mapi_deps)
 	$(MKDIR_GEN)
-	$(PYTHON_GEN) $(srcdir)/mapi_abi.py --mode lib --printer shared-glapi \
+	$(PYTHON_GEN) $(srcdir)/mapi_abi.py --printer shared-glapi \
 		$(srcdir)/glapi/gen/gl_and_es_API.xml > $@
 
 if HAVE_OPENGL
@@ -187,14 +187,12 @@
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)
 
-if HAVE_SHARED_GLAPI
 es1api_libGLESv1_CM_la_LIBADD += shared-glapi/libglapi.la
 endif
-endif
 
 es1api/glapi_mapi_tmp.h: glapi/gen/gl_and_es_API.xml $(glapi_gen_mapi_deps)
 	$(MKDIR_GEN)
-	$(PYTHON_GEN) $(srcdir)/mapi_abi.py --mode lib --printer es1api \
+	$(PYTHON_GEN) $(srcdir)/mapi_abi.py --printer es1api \
 		$(srcdir)/glapi/gen/gl_and_es_API.xml > $@
 
 if HAVE_OPENGL_ES2
@@ -234,17 +232,17 @@
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)
 
-if HAVE_SHARED_GLAPI
 es2api_libGLESv2_la_LIBADD += shared-glapi/libglapi.la
 endif
-endif
 
 es2api/glapi_mapi_tmp.h: glapi/gen/gl_and_es_API.xml $(glapi_gen_mapi_deps)
 	$(MKDIR_GEN)
-	$(PYTHON_GEN) $(srcdir)/mapi_abi.py --mode lib --printer es2api \
+	$(PYTHON_GEN) $(srcdir)/mapi_abi.py --printer es2api \
 		$(srcdir)/glapi/gen/gl_and_es_API.xml > $@
 
 include $(top_srcdir)/install-lib-links.mk
 
+if NEED_KHRPLATFORM
 khrdir = $(includedir)/KHR
 khr_HEADERS = $(top_srcdir)/include/KHR/khrplatform.h
+endif
diff --git a/src/mapi/Makefile.sources b/src/mapi/Makefile.sources
index 37d6ef3..5647158 100644
--- a/src/mapi/Makefile.sources
+++ b/src/mapi/Makefile.sources
@@ -26,6 +26,8 @@
 	entry_x86-64_tls.h \
 	entry_x86_tls.h \
 	entry_x86_tsd.h \
+	entry_ppc64le_tls.h \
+	entry_ppc64le_tsd.h \
 	mapi_tmp.h
 
 MAPI_FILES = \
diff --git a/src/mapi/entry.c b/src/mapi/entry.c
index 27d0db4..1e25012 100644
--- a/src/mapi/entry.c
+++ b/src/mapi/entry.c
@@ -25,8 +25,12 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
+#include <stdlib.h>
+#include <stdint.h>
+
 #include "entry.h"
 #include "u_current.h"
+#include "util/u_endian.h"
 
 #define _U_STRINGIFY(x) #x
 #define U_STRINGIFY(x) _U_STRINGIFY(x)
@@ -49,11 +53,15 @@
 #   endif
 #elif defined(USE_X86_64_ASM) && defined(__GNUC__) && defined(GLX_USE_TLS)
 #   include "entry_x86-64_tls.h"
+#elif defined(USE_PPC64LE_ASM) && defined(__GNUC__) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+#   ifdef GLX_USE_TLS
+#      include "entry_ppc64le_tls.h"
+#   else
+#      include "entry_ppc64le_tsd.h"
+#   endif
 #else
 
-#include <stdlib.h>
-
-static inline const struct mapi_table *
+static inline const struct _glapi_table *
 entry_current_get(void)
 {
 #ifdef MAPI_MODE_BRIDGE
diff --git a/src/mapi/entry_ppc64le_tls.h b/src/mapi/entry_ppc64le_tls.h
new file mode 100644
index 0000000..e09a117
--- /dev/null
+++ b/src/mapi/entry_ppc64le_tls.h
@@ -0,0 +1,152 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2017 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Ben Crocker <bcrocker@redhat.com>
+ */
+
+#ifdef HAVE_FUNC_ATTRIBUTE_VISIBILITY
+#define HIDDEN __attribute__((visibility("hidden")))
+#else
+#define HIDDEN
+#endif
+
+// NOTE: These must be powers of two:
+#define PPC64LE_ENTRY_SIZE 64
+#define PPC64LE_PAGE_ALIGN 65536
+#if ((PPC64LE_ENTRY_SIZE & (PPC64LE_ENTRY_SIZE - 1)) != 0)
+#error PPC64LE_ENTRY_SIZE must be a power of two!
+#endif
+#if ((PPC64LE_PAGE_ALIGN & (PPC64LE_PAGE_ALIGN - 1)) != 0)
+#error PPC64LE_PAGE_ALIGN must be a power of two!
+#endif
+
+__asm__(".text\n"
+        ".balign " U_STRINGIFY(PPC64LE_ENTRY_SIZE) "\n"
+        "ppc64le_entry_start:");
+
+#define STUB_ASM_ENTRY(func)                            \
+   ".globl " func "\n"                                  \
+   ".type " func ", @function\n"                        \
+   ".balign " U_STRINGIFY(PPC64LE_ENTRY_SIZE) "\n"        \
+   func ":\n\t"                                         \
+   "  addis  2, 12, .TOC.-" func "@ha\n\t"              \
+   "  addi   2, 2, .TOC.-" func "@l\n\t"                \
+   "  .localentry  " func ", .-" func "\n\t"
+
+#define STUB_ASM_CODE(slot)                                     \
+   "  addis  11, 2, " ENTRY_CURRENT_TABLE "@got@tprel@ha\n\t"   \
+   "  ld     11, " ENTRY_CURRENT_TABLE "@got@tprel@l(11)\n\t"   \
+   "  add    11, 11," ENTRY_CURRENT_TABLE "@tls\n\t"            \
+   "  ld     11, 0(11)\n\t"          /* r11 = table pointer */  \
+   "  ld     12, " slot "*8(11)\n\t" /* r12 = table[slot] */    \
+   "  mtctr  12\n\t"                                            \
+   "  bctr\n"                        /* jump, no link */
+
+#define MAPI_TMP_STUB_ASM_GCC
+#include "mapi_tmp.h"
+
+#ifndef MAPI_MODE_BRIDGE
+
+#include <string.h>
+#include "u_execmem.h"
+
+void
+entry_patch_public(void)
+{
+}
+
+extern char
+ppc64le_entry_start[] HIDDEN;
+
+mapi_func
+entry_get_public(int slot)   /* address of the slot'th fixed-size stub */
+{
+   return (mapi_func) &ppc64le_entry_start[slot * PPC64LE_ENTRY_SIZE];
+}
+
+__asm__(".text\n");
+
+__asm__("ppc64le_dispatch_tls:\n\t"
+        "  addis  3, 2, " ENTRY_CURRENT_TABLE "@got@tprel@ha\n\t"
+        "  ld     3, " ENTRY_CURRENT_TABLE "@got@tprel@l(3)\n\t"
+        "  blr\n"
+        );
+
+extern uint64_t ppc64le_dispatch_tls(void);   /* asm helper defined above */
+
+static const uint32_t code_templ[] = {
+   // This should be functionally the same code as would be generated from
+   // the STUB_ASM_CODE macro, but defined as a buffer.
+   // This is used to generate new dispatch stubs. Mesa will copy this
+   // data to the dispatch stub, and then it will patch the slot number and
+   // any addresses that it needs to.
+   // NOTE!!!  NOTE!!!  NOTE!!!
+   // This representation is correct for both little- and big-endian systems.
+   // However, more work needs to be done for big-endian Linux because it
+   // adheres to an older, AIX-compatible ABI that uses function descriptors.
+   // 1000:
+   0x7C0802A6,    // <ENTRY+00>:   mflr   0
+   0xF8010010,    // <ENTRY+04>:   std    0, 16(1)
+   0xE96C0028,    // <ENTRY+08>:   ld     11, 9000f-1000b+0(12)
+   0x7D6B6A14,    // <ENTRY+12>:   add    11, 11, 13
+   0xE96B0000,    // <ENTRY+16>:   ld     11, 0(11)
+   0xE80C0030,    // <ENTRY+20>:   ld     0, 9000f-1000b+8(12)
+   0x7D8B002A,    // <ENTRY+24>:   ldx    12, 11, 0
+   0x7D8903A6,    // <ENTRY+28>:   mtctr  12
+   0x4E800420,    // <ENTRY+32>:   bctr
+   0x60000000,    // <ENTRY+36>:   nop
+   // 9000:
+   0, 0,          // <ENTRY+40>:    .quad _glapi_tls_Dispatch
+   0, 0           // <ENTRY+48>:    .quad <slot>*8
+};
+static const uint64_t TEMPLATE_OFFSET_TLS_ADDR = sizeof(code_templ) - 2*8;
+static const uint64_t TEMPLATE_OFFSET_SLOT = sizeof(code_templ) - 1*8;
+
+void
+entry_patch(mapi_func entry, int slot)
+{
+   uint64_t *quad = (uint64_t *) ((char *) entry + TEMPLATE_OFFSET_TLS_ADDR);
+   quad[0] = ppc64le_dispatch_tls();    /* value the stub adds to r13 */
+   quad[1] = slot * sizeof(mapi_func);  /* byte offset of the slot in the table */
+}
+
+mapi_func
+entry_generate(int slot)
+{
+   /* Room for one copy of the template, obtained from u_execmem_alloc(). */
+   void *stub = u_execmem_alloc(sizeof(code_templ));
+
+   if (stub == NULL)
+      return NULL;
+
+   /* Lay down the generic stub, then let entry_patch() fill in the data
+    * quads at its tail.  NOTE(review): no explicit instruction-cache sync
+    * follows the copy -- confirm that is fine on the targeted CPUs. */
+   memcpy(stub, code_templ, sizeof(code_templ));
+   entry_patch((mapi_func) stub, slot);
+
+   return (mapi_func) stub;
+}
+
+#endif /* MAPI_MODE_BRIDGE */
diff --git a/src/mapi/entry_ppc64le_tsd.h b/src/mapi/entry_ppc64le_tsd.h
new file mode 100644
index 0000000..a583b93
--- /dev/null
+++ b/src/mapi/entry_ppc64le_tsd.h
@@ -0,0 +1,210 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2017 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Ben Crocker <bcrocker@redhat.com>
+ */
+
+#ifdef HAVE_FUNC_ATTRIBUTE_VISIBILITY
+#define HIDDEN __attribute__((visibility("hidden")))
+#else
+#define HIDDEN
+#endif
+
+// NOTE: These must be powers of two:
+#define PPC64LE_ENTRY_SIZE 256
+#define PPC64LE_PAGE_ALIGN 65536
+#if ((PPC64LE_ENTRY_SIZE & (PPC64LE_ENTRY_SIZE - 1)) != 0)
+#error PPC64LE_ENTRY_SIZE must be a power of two!
+#endif
+#if ((PPC64LE_PAGE_ALIGN & (PPC64LE_PAGE_ALIGN - 1)) != 0)
+#error PPC64LE_PAGE_ALIGN must be a power of two!
+#endif
+
+__asm__(".text\n"
+        ".balign " U_STRINGIFY(PPC64LE_ENTRY_SIZE) "\n"
+        "ppc64le_entry_start:");
+
+#define STUB_ASM_ENTRY(func)                            \
+   ".globl " func "\n"                                  \
+   ".type " func ", @function\n"                        \
+   ".balign " U_STRINGIFY(PPC64LE_ENTRY_SIZE) "\n"        \
+   func ":\n\t"                                         \
+   "  addis  2, 12, .TOC.-" func "@ha\n\t"              \
+   "  addi   2, 2, .TOC.-" func "@l\n\t"                \
+   "  .localentry  " func ", .-" func "\n\t"
+
+#define STUB_ASM_CODE(slot)                                         \
+   "  addis  11, 2, " ENTRY_CURRENT_TABLE "@got@ha\n\t"             \
+   "  ld     11, " ENTRY_CURRENT_TABLE "@got@l(11)\n\t"             \
+   "  ld     11, 0(11)\n\t"                                         \
+   "  cmpldi 11, 0\n\t"                                             \
+   "  beq    2000f\n"                                               \
+   "1050:\n\t"                                                      \
+   "  ld     12, " slot "*8(11)\n\t"                                \
+   "  mtctr  12\n\t"                                                \
+   "  bctr\n"                                                       \
+   "2000:\n\t"                                                      \
+   "  mflr   0\n\t"                                                 \
+   "  std    0, 16(1)\n\t"                                          \
+   "  std    2, 40(1)\n\t"                                          \
+   "  stdu   1, -144(1)\n\t"                                        \
+   "  std    3, 56(1)\n\t"                                          \
+   "  std    4, 64(1)\n\t"                                          \
+   "  std    5, 72(1)\n\t"                                          \
+   "  std    6, 80(1)\n\t"                                          \
+   "  std    7, 88(1)\n\t"                                          \
+   "  std    8, 96(1)\n\t"                                          \
+   "  std    9, 104(1)\n\t"                                         \
+   "  std    10, 112(1)\n\t"                                        \
+   "  std    12, 128(1)\n\t"                                        \
+   "  addis  12, 2, " ENTRY_CURRENT_TABLE_GET "@got@ha\n\t"         \
+   "  ld     12, " ENTRY_CURRENT_TABLE_GET "@got@l(12)\n\t"         \
+   "  mtctr  12\n\t"                                                \
+   "  bctrl\n\t"                                                    \
+   "  ld     2, 144+40(1)\n\t"                                      \
+   "  mr     11, 3\n\t"                                             \
+   "  ld     3, 56(1)\n\t"                                          \
+   "  ld     4, 64(1)\n\t"                                          \
+   "  ld     5, 72(1)\n\t"                                          \
+   "  ld     6, 80(1)\n\t"                                          \
+   "  ld     7, 88(1)\n\t"                                          \
+   "  ld     8, 96(1)\n\t"                                          \
+   "  ld     9, 104(1)\n\t"                                         \
+   "  ld     10, 112(1)\n\t"                                        \
+   "  ld     12, 128(1)\n\t"                                        \
+   "  addi   1, 1, 144\n\t"                                         \
+   "  ld     0, 16(1)\n\t"                                          \
+   "  mtlr   0\n\t"                                                 \
+   "  b      1050b\n"
+
+#define MAPI_TMP_STUB_ASM_GCC
+#include "mapi_tmp.h"
+
+#ifndef MAPI_MODE_BRIDGE
+
+#include <string.h>
+#include "u_execmem.h"
+
+void
+entry_patch_public(void)
+{
+}
+
+extern char
+ppc64le_entry_start[] HIDDEN;
+
+mapi_func
+entry_get_public(int slot)   /* address of the slot'th fixed-size stub */
+{
+   return (mapi_func) &ppc64le_entry_start[slot * PPC64LE_ENTRY_SIZE];
+}
+
+static const uint32_t code_templ[] = {
+   // This should be functionally the same code as would be generated from
+   // the STUB_ASM_CODE macro, but defined as a buffer.
+   // This is used to generate new dispatch stubs. Mesa will copy this
+   // data to the dispatch stub, and then it will patch the slot number and
+   // any addresses that it needs to.
+   // NOTE!!!  NOTE!!!  NOTE!!!
+   // This representation is correct for both little- and big-endian systems.
+   // However, more work needs to be done for big-endian Linux because it
+   // adheres to an older, AIX-compatible ABI that uses function descriptors.
+   // 1000:
+   0x7C0802A6,    // <ENTRY+000>:    mflr   0
+   0xF8010010,    // <ENTRY+004>:    std    0, 16(1)
+   0xE96C0098,    // <ENTRY+008>:    ld     11, 9000f-1000b+0(12)
+   0xE96B0000,    // <ENTRY+012>:    ld     11, 0(11)
+   0x282B0000,    // <ENTRY+016>:    cmpldi 11, 0
+   0x41820014,    // <ENTRY+020>:    beq    2000f
+   // 1050:
+   0xE80C00A8,    // <ENTRY+024>:    ld     0, 9000f-1000b+16(12)
+   0x7D8B002A,    // <ENTRY+028>:    ldx    12, 11, 0
+   0x7D8903A6,    // <ENTRY+032>:    mtctr  12
+   0x4E800420,    // <ENTRY+036>:    bctr
+   // 2000:
+   0xF8410028,    // <ENTRY+040>:    std    2, 40(1)
+   0xF821FF71,    // <ENTRY+044>:    stdu   1, -144(1)
+   0xF8610038,    // <ENTRY+048>:    std    3, 56(1)
+   0xF8810040,    // <ENTRY+052>:    std    4, 64(1)
+   0xF8A10048,    // <ENTRY+056>:    std    5, 72(1)
+   0xF8C10050,    // <ENTRY+060>:    std    6, 80(1)
+   0xF8E10058,    // <ENTRY+064>:    std    7, 88(1)
+   0xF9010060,    // <ENTRY+068>:    std    8, 96(1)
+   0xF9210068,    // <ENTRY+072>:    std    9, 104(1)
+   0xF9410070,    // <ENTRY+076>:    std    10, 112(1)
+   0xF9810080,    // <ENTRY+080>:    std    12, 128(1)
+   0xE98C00A0,    // <ENTRY+084>:    ld     12, 9000f-1000b+8(12)
+   0x7D8903A6,    // <ENTRY+088>:    mtctr  12
+   0x4E800421,    // <ENTRY+092>:    bctrl
+   0x7C6B1B78,    // <ENTRY+096>:    mr     11, 3
+   0xE8610038,    // <ENTRY+100>:    ld     3, 56(1)
+   0xE8810040,    // <ENTRY+104>:    ld     4, 64(1)
+   0xE8A10048,    // <ENTRY+108>:    ld     5, 72(1)
+   0xE8C10050,    // <ENTRY+112>:    ld     6, 80(1)
+   0xE8E10058,    // <ENTRY+116>:    ld     7, 88(1)
+   0xE9010060,    // <ENTRY+120>:    ld     8, 96(1)
+   0xE9210068,    // <ENTRY+124>:    ld     9, 104(1)
+   0xE9410070,    // <ENTRY+128>:    ld     10, 112(1)
+   0xE9810080,    // <ENTRY+132>:    ld     12, 128(1)
+   0x38210090,    // <ENTRY+136>:    addi   1, 1, 144
+   0xE8010010,    // <ENTRY+140>:    ld     0, 16(1)
+   0x7C0803A6,    // <ENTRY+144>:    mtlr   0
+   0x4BFFFF84,    // <ENTRY+148>:    b      1050b
+   // 9000:
+   0, 0,          // <ENTRY+152>:    .quad ENTRY_CURRENT_TABLE
+   0, 0,          // <ENTRY+160>:    .quad ENTRY_CURRENT_TABLE_GET
+   0, 0           // <ENTRY+168>:    .quad <slot>*8
+};
+static const uint64_t TEMPLATE_OFFSET_CURRENT_TABLE = sizeof(code_templ) - 3*8;
+static const uint64_t TEMPLATE_OFFSET_CURRENT_TABLE_GET = sizeof(code_templ) - 2*8;
+static const uint64_t TEMPLATE_OFFSET_SLOT = sizeof(code_templ) - 1*8;
+
+void
+entry_patch(mapi_func entry, int slot)
+{
+   uint64_t *quad = (uint64_t *) ((char *) entry + TEMPLATE_OFFSET_CURRENT_TABLE);
+   quad[0] = (uint64_t) &u_current_table;  /* not the ENTRY_CURRENT_TABLE string */
+   quad[1] = (uint64_t) u_current_get_table_internal;  /* called if table is NULL */
+   quad[2] = slot * sizeof(mapi_func);     /* byte offset of the slot in the table */
+}
+
+mapi_func
+entry_generate(int slot)
+{
+   /* Room for one copy of the template, obtained from u_execmem_alloc(). */
+   void *stub = u_execmem_alloc(sizeof(code_templ));
+
+   if (stub == NULL)
+      return NULL;
+
+   /* Lay down the generic stub, then let entry_patch() fill in the data
+    * quads at its tail.  NOTE(review): no explicit instruction-cache sync
+    * follows the copy -- confirm that is fine on the targeted CPUs. */
+   memcpy(stub, code_templ, sizeof(code_templ));
+   entry_patch((mapi_func) stub, slot);
+
+   return (mapi_func) stub;
+}
+
+#endif /* MAPI_MODE_BRIDGE */
diff --git a/src/mapi/glapi/gen/APPLE_vertex_array_object.xml b/src/mapi/glapi/gen/APPLE_vertex_array_object.xml
index 7312f9b..daf6990 100644
--- a/src/mapi/glapi/gen/APPLE_vertex_array_object.xml
+++ b/src/mapi/glapi/gen/APPLE_vertex_array_object.xml
@@ -5,21 +5,21 @@
 <category name="GL_APPLE_vertex_array_object" number="273">
     <enum name="VERTEX_ARRAY_BINDING_APPLE"               value="0x85B5"/>
 
-    <function name="BindVertexArrayAPPLE" deprecated="3.1">
+    <function name="BindVertexArrayAPPLE" deprecated="3.1" exec="skip">
         <param name="array" type="GLuint"/>
     </function>
 
-    <function name="DeleteVertexArraysAPPLE" alias="DeleteVertexArrays">
+    <function name="DeleteVertexArraysAPPLE" exec="skip">
         <param name="n" type="GLsizei"/>
 	<param name="arrays" type="const GLuint *"/>
     </function>
 
-    <function name="GenVertexArraysAPPLE" deprecated="3.1">
+    <function name="GenVertexArraysAPPLE" deprecated="3.1" exec="skip">
         <param name="n" type="GLsizei"/>
 	<param name="arrays" type="GLuint *" count="n" output="true"/>
     </function>
 
-    <function name="IsVertexArrayAPPLE" alias="IsVertexArray">
+    <function name="IsVertexArrayAPPLE" exec="skip">
         <param name="array" type="GLuint"/>
 	<return type="GLboolean"/>
     </function>
diff --git a/src/mapi/glapi/gen/ARB_bindless_texture.xml b/src/mapi/glapi/gen/ARB_bindless_texture.xml
new file mode 100644
index 0000000..762cadf
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_bindless_texture.xml
@@ -0,0 +1,100 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<OpenGLAPI>
+
+<category name="GL_ARB_bindless_texture" number="152">
+
+   <enum name="UNSIGNED_INT64_ARB" value="0x140F" />
+   <type name="uint64EXT" unsigned="true" size="8"/>
+
+   <function name="GetTextureHandleARB" no_error="true">
+      <return type="GLuint64"/>
+      <param name="texture" type="GLuint" />
+   </function>
+
+   <function name="GetTextureSamplerHandleARB" no_error="true">
+      <return type="GLuint64"/>
+      <param name="texture" type="GLuint" />
+      <param name="sampler" type="GLuint" />
+   </function>
+
+   <function name="MakeTextureHandleResidentARB" no_error="true">
+      <param name="handle" type="GLuint64" />
+   </function>
+
+   <function name="MakeTextureHandleNonResidentARB" no_error="true">
+      <param name="handle" type="GLuint64" />
+   </function>
+
+   <function name="GetImageHandleARB" no_error="true">
+      <return type="GLuint64"/>
+      <param name="texture" type="GLuint" />
+      <param name="level" type="GLint" />
+      <param name="layered" type="GLboolean" />
+      <param name="layer" type="GLint" />
+      <param name="format" type="GLenum" />
+   </function>
+
+   <function name="MakeImageHandleResidentARB" no_error="true">
+      <param name="handle" type="GLuint64" />
+      <param name="access" type="GLenum" />
+   </function>
+
+   <function name="MakeImageHandleNonResidentARB" no_error="true">
+      <param name="handle" type="GLuint64" />
+   </function>
+
+   <function name="UniformHandleui64ARB">
+      <param name="location" type="GLint" />
+      <param name="value" type="GLuint64" />
+   </function>
+
+   <function name="UniformHandleui64vARB">
+      <param name="location" type="GLint" />
+      <param name="count" type="GLsizei" />
+      <param name="value" type="const GLuint64 *" />
+   </function>
+
+   <function name="ProgramUniformHandleui64ARB">
+      <param name="program" type="GLuint" />
+      <param name="location" type="GLint" />
+      <param name="value" type="GLuint64" />
+   </function>
+
+   <function name="ProgramUniformHandleui64vARB">
+      <param name="program" type="GLuint" />
+      <param name="location" type="GLint" />
+      <param name="count" type="GLsizei" />
+      <param name="value" type="const GLuint64 *" />
+   </function>
+
+   <function name="IsTextureHandleResidentARB" no_error="true">
+      <return type="GLboolean"/>
+      <param name="handle" type="GLuint64" />
+   </function>
+
+   <function name="IsImageHandleResidentARB" no_error="true">
+      <return type="GLboolean"/>
+      <param name="handle" type="GLuint64" />
+   </function>
+
+   <function name="VertexAttribL1ui64ARB" exec="dynamic">
+      <param name="index" type="GLuint" />
+      <param name="x" type="GLuint64EXT" />
+   </function>
+
+   <function name="VertexAttribL1ui64vARB" exec="dynamic">
+      <param name="index" type="GLuint" />
+      <param name="v" type="const GLuint64EXT *" />
+   </function>
+
+   <function name="GetVertexAttribLui64vARB">
+      <param name="index" type="GLuint" />
+      <param name="pname" type="GLenum" />
+      <param name="params" type="GLuint64EXT *" />
+   </function>
+
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/ARB_blend_func_extended.xml b/src/mapi/glapi/gen/ARB_blend_func_extended.xml
index 406140f..10d85a7 100644
--- a/src/mapi/glapi/gen/ARB_blend_func_extended.xml
+++ b/src/mapi/glapi/gen/ARB_blend_func_extended.xml
@@ -8,7 +8,7 @@
 
 <category name="GL_ARB_blend_func_extended" number="78">
 
-    <function name="BindFragDataLocationIndexed">
+    <function name="BindFragDataLocationIndexed" no_error="true">
         <param name="program" type="GLuint"/>
         <param name="colorNumber" type="GLuint"/>
         <param name="index" type="GLuint"/>
diff --git a/src/mapi/glapi/gen/ARB_clip_control.xml b/src/mapi/glapi/gen/ARB_clip_control.xml
index ecce133..051a648 100644
--- a/src/mapi/glapi/gen/ARB_clip_control.xml
+++ b/src/mapi/glapi/gen/ARB_clip_control.xml
@@ -14,7 +14,7 @@
     <enum name="CLIP_ORIGIN" value = "0x935C"/>
     <enum name="CLIP_DEPTH_MODE" value = "0x935D"/>
 
-    <function name="ClipControl">
+    <function name="ClipControl" no_error="true">
         <param name="origin" type="GLenum"/>
         <param name="depth" type="GLenum"/>
         <!-- <glx rop="1340"/> -->
diff --git a/src/mapi/glapi/gen/ARB_compute_shader.xml b/src/mapi/glapi/gen/ARB_compute_shader.xml
index c2ec842..84cbdf3 100644
--- a/src/mapi/glapi/gen/ARB_compute_shader.xml
+++ b/src/mapi/glapi/gen/ARB_compute_shader.xml
@@ -26,13 +26,13 @@
   <enum name="DISPATCH_INDIRECT_BUFFER_BINDING"                value="0x90EF"/>
   <enum name="COMPUTE_SHADER_BIT"                              value="0x00000020"/>
 
-  <function name="DispatchCompute" es2="3.1">
+  <function name="DispatchCompute" es2="3.1" no_error="true">
     <param name="num_groups_x" type="GLuint"/>
     <param name="num_groups_y" type="GLuint"/>
     <param name="num_groups_z" type="GLuint"/>
   </function>
 
-  <function name="DispatchComputeIndirect" es2="3.1">
+  <function name="DispatchComputeIndirect" es2="3.1" no_error="true">
     <param name="indirect" type="GLintptr"/>
   </function>
 </category>
diff --git a/src/mapi/glapi/gen/ARB_compute_variable_group_size.xml b/src/mapi/glapi/gen/ARB_compute_variable_group_size.xml
index b21c52f..a54c591 100644
--- a/src/mapi/glapi/gen/ARB_compute_variable_group_size.xml
+++ b/src/mapi/glapi/gen/ARB_compute_variable_group_size.xml
@@ -12,7 +12,7 @@
   <enum name="MAX_COMPUTE_VARIABLE_GROUP_SIZE_ARB"        value="0x9345"/>
   <enum name="MAX_COMPUTE_FIXED_GROUP_SIZE_ARB"           value="0x91BF"/>
 
-  <function name="DispatchComputeGroupSizeARB">
+  <function name="DispatchComputeGroupSizeARB" no_error="true">
     <param name="num_groups_x" type="GLuint"/>
     <param name="num_groups_y" type="GLuint"/>
     <param name="num_groups_z" type="GLuint"/>
diff --git a/src/mapi/glapi/gen/ARB_copy_buffer.xml b/src/mapi/glapi/gen/ARB_copy_buffer.xml
index d1c6f1f..12e0c19 100644
--- a/src/mapi/glapi/gen/ARB_copy_buffer.xml
+++ b/src/mapi/glapi/gen/ARB_copy_buffer.xml
@@ -11,7 +11,7 @@
     <enum name="COPY_READ_BUFFER"   value="0x8F36"/>
     <enum name="COPY_WRITE_BUFFER"  value="0x8F37"/>
 
-    <function name="CopyBufferSubData" es2="3.0">
+    <function name="CopyBufferSubData" es2="3.0" no_error="true">
         <param name="readTarget" type="GLenum"/>
         <param name="writeTarget" type="GLenum"/>
         <param name="readOffset" type="GLintptr"/>
diff --git a/src/mapi/glapi/gen/ARB_copy_image.xml b/src/mapi/glapi/gen/ARB_copy_image.xml
index 9ee2ba3..fb4c9b1 100644
--- a/src/mapi/glapi/gen/ARB_copy_image.xml
+++ b/src/mapi/glapi/gen/ARB_copy_image.xml
@@ -5,7 +5,7 @@
 
 <category name="GL_ARB_copy_image" number="123">
 
-    <function name="CopyImageSubData" es2="3.2">
+    <function name="CopyImageSubData" es2="3.2" no_error="true">
         <param name="srcName" type="GLuint"/>
         <param name="srcTarget" type="GLenum"/>
         <param name="srcLevel" type="GLint"/>
diff --git a/src/mapi/glapi/gen/ARB_direct_state_access.xml b/src/mapi/glapi/gen/ARB_direct_state_access.xml
index 43841bb..0c34b63 100644
--- a/src/mapi/glapi/gen/ARB_direct_state_access.xml
+++ b/src/mapi/glapi/gen/ARB_direct_state_access.xml
@@ -49,33 +49,33 @@
 
    <!-- Buffer object functions -->
 
-   <function name="CreateBuffers">
+   <function name="CreateBuffers" no_error="true">
       <param name="n" type="GLsizei" />
       <param name="buffers" type="GLuint *" />
    </function>
 
-   <function name="NamedBufferStorage">
+   <function name="NamedBufferStorage" no_error="true">
       <param name="buffer" type="GLuint" />
       <param name="size" type="GLsizeiptr" />
       <param name="data" type="const GLvoid *" />
       <param name="flags" type="GLbitfield" />
    </function>
 
-   <function name="NamedBufferData">
+   <function name="NamedBufferData" marshal="custom">
       <param name="buffer" type="GLuint" />
       <param name="size" type="GLsizeiptr" />
       <param name="data" type="const GLvoid *" />
       <param name="usage" type="GLenum" />
    </function>
 
-   <function name="NamedBufferSubData">
+   <function name="NamedBufferSubData" no_error="true" marshal="custom">
       <param name="buffer" type="GLuint" />
       <param name="offset" type="GLintptr" />
       <param name="size" type="GLsizeiptr" />
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="CopyNamedBufferSubData">
+   <function name="CopyNamedBufferSubData" no_error="true">
       <param name="readBuffer" type="GLuint" />
       <param name="writeBuffer" type="GLuint" />
       <param name="readOffset" type="GLintptr" />
@@ -101,13 +101,13 @@
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="MapNamedBuffer">
+   <function name="MapNamedBuffer" no_error="true">
       <return type="GLvoid *" />
       <param name="buffer" type="GLuint" />
       <param name="access" type="GLenum" />
    </function>
 
-   <function name="MapNamedBufferRange">
+   <function name="MapNamedBufferRange" no_error="true">
       <return type="GLvoid *" />
       <param name="buffer" type="GLuint" />
       <param name="offset" type="GLintptr" />
@@ -115,12 +115,12 @@
       <param name="access" type="GLbitfield" />
    </function>
 
-   <function name="UnmapNamedBuffer">
+   <function name="UnmapNamedBuffer" no_error="true">
       <return type="GLboolean" />
       <param name="buffer" type="GLuint" />
    </function>
 
-   <function name="FlushMappedNamedBufferRange">
+   <function name="FlushMappedNamedBufferRange" no_error="true">
       <param name="buffer" type="GLuint" />
       <param name="offset" type="GLintptr" />
       <param name="length" type="GLsizeiptr" />
@@ -171,14 +171,14 @@
       <param name="param" type="GLint" />
    </function>
 
-   <function name="NamedFramebufferTexture">
+   <function name="NamedFramebufferTexture" no_error="true">
       <param name="framebuffer" type="GLuint" />
       <param name="attachment" type="GLenum" />
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
    </function>
 
-   <function name="NamedFramebufferTextureLayer">
+   <function name="NamedFramebufferTextureLayer" no_error="true">
       <param name="framebuffer" type="GLuint" />
       <param name="attachment" type="GLenum" />
       <param name="texture" type="GLuint" />
@@ -197,7 +197,7 @@
       <param name="bufs" type="const GLenum *" />
    </function>
 
-   <function name="NamedFramebufferReadBuffer">
+   <function name="NamedFramebufferReadBuffer" no_error="true">
       <param name="framebuffer" type="GLuint" />
       <param name="buf" type="GLenum" />
    </function>
@@ -247,7 +247,7 @@
       <param name="stencil" type="GLint" />
    </function>
 
-   <function name="BlitNamedFramebuffer">
+   <function name="BlitNamedFramebuffer" no_error="true">
       <param name="readFramebuffer" type="GLuint" />
       <param name="drawFramebuffer" type="GLuint" />
       <param name="srcX0" type="GLint" />
@@ -283,7 +283,7 @@
 
    <!-- Renderbuffer object functions -->
 
-   <function name="CreateRenderbuffers">
+   <function name="CreateRenderbuffers" no_error="true">
       <param name="n" type="GLsizei" />
       <param name="renderbuffers" type="GLuint *" />
    </function>
@@ -311,7 +311,7 @@
 
    <!-- Texture object functions -->
 
-   <function name="CreateTextures">
+   <function name="CreateTextures" no_error="true">
       <param name="target" type="GLenum" />
       <param name="n" type="GLsizei" />
       <param name="textures" type="GLuint *" />
@@ -410,7 +410,7 @@
       <param name="pixels" type="const GLvoid *" />
    </function>
 
-   <function name="CompressedTextureSubImage1D">
+   <function name="CompressedTextureSubImage1D" no_error="true">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -420,7 +420,7 @@
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="CompressedTextureSubImage2D">
+   <function name="CompressedTextureSubImage2D" no_error="true">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -432,7 +432,7 @@
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="CompressedTextureSubImage3D">
+   <function name="CompressedTextureSubImage3D" no_error="true">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -446,7 +446,7 @@
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="CopyTextureSubImage1D">
+   <function name="CopyTextureSubImage1D" no_error="true">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -455,7 +455,7 @@
       <param name="width" type="GLsizei" />
    </function>
 
-   <function name="CopyTextureSubImage2D">
+   <function name="CopyTextureSubImage2D" no_error="true">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -466,7 +466,7 @@
       <param name="height" type="GLsizei" />
    </function>
 
-   <function name="CopyTextureSubImage3D">
+   <function name="CopyTextureSubImage3D" no_error="true">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -518,7 +518,7 @@
       <param name="texture" type="GLuint" />
    </function>
 
-   <function name="BindTextureUnit">
+   <function name="BindTextureUnit" no_error="true">
       <param name="unit" type="GLuint" />
       <param name="texture" type="GLuint" />
    </function>
@@ -584,12 +584,12 @@
       <param name="arrays" type="GLuint *" />
    </function>
 
-   <function name="DisableVertexArrayAttrib">
+   <function name="DisableVertexArrayAttrib" no_error="true">
       <param name="vaobj" type="GLuint" />
       <param name="index" type="GLuint" />
    </function>
 
-   <function name="EnableVertexArrayAttrib">
+   <function name="EnableVertexArrayAttrib" no_error="true">
       <param name="vaobj" type="GLuint" />
       <param name="index" type="GLuint" />
    </function>
@@ -599,7 +599,7 @@
       <param name="buffer" type="GLuint" />
    </function>
 
-   <function name="VertexArrayVertexBuffer">
+   <function name="VertexArrayVertexBuffer" no_error="true">
       <param name="vaobj" type="GLuint" />
       <param name="bindingindex" type="GLuint" />
       <param name="buffer" type="GLuint" />
@@ -607,7 +607,7 @@
       <param name="stride" type="GLsizei" />
    </function>
 
-   <function name="VertexArrayVertexBuffers">
+   <function name="VertexArrayVertexBuffers" no_error="true">
       <param name="vaobj" type="GLuint" />
       <param name="first" type="GLuint" />
       <param name="count" type="GLsizei" />
@@ -641,7 +641,7 @@
       <param name="relativeoffset" type="GLuint" />
    </function>
 
-   <function name="VertexArrayAttribBinding">
+   <function name="VertexArrayAttribBinding" no_error="true">
       <param name="vaobj" type="GLuint" />
       <param name="attribindex" type="GLuint" />
       <param name="bindingindex" type="GLuint" />
@@ -675,14 +675,14 @@
 
    <!-- Sampler object functions -->
 
-   <function name="CreateSamplers">
+   <function name="CreateSamplers" no_error="true">
       <param name="n" type="GLsizei" />
       <param name="samplers" type="GLuint *" />
    </function>
 
    <!-- Program Pipeline object functions -->
 
-   <function name="CreateProgramPipelines">
+   <function name="CreateProgramPipelines" no_error="true">
       <param name="n" type="GLsizei" />
       <param name="pipelines" type="GLuint *" />
    </function>
diff --git a/src/mapi/glapi/gen/ARB_draw_buffers_blend.xml b/src/mapi/glapi/gen/ARB_draw_buffers_blend.xml
index 8c33fbf..1a44f38 100644
--- a/src/mapi/glapi/gen/ARB_draw_buffers_blend.xml
+++ b/src/mapi/glapi/gen/ARB_draw_buffers_blend.xml
@@ -13,19 +13,19 @@
         <param name="mode" type="GLenum"/>
     </function>
 
-    <function name="BlendEquationSeparateiARB">
+    <function name="BlendEquationSeparateiARB" no_error="true">
         <param name="buf" type="GLuint"/>
         <param name="modeRGB" type="GLenum"/>
         <param name="modeA" type="GLenum"/>
     </function>
 
-    <function name="BlendFunciARB">
+    <function name="BlendFunciARB" no_error="true">
         <param name="buf" type="GLuint"/>
         <param name="src" type="GLenum"/>
         <param name="dst" type="GLenum"/>
     </function>
 
-    <function name="BlendFuncSeparateiARB">
+    <function name="BlendFuncSeparateiARB" no_error="true">
         <param name="buf" type="GLuint"/>
         <param name="srcRGB" type="GLenum"/>
         <param name="dstRGB" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/ARB_framebuffer_object.xml b/src/mapi/glapi/gen/ARB_framebuffer_object.xml
index 1573e7e..b8fff92 100644
--- a/src/mapi/glapi/gen/ARB_framebuffer_object.xml
+++ b/src/mapi/glapi/gen/ARB_framebuffer_object.xml
@@ -158,7 +158,7 @@
 	<glx rop="4317"/>
     </function>
 
-    <function name="GenRenderbuffers" es2="2.0">
+    <function name="GenRenderbuffers" es2="2.0" no_error="true">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="renderbuffers" type="GLuint *" count="n" output="true"/>
 	<glx vendorpriv="1423" always_array="true"/>
@@ -212,13 +212,13 @@
 	<glx vendorpriv="1426" always_array="true"/>
     </function>
 
-    <function name="CheckFramebufferStatus" es2="2.0">
+    <function name="CheckFramebufferStatus" es2="2.0" no_error="true">
         <param name="target" type="GLenum"/>
 	<return type="GLenum"/>
 	<glx vendorpriv="1427"/>
     </function>
 
-    <function name="FramebufferTexture1D">
+    <function name="FramebufferTexture1D" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="textarget" type="GLenum"/>
@@ -227,7 +227,7 @@
 	<glx rop="4321"/>
     </function>
 
-    <function name="FramebufferTexture2D" es2="2.0">
+    <function name="FramebufferTexture2D" es2="2.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="textarget" type="GLenum"/>
@@ -236,7 +236,7 @@
 	<glx rop="4322"/>
     </function>
 
-    <function name="FramebufferTexture3D">
+    <function name="FramebufferTexture3D" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="textarget" type="GLenum"/>
@@ -246,7 +246,7 @@
 	<glx rop="4323"/>
     </function>
 
-    <function name="FramebufferTextureLayer" es2="3.0">
+    <function name="FramebufferTextureLayer" es2="3.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="texture" type="GLuint"/>
@@ -271,7 +271,7 @@
 	<glx vendorpriv="1428"/>
     </function>
 
-    <function name="BlitFramebuffer" es2="3.0">
+    <function name="BlitFramebuffer" es2="3.0" no_error="true">
         <param name="srcX0" type="GLint"/>
         <param name="srcY0" type="GLint"/>
         <param name="srcX1" type="GLint"/>
diff --git a/src/mapi/glapi/gen/ARB_invalidate_subdata.xml b/src/mapi/glapi/gen/ARB_invalidate_subdata.xml
index 052816a..2cbc4f6 100644
--- a/src/mapi/glapi/gen/ARB_invalidate_subdata.xml
+++ b/src/mapi/glapi/gen/ARB_invalidate_subdata.xml
@@ -19,13 +19,13 @@
     <param name="level" type="GLint"/>
   </function>
 
-  <function name="InvalidateBufferSubData">
+  <function name="InvalidateBufferSubData" no_error="true">
     <param name="buffer" type="GLuint"/>
     <param name="offset" type="GLintptr"/>
     <param name="length" type="GLsizeiptr"/>
   </function>
 
-  <function name="InvalidateBufferData">
+  <function name="InvalidateBufferData" no_error="true">
     <param name="buffer" type="GLuint"/>
   </function>
 
diff --git a/src/mapi/glapi/gen/ARB_map_buffer_range.xml b/src/mapi/glapi/gen/ARB_map_buffer_range.xml
index cf7b211..35a20be 100644
--- a/src/mapi/glapi/gen/ARB_map_buffer_range.xml
+++ b/src/mapi/glapi/gen/ARB_map_buffer_range.xml
@@ -15,7 +15,7 @@
     <enum name="MAP_FLUSH_EXPLICIT_BIT"      value="0x0010"/>
     <enum name="MAP_UNSYNCHRONIZED_BIT"      value="0x0020"/>
 
-    <function name="MapBufferRange" es2="3.0">
+    <function name="MapBufferRange" es2="3.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="length" type="GLsizeiptr"/>
@@ -23,7 +23,7 @@
         <return type="GLvoid *"/>
     </function>
 
-    <function name="FlushMappedBufferRange" es2="3.0">
+    <function name="FlushMappedBufferRange" es2="3.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="length" type="GLsizeiptr"/>
diff --git a/src/mapi/glapi/gen/ARB_multi_bind.xml b/src/mapi/glapi/gen/ARB_multi_bind.xml
index f42eaa2..601680f 100644
--- a/src/mapi/glapi/gen/ARB_multi_bind.xml
+++ b/src/mapi/glapi/gen/ARB_multi_bind.xml
@@ -35,13 +35,13 @@
         <param name="samplers" type="const GLuint *"/>
     </function>
 
-    <function name="BindImageTextures">
+    <function name="BindImageTextures" no_error="true">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="textures" type="const GLuint *"/>
     </function>
 
-    <function name="BindVertexBuffers">
+    <function name="BindVertexBuffers" no_error="true">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="buffers" type="const GLuint *"/>
diff --git a/src/mapi/glapi/gen/ARB_sampler_objects.xml b/src/mapi/glapi/gen/ARB_sampler_objects.xml
index 20363f7..9fe6c41c 100644
--- a/src/mapi/glapi/gen/ARB_sampler_objects.xml
+++ b/src/mapi/glapi/gen/ARB_sampler_objects.xml
@@ -7,7 +7,7 @@
 
 <category name="GL_ARB_sampler_objects" number="81">
 
-    <function name="GenSamplers" es2="3.0">
+    <function name="GenSamplers" es2="3.0" no_error="true">
       <param name="count" type="GLsizei"/>
       <param name="samplers" type="GLuint *"/>
     </function>
diff --git a/src/mapi/glapi/gen/ARB_separate_shader_objects.xml b/src/mapi/glapi/gen/ARB_separate_shader_objects.xml
index 26a7afa..2273b48 100644
--- a/src/mapi/glapi/gen/ARB_separate_shader_objects.xml
+++ b/src/mapi/glapi/gen/ARB_separate_shader_objects.xml
@@ -15,12 +15,12 @@
       <enum   name="ALL_SHADER_BITS"                              value="0xFFFFFFFF"/>
       <enum   name="PROGRAM_SEPARABLE"                            value="0x8258"/>
 
-      <function name="UseProgramStages" es2="3.1">
+      <function name="UseProgramStages" es2="3.1" no_error="true">
          <param name="pipeline" type="GLuint" />
          <param name="stages" type="GLbitfield" />
          <param name="program" type="GLuint" />
       </function>
-      <function name="ActiveShaderProgram" es2="3.1">
+      <function name="ActiveShaderProgram" es2="3.1" no_error="true">
          <param name="pipeline" type="GLuint" />
          <param name="program" type="GLuint" />
       </function>
@@ -30,14 +30,14 @@
          <param name="strings" type="const GLchar * const *" />
          <return type="GLuint"/>
       </function>
-      <function name="BindProgramPipeline" es2="3.1">
+      <function name="BindProgramPipeline" es2="3.1" no_error="true">
          <param name="pipeline" type="GLuint" />
       </function>
       <function name="DeleteProgramPipelines" es2="3.1">
          <param name="n" type="GLsizei" />
          <param name="pipelines" type="const GLuint *" />
       </function>
-      <function name="GenProgramPipelines" es2="3.1">
+      <function name="GenProgramPipelines" es2="3.1" no_error="true">
          <param name="n" type="GLsizei" />
          <param name="pipelines" type="GLuint *" />
       </function>
diff --git a/src/mapi/glapi/gen/ARB_shader_image_load_store.xml b/src/mapi/glapi/gen/ARB_shader_image_load_store.xml
index 178e930..6e9ee1f 100644
--- a/src/mapi/glapi/gen/ARB_shader_image_load_store.xml
+++ b/src/mapi/glapi/gen/ARB_shader_image_load_store.xml
@@ -70,7 +70,7 @@
 <enum name="MAX_FRAGMENT_IMAGE_UNIFORMS" value="0x90CE"/>
 <enum name="MAX_COMBINED_IMAGE_UNIFORMS" value="0x90CF"/>
 
-<function name="BindImageTexture" es2="3.1">
+<function name="BindImageTexture" es2="3.1" no_error="true">
   <param name="unit" type="GLuint"/>
   <param name="texture" type="GLuint"/>
   <param name="level" type="GLint"/>
diff --git a/src/mapi/glapi/gen/ARB_shader_storage_buffer_object.xml b/src/mapi/glapi/gen/ARB_shader_storage_buffer_object.xml
index 6901bdf..4d22882 100644
--- a/src/mapi/glapi/gen/ARB_shader_storage_buffer_object.xml
+++ b/src/mapi/glapi/gen/ARB_shader_storage_buffer_object.xml
@@ -25,7 +25,7 @@
 <!-- Duplicated with GL3x.xml: BindBufferRange, BindBufferBase,
      GetIntegeri_v -->
 
-<function name="ShaderStorageBlockBinding">
+<function name="ShaderStorageBlockBinding" no_error="true">
     <param name="program" type="GLuint" />
     <param name="shaderStorageBlockIndex" type="GLuint" />
     <param name="shaderStorageBlockBinding" type="GLuint" />
diff --git a/src/mapi/glapi/gen/ARB_sync.xml b/src/mapi/glapi/gen/ARB_sync.xml
index d8a1c34..f1463f1 100644
--- a/src/mapi/glapi/gen/ARB_sync.xml
+++ b/src/mapi/glapi/gen/ARB_sync.xml
@@ -39,7 +39,7 @@
     -->
 
 
-    <function name="FenceSync" es2="3.0">
+    <function name="FenceSync" es2="3.0" no_error="true">
         <param name="condition" type="GLenum"/>
         <param name="flags" type="GLbitfield"/>
         <return type="GLsync"/>
@@ -54,7 +54,7 @@
         <param name="sync" type="GLsync"/>
     </function>
 
-    <function name="ClientWaitSync" es2="3.0">
+    <function name="ClientWaitSync" es2="3.0" no_error="true">
         <param name="sync" type="GLsync"/>
         <param name="flags" type="GLbitfield"/>
 	<param name="timeout" type="GLuint64"/>
diff --git a/src/mapi/glapi/gen/ARB_uniform_buffer_object.xml b/src/mapi/glapi/gen/ARB_uniform_buffer_object.xml
index cf86bbb..47bb047 100644
--- a/src/mapi/glapi/gen/ARB_uniform_buffer_object.xml
+++ b/src/mapi/glapi/gen/ARB_uniform_buffer_object.xml
@@ -86,7 +86,7 @@
 <!-- Duplicated with GL3x.xml: BindBufferRange, BindBufferBase,
      GetIntegeri_v -->
 
-<function name="UniformBlockBinding" es2="3.0">
+<function name="UniformBlockBinding" es2="3.0" no_error="true">
     <param name="program" type="GLuint" />
     <param name="uniformBlockIndex" type="GLuint" />
     <param name="uniformBlockBinding" type="GLuint" />
diff --git a/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml b/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml
index 211642f..6d76003 100644
--- a/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml
+++ b/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml
@@ -51,7 +51,7 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="VertexAttribLPointer">
+    <function name="VertexAttribLPointer" no_error="true">
         <param name="index" type="GLuint"/>
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml b/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml
index ba9ca57..d1f8db9 100644
--- a/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml
+++ b/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml
@@ -7,7 +7,7 @@
 
 <category name="GL_ARB_vertex_attrib_binding" number="125">
 
-    <function name="BindVertexBuffer" es2="3.1">
+    <function name="BindVertexBuffer" es2="3.1" no_error="true">
         <param name="bindingindex" type="GLuint"/>
         <param name="buffer" type="GLuint"/>
         <param name="offset" type="GLintptr"/>
@@ -36,7 +36,7 @@
         <param name="relativeoffset" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribBinding" es2="3.1">
+    <function name="VertexAttribBinding" es2="3.1" no_error="true">
         <param name="attribindex" type="GLuint"/>
         <param name="bindingindex" type="GLuint"/>
     </function>
diff --git a/src/mapi/glapi/gen/ARB_viewport_array.xml b/src/mapi/glapi/gen/ARB_viewport_array.xml
index ebd5b99..30c4bca 100644
--- a/src/mapi/glapi/gen/ARB_viewport_array.xml
+++ b/src/mapi/glapi/gen/ARB_viewport_array.xml
@@ -29,44 +29,44 @@
     <enum name="PROVOKING_VERTEX" value="0x8E4F"/>
     <enum name="UNDEFINED_VERTEX" value="0x8260"/>
 
-    <function name="ViewportArrayv">
+    <function name="ViewportArrayv" no_error="true">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
-        <param name="v" type="const GLfloat *"/>
+        <param name="v" type="const GLfloat *" count="count" count_scale="4"/>
     </function>
-    <function name="ViewportIndexedf">
+    <function name="ViewportIndexedf" no_error="true">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="w" type="GLfloat"/>
         <param name="h" type="GLfloat"/>
     </function>
-    <function name="ViewportIndexedfv">
+    <function name="ViewportIndexedfv" no_error="true">
         <param name="index" type="GLuint"/>
-        <param name="v" type="const GLfloat *"/>
+        <param name="v" type="const GLfloat *" count="4"/>
     </function>
-    <function name="ScissorArrayv">
+    <function name="ScissorArrayv" no_error="true">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
-        <param name="v" type="const int *"/>
+        <param name="v" type="const int *" count="count" count_scale="4"/>
     </function>
-    <function name="ScissorIndexed">
+    <function name="ScissorIndexed" no_error="true">
         <param name="index" type="GLuint"/>
         <param name="left" type="GLint"/>
         <param name="bottom" type="GLint"/>
         <param name="width" type="GLsizei"/>
         <param name="height" type="GLsizei"/>
     </function>
-    <function name="ScissorIndexedv">
+    <function name="ScissorIndexedv" no_error="true">
         <param name="index" type="GLuint"/>
-        <param name="v" type="const GLint *"/>
+        <param name="v" type="const GLint *" count="4"/>
     </function>
     <function name="DepthRangeArrayv">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
-        <param name="v" type="const GLclampd *"/>
+        <param name="v" type="const GLclampd *" count="count" count_scale="2"/>
     </function>
-    <function name="DepthRangeIndexed">
+    <function name="DepthRangeIndexed" no_error="true">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLclampd"/>
         <param name="f" type="GLclampd"/>
diff --git a/src/mapi/glapi/gen/GL3x.xml b/src/mapi/glapi/gen/GL3x.xml
index f38a287..7c86e8f 100644
--- a/src/mapi/glapi/gen/GL3x.xml
+++ b/src/mapi/glapi/gen/GL3x.xml
@@ -117,13 +117,13 @@
 
   <!-- These functions are unique to GL3 -->
 
-  <function name="ClearBufferiv" es2="3.0">
+  <function name="ClearBufferiv" es2="3.0" marshal="custom">
     <param name="buffer" type="GLenum"/>
     <param name="drawbuffer" type="GLint"/>
     <param name="value" type="const GLint *"/>
   </function>
 
-  <function name="ClearBufferuiv" es2="3.0">
+  <function name="ClearBufferuiv" es2="3.0" marshal="custom">
     <param name="buffer" type="GLenum"/>
     <param name="drawbuffer" type="GLint"/>
     <param name="value" type="const GLuint *"/>
@@ -135,7 +135,7 @@
     <param name="value" type="const GLfloat *"/>
   </function>
 
-  <function name="ClearBufferfi" es2="3.0">
+  <function name="ClearBufferfi" es2="3.0" marshal="custom">
     <param name="buffer" type="GLenum"/>
     <param name="drawbuffer" type="GLint"/>
     <param name="depth" type="GLfloat"/>
@@ -200,7 +200,7 @@
       <return type="GLint"/>
   </function>
 
-  <function name="BindFragDataLocation">
+  <function name="BindFragDataLocation" no_error="true">
     <param name="program" type="GLuint"/>
     <param name="colorNumber" type="GLuint"/>
     <param name="name" type="const GLchar *"/>
@@ -213,7 +213,7 @@
   <function name="EndTransformFeedback" es2="3.0">
   </function>
 
-  <function name="BindBufferRange" es2="3.0">
+  <function name="BindBufferRange" es2="3.0" no_error="true">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
     <param name="buffer" type="GLuint"/>
@@ -257,6 +257,7 @@
   <!-- These functions alias ones from GL_EXT_gpu_shader4 -->
 
   <function name="VertexAttribIPointer" es2="3.0" marshal="async"
+            no_error="true"
             marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
     <param name="index" type="GLuint"/>
     <param name="size" type="GLint"/>
@@ -606,7 +607,7 @@
     <param name="params" type="GLint64 *"/>
   </function>
 
-  <function name="FramebufferTexture" es2="3.2">
+  <function name="FramebufferTexture" es2="3.2" no_error="true">
     <param name="target" type="GLenum"/>
     <param name="attachment" type="GLenum"/>
     <param name="texture" type="GLuint"/>
@@ -624,7 +625,7 @@
   <enum name="TEXTURE_SWIZZLE_A"                value="0x8E45"/>
   <enum name="TEXTURE_SWIZZLE_RGBA"             value="0x8E46"/>
 
-  <function name="VertexAttribDivisor" es2="3.0">
+  <function name="VertexAttribDivisor" es2="3.0" no_error="true">
     <param name="index" type="GLuint"/>
     <param name="divisor" type="GLuint"/>
   </function>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index c793067..bd04519 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -119,6 +119,7 @@
 	gl_and_glX_API.xml \
 	ARB_base_instance.xml \
 	ARB_blend_func_extended.xml \
+	ARB_bindless_texture.xml \
 	ARB_clear_buffer_object.xml \
 	ARB_clear_texture.xml \
 	ARB_clip_control.xml \
@@ -260,7 +261,7 @@
 
 $(MESA_GLAPI_DIR)/glapi_mapi_tmp.h: $(MESA_MAPI_DIR)/mapi_abi.py $(COMMON)
 	$(PYTHON_GEN) $(MESA_MAPI_DIR)/mapi_abi.py \
-		--printer glapi --mode lib $(srcdir)/gl_and_es_API.xml > $@
+		--printer glapi $(srcdir)/gl_and_es_API.xml > $@
 
 $(MESA_GLAPI_DIR)/glprocs.h: gl_procs.py $(COMMON)
 	$(PYTHON_GEN) $(srcdir)/gl_procs.py -c -f $(srcdir)/gl_and_es_API.xml > $@
diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
index e5e1b7d..61eda4b 100644
--- a/src/mapi/glapi/gen/apiexec.py
+++ b/src/mapi/glapi/gen/apiexec.py
@@ -291,4 +291,7 @@
     "ProgramUniform2ui64vARB": exec_info(core=31),
     "ProgramUniform3ui64vARB": exec_info(core=31),
     "ProgramUniform4ui64vARB": exec_info(core=31),
+
+    # GL_ARB_bindless_texture
+    "GetVertexAttribLui64vARB": exec_info(core=31),
 }
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 3e705eb..271f0c0 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -318,7 +318,8 @@
     <enum name="POINT_SIZE_ARRAY_OES"                     value="0x8B9C"/>
     <enum name="POINT_SIZE_ARRAY_BUFFER_BINDING_OES"	  value="0x8B9F"/>
 
-    <function name="PointSizePointerOES" es1="1.0" desktop="false">
+    <function name="PointSizePointerOES" es1="1.0" desktop="false"
+              no_error="true">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 522d2e5..18839ec 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -1961,7 +1961,7 @@
         <glx rop="78"/>
     </function>
 
-    <function name="CullFace" es1="1.0" es2="2.0">
+    <function name="CullFace" es1="1.0" es2="2.0" no_error="true">
         <param name="mode" type="GLenum"/>
         <glx rop="79"/>
     </function>
@@ -1990,7 +1990,7 @@
         <glx rop="83"/>
     </function>
 
-    <function name="FrontFace" es1="1.0" es2="2.0">
+    <function name="FrontFace" es1="1.0" es2="2.0" no_error="true">
         <param name="mode" type="GLenum"/>
         <glx rop="84"/>
     </function>
@@ -2108,7 +2108,7 @@
         <glx rop="102"/>
     </function>
 
-    <function name="Scissor" es1="1.0" es2="2.0">
+    <function name="Scissor" es1="1.0" es2="2.0" no_error="true">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="width" type="GLsizei"/>
@@ -2149,7 +2149,7 @@
         <glx rop="108"/>
     </function>
 
-    <function name="TexImage1D">
+    <function name="TexImage1D" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLint"/>
@@ -2161,7 +2161,7 @@
         <glx rop="109" large="true"/>
     </function>
 
-    <function name="TexImage2D" es1="1.0" es2="2.0">
+    <function name="TexImage2D" es1="1.0" es2="2.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLint"/>
@@ -2291,7 +2291,7 @@
         <glx rop="126"/>
     </function>
 
-    <function name="Clear" es1="1.0" es2="2.0">
+    <function name="Clear" es1="1.0" es2="2.0" no_error="true">
         <param name="mask" type="GLbitfield"/>
         <glx rop="127"/>
     </function>
@@ -2545,32 +2545,32 @@
         <glx rop="159"/>
     </function>
 
-    <function name="BlendFunc" es1="1.0" es2="2.0">
+    <function name="BlendFunc" es1="1.0" es2="2.0" no_error="true">
         <param name="sfactor" type="GLenum"/>
         <param name="dfactor" type="GLenum"/>
         <glx rop="160"/>
     </function>
 
-    <function name="LogicOp" es1="1.0">
+    <function name="LogicOp" es1="1.0" no_error="true">
         <param name="opcode" type="GLenum"/>
         <glx rop="161"/>
     </function>
 
-    <function name="StencilFunc" es1="1.0" es2="2.0">
+    <function name="StencilFunc" es1="1.0" es2="2.0" no_error="true">
         <param name="func" type="GLenum"/>
         <param name="ref" type="GLint"/>
         <param name="mask" type="GLuint"/>
         <glx rop="162"/>
     </function>
 
-    <function name="StencilOp" es1="1.0" es2="2.0">
+    <function name="StencilOp" es1="1.0" es2="2.0" no_error="true">
         <param name="fail" type="GLenum"/>
         <param name="zfail" type="GLenum"/>
         <param name="zpass" type="GLenum"/>
         <glx rop="163"/>
     </function>
 
-    <function name="DepthFunc" es1="1.0" es2="2.0">
+    <function name="DepthFunc" es1="1.0" es2="2.0" no_error="true">
         <param name="func" type="GLenum"/>
         <glx rop="164"/>
     </function>
@@ -2593,13 +2593,13 @@
         <glx rop="167"/>
     </function>
 
-    <function name="PixelStoref">
+    <function name="PixelStoref" no_error="true">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx sop="109" handcode="client"/>
     </function>
 
-    <function name="PixelStorei" es1="1.0" es2="2.0">
+    <function name="PixelStorei" es1="1.0" es2="2.0" no_error="true">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx sop="110" handcode="client"/>
@@ -2626,7 +2626,7 @@
         <glx rop="170" large="true"/>
     </function>
 
-    <function name="ReadBuffer" es2="3.0">
+    <function name="ReadBuffer" es2="3.0" no_error="true">
         <param name="mode" type="GLenum"/>
         <glx rop="171"/>
     </function>
@@ -2966,7 +2966,7 @@
         <glx rop="190"/>
     </function>
 
-    <function name="Viewport" es1="1.0" es2="2.0">
+    <function name="Viewport" es1="1.0" es2="2.0" no_error="true">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="width" type="GLsizei"/>
@@ -3154,6 +3154,7 @@
     </function>
 
     <function name="ColorPointer" es1="1.0" deprecated="3.1" marshal="async"
+              no_error="true"
               marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
@@ -3184,6 +3185,7 @@
     </function>
 
     <function name="EdgeFlagPointer" deprecated="3.1" marshal="async"
+              no_error="true"
               marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
@@ -3202,6 +3204,7 @@
     </function>
 
     <function name="IndexPointer" deprecated="3.1" marshal="async"
+              no_error="true"
               marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -3217,6 +3220,7 @@
     </function>
 
     <function name="NormalPointer" es1="1.0" deprecated="3.1" marshal="async"
+              no_error="true"
               marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -3225,6 +3229,7 @@
     </function>
 
     <function name="TexCoordPointer" es1="1.0" deprecated="3.1" marshal="async"
+              no_error="true"
               marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
@@ -3234,6 +3239,7 @@
     </function>
 
     <function name="VertexPointer" es1="1.0" deprecated="3.1" marshal="async"
+              no_error="true"
               marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
@@ -3248,7 +3254,7 @@
         <glx rop="192"/>
     </function>
 
-    <function name="CopyTexImage1D">
+    <function name="CopyTexImage1D" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -3259,7 +3265,7 @@
         <glx rop="4119"/>
     </function>
 
-    <function name="CopyTexImage2D" es1="1.0" es2="2.0">
+    <function name="CopyTexImage2D" es1="1.0" es2="2.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -3271,7 +3277,7 @@
         <glx rop="4120"/>
     </function>
 
-    <function name="CopyTexSubImage1D">
+    <function name="CopyTexSubImage1D" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -3281,7 +3287,7 @@
         <glx rop="4121"/>
     </function>
 
-    <function name="CopyTexSubImage2D" es1="1.0" es2="2.0">
+    <function name="CopyTexSubImage2D" es1="1.0" es2="2.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -3293,7 +3299,7 @@
         <glx rop="4122"/>
     </function>
 
-    <function name="TexSubImage1D">
+    <function name="TexSubImage1D" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -3305,7 +3311,7 @@
         <glx rop="4099" large="true"/>
     </function>
 
-    <function name="TexSubImage2D" es1="1.0" es2="2.0">
+    <function name="TexSubImage2D" es1="1.0" es2="2.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -3339,7 +3345,7 @@
         <glx sop="144"/>
     </function>
 
-    <function name="GenTextures" es1="1.0" es2="2.0">
+    <function name="GenTextures" es1="1.0" es2="2.0" no_error="true">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="textures" type="GLuint *" output="true" count="n"/>
         <glx sop="145" always_array="true"/>
@@ -4005,7 +4011,7 @@
         <glx rop="4113"/>
     </function>
 
-    <function name="TexImage3D" es2="3.0">
+    <function name="TexImage3D" es2="3.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLint"/>
@@ -4019,7 +4025,7 @@
         <glx rop="4114" large="true"/>
     </function>
 
-    <function name="TexSubImage3D" es2="3.0">
+    <function name="TexSubImage3D" es2="3.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4035,7 +4041,7 @@
         <glx rop="4115" large="true"/>
     </function>
 
-    <function name="CopyTexSubImage3D" es2="3.0">
+    <function name="CopyTexSubImage3D" es2="3.0" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4262,7 +4268,7 @@
     <enum name="DOT3_RGB"                                 value="0x86AE"/>
     <enum name="DOT3_RGBA"                                value="0x86AF"/>
 
-    <function name="ActiveTexture" es1="1.0" es2="2.0">
+    <function name="ActiveTexture" es1="1.0" es2="2.0" no_error="true">
         <param name="texture" type="GLenum"/>
         <glx rop="197"/>
     </function>
@@ -4501,7 +4507,8 @@
         <glx rop="229"/>
     </function>
 
-    <function name="CompressedTexImage3D" es2="3.0" marshal="sync">
+    <function name="CompressedTexImage3D" es2="3.0" marshal="sync"
+              no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -4514,7 +4521,8 @@
         <glx rop="216" handcode="client"/>
     </function>
 
-    <function name="CompressedTexImage2D" es1="1.0" es2="2.0" marshal="sync">
+    <function name="CompressedTexImage2D" es1="1.0" es2="2.0" marshal="sync"
+               no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -4526,7 +4534,7 @@
         <glx rop="215" handcode="client"/>
     </function>
 
-    <function name="CompressedTexImage1D" marshal="sync">
+    <function name="CompressedTexImage1D" marshal="sync" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -4537,7 +4545,8 @@
         <glx rop="214" handcode="client"/>
     </function>
 
-    <function name="CompressedTexSubImage3D" es2="3.0" marshal="sync">
+    <function name="CompressedTexSubImage3D" es2="3.0" marshal="sync"
+              no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4552,7 +4561,8 @@
         <glx rop="219" handcode="client"/>
     </function>
 
-    <function name="CompressedTexSubImage2D" es1="1.0" es2="2.0" marshal="sync">
+    <function name="CompressedTexSubImage2D" es1="1.0" es2="2.0" marshal="sync"
+              no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4565,7 +4575,7 @@
         <glx rop="218" handcode="client"/>
     </function>
 
-    <function name="CompressedTexSubImage1D" marshal="sync">
+    <function name="CompressedTexSubImage1D" marshal="sync" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4699,7 +4709,7 @@
     </enum>
     <enum name="COMPARE_R_TO_TEXTURE"                     value="0x884E"/>
 
-    <function name="BlendFuncSeparate" es2="2.0">
+    <function name="BlendFuncSeparate" es2="2.0" no_error="true">
         <param name="sfactorRGB" type="GLenum"/>
         <param name="dfactorRGB" type="GLenum"/>
         <param name="sfactorAlpha" type="GLenum"/>
@@ -4725,6 +4735,7 @@
     </function>
 
     <function name="FogCoordPointer" deprecated="3.1" marshal="async"
+              no_error="true"
               marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -4862,6 +4873,7 @@
     </function>
 
     <function name="SecondaryColorPointer" deprecated="3.1" marshal="async"
+              no_error="true"
               marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
@@ -5027,7 +5039,7 @@
     <type name="intptr"   size="4"                  glx_name="CARD32"/>
     <type name="sizeiptr" size="4"  unsigned="true" glx_name="CARD32"/>
 
-    <function name="BindBuffer" es1="1.1" es2="2.0" marshal="custom">
+    <function name="BindBuffer" es1="1.1" es2="2.0" marshal="custom" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="buffer" type="GLuint"/>
         <glx ignore="true"/>
@@ -5041,7 +5053,8 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="BufferSubData" es1="1.1" es2="2.0" marshal="custom">
+    <function name="BufferSubData" es1="1.1" es2="2.0" marshal="custom"
+              no_error="true">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="size" type="GLsizeiptr" counter="true"/>
@@ -5055,7 +5068,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="GenBuffers" es1="1.1" es2="2.0">
+    <function name="GenBuffers" es1="1.1" es2="2.0" no_error="true">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="buffer" type="GLuint *" output="true" count="n"/>
         <glx ignore="true"/>
@@ -5089,14 +5102,14 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="MapBuffer">
+    <function name="MapBuffer" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="access" type="GLenum"/>
         <return type="GLvoid *"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="UnmapBuffer" es2="3.0">
+    <function name="UnmapBuffer" es2="3.0" no_error="true">
         <param name="target" type="GLenum"/>
         <return type="GLboolean"/>
         <glx ignore="true"/>
@@ -5260,7 +5273,7 @@
         <glx rop="233" large="true"/>
     </function>
 
-    <function name="StencilFuncSeparate" es2="2.0">
+    <function name="StencilFuncSeparate" es2="2.0" no_error="true">
         <param name="face" type="GLenum"/>
         <param name="func" type="GLenum"/>
         <param name="ref" type="GLint"/>
@@ -5268,7 +5281,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="StencilOpSeparate" es2="2.0">
+    <function name="StencilOpSeparate" es2="2.0" no_error="true">
         <param name="face" type="GLenum"/>
         <param name="sfail" type="GLenum"/>
         <param name="zfail" type="GLenum"/>
@@ -5276,19 +5289,19 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="StencilMaskSeparate" es2="2.0">
+    <function name="StencilMaskSeparate" es2="2.0" no_error="true">
         <param name="face" type="GLenum"/>
         <param name="mask" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="AttachShader" es2="2.0">
+    <function name="AttachShader" es2="2.0" no_error="true">
         <param name="program" type="GLuint"/>
         <param name="shader" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="BindAttribLocation" es2="2.0">
+    <function name="BindAttribLocation" es2="2.0" no_error="true">
         <param name="program" type="GLuint"/>
         <param name="index" type="GLuint"/>
         <param name="name" type="const GLchar *"/>
@@ -5305,7 +5318,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="CreateShader" es2="2.0">
+    <function name="CreateShader" es2="2.0" no_error="true">
         <param name="type" type="GLenum"/>
         <return type="GLuint"/>
         <glx ignore="true"/>
@@ -5327,13 +5340,13 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="DisableVertexAttribArray" es2="2.0">
+    <function name="DisableVertexAttribArray" es2="2.0" no_error="true">
         <param name="index" type="GLuint"/>
         <glx ignore="true"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="EnableVertexAttribArray" es2="2.0">
+    <function name="EnableVertexAttribArray" es2="2.0" no_error="true">
         <param name="index" type="GLuint"/>
         <glx ignore="true"/>
         <glx handcode="true"/>
@@ -5414,7 +5427,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="GetUniformLocation" es2="2.0">
+    <function name="GetUniformLocation" es2="2.0" no_error="true">
         <param name="program" type="GLuint"/>
         <param name="name" type="const GLchar *"/>
         <return type="GLint"/>
@@ -5492,7 +5505,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="UseProgram" es2="2.0">
+    <function name="UseProgram" es2="2.0" no_error="true">
         <param name="program" type="GLuint"/>
         <glx ignore="true"/>
     </function>
@@ -5817,6 +5830,7 @@
     </function>
 
     <function name="VertexAttribPointer" es2="2.0" marshal="async"
+              no_error="true"
               marshal_fail="_mesa_glthread_is_non_vbo_vertex_attrib_pointer(ctx)">
         <param name="index" type="GLuint"/>
         <param name="size" type="GLint"/>
@@ -7673,7 +7687,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="CreateShaderObjectARB">
+    <function name="CreateShaderObjectARB" no_error="true">
         <param name="shaderType" type="GLenum"/>
         <return type="GLhandleARB"/>
         <glx ignore="true"/>
@@ -7695,7 +7709,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="AttachObjectARB">
+    <function name="AttachObjectARB" no_error="true">
         <param name="containerObj" type="GLhandleARB"/>
         <param name="obj" type="GLhandleARB"/>
         <glx ignore="true"/>
@@ -8251,7 +8265,7 @@
     <enum name="BUFFER_STORAGE_FLAGS" value="0x8220" />
     <enum name="CLIENT_MAPPED_BUFFER_BARRIER_BIT" value="0x4000" />
 
-    <function name="BufferStorage">
+    <function name="BufferStorage" no_error="true">
         <param name="target" type="GLenum"/>
         <param name="size" type="GLsizeiptr"/>
         <param name="data" type="const GLvoid *"/>
@@ -8272,7 +8286,9 @@
     <enum name="QUERY_BUFFER_BARRIER_BIT"           value="0x00008000"/>
 </category>
 
-<!-- ARB extensions 149 - 152 -->
+<!-- ARB extensions 149 - 151 -->
+
+<xi:include href="ARB_bindless_texture.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
 <xi:include href="ARB_compute_variable_group_size.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
diff --git a/src/mapi/glapi/gen/gl_XML.py b/src/mapi/glapi/gen/gl_XML.py
index c688906..a5320e9 100644
--- a/src/mapi/glapi/gen/gl_XML.py
+++ b/src/mapi/glapi/gen/gl_XML.py
@@ -606,6 +606,7 @@
         self.exec_flavor = 'mesa'
         self.desktop = True
         self.deprecated = None
+        self.has_no_error_variant = False
 
         # self.entry_point_api_map[name][api] is a decimal value
         # indicating the earliest version of the given API in which
@@ -676,6 +677,11 @@
         if not is_attr_true(element, 'desktop', 'true'):
             self.desktop = False
 
+        if self.has_no_error_variant or is_attr_true(element, 'no_error'):
+            self.has_no_error_variant = True
+        else:
+            self.has_no_error_variant = False
+
         if alias:
             true_name = alias
         else:
diff --git a/src/mapi/glapi/gen/gl_genexec.py b/src/mapi/glapi/gen/gl_genexec.py
index 3a75419..57e155b 100644
--- a/src/mapi/glapi/gen/gl_genexec.py
+++ b/src/mapi/glapi/gen/gl_genexec.py
@@ -113,6 +113,7 @@
 #include "main/texstate.h"
 #include "main/texstorage.h"
 #include "main/barrier.h"
+#include "main/texturebindless.h"
 #include "main/textureview.h"
 #include "main/transformfeedback.h"
 #include "main/mtypes.h"
@@ -232,8 +233,16 @@
                 # This function is not implemented, or is dispatched
                 # dynamically.
                 continue
-            settings_by_condition[condition].append(
-                'SET_{0}(exec, {1}{0});'.format(f.name, prefix, f.name))
+            if f.has_no_error_variant:
+                no_error_condition = '_mesa_is_no_error_enabled(ctx) && ({0})'.format(condition)
+                error_condition = '!_mesa_is_no_error_enabled(ctx) && ({0})'.format(condition)
+                settings_by_condition[no_error_condition].append(
+                    'SET_{0}(exec, {1}{0}_no_error);'.format(f.name, prefix, f.name))
+                settings_by_condition[error_condition].append(
+                    'SET_{0}(exec, {1}{0});'.format(f.name, prefix, f.name))
+            else:
+                settings_by_condition[condition].append(
+                    'SET_{0}(exec, {1}{0});'.format(f.name, prefix, f.name))
         # Print out an if statement for each unique condition, with
         # the SET_* calls nested inside it.
         for condition in sorted(settings_by_condition.keys()):
diff --git a/src/mapi/glapi/gen/gl_marshal.py b/src/mapi/glapi/gen/gl_marshal.py
index 51475e1..efa4d9e 100644
--- a/src/mapi/glapi/gen/gl_marshal.py
+++ b/src/mapi/glapi/gen/gl_marshal.py
@@ -66,8 +66,6 @@
 
     def printRealHeader(self):
         print header
-        print '#ifdef HAVE_PTHREAD'
-        print
         print 'static inline int safe_mul(int a, int b)'
         print '{'
         print '    if (a < 0 || b < 0) return -1;'
@@ -78,8 +76,7 @@
         print
 
     def printRealFooter(self):
-        print
-        print '#endif'
+        pass
 
     def print_sync_call(self, func):
         call = 'CALL_{0}(ctx->CurrentServerDispatch, ({1}))'.format(
@@ -90,7 +87,6 @@
             out('return {0};'.format(call))
 
     def print_sync_dispatch(self, func):
-        out('_mesa_glthread_finish(ctx);')
         out('debug_print_sync_fallback("{0}");'.format(func.name))
         self.print_sync_call(func)
 
@@ -177,11 +173,19 @@
         with indent():
             for p in func.fixed_params:
                 if p.count:
-                    out('const {0} * {1} = cmd->{1};'.format(
-                            p.get_base_type_string(), p.name))
+                    p_decl = '{0} * {1} = cmd->{1};'.format(
+                            p.get_base_type_string(), p.name)
                 else:
-                    out('const {0} {1} = cmd->{1};'.format(
-                            p.type_string(), p.name))
+                    p_decl = '{0} {1} = cmd->{1};'.format(
+                            p.type_string(), p.name)
+
+                if not p_decl.startswith('const '):
+                    # Declare all local function variables as const, even if
+                    # the original parameter is not const.
+                    p_decl = 'const ' + p_decl
+
+                out(p_decl)
+
             if func.variable_params:
                 for p in func.variable_params:
                     out('const {0} * {1};'.format(
@@ -259,6 +263,7 @@
         if need_fallback_sync:
             out('fallback_to_sync:')
         with indent():
+            out('_mesa_glthread_finish(ctx);')
             self.print_sync_dispatch(func)
 
         out('}')
diff --git a/src/mapi/glapi/gen/gl_table.py b/src/mapi/glapi/gen/gl_table.py
index 43c9135..80a44f4 100644
--- a/src/mapi/glapi/gen/gl_table.py
+++ b/src/mapi/glapi/gen/gl_table.py
@@ -40,20 +40,14 @@
         self.license = license.bsd_license_template % ( \
 """Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
 (C) Copyright IBM Corporation 2004""", "BRIAN PAUL, IBM")
-        self.ifdef_emitted = False
         return
 
     def printBody(self, api):
         for f in api.functionIterateByOffset():
-            if not f.is_abi() and not self.ifdef_emitted:
-                print '#if !defined HAVE_SHARED_GLAPI'
-                self.ifdef_emitted = True
             arg_string = f.get_parameter_string()
             print '   %s (GLAPIENTRYP %s)(%s); /* %d */' % (
                 f.return_type, f.name, arg_string, f.offset)
 
-        print '#endif /* !defined HAVE_SHARED_GLAPI */'
-
     def printRealHeader(self):
         print '#ifndef GLAPIENTRYP'
         print '# ifndef GLAPIENTRY'
diff --git a/src/mapi/glapi/glapi.c b/src/mapi/glapi/glapi.c
index 194b9ee..55258a4 100644
--- a/src/mapi/glapi/glapi.c
+++ b/src/mapi/glapi/glapi.c
@@ -60,5 +60,5 @@
 void
 _glapi_set_dispatch(struct _glapi_table *dispatch)
 {
-   u_current_set_table((const struct mapi_table *) dispatch);
+   u_current_set_table((const struct _glapi_table *) dispatch);
 }
diff --git a/src/mapi/glapi/registry/gl.xml b/src/mapi/glapi/registry/gl.xml
index 9e2ff98..353d0ef 100644
--- a/src/mapi/glapi/registry/gl.xml
+++ b/src/mapi/glapi/registry/gl.xml
@@ -1,36 +1,27 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <registry>
     <comment>
-Copyright (c) 2013-2016 The Khronos Group Inc.
+Copyright (c) 2013-2017 The Khronos Group Inc.
 
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of this software and/or associated documentation files (the
-"Materials"), to deal in the Materials without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Materials, and to
-permit persons to whom the Materials are furnished to do so, subject to
-the following conditions:
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Materials.
+    http://www.apache.org/licenses/LICENSE-2.0
 
-THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
 
 ------------------------------------------------------------------------
 
-This file, gl.xml, is the OpenGL and OpenGL API Registry. The older
-".spec" file format has been retired and will no longer be updated with
-new extensions and API versions. The canonical version of the registry,
-together with documentation, schema, and Python generator scripts used
-to generate C header files for OpenGL and OpenGL ES, can always be found
-in the Khronos Registry at
-        http://www.opengl.org/registry/
+This file, gl.xml, is the OpenGL and OpenGL API Registry. The canonical
+version of the registry, together with documentation, schema, and Python
+generator scripts used to generate C header files for OpenGL and OpenGL ES,
+can always be found in the Khronos Registry at
+        https://github.com/KhronosGroup/OpenGL-Registry
     </comment>
 
     <!-- SECTION: GL type definitions. -->
@@ -151,6 +142,7 @@
         <type>typedef void (<apientry/> *<name>GLDEBUGPROCAMD</name>)(GLuint id,GLenum category,GLenum severity,GLsizei length,const GLchar *message,void *userParam);</type>
         <type>typedef unsigned short <name>GLhalfNV</name>;</type>
         <type requires="GLintptr">typedef GLintptr <name>GLvdpauSurfaceNV</name>;</type>
+        <type>typedef void (<apientry/> *<name>GLVULKANPROCNV</name>)(void);</type>
     </types>
 
     <!-- SECTION: GL parameter class type definitions. -->
@@ -250,6 +242,41 @@
             <enum name="GL_TRUE"/>
         </group>
 
+        <group name="BufferTargetARB">
+          <enum name="GL_ARRAY_BUFFER"/>
+          <enum name="GL_ATOMIC_COUNTER_BUFFER" />
+          <enum name="GL_COPY_READ_BUFFER" />
+          <enum name="GL_COPY_WRITE_BUFFER" />
+          <enum name="GL_DISPATCH_INDIRECT_BUFFER" />
+          <enum name="GL_DRAW_INDIRECT_BUFFER" />
+          <enum name="GL_ELEMENT_ARRAY_BUFFER" />
+          <enum name="GL_PIXEL_PACK_BUFFER" />
+          <enum name="GL_PIXEL_UNPACK_BUFFER" />
+          <enum name="GL_QUERY_BUFFER" />
+          <enum name="GL_SHADER_STORAGE_BUFFER" />
+          <enum name="GL_TEXTURE_BUFFER" />
+          <enum name="GL_TRANSFORM_FEEDBACK_BUFFER" />
+          <enum name="GL_UNIFORM_BUFFER" />
+        </group>
+
+        <group name="BufferUsageARB">
+          <enum name="GL_STREAM_DRAW"/>
+          <enum name="GL_STREAM_READ"/>
+          <enum name="GL_STREAM_COPY"/>
+          <enum name="GL_STATIC_DRAW"/>
+          <enum name="GL_STATIC_READ"/>
+          <enum name="GL_STATIC_COPY"/>
+          <enum name="GL_DYNAMIC_DRAW"/>
+          <enum name="GL_DYNAMIC_READ"/>
+          <enum name="GL_DYNAMIC_COPY"/>
+        </group>
+
+        <group name="BufferAccessARB">
+          <enum name="GL_READ_ONLY"/>
+          <enum name="GL_WRITE_ONLY"/>
+          <enum name="GL_READ_WRITE"/>
+        </group>
+
         <group name="ClearBufferMask">
             <enum name="GL_ACCUM_BUFFER_BIT"/>
             <enum name="GL_COLOR_BUFFER_BIT"/>
@@ -401,6 +428,12 @@
             <enum name="GL_RIGHT"/>
         </group>
 
+        <group name="DrawElementsType">
+            <enum name="GL_UNSIGNED_BYTE"/>
+            <enum name="GL_UNSIGNED_SHORT"/>
+            <enum name="GL_UNSIGNED_INT"/>
+        </group>
+
         <group name="EnableCap">
             <enum name="GL_ALPHA_TEST"/>
             <enum name="GL_ASYNC_DRAW_PIXELS_SGIX"/>
@@ -526,6 +559,17 @@
             <enum name="GL_TEXTURE_TOO_LARGE_EXT"/>
         </group>
 
+        <group name="ExternalHandleType">
+            <enum name="GL_HANDLE_TYPE_OPAQUE_FD_EXT"/>
+            <enum name="GL_HANDLE_TYPE_OPAQUE_WIN32_EXT"/>
+            <enum name="GL_HANDLE_TYPE_OPAQUE_WIN32_KMT_EXT"/>
+            <enum name="GL_HANDLE_TYPE_D3D12_TILEPOOL_EXT"/>
+            <enum name="GL_HANDLE_TYPE_D3D12_RESOURCE_EXT"/>
+            <enum name="GL_HANDLE_TYPE_D3D11_IMAGE_EXT"/>
+            <enum name="GL_HANDLE_TYPE_D3D11_IMAGE_KMT_EXT"/>
+            <enum name="GL_HANDLE_TYPE_D3D12_FENCE_EXT"/>
+        </group>
+
         <group name="FeedbackType">
             <enum name="GL_2D"/>
             <enum name="GL_3D"/>
@@ -591,6 +635,10 @@
             <enum name="GL_FRAGMENT_LIGHT_MODEL_TWO_SIDE_SGIX"/>
         </group>
 
+        <group name="FramebufferFetchNoncoherent">
+            <enum name="GL_FRAMEBUFFER_FETCH_NONCOHERENT_QCOM"/>
+        </group>
+
         <group name="FrontFaceDirection">
             <enum name="GL_CCW"/>
             <enum name="GL_CW"/>
@@ -737,11 +785,15 @@
             <enum name="GL_DEPTH_TEST"/>
             <enum name="GL_DEPTH_WRITEMASK"/>
             <enum name="GL_DETAIL_TEXTURE_2D_BINDING_SGIS"/>
+            <enum name="GL_DEVICE_LUID_EXT"/>
+            <enum name="GL_DEVICE_NODE_MASK_EXT"/>
+            <enum name="GL_DEVICE_UUID_EXT"/>
             <enum name="GL_DISTANCE_ATTENUATION_SGIS"/>
             <enum name="GL_DITHER"/>
             <enum name="GL_DOUBLEBUFFER"/>
             <enum name="GL_DRAW_BUFFER"/>
             <enum name="GL_DRAW_BUFFER_EXT"/>
+            <enum name="GL_DRIVER_UUID_EXT"/>
             <enum name="GL_EDGE_FLAG"/>
             <enum name="GL_EDGE_FLAG_ARRAY"/>
             <enum name="GL_EDGE_FLAG_ARRAY_COUNT_EXT"/>
@@ -881,6 +933,7 @@
             <enum name="GL_NORMAL_ARRAY_COUNT_EXT"/>
             <enum name="GL_NORMAL_ARRAY_STRIDE"/>
             <enum name="GL_NORMAL_ARRAY_TYPE"/>
+            <enum name="GL_NUM_DEVICE_UUIDS_EXT"/>
             <enum name="GL_PACK_ALIGNMENT"/>
             <enum name="GL_PACK_CMYK_HINT_EXT"/>
             <enum name="GL_PACK_IMAGE_DEPTH_SGIS"/>
@@ -1415,6 +1468,11 @@
             <enum name="GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT_EXT"/>
         </group>
 
+        <group name="MemoryObjectParameterName">
+            <enum name="GL_DEDICATED_MEMORY_OBJECT_EXT"/>
+            <enum name="GL_PROTECTED_MEMORY_OBJECT_EXT"/>
+        </group>
+
         <group name="MeshMode1">
             <enum name="GL_LINE"/>
             <enum name="GL_POINT"/>
@@ -1795,6 +1853,10 @@
             <enum name="GL_4PASS_3_SGIS"/>
         </group>
 
+        <group name="SemaphoreParameterName">
+            <enum name="GL_D3D12_FENCE_VALUE_EXT"/>
+        </group>
+
         <group name="SeparableTargetEXT">
             <enum name="GL_SEPARABLE_2D"/>
             <enum name="GL_SEPARABLE_2D_EXT"/>
@@ -1805,6 +1867,12 @@
             <enum name="GL_SMOOTH"/>
         </group>
 
+        <group name="StencilFaceDirection">
+             <enum name="GL_FRONT"/>
+             <enum name="GL_BACK"/>
+             <enum name="GL_FRONT_AND_BACK"/>
+        </group>
+
         <group name="StencilFunction">
             <enum name="GL_ALWAYS"/>
             <enum name="GL_EQUAL"/>
@@ -1830,6 +1898,7 @@
             <enum name="GL_RENDERER"/>
             <enum name="GL_VENDOR"/>
             <enum name="GL_VERSION"/>
+            <enum name="GL_SHADING_LANGUAGE_VERSION"/>
         </group>
 
         <group name="TexCoordPointerType">
@@ -1888,6 +1957,16 @@
             <enum name="GL_TEXTURE_GEN_MODE"/>
         </group>
 
+        <group name="TextureLayout">
+            <enum name="GL_LAYOUT_GENERAL_EXT"/>
+            <enum name="GL_LAYOUT_COLOR_ATTACHMENT_EXT"/>
+            <enum name="GL_LAYOUT_DEPTH_STENCIL_ATTACHMENT_EXT"/>
+            <enum name="GL_LAYOUT_DEPTH_STENCIL_READ_ONLY_EXT"/>
+            <enum name="GL_LAYOUT_SHADER_READ_ONLY_EXT"/>
+            <enum name="GL_LAYOUT_TRANSFER_SRC_EXT"/>
+            <enum name="GL_LAYOUT_TRANSFER_DST_EXT"/>
+        </group>
+
         <group name="TextureMagFilter">
             <enum name="GL_FILTER4_SGIS"/>
             <enum name="GL_LINEAR"/>
@@ -1954,31 +2033,67 @@
             <enum name="GL_TEXTURE_WRAP_R_OES"/>
             <enum name="GL_TEXTURE_WRAP_S"/>
             <enum name="GL_TEXTURE_WRAP_T"/>
+            <enum name="GL_TEXTURE_BASE_LEVEL"/>
+            <enum name="GL_TEXTURE_COMPARE_MODE"/>
+            <enum name="GL_TEXTURE_COMPARE_FUNC"/>
+            <enum name="GL_TEXTURE_LOD_BIAS"/>
+            <enum name="GL_TEXTURE_MIN_LOD"/>
+            <enum name="GL_TEXTURE_MAX_LOD"/>
+            <enum name="GL_TEXTURE_MAX_LEVEL"/>
+            <enum name="GL_TEXTURE_SWIZZLE_R"/>
+            <enum name="GL_TEXTURE_SWIZZLE_G"/>
+            <enum name="GL_TEXTURE_SWIZZLE_B"/>
+            <enum name="GL_TEXTURE_SWIZZLE_A"/>
+            <enum name="GL_TEXTURE_SWIZZLE_RGBA"/>
+            <enum name="GL_TEXTURE_TILING_EXT"/>
+            <enum name="GL_DEPTH_STENCIL_TEXTURE_MODE"/>
         </group>
 
         <group name="TextureTarget">
             <enum name="GL_DETAIL_TEXTURE_2D_SGIS"/>
             <enum name="GL_PROXY_TEXTURE_1D"/>
+            <enum name="GL_PROXY_TEXTURE_1D_ARRAY"/>
+            <enum name="GL_PROXY_TEXTURE_1D_ARRAY_EXT"/>
             <enum name="GL_PROXY_TEXTURE_1D_EXT"/>
             <enum name="GL_PROXY_TEXTURE_2D"/>
+            <enum name="GL_PROXY_TEXTURE_2D_ARRAY"/>
+            <enum name="GL_PROXY_TEXTURE_2D_ARRAY_EXT"/>
             <enum name="GL_PROXY_TEXTURE_2D_EXT"/>
+            <enum name="GL_PROXY_TEXTURE_2D_MULTISAMPLE"/>
+            <enum name="GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY"/>
             <enum name="GL_PROXY_TEXTURE_3D"/>
             <enum name="GL_PROXY_TEXTURE_3D_EXT"/>
             <enum name="GL_PROXY_TEXTURE_4D_SGIS"/>
+            <enum name="GL_PROXY_TEXTURE_CUBE_MAP"/>
+            <enum name="GL_PROXY_TEXTURE_CUBE_MAP_ARB"/>
+            <enum name="GL_PROXY_TEXTURE_CUBE_MAP_EXT"/>
+            <enum name="GL_PROXY_TEXTURE_CUBE_MAP_ARRAY"/>
+            <enum name="GL_PROXY_TEXTURE_CUBE_MAP_ARRAY_ARB"/>
+            <enum name="GL_PROXY_TEXTURE_RECTANGLE"/>
+            <enum name="GL_PROXY_TEXTURE_RECTANGLE_ARB"/>
+            <enum name="GL_PROXY_TEXTURE_RECTANGLE_NV"/>
             <enum name="GL_TEXTURE_1D"/>
             <enum name="GL_TEXTURE_2D"/>
             <enum name="GL_TEXTURE_3D"/>
             <enum name="GL_TEXTURE_3D_EXT"/>
             <enum name="GL_TEXTURE_3D_OES"/>
             <enum name="GL_TEXTURE_4D_SGIS"/>
-            <enum name="GL_TEXTURE_BASE_LEVEL"/>
-            <enum name="GL_TEXTURE_BASE_LEVEL_SGIS"/>
-            <enum name="GL_TEXTURE_MAX_LEVEL"/>
-            <enum name="GL_TEXTURE_MAX_LEVEL_SGIS"/>
-            <enum name="GL_TEXTURE_MAX_LOD"/>
-            <enum name="GL_TEXTURE_MAX_LOD_SGIS"/>
-            <enum name="GL_TEXTURE_MIN_LOD"/>
-            <enum name="GL_TEXTURE_MIN_LOD_SGIS"/>
+            <enum name="GL_TEXTURE_RECTANGLE"/>
+            <enum name="GL_TEXTURE_CUBE_MAP"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_POSITIVE_X"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_NEGATIVE_X"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_POSITIVE_Y"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_NEGATIVE_Y"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_POSITIVE_Z"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_NEGATIVE_Z"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_ARRAY"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_ARRAY_ARB"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_ARRAY_EXT"/>
+            <enum name="GL_TEXTURE_CUBE_MAP_ARRAY_OES"/>
+            <enum name="GL_TEXTURE_1D_ARRAY"/>
+            <enum name="GL_TEXTURE_2D_ARRAY"/>
+            <enum name="GL_TEXTURE_2D_MULTISAMPLE"/>
+            <enum name="GL_TEXTURE_2D_MULTISAMPLE_ARRAY"/>
         </group>
 
         <group name="TextureWrapMode">
@@ -2097,7 +2212,8 @@
         <enum value="0x0200" name="GL_CLIENT_STORAGE_BIT"/>
         <enum value="0x0200" name="GL_CLIENT_STORAGE_BIT_EXT"/>
         <enum value="0x0400" name="GL_SPARSE_STORAGE_BIT_ARB"/>
-            <!-- Bits 0x1000 and 0x0800 reserved for Joshua Schnarr, jschnarr@nvidia.com -->
+        <enum value="0x0800" name="GL_LGPU_SEPARATE_STORAGE_BIT_NVX"/>
+        <enum value="0x0800" name="GL_PER_GPU_STORAGE_BIT_NV"/>
     </enums>
 
     <enums namespace="GL" group="MemoryBarrierMask" type="bitmask">
@@ -2294,11 +2410,17 @@
         <enum value="0x80000000" name="GL_MULTISAMPLE_BUFFER_BIT7_QCOM"/>
     </enums>
 
+    <enums namespace="GL" group="FoveationConfigBitQCOM" type="bitmask">
+        <enum value="0x00000001" name="GL_FOVEATION_ENABLE_BIT_QCOM"/>
+        <enum value="0x00000002" name="GL_FOVEATION_SCALED_BIN_METHOD_BIT_QCOM"/>
+    </enums>
+
     <enums namespace="GL" group="FfdMaskSGIX" type="bitmask">
         <enum value="0x00000001" name="GL_TEXTURE_DEFORMATION_BIT_SGIX"/>
         <enum value="0x00000002" name="GL_GEOMETRY_DEFORMATION_BIT_SGIX"/>
     </enums>
 
+
     <!-- Non-bitmask enums with their own namespace. Generally small numbers
          used for indexed access. -->
 
@@ -2412,11 +2534,14 @@
         <enum value="1" name="GL_TRUE"/>
         <enum value="1" name="GL_ONE"/>
         <enum value="0xFFFFFFFF" name="GL_INVALID_INDEX" type="u" comment="Tagged as uint"/>
+        <enum value="0xFFFFFFFF" name="GL_ALL_PIXELS_AMD"/>
         <enum value="0xFFFFFFFFFFFFFFFF" name="GL_TIMEOUT_IGNORED" type="ull" comment="Tagged as uint64"/>
         <enum value="0xFFFFFFFFFFFFFFFF" name="GL_TIMEOUT_IGNORED_APPLE" type="ull" comment="Tagged as uint64"/>
         <enum value="1" name="GL_VERSION_ES_CL_1_0" comment="Not an API enum. API definition macro for ES 1.0/1.1 headers"/>
         <enum value="1" name="GL_VERSION_ES_CM_1_1" comment="Not an API enum. API definition macro for ES 1.0/1.1 headers"/>
         <enum value="1" name="GL_VERSION_ES_CL_1_1" comment="Not an API enum. API definition macro for ES 1.0/1.1 headers"/>
+        <enum value="16" name="GL_UUID_SIZE_EXT"/>
+        <enum value="8" name="GL_LUID_SIZE_EXT"/>
     </enums>
 
     <enums namespace="GL" start="0x0000" end="0x7FFF" vendor="ARB" comment="Mostly OpenGL 1.0/1.1 enum assignments. Unused ranges should generally remain unused.">
@@ -7442,7 +7567,9 @@
         <enum value="0x8F65" name="GL_FETCH_PER_SAMPLE_ARM"/>
         <enum value="0x8F66" name="GL_FRAGMENT_SHADER_FRAMEBUFFER_FETCH_MRT_ARM"/>
         <enum value="0x8F67" name="GL_MAX_SHADER_PIXEL_LOCAL_STORAGE_SIZE_EXT"/>
-            <unused start="0x8F68" end="0x8F6F" vendor="ARM"/>
+            <unused start="0x8F68" vendor="ARM"/>
+        <enum value="0x8F69" name="GL_TEXTURE_ASTC_DECODE_PRECISION_EXT"/>
+            <unused start="0x8F6A" end="0x8F6F" vendor="ARM"/>
     </enums>
 
     <enums namespace="GL" start="0x8F70" end="0x8F7F" vendor="HI" comment="For Mark Callow, Khronos bug 4055. Shared with EGL.">
@@ -8030,7 +8157,9 @@
         <enum value="0x91A9" name="GL_SPARSE_TEXTURE_FULL_ARRAY_CUBE_MIPMAPS_EXT"/>
         <enum value="0x91AA" name="GL_NUM_SPARSE_LEVELS_ARB"/>
         <enum value="0x91AA" name="GL_NUM_SPARSE_LEVELS_EXT"/>
-            <unused start="0x91AB" end="0x91AF" vendor="AMD"/>
+            <unused start="0x91AB" end="0x91AD" vendor="AMD"/>
+        <enum value="0x91AE" name="GL_PIXELS_PER_SAMPLE_PATTERN_X_AMD"/>
+        <enum value="0x91AF" name="GL_PIXELS_PER_SAMPLE_PATTERN_Y_AMD"/>
         <enum value="0x91B0" name="GL_MAX_SHADER_COMPILER_THREADS_ARB"/>
         <enum value="0x91B1" name="GL_COMPLETION_STATUS_ARB"/>
             <unused start="0x91B2" end="0x91B8" vendor="AMD"/>
@@ -8042,7 +8171,17 @@
         <enum value="0x91BE" name="GL_MAX_COMPUTE_WORK_GROUP_COUNT"/>
         <enum value="0x91BF" name="GL_MAX_COMPUTE_WORK_GROUP_SIZE"/>
         <enum value="0x91BF" name="GL_MAX_COMPUTE_FIXED_GROUP_SIZE_ARB" alias="GL_MAX_COMPUTE_WORK_GROUP_SIZE"/>
-            <unused start="0x91C0" end="0x923F" vendor="AMD"/>
+            <unused start="0x91C0" end="0x91C4" vendor="AMD"/>
+        <enum value="0x91C5" name="GL_FLOAT16_MAT2_AMD"/>
+        <enum value="0x91C6" name="GL_FLOAT16_MAT3_AMD"/>
+        <enum value="0x91C7" name="GL_FLOAT16_MAT4_AMD"/>
+        <enum value="0x91C8" name="GL_FLOAT16_MAT2x3_AMD"/>
+        <enum value="0x91C9" name="GL_FLOAT16_MAT2x4_AMD"/>
+        <enum value="0x91CA" name="GL_FLOAT16_MAT3x2_AMD"/>
+        <enum value="0x91CB" name="GL_FLOAT16_MAT3x4_AMD"/>
+        <enum value="0x91CC" name="GL_FLOAT16_MAT4x2_AMD"/>
+        <enum value="0x91CD" name="GL_FLOAT16_MAT4x3_AMD"/>
+            <unused start="0x91CE" end="0x923F" vendor="AMD"/>
     </enums>
 
     <enums namespace="GL" start="0x9240" end="0x924F" vendor="WEBGL" comment="Khronos bug 6473,6884">
@@ -8174,14 +8313,16 @@
         <enum value="0x92B2" name="GL_PLUS_CLAMPED_ALPHA_NV"/>
         <enum value="0x92B3" name="GL_MINUS_CLAMPED_NV"/>
         <enum value="0x92B4" name="GL_INVERT_OVG_NV"/>
-            <unused start="0x92B5" end="0x92BA" vendor="NV"/>
+            <unused start="0x92B5" end="0x92B9" vendor="NV"/>
+        <enum value="0x92BA" name="GL_MAX_LGPU_GPUS_NVX"/>
+        <enum value="0x92BA" name="GL_MULTICAST_GPUS_NV"/>
         <enum value="0x92BB" name="GL_PURGED_CONTEXT_RESET_NV"/>
             <unused start="0x92BC" end="0x92BD" vendor="NV"/>
         <enum value="0x92BE" name="GL_PRIMITIVE_BOUNDING_BOX_ARB"/>
         <enum value="0x92BE" name="GL_PRIMITIVE_BOUNDING_BOX"/>
         <enum value="0x92BE" name="GL_PRIMITIVE_BOUNDING_BOX_EXT"/>
         <enum value="0x92BE" name="GL_PRIMITIVE_BOUNDING_BOX_OES"/>
-            <unused start="0x92BF" vendor="NV"/>
+        <enum value="0x92BF" name="GL_ALPHA_TO_COVERAGE_DITHER_MODE_NV"/>
         <enum value="0x92C0" name="GL_ATOMIC_COUNTER_BUFFER"/>
         <enum value="0x92C1" name="GL_ATOMIC_COUNTER_BUFFER_BINDING"/>
         <enum value="0x92C2" name="GL_ATOMIC_COUNTER_BUFFER_START"/>
@@ -8339,7 +8480,9 @@
         <enum value="0x934A" name="GL_LOCATION_COMPONENT"/>
         <enum value="0x934B" name="GL_TRANSFORM_FEEDBACK_BUFFER_INDEX"/>
         <enum value="0x934C" name="GL_TRANSFORM_FEEDBACK_BUFFER_STRIDE"/>
-            <unused start="0x934D" end="0x934F" vendor="NV"/>
+        <enum value="0x934D" name="GL_ALPHA_TO_COVERAGE_DITHER_DEFAULT_NV"/>
+        <enum value="0x934E" name="GL_ALPHA_TO_COVERAGE_DITHER_ENABLE_NV"/>
+        <enum value="0x934F" name="GL_ALPHA_TO_COVERAGE_DITHER_DISABLE_NV"/>
         <enum value="0x9350" name="GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV"/>
         <enum value="0x9351" name="GL_VIEWPORT_SWIZZLE_NEGATIVE_X_NV"/>
         <enum value="0x9352" name="GL_VIEWPORT_SWIZZLE_POSITIVE_Y_NV"/>
@@ -8517,18 +8660,56 @@
     </enums>
 
     <enums namespace="GL" start="0x9530" end="0x962F" vendor="NV" comment="Khronos bug 12977">
-            <unused start="0x9530" end="0x954C" vendor="NV"/>
+            <unused start="0x9530" end="0x9547" vendor="NV"/>
+        <enum value="0x9548" name="GL_PER_GPU_STORAGE_NV"/>
+        <enum value="0x9549" name="GL_MULTICAST_PROGRAMMABLE_SAMPLE_LOCATION_NV"/>
+            <unused start="0x954A" end="0x954C" vendor="NV"/>
         <enum value="0x954D" name="GL_CONSERVATIVE_RASTER_MODE_NV"/>
         <enum value="0x954E" name="GL_CONSERVATIVE_RASTER_MODE_POST_SNAP_NV"/>
         <enum value="0x954F" name="GL_CONSERVATIVE_RASTER_MODE_PRE_SNAP_TRIANGLES_NV"/>
-            <unused start="0x9550" end="0x962F" vendor="NV"/>
+            <unused start="0x9550" vendor="NV"/>
+        <enum value="0x9551" name="GL_SHADER_BINARY_FORMAT_SPIR_V_ARB"/>
+        <enum value="0x9552" name="GL_SPIR_V_BINARY_ARB"/>
+            <unused start="0x9553" end="0x9557" vendor="NV"/>
+        <enum value="0x9558" name="GL_RENDER_GPU_MASK_NV"/>
+            <unused start="0x9559" end="0x957F" vendor="NV"/>
+        <enum value="0x9580" name="GL_TEXTURE_TILING_EXT"/>
+        <enum value="0x9581" name="GL_DEDICATED_MEMORY_OBJECT_EXT"/>
+        <enum value="0x9582" name="GL_NUM_TILING_TYPES_EXT"/>
+        <enum value="0x9583" name="GL_TILING_TYPES_EXT"/>
+        <enum value="0x9584" name="GL_OPTIMAL_TILING_EXT"/>
+        <enum value="0x9585" name="GL_LINEAR_TILING_EXT"/>
+        <enum value="0x9586" name="GL_HANDLE_TYPE_OPAQUE_FD_EXT"/>
+        <enum value="0x9587" name="GL_HANDLE_TYPE_OPAQUE_WIN32_EXT"/>
+        <enum value="0x9588" name="GL_HANDLE_TYPE_OPAQUE_WIN32_KMT_EXT"/>
+        <enum value="0x9589" name="GL_HANDLE_TYPE_D3D12_TILEPOOL_EXT"/>
+        <enum value="0x958A" name="GL_HANDLE_TYPE_D3D12_RESOURCE_EXT"/>
+        <enum value="0x958B" name="GL_HANDLE_TYPE_D3D11_IMAGE_EXT"/>
+        <enum value="0x958C" name="GL_HANDLE_TYPE_D3D11_IMAGE_KMT_EXT"/>
+        <enum value="0x958D" name="GL_LAYOUT_GENERAL_EXT"/>
+        <enum value="0x958E" name="GL_LAYOUT_COLOR_ATTACHMENT_EXT"/>
+        <enum value="0x958F" name="GL_LAYOUT_DEPTH_STENCIL_ATTACHMENT_EXT"/>
+        <enum value="0x9590" name="GL_LAYOUT_DEPTH_STENCIL_READ_ONLY_EXT"/>
+        <enum value="0x9591" name="GL_LAYOUT_SHADER_READ_ONLY_EXT"/>
+        <enum value="0x9592" name="GL_LAYOUT_TRANSFER_SRC_EXT"/>
+        <enum value="0x9593" name="GL_LAYOUT_TRANSFER_DST_EXT"/>
+        <enum value="0x9594" name="GL_HANDLE_TYPE_D3D12_FENCE_EXT"/>
+        <enum value="0x9595" name="GL_D3D12_FENCE_VALUE_EXT"/>
+        <enum value="0x9596" name="GL_NUM_DEVICE_UUIDS_EXT"/>
+        <enum value="0x9597" name="GL_DEVICE_UUID_EXT"/>
+        <enum value="0x9598" name="GL_DRIVER_UUID_EXT"/>
+        <enum value="0x9599" name="GL_DEVICE_LUID_EXT"/>
+        <enum value="0x959A" name="GL_DEVICE_NODE_MASK_EXT"/>
+        <enum value="0x959B" name="GL_PROTECTED_MEMORY_OBJECT_EXT"/>
+            <unused start="0x959C" end="0x962F" vendor="NV"/>
     </enums>
 
     <enums namespace="GL" start="0x9630" end="0x963F" vendor="Oculus" comment="Email from Cass Everitt">
         <enum value="0x9630" name="GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_NUM_VIEWS_OVR"/>
         <enum value="0x9631" name="GL_MAX_VIEWS_OVR"/>
         <enum value="0x9632" name="GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_BASE_VIEW_INDEX_OVR"/>
-            <unused start="0x9633" end="0x963F" vendor="Oculus"/>
+        <enum value="0x9633" name="GL_FRAMEBUFFER_INCOMPLETE_VIEW_TARGETS_OVR"/>
+            <unused start="0x9634" end="0x963F" vendor="Oculus"/>
     </enums>
 
     <enums namespace="GL" start="0x9640" end="0x964F" vendor="Mediatek" comment="Khronos bug 14294">
@@ -8549,7 +8730,9 @@
     </enums>
 
     <enums namespace="GL" start="0x96A0" end="0x96AF" vendor="Qualcomm" comment="contact Maurice Ribble">
-            <unused start="0x96A0" end="0x96AF" vendor="Qualcomm"/>
+            <unused start="0x96A0" end="0x96A1" vendor="Qualcomm"/>
+        <enum value="0x96A2" name="GL_FRAMEBUFFER_FETCH_NONCOHERENT_QCOM"/>
+            <unused start="0x96A3" end="0x96AF" vendor="Qualcomm"/>
     </enums>
 
 <!-- Enums reservable for future use. To reserve a new range, allocate one
@@ -8732,6 +8915,10 @@
             <param group="ClampedFixed"><ptype>GLfixed</ptype> <name>ref</name></param>
         </command>
         <command>
+            <proto>void <name>glAlphaToCoverageDitherControlNV</name></proto>
+            <param><ptype>GLenum</ptype> <name>mode</name></param>
+        </command>
+        <command>
             <proto>void <name>glApplyFramebufferAttachmentCMAAINTEL</name></proto>
         </command>
         <command>
@@ -8739,6 +8926,12 @@
             <param group="LightTextureModeEXT"><ptype>GLenum</ptype> <name>mode</name></param>
         </command>
         <command>
+            <proto><ptype>GLboolean</ptype> <name>glAcquireKeyedMutexWin32EXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>key</name></param>
+            <param><ptype>GLuint</ptype> <name>timeout</name></param>
+        </command>
+        <command>
             <proto group="Boolean"><ptype>GLboolean</ptype> <name>glAreProgramsResidentNV</name></proto>
             <param><ptype>GLsizei</ptype> <name>n</name></param>
             <param len="n">const <ptype>GLuint</ptype> *<name>programs</name></param>
@@ -9668,6 +9861,13 @@
             <alias name="glBufferStorage"/>
         </command>
         <command>
+            <proto>void <name>glBufferStorageMemEXT</name></proto>
+            <param group="BufferTargetARB"><ptype>GLenum</ptype> <name>target</name></param>
+            <param group="BufferSize"><ptype>GLsizeiptr</ptype> <name>size</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
             <proto>void <name>glBufferSubData</name></proto>
             <param group="BufferTargetARB"><ptype>GLenum</ptype> <name>target</name></param>
             <param group="BufferOffset"><ptype>GLintptr</ptype> <name>offset</name></param>
@@ -9955,6 +10155,15 @@
             <param len="COMPSIZE(format,type)">const void *<name>data</name></param>
         </command>
         <command>
+            <proto>void <name>glClearTexImageEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>texture</name></param>
+            <param><ptype>GLint</ptype> <name>level</name></param>
+            <param><ptype>GLenum</ptype> <name>format</name></param>
+            <param><ptype>GLenum</ptype> <name>type</name></param>
+            <param len="COMPSIZE(format,type)">const void *<name>data</name></param>
+            <alias name="glClearTexImage"/>
+        </command>
+        <command>
             <proto>void <name>glClearTexSubImage</name></proto>
             <param><ptype>GLuint</ptype> <name>texture</name></param>
             <param><ptype>GLint</ptype> <name>level</name></param>
@@ -9969,6 +10178,21 @@
             <param len="COMPSIZE(format,type)">const void *<name>data</name></param>
         </command>
         <command>
+            <proto>void <name>glClearTexSubImageEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>texture</name></param>
+            <param><ptype>GLint</ptype> <name>level</name></param>
+            <param><ptype>GLint</ptype> <name>xoffset</name></param>
+            <param><ptype>GLint</ptype> <name>yoffset</name></param>
+            <param><ptype>GLint</ptype> <name>zoffset</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLsizei</ptype> <name>depth</name></param>
+            <param><ptype>GLenum</ptype> <name>format</name></param>
+            <param><ptype>GLenum</ptype> <name>type</name></param>
+            <param len="COMPSIZE(format,type)">const void *<name>data</name></param>
+            <alias name="glClearTexSubImage"/>
+        </command>
+        <command>
             <proto>void <name>glClientActiveTexture</name></proto>
             <param group="TextureUnit"><ptype>GLenum</ptype> <name>texture</name></param>
         </command>
@@ -11707,6 +11931,11 @@
             <param><ptype>GLuint</ptype> *<name>framebuffers</name></param>
         </command>
         <command>
+            <proto>void <name>glCreateMemoryObjectsEXT</name></proto>
+            <param><ptype>GLsizei</ptype> <name>n</name></param>
+            <param><ptype>GLuint</ptype> *<name>memoryObjects</name></param>
+        </command>
+        <command>
             <proto>void <name>glCreatePerfQueryINTEL</name></proto>
             <param><ptype>GLuint</ptype> <name>queryId</name></param>
             <param><ptype>GLuint</ptype> *<name>queryHandle</name></param>
@@ -12014,6 +12243,11 @@
             <glx type="single" opcode="103"/>
         </command>
         <command>
+            <proto>void <name>glDeleteMemoryObjectsEXT</name></proto>
+            <param><ptype>GLsizei</ptype> <name>n</name></param>
+            <param len="n">const <ptype>GLuint</ptype> *<name>memoryObjects</name></param>
+        </command>
+        <command>
             <proto>void <name>glDeleteNamedStringARB</name></proto>
             <param><ptype>GLint</ptype> <name>namelen</name></param>
             <param len="namelen">const <ptype>GLchar</ptype> *<name>name</name></param>
@@ -12116,6 +12350,11 @@
             <param len="count">const <ptype>GLuint</ptype> *<name>samplers</name></param>
         </command>
         <command>
+            <proto>void <name>glDeleteSemaphoresEXT</name></proto>
+            <param><ptype>GLsizei</ptype> <name>n</name></param>
+            <param len="n">const <ptype>GLuint</ptype> *<name>semaphores</name></param>
+        </command>
+        <command>
             <proto>void <name>glDeleteShader</name></proto>
             <param><ptype>GLuint</ptype> <name>shader</name></param>
             <glx type="single" opcode="195"/>
@@ -12862,12 +13101,25 @@
             <param><ptype>GLuint</ptype> <name>id</name></param>
         </command>
         <command>
+            <proto>void <name>glDrawTransformFeedbackEXT</name></proto>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
+            <param><ptype>GLuint</ptype> <name>id</name></param>
+            <alias name="glDrawTransformFeedback"/>
+        </command>
+        <command>
             <proto>void <name>glDrawTransformFeedbackInstanced</name></proto>
             <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param><ptype>GLuint</ptype> <name>id</name></param>
             <param><ptype>GLsizei</ptype> <name>instancecount</name></param>
         </command>
         <command>
+            <proto>void <name>glDrawTransformFeedbackInstancedEXT</name></proto>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
+            <param><ptype>GLuint</ptype> <name>id</name></param>
+            <param><ptype>GLsizei</ptype> <name>instancecount</name></param>
+            <alias name="glDrawTransformFeedbackInstanced"/>
+        </command>
+        <command>
             <proto>void <name>glDrawTransformFeedbackNV</name></proto>
             <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param><ptype>GLuint</ptype> <name>id</name></param>
@@ -13623,6 +13875,28 @@
             <param group="DrawBufferMode" len="n">const <ptype>GLenum</ptype> *<name>bufs</name></param>
         </command>
         <command>
+            <proto>void <name>glFramebufferFetchBarrierQCOM</name></proto>
+        </command>
+        <command>
+            <proto>void <name>glFramebufferFoveationConfigQCOM</name></proto>
+            <param group="Framebuffer"><ptype>GLuint</ptype> <name>framebuffer</name></param>
+            <param><ptype>GLuint</ptype> <name>numLayers</name></param>
+            <param><ptype>GLuint</ptype> <name>focalPointsPerLayer</name></param>
+            <param><ptype>GLuint</ptype> <name>requestedFeatures</name></param>
+            <param len="1"><ptype>GLuint</ptype> *<name>providedFeatures</name></param>
+        </command>
+        <command>
+            <proto>void <name>glFramebufferFoveationParametersQCOM</name></proto>
+            <param group="Framebuffer"><ptype>GLuint</ptype> <name>framebuffer</name></param>
+            <param><ptype>GLuint</ptype> <name>layer</name></param>
+            <param><ptype>GLuint</ptype> <name>focalPoint</name></param>
+            <param group="CheckedFloat32"><ptype>GLfloat</ptype> <name>focalX</name></param>
+            <param group="CheckedFloat32"><ptype>GLfloat</ptype> <name>focalY</name></param>
+            <param group="CheckedFloat32"><ptype>GLfloat</ptype> <name>gainX</name></param>
+            <param group="CheckedFloat32"><ptype>GLfloat</ptype> <name>gainY</name></param>
+            <param group="CheckedFloat32"><ptype>GLfloat</ptype> <name>foveaArea</name></param>
+        </command>
+        <command>
             <proto>void <name>glFramebufferParameteri</name></proto>
             <param><ptype>GLenum</ptype> <name>target</name></param>
             <param><ptype>GLenum</ptype> <name>pname</name></param>
@@ -13677,6 +13951,13 @@
             <param>const <ptype>GLfloat</ptype> *<name>v</name></param>
         </command>
         <command>
+            <proto>void <name>glFramebufferSamplePositionsfvAMD</name></proto>
+            <param><ptype>GLenum</ptype> <name>target</name></param>
+            <param><ptype>GLuint</ptype> <name>numsamples</name></param>
+            <param><ptype>GLuint</ptype> <name>pixelindex</name></param>
+            <param>const <ptype>GLfloat</ptype> *<name>values</name></param>
+        </command>
+        <command>
             <proto>void <name>glFramebufferTexture</name></proto>
             <param><ptype>GLenum</ptype> <name>target</name></param>
             <param><ptype>GLenum</ptype> <name>attachment</name></param>
@@ -14078,6 +14359,11 @@
             <param len="count"><ptype>GLuint</ptype> *<name>samplers</name></param>
         </command>
         <command>
+            <proto>void <name>glGenSemaphoresEXT</name></proto>
+            <param><ptype>GLsizei</ptype> <name>n</name></param>
+            <param len="n"><ptype>GLuint</ptype> *<name>semaphores</name></param>
+        </command>
+        <command>
             <proto><ptype>GLuint</ptype> <name>glGenSymbolsEXT</name></proto>
             <param group="DataTypeEXT"><ptype>GLenum</ptype> <name>datatype</name></param>
             <param group="VertexShaderStorageTypeEXT"><ptype>GLenum</ptype> <name>storagetype</name></param>
@@ -14879,6 +15165,15 @@
             <param len="COMPSIZE(pname)"><ptype>GLint</ptype> *<name>params</name></param>
         </command>
         <command>
+            <proto>void <name>glGetFramebufferParameterfvAMD</name></proto>
+            <param><ptype>GLenum</ptype> <name>target</name></param>
+            <param><ptype>GLenum</ptype> <name>pname</name></param>
+            <param><ptype>GLuint</ptype> <name>numsamples</name></param>
+            <param><ptype>GLuint</ptype> <name>pixelindex</name></param>
+            <param><ptype>GLsizei</ptype> <name>size</name></param>
+            <param><ptype>GLfloat</ptype> *<name>values</name></param>
+        </command>
+        <command>
             <proto>void <name>glGetFramebufferParameteriv</name></proto>
             <param><ptype>GLenum</ptype> <name>target</name></param>
             <param><ptype>GLenum</ptype> <name>pname</name></param>
@@ -14902,6 +15197,7 @@
         </command>
         <command>
             <proto><ptype>GLenum</ptype> <name>glGetGraphicsResetStatusEXT</name></proto>
+            <alias name="glGetGraphicsResetStatus"/>
         </command>
         <command>
             <proto><ptype>GLenum</ptype> <name>glGetGraphicsResetStatusKHR</name></proto>
@@ -15257,6 +15553,12 @@
             <param len="COMPSIZE(pname)"><ptype>GLfixed</ptype> *<name>params</name></param>
         </command>
         <command>
+            <proto>void <name>glGetMemoryObjectParameterivEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>memoryObject</name></param>
+            <param group="MemoryObjectParameterName"><ptype>GLenum</ptype> <name>pname</name></param>
+            <param><ptype>GLint</ptype> *<name>params</name></param>
+        </command>
+        <command>
             <proto>void <name>glGetMinmax</name></proto>
             <param group="MinmaxTarget"><ptype>GLenum</ptype> <name>target</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>reset</name></param>
@@ -15455,6 +15757,15 @@
             <param len="COMPSIZE(size)">void *<name>data</name></param>
         </command>
         <command>
+            <proto>void <name>glGetNamedFramebufferParameterfvAMD</name></proto>
+            <param><ptype>GLuint</ptype> <name>framebuffer</name></param>
+            <param><ptype>GLenum</ptype> <name>pname</name></param>
+            <param><ptype>GLuint</ptype> <name>numsamples</name></param>
+            <param><ptype>GLuint</ptype> <name>pixelindex</name></param>
+            <param><ptype>GLsizei</ptype> <name>size</name></param>
+            <param><ptype>GLfloat</ptype> *<name>values</name></param>
+        </command>
+        <command>
             <proto>void <name>glGetNamedFramebufferAttachmentParameteriv</name></proto>
             <param><ptype>GLuint</ptype> <name>framebuffer</name></param>
             <param><ptype>GLenum</ptype> <name>attachment</name></param>
@@ -16329,6 +16640,12 @@
             <param len="COMPSIZE(pname)"><ptype>GLint</ptype> *<name>params</name></param>
         </command>
         <command>
+            <proto>void <name>glGetSemaphoreParameterui64vEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>semaphore</name></param>
+            <param group="SemaphoreParameterName"><ptype>GLenum</ptype> <name>pname</name></param>
+            <param><ptype>GLuint64</ptype> *<name>params</name></param>
+        </command>
+        <command>
             <proto>void <name>glGetSeparableFilter</name></proto>
             <param group="SeparableTarget"><ptype>GLenum</ptype> <name>target</name></param>
             <param group="PixelFormat"><ptype>GLenum</ptype> <name>format</name></param>
@@ -16629,6 +16946,11 @@
             <param><ptype>GLuint</ptype> <name>texture</name></param>
         </command>
         <command>
+            <proto><ptype>GLuint64</ptype> <name>glGetTextureHandleIMG</name></proto>
+            <param><ptype>GLuint</ptype> <name>texture</name></param>
+            <alias name="glGetTextureHandleARB"/>
+        </command>
+        <command>
             <proto><ptype>GLuint64</ptype> <name>glGetTextureHandleNV</name></proto>
             <param><ptype>GLuint</ptype> <name>texture</name></param>
         </command>
@@ -16738,6 +17060,12 @@
             <param><ptype>GLuint</ptype> <name>sampler</name></param>
         </command>
         <command>
+            <proto><ptype>GLuint64</ptype> <name>glGetTextureSamplerHandleIMG</name></proto>
+            <param><ptype>GLuint</ptype> <name>texture</name></param>
+            <param><ptype>GLuint</ptype> <name>sampler</name></param>
+            <alias name="glGetTextureSamplerHandleARB"/>
+        </command>
+        <command>
             <proto><ptype>GLuint64</ptype> <name>glGetTextureSamplerHandleNV</name></proto>
             <param><ptype>GLuint</ptype> <name>texture</name></param>
             <param><ptype>GLuint</ptype> <name>sampler</name></param>
@@ -16928,6 +17256,17 @@
             <alias name="glGetUniformuiv"/>
         </command>
         <command>
+            <proto>void <name>glGetUnsignedBytevEXT</name></proto>
+            <param group="GetPName"><ptype>GLenum</ptype> <name>pname</name></param>
+            <param len="COMPSIZE(pname)"><ptype>GLubyte</ptype> *<name>data</name></param>
+        </command>
+        <command>
+            <proto>void <name>glGetUnsignedBytei_vEXT</name></proto>
+            <param><ptype>GLenum</ptype> <name>target</name></param>
+            <param><ptype>GLuint</ptype> <name>index</name></param>
+            <param len="COMPSIZE(target)"><ptype>GLubyte</ptype> *<name>data</name></param>
+        </command>
+        <command>
             <proto>void <name>glGetVariantArrayObjectfvATI</name></proto>
             <param><ptype>GLuint</ptype> <name>id</name></param>
             <param group="ArrayObjectPNameATI"><ptype>GLenum</ptype> <name>pname</name></param>
@@ -17468,6 +17807,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>bufSize</name></param>
             <param len="bufSize"><ptype>GLfloat</ptype> *<name>params</name></param>
+            <alias name="glGetnUniformfv"/>
         </command>
         <command>
             <proto>void <name>glGetnUniformfvKHR</name></proto>
@@ -17504,6 +17844,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>bufSize</name></param>
             <param len="bufSize"><ptype>GLint</ptype> *<name>params</name></param>
+            <alias name="glGetnUniformiv"/>
         </command>
         <command>
             <proto>void <name>glGetnUniformivKHR</name></proto>
@@ -17633,6 +17974,45 @@
             <param len="COMPSIZE(pname)">const <ptype>GLint</ptype> *<name>params</name></param>
         </command>
         <command>
+            <proto>void <name>glImportMemoryFdEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>size</name></param>
+            <param group="ExternalHandleType"><ptype>GLenum</ptype> <name>handleType</name></param>
+            <param><ptype>GLint</ptype> <name>fd</name></param>
+        </command>
+        <command>
+            <proto>void <name>glImportMemoryWin32HandleEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>size</name></param>
+            <param group="ExternalHandleType"><ptype>GLenum</ptype> <name>handleType</name></param>
+            <param>void *<name>handle</name></param>
+        </command>
+        <command>
+            <proto>void <name>glImportMemoryWin32NameEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>size</name></param>
+            <param group="ExternalHandleType"><ptype>GLenum</ptype> <name>handleType</name></param>
+            <param>const void *<name>name</name></param>
+        </command>
+        <command>
+            <proto>void <name>glImportSemaphoreFdEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>semaphore</name></param>
+            <param group="ExternalHandleType"><ptype>GLenum</ptype> <name>handleType</name></param>
+            <param><ptype>GLint</ptype> <name>fd</name></param>
+        </command>
+        <command>
+            <proto>void <name>glImportSemaphoreWin32HandleEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>semaphore</name></param>
+            <param group="ExternalHandleType"><ptype>GLenum</ptype> <name>handleType</name></param>
+            <param>void *<name>handle</name></param>
+        </command>
+        <command>
+            <proto>void <name>glImportSemaphoreWin32NameEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>semaphore</name></param>
+            <param group="ExternalHandleType"><ptype>GLenum</ptype> <name>handleType</name></param>
+            <param>const void *<name>name</name></param>
+        </command>
+        <command>
             <proto group="sync"><ptype>GLsync</ptype> <name>glImportSyncEXT</name></proto>
             <param><ptype>GLenum</ptype> <name>external_sync_type</name></param>
             <param><ptype>GLintptr</ptype> <name>external_sync</name></param>
@@ -17921,6 +18301,10 @@
             <glx type="single" opcode="141"/>
         </command>
         <command>
+            <proto group="Boolean"><ptype>GLboolean</ptype> <name>glIsMemoryObjectEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>memoryObject</name></param>
+        </command>
+        <command>
             <proto group="Boolean"><ptype>GLboolean</ptype> <name>glIsNameAMD</name></proto>
             <param><ptype>GLenum</ptype> <name>identifier</name></param>
             <param><ptype>GLuint</ptype> <name>name</name></param>
@@ -18013,6 +18397,10 @@
             <param><ptype>GLuint</ptype> <name>renderbuffer</name></param>
         </command>
         <command>
+            <proto group="Boolean"><ptype>GLboolean</ptype> <name>glIsSemaphoreEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>semaphore</name></param>
+        </command>
+        <command>
             <proto group="Boolean"><ptype>GLboolean</ptype> <name>glIsSampler</name></proto>
             <param><ptype>GLuint</ptype> <name>sampler</name></param>
         </command>
@@ -18087,6 +18475,37 @@
             <param><ptype>GLenum</ptype> <name>pname</name></param>
         </command>
         <command>
+            <proto>void <name>glLGPUCopyImageSubDataNVX</name></proto>
+            <param><ptype>GLuint</ptype> <name>sourceGpu</name></param>
+            <param><ptype>GLbitfield</ptype> <name>destinationGpuMask</name></param>
+            <param><ptype>GLuint</ptype> <name>srcName</name></param>
+            <param><ptype>GLenum</ptype> <name>srcTarget</name></param>
+            <param><ptype>GLint</ptype> <name>srcLevel</name></param>
+            <param><ptype>GLint</ptype> <name>srcX</name></param>
+            <param><ptype>GLint</ptype> <name>srxY</name></param>
+            <param><ptype>GLint</ptype> <name>srcZ</name></param>
+            <param><ptype>GLuint</ptype> <name>dstName</name></param>
+            <param><ptype>GLenum</ptype> <name>dstTarget</name></param>
+            <param><ptype>GLint</ptype> <name>dstLevel</name></param>
+            <param><ptype>GLint</ptype> <name>dstX</name></param>
+            <param><ptype>GLint</ptype> <name>dstY</name></param>
+            <param><ptype>GLint</ptype> <name>dstZ</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLsizei</ptype> <name>depth</name></param>
+        </command>
+        <command>
+            <proto>void <name>glLGPUInterlockNVX</name></proto>
+        </command>
+        <command>
+            <proto>void <name>glLGPUNamedBufferSubDataNVX</name></proto>
+            <param><ptype>GLbitfield</ptype> <name>gpuMask</name></param>
+            <param><ptype>GLuint</ptype> <name>buffer</name></param>
+            <param><ptype>GLintptr</ptype> <name>offset</name></param>
+            <param><ptype>GLsizeiptr</ptype> <name>size</name></param>
+            <param>const void *<name>data</name></param>
+        </command>
+        <command>
             <proto>void <name>glLabelObjectEXT</name></proto>
             <param><ptype>GLenum</ptype> <name>type</name></param>
             <param><ptype>GLuint</ptype> <name>object</name></param>
@@ -18904,6 +19323,12 @@
             <alias name="glMemoryBarrier"/>
         </command>
         <command>
+            <proto>void <name>glMemoryObjectParameterivEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>memoryObject</name></param>
+            <param group="MemoryObjectParameterName"><ptype>GLenum</ptype> <name>pname</name></param>
+            <param>const <ptype>GLint</ptype> *<name>params</name></param>
+        </command>
+        <command>
             <proto>void <name>glMinSampleShading</name></proto>
             <param group="ColorF"><ptype>GLfloat</ptype> <name>value</name></param>
         </command>
@@ -18989,14 +19414,14 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawArraysIndirect</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param len="COMPSIZE(drawcount,stride)">const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>drawcount</name></param>
             <param><ptype>GLsizei</ptype> <name>stride</name></param>
         </command>
         <command>
             <proto>void <name>glMultiDrawArraysIndirectAMD</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param>const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>primcount</name></param>
             <param><ptype>GLsizei</ptype> <name>stride</name></param>
@@ -19004,7 +19429,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawArraysIndirectBindlessCountNV</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param>const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>drawCount</name></param>
             <param><ptype>GLsizei</ptype> <name>maxDrawCount</name></param>
@@ -19013,7 +19438,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawArraysIndirectBindlessNV</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param>const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>drawCount</name></param>
             <param><ptype>GLsizei</ptype> <name>stride</name></param>
@@ -19021,7 +19446,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawArraysIndirectCountARB</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param><ptype>GLintptr</ptype> <name>indirect</name></param>
             <param><ptype>GLintptr</ptype> <name>drawcount</name></param>
             <param><ptype>GLsizei</ptype> <name>maxdrawcount</name></param>
@@ -19029,7 +19454,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawArraysIndirectEXT</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param len="COMPSIZE(drawcount,stride)">const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>drawcount</name></param>
             <param><ptype>GLsizei</ptype> <name>stride</name></param>
@@ -19052,7 +19477,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawElementsBaseVertex</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param len="COMPSIZE(drawcount)">const <ptype>GLsizei</ptype> *<name>count</name></param>
             <param group="DrawElementsType"><ptype>GLenum</ptype> <name>type</name></param>
             <param len="COMPSIZE(drawcount)">const void *const*<name>indices</name></param>
@@ -19061,7 +19486,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawElementsBaseVertexEXT</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param len="COMPSIZE(drawcount)">const <ptype>GLsizei</ptype> *<name>count</name></param>
             <param group="DrawElementsType"><ptype>GLenum</ptype> <name>type</name></param>
             <param len="COMPSIZE(drawcount)">const void *const*<name>indices</name></param>
@@ -19071,7 +19496,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawElementsBaseVertexOES</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param len="COMPSIZE(drawcount)">const <ptype>GLsizei</ptype> *<name>count</name></param>
             <param group="DrawElementsType"><ptype>GLenum</ptype> <name>type</name></param>
             <param len="COMPSIZE(drawcount)">const void *const*<name>indices</name></param>
@@ -19090,7 +19515,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawElementsIndirect</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param><ptype>GLenum</ptype> <name>type</name></param>
             <param len="COMPSIZE(drawcount,stride)">const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>drawcount</name></param>
@@ -19098,7 +19523,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawElementsIndirectAMD</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param><ptype>GLenum</ptype> <name>type</name></param>
             <param>const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>primcount</name></param>
@@ -19107,7 +19532,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawElementsIndirectBindlessCountNV</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param><ptype>GLenum</ptype> <name>type</name></param>
             <param>const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>drawCount</name></param>
@@ -19117,7 +19542,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawElementsIndirectBindlessNV</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param><ptype>GLenum</ptype> <name>type</name></param>
             <param>const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>drawCount</name></param>
@@ -19126,7 +19551,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawElementsIndirectCountARB</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param><ptype>GLenum</ptype> <name>type</name></param>
             <param><ptype>GLintptr</ptype> <name>indirect</name></param>
             <param><ptype>GLintptr</ptype> <name>drawcount</name></param>
@@ -19135,7 +19560,7 @@
         </command>
         <command>
             <proto>void <name>glMultiDrawElementsIndirectEXT</name></proto>
-            <param><ptype>GLenum</ptype> <name>mode</name></param>
+            <param group="PrimitiveType"><ptype>GLenum</ptype> <name>mode</name></param>
             <param><ptype>GLenum</ptype> <name>type</name></param>
             <param len="COMPSIZE(drawcount,stride)">const void *<name>indirect</name></param>
             <param><ptype>GLsizei</ptype> <name>drawcount</name></param>
@@ -20053,6 +20478,103 @@
             <param len="COMPSIZE(format,type,width,height,depth)">const void *<name>pixels</name></param>
         </command>
         <command>
+            <proto>void <name>glMulticastBarrierNV</name></proto>
+        </command>
+        <command>
+            <proto>void <name>glMulticastBlitFramebufferNV</name></proto>
+            <param><ptype>GLuint</ptype> <name>srcGpu</name></param>
+            <param><ptype>GLuint</ptype> <name>dstGpu</name></param>
+            <param><ptype>GLint</ptype> <name>srcX0</name></param>
+            <param><ptype>GLint</ptype> <name>srcY0</name></param>
+            <param><ptype>GLint</ptype> <name>srcX1</name></param>
+            <param><ptype>GLint</ptype> <name>srcY1</name></param>
+            <param><ptype>GLint</ptype> <name>dstX0</name></param>
+            <param><ptype>GLint</ptype> <name>dstY0</name></param>
+            <param><ptype>GLint</ptype> <name>dstX1</name></param>
+            <param><ptype>GLint</ptype> <name>dstY1</name></param>
+            <param><ptype>GLbitfield</ptype> <name>mask</name></param>
+            <param><ptype>GLenum</ptype> <name>filter</name></param>
+        </command>
+        <command>
+            <proto>void <name>glMulticastBufferSubDataNV</name></proto>
+            <param><ptype>GLbitfield</ptype> <name>gpuMask</name></param>
+            <param><ptype>GLuint</ptype> <name>buffer</name></param>
+            <param><ptype>GLintptr</ptype> <name>offset</name></param>
+            <param><ptype>GLsizeiptr</ptype> <name>size</name></param>
+            <param>const <ptype>GLvoid</ptype> *<name>data</name></param>
+        </command>
+        <command>
+            <proto>void <name>glMulticastCopyBufferSubDataNV</name></proto>
+            <param><ptype>GLuint</ptype> <name>readGpu</name></param>
+            <param><ptype>GLbitfield</ptype> <name>writeGpuMask</name></param>
+            <param><ptype>GLuint</ptype> <name>readBuffer</name></param>
+            <param><ptype>GLuint</ptype> <name>writeBuffer</name></param>
+            <param><ptype>GLintptr</ptype> <name>readOffset</name></param>
+            <param><ptype>GLintptr</ptype> <name>writeOffset</name></param>
+            <param><ptype>GLsizeiptr</ptype> <name>size</name></param>
+        </command>
+        <command>
+            <proto>void <name>glMulticastCopyImageSubDataNV</name></proto>
+            <param><ptype>GLuint</ptype> <name>srcGpu</name></param>
+            <param><ptype>GLbitfield</ptype> <name>dstGpuMask</name></param>
+            <param><ptype>GLuint</ptype> <name>srcName</name></param>
+            <param><ptype>GLenum</ptype> <name>srcTarget</name></param>
+            <param><ptype>GLint</ptype> <name>srcLevel</name></param>
+            <param><ptype>GLint</ptype> <name>srcX</name></param>
+            <param><ptype>GLint</ptype> <name>srcY</name></param>
+            <param><ptype>GLint</ptype> <name>srcZ</name></param>
+            <param><ptype>GLuint</ptype> <name>dstName</name></param>
+            <param><ptype>GLenum</ptype> <name>dstTarget</name></param>
+            <param><ptype>GLint</ptype> <name>dstLevel</name></param>
+            <param><ptype>GLint</ptype> <name>dstX</name></param>
+            <param><ptype>GLint</ptype> <name>dstY</name></param>
+            <param><ptype>GLint</ptype> <name>dstZ</name></param>
+            <param><ptype>GLsizei</ptype> <name>srcWidth</name></param>
+            <param><ptype>GLsizei</ptype> <name>srcHeight</name></param>
+            <param><ptype>GLsizei</ptype> <name>srcDepth</name></param>
+        </command>
+        <command>
+            <proto>void <name>glMulticastFramebufferSampleLocationsfvNV</name></proto>
+            <param><ptype>GLuint</ptype> <name>gpu</name></param>
+            <param><ptype>GLuint</ptype> <name>framebuffer</name></param>
+            <param><ptype>GLuint</ptype> <name>start</name></param>
+            <param><ptype>GLsizei</ptype> <name>count</name></param>
+            <param>const <ptype>GLfloat</ptype> *<name>v</name></param>
+        </command>
+        <command>
+            <proto>void <name>glMulticastGetQueryObjecti64vNV</name></proto>
+            <param><ptype>GLuint</ptype> <name>gpu</name></param>
+            <param><ptype>GLuint</ptype> <name>id</name></param>
+            <param><ptype>GLenum</ptype> <name>pname</name></param>
+            <param><ptype>GLint64</ptype> *<name>params</name></param>
+        </command>
+        <command>
+            <proto>void <name>glMulticastGetQueryObjectivNV</name></proto>
+            <param><ptype>GLuint</ptype> <name>gpu</name></param>
+            <param><ptype>GLuint</ptype> <name>id</name></param>
+            <param><ptype>GLenum</ptype> <name>pname</name></param>
+            <param><ptype>GLint</ptype> *<name>params</name></param>
+        </command>
+        <command>
+            <proto>void <name>glMulticastGetQueryObjectui64vNV</name></proto>
+            <param><ptype>GLuint</ptype> <name>gpu</name></param>
+            <param><ptype>GLuint</ptype> <name>id</name></param>
+            <param><ptype>GLenum</ptype> <name>pname</name></param>
+            <param><ptype>GLuint64</ptype> *<name>params</name></param>
+        </command>
+        <command>
+            <proto>void <name>glMulticastGetQueryObjectuivNV</name></proto>
+            <param><ptype>GLuint</ptype> <name>gpu</name></param>
+            <param><ptype>GLuint</ptype> <name>id</name></param>
+            <param><ptype>GLenum</ptype> <name>pname</name></param>
+            <param><ptype>GLuint</ptype> *<name>params</name></param>
+        </command>
+        <command>
+            <proto>void <name>glMulticastWaitSyncNV</name></proto>
+            <param><ptype>GLuint</ptype> <name>signalGpu</name></param>
+            <param><ptype>GLbitfield</ptype> <name>waitGpuMask</name></param>
+        </command>
+        <command>
             <proto>void <name>glNamedBufferData</name></proto>
             <param><ptype>GLuint</ptype> <name>buffer</name></param>
             <param group="BufferSize"><ptype>GLsizeiptr</ptype> <name>size</name></param>
@@ -20096,6 +20618,13 @@
             <alias name="glNamedBufferStorage"/>
         </command>
         <command>
+            <proto>void <name>glNamedBufferStorageMemEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>buffer</name></param>
+            <param group="BufferSize"><ptype>GLsizeiptr</ptype> <name>size</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
             <proto>void <name>glNamedBufferSubData</name></proto>
             <param><ptype>GLuint</ptype> <name>buffer</name></param>
             <param><ptype>GLintptr</ptype> <name>offset</name></param>
@@ -20182,6 +20711,13 @@
             <param><ptype>GLint</ptype> <name>level</name></param>
         </command>
         <command>
+            <proto>void <name>glNamedFramebufferSamplePositionsfvAMD</name></proto>
+            <param><ptype>GLuint</ptype> <name>framebuffer</name></param>
+            <param><ptype>GLuint</ptype> <name>numsamples</name></param>
+            <param><ptype>GLuint</ptype> <name>pixelindex</name></param>
+            <param>const <ptype>GLfloat</ptype> *<name>values</name></param>
+        </command>
+        <command>
             <proto>void <name>glNamedFramebufferTexture1DEXT</name></proto>
             <param group="Framebuffer"><ptype>GLuint</ptype> <name>framebuffer</name></param>
             <param group="FramebufferAttachment"><ptype>GLenum</ptype> <name>attachment</name></param>
@@ -21710,7 +22246,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="1">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform1dvEXT</name></proto>
@@ -21737,7 +22273,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="1">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform1fvEXT</name></proto>
@@ -21791,7 +22327,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="1">const <ptype>GLint</ptype> *<name>value</name></param>
+            <param len="count">const <ptype>GLint</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform1ivEXT</name></proto>
@@ -21845,7 +22381,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="1">const <ptype>GLuint</ptype> *<name>value</name></param>
+            <param len="count">const <ptype>GLuint</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform1uivEXT</name></proto>
@@ -21874,14 +22410,14 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="2">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*2">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform2dvEXT</name></proto>
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*2">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform2f</name></proto>
@@ -21903,7 +22439,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="2">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*2">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform2fvEXT</name></proto>
@@ -21961,7 +22497,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="2">const <ptype>GLint</ptype> *<name>value</name></param>
+            <param len="count*2">const <ptype>GLint</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform2ivEXT</name></proto>
@@ -22019,7 +22555,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="2">const <ptype>GLuint</ptype> *<name>value</name></param>
+            <param len="count*2">const <ptype>GLuint</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform2uivEXT</name></proto>
@@ -22050,14 +22586,14 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="3">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*3">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform3dvEXT</name></proto>
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*3">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform3f</name></proto>
@@ -22081,7 +22617,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="3">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*3">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform3fvEXT</name></proto>
@@ -22143,7 +22679,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="3">const <ptype>GLint</ptype> *<name>value</name></param>
+            <param len="count*3">const <ptype>GLint</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform3ivEXT</name></proto>
@@ -22205,7 +22741,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="3">const <ptype>GLuint</ptype> *<name>value</name></param>
+            <param len="count*3">const <ptype>GLuint</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform3uivEXT</name></proto>
@@ -22238,14 +22774,14 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="4">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*4">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform4dvEXT</name></proto>
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*4">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform4f</name></proto>
@@ -22271,7 +22807,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="4">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*4">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform4fvEXT</name></proto>
@@ -22337,7 +22873,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="4">const <ptype>GLint</ptype> *<name>value</name></param>
+            <param len="count*4">const <ptype>GLint</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform4ivEXT</name></proto>
@@ -22403,7 +22939,7 @@
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
-            <param len="4">const <ptype>GLuint</ptype> *<name>value</name></param>
+            <param len="count*4">const <ptype>GLuint</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniform4uivEXT</name></proto>
@@ -22420,6 +22956,13 @@
             <param><ptype>GLuint64</ptype> <name>value</name></param>
         </command>
         <command>
+            <proto>void <name>glProgramUniformHandleui64IMG</name></proto>
+            <param><ptype>GLuint</ptype> <name>program</name></param>
+            <param><ptype>GLint</ptype> <name>location</name></param>
+            <param><ptype>GLuint64</ptype> <name>value</name></param>
+            <alias name="glProgramUniformHandleui64ARB"/>
+        </command>
+        <command>
             <proto>void <name>glProgramUniformHandleui64NV</name></proto>
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
@@ -22433,6 +22976,14 @@
             <param len="count">const <ptype>GLuint64</ptype> *<name>values</name></param>
         </command>
         <command>
+            <proto>void <name>glProgramUniformHandleui64vIMG</name></proto>
+            <param><ptype>GLuint</ptype> <name>program</name></param>
+            <param><ptype>GLint</ptype> <name>location</name></param>
+            <param><ptype>GLsizei</ptype> <name>count</name></param>
+            <param len="count">const <ptype>GLuint64</ptype> *<name>values</name></param>
+            <alias name="glProgramUniformHandleui64vARB"/>
+        </command>
+        <command>
             <proto>void <name>glProgramUniformHandleui64vNV</name></proto>
             <param><ptype>GLuint</ptype> <name>program</name></param>
             <param><ptype>GLint</ptype> <name>location</name></param>
@@ -22445,7 +22996,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="2">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*4">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix2dvEXT</name></proto>
@@ -22453,7 +23004,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*4">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix2fv</name></proto>
@@ -22461,7 +23012,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="2">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*4">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix2fvEXT</name></proto>
@@ -22478,7 +23029,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*6">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix2x3dvEXT</name></proto>
@@ -22486,7 +23037,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*6">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix2x3fv</name></proto>
@@ -22494,7 +23045,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*6">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix2x3fvEXT</name></proto>
@@ -22511,7 +23062,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*8">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix2x4dvEXT</name></proto>
@@ -22519,7 +23070,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*8">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix2x4fv</name></proto>
@@ -22527,7 +23078,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*8">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix2x4fvEXT</name></proto>
@@ -22544,7 +23095,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="3">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*9">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix3dvEXT</name></proto>
@@ -22552,7 +23103,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*9">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix3fv</name></proto>
@@ -22560,7 +23111,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="3">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*9">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix3fvEXT</name></proto>
@@ -22577,7 +23128,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*6">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix3x2dvEXT</name></proto>
@@ -22585,7 +23136,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*6">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix3x2fv</name></proto>
@@ -22593,7 +23144,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*6">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix3x2fvEXT</name></proto>
@@ -22610,7 +23161,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*12">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix3x4dvEXT</name></proto>
@@ -22618,7 +23169,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*12">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix3x4fv</name></proto>
@@ -22626,7 +23177,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*12">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix3x4fvEXT</name></proto>
@@ -22643,7 +23194,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="4">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*16">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix4dvEXT</name></proto>
@@ -22651,7 +23202,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*16">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix4fv</name></proto>
@@ -22659,7 +23210,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="4">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*16">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix4fvEXT</name></proto>
@@ -22676,7 +23227,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*8">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix4x2dvEXT</name></proto>
@@ -22684,7 +23235,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*8">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix4x2fv</name></proto>
@@ -22692,7 +23243,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*8">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix4x2fvEXT</name></proto>
@@ -22709,7 +23260,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*12">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix4x3dvEXT</name></proto>
@@ -22717,7 +23268,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLdouble</ptype> *<name>value</name></param>
+            <param len="count*12">const <ptype>GLdouble</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix4x3fv</name></proto>
@@ -22725,7 +23276,7 @@
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
             <param group="Boolean"><ptype>GLboolean</ptype> <name>transpose</name></param>
-            <param len="count">const <ptype>GLfloat</ptype> *<name>value</name></param>
+            <param len="count*12">const <ptype>GLfloat</ptype> *<name>value</name></param>
         </command>
         <command>
             <proto>void <name>glProgramUniformMatrix4x3fvEXT</name></proto>
@@ -23086,6 +23637,11 @@
             <alias name="glReadnPixels"/>
         </command>
         <command>
+            <proto><ptype>GLboolean</ptype> <name>glReleaseKeyedMutexWin32EXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>key</name></param>
+        </command>
+        <command>
             <proto>void <name>glRectd</name></proto>
             <param group="CoordD"><ptype>GLdouble</ptype> <name>x1</name></param>
             <param group="CoordD"><ptype>GLdouble</ptype> <name>y1</name></param>
@@ -23162,6 +23718,10 @@
             <proto>void <name>glReleaseShaderCompiler</name></proto>
         </command>
         <command>
+            <proto>void <name>glRenderGpuMaskNV</name></proto>
+            <param><ptype>GLbitfield</ptype> <name>mask</name></param>
+        </command>
+        <command>
             <proto><ptype>GLint</ptype> <name>glRenderMode</name></proto>
             <param group="RenderingMode"><ptype>GLenum</ptype> <name>mode</name></param>
             <glx type="single" opcode="107"/>
@@ -23990,6 +24550,12 @@
             <param len="numCounters"><ptype>GLuint</ptype> *<name>counterList</name></param>
         </command>
         <command>
+            <proto>void <name>glSemaphoreParameterui64vEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>semaphore</name></param>
+            <param group="SemaphoreParameterName"><ptype>GLenum</ptype> <name>pname</name></param>
+            <param>const <ptype>GLuint64</ptype> *<name>params</name></param>
+        </command>
+        <command>
             <proto>void <name>glSeparableFilter2D</name></proto>
             <param group="SeparableTarget"><ptype>GLenum</ptype> <name>target</name></param>
             <param group="PixelInternalFormat"><ptype>GLenum</ptype> <name>internalformat</name></param>
@@ -24110,6 +24676,23 @@
             <glx type="render" opcode="2052"/>
         </command>
         <command>
+            <proto>void <name>glSignalSemaphoreEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>semaphore</name></param>
+            <param><ptype>GLuint</ptype> <name>numBufferBarriers</name></param>
+            <param len="COMPSIZE(numBufferBarriers)">const <ptype>GLuint</ptype> *<name>buffers</name></param>
+            <param><ptype>GLuint</ptype> <name>numTextureBarriers</name></param>
+            <param len="COMPSIZE(numTextureBarriers)">const <ptype>GLuint</ptype> *<name>textures</name></param>
+            <param group="TextureLayout" len="COMPSIZE(numTextureBarriers)">const <ptype>GLenum</ptype> *<name>dstLayouts</name></param>
+        </command>
+        <command>
+            <proto>void <name>glSpecializeShaderARB</name></proto>
+            <param><ptype>GLuint</ptype> <name>shader</name></param>
+            <param>const <ptype>GLchar</ptype> *<name>pEntryPoint</name></param>
+            <param><ptype>GLuint</ptype> <name>numSpecializationConstants</name></param>
+            <param>const <ptype>GLuint</ptype> *<name>pConstantIndex</name></param>
+            <param>const <ptype>GLuint</ptype> *<name>pConstantValue</name></param>
+        </command>
+        <command>
             <proto>void <name>glSpriteParameterfSGIX</name></proto>
             <param group="SpriteParameterNameSGIX"><ptype>GLenum</ptype> <name>pname</name></param>
             <param group="CheckedFloat32"><ptype>GLfloat</ptype> <name>param</name></param>
@@ -25447,6 +26030,59 @@
             <alias name="glTexStorage3DMultisample"/>
         </command>
         <command>
+            <proto>void <name>glTexStorageMem1DEXT</name></proto>
+            <param group="TextureTarget"><ptype>GLenum</ptype> <name>target</name></param>
+            <param><ptype>GLsizei</ptype> <name>levels</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
+            <proto>void <name>glTexStorageMem2DEXT</name></proto>
+            <param group="TextureTarget"><ptype>GLenum</ptype> <name>target</name></param>
+            <param><ptype>GLsizei</ptype> <name>levels</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
+            <proto>void <name>glTexStorageMem2DMultisampleEXT</name></proto>
+            <param group="TextureTarget"><ptype>GLenum</ptype> <name>target</name></param>
+            <param><ptype>GLsizei</ptype> <name>samples</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLboolean</ptype> <name>fixedSampleLocations</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
+            <proto>void <name>glTexStorageMem3DEXT</name></proto>
+            <param group="TextureTarget"><ptype>GLenum</ptype> <name>target</name></param>
+            <param><ptype>GLsizei</ptype> <name>levels</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLsizei</ptype> <name>depth</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
+            <proto>void <name>glTexStorageMem3DMultisampleEXT</name></proto>
+            <param group="TextureTarget"><ptype>GLenum</ptype> <name>target</name></param>
+            <param><ptype>GLsizei</ptype> <name>samples</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLsizei</ptype> <name>depth</name></param>
+            <param><ptype>GLboolean</ptype> <name>fixedSampleLocations</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
             <proto>void <name>glTexStorageSparseAMD</name></proto>
             <param><ptype>GLenum</ptype> <name>target</name></param>
             <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
@@ -25909,6 +26545,59 @@
             <param group="Boolean"><ptype>GLboolean</ptype> <name>fixedsamplelocations</name></param>
         </command>
         <command>
+            <proto>void <name>glTextureStorageMem1DEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>texture</name></param>
+            <param><ptype>GLsizei</ptype> <name>levels</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
+            <proto>void <name>glTextureStorageMem2DEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>texture</name></param>
+            <param><ptype>GLsizei</ptype> <name>levels</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
+            <proto>void <name>glTextureStorageMem2DMultisampleEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>texture</name></param>
+            <param><ptype>GLsizei</ptype> <name>samples</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLboolean</ptype> <name>fixedSampleLocations</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
+            <proto>void <name>glTextureStorageMem3DEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>texture</name></param>
+            <param><ptype>GLsizei</ptype> <name>levels</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLsizei</ptype> <name>depth</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
+            <proto>void <name>glTextureStorageMem3DMultisampleEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>texture</name></param>
+            <param><ptype>GLsizei</ptype> <name>samples</name></param>
+            <param><ptype>GLenum</ptype> <name>internalFormat</name></param>
+            <param><ptype>GLsizei</ptype> <name>width</name></param>
+            <param><ptype>GLsizei</ptype> <name>height</name></param>
+            <param><ptype>GLsizei</ptype> <name>depth</name></param>
+            <param><ptype>GLboolean</ptype> <name>fixedSampleLocations</name></param>
+            <param><ptype>GLuint</ptype> <name>memory</name></param>
+            <param><ptype>GLuint64</ptype> <name>offset</name></param>
+        </command>
+        <command>
             <proto>void <name>glTextureStorageSparseAMD</name></proto>
             <param><ptype>GLuint</ptype> <name>texture</name></param>
             <param><ptype>GLenum</ptype> <name>target</name></param>
@@ -26712,6 +27401,12 @@
             <param><ptype>GLuint64</ptype> <name>value</name></param>
         </command>
         <command>
+            <proto>void <name>glUniformHandleui64IMG</name></proto>
+            <param><ptype>GLint</ptype> <name>location</name></param>
+            <param><ptype>GLuint64</ptype> <name>value</name></param>
+            <alias name="glUniformHandleui64ARB"/>
+        </command>
+        <command>
             <proto>void <name>glUniformHandleui64NV</name></proto>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLuint64</ptype> <name>value</name></param>
@@ -26723,6 +27418,13 @@
             <param len="count">const <ptype>GLuint64</ptype> *<name>value</name></param>
         </command>
         <command>
+            <proto>void <name>glUniformHandleui64vIMG</name></proto>
+            <param><ptype>GLint</ptype> <name>location</name></param>
+            <param><ptype>GLsizei</ptype> <name>count</name></param>
+            <param len="count">const <ptype>GLuint64</ptype> *<name>value</name></param>
+            <alias name="glUniformHandleui64vARB"/>
+        </command>
+        <command>
             <proto>void <name>glUniformHandleui64vNV</name></proto>
             <param><ptype>GLint</ptype> <name>location</name></param>
             <param><ptype>GLsizei</ptype> <name>count</name></param>
@@ -29558,6 +30260,15 @@
             <param><ptype>GLenum</ptype> <name>swizzlew</name></param>
         </command>
         <command>
+            <proto>void <name>glWaitSemaphoreEXT</name></proto>
+            <param><ptype>GLuint</ptype> <name>semaphore</name></param>
+            <param><ptype>GLuint</ptype> <name>numBufferBarriers</name></param>
+            <param len="COMPSIZE(numBufferBarriers)">const <ptype>GLuint</ptype> *<name>buffers</name></param>
+            <param><ptype>GLuint</ptype> <name>numTextureBarriers</name></param>
+            <param len="COMPSIZE(numTextureBarriers)">const <ptype>GLuint</ptype> *<name>textures</name></param>
+            <param group="TextureLayout" len="COMPSIZE(numTextureBarriers)">const <ptype>GLenum</ptype> *<name>srcLayouts</name></param>
+        </command>
+        <command>
             <proto>void <name>glWaitSync</name></proto>
             <param group="sync"><ptype>GLsync</ptype> <name>sync</name></param>
             <param><ptype>GLbitfield</ptype> <name>flags</name></param>
@@ -30002,6 +30713,36 @@
             <param group="VertexShaderWriteMaskEXT"><ptype>GLenum</ptype> <name>outZ</name></param>
             <param group="VertexShaderWriteMaskEXT"><ptype>GLenum</ptype> <name>outW</name></param>
         </command>
+        <command>
+            <proto>void <name>glDrawVkImageNV</name></proto>
+            <param><ptype>GLuint64</ptype> <name>vkImage</name></param>
+            <param><ptype>GLuint</ptype> <name>sampler</name></param>
+            <param><ptype>GLfloat</ptype> <name>x0</name></param>
+            <param><ptype>GLfloat</ptype> <name>y0</name></param>
+            <param><ptype>GLfloat</ptype> <name>x1</name></param>
+            <param><ptype>GLfloat</ptype> <name>y1</name></param>
+            <param><ptype>GLfloat</ptype> <name>z</name></param>
+            <param><ptype>GLfloat</ptype> <name>s0</name></param>
+            <param><ptype>GLfloat</ptype> <name>t0</name></param>
+            <param><ptype>GLfloat</ptype> <name>s1</name></param>
+            <param><ptype>GLfloat</ptype> <name>t1</name></param>
+        </command>
+        <command>
+            <proto><ptype>GLVULKANPROCNV</ptype> <name>glGetVkProcAddrNV</name></proto>
+            <param len="COMPSIZE(name)">const <ptype>GLchar</ptype> *<name>name</name></param>
+        </command>
+        <command>
+            <proto>void <name>glWaitVkSemaphoreNV</name></proto>
+            <param><ptype>GLuint64</ptype> <name>vkSemaphore</name></param>
+        </command>
+        <command>
+            <proto>void <name>glSignalVkSemaphoreNV</name></proto>
+            <param><ptype>GLuint64</ptype> <name>vkSemaphore</name></param>
+        </command>
+        <command>
+            <proto>void <name>glSignalVkFenceNV</name></proto>
+            <param><ptype>GLuint64</ptype> <name>vkFence</name></param>
+        </command>
 
     </commands>
 
@@ -30009,6 +30750,430 @@
     <feature api="gl" name="GL_VERSION_1_0" number="1.0">
         <require>
             <type name="GLvoid" comment="No longer used in headers"/>
+            <enum name="GL_DEPTH_BUFFER_BIT"/>
+            <enum name="GL_STENCIL_BUFFER_BIT"/>
+            <enum name="GL_COLOR_BUFFER_BIT"/>
+            <enum name="GL_FALSE"/>
+            <enum name="GL_TRUE"/>
+            <enum name="GL_POINTS"/>
+            <enum name="GL_LINES"/>
+            <enum name="GL_LINE_LOOP"/>
+            <enum name="GL_LINE_STRIP"/>
+            <enum name="GL_TRIANGLES"/>
+            <enum name="GL_TRIANGLE_STRIP"/>
+            <enum name="GL_TRIANGLE_FAN"/>
+            <enum name="GL_QUADS"/>
+            <enum name="GL_NEVER"/>
+            <enum name="GL_LESS"/>
+            <enum name="GL_EQUAL"/>
+            <enum name="GL_LEQUAL"/>
+            <enum name="GL_GREATER"/>
+            <enum name="GL_NOTEQUAL"/>
+            <enum name="GL_GEQUAL"/>
+            <enum name="GL_ALWAYS"/>
+            <enum name="GL_ZERO"/>
+            <enum name="GL_ONE"/>
+            <enum name="GL_SRC_COLOR"/>
+            <enum name="GL_ONE_MINUS_SRC_COLOR"/>
+            <enum name="GL_SRC_ALPHA"/>
+            <enum name="GL_ONE_MINUS_SRC_ALPHA"/>
+            <enum name="GL_DST_ALPHA"/>
+            <enum name="GL_ONE_MINUS_DST_ALPHA"/>
+            <enum name="GL_DST_COLOR"/>
+            <enum name="GL_ONE_MINUS_DST_COLOR"/>
+            <enum name="GL_SRC_ALPHA_SATURATE"/>
+            <enum name="GL_NONE"/>
+            <enum name="GL_FRONT_LEFT"/>
+            <enum name="GL_FRONT_RIGHT"/>
+            <enum name="GL_BACK_LEFT"/>
+            <enum name="GL_BACK_RIGHT"/>
+            <enum name="GL_FRONT"/>
+            <enum name="GL_BACK"/>
+            <enum name="GL_LEFT"/>
+            <enum name="GL_RIGHT"/>
+            <enum name="GL_FRONT_AND_BACK"/>
+            <enum name="GL_NO_ERROR"/>
+            <enum name="GL_INVALID_ENUM"/>
+            <enum name="GL_INVALID_VALUE"/>
+            <enum name="GL_INVALID_OPERATION"/>
+            <enum name="GL_OUT_OF_MEMORY"/>
+            <enum name="GL_CW"/>
+            <enum name="GL_CCW"/>
+            <enum name="GL_POINT_SIZE"/>
+            <enum name="GL_POINT_SIZE_RANGE"/>
+            <enum name="GL_POINT_SIZE_GRANULARITY"/>
+            <enum name="GL_LINE_SMOOTH"/>
+            <enum name="GL_LINE_WIDTH"/>
+            <enum name="GL_LINE_WIDTH_RANGE"/>
+            <enum name="GL_LINE_WIDTH_GRANULARITY"/>
+            <enum name="GL_POLYGON_MODE"/>
+            <enum name="GL_POLYGON_SMOOTH"/>
+            <enum name="GL_CULL_FACE"/>
+            <enum name="GL_CULL_FACE_MODE"/>
+            <enum name="GL_FRONT_FACE"/>
+            <enum name="GL_DEPTH_RANGE"/>
+            <enum name="GL_DEPTH_TEST"/>
+            <enum name="GL_DEPTH_WRITEMASK"/>
+            <enum name="GL_DEPTH_CLEAR_VALUE"/>
+            <enum name="GL_DEPTH_FUNC"/>
+            <enum name="GL_STENCIL_TEST"/>
+            <enum name="GL_STENCIL_CLEAR_VALUE"/>
+            <enum name="GL_STENCIL_FUNC"/>
+            <enum name="GL_STENCIL_VALUE_MASK"/>
+            <enum name="GL_STENCIL_FAIL"/>
+            <enum name="GL_STENCIL_PASS_DEPTH_FAIL"/>
+            <enum name="GL_STENCIL_PASS_DEPTH_PASS"/>
+            <enum name="GL_STENCIL_REF"/>
+            <enum name="GL_STENCIL_WRITEMASK"/>
+            <enum name="GL_VIEWPORT"/>
+            <enum name="GL_DITHER"/>
+            <enum name="GL_BLEND_DST"/>
+            <enum name="GL_BLEND_SRC"/>
+            <enum name="GL_BLEND"/>
+            <enum name="GL_LOGIC_OP_MODE"/>
+            <enum name="GL_DRAW_BUFFER"/>
+            <enum name="GL_READ_BUFFER"/>
+            <enum name="GL_SCISSOR_BOX"/>
+            <enum name="GL_SCISSOR_TEST"/>
+            <enum name="GL_COLOR_CLEAR_VALUE"/>
+            <enum name="GL_COLOR_WRITEMASK"/>
+            <enum name="GL_DOUBLEBUFFER"/>
+            <enum name="GL_STEREO"/>
+            <enum name="GL_LINE_SMOOTH_HINT"/>
+            <enum name="GL_POLYGON_SMOOTH_HINT"/>
+            <enum name="GL_UNPACK_SWAP_BYTES"/>
+            <enum name="GL_UNPACK_LSB_FIRST"/>
+            <enum name="GL_UNPACK_ROW_LENGTH"/>
+            <enum name="GL_UNPACK_SKIP_ROWS"/>
+            <enum name="GL_UNPACK_SKIP_PIXELS"/>
+            <enum name="GL_UNPACK_ALIGNMENT"/>
+            <enum name="GL_PACK_SWAP_BYTES"/>
+            <enum name="GL_PACK_LSB_FIRST"/>
+            <enum name="GL_PACK_ROW_LENGTH"/>
+            <enum name="GL_PACK_SKIP_ROWS"/>
+            <enum name="GL_PACK_SKIP_PIXELS"/>
+            <enum name="GL_PACK_ALIGNMENT"/>
+            <enum name="GL_MAX_TEXTURE_SIZE"/>
+            <enum name="GL_MAX_VIEWPORT_DIMS"/>
+            <enum name="GL_SUBPIXEL_BITS"/>
+            <enum name="GL_TEXTURE_1D"/>
+            <enum name="GL_TEXTURE_2D"/>
+            <enum name="GL_TEXTURE_WIDTH"/>
+            <enum name="GL_TEXTURE_HEIGHT"/>
+            <enum name="GL_TEXTURE_BORDER_COLOR"/>
+            <enum name="GL_DONT_CARE"/>
+            <enum name="GL_FASTEST"/>
+            <enum name="GL_NICEST"/>
+            <enum name="GL_BYTE"/>
+            <enum name="GL_UNSIGNED_BYTE"/>
+            <enum name="GL_SHORT"/>
+            <enum name="GL_UNSIGNED_SHORT"/>
+            <enum name="GL_INT"/>
+            <enum name="GL_UNSIGNED_INT"/>
+            <enum name="GL_FLOAT"/>
+            <enum name="GL_STACK_OVERFLOW"/>
+            <enum name="GL_STACK_UNDERFLOW"/>
+            <enum name="GL_CLEAR"/>
+            <enum name="GL_AND"/>
+            <enum name="GL_AND_REVERSE"/>
+            <enum name="GL_COPY"/>
+            <enum name="GL_AND_INVERTED"/>
+            <enum name="GL_NOOP"/>
+            <enum name="GL_XOR"/>
+            <enum name="GL_OR"/>
+            <enum name="GL_NOR"/>
+            <enum name="GL_EQUIV"/>
+            <enum name="GL_INVERT"/>
+            <enum name="GL_OR_REVERSE"/>
+            <enum name="GL_COPY_INVERTED"/>
+            <enum name="GL_OR_INVERTED"/>
+            <enum name="GL_NAND"/>
+            <enum name="GL_SET"/>
+            <enum name="GL_TEXTURE"/>
+            <enum name="GL_COLOR"/>
+            <enum name="GL_DEPTH"/>
+            <enum name="GL_STENCIL"/>
+            <enum name="GL_STENCIL_INDEX"/>
+            <enum name="GL_DEPTH_COMPONENT"/>
+            <enum name="GL_RED"/>
+            <enum name="GL_GREEN"/>
+            <enum name="GL_BLUE"/>
+            <enum name="GL_ALPHA"/>
+            <enum name="GL_RGB"/>
+            <enum name="GL_RGBA"/>
+            <enum name="GL_POINT"/>
+            <enum name="GL_LINE"/>
+            <enum name="GL_FILL"/>
+            <enum name="GL_KEEP"/>
+            <enum name="GL_REPLACE"/>
+            <enum name="GL_INCR"/>
+            <enum name="GL_DECR"/>
+            <enum name="GL_VENDOR"/>
+            <enum name="GL_RENDERER"/>
+            <enum name="GL_VERSION"/>
+            <enum name="GL_EXTENSIONS"/>
+            <enum name="GL_NEAREST"/>
+            <enum name="GL_LINEAR"/>
+            <enum name="GL_NEAREST_MIPMAP_NEAREST"/>
+            <enum name="GL_LINEAR_MIPMAP_NEAREST"/>
+            <enum name="GL_NEAREST_MIPMAP_LINEAR"/>
+            <enum name="GL_LINEAR_MIPMAP_LINEAR"/>
+            <enum name="GL_TEXTURE_MAG_FILTER"/>
+            <enum name="GL_TEXTURE_MIN_FILTER"/>
+            <enum name="GL_TEXTURE_WRAP_S"/>
+            <enum name="GL_TEXTURE_WRAP_T"/>
+            <enum name="GL_REPEAT"/>
+            <enum name="GL_CURRENT_BIT"/>
+            <enum name="GL_POINT_BIT"/>
+            <enum name="GL_LINE_BIT"/>
+            <enum name="GL_POLYGON_BIT"/>
+            <enum name="GL_POLYGON_STIPPLE_BIT"/>
+            <enum name="GL_PIXEL_MODE_BIT"/>
+            <enum name="GL_LIGHTING_BIT"/>
+            <enum name="GL_FOG_BIT"/>
+            <enum name="GL_ACCUM_BUFFER_BIT"/>
+            <enum name="GL_VIEWPORT_BIT"/>
+            <enum name="GL_TRANSFORM_BIT"/>
+            <enum name="GL_ENABLE_BIT"/>
+            <enum name="GL_HINT_BIT"/>
+            <enum name="GL_EVAL_BIT"/>
+            <enum name="GL_LIST_BIT"/>
+            <enum name="GL_TEXTURE_BIT"/>
+            <enum name="GL_SCISSOR_BIT"/>
+            <enum name="GL_ALL_ATTRIB_BITS"/>
+            <enum name="GL_QUAD_STRIP"/>
+            <enum name="GL_POLYGON"/>
+            <enum name="GL_ACCUM"/>
+            <enum name="GL_LOAD"/>
+            <enum name="GL_RETURN"/>
+            <enum name="GL_MULT"/>
+            <enum name="GL_ADD"/>
+            <enum name="GL_AUX0"/>
+            <enum name="GL_AUX1"/>
+            <enum name="GL_AUX2"/>
+            <enum name="GL_AUX3"/>
+            <enum name="GL_2D"/>
+            <enum name="GL_3D"/>
+            <enum name="GL_3D_COLOR"/>
+            <enum name="GL_3D_COLOR_TEXTURE"/>
+            <enum name="GL_4D_COLOR_TEXTURE"/>
+            <enum name="GL_PASS_THROUGH_TOKEN"/>
+            <enum name="GL_POINT_TOKEN"/>
+            <enum name="GL_LINE_TOKEN"/>
+            <enum name="GL_POLYGON_TOKEN"/>
+            <enum name="GL_BITMAP_TOKEN"/>
+            <enum name="GL_DRAW_PIXEL_TOKEN"/>
+            <enum name="GL_COPY_PIXEL_TOKEN"/>
+            <enum name="GL_LINE_RESET_TOKEN"/>
+            <enum name="GL_EXP"/>
+            <enum name="GL_EXP2"/>
+            <enum name="GL_COEFF"/>
+            <enum name="GL_ORDER"/>
+            <enum name="GL_DOMAIN"/>
+            <enum name="GL_PIXEL_MAP_I_TO_I"/>
+            <enum name="GL_PIXEL_MAP_S_TO_S"/>
+            <enum name="GL_PIXEL_MAP_I_TO_R"/>
+            <enum name="GL_PIXEL_MAP_I_TO_G"/>
+            <enum name="GL_PIXEL_MAP_I_TO_B"/>
+            <enum name="GL_PIXEL_MAP_I_TO_A"/>
+            <enum name="GL_PIXEL_MAP_R_TO_R"/>
+            <enum name="GL_PIXEL_MAP_G_TO_G"/>
+            <enum name="GL_PIXEL_MAP_B_TO_B"/>
+            <enum name="GL_PIXEL_MAP_A_TO_A"/>
+            <enum name="GL_CURRENT_COLOR"/>
+            <enum name="GL_CURRENT_INDEX"/>
+            <enum name="GL_CURRENT_NORMAL"/>
+            <enum name="GL_CURRENT_TEXTURE_COORDS"/>
+            <enum name="GL_CURRENT_RASTER_COLOR"/>
+            <enum name="GL_CURRENT_RASTER_INDEX"/>
+            <enum name="GL_CURRENT_RASTER_TEXTURE_COORDS"/>
+            <enum name="GL_CURRENT_RASTER_POSITION"/>
+            <enum name="GL_CURRENT_RASTER_POSITION_VALID"/>
+            <enum name="GL_CURRENT_RASTER_DISTANCE"/>
+            <enum name="GL_POINT_SMOOTH"/>
+            <enum name="GL_LINE_STIPPLE"/>
+            <enum name="GL_LINE_STIPPLE_PATTERN"/>
+            <enum name="GL_LINE_STIPPLE_REPEAT"/>
+            <enum name="GL_LIST_MODE"/>
+            <enum name="GL_MAX_LIST_NESTING"/>
+            <enum name="GL_LIST_BASE"/>
+            <enum name="GL_LIST_INDEX"/>
+            <enum name="GL_POLYGON_STIPPLE"/>
+            <enum name="GL_EDGE_FLAG"/>
+            <enum name="GL_LIGHTING"/>
+            <enum name="GL_LIGHT_MODEL_LOCAL_VIEWER"/>
+            <enum name="GL_LIGHT_MODEL_TWO_SIDE"/>
+            <enum name="GL_LIGHT_MODEL_AMBIENT"/>
+            <enum name="GL_SHADE_MODEL"/>
+            <enum name="GL_COLOR_MATERIAL_FACE"/>
+            <enum name="GL_COLOR_MATERIAL_PARAMETER"/>
+            <enum name="GL_COLOR_MATERIAL"/>
+            <enum name="GL_FOG"/>
+            <enum name="GL_FOG_INDEX"/>
+            <enum name="GL_FOG_DENSITY"/>
+            <enum name="GL_FOG_START"/>
+            <enum name="GL_FOG_END"/>
+            <enum name="GL_FOG_MODE"/>
+            <enum name="GL_FOG_COLOR"/>
+            <enum name="GL_ACCUM_CLEAR_VALUE"/>
+            <enum name="GL_MATRIX_MODE"/>
+            <enum name="GL_NORMALIZE"/>
+            <enum name="GL_MODELVIEW_STACK_DEPTH"/>
+            <enum name="GL_PROJECTION_STACK_DEPTH"/>
+            <enum name="GL_TEXTURE_STACK_DEPTH"/>
+            <enum name="GL_MODELVIEW_MATRIX"/>
+            <enum name="GL_PROJECTION_MATRIX"/>
+            <enum name="GL_TEXTURE_MATRIX"/>
+            <enum name="GL_ATTRIB_STACK_DEPTH"/>
+            <enum name="GL_ALPHA_TEST"/>
+            <enum name="GL_ALPHA_TEST_FUNC"/>
+            <enum name="GL_ALPHA_TEST_REF"/>
+            <enum name="GL_LOGIC_OP"/>
+            <enum name="GL_AUX_BUFFERS"/>
+            <enum name="GL_INDEX_CLEAR_VALUE"/>
+            <enum name="GL_INDEX_WRITEMASK"/>
+            <enum name="GL_INDEX_MODE"/>
+            <enum name="GL_RGBA_MODE"/>
+            <enum name="GL_RENDER_MODE"/>
+            <enum name="GL_PERSPECTIVE_CORRECTION_HINT"/>
+            <enum name="GL_POINT_SMOOTH_HINT"/>
+            <enum name="GL_FOG_HINT"/>
+            <enum name="GL_TEXTURE_GEN_S"/>
+            <enum name="GL_TEXTURE_GEN_T"/>
+            <enum name="GL_TEXTURE_GEN_R"/>
+            <enum name="GL_TEXTURE_GEN_Q"/>
+            <enum name="GL_PIXEL_MAP_I_TO_I_SIZE"/>
+            <enum name="GL_PIXEL_MAP_S_TO_S_SIZE"/>
+            <enum name="GL_PIXEL_MAP_I_TO_R_SIZE"/>
+            <enum name="GL_PIXEL_MAP_I_TO_G_SIZE"/>
+            <enum name="GL_PIXEL_MAP_I_TO_B_SIZE"/>
+            <enum name="GL_PIXEL_MAP_I_TO_A_SIZE"/>
+            <enum name="GL_PIXEL_MAP_R_TO_R_SIZE"/>
+            <enum name="GL_PIXEL_MAP_G_TO_G_SIZE"/>
+            <enum name="GL_PIXEL_MAP_B_TO_B_SIZE"/>
+            <enum name="GL_PIXEL_MAP_A_TO_A_SIZE"/>
+            <enum name="GL_MAP_COLOR"/>
+            <enum name="GL_MAP_STENCIL"/>
+            <enum name="GL_INDEX_SHIFT"/>
+            <enum name="GL_INDEX_OFFSET"/>
+            <enum name="GL_RED_SCALE"/>
+            <enum name="GL_RED_BIAS"/>
+            <enum name="GL_ZOOM_X"/>
+            <enum name="GL_ZOOM_Y"/>
+            <enum name="GL_GREEN_SCALE"/>
+            <enum name="GL_GREEN_BIAS"/>
+            <enum name="GL_BLUE_SCALE"/>
+            <enum name="GL_BLUE_BIAS"/>
+            <enum name="GL_ALPHA_SCALE"/>
+            <enum name="GL_ALPHA_BIAS"/>
+            <enum name="GL_DEPTH_SCALE"/>
+            <enum name="GL_DEPTH_BIAS"/>
+            <enum name="GL_MAX_EVAL_ORDER"/>
+            <enum name="GL_MAX_LIGHTS"/>
+            <enum name="GL_MAX_CLIP_PLANES"/>
+            <enum name="GL_MAX_PIXEL_MAP_TABLE"/>
+            <enum name="GL_MAX_ATTRIB_STACK_DEPTH"/>
+            <enum name="GL_MAX_MODELVIEW_STACK_DEPTH"/>
+            <enum name="GL_MAX_NAME_STACK_DEPTH"/>
+            <enum name="GL_MAX_PROJECTION_STACK_DEPTH"/>
+            <enum name="GL_MAX_TEXTURE_STACK_DEPTH"/>
+            <enum name="GL_INDEX_BITS"/>
+            <enum name="GL_RED_BITS"/>
+            <enum name="GL_GREEN_BITS"/>
+            <enum name="GL_BLUE_BITS"/>
+            <enum name="GL_ALPHA_BITS"/>
+            <enum name="GL_DEPTH_BITS"/>
+            <enum name="GL_STENCIL_BITS"/>
+            <enum name="GL_ACCUM_RED_BITS"/>
+            <enum name="GL_ACCUM_GREEN_BITS"/>
+            <enum name="GL_ACCUM_BLUE_BITS"/>
+            <enum name="GL_ACCUM_ALPHA_BITS"/>
+            <enum name="GL_NAME_STACK_DEPTH"/>
+            <enum name="GL_AUTO_NORMAL"/>
+            <enum name="GL_MAP1_COLOR_4"/>
+            <enum name="GL_MAP1_INDEX"/>
+            <enum name="GL_MAP1_NORMAL"/>
+            <enum name="GL_MAP1_TEXTURE_COORD_1"/>
+            <enum name="GL_MAP1_TEXTURE_COORD_2"/>
+            <enum name="GL_MAP1_TEXTURE_COORD_3"/>
+            <enum name="GL_MAP1_TEXTURE_COORD_4"/>
+            <enum name="GL_MAP1_VERTEX_3"/>
+            <enum name="GL_MAP1_VERTEX_4"/>
+            <enum name="GL_MAP2_COLOR_4"/>
+            <enum name="GL_MAP2_INDEX"/>
+            <enum name="GL_MAP2_NORMAL"/>
+            <enum name="GL_MAP2_TEXTURE_COORD_1"/>
+            <enum name="GL_MAP2_TEXTURE_COORD_2"/>
+            <enum name="GL_MAP2_TEXTURE_COORD_3"/>
+            <enum name="GL_MAP2_TEXTURE_COORD_4"/>
+            <enum name="GL_MAP2_VERTEX_3"/>
+            <enum name="GL_MAP2_VERTEX_4"/>
+            <enum name="GL_MAP1_GRID_DOMAIN"/>
+            <enum name="GL_MAP1_GRID_SEGMENTS"/>
+            <enum name="GL_MAP2_GRID_DOMAIN"/>
+            <enum name="GL_MAP2_GRID_SEGMENTS"/>
+            <enum name="GL_TEXTURE_COMPONENTS"/>
+            <enum name="GL_TEXTURE_BORDER"/>
+            <enum name="GL_AMBIENT"/>
+            <enum name="GL_DIFFUSE"/>
+            <enum name="GL_SPECULAR"/>
+            <enum name="GL_POSITION"/>
+            <enum name="GL_SPOT_DIRECTION"/>
+            <enum name="GL_SPOT_EXPONENT"/>
+            <enum name="GL_SPOT_CUTOFF"/>
+            <enum name="GL_CONSTANT_ATTENUATION"/>
+            <enum name="GL_LINEAR_ATTENUATION"/>
+            <enum name="GL_QUADRATIC_ATTENUATION"/>
+            <enum name="GL_COMPILE"/>
+            <enum name="GL_COMPILE_AND_EXECUTE"/>
+            <enum name="GL_2_BYTES"/>
+            <enum name="GL_3_BYTES"/>
+            <enum name="GL_4_BYTES"/>
+            <enum name="GL_EMISSION"/>
+            <enum name="GL_SHININESS"/>
+            <enum name="GL_AMBIENT_AND_DIFFUSE"/>
+            <enum name="GL_COLOR_INDEXES"/>
+            <enum name="GL_MODELVIEW"/>
+            <enum name="GL_PROJECTION"/>
+            <enum name="GL_COLOR_INDEX"/>
+            <enum name="GL_LUMINANCE"/>
+            <enum name="GL_LUMINANCE_ALPHA"/>
+            <enum name="GL_BITMAP"/>
+            <enum name="GL_RENDER"/>
+            <enum name="GL_FEEDBACK"/>
+            <enum name="GL_SELECT"/>
+            <enum name="GL_FLAT"/>
+            <enum name="GL_SMOOTH"/>
+            <enum name="GL_S"/>
+            <enum name="GL_T"/>
+            <enum name="GL_R"/>
+            <enum name="GL_Q"/>
+            <enum name="GL_MODULATE"/>
+            <enum name="GL_DECAL"/>
+            <enum name="GL_TEXTURE_ENV_MODE"/>
+            <enum name="GL_TEXTURE_ENV_COLOR"/>
+            <enum name="GL_TEXTURE_ENV"/>
+            <enum name="GL_EYE_LINEAR"/>
+            <enum name="GL_OBJECT_LINEAR"/>
+            <enum name="GL_SPHERE_MAP"/>
+            <enum name="GL_TEXTURE_GEN_MODE"/>
+            <enum name="GL_OBJECT_PLANE"/>
+            <enum name="GL_EYE_PLANE"/>
+            <enum name="GL_CLAMP"/>
+            <enum name="GL_CLIP_PLANE0"/>
+            <enum name="GL_CLIP_PLANE1"/>
+            <enum name="GL_CLIP_PLANE2"/>
+            <enum name="GL_CLIP_PLANE3"/>
+            <enum name="GL_CLIP_PLANE4"/>
+            <enum name="GL_CLIP_PLANE5"/>
+            <enum name="GL_LIGHT0"/>
+            <enum name="GL_LIGHT1"/>
+            <enum name="GL_LIGHT2"/>
+            <enum name="GL_LIGHT3"/>
+            <enum name="GL_LIGHT4"/>
+            <enum name="GL_LIGHT5"/>
+            <enum name="GL_LIGHT6"/>
+            <enum name="GL_LIGHT7"/>
             <command name="glCullFace"/>
             <command name="glFrontFace"/>
             <command name="glHint"/>
@@ -30321,116 +31486,7 @@
         <require>
             <type name="GLclampf" comment="No longer used in GL 1.1, but still defined in Mesa gl.h"/>
             <type name="GLclampd" comment="No longer used in GL 1.1, but still defined in Mesa gl.h"/>
-            <!-- Many of these are really VERSION_1_0 enums -->
-            <enum name="GL_DEPTH_BUFFER_BIT"/>
-            <enum name="GL_STENCIL_BUFFER_BIT"/>
-            <enum name="GL_COLOR_BUFFER_BIT"/>
-            <enum name="GL_FALSE"/>
-            <enum name="GL_TRUE"/>
-            <enum name="GL_POINTS"/>
-            <enum name="GL_LINES"/>
-            <enum name="GL_LINE_LOOP"/>
-            <enum name="GL_LINE_STRIP"/>
-            <enum name="GL_TRIANGLES"/>
-            <enum name="GL_TRIANGLE_STRIP"/>
-            <enum name="GL_TRIANGLE_FAN"/>
-            <enum name="GL_QUADS"/>
-            <enum name="GL_NEVER"/>
-            <enum name="GL_LESS"/>
-            <enum name="GL_EQUAL"/>
-            <enum name="GL_LEQUAL"/>
-            <enum name="GL_GREATER"/>
-            <enum name="GL_NOTEQUAL"/>
-            <enum name="GL_GEQUAL"/>
-            <enum name="GL_ALWAYS"/>
-            <enum name="GL_ZERO"/>
-            <enum name="GL_ONE"/>
-            <enum name="GL_SRC_COLOR"/>
-            <enum name="GL_ONE_MINUS_SRC_COLOR"/>
-            <enum name="GL_SRC_ALPHA"/>
-            <enum name="GL_ONE_MINUS_SRC_ALPHA"/>
-            <enum name="GL_DST_ALPHA"/>
-            <enum name="GL_ONE_MINUS_DST_ALPHA"/>
-            <enum name="GL_DST_COLOR"/>
-            <enum name="GL_ONE_MINUS_DST_COLOR"/>
-            <enum name="GL_SRC_ALPHA_SATURATE"/>
-            <enum name="GL_NONE"/>
-            <enum name="GL_FRONT_LEFT"/>
-            <enum name="GL_FRONT_RIGHT"/>
-            <enum name="GL_BACK_LEFT"/>
-            <enum name="GL_BACK_RIGHT"/>
-            <enum name="GL_FRONT"/>
-            <enum name="GL_BACK"/>
-            <enum name="GL_LEFT"/>
-            <enum name="GL_RIGHT"/>
-            <enum name="GL_FRONT_AND_BACK"/>
-            <enum name="GL_NO_ERROR"/>
-            <enum name="GL_INVALID_ENUM"/>
-            <enum name="GL_INVALID_VALUE"/>
-            <enum name="GL_INVALID_OPERATION"/>
-            <enum name="GL_OUT_OF_MEMORY"/>
-            <enum name="GL_CW"/>
-            <enum name="GL_CCW"/>
-            <enum name="GL_POINT_SIZE"/>
-            <enum name="GL_POINT_SIZE_RANGE"/>
-            <enum name="GL_POINT_SIZE_GRANULARITY"/>
-            <enum name="GL_LINE_SMOOTH"/>
-            <enum name="GL_LINE_WIDTH"/>
-            <enum name="GL_LINE_WIDTH_RANGE"/>
-            <enum name="GL_LINE_WIDTH_GRANULARITY"/>
-            <enum name="GL_POLYGON_MODE"/>
-            <enum name="GL_POLYGON_SMOOTH"/>
-            <enum name="GL_CULL_FACE"/>
-            <enum name="GL_CULL_FACE_MODE"/>
-            <enum name="GL_FRONT_FACE"/>
-            <enum name="GL_DEPTH_RANGE"/>
-            <enum name="GL_DEPTH_TEST"/>
-            <enum name="GL_DEPTH_WRITEMASK"/>
-            <enum name="GL_DEPTH_CLEAR_VALUE"/>
-            <enum name="GL_DEPTH_FUNC"/>
-            <enum name="GL_STENCIL_TEST"/>
-            <enum name="GL_STENCIL_CLEAR_VALUE"/>
-            <enum name="GL_STENCIL_FUNC"/>
-            <enum name="GL_STENCIL_VALUE_MASK"/>
-            <enum name="GL_STENCIL_FAIL"/>
-            <enum name="GL_STENCIL_PASS_DEPTH_FAIL"/>
-            <enum name="GL_STENCIL_PASS_DEPTH_PASS"/>
-            <enum name="GL_STENCIL_REF"/>
-            <enum name="GL_STENCIL_WRITEMASK"/>
-            <enum name="GL_VIEWPORT"/>
-            <enum name="GL_DITHER"/>
-            <enum name="GL_BLEND_DST"/>
-            <enum name="GL_BLEND_SRC"/>
-            <enum name="GL_BLEND"/>
-            <enum name="GL_LOGIC_OP_MODE"/>
             <enum name="GL_COLOR_LOGIC_OP"/>
-            <enum name="GL_DRAW_BUFFER"/>
-            <enum name="GL_READ_BUFFER"/>
-            <enum name="GL_SCISSOR_BOX"/>
-            <enum name="GL_SCISSOR_TEST"/>
-            <enum name="GL_COLOR_CLEAR_VALUE"/>
-            <enum name="GL_COLOR_WRITEMASK"/>
-            <enum name="GL_DOUBLEBUFFER"/>
-            <enum name="GL_STEREO"/>
-            <enum name="GL_LINE_SMOOTH_HINT"/>
-            <enum name="GL_POLYGON_SMOOTH_HINT"/>
-            <enum name="GL_UNPACK_SWAP_BYTES"/>
-            <enum name="GL_UNPACK_LSB_FIRST"/>
-            <enum name="GL_UNPACK_ROW_LENGTH"/>
-            <enum name="GL_UNPACK_SKIP_ROWS"/>
-            <enum name="GL_UNPACK_SKIP_PIXELS"/>
-            <enum name="GL_UNPACK_ALIGNMENT"/>
-            <enum name="GL_PACK_SWAP_BYTES"/>
-            <enum name="GL_PACK_LSB_FIRST"/>
-            <enum name="GL_PACK_ROW_LENGTH"/>
-            <enum name="GL_PACK_SKIP_ROWS"/>
-            <enum name="GL_PACK_SKIP_PIXELS"/>
-            <enum name="GL_PACK_ALIGNMENT"/>
-            <enum name="GL_MAX_TEXTURE_SIZE"/>
-            <enum name="GL_MAX_VIEWPORT_DIMS"/>
-            <enum name="GL_SUBPIXEL_BITS"/>
-            <enum name="GL_TEXTURE_1D"/>
-            <enum name="GL_TEXTURE_2D"/>
             <enum name="GL_POLYGON_OFFSET_UNITS"/>
             <enum name="GL_POLYGON_OFFSET_POINT"/>
             <enum name="GL_POLYGON_OFFSET_LINE"/>
@@ -30438,79 +31494,14 @@
             <enum name="GL_POLYGON_OFFSET_FACTOR"/>
             <enum name="GL_TEXTURE_BINDING_1D"/>
             <enum name="GL_TEXTURE_BINDING_2D"/>
-            <enum name="GL_TEXTURE_WIDTH"/>
-            <enum name="GL_TEXTURE_HEIGHT"/>
             <enum name="GL_TEXTURE_INTERNAL_FORMAT"/>
-            <enum name="GL_TEXTURE_BORDER_COLOR"/>
             <enum name="GL_TEXTURE_RED_SIZE"/>
             <enum name="GL_TEXTURE_GREEN_SIZE"/>
             <enum name="GL_TEXTURE_BLUE_SIZE"/>
             <enum name="GL_TEXTURE_ALPHA_SIZE"/>
-            <enum name="GL_DONT_CARE"/>
-            <enum name="GL_FASTEST"/>
-            <enum name="GL_NICEST"/>
-            <enum name="GL_BYTE"/>
-            <enum name="GL_UNSIGNED_BYTE"/>
-            <enum name="GL_SHORT"/>
-            <enum name="GL_UNSIGNED_SHORT"/>
-            <enum name="GL_INT"/>
-            <enum name="GL_UNSIGNED_INT"/>
-            <enum name="GL_FLOAT"/>
             <enum name="GL_DOUBLE"/>
-            <enum name="GL_STACK_OVERFLOW"/>
-            <enum name="GL_STACK_UNDERFLOW"/>
-            <enum name="GL_CLEAR"/>
-            <enum name="GL_AND"/>
-            <enum name="GL_AND_REVERSE"/>
-            <enum name="GL_COPY"/>
-            <enum name="GL_AND_INVERTED"/>
-            <enum name="GL_NOOP"/>
-            <enum name="GL_XOR"/>
-            <enum name="GL_OR"/>
-            <enum name="GL_NOR"/>
-            <enum name="GL_EQUIV"/>
-            <enum name="GL_INVERT"/>
-            <enum name="GL_OR_REVERSE"/>
-            <enum name="GL_COPY_INVERTED"/>
-            <enum name="GL_OR_INVERTED"/>
-            <enum name="GL_NAND"/>
-            <enum name="GL_SET"/>
-            <enum name="GL_TEXTURE"/>
-            <enum name="GL_COLOR"/>
-            <enum name="GL_DEPTH"/>
-            <enum name="GL_STENCIL"/>
-            <enum name="GL_STENCIL_INDEX"/>
-            <enum name="GL_DEPTH_COMPONENT"/>
-            <enum name="GL_RED"/>
-            <enum name="GL_GREEN"/>
-            <enum name="GL_BLUE"/>
-            <enum name="GL_ALPHA"/>
-            <enum name="GL_RGB"/>
-            <enum name="GL_RGBA"/>
-            <enum name="GL_POINT"/>
-            <enum name="GL_LINE"/>
-            <enum name="GL_FILL"/>
-            <enum name="GL_KEEP"/>
-            <enum name="GL_REPLACE"/>
-            <enum name="GL_INCR"/>
-            <enum name="GL_DECR"/>
-            <enum name="GL_VENDOR"/>
-            <enum name="GL_RENDERER"/>
-            <enum name="GL_VERSION"/>
-            <enum name="GL_EXTENSIONS"/>
-            <enum name="GL_NEAREST"/>
-            <enum name="GL_LINEAR"/>
-            <enum name="GL_NEAREST_MIPMAP_NEAREST"/>
-            <enum name="GL_LINEAR_MIPMAP_NEAREST"/>
-            <enum name="GL_NEAREST_MIPMAP_LINEAR"/>
-            <enum name="GL_LINEAR_MIPMAP_LINEAR"/>
-            <enum name="GL_TEXTURE_MAG_FILTER"/>
-            <enum name="GL_TEXTURE_MIN_FILTER"/>
-            <enum name="GL_TEXTURE_WRAP_S"/>
-            <enum name="GL_TEXTURE_WRAP_T"/>
             <enum name="GL_PROXY_TEXTURE_1D"/>
             <enum name="GL_PROXY_TEXTURE_2D"/>
-            <enum name="GL_REPEAT"/>
             <enum name="GL_R3_G3_B2"/>
             <enum name="GL_RGB4"/>
             <enum name="GL_RGB5"/>
@@ -30525,66 +31516,9 @@
             <enum name="GL_RGB10_A2"/>
             <enum name="GL_RGBA12"/>
             <enum name="GL_RGBA16"/>
-            <enum name="GL_CURRENT_BIT"/>
-            <enum name="GL_POINT_BIT"/>
-            <enum name="GL_LINE_BIT"/>
-            <enum name="GL_POLYGON_BIT"/>
-            <enum name="GL_POLYGON_STIPPLE_BIT"/>
-            <enum name="GL_PIXEL_MODE_BIT"/>
-            <enum name="GL_LIGHTING_BIT"/>
-            <enum name="GL_FOG_BIT"/>
-            <enum name="GL_ACCUM_BUFFER_BIT"/>
-            <enum name="GL_VIEWPORT_BIT"/>
-            <enum name="GL_TRANSFORM_BIT"/>
-            <enum name="GL_ENABLE_BIT"/>
-            <enum name="GL_HINT_BIT"/>
-            <enum name="GL_EVAL_BIT"/>
-            <enum name="GL_LIST_BIT"/>
-            <enum name="GL_TEXTURE_BIT"/>
-            <enum name="GL_SCISSOR_BIT"/>
-            <enum name="GL_ALL_ATTRIB_BITS"/>
             <enum name="GL_CLIENT_PIXEL_STORE_BIT"/>
             <enum name="GL_CLIENT_VERTEX_ARRAY_BIT"/>
             <enum name="GL_CLIENT_ALL_ATTRIB_BITS"/>
-            <enum name="GL_QUAD_STRIP"/>
-            <enum name="GL_POLYGON"/>
-            <enum name="GL_ACCUM"/>
-            <enum name="GL_LOAD"/>
-            <enum name="GL_RETURN"/>
-            <enum name="GL_MULT"/>
-            <enum name="GL_ADD"/>
-            <enum name="GL_AUX0"/>
-            <enum name="GL_AUX1"/>
-            <enum name="GL_AUX2"/>
-            <enum name="GL_AUX3"/>
-            <enum name="GL_2D"/>
-            <enum name="GL_3D"/>
-            <enum name="GL_3D_COLOR"/>
-            <enum name="GL_3D_COLOR_TEXTURE"/>
-            <enum name="GL_4D_COLOR_TEXTURE"/>
-            <enum name="GL_PASS_THROUGH_TOKEN"/>
-            <enum name="GL_POINT_TOKEN"/>
-            <enum name="GL_LINE_TOKEN"/>
-            <enum name="GL_POLYGON_TOKEN"/>
-            <enum name="GL_BITMAP_TOKEN"/>
-            <enum name="GL_DRAW_PIXEL_TOKEN"/>
-            <enum name="GL_COPY_PIXEL_TOKEN"/>
-            <enum name="GL_LINE_RESET_TOKEN"/>
-            <enum name="GL_EXP"/>
-            <enum name="GL_EXP2"/>
-            <enum name="GL_COEFF"/>
-            <enum name="GL_ORDER"/>
-            <enum name="GL_DOMAIN"/>
-            <enum name="GL_PIXEL_MAP_I_TO_I"/>
-            <enum name="GL_PIXEL_MAP_S_TO_S"/>
-            <enum name="GL_PIXEL_MAP_I_TO_R"/>
-            <enum name="GL_PIXEL_MAP_I_TO_G"/>
-            <enum name="GL_PIXEL_MAP_I_TO_B"/>
-            <enum name="GL_PIXEL_MAP_I_TO_A"/>
-            <enum name="GL_PIXEL_MAP_R_TO_R"/>
-            <enum name="GL_PIXEL_MAP_G_TO_G"/>
-            <enum name="GL_PIXEL_MAP_B_TO_B"/>
-            <enum name="GL_PIXEL_MAP_A_TO_A"/>
             <enum name="GL_VERTEX_ARRAY_POINTER"/>
             <enum name="GL_NORMAL_ARRAY_POINTER"/>
             <enum name="GL_COLOR_ARRAY_POINTER"/>
@@ -30593,141 +31527,9 @@
             <enum name="GL_EDGE_FLAG_ARRAY_POINTER"/>
             <enum name="GL_FEEDBACK_BUFFER_POINTER"/>
             <enum name="GL_SELECTION_BUFFER_POINTER"/>
-            <enum name="GL_CURRENT_COLOR"/>
-            <enum name="GL_CURRENT_INDEX"/>
-            <enum name="GL_CURRENT_NORMAL"/>
-            <enum name="GL_CURRENT_TEXTURE_COORDS"/>
-            <enum name="GL_CURRENT_RASTER_COLOR"/>
-            <enum name="GL_CURRENT_RASTER_INDEX"/>
-            <enum name="GL_CURRENT_RASTER_TEXTURE_COORDS"/>
-            <enum name="GL_CURRENT_RASTER_POSITION"/>
-            <enum name="GL_CURRENT_RASTER_POSITION_VALID"/>
-            <enum name="GL_CURRENT_RASTER_DISTANCE"/>
-            <enum name="GL_POINT_SMOOTH"/>
-            <enum name="GL_LINE_STIPPLE"/>
-            <enum name="GL_LINE_STIPPLE_PATTERN"/>
-            <enum name="GL_LINE_STIPPLE_REPEAT"/>
-            <enum name="GL_LIST_MODE"/>
-            <enum name="GL_MAX_LIST_NESTING"/>
-            <enum name="GL_LIST_BASE"/>
-            <enum name="GL_LIST_INDEX"/>
-            <enum name="GL_POLYGON_STIPPLE"/>
-            <enum name="GL_EDGE_FLAG"/>
-            <enum name="GL_LIGHTING"/>
-            <enum name="GL_LIGHT_MODEL_LOCAL_VIEWER"/>
-            <enum name="GL_LIGHT_MODEL_TWO_SIDE"/>
-            <enum name="GL_LIGHT_MODEL_AMBIENT"/>
-            <enum name="GL_SHADE_MODEL"/>
-            <enum name="GL_COLOR_MATERIAL_FACE"/>
-            <enum name="GL_COLOR_MATERIAL_PARAMETER"/>
-            <enum name="GL_COLOR_MATERIAL"/>
-            <enum name="GL_FOG"/>
-            <enum name="GL_FOG_INDEX"/>
-            <enum name="GL_FOG_DENSITY"/>
-            <enum name="GL_FOG_START"/>
-            <enum name="GL_FOG_END"/>
-            <enum name="GL_FOG_MODE"/>
-            <enum name="GL_FOG_COLOR"/>
-            <enum name="GL_ACCUM_CLEAR_VALUE"/>
-            <enum name="GL_MATRIX_MODE"/>
-            <enum name="GL_NORMALIZE"/>
-            <enum name="GL_MODELVIEW_STACK_DEPTH"/>
-            <enum name="GL_PROJECTION_STACK_DEPTH"/>
-            <enum name="GL_TEXTURE_STACK_DEPTH"/>
-            <enum name="GL_MODELVIEW_MATRIX"/>
-            <enum name="GL_PROJECTION_MATRIX"/>
-            <enum name="GL_TEXTURE_MATRIX"/>
-            <enum name="GL_ATTRIB_STACK_DEPTH"/>
             <enum name="GL_CLIENT_ATTRIB_STACK_DEPTH"/>
-            <enum name="GL_ALPHA_TEST"/>
-            <enum name="GL_ALPHA_TEST_FUNC"/>
-            <enum name="GL_ALPHA_TEST_REF"/>
             <enum name="GL_INDEX_LOGIC_OP"/>
-            <enum name="GL_LOGIC_OP"/>
-            <enum name="GL_AUX_BUFFERS"/>
-            <enum name="GL_INDEX_CLEAR_VALUE"/>
-            <enum name="GL_INDEX_WRITEMASK"/>
-            <enum name="GL_INDEX_MODE"/>
-            <enum name="GL_RGBA_MODE"/>
-            <enum name="GL_RENDER_MODE"/>
-            <enum name="GL_PERSPECTIVE_CORRECTION_HINT"/>
-            <enum name="GL_POINT_SMOOTH_HINT"/>
-            <enum name="GL_FOG_HINT"/>
-            <enum name="GL_TEXTURE_GEN_S"/>
-            <enum name="GL_TEXTURE_GEN_T"/>
-            <enum name="GL_TEXTURE_GEN_R"/>
-            <enum name="GL_TEXTURE_GEN_Q"/>
-            <enum name="GL_PIXEL_MAP_I_TO_I_SIZE"/>
-            <enum name="GL_PIXEL_MAP_S_TO_S_SIZE"/>
-            <enum name="GL_PIXEL_MAP_I_TO_R_SIZE"/>
-            <enum name="GL_PIXEL_MAP_I_TO_G_SIZE"/>
-            <enum name="GL_PIXEL_MAP_I_TO_B_SIZE"/>
-            <enum name="GL_PIXEL_MAP_I_TO_A_SIZE"/>
-            <enum name="GL_PIXEL_MAP_R_TO_R_SIZE"/>
-            <enum name="GL_PIXEL_MAP_G_TO_G_SIZE"/>
-            <enum name="GL_PIXEL_MAP_B_TO_B_SIZE"/>
-            <enum name="GL_PIXEL_MAP_A_TO_A_SIZE"/>
-            <enum name="GL_MAP_COLOR"/>
-            <enum name="GL_MAP_STENCIL"/>
-            <enum name="GL_INDEX_SHIFT"/>
-            <enum name="GL_INDEX_OFFSET"/>
-            <enum name="GL_RED_SCALE"/>
-            <enum name="GL_RED_BIAS"/>
-            <enum name="GL_ZOOM_X"/>
-            <enum name="GL_ZOOM_Y"/>
-            <enum name="GL_GREEN_SCALE"/>
-            <enum name="GL_GREEN_BIAS"/>
-            <enum name="GL_BLUE_SCALE"/>
-            <enum name="GL_BLUE_BIAS"/>
-            <enum name="GL_ALPHA_SCALE"/>
-            <enum name="GL_ALPHA_BIAS"/>
-            <enum name="GL_DEPTH_SCALE"/>
-            <enum name="GL_DEPTH_BIAS"/>
-            <enum name="GL_MAX_EVAL_ORDER"/>
-            <enum name="GL_MAX_LIGHTS"/>
-            <enum name="GL_MAX_CLIP_PLANES"/>
-            <enum name="GL_MAX_PIXEL_MAP_TABLE"/>
-            <enum name="GL_MAX_ATTRIB_STACK_DEPTH"/>
-            <enum name="GL_MAX_MODELVIEW_STACK_DEPTH"/>
-            <enum name="GL_MAX_NAME_STACK_DEPTH"/>
-            <enum name="GL_MAX_PROJECTION_STACK_DEPTH"/>
-            <enum name="GL_MAX_TEXTURE_STACK_DEPTH"/>
             <enum name="GL_MAX_CLIENT_ATTRIB_STACK_DEPTH"/>
-            <enum name="GL_INDEX_BITS"/>
-            <enum name="GL_RED_BITS"/>
-            <enum name="GL_GREEN_BITS"/>
-            <enum name="GL_BLUE_BITS"/>
-            <enum name="GL_ALPHA_BITS"/>
-            <enum name="GL_DEPTH_BITS"/>
-            <enum name="GL_STENCIL_BITS"/>
-            <enum name="GL_ACCUM_RED_BITS"/>
-            <enum name="GL_ACCUM_GREEN_BITS"/>
-            <enum name="GL_ACCUM_BLUE_BITS"/>
-            <enum name="GL_ACCUM_ALPHA_BITS"/>
-            <enum name="GL_NAME_STACK_DEPTH"/>
-            <enum name="GL_AUTO_NORMAL"/>
-            <enum name="GL_MAP1_COLOR_4"/>
-            <enum name="GL_MAP1_INDEX"/>
-            <enum name="GL_MAP1_NORMAL"/>
-            <enum name="GL_MAP1_TEXTURE_COORD_1"/>
-            <enum name="GL_MAP1_TEXTURE_COORD_2"/>
-            <enum name="GL_MAP1_TEXTURE_COORD_3"/>
-            <enum name="GL_MAP1_TEXTURE_COORD_4"/>
-            <enum name="GL_MAP1_VERTEX_3"/>
-            <enum name="GL_MAP1_VERTEX_4"/>
-            <enum name="GL_MAP2_COLOR_4"/>
-            <enum name="GL_MAP2_INDEX"/>
-            <enum name="GL_MAP2_NORMAL"/>
-            <enum name="GL_MAP2_TEXTURE_COORD_1"/>
-            <enum name="GL_MAP2_TEXTURE_COORD_2"/>
-            <enum name="GL_MAP2_TEXTURE_COORD_3"/>
-            <enum name="GL_MAP2_TEXTURE_COORD_4"/>
-            <enum name="GL_MAP2_VERTEX_3"/>
-            <enum name="GL_MAP2_VERTEX_4"/>
-            <enum name="GL_MAP1_GRID_DOMAIN"/>
-            <enum name="GL_MAP1_GRID_SEGMENTS"/>
-            <enum name="GL_MAP2_GRID_DOMAIN"/>
-            <enum name="GL_MAP2_GRID_SEGMENTS"/>
             <enum name="GL_FEEDBACK_BUFFER_SIZE"/>
             <enum name="GL_FEEDBACK_BUFFER_TYPE"/>
             <enum name="GL_SELECTION_BUFFER_SIZE"/>
@@ -30751,58 +31553,10 @@
             <enum name="GL_TEXTURE_COORD_ARRAY_TYPE"/>
             <enum name="GL_TEXTURE_COORD_ARRAY_STRIDE"/>
             <enum name="GL_EDGE_FLAG_ARRAY_STRIDE"/>
-            <enum name="GL_TEXTURE_COMPONENTS"/>
-            <enum name="GL_TEXTURE_BORDER"/>
             <enum name="GL_TEXTURE_LUMINANCE_SIZE"/>
             <enum name="GL_TEXTURE_INTENSITY_SIZE"/>
             <enum name="GL_TEXTURE_PRIORITY"/>
             <enum name="GL_TEXTURE_RESIDENT"/>
-            <enum name="GL_AMBIENT"/>
-            <enum name="GL_DIFFUSE"/>
-            <enum name="GL_SPECULAR"/>
-            <enum name="GL_POSITION"/>
-            <enum name="GL_SPOT_DIRECTION"/>
-            <enum name="GL_SPOT_EXPONENT"/>
-            <enum name="GL_SPOT_CUTOFF"/>
-            <enum name="GL_CONSTANT_ATTENUATION"/>
-            <enum name="GL_LINEAR_ATTENUATION"/>
-            <enum name="GL_QUADRATIC_ATTENUATION"/>
-            <enum name="GL_COMPILE"/>
-            <enum name="GL_COMPILE_AND_EXECUTE"/>
-            <enum name="GL_2_BYTES"/>
-            <enum name="GL_3_BYTES"/>
-            <enum name="GL_4_BYTES"/>
-            <enum name="GL_EMISSION"/>
-            <enum name="GL_SHININESS"/>
-            <enum name="GL_AMBIENT_AND_DIFFUSE"/>
-            <enum name="GL_COLOR_INDEXES"/>
-            <enum name="GL_MODELVIEW"/>
-            <enum name="GL_PROJECTION"/>
-            <enum name="GL_COLOR_INDEX"/>
-            <enum name="GL_LUMINANCE"/>
-            <enum name="GL_LUMINANCE_ALPHA"/>
-            <enum name="GL_BITMAP"/>
-            <enum name="GL_RENDER"/>
-            <enum name="GL_FEEDBACK"/>
-            <enum name="GL_SELECT"/>
-            <enum name="GL_FLAT"/>
-            <enum name="GL_SMOOTH"/>
-            <enum name="GL_S"/>
-            <enum name="GL_T"/>
-            <enum name="GL_R"/>
-            <enum name="GL_Q"/>
-            <enum name="GL_MODULATE"/>
-            <enum name="GL_DECAL"/>
-            <enum name="GL_TEXTURE_ENV_MODE"/>
-            <enum name="GL_TEXTURE_ENV_COLOR"/>
-            <enum name="GL_TEXTURE_ENV"/>
-            <enum name="GL_EYE_LINEAR"/>
-            <enum name="GL_OBJECT_LINEAR"/>
-            <enum name="GL_SPHERE_MAP"/>
-            <enum name="GL_TEXTURE_GEN_MODE"/>
-            <enum name="GL_OBJECT_PLANE"/>
-            <enum name="GL_EYE_PLANE"/>
-            <enum name="GL_CLAMP"/>
             <enum name="GL_ALPHA4"/>
             <enum name="GL_ALPHA8"/>
             <enum name="GL_ALPHA12"/>
@@ -30836,20 +31590,6 @@
             <enum name="GL_T2F_N3F_V3F"/>
             <enum name="GL_T2F_C4F_N3F_V3F"/>
             <enum name="GL_T4F_C4F_N3F_V4F"/>
-            <enum name="GL_CLIP_PLANE0"/>
-            <enum name="GL_CLIP_PLANE1"/>
-            <enum name="GL_CLIP_PLANE2"/>
-            <enum name="GL_CLIP_PLANE3"/>
-            <enum name="GL_CLIP_PLANE4"/>
-            <enum name="GL_CLIP_PLANE5"/>
-            <enum name="GL_LIGHT0"/>
-            <enum name="GL_LIGHT1"/>
-            <enum name="GL_LIGHT2"/>
-            <enum name="GL_LIGHT3"/>
-            <enum name="GL_LIGHT4"/>
-            <enum name="GL_LIGHT5"/>
-            <enum name="GL_LIGHT6"/>
-            <enum name="GL_LIGHT7"/>
             <command name="glDrawArrays"/>
             <command name="glDrawElements"/>
             <command name="glGetPointerv"/>
@@ -33126,6 +33866,7 @@
             <command name="glGenProgramPipelines"/>
             <command name="glIsProgramPipeline"/>
             <command name="glGetProgramPipelineiv"/>
+            <command name="glProgramParameteri"/>
             <command name="glProgramUniform1i"/>
             <command name="glProgramUniform1iv"/>
             <command name="glProgramUniform1f"/>
@@ -36507,7 +37248,37 @@
                 <command name="glBlendEquationSeparateIndexedAMD"/>
             </require>
         </extension>
+        <extension name="GL_AMD_framebuffer_sample_positions" supported="gl">
+            <require>
+                <enum name="GL_SUBSAMPLE_DISTANCE_AMD"/>
+                <enum name="GL_PIXELS_PER_SAMPLE_PATTERN_X_AMD"/>
+                <enum name="GL_PIXELS_PER_SAMPLE_PATTERN_Y_AMD"/>
+                <enum name="GL_ALL_PIXELS_AMD"/>
+                <command name="glFramebufferSamplePositionsfvAMD"/>
+                <command name="glNamedFramebufferSamplePositionsfvAMD"/>
+                <command name="glGetFramebufferParameterfvAMD"/>
+                <command name="glGetNamedFramebufferParameterfvAMD"/>
+            </require>
+        </extension>
         <extension name="GL_AMD_gcn_shader" supported="gl"/>
+        <extension name="GL_AMD_gpu_shader_half_float" supported="gl">
+            <require>
+                <enum name="GL_FLOAT16_NV"/>
+                <enum name="GL_FLOAT16_VEC2_NV"/>
+                <enum name="GL_FLOAT16_VEC3_NV"/>
+                <enum name="GL_FLOAT16_VEC4_NV"/>
+                <enum name="GL_FLOAT16_MAT2_AMD"/>
+                <enum name="GL_FLOAT16_MAT3_AMD"/>
+                <enum name="GL_FLOAT16_MAT4_AMD"/>
+                <enum name="GL_FLOAT16_MAT2x3_AMD"/>
+                <enum name="GL_FLOAT16_MAT2x4_AMD"/>
+                <enum name="GL_FLOAT16_MAT3x2_AMD"/>
+                <enum name="GL_FLOAT16_MAT3x4_AMD"/>
+                <enum name="GL_FLOAT16_MAT4x2_AMD"/>
+                <enum name="GL_FLOAT16_MAT4x3_AMD"/>
+            </require>
+        </extension>
+        <extension name="GL_AMD_gpu_shader_int16" supported="gl"/>
         <extension name="GL_AMD_gpu_shader_int64" supported="gl">
             <require>
                 <enum name="GL_INT64_NV"/>
@@ -36619,7 +37390,7 @@
                 <command name="glQueryObjectParameteruiAMD"/>
             </require>
         </extension>
-        <extension name="GL_AMD_performance_monitor" supported="gl|gles2">
+        <extension name="GL_AMD_performance_monitor" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_COUNTER_TYPE_AMD"/>
                 <enum name="GL_COUNTER_RANGE_AMD"/>
@@ -36670,6 +37441,7 @@
             </require>
         </extension>
         <extension name="GL_AMD_shader_atomic_counter_ops" supported="gl"/>
+        <extension name="GL_AMD_shader_ballot" supported="gl"/>
         <extension name="GL_AMD_shader_stencil_export" supported="gl"/>
         <extension name="GL_AMD_shader_trinary_minmax" supported="gl"/>
         <extension name="GL_AMD_shader_explicit_vertex_parameter" supported="gl"/>
@@ -36697,6 +37469,7 @@
                 <command name="glStencilOpValueAMD"/>
             </require>
         </extension>
+        <extension name="GL_AMD_texture_gather_bias_lod" supported="gl"/>
         <extension name="GL_AMD_texture_texture4" supported="gl"/>
         <extension name="GL_AMD_transform_feedback3_lines_triangles" supported="gl"/>
         <extension name="GL_AMD_transform_feedback4" supported="gl">
@@ -36921,7 +37694,7 @@
                 <command name="glGetObjectParameterivAPPLE"/>
             </require>
         </extension>
-        <extension name="GL_APPLE_rgb_422" supported="gl|gles2">
+        <extension name="GL_APPLE_rgb_422" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_RGB_422_APPLE"/>
                 <enum name="GL_UNSIGNED_SHORT_8_8_APPLE"/>
@@ -37090,7 +37863,7 @@
                 <command name="glMemoryBarrierByRegion"/>
             </require>
         </extension>
-        <extension name="GL_ARB_ES3_2_compatibility" supported="gl">
+        <extension name="GL_ARB_ES3_2_compatibility" supported="gl|glcore">
             <require>
                 <enum name="GL_PRIMITIVE_BOUNDING_BOX_ARB"/>
                 <enum name="GL_MULTISAMPLE_LINE_WIDTH_RANGE_ARB"/>
@@ -37516,7 +38289,7 @@
                 <command name="glDrawElementsIndirect"/>
             </require>
         </extension>
-        <extension name="GL_ARB_draw_instanced" supported="gl">
+        <extension name="GL_ARB_draw_instanced" supported="gl|glcore">
             <require>
                 <command name="glDrawArraysInstancedARB"/>
                 <command name="glDrawElementsInstancedARB"/>
@@ -37649,7 +38422,7 @@
                 <enum name="GL_FRAGMENT_SHADER_DERIVATIVE_HINT_ARB"/>
             </require>
         </extension>
-        <extension name="GL_ARB_fragment_shader_interlock" supported="gl"/>
+        <extension name="GL_ARB_fragment_shader_interlock" supported="gl|glcore"/>
         <extension name="GL_ARB_framebuffer_no_attachments" supported="gl|glcore">
             <require>
                 <enum name="GL_FRAMEBUFFER_DEFAULT_WIDTH"/>
@@ -37769,7 +38542,7 @@
                 <enum name="GL_FRAMEBUFFER_SRGB"/>
             </require>
         </extension>
-        <extension name="GL_ARB_geometry_shader4" supported="gl">
+        <extension name="GL_ARB_geometry_shader4" supported="gl|glcore">
             <require>
                 <enum name="GL_LINES_ADJACENCY_ARB"/>
                 <enum name="GL_LINE_STRIP_ADJACENCY_ARB"/>
@@ -37859,7 +38632,7 @@
                 <command name="glGetUniformdv"/>
             </require>
         </extension>
-        <extension name="GL_ARB_gpu_shader_int64" supported="gl">
+        <extension name="GL_ARB_gpu_shader_int64" supported="gl|glcore">
             <require>
                 <enum name="GL_INT64_ARB"/>
                 <enum name="GL_UNSIGNED_INT64_ARB"/>
@@ -38042,7 +38815,7 @@
                 <command name="glMultiDrawElementsIndirectCountARB"/>
             </require>
         </extension>
-        <extension name="GL_ARB_instanced_arrays" supported="gl">
+        <extension name="GL_ARB_instanced_arrays" supported="gl|glcore">
             <require>
                 <enum name="GL_VERTEX_ATTRIB_ARRAY_DIVISOR_ARB"/>
                 <command name="glVertexAttribDivisorARB"/>
@@ -38345,7 +39118,7 @@
                 <enum name="GL_ANY_SAMPLES_PASSED"/>
             </require>
         </extension>
-        <extension name="GL_ARB_parallel_shader_compile" supported="gl">
+        <extension name="GL_ARB_parallel_shader_compile" supported="gl|glcore">
             <require>
                 <enum name="GL_MAX_SHADER_COMPILER_THREADS_ARB"/>
                 <enum name="GL_COMPLETION_STATUS_ARB"/>
@@ -38367,7 +39140,7 @@
                 <enum name="GL_CLIPPING_OUTPUT_PRIMITIVES_ARB"/>
             </require>
         </extension>
-        <extension name="GL_ARB_pixel_buffer_object" supported="gl">
+        <extension name="GL_ARB_pixel_buffer_object" supported="gl|glcore">
             <require>
                 <enum name="GL_PIXEL_PACK_BUFFER_ARB"/>
                 <enum name="GL_PIXEL_UNPACK_BUFFER_ARB"/>
@@ -38391,7 +39164,7 @@
                 <enum name="GL_COORD_REPLACE_ARB"/>
             </require>
         </extension>
-        <extension name="GL_ARB_post_depth_coverage" supported="gl"/>
+        <extension name="GL_ARB_post_depth_coverage" supported="gl|glcore"/>
         <extension name="GL_ARB_program_interface_query" supported="gl|glcore">
             <require>
                 <enum name="GL_UNIFORM"/>
@@ -38505,7 +39278,7 @@
             </require>
         </extension>
         <extension name="GL_ARB_robustness_isolation" supported="gl|glcore"/>
-        <extension name="GL_ARB_sample_locations" supported="gl">
+        <extension name="GL_ARB_sample_locations" supported="gl|glcore">
             <require>
                 <enum name="GL_SAMPLE_LOCATION_SUBPIXEL_BITS_ARB"/>
                 <enum name="GL_SAMPLE_LOCATION_PIXEL_GRID_WIDTH_ARB"/>
@@ -38575,6 +39348,7 @@
                 <command name="glGenProgramPipelines"/>
                 <command name="glIsProgramPipeline"/>
                 <command name="glGetProgramPipelineiv"/>
+                <command name="glProgramParameteri"/>
                 <command name="glProgramUniform1i"/>
                 <command name="glProgramUniform1iv"/>
                 <command name="glProgramUniform1f"/>
@@ -38629,7 +39403,7 @@
                 <command name="glGetProgramPipelineInfoLog"/>
             </require>
         </extension>
-        <extension name="GL_ARB_shader_atomic_counter_ops" supported="gl"/>
+        <extension name="GL_ARB_shader_atomic_counter_ops" supported="gl|glcore"/>
         <extension name="GL_ARB_shader_atomic_counters" supported="gl|glcore">
             <require>
                 <enum name="GL_ATOMIC_COUNTER_BUFFER"/>
@@ -38666,7 +39440,7 @@
         </extension>
         <extension name="GL_ARB_shader_ballot" supported="gl|glcore"/>
         <extension name="GL_ARB_shader_bit_encoding" supported="gl|glcore"/>
-        <extension name="GL_ARB_shader_clock" supported="gl"/>
+        <extension name="GL_ARB_shader_clock" supported="gl|glcore"/>
         <extension name="GL_ARB_shader_draw_parameters" supported="gl|glcore"/>
         <extension name="GL_ARB_shader_group_vote" supported="gl|glcore"/>
         <extension name="GL_ARB_shader_image_load_store" supported="gl|glcore">
@@ -38866,7 +39640,7 @@
         </extension>
         <extension name="GL_ARB_shader_texture_image_samples" supported="gl|glcore"/>
         <extension name="GL_ARB_shader_texture_lod" supported="gl"/>
-        <extension name="GL_ARB_shader_viewport_layer_array" supported="gl"/>
+        <extension name="GL_ARB_shader_viewport_layer_array" supported="gl|glcore"/>
         <extension name="GL_ARB_shading_language_100" supported="gl">
             <require>
                 <enum name="GL_SHADING_LANGUAGE_VERSION_ARB"/>
@@ -38928,8 +39702,15 @@
                 <command name="glTexPageCommitmentARB"/>
             </require>
         </extension>
-        <extension name="GL_ARB_sparse_texture2" supported="gl|gles2"/>
-        <extension name="GL_ARB_sparse_texture_clamp" supported="gl"/>
+        <extension name="GL_ARB_sparse_texture2" supported="gl|glcore"/>
+        <extension name="GL_ARB_sparse_texture_clamp" supported="gl|glcore"/>
+        <extension name="GL_ARB_gl_spirv" supported="gl|glcore">
+            <require>
+                <enum name="GL_SHADER_BINARY_FORMAT_SPIR_V_ARB"/>
+                <enum name="GL_SPIR_V_BINARY_ARB"/>
+                <command name="glSpecializeShaderARB"/>
+            </require>
+        </extension>
         <extension name="GL_ARB_stencil_texturing" supported="gl|glcore">
             <require>
                 <enum name="GL_DEPTH_STENCIL_TEXTURE_MODE"/>
@@ -39009,12 +39790,12 @@
                 <command name="glTextureBarrier"/>
             </require>
         </extension>
-        <extension name="GL_ARB_texture_border_clamp" supported="gl">
+        <extension name="GL_ARB_texture_border_clamp" supported="gl|glcore">
             <require>
                 <enum name="GL_CLAMP_TO_BORDER_ARB"/>
             </require>
         </extension>
-        <extension name="GL_ARB_texture_buffer_object" supported="gl">
+        <extension name="GL_ARB_texture_buffer_object" supported="gl|glcore">
             <require>
                 <enum name="GL_TEXTURE_BUFFER_ARB"/>
                 <enum name="GL_MAX_TEXTURE_BUFFER_SIZE_ARB"/>
@@ -39138,7 +39919,7 @@
                 <enum name="GL_DOT3_RGBA_ARB"/>
             </require>
         </extension>
-        <extension name="GL_ARB_texture_filter_minmax" supported="gl">
+        <extension name="GL_ARB_texture_filter_minmax" supported="gl|glcore">
             <require>
                 <enum name="GL_TEXTURE_REDUCTION_MODE_ARB"/>
                 <enum name="GL_WEIGHTED_AVERAGE_ARB"/>
@@ -39180,7 +39961,7 @@
                 <enum name="GL_MIRROR_CLAMP_TO_EDGE"/>
             </require>
         </extension>
-        <extension name="GL_ARB_texture_mirrored_repeat" supported="gl">
+        <extension name="GL_ARB_texture_mirrored_repeat" supported="gl|glcore">
             <require>
                 <enum name="GL_MIRRORED_REPEAT_ARB"/>
             </require>
@@ -39214,7 +39995,7 @@
                 <command name="glSampleMaski"/>
             </require>
         </extension>
-        <extension name="GL_ARB_texture_non_power_of_two" supported="gl"/>
+        <extension name="GL_ARB_texture_non_power_of_two" supported="gl|glcore"/>
         <extension name="GL_ARB_texture_query_levels" supported="gl|glcore"/>
         <extension name="GL_ARB_texture_query_lod" supported="gl|glcore"/>
         <extension name="GL_ARB_texture_rectangle" supported="gl">
@@ -40255,6 +41036,8 @@
                 <enum name="GL_422_REV_AVERAGE_EXT"/>
             </require>
         </extension>
+        <extension name="GL_EXT_EGL_image_array" supported="gles2">
+        </extension>
         <extension name="GL_EXT_YUV_target" supported="gles2">
             <require>
                 <enum name="GL_SAMPLER_EXTERNAL_2D_Y2Y_EXT"/>
@@ -40368,6 +41151,12 @@
                 <!-- <command name="glNamedBufferStorageEXT"/> -->
             </require>
         </extension>
+        <extension name="GL_EXT_clear_texture" supported="gles2">
+            <require>
+                <command name="glClearTexImageEXT"/>
+                <command name="glClearTexSubImageEXT"/>
+            </require>
+        </extension>
         <extension name="GL_EXT_clip_cull_distance" supported="gles2">
             <require>
                 <enum name="GL_MAX_CLIP_DISTANCES_EXT"/>
@@ -40421,6 +41210,7 @@
                 <command name="glUnlockArraysEXT"/>
             </require>
         </extension>
+        <extension name="GL_EXT_conservative_depth" supported="gles2"/>
         <extension name="GL_EXT_convolution" supported="gl">
             <require>
                 <enum name="GL_CONVOLUTION_1D_EXT"/>
@@ -40521,7 +41311,7 @@
                 <command name="glCullParameterfvEXT"/>
             </require>
         </extension>
-        <extension name="GL_EXT_debug_label" supported="gl|gles2">
+        <extension name="GL_EXT_debug_label" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_PROGRAM_PIPELINE_OBJECT_EXT"/>
                 <enum name="GL_PROGRAM_OBJECT_EXT"/>
@@ -40537,7 +41327,7 @@
                 <enum name="GL_TRANSFORM_FEEDBACK"/>
             </require>
         </extension>
-        <extension name="GL_EXT_debug_marker" supported="gl|gles2">
+        <extension name="GL_EXT_debug_marker" supported="gl|glcore|gles2">
             <require>
                 <command name="glInsertEventMarkerEXT"/>
                 <command name="glPushGroupMarkerEXT"/>
@@ -40551,7 +41341,7 @@
                 <command name="glDepthBoundsEXT"/>
             </require>
         </extension>
-        <extension name="GL_EXT_direct_state_access" supported="gl" comment="DSA extension doesn't identify which interfaces are core profile and keeps getting expanded. This is in sync with revision 34, 2010/09/07">
+        <extension name="GL_EXT_direct_state_access" supported="gl|glcore" comment="DSA extension doesn't identify which interfaces are core profile and keeps getting expanded. This is in sync with revision 34, 2010/09/07">
             <require>
                 <enum name="GL_PROGRAM_MATRIX_EXT"/>
                 <enum name="GL_TRANSPOSE_PROGRAM_MATRIX_EXT"/>
@@ -41032,7 +41822,7 @@
                 <command name="glMultiDrawElementsBaseVertexEXT" comment="Supported only if GL_EXT_multi_draw_arrays is supported"/>
             </require>
         </extension>
-        <extension name="GL_EXT_draw_instanced" supported="gl|gles2">
+        <extension name="GL_EXT_draw_instanced" supported="gl|glcore|gles2">
             <require>
                 <command name="glDrawArraysInstancedEXT"/>
                 <command name="glDrawElementsInstancedEXT"/>
@@ -41045,6 +41835,12 @@
                 <command name="glDrawRangeElementsEXT"/>
             </require>
         </extension>
+        <extension name="GL_EXT_draw_transform_feedback" supported="gles2">
+            <require>
+                <command name="glDrawTransformFeedbackEXT"/>
+                <command name="glDrawTransformFeedbackInstancedEXT"/>
+            </require>
+        </extension>
         <extension name="GL_EXT_float_blend" supported="gles2"/>
         <extension name="GL_EXT_fog_coord" supported="gl">
             <require>
@@ -41372,6 +42168,67 @@
                 <command name="glFlushMappedBufferRangeEXT"/>
             </require>
         </extension>
+        <extension name="GL_EXT_memory_object" supported="gl|gles2">
+            <require>
+                <enum name="GL_TEXTURE_TILING_EXT"/>
+                <enum name="GL_DEDICATED_MEMORY_OBJECT_EXT"/>
+                <enum name="GL_PROTECTED_MEMORY_OBJECT_EXT"/>
+                <enum name="GL_NUM_TILING_TYPES_EXT"/>
+                <enum name="GL_TILING_TYPES_EXT"/>
+                <enum name="GL_OPTIMAL_TILING_EXT"/>
+                <enum name="GL_LINEAR_TILING_EXT"/>
+                <enum name="GL_NUM_DEVICE_UUIDS_EXT"/>
+                <enum name="GL_DEVICE_UUID_EXT"/>
+                <enum name="GL_DRIVER_UUID_EXT"/>
+                <enum name="GL_UUID_SIZE_EXT"/>
+                <command name="glGetUnsignedBytevEXT"/>
+                <command name="glGetUnsignedBytei_vEXT"/>
+                <command name="glDeleteMemoryObjectsEXT"/>
+                <command name="glIsMemoryObjectEXT"/>
+                <command name="glCreateMemoryObjectsEXT"/>
+                <command name="glMemoryObjectParameterivEXT"/>
+                <command name="glGetMemoryObjectParameterivEXT"/>
+                <command name="glTexStorageMem2DEXT"/>
+                <command name="glTexStorageMem2DMultisampleEXT"/>
+                <command name="glTexStorageMem3DEXT"/>
+                <command name="glTexStorageMem3DMultisampleEXT"/>
+                <command name="glBufferStorageMemEXT"/>
+            </require>
+            <require comment="Supported only if GL_EXT_direct_state_access is supported">
+                <command name="glTextureStorageMem2DEXT"/>
+                <command name="glTextureStorageMem2DMultisampleEXT"/>
+                <command name="glTextureStorageMem3DEXT"/>
+                <command name="glTextureStorageMem3DMultisampleEXT"/>
+                <command name="glNamedBufferStorageMemEXT"/>
+            </require>
+            <require api="gl">
+                <command name="glTexStorageMem1DEXT"/>
+            </require>
+            <require api="gl" comment="Supported only if GL_EXT_direct_state_access is supported">
+                <command name="glTextureStorageMem1DEXT"/>
+            </require>
+        </extension>
+        <extension name="GL_EXT_memory_object_fd" supported="gl|gles2">
+            <require>
+                <enum name="GL_HANDLE_TYPE_OPAQUE_FD_EXT"/>
+                <command name="glImportMemoryFdEXT"/>
+            </require>
+        </extension>
+        <extension name="GL_EXT_memory_object_win32" supported="gl|gles2">
+            <require>
+                <enum name="GL_HANDLE_TYPE_OPAQUE_WIN32_EXT"/>
+                <enum name="GL_HANDLE_TYPE_OPAQUE_WIN32_KMT_EXT"/>
+                <enum name="GL_DEVICE_LUID_EXT"/>
+                <enum name="GL_DEVICE_NODE_MASK_EXT"/>
+                <enum name="GL_LUID_SIZE_EXT"/>
+                <enum name="GL_HANDLE_TYPE_D3D12_TILEPOOL_EXT"/>
+                <enum name="GL_HANDLE_TYPE_D3D12_RESOURCE_EXT"/>
+                <enum name="GL_HANDLE_TYPE_D3D11_IMAGE_EXT"/>
+                <enum name="GL_HANDLE_TYPE_D3D11_IMAGE_KMT_EXT"/>
+                <command name="glImportMemoryWin32HandleEXT"/>
+                <command name="glImportMemoryWin32NameEXT"/>
+            </require>
+        </extension>
         <extension name="GL_EXT_misc_attribute" supported="gl"/>
         <extension name="GL_EXT_multi_draw_arrays" supported="gl|gles1|gles2">
             <require>
@@ -41537,13 +42394,13 @@
                 <command name="glPolygonOffsetEXT"/>
             </require>
         </extension>
-        <extension name="GL_EXT_polygon_offset_clamp" supported="gl|gles2">
+        <extension name="GL_EXT_polygon_offset_clamp" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_POLYGON_OFFSET_CLAMP_EXT"/>
                 <command name="glPolygonOffsetClampEXT"/>
             </require>
         </extension>
-        <extension name="GL_EXT_post_depth_coverage" supported="gl|gles2"/>
+        <extension name="GL_EXT_post_depth_coverage" supported="gl|glcore|gles2"/>
         <extension name="GL_EXT_primitive_bounding_box" supported="gles2">
             <require>
                 <enum name="GL_PRIMITIVE_BOUNDING_BOX_EXT"/>
@@ -41575,7 +42432,7 @@
                 <enum name="GL_COMPRESSED_SRGB_ALPHA_PVRTC_4BPPV2_IMG"/>
             </require>
         </extension>
-        <extension name="GL_EXT_raster_multisample" supported="gl|gles2">
+        <extension name="GL_EXT_raster_multisample" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_RASTER_MULTISAMPLE_EXT"/>
                 <enum name="GL_RASTER_SAMPLES_EXT"/>
@@ -41626,6 +42483,49 @@
                 <command name="glGetnUniformivEXT"/>
             </require>
         </extension>
+        <extension name="GL_EXT_semaphore" supported="gl|gles2">
+            <require>
+                <enum name="GL_NUM_DEVICE_UUIDS_EXT"/>
+                <enum name="GL_DEVICE_UUID_EXT"/>
+                <enum name="GL_DRIVER_UUID_EXT"/>
+                <enum name="GL_UUID_SIZE_EXT"/>
+                <enum name="GL_LAYOUT_GENERAL_EXT"/>
+                <enum name="GL_LAYOUT_COLOR_ATTACHMENT_EXT"/>
+                <enum name="GL_LAYOUT_DEPTH_STENCIL_ATTACHMENT_EXT"/>
+                <enum name="GL_LAYOUT_DEPTH_STENCIL_READ_ONLY_EXT"/>
+                <enum name="GL_LAYOUT_SHADER_READ_ONLY_EXT"/>
+                <enum name="GL_LAYOUT_TRANSFER_SRC_EXT"/>
+                <enum name="GL_LAYOUT_TRANSFER_DST_EXT"/>
+                <command name="glGetUnsignedBytevEXT"/>
+                <command name="glGetUnsignedBytei_vEXT"/>
+                <command name="glGenSemaphoresEXT"/>
+                <command name="glDeleteSemaphoresEXT"/>
+                <command name="glIsSemaphoreEXT"/>
+                <command name="glSemaphoreParameterui64vEXT"/>
+                <command name="glGetSemaphoreParameterui64vEXT"/>
+                <command name="glWaitSemaphoreEXT"/>
+                <command name="glSignalSemaphoreEXT"/>
+            </require>
+        </extension>
+        <extension name="GL_EXT_semaphore_fd" supported="gl|gles2">
+            <require>
+                <enum name="GL_HANDLE_TYPE_OPAQUE_FD_EXT"/>
+                <command name="glImportSemaphoreFdEXT"/>
+            </require>
+        </extension>
+        <extension name="GL_EXT_semaphore_win32" supported="gl|gles2">
+            <require>
+                <enum name="GL_HANDLE_TYPE_OPAQUE_WIN32_EXT"/>
+                <enum name="GL_HANDLE_TYPE_OPAQUE_WIN32_KMT_EXT"/>
+                <enum name="GL_DEVICE_LUID_EXT"/>
+                <enum name="GL_DEVICE_NODE_MASK_EXT"/>
+                <enum name="GL_LUID_SIZE_EXT"/>
+                <enum name="GL_HANDLE_TYPE_D3D12_FENCE_EXT"/>
+                <enum name="GL_D3D12_FENCE_VALUE_EXT"/>
+                <command name="glImportSemaphoreWin32HandleEXT"/>
+                <command name="glImportSemaphoreWin32NameEXT"/>
+            </require>
+        </extension>
         <extension name="GL_EXT_sRGB" supported="gles1|gles2">
             <require>
                 <enum name="GL_SRGB_EXT"/>
@@ -41667,7 +42567,7 @@
                 <command name="glSecondaryColorPointerEXT"/>
             </require>
         </extension>
-        <extension name="GL_EXT_separate_shader_objects" supported="gl|gles2">
+        <extension name="GL_EXT_separate_shader_objects" supported="gl|glcore|gles2">
             <require api="gl" comment="The OpenGL version of this extension is completely unrelated to the OpenGL ES version">
                 <enum name="GL_ACTIVE_PROGRAM_EXT"/>
                 <command name="glUseShaderProgramEXT"/>
@@ -41806,7 +42706,7 @@
             </require>
         </extension>
         <extension name="GL_EXT_shader_implicit_conversions" supported="gles2"/>
-        <extension name="GL_EXT_shader_integer_mix" supported="gl|gles2"/>
+        <extension name="GL_EXT_shader_integer_mix" supported="gl|glcore|gles2"/>
         <extension name="GL_EXT_shader_io_blocks" supported="gles2"/>
         <extension name="GL_EXT_shader_non_constant_global_initializers" supported="gles2"/>
         <extension name="GL_EXT_shader_pixel_local_storage" supported="gles2">
@@ -41863,7 +42763,7 @@
                 <!-- <command name="glTexturePageCommitmentEXT"/> -->
             </require>
         </extension>
-        <extension name="GL_EXT_sparse_texture2" supported="gl"/>
+        <extension name="GL_EXT_sparse_texture2" supported="gl|gles2"/>
         <extension name="GL_EXT_stencil_clear_tag" supported="gl">
             <require>
                 <enum name="GL_STENCIL_TAG_BITS_EXT"/>
@@ -42063,6 +42963,11 @@
                 <command name="glTexBufferEXT"/>
             </require>
         </extension>
+        <extension name="GL_EXT_texture_compression_astc_decode_mode" supported="gles2">
+            <require>
+                <enum name="GL_TEXTURE_ASTC_DECODE_PRECISION_EXT"/>
+            </require>
+        </extension>
         <extension name="GL_EXT_texture_compression_dxt1" supported="gles1|gles2">
             <require>
                 <enum name="GL_COMPRESSED_RGB_S3TC_DXT1_EXT"/>
@@ -42085,7 +42990,7 @@
                 <enum name="GL_COMPRESSED_SIGNED_RED_GREEN_RGTC2_EXT"/>
             </require>
         </extension>
-        <extension name="GL_EXT_texture_compression_s3tc" supported="gl|gles2|glsc2">
+        <extension name="GL_EXT_texture_compression_s3tc" supported="gl|glcore|gles2|glsc2">
             <require>
                 <enum name="GL_COMPRESSED_RGB_S3TC_DXT1_EXT"/>
                 <enum name="GL_COMPRESSED_RGBA_S3TC_DXT1_EXT"/>
@@ -42160,7 +43065,7 @@
                 <enum name="GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT"/>
             </require>
         </extension>
-        <extension name="GL_EXT_texture_filter_minmax" supported="gl|gles2">
+        <extension name="GL_EXT_texture_filter_minmax" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_RASTER_MULTISAMPLE_EXT"/>
                 <enum name="GL_RASTER_SAMPLES_EXT"/>
@@ -42319,7 +43224,7 @@
                 <enum name="GL_SRG8_EXT"/>
             </require>
         </extension>
-        <extension name="GL_EXT_texture_sRGB_decode" supported="gl|gles2">
+        <extension name="GL_EXT_texture_sRGB_decode" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_TEXTURE_SRGB_DECODE_EXT"/>
                 <enum name="GL_DECODE_EXT"/>
@@ -42714,7 +43619,13 @@
                 <command name="glVertexWeightPointerEXT"/>
             </require>
         </extension>
-        <extension name="GL_EXT_window_rectangles" supported="gl|gles2">
+        <extension name="GL_EXT_win32_keyed_mutex" supported="gl|gles2">
+            <require>
+                <command name="glAcquireKeyedMutexWin32EXT"/>
+                <command name="glReleaseKeyedMutexWin32EXT"/>
+            </require>
+        </extension>
+        <extension name="GL_EXT_window_rectangles" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_INCLUSIVE_EXT"/>
                 <enum name="GL_EXCLUSIVE_EXT"/>
@@ -42848,6 +43759,16 @@
                 <command name="glVertexPointerListIBM"/>
             </require>
         </extension>
+        <extension name="GL_IMG_bindless_texture" supported="gles2">
+            <require>
+                <command name="glGetTextureHandleIMG"/>
+                <command name="glGetTextureSamplerHandleIMG"/>
+                <command name="glUniformHandleui64IMG"/>
+                <command name="glUniformHandleui64vIMG"/>
+                <command name="glProgramUniformHandleui64IMG"/>
+                <command name="glProgramUniformHandleui64vIMG"/>
+            </require>
+        </extension>
         <extension name="GL_IMG_framebuffer_downsample" supported="gles2">
             <require>
                 <enum name="GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_AND_DOWNSAMPLE_IMG"/>
@@ -42951,13 +43872,13 @@
                 <enum name="GL_INTERLACE_READ_INGR"/>
             </require>
         </extension>
-        <extension name="GL_INTEL_conservative_rasterization" supported="gl|gles2">
+        <extension name="GL_INTEL_conservative_rasterization" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_CONSERVATIVE_RASTERIZATION_INTEL"/>
             </require>
         </extension>
         <extension name="GL_INTEL_fragment_shader_ordering" supported="gl"/>
-        <extension name="GL_INTEL_framebuffer_CMAA" supported="gl|gles2">
+        <extension name="GL_INTEL_framebuffer_CMAA" supported="gl|glcore|gles2">
             <require>
                 <command name="glApplyFramebufferAttachmentCMAAINTEL"/>
             </require>
@@ -42986,7 +43907,7 @@
                 <command name="glTexCoordPointervINTEL"/>
             </require>
         </extension>
-        <extension name="GL_INTEL_performance_query" supported="gl|gles2">
+        <extension name="GL_INTEL_performance_query" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_PERFQUERY_SINGLE_CONTEXT_INTEL"/>
                 <enum name="GL_PERFQUERY_GLOBAL_CONTEXT_INTEL"/>
@@ -43020,7 +43941,7 @@
                 <command name="glGetPerfQueryInfoINTEL"/>
             </require>
         </extension>
-        <extension name="GL_KHR_blend_equation_advanced" supported="gl|gles2">
+        <extension name="GL_KHR_blend_equation_advanced" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_MULTIPLY_KHR"/>
                 <enum name="GL_SCREEN_KHR"/>
@@ -43040,7 +43961,7 @@
                 <command name="glBlendBarrierKHR"/>
             </require>
         </extension>
-        <extension name="GL_KHR_blend_equation_advanced_coherent" supported="gl|gles2">
+        <extension name="GL_KHR_blend_equation_advanced_coherent" supported="gl|glcore|gles2">
             <require comment="Otherwise identical to GL_KHR_blend_equation_advanced, just different semantic behavior">
                 <enum name="GL_BLEND_ADVANCED_COHERENT_KHR"/>
             </require>
@@ -43293,6 +44214,7 @@
                 <command name="glResizeBuffersMESA"/>
             </require>
         </extension>
+        <extension name="GL_MESA_shader_integer_functions" supported="gl|gles2"/>
         <extension name="GL_MESA_window_pos" supported="gl">
             <require>
                 <command name="glWindowPos2dMESA"/>
@@ -43328,6 +44250,7 @@
                 <enum name="GL_YCBCR_MESA"/>
             </require>
         </extension>
+        <extension name="GL_NVX_blend_equation_advanced_multi_draw_buffers" supported="gl|gles2"/>
         <extension name="GL_NVX_conditional_render" supported="gl">
             <require>
                 <command name="glBeginConditionalRenderNVX"/>
@@ -43343,19 +44266,37 @@
                 <enum name="GL_GPU_MEMORY_INFO_EVICTED_MEMORY_NVX"/>
             </require>
         </extension>
-        <extension name="GL_NV_bindless_multi_draw_indirect" supported="gl">
+        <extension name="GL_NVX_linked_gpu_multicast" supported="gl">
+            <require>
+                <enum name="GL_LGPU_SEPARATE_STORAGE_BIT_NVX"/>
+                <enum name="GL_MAX_LGPU_GPUS_NVX"/>
+                <command name="glLGPUNamedBufferSubDataNVX"/>
+                <command name="glLGPUCopyImageSubDataNVX"/>
+                <command name="glLGPUInterlockNVX"/>
+            </require>
+        </extension>
+        <extension name="GL_NV_alpha_to_coverage_dither_control" supported="gl">
+            <require>
+                <enum name="GL_ALPHA_TO_COVERAGE_DITHER_DEFAULT_NV"/>
+                <enum name="GL_ALPHA_TO_COVERAGE_DITHER_ENABLE_NV"/>
+                <enum name="GL_ALPHA_TO_COVERAGE_DITHER_DISABLE_NV"/>
+                <enum name="GL_ALPHA_TO_COVERAGE_DITHER_MODE_NV"/>
+                <command name="glAlphaToCoverageDitherControlNV"/>
+            </require>
+        </extension>
+        <extension name="GL_NV_bindless_multi_draw_indirect" supported="gl|glcore">
             <require>
                 <command name="glMultiDrawArraysIndirectBindlessNV"/>
                 <command name="glMultiDrawElementsIndirectBindlessNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_bindless_multi_draw_indirect_count" supported="gl">
+        <extension name="GL_NV_bindless_multi_draw_indirect_count" supported="gl|glcore">
             <require>
                 <command name="glMultiDrawArraysIndirectBindlessCountNV"/>
                 <command name="glMultiDrawElementsIndirectBindlessCountNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_bindless_texture" supported="gl|gles2">
+        <extension name="GL_NV_bindless_texture" supported="gl|glcore|gles2">
             <require>
                 <command name="glGetTextureHandleNV"/>
                 <command name="glGetTextureSamplerHandleNV"/>
@@ -43372,7 +44313,7 @@
                 <command name="glIsImageHandleResidentNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_blend_equation_advanced" supported="gl|gles2">
+        <extension name="GL_NV_blend_equation_advanced" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_BLEND_OVERLAP_NV"/>
                 <enum name="GL_BLEND_PREMULTIPLIED_SRC_NV"/>
@@ -43429,13 +44370,13 @@
                 <command name="glBlendBarrierNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_blend_equation_advanced_coherent" supported="gl|gles2">
+        <extension name="GL_NV_blend_equation_advanced_coherent" supported="gl|glcore|gles2">
             <require comment="Otherwise identical to GL_NV_blend_equation_advanced, just different semantic behavior">
                 <enum name="GL_BLEND_ADVANCED_COHERENT_NV"/>
             </require>
         </extension>
         <extension name="GL_NV_blend_square" supported="gl"/>
-        <extension name="GL_NV_clip_space_w_scaling" supported="gl">
+        <extension name="GL_NV_clip_space_w_scaling" supported="gl|glcore">
             <require>
                 <enum name="GL_VIEWPORT_POSITION_W_SCALE_NV"/>
                 <enum name="GL_VIEWPORT_POSITION_W_SCALE_X_COEFF_NV"/>
@@ -43443,7 +44384,7 @@
                 <command name="glViewportPositionWScaleNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_command_list" supported="gl">
+        <extension name="GL_NV_command_list" supported="gl|glcore">
             <require>
                 <enum name="GL_TERMINATE_SEQUENCE_COMMAND_NV"/>
                 <enum name="GL_NOP_COMMAND_NV"/>
@@ -43489,7 +44430,7 @@
                 <enum name="GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV"/>
             </require>
         </extension>
-        <extension name="GL_NV_conditional_render" supported="gl|gles2">
+        <extension name="GL_NV_conditional_render" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_QUERY_WAIT_NV"/>
                 <enum name="GL_QUERY_NO_WAIT_NV"/>
@@ -43499,7 +44440,7 @@
                 <command name="glEndConditionalRenderNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_conservative_raster" supported="gl|gles2">
+        <extension name="GL_NV_conservative_raster" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_CONSERVATIVE_RASTERIZATION_NV"/>
                 <enum name="GL_SUBPIXEL_PRECISION_BIAS_X_BITS_NV"/>
@@ -43508,7 +44449,7 @@
                 <command name="glSubpixelPrecisionBiasNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_conservative_raster_dilate" supported="gl">
+        <extension name="GL_NV_conservative_raster_dilate" supported="gl|glcore">
             <require>
                 <enum name="GL_CONSERVATIVE_RASTER_DILATE_NV"/>
                 <enum name="GL_CONSERVATIVE_RASTER_DILATE_RANGE_NV"/>
@@ -43516,7 +44457,7 @@
                 <command name="glConservativeRasterParameterfNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_conservative_raster_pre_snap_triangles" supported="gl|gles2">
+        <extension name="GL_NV_conservative_raster_pre_snap_triangles" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_CONSERVATIVE_RASTER_MODE_NV"/>
                 <enum name="GL_CONSERVATIVE_RASTER_MODE_POST_SNAP_NV"/>
@@ -43634,6 +44575,15 @@
                 <command name="glDrawTextureNV"/>
             </require>
         </extension>
+        <extension name="GL_NV_draw_vulkan_image" supported="gl|glcore|gles2">
+            <require>
+                <command name="glDrawVkImageNV"/>
+                <command name="glGetVkProcAddrNV"/>
+                <command name="glWaitVkSemaphoreNV"/>
+                <command name="glSignalVkSemaphoreNV"/>
+                <command name="glSignalVkFenceNV"/>
+            </require>
+        </extension>
         <extension name="GL_NV_evaluators" supported="gl">
             <require>
                 <enum name="GL_EVAL_2D_NV"/>
@@ -43724,7 +44674,7 @@
                 <command name="glSetFenceNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_fill_rectangle" supported="gl|gles2">
+        <extension name="GL_NV_fill_rectangle" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_FILL_RECTANGLE_NV"/>
             </require>
@@ -43756,7 +44706,7 @@
                 <enum name="GL_EYE_PLANE"/>
             </require>
         </extension>
-        <extension name="GL_NV_fragment_coverage_to_color" supported="gl|gles2">
+        <extension name="GL_NV_fragment_coverage_to_color" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_FRAGMENT_COVERAGE_TO_COLOR_NV"/>
                 <enum name="GL_FRAGMENT_COVERAGE_COLOR_NV"/>
@@ -43792,7 +44742,7 @@
         </extension>
         <extension name="GL_NV_fragment_program4" supported="gl"/>
         <extension name="GL_NV_fragment_program_option" supported="gl"/>
-        <extension name="GL_NV_fragment_shader_interlock" supported="gl|gles2"/>
+        <extension name="GL_NV_fragment_shader_interlock" supported="gl|glcore|gles2"/>
         <extension name="GL_NV_framebuffer_blit" supported="gles2">
             <require>
                 <enum name="GL_READ_FRAMEBUFFER_NV"/>
@@ -43802,7 +44752,7 @@
                 <command name="glBlitFramebufferNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_framebuffer_mixed_samples" supported="gl|gles2">
+        <extension name="GL_NV_framebuffer_mixed_samples" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_RASTER_MULTISAMPLE_EXT"/>
                 <enum name="GL_COVERAGE_MODULATION_TABLE_NV"/>
@@ -43832,7 +44782,7 @@
                 <command name="glRenderbufferStorageMultisampleNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_framebuffer_multisample_coverage" supported="gl">
+        <extension name="GL_NV_framebuffer_multisample_coverage" supported="gl|glcore">
             <require>
                 <enum name="GL_RENDERBUFFER_COVERAGE_SAMPLES_NV"/>
                 <enum name="GL_RENDERBUFFER_COLOR_SAMPLES_NV"/>
@@ -43867,7 +44817,7 @@
             </require>
         </extension>
         <extension name="GL_NV_geometry_shader4" supported="gl"/>
-        <extension name="GL_NV_geometry_shader_passthrough" supported="gl|gles2"/>
+        <extension name="GL_NV_geometry_shader_passthrough" supported="gl|glcore|gles2"/>
         <extension name="GL_NV_gpu_program4" supported="gl">
             <require>
                 <enum name="GL_MIN_PROGRAM_TEXEL_OFFSET_NV"/>
@@ -43911,7 +44861,7 @@
             </require>
         </extension>
         <extension name="GL_NV_gpu_program5_mem_extended" supported="gl"/>
-        <extension name="GL_NV_gpu_shader5" supported="gl|gles2">
+        <extension name="GL_NV_gpu_shader5" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_INT64_NV"/>
                 <enum name="GL_UNSIGNED_INT64_NV"/>
@@ -44037,7 +44987,7 @@
                 <command name="glVertexAttribDivisorNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_internalformat_sample_query" supported="gl|gles2">
+        <extension name="GL_NV_internalformat_sample_query" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_RENDERBUFFER"/>
                 <enum name="GL_TEXTURE_2D_MULTISAMPLE"/>
@@ -44055,6 +45005,27 @@
                 <enum name="GL_MAX_SPOT_EXPONENT_NV"/>
             </require>
         </extension>
+        <extension name="GL_NV_gpu_multicast" supported="gl">
+            <require>
+                <enum name="GL_PER_GPU_STORAGE_BIT_NV"/>
+                <enum name="GL_MULTICAST_GPUS_NV"/>
+                <enum name="GL_RENDER_GPU_MASK_NV"/>
+                <enum name="GL_PER_GPU_STORAGE_NV"/>
+                <enum name="GL_MULTICAST_PROGRAMMABLE_SAMPLE_LOCATION_NV"/>
+                <command name="glRenderGpuMaskNV"/>
+                <command name="glMulticastBufferSubDataNV"/>
+                <command name="glMulticastCopyBufferSubDataNV"/>
+                <command name="glMulticastCopyImageSubDataNV"/>
+                <command name="glMulticastBlitFramebufferNV"/>
+                <command name="glMulticastFramebufferSampleLocationsfvNV"/>
+                <command name="glMulticastBarrierNV"/>
+                <command name="glMulticastWaitSyncNV"/>
+                <command name="glMulticastGetQueryObjectivNV"/>
+                <command name="glMulticastGetQueryObjectuivNV"/>
+                <command name="glMulticastGetQueryObjecti64vNV"/>
+                <command name="glMulticastGetQueryObjectui64vNV"/>
+            </require>
+        </extension>
         <extension name="GL_NV_multisample_coverage" supported="gl">
             <require>
                 <enum name="GL_SAMPLES_ARB"/>
@@ -44116,7 +45087,7 @@
             </require>
         </extension>
         <extension name="GL_NV_parameter_buffer_object2" supported="gl"/>
-        <extension name="GL_NV_path_rendering" supported="gl|gles2">
+        <extension name="GL_NV_path_rendering" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_PATH_FORMAT_SVG_NV"/>
                 <enum name="GL_PATH_FORMAT_PS_NV"/>
@@ -44356,7 +45327,7 @@
                 <enum name="GL_FRAGMENT_INPUT_NV"/>
             </require>
         </extension>
-        <extension name="GL_NV_path_rendering_shared_edge" supported="gl|gles2">
+        <extension name="GL_NV_path_rendering_shared_edge" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_SHARED_EDGE_NV"/>
             </require>
@@ -44526,7 +45497,7 @@
                 <enum name="GL_ETC1_SRGB8_NV"/>
             </require>
         </extension>
-        <extension name="GL_NV_sample_locations" supported="gl|gles2">
+        <extension name="GL_NV_sample_locations" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_SAMPLE_LOCATION_SUBPIXEL_BITS_NV"/>
                 <enum name="GL_SAMPLE_LOCATION_PIXEL_GRID_WIDTH_NV"/>
@@ -44541,13 +45512,13 @@
                 <command name="glResolveDepthValuesNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_sample_mask_override_coverage" supported="gl|gles2"/>
-        <extension name="GL_NV_shader_atomic_counters" supported="gl"/>
-        <extension name="GL_NV_shader_atomic_float" supported="gl"/>
-        <extension name="GL_NV_shader_atomic_float64" supported="gl"/>
-        <extension name="GL_NV_shader_atomic_fp16_vector" supported="gl|gles2"/>
-        <extension name="GL_NV_shader_atomic_int64" supported="gl"/>
-        <extension name="GL_NV_shader_buffer_load" supported="gl">
+        <extension name="GL_NV_sample_mask_override_coverage" supported="gl|glcore|gles2"/>
+        <extension name="GL_NV_shader_atomic_counters" supported="gl|glcore"/>
+        <extension name="GL_NV_shader_atomic_float" supported="gl|glcore"/>
+        <extension name="GL_NV_shader_atomic_float64" supported="gl|glcore"/>
+        <extension name="GL_NV_shader_atomic_fp16_vector" supported="gl|glcore|gles2"/>
+        <extension name="GL_NV_shader_atomic_int64" supported="gl|glcore"/>
+        <extension name="GL_NV_shader_buffer_load" supported="gl|glcore">
             <require>
                 <enum name="GL_BUFFER_GPU_ADDRESS_NV"/>
                 <enum name="GL_GPU_ADDRESS_NV"/>
@@ -44568,7 +45539,7 @@
                 <command name="glProgramUniformui64vNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_shader_buffer_store" supported="gl">
+        <extension name="GL_NV_shader_buffer_store" supported="gl|glcore">
             <require>
                 <enum name="GL_SHADER_GLOBAL_ACCESS_BARRIER_BIT_NV"/>
                 <enum name="GL_READ_WRITE"/>
@@ -44577,14 +45548,14 @@
         </extension>
         <extension name="GL_NV_shader_noperspective_interpolation" supported="gles2"/>
         <extension name="GL_NV_shader_storage_buffer_object" supported="gl"/>
-        <extension name="GL_NV_shader_thread_group" supported="gl">
+        <extension name="GL_NV_shader_thread_group" supported="gl|glcore">
             <require>
                 <enum name="GL_WARP_SIZE_NV"/>
                 <enum name="GL_WARPS_PER_SM_NV"/>
                 <enum name="GL_SM_COUNT_NV"/>
             </require>
         </extension>
-        <extension name="GL_NV_shader_thread_shuffle" supported="gl"/>
+        <extension name="GL_NV_shader_thread_shuffle" supported="gl|glcore"/>
         <extension name="GL_NV_shadow_samplers_array" supported="gles2">
             <require>
                 <enum name="GL_SAMPLER_2D_ARRAY_SHADOW_NV"/>
@@ -44595,7 +45566,7 @@
                 <enum name="GL_SAMPLER_CUBE_SHADOW_NV"/>
             </require>
         </extension>
-        <extension name="GL_NV_stereo_view_rendering" supported="gl"/>
+        <extension name="GL_NV_stereo_view_rendering" supported="gl|glcore"/>
         <extension name="GL_NV_tessellation_program5" supported="gl">
             <require>
                 <enum name="GL_MAX_PROGRAM_PATCH_ATTRIBS_NV"/>
@@ -44618,7 +45589,7 @@
                 <enum name="GL_REFLECTION_MAP_NV"/>
             </require>
         </extension>
-        <extension name="GL_NV_texture_barrier" supported="gl">
+        <extension name="GL_NV_texture_barrier" supported="gl|glcore">
             <require>
                 <command name="glTextureBarrierNV"/>
             </require>
@@ -44835,7 +45806,7 @@
                 <command name="glDrawTransformFeedbackNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_uniform_buffer_unified_memory" supported="gl">
+        <extension name="GL_NV_uniform_buffer_unified_memory" supported="gl|glcore">
             <require>
                 <enum name="GL_UNIFORM_BUFFER_UNIFIED_NV"/>
                 <enum name="GL_UNIFORM_BUFFER_ADDRESS_NV"/>
@@ -44876,7 +45847,7 @@
                 <enum name="GL_VERTEX_ARRAY_RANGE_WITHOUT_FLUSH_NV"/>
             </require>
         </extension>
-        <extension name="GL_NV_vertex_attrib_integer_64bit" supported="gl">
+        <extension name="GL_NV_vertex_attrib_integer_64bit" supported="gl|glcore">
             <require>
                 <enum name="GL_INT64_NV"/>
                 <enum name="GL_UNSIGNED_INT64_NV"/>
@@ -44901,7 +45872,7 @@
                 <command name="glVertexAttribLFormatNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_vertex_buffer_unified_memory" supported="gl">
+        <extension name="GL_NV_vertex_buffer_unified_memory" supported="gl|glcore">
             <require>
                 <enum name="GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV"/>
                 <enum name="GL_ELEMENT_ARRAY_UNIFIED_NV"/>
@@ -45203,8 +46174,8 @@
                 <command name="glIsEnablediNV"/>
             </require>
         </extension>
-        <extension name="GL_NV_viewport_array2" supported="gl|gles2"/>
-        <extension name="GL_NV_viewport_swizzle" supported="gl|gles2">
+        <extension name="GL_NV_viewport_array2" supported="gl|glcore|gles2"/>
+        <extension name="GL_NV_viewport_swizzle" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV"/>
                 <enum name="GL_VIEWPORT_SWIZZLE_NEGATIVE_X_NV"/>
@@ -46096,15 +47067,16 @@
                 <enum name="GL_FORMAT_SUBSAMPLE_244_244_OML"/>
             </require>
         </extension>
-        <extension name="GL_OVR_multiview" supported="gl|gles2">
+        <extension name="GL_OVR_multiview" supported="gl|glcore|gles2">
             <require>
                 <enum name="GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_NUM_VIEWS_OVR"/>
                 <enum name="GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_BASE_VIEW_INDEX_OVR"/>
                 <enum name="GL_MAX_VIEWS_OVR"/>
+                <enum name="GL_FRAMEBUFFER_INCOMPLETE_VIEW_TARGETS_OVR"/>
                 <command name="glFramebufferTextureMultiviewOVR"/>
             </require>
         </extension>
-        <extension name="GL_OVR_multiview2" supported="gl|gles2"/>
+        <extension name="GL_OVR_multiview2" supported="gl|glcore|gles2"/>
         <extension name="GL_OVR_multiview_multisampled_render_to_texture" supported="gles2">
             <require>
                 <command name="glFramebufferTextureMultisampleMultiviewOVR"/>
@@ -46216,11 +47188,25 @@
                 <command name="glExtGetProgramBinarySourceQCOM"/>
             </require>
         </extension>
+        <extension name="GL_QCOM_framebuffer_foveated" supported="gles2">
+            <require>
+                <enum name="GL_FOVEATION_ENABLE_BIT_QCOM"/>
+                <enum name="GL_FOVEATION_SCALED_BIN_METHOD_BIT_QCOM"/>
+                <command name="glFramebufferFoveationConfigQCOM"/>
+                <command name="glFramebufferFoveationParametersQCOM"/>
+            </require>
+        </extension>
         <extension name="GL_QCOM_perfmon_global_mode" supported="gles1|gles2">
             <require>
                 <enum name="GL_PERFMON_GLOBAL_MODE_QCOM"/>
             </require>
         </extension>
+        <extension name="GL_QCOM_shader_framebuffer_fetch_noncoherent" supported="gles2">
+            <require>
+                <enum name="GL_FRAMEBUFFER_FETCH_NONCOHERENT_QCOM"/>
+                <command name="glFramebufferFetchBarrierQCOM"/>
+            </require>
+        </extension>
         <extension name="GL_QCOM_tiled_rendering" supported="gles1|gles2">
             <require>
                 <enum name="GL_COLOR_BUFFER_BIT0_QCOM"/>
diff --git a/src/mapi/mapi_abi.py b/src/mapi/mapi_abi.py
index 2343182..82a2511 100644
--- a/src/mapi/mapi_abi.py
+++ b/src/mapi/mapi_abi.py
@@ -346,28 +346,6 @@
                 '#define MAPI_TABLE_NUM_DYNAMIC %d') % (
                         num_static_entries, ABI_NUM_DYNAMIC_ENTRIES)
 
-    def c_mapi_table_initializer(self, prefix):
-        """Return the array initializer for mapi_table_fill."""
-        entries = [self._c_function(ent, prefix)
-                for ent in self.entries if not ent.alias]
-        pre = self.indent + '(mapi_proc) '
-        return pre + (',\n' + pre).join(entries)
-
-    def c_mapi_table_spec(self):
-        """Return the spec for mapi_init."""
-        specv1 = []
-        line = '"1'
-        for ent in self.entries:
-            if not ent.alias:
-                line += '\\0"\n'
-                specv1.append(line)
-                line = '"'
-            line += '%s\\0' % ent.name
-        line += '";'
-        specv1.append(line)
-
-        return self.indent + self.indent.join(specv1)
-
     def _c_function(self, ent, prefix, mangle=False, stringify=False):
         """Return the function name of an entry."""
         formats = {
@@ -411,13 +389,6 @@
 
         return cast
 
-    def c_private_declarations(self, prefix):
-        """Return the declarations of private functions."""
-        decls = [self._c_decl(ent, prefix) + ';'
-                for ent in self.entries if not ent.alias]
-
-        return "\n".join(decls)
-
     def c_public_dispatches(self, prefix, no_hidden):
         """Return the public dispatch functions."""
         dispatches = []
@@ -437,7 +408,7 @@
             if ent.ret:
                 ret = 'return '
             stmt1 = self.indent
-            stmt1 += 'const struct mapi_table *_tbl = %s();' % (
+            stmt1 += 'const struct _glapi_table *_tbl = %s();' % (
                     self.current_get)
             stmt2 = self.indent
             stmt2 += 'mapi_func _func = ((const mapi_func *) _tbl)[%d];' % (
@@ -667,22 +638,6 @@
                 print '#undef MAPI_TMP_STUB_ASM_GCC_NO_HIDDEN'
                 print '#endif /* MAPI_TMP_STUB_ASM_GCC_NO_HIDDEN */'
 
-    def output_for_app(self):
-        print self.c_notice()
-        print
-        print self.c_private_declarations(self.prefix_app)
-        print
-        print '#ifdef API_TMP_DEFINE_SPEC'
-        print
-        print 'static const char %s_spec[] =' % (self.prefix_app)
-        print self.c_mapi_table_spec()
-        print
-        print 'static const mapi_proc %s_procs[] = {' % (self.prefix_app)
-        print self.c_mapi_table_initializer(self.prefix_app)
-        print '};'
-        print
-        print '#endif /* API_TMP_DEFINE_SPEC */'
-
 class GLAPIPrinter(ABIPrinter):
     """OpenGL API Printer"""
 
@@ -704,7 +659,6 @@
         self.lib_need_non_hidden_entries = True
 
         self.prefix_lib = 'GLAPI_PREFIX'
-        self.prefix_app = '_mesa_'
         self.prefix_noop = 'noop'
         self.prefix_warn = self.prefix_lib
 
@@ -817,17 +771,13 @@
 
 def parse_args():
     printers = ['glapi', 'es1api', 'es2api', 'shared-glapi']
-    modes = ['lib', 'app']
 
     parser = OptionParser(usage='usage: %prog [options] <filename>')
     parser.add_option('-p', '--printer', dest='printer',
             help='printer to use: %s' % (", ".join(printers)))
-    parser.add_option('-m', '--mode', dest='mode',
-            help='target user: %s' % (", ".join(modes)))
 
     options, args = parser.parse_args()
-    if not args or options.printer not in printers or \
-            options.mode not in modes:
+    if not args or options.printer not in printers:
         parser.print_help()
         sys.exit(1)
 
@@ -850,10 +800,7 @@
     abi_sanity_check(entries)
 
     printer = printers[options.printer](entries)
-    if options.mode == 'lib':
-        printer.output_for_lib()
-    else:
-        printer.output_for_app()
+    printer.output_for_lib()
 
 if __name__ == '__main__':
     main()
diff --git a/src/mapi/mapi_glapi.c b/src/mapi/mapi_glapi.c
index 9f02edb..3a376e8 100644
--- a/src/mapi/mapi_glapi.c
+++ b/src/mapi/mapi_glapi.c
@@ -65,7 +65,7 @@
 void
 _glapi_set_dispatch(struct _glapi_table *dispatch)
 {
-   u_current_set_table((const struct mapi_table *) dispatch);
+   u_current_set_table((const struct _glapi_table *) dispatch);
 }
 
 /**
diff --git a/src/mapi/shared-glapi/SConscript b/src/mapi/shared-glapi/SConscript
index e5d45db..5d74bd6 100644
--- a/src/mapi/shared-glapi/SConscript
+++ b/src/mapi/shared-glapi/SConscript
@@ -32,7 +32,7 @@
         script = '../mapi_abi.py',
         source = [GLAPI + 'gen/gl_and_es_API.xml'] + env.Glob(GLAPI + 'gen/*.xml'),
         command = python_cmd + ' $SCRIPT ' + \
-                '--printer %s --mode lib $SOURCE > $TARGET' % (printer),
+                '--printer %s $SOURCE > $TARGET' % (printer),
     )
 
     cpppath = [
diff --git a/src/mapi/table.h b/src/mapi/table.h
index a1af40c..f488b6d 100644
--- a/src/mapi/table.h
+++ b/src/mapi/table.h
@@ -37,7 +37,7 @@
 #define MAPI_TABLE_NUM_SLOTS (MAPI_TABLE_NUM_STATIC + MAPI_TABLE_NUM_DYNAMIC)
 #define MAPI_TABLE_SIZE (MAPI_TABLE_NUM_SLOTS * sizeof(mapi_func))
 
-struct mapi_table;
+struct _glapi_table;
 
 extern const mapi_func table_noop_array[];
 
@@ -52,17 +52,17 @@
 /**
  * Get the no-op dispatch table.
  */
-static inline const struct mapi_table *
+static inline const struct _glapi_table *
 table_get_noop(void)
 {
-   return (const struct mapi_table *) table_noop_array;
+   return (const struct _glapi_table *) table_noop_array;
 }
 
 /**
  * Set the function of a slot.
  */
 static inline void
-table_set_func(struct mapi_table *tbl, int slot, mapi_func func)
+table_set_func(struct _glapi_table *tbl, int slot, mapi_func func)
 {
    mapi_func *funcs = (mapi_func *) tbl;
    funcs[slot] = func;
@@ -72,7 +72,7 @@
  * Return the function of a slot.
  */
 static inline mapi_func
-table_get_func(const struct mapi_table *tbl, int slot)
+table_get_func(const struct _glapi_table *tbl, int slot)
 {
    const mapi_func *funcs = (const mapi_func *) tbl;
    return funcs[slot];
diff --git a/src/mapi/u_current.c b/src/mapi/u_current.c
index 7e7e275..1402cea 100644
--- a/src/mapi/u_current.c
+++ b/src/mapi/u_current.c
@@ -99,17 +99,17 @@
 /*@{*/
 #if defined(GLX_USE_TLS)
 
-__thread struct mapi_table *u_current_table
+__thread struct _glapi_table *u_current_table
     __attribute__((tls_model("initial-exec")))
-    = (struct mapi_table *) table_noop_array;
+    = (struct _glapi_table *) table_noop_array;
 
 __thread void *u_current_context
     __attribute__((tls_model("initial-exec")));
 
 #else
 
-struct mapi_table *u_current_table =
-   (struct mapi_table *) table_noop_array;
+struct _glapi_table *u_current_table =
+   (struct _glapi_table *) table_noop_array;
 void *u_current_context;
 
 tss_t u_current_table_tsd;
@@ -259,17 +259,17 @@
  * table (__glapi_noop_table).
  */
 void
-u_current_set_table(const struct mapi_table *tbl)
+u_current_set_table(const struct _glapi_table *tbl)
 {
    u_current_init();
 
    stub_init_once();
 
    if (!tbl)
-      tbl = (const struct mapi_table *) table_noop_array;
+      tbl = (const struct _glapi_table *) table_noop_array;
 
 #if defined(GLX_USE_TLS)
-   u_current_table = (struct mapi_table *) tbl;
+   u_current_table = (struct _glapi_table *) tbl;
 #else
    tss_set(u_current_table_tsd, (void *) tbl);
    u_current_table = (ThreadSafe) ? NULL : (void *) tbl;
@@ -279,15 +279,15 @@
 /**
  * Return pointer to current dispatch table for calling thread.
  */
-struct mapi_table *
+struct _glapi_table *
 u_current_get_table_internal(void)
 {
 #if defined(GLX_USE_TLS)
    return u_current_table;
 #else
    if (ThreadSafe)
-      return (struct mapi_table *) tss_get(u_current_table_tsd);
+      return (struct _glapi_table *) tss_get(u_current_table_tsd);
    else
-      return (struct mapi_table *) u_current_table;
+      return (struct _glapi_table *) u_current_table;
 #endif
 }
diff --git a/src/mapi/u_current.h b/src/mapi/u_current.h
index ea4f817..3c9a414 100644
--- a/src/mapi/u_current.h
+++ b/src/mapi/u_current.h
@@ -10,9 +10,6 @@
 
 #include "glapi/glapi.h"
 
-/* ugly renames to match glapi.h */
-#define mapi_table _glapi_table
-
 #ifdef GLX_USE_TLS
 #define u_current_table _glapi_tls_Dispatch
 #define u_current_context _glapi_tls_Context
@@ -28,11 +25,11 @@
 
 #else /* MAPI_MODE_UTIL || MAPI_MODE_GLAPI || MAPI_MODE_BRIDGE */
 
-struct mapi_table;
+struct _glapi_table;
 
 #ifdef GLX_USE_TLS
 
-extern __thread struct mapi_table *u_current_table
+extern __thread struct _glapi_table *u_current_table
     __attribute__((tls_model("initial-exec")));
 
 extern __thread void *u_current_context
@@ -40,7 +37,7 @@
 
 #else /* GLX_USE_TLS */
 
-extern struct mapi_table *u_current_table;
+extern struct _glapi_table *u_current_table;
 extern void *u_current_context;
 
 #endif /* GLX_USE_TLS */
@@ -54,9 +51,9 @@
 u_current_destroy(void);
 
 void
-u_current_set_table(const struct mapi_table *tbl);
+u_current_set_table(const struct _glapi_table *tbl);
 
-struct mapi_table *
+struct _glapi_table *
 u_current_get_table_internal(void);
 
 void
@@ -65,7 +62,7 @@
 void *
 u_current_get_context_internal(void);
 
-static inline const struct mapi_table *
+static inline const struct _glapi_table *
 u_current_get_table(void)
 {
 #ifdef GLX_USE_TLS
diff --git a/src/mesa/Android.gen.mk b/src/mesa/Android.gen.mk
index 42d4ba1..ee2d1de 100644
--- a/src/mesa/Android.gen.mk
+++ b/src/mesa/Android.gen.mk
@@ -34,6 +34,7 @@
 	main/enums.c \
 	main/api_exec.c \
 	main/dispatch.h \
+	main/format_fallback.c \
 	main/format_pack.c \
 	main/format_unpack.c \
 	main/format_info.h \
@@ -53,8 +54,6 @@
 endif
 endif
 
-sources += main/git_sha1.h
-
 sources := $(addprefix $(intermediates)/, $(sources))
 
 LOCAL_GENERATED_SOURCES += $(sources)
@@ -71,16 +70,6 @@
 	$(hide) $(PRIVATE_SCRIPT) $(1) $(PRIVATE_XML) > $@
 endef
 
-$(intermediates)/main/git_sha1.h:
-	@mkdir -p $(dir $@)
-	@echo "GIT-SHA1: $(PRIVATE_MODULE) <= git"
-	$(hide) touch $@
-	$(hide) if which git > /dev/null; then \
-			git --git-dir $(PRIVATE_PATH)/../../.git log -n 1 --oneline | \
-			sed 's/^\([^ ]*\) .*/#define MESA_GIT_SHA1 "git-\1"/' \
-			> $@; \
-		fi
-
 matypes_deps := \
 	$(BUILD_OUT_EXECUTABLES)/mesa_gen_matypes$(BUILD_EXECUTABLE_SUFFIX) \
 	$(LOCAL_PATH)/main/mtypes.h \
@@ -135,6 +124,17 @@
                $(LOCAL_PATH)/main/get_hash_params.py $(GET_HASH_GEN)
 	$(call es-gen)
 
+FORMAT_FALLBACK := $(LOCAL_PATH)/main/format_fallback.py
+format_fallback_deps := \
+	$(LOCAL_PATH)/main/formats.csv \
+	$(LOCAL_PATH)/main/format_parser.py \
+	$(FORMAT_FALLBACK)
+
+$(intermediates)/main/format_fallback.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_FALLBACK)
+$(intermediates)/main/format_fallback.c: PRIVATE_XML :=
+$(intermediates)/main/format_fallback.c: $(format_fallback_deps)
+	$(call es-gen, $< /dev/stdout)
+
 FORMAT_INFO := $(LOCAL_PATH)/main/format_info.py
 format_info_deps := \
 	$(LOCAL_PATH)/main/formats.csv \
diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk
index 86196ce..c7715a5 100644
--- a/src/mesa/Android.libmesa_dricore.mk
+++ b/src/mesa/Android.libmesa_dricore.mk
@@ -24,8 +24,6 @@
 # libmesa_dricore.a
 # ----------------------------------------------------------------------
 
-ifeq ($(strip $(MESA_BUILD_CLASSIC)),true)
-
 LOCAL_PATH := $(call my-dir)
 
 # Import the following variables:
@@ -67,10 +65,9 @@
 	$(MESA_GEN_GLSL_H)
 
 LOCAL_WHOLE_STATIC_LIBRARIES += \
-	libmesa_program
+	libmesa_program \
+	libmesa_git_sha1
 
 include $(LOCAL_PATH)/Android.gen.mk
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
-
-endif # MESA_BUILD_CLASSIC
diff --git a/src/mesa/Android.libmesa_git_sha1.mk b/src/mesa/Android.libmesa_git_sha1.mk
new file mode 100644
index 0000000..7d64b1c
--- /dev/null
+++ b/src/mesa/Android.libmesa_git_sha1.mk
@@ -0,0 +1,54 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C) 2017 Mauro Rossi <issor.oruam@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# ----------------------------------------------------------------------
+# libmesa_git_sha1
+# ----------------------------------------------------------------------
+
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_git_sha1
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+intermediates := $(call local-generated-sources-dir)
+
+# An empty dummy.c source file is generated (via touch) to meet the build system's rules.
+LOCAL_GENERATED_SOURCES += $(intermediates)/dummy.c
+
+$(intermediates)/dummy.c:
+	@mkdir -p $(dir $@)
+	@echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))"
+	$(hide) touch $@
+
+LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, git_sha1.h)
+
+$(intermediates)/git_sha1.h: $(wildcard $(MESA_TOP)/.git/logs/HEAD)
+	@mkdir -p $(dir $@)
+	@echo "GIT-SHA1: $(PRIVATE_MODULE) <= git"
+	$(hide) sh $(MESA_TOP)/bin/git_sha1_gen.sh > $@
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/mesa/Android.libmesa_st_mesa.mk b/src/mesa/Android.libmesa_st_mesa.mk
index 92df4ad..ddfd030 100644
--- a/src/mesa/Android.libmesa_st_mesa.mk
+++ b/src/mesa/Android.libmesa_st_mesa.mk
@@ -24,8 +24,6 @@
 # libmesa_st_mesa.a
 # ----------------------------------------------------------------------
 
-ifeq ($(strip $(MESA_BUILD_GALLIUM)),true)
-
 LOCAL_PATH := $(call my-dir)
 
 # Import variables:
@@ -65,12 +63,11 @@
 	$(MESA_TOP)/src/gallium/include
 
 LOCAL_WHOLE_STATIC_LIBRARIES += \
-	libmesa_program
+	libmesa_program \
+	libmesa_git_sha1
 
 LOCAL_STATIC_LIBRARIES += libmesa_nir libmesa_glsl
 
 include $(LOCAL_PATH)/Android.gen.mk
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
-
-endif # MESA_BUILD_GALLIUM
diff --git a/src/mesa/Android.mk b/src/mesa/Android.mk
index 9a1aef8..e89de82 100644
--- a/src/mesa/Android.mk
+++ b/src/mesa/Android.mk
@@ -25,5 +25,6 @@
 include $(LOCAL_PATH)/Android.libmesa_dricore.mk
 include $(LOCAL_PATH)/Android.libmesa_st_mesa.mk
 include $(LOCAL_PATH)/Android.libmesa_sse41.mk
+include $(LOCAL_PATH)/Android.libmesa_git_sha1.mk
 
 include $(LOCAL_PATH)/program/Android.mk
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index 53f311d..97a9bbd 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -37,6 +37,7 @@
 
 EXTRA_DIST = \
 	drivers/SConscript \
+	main/format_fallback.py \
 	main/format_info.py \
 	main/format_pack.py \
 	main/format_parser.py \
@@ -54,6 +55,7 @@
 
 BUILT_SOURCES = \
 	main/get_hash.h \
+	main/format_fallback.c \
 	main/format_info.h \
 	main/format_pack.c \
 	main/format_unpack.c \
@@ -70,6 +72,11 @@
 	$(PYTHON_GEN) $(srcdir)/main/get_hash_generator.py \
 		-f $(srcdir)/../mapi/glapi/gen/gl_and_es_API.xml > $@
 
+main/format_fallback.c: main/format_fallback.py \
+                        main/format_parser.py \
+	                main/formats.csv
+	$(PYTHON_GEN) $(srcdir)/main/format_fallback.py $(srcdir)/main/formats.csv $@
+
 main/format_info.h: main/formats.csv \
                     main/format_parser.py main/format_info.py
 	$(PYTHON_GEN) $(srcdir)/main/format_info.py $(srcdir)/main/formats.csv > $@
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index 8a65fbe..86fbf39 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -94,6 +94,7 @@
 	main/ffvertex_prog.h \
 	main/fog.c \
 	main/fog.h \
+	main/format_fallback.c \
 	main/format_info.h \
 	main/format_pack.h \
 	main/format_pack.c \
@@ -240,6 +241,8 @@
 	main/texstorage.h \
 	main/texstore.c \
 	main/texstore.h \
+	main/texturebindless.c \
+	main/texturebindless.h \
 	main/textureview.c \
 	main/textureview.h \
 	main/transformfeedback.c \
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index fa4efe1..b63e15a 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -79,6 +79,13 @@
       command = python_cmd + ' $SCRIPT ' + ' $SOURCE > $TARGET'
 )
 
+format_fallback = env.CodeGenerate(
+      target = 'main/format_fallback.c',
+      script = 'main/format_fallback.py',
+      source = 'main/formats.csv',
+      command = python_cmd + ' $SCRIPT ' + ' $SOURCE ' + ' $TARGET'
+)
+
 #
 # Assembly sources
 #
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index db0a107..5008ae8 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -250,7 +250,7 @@
    ctx->Driver.Enable(ctx, GL_LINE_SMOOTH, ctx->Line.SmoothFlag);
    ctx->Driver.Enable(ctx, GL_POLYGON_STIPPLE, ctx->Polygon.StippleFlag);
    ctx->Driver.Enable(ctx, GL_SCISSOR_TEST, ctx->Scissor.EnableFlags);
-   ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil._Enabled);
+   ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
    ctx->Driver.Enable(ctx, GL_TEXTURE_1D, GL_FALSE);
    ctx->Driver.Enable(ctx, GL_TEXTURE_2D, GL_FALSE);
    ctx->Driver.Enable(ctx, GL_TEXTURE_RECTANGLE_NV, GL_FALSE);
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index f4c91ac..7314384 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -118,8 +118,12 @@
       ? GL_TEXTURE_CUBE_MAP_POSITIVE_X + texImage->Face
       : texObj->Target;
 
-   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, texTarget,
-                             level, layer, false, __func__);
+   struct gl_renderbuffer_attachment *att =
+      _mesa_get_and_validate_attachment(ctx, fb, attachment, __func__);
+   assert(att);
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, att, texObj, texTarget,
+                             level, layer, false);
 }
 
 static struct gl_shader *
@@ -1898,8 +1902,7 @@
       verts[3].tex[1] = tex->Ttop;
 
       /* upload new vertex data */
-      _mesa_buffer_sub_data(ctx, copypix->buf_obj, 0, sizeof(verts), verts,
-                            __func__);
+      _mesa_buffer_sub_data(ctx, copypix->buf_obj, 0, sizeof(verts), verts);
    }
 
    _mesa_set_enable(ctx, tex->Target, GL_TRUE);
@@ -2347,7 +2350,7 @@
     * Check if swrast fallback is needed.
     */
    if (ctx->_ImageTransferState ||
-       ctx->FragmentProgram._Enabled ||
+       _mesa_arb_fragment_program_enabled(ctx) ||
        ctx->Fog.Enabled ||
        ctx->Texture._MaxEnabledTexImageUnit != -1 ||
        width > tex->MaxSize ||
@@ -2418,8 +2421,7 @@
       }
 
       /* upload new vertex data */
-      _mesa_buffer_sub_data(ctx, bitmap->buf_obj, 0, sizeof(verts), verts,
-                            __func__);
+      _mesa_buffer_sub_data(ctx, bitmap->buf_obj, 0, sizeof(verts), verts);
    }
 
    /* choose different foreground/background alpha values */
@@ -2768,7 +2770,7 @@
  * glBlitFramebuffer() to implement glCopyTexSubImage().
  */
 static bool
-copytexsubimage_using_blit_framebuffer(struct gl_context *ctx, GLuint dims,
+copytexsubimage_using_blit_framebuffer(struct gl_context *ctx,
                                        struct gl_texture_image *texImage,
                                        GLint xoffset,
                                        GLint yoffset,
@@ -2862,7 +2864,7 @@
    GLint bpp;
    void *buf;
 
-   if (copytexsubimage_using_blit_framebuffer(ctx, dims,
+   if (copytexsubimage_using_blit_framebuffer(ctx,
                                               texImage,
                                               xoffset, yoffset, zoffset,
                                               rb,
@@ -3062,6 +3064,11 @@
    if (width > decompress_fbo->Width || height > decompress_fbo->Height) {
       _mesa_renderbuffer_storage(ctx, decompress_fbo->rb, rbFormat,
                                  width, height, 0);
+
+      /* Do the full completeness check to recompute
+       * ctx->DrawBuffer->Width/Height.
+       */
+      ctx->DrawBuffer->_Status = GL_FRAMEBUFFER_UNDEFINED;
       status = _mesa_check_framebuffer_status(ctx, ctx->DrawBuffer);
       if (status != GL_FRAMEBUFFER_COMPLETE) {
          /* If the framebuffer isn't complete then we'll leave
@@ -3136,8 +3143,7 @@
    _mesa_set_viewport(ctx, 0, 0, 0, width, height);
 
    /* upload new vertex data */
-   _mesa_buffer_sub_data(ctx, decompress->buf_obj, 0, sizeof(verts), verts,
-                         __func__);
+   _mesa_buffer_sub_data(ctx, decompress->buf_obj, 0, sizeof(verts), verts);
 
    /* setup texture state */
    _mesa_BindTexture(target, texObj->Name);
@@ -3405,8 +3411,7 @@
          verts[3].st[i][1] = t1;
       }
 
-      _mesa_buffer_sub_data(ctx, drawtex->buf_obj, 0, sizeof(verts), verts,
-                            __func__);
+      _mesa_buffer_sub_data(ctx, drawtex->buf_obj, 0, sizeof(verts), verts);
    }
 
    _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index 991d52f..7adad46 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -59,7 +59,7 @@
 setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
                                    struct blit_state *blit,
                                    struct gl_renderbuffer *src_rb,
-                                   GLenum target, GLenum filter)
+                                   GLenum target)
 {
    GLint loc_src_width, loc_src_height;
    int i, samples;
@@ -581,7 +581,7 @@
                                    2, texcoord_size, 0);
 
    if (is_target_multisample && is_filter_scaled_resolve && is_scaled_blit) {
-      setup_glsl_msaa_blit_scaled_shader(ctx, blit, src_rb, target, filter);
+      setup_glsl_msaa_blit_scaled_shader(ctx, blit, src_rb, target);
    } else if (is_target_multisample) {
       setup_glsl_msaa_blit_shader(ctx, blit, drawFb, src_rb, target);
    } else {
@@ -800,8 +800,7 @@
       verts[3].tex[1] = t1;
       verts[3].tex[2] = readAtt->Zoffset;
 
-      _mesa_buffer_sub_data(ctx, blit->buf_obj, 0, sizeof(verts), verts,
-                            __func__);
+      _mesa_buffer_sub_data(ctx, blit->buf_obj, 0, sizeof(verts), verts);
    }
 
    /* setup viewport */
diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index b8c422b..e0284a3 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -136,6 +136,7 @@
    _mesa_initialize_texture_object(ctx, tex_obj, *tmp_tex, GL_TEXTURE_2D);
    /* This must be set after _mesa_initialize_texture_object, not before. */
    tex_obj->Immutable = GL_TRUE;
+   tex_obj->ImmutableLevels = 1;
    /* This is required for interactions with ARB_texture_view. */
    tex_obj->NumLayers = 1;
 
diff --git a/src/mesa/drivers/dri/Android.mk b/src/mesa/drivers/dri/Android.mk
index dc13f71..53ff4b4 100644
--- a/src/mesa/drivers/dri/Android.mk
+++ b/src/mesa/drivers/dri/Android.mk
@@ -57,14 +57,5 @@
 #-----------------------------------------------
 # Build drivers and libmesa_dri_common
 
-SUBDIRS := common
-
-ifneq ($(filter i915, $(MESA_GPU_DRIVERS)),)
-	SUBDIRS += i915
-endif
-
-ifneq ($(filter i965, $(MESA_GPU_DRIVERS)),)
-	SUBDIRS += i965
-endif
-
+SUBDIRS := common i915 i965
 include $(foreach d, $(SUBDIRS), $(LOCAL_PATH)/$(d)/Android.mk)
diff --git a/src/mesa/drivers/dri/Makefile.am b/src/mesa/drivers/dri/Makefile.am
index fb0fc32..381c6a2 100644
--- a/src/mesa/drivers/dri/Makefile.am
+++ b/src/mesa/drivers/dri/Makefile.am
@@ -6,10 +6,6 @@
 
 SUBDIRS+=common
 
-if HAVE_SHARED_GLAPI
-SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
-endif
-
 if HAVE_I915_DRI
 SUBDIRS += i915
 MEGADRIVERS_DEPS += i915/libi915_dri.la
@@ -70,7 +66,7 @@
         common/libmegadriver_stub.la \
         common/libdricommon.la \
         common/libxmlconfig.la \
-        $(SHARED_GLAPI_LIB) \
+        $(top_builddir)/src/mapi/shared-glapi/libglapi.la \
         $(MEGADRIVERS_DEPS) \
         $(DRI_LIB_DEPS) \
         $()
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index f6df488..bfae020 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -403,7 +403,8 @@
     if (mesa_api != API_OPENGL_COMPAT
         && mesa_api != API_OPENGL_CORE
         && (flags & ~(__DRI_CTX_FLAG_DEBUG |
-	              __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS))) {
+	              __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS |
+	              __DRI_CTX_FLAG_NO_ERROR))) {
 	*error = __DRI_CTX_ERROR_BAD_FLAG;
 	return NULL;
     }
@@ -425,7 +426,8 @@
 
     const uint32_t allowed_flags = (__DRI_CTX_FLAG_DEBUG
                                     | __DRI_CTX_FLAG_FORWARD_COMPATIBLE
-                                    | __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS);
+                                    | __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS
+                                    | __DRI_CTX_FLAG_NO_ERROR);
     if (flags & ~allowed_flags) {
 	*error = __DRI_CTX_ERROR_UNKNOWN_FLAG;
 	return NULL;
@@ -467,6 +469,8 @@
        _mesa_set_debug_state_int(ctx, GL_DEBUG_OUTPUT, GL_TRUE);
         ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_DEBUG_BIT;
     }
+    if ((flags & __DRI_CTX_FLAG_NO_ERROR) != 0)
+        ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR;
 }
 
 static __DRIcontext *
@@ -935,3 +939,7 @@
 
    .copySubBuffer               = driCopySubBuffer,
 };
+
+const __DRInoErrorExtension dri2NoErrorExtension = {
+   .base = { __DRI2_NO_ERROR, 1 },
+};
diff --git a/src/mesa/drivers/dri/common/dri_util.h b/src/mesa/drivers/dri/common/dri_util.h
index 8fcd632..3e1ce62 100644
--- a/src/mesa/drivers/dri/common/dri_util.h
+++ b/src/mesa/drivers/dri/common/dri_util.h
@@ -293,4 +293,6 @@
 
 extern const __DRIimageDriverExtension driImageDriverExtension;
 
+extern const __DRInoErrorExtension dri2NoErrorExtension;
+
 #endif /* _DRI_UTIL_H_ */
diff --git a/src/mesa/drivers/dri/common/drirc b/src/mesa/drivers/dri/common/drirc
index 14d7713..c9fcb59 100644
--- a/src/mesa/drivers/dri/common/drirc
+++ b/src/mesa/drivers/dri/common/drirc
@@ -17,6 +17,10 @@
   an #extension directive in the middle of its shaders, which is illegal
   in GLSL.
 
+* Dying Light and Dead Island Definitive Edition redeclare vertex shader
+  built-ins (specifically gl_VertexID), which causes the vertex shaders to fail
+  to compile.
+
 TODO: document the other workarounds.
 
 -->
@@ -81,8 +85,19 @@
             <option name="always_have_depth_buffer" value="true" />
         </application>
 
-        <application name="Dead Island" executable="DeadIslandGame">
+        <application name="Dead Island (incl. Definitive Edition)" executable="DeadIslandGame">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+
+            <!-- For the Definitive Edition which shares the same executable name -->
+            <option name="allow_glsl_builtin_variable_redeclaration" value="true" />
+        </application>
+
+        <application name="Dead Island Riptide Definitive Edition" executable="DeadIslandRiptideGame">
+            <option name="allow_glsl_builtin_variable_redeclaration" value="true" />
+        </application>
+
+        <application name="Dying Light" executable="DyingLightGame">
+            <option name="allow_glsl_builtin_variable_redeclaration" value="true" />
         </application>
 
         <application name="Second Life" executable="do-not-directly-run-secondlife-bin">
@@ -136,5 +151,55 @@
         <application name="Kerbal Space Program (64-bit)" executable="KSP.x86_64">
             <option name="glsl_zero_init" value="true"/>
         </application>
+
+        <application name="Rocket League" executable="RocketLeague">
+            <option name="glsl_correct_derivatives_after_discard" value="true"/>
+        </application>
+
+        <application name="The Witcher 2" executable="witcher2">
+            <option name="glsl_correct_derivatives_after_discard" value="true"/>
+        </application>
+
+        <!-- The GL thread whitelist is below, workarounds are above.
+             Keep it that way. -->
+
+        <application name="Alien Isolation" executable="AlienIsolation">
+            <option name="mesa_glthread" value="true"/>
+        </application>
+        <application name="Civilization 6" executable="Civ6">
+            <option name="mesa_glthread" value="true"/>
+        </application>
+        <application name="BioShock Infinite" executable="bioshock.i386">
+            <option name="mesa_glthread" value="true"/>
+        </application>
+        <application name="Borderlands 2" executable="Borderlands2">
+            <option name="mesa_glthread" value="true"/>
+        </application>
+        <application name="Civilization 5" executable="Civ5XP">
+            <option name="mesa_glthread" value="true"/>
+        </application>
+        <application name="The Witcher 2" executable="witcher2">
+            <option name="mesa_glthread" value="true"/>
+        </application>
+        <application name="American Truck Simulator" executable="amtrucks">
+            <option name="mesa_glthread" value="true"/>
+        </application>
+        <application name="Euro Truck Simulator 2" executable="eurotrucks2">
+            <option name="mesa_glthread" value="true"/>
+        </application>
+        <application name="War Thunder (Wine)" executable="aces.exe">
+            <option name="mesa_glthread" value="true"/>
+        </application>
+    </device>
+    <!-- vmwgfx doesn't like full buffer swaps and can't sync to vertical retraces. -->
+    <device driver="vmwgfx">
+        <application name="gnome-shell" executable="gnome-shell">
+            <option name="glx_disable_ext_buffer_age" value="true" />
+            <option name="glx_disable_oml_sync_control" value="true" />
+        </application>
+        <application name="Compiz" executable="Compiz">
+            <option name="glx_disable_ext_buffer_age" value="true" />
+            <option name="glx_disable_oml_sync_control" value="true" />
+        </application>
     </device>
 </driconf>
diff --git a/src/mesa/drivers/dri/common/xmlconfig.c b/src/mesa/drivers/dri/common/xmlconfig.c
index d464937..31c5447 100644
--- a/src/mesa/drivers/dri/common/xmlconfig.c
+++ b/src/mesa/drivers/dri/common/xmlconfig.c
@@ -776,7 +776,10 @@
     OC_APPLICATION = 0, OC_DEVICE, OC_DRICONF, OC_OPTION, OC_COUNT
 };
 static const XML_Char *OptConfElems[] = {
-    "application", "device", "driconf", "option"
+    [OC_APPLICATION]  = "application",
+    [OC_DEVICE] = "device",
+    [OC_DRICONF] = "driconf",
+    [OC_OPTION] = "option",
 };
 
 /** \brief Parse attributes of a device element. */
diff --git a/src/mesa/drivers/dri/common/xmlpool/t_options.h b/src/mesa/drivers/dri/common/xmlpool/t_options.h
index cd4f025..e308839 100644
--- a/src/mesa/drivers/dri/common/xmlpool/t_options.h
+++ b/src/mesa/drivers/dri/common/xmlpool/t_options.h
@@ -115,6 +115,11 @@
         DRI_CONF_DESC(en,gettext("Allow GLSL #extension directives in the middle of shaders")) \
 DRI_CONF_OPT_END
 
+#define DRI_CONF_ALLOW_GLSL_BUILTIN_VARIABLE_REDECLARATION(def) \
+DRI_CONF_OPT_BEGIN_B(allow_glsl_builtin_variable_redeclaration, def) \
+        DRI_CONF_DESC(en,gettext("Allow GLSL built-in variables to be redeclared verbatim")) \
+DRI_CONF_OPT_END
+
 #define DRI_CONF_ALLOW_HIGHER_COMPAT_VERSION(def) \
 DRI_CONF_OPT_BEGIN_B(allow_higher_compat_version, def) \
         DRI_CONF_DESC(en,gettext("Allow a higher compat profile (version 3.1+) for apps that request it")) \
@@ -125,6 +130,10 @@
         DRI_CONF_DESC(en,gettext("Force computing the absolute value for sqrt() and inversesqrt()")) \
 DRI_CONF_OPT_END
 
+#define DRI_CONF_GLSL_CORRECT_DERIVATIVES_AFTER_DISCARD(def) \
+DRI_CONF_OPT_BEGIN_B(glsl_correct_derivatives_after_discard, def) \
+        DRI_CONF_DESC(en,gettext("Implicit and explicit derivatives after a discard behave as if the discard didn't happen")) \
+DRI_CONF_OPT_END
 
 
 /**
@@ -323,6 +332,21 @@
         DRI_CONF_DESC(en,gettext("Enable offloading GL driver work to a separate thread")) \
 DRI_CONF_OPT_END
 
+#define DRI_CONF_MESA_NO_ERROR(def) \
+DRI_CONF_OPT_BEGIN_B(mesa_no_error, def) \
+        DRI_CONF_DESC(en,gettext("Disable GL driver error checking")) \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_DISABLE_EXT_BUFFER_AGE(def) \
+DRI_CONF_OPT_BEGIN_B(glx_disable_ext_buffer_age, def) \
+   DRI_CONF_DESC(en, gettext("Disable the GLX_EXT_buffer_age extension")) \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_DISABLE_OML_SYNC_CONTROL(def) \
+DRI_CONF_OPT_BEGIN_B(glx_disable_oml_sync_control, def) \
+   DRI_CONF_DESC(en, gettext("Disable the GLX_OML_sync_control extension")) \
+DRI_CONF_OPT_END
+
 
 /**
  * \brief Software-fallback options.  To allow using features (like
diff --git a/src/mesa/drivers/dri/i915/Makefile.am b/src/mesa/drivers/dri/i915/Makefile.am
index e85fb9d..feead3e 100644
--- a/src/mesa/drivers/dri/i915/Makefile.am
+++ b/src/mesa/drivers/dri/i915/Makefile.am
@@ -34,8 +34,8 @@
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
-	$(INTEL_CFLAGS)
+	$(I915_CFLAGS)
 
 noinst_LTLIBRARIES = libi915_dri.la
 libi915_dri_la_SOURCES = $(i915_FILES)
-libi915_dri_la_LIBADD = $(INTEL_LIBS)
+libi915_dri_la_LIBADD = $(I915_LIBS)
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index 6c48823..4f6bdb7 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -27,6 +27,7 @@
 
 #include "i915_context.h"
 #include "main/api_exec.h"
+#include "main/framebuffer.h"
 #include "main/imports.h"
 #include "main/macros.h"
 #include "main/version.h"
@@ -52,15 +53,19 @@
 /* Override intel default.
  */
 static void
-i915InvalidateState(struct gl_context * ctx, GLuint new_state)
+i915InvalidateState(struct gl_context * ctx)
 {
+   GLuint new_state = ctx->NewState;
+
    _swrast_InvalidateState(ctx, new_state);
    _swsetup_InvalidateState(ctx, new_state);
-   _vbo_InvalidateState(ctx, new_state);
    _tnl_InvalidateState(ctx, new_state);
    _tnl_invalidate_vertex_state(ctx, new_state);
    intel_context(ctx)->NewGLState |= new_state;
 
+   if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    /* Todo: gather state values under which tracked parameters become
     * invalidated, add callbacks for things like
     * ProgramLocalParameters, etc.
diff --git a/src/mesa/drivers/dri/i915/i915_state.c b/src/mesa/drivers/dri/i915/i915_state.c
index 4c4d95c..685af04 100644
--- a/src/mesa/drivers/dri/i915/i915_state.c
+++ b/src/mesa/drivers/dri/i915/i915_state.c
@@ -127,7 +127,7 @@
                  S5_STENCIL_PASS_Z_PASS_SHIFT));
 
    /* Set back state if different from front. */
-   if (ctx->Stencil._TestTwoSide) {
+   if (_mesa_stencil_is_two_sided(ctx)) {
       set_ctx_bits(I915_CTXREG_BF_STENCIL_OPS,
                    BFO_STENCIL_REF_MASK |
                    BFO_STENCIL_TEST_MASK |
diff --git a/src/mesa/drivers/dri/i915/intel_clear.c b/src/mesa/drivers/dri/i915/intel_clear.c
index 4306826..f8df2e0 100644
--- a/src/mesa/drivers/dri/i915/intel_clear.c
+++ b/src/mesa/drivers/dri/i915/intel_clear.c
@@ -181,7 +181,7 @@
 
    if (tri_mask) {
       debug_mask("tri", tri_mask);
-      if (ctx->API == API_OPENGLES || !ctx->Extensions.ARB_fragment_shader)
+      if (!ctx->Extensions.ARB_fragment_shader)
 	 _mesa_meta_Clear(&intel->ctx, tri_mask);
       else
 	 _mesa_meta_glsl_Clear(&intel->ctx, tri_mask);
diff --git a/src/mesa/drivers/dri/i915/intel_context.c b/src/mesa/drivers/dri/i915/intel_context.c
index 5607d5b..e0766a0 100644
--- a/src/mesa/drivers/dri/i915/intel_context.c
+++ b/src/mesa/drivers/dri/i915/intel_context.c
@@ -314,16 +314,19 @@
 
 
 static void
-intelInvalidateState(struct gl_context * ctx, GLuint new_state)
+intelInvalidateState(struct gl_context * ctx)
 {
+   GLuint new_state = ctx->NewState;
     struct intel_context *intel = intel_context(ctx);
 
     if (ctx->swrast_context)
        _swrast_InvalidateState(ctx, new_state);
-   _vbo_InvalidateState(ctx, new_state);
 
    intel->NewGLState |= new_state;
 
+   if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    if (intel->vtbl.invalidate_state)
       intel->vtbl.invalidate_state( intel, new_state );
 }
diff --git a/src/mesa/drivers/dri/i915/intel_mipmap_tree.c b/src/mesa/drivers/dri/i915/intel_mipmap_tree.c
index 5cbf763..b0ebff5 100644
--- a/src/mesa/drivers/dri/i915/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i915/intel_mipmap_tree.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2006 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #include <GL/gl.h>
@@ -83,9 +83,6 @@
    mt->format = format;
    mt->first_level = first_level;
    mt->last_level = last_level;
-   mt->logical_width0 = width0;
-   mt->logical_height0 = height0;
-   mt->logical_depth0 = depth0;
 
    /* The cpp is bytes per (1, blockheight)-sized block for compressed
     * textures.  This is why you'll see divides by blockheight all over
@@ -96,7 +93,7 @@
    mt->cpp = _mesa_get_format_bytes(mt->format) / bw;
 
    mt->compressed = _mesa_is_format_compressed(format);
-   mt->refcount = 1; 
+   mt->refcount = 1;
 
    if (target == GL_TEXTURE_CUBE_MAP) {
       assert(depth0 == 1);
@@ -108,9 +105,8 @@
    mt->physical_depth0 = depth0;
 
    intel_get_texture_alignment_unit(intel, mt->format,
-				    &mt->align_w, &mt->align_h);
+                                    &mt->align_w, &mt->align_h);
 
-   (void) intel;
    if (intel->is_945)
       i945_miptree_layout(mt);
    else
@@ -159,14 +155,14 @@
 
 struct intel_mipmap_tree *
 intel_miptree_create(struct intel_context *intel,
-		     GLenum target,
-		     mesa_format format,
-		     GLuint first_level,
-		     GLuint last_level,
-		     GLuint width0,
-		     GLuint height0,
-		     GLuint depth0,
-		     bool expect_accelerated_upload,
+                     GLenum target,
+                     mesa_format format,
+                     GLuint first_level,
+                     GLuint last_level,
+                     GLuint width0,
+                     GLuint height0,
+                     GLuint depth0,
+                     bool expect_accelerated_upload,
                      enum intel_miptree_tiling_mode requested_tiling)
 {
    struct intel_mipmap_tree *mt;
@@ -174,11 +170,10 @@
 
 
    mt = intel_miptree_create_layout(intel, target, format,
-				      first_level, last_level, width0,
-				      height0, depth0);
-   /*
-    * pitch == 0 || height == 0  indicates the null texture
-    */
+                                    first_level, last_level, width0,
+                                    height0, depth0);
+
+   /* pitch == 0 || height == 0 indicates the null texture */
    if (!mt || !mt->total_width || !mt->total_height) {
       intel_miptree_release(&mt);
       return NULL;
@@ -193,11 +188,11 @@
    bool y_or_x = tiling == (I915_TILING_Y | I915_TILING_X);
 
    mt->region = intel_region_alloc(intel->intelScreen,
-				   y_or_x ? I915_TILING_Y : tiling,
-				   mt->cpp,
-				   total_width,
-				   total_height,
-				   expect_accelerated_upload);
+                                   y_or_x ? I915_TILING_Y : tiling,
+                                   mt->cpp,
+                                   total_width,
+                                   total_height,
+                                   expect_accelerated_upload);
 
    /* If the region is too large to fit in the aperture, we need to use the
     * BLT engine to support it.  The BLT paths can't currently handle Y-tiling,
@@ -278,13 +273,9 @@
 
 
 /**
- * For a singlesample DRI2 buffer, this simply wraps the given region with a miptree.
- *
- * For a multisample DRI2 buffer, this wraps the given region with
- * a singlesample miptree, then creates a multisample miptree into which the
- * singlesample miptree is embedded as a child.
+ * Wraps the given region with a miptree.
  */
-struct intel_mipmap_tree*
+struct intel_mipmap_tree *
 intel_miptree_create_for_dri2_buffer(struct intel_context *intel,
                                      unsigned dri_attachment,
                                      mesa_format format,
@@ -317,13 +308,9 @@
 }
 
 /**
- * For a singlesample image buffer, this simply wraps the given region with a miptree.
- *
- * For a multisample image buffer, this wraps the given region with
- * a singlesample miptree, then creates a multisample miptree into which the
- * singlesample miptree is embedded as a child.
+ * Wraps the given region with a miptree.
  */
-struct intel_mipmap_tree*
+struct intel_mipmap_tree *
 intel_miptree_create_for_image_buffer(struct intel_context *intel,
                                       enum __DRIimageBufferMask buffer_type,
                                       mesa_format format,
@@ -349,7 +336,7 @@
    return mt;
 }
 
-struct intel_mipmap_tree*
+struct intel_mipmap_tree *
 intel_miptree_create_for_renderbuffer(struct intel_context *intel,
                                       mesa_format format,
                                       uint32_t width,
@@ -395,7 +382,7 @@
       intel_region_release(&((*mt)->region));
 
       for (i = 0; i < MAX_TEXTURE_LEVELS; i++) {
-	 free((*mt)->level[i].slice);
+         free((*mt)->level[i].slice);
       }
 
       free(*mt);
@@ -408,11 +395,6 @@
                                        int *width, int *height, int *depth)
 {
    switch (image->TexObject->Target) {
-   case GL_TEXTURE_1D_ARRAY:
-      *width = image->Width;
-      *height = 1;
-      *depth = image->Height;
-      break;
    default:
       *width = image->Width;
       *height = image->Height;
@@ -455,24 +437,10 @@
     * minification.  This will also catch images not present in the
     * tree, changed targets, etc.
     */
-   if (mt->target == GL_TEXTURE_2D_MULTISAMPLE ||
-         mt->target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY) {
-      /* nonzero level here is always bogus */
-      assert(level == 0);
-
-      if (width != mt->logical_width0 ||
-            height != mt->logical_height0 ||
-            depth != mt->logical_depth0) {
-         return false;
-      }
-   }
-   else {
-      /* all normal textures, renderbuffers, etc */
-      if (width != mt->level[level].width ||
-          height != mt->level[level].height ||
-          depth != mt->level[level].depth) {
-         return false;
-      }
+   if (width != mt->level[level].width ||
+       height != mt->level[level].height ||
+       depth != mt->level[level].depth) {
+      return false;
    }
 
    return true;
@@ -481,9 +449,9 @@
 
 void
 intel_miptree_set_level_info(struct intel_mipmap_tree *mt,
-			     GLuint level,
-			     GLuint x, GLuint y,
-			     GLuint w, GLuint h, GLuint d)
+                             GLuint level,
+                             GLuint x, GLuint y,
+                             GLuint w, GLuint h, GLuint d)
 {
    mt->level[level].width = w;
    mt->level[level].height = h;
@@ -504,8 +472,8 @@
 
 void
 intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
-			       GLuint level, GLuint img,
-			       GLuint x, GLuint y)
+                               GLuint level, GLuint img,
+                               GLuint x, GLuint y)
 {
    if (img == 0 && level == 0)
       assert(x == 0 && y == 0);
@@ -523,8 +491,8 @@
 
 void
 intel_miptree_get_image_offset(struct intel_mipmap_tree *mt,
-			       GLuint level, GLuint slice,
-			       GLuint *x, GLuint *y)
+                               GLuint level, GLuint slice,
+                               GLuint *x, GLuint *y)
 {
    assert(slice < mt->level[level].depth);
 
@@ -552,14 +520,13 @@
    uint32_t x, y;
    uint32_t mask_x, mask_y;
 
-   intel_region_get_tile_masks(region, &mask_x, &mask_y, false);
+   intel_region_get_tile_masks(region, &mask_x, &mask_y);
    intel_miptree_get_image_offset(mt, level, slice, &x, &y);
 
    *tile_x = x & mask_x;
    *tile_y = y & mask_y;
 
-   return intel_region_get_aligned_offset(region, x & ~mask_x, y & ~mask_y,
-                                          false);
+   return intel_region_get_aligned_offset(region, x & ~mask_x, y & ~mask_y);
 }
 
 static void
@@ -614,11 +581,11 @@
 
 static void
 intel_miptree_copy_slice(struct intel_context *intel,
-			 struct intel_mipmap_tree *dst_mt,
-			 struct intel_mipmap_tree *src_mt,
-			 int level,
-			 int face,
-			 int depth)
+                         struct intel_mipmap_tree *dst_mt,
+                         struct intel_mipmap_tree *src_mt,
+                         int level,
+                         int face,
+                         int depth)
 
 {
    mesa_format format = src_mt->format;
@@ -672,8 +639,8 @@
  */
 void
 intel_miptree_copy_teximage(struct intel_context *intel,
-			    struct intel_texture_image *intelImage,
-			    struct intel_mipmap_tree *dst_mt,
+                            struct intel_texture_image *intelImage,
+                            struct intel_mipmap_tree *dst_mt,
                             bool invalidate)
 {
    struct intel_mipmap_tree *src_mt = intelImage->mt;
@@ -722,9 +689,9 @@
 
 static void
 intel_miptree_map_gtt(struct intel_context *intel,
-		      struct intel_mipmap_tree *mt,
-		      struct intel_miptree_map *map,
-		      unsigned int level, unsigned int slice)
+                      struct intel_mipmap_tree *mt,
+                      struct intel_miptree_map *map,
+                      unsigned int level, unsigned int slice)
 {
    unsigned int bw, bh;
    void *base;
@@ -770,9 +737,9 @@
 
 static void
 intel_miptree_map_blit(struct intel_context *intel,
-		       struct intel_mipmap_tree *mt,
-		       struct intel_miptree_map *map,
-		       unsigned int level, unsigned int slice)
+                       struct intel_mipmap_tree *mt,
+                       struct intel_miptree_map *map,
+                       unsigned int level, unsigned int slice)
 {
    map->mt = intel_miptree_create(intel, GL_TEXTURE_2D, mt->format,
                                   0, 0,
@@ -813,10 +780,10 @@
 
 static void
 intel_miptree_unmap_blit(struct intel_context *intel,
-			 struct intel_mipmap_tree *mt,
-			 struct intel_miptree_map *map,
-			 unsigned int level,
-			 unsigned int slice)
+                         struct intel_mipmap_tree *mt,
+                         struct intel_miptree_map *map,
+                         unsigned int level,
+                         unsigned int slice)
 {
    struct gl_context *ctx = &intel->ctx;
 
@@ -897,7 +864,7 @@
    struct intel_miptree_map *map;
 
    map = intel_miptree_attach_map(mt, level, slice, x, y, w, h, mode);
-   if (!map){
+   if (!map) {
       *out_ptr = NULL;
       *out_stride = 0;
       return;
diff --git a/src/mesa/drivers/dri/i915/intel_mipmap_tree.h b/src/mesa/drivers/dri/i915/intel_mipmap_tree.h
index 853a4a7..2d11853 100644
--- a/src/mesa/drivers/dri/i915/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i915/intel_mipmap_tree.h
@@ -94,8 +94,6 @@
     *
     * The exact semantics of depth varies according to the texture target:
     *    - For GL_TEXTURE_CUBE_MAP, depth is 6.
-    *    - For GL_TEXTURE_2D_ARRAY, depth is the number of array slices. It is
-    *      identical for all miplevels in the texture.
     *    - For GL_TEXTURE_3D, it is the texture's depth at this miplevel. Its
     *      value, like width and height, varies with miplevel.
     *    - For other texture types, depth is 1.
@@ -160,35 +158,14 @@
    /**
     * Level zero image dimensions.  These dimensions correspond to the
     * physical layout of data in memory.  Accordingly, they account for the
-    * extra width, height, and or depth that must be allocated in order to
-    * accommodate multisample formats, and they account for the extra factor
-    * of 6 in depth that must be allocated in order to accommodate cubemap
-    * textures.
+    * extra factor of 6 in depth that must be allocated in order to
+    * accommodate cubemap textures.
     */
    GLuint physical_width0, physical_height0, physical_depth0;
 
    GLuint cpp;
    bool compressed;
 
-   /**
-    * Level zero image dimensions.  These dimensions correspond to the
-    * logical width, height, and depth of the region as seen by client code.
-    * Accordingly, they do not account for the extra width, height, and/or
-    * depth that must be allocated in order to accommodate multisample
-    * formats, nor do they account for the extra factor of 6 in depth that
-    * must be allocated in order to accommodate cubemap textures.
-    */
-   uint32_t logical_width0, logical_height0, logical_depth0;
-
-   /**
-    * For 1D array, 2D array, cube, and 2D multisampled surfaces on Gen7: true
-    * if the surface only contains LOD 0, and hence no space is for LOD's
-    * other than 0 in between array slices.
-    *
-    * Corresponds to the surface_array_spacing bit in gen7_surface_state.
-    */
-   bool array_spacing_lod0;
-
    /* Derived from the above:
     */
    GLuint total_width;
diff --git a/src/mesa/drivers/dri/i915/intel_pixel.c b/src/mesa/drivers/dri/i915/intel_pixel.c
index feb1a3f..084b563 100644
--- a/src/mesa/drivers/dri/i915/intel_pixel.c
+++ b/src/mesa/drivers/dri/i915/intel_pixel.c
@@ -28,6 +28,7 @@
 #include "main/accum.h"
 #include "main/enums.h"
 #include "main/state.h"
+#include "main/stencil.h"
 #include "main/bufferobj.h"
 #include "main/context.h"
 #include "swrast/swrast.h"
@@ -61,7 +62,7 @@
    if (ctx->NewState)
       _mesa_update_state(ctx);
 
-   if (ctx->FragmentProgram._Enabled) {
+   if (_mesa_arb_fragment_program_enabled(ctx)) {
       DBG("fallback due to fragment program\n");
       return false;
    }
@@ -110,7 +111,7 @@
       return false;
    }
 
-   if (ctx->Stencil._Enabled) {
+   if (_mesa_stencil_is_enabled(ctx)) {
       DBG("fallback due to image stencil\n");
       return false;
    }
diff --git a/src/mesa/drivers/dri/i915/intel_pixel_copy.c b/src/mesa/drivers/dri/i915/intel_pixel_copy.c
index 213cdbd..b4f9466 100644
--- a/src/mesa/drivers/dri/i915/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/i915/intel_pixel_copy.c
@@ -28,6 +28,7 @@
 #include "main/glheader.h"
 #include "main/image.h"
 #include "main/state.h"
+#include "main/stencil.h"
 #include "main/mtypes.h"
 #include "main/condrender.h"
 #include "main/fbobject.h"
@@ -111,14 +112,14 @@
       return false;
    }
 
-   if (ctx->Stencil._Enabled) {
+   if (_mesa_stencil_is_enabled(ctx)) {
       perf_debug("glCopyPixels(): Unsupported stencil test state\n");
       return false;
    }
 
    if (ctx->Fog.Enabled ||
        ctx->Texture._MaxEnabledTexImageUnit != -1 ||
-       ctx->FragmentProgram._Enabled) {
+       _mesa_arb_fragment_program_enabled(ctx)) {
       perf_debug("glCopyPixels(): Unsupported fragment shader state\n");
       return false;
    }
diff --git a/src/mesa/drivers/dri/i915/intel_regions.c b/src/mesa/drivers/dri/i915/intel_regions.c
index c9b776d..be0dca4 100644
--- a/src/mesa/drivers/dri/i915/intel_regions.c
+++ b/src/mesa/drivers/dri/i915/intel_regions.c
@@ -284,15 +284,11 @@
  */
 void
 intel_region_get_tile_masks(struct intel_region *region,
-                            uint32_t *mask_x, uint32_t *mask_y,
-                            bool map_stencil_as_y_tiled)
+                            uint32_t *mask_x, uint32_t *mask_y)
 {
    int cpp = region->cpp;
    uint32_t tiling = region->tiling;
 
-   if (map_stencil_as_y_tiled)
-      tiling = I915_TILING_Y;
-
    switch (tiling) {
    default:
       assert(false);
@@ -317,25 +313,12 @@
  */
 uint32_t
 intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
-                                uint32_t y, bool map_stencil_as_y_tiled)
+                                uint32_t y)
 {
    int cpp = region->cpp;
    uint32_t pitch = region->pitch;
    uint32_t tiling = region->tiling;
 
-   if (map_stencil_as_y_tiled) {
-      tiling = I915_TILING_Y;
-
-      /* When mapping a W-tiled stencil buffer as Y-tiled, each 64-high W-tile
-       * gets transformed into a 32-high Y-tile.  Accordingly, the pitch of
-       * the resulting region is twice the pitch of the original region, since
-       * each row in the Y-tiled view corresponds to two rows in the actual
-       * W-tiled surface.  So we need to correct the pitch before computing
-       * the offsets.
-       */
-      pitch *= 2;
-   }
-
    switch (tiling) {
    default:
       assert(false);
diff --git a/src/mesa/drivers/dri/i915/intel_regions.h b/src/mesa/drivers/dri/i915/intel_regions.h
index 562f7cd..05375f1 100644
--- a/src/mesa/drivers/dri/i915/intel_regions.h
+++ b/src/mesa/drivers/dri/i915/intel_regions.h
@@ -101,12 +101,11 @@
 
 void
 intel_region_get_tile_masks(struct intel_region *region,
-                            uint32_t *mask_x, uint32_t *mask_y,
-                            bool map_stencil_as_y_tiled);
+                            uint32_t *mask_x, uint32_t *mask_y);
 
 uint32_t
 intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
-                                uint32_t y, bool map_stencil_as_y_tiled);
+                                uint32_t y);
 
 /**
  * Used with images created with image_from_names
diff --git a/src/mesa/drivers/dri/i915/intel_screen.c b/src/mesa/drivers/dri/i915/intel_screen.c
index 7e17e95..863f6ef 100644
--- a/src/mesa/drivers/dri/i915/intel_screen.c
+++ b/src/mesa/drivers/dri/i915/intel_screen.c
@@ -47,7 +47,6 @@
 
 DRI_CONF_BEGIN
    DRI_CONF_SECTION_PERFORMANCE
-      DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_ALWAYS_SYNC)
       /* Options correspond to DRI_CONF_BO_REUSE_DISABLED,
        * DRI_CONF_BO_REUSE_ALL
        */
@@ -227,17 +226,20 @@
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
        { 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } },
 
-   /* For YUYV buffers, we set up two overlapping DRI images and treat
-    * them as planar buffers in the compositors.  Plane 0 is GR88 and
-    * samples YU or YV pairs and places Y into the R component, while
-    * plane 1 is ARGB and samples YUYV clusters and places pairs and
-    * places U into the G component and V into A.  This lets the
-    * texture sampler interpolate the Y components correctly when
-    * sampling from plane 0, and interpolate U and V correctly when
-    * sampling from plane 1. */
+   /* For YUYV and UYVY buffers, we set up two overlapping DRI images
+    * and treat them as planar buffers in the compositors.
+    * Plane 0 is GR88 and samples YU or YV pairs and places Y into
+    * the R component, while plane 1 is ARGB/ABGR and samples YUYV/UYVY
+    * clusters and places U into the G component and
+    * V into A.  This lets the texture sampler interpolate the Y
+    * components correctly when sampling from plane 0, and interpolate
+    * U and V correctly when sampling from plane 1. */
    { __DRI_IMAGE_FOURCC_YUYV, __DRI_IMAGE_COMPONENTS_Y_XUXV, 2,
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 },
-       { 0, 1, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } }
+       { 0, 1, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } },
+   { __DRI_IMAGE_FOURCC_UYVY, __DRI_IMAGE_COMPONENTS_Y_UXVX, 2,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 },
+       { 0, 1, 0, __DRI_IMAGE_FORMAT_ABGR8888, 4 } } }
 };
 
 static __DRIimage *
@@ -278,7 +280,7 @@
 
    intel_miptree_check_level_layer(mt, level, zoffset);
 
-   intel_region_get_tile_masks(mt->region, &mask_x, &mask_y, false);
+   intel_region_get_tile_masks(mt->region, &mask_x, &mask_y);
    intel_miptree_get_image_offset(mt, level, zoffset, &draw_x, &draw_y);
 
    image->width = mt->level[level].width;
@@ -288,8 +290,7 @@
 
    image->offset = intel_region_get_aligned_offset(mt->region,
                                                    draw_x & ~mask_x,
-                                                   draw_y & ~mask_y,
-                                                   false);
+                                                   draw_y & ~mask_y);
 
    intel_region_reference(&image->region, mt->region);
 }
@@ -685,7 +686,7 @@
     image->offset = offset;
     intel_setup_image_from_dimensions(image);
 
-    intel_region_get_tile_masks(image->region, &mask_x, &mask_y, false);
+    intel_region_get_tile_masks(image->region, &mask_x, &mask_y);
     if (offset & mask_x)
        _mesa_warning(NULL,
                      "intel_create_sub_image: offset not on tile boundary");
@@ -800,6 +801,7 @@
     &intelImageExtension.base,
     &intelRendererQueryExtension.base,
     &dri2ConfigQueryExtension.base,
+    &dri2NoErrorExtension.base,
     NULL
 };
 
@@ -875,12 +877,11 @@
 
    /* setup the hardware-based renderbuffers */
    rb = intel_create_renderbuffer(rgbFormat);
-   _mesa_add_renderbuffer_without_ref(fb, BUFFER_FRONT_LEFT, &rb->Base.Base);
+   _mesa_attach_and_own_rb(fb, BUFFER_FRONT_LEFT, &rb->Base.Base);
 
    if (mesaVis->doubleBufferMode) {
       rb = intel_create_renderbuffer(rgbFormat);
-      _mesa_add_renderbuffer_without_ref(fb, BUFFER_BACK_LEFT,
-                                            &rb->Base.Base);
+      _mesa_attach_and_own_rb(fb, BUFFER_BACK_LEFT, &rb->Base.Base);
    }
 
    /*
@@ -896,13 +897,13 @@
        * attached to two attachment points.
        */
       rb = intel_create_private_renderbuffer(MESA_FORMAT_Z24_UNORM_S8_UINT);
-      _mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, &rb->Base.Base);
-      _mesa_add_renderbuffer(fb, BUFFER_STENCIL, &rb->Base.Base);
+      _mesa_attach_and_own_rb(fb, BUFFER_DEPTH, &rb->Base.Base);
+      _mesa_attach_and_reference_rb(fb, BUFFER_STENCIL, &rb->Base.Base);
    }
    else if (mesaVis->depthBits == 16) {
       assert(mesaVis->stencilBits == 0);
       rb = intel_create_private_renderbuffer(MESA_FORMAT_Z_UNORM16);
-      _mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, &rb->Base.Base);
+      _mesa_attach_and_own_rb(fb, BUFFER_DEPTH, &rb->Base.Base);
    }
    else {
       assert(mesaVis->depthBits == 0);
@@ -970,7 +971,7 @@
    __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    struct intel_screen *intelScreen = sPriv->driverPrivate;
 
-   if (flags & ~__DRI_CTX_FLAG_DEBUG) {
+   if (flags & ~(__DRI_CTX_FLAG_DEBUG | __DRI_CTX_FLAG_NO_ERROR)) {
       *error = __DRI_CTX_ERROR_UNKNOWN_FLAG;
       return false;
    }
diff --git a/src/mesa/drivers/dri/i915/intel_tex.c b/src/mesa/drivers/dri/i915/intel_tex.c
index 4c48d3b..0b24c7e 100644
--- a/src/mesa/drivers/dri/i915/intel_tex.c
+++ b/src/mesa/drivers/dri/i915/intel_tex.c
@@ -140,10 +140,6 @@
    /* Our texture data is always stored in a miptree. */
    assert(mt);
 
-   /* Check that our caller wasn't confused about how to map a 1D texture. */
-   assert(tex_image->TexObject->Target != GL_TEXTURE_1D_ARRAY ||
-	  h == 1);
-
    /* intel_miptree_map operates on a unified "slice" number that references the
     * cube face, since it's all just slices to the miptree code.
     */
diff --git a/src/mesa/drivers/dri/i915/intel_tex_image.c b/src/mesa/drivers/dri/i915/intel_tex_image.c
index 63ef08b..3308e37 100644
--- a/src/mesa/drivers/dri/i915/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i915/intel_tex_image.c
@@ -141,12 +141,6 @@
       return false;
    }
 
-   if (image->TexObject->Target == GL_TEXTURE_1D_ARRAY ||
-       image->TexObject->Target == GL_TEXTURE_2D_ARRAY) {
-      DBG("%s: no support for array textures\n", __func__);
-      return false;
-   }
-
    src_buffer = intel_bufferobj_source(intel, pbo, 64, &src_offset);
    /* note: potential 64-bit ptr to 32-bit int cast */
    src_offset += (GLuint) (unsigned long) pixels;
diff --git a/src/mesa/drivers/dri/i915/intel_tex_layout.c b/src/mesa/drivers/dri/i915/intel_tex_layout.c
index 01ea165..efe7a8d 100644
--- a/src/mesa/drivers/dri/i915/intel_tex_layout.c
+++ b/src/mesa/drivers/dri/i915/intel_tex_layout.c
@@ -97,14 +97,11 @@
     * | FXT1  compressed format                |  4  |  4  |  4  |  4  |  4  |
     * | Depth Buffer                           |  2  |  2  |  2  |  4  |  4  |
     * | Separate Stencil Buffer                | N/A | N/A | N/A |  4  |  8  |
-    * | Multisampled (4x or 8x) render target  | N/A | N/A | N/A |  4  |  4  |
     * | All Others                             |  2  |  2  |  2  |  2  |  2  |
     * +----------------------------------------------------------------------+
     *
     * On SNB+, non-special cases can be overridden by setting the SURFACE_STATE
     * "Surface Vertical Alignment" field to VALIGN_2 or VALIGN_4.
-    *
-    * We currently don't support multisampling.
     */
    if (_mesa_is_format_compressed(format))
       return 4;
diff --git a/src/mesa/drivers/dri/i915/intel_tris.c b/src/mesa/drivers/dri/i915/intel_tris.c
index 36cba22..70e4fa3 100644
--- a/src/mesa/drivers/dri/i915/intel_tris.c
+++ b/src/mesa/drivers/dri/i915/intel_tris.c
@@ -39,6 +39,7 @@
 #include "main/state.h"
 #include "main/dd.h"
 #include "main/fbobject.h"
+#include "main/state.h"
 
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk
index ac0a444..d75ffb1 100644
--- a/src/mesa/drivers/dri/i965/Android.mk
+++ b/src/mesa/drivers/dri/i965/Android.mk
@@ -29,22 +29,89 @@
 
 I965_PERGEN_COMMON_INCLUDES := \
 	$(MESA_DRI_C_INCLUDES) \
-	$(MESA_TOP)/src/intel
+	$(MESA_TOP)/src/intel \
+	$(MESA_TOP)/include/drm-uapi
 
 I965_PERGEN_SHARED_LIBRARIES := \
-	$(MESA_DRI_SHARED_LIBRARIES) \
-	libdrm_intel
+	$(MESA_DRI_SHARED_LIBRARIES)
 
 I965_PERGEN_STATIC_LIBRARIES := \
 	libmesa_genxml \
 	libmesa_nir
 
 I965_PERGEN_LIBS := \
+	libmesa_i965_gen4 \
+	libmesa_i965_gen45 \
+	libmesa_i965_gen5 \
 	libmesa_i965_gen6 \
 	libmesa_i965_gen7 \
 	libmesa_i965_gen75 \
 	libmesa_i965_gen8 \
-	libmesa_i965_gen9
+	libmesa_i965_gen9 \
+	libmesa_i965_gen10
+
+# ---------------------------------------
+# Build libmesa_i965_gen4
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_i965_gen4
+
+LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES)
+
+LOCAL_SRC_FILES := $(i965_gen4_FILES)
+
+LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES)
+
+LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES)
+
+LOCAL_CFLAGS := -DGEN_VERSIONx10=40
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+# ---------------------------------------
+# Build libmesa_i965_gen45
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_i965_gen45
+
+LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES)
+
+LOCAL_SRC_FILES := $(i965_gen45_FILES)
+
+LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES)
+
+LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES)
+
+LOCAL_CFLAGS := -DGEN_VERSIONx10=45
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+# ---------------------------------------
+# Build libmesa_i965_gen5
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_i965_gen5
+
+LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES)
+
+LOCAL_SRC_FILES := $(i965_gen5_FILES)
+
+LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES)
+
+LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES)
+
+LOCAL_CFLAGS := -DGEN_VERSIONx10=50
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
 
 # ---------------------------------------
 # Build libmesa_i965_gen6
@@ -152,6 +219,27 @@
 include $(BUILD_STATIC_LIBRARY)
 
 # ---------------------------------------
+# Build libmesa_i965_gen10
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_i965_gen10
+
+LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES)
+
+LOCAL_SRC_FILES := $(i965_gen10_FILES)
+
+LOCAL_SHARED_LIBRARIES := $(I965_PERGEN_SHARED_LIBRARIES)
+
+LOCAL_STATIC_LIBRARIES := $(I965_PERGEN_STATIC_LIBRARIES)
+
+LOCAL_CFLAGS := -DGEN_VERSIONx10=100
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+# ---------------------------------------
 # Build i965_dri
 # ---------------------------------------
 
@@ -169,7 +257,8 @@
 endif
 
 LOCAL_C_INCLUDES := \
-	$(MESA_DRI_C_INCLUDES)
+	$(MESA_DRI_C_INCLUDES) \
+	$(MESA_TOP)/include/drm-uapi
 
 LOCAL_SRC_FILES := \
 	$(i965_FILES)
@@ -183,8 +272,7 @@
 	libmesa_intel_compiler
 
 LOCAL_SHARED_LIBRARIES := \
-	$(MESA_DRI_SHARED_LIBRARIES) \
-	libdrm_intel
+	$(MESA_DRI_SHARED_LIBRARIES)
 
 LOCAL_GENERATED_SOURCES := \
 	$(MESA_DRI_OPTIONS_H) \
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index 4e9b062..7ef4e79 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -38,19 +38,34 @@
 	-I$(top_srcdir)/src/compiler/nir \
 	-I$(top_builddir)/src/intel \
 	-I$(top_srcdir)/src/intel \
+	-I$(top_srcdir)/include/drm-uapi \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(LIBDRM_CFLAGS) \
-	$(VALGRIND_CFLAGS)
+	$(VALGRIND_CFLAGS) \
+	-msse2
 
 AM_CXXFLAGS = $(AM_CFLAGS)
 
 I965_PERGEN_LIBS = \
+	libi965_gen4.la \
+	libi965_gen45.la \
+	libi965_gen5.la \
 	libi965_gen6.la \
 	libi965_gen7.la \
 	libi965_gen75.la \
 	libi965_gen8.la \
-	libi965_gen9.la
+	libi965_gen9.la \
+	libi965_gen10.la
+
+libi965_gen4_la_SOURCES = $(i965_gen4_FILES)
+libi965_gen4_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=40
+
+libi965_gen45_la_SOURCES = $(i965_gen45_FILES)
+libi965_gen45_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=45
+
+libi965_gen5_la_SOURCES = $(i965_gen5_FILES)
+libi965_gen5_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=50
 
 libi965_gen6_la_SOURCES = $(i965_gen6_FILES)
 libi965_gen6_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=60
@@ -67,6 +82,9 @@
 libi965_gen9_la_SOURCES = $(i965_gen9_FILES)
 libi965_gen9_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=90
 
+libi965_gen10_la_SOURCES = $(i965_gen10_FILES)
+libi965_gen10_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=100
+
 noinst_LTLIBRARIES = \
 	libi965_dri.la \
 	$(I965_PERGEN_LIBS)
@@ -87,13 +105,22 @@
 
 EXTRA_DIST = \
 	brw_oa_hsw.xml \
+	brw_oa_bdw.xml \
+	brw_oa_chv.xml \
+	brw_oa_sklgt2.xml \
+	brw_oa_sklgt3.xml \
+	brw_oa_sklgt4.xml \
+	brw_oa_bxt.xml \
+	brw_oa_kblgt2.xml \
+	brw_oa_kblgt3.xml \
+	brw_oa_glk.xml \
 	brw_oa.py
 
 # Note: we avoid using a multi target rule here and outputting both the
 # .c and .h files in one go so we don't hit problems with parallel
 # make and multiple invocations of the same script trying to write
 # to the same files.
-brw_oa_hsw.h: brw_oa.py brw_oa_hsw.xml
-	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/brw_oa.py --header=$(builddir)/brw_oa_hsw.h --chipset=hsw $(srcdir)/brw_oa_hsw.xml
-brw_oa_hsw.c: brw_oa.py brw_oa_hsw.xml
-	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/brw_oa.py --code=$(builddir)/brw_oa_hsw.c --chipset=hsw $(srcdir)/brw_oa_hsw.xml
+brw_oa_%.h: brw_oa.py brw_oa_%.xml Makefile.am
+	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/brw_oa.py --header=$(builddir)/brw_oa_$(*).h --chipset=$(*) $(srcdir)/brw_oa_$(*).xml
+brw_oa_%.c: brw_oa.py brw_oa_%.xml Makefile.am
+	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/brw_oa.py --code=$(builddir)/brw_oa_$(*).c --chipset=$(*) $(srcdir)/brw_oa_$(*).xml
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 02dbb55..425c883 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -4,16 +4,8 @@
 	brw_blorp.h \
 	brw_bufmgr.c \
 	brw_bufmgr.h \
-	brw_cc.c \
 	brw_clear.c \
 	brw_clip.c \
-	brw_clip.h \
-	brw_clip_line.c \
-	brw_clip_point.c \
-	brw_clip_state.c \
-	brw_clip_tri.c \
-	brw_clip_unfilled.c \
-	brw_clip_util.c \
 	brw_compute.c \
 	brw_conditional_render.c \
 	brw_context.c \
@@ -28,12 +20,9 @@
 	brw_ff_gs.c \
 	brw_ff_gs_emit.c \
 	brw_ff_gs.h \
-	brw_fs_channel_expressions.cpp \
-	brw_fs_vector_splitting.cpp \
 	brw_formatquery.c \
 	brw_gs.c \
 	brw_gs.h \
-	brw_gs_state.c \
 	brw_gs_surface_state.c \
 	brw_link.cpp \
 	brw_meta_util.c \
@@ -51,11 +40,7 @@
 	brw_primitive_restart.c \
 	brw_queryobj.c \
 	brw_reset.c \
-	brw_sampler_state.c \
 	brw_sf.c \
-	brw_sf_emit.c \
-	brw_sf.h \
-	brw_sf_state.c \
 	brw_state_batch.c \
 	brw_state.h \
 	brw_state_upload.c \
@@ -66,62 +51,33 @@
 	brw_tcs_surface_state.c \
 	brw_tes.c \
 	brw_tes_surface_state.c \
-	brw_tex_layout.c \
 	brw_urb.c \
 	brw_util.c \
 	brw_util.h \
 	brw_vs.c \
 	brw_vs.h \
-	brw_vs_state.c \
 	brw_vs_surface_state.c \
 	brw_wm.c \
 	brw_wm.h \
-	brw_wm_state.c \
 	brw_wm_surface_state.c \
-	gen6_cc.c \
+	gen4_blorp_exec.h \
 	gen6_clip_state.c \
 	gen6_constant_state.c \
 	gen6_depth_state.c \
-	gen6_depthstencil.c \
-	gen6_gs_state.c \
 	gen6_multisample_state.c \
 	gen6_queryobj.c \
 	gen6_sampler_state.c \
-	gen6_scissor_state.c \
-	gen6_sf_state.c \
 	gen6_sol.c \
 	gen6_urb.c \
-	gen6_viewport_state.c \
-	gen6_vs_state.c \
-	gen6_wm_state.c \
 	gen7_cs_state.c \
-	gen7_ds_state.c \
-	gen7_gs_state.c \
-	gen7_hs_state.c \
 	gen7_l3_state.c \
 	gen7_misc_state.c \
-	gen7_sf_state.c \
 	gen7_sol_state.c \
-	gen7_te_state.c \
 	gen7_urb.c \
-	gen7_viewport_state.c \
-	gen7_vs_state.c \
-	gen7_wm_state.c \
 	gen7_wm_surface_state.c \
-	gen8_blend_state.c \
 	gen8_depth_state.c \
-	gen8_draw_upload.c \
-	gen8_ds_state.c \
-	gen8_gs_state.c \
-	gen8_hs_state.c \
 	gen8_multisample_state.c \
-	gen8_ps_state.c \
-	gen8_sf_state.c \
-	gen8_sol_state.c \
 	gen8_surface_state.c \
-	gen8_viewport_state.c \
-	gen8_vs_state.c \
-	gen8_wm_depth_stencil.c \
 	hsw_queryobj.c \
 	hsw_sol.c \
 	intel_batchbuffer.c \
@@ -145,8 +101,6 @@
 	intel_pixel_draw.c \
 	intel_pixel.h \
 	intel_pixel_read.c \
-	intel_resolve_map.c \
-	intel_resolve_map.h \
 	intel_screen.c \
 	intel_screen.h \
 	intel_state.c \
@@ -162,21 +116,60 @@
 	intel_upload.c \
 	libdrm_macros.h
 
+i965_gen4_FILES = \
+	genX_blorp_exec.c \
+	genX_state_upload.c
+
+i965_gen45_FILES = \
+	genX_blorp_exec.c \
+	genX_state_upload.c
+
+i965_gen5_FILES = \
+	genX_blorp_exec.c \
+	genX_state_upload.c
+
 i965_gen6_FILES = \
-	genX_blorp_exec.c
+	genX_blorp_exec.c \
+	genX_state_upload.c
 
 i965_gen7_FILES = \
-	genX_blorp_exec.c
+	genX_blorp_exec.c \
+	genX_state_upload.c
 
 i965_gen75_FILES = \
-	genX_blorp_exec.c
+	genX_blorp_exec.c \
+	genX_state_upload.c
 
 i965_gen8_FILES = \
-	genX_blorp_exec.c
+	genX_blorp_exec.c \
+	genX_state_upload.c
 
 i965_gen9_FILES = \
-	genX_blorp_exec.c
+	genX_blorp_exec.c \
+	genX_state_upload.c
+
+i965_gen10_FILES = \
+	genX_blorp_exec.c \
+	genX_state_upload.c
 
 i965_oa_GENERATED_FILES = \
 	brw_oa_hsw.h \
-	brw_oa_hsw.c
+	brw_oa_hsw.c \
+	brw_oa_bdw.h \
+	brw_oa_bdw.c \
+	brw_oa_chv.h \
+	brw_oa_chv.c \
+	brw_oa_sklgt2.h \
+	brw_oa_sklgt2.c \
+	brw_oa_sklgt3.h \
+	brw_oa_sklgt3.c \
+	brw_oa_sklgt4.h \
+	brw_oa_sklgt4.c \
+	brw_oa_bxt.h \
+	brw_oa_bxt.c \
+	brw_oa_kblgt2.h \
+	brw_oa_kblgt2.c \
+	brw_oa_kblgt3.h \
+	brw_oa_kblgt3.c \
+	brw_oa_glk.h \
+	brw_oa_glk.c
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.c b/src/mesa/drivers/dri/i965/brw_blorp.c
index e550ce5..447a14a 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -71,6 +71,16 @@
    brw->blorp.compiler = brw->screen->compiler;
 
    switch (brw->gen) {
+   case 4:
+      if (brw->is_g4x) {
+         brw->blorp.exec = gen45_blorp_exec;
+      } else {
+         brw->blorp.exec = gen4_blorp_exec;
+      }
+      break;
+   case 5:
+      brw->blorp.exec = gen5_blorp_exec;
+      break;
    case 6:
       brw->blorp.mocs.tex = 0;
       brw->blorp.mocs.rb = 0;
@@ -99,6 +109,12 @@
       brw->blorp.mocs.vb = SKL_MOCS_WB;
       brw->blorp.exec = gen9_blorp_exec;
       break;
+   case 10:
+      brw->blorp.mocs.tex = CNL_MOCS_WB;
+      brw->blorp.mocs.rb = CNL_MOCS_PTE;
+      brw->blorp.mocs.vb = CNL_MOCS_WB;
+      brw->blorp.exec = gen10_blorp_exec;
+      break;
    default:
       unreachable("Invalid gen");
    }
@@ -108,48 +124,19 @@
 }
 
 static void
-apply_gen6_stencil_hiz_offset(struct isl_surf *surf,
-                              struct intel_mipmap_tree *mt,
-                              uint32_t lod,
-                              uint32_t *offset)
-{
-   assert(mt->array_layout == GEN6_HIZ_STENCIL);
-
-   if (mt->format == MESA_FORMAT_S_UINT8) {
-      /* Note: we can't compute the stencil offset using
-       * intel_miptree_get_aligned_offset(), because the miptree
-       * claims that the region is untiled even though it's W tiled.
-       */
-      *offset = mt->level[lod].level_y * mt->pitch +
-                mt->level[lod].level_x * 64;
-   } else {
-      *offset = intel_miptree_get_aligned_offset(mt,
-                                                 mt->level[lod].level_x,
-                                                 mt->level[lod].level_y);
-   }
-
-   surf->logical_level0_px.width = minify(surf->logical_level0_px.width, lod);
-   surf->logical_level0_px.height = minify(surf->logical_level0_px.height, lod);
-   surf->phys_level0_sa.width = minify(surf->phys_level0_sa.width, lod);
-   surf->phys_level0_sa.height = minify(surf->phys_level0_sa.height, lod);
-   surf->levels = 1;
-   surf->array_pitch_el_rows =
-      ALIGN(surf->phys_level0_sa.height, surf->image_alignment_el.height);
-}
-
-static void
 blorp_surf_for_miptree(struct brw_context *brw,
                        struct blorp_surf *surf,
                        struct intel_mipmap_tree *mt,
+                       enum isl_aux_usage aux_usage,
                        bool is_render_target,
-                       uint32_t safe_aux_usage,
                        unsigned *level,
                        unsigned start_layer, unsigned num_layers,
-                       struct isl_surf tmp_surfs[2])
+                       struct isl_surf tmp_surfs[1])
 {
-   if (mt->msaa_layout == INTEL_MSAA_LAYOUT_UMS ||
-       mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) {
-      const unsigned num_samples = MAX2(1, mt->num_samples);
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   if (mt->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY) {
+      const unsigned num_samples = mt->surf.samples;
       for (unsigned i = 0; i < num_layers; i++) {
          for (unsigned s = 0; s < num_samples; s++) {
             const unsigned phys_layer = (start_layer + i) * num_samples + s;
@@ -161,8 +148,7 @@
          intel_miptree_check_level_layer(mt, *level, start_layer + i);
    }
 
-   intel_miptree_get_isl_surf(brw, mt, &tmp_surfs[0]);
-   surf->surf = &tmp_surfs[0];
+   surf->surf = &mt->surf;
    surf->addr = (struct blorp_address) {
       .buffer = mt->bo,
       .offset = mt->offset,
@@ -171,67 +157,27 @@
       .write_domain = is_render_target ? I915_GEM_DOMAIN_RENDER : 0,
    };
 
-   if (brw->gen == 6 && mt->format == MESA_FORMAT_S_UINT8 &&
-       mt->array_layout == GEN6_HIZ_STENCIL) {
-      /* Sandy bridge stencil and HiZ use this GEN6_HIZ_STENCIL hack in
-       * order to allow for layered rendering.  The hack makes each LOD of the
-       * stencil or HiZ buffer a single tightly packed array surface at some
-       * offset into the surface.  Since ISL doesn't know how to deal with the
-       * crazy GEN6_HIZ_STENCIL layout and since we have to do a manual
-       * offset of it anyway, we might as well do the offset here and keep the
-       * hacks inside the i965 driver.
-       *
-       * See also gen6_depth_stencil_state.c
-       */
-      uint32_t offset;
-      apply_gen6_stencil_hiz_offset(&tmp_surfs[0], mt, *level, &offset);
-      surf->addr.offset += offset;
-      *level = 0;
-   }
+   surf->aux_usage = aux_usage;
 
-   struct isl_surf *aux_surf = &tmp_surfs[1];
-   intel_miptree_get_aux_isl_surf(brw, mt, aux_surf, &surf->aux_usage);
+   struct isl_surf *aux_surf = NULL;
+   if (mt->mcs_buf)
+      aux_surf = &mt->mcs_buf->surf;
+   else if (mt->hiz_buf)
+      aux_surf = &mt->hiz_buf->surf;
 
-   if (surf->aux_usage != ISL_AUX_USAGE_NONE) {
-      if (surf->aux_usage == ISL_AUX_USAGE_HIZ) {
-         /* If we're not going to use it as a depth buffer, resolve HiZ */
-         if (!(safe_aux_usage & (1 << ISL_AUX_USAGE_HIZ))) {
-            for (unsigned i = 0; i < num_layers; i++) {
-               intel_miptree_slice_resolve_depth(brw, mt, *level,
-                                                 start_layer + i);
+   if (mt->format == MESA_FORMAT_S_UINT8 && is_render_target &&
+       devinfo->gen <= 7)
+      mt->r8stencil_needs_update = true;
 
-               /* If we're rendering to it then we'll need a HiZ resolve once
-                * we're done before we can use it with HiZ again.
-                */
-               if (is_render_target)
-                  intel_miptree_slice_set_needs_hiz_resolve(mt, *level,
-                                                            start_layer + i);
-            }
-            surf->aux_usage = ISL_AUX_USAGE_NONE;
-         }
-      } else if (!(safe_aux_usage & (1 << surf->aux_usage))) {
-         uint32_t flags = 0;
-         if (safe_aux_usage & (1 << ISL_AUX_USAGE_CCS_E))
-            flags |= INTEL_MIPTREE_IGNORE_CCS_E;
-
-         intel_miptree_resolve_color(brw, mt,
-                                     *level, start_layer, num_layers, flags);
-
-         assert(!intel_miptree_has_color_unresolved(mt, *level, 1,
-                                                    start_layer, num_layers));
-         surf->aux_usage = ISL_AUX_USAGE_NONE;
-      }
-   }
-
-   if (is_render_target)
-      intel_miptree_used_for_rendering(brw, mt, *level,
-                                       start_layer, num_layers);
+   if (surf->aux_usage == ISL_AUX_USAGE_HIZ &&
+       !intel_miptree_level_has_hiz(mt, *level))
+      surf->aux_usage = ISL_AUX_USAGE_NONE;
 
    if (surf->aux_usage != ISL_AUX_USAGE_NONE) {
       /* We only really need a clear color if we also have an auxiliary
        * surface.  Without one, it does nothing.
        */
-      surf->clear_color = intel_miptree_get_isl_clear_color(brw, mt);
+      surf->clear_color = mt->fast_clear_color;
 
       surf->aux_surf = aux_surf;
       surf->aux_addr = (struct blorp_address) {
@@ -246,21 +192,8 @@
       } else {
          assert(surf->aux_usage == ISL_AUX_USAGE_HIZ);
 
-         surf->aux_addr.buffer = mt->hiz_buf->aux_base.bo;
-         surf->aux_addr.offset = mt->hiz_buf->aux_base.offset;
-
-         struct intel_mipmap_tree *hiz_mt = mt->hiz_buf->mt;
-         if (hiz_mt) {
-            assert(brw->gen == 6 && hiz_mt->array_layout == GEN6_HIZ_STENCIL);
-
-            /* gen6 requires the HiZ buffer to be manually offset to the
-             * right location.  We could fixup the surf but it doesn't
-             * matter since most of those fields don't matter.
-             */
-            apply_gen6_stencil_hiz_offset(aux_surf, hiz_mt, *level,
-                                          &surf->aux_addr.offset);
-            assert(hiz_mt->pitch == aux_surf->row_pitch);
-         }
+         surf->aux_addr.buffer = mt->hiz_buf->bo;
+         surf->aux_addr.offset = mt->hiz_buf->offset;
       }
    } else {
       surf->aux_addr = (struct blorp_address) {
@@ -270,6 +203,9 @@
    }
    assert((surf->aux_usage == ISL_AUX_USAGE_NONE) ==
           (surf->aux_addr.buffer == NULL));
+
+   /* ISL wants real levels, not offset ones. */
+   *level -= mt->first_level;
 }
 
 static enum isl_format
@@ -291,8 +227,8 @@
       return ISL_FORMAT_R16_UNORM;
    default: {
       if (is_render_target) {
-         assert(brw->format_supported_as_render_target[format]);
-         return brw->render_target_format[format];
+         assert(brw->mesa_format_supports_render[format]);
+         return brw->mesa_to_isl_render_format[format];
       } else {
          return brw_isl_format_for_mesa_format(format);
       }
@@ -320,25 +256,11 @@
    return (enum isl_channel_select)((swizzle + 4) & 7);
 }
 
-static unsigned
-physical_to_logical_layer(struct intel_mipmap_tree *mt,
-                          unsigned physical_layer)
-{
-   if (mt->num_samples > 1 &&
-       (mt->msaa_layout == INTEL_MSAA_LAYOUT_UMS ||
-        mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS)) {
-      assert(physical_layer % mt->num_samples == 0);
-      return physical_layer / mt->num_samples;
-   } else {
-      return physical_layer;
-   }
-}
-
 /**
  * Note: if the src (or dst) is a 2D multisample array texture on Gen7+ using
  * INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, src_layer (dst_layer) is
  * the physical layer holding sample 0.  So, for example, if
- * src_mt->num_samples == 4, then logical layer n corresponds to src_layer ==
+ * src_mt->surf.samples == 4, then logical layer n corresponds to src_layer ==
  * 4*n.
  */
 void
@@ -356,16 +278,12 @@
                         GLenum filter, bool mirror_x, bool mirror_y,
                         bool decode_srgb, bool encode_srgb)
 {
-   /* Blorp operates in logical layers */
-   src_layer = physical_to_logical_layer(src_mt, src_layer);
-   dst_layer = physical_to_logical_layer(dst_mt, dst_layer);
-
    DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f)"
        "to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
        __func__,
-       src_mt->num_samples, _mesa_get_format_name(src_mt->format), src_mt,
+       src_mt->surf.samples, _mesa_get_format_name(src_mt->format), src_mt,
        src_level, src_layer, src_x0, src_y0, src_x1, src_y1,
-       dst_mt->num_samples, _mesa_get_format_name(dst_mt->format), dst_mt,
+       dst_mt->surf.samples, _mesa_get_format_name(dst_mt->format), dst_mt,
        dst_level, dst_layer, dst_x0, dst_y0, dst_x1, dst_y1,
        mirror_x, mirror_y);
 
@@ -385,29 +303,41 @@
     * R32_FLOAT, so only the contents of the red channel matters.
     */
    if (brw->gen == 6 &&
-       src_mt->num_samples > 1 && dst_mt->num_samples <= 1 &&
+       src_mt->surf.samples > 1 && dst_mt->surf.samples <= 1 &&
        src_mt->format == dst_mt->format &&
        (dst_format == MESA_FORMAT_L_FLOAT32 ||
         dst_format == MESA_FORMAT_I_FLOAT32)) {
       src_format = dst_format = MESA_FORMAT_R_FLOAT32;
    }
 
-   uint32_t src_usage_flags = (1 << ISL_AUX_USAGE_MCS);
-   if (src_format == src_mt->format)
-      src_usage_flags |= (1 << ISL_AUX_USAGE_CCS_E);
+   enum isl_format src_isl_format =
+      brw_blorp_to_isl_format(brw, src_format, false);
+   enum isl_aux_usage src_aux_usage =
+      intel_miptree_texture_aux_usage(brw, src_mt, src_isl_format);
+   /* We do format workarounds for some depth formats so we can't reliably
+    * sample with HiZ.  One of these days, we should fix that.
+    */
+   if (src_aux_usage == ISL_AUX_USAGE_HIZ)
+      src_aux_usage = ISL_AUX_USAGE_NONE;
+   const bool src_clear_supported =
+      src_aux_usage != ISL_AUX_USAGE_NONE && src_mt->format == src_format;
+   intel_miptree_prepare_access(brw, src_mt, src_level, 1, src_layer, 1,
+                                src_aux_usage, src_clear_supported);
 
-   uint32_t dst_usage_flags = (1 << ISL_AUX_USAGE_MCS);
-   if (dst_format == dst_mt->format) {
-      dst_usage_flags |= (1 << ISL_AUX_USAGE_CCS_E) |
-                         (1 << ISL_AUX_USAGE_CCS_D);
-   }
+   enum isl_format dst_isl_format =
+      brw_blorp_to_isl_format(brw, dst_format, true);
+   enum isl_aux_usage dst_aux_usage =
+      intel_miptree_render_aux_usage(brw, dst_mt, dst_isl_format, false);
+   const bool dst_clear_supported = dst_aux_usage != ISL_AUX_USAGE_NONE;
+   intel_miptree_prepare_access(brw, dst_mt, dst_level, 1, dst_layer, 1,
+                                dst_aux_usage, dst_clear_supported);
 
-   struct isl_surf tmp_surfs[4];
+   struct isl_surf tmp_surfs[2];
    struct blorp_surf src_surf, dst_surf;
-   blorp_surf_for_miptree(brw, &src_surf, src_mt, false, src_usage_flags,
+   blorp_surf_for_miptree(brw, &src_surf, src_mt, src_aux_usage, false,
                           &src_level, src_layer, 1, &tmp_surfs[0]);
-   blorp_surf_for_miptree(brw, &dst_surf, dst_mt, true, dst_usage_flags,
-                          &dst_level, dst_layer, 1, &tmp_surfs[2]);
+   blorp_surf_for_miptree(brw, &dst_surf, dst_mt, dst_aux_usage, true,
+                          &dst_level, dst_layer, 1, &tmp_surfs[1]);
 
    struct isl_swizzle src_isl_swizzle = {
       .r = swizzle_to_scs(GET_SWZ(src_swizzle, 0)),
@@ -419,14 +349,16 @@
    struct blorp_batch batch;
    blorp_batch_init(&brw->blorp, &batch, brw, 0);
    blorp_blit(&batch, &src_surf, src_level, src_layer,
-              brw_blorp_to_isl_format(brw, src_format, false), src_isl_swizzle,
+              src_isl_format, src_isl_swizzle,
               &dst_surf, dst_level, dst_layer,
-              brw_blorp_to_isl_format(brw, dst_format, true),
-              ISL_SWIZZLE_IDENTITY,
+              dst_isl_format, ISL_SWIZZLE_IDENTITY,
               src_x0, src_y0, src_x1, src_y1,
               dst_x0, dst_y0, dst_x1, dst_y1,
               filter, mirror_x, mirror_y);
    blorp_batch_finish(&batch);
+
+   intel_miptree_finish_write(brw, dst_mt, dst_level, dst_layer, 1,
+                              dst_aux_usage);
 }
 
 void
@@ -442,21 +374,59 @@
    DBG("%s from %dx %s mt %p %d %d (%d,%d) %dx%d"
        "to %dx %s mt %p %d %d (%d,%d)\n",
        __func__,
-       src_mt->num_samples, _mesa_get_format_name(src_mt->format), src_mt,
+       src_mt->surf.samples, _mesa_get_format_name(src_mt->format), src_mt,
        src_level, src_layer, src_x, src_y, src_width, src_height,
-       dst_mt->num_samples, _mesa_get_format_name(dst_mt->format), dst_mt,
+       dst_mt->surf.samples, _mesa_get_format_name(dst_mt->format), dst_mt,
        dst_level, dst_layer, dst_x, dst_y);
 
-   struct isl_surf tmp_surfs[4];
+   enum isl_aux_usage src_aux_usage, dst_aux_usage;
+   bool src_clear_supported, dst_clear_supported;
+
+   switch (src_mt->aux_usage) {
+   case ISL_AUX_USAGE_MCS:
+   case ISL_AUX_USAGE_CCS_E:
+      src_aux_usage = src_mt->aux_usage;
+      /* Prior to gen9, fast-clear only supported 0/1 clear colors.  Since
+       * we're going to re-interpret the format as an integer format possibly
+       * with a different number of components, we can't handle clear colors
+       * until gen9.
+       */
+      src_clear_supported = brw->gen >= 9;
+      break;
+   default:
+      src_aux_usage = ISL_AUX_USAGE_NONE;
+      src_clear_supported = false;
+      break;
+   }
+
+   switch (dst_mt->aux_usage) {
+   case ISL_AUX_USAGE_MCS:
+   case ISL_AUX_USAGE_CCS_E:
+      dst_aux_usage = dst_mt->aux_usage;
+      /* Prior to gen9, fast-clear only supported 0/1 clear colors.  Since
+       * we're going to re-interpret the format as an integer format possibly
+       * with a different number of components, we can't handle clear colors
+       * until gen9.
+       */
+      dst_clear_supported = brw->gen >= 9;
+      break;
+   default:
+      dst_aux_usage = ISL_AUX_USAGE_NONE;
+      dst_clear_supported = false;
+      break;
+   }
+
+   intel_miptree_prepare_access(brw, src_mt, src_level, 1, src_layer, 1,
+                                src_aux_usage, src_clear_supported);
+   intel_miptree_prepare_access(brw, dst_mt, dst_level, 1, dst_layer, 1,
+                                dst_aux_usage, dst_clear_supported);
+
+   struct isl_surf tmp_surfs[2];
    struct blorp_surf src_surf, dst_surf;
-   blorp_surf_for_miptree(brw, &src_surf, src_mt, false,
-                          (1 << ISL_AUX_USAGE_MCS) |
-                          (1 << ISL_AUX_USAGE_CCS_E),
+   blorp_surf_for_miptree(brw, &src_surf, src_mt, src_aux_usage, false,
                           &src_level, src_layer, 1, &tmp_surfs[0]);
-   blorp_surf_for_miptree(brw, &dst_surf, dst_mt, true,
-                          (1 << ISL_AUX_USAGE_MCS) |
-                          (1 << ISL_AUX_USAGE_CCS_E),
-                          &dst_level, dst_layer, 1, &tmp_surfs[2]);
+   blorp_surf_for_miptree(brw, &dst_surf, dst_mt, dst_aux_usage, true,
+                          &dst_level, dst_layer, 1, &tmp_surfs[1]);
 
    struct blorp_batch batch;
    blorp_batch_init(&brw->blorp, &batch, brw, 0);
@@ -464,6 +434,9 @@
               &dst_surf, dst_level, dst_layer,
               src_x, src_y, dst_x, dst_y, src_width, src_height);
    blorp_batch_finish(&batch);
+
+   intel_miptree_finish_write(brw, dst_mt, dst_level, dst_layer, 1,
+                              dst_aux_usage);
 }
 
 static struct intel_mipmap_tree *
@@ -570,12 +543,25 @@
           (dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT))
          return false;
 
+      /* We also can't handle any combined depth-stencil formats because we
+       * have to reinterpret as a color format.
+       */
+      if (_mesa_get_format_base_format(src_mt->format) == GL_DEPTH_STENCIL ||
+          _mesa_get_format_base_format(dst_mt->format) == GL_DEPTH_STENCIL)
+         return false;
+
       do_blorp_blit(brw, buffer_bit, src_irb, MESA_FORMAT_NONE,
                     dst_irb, MESA_FORMAT_NONE, srcX0, srcY0,
                     srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
                     filter, mirror_x, mirror_y);
       break;
    case GL_STENCIL_BUFFER_BIT:
+      /* Blorp doesn't support combined depth stencil which is all we have
+       * prior to gen6.
+       */
+      if (brw->gen < 6)
+         return false;
+
       src_irb =
          intel_renderbuffer(read_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
       dst_irb =
@@ -618,11 +604,7 @@
    struct intel_mipmap_tree *dst_mt = intel_image->mt;
 
    /* There is support for only up to eight samples. */
-   if (src_mt->num_samples > 8 || dst_mt->num_samples > 8)
-      return false;
-
-   /* BLORP is only supported from Gen6 onwards. */
-   if (brw->gen < 6)
+   if (src_mt->surf.samples > 8 || dst_mt->surf.samples > 8)
       return false;
 
    if (_mesa_get_format_base_format(src_rb->Format) !=
@@ -639,7 +621,14 @@
       return false;
    }
 
-   if (!brw->format_supported_as_render_target[dst_image->TexFormat])
+   /* We also can't handle any combined depth-stencil formats because we
+    * have to reinterpret as a color format.
+    */
+   if (_mesa_get_format_base_format(src_mt->format) == GL_DEPTH_STENCIL ||
+       _mesa_get_format_base_format(dst_mt->format) == GL_DEPTH_STENCIL)
+      return false;
+
+   if (!brw->mesa_format_supports_render[dst_image->TexFormat])
       return false;
 
    /* Source clipping shouldn't be necessary, since copytexsubimage (in
@@ -723,10 +712,6 @@
                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                       GLbitfield mask, GLenum filter)
 {
-   /* BLORP is not supported before Gen6. */
-   if (brw->gen < 6)
-      return mask;
-
    static GLbitfield buffer_bits[] = {
       GL_COLOR_BUFFER_BIT,
       GL_DEPTH_BUFFER_BIT,
@@ -769,24 +754,19 @@
    return disables;
 }
 
-static unsigned
-irb_logical_mt_layer(struct intel_renderbuffer *irb)
-{
-   return physical_to_logical_layer(irb->mt, irb->mt_layer);
-}
-
-static bool
+static void
 do_single_blorp_clear(struct brw_context *brw, struct gl_framebuffer *fb,
                       struct gl_renderbuffer *rb, unsigned buf,
                       bool partial_clear, bool encode_srgb)
 {
    struct gl_context *ctx = &brw->ctx;
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
-   mesa_format format = irb->mt->format;
    uint32_t x0, x1, y0, y1;
 
+   mesa_format format = irb->Base.Base.Format;
    if (!encode_srgb && _mesa_get_format_color_encoding(format) == GL_SRGB)
       format = _mesa_get_srgb_format_linear(format);
+   enum isl_format isl_format = brw->mesa_to_isl_render_format[format];
 
    x0 = fb->_Xmin;
    x1 = fb->_Xmax;
@@ -800,7 +780,7 @@
 
    /* If the clear region is empty, just return. */
    if (x0 == x1 || y0 == y1)
-      return true;
+      return;
 
    bool can_fast_clear = !partial_clear;
 
@@ -808,14 +788,17 @@
    if (set_write_disables(irb, ctx->Color.ColorMask[buf], color_write_disable))
       can_fast_clear = false;
 
-   if (irb->mt->aux_disable & INTEL_AUX_DISABLE_CCS ||
-       !brw_is_color_fast_clear_compatible(brw, irb->mt, &ctx->Color.ClearColor))
+   /* We store clear colors as floats or uints as needed.  If there are
+    * texture views in play, the formats will not properly be respected
+    * during resolves because the resolve operations only know about the
+    * miptree and not the renderbuffer.
+    */
+   if (irb->Base.Base.Format != irb->mt->format)
       can_fast_clear = false;
 
-   const unsigned logical_layer = irb_logical_mt_layer(irb);
-   const enum intel_fast_clear_state fast_clear_state =
-      intel_miptree_get_fast_clear_state(irb->mt, irb->mt_level,
-                                         logical_layer);
+   if (!irb->mt->supports_fast_clear ||
+       !brw_is_color_fast_clear_compatible(brw, irb->mt, &ctx->Color.ClearColor))
+      can_fast_clear = false;
 
    /* Surface state can only record one fast clear color value. Therefore
     * unless different levels/layers agree on the color it can be used to
@@ -825,56 +808,55 @@
    if (irb->layer_count > 1 || irb->mt_level || irb->mt_layer)
       can_fast_clear = false;
 
+   unsigned level = irb->mt_level;
+   const unsigned num_layers = fb->MaxNumLayers ? irb->layer_count : 1;
+
+   /* If the MCS buffer hasn't been allocated yet, we need to allocate it now.
+    */
+   if (can_fast_clear && !irb->mt->mcs_buf) {
+      assert(irb->mt->aux_usage == ISL_AUX_USAGE_CCS_D);
+      if (!intel_miptree_alloc_ccs(brw, irb->mt)) {
+         /* There are a few reasons, in addition to out-of-memory, that can
+          * cause intel_miptree_alloc_ccs to fail.  Try to recover by
+          * falling back to non-fast clear.
+          */
+         can_fast_clear = false;
+      }
+   }
+
    if (can_fast_clear) {
-      union gl_color_union override_color =
+      const enum isl_aux_state aux_state =
+         intel_miptree_get_aux_state(irb->mt, irb->mt_level, irb->mt_layer);
+      union isl_color_value clear_color =
          brw_meta_convert_fast_clear_color(brw, irb->mt,
                                            &ctx->Color.ClearColor);
 
-      /* Record the clear color in the miptree so that it will be
-       * programmed in SURFACE_STATE by later rendering and resolve
-       * operations.
-       */
-      const bool color_updated = brw_meta_set_fast_clear_color(
-                                    brw, &irb->mt->gen9_fast_clear_color,
-                                    &override_color);
+      bool same_clear_color = memcmp(&irb->mt->fast_clear_color,
+                                     &clear_color, sizeof(clear_color)) == 0;
 
       /* If the buffer is already in INTEL_FAST_CLEAR_STATE_CLEAR, the clear
        * is redundant and can be skipped.
        */
-      if (!color_updated && fast_clear_state == INTEL_FAST_CLEAR_STATE_CLEAR)
-         return true;
+      if (aux_state == ISL_AUX_STATE_CLEAR && same_clear_color)
+         return;
 
-      /* If the MCS buffer hasn't been allocated yet, we need to allocate
-       * it now.
+      irb->mt->fast_clear_color = clear_color;
+
+      /* If the clear color has changed, we need to emit a new SURFACE_STATE
+       * on the next draw call.
        */
-      if (!irb->mt->mcs_buf) {
-         assert(!intel_miptree_is_lossless_compressed(brw, irb->mt));
-         if (!intel_miptree_alloc_non_msrt_mcs(brw, irb->mt, false)) {
-            /* MCS allocation failed--probably this will only happen in
-             * out-of-memory conditions.  But in any case, try to recover
-             * by falling back to a non-blorp clear technique.
-             */
-            return false;
-         }
-      }
-   }
+      if (!same_clear_color)
+         ctx->NewDriverState |= BRW_NEW_FAST_CLEAR_COLOR;
 
-   const unsigned num_layers = fb->MaxNumLayers ? irb->layer_count : 1;
-
-   /* We can't setup the blorp_surf until we've allocated the MCS above */
-   struct isl_surf isl_tmp[2];
-   struct blorp_surf surf;
-   unsigned level = irb->mt_level;
-   blorp_surf_for_miptree(brw, &surf, irb->mt, true,
-                          (1 << ISL_AUX_USAGE_MCS) |
-                          (1 << ISL_AUX_USAGE_CCS_E) |
-                          (1 << ISL_AUX_USAGE_CCS_D),
-                          &level, logical_layer, num_layers, isl_tmp);
-
-   if (can_fast_clear) {
       DBG("%s (fast) to mt %p level %d layers %d+%d\n", __FUNCTION__,
           irb->mt, irb->mt_level, irb->mt_layer, num_layers);
 
+      /* We can't set up the blorp_surf until we've allocated the MCS above */
+      struct isl_surf isl_tmp[2];
+      struct blorp_surf surf;
+      blorp_surf_for_miptree(brw, &surf, irb->mt, irb->mt->aux_usage, true,
+                             &level, irb->mt_layer, num_layers, isl_tmp);
+
       /* Ivybrigde PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
        *
        *    "Any transition from any value in {Clear, Render, Resolve} to a
@@ -891,9 +873,8 @@
 
       struct blorp_batch batch;
       blorp_batch_init(&brw->blorp, &batch, brw, 0);
-      blorp_fast_clear(&batch, &surf,
-                       (enum isl_format)brw->render_target_format[format],
-                       level, logical_layer, num_layers,
+      blorp_fast_clear(&batch, &surf, isl_format,
+                       level, irb->mt_layer, num_layers,
                        x0, y0, x1, y1);
       blorp_batch_finish(&batch);
 
@@ -903,31 +884,42 @@
        * INTEL_FAST_CLEAR_STATE_CLEAR so that we won't waste time doing
        * redundant clears.
        */
-      intel_miptree_set_fast_clear_state(brw, irb->mt, irb->mt_level,
-                                         logical_layer, num_layers,
-                                         INTEL_FAST_CLEAR_STATE_CLEAR);
+      intel_miptree_set_aux_state(brw, irb->mt, irb->mt_level,
+                                  irb->mt_layer, num_layers,
+                                  ISL_AUX_STATE_CLEAR);
    } else {
       DBG("%s (slow) to mt %p level %d layer %d+%d\n", __FUNCTION__,
           irb->mt, irb->mt_level, irb->mt_layer, num_layers);
 
+      enum isl_aux_usage aux_usage =
+         intel_miptree_render_aux_usage(brw, irb->mt, isl_format, false);
+      intel_miptree_prepare_render(brw, irb->mt, level, irb->mt_layer,
+                                   num_layers, isl_format, false);
+
+      struct isl_surf isl_tmp[2];
+      struct blorp_surf surf;
+      blorp_surf_for_miptree(brw, &surf, irb->mt, aux_usage, true,
+                             &level, irb->mt_layer, num_layers, isl_tmp);
+
       union isl_color_value clear_color;
       memcpy(clear_color.f32, ctx->Color.ClearColor.f, sizeof(float) * 4);
 
       struct blorp_batch batch;
       blorp_batch_init(&brw->blorp, &batch, brw, 0);
-      blorp_clear(&batch, &surf,
-                  (enum isl_format)brw->render_target_format[format],
-                  ISL_SWIZZLE_IDENTITY,
-                  level, irb_logical_mt_layer(irb), num_layers,
+      blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY,
+                  level, irb->mt_layer, num_layers,
                   x0, y0, x1, y1,
                   clear_color, color_write_disable);
       blorp_batch_finish(&batch);
+
+      intel_miptree_finish_render(brw, irb->mt, level, irb->mt_layer,
+                                  num_layers, isl_format, false);
    }
 
-   return true;
+   return;
 }
 
-bool
+void
 brw_blorp_clear_color(struct brw_context *brw, struct gl_framebuffer *fb,
                       GLbitfield mask, bool partial_clear, bool encode_srgb)
 {
@@ -946,45 +938,151 @@
       if (rb == NULL)
          continue;
 
-      if (!do_single_blorp_clear(brw, fb, rb, buf, partial_clear,
-                                 encode_srgb)) {
-         return false;
-      }
-
+      do_single_blorp_clear(brw, fb, rb, buf, partial_clear, encode_srgb);
       irb->need_downsample = true;
    }
 
-   return true;
+   return;
+}
+
+void
+brw_blorp_clear_depth_stencil(struct brw_context *brw,
+                              struct gl_framebuffer *fb,
+                              GLbitfield mask, bool partial_clear)
+{
+   const struct gl_context *ctx = &brw->ctx;
+   struct gl_renderbuffer *depth_rb =
+      fb->Attachment[BUFFER_DEPTH].Renderbuffer;
+   struct gl_renderbuffer *stencil_rb =
+      fb->Attachment[BUFFER_STENCIL].Renderbuffer;
+
+   if (!depth_rb || ctx->Depth.Mask == GL_FALSE)
+      mask &= ~BUFFER_BIT_DEPTH;
+
+   if (!stencil_rb || (ctx->Stencil.WriteMask[0] & 0xff) == 0)
+      mask &= ~BUFFER_BIT_STENCIL;
+
+   if (!(mask & (BUFFER_BITS_DEPTH_STENCIL)))
+      return;
+
+   uint32_t x0, x1, y0, y1, rb_name, rb_height;
+   if (depth_rb) {
+      rb_name = depth_rb->Name;
+      rb_height = depth_rb->Height;
+      if (stencil_rb) {
+         assert(depth_rb->Width == stencil_rb->Width);
+         assert(depth_rb->Height == stencil_rb->Height);
+      }
+   } else {
+      assert(stencil_rb);
+      rb_name = stencil_rb->Name;
+      rb_height = stencil_rb->Height;
+   }
+
+   x0 = fb->_Xmin;
+   x1 = fb->_Xmax;
+   if (rb_name != 0) {
+      y0 = fb->_Ymin;
+      y1 = fb->_Ymax;
+   } else {
+      y0 = rb_height - fb->_Ymax;
+      y1 = rb_height - fb->_Ymin;
+   }
+
+   /* If the clear region is empty, just return. */
+   if (x0 == x1 || y0 == y1)
+      return;
+
+   uint32_t level, start_layer, num_layers;
+   struct isl_surf isl_tmp[4];
+   struct blorp_surf depth_surf, stencil_surf;
+
+   struct intel_mipmap_tree *depth_mt = NULL;
+   if (mask & BUFFER_BIT_DEPTH) {
+      struct intel_renderbuffer *irb = intel_renderbuffer(depth_rb);
+      depth_mt = find_miptree(GL_DEPTH_BUFFER_BIT, irb);
+
+      level = irb->mt_level;
+      start_layer = irb->mt_layer;
+      num_layers = fb->MaxNumLayers ? irb->layer_count : 1;
+
+      intel_miptree_prepare_depth(brw, depth_mt, level,
+                                  start_layer, num_layers);
+
+      unsigned depth_level = level;
+      blorp_surf_for_miptree(brw, &depth_surf, depth_mt, depth_mt->aux_usage,
+                             true, &depth_level, start_layer, num_layers,
+                             &isl_tmp[0]);
+      assert(depth_level == level);
+   }
+
+   uint8_t stencil_mask = 0;
+   struct intel_mipmap_tree *stencil_mt = NULL;
+   if (mask & BUFFER_BIT_STENCIL) {
+      struct intel_renderbuffer *irb = intel_renderbuffer(stencil_rb);
+      stencil_mt = find_miptree(GL_STENCIL_BUFFER_BIT, irb);
+
+      if (mask & BUFFER_BIT_DEPTH) {
+         assert(level == irb->mt_level);
+         assert(start_layer == irb->mt_layer);
+         assert(num_layers == (fb->MaxNumLayers ? irb->layer_count : 1));
+      } else {
+         level = irb->mt_level;
+         start_layer = irb->mt_layer;
+         num_layers = fb->MaxNumLayers ? irb->layer_count : 1;
+      }
+
+      stencil_mask = ctx->Stencil.WriteMask[0] & 0xff;
+
+      intel_miptree_prepare_access(brw, stencil_mt, level, 1,
+                                   start_layer, num_layers,
+                                   ISL_AUX_USAGE_NONE, false);
+
+      unsigned stencil_level = level;
+      blorp_surf_for_miptree(brw, &stencil_surf, stencil_mt,
+                             ISL_AUX_USAGE_NONE, true,
+                             &stencil_level, start_layer, num_layers,
+                             &isl_tmp[2]);
+   }
+
+   assert((mask & BUFFER_BIT_DEPTH) || stencil_mask);
+
+   struct blorp_batch batch;
+   blorp_batch_init(&brw->blorp, &batch, brw, 0);
+   blorp_clear_depth_stencil(&batch, &depth_surf, &stencil_surf,
+                             level, start_layer, num_layers,
+                             x0, y0, x1, y1,
+                             (mask & BUFFER_BIT_DEPTH), ctx->Depth.Clear,
+                             stencil_mask, ctx->Stencil.Clear);
+   blorp_batch_finish(&batch);
+
+   if (mask & BUFFER_BIT_DEPTH) {
+      intel_miptree_finish_depth(brw, depth_mt, level,
+                                 start_layer, num_layers, true);
+   }
+
+   if (stencil_mask) {
+      intel_miptree_finish_write(brw, stencil_mt, level,
+                                 start_layer, num_layers,
+                                 ISL_AUX_USAGE_NONE);
+   }
 }
 
 void
 brw_blorp_resolve_color(struct brw_context *brw, struct intel_mipmap_tree *mt,
-                        unsigned level, unsigned layer)
+                        unsigned level, unsigned layer,
+                        enum blorp_fast_clear_op resolve_op)
 {
    DBG("%s to mt %p level %u layer %u\n", __FUNCTION__, mt, level, layer);
 
    const mesa_format format = _mesa_get_srgb_format_linear(mt->format);
 
-   struct isl_surf isl_tmp[2];
+   struct isl_surf isl_tmp[1];
    struct blorp_surf surf;
-   blorp_surf_for_miptree(brw, &surf, mt, true,
-                          (1 << ISL_AUX_USAGE_CCS_E) |
-                          (1 << ISL_AUX_USAGE_CCS_D),
+   blorp_surf_for_miptree(brw, &surf, mt, mt->aux_usage, true,
                           &level, layer, 1 /* num_layers */,
                           isl_tmp);
 
-   enum blorp_fast_clear_op resolve_op;
-   if (brw->gen >= 9) {
-      if (surf.aux_usage == ISL_AUX_USAGE_CCS_E)
-         resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
-      else
-         resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL;
-   } else {
-      assert(surf.aux_usage == ISL_AUX_USAGE_CCS_D);
-      /* Broadwell and earlier do not have a partial resolve */
-      resolve_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
-   }
-
    /* Ivybrigde PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
     *
     *    "Any transition from any value in {Clear, Render, Resolve} to a
@@ -1011,20 +1109,29 @@
    brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH);
 }
 
-static void
-gen6_blorp_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
-                    unsigned int level, unsigned int layer, enum blorp_hiz_op op)
+void
+brw_blorp_mcs_partial_resolve(struct brw_context *brw,
+                              struct intel_mipmap_tree *mt,
+                              uint32_t start_layer, uint32_t num_layers)
 {
-   assert(intel_miptree_level_has_hiz(mt, level));
+   DBG("%s to mt %p layers %u-%u\n", __FUNCTION__, mt,
+       start_layer, start_layer + num_layers - 1);
 
-   struct isl_surf isl_tmp[2];
+   assert(mt->aux_usage == ISL_AUX_USAGE_MCS);
+
+   const mesa_format format = _mesa_get_srgb_format_linear(mt->format);
+   enum isl_format isl_format = brw_blorp_to_isl_format(brw, format, true);
+
+   struct isl_surf isl_tmp[1];
    struct blorp_surf surf;
-   blorp_surf_for_miptree(brw, &surf, mt, true, (1 << ISL_AUX_USAGE_HIZ),
-                          &level, layer, 1, isl_tmp);
+   uint32_t level = 0;
+   blorp_surf_for_miptree(brw, &surf, mt, ISL_AUX_USAGE_MCS, true,
+                          &level, start_layer, num_layers, isl_tmp);
 
    struct blorp_batch batch;
    blorp_batch_init(&brw->blorp, &batch, brw, 0);
-   blorp_gen6_hiz_op(&batch, &surf, level, layer, op);
+   blorp_mcs_partial_resolve(&batch, &surf, isl_format,
+                             start_layer, num_layers);
    blorp_batch_finish(&batch);
 }
 
@@ -1042,6 +1149,8 @@
                unsigned int level, unsigned int start_layer,
                unsigned int num_layers, enum blorp_hiz_op op)
 {
+   assert(intel_miptree_level_has_hiz(mt, level));
+   assert(op != BLORP_HIZ_OP_NONE);
    const char *opname = NULL;
 
    switch (op) {
@@ -1063,81 +1172,96 @@
        __func__, opname, mt, level, start_layer, start_layer + num_layers - 1);
 
    /* The following stalls and flushes are only documented to be required for
-    * HiZ clear operations.  However, they also seem to be required for the
-    * HiZ resolve operation which is basically the same as a fast clear only a
-    * different value is written into the HiZ surface.
+    * HiZ clear operations.  However, they also seem to be required for
+    * resolve operations.
     */
-   if (op == BLORP_HIZ_OP_DEPTH_CLEAR || op == BLORP_HIZ_OP_HIZ_RESOLVE) {
-      if (brw->gen == 6) {
-         /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
-          *
-          *   "If other rendering operations have preceded this clear, a
-          *   PIPE_CONTROL with write cache flush enabled and Z-inhibit
-          *   disabled must be issued before the rectangle primitive used for
-          *   the depth buffer clear operation.
-          */
-          brw_emit_pipe_control_flush(brw,
-                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                                      PIPE_CONTROL_CS_STALL);
-      } else if (brw->gen >= 7) {
-         /*
-          * From the Ivybridge PRM, volume 2, "Depth Buffer Clear":
-          *
-          *   If other rendering operations have preceded this clear, a
-          *   PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
-          *   enabled must be issued before the rectangle primitive used for
-          *   the depth buffer clear operation.
-          *
-          * Same applies for Gen8 and Gen9.
-          *
-          * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1
-          * PIPE_CONTROL, Depth Cache Flush Enable:
-          *
-          *   This bit must not be set when Depth Stall Enable bit is set in
-          *   this packet.
-          *
-          * This is confirmed to hold for real, HSW gets immediate gpu hangs.
-          *
-          * Therefore issue two pipe control flushes, one for cache flush and
-          * another for depth stall.
-          */
-          brw_emit_pipe_control_flush(brw,
-                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                                      PIPE_CONTROL_CS_STALL);
+   if (brw->gen == 6) {
+      /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
+       *
+       *   "If other rendering operations have preceded this clear, a
+       *   PIPE_CONTROL with write cache flush enabled and Z-inhibit
+       *   disabled must be issued before the rectangle primitive used for
+       *   the depth buffer clear operation.
+       */
+       brw_emit_pipe_control_flush(brw,
+                                   PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                   PIPE_CONTROL_CS_STALL);
+   } else if (brw->gen >= 7) {
+      /*
+       * From the Ivybridge PRM, volume 2, "Depth Buffer Clear":
+       *
+       *   If other rendering operations have preceded this clear, a
+       *   PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
+       *   enabled must be issued before the rectangle primitive used for
+       *   the depth buffer clear operation.
+       *
+       * Same applies for Gen8 and Gen9.
+       *
+       * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1
+       * PIPE_CONTROL, Depth Cache Flush Enable:
+       *
+       *   This bit must not be set when Depth Stall Enable bit is set in
+       *   this packet.
+       *
+       * This is confirmed to hold for real, HSW gets immediate gpu hangs.
+       *
+       * Therefore issue two pipe control flushes, one for cache flush and
+       * another for depth stall.
+       */
+       brw_emit_pipe_control_flush(brw,
+                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                   PIPE_CONTROL_CS_STALL);
 
-          brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
-      }
+       brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
    }
 
-   if (brw->gen >= 8) {
-      for (unsigned a = 0; a < num_layers; a++)
-         gen8_hiz_exec(brw, mt, level, start_layer + a, op);
-   } else {
-      for (unsigned a = 0; a < num_layers; a++)
-         gen6_blorp_hiz_exec(brw, mt, level, start_layer + a, op);
-   }
+   assert(mt->aux_usage == ISL_AUX_USAGE_HIZ && mt->hiz_buf);
 
+   struct isl_surf isl_tmp[2];
+   struct blorp_surf surf;
+   blorp_surf_for_miptree(brw, &surf, mt, ISL_AUX_USAGE_HIZ, true,
+                          &level, start_layer, num_layers, isl_tmp);
+
+   struct blorp_batch batch;
+   blorp_batch_init(&brw->blorp, &batch, brw, 0);
+   blorp_hiz_op(&batch, &surf, level, start_layer, num_layers, op);
+   blorp_batch_finish(&batch);
 
    /* The following stalls and flushes are only documented to be required for
-    * HiZ clear operations.  However, they also seem to be required for the
-    * HiZ resolve operation which is basically the same as a fast clear only a
-    * different value is written into the HiZ surface.
+    * HiZ clear operations.  However, they also seem to be required for
+    * resolve operations.
     */
-   if (op == BLORP_HIZ_OP_DEPTH_CLEAR || op == BLORP_HIZ_OP_HIZ_RESOLVE) {
-      if (brw->gen == 6) {
-         /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
-          *
-          *     "DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be
-          *     followed by a PIPE_CONTROL command with DEPTH_STALL bit set
-          *     and Then followed by Depth FLUSH'
-         */
-         brw_emit_pipe_control_flush(brw,
-                                     PIPE_CONTROL_DEPTH_STALL);
+   if (brw->gen == 6) {
+      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
+       *
+       *     "DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be
+       *     followed by a PIPE_CONTROL command with DEPTH_STALL bit set
+       *     and Then followed by Depth FLUSH'
+      */
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_STALL);
 
-         brw_emit_pipe_control_flush(brw,
-                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                                     PIPE_CONTROL_CS_STALL);
-      }
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                  PIPE_CONTROL_CS_STALL);
+   } else if (brw->gen >= 8) {
+      /*
+       * From the Broadwell PRM, volume 7, "Depth Buffer Clear":
+       *
+       *    "Depth buffer clear pass using any of the methods (WM_STATE,
+       *    3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a
+       *    PIPE_CONTROL command with DEPTH_STALL bit and Depth FLUSH bits
+       *    "set" before starting to render.  DepthStall and DepthFlush are
+       *    not needed between consecutive depth clear passes nor is it
+       *    required if the depth clear pass was done with
+       *    'full_surf_clear' bit set in the 3DSTATE_WM_HZ_OP."
+       *
+       *  TODO: As the spec says, this could be made conditional.
+       */
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                  PIPE_CONTROL_DEPTH_STALL);
+
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h
index 7f4719f..c65a68a 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -59,20 +59,36 @@
                         unsigned dst_x, unsigned dst_y,
                         unsigned src_width, unsigned src_height);
 
-bool
+void
 brw_blorp_clear_color(struct brw_context *brw, struct gl_framebuffer *fb,
                       GLbitfield mask, bool partial_clear, bool encode_srgb);
+void
+brw_blorp_clear_depth_stencil(struct brw_context *brw,
+                              struct gl_framebuffer *fb,
+                              GLbitfield mask, bool partial_clear);
 
 void
 brw_blorp_resolve_color(struct brw_context *brw,
                         struct intel_mipmap_tree *mt,
-                        unsigned level, unsigned layer);
+                        unsigned level, unsigned layer,
+                        enum blorp_fast_clear_op resolve_op);
+
+void
+brw_blorp_mcs_partial_resolve(struct brw_context *brw,
+                              struct intel_mipmap_tree *mt,
+                              uint32_t start_layer, uint32_t num_layers);
 
 void
 intel_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
                unsigned int level, unsigned int start_layer,
                unsigned int num_layers, enum blorp_hiz_op op);
 
+void gen4_blorp_exec(struct blorp_batch *batch,
+                     const struct blorp_params *params);
+void gen45_blorp_exec(struct blorp_batch *batch,
+                      const struct blorp_params *params);
+void gen5_blorp_exec(struct blorp_batch *batch,
+                     const struct blorp_params *params);
 void gen6_blorp_exec(struct blorp_batch *batch,
                      const struct blorp_params *params);
 void gen7_blorp_exec(struct blorp_batch *batch,
@@ -83,6 +99,8 @@
                      const struct blorp_params *params);
 void gen9_blorp_exec(struct blorp_batch *batch,
                      const struct blorp_params *params);
+void gen10_blorp_exec(struct blorp_batch *batch,
+                      const struct blorp_params *params);
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 4b64331..328ef17 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -56,6 +56,7 @@
 #ifndef ETIME
 #define ETIME ETIMEDOUT
 #endif
+#include "common/gen_clflush.h"
 #include "common/gen_debug.h"
 #include "common/gen_device_info.h"
 #include "libdrm_macros.h"
@@ -77,6 +78,16 @@
 #define VG(x)
 #endif
 
+/* VALGRIND_FREELIKE_BLOCK unfortunately does not actually undo the earlier
+ * VALGRIND_MALLOCLIKE_BLOCK but instead leaves vg convinced the memory is
+ * leaked. All because it does not call VG(cli_free) from its
+ * VG_USERREQ__FREELIKE_BLOCK handler. Instead of treating the memory like
+ * an allocation, we mark it available for use upon mmapping and remove
+ * it upon unmapping.
+ */
+#define VG_DEFINED(ptr, size) VG(VALGRIND_MAKE_MEM_DEFINED(ptr, size))
+#define VG_NOACCESS(ptr, size) VG(VALGRIND_MAKE_MEM_NOACCESS(ptr, size))
+
 #define memclear(s) memset(&s, 0, sizeof(s))
 
 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
@@ -110,6 +121,7 @@
    struct hash_table *handle_table;
 
    bool has_llc:1;
+   bool has_mmap_wc:1;
    bool bo_reuse:1;
 };
 
@@ -254,10 +266,20 @@
    bool alloc_from_cache;
    uint64_t bo_size;
    bool for_render = false;
+   bool zeroed = false;
 
    if (flags & BO_ALLOC_FOR_RENDER)
       for_render = true;
 
+   if (flags & BO_ALLOC_ZEROED)
+      zeroed = true;
+
+   /* FOR_RENDER really means "I'm ok with a busy BO".  This doesn't really
+    * jive with ZEROED as we have to wait for it to be idle before we can
+    * memset.  Just disallow that combination.
+    */
+   assert(!(for_render && zeroed));
+
    /* Round the allocated size up to a power of two number of pages. */
    bucket = bucket_for_size(bufmgr, size);
 
@@ -277,10 +299,12 @@
 retry:
    alloc_from_cache = false;
    if (bucket != NULL && !list_empty(&bucket->head)) {
-      if (for_render) {
+      if (for_render && !zeroed) {
          /* Allocate new render-target BOs from the tail (MRU)
           * of the list, as it will likely be hot in the GPU
-          * cache and in the aperture for us.
+          * cache and in the aperture for us.  If the caller
+          * asked us to zero the buffer, we don't want this
+          * because we are going to mmap it.
           */
          bo = LIST_ENTRY(struct brw_bo, bucket->head.prev, head);
          list_del(&bo->head);
@@ -313,6 +337,15 @@
             bo_free(bo);
             goto retry;
          }
+
+         if (zeroed) {
+            void *map = brw_bo_map(NULL, bo, MAP_WRITE | MAP_RAW);
+            if (!map) {
+               bo_free(bo);
+               goto retry;
+            }
+            memset(map, 0, bo_size);
+         }
       }
    }
 
@@ -324,10 +357,14 @@
          goto err;
 
       bo->size = bo_size;
+      bo->idle = true;
 
       memclear(create);
       create.size = bo_size;
 
+      /* All new BOs we get from the kernel are zeroed, so we don't need to
+       * worry about that here.
+       */
       ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CREATE, &create);
       if (ret != 0) {
          free(bo);
@@ -346,11 +383,25 @@
 
       if (bo_set_tiling_internal(bo, tiling_mode, stride))
          goto err_free;
+
+      /* Calling set_domain() will allocate pages for the BO outside of the
+       * struct mutex lock in the kernel, which is more efficient than waiting
+       * to create them during the first execbuf that uses the BO.
+       */
+      struct drm_i915_gem_set_domain sd = {
+         .handle = bo->gem_handle,
+         .read_domains = I915_GEM_DOMAIN_CPU,
+         .write_domain = 0,
+      };
+
+      if (drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0)
+         goto err_free;
    }
 
    bo->name = name;
    p_atomic_set(&bo->refcount, 1);
    bo->reusable = true;
+   bo->cache_coherent = bufmgr->has_llc;
 
    pthread_mutex_unlock(&bufmgr->lock);
 
@@ -374,8 +425,16 @@
 
 struct brw_bo *
 brw_bo_alloc_tiled(struct brw_bufmgr *bufmgr, const char *name,
-                   int x, int y, int cpp, uint32_t tiling,
-                   uint32_t *pitch, unsigned flags)
+                   uint64_t size, uint32_t tiling_mode, uint32_t pitch,
+                   unsigned flags)
+{
+   return bo_alloc_internal(bufmgr, name, size, flags, tiling_mode, pitch, 0);
+}
+
+struct brw_bo *
+brw_bo_alloc_tiled_2d(struct brw_bufmgr *bufmgr, const char *name,
+                      int x, int y, int cpp, uint32_t tiling,
+                      uint32_t *pitch, unsigned flags)
 {
    uint64_t size;
    uint32_t stride;
@@ -468,12 +527,12 @@
 
    bo->size = open_arg.size;
    bo->offset64 = 0;
-   bo->virtual = NULL;
    bo->bufmgr = bufmgr;
    bo->gem_handle = open_arg.handle;
    bo->name = name;
    bo->global_name = handle;
    bo->reusable = false;
+   bo->external = true;
 
    _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
    _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
@@ -507,16 +566,17 @@
    struct hash_entry *entry;
    int ret;
 
-   if (bo->mem_virtual) {
-      VG(VALGRIND_FREELIKE_BLOCK(bo->mem_virtual, 0));
-      drm_munmap(bo->mem_virtual, bo->size);
+   if (bo->map_cpu) {
+      VG_NOACCESS(bo->map_cpu, bo->size);
+      drm_munmap(bo->map_cpu, bo->size);
    }
-   if (bo->wc_virtual) {
-      VG(VALGRIND_FREELIKE_BLOCK(bo->wc_virtual, 0));
-      drm_munmap(bo->wc_virtual, bo->size);
+   if (bo->map_wc) {
+      VG_NOACCESS(bo->map_wc, bo->size);
+      drm_munmap(bo->map_wc, bo->size);
    }
-   if (bo->gtt_virtual) {
-      drm_munmap(bo->gtt_virtual, bo->size);
+   if (bo->map_gtt) {
+      VG_NOACCESS(bo->map_gtt, bo->size);
+      drm_munmap(bo->map_gtt, bo->size);
    }
 
    if (bo->global_name) {
@@ -537,21 +597,6 @@
    free(bo);
 }
 
-static void
-bo_mark_mmaps_incoherent(struct brw_bo *bo)
-{
-#if HAVE_VALGRIND
-   if (bo->mem_virtual)
-      VALGRIND_MAKE_MEM_NOACCESS(bo->mem_virtual, bo->size);
-
-   if (bo->wc_virtual)
-      VALGRIND_MAKE_MEM_NOACCESS(bo->wc_virtual, bo->size);
-
-   if (bo->gtt_virtual)
-      VALGRIND_MAKE_MEM_NOACCESS(bo->gtt_virtual, bo->size);
-#endif
-}
-
 /** Frees all cached buffers significantly older than @time. */
 static void
 cleanup_bo_cache(struct brw_bufmgr *bufmgr, time_t time)
@@ -585,13 +630,6 @@
 
    DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name);
 
-   /* Clear any left-over mappings */
-   if (bo->map_count) {
-      DBG("bo freed with non-zero map-count %d\n", bo->map_count);
-      bo->map_count = 0;
-      bo_mark_mmaps_incoherent(bo);
-   }
-
    bucket = bucket_for_size(bufmgr, bo->size);
    /* Put the buffer into our internal cache for reuse if we can. */
    if (bufmgr->bo_reuse && bo->reusable && bucket != NULL &&
@@ -599,6 +637,7 @@
       bo->free_time = time;
 
       bo->name = NULL;
+      bo->kflags = 0;
 
       list_addtail(&bo->head, &bucket->head);
    } else {
@@ -632,22 +671,13 @@
 }
 
 static void
-set_domain(struct brw_context *brw, const char *action,
-           struct brw_bo *bo, uint32_t read_domains, uint32_t write_domain)
+bo_wait_with_stall_warning(struct brw_context *brw,
+                           struct brw_bo *bo,
+                           const char *action)
 {
-   struct drm_i915_gem_set_domain sd = {
-      .handle = bo->gem_handle,
-      .read_domains = read_domains,
-      .write_domain = write_domain,
-   };
-
    double elapsed = unlikely(brw && brw->perf_debug) ? -get_time() : 0.0;
 
-   if (drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) {
-      DBG("%s:%d: Error setting memory domains %d (%08x %08x): %s.\n",
-          __FILE__, __LINE__, bo->gem_handle, read_domains, write_domain,
-          strerror(errno));
-   }
+   brw_bo_wait_rendering(bo);
 
    if (unlikely(brw && brw->perf_debug)) {
       elapsed += get_time();
@@ -657,194 +687,280 @@
    }
 }
 
-int
-brw_bo_map(struct brw_context *brw, struct brw_bo *bo, int write_enable)
+static void
+print_flags(unsigned flags)
+{
+   if (flags & MAP_READ)
+      DBG("READ ");
+   if (flags & MAP_WRITE)
+      DBG("WRITE ");
+   if (flags & MAP_ASYNC)
+      DBG("ASYNC ");
+   if (flags & MAP_PERSISTENT)
+      DBG("PERSISTENT ");
+   if (flags & MAP_COHERENT)
+      DBG("COHERENT ");
+   if (flags & MAP_RAW)
+      DBG("RAW ");
+   DBG("\n");
+}
+
+static void *
+brw_bo_map_cpu(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
 {
    struct brw_bufmgr *bufmgr = bo->bufmgr;
-   int ret;
 
-   pthread_mutex_lock(&bufmgr->lock);
+   /* We disallow CPU maps for writing to non-coherent buffers, as the
+    * CPU map can become invalidated when a batch is flushed out, which
+    * can happen at unpredictable times.  You should use WC maps instead.
+    */
+   assert(bo->cache_coherent || !(flags & MAP_WRITE));
 
-   if (!bo->mem_virtual) {
+   if (!bo->map_cpu) {
       struct drm_i915_gem_mmap mmap_arg;
+      void *map;
 
-      DBG("bo_map: %d (%s), map_count=%d\n",
-          bo->gem_handle, bo->name, bo->map_count);
+      DBG("brw_bo_map_cpu: %d (%s)\n", bo->gem_handle, bo->name);
 
       memclear(mmap_arg);
       mmap_arg.handle = bo->gem_handle;
       mmap_arg.size = bo->size;
-      ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg);
+      int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg);
       if (ret != 0) {
          ret = -errno;
          DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
              __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
-         pthread_mutex_unlock(&bufmgr->lock);
-         return ret;
+         return NULL;
       }
-      bo->map_count++;
-      VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, mmap_arg.size, 0, 1));
-      bo->mem_virtual = (void *) (uintptr_t) mmap_arg.addr_ptr;
+      map = (void *) (uintptr_t) mmap_arg.addr_ptr;
+      VG_DEFINED(map, bo->size);
+
+      if (p_atomic_cmpxchg(&bo->map_cpu, NULL, map)) {
+         VG_NOACCESS(map, bo->size);
+         drm_munmap(map, bo->size);
+      }
    }
-   DBG("bo_map: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->mem_virtual);
-   bo->virtual = bo->mem_virtual;
+   assert(bo->map_cpu);
 
-   set_domain(brw, "CPU mapping", bo, I915_GEM_DOMAIN_CPU,
-              write_enable ? I915_GEM_DOMAIN_CPU : 0);
+   DBG("brw_bo_map_cpu: %d (%s) -> %p, ", bo->gem_handle, bo->name,
+       bo->map_cpu);
+   print_flags(flags);
 
-   bo_mark_mmaps_incoherent(bo);
-   VG(VALGRIND_MAKE_MEM_DEFINED(bo->mem_virtual, bo->size));
-   pthread_mutex_unlock(&bufmgr->lock);
+   if (!(flags & MAP_ASYNC)) {
+      bo_wait_with_stall_warning(brw, bo, "CPU mapping");
+   }
 
-   return 0;
+   if (!bo->cache_coherent && !bo->bufmgr->has_llc) {
+      /* If we're reusing an existing CPU mapping, the CPU caches may
+       * contain stale data from the last time we read from that mapping.
+       * (With the BO cache, it might even be data from a previous buffer!)
+       * Even if it's a brand new mapping, the kernel may have zeroed the
+       * buffer via CPU writes.
+       *
+       * We need to invalidate those cachelines so that we see the latest
+       * contents, and so long as we only read from the CPU mmap we do not
+       * need to write those cachelines back afterwards.
+       *
+       * On LLC, the empirical evidence suggests that writes from the GPU
+       * that bypass the LLC (i.e. for scanout) do *invalidate* the CPU
+       * cachelines. (Other reads, such as the display engine, bypass the
+       * LLC entirely requiring us to keep dirty pixels for the scanout
+       * out of any cache.)
+       */
+      gen_invalidate_range(bo->map_cpu, bo->size);
+   }
+
+   return bo->map_cpu;
 }
 
-static int
-map_gtt(struct brw_bo *bo)
+static void *
+brw_bo_map_wc(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
 {
    struct brw_bufmgr *bufmgr = bo->bufmgr;
-   int ret;
+
+   if (!bufmgr->has_mmap_wc)
+      return NULL;
+
+   if (!bo->map_wc) {
+      struct drm_i915_gem_mmap mmap_arg;
+      void *map;
+
+      DBG("brw_bo_map_wc: %d (%s)\n", bo->gem_handle, bo->name);
+
+      memclear(mmap_arg);
+      mmap_arg.handle = bo->gem_handle;
+      mmap_arg.size = bo->size;
+      mmap_arg.flags = I915_MMAP_WC;
+      int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg);
+      if (ret != 0) {
+         ret = -errno;
+         DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+             __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+         return NULL;
+      }
+
+      map = (void *) (uintptr_t) mmap_arg.addr_ptr;
+      VG_DEFINED(map, bo->size);
+
+      if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) {
+         VG_NOACCESS(map, bo->size);
+         drm_munmap(map, bo->size);
+      }
+   }
+   assert(bo->map_wc);
+
+   DBG("brw_bo_map_wc: %d (%s) -> %p, ", bo->gem_handle, bo->name, bo->map_wc);
+   print_flags(flags);
+
+   if (!(flags & MAP_ASYNC)) {
+      bo_wait_with_stall_warning(brw, bo, "WC mapping");
+   }
+
+   return bo->map_wc;
+}
+
+/**
+ * Perform an uncached mapping via the GTT.
+ *
+ * Write access through the GTT is not quite fully coherent. On low power
+ * systems especially, like modern Atoms, we can observe reads from RAM before
+ * the write via GTT has landed. A write memory barrier that flushes the Write
+ * Combining Buffer (i.e. sfence/mfence) is not sufficient to order the later
+ * read after the write as the GTT write suffers a small delay through the GTT
+ * indirection. The kernel uses an uncached mmio read to ensure the GTT write
+ * is ordered with reads (either by the GPU, WB or WC) and unconditionally
+ * flushes prior to execbuf submission. However, if we are not informing the
+ * kernel about our GTT writes, it will not flush before earlier access, such
+ * as when using the cmdparser. Similarly, we need to be careful if we should
+ * ever issue a CPU read immediately following a GTT write.
+ *
+ * Telling the kernel about write access also has one more important
+ * side-effect. Upon receiving notification about the write, it cancels any
+ * scanout buffering for FBC/PSR and friends. Later FBC/PSR is then flushed by
+ * either SW_FINISH or DIRTYFB. The presumption is that we never write to the
+ * actual scanout via an mmapping, only to a backbuffer and so all the FBC/PSR
+ * tracking is handled on the buffer exchange instead.
+ */
+static void *
+brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
+{
+   struct brw_bufmgr *bufmgr = bo->bufmgr;
 
    /* Get a mapping of the buffer if we haven't before. */
-   if (bo->gtt_virtual == NULL) {
+   if (bo->map_gtt == NULL) {
       struct drm_i915_gem_mmap_gtt mmap_arg;
+      void *map;
 
-      DBG("bo_map_gtt: mmap %d (%s), map_count=%d\n",
-          bo->gem_handle, bo->name, bo->map_count);
+      DBG("bo_map_gtt: mmap %d (%s)\n", bo->gem_handle, bo->name);
 
       memclear(mmap_arg);
       mmap_arg.handle = bo->gem_handle;
 
       /* Get the fake offset back... */
-      ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg);
+      int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg);
       if (ret != 0) {
-         ret = -errno;
          DBG("%s:%d: Error preparing buffer map %d (%s): %s .\n",
              __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
-         return ret;
+         return NULL;
       }
 
-      /* and mmap it */
-      bo->gtt_virtual = drm_mmap(0, bo->size, PROT_READ | PROT_WRITE,
-                                 MAP_SHARED, bufmgr->fd, mmap_arg.offset);
-      if (bo->gtt_virtual == MAP_FAILED) {
-         bo->gtt_virtual = NULL;
-         ret = -errno;
+      /* and mmap it. */
+      map = drm_mmap(0, bo->size, PROT_READ | PROT_WRITE,
+                     MAP_SHARED, bufmgr->fd, mmap_arg.offset);
+      if (map == MAP_FAILED) {
          DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
              __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
-         return ret;
+         return NULL;
+      }
+
+      /* We don't need to use VALGRIND_MALLOCLIKE_BLOCK because Valgrind will
+       * already intercept this mmap call. However, for consistency between
+       * all the mmap paths, we mark the pointer as defined now and mark it
+       * as inaccessible afterwards.
+       */
+      VG_DEFINED(map, bo->size);
+
+      if (p_atomic_cmpxchg(&bo->map_gtt, NULL, map)) {
+         VG_NOACCESS(map, bo->size);
+         drm_munmap(map, bo->size);
       }
    }
+   assert(bo->map_gtt);
 
-   bo->map_count++;
-   bo->virtual = bo->gtt_virtual;
+   DBG("bo_map_gtt: %d (%s) -> %p, ", bo->gem_handle, bo->name, bo->map_gtt);
+   print_flags(flags);
 
-   DBG("bo_map_gtt: %d (%s) -> %p\n", bo->gem_handle, bo->name,
-       bo->gtt_virtual);
-
-   return 0;
-}
-
-int
-brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo)
-{
-   struct brw_bufmgr *bufmgr = bo->bufmgr;
-   int ret;
-
-   pthread_mutex_lock(&bufmgr->lock);
-
-   ret = map_gtt(bo);
-   if (ret) {
-      pthread_mutex_unlock(&bufmgr->lock);
-      return ret;
+   if (!(flags & MAP_ASYNC)) {
+      bo_wait_with_stall_warning(brw, bo, "GTT mapping");
    }
 
-   /* Now move it to the GTT domain so that the GPU and CPU
-    * caches are flushed and the GPU isn't actively using the
-    * buffer.
+   return bo->map_gtt;
+}
+
+static bool
+can_map_cpu(struct brw_bo *bo, unsigned flags)
+{
+   if (bo->cache_coherent)
+      return true;
+
+   /* Even if the buffer itself is not cache-coherent (such as a scanout), on
+    * an LLC platform reads always are coherent (as they are performed via the
+    * central system agent). It is just the writes that need special care, to
+    * ensure that they land in main memory and do not stick in the CPU cache.
+    */
+   if (!(flags & MAP_WRITE) && bo->bufmgr->has_llc)
+      return true;
+
+   /* If PERSISTENT or COHERENT are set, the mmapping needs to remain valid
+    * across batch flushes where the kernel will change cache domains of the
+    * bo, invalidating continued access to the CPU mmap on non-LLC devices.
     *
-    * The pagefault handler does this domain change for us when
-    * it has unbound the BO from the GTT, but it's up to us to
-    * tell it when we're about to use things if we had done
-    * rendering and it still happens to be bound to the GTT.
+    * Similarly, ASYNC typically means that the buffer will be accessed via
+    * both the CPU and the GPU simultaneously.  Batches may be executed that
+    * use the BO even while it is mapped.  While OpenGL technically disallows
+    * most drawing while non-persistent mappings are active, we may still use
+    * the GPU for blits or other operations, causing batches to happen at
+    * inconvenient times.
     */
-   set_domain(brw, "GTT mapping", bo,
-              I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
+   if (flags & (MAP_PERSISTENT | MAP_COHERENT | MAP_ASYNC))
+      return false;
 
-   bo_mark_mmaps_incoherent(bo);
-   VG(VALGRIND_MAKE_MEM_DEFINED(bo->gtt_virtual, bo->size));
-   pthread_mutex_unlock(&bufmgr->lock);
-
-   return 0;
+   return !(flags & MAP_WRITE);
 }
 
-/**
- * Performs a mapping of the buffer object like the normal GTT
- * mapping, but avoids waiting for the GPU to be done reading from or
- * rendering to the buffer.
- *
- * This is used in the implementation of GL_ARB_map_buffer_range: The
- * user asks to create a buffer, then does a mapping, fills some
- * space, runs a drawing command, then asks to map it again without
- * synchronizing because it guarantees that it won't write over the
- * data that the GPU is busy using (or, more specifically, that if it
- * does write over the data, it acknowledges that rendering is
- * undefined).
- */
-
-int
-brw_bo_map_unsynchronized(struct brw_context *brw, struct brw_bo *bo)
+void *
+brw_bo_map(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
 {
-   struct brw_bufmgr *bufmgr = bo->bufmgr;
-   int ret;
+   if (bo->tiling_mode != I915_TILING_NONE && !(flags & MAP_RAW))
+      return brw_bo_map_gtt(brw, bo, flags);
 
-   /* If the CPU cache isn't coherent with the GTT, then use a
-    * regular synchronized mapping.  The problem is that we don't
-    * track where the buffer was last used on the CPU side in
-    * terms of brw_bo_map vs brw_bo_map_gtt, so
-    * we would potentially corrupt the buffer even when the user
-    * does reasonable things.
+   void *map;
+
+   if (can_map_cpu(bo, flags))
+      map = brw_bo_map_cpu(brw, bo, flags);
+   else
+      map = brw_bo_map_wc(brw, bo, flags);
+
+   /* Allow the attempt to fail by falling back to the GTT where necessary.
+    *
+    * Not every buffer can be mmapped directly using the CPU (or WC), for
+    * example buffers that wrap stolen memory or are imported from other
+    * devices. For those, we have little choice but to use a GTT mmapping.
+    * However, if we use a slow GTT mmapping for reads where we expected fast
+    * access, that order of magnitude difference in throughput will be clearly
+    * expressed by angry users.
+    *
+    * We skip MAP_RAW because we want to avoid map_gtt's fence detiling.
     */
-   if (!bufmgr->has_llc)
-      return brw_bo_map_gtt(brw, bo);
-
-   pthread_mutex_lock(&bufmgr->lock);
-
-   ret = map_gtt(bo);
-   if (ret == 0) {
-      bo_mark_mmaps_incoherent(bo);
-      VG(VALGRIND_MAKE_MEM_DEFINED(bo->gtt_virtual, bo->size));
+   if (!map && !(flags & MAP_RAW)) {
+      if (brw) {
+         perf_debug("Fallback GTT mapping for %s with access flags %x\n",
+                    bo->name, flags);
+      }
+      map = brw_bo_map_gtt(brw, bo, flags);
    }
 
-   pthread_mutex_unlock(&bufmgr->lock);
-
-   return ret;
-}
-
-int
-brw_bo_unmap(struct brw_bo *bo)
-{
-   struct brw_bufmgr *bufmgr = bo->bufmgr;
-   int ret = 0;
-
-   pthread_mutex_lock(&bufmgr->lock);
-
-   if (bo->map_count <= 0) {
-      DBG("attempted to unmap an unmapped bo\n");
-      pthread_mutex_unlock(&bufmgr->lock);
-      /* Preserve the old behaviour of just treating this as a
-       * no-op rather than reporting the error.
-       */
-      return 0;
-   }
-
-   if (--bo->map_count == 0) {
-      bo_mark_mmaps_incoherent(bo);
-      bo->virtual = NULL;
-   }
-   pthread_mutex_unlock(&bufmgr->lock);
-
-   return ret;
+   return map;
 }
 
 int
@@ -871,36 +987,14 @@
    return ret;
 }
 
-int
-brw_bo_get_subdata(struct brw_bo *bo, uint64_t offset,
-                   uint64_t size, void *data)
-{
-   struct brw_bufmgr *bufmgr = bo->bufmgr;
-   struct drm_i915_gem_pread pread;
-   int ret;
-
-   memclear(pread);
-   pread.handle = bo->gem_handle;
-   pread.offset = offset;
-   pread.size = size;
-   pread.data_ptr = (uint64_t) (uintptr_t) data;
-   ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_PREAD, &pread);
-   if (ret != 0) {
-      ret = -errno;
-      DBG("%s:%d: Error reading data from buffer %d: "
-          "(%"PRIu64" %"PRIu64") %s .\n",
-          __FILE__, __LINE__, bo->gem_handle, offset, size, strerror(errno));
-   }
-
-   return ret;
-}
-
 /** Waits for all GPU rendering with the object to have completed. */
 void
-brw_bo_wait_rendering(struct brw_context *brw, struct brw_bo *bo)
+brw_bo_wait_rendering(struct brw_bo *bo)
 {
-   set_domain(brw, "waiting for",
-              bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
+   /* We require a kernel recent enough for WAIT_IOCTL support.
+    * See intel_init_bufmgr()
+    */
+   brw_bo_wait(bo, -1);
 }
 
 /**
@@ -937,6 +1031,10 @@
    struct drm_i915_gem_wait wait;
    int ret;
 
+   /* If we know it's idle, don't bother with the kernel round trip */
+   if (bo->idle && !bo->external)
+      return 0;
+
    memclear(wait);
    wait.bo_handle = bo->gem_handle;
    wait.timeout_ns = timeout_ns;
@@ -1012,8 +1110,7 @@
 }
 
 struct brw_bo *
-brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr, int prime_fd,
-                             int size)
+brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr, int prime_fd)
 {
    int ret;
    uint32_t handle;
@@ -1054,8 +1151,6 @@
    ret = lseek(prime_fd, 0, SEEK_END);
    if (ret != -1)
       bo->size = ret;
-   else
-      bo->size = size;
 
    bo->bufmgr = bufmgr;
 
@@ -1064,6 +1159,7 @@
 
    bo->name = "prime";
    bo->reusable = false;
+   bo->external = true;
 
    memclear(get_tiling);
    get_tiling.handle = bo->gem_handle;
@@ -1094,6 +1190,7 @@
       return -errno;
 
    bo->reusable = false;
+   bo->external = true;
 
    return 0;
 }
@@ -1115,6 +1212,7 @@
       if (!bo->global_name) {
          bo->global_name = flink.name;
          bo->reusable = false;
+         bo->external = true;
 
          _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
       }
@@ -1220,109 +1318,19 @@
    return ret;
 }
 
-void *
-brw_bo_map__gtt(struct brw_bo *bo)
+static int
+gem_param(int fd, int name)
 {
-   struct brw_bufmgr *bufmgr = bo->bufmgr;
+   drm_i915_getparam_t gp;
+   int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
 
-   if (bo->gtt_virtual)
-      return bo->gtt_virtual;
+   memset(&gp, 0, sizeof(gp));
+   gp.param = name;
+   gp.value = &v;
+   if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+      return -1;
 
-   pthread_mutex_lock(&bufmgr->lock);
-   if (bo->gtt_virtual == NULL) {
-      struct drm_i915_gem_mmap_gtt mmap_arg;
-      void *ptr;
-
-      DBG("bo_map_gtt: mmap %d (%s), map_count=%d\n",
-          bo->gem_handle, bo->name, bo->map_count);
-
-      memclear(mmap_arg);
-      mmap_arg.handle = bo->gem_handle;
-
-      /* Get the fake offset back... */
-      ptr = MAP_FAILED;
-      if (drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg) == 0) {
-         /* and mmap it */
-         ptr = drm_mmap(0, bo->size, PROT_READ | PROT_WRITE,
-                        MAP_SHARED, bufmgr->fd, mmap_arg.offset);
-      }
-      if (ptr == MAP_FAILED) {
-         --bo->map_count;
-         ptr = NULL;
-      }
-
-      bo->gtt_virtual = ptr;
-   }
-   pthread_mutex_unlock(&bufmgr->lock);
-
-   return bo->gtt_virtual;
-}
-
-void *
-brw_bo_map__cpu(struct brw_bo *bo)
-{
-   struct brw_bufmgr *bufmgr = bo->bufmgr;
-
-   if (bo->mem_virtual)
-      return bo->mem_virtual;
-
-   pthread_mutex_lock(&bufmgr->lock);
-   if (!bo->mem_virtual) {
-      struct drm_i915_gem_mmap mmap_arg;
-
-      DBG("bo_map: %d (%s), map_count=%d\n",
-          bo->gem_handle, bo->name, bo->map_count);
-
-      memclear(mmap_arg);
-      mmap_arg.handle = bo->gem_handle;
-      mmap_arg.size = bo->size;
-      if (drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) {
-         DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
-             __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
-      } else {
-         bo->map_count++;
-         VG(VALGRIND_MALLOCLIKE_BLOCK
-            (mmap_arg.addr_ptr, mmap_arg.size, 0, 1));
-         bo->mem_virtual = (void *) (uintptr_t) mmap_arg.addr_ptr;
-      }
-   }
-   pthread_mutex_unlock(&bufmgr->lock);
-
-   return bo->mem_virtual;
-}
-
-void *
-brw_bo_map__wc(struct brw_bo *bo)
-{
-   struct brw_bufmgr *bufmgr = bo->bufmgr;
-
-   if (bo->wc_virtual)
-      return bo->wc_virtual;
-
-   pthread_mutex_lock(&bufmgr->lock);
-   if (!bo->wc_virtual) {
-      struct drm_i915_gem_mmap mmap_arg;
-
-      DBG("bo_map: %d (%s), map_count=%d\n",
-          bo->gem_handle, bo->name, bo->map_count);
-
-      memclear(mmap_arg);
-      mmap_arg.handle = bo->gem_handle;
-      mmap_arg.size = bo->size;
-      mmap_arg.flags = I915_MMAP_WC;
-      if (drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) {
-         DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
-             __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
-      } else {
-         bo->map_count++;
-         VG(VALGRIND_MALLOCLIKE_BLOCK
-            (mmap_arg.addr_ptr, mmap_arg.size, 0, 1));
-         bo->wc_virtual = (void *) (uintptr_t) mmap_arg.addr_ptr;
-      }
-   }
-   pthread_mutex_unlock(&bufmgr->lock);
-
-   return bo->wc_virtual;
+   return v;
 }
 
 /**
@@ -1357,6 +1365,7 @@
    }
 
    bufmgr->has_llc = devinfo->has_llc;
+   bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;
 
    init_cache_buckets(bufmgr);
 
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 1b1790a..6a6051b 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -62,16 +62,6 @@
     */
    uint64_t align;
 
-   /**
-    * Virtual address for accessing the buffer data.  Only valid while
-    * mapped.
-    */
-#ifdef __cplusplus
-   void *virt;
-#else
-   void *virtual;
-#endif
-
    /** Buffer manager context associated with this buffer object */
    struct brw_bufmgr *bufmgr;
 
@@ -97,6 +87,11 @@
    int refcount;
    const char *name;
 
+#ifndef EXEC_OBJECT_CAPTURE
+#define EXEC_OBJECT_CAPTURE            (1<<7)
+#endif
+   uint64_t kflags;
+
    /**
     * Kenel-assigned global name for this object
     *
@@ -114,12 +109,11 @@
    time_t free_time;
 
    /** Mapped address for the buffer, saved across map/unmap cycles */
-   void *mem_virtual;
+   void *map_cpu;
    /** GTT virtual address for the buffer, saved across map/unmap cycles */
-   void *gtt_virtual;
+   void *map_gtt;
    /** WC CPU address for the buffer, saved across map/unmap cycles */
-   void *wc_virtual;
-   int map_count;
+   void *map_wc;
 
    /** BO cache list */
    struct list_head head;
@@ -128,16 +122,27 @@
     * Boolean of whether this buffer can be re-used
     */
    bool reusable;
+
+   /**
+    * Boolean of whether this buffer has been shared with an external client.
+    */
+   bool external;
+
+   /**
+    * Boolean of whether this buffer is cache coherent
+    */
+   bool cache_coherent;
 };
 
 #define BO_ALLOC_FOR_RENDER (1<<0)
+#define BO_ALLOC_ZEROED     (1<<1)
 
 /**
  * Allocate a buffer object.
  *
  * Buffer objects are not necessarily initially mapped into CPU virtual
  * address space or graphics device aperture.  They must be mapped
- * using bo_map() or brw_bo_map_gtt() to be used by the CPU.
+ * using brw_bo_map() to be used by the CPU.
  */
 struct brw_bo *brw_bo_alloc(struct brw_bufmgr *bufmgr, const char *name,
                             uint64_t size, uint64_t alignment);
@@ -152,17 +157,35 @@
  *  I915_TILING_NONE
  *  I915_TILING_X
  *  I915_TILING_Y
+ */
+struct brw_bo *brw_bo_alloc_tiled(struct brw_bufmgr *bufmgr,
+                                  const char *name,
+                                  uint64_t size,
+                                  uint32_t tiling_mode,
+                                  uint32_t pitch,
+                                  unsigned flags);
+
+/**
+ * Allocate a tiled buffer object.
+ *
+ * Alignment for tiled objects is set automatically; the 'flags'
+ * argument provides a hint about how the object will be used initially.
+ *
+ * Valid tiling formats are:
+ *  I915_TILING_NONE
+ *  I915_TILING_X
+ *  I915_TILING_Y
  *
  * Note the tiling format may be rejected; callers should check the
  * 'tiling_mode' field on return, as well as the pitch value, which
  * may have been rounded up to accommodate for tiling restrictions.
  */
-struct brw_bo *brw_bo_alloc_tiled(struct brw_bufmgr *bufmgr,
-                                  const char *name,
-                                  int x, int y, int cpp,
-                                  uint32_t tiling_mode,
-                                  uint32_t *pitch,
-                                  unsigned flags);
+struct brw_bo *brw_bo_alloc_tiled_2d(struct brw_bufmgr *bufmgr,
+                                     const char *name,
+                                     int x, int y, int cpp,
+                                     uint32_t tiling_mode,
+                                     uint32_t *pitch,
+                                     unsigned flags);
 
 /** Takes a reference on a buffer object */
 void brw_bo_reference(struct brw_bo *bo);
@@ -173,27 +196,33 @@
  */
 void brw_bo_unreference(struct brw_bo *bo);
 
+/* Must match MapBufferRange interface (for convenience) */
+#define MAP_READ        GL_MAP_READ_BIT
+#define MAP_WRITE       GL_MAP_WRITE_BIT
+#define MAP_ASYNC       GL_MAP_UNSYNCHRONIZED_BIT
+#define MAP_PERSISTENT  GL_MAP_PERSISTENT_BIT
+#define MAP_COHERENT    GL_MAP_COHERENT_BIT
+/* internal */
+#define MAP_INTERNAL_MASK       (0xff << 24)
+#define MAP_RAW                 (0x01 << 24)
+
 /**
  * Maps the buffer into userspace.
  *
  * This function will block waiting for any existing execution on the
- * buffer to complete, first.  The resulting mapping is available at
- * buf->virtual.
+ * buffer to complete, first.  The resulting mapping is returned.
  */
-int brw_bo_map(struct brw_context *brw, struct brw_bo *bo, int write_enable);
+MUST_CHECK void *brw_bo_map(struct brw_context *brw, struct brw_bo *bo, unsigned flags);
 
 /**
  * Reduces the refcount on the userspace mapping of the buffer
  * object.
  */
-int brw_bo_unmap(struct brw_bo *bo);
+static inline int brw_bo_unmap(struct brw_bo *bo) { return 0; }
 
 /** Write data into an object. */
 int brw_bo_subdata(struct brw_bo *bo, uint64_t offset,
                    uint64_t size, const void *data);
-/** Read data from an object. */
-int brw_bo_get_subdata(struct brw_bo *bo, uint64_t offset,
-                       uint64_t size, void *data);
 /**
  * Waits for rendering to an object by the GPU to have completed.
  *
@@ -201,7 +230,7 @@
  * bo_subdata, etc.  It is merely a way for the driver to implement
  * glFinish.
  */
-void brw_bo_wait_rendering(struct brw_context *brw, struct brw_bo *bo);
+void brw_bo_wait_rendering(struct brw_bo *bo);
 
 /**
  * Tears down the buffer manager instance.
@@ -253,12 +282,6 @@
                                            const char *name,
                                            unsigned int handle);
 void brw_bufmgr_enable_reuse(struct brw_bufmgr *bufmgr);
-int brw_bo_map_unsynchronized(struct brw_context *brw, struct brw_bo *bo);
-int brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo);
-
-void *brw_bo_map__cpu(struct brw_bo *bo);
-void *brw_bo_map__gtt(struct brw_bo *bo);
-void *brw_bo_map__wc(struct brw_bo *bo);
 
 int brw_bo_wait(struct brw_bo *bo, int64_t timeout_ns);
 
@@ -267,7 +290,7 @@
 
 int brw_bo_gem_export_to_prime(struct brw_bo *bo, int *prime_fd);
 struct brw_bo *brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr,
-                                            int prime_fd, int size);
+                                            int prime_fd);
 
 int brw_reg_read(struct brw_bufmgr *bufmgr, uint32_t offset,
                  uint64_t *result);
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
deleted file mode 100644
index 21b01f3..0000000
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-#include "main/macros.h"
-#include "main/stencil.h"
-#include "intel_batchbuffer.h"
-
-static void
-brw_upload_cc_vp(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   struct brw_cc_viewport *ccv;
-
-   /* BRW_NEW_VIEWPORT_COUNT */
-   const unsigned viewport_count = brw->clip.viewport_count;
-
-   ccv = brw_state_batch(brw, sizeof(*ccv) * viewport_count, 32,
-                         &brw->cc.vp_offset);
-
-   /* _NEW_TRANSFORM */
-   for (unsigned i = 0; i < viewport_count; i++) {
-      if (ctx->Transform.DepthClamp) {
-         /* _NEW_VIEWPORT */
-         ccv[i].min_depth = MIN2(ctx->ViewportArray[i].Near,
-                                 ctx->ViewportArray[i].Far);
-         ccv[i].max_depth = MAX2(ctx->ViewportArray[i].Near,
-                                 ctx->ViewportArray[i].Far);
-      } else {
-         ccv[i].min_depth = 0.0;
-         ccv[i].max_depth = 1.0;
-      }
-   }
-
-   if (brw->gen >= 7) {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_VIEWPORT_STATE_POINTERS_CC << 16 | (2 - 2));
-      OUT_BATCH(brw->cc.vp_offset);
-      ADVANCE_BATCH();
-   } else {
-      brw->ctx.NewDriverState |= BRW_NEW_CC_VP;
-   }
-}
-
-const struct brw_tracked_state brw_cc_vp = {
-   .dirty = {
-      .mesa = _NEW_TRANSFORM |
-              _NEW_VIEWPORT,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VIEWPORT_COUNT,
-   },
-   .emit = brw_upload_cc_vp
-};
-
-/**
- * Modify blend function to force destination alpha to 1.0
- *
- * If \c function specifies a blend function that uses destination alpha,
- * replace it with a function that hard-wires destination alpha to 1.0.  This
- * is used when rendering to xRGB targets.
- */
-GLenum
-brw_fix_xRGB_alpha(GLenum function)
-{
-   switch (function) {
-   case GL_DST_ALPHA:
-      return GL_ONE;
-
-   case GL_ONE_MINUS_DST_ALPHA:
-   case GL_SRC_ALPHA_SATURATE:
-      return GL_ZERO;
-   }
-
-   return function;
-}
-
-/**
- * Creates a CC unit packet from the current blend state.
- */
-static void upload_cc_unit(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   struct brw_cc_unit_state *cc;
-
-   cc = brw_state_batch(brw, sizeof(*cc), 64, &brw->cc.state_offset);
-   memset(cc, 0, sizeof(*cc));
-
-   /* _NEW_STENCIL | _NEW_BUFFERS */
-   if (ctx->Stencil._Enabled) {
-      const unsigned back = ctx->Stencil._BackFace;
-
-      cc->cc0.stencil_enable = 1;
-      cc->cc0.stencil_func =
-	 intel_translate_compare_func(ctx->Stencil.Function[0]);
-      cc->cc0.stencil_fail_op =
-	 intel_translate_stencil_op(ctx->Stencil.FailFunc[0]);
-      cc->cc0.stencil_pass_depth_fail_op =
-	 intel_translate_stencil_op(ctx->Stencil.ZFailFunc[0]);
-      cc->cc0.stencil_pass_depth_pass_op =
-	 intel_translate_stencil_op(ctx->Stencil.ZPassFunc[0]);
-      cc->cc1.stencil_ref = _mesa_get_stencil_ref(ctx, 0);
-      cc->cc1.stencil_write_mask = ctx->Stencil.WriteMask[0];
-      cc->cc1.stencil_test_mask = ctx->Stencil.ValueMask[0];
-
-      if (ctx->Stencil._TestTwoSide) {
-	 cc->cc0.bf_stencil_enable = 1;
-	 cc->cc0.bf_stencil_func =
-	    intel_translate_compare_func(ctx->Stencil.Function[back]);
-	 cc->cc0.bf_stencil_fail_op =
-	    intel_translate_stencil_op(ctx->Stencil.FailFunc[back]);
-	 cc->cc0.bf_stencil_pass_depth_fail_op =
-	    intel_translate_stencil_op(ctx->Stencil.ZFailFunc[back]);
-	 cc->cc0.bf_stencil_pass_depth_pass_op =
-	    intel_translate_stencil_op(ctx->Stencil.ZPassFunc[back]);
-	 cc->cc1.bf_stencil_ref = _mesa_get_stencil_ref(ctx, back);
-	 cc->cc2.bf_stencil_write_mask = ctx->Stencil.WriteMask[back];
-	 cc->cc2.bf_stencil_test_mask = ctx->Stencil.ValueMask[back];
-      }
-
-      /* Not really sure about this:
-       */
-      if (ctx->Stencil.WriteMask[0] ||
-	  (ctx->Stencil._TestTwoSide && ctx->Stencil.WriteMask[back]))
-	 cc->cc0.stencil_write_enable = 1;
-   }
-
-   /* _NEW_COLOR */
-   if (ctx->Color.ColorLogicOpEnabled && ctx->Color.LogicOp != GL_COPY) {
-      cc->cc2.logicop_enable = 1;
-      cc->cc5.logicop_func = intel_translate_logic_op(ctx->Color.LogicOp);
-   } else if (ctx->Color.BlendEnabled && !ctx->Color._AdvancedBlendMode) {
-      GLenum eqRGB = ctx->Color.Blend[0].EquationRGB;
-      GLenum eqA = ctx->Color.Blend[0].EquationA;
-      GLenum srcRGB = ctx->Color.Blend[0].SrcRGB;
-      GLenum dstRGB = ctx->Color.Blend[0].DstRGB;
-      GLenum srcA = ctx->Color.Blend[0].SrcA;
-      GLenum dstA = ctx->Color.Blend[0].DstA;
-
-      /* If the renderbuffer is XRGB, we have to frob the blend function to
-       * force the destination alpha to 1.0.  This means replacing GL_DST_ALPHA
-       * with GL_ONE and GL_ONE_MINUS_DST_ALPHA with GL_ZERO.
-       */
-      if (ctx->DrawBuffer->Visual.alphaBits == 0) {
-	 srcRGB = brw_fix_xRGB_alpha(srcRGB);
-	 srcA   = brw_fix_xRGB_alpha(srcA);
-	 dstRGB = brw_fix_xRGB_alpha(dstRGB);
-	 dstA   = brw_fix_xRGB_alpha(dstA);
-      }
-
-      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
-	 srcRGB = dstRGB = GL_ONE;
-      }
-
-      if (eqA == GL_MIN || eqA == GL_MAX) {
-	 srcA = dstA = GL_ONE;
-      }
-
-      cc->cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
-      cc->cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
-      cc->cc6.blend_function = brw_translate_blend_equation(eqRGB);
-
-      cc->cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
-      cc->cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
-      cc->cc5.ia_blend_function = brw_translate_blend_equation(eqA);
-
-      cc->cc3.blend_enable = 1;
-      cc->cc3.ia_blend_enable = (srcA != srcRGB ||
-				dstA != dstRGB ||
-				eqA != eqRGB);
-   }
-
-   /* _NEW_BUFFERS */
-   if (ctx->Color.AlphaEnabled && ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
-      cc->cc3.alpha_test = 1;
-      cc->cc3.alpha_test_func =
-	 intel_translate_compare_func(ctx->Color.AlphaFunc);
-      cc->cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
-
-      UNCLAMPED_FLOAT_TO_UBYTE(cc->cc7.alpha_ref.ub[0], ctx->Color.AlphaRef);
-   }
-
-   if (ctx->Color.DitherFlag) {
-      cc->cc5.dither_enable = 1;
-      cc->cc6.y_dither_offset = 0;
-      cc->cc6.x_dither_offset = 0;
-   }
-
-   /* _NEW_DEPTH */
-   if (ctx->Depth.Test) {
-      cc->cc2.depth_test = 1;
-      cc->cc2.depth_test_function =
-	 intel_translate_compare_func(ctx->Depth.Func);
-      cc->cc2.depth_write_enable = brw_depth_writes_enabled(brw);
-   }
-
-   if (brw->stats_wm || unlikely(INTEL_DEBUG & DEBUG_STATS))
-      cc->cc5.statistics_enable = 1;
-
-   /* BRW_NEW_CC_VP */
-   cc->cc4.cc_viewport_state_offset = (brw->batch.bo->offset64 +
-				       brw->cc.vp_offset) >> 5; /* reloc */
-
-   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
-
-   /* Emit CC viewport relocation */
-   brw_emit_reloc(&brw->batch,
-                  (brw->cc.state_offset +
-                   offsetof(struct brw_cc_unit_state, cc4)),
-                  brw->batch.bo, brw->cc.vp_offset,
-                  I915_GEM_DOMAIN_INSTRUCTION, 0);
-}
-
-const struct brw_tracked_state brw_cc_unit = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_COLOR |
-              _NEW_DEPTH |
-              _NEW_STENCIL,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_CC_VP |
-             BRW_NEW_STATS_WM,
-   },
-   .emit = upload_cc_unit,
-};
-
-static void upload_blend_constant_color(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-
-   BEGIN_BATCH(5);
-   OUT_BATCH(_3DSTATE_BLEND_CONSTANT_COLOR << 16 | (5-2));
-   OUT_BATCH_F(ctx->Color.BlendColorUnclamped[0]);
-   OUT_BATCH_F(ctx->Color.BlendColorUnclamped[1]);
-   OUT_BATCH_F(ctx->Color.BlendColorUnclamped[2]);
-   OUT_BATCH_F(ctx->Color.BlendColorUnclamped[3]);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state brw_blend_constant_color = {
-   .dirty = {
-      .mesa = _NEW_COLOR,
-      .brw = BRW_NEW_CONTEXT |
-             BRW_NEW_BLORP,
-   },
-   .emit = upload_blend_constant_color
-};
diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c
index 2eac555..5118d96 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -121,11 +121,11 @@
    if ((ctx->Scissor.EnableFlags & 1) && !noop_scissor(fb)) {
       perf_debug("Failed to fast clear %dx%d depth because of scissors.  "
                  "Possible 5%% performance win if avoided.\n",
-                 mt->logical_width0, mt->logical_height0);
+                 mt->surf.logical_level0_px.width,
+                 mt->surf.logical_level0_px.height);
       return false;
    }
 
-   uint32_t depth_clear_value;
    switch (mt->format) {
    case MESA_FORMAT_Z32_FLOAT_S8X24_UINT:
    case MESA_FORMAT_Z24_UNORM_S8_UINT:
@@ -139,10 +139,6 @@
        */
       return false;
 
-   case MESA_FORMAT_Z_FLOAT32:
-      depth_clear_value = float_as_int(ctx->Depth.Clear);
-      break;
-
    case MESA_FORMAT_Z_UNORM16:
       /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
        *
@@ -154,28 +150,36 @@
        *        optimization must be disabled.
        */
       if (brw->gen == 6 &&
-          (minify(mt->physical_width0,
+          (minify(mt->surf.phys_level0_sa.width,
                   depth_irb->mt_level - mt->first_level) % 16) != 0)
 	 return false;
-      /* FALLTHROUGH */
+      break;
 
    default:
-      if (brw->gen >= 8)
-         depth_clear_value = float_as_int(ctx->Depth.Clear);
-      else
-         depth_clear_value = fb->_DepthMax * ctx->Depth.Clear;
       break;
    }
 
+   /* Quantize the clear value to what can be stored in the actual depth
+    * buffer.  This makes the following check more accurate because it now
+    * checks if the actual depth bits will match.  It also prevents us from
+    * getting a too-accurate depth value during depth testing or when sampling
+    * with HiZ enabled.
+    */
+   float clear_value =
+      mt->format == MESA_FORMAT_Z_FLOAT32 ? ctx->Depth.Clear :
+      (unsigned)(ctx->Depth.Clear * fb->_DepthMax) / (float)fb->_DepthMax;
+
    /* If we're clearing to a new clear value, then we need to resolve any clear
     * flags out of the HiZ buffer into the real depth buffer.
     */
-   if (mt->depth_clear_value != depth_clear_value) {
-      intel_miptree_all_slices_resolve_depth(brw, mt);
-      mt->depth_clear_value = depth_clear_value;
+   if (mt->fast_clear_color.f32[0] != clear_value) {
+      intel_miptree_prepare_access(brw, mt, 0, INTEL_REMAINING_LEVELS,
+                                   0, INTEL_REMAINING_LAYERS,
+                                   ISL_AUX_USAGE_HIZ, false);
+      mt->fast_clear_color.f32[0] = clear_value;
    }
 
-   if (fb->MaxNumLayers > 0) {
+   if (depth_att->Layered) {
       intel_hiz_exec(brw, mt, depth_irb->mt_level,
                      depth_irb->mt_layer, depth_irb->layer_count,
                      BLORP_HIZ_OP_DEPTH_CLEAR);
@@ -187,7 +191,15 @@
    /* Now, the HiZ buffer contains data that needs to be resolved to the depth
     * buffer.
     */
-   intel_renderbuffer_att_set_needs_depth_resolve(depth_att);
+   if (depth_att->Layered) {
+      intel_miptree_set_aux_state(brw, mt, depth_irb->mt_level,
+                                  depth_irb->mt_layer, depth_irb->layer_count,
+                                  ISL_AUX_STATE_CLEAR);
+   } else {
+      intel_miptree_set_aux_state(brw, mt, depth_irb->mt_level,
+                                  depth_irb->mt_layer, 1,
+                                  ISL_AUX_STATE_CLEAR);
+   }
 
    return true;
 }
@@ -227,31 +239,32 @@
          mt->stencil_mt->r8stencil_needs_update = true;
    }
 
-   /* BLORP is currently only supported on Gen6+. */
-   if (brw->gen >= 6 && (mask & BUFFER_BITS_COLOR)) {
-      const bool encode_srgb = ctx->Color.sRGBEnabled;
-      if (brw_blorp_clear_color(brw, fb, mask, partial_clear, encode_srgb)) {
-         debug_mask("blorp color", mask & BUFFER_BITS_COLOR);
-         mask &= ~BUFFER_BITS_COLOR;
-      }
+   if (mask & BUFFER_BITS_COLOR) {
+      brw_blorp_clear_color(brw, fb, mask, partial_clear,
+                            ctx->Color.sRGBEnabled);
+      debug_mask("blorp color", mask & BUFFER_BITS_COLOR);
+      mask &= ~BUFFER_BITS_COLOR;
    }
 
-   GLbitfield tri_mask = mask & (BUFFER_BITS_COLOR |
-				 BUFFER_BIT_STENCIL |
-				 BUFFER_BIT_DEPTH);
+   if (brw->gen >= 6 && (mask & BUFFER_BITS_DEPTH_STENCIL)) {
+      brw_blorp_clear_depth_stencil(brw, fb, mask, partial_clear);
+      debug_mask("blorp depth/stencil", mask & BUFFER_BITS_DEPTH_STENCIL);
+      mask &= ~BUFFER_BITS_DEPTH_STENCIL;
+   }
+
+   GLbitfield tri_mask = mask & (BUFFER_BIT_STENCIL |
+                                 BUFFER_BIT_DEPTH);
 
    if (tri_mask) {
       debug_mask("tri", tri_mask);
       mask &= ~tri_mask;
-
-      if (ctx->API == API_OPENGLES) {
-         _mesa_meta_Clear(&brw->ctx, tri_mask);
-      } else {
-         _mesa_meta_glsl_Clear(&brw->ctx, tri_mask);
-      }
+      _mesa_meta_glsl_Clear(&brw->ctx, tri_mask);
    }
 
-   /* Any strange buffers get passed off to swrast */
+   /* Any strange buffers get passed off to swrast.  The only thing that
+    * should be left at this point is the accumulation buffer.
+    */
+   assert((mask & ~BUFFER_BIT_ACCUM) == 0);
    if (mask) {
       debug_mask("swrast", mask);
       _swrast_Clear(ctx, mask);
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 4187207..e3023e5 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -38,88 +38,28 @@
 #include "brw_context.h"
 #include "brw_util.h"
 #include "brw_state.h"
-#include "brw_clip.h"
+#include "compiler/brw_eu.h"
 
 #include "util/ralloc.h"
 
-#define FRONT_UNFILLED_BIT  0x1
-#define BACK_UNFILLED_BIT   0x2
-
-
 static void compile_clip_prog( struct brw_context *brw,
 			     struct brw_clip_prog_key *key )
 {
-   struct brw_clip_compile c;
-   const GLuint *program;
+   const unsigned *program;
    void *mem_ctx;
-   GLuint program_size;
-
-   memset(&c, 0, sizeof(c));
+   unsigned program_size;
 
    mem_ctx = ralloc_context(NULL);
 
-   /* Begin the compilation:
-    */
-   brw_init_codegen(&brw->screen->devinfo, &c.func, mem_ctx);
-
-   c.func.single_program_flow = 1;
-
-   c.key = *key;
-   c.vue_map = brw->vue_map_geom_out;
-
-   /* nr_regs is the number of registers filled by reading data from the VUE.
-    * This program accesses the entire VUE, so nr_regs needs to be the size of
-    * the VUE (measured in pairs, since two slots are stored in each
-    * register).
-    */
-   c.nr_regs = (c.vue_map.num_slots + 1)/2;
-
-   c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
-
-   /* For some reason the thread is spawned with only 4 channels
-    * unmasked.
-    */
-   brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);
-
-
-   /* Would ideally have the option of producing a program which could
-    * do all three:
-    */
-   switch (key->primitive) {
-   case GL_TRIANGLES:
-      if (key->do_unfilled)
-	 brw_emit_unfilled_clip( &c );
-      else
-	 brw_emit_tri_clip( &c );
-      break;
-   case GL_LINES:
-      brw_emit_line_clip( &c );
-      break;
-   case GL_POINTS:
-      brw_emit_point_clip( &c );
-      break;
-   default:
-      unreachable("not reached");
-   }
-
-   brw_compact_instructions(&c.func, 0, 0, NULL);
-
-   /* get the program
-    */
-   program = brw_get_program(&c.func, &program_size);
-
-   if (unlikely(INTEL_DEBUG & DEBUG_CLIP)) {
-      fprintf(stderr, "clip:\n");
-      brw_disassemble(&brw->screen->devinfo, c.func.store,
-                      0, program_size, stderr);
-      fprintf(stderr, "\n");
-   }
+   struct brw_clip_prog_data prog_data;
+   program = brw_compile_clip(brw->screen->compiler, mem_ctx, key, &prog_data,
+                              &brw->vue_map_geom_out, &program_size);
 
    brw_upload_cache(&brw->cache,
 		    BRW_CACHE_CLIP_PROG,
-		    &c.key, sizeof(c.key),
+		    key, sizeof(*key),
 		    program, program_size,
-		    &c.prog_data, sizeof(c.prog_data),
+		    &prog_data, sizeof(prog_data),
 		    &brw->clip.prog_offset, &brw->clip.prog_data);
    ralloc_free(mem_ctx);
 }
@@ -155,7 +95,11 @@
       key.contains_flat_varying = wm_prog_data->contains_flat_varying;
       key.contains_noperspective_varying =
          wm_prog_data->contains_noperspective_varying;
-      key.interp_mode = wm_prog_data->interp_mode;
+
+      STATIC_ASSERT(sizeof(key.interp_mode) ==
+                    sizeof(wm_prog_data->interp_mode));
+      memcpy(key.interp_mode, wm_prog_data->interp_mode,
+             sizeof(key.interp_mode));
    }
 
    /* BRW_NEW_REDUCED_PRIMITIVE */
@@ -170,18 +114,18 @@
       key.nr_userclip = _mesa_logbase2(ctx->Transform.ClipPlanesEnabled) + 1;
 
    if (brw->gen == 5)
-       key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
+       key.clip_mode = BRW_CLIP_MODE_KERNEL_CLIP;
    else
-       key.clip_mode = BRW_CLIPMODE_NORMAL;
+       key.clip_mode = BRW_CLIP_MODE_NORMAL;
 
    /* _NEW_POLYGON */
    if (key.primitive == GL_TRIANGLES) {
       if (ctx->Polygon.CullFlag &&
 	  ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK)
-	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
+	 key.clip_mode = BRW_CLIP_MODE_REJECT_ALL;
       else {
-	 GLuint fill_front = CLIP_CULL;
-	 GLuint fill_back = CLIP_CULL;
+	 GLuint fill_front = BRW_CLIP_FILL_MODE_CULL;
+	 GLuint fill_back = BRW_CLIP_FILL_MODE_CULL;
 	 GLuint offset_front = 0;
 	 GLuint offset_back = 0;
 
@@ -189,15 +133,15 @@
 	     ctx->Polygon.CullFaceMode != GL_FRONT) {
 	    switch (ctx->Polygon.FrontMode) {
 	    case GL_FILL:
-	       fill_front = CLIP_FILL;
+	       fill_front = BRW_CLIP_FILL_MODE_FILL;
 	       offset_front = 0;
 	       break;
 	    case GL_LINE:
-	       fill_front = CLIP_LINE;
+	       fill_front = BRW_CLIP_FILL_MODE_LINE;
 	       offset_front = ctx->Polygon.OffsetLine;
 	       break;
 	    case GL_POINT:
-	       fill_front = CLIP_POINT;
+	       fill_front = BRW_CLIP_FILL_MODE_POINT;
 	       offset_front = ctx->Polygon.OffsetPoint;
 	       break;
 	    }
@@ -207,15 +151,15 @@
 	     ctx->Polygon.CullFaceMode != GL_BACK) {
 	    switch (ctx->Polygon.BackMode) {
 	    case GL_FILL:
-	       fill_back = CLIP_FILL;
+	       fill_back = BRW_CLIP_FILL_MODE_FILL;
 	       offset_back = 0;
 	       break;
 	    case GL_LINE:
-	       fill_back = CLIP_LINE;
+	       fill_back = BRW_CLIP_FILL_MODE_LINE;
 	       offset_back = ctx->Polygon.OffsetLine;
 	       break;
 	    case GL_POINT:
-	       fill_back = CLIP_POINT;
+	       fill_back = BRW_CLIP_FILL_MODE_POINT;
 	       offset_back = ctx->Polygon.OffsetPoint;
 	       break;
 	    }
@@ -228,7 +172,7 @@
 	    /* Most cases the fixed function units will handle.  Cases where
 	     * one or more polygon faces are unfilled will require help:
 	     */
-	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+	    key.clip_mode = BRW_CLIP_MODE_CLIP_NON_REJECTED;
 
 	    if (offset_back || offset_front) {
 	       /* _NEW_POLYGON, _NEW_BUFFERS */
@@ -237,13 +181,13 @@
 	       key.offset_clamp = ctx->Polygon.OffsetClamp * ctx->DrawBuffer->_MRD;
 	    }
 
-	    if (!ctx->Polygon._FrontBit) {
+	    if (!brw->polygon_front_bit) {
 	       key.fill_ccw = fill_front;
 	       key.fill_cw = fill_back;
 	       key.offset_ccw = offset_front;
 	       key.offset_cw = offset_back;
 	       if (ctx->Light.Model.TwoSide &&
-		   key.fill_cw != CLIP_CULL)
+		   key.fill_cw != BRW_CLIP_FILL_MODE_CULL)
 		  key.copy_bfc_cw = 1;
 	    } else {
 	       key.fill_cw = fill_front;
@@ -251,7 +195,7 @@
 	       key.offset_cw = offset_front;
 	       key.offset_ccw = offset_back;
 	       if (ctx->Light.Model.TwoSide &&
-		   key.fill_ccw != CLIP_CULL)
+		   key.fill_ccw != BRW_CLIP_FILL_MODE_CULL)
 		  key.copy_bfc_ccw = 1;
 	    }
 	 }
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
deleted file mode 100644
index 5e084a9..0000000
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
-#include "intel_batchbuffer.h"
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "main/framebuffer.h"
-
-static void
-upload_clip_vp(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   struct brw_clipper_viewport *vp;
-
-   vp = brw_state_batch(brw, sizeof(*vp), 32, &brw->clip.vp_offset);
-
-   const float maximum_post_clamp_delta = 4096;
-   float gbx = maximum_post_clamp_delta / ctx->ViewportArray[0].Width;
-   float gby = maximum_post_clamp_delta / ctx->ViewportArray[0].Height;
-
-   vp->xmin = -gbx;
-   vp->xmax = gbx;
-   vp->ymin = -gby;
-   vp->ymax = gby;
-}
-
-static void
-brw_upload_clip_unit(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   struct brw_clip_unit_state *clip;
-
-   /* _NEW_BUFFERS */
-   const struct gl_framebuffer *fb = ctx->DrawBuffer;
-   const float fb_width = (float)_mesa_geometric_width(fb);
-   const float fb_height = (float)_mesa_geometric_height(fb);
-
-   upload_clip_vp(brw);
-
-   clip = brw_state_batch(brw, sizeof(*clip), 32, &brw->clip.state_offset);
-   memset(clip, 0, sizeof(*clip));
-
-   /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_CLIP_PROG_DATA */
-   clip->thread0.grf_reg_count = (ALIGN(brw->clip.prog_data->total_grf, 16) /
-				 16 - 1);
-   clip->thread0.kernel_start_pointer =
-      brw_program_reloc(brw,
-			brw->clip.state_offset +
-			offsetof(struct brw_clip_unit_state, thread0),
-			brw->clip.prog_offset +
-			(clip->thread0.grf_reg_count << 1)) >> 6;
-
-   clip->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   clip->thread1.single_program_flow = 1;
-
-   clip->thread3.urb_entry_read_length = brw->clip.prog_data->urb_read_length;
-   clip->thread3.const_urb_entry_read_length =
-      brw->clip.prog_data->curb_read_length;
-
-   /* BRW_NEW_CURBE_OFFSETS */
-   clip->thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2;
-   clip->thread3.dispatch_grf_start_reg = 1;
-   clip->thread3.urb_entry_read_offset = 0;
-
-   /* BRW_NEW_URB_FENCE */
-   clip->thread4.nr_urb_entries = brw->urb.nr_clip_entries;
-   clip->thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
-   /* If we have enough clip URB entries to run two threads, do so.
-    */
-   if (brw->urb.nr_clip_entries >= 10) {
-      /* Half of the URB entries go to each thread, and it has to be an
-       * even number.
-       */
-      assert(brw->urb.nr_clip_entries % 2 == 0);
-
-      /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
-       * only 2 threads can output VUEs at a time.
-       */
-      if (brw->gen == 5)
-         clip->thread4.max_threads = 16 - 1;
-      else
-         clip->thread4.max_threads = 2 - 1;
-   } else {
-      assert(brw->urb.nr_clip_entries >= 5);
-      clip->thread4.max_threads = 1 - 1;
-   }
-
-   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
-      clip->thread4.stats_enable = 1;
-
-   /* _NEW_TRANSFORM */
-   if (brw->gen == 5 || brw->is_g4x)
-      clip->clip5.userclip_enable_flags = ctx->Transform.ClipPlanesEnabled;
-   else
-      /* Up to 6 actual clip flags, plus the 7th for negative RHW workaround. */
-      clip->clip5.userclip_enable_flags = (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
-
-   clip->clip5.userclip_must_clip = 1;
-
-   /* enable guardband clipping if we can */
-   if (ctx->ViewportArray[0].X == 0 &&
-       ctx->ViewportArray[0].Y == 0 &&
-       ctx->ViewportArray[0].Width == fb_width &&
-       ctx->ViewportArray[0].Height == fb_height)
-   {
-      clip->clip5.guard_band_enable = 1;
-      clip->clip6.clipper_viewport_state_ptr =
-         (brw->batch.bo->offset64 + brw->clip.vp_offset) >> 5;
-
-      /* emit clip viewport relocation */
-      brw_emit_reloc(&brw->batch,
-                     (brw->clip.state_offset +
-                      offsetof(struct brw_clip_unit_state, clip6)),
-                     brw->batch.bo, brw->clip.vp_offset,
-                     I915_GEM_DOMAIN_INSTRUCTION, 0);
-   }
-
-   /* _NEW_TRANSFORM */
-   if (!ctx->Transform.DepthClamp)
-      clip->clip5.viewport_z_clip_enable = 1;
-   clip->clip5.viewport_xy_clip_enable = 1;
-   clip->clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
-   if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
-      clip->clip5.api_mode = BRW_CLIP_API_DX;
-   else
-      clip->clip5.api_mode = BRW_CLIP_API_OGL;
-   clip->clip5.clip_mode = brw->clip.prog_data->clip_mode;
-
-   if (brw->is_g4x)
-      clip->clip5.negative_w_clip_test = 1;
-
-   clip->viewport_xmin = -1;
-   clip->viewport_xmax = 1;
-   clip->viewport_ymin = -1;
-   clip->viewport_ymax = 1;
-
-   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
-}
-
-const struct brw_tracked_state brw_clip_unit = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_TRANSFORM |
-               _NEW_VIEWPORT,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CLIP_PROG_DATA |
-               BRW_NEW_CURBE_OFFSETS |
-               BRW_NEW_PROGRAM_CACHE |
-               BRW_NEW_URB_FENCE,
-   },
-   .emit = brw_upload_clip_unit,
-};
diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
index 8046153..d6cb016 100644
--- a/src/mesa/drivers/dri/i965/brw_compute.c
+++ b/src/mesa/drivers/dri/i965/brw_compute.c
@@ -188,6 +188,8 @@
 
    brw_validate_textures(brw);
 
+   brw_predraw_resolve_inputs(brw);
+
    const int sampler_state_size = 16; /* 16 bytes */
    estimated_buffer_space_needed = 512; /* batchbuffer commands */
    estimated_buffer_space_needed += (BRW_MAX_TEX_UNIT *
@@ -261,7 +263,7 @@
    struct brw_bo *bo =
       intel_bufferobj_buffer(brw,
                              intel_buffer_object(indirect_buffer),
-                             indirect, 3 * sizeof(GLuint));
+                             indirect, 3 * sizeof(GLuint), false);
 
    brw->compute.num_work_groups_bo = bo;
    brw->compute.num_work_groups_offset = indirect;
diff --git a/src/mesa/drivers/dri/i965/brw_conditional_render.c b/src/mesa/drivers/dri/i965/brw_conditional_render.c
index 046a42b..6be4d48 100644
--- a/src/mesa/drivers/dri/i965/brw_conditional_render.c
+++ b/src/mesa/drivers/dri/i965/brw_conditional_render.c
@@ -52,6 +52,19 @@
                                  struct brw_query_object *query,
                                  int stream_start, int count)
 {
+   if (!can_do_mi_math_and_lrr(brw->screen)) {
+      brw->predicate.state = BRW_PREDICATE_STATE_STALL_FOR_QUERY;
+      return;
+   }
+
+   brw->predicate.state = BRW_PREDICATE_STATE_USE_BIT;
+
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+    * command when loading the values into the predicate source registers for
+    * conditional rendering.
+    */
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_FLUSH_ENABLE);
+
    hsw_overflow_result_to_gpr0(brw, query, count);
    brw_load_register_reg64(brw, HSW_CS_GPR(0), MI_PREDICATE_SRC0);
    brw_load_register_imm64(brw, MI_PREDICATE_SRC1, 0ull);
@@ -61,6 +74,19 @@
 set_predicate_for_occlusion_query(struct brw_context *brw,
                                   struct brw_query_object *query)
 {
+   if (!brw->predicate.supported) {
+      brw->predicate.state = BRW_PREDICATE_STATE_STALL_FOR_QUERY;
+      return;
+   }
+
+   brw->predicate.state = BRW_PREDICATE_STATE_USE_BIT;
+
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+    * command when loading the values into the predicate source registers for
+    * conditional rendering.
+    */
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_FLUSH_ENABLE);
+
    brw_load_register_mem64(brw,
                            MI_PREDICATE_SRC0,
                            query->bo,
@@ -80,17 +106,10 @@
                          struct brw_query_object *query,
                          bool inverted)
 {
-
    int load_op;
 
    assert(query->bo != NULL);
 
-   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
-    * command when loading the values into the predicate source registers for
-    * conditional rendering.
-    */
-   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_FLUSH_ENABLE);
-
    switch (query->Base.Target) {
    case GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB:
       set_predicate_for_overflow_query(brw, query, 0, 1);
@@ -102,19 +121,19 @@
       set_predicate_for_occlusion_query(brw, query);
    }
 
-   if (inverted)
-      load_op = MI_PREDICATE_LOADOP_LOAD;
-   else
-      load_op = MI_PREDICATE_LOADOP_LOADINV;
+   if (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT) {
+      if (inverted)
+         load_op = MI_PREDICATE_LOADOP_LOAD;
+      else
+         load_op = MI_PREDICATE_LOADOP_LOADINV;
 
-   BEGIN_BATCH(1);
-   OUT_BATCH(GEN7_MI_PREDICATE |
-             load_op |
-             MI_PREDICATE_COMBINEOP_SET |
-             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
-   ADVANCE_BATCH();
-
-   brw->predicate.state = BRW_PREDICATE_STATE_USE_BIT;
+      BEGIN_BATCH(1);
+      OUT_BATCH(GEN7_MI_PREDICATE |
+                load_op |
+                MI_PREDICATE_COMBINEOP_SET |
+                MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
+      ADVANCE_BATCH();
+   }
 }
 
 static void
@@ -126,14 +145,6 @@
    struct brw_query_object *query = (struct brw_query_object *) q;
    bool inverted;
 
-   if (!brw->predicate.supported)
-      return;
-
-   if ((query->Base.Target == GL_TRANSFORM_FEEDBACK_OVERFLOW_ARB ||
-        query->Base.Target == GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB) &&
-       !can_do_mi_math_and_lrr(brw->screen))
-      return;
-
    switch (mode) {
    case GL_QUERY_WAIT:
    case GL_QUERY_NO_WAIT:
@@ -183,24 +194,11 @@
 bool
 brw_check_conditional_render(struct brw_context *brw)
 {
-   const struct gl_query_object *query = brw->ctx.Query.CondRenderQuery;
-
-   const bool query_is_xfb = query &&
-      (query->Target == GL_TRANSFORM_FEEDBACK_OVERFLOW_ARB ||
-       query->Target == GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB);
-
-   if (brw->predicate.supported &&
-       (can_do_mi_math_and_lrr(brw->screen) || !query_is_xfb)) {
-      /* In some cases it is possible to determine that the primitives should
-       * be skipped without needing the predicate enable bit and still without
-       * stalling.
-       */
-      return brw->predicate.state != BRW_PREDICATE_STATE_DONT_RENDER;
-   } else if (query) {
+   if (brw->predicate.state == BRW_PREDICATE_STATE_STALL_FOR_QUERY) {
       perf_debug("Conditional rendering is implemented in software and may "
                  "stall.\n");
       return _mesa_check_conditional_render(&brw->ctx);
-   } else {
-      return true;
    }
+
+   return brw->predicate.state != BRW_PREDICATE_STATE_DONT_RENDER;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 782543e..1ca7438 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -43,6 +43,8 @@
 #include "main/vtxfmt.h"
 #include "main/texobj.h"
 #include "main/framebuffer.h"
+#include "main/stencil.h"
+#include "main/state.h"
 
 #include "vbo/vbo_context.h"
 
@@ -168,190 +170,29 @@
                                  fb->DefaultGeometry.NumSamples);
 }
 
-static bool
-intel_disable_rb_aux_buffer(struct brw_context *brw, const struct brw_bo *bo)
-{
-   const struct gl_framebuffer *fb = brw->ctx.DrawBuffer;
-   bool found = false;
-
-   for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
-      const struct intel_renderbuffer *irb =
-         intel_renderbuffer(fb->_ColorDrawBuffers[i]);
-
-      if (irb && irb->mt->bo == bo) {
-         found = brw->draw_aux_buffer_disabled[i] = true;
-      }
-   }
-
-   return found;
-}
-
-/* On Gen9 color buffers may be compressed by the hardware (lossless
- * compression). There are, however, format restrictions and care needs to be
- * taken that the sampler engine is capable for re-interpreting a buffer with
- * format different the buffer was originally written with.
- *
- * For example, SRGB formats are not compressible and the sampler engine isn't
- * capable of treating RGBA_UNORM as SRGB_ALPHA. In such a case the underlying
- * color buffer needs to be resolved so that the sampling surface can be
- * sampled as non-compressed (i.e., without the auxiliary MCS buffer being
- * set).
- */
-static bool
-intel_texture_view_requires_resolve(struct brw_context *brw,
-                                    struct intel_texture_object *intel_tex)
-{
-   if (brw->gen < 9 ||
-       !intel_miptree_is_lossless_compressed(brw, intel_tex->mt))
-     return false;
-
-   const uint32_t brw_format = brw_isl_format_for_mesa_format(intel_tex->_Format);
-
-   if (isl_format_supports_ccs_e(&brw->screen->devinfo, brw_format))
-      return false;
-
-   perf_debug("Incompatible sampling format (%s) for rbc (%s)\n",
-              _mesa_get_format_name(intel_tex->_Format),
-              _mesa_get_format_name(intel_tex->mt->format));
-
-   if (intel_disable_rb_aux_buffer(brw, intel_tex->mt->bo))
-      perf_debug("Sampling renderbuffer with non-compressible format - "
-                 "turning off compression");
-
-   return true;
-}
-
 static void
-intel_update_state(struct gl_context * ctx, GLuint new_state)
+intel_update_state(struct gl_context * ctx)
 {
+   GLuint new_state = ctx->NewState;
    struct brw_context *brw = brw_context(ctx);
-   struct intel_texture_object *tex_obj;
-   struct intel_renderbuffer *depth_irb;
 
    if (ctx->swrast_context)
       _swrast_InvalidateState(ctx, new_state);
-   _vbo_InvalidateState(ctx, new_state);
 
    brw->NewGLState |= new_state;
 
-   _mesa_unlock_context_textures(ctx);
+   if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
 
-   /* Resolve the depth buffer's HiZ buffer. */
-   depth_irb = intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
-   if (depth_irb)
-      intel_renderbuffer_resolve_hiz(brw, depth_irb);
-
-   memset(brw->draw_aux_buffer_disabled, 0,
-          sizeof(brw->draw_aux_buffer_disabled));
-
-   /* Resolve depth buffer and render cache of each enabled texture. */
-   int maxEnabledUnit = ctx->Texture._MaxEnabledTexImageUnit;
-   for (int i = 0; i <= maxEnabledUnit; i++) {
-      if (!ctx->Texture.Unit[i]._Current)
-	 continue;
-      tex_obj = intel_texture_object(ctx->Texture.Unit[i]._Current);
-      if (!tex_obj || !tex_obj->mt)
-	 continue;
-      if (intel_miptree_sample_with_hiz(brw, tex_obj->mt))
-         intel_miptree_all_slices_resolve_hiz(brw, tex_obj->mt);
-      else
-         intel_miptree_all_slices_resolve_depth(brw, tex_obj->mt);
-      /* Sampling engine understands lossless compression and resolving
-       * those surfaces should be skipped for performance reasons.
-       */
-      const int flags = intel_texture_view_requires_resolve(brw, tex_obj) ?
-                           0 : INTEL_MIPTREE_IGNORE_CCS_E;
-      intel_miptree_all_slices_resolve_color(brw, tex_obj->mt, flags);
-      brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
-
-      if (tex_obj->base.StencilSampling ||
-          tex_obj->mt->format == MESA_FORMAT_S_UINT8) {
-         intel_update_r8stencil(brw, tex_obj->mt);
-      }
+   if (new_state & (_NEW_STENCIL | _NEW_BUFFERS)) {
+      brw->stencil_enabled = _mesa_stencil_is_enabled(ctx);
+      brw->stencil_two_sided = _mesa_stencil_is_two_sided(ctx);
+      brw->stencil_write_enabled =
+         _mesa_stencil_is_write_enabled(ctx, brw->stencil_two_sided);
    }
 
-   /* Resolve color for each active shader image. */
-   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
-      const struct gl_program *prog = ctx->_Shader->CurrentProgram[i];
-
-      if (unlikely(prog && prog->info.num_images)) {
-         for (unsigned j = 0; j < prog->info.num_images; j++) {
-            struct gl_image_unit *u =
-               &ctx->ImageUnits[prog->sh.ImageUnits[j]];
-            tex_obj = intel_texture_object(u->TexObj);
-
-            if (tex_obj && tex_obj->mt) {
-               /* Access to images is implemented using indirect messages
-                * against data port. Normal render target write understands
-                * lossless compression but unfortunately the typed/untyped
-                * read/write interface doesn't. Therefore even lossless
-                * compressed surfaces need to be resolved prior to accessing
-                * them. Hence skip setting INTEL_MIPTREE_IGNORE_CCS_E.
-                */
-               intel_miptree_all_slices_resolve_color(brw, tex_obj->mt, 0);
-
-               if (intel_miptree_is_lossless_compressed(brw, tex_obj->mt) &&
-                   intel_disable_rb_aux_buffer(brw, tex_obj->mt->bo)) {
-                  perf_debug("Using renderbuffer as shader image - turning "
-                             "off lossless compression");
-               }
-
-               brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
-            }
-         }
-      }
-   }
-
-   /* Resolve color buffers for non-coherent framebuffer fetch. */
-   if (!ctx->Extensions.MESA_shader_framebuffer_fetch &&
-       ctx->FragmentProgram._Current &&
-       ctx->FragmentProgram._Current->info.outputs_read) {
-      const struct gl_framebuffer *fb = ctx->DrawBuffer;
-
-      for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
-         const struct intel_renderbuffer *irb =
-            intel_renderbuffer(fb->_ColorDrawBuffers[i]);
-
-         if (irb &&
-             intel_miptree_resolve_color(
-                brw, irb->mt, irb->mt_level, irb->mt_layer, irb->layer_count,
-                INTEL_MIPTREE_IGNORE_CCS_E))
-            brw_render_cache_set_check_flush(brw, irb->mt->bo);
-      }
-   }
-
-   /* If FRAMEBUFFER_SRGB is used on Gen9+ then we need to resolve any of the
-    * single-sampled color renderbuffers because the CCS buffer isn't
-    * supported for SRGB formats. This only matters if FRAMEBUFFER_SRGB is
-    * enabled because otherwise the surface state will be programmed with the
-    * linear equivalent format anyway.
-    */
-   if (brw->gen >= 9 && ctx->Color.sRGBEnabled) {
-      struct gl_framebuffer *fb = ctx->DrawBuffer;
-      for (int i = 0; i < fb->_NumColorDrawBuffers; i++) {
-         struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[i];
-
-         if (rb == NULL)
-            continue;
-
-         struct intel_renderbuffer *irb = intel_renderbuffer(rb);
-         struct intel_mipmap_tree *mt = irb->mt;
-
-         if (mt == NULL ||
-             mt->num_samples > 1 ||
-             _mesa_get_srgb_format_linear(mt->format) == mt->format)
-               continue;
-
-         /* Lossless compression is not supported for SRGB formats, it
-          * should be impossible to get here with such surfaces.
-          */
-         assert(!intel_miptree_is_lossless_compressed(brw, mt));
-         intel_miptree_all_slices_resolve_color(brw, mt, 0);
-         brw_render_cache_set_check_flush(brw, mt->bo);
-      }
-   }
-
-   _mesa_lock_context_textures(ctx);
+   if (new_state & _NEW_POLYGON)
+      brw->polygon_front_bit = _mesa_polygon_get_front_bit(ctx);
 
    if (new_state & _NEW_BUFFERS) {
       intel_update_framebuffer(ctx, ctx->DrawBuffer);
@@ -413,7 +254,7 @@
    intel_glFlush(ctx);
 
    if (brw->batch.last_bo)
-      brw_bo_wait_rendering(brw, brw->batch.last_bo);
+      brw_bo_wait_rendering(brw->batch.last_bo);
 }
 
 static void
@@ -458,8 +299,7 @@
    else
       gen4_init_queryobj_functions(functions);
    brw_init_compute_functions(functions);
-   if (brw->gen >= 7)
-      brw_init_conditional_render_functions(functions);
+   brw_init_conditional_render_functions(functions);
 
    functions->QueryInternalFormat = brw_query_internal_format;
 
@@ -772,8 +612,11 @@
     *      the element in the buffer."
     *
     * However, unaligned accesses are slower, so enforce buffer alignment.
+    *
+    * In order to push UBO data, 3DSTATE_CONSTANT_XS imposes an additional
+    * restriction: the start of the buffer needs to be 32B aligned.
     */
-   ctx->Const.UniformBufferOffsetAlignment = 16;
+   ctx->Const.UniformBufferOffsetAlignment = 32;
 
    /* ShaderStorageBufferOffsetAlignment should be a cacheline (64 bytes) so
     * that we can safely have the CPU and GPU writing the same SSBO on
@@ -902,6 +745,9 @@
           brw->has_separate_stencil = false;
    }
 
+   if (driQueryOptionb(options, "mesa_no_error"))
+      ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR;
+
    if (driQueryOptionb(options, "always_flush_batch")) {
       fprintf(stderr, "flushing batchbuffer before/after each draw call\n");
       brw->always_flush_batch = true;
@@ -934,6 +780,9 @@
    ctx->Const.AllowGLSLExtensionDirectiveMidShader =
       driQueryOptionb(options, "allow_glsl_extension_directive_midshader");
 
+   ctx->Const.AllowGLSLBuiltinVariableRedeclaration =
+      driQueryOptionb(options, "allow_glsl_builtin_variable_redeclaration");
+
    ctx->Const.AllowHigherCompatVersion =
       driQueryOptionb(options, "allow_higher_compat_version");
 
@@ -965,8 +814,9 @@
    /* Only allow the __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS flag if the kernel
     * provides us with context reset notifications.
     */
-   uint32_t allowed_flags = __DRI_CTX_FLAG_DEBUG
-      | __DRI_CTX_FLAG_FORWARD_COMPATIBLE;
+   uint32_t allowed_flags = __DRI_CTX_FLAG_DEBUG |
+                            __DRI_CTX_FLAG_FORWARD_COMPATIBLE |
+                            __DRI_CTX_FLAG_NO_ERROR;
 
    if (screen->has_context_reset_notification)
       allowed_flags |= __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS;
@@ -994,7 +844,7 @@
    brw->is_baytrail = devinfo->is_baytrail;
    brw->is_haswell = devinfo->is_haswell;
    brw->is_cherryview = devinfo->is_cherryview;
-   brw->is_broxton = devinfo->is_broxton;
+   brw->is_broxton = devinfo->is_broxton || devinfo->is_geminilake;
    brw->has_llc = devinfo->has_llc;
    brw->has_hiz = devinfo->has_hiz_and_separate_stencil;
    brw->has_separate_stencil = devinfo->has_hiz_and_separate_stencil;
@@ -1008,7 +858,7 @@
    brw->must_use_separate_stencil = devinfo->must_use_separate_stencil;
    brw->has_swizzling = screen->hw_has_swizzling;
 
-   isl_device_init(&brw->isl_dev, devinfo, screen->hw_has_swizzling);
+   brw->isl_dev = screen->isl_dev;
 
    brw->vs.base.stage = MESA_SHADER_VERTEX;
    brw->tcs.base.stage = MESA_SHADER_TESS_CTRL;
@@ -1115,8 +965,7 @@
 
    brw_init_surface_formats(brw);
 
-   if (brw->gen >= 6)
-      brw_blorp_init(brw);
+   brw_blorp_init(brw);
 
    brw->urb.size = devinfo->urb.size;
 
@@ -1126,7 +975,6 @@
    brw->prim_restart.in_progress = false;
    brw->prim_restart.enable_cut_index = false;
    brw->gs.enabled = false;
-   brw->sf.viewport_transform_enable = true;
    brw->clip.viewport_count = 1;
 
    brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
@@ -1200,6 +1048,12 @@
    if (brw->wm.base.scratch_bo)
       brw_bo_unreference(brw->wm.base.scratch_bo);
 
+   brw_bo_unreference(brw->vs.base.push_const_bo);
+   brw_bo_unreference(brw->tcs.base.push_const_bo);
+   brw_bo_unreference(brw->tes.base.push_const_bo);
+   brw_bo_unreference(brw->gs.base.push_const_bo);
+   brw_bo_unreference(brw->wm.base.push_const_bo);
+
    brw_destroy_hw_context(brw->bufmgr, brw->hw_ctx);
 
    if (ctx->swrast_context) {
@@ -1371,10 +1225,10 @@
       rb = intel_get_renderbuffer(fb, buffers[i]);
       if (rb == NULL || rb->mt == NULL)
          continue;
-      if (rb->mt->num_samples <= 1) {
+      if (rb->mt->surf.samples == 1) {
          assert(rb->mt_layer == 0 && rb->mt_level == 0 &&
                 rb->layer_count == 1);
-         intel_miptree_resolve_color(brw, rb->mt, 0, 0, 1, 0);
+         intel_miptree_prepare_access(brw, rb->mt, 0, 1, 0, 1, false, false);
       } else {
          intel_renderbuffer_downsample(brw, rb);
       }
@@ -1409,7 +1263,7 @@
    struct gl_framebuffer *fb = drawable->driverPrivate;
    struct intel_renderbuffer *rb;
    __DRIbuffer *buffers = NULL;
-   int i, count;
+   int count;
    const char *region_name;
 
    /* Set this up front, so that in case our buffers get invalidated
@@ -1425,7 +1279,7 @@
    if (buffers == NULL)
       return;
 
-   for (i = 0; i < count; i++) {
+   for (int i = 0; i < count; i++) {
        switch (buffers[i].attachment) {
        case __DRI_BUFFER_FRONT_LEFT:
            rb = intel_get_renderbuffer(fb, BUFFER_FRONT_LEFT);
@@ -1661,9 +1515,34 @@
       return;
    }
 
-   intel_update_winsys_renderbuffer_miptree(brw, rb, bo,
-                                            drawable->w, drawable->h,
-                                            buffer->pitch);
+   struct intel_mipmap_tree *mt =
+      intel_miptree_create_for_bo(brw,
+                                  bo,
+                                  intel_rb_format(rb),
+                                  0,
+                                  drawable->w,
+                                  drawable->h,
+                                  1,
+                                  buffer->pitch,
+                                  MIPTREE_CREATE_DEFAULT);
+   if (!mt) {
+      brw_bo_unreference(bo);
+      return;
+   }
+
+   /* We got this BO from X11.  We can't assume that we have coherent texture
+    * access because X may suddenly decide to use it for scan-out which would
+    * destroy coherency.
+    */
+   bo->cache_coherent = false;
+
+   if (!intel_update_winsys_renderbuffer_miptree(brw, rb, mt,
+                                                 drawable->w, drawable->h,
+                                                 buffer->pitch)) {
+      brw_bo_unreference(bo);
+      intel_miptree_release(&mt);
+      return;
+   }
 
    if (_mesa_is_front_buffer_drawing(fb) &&
        (buffer->attachment == __DRI_BUFFER_FRONT_LEFT ||
@@ -1719,9 +1598,30 @@
    if (last_mt && last_mt->bo == buffer->bo)
       return;
 
-   intel_update_winsys_renderbuffer_miptree(intel, rb, buffer->bo,
-                                            buffer->width, buffer->height,
-                                            buffer->pitch);
+   enum isl_colorspace colorspace;
+   switch (_mesa_get_format_color_encoding(intel_rb_format(rb))) {
+   case GL_SRGB:
+      colorspace = ISL_COLORSPACE_SRGB;
+      break;
+   case GL_LINEAR:
+      colorspace = ISL_COLORSPACE_LINEAR;
+      break;
+   default:
+      unreachable("Invalid color encoding");
+   }
+
+   struct intel_mipmap_tree *mt =
+      intel_miptree_create_for_dri_image(intel, buffer, GL_TEXTURE_2D,
+                                         colorspace, true);
+   if (!mt)
+      return;
+
+   if (!intel_update_winsys_renderbuffer_miptree(intel, rb, mt,
+                                                 buffer->width, buffer->height,
+                                                 buffer->pitch)) {
+      intel_miptree_release(&mt);
+      return;
+   }
 
    if (_mesa_is_front_buffer_drawing(fb) &&
        buffer_type == __DRI_IMAGE_BUFFER_FRONT &&
@@ -1738,7 +1638,7 @@
    struct intel_renderbuffer *front_rb;
    struct intel_renderbuffer *back_rb;
    struct __DRIimageList images;
-   unsigned int format;
+   mesa_format format;
    uint32_t buffer_mask = 0;
    int ret;
 
@@ -1778,6 +1678,7 @@
                                 images.front,
                                 __DRI_IMAGE_BUFFER_FRONT);
    }
+
    if (images.image_mask & __DRI_IMAGE_BUFFER_BACK) {
       drawable->w = images.back->width;
       drawable->h = images.back->height;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 51ea927..c25e5e2 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -47,7 +47,6 @@
 #include "common/gen_debug.h"
 #include "intel_screen.h"
 #include "intel_tex_obj.h"
-#include "intel_resolve_map.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -173,7 +172,6 @@
    BRW_STATE_GEOMETRY_PROGRAM,
    BRW_STATE_TESS_PROGRAMS,
    BRW_STATE_VERTEX_PROGRAM,
-   BRW_STATE_CURBE_OFFSETS,
    BRW_STATE_REDUCED_PRIMITIVE,
    BRW_STATE_PATCH_PRIMITIVE,
    BRW_STATE_PRIMITIVE,
@@ -216,6 +214,8 @@
    BRW_STATE_BLORP,
    BRW_STATE_VIEWPORT_COUNT,
    BRW_STATE_CONSERVATIVE_RASTERIZATION,
+   BRW_STATE_DRAW_CALL,
+   BRW_STATE_FAST_CLEAR_COLOR,
    BRW_NUM_STATE_BITS
 };
 
@@ -259,7 +259,6 @@
 #define BRW_NEW_GEOMETRY_PROGRAM        (1ull << BRW_STATE_GEOMETRY_PROGRAM)
 #define BRW_NEW_TESS_PROGRAMS           (1ull << BRW_STATE_TESS_PROGRAMS)
 #define BRW_NEW_VERTEX_PROGRAM          (1ull << BRW_STATE_VERTEX_PROGRAM)
-#define BRW_NEW_CURBE_OFFSETS           (1ull << BRW_STATE_CURBE_OFFSETS)
 #define BRW_NEW_REDUCED_PRIMITIVE       (1ull << BRW_STATE_REDUCED_PRIMITIVE)
 #define BRW_NEW_PATCH_PRIMITIVE         (1ull << BRW_STATE_PATCH_PRIMITIVE)
 #define BRW_NEW_PRIMITIVE               (1ull << BRW_STATE_PRIMITIVE)
@@ -307,6 +306,8 @@
 #define BRW_NEW_CC_STATE                (1ull << BRW_STATE_CC_STATE)
 #define BRW_NEW_BLORP                   (1ull << BRW_STATE_BLORP)
 #define BRW_NEW_CONSERVATIVE_RASTERIZATION (1ull << BRW_STATE_CONSERVATIVE_RASTERIZATION)
+#define BRW_NEW_DRAW_CALL               (1ull << BRW_STATE_DRAW_CALL)
+#define BRW_NEW_FAST_CLEAR_COLOR        (1ull << BRW_STATE_FAST_CLEAR_COLOR)
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -327,27 +328,6 @@
 };
 
 
-struct brw_sf_prog_data {
-   GLuint urb_read_length;
-   GLuint total_grf;
-
-   /* Each vertex may have upto 12 attributes, 4 components each,
-    * except WPOS which requires only 2.  (11*4 + 2) == 44 ==> 11
-    * rows.
-    *
-    * Actually we use 4 for each, so call it 12 rows.
-    */
-   GLuint urb_entry_size;
-};
-
-
-struct brw_clip_prog_data {
-   GLuint curb_read_length;	/* user planes? */
-   GLuint clip_mode;
-   GLuint urb_read_length;
-   GLuint total_grf;
-};
-
 struct brw_ff_gs_prog_data {
    GLuint urb_read_length;
    GLuint total_grf;
@@ -391,13 +371,12 @@
 
    struct brw_cache_item **items;
    struct brw_bo *bo;
+   void *map;
    GLuint size, n_items;
 
    uint32_t next_offset;
-   bool bo_used_by_gpu;
 };
 
-
 /* Considered adding a member to this struct to document which flags
  * an update might raise so that ordering of the state atoms can be
  * checked or derived at runtime.  Dropped the idea in favor of having
@@ -480,11 +459,13 @@
    struct drm_i915_gem_relocation_entry *relocs;
    int reloc_count;
    int reloc_array_size;
+
    /** The validation list */
-   struct drm_i915_gem_exec_object2 *exec_objects;
+   struct drm_i915_gem_exec_object2 *validation_list;
    struct brw_bo **exec_bos;
    int exec_count;
    int exec_array_size;
+
    /** The amount of aperture space (in bytes) used by all exec_bos */
    int aperture_space;
 
@@ -576,7 +557,8 @@
    /** Offset in the batchbuffer to Gen4-5 pipelined state (VS/WM/GS_STATE). */
    uint32_t state_offset;
 
-   uint32_t push_const_offset; /* Offset in the batchbuffer */
+   struct brw_bo *push_const_bo; /* NULL if using the batchbuffer */
+   uint32_t push_const_offset; /* Offset in the push constant BO or batch */
    int push_const_size; /* in 256-bit register increments */
 
    /* Binding table: pointers to SURFACE_STATE entries. */
@@ -586,6 +568,9 @@
    /** SAMPLER_STATE count and table offset */
    uint32_t sampler_count;
    uint32_t sampler_offset;
+
+   /** Need to re-emit 3DSTATE_CONSTANT_XS? */
+   bool push_constants_dirty;
 };
 
 enum brw_predicate_state {
@@ -600,7 +585,11 @@
    /* In this case whether to draw or not depends on the result of an
     * MI_PREDICATE command so the predicate enable bit needs to be checked.
     */
-   BRW_PREDICATE_STATE_USE_BIT
+   BRW_PREDICATE_STATE_USE_BIT,
+   /* In this case, either MI_PREDICATE doesn't exist or we lack the
+    * necessary kernel features to use it.  Stall for the query result.
+    */
+   BRW_PREDICATE_STATE_STALL_FOR_QUERY,
 };
 
 struct shader_times;
@@ -666,6 +655,17 @@
                                      uint32_t width, uint32_t height,
                                      uint32_t tile_x, uint32_t tile_y);
 
+      /**
+       * Emit an MI_REPORT_PERF_COUNT command packet.
+       *
+       * This asks the GPU to write a report of the current OA counter values
+       * into @bo at the given offset and containing the given @report_id
+       * which we can cross-reference when parsing the report (gen7+ only).
+       */
+      void (*emit_mi_report_perf_count)(struct brw_context *brw,
+                                        struct brw_bo *bo,
+                                        uint32_t offset_in_bytes,
+                                        uint32_t report_id);
    } vtbl;
 
    struct brw_bufmgr *bufmgr;
@@ -696,6 +696,7 @@
 
    struct {
       struct brw_bo *bo;
+      void *map;
       uint32_t next_offset;
    } upload;
 
@@ -772,8 +773,6 @@
    bool has_negative_rhw_bug;
    bool has_pln;
    bool no_simd8;
-   bool use_rep_send;
-   bool use_resource_streamer;
 
    /**
     * Some versions of Gen hardware don't do centroid interpolation correctly
@@ -784,6 +783,13 @@
     */
    bool needs_unlit_centroid_workaround;
 
+   /** Derived stencil states. */
+   bool stencil_enabled;
+   bool stencil_two_sided;
+   bool stencil_write_enabled;
+   /** Derived polygon state. */
+   bool polygon_front_bit; /**< 0=GL_CCW, 1=GL_CW */
+
    struct isl_device isl_dev;
 
    struct blorp_context blorp;
@@ -885,7 +891,7 @@
       /* Updates are signaled by BRW_NEW_INDEX_BUFFER. */
       struct brw_bo *bo;
       uint32_t size;
-      GLuint type;
+      unsigned index_size;
 
       /* Offset to index buffer index to use in CMD_3D_PRIM so that we can
        * avoid re-uploading the IB packet over and over if we're actually
@@ -956,8 +962,7 @@
    } urb;
 
 
-   /* BRW_NEW_CURBE_OFFSETS:
-    */
+   /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
    struct {
       GLuint wm_start;  /**< pos of first wm const in CURBE buffer */
       GLuint wm_size;   /**< number of float[4] consts, multiple of 16 */
@@ -990,22 +995,10 @@
 
    struct {
       struct brw_stage_state base;
-
-      /**
-       * True if the 3DSTATE_HS command most recently emitted to the 3D
-       * pipeline enabled the HS; false otherwise.
-       */
-      bool enabled;
    } tcs;
 
    struct {
       struct brw_stage_state base;
-
-      /**
-       * True if the 3DSTATE_DS command most recently emitted to the 3D
-       * pipeline enabled the DS; false otherwise.
-       */
-      bool enabled;
    } tes;
 
    struct {
@@ -1065,7 +1058,6 @@
       uint32_t prog_offset;
       uint32_t state_offset;
       uint32_t vp_offset;
-      bool viewport_transform_enable;
    } sf;
 
    struct {
@@ -1114,6 +1106,9 @@
          uint64_t timestamp_frequency; /** $GpuTimestampFrequency */
          uint64_t n_eus;               /** $EuCoresTotalCount */
          uint64_t n_eu_slices;         /** $EuSlicesTotalCount */
+         uint64_t n_eu_sub_slices;     /** $EuSubslicesTotalCount */
+         uint64_t eu_threads_count;    /** $EuThreadsCount */
+         uint64_t slice_mask;          /** $SliceMask */
          uint64_t subslice_mask;       /** $SubsliceMask */
          uint64_t gt_min_freq;         /** $GpuMinFrequency */
          uint64_t gt_max_freq;         /** $GpuMaxFrequency */
@@ -1187,8 +1182,8 @@
    const struct brw_tracked_state render_atoms[76];
    const struct brw_tracked_state compute_atoms[11];
 
-   uint32_t render_target_format[MESA_FORMAT_COUNT];
-   bool format_supported_as_render_target[MESA_FORMAT_COUNT];
+   const enum isl_format *mesa_to_isl_render_format;
+   const bool *mesa_format_supports_render;
 
    /* PrimitiveRestart */
    struct {
@@ -1201,12 +1196,11 @@
     * brw_workaround_depthstencil_alignment().
     */
    struct {
-      struct intel_mipmap_tree *depth_mt;
-      struct intel_mipmap_tree *stencil_mt;
-
       /* Inter-tile (page-aligned) byte offsets. */
-      uint32_t depth_offset, hiz_offset, stencil_offset;
-      /* Intra-tile x,y offsets for drawing to depth/stencil/hiz */
+      uint32_t depth_offset;
+      /* Intra-tile x,y offsets for drawing to combined depth-stencil. Only
+       * used for Gen < 6.
+       */
       uint32_t tile_x, tile_y;
    } depthstencil;
 
@@ -1265,6 +1259,8 @@
                                 __DRIdrawable *drawable);
 void intel_prepare_render(struct brw_context *brw);
 
+void brw_predraw_resolve_inputs(struct brw_context *brw);
+
 void intel_resolve_for_dri2_flush(struct brw_context *brw,
                                   __DRIdrawable *drawable);
 
@@ -1387,6 +1383,8 @@
 
 /* brw_urb.c
  */
+void brw_calculate_urb_fence(struct brw_context *brw, unsigned csize,
+                             unsigned vsize, unsigned sfsize);
 void brw_upload_urb_fence(struct brw_context *brw);
 
 /* brw_curbe.c
@@ -1401,29 +1399,17 @@
                                      const struct gl_vertex_array *glarray);
 
 static inline unsigned
-brw_get_index_type(GLenum type)
+brw_get_index_type(unsigned index_size)
 {
-   assert((type == GL_UNSIGNED_BYTE)
-          || (type == GL_UNSIGNED_SHORT)
-          || (type == GL_UNSIGNED_INT));
-
-   /* The possible values for type are GL_UNSIGNED_BYTE (0x1401),
-    * GL_UNSIGNED_SHORT (0x1403), and GL_UNSIGNED_INT (0x1405) which we want
-    * to map to scale factors of 0, 1, and 2, respectively.  These scale
-    * factors are then left-shfited by 8 to be in the correct position in the
-    * CMD_INDEX_BUFFER packet.
-    *
-    * Subtracting 0x1401 gives 0, 2, and 4.  Shifting left by 7 afterwards
-    * gives 0x00000000, 0x00000100, and 0x00000200.  These just happen to be
-    * the values the need to be written in the CMD_INDEX_BUFFER packet.
+   /* The hw needs 0x00, 0x01, and 0x02 for ubyte, ushort, and uint,
+    * respectively.
     */
-   return (type - 0x1401) << 7;
+   return index_size >> 1;
 }
 
 void brw_prepare_vertices(struct brw_context *brw);
 
 /* brw_wm_surface_state.c */
-void brw_init_surface_formats(struct brw_context *brw);
 void brw_create_constant_surface(struct brw_context *brw,
                                  struct brw_bo *bo,
                                  uint32_t offset,
@@ -1455,6 +1441,8 @@
                                struct brw_stage_prog_data *prog_data);
 
 /* brw_surface_formats.c */
+void intel_screen_init_surface_formats(struct intel_screen *screen);
+void brw_init_surface_formats(struct brw_context *brw);
 bool brw_render_target_supported(struct brw_context *brw,
                                  struct gl_renderbuffer *rb);
 uint32_t brw_depth_format(struct brw_context *brw, mesa_format format);
@@ -1549,15 +1537,6 @@
                           int dstX0, int dstY0,
                           int width, int height);
 
-/* gen6_multisample_state.c */
-unsigned
-gen6_determine_sample_mask(struct brw_context *brw);
-
-void
-gen6_emit_3dstate_multisample(struct brw_context *brw,
-                              unsigned num_samples);
-void
-gen6_emit_3dstate_sample_mask(struct brw_context *brw, unsigned mask);
 void
 gen6_get_sample_position(struct gl_context *ctx,
                          struct gl_framebuffer *fb,
@@ -1677,9 +1656,6 @@
                             uint32_t width, uint32_t height,
                             uint32_t tile_x, uint32_t tile_y);
 
-void gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
-                   unsigned int level, unsigned int layer, enum blorp_hiz_op op);
-
 uint32_t get_hw_prim_for_gl_prim(int mode);
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 7d58efb..139a3bc 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -41,7 +41,7 @@
  * quickly at thread setup time.  Each individual fixed function unit's state
  * (brw_vs_state.c for example) tells the hardware which subset of the CURBE
  * it wants in its register space, and we calculate those areas here under the
- * BRW_NEW_CURBE_OFFSETS state flag.  The brw_urb.c allocation will control
+ * BRW_NEW_PUSH_CONSTANT_ALLOCATION state flag.  The brw_urb.c allocation will control
  * how many CURBEs can be loaded into the hardware at once before a pipeline
  * stall occurs at CMD_CONST_BUFFER time.
  *
@@ -135,7 +135,7 @@
                  brw->curbe.vs_start,
                  brw->curbe.vs_size );
 
-      brw->ctx.NewDriverState |= BRW_NEW_CURBE_OFFSETS;
+      brw->ctx.NewDriverState |= BRW_NEW_PUSH_CONSTANT_ALLOCATION;
    }
 }
 
@@ -196,7 +196,7 @@
 brw_upload_constant_buffer(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_CURBE_OFFSETS */
+   /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
    const GLuint sz = brw->curbe.total_size;
    const GLuint bufsz = sz * 16 * sizeof(GLfloat);
    gl_constant_value *buf;
@@ -216,7 +216,7 @@
    if (brw->curbe.wm_size) {
       _mesa_load_state_parameters(ctx, brw->fragment_program->Parameters);
 
-      /* BRW_NEW_CURBE_OFFSETS */
+      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
       GLuint offset = brw->curbe.wm_start * 16;
 
       /* BRW_NEW_FS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */
@@ -338,7 +338,7 @@
       .mesa = _NEW_PROGRAM_CONSTANTS,
       .brw  = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
-              BRW_NEW_CURBE_OFFSETS |
+              BRW_NEW_PUSH_CONSTANT_ALLOCATION |
               BRW_NEW_FRAGMENT_PROGRAM |
               BRW_NEW_FS_PROG_DATA |
               BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 08106c0..4abb790 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -109,13 +109,6 @@
 #define BRW_CLIP_API_OGL     0
 #define BRW_CLIP_API_DX      1
 
-#define BRW_CLIPMODE_NORMAL              0
-#define BRW_CLIPMODE_CLIP_ALL            1
-#define BRW_CLIPMODE_CLIP_NON_REJECTED   2
-#define BRW_CLIPMODE_REJECT_ALL          3
-#define BRW_CLIPMODE_ACCEPT_ALL          4
-#define BRW_CLIPMODE_KERNEL_CLIP         5
-
 #define BRW_CLIP_NDCSPACE     0
 #define BRW_CLIP_SCREENSPACE  1
 
@@ -153,8 +146,6 @@
 #define BRW_FRONTWINDING_CW      0
 #define BRW_FRONTWINDING_CCW     1
 
-#define BRW_SPRITE_POINT_ENABLE  16
-
 #define BRW_CUT_INDEX_ENABLE     (1 << 10)
 
 #define BRW_INDEX_BYTE     0
@@ -1359,54 +1350,13 @@
 
 #define GEN6_MI_REPORT_PERF_COUNT ((0x28 << 23) | (3 - 2))
 
+#define GEN8_MI_REPORT_PERF_COUNT ((0x28 << 23) | (4 - 2))
 
 /* Maximum number of entries that can be addressed using a binding table
  * pointer of type SURFTYPE_BUFFER
  */
 #define BRW_MAX_NUM_BUFFER_ENTRIES	(1 << 27)
 
-/* Memory Object Control State:
- * Specifying zero for L3 means "uncached in L3", at least on Haswell
- * and Baytrail, since there are no PTE flags for setting L3 cacheability.
- * On Ivybridge, the PTEs do have a cache-in-L3 bit, so setting MOCS to 0
- * may still respect that.
- */
-#define GEN7_MOCS_L3                    1
-
-/* Ivybridge only: cache in LLC.
- * Specifying zero here means to use the PTE values set by the kernel;
- * non-zero overrides the PTE values.
- */
-#define IVB_MOCS_LLC                    (1 << 1)
-
-/* Baytrail only: snoop in CPU cache */
-#define BYT_MOCS_SNOOP                  (1 << 1)
-
-/* Haswell only: LLC/eLLC controls (write-back or uncached).
- * Specifying zero here means to use the PTE values set by the kernel,
- * which is useful since it offers additional control (write-through
- * cacheing and age).  Non-zero overrides the PTE values.
- */
-#define HSW_MOCS_UC_LLC_UC_ELLC         (1 << 1)
-#define HSW_MOCS_WB_LLC_WB_ELLC         (2 << 1)
-#define HSW_MOCS_UC_LLC_WB_ELLC         (3 << 1)
-
-/* Broadwell: these defines always use all available caches (L3, LLC, eLLC),
- * and let you force write-back (WB) or write-through (WT) caching, or leave
- * it up to the page table entry (PTE) specified by the kernel.
- */
-#define BDW_MOCS_WB  0x78
-#define BDW_MOCS_WT  0x58
-#define BDW_MOCS_PTE 0x18
-
-/* Skylake: MOCS is now an index into an array of 62 different caching
- * configurations programmed by the kernel.
- */
-/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
-#define SKL_MOCS_WB  (2 << 1)
-/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
-#define SKL_MOCS_PTE (1 << 1)
-
 #define MEDIA_VFE_STATE                         0x7000
 /* GEN7 DW2, GEN8+ DW3 */
 # define MEDIA_VFE_STATE_MAX_THREADS_SHIFT      16
@@ -1660,12 +1610,20 @@
 #define GEN7_GPGPU_DISPATCHDIMZ         0x2508
 
 #define GEN7_CACHE_MODE_1               0x7004
+# define GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE (1 << 4)
 # define GEN8_HIZ_NP_PMA_FIX_ENABLE        (1 << 11)
 # define GEN8_HIZ_NP_EARLY_Z_FAILS_DISABLE (1 << 13)
 # define GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC (1 << 1)
 # define GEN8_HIZ_PMA_MASK_BITS \
    REG_MASK(GEN8_HIZ_NP_PMA_FIX_ENABLE | GEN8_HIZ_NP_EARLY_Z_FAILS_DISABLE)
 
+#define GEN7_GT_MODE                    0x7008
+# define GEN9_SUBSLICE_HASHING_8x8      (0 << 8)
+# define GEN9_SUBSLICE_HASHING_16x4     (1 << 8)
+# define GEN9_SUBSLICE_HASHING_8x4      (2 << 8)
+# define GEN9_SUBSLICE_HASHING_16x16    (3 << 8)
+# define GEN9_SUBSLICE_HASHING_MASK_BITS REG_MASK(3 << 8)
+
 /* Predicate registers */
 #define MI_PREDICATE_SRC0               0x2400
 #define MI_PREDICATE_SRC1               0x2408
@@ -1729,4 +1687,10 @@
 # define GEN8_L3CNTLREG_ALL_ALLOC_SHIFT    25
 # define GEN8_L3CNTLREG_ALL_ALLOC_MASK     INTEL_MASK(31, 25)
 
+#define INSTPM                             0x20c0
+# define INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 6)
+
+#define CS_DEBUG_MODE2                     0x20d8 /* Gen9+ */
+# define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4)
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 611cb86..7d66c63 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -25,6 +25,7 @@
 
 #include <sys/errno.h>
 
+#include "main/blend.h"
 #include "main/context.h"
 #include "main/condrender.h"
 #include "main/samplerobj.h"
@@ -222,7 +223,7 @@
       struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer;
       struct brw_bo *bo = intel_bufferobj_buffer(brw,
             intel_buffer_object(indirect_buffer),
-            prim->indirect_offset, 5 * sizeof(GLuint));
+            prim->indirect_offset, 5 * sizeof(GLuint), false);
 
       indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE;
 
@@ -341,6 +342,147 @@
    }
 }
 
+static bool
+intel_disable_rb_aux_buffer(struct brw_context *brw, const struct brw_bo *bo)
+{
+   const struct gl_framebuffer *fb = brw->ctx.DrawBuffer;
+   bool found = false;
+   /* Flag in draw_aux_buffer_disabled[] each color draw buffer using 'bo'. */
+   for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
+      const struct intel_renderbuffer *irb =
+         intel_renderbuffer(fb->_ColorDrawBuffers[i]);
+      /* NOTE(review): irb->mt is dereferenced without a NULL check. */
+      if (irb && irb->mt->bo == bo) {
+         found = brw->draw_aux_buffer_disabled[i] = true;
+      }
+   }
+   /* True when at least one color draw buffer was flagged above. */
+   return found;
+}
+
+/**
+ * \brief Prepare sampled textures and shader images before drawing.
+ *
+ * Prepares the miptree of each enabled texture and each active shader image
+ * and calls brw_render_cache_set_check_flush() on the BO backing each one.
+ */
+void
+brw_predraw_resolve_inputs(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   struct intel_texture_object *tex_obj;
+   /* Reset the per-color-draw-buffer aux-disabled flags. */
+   memset(brw->draw_aux_buffer_disabled, 0,
+          sizeof(brw->draw_aux_buffer_disabled));
+
+   /* Resolve depth buffer and render cache of each enabled texture. */
+   int maxEnabledUnit = ctx->Texture._MaxEnabledTexImageUnit;
+   for (int i = 0; i <= maxEnabledUnit; i++) {
+      if (!ctx->Texture.Unit[i]._Current)
+	 continue;
+      tex_obj = intel_texture_object(ctx->Texture.Unit[i]._Current);
+      if (!tex_obj || !tex_obj->mt)
+	 continue;
+
+      struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, i);
+      enum isl_format view_format =
+         translate_tex_format(brw, tex_obj->_Format, sampler->sRGBDecode);
+
+      bool aux_supported;
+      intel_miptree_prepare_texture(brw, tex_obj->mt, view_format,
+                                    &aux_supported);
+
+      if (!aux_supported && brw->gen >= 9 &&
+          intel_disable_rb_aux_buffer(brw, tex_obj->mt->bo)) {
+         perf_debug("Sampling renderbuffer with non-compressible format - "
+                    "turning off compression");
+      }
+
+      brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
+
+      if (tex_obj->base.StencilSampling ||
+          tex_obj->mt->format == MESA_FORMAT_S_UINT8) {
+         intel_update_r8stencil(brw, tex_obj->mt);
+      }
+   }
+
+   /* Resolve color for each active shader image. */
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      const struct gl_program *prog = ctx->_Shader->CurrentProgram[i];
+
+      if (unlikely(prog && prog->info.num_images)) {
+         for (unsigned j = 0; j < prog->info.num_images; j++) {
+            struct gl_image_unit *u =
+               &ctx->ImageUnits[prog->sh.ImageUnits[j]];
+            tex_obj = intel_texture_object(u->TexObj);
+
+            if (tex_obj && tex_obj->mt) {
+               intel_miptree_prepare_image(brw, tex_obj->mt);
+
+               if (tex_obj->mt->aux_usage == ISL_AUX_USAGE_CCS_E &&
+                   intel_disable_rb_aux_buffer(brw, tex_obj->mt->bo)) {
+                  perf_debug("Using renderbuffer as shader image - turning "
+                             "off lossless compression");
+               }
+
+               brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
+            }
+         }
+      }
+   }
+}
+
+static void
+brw_predraw_resolve_framebuffer(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   struct intel_renderbuffer *depth_irb;
+
+   /* Resolve the depth buffer's HiZ buffer. */
+   depth_irb = intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
+   if (depth_irb && depth_irb->mt) {
+      intel_miptree_prepare_depth(brw, depth_irb->mt,
+                                  depth_irb->mt_level,
+                                  depth_irb->mt_layer,
+                                  depth_irb->layer_count);
+   }
+
+   /* Resolve color buffers for non-coherent framebuffer fetch. */
+   if (!ctx->Extensions.MESA_shader_framebuffer_fetch &&
+       ctx->FragmentProgram._Current &&
+       ctx->FragmentProgram._Current->info.outputs_read) {
+      const struct gl_framebuffer *fb = ctx->DrawBuffer;
+
+      for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
+         const struct intel_renderbuffer *irb =
+            intel_renderbuffer(fb->_ColorDrawBuffers[i]);
+         /* NOTE(review): irb->mt is not NULL-checked, unlike below. */
+         if (irb) {
+            intel_miptree_prepare_fb_fetch(brw, irb->mt, irb->mt_level,
+                                           irb->mt_layer, irb->layer_count);
+         }
+      }
+   }
+   /* Prepare each color draw buffer that has a miptree for rendering. */
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   for (int i = 0; i < fb->_NumColorDrawBuffers; i++) {
+      struct intel_renderbuffer *irb =
+         intel_renderbuffer(fb->_ColorDrawBuffers[i]);
+
+      if (irb == NULL || irb->mt == NULL)
+         continue;
+
+      mesa_format mesa_format =
+         _mesa_get_render_format(ctx, intel_rb_format(irb));
+      enum isl_format isl_format = brw_isl_format_for_mesa_format(mesa_format);
+      /* Last argument: blend-enable bit of color draw buffer i. */
+      intel_miptree_prepare_render(brw, irb->mt, irb->mt_level,
+                                   irb->mt_layer, irb->layer_count,
+                                   isl_format,
+                                   ctx->Color.BlendEnabled & (1 << i));
+   }
+}
+
 /**
  * \brief Call this after drawing to mark which buffers need resolving
  *
@@ -372,13 +514,26 @@
       front_irb->need_downsample = true;
    if (back_irb)
       back_irb->need_downsample = true;
-   if (depth_irb && brw_depth_writes_enabled(brw)) {
-      intel_renderbuffer_att_set_needs_depth_resolve(depth_att);
-      brw_render_cache_set_add_bo(brw, depth_irb->mt->bo);
+   if (depth_irb) {
+      bool depth_written = brw_depth_writes_enabled(brw);
+      if (depth_att->Layered) {
+         intel_miptree_finish_depth(brw, depth_irb->mt,
+                                    depth_irb->mt_level,
+                                    depth_irb->mt_layer,
+                                    depth_irb->layer_count,
+                                    depth_written);
+      } else {
+         intel_miptree_finish_depth(brw, depth_irb->mt,
+                                    depth_irb->mt_level,
+                                    depth_irb->mt_layer, 1,
+                                    depth_written);
+      }
+      if (depth_written)
+         brw_render_cache_set_add_bo(brw, depth_irb->mt->bo);
    }
 
    if (ctx->Extensions.ARB_stencil_texturing &&
-       stencil_irb && ctx->Stencil._WriteEnabled) {
+       stencil_irb && brw->stencil_write_enabled) {
       brw_render_cache_set_add_bo(brw, stencil_irb->mt->bo);
    }
 
@@ -389,42 +544,65 @@
       if (!irb)
          continue;
      
+      mesa_format mesa_format =
+         _mesa_get_render_format(ctx, intel_rb_format(irb));
+      enum isl_format isl_format = brw_isl_format_for_mesa_format(mesa_format);
+
       brw_render_cache_set_add_bo(brw, irb->mt->bo);
-      intel_miptree_used_for_rendering(
-         brw, irb->mt, irb->mt_level, irb->mt_layer, irb->layer_count);
+      intel_miptree_finish_render(brw, irb->mt, irb->mt_level,
+                                  irb->mt_layer, irb->layer_count,
+                                  isl_format,
+                                  ctx->Color.BlendEnabled & (1 << i));
    }
 }
 
 static void
-brw_predraw_set_aux_buffers(struct brw_context *brw)
+intel_renderbuffer_move_temp_back(struct brw_context *brw,
+                                  struct intel_renderbuffer *irb)
 {
-   if (brw->gen < 9)
+   if (irb->align_wa_mt == NULL)
       return;
 
+   brw_render_cache_set_check_flush(brw, irb->align_wa_mt->bo);
+
+   intel_miptree_copy_slice(brw, irb->align_wa_mt, 0, 0,
+                            irb->mt,
+                            irb->Base.Base.TexImage->Level, irb->mt_layer);
+
+   intel_miptree_reference(&irb->align_wa_mt, NULL);
+
+   /* Finally restore the x,y to correspond to full miptree. */
+   intel_renderbuffer_set_draw_offset(irb);
+
+   /* Make sure render surface state gets re-emitted with updated miptree. */
+   brw->NewGLState |= _NEW_BUFFERS;
+}
+
+static void
+brw_postdraw_reconcile_align_wa_slices(struct brw_context *brw)
+{
    struct gl_context *ctx = &brw->ctx;
    struct gl_framebuffer *fb = ctx->DrawBuffer;
 
+   struct intel_renderbuffer *depth_irb =
+      intel_get_renderbuffer(fb, BUFFER_DEPTH);
+   struct intel_renderbuffer *stencil_irb =
+      intel_get_renderbuffer(fb, BUFFER_STENCIL);
+
+   if (depth_irb && depth_irb->align_wa_mt)
+      intel_renderbuffer_move_temp_back(brw, depth_irb);
+
+   if (stencil_irb && stencil_irb->align_wa_mt)
+      intel_renderbuffer_move_temp_back(brw, stencil_irb);
+
    for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
       struct intel_renderbuffer *irb =
          intel_renderbuffer(fb->_ColorDrawBuffers[i]);
 
-      if (!irb) {
+      if (!irb || irb->align_wa_mt == NULL)
          continue;
-      }
 
-      /* For layered rendering non-compressed fast cleared buffers need to be
-       * resolved. Surface state can carry only one fast color clear value
-       * while each layer may have its own fast clear color value. For
-       * compressed buffers color value is available in the color buffer.
-       */
-      if (irb->layer_count > 1 &&
-          !(irb->mt->aux_disable & INTEL_AUX_DISABLE_CCS) &&
-          !intel_miptree_is_lossless_compressed(brw, irb->mt)) {
-         assert(brw->gen >= 8);
-
-         intel_miptree_resolve_color(brw, irb->mt, irb->mt_level,
-                                     irb->mt_layer, irb->layer_count, 0);
-      }
+      intel_renderbuffer_move_temp_back(brw, irb);
    }
 }
 
@@ -476,7 +654,6 @@
       util_last_bit(ctx->VertexProgram._Current->SamplersUsed);
 
    intel_prepare_render(brw);
-   brw_predraw_set_aux_buffers(brw);
 
    /* This workaround has to happen outside of brw_upload_render_state()
     * because it may flush the batchbuffer for a blit, affecting the state
@@ -484,6 +661,13 @@
     */
    brw_workaround_depthstencil_alignment(brw, 0);
 
+   /* Resolves must occur after updating renderbuffers, updating context state,
+    * and finalizing textures but before setting up any hardware state for
+    * this draw call.
+    */
+   brw_predraw_resolve_inputs(brw);
+   brw_predraw_resolve_framebuffer(brw);
+
    /* Bind all inputs, derive varying and size information:
     */
    brw_merge_inputs(brw, arrays);
@@ -507,6 +691,11 @@
       estimated_max_prim_size += 1024; /* gen6 WM push constants */
       estimated_max_prim_size += 512; /* misc. pad */
 
+      /* Flag BRW_NEW_DRAW_CALL on every draw.  This allows us to have
+       * atoms that happen on every draw call.
+       */
+      brw->ctx.NewDriverState |= BRW_NEW_DRAW_CALL;
+
       /* Flush the batch if it's approaching full, so that we don't wrap while
        * we've got validated state that needs to be in the same batch as the
        * primitives.
@@ -626,6 +815,7 @@
       intel_batchbuffer_flush(brw);
 
    brw_program_cache_check_size(brw);
+   brw_postdraw_reconcile_align_wa_slices(brw);
    brw_postdraw_set_buffers_need_resolve(brw);
 
    return;
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index fced0b4..5b56aaf 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -242,86 +242,6 @@
            : double_types_float[size]);
 }
 
-static bool
-is_passthru_format(uint32_t format)
-{
-   switch (format) {
-   case ISL_FORMAT_R64_PASSTHRU:
-   case ISL_FORMAT_R64G64_PASSTHRU:
-   case ISL_FORMAT_R64G64B64_PASSTHRU:
-   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
-      return true;
-   default:
-      return false;
-   }
-}
-
-static int
-uploads_needed(uint32_t format)
-{
-   if (!is_passthru_format(format))
-      return 1;
-
-   switch (format) {
-   case ISL_FORMAT_R64_PASSTHRU:
-   case ISL_FORMAT_R64G64_PASSTHRU:
-      return 1;
-   case ISL_FORMAT_R64G64B64_PASSTHRU:
-   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
-      return 2;
-   default:
-      unreachable("not reached");
-   }
-}
-
-/*
- * Returns the number of componentes associated with a format that is used on
- * a 64 to 32 format split. See downsize_format()
- */
-static int
-upload_format_size(uint32_t upload_format)
-{
-   switch (upload_format) {
-   case ISL_FORMAT_R32G32_FLOAT:
-      return 2;
-   case ISL_FORMAT_R32G32B32A32_FLOAT:
-      return 4;
-   default:
-      unreachable("not reached");
-   }
-}
-
-/*
- * Returns the format that we are finally going to use when upload a vertex
- * element. It will only change if we are using *64*PASSTHRU formats, as for
- * gen < 8 they need to be splitted on two *32*FLOAT formats.
- *
- * @upload points in which upload we are. Valid values are [0,1]
- */
-static uint32_t
-downsize_format_if_needed(uint32_t format,
-                          int upload)
-{
-   assert(upload == 0 || upload == 1);
-
-   if (!is_passthru_format(format))
-      return format;
-
-   switch (format) {
-   case ISL_FORMAT_R64_PASSTHRU:
-      return ISL_FORMAT_R32G32_FLOAT;
-   case ISL_FORMAT_R64G64_PASSTHRU:
-      return ISL_FORMAT_R32G32B32A32_FLOAT;
-   case ISL_FORMAT_R64G64B64_PASSTHRU:
-      return !upload ? ISL_FORMAT_R32G32B32A32_FLOAT
-                     : ISL_FORMAT_R32G32_FLOAT;
-   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
-      return ISL_FORMAT_R32G32B32A32_FLOAT;
-   default:
-      unreachable("not reached");
-   }
-}
-
 /**
  * Given vertex array type/size/format/normalized info, return
  * the appopriate hardware surface type.
@@ -702,7 +622,8 @@
       const uint32_t start = buffer_range_start[i];
       const uint32_t range = buffer_range_end[i] - buffer_range_start[i];
 
-      buffer->bo = intel_bufferobj_buffer(brw, enabled_buffer[i], start, range);
+      buffer->bo = intel_bufferobj_buffer(brw, enabled_buffer[i], start,
+                                          range, false);
       brw_bo_reference(buffer->bo);
    }
 
@@ -788,384 +709,9 @@
    }
 }
 
-/**
- * Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
- */
-uint32_t *
-brw_emit_vertex_buffer_state(struct brw_context *brw,
-                             unsigned buffer_nr,
-                             struct brw_bo *bo,
-                             unsigned start_offset,
-                             unsigned end_offset,
-                             unsigned stride,
-                             unsigned step_rate,
-                             uint32_t *__map)
-{
-   struct gl_context *ctx = &brw->ctx;
-   uint32_t dw0;
-
-   if (brw->gen >= 8) {
-      dw0 = buffer_nr << GEN6_VB0_INDEX_SHIFT;
-   } else if (brw->gen >= 6) {
-      dw0 = (buffer_nr << GEN6_VB0_INDEX_SHIFT) |
-            (step_rate ? GEN6_VB0_ACCESS_INSTANCEDATA
-                       : GEN6_VB0_ACCESS_VERTEXDATA);
-   } else {
-      dw0 = (buffer_nr << BRW_VB0_INDEX_SHIFT) |
-            (step_rate ? BRW_VB0_ACCESS_INSTANCEDATA
-                       : BRW_VB0_ACCESS_VERTEXDATA);
-   }
-
-   if (brw->gen >= 7)
-      dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;
-
-   switch (brw->gen) {
-   case 7:
-      dw0 |= GEN7_MOCS_L3 << 16;
-      break;
-   case 8:
-      dw0 |= BDW_MOCS_WB << 16;
-      break;
-   case 9:
-      dw0 |= SKL_MOCS_WB << 16;
-      break;
-   }
-
-   WARN_ONCE(stride >= (brw->gen >= 5 ? 2048 : 2047),
-             "VBO stride %d too large, bad rendering may occur\n",
-             stride);
-   OUT_BATCH(dw0 | (stride << BRW_VB0_PITCH_SHIFT));
-   if (brw->gen >= 8) {
-      OUT_RELOC64(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset);
-      /* From the BSpec: 3D Pipeline Stages - 3D Pipeline Geometry -
-       *                 Vertex Fetch (VF) Stage - State
-       *
-       * Instead of "VBState.StartingBufferAddress + VBState.MaxIndex x
-       * VBState.BufferPitch", the address of the byte immediately beyond the
-       * last valid byte of the buffer is determined by
-       * "VBState.StartingBufferAddress + VBState.BufferSize".
-       */
-      OUT_BATCH(end_offset - start_offset);
-   } else if (brw->gen >= 5) {
-      OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset);
-      /* From the BSpec: 3D Pipeline Stages - 3D Pipeline Geometry -
-       *                 Vertex Fetch (VF) Stage - State
-       *
-       *  Instead of "VBState.StartingBufferAddress + VBState.MaxIndex x
-       *  VBState.BufferPitch", the address of the byte immediately beyond the
-       *  last valid byte of the buffer is determined by
-       *  "VBState.EndAddress + 1".
-       */
-      OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, end_offset - 1);
-      OUT_BATCH(step_rate);
-   } else {
-      OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset);
-      OUT_BATCH(0);
-      OUT_BATCH(step_rate);
-   }
-
-   return __map;
-}
-
-static void
-brw_emit_vertices(struct brw_context *brw)
-{
-   GLuint i;
-
-   brw_prepare_vertices(brw);
-   brw_prepare_shader_draw_parameters(brw);
-
-   brw_emit_query_begin(brw);
-
-   const struct brw_vs_prog_data *vs_prog_data =
-      brw_vs_prog_data(brw->vs.base.prog_data);
-
-   unsigned nr_elements = brw->vb.nr_enabled;
-   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
-       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
-      ++nr_elements;
-   if (vs_prog_data->uses_drawid)
-      nr_elements++;
-
-   /* If any of the formats of vb.enabled needs more that one upload, we need
-    * to add it to nr_elements */
-   unsigned extra_uploads = 0;
-   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
-      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
-
-      if (uploads_needed(format) > 1)
-         extra_uploads++;
-   }
-   nr_elements += extra_uploads;
-
-   /* If the VS doesn't read any inputs (calculating vertex position from
-    * a state variable for some reason, for example), emit a single pad
-    * VERTEX_ELEMENT struct and bail.
-    *
-    * The stale VB state stays in place, but they don't do anything unless
-    * a VE loads from them.
-    */
-   if (nr_elements == 0) {
-      BEGIN_BATCH(3);
-      OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | 1);
-      if (brw->gen >= 6) {
-	 OUT_BATCH((0 << GEN6_VE0_INDEX_SHIFT) |
-		   GEN6_VE0_VALID |
-		   (ISL_FORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
-		   (0 << BRW_VE0_SRC_OFFSET_SHIFT));
-      } else {
-	 OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
-		   BRW_VE0_VALID |
-		   (ISL_FORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
-		   (0 << BRW_VE0_SRC_OFFSET_SHIFT));
-      }
-      OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
-		(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
-		(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
-		(BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
-      ADVANCE_BATCH();
-      return;
-   }
-
-   /* Now emit VB and VEP state packets.
-    */
-
-   const bool uses_draw_params =
-      vs_prog_data->uses_basevertex ||
-      vs_prog_data->uses_baseinstance;
-   const unsigned nr_buffers = brw->vb.nr_buffers +
-      uses_draw_params + vs_prog_data->uses_drawid;
-
-   if (nr_buffers) {
-      if (brw->gen >= 6) {
-	 assert(nr_buffers <= 33);
-      } else {
-	 assert(nr_buffers <= 17);
-      }
-
-      BEGIN_BATCH(1 + 4 * nr_buffers);
-      OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
-      for (i = 0; i < brw->vb.nr_buffers; i++) {
-	 struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
-         /* Prior to Haswell and Bay Trail we have to use 4-component formats
-          * to fake 3-component ones.  In particular, we do this for
-          * half-float and 8 and 16-bit integer formats.  This means that the
-          * vertex element may poke over the end of the buffer by 2 bytes.
-          */
-         unsigned padding =
-            (brw->gen <= 7 && !brw->is_baytrail && !brw->is_haswell) * 2;
-         EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->offset,
-                                  buffer->offset + buffer->size + padding,
-                                  buffer->stride, buffer->step_rate);
-
-      }
-
-      if (uses_draw_params) {
-         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
-                                  brw->draw.draw_params_bo,
-                                  brw->draw.draw_params_offset,
-                                  brw->draw.draw_params_bo->size,
-                                  0,  /* stride */
-                                  0); /* step rate */
-      }
-
-      if (vs_prog_data->uses_drawid) {
-         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers + 1,
-                                  brw->draw.draw_id_bo,
-                                  brw->draw.draw_id_offset,
-                                  brw->draw.draw_id_bo->size,
-                                  0,  /* stride */
-                                  0); /* step rate */
-      }
-
-      ADVANCE_BATCH();
-   }
-
-   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS, presumably
-    * for VertexID/InstanceID.
-    */
-   if (brw->gen >= 6) {
-      assert(nr_elements <= 34);
-   } else {
-      assert(nr_elements <= 18);
-   }
-
-   struct brw_vertex_element *gen6_edgeflag_input = NULL;
-
-   BEGIN_BATCH(1 + nr_elements * 2);
-   OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (2 * nr_elements - 1));
-   for (i = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
-      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
-      uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
-      uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
-      uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC;
-      uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC;
-      unsigned num_uploads = 1;
-      unsigned c;
-
-      num_uploads = uploads_needed(format);
-
-      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
-         /* Gen6+ passes edgeflag as sideband along with the vertex, instead
-          * of in the VUE.  We have to upload it sideband as the last vertex
-          * element according to the B-Spec.
-          */
-         if (brw->gen >= 6) {
-            gen6_edgeflag_input = input;
-            continue;
-         }
-      }
-
-      for (c = 0; c < num_uploads; c++) {
-         uint32_t upload_format = downsize_format_if_needed(format, c);
-         /* If we need more that one upload, the offset stride would be 128
-          * bits (16 bytes), as for previous uploads we are using the full
-          * entry. */
-         unsigned int offset = input->offset + c * 16;
-         int size = input->glarray->Size;
-
-         if (is_passthru_format(format))
-            size = upload_format_size(upload_format);
-
-         switch (size) {
-         case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
-         case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
-         case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
-         case 3: comp3 = input->glarray->Integer
-                         ? BRW_VE1_COMPONENT_STORE_1_INT
-                         : BRW_VE1_COMPONENT_STORE_1_FLT;
-            break;
-         }
-
-         if (brw->gen >= 6) {
-            OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) |
-                      GEN6_VE0_VALID |
-                      (upload_format << BRW_VE0_FORMAT_SHIFT) |
-                      (offset << BRW_VE0_SRC_OFFSET_SHIFT));
-         } else {
-            OUT_BATCH((input->buffer << BRW_VE0_INDEX_SHIFT) |
-                      BRW_VE0_VALID |
-                      (upload_format << BRW_VE0_FORMAT_SHIFT) |
-                      (offset << BRW_VE0_SRC_OFFSET_SHIFT));
-         }
-
-         if (brw->gen >= 5)
-            OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
-                      (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
-                      (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
-                      (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
-         else
-            OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
-                      (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
-                      (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
-                      (comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
-                      ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
-      }
-   }
-
-   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
-       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) {
-      uint32_t dw0 = 0, dw1 = 0;
-      uint32_t comp0 = BRW_VE1_COMPONENT_STORE_0;
-      uint32_t comp1 = BRW_VE1_COMPONENT_STORE_0;
-      uint32_t comp2 = BRW_VE1_COMPONENT_STORE_0;
-      uint32_t comp3 = BRW_VE1_COMPONENT_STORE_0;
-
-      if (vs_prog_data->uses_basevertex)
-         comp0 = BRW_VE1_COMPONENT_STORE_SRC;
-
-      if (vs_prog_data->uses_baseinstance)
-         comp1 = BRW_VE1_COMPONENT_STORE_SRC;
-
-      if (vs_prog_data->uses_vertexid)
-         comp2 = BRW_VE1_COMPONENT_STORE_VID;
-
-      if (vs_prog_data->uses_instanceid)
-         comp3 = BRW_VE1_COMPONENT_STORE_IID;
-
-      dw1 = (comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
-            (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
-            (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
-            (comp3 << BRW_VE1_COMPONENT_3_SHIFT);
-
-      if (brw->gen >= 6) {
-         dw0 |= GEN6_VE0_VALID |
-                brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT |
-                ISL_FORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT;
-      } else {
-         dw0 |= BRW_VE0_VALID |
-                brw->vb.nr_buffers << BRW_VE0_INDEX_SHIFT |
-                ISL_FORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT;
-	 dw1 |= (i * 4) << BRW_VE1_DST_OFFSET_SHIFT;
-      }
-
-      /* Note that for gl_VertexID, gl_InstanceID, and gl_PrimitiveID values,
-       * the format is ignored and the value is always int.
-       */
-
-      OUT_BATCH(dw0);
-      OUT_BATCH(dw1);
-   }
-
-   if (vs_prog_data->uses_drawid) {
-      uint32_t dw0 = 0, dw1 = 0;
-
-      dw1 = (BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
-            (BRW_VE1_COMPONENT_STORE_0   << BRW_VE1_COMPONENT_1_SHIFT) |
-            (BRW_VE1_COMPONENT_STORE_0   << BRW_VE1_COMPONENT_2_SHIFT) |
-            (BRW_VE1_COMPONENT_STORE_0   << BRW_VE1_COMPONENT_3_SHIFT);
-
-      if (brw->gen >= 6) {
-         dw0 |= GEN6_VE0_VALID |
-                ((brw->vb.nr_buffers + 1) << GEN6_VE0_INDEX_SHIFT) |
-                (ISL_FORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
-      } else {
-         dw0 |= BRW_VE0_VALID |
-                ((brw->vb.nr_buffers + 1) << BRW_VE0_INDEX_SHIFT) |
-                (ISL_FORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
-
-	 dw1 |= (i * 4) << BRW_VE1_DST_OFFSET_SHIFT;
-      }
-
-      OUT_BATCH(dw0);
-      OUT_BATCH(dw1);
-   }
-
-   if (brw->gen >= 6 && gen6_edgeflag_input) {
-      uint32_t format =
-         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
-
-      OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) |
-                GEN6_VE0_VALID |
-                GEN6_VE0_EDGE_FLAG_ENABLE |
-                (format << BRW_VE0_FORMAT_SHIFT) |
-                (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT));
-      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
-   }
-
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state brw_vertices = {
-   .dirty = {
-      .mesa = _NEW_POLYGON,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VERTICES |
-             BRW_NEW_VS_PROG_DATA,
-   },
-   .emit = brw_emit_vertices,
-};
-
 static void
 brw_upload_indices(struct brw_context *brw)
 {
-   struct gl_context *ctx = &brw->ctx;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
    GLuint ib_size;
    struct brw_bo *old_bo = brw->ib.bo;
@@ -1176,7 +722,7 @@
    if (index_buffer == NULL)
       return;
 
-   ib_type_size = _mesa_sizeof_type(index_buffer->type);
+   ib_type_size = index_buffer->index_size;
    ib_size = index_buffer->count ? ib_type_size * index_buffer->count :
                                    index_buffer->obj->Size;
    bufferobj = index_buffer->obj;
@@ -1192,35 +738,14 @@
    } else {
       offset = (GLuint) (unsigned long) index_buffer->ptr;
 
-      /* If the index buffer isn't aligned to its element size, we have to
-       * rebase it into a temporary.
-       */
-      if ((ib_type_size - 1) & offset) {
-         perf_debug("copying index buffer to a temporary to work around "
-                    "misaligned offset %d\n", offset);
-
-         GLubyte *map = ctx->Driver.MapBufferRange(ctx,
-                                                   offset,
-                                                   ib_size,
-                                                   GL_MAP_READ_BIT,
-                                                   bufferobj,
-                                                   MAP_INTERNAL);
-
-         intel_upload_data(brw, map, ib_size, ib_type_size,
-                           &brw->ib.bo, &offset);
-         brw->ib.size = brw->ib.bo->size;
-
-         ctx->Driver.UnmapBuffer(ctx, bufferobj, MAP_INTERNAL);
-      } else {
-         struct brw_bo *bo =
-            intel_bufferobj_buffer(brw, intel_buffer_object(bufferobj),
-                                   offset, ib_size);
-         if (bo != brw->ib.bo) {
-            brw_bo_unreference(brw->ib.bo);
-            brw->ib.bo = bo;
-            brw->ib.size = bufferobj->Size;
-            brw_bo_reference(bo);
-         }
+      struct brw_bo *bo =
+         intel_bufferobj_buffer(brw, intel_buffer_object(bufferobj),
+                                offset, ib_size, false);
+      if (bo != brw->ib.bo) {
+         brw_bo_unreference(brw->ib.bo);
+         brw->ib.bo = bo;
+         brw->ib.size = bufferobj->Size;
+         brw_bo_reference(bo);
       }
    }
 
@@ -1233,8 +758,8 @@
    if (brw->ib.bo != old_bo)
       brw->ctx.NewDriverState |= BRW_NEW_INDEX_BUFFER;
 
-   if (index_buffer->type != brw->ib.type) {
-      brw->ib.type = index_buffer->type;
+   if (index_buffer->index_size != brw->ib.index_size) {
+      brw->ib.index_size = index_buffer->index_size;
       brw->ctx.NewDriverState |= BRW_NEW_INDEX_BUFFER;
    }
 }
@@ -1247,42 +772,3 @@
    },
    .emit = brw_upload_indices,
 };
-
-static void
-brw_emit_index_buffer(struct brw_context *brw)
-{
-   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
-   GLuint cut_index_setting;
-
-   if (index_buffer == NULL)
-      return;
-
-   if (brw->prim_restart.enable_cut_index && !brw->is_haswell) {
-      cut_index_setting = BRW_CUT_INDEX_ENABLE;
-   } else {
-      cut_index_setting = 0;
-   }
-
-   BEGIN_BATCH(3);
-   OUT_BATCH(CMD_INDEX_BUFFER << 16 |
-             cut_index_setting |
-             brw_get_index_type(index_buffer->type) |
-             1);
-   OUT_RELOC(brw->ib.bo,
-             I915_GEM_DOMAIN_VERTEX, 0,
-             0);
-   OUT_RELOC(brw->ib.bo,
-             I915_GEM_DOMAIN_VERTEX, 0,
-	     brw->ib.size - 1);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state brw_index_buffer = {
-   .dirty = {
-      .mesa = 0,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_INDEX_BUFFER,
-   },
-   .emit = brw_emit_index_buffer,
-};
diff --git a/src/mesa/drivers/dri/i965/brw_formatquery.c b/src/mesa/drivers/dri/i965/brw_formatquery.c
index 96cc6e0..5faf91f 100644
--- a/src/mesa/drivers/dri/i965/brw_formatquery.c
+++ b/src/mesa/drivers/dri/i965/brw_formatquery.c
@@ -37,6 +37,7 @@
    (void) internalFormat;
 
    switch (brw->gen) {
+   case 10:
    case 9:
       samples[0] = 16;
       samples[1] = 8;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
deleted file mode 100644
index 58fa207..0000000
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Copyright © 2010 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file brw_wm_channel_expressions.cpp
- *
- * Breaks vector operations down into operations on each component.
- *
- * The 965 fragment shader receives 8 or 16 pixels at a time, so each
- * channel of a vector is laid out as 1 or 2 8-float registers.  Each
- * ALU operation operates on one of those channel registers.  As a
- * result, there is no value to the 965 fragment shader in tracking
- * "vector" expressions in the sense of GLSL fragment shaders, when
- * doing a channel at a time may help in constant folding, algebraic
- * simplification, and reducing the liveness of channel registers.
- *
- * The exception to the desire to break everything down to floats is
- * texturing.  The texture sampler returns a writemasked masked
- * 4/8-register sequence containing the texture values.  We don't want
- * to dispatch to the sampler separately for each channel we need, so
- * we do retain the vector types in that case.
- */
-
-#include "brw_program.h"
-#include "compiler/glsl/ir.h"
-#include "compiler/glsl/ir_expression_flattening.h"
-#include "compiler/glsl_types.h"
-
-class ir_channel_expressions_visitor : public ir_hierarchical_visitor {
-public:
-   ir_channel_expressions_visitor()
-   {
-      this->progress = false;
-      this->mem_ctx = NULL;
-   }
-
-   ir_visitor_status visit_leave(ir_assignment *);
-
-   ir_rvalue *get_element(ir_variable *var, unsigned int element);
-   void assign(ir_assignment *ir, int elem, ir_rvalue *val);
-
-   bool progress;
-   void *mem_ctx;
-};
-
-static bool
-channel_expressions_predicate(ir_instruction *ir)
-{
-   ir_expression *expr = ir->as_expression();
-   unsigned int i;
-
-   if (!expr)
-      return false;
-
-   switch (expr->operation) {
-      case ir_unop_pack_half_2x16:
-      case ir_unop_pack_snorm_2x16:
-      case ir_unop_pack_snorm_4x8:
-      case ir_unop_pack_unorm_2x16:
-      case ir_unop_pack_unorm_4x8:
-         return false;
-
-      /* these opcodes need to act on the whole vector,
-       * just like texturing.
-       */
-      case ir_unop_interpolate_at_centroid:
-      case ir_binop_interpolate_at_offset:
-      case ir_binop_interpolate_at_sample:
-      case ir_unop_pack_double_2x32:
-      case ir_unop_pack_int_2x32:
-      case ir_unop_pack_uint_2x32:
-         return false;
-      default:
-         break;
-   }
-
-   for (i = 0; i < expr->get_num_operands(); i++) {
-      if (expr->operands[i]->type->is_vector())
-	 return true;
-   }
-
-   return false;
-}
-
-bool
-brw_do_channel_expressions(exec_list *instructions)
-{
-   ir_channel_expressions_visitor v;
-
-   /* Pull out any matrix expression to a separate assignment to a
-    * temp.  This will make our handling of the breakdown to
-    * operations on the matrix's vector components much easier.
-    */
-   do_expression_flattening(instructions, channel_expressions_predicate);
-
-   visit_list_elements(&v, instructions);
-
-   return v.progress;
-}
-
-ir_rvalue *
-ir_channel_expressions_visitor::get_element(ir_variable *var, unsigned int elem)
-{
-   ir_dereference *deref;
-
-   if (var->type->is_scalar())
-      return new(mem_ctx) ir_dereference_variable(var);
-
-   assert(elem < var->type->components());
-   deref = new(mem_ctx) ir_dereference_variable(var);
-   return new(mem_ctx) ir_swizzle(deref, elem, 0, 0, 0, 1);
-}
-
-void
-ir_channel_expressions_visitor::assign(ir_assignment *ir, int elem, ir_rvalue *val)
-{
-   ir_dereference *lhs = ir->lhs->clone(mem_ctx, NULL);
-   ir_assignment *assign;
-
-   /* This assign-of-expression should have been generated by the
-    * expression flattening visitor (since we never short circit to
-    * not flatten, even for plain assignments of variables), so the
-    * writemask is always full.
-    */
-   assert(ir->write_mask == (1 << ir->lhs->type->components()) - 1);
-
-   assign = new(mem_ctx) ir_assignment(lhs, val, NULL, (1 << elem));
-   ir->insert_before(assign);
-}
-
-ir_visitor_status
-ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
-{
-   ir_expression *expr = ir->rhs->as_expression();
-   bool found_vector = false;
-   unsigned int i, vector_elements = 1;
-   ir_variable *op_var[4];
-
-   if (!expr)
-      return visit_continue;
-
-   if (!this->mem_ctx)
-      this->mem_ctx = ralloc_parent(ir);
-
-   for (i = 0; i < expr->get_num_operands(); i++) {
-      if (expr->operands[i]->type->is_vector()) {
-	 found_vector = true;
-	 vector_elements = expr->operands[i]->type->vector_elements;
-	 break;
-      }
-   }
-   if (!found_vector)
-      return visit_continue;
-
-   switch (expr->operation) {
-      case ir_unop_pack_half_2x16:
-      case ir_unop_pack_snorm_2x16:
-      case ir_unop_pack_snorm_4x8:
-      case ir_unop_pack_unorm_2x16:
-      case ir_unop_pack_unorm_4x8:
-      case ir_unop_interpolate_at_centroid:
-      case ir_binop_interpolate_at_offset:
-      case ir_binop_interpolate_at_sample:
-      /* We scalarize these in NIR, so no need to do it here */
-      case ir_unop_pack_double_2x32:
-      case ir_unop_pack_int_2x32:
-      case ir_unop_pack_uint_2x32:
-         return visit_continue;
-
-      default:
-         break;
-   }
-
-   /* Store the expression operands in temps so we can use them
-    * multiple times.
-    */
-   for (i = 0; i < expr->get_num_operands(); i++) {
-      ir_assignment *assign;
-      ir_dereference *deref;
-
-      assert(!expr->operands[i]->type->is_matrix());
-
-      op_var[i] = new(mem_ctx) ir_variable(expr->operands[i]->type,
-					   "channel_expressions",
-					   ir_var_temporary);
-      ir->insert_before(op_var[i]);
-
-      deref = new(mem_ctx) ir_dereference_variable(op_var[i]);
-      assign = new(mem_ctx) ir_assignment(deref,
-					  expr->operands[i],
-					  NULL);
-      ir->insert_before(assign);
-   }
-
-   const glsl_type *element_type = glsl_type::get_instance(ir->lhs->type->base_type,
-							   1, 1);
-
-   /* OK, time to break down this vector operation. */
-   switch (expr->operation) {
-   case ir_unop_bit_not:
-   case ir_unop_logic_not:
-   case ir_unop_neg:
-   case ir_unop_abs:
-   case ir_unop_sign:
-   case ir_unop_rcp:
-   case ir_unop_rsq:
-   case ir_unop_sqrt:
-   case ir_unop_exp:
-   case ir_unop_log:
-   case ir_unop_exp2:
-   case ir_unop_log2:
-   case ir_unop_bitcast_i2f:
-   case ir_unop_bitcast_f2i:
-   case ir_unop_bitcast_f2u:
-   case ir_unop_bitcast_u2f:
-   case ir_unop_bitcast_u642d:
-   case ir_unop_bitcast_i642d:
-   case ir_unop_bitcast_d2u64:
-   case ir_unop_bitcast_d2i64:
-   case ir_unop_i2u:
-   case ir_unop_u2i:
-   case ir_unop_f2i:
-   case ir_unop_f2u:
-   case ir_unop_i2f:
-   case ir_unop_f2b:
-   case ir_unop_b2f:
-   case ir_unop_i2b:
-   case ir_unop_b2i:
-   case ir_unop_u2f:
-   case ir_unop_d2f:
-   case ir_unop_f2d:
-   case ir_unop_d2i:
-   case ir_unop_i2d:
-   case ir_unop_d2u:
-   case ir_unop_u2d:
-   case ir_unop_d2b:
-   case ir_unop_i642i:
-   case ir_unop_u642i:
-   case ir_unop_i642u:
-   case ir_unop_u642u:
-   case ir_unop_i642b:
-   case ir_unop_i642f:
-   case ir_unop_u642f:
-   case ir_unop_i642d:
-   case ir_unop_u642d:
-   case ir_unop_i2i64:
-   case ir_unop_u2i64:
-   case ir_unop_b2i64:
-   case ir_unop_f2i64:
-   case ir_unop_d2i64:
-   case ir_unop_i2u64:
-   case ir_unop_u2u64:
-   case ir_unop_f2u64:
-   case ir_unop_d2u64:
-   case ir_unop_u642i64:
-   case ir_unop_i642u64:
-   case ir_unop_trunc:
-   case ir_unop_ceil:
-   case ir_unop_floor:
-   case ir_unop_fract:
-   case ir_unop_round_even:
-   case ir_unop_sin:
-   case ir_unop_cos:
-   case ir_unop_dFdx:
-   case ir_unop_dFdx_coarse:
-   case ir_unop_dFdx_fine:
-   case ir_unop_dFdy:
-   case ir_unop_dFdy_coarse:
-   case ir_unop_dFdy_fine:
-   case ir_unop_bitfield_reverse:
-   case ir_unop_bit_count:
-   case ir_unop_find_msb:
-   case ir_unop_find_lsb:
-   case ir_unop_saturate:
-   case ir_unop_subroutine_to_int:
-      for (i = 0; i < vector_elements; i++) {
-	 ir_rvalue *op0 = get_element(op_var[0], i);
-
-	 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
-						  element_type,
-						  op0,
-						  NULL));
-      }
-      break;
-
-   case ir_binop_add:
-   case ir_binop_sub:
-   case ir_binop_mul:
-   case ir_binop_imul_high:
-   case ir_binop_div:
-   case ir_binop_carry:
-   case ir_binop_borrow:
-   case ir_binop_mod:
-   case ir_binop_min:
-   case ir_binop_max:
-   case ir_binop_pow:
-   case ir_binop_lshift:
-   case ir_binop_rshift:
-   case ir_binop_bit_and:
-   case ir_binop_bit_xor:
-   case ir_binop_bit_or:
-   case ir_binop_logic_and:
-   case ir_binop_logic_xor:
-   case ir_binop_logic_or:
-   case ir_binop_less:
-   case ir_binop_greater:
-   case ir_binop_lequal:
-   case ir_binop_gequal:
-   case ir_binop_equal:
-   case ir_binop_nequal:
-   case ir_binop_ldexp:
-      for (i = 0; i < vector_elements; i++) {
-	 ir_rvalue *op0 = get_element(op_var[0], i);
-	 ir_rvalue *op1 = get_element(op_var[1], i);
-
-	 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
-						  element_type,
-						  op0,
-						  op1));
-      }
-      break;
-
-   case ir_binop_dot: {
-      ir_expression *last = NULL;
-      for (i = 0; i < vector_elements; i++) {
-	 ir_rvalue *op0 = get_element(op_var[0], i);
-	 ir_rvalue *op1 = get_element(op_var[1], i);
-	 ir_expression *temp;
-
-	 temp = new(mem_ctx) ir_expression(ir_binop_mul,
-					   element_type,
-					   op0,
-					   op1);
-	 if (last) {
-	    last = new(mem_ctx) ir_expression(ir_binop_add,
-					      element_type,
-					      temp,
-					      last);
-	 } else {
-	    last = temp;
-	 }
-      }
-      assign(ir, 0, last);
-      break;
-   }
-
-   case ir_binop_all_equal:
-   case ir_binop_any_nequal: {
-      ir_expression *last = NULL;
-      for (i = 0; i < vector_elements; i++) {
-	 ir_rvalue *op0 = get_element(op_var[0], i);
-	 ir_rvalue *op1 = get_element(op_var[1], i);
-	 ir_expression *temp;
-	 ir_expression_operation join;
-
-	 if (expr->operation == ir_binop_all_equal)
-	    join = ir_binop_logic_and;
-	 else
-	    join = ir_binop_logic_or;
-
-	 temp = new(mem_ctx) ir_expression(expr->operation,
-					   element_type,
-					   op0,
-					   op1);
-	 if (last) {
-	    last = new(mem_ctx) ir_expression(join,
-					      element_type,
-					      temp,
-					      last);
-	 } else {
-	    last = temp;
-	 }
-      }
-      assign(ir, 0, last);
-      break;
-   }
-   case ir_unop_noise:
-      unreachable("noise should have been broken down to function call");
-
-   case ir_binop_ubo_load:
-   case ir_unop_get_buffer_size:
-      unreachable("not yet supported");
-
-   case ir_triop_fma:
-   case ir_triop_lrp:
-   case ir_triop_csel:
-   case ir_triop_bitfield_extract:
-      for (i = 0; i < vector_elements; i++) {
-	 ir_rvalue *op0 = get_element(op_var[0], i);
-	 ir_rvalue *op1 = get_element(op_var[1], i);
-	 ir_rvalue *op2 = get_element(op_var[2], i);
-
-	 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
-						  element_type,
-						  op0,
-						  op1,
-						  op2));
-      }
-      break;
-
-   case ir_quadop_bitfield_insert:
-      for (i = 0; i < vector_elements; i++) {
-         ir_rvalue *op0 = get_element(op_var[0], i);
-         ir_rvalue *op1 = get_element(op_var[1], i);
-         ir_rvalue *op2 = get_element(op_var[2], i);
-         ir_rvalue *op3 = get_element(op_var[3], i);
-
-         assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
-                                                  element_type,
-                                                  op0,
-                                                  op1,
-                                                  op2,
-                                                  op3));
-      }
-      break;
-
-   case ir_unop_pack_snorm_2x16:
-   case ir_unop_pack_snorm_4x8:
-   case ir_unop_pack_unorm_2x16:
-   case ir_unop_pack_unorm_4x8:
-   case ir_unop_pack_half_2x16:
-   case ir_unop_unpack_snorm_2x16:
-   case ir_unop_unpack_snorm_4x8:
-   case ir_unop_unpack_unorm_2x16:
-   case ir_unop_unpack_unorm_4x8:
-   case ir_unop_unpack_half_2x16:
-   case ir_binop_vector_extract:
-   case ir_triop_vector_insert:
-   case ir_quadop_vector:
-   case ir_unop_ssbo_unsized_array_length:
-      unreachable("should have been lowered");
-
-   case ir_unop_interpolate_at_centroid:
-   case ir_binop_interpolate_at_offset:
-   case ir_binop_interpolate_at_sample:
-   case ir_unop_unpack_double_2x32:
-      unreachable("not reached: expression operates on scalars only");
-
-   case ir_unop_pack_double_2x32:
-   case ir_unop_pack_int_2x32:
-   case ir_unop_pack_uint_2x32:
-      unreachable("not reached: to be lowered in NIR, should've been skipped");
-
-   case ir_unop_frexp_sig:
-   case ir_unop_frexp_exp:
-      unreachable("should have been lowered by lower_instructions");
-
-   case ir_unop_vote_any:
-   case ir_unop_vote_all:
-   case ir_unop_vote_eq:
-   case ir_unop_unpack_int_2x32:
-   case ir_unop_unpack_uint_2x32:
-   case ir_unop_ballot:
-   case ir_unop_read_first_invocation:
-   case ir_binop_read_invocation:
-      unreachable("unsupported");
-   }
-
-   ir->remove();
-   this->progress = true;
-
-   return visit_continue;
-}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
deleted file mode 100644
index ce64dc6..0000000
--- a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright © 2010 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file brw_wm_vector_splitting.cpp
- *
- * If a vector is only ever referenced by its components, then
- * split those components out to individual variables so they can be
- * handled normally by other optimization passes.
- *
- * This skips vectors in uniforms and varyings, which need to be
- * accessible as vectors for their access by the GL.  Also, vector
- * results of non-variable-derefs in assignments aren't handled
- * because to do so we would have to store the vector result to a
- * temporary in order to unload each channel, and to do so would just
- * loop us back to where we started.  For the 965, this is exactly the
- * behavior we want for the results of texture lookups, but probably not for
- */
-
-#include "brw_program.h"
-#include "main/imports.h"
-#include "compiler/glsl/ir.h"
-#include "compiler/glsl/ir_rvalue_visitor.h"
-#include "compiler/glsl_types.h"
-#include "util/hash_table.h"
-
-static bool debug = false;
-
-class variable_entry : public exec_node
-{
-public:
-   variable_entry(ir_variable *var)
-   {
-      this->var = var;
-      this->whole_vector_access = 0;
-      this->mem_ctx = NULL;
-   }
-
-   ir_variable *var; /* The key: the variable's pointer. */
-
-   /** Number of times the variable is referenced, including assignments. */
-   unsigned whole_vector_access;
-
-   ir_variable *components[4];
-
-   /** ralloc_parent(this->var) -- the shader's ralloc context. */
-   void *mem_ctx;
-};
-
-class ir_vector_reference_visitor : public ir_hierarchical_visitor {
-public:
-   ir_vector_reference_visitor(void)
-   {
-      this->mem_ctx = ralloc_context(NULL);
-      this->ht = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
-                                         _mesa_key_pointer_equal);
-   }
-
-   ~ir_vector_reference_visitor(void)
-   {
-      ralloc_free(mem_ctx);
-   }
-
-   virtual ir_visitor_status visit(ir_variable *);
-   virtual ir_visitor_status visit(ir_dereference_variable *);
-   virtual ir_visitor_status visit_enter(ir_swizzle *);
-   virtual ir_visitor_status visit_enter(ir_assignment *);
-   virtual ir_visitor_status visit_enter(ir_function_signature *);
-
-   variable_entry *get_variable_entry(ir_variable *var);
-
-   /* List of variable_entry */
-   struct hash_table *ht;
-
-   void *mem_ctx;
-};
-
-variable_entry *
-ir_vector_reference_visitor::get_variable_entry(ir_variable *var)
-{
-   assert(var);
-
-   if (!var->type->is_vector())
-      return NULL;
-
-   switch (var->data.mode) {
-   case ir_var_uniform:
-   case ir_var_shader_storage:
-   case ir_var_shader_shared:
-   case ir_var_shader_in:
-   case ir_var_shader_out:
-   case ir_var_system_value:
-   case ir_var_function_in:
-   case ir_var_function_out:
-   case ir_var_function_inout:
-      /* Can't split varyings or uniforms.  Function in/outs won't get split
-       * either.
-       */
-      return NULL;
-   case ir_var_auto:
-   case ir_var_temporary:
-      break;
-   }
-
-   struct hash_entry *hte = _mesa_hash_table_search(ht, var);
-   if (hte)
-      return (struct variable_entry *) hte->data;
-
-   variable_entry *entry = new(mem_ctx) variable_entry(var);
-   _mesa_hash_table_insert(ht, var, entry);
-   return entry;
-}
-
-
-ir_visitor_status
-ir_vector_reference_visitor::visit(ir_variable *ir)
-{
-   /* Make sure splitting looks at splitting this variable */
-   (void)this->get_variable_entry(ir);
-
-   return visit_continue;
-}
-
-ir_visitor_status
-ir_vector_reference_visitor::visit(ir_dereference_variable *ir)
-{
-   ir_variable *const var = ir->var;
-   variable_entry *entry = this->get_variable_entry(var);
-
-   if (entry)
-      entry->whole_vector_access++;
-
-   return visit_continue;
-}
-
-ir_visitor_status
-ir_vector_reference_visitor::visit_enter(ir_swizzle *ir)
-{
-   /* Don't descend into a vector ir_dereference_variable below. */
-   if (ir->val->as_dereference_variable() && ir->type->is_scalar())
-      return visit_continue_with_parent;
-
-   return visit_continue;
-}
-
-ir_visitor_status
-ir_vector_reference_visitor::visit_enter(ir_assignment *ir)
-{
-   if (ir->lhs->as_dereference_variable() &&
-       ir->rhs->as_dereference_variable() &&
-       !ir->condition) {
-      /* We'll split copies of a vector to copies of channels, so don't
-       * descend to the ir_dereference_variables.
-       */
-      return visit_continue_with_parent;
-   }
-   if (ir->lhs->as_dereference_variable() &&
-       _mesa_is_pow_two(ir->write_mask) &&
-       !ir->condition) {
-      /* If we're writing just a channel, then channel-splitting the LHS is OK.
-       */
-      ir->rhs->accept(this);
-      return visit_continue_with_parent;
-   }
-   return visit_continue;
-}
-
-ir_visitor_status
-ir_vector_reference_visitor::visit_enter(ir_function_signature *ir)
-{
-   /* We don't want to descend into the function parameters and
-    * split them, so just accept the body here.
-    */
-   visit_list_elements(this, &ir->body);
-   return visit_continue_with_parent;
-}
-
-class ir_vector_splitting_visitor : public ir_rvalue_visitor {
-public:
-   ir_vector_splitting_visitor(struct hash_table *vars)
-   {
-      this->ht = vars;
-   }
-
-   virtual ir_visitor_status visit_leave(ir_assignment *);
-
-   void handle_rvalue(ir_rvalue **rvalue);
-   variable_entry *get_splitting_entry(ir_variable *var);
-
-   struct hash_table *ht;
-};
-
-variable_entry *
-ir_vector_splitting_visitor::get_splitting_entry(ir_variable *var)
-{
-   assert(var);
-
-   if (!var->type->is_vector())
-      return NULL;
-
-   struct hash_entry *hte = _mesa_hash_table_search(ht, var);
-   return hte ? (struct variable_entry *) hte->data : NULL;
-}
-
-void
-ir_vector_splitting_visitor::handle_rvalue(ir_rvalue **rvalue)
-{
-   if (!*rvalue)
-      return;
-
-   ir_swizzle *swiz = (*rvalue)->as_swizzle();
-   if (!swiz || !swiz->type->is_scalar())
-      return;
-
-   ir_dereference_variable *deref_var = swiz->val->as_dereference_variable();
-   if (!deref_var)
-      return;
-
-   variable_entry *entry = get_splitting_entry(deref_var->var);
-   if (!entry)
-      return;
-
-   ir_variable *var = entry->components[swiz->mask.x];
-   *rvalue = new(entry->mem_ctx) ir_dereference_variable(var);
-}
-
-ir_visitor_status
-ir_vector_splitting_visitor::visit_leave(ir_assignment *ir)
-{
-   ir_dereference_variable *lhs_deref = ir->lhs->as_dereference_variable();
-   ir_dereference_variable *rhs_deref = ir->rhs->as_dereference_variable();
-   variable_entry *lhs = lhs_deref ? get_splitting_entry(lhs_deref->var) : NULL;
-   variable_entry *rhs = rhs_deref ? get_splitting_entry(rhs_deref->var) : NULL;
-
-   if (lhs_deref && rhs_deref && (lhs || rhs) && !ir->condition) {
-      unsigned int rhs_chan = 0;
-
-      /* Straight assignment of vector variables. */
-      for (unsigned int i = 0; i < ir->lhs->type->vector_elements; i++) {
-	 ir_dereference *new_lhs;
-	 ir_rvalue *new_rhs;
-	 void *mem_ctx = lhs ? lhs->mem_ctx : rhs->mem_ctx;
-	 unsigned int writemask;
-
-	 if (!(ir->write_mask & (1 << i)))
-	    continue;
-
-	 if (lhs) {
-	    new_lhs = new(mem_ctx) ir_dereference_variable(lhs->components[i]);
-	    writemask = 1;
-	 } else {
-	    new_lhs = ir->lhs->clone(mem_ctx, NULL);
-	    writemask = 1 << i;
-	 }
-
-	 if (rhs) {
-	    new_rhs =
-	       new(mem_ctx) ir_dereference_variable(rhs->components[rhs_chan]);
-	 } else {
-	    new_rhs = new(mem_ctx) ir_swizzle(ir->rhs->clone(mem_ctx, NULL),
-					      rhs_chan, 0, 0, 0, 1);
-	 }
-
-	 ir->insert_before(new(mem_ctx) ir_assignment(new_lhs,
-						      new_rhs,
-						      NULL, writemask));
-
-	 rhs_chan++;
-      }
-      ir->remove();
-   } else if (lhs) {
-      void *mem_ctx = lhs->mem_ctx;
-      int elem = -1;
-
-      switch (ir->write_mask) {
-      case (1 << 0):
-	 elem = 0;
-	 break;
-      case (1 << 1):
-	 elem = 1;
-	 break;
-      case (1 << 2):
-	 elem = 2;
-	 break;
-      case (1 << 3):
-	 elem = 3;
-	 break;
-      default:
-	 ir->fprint(stderr);
-	 unreachable("not reached: non-channelwise dereference of LHS.");
-      }
-
-      ir->lhs = new(mem_ctx) ir_dereference_variable(lhs->components[elem]);
-      ir->write_mask = (1 << 0);
-
-      handle_rvalue(&ir->rhs);
-   } else {
-      handle_rvalue(&ir->rhs);
-   }
-
-   handle_rvalue(&ir->condition);
-
-   return visit_continue;
-}
-
-bool
-brw_do_vector_splitting(exec_list *instructions)
-{
-   struct hash_entry *hte;
-
-   ir_vector_reference_visitor refs;
-
-   visit_list_elements(&refs, instructions);
-
-   /* Trim out variables we can't split. */
-   hash_table_foreach(refs.ht, hte) {
-      struct variable_entry *entry = (struct variable_entry *) hte->data;
-      if (debug) {
-	 fprintf(stderr, "vector %s@%p: whole_access %d\n",
-                 entry->var->name, (void *) entry->var,
-                 entry->whole_vector_access);
-      }
-
-      if (entry->whole_vector_access) {
-         _mesa_hash_table_remove(refs.ht, hte);
-      }
-   }
-
-   if (refs.ht->entries == 0)
-      return false;
-
-   void *mem_ctx = ralloc_context(NULL);
-
-   /* Replace the decls of the vectors to be split with their split
-    * components.
-    */
-   hash_table_foreach(refs.ht, hte) {
-      struct variable_entry *entry = (struct variable_entry *) hte->data;
-      const struct glsl_type *type;
-      type = glsl_type::get_instance(entry->var->type->base_type, 1, 1);
-
-      entry->mem_ctx = ralloc_parent(entry->var);
-
-      for (unsigned int i = 0; i < entry->var->type->vector_elements; i++) {
-         char *const name = ir_variable::temporaries_allocate_names
-            ? ralloc_asprintf(mem_ctx, "%s_%c",
-                              entry->var->name,
-                              "xyzw"[i])
-            : NULL;
-
-	 entry->components[i] = new(entry->mem_ctx) ir_variable(type, name,
-								ir_var_temporary);
-
-         ralloc_free(name);
-
-         if (entry->var->constant_initializer) {
-            ir_constant_data data = {0};
-            assert(entry->var->data.has_initializer);
-            if (entry->var->type->is_double()) {
-               data.d[0] = entry->var->constant_initializer->value.d[i];
-            } else {
-               data.u[0] = entry->var->constant_initializer->value.u[i];
-            }
-            entry->components[i]->data.has_initializer = true;
-            entry->components[i]->constant_initializer = new(entry->components[i]) ir_constant(type, &data);
-         }
-
-	 entry->var->insert_before(entry->components[i]);
-      }
-
-      entry->var->remove();
-   }
-
-   ir_vector_splitting_visitor split(refs.ht);
-   visit_list_elements(&split, instructions);
-
-   ralloc_free(mem_ctx);
-
-   return true;
-}
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 0c04ef0..bd8f993 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -112,6 +112,8 @@
    brw_nir_setup_glsl_uniforms(gp->program.nir, &gp->program,
                                &prog_data.base.base,
                                compiler->scalar_stage[MESA_SHADER_GEOMETRY]);
+   brw_nir_analyze_ubo_ranges(compiler, gp->program.nir,
+                              prog_data.base.base.ubo_ranges);
 
    uint64_t outputs_written = gp->program.info.outputs_written;
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
deleted file mode 100644
index ed9ae44..0000000
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
-
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-
-static void
-brw_upload_gs_unit(struct brw_context *brw)
-{
-   struct brw_gs_unit_state *gs;
-
-   gs = brw_state_batch(brw, sizeof(*gs), 32, &brw->ff_gs.state_offset);
-
-   memset(gs, 0, sizeof(*gs));
-
-   /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_GS_PROG_DATA */
-   if (brw->ff_gs.prog_active) {
-      gs->thread0.grf_reg_count = (ALIGN(brw->ff_gs.prog_data->total_grf, 16) /
-				   16 - 1);
-
-      gs->thread0.kernel_start_pointer =
-	 brw_program_reloc(brw,
-			   brw->ff_gs.state_offset +
-			   offsetof(struct brw_gs_unit_state, thread0),
-			   brw->ff_gs.prog_offset +
-			   (gs->thread0.grf_reg_count << 1)) >> 6;
-
-      gs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-      gs->thread1.single_program_flow = 1;
-
-      gs->thread3.dispatch_grf_start_reg = 1;
-      gs->thread3.const_urb_entry_read_offset = 0;
-      gs->thread3.const_urb_entry_read_length = 0;
-      gs->thread3.urb_entry_read_offset = 0;
-      gs->thread3.urb_entry_read_length =
-         brw->ff_gs.prog_data->urb_read_length;
-
-      /* BRW_NEW_URB_FENCE */
-      gs->thread4.nr_urb_entries = brw->urb.nr_gs_entries;
-      gs->thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
-
-      if (brw->urb.nr_gs_entries >= 8)
-	 gs->thread4.max_threads = 1;
-      else
-	 gs->thread4.max_threads = 0;
-   }
-
-   if (brw->gen == 5)
-      gs->thread4.rendering_enable = 1;
-
-   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
-      gs->thread4.stats_enable = 1;
-
-   /* BRW_NEW_VIEWPORT_COUNT */
-   gs->gs6.max_vp_index = brw->clip.viewport_count - 1;
-
-   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
-}
-
-const struct brw_tracked_state brw_gs_unit = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CURBE_OFFSETS |
-               BRW_NEW_FF_GS_PROG_DATA |
-               BRW_NEW_PROGRAM_CACHE |
-               BRW_NEW_URB_FENCE |
-               BRW_NEW_VIEWPORT_COUNT,
-   },
-   .emit = brw_upload_gs_unit,
-};
diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
index d5a106d..bcf39a8 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
@@ -133,6 +133,7 @@
       .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
+             BRW_NEW_FAST_CLEAR_COLOR |
              BRW_NEW_GEOMETRY_PROGRAM |
              BRW_NEW_GS_PROG_DATA |
              BRW_NEW_IMAGE_UNITS,
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index 7c10a40..e9158c5 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -28,6 +28,7 @@
 #include "compiler/glsl/ir_optimization.h"
 #include "compiler/glsl/program.h"
 #include "program/program.h"
+#include "main/mtypes.h"
 #include "main/shaderapi.h"
 #include "main/shaderobj.h"
 #include "main/uniforms.h"
@@ -88,9 +89,6 @@
                 struct gl_linked_shader *shader)
 {
    struct gl_context *ctx = &brw->ctx;
-   const struct brw_compiler *compiler = brw->screen->compiler;
-   const struct gl_shader_compiler_options *options =
-      &ctx->Const.ShaderCompilerOptions[shader->Stage];
 
    /* Temporary memory context for any new IR. */
    void *mem_ctx = ralloc_context(NULL);
@@ -132,21 +130,6 @@
    lower_noise(shader->ir);
    lower_quadop_vector(shader->ir, false);
 
-   bool progress;
-   do {
-      progress = false;
-
-      if (compiler->scalar_stage[shader->Stage]) {
-         if (shader->Stage == MESA_SHADER_VERTEX ||
-             shader->Stage == MESA_SHADER_FRAGMENT)
-            brw_do_channel_expressions(shader->ir);
-         brw_do_vector_splitting(shader->ir);
-      }
-
-      progress = do_common_optimization(shader->ir, true, true,
-                                        options, ctx->Const.NativeIntegers) || progress;
-   } while (progress);
-
    validate_ir_tree(shader->ir);
 
    /* Now that we've finished altering the linked IR, reparent any live IR back
@@ -194,6 +177,39 @@
    }
 }
 
+static void
+update_xfb_info(struct gl_transform_feedback_info *xfb_info)
+{
+   if (!xfb_info)
+      return;
+
+   for (unsigned i = 0; i < xfb_info->NumOutputs; i++) {
+      struct gl_transform_feedback_output *output = &xfb_info->Outputs[i];
+
+      /* The VUE header contains three scalar fields packed together:
+       * - gl_PointSize is stored in VARYING_SLOT_PSIZ.w
+       * - gl_Layer is stored in VARYING_SLOT_PSIZ.y
+       * - gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
+       */
+      switch (output->OutputRegister) {
+      case VARYING_SLOT_LAYER:
+         assert(output->NumComponents == 1);
+         output->OutputRegister = VARYING_SLOT_PSIZ;
+         output->ComponentOffset = 1;
+         break;
+      case VARYING_SLOT_VIEWPORT:
+         assert(output->NumComponents == 1);
+         output->OutputRegister = VARYING_SLOT_PSIZ;
+         output->ComponentOffset = 2;
+         break;
+      case VARYING_SLOT_PSIZ:
+         assert(output->NumComponents == 1);
+         output->ComponentOffset = 3;
+         break;
+      }
+   }
+}
+
 extern "C" GLboolean
 brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
 {
@@ -217,6 +233,8 @@
       prog->ShadowSamplers = shader->shadow_samplers;
       _mesa_update_shader_textures_used(shProg, prog);
 
+      update_xfb_info(prog->sh.LinkedTransformFeedback);
+
       bool debug_enabled =
          (INTEL_DEBUG & intel_debug_flag_for_shader_stage(shader->Stage));
 
@@ -229,7 +247,7 @@
 
       prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage,
                                  compiler->scalar_stage[stage]);
-      infos[stage] = prog->nir->info;
+      infos[stage] = &prog->nir->info;
 
       /* Make a pass over the IR to add state references for any built-in
        * uniforms that are used.  This has to be done now (during linking).
diff --git a/src/mesa/drivers/dri/i965/brw_meta_util.c b/src/mesa/drivers/dri/i965/brw_meta_util.c
index 1b47c60..7ce1fd1 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_util.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_util.c
@@ -289,18 +289,7 @@
     */
    if (brw->gen >= 9 &&
        brw_isl_format_for_mesa_format(mt->format) !=
-       brw->render_target_format[mt->format])
-      return false;
-
-   /* Gen9 doesn't support fast clear on single-sampled SRGB buffers. When
-    * GL_FRAMEBUFFER_SRGB is enabled any color renderbuffers will be
-    * resolved in intel_update_state. In that case it's pointless to do a
-    * fast clear because it's very likely to be immediately resolved.
-    */
-   if (brw->gen >= 9 &&
-       mt->num_samples <= 1 &&
-       ctx->Color.sRGBEnabled &&
-       _mesa_get_srgb_format_linear(mt->format) != mt->format)
+       brw->mesa_to_isl_render_format[mt->format])
       return false;
 
    const mesa_format format = _mesa_get_render_format(ctx, mt->format);
@@ -329,12 +318,19 @@
  * Convert the given color to a bitfield suitable for ORing into DWORD 7 of
  * SURFACE_STATE (DWORD 12-15 on SKL+).
  */
-union gl_color_union
+union isl_color_value
 brw_meta_convert_fast_clear_color(const struct brw_context *brw,
                                   const struct intel_mipmap_tree *mt,
                                   const union gl_color_union *color)
 {
-   union gl_color_union override_color = *color;
+   union isl_color_value override_color = {
+      .u32 = {
+         color->ui[0],
+         color->ui[1],
+         color->ui[2],
+         color->ui[3],
+      },
+   };
 
    /* The sampler doesn't look at the format of the surface when the fast
     * clear color is used so we need to implement luminance, intensity and
@@ -342,17 +338,17 @@
     */
    switch (_mesa_get_format_base_format(mt->format)) {
    case GL_INTENSITY:
-      override_color.ui[3] = override_color.ui[0];
+      override_color.u32[3] = override_color.u32[0];
       /* flow through */
    case GL_LUMINANCE:
    case GL_LUMINANCE_ALPHA:
-      override_color.ui[1] = override_color.ui[0];
-      override_color.ui[2] = override_color.ui[0];
+      override_color.u32[1] = override_color.u32[0];
+      override_color.u32[2] = override_color.u32[0];
       break;
    default:
       for (int i = 0; i < 3; i++) {
          if (!_mesa_format_has_color_component(mt->format, i))
-            override_color.ui[i] = 0;
+            override_color.u32[i] = 0;
       }
       break;
    }
@@ -360,12 +356,12 @@
    switch (_mesa_get_format_datatype(mt->format)) {
    case GL_UNSIGNED_NORMALIZED:
       for (int i = 0; i < 4; i++)
-         override_color.f[i] = CLAMP(override_color.f[i], 0.0f, 1.0f);
+         override_color.f32[i] = CLAMP(override_color.f32[i], 0.0f, 1.0f);
       break;
 
    case GL_SIGNED_NORMALIZED:
       for (int i = 0; i < 4; i++)
-         override_color.f[i] = CLAMP(override_color.f[i], -1.0f, 1.0f);
+         override_color.f32[i] = CLAMP(override_color.f32[i], -1.0f, 1.0f);
       break;
 
    case GL_UNSIGNED_INT:
@@ -373,7 +369,7 @@
          unsigned bits = _mesa_get_format_bits(mt->format, GL_RED_BITS + i);
          if (bits < 32) {
             uint32_t max = (1u << bits) - 1;
-            override_color.ui[i] = MIN2(override_color.ui[i], max);
+            override_color.u32[i] = MIN2(override_color.u32[i], max);
          }
       }
       break;
@@ -384,7 +380,7 @@
          if (bits < 32) {
             int32_t max = (1 << (bits - 1)) - 1;
             int32_t min = -(1 << (bits - 1));
-            override_color.i[i] = CLAMP(override_color.i[i], min, max);
+            override_color.i32[i] = CLAMP(override_color.i32[i], min, max);
          }
       }
       break;
@@ -392,55 +388,26 @@
    case GL_FLOAT:
       if (!_mesa_is_format_signed(mt->format)) {
          for (int i = 0; i < 4; i++)
-            override_color.f[i] = MAX2(override_color.f[i], 0.0f);
+            override_color.f32[i] = MAX2(override_color.f32[i], 0.0f);
       }
       break;
    }
 
    if (!_mesa_format_has_color_component(mt->format, 3)) {
       if (_mesa_is_format_integer_color(mt->format))
-         override_color.ui[3] = 1;
+         override_color.u32[3] = 1;
       else
-         override_color.f[3] = 1.0f;
+         override_color.f32[3] = 1.0f;
    }
 
    /* Handle linear to SRGB conversion */
    if (brw->ctx.Color.sRGBEnabled &&
        _mesa_get_srgb_format_linear(mt->format) != mt->format) {
       for (int i = 0; i < 3; i++) {
-         override_color.f[i] =
-            util_format_linear_to_srgb_float(override_color.f[i]);
+         override_color.f32[i] =
+            util_format_linear_to_srgb_float(override_color.f32[i]);
       }
    }
 
    return override_color;
 }
-
-/* Returned boolean tells if the given color differs from the current. */
-bool
-brw_meta_set_fast_clear_color(struct brw_context *brw,
-                              union gl_color_union *curr_color,
-                              const union gl_color_union *new_color)
-{
-   bool updated;
-
-   if (brw->gen >= 9) {
-      updated = memcmp(curr_color, new_color, sizeof(*curr_color));
-      *curr_color = *new_color;
-   } else {
-      const uint32_t old_color_value = *(uint32_t *)curr_color;
-      uint32_t adjusted = 0;
-
-      for (int i = 0; i < 4; i++) {
-         /* Testing for non-0 works for integer and float colors */
-         if (new_color->f[i] != 0.0f) {
-            adjusted |= 1 << (GEN7_SURFACE_CLEAR_COLOR_SHIFT + (3 - i));
-         }
-      }
-
-      updated = (old_color_value != adjusted);
-      *(uint32_t *)curr_color = adjusted;
-   }
-
-   return updated;
-}
diff --git a/src/mesa/drivers/dri/i965/brw_meta_util.h b/src/mesa/drivers/dri/i965/brw_meta_util.h
index 207a54b..4b3408d 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_util.h
+++ b/src/mesa/drivers/dri/i965/brw_meta_util.h
@@ -42,17 +42,12 @@
                                  GLfloat *dstX1, GLfloat *dstY1,
                                  bool *mirror_x, bool *mirror_y);
 
-union gl_color_union
+union isl_color_value
 brw_meta_convert_fast_clear_color(const struct brw_context *brw,
                                   const struct intel_mipmap_tree *mt,
                                   const union gl_color_union *color);
 
 bool
-brw_meta_set_fast_clear_color(struct brw_context *brw,
-                              union gl_color_union *curr_color,
-                              const union gl_color_union *new_color);
-
-bool
 brw_is_color_fast_clear_compatible(struct brw_context *brw,
                                    const struct intel_mipmap_tree *mt,
                                    const union gl_color_union *color);
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 61a0c4c..1e3be78 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -42,34 +42,9 @@
 
 #include "main/framebuffer.h"
 #include "main/fbobject.h"
+#include "main/format_utils.h"
 #include "main/glformats.h"
 
-/* Constant single cliprect for framebuffer object or DRI2 drawing */
-static void
-upload_drawing_rect(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   const struct gl_framebuffer *fb = ctx->DrawBuffer;
-   const unsigned int fb_width = _mesa_geometric_width(fb);
-   const unsigned int fb_height = _mesa_geometric_height(fb);
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
-   OUT_BATCH(0); /* xmin, ymin */
-   OUT_BATCH(((fb_width - 1) & 0xffff) | ((fb_height - 1) << 16));
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state brw_drawing_rect = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS,
-      .brw = BRW_NEW_BLORP |
-             BRW_NEW_CONTEXT,
-   },
-   .emit = upload_drawing_rect
-};
-
 /**
  * Upload pointers to the per-stage state.
  *
@@ -151,62 +126,6 @@
    return brw_depth_format(brw, drb->mt->format);
 }
 
-/**
- * Returns the mask of how many bits of x and y must be handled through the
- * depthbuffer's draw offset x and y fields.
- *
- * The draw offset x/y field of the depthbuffer packet is unfortunately shared
- * between the depth, hiz, and stencil buffers.  Because it can be hard to get
- * all 3 to agree on this value, we want to do as much drawing offset
- * adjustment as possible by moving the base offset of the 3 buffers, which is
- * restricted to tile boundaries.
- *
- * For each buffer, the remainder must be applied through the x/y draw offset.
- * This returns the worst-case mask of the low bits that have to go into the
- * packet.  If the 3 buffers don't agree on the drawing offset ANDed with this
- * mask, then we're in trouble.
- */
-static void
-brw_get_depthstencil_tile_masks(struct intel_mipmap_tree *depth_mt,
-                                uint32_t depth_level,
-                                uint32_t depth_layer,
-                                struct intel_mipmap_tree *stencil_mt,
-                                uint32_t *out_tile_mask_x,
-                                uint32_t *out_tile_mask_y)
-{
-   uint32_t tile_mask_x = 0, tile_mask_y = 0;
-
-   if (depth_mt) {
-      intel_get_tile_masks(depth_mt->tiling,
-                           depth_mt->cpp,
-                           &tile_mask_x, &tile_mask_y);
-      assert(!intel_miptree_level_has_hiz(depth_mt, depth_level));
-   }
-
-   if (stencil_mt) {
-      if (stencil_mt->stencil_mt)
-	 stencil_mt = stencil_mt->stencil_mt;
-
-      if (stencil_mt->format == MESA_FORMAT_S_UINT8) {
-         /* Separate stencil buffer uses 64x64 tiles. */
-         tile_mask_x |= 63;
-         tile_mask_y |= 63;
-      } else {
-         uint32_t stencil_tile_mask_x, stencil_tile_mask_y;
-         intel_get_tile_masks(stencil_mt->tiling,
-                              stencil_mt->cpp,
-                              &stencil_tile_mask_x,
-                              &stencil_tile_mask_y);
-
-         tile_mask_x |= stencil_tile_mask_x;
-         tile_mask_y |= stencil_tile_mask_y;
-      }
-   }
-
-   *out_tile_mask_x = tile_mask_x;
-   *out_tile_mask_y = tile_mask_y;
-}
-
 static struct intel_mipmap_tree *
 get_stencil_miptree(struct intel_renderbuffer *irb)
 {
@@ -214,7 +133,68 @@
       return NULL;
    if (irb->mt->stencil_mt)
       return irb->mt->stencil_mt;
-   return irb->mt;
+   return intel_renderbuffer_get_mt(irb);
+}
+
+static bool
+rebase_depth_stencil(struct brw_context *brw, struct intel_renderbuffer *irb,
+                     bool invalidate)
+{
+   struct gl_context *ctx = &brw->ctx;
+   uint32_t tile_mask_x = 0, tile_mask_y = 0;
+
+   intel_get_tile_masks(irb->mt->surf.tiling, irb->mt->cpp,
+                        &tile_mask_x, &tile_mask_y);
+   assert(!intel_miptree_level_has_hiz(irb->mt, irb->mt_level));
+
+   uint32_t tile_x = irb->draw_x & tile_mask_x;
+   uint32_t tile_y = irb->draw_y & tile_mask_y;
+
+   /* According to the Sandy Bridge PRM, volume 2 part 1, pp326-327
+    * (3DSTATE_DEPTH_BUFFER dw5), in the documentation for "Depth
+    * Coordinate Offset X/Y":
+    *
+    *   "The 3 LSBs of both offsets must be zero to ensure correct
+    *   alignment"
+    */
+   bool rebase = tile_x & 7 || tile_y & 7;
+
+   /* We didn't even have intra-tile offsets before g45. */
+   rebase |= (!brw->has_surface_tile_offset && (tile_x || tile_y));
+
+   if (rebase) {
+      perf_debug("HW workaround: blitting depth level %d to a temporary "
+                 "to fix alignment (depth tile offset %d,%d)\n",
+                 irb->mt_level, tile_x, tile_y);
+      intel_renderbuffer_move_to_temp(brw, irb, invalidate);
+
+      /* There is now only a single-slice miptree. */
+      brw->depthstencil.tile_x = 0;
+      brw->depthstencil.tile_y = 0;
+      brw->depthstencil.depth_offset = 0;
+      return true;
+   }
+
+   /* While we just tried to get everything aligned, we may have failed to do
+    * so in the case of rendering to array or 3D textures, where nonzero faces
+    * will still have an offset post-rebase.  At least give an informative
+    * warning.
+    */
+   WARN_ONCE((tile_x & 7) || (tile_y & 7),
+             "Depth/stencil buffer needs alignment to 8-pixel boundaries.\n"
+             "Truncating offset (%u:%u), bad rendering may occur.\n",
+             tile_x, tile_y);
+   tile_x &= ~7;
+   tile_y &= ~7;
+
+   brw->depthstencil.tile_x = tile_x;
+   brw->depthstencil.tile_y = tile_y;
+   brw->depthstencil.depth_offset = intel_miptree_get_aligned_offset(
+                                       irb->mt,
+                                       irb->draw_x & ~tile_mask_x,
+                                       irb->draw_y & ~tile_mask_y);
+
+   return false;
 }
 
 void
@@ -223,14 +203,9 @@
 {
    struct gl_context *ctx = &brw->ctx;
    struct gl_framebuffer *fb = ctx->DrawBuffer;
-   bool rebase_depth = false;
-   bool rebase_stencil = false;
    struct intel_renderbuffer *depth_irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
    struct intel_renderbuffer *stencil_irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
    struct intel_mipmap_tree *depth_mt = NULL;
-   struct intel_mipmap_tree *stencil_mt = get_stencil_miptree(stencil_irb);
-   uint32_t tile_x = 0, tile_y = 0, stencil_tile_x = 0, stencil_tile_y = 0;
-   uint32_t stencil_draw_x = 0, stencil_draw_y = 0;
    bool invalidate_depth = clear_mask & BUFFER_BIT_DEPTH;
    bool invalidate_stencil = clear_mask & BUFFER_BIT_STENCIL;
 
@@ -242,14 +217,6 @@
    brw->depthstencil.tile_x = 0;
    brw->depthstencil.tile_y = 0;
    brw->depthstencil.depth_offset = 0;
-   brw->depthstencil.stencil_offset = 0;
-   brw->depthstencil.hiz_offset = 0;
-   brw->depthstencil.depth_mt = NULL;
-   brw->depthstencil.stencil_mt = NULL;
-   if (depth_irb)
-      brw->depthstencil.depth_mt = depth_mt;
-   if (stencil_irb)
-      brw->depthstencil.stencil_mt = get_stencil_miptree(stencil_irb);
 
    /* Gen6+ doesn't require the workarounds, since we always program the
     * surface state at the start of the whole surface.
@@ -258,54 +225,14 @@
       return;
 
    /* Check if depth buffer is in depth/stencil format.  If so, then it's only
-    * safe to invalidate it if we're also clearing stencil, and both depth_irb
-    * and stencil_irb point to the same miptree.
-    *
-    * Note: it's not sufficient to check for the case where
-    * _mesa_get_format_base_format(depth_mt->format) == GL_DEPTH_STENCIL,
-    * because this fails to catch depth/stencil buffers on hardware that uses
-    * separate stencil.  To catch that case, we check whether
-    * depth_mt->stencil_mt is non-NULL.
+    * safe to invalidate it if we're also clearing stencil.
     */
    if (depth_irb && invalidate_depth &&
-       (_mesa_get_format_base_format(depth_mt->format) == GL_DEPTH_STENCIL ||
-        depth_mt->stencil_mt)) {
-      invalidate_depth = invalidate_stencil && depth_irb && stencil_irb
-         && depth_irb->mt == stencil_irb->mt;
-   }
-
-   uint32_t tile_mask_x, tile_mask_y;
-   brw_get_depthstencil_tile_masks(depth_mt,
-                                   depth_mt ? depth_irb->mt_level : 0,
-                                   depth_mt ? depth_irb->mt_layer : 0,
-                                   stencil_mt,
-                                   &tile_mask_x, &tile_mask_y);
+      _mesa_get_format_base_format(depth_mt->format) == GL_DEPTH_STENCIL)
+      invalidate_depth = invalidate_stencil && stencil_irb;
 
    if (depth_irb) {
-      tile_x = depth_irb->draw_x & tile_mask_x;
-      tile_y = depth_irb->draw_y & tile_mask_y;
-
-      /* According to the Sandy Bridge PRM, volume 2 part 1, pp326-327
-       * (3DSTATE_DEPTH_BUFFER dw5), in the documentation for "Depth
-       * Coordinate Offset X/Y":
-       *
-       *   "The 3 LSBs of both offsets must be zero to ensure correct
-       *   alignment"
-       */
-      if (tile_x & 7 || tile_y & 7)
-         rebase_depth = true;
-
-      /* We didn't even have intra-tile offsets before g45. */
-      if (!brw->has_surface_tile_offset) {
-         if (tile_x || tile_y)
-            rebase_depth = true;
-      }
-
-      if (rebase_depth) {
-         perf_debug("HW workaround: blitting depth level %d to a temporary "
-                    "to fix alignment (depth tile offset %d,%d)\n",
-                    depth_irb->mt_level, tile_x, tile_y);
-         intel_renderbuffer_move_to_temp(brw, depth_irb, invalidate_depth);
+      if (rebase_depth_stencil(brw, depth_irb, invalidate_depth)) {
          /* In the case of stencil_irb being the same packed depth/stencil
           * texture but not the same rb, make it point at our rebased mt, too.
           */
@@ -315,156 +242,18 @@
             intel_miptree_reference(&stencil_irb->mt, depth_irb->mt);
             intel_renderbuffer_set_draw_offset(stencil_irb);
          }
-
-         stencil_mt = get_stencil_miptree(stencil_irb);
-
-         tile_x = depth_irb->draw_x & tile_mask_x;
-         tile_y = depth_irb->draw_y & tile_mask_y;
       }
 
       if (stencil_irb) {
-         stencil_mt = get_stencil_miptree(stencil_irb);
-         intel_miptree_get_image_offset(stencil_mt,
-                                        stencil_irb->mt_level,
-                                        stencil_irb->mt_layer,
-                                        &stencil_draw_x, &stencil_draw_y);
-         int stencil_tile_x = stencil_draw_x & tile_mask_x;
-         int stencil_tile_y = stencil_draw_y & tile_mask_y;
-
-         /* If stencil doesn't match depth, then we'll need to rebase stencil
-          * as well.  (if we hadn't decided to rebase stencil before, the
-          * post-stencil depth test will also rebase depth to try to match it
-          * up).
-          */
-         if (tile_x != stencil_tile_x ||
-             tile_y != stencil_tile_y) {
-            rebase_stencil = true;
-         }
+         assert(stencil_irb->mt == depth_irb->mt);
+         assert(stencil_irb->mt_level == depth_irb->mt_level);
+         assert(stencil_irb->mt_layer == depth_irb->mt_layer);
       }
    }
 
-   /* If we have (just) stencil, check it for ignored low bits as well */
-   if (stencil_irb) {
-      intel_miptree_get_image_offset(stencil_mt,
-                                     stencil_irb->mt_level,
-                                     stencil_irb->mt_layer,
-                                     &stencil_draw_x, &stencil_draw_y);
-      stencil_tile_x = stencil_draw_x & tile_mask_x;
-      stencil_tile_y = stencil_draw_y & tile_mask_y;
-
-      if (stencil_tile_x & 7 || stencil_tile_y & 7)
-         rebase_stencil = true;
-
-      if (!brw->has_surface_tile_offset) {
-         if (stencil_tile_x || stencil_tile_y)
-            rebase_stencil = true;
-      }
-   }
-
-   if (rebase_stencil) {
-      perf_debug("HW workaround: blitting stencil level %d to a temporary "
-                 "to fix alignment (stencil tile offset %d,%d)\n",
-                 stencil_irb->mt_level, stencil_tile_x, stencil_tile_y);
-
-      intel_renderbuffer_move_to_temp(brw, stencil_irb, invalidate_stencil);
-      stencil_mt = get_stencil_miptree(stencil_irb);
-
-      intel_miptree_get_image_offset(stencil_mt,
-                                     stencil_irb->mt_level,
-                                     stencil_irb->mt_layer,
-                                     &stencil_draw_x, &stencil_draw_y);
-      stencil_tile_x = stencil_draw_x & tile_mask_x;
-      stencil_tile_y = stencil_draw_y & tile_mask_y;
-
-      if (depth_irb && depth_irb->mt == stencil_irb->mt) {
-         intel_miptree_reference(&depth_irb->mt, stencil_irb->mt);
-         intel_renderbuffer_set_draw_offset(depth_irb);
-      } else if (depth_irb && !rebase_depth) {
-         if (tile_x != stencil_tile_x ||
-             tile_y != stencil_tile_y) {
-            perf_debug("HW workaround: blitting depth level %d to a temporary "
-                       "to match stencil level %d alignment (depth tile offset "
-                       "%d,%d, stencil offset %d,%d)\n",
-                       depth_irb->mt_level,
-                       stencil_irb->mt_level,
-                       tile_x, tile_y,
-                       stencil_tile_x, stencil_tile_y);
-
-            intel_renderbuffer_move_to_temp(brw, depth_irb, invalidate_depth);
-
-            tile_x = depth_irb->draw_x & tile_mask_x;
-            tile_y = depth_irb->draw_y & tile_mask_y;
-
-            if (stencil_irb && stencil_irb->mt == depth_mt) {
-               intel_miptree_reference(&stencil_irb->mt, depth_irb->mt);
-               intel_renderbuffer_set_draw_offset(stencil_irb);
-            }
-
-            WARN_ONCE(stencil_tile_x != tile_x ||
-                      stencil_tile_y != tile_y,
-                      "Rebased stencil tile offset (%d,%d) doesn't match depth "
-                      "tile offset (%d,%d).\n",
-                      stencil_tile_x, stencil_tile_y,
-                      tile_x, tile_y);
-         }
-      }
-   }
-
-   if (!depth_irb) {
-      tile_x = stencil_tile_x;
-      tile_y = stencil_tile_y;
-   }
-
-   /* While we just tried to get everything aligned, we may have failed to do
-    * so in the case of rendering to array or 3D textures, where nonzero faces
-    * will still have an offset post-rebase.  At least give an informative
-    * warning.
-    */
-   WARN_ONCE((tile_x & 7) || (tile_y & 7),
-             "Depth/stencil buffer needs alignment to 8-pixel boundaries.\n"
-             "Truncating offset, bad rendering may occur.\n");
-   tile_x &= ~7;
-   tile_y &= ~7;
-
-   /* Now, after rebasing, save off the new dephtstencil state so the hardware
-    * packets can just dereference that without re-calculating tile offsets.
-    */
-   brw->depthstencil.tile_x = tile_x;
-   brw->depthstencil.tile_y = tile_y;
-   if (depth_irb) {
-      depth_mt = depth_irb->mt;
-      brw->depthstencil.depth_mt = depth_mt;
-      brw->depthstencil.depth_offset =
-         intel_miptree_get_aligned_offset(depth_mt,
-                                          depth_irb->draw_x & ~tile_mask_x,
-                                          depth_irb->draw_y & ~tile_mask_y);
-      if (intel_renderbuffer_has_hiz(depth_irb)) {
-         brw->depthstencil.hiz_offset =
-            intel_miptree_get_aligned_offset(depth_mt,
-                                             depth_irb->draw_x & ~tile_mask_x,
-                                             (depth_irb->draw_y & ~tile_mask_y) / 2);
-      }
-   }
-   if (stencil_irb) {
-      stencil_mt = get_stencil_miptree(stencil_irb);
-
-      brw->depthstencil.stencil_mt = stencil_mt;
-      if (stencil_mt->format == MESA_FORMAT_S_UINT8) {
-         /* Note: we can't compute the stencil offset using
-          * intel_region_get_aligned_offset(), because stencil_region claims
-          * that the region is untiled even though it's W tiled.
-          */
-         brw->depthstencil.stencil_offset =
-            (stencil_draw_y & ~tile_mask_y) * stencil_mt->pitch +
-            (stencil_draw_x & ~tile_mask_x) * 64;
-      } else if (!depth_irb) {
-         brw->depthstencil.depth_offset =
-            intel_miptree_get_aligned_offset(
-               stencil_mt,
-               stencil_irb->draw_x & ~tile_mask_x,
-               stencil_irb->draw_y & ~tile_mask_y);
-      }
-   }
+   /* If there is no depth attachment, consider if stencil needs rebase. */
+   if (!depth_irb && stencil_irb)
+       rebase_depth_stencil(brw, stencil_irb, invalidate_stencil);
 }
 
 void
@@ -475,8 +264,8 @@
    /* _NEW_BUFFERS */
    struct intel_renderbuffer *depth_irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
    struct intel_renderbuffer *stencil_irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
-   struct intel_mipmap_tree *depth_mt = brw->depthstencil.depth_mt;
-   struct intel_mipmap_tree *stencil_mt = brw->depthstencil.stencil_mt;
+   struct intel_mipmap_tree *depth_mt = intel_renderbuffer_get_mt(depth_irb);
+   struct intel_mipmap_tree *stencil_mt = get_stencil_miptree(stencil_irb);
    uint32_t tile_x = brw->depthstencil.tile_x;
    uint32_t tile_y = brw->depthstencil.tile_y;
    bool hiz = depth_irb && intel_renderbuffer_has_hiz(depth_irb);
@@ -517,8 +306,8 @@
       /* Prior to Gen7, if using separate stencil, hiz must be enabled. */
       assert(brw->gen >= 7 || !separate_stencil || hiz);
 
-      assert(brw->gen < 6 || depth_mt->tiling == I915_TILING_Y);
-      assert(!hiz || depth_mt->tiling == I915_TILING_Y);
+      assert(brw->gen < 6 || depth_mt->surf.tiling == ISL_TILING_Y0);
+      assert(!hiz || depth_mt->surf.tiling == ISL_TILING_Y0);
 
       depthbuffer_format = brw_depthbuffer_format(brw);
       depth_surface_type = BRW_SURFACE_2D;
@@ -555,6 +344,21 @@
                                     width, height, tile_x, tile_y);
 }
 
+uint32_t
+brw_convert_depth_value(mesa_format format, float value)
+{ /* NOTE(review): UNORM cases assume value is within [0, 1] - confirm */
+   switch (format) { /* encode 'value' the way each depth format stores it */
+   case MESA_FORMAT_Z_FLOAT32:
+      return float_as_int(value); /* presumably raw float bits; confirm */
+   case MESA_FORMAT_Z_UNORM16:
+      return value * ((1u << 16) - 1); /* scaled by 0xffff, truncated */
+   case MESA_FORMAT_Z24_UNORM_X8_UINT:
+      return value * ((1u << 24) - 1); /* scaled by 0xffffff, truncated */
+   default: /* only the three formats above are handled */
+      unreachable("Invalid depth format");
+   }
+}
+
 void
 brw_emit_depth_stencil_hiz(struct brw_context *brw,
                            struct intel_mipmap_tree *depth_mt,
@@ -565,42 +369,21 @@
                            uint32_t width, uint32_t height,
                            uint32_t tile_x, uint32_t tile_y)
 {
-   /* Enable the hiz bit if we're doing separate stencil, because it and the
-    * separate stencil bit must have the same value. From Section 2.11.5.6.1.1
-    * 3DSTATE_DEPTH_BUFFER, Bit 1.21 "Separate Stencil Enable":
-    *     [DevIL]: If this field is enabled, Hierarchical Depth Buffer
-    *     Enable must also be enabled.
-    *
-    *     [DevGT]: This field must be set to the same value (enabled or
-    *     disabled) as Hierarchical Depth Buffer Enable
-    */
-   bool enable_hiz_ss = hiz || separate_stencil;
+   (void)hiz;
+   (void)separate_stencil;
+   (void)stencil_mt;
 
+   assert(!hiz);
+   assert(!separate_stencil);
 
-   /* 3DSTATE_DEPTH_BUFFER, 3DSTATE_STENCIL_BUFFER are both
-    * non-pipelined state that will need the PIPE_CONTROL workaround.
-    */
-   if (brw->gen == 6) {
-      brw_emit_depth_stall_flushes(brw);
-   }
-
-   unsigned int len;
-   if (brw->gen >= 6)
-      len = 7;
-   else if (brw->is_g4x || brw->gen == 5)
-      len = 6;
-   else
-      len = 5;
+   const unsigned len = (brw->is_g4x || brw->gen == 5) ? 6 : 5;
 
    BEGIN_BATCH(len);
    OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2));
-   OUT_BATCH((depth_mt ? depth_mt->pitch - 1 : 0) |
+   OUT_BATCH((depth_mt ? depth_mt->surf.row_pitch - 1 : 0) |
              (depthbuffer_format << 18) |
-             ((enable_hiz_ss ? 1 : 0) << 21) | /* separate stencil enable */
-             ((enable_hiz_ss ? 1 : 0) << 22) | /* hiz enable */
              (BRW_TILEWALK_YMAJOR << 26) |
-             ((depth_mt ? depth_mt->tiling != I915_TILING_NONE : 1)
-              << 27) |
+             (1 << 27) |
              (depth_surface_type << 29));
 
    if (depth_mt) {
@@ -624,73 +407,6 @@
       OUT_BATCH(0);
 
    ADVANCE_BATCH();
-
-   if (hiz || separate_stencil) {
-      /*
-       * In the 3DSTATE_DEPTH_BUFFER batch emitted above, the 'separate
-       * stencil enable' and 'hiz enable' bits were set. Therefore we must
-       * emit 3DSTATE_HIER_DEPTH_BUFFER and 3DSTATE_STENCIL_BUFFER. Even if
-       * there is no stencil buffer, 3DSTATE_STENCIL_BUFFER must be emitted;
-       * failure to do so causes hangs on gen5 and a stall on gen6.
-       */
-
-      /* Emit hiz buffer. */
-      if (hiz) {
-         assert(depth_mt);
-	 BEGIN_BATCH(3);
-	 OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
-	 OUT_BATCH(depth_mt->hiz_buf->aux_base.pitch - 1);
-	 OUT_RELOC(depth_mt->hiz_buf->aux_base.bo,
-		   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		   brw->depthstencil.hiz_offset);
-	 ADVANCE_BATCH();
-      } else {
-	 BEGIN_BATCH(3);
-	 OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
-	 OUT_BATCH(0);
-	 OUT_BATCH(0);
-	 ADVANCE_BATCH();
-      }
-
-      /* Emit stencil buffer. */
-      if (separate_stencil) {
-	 BEGIN_BATCH(3);
-	 OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
-         /* The stencil buffer has quirky pitch requirements.  From Vol 2a,
-          * 11.5.6.2.1 3DSTATE_STENCIL_BUFFER, field "Surface Pitch":
-          *    The pitch must be set to 2x the value computed based on width, as
-          *    the stencil buffer is stored with two rows interleaved.
-          */
-	 OUT_BATCH(2 * stencil_mt->pitch - 1);
-	 OUT_RELOC(stencil_mt->bo,
-		   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		   brw->depthstencil.stencil_offset);
-	 ADVANCE_BATCH();
-      } else {
-	 BEGIN_BATCH(3);
-	 OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
-	 OUT_BATCH(0);
-	 OUT_BATCH(0);
-	 ADVANCE_BATCH();
-      }
-   }
-
-   /*
-    * On Gen >= 6, emit clear params for safety. If using hiz, then clear
-    * params must be emitted.
-    *
-    * From Section 2.11.5.6.4.1 3DSTATE_CLEAR_PARAMS:
-    *     3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE packet
-    *     when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
-    */
-   if (brw->gen >= 6 || hiz) {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 |
-		GEN5_DEPTH_CLEAR_VALID |
-		(2 - 2));
-      OUT_BATCH(depth_mt ? depth_mt->depth_clear_value : 0);
-      ADVANCE_BATCH();
-   }
 }
 
 const struct brw_tracked_state brw_depthbuffer = {
@@ -702,127 +418,6 @@
    .emit = brw_emit_depthbuffer,
 };
 
-/**
- * Polygon stipple packet
- */
-static void
-upload_polygon_stipple(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   GLuint i;
-
-   /* _NEW_POLYGON */
-   if (!ctx->Polygon.StippleFlag)
-      return;
-
-   BEGIN_BATCH(33);
-   OUT_BATCH(_3DSTATE_POLY_STIPPLE_PATTERN << 16 | (33 - 2));
-
-   /* Polygon stipple is provided in OpenGL order, i.e. bottom
-    * row first.  If we're rendering to a window (i.e. the
-    * default frame buffer object, 0), then we need to invert
-    * it to match our pixel layout.  But if we're rendering
-    * to a FBO (i.e. any named frame buffer object), we *don't*
-    * need to invert - we already match the layout.
-    */
-   if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
-      for (i = 0; i < 32; i++)
-	  OUT_BATCH(ctx->PolygonStipple[31 - i]); /* invert */
-   } else {
-      for (i = 0; i < 32; i++)
-	 OUT_BATCH(ctx->PolygonStipple[i]);
-   }
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state brw_polygon_stipple = {
-   .dirty = {
-      .mesa = _NEW_POLYGON |
-              _NEW_POLYGONSTIPPLE,
-      .brw = BRW_NEW_CONTEXT,
-   },
-   .emit = upload_polygon_stipple
-};
-
-/**
- * Polygon stipple offset packet
- */
-static void
-upload_polygon_stipple_offset(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-
-   /* _NEW_POLYGON */
-   if (!ctx->Polygon.StippleFlag)
-      return;
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_POLY_STIPPLE_OFFSET << 16 | (2-2));
-
-   /* _NEW_BUFFERS
-    *
-    * If we're drawing to a system window we have to invert the Y axis
-    * in order to match the OpenGL pixel coordinate system, and our
-    * offset must be matched to the window position.  If we're drawing
-    * to a user-created FBO then our native pixel coordinate system
-    * works just fine, and there's no window system to worry about.
-    */
-   if (_mesa_is_winsys_fbo(ctx->DrawBuffer))
-      OUT_BATCH((32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31);
-   else
-      OUT_BATCH(0);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state brw_polygon_stipple_offset = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_POLYGON,
-      .brw = BRW_NEW_CONTEXT,
-   },
-   .emit = upload_polygon_stipple_offset
-};
-
-/**
- * Line stipple packet
- */
-static void
-upload_line_stipple(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   GLfloat tmp;
-   GLint tmpi;
-
-   if (!ctx->Line.StippleFlag)
-      return;
-
-   BEGIN_BATCH(3);
-   OUT_BATCH(_3DSTATE_LINE_STIPPLE_PATTERN << 16 | (3 - 2));
-   OUT_BATCH(ctx->Line.StipplePattern);
-
-   if (brw->gen >= 7) {
-      /* in U1.16 */
-      tmp = 1.0f / ctx->Line.StippleFactor;
-      tmpi = tmp * (1<<16);
-      OUT_BATCH(tmpi << 15 | ctx->Line.StippleFactor);
-   } else {
-      /* in U1.13 */
-      tmp = 1.0f / ctx->Line.StippleFactor;
-      tmpi = tmp * (1<<13);
-      OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor);
-   }
-
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state brw_line_stipple = {
-   .dirty = {
-      .mesa = _NEW_LINE,
-      .brw = BRW_NEW_CONTEXT,
-   },
-   .emit = upload_line_stipple
-};
-
 void
 brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
index f35e8f8..f0bccac 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
+++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
@@ -135,7 +135,7 @@
       struct gl_uniform_storage *storage =
          &prog->sh.data->UniformStorage[u];
 
-      if (storage->builtin)
+      if (storage->builtin || storage->type->is_sampler())
          continue;
 
       if (strncmp(var->name, storage->name, namelen) != 0 ||
diff --git a/src/mesa/drivers/dri/i965/brw_oa.py b/src/mesa/drivers/dri/i965/brw_oa.py
index bf950b1..254c512 100644
--- a/src/mesa/drivers/dri/i965/brw_oa.py
+++ b/src/mesa/drivers/dri/i965/brw_oa.py
@@ -214,7 +214,9 @@
     value = stack[-1]
 
     if value in hw_vars:
-        value = hw_vars[value];
+        value = hw_vars[value]
+    if value in counter_vars:
+        value = read_funcs[value[1:]] + "(brw, query, accumulator)"
 
     c("\nreturn " + value + ";")
 
diff --git a/src/mesa/drivers/dri/i965/brw_oa_bdw.xml b/src/mesa/drivers/dri/i965/brw_oa_bdw.xml
new file mode 100644
index 0000000..ee3c359
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_oa_bdw.xml
@@ -0,0 +1,15051 @@
+<?xml version="1.0"?>
+<metrics version="1491574821" merge_md5="">
+  <set name="Render Metrics Basic Gen8"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_basic"
+       hw_config_guid="b541bd57-0e0f-4154-b4c0-5858010a2bf7"
+       chipset="BDW"
+       symbol_name="RenderBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Misses 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Cache Misses"
+             description="The total number of sampler cache misses in all LODs in all sampler units."
+             data_type="uint64"
+             equation="B 4 READ B 5 READ UADD 8 UMUL"
+             underscore_name="sampler_l1_misses"
+             units="messages"
+             symbol_name="SamplerL1Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler 1 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 1 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler1_bottleneck"
+             units="percent"
+             symbol_name="Sampler1Bottleneck"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="$SamplerL1Misses 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Sampler 0 Busy"
+             description="The percentage of time in which Sampler 0 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler0_busy"
+             units="percent"
+             symbol_name="Sampler0Busy"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler 1 Busy"
+             description="The percentage of time in which Sampler 1 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler1_busy"
+             units="percent"
+             symbol_name="Sampler1Busy"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Samplers Busy"
+             description="The percentage of time in which samplers have been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="$Sampler0Busy $Sampler1Busy FMAX"
+             underscore_name="samplers_busy"
+             units="percent"
+             symbol_name="SamplersBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI Fixed Pipe Throughput"
+             description="The total number of GPU memory bytes transferred between 3D Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD 64 UMUL"
+             underscore_name="gti_vf_throughput"
+             units="bytes"
+             symbol_name="GtiVfThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/3D Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler 0 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 0 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler0_bottleneck"
+             units="percent"
+             symbol_name="Sampler0Bottleneck"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="L3 Lookup Accesses w/o IC"
+             description="The total number of L3 cache lookup accesses w/o IC."
+             data_type="uint64"
+             equation="$SamplerL1Misses $ShaderMemoryAccesses UADD"
+             underscore_name="l3_lookups"
+             units="messages"
+             symbol_name="L3Lookups"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Depth Throughput"
+             description="The total number of GPU memory bytes transferred between depth caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 0 READ C 1 READ UADD 64 UMUL"
+             underscore_name="gti_depth_throughput"
+             units="bytes"
+             symbol_name="GtiDepthThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Depth Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Samplers Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which samplers have been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="$Sampler0Bottleneck $Sampler1Bottleneck FMAX"
+             max_equation="100"
+             underscore_name="sampler_bottleneck"
+             units="percent"
+             symbol_name="SamplerBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Indicate System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI HDC TLB Lookup Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and HDC, when HDC is doing TLB lookups."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_hdc_lookups_throughput"
+             units="bytes"
+             symbol_name="GtiHdcLookupsThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI RCC Throughput"
+             description="The total number of GPU memory bytes transferred between render color caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 2 READ C 3 READ UADD 64 UMUL"
+             underscore_name="gti_rcc_throughput"
+             units="bytes"
+             symbol_name="GtiRccThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Color Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL 2 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA"
+                     availability="$SliceMask 0x01 AND"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x143F000F" />
+        <register type="NOA" address="0x00009888" value="0x14110014" />
+        <register type="NOA" address="0x00009888" value="0x14310014" />
+        <register type="NOA" address="0x00009888" value="0x14BF000F" />
+        <register type="NOA" address="0x00009888" value="0x118A0317" />
+        <register type="NOA" address="0x00009888" value="0x13837BE0" />
+        <register type="NOA" address="0x00009888" value="0x3B800060" />
+        <register type="NOA" address="0x00009888" value="0x3D800005" />
+        <register type="NOA" address="0x00009888" value="0x005C4000" />
+        <register type="NOA" address="0x00009888" value="0x065C8000" />
+        <register type="NOA" address="0x00009888" value="0x085CC000" />
+        <register type="NOA" address="0x00009888" value="0x003D8000" />
+        <register type="NOA" address="0x00009888" value="0x183D0800" />
+        <register type="NOA" address="0x00009888" value="0x0A3F0023" />
+        <register type="NOA" address="0x00009888" value="0x103F0000" />
+        <register type="NOA" address="0x00009888" value="0x00584000" />
+        <register type="NOA" address="0x00009888" value="0x08584000" />
+        <register type="NOA" address="0x00009888" value="0x0A5A4000" />
+        <register type="NOA" address="0x00009888" value="0x005B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B8000" />
+        <register type="NOA" address="0x00009888" value="0x185B2400" />
+        <register type="NOA" address="0x00009888" value="0x0A1D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C1F0800" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAA00" />
+        <register type="NOA" address="0x00009888" value="0x00384000" />
+        <register type="NOA" address="0x00009888" value="0x0E384000" />
+        <register type="NOA" address="0x00009888" value="0x16384000" />
+        <register type="NOA" address="0x00009888" value="0x18380001" />
+        <register type="NOA" address="0x00009888" value="0x00392000" />
+        <register type="NOA" address="0x00009888" value="0x06398000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A391000" />
+        <register type="NOA" address="0x00009888" value="0x00104000" />
+        <register type="NOA" address="0x00009888" value="0x08104000" />
+        <register type="NOA" address="0x00009888" value="0x00110030" />
+        <register type="NOA" address="0x00009888" value="0x08110031" />
+        <register type="NOA" address="0x00009888" value="0x10110000" />
+        <register type="NOA" address="0x00009888" value="0x00134000" />
+        <register type="NOA" address="0x00009888" value="0x16130020" />
+        <register type="NOA" address="0x00009888" value="0x06308000" />
+        <register type="NOA" address="0x00009888" value="0x08308000" />
+        <register type="NOA" address="0x00009888" value="0x06311800" />
+        <register type="NOA" address="0x00009888" value="0x08311880" />
+        <register type="NOA" address="0x00009888" value="0x10310000" />
+        <register type="NOA" address="0x00009888" value="0x0E334000" />
+        <register type="NOA" address="0x00009888" value="0x16330080" />
+        <register type="NOA" address="0x00009888" value="0x0ABF1180" />
+        <register type="NOA" address="0x00009888" value="0x10BF0000" />
+        <register type="NOA" address="0x00009888" value="0x0ADA8000" />
+        <register type="NOA" address="0x00009888" value="0x0A9D8000" />
+        <register type="NOA" address="0x00009888" value="0x109F0002" />
+        <register type="NOA" address="0x00009888" value="0x0AB94000" />
+        <register type="NOA" address="0x00009888" value="0x0D888000" />
+        <register type="NOA" address="0x00009888" value="0x038A0380" />
+        <register type="NOA" address="0x00009888" value="0x058A000E" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A00A0" />
+        <register type="NOA" address="0x00009888" value="0x078A0000" />
+        <register type="NOA" address="0x00009888" value="0x098A0000" />
+        <register type="NOA" address="0x00009888" value="0x238B2820" />
+        <register type="NOA" address="0x00009888" value="0x258B2550" />
+        <register type="NOA" address="0x00009888" value="0x198C1000" />
+        <register type="NOA" address="0x00009888" value="0x0B8D8000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA0" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x0D831021" />
+        <register type="NOA" address="0x00009888" value="0x0F83572F" />
+        <register type="NOA" address="0x00009888" value="0x01835680" />
+        <register type="NOA" address="0x00009888" value="0x0383002C" />
+        <register type="NOA" address="0x00009888" value="0x11830000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830001" />
+        <register type="NOA" address="0x00009888" value="0x05830000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x05844000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C137" />
+        <register type="NOA" address="0x00009888" value="0x1D80C147" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x15804000" />
+        <register type="NOA" address="0x00009888" value="0x4D801110" />
+        <register type="NOA" address="0x00009888" value="0x4F800331" />
+        <register type="NOA" address="0x00009888" value="0x43800802" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45801465" />
+        <register type="NOA" address="0x00009888" value="0x53801111" />
+        <register type="NOA" address="0x00009888" value="0x478014A5" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800CA5" />
+        <register type="NOA" address="0x00009888" value="0x41800003" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SliceMask 0x02 AND"
+                     priority="1"
+                     >
+        <register type="NOA" address="0x00009888" value="0x143F000F" />
+        <register type="NOA" address="0x00009888" value="0x14BF000F" />
+        <register type="NOA" address="0x00009888" value="0x14910014" />
+        <register type="NOA" address="0x00009888" value="0x14B10014" />
+        <register type="NOA" address="0x00009888" value="0x118A0317" />
+        <register type="NOA" address="0x00009888" value="0x13837BE0" />
+        <register type="NOA" address="0x00009888" value="0x3B800060" />
+        <register type="NOA" address="0x00009888" value="0x3D800005" />
+        <register type="NOA" address="0x00009888" value="0x0A3F0023" />
+        <register type="NOA" address="0x00009888" value="0x103F0000" />
+        <register type="NOA" address="0x00009888" value="0x0A5A4000" />
+        <register type="NOA" address="0x00009888" value="0x0A1D4000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F8000" />
+        <register type="NOA" address="0x00009888" value="0x0A391000" />
+        <register type="NOA" address="0x00009888" value="0x00DC4000" />
+        <register type="NOA" address="0x00009888" value="0x06DC8000" />
+        <register type="NOA" address="0x00009888" value="0x08DCC000" />
+        <register type="NOA" address="0x00009888" value="0x00BD8000" />
+        <register type="NOA" address="0x00009888" value="0x18BD0800" />
+        <register type="NOA" address="0x00009888" value="0x0ABF1180" />
+        <register type="NOA" address="0x00009888" value="0x10BF0000" />
+        <register type="NOA" address="0x00009888" value="0x00D84000" />
+        <register type="NOA" address="0x00009888" value="0x08D84000" />
+        <register type="NOA" address="0x00009888" value="0x0ADA8000" />
+        <register type="NOA" address="0x00009888" value="0x00DB4000" />
+        <register type="NOA" address="0x00009888" value="0x0EDB8000" />
+        <register type="NOA" address="0x00009888" value="0x18DB2400" />
+        <register type="NOA" address="0x00009888" value="0x0A9D8000" />
+        <register type="NOA" address="0x00009888" value="0x0C9F0800" />
+        <register type="NOA" address="0x00009888" value="0x0E9F2A00" />
+        <register type="NOA" address="0x00009888" value="0x109F0002" />
+        <register type="NOA" address="0x00009888" value="0x00B84000" />
+        <register type="NOA" address="0x00009888" value="0x0EB84000" />
+        <register type="NOA" address="0x00009888" value="0x16B84000" />
+        <register type="NOA" address="0x00009888" value="0x18B80001" />
+        <register type="NOA" address="0x00009888" value="0x00B92000" />
+        <register type="NOA" address="0x00009888" value="0x06B98000" />
+        <register type="NOA" address="0x00009888" value="0x08B9A000" />
+        <register type="NOA" address="0x00009888" value="0x0AB94000" />
+        <register type="NOA" address="0x00009888" value="0x00904000" />
+        <register type="NOA" address="0x00009888" value="0x08904000" />
+        <register type="NOA" address="0x00009888" value="0x00910030" />
+        <register type="NOA" address="0x00009888" value="0x08910031" />
+        <register type="NOA" address="0x00009888" value="0x10910000" />
+        <register type="NOA" address="0x00009888" value="0x00934000" />
+        <register type="NOA" address="0x00009888" value="0x16930020" />
+        <register type="NOA" address="0x00009888" value="0x06B08000" />
+        <register type="NOA" address="0x00009888" value="0x08B08000" />
+        <register type="NOA" address="0x00009888" value="0x06B11800" />
+        <register type="NOA" address="0x00009888" value="0x08B11880" />
+        <register type="NOA" address="0x00009888" value="0x10B10000" />
+        <register type="NOA" address="0x00009888" value="0x0EB34000" />
+        <register type="NOA" address="0x00009888" value="0x16B30080" />
+        <register type="NOA" address="0x00009888" value="0x01888000" />
+        <register type="NOA" address="0x00009888" value="0x0D88B800" />
+        <register type="NOA" address="0x00009888" value="0x038A0380" />
+        <register type="NOA" address="0x00009888" value="0x058A000E" />
+        <register type="NOA" address="0x00009888" value="0x1B8A0080" />
+        <register type="NOA" address="0x00009888" value="0x078A0000" />
+        <register type="NOA" address="0x00009888" value="0x098A0000" />
+        <register type="NOA" address="0x00009888" value="0x238B2840" />
+        <register type="NOA" address="0x00009888" value="0x258B26A0" />
+        <register type="NOA" address="0x00009888" value="0x018C4000" />
+        <register type="NOA" address="0x00009888" value="0x0F8C4000" />
+        <register type="NOA" address="0x00009888" value="0x178C2000" />
+        <register type="NOA" address="0x00009888" value="0x198C1100" />
+        <register type="NOA" address="0x00009888" value="0x018D2000" />
+        <register type="NOA" address="0x00009888" value="0x078D8000" />
+        <register type="NOA" address="0x00009888" value="0x098DA000" />
+        <register type="NOA" address="0x00009888" value="0x0B8D8000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA0" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x0D831021" />
+        <register type="NOA" address="0x00009888" value="0x0F83572F" />
+        <register type="NOA" address="0x00009888" value="0x01835680" />
+        <register type="NOA" address="0x00009888" value="0x0383002C" />
+        <register type="NOA" address="0x00009888" value="0x11830000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830001" />
+        <register type="NOA" address="0x00009888" value="0x05830000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x05844000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C137" />
+        <register type="NOA" address="0x00009888" value="0x1D80C147" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x15804000" />
+        <register type="NOA" address="0x00009888" value="0x4D801550" />
+        <register type="NOA" address="0x00009888" value="0x4F800331" />
+        <register type="NOA" address="0x00009888" value="0x43800802" />
+        <register type="NOA" address="0x00009888" value="0x51800400" />
+        <register type="NOA" address="0x00009888" value="0x458004A1" />
+        <register type="NOA" address="0x00009888" value="0x53805555" />
+        <register type="NOA" address="0x00009888" value="0x47800421" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F801421" />
+        <register type="NOA" address="0x00009888" value="0x41800845" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Basic Gen8"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_basic"
+       hw_config_guid="35fbc9b2-a891-40a6-a38d-022bb7057552"
+       chipset="BDW"
+       symbol_name="ComputeBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Bytes Read"
+             description="The total number of untyped memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 6 READ B 7 READ C 0 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_read"
+             units="bytes"
+             symbol_name="UntypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Bytes Written"
+             description="The total number of typed memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 3 READ B 4 READ B 5 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_written"
+             units="bytes"
+             symbol_name="TypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 4 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Writes"
+             description="The total number of untyped memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 1 READ C 2 READ C 3 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_written"
+             units="bytes"
+             symbol_name="UntypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Typed Bytes Read"
+             description="The total number of typed memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 0 READ B 1 READ B 2 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_read"
+             units="bytes"
+             symbol_name="TypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL  $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL 2 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA"
+                     availability="$SliceMask 0x01 AND"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x105C00E0" />
+        <register type="NOA" address="0x00009888" value="0x105800E0" />
+        <register type="NOA" address="0x00009888" value="0x103800E0" />
+        <register type="NOA" address="0x00009888" value="0x3580001A" />
+        <register type="NOA" address="0x00009888" value="0x3B800060" />
+        <register type="NOA" address="0x00009888" value="0x3D800005" />
+        <register type="NOA" address="0x00009888" value="0x065C2100" />
+        <register type="NOA" address="0x00009888" value="0x0A5C0041" />
+        <register type="NOA" address="0x00009888" value="0x0C5C6600" />
+        <register type="NOA" address="0x00009888" value="0x005C6580" />
+        <register type="NOA" address="0x00009888" value="0x085C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E5C8000" />
+        <register type="NOA" address="0x00009888" value="0x00580042" />
+        <register type="NOA" address="0x00009888" value="0x08582080" />
+        <register type="NOA" address="0x00009888" value="0x0C58004C" />
+        <register type="NOA" address="0x00009888" value="0x0E582580" />
+        <register type="NOA" address="0x00009888" value="0x005B4000" />
+        <register type="NOA" address="0x00009888" value="0x185B1000" />
+        <register type="NOA" address="0x00009888" value="0x1A5B0104" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAA00" />
+        <register type="NOA" address="0x00009888" value="0x101F02AA" />
+        <register type="NOA" address="0x00009888" value="0x08380042" />
+        <register type="NOA" address="0x00009888" value="0x0A382080" />
+        <register type="NOA" address="0x00009888" value="0x0E38404C" />
+        <register type="NOA" address="0x00009888" value="0x0238404B" />
+        <register type="NOA" address="0x00009888" value="0x00384000" />
+        <register type="NOA" address="0x00009888" value="0x16380000" />
+        <register type="NOA" address="0x00009888" value="0x18381145" />
+        <register type="NOA" address="0x00009888" value="0x04380000" />
+        <register type="NOA" address="0x00009888" value="0x0039A000" />
+        <register type="NOA" address="0x00009888" value="0x06398000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A39A000" />
+        <register type="NOA" address="0x00009888" value="0x0C39A000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x02392000" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x238B02A0" />
+        <register type="NOA" address="0x00009888" value="0x258B5550" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x1F850A80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA0" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x03844000" />
+        <register type="NOA" address="0x00009888" value="0x17808137" />
+        <register type="NOA" address="0x00009888" value="0x1980C147" />
+        <register type="NOA" address="0x00009888" value="0x1B80C0E5" />
+        <register type="NOA" address="0x00009888" value="0x1D80C0E3" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x13804000" />
+        <register type="NOA" address="0x00009888" value="0x15800000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D801000" />
+        <register type="NOA" address="0x00009888" value="0x4F800111" />
+        <register type="NOA" address="0x00009888" value="0x43800062" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800062" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800062" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F801062" />
+        <register type="NOA" address="0x00009888" value="0x41801084" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SliceMask 0x02 AND"
+                     priority="2"
+                     >
+        <register type="NOA" address="0x00009888" value="0x10DC00E0" />
+        <register type="NOA" address="0x00009888" value="0x10D800E0" />
+        <register type="NOA" address="0x00009888" value="0x10B800E0" />
+        <register type="NOA" address="0x00009888" value="0x3580001A" />
+        <register type="NOA" address="0x00009888" value="0x3B800060" />
+        <register type="NOA" address="0x00009888" value="0x3D800005" />
+        <register type="NOA" address="0x00009888" value="0x06DC2100" />
+        <register type="NOA" address="0x00009888" value="0x0ADC0041" />
+        <register type="NOA" address="0x00009888" value="0x0CDC6600" />
+        <register type="NOA" address="0x00009888" value="0x00DC6580" />
+        <register type="NOA" address="0x00009888" value="0x08DC8000" />
+        <register type="NOA" address="0x00009888" value="0x0EDC8000" />
+        <register type="NOA" address="0x00009888" value="0x00D80042" />
+        <register type="NOA" address="0x00009888" value="0x08D82080" />
+        <register type="NOA" address="0x00009888" value="0x0CD8004C" />
+        <register type="NOA" address="0x00009888" value="0x0ED82580" />
+        <register type="NOA" address="0x00009888" value="0x00DB4000" />
+        <register type="NOA" address="0x00009888" value="0x18DB1000" />
+        <register type="NOA" address="0x00009888" value="0x1ADB0104" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E9FAA00" />
+        <register type="NOA" address="0x00009888" value="0x109F02AA" />
+        <register type="NOA" address="0x00009888" value="0x08B80042" />
+        <register type="NOA" address="0x00009888" value="0x0AB82080" />
+        <register type="NOA" address="0x00009888" value="0x0EB8404C" />
+        <register type="NOA" address="0x00009888" value="0x02B8404B" />
+        <register type="NOA" address="0x00009888" value="0x00B84000" />
+        <register type="NOA" address="0x00009888" value="0x16B80000" />
+        <register type="NOA" address="0x00009888" value="0x18B81145" />
+        <register type="NOA" address="0x00009888" value="0x04B80000" />
+        <register type="NOA" address="0x00009888" value="0x00B9A000" />
+        <register type="NOA" address="0x00009888" value="0x06B98000" />
+        <register type="NOA" address="0x00009888" value="0x08B9A000" />
+        <register type="NOA" address="0x00009888" value="0x0AB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0CB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x02B92000" />
+        <register type="NOA" address="0x00009888" value="0x01888000" />
+        <register type="NOA" address="0x00009888" value="0x0D88F800" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x238B0540" />
+        <register type="NOA" address="0x00009888" value="0x258BAAA0" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x018C4000" />
+        <register type="NOA" address="0x00009888" value="0x0F8C4000" />
+        <register type="NOA" address="0x00009888" value="0x178C2000" />
+        <register type="NOA" address="0x00009888" value="0x198C5500" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x018DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D8000" />
+        <register type="NOA" address="0x00009888" value="0x098DA000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x038D2000" />
+        <register type="NOA" address="0x00009888" value="0x1F850A80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA0" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x03844000" />
+        <register type="NOA" address="0x00009888" value="0x17808137" />
+        <register type="NOA" address="0x00009888" value="0x1980C147" />
+        <register type="NOA" address="0x00009888" value="0x1B80C0E5" />
+        <register type="NOA" address="0x00009888" value="0x1D80C0E3" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x13804000" />
+        <register type="NOA" address="0x00009888" value="0x15800000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D805000" />
+        <register type="NOA" address="0x00009888" value="0x4F800555" />
+        <register type="NOA" address="0x00009888" value="0x43800062" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800062" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800062" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800062" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Render Metrics for 3D Pipeline Profile"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_pipe_profile"
+       hw_config_guid="233d0544-fff7-4281-8291-e02f222aff72"
+       chipset="BDW"
+       symbol_name="RenderPipeProfile"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which vertex shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_bottleneck"
+             units="percent"
+             symbol_name="VsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Hi-Depth Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which early hierarchical depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hi_depth_bottleneck"
+             units="percent"
+             symbol_name="HiDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which geometry shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gs_bottleneck"
+             units="percent"
+             symbol_name="GsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Geometry Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="BC Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which barycentric coordinates calculation pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="bc_bottleneck"
+             units="percent"
+             symbol_name="BcBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Barycentric Calc"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Stall"
+             description="The percentage of time in which hull shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_stall"
+             units="percent"
+             symbol_name="HsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Hull Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="VF Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which vertex fetch pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vf_bottleneck"
+             units="percent"
+             symbol_name="VfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Input Assembler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Strip-Fans Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which strip-fans pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="10"
+             equation="B 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_bottleneck"
+             units="percent"
+             symbol_name="SfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Accesses"
+             description="The total number of messages sent to samplers."
+             data_type="uint64"
+             equation="A 28 READ"
+             underscore_name="sampler_accesses"
+             units="messages"
+             symbol_name="SamplerAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler"
+             />
+    <counter name="SF Stall"
+             description="The percentage of time in which strip-fans pipeline stage was stalled."
+             data_type="float"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_stall"
+             units="percent"
+             symbol_name="SfStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Bottleneck"
+             low_watermark="3"
+             description="The percentage of time in which hull shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="9"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_bottleneck"
+             units="percent"
+             symbol_name="HsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Hull Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CL Stall"
+             description="The percentage of time in which clipper pipeline stage was stalled."
+             data_type="float"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_stall"
+             units="percent"
+             symbol_name="ClStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Clipper"
+             />
+    <counter name="SO Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which stream output pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_bottleneck"
+             units="percent"
+             symbol_name="SoBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Stream Output"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="DS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which domain shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_bottleneck"
+             units="percent"
+             symbol_name="DsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Domain Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Clipper Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which clipper pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_bottleneck"
+             units="percent"
+             symbol_name="ClBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Clipper"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Stall"
+             description="The percentage of time in which domain shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_stall"
+             units="percent"
+             symbol_name="DsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Domain Shader"
+             />
+    <counter name="Early Depth Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which early depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="early_depth_bottleneck"
+             units="percent"
+             symbol_name="EarlyDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL 2 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SO Stall"
+             description="The percentage of time in which stream-output pipeline stage was stalled."
+             data_type="float"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_stall"
+             units="percent"
+             symbol_name="SoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Stream Output"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x0A1E0000" />
+        <register type="NOA" address="0x00009888" value="0x0C1F000F" />
+        <register type="NOA" address="0x00009888" value="0x10176800" />
+        <register type="NOA" address="0x00009888" value="0x1191001F" />
+        <register type="NOA" address="0x00009888" value="0x0B880320" />
+        <register type="NOA" address="0x00009888" value="0x01890C40" />
+        <register type="NOA" address="0x00009888" value="0x118A1C00" />
+        <register type="NOA" address="0x00009888" value="0x118D7C00" />
+        <register type="NOA" address="0x00009888" value="0x118E0020" />
+        <register type="NOA" address="0x00009888" value="0x118F4C00" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x13900001" />
+        <register type="NOA" address="0x00009888" value="0x065C4000" />
+        <register type="NOA" address="0x00009888" value="0x0C3D8000" />
+        <register type="NOA" address="0x00009888" value="0x06584000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B4000" />
+        <register type="NOA" address="0x00009888" value="0x081E0040" />
+        <register type="NOA" address="0x00009888" value="0x0E1E0000" />
+        <register type="NOA" address="0x00009888" value="0x021F5400" />
+        <register type="NOA" address="0x00009888" value="0x001F0000" />
+        <register type="NOA" address="0x00009888" value="0x101F0010" />
+        <register type="NOA" address="0x00009888" value="0x0E1F0080" />
+        <register type="NOA" address="0x00009888" value="0x0C384000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x0C13C000" />
+        <register type="NOA" address="0x00009888" value="0x06164000" />
+        <register type="NOA" address="0x00009888" value="0x06170012" />
+        <register type="NOA" address="0x00009888" value="0x00170000" />
+        <register type="NOA" address="0x00009888" value="0x01910005" />
+        <register type="NOA" address="0x00009888" value="0x07880002" />
+        <register type="NOA" address="0x00009888" value="0x01880C00" />
+        <register type="NOA" address="0x00009888" value="0x0F880000" />
+        <register type="NOA" address="0x00009888" value="0x0D880000" />
+        <register type="NOA" address="0x00009888" value="0x05880000" />
+        <register type="NOA" address="0x00009888" value="0x09890032" />
+        <register type="NOA" address="0x00009888" value="0x078A0800" />
+        <register type="NOA" address="0x00009888" value="0x0F8A0A00" />
+        <register type="NOA" address="0x00009888" value="0x198A4000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A2000" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0000" />
+        <register type="NOA" address="0x00009888" value="0x038A4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x238B54C0" />
+        <register type="NOA" address="0x00009888" value="0x258BAA55" />
+        <register type="NOA" address="0x00009888" value="0x278B0019" />
+        <register type="NOA" address="0x00009888" value="0x198C0100" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x0F8D0015" />
+        <register type="NOA" address="0x00009888" value="0x018D1000" />
+        <register type="NOA" address="0x00009888" value="0x098D8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DF000" />
+        <register type="NOA" address="0x00009888" value="0x0D8D3000" />
+        <register type="NOA" address="0x00009888" value="0x038DE000" />
+        <register type="NOA" address="0x00009888" value="0x058D3000" />
+        <register type="NOA" address="0x00009888" value="0x0D8E0004" />
+        <register type="NOA" address="0x00009888" value="0x058E000C" />
+        <register type="NOA" address="0x00009888" value="0x098E0000" />
+        <register type="NOA" address="0x00009888" value="0x078E0000" />
+        <register type="NOA" address="0x00009888" value="0x038E0000" />
+        <register type="NOA" address="0x00009888" value="0x0B8F0020" />
+        <register type="NOA" address="0x00009888" value="0x198F0C00" />
+        <register type="NOA" address="0x00009888" value="0x078F8000" />
+        <register type="NOA" address="0x00009888" value="0x098F4000" />
+        <register type="NOA" address="0x00009888" value="0x0B900980" />
+        <register type="NOA" address="0x00009888" value="0x03900D80" />
+        <register type="NOA" address="0x00009888" value="0x01900000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x0784C000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D801111" />
+        <register type="NOA" address="0x00009888" value="0x3D800800" />
+        <register type="NOA" address="0x00009888" value="0x4F801011" />
+        <register type="NOA" address="0x00009888" value="0x43800443" />
+        <register type="NOA" address="0x00009888" value="0x51801111" />
+        <register type="NOA" address="0x00009888" value="0x45800422" />
+        <register type="NOA" address="0x00009888" value="0x53801111" />
+        <register type="NOA" address="0x00009888" value="0x47800C60" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800422" />
+        <register type="NOA" address="0x00009888" value="0x41800021" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFEA" />
+        <register type="OA" address="0x00002774" value="0x00007FFC" />
+        <register type="OA" address="0x00002778" value="0x0007AFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000F5FD" />
+        <register type="OA" address="0x00002780" value="0x00079FFA" />
+        <register type="OA" address="0x00002784" value="0x0000F3FB" />
+        <register type="OA" address="0x00002788" value="0x0007BF7A" />
+        <register type="OA" address="0x0000278C" value="0x0000F7E7" />
+        <register type="OA" address="0x00002790" value="0x0007FEFA" />
+        <register type="OA" address="0x00002794" value="0x0000F7CF" />
+        <register type="OA" address="0x00002798" value="0x00077FFA" />
+        <register type="OA" address="0x0000279C" value="0x0000EFDF" />
+        <register type="OA" address="0x000027A0" value="0x0006FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000CFBF" />
+        <register type="OA" address="0x000027A8" value="0x0003FFFA" />
+        <register type="OA" address="0x000027AC" value="0x00005F7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Reads Distribution Gen8"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_reads"
+       hw_config_guid="2b255d48-2117-4fef-a8f7-f151e1d25a2c"
+       chipset="BDW"
+       symbol_name="MemoryReads"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank0Reads"
+             description="The total number of GTI memory reads from L3 Bank 0 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_reads"
+             units="messages"
+             symbol_name="GtiL3Bank0Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all accesses from GTI to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiL3Bank3Reads"
+             description="The total number of GTI memory reads from L3 Bank 3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_reads"
+             units="messages"
+             symbol_name="GtiL3Bank3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiRsMemoryReads"
+             description="The total number of GTI memory reads from Resource Streamer."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_rs_memory_reads"
+             units="messages"
+             symbol_name="GtiRsMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Resource Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiHizMemoryReads"
+             description="The total number of GTI memory reads from Hierarchical Depth Cache (Hi-Depth Cache misses)."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_reads"
+             units="messages"
+             symbol_name="GtiHizMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="GtiRccMemoryReads"
+             description="The total number of GTI memory reads from Render Color Cache (Render Color Cache misses)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_reads"
+             units="messages"
+             symbol_name="GtiRccMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank1Reads"
+             description="The total number of GTI memory reads from L3 Bank 1 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_reads"
+             units="messages"
+             symbol_name="GtiL3Bank1Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiCmdStreamerMemoryReads"
+             description="The total number of GTI memory reads from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_reads"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="GtiL3Bank2Reads"
+             description="The total number of GTI memory reads from L3 Bank 2 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_reads"
+             units="messages"
+             symbol_name="GtiL3Bank2Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiMemoryReads"
+             description="The total number of GTI memory reads."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_reads"
+             units="messages"
+             symbol_name="GtiMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GtiRczMemoryReads"
+             description="The total number of GTI memory reads from Render Depth Cache (Render Depth Cache misses)."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_reads"
+             units="messages"
+             symbol_name="GtiRczMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="GtiMscMemoryReads"
+             description="The total number of GTI memory reads from Multisampling Color Cache (Multisampling Color Cache misses)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_reads"
+             units="messages"
+             symbol_name="GtiMscMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiVfMemoryReads"
+             description="The total number of GTI memory reads from Vertex Fetch."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="gti_vf_memory_reads"
+             units="messages"
+             symbol_name="GtiVfMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Vertex Fetch"
+             />
+    <counter name="GtiStcMemoryReads"
+             description="The total number of GTI memory reads from Stencil Cache (Stencil Cache misses)."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_reads"
+             units="messages"
+             symbol_name="GtiStcMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL 2 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="GtiL3Reads"
+             description="The total number of GTI memory reads from L3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Reads $GtiL3Bank1Reads $GtiL3Bank2Reads $GtiL3Bank3Reads UADD UADD UADD"
+             underscore_name="gti_l3_reads"
+             units="messages"
+             symbol_name="GtiL3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x198B0343" />
+        <register type="NOA" address="0x00009888" value="0x13845800" />
+        <register type="NOA" address="0x00009888" value="0x15840018" />
+        <register type="NOA" address="0x00009888" value="0x3580001A" />
+        <register type="NOA" address="0x00009888" value="0x038B6300" />
+        <register type="NOA" address="0x00009888" value="0x058B6B62" />
+        <register type="NOA" address="0x00009888" value="0x078B006A" />
+        <register type="NOA" address="0x00009888" value="0x118B0000" />
+        <register type="NOA" address="0x00009888" value="0x238B0000" />
+        <register type="NOA" address="0x00009888" value="0x258B0000" />
+        <register type="NOA" address="0x00009888" value="0x1F85A080" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x2385000A" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x01840018" />
+        <register type="NOA" address="0x00009888" value="0x07844C80" />
+        <register type="NOA" address="0x00009888" value="0x09840D9A" />
+        <register type="NOA" address="0x00009888" value="0x0B840E9C" />
+        <register type="NOA" address="0x00009888" value="0x0D840F9E" />
+        <register type="NOA" address="0x00009888" value="0x0F840010" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x03848000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x2F8000E5" />
+        <register type="NOA" address="0x00009888" value="0x138080E3" />
+        <register type="NOA" address="0x00009888" value="0x1580C0E1" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x11804000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F804000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3D800800" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800842" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800842" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47801042" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800084" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F872" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Writes Distribution Gen8"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_writes"
+       hw_config_guid="f7fd3220-b466-4a4d-9f98-b0caf3f2394c"
+       chipset="BDW"
+       symbol_name="MemoryWrites"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiMemoryWrites"
+             description="The total number of GTI memory writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_writes"
+             units="messages"
+             symbol_name="GtiMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all GTI accesses to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiMscMemoryWrites"
+             description="The total number of GTI memory writes from Multisampling Color Cache (Multisampling Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_writes"
+             units="messages"
+             symbol_name="GtiMscMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiCmdStreamerMemoryWrites"
+             description="The total number of GTI memory writes from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_writes"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiL3Bank0Writes"
+             description="The total number of GTI memory writes from L3 Bank 0 (L3 Bank 0 invalidations)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_writes"
+             units="messages"
+             symbol_name="GtiL3Bank0Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank1Writes"
+             description="The total number of GTI memory writes from L3 Bank 1 (L3 Bank 1 invalidations)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_writes"
+             units="messages"
+             symbol_name="GtiL3Bank1Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank2Writes"
+             description="The total number of GTI memory writes from L3 Bank 2 (L3 Bank 2 invalidations)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_writes"
+             units="messages"
+             symbol_name="GtiL3Bank2Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank3Writes"
+             description="The total number of GTI memory writes from L3 Bank 3 (L3 Bank 3 invalidations)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_writes"
+             units="messages"
+             symbol_name="GtiL3Bank3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Writes"
+             description="The total number of GTI memory writes from L3 (L3 invalidations)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Writes $GtiL3Bank1Writes $GtiL3Bank2Writes $GtiL3Bank3Writes UADD UADD UADD"
+             underscore_name="gti_l3_writes"
+             units="messages"
+             symbol_name="GtiL3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiRccMemoryWrites"
+             description="The total number of GTI memory writes from Render Color Cache (Render Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_writes"
+             units="messages"
+             symbol_name="GtiRccMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiSoMemoryWrites"
+             description="The total number of GTI memory writes from Stream Output."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_so_memory_writes"
+             units="messages"
+             symbol_name="GtiSoMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Stream Output"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiStcMemoryWrites"
+             description="The total number of GTI memory writes from Stencil Cache."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_writes"
+             units="messages"
+             symbol_name="GtiStcMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GtiRczMemoryWrites"
+             description="The total number of GTI memory writes from Render Depth Cache."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_writes"
+             units="messages"
+             symbol_name="GtiRczMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL 2 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="GtiHizMemoryWrites"
+             description="The total number of GTI memory writes from Hierarchical Depth Cache."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_writes"
+             units="messages"
+             symbol_name="GtiHizMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x198B0343" />
+        <register type="NOA" address="0x00009888" value="0x13845400" />
+        <register type="NOA" address="0x00009888" value="0x3580001A" />
+        <register type="NOA" address="0x00009888" value="0x3D800805" />
+        <register type="NOA" address="0x00009888" value="0x038B6300" />
+        <register type="NOA" address="0x00009888" value="0x058B6B62" />
+        <register type="NOA" address="0x00009888" value="0x078B006A" />
+        <register type="NOA" address="0x00009888" value="0x118B0000" />
+        <register type="NOA" address="0x00009888" value="0x238B0000" />
+        <register type="NOA" address="0x00009888" value="0x258B0000" />
+        <register type="NOA" address="0x00009888" value="0x1F85A080" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x23850002" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x01840010" />
+        <register type="NOA" address="0x00009888" value="0x07844880" />
+        <register type="NOA" address="0x00009888" value="0x09840992" />
+        <register type="NOA" address="0x00009888" value="0x0B840A94" />
+        <register type="NOA" address="0x00009888" value="0x0D840B96" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x03848000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x2D800147" />
+        <register type="NOA" address="0x00009888" value="0x2F8000E5" />
+        <register type="NOA" address="0x00009888" value="0x138080E3" />
+        <register type="NOA" address="0x00009888" value="0x1580C0E1" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x11804000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F800000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800842" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800842" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47801082" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800084" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F822" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extended Gen8"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extended"
+       hw_config_guid="e99ccaca-821c-4df9-97a7-96bdb7204e43"
+       chipset="BDW"
+       symbol_name="ComputeExtended"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Writes 0"
+             description="The subslice 0 typed writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="typed_writes0"
+             units="messages"
+             symbol_name="TypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuTypedAtomics0"
+             description="The subslice 0 EU Typed Atomics."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_typed_atomics0"
+             units="messages"
+             symbol_name="EuTypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Atomics 0"
+             description="The subslice 0 typed atomics."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="typed_atomics0"
+             units="messages"
+             symbol_name="TypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedAtomicsPerCacheLine"
+             description="Ratio of EU typed atomics requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedAtomics0 $TypedAtomics0 FDIV"
+             underscore_name="typed_atomics_per_cache_line"
+             units="eu atomic requests to l3 cache lines"
+             symbol_name="TypedAtomicsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedReads0"
+             description="The subslice 0 EU Untyped Reads."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="eu_untyped_reads0"
+             units="messages"
+             symbol_name="EuUntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Writes 0"
+             description="The subslice 0 untyped writes (including SLM writes)."
+             data_type="uint64"
+             equation="C 1 READ"
+             underscore_name="untyped_writes0"
+             units="messages"
+             symbol_name="UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedAtomics0"
+             description="The subslice 0 EU Untyped Atomics."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="eu_untyped_atomics0"
+             units="messages"
+             symbol_name="EuUntypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedWrites0"
+             description="The subslice 0 EU Untyped Writes."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="eu_untyped_writes0"
+             units="messages"
+             symbol_name="EuUntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedWrites0"
+             description="The subslice 0 EU A64 Untyped Writes."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="eu_a64_untyped_writes0"
+             units="messages"
+             symbol_name="EuA64UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedWritesPerCacheLine"
+             description="Ratio of EU untyped write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuUntypedWrites0 $EuA64UntypedWrites0 UADD $UntypedWrites0 FDIV"
+             underscore_name="untyped_writes_per_cache_line"
+             units="eu requests to l3 cache lines"
+             symbol_name="UntypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedWrites0"
+             description="The subslice 0 EU Typed Writes."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="eu_typed_writes0"
+             units="messages"
+             symbol_name="EuTypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedWritesPerCacheLine"
+             description="Ratio of EU typed write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedWrites0 $TypedWrites0 FDIV"
+             underscore_name="typed_writes_per_cache_line"
+             units="eu requests to l3 cache lines"
+             symbol_name="TypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Reads 0"
+             description="The subslice 0 typed reads."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="typed_reads0"
+             units="messages"
+             symbol_name="TypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Reads 0"
+             description="The subslice 0 untyped reads (including SLM reads)."
+             data_type="uint64"
+             equation="C 3 READ"
+             underscore_name="untyped_reads0"
+             units="messages"
+             symbol_name="UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedReads0"
+             description="The subslice 0 EU A64 Untyped Reads."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="eu_a64_untyped_reads0"
+             units="messages"
+             symbol_name="EuA64UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedReads0"
+             description="The subslice 0 EU Typed Reads."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="eu_typed_reads0"
+             units="messages"
+             symbol_name="EuTypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedReadsPerCacheLine"
+             description="Ratio of EU untyped read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuUntypedReads0 $EuA64UntypedReads0 UADD $UntypedReads0 FDIV"
+             underscore_name="untyped_reads_per_cache_line"
+             units="eu requests to l3 cache lines"
+             symbol_name="UntypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedReadsPerCacheLine"
+             description="Ratio of EU typed read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuTypedReads0 $TypedReads0 FDIV"
+             underscore_name="typed_reads_per_cache_line"
+             units="eu requests to l3 cache lines"
+             symbol_name="TypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL 2 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA"
+                     availability="$SubsliceMask 0x01 AND"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x143D0160" />
+        <register type="NOA" address="0x00009888" value="0x163D2800" />
+        <register type="NOA" address="0x00009888" value="0x183D0120" />
+        <register type="NOA" address="0x00009888" value="0x105800E0" />
+        <register type="NOA" address="0x00009888" value="0x005CC000" />
+        <register type="NOA" address="0x00009888" value="0x065C8000" />
+        <register type="NOA" address="0x00009888" value="0x085CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A5CC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5CC000" />
+        <register type="NOA" address="0x00009888" value="0x0E5CC000" />
+        <register type="NOA" address="0x00009888" value="0x025CC000" />
+        <register type="NOA" address="0x00009888" value="0x045CC000" />
+        <register type="NOA" address="0x00009888" value="0x003D0011" />
+        <register type="NOA" address="0x00009888" value="0x063D0900" />
+        <register type="NOA" address="0x00009888" value="0x083D0A13" />
+        <register type="NOA" address="0x00009888" value="0x0A3D0B15" />
+        <register type="NOA" address="0x00009888" value="0x0C3D2317" />
+        <register type="NOA" address="0x00009888" value="0x043D21B7" />
+        <register type="NOA" address="0x00009888" value="0x103D0000" />
+        <register type="NOA" address="0x00009888" value="0x0E3D0000" />
+        <register type="NOA" address="0x00009888" value="0x1A3D0000" />
+        <register type="NOA" address="0x00009888" value="0x0E5825C1" />
+        <register type="NOA" address="0x00009888" value="0x00586100" />
+        <register type="NOA" address="0x00009888" value="0x0258204C" />
+        <register type="NOA" address="0x00009888" value="0x06588000" />
+        <register type="NOA" address="0x00009888" value="0x0858C000" />
+        <register type="NOA" address="0x00009888" value="0x0A58C000" />
+        <register type="NOA" address="0x00009888" value="0x0C58C000" />
+        <register type="NOA" address="0x00009888" value="0x0458C000" />
+        <register type="NOA" address="0x00009888" value="0x005B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x185B5400" />
+        <register type="NOA" address="0x00009888" value="0x1A5B0155" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x045B4000" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x085B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B4000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAA2A" />
+        <register type="NOA" address="0x00009888" value="0x101F02AA" />
+        <register type="NOA" address="0x00009888" value="0x00384000" />
+        <register type="NOA" address="0x00009888" value="0x0E384000" />
+        <register type="NOA" address="0x00009888" value="0x16384000" />
+        <register type="NOA" address="0x00009888" value="0x18381555" />
+        <register type="NOA" address="0x00009888" value="0x02384000" />
+        <register type="NOA" address="0x00009888" value="0x04384000" />
+        <register type="NOA" address="0x00009888" value="0x06384000" />
+        <register type="NOA" address="0x00009888" value="0x08384000" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x0039A000" />
+        <register type="NOA" address="0x00009888" value="0x06398000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A39A000" />
+        <register type="NOA" address="0x00009888" value="0x0C39A000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x0239A000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x238B2AA0" />
+        <register type="NOA" address="0x00009888" value="0x258B5551" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA2" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800000" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800420" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800421" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SubsliceMask 0x02 AND"
+                     priority="2"
+                     >
+        <register type="NOA" address="0x00009888" value="0x105C00E0" />
+        <register type="NOA" address="0x00009888" value="0x145B0160" />
+        <register type="NOA" address="0x00009888" value="0x165B2800" />
+        <register type="NOA" address="0x00009888" value="0x185B0120" />
+        <register type="NOA" address="0x00009888" value="0x0E5C25C1" />
+        <register type="NOA" address="0x00009888" value="0x005C6100" />
+        <register type="NOA" address="0x00009888" value="0x025C204C" />
+        <register type="NOA" address="0x00009888" value="0x065C8000" />
+        <register type="NOA" address="0x00009888" value="0x085CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A5CC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5CC000" />
+        <register type="NOA" address="0x00009888" value="0x045CC000" />
+        <register type="NOA" address="0x00009888" value="0x005B0011" />
+        <register type="NOA" address="0x00009888" value="0x065B0900" />
+        <register type="NOA" address="0x00009888" value="0x085B0A13" />
+        <register type="NOA" address="0x00009888" value="0x0A5B0B15" />
+        <register type="NOA" address="0x00009888" value="0x0C5B2317" />
+        <register type="NOA" address="0x00009888" value="0x045B21B7" />
+        <register type="NOA" address="0x00009888" value="0x105B0000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B0000" />
+        <register type="NOA" address="0x00009888" value="0x1A5B0000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAA2A" />
+        <register type="NOA" address="0x00009888" value="0x101F02AA" />
+        <register type="NOA" address="0x00009888" value="0x00384000" />
+        <register type="NOA" address="0x00009888" value="0x0E384000" />
+        <register type="NOA" address="0x00009888" value="0x16384000" />
+        <register type="NOA" address="0x00009888" value="0x18381555" />
+        <register type="NOA" address="0x00009888" value="0x02384000" />
+        <register type="NOA" address="0x00009888" value="0x04384000" />
+        <register type="NOA" address="0x00009888" value="0x06384000" />
+        <register type="NOA" address="0x00009888" value="0x08384000" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x0039A000" />
+        <register type="NOA" address="0x00009888" value="0x06398000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A39A000" />
+        <register type="NOA" address="0x00009888" value="0x0C39A000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x0239A000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x238B2AA0" />
+        <register type="NOA" address="0x00009888" value="0x258B5551" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA2" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800000" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800420" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800421" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SubsliceMask 0x04 AND"
+                     priority="4"
+                     >
+        <register type="NOA" address="0x00009888" value="0x103800E0" />
+        <register type="NOA" address="0x00009888" value="0x143A0160" />
+        <register type="NOA" address="0x00009888" value="0x163A2800" />
+        <register type="NOA" address="0x00009888" value="0x183A0120" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAA2A" />
+        <register type="NOA" address="0x00009888" value="0x101F02AA" />
+        <register type="NOA" address="0x00009888" value="0x0E38A5C1" />
+        <register type="NOA" address="0x00009888" value="0x0038A100" />
+        <register type="NOA" address="0x00009888" value="0x0238204C" />
+        <register type="NOA" address="0x00009888" value="0x16388000" />
+        <register type="NOA" address="0x00009888" value="0x183802AA" />
+        <register type="NOA" address="0x00009888" value="0x04380000" />
+        <register type="NOA" address="0x00009888" value="0x06380000" />
+        <register type="NOA" address="0x00009888" value="0x08388000" />
+        <register type="NOA" address="0x00009888" value="0x0A388000" />
+        <register type="NOA" address="0x00009888" value="0x0039A000" />
+        <register type="NOA" address="0x00009888" value="0x06398000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A39A000" />
+        <register type="NOA" address="0x00009888" value="0x0C39A000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x0239A000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x003A0011" />
+        <register type="NOA" address="0x00009888" value="0x063A0900" />
+        <register type="NOA" address="0x00009888" value="0x083A0A13" />
+        <register type="NOA" address="0x00009888" value="0x0A3A0B15" />
+        <register type="NOA" address="0x00009888" value="0x0C3A2317" />
+        <register type="NOA" address="0x00009888" value="0x043A21B7" />
+        <register type="NOA" address="0x00009888" value="0x103A0000" />
+        <register type="NOA" address="0x00009888" value="0x0E3A0000" />
+        <register type="NOA" address="0x00009888" value="0x1A3A0000" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x238B2AA0" />
+        <register type="NOA" address="0x00009888" value="0x258B5551" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA2" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800000" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800420" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800421" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SubsliceMask 0x08 AND"
+                     priority="1"
+                     >
+        <register type="NOA" address="0x00009888" value="0x14BD0160" />
+        <register type="NOA" address="0x00009888" value="0x16BD2800" />
+        <register type="NOA" address="0x00009888" value="0x18BD0120" />
+        <register type="NOA" address="0x00009888" value="0x10D800E0" />
+        <register type="NOA" address="0x00009888" value="0x00DCC000" />
+        <register type="NOA" address="0x00009888" value="0x06DC8000" />
+        <register type="NOA" address="0x00009888" value="0x08DCC000" />
+        <register type="NOA" address="0x00009888" value="0x0ADCC000" />
+        <register type="NOA" address="0x00009888" value="0x0CDCC000" />
+        <register type="NOA" address="0x00009888" value="0x0EDCC000" />
+        <register type="NOA" address="0x00009888" value="0x02DCC000" />
+        <register type="NOA" address="0x00009888" value="0x04DCC000" />
+        <register type="NOA" address="0x00009888" value="0x00BD0011" />
+        <register type="NOA" address="0x00009888" value="0x06BD0900" />
+        <register type="NOA" address="0x00009888" value="0x08BD0A13" />
+        <register type="NOA" address="0x00009888" value="0x0ABD0B15" />
+        <register type="NOA" address="0x00009888" value="0x0CBD2317" />
+        <register type="NOA" address="0x00009888" value="0x04BD21B7" />
+        <register type="NOA" address="0x00009888" value="0x10BD0000" />
+        <register type="NOA" address="0x00009888" value="0x0EBD0000" />
+        <register type="NOA" address="0x00009888" value="0x1ABD0000" />
+        <register type="NOA" address="0x00009888" value="0x0ED825C1" />
+        <register type="NOA" address="0x00009888" value="0x00D86100" />
+        <register type="NOA" address="0x00009888" value="0x02D8204C" />
+        <register type="NOA" address="0x00009888" value="0x06D88000" />
+        <register type="NOA" address="0x00009888" value="0x08D8C000" />
+        <register type="NOA" address="0x00009888" value="0x0AD8C000" />
+        <register type="NOA" address="0x00009888" value="0x0CD8C000" />
+        <register type="NOA" address="0x00009888" value="0x04D8C000" />
+        <register type="NOA" address="0x00009888" value="0x00DB4000" />
+        <register type="NOA" address="0x00009888" value="0x0EDB4000" />
+        <register type="NOA" address="0x00009888" value="0x18DB5400" />
+        <register type="NOA" address="0x00009888" value="0x1ADB0155" />
+        <register type="NOA" address="0x00009888" value="0x02DB4000" />
+        <register type="NOA" address="0x00009888" value="0x04DB4000" />
+        <register type="NOA" address="0x00009888" value="0x06DB4000" />
+        <register type="NOA" address="0x00009888" value="0x08DB4000" />
+        <register type="NOA" address="0x00009888" value="0x0ADB4000" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E9FAA2A" />
+        <register type="NOA" address="0x00009888" value="0x109F02AA" />
+        <register type="NOA" address="0x00009888" value="0x00B84000" />
+        <register type="NOA" address="0x00009888" value="0x0EB84000" />
+        <register type="NOA" address="0x00009888" value="0x16B84000" />
+        <register type="NOA" address="0x00009888" value="0x18B81555" />
+        <register type="NOA" address="0x00009888" value="0x02B84000" />
+        <register type="NOA" address="0x00009888" value="0x04B84000" />
+        <register type="NOA" address="0x00009888" value="0x06B84000" />
+        <register type="NOA" address="0x00009888" value="0x08B84000" />
+        <register type="NOA" address="0x00009888" value="0x0AB84000" />
+        <register type="NOA" address="0x00009888" value="0x00B9A000" />
+        <register type="NOA" address="0x00009888" value="0x06B98000" />
+        <register type="NOA" address="0x00009888" value="0x08B9A000" />
+        <register type="NOA" address="0x00009888" value="0x0AB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0CB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x02B9A000" />
+        <register type="NOA" address="0x00009888" value="0x04B9A000" />
+        <register type="NOA" address="0x00009888" value="0x01888000" />
+        <register type="NOA" address="0x00009888" value="0x0D88F800" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x0B888000" />
+        <register type="NOA" address="0x00009888" value="0x238B5540" />
+        <register type="NOA" address="0x00009888" value="0x258BAAA2" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x018C4000" />
+        <register type="NOA" address="0x00009888" value="0x0F8C4000" />
+        <register type="NOA" address="0x00009888" value="0x178C2000" />
+        <register type="NOA" address="0x00009888" value="0x198C5500" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8C4000" />
+        <register type="NOA" address="0x00009888" value="0x018DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D8000" />
+        <register type="NOA" address="0x00009888" value="0x098DA000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x058DA000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA2" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800000" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800420" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800421" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SubsliceMask 0x10 AND"
+                     priority="3"
+                     >
+        <register type="NOA" address="0x00009888" value="0x10DC00E0" />
+        <register type="NOA" address="0x00009888" value="0x14DB0160" />
+        <register type="NOA" address="0x00009888" value="0x16DB2800" />
+        <register type="NOA" address="0x00009888" value="0x18DB0120" />
+        <register type="NOA" address="0x00009888" value="0x0EDC25C1" />
+        <register type="NOA" address="0x00009888" value="0x00DC6100" />
+        <register type="NOA" address="0x00009888" value="0x02DC204C" />
+        <register type="NOA" address="0x00009888" value="0x06DC8000" />
+        <register type="NOA" address="0x00009888" value="0x08DCC000" />
+        <register type="NOA" address="0x00009888" value="0x0ADCC000" />
+        <register type="NOA" address="0x00009888" value="0x0CDCC000" />
+        <register type="NOA" address="0x00009888" value="0x04DCC000" />
+        <register type="NOA" address="0x00009888" value="0x00DB0011" />
+        <register type="NOA" address="0x00009888" value="0x06DB0900" />
+        <register type="NOA" address="0x00009888" value="0x08DB0A13" />
+        <register type="NOA" address="0x00009888" value="0x0ADB0B15" />
+        <register type="NOA" address="0x00009888" value="0x0CDB2317" />
+        <register type="NOA" address="0x00009888" value="0x04DB21B7" />
+        <register type="NOA" address="0x00009888" value="0x10DB0000" />
+        <register type="NOA" address="0x00009888" value="0x0EDB0000" />
+        <register type="NOA" address="0x00009888" value="0x1ADB0000" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E9FAA2A" />
+        <register type="NOA" address="0x00009888" value="0x109F02AA" />
+        <register type="NOA" address="0x00009888" value="0x00B84000" />
+        <register type="NOA" address="0x00009888" value="0x0EB84000" />
+        <register type="NOA" address="0x00009888" value="0x16B84000" />
+        <register type="NOA" address="0x00009888" value="0x18B81555" />
+        <register type="NOA" address="0x00009888" value="0x02B84000" />
+        <register type="NOA" address="0x00009888" value="0x04B84000" />
+        <register type="NOA" address="0x00009888" value="0x06B84000" />
+        <register type="NOA" address="0x00009888" value="0x08B84000" />
+        <register type="NOA" address="0x00009888" value="0x0AB84000" />
+        <register type="NOA" address="0x00009888" value="0x00B9A000" />
+        <register type="NOA" address="0x00009888" value="0x06B98000" />
+        <register type="NOA" address="0x00009888" value="0x08B9A000" />
+        <register type="NOA" address="0x00009888" value="0x0AB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0CB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x02B9A000" />
+        <register type="NOA" address="0x00009888" value="0x04B9A000" />
+        <register type="NOA" address="0x00009888" value="0x01888000" />
+        <register type="NOA" address="0x00009888" value="0x0D88F800" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x0B888000" />
+        <register type="NOA" address="0x00009888" value="0x238B5540" />
+        <register type="NOA" address="0x00009888" value="0x258BAAA2" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x018C4000" />
+        <register type="NOA" address="0x00009888" value="0x0F8C4000" />
+        <register type="NOA" address="0x00009888" value="0x178C2000" />
+        <register type="NOA" address="0x00009888" value="0x198C5500" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8C4000" />
+        <register type="NOA" address="0x00009888" value="0x018DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D8000" />
+        <register type="NOA" address="0x00009888" value="0x098DA000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x058DA000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA2" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800000" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800420" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800421" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SubsliceMask 0x20 AND"
+                     priority="5"
+                     >
+        <register type="NOA" address="0x00009888" value="0x10B800E0" />
+        <register type="NOA" address="0x00009888" value="0x14BA0160" />
+        <register type="NOA" address="0x00009888" value="0x16BA2800" />
+        <register type="NOA" address="0x00009888" value="0x18BA0120" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E9FAA2A" />
+        <register type="NOA" address="0x00009888" value="0x109F02AA" />
+        <register type="NOA" address="0x00009888" value="0x0EB8A5C1" />
+        <register type="NOA" address="0x00009888" value="0x00B8A100" />
+        <register type="NOA" address="0x00009888" value="0x02B8204C" />
+        <register type="NOA" address="0x00009888" value="0x16B88000" />
+        <register type="NOA" address="0x00009888" value="0x18B802AA" />
+        <register type="NOA" address="0x00009888" value="0x04B80000" />
+        <register type="NOA" address="0x00009888" value="0x06B80000" />
+        <register type="NOA" address="0x00009888" value="0x08B88000" />
+        <register type="NOA" address="0x00009888" value="0x0AB88000" />
+        <register type="NOA" address="0x00009888" value="0x00B9A000" />
+        <register type="NOA" address="0x00009888" value="0x06B98000" />
+        <register type="NOA" address="0x00009888" value="0x08B9A000" />
+        <register type="NOA" address="0x00009888" value="0x0AB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0CB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x02B9A000" />
+        <register type="NOA" address="0x00009888" value="0x04B9A000" />
+        <register type="NOA" address="0x00009888" value="0x00BA0011" />
+        <register type="NOA" address="0x00009888" value="0x06BA0900" />
+        <register type="NOA" address="0x00009888" value="0x08BA0A13" />
+        <register type="NOA" address="0x00009888" value="0x0ABA0B15" />
+        <register type="NOA" address="0x00009888" value="0x0CBA2317" />
+        <register type="NOA" address="0x00009888" value="0x04BA21B7" />
+        <register type="NOA" address="0x00009888" value="0x10BA0000" />
+        <register type="NOA" address="0x00009888" value="0x0EBA0000" />
+        <register type="NOA" address="0x00009888" value="0x1ABA0000" />
+        <register type="NOA" address="0x00009888" value="0x01888000" />
+        <register type="NOA" address="0x00009888" value="0x0D88F800" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x0B888000" />
+        <register type="NOA" address="0x00009888" value="0x238B5540" />
+        <register type="NOA" address="0x00009888" value="0x258BAAA2" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x018C4000" />
+        <register type="NOA" address="0x00009888" value="0x0F8C4000" />
+        <register type="NOA" address="0x00009888" value="0x178C2000" />
+        <register type="NOA" address="0x00009888" value="0x198C5500" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8C4000" />
+        <register type="NOA" address="0x00009888" value="0x018DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D8000" />
+        <register type="NOA" address="0x00009888" value="0x098DA000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x058DA000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA2" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800000" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800420" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800421" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FC2A" />
+        <register type="OA" address="0x00002774" value="0x0000BF00" />
+        <register type="OA" address="0x00002778" value="0x0007FC6A" />
+        <register type="OA" address="0x0000277C" value="0x0000BF00" />
+        <register type="OA" address="0x00002780" value="0x0007FC92" />
+        <register type="OA" address="0x00002784" value="0x0000BF00" />
+        <register type="OA" address="0x00002788" value="0x0007FCA2" />
+        <register type="OA" address="0x0000278C" value="0x0000BF00" />
+        <register type="OA" address="0x00002790" value="0x0007FC32" />
+        <register type="OA" address="0x00002794" value="0x0000BF00" />
+        <register type="OA" address="0x00002798" value="0x0007FC9A" />
+        <register type="OA" address="0x0000279C" value="0x0000BF00" />
+        <register type="OA" address="0x000027A0" value="0x0007FE6A" />
+        <register type="OA" address="0x000027A4" value="0x0000BF00" />
+        <register type="OA" address="0x000027A8" value="0x0007FE7A" />
+        <register type="OA" address="0x000027AC" value="0x0000BF00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics L3 Cache Gen8"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_l3_cache"
+       hw_config_guid="27a364dc-8225-4ecb-b607-d6f1925598d9"
+       chipset="BDW"
+       symbol_name="ComputeL3Cache"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 03 Accesses"
+             description="The total number of accesses to L3 Bank 03."
+             data_type="uint64"
+             equation="B 3 READ 2 UMUL"
+             underscore_name="l3_bank03_accesses"
+             units="messages"
+             symbol_name="L3Bank03Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Accesses"
+             description="The total number of L3 accesses from all entities."
+             data_type="uint64"
+             equation="C 0 READ C 1 READ B 2 READ B 3 READ C 2 READ C 3 READ B 6 READ B 7 READ UADD UADD UADD UADD UADD UADD UADD 2 UMUL"
+             underscore_name="l3_accesses"
+             units="messages"
+             symbol_name="L3Accesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Total Throughput"
+             description="The total number of GPU memory bytes transferred via L3."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Accesses 64 UMUL"
+             underscore_name="l3_total_throughput"
+             units="bytes"
+             symbol_name="L3TotalThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OCL OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OCL OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="EU FPU0 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu0_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ C 5 READ UADD"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="L3 Bank 00 Accesses"
+             description="The total number of accesses to L3 Bank 00."
+             data_type="uint64"
+             equation="C 0 READ 2 UMUL"
+             underscore_name="l3_bank00_accesses"
+             units="messages"
+             symbol_name="L3Bank00Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU0 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 19 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu0_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="L3 Bank 10 Accesses"
+             description="The total number of accesses to L3 Bank 10."
+             data_type="uint64"
+             equation="C 2 READ 2 UMUL"
+             underscore_name="l3_bank10_accesses"
+             units="messages"
+             symbol_name="L3Bank10Accesses"
+             availability="$SliceMask 0x02 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU1 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 14 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu1_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 10 IC Accesses"
+             description="The total number of accesses to L3 Bank 10 from IC cache."
+             data_type="uint64"
+             equation="B 4 READ B 5 READ UADD 2 UMUL $L3Bank10Accesses UMIN"
+             underscore_name="l3_bank10_ic_accesses"
+             units="messages"
+             symbol_name="L3Bank10IcAccesses"
+             availability="$SliceMask 0x02 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 10 IC Hits"
+             description="The total number of hits in L3 Bank 10 from IC cache."
+             data_type="uint64"
+             equation="B 5 READ 2 UMUL $L3Bank10IcAccesses UMIN"
+             underscore_name="l3_bank10_ic_hits"
+             units="messages"
+             symbol_name="L3Bank10IcHits"
+             availability="$SliceMask 0x02 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ C 5 READ UADD 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 00 IC Accesses"
+             description="The total number of accesses to L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 0 READ B 1 READ UADD 2 UMUL $L3Bank00Accesses UMIN"
+             underscore_name="l3_bank00_ic_accesses"
+             units="messages"
+             symbol_name="L3Bank00IcAccesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 00 IC Hits"
+             description="The total number of hits in L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 1 READ 2 UMUL $L3Bank00IcAccesses UMIN"
+             underscore_name="l3_bank00_ic_hits"
+             units="messages"
+             symbol_name="L3Bank00IcHits"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 01 Accesses"
+             description="The total number of accesses to L3 Bank 01."
+             data_type="uint64"
+             equation="C 1 READ 2 UMUL"
+             underscore_name="l3_bank01_accesses"
+             units="messages"
+             symbol_name="L3Bank01Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 20 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu1_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="L3 Bank 11 Accesses"
+             description="The total number of accesses to L3 Bank 11."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="l3_bank11_accesses"
+             units="messages"
+             symbol_name="L3Bank11Accesses"
+             availability="$SliceMask 0x02 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Bank 02 Accesses"
+             description="The total number of accesses to L3 Bank 02."
+             data_type="uint64"
+             equation="B 2 READ 2 UMUL"
+             underscore_name="l3_bank02_accesses"
+             units="messages"
+             symbol_name="L3Bank02Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Bank 13 Accesses"
+             description="The total number of accesses to L3 Bank 13."
+             data_type="uint64"
+             equation="B 7 READ 2 UMUL"
+             underscore_name="l3_bank13_accesses"
+             units="messages"
+             symbol_name="L3Bank13Accesses"
+             availability="$SliceMask 0x02 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL 2 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="L3 Bank 12 Accesses"
+             description="The total number of accesses to L3 Bank 12."
+             data_type="uint64"
+             equation="B 6 READ 2 UMUL"
+             underscore_name="l3_bank12_accesses"
+             units="messages"
+             symbol_name="L3Bank12Accesses"
+             availability="$SliceMask 0x02 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU1 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x143F00B3" />
+        <register type="NOA" address="0x00009888" value="0x14BF00B3" />
+        <register type="NOA" address="0x00009888" value="0x138303C0" />
+        <register type="NOA" address="0x00009888" value="0x3B800060" />
+        <register type="NOA" address="0x00009888" value="0x3D800805" />
+        <register type="NOA" address="0x00009888" value="0x003F0029" />
+        <register type="NOA" address="0x00009888" value="0x063F1400" />
+        <register type="NOA" address="0x00009888" value="0x083F1225" />
+        <register type="NOA" address="0x00009888" value="0x0E3F1327" />
+        <register type="NOA" address="0x00009888" value="0x103F0000" />
+        <register type="NOA" address="0x00009888" value="0x005A4000" />
+        <register type="NOA" address="0x00009888" value="0x065A8000" />
+        <register type="NOA" address="0x00009888" value="0x085AC000" />
+        <register type="NOA" address="0x00009888" value="0x0E5AC000" />
+        <register type="NOA" address="0x00009888" value="0x001D4000" />
+        <register type="NOA" address="0x00009888" value="0x061D8000" />
+        <register type="NOA" address="0x00009888" value="0x081DC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1DC000" />
+        <register type="NOA" address="0x00009888" value="0x0C1F0800" />
+        <register type="NOA" address="0x00009888" value="0x0E1F2A00" />
+        <register type="NOA" address="0x00009888" value="0x101F0280" />
+        <register type="NOA" address="0x00009888" value="0x00391000" />
+        <register type="NOA" address="0x00009888" value="0x06394000" />
+        <register type="NOA" address="0x00009888" value="0x08395000" />
+        <register type="NOA" address="0x00009888" value="0x0E395000" />
+        <register type="NOA" address="0x00009888" value="0x0ABF1429" />
+        <register type="NOA" address="0x00009888" value="0x0CBF1225" />
+        <register type="NOA" address="0x00009888" value="0x00BF1380" />
+        <register type="NOA" address="0x00009888" value="0x02BF0026" />
+        <register type="NOA" address="0x00009888" value="0x10BF0000" />
+        <register type="NOA" address="0x00009888" value="0x0ADAC000" />
+        <register type="NOA" address="0x00009888" value="0x0CDAC000" />
+        <register type="NOA" address="0x00009888" value="0x00DA8000" />
+        <register type="NOA" address="0x00009888" value="0x02DA4000" />
+        <register type="NOA" address="0x00009888" value="0x0A9DC000" />
+        <register type="NOA" address="0x00009888" value="0x0C9DC000" />
+        <register type="NOA" address="0x00009888" value="0x009D8000" />
+        <register type="NOA" address="0x00009888" value="0x029D4000" />
+        <register type="NOA" address="0x00009888" value="0x0E9F8000" />
+        <register type="NOA" address="0x00009888" value="0x109F002A" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA000" />
+        <register type="NOA" address="0x00009888" value="0x0AB95000" />
+        <register type="NOA" address="0x00009888" value="0x0CB95000" />
+        <register type="NOA" address="0x00009888" value="0x00B94000" />
+        <register type="NOA" address="0x00009888" value="0x02B91000" />
+        <register type="NOA" address="0x00009888" value="0x0D88C000" />
+        <register type="NOA" address="0x00009888" value="0x0F880003" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A8020" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x238B0520" />
+        <register type="NOA" address="0x00009888" value="0x258BA950" />
+        <register type="NOA" address="0x00009888" value="0x278B0016" />
+        <register type="NOA" address="0x00009888" value="0x198C5400" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0001" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x018D8000" />
+        <register type="NOA" address="0x00009888" value="0x038D2000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA0" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x03835180" />
+        <register type="NOA" address="0x00009888" value="0x05834022" />
+        <register type="NOA" address="0x00009888" value="0x11830000" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x07830000" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x05844000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C137" />
+        <register type="NOA" address="0x00009888" value="0x1D80C147" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x15804000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D801000" />
+        <register type="NOA" address="0x00009888" value="0x4F800111" />
+        <register type="NOA" address="0x00009888" value="0x43800842" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800840" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800800" />
+        <register type="NOA" address="0x00009888" value="0x418014A2" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFFA" />
+        <register type="OA" address="0x00002774" value="0x0000FEFE" />
+        <register type="OA" address="0x00002778" value="0x0007FFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000FEFD" />
+        <register type="OA" address="0x00002790" value="0x0007FFFA" />
+        <register type="OA" address="0x00002794" value="0x0000FBEF" />
+        <register type="OA" address="0x00002798" value="0x0007FFFA" />
+        <register type="OA" address="0x0000279C" value="0x0000FBDF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00101100" />
+        <register type="FLEX" address="0x0000E45C" value="0x00201200" />
+        <register type="FLEX" address="0x0000E55C" value="0x00301300" />
+        <register type="FLEX" address="0x0000E65C" value="0x00401400" />
+    </register_config>
+  </set>
+
+  <set name="Data Port Reads Coalescing Gen8"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="data_port_reads_coalescing"
+       hw_config_guid="857fc630-2f09-4804-85f1-084adfadd5ab"
+       chipset="BDW"
+       symbol_name="DataPortReadsCoalescing"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU to Data Port 0 Reads 64"
+             description="The subslice 0 EU data reads from Data Port with 64B per message."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_hdc0_reads64_b"
+             units="messages"
+             symbol_name="EuHdc0Reads64B"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Data Port 0 to L3 Data Reads"
+             description="The subslice 0 Data Port data and constant reads from L3 cache."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="hdc0_l3_data_reads"
+             units="messages"
+             symbol_name="Hdc0L3DataReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Data Port 0 to L3 Data Writes"
+             description="The subslice 0 Data Port data writes to L3 cache."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="hdc0_l3_data_writes"
+             units="messages"
+             symbol_name="Hdc0L3DataWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU to Data Port 0 Reads 128"
+             description="The subslice 0 EU data reads from Data Port with 128B per message."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="eu_hdc0_reads128_b"
+             units="messages"
+             symbol_name="EuHdc0Reads128B"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="All Data Port 0 Writes to L3"
+             description="The subslice 0 Data Port writes to L3 cache."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="hdc0_l3_writes"
+             units="messages"
+             symbol_name="Hdc0L3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU to Data Port 0 Reads 32"
+             description="The subslice 0 EU data reads from Data Port with 32B per message."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="eu_hdc0_reads32_b"
+             units="messages"
+             symbol_name="EuHdc0Reads32B"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU to Data Port 0 Reads 256"
+             description="The subslice 0 EU data reads from Data Port with 256B per message."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="eu_hdc0_reads256_b"
+             units="messages"
+             symbol_name="EuHdc0Reads256B"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuBytesReadPerCacheLine"
+             description="Average EU bytes read per L3 cache line."
+             data_type="float"
+             equation="$EuHdc0Reads32B 32 UMUL $EuHdc0Reads64B 64 UMUL $EuHdc0Reads128B 128 UMUL $EuHdc0Reads256B 256 UMUL UADD UADD UADD $Hdc0L3DataReads FDIV"
+             underscore_name="eu_bytes_read_per_cache_line"
+             units="eu bytes per l3 cache line"
+             symbol_name="EuBytesReadPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuDataReadsPerCacheLine"
+             description="Coalescing ratio of EU read requests to L3 cache lines."
+             data_type="float"
+             equation="$EuBytesReadPerCacheLine 64 FDIV"
+             underscore_name="eu_data_reads_per_cache_line"
+             units="utilization"
+             symbol_name="EuDataReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches without URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL 2 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="All Data Port 0 Reads from L3"
+             description="The subslice 0 Data Port reads from L3 cache."
+             data_type="uint64"
+             equation="C 3 READ C 2 READ USUB"
+             underscore_name="hdc0_l3_reads"
+             units="messages"
+             symbol_name="Hdc0L3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA"
+                     availability="$SubsliceMask 0x01 AND"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x103D0005" />
+        <register type="NOA" address="0x00009888" value="0x163D240B" />
+        <register type="NOA" address="0x00009888" value="0x1058022F" />
+        <register type="NOA" address="0x00009888" value="0x185B5520" />
+        <register type="NOA" address="0x00009888" value="0x198B0003" />
+        <register type="NOA" address="0x00009888" value="0x005CC000" />
+        <register type="NOA" address="0x00009888" value="0x065CC000" />
+        <register type="NOA" address="0x00009888" value="0x085CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A5CC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5CC000" />
+        <register type="NOA" address="0x00009888" value="0x0E5CC000" />
+        <register type="NOA" address="0x00009888" value="0x025C4000" />
+        <register type="NOA" address="0x00009888" value="0x045C8000" />
+        <register type="NOA" address="0x00009888" value="0x003D0000" />
+        <register type="NOA" address="0x00009888" value="0x063D00B0" />
+        <register type="NOA" address="0x00009888" value="0x083D0182" />
+        <register type="NOA" address="0x00009888" value="0x0A3D10A0" />
+        <register type="NOA" address="0x00009888" value="0x0C3D11A2" />
+        <register type="NOA" address="0x00009888" value="0x0E3D0000" />
+        <register type="NOA" address="0x00009888" value="0x183D0000" />
+        <register type="NOA" address="0x00009888" value="0x1A3D0000" />
+        <register type="NOA" address="0x00009888" value="0x0E582242" />
+        <register type="NOA" address="0x00009888" value="0x00586700" />
+        <register type="NOA" address="0x00009888" value="0x0258004F" />
+        <register type="NOA" address="0x00009888" value="0x0658C000" />
+        <register type="NOA" address="0x00009888" value="0x0858C000" />
+        <register type="NOA" address="0x00009888" value="0x0A58C000" />
+        <register type="NOA" address="0x00009888" value="0x0C58C000" />
+        <register type="NOA" address="0x00009888" value="0x045B6300" />
+        <register type="NOA" address="0x00009888" value="0x105B0000" />
+        <register type="NOA" address="0x00009888" value="0x005B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5B0155" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B0000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B4000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAAA0" />
+        <register type="NOA" address="0x00009888" value="0x101F02AA" />
+        <register type="NOA" address="0x00009888" value="0x00384000" />
+        <register type="NOA" address="0x00009888" value="0x0E384000" />
+        <register type="NOA" address="0x00009888" value="0x16384000" />
+        <register type="NOA" address="0x00009888" value="0x18381555" />
+        <register type="NOA" address="0x00009888" value="0x02384000" />
+        <register type="NOA" address="0x00009888" value="0x04384000" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x0C384000" />
+        <register type="NOA" address="0x00009888" value="0x0039A000" />
+        <register type="NOA" address="0x00009888" value="0x0639A000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A39A000" />
+        <register type="NOA" address="0x00009888" value="0x0C39A000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x02392000" />
+        <register type="NOA" address="0x00009888" value="0x04398000" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x038B6300" />
+        <register type="NOA" address="0x00009888" value="0x058B0062" />
+        <register type="NOA" address="0x00009888" value="0x118B0000" />
+        <register type="NOA" address="0x00009888" value="0x238B02A0" />
+        <register type="NOA" address="0x00009888" value="0x258B5555" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x0784C000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D801000" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800001" />
+        <register type="NOA" address="0x00009888" value="0x43800000" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800420" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800421" />
+        <register type="NOA" address="0x00009888" value="0x41800041" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0xBA98BA98" />
+        <register type="OA" address="0x00002748" value="0xBA98BA98" />
+        <register type="OA" address="0x00002744" value="0x00003377" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFF2" />
+        <register type="OA" address="0x00002774" value="0x00007FF0" />
+        <register type="OA" address="0x00002778" value="0x0007FFE2" />
+        <register type="OA" address="0x0000277C" value="0x00007FF0" />
+        <register type="OA" address="0x00002780" value="0x0007FFC2" />
+        <register type="OA" address="0x00002784" value="0x00007FF0" />
+        <register type="OA" address="0x00002788" value="0x0007FF82" />
+        <register type="OA" address="0x0000278C" value="0x00007FF0" />
+        <register type="OA" address="0x00002790" value="0x0007FFFA" />
+        <register type="OA" address="0x00002794" value="0x0000BFEF" />
+        <register type="OA" address="0x00002798" value="0x0007FFFA" />
+        <register type="OA" address="0x0000279C" value="0x0000BFDF" />
+        <register type="OA" address="0x000027A0" value="0x0007FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000BFBF" />
+        <register type="OA" address="0x000027A8" value="0x0007FFFA" />
+        <register type="OA" address="0x000027AC" value="0x0000BF7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Data Port Writes Coalescing Gen8"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="data_port_writes_coalescing"
+       hw_config_guid="343ebc99-4a55-414c-8c17-d8e259cf5e20"
+       chipset="BDW"
+       symbol_name="DataPortWritesCoalescing"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU to Data Port 0 Writes 192"
+             description="The subslice 0 EU data simd16 writes to Data Port with 192B per message."
+             data_type="uint64"
+             equation="B 6 READ 2 UDIV"
+             underscore_name="eu_hdc0_writes192_b"
+             units="messages"
+             symbol_name="EuHdc0Writes192B"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="EU to Data Port 0 Writes 32B"
+             description="The subslice 0 EU data writes to Data Port with 32B per message."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="eu_hdc0_writes32_b"
+             units="messages"
+             symbol_name="EuHdc0Writes32B"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU to Data Port 0 Writes 256B"
+             description="The subslice 0 EU data simd16 writes to Data Port with 256B per message."
+             data_type="uint64"
+             equation="B 7 READ 2 UDIV"
+             underscore_name="eu_hdc0_writes256_b_simd16"
+             units="messages"
+             symbol_name="EuHdc0Writes256BSimd16"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Data Port 0 to L3 Data Reads"
+             description="The subslice 0 Data Port data and constant reads from L3 cache."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="hdc0_l3_data_reads"
+             units="messages"
+             symbol_name="Hdc0L3DataReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Data Port 0 to L3 Data Writes"
+             description="The subslice 0 Data Port data writes to L3 cache."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="hdc0_l3_data_writes"
+             units="messages"
+             symbol_name="Hdc0L3DataWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU to Data Port 0 Writes 128B simd16"
+             description="The subslice 0 EU data simd16 writes to Data Port with 128B per message."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_hdc0_writes128_b_simd16"
+             units="messages"
+             symbol_name="EuHdc0Writes128BSimd16"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU to Data Port 0 Writes 96B"
+             description="The subslice 0 EU data writes to Data Port with 96B per message."
+             data_type="uint64"
+             equation="B 2 READ 2 UDIV"
+             underscore_name="eu_hdc0_writes96_b"
+             units="messages"
+             symbol_name="EuHdc0Writes96B"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="All Data Port 0 Writes to L3"
+             description="The subslice 0 Data Port writes to L3 cache."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="hdc0_l3_writes"
+             units="messages"
+             symbol_name="Hdc0L3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU to Data Port 0 Writes 64B"
+             description="The subslice 0 EU data writes to Data Port with 64B per message."
+             data_type="uint64"
+             equation="B 1 READ B 4 READ UADD"
+             underscore_name="eu_hdc0_writes64_b"
+             units="messages"
+             symbol_name="EuHdc0Writes64B"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU to Data Port 0 Writes 128B"
+             description="The subslice 0 EU data writes to Data Port with 128B per message."
+             data_type="uint64"
+             equation="B 3 READ 2 UDIV"
+             underscore_name="eu_hdc0_writes128_b"
+             units="messages"
+             symbol_name="EuHdc0Writes128B"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuBytesWrittenPerCacheLine"
+             description="Average EU bytes written per L3 cache line."
+             data_type="float"
+             equation="$EuHdc0Writes32B 32 UMUL $EuHdc0Writes64B 64 UMUL $EuHdc0Writes96B 96 UMUL $EuHdc0Writes128B 128 UMUL $EuHdc0Writes128BSimd16 128 UMUL $EuHdc0Writes256BSimd16 256 UMUL $EuHdc0Writes192B 192 UMUL UADD UADD UADD UADD UADD UADD $Hdc0L3DataWrites FDIV"
+             underscore_name="eu_bytes_written_per_cache_line"
+             units="eu bytes per l3 cache line"
+             symbol_name="EuBytesWrittenPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuDataWritesPerCacheLine"
+             description="Coalescing ratio of EU write requests to L3 cache lines."
+             data_type="float"
+             equation="$EuBytesWrittenPerCacheLine 64 FDIV"
+             underscore_name="eu_data_writes_per_cache_line"
+             units="utilization"
+             symbol_name="EuDataWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL 2 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="All Data Port 0 Reads from L3"
+             description="The subslice 0 Data Port reads from L3 cache."
+             data_type="uint64"
+             equation="C 3 READ C 2 READ USUB"
+             underscore_name="hdc0_l3_reads"
+             units="messages"
+             symbol_name="Hdc0L3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA"
+                     availability="$SubsliceMask 0x01 AND"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x103D0005" />
+        <register type="NOA" address="0x00009888" value="0x143D0120" />
+        <register type="NOA" address="0x00009888" value="0x163D2400" />
+        <register type="NOA" address="0x00009888" value="0x1058022F" />
+        <register type="NOA" address="0x00009888" value="0x105B0000" />
+        <register type="NOA" address="0x00009888" value="0x198B0003" />
+        <register type="NOA" address="0x00009888" value="0x005CC000" />
+        <register type="NOA" address="0x00009888" value="0x065CC000" />
+        <register type="NOA" address="0x00009888" value="0x085CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A5CC000" />
+        <register type="NOA" address="0x00009888" value="0x0E5CC000" />
+        <register type="NOA" address="0x00009888" value="0x025C4000" />
+        <register type="NOA" address="0x00009888" value="0x045C8000" />
+        <register type="NOA" address="0x00009888" value="0x003D0000" />
+        <register type="NOA" address="0x00009888" value="0x063D0094" />
+        <register type="NOA" address="0x00009888" value="0x083D0182" />
+        <register type="NOA" address="0x00009888" value="0x0A3D1814" />
+        <register type="NOA" address="0x00009888" value="0x0E3D0000" />
+        <register type="NOA" address="0x00009888" value="0x183D0000" />
+        <register type="NOA" address="0x00009888" value="0x1A3D0000" />
+        <register type="NOA" address="0x00009888" value="0x0C3D0000" />
+        <register type="NOA" address="0x00009888" value="0x0E582242" />
+        <register type="NOA" address="0x00009888" value="0x00586700" />
+        <register type="NOA" address="0x00009888" value="0x0258004F" />
+        <register type="NOA" address="0x00009888" value="0x0658C000" />
+        <register type="NOA" address="0x00009888" value="0x0858C000" />
+        <register type="NOA" address="0x00009888" value="0x0A58C000" />
+        <register type="NOA" address="0x00009888" value="0x045B6A80" />
+        <register type="NOA" address="0x00009888" value="0x005B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x185B5400" />
+        <register type="NOA" address="0x00009888" value="0x1A5B0141" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B0000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B4000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAAA0" />
+        <register type="NOA" address="0x00009888" value="0x101F0282" />
+        <register type="NOA" address="0x00009888" value="0x00384000" />
+        <register type="NOA" address="0x00009888" value="0x0E384000" />
+        <register type="NOA" address="0x00009888" value="0x16384000" />
+        <register type="NOA" address="0x00009888" value="0x18381415" />
+        <register type="NOA" address="0x00009888" value="0x02384000" />
+        <register type="NOA" address="0x00009888" value="0x04384000" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x0C384000" />
+        <register type="NOA" address="0x00009888" value="0x0039A000" />
+        <register type="NOA" address="0x00009888" value="0x0639A000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A39A000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x02392000" />
+        <register type="NOA" address="0x00009888" value="0x04398000" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A82A0" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x038B6300" />
+        <register type="NOA" address="0x00009888" value="0x058B0062" />
+        <register type="NOA" address="0x00009888" value="0x118B0000" />
+        <register type="NOA" address="0x00009888" value="0x238B02A0" />
+        <register type="NOA" address="0x00009888" value="0x258B1555" />
+        <register type="NOA" address="0x00009888" value="0x278B0014" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x21852AAA" />
+        <register type="NOA" address="0x00009888" value="0x23850028" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830141" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x0784C000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00000D24" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x4D801000" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800001" />
+        <register type="NOA" address="0x00009888" value="0x43800000" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800420" />
+        <register type="NOA" address="0x00009888" value="0x3F800421" />
+        <register type="NOA" address="0x00009888" value="0x41800041" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0xBA98BA98" />
+        <register type="OA" address="0x00002748" value="0xBA98BA98" />
+        <register type="OA" address="0x00002744" value="0x00003377" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FF72" />
+        <register type="OA" address="0x00002774" value="0x0000BFD0" />
+        <register type="OA" address="0x00002778" value="0x0007FF62" />
+        <register type="OA" address="0x0000277C" value="0x0000BFD0" />
+        <register type="OA" address="0x00002780" value="0x0007FF42" />
+        <register type="OA" address="0x00002784" value="0x0000BFD0" />
+        <register type="OA" address="0x00002788" value="0x0007FF02" />
+        <register type="OA" address="0x0000278C" value="0x0000BFD0" />
+        <register type="OA" address="0x00002790" value="0x0005FFF2" />
+        <register type="OA" address="0x00002794" value="0x0000BFD0" />
+        <register type="OA" address="0x00002798" value="0x0005FFE2" />
+        <register type="OA" address="0x0000279C" value="0x0000BFD0" />
+        <register type="OA" address="0x000027A0" value="0x0005FFC2" />
+        <register type="OA" address="0x000027A4" value="0x0000BFD0" />
+        <register type="OA" address="0x000027A8" value="0x0005FF82" />
+        <register type="OA" address="0x000027AC" value="0x0000BFD0" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Metric set HDCAndSF"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="hdc_and_sf"
+       hw_config_guid="7bdafd88-a4fa-4ed5-bc09-1a977aa5be3e"
+       chipset="BDW"
+       symbol_name="HDCAndSF"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were actively processed on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Polygon Data Ready"
+             description="The percentage of time in which geometry pipeline output is ready."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="poly_data_ready"
+             units="percent"
+             symbol_name="PolyDataReady"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s1.ss2)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s1.ss2)"
+             data_type="float"
+             max_equation="100"
+             equation="C 1 READ C 0 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader12_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader12AccessStalledOnL3"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="HDC stalled by L3 (s0.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ C 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader01_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader01AccessStalledOnL3"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss2)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss2)"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ C 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader02_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader02AccessStalledOnL3"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HDC stalled by L3 (s1.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s1.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ B 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader10_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader10AccessStalledOnL3"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="HDC stalled by L3 (s1.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s1.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="B 7 READ B 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader11_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader11AccessStalledOnL3"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ C 2 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader00_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader00AccessStalledOnL3"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x105C0232" />
+        <register type="NOA" address="0x00009888" value="0x10580232" />
+        <register type="NOA" address="0x00009888" value="0x10380232" />
+        <register type="NOA" address="0x00009888" value="0x10DC0232" />
+        <register type="NOA" address="0x00009888" value="0x10D80232" />
+        <register type="NOA" address="0x00009888" value="0x10B80232" />
+        <register type="NOA" address="0x00009888" value="0x118E4400" />
+        <register type="NOA" address="0x00009888" value="0x025C6080" />
+        <register type="NOA" address="0x00009888" value="0x045C004B" />
+        <register type="NOA" address="0x00009888" value="0x005C8000" />
+        <register type="NOA" address="0x00009888" value="0x00582080" />
+        <register type="NOA" address="0x00009888" value="0x0258004B" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x045B4000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F00AA" />
+        <register type="NOA" address="0x00009888" value="0x04386080" />
+        <register type="NOA" address="0x00009888" value="0x0638404B" />
+        <register type="NOA" address="0x00009888" value="0x02384000" />
+        <register type="NOA" address="0x00009888" value="0x08384000" />
+        <register type="NOA" address="0x00009888" value="0x0A380000" />
+        <register type="NOA" address="0x00009888" value="0x0C380000" />
+        <register type="NOA" address="0x00009888" value="0x00398000" />
+        <register type="NOA" address="0x00009888" value="0x0239A000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x0CDC25C1" />
+        <register type="NOA" address="0x00009888" value="0x0ADCC000" />
+        <register type="NOA" address="0x00009888" value="0x0AD825C1" />
+        <register type="NOA" address="0x00009888" value="0x18DB4000" />
+        <register type="NOA" address="0x00009888" value="0x1ADB0001" />
+        <register type="NOA" address="0x00009888" value="0x0E9F8000" />
+        <register type="NOA" address="0x00009888" value="0x109F02AA" />
+        <register type="NOA" address="0x00009888" value="0x0EB825C1" />
+        <register type="NOA" address="0x00009888" value="0x18B80154" />
+        <register type="NOA" address="0x00009888" value="0x0AB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0CB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0D88C000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258BAA05" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x198C5400" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x098DC000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x098E05C0" />
+        <register type="NOA" address="0x00009888" value="0x058E0000" />
+        <register type="NOA" address="0x00009888" value="0x198F0020" />
+        <register type="NOA" address="0x00009888" value="0x2185AA0A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x19835000" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x09848000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x19808000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x51800040" />
+        <register type="NOA" address="0x00009888" value="0x43800400" />
+        <register type="NOA" address="0x00009888" value="0x45800800" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800C62" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F801042" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x418014A4" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x10800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FFF7" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_1"
+       hw_config_guid="9385ebb2-f34f-4aa5-aec5-7e9cbbea0f0b"
+       chipset="BDW"
+       symbol_name="L3_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 L3 Bank1 Stalled"
+             description="The percentage of time in which slice1 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank1_stalled"
+             units="percent"
+             symbol_name="L31Bank1Stalled"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice1 L3 Bank0 Stalled"
+             description="The percentage of time in which slice1 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank0_stalled"
+             units="percent"
+             symbol_name="L31Bank0Stalled"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Slice1 L3 Bank1 Active"
+             description="The percentage of time in which slice1 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank1_active"
+             units="percent"
+             symbol_name="L31Bank1Active"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 L3 Bank0 Active"
+             description="The percentage of time in which slice1 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank0_active"
+             units="percent"
+             symbol_name="L31Bank0Active"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x10BF03DA" />
+        <register type="NOA" address="0x00009888" value="0x14BF0001" />
+        <register type="NOA" address="0x00009888" value="0x12980340" />
+        <register type="NOA" address="0x00009888" value="0x12990340" />
+        <register type="NOA" address="0x00009888" value="0x0CBF1187" />
+        <register type="NOA" address="0x00009888" value="0x0EBF1205" />
+        <register type="NOA" address="0x00009888" value="0x00BF0500" />
+        <register type="NOA" address="0x00009888" value="0x02BF042B" />
+        <register type="NOA" address="0x00009888" value="0x04BF002C" />
+        <register type="NOA" address="0x00009888" value="0x0CDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0EDAC000" />
+        <register type="NOA" address="0x00009888" value="0x00DA8000" />
+        <register type="NOA" address="0x00009888" value="0x02DAC000" />
+        <register type="NOA" address="0x00009888" value="0x04DA4000" />
+        <register type="NOA" address="0x00009888" value="0x04983400" />
+        <register type="NOA" address="0x00009888" value="0x10980000" />
+        <register type="NOA" address="0x00009888" value="0x06990034" />
+        <register type="NOA" address="0x00009888" value="0x10990000" />
+        <register type="NOA" address="0x00009888" value="0x0C9DC000" />
+        <register type="NOA" address="0x00009888" value="0x0E9DC000" />
+        <register type="NOA" address="0x00009888" value="0x009D8000" />
+        <register type="NOA" address="0x00009888" value="0x029DC000" />
+        <register type="NOA" address="0x00009888" value="0x049D4000" />
+        <register type="NOA" address="0x00009888" value="0x109F02A8" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E9F00BA" />
+        <register type="NOA" address="0x00009888" value="0x0CB88000" />
+        <register type="NOA" address="0x00009888" value="0x0CB95000" />
+        <register type="NOA" address="0x00009888" value="0x0EB95000" />
+        <register type="NOA" address="0x00009888" value="0x00B94000" />
+        <register type="NOA" address="0x00009888" value="0x02B95000" />
+        <register type="NOA" address="0x00009888" value="0x04B91000" />
+        <register type="NOA" address="0x00009888" value="0x06B92000" />
+        <register type="NOA" address="0x00009888" value="0x0CBA4000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x0B888000" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x258B800A" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B5500" />
+        <register type="NOA" address="0x00009888" value="0x198C4000" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x018D8000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x058DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2185800A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x1B830154" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x47800000" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800060" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_2"
+       hw_config_guid="446ae59b-ff2e-41c9-b49e-0184a54bf00a"
+       chipset="BDW"
+       symbol_name="L3_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank1 Active"
+             description="The percentage of time in which slice0 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_active"
+             units="percent"
+             symbol_name="L30Bank1Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 L3 Bank1 Stalled"
+             description="The percentage of time in which slice0 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_stalled"
+             units="percent"
+             symbol_name="L30Bank1Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Active"
+             description="The percentage of time in which slice0 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_active"
+             units="percent"
+             symbol_name="L30Bank0Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Stalled"
+             description="The percentage of time in which slice0 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_stalled"
+             units="percent"
+             symbol_name="L30Bank0Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x103F03DA" />
+        <register type="NOA" address="0x00009888" value="0x143F0001" />
+        <register type="NOA" address="0x00009888" value="0x12180340" />
+        <register type="NOA" address="0x00009888" value="0x12190340" />
+        <register type="NOA" address="0x00009888" value="0x0C3F1187" />
+        <register type="NOA" address="0x00009888" value="0x0E3F1205" />
+        <register type="NOA" address="0x00009888" value="0x003F0500" />
+        <register type="NOA" address="0x00009888" value="0x023F042B" />
+        <register type="NOA" address="0x00009888" value="0x043F002C" />
+        <register type="NOA" address="0x00009888" value="0x0C5AC000" />
+        <register type="NOA" address="0x00009888" value="0x0E5AC000" />
+        <register type="NOA" address="0x00009888" value="0x005A8000" />
+        <register type="NOA" address="0x00009888" value="0x025AC000" />
+        <register type="NOA" address="0x00009888" value="0x045A4000" />
+        <register type="NOA" address="0x00009888" value="0x04183400" />
+        <register type="NOA" address="0x00009888" value="0x10180000" />
+        <register type="NOA" address="0x00009888" value="0x06190034" />
+        <register type="NOA" address="0x00009888" value="0x10190000" />
+        <register type="NOA" address="0x00009888" value="0x0C1DC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1DC000" />
+        <register type="NOA" address="0x00009888" value="0x001D8000" />
+        <register type="NOA" address="0x00009888" value="0x021DC000" />
+        <register type="NOA" address="0x00009888" value="0x041D4000" />
+        <register type="NOA" address="0x00009888" value="0x101F02A8" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F00BA" />
+        <register type="NOA" address="0x00009888" value="0x0C388000" />
+        <register type="NOA" address="0x00009888" value="0x0C395000" />
+        <register type="NOA" address="0x00009888" value="0x0E395000" />
+        <register type="NOA" address="0x00009888" value="0x00394000" />
+        <register type="NOA" address="0x00009888" value="0x02395000" />
+        <register type="NOA" address="0x00009888" value="0x04391000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x0C3A4000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AA800" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258B4005" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x2185800A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x1B830154" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x47800000" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800060" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_3"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_3"
+       hw_config_guid="84a7956f-1ea4-4d0d-837f-e39a0376e38c"
+       chipset="BDW"
+       symbol_name="L3_3"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 L3 Bank3 Stalled"
+             description="The percentage of time in which slice0 L3 bank3 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_stalled"
+             units="percent"
+             symbol_name="L30Bank3Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank3 Active"
+             description="The percentage of time in which slice0 L3 bank3 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_active"
+             units="percent"
+             symbol_name="L30Bank3Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 L3 Bank3 Active"
+             description="The percentage of time in which slice1 L3 bank3 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank3_active"
+             units="percent"
+             symbol_name="L31Bank3Active"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice1 L3 Bank3 Stalled"
+             description="The percentage of time in which slice1 L3 bank3 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank3_stalled"
+             units="percent"
+             symbol_name="L31Bank3Stalled"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121B0340" />
+        <register type="NOA" address="0x00009888" value="0x103F0274" />
+        <register type="NOA" address="0x00009888" value="0x123F0000" />
+        <register type="NOA" address="0x00009888" value="0x129B0340" />
+        <register type="NOA" address="0x00009888" value="0x10BF0274" />
+        <register type="NOA" address="0x00009888" value="0x12BF0000" />
+        <register type="NOA" address="0x00009888" value="0x041B3400" />
+        <register type="NOA" address="0x00009888" value="0x101B0000" />
+        <register type="NOA" address="0x00009888" value="0x045C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A3D4000" />
+        <register type="NOA" address="0x00009888" value="0x003F0080" />
+        <register type="NOA" address="0x00009888" value="0x023F0793" />
+        <register type="NOA" address="0x00009888" value="0x043F0014" />
+        <register type="NOA" address="0x00009888" value="0x04588000" />
+        <register type="NOA" address="0x00009888" value="0x005A8000" />
+        <register type="NOA" address="0x00009888" value="0x025AC000" />
+        <register type="NOA" address="0x00009888" value="0x045A4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B4000" />
+        <register type="NOA" address="0x00009888" value="0x001D8000" />
+        <register type="NOA" address="0x00009888" value="0x021DC000" />
+        <register type="NOA" address="0x00009888" value="0x041D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F002A" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x00394000" />
+        <register type="NOA" address="0x00009888" value="0x02395000" />
+        <register type="NOA" address="0x00009888" value="0x04399000" />
+        <register type="NOA" address="0x00009888" value="0x069B0034" />
+        <register type="NOA" address="0x00009888" value="0x109B0000" />
+        <register type="NOA" address="0x00009888" value="0x06DC4000" />
+        <register type="NOA" address="0x00009888" value="0x0CBD4000" />
+        <register type="NOA" address="0x00009888" value="0x0CBF0981" />
+        <register type="NOA" address="0x00009888" value="0x0EBF0A0F" />
+        <register type="NOA" address="0x00009888" value="0x06D84000" />
+        <register type="NOA" address="0x00009888" value="0x0CDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0EDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0CDB4000" />
+        <register type="NOA" address="0x00009888" value="0x0C9DC000" />
+        <register type="NOA" address="0x00009888" value="0x0E9DC000" />
+        <register type="NOA" address="0x00009888" value="0x109F02A8" />
+        <register type="NOA" address="0x00009888" value="0x0E9F0080" />
+        <register type="NOA" address="0x00009888" value="0x0CB84000" />
+        <register type="NOA" address="0x00009888" value="0x0CB95000" />
+        <register type="NOA" address="0x00009888" value="0x0EB95000" />
+        <register type="NOA" address="0x00009888" value="0x06B92000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258B8009" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x198C4000" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2185800A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x1B830154" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x45800C00" />
+        <register type="NOA" address="0x00009888" value="0x47800C63" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F8014A5" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800045" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_4"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_4"
+       hw_config_guid="92b493d9-df18-4bed-be06-5cac6f2a6f5f"
+       chipset="BDW"
+       symbol_name="L3_4"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Stalled"
+             description="The percentage of time in which slice0 L3 bank2 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_stalled"
+             units="percent"
+             symbol_name="L30Bank2Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 L3 Bank2 Active"
+             description="The percentage of time in which slice1 L3 bank2 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank2_active"
+             units="percent"
+             symbol_name="L31Bank2Active"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 L3 Bank2 Active"
+             description="The percentage of time in which slice0 L3 bank2 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_active"
+             units="percent"
+             symbol_name="L30Bank2Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Slice1 L3 Bank2 Stalled"
+             description="The percentage of time in which slice1 L3 bank2 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank2_stalled"
+             units="percent"
+             symbol_name="L31Bank2Stalled"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121A0340" />
+        <register type="NOA" address="0x00009888" value="0x103F0017" />
+        <register type="NOA" address="0x00009888" value="0x123F0020" />
+        <register type="NOA" address="0x00009888" value="0x129A0340" />
+        <register type="NOA" address="0x00009888" value="0x10BF0017" />
+        <register type="NOA" address="0x00009888" value="0x12BF0020" />
+        <register type="NOA" address="0x00009888" value="0x041A3400" />
+        <register type="NOA" address="0x00009888" value="0x101A0000" />
+        <register type="NOA" address="0x00009888" value="0x043B8000" />
+        <register type="NOA" address="0x00009888" value="0x0A3E0010" />
+        <register type="NOA" address="0x00009888" value="0x003F0200" />
+        <register type="NOA" address="0x00009888" value="0x023F0113" />
+        <register type="NOA" address="0x00009888" value="0x043F0014" />
+        <register type="NOA" address="0x00009888" value="0x02592000" />
+        <register type="NOA" address="0x00009888" value="0x005A8000" />
+        <register type="NOA" address="0x00009888" value="0x025AC000" />
+        <register type="NOA" address="0x00009888" value="0x045A4000" />
+        <register type="NOA" address="0x00009888" value="0x0A1C8000" />
+        <register type="NOA" address="0x00009888" value="0x001D8000" />
+        <register type="NOA" address="0x00009888" value="0x021DC000" />
+        <register type="NOA" address="0x00009888" value="0x041D4000" />
+        <register type="NOA" address="0x00009888" value="0x0A1E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F001A" />
+        <register type="NOA" address="0x00009888" value="0x00394000" />
+        <register type="NOA" address="0x00009888" value="0x02395000" />
+        <register type="NOA" address="0x00009888" value="0x04391000" />
+        <register type="NOA" address="0x00009888" value="0x069A0034" />
+        <register type="NOA" address="0x00009888" value="0x109A0000" />
+        <register type="NOA" address="0x00009888" value="0x06BB4000" />
+        <register type="NOA" address="0x00009888" value="0x0ABE0040" />
+        <register type="NOA" address="0x00009888" value="0x0CBF0984" />
+        <register type="NOA" address="0x00009888" value="0x0EBF0A02" />
+        <register type="NOA" address="0x00009888" value="0x02D94000" />
+        <register type="NOA" address="0x00009888" value="0x0CDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0EDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0C9C0400" />
+        <register type="NOA" address="0x00009888" value="0x0C9DC000" />
+        <register type="NOA" address="0x00009888" value="0x0E9DC000" />
+        <register type="NOA" address="0x00009888" value="0x0C9E0400" />
+        <register type="NOA" address="0x00009888" value="0x109F02A8" />
+        <register type="NOA" address="0x00009888" value="0x0E9F0040" />
+        <register type="NOA" address="0x00009888" value="0x0CB95000" />
+        <register type="NOA" address="0x00009888" value="0x0EB95000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258B8009" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x198C4000" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2185800A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x1B830154" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x45800800" />
+        <register type="NOA" address="0x00009888" value="0x47800842" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F801084" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800044" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set RasterizerAndPixelBackend"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="rasterizer_and_pixel_backend"
+       hw_config_guid="14345c35-cc46-40d0-bb04-6ed1fbb43679"
+       chipset="BDW"
+       symbol_name="RasterizerAndPixelBackend"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 Rasterizer Input Available"
+             description="The percentage of time in which slice1 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer1_input_available"
+             units="percent"
+             symbol_name="Rasterizer1InputAvailable"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 Pixel Values Ready"
+             description="The percentage of time in which slice0 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values0_ready"
+             units="percent"
+             symbol_name="PixelValues0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Slice0 Rasterizer Input Available"
+             description="The percentage of time in which slice0 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_input_available"
+             units="percent"
+             symbol_name="Rasterizer0InputAvailable"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice0 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data0_ready"
+             units="percent"
+             symbol_name="PixelData0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 Pixel Values Ready"
+             description="The percentage of time in which slice1 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values1_ready"
+             units="percent"
+             symbol_name="PixelValues1Ready"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice1 PS Output Available"
+             description="The percentage of time in which slice1 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output1_available"
+             units="percent"
+             symbol_name="PSOutput1Available"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Rasterizer Output Ready"
+             description="The percentage of time in which slice0 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_output_ready"
+             units="percent"
+             symbol_name="Rasterizer0OutputReady"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice1 Rasterizer Output Ready"
+             description="The percentage of time in which slice1 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer1_output_ready"
+             units="percent"
+             symbol_name="Rasterizer1OutputReady"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice1 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice1 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data1_ready"
+             units="percent"
+             symbol_name="PixelData1Ready"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Slice0 PS Output Available"
+             description="The percentage of time in which slice0 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output0_available"
+             units="percent"
+             symbol_name="PSOutput0Available"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x143B000E" />
+        <register type="NOA" address="0x00009888" value="0x043C55C0" />
+        <register type="NOA" address="0x00009888" value="0x0A1E0280" />
+        <register type="NOA" address="0x00009888" value="0x0C1E0408" />
+        <register type="NOA" address="0x00009888" value="0x10390000" />
+        <register type="NOA" address="0x00009888" value="0x12397A1F" />
+        <register type="NOA" address="0x00009888" value="0x14BB000E" />
+        <register type="NOA" address="0x00009888" value="0x04BC5000" />
+        <register type="NOA" address="0x00009888" value="0x0A9E0296" />
+        <register type="NOA" address="0x00009888" value="0x0C9E0008" />
+        <register type="NOA" address="0x00009888" value="0x10B90000" />
+        <register type="NOA" address="0x00009888" value="0x12B97A1F" />
+        <register type="NOA" address="0x00009888" value="0x063B0042" />
+        <register type="NOA" address="0x00009888" value="0x103B0000" />
+        <register type="NOA" address="0x00009888" value="0x083C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A3E0040" />
+        <register type="NOA" address="0x00009888" value="0x043F8000" />
+        <register type="NOA" address="0x00009888" value="0x02594000" />
+        <register type="NOA" address="0x00009888" value="0x045A8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1C0400" />
+        <register type="NOA" address="0x00009888" value="0x041D8000" />
+        <register type="NOA" address="0x00009888" value="0x081E02C0" />
+        <register type="NOA" address="0x00009888" value="0x0E1E0000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E1F0260" />
+        <register type="NOA" address="0x00009888" value="0x101F0014" />
+        <register type="NOA" address="0x00009888" value="0x003905E0" />
+        <register type="NOA" address="0x00009888" value="0x06390BC0" />
+        <register type="NOA" address="0x00009888" value="0x02390018" />
+        <register type="NOA" address="0x00009888" value="0x04394000" />
+        <register type="NOA" address="0x00009888" value="0x04BB0042" />
+        <register type="NOA" address="0x00009888" value="0x10BB0000" />
+        <register type="NOA" address="0x00009888" value="0x02BC05C0" />
+        <register type="NOA" address="0x00009888" value="0x08BC0000" />
+        <register type="NOA" address="0x00009888" value="0x0ABE0004" />
+        <register type="NOA" address="0x00009888" value="0x02BF8000" />
+        <register type="NOA" address="0x00009888" value="0x02D91000" />
+        <register type="NOA" address="0x00009888" value="0x02DA8000" />
+        <register type="NOA" address="0x00009888" value="0x089C8000" />
+        <register type="NOA" address="0x00009888" value="0x029D8000" />
+        <register type="NOA" address="0x00009888" value="0x089E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E9E0000" />
+        <register type="NOA" address="0x00009888" value="0x0E9FA806" />
+        <register type="NOA" address="0x00009888" value="0x109F0142" />
+        <register type="NOA" address="0x00009888" value="0x08B90617" />
+        <register type="NOA" address="0x00009888" value="0x0AB90BE0" />
+        <register type="NOA" address="0x00009888" value="0x02B94000" />
+        <register type="NOA" address="0x00009888" value="0x0D88F000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000C" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A2800" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x238B52A0" />
+        <register type="NOA" address="0x00009888" value="0x258B6A95" />
+        <register type="NOA" address="0x00009888" value="0x278B0029" />
+        <register type="NOA" address="0x00009888" value="0x178C2000" />
+        <register type="NOA" address="0x00009888" value="0x198C1500" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0014" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x098DA000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x038D8000" />
+        <register type="NOA" address="0x00009888" value="0x058D2000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x0784C000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x4D800444" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F804000" />
+        <register type="NOA" address="0x00009888" value="0x43801080" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800084" />
+        <register type="NOA" address="0x00009888" value="0x53800044" />
+        <register type="NOA" address="0x00009888" value="0x47801080" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800840" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00006000" />
+        <register type="OA" address="0x00002774" value="0x0000F3FF" />
+        <register type="OA" address="0x00002778" value="0x00001800" />
+        <register type="OA" address="0x0000277C" value="0x0000FCFF" />
+        <register type="OA" address="0x00002780" value="0x00000600" />
+        <register type="OA" address="0x00002784" value="0x0000FF3F" />
+        <register type="OA" address="0x00002788" value="0x00000180" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000060" />
+        <register type="OA" address="0x00002794" value="0x0000FFF3" />
+        <register type="OA" address="0x00002798" value="0x00000018" />
+        <register type="OA" address="0x0000279C" value="0x0000FFFC" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler_1"
+       hw_config_guid="f0c6ba37-d3d3-4211-91b5-226730312a54"
+       chipset="BDW"
+       symbol_name="Sampler_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Slice1 Subslice0 Input Available"
+             description="The percentage of time in which slice1 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler10_input_available"
+             units="percent"
+             symbol_name="Sampler10InputAvailable"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Slice1 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice1 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler12_output_ready"
+             units="percent"
+             symbol_name="Sampler12OutputReady"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 Subslice1 Input Available"
+             description="The percentage of time in which slice1 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler11_input_available"
+             units="percent"
+             symbol_name="Sampler11InputAvailable"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice1 Subslice2 Input Available"
+             description="The percentage of time in which slice1 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler12_input_available"
+             units="percent"
+             symbol_name="Sampler12InputAvailable"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice1 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice1 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler10_output_ready"
+             units="percent"
+             symbol_name="Sampler10OutputReady"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Slice1 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice1 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler11_output_ready"
+             units="percent"
+             symbol_name="Sampler11OutputReady"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x18921400" />
+        <register type="NOA" address="0x00009888" value="0x149500AB" />
+        <register type="NOA" address="0x00009888" value="0x18B21400" />
+        <register type="NOA" address="0x00009888" value="0x14B500AB" />
+        <register type="NOA" address="0x00009888" value="0x18D21400" />
+        <register type="NOA" address="0x00009888" value="0x14D500AB" />
+        <register type="NOA" address="0x00009888" value="0x0CDC8000" />
+        <register type="NOA" address="0x00009888" value="0x0EDC4000" />
+        <register type="NOA" address="0x00009888" value="0x02DCC000" />
+        <register type="NOA" address="0x00009888" value="0x04DCC000" />
+        <register type="NOA" address="0x00009888" value="0x1ABD00A0" />
+        <register type="NOA" address="0x00009888" value="0x0ABD8000" />
+        <register type="NOA" address="0x00009888" value="0x0CD88000" />
+        <register type="NOA" address="0x00009888" value="0x0ED84000" />
+        <register type="NOA" address="0x00009888" value="0x04D88000" />
+        <register type="NOA" address="0x00009888" value="0x1ADB0050" />
+        <register type="NOA" address="0x00009888" value="0x04DB8000" />
+        <register type="NOA" address="0x00009888" value="0x06DB8000" />
+        <register type="NOA" address="0x00009888" value="0x08DB8000" />
+        <register type="NOA" address="0x00009888" value="0x0ADB4000" />
+        <register type="NOA" address="0x00009888" value="0x109F02A0" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E9F00AA" />
+        <register type="NOA" address="0x00009888" value="0x18B82500" />
+        <register type="NOA" address="0x00009888" value="0x02B88000" />
+        <register type="NOA" address="0x00009888" value="0x04B84000" />
+        <register type="NOA" address="0x00009888" value="0x06B84000" />
+        <register type="NOA" address="0x00009888" value="0x08B84000" />
+        <register type="NOA" address="0x00009888" value="0x0AB84000" />
+        <register type="NOA" address="0x00009888" value="0x0CB88000" />
+        <register type="NOA" address="0x00009888" value="0x0CB98000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x00B98000" />
+        <register type="NOA" address="0x00009888" value="0x02B9A000" />
+        <register type="NOA" address="0x00009888" value="0x04B9A000" />
+        <register type="NOA" address="0x00009888" value="0x06B92000" />
+        <register type="NOA" address="0x00009888" value="0x1ABA0200" />
+        <register type="NOA" address="0x00009888" value="0x02BA8000" />
+        <register type="NOA" address="0x00009888" value="0x0CBA8000" />
+        <register type="NOA" address="0x00009888" value="0x04908000" />
+        <register type="NOA" address="0x00009888" value="0x04918000" />
+        <register type="NOA" address="0x00009888" value="0x04927300" />
+        <register type="NOA" address="0x00009888" value="0x10920000" />
+        <register type="NOA" address="0x00009888" value="0x1893000A" />
+        <register type="NOA" address="0x00009888" value="0x0A934000" />
+        <register type="NOA" address="0x00009888" value="0x0A946000" />
+        <register type="NOA" address="0x00009888" value="0x0C959000" />
+        <register type="NOA" address="0x00009888" value="0x0E950098" />
+        <register type="NOA" address="0x00009888" value="0x10950000" />
+        <register type="NOA" address="0x00009888" value="0x04B04000" />
+        <register type="NOA" address="0x00009888" value="0x04B14000" />
+        <register type="NOA" address="0x00009888" value="0x04B20073" />
+        <register type="NOA" address="0x00009888" value="0x10B20000" />
+        <register type="NOA" address="0x00009888" value="0x04B38000" />
+        <register type="NOA" address="0x00009888" value="0x06B38000" />
+        <register type="NOA" address="0x00009888" value="0x08B34000" />
+        <register type="NOA" address="0x00009888" value="0x04B4C000" />
+        <register type="NOA" address="0x00009888" value="0x02B59890" />
+        <register type="NOA" address="0x00009888" value="0x10B50000" />
+        <register type="NOA" address="0x00009888" value="0x06D04000" />
+        <register type="NOA" address="0x00009888" value="0x06D14000" />
+        <register type="NOA" address="0x00009888" value="0x06D20073" />
+        <register type="NOA" address="0x00009888" value="0x10D20000" />
+        <register type="NOA" address="0x00009888" value="0x18D30020" />
+        <register type="NOA" address="0x00009888" value="0x02D38000" />
+        <register type="NOA" address="0x00009888" value="0x0CD34000" />
+        <register type="NOA" address="0x00009888" value="0x0AD48000" />
+        <register type="NOA" address="0x00009888" value="0x04D42000" />
+        <register type="NOA" address="0x00009888" value="0x0ED59000" />
+        <register type="NOA" address="0x00009888" value="0x00D59800" />
+        <register type="NOA" address="0x00009888" value="0x10D50000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000E" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x0B888000" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B5500" />
+        <register type="NOA" address="0x00009888" value="0x258B000A" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8D8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x018D8000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x058DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x2185000A" />
+        <register type="NOA" address="0x00009888" value="0x1B830150" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D848000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D808000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47801021" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800C64" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800C02" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler_2"
+       hw_config_guid="30bf3702-48cf-4bca-b412-7cf50bb2f564"
+       chipset="BDW"
+       symbol_name="Sampler_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice2 Input Available"
+             description="The percentage of time in which slice0 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_input_available"
+             units="percent"
+             symbol_name="Sampler02InputAvailable"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice0 Input Available"
+             description="The percentage of time in which slice0 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_input_available"
+             units="percent"
+             symbol_name="Sampler00InputAvailable"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_output_ready"
+             units="percent"
+             symbol_name="Sampler02OutputReady"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice0 Subslice1 Input Available"
+             description="The percentage of time in which slice0 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_input_available"
+             units="percent"
+             symbol_name="Sampler01InputAvailable"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice0 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_output_ready"
+             units="percent"
+             symbol_name="Sampler00OutputReady"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice0 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_output_ready"
+             units="percent"
+             symbol_name="Sampler01OutputReady"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x18121400" />
+        <register type="NOA" address="0x00009888" value="0x141500AB" />
+        <register type="NOA" address="0x00009888" value="0x18321400" />
+        <register type="NOA" address="0x00009888" value="0x143500AB" />
+        <register type="NOA" address="0x00009888" value="0x18521400" />
+        <register type="NOA" address="0x00009888" value="0x145500AB" />
+        <register type="NOA" address="0x00009888" value="0x0C5C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E5C4000" />
+        <register type="NOA" address="0x00009888" value="0x025CC000" />
+        <register type="NOA" address="0x00009888" value="0x045CC000" />
+        <register type="NOA" address="0x00009888" value="0x1A3D00A0" />
+        <register type="NOA" address="0x00009888" value="0x0A3D8000" />
+        <register type="NOA" address="0x00009888" value="0x0C588000" />
+        <register type="NOA" address="0x00009888" value="0x0E584000" />
+        <register type="NOA" address="0x00009888" value="0x04588000" />
+        <register type="NOA" address="0x00009888" value="0x1A5B0050" />
+        <register type="NOA" address="0x00009888" value="0x045B8000" />
+        <register type="NOA" address="0x00009888" value="0x065B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B8000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B4000" />
+        <register type="NOA" address="0x00009888" value="0x101F02A0" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F00AA" />
+        <register type="NOA" address="0x00009888" value="0x18382500" />
+        <register type="NOA" address="0x00009888" value="0x02388000" />
+        <register type="NOA" address="0x00009888" value="0x04384000" />
+        <register type="NOA" address="0x00009888" value="0x06384000" />
+        <register type="NOA" address="0x00009888" value="0x08384000" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x0C388000" />
+        <register type="NOA" address="0x00009888" value="0x0C398000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x00398000" />
+        <register type="NOA" address="0x00009888" value="0x0239A000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x1A3A0200" />
+        <register type="NOA" address="0x00009888" value="0x023A8000" />
+        <register type="NOA" address="0x00009888" value="0x0C3A8000" />
+        <register type="NOA" address="0x00009888" value="0x04108000" />
+        <register type="NOA" address="0x00009888" value="0x04118000" />
+        <register type="NOA" address="0x00009888" value="0x04127300" />
+        <register type="NOA" address="0x00009888" value="0x10120000" />
+        <register type="NOA" address="0x00009888" value="0x1813000A" />
+        <register type="NOA" address="0x00009888" value="0x0A134000" />
+        <register type="NOA" address="0x00009888" value="0x0A146000" />
+        <register type="NOA" address="0x00009888" value="0x0C159000" />
+        <register type="NOA" address="0x00009888" value="0x0E150098" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x04304000" />
+        <register type="NOA" address="0x00009888" value="0x04314000" />
+        <register type="NOA" address="0x00009888" value="0x04320073" />
+        <register type="NOA" address="0x00009888" value="0x10320000" />
+        <register type="NOA" address="0x00009888" value="0x04338000" />
+        <register type="NOA" address="0x00009888" value="0x06338000" />
+        <register type="NOA" address="0x00009888" value="0x08334000" />
+        <register type="NOA" address="0x00009888" value="0x0434C000" />
+        <register type="NOA" address="0x00009888" value="0x02359890" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x06504000" />
+        <register type="NOA" address="0x00009888" value="0x06514000" />
+        <register type="NOA" address="0x00009888" value="0x06520073" />
+        <register type="NOA" address="0x00009888" value="0x10520000" />
+        <register type="NOA" address="0x00009888" value="0x18530020" />
+        <register type="NOA" address="0x00009888" value="0x02538000" />
+        <register type="NOA" address="0x00009888" value="0x0C534000" />
+        <register type="NOA" address="0x00009888" value="0x0A548000" />
+        <register type="NOA" address="0x00009888" value="0x04542000" />
+        <register type="NOA" address="0x00009888" value="0x0E559000" />
+        <register type="NOA" address="0x00009888" value="0x00559800" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AA000" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x258B0005" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x2185000A" />
+        <register type="NOA" address="0x00009888" value="0x1B830150" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D848000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D808000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47801021" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800C64" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800C02" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_1"
+       hw_config_guid="238bec85-df05-44f3-b905-d166712f2451"
+       chipset="BDW"
+       symbol_name="TDL_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice1 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice1 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread11_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread11ReadyForDispatch"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice1 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice1 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread12_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread12ReadyForDispatch"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice1 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice1 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread10_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread10ReadyForDispatch"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort0"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort1"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort1"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort0"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice1 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice1 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread12_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread12ReadyForDispatch"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice1 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice1 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread10_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread10ReadyForDispatch"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice1 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice1 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread11_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread11ReadyForDispatch"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort0"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort1"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x16154D60" />
+        <register type="NOA" address="0x00009888" value="0x16352E60" />
+        <register type="NOA" address="0x00009888" value="0x16554D60" />
+        <register type="NOA" address="0x00009888" value="0x16950000" />
+        <register type="NOA" address="0x00009888" value="0x16B50000" />
+        <register type="NOA" address="0x00009888" value="0x16D50000" />
+        <register type="NOA" address="0x00009888" value="0x005C8000" />
+        <register type="NOA" address="0x00009888" value="0x045CC000" />
+        <register type="NOA" address="0x00009888" value="0x065C4000" />
+        <register type="NOA" address="0x00009888" value="0x083D8000" />
+        <register type="NOA" address="0x00009888" value="0x0A3D8000" />
+        <register type="NOA" address="0x00009888" value="0x0458C000" />
+        <register type="NOA" address="0x00009888" value="0x025B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B4000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F00AA" />
+        <register type="NOA" address="0x00009888" value="0x02384000" />
+        <register type="NOA" address="0x00009888" value="0x04388000" />
+        <register type="NOA" address="0x00009888" value="0x06388000" />
+        <register type="NOA" address="0x00009888" value="0x08384000" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x0C384000" />
+        <register type="NOA" address="0x00009888" value="0x00398000" />
+        <register type="NOA" address="0x00009888" value="0x0239A000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x043A8000" />
+        <register type="NOA" address="0x00009888" value="0x063A8000" />
+        <register type="NOA" address="0x00009888" value="0x08138000" />
+        <register type="NOA" address="0x00009888" value="0x0A138000" />
+        <register type="NOA" address="0x00009888" value="0x06143000" />
+        <register type="NOA" address="0x00009888" value="0x0415CFC7" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x02338000" />
+        <register type="NOA" address="0x00009888" value="0x0C338000" />
+        <register type="NOA" address="0x00009888" value="0x04342000" />
+        <register type="NOA" address="0x00009888" value="0x06344000" />
+        <register type="NOA" address="0x00009888" value="0x0035C700" />
+        <register type="NOA" address="0x00009888" value="0x063500CF" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x04538000" />
+        <register type="NOA" address="0x00009888" value="0x06538000" />
+        <register type="NOA" address="0x00009888" value="0x0454C000" />
+        <register type="NOA" address="0x00009888" value="0x0255CFC7" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x06DC8000" />
+        <register type="NOA" address="0x00009888" value="0x08DC4000" />
+        <register type="NOA" address="0x00009888" value="0x0CDCC000" />
+        <register type="NOA" address="0x00009888" value="0x0EDCC000" />
+        <register type="NOA" address="0x00009888" value="0x1ABD00A8" />
+        <register type="NOA" address="0x00009888" value="0x0CD8C000" />
+        <register type="NOA" address="0x00009888" value="0x0ED84000" />
+        <register type="NOA" address="0x00009888" value="0x0EDB8000" />
+        <register type="NOA" address="0x00009888" value="0x18DB0800" />
+        <register type="NOA" address="0x00009888" value="0x1ADB0254" />
+        <register type="NOA" address="0x00009888" value="0x0E9FAA00" />
+        <register type="NOA" address="0x00009888" value="0x109F02AA" />
+        <register type="NOA" address="0x00009888" value="0x0EB84000" />
+        <register type="NOA" address="0x00009888" value="0x16B84000" />
+        <register type="NOA" address="0x00009888" value="0x18B8156A" />
+        <register type="NOA" address="0x00009888" value="0x06B98000" />
+        <register type="NOA" address="0x00009888" value="0x08B9A000" />
+        <register type="NOA" address="0x00009888" value="0x0AB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0CB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x18BAA000" />
+        <register type="NOA" address="0x00009888" value="0x1ABA0002" />
+        <register type="NOA" address="0x00009888" value="0x16934000" />
+        <register type="NOA" address="0x00009888" value="0x1893000A" />
+        <register type="NOA" address="0x00009888" value="0x0A947000" />
+        <register type="NOA" address="0x00009888" value="0x0C95C5C1" />
+        <register type="NOA" address="0x00009888" value="0x0E9500C3" />
+        <register type="NOA" address="0x00009888" value="0x10950000" />
+        <register type="NOA" address="0x00009888" value="0x0EB38000" />
+        <register type="NOA" address="0x00009888" value="0x16B30040" />
+        <register type="NOA" address="0x00009888" value="0x18B30020" />
+        <register type="NOA" address="0x00009888" value="0x06B48000" />
+        <register type="NOA" address="0x00009888" value="0x08B41000" />
+        <register type="NOA" address="0x00009888" value="0x0AB48000" />
+        <register type="NOA" address="0x00009888" value="0x06B5C500" />
+        <register type="NOA" address="0x00009888" value="0x08B500C3" />
+        <register type="NOA" address="0x00009888" value="0x0EB5C100" />
+        <register type="NOA" address="0x00009888" value="0x10B50000" />
+        <register type="NOA" address="0x00009888" value="0x16D31500" />
+        <register type="NOA" address="0x00009888" value="0x08D4E000" />
+        <register type="NOA" address="0x00009888" value="0x08D5C100" />
+        <register type="NOA" address="0x00009888" value="0x0AD5C3C5" />
+        <register type="NOA" address="0x00009888" value="0x10D50000" />
+        <register type="NOA" address="0x00009888" value="0x0D88F800" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258BAAA5" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x0F8C4000" />
+        <register type="NOA" address="0x00009888" value="0x178C2000" />
+        <register type="NOA" address="0x00009888" value="0x198C5500" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x078D8000" />
+        <register type="NOA" address="0x00009888" value="0x098DA000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0784C000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800C42" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800063" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800800" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F8014A4" />
+        <register type="NOA" address="0x00009888" value="0x41801042" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x0000FE7F" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000FFBF" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFF7" />
+        <register type="OA" address="0x00002798" value="0x00000000" />
+        <register type="OA" address="0x0000279C" value="0x0000FFF9" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_2"
+       hw_config_guid="24bf02cd-8693-4583-981c-c4165b33da01"
+       chipset="BDW"
+       symbol_name="TDL_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice1 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header11_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader11ReadyPort1"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice1 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header11_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader11ReadyPort0"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice1 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header12_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader12ReadyPort0"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice1 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header10_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader10ReadyPort1"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice1 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header12_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader12ReadyPort1"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice1 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header10_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader10ReadyPort0"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x16150000" />
+        <register type="NOA" address="0x00009888" value="0x16350000" />
+        <register type="NOA" address="0x00009888" value="0x16550000" />
+        <register type="NOA" address="0x00009888" value="0x16952E60" />
+        <register type="NOA" address="0x00009888" value="0x16B54D60" />
+        <register type="NOA" address="0x00009888" value="0x16D52E60" />
+        <register type="NOA" address="0x00009888" value="0x065C8000" />
+        <register type="NOA" address="0x00009888" value="0x085CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A5CC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5C4000" />
+        <register type="NOA" address="0x00009888" value="0x0E3D8000" />
+        <register type="NOA" address="0x00009888" value="0x183DA000" />
+        <register type="NOA" address="0x00009888" value="0x06588000" />
+        <register type="NOA" address="0x00009888" value="0x08588000" />
+        <register type="NOA" address="0x00009888" value="0x0A584000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x185B5800" />
+        <register type="NOA" address="0x00009888" value="0x1A5B000A" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAA00" />
+        <register type="NOA" address="0x00009888" value="0x101F02AA" />
+        <register type="NOA" address="0x00009888" value="0x0E384000" />
+        <register type="NOA" address="0x00009888" value="0x16384000" />
+        <register type="NOA" address="0x00009888" value="0x18382A55" />
+        <register type="NOA" address="0x00009888" value="0x06398000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A39A000" />
+        <register type="NOA" address="0x00009888" value="0x0C39A000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x1A3A02A0" />
+        <register type="NOA" address="0x00009888" value="0x0E138000" />
+        <register type="NOA" address="0x00009888" value="0x16130500" />
+        <register type="NOA" address="0x00009888" value="0x06148000" />
+        <register type="NOA" address="0x00009888" value="0x08146000" />
+        <register type="NOA" address="0x00009888" value="0x0615C100" />
+        <register type="NOA" address="0x00009888" value="0x0815C500" />
+        <register type="NOA" address="0x00009888" value="0x0A1500C3" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x16335040" />
+        <register type="NOA" address="0x00009888" value="0x08349000" />
+        <register type="NOA" address="0x00009888" value="0x0A341000" />
+        <register type="NOA" address="0x00009888" value="0x083500C1" />
+        <register type="NOA" address="0x00009888" value="0x0A35C500" />
+        <register type="NOA" address="0x00009888" value="0x0C3500C3" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x1853002A" />
+        <register type="NOA" address="0x00009888" value="0x0A54E000" />
+        <register type="NOA" address="0x00009888" value="0x0C55C500" />
+        <register type="NOA" address="0x00009888" value="0x0E55C1C3" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x00DC8000" />
+        <register type="NOA" address="0x00009888" value="0x02DCC000" />
+        <register type="NOA" address="0x00009888" value="0x04DC4000" />
+        <register type="NOA" address="0x00009888" value="0x04BD8000" />
+        <register type="NOA" address="0x00009888" value="0x06BD8000" />
+        <register type="NOA" address="0x00009888" value="0x02D8C000" />
+        <register type="NOA" address="0x00009888" value="0x02DB8000" />
+        <register type="NOA" address="0x00009888" value="0x04DB4000" />
+        <register type="NOA" address="0x00009888" value="0x06DB4000" />
+        <register type="NOA" address="0x00009888" value="0x08DB8000" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E9F00AA" />
+        <register type="NOA" address="0x00009888" value="0x02B84000" />
+        <register type="NOA" address="0x00009888" value="0x04B84000" />
+        <register type="NOA" address="0x00009888" value="0x06B84000" />
+        <register type="NOA" address="0x00009888" value="0x08B84000" />
+        <register type="NOA" address="0x00009888" value="0x0AB88000" />
+        <register type="NOA" address="0x00009888" value="0x0CB88000" />
+        <register type="NOA" address="0x00009888" value="0x00B98000" />
+        <register type="NOA" address="0x00009888" value="0x02B9A000" />
+        <register type="NOA" address="0x00009888" value="0x04B9A000" />
+        <register type="NOA" address="0x00009888" value="0x06B92000" />
+        <register type="NOA" address="0x00009888" value="0x0ABA8000" />
+        <register type="NOA" address="0x00009888" value="0x0CBA8000" />
+        <register type="NOA" address="0x00009888" value="0x04938000" />
+        <register type="NOA" address="0x00009888" value="0x06938000" />
+        <register type="NOA" address="0x00009888" value="0x0494C000" />
+        <register type="NOA" address="0x00009888" value="0x0295CFC7" />
+        <register type="NOA" address="0x00009888" value="0x10950000" />
+        <register type="NOA" address="0x00009888" value="0x02B38000" />
+        <register type="NOA" address="0x00009888" value="0x08B38000" />
+        <register type="NOA" address="0x00009888" value="0x04B42000" />
+        <register type="NOA" address="0x00009888" value="0x06B41000" />
+        <register type="NOA" address="0x00009888" value="0x00B5C700" />
+        <register type="NOA" address="0x00009888" value="0x04B500CF" />
+        <register type="NOA" address="0x00009888" value="0x10B50000" />
+        <register type="NOA" address="0x00009888" value="0x0AD38000" />
+        <register type="NOA" address="0x00009888" value="0x0CD38000" />
+        <register type="NOA" address="0x00009888" value="0x06D46000" />
+        <register type="NOA" address="0x00009888" value="0x04D5C700" />
+        <register type="NOA" address="0x00009888" value="0x06D500CF" />
+        <register type="NOA" address="0x00009888" value="0x10D50000" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x0B888000" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x258B555A" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x238B5500" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x018D8000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x058DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0784C000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800882" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45801082" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x478014A5" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800002" />
+        <register type="NOA" address="0x00009888" value="0x41800C62" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x0000FE7F" />
+        <register type="OA" address="0x00002780" value="0x00000000" />
+        <register type="OA" address="0x00002784" value="0x0000FF9F" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000FFE7" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFFB" />
+        <register type="OA" address="0x00002798" value="0x00000002" />
+        <register type="OA" address="0x0000279C" value="0x0000FFFD" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extra Gen8"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extra"
+       hw_config_guid="8fb61ba2-2fbb-454c-a136-2dec5a8a595e"
+       chipset="BDW"
+       symbol_name="ComputeExtra"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active including Ext Math"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing including Extended Math processing"
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ  C 5 READ C 6 READ FADD C 7 READ FADD C 2 READ FADD C 3 READ FADD C 4 READ FADD 8 FMUL FADD 100 FMUL $EuCoresTotalCount FDIV $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active_adjusted"
+             units="percent"
+             symbol_name="Fpu1ActiveAdjusted"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x161503E0" />
+        <register type="NOA" address="0x00009888" value="0x163503E0" />
+        <register type="NOA" address="0x00009888" value="0x165503E0" />
+        <register type="NOA" address="0x00009888" value="0x169503E0" />
+        <register type="NOA" address="0x00009888" value="0x16B503E0" />
+        <register type="NOA" address="0x00009888" value="0x16D503E0" />
+        <register type="NOA" address="0x00009888" value="0x045CC000" />
+        <register type="NOA" address="0x00009888" value="0x083D8000" />
+        <register type="NOA" address="0x00009888" value="0x04584000" />
+        <register type="NOA" address="0x00009888" value="0x085B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B8000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F00A8" />
+        <register type="NOA" address="0x00009888" value="0x08384000" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x0C388000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x0C3A8000" />
+        <register type="NOA" address="0x00009888" value="0x08138000" />
+        <register type="NOA" address="0x00009888" value="0x06141000" />
+        <register type="NOA" address="0x00009888" value="0x041500C3" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x0A338000" />
+        <register type="NOA" address="0x00009888" value="0x06342000" />
+        <register type="NOA" address="0x00009888" value="0x0435C300" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x0C538000" />
+        <register type="NOA" address="0x00009888" value="0x06544000" />
+        <register type="NOA" address="0x00009888" value="0x065500C3" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x00DC8000" />
+        <register type="NOA" address="0x00009888" value="0x02DC4000" />
+        <register type="NOA" address="0x00009888" value="0x02BD8000" />
+        <register type="NOA" address="0x00009888" value="0x00D88000" />
+        <register type="NOA" address="0x00009888" value="0x02DB4000" />
+        <register type="NOA" address="0x00009888" value="0x04DB8000" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E9F0002" />
+        <register type="NOA" address="0x00009888" value="0x02B84000" />
+        <register type="NOA" address="0x00009888" value="0x04B84000" />
+        <register type="NOA" address="0x00009888" value="0x06B88000" />
+        <register type="NOA" address="0x00009888" value="0x00B98000" />
+        <register type="NOA" address="0x00009888" value="0x02B9A000" />
+        <register type="NOA" address="0x00009888" value="0x06BA8000" />
+        <register type="NOA" address="0x00009888" value="0x02938000" />
+        <register type="NOA" address="0x00009888" value="0x04942000" />
+        <register type="NOA" address="0x00009888" value="0x0095C300" />
+        <register type="NOA" address="0x00009888" value="0x10950000" />
+        <register type="NOA" address="0x00009888" value="0x04B38000" />
+        <register type="NOA" address="0x00009888" value="0x04B44000" />
+        <register type="NOA" address="0x00009888" value="0x02B500C3" />
+        <register type="NOA" address="0x00009888" value="0x10B50000" />
+        <register type="NOA" address="0x00009888" value="0x06D38000" />
+        <register type="NOA" address="0x00009888" value="0x04D48000" />
+        <register type="NOA" address="0x00009888" value="0x02D5C300" />
+        <register type="NOA" address="0x00009888" value="0x10D50000" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x238B3500" />
+        <register type="NOA" address="0x00009888" value="0x258B0005" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x018D8000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x2185000A" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800C40" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41801482" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00001000" />
+        <register type="FLEX" address="0x0000E558" value="0x00003002" />
+        <register type="FLEX" address="0x0000E658" value="0x00005004" />
+        <register type="FLEX" address="0x0000E758" value="0x00011010" />
+        <register type="FLEX" address="0x0000E45C" value="0x00050012" />
+        <register type="FLEX" address="0x0000E55C" value="0x00052051" />
+        <register type="FLEX" address="0x0000E65C" value="0x00000008" />
+    </register_config>
+  </set>
+
+  <set name="Media Vme Pipe Gen8"
+       mdapi_supported_apis="MEDIA IO BB"
+       underscore_name="vme_pipe"
+       hw_config_guid="e1743ca0-7fc8-410b-a066-de7bbb9280b7"
+       chipset="BDW"
+       symbol_name="VMEPipe"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL  GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="8 A 10 READ FMUL $EuThreadsCount FDIV $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VME Busy"
+             description="The percentage of time in which VME (IME or CRE) was actively processing data."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ B 3 READ FADD 2 FDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vme_busy"
+             units="percent"
+             symbol_name="VMEBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Overview System Batch Tier2"
+             mdapi_group="VME Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x14100812" />
+        <register type="NOA" address="0x00009888" value="0x14125800" />
+        <register type="NOA" address="0x00009888" value="0x161200C0" />
+        <register type="NOA" address="0x00009888" value="0x14300812" />
+        <register type="NOA" address="0x00009888" value="0x14325800" />
+        <register type="NOA" address="0x00009888" value="0x163200C0" />
+        <register type="NOA" address="0x00009888" value="0x005C4000" />
+        <register type="NOA" address="0x00009888" value="0x065C8000" />
+        <register type="NOA" address="0x00009888" value="0x085CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A5CC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5CC000" />
+        <register type="NOA" address="0x00009888" value="0x003D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E3D8000" />
+        <register type="NOA" address="0x00009888" value="0x183D2800" />
+        <register type="NOA" address="0x00009888" value="0x00584000" />
+        <register type="NOA" address="0x00009888" value="0x06588000" />
+        <register type="NOA" address="0x00009888" value="0x0858C000" />
+        <register type="NOA" address="0x00009888" value="0x005B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x185B9400" />
+        <register type="NOA" address="0x00009888" value="0x1A5B002A" />
+        <register type="NOA" address="0x00009888" value="0x0C1F0800" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAA00" />
+        <register type="NOA" address="0x00009888" value="0x101F002A" />
+        <register type="NOA" address="0x00009888" value="0x00384000" />
+        <register type="NOA" address="0x00009888" value="0x0E384000" />
+        <register type="NOA" address="0x00009888" value="0x16384000" />
+        <register type="NOA" address="0x00009888" value="0x18380155" />
+        <register type="NOA" address="0x00009888" value="0x00392000" />
+        <register type="NOA" address="0x00009888" value="0x06398000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A39A000" />
+        <register type="NOA" address="0x00009888" value="0x0C39A000" />
+        <register type="NOA" address="0x00009888" value="0x00100047" />
+        <register type="NOA" address="0x00009888" value="0x06101A80" />
+        <register type="NOA" address="0x00009888" value="0x10100000" />
+        <register type="NOA" address="0x00009888" value="0x0810C000" />
+        <register type="NOA" address="0x00009888" value="0x0811C000" />
+        <register type="NOA" address="0x00009888" value="0x08126151" />
+        <register type="NOA" address="0x00009888" value="0x10120000" />
+        <register type="NOA" address="0x00009888" value="0x00134000" />
+        <register type="NOA" address="0x00009888" value="0x0E134000" />
+        <register type="NOA" address="0x00009888" value="0x161300A0" />
+        <register type="NOA" address="0x00009888" value="0x0A301AC7" />
+        <register type="NOA" address="0x00009888" value="0x10300000" />
+        <register type="NOA" address="0x00009888" value="0x0C30C000" />
+        <register type="NOA" address="0x00009888" value="0x0C31C000" />
+        <register type="NOA" address="0x00009888" value="0x0C326151" />
+        <register type="NOA" address="0x00009888" value="0x10320000" />
+        <register type="NOA" address="0x00009888" value="0x16332A00" />
+        <register type="NOA" address="0x00009888" value="0x18330001" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A2AA0" />
+        <register type="NOA" address="0x00009888" value="0x238B0020" />
+        <register type="NOA" address="0x00009888" value="0x258B5550" />
+        <register type="NOA" address="0x00009888" value="0x278B0001" />
+        <register type="NOA" address="0x00009888" value="0x1F850080" />
+        <register type="NOA" address="0x00009888" value="0x2185AAA0" />
+        <register type="NOA" address="0x00009888" value="0x23850002" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830015" />
+        <register type="NOA" address="0x00009888" value="0x01844000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x11804000" />
+        <register type="NOA" address="0x00009888" value="0x17808000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3D800800" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800002" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800884" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800002" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00100030" />
+        <register type="OA" address="0x00002774" value="0x0000FFF9" />
+        <register type="OA" address="0x00002778" value="0x00000002" />
+        <register type="OA" address="0x0000277C" value="0x0000FFFC" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000FFF3" />
+        <register type="OA" address="0x00002788" value="0x00100180" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00000002" />
+        <register type="OA" address="0x0000279C" value="0x0000FF3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00008003" />
+    </register_config>
+  </set>
+
+  <set name="MDAPI testing set Gen8"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="test_oa"
+       hw_config_guid="d6de6f55-e526-4f79-a6a6-d7315c09044e"
+       chipset="BDW"
+       symbol_name="TestOa"
+       >
+    <counter name="TestCounter7"
+             description="HW test counter 7. Factor: 0.666"
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="counter7"
+             units="events"
+             symbol_name="Counter7"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="TestCounter8"
+             description="HW test counter 8. Should be equal to 1."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="counter8"
+             units="events"
+             symbol_name="Counter8"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter4"
+             description="HW test counter 4. Factor: 0.333"
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="counter4"
+             units="events"
+             symbol_name="Counter4"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter5"
+             description="HW test counter 5. Factor: 0.333"
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="counter5"
+             units="events"
+             symbol_name="Counter5"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter6"
+             description="HW test counter 6. Factor: 0.166"
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="counter6"
+             units="events"
+             symbol_name="Counter6"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter3"
+             description="HW test counter 3. Factor: 0.5"
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="counter3"
+             units="events"
+             symbol_name="Counter3"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter0"
+             description="HW test counter 0. Factor: 0.0"
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="counter0"
+             units="events"
+             symbol_name="Counter0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter1"
+             description="HW test counter 1. Factor: 1.0"
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="counter1"
+             units="events"
+             symbol_name="Counter1"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter2"
+             description="HW test counter 2. Factor: 1.0"
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="counter2"
+             units="events"
+             symbol_name="Counter2"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x198B0000" />
+        <register type="NOA" address="0x00009888" value="0x078B0066" />
+        <register type="NOA" address="0x00009888" value="0x118B0000" />
+        <register type="NOA" address="0x00009888" value="0x258B0000" />
+        <register type="NOA" address="0x00009888" value="0x21850008" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x00000004" />
+        <register type="OA" address="0x00002774" value="0x00000000" />
+        <register type="OA" address="0x00002778" value="0x00000003" />
+        <register type="OA" address="0x0000277C" value="0x00000000" />
+        <register type="OA" address="0x00002780" value="0x00000007" />
+        <register type="OA" address="0x00002784" value="0x00000000" />
+        <register type="OA" address="0x00002788" value="0x00100002" />
+        <register type="OA" address="0x0000278C" value="0x0000FFF7" />
+        <register type="OA" address="0x00002790" value="0x00100002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00100082" />
+        <register type="OA" address="0x0000279C" value="0x0000FFEF" />
+        <register type="OA" address="0x000027A0" value="0x001000C2" />
+        <register type="OA" address="0x000027A4" value="0x0000FFE7" />
+        <register type="OA" address="0x000027A8" value="0x00100001" />
+        <register type="OA" address="0x000027AC" value="0x0000FFE7" />
+    </register_config>
+  </set>
+
+</metrics>
diff --git a/src/mesa/drivers/dri/i965/brw_oa_bxt.xml b/src/mesa/drivers/dri/i965/brw_oa_bxt.xml
new file mode 100644
index 0000000..18d14d9
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_oa_bxt.xml
@@ -0,0 +1,9211 @@
+<?xml version="1.0"?>
+<metrics version="1491575452" merge_md5="">
+  <set name="Render Metrics Basic Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_basic"
+       hw_config_guid="22b9519a-e9ba-4c41-8b54-f4f8ca14fa0a"
+       chipset="BXT"
+       symbol_name="RenderBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Misses 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Cache Misses"
+             description="The total number of sampler cache misses in all LODs in all sampler units."
+             data_type="uint64"
+             equation="B 4 READ 8 UMUL"
+             underscore_name="sampler_l1_misses"
+             units="messages"
+             symbol_name="SamplerL1Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler 1 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 1 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler1_bottleneck"
+             units="percent"
+             symbol_name="Sampler1Bottleneck"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$SamplerL1Misses 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Sampler 0 Busy"
+             description="The percentage of time in which Sampler 0 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler0_busy"
+             units="percent"
+             symbol_name="Sampler0Busy"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler 1 Busy"
+             description="The percentage of time in which Sampler 1 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler1_busy"
+             units="percent"
+             symbol_name="Sampler1Busy"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Samplers Busy"
+             description="The percentage of time in which samplers have been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="$Sampler0Busy $Sampler1Busy FMAX"
+             underscore_name="samplers_busy"
+             units="percent"
+             symbol_name="SamplersBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI Fixed Pipe Throughput"
+             description="The total number of GPU memory bytes transferred between 3D Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD 64 UMUL"
+             underscore_name="gti_vf_throughput"
+             units="bytes"
+             symbol_name="GtiVfThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/3D Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler 0 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 0 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler0_bottleneck"
+             units="percent"
+             symbol_name="Sampler0Bottleneck"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="L3 Lookup Accesses w/o IC"
+             description="The total number of L3 cache lookup accesses w/o IC."
+             data_type="uint64"
+             equation="$SamplerL1Misses $ShaderMemoryAccesses UADD"
+             underscore_name="l3_lookups"
+             units="messages"
+             symbol_name="L3Lookups"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Depth Throughput"
+             description="The total number of GPU memory bytes transferred between depth caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 0 READ C 1 READ UADD 64 UMUL"
+             underscore_name="gti_depth_throughput"
+             units="bytes"
+             symbol_name="GtiDepthThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Depth Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Samplers Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which samplers have been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="$Sampler0Bottleneck $Sampler1Bottleneck FMAX"
+             max_equation="100"
+             underscore_name="sampler_bottleneck"
+             units="percent"
+             symbol_name="SamplerBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Indicate System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI HDC TLB Lookup Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and HDC, when HDC is doing TLB lookups."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_hdc_lookups_throughput"
+             units="bytes"
+             symbol_name="GtiHdcLookupsThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI RCC Throughput"
+             description="The total number of GPU memory bytes transferred between render color caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 2 READ C 3 READ UADD 64 UMUL"
+             underscore_name="gti_rcc_throughput"
+             units="bytes"
+             symbol_name="GtiRccThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Color Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x03 UGTE"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x166C00F0" />
+        <register type="NOA" address="0x00009888" value="0x12120280" />
+        <register type="NOA" address="0x00009888" value="0x12320280" />
+        <register type="NOA" address="0x00009888" value="0x11930317" />
+        <register type="NOA" address="0x00009888" value="0x159303DF" />
+        <register type="NOA" address="0x00009888" value="0x3F900C00" />
+        <register type="NOA" address="0x00009888" value="0x419000A0" />
+        <register type="NOA" address="0x00009888" value="0x002D1000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x082D5000" />
+        <register type="NOA" address="0x00009888" value="0x0A2D1000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E0800" />
+        <register type="NOA" address="0x00009888" value="0x0E2E5900" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4C4000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E2000" />
+        <register type="NOA" address="0x00009888" value="0x1C4F0010" />
+        <register type="NOA" address="0x00009888" value="0x0A6C0053" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A0FCC00" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0002" />
+        <register type="NOA" address="0x00009888" value="0x1C2C0040" />
+        <register type="NOA" address="0x00009888" value="0x00101000" />
+        <register type="NOA" address="0x00009888" value="0x04101000" />
+        <register type="NOA" address="0x00009888" value="0x00114000" />
+        <register type="NOA" address="0x00009888" value="0x08114000" />
+        <register type="NOA" address="0x00009888" value="0x00120020" />
+        <register type="NOA" address="0x00009888" value="0x08120021" />
+        <register type="NOA" address="0x00009888" value="0x00141000" />
+        <register type="NOA" address="0x00009888" value="0x08141000" />
+        <register type="NOA" address="0x00009888" value="0x02308000" />
+        <register type="NOA" address="0x00009888" value="0x04302000" />
+        <register type="NOA" address="0x00009888" value="0x06318000" />
+        <register type="NOA" address="0x00009888" value="0x08318000" />
+        <register type="NOA" address="0x00009888" value="0x06320800" />
+        <register type="NOA" address="0x00009888" value="0x08320840" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x06344000" />
+        <register type="NOA" address="0x00009888" value="0x08344000" />
+        <register type="NOA" address="0x00009888" value="0x0D931831" />
+        <register type="NOA" address="0x00009888" value="0x0F939F3F" />
+        <register type="NOA" address="0x00009888" value="0x01939E80" />
+        <register type="NOA" address="0x00009888" value="0x039303BC" />
+        <register type="NOA" address="0x00009888" value="0x0593000E" />
+        <register type="NOA" address="0x00009888" value="0x1993002A" />
+        <register type="NOA" address="0x00009888" value="0x07930000" />
+        <register type="NOA" address="0x00009888" value="0x09930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900177" />
+        <register type="NOA" address="0x00009888" value="0x1F900187" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25904000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x53901110" />
+        <register type="NOA" address="0x00009888" value="0x43900423" />
+        <register type="NOA" address="0x00009888" value="0x55900111" />
+        <register type="NOA" address="0x00009888" value="0x47900C02" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900020" />
+        <register type="NOA" address="0x00009888" value="0x59901111" />
+        <register type="NOA" address="0x00009888" value="0x4B900421" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900001" />
+        <register type="NOA" address="0x00009888" value="0x45900821" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Basic Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_basic"
+       hw_config_guid="012d72cf-82a9-4d25-8ddf-74076fd30797"
+       chipset="BXT"
+       symbol_name="ComputeBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Bytes Read"
+             description="The total number of untyped memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 6 READ B 7 READ UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_read"
+             units="bytes"
+             symbol_name="UntypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Bytes Written"
+             description="The total number of typed memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 3 READ B 4 READ UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_written"
+             units="bytes"
+             symbol_name="TypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Writes"
+             description="The total number of untyped memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 1 READ C 2 READ UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_written"
+             units="bytes"
+             symbol_name="UntypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Typed Bytes Read"
+             description="The total number of typed memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 0 READ B 1 READ UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_read"
+             units="bytes"
+             symbol_name="TypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL  $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x124F1C00" />
+        <register type="NOA" address="0x00009888" value="0x39900340" />
+        <register type="NOA" address="0x00009888" value="0x3F900C00" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x002D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x082D4000" />
+        <register type="NOA" address="0x00009888" value="0x0A2D1000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E1400" />
+        <register type="NOA" address="0x00009888" value="0x0E2E5100" />
+        <register type="NOA" address="0x00009888" value="0x102E0114" />
+        <register type="NOA" address="0x00009888" value="0x044CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4C4000" />
+        <register type="NOA" address="0x00009888" value="0x104C8000" />
+        <register type="NOA" address="0x00009888" value="0x124C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x004EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x004F6B42" />
+        <register type="NOA" address="0x00009888" value="0x064F6200" />
+        <register type="NOA" address="0x00009888" value="0x084F4100" />
+        <register type="NOA" address="0x00009888" value="0x0A4F0061" />
+        <register type="NOA" address="0x00009888" value="0x0C4F6C4C" />
+        <register type="NOA" address="0x00009888" value="0x0E4F4B00" />
+        <register type="NOA" address="0x00009888" value="0x1A4F0000" />
+        <register type="NOA" address="0x00009888" value="0x1C4F0000" />
+        <register type="NOA" address="0x00009888" value="0x180F5000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F8800" />
+        <register type="NOA" address="0x00009888" value="0x1C0F08A2" />
+        <register type="NOA" address="0x00009888" value="0x182C4000" />
+        <register type="NOA" address="0x00009888" value="0x1C2C1451" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0001" />
+        <register type="NOA" address="0x00009888" value="0x1A2C0010" />
+        <register type="NOA" address="0x00009888" value="0x01938000" />
+        <register type="NOA" address="0x00009888" value="0x0F938000" />
+        <register type="NOA" address="0x00009888" value="0x19938A28" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x19900177" />
+        <register type="NOA" address="0x00009888" value="0x1B900178" />
+        <register type="NOA" address="0x00009888" value="0x1D900125" />
+        <register type="NOA" address="0x00009888" value="0x1F900123" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x25904000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x53901000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x55900111" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Render Metrics for 3D Pipeline Profile Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_pipe_profile"
+       hw_config_guid="ce416533-e49e-4211-80af-ec513590a914"
+       chipset="BXT"
+       symbol_name="RenderPipeProfile"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which vertex shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_bottleneck"
+             units="percent"
+             symbol_name="VsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Hi-Depth Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which early hierarchical depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hi_depth_bottleneck"
+             units="percent"
+             symbol_name="HiDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which geometry shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gs_bottleneck"
+             units="percent"
+             symbol_name="GsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Geometry Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="BC Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which barycentric coordinates calculation pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="bc_bottleneck"
+             units="percent"
+             symbol_name="BcBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Barycentric Calc"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Stall"
+             description="The percentage of time in which hull shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_stall"
+             units="percent"
+             symbol_name="HsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Hull Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="VF Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which vertex fetch pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vf_bottleneck"
+             units="percent"
+             symbol_name="VfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Input Assembler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Strip-Fans Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which strip-fans pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="10"
+             equation="B 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_bottleneck"
+             units="percent"
+             symbol_name="SfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SF Stall"
+             description="The percentage of time in which strip-fans pipeline stage was stalled."
+             data_type="float"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_stall"
+             units="percent"
+             symbol_name="SfStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Bottleneck"
+             low_watermark="3"
+             description="The percentage of time in which hull shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="9"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_bottleneck"
+             units="percent"
+             symbol_name="HsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Hull Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CL Stall"
+             description="The percentage of time in which clipper pipeline stage was stalled."
+             data_type="float"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_stall"
+             units="percent"
+             symbol_name="ClStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Clipper"
+             />
+    <counter name="SO Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which stream output pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_bottleneck"
+             units="percent"
+             symbol_name="SoBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Stream Output"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="DS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which domain shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_bottleneck"
+             units="percent"
+             symbol_name="DsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Domain Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Clipper Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which clipper pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_bottleneck"
+             units="percent"
+             symbol_name="ClBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Clipper"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Stall"
+             description="The percentage of time in which domain shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_stall"
+             units="percent"
+             symbol_name="DsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Domain Shader"
+             />
+    <counter name="Early Depth Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which early depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="early_depth_bottleneck"
+             units="percent"
+             symbol_name="EarlyDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SO Stall"
+             description="The percentage of time in which stream-output pipeline stage was stalled."
+             data_type="float"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_stall"
+             units="percent"
+             symbol_name="SoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Stream Output"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x0C2E001F" />
+        <register type="NOA" address="0x00009888" value="0x0A2F0000" />
+        <register type="NOA" address="0x00009888" value="0x10186800" />
+        <register type="NOA" address="0x00009888" value="0x11810019" />
+        <register type="NOA" address="0x00009888" value="0x15810013" />
+        <register type="NOA" address="0x00009888" value="0x13820020" />
+        <register type="NOA" address="0x00009888" value="0x11830020" />
+        <register type="NOA" address="0x00009888" value="0x17840000" />
+        <register type="NOA" address="0x00009888" value="0x11860007" />
+        <register type="NOA" address="0x00009888" value="0x21860000" />
+        <register type="NOA" address="0x00009888" value="0x178703E0" />
+        <register type="NOA" address="0x00009888" value="0x0C2D8000" />
+        <register type="NOA" address="0x00009888" value="0x042D4000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x022E5400" />
+        <register type="NOA" address="0x00009888" value="0x002E0000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0080" />
+        <register type="NOA" address="0x00009888" value="0x082F0040" />
+        <register type="NOA" address="0x00009888" value="0x002F0000" />
+        <register type="NOA" address="0x00009888" value="0x06143000" />
+        <register type="NOA" address="0x00009888" value="0x06174000" />
+        <register type="NOA" address="0x00009888" value="0x06180012" />
+        <register type="NOA" address="0x00009888" value="0x00180000" />
+        <register type="NOA" address="0x00009888" value="0x0D804000" />
+        <register type="NOA" address="0x00009888" value="0x0F804000" />
+        <register type="NOA" address="0x00009888" value="0x05804000" />
+        <register type="NOA" address="0x00009888" value="0x09810200" />
+        <register type="NOA" address="0x00009888" value="0x0B810030" />
+        <register type="NOA" address="0x00009888" value="0x03810003" />
+        <register type="NOA" address="0x00009888" value="0x21819140" />
+        <register type="NOA" address="0x00009888" value="0x23819050" />
+        <register type="NOA" address="0x00009888" value="0x25810018" />
+        <register type="NOA" address="0x00009888" value="0x0B820980" />
+        <register type="NOA" address="0x00009888" value="0x03820D80" />
+        <register type="NOA" address="0x00009888" value="0x11820000" />
+        <register type="NOA" address="0x00009888" value="0x0182C000" />
+        <register type="NOA" address="0x00009888" value="0x07828000" />
+        <register type="NOA" address="0x00009888" value="0x09824000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x0D830004" />
+        <register type="NOA" address="0x00009888" value="0x0583000C" />
+        <register type="NOA" address="0x00009888" value="0x0F831000" />
+        <register type="NOA" address="0x00009888" value="0x01848072" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x09844000" />
+        <register type="NOA" address="0x00009888" value="0x0F848000" />
+        <register type="NOA" address="0x00009888" value="0x07860000" />
+        <register type="NOA" address="0x00009888" value="0x09860092" />
+        <register type="NOA" address="0x00009888" value="0x0F860400" />
+        <register type="NOA" address="0x00009888" value="0x01869100" />
+        <register type="NOA" address="0x00009888" value="0x0F870065" />
+        <register type="NOA" address="0x00009888" value="0x01870000" />
+        <register type="NOA" address="0x00009888" value="0x19930800" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x1B952000" />
+        <register type="NOA" address="0x00009888" value="0x1D955055" />
+        <register type="NOA" address="0x00009888" value="0x1F951455" />
+        <register type="NOA" address="0x00009888" value="0x0992A000" />
+        <register type="NOA" address="0x00009888" value="0x0F928000" />
+        <register type="NOA" address="0x00009888" value="0x1192A800" />
+        <register type="NOA" address="0x00009888" value="0x1392028A" />
+        <register type="NOA" address="0x00009888" value="0x0B92A000" />
+        <register type="NOA" address="0x00009888" value="0x0D922000" />
+        <register type="NOA" address="0x00009888" value="0x13908000" />
+        <register type="NOA" address="0x00009888" value="0x21908000" />
+        <register type="NOA" address="0x00009888" value="0x23908000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27908000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F908000" />
+        <register type="NOA" address="0x00009888" value="0x31908000" />
+        <register type="NOA" address="0x00009888" value="0x15908000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900C01" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900863" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900061" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C22" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFEA" />
+        <register type="OA" address="0x00002774" value="0x00007FFC" />
+        <register type="OA" address="0x00002778" value="0x0007AFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000F5FD" />
+        <register type="OA" address="0x00002780" value="0x00079FFA" />
+        <register type="OA" address="0x00002784" value="0x0000F3FB" />
+        <register type="OA" address="0x00002788" value="0x0007BF7A" />
+        <register type="OA" address="0x0000278C" value="0x0000F7E7" />
+        <register type="OA" address="0x00002790" value="0x0007FEFA" />
+        <register type="OA" address="0x00002794" value="0x0000F7CF" />
+        <register type="OA" address="0x00002798" value="0x00077FFA" />
+        <register type="OA" address="0x0000279C" value="0x0000EFDF" />
+        <register type="OA" address="0x000027A0" value="0x0006FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000CFBF" />
+        <register type="OA" address="0x000027A8" value="0x0003FFFA" />
+        <register type="OA" address="0x000027AC" value="0x00005F7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Reads Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_reads"
+       hw_config_guid="398e2452-18d7-42d0-b241-e4d0a9148ada"
+       chipset="BXT"
+       symbol_name="MemoryReads"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank0Reads"
+             description="The total number of GTI memory reads from L3 Bank 0 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_reads"
+             units="messages"
+             symbol_name="GtiL3Bank0Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all accesses from GTI to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiL3Bank3Reads"
+             description="The total number of GTI memory reads from L3 Bank 3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_reads"
+             units="messages"
+             symbol_name="GtiL3Bank3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiRsMemoryReads"
+             description="The total number of GTI memory reads from Resource Streamer."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_rs_memory_reads"
+             units="messages"
+             symbol_name="GtiRsMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Resource Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiHizMemoryReads"
+             description="The total number of GTI memory reads from Hierarchical Depth Cache (Hi-Depth Cache misses)."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_reads"
+             units="messages"
+             symbol_name="GtiHizMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="GtiRccMemoryReads"
+             description="The total number of GTI memory reads from Render Color Cache (Render Color Cache misses)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_reads"
+             units="messages"
+             symbol_name="GtiRccMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank1Reads"
+             description="The total number of GTI memory reads from L3 Bank 1 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_reads"
+             units="messages"
+             symbol_name="GtiL3Bank1Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiCmdStreamerMemoryReads"
+             description="The total number of GTI memory reads from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_reads"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="GtiL3Bank2Reads"
+             description="The total number of GTI memory reads from L3 Bank 2 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_reads"
+             units="messages"
+             symbol_name="GtiL3Bank2Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiMemoryReads"
+             description="The total number of GTI memory reads."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_reads"
+             units="messages"
+             symbol_name="GtiMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GtiRczMemoryReads"
+             description="The total number of GTI memory reads from Render Depth Cache (Render Depth Cache misses)."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_reads"
+             units="messages"
+             symbol_name="GtiRczMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="GtiMscMemoryReads"
+             description="The total number of GTI memory reads from Multisampling Color Cache (Multisampling Color Cache misses)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_reads"
+             units="messages"
+             symbol_name="GtiMscMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiVfMemoryReads"
+             description="The total number of GTI memory reads from Vertex Fetch."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="gti_vf_memory_reads"
+             units="messages"
+             symbol_name="GtiVfMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Vertex Fetch"
+             />
+    <counter name="GtiStcMemoryReads"
+             description="The total number of GTI memory reads from Stencil Cache (Stencil Cache misses)."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_reads"
+             units="messages"
+             symbol_name="GtiStcMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Reads"
+             description="The total number of GTI memory reads from L3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Reads $GtiL3Bank1Reads $GtiL3Bank2Reads $GtiL3Bank3Reads UADD UADD UADD"
+             underscore_name="gti_l3_reads"
+             units="messages"
+             symbol_name="GtiL3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x19800343" />
+        <register type="NOA" address="0x00009888" value="0x39900340" />
+        <register type="NOA" address="0x00009888" value="0x3F901000" />
+        <register type="NOA" address="0x00009888" value="0x41900003" />
+        <register type="NOA" address="0x00009888" value="0x03803180" />
+        <register type="NOA" address="0x00009888" value="0x058035E2" />
+        <register type="NOA" address="0x00009888" value="0x0780006A" />
+        <register type="NOA" address="0x00009888" value="0x11800000" />
+        <register type="NOA" address="0x00009888" value="0x2181A000" />
+        <register type="NOA" address="0x00009888" value="0x2381000A" />
+        <register type="NOA" address="0x00009888" value="0x1D950550" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D92A000" />
+        <register type="NOA" address="0x00009888" value="0x0F922000" />
+        <register type="NOA" address="0x00009888" value="0x13900170" />
+        <register type="NOA" address="0x00009888" value="0x21900171" />
+        <register type="NOA" address="0x00009888" value="0x23900172" />
+        <register type="NOA" address="0x00009888" value="0x25900173" />
+        <register type="NOA" address="0x00009888" value="0x27900174" />
+        <register type="NOA" address="0x00009888" value="0x29900175" />
+        <register type="NOA" address="0x00009888" value="0x2B900176" />
+        <register type="NOA" address="0x00009888" value="0x2D900177" />
+        <register type="NOA" address="0x00009888" value="0x2F90017F" />
+        <register type="NOA" address="0x00009888" value="0x31900125" />
+        <register type="NOA" address="0x00009888" value="0x15900123" />
+        <register type="NOA" address="0x00009888" value="0x17900121" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43901084" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47901080" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49901084" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B901084" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900004" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F872" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Writes Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_writes"
+       hw_config_guid="d324a0d6-7269-4847-a5c2-6f71ddc7fed5"
+       chipset="BXT"
+       symbol_name="MemoryWrites"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiMemoryWrites"
+             description="The total number of GTI memory writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_writes"
+             units="messages"
+             symbol_name="GtiMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all GTI accesses to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiMscMemoryWrites"
+             description="The total number of GTI memory writes from Multisampling Color Cache (Multisampling Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_writes"
+             units="messages"
+             symbol_name="GtiMscMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiCmdStreamerMemoryWrites"
+             description="The total number of GTI memory writes from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_writes"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiL3Bank0Writes"
+             description="The total number of GTI memory writes from L3 Bank 0 (L3 Bank 0 invalidations)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_writes"
+             units="messages"
+             symbol_name="GtiL3Bank0Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank1Writes"
+             description="The total number of GTI memory writes from L3 Bank 1 (L3 Bank 1 invalidations)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_writes"
+             units="messages"
+             symbol_name="GtiL3Bank1Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank2Writes"
+             description="The total number of GTI memory writes from L3 Bank 2 (L3 Bank 2 invalidations)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_writes"
+             units="messages"
+             symbol_name="GtiL3Bank2Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank3Writes"
+             description="The total number of GTI memory writes from L3 Bank 3 (L3 Bank 3 invalidations)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_writes"
+             units="messages"
+             symbol_name="GtiL3Bank3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Writes"
+             description="The total number of GTI memory writes from L3 (L3 invalidations)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Writes $GtiL3Bank1Writes $GtiL3Bank2Writes $GtiL3Bank3Writes UADD UADD UADD"
+             underscore_name="gti_l3_writes"
+             units="messages"
+             symbol_name="GtiL3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiRccMemoryWrites"
+             description="The total number of GTI memory writes from Render Color Cache (Render Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_writes"
+             units="messages"
+             symbol_name="GtiRccMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiSoMemoryWrites"
+             description="The total number of GTI memory writes from Stream Output."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_so_memory_writes"
+             units="messages"
+             symbol_name="GtiSoMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Stream Output"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiStcMemoryWrites"
+             description="The total number of GTI memory writes from Stencil Cache."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_writes"
+             units="messages"
+             symbol_name="GtiStcMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GtiRczMemoryWrites"
+             description="The total number of GTI memory writes from Render Depth Cache."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_writes"
+             units="messages"
+             symbol_name="GtiRczMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="GtiHizMemoryWrites"
+             description="The total number of GTI memory writes from Hierarchical Depth Cache."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_writes"
+             units="messages"
+             symbol_name="GtiHizMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x19800343" />
+        <register type="NOA" address="0x00009888" value="0x39900340" />
+        <register type="NOA" address="0x00009888" value="0x3F900000" />
+        <register type="NOA" address="0x00009888" value="0x41900080" />
+        <register type="NOA" address="0x00009888" value="0x03803180" />
+        <register type="NOA" address="0x00009888" value="0x058035E2" />
+        <register type="NOA" address="0x00009888" value="0x0780006A" />
+        <register type="NOA" address="0x00009888" value="0x11800000" />
+        <register type="NOA" address="0x00009888" value="0x2181A000" />
+        <register type="NOA" address="0x00009888" value="0x2381000A" />
+        <register type="NOA" address="0x00009888" value="0x1D950550" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D92A000" />
+        <register type="NOA" address="0x00009888" value="0x0F922000" />
+        <register type="NOA" address="0x00009888" value="0x13900180" />
+        <register type="NOA" address="0x00009888" value="0x21900181" />
+        <register type="NOA" address="0x00009888" value="0x23900182" />
+        <register type="NOA" address="0x00009888" value="0x25900183" />
+        <register type="NOA" address="0x00009888" value="0x27900184" />
+        <register type="NOA" address="0x00009888" value="0x29900185" />
+        <register type="NOA" address="0x00009888" value="0x2B900186" />
+        <register type="NOA" address="0x00009888" value="0x2D900187" />
+        <register type="NOA" address="0x00009888" value="0x2F900170" />
+        <register type="NOA" address="0x00009888" value="0x31900125" />
+        <register type="NOA" address="0x00009888" value="0x15900123" />
+        <register type="NOA" address="0x00009888" value="0x17900121" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43901084" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47901080" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49901084" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B901084" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900004" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F822" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extended Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extended"
+       hw_config_guid="caf3596a-7bb1-4dec-b3b3-2a080d283b49"
+       chipset="BXT"
+       symbol_name="ComputeExtended"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Writes 0"
+             description="The subslice 0 typed writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="typed_writes0"
+             units="messages"
+             symbol_name="TypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuTypedAtomics0"
+             description="The subslice 0 EU Typed Atomics subslice 0."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_typed_atomics0"
+             units="messages"
+             symbol_name="EuTypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Atomics 0"
+             description="The subslice 0 typed atomics."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="typed_atomics0"
+             units="messages"
+             symbol_name="TypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedAtomicsPerCacheLine"
+             description="The ratio of EU typed atomics requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedAtomics0 $TypedAtomics0 FDIV"
+             underscore_name="typed_atomics_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedAtomicsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedReads0"
+             description="The subslice 0 EU Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="eu_untyped_reads0"
+             units="messages"
+             symbol_name="EuUntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Writes 0"
+             description="The subslice 0 untyped writes (including SLM writes)."
+             data_type="uint64"
+             equation="C 1 READ"
+             underscore_name="untyped_writes0"
+             units="messages"
+             symbol_name="UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedAtomics0"
+             description="The subslice 0 EU Untyped Atomics subslice 0."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="eu_untyped_atomics0"
+             units="messages"
+             symbol_name="EuUntypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedWrites0"
+             description="The subslice 0 EU Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="eu_untyped_writes0"
+             units="messages"
+             symbol_name="EuUntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedWrites0"
+             description="The subslice 0 EU A64 Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="eu_a64_untyped_writes0"
+             units="messages"
+             symbol_name="EuA64UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedWritesPerCacheLine"
+             description="The ratio of EU untyped write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuUntypedWrites0 $EuA64UntypedWrites0 UADD $UntypedWrites0 FDIV"
+             underscore_name="untyped_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedWrites0"
+             description="The subslice 0 EU Typed Writes subslice 0."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="eu_typed_writes0"
+             units="messages"
+             symbol_name="EuTypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedWritesPerCacheLine"
+             description="The ratio of EU typed write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedWrites0 $TypedWrites0 FDIV"
+             underscore_name="typed_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Reads 0"
+             description="The subslice 0 typed reads."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="typed_reads0"
+             units="messages"
+             symbol_name="TypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Reads 0"
+             description="The subslice 0 untyped reads (including SLM reads)."
+             data_type="uint64"
+             equation="C 3 READ"
+             underscore_name="untyped_reads0"
+             units="messages"
+             symbol_name="UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedReads0"
+             description="The subslice 0 EU A64 Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="eu_a64_untyped_reads0"
+             units="messages"
+             symbol_name="EuA64UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedReads0"
+             description="The subslice 0 EU Typed Reads subslice 0."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="eu_typed_reads0"
+             units="messages"
+             symbol_name="EuTypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedReadsPerCacheLine"
+             description="The ratio of EU untyped read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuUntypedReads0 $EuA64UntypedReads0 UADD $UntypedReads0 FDIV"
+             underscore_name="untyped_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedReadsPerCacheLine"
+             description="The ratio of EU typed read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuTypedReads0 $TypedReads0 FDIV"
+             underscore_name="typed_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x141C0160" />
+        <register type="NOA" address="0x00009888" value="0x161C0015" />
+        <register type="NOA" address="0x00009888" value="0x181C0120" />
+        <register type="NOA" address="0x00009888" value="0x002D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x082D5000" />
+        <register type="NOA" address="0x00009888" value="0x0A2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5400" />
+        <register type="NOA" address="0x00009888" value="0x0E2E5515" />
+        <register type="NOA" address="0x00009888" value="0x102E0155" />
+        <register type="NOA" address="0x00009888" value="0x044CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4CC000" />
+        <register type="NOA" address="0x00009888" value="0x0E4CC000" />
+        <register type="NOA" address="0x00009888" value="0x104C8000" />
+        <register type="NOA" address="0x00009888" value="0x124C8000" />
+        <register type="NOA" address="0x00009888" value="0x144C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x064CC000" />
+        <register type="NOA" address="0x00009888" value="0x084CC000" />
+        <register type="NOA" address="0x00009888" value="0x004EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084EA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4EA000" />
+        <register type="NOA" address="0x00009888" value="0x0C4EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4EA000" />
+        <register type="NOA" address="0x00009888" value="0x024EA000" />
+        <register type="NOA" address="0x00009888" value="0x044EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4F4B41" />
+        <register type="NOA" address="0x00009888" value="0x004F4200" />
+        <register type="NOA" address="0x00009888" value="0x024F404C" />
+        <register type="NOA" address="0x00009888" value="0x1C4F0000" />
+        <register type="NOA" address="0x00009888" value="0x1A4F0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x061B8000" />
+        <register type="NOA" address="0x00009888" value="0x081BC000" />
+        <register type="NOA" address="0x00009888" value="0x0A1BC000" />
+        <register type="NOA" address="0x00009888" value="0x0C1BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x001C0031" />
+        <register type="NOA" address="0x00009888" value="0x061C1900" />
+        <register type="NOA" address="0x00009888" value="0x081C1A33" />
+        <register type="NOA" address="0x00009888" value="0x0A1C1B35" />
+        <register type="NOA" address="0x00009888" value="0x0C1C3337" />
+        <register type="NOA" address="0x00009888" value="0x041C31C7" />
+        <register type="NOA" address="0x00009888" value="0x180F5000" />
+        <register type="NOA" address="0x00009888" value="0x1A0FA8AA" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0AAA" />
+        <register type="NOA" address="0x00009888" value="0x182C8000" />
+        <register type="NOA" address="0x00009888" value="0x1C2C6AAA" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0001" />
+        <register type="NOA" address="0x00009888" value="0x1A2C2950" />
+        <register type="NOA" address="0x00009888" value="0x01938000" />
+        <register type="NOA" address="0x00009888" value="0x0F938000" />
+        <register type="NOA" address="0x00009888" value="0x1993AAAA" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25904000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x29904000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900420" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900400" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900001" />
+        <register type="NOA" address="0x00009888" value="0x45900001" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FC2A" />
+        <register type="OA" address="0x00002774" value="0x0000BF00" />
+        <register type="OA" address="0x00002778" value="0x0007FC6A" />
+        <register type="OA" address="0x0000277C" value="0x0000BF00" />
+        <register type="OA" address="0x00002780" value="0x0007FC92" />
+        <register type="OA" address="0x00002784" value="0x0000BF00" />
+        <register type="OA" address="0x00002788" value="0x0007FCA2" />
+        <register type="OA" address="0x0000278C" value="0x0000BF00" />
+        <register type="OA" address="0x00002790" value="0x0007FC32" />
+        <register type="OA" address="0x00002794" value="0x0000BF00" />
+        <register type="OA" address="0x00002798" value="0x0007FC9A" />
+        <register type="OA" address="0x0000279C" value="0x0000BF00" />
+        <register type="OA" address="0x000027A0" value="0x0007FE6A" />
+        <register type="OA" address="0x000027A4" value="0x0000BF00" />
+        <register type="OA" address="0x000027A8" value="0x0007FE7A" />
+        <register type="OA" address="0x000027AC" value="0x0000BF00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics L3 Cache Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_l3_cache"
+       hw_config_guid="49b956e2-d5b9-47e0-9d8a-cee5e8cec527"
+       chipset="BXT"
+       symbol_name="ComputeL3Cache"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 03 Accesses"
+             description="The total number of accesses to L3 Bank 03."
+             data_type="uint64"
+             equation="B 3 READ 2 UMUL"
+             underscore_name="l3_bank03_accesses"
+             units="messages"
+             symbol_name="L3Bank03Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Accesses"
+             description="The total number of L3 accesses from all entities."
+             data_type="uint64"
+             equation="C 0 READ C 1 READ B 2 READ B 3 READ UADD UADD UADD 2 UMUL"
+             underscore_name="l3_accesses"
+             units="messages"
+             symbol_name="L3Accesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 29 READ 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="EU FPU0 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu0_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ C 5 READ UADD"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="L3 Bank 00 Accesses"
+             description="The total number of accesses to L3 Bank 00."
+             data_type="uint64"
+             equation="C 0 READ 2 UMUL"
+             underscore_name="l3_bank00_accesses"
+             units="messages"
+             symbol_name="L3Bank00Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU0 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 19 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu0_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 14 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu1_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ C 5 READ UADD 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 00 IC Accesses"
+             description="The total number of accesses to L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 0 READ B 1 READ UADD 2 UMUL $L3Bank00Accesses UMIN"
+             underscore_name="l3_bank00_ic_accesses"
+             units="messages"
+             symbol_name="L3Bank00IcAccesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 00 IC Hits"
+             description="The total number of hits in L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 1 READ 2 UMUL $L3Bank00IcAccesses UMIN"
+             underscore_name="l3_bank00_ic_hits"
+             units="messages"
+             symbol_name="L3Bank00IcHits"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="Sampler Accesses"
+             description="The total number of messages sent to samplers."
+             data_type="uint64"
+             equation="A 28 READ"
+             underscore_name="sampler_accesses"
+             units="messages"
+             symbol_name="SamplerAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler"
+             />
+    <counter name="L3 Bank 01 Accesses"
+             description="The total number of accesses to L3 Bank 01."
+             data_type="uint64"
+             equation="C 1 READ 2 UMUL"
+             underscore_name="l3_bank01_accesses"
+             units="messages"
+             symbol_name="L3Bank01Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 20 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu1_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="L3 Bank 02 Accesses"
+             description="The total number of accesses to L3 Bank 02."
+             data_type="uint64"
+             equation="B 2 READ 2 UMUL"
+             underscore_name="l3_bank02_accesses"
+             units="messages"
+             symbol_name="L3Bank02Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="L3 Total Throughput"
+             description="The total number of GPU memory bytes transferred via L3."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Accesses 64 UMUL"
+             underscore_name="l3_total_throughput"
+             units="bytes"
+             symbol_name="L3TotalThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="EU FPU1 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C03B0" />
+        <register type="NOA" address="0x00009888" value="0x1593001E" />
+        <register type="NOA" address="0x00009888" value="0x3F900C00" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x002D1000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x082D5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E0400" />
+        <register type="NOA" address="0x00009888" value="0x0E2E1500" />
+        <register type="NOA" address="0x00009888" value="0x102E0140" />
+        <register type="NOA" address="0x00009888" value="0x044C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4CC000" />
+        <register type="NOA" address="0x00009888" value="0x144C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x004E2000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4EA000" />
+        <register type="NOA" address="0x00009888" value="0x1A4F4001" />
+        <register type="NOA" address="0x00009888" value="0x1C4F5005" />
+        <register type="NOA" address="0x00009888" value="0x006C0051" />
+        <register type="NOA" address="0x00009888" value="0x066C5000" />
+        <register type="NOA" address="0x00009888" value="0x086C5C5D" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5E5F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x146C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x180F1000" />
+        <register type="NOA" address="0x00009888" value="0x1A0FA800" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0A00" />
+        <register type="NOA" address="0x00009888" value="0x182C4000" />
+        <register type="NOA" address="0x00009888" value="0x1C2C4015" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0001" />
+        <register type="NOA" address="0x00009888" value="0x03931980" />
+        <register type="NOA" address="0x00009888" value="0x05930032" />
+        <register type="NOA" address="0x00009888" value="0x11930000" />
+        <register type="NOA" address="0x00009888" value="0x01938000" />
+        <register type="NOA" address="0x00009888" value="0x0F938000" />
+        <register type="NOA" address="0x00009888" value="0x1993A00A" />
+        <register type="NOA" address="0x00009888" value="0x07930000" />
+        <register type="NOA" address="0x00009888" value="0x09930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900177" />
+        <register type="NOA" address="0x00009888" value="0x1F900178" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x53901000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x55900111" />
+        <register type="NOA" address="0x00009888" value="0x47900001" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900000" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFFA" />
+        <register type="OA" address="0x00002774" value="0x0000FEFE" />
+        <register type="OA" address="0x00002778" value="0x0007FFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000FEFD" />
+        <register type="OA" address="0x00002790" value="0x0007FFFA" />
+        <register type="OA" address="0x00002794" value="0x0000FBEF" />
+        <register type="OA" address="0x00002798" value="0x0007FFFA" />
+        <register type="OA" address="0x0000279C" value="0x0000FBDF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00101100" />
+        <register type="FLEX" address="0x0000E45C" value="0x00201200" />
+        <register type="FLEX" address="0x0000E55C" value="0x00301300" />
+        <register type="FLEX" address="0x0000E65C" value="0x00401400" />
+    </register_config>
+  </set>
+
+  <set name="Metric set HDCAndSF"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="hdc_and_sf"
+       hw_config_guid="f64ef50a-bdba-4b35-8f09-203c13d8ee5a"
+       chipset="BXT"
+       symbol_name="HDCAndSF"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Polygon Data Ready"
+             description="The percentage of time in which geometry pipeline output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="poly_data_ready"
+             units="percent"
+             symbol_name="PolyDataReady"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="HDC stalled by L3 (s0.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ C 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader01_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader01AccessStalledOnL3"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ C 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader00_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader00AccessStalledOnL3"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F0232" />
+        <register type="NOA" address="0x00009888" value="0x124F4640" />
+        <register type="NOA" address="0x00009888" value="0x11834400" />
+        <register type="NOA" address="0x00009888" value="0x022D4000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0055" />
+        <register type="NOA" address="0x00009888" value="0x064C8000" />
+        <register type="NOA" address="0x00009888" value="0x084CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x024F6100" />
+        <register type="NOA" address="0x00009888" value="0x044F416B" />
+        <register type="NOA" address="0x00009888" value="0x064F004B" />
+        <register type="NOA" address="0x00009888" value="0x1A4F0000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F02A8" />
+        <register type="NOA" address="0x00009888" value="0x1A2C5500" />
+        <register type="NOA" address="0x00009888" value="0x0F808000" />
+        <register type="NOA" address="0x00009888" value="0x25810020" />
+        <register type="NOA" address="0x00009888" value="0x0F8305C0" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x1F951000" />
+        <register type="NOA" address="0x00009888" value="0x13920200" />
+        <register type="NOA" address="0x00009888" value="0x31908000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x10800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_1"
+       hw_config_guid="00ad5a41-7eab-4f7a-9103-49d411c67219"
+       chipset="BXT"
+       symbol_name="L3_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank1 Active"
+             description="The percentage of time in which slice0 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_active"
+             units="percent"
+             symbol_name="L30Bank1Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 L3 Bank1 Stalled"
+             description="The percentage of time in which slice0 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_stalled"
+             units="percent"
+             symbol_name="L30Bank1Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Active"
+             description="The percentage of time in which slice0 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_active"
+             units="percent"
+             symbol_name="L30Bank0Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Stalled"
+             description="The percentage of time in which slice0 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_stalled"
+             units="percent"
+             symbol_name="L30Bank0Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x03 UGTE"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x12643400" />
+        <register type="NOA" address="0x00009888" value="0x12653400" />
+        <register type="NOA" address="0x00009888" value="0x106C6800" />
+        <register type="NOA" address="0x00009888" value="0x126C001E" />
+        <register type="NOA" address="0x00009888" value="0x166C0010" />
+        <register type="NOA" address="0x00009888" value="0x0C2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x002D4000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x102E0154" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0055" />
+        <register type="NOA" address="0x00009888" value="0x104C8000" />
+        <register type="NOA" address="0x00009888" value="0x124C8000" />
+        <register type="NOA" address="0x00009888" value="0x144C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x044C8000" />
+        <register type="NOA" address="0x00009888" value="0x064CC000" />
+        <register type="NOA" address="0x00009888" value="0x084CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x0C4EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4EA000" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x024EA000" />
+        <register type="NOA" address="0x00009888" value="0x044EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x1C4F5500" />
+        <register type="NOA" address="0x00009888" value="0x1A4F1554" />
+        <register type="NOA" address="0x00009888" value="0x0A640024" />
+        <register type="NOA" address="0x00009888" value="0x10640000" />
+        <register type="NOA" address="0x00009888" value="0x04640000" />
+        <register type="NOA" address="0x00009888" value="0x0C650024" />
+        <register type="NOA" address="0x00009888" value="0x10650000" />
+        <register type="NOA" address="0x00009888" value="0x06650000" />
+        <register type="NOA" address="0x00009888" value="0x0C6C5327" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5425" />
+        <register type="NOA" address="0x00009888" value="0x006C2A00" />
+        <register type="NOA" address="0x00009888" value="0x026C285B" />
+        <register type="NOA" address="0x00009888" value="0x046C005C" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0900" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0AA0" />
+        <register type="NOA" address="0x00009888" value="0x180F4000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F02AA" />
+        <register type="NOA" address="0x00009888" value="0x1C2C5400" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0001" />
+        <register type="NOA" address="0x00009888" value="0x1A2C5550" />
+        <register type="NOA" address="0x00009888" value="0x1993AA00" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900421" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900001" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900420" />
+        <register type="NOA" address="0x00009888" value="0x45900021" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x03 ULT"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x14640340" />
+        <register type="NOA" address="0x00009888" value="0x14650340" />
+        <register type="NOA" address="0x00009888" value="0x106C6800" />
+        <register type="NOA" address="0x00009888" value="0x126C001E" />
+        <register type="NOA" address="0x00009888" value="0x166C0010" />
+        <register type="NOA" address="0x00009888" value="0x0C2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x002D4000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x102E0154" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0055" />
+        <register type="NOA" address="0x00009888" value="0x104C8000" />
+        <register type="NOA" address="0x00009888" value="0x124C8000" />
+        <register type="NOA" address="0x00009888" value="0x144C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x044C8000" />
+        <register type="NOA" address="0x00009888" value="0x064CC000" />
+        <register type="NOA" address="0x00009888" value="0x084CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x0C4EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4EA000" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x024EA000" />
+        <register type="NOA" address="0x00009888" value="0x044EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x1C4F5500" />
+        <register type="NOA" address="0x00009888" value="0x1A4F1554" />
+        <register type="NOA" address="0x00009888" value="0x04642400" />
+        <register type="NOA" address="0x00009888" value="0x22640000" />
+        <register type="NOA" address="0x00009888" value="0x1A640000" />
+        <register type="NOA" address="0x00009888" value="0x06650024" />
+        <register type="NOA" address="0x00009888" value="0x22650000" />
+        <register type="NOA" address="0x00009888" value="0x1C650000" />
+        <register type="NOA" address="0x00009888" value="0x0C6C5327" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5425" />
+        <register type="NOA" address="0x00009888" value="0x006C2A00" />
+        <register type="NOA" address="0x00009888" value="0x026C285B" />
+        <register type="NOA" address="0x00009888" value="0x046C005C" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0900" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0AA0" />
+        <register type="NOA" address="0x00009888" value="0x180F4000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F02AA" />
+        <register type="NOA" address="0x00009888" value="0x1C2C5400" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0001" />
+        <register type="NOA" address="0x00009888" value="0x1A2C5550" />
+        <register type="NOA" address="0x00009888" value="0x1993AA00" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900421" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900001" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900420" />
+        <register type="NOA" address="0x00009888" value="0x45900021" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set RasterizerAndPixelBackend"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="rasterizer_and_pixel_backend"
+       hw_config_guid="46dc44ca-491c-4cc1-a951-e7b3e62bf02b"
+       chipset="BXT"
+       symbol_name="RasterizerAndPixelBackend"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 Pixel Values Ready"
+             description="The percentage of time in which slice0 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values0_ready"
+             units="percent"
+             symbol_name="PixelValues0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Slice0 Rasterizer Input Available"
+             description="The percentage of time in which slice0 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_input_available"
+             units="percent"
+             symbol_name="Rasterizer0InputAvailable"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice0 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data0_ready"
+             units="percent"
+             symbol_name="PixelData0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Rasterizer Output Ready"
+             description="The percentage of time in which slice0 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_output_ready"
+             units="percent"
+             symbol_name="Rasterizer0OutputReady"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Slice0 PS Output Available"
+             description="The percentage of time in which slice0 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output0_available"
+             units="percent"
+             symbol_name="PSOutput0Available"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x102D7800" />
+        <register type="NOA" address="0x00009888" value="0x122D79E0" />
+        <register type="NOA" address="0x00009888" value="0x0C2F0004" />
+        <register type="NOA" address="0x00009888" value="0x100E3800" />
+        <register type="NOA" address="0x00009888" value="0x180F0005" />
+        <register type="NOA" address="0x00009888" value="0x002D0940" />
+        <register type="NOA" address="0x00009888" value="0x022D802F" />
+        <register type="NOA" address="0x00009888" value="0x042D4013" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0050" />
+        <register type="NOA" address="0x00009888" value="0x022F0010" />
+        <register type="NOA" address="0x00009888" value="0x002F0000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x040E0480" />
+        <register type="NOA" address="0x00009888" value="0x000E0000" />
+        <register type="NOA" address="0x00009888" value="0x060F0027" />
+        <register type="NOA" address="0x00009888" value="0x100F0000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F0040" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x439014A0" />
+        <register type="NOA" address="0x00009888" value="0x459000A4" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900001" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000EFFF" />
+        <register type="OA" address="0x00002778" value="0x00006000" />
+        <register type="OA" address="0x0000277C" value="0x0000F3FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler"
+       hw_config_guid="8364e2a8-af63-40af-b0d5-42969a255654"
+       chipset="BXT"
+       symbol_name="Sampler"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice2 Input Available"
+             description="The percentage of time in which slice0 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_input_available"
+             units="percent"
+             symbol_name="Sampler02InputAvailable"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice0 Input Available"
+             description="The percentage of time in which slice0 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_input_available"
+             units="percent"
+             symbol_name="Sampler00InputAvailable"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_output_ready"
+             units="percent"
+             symbol_name="Sampler02OutputReady"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice0 Subslice1 Input Available"
+             description="The percentage of time in which slice0 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_input_available"
+             units="percent"
+             symbol_name="Sampler01InputAvailable"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice0 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_output_ready"
+             units="percent"
+             symbol_name="Sampler00OutputReady"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice0 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_output_ready"
+             units="percent"
+             symbol_name="Sampler01OutputReady"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121300A0" />
+        <register type="NOA" address="0x00009888" value="0x141600AB" />
+        <register type="NOA" address="0x00009888" value="0x123300A0" />
+        <register type="NOA" address="0x00009888" value="0x143600AB" />
+        <register type="NOA" address="0x00009888" value="0x125300A0" />
+        <register type="NOA" address="0x00009888" value="0x145600AB" />
+        <register type="NOA" address="0x00009888" value="0x0C2D4000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x002D4000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x102E01A0" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0065" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x044C8000" />
+        <register type="NOA" address="0x00009888" value="0x064CC000" />
+        <register type="NOA" address="0x00009888" value="0x084C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x024EA000" />
+        <register type="NOA" address="0x00009888" value="0x044E2000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0800" />
+        <register type="NOA" address="0x00009888" value="0x180F4000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F023F" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0003" />
+        <register type="NOA" address="0x00009888" value="0x1A2CC030" />
+        <register type="NOA" address="0x00009888" value="0x04132180" />
+        <register type="NOA" address="0x00009888" value="0x02130000" />
+        <register type="NOA" address="0x00009888" value="0x0C148000" />
+        <register type="NOA" address="0x00009888" value="0x0E142000" />
+        <register type="NOA" address="0x00009888" value="0x04148000" />
+        <register type="NOA" address="0x00009888" value="0x1E150140" />
+        <register type="NOA" address="0x00009888" value="0x1C150040" />
+        <register type="NOA" address="0x00009888" value="0x0C163000" />
+        <register type="NOA" address="0x00009888" value="0x0E160068" />
+        <register type="NOA" address="0x00009888" value="0x10160000" />
+        <register type="NOA" address="0x00009888" value="0x18160000" />
+        <register type="NOA" address="0x00009888" value="0x0A164000" />
+        <register type="NOA" address="0x00009888" value="0x04330043" />
+        <register type="NOA" address="0x00009888" value="0x02330000" />
+        <register type="NOA" address="0x00009888" value="0x0234A000" />
+        <register type="NOA" address="0x00009888" value="0x04342000" />
+        <register type="NOA" address="0x00009888" value="0x1C350015" />
+        <register type="NOA" address="0x00009888" value="0x02363460" />
+        <register type="NOA" address="0x00009888" value="0x10360000" />
+        <register type="NOA" address="0x00009888" value="0x04360000" />
+        <register type="NOA" address="0x00009888" value="0x06360000" />
+        <register type="NOA" address="0x00009888" value="0x08364000" />
+        <register type="NOA" address="0x00009888" value="0x06530043" />
+        <register type="NOA" address="0x00009888" value="0x02530000" />
+        <register type="NOA" address="0x00009888" value="0x0E548000" />
+        <register type="NOA" address="0x00009888" value="0x00548000" />
+        <register type="NOA" address="0x00009888" value="0x06542000" />
+        <register type="NOA" address="0x00009888" value="0x1E550400" />
+        <register type="NOA" address="0x00009888" value="0x1A552000" />
+        <register type="NOA" address="0x00009888" value="0x1C550100" />
+        <register type="NOA" address="0x00009888" value="0x0E563000" />
+        <register type="NOA" address="0x00009888" value="0x00563400" />
+        <register type="NOA" address="0x00009888" value="0x10560000" />
+        <register type="NOA" address="0x00009888" value="0x18560000" />
+        <register type="NOA" address="0x00009888" value="0x02560000" />
+        <register type="NOA" address="0x00009888" value="0x0C564000" />
+        <register type="NOA" address="0x00009888" value="0x1993A800" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B9014A0" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900001" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900820" />
+        <register type="NOA" address="0x00009888" value="0x45901022" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_1"
+       hw_config_guid="175c8092-cb25-4d1e-8dc7-b4fdd39e2d92"
+       chipset="BXT"
+       symbol_name="TDL_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A0000" />
+        <register type="NOA" address="0x00009888" value="0x143A0000" />
+        <register type="NOA" address="0x00009888" value="0x145A0000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D4000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x002D4000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x102E0150" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E006A" />
+        <register type="NOA" address="0x00009888" value="0x124C8000" />
+        <register type="NOA" address="0x00009888" value="0x144C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x044C8000" />
+        <register type="NOA" address="0x00009888" value="0x064C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4EA000" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x024E2000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0BC0" />
+        <register type="NOA" address="0x00009888" value="0x180F4000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F0302" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0003" />
+        <register type="NOA" address="0x00009888" value="0x1A2C00F0" />
+        <register type="NOA" address="0x00009888" value="0x021A3080" />
+        <register type="NOA" address="0x00009888" value="0x041A31E5" />
+        <register type="NOA" address="0x00009888" value="0x02148000" />
+        <register type="NOA" address="0x00009888" value="0x0414A000" />
+        <register type="NOA" address="0x00009888" value="0x1C150054" />
+        <register type="NOA" address="0x00009888" value="0x06168000" />
+        <register type="NOA" address="0x00009888" value="0x08168000" />
+        <register type="NOA" address="0x00009888" value="0x0A168000" />
+        <register type="NOA" address="0x00009888" value="0x0C3A3280" />
+        <register type="NOA" address="0x00009888" value="0x0E3A0063" />
+        <register type="NOA" address="0x00009888" value="0x063A0061" />
+        <register type="NOA" address="0x00009888" value="0x023A0000" />
+        <register type="NOA" address="0x00009888" value="0x0C348000" />
+        <register type="NOA" address="0x00009888" value="0x0E342000" />
+        <register type="NOA" address="0x00009888" value="0x06342000" />
+        <register type="NOA" address="0x00009888" value="0x1E350140" />
+        <register type="NOA" address="0x00009888" value="0x1C350100" />
+        <register type="NOA" address="0x00009888" value="0x18360028" />
+        <register type="NOA" address="0x00009888" value="0x0C368000" />
+        <register type="NOA" address="0x00009888" value="0x0E5A3080" />
+        <register type="NOA" address="0x00009888" value="0x005A3280" />
+        <register type="NOA" address="0x00009888" value="0x025A0063" />
+        <register type="NOA" address="0x00009888" value="0x0E548000" />
+        <register type="NOA" address="0x00009888" value="0x00548000" />
+        <register type="NOA" address="0x00009888" value="0x02542000" />
+        <register type="NOA" address="0x00009888" value="0x1E550400" />
+        <register type="NOA" address="0x00009888" value="0x1A552000" />
+        <register type="NOA" address="0x00009888" value="0x1C550001" />
+        <register type="NOA" address="0x00009888" value="0x18560080" />
+        <register type="NOA" address="0x00009888" value="0x02568000" />
+        <register type="NOA" address="0x00009888" value="0x04568000" />
+        <register type="NOA" address="0x00009888" value="0x1993A800" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900420" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x45901084" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900001" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x00007FFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x00009FFF" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000EFFF" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000F3FF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FDFF" />
+        <register type="OA" address="0x00002798" value="0x00000000" />
+        <register type="OA" address="0x0000279C" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_2"
+       hw_config_guid="d260f03f-b34d-4b49-a44e-436819117332"
+       chipset="BXT"
+       symbol_name="TDL_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort0"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort1"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort1"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort0"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort0"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort1"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A026B" />
+        <register type="NOA" address="0x00009888" value="0x143A0173" />
+        <register type="NOA" address="0x00009888" value="0x145A026B" />
+        <register type="NOA" address="0x00009888" value="0x002D4000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0069" />
+        <register type="NOA" address="0x00009888" value="0x044C8000" />
+        <register type="NOA" address="0x00009888" value="0x064CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x024EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x180F6000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F030A" />
+        <register type="NOA" address="0x00009888" value="0x1A2C03C0" />
+        <register type="NOA" address="0x00009888" value="0x041A37E7" />
+        <register type="NOA" address="0x00009888" value="0x021A0000" />
+        <register type="NOA" address="0x00009888" value="0x0414A000" />
+        <register type="NOA" address="0x00009888" value="0x1C150050" />
+        <register type="NOA" address="0x00009888" value="0x08168000" />
+        <register type="NOA" address="0x00009888" value="0x0A168000" />
+        <register type="NOA" address="0x00009888" value="0x003A3380" />
+        <register type="NOA" address="0x00009888" value="0x063A006F" />
+        <register type="NOA" address="0x00009888" value="0x023A0000" />
+        <register type="NOA" address="0x00009888" value="0x00348000" />
+        <register type="NOA" address="0x00009888" value="0x06342000" />
+        <register type="NOA" address="0x00009888" value="0x1A352000" />
+        <register type="NOA" address="0x00009888" value="0x1C350100" />
+        <register type="NOA" address="0x00009888" value="0x02368000" />
+        <register type="NOA" address="0x00009888" value="0x0C368000" />
+        <register type="NOA" address="0x00009888" value="0x025A37E7" />
+        <register type="NOA" address="0x00009888" value="0x0254A000" />
+        <register type="NOA" address="0x00009888" value="0x1C550005" />
+        <register type="NOA" address="0x00009888" value="0x04568000" />
+        <register type="NOA" address="0x00009888" value="0x06568000" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900020" />
+        <register type="NOA" address="0x00009888" value="0x45901080" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900001" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extra Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extra"
+       hw_config_guid="fa6ecf21-2cb8-4d0b-9308-6e4a7b4ca87a"
+       chipset="BXT"
+       symbol_name="ComputeExtra"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active including Ext Math"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing including Extended Math processing"
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ  C 7 READ C 6 READ FADD C 5 READ FADD 8 FMUL FADD 100 FMUL $EuCoresTotalCount FDIV $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active_adjusted"
+             units="percent"
+             symbol_name="Fpu1ActiveAdjusted"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A001F" />
+        <register type="NOA" address="0x00009888" value="0x143A001F" />
+        <register type="NOA" address="0x00009888" value="0x145A001F" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0094" />
+        <register type="NOA" address="0x00009888" value="0x084CC000" />
+        <register type="NOA" address="0x00009888" value="0x044EA000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F00E0" />
+        <register type="NOA" address="0x00009888" value="0x1A2C0C00" />
+        <register type="NOA" address="0x00009888" value="0x061A0063" />
+        <register type="NOA" address="0x00009888" value="0x021A0000" />
+        <register type="NOA" address="0x00009888" value="0x06142000" />
+        <register type="NOA" address="0x00009888" value="0x1C150100" />
+        <register type="NOA" address="0x00009888" value="0x0C168000" />
+        <register type="NOA" address="0x00009888" value="0x043A3180" />
+        <register type="NOA" address="0x00009888" value="0x023A0000" />
+        <register type="NOA" address="0x00009888" value="0x04348000" />
+        <register type="NOA" address="0x00009888" value="0x1C350040" />
+        <register type="NOA" address="0x00009888" value="0x0A368000" />
+        <register type="NOA" address="0x00009888" value="0x045A0063" />
+        <register type="NOA" address="0x00009888" value="0x025A0000" />
+        <register type="NOA" address="0x00009888" value="0x04542000" />
+        <register type="NOA" address="0x00009888" value="0x1C550010" />
+        <register type="NOA" address="0x00009888" value="0x08568000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+        <register type="NOA" address="0x00009888" value="0x47900004" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00001000" />
+        <register type="FLEX" address="0x0000E558" value="0x00003002" />
+        <register type="FLEX" address="0x0000E658" value="0x00005004" />
+        <register type="FLEX" address="0x0000E758" value="0x00011010" />
+        <register type="FLEX" address="0x0000E45C" value="0x00050012" />
+        <register type="FLEX" address="0x0000E55C" value="0x00052051" />
+        <register type="FLEX" address="0x0000E65C" value="0x00000008" />
+    </register_config>
+  </set>
+
+  <set name="MDAPI testing set Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="test_oa"
+       hw_config_guid="5ee72f5c-092f-421e-8b70-225f7c3e9612"
+       chipset="BXT"
+       symbol_name="TestOa"
+       >
+    <counter name="TestCounter7"
+             description="HW test counter 7. Factor: 0.666"
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="counter7"
+             units="events"
+             symbol_name="Counter7"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="TestCounter8"
+             description="HW test counter 8. Should be equal to 1."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="counter8"
+             units="events"
+             symbol_name="Counter8"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter4"
+             description="HW test counter 4. Factor: 0.333"
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="counter4"
+             units="events"
+             symbol_name="Counter4"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter5"
+             description="HW test counter 5. Factor: 0.333"
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="counter5"
+             units="events"
+             symbol_name="Counter5"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter6"
+             description="HW test counter 6. Factor: 0.166"
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="counter6"
+             units="events"
+             symbol_name="Counter6"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter3"
+             description="HW test counter 3. Factor: 0.5"
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="counter3"
+             units="events"
+             symbol_name="Counter3"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter0"
+             description="HW test counter 0. Factor: 0.0"
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="counter0"
+             units="events"
+             symbol_name="Counter0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter1"
+             description="HW test counter 1. Factor: 1.0"
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="counter1"
+             units="events"
+             symbol_name="Counter1"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter2"
+             description="HW test counter 2. Factor: 1.0"
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="counter2"
+             units="events"
+             symbol_name="Counter2"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x19800000" />
+        <register type="NOA" address="0x00009888" value="0x07800063" />
+        <register type="NOA" address="0x00009888" value="0x11800000" />
+        <register type="NOA" address="0x00009888" value="0x23810008" />
+        <register type="NOA" address="0x00009888" value="0x1D950400" />
+        <register type="NOA" address="0x00009888" value="0x0F922000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x00000004" />
+        <register type="OA" address="0x00002774" value="0x00000000" />
+        <register type="OA" address="0x00002778" value="0x00000003" />
+        <register type="OA" address="0x0000277C" value="0x00000000" />
+        <register type="OA" address="0x00002780" value="0x00000007" />
+        <register type="OA" address="0x00002784" value="0x00000000" />
+        <register type="OA" address="0x00002788" value="0x00100002" />
+        <register type="OA" address="0x0000278C" value="0x0000FFF7" />
+        <register type="OA" address="0x00002790" value="0x00100002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00100082" />
+        <register type="OA" address="0x0000279C" value="0x0000FFEF" />
+        <register type="OA" address="0x000027A0" value="0x001000C2" />
+        <register type="OA" address="0x000027A4" value="0x0000FFE7" />
+        <register type="OA" address="0x000027A8" value="0x00100001" />
+        <register type="OA" address="0x000027AC" value="0x0000FFE7" />
+    </register_config>
+  </set>
+
+</metrics>
diff --git a/src/mesa/drivers/dri/i965/brw_oa_chv.xml b/src/mesa/drivers/dri/i965/brw_oa_chv.xml
new file mode 100644
index 0000000..a4ebfb3
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_oa_chv.xml
@@ -0,0 +1,9569 @@
+<?xml version="1.0"?>
+<metrics version="1491577975" merge_md5="">
+  <set name="Render Metrics Basic Gen8LP"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_basic"
+       hw_config_guid="9d8a3af5-c02c-4a4a-b947-f1672469e0fb"
+       chipset="CHV"
+       symbol_name="RenderBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Misses 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler 1 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 1 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler1_bottleneck"
+             units="percent"
+             symbol_name="Sampler1Bottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="A 29 READ 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Sampler 0 Busy"
+             description="The percentage of time in which Sampler 0 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler0_busy"
+             units="percent"
+             symbol_name="Sampler0Busy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler 1 Busy"
+             description="The percentage of time in which Sampler 1 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler1_busy"
+             units="percent"
+             symbol_name="Sampler1Busy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Samplers Busy"
+             description="The percentage of time in which samplers have been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ  B 1 READ UADD $GpuCoreClocks FDIV 2 FDIV 100 FMUL"
+             underscore_name="samplers_busy"
+             units="percent"
+             symbol_name="SamplersBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Fixed Pipe Throughput"
+             description="The total number of GPU memory bytes transferred between 3D Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD 64 UMUL"
+             underscore_name="gti_vf_throughput"
+             units="bytes"
+             symbol_name="GtiVfThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/3D Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler 0 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 0 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler0_bottleneck"
+             units="percent"
+             symbol_name="Sampler0Bottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Depth Throughput"
+             description="The total number of GPU memory bytes transferred between depth caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 0 READ C 1 READ UADD 64 UMUL"
+             underscore_name="gti_depth_throughput"
+             units="bytes"
+             symbol_name="GtiDepthThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Depth Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Samplers Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which samplers have been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="$Sampler0Bottleneck $Sampler1Bottleneck FMAX"
+             max_equation="100"
+             underscore_name="sampler_bottleneck"
+             units="percent"
+             symbol_name="SamplerBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Indicate System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI HDC TLB Lookup Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and HDC, when HDC is doing TLB lookups."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_hdc_lookups_throughput"
+             units="bytes"
+             symbol_name="GtiHdcLookupsThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI RCC Throughput"
+             description="The total number of GPU memory bytes transferred between render color caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 2 READ C 3 READ UADD 64 UMUL"
+             underscore_name="gti_rcc_throughput"
+             units="bytes"
+             symbol_name="GtiRccThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Color Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x59800000" />
+        <register type="NOA" address="0x00009888" value="0x59800001" />
+        <register type="NOA" address="0x00009888" value="0x285A0006" />
+        <register type="NOA" address="0x00009888" value="0x2C110014" />
+        <register type="NOA" address="0x00009888" value="0x2E110000" />
+        <register type="NOA" address="0x00009888" value="0x2C310014" />
+        <register type="NOA" address="0x00009888" value="0x2E310000" />
+        <register type="NOA" address="0x00009888" value="0x2B8303DF" />
+        <register type="NOA" address="0x00009888" value="0x3580024F" />
+        <register type="NOA" address="0x00009888" value="0x00580888" />
+        <register type="NOA" address="0x00009888" value="0x1E5A0015" />
+        <register type="NOA" address="0x00009888" value="0x205A0014" />
+        <register type="NOA" address="0x00009888" value="0x045A0000" />
+        <register type="NOA" address="0x00009888" value="0x025A0000" />
+        <register type="NOA" address="0x00009888" value="0x02180500" />
+        <register type="NOA" address="0x00009888" value="0x00190555" />
+        <register type="NOA" address="0x00009888" value="0x021D0500" />
+        <register type="NOA" address="0x00009888" value="0x021F0A00" />
+        <register type="NOA" address="0x00009888" value="0x00380444" />
+        <register type="NOA" address="0x00009888" value="0x02390500" />
+        <register type="NOA" address="0x00009888" value="0x003A0666" />
+        <register type="NOA" address="0x00009888" value="0x00100111" />
+        <register type="NOA" address="0x00009888" value="0x06110030" />
+        <register type="NOA" address="0x00009888" value="0x0A110031" />
+        <register type="NOA" address="0x00009888" value="0x0E110046" />
+        <register type="NOA" address="0x00009888" value="0x04110000" />
+        <register type="NOA" address="0x00009888" value="0x00110000" />
+        <register type="NOA" address="0x00009888" value="0x00130111" />
+        <register type="NOA" address="0x00009888" value="0x00300444" />
+        <register type="NOA" address="0x00009888" value="0x08310030" />
+        <register type="NOA" address="0x00009888" value="0x0C310031" />
+        <register type="NOA" address="0x00009888" value="0x10310046" />
+        <register type="NOA" address="0x00009888" value="0x04310000" />
+        <register type="NOA" address="0x00009888" value="0x00310000" />
+        <register type="NOA" address="0x00009888" value="0x00330444" />
+        <register type="NOA" address="0x00009888" value="0x038A0A00" />
+        <register type="NOA" address="0x00009888" value="0x018B0FFF" />
+        <register type="NOA" address="0x00009888" value="0x038B0A00" />
+        <register type="NOA" address="0x00009888" value="0x01855000" />
+        <register type="NOA" address="0x00009888" value="0x03850055" />
+        <register type="NOA" address="0x00009888" value="0x13830021" />
+        <register type="NOA" address="0x00009888" value="0x15830020" />
+        <register type="NOA" address="0x00009888" value="0x1783002F" />
+        <register type="NOA" address="0x00009888" value="0x1983002E" />
+        <register type="NOA" address="0x00009888" value="0x1B83002D" />
+        <register type="NOA" address="0x00009888" value="0x1D83002C" />
+        <register type="NOA" address="0x00009888" value="0x05830000" />
+        <register type="NOA" address="0x00009888" value="0x01840555" />
+        <register type="NOA" address="0x00009888" value="0x03840500" />
+        <register type="NOA" address="0x00009888" value="0x23800074" />
+        <register type="NOA" address="0x00009888" value="0x2580007D" />
+        <register type="NOA" address="0x00009888" value="0x05800000" />
+        <register type="NOA" address="0x00009888" value="0x01805000" />
+        <register type="NOA" address="0x00009888" value="0x03800055" />
+        <register type="NOA" address="0x00009888" value="0x01865000" />
+        <register type="NOA" address="0x00009888" value="0x03860055" />
+        <register type="NOA" address="0x00009888" value="0x01875000" />
+        <register type="NOA" address="0x00009888" value="0x03870055" />
+        <register type="NOA" address="0x00009888" value="0x418000AA" />
+        <register type="NOA" address="0x00009888" value="0x4380000A" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x4780000A" />
+        <register type="NOA" address="0x00009888" value="0x49800000" />
+        <register type="NOA" address="0x00009888" value="0x4B800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x55800000" />
+        <register type="NOA" address="0x00009888" value="0x57800000" />
+        <register type="NOA" address="0x00009888" value="0x59800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Basic Gen8LP"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_basic"
+       hw_config_guid="f522a89c-ecd1-4522-8331-3383c54af5f5"
+       chipset="CHV"
+       symbol_name="ComputeBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Bytes Read"
+             description="The total number of untyped memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 4 READ B 5 READ UADD 64 UMUL"
+             underscore_name="untyped_bytes_read"
+             units="bytes"
+             symbol_name="UntypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Bytes Written"
+             description="The total number of typed memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 2 READ B 3 READ UADD 64 UMUL"
+             underscore_name="typed_bytes_written"
+             units="bytes"
+             symbol_name="TypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 0 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Ring Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and Uncore ring."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 128 UMUL"
+             equation="C 2 READ 128 UMUL"
+             underscore_name="gti_ring_throughput"
+             units="bytes"
+             symbol_name="GtiRingThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Writes"
+             description="The total number of untyped memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_written"
+             units="bytes"
+             symbol_name="UntypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Typed Bytes Read"
+             description="The total number of typed memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 0 READ B 1 READ UADD 64 UMUL"
+             underscore_name="typed_bytes_read"
+             units="bytes"
+             symbol_name="TypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read-Only Stall"
+             description="The percentage of time in which GTI Read-Only port has been stalled."
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gti_ro_stall"
+             units="percent"
+             symbol_name="GtiRoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read-Write Stall"
+             description="The percentage of time in which GTI Read-Write port has been stalled."
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gti_rw_stall"
+             units="percent"
+             symbol_name="GtiRwStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 1 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x59800000" />
+        <register type="NOA" address="0x00009888" value="0x59800001" />
+        <register type="NOA" address="0x00009888" value="0x2E5800E0" />
+        <register type="NOA" address="0x00009888" value="0x2E3800E0" />
+        <register type="NOA" address="0x00009888" value="0x3580024F" />
+        <register type="NOA" address="0x00009888" value="0x3D800140" />
+        <register type="NOA" address="0x00009888" value="0x08580042" />
+        <register type="NOA" address="0x00009888" value="0x0C580040" />
+        <register type="NOA" address="0x00009888" value="0x1058004C" />
+        <register type="NOA" address="0x00009888" value="0x1458004B" />
+        <register type="NOA" address="0x00009888" value="0x04580000" />
+        <register type="NOA" address="0x00009888" value="0x00580000" />
+        <register type="NOA" address="0x00009888" value="0x00195555" />
+        <register type="NOA" address="0x00009888" value="0x06380042" />
+        <register type="NOA" address="0x00009888" value="0x0A380040" />
+        <register type="NOA" address="0x00009888" value="0x0E38004C" />
+        <register type="NOA" address="0x00009888" value="0x1238004B" />
+        <register type="NOA" address="0x00009888" value="0x04380000" />
+        <register type="NOA" address="0x00009888" value="0x00384444" />
+        <register type="NOA" address="0x00009888" value="0x003A5555" />
+        <register type="NOA" address="0x00009888" value="0x018BFFFF" />
+        <register type="NOA" address="0x00009888" value="0x01845555" />
+        <register type="NOA" address="0x00009888" value="0x17800074" />
+        <register type="NOA" address="0x00009888" value="0x1980007D" />
+        <register type="NOA" address="0x00009888" value="0x1B80007C" />
+        <register type="NOA" address="0x00009888" value="0x1D8000B6" />
+        <register type="NOA" address="0x00009888" value="0x1F8000B7" />
+        <register type="NOA" address="0x00009888" value="0x05800000" />
+        <register type="NOA" address="0x00009888" value="0x03800000" />
+        <register type="NOA" address="0x00009888" value="0x418000AA" />
+        <register type="NOA" address="0x00009888" value="0x438000AA" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x47800000" />
+        <register type="NOA" address="0x00009888" value="0x4980012A" />
+        <register type="NOA" address="0x00009888" value="0x4B80012A" />
+        <register type="NOA" address="0x00009888" value="0x4D80012A" />
+        <register type="NOA" address="0x00009888" value="0x4F80012A" />
+        <register type="NOA" address="0x00009888" value="0x518001CE" />
+        <register type="NOA" address="0x00009888" value="0x538001CE" />
+        <register type="NOA" address="0x00009888" value="0x5580000E" />
+        <register type="NOA" address="0x00009888" value="0x59800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Render Metrics for 3D Pipeline Profile"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_pipe_profile"
+       hw_config_guid="a9ccc03d-a943-4e6b-9cd6-13e063075927"
+       chipset="CHV"
+       symbol_name="RenderPipeProfile"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which vertex shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_bottleneck"
+             units="percent"
+             symbol_name="VsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Hi-Depth Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which early hierarchical depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hi_depth_bottleneck"
+             units="percent"
+             symbol_name="HiDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which geometry shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gs_bottleneck"
+             units="percent"
+             symbol_name="GsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Geometry Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="BC Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which barycentric coordinates calculation pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="bc_bottleneck"
+             units="percent"
+             symbol_name="BcBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Barycentric Calc"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Stall"
+             description="The percentage of time in which hull shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_stall"
+             units="percent"
+             symbol_name="HsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Hull Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="VF Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which vertex fetch pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vf_bottleneck"
+             units="percent"
+             symbol_name="VfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Input Assembler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Strip-Fans Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which strip-fans pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="10"
+             equation="B 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_bottleneck"
+             units="percent"
+             symbol_name="SfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Accesses"
+             description="The total number of messages sent to samplers."
+             data_type="uint64"
+             equation="A 28 READ"
+             underscore_name="sampler_accesses"
+             units="messages"
+             symbol_name="SamplerAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler"
+             />
+    <counter name="SF Stall"
+             description="The percentage of time in which strip-fans pipeline stage was stalled."
+             data_type="float"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_stall"
+             units="percent"
+             symbol_name="SfStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Bottleneck"
+             low_watermark="3"
+             description="The percentage of time in which hull shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="9"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_bottleneck"
+             units="percent"
+             symbol_name="HsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Hull Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CL Stall"
+             description="The percentage of time in which clipper pipeline stage was stalled."
+             data_type="float"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_stall"
+             units="percent"
+             symbol_name="ClStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Clipper"
+             />
+    <counter name="SO Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which stream output pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_bottleneck"
+             units="percent"
+             symbol_name="SoBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Stream Output"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="DS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which domain shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_bottleneck"
+             units="percent"
+             symbol_name="DsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Domain Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Clipper Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which clipper pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_bottleneck"
+             units="percent"
+             symbol_name="ClBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Clipper"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Stall"
+             description="The percentage of time in which domain shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_stall"
+             units="percent"
+             symbol_name="DsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Domain Shader"
+             />
+    <counter name="Early Depth Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which early depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="early_depth_bottleneck"
+             units="percent"
+             symbol_name="EarlyDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses UADD UADD 64 UMUL"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SO Stall"
+             description="The percentage of time in which stream-output pipeline stage was stalled."
+             data_type="float"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_stall"
+             units="percent"
+             symbol_name="SoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Stream Output"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x59800000" />
+        <register type="NOA" address="0x00009888" value="0x59800001" />
+        <register type="NOA" address="0x00009888" value="0x261E0000" />
+        <register type="NOA" address="0x00009888" value="0x281F000F" />
+        <register type="NOA" address="0x00009888" value="0x2817001A" />
+        <register type="NOA" address="0x00009888" value="0x2791001F" />
+        <register type="NOA" address="0x00009888" value="0x27880019" />
+        <register type="NOA" address="0x00009888" value="0x2D890000" />
+        <register type="NOA" address="0x00009888" value="0x278A0007" />
+        <register type="NOA" address="0x00009888" value="0x298D001F" />
+        <register type="NOA" address="0x00009888" value="0x278E0020" />
+        <register type="NOA" address="0x00009888" value="0x2B8F0012" />
+        <register type="NOA" address="0x00009888" value="0x29900000" />
+        <register type="NOA" address="0x00009888" value="0x00184000" />
+        <register type="NOA" address="0x00009888" value="0x02181000" />
+        <register type="NOA" address="0x00009888" value="0x02194000" />
+        <register type="NOA" address="0x00009888" value="0x141E0002" />
+        <register type="NOA" address="0x00009888" value="0x041E0000" />
+        <register type="NOA" address="0x00009888" value="0x001E0000" />
+        <register type="NOA" address="0x00009888" value="0x221F0015" />
+        <register type="NOA" address="0x00009888" value="0x041F0000" />
+        <register type="NOA" address="0x00009888" value="0x001F4000" />
+        <register type="NOA" address="0x00009888" value="0x021F0000" />
+        <register type="NOA" address="0x00009888" value="0x023A8000" />
+        <register type="NOA" address="0x00009888" value="0x0213C000" />
+        <register type="NOA" address="0x00009888" value="0x02164000" />
+        <register type="NOA" address="0x00009888" value="0x24170012" />
+        <register type="NOA" address="0x00009888" value="0x04170000" />
+        <register type="NOA" address="0x00009888" value="0x07910005" />
+        <register type="NOA" address="0x00009888" value="0x05910000" />
+        <register type="NOA" address="0x00009888" value="0x01911500" />
+        <register type="NOA" address="0x00009888" value="0x03910501" />
+        <register type="NOA" address="0x00009888" value="0x0D880002" />
+        <register type="NOA" address="0x00009888" value="0x1D880003" />
+        <register type="NOA" address="0x00009888" value="0x05880000" />
+        <register type="NOA" address="0x00009888" value="0x0B890032" />
+        <register type="NOA" address="0x00009888" value="0x1B890031" />
+        <register type="NOA" address="0x00009888" value="0x05890000" />
+        <register type="NOA" address="0x00009888" value="0x01890040" />
+        <register type="NOA" address="0x00009888" value="0x03890040" />
+        <register type="NOA" address="0x00009888" value="0x098A0000" />
+        <register type="NOA" address="0x00009888" value="0x198A0004" />
+        <register type="NOA" address="0x00009888" value="0x058A0000" />
+        <register type="NOA" address="0x00009888" value="0x018A8050" />
+        <register type="NOA" address="0x00009888" value="0x038A2050" />
+        <register type="NOA" address="0x00009888" value="0x018B95A9" />
+        <register type="NOA" address="0x00009888" value="0x038BE5A9" />
+        <register type="NOA" address="0x00009888" value="0x018C1500" />
+        <register type="NOA" address="0x00009888" value="0x038C0501" />
+        <register type="NOA" address="0x00009888" value="0x178D0015" />
+        <register type="NOA" address="0x00009888" value="0x058D0000" />
+        <register type="NOA" address="0x00009888" value="0x138E0004" />
+        <register type="NOA" address="0x00009888" value="0x218E000C" />
+        <register type="NOA" address="0x00009888" value="0x058E0000" />
+        <register type="NOA" address="0x00009888" value="0x018E0500" />
+        <register type="NOA" address="0x00009888" value="0x038E0101" />
+        <register type="NOA" address="0x00009888" value="0x0F8F0027" />
+        <register type="NOA" address="0x00009888" value="0x058F0000" />
+        <register type="NOA" address="0x00009888" value="0x018F0000" />
+        <register type="NOA" address="0x00009888" value="0x038F0001" />
+        <register type="NOA" address="0x00009888" value="0x11900013" />
+        <register type="NOA" address="0x00009888" value="0x1F900017" />
+        <register type="NOA" address="0x00009888" value="0x05900000" />
+        <register type="NOA" address="0x00009888" value="0x01900100" />
+        <register type="NOA" address="0x00009888" value="0x03900001" />
+        <register type="NOA" address="0x00009888" value="0x01845555" />
+        <register type="NOA" address="0x00009888" value="0x03845555" />
+        <register type="NOA" address="0x00009888" value="0x418000AA" />
+        <register type="NOA" address="0x00009888" value="0x438000AA" />
+        <register type="NOA" address="0x00009888" value="0x458000AA" />
+        <register type="NOA" address="0x00009888" value="0x478000AA" />
+        <register type="NOA" address="0x00009888" value="0x4980018C" />
+        <register type="NOA" address="0x00009888" value="0x4B80014B" />
+        <register type="NOA" address="0x00009888" value="0x4D800128" />
+        <register type="NOA" address="0x00009888" value="0x4F80012A" />
+        <register type="NOA" address="0x00009888" value="0x51800187" />
+        <register type="NOA" address="0x00009888" value="0x5380014B" />
+        <register type="NOA" address="0x00009888" value="0x55800149" />
+        <register type="NOA" address="0x00009888" value="0x5780010A" />
+        <register type="NOA" address="0x00009888" value="0x59800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFEA" />
+        <register type="OA" address="0x00002774" value="0x00007FFC" />
+        <register type="OA" address="0x00002778" value="0x0007AFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000F5FD" />
+        <register type="OA" address="0x00002780" value="0x00079FFA" />
+        <register type="OA" address="0x00002784" value="0x0000F3FB" />
+        <register type="OA" address="0x00002788" value="0x0007BF7A" />
+        <register type="OA" address="0x0000278C" value="0x0000F7E7" />
+        <register type="OA" address="0x00002790" value="0x0007FEFA" />
+        <register type="OA" address="0x00002794" value="0x0000F7CF" />
+        <register type="OA" address="0x00002798" value="0x00077FFA" />
+        <register type="OA" address="0x0000279C" value="0x0000EFDF" />
+        <register type="OA" address="0x000027A0" value="0x0006FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000CFBF" />
+        <register type="OA" address="0x000027A8" value="0x0003FFFA" />
+        <register type="OA" address="0x000027AC" value="0x00005F7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Metric set HDCAndSF"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="hdc_and_sf"
+       hw_config_guid="2cf0c064-68df-4fac-9b3f-57f51ca8a069"
+       chipset="CHV"
+       symbol_name="HDCAndSF"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were actively processed on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Polygon Data Ready"
+             description="The percentage of time in which geometry pipeline output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="poly_data_ready"
+             units="percent"
+             symbol_name="PolyDataReady"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s1.ss2)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s1.ss2)"
+             data_type="float"
+             max_equation="100"
+             equation="C 1 READ C 0 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader12_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader12AccessStalledOnL3"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="HDC stalled by L3 (s0.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ C 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader01_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader01AccessStalledOnL3"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss2)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss2)"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ C 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader02_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader02AccessStalledOnL3"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HDC stalled by L3 (s1.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s1.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ B 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader10_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader10AccessStalledOnL3"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="HDC stalled by L3 (s1.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s1.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="B 7 READ B 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader11_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader11AccessStalledOnL3"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ C 2 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader00_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader00AccessStalledOnL3"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x105C0232" />
+        <register type="NOA" address="0x00009888" value="0x10580232" />
+        <register type="NOA" address="0x00009888" value="0x10380232" />
+        <register type="NOA" address="0x00009888" value="0x10DC0232" />
+        <register type="NOA" address="0x00009888" value="0x10D80232" />
+        <register type="NOA" address="0x00009888" value="0x10B80232" />
+        <register type="NOA" address="0x00009888" value="0x118E4400" />
+        <register type="NOA" address="0x00009888" value="0x025C6080" />
+        <register type="NOA" address="0x00009888" value="0x045C004B" />
+        <register type="NOA" address="0x00009888" value="0x005C8000" />
+        <register type="NOA" address="0x00009888" value="0x00582080" />
+        <register type="NOA" address="0x00009888" value="0x0258004B" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x045B4000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F00AA" />
+        <register type="NOA" address="0x00009888" value="0x04386080" />
+        <register type="NOA" address="0x00009888" value="0x0638404B" />
+        <register type="NOA" address="0x00009888" value="0x02384000" />
+        <register type="NOA" address="0x00009888" value="0x08384000" />
+        <register type="NOA" address="0x00009888" value="0x0A380000" />
+        <register type="NOA" address="0x00009888" value="0x0C380000" />
+        <register type="NOA" address="0x00009888" value="0x00398000" />
+        <register type="NOA" address="0x00009888" value="0x0239A000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x0CDC25C1" />
+        <register type="NOA" address="0x00009888" value="0x0ADCC000" />
+        <register type="NOA" address="0x00009888" value="0x0AD825C1" />
+        <register type="NOA" address="0x00009888" value="0x18DB4000" />
+        <register type="NOA" address="0x00009888" value="0x1ADB0001" />
+        <register type="NOA" address="0x00009888" value="0x0E9F8000" />
+        <register type="NOA" address="0x00009888" value="0x109F02AA" />
+        <register type="NOA" address="0x00009888" value="0x0EB825C1" />
+        <register type="NOA" address="0x00009888" value="0x18B80154" />
+        <register type="NOA" address="0x00009888" value="0x0AB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0CB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0D88C000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258BAA05" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x198C5400" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x098DC000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x098E05C0" />
+        <register type="NOA" address="0x00009888" value="0x058E0000" />
+        <register type="NOA" address="0x00009888" value="0x198F0020" />
+        <register type="NOA" address="0x00009888" value="0x2185AA0A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x19835000" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x09848000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x19808000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x51800040" />
+        <register type="NOA" address="0x00009888" value="0x43800400" />
+        <register type="NOA" address="0x00009888" value="0x45800800" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800C62" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F801042" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x418014A4" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x10800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FFF7" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_1"
+       hw_config_guid="78a87ff9-543a-49ce-95ea-26d86071ea93"
+       chipset="CHV"
+       symbol_name="L3_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 L3 Bank1 Stalled"
+             description="The percentage of time in which slice1 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank1_stalled"
+             units="percent"
+             symbol_name="L31Bank1Stalled"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice1 L3 Bank0 Stalled"
+             description="The percentage of time in which slice1 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank0_stalled"
+             units="percent"
+             symbol_name="L31Bank0Stalled"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Slice1 L3 Bank1 Active"
+             description="The percentage of time in which slice1 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank1_active"
+             units="percent"
+             symbol_name="L31Bank1Active"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 L3 Bank0 Active"
+             description="The percentage of time in which slice1 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank0_active"
+             units="percent"
+             symbol_name="L31Bank0Active"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x10BF03DA" />
+        <register type="NOA" address="0x00009888" value="0x14BF0001" />
+        <register type="NOA" address="0x00009888" value="0x12980340" />
+        <register type="NOA" address="0x00009888" value="0x12990340" />
+        <register type="NOA" address="0x00009888" value="0x0CBF1187" />
+        <register type="NOA" address="0x00009888" value="0x0EBF1205" />
+        <register type="NOA" address="0x00009888" value="0x00BF0500" />
+        <register type="NOA" address="0x00009888" value="0x02BF042B" />
+        <register type="NOA" address="0x00009888" value="0x04BF002C" />
+        <register type="NOA" address="0x00009888" value="0x0CDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0EDAC000" />
+        <register type="NOA" address="0x00009888" value="0x00DA8000" />
+        <register type="NOA" address="0x00009888" value="0x02DAC000" />
+        <register type="NOA" address="0x00009888" value="0x04DA4000" />
+        <register type="NOA" address="0x00009888" value="0x04983400" />
+        <register type="NOA" address="0x00009888" value="0x10980000" />
+        <register type="NOA" address="0x00009888" value="0x06990034" />
+        <register type="NOA" address="0x00009888" value="0x10990000" />
+        <register type="NOA" address="0x00009888" value="0x0C9DC000" />
+        <register type="NOA" address="0x00009888" value="0x0E9DC000" />
+        <register type="NOA" address="0x00009888" value="0x009D8000" />
+        <register type="NOA" address="0x00009888" value="0x029DC000" />
+        <register type="NOA" address="0x00009888" value="0x049D4000" />
+        <register type="NOA" address="0x00009888" value="0x109F02A8" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E9F00BA" />
+        <register type="NOA" address="0x00009888" value="0x0CB88000" />
+        <register type="NOA" address="0x00009888" value="0x0CB95000" />
+        <register type="NOA" address="0x00009888" value="0x0EB95000" />
+        <register type="NOA" address="0x00009888" value="0x00B94000" />
+        <register type="NOA" address="0x00009888" value="0x02B95000" />
+        <register type="NOA" address="0x00009888" value="0x04B91000" />
+        <register type="NOA" address="0x00009888" value="0x06B92000" />
+        <register type="NOA" address="0x00009888" value="0x0CBA4000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x0B888000" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x258B800A" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B5500" />
+        <register type="NOA" address="0x00009888" value="0x198C4000" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x018D8000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x058DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2185800A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x1B830154" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x47800000" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800060" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_2"
+       hw_config_guid="9f2cece5-7bfe-4320-ad66-8c7cc526bec5"
+       chipset="CHV"
+       symbol_name="L3_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank1 Active"
+             description="The percentage of time in which slice0 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_active"
+             units="percent"
+             symbol_name="L30Bank1Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 L3 Bank1 Stalled"
+             description="The percentage of time in which slice0 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_stalled"
+             units="percent"
+             symbol_name="L30Bank1Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Active"
+             description="The percentage of time in which slice0 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_active"
+             units="percent"
+             symbol_name="L30Bank0Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Stalled"
+             description="The percentage of time in which slice0 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_stalled"
+             units="percent"
+             symbol_name="L30Bank0Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x103F03DA" />
+        <register type="NOA" address="0x00009888" value="0x143F0001" />
+        <register type="NOA" address="0x00009888" value="0x12180340" />
+        <register type="NOA" address="0x00009888" value="0x12190340" />
+        <register type="NOA" address="0x00009888" value="0x0C3F1187" />
+        <register type="NOA" address="0x00009888" value="0x0E3F1205" />
+        <register type="NOA" address="0x00009888" value="0x003F0500" />
+        <register type="NOA" address="0x00009888" value="0x023F042B" />
+        <register type="NOA" address="0x00009888" value="0x043F002C" />
+        <register type="NOA" address="0x00009888" value="0x0C5AC000" />
+        <register type="NOA" address="0x00009888" value="0x0E5AC000" />
+        <register type="NOA" address="0x00009888" value="0x005A8000" />
+        <register type="NOA" address="0x00009888" value="0x025AC000" />
+        <register type="NOA" address="0x00009888" value="0x045A4000" />
+        <register type="NOA" address="0x00009888" value="0x04183400" />
+        <register type="NOA" address="0x00009888" value="0x10180000" />
+        <register type="NOA" address="0x00009888" value="0x06190034" />
+        <register type="NOA" address="0x00009888" value="0x10190000" />
+        <register type="NOA" address="0x00009888" value="0x0C1DC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1DC000" />
+        <register type="NOA" address="0x00009888" value="0x001D8000" />
+        <register type="NOA" address="0x00009888" value="0x021DC000" />
+        <register type="NOA" address="0x00009888" value="0x041D4000" />
+        <register type="NOA" address="0x00009888" value="0x101F02A8" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F00BA" />
+        <register type="NOA" address="0x00009888" value="0x0C388000" />
+        <register type="NOA" address="0x00009888" value="0x0C395000" />
+        <register type="NOA" address="0x00009888" value="0x0E395000" />
+        <register type="NOA" address="0x00009888" value="0x00394000" />
+        <register type="NOA" address="0x00009888" value="0x02395000" />
+        <register type="NOA" address="0x00009888" value="0x04391000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x0C3A4000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AA800" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258B4005" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x2185800A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x1B830154" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x45800000" />
+        <register type="NOA" address="0x00009888" value="0x47800000" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800060" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_3"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_3"
+       hw_config_guid="d890ef38-d309-47e4-b8b5-aa779bb19ab0"
+       chipset="CHV"
+       symbol_name="L3_3"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 L3 Bank3 Stalled"
+             description="The percentage of time in which slice0 L3 bank3 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_stalled"
+             units="percent"
+             symbol_name="L30Bank3Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank3 Active"
+             description="The percentage of time in which slice0 L3 bank3 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_active"
+             units="percent"
+             symbol_name="L30Bank3Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 L3 Bank3 Active"
+             description="The percentage of time in which slice1 L3 bank3 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank3_active"
+             units="percent"
+             symbol_name="L31Bank3Active"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice1 L3 Bank3 Stalled"
+             description="The percentage of time in which slice1 L3 bank3 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank3_stalled"
+             units="percent"
+             symbol_name="L31Bank3Stalled"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121B0340" />
+        <register type="NOA" address="0x00009888" value="0x103F0274" />
+        <register type="NOA" address="0x00009888" value="0x123F0000" />
+        <register type="NOA" address="0x00009888" value="0x129B0340" />
+        <register type="NOA" address="0x00009888" value="0x10BF0274" />
+        <register type="NOA" address="0x00009888" value="0x12BF0000" />
+        <register type="NOA" address="0x00009888" value="0x041B3400" />
+        <register type="NOA" address="0x00009888" value="0x101B0000" />
+        <register type="NOA" address="0x00009888" value="0x045C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A3D4000" />
+        <register type="NOA" address="0x00009888" value="0x003F0080" />
+        <register type="NOA" address="0x00009888" value="0x023F0793" />
+        <register type="NOA" address="0x00009888" value="0x043F0014" />
+        <register type="NOA" address="0x00009888" value="0x04588000" />
+        <register type="NOA" address="0x00009888" value="0x005A8000" />
+        <register type="NOA" address="0x00009888" value="0x025AC000" />
+        <register type="NOA" address="0x00009888" value="0x045A4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B4000" />
+        <register type="NOA" address="0x00009888" value="0x001D8000" />
+        <register type="NOA" address="0x00009888" value="0x021DC000" />
+        <register type="NOA" address="0x00009888" value="0x041D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F002A" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x00394000" />
+        <register type="NOA" address="0x00009888" value="0x02395000" />
+        <register type="NOA" address="0x00009888" value="0x04399000" />
+        <register type="NOA" address="0x00009888" value="0x069B0034" />
+        <register type="NOA" address="0x00009888" value="0x109B0000" />
+        <register type="NOA" address="0x00009888" value="0x06DC4000" />
+        <register type="NOA" address="0x00009888" value="0x0CBD4000" />
+        <register type="NOA" address="0x00009888" value="0x0CBF0981" />
+        <register type="NOA" address="0x00009888" value="0x0EBF0A0F" />
+        <register type="NOA" address="0x00009888" value="0x06D84000" />
+        <register type="NOA" address="0x00009888" value="0x0CDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0EDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0CDB4000" />
+        <register type="NOA" address="0x00009888" value="0x0C9DC000" />
+        <register type="NOA" address="0x00009888" value="0x0E9DC000" />
+        <register type="NOA" address="0x00009888" value="0x109F02A8" />
+        <register type="NOA" address="0x00009888" value="0x0E9F0080" />
+        <register type="NOA" address="0x00009888" value="0x0CB84000" />
+        <register type="NOA" address="0x00009888" value="0x0CB95000" />
+        <register type="NOA" address="0x00009888" value="0x0EB95000" />
+        <register type="NOA" address="0x00009888" value="0x06B92000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258B8009" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x198C4000" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2185800A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x1B830154" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x45800C00" />
+        <register type="NOA" address="0x00009888" value="0x47800C63" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F8014A5" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800045" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_4"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_4"
+       hw_config_guid="5fdff4a6-9dc8-45e1-bfda-ef54869fbdd4"
+       chipset="CHV"
+       symbol_name="L3_4"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Stalled"
+             description="The percentage of time in which slice0 L3 bank2 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_stalled"
+             units="percent"
+             symbol_name="L30Bank2Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 L3 Bank2 Active"
+             description="The percentage of time in which slice1 L3 bank2 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank2_active"
+             units="percent"
+             symbol_name="L31Bank2Active"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 L3 Bank2 Active"
+             description="The percentage of time in which slice0 L3 bank2 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_active"
+             units="percent"
+             symbol_name="L30Bank2Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Slice1 L3 Bank2 Stalled"
+             description="The percentage of time in which slice1 L3 bank2 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l31_bank2_stalled"
+             units="percent"
+             symbol_name="L31Bank2Stalled"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121A0340" />
+        <register type="NOA" address="0x00009888" value="0x103F0017" />
+        <register type="NOA" address="0x00009888" value="0x123F0020" />
+        <register type="NOA" address="0x00009888" value="0x129A0340" />
+        <register type="NOA" address="0x00009888" value="0x10BF0017" />
+        <register type="NOA" address="0x00009888" value="0x12BF0020" />
+        <register type="NOA" address="0x00009888" value="0x041A3400" />
+        <register type="NOA" address="0x00009888" value="0x101A0000" />
+        <register type="NOA" address="0x00009888" value="0x043B8000" />
+        <register type="NOA" address="0x00009888" value="0x0A3E0010" />
+        <register type="NOA" address="0x00009888" value="0x003F0200" />
+        <register type="NOA" address="0x00009888" value="0x023F0113" />
+        <register type="NOA" address="0x00009888" value="0x043F0014" />
+        <register type="NOA" address="0x00009888" value="0x02592000" />
+        <register type="NOA" address="0x00009888" value="0x005A8000" />
+        <register type="NOA" address="0x00009888" value="0x025AC000" />
+        <register type="NOA" address="0x00009888" value="0x045A4000" />
+        <register type="NOA" address="0x00009888" value="0x0A1C8000" />
+        <register type="NOA" address="0x00009888" value="0x001D8000" />
+        <register type="NOA" address="0x00009888" value="0x021DC000" />
+        <register type="NOA" address="0x00009888" value="0x041D4000" />
+        <register type="NOA" address="0x00009888" value="0x0A1E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F001A" />
+        <register type="NOA" address="0x00009888" value="0x00394000" />
+        <register type="NOA" address="0x00009888" value="0x02395000" />
+        <register type="NOA" address="0x00009888" value="0x04391000" />
+        <register type="NOA" address="0x00009888" value="0x069A0034" />
+        <register type="NOA" address="0x00009888" value="0x109A0000" />
+        <register type="NOA" address="0x00009888" value="0x06BB4000" />
+        <register type="NOA" address="0x00009888" value="0x0ABE0040" />
+        <register type="NOA" address="0x00009888" value="0x0CBF0984" />
+        <register type="NOA" address="0x00009888" value="0x0EBF0A02" />
+        <register type="NOA" address="0x00009888" value="0x02D94000" />
+        <register type="NOA" address="0x00009888" value="0x0CDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0EDAC000" />
+        <register type="NOA" address="0x00009888" value="0x0C9C0400" />
+        <register type="NOA" address="0x00009888" value="0x0C9DC000" />
+        <register type="NOA" address="0x00009888" value="0x0E9DC000" />
+        <register type="NOA" address="0x00009888" value="0x0C9E0400" />
+        <register type="NOA" address="0x00009888" value="0x109F02A8" />
+        <register type="NOA" address="0x00009888" value="0x0E9F0040" />
+        <register type="NOA" address="0x00009888" value="0x0CB95000" />
+        <register type="NOA" address="0x00009888" value="0x0EB95000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258B8009" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x198C4000" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2185800A" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x1B830154" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x45800800" />
+        <register type="NOA" address="0x00009888" value="0x47800842" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F801084" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800044" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set RasterizerAndPixelBackend"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="rasterizer_and_pixel_backend"
+       hw_config_guid="2c0e45e1-7e2c-4a14-ae00-0b7ec868b8aa"
+       chipset="CHV"
+       symbol_name="RasterizerAndPixelBackend"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 Rasterizer Input Available"
+             description="The percentage of time in which slice1 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer1_input_available"
+             units="percent"
+             symbol_name="Rasterizer1InputAvailable"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 Pixel Values Ready"
+             description="The percentage of time in which slice0 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values0_ready"
+             units="percent"
+             symbol_name="PixelValues0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Slice0 Rasterizer Input Available"
+             description="The percentage of time in which slice0 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_input_available"
+             units="percent"
+             symbol_name="Rasterizer0InputAvailable"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice0 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data0_ready"
+             units="percent"
+             symbol_name="PixelData0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 Pixel Values Ready"
+             description="The percentage of time in which slice1 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values1_ready"
+             units="percent"
+             symbol_name="PixelValues1Ready"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice1 PS Output Available"
+             description="The percentage of time in which slice1 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output1_available"
+             units="percent"
+             symbol_name="PSOutput1Available"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Rasterizer Output Ready"
+             description="The percentage of time in which slice0 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_output_ready"
+             units="percent"
+             symbol_name="Rasterizer0OutputReady"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice1 Rasterizer Output Ready"
+             description="The percentage of time in which slice1 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer1_output_ready"
+             units="percent"
+             symbol_name="Rasterizer1OutputReady"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice1 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice1 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data1_ready"
+             units="percent"
+             symbol_name="PixelData1Ready"
+             availability="$SliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Slice0 PS Output Available"
+             description="The percentage of time in which slice0 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output0_available"
+             units="percent"
+             symbol_name="PSOutput0Available"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x143B000E" />
+        <register type="NOA" address="0x00009888" value="0x043C55C0" />
+        <register type="NOA" address="0x00009888" value="0x0A1E0280" />
+        <register type="NOA" address="0x00009888" value="0x0C1E0408" />
+        <register type="NOA" address="0x00009888" value="0x10390000" />
+        <register type="NOA" address="0x00009888" value="0x12397A1F" />
+        <register type="NOA" address="0x00009888" value="0x14BB000E" />
+        <register type="NOA" address="0x00009888" value="0x04BC5000" />
+        <register type="NOA" address="0x00009888" value="0x0A9E0296" />
+        <register type="NOA" address="0x00009888" value="0x0C9E0008" />
+        <register type="NOA" address="0x00009888" value="0x10B90000" />
+        <register type="NOA" address="0x00009888" value="0x12B97A1F" />
+        <register type="NOA" address="0x00009888" value="0x063B0042" />
+        <register type="NOA" address="0x00009888" value="0x103B0000" />
+        <register type="NOA" address="0x00009888" value="0x083C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A3E0040" />
+        <register type="NOA" address="0x00009888" value="0x043F8000" />
+        <register type="NOA" address="0x00009888" value="0x02594000" />
+        <register type="NOA" address="0x00009888" value="0x045A8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1C0400" />
+        <register type="NOA" address="0x00009888" value="0x041D8000" />
+        <register type="NOA" address="0x00009888" value="0x081E02C0" />
+        <register type="NOA" address="0x00009888" value="0x0E1E0000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA800" />
+        <register type="NOA" address="0x00009888" value="0x0E1F0260" />
+        <register type="NOA" address="0x00009888" value="0x101F0014" />
+        <register type="NOA" address="0x00009888" value="0x003905E0" />
+        <register type="NOA" address="0x00009888" value="0x06390BC0" />
+        <register type="NOA" address="0x00009888" value="0x02390018" />
+        <register type="NOA" address="0x00009888" value="0x04394000" />
+        <register type="NOA" address="0x00009888" value="0x04BB0042" />
+        <register type="NOA" address="0x00009888" value="0x10BB0000" />
+        <register type="NOA" address="0x00009888" value="0x02BC05C0" />
+        <register type="NOA" address="0x00009888" value="0x08BC0000" />
+        <register type="NOA" address="0x00009888" value="0x0ABE0004" />
+        <register type="NOA" address="0x00009888" value="0x02BF8000" />
+        <register type="NOA" address="0x00009888" value="0x02D91000" />
+        <register type="NOA" address="0x00009888" value="0x02DA8000" />
+        <register type="NOA" address="0x00009888" value="0x089C8000" />
+        <register type="NOA" address="0x00009888" value="0x029D8000" />
+        <register type="NOA" address="0x00009888" value="0x089E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E9E0000" />
+        <register type="NOA" address="0x00009888" value="0x0E9FA806" />
+        <register type="NOA" address="0x00009888" value="0x109F0142" />
+        <register type="NOA" address="0x00009888" value="0x08B90617" />
+        <register type="NOA" address="0x00009888" value="0x0AB90BE0" />
+        <register type="NOA" address="0x00009888" value="0x02B94000" />
+        <register type="NOA" address="0x00009888" value="0x0D88F000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000C" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x018A8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A2800" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x238B52A0" />
+        <register type="NOA" address="0x00009888" value="0x258B6A95" />
+        <register type="NOA" address="0x00009888" value="0x278B0029" />
+        <register type="NOA" address="0x00009888" value="0x178C2000" />
+        <register type="NOA" address="0x00009888" value="0x198C1500" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0014" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x098DA000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x038D8000" />
+        <register type="NOA" address="0x00009888" value="0x058D2000" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA80" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x01834000" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0184C000" />
+        <register type="NOA" address="0x00009888" value="0x0784C000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1180C000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x4D800444" />
+        <register type="NOA" address="0x00009888" value="0x3D800000" />
+        <register type="NOA" address="0x00009888" value="0x4F804000" />
+        <register type="NOA" address="0x00009888" value="0x43801080" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800084" />
+        <register type="NOA" address="0x00009888" value="0x53800044" />
+        <register type="NOA" address="0x00009888" value="0x47801080" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800840" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00006000" />
+        <register type="OA" address="0x00002774" value="0x0000F3FF" />
+        <register type="OA" address="0x00002778" value="0x00001800" />
+        <register type="OA" address="0x0000277C" value="0x0000FCFF" />
+        <register type="OA" address="0x00002780" value="0x00000600" />
+        <register type="OA" address="0x00002784" value="0x0000FF3F" />
+        <register type="OA" address="0x00002788" value="0x00000180" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000060" />
+        <register type="OA" address="0x00002794" value="0x0000FFF3" />
+        <register type="OA" address="0x00002798" value="0x00000018" />
+        <register type="OA" address="0x0000279C" value="0x0000FFFC" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler_1"
+       hw_config_guid="71148d78-baf5-474f-878a-e23158d0265d"
+       chipset="CHV"
+       symbol_name="Sampler_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Slice1 Subslice0 Input Available"
+             description="The percentage of time in which slice1 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler10_input_available"
+             units="percent"
+             symbol_name="Sampler10InputAvailable"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Slice1 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice1 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler12_output_ready"
+             units="percent"
+             symbol_name="Sampler12OutputReady"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice1 Subslice1 Input Available"
+             description="The percentage of time in which slice1 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler11_input_available"
+             units="percent"
+             symbol_name="Sampler11InputAvailable"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice1 Subslice2 Input Available"
+             description="The percentage of time in which slice1 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler12_input_available"
+             units="percent"
+             symbol_name="Sampler12InputAvailable"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice1 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice1 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler10_output_ready"
+             units="percent"
+             symbol_name="Sampler10OutputReady"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Slice1 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice1 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler11_output_ready"
+             units="percent"
+             symbol_name="Sampler11OutputReady"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches without URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x18921400" />
+        <register type="NOA" address="0x00009888" value="0x149500AB" />
+        <register type="NOA" address="0x00009888" value="0x18B21400" />
+        <register type="NOA" address="0x00009888" value="0x14B500AB" />
+        <register type="NOA" address="0x00009888" value="0x18D21400" />
+        <register type="NOA" address="0x00009888" value="0x14D500AB" />
+        <register type="NOA" address="0x00009888" value="0x0CDC8000" />
+        <register type="NOA" address="0x00009888" value="0x0EDC4000" />
+        <register type="NOA" address="0x00009888" value="0x02DCC000" />
+        <register type="NOA" address="0x00009888" value="0x04DCC000" />
+        <register type="NOA" address="0x00009888" value="0x1ABD00A0" />
+        <register type="NOA" address="0x00009888" value="0x0ABD8000" />
+        <register type="NOA" address="0x00009888" value="0x0CD88000" />
+        <register type="NOA" address="0x00009888" value="0x0ED84000" />
+        <register type="NOA" address="0x00009888" value="0x04D88000" />
+        <register type="NOA" address="0x00009888" value="0x1ADB0050" />
+        <register type="NOA" address="0x00009888" value="0x04DB8000" />
+        <register type="NOA" address="0x00009888" value="0x06DB8000" />
+        <register type="NOA" address="0x00009888" value="0x08DB8000" />
+        <register type="NOA" address="0x00009888" value="0x0ADB4000" />
+        <register type="NOA" address="0x00009888" value="0x109F02A0" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E9F00AA" />
+        <register type="NOA" address="0x00009888" value="0x18B82500" />
+        <register type="NOA" address="0x00009888" value="0x02B88000" />
+        <register type="NOA" address="0x00009888" value="0x04B84000" />
+        <register type="NOA" address="0x00009888" value="0x06B84000" />
+        <register type="NOA" address="0x00009888" value="0x08B84000" />
+        <register type="NOA" address="0x00009888" value="0x0AB84000" />
+        <register type="NOA" address="0x00009888" value="0x0CB88000" />
+        <register type="NOA" address="0x00009888" value="0x0CB98000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x00B98000" />
+        <register type="NOA" address="0x00009888" value="0x02B9A000" />
+        <register type="NOA" address="0x00009888" value="0x04B9A000" />
+        <register type="NOA" address="0x00009888" value="0x06B92000" />
+        <register type="NOA" address="0x00009888" value="0x1ABA0200" />
+        <register type="NOA" address="0x00009888" value="0x02BA8000" />
+        <register type="NOA" address="0x00009888" value="0x0CBA8000" />
+        <register type="NOA" address="0x00009888" value="0x04908000" />
+        <register type="NOA" address="0x00009888" value="0x04918000" />
+        <register type="NOA" address="0x00009888" value="0x04927300" />
+        <register type="NOA" address="0x00009888" value="0x10920000" />
+        <register type="NOA" address="0x00009888" value="0x1893000A" />
+        <register type="NOA" address="0x00009888" value="0x0A934000" />
+        <register type="NOA" address="0x00009888" value="0x0A946000" />
+        <register type="NOA" address="0x00009888" value="0x0C959000" />
+        <register type="NOA" address="0x00009888" value="0x0E950098" />
+        <register type="NOA" address="0x00009888" value="0x10950000" />
+        <register type="NOA" address="0x00009888" value="0x04B04000" />
+        <register type="NOA" address="0x00009888" value="0x04B14000" />
+        <register type="NOA" address="0x00009888" value="0x04B20073" />
+        <register type="NOA" address="0x00009888" value="0x10B20000" />
+        <register type="NOA" address="0x00009888" value="0x04B38000" />
+        <register type="NOA" address="0x00009888" value="0x06B38000" />
+        <register type="NOA" address="0x00009888" value="0x08B34000" />
+        <register type="NOA" address="0x00009888" value="0x04B4C000" />
+        <register type="NOA" address="0x00009888" value="0x02B59890" />
+        <register type="NOA" address="0x00009888" value="0x10B50000" />
+        <register type="NOA" address="0x00009888" value="0x06D04000" />
+        <register type="NOA" address="0x00009888" value="0x06D14000" />
+        <register type="NOA" address="0x00009888" value="0x06D20073" />
+        <register type="NOA" address="0x00009888" value="0x10D20000" />
+        <register type="NOA" address="0x00009888" value="0x18D30020" />
+        <register type="NOA" address="0x00009888" value="0x02D38000" />
+        <register type="NOA" address="0x00009888" value="0x0CD34000" />
+        <register type="NOA" address="0x00009888" value="0x0AD48000" />
+        <register type="NOA" address="0x00009888" value="0x04D42000" />
+        <register type="NOA" address="0x00009888" value="0x0ED59000" />
+        <register type="NOA" address="0x00009888" value="0x00D59800" />
+        <register type="NOA" address="0x00009888" value="0x10D50000" />
+        <register type="NOA" address="0x00009888" value="0x0F88000E" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x0B888000" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B5500" />
+        <register type="NOA" address="0x00009888" value="0x258B000A" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8D8000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x018D8000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x058DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x2185000A" />
+        <register type="NOA" address="0x00009888" value="0x1B830150" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D848000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D808000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47801021" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800C64" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800C02" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler_2"
+       hw_config_guid="b996a2b7-c59c-492d-877a-8cd54fd6df84"
+       chipset="CHV"
+       symbol_name="Sampler_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice2 Input Available"
+             description="The percentage of time in which slice0 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_input_available"
+             units="percent"
+             symbol_name="Sampler02InputAvailable"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice0 Input Available"
+             description="The percentage of time in which slice0 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_input_available"
+             units="percent"
+             symbol_name="Sampler00InputAvailable"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_output_ready"
+             units="percent"
+             symbol_name="Sampler02OutputReady"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice0 Subslice1 Input Available"
+             description="The percentage of time in which slice0 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_input_available"
+             units="percent"
+             symbol_name="Sampler01InputAvailable"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice0 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice0 sampler output is ready."
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_output_ready"
+             units="percent"
+             symbol_name="Sampler00OutputReady"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice0 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice1 sampler output is ready."
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_output_ready"
+             units="percent"
+             symbol_name="Sampler01OutputReady"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x18121400" />
+        <register type="NOA" address="0x00009888" value="0x141500AB" />
+        <register type="NOA" address="0x00009888" value="0x18321400" />
+        <register type="NOA" address="0x00009888" value="0x143500AB" />
+        <register type="NOA" address="0x00009888" value="0x18521400" />
+        <register type="NOA" address="0x00009888" value="0x145500AB" />
+        <register type="NOA" address="0x00009888" value="0x0C5C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E5C4000" />
+        <register type="NOA" address="0x00009888" value="0x025CC000" />
+        <register type="NOA" address="0x00009888" value="0x045CC000" />
+        <register type="NOA" address="0x00009888" value="0x1A3D00A0" />
+        <register type="NOA" address="0x00009888" value="0x0A3D8000" />
+        <register type="NOA" address="0x00009888" value="0x0C588000" />
+        <register type="NOA" address="0x00009888" value="0x0E584000" />
+        <register type="NOA" address="0x00009888" value="0x04588000" />
+        <register type="NOA" address="0x00009888" value="0x1A5B0050" />
+        <register type="NOA" address="0x00009888" value="0x045B8000" />
+        <register type="NOA" address="0x00009888" value="0x065B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B8000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B4000" />
+        <register type="NOA" address="0x00009888" value="0x101F02A0" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F00AA" />
+        <register type="NOA" address="0x00009888" value="0x18382500" />
+        <register type="NOA" address="0x00009888" value="0x02388000" />
+        <register type="NOA" address="0x00009888" value="0x04384000" />
+        <register type="NOA" address="0x00009888" value="0x06384000" />
+        <register type="NOA" address="0x00009888" value="0x08384000" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x0C388000" />
+        <register type="NOA" address="0x00009888" value="0x0C398000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x00398000" />
+        <register type="NOA" address="0x00009888" value="0x0239A000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x1A3A0200" />
+        <register type="NOA" address="0x00009888" value="0x023A8000" />
+        <register type="NOA" address="0x00009888" value="0x0C3A8000" />
+        <register type="NOA" address="0x00009888" value="0x04108000" />
+        <register type="NOA" address="0x00009888" value="0x04118000" />
+        <register type="NOA" address="0x00009888" value="0x04127300" />
+        <register type="NOA" address="0x00009888" value="0x10120000" />
+        <register type="NOA" address="0x00009888" value="0x1813000A" />
+        <register type="NOA" address="0x00009888" value="0x0A134000" />
+        <register type="NOA" address="0x00009888" value="0x0A146000" />
+        <register type="NOA" address="0x00009888" value="0x0C159000" />
+        <register type="NOA" address="0x00009888" value="0x0E150098" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x04304000" />
+        <register type="NOA" address="0x00009888" value="0x04314000" />
+        <register type="NOA" address="0x00009888" value="0x04320073" />
+        <register type="NOA" address="0x00009888" value="0x10320000" />
+        <register type="NOA" address="0x00009888" value="0x04338000" />
+        <register type="NOA" address="0x00009888" value="0x06338000" />
+        <register type="NOA" address="0x00009888" value="0x08334000" />
+        <register type="NOA" address="0x00009888" value="0x0434C000" />
+        <register type="NOA" address="0x00009888" value="0x02359890" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x06504000" />
+        <register type="NOA" address="0x00009888" value="0x06514000" />
+        <register type="NOA" address="0x00009888" value="0x06520073" />
+        <register type="NOA" address="0x00009888" value="0x10520000" />
+        <register type="NOA" address="0x00009888" value="0x18530020" />
+        <register type="NOA" address="0x00009888" value="0x02538000" />
+        <register type="NOA" address="0x00009888" value="0x0C534000" />
+        <register type="NOA" address="0x00009888" value="0x0A548000" />
+        <register type="NOA" address="0x00009888" value="0x04542000" />
+        <register type="NOA" address="0x00009888" value="0x0E559000" />
+        <register type="NOA" address="0x00009888" value="0x00559800" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AA000" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x258B0005" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x2185000A" />
+        <register type="NOA" address="0x00009888" value="0x1B830150" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0D848000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x07844000" />
+        <register type="NOA" address="0x00009888" value="0x1D808000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x17804000" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47801021" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800C64" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x41800C02" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_1"
+       hw_config_guid="eb2fecba-b431-42e7-8261-fe9429a6e67a"
+       chipset="CHV"
+       symbol_name="TDL_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice1 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice1 subslice1 thread dispatcher."
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread11_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread11ReadyForDispatch"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice1 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice1 subslice2 thread dispatcher."
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread12_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread12ReadyForDispatch"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice1 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice1 subslice0 thread dispatcher."
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread10_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread10ReadyForDispatch"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 0."
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort0"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort1"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort1"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort0"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice1 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice1 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread12_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread12ReadyForDispatch"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice1 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice1 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread10_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread10ReadyForDispatch"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice1 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice1 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread11_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread11ReadyForDispatch"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort0"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort1"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x16154D60" />
+        <register type="NOA" address="0x00009888" value="0x16352E60" />
+        <register type="NOA" address="0x00009888" value="0x16554D60" />
+        <register type="NOA" address="0x00009888" value="0x16950000" />
+        <register type="NOA" address="0x00009888" value="0x16B50000" />
+        <register type="NOA" address="0x00009888" value="0x16D50000" />
+        <register type="NOA" address="0x00009888" value="0x005C8000" />
+        <register type="NOA" address="0x00009888" value="0x045CC000" />
+        <register type="NOA" address="0x00009888" value="0x065C4000" />
+        <register type="NOA" address="0x00009888" value="0x083D8000" />
+        <register type="NOA" address="0x00009888" value="0x0A3D8000" />
+        <register type="NOA" address="0x00009888" value="0x0458C000" />
+        <register type="NOA" address="0x00009888" value="0x025B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B4000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E1F00AA" />
+        <register type="NOA" address="0x00009888" value="0x02384000" />
+        <register type="NOA" address="0x00009888" value="0x04388000" />
+        <register type="NOA" address="0x00009888" value="0x06388000" />
+        <register type="NOA" address="0x00009888" value="0x08384000" />
+        <register type="NOA" address="0x00009888" value="0x0A384000" />
+        <register type="NOA" address="0x00009888" value="0x0C384000" />
+        <register type="NOA" address="0x00009888" value="0x00398000" />
+        <register type="NOA" address="0x00009888" value="0x0239A000" />
+        <register type="NOA" address="0x00009888" value="0x0439A000" />
+        <register type="NOA" address="0x00009888" value="0x06392000" />
+        <register type="NOA" address="0x00009888" value="0x043A8000" />
+        <register type="NOA" address="0x00009888" value="0x063A8000" />
+        <register type="NOA" address="0x00009888" value="0x08138000" />
+        <register type="NOA" address="0x00009888" value="0x0A138000" />
+        <register type="NOA" address="0x00009888" value="0x06143000" />
+        <register type="NOA" address="0x00009888" value="0x0415CFC7" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x02338000" />
+        <register type="NOA" address="0x00009888" value="0x0C338000" />
+        <register type="NOA" address="0x00009888" value="0x04342000" />
+        <register type="NOA" address="0x00009888" value="0x06344000" />
+        <register type="NOA" address="0x00009888" value="0x0035C700" />
+        <register type="NOA" address="0x00009888" value="0x063500CF" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x04538000" />
+        <register type="NOA" address="0x00009888" value="0x06538000" />
+        <register type="NOA" address="0x00009888" value="0x0454C000" />
+        <register type="NOA" address="0x00009888" value="0x0255CFC7" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x06DC8000" />
+        <register type="NOA" address="0x00009888" value="0x08DC4000" />
+        <register type="NOA" address="0x00009888" value="0x0CDCC000" />
+        <register type="NOA" address="0x00009888" value="0x0EDCC000" />
+        <register type="NOA" address="0x00009888" value="0x1ABD00A8" />
+        <register type="NOA" address="0x00009888" value="0x0CD8C000" />
+        <register type="NOA" address="0x00009888" value="0x0ED84000" />
+        <register type="NOA" address="0x00009888" value="0x0EDB8000" />
+        <register type="NOA" address="0x00009888" value="0x18DB0800" />
+        <register type="NOA" address="0x00009888" value="0x1ADB0254" />
+        <register type="NOA" address="0x00009888" value="0x0E9FAA00" />
+        <register type="NOA" address="0x00009888" value="0x109F02AA" />
+        <register type="NOA" address="0x00009888" value="0x0EB84000" />
+        <register type="NOA" address="0x00009888" value="0x16B84000" />
+        <register type="NOA" address="0x00009888" value="0x18B8156A" />
+        <register type="NOA" address="0x00009888" value="0x06B98000" />
+        <register type="NOA" address="0x00009888" value="0x08B9A000" />
+        <register type="NOA" address="0x00009888" value="0x0AB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0CB9A000" />
+        <register type="NOA" address="0x00009888" value="0x0EB9A000" />
+        <register type="NOA" address="0x00009888" value="0x18BAA000" />
+        <register type="NOA" address="0x00009888" value="0x1ABA0002" />
+        <register type="NOA" address="0x00009888" value="0x16934000" />
+        <register type="NOA" address="0x00009888" value="0x1893000A" />
+        <register type="NOA" address="0x00009888" value="0x0A947000" />
+        <register type="NOA" address="0x00009888" value="0x0C95C5C1" />
+        <register type="NOA" address="0x00009888" value="0x0E9500C3" />
+        <register type="NOA" address="0x00009888" value="0x10950000" />
+        <register type="NOA" address="0x00009888" value="0x0EB38000" />
+        <register type="NOA" address="0x00009888" value="0x16B30040" />
+        <register type="NOA" address="0x00009888" value="0x18B30020" />
+        <register type="NOA" address="0x00009888" value="0x06B48000" />
+        <register type="NOA" address="0x00009888" value="0x08B41000" />
+        <register type="NOA" address="0x00009888" value="0x0AB48000" />
+        <register type="NOA" address="0x00009888" value="0x06B5C500" />
+        <register type="NOA" address="0x00009888" value="0x08B500C3" />
+        <register type="NOA" address="0x00009888" value="0x0EB5C100" />
+        <register type="NOA" address="0x00009888" value="0x10B50000" />
+        <register type="NOA" address="0x00009888" value="0x16D31500" />
+        <register type="NOA" address="0x00009888" value="0x08D4E000" />
+        <register type="NOA" address="0x00009888" value="0x08D5C100" />
+        <register type="NOA" address="0x00009888" value="0x0AD5C3C5" />
+        <register type="NOA" address="0x00009888" value="0x10D50000" />
+        <register type="NOA" address="0x00009888" value="0x0D88F800" />
+        <register type="NOA" address="0x00009888" value="0x0F88000F" />
+        <register type="NOA" address="0x00009888" value="0x038A8000" />
+        <register type="NOA" address="0x00009888" value="0x058A8000" />
+        <register type="NOA" address="0x00009888" value="0x078A8000" />
+        <register type="NOA" address="0x00009888" value="0x098A8000" />
+        <register type="NOA" address="0x00009888" value="0x0B8A8000" />
+        <register type="NOA" address="0x00009888" value="0x0D8A8000" />
+        <register type="NOA" address="0x00009888" value="0x258BAAA5" />
+        <register type="NOA" address="0x00009888" value="0x278B002A" />
+        <register type="NOA" address="0x00009888" value="0x238B2A80" />
+        <register type="NOA" address="0x00009888" value="0x0F8C4000" />
+        <register type="NOA" address="0x00009888" value="0x178C2000" />
+        <register type="NOA" address="0x00009888" value="0x198C5500" />
+        <register type="NOA" address="0x00009888" value="0x1B8C0015" />
+        <register type="NOA" address="0x00009888" value="0x078D8000" />
+        <register type="NOA" address="0x00009888" value="0x098DA000" />
+        <register type="NOA" address="0x00009888" value="0x0B8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0D8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0F8DA000" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0784C000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800C42" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45800063" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x47800800" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F8014A4" />
+        <register type="NOA" address="0x00009888" value="0x41801042" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x0000FE7F" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000FFBF" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFF7" />
+        <register type="OA" address="0x00002798" value="0x00000000" />
+        <register type="OA" address="0x0000279C" value="0x0000FFF9" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_2"
+       hw_config_guid="60749470-a648-4a4b-9f10-dbfe1e36e44d"
+       chipset="CHV"
+       symbol_name="TDL_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice1 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header11_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader11ReadyPort1"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice1 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header11_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader11ReadyPort0"
+             availability="$SubsliceMask 0x10 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice1 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header12_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader12ReadyPort0"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice1 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header10_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader10ReadyPort1"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice1 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header12_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader12ReadyPort1"
+             availability="$SubsliceMask 0x20 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL 2 UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice1 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice1 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header10_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader10ReadyPort0"
+             availability="$SubsliceMask 0x8 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x16150000" />
+        <register type="NOA" address="0x00009888" value="0x16350000" />
+        <register type="NOA" address="0x00009888" value="0x16550000" />
+        <register type="NOA" address="0x00009888" value="0x16952E60" />
+        <register type="NOA" address="0x00009888" value="0x16B54D60" />
+        <register type="NOA" address="0x00009888" value="0x16D52E60" />
+        <register type="NOA" address="0x00009888" value="0x065C8000" />
+        <register type="NOA" address="0x00009888" value="0x085CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A5CC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5C4000" />
+        <register type="NOA" address="0x00009888" value="0x0E3D8000" />
+        <register type="NOA" address="0x00009888" value="0x183DA000" />
+        <register type="NOA" address="0x00009888" value="0x06588000" />
+        <register type="NOA" address="0x00009888" value="0x08588000" />
+        <register type="NOA" address="0x00009888" value="0x0A584000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x185B5800" />
+        <register type="NOA" address="0x00009888" value="0x1A5B000A" />
+        <register type="NOA" address="0x00009888" value="0x0E1FAA00" />
+        <register type="NOA" address="0x00009888" value="0x101F02AA" />
+        <register type="NOA" address="0x00009888" value="0x0E384000" />
+        <register type="NOA" address="0x00009888" value="0x16384000" />
+        <register type="NOA" address="0x00009888" value="0x18382A55" />
+        <register type="NOA" address="0x00009888" value="0x06398000" />
+        <register type="NOA" address="0x00009888" value="0x0839A000" />
+        <register type="NOA" address="0x00009888" value="0x0A39A000" />
+        <register type="NOA" address="0x00009888" value="0x0C39A000" />
+        <register type="NOA" address="0x00009888" value="0x0E39A000" />
+        <register type="NOA" address="0x00009888" value="0x1A3A02A0" />
+        <register type="NOA" address="0x00009888" value="0x0E138000" />
+        <register type="NOA" address="0x00009888" value="0x16130500" />
+        <register type="NOA" address="0x00009888" value="0x06148000" />
+        <register type="NOA" address="0x00009888" value="0x08146000" />
+        <register type="NOA" address="0x00009888" value="0x0615C100" />
+        <register type="NOA" address="0x00009888" value="0x0815C500" />
+        <register type="NOA" address="0x00009888" value="0x0A1500C3" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x16335040" />
+        <register type="NOA" address="0x00009888" value="0x08349000" />
+        <register type="NOA" address="0x00009888" value="0x0A341000" />
+        <register type="NOA" address="0x00009888" value="0x083500C1" />
+        <register type="NOA" address="0x00009888" value="0x0A35C500" />
+        <register type="NOA" address="0x00009888" value="0x0C3500C3" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x1853002A" />
+        <register type="NOA" address="0x00009888" value="0x0A54E000" />
+        <register type="NOA" address="0x00009888" value="0x0C55C500" />
+        <register type="NOA" address="0x00009888" value="0x0E55C1C3" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x00DC8000" />
+        <register type="NOA" address="0x00009888" value="0x02DCC000" />
+        <register type="NOA" address="0x00009888" value="0x04DC4000" />
+        <register type="NOA" address="0x00009888" value="0x04BD8000" />
+        <register type="NOA" address="0x00009888" value="0x06BD8000" />
+        <register type="NOA" address="0x00009888" value="0x02D8C000" />
+        <register type="NOA" address="0x00009888" value="0x02DB8000" />
+        <register type="NOA" address="0x00009888" value="0x04DB4000" />
+        <register type="NOA" address="0x00009888" value="0x06DB4000" />
+        <register type="NOA" address="0x00009888" value="0x08DB8000" />
+        <register type="NOA" address="0x00009888" value="0x0C9FA000" />
+        <register type="NOA" address="0x00009888" value="0x0E9F00AA" />
+        <register type="NOA" address="0x00009888" value="0x02B84000" />
+        <register type="NOA" address="0x00009888" value="0x04B84000" />
+        <register type="NOA" address="0x00009888" value="0x06B84000" />
+        <register type="NOA" address="0x00009888" value="0x08B84000" />
+        <register type="NOA" address="0x00009888" value="0x0AB88000" />
+        <register type="NOA" address="0x00009888" value="0x0CB88000" />
+        <register type="NOA" address="0x00009888" value="0x00B98000" />
+        <register type="NOA" address="0x00009888" value="0x02B9A000" />
+        <register type="NOA" address="0x00009888" value="0x04B9A000" />
+        <register type="NOA" address="0x00009888" value="0x06B92000" />
+        <register type="NOA" address="0x00009888" value="0x0ABA8000" />
+        <register type="NOA" address="0x00009888" value="0x0CBA8000" />
+        <register type="NOA" address="0x00009888" value="0x04938000" />
+        <register type="NOA" address="0x00009888" value="0x06938000" />
+        <register type="NOA" address="0x00009888" value="0x0494C000" />
+        <register type="NOA" address="0x00009888" value="0x0295CFC7" />
+        <register type="NOA" address="0x00009888" value="0x10950000" />
+        <register type="NOA" address="0x00009888" value="0x02B38000" />
+        <register type="NOA" address="0x00009888" value="0x08B38000" />
+        <register type="NOA" address="0x00009888" value="0x04B42000" />
+        <register type="NOA" address="0x00009888" value="0x06B41000" />
+        <register type="NOA" address="0x00009888" value="0x00B5C700" />
+        <register type="NOA" address="0x00009888" value="0x04B500CF" />
+        <register type="NOA" address="0x00009888" value="0x10B50000" />
+        <register type="NOA" address="0x00009888" value="0x0AD38000" />
+        <register type="NOA" address="0x00009888" value="0x0CD38000" />
+        <register type="NOA" address="0x00009888" value="0x06D46000" />
+        <register type="NOA" address="0x00009888" value="0x04D5C700" />
+        <register type="NOA" address="0x00009888" value="0x06D500CF" />
+        <register type="NOA" address="0x00009888" value="0x10D50000" />
+        <register type="NOA" address="0x00009888" value="0x03888000" />
+        <register type="NOA" address="0x00009888" value="0x05888000" />
+        <register type="NOA" address="0x00009888" value="0x07888000" />
+        <register type="NOA" address="0x00009888" value="0x09888000" />
+        <register type="NOA" address="0x00009888" value="0x0B888000" />
+        <register type="NOA" address="0x00009888" value="0x0D880400" />
+        <register type="NOA" address="0x00009888" value="0x0F8A8000" />
+        <register type="NOA" address="0x00009888" value="0x198A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8AAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0002" />
+        <register type="NOA" address="0x00009888" value="0x258B555A" />
+        <register type="NOA" address="0x00009888" value="0x278B0015" />
+        <register type="NOA" address="0x00009888" value="0x238B5500" />
+        <register type="NOA" address="0x00009888" value="0x038C4000" />
+        <register type="NOA" address="0x00009888" value="0x058C4000" />
+        <register type="NOA" address="0x00009888" value="0x078C4000" />
+        <register type="NOA" address="0x00009888" value="0x098C4000" />
+        <register type="NOA" address="0x00009888" value="0x0B8C4000" />
+        <register type="NOA" address="0x00009888" value="0x0D8C4000" />
+        <register type="NOA" address="0x00009888" value="0x018D8000" />
+        <register type="NOA" address="0x00009888" value="0x038DA000" />
+        <register type="NOA" address="0x00009888" value="0x058DA000" />
+        <register type="NOA" address="0x00009888" value="0x078D2000" />
+        <register type="NOA" address="0x00009888" value="0x2185AAAA" />
+        <register type="NOA" address="0x00009888" value="0x2385002A" />
+        <register type="NOA" address="0x00009888" value="0x1F85AA00" />
+        <register type="NOA" address="0x00009888" value="0x0F834000" />
+        <register type="NOA" address="0x00009888" value="0x19835400" />
+        <register type="NOA" address="0x00009888" value="0x1B830155" />
+        <register type="NOA" address="0x00009888" value="0x03834000" />
+        <register type="NOA" address="0x00009888" value="0x05834000" />
+        <register type="NOA" address="0x00009888" value="0x07834000" />
+        <register type="NOA" address="0x00009888" value="0x09834000" />
+        <register type="NOA" address="0x00009888" value="0x0B834000" />
+        <register type="NOA" address="0x00009888" value="0x0D834000" />
+        <register type="NOA" address="0x00009888" value="0x0784C000" />
+        <register type="NOA" address="0x00009888" value="0x0984C000" />
+        <register type="NOA" address="0x00009888" value="0x0B84C000" />
+        <register type="NOA" address="0x00009888" value="0x0D84C000" />
+        <register type="NOA" address="0x00009888" value="0x0F84C000" />
+        <register type="NOA" address="0x00009888" value="0x01848000" />
+        <register type="NOA" address="0x00009888" value="0x0384C000" />
+        <register type="NOA" address="0x00009888" value="0x0584C000" />
+        <register type="NOA" address="0x00009888" value="0x1780C000" />
+        <register type="NOA" address="0x00009888" value="0x1980C000" />
+        <register type="NOA" address="0x00009888" value="0x1B80C000" />
+        <register type="NOA" address="0x00009888" value="0x1D80C000" />
+        <register type="NOA" address="0x00009888" value="0x1F80C000" />
+        <register type="NOA" address="0x00009888" value="0x11808000" />
+        <register type="NOA" address="0x00009888" value="0x1380C000" />
+        <register type="NOA" address="0x00009888" value="0x1580C000" />
+        <register type="NOA" address="0x00009888" value="0x4F800000" />
+        <register type="NOA" address="0x00009888" value="0x43800882" />
+        <register type="NOA" address="0x00009888" value="0x51800000" />
+        <register type="NOA" address="0x00009888" value="0x45801082" />
+        <register type="NOA" address="0x00009888" value="0x53800000" />
+        <register type="NOA" address="0x00009888" value="0x478014A5" />
+        <register type="NOA" address="0x00009888" value="0x21800000" />
+        <register type="NOA" address="0x00009888" value="0x31800000" />
+        <register type="NOA" address="0x00009888" value="0x4D800000" />
+        <register type="NOA" address="0x00009888" value="0x3F800002" />
+        <register type="NOA" address="0x00009888" value="0x41800C62" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x0000FE7F" />
+        <register type="OA" address="0x00002780" value="0x00000000" />
+        <register type="OA" address="0x00002784" value="0x0000FF9F" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000FFE7" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFFB" />
+        <register type="OA" address="0x00002798" value="0x00000002" />
+        <register type="OA" address="0x0000279C" value="0x0000FFFD" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="MDAPI testing set Gen8LP"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="test_oa"
+       hw_config_guid="4a534b07-cba3-414d-8d60-874830e883aa"
+       chipset="CHV"
+       symbol_name="TestOa"
+       >
+    <counter name="TestCounter7"
+             description="HW test counter 7. Factor: 0.666"
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="counter7"
+             units="events"
+             symbol_name="Counter7"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="TestCounter8"
+             description="HW test counter 8. Should be equal to 1."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="counter8"
+             units="events"
+             symbol_name="Counter8"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter4"
+             description="HW test counter 4. Factor: 0.333"
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="counter4"
+             units="events"
+             symbol_name="Counter4"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter5"
+             description="HW test counter 5. Factor: 0.333"
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="counter5"
+             units="events"
+             symbol_name="Counter5"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter6"
+             description="HW test counter 6. Factor: 0.166"
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="counter6"
+             units="events"
+             symbol_name="Counter6"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter3"
+             description="HW test counter 3. Factor: 0.5"
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="counter3"
+             units="events"
+             symbol_name="Counter3"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter0"
+             description="HW test counter 0. Factor: 0.0"
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="counter0"
+             units="events"
+             symbol_name="Counter0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter1"
+             description="HW test counter 1. Factor: 1.0"
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="counter1"
+             units="events"
+             symbol_name="Counter1"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter2"
+             description="HW test counter 2. Factor: 1.0"
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="counter2"
+             units="events"
+             symbol_name="Counter2"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x59800000" />
+        <register type="NOA" address="0x00009888" value="0x59800001" />
+        <register type="NOA" address="0x00009888" value="0x338B0000" />
+        <register type="NOA" address="0x00009888" value="0x258B0066" />
+        <register type="NOA" address="0x00009888" value="0x058B0000" />
+        <register type="NOA" address="0x00009888" value="0x038B0000" />
+        <register type="NOA" address="0x00009888" value="0x03844000" />
+        <register type="NOA" address="0x00009888" value="0x47800080" />
+        <register type="NOA" address="0x00009888" value="0x57800000" />
+        <register type="NOA" address="0x001823A4" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x59800000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x00000004" />
+        <register type="OA" address="0x00002774" value="0x00000000" />
+        <register type="OA" address="0x00002778" value="0x00000003" />
+        <register type="OA" address="0x0000277C" value="0x00000000" />
+        <register type="OA" address="0x00002780" value="0x00000007" />
+        <register type="OA" address="0x00002784" value="0x00000000" />
+        <register type="OA" address="0x00002788" value="0x00100002" />
+        <register type="OA" address="0x0000278C" value="0x0000FFF7" />
+        <register type="OA" address="0x00002790" value="0x00100002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00100082" />
+        <register type="OA" address="0x0000279C" value="0x0000FFEF" />
+        <register type="OA" address="0x000027A0" value="0x001000C2" />
+        <register type="OA" address="0x000027A4" value="0x0000FFE7" />
+        <register type="OA" address="0x000027A8" value="0x00100001" />
+        <register type="OA" address="0x000027AC" value="0x0000FFE7" />
+    </register_config>
+  </set>
+
+</metrics>
diff --git a/src/mesa/drivers/dri/i965/brw_oa_glk.xml b/src/mesa/drivers/dri/i965/brw_oa_glk.xml
new file mode 100644
index 0000000..31a9a31
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_oa_glk.xml
@@ -0,0 +1,9124 @@
+<?xml version="1.0"?>
+<metrics version="1493336461" merge_md5="">
+  <set name="Render Metrics Basic Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_basic"
+       hw_config_guid="d72df5c7-5b4a-4274-a43f-00b0fd51fc68"
+       chipset="GLK"
+       symbol_name="RenderBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Misses 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Cache Misses"
+             description="The total number of sampler cache misses in all LODs in all sampler units."
+             data_type="uint64"
+             equation="B 4 READ 8 UMUL"
+             underscore_name="sampler_l1_misses"
+             units="messages"
+             symbol_name="SamplerL1Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler 1 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 1 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler1_bottleneck"
+             units="percent"
+             symbol_name="Sampler1Bottleneck"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$SamplerL1Misses 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Sampler 0 Busy"
+             description="The percentage of time in which Sampler 0 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler0_busy"
+             units="percent"
+             symbol_name="Sampler0Busy"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler 1 Busy"
+             description="The percentage of time in which Sampler 1 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler1_busy"
+             units="percent"
+             symbol_name="Sampler1Busy"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Samplers Busy"
+             description="The percentage of time in which samplers have been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="$Sampler0Busy $Sampler1Busy FMAX"
+             underscore_name="samplers_busy"
+             units="percent"
+             symbol_name="SamplersBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI Fixed Pipe Throughput"
+             description="The total number of GPU memory bytes transferred between 3D Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD 64 UMUL"
+             underscore_name="gti_vf_throughput"
+             units="bytes"
+             symbol_name="GtiVfThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/3D Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler 0 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 0 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler0_bottleneck"
+             units="percent"
+             symbol_name="Sampler0Bottleneck"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="L3 Lookup Accesses w/o IC"
+             description="The total number of L3 cache lookup accesses w/o IC."
+             data_type="uint64"
+             equation="$SamplerL1Misses $ShaderMemoryAccesses UADD"
+             underscore_name="l3_lookups"
+             units="messages"
+             symbol_name="L3Lookups"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Depth Throughput"
+             description="The total number of GPU memory bytes transferred between depth caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 0 READ C 1 READ UADD 64 UMUL"
+             underscore_name="gti_depth_throughput"
+             units="bytes"
+             symbol_name="GtiDepthThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Depth Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Samplers Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which samplers have been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="$Sampler0Bottleneck $Sampler1Bottleneck FMAX"
+             max_equation="100"
+             underscore_name="sampler_bottleneck"
+             units="percent"
+             symbol_name="SamplerBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Indicate System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI HDC TLB Lookup Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and HDC, when HDC is doing TLB lookups."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_hdc_lookups_throughput"
+             units="bytes"
+             symbol_name="GtiHdcLookupsThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI RCC Throughput"
+             description="The total number of GPU memory bytes transferred between render color caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 2 READ C 3 READ UADD 64 UMUL"
+             underscore_name="gti_rcc_throughput"
+             units="bytes"
+             symbol_name="GtiRccThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Color Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C00F0" />
+        <register type="NOA" address="0x00009888" value="0x12120280" />
+        <register type="NOA" address="0x00009888" value="0x12320280" />
+        <register type="NOA" address="0x00009888" value="0x11930317" />
+        <register type="NOA" address="0x00009888" value="0x159303DF" />
+        <register type="NOA" address="0x00009888" value="0x3F900C00" />
+        <register type="NOA" address="0x00009888" value="0x419000A0" />
+        <register type="NOA" address="0x00009888" value="0x002D1000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x082D5000" />
+        <register type="NOA" address="0x00009888" value="0x0A2D1000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E0800" />
+        <register type="NOA" address="0x00009888" value="0x0E2E5900" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4C4000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E2000" />
+        <register type="NOA" address="0x00009888" value="0x1C4F0010" />
+        <register type="NOA" address="0x00009888" value="0x0A6C0053" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A0FCC00" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0002" />
+        <register type="NOA" address="0x00009888" value="0x1C2C0040" />
+        <register type="NOA" address="0x00009888" value="0x00101000" />
+        <register type="NOA" address="0x00009888" value="0x04101000" />
+        <register type="NOA" address="0x00009888" value="0x00114000" />
+        <register type="NOA" address="0x00009888" value="0x08114000" />
+        <register type="NOA" address="0x00009888" value="0x00120020" />
+        <register type="NOA" address="0x00009888" value="0x08120021" />
+        <register type="NOA" address="0x00009888" value="0x00141000" />
+        <register type="NOA" address="0x00009888" value="0x08141000" />
+        <register type="NOA" address="0x00009888" value="0x02308000" />
+        <register type="NOA" address="0x00009888" value="0x04302000" />
+        <register type="NOA" address="0x00009888" value="0x06318000" />
+        <register type="NOA" address="0x00009888" value="0x08318000" />
+        <register type="NOA" address="0x00009888" value="0x06320800" />
+        <register type="NOA" address="0x00009888" value="0x08320840" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x06344000" />
+        <register type="NOA" address="0x00009888" value="0x08344000" />
+        <register type="NOA" address="0x00009888" value="0x0D931831" />
+        <register type="NOA" address="0x00009888" value="0x0F939F3F" />
+        <register type="NOA" address="0x00009888" value="0x01939E80" />
+        <register type="NOA" address="0x00009888" value="0x039303BC" />
+        <register type="NOA" address="0x00009888" value="0x0593000E" />
+        <register type="NOA" address="0x00009888" value="0x1993002A" />
+        <register type="NOA" address="0x00009888" value="0x07930000" />
+        <register type="NOA" address="0x00009888" value="0x09930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900177" />
+        <register type="NOA" address="0x00009888" value="0x1F900187" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25904000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x53901110" />
+        <register type="NOA" address="0x00009888" value="0x43900423" />
+        <register type="NOA" address="0x00009888" value="0x55900111" />
+        <register type="NOA" address="0x00009888" value="0x47900C02" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900020" />
+        <register type="NOA" address="0x00009888" value="0x59901111" />
+        <register type="NOA" address="0x00009888" value="0x4B900421" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900001" />
+        <register type="NOA" address="0x00009888" value="0x45900821" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Basic Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_basic"
+       hw_config_guid="814285f6-354d-41d2-ba49-e24e622714a0"
+       chipset="GLK"
+       symbol_name="ComputeBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Bytes Read"
+             description="The total number of untyped memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 6 READ B 7 READ UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_read"
+             units="bytes"
+             symbol_name="UntypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Bytes Written"
+             description="The total number of typed memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 3 READ B 4 READ UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_written"
+             units="bytes"
+             symbol_name="TypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Writes"
+             description="The total number of untyped memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 1 READ C 2 READ UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_written"
+             units="bytes"
+             symbol_name="UntypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Typed Bytes Read"
+             description="The total number of typed memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 0 READ B 1 READ UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_read"
+             units="bytes"
+             symbol_name="TypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL  $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x124F1C00" />
+        <register type="NOA" address="0x00009888" value="0x39900340" />
+        <register type="NOA" address="0x00009888" value="0x3F900C00" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x002D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x082D4000" />
+        <register type="NOA" address="0x00009888" value="0x0A2D1000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E1400" />
+        <register type="NOA" address="0x00009888" value="0x0E2E5100" />
+        <register type="NOA" address="0x00009888" value="0x102E0114" />
+        <register type="NOA" address="0x00009888" value="0x044CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4C4000" />
+        <register type="NOA" address="0x00009888" value="0x104C8000" />
+        <register type="NOA" address="0x00009888" value="0x124C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x004EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x004F6B42" />
+        <register type="NOA" address="0x00009888" value="0x064F6200" />
+        <register type="NOA" address="0x00009888" value="0x084F4100" />
+        <register type="NOA" address="0x00009888" value="0x0A4F0061" />
+        <register type="NOA" address="0x00009888" value="0x0C4F6C4C" />
+        <register type="NOA" address="0x00009888" value="0x0E4F4B00" />
+        <register type="NOA" address="0x00009888" value="0x1A4F0000" />
+        <register type="NOA" address="0x00009888" value="0x1C4F0000" />
+        <register type="NOA" address="0x00009888" value="0x180F5000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F8800" />
+        <register type="NOA" address="0x00009888" value="0x1C0F08A2" />
+        <register type="NOA" address="0x00009888" value="0x182C4000" />
+        <register type="NOA" address="0x00009888" value="0x1C2C1451" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0001" />
+        <register type="NOA" address="0x00009888" value="0x1A2C0010" />
+        <register type="NOA" address="0x00009888" value="0x01938000" />
+        <register type="NOA" address="0x00009888" value="0x0F938000" />
+        <register type="NOA" address="0x00009888" value="0x19938A28" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x19900177" />
+        <register type="NOA" address="0x00009888" value="0x1B900178" />
+        <register type="NOA" address="0x00009888" value="0x1D900125" />
+        <register type="NOA" address="0x00009888" value="0x1F900123" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x25904000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x53901000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x55900111" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Render Metrics for 3D Pipeline Profile Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_pipe_profile"
+       hw_config_guid="07d397a6-b3e6-49f6-9433-a4f293d55978"
+       chipset="GLK"
+       symbol_name="RenderPipeProfile"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which vertex shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_bottleneck"
+             units="percent"
+             symbol_name="VsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Hi-Depth Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which early hierarchical depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hi_depth_bottleneck"
+             units="percent"
+             symbol_name="HiDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which geometry shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gs_bottleneck"
+             units="percent"
+             symbol_name="GsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Geometry Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="BC Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which barycentric coordinates calculation pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="bc_bottleneck"
+             units="percent"
+             symbol_name="BcBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Barycentric Calc"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Stall"
+             description="The percentage of time in which hull shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_stall"
+             units="percent"
+             symbol_name="HsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Hull Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="VF Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which vertex fetch pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vf_bottleneck"
+             units="percent"
+             symbol_name="VfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Input Assembler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Strip-Fans Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which strip-fans pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="10"
+             equation="B 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_bottleneck"
+             units="percent"
+             symbol_name="SfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SF Stall"
+             description="The percentage of time in which strip-fans pipeline stage was stalled."
+             data_type="float"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_stall"
+             units="percent"
+             symbol_name="SfStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Bottleneck"
+             low_watermark="3"
+             description="The percentage of time in which hull shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="9"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_bottleneck"
+             units="percent"
+             symbol_name="HsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Hull Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CL Stall"
+             description="The percentage of time in which clipper pipeline stage was stalled."
+             data_type="float"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_stall"
+             units="percent"
+             symbol_name="ClStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Clipper"
+             />
+    <counter name="SO Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which stream output pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_bottleneck"
+             units="percent"
+             symbol_name="SoBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Stream Output"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="DS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which domain shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_bottleneck"
+             units="percent"
+             symbol_name="DsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Domain Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Clipper Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which clipper pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_bottleneck"
+             units="percent"
+             symbol_name="ClBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Clipper"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Stall"
+             description="The percentage of time in which domain shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_stall"
+             units="percent"
+             symbol_name="DsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Domain Shader"
+             />
+    <counter name="Early Depth Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which early depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="early_depth_bottleneck"
+             units="percent"
+             symbol_name="EarlyDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SO Stall"
+             description="The percentage of time in which stream-output pipeline stage was stalled."
+             data_type="float"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_stall"
+             units="percent"
+             symbol_name="SoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Stream Output"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x0C2E001F" />
+        <register type="NOA" address="0x00009888" value="0x0A2F0000" />
+        <register type="NOA" address="0x00009888" value="0x10186800" />
+        <register type="NOA" address="0x00009888" value="0x11810019" />
+        <register type="NOA" address="0x00009888" value="0x15810013" />
+        <register type="NOA" address="0x00009888" value="0x13820020" />
+        <register type="NOA" address="0x00009888" value="0x11830020" />
+        <register type="NOA" address="0x00009888" value="0x17840000" />
+        <register type="NOA" address="0x00009888" value="0x11860007" />
+        <register type="NOA" address="0x00009888" value="0x21860000" />
+        <register type="NOA" address="0x00009888" value="0x178703E0" />
+        <register type="NOA" address="0x00009888" value="0x0C2D8000" />
+        <register type="NOA" address="0x00009888" value="0x042D4000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x022E5400" />
+        <register type="NOA" address="0x00009888" value="0x002E0000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0080" />
+        <register type="NOA" address="0x00009888" value="0x082F0040" />
+        <register type="NOA" address="0x00009888" value="0x002F0000" />
+        <register type="NOA" address="0x00009888" value="0x06143000" />
+        <register type="NOA" address="0x00009888" value="0x06174000" />
+        <register type="NOA" address="0x00009888" value="0x06180012" />
+        <register type="NOA" address="0x00009888" value="0x00180000" />
+        <register type="NOA" address="0x00009888" value="0x0D804000" />
+        <register type="NOA" address="0x00009888" value="0x0F804000" />
+        <register type="NOA" address="0x00009888" value="0x05804000" />
+        <register type="NOA" address="0x00009888" value="0x09810200" />
+        <register type="NOA" address="0x00009888" value="0x0B810030" />
+        <register type="NOA" address="0x00009888" value="0x03810003" />
+        <register type="NOA" address="0x00009888" value="0x21819140" />
+        <register type="NOA" address="0x00009888" value="0x23819050" />
+        <register type="NOA" address="0x00009888" value="0x25810018" />
+        <register type="NOA" address="0x00009888" value="0x0B820980" />
+        <register type="NOA" address="0x00009888" value="0x03820D80" />
+        <register type="NOA" address="0x00009888" value="0x11820000" />
+        <register type="NOA" address="0x00009888" value="0x0182C000" />
+        <register type="NOA" address="0x00009888" value="0x07828000" />
+        <register type="NOA" address="0x00009888" value="0x09824000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x0D830004" />
+        <register type="NOA" address="0x00009888" value="0x0583000C" />
+        <register type="NOA" address="0x00009888" value="0x0F831000" />
+        <register type="NOA" address="0x00009888" value="0x01848072" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x07848000" />
+        <register type="NOA" address="0x00009888" value="0x09844000" />
+        <register type="NOA" address="0x00009888" value="0x0F848000" />
+        <register type="NOA" address="0x00009888" value="0x07860000" />
+        <register type="NOA" address="0x00009888" value="0x09860092" />
+        <register type="NOA" address="0x00009888" value="0x0F860400" />
+        <register type="NOA" address="0x00009888" value="0x01869100" />
+        <register type="NOA" address="0x00009888" value="0x0F870065" />
+        <register type="NOA" address="0x00009888" value="0x01870000" />
+        <register type="NOA" address="0x00009888" value="0x19930800" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x1B952000" />
+        <register type="NOA" address="0x00009888" value="0x1D955055" />
+        <register type="NOA" address="0x00009888" value="0x1F951455" />
+        <register type="NOA" address="0x00009888" value="0x0992A000" />
+        <register type="NOA" address="0x00009888" value="0x0F928000" />
+        <register type="NOA" address="0x00009888" value="0x1192A800" />
+        <register type="NOA" address="0x00009888" value="0x1392028A" />
+        <register type="NOA" address="0x00009888" value="0x0B92A000" />
+        <register type="NOA" address="0x00009888" value="0x0D922000" />
+        <register type="NOA" address="0x00009888" value="0x13908000" />
+        <register type="NOA" address="0x00009888" value="0x21908000" />
+        <register type="NOA" address="0x00009888" value="0x23908000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27908000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F908000" />
+        <register type="NOA" address="0x00009888" value="0x31908000" />
+        <register type="NOA" address="0x00009888" value="0x15908000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900C01" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900863" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900061" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C22" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFEA" />
+        <register type="OA" address="0x00002774" value="0x00007FFC" />
+        <register type="OA" address="0x00002778" value="0x0007AFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000F5FD" />
+        <register type="OA" address="0x00002780" value="0x00079FFA" />
+        <register type="OA" address="0x00002784" value="0x0000F3FB" />
+        <register type="OA" address="0x00002788" value="0x0007BF7A" />
+        <register type="OA" address="0x0000278C" value="0x0000F7E7" />
+        <register type="OA" address="0x00002790" value="0x0007FEFA" />
+        <register type="OA" address="0x00002794" value="0x0000F7CF" />
+        <register type="OA" address="0x00002798" value="0x00077FFA" />
+        <register type="OA" address="0x0000279C" value="0x0000EFDF" />
+        <register type="OA" address="0x000027A0" value="0x0006FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000CFBF" />
+        <register type="OA" address="0x000027A8" value="0x0003FFFA" />
+        <register type="OA" address="0x000027AC" value="0x00005F7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Reads Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_reads"
+       hw_config_guid="1a356946-5428-450b-a2f0-89f8783a302d"
+       chipset="GLK"
+       symbol_name="MemoryReads"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank0Reads"
+             description="The total number of GTI memory reads from L3 Bank 0 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_reads"
+             units="messages"
+             symbol_name="GtiL3Bank0Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all accesses from GTI to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiL3Bank3Reads"
+             description="The total number of GTI memory reads from L3 Bank 3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_reads"
+             units="messages"
+             symbol_name="GtiL3Bank3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiRsMemoryReads"
+             description="The total number of GTI memory reads from Resource Streamer."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_rs_memory_reads"
+             units="messages"
+             symbol_name="GtiRsMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Resource Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiHizMemoryReads"
+             description="The total number of GTI memory reads from Hierarchical Depth Cache (Hi-Depth Cache misses)."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_reads"
+             units="messages"
+             symbol_name="GtiHizMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="GtiRccMemoryReads"
+             description="The total number of GTI memory reads from Render Color Cache (Render Color Cache misses)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_reads"
+             units="messages"
+             symbol_name="GtiRccMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank1Reads"
+             description="The total number of GTI memory reads from L3 Bank 1 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_reads"
+             units="messages"
+             symbol_name="GtiL3Bank1Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiCmdStreamerMemoryReads"
+             description="The total number of GTI memory reads from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_reads"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="GtiL3Bank2Reads"
+             description="The total number of GTI memory reads from L3 Bank 2 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_reads"
+             units="messages"
+             symbol_name="GtiL3Bank2Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiMemoryReads"
+             description="The total number of GTI memory reads."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_reads"
+             units="messages"
+             symbol_name="GtiMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GtiRczMemoryReads"
+             description="The total number of GTI memory reads from Render Depth Cache (Render Depth Cache misses)."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_reads"
+             units="messages"
+             symbol_name="GtiRczMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="GtiMscMemoryReads"
+             description="The total number of GTI memory reads from Multisampling Color Cache (Multisampling Color Cache misses)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_reads"
+             units="messages"
+             symbol_name="GtiMscMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiVfMemoryReads"
+             description="The total number of GTI memory reads from Vertex Fetch."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="gti_vf_memory_reads"
+             units="messages"
+             symbol_name="GtiVfMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Vertex Fetch"
+             />
+    <counter name="GtiStcMemoryReads"
+             description="The total number of GTI memory reads from Stencil Cache (Stencil Cache misses)."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_reads"
+             units="messages"
+             symbol_name="GtiStcMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Reads"
+             description="The total number of GTI memory reads from L3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Reads $GtiL3Bank1Reads $GtiL3Bank2Reads $GtiL3Bank3Reads UADD UADD UADD"
+             underscore_name="gti_l3_reads"
+             units="messages"
+             symbol_name="GtiL3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x19800343" />
+        <register type="NOA" address="0x00009888" value="0x39900340" />
+        <register type="NOA" address="0x00009888" value="0x3F901000" />
+        <register type="NOA" address="0x00009888" value="0x41900003" />
+        <register type="NOA" address="0x00009888" value="0x03803180" />
+        <register type="NOA" address="0x00009888" value="0x058035E2" />
+        <register type="NOA" address="0x00009888" value="0x0780006A" />
+        <register type="NOA" address="0x00009888" value="0x11800000" />
+        <register type="NOA" address="0x00009888" value="0x2181A000" />
+        <register type="NOA" address="0x00009888" value="0x2381000A" />
+        <register type="NOA" address="0x00009888" value="0x1D950550" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D92A000" />
+        <register type="NOA" address="0x00009888" value="0x0F922000" />
+        <register type="NOA" address="0x00009888" value="0x13900170" />
+        <register type="NOA" address="0x00009888" value="0x21900171" />
+        <register type="NOA" address="0x00009888" value="0x23900172" />
+        <register type="NOA" address="0x00009888" value="0x25900173" />
+        <register type="NOA" address="0x00009888" value="0x27900174" />
+        <register type="NOA" address="0x00009888" value="0x29900175" />
+        <register type="NOA" address="0x00009888" value="0x2B900176" />
+        <register type="NOA" address="0x00009888" value="0x2D900177" />
+        <register type="NOA" address="0x00009888" value="0x2F90017F" />
+        <register type="NOA" address="0x00009888" value="0x31900125" />
+        <register type="NOA" address="0x00009888" value="0x15900123" />
+        <register type="NOA" address="0x00009888" value="0x17900121" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43901084" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47901080" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49901084" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B901084" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900004" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F872" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Writes Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_writes"
+       hw_config_guid="5299be9d-7a61-4c99-9f81-f87e6c5aaca9"
+       chipset="GLK"
+       symbol_name="MemoryWrites"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiMemoryWrites"
+             description="The total number of GTI memory writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_writes"
+             units="messages"
+             symbol_name="GtiMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all GTI accesses to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiMscMemoryWrites"
+             description="The total number of GTI memory writes from Multisampling Color Cache (Multisampling Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_writes"
+             units="messages"
+             symbol_name="GtiMscMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiCmdStreamerMemoryWrites"
+             description="The total number of GTI memory writes from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_writes"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiL3Bank0Writes"
+             description="The total number of GTI memory writes from L3 Bank 0 (L3 Bank 0 invalidations)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_writes"
+             units="messages"
+             symbol_name="GtiL3Bank0Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank1Writes"
+             description="The total number of GTI memory writes from L3 Bank 1 (L3 Bank 1 invalidations)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_writes"
+             units="messages"
+             symbol_name="GtiL3Bank1Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank2Writes"
+             description="The total number of GTI memory writes from L3 Bank 2 (L3 Bank 2 invalidations)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_writes"
+             units="messages"
+             symbol_name="GtiL3Bank2Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank3Writes"
+             description="The total number of GTI memory writes from L3 Bank 3 (L3 Bank 3 invalidations)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_writes"
+             units="messages"
+             symbol_name="GtiL3Bank3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Writes"
+             description="The total number of GTI memory writes from L3 (L3 invalidations)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Writes $GtiL3Bank1Writes $GtiL3Bank2Writes $GtiL3Bank3Writes UADD UADD UADD"
+             underscore_name="gti_l3_writes"
+             units="messages"
+             symbol_name="GtiL3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiRccMemoryWrites"
+             description="The total number of GTI memory writes from Render Color Cache (Render Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_writes"
+             units="messages"
+             symbol_name="GtiRccMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiSoMemoryWrites"
+             description="The total number of GTI memory writes from Stream Output."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_so_memory_writes"
+             units="messages"
+             symbol_name="GtiSoMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Stream Output"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiStcMemoryWrites"
+             description="The total number of GTI memory writes from Stencil Cache."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_writes"
+             units="messages"
+             symbol_name="GtiStcMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GtiRczMemoryWrites"
+             description="The total number of GTI memory writes from Render Depth Cache."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_writes"
+             units="messages"
+             symbol_name="GtiRczMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="GtiHizMemoryWrites"
+             description="The total number of GTI memory writes from Hierarchical Depth Cache."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_writes"
+             units="messages"
+             symbol_name="GtiHizMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x19800343" />
+        <register type="NOA" address="0x00009888" value="0x39900340" />
+        <register type="NOA" address="0x00009888" value="0x3F900000" />
+        <register type="NOA" address="0x00009888" value="0x41900080" />
+        <register type="NOA" address="0x00009888" value="0x03803180" />
+        <register type="NOA" address="0x00009888" value="0x058035E2" />
+        <register type="NOA" address="0x00009888" value="0x0780006A" />
+        <register type="NOA" address="0x00009888" value="0x11800000" />
+        <register type="NOA" address="0x00009888" value="0x2181A000" />
+        <register type="NOA" address="0x00009888" value="0x2381000A" />
+        <register type="NOA" address="0x00009888" value="0x1D950550" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D92A000" />
+        <register type="NOA" address="0x00009888" value="0x0F922000" />
+        <register type="NOA" address="0x00009888" value="0x13900180" />
+        <register type="NOA" address="0x00009888" value="0x21900181" />
+        <register type="NOA" address="0x00009888" value="0x23900182" />
+        <register type="NOA" address="0x00009888" value="0x25900183" />
+        <register type="NOA" address="0x00009888" value="0x27900184" />
+        <register type="NOA" address="0x00009888" value="0x29900185" />
+        <register type="NOA" address="0x00009888" value="0x2B900186" />
+        <register type="NOA" address="0x00009888" value="0x2D900187" />
+        <register type="NOA" address="0x00009888" value="0x2F900170" />
+        <register type="NOA" address="0x00009888" value="0x31900125" />
+        <register type="NOA" address="0x00009888" value="0x15900123" />
+        <register type="NOA" address="0x00009888" value="0x17900121" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43901084" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47901080" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49901084" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B901084" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900004" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F822" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extended Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extended"
+       hw_config_guid="bc9bcff2-459a-4cbc-986d-a84b077153f3"
+       chipset="GLK"
+       symbol_name="ComputeExtended"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Writes 0"
+             description="The subslice 0 typed writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="typed_writes0"
+             units="messages"
+             symbol_name="TypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuTypedAtomics0"
+             description="The subslice 0 EU Typed Atomics subslice 0."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_typed_atomics0"
+             units="messages"
+             symbol_name="EuTypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Atomics 0"
+             description="The subslice 0 typed atomics."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="typed_atomics0"
+             units="messages"
+             symbol_name="TypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedAtomicsPerCacheLine"
+             description="The ratio of EU typed atomics requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedAtomics0 $TypedAtomics0 FDIV"
+             underscore_name="typed_atomics_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedAtomicsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedReads0"
+             description="The subslice 0 EU Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="eu_untyped_reads0"
+             units="messages"
+             symbol_name="EuUntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Writes 0"
+             description="The subslice 0 untyped writes (including SLM writes)."
+             data_type="uint64"
+             equation="C 1 READ"
+             underscore_name="untyped_writes0"
+             units="messages"
+             symbol_name="UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedAtomics0"
+             description="The subslice 0 EU Untyped Atomics subslice 0."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="eu_untyped_atomics0"
+             units="messages"
+             symbol_name="EuUntypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedWrites0"
+             description="The subslice 0 EU Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="eu_untyped_writes0"
+             units="messages"
+             symbol_name="EuUntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedWrites0"
+             description="The subslice 0 EU A64 Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="eu_a64_untyped_writes0"
+             units="messages"
+             symbol_name="EuA64UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedWritesPerCacheLine"
+             description="The ratio of EU untyped write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuUntypedWrites0 $EuA64UntypedWrites0 UADD $UntypedWrites0 FDIV"
+             underscore_name="untyped_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedWrites0"
+             description="The subslice 0 EU Typed Writes subslice 0."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="eu_typed_writes0"
+             units="messages"
+             symbol_name="EuTypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedWritesPerCacheLine"
+             description="The ratio of EU typed write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedWrites0 $TypedWrites0 FDIV"
+             underscore_name="typed_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Reads 0"
+             description="The subslice 0 typed reads."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="typed_reads0"
+             units="messages"
+             symbol_name="TypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Reads 0"
+             description="The subslice 0 untyped reads (including SLM reads)."
+             data_type="uint64"
+             equation="C 3 READ"
+             underscore_name="untyped_reads0"
+             units="messages"
+             symbol_name="UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedReads0"
+             description="The subslice 0 EU A64 Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="eu_a64_untyped_reads0"
+             units="messages"
+             symbol_name="EuA64UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedReads0"
+             description="The subslice 0 EU Typed Reads subslice 0."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="eu_typed_reads0"
+             units="messages"
+             symbol_name="EuTypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedReadsPerCacheLine"
+             description="The ratio of EU untyped read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuUntypedReads0 $EuA64UntypedReads0 UADD $UntypedReads0 FDIV"
+             underscore_name="untyped_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedReadsPerCacheLine"
+             description="The ratio of EU typed read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuTypedReads0 $TypedReads0 FDIV"
+             underscore_name="typed_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x141C0160" />
+        <register type="NOA" address="0x00009888" value="0x161C0015" />
+        <register type="NOA" address="0x00009888" value="0x181C0120" />
+        <register type="NOA" address="0x00009888" value="0x002D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x082D5000" />
+        <register type="NOA" address="0x00009888" value="0x0A2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5400" />
+        <register type="NOA" address="0x00009888" value="0x0E2E5515" />
+        <register type="NOA" address="0x00009888" value="0x102E0155" />
+        <register type="NOA" address="0x00009888" value="0x044CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4CC000" />
+        <register type="NOA" address="0x00009888" value="0x0E4CC000" />
+        <register type="NOA" address="0x00009888" value="0x104C8000" />
+        <register type="NOA" address="0x00009888" value="0x124C8000" />
+        <register type="NOA" address="0x00009888" value="0x144C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x064CC000" />
+        <register type="NOA" address="0x00009888" value="0x084CC000" />
+        <register type="NOA" address="0x00009888" value="0x004EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084EA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4EA000" />
+        <register type="NOA" address="0x00009888" value="0x0C4EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4EA000" />
+        <register type="NOA" address="0x00009888" value="0x024EA000" />
+        <register type="NOA" address="0x00009888" value="0x044EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4F4B41" />
+        <register type="NOA" address="0x00009888" value="0x004F4200" />
+        <register type="NOA" address="0x00009888" value="0x024F404C" />
+        <register type="NOA" address="0x00009888" value="0x1C4F0000" />
+        <register type="NOA" address="0x00009888" value="0x1A4F0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x061B8000" />
+        <register type="NOA" address="0x00009888" value="0x081BC000" />
+        <register type="NOA" address="0x00009888" value="0x0A1BC000" />
+        <register type="NOA" address="0x00009888" value="0x0C1BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x001C0031" />
+        <register type="NOA" address="0x00009888" value="0x061C1900" />
+        <register type="NOA" address="0x00009888" value="0x081C1A33" />
+        <register type="NOA" address="0x00009888" value="0x0A1C1B35" />
+        <register type="NOA" address="0x00009888" value="0x0C1C3337" />
+        <register type="NOA" address="0x00009888" value="0x041C31C7" />
+        <register type="NOA" address="0x00009888" value="0x180F5000" />
+        <register type="NOA" address="0x00009888" value="0x1A0FA8AA" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0AAA" />
+        <register type="NOA" address="0x00009888" value="0x182C8000" />
+        <register type="NOA" address="0x00009888" value="0x1C2C6AAA" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0001" />
+        <register type="NOA" address="0x00009888" value="0x1A2C2950" />
+        <register type="NOA" address="0x00009888" value="0x01938000" />
+        <register type="NOA" address="0x00009888" value="0x0F938000" />
+        <register type="NOA" address="0x00009888" value="0x1993AAAA" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25904000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x29904000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900420" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900400" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900001" />
+        <register type="NOA" address="0x00009888" value="0x45900001" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FC2A" />
+        <register type="OA" address="0x00002774" value="0x0000BF00" />
+        <register type="OA" address="0x00002778" value="0x0007FC6A" />
+        <register type="OA" address="0x0000277C" value="0x0000BF00" />
+        <register type="OA" address="0x00002780" value="0x0007FC92" />
+        <register type="OA" address="0x00002784" value="0x0000BF00" />
+        <register type="OA" address="0x00002788" value="0x0007FCA2" />
+        <register type="OA" address="0x0000278C" value="0x0000BF00" />
+        <register type="OA" address="0x00002790" value="0x0007FC32" />
+        <register type="OA" address="0x00002794" value="0x0000BF00" />
+        <register type="OA" address="0x00002798" value="0x0007FC9A" />
+        <register type="OA" address="0x0000279C" value="0x0000BF00" />
+        <register type="OA" address="0x000027A0" value="0x0007FE6A" />
+        <register type="OA" address="0x000027A4" value="0x0000BF00" />
+        <register type="OA" address="0x000027A8" value="0x0007FE7A" />
+        <register type="OA" address="0x000027AC" value="0x0000BF00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics L3 Cache Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_l3_cache"
+       hw_config_guid="88ec931f-5b4a-453a-9db6-a61232b6143d"
+       chipset="GLK"
+       symbol_name="ComputeL3Cache"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 03 Accesses"
+             description="The total number of accesses to L3 Bank 03."
+             data_type="uint64"
+             equation="B 3 READ 2 UMUL"
+             underscore_name="l3_bank03_accesses"
+             units="messages"
+             symbol_name="L3Bank03Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Accesses"
+             description="The total number of L3 accesses from all entities."
+             data_type="uint64"
+             equation="C 0 READ C 1 READ B 2 READ B 3 READ UADD UADD UADD 2 UMUL"
+             underscore_name="l3_accesses"
+             units="messages"
+             symbol_name="L3Accesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 29 READ 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="EU FPU0 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu0_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ C 5 READ UADD"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="L3 Bank 00 Accesses"
+             description="The total number of accesses to L3 Bank 00."
+             data_type="uint64"
+             equation="C 0 READ 2 UMUL"
+             underscore_name="l3_bank00_accesses"
+             units="messages"
+             symbol_name="L3Bank00Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU0 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 19 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu0_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 14 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu1_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ C 5 READ UADD 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 00 IC Accesses"
+             description="The total number of accesses to L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 0 READ B 1 READ UADD 2 UMUL $L3Bank00Accesses UMIN"
+             underscore_name="l3_bank00_ic_accesses"
+             units="messages"
+             symbol_name="L3Bank00IcAccesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 00 IC Hits"
+             description="The total number of hits in L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 1 READ 2 UMUL $L3Bank00IcAccesses UMIN"
+             underscore_name="l3_bank00_ic_hits"
+             units="messages"
+             symbol_name="L3Bank00IcHits"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="Sampler Accesses"
+             description="The total number of messages send to samplers."
+             data_type="uint64"
+             equation="A 28 READ"
+             underscore_name="sampler_accesses"
+             units="messages"
+             symbol_name="SamplerAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler"
+             />
+    <counter name="L3 Bank 01 Accesses"
+             description="The total number of accesses to L3 Bank 01."
+             data_type="uint64"
+             equation="C 1 READ 2 UMUL"
+             underscore_name="l3_bank01_accesses"
+             units="messages"
+             symbol_name="L3Bank01Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 20 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu1_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="L3 Bank 02 Accesses"
+             description="The total number of accesses to L3 Bank 02."
+             data_type="uint64"
+             equation="B 2 READ 2 UMUL"
+             underscore_name="l3_bank02_accesses"
+             units="messages"
+             symbol_name="L3Bank02Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="L3 Total Throughput"
+             description="The total number of GPU memory bytes transferred via L3."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Accesses 64 UMUL"
+             underscore_name="l3_total_throughput"
+             units="bytes"
+             symbol_name="L3TotalThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="EU FPU1 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C03B0" />
+        <register type="NOA" address="0x00009888" value="0x1593001E" />
+        <register type="NOA" address="0x00009888" value="0x3F900C00" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x002D1000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x082D5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E0400" />
+        <register type="NOA" address="0x00009888" value="0x0E2E1500" />
+        <register type="NOA" address="0x00009888" value="0x102E0140" />
+        <register type="NOA" address="0x00009888" value="0x044C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4CC000" />
+        <register type="NOA" address="0x00009888" value="0x144C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x004E2000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4EA000" />
+        <register type="NOA" address="0x00009888" value="0x1A4F4001" />
+        <register type="NOA" address="0x00009888" value="0x1C4F5005" />
+        <register type="NOA" address="0x00009888" value="0x006C0051" />
+        <register type="NOA" address="0x00009888" value="0x066C5000" />
+        <register type="NOA" address="0x00009888" value="0x086C5C5D" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5E5F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x146C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x180F1000" />
+        <register type="NOA" address="0x00009888" value="0x1A0FA800" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0A00" />
+        <register type="NOA" address="0x00009888" value="0x182C4000" />
+        <register type="NOA" address="0x00009888" value="0x1C2C4015" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0001" />
+        <register type="NOA" address="0x00009888" value="0x03931980" />
+        <register type="NOA" address="0x00009888" value="0x05930032" />
+        <register type="NOA" address="0x00009888" value="0x11930000" />
+        <register type="NOA" address="0x00009888" value="0x01938000" />
+        <register type="NOA" address="0x00009888" value="0x0F938000" />
+        <register type="NOA" address="0x00009888" value="0x1993A00A" />
+        <register type="NOA" address="0x00009888" value="0x07930000" />
+        <register type="NOA" address="0x00009888" value="0x09930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900177" />
+        <register type="NOA" address="0x00009888" value="0x1F900178" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x53901000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x55900111" />
+        <register type="NOA" address="0x00009888" value="0x47900001" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900000" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFFA" />
+        <register type="OA" address="0x00002774" value="0x0000FEFE" />
+        <register type="OA" address="0x00002778" value="0x0007FFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000FEFD" />
+        <register type="OA" address="0x00002790" value="0x0007FFFA" />
+        <register type="OA" address="0x00002794" value="0x0000FBEF" />
+        <register type="OA" address="0x00002798" value="0x0007FFFA" />
+        <register type="OA" address="0x0000279C" value="0x0000FBDF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00101100" />
+        <register type="FLEX" address="0x0000E45C" value="0x00201200" />
+        <register type="FLEX" address="0x0000E55C" value="0x00301300" />
+        <register type="FLEX" address="0x0000E65C" value="0x00401400" />
+    </register_config>
+  </set>
+
+  <set name="Metric set HDCAndSF"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="hdc_and_sf"
+       hw_config_guid="530d176d-2a18-4014-adf8-1500c6c60835"
+       chipset="GLK"
+       symbol_name="HDCAndSF"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were actively processed on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Polygon Data Ready"
+             description="The percentage of time in which geometry pipeline output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="poly_data_ready"
+             units="percent"
+             symbol_name="PolyDataReady"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="HDC stalled by L3 (s0.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ C 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader01_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader01AccessStalledOnL3"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ C 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader00_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader00AccessStalledOnL3"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F0232" />
+        <register type="NOA" address="0x00009888" value="0x124F4640" />
+        <register type="NOA" address="0x00009888" value="0x11834400" />
+        <register type="NOA" address="0x00009888" value="0x022D4000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0055" />
+        <register type="NOA" address="0x00009888" value="0x064C8000" />
+        <register type="NOA" address="0x00009888" value="0x084CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x024F6100" />
+        <register type="NOA" address="0x00009888" value="0x044F416B" />
+        <register type="NOA" address="0x00009888" value="0x064F004B" />
+        <register type="NOA" address="0x00009888" value="0x1A4F0000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F02A8" />
+        <register type="NOA" address="0x00009888" value="0x1A2C5500" />
+        <register type="NOA" address="0x00009888" value="0x0F808000" />
+        <register type="NOA" address="0x00009888" value="0x25810020" />
+        <register type="NOA" address="0x00009888" value="0x0F8305C0" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x1F951000" />
+        <register type="NOA" address="0x00009888" value="0x13920200" />
+        <register type="NOA" address="0x00009888" value="0x31908000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x10800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_1"
+       hw_config_guid="fdee5a5a-f23c-43d1-aa73-f6257c71671d"
+       chipset="GLK"
+       symbol_name="L3_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank1 Active"
+             description="The percentage of time in which slice0 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_active"
+             units="percent"
+             symbol_name="L30Bank1Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 L3 Bank1 Stalled"
+             description="The percentage of time in which slice0 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_stalled"
+             units="percent"
+             symbol_name="L30Bank1Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Active"
+             description="The percentage of time in which slice0 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_active"
+             units="percent"
+             symbol_name="L30Bank0Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Stalled"
+             description="The percentage of time in which slice0 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_stalled"
+             units="percent"
+             symbol_name="L30Bank0Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12643400" />
+        <register type="NOA" address="0x00009888" value="0x12653400" />
+        <register type="NOA" address="0x00009888" value="0x106C6800" />
+        <register type="NOA" address="0x00009888" value="0x126C001E" />
+        <register type="NOA" address="0x00009888" value="0x166C0010" />
+        <register type="NOA" address="0x00009888" value="0x0C2D5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x002D4000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x102E0154" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0055" />
+        <register type="NOA" address="0x00009888" value="0x104C8000" />
+        <register type="NOA" address="0x00009888" value="0x124C8000" />
+        <register type="NOA" address="0x00009888" value="0x144C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x044C8000" />
+        <register type="NOA" address="0x00009888" value="0x064CC000" />
+        <register type="NOA" address="0x00009888" value="0x084CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x0C4EA000" />
+        <register type="NOA" address="0x00009888" value="0x0E4EA000" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x024EA000" />
+        <register type="NOA" address="0x00009888" value="0x044EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x1C4F5500" />
+        <register type="NOA" address="0x00009888" value="0x1A4F1554" />
+        <register type="NOA" address="0x00009888" value="0x0A640024" />
+        <register type="NOA" address="0x00009888" value="0x10640000" />
+        <register type="NOA" address="0x00009888" value="0x04640000" />
+        <register type="NOA" address="0x00009888" value="0x0C650024" />
+        <register type="NOA" address="0x00009888" value="0x10650000" />
+        <register type="NOA" address="0x00009888" value="0x06650000" />
+        <register type="NOA" address="0x00009888" value="0x0C6C5327" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5425" />
+        <register type="NOA" address="0x00009888" value="0x006C2A00" />
+        <register type="NOA" address="0x00009888" value="0x026C285B" />
+        <register type="NOA" address="0x00009888" value="0x046C005C" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0900" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0AA0" />
+        <register type="NOA" address="0x00009888" value="0x180F4000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F02AA" />
+        <register type="NOA" address="0x00009888" value="0x1C2C5400" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0001" />
+        <register type="NOA" address="0x00009888" value="0x1A2C5550" />
+        <register type="NOA" address="0x00009888" value="0x1993AA00" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900421" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900001" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900420" />
+        <register type="NOA" address="0x00009888" value="0x45900021" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set RasterizerAndPixelBackend"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="rasterizer_and_pixel_backend"
+       hw_config_guid="6617623e-ca73-4791-b2b7-ddedd0846a0c"
+       chipset="GLK"
+       symbol_name="RasterizerAndPixelBackend"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 Pixel Values Ready"
+             description="The percentage of time in which slice0 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values0_ready"
+             units="percent"
+             symbol_name="PixelValues0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Slice0 Rasterizer Input Available"
+             description="The percentage of time in which slice0 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_input_available"
+             units="percent"
+             symbol_name="Rasterizer0InputAvailable"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice0 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data0_ready"
+             units="percent"
+             symbol_name="PixelData0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Rasterizer Output Ready"
+             description="The percentage of time in which slice0 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_output_ready"
+             units="percent"
+             symbol_name="Rasterizer0OutputReady"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Slice0 PS Output Available"
+             description="The percentage of time in which slice0 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output0_available"
+             units="percent"
+             symbol_name="PSOutput0Available"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x102D7800" />
+        <register type="NOA" address="0x00009888" value="0x122D79E0" />
+        <register type="NOA" address="0x00009888" value="0x0C2F0004" />
+        <register type="NOA" address="0x00009888" value="0x100E3800" />
+        <register type="NOA" address="0x00009888" value="0x180F0005" />
+        <register type="NOA" address="0x00009888" value="0x002D0940" />
+        <register type="NOA" address="0x00009888" value="0x022D802F" />
+        <register type="NOA" address="0x00009888" value="0x042D4013" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0050" />
+        <register type="NOA" address="0x00009888" value="0x022F0010" />
+        <register type="NOA" address="0x00009888" value="0x002F0000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x040E0480" />
+        <register type="NOA" address="0x00009888" value="0x000E0000" />
+        <register type="NOA" address="0x00009888" value="0x060F0027" />
+        <register type="NOA" address="0x00009888" value="0x100F0000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F0040" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x439014A0" />
+        <register type="NOA" address="0x00009888" value="0x459000A4" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900001" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000EFFF" />
+        <register type="OA" address="0x00002778" value="0x00006000" />
+        <register type="OA" address="0x0000277C" value="0x0000F3FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler"
+       hw_config_guid="f3b2ea63-e82e-4234-b418-44dd20dd34d0"
+       chipset="GLK"
+       symbol_name="Sampler"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice2 Input Available"
+             description="The percentage of time in which slice0 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_input_available"
+             units="percent"
+             symbol_name="Sampler02InputAvailable"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice0 Input Available"
+             description="The percentage of time in which slice0 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_input_available"
+             units="percent"
+             symbol_name="Sampler00InputAvailable"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_output_ready"
+             units="percent"
+             symbol_name="Sampler02OutputReady"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice0 Subslice1 Input Available"
+             description="The percentage of time in which slice0 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_input_available"
+             units="percent"
+             symbol_name="Sampler01InputAvailable"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice0 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_output_ready"
+             units="percent"
+             symbol_name="Sampler00OutputReady"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice0 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_output_ready"
+             units="percent"
+             symbol_name="Sampler01OutputReady"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121300A0" />
+        <register type="NOA" address="0x00009888" value="0x141600AB" />
+        <register type="NOA" address="0x00009888" value="0x123300A0" />
+        <register type="NOA" address="0x00009888" value="0x143600AB" />
+        <register type="NOA" address="0x00009888" value="0x125300A0" />
+        <register type="NOA" address="0x00009888" value="0x145600AB" />
+        <register type="NOA" address="0x00009888" value="0x0C2D4000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x002D4000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x102E01A0" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0065" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x044C8000" />
+        <register type="NOA" address="0x00009888" value="0x064CC000" />
+        <register type="NOA" address="0x00009888" value="0x084C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x024EA000" />
+        <register type="NOA" address="0x00009888" value="0x044E2000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0800" />
+        <register type="NOA" address="0x00009888" value="0x180F4000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F023F" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0003" />
+        <register type="NOA" address="0x00009888" value="0x1A2CC030" />
+        <register type="NOA" address="0x00009888" value="0x04132180" />
+        <register type="NOA" address="0x00009888" value="0x02130000" />
+        <register type="NOA" address="0x00009888" value="0x0C148000" />
+        <register type="NOA" address="0x00009888" value="0x0E142000" />
+        <register type="NOA" address="0x00009888" value="0x04148000" />
+        <register type="NOA" address="0x00009888" value="0x1E150140" />
+        <register type="NOA" address="0x00009888" value="0x1C150040" />
+        <register type="NOA" address="0x00009888" value="0x0C163000" />
+        <register type="NOA" address="0x00009888" value="0x0E160068" />
+        <register type="NOA" address="0x00009888" value="0x10160000" />
+        <register type="NOA" address="0x00009888" value="0x18160000" />
+        <register type="NOA" address="0x00009888" value="0x0A164000" />
+        <register type="NOA" address="0x00009888" value="0x04330043" />
+        <register type="NOA" address="0x00009888" value="0x02330000" />
+        <register type="NOA" address="0x00009888" value="0x0234A000" />
+        <register type="NOA" address="0x00009888" value="0x04342000" />
+        <register type="NOA" address="0x00009888" value="0x1C350015" />
+        <register type="NOA" address="0x00009888" value="0x02363460" />
+        <register type="NOA" address="0x00009888" value="0x10360000" />
+        <register type="NOA" address="0x00009888" value="0x04360000" />
+        <register type="NOA" address="0x00009888" value="0x06360000" />
+        <register type="NOA" address="0x00009888" value="0x08364000" />
+        <register type="NOA" address="0x00009888" value="0x06530043" />
+        <register type="NOA" address="0x00009888" value="0x02530000" />
+        <register type="NOA" address="0x00009888" value="0x0E548000" />
+        <register type="NOA" address="0x00009888" value="0x00548000" />
+        <register type="NOA" address="0x00009888" value="0x06542000" />
+        <register type="NOA" address="0x00009888" value="0x1E550400" />
+        <register type="NOA" address="0x00009888" value="0x1A552000" />
+        <register type="NOA" address="0x00009888" value="0x1C550100" />
+        <register type="NOA" address="0x00009888" value="0x0E563000" />
+        <register type="NOA" address="0x00009888" value="0x00563400" />
+        <register type="NOA" address="0x00009888" value="0x10560000" />
+        <register type="NOA" address="0x00009888" value="0x18560000" />
+        <register type="NOA" address="0x00009888" value="0x02560000" />
+        <register type="NOA" address="0x00009888" value="0x0C564000" />
+        <register type="NOA" address="0x00009888" value="0x1993A800" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B9014A0" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900001" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900820" />
+        <register type="NOA" address="0x00009888" value="0x45901022" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_1"
+       hw_config_guid="14411d35-cbf6-4f5e-b68b-190faf9a1a83"
+       chipset="GLK"
+       symbol_name="TDL_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A0000" />
+        <register type="NOA" address="0x00009888" value="0x143A0000" />
+        <register type="NOA" address="0x00009888" value="0x145A0000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D4000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D5000" />
+        <register type="NOA" address="0x00009888" value="0x002D4000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x102E0150" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E006A" />
+        <register type="NOA" address="0x00009888" value="0x124C8000" />
+        <register type="NOA" address="0x00009888" value="0x144C8000" />
+        <register type="NOA" address="0x00009888" value="0x164C2000" />
+        <register type="NOA" address="0x00009888" value="0x044C8000" />
+        <register type="NOA" address="0x00009888" value="0x064C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4EA000" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x024E2000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x1C0F0BC0" />
+        <register type="NOA" address="0x00009888" value="0x180F4000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F0302" />
+        <register type="NOA" address="0x00009888" value="0x1E2C0003" />
+        <register type="NOA" address="0x00009888" value="0x1A2C00F0" />
+        <register type="NOA" address="0x00009888" value="0x021A3080" />
+        <register type="NOA" address="0x00009888" value="0x041A31E5" />
+        <register type="NOA" address="0x00009888" value="0x02148000" />
+        <register type="NOA" address="0x00009888" value="0x0414A000" />
+        <register type="NOA" address="0x00009888" value="0x1C150054" />
+        <register type="NOA" address="0x00009888" value="0x06168000" />
+        <register type="NOA" address="0x00009888" value="0x08168000" />
+        <register type="NOA" address="0x00009888" value="0x0A168000" />
+        <register type="NOA" address="0x00009888" value="0x0C3A3280" />
+        <register type="NOA" address="0x00009888" value="0x0E3A0063" />
+        <register type="NOA" address="0x00009888" value="0x063A0061" />
+        <register type="NOA" address="0x00009888" value="0x023A0000" />
+        <register type="NOA" address="0x00009888" value="0x0C348000" />
+        <register type="NOA" address="0x00009888" value="0x0E342000" />
+        <register type="NOA" address="0x00009888" value="0x06342000" />
+        <register type="NOA" address="0x00009888" value="0x1E350140" />
+        <register type="NOA" address="0x00009888" value="0x1C350100" />
+        <register type="NOA" address="0x00009888" value="0x18360028" />
+        <register type="NOA" address="0x00009888" value="0x0C368000" />
+        <register type="NOA" address="0x00009888" value="0x0E5A3080" />
+        <register type="NOA" address="0x00009888" value="0x005A3280" />
+        <register type="NOA" address="0x00009888" value="0x025A0063" />
+        <register type="NOA" address="0x00009888" value="0x0E548000" />
+        <register type="NOA" address="0x00009888" value="0x00548000" />
+        <register type="NOA" address="0x00009888" value="0x02542000" />
+        <register type="NOA" address="0x00009888" value="0x1E550400" />
+        <register type="NOA" address="0x00009888" value="0x1A552000" />
+        <register type="NOA" address="0x00009888" value="0x1C550001" />
+        <register type="NOA" address="0x00009888" value="0x18560080" />
+        <register type="NOA" address="0x00009888" value="0x02568000" />
+        <register type="NOA" address="0x00009888" value="0x04568000" />
+        <register type="NOA" address="0x00009888" value="0x1993A800" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x2D904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900420" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4D900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x45901084" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900001" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x00007FFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x00009FFF" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000EFFF" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000F3FF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FDFF" />
+        <register type="OA" address="0x00002798" value="0x00000000" />
+        <register type="OA" address="0x0000279C" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_2"
+       hw_config_guid="ffa3f263-0478-4724-8c9f-c911c5ec0f1d"
+       chipset="GLK"
+       symbol_name="TDL_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort0"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort1"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort1"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort0"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort0"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort1"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A026B" />
+        <register type="NOA" address="0x00009888" value="0x143A0173" />
+        <register type="NOA" address="0x00009888" value="0x145A026B" />
+        <register type="NOA" address="0x00009888" value="0x002D4000" />
+        <register type="NOA" address="0x00009888" value="0x022D5000" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x0C2E5000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0069" />
+        <register type="NOA" address="0x00009888" value="0x044C8000" />
+        <register type="NOA" address="0x00009888" value="0x064CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C4000" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x024EA000" />
+        <register type="NOA" address="0x00009888" value="0x064E2000" />
+        <register type="NOA" address="0x00009888" value="0x180F6000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F030A" />
+        <register type="NOA" address="0x00009888" value="0x1A2C03C0" />
+        <register type="NOA" address="0x00009888" value="0x041A37E7" />
+        <register type="NOA" address="0x00009888" value="0x021A0000" />
+        <register type="NOA" address="0x00009888" value="0x0414A000" />
+        <register type="NOA" address="0x00009888" value="0x1C150050" />
+        <register type="NOA" address="0x00009888" value="0x08168000" />
+        <register type="NOA" address="0x00009888" value="0x0A168000" />
+        <register type="NOA" address="0x00009888" value="0x003A3380" />
+        <register type="NOA" address="0x00009888" value="0x063A006F" />
+        <register type="NOA" address="0x00009888" value="0x023A0000" />
+        <register type="NOA" address="0x00009888" value="0x00348000" />
+        <register type="NOA" address="0x00009888" value="0x06342000" />
+        <register type="NOA" address="0x00009888" value="0x1A352000" />
+        <register type="NOA" address="0x00009888" value="0x1C350100" />
+        <register type="NOA" address="0x00009888" value="0x02368000" />
+        <register type="NOA" address="0x00009888" value="0x0C368000" />
+        <register type="NOA" address="0x00009888" value="0x025A37E7" />
+        <register type="NOA" address="0x00009888" value="0x0254A000" />
+        <register type="NOA" address="0x00009888" value="0x1C550005" />
+        <register type="NOA" address="0x00009888" value="0x04568000" />
+        <register type="NOA" address="0x00009888" value="0x06568000" />
+        <register type="NOA" address="0x00009888" value="0x03938000" />
+        <register type="NOA" address="0x00009888" value="0x05938000" />
+        <register type="NOA" address="0x00009888" value="0x07938000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17904000" />
+        <register type="NOA" address="0x00009888" value="0x19904000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900020" />
+        <register type="NOA" address="0x00009888" value="0x45901080" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900001" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extra Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extra"
+       hw_config_guid="15274c82-27d2-4819-876a-7cb1a2c59ba4"
+       chipset="GLK"
+       symbol_name="ComputeExtra"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active including Ext Math"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing including Extended Math processing"
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ  C 7 READ C 6 READ FADD C 5 READ FADD 8 FMUL FADD 100 FMUL $EuCoresTotalCount FDIV $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active_adjusted"
+             units="percent"
+             symbol_name="Fpu1ActiveAdjusted"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A001F" />
+        <register type="NOA" address="0x00009888" value="0x143A001F" />
+        <register type="NOA" address="0x00009888" value="0x145A001F" />
+        <register type="NOA" address="0x00009888" value="0x042D5000" />
+        <register type="NOA" address="0x00009888" value="0x062D1000" />
+        <register type="NOA" address="0x00009888" value="0x0E2E0094" />
+        <register type="NOA" address="0x00009888" value="0x084CC000" />
+        <register type="NOA" address="0x00009888" value="0x044EA000" />
+        <register type="NOA" address="0x00009888" value="0x1A0F00E0" />
+        <register type="NOA" address="0x00009888" value="0x1A2C0C00" />
+        <register type="NOA" address="0x00009888" value="0x061A0063" />
+        <register type="NOA" address="0x00009888" value="0x021A0000" />
+        <register type="NOA" address="0x00009888" value="0x06142000" />
+        <register type="NOA" address="0x00009888" value="0x1C150100" />
+        <register type="NOA" address="0x00009888" value="0x0C168000" />
+        <register type="NOA" address="0x00009888" value="0x043A3180" />
+        <register type="NOA" address="0x00009888" value="0x023A0000" />
+        <register type="NOA" address="0x00009888" value="0x04348000" />
+        <register type="NOA" address="0x00009888" value="0x1C350040" />
+        <register type="NOA" address="0x00009888" value="0x0A368000" />
+        <register type="NOA" address="0x00009888" value="0x045A0063" />
+        <register type="NOA" address="0x00009888" value="0x025A0000" />
+        <register type="NOA" address="0x00009888" value="0x04542000" />
+        <register type="NOA" address="0x00009888" value="0x1C550010" />
+        <register type="NOA" address="0x00009888" value="0x08568000" />
+        <register type="NOA" address="0x00009888" value="0x09938000" />
+        <register type="NOA" address="0x00009888" value="0x0B938000" />
+        <register type="NOA" address="0x00009888" value="0x0D938000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1D904000" />
+        <register type="NOA" address="0x00009888" value="0x1F904000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+        <register type="NOA" address="0x00009888" value="0x47900004" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00001000" />
+        <register type="FLEX" address="0x0000E558" value="0x00003002" />
+        <register type="FLEX" address="0x0000E658" value="0x00005004" />
+        <register type="FLEX" address="0x0000E758" value="0x00011010" />
+        <register type="FLEX" address="0x0000E45C" value="0x00050012" />
+        <register type="FLEX" address="0x0000E55C" value="0x00052051" />
+        <register type="FLEX" address="0x0000E65C" value="0x00000008" />
+    </register_config>
+  </set>
+
+  <set name="MDAPI testing set Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="test_oa"
+       hw_config_guid="dd3fd789-e783-4204-8cd0-b671bbccb0cf"
+       chipset="GLK"
+       symbol_name="TestOa"
+       >
+    <counter name="TestCounter7"
+             description="HW test counter 7. Factor: 0.666"
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="counter7"
+             units="events"
+             symbol_name="Counter7"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="TestCounter8"
+             description="HW test counter 8. Should be equal to 1."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="counter8"
+             units="events"
+             symbol_name="Counter8"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter4"
+             description="HW test counter 4. Factor: 0.333"
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="counter4"
+             units="events"
+             symbol_name="Counter4"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter5"
+             description="HW test counter 5. Factor: 0.333"
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="counter5"
+             units="events"
+             symbol_name="Counter5"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter6"
+             description="HW test counter 6. Factor: 0.166"
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="counter6"
+             units="events"
+             symbol_name="Counter6"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter3"
+             description="HW test counter 3. Factor: 0.5"
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="counter3"
+             units="events"
+             symbol_name="Counter3"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter0"
+             description="HW test counter 0. Factor: 0.0"
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="counter0"
+             units="events"
+             symbol_name="Counter0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter1"
+             description="HW test counter 1. Factor: 1.0"
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="counter1"
+             units="events"
+             symbol_name="Counter1"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter2"
+             description="HW test counter 2. Factor: 1.0"
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="counter2"
+             units="events"
+             symbol_name="Counter2"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x19800000" />
+        <register type="NOA" address="0x00009888" value="0x07800063" />
+        <register type="NOA" address="0x00009888" value="0x11800000" />
+        <register type="NOA" address="0x00009888" value="0x23810008" />
+        <register type="NOA" address="0x00009888" value="0x1D950400" />
+        <register type="NOA" address="0x00009888" value="0x0F922000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x00000004" />
+        <register type="OA" address="0x00002774" value="0x00000000" />
+        <register type="OA" address="0x00002778" value="0x00000003" />
+        <register type="OA" address="0x0000277C" value="0x00000000" />
+        <register type="OA" address="0x00002780" value="0x00000007" />
+        <register type="OA" address="0x00002784" value="0x00000000" />
+        <register type="OA" address="0x00002788" value="0x00100002" />
+        <register type="OA" address="0x0000278C" value="0x0000FFF7" />
+        <register type="OA" address="0x00002790" value="0x00100002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00100082" />
+        <register type="OA" address="0x0000279C" value="0x0000FFEF" />
+        <register type="OA" address="0x000027A0" value="0x001000C2" />
+        <register type="OA" address="0x000027A4" value="0x0000FFE7" />
+        <register type="OA" address="0x000027A8" value="0x00100001" />
+        <register type="OA" address="0x000027AC" value="0x0000FFE7" />
+    </register_config>
+  </set>
+
+</metrics>
diff --git a/src/mesa/drivers/dri/i965/brw_oa_hsw.xml b/src/mesa/drivers/dri/i965/brw_oa_hsw.xml
index 739ea0e..a227640 100644
--- a/src/mesa/drivers/dri/i965/brw_oa_hsw.xml
+++ b/src/mesa/drivers/dri/i965/brw_oa_hsw.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0"?>
-<metrics version="1486658299" merge_md5="">
+<metrics version="1490233727" merge_md5="">
   <set name="Render Metrics Basic Gen7.5"
        mdapi_supported_apis="OGL OCL IO BB"
        underscore_name="render_basic"
@@ -926,7 +926,7 @@
              mdapi_hw_unit_type="gpu"
              mdapi_group="3D Pipe/Output Merger"
              />
-    <register_config id="0">
+    <register_config type="NOA">
         <register type="NOA" address="0x000253A4" value="0x01600000" />
         <register type="NOA" address="0x00025440" value="0x00100000" />
         <register type="NOA" address="0x00025128" value="0x00000000" />
@@ -987,7 +987,7 @@
         <register type="NOA" address="0x0002541C" value="0x00000000" />
         <register type="NOA" address="0x00025428" value="0x00042049" />
     </register_config>
-    <register_config id="1">
+    <register_config type="OA">
         <register type="OA" address="0x00002724" value="0x00800000" /> <!--OASTARTTRIG6 -->
         <register type="OA" address="0x00002720" value="0x00000000" /> <!--OASTARTTRIG1 -->
         <register type="OA" address="0x00002714" value="0x00800000" /> <!--OASTARTTRIG1 -->
@@ -1680,7 +1680,7 @@
              mdapi_group="EU Array/Vertex Shader"
              mdapi_hw_unit_type="gpu"
              />
-    <register_config id="0">
+    <register_config type="NOA">
         <register type="NOA" address="0x000253A4" value="0x00000000" />
         <register type="NOA" address="0x0002681C" value="0x01F00800" />
         <register type="NOA" address="0x00026820" value="0x00001000" />
@@ -1714,7 +1714,7 @@
         <register type="NOA" address="0x0002541C" value="0x00000000" />
         <register type="NOA" address="0x00025428" value="0x00000C03" />
     </register_config>
-    <register_config id="1">
+    <register_config type="OA">
         <register type="OA" address="0x00002710" value="0x00000000" /> <!--OASTARTTRIG1 -->
         <register type="OA" address="0x00002714" value="0x00800000" /> <!--OASTARTTRIG1 -->
         <register type="OA" address="0x00002718" value="0xAAAAAAAA" /> <!--OASTARTTRIG1 -->
@@ -2002,7 +2002,7 @@
              mdapi_hw_unit_type="subslice"
              mdapi_group="L3/Data Port"
              />
-    <register_config id="0">
+    <register_config type="NOA">
         <register type="NOA" address="0x0002681C" value="0x3EB00800" />
         <register type="NOA" address="0x00026820" value="0x00900000" />
         <register type="NOA" address="0x00025384" value="0x02AAAAAA" />
@@ -2019,7 +2019,7 @@
         <register type="NOA" address="0x0002541C" value="0x00000000" />
         <register type="NOA" address="0x00025428" value="0x00000000" />
     </register_config>
-    <register_config id="1">
+    <register_config type="OA">
         <register type="OA" address="0x00002724" value="0xF0800000" /> <!--OASTARTTRIG6 -->
         <register type="OA" address="0x00002720" value="0x00000000" /> <!--OASTARTTRIG1 -->
         <register type="OA" address="0x00002714" value="0xF0800000" /> <!--OASTARTTRIG1 -->
@@ -2774,7 +2774,7 @@
              mdapi_group="EU Array/Vertex Shader"
              mdapi_hw_unit_type="gpu"
              />
-    <register_config id="0">
+    <register_config type="NOA">
         <register type="NOA" address="0x000253A4" value="0x34300000" />
         <register type="NOA" address="0x00025440" value="0x2D800000" />
         <register type="NOA" address="0x00025444" value="0x00000008" />
@@ -2794,7 +2794,7 @@
         <register type="NOA" address="0x0002541C" value="0x00000000" />
         <register type="NOA" address="0x00025428" value="0x00000000" />
     </register_config>
-    <register_config id="1">
+    <register_config type="OA">
         <register type="OA" address="0x00002724" value="0xF0800000" /> <!--OASTARTTRIG6 -->
         <register type="OA" address="0x00002720" value="0x00000000" /> <!--OASTARTTRIG1 -->
         <register type="OA" address="0x00002714" value="0xF0800000" /> <!--OASTARTTRIG1 -->
@@ -3544,7 +3544,7 @@
              mdapi_group="EU Array/Vertex Shader"
              mdapi_hw_unit_type="gpu"
              />
-    <register_config id="0">
+    <register_config type="NOA">
         <register type="NOA" address="0x000253A4" value="0x34300000" />
         <register type="NOA" address="0x00025440" value="0x01500000" />
         <register type="NOA" address="0x00025444" value="0x00000120" />
@@ -3564,7 +3564,7 @@
         <register type="NOA" address="0x0002541C" value="0x00000000" />
         <register type="NOA" address="0x00025428" value="0x00000000" />
     </register_config>
-    <register_config id="1">
+    <register_config type="OA">
         <register type="OA" address="0x00002724" value="0xF0800000" /> <!--OASTARTTRIG6 -->
         <register type="OA" address="0x00002720" value="0x00000000" /> <!--OASTARTTRIG1 -->
         <register type="OA" address="0x00002714" value="0xF0800000" /> <!--OASTARTTRIG1 -->
@@ -4345,7 +4345,7 @@
              mdapi_hw_unit_type="gpu"
              mdapi_group="3D Pipe/Output Merger"
              />
-    <register_config id="0">
+    <register_config type="NOA">
         <register type="NOA" address="0x0002EB9C" value="0x01906400" />
         <register type="NOA" address="0x0002FB9C" value="0x01906400" />
         <register type="NOA" address="0x000253A4" value="0x00000000" />
@@ -4387,7 +4387,7 @@
         <register type="NOA" address="0x0002541C" value="0x00000000" />
         <register type="NOA" address="0x00025428" value="0x0004A54A" />
     </register_config>
-    <register_config id="1">
+    <register_config type="OA">
         <register type="OA" address="0x00002740" value="0x00000000" /> <!--OAREPORTTRIG1 -->
         <register type="OA" address="0x00002744" value="0x00800000" /> <!--OAREPORTTRIG2 -->
         <register type="OA" address="0x00002710" value="0x00000000" /> <!--OASTARTTRIG1 -->
diff --git a/src/mesa/drivers/dri/i965/brw_oa_kblgt2.xml b/src/mesa/drivers/dri/i965/brw_oa_kblgt2.xml
new file mode 100644
index 0000000..ed68948
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_oa_kblgt2.xml
@@ -0,0 +1,10455 @@
+<?xml version="1.0"?>
+<metrics version="1491576018" merge_md5="">
+  <set name="Render Metrics Basic Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_basic"
+       hw_config_guid="f8d677e9-ff6f-4df1-9310-0334c6efacce"
+       chipset="KBLGT2"
+       symbol_name="RenderBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Misses 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Cache Misses"
+             description="The total number of sampler cache misses in all LODs in all sampler units."
+             data_type="uint64"
+             equation="B 4 READ 8 UMUL"
+             underscore_name="sampler_l1_misses"
+             units="messages"
+             symbol_name="SamplerL1Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler 1 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 1 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler1_bottleneck"
+             units="percent"
+             symbol_name="Sampler1Bottleneck"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$SamplerL1Misses 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Sampler 0 Busy"
+             description="The percentage of time in which Sampler 0 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler0_busy"
+             units="percent"
+             symbol_name="Sampler0Busy"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler 1 Busy"
+             description="The percentage of time in which Sampler 1 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler1_busy"
+             units="percent"
+             symbol_name="Sampler1Busy"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Samplers Busy"
+             description="The percentage of time in which samplers have been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="$Sampler0Busy $Sampler1Busy FMAX"
+             underscore_name="samplers_busy"
+             units="percent"
+             symbol_name="SamplersBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI Fixed Pipe Throughput"
+             description="The total number of GPU memory bytes transferred between 3D Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD 64 UMUL"
+             underscore_name="gti_vf_throughput"
+             units="bytes"
+             symbol_name="GtiVfThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/3D Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler 0 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 0 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler0_bottleneck"
+             units="percent"
+             symbol_name="Sampler0Bottleneck"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="L3 Lookup Accesses w/o IC"
+             description="The total number of L3 cache lookup accesses w/o IC."
+             data_type="uint64"
+             equation="$SamplerL1Misses $ShaderMemoryAccesses UADD"
+             underscore_name="l3_lookups"
+             units="messages"
+             symbol_name="L3Lookups"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Depth Throughput"
+             description="The total number of GPU memory bytes transferred between depth caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 0 READ C 1 READ UADD 64 UMUL"
+             underscore_name="gti_depth_throughput"
+             units="bytes"
+             symbol_name="GtiDepthThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Depth Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Samplers Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which samplers have been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="$Sampler0Bottleneck $Sampler1Bottleneck FMAX"
+             max_equation="100"
+             underscore_name="sampler_bottleneck"
+             units="percent"
+             symbol_name="SamplerBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Indicate System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI HDC TLB Lookup Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and HDC, when HDC is doing TLB lookups."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_hdc_lookups_throughput"
+             units="bytes"
+             symbol_name="GtiHdcLookupsThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI RCC Throughput"
+             description="The total number of GPU memory bytes transferred between render color caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 2 READ C 3 READ UADD 64 UMUL"
+             underscore_name="gti_rcc_throughput"
+             units="bytes"
+             symbol_name="GtiRccThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Color Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C01E0" />
+        <register type="NOA" address="0x00009888" value="0x12170280" />
+        <register type="NOA" address="0x00009888" value="0x12370280" />
+        <register type="NOA" address="0x00009888" value="0x11930317" />
+        <register type="NOA" address="0x00009888" value="0x159303DF" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x1A4E0080" />
+        <register type="NOA" address="0x00009888" value="0x0A6C0053" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A1B4000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0001" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x042F1000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8400" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F6600" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x162C2200" />
+        <register type="NOA" address="0x00009888" value="0x062D8000" />
+        <register type="NOA" address="0x00009888" value="0x082D8000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x08133000" />
+        <register type="NOA" address="0x00009888" value="0x00170020" />
+        <register type="NOA" address="0x00009888" value="0x08170021" />
+        <register type="NOA" address="0x00009888" value="0x10170000" />
+        <register type="NOA" address="0x00009888" value="0x0633C000" />
+        <register type="NOA" address="0x00009888" value="0x0833C000" />
+        <register type="NOA" address="0x00009888" value="0x06370800" />
+        <register type="NOA" address="0x00009888" value="0x08370840" />
+        <register type="NOA" address="0x00009888" value="0x10370000" />
+        <register type="NOA" address="0x00009888" value="0x0D933031" />
+        <register type="NOA" address="0x00009888" value="0x0F933E3F" />
+        <register type="NOA" address="0x00009888" value="0x01933D00" />
+        <register type="NOA" address="0x00009888" value="0x0393073C" />
+        <register type="NOA" address="0x00009888" value="0x0593000E" />
+        <register type="NOA" address="0x00009888" value="0x1D930000" />
+        <register type="NOA" address="0x00009888" value="0x19930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900158" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D908000" />
+        <register type="NOA" address="0x00009888" value="0x2F908000" />
+        <register type="NOA" address="0x00009888" value="0x31908000" />
+        <register type="NOA" address="0x00009888" value="0x15908000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190001F" />
+        <register type="NOA" address="0x00009888" value="0x51904400" />
+        <register type="NOA" address="0x00009888" value="0x41900020" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C21" />
+        <register type="NOA" address="0x00009888" value="0x47900061" />
+        <register type="NOA" address="0x00009888" value="0x57904440" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900004" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x53904444" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Basic Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_basic"
+       hw_config_guid="e17fc42a-e614-41b6-90c4-1074841a6c77"
+       chipset="KBLGT2"
+       symbol_name="ComputeBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Bytes Read"
+             description="The total number of untyped memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 6 READ B 7 READ C 0 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_read"
+             units="bytes"
+             symbol_name="UntypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Bytes Written"
+             description="The total number of typed memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 3 READ B 4 READ B 5 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_written"
+             units="bytes"
+             symbol_name="TypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Writes"
+             description="The total number of untyped memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 1 READ C 2 READ C 3 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_written"
+             units="bytes"
+             symbol_name="UntypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Typed Bytes Read"
+             description="The total number of typed memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 0 READ B 1 READ B 2 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_read"
+             units="bytes"
+             symbol_name="TypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x124F1C00" />
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E0820" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x064F0900" />
+        <register type="NOA" address="0x00009888" value="0x084F0032" />
+        <register type="NOA" address="0x00009888" value="0x0A4F1891" />
+        <register type="NOA" address="0x00009888" value="0x0C4F0E00" />
+        <register type="NOA" address="0x00009888" value="0x0E4F003C" />
+        <register type="NOA" address="0x00009888" value="0x004F0D80" />
+        <register type="NOA" address="0x00009888" value="0x024F003B" />
+        <register type="NOA" address="0x00009888" value="0x006C0002" />
+        <register type="NOA" address="0x00009888" value="0x086C0100" />
+        <register type="NOA" address="0x00009888" value="0x0C6C000C" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B00" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x081B8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E1B8000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C8000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0024" />
+        <register type="NOA" address="0x00009888" value="0x065B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5BC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B8000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C6000" />
+        <register type="NOA" address="0x00009888" value="0x1C5C001B" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0208" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5500" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2CC000" />
+        <register type="NOA" address="0x00009888" value="0x162CFB00" />
+        <register type="NOA" address="0x00009888" value="0x182C00BE" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x19900157" />
+        <register type="NOA" address="0x00009888" value="0x1B900158" />
+        <register type="NOA" address="0x00009888" value="0x1D900105" />
+        <register type="NOA" address="0x00009888" value="0x1F900103" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x11900FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900821" />
+        <register type="NOA" address="0x00009888" value="0x47900802" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900802" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900002" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900422" />
+        <register type="NOA" address="0x00009888" value="0x53904444" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Render Metrics for 3D Pipeline Profile Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_pipe_profile"
+       hw_config_guid="d7a17a3a-ca71-40d2-a919-ace80d50633f"
+       chipset="KBLGT2"
+       symbol_name="RenderPipeProfile"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which vertex shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_bottleneck"
+             units="percent"
+             symbol_name="VsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Hi-Depth Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which early hierarchical depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hi_depth_bottleneck"
+             units="percent"
+             symbol_name="HiDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which geometry shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gs_bottleneck"
+             units="percent"
+             symbol_name="GsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Geometry Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="BC Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which barycentric coordinates calculation pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="bc_bottleneck"
+             units="percent"
+             symbol_name="BcBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Barycentric Calc"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Stall"
+             description="The percentage of time in which hull shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_stall"
+             units="percent"
+             symbol_name="HsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Hull Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="VF Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which vertex fetch pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vf_bottleneck"
+             units="percent"
+             symbol_name="VfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Input Assembler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Strip-Fans Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which strip-fans pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="10"
+             equation="B 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_bottleneck"
+             units="percent"
+             symbol_name="SfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SF Stall"
+             description="The percentage of time in which strip-fans pipeline stage was stalled."
+             data_type="float"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_stall"
+             units="percent"
+             symbol_name="SfStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Bottleneck"
+             low_watermark="3"
+             description="The percentage of time in which hull shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="9"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_bottleneck"
+             units="percent"
+             symbol_name="HsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Hull Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CL Stall"
+             description="The percentage of time in which clipper pipeline stage was stalled."
+             data_type="float"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_stall"
+             units="percent"
+             symbol_name="ClStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Clipper"
+             />
+    <counter name="SO Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which stream output pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_bottleneck"
+             units="percent"
+             symbol_name="SoBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Stream Output"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="DS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which domain shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_bottleneck"
+             units="percent"
+             symbol_name="DsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Domain Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Clipper Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which clipper pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_bottleneck"
+             units="percent"
+             symbol_name="ClBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Clipper"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Stall"
+             description="The percentage of time in which domain shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_stall"
+             units="percent"
+             symbol_name="DsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Domain Shader"
+             />
+    <counter name="Early Depth Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which early depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="early_depth_bottleneck"
+             units="percent"
+             symbol_name="EarlyDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SO Stall"
+             description="The percentage of time in which stream-output pipeline stage was stalled."
+             data_type="float"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_stall"
+             units="percent"
+             symbol_name="SoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Stream Output"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x0C0E001F" />
+        <register type="NOA" address="0x00009888" value="0x0A0F0000" />
+        <register type="NOA" address="0x00009888" value="0x10116800" />
+        <register type="NOA" address="0x00009888" value="0x178A03E0" />
+        <register type="NOA" address="0x00009888" value="0x11824C00" />
+        <register type="NOA" address="0x00009888" value="0x11830020" />
+        <register type="NOA" address="0x00009888" value="0x13840020" />
+        <register type="NOA" address="0x00009888" value="0x11850019" />
+        <register type="NOA" address="0x00009888" value="0x11860007" />
+        <register type="NOA" address="0x00009888" value="0x01870C40" />
+        <register type="NOA" address="0x00009888" value="0x17880000" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0040" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x040D4000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020E5400" />
+        <register type="NOA" address="0x00009888" value="0x000E0000" />
+        <register type="NOA" address="0x00009888" value="0x080F0040" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x100F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0040" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06110012" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x01898000" />
+        <register type="NOA" address="0x00009888" value="0x0D890100" />
+        <register type="NOA" address="0x00009888" value="0x03898000" />
+        <register type="NOA" address="0x00009888" value="0x09808000" />
+        <register type="NOA" address="0x00009888" value="0x0B808000" />
+        <register type="NOA" address="0x00009888" value="0x0380C000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A0075" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0000" />
+        <register type="NOA" address="0x00009888" value="0x118A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A4000" />
+        <register type="NOA" address="0x00009888" value="0x138A8000" />
+        <register type="NOA" address="0x00009888" value="0x1D81A000" />
+        <register type="NOA" address="0x00009888" value="0x15818000" />
+        <register type="NOA" address="0x00009888" value="0x17818000" />
+        <register type="NOA" address="0x00009888" value="0x0B820030" />
+        <register type="NOA" address="0x00009888" value="0x07828000" />
+        <register type="NOA" address="0x00009888" value="0x0D824000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x05824000" />
+        <register type="NOA" address="0x00009888" value="0x0D830003" />
+        <register type="NOA" address="0x00009888" value="0x0583000C" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x03838000" />
+        <register type="NOA" address="0x00009888" value="0x07838000" />
+        <register type="NOA" address="0x00009888" value="0x0B840980" />
+        <register type="NOA" address="0x00009888" value="0x03844D80" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x09848000" />
+        <register type="NOA" address="0x00009888" value="0x09850080" />
+        <register type="NOA" address="0x00009888" value="0x03850003" />
+        <register type="NOA" address="0x00009888" value="0x01850000" />
+        <register type="NOA" address="0x00009888" value="0x07860000" />
+        <register type="NOA" address="0x00009888" value="0x0F860400" />
+        <register type="NOA" address="0x00009888" value="0x09870032" />
+        <register type="NOA" address="0x00009888" value="0x01888052" />
+        <register type="NOA" address="0x00009888" value="0x11880000" />
+        <register type="NOA" address="0x00009888" value="0x09884000" />
+        <register type="NOA" address="0x00009888" value="0x1B931001" />
+        <register type="NOA" address="0x00009888" value="0x1D930001" />
+        <register type="NOA" address="0x00009888" value="0x19934000" />
+        <register type="NOA" address="0x00009888" value="0x1B958000" />
+        <register type="NOA" address="0x00009888" value="0x1D950094" />
+        <register type="NOA" address="0x00009888" value="0x19958000" />
+        <register type="NOA" address="0x00009888" value="0x09E58000" />
+        <register type="NOA" address="0x00009888" value="0x0BE58000" />
+        <register type="NOA" address="0x00009888" value="0x03E5C000" />
+        <register type="NOA" address="0x00009888" value="0x0592C000" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D924000" />
+        <register type="NOA" address="0x00009888" value="0x0F924000" />
+        <register type="NOA" address="0x00009888" value="0x11928000" />
+        <register type="NOA" address="0x00009888" value="0x1392C000" />
+        <register type="NOA" address="0x00009888" value="0x09924000" />
+        <register type="NOA" address="0x00009888" value="0x01985000" />
+        <register type="NOA" address="0x00009888" value="0x07988000" />
+        <register type="NOA" address="0x00009888" value="0x09981000" />
+        <register type="NOA" address="0x00009888" value="0x0B982000" />
+        <register type="NOA" address="0x00009888" value="0x0D982000" />
+        <register type="NOA" address="0x00009888" value="0x0F989000" />
+        <register type="NOA" address="0x00009888" value="0x05982000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1190C080" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900440" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+        <register type="NOA" address="0x00009888" value="0x47900C21" />
+        <register type="NOA" address="0x00009888" value="0x57900400" />
+        <register type="NOA" address="0x00009888" value="0x49900042" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900024" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900841" />
+        <register type="NOA" address="0x00009888" value="0x53900400" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFEA" />
+        <register type="OA" address="0x00002774" value="0x00007FFC" />
+        <register type="OA" address="0x00002778" value="0x0007AFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000F5FD" />
+        <register type="OA" address="0x00002780" value="0x00079FFA" />
+        <register type="OA" address="0x00002784" value="0x0000F3FB" />
+        <register type="OA" address="0x00002788" value="0x0007BF7A" />
+        <register type="OA" address="0x0000278C" value="0x0000F7E7" />
+        <register type="OA" address="0x00002790" value="0x0007FEFA" />
+        <register type="OA" address="0x00002794" value="0x0000F7CF" />
+        <register type="OA" address="0x00002798" value="0x00077FFA" />
+        <register type="OA" address="0x0000279C" value="0x0000EFDF" />
+        <register type="OA" address="0x000027A0" value="0x0006FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000CFBF" />
+        <register type="OA" address="0x000027A8" value="0x0003FFFA" />
+        <register type="OA" address="0x000027AC" value="0x00005F7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Reads Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_reads"
+       hw_config_guid="57b59202-172b-477a-87de-33f85572c589"
+       chipset="KBLGT2"
+       symbol_name="MemoryReads"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank0Reads"
+             description="The total number of GTI memory reads from L3 Bank 0 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_reads"
+             units="messages"
+             symbol_name="GtiL3Bank0Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all accesses from GTI to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiL3Bank3Reads"
+             description="The total number of GTI memory reads from L3 Bank 3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_reads"
+             units="messages"
+             symbol_name="GtiL3Bank3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiRsMemoryReads"
+             description="The total number of GTI memory reads from Resource Streamer."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_rs_memory_reads"
+             units="messages"
+             symbol_name="GtiRsMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Resource Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiHizMemoryReads"
+             description="The total number of GTI memory reads from Hierarchical Depth Cache (Hi-Depth Cache misses)."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_reads"
+             units="messages"
+             symbol_name="GtiHizMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="GtiRccMemoryReads"
+             description="The total number of GTI memory reads from Render Color Cache (Render Color Cache misses)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_reads"
+             units="messages"
+             symbol_name="GtiRccMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank1Reads"
+             description="The total number of GTI memory reads from L3 Bank 1 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_reads"
+             units="messages"
+             symbol_name="GtiL3Bank1Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiCmdStreamerMemoryReads"
+             description="The total number of GTI memory reads from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_reads"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="GtiL3Bank2Reads"
+             description="The total number of GTI memory reads from L3 Bank 2 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_reads"
+             units="messages"
+             symbol_name="GtiL3Bank2Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiMemoryReads"
+             description="The total number of GTI memory reads."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_reads"
+             units="messages"
+             symbol_name="GtiMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GtiRczMemoryReads"
+             description="The total number of GTI memory reads from Render Depth Cache (Render Depth Cache misses)."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_reads"
+             units="messages"
+             symbol_name="GtiRczMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="GtiMscMemoryReads"
+             description="The total number of GTI memory reads from Multisampling Color Cache (Multisampling Color Cache misses)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_reads"
+             units="messages"
+             symbol_name="GtiMscMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiVfMemoryReads"
+             description="The total number of GTI memory reads from Vertex Fetch."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="gti_vf_memory_reads"
+             units="messages"
+             symbol_name="GtiVfMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Vertex Fetch"
+             />
+    <counter name="GtiStcMemoryReads"
+             description="The total number of GTI memory reads from Stencil Cache (Stencil Cache misses)."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_reads"
+             units="messages"
+             symbol_name="GtiStcMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Reads"
+             description="The total number of GTI memory reads from L3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Reads $GtiL3Bank1Reads $GtiL3Bank2Reads $GtiL3Bank3Reads UADD UADD UADD"
+             underscore_name="gti_l3_reads"
+             units="messages"
+             symbol_name="GtiL3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900064" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900150" />
+        <register type="NOA" address="0x00009888" value="0x21900151" />
+        <register type="NOA" address="0x00009888" value="0x23900152" />
+        <register type="NOA" address="0x00009888" value="0x25900153" />
+        <register type="NOA" address="0x00009888" value="0x27900154" />
+        <register type="NOA" address="0x00009888" value="0x29900155" />
+        <register type="NOA" address="0x00009888" value="0x2B900156" />
+        <register type="NOA" address="0x00009888" value="0x2D900157" />
+        <register type="NOA" address="0x00009888" value="0x2F90015F" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F872" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Writes Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_writes"
+       hw_config_guid="3addf8ef-8e9b-40f5-a448-3dbb5d5128b0"
+       chipset="KBLGT2"
+       symbol_name="MemoryWrites"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiMemoryWrites"
+             description="The total number of GTI memory writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_writes"
+             units="messages"
+             symbol_name="GtiMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all GTI accesses to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiMscMemoryWrites"
+             description="The total number of GTI memory writes from Multisampling Color Cache (Multisampling Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_writes"
+             units="messages"
+             symbol_name="GtiMscMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiCmdStreamerMemoryWrites"
+             description="The total number of GTI memory writes from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_writes"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiL3Bank0Writes"
+             description="The total number of GTI memory writes from L3 Bank 0 (L3 Bank 0 invalidations)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_writes"
+             units="messages"
+             symbol_name="GtiL3Bank0Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank1Writes"
+             description="The total number of GTI memory writes from L3 Bank 1 (L3 Bank 1 invalidations)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_writes"
+             units="messages"
+             symbol_name="GtiL3Bank1Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank2Writes"
+             description="The total number of GTI memory writes from L3 Bank 2 (L3 Bank 2 invalidations)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_writes"
+             units="messages"
+             symbol_name="GtiL3Bank2Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank3Writes"
+             description="The total number of GTI memory writes from L3 Bank 3 (L3 Bank 3 invalidations)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_writes"
+             units="messages"
+             symbol_name="GtiL3Bank3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Writes"
+             description="The total number of GTI memory writes from L3 (L3 invalidations)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Writes $GtiL3Bank1Writes $GtiL3Bank2Writes $GtiL3Bank3Writes UADD UADD UADD"
+             underscore_name="gti_l3_writes"
+             units="messages"
+             symbol_name="GtiL3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiRccMemoryWrites"
+             description="The total number of GTI memory writes from Render Color Cache (Render Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_writes"
+             units="messages"
+             symbol_name="GtiRccMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiSoMemoryWrites"
+             description="The total number of GTI memory writes from Stream Output."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_so_memory_writes"
+             units="messages"
+             symbol_name="GtiSoMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Stream Output"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiStcMemoryWrites"
+             description="The total number of GTI memory writes from Stencil Cache."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_writes"
+             units="messages"
+             symbol_name="GtiStcMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GtiRczMemoryWrites"
+             description="The total number of GTI memory writes from Render Depth Cache."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_writes"
+             units="messages"
+             symbol_name="GtiRczMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="GtiHizMemoryWrites"
+             description="The total number of GTI memory writes from Hierarchical Depth Cache."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_writes"
+             units="messages"
+             symbol_name="GtiHizMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F901000" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900160" />
+        <register type="NOA" address="0x00009888" value="0x21900161" />
+        <register type="NOA" address="0x00009888" value="0x23900162" />
+        <register type="NOA" address="0x00009888" value="0x25900163" />
+        <register type="NOA" address="0x00009888" value="0x27900164" />
+        <register type="NOA" address="0x00009888" value="0x29900165" />
+        <register type="NOA" address="0x00009888" value="0x2B900166" />
+        <register type="NOA" address="0x00009888" value="0x2D900167" />
+        <register type="NOA" address="0x00009888" value="0x2F900150" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F822" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extended Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extended"
+       hw_config_guid="4af0400a-81c3-47db-a6b6-deddbd75680e"
+       chipset="KBLGT2"
+       symbol_name="ComputeExtended"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Writes 0"
+             description="The subslice 0 typed writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="typed_writes0"
+             units="messages"
+             symbol_name="TypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuTypedAtomics0"
+             description="The subslice 0 EU typed atomics."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_typed_atomics0"
+             units="messages"
+             symbol_name="EuTypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Atomics 0"
+             description="The subslice 0 typed atomics."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="typed_atomics0"
+             units="messages"
+             symbol_name="TypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedAtomicsPerCacheLine"
+             description="The ratio of EU typed atomics requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedAtomics0 $TypedAtomics0 FDIV"
+             underscore_name="typed_atomics_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedAtomicsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedReads0"
+             description="The subslice 0 EU untyped reads."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="eu_untyped_reads0"
+             units="messages"
+             symbol_name="EuUntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Writes 0"
+             description="The subslice 0 untyped writes (including SLM writes)."
+             data_type="uint64"
+             equation="C 1 READ"
+             underscore_name="untyped_writes0"
+             units="messages"
+             symbol_name="UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedAtomics0"
+             description="The subslice 0 EU untyped atomics."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="eu_untyped_atomics0"
+             units="messages"
+             symbol_name="EuUntypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedWrites0"
+             description="The subslice 0 EU untyped writes."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="eu_untyped_writes0"
+             units="messages"
+             symbol_name="EuUntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedWrites0"
+             description="The subslice 0 EU A64 untyped writes."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="eu_a64_untyped_writes0"
+             units="messages"
+             symbol_name="EuA64UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedWritesPerCacheLine"
+             description="The ratio of EU untyped write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuUntypedWrites0 $EuA64UntypedWrites0 UADD $UntypedWrites0 FDIV"
+             underscore_name="untyped_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="The average GPU core frequency during the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedWrites0"
+             description="The subslice 0 EU typed writes."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="eu_typed_writes0"
+             units="messages"
+             symbol_name="EuTypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedWritesPerCacheLine"
+             description="The ratio of EU typed write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedWrites0 $TypedWrites0 FDIV"
+             underscore_name="typed_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Reads 0"
+             description="The subslice 0 typed reads."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="typed_reads0"
+             units="messages"
+             symbol_name="TypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Reads 0"
+             description="The subslice 0 untyped reads (including SLM reads)."
+             data_type="uint64"
+             equation="C 3 READ"
+             underscore_name="untyped_reads0"
+             units="messages"
+             symbol_name="UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedReads0"
+             description="The subslice 0 EU A64 untyped reads."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="eu_a64_untyped_reads0"
+             units="messages"
+             symbol_name="EuA64UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedReads0"
+             description="The subslice 0 EU typed reads."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="eu_typed_reads0"
+             units="messages"
+             symbol_name="EuTypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedReadsPerCacheLine"
+             description="The ratio of EU untyped read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuUntypedReads0 $EuA64UntypedReads0 UADD $UntypedReads0 FDIV"
+             underscore_name="untyped_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedReadsPerCacheLine"
+             description="The ratio of EU typed read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuTypedReads0 $TypedReads0 FDIV"
+             underscore_name="typed_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x141C8160" />
+        <register type="NOA" address="0x00009888" value="0x161C8015" />
+        <register type="NOA" address="0x00009888" value="0x181C0120" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4EAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B01" />
+        <register type="NOA" address="0x00009888" value="0x006C0200" />
+        <register type="NOA" address="0x00009888" value="0x026C000C" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x001C0041" />
+        <register type="NOA" address="0x00009888" value="0x061C4200" />
+        <register type="NOA" address="0x00009888" value="0x081C4443" />
+        <register type="NOA" address="0x00009888" value="0x0A1C4645" />
+        <register type="NOA" address="0x00009888" value="0x0C1C7647" />
+        <register type="NOA" address="0x00009888" value="0x041C7357" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x101C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0000" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4CAA2A" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02AA" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5515" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CAA00" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x11907FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900040" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900802" />
+        <register type="NOA" address="0x00009888" value="0x47900842" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900842" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900800" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FC2A" />
+        <register type="OA" address="0x00002774" value="0x0000BF00" />
+        <register type="OA" address="0x00002778" value="0x0007FC6A" />
+        <register type="OA" address="0x0000277C" value="0x0000BF00" />
+        <register type="OA" address="0x00002780" value="0x0007FC92" />
+        <register type="OA" address="0x00002784" value="0x0000BF00" />
+        <register type="OA" address="0x00002788" value="0x0007FCA2" />
+        <register type="OA" address="0x0000278C" value="0x0000BF00" />
+        <register type="OA" address="0x00002790" value="0x0007FC32" />
+        <register type="OA" address="0x00002794" value="0x0000BF00" />
+        <register type="OA" address="0x00002798" value="0x0007FC9A" />
+        <register type="OA" address="0x0000279C" value="0x0000BF00" />
+        <register type="OA" address="0x000027A0" value="0x0007FE6A" />
+        <register type="OA" address="0x000027A4" value="0x0000BF00" />
+        <register type="OA" address="0x000027A8" value="0x0007FE7A" />
+        <register type="OA" address="0x000027AC" value="0x0000BF00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics L3 Cache Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_l3_cache"
+       hw_config_guid="0e22f995-79ca-4f67-83ab-e9d9772488d8"
+       chipset="KBLGT2"
+       symbol_name="ComputeL3Cache"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 03 Accesses"
+             description="The total number of accesses to L3 Bank 03."
+             data_type="uint64"
+             equation="B 3 READ 2 UMUL"
+             underscore_name="l3_bank03_accesses"
+             units="messages"
+             symbol_name="L3Bank03Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Accesses"
+             description="The total number of L3 accesses from all entities."
+             data_type="uint64"
+             equation="C 0 READ C 1 READ B 2 READ B 3 READ UADD UADD UADD 2 UMUL"
+             underscore_name="l3_accesses"
+             units="messages"
+             symbol_name="L3Accesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 29 READ 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="EU FPU0 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu0_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ C 5 READ UADD"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="L3 Bank 00 Accesses"
+             description="The total number of accesses to L3 Bank 00."
+             data_type="uint64"
+             equation="C 0 READ 2 UMUL"
+             underscore_name="l3_bank00_accesses"
+             units="messages"
+             symbol_name="L3Bank00Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU0 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 19 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu0_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 14 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu1_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ C 5 READ UADD 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 00 IC Accesses"
+             description="The total number of accesses to L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 0 READ B 1 READ UADD 2 UMUL $L3Bank00Accesses UMIN"
+             underscore_name="l3_bank00_ic_accesses"
+             units="messages"
+             symbol_name="L3Bank00IcAccesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 00 IC Hits"
+             description="The total number of hits in L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 1 READ 2 UMUL $L3Bank00IcAccesses UMIN"
+             underscore_name="l3_bank00_ic_hits"
+             units="messages"
+             symbol_name="L3Bank00IcHits"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="Sampler Accesses"
+             description="The total number of messages send to samplers."
+             data_type="uint64"
+             equation="A 28 READ"
+             underscore_name="sampler_accesses"
+             units="messages"
+             symbol_name="SamplerAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler"
+             />
+    <counter name="L3 Bank 01 Accesses"
+             description="The total number of accesses to L3 Bank 01."
+             data_type="uint64"
+             equation="C 1 READ 2 UMUL"
+             underscore_name="l3_bank01_accesses"
+             units="messages"
+             symbol_name="L3Bank01Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 20 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu1_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="L3 Bank 02 Accesses"
+             description="The total number of accesses to L3 Bank 02."
+             data_type="uint64"
+             equation="B 2 READ 2 UMUL"
+             underscore_name="l3_bank02_accesses"
+             units="messages"
+             symbol_name="L3Bank02Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="L3 Total Throughput"
+             description="The total number of GPU memory bytes transferred via L3."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Accesses 64 UMUL"
+             underscore_name="l3_total_throughput"
+             units="bytes"
+             symbol_name="L3TotalThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="EU FPU1 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C0760" />
+        <register type="NOA" address="0x00009888" value="0x1593001E" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E8020" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x006C0051" />
+        <register type="NOA" address="0x00009888" value="0x066C5000" />
+        <register type="NOA" address="0x00009888" value="0x086C5C5D" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5E5F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x061B8000" />
+        <register type="NOA" address="0x00009888" value="0x081BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1CE000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2A00" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0280" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F1500" />
+        <register type="NOA" address="0x00009888" value="0x100F0140" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162C0A00" />
+        <register type="NOA" address="0x00009888" value="0x182C00A0" />
+        <register type="NOA" address="0x00009888" value="0x03933300" />
+        <register type="NOA" address="0x00009888" value="0x05930032" />
+        <register type="NOA" address="0x00009888" value="0x11930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900158" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190030F" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900021" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x53904444" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFFA" />
+        <register type="OA" address="0x00002774" value="0x0000FEFE" />
+        <register type="OA" address="0x00002778" value="0x0007FFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000FEFD" />
+        <register type="OA" address="0x00002790" value="0x0007FFFA" />
+        <register type="OA" address="0x00002794" value="0x0000FBEF" />
+        <register type="OA" address="0x00002798" value="0x0007FFFA" />
+        <register type="OA" address="0x0000279C" value="0x0000FBDF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00101100" />
+        <register type="FLEX" address="0x0000E45C" value="0x00201200" />
+        <register type="FLEX" address="0x0000E55C" value="0x00301300" />
+        <register type="FLEX" address="0x0000E65C" value="0x00401400" />
+    </register_config>
+  </set>
+
+  <set name="Metric set HDCAndSF"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="hdc_and_sf"
+       hw_config_guid="bc2a00f7-cb8a-4ff2-8ad0-e241dad16937"
+       chipset="KBLGT2"
+       symbol_name="HDCAndSF"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Polygon Data Ready"
+             description="The percentage of time in which geometry pipeline output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="poly_data_ready"
+             units="percent"
+             symbol_name="PolyDataReady"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="HDC stalled by L3 (s0.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ C 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader01_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader01AccessStalledOnL3"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss2)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss2)"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ C 2 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader02_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader02AccessStalledOnL3"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ C 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader00_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader00AccessStalledOnL3"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F0232" />
+        <register type="NOA" address="0x00009888" value="0x124F4640" />
+        <register type="NOA" address="0x00009888" value="0x106C0232" />
+        <register type="NOA" address="0x00009888" value="0x11834400" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x004F1880" />
+        <register type="NOA" address="0x00009888" value="0x024F08BB" />
+        <register type="NOA" address="0x00009888" value="0x044F001B" />
+        <register type="NOA" address="0x00009888" value="0x046C0100" />
+        <register type="NOA" address="0x00009888" value="0x066C000B" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x041B8000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025BC000" />
+        <register type="NOA" address="0x00009888" value="0x045B4000" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x165C8000" />
+        <register type="NOA" address="0x00009888" value="0x185C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00A0" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x062CC000" />
+        <register type="NOA" address="0x00009888" value="0x082CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x0F8305C0" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x07830000" />
+        <register type="NOA" address="0x00009888" value="0x1D950080" />
+        <register type="NOA" address="0x00009888" value="0x13928000" />
+        <register type="NOA" address="0x00009888" value="0x0F988000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900040" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x10800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_1"
+       hw_config_guid="d2bbe790-f058-42d9-81c6-cdedcf655bc2"
+       chipset="KBLGT2"
+       symbol_name="L3_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank1 Active"
+             description="The percentage of time in which slice0 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_active"
+             units="percent"
+             symbol_name="L30Bank1Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 L3 Bank1 Stalled"
+             description="The percentage of time in which slice0 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_stalled"
+             units="percent"
+             symbol_name="L30Bank1Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Active"
+             description="The percentage of time in which slice0 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_active"
+             units="percent"
+             symbol_name="L30Bank0Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Stalled"
+             description="The percentage of time in which slice0 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_stalled"
+             units="percent"
+             symbol_name="L30Bank0Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C7B40" />
+        <register type="NOA" address="0x00009888" value="0x166C0020" />
+        <register type="NOA" address="0x00009888" value="0x0A603444" />
+        <register type="NOA" address="0x00009888" value="0x0A613400" />
+        <register type="NOA" address="0x00009888" value="0x1A4EA800" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x0C6C5327" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5425" />
+        <register type="NOA" address="0x00009888" value="0x006C2A00" />
+        <register type="NOA" address="0x00009888" value="0x026C285B" />
+        <register type="NOA" address="0x00009888" value="0x046C005C" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0800" />
+        <register type="NOA" address="0x00009888" value="0x0C1BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C003C" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x10600000" />
+        <register type="NOA" address="0x00009888" value="0x04600000" />
+        <register type="NOA" address="0x00009888" value="0x0C610044" />
+        <register type="NOA" address="0x00009888" value="0x10610000" />
+        <register type="NOA" address="0x00009888" value="0x06610000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02A8" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0154" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190FFC0" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900420" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900021" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900400" />
+        <register type="NOA" address="0x00009888" value="0x43900421" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_2"
+       hw_config_guid="2f8e32e4-5956-46e2-af31-c8ea95887332"
+       chipset="KBLGT2"
+       symbol_name="L3_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Stalled"
+             description="The percentage of time in which slice0 L3 bank2 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_stalled"
+             units="percent"
+             symbol_name="L30Bank2Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Active"
+             description="The percentage of time in which slice0 L3 bank2 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_active"
+             units="percent"
+             symbol_name="L30Bank2Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C02E0" />
+        <register type="NOA" address="0x00009888" value="0x146C0001" />
+        <register type="NOA" address="0x00009888" value="0x0A623400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x026C3324" />
+        <register type="NOA" address="0x00009888" value="0x046C3422" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x06614000" />
+        <register type="NOA" address="0x00009888" value="0x0C620044" />
+        <register type="NOA" address="0x00009888" value="0x10620000" />
+        <register type="NOA" address="0x00009888" value="0x06620000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_3"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_3"
+       hw_config_guid="ca046aad-b5fb-4101-adce-6473ee6e5b14"
+       chipset="KBLGT2"
+       symbol_name="L3_3"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 L3 Bank3 Stalled"
+             description="The percentage of time in which slice0 L3 bank3 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_stalled"
+             units="percent"
+             symbol_name="L30Bank3Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank3 Active"
+             description="The percentage of time in which slice0 L3 bank3 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_active"
+             units="percent"
+             symbol_name="L30Bank3Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C4E80" />
+        <register type="NOA" address="0x00009888" value="0x146C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A633400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x026C3321" />
+        <register type="NOA" address="0x00009888" value="0x046C342F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C2000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x06604000" />
+        <register type="NOA" address="0x00009888" value="0x0C630044" />
+        <register type="NOA" address="0x00009888" value="0x10630000" />
+        <register type="NOA" address="0x00009888" value="0x06630000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00AA" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900002" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set RasterizerAndPixelBackend"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="rasterizer_and_pixel_backend"
+       hw_config_guid="605f388f-24bb-455c-88e3-8d57ae0d7e9f"
+       chipset="KBLGT2"
+       symbol_name="RasterizerAndPixelBackend"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 Pixel Values Ready"
+             description="The percentage of time in which slice0 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values0_ready"
+             units="percent"
+             symbol_name="PixelValues0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Slice0 Rasterizer Input Available"
+             description="The percentage of time in which slice0 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_input_available"
+             units="percent"
+             symbol_name="Rasterizer0InputAvailable"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice0 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data0_ready"
+             units="percent"
+             symbol_name="PixelData0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Rasterizer Output Ready"
+             description="The percentage of time in which slice0 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_output_ready"
+             units="percent"
+             symbol_name="Rasterizer0OutputReady"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Slice0 PS Output Available"
+             description="The percentage of time in which slice0 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output0_available"
+             units="percent"
+             symbol_name="PSOutput0Available"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x102F3800" />
+        <register type="NOA" address="0x00009888" value="0x144D0500" />
+        <register type="NOA" address="0x00009888" value="0x120D03C0" />
+        <register type="NOA" address="0x00009888" value="0x140D03CF" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0004" />
+        <register type="NOA" address="0x00009888" value="0x0C4E4000" />
+        <register type="NOA" address="0x00009888" value="0x042F0480" />
+        <register type="NOA" address="0x00009888" value="0x082F0000" />
+        <register type="NOA" address="0x00009888" value="0x022F0000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0090" />
+        <register type="NOA" address="0x00009888" value="0x064D0027" />
+        <register type="NOA" address="0x00009888" value="0x004D0000" />
+        <register type="NOA" address="0x00009888" value="0x000D0D40" />
+        <register type="NOA" address="0x00009888" value="0x020D803F" />
+        <register type="NOA" address="0x00009888" value="0x040D8023" />
+        <register type="NOA" address="0x00009888" value="0x100D0000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020F0010" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0050" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41901400" />
+        <register type="NOA" address="0x00009888" value="0x43901485" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900001" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000EFFF" />
+        <register type="OA" address="0x00002778" value="0x00006000" />
+        <register type="OA" address="0x0000277C" value="0x0000F3FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler"
+       hw_config_guid="31dd157c-bf4e-4bab-bf2b-f5c8174af1af"
+       chipset="KBLGT2"
+       symbol_name="Sampler"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice2 Input Available"
+             description="The percentage of time in which slice0 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_input_available"
+             units="percent"
+             symbol_name="Sampler02InputAvailable"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice0 Input Available"
+             description="The percentage of time in which slice0 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_input_available"
+             units="percent"
+             symbol_name="Sampler00InputAvailable"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_output_ready"
+             units="percent"
+             symbol_name="Sampler02OutputReady"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice0 Subslice1 Input Available"
+             description="The percentage of time in which slice0 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_input_available"
+             units="percent"
+             symbol_name="Sampler01InputAvailable"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice0 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_output_ready"
+             units="percent"
+             symbol_name="Sampler00OutputReady"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice0 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_output_ready"
+             units="percent"
+             symbol_name="Sampler01OutputReady"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x14152C00" />
+        <register type="NOA" address="0x00009888" value="0x16150005" />
+        <register type="NOA" address="0x00009888" value="0x121600A0" />
+        <register type="NOA" address="0x00009888" value="0x14352C00" />
+        <register type="NOA" address="0x00009888" value="0x16350005" />
+        <register type="NOA" address="0x00009888" value="0x123600A0" />
+        <register type="NOA" address="0x00009888" value="0x14552C00" />
+        <register type="NOA" address="0x00009888" value="0x16550005" />
+        <register type="NOA" address="0x00009888" value="0x125600A0" />
+        <register type="NOA" address="0x00009888" value="0x062F6000" />
+        <register type="NOA" address="0x00009888" value="0x022F2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0050" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0010" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0350" />
+        <register type="NOA" address="0x00009888" value="0x0C0FB000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F00DA" />
+        <register type="NOA" address="0x00009888" value="0x182C0028" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x022DC000" />
+        <register type="NOA" address="0x00009888" value="0x042D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C138000" />
+        <register type="NOA" address="0x00009888" value="0x0E132000" />
+        <register type="NOA" address="0x00009888" value="0x0413C000" />
+        <register type="NOA" address="0x00009888" value="0x1C140018" />
+        <register type="NOA" address="0x00009888" value="0x0C157000" />
+        <register type="NOA" address="0x00009888" value="0x0E150078" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x04162180" />
+        <register type="NOA" address="0x00009888" value="0x02160000" />
+        <register type="NOA" address="0x00009888" value="0x04174000" />
+        <register type="NOA" address="0x00009888" value="0x0233A000" />
+        <register type="NOA" address="0x00009888" value="0x04333000" />
+        <register type="NOA" address="0x00009888" value="0x14348000" />
+        <register type="NOA" address="0x00009888" value="0x16348000" />
+        <register type="NOA" address="0x00009888" value="0x02357870" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x04360043" />
+        <register type="NOA" address="0x00009888" value="0x02360000" />
+        <register type="NOA" address="0x00009888" value="0x04371000" />
+        <register type="NOA" address="0x00009888" value="0x0E538000" />
+        <register type="NOA" address="0x00009888" value="0x00538000" />
+        <register type="NOA" address="0x00009888" value="0x06533000" />
+        <register type="NOA" address="0x00009888" value="0x1C540020" />
+        <register type="NOA" address="0x00009888" value="0x12548000" />
+        <register type="NOA" address="0x00009888" value="0x0E557000" />
+        <register type="NOA" address="0x00009888" value="0x00557800" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x06560043" />
+        <register type="NOA" address="0x00009888" value="0x02560000" />
+        <register type="NOA" address="0x00009888" value="0x06571000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900060" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900060" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_1"
+       hw_config_guid="105db928-5542-466b-9128-e1f3c91426cb"
+       chipset="KBLGT2"
+       symbol_name="TDL_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12120000" />
+        <register type="NOA" address="0x00009888" value="0x12320000" />
+        <register type="NOA" address="0x00009888" value="0x12520000" />
+        <register type="NOA" address="0x00009888" value="0x002F8000" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0015" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F03A0" />
+        <register type="NOA" address="0x00009888" value="0x0C0FF000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0095" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D4000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x02108000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x02118000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x02121880" />
+        <register type="NOA" address="0x00009888" value="0x041219B5" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x02134000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x0C308000" />
+        <register type="NOA" address="0x00009888" value="0x0E304000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x0C318000" />
+        <register type="NOA" address="0x00009888" value="0x0E314000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x0C321A80" />
+        <register type="NOA" address="0x00009888" value="0x0E320033" />
+        <register type="NOA" address="0x00009888" value="0x06320031" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x0C334000" />
+        <register type="NOA" address="0x00009888" value="0x0E331000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0E508000" />
+        <register type="NOA" address="0x00009888" value="0x00508000" />
+        <register type="NOA" address="0x00009888" value="0x02504000" />
+        <register type="NOA" address="0x00009888" value="0x0E518000" />
+        <register type="NOA" address="0x00009888" value="0x00518000" />
+        <register type="NOA" address="0x00009888" value="0x02514000" />
+        <register type="NOA" address="0x00009888" value="0x0E521880" />
+        <register type="NOA" address="0x00009888" value="0x00521A80" />
+        <register type="NOA" address="0x00009888" value="0x02520033" />
+        <register type="NOA" address="0x00009888" value="0x0E534000" />
+        <register type="NOA" address="0x00009888" value="0x00534000" />
+        <register type="NOA" address="0x00009888" value="0x02531000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900062" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x00007FFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x00009FFF" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000EFFF" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000F3FF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FDFF" />
+        <register type="OA" address="0x00002798" value="0x00000000" />
+        <register type="OA" address="0x0000279C" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_2"
+       hw_config_guid="03db94d2-b37f-4c58-a791-0d2067b013bb"
+       chipset="KBLGT2"
+       symbol_name="TDL_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort0"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort1"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort1"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort0"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort0"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort1"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12124D60" />
+        <register type="NOA" address="0x00009888" value="0x12322E60" />
+        <register type="NOA" address="0x00009888" value="0x12524D60" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0014" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0FE000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0097" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x002D8000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x04121FB7" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x00308000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x00318000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x00321B80" />
+        <register type="NOA" address="0x00009888" value="0x0632003F" />
+        <register type="NOA" address="0x00009888" value="0x00334000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0250C000" />
+        <register type="NOA" address="0x00009888" value="0x0251C000" />
+        <register type="NOA" address="0x00009888" value="0x02521FB7" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x02535000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900063" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extra Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extra"
+       hw_config_guid="aa7a3fb9-22fb-43ff-a32d-0ab6c13bbd16"
+       chipset="KBLGT2"
+       symbol_name="ComputeExtra"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active including Ext Math"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing including Extended Math processing"
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ  C 7 READ C 6 READ FADD C 5 READ FADD 8 FMUL FADD 100 FMUL $EuCoresTotalCount FDIV $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active_adjusted"
+             units="percent"
+             symbol_name="Fpu1ActiveAdjusted"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121203E0" />
+        <register type="NOA" address="0x00009888" value="0x123203E0" />
+        <register type="NOA" address="0x00009888" value="0x125203E0" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0040" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F006C" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x042D8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06114000" />
+        <register type="NOA" address="0x00009888" value="0x06120033" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x04308000" />
+        <register type="NOA" address="0x00009888" value="0x04318000" />
+        <register type="NOA" address="0x00009888" value="0x04321980" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x04334000" />
+        <register type="NOA" address="0x00009888" value="0x04504000" />
+        <register type="NOA" address="0x00009888" value="0x04514000" />
+        <register type="NOA" address="0x00009888" value="0x04520033" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x04531000" />
+        <register type="NOA" address="0x00009888" value="0x1190E000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900C00" />
+        <register type="NOA" address="0x00009888" value="0x45900002" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00001000" />
+        <register type="FLEX" address="0x0000E558" value="0x00003002" />
+        <register type="FLEX" address="0x0000E658" value="0x00005004" />
+        <register type="FLEX" address="0x0000E758" value="0x00011010" />
+        <register type="FLEX" address="0x0000E45C" value="0x00050012" />
+        <register type="FLEX" address="0x0000E55C" value="0x00052051" />
+        <register type="FLEX" address="0x0000E65C" value="0x00000008" />
+    </register_config>
+  </set>
+
+  <set name="Media Vme Pipe Gen9"
+       mdapi_supported_apis="MEDIA IO BB"
+       underscore_name="vme_pipe"
+       hw_config_guid="398a4268-ef6f-4ffc-b55f-3c7b5363ce61"
+       chipset="KBLGT2"
+       symbol_name="VMEPipe"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL  GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="8 A 10 READ FMUL $EuThreadsCount FDIV $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VME Busy"
+             description="The percentage of time in which VME (IME or CRE) was actively processing data."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ B 3 READ FADD 2 FDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vme_busy"
+             units="percent"
+             symbol_name="VMEBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Overview System Batch Tier2"
+             mdapi_group="VME Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A5800" />
+        <register type="NOA" address="0x00009888" value="0x161A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12180240" />
+        <register type="NOA" address="0x00009888" value="0x14180002" />
+        <register type="NOA" address="0x00009888" value="0x143A5800" />
+        <register type="NOA" address="0x00009888" value="0x163A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12380240" />
+        <register type="NOA" address="0x00009888" value="0x14380002" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x022F8000" />
+        <register type="NOA" address="0x00009888" value="0x042F3000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C1500" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F9500" />
+        <register type="NOA" address="0x00009888" value="0x100F002A" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162C0A00" />
+        <register type="NOA" address="0x00009888" value="0x0A2DC000" />
+        <register type="NOA" address="0x00009888" value="0x0C2DC000" />
+        <register type="NOA" address="0x00009888" value="0x04193000" />
+        <register type="NOA" address="0x00009888" value="0x081A28C1" />
+        <register type="NOA" address="0x00009888" value="0x001A0000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x0613C000" />
+        <register type="NOA" address="0x00009888" value="0x0813F000" />
+        <register type="NOA" address="0x00009888" value="0x00172000" />
+        <register type="NOA" address="0x00009888" value="0x06178000" />
+        <register type="NOA" address="0x00009888" value="0x0817A000" />
+        <register type="NOA" address="0x00009888" value="0x00180037" />
+        <register type="NOA" address="0x00009888" value="0x06180940" />
+        <register type="NOA" address="0x00009888" value="0x08180000" />
+        <register type="NOA" address="0x00009888" value="0x02180000" />
+        <register type="NOA" address="0x00009888" value="0x04183000" />
+        <register type="NOA" address="0x00009888" value="0x06393000" />
+        <register type="NOA" address="0x00009888" value="0x0C3A28C1" />
+        <register type="NOA" address="0x00009888" value="0x003A0000" />
+        <register type="NOA" address="0x00009888" value="0x0A33F000" />
+        <register type="NOA" address="0x00009888" value="0x0C33F000" />
+        <register type="NOA" address="0x00009888" value="0x0A37A000" />
+        <register type="NOA" address="0x00009888" value="0x0C37A000" />
+        <register type="NOA" address="0x00009888" value="0x0A380977" />
+        <register type="NOA" address="0x00009888" value="0x08380000" />
+        <register type="NOA" address="0x00009888" value="0x04380000" />
+        <register type="NOA" address="0x00009888" value="0x06383000" />
+        <register type="NOA" address="0x00009888" value="0x119000FF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900040" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900800" />
+        <register type="NOA" address="0x00009888" value="0x47901000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900844" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00100030" />
+        <register type="OA" address="0x00002774" value="0x0000FFF9" />
+        <register type="OA" address="0x00002778" value="0x00000002" />
+        <register type="OA" address="0x0000277C" value="0x0000FFFC" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000FFF3" />
+        <register type="OA" address="0x00002788" value="0x00100180" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00000002" />
+        <register type="OA" address="0x0000279C" value="0x0000FF3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00008003" />
+    </register_config>
+  </set>
+
+  <set name="MDAPI testing set Gen9.5"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="test_oa"
+       hw_config_guid="baa3c7e4-52b6-4b85-801e-465a94b746dd"
+       chipset="KBLGT2"
+       symbol_name="TestOa"
+       >
+    <counter name="TestCounter7"
+             description="HW test counter 7. Factor: 0.666"
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="counter7"
+             units="events"
+             symbol_name="Counter7"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="TestCounter8"
+             description="HW test counter 8. Should be equal to 1."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="counter8"
+             units="events"
+             symbol_name="Counter8"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter4"
+             description="HW test counter 4. Factor: 0.333"
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="counter4"
+             units="events"
+             symbol_name="Counter4"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter5"
+             description="HW test counter 5. Factor: 0.333"
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="counter5"
+             units="events"
+             symbol_name="Counter5"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter6"
+             description="HW test counter 6. Factor: 0.166"
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="counter6"
+             units="events"
+             symbol_name="Counter6"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter3"
+             description="HW test counter 3. Factor: 0.5"
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="counter3"
+             units="events"
+             symbol_name="Counter3"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter0"
+             description="HW test counter 0. Factor: 0.0"
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="counter0"
+             units="events"
+             symbol_name="Counter0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter1"
+             description="HW test counter 1. Factor: 1.0"
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="counter1"
+             units="events"
+             symbol_name="Counter1"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter2"
+             description="HW test counter 2. Factor: 1.0"
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="counter2"
+             units="events"
+             symbol_name="Counter2"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810000" />
+        <register type="NOA" address="0x00009888" value="0x07810013" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930040" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x00000004" />
+        <register type="OA" address="0x00002774" value="0x00000000" />
+        <register type="OA" address="0x00002778" value="0x00000003" />
+        <register type="OA" address="0x0000277C" value="0x00000000" />
+        <register type="OA" address="0x00002780" value="0x00000007" />
+        <register type="OA" address="0x00002784" value="0x00000000" />
+        <register type="OA" address="0x00002788" value="0x00100002" />
+        <register type="OA" address="0x0000278C" value="0x0000FFF7" />
+        <register type="OA" address="0x00002790" value="0x00100002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00100082" />
+        <register type="OA" address="0x0000279C" value="0x0000FFEF" />
+        <register type="OA" address="0x000027A0" value="0x001000C2" />
+        <register type="OA" address="0x000027A4" value="0x0000FFE7" />
+        <register type="OA" address="0x000027A8" value="0x00100001" />
+        <register type="OA" address="0x000027AC" value="0x0000FFE7" />
+    </register_config>
+  </set>
+
+</metrics>
diff --git a/src/mesa/drivers/dri/i965/brw_oa_kblgt3.xml b/src/mesa/drivers/dri/i965/brw_oa_kblgt3.xml
new file mode 100644
index 0000000..b4ee231
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_oa_kblgt3.xml
@@ -0,0 +1,10500 @@
+<?xml version="1.0"?>
+<metrics version="1491576028" merge_md5="">
+  <set name="Render Metrics Basic Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_basic"
+       hw_config_guid="0286c920-2f6d-493b-b22d-7a5280df43de"
+       chipset="KBLGT3"
+       symbol_name="RenderBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Misses 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Cache Misses"
+             description="The total number of sampler cache misses in all LODs in all sampler units."
+             data_type="uint64"
+             equation="B 4 READ B 5 READ UADD 8 UMUL"
+             underscore_name="sampler_l1_misses"
+             units="messages"
+             symbol_name="SamplerL1Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler 1 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 1 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler1_bottleneck"
+             units="percent"
+             symbol_name="Sampler1Bottleneck"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$SamplerL1Misses 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Sampler 0 Busy"
+             description="The percentage of time in which Sampler 0 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler0_busy"
+             units="percent"
+             symbol_name="Sampler0Busy"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler 1 Busy"
+             description="The percentage of time in which Sampler 1 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler1_busy"
+             units="percent"
+             symbol_name="Sampler1Busy"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Samplers Busy"
+             description="The percentage of time in which samplers have been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="$Sampler0Busy $Sampler1Busy FMAX"
+             underscore_name="samplers_busy"
+             units="percent"
+             symbol_name="SamplersBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI Fixed Pipe Throughput"
+             description="The total number of GPU memory bytes transferred between 3D Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD 64 UMUL"
+             underscore_name="gti_vf_throughput"
+             units="bytes"
+             symbol_name="GtiVfThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/3D Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler 0 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 0 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler0_bottleneck"
+             units="percent"
+             symbol_name="Sampler0Bottleneck"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="L3 Lookup Accesses w/o IC"
+             description="The total number of L3 cache lookup accesses w/o IC."
+             data_type="uint64"
+             equation="$SamplerL1Misses $ShaderMemoryAccesses UADD"
+             underscore_name="l3_lookups"
+             units="messages"
+             symbol_name="L3Lookups"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Depth Throughput"
+             description="The total number of GPU memory bytes transferred between depth caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 0 READ C 1 READ UADD 64 UMUL"
+             underscore_name="gti_depth_throughput"
+             units="bytes"
+             symbol_name="GtiDepthThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Depth Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Samplers Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which samplers have been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="$Sampler0Bottleneck $Sampler1Bottleneck FMAX"
+             max_equation="100"
+             underscore_name="sampler_bottleneck"
+             units="percent"
+             symbol_name="SamplerBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Indicate System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI HDC TLB Lookup Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and HDC, when HDC is doing TLB lookups."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_hdc_lookups_throughput"
+             units="bytes"
+             symbol_name="GtiHdcLookupsThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI RCC Throughput"
+             description="The total number of GPU memory bytes transferred between render color caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 2 READ C 3 READ UADD 64 UMUL"
+             underscore_name="gti_rcc_throughput"
+             units="bytes"
+             symbol_name="GtiRccThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Color Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C01E0" />
+        <register type="NOA" address="0x00009888" value="0x12170280" />
+        <register type="NOA" address="0x00009888" value="0x12370280" />
+        <register type="NOA" address="0x00009888" value="0x16EC01E0" />
+        <register type="NOA" address="0x00009888" value="0x11930317" />
+        <register type="NOA" address="0x00009888" value="0x159303DF" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x1A4E0380" />
+        <register type="NOA" address="0x00009888" value="0x0A6C0053" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A1B4000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0001" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x042F1000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8400" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0002" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F6600" />
+        <register type="NOA" address="0x00009888" value="0x100F0001" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CA200" />
+        <register type="NOA" address="0x00009888" value="0x062D8000" />
+        <register type="NOA" address="0x00009888" value="0x082D8000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x08133000" />
+        <register type="NOA" address="0x00009888" value="0x00170020" />
+        <register type="NOA" address="0x00009888" value="0x08170021" />
+        <register type="NOA" address="0x00009888" value="0x10170000" />
+        <register type="NOA" address="0x00009888" value="0x0633C000" />
+        <register type="NOA" address="0x00009888" value="0x0833C000" />
+        <register type="NOA" address="0x00009888" value="0x06370800" />
+        <register type="NOA" address="0x00009888" value="0x08370840" />
+        <register type="NOA" address="0x00009888" value="0x10370000" />
+        <register type="NOA" address="0x00009888" value="0x1ACE0200" />
+        <register type="NOA" address="0x00009888" value="0x0AEC5300" />
+        <register type="NOA" address="0x00009888" value="0x10EC0000" />
+        <register type="NOA" address="0x00009888" value="0x1CEC0000" />
+        <register type="NOA" address="0x00009888" value="0x0A9B8000" />
+        <register type="NOA" address="0x00009888" value="0x1C9C0002" />
+        <register type="NOA" address="0x00009888" value="0x0CCC0002" />
+        <register type="NOA" address="0x00009888" value="0x0A8D8000" />
+        <register type="NOA" address="0x00009888" value="0x108F0001" />
+        <register type="NOA" address="0x00009888" value="0x16AC8000" />
+        <register type="NOA" address="0x00009888" value="0x0D933031" />
+        <register type="NOA" address="0x00009888" value="0x0F933E3F" />
+        <register type="NOA" address="0x00009888" value="0x01933D00" />
+        <register type="NOA" address="0x00009888" value="0x0393073C" />
+        <register type="NOA" address="0x00009888" value="0x0593000E" />
+        <register type="NOA" address="0x00009888" value="0x1D930000" />
+        <register type="NOA" address="0x00009888" value="0x19930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900158" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D908000" />
+        <register type="NOA" address="0x00009888" value="0x2F908000" />
+        <register type="NOA" address="0x00009888" value="0x31908000" />
+        <register type="NOA" address="0x00009888" value="0x15908000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190003F" />
+        <register type="NOA" address="0x00009888" value="0x51902240" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x55900242" />
+        <register type="NOA" address="0x00009888" value="0x45900084" />
+        <register type="NOA" address="0x00009888" value="0x47901400" />
+        <register type="NOA" address="0x00009888" value="0x57902220" />
+        <register type="NOA" address="0x00009888" value="0x49900C60" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900002" />
+        <register type="NOA" address="0x00009888" value="0x43900C63" />
+        <register type="NOA" address="0x00009888" value="0x53902222" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Basic Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_basic"
+       hw_config_guid="9823aaa1-b06f-40ce-884b-cd798c79f0c2"
+       chipset="KBLGT3"
+       symbol_name="ComputeBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Bytes Read"
+             description="The total number of untyped memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 6 READ B 7 READ C 0 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_read"
+             units="bytes"
+             symbol_name="UntypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Bytes Written"
+             description="The total number of typed memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 3 READ B 4 READ B 5 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_written"
+             units="bytes"
+             symbol_name="TypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Writes"
+             description="The total number of untyped memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 1 READ C 2 READ C 3 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_written"
+             units="bytes"
+             symbol_name="UntypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Typed Bytes Read"
+             description="The total number of typed memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 0 READ B 1 READ B 2 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_read"
+             units="bytes"
+             symbol_name="TypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x124F1C00" />
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E0820" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x064F0900" />
+        <register type="NOA" address="0x00009888" value="0x084F0032" />
+        <register type="NOA" address="0x00009888" value="0x0A4F1891" />
+        <register type="NOA" address="0x00009888" value="0x0C4F0E00" />
+        <register type="NOA" address="0x00009888" value="0x0E4F003C" />
+        <register type="NOA" address="0x00009888" value="0x004F0D80" />
+        <register type="NOA" address="0x00009888" value="0x024F003B" />
+        <register type="NOA" address="0x00009888" value="0x006C0002" />
+        <register type="NOA" address="0x00009888" value="0x086C0100" />
+        <register type="NOA" address="0x00009888" value="0x0C6C000C" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B00" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x081B8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E1B8000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C8000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0024" />
+        <register type="NOA" address="0x00009888" value="0x065B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5BC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B8000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C6000" />
+        <register type="NOA" address="0x00009888" value="0x1C5C001B" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0208" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5500" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2CC000" />
+        <register type="NOA" address="0x00009888" value="0x162CFB00" />
+        <register type="NOA" address="0x00009888" value="0x182C00BE" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x19900157" />
+        <register type="NOA" address="0x00009888" value="0x1B900158" />
+        <register type="NOA" address="0x00009888" value="0x1D900105" />
+        <register type="NOA" address="0x00009888" value="0x1F900103" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x11900FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900821" />
+        <register type="NOA" address="0x00009888" value="0x47900802" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900802" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900002" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900422" />
+        <register type="NOA" address="0x00009888" value="0x53904444" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Render Metrics for 3D Pipeline Profile Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_pipe_profile"
+       hw_config_guid="c7c735f3-ce58-45cf-aa04-30b183f1faff"
+       chipset="KBLGT3"
+       symbol_name="RenderPipeProfile"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which vertex shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_bottleneck"
+             units="percent"
+             symbol_name="VsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Hi-Depth Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which early hierarchical depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hi_depth_bottleneck"
+             units="percent"
+             symbol_name="HiDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which geometry shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gs_bottleneck"
+             units="percent"
+             symbol_name="GsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Geometry Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="BC Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which barycentric coordinates calculation pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="bc_bottleneck"
+             units="percent"
+             symbol_name="BcBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Barycentric Calc"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Stall"
+             description="The percentage of time in which hull stall pipeline stage was stalled."
+             data_type="float"
+             equation="C 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_stall"
+             units="percent"
+             symbol_name="HsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Hull Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="VF Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which vertex fetch pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vf_bottleneck"
+             units="percent"
+             symbol_name="VfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Input Assembler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Strip-Fans Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which strip-fans pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="10"
+             equation="B 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_bottleneck"
+             units="percent"
+             symbol_name="SfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SF Stall"
+             description="The percentage of time in which strip-fans pipeline stage was stalled."
+             data_type="float"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_stall"
+             units="percent"
+             symbol_name="SfStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Bottleneck"
+             low_watermark="3"
+             description="The percentage of time in which hull shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="9"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_bottleneck"
+             units="percent"
+             symbol_name="HsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Hull Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CL Stall"
+             description="The percentage of time in which clipper pipeline stage was stalled."
+             data_type="float"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_stall"
+             units="percent"
+             symbol_name="ClStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Clipper"
+             />
+    <counter name="SO Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which stream output pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_bottleneck"
+             units="percent"
+             symbol_name="SoBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Stream Output"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="DS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which domain shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_bottleneck"
+             units="percent"
+             symbol_name="DsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Domain Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Clipper Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which clipper pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_bottleneck"
+             units="percent"
+             symbol_name="ClBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Clipper"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Stall"
+             description="The percentage of time in which domain shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_stall"
+             units="percent"
+             symbol_name="DsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Domain Shader"
+             />
+    <counter name="Early Depth Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which early depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="early_depth_bottleneck"
+             units="percent"
+             symbol_name="EarlyDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SO Stall"
+             description="The percentage of time in which stream-output pipeline stage was stalled."
+             data_type="float"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_stall"
+             units="percent"
+             symbol_name="SoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Stream Output"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x0C0E001F" />
+        <register type="NOA" address="0x00009888" value="0x0A0F0000" />
+        <register type="NOA" address="0x00009888" value="0x10116800" />
+        <register type="NOA" address="0x00009888" value="0x178A03E0" />
+        <register type="NOA" address="0x00009888" value="0x11824C00" />
+        <register type="NOA" address="0x00009888" value="0x11830020" />
+        <register type="NOA" address="0x00009888" value="0x13840020" />
+        <register type="NOA" address="0x00009888" value="0x11850019" />
+        <register type="NOA" address="0x00009888" value="0x11860007" />
+        <register type="NOA" address="0x00009888" value="0x01870C40" />
+        <register type="NOA" address="0x00009888" value="0x17880000" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0040" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x040D4000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020E5400" />
+        <register type="NOA" address="0x00009888" value="0x000E0000" />
+        <register type="NOA" address="0x00009888" value="0x080F0040" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x100F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0040" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06110012" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x01898000" />
+        <register type="NOA" address="0x00009888" value="0x0D890100" />
+        <register type="NOA" address="0x00009888" value="0x03898000" />
+        <register type="NOA" address="0x00009888" value="0x09808000" />
+        <register type="NOA" address="0x00009888" value="0x0B808000" />
+        <register type="NOA" address="0x00009888" value="0x0380C000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A0075" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0000" />
+        <register type="NOA" address="0x00009888" value="0x118A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A4000" />
+        <register type="NOA" address="0x00009888" value="0x138A8000" />
+        <register type="NOA" address="0x00009888" value="0x1D81A000" />
+        <register type="NOA" address="0x00009888" value="0x15818000" />
+        <register type="NOA" address="0x00009888" value="0x17818000" />
+        <register type="NOA" address="0x00009888" value="0x0B820030" />
+        <register type="NOA" address="0x00009888" value="0x07828000" />
+        <register type="NOA" address="0x00009888" value="0x0D824000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x05824000" />
+        <register type="NOA" address="0x00009888" value="0x0D830003" />
+        <register type="NOA" address="0x00009888" value="0x0583000C" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x03838000" />
+        <register type="NOA" address="0x00009888" value="0x07838000" />
+        <register type="NOA" address="0x00009888" value="0x0B840980" />
+        <register type="NOA" address="0x00009888" value="0x03844D80" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x09848000" />
+        <register type="NOA" address="0x00009888" value="0x09850080" />
+        <register type="NOA" address="0x00009888" value="0x03850003" />
+        <register type="NOA" address="0x00009888" value="0x01850000" />
+        <register type="NOA" address="0x00009888" value="0x07860000" />
+        <register type="NOA" address="0x00009888" value="0x0F860400" />
+        <register type="NOA" address="0x00009888" value="0x09870032" />
+        <register type="NOA" address="0x00009888" value="0x01888052" />
+        <register type="NOA" address="0x00009888" value="0x11880000" />
+        <register type="NOA" address="0x00009888" value="0x09884000" />
+        <register type="NOA" address="0x00009888" value="0x1B931001" />
+        <register type="NOA" address="0x00009888" value="0x1D930001" />
+        <register type="NOA" address="0x00009888" value="0x19934000" />
+        <register type="NOA" address="0x00009888" value="0x1B958000" />
+        <register type="NOA" address="0x00009888" value="0x1D950094" />
+        <register type="NOA" address="0x00009888" value="0x19958000" />
+        <register type="NOA" address="0x00009888" value="0x09E58000" />
+        <register type="NOA" address="0x00009888" value="0x0BE58000" />
+        <register type="NOA" address="0x00009888" value="0x03E5C000" />
+        <register type="NOA" address="0x00009888" value="0x0592C000" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D924000" />
+        <register type="NOA" address="0x00009888" value="0x0F924000" />
+        <register type="NOA" address="0x00009888" value="0x11928000" />
+        <register type="NOA" address="0x00009888" value="0x1392C000" />
+        <register type="NOA" address="0x00009888" value="0x09924000" />
+        <register type="NOA" address="0x00009888" value="0x01985000" />
+        <register type="NOA" address="0x00009888" value="0x07988000" />
+        <register type="NOA" address="0x00009888" value="0x09981000" />
+        <register type="NOA" address="0x00009888" value="0x0B982000" />
+        <register type="NOA" address="0x00009888" value="0x0D982000" />
+        <register type="NOA" address="0x00009888" value="0x0F989000" />
+        <register type="NOA" address="0x00009888" value="0x05982000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1190C080" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900440" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+        <register type="NOA" address="0x00009888" value="0x47900C21" />
+        <register type="NOA" address="0x00009888" value="0x57900400" />
+        <register type="NOA" address="0x00009888" value="0x49900042" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900024" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900841" />
+        <register type="NOA" address="0x00009888" value="0x53900400" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFEA" />
+        <register type="OA" address="0x00002774" value="0x00007FFC" />
+        <register type="OA" address="0x00002778" value="0x0007AFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000F5FD" />
+        <register type="OA" address="0x00002780" value="0x00079FFA" />
+        <register type="OA" address="0x00002784" value="0x0000F3FB" />
+        <register type="OA" address="0x00002788" value="0x0007BF7A" />
+        <register type="OA" address="0x0000278C" value="0x0000F7E7" />
+        <register type="OA" address="0x00002790" value="0x0007FEFA" />
+        <register type="OA" address="0x00002794" value="0x0000F7CF" />
+        <register type="OA" address="0x00002798" value="0x00077FFA" />
+        <register type="OA" address="0x0000279C" value="0x0000EFDF" />
+        <register type="OA" address="0x000027A0" value="0x0006FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000CFBF" />
+        <register type="OA" address="0x000027A8" value="0x0003FFFA" />
+        <register type="OA" address="0x000027AC" value="0x00005F7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Reads Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_reads"
+       hw_config_guid="96ec2219-040b-428a-856a-6bc03363a057"
+       chipset="KBLGT3"
+       symbol_name="MemoryReads"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank0Reads"
+             description="The total number of GTI memory reads from L3 Bank 0 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_reads"
+             units="messages"
+             symbol_name="GtiL3Bank0Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all accesses from GTI to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiL3Bank3Reads"
+             description="The total number of GTI memory reads from L3 Bank 3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_reads"
+             units="messages"
+             symbol_name="GtiL3Bank3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiRsMemoryReads"
+             description="The total number of GTI memory reads from Resource Streamer."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_rs_memory_reads"
+             units="messages"
+             symbol_name="GtiRsMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Resource Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiHizMemoryReads"
+             description="The total number of GTI memory reads from Hierarchical Depth Cache (Hi-Depth Cache misses)."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_reads"
+             units="messages"
+             symbol_name="GtiHizMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="GtiRccMemoryReads"
+             description="The total number of GTI memory reads from Render Color Cache (Render Color Cache misses)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_reads"
+             units="messages"
+             symbol_name="GtiRccMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank1Reads"
+             description="The total number of GTI memory reads from L3 Bank 1 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_reads"
+             units="messages"
+             symbol_name="GtiL3Bank1Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiCmdStreamerMemoryReads"
+             description="The total number of GTI memory reads from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_reads"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="GtiL3Bank2Reads"
+             description="The total number of GTI memory reads from L3 Bank 2 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_reads"
+             units="messages"
+             symbol_name="GtiL3Bank2Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiMemoryReads"
+             description="The total number of GTI memory reads."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_reads"
+             units="messages"
+             symbol_name="GtiMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GtiRczMemoryReads"
+             description="The total number of GTI memory reads from Render Depth Cache (Render Depth Cache misses)."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_reads"
+             units="messages"
+             symbol_name="GtiRczMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="GtiMscMemoryReads"
+             description="The total number of GTI memory reads from Multisampling Color Cache (Multisampling Color Cache misses)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_reads"
+             units="messages"
+             symbol_name="GtiMscMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiVfMemoryReads"
+             description="The total number of GTI memory reads from Vertex Fetch."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="gti_vf_memory_reads"
+             units="messages"
+             symbol_name="GtiVfMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Vertex Fetch"
+             />
+    <counter name="GtiStcMemoryReads"
+             description="The total number of GTI memory reads from Stencil Cache (Stencil Cache misses)."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_reads"
+             units="messages"
+             symbol_name="GtiStcMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Reads"
+             description="The total number of GTI memory reads from L3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Reads $GtiL3Bank1Reads $GtiL3Bank2Reads $GtiL3Bank3Reads UADD UADD UADD"
+             underscore_name="gti_l3_reads"
+             units="messages"
+             symbol_name="GtiL3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900064" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900150" />
+        <register type="NOA" address="0x00009888" value="0x21900151" />
+        <register type="NOA" address="0x00009888" value="0x23900152" />
+        <register type="NOA" address="0x00009888" value="0x25900153" />
+        <register type="NOA" address="0x00009888" value="0x27900154" />
+        <register type="NOA" address="0x00009888" value="0x29900155" />
+        <register type="NOA" address="0x00009888" value="0x2B900156" />
+        <register type="NOA" address="0x00009888" value="0x2D900157" />
+        <register type="NOA" address="0x00009888" value="0x2F90015F" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F872" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Writes Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_writes"
+       hw_config_guid="03372b64-4996-4d3b-aa18-790e75eeb9c2"
+       chipset="KBLGT3"
+       symbol_name="MemoryWrites"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiMemoryWrites"
+             description="The total number of GTI memory writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_writes"
+             units="messages"
+             symbol_name="GtiMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all GTI accesses to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiMscMemoryWrites"
+             description="The total number of GTI memory writes from Multisampling Color Cache (Multisampling Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_writes"
+             units="messages"
+             symbol_name="GtiMscMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiCmdStreamerMemoryWrites"
+             description="The total number of GTI memory writes from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_writes"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiL3Bank0Writes"
+             description="The total number of GTI memory writes from L3 Bank 0 (L3 Bank 0 invalidations)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_writes"
+             units="messages"
+             symbol_name="GtiL3Bank0Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank1Writes"
+             description="The total number of GTI memory writes from L3 Bank 1 (L3 Bank 1 invalidations)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_writes"
+             units="messages"
+             symbol_name="GtiL3Bank1Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank2Writes"
+             description="The total number of GTI memory writes from L3 Bank 2 (L3 Bank 2 invalidations)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_writes"
+             units="messages"
+             symbol_name="GtiL3Bank2Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank3Writes"
+             description="The total number of GTI memory writes from L3 Bank 3 (L3 Bank 3 invalidations)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_writes"
+             units="messages"
+             symbol_name="GtiL3Bank3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Writes"
+             description="The total number of GTI memory writes from L3 (L3 invalidations)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Writes $GtiL3Bank1Writes $GtiL3Bank2Writes $GtiL3Bank3Writes UADD UADD UADD"
+             underscore_name="gti_l3_writes"
+             units="messages"
+             symbol_name="GtiL3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiRccMemoryWrites"
+             description="The total number of GTI memory writes from Render Color Cache (Render Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_writes"
+             units="messages"
+             symbol_name="GtiRccMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiSoMemoryWrites"
+             description="The total number of GTI memory writes from Stream Output."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_so_memory_writes"
+             units="messages"
+             symbol_name="GtiSoMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Stream Output"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiStcMemoryWrites"
+             description="The total number of GTI memory writes from Stencil Cache."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_writes"
+             units="messages"
+             symbol_name="GtiStcMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GtiRczMemoryWrites"
+             description="The total number of GTI memory writes from Render Depth Cache."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_writes"
+             units="messages"
+             symbol_name="GtiRczMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="GtiHizMemoryWrites"
+             description="The total number of GTI memory writes from Hierarchical Depth Cache."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_writes"
+             units="messages"
+             symbol_name="GtiHizMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F901000" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900160" />
+        <register type="NOA" address="0x00009888" value="0x21900161" />
+        <register type="NOA" address="0x00009888" value="0x23900162" />
+        <register type="NOA" address="0x00009888" value="0x25900163" />
+        <register type="NOA" address="0x00009888" value="0x27900164" />
+        <register type="NOA" address="0x00009888" value="0x29900165" />
+        <register type="NOA" address="0x00009888" value="0x2B900166" />
+        <register type="NOA" address="0x00009888" value="0x2D900167" />
+        <register type="NOA" address="0x00009888" value="0x2F900150" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F822" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extended Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extended"
+       hw_config_guid="31b4ce5a-bd61-4c1f-bb5d-f2e731412150"
+       chipset="KBLGT3"
+       symbol_name="ComputeExtended"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Writes 0"
+             description="The subslice 0 typed writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="typed_writes0"
+             units="messages"
+             symbol_name="TypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuTypedAtomics0"
+             description="The subslice 0 EU Typed Atomics subslice 0."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_typed_atomics0"
+             units="messages"
+             symbol_name="EuTypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Atomics 0"
+             description="The subslice 0 typed atomics."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="typed_atomics0"
+             units="messages"
+             symbol_name="TypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedAtomicsPerCacheLine"
+             description="The ratio of EU typed atomics requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedAtomics0 $TypedAtomics0 FDIV"
+             underscore_name="typed_atomics_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedAtomicsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedReads0"
+             description="The subslice 0 EU Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="eu_untyped_reads0"
+             units="messages"
+             symbol_name="EuUntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Writes 0"
+             description="The subslice 0 untyped writes (including SLM writes)."
+             data_type="uint64"
+             equation="C 1 READ"
+             underscore_name="untyped_writes0"
+             units="messages"
+             symbol_name="UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedAtomics0"
+             description="The subslice 0 EU Untyped Atomics subslice 0."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="eu_untyped_atomics0"
+             units="messages"
+             symbol_name="EuUntypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedWrites0"
+             description="The subslice 0 EU Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="eu_untyped_writes0"
+             units="messages"
+             symbol_name="EuUntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedWrites0"
+             description="The subslice 0 EU A64 Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="eu_a64_untyped_writes0"
+             units="messages"
+             symbol_name="EuA64UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedWritesPerCacheLine"
+             description="The ratio of EU untyped write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuUntypedWrites0 $EuA64UntypedWrites0 UADD $UntypedWrites0 FDIV"
+             underscore_name="untyped_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedWrites0"
+             description="The subslice 0 EU Typed Writes subslice 0."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="eu_typed_writes0"
+             units="messages"
+             symbol_name="EuTypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedWritesPerCacheLine"
+             description="The ratio of EU typed write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedWrites0 $TypedWrites0 FDIV"
+             underscore_name="typed_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Reads 0"
+             description="The subslice 0 typed reads."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="typed_reads0"
+             units="messages"
+             symbol_name="TypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Reads 0"
+             description="The subslice 0 untyped reads (including SLM reads)."
+             data_type="uint64"
+             equation="C 3 READ"
+             underscore_name="untyped_reads0"
+             units="messages"
+             symbol_name="UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedReads0"
+             description="The subslice 0 EU A64 Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="eu_a64_untyped_reads0"
+             units="messages"
+             symbol_name="EuA64UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedReads0"
+             description="The subslice 0 EU Typed Reads subslice 0."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="eu_typed_reads0"
+             units="messages"
+             symbol_name="EuTypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedReadsPerCacheLine"
+             description="The ratio of EU untyped read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuUntypedReads0 $EuA64UntypedReads0 UADD $UntypedReads0 FDIV"
+             underscore_name="untyped_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedReadsPerCacheLine"
+             description="The ratio of EU typed read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuTypedReads0 $TypedReads0 FDIV"
+             underscore_name="typed_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x141C8160" />
+        <register type="NOA" address="0x00009888" value="0x161C8015" />
+        <register type="NOA" address="0x00009888" value="0x181C0120" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4EAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B01" />
+        <register type="NOA" address="0x00009888" value="0x006C0200" />
+        <register type="NOA" address="0x00009888" value="0x026C000C" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x001C0041" />
+        <register type="NOA" address="0x00009888" value="0x061C4200" />
+        <register type="NOA" address="0x00009888" value="0x081C4443" />
+        <register type="NOA" address="0x00009888" value="0x0A1C4645" />
+        <register type="NOA" address="0x00009888" value="0x0C1C7647" />
+        <register type="NOA" address="0x00009888" value="0x041C7357" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x101C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0000" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4CAA2A" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02AA" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5515" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CAA00" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x11907FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900040" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900802" />
+        <register type="NOA" address="0x00009888" value="0x47900842" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900842" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900800" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FC2A" />
+        <register type="OA" address="0x00002774" value="0x0000BF00" />
+        <register type="OA" address="0x00002778" value="0x0007FC6A" />
+        <register type="OA" address="0x0000277C" value="0x0000BF00" />
+        <register type="OA" address="0x00002780" value="0x0007FC92" />
+        <register type="OA" address="0x00002784" value="0x0000BF00" />
+        <register type="OA" address="0x00002788" value="0x0007FCA2" />
+        <register type="OA" address="0x0000278C" value="0x0000BF00" />
+        <register type="OA" address="0x00002790" value="0x0007FC32" />
+        <register type="OA" address="0x00002794" value="0x0000BF00" />
+        <register type="OA" address="0x00002798" value="0x0007FC9A" />
+        <register type="OA" address="0x0000279C" value="0x0000BF00" />
+        <register type="OA" address="0x000027A0" value="0x0007FE6A" />
+        <register type="OA" address="0x000027A4" value="0x0000BF00" />
+        <register type="OA" address="0x000027A8" value="0x0007FE7A" />
+        <register type="OA" address="0x000027AC" value="0x0000BF00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics L3 Cache Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_l3_cache"
+       hw_config_guid="2ce0911a-27fc-4887-96f0-11084fa807c3"
+       chipset="KBLGT3"
+       symbol_name="ComputeL3Cache"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 03 Accesses"
+             description="The total number of accesses to L3 Bank 03."
+             data_type="uint64"
+             equation="B 3 READ 2 UMUL"
+             underscore_name="l3_bank03_accesses"
+             units="messages"
+             symbol_name="L3Bank03Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Accesses"
+             description="The total number of L3 accesses from all entities."
+             data_type="uint64"
+             equation="C 0 READ C 1 READ B 2 READ B 3 READ UADD UADD UADD 2 UMUL"
+             underscore_name="l3_accesses"
+             units="messages"
+             symbol_name="L3Accesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 29 READ 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="EU FPU0 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu0_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ C 5 READ UADD"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="L3 Bank 00 Accesses"
+             description="The total number of accesses to L3 Bank 00."
+             data_type="uint64"
+             equation="C 0 READ 2 UMUL"
+             underscore_name="l3_bank00_accesses"
+             units="messages"
+             symbol_name="L3Bank00Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU0 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 19 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu0_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 14 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu1_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ C 5 READ UADD 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 00 IC Accesses"
+             description="The total number of accesses to L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 0 READ B 1 READ UADD 2 UMUL $L3Bank00Accesses UMIN"
+             underscore_name="l3_bank00_ic_accesses"
+             units="messages"
+             symbol_name="L3Bank00IcAccesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 00 IC Hits"
+             description="The total number of hits in L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 1 READ 2 UMUL $L3Bank00IcAccesses UMIN"
+             underscore_name="l3_bank00_ic_hits"
+             units="messages"
+             symbol_name="L3Bank00IcHits"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="Sampler Accesses"
+             description="The total number of messages sent to samplers."
+             data_type="uint64"
+             equation="A 28 READ"
+             underscore_name="sampler_accesses"
+             units="messages"
+             symbol_name="SamplerAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler"
+             />
+    <counter name="L3 Bank 01 Accesses"
+             description="The total number of accesses to L3 Bank 01."
+             data_type="uint64"
+             equation="C 1 READ 2 UMUL"
+             underscore_name="l3_bank01_accesses"
+             units="messages"
+             symbol_name="L3Bank01Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 20 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu1_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="L3 Bank 02 Accesses"
+             description="The total number of accesses to L3 Bank 02."
+             data_type="uint64"
+             equation="B 2 READ 2 UMUL"
+             underscore_name="l3_bank02_accesses"
+             units="messages"
+             symbol_name="L3Bank02Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="L3 Total Throughput"
+             description="The total number of GPU memory bytes transferred via L3."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Accesses 64 UMUL"
+             underscore_name="l3_total_throughput"
+             units="bytes"
+             symbol_name="L3TotalThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="EU FPU1 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C0760" />
+        <register type="NOA" address="0x00009888" value="0x1593001E" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E8020" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x006C0051" />
+        <register type="NOA" address="0x00009888" value="0x066C5000" />
+        <register type="NOA" address="0x00009888" value="0x086C5C5D" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5E5F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x061B8000" />
+        <register type="NOA" address="0x00009888" value="0x081BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1CE000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2A00" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0280" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F1500" />
+        <register type="NOA" address="0x00009888" value="0x100F0140" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162C0A00" />
+        <register type="NOA" address="0x00009888" value="0x182C00A0" />
+        <register type="NOA" address="0x00009888" value="0x03933300" />
+        <register type="NOA" address="0x00009888" value="0x05930032" />
+        <register type="NOA" address="0x00009888" value="0x11930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900158" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190030F" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900021" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x53904444" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFFA" />
+        <register type="OA" address="0x00002774" value="0x0000FEFE" />
+        <register type="OA" address="0x00002778" value="0x0007FFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000FEFD" />
+        <register type="OA" address="0x00002790" value="0x0007FFFA" />
+        <register type="OA" address="0x00002794" value="0x0000FBEF" />
+        <register type="OA" address="0x00002798" value="0x0007FFFA" />
+        <register type="OA" address="0x0000279C" value="0x0000FBDF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00101100" />
+        <register type="FLEX" address="0x0000E45C" value="0x00201200" />
+        <register type="FLEX" address="0x0000E55C" value="0x00301300" />
+        <register type="FLEX" address="0x0000E65C" value="0x00401400" />
+    </register_config>
+  </set>
+
+  <set name="Metric set HDCAndSF"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="hdc_and_sf"
+       hw_config_guid="546c4c1d-99b8-42fb-a107-5aaabb5314a8"
+       chipset="KBLGT3"
+       symbol_name="HDCAndSF"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Polygon Data Ready"
+             description="The percentage of time in which geometry pipeline output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="poly_data_ready"
+             units="percent"
+             symbol_name="PolyDataReady"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="HDC stalled by L3 (s0.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ C 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader01_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader01AccessStalledOnL3"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss2)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss2)"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ C 2 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader02_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader02AccessStalledOnL3"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ C 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader00_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader00AccessStalledOnL3"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F0232" />
+        <register type="NOA" address="0x00009888" value="0x124F4640" />
+        <register type="NOA" address="0x00009888" value="0x106C0232" />
+        <register type="NOA" address="0x00009888" value="0x11834400" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x004F1880" />
+        <register type="NOA" address="0x00009888" value="0x024F08BB" />
+        <register type="NOA" address="0x00009888" value="0x044F001B" />
+        <register type="NOA" address="0x00009888" value="0x046C0100" />
+        <register type="NOA" address="0x00009888" value="0x066C000B" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x041B8000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025BC000" />
+        <register type="NOA" address="0x00009888" value="0x045B4000" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x165C8000" />
+        <register type="NOA" address="0x00009888" value="0x185C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00A0" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x062CC000" />
+        <register type="NOA" address="0x00009888" value="0x082CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x0F8305C0" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x07830000" />
+        <register type="NOA" address="0x00009888" value="0x1D950080" />
+        <register type="NOA" address="0x00009888" value="0x13928000" />
+        <register type="NOA" address="0x00009888" value="0x0F988000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900040" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x10800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_1"
+       hw_config_guid="4e93d156-9b39-4268-8544-a8e0480806d7"
+       chipset="KBLGT3"
+       symbol_name="L3_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank1 Active"
+             description="The percentage of time in which slice0 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_active"
+             units="percent"
+             symbol_name="L30Bank1Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 L3 Bank1 Stalled"
+             description="The percentage of time in which slice0 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_stalled"
+             units="percent"
+             symbol_name="L30Bank1Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Active"
+             description="The percentage of time in which slice0 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_active"
+             units="percent"
+             symbol_name="L30Bank0Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Stalled"
+             description="The percentage of time in which slice0 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_stalled"
+             units="percent"
+             symbol_name="L30Bank0Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C7B40" />
+        <register type="NOA" address="0x00009888" value="0x166C0020" />
+        <register type="NOA" address="0x00009888" value="0x0A603444" />
+        <register type="NOA" address="0x00009888" value="0x0A613400" />
+        <register type="NOA" address="0x00009888" value="0x1A4EA800" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x0C6C5327" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5425" />
+        <register type="NOA" address="0x00009888" value="0x006C2A00" />
+        <register type="NOA" address="0x00009888" value="0x026C285B" />
+        <register type="NOA" address="0x00009888" value="0x046C005C" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0800" />
+        <register type="NOA" address="0x00009888" value="0x0C1BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C003C" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x10600000" />
+        <register type="NOA" address="0x00009888" value="0x04600000" />
+        <register type="NOA" address="0x00009888" value="0x0C610044" />
+        <register type="NOA" address="0x00009888" value="0x10610000" />
+        <register type="NOA" address="0x00009888" value="0x06610000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02A8" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0154" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190FFC0" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900420" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900021" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900400" />
+        <register type="NOA" address="0x00009888" value="0x43900421" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_2"
+       hw_config_guid="de1bec86-ca92-4b43-89fa-147653221cc0"
+       chipset="KBLGT3"
+       symbol_name="L3_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Stalled"
+             description="The percentage of time in which slice0 L3 bank2 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_stalled"
+             units="percent"
+             symbol_name="L30Bank2Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Active"
+             description="The percentage of time in which slice0 L3 bank2 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_active"
+             units="percent"
+             symbol_name="L30Bank2Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C02E0" />
+        <register type="NOA" address="0x00009888" value="0x146C0001" />
+        <register type="NOA" address="0x00009888" value="0x0A623400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x026C3324" />
+        <register type="NOA" address="0x00009888" value="0x046C3422" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x06614000" />
+        <register type="NOA" address="0x00009888" value="0x0C620044" />
+        <register type="NOA" address="0x00009888" value="0x10620000" />
+        <register type="NOA" address="0x00009888" value="0x06620000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_3"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_3"
+       hw_config_guid="e63537bb-10be-4d4a-92c4-c6b0c65e02ef"
+       chipset="KBLGT3"
+       symbol_name="L3_3"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 L3 Bank3 Stalled"
+             description="The percentage of time in which slice0 L3 bank3 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_stalled"
+             units="percent"
+             symbol_name="L30Bank3Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank3 Active"
+             description="The percentage of time in which slice0 L3 bank3 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_active"
+             units="percent"
+             symbol_name="L30Bank3Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C4E80" />
+        <register type="NOA" address="0x00009888" value="0x146C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A633400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x026C3321" />
+        <register type="NOA" address="0x00009888" value="0x046C342F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C2000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x06604000" />
+        <register type="NOA" address="0x00009888" value="0x0C630044" />
+        <register type="NOA" address="0x00009888" value="0x10630000" />
+        <register type="NOA" address="0x00009888" value="0x06630000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00AA" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900002" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set RasterizerAndPixelBackend"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="rasterizer_and_pixel_backend"
+       hw_config_guid="7a03a9f8-ec5e-46bb-8b67-1f0ff1476281"
+       chipset="KBLGT3"
+       symbol_name="RasterizerAndPixelBackend"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 Pixel Values Ready"
+             description="The percentage of time in which slice0 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values0_ready"
+             units="percent"
+             symbol_name="PixelValues0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Slice0 Rasterizer Input Available"
+             description="The percentage of time in which slice0 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_input_available"
+             units="percent"
+             symbol_name="Rasterizer0InputAvailable"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice0 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data0_ready"
+             units="percent"
+             symbol_name="PixelData0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Rasterizer Output Ready"
+             description="The percentage of time in which slice0 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_output_ready"
+             units="percent"
+             symbol_name="Rasterizer0OutputReady"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Slice0 PS Output Available"
+             description="The percentage of time in which slice0 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output0_available"
+             units="percent"
+             symbol_name="PSOutput0Available"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x102F3800" />
+        <register type="NOA" address="0x00009888" value="0x144D0500" />
+        <register type="NOA" address="0x00009888" value="0x120D03C0" />
+        <register type="NOA" address="0x00009888" value="0x140D03CF" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0004" />
+        <register type="NOA" address="0x00009888" value="0x0C4E4000" />
+        <register type="NOA" address="0x00009888" value="0x042F0480" />
+        <register type="NOA" address="0x00009888" value="0x082F0000" />
+        <register type="NOA" address="0x00009888" value="0x022F0000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0090" />
+        <register type="NOA" address="0x00009888" value="0x064D0027" />
+        <register type="NOA" address="0x00009888" value="0x004D0000" />
+        <register type="NOA" address="0x00009888" value="0x000D0D40" />
+        <register type="NOA" address="0x00009888" value="0x020D803F" />
+        <register type="NOA" address="0x00009888" value="0x040D8023" />
+        <register type="NOA" address="0x00009888" value="0x100D0000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020F0010" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0050" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41901400" />
+        <register type="NOA" address="0x00009888" value="0x43901485" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900001" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000EFFF" />
+        <register type="OA" address="0x00002778" value="0x00006000" />
+        <register type="OA" address="0x0000277C" value="0x0000F3FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler"
+       hw_config_guid="b25d2ebf-a6e0-4b29-96be-a9b010edeeda"
+       chipset="KBLGT3"
+       symbol_name="Sampler"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice2 Input Available"
+             description="The percentage of time in which slice0 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_input_available"
+             units="percent"
+             symbol_name="Sampler02InputAvailable"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice0 Input Available"
+             description="The percentage of time in which slice0 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_input_available"
+             units="percent"
+             symbol_name="Sampler00InputAvailable"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_output_ready"
+             units="percent"
+             symbol_name="Sampler02OutputReady"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice0 Subslice1 Input Available"
+             description="The percentage of time in which slice0 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_input_available"
+             units="percent"
+             symbol_name="Sampler01InputAvailable"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice0 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_output_ready"
+             units="percent"
+             symbol_name="Sampler00OutputReady"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice0 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_output_ready"
+             units="percent"
+             symbol_name="Sampler01OutputReady"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x14152C00" />
+        <register type="NOA" address="0x00009888" value="0x16150005" />
+        <register type="NOA" address="0x00009888" value="0x121600A0" />
+        <register type="NOA" address="0x00009888" value="0x14352C00" />
+        <register type="NOA" address="0x00009888" value="0x16350005" />
+        <register type="NOA" address="0x00009888" value="0x123600A0" />
+        <register type="NOA" address="0x00009888" value="0x14552C00" />
+        <register type="NOA" address="0x00009888" value="0x16550005" />
+        <register type="NOA" address="0x00009888" value="0x125600A0" />
+        <register type="NOA" address="0x00009888" value="0x062F6000" />
+        <register type="NOA" address="0x00009888" value="0x022F2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0050" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0010" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0350" />
+        <register type="NOA" address="0x00009888" value="0x0C0FB000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F00DA" />
+        <register type="NOA" address="0x00009888" value="0x182C0028" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x022DC000" />
+        <register type="NOA" address="0x00009888" value="0x042D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C138000" />
+        <register type="NOA" address="0x00009888" value="0x0E132000" />
+        <register type="NOA" address="0x00009888" value="0x0413C000" />
+        <register type="NOA" address="0x00009888" value="0x1C140018" />
+        <register type="NOA" address="0x00009888" value="0x0C157000" />
+        <register type="NOA" address="0x00009888" value="0x0E150078" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x04162180" />
+        <register type="NOA" address="0x00009888" value="0x02160000" />
+        <register type="NOA" address="0x00009888" value="0x04174000" />
+        <register type="NOA" address="0x00009888" value="0x0233A000" />
+        <register type="NOA" address="0x00009888" value="0x04333000" />
+        <register type="NOA" address="0x00009888" value="0x14348000" />
+        <register type="NOA" address="0x00009888" value="0x16348000" />
+        <register type="NOA" address="0x00009888" value="0x02357870" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x04360043" />
+        <register type="NOA" address="0x00009888" value="0x02360000" />
+        <register type="NOA" address="0x00009888" value="0x04371000" />
+        <register type="NOA" address="0x00009888" value="0x0E538000" />
+        <register type="NOA" address="0x00009888" value="0x00538000" />
+        <register type="NOA" address="0x00009888" value="0x06533000" />
+        <register type="NOA" address="0x00009888" value="0x1C540020" />
+        <register type="NOA" address="0x00009888" value="0x12548000" />
+        <register type="NOA" address="0x00009888" value="0x0E557000" />
+        <register type="NOA" address="0x00009888" value="0x00557800" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x06560043" />
+        <register type="NOA" address="0x00009888" value="0x02560000" />
+        <register type="NOA" address="0x00009888" value="0x06571000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900060" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900060" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_1"
+       hw_config_guid="469a05e5-e299-46f7-9598-7b05f3c34991"
+       chipset="KBLGT3"
+       symbol_name="TDL_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12120000" />
+        <register type="NOA" address="0x00009888" value="0x12320000" />
+        <register type="NOA" address="0x00009888" value="0x12520000" />
+        <register type="NOA" address="0x00009888" value="0x002F8000" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0015" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F03A0" />
+        <register type="NOA" address="0x00009888" value="0x0C0FF000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0095" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D4000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x02108000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x02118000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x02121880" />
+        <register type="NOA" address="0x00009888" value="0x041219B5" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x02134000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x0C308000" />
+        <register type="NOA" address="0x00009888" value="0x0E304000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x0C318000" />
+        <register type="NOA" address="0x00009888" value="0x0E314000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x0C321A80" />
+        <register type="NOA" address="0x00009888" value="0x0E320033" />
+        <register type="NOA" address="0x00009888" value="0x06320031" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x0C334000" />
+        <register type="NOA" address="0x00009888" value="0x0E331000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0E508000" />
+        <register type="NOA" address="0x00009888" value="0x00508000" />
+        <register type="NOA" address="0x00009888" value="0x02504000" />
+        <register type="NOA" address="0x00009888" value="0x0E518000" />
+        <register type="NOA" address="0x00009888" value="0x00518000" />
+        <register type="NOA" address="0x00009888" value="0x02514000" />
+        <register type="NOA" address="0x00009888" value="0x0E521880" />
+        <register type="NOA" address="0x00009888" value="0x00521A80" />
+        <register type="NOA" address="0x00009888" value="0x02520033" />
+        <register type="NOA" address="0x00009888" value="0x0E534000" />
+        <register type="NOA" address="0x00009888" value="0x00534000" />
+        <register type="NOA" address="0x00009888" value="0x02531000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900062" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x00007FFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x00009FFF" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000EFFF" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000F3FF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FDFF" />
+        <register type="OA" address="0x00002798" value="0x00000000" />
+        <register type="OA" address="0x0000279C" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_2"
+       hw_config_guid="52f925c6-786a-4ec6-86ce-cba85c83453a"
+       chipset="KBLGT3"
+       symbol_name="TDL_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort0"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort1"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort1"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort0"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort0"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort1"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12124D60" />
+        <register type="NOA" address="0x00009888" value="0x12322E60" />
+        <register type="NOA" address="0x00009888" value="0x12524D60" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0014" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0FE000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0097" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x002D8000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x04121FB7" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x00308000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x00318000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x00321B80" />
+        <register type="NOA" address="0x00009888" value="0x0632003F" />
+        <register type="NOA" address="0x00009888" value="0x00334000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0250C000" />
+        <register type="NOA" address="0x00009888" value="0x0251C000" />
+        <register type="NOA" address="0x00009888" value="0x02521FB7" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x02535000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900063" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extra Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extra"
+       hw_config_guid="efc497ac-884e-4ee4-a4a8-15fba22aaf21"
+       chipset="KBLGT3"
+       symbol_name="ComputeExtra"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active including Ext Math"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing including Extended Math processing"
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ  C 5 READ C 6 READ FADD C 7 READ FADD C 2 READ FADD C 3 READ FADD C 4 READ FADD 8 FMUL FADD 100 FMUL $EuCoresTotalCount FDIV $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active_adjusted"
+             units="percent"
+             symbol_name="Fpu1ActiveAdjusted"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121203E0" />
+        <register type="NOA" address="0x00009888" value="0x123203E0" />
+        <register type="NOA" address="0x00009888" value="0x125203E0" />
+        <register type="NOA" address="0x00009888" value="0x129203E0" />
+        <register type="NOA" address="0x00009888" value="0x12B203E0" />
+        <register type="NOA" address="0x00009888" value="0x12D203E0" />
+        <register type="NOA" address="0x00009888" value="0x024EC000" />
+        <register type="NOA" address="0x00009888" value="0x044EC000" />
+        <register type="NOA" address="0x00009888" value="0x064EC000" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0042" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F006D" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x042D8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06114000" />
+        <register type="NOA" address="0x00009888" value="0x06120033" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x04308000" />
+        <register type="NOA" address="0x00009888" value="0x04318000" />
+        <register type="NOA" address="0x00009888" value="0x04321980" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x04334000" />
+        <register type="NOA" address="0x00009888" value="0x04504000" />
+        <register type="NOA" address="0x00009888" value="0x04514000" />
+        <register type="NOA" address="0x00009888" value="0x04520033" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x04531000" />
+        <register type="NOA" address="0x00009888" value="0x00AF8000" />
+        <register type="NOA" address="0x00009888" value="0x0ACC0001" />
+        <register type="NOA" address="0x00009888" value="0x008D8000" />
+        <register type="NOA" address="0x00009888" value="0x028DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C8FB000" />
+        <register type="NOA" address="0x00009888" value="0x0E8F0001" />
+        <register type="NOA" address="0x00009888" value="0x06AC8000" />
+        <register type="NOA" address="0x00009888" value="0x02AD4000" />
+        <register type="NOA" address="0x00009888" value="0x02908000" />
+        <register type="NOA" address="0x00009888" value="0x02918000" />
+        <register type="NOA" address="0x00009888" value="0x02921980" />
+        <register type="NOA" address="0x00009888" value="0x00920000" />
+        <register type="NOA" address="0x00009888" value="0x02934000" />
+        <register type="NOA" address="0x00009888" value="0x02B04000" />
+        <register type="NOA" address="0x00009888" value="0x02B14000" />
+        <register type="NOA" address="0x00009888" value="0x02B20033" />
+        <register type="NOA" address="0x00009888" value="0x00B20000" />
+        <register type="NOA" address="0x00009888" value="0x02B31000" />
+        <register type="NOA" address="0x00009888" value="0x00D08000" />
+        <register type="NOA" address="0x00009888" value="0x00D18000" />
+        <register type="NOA" address="0x00009888" value="0x00D21980" />
+        <register type="NOA" address="0x00009888" value="0x00D34000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900002" />
+        <register type="NOA" address="0x00009888" value="0x53900420" />
+        <register type="NOA" address="0x00009888" value="0x459000A1" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+  </set>
+
+  <set name="Media Vme Pipe Gen9"
+       mdapi_supported_apis="MEDIA IO BB"
+       underscore_name="vme_pipe"
+       hw_config_guid="bfd9764d-2c5b-4c16-bfc1-89de3ca10917"
+       chipset="KBLGT3"
+       symbol_name="VMEPipe"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL  GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="8 A 10 READ FMUL $EuThreadsCount FDIV $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VME Busy"
+             description="The percentage of time in which VME (IME or CRE) was actively processing data."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ B 3 READ FADD 2 FDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vme_busy"
+             units="percent"
+             symbol_name="VMEBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Overview System Batch Tier2"
+             mdapi_group="VME Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A5800" />
+        <register type="NOA" address="0x00009888" value="0x161A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12180240" />
+        <register type="NOA" address="0x00009888" value="0x14180002" />
+        <register type="NOA" address="0x00009888" value="0x149A5800" />
+        <register type="NOA" address="0x00009888" value="0x169A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12980240" />
+        <register type="NOA" address="0x00009888" value="0x14980002" />
+        <register type="NOA" address="0x00009888" value="0x1A4E3FC0" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x022F8000" />
+        <register type="NOA" address="0x00009888" value="0x042F3000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C9500" />
+        <register type="NOA" address="0x00009888" value="0x0C4C002A" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5500" />
+        <register type="NOA" address="0x00009888" value="0x100F0015" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CAA00" />
+        <register type="NOA" address="0x00009888" value="0x182C000A" />
+        <register type="NOA" address="0x00009888" value="0x04193000" />
+        <register type="NOA" address="0x00009888" value="0x081A28C1" />
+        <register type="NOA" address="0x00009888" value="0x001A0000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x0613C000" />
+        <register type="NOA" address="0x00009888" value="0x0813F000" />
+        <register type="NOA" address="0x00009888" value="0x00172000" />
+        <register type="NOA" address="0x00009888" value="0x06178000" />
+        <register type="NOA" address="0x00009888" value="0x0817A000" />
+        <register type="NOA" address="0x00009888" value="0x00180037" />
+        <register type="NOA" address="0x00009888" value="0x06180940" />
+        <register type="NOA" address="0x00009888" value="0x08180000" />
+        <register type="NOA" address="0x00009888" value="0x02180000" />
+        <register type="NOA" address="0x00009888" value="0x04183000" />
+        <register type="NOA" address="0x00009888" value="0x04AFC000" />
+        <register type="NOA" address="0x00009888" value="0x06AF3000" />
+        <register type="NOA" address="0x00009888" value="0x0ACC4000" />
+        <register type="NOA" address="0x00009888" value="0x0CCC0015" />
+        <register type="NOA" address="0x00009888" value="0x0A8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E8F4000" />
+        <register type="NOA" address="0x00009888" value="0x108F0015" />
+        <register type="NOA" address="0x00009888" value="0x16ACA000" />
+        <register type="NOA" address="0x00009888" value="0x18AC000A" />
+        <register type="NOA" address="0x00009888" value="0x06993000" />
+        <register type="NOA" address="0x00009888" value="0x0C9A28C1" />
+        <register type="NOA" address="0x00009888" value="0x009A0000" />
+        <register type="NOA" address="0x00009888" value="0x0A93F000" />
+        <register type="NOA" address="0x00009888" value="0x0C93F000" />
+        <register type="NOA" address="0x00009888" value="0x0A97A000" />
+        <register type="NOA" address="0x00009888" value="0x0C97A000" />
+        <register type="NOA" address="0x00009888" value="0x0A980977" />
+        <register type="NOA" address="0x00009888" value="0x08980000" />
+        <register type="NOA" address="0x00009888" value="0x04980000" />
+        <register type="NOA" address="0x00009888" value="0x06983000" />
+        <register type="NOA" address="0x00009888" value="0x119000FF" />
+        <register type="NOA" address="0x00009888" value="0x51900040" />
+        <register type="NOA" address="0x00009888" value="0x41900020" />
+        <register type="NOA" address="0x00009888" value="0x55900004" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+        <register type="NOA" address="0x00009888" value="0x479008A5" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900002" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00100030" />
+        <register type="OA" address="0x00002774" value="0x0000FFF9" />
+        <register type="OA" address="0x00002778" value="0x00000002" />
+        <register type="OA" address="0x0000277C" value="0x0000FFFC" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000FFF3" />
+        <register type="OA" address="0x00002788" value="0x00100180" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00000002" />
+        <register type="OA" address="0x0000279C" value="0x0000FF3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00008003" />
+    </register_config>
+  </set>
+
+  <set name="MDAPI testing set Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="test_oa"
+       hw_config_guid="f1792f32-6db2-4b50-b4b2-557128f1688d"
+       chipset="KBLGT3"
+       symbol_name="TestOa"
+       >
+    <counter name="TestCounter7"
+             description="HW test counter 7. Factor: 0.666"
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="counter7"
+             units="events"
+             symbol_name="Counter7"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="TestCounter8"
+             description="HW test counter 8. Should be equal to 1."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="counter8"
+             units="events"
+             symbol_name="Counter8"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter4"
+             description="HW test counter 4. Factor: 0.333"
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="counter4"
+             units="events"
+             symbol_name="Counter4"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter5"
+             description="HW test counter 5. Factor: 0.333"
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="counter5"
+             units="events"
+             symbol_name="Counter5"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter6"
+             description="HW test counter 6. Factor: 0.166"
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="counter6"
+             units="events"
+             symbol_name="Counter6"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter3"
+             description="HW test counter 3. Factor: 0.5"
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="counter3"
+             units="events"
+             symbol_name="Counter3"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter0"
+             description="HW test counter 0. Factor: 0.0"
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="counter0"
+             units="events"
+             symbol_name="Counter0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter1"
+             description="HW test counter 1. Factor: 1.0"
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="counter1"
+             units="events"
+             symbol_name="Counter1"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter2"
+             description="HW test counter 2. Factor: 1.0"
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="counter2"
+             units="events"
+             symbol_name="Counter2"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810000" />
+        <register type="NOA" address="0x00009888" value="0x07810013" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930040" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x00000004" />
+        <register type="OA" address="0x00002774" value="0x00000000" />
+        <register type="OA" address="0x00002778" value="0x00000003" />
+        <register type="OA" address="0x0000277C" value="0x00000000" />
+        <register type="OA" address="0x00002780" value="0x00000007" />
+        <register type="OA" address="0x00002784" value="0x00000000" />
+        <register type="OA" address="0x00002788" value="0x00100002" />
+        <register type="OA" address="0x0000278C" value="0x0000FFF7" />
+        <register type="OA" address="0x00002790" value="0x00100002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00100082" />
+        <register type="OA" address="0x0000279C" value="0x0000FFEF" />
+        <register type="OA" address="0x000027A0" value="0x001000C2" />
+        <register type="OA" address="0x000027A4" value="0x0000FFE7" />
+        <register type="OA" address="0x000027A8" value="0x00100001" />
+        <register type="OA" address="0x000027AC" value="0x0000FFE7" />
+    </register_config>
+  </set>
+
+</metrics>
diff --git a/src/mesa/drivers/dri/i965/brw_oa_sklgt2.xml b/src/mesa/drivers/dri/i965/brw_oa_sklgt2.xml
new file mode 100644
index 0000000..eb28450
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_oa_sklgt2.xml
@@ -0,0 +1,10925 @@
+<?xml version="1.0"?>
+<metrics version="1491577975" merge_md5="">
+  <set name="Render Metrics Basic Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_basic"
+       hw_config_guid="f519e481-24d2-4d42-87c9-3fdd12c00202"
+       chipset="SKLGT2"
+       symbol_name="RenderBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Misses 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Cache Misses"
+             description="The total number of sampler cache misses in all LODs in all sampler units."
+             data_type="uint64"
+             equation="B 4 READ 8 UMUL"
+             underscore_name="sampler_l1_misses"
+             units="messages"
+             symbol_name="SamplerL1Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler 1 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 1 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler1_bottleneck"
+             units="percent"
+             symbol_name="Sampler1Bottleneck"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$SamplerL1Misses 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Sampler 0 Busy"
+             description="The percentage of time in which Sampler 0 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler0_busy"
+             units="percent"
+             symbol_name="Sampler0Busy"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler 1 Busy"
+             description="The percentage of time in which Sampler 1 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler1_busy"
+             units="percent"
+             symbol_name="Sampler1Busy"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Samplers Busy"
+             description="The percentage of time in which samplers have been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="$Sampler0Busy $Sampler1Busy FMAX"
+             underscore_name="samplers_busy"
+             units="percent"
+             symbol_name="SamplersBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI Fixed Pipe Throughput"
+             description="The total number of GPU memory bytes transferred between 3D Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD 64 UMUL"
+             underscore_name="gti_vf_throughput"
+             units="bytes"
+             symbol_name="GtiVfThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/3D Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler 0 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 0 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler0_bottleneck"
+             units="percent"
+             symbol_name="Sampler0Bottleneck"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="L3 Lookup Accesses w/o IC"
+             description="The total number of L3 cache lookup accesses w/o IC."
+             data_type="uint64"
+             equation="$SamplerL1Misses $ShaderMemoryAccesses UADD"
+             underscore_name="l3_lookups"
+             units="messages"
+             symbol_name="L3Lookups"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Depth Throughput"
+             description="The total number of GPU memory bytes transferred between depth caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 0 READ C 1 READ UADD 64 UMUL"
+             underscore_name="gti_depth_throughput"
+             units="bytes"
+             symbol_name="GtiDepthThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Depth Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Samplers Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which samplers have been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="$Sampler0Bottleneck $Sampler1Bottleneck FMAX"
+             max_equation="100"
+             underscore_name="sampler_bottleneck"
+             units="percent"
+             symbol_name="SamplerBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Indicate System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI HDC TLB Lookup Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and HDC, when HDC is doing TLB lookups."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_hdc_lookups_throughput"
+             units="bytes"
+             symbol_name="GtiHdcLookupsThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI RCC Throughput"
+             description="The total number of GPU memory bytes transferred between render color caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 2 READ C 3 READ UADD 64 UMUL"
+             underscore_name="gti_rcc_throughput"
+             units="bytes"
+             symbol_name="GtiRccThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Color Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x02 UGTE"
+                     priority="1"
+                     >
+        <register type="NOA" address="0x00009888" value="0x166C01E0" />
+        <register type="NOA" address="0x00009888" value="0x12170280" />
+        <register type="NOA" address="0x00009888" value="0x12370280" />
+        <register type="NOA" address="0x00009888" value="0x11930317" />
+        <register type="NOA" address="0x00009888" value="0x159303DF" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x1A4E0080" />
+        <register type="NOA" address="0x00009888" value="0x0A6C0053" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A1B4000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0001" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x042F1000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8400" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F6600" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x162C2200" />
+        <register type="NOA" address="0x00009888" value="0x062D8000" />
+        <register type="NOA" address="0x00009888" value="0x082D8000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x08133000" />
+        <register type="NOA" address="0x00009888" value="0x00170020" />
+        <register type="NOA" address="0x00009888" value="0x08170021" />
+        <register type="NOA" address="0x00009888" value="0x10170000" />
+        <register type="NOA" address="0x00009888" value="0x0633C000" />
+        <register type="NOA" address="0x00009888" value="0x0833C000" />
+        <register type="NOA" address="0x00009888" value="0x06370800" />
+        <register type="NOA" address="0x00009888" value="0x08370840" />
+        <register type="NOA" address="0x00009888" value="0x10370000" />
+        <register type="NOA" address="0x00009888" value="0x0D933031" />
+        <register type="NOA" address="0x00009888" value="0x0F933E3F" />
+        <register type="NOA" address="0x00009888" value="0x01933D00" />
+        <register type="NOA" address="0x00009888" value="0x0393073C" />
+        <register type="NOA" address="0x00009888" value="0x0593000E" />
+        <register type="NOA" address="0x00009888" value="0x1D930000" />
+        <register type="NOA" address="0x00009888" value="0x19930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900158" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D908000" />
+        <register type="NOA" address="0x00009888" value="0x2F908000" />
+        <register type="NOA" address="0x00009888" value="0x31908000" />
+        <register type="NOA" address="0x00009888" value="0x15908000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190001F" />
+        <register type="NOA" address="0x00009888" value="0x51904400" />
+        <register type="NOA" address="0x00009888" value="0x41900020" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C21" />
+        <register type="NOA" address="0x00009888" value="0x47900061" />
+        <register type="NOA" address="0x00009888" value="0x57904440" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900004" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x53904444" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Basic Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_basic"
+       hw_config_guid="fe47b29d-ae51-423e-bff4-27d965a95b60"
+       chipset="SKLGT2"
+       symbol_name="ComputeBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Bytes Read"
+             description="The total number of untyped memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 6 READ B 7 READ C 0 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_read"
+             units="bytes"
+             symbol_name="UntypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Bytes Written"
+             description="The total number of typed memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 3 READ B 4 READ B 5 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_written"
+             units="bytes"
+             symbol_name="TypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Writes"
+             description="The total number of untyped memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 1 READ C 2 READ C 3 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_written"
+             units="bytes"
+             symbol_name="UntypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Typed Bytes Read"
+             description="The total number of typed memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 0 READ B 1 READ B 2 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_read"
+             units="bytes"
+             symbol_name="TypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA"
+                     availability="$SliceMask 0x01 AND $SkuRevisionId 0x02 ULT &amp;&amp;"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x124F1C00" />
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F901403" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E8200" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x004F0DB2" />
+        <register type="NOA" address="0x00009888" value="0x064F0900" />
+        <register type="NOA" address="0x00009888" value="0x084F1880" />
+        <register type="NOA" address="0x00009888" value="0x0A4F0011" />
+        <register type="NOA" address="0x00009888" value="0x0C4F0E3C" />
+        <register type="NOA" address="0x00009888" value="0x0E4F1D80" />
+        <register type="NOA" address="0x00009888" value="0x086C0002" />
+        <register type="NOA" address="0x00009888" value="0x0A6C0100" />
+        <register type="NOA" address="0x00009888" value="0x0E6C000C" />
+        <register type="NOA" address="0x00009888" value="0x026C000B" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x081B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A1B8000" />
+        <register type="NOA" address="0x00009888" value="0x0E1B4000" />
+        <register type="NOA" address="0x00009888" value="0x021B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C4000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0012" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x005BC000" />
+        <register type="NOA" address="0x00009888" value="0x065B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B8000" />
+        <register type="NOA" address="0x00009888" value="0x0A5B4000" />
+        <register type="NOA" address="0x00009888" value="0x0C5BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B8000" />
+        <register type="NOA" address="0x00009888" value="0x105C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A5CA000" />
+        <register type="NOA" address="0x00009888" value="0x1C5C002D" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0800" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0082" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5500" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002CC000" />
+        <register type="NOA" address="0x00009888" value="0x0E2CC000" />
+        <register type="NOA" address="0x00009888" value="0x162CBE00" />
+        <register type="NOA" address="0x00009888" value="0x182C00EF" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x19900157" />
+        <register type="NOA" address="0x00009888" value="0x1B900167" />
+        <register type="NOA" address="0x00009888" value="0x1D900105" />
+        <register type="NOA" address="0x00009888" value="0x1F900103" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00000D28" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x11900FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900840" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900842" />
+        <register type="NOA" address="0x00009888" value="0x47900840" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900840" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900040" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900840" />
+        <register type="NOA" address="0x00009888" value="0x53901111" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SliceMask 0x01 AND $SkuRevisionId 0x02 UGTE &amp;&amp;"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x124F1C00" />
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F901403" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E0820" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x064F0900" />
+        <register type="NOA" address="0x00009888" value="0x084F0032" />
+        <register type="NOA" address="0x00009888" value="0x0A4F1810" />
+        <register type="NOA" address="0x00009888" value="0x0C4F0E00" />
+        <register type="NOA" address="0x00009888" value="0x0E4F003C" />
+        <register type="NOA" address="0x00009888" value="0x004F0D80" />
+        <register type="NOA" address="0x00009888" value="0x024F003B" />
+        <register type="NOA" address="0x00009888" value="0x006C0002" />
+        <register type="NOA" address="0x00009888" value="0x086C0000" />
+        <register type="NOA" address="0x00009888" value="0x0C6C000C" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B00" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x081B8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E1B8000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C8000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0024" />
+        <register type="NOA" address="0x00009888" value="0x065B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5BC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B8000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C6000" />
+        <register type="NOA" address="0x00009888" value="0x1C5C001B" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0208" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5500" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2CC000" />
+        <register type="NOA" address="0x00009888" value="0x162CFB00" />
+        <register type="NOA" address="0x00009888" value="0x182C00BE" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x19900157" />
+        <register type="NOA" address="0x00009888" value="0x1B900167" />
+        <register type="NOA" address="0x00009888" value="0x1D900105" />
+        <register type="NOA" address="0x00009888" value="0x1F900103" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x11900FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900842" />
+        <register type="NOA" address="0x00009888" value="0x47900802" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900802" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900002" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53901111" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Render Metrics for 3D Pipeline Profile Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_pipe_profile"
+       hw_config_guid="e0ad5ae0-84ba-4f29-a723-1906c12cb774"
+       chipset="SKLGT2"
+       symbol_name="RenderPipeProfile"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which vertex shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_bottleneck"
+             units="percent"
+             symbol_name="VsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Hi-Depth Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which early hierarchical depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hi_depth_bottleneck"
+             units="percent"
+             symbol_name="HiDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which geometry shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gs_bottleneck"
+             units="percent"
+             symbol_name="GsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Geometry Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="BC Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which barycentric coordinates calculation pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="bc_bottleneck"
+             units="percent"
+             symbol_name="BcBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Barycentric Calc"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Stall"
+             description="The percentage of time in which hull shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_stall"
+             units="percent"
+             symbol_name="HsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Hull Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="VF Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which vertex fetch pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vf_bottleneck"
+             units="percent"
+             symbol_name="VfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Input Assembler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Strip-Fans Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which strip-fans pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="10"
+             equation="B 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_bottleneck"
+             units="percent"
+             symbol_name="SfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SF Stall"
+             description="The percentage of time in which strip-fans pipeline stage was stalled."
+             data_type="float"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_stall"
+             units="percent"
+             symbol_name="SfStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Bottleneck"
+             low_watermark="3"
+             description="The percentage of time in which hull shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="9"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_bottleneck"
+             units="percent"
+             symbol_name="HsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Hull Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CL Stall"
+             description="The percentage of time in which clipper pipeline stage was stalled."
+             data_type="float"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_stall"
+             units="percent"
+             symbol_name="ClStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Clipper"
+             />
+    <counter name="SO Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which stream output pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_bottleneck"
+             units="percent"
+             symbol_name="SoBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Stream Output"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="DS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which domain shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_bottleneck"
+             units="percent"
+             symbol_name="DsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Domain Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Clipper Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which clipper pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_bottleneck"
+             units="percent"
+             symbol_name="ClBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Clipper"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Stall"
+             description="The percentage of time in which domain shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_stall"
+             units="percent"
+             symbol_name="DsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Domain Shader"
+             />
+    <counter name="Early Depth Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which early depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="early_depth_bottleneck"
+             units="percent"
+             symbol_name="EarlyDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SO Stall"
+             description="The percentage of time in which stream-output pipeline stage was stalled."
+             data_type="float"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_stall"
+             units="percent"
+             symbol_name="SoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Stream Output"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x02 ULT"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x0C0E001F" />
+        <register type="NOA" address="0x00009888" value="0x0A0F0000" />
+        <register type="NOA" address="0x00009888" value="0x10116800" />
+        <register type="NOA" address="0x00009888" value="0x178A03E0" />
+        <register type="NOA" address="0x00009888" value="0x11824C00" />
+        <register type="NOA" address="0x00009888" value="0x11830020" />
+        <register type="NOA" address="0x00009888" value="0x13840020" />
+        <register type="NOA" address="0x00009888" value="0x11850019" />
+        <register type="NOA" address="0x00009888" value="0x11860007" />
+        <register type="NOA" address="0x00009888" value="0x01870C40" />
+        <register type="NOA" address="0x00009888" value="0x17880000" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0040" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x040D4000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020E5400" />
+        <register type="NOA" address="0x00009888" value="0x000E0000" />
+        <register type="NOA" address="0x00009888" value="0x080F0040" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x100F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0040" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06110012" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x01898000" />
+        <register type="NOA" address="0x00009888" value="0x0D890100" />
+        <register type="NOA" address="0x00009888" value="0x03898000" />
+        <register type="NOA" address="0x00009888" value="0x09808000" />
+        <register type="NOA" address="0x00009888" value="0x0B808000" />
+        <register type="NOA" address="0x00009888" value="0x0380C000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A0075" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0000" />
+        <register type="NOA" address="0x00009888" value="0x118A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A4000" />
+        <register type="NOA" address="0x00009888" value="0x138A8000" />
+        <register type="NOA" address="0x00009888" value="0x1D81A000" />
+        <register type="NOA" address="0x00009888" value="0x15818000" />
+        <register type="NOA" address="0x00009888" value="0x17818000" />
+        <register type="NOA" address="0x00009888" value="0x0B820030" />
+        <register type="NOA" address="0x00009888" value="0x07828000" />
+        <register type="NOA" address="0x00009888" value="0x0D824000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x05824000" />
+        <register type="NOA" address="0x00009888" value="0x0D830003" />
+        <register type="NOA" address="0x00009888" value="0x0583000C" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x03838000" />
+        <register type="NOA" address="0x00009888" value="0x07838000" />
+        <register type="NOA" address="0x00009888" value="0x0B840980" />
+        <register type="NOA" address="0x00009888" value="0x03844D80" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x09848000" />
+        <register type="NOA" address="0x00009888" value="0x09850080" />
+        <register type="NOA" address="0x00009888" value="0x03850003" />
+        <register type="NOA" address="0x00009888" value="0x01850000" />
+        <register type="NOA" address="0x00009888" value="0x07860000" />
+        <register type="NOA" address="0x00009888" value="0x0F860400" />
+        <register type="NOA" address="0x00009888" value="0x09870032" />
+        <register type="NOA" address="0x00009888" value="0x01888052" />
+        <register type="NOA" address="0x00009888" value="0x11880000" />
+        <register type="NOA" address="0x00009888" value="0x09884000" />
+        <register type="NOA" address="0x00009888" value="0x15968000" />
+        <register type="NOA" address="0x00009888" value="0x17968000" />
+        <register type="NOA" address="0x00009888" value="0x0F96C000" />
+        <register type="NOA" address="0x00009888" value="0x1F950011" />
+        <register type="NOA" address="0x00009888" value="0x1D950014" />
+        <register type="NOA" address="0x00009888" value="0x0592C000" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D924000" />
+        <register type="NOA" address="0x00009888" value="0x0F924000" />
+        <register type="NOA" address="0x00009888" value="0x11928000" />
+        <register type="NOA" address="0x00009888" value="0x1392C000" />
+        <register type="NOA" address="0x00009888" value="0x09924000" />
+        <register type="NOA" address="0x00009888" value="0x01985000" />
+        <register type="NOA" address="0x00009888" value="0x07988000" />
+        <register type="NOA" address="0x00009888" value="0x09981000" />
+        <register type="NOA" address="0x00009888" value="0x0B982000" />
+        <register type="NOA" address="0x00009888" value="0x0D982000" />
+        <register type="NOA" address="0x00009888" value="0x0F989000" />
+        <register type="NOA" address="0x00009888" value="0x05982000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x0B978000" />
+        <register type="NOA" address="0x00009888" value="0x0F974000" />
+        <register type="NOA" address="0x00009888" value="0x11974000" />
+        <register type="NOA" address="0x00009888" value="0x13978000" />
+        <register type="NOA" address="0x00009888" value="0x09974000" />
+        <register type="NOA" address="0x00000D28" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x1190C080" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x419010A0" />
+        <register type="NOA" address="0x00009888" value="0x55904000" />
+        <register type="NOA" address="0x00009888" value="0x45901000" />
+        <register type="NOA" address="0x00009888" value="0x47900084" />
+        <register type="NOA" address="0x00009888" value="0x57904400" />
+        <register type="NOA" address="0x00009888" value="0x499000A5" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900081" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x439014A4" />
+        <register type="NOA" address="0x00009888" value="0x53900400" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x02 UGTE"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x0C0E001F" />
+        <register type="NOA" address="0x00009888" value="0x0A0F0000" />
+        <register type="NOA" address="0x00009888" value="0x10116800" />
+        <register type="NOA" address="0x00009888" value="0x178A03E0" />
+        <register type="NOA" address="0x00009888" value="0x11824C00" />
+        <register type="NOA" address="0x00009888" value="0x11830020" />
+        <register type="NOA" address="0x00009888" value="0x13840020" />
+        <register type="NOA" address="0x00009888" value="0x11850019" />
+        <register type="NOA" address="0x00009888" value="0x11860007" />
+        <register type="NOA" address="0x00009888" value="0x01870C40" />
+        <register type="NOA" address="0x00009888" value="0x17880000" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0040" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x040D4000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020E5400" />
+        <register type="NOA" address="0x00009888" value="0x000E0000" />
+        <register type="NOA" address="0x00009888" value="0x080F0040" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x100F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0040" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06110012" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x01898000" />
+        <register type="NOA" address="0x00009888" value="0x0D890100" />
+        <register type="NOA" address="0x00009888" value="0x03898000" />
+        <register type="NOA" address="0x00009888" value="0x09808000" />
+        <register type="NOA" address="0x00009888" value="0x0B808000" />
+        <register type="NOA" address="0x00009888" value="0x0380C000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A0075" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0000" />
+        <register type="NOA" address="0x00009888" value="0x118A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A4000" />
+        <register type="NOA" address="0x00009888" value="0x138A8000" />
+        <register type="NOA" address="0x00009888" value="0x1D81A000" />
+        <register type="NOA" address="0x00009888" value="0x15818000" />
+        <register type="NOA" address="0x00009888" value="0x17818000" />
+        <register type="NOA" address="0x00009888" value="0x0B820030" />
+        <register type="NOA" address="0x00009888" value="0x07828000" />
+        <register type="NOA" address="0x00009888" value="0x0D824000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x05824000" />
+        <register type="NOA" address="0x00009888" value="0x0D830003" />
+        <register type="NOA" address="0x00009888" value="0x0583000C" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x03838000" />
+        <register type="NOA" address="0x00009888" value="0x07838000" />
+        <register type="NOA" address="0x00009888" value="0x0B840980" />
+        <register type="NOA" address="0x00009888" value="0x03844D80" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x09848000" />
+        <register type="NOA" address="0x00009888" value="0x09850080" />
+        <register type="NOA" address="0x00009888" value="0x03850003" />
+        <register type="NOA" address="0x00009888" value="0x01850000" />
+        <register type="NOA" address="0x00009888" value="0x07860000" />
+        <register type="NOA" address="0x00009888" value="0x0F860400" />
+        <register type="NOA" address="0x00009888" value="0x09870032" />
+        <register type="NOA" address="0x00009888" value="0x01888052" />
+        <register type="NOA" address="0x00009888" value="0x11880000" />
+        <register type="NOA" address="0x00009888" value="0x09884000" />
+        <register type="NOA" address="0x00009888" value="0x1B931001" />
+        <register type="NOA" address="0x00009888" value="0x1D930001" />
+        <register type="NOA" address="0x00009888" value="0x19934000" />
+        <register type="NOA" address="0x00009888" value="0x1B958000" />
+        <register type="NOA" address="0x00009888" value="0x1D950094" />
+        <register type="NOA" address="0x00009888" value="0x19958000" />
+        <register type="NOA" address="0x00009888" value="0x05E5A000" />
+        <register type="NOA" address="0x00009888" value="0x01E5C000" />
+        <register type="NOA" address="0x00009888" value="0x0592C000" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D924000" />
+        <register type="NOA" address="0x00009888" value="0x0F924000" />
+        <register type="NOA" address="0x00009888" value="0x11928000" />
+        <register type="NOA" address="0x00009888" value="0x1392C000" />
+        <register type="NOA" address="0x00009888" value="0x09924000" />
+        <register type="NOA" address="0x00009888" value="0x01985000" />
+        <register type="NOA" address="0x00009888" value="0x07988000" />
+        <register type="NOA" address="0x00009888" value="0x09981000" />
+        <register type="NOA" address="0x00009888" value="0x0B982000" />
+        <register type="NOA" address="0x00009888" value="0x0D982000" />
+        <register type="NOA" address="0x00009888" value="0x0F989000" />
+        <register type="NOA" address="0x00009888" value="0x05982000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1190C080" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x419010A0" />
+        <register type="NOA" address="0x00009888" value="0x55904000" />
+        <register type="NOA" address="0x00009888" value="0x45901000" />
+        <register type="NOA" address="0x00009888" value="0x47900084" />
+        <register type="NOA" address="0x00009888" value="0x57904400" />
+        <register type="NOA" address="0x00009888" value="0x499000A5" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900081" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x439014A4" />
+        <register type="NOA" address="0x00009888" value="0x53900400" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFEA" />
+        <register type="OA" address="0x00002774" value="0x00007FFC" />
+        <register type="OA" address="0x00002778" value="0x0007AFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000F5FD" />
+        <register type="OA" address="0x00002780" value="0x00079FFA" />
+        <register type="OA" address="0x00002784" value="0x0000F3FB" />
+        <register type="OA" address="0x00002788" value="0x0007BF7A" />
+        <register type="OA" address="0x0000278C" value="0x0000F7E7" />
+        <register type="OA" address="0x00002790" value="0x0007FEFA" />
+        <register type="OA" address="0x00002794" value="0x0000F7CF" />
+        <register type="OA" address="0x00002798" value="0x00077FFA" />
+        <register type="OA" address="0x0000279C" value="0x0000EFDF" />
+        <register type="OA" address="0x000027A0" value="0x0006FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000CFBF" />
+        <register type="OA" address="0x000027A8" value="0x0003FFFA" />
+        <register type="OA" address="0x000027AC" value="0x00005F7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Reads Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_reads"
+       hw_config_guid="9bc436dd-6130-4add-affc-283eb6eaa864"
+       chipset="SKLGT2"
+       symbol_name="MemoryReads"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank0Reads"
+             description="The total number of GTI memory reads from L3 Bank 0 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_reads"
+             units="messages"
+             symbol_name="GtiL3Bank0Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all accesses from GTI to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiL3Bank3Reads"
+             description="The total number of GTI memory reads from L3 Bank 3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_reads"
+             units="messages"
+             symbol_name="GtiL3Bank3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiRsMemoryReads"
+             description="The total number of GTI memory reads from Resource Streamer."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_rs_memory_reads"
+             units="messages"
+             symbol_name="GtiRsMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Resource Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiHizMemoryReads"
+             description="The total number of GTI memory reads from Hierarchical Depth Cache (Hi-Depth Cache misses)."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_reads"
+             units="messages"
+             symbol_name="GtiHizMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="GtiRccMemoryReads"
+             description="The total number of GTI memory reads from Render Color Cache (Render Color Cache misses)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_reads"
+             units="messages"
+             symbol_name="GtiRccMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank1Reads"
+             description="The total number of GTI memory reads from L3 Bank 1 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_reads"
+             units="messages"
+             symbol_name="GtiL3Bank1Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiCmdStreamerMemoryReads"
+             description="The total number of GTI memory reads from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_reads"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="GtiL3Bank2Reads"
+             description="The total number of GTI memory reads from L3 Bank 2 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_reads"
+             units="messages"
+             symbol_name="GtiL3Bank2Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiMemoryReads"
+             description="The total number of GTI memory reads."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_reads"
+             units="messages"
+             symbol_name="GtiMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GtiRczMemoryReads"
+             description="The total number of GTI memory reads from Render Depth Cache (Render Depth Cache misses)."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_reads"
+             units="messages"
+             symbol_name="GtiRczMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="GtiMscMemoryReads"
+             description="The total number of GTI memory reads from Multisampling Color Cache (Multisampling Color Cache misses)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_reads"
+             units="messages"
+             symbol_name="GtiMscMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiVfMemoryReads"
+             description="The total number of GTI memory reads from Vertex Fetch."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="gti_vf_memory_reads"
+             units="messages"
+             symbol_name="GtiVfMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Vertex Fetch"
+             />
+    <counter name="GtiStcMemoryReads"
+             description="The total number of GTI memory reads from Stencil Cache (Stencil Cache misses)."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_reads"
+             units="messages"
+             symbol_name="GtiStcMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Reads"
+             description="The total number of GTI memory reads from L3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Reads $GtiL3Bank1Reads $GtiL3Bank2Reads $GtiL3Bank3Reads UADD UADD UADD"
+             underscore_name="gti_l3_reads"
+             units="messages"
+             symbol_name="GtiL3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA"
+                     availability="$SliceMask 0x01 AND $SkuRevisionId 0x02 ULT &amp;&amp;"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x13946000" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x0F968000" />
+        <register type="NOA" address="0x00009888" value="0x1196C000" />
+        <register type="NOA" address="0x00009888" value="0x13964000" />
+        <register type="NOA" address="0x00009888" value="0x11938000" />
+        <register type="NOA" address="0x00009888" value="0x1B93FE00" />
+        <register type="NOA" address="0x00009888" value="0x01940010" />
+        <register type="NOA" address="0x00009888" value="0x07941100" />
+        <register type="NOA" address="0x00009888" value="0x09941312" />
+        <register type="NOA" address="0x00009888" value="0x0B941514" />
+        <register type="NOA" address="0x00009888" value="0x0D941716" />
+        <register type="NOA" address="0x00009888" value="0x11940000" />
+        <register type="NOA" address="0x00009888" value="0x19940000" />
+        <register type="NOA" address="0x00009888" value="0x1B940000" />
+        <register type="NOA" address="0x00009888" value="0x1D940000" />
+        <register type="NOA" address="0x00009888" value="0x1B954000" />
+        <register type="NOA" address="0x00009888" value="0x1D95A550" />
+        <register type="NOA" address="0x00009888" value="0x1F9502AA" />
+        <register type="NOA" address="0x00009888" value="0x2F900157" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13908000" />
+        <register type="NOA" address="0x00009888" value="0x21908000" />
+        <register type="NOA" address="0x00009888" value="0x23908000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27908000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00000D28" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x05 ULT $SkuRevisionId 0x02 UGTE &amp;&amp;"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x13946000" />
+        <register type="NOA" address="0x00009888" value="0x15940016" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x19930800" />
+        <register type="NOA" address="0x00009888" value="0x1B93AA55" />
+        <register type="NOA" address="0x00009888" value="0x1D9300AA" />
+        <register type="NOA" address="0x00009888" value="0x01940010" />
+        <register type="NOA" address="0x00009888" value="0x07941100" />
+        <register type="NOA" address="0x00009888" value="0x09941312" />
+        <register type="NOA" address="0x00009888" value="0x0B941514" />
+        <register type="NOA" address="0x00009888" value="0x0D941716" />
+        <register type="NOA" address="0x00009888" value="0x0F940018" />
+        <register type="NOA" address="0x00009888" value="0x1B940000" />
+        <register type="NOA" address="0x00009888" value="0x11940000" />
+        <register type="NOA" address="0x00009888" value="0x01E58000" />
+        <register type="NOA" address="0x00009888" value="0x03E57000" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13908000" />
+        <register type="NOA" address="0x00009888" value="0x21908000" />
+        <register type="NOA" address="0x00009888" value="0x23908000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27908000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D908000" />
+        <register type="NOA" address="0x00009888" value="0x2F908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C20" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+        <register type="NOA" address="0x00009888" value="0x47900421" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900421" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900061" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x05 UGTE"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900064" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900150" />
+        <register type="NOA" address="0x00009888" value="0x21900151" />
+        <register type="NOA" address="0x00009888" value="0x23900152" />
+        <register type="NOA" address="0x00009888" value="0x25900153" />
+        <register type="NOA" address="0x00009888" value="0x27900154" />
+        <register type="NOA" address="0x00009888" value="0x29900155" />
+        <register type="NOA" address="0x00009888" value="0x2B900156" />
+        <register type="NOA" address="0x00009888" value="0x2D900157" />
+        <register type="NOA" address="0x00009888" value="0x2F90015F" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F872" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Writes Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_writes"
+       hw_config_guid="2ea0da8f-3527-4669-9d9d-13099a7435bf"
+       chipset="SKLGT2"
+       symbol_name="MemoryWrites"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiMemoryWrites"
+             description="The total number of GTI memory writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_writes"
+             units="messages"
+             symbol_name="GtiMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all GTI accesses to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiMscMemoryWrites"
+             description="The total number of GTI memory writes from Multisampling Color Cache (Multisampling Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_writes"
+             units="messages"
+             symbol_name="GtiMscMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiCmdStreamerMemoryWrites"
+             description="The total number of GTI memory writes from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_writes"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiL3Bank0Writes"
+             description="The total number of GTI memory writes from L3 Bank 0 (L3 Bank 0 invalidations)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_writes"
+             units="messages"
+             symbol_name="GtiL3Bank0Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank1Writes"
+             description="The total number of GTI memory writes from L3 Bank 1 (L3 Bank 1 invalidations)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_writes"
+             units="messages"
+             symbol_name="GtiL3Bank1Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank2Writes"
+             description="The total number of GTI memory writes from L3 Bank 2 (L3 Bank 2 invalidations)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_writes"
+             units="messages"
+             symbol_name="GtiL3Bank2Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank3Writes"
+             description="The total number of GTI memory writes from L3 Bank 3 (L3 Bank 3 invalidations)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_writes"
+             units="messages"
+             symbol_name="GtiL3Bank3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Writes"
+             description="The total number of GTI memory writes from L3 (L3 invalidations)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Writes $GtiL3Bank1Writes $GtiL3Bank2Writes $GtiL3Bank3Writes UADD UADD UADD"
+             underscore_name="gti_l3_writes"
+             units="messages"
+             symbol_name="GtiL3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiRccMemoryWrites"
+             description="The total number of GTI memory writes from Render Color Cache (Render Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_writes"
+             units="messages"
+             symbol_name="GtiRccMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiSoMemoryWrites"
+             description="The total number of GTI memory writes from Stream Output."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_so_memory_writes"
+             units="messages"
+             symbol_name="GtiSoMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Stream Output"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiStcMemoryWrites"
+             description="The total number of GTI memory writes from Stencil Cache."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_writes"
+             units="messages"
+             symbol_name="GtiStcMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GtiRczMemoryWrites"
+             description="The total number of GTI memory writes from Render Depth Cache."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_writes"
+             units="messages"
+             symbol_name="GtiRczMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="GtiHizMemoryWrites"
+             description="The total number of GTI memory writes from Hierarchical Depth Cache."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_writes"
+             units="messages"
+             symbol_name="GtiHizMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA"
+                     availability="$SliceMask 0x01 AND $SkuRevisionId 0x02 ULT &amp;&amp;"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x13945400" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F901400" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x0F968000" />
+        <register type="NOA" address="0x00009888" value="0x1196C000" />
+        <register type="NOA" address="0x00009888" value="0x13964000" />
+        <register type="NOA" address="0x00009888" value="0x11938000" />
+        <register type="NOA" address="0x00009888" value="0x1B93FE00" />
+        <register type="NOA" address="0x00009888" value="0x01940010" />
+        <register type="NOA" address="0x00009888" value="0x07941100" />
+        <register type="NOA" address="0x00009888" value="0x09941312" />
+        <register type="NOA" address="0x00009888" value="0x0B941514" />
+        <register type="NOA" address="0x00009888" value="0x0D941716" />
+        <register type="NOA" address="0x00009888" value="0x11940000" />
+        <register type="NOA" address="0x00009888" value="0x19940000" />
+        <register type="NOA" address="0x00009888" value="0x1B940000" />
+        <register type="NOA" address="0x00009888" value="0x1D940000" />
+        <register type="NOA" address="0x00009888" value="0x1B954000" />
+        <register type="NOA" address="0x00009888" value="0x1D95A550" />
+        <register type="NOA" address="0x00009888" value="0x1F9502AA" />
+        <register type="NOA" address="0x00009888" value="0x2F900167" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13908000" />
+        <register type="NOA" address="0x00009888" value="0x21908000" />
+        <register type="NOA" address="0x00009888" value="0x23908000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27908000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00000D28" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x05 ULT $SkuRevisionId 0x02 UGTE &amp;&amp;"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x13945400" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F901400" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x19930800" />
+        <register type="NOA" address="0x00009888" value="0x1B93AA55" />
+        <register type="NOA" address="0x00009888" value="0x1D93002A" />
+        <register type="NOA" address="0x00009888" value="0x01940010" />
+        <register type="NOA" address="0x00009888" value="0x07941100" />
+        <register type="NOA" address="0x00009888" value="0x09941312" />
+        <register type="NOA" address="0x00009888" value="0x0B941514" />
+        <register type="NOA" address="0x00009888" value="0x0D941716" />
+        <register type="NOA" address="0x00009888" value="0x1B940000" />
+        <register type="NOA" address="0x00009888" value="0x11940000" />
+        <register type="NOA" address="0x00009888" value="0x01E58000" />
+        <register type="NOA" address="0x00009888" value="0x03E57000" />
+        <register type="NOA" address="0x00009888" value="0x2F900167" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x13908000" />
+        <register type="NOA" address="0x00009888" value="0x21908000" />
+        <register type="NOA" address="0x00009888" value="0x23908000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27908000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C20" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+        <register type="NOA" address="0x00009888" value="0x47900421" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900421" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="NOA"
+                     availability="$SkuRevisionId 0x05 UGTE"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F901000" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900160" />
+        <register type="NOA" address="0x00009888" value="0x21900161" />
+        <register type="NOA" address="0x00009888" value="0x23900162" />
+        <register type="NOA" address="0x00009888" value="0x25900163" />
+        <register type="NOA" address="0x00009888" value="0x27900164" />
+        <register type="NOA" address="0x00009888" value="0x29900165" />
+        <register type="NOA" address="0x00009888" value="0x2B900166" />
+        <register type="NOA" address="0x00009888" value="0x2D900167" />
+        <register type="NOA" address="0x00009888" value="0x2F900150" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F822" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extended Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extended"
+       hw_config_guid="d97d16af-028b-4cd1-a672-6210cb5513dd"
+       chipset="SKLGT2"
+       symbol_name="ComputeExtended"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Writes 0"
+             description="The subslice 0 typed writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="typed_writes0"
+             units="messages"
+             symbol_name="TypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuTypedAtomics0"
+             description="The subslice 0 EU Typed Atomics subslice 0."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_typed_atomics0"
+             units="messages"
+             symbol_name="EuTypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Atomics 0"
+             description="The subslice 0 typed atomics."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="typed_atomics0"
+             units="messages"
+             symbol_name="TypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedAtomicsPerCacheLine"
+             description="The ratio of EU typed atomics requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedAtomics0 $TypedAtomics0 FDIV"
+             underscore_name="typed_atomics_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedAtomicsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedReads0"
+             description="The subslice 0 EU Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="eu_untyped_reads0"
+             units="messages"
+             symbol_name="EuUntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Writes 0"
+             description="The subslice 0 untyped writes (including SLM writes)."
+             data_type="uint64"
+             equation="C 1 READ"
+             underscore_name="untyped_writes0"
+             units="messages"
+             symbol_name="UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedAtomics0"
+             description="The subslice 0 EU Untyped Atomics subslice 0."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="eu_untyped_atomics0"
+             units="messages"
+             symbol_name="EuUntypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedWrites0"
+             description="The subslice 0 EU Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="eu_untyped_writes0"
+             units="messages"
+             symbol_name="EuUntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedWrites0"
+             description="The subslice 0 EU A64 Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="eu_a64_untyped_writes0"
+             units="messages"
+             symbol_name="EuA64UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedWritesPerCacheLine"
+             description="The ratio of EU untyped write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuUntypedWrites0 $EuA64UntypedWrites0 UADD $UntypedWrites0 FDIV"
+             underscore_name="untyped_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedWrites0"
+             description="The subslice 0 EU Typed Writes subslice 0."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="eu_typed_writes0"
+             units="messages"
+             symbol_name="EuTypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedWritesPerCacheLine"
+             description="The ratio of EU typed write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedWrites0 $TypedWrites0 FDIV"
+             underscore_name="typed_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Reads 0"
+             description="The subslice 0 typed reads."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="typed_reads0"
+             units="messages"
+             symbol_name="TypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Reads 0"
+             description="The subslice 0 untyped reads (including SLM reads)."
+             data_type="uint64"
+             equation="C 3 READ"
+             underscore_name="untyped_reads0"
+             units="messages"
+             symbol_name="UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedReads0"
+             description="The subslice 0 EU A64 Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="eu_a64_untyped_reads0"
+             units="messages"
+             symbol_name="EuA64UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedReads0"
+             description="The subslice 0 EU Typed Reads subslice 0."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="eu_typed_reads0"
+             units="messages"
+             symbol_name="EuTypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedReadsPerCacheLine"
+             description="The ratio of EU untyped read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuUntypedReads0 $EuA64UntypedReads0 UADD $UntypedReads0 FDIV"
+             underscore_name="untyped_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedReadsPerCacheLine"
+             description="The ratio of EU typed read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuTypedReads0 $TypedReads0 FDIV"
+             underscore_name="typed_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA"
+                     availability="$SubsliceMask 0x01 AND"
+                     priority="0"
+                     >
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x141C8160" />
+        <register type="NOA" address="0x00009888" value="0x161C8015" />
+        <register type="NOA" address="0x00009888" value="0x181C0120" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4EAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B01" />
+        <register type="NOA" address="0x00009888" value="0x006C0200" />
+        <register type="NOA" address="0x00009888" value="0x026C000C" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x001C0041" />
+        <register type="NOA" address="0x00009888" value="0x061C4200" />
+        <register type="NOA" address="0x00009888" value="0x081C4443" />
+        <register type="NOA" address="0x00009888" value="0x0A1C4645" />
+        <register type="NOA" address="0x00009888" value="0x0C1C7647" />
+        <register type="NOA" address="0x00009888" value="0x041C7357" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x101C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0000" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4CAA2A" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02AA" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5515" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CAA00" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00000D28" value="0x00000000" />
+        <register type="NOA" address="0x00009888" value="0x11907FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900040" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900802" />
+        <register type="NOA" address="0x00009888" value="0x47900842" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900842" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900800" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FC2A" />
+        <register type="OA" address="0x00002774" value="0x0000BF00" />
+        <register type="OA" address="0x00002778" value="0x0007FC6A" />
+        <register type="OA" address="0x0000277C" value="0x0000BF00" />
+        <register type="OA" address="0x00002780" value="0x0007FC92" />
+        <register type="OA" address="0x00002784" value="0x0000BF00" />
+        <register type="OA" address="0x00002788" value="0x0007FCA2" />
+        <register type="OA" address="0x0000278C" value="0x0000BF00" />
+        <register type="OA" address="0x00002790" value="0x0007FC32" />
+        <register type="OA" address="0x00002794" value="0x0000BF00" />
+        <register type="OA" address="0x00002798" value="0x0007FC9A" />
+        <register type="OA" address="0x0000279C" value="0x0000BF00" />
+        <register type="OA" address="0x000027A0" value="0x0007FE6A" />
+        <register type="OA" address="0x000027A4" value="0x0000BF00" />
+        <register type="OA" address="0x000027A8" value="0x0007FE7A" />
+        <register type="OA" address="0x000027AC" value="0x0000BF00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics L3 Cache Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_l3_cache"
+       hw_config_guid="9fb22842-e708-43f7-9752-e0e41670c39e"
+       chipset="SKLGT2"
+       symbol_name="ComputeL3Cache"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 03 Accesses"
+             description="The total number of accesses to L3 Bank 03."
+             data_type="uint64"
+             equation="B 3 READ 2 UMUL"
+             underscore_name="l3_bank03_accesses"
+             units="messages"
+             symbol_name="L3Bank03Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Accesses"
+             description="The total number of L3 accesses from all entities."
+             data_type="uint64"
+             equation="C 0 READ C 1 READ B 2 READ B 3 READ UADD UADD UADD 2 UMUL"
+             underscore_name="l3_accesses"
+             units="messages"
+             symbol_name="L3Accesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 29 READ 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="EU FPU0 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu0_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ C 5 READ UADD"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="L3 Bank 00 Accesses"
+             description="The total number of accesses to L3 Bank 00."
+             data_type="uint64"
+             equation="C 0 READ 2 UMUL"
+             underscore_name="l3_bank00_accesses"
+             units="messages"
+             symbol_name="L3Bank00Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU0 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 19 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu0_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 14 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu1_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ C 5 READ UADD 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 00 IC Accesses"
+             description="The total number of accesses to L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 0 READ B 1 READ UADD 2 UMUL $L3Bank00Accesses UMIN"
+             underscore_name="l3_bank00_ic_accesses"
+             units="messages"
+             symbol_name="L3Bank00IcAccesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 00 IC Hits"
+             description="The total number of hits in L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 1 READ 2 UMUL $L3Bank00IcAccesses UMIN"
+             underscore_name="l3_bank00_ic_hits"
+             units="messages"
+             symbol_name="L3Bank00IcHits"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="Sampler Accesses"
+             description="The total number of messages sent to samplers."
+             data_type="uint64"
+             equation="A 28 READ"
+             underscore_name="sampler_accesses"
+             units="messages"
+             symbol_name="SamplerAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler"
+             />
+    <counter name="L3 Bank 01 Accesses"
+             description="The total number of accesses to L3 Bank 01."
+             data_type="uint64"
+             equation="C 1 READ 2 UMUL"
+             underscore_name="l3_bank01_accesses"
+             units="messages"
+             symbol_name="L3Bank01Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 20 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu1_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="L3 Bank 02 Accesses"
+             description="The total number of accesses to L3 Bank 02."
+             data_type="uint64"
+             equation="B 2 READ 2 UMUL"
+             underscore_name="l3_bank02_accesses"
+             units="messages"
+             symbol_name="L3Bank02Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="L3 Total Throughput"
+             description="The total number of GPU memory bytes transferred via L3."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Accesses 64 UMUL"
+             underscore_name="l3_total_throughput"
+             units="bytes"
+             symbol_name="L3TotalThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="EU FPU1 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C0760" />
+        <register type="NOA" address="0x00009888" value="0x1593001E" />
+        <register type="NOA" address="0x00009888" value="0x3F901403" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E8020" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x006C0051" />
+        <register type="NOA" address="0x00009888" value="0x066C5000" />
+        <register type="NOA" address="0x00009888" value="0x086C5C5D" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5E5F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x061B8000" />
+        <register type="NOA" address="0x00009888" value="0x081BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1CE000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2A00" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0280" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F1500" />
+        <register type="NOA" address="0x00009888" value="0x100F0140" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162C0A00" />
+        <register type="NOA" address="0x00009888" value="0x182C00A0" />
+        <register type="NOA" address="0x00009888" value="0x03933300" />
+        <register type="NOA" address="0x00009888" value="0x05930032" />
+        <register type="NOA" address="0x00009888" value="0x11930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900167" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190030F" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900042" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x53901111" />
+        <register type="NOA" address="0x00009888" value="0x43900420" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFFA" />
+        <register type="OA" address="0x00002774" value="0x0000FEFE" />
+        <register type="OA" address="0x00002778" value="0x0007FFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000FEFD" />
+        <register type="OA" address="0x00002790" value="0x0007FFFA" />
+        <register type="OA" address="0x00002794" value="0x0000FBEF" />
+        <register type="OA" address="0x00002798" value="0x0007FFFA" />
+        <register type="OA" address="0x0000279C" value="0x0000FBDF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00101100" />
+        <register type="FLEX" address="0x0000E45C" value="0x00201200" />
+        <register type="FLEX" address="0x0000E55C" value="0x00301300" />
+        <register type="FLEX" address="0x0000E65C" value="0x00401400" />
+    </register_config>
+  </set>
+
+  <set name="Metric set HDCAndSF"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="hdc_and_sf"
+       hw_config_guid="5378e2a1-4248-4188-a4ae-da25a794c603"
+       chipset="SKLGT2"
+       symbol_name="HDCAndSF"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were actively processed on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Polygon Data Ready"
+             description="The percentage of time in which geometry pipeline output is ready."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="poly_data_ready"
+             units="percent"
+             symbol_name="PolyDataReady"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="HDC stalled by L3 (s0.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss1)."
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ C 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader01_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader01AccessStalledOnL3"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss2)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss2)."
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ C 2 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader02_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader02AccessStalledOnL3"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ C 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader00_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader00AccessStalledOnL3"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F0232" />
+        <register type="NOA" address="0x00009888" value="0x124F4640" />
+        <register type="NOA" address="0x00009888" value="0x106C0232" />
+        <register type="NOA" address="0x00009888" value="0x11834400" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x004F1880" />
+        <register type="NOA" address="0x00009888" value="0x024F08BB" />
+        <register type="NOA" address="0x00009888" value="0x044F001B" />
+        <register type="NOA" address="0x00009888" value="0x046C0100" />
+        <register type="NOA" address="0x00009888" value="0x066C000B" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x041B8000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025BC000" />
+        <register type="NOA" address="0x00009888" value="0x045B4000" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x165C8000" />
+        <register type="NOA" address="0x00009888" value="0x185C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00A0" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x062CC000" />
+        <register type="NOA" address="0x00009888" value="0x082CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x0F8305C0" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x07830000" />
+        <register type="NOA" address="0x00009888" value="0x1D950080" />
+        <register type="NOA" address="0x00009888" value="0x13928000" />
+        <register type="NOA" address="0x00009888" value="0x0F988000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x4B9000A0" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x10800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_1"
+       hw_config_guid="f42cdd6a-b000-42cb-870f-5eb423a7f514"
+       chipset="SKLGT2"
+       symbol_name="L3_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank1 Active"
+             description="The percentage of time in which slice0 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_active"
+             units="percent"
+             symbol_name="L30Bank1Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 L3 Bank1 Stalled"
+             description="The percentage of time in which slice0 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_stalled"
+             units="percent"
+             symbol_name="L30Bank1Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Active"
+             description="The percentage of time in which slice0 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_active"
+             units="percent"
+             symbol_name="L30Bank0Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Stalled"
+             description="The percentage of time in which slice0 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_stalled"
+             units="percent"
+             symbol_name="L30Bank0Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C7B40" />
+        <register type="NOA" address="0x00009888" value="0x166C0020" />
+        <register type="NOA" address="0x00009888" value="0x0A603444" />
+        <register type="NOA" address="0x00009888" value="0x0A613400" />
+        <register type="NOA" address="0x00009888" value="0x1A4EA800" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x0C6C5327" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5425" />
+        <register type="NOA" address="0x00009888" value="0x006C2A00" />
+        <register type="NOA" address="0x00009888" value="0x026C285B" />
+        <register type="NOA" address="0x00009888" value="0x046C005C" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0800" />
+        <register type="NOA" address="0x00009888" value="0x0C1BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C003C" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x10600000" />
+        <register type="NOA" address="0x00009888" value="0x04600000" />
+        <register type="NOA" address="0x00009888" value="0x0C610044" />
+        <register type="NOA" address="0x00009888" value="0x10610000" />
+        <register type="NOA" address="0x00009888" value="0x06610000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02A8" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0154" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190FFC0" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900420" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900021" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900400" />
+        <register type="NOA" address="0x00009888" value="0x43900421" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_2"
+       hw_config_guid="b9bf2423-d88c-4a7b-a051-627611d00dcc"
+       chipset="SKLGT2"
+       symbol_name="L3_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Stalled"
+             description="The percentage of time in which slice0 L3 bank2 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_stalled"
+             units="percent"
+             symbol_name="L30Bank2Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Active"
+             description="The percentage of time in which slice0 L3 bank2 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_active"
+             units="percent"
+             symbol_name="L30Bank2Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C02E0" />
+        <register type="NOA" address="0x00009888" value="0x146C0001" />
+        <register type="NOA" address="0x00009888" value="0x0A623400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x026C3324" />
+        <register type="NOA" address="0x00009888" value="0x046C3422" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x06614000" />
+        <register type="NOA" address="0x00009888" value="0x0C620044" />
+        <register type="NOA" address="0x00009888" value="0x10620000" />
+        <register type="NOA" address="0x00009888" value="0x06620000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_3"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_3"
+       hw_config_guid="2414a93d-d84f-406e-99c0-472161194b40"
+       chipset="SKLGT2"
+       symbol_name="L3_3"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 L3 Bank3 Stalled"
+             description="The percentage of time in which slice0 L3 bank3 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_stalled"
+             units="percent"
+             symbol_name="L30Bank3Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank3 Active"
+             description="The percentage of time in which slice0 L3 bank3 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_active"
+             units="percent"
+             symbol_name="L30Bank3Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C4E80" />
+        <register type="NOA" address="0x00009888" value="0x146C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A633400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x026C3321" />
+        <register type="NOA" address="0x00009888" value="0x046C342F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C2000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x06604000" />
+        <register type="NOA" address="0x00009888" value="0x0C630044" />
+        <register type="NOA" address="0x00009888" value="0x10630000" />
+        <register type="NOA" address="0x00009888" value="0x06630000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00AA" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900002" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set RasterizerAndPixelBackend"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="rasterizer_and_pixel_backend"
+       hw_config_guid="53a45d2d-170b-4cf5-b7bb-585120c8e2f5"
+       chipset="SKLGT2"
+       symbol_name="RasterizerAndPixelBackend"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 Pixel Values Ready"
+             description="The percentage of time in which slice0 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values0_ready"
+             units="percent"
+             symbol_name="PixelValues0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Slice0 Rasterizer Input Available"
+             description="The percentage of time in which slice0 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_input_available"
+             units="percent"
+             symbol_name="Rasterizer0InputAvailable"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice0 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data0_ready"
+             units="percent"
+             symbol_name="PixelData0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Rasterizer Output Ready"
+             description="The percentage of time in which slice0 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_output_ready"
+             units="percent"
+             symbol_name="Rasterizer0OutputReady"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Slice0 PS Output Available"
+             description="The percentage of time in which slice0 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output0_available"
+             units="percent"
+             symbol_name="PSOutput0Available"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x102F3800" />
+        <register type="NOA" address="0x00009888" value="0x144D0500" />
+        <register type="NOA" address="0x00009888" value="0x120D03C0" />
+        <register type="NOA" address="0x00009888" value="0x140D03CF" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0004" />
+        <register type="NOA" address="0x00009888" value="0x0C4E4000" />
+        <register type="NOA" address="0x00009888" value="0x042F0480" />
+        <register type="NOA" address="0x00009888" value="0x082F0000" />
+        <register type="NOA" address="0x00009888" value="0x022F0000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0090" />
+        <register type="NOA" address="0x00009888" value="0x064D0027" />
+        <register type="NOA" address="0x00009888" value="0x004D0000" />
+        <register type="NOA" address="0x00009888" value="0x000D0D40" />
+        <register type="NOA" address="0x00009888" value="0x020D803F" />
+        <register type="NOA" address="0x00009888" value="0x040D8023" />
+        <register type="NOA" address="0x00009888" value="0x100D0000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020F0010" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0050" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41901400" />
+        <register type="NOA" address="0x00009888" value="0x43901485" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900001" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000EFFF" />
+        <register type="OA" address="0x00002778" value="0x00006000" />
+        <register type="OA" address="0x0000277C" value="0x0000F3FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler"
+       hw_config_guid="b4cff514-a91e-4798-a0b3-426ca13fc9c1"
+       chipset="SKLGT2"
+       symbol_name="Sampler"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice2 Input Available"
+             description="The percentage of time in which slice0 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_input_available"
+             units="percent"
+             symbol_name="Sampler02InputAvailable"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice0 Input Available"
+             description="The percentage of time in which slice0 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_input_available"
+             units="percent"
+             symbol_name="Sampler00InputAvailable"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_output_ready"
+             units="percent"
+             symbol_name="Sampler02OutputReady"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice0 Subslice1 Input Available"
+             description="The percentage of time in which slice0 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_input_available"
+             units="percent"
+             symbol_name="Sampler01InputAvailable"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice0 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_output_ready"
+             units="percent"
+             symbol_name="Sampler00OutputReady"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice0 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_output_ready"
+             units="percent"
+             symbol_name="Sampler01OutputReady"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x14152C00" />
+        <register type="NOA" address="0x00009888" value="0x16150005" />
+        <register type="NOA" address="0x00009888" value="0x121600A0" />
+        <register type="NOA" address="0x00009888" value="0x14352C00" />
+        <register type="NOA" address="0x00009888" value="0x16350005" />
+        <register type="NOA" address="0x00009888" value="0x123600A0" />
+        <register type="NOA" address="0x00009888" value="0x14552C00" />
+        <register type="NOA" address="0x00009888" value="0x16550005" />
+        <register type="NOA" address="0x00009888" value="0x125600A0" />
+        <register type="NOA" address="0x00009888" value="0x062F6000" />
+        <register type="NOA" address="0x00009888" value="0x022F2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0050" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0010" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0350" />
+        <register type="NOA" address="0x00009888" value="0x0C0FB000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F00DA" />
+        <register type="NOA" address="0x00009888" value="0x182C0028" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x022DC000" />
+        <register type="NOA" address="0x00009888" value="0x042D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C138000" />
+        <register type="NOA" address="0x00009888" value="0x0E132000" />
+        <register type="NOA" address="0x00009888" value="0x0413C000" />
+        <register type="NOA" address="0x00009888" value="0x1C140018" />
+        <register type="NOA" address="0x00009888" value="0x0C157000" />
+        <register type="NOA" address="0x00009888" value="0x0E150078" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x04162180" />
+        <register type="NOA" address="0x00009888" value="0x02160000" />
+        <register type="NOA" address="0x00009888" value="0x04174000" />
+        <register type="NOA" address="0x00009888" value="0x0233A000" />
+        <register type="NOA" address="0x00009888" value="0x04333000" />
+        <register type="NOA" address="0x00009888" value="0x14348000" />
+        <register type="NOA" address="0x00009888" value="0x16348000" />
+        <register type="NOA" address="0x00009888" value="0x02357870" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x04360043" />
+        <register type="NOA" address="0x00009888" value="0x02360000" />
+        <register type="NOA" address="0x00009888" value="0x04371000" />
+        <register type="NOA" address="0x00009888" value="0x0E538000" />
+        <register type="NOA" address="0x00009888" value="0x00538000" />
+        <register type="NOA" address="0x00009888" value="0x06533000" />
+        <register type="NOA" address="0x00009888" value="0x1C540020" />
+        <register type="NOA" address="0x00009888" value="0x12548000" />
+        <register type="NOA" address="0x00009888" value="0x0E557000" />
+        <register type="NOA" address="0x00009888" value="0x00557800" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x06560043" />
+        <register type="NOA" address="0x00009888" value="0x02560000" />
+        <register type="NOA" address="0x00009888" value="0x06571000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900060" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900060" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_1"
+       hw_config_guid="7821d13b-9b8b-4405-9618-78cd56b62cce"
+       chipset="SKLGT2"
+       symbol_name="TDL_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12120000" />
+        <register type="NOA" address="0x00009888" value="0x12320000" />
+        <register type="NOA" address="0x00009888" value="0x12520000" />
+        <register type="NOA" address="0x00009888" value="0x002F8000" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0015" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F03A0" />
+        <register type="NOA" address="0x00009888" value="0x0C0FF000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0095" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D4000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x02108000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x02118000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x02121880" />
+        <register type="NOA" address="0x00009888" value="0x041219B5" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x02134000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x0C308000" />
+        <register type="NOA" address="0x00009888" value="0x0E304000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x0C318000" />
+        <register type="NOA" address="0x00009888" value="0x0E314000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x0C321A80" />
+        <register type="NOA" address="0x00009888" value="0x0E320033" />
+        <register type="NOA" address="0x00009888" value="0x06320031" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x0C334000" />
+        <register type="NOA" address="0x00009888" value="0x0E331000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0E508000" />
+        <register type="NOA" address="0x00009888" value="0x00508000" />
+        <register type="NOA" address="0x00009888" value="0x02504000" />
+        <register type="NOA" address="0x00009888" value="0x0E518000" />
+        <register type="NOA" address="0x00009888" value="0x00518000" />
+        <register type="NOA" address="0x00009888" value="0x02514000" />
+        <register type="NOA" address="0x00009888" value="0x0E521880" />
+        <register type="NOA" address="0x00009888" value="0x00521A80" />
+        <register type="NOA" address="0x00009888" value="0x02520033" />
+        <register type="NOA" address="0x00009888" value="0x0E534000" />
+        <register type="NOA" address="0x00009888" value="0x00534000" />
+        <register type="NOA" address="0x00009888" value="0x02531000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900062" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x00007FFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x00009FFF" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000EFFF" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000F3FF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FDFF" />
+        <register type="OA" address="0x00002798" value="0x00000000" />
+        <register type="OA" address="0x0000279C" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_2"
+       hw_config_guid="893f1a4d-919d-4388-8cb7-746d73ea7259"
+       chipset="SKLGT2"
+       symbol_name="TDL_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort0"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort1"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort1"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort0"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort0"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort1"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12124D60" />
+        <register type="NOA" address="0x00009888" value="0x12322E60" />
+        <register type="NOA" address="0x00009888" value="0x12524D60" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0014" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0FE000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0097" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x002D8000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x04121FB7" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x00308000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x00318000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x00321B80" />
+        <register type="NOA" address="0x00009888" value="0x0632003F" />
+        <register type="NOA" address="0x00009888" value="0x00334000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0250C000" />
+        <register type="NOA" address="0x00009888" value="0x0251C000" />
+        <register type="NOA" address="0x00009888" value="0x02521FB7" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x02535000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900063" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extra Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extra"
+       hw_config_guid="41a24047-7484-4ead-ae37-de907e5ff2b2"
+       chipset="SKLGT2"
+       symbol_name="ComputeExtra"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active including Ext Math"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing including Extended Math processing"
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ  C 7 READ C 6 READ FADD C 5 READ FADD 8 FMUL FADD 100 FMUL $EuCoresTotalCount FDIV $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active_adjusted"
+             units="percent"
+             symbol_name="Fpu1ActiveAdjusted"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121203E0" />
+        <register type="NOA" address="0x00009888" value="0x123203E0" />
+        <register type="NOA" address="0x00009888" value="0x125203E0" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0040" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F006C" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x042D8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06114000" />
+        <register type="NOA" address="0x00009888" value="0x06120033" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x04308000" />
+        <register type="NOA" address="0x00009888" value="0x04318000" />
+        <register type="NOA" address="0x00009888" value="0x04321980" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x04334000" />
+        <register type="NOA" address="0x00009888" value="0x04504000" />
+        <register type="NOA" address="0x00009888" value="0x04514000" />
+        <register type="NOA" address="0x00009888" value="0x04520033" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x04531000" />
+        <register type="NOA" address="0x00009888" value="0x1190E000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x43900C00" />
+        <register type="NOA" address="0x00009888" value="0x45900002" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00001000" />
+        <register type="FLEX" address="0x0000E558" value="0x00003002" />
+        <register type="FLEX" address="0x0000E658" value="0x00005004" />
+        <register type="FLEX" address="0x0000E758" value="0x00011010" />
+        <register type="FLEX" address="0x0000E45C" value="0x00050012" />
+        <register type="FLEX" address="0x0000E55C" value="0x00052051" />
+        <register type="FLEX" address="0x0000E65C" value="0x00000008" />
+    </register_config>
+  </set>
+
+  <set name="Media Vme Pipe Gen9"
+       mdapi_supported_apis="MEDIA IO BB"
+       underscore_name="vme_pipe"
+       hw_config_guid="95910492-943f-44bd-9461-390240f243fd"
+       chipset="SKLGT2"
+       symbol_name="VMEPipe"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL  GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="8 A 10 READ FMUL $EuThreadsCount FDIV $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VME Busy"
+             description="The percentage of time in which VME (IME or CRE) was actively processing data."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ B 3 READ FADD 2 FDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vme_busy"
+             units="percent"
+             symbol_name="VMEBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Overview System Batch Tier2"
+             mdapi_group="VME Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A5800" />
+        <register type="NOA" address="0x00009888" value="0x161A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12180240" />
+        <register type="NOA" address="0x00009888" value="0x14180002" />
+        <register type="NOA" address="0x00009888" value="0x143A5800" />
+        <register type="NOA" address="0x00009888" value="0x163A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12380240" />
+        <register type="NOA" address="0x00009888" value="0x14380002" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x022F8000" />
+        <register type="NOA" address="0x00009888" value="0x042F3000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C1500" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F9500" />
+        <register type="NOA" address="0x00009888" value="0x100F002A" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162C0A00" />
+        <register type="NOA" address="0x00009888" value="0x0A2DC000" />
+        <register type="NOA" address="0x00009888" value="0x0C2DC000" />
+        <register type="NOA" address="0x00009888" value="0x04193000" />
+        <register type="NOA" address="0x00009888" value="0x081A28C1" />
+        <register type="NOA" address="0x00009888" value="0x001A0000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x0613C000" />
+        <register type="NOA" address="0x00009888" value="0x0813F000" />
+        <register type="NOA" address="0x00009888" value="0x00172000" />
+        <register type="NOA" address="0x00009888" value="0x06178000" />
+        <register type="NOA" address="0x00009888" value="0x0817A000" />
+        <register type="NOA" address="0x00009888" value="0x00180037" />
+        <register type="NOA" address="0x00009888" value="0x06180940" />
+        <register type="NOA" address="0x00009888" value="0x08180000" />
+        <register type="NOA" address="0x00009888" value="0x02180000" />
+        <register type="NOA" address="0x00009888" value="0x04183000" />
+        <register type="NOA" address="0x00009888" value="0x06393000" />
+        <register type="NOA" address="0x00009888" value="0x0C3A28C1" />
+        <register type="NOA" address="0x00009888" value="0x003A0000" />
+        <register type="NOA" address="0x00009888" value="0x0A33F000" />
+        <register type="NOA" address="0x00009888" value="0x0C33F000" />
+        <register type="NOA" address="0x00009888" value="0x0A37A000" />
+        <register type="NOA" address="0x00009888" value="0x0C37A000" />
+        <register type="NOA" address="0x00009888" value="0x0A380977" />
+        <register type="NOA" address="0x00009888" value="0x08380000" />
+        <register type="NOA" address="0x00009888" value="0x04380000" />
+        <register type="NOA" address="0x00009888" value="0x06383000" />
+        <register type="NOA" address="0x00009888" value="0x119000FF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900040" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900800" />
+        <register type="NOA" address="0x00009888" value="0x47901000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900844" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00100030" />
+        <register type="OA" address="0x00002774" value="0x0000FFF9" />
+        <register type="OA" address="0x00002778" value="0x00000002" />
+        <register type="OA" address="0x0000277C" value="0x0000FFFC" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000FFF3" />
+        <register type="OA" address="0x00002788" value="0x00100180" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00000002" />
+        <register type="OA" address="0x0000279C" value="0x0000FF3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00008003" />
+    </register_config>
+  </set>
+
+  <set name="MDAPI testing set Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="test_oa"
+       hw_config_guid="1651949f-0ac0-4cb1-a06f-dafd74a407d1"
+       chipset="SKLGT2"
+       symbol_name="TestOa"
+       >
+    <counter name="TestCounter7"
+             description="HW test counter 7. Factor: 0.666"
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="counter7"
+             units="events"
+             symbol_name="Counter7"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="TestCounter8"
+             description="HW test counter 8. Should be equal to 1."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="counter8"
+             units="events"
+             symbol_name="Counter8"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter4"
+             description="HW test counter 4. Factor: 0.333"
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="counter4"
+             units="events"
+             symbol_name="Counter4"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter5"
+             description="HW test counter 5. Factor: 0.333"
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="counter5"
+             units="events"
+             symbol_name="Counter5"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter6"
+             description="HW test counter 6. Factor: 0.166"
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="counter6"
+             units="events"
+             symbol_name="Counter6"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter3"
+             description="HW test counter 3. Factor: 0.5"
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="counter3"
+             units="events"
+             symbol_name="Counter3"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter0"
+             description="HW test counter 0. Factor: 0.0"
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="counter0"
+             units="events"
+             symbol_name="Counter0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter1"
+             description="HW test counter 1. Factor: 1.0"
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="counter1"
+             units="events"
+             symbol_name="Counter1"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter2"
+             description="HW test counter 2. Factor: 1.0"
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="counter2"
+             units="events"
+             symbol_name="Counter2"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810000" />
+        <register type="NOA" address="0x00009888" value="0x07810016" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930040" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x00000004" />
+        <register type="OA" address="0x00002774" value="0x00000000" />
+        <register type="OA" address="0x00002778" value="0x00000003" />
+        <register type="OA" address="0x0000277C" value="0x00000000" />
+        <register type="OA" address="0x00002780" value="0x00000007" />
+        <register type="OA" address="0x00002784" value="0x00000000" />
+        <register type="OA" address="0x00002788" value="0x00100002" />
+        <register type="OA" address="0x0000278C" value="0x0000FFF7" />
+        <register type="OA" address="0x00002790" value="0x00100002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00100082" />
+        <register type="OA" address="0x0000279C" value="0x0000FFEF" />
+        <register type="OA" address="0x000027A0" value="0x001000C2" />
+        <register type="OA" address="0x000027A4" value="0x0000FFE7" />
+        <register type="OA" address="0x000027A8" value="0x00100001" />
+        <register type="OA" address="0x000027AC" value="0x0000FFE7" />
+    </register_config>
+  </set>
+
+</metrics>
diff --git a/src/mesa/drivers/dri/i965/brw_oa_sklgt3.xml b/src/mesa/drivers/dri/i965/brw_oa_sklgt3.xml
new file mode 100644
index 0000000..6dd80d6
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_oa_sklgt3.xml
@@ -0,0 +1,10499 @@
+<?xml version="1.0"?>
+<metrics version="1491577975" merge_md5="">
+  <set name="Render Metrics Basic Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_basic"
+       hw_config_guid="4616d450-2393-4836-8146-53c5ed84d359"
+       chipset="SKLGT3"
+       symbol_name="RenderBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Misses 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Cache Misses"
+             description="The total number of sampler cache misses in all LODs in all sampler units."
+             data_type="uint64"
+             equation="B 4 READ B 5 READ UADD 8 UMUL"
+             underscore_name="sampler_l1_misses"
+             units="messages"
+             symbol_name="SamplerL1Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler 1 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 1 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler1_bottleneck"
+             units="percent"
+             symbol_name="Sampler1Bottleneck"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$SamplerL1Misses 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Sampler 0 Busy"
+             description="The percentage of time in which Sampler 0 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler0_busy"
+             units="percent"
+             symbol_name="Sampler0Busy"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler 1 Busy"
+             description="The percentage of time in which Sampler 1 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler1_busy"
+             units="percent"
+             symbol_name="Sampler1Busy"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Samplers Busy"
+             description="The percentage of time in which samplers have been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="$Sampler0Busy $Sampler1Busy FMAX"
+             underscore_name="samplers_busy"
+             units="percent"
+             symbol_name="SamplersBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI Fixed Pipe Throughput"
+             description="The total number of GPU memory bytes transferred between 3D Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD 64 UMUL"
+             underscore_name="gti_vf_throughput"
+             units="bytes"
+             symbol_name="GtiVfThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/3D Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler 0 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 0 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler0_bottleneck"
+             units="percent"
+             symbol_name="Sampler0Bottleneck"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="L3 Lookup Accesses w/o IC"
+             description="The total number of L3 cache lookup accesses w/o IC."
+             data_type="uint64"
+             equation="$SamplerL1Misses $ShaderMemoryAccesses UADD"
+             underscore_name="l3_lookups"
+             units="messages"
+             symbol_name="L3Lookups"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Depth Throughput"
+             description="The total number of GPU memory bytes transferred between depth caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 0 READ C 1 READ UADD 64 UMUL"
+             underscore_name="gti_depth_throughput"
+             units="bytes"
+             symbol_name="GtiDepthThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Depth Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Samplers Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which samplers have been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="$Sampler0Bottleneck $Sampler1Bottleneck FMAX"
+             max_equation="100"
+             underscore_name="sampler_bottleneck"
+             units="percent"
+             symbol_name="SamplerBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Indicate System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI HDC TLB Lookup Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and HDC, when HDC is doing TLB lookups."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_hdc_lookups_throughput"
+             units="bytes"
+             symbol_name="GtiHdcLookupsThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI RCC Throughput"
+             description="The total number of GPU memory bytes transferred between render color caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 2 READ C 3 READ UADD 64 UMUL"
+             underscore_name="gti_rcc_throughput"
+             units="bytes"
+             symbol_name="GtiRccThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Color Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C01E0" />
+        <register type="NOA" address="0x00009888" value="0x12170280" />
+        <register type="NOA" address="0x00009888" value="0x12370280" />
+        <register type="NOA" address="0x00009888" value="0x16EC01E0" />
+        <register type="NOA" address="0x00009888" value="0x11930317" />
+        <register type="NOA" address="0x00009888" value="0x159303DF" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x1A4E0380" />
+        <register type="NOA" address="0x00009888" value="0x0A6C0053" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A1B4000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0001" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x042F1000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C8400" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0002" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F6600" />
+        <register type="NOA" address="0x00009888" value="0x100F0001" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CA200" />
+        <register type="NOA" address="0x00009888" value="0x062D8000" />
+        <register type="NOA" address="0x00009888" value="0x082D8000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x08133000" />
+        <register type="NOA" address="0x00009888" value="0x00170020" />
+        <register type="NOA" address="0x00009888" value="0x08170021" />
+        <register type="NOA" address="0x00009888" value="0x10170000" />
+        <register type="NOA" address="0x00009888" value="0x0633C000" />
+        <register type="NOA" address="0x00009888" value="0x0833C000" />
+        <register type="NOA" address="0x00009888" value="0x06370800" />
+        <register type="NOA" address="0x00009888" value="0x08370840" />
+        <register type="NOA" address="0x00009888" value="0x10370000" />
+        <register type="NOA" address="0x00009888" value="0x1ACE0200" />
+        <register type="NOA" address="0x00009888" value="0x0AEC5300" />
+        <register type="NOA" address="0x00009888" value="0x10EC0000" />
+        <register type="NOA" address="0x00009888" value="0x1CEC0000" />
+        <register type="NOA" address="0x00009888" value="0x0A9B8000" />
+        <register type="NOA" address="0x00009888" value="0x1C9C0002" />
+        <register type="NOA" address="0x00009888" value="0x0CCC0002" />
+        <register type="NOA" address="0x00009888" value="0x0A8D8000" />
+        <register type="NOA" address="0x00009888" value="0x108F0001" />
+        <register type="NOA" address="0x00009888" value="0x16AC8000" />
+        <register type="NOA" address="0x00009888" value="0x0D933031" />
+        <register type="NOA" address="0x00009888" value="0x0F933E3F" />
+        <register type="NOA" address="0x00009888" value="0x01933D00" />
+        <register type="NOA" address="0x00009888" value="0x0393073C" />
+        <register type="NOA" address="0x00009888" value="0x0593000E" />
+        <register type="NOA" address="0x00009888" value="0x1D930000" />
+        <register type="NOA" address="0x00009888" value="0x19930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900158" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D908000" />
+        <register type="NOA" address="0x00009888" value="0x2F908000" />
+        <register type="NOA" address="0x00009888" value="0x31908000" />
+        <register type="NOA" address="0x00009888" value="0x15908000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190003F" />
+        <register type="NOA" address="0x00009888" value="0x51907710" />
+        <register type="NOA" address="0x00009888" value="0x419020A0" />
+        <register type="NOA" address="0x00009888" value="0x55901515" />
+        <register type="NOA" address="0x00009888" value="0x45900529" />
+        <register type="NOA" address="0x00009888" value="0x47901025" />
+        <register type="NOA" address="0x00009888" value="0x57907770" />
+        <register type="NOA" address="0x00009888" value="0x49902100" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900108" />
+        <register type="NOA" address="0x00009888" value="0x59900007" />
+        <register type="NOA" address="0x00009888" value="0x43902108" />
+        <register type="NOA" address="0x00009888" value="0x53907777" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Basic Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_basic"
+       hw_config_guid="4320492b-fd03-42ac-922f-dbe1ef3b7b58"
+       chipset="SKLGT3"
+       symbol_name="ComputeBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Bytes Read"
+             description="The total number of untyped memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 6 READ B 7 READ C 0 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_read"
+             units="bytes"
+             symbol_name="UntypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Bytes Written"
+             description="The total number of typed memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 3 READ B 4 READ B 5 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_written"
+             units="bytes"
+             symbol_name="TypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Writes"
+             description="The total number of untyped memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 1 READ C 2 READ C 3 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_written"
+             units="bytes"
+             symbol_name="UntypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Typed Bytes Read"
+             description="The total number of typed memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 0 READ B 1 READ B 2 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_read"
+             units="bytes"
+             symbol_name="TypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x124F1C00" />
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E0820" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x064F0900" />
+        <register type="NOA" address="0x00009888" value="0x084F0032" />
+        <register type="NOA" address="0x00009888" value="0x0A4F1891" />
+        <register type="NOA" address="0x00009888" value="0x0C4F0E00" />
+        <register type="NOA" address="0x00009888" value="0x0E4F003C" />
+        <register type="NOA" address="0x00009888" value="0x004F0D80" />
+        <register type="NOA" address="0x00009888" value="0x024F003B" />
+        <register type="NOA" address="0x00009888" value="0x006C0002" />
+        <register type="NOA" address="0x00009888" value="0x086C0100" />
+        <register type="NOA" address="0x00009888" value="0x0C6C000C" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B00" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x081B8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E1B8000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C8000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0024" />
+        <register type="NOA" address="0x00009888" value="0x065B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5BC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B8000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C6000" />
+        <register type="NOA" address="0x00009888" value="0x1C5C001B" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0208" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5500" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2CC000" />
+        <register type="NOA" address="0x00009888" value="0x162CFB00" />
+        <register type="NOA" address="0x00009888" value="0x182C00BE" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x19900157" />
+        <register type="NOA" address="0x00009888" value="0x1B900158" />
+        <register type="NOA" address="0x00009888" value="0x1D900105" />
+        <register type="NOA" address="0x00009888" value="0x1F900103" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x11900FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900863" />
+        <register type="NOA" address="0x00009888" value="0x47900802" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900802" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900002" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900C62" />
+        <register type="NOA" address="0x00009888" value="0x53903333" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Render Metrics for 3D Pipeline Profile Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_pipe_profile"
+       hw_config_guid="bd2d9cae-b9ec-4f5b-9d2f-934bed398a2d"
+       chipset="SKLGT3"
+       symbol_name="RenderPipeProfile"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which vertex shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_bottleneck"
+             units="percent"
+             symbol_name="VsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Hi-Depth Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which early hierarchical depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hi_depth_bottleneck"
+             units="percent"
+             symbol_name="HiDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which geometry shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gs_bottleneck"
+             units="percent"
+             symbol_name="GsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Geometry Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="BC Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which barycentric coordinates calculation pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="bc_bottleneck"
+             units="percent"
+             symbol_name="BcBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Barycentric Calc"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Stall"
+             description="The percentage of time in which hull shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_stall"
+             units="percent"
+             symbol_name="HsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Hull Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="VF Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which vertex fetch pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vf_bottleneck"
+             units="percent"
+             symbol_name="VfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Input Assembler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Strip-Fans Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which strip-fans pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="10"
+             equation="B 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_bottleneck"
+             units="percent"
+             symbol_name="SfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SF Stall"
+             description="The percentage of time in which strip-fans pipeline stage was stalled."
+             data_type="float"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_stall"
+             units="percent"
+             symbol_name="SfStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Bottleneck"
+             low_watermark="3"
+             description="The percentage of time in which hull shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="9"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_bottleneck"
+             units="percent"
+             symbol_name="HsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Hull Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CL Stall"
+             description="The percentage of time in which clipper pipeline stage was stalled."
+             data_type="float"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_stall"
+             units="percent"
+             symbol_name="ClStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Clipper"
+             />
+    <counter name="SO Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which stream output pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_bottleneck"
+             units="percent"
+             symbol_name="SoBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Stream Output"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="DS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which domain shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_bottleneck"
+             units="percent"
+             symbol_name="DsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Domain Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Clipper Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which clipper pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_bottleneck"
+             units="percent"
+             symbol_name="ClBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Clipper"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Stall"
+             description="The percentage of time in which domain shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_stall"
+             units="percent"
+             symbol_name="DsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Domain Shader"
+             />
+    <counter name="Early Depth Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which early depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="early_depth_bottleneck"
+             units="percent"
+             symbol_name="EarlyDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SO Stall"
+             description="The percentage of time in which stream-output pipeline stage was stalled."
+             data_type="float"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_stall"
+             units="percent"
+             symbol_name="SoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Stream Output"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x0C0E001F" />
+        <register type="NOA" address="0x00009888" value="0x0A0F0000" />
+        <register type="NOA" address="0x00009888" value="0x10116800" />
+        <register type="NOA" address="0x00009888" value="0x178A03E0" />
+        <register type="NOA" address="0x00009888" value="0x11824C00" />
+        <register type="NOA" address="0x00009888" value="0x11830020" />
+        <register type="NOA" address="0x00009888" value="0x13840020" />
+        <register type="NOA" address="0x00009888" value="0x11850019" />
+        <register type="NOA" address="0x00009888" value="0x11860007" />
+        <register type="NOA" address="0x00009888" value="0x01870C40" />
+        <register type="NOA" address="0x00009888" value="0x17880000" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0040" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x040D4000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020E5400" />
+        <register type="NOA" address="0x00009888" value="0x000E0000" />
+        <register type="NOA" address="0x00009888" value="0x080F0040" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x100F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0040" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06110012" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x01898000" />
+        <register type="NOA" address="0x00009888" value="0x0D890100" />
+        <register type="NOA" address="0x00009888" value="0x03898000" />
+        <register type="NOA" address="0x00009888" value="0x09808000" />
+        <register type="NOA" address="0x00009888" value="0x0B808000" />
+        <register type="NOA" address="0x00009888" value="0x0380C000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A0075" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0000" />
+        <register type="NOA" address="0x00009888" value="0x118A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A4000" />
+        <register type="NOA" address="0x00009888" value="0x138A8000" />
+        <register type="NOA" address="0x00009888" value="0x1D81A000" />
+        <register type="NOA" address="0x00009888" value="0x15818000" />
+        <register type="NOA" address="0x00009888" value="0x17818000" />
+        <register type="NOA" address="0x00009888" value="0x0B820030" />
+        <register type="NOA" address="0x00009888" value="0x07828000" />
+        <register type="NOA" address="0x00009888" value="0x0D824000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x05824000" />
+        <register type="NOA" address="0x00009888" value="0x0D830003" />
+        <register type="NOA" address="0x00009888" value="0x0583000C" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x03838000" />
+        <register type="NOA" address="0x00009888" value="0x07838000" />
+        <register type="NOA" address="0x00009888" value="0x0B840980" />
+        <register type="NOA" address="0x00009888" value="0x03844D80" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x09848000" />
+        <register type="NOA" address="0x00009888" value="0x09850080" />
+        <register type="NOA" address="0x00009888" value="0x03850003" />
+        <register type="NOA" address="0x00009888" value="0x01850000" />
+        <register type="NOA" address="0x00009888" value="0x07860000" />
+        <register type="NOA" address="0x00009888" value="0x0F860400" />
+        <register type="NOA" address="0x00009888" value="0x09870032" />
+        <register type="NOA" address="0x00009888" value="0x01888052" />
+        <register type="NOA" address="0x00009888" value="0x11880000" />
+        <register type="NOA" address="0x00009888" value="0x09884000" />
+        <register type="NOA" address="0x00009888" value="0x1B931001" />
+        <register type="NOA" address="0x00009888" value="0x1D930001" />
+        <register type="NOA" address="0x00009888" value="0x19934000" />
+        <register type="NOA" address="0x00009888" value="0x1B958000" />
+        <register type="NOA" address="0x00009888" value="0x1D950094" />
+        <register type="NOA" address="0x00009888" value="0x19958000" />
+        <register type="NOA" address="0x00009888" value="0x09E58000" />
+        <register type="NOA" address="0x00009888" value="0x0BE58000" />
+        <register type="NOA" address="0x00009888" value="0x03E5C000" />
+        <register type="NOA" address="0x00009888" value="0x0592C000" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D924000" />
+        <register type="NOA" address="0x00009888" value="0x0F924000" />
+        <register type="NOA" address="0x00009888" value="0x11928000" />
+        <register type="NOA" address="0x00009888" value="0x1392C000" />
+        <register type="NOA" address="0x00009888" value="0x09924000" />
+        <register type="NOA" address="0x00009888" value="0x01985000" />
+        <register type="NOA" address="0x00009888" value="0x07988000" />
+        <register type="NOA" address="0x00009888" value="0x09981000" />
+        <register type="NOA" address="0x00009888" value="0x0B982000" />
+        <register type="NOA" address="0x00009888" value="0x0D982000" />
+        <register type="NOA" address="0x00009888" value="0x0F989000" />
+        <register type="NOA" address="0x00009888" value="0x05982000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1190C080" />
+        <register type="NOA" address="0x00009888" value="0x51901150" />
+        <register type="NOA" address="0x00009888" value="0x41901400" />
+        <register type="NOA" address="0x00009888" value="0x55905111" />
+        <register type="NOA" address="0x00009888" value="0x45901400" />
+        <register type="NOA" address="0x00009888" value="0x479004A5" />
+        <register type="NOA" address="0x00009888" value="0x57903455" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B9000A0" />
+        <register type="NOA" address="0x00009888" value="0x59900001" />
+        <register type="NOA" address="0x00009888" value="0x43900005" />
+        <register type="NOA" address="0x00009888" value="0x53900455" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFEA" />
+        <register type="OA" address="0x00002774" value="0x00007FFC" />
+        <register type="OA" address="0x00002778" value="0x0007AFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000F5FD" />
+        <register type="OA" address="0x00002780" value="0x00079FFA" />
+        <register type="OA" address="0x00002784" value="0x0000F3FB" />
+        <register type="OA" address="0x00002788" value="0x0007BF7A" />
+        <register type="OA" address="0x0000278C" value="0x0000F7E7" />
+        <register type="OA" address="0x00002790" value="0x0007FEFA" />
+        <register type="OA" address="0x00002794" value="0x0000F7CF" />
+        <register type="OA" address="0x00002798" value="0x00077FFA" />
+        <register type="OA" address="0x0000279C" value="0x0000EFDF" />
+        <register type="OA" address="0x000027A0" value="0x0006FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000CFBF" />
+        <register type="OA" address="0x000027A8" value="0x0003FFFA" />
+        <register type="OA" address="0x000027AC" value="0x00005F7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Reads Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_reads"
+       hw_config_guid="4ca0f3fe-7fd3-4924-98cb-1807d9879767"
+       chipset="SKLGT3"
+       symbol_name="MemoryReads"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank0Reads"
+             description="The total number of GTI memory reads from L3 Bank 0 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_reads"
+             units="messages"
+             symbol_name="GtiL3Bank0Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all accesses from GTI to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiL3Bank3Reads"
+             description="The total number of GTI memory reads from L3 Bank 3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_reads"
+             units="messages"
+             symbol_name="GtiL3Bank3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiRsMemoryReads"
+             description="The total number of GTI memory reads from Resource Streamer."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_rs_memory_reads"
+             units="messages"
+             symbol_name="GtiRsMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Resource Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiHizMemoryReads"
+             description="The total number of GTI memory reads from Hierarchical Depth Cache (Hi-Depth Cache misses)."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_reads"
+             units="messages"
+             symbol_name="GtiHizMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="GtiRccMemoryReads"
+             description="The total number of GTI memory reads from Render Color Cache (Render Color Cache misses)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_reads"
+             units="messages"
+             symbol_name="GtiRccMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank1Reads"
+             description="The total number of GTI memory reads from L3 Bank 1 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_reads"
+             units="messages"
+             symbol_name="GtiL3Bank1Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiCmdStreamerMemoryReads"
+             description="The total number of GTI memory reads from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_reads"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="GtiL3Bank2Reads"
+             description="The total number of GTI memory reads from L3 Bank 2 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_reads"
+             units="messages"
+             symbol_name="GtiL3Bank2Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiMemoryReads"
+             description="The total number of GTI memory reads."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_reads"
+             units="messages"
+             symbol_name="GtiMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GtiRczMemoryReads"
+             description="The total number of GTI memory reads from Render Depth Cache (Render Depth Cache misses)."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_reads"
+             units="messages"
+             symbol_name="GtiRczMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="GtiMscMemoryReads"
+             description="The total number of GTI memory reads from Multisampling Color Cache (Multisampling Color Cache misses)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_reads"
+             units="messages"
+             symbol_name="GtiMscMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiVfMemoryReads"
+             description="The total number of GTI memory reads from Vertex Fetch."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="gti_vf_memory_reads"
+             units="messages"
+             symbol_name="GtiVfMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Vertex Fetch"
+             />
+    <counter name="GtiStcMemoryReads"
+             description="The total number of GTI memory reads from Stencil Cache (Stencil Cache misses)."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_reads"
+             units="messages"
+             symbol_name="GtiStcMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Reads"
+             description="The total number of GTI memory reads from L3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Reads $GtiL3Bank1Reads $GtiL3Bank2Reads $GtiL3Bank3Reads UADD UADD UADD"
+             underscore_name="gti_l3_reads"
+             units="messages"
+             symbol_name="GtiL3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900064" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900150" />
+        <register type="NOA" address="0x00009888" value="0x21900151" />
+        <register type="NOA" address="0x00009888" value="0x23900152" />
+        <register type="NOA" address="0x00009888" value="0x25900153" />
+        <register type="NOA" address="0x00009888" value="0x27900154" />
+        <register type="NOA" address="0x00009888" value="0x29900155" />
+        <register type="NOA" address="0x00009888" value="0x2B900156" />
+        <register type="NOA" address="0x00009888" value="0x2D900157" />
+        <register type="NOA" address="0x00009888" value="0x2F90015F" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F872" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Writes Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_writes"
+       hw_config_guid="a0c0172c-ee13-403d-99ff-2bdf6936cf14"
+       chipset="SKLGT3"
+       symbol_name="MemoryWrites"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiMemoryWrites"
+             description="The total number of GTI memory writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_writes"
+             units="messages"
+             symbol_name="GtiMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all GTI accesses to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiMscMemoryWrites"
+             description="The total number of GTI memory writes from Multisampling Color Cache (Multisampling Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_writes"
+             units="messages"
+             symbol_name="GtiMscMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiCmdStreamerMemoryWrites"
+             description="The total number of GTI memory writes from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_writes"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiL3Bank0Writes"
+             description="The total number of GTI memory writes from L3 Bank 0 (L3 Bank 0 invalidations)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_writes"
+             units="messages"
+             symbol_name="GtiL3Bank0Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank1Writes"
+             description="The total number of GTI memory writes from L3 Bank 1 (L3 Bank 1 invalidations)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_writes"
+             units="messages"
+             symbol_name="GtiL3Bank1Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank2Writes"
+             description="The total number of GTI memory writes from L3 Bank 2 (L3 Bank 2 invalidations)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_writes"
+             units="messages"
+             symbol_name="GtiL3Bank2Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank3Writes"
+             description="The total number of GTI memory writes from L3 Bank 3 (L3 Bank 3 invalidations)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_writes"
+             units="messages"
+             symbol_name="GtiL3Bank3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Writes"
+             description="The total number of GTI memory writes from L3 (L3 invalidations)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Writes $GtiL3Bank1Writes $GtiL3Bank2Writes $GtiL3Bank3Writes UADD UADD UADD"
+             underscore_name="gti_l3_writes"
+             units="messages"
+             symbol_name="GtiL3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiRccMemoryWrites"
+             description="The total number of GTI memory writes from Render Color Cache (Render Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_writes"
+             units="messages"
+             symbol_name="GtiRccMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiSoMemoryWrites"
+             description="The total number of GTI memory writes from Stream Output."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_so_memory_writes"
+             units="messages"
+             symbol_name="GtiSoMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Stream Output"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiStcMemoryWrites"
+             description="The total number of GTI memory writes from Stencil Cache."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_writes"
+             units="messages"
+             symbol_name="GtiStcMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GtiRczMemoryWrites"
+             description="The total number of GTI memory writes from Render Depth Cache."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_writes"
+             units="messages"
+             symbol_name="GtiRczMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="GtiHizMemoryWrites"
+             description="The total number of GTI memory writes from Hierarchical Depth Cache."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_writes"
+             units="messages"
+             symbol_name="GtiHizMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F901000" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900160" />
+        <register type="NOA" address="0x00009888" value="0x21900161" />
+        <register type="NOA" address="0x00009888" value="0x23900162" />
+        <register type="NOA" address="0x00009888" value="0x25900163" />
+        <register type="NOA" address="0x00009888" value="0x27900164" />
+        <register type="NOA" address="0x00009888" value="0x29900165" />
+        <register type="NOA" address="0x00009888" value="0x2B900166" />
+        <register type="NOA" address="0x00009888" value="0x2D900167" />
+        <register type="NOA" address="0x00009888" value="0x2F900150" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F822" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extended Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extended"
+       hw_config_guid="52435e0b-f188-42ea-8680-21a56ee20dee"
+       chipset="SKLGT3"
+       symbol_name="ComputeExtended"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Writes 0"
+             description="The subslice 0 typed writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="typed_writes0"
+             units="messages"
+             symbol_name="TypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuTypedAtomics0"
+             description="The subslice 0 EU Typed Atomics subslice 0."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_typed_atomics0"
+             units="messages"
+             symbol_name="EuTypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Atomics 0"
+             description="The subslice 0 typed atomics."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="typed_atomics0"
+             units="messages"
+             symbol_name="TypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedAtomicsPerCacheLine"
+             description="The ratio of EU typed atomics requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedAtomics0 $TypedAtomics0 FDIV"
+             underscore_name="typed_atomics_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedAtomicsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedReads0"
+             description="The subslice 0 EU Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="eu_untyped_reads0"
+             units="messages"
+             symbol_name="EuUntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Writes 0"
+             description="The subslice 0 untyped writes (including SLM writes)."
+             data_type="uint64"
+             equation="C 1 READ"
+             underscore_name="untyped_writes0"
+             units="messages"
+             symbol_name="UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedAtomics0"
+             description="The subslice 0 EU Untyped Atomics subslice 0."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="eu_untyped_atomics0"
+             units="messages"
+             symbol_name="EuUntypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedWrites0"
+             description="The subslice 0 EU Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="eu_untyped_writes0"
+             units="messages"
+             symbol_name="EuUntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedWrites0"
+             description="The subslice 0 EU A64 Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="eu_a64_untyped_writes0"
+             units="messages"
+             symbol_name="EuA64UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedWritesPerCacheLine"
+             description="The ratio of EU untyped write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuUntypedWrites0 $EuA64UntypedWrites0 UADD $UntypedWrites0 FDIV"
+             underscore_name="untyped_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedWrites0"
+             description="The subslice 0 EU Typed Writes subslice 0."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="eu_typed_writes0"
+             units="messages"
+             symbol_name="EuTypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedWritesPerCacheLine"
+             description="The ratio of EU typed write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedWrites0 $TypedWrites0 FDIV"
+             underscore_name="typed_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Reads 0"
+             description="The subslice 0 typed reads."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="typed_reads0"
+             units="messages"
+             symbol_name="TypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Reads 0"
+             description="The subslice 0 untyped reads (including SLM reads)."
+             data_type="uint64"
+             equation="C 3 READ"
+             underscore_name="untyped_reads0"
+             units="messages"
+             symbol_name="UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedReads0"
+             description="The subslice 0 EU A64 Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="eu_a64_untyped_reads0"
+             units="messages"
+             symbol_name="EuA64UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedReads0"
+             description="The subslice 0 EU Typed Reads subslice 0."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="eu_typed_reads0"
+             units="messages"
+             symbol_name="EuTypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedReadsPerCacheLine"
+             description="The ratio of EU untyped read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuUntypedReads0 $EuA64UntypedReads0 UADD $UntypedReads0 FDIV"
+             underscore_name="untyped_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedReadsPerCacheLine"
+             description="The ratio of EU typed read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuTypedReads0 $TypedReads0 FDIV"
+             underscore_name="typed_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x141C8160" />
+        <register type="NOA" address="0x00009888" value="0x161C8015" />
+        <register type="NOA" address="0x00009888" value="0x181C0120" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4EAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B01" />
+        <register type="NOA" address="0x00009888" value="0x006C0200" />
+        <register type="NOA" address="0x00009888" value="0x026C000C" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x001C0041" />
+        <register type="NOA" address="0x00009888" value="0x061C4200" />
+        <register type="NOA" address="0x00009888" value="0x081C4443" />
+        <register type="NOA" address="0x00009888" value="0x0A1C4645" />
+        <register type="NOA" address="0x00009888" value="0x0C1C7647" />
+        <register type="NOA" address="0x00009888" value="0x041C7357" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x101C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0000" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4CAA2A" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02AA" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5515" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CAA00" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x11907FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900040" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900802" />
+        <register type="NOA" address="0x00009888" value="0x47900842" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900842" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900800" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FC2A" />
+        <register type="OA" address="0x00002774" value="0x0000BF00" />
+        <register type="OA" address="0x00002778" value="0x0007FC6A" />
+        <register type="OA" address="0x0000277C" value="0x0000BF00" />
+        <register type="OA" address="0x00002780" value="0x0007FC92" />
+        <register type="OA" address="0x00002784" value="0x0000BF00" />
+        <register type="OA" address="0x00002788" value="0x0007FCA2" />
+        <register type="OA" address="0x0000278C" value="0x0000BF00" />
+        <register type="OA" address="0x00002790" value="0x0007FC32" />
+        <register type="OA" address="0x00002794" value="0x0000BF00" />
+        <register type="OA" address="0x00002798" value="0x0007FC9A" />
+        <register type="OA" address="0x0000279C" value="0x0000BF00" />
+        <register type="OA" address="0x000027A0" value="0x0007FE6A" />
+        <register type="OA" address="0x000027A4" value="0x0000BF00" />
+        <register type="OA" address="0x000027A8" value="0x0007FE7A" />
+        <register type="OA" address="0x000027AC" value="0x0000BF00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics L3 Cache Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_l3_cache"
+       hw_config_guid="27076eeb-49f3-4fed-8423-c66506005c63"
+       chipset="SKLGT3"
+       symbol_name="ComputeL3Cache"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 03 Accesses"
+             description="The total number of accesses to L3 Bank 03."
+             data_type="uint64"
+             equation="B 3 READ 2 UMUL"
+             underscore_name="l3_bank03_accesses"
+             units="messages"
+             symbol_name="L3Bank03Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Accesses"
+             description="The total number of L3 accesses from all entities."
+             data_type="uint64"
+             equation="C 0 READ C 1 READ B 2 READ B 3 READ UADD UADD UADD 2 UMUL"
+             underscore_name="l3_accesses"
+             units="messages"
+             symbol_name="L3Accesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 29 READ 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="EU FPU0 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu0_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ C 5 READ UADD"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="L3 Bank 00 Accesses"
+             description="The total number of accesses to L3 Bank 00."
+             data_type="uint64"
+             equation="C 0 READ 2 UMUL"
+             underscore_name="l3_bank00_accesses"
+             units="messages"
+             symbol_name="L3Bank00Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU0 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 19 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu0_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 14 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu1_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ C 5 READ UADD 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 00 IC Accesses"
+             description="The total number of accesses to L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 0 READ B 1 READ UADD 2 UMUL $L3Bank00Accesses UMIN"
+             underscore_name="l3_bank00_ic_accesses"
+             units="messages"
+             symbol_name="L3Bank00IcAccesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 00 IC Hits"
+             description="The total number of hits in L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 1 READ 2 UMUL $L3Bank00IcAccesses UMIN"
+             underscore_name="l3_bank00_ic_hits"
+             units="messages"
+             symbol_name="L3Bank00IcHits"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="Sampler Accesses"
+             description="The total number of messages sent to samplers."
+             data_type="uint64"
+             equation="A 28 READ"
+             underscore_name="sampler_accesses"
+             units="messages"
+             symbol_name="SamplerAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler"
+             />
+    <counter name="L3 Bank 01 Accesses"
+             description="The total number of accesses to L3 Bank 01."
+             data_type="uint64"
+             equation="C 1 READ 2 UMUL"
+             underscore_name="l3_bank01_accesses"
+             units="messages"
+             symbol_name="L3Bank01Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 20 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu1_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="L3 Bank 02 Accesses"
+             description="The total number of accesses to L3 Bank 02."
+             data_type="uint64"
+             equation="B 2 READ 2 UMUL"
+             underscore_name="l3_bank02_accesses"
+             units="messages"
+             symbol_name="L3Bank02Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="L3 Total Throughput"
+             description="The total number of GPU memory bytes transferred via L3."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Accesses 64 UMUL"
+             underscore_name="l3_total_throughput"
+             units="bytes"
+             symbol_name="L3TotalThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="EU FPU1 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C0760" />
+        <register type="NOA" address="0x00009888" value="0x1593001E" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E8020" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x006C0051" />
+        <register type="NOA" address="0x00009888" value="0x066C5000" />
+        <register type="NOA" address="0x00009888" value="0x086C5C5D" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5E5F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x061B8000" />
+        <register type="NOA" address="0x00009888" value="0x081BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1CE000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2A00" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0280" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F1500" />
+        <register type="NOA" address="0x00009888" value="0x100F0140" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162C0A00" />
+        <register type="NOA" address="0x00009888" value="0x182C00A0" />
+        <register type="NOA" address="0x00009888" value="0x03933300" />
+        <register type="NOA" address="0x00009888" value="0x05930032" />
+        <register type="NOA" address="0x00009888" value="0x11930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900158" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190030F" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900063" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x53903333" />
+        <register type="NOA" address="0x00009888" value="0x43900840" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFFA" />
+        <register type="OA" address="0x00002774" value="0x0000FEFE" />
+        <register type="OA" address="0x00002778" value="0x0007FFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000FEFD" />
+        <register type="OA" address="0x00002790" value="0x0007FFFA" />
+        <register type="OA" address="0x00002794" value="0x0000FBEF" />
+        <register type="OA" address="0x00002798" value="0x0007FFFA" />
+        <register type="OA" address="0x0000279C" value="0x0000FBDF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00101100" />
+        <register type="FLEX" address="0x0000E45C" value="0x00201200" />
+        <register type="FLEX" address="0x0000E55C" value="0x00301300" />
+        <register type="FLEX" address="0x0000E65C" value="0x00401400" />
+    </register_config>
+  </set>
+
+  <set name="Metric set HDCAndSF"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="hdc_and_sf"
+       hw_config_guid="8071b409-c39a-4674-94d7-32962ecfb512"
+       chipset="SKLGT3"
+       symbol_name="HDCAndSF"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Polygon Data Ready"
+             description="The percentage of time in which geometry pipeline output is ready."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="poly_data_ready"
+             units="percent"
+             symbol_name="PolyDataReady"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="HDC stalled by L3 (s0.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ C 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader01_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader01AccessStalledOnL3"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss2)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss2)"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ C 2 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader02_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader02AccessStalledOnL3"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ C 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader00_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader00AccessStalledOnL3"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F0232" />
+        <register type="NOA" address="0x00009888" value="0x124F4640" />
+        <register type="NOA" address="0x00009888" value="0x106C0232" />
+        <register type="NOA" address="0x00009888" value="0x11834400" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x004F1880" />
+        <register type="NOA" address="0x00009888" value="0x024F08BB" />
+        <register type="NOA" address="0x00009888" value="0x044F001B" />
+        <register type="NOA" address="0x00009888" value="0x046C0100" />
+        <register type="NOA" address="0x00009888" value="0x066C000B" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x041B8000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025BC000" />
+        <register type="NOA" address="0x00009888" value="0x045B4000" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x165C8000" />
+        <register type="NOA" address="0x00009888" value="0x185C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00A0" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x062CC000" />
+        <register type="NOA" address="0x00009888" value="0x082CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x0F8305C0" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x07830000" />
+        <register type="NOA" address="0x00009888" value="0x1D950080" />
+        <register type="NOA" address="0x00009888" value="0x13928000" />
+        <register type="NOA" address="0x00009888" value="0x0F988000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x59900005" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x10800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_1"
+       hw_config_guid="5e0b391e-9ea8-4901-b2ff-b64ff616c7ed"
+       chipset="SKLGT3"
+       symbol_name="L3_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank1 Active"
+             description="The percentage of time in which slice0 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_active"
+             units="percent"
+             symbol_name="L30Bank1Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 L3 Bank1 Stalled"
+             description="The percentage of time in which slice0 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_stalled"
+             units="percent"
+             symbol_name="L30Bank1Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Active"
+             description="The percentage of time in which slice0 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_active"
+             units="percent"
+             symbol_name="L30Bank0Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Stalled"
+             description="The percentage of time in which slice0 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_stalled"
+             units="percent"
+             symbol_name="L30Bank0Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C7B40" />
+        <register type="NOA" address="0x00009888" value="0x166C0020" />
+        <register type="NOA" address="0x00009888" value="0x0A603444" />
+        <register type="NOA" address="0x00009888" value="0x0A613400" />
+        <register type="NOA" address="0x00009888" value="0x1A4EA800" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x0C6C5327" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5425" />
+        <register type="NOA" address="0x00009888" value="0x006C2A00" />
+        <register type="NOA" address="0x00009888" value="0x026C285B" />
+        <register type="NOA" address="0x00009888" value="0x046C005C" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0800" />
+        <register type="NOA" address="0x00009888" value="0x0C1BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C003C" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x10600000" />
+        <register type="NOA" address="0x00009888" value="0x04600000" />
+        <register type="NOA" address="0x00009888" value="0x0C610044" />
+        <register type="NOA" address="0x00009888" value="0x10610000" />
+        <register type="NOA" address="0x00009888" value="0x06610000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02A8" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0154" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190FFC0" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900420" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900021" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900400" />
+        <register type="NOA" address="0x00009888" value="0x43900421" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_2"
+       hw_config_guid="25dc828e-1d2d-426e-9546-a1d4233cdf16"
+       chipset="SKLGT3"
+       symbol_name="L3_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Stalled"
+             description="The percentage of time in which slice0 L3 bank2 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_stalled"
+             units="percent"
+             symbol_name="L30Bank2Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Active"
+             description="The percentage of time in which slice0 L3 bank2 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_active"
+             units="percent"
+             symbol_name="L30Bank2Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C02E0" />
+        <register type="NOA" address="0x00009888" value="0x146C0001" />
+        <register type="NOA" address="0x00009888" value="0x0A623400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x026C3324" />
+        <register type="NOA" address="0x00009888" value="0x046C3422" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x06614000" />
+        <register type="NOA" address="0x00009888" value="0x0C620044" />
+        <register type="NOA" address="0x00009888" value="0x10620000" />
+        <register type="NOA" address="0x00009888" value="0x06620000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_3"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_3"
+       hw_config_guid="3dba9405-2d7e-4d70-8199-e734e82fd6bf"
+       chipset="SKLGT3"
+       symbol_name="L3_3"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 L3 Bank3 Stalled"
+             description="The percentage of time in which slice0 L3 bank3 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_stalled"
+             units="percent"
+             symbol_name="L30Bank3Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank3 Active"
+             description="The percentage of time in which slice0 L3 bank3 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_active"
+             units="percent"
+             symbol_name="L30Bank3Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C4E80" />
+        <register type="NOA" address="0x00009888" value="0x146C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A633400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x026C3321" />
+        <register type="NOA" address="0x00009888" value="0x046C342F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C2000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x06604000" />
+        <register type="NOA" address="0x00009888" value="0x0C630044" />
+        <register type="NOA" address="0x00009888" value="0x10630000" />
+        <register type="NOA" address="0x00009888" value="0x06630000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00AA" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900002" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set RasterizerAndPixelBackend"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="rasterizer_and_pixel_backend"
+       hw_config_guid="76935d7b-09c9-46bf-87f1-c18b4a86ebe5"
+       chipset="SKLGT3"
+       symbol_name="RasterizerAndPixelBackend"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 Pixel Values Ready"
+             description="The percentage of time in which slice0 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values0_ready"
+             units="percent"
+             symbol_name="PixelValues0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Slice0 Rasterizer Input Available"
+             description="The percentage of time in which slice0 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_input_available"
+             units="percent"
+             symbol_name="Rasterizer0InputAvailable"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice0 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data0_ready"
+             units="percent"
+             symbol_name="PixelData0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Rasterizer Output Ready"
+             description="The percentage of time in which slice0 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_output_ready"
+             units="percent"
+             symbol_name="Rasterizer0OutputReady"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Slice0 PS Output Available"
+             description="The percentage of time in which slice0 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output0_available"
+             units="percent"
+             symbol_name="PSOutput0Available"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x102F3800" />
+        <register type="NOA" address="0x00009888" value="0x144D0500" />
+        <register type="NOA" address="0x00009888" value="0x120D03C0" />
+        <register type="NOA" address="0x00009888" value="0x140D03CF" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0004" />
+        <register type="NOA" address="0x00009888" value="0x0C4E4000" />
+        <register type="NOA" address="0x00009888" value="0x042F0480" />
+        <register type="NOA" address="0x00009888" value="0x082F0000" />
+        <register type="NOA" address="0x00009888" value="0x022F0000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0090" />
+        <register type="NOA" address="0x00009888" value="0x064D0027" />
+        <register type="NOA" address="0x00009888" value="0x004D0000" />
+        <register type="NOA" address="0x00009888" value="0x000D0D40" />
+        <register type="NOA" address="0x00009888" value="0x020D803F" />
+        <register type="NOA" address="0x00009888" value="0x040D8023" />
+        <register type="NOA" address="0x00009888" value="0x100D0000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020F0010" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0050" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41901400" />
+        <register type="NOA" address="0x00009888" value="0x43901485" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900001" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000EFFF" />
+        <register type="OA" address="0x00002778" value="0x00006000" />
+        <register type="OA" address="0x0000277C" value="0x0000F3FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler"
+       hw_config_guid="1b34c0d6-4f4c-4d7b-833f-4aaf236d87a6"
+       chipset="SKLGT3"
+       symbol_name="Sampler"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice2 Input Available"
+             description="The percentage of time in which slice0 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_input_available"
+             units="percent"
+             symbol_name="Sampler02InputAvailable"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice0 Input Available"
+             description="The percentage of time in which slice0 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_input_available"
+             units="percent"
+             symbol_name="Sampler00InputAvailable"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_output_ready"
+             units="percent"
+             symbol_name="Sampler02OutputReady"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice0 Subslice1 Input Available"
+             description="The percentage of time in which slice0 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_input_available"
+             units="percent"
+             symbol_name="Sampler01InputAvailable"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice0 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_output_ready"
+             units="percent"
+             symbol_name="Sampler00OutputReady"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice0 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_output_ready"
+             units="percent"
+             symbol_name="Sampler01OutputReady"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x14152C00" />
+        <register type="NOA" address="0x00009888" value="0x16150005" />
+        <register type="NOA" address="0x00009888" value="0x121600A0" />
+        <register type="NOA" address="0x00009888" value="0x14352C00" />
+        <register type="NOA" address="0x00009888" value="0x16350005" />
+        <register type="NOA" address="0x00009888" value="0x123600A0" />
+        <register type="NOA" address="0x00009888" value="0x14552C00" />
+        <register type="NOA" address="0x00009888" value="0x16550005" />
+        <register type="NOA" address="0x00009888" value="0x125600A0" />
+        <register type="NOA" address="0x00009888" value="0x062F6000" />
+        <register type="NOA" address="0x00009888" value="0x022F2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0050" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0010" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0350" />
+        <register type="NOA" address="0x00009888" value="0x0C0FB000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F00DA" />
+        <register type="NOA" address="0x00009888" value="0x182C0028" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x022DC000" />
+        <register type="NOA" address="0x00009888" value="0x042D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C138000" />
+        <register type="NOA" address="0x00009888" value="0x0E132000" />
+        <register type="NOA" address="0x00009888" value="0x0413C000" />
+        <register type="NOA" address="0x00009888" value="0x1C140018" />
+        <register type="NOA" address="0x00009888" value="0x0C157000" />
+        <register type="NOA" address="0x00009888" value="0x0E150078" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x04162180" />
+        <register type="NOA" address="0x00009888" value="0x02160000" />
+        <register type="NOA" address="0x00009888" value="0x04174000" />
+        <register type="NOA" address="0x00009888" value="0x0233A000" />
+        <register type="NOA" address="0x00009888" value="0x04333000" />
+        <register type="NOA" address="0x00009888" value="0x14348000" />
+        <register type="NOA" address="0x00009888" value="0x16348000" />
+        <register type="NOA" address="0x00009888" value="0x02357870" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x04360043" />
+        <register type="NOA" address="0x00009888" value="0x02360000" />
+        <register type="NOA" address="0x00009888" value="0x04371000" />
+        <register type="NOA" address="0x00009888" value="0x0E538000" />
+        <register type="NOA" address="0x00009888" value="0x00538000" />
+        <register type="NOA" address="0x00009888" value="0x06533000" />
+        <register type="NOA" address="0x00009888" value="0x1C540020" />
+        <register type="NOA" address="0x00009888" value="0x12548000" />
+        <register type="NOA" address="0x00009888" value="0x0E557000" />
+        <register type="NOA" address="0x00009888" value="0x00557800" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x06560043" />
+        <register type="NOA" address="0x00009888" value="0x02560000" />
+        <register type="NOA" address="0x00009888" value="0x06571000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900060" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900060" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_1"
+       hw_config_guid="b375c985-9953-455b-bda2-b03f7594e9db"
+       chipset="SKLGT3"
+       symbol_name="TDL_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12120000" />
+        <register type="NOA" address="0x00009888" value="0x12320000" />
+        <register type="NOA" address="0x00009888" value="0x12520000" />
+        <register type="NOA" address="0x00009888" value="0x002F8000" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0015" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F03A0" />
+        <register type="NOA" address="0x00009888" value="0x0C0FF000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0095" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D4000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x02108000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x02118000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x02121880" />
+        <register type="NOA" address="0x00009888" value="0x041219B5" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x02134000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x0C308000" />
+        <register type="NOA" address="0x00009888" value="0x0E304000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x0C318000" />
+        <register type="NOA" address="0x00009888" value="0x0E314000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x0C321A80" />
+        <register type="NOA" address="0x00009888" value="0x0E320033" />
+        <register type="NOA" address="0x00009888" value="0x06320031" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x0C334000" />
+        <register type="NOA" address="0x00009888" value="0x0E331000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0E508000" />
+        <register type="NOA" address="0x00009888" value="0x00508000" />
+        <register type="NOA" address="0x00009888" value="0x02504000" />
+        <register type="NOA" address="0x00009888" value="0x0E518000" />
+        <register type="NOA" address="0x00009888" value="0x00518000" />
+        <register type="NOA" address="0x00009888" value="0x02514000" />
+        <register type="NOA" address="0x00009888" value="0x0E521880" />
+        <register type="NOA" address="0x00009888" value="0x00521A80" />
+        <register type="NOA" address="0x00009888" value="0x02520033" />
+        <register type="NOA" address="0x00009888" value="0x0E534000" />
+        <register type="NOA" address="0x00009888" value="0x00534000" />
+        <register type="NOA" address="0x00009888" value="0x02531000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900062" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x00007FFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x00009FFF" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000EFFF" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000F3FF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FDFF" />
+        <register type="OA" address="0x00002798" value="0x00000000" />
+        <register type="OA" address="0x0000279C" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_2"
+       hw_config_guid="3e2be2bb-884a-49bb-82c5-2358e6bd5f2d"
+       chipset="SKLGT3"
+       symbol_name="TDL_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort0"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort1"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort1"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort0"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort0"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort1"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12124D60" />
+        <register type="NOA" address="0x00009888" value="0x12322E60" />
+        <register type="NOA" address="0x00009888" value="0x12524D60" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0014" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0FE000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0097" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x002D8000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x04121FB7" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x00308000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x00318000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x00321B80" />
+        <register type="NOA" address="0x00009888" value="0x0632003F" />
+        <register type="NOA" address="0x00009888" value="0x00334000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0250C000" />
+        <register type="NOA" address="0x00009888" value="0x0251C000" />
+        <register type="NOA" address="0x00009888" value="0x02521FB7" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x02535000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900063" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extra Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extra"
+       hw_config_guid="2d80a648-7b5a-4e92-bbe7-3b5c76f2e221"
+       chipset="SKLGT3"
+       symbol_name="ComputeExtra"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active including Ext Math"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing including Extended Math processing"
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ  C 5 READ C 6 READ FADD C 7 READ FADD C 2 READ FADD C 3 READ FADD C 4 READ FADD 8 FMUL FADD 100 FMUL $EuCoresTotalCount FDIV $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active_adjusted"
+             units="percent"
+             symbol_name="Fpu1ActiveAdjusted"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121203E0" />
+        <register type="NOA" address="0x00009888" value="0x123203E0" />
+        <register type="NOA" address="0x00009888" value="0x125203E0" />
+        <register type="NOA" address="0x00009888" value="0x129203E0" />
+        <register type="NOA" address="0x00009888" value="0x12B203E0" />
+        <register type="NOA" address="0x00009888" value="0x12D203E0" />
+        <register type="NOA" address="0x00009888" value="0x024EC000" />
+        <register type="NOA" address="0x00009888" value="0x044EC000" />
+        <register type="NOA" address="0x00009888" value="0x064EC000" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0042" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F006D" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x042D8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06114000" />
+        <register type="NOA" address="0x00009888" value="0x06120033" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x04308000" />
+        <register type="NOA" address="0x00009888" value="0x04318000" />
+        <register type="NOA" address="0x00009888" value="0x04321980" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x04334000" />
+        <register type="NOA" address="0x00009888" value="0x04504000" />
+        <register type="NOA" address="0x00009888" value="0x04514000" />
+        <register type="NOA" address="0x00009888" value="0x04520033" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x04531000" />
+        <register type="NOA" address="0x00009888" value="0x00AF8000" />
+        <register type="NOA" address="0x00009888" value="0x0ACC0001" />
+        <register type="NOA" address="0x00009888" value="0x008D8000" />
+        <register type="NOA" address="0x00009888" value="0x028DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C8FB000" />
+        <register type="NOA" address="0x00009888" value="0x0E8F0001" />
+        <register type="NOA" address="0x00009888" value="0x06AC8000" />
+        <register type="NOA" address="0x00009888" value="0x02AD4000" />
+        <register type="NOA" address="0x00009888" value="0x02908000" />
+        <register type="NOA" address="0x00009888" value="0x02918000" />
+        <register type="NOA" address="0x00009888" value="0x02921980" />
+        <register type="NOA" address="0x00009888" value="0x00920000" />
+        <register type="NOA" address="0x00009888" value="0x02934000" />
+        <register type="NOA" address="0x00009888" value="0x02B04000" />
+        <register type="NOA" address="0x00009888" value="0x02B14000" />
+        <register type="NOA" address="0x00009888" value="0x02B20033" />
+        <register type="NOA" address="0x00009888" value="0x00B20000" />
+        <register type="NOA" address="0x00009888" value="0x02B31000" />
+        <register type="NOA" address="0x00009888" value="0x00D08000" />
+        <register type="NOA" address="0x00009888" value="0x00D18000" />
+        <register type="NOA" address="0x00009888" value="0x00D21980" />
+        <register type="NOA" address="0x00009888" value="0x00D34000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900402" />
+        <register type="NOA" address="0x00009888" value="0x53901550" />
+        <register type="NOA" address="0x00009888" value="0x45900080" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+  </set>
+
+  <set name="Media Vme Pipe Gen9"
+       mdapi_supported_apis="MEDIA IO BB"
+       underscore_name="vme_pipe"
+       hw_config_guid="cfae9232-6ffc-42cc-a703-9790016925f0"
+       chipset="SKLGT3"
+       symbol_name="VMEPipe"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL  GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="8 A 10 READ FMUL $EuThreadsCount FDIV $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VME Busy"
+             description="The percentage of time in which VME (IME or CRE) was actively processing data."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ B 3 READ FADD 2 FDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vme_busy"
+             units="percent"
+             symbol_name="VMEBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Overview System Batch Tier2"
+             mdapi_group="VME Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A5800" />
+        <register type="NOA" address="0x00009888" value="0x161A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12180240" />
+        <register type="NOA" address="0x00009888" value="0x14180002" />
+        <register type="NOA" address="0x00009888" value="0x149A5800" />
+        <register type="NOA" address="0x00009888" value="0x169A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12980240" />
+        <register type="NOA" address="0x00009888" value="0x14980002" />
+        <register type="NOA" address="0x00009888" value="0x1A4E3FC0" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x022F8000" />
+        <register type="NOA" address="0x00009888" value="0x042F3000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C9500" />
+        <register type="NOA" address="0x00009888" value="0x0C4C002A" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5500" />
+        <register type="NOA" address="0x00009888" value="0x100F0015" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CAA00" />
+        <register type="NOA" address="0x00009888" value="0x182C000A" />
+        <register type="NOA" address="0x00009888" value="0x04193000" />
+        <register type="NOA" address="0x00009888" value="0x081A28C1" />
+        <register type="NOA" address="0x00009888" value="0x001A0000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x0613C000" />
+        <register type="NOA" address="0x00009888" value="0x0813F000" />
+        <register type="NOA" address="0x00009888" value="0x00172000" />
+        <register type="NOA" address="0x00009888" value="0x06178000" />
+        <register type="NOA" address="0x00009888" value="0x0817A000" />
+        <register type="NOA" address="0x00009888" value="0x00180037" />
+        <register type="NOA" address="0x00009888" value="0x06180940" />
+        <register type="NOA" address="0x00009888" value="0x08180000" />
+        <register type="NOA" address="0x00009888" value="0x02180000" />
+        <register type="NOA" address="0x00009888" value="0x04183000" />
+        <register type="NOA" address="0x00009888" value="0x04AFC000" />
+        <register type="NOA" address="0x00009888" value="0x06AF3000" />
+        <register type="NOA" address="0x00009888" value="0x0ACC4000" />
+        <register type="NOA" address="0x00009888" value="0x0CCC0015" />
+        <register type="NOA" address="0x00009888" value="0x0A8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E8F4000" />
+        <register type="NOA" address="0x00009888" value="0x108F0015" />
+        <register type="NOA" address="0x00009888" value="0x16ACA000" />
+        <register type="NOA" address="0x00009888" value="0x18AC000A" />
+        <register type="NOA" address="0x00009888" value="0x06993000" />
+        <register type="NOA" address="0x00009888" value="0x0C9A28C1" />
+        <register type="NOA" address="0x00009888" value="0x009A0000" />
+        <register type="NOA" address="0x00009888" value="0x0A93F000" />
+        <register type="NOA" address="0x00009888" value="0x0C93F000" />
+        <register type="NOA" address="0x00009888" value="0x0A97A000" />
+        <register type="NOA" address="0x00009888" value="0x0C97A000" />
+        <register type="NOA" address="0x00009888" value="0x0A980977" />
+        <register type="NOA" address="0x00009888" value="0x08980000" />
+        <register type="NOA" address="0x00009888" value="0x04980000" />
+        <register type="NOA" address="0x00009888" value="0x06983000" />
+        <register type="NOA" address="0x00009888" value="0x119000FF" />
+        <register type="NOA" address="0x00009888" value="0x51900050" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x55900115" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x47900884" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900002" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00100030" />
+        <register type="OA" address="0x00002774" value="0x0000FFF9" />
+        <register type="OA" address="0x00002778" value="0x00000002" />
+        <register type="OA" address="0x0000277C" value="0x0000FFFC" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000FFF3" />
+        <register type="OA" address="0x00002788" value="0x00100180" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00000002" />
+        <register type="OA" address="0x0000279C" value="0x0000FF3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00008003" />
+    </register_config>
+  </set>
+
+  <set name="MDAPI testing set Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="test_oa"
+       hw_config_guid="2b985803-d3c9-4629-8a4f-634bfecba0e8"
+       chipset="SKLGT3"
+       symbol_name="TestOa"
+       >
+    <counter name="TestCounter7"
+             description="HW test counter 7. Factor: 0.666"
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="counter7"
+             units="events"
+             symbol_name="Counter7"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="TestCounter8"
+             description="HW test counter 8. Should be equal to 1."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="counter8"
+             units="events"
+             symbol_name="Counter8"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter4"
+             description="HW test counter 4. Factor: 0.333"
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="counter4"
+             units="events"
+             symbol_name="Counter4"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter5"
+             description="HW test counter 5. Factor: 0.333"
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="counter5"
+             units="events"
+             symbol_name="Counter5"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter6"
+             description="HW test counter 6. Factor: 0.166"
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="counter6"
+             units="events"
+             symbol_name="Counter6"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter3"
+             description="HW test counter 3. Factor: 0.5"
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="counter3"
+             units="events"
+             symbol_name="Counter3"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter0"
+             description="HW test counter 0. Factor: 0.0"
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="counter0"
+             units="events"
+             symbol_name="Counter0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter1"
+             description="HW test counter 1. Factor: 1.0"
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="counter1"
+             units="events"
+             symbol_name="Counter1"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter2"
+             description="HW test counter 2. Factor: 1.0"
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="counter2"
+             units="events"
+             symbol_name="Counter2"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810000" />
+        <register type="NOA" address="0x00009888" value="0x07810013" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930040" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x00000004" />
+        <register type="OA" address="0x00002774" value="0x00000000" />
+        <register type="OA" address="0x00002778" value="0x00000003" />
+        <register type="OA" address="0x0000277C" value="0x00000000" />
+        <register type="OA" address="0x00002780" value="0x00000007" />
+        <register type="OA" address="0x00002784" value="0x00000000" />
+        <register type="OA" address="0x00002788" value="0x00100002" />
+        <register type="OA" address="0x0000278C" value="0x0000FFF7" />
+        <register type="OA" address="0x00002790" value="0x00100002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00100082" />
+        <register type="OA" address="0x0000279C" value="0x0000FFEF" />
+        <register type="OA" address="0x000027A0" value="0x001000C2" />
+        <register type="OA" address="0x000027A4" value="0x0000FFE7" />
+        <register type="OA" address="0x000027A8" value="0x00100001" />
+        <register type="OA" address="0x000027AC" value="0x0000FFE7" />
+    </register_config>
+  </set>
+
+</metrics>
diff --git a/src/mesa/drivers/dri/i965/brw_oa_sklgt4.xml b/src/mesa/drivers/dri/i965/brw_oa_sklgt4.xml
new file mode 100644
index 0000000..10833f1
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_oa_sklgt4.xml
@@ -0,0 +1,10522 @@
+<?xml version="1.0"?>
+<metrics version="1491577975" merge_md5="">
+  <set name="Render Metrics Basic Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_basic"
+       hw_config_guid="bad77c24-cc64-480d-99bf-e7b740713800"
+       chipset="SKLGT4"
+       symbol_name="RenderBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Misses 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Cache Misses"
+             description="The total number of sampler cache misses in all LODs in all sampler units."
+             data_type="uint64"
+             equation="B 4 READ B 5 READ UADD B 3 READ UADD 8 UMUL"
+             underscore_name="sampler_l1_misses"
+             units="messages"
+             symbol_name="SamplerL1Misses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$SamplerL1Misses 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were actively processed on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Sampler 0 Busy"
+             description="The percentage of time in which Sampler 0 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler0_busy"
+             units="percent"
+             symbol_name="Sampler0Busy"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler 1 Busy"
+             description="The percentage of time in which Sampler 1 has been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler1_busy"
+             units="percent"
+             symbol_name="Sampler1Busy"
+             availability="$SubsliceMask 0x12 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Samplers Busy"
+             description="The percentage of time in which samplers have been processing EU requests."
+             data_type="float"
+             max_equation="100"
+             equation="$Sampler0Busy $Sampler1Busy FMAX"
+             underscore_name="samplers_busy"
+             units="percent"
+             symbol_name="SamplersBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI Fixed Pipe Throughput"
+             description="The total number of GPU memory bytes transferred between 3D Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="B 6 READ B 7 READ UADD 64 UMUL"
+             underscore_name="gti_vf_throughput"
+             units="bytes"
+             symbol_name="GtiVfThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/3D Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler 0 Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which Sampler 0 has been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             max_equation="100"
+             underscore_name="sampler0_bottleneck"
+             units="percent"
+             symbol_name="Sampler0Bottleneck"
+             availability="$SubsliceMask 0x09 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="L3 Lookup Accesses w/o IC"
+             description="The total number of L3 cache lookup accesses w/o IC."
+             data_type="uint64"
+             equation="$SamplerL1Misses $ShaderMemoryAccesses UADD"
+             underscore_name="l3_lookups"
+             units="messages"
+             symbol_name="L3Lookups"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Depth Throughput"
+             description="The total number of GPU memory bytes transferred between depth caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 0 READ C 1 READ UADD 64 UMUL"
+             underscore_name="gti_depth_throughput"
+             units="bytes"
+             symbol_name="GtiDepthThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Depth Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Samplers Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which samplers have been slowing down the pipe when processing EU requests."
+             data_type="float"
+             high_watermark="15"
+             equation="$Sampler0Bottleneck"
+             max_equation="100"
+             underscore_name="sampler_bottleneck"
+             units="percent"
+             symbol_name="SamplerBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Indicate System Frame Batch Draw"
+             mdapi_group="Sampler"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="GTI HDC TLB Lookup Throughput"
+             description="The total number of GPU memory bytes transferred between GTI and HDC, when HDC is doing TLB lookups."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_hdc_lookups_throughput"
+             units="bytes"
+             symbol_name="GtiHdcLookupsThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI RCC Throughput"
+             description="The total number of GPU memory bytes transferred between render color caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 2 READ C 3 READ UADD 64 UMUL"
+             underscore_name="gti_rcc_throughput"
+             units="bytes"
+             symbol_name="GtiRccThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/Color Cache"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C01E0" />
+        <register type="NOA" address="0x00009888" value="0x12170280" />
+        <register type="NOA" address="0x00009888" value="0x12370280" />
+        <register type="NOA" address="0x00009888" value="0x16EC01E0" />
+        <register type="NOA" address="0x00009888" value="0x176C01E0" />
+        <register type="NOA" address="0x00009888" value="0x11930317" />
+        <register type="NOA" address="0x00009888" value="0x159303DF" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x1A4E03B0" />
+        <register type="NOA" address="0x00009888" value="0x0A6C0053" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A1B4000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0001" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x042F1000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4CA400" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0002" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5600" />
+        <register type="NOA" address="0x00009888" value="0x100F0001" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CAA00" />
+        <register type="NOA" address="0x00009888" value="0x062D8000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x08133000" />
+        <register type="NOA" address="0x00009888" value="0x00170020" />
+        <register type="NOA" address="0x00009888" value="0x08170021" />
+        <register type="NOA" address="0x00009888" value="0x10170000" />
+        <register type="NOA" address="0x00009888" value="0x0633C000" />
+        <register type="NOA" address="0x00009888" value="0x06370800" />
+        <register type="NOA" address="0x00009888" value="0x10370000" />
+        <register type="NOA" address="0x00009888" value="0x1ACE0230" />
+        <register type="NOA" address="0x00009888" value="0x0AEC5300" />
+        <register type="NOA" address="0x00009888" value="0x10EC0000" />
+        <register type="NOA" address="0x00009888" value="0x1CEC0000" />
+        <register type="NOA" address="0x00009888" value="0x0A9B8000" />
+        <register type="NOA" address="0x00009888" value="0x1C9C0002" />
+        <register type="NOA" address="0x00009888" value="0x0ACC2000" />
+        <register type="NOA" address="0x00009888" value="0x0CCC0002" />
+        <register type="NOA" address="0x00009888" value="0x088D8000" />
+        <register type="NOA" address="0x00009888" value="0x0A8D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E8F1000" />
+        <register type="NOA" address="0x00009888" value="0x108F0001" />
+        <register type="NOA" address="0x00009888" value="0x16AC8800" />
+        <register type="NOA" address="0x00009888" value="0x1B4E0020" />
+        <register type="NOA" address="0x00009888" value="0x096C5300" />
+        <register type="NOA" address="0x00009888" value="0x116C0000" />
+        <register type="NOA" address="0x00009888" value="0x1D6C0000" />
+        <register type="NOA" address="0x00009888" value="0x091B8000" />
+        <register type="NOA" address="0x00009888" value="0x1B1C8000" />
+        <register type="NOA" address="0x00009888" value="0x0B4C2000" />
+        <register type="NOA" address="0x00009888" value="0x090D8000" />
+        <register type="NOA" address="0x00009888" value="0x0F0F1000" />
+        <register type="NOA" address="0x00009888" value="0x172C0800" />
+        <register type="NOA" address="0x00009888" value="0x0D933031" />
+        <register type="NOA" address="0x00009888" value="0x0F933E3F" />
+        <register type="NOA" address="0x00009888" value="0x01933D00" />
+        <register type="NOA" address="0x00009888" value="0x0393073C" />
+        <register type="NOA" address="0x00009888" value="0x0593000E" />
+        <register type="NOA" address="0x00009888" value="0x1D930000" />
+        <register type="NOA" address="0x00009888" value="0x19930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900158" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x2B908000" />
+        <register type="NOA" address="0x00009888" value="0x2D908000" />
+        <register type="NOA" address="0x00009888" value="0x2F908000" />
+        <register type="NOA" address="0x00009888" value="0x31908000" />
+        <register type="NOA" address="0x00009888" value="0x15908000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190003F" />
+        <register type="NOA" address="0x00009888" value="0x5190FF30" />
+        <register type="NOA" address="0x00009888" value="0x41900060" />
+        <register type="NOA" address="0x00009888" value="0x55903033" />
+        <register type="NOA" address="0x00009888" value="0x45901421" />
+        <register type="NOA" address="0x00009888" value="0x47900803" />
+        <register type="NOA" address="0x00009888" value="0x5790FFF1" />
+        <register type="NOA" address="0x00009888" value="0x49900001" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x5990000F" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x5390FFFF" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Basic Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_basic"
+       hw_config_guid="7277228f-e7f3-4743-945a-6a2049d11377"
+       chipset="SKLGT4"
+       symbol_name="ComputeBasic"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Bytes Read"
+             description="The total number of untyped memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 6 READ B 7 READ C 0 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_read"
+             units="bytes"
+             symbol_name="UntypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Bytes Written"
+             description="The total number of typed memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 3 READ B 4 READ B 5 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_written"
+             units="bytes"
+             symbol_name="TypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Untyped Writes"
+             description="The total number of untyped memory bytes written via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 1 READ C 2 READ C 3 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="untyped_bytes_written"
+             units="bytes"
+             symbol_name="UntypedBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="Typed Bytes Read"
+             description="The total number of typed memory bytes read via Data Port."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="B 0 READ B 1 READ B 2 READ UADD UADD $EuSlicesTotalCount 64 UMUL UMUL"
+             underscore_name="typed_bytes_read"
+             units="bytes"
+             symbol_name="TypedBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 5 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO OCL BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F00E0" />
+        <register type="NOA" address="0x00009888" value="0x124F1C00" />
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E0820" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x064F0900" />
+        <register type="NOA" address="0x00009888" value="0x084F0032" />
+        <register type="NOA" address="0x00009888" value="0x0A4F1891" />
+        <register type="NOA" address="0x00009888" value="0x0C4F0E00" />
+        <register type="NOA" address="0x00009888" value="0x0E4F003C" />
+        <register type="NOA" address="0x00009888" value="0x004F0D80" />
+        <register type="NOA" address="0x00009888" value="0x024F003B" />
+        <register type="NOA" address="0x00009888" value="0x006C0002" />
+        <register type="NOA" address="0x00009888" value="0x086C0100" />
+        <register type="NOA" address="0x00009888" value="0x0C6C000C" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B00" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x081B8000" />
+        <register type="NOA" address="0x00009888" value="0x0C1B4000" />
+        <register type="NOA" address="0x00009888" value="0x0E1B8000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C8000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0024" />
+        <register type="NOA" address="0x00009888" value="0x065B8000" />
+        <register type="NOA" address="0x00009888" value="0x085B4000" />
+        <register type="NOA" address="0x00009888" value="0x0A5BC000" />
+        <register type="NOA" address="0x00009888" value="0x0C5B8000" />
+        <register type="NOA" address="0x00009888" value="0x0E5B4000" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C6000" />
+        <register type="NOA" address="0x00009888" value="0x1C5C001B" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0208" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5500" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2CC000" />
+        <register type="NOA" address="0x00009888" value="0x162CFB00" />
+        <register type="NOA" address="0x00009888" value="0x182C00BE" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x19900157" />
+        <register type="NOA" address="0x00009888" value="0x1B900158" />
+        <register type="NOA" address="0x00009888" value="0x1D900105" />
+        <register type="NOA" address="0x00009888" value="0x1F900103" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x11900FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900821" />
+        <register type="NOA" address="0x00009888" value="0x47900802" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900802" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900002" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900422" />
+        <register type="NOA" address="0x00009888" value="0x53905555" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Render Metrics for 3D Pipeline Profile Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="render_pipe_profile"
+       hw_config_guid="463c668c-3f60-49b6-8f85-d995b635b3b2"
+       chipset="SKLGT4"
+       symbol_name="RenderPipeProfile"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which vertex shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_bottleneck"
+             units="percent"
+             symbol_name="VsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Hi-Depth Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which early hierarchical depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hi_depth_bottleneck"
+             units="percent"
+             symbol_name="HiDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which geometry shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gs_bottleneck"
+             units="percent"
+             symbol_name="GsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Geometry Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="BC Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which barycentric coordinates calculation pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="bc_bottleneck"
+             units="percent"
+             symbol_name="BcBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Barycentric Calc"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Stall"
+             description="The percentage of time in which hull stall pipeline stage was stalled."
+             data_type="float"
+             equation="C 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_stall"
+             units="percent"
+             symbol_name="HsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Hull Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="VF Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which vertex fetch pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="C 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vf_bottleneck"
+             units="percent"
+             symbol_name="VfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Input Assembler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Strip-Fans Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which strip-fans pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="10"
+             equation="B 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_bottleneck"
+             units="percent"
+             symbol_name="SfBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SF Stall"
+             description="The percentage of time in which strip-fans pipeline stage was stalled."
+             data_type="float"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sf_stall"
+             units="percent"
+             symbol_name="SfStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Strip-Fans"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="HS Bottleneck"
+             low_watermark="3"
+             description="The percentage of time in which hull shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="9"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="hs_bottleneck"
+             units="percent"
+             symbol_name="HsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Hull Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CL Stall"
+             description="The percentage of time in which clipper pipeline stage was stalled."
+             data_type="float"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_stall"
+             units="percent"
+             symbol_name="ClStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Clipper"
+             />
+    <counter name="SO Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which stream output pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_bottleneck"
+             units="percent"
+             symbol_name="SoBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Stream Output"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="DS Bottleneck"
+             low_watermark="5"
+             description="The percentage of time in which domain shader pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="15"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_bottleneck"
+             units="percent"
+             symbol_name="DsBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Domain Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Clipper Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which clipper pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="cl_bottleneck"
+             units="percent"
+             symbol_name="ClBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Clipper"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Stall"
+             description="The percentage of time in which domain shader pipeline stage was stalled."
+             data_type="float"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ds_stall"
+             units="percent"
+             symbol_name="DsStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Domain Shader"
+             />
+    <counter name="Early Depth Bottleneck"
+             low_watermark="10"
+             description="The percentage of time in which early depth test pipeline stage was slowing down the 3D pipeline."
+             data_type="float"
+             high_watermark="30"
+             equation="B 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="early_depth_bottleneck"
+             units="percent"
+             symbol_name="EarlyDepthBottleneck"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Indicate Draw"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SO Stall"
+             description="The percentage of time in which stream-output pipeline stage was stalled."
+             data_type="float"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="so_stall"
+             units="percent"
+             symbol_name="SoStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Correlate Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Stream Output"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x0C0E001F" />
+        <register type="NOA" address="0x00009888" value="0x0A0F0000" />
+        <register type="NOA" address="0x00009888" value="0x10116800" />
+        <register type="NOA" address="0x00009888" value="0x178A03E0" />
+        <register type="NOA" address="0x00009888" value="0x11824C00" />
+        <register type="NOA" address="0x00009888" value="0x11830020" />
+        <register type="NOA" address="0x00009888" value="0x13840020" />
+        <register type="NOA" address="0x00009888" value="0x11850019" />
+        <register type="NOA" address="0x00009888" value="0x11860007" />
+        <register type="NOA" address="0x00009888" value="0x01870C40" />
+        <register type="NOA" address="0x00009888" value="0x17880000" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0040" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x040D4000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020E5400" />
+        <register type="NOA" address="0x00009888" value="0x000E0000" />
+        <register type="NOA" address="0x00009888" value="0x080F0040" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x100F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0040" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06110012" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x01898000" />
+        <register type="NOA" address="0x00009888" value="0x0D890100" />
+        <register type="NOA" address="0x00009888" value="0x03898000" />
+        <register type="NOA" address="0x00009888" value="0x09808000" />
+        <register type="NOA" address="0x00009888" value="0x0B808000" />
+        <register type="NOA" address="0x00009888" value="0x0380C000" />
+        <register type="NOA" address="0x00009888" value="0x0F8A0075" />
+        <register type="NOA" address="0x00009888" value="0x1D8A0000" />
+        <register type="NOA" address="0x00009888" value="0x118A8000" />
+        <register type="NOA" address="0x00009888" value="0x1B8A4000" />
+        <register type="NOA" address="0x00009888" value="0x138A8000" />
+        <register type="NOA" address="0x00009888" value="0x1D81A000" />
+        <register type="NOA" address="0x00009888" value="0x15818000" />
+        <register type="NOA" address="0x00009888" value="0x17818000" />
+        <register type="NOA" address="0x00009888" value="0x0B820030" />
+        <register type="NOA" address="0x00009888" value="0x07828000" />
+        <register type="NOA" address="0x00009888" value="0x0D824000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x05824000" />
+        <register type="NOA" address="0x00009888" value="0x0D830003" />
+        <register type="NOA" address="0x00009888" value="0x0583000C" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x03838000" />
+        <register type="NOA" address="0x00009888" value="0x07838000" />
+        <register type="NOA" address="0x00009888" value="0x0B840980" />
+        <register type="NOA" address="0x00009888" value="0x03844D80" />
+        <register type="NOA" address="0x00009888" value="0x11840000" />
+        <register type="NOA" address="0x00009888" value="0x09848000" />
+        <register type="NOA" address="0x00009888" value="0x09850080" />
+        <register type="NOA" address="0x00009888" value="0x03850003" />
+        <register type="NOA" address="0x00009888" value="0x01850000" />
+        <register type="NOA" address="0x00009888" value="0x07860000" />
+        <register type="NOA" address="0x00009888" value="0x0F860400" />
+        <register type="NOA" address="0x00009888" value="0x09870032" />
+        <register type="NOA" address="0x00009888" value="0x01888052" />
+        <register type="NOA" address="0x00009888" value="0x11880000" />
+        <register type="NOA" address="0x00009888" value="0x09884000" />
+        <register type="NOA" address="0x00009888" value="0x1B931001" />
+        <register type="NOA" address="0x00009888" value="0x1D930001" />
+        <register type="NOA" address="0x00009888" value="0x19934000" />
+        <register type="NOA" address="0x00009888" value="0x1B958000" />
+        <register type="NOA" address="0x00009888" value="0x1D950094" />
+        <register type="NOA" address="0x00009888" value="0x19958000" />
+        <register type="NOA" address="0x00009888" value="0x09E58000" />
+        <register type="NOA" address="0x00009888" value="0x0BE58000" />
+        <register type="NOA" address="0x00009888" value="0x03E5C000" />
+        <register type="NOA" address="0x00009888" value="0x0592C000" />
+        <register type="NOA" address="0x00009888" value="0x0B928000" />
+        <register type="NOA" address="0x00009888" value="0x0D924000" />
+        <register type="NOA" address="0x00009888" value="0x0F924000" />
+        <register type="NOA" address="0x00009888" value="0x11928000" />
+        <register type="NOA" address="0x00009888" value="0x1392C000" />
+        <register type="NOA" address="0x00009888" value="0x09924000" />
+        <register type="NOA" address="0x00009888" value="0x01985000" />
+        <register type="NOA" address="0x00009888" value="0x07988000" />
+        <register type="NOA" address="0x00009888" value="0x09981000" />
+        <register type="NOA" address="0x00009888" value="0x0B982000" />
+        <register type="NOA" address="0x00009888" value="0x0D982000" />
+        <register type="NOA" address="0x00009888" value="0x0F989000" />
+        <register type="NOA" address="0x00009888" value="0x05982000" />
+        <register type="NOA" address="0x00009888" value="0x13904000" />
+        <register type="NOA" address="0x00009888" value="0x21904000" />
+        <register type="NOA" address="0x00009888" value="0x23904000" />
+        <register type="NOA" address="0x00009888" value="0x25908000" />
+        <register type="NOA" address="0x00009888" value="0x27904000" />
+        <register type="NOA" address="0x00009888" value="0x29908000" />
+        <register type="NOA" address="0x00009888" value="0x2B904000" />
+        <register type="NOA" address="0x00009888" value="0x2F904000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x15904000" />
+        <register type="NOA" address="0x00009888" value="0x17908000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B904000" />
+        <register type="NOA" address="0x00009888" value="0x1190C080" />
+        <register type="NOA" address="0x00009888" value="0x51901110" />
+        <register type="NOA" address="0x00009888" value="0x41900440" />
+        <register type="NOA" address="0x00009888" value="0x55901111" />
+        <register type="NOA" address="0x00009888" value="0x45900400" />
+        <register type="NOA" address="0x00009888" value="0x47900C21" />
+        <register type="NOA" address="0x00009888" value="0x57901411" />
+        <register type="NOA" address="0x00009888" value="0x49900042" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900024" />
+        <register type="NOA" address="0x00009888" value="0x59900001" />
+        <register type="NOA" address="0x00009888" value="0x43900841" />
+        <register type="NOA" address="0x00009888" value="0x53900411" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFEA" />
+        <register type="OA" address="0x00002774" value="0x00007FFC" />
+        <register type="OA" address="0x00002778" value="0x0007AFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000F5FD" />
+        <register type="OA" address="0x00002780" value="0x00079FFA" />
+        <register type="OA" address="0x00002784" value="0x0000F3FB" />
+        <register type="OA" address="0x00002788" value="0x0007BF7A" />
+        <register type="OA" address="0x0000278C" value="0x0000F7E7" />
+        <register type="OA" address="0x00002790" value="0x0007FEFA" />
+        <register type="OA" address="0x00002794" value="0x0000F7CF" />
+        <register type="OA" address="0x00002798" value="0x00077FFA" />
+        <register type="OA" address="0x0000279C" value="0x0000EFDF" />
+        <register type="OA" address="0x000027A0" value="0x0006FFFA" />
+        <register type="OA" address="0x000027A4" value="0x0000CFBF" />
+        <register type="OA" address="0x000027A8" value="0x0003FFFA" />
+        <register type="OA" address="0x000027AC" value="0x00005F7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Reads Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_reads"
+       hw_config_guid="3ae6e74c-72c3-4040-9bd0-7961430b8cc8"
+       chipset="SKLGT4"
+       symbol_name="MemoryReads"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank0Reads"
+             description="The total number of GTI memory reads from L3 Bank 0 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_reads"
+             units="messages"
+             symbol_name="GtiL3Bank0Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all accesses from GTI to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiL3Bank3Reads"
+             description="The total number of GTI memory reads from L3 Bank 3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_reads"
+             units="messages"
+             symbol_name="GtiL3Bank3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiRsMemoryReads"
+             description="The total number of GTI memory reads from Resource Streamer."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_rs_memory_reads"
+             units="messages"
+             symbol_name="GtiRsMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Resource Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiHizMemoryReads"
+             description="The total number of GTI memory reads from Hierarchical Depth Cache (Hi-Depth Cache misses)."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_reads"
+             units="messages"
+             symbol_name="GtiHizMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="GtiRccMemoryReads"
+             description="The total number of GTI memory reads from Render Color Cache (Render Color Cache misses)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_reads"
+             units="messages"
+             symbol_name="GtiRccMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Bank1Reads"
+             description="The total number of GTI memory reads from L3 Bank 1 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_reads"
+             units="messages"
+             symbol_name="GtiL3Bank1Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiCmdStreamerMemoryReads"
+             description="The total number of GTI memory reads from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_reads"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="GtiL3Bank2Reads"
+             description="The total number of GTI memory reads from L3 Bank 2 (L3 Cache misses)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_reads"
+             units="messages"
+             symbol_name="GtiL3Bank2Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiMemoryReads"
+             description="The total number of GTI memory reads."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_reads"
+             units="messages"
+             symbol_name="GtiMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="GtiRczMemoryReads"
+             description="The total number of GTI memory reads from Render Depth Cache (Render Depth Cache misses)."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_reads"
+             units="messages"
+             symbol_name="GtiRczMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="GtiMscMemoryReads"
+             description="The total number of GTI memory reads from Multisampling Color Cache (Multisampling Color Cache misses)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_reads"
+             units="messages"
+             symbol_name="GtiMscMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiVfMemoryReads"
+             description="The total number of GTI memory reads from Vertex Fetch."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="gti_vf_memory_reads"
+             units="messages"
+             symbol_name="GtiVfMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Vertex Fetch"
+             />
+    <counter name="GtiStcMemoryReads"
+             description="The total number of GTI memory reads from Stencil Cache (Stencil Cache misses)."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_reads"
+             units="messages"
+             symbol_name="GtiStcMemoryReads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiL3Reads"
+             description="The total number of GTI memory reads from L3 (L3 Cache misses)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Reads $GtiL3Bank1Reads $GtiL3Bank2Reads $GtiL3Bank3Reads UADD UADD UADD"
+             underscore_name="gti_l3_reads"
+             units="messages"
+             symbol_name="GtiL3Reads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F900064" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900150" />
+        <register type="NOA" address="0x00009888" value="0x21900151" />
+        <register type="NOA" address="0x00009888" value="0x23900152" />
+        <register type="NOA" address="0x00009888" value="0x25900153" />
+        <register type="NOA" address="0x00009888" value="0x27900154" />
+        <register type="NOA" address="0x00009888" value="0x29900155" />
+        <register type="NOA" address="0x00009888" value="0x2B900156" />
+        <register type="NOA" address="0x00009888" value="0x2D900157" />
+        <register type="NOA" address="0x00009888" value="0x2F90015F" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F872" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Memory Writes Distribution Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="memory_writes"
+       hw_config_guid="055f256d-4052-467c-8dec-6064a4806433"
+       chipset="SKLGT4"
+       symbol_name="MemoryWrites"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiMemoryWrites"
+             description="The total number of GTI memory writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="gti_memory_writes"
+             units="messages"
+             symbol_name="GtiMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="GtiRingAccesses"
+             description="The total number of all GTI accesses to the ring."
+             data_type="uint64"
+             equation="C 3 READ 2 UMUL"
+             underscore_name="gti_ring_accesses"
+             units="messages"
+             symbol_name="GtiRingAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="GtiMscMemoryWrites"
+             description="The total number of GTI memory writes from Multisampling Color Cache (Multisampling Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="gti_msc_memory_writes"
+             units="messages"
+             symbol_name="GtiMscMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="GtiCmdStreamerMemoryWrites"
+             description="The total number of GTI memory writes from Command Streamer."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="gti_cmd_streamer_memory_writes"
+             units="messages"
+             symbol_name="GtiCmdStreamerMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Command Streamer"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GtiL3Bank0Writes"
+             description="The total number of GTI memory writes from L3 Bank 0 (L3 Bank 0 invalidations)."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="gti_l3_bank0_writes"
+             units="messages"
+             symbol_name="GtiL3Bank0Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank1Writes"
+             description="The total number of GTI memory writes from L3 Bank 1 (L3 Bank 1 invalidations)."
+             data_type="uint64"
+             equation="C 5 READ"
+             underscore_name="gti_l3_bank1_writes"
+             units="messages"
+             symbol_name="GtiL3Bank1Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank2Writes"
+             description="The total number of GTI memory writes from L3 Bank 2 (L3 Bank 2 invalidations)."
+             data_type="uint64"
+             equation="C 6 READ"
+             underscore_name="gti_l3_bank2_writes"
+             units="messages"
+             symbol_name="GtiL3Bank2Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Bank3Writes"
+             description="The total number of GTI memory writes from L3 Bank 3 (L3 Bank 3 invalidations)."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="gti_l3_bank3_writes"
+             units="messages"
+             symbol_name="GtiL3Bank3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GtiL3Writes"
+             description="The total number of GTI memory writes from L3 (L3 invalidations)."
+             data_type="uint64"
+             equation="$GtiL3Bank0Writes $GtiL3Bank1Writes $GtiL3Bank2Writes $GtiL3Bank3Writes UADD UADD UADD"
+             underscore_name="gti_l3_writes"
+             units="messages"
+             symbol_name="GtiL3Writes"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/L3"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiRccMemoryWrites"
+             description="The total number of GTI memory writes from Render Color Cache (Render Color Cache invalidations)."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="gti_rcc_memory_writes"
+             units="messages"
+             symbol_name="GtiRccMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Color Cache"
+             />
+    <counter name="GtiSoMemoryWrites"
+             description="The total number of GTI memory writes from Stream Output."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="gti_so_memory_writes"
+             units="messages"
+             symbol_name="GtiSoMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/3D Pipe/Stream Output"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GtiStcMemoryWrites"
+             description="The total number of GTI memory writes from Stencil Cache."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="gti_stc_memory_writes"
+             units="messages"
+             symbol_name="GtiStcMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="GtiRczMemoryWrites"
+             description="The total number of GTI memory writes from Render Depth Cache."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="gti_rcz_memory_writes"
+             units="messages"
+             symbol_name="GtiRczMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="GtiHizMemoryWrites"
+             description="The total number of GTI memory writes from Hierarchical Depth Cache."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="gti_hiz_memory_writes"
+             units="messages"
+             symbol_name="GtiHizMemoryWrites"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GTI/Depth Cache"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810C00" />
+        <register type="NOA" address="0x00009888" value="0x1381001A" />
+        <register type="NOA" address="0x00009888" value="0x37906800" />
+        <register type="NOA" address="0x00009888" value="0x3F901000" />
+        <register type="NOA" address="0x00009888" value="0x03811300" />
+        <register type="NOA" address="0x00009888" value="0x05811B12" />
+        <register type="NOA" address="0x00009888" value="0x0781001A" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x17810000" />
+        <register type="NOA" address="0x00009888" value="0x19810000" />
+        <register type="NOA" address="0x00009888" value="0x1B810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930055" />
+        <register type="NOA" address="0x00009888" value="0x03E58000" />
+        <register type="NOA" address="0x00009888" value="0x05E5C000" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x13900160" />
+        <register type="NOA" address="0x00009888" value="0x21900161" />
+        <register type="NOA" address="0x00009888" value="0x23900162" />
+        <register type="NOA" address="0x00009888" value="0x25900163" />
+        <register type="NOA" address="0x00009888" value="0x27900164" />
+        <register type="NOA" address="0x00009888" value="0x29900165" />
+        <register type="NOA" address="0x00009888" value="0x2B900166" />
+        <register type="NOA" address="0x00009888" value="0x2D900167" />
+        <register type="NOA" address="0x00009888" value="0x2F900150" />
+        <register type="NOA" address="0x00009888" value="0x31900105" />
+        <register type="NOA" address="0x00009888" value="0x15900103" />
+        <register type="NOA" address="0x00009888" value="0x17900101" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1D908000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C60" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900C63" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C63" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900063" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x0000272C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002728" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x0000271C" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002718" value="0xFFFFFFFF" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x0000274C" value="0x86543210" />
+        <register type="OA" address="0x00002748" value="0x86543210" />
+        <register type="OA" address="0x00002744" value="0x00006667" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x0000275C" value="0x86543210" />
+        <register type="OA" address="0x00002758" value="0x86543210" />
+        <register type="OA" address="0x00002754" value="0x00006465" />
+        <register type="OA" address="0x00002750" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007F81A" />
+        <register type="OA" address="0x00002774" value="0x0000FE00" />
+        <register type="OA" address="0x00002778" value="0x0007F82A" />
+        <register type="OA" address="0x0000277C" value="0x0000FE00" />
+        <register type="OA" address="0x00002780" value="0x0007F822" />
+        <register type="OA" address="0x00002784" value="0x0000FE00" />
+        <register type="OA" address="0x00002788" value="0x0007F8BA" />
+        <register type="OA" address="0x0000278C" value="0x0000FE00" />
+        <register type="OA" address="0x00002790" value="0x0007F87A" />
+        <register type="OA" address="0x00002794" value="0x0000FE00" />
+        <register type="OA" address="0x00002798" value="0x0007F8EA" />
+        <register type="OA" address="0x0000279C" value="0x0000FE00" />
+        <register type="OA" address="0x000027A0" value="0x0007F8E2" />
+        <register type="OA" address="0x000027A4" value="0x0000FE00" />
+        <register type="OA" address="0x000027A8" value="0x0007F8F2" />
+        <register type="OA" address="0x000027AC" value="0x0000FE00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00015014" />
+        <register type="FLEX" address="0x0000E658" value="0x00025024" />
+        <register type="FLEX" address="0x0000E758" value="0x00035034" />
+        <register type="FLEX" address="0x0000E45C" value="0x00045044" />
+        <register type="FLEX" address="0x0000E55C" value="0x00055054" />
+        <register type="FLEX" address="0x0000E65C" value="0x00065064" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extended Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extended"
+       hw_config_guid="753972d4-87cd-4460-824d-754463ac5054"
+       chipset="SKLGT4"
+       symbol_name="ComputeExtended"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Typed Writes 0"
+             description="The subslice 0 typed writes."
+             data_type="uint64"
+             equation="C 0 READ"
+             underscore_name="typed_writes0"
+             units="messages"
+             symbol_name="TypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuTypedAtomics0"
+             description="The subslice 0 EU Typed Atomics subslice 0."
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="eu_typed_atomics0"
+             units="messages"
+             symbol_name="EuTypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Atomics 0"
+             description="The subslice 0 typed atomics."
+             data_type="uint64"
+             equation="C 4 READ"
+             underscore_name="typed_atomics0"
+             units="messages"
+             symbol_name="TypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedAtomicsPerCacheLine"
+             description="The ratio of EU typed atomics requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedAtomics0 $TypedAtomics0 FDIV"
+             underscore_name="typed_atomics_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedAtomicsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedReads0"
+             description="The subslice 0 EU Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="eu_untyped_reads0"
+             units="messages"
+             symbol_name="EuUntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Writes 0"
+             description="The subslice 0 untyped writes (including SLM writes)."
+             data_type="uint64"
+             equation="C 1 READ"
+             underscore_name="untyped_writes0"
+             units="messages"
+             symbol_name="UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedAtomics0"
+             description="The subslice 0 EU Untyped Atomics subslice 0."
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="eu_untyped_atomics0"
+             units="messages"
+             symbol_name="EuUntypedAtomics0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuUntypedWrites0"
+             description="The subslice 0 EU Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="eu_untyped_writes0"
+             units="messages"
+             symbol_name="EuUntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedWrites0"
+             description="The subslice 0 EU A64 Untyped Writes subslice 0."
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="eu_a64_untyped_writes0"
+             units="messages"
+             symbol_name="EuA64UntypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedWritesPerCacheLine"
+             description="The ratio of EU untyped write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuUntypedWrites0 $EuA64UntypedWrites0 UADD $UntypedWrites0 FDIV"
+             underscore_name="untyped_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedWrites0"
+             description="The subslice 0 EU Typed Writes subslice 0."
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="eu_typed_writes0"
+             units="messages"
+             symbol_name="EuTypedWrites0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedWritesPerCacheLine"
+             description="The ratio of EU typed write requests to L3 cache line writes."
+             data_type="float"
+             equation="$EuTypedWrites0 $TypedWrites0 FDIV"
+             underscore_name="typed_writes_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedWritesPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Typed Reads 0"
+             description="The subslice 0 typed reads."
+             data_type="uint64"
+             equation="C 2 READ"
+             underscore_name="typed_reads0"
+             units="messages"
+             symbol_name="TypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Untyped Reads 0"
+             description="The subslice 0 untyped reads (including SLM reads)."
+             data_type="uint64"
+             equation="C 3 READ"
+             underscore_name="untyped_reads0"
+             units="messages"
+             symbol_name="UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EuA64UntypedReads0"
+             description="The subslice 0 EU A64 Untyped Reads subslice 0."
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="eu_a64_untyped_reads0"
+             units="messages"
+             symbol_name="EuA64UntypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ 8 UMUL $EuCoresTotalCount UDIV $EuThreadsCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EuTypedReads0"
+             description="The subslice 0 EU Typed Reads subslice 0."
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="eu_typed_reads0"
+             units="messages"
+             symbol_name="EuTypedReads0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="UntypedReadsPerCacheLine"
+             description="The ratio of EU untyped read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuUntypedReads0 $EuA64UntypedReads0 UADD $UntypedReads0 FDIV"
+             underscore_name="untyped_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="UntypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="TypedReadsPerCacheLine"
+             description="The ratio of EU typed read requests to L3 cache line reads."
+             data_type="float"
+             equation="$EuTypedReads0 $TypedReads0 FDIV"
+             underscore_name="typed_reads_per_cache_line"
+             units="eu sends to l3 cache lines"
+             symbol_name="TypedReadsPerCacheLine"
+             semantic_type="ratio"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="subslice"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x106C00E0" />
+        <register type="NOA" address="0x00009888" value="0x141C8160" />
+        <register type="NOA" address="0x00009888" value="0x161C8015" />
+        <register type="NOA" address="0x00009888" value="0x181C0120" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4EAAA0" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E6C0B01" />
+        <register type="NOA" address="0x00009888" value="0x006C0200" />
+        <register type="NOA" address="0x00009888" value="0x026C000C" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x001C0041" />
+        <register type="NOA" address="0x00009888" value="0x061C4200" />
+        <register type="NOA" address="0x00009888" value="0x081C4443" />
+        <register type="NOA" address="0x00009888" value="0x0A1C4645" />
+        <register type="NOA" address="0x00009888" value="0x0C1C7647" />
+        <register type="NOA" address="0x00009888" value="0x041C7357" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x101C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0000" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4CAA2A" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02AA" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x000DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5515" />
+        <register type="NOA" address="0x00009888" value="0x100F0155" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CAA00" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x11907FFF" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900040" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900802" />
+        <register type="NOA" address="0x00009888" value="0x47900842" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900842" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x43900800" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FC2A" />
+        <register type="OA" address="0x00002774" value="0x0000BF00" />
+        <register type="OA" address="0x00002778" value="0x0007FC6A" />
+        <register type="OA" address="0x0000277C" value="0x0000BF00" />
+        <register type="OA" address="0x00002780" value="0x0007FC92" />
+        <register type="OA" address="0x00002784" value="0x0000BF00" />
+        <register type="OA" address="0x00002788" value="0x0007FCA2" />
+        <register type="OA" address="0x0000278C" value="0x0000BF00" />
+        <register type="OA" address="0x00002790" value="0x0007FC32" />
+        <register type="OA" address="0x00002794" value="0x0000BF00" />
+        <register type="OA" address="0x00002798" value="0x0007FC9A" />
+        <register type="OA" address="0x0000279C" value="0x0000BF00" />
+        <register type="OA" address="0x000027A0" value="0x0007FE6A" />
+        <register type="OA" address="0x000027A4" value="0x0000BF00" />
+        <register type="OA" address="0x000027A8" value="0x0007FE7A" />
+        <register type="OA" address="0x000027AC" value="0x0000BF00" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00778008" />
+        <register type="FLEX" address="0x0000E45C" value="0x00088078" />
+        <register type="FLEX" address="0x0000E55C" value="0x00808708" />
+        <register type="FLEX" address="0x0000E65C" value="0x00A08908" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics L3 Cache Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_l3_cache"
+       hw_config_guid="4e4392e9-8f73-457b-ab44-b49f7a0c733b"
+       chipset="SKLGT4"
+       symbol_name="ComputeL3Cache"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 03 Accesses"
+             description="The total number of accesses to L3 Bank 03."
+             data_type="uint64"
+             equation="B 3 READ 2 UMUL"
+             underscore_name="l3_bank03_accesses"
+             units="messages"
+             symbol_name="L3Bank03Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="L3 Accesses"
+             description="The total number of L3 accesses from all entities."
+             data_type="uint64"
+             equation="C 0 READ C 1 READ B 2 READ B 3 READ UADD UADD UADD 2 UMUL"
+             underscore_name="l3_accesses"
+             units="messages"
+             symbol_name="L3Accesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Sampler Throughput"
+             description="The total number of GPU memory bytes transferred between samplers and L3 caches."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="A 29 READ 64 UMUL"
+             underscore_name="l3_sampler_throughput"
+             units="bytes"
+             symbol_name="L3SamplerThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Sampler"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu0_active"
+             units="percent"
+             symbol_name="Fpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU AVG IPC Rate"
+             description="The average rate of IPC calculated for 2 FPU pipelines."
+             data_type="float"
+             max_equation="2"
+             equation="A 9 READ  A 10 READ  A 11 READ UADD  A 9 READ USUB FDIV 1 FADD"
+             underscore_name="eu_avg_ipc_rate"
+             units="number"
+             symbol_name="EuAvgIpcRate"
+             semantic_type="ratio"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="EU FPU0 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 13 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu0_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Misses"
+             description="The total number of L3 misses."
+             data_type="uint64"
+             equation="C 4 READ C 5 READ UADD"
+             underscore_name="l3_misses"
+             units="messages"
+             symbol_name="L3Misses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/TAG"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="L3 Bank 00 Accesses"
+             description="The total number of accesses to L3 Bank 00."
+             data_type="uint64"
+             equation="C 0 READ 2 UMUL"
+             underscore_name="l3_bank00_accesses"
+             units="messages"
+             symbol_name="L3Bank00Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="EU FPU0 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 19 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu0_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Hybrid Instruction"
+             description="The percentage of time in which execution units were actively processing hybrid instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 14 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_hybrid_fpu1_instruction"
+             units="percent"
+             symbol_name="EuHybridFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Read Throughput"
+             description="The total number of GPU memory bytes read from GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL"
+             equation="C 6 READ 64 UMUL"
+             underscore_name="gti_read_throughput"
+             units="bytes"
+             symbol_name="GtiReadThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI L3 Throughput"
+             description="The total number of GPU memory bytes transferred between L3 caches and GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="C 4 READ C 5 READ UADD 64 UMUL"
+             underscore_name="gti_l3_throughput"
+             units="bytes"
+             symbol_name="GtiL3Throughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Bank 00 IC Accesses"
+             description="The total number of accesses to L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 0 READ B 1 READ UADD 2 UMUL $L3Bank00Accesses UMIN"
+             underscore_name="l3_bank00_ic_accesses"
+             units="messages"
+             symbol_name="L3Bank00IcAccesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="L3 Bank 00 IC Hits"
+             description="The total number of hits in L3 Bank 00 from IC cache."
+             data_type="uint64"
+             equation="B 1 READ 2 UMUL $L3Bank00IcAccesses UMIN"
+             underscore_name="l3_bank00_ic_hits"
+             units="messages"
+             symbol_name="L3Bank00IcHits"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3/IC"
+             />
+    <counter name="Sampler Accesses"
+             description="The total number of messages send to samplers."
+             data_type="uint64"
+             equation="A 28 READ"
+             underscore_name="sampler_accesses"
+             units="messages"
+             symbol_name="SamplerAccesses"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler"
+             />
+    <counter name="L3 Bank 01 Accesses"
+             description="The total number of accesses to L3 Bank 01."
+             data_type="uint64"
+             equation="C 1 READ 2 UMUL"
+             underscore_name="l3_bank01_accesses"
+             units="messages"
+             symbol_name="L3Bank01Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU0 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU0."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu0_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu0Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU FPU1 Move Instruction"
+             description="The percentage of time in which execution units were actively processing move instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 20 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_move_fpu1_instruction"
+             units="percent"
+             symbol_name="EuMoveFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="L3 Bank 02 Accesses"
+             description="The total number of accesses to L3 Bank 02."
+             data_type="uint64"
+             equation="B 2 READ 2 UMUL"
+             underscore_name="l3_bank02_accesses"
+             units="messages"
+             symbol_name="L3Bank02Accesses"
+             availability="$SliceMask 0x01 AND"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_hw_unit_type="slice"
+             mdapi_group="L3"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="L3 Total Throughput"
+             description="The total number of GPU memory bytes transferred via L3."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL"
+             equation="$L3Accesses 64 UMUL"
+             underscore_name="l3_total_throughput"
+             units="bytes"
+             symbol_name="L3TotalThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GTI Write Throughput"
+             description="The total number of GPU memory bytes written to GTI."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 32 UMUL"
+             equation="C 7 READ 64 UMUL"
+             underscore_name="gti_write_throughput"
+             units="bytes"
+             symbol_name="GtiWriteThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GTI"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="EU FPU1 Binary Instruction"
+             description="The percentage of time in which execution units were actively processing binary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_binary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuBinaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU FPU1 Ternary Instruction"
+             description="The percentage of time in which execution units were actively processing ternary instructions on FPU1."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_ternary_fpu1_instruction"
+             units="percent"
+             symbol_name="EuTernaryFpu1Instruction"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes/Instructions"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_send_active"
+             units="percent"
+             symbol_name="EuSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x166C0760" />
+        <register type="NOA" address="0x00009888" value="0x1593001E" />
+        <register type="NOA" address="0x00009888" value="0x3F900003" />
+        <register type="NOA" address="0x00009888" value="0x004E8000" />
+        <register type="NOA" address="0x00009888" value="0x0E4E8000" />
+        <register type="NOA" address="0x00009888" value="0x184E8000" />
+        <register type="NOA" address="0x00009888" value="0x1A4E8020" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x006C0051" />
+        <register type="NOA" address="0x00009888" value="0x066C5000" />
+        <register type="NOA" address="0x00009888" value="0x086C5C5D" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5E5F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x186C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x001B4000" />
+        <register type="NOA" address="0x00009888" value="0x061B8000" />
+        <register type="NOA" address="0x00009888" value="0x081BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x101C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1CE000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C0030" />
+        <register type="NOA" address="0x00009888" value="0x004C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C2A00" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0280" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F1500" />
+        <register type="NOA" address="0x00009888" value="0x100F0140" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162C0A00" />
+        <register type="NOA" address="0x00009888" value="0x182C00A0" />
+        <register type="NOA" address="0x00009888" value="0x03933300" />
+        <register type="NOA" address="0x00009888" value="0x05930032" />
+        <register type="NOA" address="0x00009888" value="0x11930000" />
+        <register type="NOA" address="0x00009888" value="0x1B930000" />
+        <register type="NOA" address="0x00009888" value="0x1D900157" />
+        <register type="NOA" address="0x00009888" value="0x1F900158" />
+        <register type="NOA" address="0x00009888" value="0x35900000" />
+        <register type="NOA" address="0x00009888" value="0x19908000" />
+        <register type="NOA" address="0x00009888" value="0x1B908000" />
+        <register type="NOA" address="0x00009888" value="0x1190030F" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900000" />
+        <register type="NOA" address="0x00009888" value="0x55900000" />
+        <register type="NOA" address="0x00009888" value="0x45900021" />
+        <register type="NOA" address="0x00009888" value="0x47900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900000" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x53905555" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x0007FFFA" />
+        <register type="OA" address="0x00002774" value="0x0000FEFE" />
+        <register type="OA" address="0x00002778" value="0x0007FFFA" />
+        <register type="OA" address="0x0000277C" value="0x0000FEFD" />
+        <register type="OA" address="0x00002790" value="0x0007FFFA" />
+        <register type="OA" address="0x00002794" value="0x0000FBEF" />
+        <register type="OA" address="0x00002798" value="0x0007FFFA" />
+        <register type="OA" address="0x0000279C" value="0x0000FBDF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00000003" />
+        <register type="FLEX" address="0x0000E658" value="0x00002001" />
+        <register type="FLEX" address="0x0000E758" value="0x00101100" />
+        <register type="FLEX" address="0x0000E45C" value="0x00201200" />
+        <register type="FLEX" address="0x0000E55C" value="0x00301300" />
+        <register type="FLEX" address="0x0000E65C" value="0x00401400" />
+    </register_config>
+  </set>
+
+  <set name="Metric set HDCAndSF"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="hdc_and_sf"
+       hw_config_guid="730d95dd-7da8-4e1c-ab8d-c0eb1e4c1805"
+       chipset="SKLGT4"
+       symbol_name="HDCAndSF"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Polygon Data Ready"
+             description="The percentage of time in which geometry pipeline output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="poly_data_ready"
+             units="percent"
+             symbol_name="PolyDataReady"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe/Strip-Fans"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="HDC stalled by L3 (s0.ss1)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss1)"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ C 4 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader01_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader01AccessStalledOnL3"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss2)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss2)"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ C 2 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader02_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader02AccessStalledOnL3"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="HDC stalled by L3 (s0.ss0)"
+             description="Percentage of time when HDC has messages to L3, but it's stalled due to lack of credits (s0.ss0)"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ C 6 READ USUB 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_sampler_shader00_access_stalled_on_l3"
+             units="percent"
+             symbol_name="NonSamplerShader00AccessStalledOnL3"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="GPU/Data Port"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x104F0232" />
+        <register type="NOA" address="0x00009888" value="0x124F4640" />
+        <register type="NOA" address="0x00009888" value="0x106C0232" />
+        <register type="NOA" address="0x00009888" value="0x11834400" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x004F1880" />
+        <register type="NOA" address="0x00009888" value="0x024F08BB" />
+        <register type="NOA" address="0x00009888" value="0x044F001B" />
+        <register type="NOA" address="0x00009888" value="0x046C0100" />
+        <register type="NOA" address="0x00009888" value="0x066C000B" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x041B8000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x005B8000" />
+        <register type="NOA" address="0x00009888" value="0x025BC000" />
+        <register type="NOA" address="0x00009888" value="0x045B4000" />
+        <register type="NOA" address="0x00009888" value="0x125C8000" />
+        <register type="NOA" address="0x00009888" value="0x145C8000" />
+        <register type="NOA" address="0x00009888" value="0x165C8000" />
+        <register type="NOA" address="0x00009888" value="0x185C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00A0" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x022CC000" />
+        <register type="NOA" address="0x00009888" value="0x042CC000" />
+        <register type="NOA" address="0x00009888" value="0x062CC000" />
+        <register type="NOA" address="0x00009888" value="0x082CC000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0F828000" />
+        <register type="NOA" address="0x00009888" value="0x0F8305C0" />
+        <register type="NOA" address="0x00009888" value="0x09830000" />
+        <register type="NOA" address="0x00009888" value="0x07830000" />
+        <register type="NOA" address="0x00009888" value="0x1D950080" />
+        <register type="NOA" address="0x00009888" value="0x13928000" />
+        <register type="NOA" address="0x00009888" value="0x0F988000" />
+        <register type="NOA" address="0x00009888" value="0x31904000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x59900001" />
+        <register type="NOA" address="0x00009888" value="0x4B900040" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x10800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000FDFF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_1"
+       hw_config_guid="d9e86d70-462b-462a-851e-fd63e8c13d63"
+       chipset="SKLGT4"
+       symbol_name="L3_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank1 Active"
+             description="The percentage of time in which slice0 L3 bank1 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_active"
+             units="percent"
+             symbol_name="L30Bank1Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 L3 Bank1 Stalled"
+             description="The percentage of time in which slice0 L3 bank1 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank1_stalled"
+             units="percent"
+             symbol_name="L30Bank1Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed the L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Active"
+             description="The percentage of time in which slice0 L3 bank0 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_active"
+             units="percent"
+             symbol_name="L30Bank0Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank0 Stalled"
+             description="The percentage of time in which slice0 L3 bank0 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank0_stalled"
+             units="percent"
+             symbol_name="L30Bank0Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C7B40" />
+        <register type="NOA" address="0x00009888" value="0x166C0020" />
+        <register type="NOA" address="0x00009888" value="0x0A603444" />
+        <register type="NOA" address="0x00009888" value="0x0A613400" />
+        <register type="NOA" address="0x00009888" value="0x1A4EA800" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0002" />
+        <register type="NOA" address="0x00009888" value="0x024E8000" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x0C6C5327" />
+        <register type="NOA" address="0x00009888" value="0x0E6C5425" />
+        <register type="NOA" address="0x00009888" value="0x006C2A00" />
+        <register type="NOA" address="0x00009888" value="0x026C285B" />
+        <register type="NOA" address="0x00009888" value="0x046C005C" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1C6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1E6C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0800" />
+        <register type="NOA" address="0x00009888" value="0x0C1BC000" />
+        <register type="NOA" address="0x00009888" value="0x0E1BC000" />
+        <register type="NOA" address="0x00009888" value="0x001B8000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x1C1C003C" />
+        <register type="NOA" address="0x00009888" value="0x121C8000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x10600000" />
+        <register type="NOA" address="0x00009888" value="0x04600000" />
+        <register type="NOA" address="0x00009888" value="0x0C610044" />
+        <register type="NOA" address="0x00009888" value="0x10610000" />
+        <register type="NOA" address="0x00009888" value="0x06610000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02A8" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0154" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x182C00AA" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190FFC0" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900420" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900021" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900400" />
+        <register type="NOA" address="0x00009888" value="0x43900421" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00014002" />
+        <register type="OA" address="0x0000277C" value="0x0000C3FF" />
+        <register type="OA" address="0x00002780" value="0x00010002" />
+        <register type="OA" address="0x00002784" value="0x0000C7FF" />
+        <register type="OA" address="0x00002788" value="0x00004002" />
+        <register type="OA" address="0x0000278C" value="0x0000D3FF" />
+        <register type="OA" address="0x00002790" value="0x00100700" />
+        <register type="OA" address="0x00002794" value="0x0000FF1F" />
+        <register type="OA" address="0x00002798" value="0x00001402" />
+        <register type="OA" address="0x0000279C" value="0x0000FC3F" />
+        <register type="OA" address="0x000027A0" value="0x00001002" />
+        <register type="OA" address="0x000027A4" value="0x0000FC7F" />
+        <register type="OA" address="0x000027A8" value="0x00000402" />
+        <register type="OA" address="0x000027AC" value="0x0000FD3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_2"
+       hw_config_guid="52200424-6ee9-48b3-b7fa-0afcf1975e4d"
+       chipset="SKLGT4"
+       symbol_name="L3_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Stalled"
+             description="The percentage of time in which slice0 L3 bank2 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_stalled"
+             units="percent"
+             symbol_name="L30Bank2Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank2 Active"
+             description="The percentage of time in which slice0 L3 bank2 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank2_active"
+             units="percent"
+             symbol_name="L30Bank2Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C02E0" />
+        <register type="NOA" address="0x00009888" value="0x146C0001" />
+        <register type="NOA" address="0x00009888" value="0x0A623400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x064F4000" />
+        <register type="NOA" address="0x00009888" value="0x026C3324" />
+        <register type="NOA" address="0x00009888" value="0x046C3422" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C0000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C0800" />
+        <register type="NOA" address="0x00009888" value="0x065B4000" />
+        <register type="NOA" address="0x00009888" value="0x1A5C1000" />
+        <register type="NOA" address="0x00009888" value="0x06614000" />
+        <register type="NOA" address="0x00009888" value="0x0C620044" />
+        <register type="NOA" address="0x00009888" value="0x10620000" />
+        <register type="NOA" address="0x00009888" value="0x06620000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C002A" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2CC000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set L3_3"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="l3_3"
+       hw_config_guid="1988315f-0a26-44df-acb0-df7ec86b1456"
+       chipset="SKLGT4"
+       symbol_name="L3_3"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 L3 Bank3 Stalled"
+             description="The percentage of time in which slice0 L3 bank3 is stalled"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_stalled"
+             units="percent"
+             symbol_name="L30Bank3Stalled"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 L3 Bank3 Active"
+             description="The percentage of time in which slice0 L3 bank3 is active"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="l30_bank3_active"
+             units="percent"
+             symbol_name="L30Bank3Active"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GTI/L3"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x126C4E80" />
+        <register type="NOA" address="0x00009888" value="0x146C0000" />
+        <register type="NOA" address="0x00009888" value="0x0A633400" />
+        <register type="NOA" address="0x00009888" value="0x044E8000" />
+        <register type="NOA" address="0x00009888" value="0x064E8000" />
+        <register type="NOA" address="0x00009888" value="0x084E8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4E8000" />
+        <register type="NOA" address="0x00009888" value="0x0C4E8000" />
+        <register type="NOA" address="0x00009888" value="0x026C3321" />
+        <register type="NOA" address="0x00009888" value="0x046C342F" />
+        <register type="NOA" address="0x00009888" value="0x106C0000" />
+        <register type="NOA" address="0x00009888" value="0x1A6C2000" />
+        <register type="NOA" address="0x00009888" value="0x021BC000" />
+        <register type="NOA" address="0x00009888" value="0x041BC000" />
+        <register type="NOA" address="0x00009888" value="0x061B4000" />
+        <register type="NOA" address="0x00009888" value="0x141C8000" />
+        <register type="NOA" address="0x00009888" value="0x161C8000" />
+        <register type="NOA" address="0x00009888" value="0x181C8000" />
+        <register type="NOA" address="0x00009888" value="0x1A1C1800" />
+        <register type="NOA" address="0x00009888" value="0x06604000" />
+        <register type="NOA" address="0x00009888" value="0x0C630044" />
+        <register type="NOA" address="0x00009888" value="0x10630000" />
+        <register type="NOA" address="0x00009888" value="0x06630000" />
+        <register type="NOA" address="0x00009888" value="0x084C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C00AA" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F4000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0055" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190F800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900002" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00100070" />
+        <register type="OA" address="0x00002774" value="0x0000FFF1" />
+        <register type="OA" address="0x00002778" value="0x00028002" />
+        <register type="OA" address="0x0000277C" value="0x000087FF" />
+        <register type="OA" address="0x00002780" value="0x00020002" />
+        <register type="OA" address="0x00002784" value="0x00008FFF" />
+        <register type="OA" address="0x00002788" value="0x00008002" />
+        <register type="OA" address="0x0000278C" value="0x0000A7FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set RasterizerAndPixelBackend"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="rasterizer_and_pixel_backend"
+       hw_config_guid="f1f17ca7-286e-4ae5-9d15-9fccad6c665d"
+       chipset="SKLGT4"
+       symbol_name="RasterizerAndPixelBackend"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Slice0 Pixel Values Ready"
+             description="The percentage of time in which slice0 pixel values are ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_values0_ready"
+             units="percent"
+             symbol_name="PixelValues0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="Slice0 Rasterizer Input Available"
+             description="The percentage of time in which slice0 rasterizer input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_input_available"
+             units="percent"
+             symbol_name="Rasterizer0InputAvailable"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Slice0 Post-EarlyZ Pixel Data Ready"
+             description="The percentage of time in which slice0 post-EarlyZ pixel data is ready (after early Z tests have been applied)"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="pixel_data0_ready"
+             units="percent"
+             symbol_name="PixelData0Ready"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer/Early Depth Test"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Rasterizer Output Ready"
+             description="The percentage of time in which slice0 rasterizer output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="rasterizer0_output_ready"
+             units="percent"
+             symbol_name="Rasterizer0OutputReady"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Rasterizer"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Slice0 PS Output Available"
+             description="The percentage of time in which slice0 PS output is available"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_output0_available"
+             units="percent"
+             symbol_name="PSOutput0Available"
+             availability="$SliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/3D Pipe"
+             mdapi_hw_unit_type="slice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x102F3800" />
+        <register type="NOA" address="0x00009888" value="0x144D0500" />
+        <register type="NOA" address="0x00009888" value="0x120D03C0" />
+        <register type="NOA" address="0x00009888" value="0x140D03CF" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0004" />
+        <register type="NOA" address="0x00009888" value="0x0C4E4000" />
+        <register type="NOA" address="0x00009888" value="0x042F0480" />
+        <register type="NOA" address="0x00009888" value="0x082F0000" />
+        <register type="NOA" address="0x00009888" value="0x022F0000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0090" />
+        <register type="NOA" address="0x00009888" value="0x064D0027" />
+        <register type="NOA" address="0x00009888" value="0x004D0000" />
+        <register type="NOA" address="0x00009888" value="0x000D0D40" />
+        <register type="NOA" address="0x00009888" value="0x020D803F" />
+        <register type="NOA" address="0x00009888" value="0x040D8023" />
+        <register type="NOA" address="0x00009888" value="0x100D0000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x020F0010" />
+        <register type="NOA" address="0x00009888" value="0x000F0000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0050" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41901400" />
+        <register type="NOA" address="0x00009888" value="0x43901485" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900001" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x30800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x0000EFFF" />
+        <register type="OA" address="0x00002778" value="0x00006000" />
+        <register type="OA" address="0x0000277C" value="0x0000F3FF" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set Sampler"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="sampler"
+       hw_config_guid="00a9e0fb-3d2e-4405-852c-dce6334ffb3b"
+       chipset="SKLGT4"
+       symbol_name="Sampler"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice2 Input Available"
+             description="The percentage of time in which slice0 subslice2 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_input_available"
+             units="percent"
+             symbol_name="Sampler02InputAvailable"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Slice0 Subslice0 Input Available"
+             description="The percentage of time in which slice0 subslice0 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_input_available"
+             units="percent"
+             symbol_name="Sampler00InputAvailable"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="Slice0 Subslice2 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice2 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler02_output_ready"
+             units="percent"
+             symbol_name="Sampler02OutputReady"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Slice0 Subslice1 Input Available"
+             description="The percentage of time in which slice0 subslice1 sampler input is available"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_input_available"
+             units="percent"
+             symbol_name="Sampler01InputAvailable"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Slice0 Subslice0 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice0 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler00_output_ready"
+             units="percent"
+             symbol_name="Sampler00OutputReady"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Slice0 Subslice1 Sampler Output Ready"
+             description="The percentage of time in which slice0 subslice1 sampler output is ready"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="sampler01_output_ready"
+             units="percent"
+             symbol_name="Sampler01OutputReady"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Sampler"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x14152C00" />
+        <register type="NOA" address="0x00009888" value="0x16150005" />
+        <register type="NOA" address="0x00009888" value="0x121600A0" />
+        <register type="NOA" address="0x00009888" value="0x14352C00" />
+        <register type="NOA" address="0x00009888" value="0x16350005" />
+        <register type="NOA" address="0x00009888" value="0x123600A0" />
+        <register type="NOA" address="0x00009888" value="0x14552C00" />
+        <register type="NOA" address="0x00009888" value="0x16550005" />
+        <register type="NOA" address="0x00009888" value="0x125600A0" />
+        <register type="NOA" address="0x00009888" value="0x062F6000" />
+        <register type="NOA" address="0x00009888" value="0x022F2000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C0050" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0010" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0350" />
+        <register type="NOA" address="0x00009888" value="0x0C0FB000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F00DA" />
+        <register type="NOA" address="0x00009888" value="0x182C0028" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x022DC000" />
+        <register type="NOA" address="0x00009888" value="0x042D4000" />
+        <register type="NOA" address="0x00009888" value="0x0C138000" />
+        <register type="NOA" address="0x00009888" value="0x0E132000" />
+        <register type="NOA" address="0x00009888" value="0x0413C000" />
+        <register type="NOA" address="0x00009888" value="0x1C140018" />
+        <register type="NOA" address="0x00009888" value="0x0C157000" />
+        <register type="NOA" address="0x00009888" value="0x0E150078" />
+        <register type="NOA" address="0x00009888" value="0x10150000" />
+        <register type="NOA" address="0x00009888" value="0x04162180" />
+        <register type="NOA" address="0x00009888" value="0x02160000" />
+        <register type="NOA" address="0x00009888" value="0x04174000" />
+        <register type="NOA" address="0x00009888" value="0x0233A000" />
+        <register type="NOA" address="0x00009888" value="0x04333000" />
+        <register type="NOA" address="0x00009888" value="0x14348000" />
+        <register type="NOA" address="0x00009888" value="0x16348000" />
+        <register type="NOA" address="0x00009888" value="0x02357870" />
+        <register type="NOA" address="0x00009888" value="0x10350000" />
+        <register type="NOA" address="0x00009888" value="0x04360043" />
+        <register type="NOA" address="0x00009888" value="0x02360000" />
+        <register type="NOA" address="0x00009888" value="0x04371000" />
+        <register type="NOA" address="0x00009888" value="0x0E538000" />
+        <register type="NOA" address="0x00009888" value="0x00538000" />
+        <register type="NOA" address="0x00009888" value="0x06533000" />
+        <register type="NOA" address="0x00009888" value="0x1C540020" />
+        <register type="NOA" address="0x00009888" value="0x12548000" />
+        <register type="NOA" address="0x00009888" value="0x0E557000" />
+        <register type="NOA" address="0x00009888" value="0x00557800" />
+        <register type="NOA" address="0x00009888" value="0x10550000" />
+        <register type="NOA" address="0x00009888" value="0x06560043" />
+        <register type="NOA" address="0x00009888" value="0x02560000" />
+        <register type="NOA" address="0x00009888" value="0x06571000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900060" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900842" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900060" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x70800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+        <register type="OA" address="0x00002770" value="0x0000C000" />
+        <register type="OA" address="0x00002774" value="0x0000E7FF" />
+        <register type="OA" address="0x00002778" value="0x00003000" />
+        <register type="OA" address="0x0000277C" value="0x0000F9FF" />
+        <register type="OA" address="0x00002780" value="0x00000C00" />
+        <register type="OA" address="0x00002784" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_1"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_1"
+       hw_config_guid="13dcc50a-7ec0-409b-99d6-a3f932cedcb3"
+       chipset="SKLGT4"
+       symbol_name="TDL_1"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on the both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texels lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice0 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice0 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 1 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread00_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread00ReadyForDispatch"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="PS Thread Ready For Dispatch on Slice0 Subslice2 Thread Dispatcher"
+             description="The percentage of time in which PS thread is ready for dispatch on slice0 subslice2 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_thread02_ready_for_dispatch"
+             units="percent"
+             symbol_name="PSThread02ReadyForDispatch"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Non-PS Thread Ready For Dispatch on Slice0 Subslice1 Thread Dispatcher"
+             description="The percentage of time in which non-PS thread is ready for dispatch on slice0 subslice1 thread dispatcher"
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="non_ps_thread01_ready_for_dispatch"
+             units="percent"
+             symbol_name="NonPSThread01ReadyForDispatch"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12120000" />
+        <register type="NOA" address="0x00009888" value="0x12320000" />
+        <register type="NOA" address="0x00009888" value="0x12520000" />
+        <register type="NOA" address="0x00009888" value="0x002F8000" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0015" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F03A0" />
+        <register type="NOA" address="0x00009888" value="0x0C0FF000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0095" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2D4000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x02108000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x02118000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x02121880" />
+        <register type="NOA" address="0x00009888" value="0x041219B5" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x02134000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x0C308000" />
+        <register type="NOA" address="0x00009888" value="0x0E304000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x0C318000" />
+        <register type="NOA" address="0x00009888" value="0x0E314000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x0C321A80" />
+        <register type="NOA" address="0x00009888" value="0x0E320033" />
+        <register type="NOA" address="0x00009888" value="0x06320031" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x0C334000" />
+        <register type="NOA" address="0x00009888" value="0x0E331000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0E508000" />
+        <register type="NOA" address="0x00009888" value="0x00508000" />
+        <register type="NOA" address="0x00009888" value="0x02504000" />
+        <register type="NOA" address="0x00009888" value="0x0E518000" />
+        <register type="NOA" address="0x00009888" value="0x00518000" />
+        <register type="NOA" address="0x00009888" value="0x02514000" />
+        <register type="NOA" address="0x00009888" value="0x0E521880" />
+        <register type="NOA" address="0x00009888" value="0x00521A80" />
+        <register type="NOA" address="0x00009888" value="0x02520033" />
+        <register type="NOA" address="0x00009888" value="0x0E534000" />
+        <register type="NOA" address="0x00009888" value="0x00534000" />
+        <register type="NOA" address="0x00009888" value="0x02531000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900800" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900062" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900C00" />
+        <register type="NOA" address="0x00009888" value="0x43900003" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00000002" />
+        <register type="OA" address="0x00002774" value="0x00007FFF" />
+        <register type="OA" address="0x00002778" value="0x00000000" />
+        <register type="OA" address="0x0000277C" value="0x00009FFF" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000EFFF" />
+        <register type="OA" address="0x00002788" value="0x00000000" />
+        <register type="OA" address="0x0000278C" value="0x0000F3FF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FDFF" />
+        <register type="OA" address="0x00002798" value="0x00000000" />
+        <register type="OA" address="0x0000279C" value="0x0000FE7F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Metric set TDL_2"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="tdl_2"
+       hw_config_guid="97875e21-6624-4aee-9191-682feb3eae21"
+       chipset="SKLGT4"
+       symbol_name="TDL_2"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Send Pipe Active"
+             description="The percentage of time in which EU send pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 12 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_send_active"
+             units="percent"
+             symbol_name="VsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 11 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu1_active"
+             units="percent"
+             symbol_name="VsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GS Threads Dispatched"
+             description="The total number of geometry shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 5 READ"
+             underscore_name="gs_threads"
+             units="threads"
+             symbol_name="GsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Geometry Shader"
+             />
+    <counter name="Early Hi-Depth Test Fails"
+             description="The total number of pixels dropped on early hierarchical depth test."
+             data_type="uint64"
+             equation="A 22 READ 4 UMUL"
+             underscore_name="hi_depth_test_fails"
+             units="pixels"
+             symbol_name="HiDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Hi-Depth Test"
+             />
+    <counter name="FS Both FPU Active"
+             description="The percentage of time in which fragment shaders were processed actively on both FPUs."
+             data_type="float"
+             max_equation="100"
+             equation="A 18 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_eu_both_fpu_active"
+             units="percent"
+             symbol_name="PsEuBothFpuActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Frame Batch Draw"
+             mdapi_group="3D Pipe/Fragment Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS Threads Dispatched"
+             description="The total number of vertex shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 1 READ"
+             underscore_name="vs_threads"
+             units="threads"
+             symbol_name="VsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Vertex Shader"
+             />
+    <counter name="FS Threads Dispatched"
+             description="The total number of fragment shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 6 READ"
+             underscore_name="ps_threads"
+             units="threads"
+             symbol_name="PsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Fragment Shader"
+             />
+    <counter name="Shader Barrier Messages"
+             description="The total number of shader barrier messages."
+             data_type="uint64"
+             equation="A 35 READ"
+             underscore_name="shader_barriers"
+             units="messages"
+             symbol_name="ShaderBarriers"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Barrier"
+             />
+    <counter name="Sampler Texels"
+             description="The total number of texels seen on input (with 2x2 accuracy) in all sampler units."
+             data_type="uint64"
+             equation="A 28 READ 4 UMUL"
+             underscore_name="sampler_texels"
+             units="texels"
+             symbol_name="SamplerTexels"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Input"
+             />
+    <counter name="Pixels Failing Tests"
+             description="The total number of pixels dropped on post-FS alpha, stencil, or depth tests."
+             data_type="uint64"
+             equation="A 25 READ 4 UMUL"
+             underscore_name="pixels_failing_post_ps_tests"
+             units="pixels"
+             symbol_name="PixelsFailingPostPsTests"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 7 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort0"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice1 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice1 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 2 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header01_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader01ReadyPort1"
+             availability="$SubsliceMask 0x2 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Sampler Texels Misses"
+             description="The total number of texel lookups (with 2x2 accuracy) that missed L1 sampler cache."
+             data_type="uint64"
+             equation="A 29 READ 4 UMUL"
+             underscore_name="sampler_texel_misses"
+             units="texels"
+             symbol_name="SamplerTexelMisses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Batch Frame Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="Sampler/Sampler Cache"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="SLM Bytes Read"
+             description="The total number of GPU memory bytes read from shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ 64 UMUL"
+             underscore_name="slm_bytes_read"
+             units="bytes"
+             symbol_name="SlmBytesRead"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 16 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu1_active"
+             units="percent"
+             symbol_name="PsFpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="PS Send Pipeline Active"
+             description="The percentage of time in which EU send pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 17 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_send_active"
+             units="percent"
+             symbol_name="PsSendActive"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a vertex shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 10 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vs_fpu0_active"
+             units="percent"
+             symbol_name="VsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Vertex Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 4 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort1"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice2 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice2 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 3 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header02_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader02ReadyPort0"
+             availability="$SubsliceMask 0x4 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Rasterized Pixels"
+             description="The total number of rasterized pixels."
+             data_type="uint64"
+             equation="A 21 READ 4 UMUL"
+             underscore_name="rasterized_pixels"
+             units="pixels"
+             symbol_name="RasterizedPixels"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer"
+             />
+    <counter name="PS FPU0 Pipe Active"
+             description="The percentage of time in which EU FPU0 pipeline was actively processing a pixel shader instruction."
+             data_type="float"
+             max_equation="100"
+             equation="A 15 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="ps_fpu0_active"
+             units="percent"
+             symbol_name="PsFpu0Active"
+             semantic_type="duration"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pixel Shader"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="DS Threads Dispatched"
+             description="The total number of domain shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 3 READ"
+             underscore_name="ds_threads"
+             units="threads"
+             symbol_name="DsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Domain Shader"
+             />
+    <counter name="Samples Written"
+             description="The total number of samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 26 READ 4 UMUL"
+             underscore_name="samples_written"
+             units="pixels"
+             symbol_name="SamplesWritten"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Blended"
+             description="The total number of blended samples or pixels written to all render targets."
+             data_type="uint64"
+             equation="A 27 READ 4 UMUL"
+             underscore_name="samples_blended"
+             units="pixels"
+             symbol_name="SamplesBlended"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Output Merger"
+             />
+    <counter name="Early Depth Test Fails"
+             description="The total number of pixels dropped on early depth test."
+             data_type="uint64"
+             equation="A 23 READ 4 UMUL"
+             underscore_name="early_depth_test_fails"
+             units="pixels"
+             symbol_name="EarlyDepthTestFails"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Rasterizer/Early Depth Test"
+             />
+    <counter name="Shader Memory Accesses"
+             description="The total number of shader memory accesses to L3."
+             data_type="uint64"
+             equation="A 32 READ"
+             underscore_name="shader_memory_accesses"
+             units="messages"
+             symbol_name="ShaderMemoryAccesses"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port"
+             />
+    <counter name="HS Threads Dispatched"
+             description="The total number of hull shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 2 READ"
+             underscore_name="hs_threads"
+             units="threads"
+             symbol_name="HsThreads"
+             semantic_type="event"
+             mdapi_supported_apis="OGL4 IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Hull Shader"
+             />
+    <counter name="SLM Bytes Written"
+             description="The total number of GPU memory bytes written into shared local memory."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 31 READ 64 UMUL"
+             underscore_name="slm_bytes_written"
+             units="bytes"
+             symbol_name="SlmBytesWritten"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_group="L3/Data Port/SLM"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="L3 Shader Throughput"
+             description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB."
+             data_type="uint64"
+             max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL"
+             equation="A 30 READ  A 31 READ $ShaderMemoryAccesses 64 UMUL UADD UADD"
+             underscore_name="l3_shader_throughput"
+             units="bytes"
+             symbol_name="L3ShaderThroughput"
+             semantic_type="throughput"
+             mdapi_supported_apis="OGL4 OCL IO BB"
+             mdapi_usage_flags="Tier2 Frame Batch Draw"
+             mdapi_group="L3/Data Port"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="Samples Killed in FS"
+             description="The total number of samples or pixels dropped in fragment shaders."
+             data_type="uint64"
+             equation="A 24 READ 4 UMUL"
+             underscore_name="samples_killed_in_ps"
+             units="pixels"
+             symbol_name="SamplesKilledInPs"
+             semantic_type="event"
+             mdapi_supported_apis="OGL IO BB"
+             mdapi_usage_flags="Tier4 Overview Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="3D Pipe/Fragment Shader"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 0"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 0"
+             data_type="float"
+             max_equation="100"
+             equation="C 5 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port0"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort0"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <counter name="Shader Atomic Memory Accesses"
+             description="The total number of shader atomic memory accesses."
+             data_type="uint64"
+             equation="A 34 READ"
+             underscore_name="shader_atomics"
+             units="messages"
+             symbol_name="ShaderAtomics"
+             semantic_type="event"
+             mdapi_supported_apis="OGL OCL IO BB"
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="L3/Data Port/Atomics"
+             />
+    <counter name="Thread Header Ready on Slice0 Subslice0 Thread Dispatcher Port 1"
+             description="The percentage of time in which thread header is ready on slice0 subslice0 thread dispatcher port 1"
+             data_type="float"
+             max_equation="100"
+             equation="C 6 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="thread_header00_ready_port1"
+             units="percent"
+             symbol_name="ThreadHeader00ReadyPort1"
+             availability="$SubsliceMask 0x1 AND"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview Frame Batch Draw"
+             mdapi_group="GPU/Thread Dispatcher"
+             mdapi_hw_unit_type="subslice"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x12124D60" />
+        <register type="NOA" address="0x00009888" value="0x12322E60" />
+        <register type="NOA" address="0x00009888" value="0x12524D60" />
+        <register type="NOA" address="0x00009888" value="0x022F3000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0014" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x0C0FE000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F0097" />
+        <register type="NOA" address="0x00009888" value="0x082C8000" />
+        <register type="NOA" address="0x00009888" value="0x0A2C8000" />
+        <register type="NOA" address="0x00009888" value="0x002D8000" />
+        <register type="NOA" address="0x00009888" value="0x062D4000" />
+        <register type="NOA" address="0x00009888" value="0x0410C000" />
+        <register type="NOA" address="0x00009888" value="0x0411C000" />
+        <register type="NOA" address="0x00009888" value="0x04121FB7" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x04135000" />
+        <register type="NOA" address="0x00009888" value="0x00308000" />
+        <register type="NOA" address="0x00009888" value="0x06304000" />
+        <register type="NOA" address="0x00009888" value="0x00318000" />
+        <register type="NOA" address="0x00009888" value="0x06314000" />
+        <register type="NOA" address="0x00009888" value="0x00321B80" />
+        <register type="NOA" address="0x00009888" value="0x0632003F" />
+        <register type="NOA" address="0x00009888" value="0x00334000" />
+        <register type="NOA" address="0x00009888" value="0x06331000" />
+        <register type="NOA" address="0x00009888" value="0x0250C000" />
+        <register type="NOA" address="0x00009888" value="0x0251C000" />
+        <register type="NOA" address="0x00009888" value="0x02521FB7" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x02535000" />
+        <register type="NOA" address="0x00009888" value="0x1190FC00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x51900000" />
+        <register type="NOA" address="0x00009888" value="0x41900800" />
+        <register type="NOA" address="0x00009888" value="0x43900063" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900040" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0x00800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x00800000" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00010003" />
+        <register type="FLEX" address="0x0000E658" value="0x00012011" />
+        <register type="FLEX" address="0x0000E758" value="0x00015014" />
+        <register type="FLEX" address="0x0000E45C" value="0x00051050" />
+        <register type="FLEX" address="0x0000E55C" value="0x00053052" />
+        <register type="FLEX" address="0x0000E65C" value="0x00055054" />
+    </register_config>
+  </set>
+
+  <set name="Compute Metrics Extra Gen9"
+       mdapi_supported_apis="OGL4 OCL IO BB"
+       underscore_name="compute_extra"
+       hw_config_guid="a5aa857d-e8f0-4dfa-8981-ce340fa748fd"
+       chipset="SKLGT4"
+       symbol_name="ComputeExtra"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU FPU1 Pipe Active"
+             description="The percentage of time in which EU FPU1 pipeline was actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="fpu1_active"
+             units="percent"
+             symbol_name="Fpu1Active"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x121203E0" />
+        <register type="NOA" address="0x00009888" value="0x123203E0" />
+        <register type="NOA" address="0x00009888" value="0x125203E0" />
+        <register type="NOA" address="0x00009888" value="0x129203E0" />
+        <register type="NOA" address="0x00009888" value="0x12B203E0" />
+        <register type="NOA" address="0x00009888" value="0x12D203E0" />
+        <register type="NOA" address="0x00009888" value="0x131203E0" />
+        <register type="NOA" address="0x00009888" value="0x133203E0" />
+        <register type="NOA" address="0x00009888" value="0x135203E0" />
+        <register type="NOA" address="0x00009888" value="0x1A4EF000" />
+        <register type="NOA" address="0x00009888" value="0x1C4E0003" />
+        <register type="NOA" address="0x00009888" value="0x024EC000" />
+        <register type="NOA" address="0x00009888" value="0x044EC000" />
+        <register type="NOA" address="0x00009888" value="0x064EC000" />
+        <register type="NOA" address="0x00009888" value="0x022F4000" />
+        <register type="NOA" address="0x00009888" value="0x0C4C02A0" />
+        <register type="NOA" address="0x00009888" value="0x084CA000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C0042" />
+        <register type="NOA" address="0x00009888" value="0x0C0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E0DA000" />
+        <register type="NOA" address="0x00009888" value="0x000D8000" />
+        <register type="NOA" address="0x00009888" value="0x020DA000" />
+        <register type="NOA" address="0x00009888" value="0x040DA000" />
+        <register type="NOA" address="0x00009888" value="0x060D2000" />
+        <register type="NOA" address="0x00009888" value="0x100F0150" />
+        <register type="NOA" address="0x00009888" value="0x0C0F5000" />
+        <register type="NOA" address="0x00009888" value="0x0E0F006D" />
+        <register type="NOA" address="0x00009888" value="0x182C00A8" />
+        <register type="NOA" address="0x00009888" value="0x022C8000" />
+        <register type="NOA" address="0x00009888" value="0x042C8000" />
+        <register type="NOA" address="0x00009888" value="0x062C8000" />
+        <register type="NOA" address="0x00009888" value="0x0C2C8000" />
+        <register type="NOA" address="0x00009888" value="0x042D8000" />
+        <register type="NOA" address="0x00009888" value="0x06104000" />
+        <register type="NOA" address="0x00009888" value="0x06114000" />
+        <register type="NOA" address="0x00009888" value="0x06120033" />
+        <register type="NOA" address="0x00009888" value="0x00120000" />
+        <register type="NOA" address="0x00009888" value="0x06131000" />
+        <register type="NOA" address="0x00009888" value="0x04308000" />
+        <register type="NOA" address="0x00009888" value="0x04318000" />
+        <register type="NOA" address="0x00009888" value="0x04321980" />
+        <register type="NOA" address="0x00009888" value="0x00320000" />
+        <register type="NOA" address="0x00009888" value="0x04334000" />
+        <register type="NOA" address="0x00009888" value="0x04504000" />
+        <register type="NOA" address="0x00009888" value="0x04514000" />
+        <register type="NOA" address="0x00009888" value="0x04520033" />
+        <register type="NOA" address="0x00009888" value="0x00520000" />
+        <register type="NOA" address="0x00009888" value="0x04531000" />
+        <register type="NOA" address="0x00009888" value="0x1ACEF000" />
+        <register type="NOA" address="0x00009888" value="0x1CCE0003" />
+        <register type="NOA" address="0x00009888" value="0x00AF8000" />
+        <register type="NOA" address="0x00009888" value="0x0CCC02A0" />
+        <register type="NOA" address="0x00009888" value="0x0ACC0001" />
+        <register type="NOA" address="0x00009888" value="0x0C8D8000" />
+        <register type="NOA" address="0x00009888" value="0x0E8DA000" />
+        <register type="NOA" address="0x00009888" value="0x008D8000" />
+        <register type="NOA" address="0x00009888" value="0x028DA000" />
+        <register type="NOA" address="0x00009888" value="0x108F0150" />
+        <register type="NOA" address="0x00009888" value="0x0C8FB000" />
+        <register type="NOA" address="0x00009888" value="0x0E8F0001" />
+        <register type="NOA" address="0x00009888" value="0x18AC00A8" />
+        <register type="NOA" address="0x00009888" value="0x06AC8000" />
+        <register type="NOA" address="0x00009888" value="0x02AD4000" />
+        <register type="NOA" address="0x00009888" value="0x02908000" />
+        <register type="NOA" address="0x00009888" value="0x02918000" />
+        <register type="NOA" address="0x00009888" value="0x02921980" />
+        <register type="NOA" address="0x00009888" value="0x00920000" />
+        <register type="NOA" address="0x00009888" value="0x02934000" />
+        <register type="NOA" address="0x00009888" value="0x02B04000" />
+        <register type="NOA" address="0x00009888" value="0x02B14000" />
+        <register type="NOA" address="0x00009888" value="0x02B20033" />
+        <register type="NOA" address="0x00009888" value="0x00B20000" />
+        <register type="NOA" address="0x00009888" value="0x02B31000" />
+        <register type="NOA" address="0x00009888" value="0x00D08000" />
+        <register type="NOA" address="0x00009888" value="0x00D18000" />
+        <register type="NOA" address="0x00009888" value="0x00D21980" />
+        <register type="NOA" address="0x00009888" value="0x00D34000" />
+        <register type="NOA" address="0x00009888" value="0x072F8000" />
+        <register type="NOA" address="0x00009888" value="0x0D4C0100" />
+        <register type="NOA" address="0x00009888" value="0x0D0D8000" />
+        <register type="NOA" address="0x00009888" value="0x0F0DA000" />
+        <register type="NOA" address="0x00009888" value="0x110F01B0" />
+        <register type="NOA" address="0x00009888" value="0x192C0080" />
+        <register type="NOA" address="0x00009888" value="0x0F2D4000" />
+        <register type="NOA" address="0x00009888" value="0x0F108000" />
+        <register type="NOA" address="0x00009888" value="0x0F118000" />
+        <register type="NOA" address="0x00009888" value="0x0F121980" />
+        <register type="NOA" address="0x00009888" value="0x01120000" />
+        <register type="NOA" address="0x00009888" value="0x0F134000" />
+        <register type="NOA" address="0x00009888" value="0x0F304000" />
+        <register type="NOA" address="0x00009888" value="0x0F314000" />
+        <register type="NOA" address="0x00009888" value="0x0F320033" />
+        <register type="NOA" address="0x00009888" value="0x01320000" />
+        <register type="NOA" address="0x00009888" value="0x0F331000" />
+        <register type="NOA" address="0x00009888" value="0x0D508000" />
+        <register type="NOA" address="0x00009888" value="0x0D518000" />
+        <register type="NOA" address="0x00009888" value="0x0D521980" />
+        <register type="NOA" address="0x00009888" value="0x01520000" />
+        <register type="NOA" address="0x00009888" value="0x0D534000" />
+        <register type="NOA" address="0x00009888" value="0x1190FF80" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900C00" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+        <register type="NOA" address="0x00009888" value="0x4B900002" />
+        <register type="NOA" address="0x00009888" value="0x59900000" />
+        <register type="NOA" address="0x00009888" value="0x51901100" />
+        <register type="NOA" address="0x00009888" value="0x41901000" />
+        <register type="NOA" address="0x00009888" value="0x43901423" />
+        <register type="NOA" address="0x00009888" value="0x53903331" />
+        <register type="NOA" address="0x00009888" value="0x45900044" />
+    </register_config>
+  </set>
+
+  <set name="Media Vme Pipe Gen9"
+       mdapi_supported_apis="MEDIA IO BB"
+       underscore_name="vme_pipe"
+       hw_config_guid="0e8d8b86-4ee7-4cdd-aaaa-58adc92cb29e"
+       chipset="SKLGT4"
+       symbol_name="VMEPipe"
+       >
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="EU Active"
+             description="The percentage of time in which the Execution Units were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 7 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_active"
+             units="percent"
+             symbol_name="EuActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Both FPU Pipes Active"
+             description="The percentage of time in which both EU FPU pipelines were actively processing."
+             data_type="float"
+             max_equation="100"
+             equation="A 9 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_fpu_both_active"
+             units="percent"
+             symbol_name="EuFpuBothActive"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Overview System Frame Batch Draw"
+             mdapi_group="EU Array/Pipes"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="CS Threads Dispatched"
+             description="The total number of compute shader hardware threads dispatched."
+             data_type="uint64"
+             equation="A 4 READ"
+             underscore_name="cs_threads"
+             units="threads"
+             symbol_name="CsThreads"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier3 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="EU Array/Compute Shader"
+             />
+    <counter name="EU Thread Occupancy"
+             description="The percentage of time in which hardware threads occupied EUs."
+             data_type="float"
+             max_equation="100"
+             equation="8 A 10 READ FMUL $EuThreadsCount FDIV $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_thread_occupancy"
+             units="percent"
+             symbol_name="EuThreadOccupancy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="EU Stall"
+             description="The percentage of time in which the Execution Units were stalled."
+             data_type="float"
+             max_equation="100"
+             equation="A 8 READ $EuCoresTotalCount UDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="eu_stall"
+             units="percent"
+             symbol_name="EuStall"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier2 Overview System Frame Batch Draw"
+             mdapi_group="EU Array"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="VME Busy"
+             description="The percentage of time in which VME (IME or CRE) was actively processing data."
+             data_type="float"
+             max_equation="100"
+             equation="B 0 READ B 3 READ FADD 2 FDIV 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="vme_busy"
+             units="percent"
+             symbol_name="VMEBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Overview System Batch Tier2"
+             mdapi_group="VME Pipe"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="GPU Busy"
+             description="The percentage of time in which the GPU has been processing GPU commands."
+             data_type="float"
+             max_equation="100"
+             equation="A 0 READ 100 UMUL $GpuCoreClocks FDIV"
+             underscore_name="gpu_busy"
+             units="percent"
+             symbol_name="GpuBusy"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x141A5800" />
+        <register type="NOA" address="0x00009888" value="0x161A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12180240" />
+        <register type="NOA" address="0x00009888" value="0x14180002" />
+        <register type="NOA" address="0x00009888" value="0x149A5800" />
+        <register type="NOA" address="0x00009888" value="0x169A00C0" />
+        <register type="NOA" address="0x00009888" value="0x12980240" />
+        <register type="NOA" address="0x00009888" value="0x14980002" />
+        <register type="NOA" address="0x00009888" value="0x1A4E3FC0" />
+        <register type="NOA" address="0x00009888" value="0x002F1000" />
+        <register type="NOA" address="0x00009888" value="0x022F8000" />
+        <register type="NOA" address="0x00009888" value="0x042F3000" />
+        <register type="NOA" address="0x00009888" value="0x004C4000" />
+        <register type="NOA" address="0x00009888" value="0x0A4C9500" />
+        <register type="NOA" address="0x00009888" value="0x0C4C002A" />
+        <register type="NOA" address="0x00009888" value="0x000D2000" />
+        <register type="NOA" address="0x00009888" value="0x060D8000" />
+        <register type="NOA" address="0x00009888" value="0x080DA000" />
+        <register type="NOA" address="0x00009888" value="0x0A0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C0F0400" />
+        <register type="NOA" address="0x00009888" value="0x0E0F5500" />
+        <register type="NOA" address="0x00009888" value="0x100F0015" />
+        <register type="NOA" address="0x00009888" value="0x002C8000" />
+        <register type="NOA" address="0x00009888" value="0x0E2C8000" />
+        <register type="NOA" address="0x00009888" value="0x162CAA00" />
+        <register type="NOA" address="0x00009888" value="0x182C000A" />
+        <register type="NOA" address="0x00009888" value="0x04193000" />
+        <register type="NOA" address="0x00009888" value="0x081A28C1" />
+        <register type="NOA" address="0x00009888" value="0x001A0000" />
+        <register type="NOA" address="0x00009888" value="0x00133000" />
+        <register type="NOA" address="0x00009888" value="0x0613C000" />
+        <register type="NOA" address="0x00009888" value="0x0813F000" />
+        <register type="NOA" address="0x00009888" value="0x00172000" />
+        <register type="NOA" address="0x00009888" value="0x06178000" />
+        <register type="NOA" address="0x00009888" value="0x0817A000" />
+        <register type="NOA" address="0x00009888" value="0x00180037" />
+        <register type="NOA" address="0x00009888" value="0x06180940" />
+        <register type="NOA" address="0x00009888" value="0x08180000" />
+        <register type="NOA" address="0x00009888" value="0x02180000" />
+        <register type="NOA" address="0x00009888" value="0x04183000" />
+        <register type="NOA" address="0x00009888" value="0x04AFC000" />
+        <register type="NOA" address="0x00009888" value="0x06AF3000" />
+        <register type="NOA" address="0x00009888" value="0x0ACC4000" />
+        <register type="NOA" address="0x00009888" value="0x0CCC0015" />
+        <register type="NOA" address="0x00009888" value="0x0A8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0C8DA000" />
+        <register type="NOA" address="0x00009888" value="0x0E8F4000" />
+        <register type="NOA" address="0x00009888" value="0x108F0015" />
+        <register type="NOA" address="0x00009888" value="0x16ACA000" />
+        <register type="NOA" address="0x00009888" value="0x18AC000A" />
+        <register type="NOA" address="0x00009888" value="0x06993000" />
+        <register type="NOA" address="0x00009888" value="0x0C9A28C1" />
+        <register type="NOA" address="0x00009888" value="0x009A0000" />
+        <register type="NOA" address="0x00009888" value="0x0A93F000" />
+        <register type="NOA" address="0x00009888" value="0x0C93F000" />
+        <register type="NOA" address="0x00009888" value="0x0A97A000" />
+        <register type="NOA" address="0x00009888" value="0x0C97A000" />
+        <register type="NOA" address="0x00009888" value="0x0A980977" />
+        <register type="NOA" address="0x00009888" value="0x08980000" />
+        <register type="NOA" address="0x00009888" value="0x04980000" />
+        <register type="NOA" address="0x00009888" value="0x06983000" />
+        <register type="NOA" address="0x00009888" value="0x119000FF" />
+        <register type="NOA" address="0x00009888" value="0x51900010" />
+        <register type="NOA" address="0x00009888" value="0x41900060" />
+        <register type="NOA" address="0x00009888" value="0x55900111" />
+        <register type="NOA" address="0x00009888" value="0x45900C00" />
+        <register type="NOA" address="0x00009888" value="0x47900821" />
+        <register type="NOA" address="0x00009888" value="0x57900000" />
+        <register type="NOA" address="0x00009888" value="0x49900002" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0x30800000" />
+        <register type="OA" address="0x00002770" value="0x00100030" />
+        <register type="OA" address="0x00002774" value="0x0000FFF9" />
+        <register type="OA" address="0x00002778" value="0x00000002" />
+        <register type="OA" address="0x0000277C" value="0x0000FFFC" />
+        <register type="OA" address="0x00002780" value="0x00000002" />
+        <register type="OA" address="0x00002784" value="0x0000FFF3" />
+        <register type="OA" address="0x00002788" value="0x00100180" />
+        <register type="OA" address="0x0000278C" value="0x0000FFCF" />
+        <register type="OA" address="0x00002790" value="0x00000002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00000002" />
+        <register type="OA" address="0x0000279C" value="0x0000FF3F" />
+    </register_config>
+    <register_config type="FLEX">
+        <register type="FLEX" address="0x0000E458" value="0x00005004" />
+        <register type="FLEX" address="0x0000E558" value="0x00008003" />
+    </register_config>
+  </set>
+
+  <set name="MDAPI testing set Gen9"
+       mdapi_supported_apis="OGL OCL IO BB"
+       underscore_name="test_oa"
+       hw_config_guid="882fa433-1f4a-4a67-a962-c741888fe5f5"
+       chipset="SKLGT4"
+       symbol_name="TestOa"
+       >
+    <counter name="TestCounter7"
+             description="HW test counter 7. Factor: 0.666"
+             data_type="uint64"
+             equation="B 7 READ"
+             underscore_name="counter7"
+             units="events"
+             symbol_name="Counter7"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Time Elapsed"
+             description="Time elapsed on the GPU during the measurement."
+             data_type="uint64"
+             equation="GPU_TIME 0 READ 1000000000 UMUL $GpuTimestampFrequency UDIV"
+             underscore_name="gpu_time"
+             units="ns"
+             symbol_name="GpuTime"
+             semantic_type="duration"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="GPU Core Clocks"
+             description="The total number of GPU core clocks elapsed during the measurement."
+             data_type="uint64"
+             equation="GPU_CLOCK 0 READ"
+             underscore_name="gpu_core_clocks"
+             units="cycles"
+             symbol_name="GpuCoreClocks"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="AVG GPU Core Frequency"
+             description="Average GPU Core Frequency in the measurement."
+             data_type="uint64"
+             max_equation="$GpuMaxFrequency"
+             equation="$GpuCoreClocks 1000000000 UMUL $GpuTime UDIV"
+             underscore_name="avg_gpu_core_frequency"
+             units="hz"
+             symbol_name="AvgGpuCoreFrequency"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Tier1 Overview System Frame Batch Draw"
+             mdapi_group="GPU"
+             mdapi_hw_unit_type="gpu"
+             />
+    <counter name="TestCounter8"
+             description="HW test counter 8. Should be equal to 1."
+             data_type="uint64"
+             equation="C 7 READ"
+             underscore_name="counter8"
+             units="events"
+             symbol_name="Counter8"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter4"
+             description="HW test counter 4. Factor: 0.333"
+             data_type="uint64"
+             equation="B 4 READ"
+             underscore_name="counter4"
+             units="events"
+             symbol_name="Counter4"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter5"
+             description="HW test counter 5. Factor: 0.333"
+             data_type="uint64"
+             equation="B 5 READ"
+             underscore_name="counter5"
+             units="events"
+             symbol_name="Counter5"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter6"
+             description="HW test counter 6. Factor: 0.166"
+             data_type="uint64"
+             equation="B 6 READ"
+             underscore_name="counter6"
+             units="events"
+             symbol_name="Counter6"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter3"
+             description="HW test counter 3. Factor: 0.5"
+             data_type="uint64"
+             equation="B 3 READ"
+             underscore_name="counter3"
+             units="events"
+             symbol_name="Counter3"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter0"
+             description="HW test counter 0. Factor: 0.0"
+             data_type="uint64"
+             equation="B 0 READ"
+             underscore_name="counter0"
+             units="events"
+             symbol_name="Counter0"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter1"
+             description="HW test counter 1. Factor: 1.0"
+             data_type="uint64"
+             equation="B 1 READ"
+             underscore_name="counter1"
+             units="events"
+             symbol_name="Counter1"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <counter name="TestCounter2"
+             description="HW test counter 2. Factor: 1.0"
+             data_type="uint64"
+             equation="B 2 READ"
+             underscore_name="counter2"
+             units="events"
+             symbol_name="Counter2"
+             semantic_type="event"
+             mdapi_supported_apis=""
+             mdapi_usage_flags="Frame Batch Draw"
+             mdapi_hw_unit_type="gpu"
+             mdapi_group="GPU"
+             />
+    <register_config type="NOA">
+        <register type="NOA" address="0x00009888" value="0x11810000" />
+        <register type="NOA" address="0x00009888" value="0x07810013" />
+        <register type="NOA" address="0x00009888" value="0x1F810000" />
+        <register type="NOA" address="0x00009888" value="0x1D810000" />
+        <register type="NOA" address="0x00009888" value="0x1B930040" />
+        <register type="NOA" address="0x00009888" value="0x07E54000" />
+        <register type="NOA" address="0x00009888" value="0x1F908000" />
+        <register type="NOA" address="0x00009888" value="0x11900000" />
+        <register type="NOA" address="0x00009888" value="0x37900000" />
+        <register type="NOA" address="0x00009888" value="0x53900000" />
+        <register type="NOA" address="0x00009888" value="0x45900000" />
+        <register type="NOA" address="0x00009888" value="0x33900000" />
+    </register_config>
+    <register_config type="OA">
+        <register type="OA" address="0x00002740" value="0x00000000" />
+        <register type="OA" address="0x00002744" value="0x00800000" />
+        <register type="OA" address="0x00002714" value="0xF0800000" />
+        <register type="OA" address="0x00002710" value="0x00000000" />
+        <register type="OA" address="0x00002724" value="0xF0800000" />
+        <register type="OA" address="0x00002720" value="0x00000000" />
+        <register type="OA" address="0x00002770" value="0x00000004" />
+        <register type="OA" address="0x00002774" value="0x00000000" />
+        <register type="OA" address="0x00002778" value="0x00000003" />
+        <register type="OA" address="0x0000277C" value="0x00000000" />
+        <register type="OA" address="0x00002780" value="0x00000007" />
+        <register type="OA" address="0x00002784" value="0x00000000" />
+        <register type="OA" address="0x00002788" value="0x00100002" />
+        <register type="OA" address="0x0000278C" value="0x0000FFF7" />
+        <register type="OA" address="0x00002790" value="0x00100002" />
+        <register type="OA" address="0x00002794" value="0x0000FFCF" />
+        <register type="OA" address="0x00002798" value="0x00100082" />
+        <register type="OA" address="0x0000279C" value="0x0000FFEF" />
+        <register type="OA" address="0x000027A0" value="0x001000C2" />
+        <register type="OA" address="0x000027A4" value="0x0000FFE7" />
+        <register type="OA" address="0x000027A8" value="0x00100001" />
+        <register type="OA" address="0x000027AC" value="0x0000FFE7" />
+    </register_config>
+  </set>
+
+</metrics>
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index 95f112e..d8680b4 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -72,16 +72,36 @@
 #include "brw_defines.h"
 #include "brw_performance_query.h"
 #include "brw_oa_hsw.h"
+#include "brw_oa_bdw.h"
+#include "brw_oa_chv.h"
+#include "brw_oa_sklgt2.h"
+#include "brw_oa_sklgt3.h"
+#include "brw_oa_sklgt4.h"
+#include "brw_oa_bxt.h"
+#include "brw_oa_kblgt2.h"
+#include "brw_oa_kblgt3.h"
+#include "brw_oa_glk.h"
 #include "intel_batchbuffer.h"
 
 #define FILE_DEBUG_FLAG DEBUG_PERFMON
 
 /*
- * The largest OA format we can use on Haswell includes:
- * 1 timestamp, 45 A counters, 8 B counters and 8 C counters.
+ * The largest OA formats we can use include:
+ * For Haswell:
+ *   1 timestamp, 45 A counters, 8 B counters and 8 C counters.
+ * For Gen8+
+ *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
  */
 #define MAX_OA_REPORT_COUNTERS 62
 
+#define OAREPORT_REASON_MASK           0x3f
+#define OAREPORT_REASON_SHIFT          19
+#define OAREPORT_REASON_TIMER          (1<<0)
+#define OAREPORT_REASON_TRIGGER1       (1<<1)
+#define OAREPORT_REASON_TRIGGER2       (1<<2)
+#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
+#define OAREPORT_REASON_GO_TRANSITION  (1<<4)
+
 #define I915_PERF_OA_SAMPLE_SIZE (8 +   /* drm_i915_perf_record_header */ \
                                   256)  /* OA counter report */
 
@@ -202,6 +222,7 @@
    int refcount;
    int len;
    uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
+   uint32_t last_timestamp;
 };
 
 /**
@@ -227,6 +248,11 @@
          struct brw_bo *bo;
 
          /**
+          * Address at which @bo is mapped
+          */
+         void *map;
+
+         /**
           * The MI_REPORT_PERF_COUNT command lets us specify a unique
           * ID that will be reflected in the resulting OA report
           * that's written by the GPU. This is the ID we're expecting
@@ -468,29 +494,6 @@
 }
 
 /**
- * Emit an MI_REPORT_PERF_COUNT command packet.
- *
- * This asks the GPU to write a report of the current OA counter
- * values into @bo at the given offset and containing the given
- * @report_id which we can cross-reference when parsing the report.
- */
-static void
-emit_mi_report_perf_count(struct brw_context *brw,
-                          struct brw_bo *bo,
-                          uint32_t offset_in_bytes,
-                          uint32_t report_id)
-{
-   assert(offset_in_bytes % 64 == 0);
-
-   BEGIN_BATCH(3);
-   OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
-   OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-             offset_in_bytes);
-   OUT_BATCH(report_id);
-   ADVANCE_BATCH();
-}
-
-/**
  * Add a query to the global list of "unaccumulated queries."
  *
  * Queries are tracked here until all the associated OA reports have
@@ -558,9 +561,10 @@
 static uint64_t
 timebase_scale(struct brw_context *brw, uint32_t u32_time_delta)
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    uint64_t tmp = ((uint64_t)u32_time_delta) * 1000000000ull;
 
-   return tmp ? tmp / brw->perfquery.sys_vars.timestamp_frequency : 0;
+   return tmp ? tmp / devinfo->timestamp_frequency : 0;
 }
 
 static void
@@ -571,6 +575,28 @@
    *accumulator += (uint32_t)(*report1 - *report0);
 }
 
+static void
+accumulate_uint40(int a_index,
+                  const uint32_t *report0,
+                  const uint32_t *report1,
+                  uint64_t *accumulator)
+{
+   const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
+   const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
+   uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
+   uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
+   uint64_t value0 = report0[a_index + 4] | high0;
+   uint64_t value1 = report1[a_index + 4] | high1;
+   uint64_t delta;
+
+   if (value0 > value1)
+      delta = (1ULL << 40) + value1 - value0;
+   else
+      delta = value1 - value0;
+
+   *accumulator += delta;
+}
+
 /**
  * Given pointers to starting and ending OA snapshots, add the deltas for each
  * counter to the results.
@@ -583,9 +609,27 @@
 {
    const struct brw_perf_query_info *query = obj->query;
    uint64_t *accumulator = obj->oa.accumulator;
+   int idx = 0;
    int i;
 
    switch (query->oa_format) {
+   case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
+      accumulate_uint32(start + 1, end + 1, accumulator + idx++); /* timestamp */
+      accumulate_uint32(start + 3, end + 3, accumulator + idx++); /* clock */
+
+      /* 32x 40bit A counters... */
+      for (i = 0; i < 32; i++)
+         accumulate_uint40(i, start, end, accumulator + idx++);
+
+      /* 4x 32bit A counters... */
+      for (i = 0; i < 4; i++)
+         accumulate_uint32(start + 36 + i, end + 36 + i, accumulator + idx++);
+
+      /* 8x 32bit B counters + 8x 32bit C counters... */
+      for (i = 0; i < 16; i++)
+         accumulate_uint32(start + 48 + i, end + 48 + i, accumulator + idx++);
+
+      break;
    case I915_OA_FORMAT_A45_B8_C8:
       accumulate_uint32(start + 1, end + 1, accumulator); /* timestamp */
 
@@ -646,11 +690,26 @@
    }
 }
 
-static bool
-read_oa_samples(struct brw_context *brw)
+enum OaReadStatus {
+   OA_READ_STATUS_ERROR,
+   OA_READ_STATUS_UNFINISHED,
+   OA_READ_STATUS_FINISHED,
+};
+
+static enum OaReadStatus
+read_oa_samples_until(struct brw_context *brw,
+                      uint32_t start_timestamp,
+                      uint32_t end_timestamp)
 {
+   struct exec_node *tail_node =
+      exec_list_get_tail(&brw->perfquery.sample_buffers);
+   struct brw_oa_sample_buf *tail_buf =
+      exec_node_data(struct brw_oa_sample_buf, tail_node, link);
+   uint32_t last_timestamp = tail_buf->last_timestamp;
+
    while (1) {
       struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
+      uint32_t offset;
       int len;
 
       while ((len = read(brw->perfquery.oa_stream_fd, buf->buf,
@@ -662,28 +721,94 @@
 
          if (len < 0) {
             if (errno == EAGAIN)
-               return true;
+               return ((last_timestamp - start_timestamp) >=
+                       (end_timestamp - start_timestamp)) ?
+                      OA_READ_STATUS_FINISHED :
+                      OA_READ_STATUS_UNFINISHED;
             else {
                DBG("Error reading i915 perf samples: %m\n");
-               return false;
             }
-         } else {
+         } else
             DBG("Spurious EOF reading i915 perf samples\n");
-            return false;
-         }
+
+         return OA_READ_STATUS_ERROR;
       }
 
       buf->len = len;
       exec_list_push_tail(&brw->perfquery.sample_buffers, &buf->link);
+
+      /* Go through the reports and update the last timestamp. */
+      offset = 0;
+      while (offset < buf->len) {
+         const struct drm_i915_perf_record_header *header =
+            (const struct drm_i915_perf_record_header *) &buf->buf[offset];
+         uint32_t *report = (uint32_t *) (header + 1);
+
+         if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
+            last_timestamp = report[1];
+
+         offset += header->size;
+      }
+
+      buf->last_timestamp = last_timestamp;
    }
 
    unreachable("not reached");
+   return OA_READ_STATUS_ERROR;
+}
+
+/**
+ * Try to read all the reports until either the delimiting timestamp
+ * is reached or an error arises.
+ */
+static bool
+read_oa_samples_for_query(struct brw_context *brw,
+                          struct brw_perf_query_object *obj)
+{
+   uint32_t *start;
+   uint32_t *last;
+   uint32_t *end;
+
+   /* We need the MI_REPORT_PERF_COUNT to land before we can start
+    * accumulating. */
+   assert(!brw_batch_references(&brw->batch, obj->oa.bo) &&
+          !brw_bo_busy(obj->oa.bo));
+
+   /* Map the BO once here and let accumulate_oa_reports() unmap
+    * it. */
+   if (obj->oa.map == NULL)
+      obj->oa.map = brw_bo_map(brw, obj->oa.bo, MAP_READ);
+
+   start = last = obj->oa.map;
+   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
+
+   if (start[0] != obj->oa.begin_report_id) {
+      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
+      return true;
+   }
+   if (end[0] != (obj->oa.begin_report_id + 1)) {
+      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
+      return true;
+   }
+
+   /* Read the reports until the end timestamp. */
+   switch (read_oa_samples_until(brw, start[1], end[1])) {
+   case OA_READ_STATUS_ERROR:
+      /* Fall through and let accumulate_oa_reports() deal with the
+       * error. */
+   case OA_READ_STATUS_FINISHED:
+      return true;
+   case OA_READ_STATUS_UNFINISHED:
+      return false;
+   }
+
+   unreachable("invalid read status");
    return false;
 }
 
 /**
- * Accumulate raw OA counter values based on deltas between pairs
- * of OA reports.
+ * Accumulate raw OA counter values based on deltas between pairs of
+ * OA reports.
  *
  * Accumulation starts from the first report captured via
  * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
@@ -694,30 +819,30 @@
  *
  * These periodic snapshots help to ensure we handle counter overflow
  * correctly by being frequent enough to ensure we don't miss multiple
- * overflows of a counter between snapshots.
+ * overflows of a counter between snapshots. For Gen8+ the i915 perf
+ * snapshots provide the extra context-switch reports that let us
+ * subtract out the progress of counters associated with other
+ * contexts running on the system.
  */
 static void
 accumulate_oa_reports(struct brw_context *brw,
                       struct brw_perf_query_object *obj)
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    struct gl_perf_query_object *o = &obj->base;
-   uint32_t *query_buffer;
    uint32_t *start;
    uint32_t *last;
    uint32_t *end;
    struct exec_node *first_samples_node;
+   bool in_ctx = true;
+   uint32_t ctx_id;
+   int out_duration = 0;
 
    assert(o->Ready);
+   assert(obj->oa.map != NULL);
 
-   /* Collect the latest periodic OA reports from i915 perf */
-   if (!read_oa_samples(brw))
-      goto error;
-
-   brw_bo_map(brw, obj->oa.bo, false);
-   query_buffer = obj->oa.bo->virtual;
-
-   start = last = query_buffer;
-   end = query_buffer + (MI_RPC_BO_END_OFFSET_BYTES / sizeof(uint32_t));
+   start = last = obj->oa.map;
+   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
 
    if (start[0] != obj->oa.begin_report_id) {
       DBG("Spurious start report id=%"PRIu32"\n", start[0]);
@@ -728,6 +853,8 @@
       goto error;
    }
 
+   ctx_id = start[2];
+
    /* See if we have any periodic reports to accumulate too... */
 
    /* N.B. The oa.samples_head was set when the query began and
@@ -757,6 +884,7 @@
          switch (header->type) {
          case DRM_I915_PERF_RECORD_SAMPLE: {
             uint32_t *report = (uint32_t *)(header + 1);
+            bool add = true;
 
             /* Ignore reports that come before the start marker.
              * (Note: takes care to allow overflow of 32bit timestamps)
@@ -770,7 +898,49 @@
             if (timebase_scale(brw, report[1] - end[1]) <= 5000000000)
                goto end;
 
-            add_deltas(brw, obj, last, report);
+            /* For Gen8+ since the counters continue while other
+             * contexts are running we need to discount any unrelated
+             * deltas. The hardware automatically generates a report
+             * on context switch which gives us a new reference point
+             * to continue adding deltas from.
+             *
+             * For Haswell we can rely on the HW to stop the progress
+             * of OA counters while any other context is active.
+             */
+            if (devinfo->gen >= 8) {
+               if (in_ctx && report[2] != ctx_id) {
+                  DBG("i915 perf: Switch AWAY (observed by ID change)\n");
+                  in_ctx = false;
+                  out_duration = 0;
+               } else if (in_ctx == false && report[2] == ctx_id) {
+                  DBG("i915 perf: Switch TO\n");
+                  in_ctx = true;
+
+                  /* From experimentation in IGT, we found that the OA unit
+                   * might label some report as "idle" (using an invalid
+                   * context ID), right after a report for a given context.
+                   * Deltas generated by those reports actually belong to the
+                   * previous context, even though they're not labelled as
+                   * such.
+                   *
+                   * We didn't *really* Switch AWAY in the case that we e.g.
+                   * saw a single periodic report while idle...
+                   */
+                  if (out_duration >= 1)
+                     add = false;
+               } else if (in_ctx) {
+                  assert(report[2] == ctx_id);
+                  DBG("i915 perf: Continuation IN\n");
+               } else {
+                  assert(report[2] != ctx_id);
+                  DBG("i915 perf: Continuation OUT\n");
+                  add = false;
+                  out_duration++;
+               }
+            }
+
+            if (add)
+               add_deltas(brw, obj, last, report);
 
             last = report;
 
@@ -794,6 +964,7 @@
    DBG("Marking %d accumulated - results gathered\n", o->Id);
 
    brw_bo_unmap(obj->oa.bo);
+   obj->oa.map = NULL;
    obj->oa.results_accumulated = true;
    drop_from_unaccumulated_query_list(brw, obj);
    dec_n_oa_users(brw);
@@ -803,6 +974,7 @@
 error:
 
    brw_bo_unmap(obj->oa.bo);
+   obj->oa.map = NULL;
    discard_all_queries(brw);
 }
 
@@ -833,7 +1005,7 @@
                I915_PERF_FLAG_FD_NONBLOCK |
                I915_PERF_FLAG_DISABLED,
       .num_properties = ARRAY_SIZE(properties) / 2,
-      .properties_ptr = (uint64_t)properties
+      .properties_ptr = (uintptr_t) properties,
    };
    int fd = drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
    if (fd == -1) {
@@ -948,21 +1120,60 @@
       /* If the OA counters aren't already on, enable them. */
       if (brw->perfquery.oa_stream_fd == -1) {
          __DRIscreen *screen = brw->screen->driScrnPriv;
-         int period_exponent;
+         const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
-         /* The timestamp for HSW+ increments every 80ns
+         /* The period_exponent gives a sampling period as follows:
+          *   sample_period = timestamp_period * 2^(period_exponent + 1)
           *
-          * The period_exponent gives a sampling period as follows:
-          *   sample_period = 80ns * 2^(period_exponent + 1)
+          * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or
+          * ~83ns (GEN8/9).
           *
-          * The overflow period for Haswell can be calculated as:
+          * The counter overflow period is derived from the EuActive counter
+          * which reads a counter that increments by the number of clock
+          * cycles multiplied by the number of EUs. It can be calculated as:
           *
-          * 2^32 / (n_eus * max_gen_freq * 2)
+          * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
+          *
           * (E.g. 40 EUs @ 1GHz = ~53ms)
           *
-          * We currently sample every 42 milliseconds...
+          * We select a sampling period shorter than that overflow period to
+          * ensure we cannot see more than 1 counter overflow, otherwise we
+          * could lose information.
           */
-         period_exponent = 18;
+
+         int a_counter_in_bits = 32;
+         if (devinfo->gen >= 8)
+            a_counter_in_bits = 40;
+
+         uint64_t overflow_period = pow(2, a_counter_in_bits) /
+            (brw->perfquery.sys_vars.n_eus *
+             /* drop 1GHz freq to have units in nanoseconds */
+             2);
+
+         DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
+             overflow_period, overflow_period / 1000000ul, brw->perfquery.sys_vars.n_eus);
+
+         int period_exponent = 0;
+         uint64_t prev_sample_period, next_sample_period;
+         for (int e = 0; e < 30; e++) {
+            prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
+            next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;
+
+            /* Take the previous sampling period, lower than the overflow
+             * period.
+             */
+            if (prev_sample_period < overflow_period &&
+                next_sample_period > overflow_period)
+               period_exponent = e + 1;
+         }
+
+         if (period_exponent == 0) {
+            DBG("WARNING: enable to find a sampling exponent\n");
+            return false;
+         }
+
+         DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
+             prev_sample_period / 1000000ul);
 
          if (!open_i915_perf_oa_stream(brw,
                                        query->oa_metrics_set_id,
@@ -993,17 +1204,25 @@
                       MI_RPC_BO_SIZE, 64);
 #ifdef DEBUG
       /* Pre-filling the BO helps debug whether writes landed. */
-      brw_bo_map(brw, obj->oa.bo, true);
-      memset((char *) obj->oa.bo->virtual, 0x80, MI_RPC_BO_SIZE);
+      void *map = brw_bo_map(brw, obj->oa.bo, MAP_WRITE);
+      memset(map, 0x80, MI_RPC_BO_SIZE);
       brw_bo_unmap(obj->oa.bo);
 #endif
 
       obj->oa.begin_report_id = brw->perfquery.next_query_start_report_id;
       brw->perfquery.next_query_start_report_id += 2;
 
+      /* We flush the batchbuffer here to minimize the chances that MI_RPC
+       * delimiting commands end up in different batchbuffers. If that's the
+       * case, the measurement will include the time it takes for the kernel
+       * scheduler to load a new request into the hardware. This is manifested in
+       * tools like frameretrace by spikes in the "GPU Core Clocks" counter.
+       */
+      intel_batchbuffer_flush(brw);
+
       /* Take a starting OA counter snapshot. */
-      emit_mi_report_perf_count(brw, obj->oa.bo, 0,
-                                obj->oa.begin_report_id);
+      brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
+                                          obj->oa.begin_report_id);
       ++brw->perfquery.n_active_oa_queries;
 
       /* No already-buffered samples can possibly be associated with this query
@@ -1082,9 +1301,9 @@
        */
       if (!obj->oa.results_accumulated) {
          /* Take an ending OA counter snapshot. */
-         emit_mi_report_perf_count(brw, obj->oa.bo,
-                                   MI_RPC_BO_END_OFFSET_BYTES,
-                                   obj->oa.begin_report_id + 1);
+         brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
+                                             MI_RPC_BO_END_OFFSET_BYTES,
+                                             obj->oa.begin_report_id + 1);
       }
 
       --brw->perfquery.n_active_oa_queries;
@@ -1131,7 +1350,17 @@
    if (brw_batch_references(&brw->batch, bo))
       intel_batchbuffer_flush(brw);
 
-   brw_bo_wait_rendering(brw, bo);
+   brw_bo_wait_rendering(bo);
+
+   /* Due to a race condition between the OA unit signaling report
+    * availability and the report actually being written into memory,
+    * we need to wait for all the reports to come in before we can
+    * read them.
+    */
+   if (obj->query->kind == OA_COUNTERS) {
+      while (!read_oa_samples_for_query(brw, obj))
+         ;
+   }
 }
 
 static bool
@@ -1149,8 +1378,8 @@
       return (obj->oa.results_accumulated ||
               (obj->oa.bo &&
                !brw_batch_references(&brw->batch, obj->oa.bo) &&
-               !brw_bo_busy(obj->oa.bo)));
-
+               !brw_bo_busy(obj->oa.bo) &&
+               read_oa_samples_for_query(brw, obj)));
    case PIPELINE_STATS:
       return (obj->pipeline_stats.bo &&
               !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) &&
@@ -1215,8 +1444,7 @@
    int n_counters = obj->query->n_counters;
    uint8_t *p = data;
 
-   brw_bo_map(brw, obj->pipeline_stats.bo, false);
-   uint64_t *start = obj->pipeline_stats.bo->virtual;
+   uint64_t *start = brw_bo_map(brw, obj->pipeline_stats.bo, MAP_READ);
    uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));
 
    for (int i = 0; i < n_counters; i++) {
@@ -1399,6 +1627,7 @@
 static void
 init_pipeline_statistic_query_registers(struct brw_context *brw)
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    struct brw_perf_query_info *query = append_query_info(brw);
 
    query->kind = PIPELINE_STATS;
@@ -1414,7 +1643,7 @@
    add_basic_stat_reg(query, VS_INVOCATION_COUNT,
                       "N vertex shader invocations");
 
-   if (brw->gen == 6) {
+   if (devinfo->gen == 6) {
       add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
                    "SO_PRIM_STORAGE_NEEDED",
                    "N geometry shader stream-out primitives (total)");
@@ -1463,7 +1692,7 @@
    add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
                       "N primitives leaving clipping");
 
-   if (brw->is_haswell || brw->gen == 8)
+   if (devinfo->is_haswell || devinfo->gen == 8)
       add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
                    "N fragment shader invocations",
                    "N fragment shader invocations");
@@ -1473,7 +1702,7 @@
 
    add_basic_stat_reg(query, PS_DEPTH_COUNT, "N z-pass fragments");
 
-   if (brw->gen >= 7)
+   if (devinfo->gen >= 7)
       add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                          "N compute shader invocations");
 
@@ -1581,6 +1810,7 @@
 static bool
 init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    uint64_t min_freq_mhz = 0, max_freq_mhz = 0;
 
    if (!read_sysfs_drm_device_file_uint64(brw, sysfs_dev_dir,
@@ -1595,30 +1825,82 @@
 
    brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
    brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
+   brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
+   brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
+   /* Assuming uniform distribution of subslices per slice. */
+   brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
 
-   if (brw->is_haswell) {
-      const struct gen_device_info *info = &brw->screen->devinfo;
+   if (devinfo->is_haswell) {
+      brw->perfquery.sys_vars.slice_mask = 0;
+      brw->perfquery.sys_vars.subslice_mask = 0;
 
-      brw->perfquery.sys_vars.timestamp_frequency = 12500000;
+      for (int s = 0; s < devinfo->num_slices; s++)
+         brw->perfquery.sys_vars.slice_mask |= 1U << s;
+      for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
+         brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
 
-      if (info->gt == 1) {
+      if (devinfo->gt == 1) {
          brw->perfquery.sys_vars.n_eus = 10;
-         brw->perfquery.sys_vars.n_eu_slices = 1;
-         brw->perfquery.sys_vars.subslice_mask = 0x1;
-      } else if (info->gt == 2) {
+      } else if (devinfo->gt == 2) {
          brw->perfquery.sys_vars.n_eus = 20;
-         brw->perfquery.sys_vars.n_eu_slices = 1;
-         brw->perfquery.sys_vars.subslice_mask = 0x3;
-      } else if (info->gt == 3) {
+      } else if (devinfo->gt == 3) {
          brw->perfquery.sys_vars.n_eus = 40;
-         brw->perfquery.sys_vars.n_eu_slices = 2;
-         brw->perfquery.sys_vars.subslice_mask = 0xf;
       } else
          unreachable("not reached");
+   } else {
+      __DRIscreen *screen = brw->screen->driScrnPriv;
+      drm_i915_getparam_t gp;
+      int ret;
+      int slice_mask = 0;
+      int ss_mask = 0;
+      /* maximum number of slices */
+      int s_max = devinfo->num_slices;
+      /* maximum number of subslices per slice (assuming uniform subslices per
+       * slice)
+       */
+      int ss_max = devinfo->num_subslices[0];
+      uint64_t subslice_mask = 0;
+      int s;
 
-      return true;
-   } else
-      return false;
+      gp.param = I915_PARAM_SLICE_MASK;
+      gp.value = &slice_mask;
+      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+      if (ret)
+         return false;
+
+      gp.param = I915_PARAM_SUBSLICE_MASK;
+      gp.value = &ss_mask;
+      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+      if (ret)
+         return false;
+
+      brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
+      brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
+      brw->perfquery.sys_vars.slice_mask = slice_mask;
+
+      /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
+       * which applies to all slices.
+       *
+       * Note: some of the metrics we have (as described in XML) are
+       * conditional on a $SubsliceMask variable which is expected to also
+       * reflect the slice mask by packing together subslice masks for each
+       * slice in one value.
+       */
+      for (s = 0; s < s_max; s++) {
+         if (slice_mask & (1<<s)) {
+            subslice_mask |= ss_mask << (ss_max * s);
+         }
+      }
+
+      brw->perfquery.sys_vars.subslice_mask = subslice_mask;
+      brw->perfquery.sys_vars.n_eu_sub_slices =
+         __builtin_popcount(subslice_mask);
+   }
+
+   brw->perfquery.sys_vars.eu_threads_count =
+      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
+
+   return true;
 }
 
 static bool
@@ -1687,23 +1969,77 @@
    return false;
 }
 
+typedef void (*perf_register_oa_queries_t)(struct brw_context *);
+
+static perf_register_oa_queries_t
+get_register_queries_function(const struct gen_device_info *devinfo)
+{
+   if (devinfo->is_haswell)
+      return brw_oa_register_queries_hsw;
+   if (devinfo->is_cherryview)
+      return brw_oa_register_queries_chv;
+   if (devinfo->is_broadwell)
+      return brw_oa_register_queries_bdw;
+   if (devinfo->is_broxton)
+      return brw_oa_register_queries_bxt;
+   if (devinfo->is_skylake) {
+      if (devinfo->gt == 2)
+         return brw_oa_register_queries_sklgt2;
+      if (devinfo->gt == 3)
+         return brw_oa_register_queries_sklgt3;
+      if (devinfo->gt == 4)
+         return brw_oa_register_queries_sklgt4;
+   }
+   if (devinfo->is_kabylake) {
+      if (devinfo->gt == 2)
+         return brw_oa_register_queries_kblgt2;
+      if (devinfo->gt == 3)
+         return brw_oa_register_queries_kblgt3;
+   }
+   if (devinfo->is_geminilake)
+      return brw_oa_register_queries_glk;
+   return NULL;
+}
+
 static unsigned
 brw_init_perf_query_info(struct gl_context *ctx)
 {
    struct brw_context *brw = brw_context(ctx);
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   bool i915_perf_oa_available = false;
    struct stat sb;
    char sysfs_dev_dir[128];
+   perf_register_oa_queries_t oa_register;
 
    if (brw->perfquery.n_queries)
       return brw->perfquery.n_queries;
 
    init_pipeline_statistic_query_registers(brw);
 
+   oa_register = get_register_queries_function(devinfo);
+
    /* The existence of this sysctl parameter implies the kernel supports
     * the i915 perf interface.
     */
-   if (brw->is_haswell &&
-       stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0 &&
+   if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {
+
+      /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
+       * metrics unless running as root.
+       */
+      if (devinfo->is_haswell)
+         i915_perf_oa_available = true;
+      else {
+         uint64_t paranoid = 1;
+
+         read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);
+
+         if (paranoid == 0 || geteuid() == 0)
+            i915_perf_oa_available = true;
+      }
+   }
+
+   if (i915_perf_oa_available &&
+       oa_register &&
        get_sysfs_dev_dir(brw, sysfs_dev_dir, sizeof(sysfs_dev_dir)) &&
        init_oa_sys_vars(brw, sysfs_dev_dir))
    {
@@ -1711,10 +2047,10 @@
          _mesa_hash_table_create(NULL, _mesa_key_hash_string,
                                  _mesa_key_string_equal);
 
-      /* Index all the metric sets mesa knows about before looking to
-       * see what the kernel is advertising.
+      /* Index all the metric sets mesa knows about before looking to see what
+       * the kernel is advertising.
        */
-      brw_oa_register_queries_hsw(brw);
+      oa_register(brw);
 
       enumerate_sysfs_metrics(brw, sysfs_dev_dir);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_primitive_restart.c b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
index e329cc7..3dc221e 100644
--- a/src/mesa/drivers/dri/i965/brw_primitive_restart.c
+++ b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
@@ -52,14 +52,14 @@
 
    bool cut_index_will_work;
 
-   switch (ib->type) {
-   case GL_UNSIGNED_BYTE:
+   switch (ib->index_size) {
+   case 1:
       cut_index_will_work = ctx->Array.RestartIndex == 0xff;
       break;
-   case GL_UNSIGNED_SHORT:
+   case 2:
       cut_index_will_work = ctx->Array.RestartIndex == 0xffff;
       break;
-   case GL_UNSIGNED_INT:
+   case 4:
       cut_index_will_work = ctx->Array.RestartIndex == 0xffffffff;
       break;
    default:
@@ -177,43 +177,3 @@
    /* The primitive restart draw was completed, so return true. */
    return GL_TRUE;
 }
-
-static void
-haswell_upload_cut_index(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-
-   /* Don't trigger on Ivybridge */
-   if (brw->gen < 8 && !brw->is_haswell)
-      return;
-
-   const unsigned cut_index_setting =
-      ctx->Array._PrimitiveRestart ? HSW_CUT_INDEX_ENABLE : 0;
-
-   /* BRW_NEW_INDEX_BUFFER */
-   unsigned cut_index;
-   if (brw->ib.ib) {
-      cut_index = _mesa_primitive_restart_index(ctx, brw->ib.type);
-   } else {
-      /* There's no index buffer, but primitive restart may still apply
-       * to glDrawArrays and such.  FIXED_INDEX mode only applies to drawing
-       * operations that use an index buffer, so we can ignore it and use
-       * the GL restart index directly.
-       */
-      cut_index = ctx->Array.RestartIndex;
-   }
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_VF << 16 | cut_index_setting | (2 - 2));
-   OUT_BATCH(cut_index);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state haswell_cut_index = {
-   .dirty = {
-      .mesa  = _NEW_TRANSFORM,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_INDEX_BUFFER,
-   },
-   .emit = haswell_upload_cut_index,
-};
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 4641cfe..94d8d8b 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -109,14 +109,12 @@
 
    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 
-   /* nir_shader may have been cloned so make sure shader_info is in sync */
-   if (nir->info != &prog->info) {
-      const char *name = prog->info.name;
-      const char *label = prog->info.label;
-      prog->info = *nir->info;
-      prog->info.name = name;
-      prog->info.label = label;
-   }
+   /* Copy the info we just generated back into the gl_program */
+   const char *prog_name = prog->info.name;
+   const char *prog_label = prog->info.label;
+   prog->info = nir->info;
+   prog->info.name = prog_name;
+   prog->info.label = prog_label;
 
    if (shader_prog) {
       NIR_PASS_V(nir, nir_lower_samplers, shader_prog);
@@ -292,7 +290,7 @@
    unsigned bits = (PIPE_CONTROL_DATA_CACHE_FLUSH |
                     PIPE_CONTROL_NO_WRITE |
                     PIPE_CONTROL_CS_STALL);
-   assert(brw->gen >= 7 && brw->gen <= 9);
+   assert(brw->gen >= 7 && brw->gen <= 10);
 
    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                    GL_ELEMENT_ARRAY_BARRIER_BIT |
@@ -580,8 +578,7 @@
     * delaying reading the reports, but it doesn't look like it's a big
     * overhead compared to the cost of tracking the time in the first place.
     */
-   brw_bo_map(brw, brw->shader_time.bo, true);
-   void *bo_map = brw->shader_time.bo->virtual;
+   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
 
    for (int i = 0; i < brw->shader_time.num_entries; i++) {
       uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
@@ -726,7 +723,7 @@
       stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
    }
 
-   if (prog->nir->info->uses_texture_gather) {
+   if (prog->nir->info.uses_texture_gather) {
       if (devinfo->gen >= 8) {
          stage_prog_data->binding_table.gather_texture_start =
             stage_prog_data->binding_table.texture_start;
diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h
index 2e9f121..e62b7d36 100644
--- a/src/mesa/drivers/dri/i965/brw_program.h
+++ b/src/mesa/drivers/dri/i965/brw_program.h
@@ -32,9 +32,6 @@
 
 struct brw_context;
 
-bool brw_do_channel_expressions(struct exec_list *instructions);
-bool brw_do_vector_splitting(struct exec_list *instructions);
-
 struct nir_shader *brw_create_nir(struct brw_context *brw,
                                   const struct gl_shader_program *shader_prog,
                                   struct gl_program *prog,
diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
index b0e2962..4dcfd52 100644
--- a/src/mesa/drivers/dri/i965/brw_program_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
@@ -45,6 +45,8 @@
  */
 
 #include "main/imports.h"
+#include "main/streaming-load-memcpy.h"
+#include "x86/common_x86_asm.h"
 #include "intel_batchbuffer.h"
 #include "brw_state.h"
 #include "brw_wm.h"
@@ -216,26 +218,26 @@
    struct brw_bo *new_bo;
 
    new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
-   if (brw->has_llc)
-      brw_bo_map_unsynchronized(brw, new_bo);
+   if (can_do_exec_capture(brw->screen))
+      new_bo->kflags = EXEC_OBJECT_CAPTURE;
+
+   void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
+                                       MAP_ASYNC | MAP_PERSISTENT);
 
    /* Copy any existing data that needs to be saved. */
    if (cache->next_offset != 0) {
-      if (brw->has_llc) {
-         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
-      } else {
-         brw_bo_map(brw, cache->bo, false);
-         brw_bo_subdata(new_bo, 0, cache->next_offset,
-                              cache->bo->virtual);
-         brw_bo_unmap(cache->bo);
-      }
+#ifdef USE_SSE41
+      if (!cache->bo->cache_coherent && cpu_has_sse4_1)
+         _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
+      else
+#endif
+         memcpy(map, cache->map, cache->next_offset);
    }
 
-   if (brw->has_llc)
-      brw_bo_unmap(cache->bo);
+   brw_bo_unmap(cache->bo);
    brw_bo_unreference(cache->bo);
    cache->bo = new_bo;
-   cache->bo_used_by_gpu = false;
+   cache->map = map;
 
    /* Since we have a new BO in place, we need to signal the units
     * that depend on it (state base address on gen5+, or unit state before).
@@ -252,23 +254,13 @@
                 enum brw_cache_id cache_id,
                 const void *data, unsigned data_size)
 {
-   struct brw_context *brw = cache->brw;
    unsigned i;
    const struct brw_cache_item *item;
 
    for (i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
-         int ret;
-
-         if (item->cache_id != cache_id || item->size != data_size)
-            continue;
-
-         if (!brw->has_llc)
-            brw_bo_map(brw, cache->bo, false);
-         ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
-         if (!brw->has_llc)
-            brw_bo_unmap(cache->bo);
-         if (ret)
+         if (item->cache_id != cache_id || item->size != data_size ||
+             memcmp(cache->map + item->offset, data, item->size) != 0)
             continue;
 
          return item;
@@ -282,7 +274,6 @@
 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
 {
    uint32_t offset;
-   struct brw_context *brw = cache->brw;
 
    /* Allocate space in the cache BO for our new program. */
    if (cache->next_offset + size > cache->bo->size) {
@@ -294,14 +285,6 @@
       brw_cache_new_bo(cache, new_size);
    }
 
-   /* If we would block on writing to an in-use program BO, just
-    * recreate it.
-    */
-   if (!brw->has_llc && cache->bo_used_by_gpu) {
-      perf_debug("Copying busy program cache buffer.\n");
-      brw_cache_new_bo(cache, cache->bo->size);
-   }
-
    offset = cache->next_offset;
 
    /* Programs are always 64-byte aligned, so set up the next one now */
@@ -339,7 +322,6 @@
                  uint32_t *out_offset,
                  void *out_aux)
 {
-   struct brw_context *brw = cache->brw;
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    const struct brw_cache_item *matching_data =
       brw_lookup_prog(cache, cache_id, data, data_size);
@@ -366,11 +348,7 @@
       item->offset = brw_alloc_item_data(cache, data_size);
 
       /* Copy data to the buffer */
-      if (brw->has_llc) {
-         memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
-      } else {
-         brw_bo_subdata(cache->bo, item->offset, data_size, data);
-      }
+      memcpy(cache->map + item->offset, data, data_size);
    }
 
    /* Set up the memory containing the key and aux_data */
@@ -407,8 +385,11 @@
       calloc(cache->size, sizeof(struct brw_cache_item *));
 
    cache->bo = brw_bo_alloc(brw->bufmgr, "program cache",  4096, 64);
-   if (brw->has_llc)
-      brw_bo_map_unsynchronized(brw, cache->bo);
+   if (can_do_exec_capture(brw->screen))
+      cache->bo->kflags = EXEC_OBJECT_CAPTURE;
+
+   cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
+                                           MAP_ASYNC | MAP_PERSISTENT);
 }
 
 static void
@@ -487,10 +468,10 @@
 
    /* This can be NULL if context creation failed early on */
    if (cache->bo) {
-      if (brw->has_llc)
-         brw_bo_unmap(cache->bo);
+      brw_bo_unmap(cache->bo);
       brw_bo_unreference(cache->bo);
       cache->bo = NULL;
+      cache->map = NULL;
    }
    brw_clear_cache(brw, cache);
    free(cache->items);
@@ -538,17 +519,11 @@
    const struct brw_cache *cache = &brw->cache;
    struct brw_cache_item *item;
 
-   if (!brw->has_llc)
-      brw_bo_map(brw, cache->bo, false);
-
    for (unsigned i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
          fprintf(stderr, "%s:\n", cache_name(i));
-         brw_disassemble(&brw->screen->devinfo, cache->bo->virtual,
+         brw_disassemble(&brw->screen->devinfo, cache->map,
                          item->offset, item->size, stderr);
       }
    }
-
-   if (!brw->has_llc)
-      brw_bo_unmap(cache->bo);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index caf3412..04ce9a9 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -47,7 +47,7 @@
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
-   return (double)gpu_timestamp * devinfo->timebase_scale;
+   return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
 }
 
 /* As best we know currently, the Gen HW timestamps are 36bits across
@@ -111,6 +111,14 @@
    if (brw->gen == 9 && brw->gt == 4)
       flags |= PIPE_CONTROL_CS_STALL;
 
+   if (brw->gen >= 10) {
+      /* "Driver must program PIPE_CONTROL with only Depth Stall Enable bit set
+       * prior to programming a PIPE_CONTROL with Write PS Depth Count Post sync
+       * operation."
+       */
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
+   }
+
    brw_emit_pipe_control_write(brw, flags,
                                query_bo, idx * sizeof(uint64_t), 0);
 }
@@ -145,8 +153,7 @@
       }
    }
 
-   brw_bo_map(brw, query->bo, false);
-   results = query->bo->virtual;
+   results = brw_bo_map(brw, query->bo, MAP_READ);
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED_EXT:
       /* The query BO contains the starting and ending timestamps.
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
deleted file mode 100644
index 26bf0cb..0000000
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ /dev/null
@@ -1,752 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
-/**
- * @file brw_sampler_state.c
- *
- * This file contains code for emitting SAMPLER_STATE structures, which
- * specifies filter modes, wrap modes, border color, and so on.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "intel_mipmap_tree.h"
-
-#include "main/macros.h"
-#include "main/samplerobj.h"
-#include "util/half_float.h"
-
-/**
- * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
- */
-static void
-gen7_emit_sampler_state_pointers_xs(struct brw_context *brw,
-                                    struct brw_stage_state *stage_state)
-{
-   static const uint16_t packet_headers[] = {
-      [MESA_SHADER_VERTEX] = _3DSTATE_SAMPLER_STATE_POINTERS_VS,
-      [MESA_SHADER_TESS_CTRL] = _3DSTATE_SAMPLER_STATE_POINTERS_HS,
-      [MESA_SHADER_TESS_EVAL] = _3DSTATE_SAMPLER_STATE_POINTERS_DS,
-      [MESA_SHADER_GEOMETRY] = _3DSTATE_SAMPLER_STATE_POINTERS_GS,
-      [MESA_SHADER_FRAGMENT] = _3DSTATE_SAMPLER_STATE_POINTERS_PS,
-   };
-
-   /* Ivybridge requires a workaround flush before VS packets. */
-   if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail &&
-       stage_state->stage == MESA_SHADER_VERTEX) {
-      gen7_emit_vs_workaround_flush(brw);
-   }
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(packet_headers[stage_state->stage] << 16 | (2 - 2));
-   OUT_BATCH(stage_state->sampler_offset);
-   ADVANCE_BATCH();
-}
-
-/**
- * Emit a SAMPLER_STATE structure, given all the fields.
- */
-void
-brw_emit_sampler_state(struct brw_context *brw,
-                       uint32_t *ss,
-                       uint32_t batch_offset_for_sampler_state,
-                       unsigned min_filter,
-                       unsigned mag_filter,
-                       unsigned mip_filter,
-                       unsigned max_anisotropy,
-                       unsigned address_rounding,
-                       unsigned wrap_s,
-                       unsigned wrap_t,
-                       unsigned wrap_r,
-                       unsigned base_level,
-                       unsigned min_lod,
-                       unsigned max_lod,
-                       int lod_bias,
-                       unsigned shadow_function,
-                       bool non_normalized_coordinates,
-                       uint32_t border_color_offset)
-{
-   ss[0] = BRW_SAMPLER_LOD_PRECLAMP_ENABLE |
-           SET_FIELD(mip_filter, BRW_SAMPLER_MIP_FILTER) |
-           SET_FIELD(mag_filter, BRW_SAMPLER_MAG_FILTER) |
-           SET_FIELD(min_filter, BRW_SAMPLER_MIN_FILTER);
-
-   ss[2] = border_color_offset;
-   if (brw->gen < 6) {
-      ss[2] += brw->batch.bo->offset64; /* reloc */
-      brw_emit_reloc(&brw->batch, batch_offset_for_sampler_state + 8,
-                     brw->batch.bo, border_color_offset,
-                     I915_GEM_DOMAIN_SAMPLER, 0);
-   }
-
-   ss[3] = SET_FIELD(max_anisotropy, BRW_SAMPLER_MAX_ANISOTROPY) |
-           SET_FIELD(address_rounding, BRW_SAMPLER_ADDRESS_ROUNDING);
-
-   if (brw->gen >= 7) {
-      ss[0] |= SET_FIELD(lod_bias & 0x1fff, GEN7_SAMPLER_LOD_BIAS);
-
-      if (min_filter == BRW_MAPFILTER_ANISOTROPIC)
-         ss[0] |= GEN7_SAMPLER_EWA_ANISOTROPIC_ALGORITHM;
-
-      ss[1] = SET_FIELD(min_lod, GEN7_SAMPLER_MIN_LOD) |
-              SET_FIELD(max_lod, GEN7_SAMPLER_MAX_LOD) |
-              SET_FIELD(shadow_function, GEN7_SAMPLER_SHADOW_FUNCTION);
-
-      ss[3] |= SET_FIELD(wrap_s, BRW_SAMPLER_TCX_WRAP_MODE) |
-               SET_FIELD(wrap_t, BRW_SAMPLER_TCY_WRAP_MODE) |
-               SET_FIELD(wrap_r, BRW_SAMPLER_TCZ_WRAP_MODE);
-
-      if (non_normalized_coordinates)
-         ss[3] |= GEN7_SAMPLER_NON_NORMALIZED_COORDINATES;
-   } else {
-      ss[0] |= SET_FIELD(lod_bias & 0x7ff, GEN4_SAMPLER_LOD_BIAS) |
-               SET_FIELD(shadow_function, GEN4_SAMPLER_SHADOW_FUNCTION);
-
-      /* This field has existed since the original i965, but is declared MBZ
-       * until Sandy Bridge.  According to the PRM:
-       *
-       *    "This was added to match OpenGL semantics"
-       *
-       * In particular, OpenGL allowed you to offset by 0.5 in certain cases
-       * to get slightly better filtering.  On Ivy Bridge and above, it
-       * appears that this is added to RENDER_SURFACE_STATE::SurfaceMinLOD so
-       * the right value is 0.0 or 0.5 (if you want the wacky behavior).  On
-       * Sandy Bridge, however, this sum does not seem to occur and you have
-       * to set it to the actual base level of the texture.
-       */
-      if (brw->gen == 6)
-         ss[0] |= SET_FIELD(base_level, BRW_SAMPLER_BASE_MIPLEVEL);
-
-      if (brw->gen == 6 && min_filter != mag_filter)
-         ss[0] |= GEN6_SAMPLER_MIN_MAG_NOT_EQUAL;
-
-      ss[1] = SET_FIELD(min_lod, GEN4_SAMPLER_MIN_LOD) |
-              SET_FIELD(max_lod, GEN4_SAMPLER_MAX_LOD) |
-              SET_FIELD(wrap_s, BRW_SAMPLER_TCX_WRAP_MODE) |
-              SET_FIELD(wrap_t, BRW_SAMPLER_TCY_WRAP_MODE) |
-              SET_FIELD(wrap_r, BRW_SAMPLER_TCZ_WRAP_MODE);
-
-      if (brw->gen >= 6 && non_normalized_coordinates)
-         ss[3] |= GEN6_SAMPLER_NON_NORMALIZED_COORDINATES;
-   }
-}
-
-static uint32_t
-translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
-{
-   switch (wrap) {
-   case GL_REPEAT:
-      return BRW_TEXCOORDMODE_WRAP;
-   case GL_CLAMP:
-      /* GL_CLAMP is the weird mode where coordinates are clamped to
-       * [0.0, 1.0], so linear filtering of coordinates outside of
-       * [0.0, 1.0] give you half edge texel value and half border
-       * color.
-       *
-       * Gen8+ supports this natively.
-       */
-      if (brw->gen >= 8)
-         return GEN8_TEXCOORDMODE_HALF_BORDER;
-
-      /* On Gen4-7.5, we clamp the coordinates in the fragment shader
-       * and set clamp_border here, which gets the result desired.
-       * We just use clamp(_to_edge) for nearest, because for nearest
-       * clamping to 1.0 gives border color instead of the desired
-       * edge texels.
-       */
-      if (using_nearest)
-	 return BRW_TEXCOORDMODE_CLAMP;
-      else
-	 return BRW_TEXCOORDMODE_CLAMP_BORDER;
-   case GL_CLAMP_TO_EDGE:
-      return BRW_TEXCOORDMODE_CLAMP;
-   case GL_CLAMP_TO_BORDER:
-      return BRW_TEXCOORDMODE_CLAMP_BORDER;
-   case GL_MIRRORED_REPEAT:
-      return BRW_TEXCOORDMODE_MIRROR;
-   case GL_MIRROR_CLAMP_TO_EDGE:
-      return BRW_TEXCOORDMODE_MIRROR_ONCE;
-   default:
-      return BRW_TEXCOORDMODE_WRAP;
-   }
-}
-
-/**
- * Return true if the given wrap mode requires the border color to exist.
- */
-static bool
-wrap_mode_needs_border_color(unsigned wrap_mode)
-{
-   return wrap_mode == BRW_TEXCOORDMODE_CLAMP_BORDER ||
-          wrap_mode == GEN8_TEXCOORDMODE_HALF_BORDER;
-}
-
-static bool
-has_component(mesa_format format, int i)
-{
-   if (_mesa_is_format_color_format(format))
-      return _mesa_format_has_color_component(format, i);
-
-   /* depth and stencil have only one component */
-   return i == 0;
-}
-
-/**
- * Upload SAMPLER_BORDER_COLOR_STATE.
- */
-static void
-upload_default_color(struct brw_context *brw,
-                     const struct gl_sampler_object *sampler,
-                     mesa_format format, GLenum base_format,
-                     bool is_integer_format, bool is_stencil_sampling,
-                     uint32_t *sdc_offset)
-{
-   union gl_color_union color;
-
-   switch (base_format) {
-   case GL_DEPTH_COMPONENT:
-      /* GL specs that border color for depth textures is taken from the
-       * R channel, while the hardware uses A.  Spam R into all the
-       * channels for safety.
-       */
-      color.ui[0] = sampler->BorderColor.ui[0];
-      color.ui[1] = sampler->BorderColor.ui[0];
-      color.ui[2] = sampler->BorderColor.ui[0];
-      color.ui[3] = sampler->BorderColor.ui[0];
-      break;
-   case GL_ALPHA:
-      color.ui[0] = 0u;
-      color.ui[1] = 0u;
-      color.ui[2] = 0u;
-      color.ui[3] = sampler->BorderColor.ui[3];
-      break;
-   case GL_INTENSITY:
-      color.ui[0] = sampler->BorderColor.ui[0];
-      color.ui[1] = sampler->BorderColor.ui[0];
-      color.ui[2] = sampler->BorderColor.ui[0];
-      color.ui[3] = sampler->BorderColor.ui[0];
-      break;
-   case GL_LUMINANCE:
-      color.ui[0] = sampler->BorderColor.ui[0];
-      color.ui[1] = sampler->BorderColor.ui[0];
-      color.ui[2] = sampler->BorderColor.ui[0];
-      color.ui[3] = float_as_int(1.0);
-      break;
-   case GL_LUMINANCE_ALPHA:
-      color.ui[0] = sampler->BorderColor.ui[0];
-      color.ui[1] = sampler->BorderColor.ui[0];
-      color.ui[2] = sampler->BorderColor.ui[0];
-      color.ui[3] = sampler->BorderColor.ui[3];
-      break;
-   default:
-      color.ui[0] = sampler->BorderColor.ui[0];
-      color.ui[1] = sampler->BorderColor.ui[1];
-      color.ui[2] = sampler->BorderColor.ui[2];
-      color.ui[3] = sampler->BorderColor.ui[3];
-      break;
-   }
-
-   /* In some cases we use an RGBA surface format for GL RGB textures,
-    * where we've initialized the A channel to 1.0.  We also have to set
-    * the border color alpha to 1.0 in that case.
-    */
-   if (base_format == GL_RGB)
-      color.ui[3] = float_as_int(1.0);
-
-   if (brw->gen >= 8) {
-      /* On Broadwell, the border color is represented as four 32-bit floats,
-       * integers, or unsigned values, interpreted according to the surface
-       * format.  This matches the sampler->BorderColor union exactly; just
-       * memcpy the values.
-       */
-      uint32_t *sdc = brw_state_batch(brw, 4 * 4, 64, sdc_offset);
-      memcpy(sdc, color.ui, 4 * 4);
-   } else if (brw->is_haswell && (is_integer_format || is_stencil_sampling)) {
-      /* Haswell's integer border color support is completely insane:
-       * SAMPLER_BORDER_COLOR_STATE is 20 DWords.  The first four are
-       * for float colors.  The next 12 DWords are MBZ and only exist to
-       * pad it out to a 64 byte cacheline boundary.  DWords 16-19 then
-       * contain integer colors; these are only used if SURFACE_STATE
-       * has the "Integer Surface Format" bit set.  Even then, the
-       * arrangement of the RGBA data devolves into madness.
-       */
-      uint32_t *sdc = brw_state_batch(brw, 20 * 4, 512, sdc_offset);
-      memset(sdc, 0, 20 * 4);
-      sdc = &sdc[16];
-
-      bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
-      const int bits_per_channel =
-         _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
-
-      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
-       * "If any color channel is missing from the surface format,
-       *  corresponding border color should be programmed as zero and if
-       *  alpha channel is missing, corresponding Alpha border color should
-       *  be programmed as 1."
-       */
-      unsigned c[4] = { 0, 0, 0, 1 };
-      for (int i = 0; i < 4; i++) {
-         if (has_component(format, i))
-            c[i] = color.ui[i];
-      }
-
-      switch (bits_per_channel) {
-      case 8:
-         /* Copy RGBA in order. */
-         for (int i = 0; i < 4; i++)
-            ((uint8_t *) sdc)[i] = c[i];
-         break;
-      case 10:
-         /* R10G10B10A2_UINT is treated like a 16-bit format. */
-      case 16:
-         ((uint16_t *) sdc)[0] = c[0]; /* R -> DWord 0, bits 15:0  */
-         ((uint16_t *) sdc)[1] = c[1]; /* G -> DWord 0, bits 31:16 */
-         /* DWord 1 is Reserved/MBZ! */
-         ((uint16_t *) sdc)[4] = c[2]; /* B -> DWord 2, bits 15:0  */
-         ((uint16_t *) sdc)[5] = c[3]; /* A -> DWord 3, bits 31:16 */
-         break;
-      case 32:
-         if (base_format == GL_RG) {
-            /* Careful inspection of the tables reveals that for RG32 formats,
-             * the green channel needs to go where blue normally belongs.
-             */
-            sdc[0] = c[0];
-            sdc[2] = c[1];
-            sdc[3] = 1;
-         } else {
-            /* Copy RGBA in order. */
-            for (int i = 0; i < 4; i++)
-               sdc[i] = c[i];
-         }
-         break;
-      default:
-         assert(!"Invalid number of bits per channel in integer format.");
-         break;
-      }
-   } else if (brw->gen == 5 || brw->gen == 6) {
-      struct gen5_sampler_default_color *sdc;
-
-      sdc = brw_state_batch(brw, sizeof(*sdc), 32, sdc_offset);
-
-      memset(sdc, 0, sizeof(*sdc));
-
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[0], color.f[0]);
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[1], color.f[1]);
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[2], color.f[2]);
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[3], color.f[3]);
-
-      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[0], color.f[0]);
-      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[1], color.f[1]);
-      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[2], color.f[2]);
-      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[3], color.f[3]);
-
-      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[0], color.f[0]);
-      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[1], color.f[1]);
-      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[2], color.f[2]);
-      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[3], color.f[3]);
-
-      sdc->hf[0] = _mesa_float_to_half(color.f[0]);
-      sdc->hf[1] = _mesa_float_to_half(color.f[1]);
-      sdc->hf[2] = _mesa_float_to_half(color.f[2]);
-      sdc->hf[3] = _mesa_float_to_half(color.f[3]);
-
-      sdc->b[0] = sdc->s[0] >> 8;
-      sdc->b[1] = sdc->s[1] >> 8;
-      sdc->b[2] = sdc->s[2] >> 8;
-      sdc->b[3] = sdc->s[3] >> 8;
-
-      sdc->f[0] = color.f[0];
-      sdc->f[1] = color.f[1];
-      sdc->f[2] = color.f[2];
-      sdc->f[3] = color.f[3];
-   } else {
-      float *sdc = brw_state_batch(brw, 4 * 4, 32, sdc_offset);
-      memcpy(sdc, color.f, 4 * 4);
-   }
-}
-
-/**
- * Sets the sampler state for a single unit based off of the sampler key
- * entry.
- */
-static void
-brw_update_sampler_state(struct brw_context *brw,
-                         GLenum target, bool tex_cube_map_seamless,
-                         GLfloat tex_unit_lod_bias,
-                         mesa_format format, GLenum base_format,
-                         const struct gl_texture_object *texObj,
-                         const struct gl_sampler_object *sampler,
-                         uint32_t *sampler_state,
-                         uint32_t batch_offset_for_sampler_state)
-{
-   unsigned min_filter, mag_filter, mip_filter;
-
-   /* Select min and mip filters. */
-   switch (sampler->MinFilter) {
-   case GL_NEAREST:
-      min_filter = BRW_MAPFILTER_NEAREST;
-      mip_filter = BRW_MIPFILTER_NONE;
-      break;
-   case GL_LINEAR:
-      min_filter = BRW_MAPFILTER_LINEAR;
-      mip_filter = BRW_MIPFILTER_NONE;
-      break;
-   case GL_NEAREST_MIPMAP_NEAREST:
-      min_filter = BRW_MAPFILTER_NEAREST;
-      mip_filter = BRW_MIPFILTER_NEAREST;
-      break;
-   case GL_LINEAR_MIPMAP_NEAREST:
-      min_filter = BRW_MAPFILTER_LINEAR;
-      mip_filter = BRW_MIPFILTER_NEAREST;
-      break;
-   case GL_NEAREST_MIPMAP_LINEAR:
-      min_filter = BRW_MAPFILTER_NEAREST;
-      mip_filter = BRW_MIPFILTER_LINEAR;
-      break;
-   case GL_LINEAR_MIPMAP_LINEAR:
-      min_filter = BRW_MAPFILTER_LINEAR;
-      mip_filter = BRW_MIPFILTER_LINEAR;
-      break;
-   default:
-      unreachable("not reached");
-   }
-
-   /* Select mag filter. */
-   if (sampler->MagFilter == GL_LINEAR)
-      mag_filter = BRW_MAPFILTER_LINEAR;
-   else
-      mag_filter = BRW_MAPFILTER_NEAREST;
-
-   /* Enable anisotropic filtering if desired. */
-   unsigned max_anisotropy = BRW_ANISORATIO_2;
-   if (sampler->MaxAnisotropy > 1.0f) {
-      if (min_filter == BRW_MAPFILTER_LINEAR)
-         min_filter = BRW_MAPFILTER_ANISOTROPIC;
-      if (mag_filter == BRW_MAPFILTER_LINEAR)
-         mag_filter = BRW_MAPFILTER_ANISOTROPIC;
-
-      if (sampler->MaxAnisotropy > 2.0f) {
-	 max_anisotropy =
-            MIN2((sampler->MaxAnisotropy - 2) / 2, BRW_ANISORATIO_16);
-      }
-   }
-
-   /* Set address rounding bits if not using nearest filtering. */
-   unsigned address_rounding = 0;
-   if (min_filter != BRW_MAPFILTER_NEAREST) {
-      address_rounding |= BRW_ADDRESS_ROUNDING_ENABLE_U_MIN |
-                          BRW_ADDRESS_ROUNDING_ENABLE_V_MIN |
-                          BRW_ADDRESS_ROUNDING_ENABLE_R_MIN;
-   }
-   if (mag_filter != BRW_MAPFILTER_NEAREST) {
-      address_rounding |= BRW_ADDRESS_ROUNDING_ENABLE_U_MAG |
-                          BRW_ADDRESS_ROUNDING_ENABLE_V_MAG |
-                          BRW_ADDRESS_ROUNDING_ENABLE_R_MAG;
-   }
-
-   bool either_nearest =
-      sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
-   unsigned wrap_s = translate_wrap_mode(brw, sampler->WrapS, either_nearest);
-   unsigned wrap_t = translate_wrap_mode(brw, sampler->WrapT, either_nearest);
-   unsigned wrap_r = translate_wrap_mode(brw, sampler->WrapR, either_nearest);
-
-   if (target == GL_TEXTURE_CUBE_MAP ||
-       target == GL_TEXTURE_CUBE_MAP_ARRAY) {
-      /* Cube maps must use the same wrap mode for all three coordinate
-       * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
-       *
-       * Ivybridge and Baytrail seem to have problems with CUBE mode and
-       * integer formats.  Fall back to CLAMP for now.
-       */
-      if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
-          !(brw->gen == 7 && !brw->is_haswell && texObj->_IsIntegerFormat)) {
-	 wrap_s = BRW_TEXCOORDMODE_CUBE;
-	 wrap_t = BRW_TEXCOORDMODE_CUBE;
-	 wrap_r = BRW_TEXCOORDMODE_CUBE;
-      } else {
-	 wrap_s = BRW_TEXCOORDMODE_CLAMP;
-	 wrap_t = BRW_TEXCOORDMODE_CLAMP;
-	 wrap_r = BRW_TEXCOORDMODE_CLAMP;
-      }
-   } else if (target == GL_TEXTURE_1D) {
-      /* There's a bug in 1D texture sampling - it actually pays
-       * attention to the wrap_t value, though it should not.
-       * Override the wrap_t value here to GL_REPEAT to keep
-       * any nonexistent border pixels from floating in.
-       */
-      wrap_t = BRW_TEXCOORDMODE_WRAP;
-   }
-
-   /* Set shadow function. */
-   unsigned shadow_function = 0;
-   if (sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB) {
-      shadow_function =
-	 intel_translate_shadow_compare_func(sampler->CompareFunc);
-   }
-
-   const int lod_bits = brw->gen >= 7 ? 8 : 6;
-   const float hw_max_lod = brw->gen >= 7 ? 14 : 13;
-   const unsigned base_level =
-      U_FIXED(CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod), 1);
-   const unsigned min_lod =
-      U_FIXED(CLAMP(sampler->MinLod, 0, hw_max_lod), lod_bits);
-   const unsigned max_lod =
-      U_FIXED(CLAMP(sampler->MaxLod, 0, hw_max_lod), lod_bits);
-   const int lod_bias =
-      S_FIXED(CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15), lod_bits);
-
-   /* Upload the border color if necessary.  If not, just point it at
-    * offset 0 (the start of the batch) - the color should be ignored,
-    * but that address won't fault in case something reads it anyway.
-    */
-   uint32_t border_color_offset = 0;
-   if (wrap_mode_needs_border_color(wrap_s) ||
-       wrap_mode_needs_border_color(wrap_t) ||
-       wrap_mode_needs_border_color(wrap_r)) {
-      upload_default_color(brw, sampler, format, base_format,
-                           texObj->_IsIntegerFormat, texObj->StencilSampling,
-                           &border_color_offset);
-   }
-
-   const bool non_normalized_coords = target == GL_TEXTURE_RECTANGLE;
-
-   brw_emit_sampler_state(brw,
-                          sampler_state,
-                          batch_offset_for_sampler_state,
-                          min_filter, mag_filter, mip_filter,
-                          max_anisotropy,
-                          address_rounding,
-                          wrap_s, wrap_t, wrap_r,
-                          base_level, min_lod, max_lod, lod_bias,
-                          shadow_function,
-                          non_normalized_coords,
-                          border_color_offset);
-}
-
-static void
-update_sampler_state(struct brw_context *brw,
-                     int unit,
-                     uint32_t *sampler_state,
-                     uint32_t batch_offset_for_sampler_state)
-{
-   struct gl_context *ctx = &brw->ctx;
-   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   const struct gl_texture_object *texObj = texUnit->_Current;
-   const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
-
-   /* These don't use samplers at all. */
-   if (texObj->Target == GL_TEXTURE_BUFFER)
-      return;
-
-   struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
-   brw_update_sampler_state(brw, texObj->Target, ctx->Texture.CubeMapSeamless,
-                            texUnit->LodBias,
-                            firstImage->TexFormat, firstImage->_BaseFormat,
-                            texObj, sampler,
-                            sampler_state, batch_offset_for_sampler_state);
-}
-
-static void
-brw_upload_sampler_state_table(struct brw_context *brw,
-                               struct gl_program *prog,
-                               struct brw_stage_state *stage_state)
-{
-   struct gl_context *ctx = &brw->ctx;
-   uint32_t sampler_count = stage_state->sampler_count;
-
-   GLbitfield SamplersUsed = prog->SamplersUsed;
-
-   if (sampler_count == 0)
-      return;
-
-   /* SAMPLER_STATE is 4 DWords on all platforms. */
-   const int dwords = 4;
-   const int size_in_bytes = dwords * sizeof(uint32_t);
-
-   uint32_t *sampler_state = brw_state_batch(brw,
-                                             sampler_count * size_in_bytes,
-                                             32, &stage_state->sampler_offset);
-   memset(sampler_state, 0, sampler_count * size_in_bytes);
-
-   uint32_t batch_offset_for_sampler_state = stage_state->sampler_offset;
-
-   for (unsigned s = 0; s < sampler_count; s++) {
-      if (SamplersUsed & (1 << s)) {
-         const unsigned unit = prog->SamplerUnits[s];
-         if (ctx->Texture.Unit[unit]._Current) {
-            update_sampler_state(brw, unit, sampler_state,
-                                     batch_offset_for_sampler_state);
-         }
-      }
-
-      sampler_state += dwords;
-      batch_offset_for_sampler_state += size_in_bytes;
-   }
-
-   if (brw->gen >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
-      /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
-      gen7_emit_sampler_state_pointers_xs(brw, stage_state);
-   } else {
-      /* Flag that the sampler state table pointer has changed; later atoms
-       * will handle it.
-       */
-      brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
-   }
-}
-
-static void
-brw_upload_fs_samplers(struct brw_context *brw)
-{
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   struct gl_program *fs = (struct gl_program *) brw->fragment_program;
-   brw_upload_sampler_state_table(brw, fs, &brw->wm.base);
-}
-
-const struct brw_tracked_state brw_fs_samplers = {
-   .dirty = {
-      .mesa = _NEW_TEXTURE,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_FRAGMENT_PROGRAM,
-   },
-   .emit = brw_upload_fs_samplers,
-};
-
-static void
-brw_upload_vs_samplers(struct brw_context *brw)
-{
-   /* BRW_NEW_VERTEX_PROGRAM */
-   struct gl_program *vs = (struct gl_program *) brw->vertex_program;
-   brw_upload_sampler_state_table(brw, vs, &brw->vs.base);
-}
-
-
-const struct brw_tracked_state brw_vs_samplers = {
-   .dirty = {
-      .mesa = _NEW_TEXTURE,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VERTEX_PROGRAM,
-   },
-   .emit = brw_upload_vs_samplers,
-};
-
-
-static void
-brw_upload_gs_samplers(struct brw_context *brw)
-{
-   /* BRW_NEW_GEOMETRY_PROGRAM */
-   struct gl_program *gs = (struct gl_program *) brw->geometry_program;
-   if (!gs)
-      return;
-
-   brw_upload_sampler_state_table(brw, gs, &brw->gs.base);
-}
-
-
-const struct brw_tracked_state brw_gs_samplers = {
-   .dirty = {
-      .mesa = _NEW_TEXTURE,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_GEOMETRY_PROGRAM,
-   },
-   .emit = brw_upload_gs_samplers,
-};
-
-
-static void
-brw_upload_tcs_samplers(struct brw_context *brw)
-{
-   /* BRW_NEW_TESS_PROGRAMS */
-   struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program;
-   if (!tcs)
-      return;
-
-   brw_upload_sampler_state_table(brw, tcs, &brw->tcs.base);
-}
-
-
-const struct brw_tracked_state brw_tcs_samplers = {
-   .dirty = {
-      .mesa = _NEW_TEXTURE,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_TESS_PROGRAMS,
-   },
-   .emit = brw_upload_tcs_samplers,
-};
-
-
-static void
-brw_upload_tes_samplers(struct brw_context *brw)
-{
-   /* BRW_NEW_TESS_PROGRAMS */
-   struct gl_program *tes = (struct gl_program *) brw->tess_eval_program;
-   if (!tes)
-      return;
-
-   brw_upload_sampler_state_table(brw, tes, &brw->tes.base);
-}
-
-
-const struct brw_tracked_state brw_tes_samplers = {
-   .dirty = {
-      .mesa = _NEW_TEXTURE,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_TESS_PROGRAMS,
-   },
-   .emit = brw_upload_tes_samplers,
-};
-
-static void
-brw_upload_cs_samplers(struct brw_context *brw)
-{
-   /* BRW_NEW_COMPUTE_PROGRAM */
-   struct gl_program *cs = (struct gl_program *) brw->compute_program;
-   if (!cs)
-      return;
-
-   brw_upload_sampler_state_table(brw, cs, &brw->cs.base);
-}
-
-const struct brw_tracked_state brw_cs_samplers = {
-   .dirty = {
-      .mesa = _NEW_TEXTURE,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_COMPUTE_PROGRAM,
-   },
-   .emit = brw_upload_cs_samplers,
-};
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index ff15c97..1d50232 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -34,97 +34,35 @@
 #include "main/mtypes.h"
 #include "main/enums.h"
 #include "main/fbobject.h"
+#include "main/state.h"
 
 #include "intel_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_util.h"
-#include "brw_sf.h"
 #include "brw_state.h"
+#include "compiler/brw_eu.h"
 
 #include "util/ralloc.h"
 
 static void compile_sf_prog( struct brw_context *brw,
 			     struct brw_sf_prog_key *key )
 {
-   struct brw_sf_compile c;
-   const GLuint *program;
+   const unsigned *program;
    void *mem_ctx;
-   GLuint program_size;
-
-   memset(&c, 0, sizeof(c));
+   unsigned program_size;
 
    mem_ctx = ralloc_context(NULL);
-   /* Begin the compilation:
-    */
-   brw_init_codegen(&brw->screen->devinfo, &c.func, mem_ctx);
 
-   c.key = *key;
-   c.vue_map = brw->vue_map_geom_out;
-   if (c.key.do_point_coord) {
-      /*
-       * gl_PointCoord is a FS instead of VS builtin variable, thus it's
-       * not included in c.vue_map generated in VS stage. Here we add
-       * it manually to let SF shader generate the needed interpolation
-       * coefficient for FS shader.
-       */
-      c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots;
-      c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC;
-   }
-   c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
-   c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset;
-   c.nr_setup_regs = c.nr_attr_regs;
-
-   c.prog_data.urb_read_length = c.nr_attr_regs;
-   c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
-
-   /* Which primitive?  Or all three?
-    */
-   switch (key->primitive) {
-   case SF_TRIANGLES:
-      c.nr_verts = 3;
-      brw_emit_tri_setup( &c, true );
-      break;
-   case SF_LINES:
-      c.nr_verts = 2;
-      brw_emit_line_setup( &c, true );
-      break;
-   case SF_POINTS:
-      c.nr_verts = 1;
-      if (key->do_point_sprite)
-	  brw_emit_point_sprite_setup( &c, true );
-      else
-	  brw_emit_point_setup( &c, true );
-      break;
-   case SF_UNFILLED_TRIS:
-      c.nr_verts = 3;
-      brw_emit_anyprim_setup( &c );
-      break;
-   default:
-      unreachable("not reached");
-   }
-
-   /* FINISHME: SF programs use calculated jumps (i.e., JMPI with a register
-    * source). Compacting would be difficult.
-    */
-   /* brw_compact_instructions(&c.func, 0, 0, NULL); */
-
-   /* get the program
-    */
-   program = brw_get_program(&c.func, &program_size);
-
-   if (unlikely(INTEL_DEBUG & DEBUG_SF)) {
-      fprintf(stderr, "sf:\n");
-      brw_disassemble(&brw->screen->devinfo,
-                      c.func.store, 0, program_size, stderr);
-      fprintf(stderr, "\n");
-   }
+   struct brw_sf_prog_data prog_data;
+   program = brw_compile_sf(brw->screen->compiler, mem_ctx, key, &prog_data,
+                            &brw->vue_map_geom_out, &program_size);
 
    brw_upload_cache(&brw->cache, BRW_CACHE_SF_PROG,
-		    &c.key, sizeof(c.key),
+		    key, sizeof(*key),
 		    program, program_size,
-		    &c.prog_data, sizeof(c.prog_data),
+		    &prog_data, sizeof(prog_data),
 		    &brw->sf.prog_offset, &brw->sf.prog_data);
    ralloc_free(mem_ctx);
 }
@@ -170,15 +108,15 @@
        * program.
        */
       if (key.attrs & BITFIELD64_BIT(VARYING_SLOT_EDGE))
-	 key.primitive = SF_UNFILLED_TRIS;
+	 key.primitive = BRW_SF_PRIM_UNFILLED_TRIS;
       else
-	 key.primitive = SF_TRIANGLES;
+	 key.primitive = BRW_SF_PRIM_TRIANGLES;
       break;
    case GL_LINES:
-      key.primitive = SF_LINES;
+      key.primitive = BRW_SF_PRIM_LINES;
       break;
    case GL_POINTS:
-      key.primitive = SF_POINTS;
+      key.primitive = BRW_SF_PRIM_POINTS;
       break;
    }
 
@@ -207,12 +145,15 @@
       brw_wm_prog_data(brw->wm.base.prog_data);
    if (wm_prog_data) {
       key.contains_flat_varying = wm_prog_data->contains_flat_varying;
-      key.interp_mode = wm_prog_data->interp_mode;
+
+      STATIC_ASSERT(sizeof(key.interp_mode) ==
+                    sizeof(wm_prog_data->interp_mode));
+      memcpy(key.interp_mode, wm_prog_data->interp_mode,
+             sizeof(key.interp_mode));
    }
 
    /* _NEW_LIGHT | _NEW_PROGRAM */
-   key.do_twoside_color = ((ctx->Light.Enabled && ctx->Light.Model.TwoSide) ||
-                           ctx->VertexProgram._TwoSideEnabled);
+   key.do_twoside_color = _mesa_vertex_program_two_side_enabled(ctx);
 
    /* _NEW_POLYGON */
    if (key.do_twoside_color) {
@@ -220,7 +161,7 @@
        * face orientation, just as we invert the viewport in
        * sf_unit_create_from_key().
        */
-      key.frontface_ccw = ctx->Polygon._FrontBit == render_to_fbo;
+      key.frontface_ccw = brw->polygon_front_bit == render_to_fbo;
    }
 
    if (!brw_search_cache(&brw->cache, BRW_CACHE_SF_PROG,
diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h
deleted file mode 100644
index f372656..0000000
--- a/src/mesa/drivers/dri/i965/brw_sf.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
-
-#ifndef BRW_SF_H
-#define BRW_SF_H
-
-
-#include "program/program.h"
-#include "brw_context.h"
-#include "compiler/brw_eu.h"
-
-
-#define SF_POINTS    0
-#define SF_LINES     1
-#define SF_TRIANGLES 2
-#define SF_UNFILLED_TRIS   3
-
-struct brw_sf_prog_key {
-   GLbitfield64 attrs;
-   bool contains_flat_varying;
-   const unsigned char *interp_mode;
-   uint8_t point_sprite_coord_replace;
-   GLuint primitive:2;
-   GLuint do_twoside_color:1;
-   GLuint frontface_ccw:1;
-   GLuint do_point_sprite:1;
-   GLuint do_point_coord:1;
-   GLuint sprite_origin_lower_left:1;
-   GLuint userclip_active:1;
-};
-
-struct brw_sf_compile {
-   struct brw_codegen func;
-   struct brw_sf_prog_key key;
-   struct brw_sf_prog_data prog_data;
-
-   struct brw_reg pv;
-   struct brw_reg det;
-   struct brw_reg dx0;
-   struct brw_reg dx2;
-   struct brw_reg dy0;
-   struct brw_reg dy2;
-
-   /* z and 1/w passed in seperately:
-    */
-   struct brw_reg z[3];
-   struct brw_reg inv_w[3];
-
-   /* The vertices:
-    */
-   struct brw_reg vert[3];
-
-    /* Temporaries, allocated after last vertex reg.
-    */
-   struct brw_reg inv_det;
-   struct brw_reg a1_sub_a0;
-   struct brw_reg a2_sub_a0;
-   struct brw_reg tmp;
-
-   struct brw_reg m1Cx;
-   struct brw_reg m2Cy;
-   struct brw_reg m3C0;
-
-   GLuint nr_verts;
-   GLuint nr_attr_regs;
-   GLuint nr_setup_regs;
-   int urb_entry_read_offset;
-
-   /** The last known value of the f0.0 flag register. */
-   unsigned flag_value;
-
-   struct brw_vue_map vue_map;
-};
-
-
-void brw_emit_tri_setup( struct brw_sf_compile *c, bool allocate );
-void brw_emit_line_setup( struct brw_sf_compile *c, bool allocate );
-void brw_emit_point_setup( struct brw_sf_compile *c, bool allocate );
-void brw_emit_point_sprite_setup( struct brw_sf_compile *c, bool allocate );
-void brw_emit_anyprim_setup( struct brw_sf_compile *c );
-
-#endif
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
deleted file mode 100644
index d50ceb1..0000000
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
-
-
-#include "main/mtypes.h"
-#include "main/macros.h"
-#include "main/fbobject.h"
-#include "main/viewport.h"
-#include "intel_batchbuffer.h"
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_sf.h"
-
-static void upload_sf_vp(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   struct brw_sf_viewport *sfv;
-   GLfloat y_scale, y_bias;
-   float scale[3], translate[3];
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-
-   sfv = brw_state_batch(brw, sizeof(*sfv), 32, &brw->sf.vp_offset);
-   memset(sfv, 0, sizeof(*sfv));
-
-   /* Accessing the fields Width and Height of gl_framebuffer to produce the
-    * values to program the viewport and scissor is fine as long as the
-    * gl_framebuffer has atleast one attachment.
-    */
-   assert(ctx->DrawBuffer->_HasAttachments);
-
-   if (render_to_fbo) {
-      y_scale = 1.0;
-      y_bias = 0;
-   }
-   else {
-      y_scale = -1.0;
-      y_bias = ctx->DrawBuffer->Height;
-   }
-
-   /* _NEW_VIEWPORT */
-
-   _mesa_get_viewport_xform(ctx, 0, scale, translate);
-   sfv->viewport.m00 = scale[0];
-   sfv->viewport.m11 = scale[1] * y_scale;
-   sfv->viewport.m22 = scale[2];
-   sfv->viewport.m30 = translate[0];
-   sfv->viewport.m31 = translate[1] * y_scale + y_bias;
-   sfv->viewport.m32 = translate[2];
-
-   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT
-    * for DrawBuffer->_[XY]{min,max}
-    */
-
-   /* The scissor only needs to handle the intersection of drawable
-    * and scissor rect, since there are no longer cliprects for shared
-    * buffers with DRI2.
-    *
-    * Note that the hardware's coordinates are inclusive, while Mesa's min is
-    * inclusive but max is exclusive.
-    */
-
-   if (ctx->DrawBuffer->_Xmin == ctx->DrawBuffer->_Xmax ||
-       ctx->DrawBuffer->_Ymin == ctx->DrawBuffer->_Ymax) {
-      /* If the scissor was out of bounds and got clamped to 0
-       * width/height at the bounds, the subtraction of 1 from
-       * maximums could produce a negative number and thus not clip
-       * anything.  Instead, just provide a min > max scissor inside
-       * the bounds, which produces the expected no rendering.
-       */
-      sfv->scissor.xmin = 1;
-      sfv->scissor.xmax = 0;
-      sfv->scissor.ymin = 1;
-      sfv->scissor.ymax = 0;
-   } else if (render_to_fbo) {
-      /* texmemory: Y=0=bottom */
-      sfv->scissor.xmin = ctx->DrawBuffer->_Xmin;
-      sfv->scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
-      sfv->scissor.ymin = ctx->DrawBuffer->_Ymin;
-      sfv->scissor.ymax = ctx->DrawBuffer->_Ymax - 1;
-   }
-   else {
-      /* memory: Y=0=top */
-      sfv->scissor.xmin = ctx->DrawBuffer->_Xmin;
-      sfv->scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
-      sfv->scissor.ymin = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymax;
-      sfv->scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1;
-   }
-
-   brw->ctx.NewDriverState |= BRW_NEW_SF_VP;
-}
-
-const struct brw_tracked_state brw_sf_vp = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_SCISSOR |
-               _NEW_VIEWPORT,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP,
-   },
-   .emit = upload_sf_vp
-};
-
-static void upload_sf_unit( struct brw_context *brw )
-{
-   struct gl_context *ctx = &brw->ctx;
-   struct brw_sf_unit_state *sf;
-   int chipset_max_threads;
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-
-   sf = brw_state_batch(brw, sizeof(*sf), 64, &brw->sf.state_offset);
-
-   memset(sf, 0, sizeof(*sf));
-
-   /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_SF_PROG_DATA */
-   sf->thread0.grf_reg_count = ALIGN(brw->sf.prog_data->total_grf, 16) / 16 - 1;
-   sf->thread0.kernel_start_pointer =
-      brw_program_reloc(brw,
-			brw->sf.state_offset +
-			offsetof(struct brw_sf_unit_state, thread0),
-			brw->sf.prog_offset +
-			(sf->thread0.grf_reg_count << 1)) >> 6;
-
-   sf->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-
-   sf->thread3.dispatch_grf_start_reg = 3;
-   sf->thread3.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
-
-   /* BRW_NEW_SF_PROG_DATA */
-   sf->thread3.urb_entry_read_length = brw->sf.prog_data->urb_read_length;
-
-   /* BRW_NEW_URB_FENCE */
-   sf->thread4.nr_urb_entries = brw->urb.nr_sf_entries;
-   sf->thread4.urb_entry_allocation_size = brw->urb.sfsize - 1;
-
-   /* Each SF thread produces 1 PUE, and there can be up to 24 (Pre-Ironlake) or
-    * 48 (Ironlake) threads.
-    */
-   if (brw->gen == 5)
-      chipset_max_threads = 48;
-   else
-      chipset_max_threads = 24;
-
-   /* BRW_NEW_URB_FENCE */
-   sf->thread4.max_threads = MIN2(chipset_max_threads,
-				  brw->urb.nr_sf_entries) - 1;
-
-   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
-      sf->thread4.stats_enable = 1;
-
-   /* BRW_NEW_SF_VP */
-   sf->sf5.sf_viewport_state_offset = (brw->batch.bo->offset64 +
-				       brw->sf.vp_offset) >> 5; /* reloc */
-
-   sf->sf5.viewport_transform = 1;
-
-   /* _NEW_SCISSOR */
-   if (ctx->Scissor.EnableFlags)
-      sf->sf6.scissor = 1;
-
-   /* _NEW_POLYGON */
-   if (ctx->Polygon._FrontBit)
-      sf->sf5.front_winding = BRW_FRONTWINDING_CW;
-   else
-      sf->sf5.front_winding = BRW_FRONTWINDING_CCW;
-
-   /* _NEW_BUFFERS
-    * The viewport is inverted for rendering to a FBO, and that inverts
-    * polygon front/back orientation.
-    */
-   sf->sf5.front_winding ^= render_to_fbo;
-
-   /* _NEW_POLYGON */
-   switch (ctx->Polygon.CullFlag ? ctx->Polygon.CullFaceMode : GL_NONE) {
-   case GL_FRONT:
-      sf->sf6.cull_mode = BRW_CULLMODE_FRONT;
-      break;
-   case GL_BACK:
-      sf->sf6.cull_mode = BRW_CULLMODE_BACK;
-      break;
-   case GL_FRONT_AND_BACK:
-      sf->sf6.cull_mode = BRW_CULLMODE_BOTH;
-      break;
-   case GL_NONE:
-      sf->sf6.cull_mode = BRW_CULLMODE_NONE;
-      break;
-   default:
-      unreachable("not reached");
-   }
-
-   /* _NEW_LINE */
-   sf->sf6.line_width =
-      CLAMP(ctx->Line.Width, 1.0f, ctx->Const.MaxLineWidth) * (1<<1);
-
-   sf->sf6.line_endcap_aa_region_width = 1;
-   if (ctx->Line.SmoothFlag)
-      sf->sf6.aa_enable = 1;
-   else if (sf->sf6.line_width <= 0x2)
-       sf->sf6.line_width = 0;
-
-   /* _NEW_BUFFERS */
-   if (!render_to_fbo) {
-      /* Rendering to an OpenGL window */
-      sf->sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;
-   }
-   else {
-      /* If rendering to an FBO, the pixel coordinate system is
-       * inverted with respect to the normal OpenGL coordinate
-       * system, so BRW_RASTRULE_LOWER_RIGHT is correct.
-       * But this value is listed as "Reserved, but not seen as useful"
-       * in Intel documentation (page 212, "Point Rasterization Rule",
-       * section 7.4 "SF Pipeline State Summary", of document
-       * "Intel® 965 Express Chipset Family and Intel® G35 Express
-       * Chipset Graphics Controller Programmer's Reference Manual,
-       * Volume 2: 3D/Media", Revision 1.0b as of January 2008,
-       * available at
-       *     https://01.org/linuxgraphics/documentation/hardware-specification-prms
-       * at the time of this writing).
-       *
-       * It does work on at least some devices, if not all;
-       * if devices that don't support it can be identified,
-       * the likely failure case is that points are rasterized
-       * incorrectly, which is no worse than occurs without
-       * the value, so we're using it here.
-       */
-      sf->sf6.point_rast_rule = BRW_RASTRULE_LOWER_RIGHT;
-   }
-   /* XXX clamp max depends on AA vs. non-AA */
-
-   /* _NEW_POINT */
-   sf->sf7.sprite_point = ctx->Point.PointSprite;
-   sf->sf7.point_size = CLAMP(rintf(CLAMP(ctx->Point.Size,
-                                          ctx->Point.MinSize,
-                                          ctx->Point.MaxSize)), 1.0f, 255.0f) *
-                        (1<<3);
-   /* _NEW_PROGRAM | _NEW_POINT */
-   sf->sf7.use_point_size_state = !(ctx->VertexProgram.PointSizeEnabled ||
-				    ctx->Point._Attenuated);
-   sf->sf7.aa_line_distance_mode = brw->is_g4x || brw->gen == 5;
-
-   /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
-    * _NEW_LIGHT
-    */
-   if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
-      sf->sf7.trifan_pv = 2;
-      sf->sf7.linestrip_pv = 1;
-      sf->sf7.tristrip_pv = 2;
-   } else {
-      sf->sf7.trifan_pv = 1;
-      sf->sf7.linestrip_pv = 0;
-      sf->sf7.tristrip_pv = 0;
-   }
-   sf->sf7.line_last_pixel_enable = 0;
-
-   /* Set bias for OpenGL rasterization rules:
-    */
-   sf->sf6.dest_org_vbias = 0x8;
-   sf->sf6.dest_org_hbias = 0x8;
-
-   /* STATE_PREFETCH command description describes this state as being
-    * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
-    */
-
-   /* Emit SF viewport relocation */
-   brw_emit_reloc(&brw->batch,
-                  brw->sf.state_offset +
-		  offsetof(struct brw_sf_unit_state, sf5),
-                  brw->batch.bo,
-                  brw->sf.vp_offset | sf->sf5.front_winding |
-                  (sf->sf5.viewport_transform << 1),
-                  I915_GEM_DOMAIN_INSTRUCTION, 0);
-
-   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
-}
-
-const struct brw_tracked_state brw_sf_unit = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_LIGHT |
-               _NEW_LINE |
-               _NEW_POINT |
-               _NEW_POLYGON |
-               _NEW_PROGRAM |
-               _NEW_SCISSOR,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_PROGRAM_CACHE |
-               BRW_NEW_SF_PROG_DATA |
-               BRW_NEW_SF_VP |
-               BRW_NEW_URB_FENCE,
-   },
-   .emit = upload_sf_unit,
-};
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index ec79a4e..1432a68 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -42,8 +42,6 @@
 enum intel_msaa_layout;
 
 extern const struct brw_tracked_state brw_blend_constant_color;
-extern const struct brw_tracked_state brw_cc_vp;
-extern const struct brw_tracked_state brw_cc_unit;
 extern const struct brw_tracked_state brw_clip_unit;
 extern const struct brw_tracked_state brw_vs_pull_constants;
 extern const struct brw_tracked_state brw_tcs_pull_constants;
@@ -54,21 +52,10 @@
 extern const struct brw_tracked_state brw_constant_buffer;
 extern const struct brw_tracked_state brw_curbe_offsets;
 extern const struct brw_tracked_state brw_invariant_state;
-extern const struct brw_tracked_state brw_fs_samplers;
-extern const struct brw_tracked_state brw_gs_unit;
-extern const struct brw_tracked_state brw_line_stipple;
 extern const struct brw_tracked_state brw_binding_table_pointers;
 extern const struct brw_tracked_state brw_depthbuffer;
-extern const struct brw_tracked_state brw_polygon_stipple_offset;
-extern const struct brw_tracked_state brw_polygon_stipple;
 extern const struct brw_tracked_state brw_recalculate_urb_fence;
-extern const struct brw_tracked_state brw_sf_unit;
 extern const struct brw_tracked_state brw_sf_vp;
-extern const struct brw_tracked_state brw_vs_samplers;
-extern const struct brw_tracked_state brw_tcs_samplers;
-extern const struct brw_tracked_state brw_tes_samplers;
-extern const struct brw_tracked_state brw_gs_samplers;
-extern const struct brw_tracked_state brw_cs_samplers;
 extern const struct brw_tracked_state brw_cs_texture_surfaces;
 extern const struct brw_tracked_state brw_vs_ubo_surfaces;
 extern const struct brw_tracked_state brw_vs_abo_surfaces;
@@ -82,7 +69,6 @@
 extern const struct brw_tracked_state brw_gs_ubo_surfaces;
 extern const struct brw_tracked_state brw_gs_abo_surfaces;
 extern const struct brw_tracked_state brw_gs_image_surfaces;
-extern const struct brw_tracked_state brw_vs_unit;
 extern const struct brw_tracked_state brw_renderbuffer_surfaces;
 extern const struct brw_tracked_state brw_renderbuffer_read_surfaces;
 extern const struct brw_tracked_state brw_texture_surfaces;
@@ -97,75 +83,24 @@
 extern const struct brw_tracked_state brw_cs_ubo_surfaces;
 extern const struct brw_tracked_state brw_cs_abo_surfaces;
 extern const struct brw_tracked_state brw_cs_image_surfaces;
-extern const struct brw_tracked_state brw_wm_unit;
 
 extern const struct brw_tracked_state brw_psp_urb_cbs;
 
-extern const struct brw_tracked_state brw_drawing_rect;
 extern const struct brw_tracked_state brw_indices;
-extern const struct brw_tracked_state brw_vertices;
 extern const struct brw_tracked_state brw_index_buffer;
-extern const struct brw_tracked_state brw_cs_state;
 extern const struct brw_tracked_state gen7_cs_push_constants;
 extern const struct brw_tracked_state gen6_binding_table_pointers;
-extern const struct brw_tracked_state gen6_blend_state;
-extern const struct brw_tracked_state gen6_clip_state;
-extern const struct brw_tracked_state gen6_sf_and_clip_viewports;
-extern const struct brw_tracked_state gen6_color_calc_state;
-extern const struct brw_tracked_state gen6_depth_stencil_state;
-extern const struct brw_tracked_state gen6_gs_state;
-extern const struct brw_tracked_state gen6_gs_push_constants;
 extern const struct brw_tracked_state gen6_gs_binding_table;
-extern const struct brw_tracked_state gen6_multisample_state;
 extern const struct brw_tracked_state gen6_renderbuffer_surfaces;
 extern const struct brw_tracked_state gen6_sampler_state;
-extern const struct brw_tracked_state gen6_scissor_state;
 extern const struct brw_tracked_state gen6_sol_surface;
-extern const struct brw_tracked_state gen6_sf_state;
 extern const struct brw_tracked_state gen6_sf_vp;
 extern const struct brw_tracked_state gen6_urb;
-extern const struct brw_tracked_state gen6_viewport_state;
-extern const struct brw_tracked_state gen6_vs_push_constants;
-extern const struct brw_tracked_state gen6_vs_state;
-extern const struct brw_tracked_state gen6_wm_push_constants;
-extern const struct brw_tracked_state gen6_wm_state;
 extern const struct brw_tracked_state gen7_depthbuffer;
-extern const struct brw_tracked_state gen7_ds_state;
-extern const struct brw_tracked_state gen7_gs_state;
-extern const struct brw_tracked_state gen7_tcs_push_constants;
-extern const struct brw_tracked_state gen7_hs_state;
 extern const struct brw_tracked_state gen7_l3_state;
-extern const struct brw_tracked_state gen7_ps_state;
 extern const struct brw_tracked_state gen7_push_constant_space;
-extern const struct brw_tracked_state gen7_sbe_state;
-extern const struct brw_tracked_state gen7_sf_clip_viewport;
-extern const struct brw_tracked_state gen7_sf_state;
-extern const struct brw_tracked_state gen7_sol_state;
-extern const struct brw_tracked_state gen7_te_state;
-extern const struct brw_tracked_state gen7_tes_push_constants;
 extern const struct brw_tracked_state gen7_urb;
-extern const struct brw_tracked_state gen7_vs_state;
-extern const struct brw_tracked_state gen7_wm_state;
-extern const struct brw_tracked_state haswell_cut_index;
-extern const struct brw_tracked_state gen8_blend_state;
-extern const struct brw_tracked_state gen8_ds_state;
-extern const struct brw_tracked_state gen8_gs_state;
-extern const struct brw_tracked_state gen8_hs_state;
-extern const struct brw_tracked_state gen8_index_buffer;
-extern const struct brw_tracked_state gen8_multisample_state;
 extern const struct brw_tracked_state gen8_pma_fix;
-extern const struct brw_tracked_state gen8_ps_blend;
-extern const struct brw_tracked_state gen8_ps_extra;
-extern const struct brw_tracked_state gen8_ps_state;
-extern const struct brw_tracked_state gen8_wm_depth_stencil;
-extern const struct brw_tracked_state gen8_wm_state;
-extern const struct brw_tracked_state gen8_raster_state;
-extern const struct brw_tracked_state gen8_sbe_state;
-extern const struct brw_tracked_state gen8_sf_state;
-extern const struct brw_tracked_state gen8_sf_clip_viewport;
-extern const struct brw_tracked_state gen8_vertices;
-extern const struct brw_tracked_state gen8_vf_topology;
-extern const struct brw_tracked_state gen8_vs_state;
 extern const struct brw_tracked_state brw_cs_work_groups_surface;
 
 static inline bool
@@ -187,6 +122,9 @@
 uint32_t
 brw_depthbuffer_format(struct brw_context *brw);
 
+uint32_t
+brw_convert_depth_value(mesa_format format, float value);
+
 void brw_upload_state_base_address(struct brw_context *brw);
 
 /* gen8_depth_state.c */
@@ -260,14 +198,13 @@
 void gen4_init_vtable_surface_functions(struct brw_context *brw);
 uint32_t brw_get_surface_tiling_bits(uint32_t tiling);
 uint32_t brw_get_surface_num_multisamples(unsigned num_samples);
-
-uint32_t brw_isl_format_for_mesa_format(mesa_format mesa_format);
+enum isl_format brw_isl_format_for_mesa_format(mesa_format mesa_format);
 
 GLuint translate_tex_target(GLenum target);
 
-GLuint translate_tex_format(struct brw_context *brw,
-                            mesa_format mesa_format,
-                            GLenum srgb_decode);
+enum isl_format translate_tex_format(struct brw_context *brw,
+                                     mesa_format mesa_format,
+                                     GLenum srgb_decode);
 
 int brw_get_texture_swizzle(const struct gl_context *ctx,
                             const struct gl_texture_object *t);
@@ -299,20 +236,6 @@
 void gen7_check_surface_setup(uint32_t *surf, bool is_render_target);
 void gen7_init_vtable_surface_functions(struct brw_context *brw);
 
-/* gen8_ps_state.c */
-void gen8_upload_ps_state(struct brw_context *brw,
-                          const struct brw_stage_state *stage_state,
-                          const struct brw_wm_prog_data *prog_data,
-                          uint32_t fast_clear_op);
-
-void gen8_upload_ps_extra(struct brw_context *brw,
-                          const struct brw_wm_prog_data *prog_data);
-
-/* gen7_sol_state.c */
-void gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
-                                      const struct brw_vue_map *vue_map);
-void gen8_upload_3dstate_so_buffers(struct brw_context *brw);
-
 /* gen8_surface_state.c */
 
 void gen8_init_vtable_surface_functions(struct brw_context *brw);
@@ -337,25 +260,6 @@
                             bool non_normalized_coordinates,
                             uint32_t border_color_offset);
 
-/* gen6_wm_state.c */
-void
-gen6_upload_wm_state(struct brw_context *brw,
-                     const struct brw_wm_prog_data *prog_data,
-                     const struct brw_stage_state *stage_state,
-                     bool multisampled_fbo,
-                     bool dual_source_blend_enable, bool kill_enable,
-                     bool color_buffer_write_enable, bool msaa_enabled,
-                     bool line_stipple_enable, bool polygon_stipple_enable,
-                     bool statistic_enable);
-
-/* gen6_sf_state.c */
-void
-calculate_attr_overrides(const struct brw_context *brw,
-                         uint16_t *attr_overrides,
-                         uint32_t *point_sprite_enables,
-                         uint32_t *urb_entry_read_length,
-                         uint32_t *urb_entry_read_offset);
-
 /* gen6_surface_state.c */
 void gen6_init_vtable_surface_functions(struct brw_context *brw);
 
@@ -435,11 +339,69 @@
           (brw->vue_map_geom_out.slots_valid & VARYING_BIT_PSIZ) == 0;
 }
 
-void brw_calculate_guardband_size(const struct gen_device_info *devinfo,
-                                  uint32_t fb_width, uint32_t fb_height,
-                                  float m00, float m11, float m30, float m31,
-                                  float *xmin, float *xmax,
-                                  float *ymin, float *ymax);
+void brw_copy_pipeline_atoms(struct brw_context *brw,
+                             enum brw_pipeline pipeline,
+                             const struct brw_tracked_state **atoms,
+                             int num_atoms);
+void gen4_init_atoms(struct brw_context *brw);
+void gen45_init_atoms(struct brw_context *brw);
+void gen5_init_atoms(struct brw_context *brw);
+void gen6_init_atoms(struct brw_context *brw);
+void gen7_init_atoms(struct brw_context *brw);
+void gen75_init_atoms(struct brw_context *brw);
+void gen8_init_atoms(struct brw_context *brw);
+void gen9_init_atoms(struct brw_context *brw);
+void gen10_init_atoms(struct brw_context *brw);
+
+/* Memory Object Control State:
+ * Specifying zero for L3 means "uncached in L3", at least on Haswell
+ * and Baytrail, since there are no PTE flags for setting L3 cacheability.
+ * On Ivybridge, the PTEs do have a cache-in-L3 bit, so setting MOCS to 0
+ * may still respect that.
+ */
+#define GEN7_MOCS_L3                    1
+
+/* Ivybridge only: cache in LLC.
+ * Specifying zero here means to use the PTE values set by the kernel;
+ * non-zero overrides the PTE values.
+ */
+#define IVB_MOCS_LLC                    (1 << 1)
+
+/* Baytrail only: snoop in CPU cache */
+#define BYT_MOCS_SNOOP                  (1 << 1)
+
+/* Haswell only: LLC/eLLC controls (write-back or uncached).
+ * Specifying zero here means to use the PTE values set by the kernel,
+ * which is useful since it offers additional control (write-through
+ * caching and age).  Non-zero overrides the PTE values.
+ */
+#define HSW_MOCS_UC_LLC_UC_ELLC         (1 << 1)
+#define HSW_MOCS_WB_LLC_WB_ELLC         (2 << 1)
+#define HSW_MOCS_UC_LLC_WB_ELLC         (3 << 1)
+
+/* Broadwell: these defines always use all available caches (L3, LLC, eLLC),
+ * and let you force write-back (WB) or write-through (WT) caching, or leave
+ * it up to the page table entry (PTE) specified by the kernel.
+ */
+#define BDW_MOCS_WB  0x78
+#define BDW_MOCS_WT  0x58
+#define BDW_MOCS_PTE 0x18
+
+/* Skylake: MOCS is now an index into an array of 62 different caching
+ * configurations programmed by the kernel.
+ */
+/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
+#define SKL_MOCS_WB  (2 << 1)
+/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
+#define SKL_MOCS_PTE (1 << 1)
+
+/* Cannonlake: MOCS is now an index into an array of 62 different caching
+ * configurations programmed by the kernel.
+ */
+/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
+#define CNL_MOCS_WB  (2 << 1)
+/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
+#define CNL_MOCS_PTE (1 << 1)
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 9c0b82c..07df3cc 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -45,341 +45,6 @@
 #include "brw_cs.h"
 #include "main/framebuffer.h"
 
-static const struct brw_tracked_state *gen4_atoms[] =
-{
-   /* Once all the programs are done, we know how large urb entry
-    * sizes need to be and can decide if we need to change the urb
-    * layout.
-    */
-   &brw_curbe_offsets,
-   &brw_recalculate_urb_fence,
-
-   &brw_cc_vp,
-   &brw_cc_unit,
-
-   /* Surface state setup.  Must come before the VS/WM unit.  The binding
-    * table upload must be last.
-    */
-   &brw_vs_pull_constants,
-   &brw_wm_pull_constants,
-   &brw_renderbuffer_surfaces,
-   &brw_renderbuffer_read_surfaces,
-   &brw_texture_surfaces,
-   &brw_vs_binding_table,
-   &brw_wm_binding_table,
-
-   &brw_fs_samplers,
-   &brw_vs_samplers,
-
-   /* These set up state for brw_psp_urb_cbs */
-   &brw_wm_unit,
-   &brw_sf_vp,
-   &brw_sf_unit,
-   &brw_vs_unit,		/* always required, enabled or not */
-   &brw_clip_unit,
-   &brw_gs_unit,
-
-   /* Command packets:
-    */
-   &brw_invariant_state,
-
-   &brw_binding_table_pointers,
-   &brw_blend_constant_color,
-
-   &brw_depthbuffer,
-
-   &brw_polygon_stipple,
-   &brw_polygon_stipple_offset,
-
-   &brw_line_stipple,
-
-   &brw_psp_urb_cbs,
-
-   &brw_drawing_rect,
-   &brw_indices, /* must come before brw_vertices */
-   &brw_index_buffer,
-   &brw_vertices,
-
-   &brw_constant_buffer
-};
-
-static const struct brw_tracked_state *gen6_atoms[] =
-{
-   &gen6_sf_and_clip_viewports,
-
-   /* Command packets: */
-
-   &brw_cc_vp,
-   &gen6_viewport_state,	/* must do after *_vp stages */
-
-   &gen6_urb,
-   &gen6_blend_state,		/* must do before cc unit */
-   &gen6_color_calc_state,	/* must do before cc unit */
-   &gen6_depth_stencil_state,	/* must do before cc unit */
-
-   &gen6_vs_push_constants, /* Before vs_state */
-   &gen6_gs_push_constants, /* Before gs_state */
-   &gen6_wm_push_constants, /* Before wm_state */
-
-   /* Surface state setup.  Must come before the VS/WM unit.  The binding
-    * table upload must be last.
-    */
-   &brw_vs_pull_constants,
-   &brw_vs_ubo_surfaces,
-   &brw_gs_pull_constants,
-   &brw_gs_ubo_surfaces,
-   &brw_wm_pull_constants,
-   &brw_wm_ubo_surfaces,
-   &gen6_renderbuffer_surfaces,
-   &brw_renderbuffer_read_surfaces,
-   &brw_texture_surfaces,
-   &gen6_sol_surface,
-   &brw_vs_binding_table,
-   &gen6_gs_binding_table,
-   &brw_wm_binding_table,
-
-   &brw_fs_samplers,
-   &brw_vs_samplers,
-   &brw_gs_samplers,
-   &gen6_sampler_state,
-   &gen6_multisample_state,
-
-   &gen6_vs_state,
-   &gen6_gs_state,
-   &gen6_clip_state,
-   &gen6_sf_state,
-   &gen6_wm_state,
-
-   &gen6_scissor_state,
-
-   &gen6_binding_table_pointers,
-
-   &brw_depthbuffer,
-
-   &brw_polygon_stipple,
-   &brw_polygon_stipple_offset,
-
-   &brw_line_stipple,
-
-   &brw_drawing_rect,
-
-   &brw_indices, /* must come before brw_vertices */
-   &brw_index_buffer,
-   &brw_vertices,
-};
-
-static const struct brw_tracked_state *gen7_render_atoms[] =
-{
-   /* Command packets: */
-
-   &brw_cc_vp,
-   &gen7_sf_clip_viewport,
-
-   &gen7_l3_state,
-   &gen7_push_constant_space,
-   &gen7_urb,
-   &gen6_blend_state,		/* must do before cc unit */
-   &gen6_color_calc_state,	/* must do before cc unit */
-   &gen6_depth_stencil_state,	/* must do before cc unit */
-
-   &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
-   &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
-   &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
-   &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
-   &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
-
-   &gen6_vs_push_constants, /* Before vs_state */
-   &gen7_tcs_push_constants,
-   &gen7_tes_push_constants,
-   &gen6_gs_push_constants, /* Before gs_state */
-   &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
-
-   /* Surface state setup.  Must come before the VS/WM unit.  The binding
-    * table upload must be last.
-    */
-   &brw_vs_pull_constants,
-   &brw_vs_ubo_surfaces,
-   &brw_vs_abo_surfaces,
-   &brw_tcs_pull_constants,
-   &brw_tcs_ubo_surfaces,
-   &brw_tcs_abo_surfaces,
-   &brw_tes_pull_constants,
-   &brw_tes_ubo_surfaces,
-   &brw_tes_abo_surfaces,
-   &brw_gs_pull_constants,
-   &brw_gs_ubo_surfaces,
-   &brw_gs_abo_surfaces,
-   &brw_wm_pull_constants,
-   &brw_wm_ubo_surfaces,
-   &brw_wm_abo_surfaces,
-   &gen6_renderbuffer_surfaces,
-   &brw_renderbuffer_read_surfaces,
-   &brw_texture_surfaces,
-   &brw_vs_binding_table,
-   &brw_tcs_binding_table,
-   &brw_tes_binding_table,
-   &brw_gs_binding_table,
-   &brw_wm_binding_table,
-
-   &brw_fs_samplers,
-   &brw_vs_samplers,
-   &brw_tcs_samplers,
-   &brw_tes_samplers,
-   &brw_gs_samplers,
-   &gen6_multisample_state,
-
-   &gen7_vs_state,
-   &gen7_hs_state,
-   &gen7_te_state,
-   &gen7_ds_state,
-   &gen7_gs_state,
-   &gen7_sol_state,
-   &gen6_clip_state,
-   &gen7_sbe_state,
-   &gen7_sf_state,
-   &gen7_wm_state,
-   &gen7_ps_state,
-
-   &gen6_scissor_state,
-
-   &gen7_depthbuffer,
-
-   &brw_polygon_stipple,
-   &brw_polygon_stipple_offset,
-
-   &brw_line_stipple,
-
-   &brw_drawing_rect,
-
-   &brw_indices, /* must come before brw_vertices */
-   &brw_index_buffer,
-   &brw_vertices,
-
-   &haswell_cut_index,
-};
-
-static const struct brw_tracked_state *gen7_compute_atoms[] =
-{
-   &gen7_l3_state,
-   &brw_cs_image_surfaces,
-   &gen7_cs_push_constants,
-   &brw_cs_pull_constants,
-   &brw_cs_ubo_surfaces,
-   &brw_cs_abo_surfaces,
-   &brw_cs_texture_surfaces,
-   &brw_cs_work_groups_surface,
-   &brw_cs_samplers,
-   &brw_cs_state,
-};
-
-static const struct brw_tracked_state *gen8_render_atoms[] =
-{
-   &brw_cc_vp,
-   &gen8_sf_clip_viewport,
-
-   &gen7_l3_state,
-   &gen7_push_constant_space,
-   &gen7_urb,
-   &gen8_blend_state,
-   &gen6_color_calc_state,
-
-   &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
-   &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
-   &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
-   &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
-   &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
-
-   &gen6_vs_push_constants, /* Before vs_state */
-   &gen7_tcs_push_constants,
-   &gen7_tes_push_constants,
-   &gen6_gs_push_constants, /* Before gs_state */
-   &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
-
-   /* Surface state setup.  Must come before the VS/WM unit.  The binding
-    * table upload must be last.
-    */
-   &brw_vs_pull_constants,
-   &brw_vs_ubo_surfaces,
-   &brw_vs_abo_surfaces,
-   &brw_tcs_pull_constants,
-   &brw_tcs_ubo_surfaces,
-   &brw_tcs_abo_surfaces,
-   &brw_tes_pull_constants,
-   &brw_tes_ubo_surfaces,
-   &brw_tes_abo_surfaces,
-   &brw_gs_pull_constants,
-   &brw_gs_ubo_surfaces,
-   &brw_gs_abo_surfaces,
-   &brw_wm_pull_constants,
-   &brw_wm_ubo_surfaces,
-   &brw_wm_abo_surfaces,
-   &gen6_renderbuffer_surfaces,
-   &brw_renderbuffer_read_surfaces,
-   &brw_texture_surfaces,
-   &brw_vs_binding_table,
-   &brw_tcs_binding_table,
-   &brw_tes_binding_table,
-   &brw_gs_binding_table,
-   &brw_wm_binding_table,
-
-   &brw_fs_samplers,
-   &brw_vs_samplers,
-   &brw_tcs_samplers,
-   &brw_tes_samplers,
-   &brw_gs_samplers,
-   &gen8_multisample_state,
-
-   &gen8_vs_state,
-   &gen8_hs_state,
-   &gen7_te_state,
-   &gen8_ds_state,
-   &gen8_gs_state,
-   &gen7_sol_state,
-   &gen6_clip_state,
-   &gen8_raster_state,
-   &gen8_sbe_state,
-   &gen8_sf_state,
-   &gen8_ps_blend,
-   &gen8_ps_extra,
-   &gen8_ps_state,
-   &gen8_wm_depth_stencil,
-   &gen8_wm_state,
-
-   &gen6_scissor_state,
-
-   &gen7_depthbuffer,
-
-   &brw_polygon_stipple,
-   &brw_polygon_stipple_offset,
-
-   &brw_line_stipple,
-
-   &brw_drawing_rect,
-
-   &gen8_vf_topology,
-
-   &brw_indices,
-   &gen8_index_buffer,
-   &gen8_vertices,
-
-   &haswell_cut_index,
-   &gen8_pma_fix,
-};
-
-static const struct brw_tracked_state *gen8_compute_atoms[] =
-{
-   &gen7_l3_state,
-   &brw_cs_image_surfaces,
-   &gen7_cs_push_constants,
-   &brw_cs_pull_constants,
-   &brw_cs_ubo_surfaces,
-   &brw_cs_abo_surfaces,
-   &brw_cs_texture_surfaces,
-   &brw_cs_work_groups_surface,
-   &brw_cs_samplers,
-   &brw_cs_state,
-};
-
 static void
 brw_upload_initial_gpu_state(struct brw_context *brw)
 {
@@ -395,14 +60,27 @@
 
    brw_upload_invariant_state(brw);
 
-   /* Recommended optimization for Victim Cache eviction in pixel backend. */
-   if (brw->gen >= 9) {
+   if (brw->gen == 9) {
+      /* Recommended optimizations for Victim Cache eviction and floating
+       * point blending.
+       */
       BEGIN_BATCH(3);
       OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
       OUT_BATCH(GEN7_CACHE_MODE_1);
-      OUT_BATCH(REG_MASK(GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC) |
+      OUT_BATCH(REG_MASK(GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE) |
+                REG_MASK(GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC) |
+                GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE |
                 GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC);
       ADVANCE_BATCH();
+
+      if (brw->is_broxton) {
+         BEGIN_BATCH(3);
+         OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
+         OUT_BATCH(GEN7_GT_MODE);
+         OUT_BATCH(GEN9_SUBSLICE_HASHING_MASK_BITS |
+                   GEN9_SUBSLICE_HASHING_16x16);
+         ADVANCE_BATCH();
+      }
    }
 
    if (brw->gen >= 8) {
@@ -439,7 +117,7 @@
    }
 }
 
-static void
+void
 brw_copy_pipeline_atoms(struct brw_context *brw,
                         enum brw_pipeline pipeline,
                         const struct brw_tracked_state **atoms,
@@ -467,40 +145,26 @@
    /* Force the first brw_select_pipeline to emit pipeline select */
    brw->last_pipeline = BRW_NUM_PIPELINES;
 
-   STATIC_ASSERT(ARRAY_SIZE(gen4_atoms) <= ARRAY_SIZE(brw->render_atoms));
-   STATIC_ASSERT(ARRAY_SIZE(gen6_atoms) <= ARRAY_SIZE(brw->render_atoms));
-   STATIC_ASSERT(ARRAY_SIZE(gen7_render_atoms) <=
-                 ARRAY_SIZE(brw->render_atoms));
-   STATIC_ASSERT(ARRAY_SIZE(gen8_render_atoms) <=
-                 ARRAY_SIZE(brw->render_atoms));
-   STATIC_ASSERT(ARRAY_SIZE(gen7_compute_atoms) <=
-                 ARRAY_SIZE(brw->compute_atoms));
-   STATIC_ASSERT(ARRAY_SIZE(gen8_compute_atoms) <=
-                 ARRAY_SIZE(brw->compute_atoms));
-
    brw_init_caches(brw);
 
-   if (brw->gen >= 8) {
-      brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
-                              gen8_render_atoms,
-                              ARRAY_SIZE(gen8_render_atoms));
-      brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
-                              gen8_compute_atoms,
-                              ARRAY_SIZE(gen8_compute_atoms));
-   } else if (brw->gen == 7) {
-      brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
-                              gen7_render_atoms,
-                              ARRAY_SIZE(gen7_render_atoms));
-      brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
-                              gen7_compute_atoms,
-                              ARRAY_SIZE(gen7_compute_atoms));
-   } else if (brw->gen == 6) {
-      brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
-                              gen6_atoms, ARRAY_SIZE(gen6_atoms));
-   } else {
-      brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
-                              gen4_atoms, ARRAY_SIZE(gen4_atoms));
-   }
+   if (brw->gen >= 10)
+      gen10_init_atoms(brw);
+   else if (brw->gen >= 9)
+      gen9_init_atoms(brw);
+   else if (brw->gen >= 8)
+      gen8_init_atoms(brw);
+   else if (brw->is_haswell)
+      gen75_init_atoms(brw);
+   else if (brw->gen >= 7)
+      gen7_init_atoms(brw);
+   else if (brw->gen >= 6)
+      gen6_init_atoms(brw);
+   else if (brw->gen >= 5)
+      gen5_init_atoms(brw);
+   else if (brw->is_g4x)
+      gen45_init_atoms(brw);
+   else
+      gen4_init_atoms(brw);
 
    brw_upload_initial_gpu_state(brw);
 
@@ -597,7 +261,6 @@
    DEFINE_BIT(_NEW_TRACK_MATRIX),
    DEFINE_BIT(_NEW_PROGRAM),
    DEFINE_BIT(_NEW_PROGRAM_CONSTANTS),
-   DEFINE_BIT(_NEW_BUFFER_OBJECT),
    DEFINE_BIT(_NEW_FRAG_CLAMP),
    /* Avoid sign extension problems. */
    {(unsigned) _NEW_VARYING_VP_INPUTS, "_NEW_VARYING_VP_INPUTS", 0},
@@ -620,7 +283,6 @@
    DEFINE_BIT(BRW_NEW_GEOMETRY_PROGRAM),
    DEFINE_BIT(BRW_NEW_TESS_PROGRAMS),
    DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM),
-   DEFINE_BIT(BRW_NEW_CURBE_OFFSETS),
    DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE),
    DEFINE_BIT(BRW_NEW_PATCH_PRIMITIVE),
    DEFINE_BIT(BRW_NEW_PRIMITIVE),
@@ -663,6 +325,8 @@
    DEFINE_BIT(BRW_NEW_BLORP),
    DEFINE_BIT(BRW_NEW_VIEWPORT_COUNT),
    DEFINE_BIT(BRW_NEW_CONSERVATIVE_RASTERIZATION),
+   DEFINE_BIT(BRW_NEW_DRAW_CALL),
+   DEFINE_BIT(BRW_NEW_FAST_CLEAR_COLOR),
    {0, 0, 0}
 };
 
@@ -779,7 +443,8 @@
    int i;
    static int dirty_count = 0;
    struct brw_state_flags state = brw->state.pipelines[pipeline];
-   unsigned int fb_samples = _mesa_geometric_samples(ctx->DrawBuffer);
+   const unsigned fb_samples =
+      MAX2(_mesa_geometric_samples(ctx->DrawBuffer), 1);
 
    brw_select_pipeline(brw, pipeline);
 
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 55338c0..fb592be 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -65,530 +65,6 @@
    } bits1;
 };
 
-/* State structs for the various fixed function units:
- */
-
-
-struct thread0
-{
-   unsigned pad0:1;
-   unsigned grf_reg_count:3;
-   unsigned pad1:2;
-   unsigned kernel_start_pointer:26; /* Offset from GENERAL_STATE_BASE */
-};
-
-struct thread1
-{
-   unsigned ext_halt_exception_enable:1;
-   unsigned sw_exception_enable:1;
-   unsigned mask_stack_exception_enable:1;
-   unsigned timeout_exception_enable:1;
-   unsigned illegal_op_exception_enable:1;
-   unsigned pad0:3;
-   unsigned depth_coef_urb_read_offset:6;	/* WM only */
-   unsigned pad1:2;
-   unsigned floating_point_mode:1;
-   unsigned thread_priority:1;
-   unsigned binding_table_entry_count:8;
-   unsigned pad3:5;
-   unsigned single_program_flow:1;
-};
-
-struct thread2
-{
-   unsigned per_thread_scratch_space:4;
-   unsigned pad0:6;
-   unsigned scratch_space_base_pointer:22;
-};
-
-
-struct thread3
-{
-   unsigned dispatch_grf_start_reg:4;
-   unsigned urb_entry_read_offset:6;
-   unsigned pad0:1;
-   unsigned urb_entry_read_length:6;
-   unsigned pad1:1;
-   unsigned const_urb_entry_read_offset:6;
-   unsigned pad2:1;
-   unsigned const_urb_entry_read_length:6;
-   unsigned pad3:1;
-};
-
-
-
-struct brw_clip_unit_state
-{
-   struct thread0 thread0;
-   struct
-   {
-      unsigned pad0:7;
-      unsigned sw_exception_enable:1;
-      unsigned pad1:3;
-      unsigned mask_stack_exception_enable:1;
-      unsigned pad2:1;
-      unsigned illegal_op_exception_enable:1;
-      unsigned pad3:2;
-      unsigned floating_point_mode:1;
-      unsigned thread_priority:1;
-      unsigned binding_table_entry_count:8;
-      unsigned pad4:5;
-      unsigned single_program_flow:1;
-   } thread1;
-
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct
-   {
-      unsigned pad0:9;
-      unsigned gs_output_stats:1; /* not always */
-      unsigned stats_enable:1;
-      unsigned nr_urb_entries:7;
-      unsigned pad1:1;
-      unsigned urb_entry_allocation_size:5;
-      unsigned pad2:1;
-      unsigned max_threads:5; 	/* may be less */
-      unsigned pad3:2;
-   } thread4;
-
-   struct
-   {
-      unsigned pad0:13;
-      unsigned clip_mode:3;
-      unsigned userclip_enable_flags:8;
-      unsigned userclip_must_clip:1;
-      unsigned negative_w_clip_test:1;
-      unsigned guard_band_enable:1;
-      unsigned viewport_z_clip_enable:1;
-      unsigned viewport_xy_clip_enable:1;
-      unsigned vertex_position_space:1;
-      unsigned api_mode:1;
-      unsigned pad2:1;
-   } clip5;
-
-   struct
-   {
-      unsigned pad0:5;
-      unsigned clipper_viewport_state_ptr:27;
-   } clip6;
-
-
-   float viewport_xmin;
-   float viewport_xmax;
-   float viewport_ymin;
-   float viewport_ymax;
-};
-
-struct gen6_blend_state
-{
-   struct {
-      unsigned dest_blend_factor:5;
-      unsigned source_blend_factor:5;
-      unsigned pad3:1;
-      unsigned blend_func:3;
-      unsigned pad2:1;
-      unsigned ia_dest_blend_factor:5;
-      unsigned ia_source_blend_factor:5;
-      unsigned pad1:1;
-      unsigned ia_blend_func:3;
-      unsigned pad0:1;
-      unsigned ia_blend_enable:1;
-      unsigned blend_enable:1;
-   } blend0;
-
-   struct {
-      unsigned post_blend_clamp_enable:1;
-      unsigned pre_blend_clamp_enable:1;
-      unsigned clamp_range:2;
-      unsigned pad0:4;
-      unsigned x_dither_offset:2;
-      unsigned y_dither_offset:2;
-      unsigned dither_enable:1;
-      unsigned alpha_test_func:3;
-      unsigned alpha_test_enable:1;
-      unsigned pad1:1;
-      unsigned logic_op_func:4;
-      unsigned logic_op_enable:1;
-      unsigned pad2:1;
-      unsigned write_disable_b:1;
-      unsigned write_disable_g:1;
-      unsigned write_disable_r:1;
-      unsigned write_disable_a:1;
-      unsigned pad3:1;
-      unsigned alpha_to_coverage_dither:1;
-      unsigned alpha_to_one:1;
-      unsigned alpha_to_coverage:1;
-   } blend1;
-};
-
-struct gen6_color_calc_state
-{
-   struct {
-      unsigned alpha_test_format:1;
-      unsigned pad0:14;
-      unsigned round_disable:1;
-      unsigned bf_stencil_ref:8;
-      unsigned stencil_ref:8;
-   } cc0;
-
-   union {
-      float alpha_ref_f;
-      struct {
-	 unsigned ui:8;
-	 unsigned pad0:24;
-      } alpha_ref_fi;
-   } cc1;
-
-   float constant_r;
-   float constant_g;
-   float constant_b;
-   float constant_a;
-};
-
-struct gen6_depth_stencil_state
-{
-   struct {
-      unsigned pad0:3;
-      unsigned bf_stencil_pass_depth_pass_op:3;
-      unsigned bf_stencil_pass_depth_fail_op:3;
-      unsigned bf_stencil_fail_op:3;
-      unsigned bf_stencil_func:3;
-      unsigned bf_stencil_enable:1;
-      unsigned pad1:2;
-      unsigned stencil_write_enable:1;
-      unsigned stencil_pass_depth_pass_op:3;
-      unsigned stencil_pass_depth_fail_op:3;
-      unsigned stencil_fail_op:3;
-      unsigned stencil_func:3;
-      unsigned stencil_enable:1;
-   } ds0;
-
-   struct {
-      unsigned bf_stencil_write_mask:8;
-      unsigned bf_stencil_test_mask:8;
-      unsigned stencil_write_mask:8;
-      unsigned stencil_test_mask:8;
-   } ds1;
-
-   struct {
-      unsigned pad0:26;
-      unsigned depth_write_enable:1;
-      unsigned depth_test_func:3;
-      unsigned pad1:1;
-      unsigned depth_test_enable:1;
-   } ds2;
-};
-
-struct brw_cc_unit_state
-{
-   struct
-   {
-      unsigned pad0:3;
-      unsigned bf_stencil_pass_depth_pass_op:3;
-      unsigned bf_stencil_pass_depth_fail_op:3;
-      unsigned bf_stencil_fail_op:3;
-      unsigned bf_stencil_func:3;
-      unsigned bf_stencil_enable:1;
-      unsigned pad1:2;
-      unsigned stencil_write_enable:1;
-      unsigned stencil_pass_depth_pass_op:3;
-      unsigned stencil_pass_depth_fail_op:3;
-      unsigned stencil_fail_op:3;
-      unsigned stencil_func:3;
-      unsigned stencil_enable:1;
-   } cc0;
-
-
-   struct
-   {
-      unsigned bf_stencil_ref:8;
-      unsigned stencil_write_mask:8;
-      unsigned stencil_test_mask:8;
-      unsigned stencil_ref:8;
-   } cc1;
-
-
-   struct
-   {
-      unsigned logicop_enable:1;
-      unsigned pad0:10;
-      unsigned depth_write_enable:1;
-      unsigned depth_test_function:3;
-      unsigned depth_test:1;
-      unsigned bf_stencil_write_mask:8;
-      unsigned bf_stencil_test_mask:8;
-   } cc2;
-
-
-   struct
-   {
-      unsigned pad0:8;
-      unsigned alpha_test_func:3;
-      unsigned alpha_test:1;
-      unsigned blend_enable:1;
-      unsigned ia_blend_enable:1;
-      unsigned pad1:1;
-      unsigned alpha_test_format:1;
-      unsigned pad2:16;
-   } cc3;
-
-   struct
-   {
-      unsigned pad0:5;
-      unsigned cc_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
-   } cc4;
-
-   struct
-   {
-      unsigned pad0:2;
-      unsigned ia_dest_blend_factor:5;
-      unsigned ia_src_blend_factor:5;
-      unsigned ia_blend_function:3;
-      unsigned statistics_enable:1;
-      unsigned logicop_func:4;
-      unsigned pad1:11;
-      unsigned dither_enable:1;
-   } cc5;
-
-   struct
-   {
-      unsigned clamp_post_alpha_blend:1;
-      unsigned clamp_pre_alpha_blend:1;
-      unsigned clamp_range:2;
-      unsigned pad0:11;
-      unsigned y_dither_offset:2;
-      unsigned x_dither_offset:2;
-      unsigned dest_blend_factor:5;
-      unsigned src_blend_factor:5;
-      unsigned blend_function:3;
-   } cc6;
-
-   struct {
-      union {
-	 float f;
-	 uint8_t ub[4];
-      } alpha_ref;
-   } cc7;
-};
-
-struct brw_sf_unit_state
-{
-   struct thread0 thread0;
-   struct thread1 thread1;
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct
-   {
-      unsigned pad0:10;
-      unsigned stats_enable:1;
-      unsigned nr_urb_entries:7;
-      unsigned pad1:1;
-      unsigned urb_entry_allocation_size:5;
-      unsigned pad2:1;
-      unsigned max_threads:6;
-      unsigned pad3:1;
-   } thread4;
-
-   struct
-   {
-      unsigned front_winding:1;
-      unsigned viewport_transform:1;
-      unsigned pad0:3;
-      unsigned sf_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
-   } sf5;
-
-   struct
-   {
-      unsigned pad0:9;
-      unsigned dest_org_vbias:4;
-      unsigned dest_org_hbias:4;
-      unsigned scissor:1;
-      unsigned disable_2x2_trifilter:1;
-      unsigned disable_zero_pix_trifilter:1;
-      unsigned point_rast_rule:2;
-      unsigned line_endcap_aa_region_width:2;
-      unsigned line_width:4;
-      unsigned fast_scissor_disable:1;
-      unsigned cull_mode:2;
-      unsigned aa_enable:1;
-   } sf6;
-
-   struct
-   {
-      unsigned point_size:11;
-      unsigned use_point_size_state:1;
-      unsigned subpixel_precision:1;
-      unsigned sprite_point:1;
-      unsigned pad0:10;
-      unsigned aa_line_distance_mode:1;
-      unsigned trifan_pv:2;
-      unsigned linestrip_pv:2;
-      unsigned tristrip_pv:2;
-      unsigned line_last_pixel_enable:1;
-   } sf7;
-
-};
-
-struct gen6_scissor_rect
-{
-   unsigned xmin:16;
-   unsigned ymin:16;
-   unsigned xmax:16;
-   unsigned ymax:16;
-};
-
-struct brw_gs_unit_state
-{
-   struct thread0 thread0;
-   struct thread1 thread1;
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct
-   {
-      unsigned pad0:8;
-      unsigned rendering_enable:1; /* for Ironlake */
-      unsigned pad4:1;
-      unsigned stats_enable:1;
-      unsigned nr_urb_entries:7;
-      unsigned pad1:1;
-      unsigned urb_entry_allocation_size:5;
-      unsigned pad2:1;
-      unsigned max_threads:5;
-      unsigned pad3:2;
-   } thread4;
-
-   struct
-   {
-      unsigned sampler_count:3;
-      unsigned pad0:2;
-      unsigned sampler_state_pointer:27;
-   } gs5;
-
-
-   struct
-   {
-      unsigned max_vp_index:4;
-      unsigned pad0:12;
-      unsigned svbi_post_inc_value:10;
-      unsigned pad1:1;
-      unsigned svbi_post_inc_enable:1;
-      unsigned svbi_payload:1;
-      unsigned discard_adjaceny:1;
-      unsigned reorder_enable:1;
-      unsigned pad2:1;
-   } gs6;
-};
-
-
-struct brw_vs_unit_state
-{
-   struct thread0 thread0;
-   struct thread1 thread1;
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct
-   {
-      unsigned pad0:10;
-      unsigned stats_enable:1;
-      unsigned nr_urb_entries:7;
-      unsigned pad1:1;
-      unsigned urb_entry_allocation_size:5;
-      unsigned pad2:1;
-      unsigned max_threads:6;
-      unsigned pad3:1;
-   } thread4;
-
-   struct
-   {
-      unsigned sampler_count:3;
-      unsigned pad0:2;
-      unsigned sampler_state_pointer:27;
-   } vs5;
-
-   struct
-   {
-      unsigned vs_enable:1;
-      unsigned vert_cache_disable:1;
-      unsigned pad0:30;
-   } vs6;
-};
-
-
-struct brw_wm_unit_state
-{
-   struct thread0 thread0;
-   struct thread1 thread1;
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct {
-      unsigned stats_enable:1;
-      unsigned depth_buffer_clear:1;
-      unsigned sampler_count:3;
-      unsigned sampler_state_pointer:27;
-   } wm4;
-
-   struct
-   {
-      unsigned enable_8_pix:1;
-      unsigned enable_16_pix:1;
-      unsigned enable_32_pix:1;
-      unsigned enable_con_32_pix:1;
-      unsigned enable_con_64_pix:1;
-      unsigned pad0:1;
-
-      /* These next four bits are for Ironlake+ */
-      unsigned fast_span_coverage_enable:1;
-      unsigned depth_buffer_clear:1;
-      unsigned depth_buffer_resolve_enable:1;
-      unsigned hierarchical_depth_buffer_resolve_enable:1;
-
-      unsigned legacy_global_depth_bias:1;
-      unsigned line_stipple:1;
-      unsigned depth_offset:1;
-      unsigned polygon_stipple:1;
-      unsigned line_aa_region_width:2;
-      unsigned line_endcap_aa_region_width:2;
-      unsigned early_depth_test:1;
-      unsigned thread_dispatch_enable:1;
-      unsigned program_uses_depth:1;
-      unsigned program_computes_depth:1;
-      unsigned program_uses_killpixel:1;
-      unsigned legacy_line_rast: 1;
-      unsigned transposed_urb_read_enable:1;
-      unsigned max_threads:7;
-   } wm5;
-
-   float global_depth_offset_constant;
-   float global_depth_offset_scale;
-
-   /* for Ironlake only */
-   struct {
-      unsigned pad0:1;
-      unsigned grf_reg_count_1:3;
-      unsigned pad1:2;
-      unsigned kernel_start_pointer_1:26;
-   } wm8;
-
-   struct {
-      unsigned pad0:1;
-      unsigned grf_reg_count_2:3;
-      unsigned pad1:2;
-      unsigned kernel_start_pointer_2:26;
-   } wm9;
-
-   struct {
-      unsigned pad0:1;
-      unsigned grf_reg_count_3:3;
-      unsigned pad1:2;
-      unsigned kernel_start_pointer_3:26;
-   } wm10;
-};
-
 struct gen5_sampler_default_color {
    uint8_t ub[4];
    float f[4];
@@ -598,71 +74,4 @@
    uint8_t b[4];
 };
 
-struct brw_clipper_viewport
-{
-   float xmin;
-   float xmax;
-   float ymin;
-   float ymax;
-};
-
-struct brw_cc_viewport
-{
-   float min_depth;
-   float max_depth;
-};
-
-struct brw_sf_viewport
-{
-   struct {
-      float m00;
-      float m11;
-      float m22;
-      float m30;
-      float m31;
-      float m32;
-   } viewport;
-
-   /* scissor coordinates are inclusive */
-   struct {
-      int16_t xmin;
-      int16_t ymin;
-      int16_t xmax;
-      int16_t ymax;
-   } scissor;
-};
-
-struct gen6_sf_viewport {
-   float m00;
-   float m11;
-   float m22;
-   float m30;
-   float m31;
-   float m32;
-
-   unsigned pad0[2];
-};
-
-struct gen7_sf_clip_viewport {
-   struct {
-      float m00;
-      float m11;
-      float m22;
-      float m30;
-      float m31;
-      float m32;
-   } viewport;
-
-   unsigned pad0[2];
-
-   struct {
-      float xmin;
-      float xmax;
-      float ymin;
-      float ymax;
-   } guardband;
-
-   float pad1[4];
-};
-
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c
index b176a21..a2bc1de 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -28,7 +28,7 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-uint32_t
+enum isl_format
 brw_isl_format_for_mesa_format(mesa_format mesa_format)
 {
    /* This table is ordered according to the enum ordering in formats.h.  We do
@@ -36,31 +36,19 @@
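-    * staying in sync, so we initialize to 0 even though
-    * ISL_FORMAT_R32G32B32A32_FLOAT happens to also be 0.
+    * staying in sync, so every entry defaults to ISL_FORMAT_UNSUPPORTED
+    * and only the formats we can translate are listed explicitly.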
     */
-   static const uint32_t table[MESA_FORMAT_COUNT] =
-   {
-      [MESA_FORMAT_A8B8G8R8_UNORM] = 0,
+   static const enum isl_format table[MESA_FORMAT_COUNT] = {
+      [0 ... MESA_FORMAT_COUNT-1] = ISL_FORMAT_UNSUPPORTED,
+
       [MESA_FORMAT_R8G8B8A8_UNORM] = ISL_FORMAT_R8G8B8A8_UNORM,
       [MESA_FORMAT_B8G8R8A8_UNORM] = ISL_FORMAT_B8G8R8A8_UNORM,
-      [MESA_FORMAT_A8R8G8B8_UNORM] = 0,
-      [MESA_FORMAT_X8B8G8R8_UNORM] = 0,
       [MESA_FORMAT_R8G8B8X8_UNORM] = ISL_FORMAT_R8G8B8X8_UNORM,
       [MESA_FORMAT_B8G8R8X8_UNORM] = ISL_FORMAT_B8G8R8X8_UNORM,
-      [MESA_FORMAT_X8R8G8B8_UNORM] = 0,
-      [MESA_FORMAT_BGR_UNORM8] = 0,
       [MESA_FORMAT_RGB_UNORM8] = ISL_FORMAT_R8G8B8_UNORM,
       [MESA_FORMAT_B5G6R5_UNORM] = ISL_FORMAT_B5G6R5_UNORM,
-      [MESA_FORMAT_R5G6B5_UNORM] = 0,
       [MESA_FORMAT_B4G4R4A4_UNORM] = ISL_FORMAT_B4G4R4A4_UNORM,
-      [MESA_FORMAT_A4R4G4B4_UNORM] = 0,
-      [MESA_FORMAT_A1B5G5R5_UNORM] = 0,
       [MESA_FORMAT_B5G5R5A1_UNORM] = ISL_FORMAT_B5G5R5A1_UNORM,
-      [MESA_FORMAT_A1R5G5B5_UNORM] = 0,
-      [MESA_FORMAT_L4A4_UNORM] = 0,
       [MESA_FORMAT_L8A8_UNORM] = ISL_FORMAT_L8A8_UNORM,
-      [MESA_FORMAT_A8L8_UNORM] = 0,
       [MESA_FORMAT_L16A16_UNORM] = ISL_FORMAT_L16A16_UNORM,
-      [MESA_FORMAT_A16L16_UNORM] = 0,
-      [MESA_FORMAT_B2G3R3_UNORM] = 0,
       [MESA_FORMAT_A_UNORM8] = ISL_FORMAT_A8_UNORM,
       [MESA_FORMAT_A_UNORM16] = ISL_FORMAT_A16_UNORM,
       [MESA_FORMAT_L_UNORM8] = ISL_FORMAT_L8_UNORM,
@@ -71,29 +59,16 @@
       [MESA_FORMAT_YCBCR] = ISL_FORMAT_YCRCB_SWAPUVY,
       [MESA_FORMAT_R_UNORM8] = ISL_FORMAT_R8_UNORM,
       [MESA_FORMAT_R8G8_UNORM] = ISL_FORMAT_R8G8_UNORM,
-      [MESA_FORMAT_G8R8_UNORM] = 0,
       [MESA_FORMAT_R_UNORM16] = ISL_FORMAT_R16_UNORM,
       [MESA_FORMAT_R16G16_UNORM] = ISL_FORMAT_R16G16_UNORM,
-      [MESA_FORMAT_G16R16_UNORM] = 0,
       [MESA_FORMAT_B10G10R10A2_UNORM] = ISL_FORMAT_B10G10R10A2_UNORM,
-      [MESA_FORMAT_S8_UINT_Z24_UNORM] = 0,
-      [MESA_FORMAT_Z24_UNORM_S8_UINT] = 0,
-      [MESA_FORMAT_Z_UNORM16] = 0,
-      [MESA_FORMAT_Z24_UNORM_X8_UINT] = 0,
-      [MESA_FORMAT_X8_UINT_Z24_UNORM] = 0,
-      [MESA_FORMAT_Z_UNORM32] = 0,
       [MESA_FORMAT_S_UINT8] = ISL_FORMAT_R8_UINT,
 
-      [MESA_FORMAT_BGR_SRGB8] = 0,
-      [MESA_FORMAT_A8B8G8R8_SRGB] = 0,
       [MESA_FORMAT_B8G8R8A8_SRGB] = ISL_FORMAT_B8G8R8A8_UNORM_SRGB,
-      [MESA_FORMAT_A8R8G8B8_SRGB] = 0,
       [MESA_FORMAT_R8G8B8A8_SRGB] = ISL_FORMAT_R8G8B8A8_UNORM_SRGB,
-      [MESA_FORMAT_X8R8G8B8_SRGB] = 0,
       [MESA_FORMAT_B8G8R8X8_SRGB] = ISL_FORMAT_B8G8R8X8_UNORM_SRGB,
       [MESA_FORMAT_L_SRGB8] = ISL_FORMAT_L8_UNORM_SRGB,
       [MESA_FORMAT_L8A8_SRGB] = ISL_FORMAT_L8A8_UNORM_SRGB,
-      [MESA_FORMAT_A8L8_SRGB] = 0,
       [MESA_FORMAT_SRGB_DXT1] = ISL_FORMAT_BC1_UNORM_SRGB,
       [MESA_FORMAT_SRGBA_DXT1] = ISL_FORMAT_BC1_UNORM_SRGB,
       [MESA_FORMAT_SRGBA_DXT3] = ISL_FORMAT_BC2_UNORM_SRGB,
@@ -109,7 +84,6 @@
       [MESA_FORMAT_RGBA_FLOAT32] = ISL_FORMAT_R32G32B32A32_FLOAT,
       [MESA_FORMAT_RGBA_FLOAT16] = ISL_FORMAT_R16G16B16A16_FLOAT,
       [MESA_FORMAT_RGB_FLOAT32] = ISL_FORMAT_R32G32B32_FLOAT,
-      [MESA_FORMAT_RGB_FLOAT16] = 0,
       [MESA_FORMAT_A_FLOAT32] = ISL_FORMAT_A32_FLOAT,
       [MESA_FORMAT_A_FLOAT16] = ISL_FORMAT_A16_FLOAT,
       [MESA_FORMAT_L_FLOAT32] = ISL_FORMAT_L32_FLOAT,
@@ -123,34 +97,6 @@
       [MESA_FORMAT_RG_FLOAT32] = ISL_FORMAT_R32G32_FLOAT,
       [MESA_FORMAT_RG_FLOAT16] = ISL_FORMAT_R16G16_FLOAT,
 
-      [MESA_FORMAT_A_UINT8] = 0,
-      [MESA_FORMAT_A_UINT16] = 0,
-      [MESA_FORMAT_A_UINT32] = 0,
-      [MESA_FORMAT_A_SINT8] = 0,
-      [MESA_FORMAT_A_SINT16] = 0,
-      [MESA_FORMAT_A_SINT32] = 0,
-
-      [MESA_FORMAT_I_UINT8] = 0,
-      [MESA_FORMAT_I_UINT16] = 0,
-      [MESA_FORMAT_I_UINT32] = 0,
-      [MESA_FORMAT_I_SINT8] = 0,
-      [MESA_FORMAT_I_SINT16] = 0,
-      [MESA_FORMAT_I_SINT32] = 0,
-
-      [MESA_FORMAT_L_UINT8] = 0,
-      [MESA_FORMAT_L_UINT16] = 0,
-      [MESA_FORMAT_L_UINT32] = 0,
-      [MESA_FORMAT_L_SINT8] = 0,
-      [MESA_FORMAT_L_SINT16] = 0,
-      [MESA_FORMAT_L_SINT32] = 0,
-
-      [MESA_FORMAT_LA_UINT8] = 0,
-      [MESA_FORMAT_LA_UINT16] = 0,
-      [MESA_FORMAT_LA_UINT32] = 0,
-      [MESA_FORMAT_LA_SINT8] = 0,
-      [MESA_FORMAT_LA_SINT16] = 0,
-      [MESA_FORMAT_LA_SINT32] = 0,
-
       [MESA_FORMAT_R_SINT8] = ISL_FORMAT_R8_SINT,
       [MESA_FORMAT_RG_SINT8] = ISL_FORMAT_R8G8_SINT,
       [MESA_FORMAT_RGB_SINT8] = ISL_FORMAT_R8G8B8_SINT,
@@ -179,8 +125,6 @@
 
       [MESA_FORMAT_R_SNORM8] = ISL_FORMAT_R8_SNORM,
       [MESA_FORMAT_R8G8_SNORM] = ISL_FORMAT_R8G8_SNORM,
-      [MESA_FORMAT_X8B8G8R8_SNORM] = 0,
-      [MESA_FORMAT_A8B8G8R8_SNORM] = 0,
       [MESA_FORMAT_R8G8B8A8_SNORM] = ISL_FORMAT_R8G8B8A8_SNORM,
       [MESA_FORMAT_R_SNORM16] = ISL_FORMAT_R16_SNORM,
       [MESA_FORMAT_R16G16_SNORM] = ISL_FORMAT_R16G16_SNORM,
@@ -193,11 +137,6 @@
       [MESA_FORMAT_RG_RGTC2_UNORM] = ISL_FORMAT_BC5_UNORM,
       [MESA_FORMAT_RG_RGTC2_SNORM] = ISL_FORMAT_BC5_SNORM,
 
-      [MESA_FORMAT_L_LATC1_UNORM] = 0,
-      [MESA_FORMAT_L_LATC1_SNORM] = 0,
-      [MESA_FORMAT_LA_LATC2_UNORM] = 0,
-      [MESA_FORMAT_LA_LATC2_SNORM] = 0,
-
       [MESA_FORMAT_ETC1_RGB8] = ISL_FORMAT_ETC1_RGB8,
       [MESA_FORMAT_ETC2_RGB8] = ISL_FORMAT_ETC2_RGB8,
       [MESA_FORMAT_ETC2_SRGB8] = ISL_FORMAT_ETC2_SRGB8,
@@ -244,71 +183,45 @@
       [MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10] = ISL_FORMAT_ASTC_LDR_2D_12X10_U8SRGB,
       [MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12] = ISL_FORMAT_ASTC_LDR_2D_12X12_U8SRGB,
 
-      [MESA_FORMAT_A_SNORM8] = 0,
-      [MESA_FORMAT_L_SNORM8] = 0,
-      [MESA_FORMAT_L8A8_SNORM] = 0,
-      [MESA_FORMAT_A8L8_SNORM] = 0,
-      [MESA_FORMAT_I_SNORM8] = 0,
-      [MESA_FORMAT_A_SNORM16] = 0,
-      [MESA_FORMAT_L_SNORM16] = 0,
-      [MESA_FORMAT_LA_SNORM16] = 0,
-      [MESA_FORMAT_I_SNORM16] = 0,
-
       [MESA_FORMAT_R9G9B9E5_FLOAT] = ISL_FORMAT_R9G9B9E5_SHAREDEXP,
       [MESA_FORMAT_R11G11B10_FLOAT] = ISL_FORMAT_R11G11B10_FLOAT,
 
-      [MESA_FORMAT_Z_FLOAT32] = 0,
-      [MESA_FORMAT_Z32_FLOAT_S8X24_UINT] = 0,
-
       [MESA_FORMAT_R10G10B10A2_UNORM] = ISL_FORMAT_R10G10B10A2_UNORM,
       [MESA_FORMAT_B10G10R10A2_UINT] = ISL_FORMAT_B10G10R10A2_UINT,
       [MESA_FORMAT_R10G10B10A2_UINT] = ISL_FORMAT_R10G10B10A2_UINT,
 
-      [MESA_FORMAT_B4G4R4X4_UNORM] = 0,
       [MESA_FORMAT_B5G5R5X1_UNORM] = ISL_FORMAT_B5G5R5X1_UNORM,
-      [MESA_FORMAT_R8G8B8X8_SNORM] = 0,
       [MESA_FORMAT_R8G8B8X8_SRGB] = ISL_FORMAT_R8G8B8X8_UNORM_SRGB,
-      [MESA_FORMAT_X8B8G8R8_SRGB] = 0,
-      [MESA_FORMAT_RGBX_UINT8] = 0,
-      [MESA_FORMAT_RGBX_SINT8] = 0,
       [MESA_FORMAT_B10G10R10X2_UNORM] = ISL_FORMAT_B10G10R10X2_UNORM,
       [MESA_FORMAT_RGBX_UNORM16] = ISL_FORMAT_R16G16B16X16_UNORM,
-      [MESA_FORMAT_RGBX_SNORM16] = 0,
       [MESA_FORMAT_RGBX_FLOAT16] = ISL_FORMAT_R16G16B16X16_FLOAT,
-      [MESA_FORMAT_RGBX_UINT16] = 0,
-      [MESA_FORMAT_RGBX_SINT16] = 0,
       [MESA_FORMAT_RGBX_FLOAT32] = ISL_FORMAT_R32G32B32X32_FLOAT,
-      [MESA_FORMAT_RGBX_UINT32] = 0,
-      [MESA_FORMAT_RGBX_SINT32] = 0,
    };
+
    assert(mesa_format < MESA_FORMAT_COUNT);
    return table[mesa_format];
 }
 
 void
-brw_init_surface_formats(struct brw_context *brw)
+intel_screen_init_surface_formats(struct intel_screen *screen)
 {
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   struct gl_context *ctx = &brw->ctx;
-   int gen;
+   const struct gen_device_info *devinfo = &screen->devinfo;
    mesa_format format;
 
-   memset(&ctx->TextureFormatSupported, 0, sizeof(ctx->TextureFormatSupported));
+   memset(&screen->mesa_format_supports_texture, 0,
+          sizeof(screen->mesa_format_supports_texture));
 
-   gen = brw->gen * 10;
-   if (brw->is_g4x || brw->is_haswell)
+   int gen = devinfo->gen * 10;
+   if (devinfo->is_g4x || devinfo->is_haswell)
       gen += 5;
 
    for (format = MESA_FORMAT_NONE + 1; format < MESA_FORMAT_COUNT; format++) {
-      uint32_t texture, render;
+      enum isl_format texture, render;
       bool is_integer = _mesa_is_format_integer_color(format);
 
       render = texture = brw_isl_format_for_mesa_format(format);
 
-      /* The value of ISL_FORMAT_R32G32B32A32_FLOAT is 0, so don't skip
-       * it.
-       */
-      if (texture == 0 && format != MESA_FORMAT_RGBA_FLOAT32)
+      if (texture == ISL_FORMAT_UNSUPPORTED)
 	 continue;
 
       /* Don't advertise 8 and 16-bit RGB formats to core mesa.  This ensures
@@ -323,7 +236,7 @@
 
       if (isl_format_supports_sampling(devinfo, texture) &&
           (isl_format_supports_filtering(devinfo, texture) || is_integer))
-	 ctx->TextureFormatSupported[format] = true;
+	 screen->mesa_format_supports_texture[format] = true;
 
       /* Re-map some render target formats to make them supported when they
        * wouldn't be using their format for texturing.
@@ -376,6 +289,8 @@
       case ISL_FORMAT_R8G8B8X8_UNORM_SRGB:
          render = ISL_FORMAT_R8G8B8A8_UNORM_SRGB;
          break;
+      default:
+         break;
       }
 
       /* Note that GL_EXT_texture_integer says that blending doesn't occur for
@@ -385,30 +300,30 @@
        */
       if (isl_format_supports_rendering(devinfo, render) &&
           (isl_format_supports_alpha_blending(devinfo, render) || is_integer)) {
-	 brw->render_target_format[format] = render;
-	 brw->format_supported_as_render_target[format] = true;
+	 screen->mesa_to_isl_render_format[format] = render;
+	 screen->mesa_format_supports_render[format] = true;
       }
    }
 
    /* We will check this table for FBO completeness, but the surface format
     * table above only covered color rendering.
     */
-   brw->format_supported_as_render_target[MESA_FORMAT_Z24_UNORM_S8_UINT] = true;
-   brw->format_supported_as_render_target[MESA_FORMAT_Z24_UNORM_X8_UINT] = true;
-   brw->format_supported_as_render_target[MESA_FORMAT_S_UINT8] = true;
-   brw->format_supported_as_render_target[MESA_FORMAT_Z_FLOAT32] = true;
-   brw->format_supported_as_render_target[MESA_FORMAT_Z32_FLOAT_S8X24_UINT] = true;
-   if (brw->gen >= 8)
-      brw->format_supported_as_render_target[MESA_FORMAT_Z_UNORM16] = true;
+   screen->mesa_format_supports_render[MESA_FORMAT_Z24_UNORM_S8_UINT] = true;
+   screen->mesa_format_supports_render[MESA_FORMAT_Z24_UNORM_X8_UINT] = true;
+   screen->mesa_format_supports_render[MESA_FORMAT_S_UINT8] = true;
+   screen->mesa_format_supports_render[MESA_FORMAT_Z_FLOAT32] = true;
+   screen->mesa_format_supports_render[MESA_FORMAT_Z32_FLOAT_S8X24_UINT] = true;
+   if (gen >= 80)
+      screen->mesa_format_supports_render[MESA_FORMAT_Z_UNORM16] = true;
 
    /* We remap depth formats to a supported texturing format in
     * translate_tex_format().
     */
-   ctx->TextureFormatSupported[MESA_FORMAT_Z24_UNORM_S8_UINT] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_Z24_UNORM_X8_UINT] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_Z_FLOAT32] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_Z32_FLOAT_S8X24_UINT] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_S_UINT8] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_Z24_UNORM_S8_UINT] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_Z24_UNORM_X8_UINT] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_Z_FLOAT32] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_Z32_FLOAT_S8X24_UINT] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_S_UINT8] = true;
 
    /* Benchmarking shows that Z16 is slower than Z24, so there's no reason to
     * use it unless you're under memory (not memory bandwidth) pressure.
@@ -424,8 +339,8 @@
     * With the PMA stall workaround in place, Z16 is faster than Z24, as it
     * should be.
     */
-   if (brw->gen >= 8)
-      ctx->TextureFormatSupported[MESA_FORMAT_Z_UNORM16] = true;
+   if (gen >= 80)
+      screen->mesa_format_supports_texture[MESA_FORMAT_Z_UNORM16] = true;
 
    /* The RGBX formats are not renderable. Normally these get mapped
     * internally to RGBA formats when rendering. However on Gen9+ when this
@@ -440,7 +355,7 @@
     * doesn't implement this swizzle override. We don't need to do this for
     * BGRX because that actually is supported natively on Gen8+.
     */
-   if (brw->gen >= 9) {
+   if (gen >= 90) {
       static const mesa_format rgbx_formats[] = {
          MESA_FORMAT_R8G8B8X8_UNORM,
          MESA_FORMAT_R8G8B8X8_SRGB,
@@ -450,30 +365,47 @@
       };
 
       for (int i = 0; i < ARRAY_SIZE(rgbx_formats); i++) {
-         ctx->TextureFormatSupported[rgbx_formats[i]] = false;
-         brw->format_supported_as_render_target[rgbx_formats[i]] = false;
+         screen->mesa_format_supports_texture[rgbx_formats[i]] = false;
+         screen->mesa_format_supports_render[rgbx_formats[i]] = false;
       }
    }
 
    /* On hardware that lacks support for ETC1, we map ETC1 to RGBX
     * during glCompressedTexImage2D(). See intel_mipmap_tree::wraps_etc1.
     */
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC1_RGB8] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC1_RGB8] = true;
 
    /* On hardware that lacks support for ETC2, we map ETC2 to a suitable
     * MESA_FORMAT during glCompressedTexImage2D().
     * See intel_mipmap_tree::wraps_etc2.
     */
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_RGB8] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_SRGB8] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_RGBA8_EAC] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_R11_EAC] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_RG11_EAC] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_SIGNED_R11_EAC] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_SIGNED_RG11_EAC] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1] = true;
-   ctx->TextureFormatSupported[MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_RGB8] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_SRGB8] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_RGBA8_EAC] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_R11_EAC] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_RG11_EAC] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_SIGNED_R11_EAC] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_SIGNED_RG11_EAC] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1] = true;
+   screen->mesa_format_supports_texture[MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1] = true;
+}
+
+void
+brw_init_surface_formats(struct brw_context *brw)
+{
+   struct intel_screen *screen = brw->screen;
+   struct gl_context *ctx = &brw->ctx;
+
+   brw->mesa_format_supports_render = screen->mesa_format_supports_render;
+   brw->mesa_to_isl_render_format = screen->mesa_to_isl_render_format;
+
+   STATIC_ASSERT(ARRAY_SIZE(ctx->TextureFormatSupported) ==
+                 ARRAY_SIZE(screen->mesa_format_supports_texture));
+
+   for (unsigned i = 0; i < ARRAY_SIZE(ctx->TextureFormatSupported); ++i) {
+      ctx->TextureFormatSupported[i] = screen->mesa_format_supports_texture[i];
+   }
 }
 
 bool
@@ -508,10 +440,10 @@
          return false;
    }
 
-   return brw->format_supported_as_render_target[format];
+   return brw->mesa_format_supports_render[format];
 }
 
-GLuint
+enum isl_format
 translate_tex_format(struct brw_context *brw,
                      mesa_format mesa_format,
 		     GLenum srgb_decode)
@@ -555,7 +487,8 @@
    case MESA_FORMAT_RGBA_ASTC_10x10:
    case MESA_FORMAT_RGBA_ASTC_12x10:
    case MESA_FORMAT_RGBA_ASTC_12x12: {
-      GLuint brw_fmt = brw_isl_format_for_mesa_format(mesa_format);
+      enum isl_format isl_fmt =
+         brw_isl_format_for_mesa_format(mesa_format);
 
       /**
        * It is possible to process these formats using the LDR Profile
@@ -566,13 +499,12 @@
        * processing sRGBs, which are incompatible with this mode.
        */
       if (ctx->Extensions.KHR_texture_compression_astc_hdr)
-         brw_fmt |= GEN9_SURFACE_ASTC_HDR_FORMAT_BIT;
+         isl_fmt |= GEN9_SURFACE_ASTC_HDR_FORMAT_BIT;
 
-      return brw_fmt;
+      return isl_fmt;
    }
 
    default:
-      assert(brw_isl_format_for_mesa_format(mesa_format) != 0);
       return brw_isl_format_for_mesa_format(mesa_format);
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_sync.c b/src/mesa/drivers/dri/i965/brw_sync.c
index a8356c3..edfb188 100644
--- a/src/mesa/drivers/dri/i965/brw_sync.c
+++ b/src/mesa/drivers/dri/i965/brw_sync.c
@@ -110,6 +110,35 @@
 static bool MUST_CHECK
 brw_fence_insert_locked(struct brw_context *brw, struct brw_fence *fence)
 {
+   __DRIcontext *driContext = brw->driContext;
+   __DRIdrawable *driDrawable = driContext->driDrawablePriv;
+
+   /*
+    * From KHR_fence_sync:
+    *
+    *   When the condition of the sync object is satisfied by the fence
+    *   command, the sync is signaled by the associated client API context,
+    *   causing any eglClientWaitSyncKHR commands (see below) blocking on
+    *   <sync> to unblock. The only condition currently supported is
+    *   EGL_SYNC_PRIOR_COMMANDS_COMPLETE_KHR, which is satisfied by
+    *   completion of the fence command corresponding to the sync object,
+    *   and all preceding commands in the associated client API context's
+    *   command stream. The sync object will not be signaled until all
+    *   effects from these commands on the client API's internal and
+    *   framebuffer state are fully realized. No other state is affected by
+    *   execution of the fence command.
+    *
+    * Note the emphasis there on ensuring that the framebuffer is fully
+    * realised before the fence is signaled. We cannot just flush the batch,
+    * but must also resolve the drawable first. The importance of this is,
+    * for example, in creating a fence for a frame to be passed to a
+    * remote compositor. Without us flushing the drawable explicitly, the
+    * resolve will be in a following batch (when the client finally calls
+    * SwapBuffers, or triggers a resolve via some other path) and so the
+    * compositor may read the incomplete framebuffer instead.
+    */
+   if (driDrawable)
+      intel_resolve_for_dri2_flush(brw, driDrawable);
    brw_emit_mi_flush(brw);
 
    switch (fence->type) {
@@ -335,6 +364,9 @@
    struct brw_context *brw = brw_context(ctx);
    struct brw_gl_sync *sync = (struct brw_gl_sync *) _sync;
 
+   /* brw_fence_insert_locked() assumes it must do a complete flush */
+   assert(condition == GL_SYNC_GPU_COMMANDS_COMPLETE);
+
    brw_fence_init(brw, &sync->fence, BRW_FENCE_TYPE_BO_WAIT);
 
    if (!brw_fence_insert_locked(brw, &sync->fence)) {
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 3cc6cdb..1ed622e 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -50,11 +50,11 @@
    nir_ssa_def *invoc_id =
       nir_load_system_value(&b, nir_intrinsic_load_invocation_id, 0);
 
-   nir->info->inputs_read = key->outputs_written &
+   nir->info.inputs_read = key->outputs_written &
       ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
-   nir->info->outputs_written = key->outputs_written;
-   nir->info->tess.tcs_vertices_out = key->input_vertices;
-   nir->info->name = ralloc_strdup(nir, "passthrough");
+   nir->info.outputs_written = key->outputs_written;
+   nir->info.tess.tcs_vertices_out = key->input_vertices;
+   nir->info.name = ralloc_strdup(nir, "passthrough");
    nir->num_uniforms = 8 * sizeof(uint32_t);
 
    var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_0");
@@ -81,7 +81,7 @@
    }
 
    /* Copy inputs to outputs. */
-   uint64_t varyings = nir->info->inputs_read;
+   uint64_t varyings = nir->info.inputs_read;
 
    while (varyings != 0) {
       const int varying = ffsll(varyings) - 1;
@@ -205,6 +205,8 @@
 
       brw_nir_setup_glsl_uniforms(nir, &tcp->program, &prog_data.base.base,
                                   compiler->scalar_stage[MESA_SHADER_TESS_CTRL]);
+      brw_nir_analyze_ubo_ranges(compiler, tcp->program.nir,
+                                 prog_data.base.base.ubo_ranges);
    } else {
       /* Upload the Patch URB Header as the first two uniforms.
        * Do the annoying scrambling so the shader doesn't have to.
@@ -394,8 +396,8 @@
       key.tes_primitive_mode = GL_TRIANGLES;
    }
 
-   key.outputs_written = prog->nir->info->outputs_written;
-   key.patch_outputs_written = prog->nir->info->patch_outputs_written;
+   key.outputs_written = prog->nir->info.outputs_written;
+   key.patch_outputs_written = prog->nir->info.patch_outputs_written;
 
    success = brw_codegen_tcs_prog(brw, btcp, btep, &key);
 
diff --git a/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c b/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c
index ede7ae9..03c8c88 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c
@@ -133,6 +133,7 @@
    .dirty = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
+             BRW_NEW_FAST_CLEAR_COLOR |
              BRW_NEW_TCS_PROG_DATA |
              BRW_NEW_IMAGE_UNITS |
              BRW_NEW_TESS_PROGRAMS,
diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c
index 449f946..20ce1f4 100644
--- a/src/mesa/drivers/dri/i965/brw_tes.c
+++ b/src/mesa/drivers/dri/i965/brw_tes.c
@@ -102,6 +102,8 @@
 
    brw_nir_setup_glsl_uniforms(nir, &tep->program, &prog_data.base.base,
                                compiler->scalar_stage[MESA_SHADER_TESS_EVAL]);
+   brw_nir_analyze_ubo_ranges(compiler, tep->program.nir,
+                              prog_data.base.base.ubo_ranges);
 
    int st_index = -1;
    if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
@@ -234,15 +236,15 @@
    memset(&key, 0, sizeof(key));
 
    key.program_string_id = btep->id;
-   key.inputs_read = prog->nir->info->inputs_read;
-   key.patch_inputs_read = prog->nir->info->patch_inputs_read;
+   key.inputs_read = prog->nir->info.inputs_read;
+   key.patch_inputs_read = prog->nir->info.patch_inputs_read;
 
    if (shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]) {
       struct gl_program *tcp =
          shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program;
-      key.inputs_read |= tcp->nir->info->outputs_written &
+      key.inputs_read |= tcp->nir->info.outputs_written &
          ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
-      key.patch_inputs_read |= tcp->nir->info->patch_outputs_written;
+      key.patch_inputs_read |= tcp->nir->info.patch_outputs_written;
    }
 
    brw_setup_tex_for_precompile(brw, &key.tex, prog);
diff --git a/src/mesa/drivers/dri/i965/brw_tes_surface_state.c b/src/mesa/drivers/dri/i965/brw_tes_surface_state.c
index 1982955..2750487 100644
--- a/src/mesa/drivers/dri/i965/brw_tes_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_tes_surface_state.c
@@ -133,6 +133,7 @@
    .dirty = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
+             BRW_NEW_FAST_CLEAR_COLOR |
              BRW_NEW_IMAGE_UNITS |
              BRW_NEW_TESS_PROGRAMS |
              BRW_NEW_TES_PROG_DATA,
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
deleted file mode 100644
index 1f0a1e9..0000000
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ /dev/null
@@ -1,734 +0,0 @@
-/*
- * Copyright 2006 VMware, Inc.
- * Copyright © 2006 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file brw_tex_layout.cpp
- *
- * Code to lay out images in a mipmap tree.
- *
- * \author Keith Whitwell <keithw@vmware.com>
- * \author Michel Dänzer <daenzer@vmware.com>
- */
-
-#include "intel_mipmap_tree.h"
-#include "brw_context.h"
-#include "main/macros.h"
-#include "main/glformats.h"
-
-#define FILE_DEBUG_FLAG DEBUG_MIPTREE
-
-static unsigned int
-intel_horizontal_texture_alignment_unit(struct brw_context *brw,
-                                        struct intel_mipmap_tree *mt,
-                                        uint32_t layout_flags)
-{
-   if (layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16)
-      return 16;
-
-   /**
-    * +----------------------------------------------------------------------+
-    * |                                        | alignment unit width  ("i") |
-    * | Surface Property                       |-----------------------------|
-    * |                                        | 915 | 965 | ILK | SNB | IVB |
-    * +----------------------------------------------------------------------+
-    * | YUV 4:2:2 format                       |  8  |  4  |  4  |  4  |  4  |
-    * | BC1-5 compressed format (DXTn/S3TC)    |  4  |  4  |  4  |  4  |  4  |
-    * | FXT1  compressed format                |  8  |  8  |  8  |  8  |  8  |
-    * | Depth Buffer (16-bit)                  |  4  |  4  |  4  |  4  |  8  |
-    * | Depth Buffer (other)                   |  4  |  4  |  4  |  4  |  4  |
-    * | Separate Stencil Buffer                | N/A | N/A |  8  |  8  |  8  |
-    * | All Others                             |  4  |  4  |  4  |  4  |  4  |
-    * +----------------------------------------------------------------------+
-    *
-    * On IVB+, non-special cases can be overridden by setting the SURFACE_STATE
-    * "Surface Horizontal Alignment" field to HALIGN_4 or HALIGN_8.
-    */
-
-   if (brw->gen >= 7 && mt->format == MESA_FORMAT_Z_UNORM16)
-      return 8;
-
-   return 4;
-}
-
-static unsigned int
-intel_vertical_texture_alignment_unit(struct brw_context *brw,
-                                      const struct intel_mipmap_tree *mt)
-{
-   /**
-    * +----------------------------------------------------------------------+
-    * |                                        | alignment unit height ("j") |
-    * | Surface Property                       |-----------------------------|
-    * |                                        | 915 | 965 | ILK | SNB | IVB |
-    * +----------------------------------------------------------------------+
-    * | BC1-5 compressed format (DXTn/S3TC)    |  4  |  4  |  4  |  4  |  4  |
-    * | FXT1  compressed format                |  4  |  4  |  4  |  4  |  4  |
-    * | Depth Buffer                           |  2  |  2  |  2  |  4  |  4  |
-    * | Separate Stencil Buffer                | N/A | N/A | N/A |  4  |  8  |
-    * | Multisampled (4x or 8x) render target  | N/A | N/A | N/A |  4  |  4  |
-    * | All Others                             |  2  |  2  |  2  |  *  |  *  |
-    * +----------------------------------------------------------------------+
-    *
-    * Where "*" means either VALIGN_2 or VALIGN_4 depending on the setting of
-    * the SURFACE_STATE "Surface Vertical Alignment" field.
-    */
-
-   /* Broadwell only supports VALIGN of 4, 8, and 16.  The BSpec says 4
-    * should always be used, except for stencil buffers, which should be 8.
-    */
-   if (brw->gen >= 8)
-      return 4;
-
-   if (mt->num_samples > 1)
-      return 4;
-
-   GLenum base_format = _mesa_get_format_base_format(mt->format);
-
-   if (brw->gen >= 6 &&
-       (base_format == GL_DEPTH_COMPONENT ||
-	base_format == GL_DEPTH_STENCIL)) {
-      return 4;
-   }
-
-   if (brw->gen == 7) {
-      /* On Gen7, we prefer a vertical alignment of 4 when possible, because
-       * that allows Y tiled render targets.
-       *
-       * From the Ivy Bridge PRM, Vol4 Part1 2.12.2.1 (SURFACE_STATE for most
-       * messages), on p64, under the heading "Surface Vertical Alignment":
-       *
-       *     Value of 1 [VALIGN_4] is not supported for format YCRCB_NORMAL
-       *     (0x182), YCRCB_SWAPUVY (0x183), YCRCB_SWAPUV (0x18f), YCRCB_SWAPY
-       *     (0x190)
-       *
-       *     VALIGN_4 is not supported for surface format R32G32B32_FLOAT.
-       */
-      if (base_format == GL_YCBCR_MESA || mt->format == MESA_FORMAT_RGB_FLOAT32)
-         return 2;
-
-      return 4;
-   }
-
-   return 2;
-}
-
-static void
-gen9_miptree_layout_1d(struct intel_mipmap_tree *mt)
-{
-   unsigned x = 0;
-   unsigned width = mt->physical_width0;
-   unsigned depth = mt->physical_depth0; /* number of array layers. */
-
-   /* When this layout is used the horizontal alignment is fixed at 64 and the
-    * hardware ignores the value given in the surface state
-    */
-   const unsigned int halign = 64;
-
-   mt->total_height = mt->physical_height0;
-   mt->total_width = 0;
-
-   for (unsigned level = mt->first_level; level <= mt->last_level; level++) {
-      unsigned img_width;
-
-      intel_miptree_set_level_info(mt, level, x, 0, depth);
-
-      img_width = ALIGN(width, halign);
-
-      mt->total_width = MAX2(mt->total_width, x + img_width);
-
-      x += img_width;
-
-      width = minify(width, 1);
-   }
-}
-
-static void
-brw_miptree_layout_2d(struct intel_mipmap_tree *mt)
-{
-   unsigned x = 0;
-   unsigned y = 0;
-   unsigned width = mt->physical_width0;
-   unsigned height = mt->physical_height0;
-   /* Number of layers of array texture or slices of 3d texture (gen9+). */
-   unsigned depth = mt->physical_depth0;
-   unsigned int bw, bh;
-
-   _mesa_get_format_block_size(mt->format, &bw, &bh);
-
-   mt->total_width = mt->physical_width0;
-   mt->total_width = ALIGN_NPOT(mt->total_width, bw);
-
-   /* May need to adjust width to accommodate the placement of
-    * the 2nd mipmap.  This occurs when the alignment
-    * constraints of mipmap placement push the right edge of the
-    * 2nd mipmap out past the width of its parent.
-    */
-   if (mt->first_level != mt->last_level) {
-       unsigned mip1_width;
-
-      mip1_width = ALIGN_NPOT(minify(mt->physical_width0, 1), mt->halign) +
-                   ALIGN_NPOT(minify(mt->physical_width0, 2), bw);
-
-      if (mip1_width > mt->total_width)
-         mt->total_width = mip1_width;
-   }
-
-   mt->total_width /= bw;
-   mt->total_height = 0;
-
-   for (unsigned level = mt->first_level; level <= mt->last_level; level++) {
-      unsigned img_height;
-
-      intel_miptree_set_level_info(mt, level, x, y, depth);
-
-      img_height = ALIGN_NPOT(height, mt->valign);
-      img_height /= bh;
-
-      if (mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
-         /* Compact arrays with separated miplevels */
-         img_height *= depth;
-      }
-
-      /* Because the images are packed better, the final offset
-       * might not be the maximal one:
-       */
-      mt->total_height = MAX2(mt->total_height, y + img_height);
-
-      /* Layout_below: step right after second mipmap.
-       *
-       * For Sandy Bridge HiZ and stencil, we always step down.
-       */
-      if (level == mt->first_level + 1) {
-	 x += ALIGN_NPOT(width, mt->halign) / bw;
-      } else {
-	 y += img_height;
-      }
-
-      width  = minify(width, 1);
-      height = minify(height, 1);
-
-      if (mt->target == GL_TEXTURE_3D)
-         depth = minify(depth, 1);
-   }
-}
-
-static void
-brw_miptree_layout_gen6_hiz_stencil(struct intel_mipmap_tree *mt)
-{
-   unsigned x = 0;
-   unsigned y = 0;
-   unsigned width = mt->physical_width0;
-   unsigned height = mt->physical_height0;
-   /* Number of layers of array texture. */
-   unsigned depth = mt->physical_depth0;
-   unsigned tile_width, tile_height, bw, bh;
-
-   if (mt->format == MESA_FORMAT_S_UINT8) {
-      bw = bh = 1;
-      /* W-tiled */
-      tile_width = 64;
-      tile_height = 64;
-   } else {
-      assert(_mesa_get_format_base_format(mt->format) == GL_DEPTH_COMPONENT ||
-             _mesa_get_format_base_format(mt->format) == GL_DEPTH_STENCIL);
-      /* Each 128-bit HiZ block corresponds to a region of of 8x4 depth
-       * samples.  Each cache line in the Y-Tiled HiZ image contains 2x2 HiZ
-       * blocks.  Therefore, each Y-tiled cache line corresponds to an 16x8
-       * region in the depth surface.  Since we're representing it as
-       * RGBA_FLOAT32, the miptree calculations will think that each cache
-       * line is 1x4 pixels.  Therefore, we need a scale-down factor of 16x2
-       * and a vertical alignment of 2.
-       */
-      mt->cpp = 16;
-      bw = 16;
-      bh = 2;
-      /* Y-tiled */
-      tile_width = 128 / mt->cpp;
-      tile_height = 32;
-   }
-
-   mt->total_width = 0;
-   mt->total_height = 0;
-
-   for (unsigned level = mt->first_level; level <= mt->last_level; level++) {
-      intel_miptree_set_level_info(mt, level, x, y, depth);
-
-      const unsigned img_width = ALIGN(DIV_ROUND_UP(width, bw), mt->halign);
-      const unsigned img_height =
-         ALIGN(DIV_ROUND_UP(height, bh), mt->valign) * depth;
-
-      mt->total_width = MAX2(mt->total_width, x + img_width);
-      mt->total_height = MAX2(mt->total_height, y + img_height);
-
-      if (level == mt->first_level) {
-         y += ALIGN(img_height, tile_height);
-      } else {
-         x += ALIGN(img_width, tile_width);
-      }
-
-      /* We only minify the width.  We want qpitch to match for all miplevels
-       * because the hardware doesn't know we aren't on LOD0.
-       */
-      width = minify(width, 1);
-   }
-}
-
-unsigned
-brw_miptree_get_horizontal_slice_pitch(const struct brw_context *brw,
-                                       const struct intel_mipmap_tree *mt,
-                                       unsigned level)
-{
-   if ((brw->gen < 9 && mt->target == GL_TEXTURE_3D) ||
-       (brw->gen == 4 && mt->target == GL_TEXTURE_CUBE_MAP)) {
-      return ALIGN_NPOT(minify(mt->physical_width0, level), mt->halign);
-   } else {
-      return 0;
-   }
-}
-
-unsigned
-brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw,
-                                     const struct intel_mipmap_tree *mt,
-                                     unsigned level)
-{
-   assert(mt->array_layout != GEN6_HIZ_STENCIL || brw->gen == 6);
-
-   if (brw->gen >= 9) {
-      /* ALL_SLICES_AT_EACH_LOD isn't supported on Gen8+ but this code will
-       * effectively end up with a packed qpitch anyway whenever
-       * mt->first_level == mt->last_level.
-       */
-      assert(mt->array_layout != ALL_SLICES_AT_EACH_LOD);
-
-      /* On Gen9 we can pick whatever qpitch we like as long as it's aligned
-       * to the vertical alignment so we don't need to add any extra rows.
-       */
-      unsigned qpitch = mt->total_height;
-
-      /* If the surface might be used as a stencil buffer or HiZ buffer then
-       * it needs to be a multiple of 8.
-       */
-      const GLenum base_format = _mesa_get_format_base_format(mt->format);
-      if (_mesa_is_depth_or_stencil_format(base_format))
-         qpitch = ALIGN(qpitch, 8);
-
-      /* 3D textures need to be aligned to the tile height. At this point we
-       * don't know which tiling will be used so let's just align it to 32
-       */
-      if (mt->target == GL_TEXTURE_3D)
-         qpitch = ALIGN(qpitch, 32);
-
-      return qpitch;
-
-   } else if (mt->target == GL_TEXTURE_3D ||
-              (brw->gen == 4 && mt->target == GL_TEXTURE_CUBE_MAP) ||
-              mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
-      return ALIGN_NPOT(minify(mt->physical_height0, level), mt->valign);
-
-   } else if (mt->array_layout == GEN6_HIZ_STENCIL) {
-      /* For HiZ and stencil on Sandy Bridge, we don't minify the height. */
-      if (mt->format == MESA_FORMAT_S_UINT8) {
-         return ALIGN(mt->physical_height0, mt->valign);
-      } else {
-         /* HiZ has a vertical scale factor of 2. */
-         return ALIGN(DIV_ROUND_UP(mt->physical_height0, 2), mt->valign);
-      }
-
-   } else {
-      const unsigned h0 = ALIGN_NPOT(mt->physical_height0, mt->valign);
-      const unsigned h1 = ALIGN_NPOT(minify(mt->physical_height0, 1), mt->valign);
-
-      return h0 + h1 + (brw->gen >= 7 ? 12 : 11) * mt->valign;
-   }
-}
-
-static void
-align_cube(struct intel_mipmap_tree *mt)
-{
-   /* The 965's sampler lays cachelines out according to how accesses
-    * in the texture surfaces run, so they may be "vertical" through
-    * memory.  As a result, the docs say in Surface Padding Requirements:
-    * Sampling Engine Surfaces that two extra rows of padding are required.
-    */
-   if (mt->target == GL_TEXTURE_CUBE_MAP)
-      mt->total_height += 2;
-}
-
-bool
-gen9_use_linear_1d_layout(const struct brw_context *brw,
-                          const struct intel_mipmap_tree *mt)
-{
-   /* On Gen9+ the mipmap levels of a 1D surface are all laid out in a
-    * horizontal line. This isn't done for depth/stencil buffers however
-    * because those will be using a tiled layout
-    */
-   if (brw->gen >= 9 &&
-       (mt->target == GL_TEXTURE_1D ||
-        mt->target == GL_TEXTURE_1D_ARRAY)) {
-      GLenum base_format = _mesa_get_format_base_format(mt->format);
-
-      if (base_format != GL_DEPTH_COMPONENT &&
-          base_format != GL_DEPTH_STENCIL &&
-          base_format != GL_STENCIL_INDEX)
-         return true;
-   }
-
-   return false;
-}
-
-static void
-brw_miptree_layout_texture_array(struct brw_context *brw,
-				 struct intel_mipmap_tree *mt)
-{
-   unsigned height = mt->physical_height0;
-   bool layout_1d = gen9_use_linear_1d_layout(brw, mt);
-   int physical_qpitch;
-
-   if (layout_1d)
-      gen9_miptree_layout_1d(mt);
-   else if (mt->array_layout == GEN6_HIZ_STENCIL)
-      brw_miptree_layout_gen6_hiz_stencil(mt);
-   else
-      brw_miptree_layout_2d(mt);
-
-   if (layout_1d) {
-      physical_qpitch = 1;
-      /* When using the horizontal layout the qpitch specifies the distance in
-       * pixels between array slices. The total_width is forced to be a
-       * multiple of the horizontal alignment in brw_miptree_layout_1d (in
-       * this case it's always 64). The vertical alignment is ignored.
-       */
-      mt->qpitch = mt->total_width;
-   } else {
-      mt->qpitch = brw_miptree_get_vertical_slice_pitch(brw, mt, 0);
-      /* Unlike previous generations the qpitch is a multiple of the
-       * compressed block size on Gen9 so physical_qpitch matches mt->qpitch.
-       */
-      physical_qpitch = (mt->compressed && brw->gen < 9 ? mt->qpitch / 4 :
-                         mt->qpitch);
-   }
-
-   for (unsigned level = mt->first_level; level <= mt->last_level; level++) {
-      unsigned img_height;
-      img_height = ALIGN_NPOT(height, mt->valign);
-      if (mt->compressed)
-         img_height /= mt->valign;
-
-      for (unsigned q = 0; q < mt->level[level].depth; q++) {
-         if (mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
-            intel_miptree_set_image_offset(mt, level, q, 0, q * img_height);
-         } else {
-            intel_miptree_set_image_offset(mt, level, q, 0, q * physical_qpitch);
-         }
-      }
-      height = minify(height, 1);
-   }
-   if (mt->array_layout == ALL_LOD_IN_EACH_SLICE)
-      mt->total_height = physical_qpitch * mt->physical_depth0;
-
-   align_cube(mt);
-}
-
-static void
-brw_miptree_layout_texture_3d(struct brw_context *brw,
-                              struct intel_mipmap_tree *mt)
-{
-   mt->total_width = 0;
-   mt->total_height = 0;
-
-   unsigned ysum = 0;
-   unsigned bh, bw;
-
-   _mesa_get_format_block_size(mt->format, &bw, &bh);
-
-   for (unsigned level = mt->first_level; level <= mt->last_level; level++) {
-      unsigned WL = MAX2(mt->physical_width0 >> level, 1);
-      unsigned HL = MAX2(mt->physical_height0 >> level, 1);
-      unsigned DL = MAX2(mt->physical_depth0 >> level, 1);
-      unsigned wL = ALIGN_NPOT(WL, mt->halign);
-      unsigned hL = ALIGN_NPOT(HL, mt->valign);
-
-      if (mt->target == GL_TEXTURE_CUBE_MAP)
-         DL = 6;
-
-      intel_miptree_set_level_info(mt, level, 0, 0, DL);
-
-      for (unsigned q = 0; q < DL; q++) {
-         unsigned x = (q % (1 << level)) * wL;
-         unsigned y = ysum + (q >> level) * hL;
-
-         intel_miptree_set_image_offset(mt, level, q, x / bw, y / bh);
-         mt->total_width = MAX2(mt->total_width, (x + wL) / bw);
-         mt->total_height = MAX2(mt->total_height, (y + hL) / bh);
-      }
-
-      ysum += ALIGN(DL, 1 << level) / (1 << level) * hL;
-   }
-
-   align_cube(mt);
-}
-
-/**
- * \brief Helper function for intel_miptree_create().
- */
-static uint32_t
-brw_miptree_choose_tiling(struct brw_context *brw,
-                          const struct intel_mipmap_tree *mt,
-                          uint32_t layout_flags)
-{
-   if (mt->format == MESA_FORMAT_S_UINT8) {
-      /* The stencil buffer is W tiled. However, we request from the kernel a
-       * non-tiled buffer because the GTT is incapable of W fencing.
-       */
-      return I915_TILING_NONE;
-   }
-
-   /* Do not support changing the tiling for miptrees with pre-allocated BOs. */
-   assert((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0);
-
-   /* Some usages may want only one type of tiling, like depth miptrees (Y
-    * tiled), or temporary BOs for uploading data once (linear).
-    */
-   switch (layout_flags & MIPTREE_LAYOUT_TILING_ANY) {
-   case MIPTREE_LAYOUT_TILING_ANY:
-      break;
-   case MIPTREE_LAYOUT_TILING_Y:
-      return I915_TILING_Y;
-   case MIPTREE_LAYOUT_TILING_NONE:
-      return I915_TILING_NONE;
-   }
-
-   if (mt->num_samples > 1) {
-      /* From p82 of the Sandy Bridge PRM, dw3[1] of SURFACE_STATE ("Tiled
-       * Surface"):
-       *
-       *   [DevSNB+]: For multi-sample render targets, this field must be
-       *   1. MSRTs can only be tiled.
-       *
-       * Our usual reason for preferring X tiling (fast blits using the
-       * blitting engine) doesn't apply to MSAA, since we'll generally be
-       * downsampling or upsampling when blitting between the MSAA buffer
-       * and another buffer, and the blitting engine doesn't support that.
-       * So use Y tiling, since it makes better use of the cache.
-       */
-      return I915_TILING_Y;
-   }
-
-   GLenum base_format = _mesa_get_format_base_format(mt->format);
-   if (base_format == GL_DEPTH_COMPONENT ||
-       base_format == GL_DEPTH_STENCIL_EXT)
-      return I915_TILING_Y;
-
-   /* 1D textures (and 1D array textures) don't get any benefit from tiling,
-    * in fact it leads to a less efficient use of memory space and bandwidth
-    * due to tile alignment.
-    */
-   if (mt->logical_height0 == 1)
-      return I915_TILING_NONE;
-
-   int minimum_pitch = mt->total_width * mt->cpp;
-
-   /* If the width is much smaller than a tile, don't bother tiling. */
-   if (minimum_pitch < 64)
-      return I915_TILING_NONE;
-
-   if (ALIGN(minimum_pitch, 512) >= 32768) {
-      perf_debug("%dx%d miptree too large to blit, falling back to untiled",
-                 mt->total_width, mt->total_height);
-      return I915_TILING_NONE;
-   }
-
-   /* Pre-gen6 doesn't have BLORP to handle Y-tiling, so use X-tiling. */
-   if (brw->gen < 6)
-      return I915_TILING_X;
-
-   /* From the Sandybridge PRM, Volume 1, Part 2, page 32:
-    * "NOTE: 128BPE Format Color Buffer ( render target ) MUST be either TileX
-    *  or Linear."
-    * 128 bits per pixel translates to 16 bytes per pixel. This is necessary
-    * all the way back to 965, but is permitted on Gen7+.
-    */
-   if (brw->gen < 7 && mt->cpp >= 16)
-      return I915_TILING_X;
-
-   /* From the Ivy Bridge PRM, Vol4 Part1 2.12.2.1 (SURFACE_STATE for most
-    * messages), on p64, under the heading "Surface Vertical Alignment":
-    *
-    *     This field must be set to VALIGN_4 for all tiled Y Render Target
-    *     surfaces.
-    *
-    * So if the surface is renderable and uses a vertical alignment of 2,
-    * force it to be X tiled.  This is somewhat conservative (it's possible
-    * that the client won't ever render to this surface), but it's difficult
-    * to know that ahead of time.  And besides, since we use a vertical
-    * alignment of 4 as often as we can, this shouldn't happen very often.
-    */
-   if (brw->gen == 7 && mt->valign == 2 &&
-       brw->format_supported_as_render_target[mt->format]) {
-      return I915_TILING_X;
-   }
-
-   return I915_TILING_Y | I915_TILING_X;
-}
-
-static void
-intel_miptree_set_total_width_height(struct brw_context *brw,
-                                     struct intel_mipmap_tree *mt)
-{
-   switch (mt->target) {
-   case GL_TEXTURE_CUBE_MAP:
-      if (brw->gen == 4) {
-         /* Gen4 stores cube maps as 3D textures. */
-         assert(mt->physical_depth0 == 6);
-         brw_miptree_layout_texture_3d(brw, mt);
-      } else {
-         /* All other hardware stores cube maps as 2D arrays. */
-	 brw_miptree_layout_texture_array(brw, mt);
-      }
-      break;
-
-   case GL_TEXTURE_3D:
-      if (brw->gen >= 9)
-         brw_miptree_layout_texture_array(brw, mt);
-      else
-         brw_miptree_layout_texture_3d(brw, mt);
-      break;
-
-   case GL_TEXTURE_1D_ARRAY:
-   case GL_TEXTURE_2D_ARRAY:
-   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-   case GL_TEXTURE_CUBE_MAP_ARRAY:
-      brw_miptree_layout_texture_array(brw, mt);
-      break;
-
-   default:
-      switch (mt->msaa_layout) {
-      case INTEL_MSAA_LAYOUT_UMS:
-      case INTEL_MSAA_LAYOUT_CMS:
-         brw_miptree_layout_texture_array(brw, mt);
-         break;
-      case INTEL_MSAA_LAYOUT_NONE:
-      case INTEL_MSAA_LAYOUT_IMS:
-         if (gen9_use_linear_1d_layout(brw, mt))
-            gen9_miptree_layout_1d(mt);
-         else if (mt->array_layout == GEN6_HIZ_STENCIL)
-            brw_miptree_layout_gen6_hiz_stencil(mt);
-         else
-            brw_miptree_layout_2d(mt);
-         break;
-      }
-      break;
-   }
-
-   DBG("%s: %dx%dx%d\n", __func__,
-       mt->total_width, mt->total_height, mt->cpp);
-}
-
-static void
-intel_miptree_set_alignment(struct brw_context *brw,
-                            struct intel_mipmap_tree *mt,
-                            uint32_t layout_flags)
-{
-   /**
-    * From the "Alignment Unit Size" section of various specs, namely:
-    * - Gen3 Spec: "Memory Data Formats" Volume,         Section 1.20.1.4
-    * - i965 and G45 PRMs:             Volume 1,         Section 6.17.3.4.
-    * - Ironlake and Sandybridge PRMs: Volume 1, Part 1, Section 7.18.3.4
-    * - BSpec (for Ivybridge and slight variations in separate stencil)
-    */
-
-   if (mt->array_layout == GEN6_HIZ_STENCIL) {
-      /* On gen6, we use GEN6_HIZ_STENCIL for stencil/hiz because the
-       * hardware doesn't support multiple mip levels on stencil/hiz.
-       *
-       * PRM Vol 2, Part 1, 7.5.3 Hierarchical Depth Buffer:
-       * "The hierarchical depth buffer does not support the LOD field"
-       *
-       * PRM Vol 2, Part 1, 7.5.4.1 Separate Stencil Buffer:
-       * "The stencil depth buffer does not support the LOD field"
-       */
-      if (mt->format == MESA_FORMAT_S_UINT8) {
-         /* Stencil uses W tiling, so we force W tiling alignment for the
-          * ALL_SLICES_AT_EACH_LOD miptree layout.
-          */
-         mt->halign = 4;
-         mt->valign = 2;
-         assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
-      } else {
-         /* See brw_miptree_layout_gen6_hiz_stencil() */
-         mt->halign = 1;
-         mt->valign = 2;
-      }
-   } else if (mt->compressed) {
-       /* The hardware alignment requirements for compressed textures
-        * happen to match the block boundaries.
-        */
-      _mesa_get_format_block_size(mt->format, &mt->halign, &mt->valign);
-
-      /* On Gen9+ we can pick our own alignment for compressed textures but it
-       * has to be a multiple of the block size. The minimum alignment we can
-       * pick is 4 so we effectively have to align to 4 times the block
-       * size
-       */
-      if (brw->gen >= 9) {
-         mt->halign *= 4;
-         mt->valign *= 4;
-      }
-   } else if (mt->format == MESA_FORMAT_S_UINT8) {
-      mt->halign = 8;
-      mt->valign = brw->gen >= 7 ? 8 : 4;
-   } else {
-      mt->halign =
-         intel_horizontal_texture_alignment_unit(brw, mt, layout_flags);
-      mt->valign = intel_vertical_texture_alignment_unit(brw, mt);
-   }
-}
-
-bool
-brw_miptree_layout(struct brw_context *brw,
-                   struct intel_mipmap_tree *mt,
-                   uint32_t layout_flags)
-{
-   intel_miptree_set_alignment(brw, mt, layout_flags);
-   intel_miptree_set_total_width_height(brw, mt);
-
-   if (!mt->total_width || !mt->total_height)
-      return false;
-
-   /* On Gen9+ the alignment values are expressed in multiples of the block
-    * size
-    */
-   if (brw->gen >= 9) {
-      unsigned int i, j;
-      _mesa_get_format_block_size(mt->format, &i, &j);
-      mt->halign /= i;
-      mt->valign /= j;
-   }
-
-   if ((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0)
-      mt->tiling = brw_miptree_choose_tiling(brw, mt, layout_flags);
-
-   return true;
-}
-
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index d61e713..18daf51 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -112,12 +112,10 @@
 /* Most minimal update, forces re-emit of URB fence packet after GS
  * unit turned on/off.
  */
-static void recalculate_urb_fence( struct brw_context *brw )
+void
+brw_calculate_urb_fence(struct brw_context *brw, unsigned csize,
+                        unsigned vsize, unsigned sfsize)
 {
-   GLuint csize = brw->curbe.total_size;
-   GLuint vsize = brw_vue_prog_data(brw->vs.base.prog_data)->urb_entry_size;
-   GLuint sfsize = brw->sf.prog_data->urb_entry_size;
-
    if (csize < limits[CS].min_entry_size)
       csize = limits[CS].min_entry_size;
 
@@ -208,12 +206,19 @@
    }
 }
 
+static void recalculate_urb_fence( struct brw_context *brw )
+{
+   brw_calculate_urb_fence(brw, brw->curbe.total_size,
+                           brw_vue_prog_data(brw->vs.base.prog_data)->urb_entry_size,
+                           brw->sf.prog_data->urb_entry_size);
+}
+
 
 const struct brw_tracked_state brw_recalculate_urb_fence = {
    .dirty = {
       .mesa = 0,
       .brw = BRW_NEW_BLORP |
-             BRW_NEW_CURBE_OFFSETS |
+             BRW_NEW_PUSH_CONSTANT_ALLOCATION |
              BRW_NEW_SF_PROG_DATA |
              BRW_NEW_VS_PROG_DATA,
    },
diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h
index 3e9a6ee..095c43a 100644
--- a/src/mesa/drivers/dri/i965/brw_util.h
+++ b/src/mesa/drivers/dri/i965/brw_util.h
@@ -38,9 +38,8 @@
 
 extern GLuint brw_translate_blend_factor( GLenum factor );
 extern GLuint brw_translate_blend_equation( GLenum mode );
-extern GLenum brw_fix_xRGB_alpha(GLenum function);
 
-static inline uint32_t
+static inline float
 brw_get_line_width(struct brw_context *brw)
 {
    /* From the OpenGL 4.4 spec:
@@ -52,14 +51,9 @@
    float line_width =
       CLAMP(!_mesa_is_multisample_enabled(&brw->ctx) && !brw->ctx.Line.SmoothFlag
             ? roundf(brw->ctx.Line.Width) : brw->ctx.Line.Width,
-            0.0f, brw->ctx.Const.MaxLineWidth);
-   uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
+            0.125f, brw->ctx.Const.MaxLineWidth);
 
-   /* Line width of 0 is not allowed when MSAA enabled */
-   if (_mesa_is_multisample_enabled(&brw->ctx)) {
-      if (line_width_u3_7 == 0)
-         line_width_u3_7 = 1;
-   } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5f) {
+   if (!_mesa_is_multisample_enabled(&brw->ctx) && brw->ctx.Line.SmoothFlag && line_width < 1.5f) {
       /* For 1 pixel line thickness or less, the general
        * anti-aliasing algorithm gives up, and a garbage line is
        * generated.  Setting a Line Width of 0.0 specifies the
@@ -71,10 +65,10 @@
        * bspec section 6.3.12.1 Zero-Width (Cosmetic) Line
        * Rasterization.
        */
-      line_width_u3_7 = 0;
+      line_width = 0.0f;
    }
 
-   return line_width_u3_7;
+   return line_width;
 }
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 74b07cb..c0a0a13 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -203,6 +203,8 @@
       brw_nir_setup_glsl_uniforms(vp->program.nir, &vp->program,
                                   &prog_data.base.base,
                                   compiler->scalar_stage[MESA_SHADER_VERTEX]);
+      brw_nir_analyze_ubo_ranges(compiler, vp->program.nir,
+                                 prog_data.base.base.ubo_ranges);
    } else {
       brw_nir_setup_arb_uniforms(vp->program.nir, &vp->program,
                                  &prog_data.base.base);
@@ -210,16 +212,10 @@
 
    uint64_t outputs_written =
       brw_vs_outputs_written(brw, key, vp->program.info.outputs_written);
-   prog_data.inputs_read = vp->program.info.inputs_read;
-   prog_data.double_inputs_read = vp->program.info.double_inputs_read;
-
-   if (key->copy_edgeflag) {
-      prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
-   }
 
    brw_compute_vue_map(devinfo,
                        &prog_data.base.vue_map, outputs_written,
-                       vp->program.nir->info->separate_shader);
+                       vp->program.nir->info.separate_shader);
 
    if (0) {
       _mesa_fprint_program_opt(stderr, &vp->program, PROG_PRINT_DEBUG, true);
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
deleted file mode 100644
index 8a94eb2..0000000
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
-
-
-#include "intel_batchbuffer.h"
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "main/macros.h"
-
-static void
-brw_upload_vs_unit(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   struct brw_stage_state *stage_state = &brw->vs.base;
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(stage_state->prog_data);
-
-   struct brw_vs_unit_state *vs;
-
-   vs = brw_state_batch(brw, sizeof(*vs), 32, &stage_state->state_offset);
-   memset(vs, 0, sizeof(*vs));
-
-   /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_VS_PROG_DATA */
-   vs->thread0.grf_reg_count = ALIGN(vue_prog_data->total_grf, 16) / 16 - 1;
-   vs->thread0.kernel_start_pointer =
-      brw_program_reloc(brw,
-			stage_state->state_offset +
-			offsetof(struct brw_vs_unit_state, thread0),
-			stage_state->prog_offset +
-			(vs->thread0.grf_reg_count << 1)) >> 6;
-
-   if (prog_data->use_alt_mode)
-      vs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   else
-      vs->thread1.floating_point_mode = BRW_FLOATING_POINT_IEEE_754;
-
-   /* Choosing multiple program flow means that we may get 2-vertex threads,
-    * which will have the channel mask for dwords 4-7 enabled in the thread,
-    * and those dwords will be written to the second URB handle when we
-    * brw_urb_WRITE() results.
-    */
-   /* Force single program flow on Ironlake.  We cannot reliably get
-    * all applications working without it.  See:
-    * https://bugs.freedesktop.org/show_bug.cgi?id=29172
-    *
-    * The most notable and reliably failing application is the Humus
-    * demo "CelShading"
-   */
-   vs->thread1.single_program_flow = (brw->gen == 5);
-
-   vs->thread1.binding_table_entry_count =
-      prog_data->binding_table.size_bytes / 4;
-
-   if (prog_data->total_scratch != 0) {
-      vs->thread2.scratch_space_base_pointer =
-	 stage_state->scratch_bo->offset64 >> 10; /* reloc */
-      vs->thread2.per_thread_scratch_space =
-	 ffs(stage_state->per_thread_scratch) - 11;
-   } else {
-      vs->thread2.scratch_space_base_pointer = 0;
-      vs->thread2.per_thread_scratch_space = 0;
-   }
-
-   vs->thread3.urb_entry_read_length = vue_prog_data->urb_read_length;
-   vs->thread3.const_urb_entry_read_length = prog_data->curb_read_length;
-   vs->thread3.dispatch_grf_start_reg = prog_data->dispatch_grf_start_reg;
-   vs->thread3.urb_entry_read_offset = 0;
-
-   /* BRW_NEW_CURBE_OFFSETS */
-   vs->thread3.const_urb_entry_read_offset = brw->curbe.vs_start * 2;
-
-   /* BRW_NEW_URB_FENCE */
-   if (brw->gen == 5) {
-      switch (brw->urb.nr_vs_entries) {
-      case 8:
-      case 12:
-      case 16:
-      case 32:
-      case 64:
-      case 96:
-      case 128:
-      case 168:
-      case 192:
-      case 224:
-      case 256:
-	 vs->thread4.nr_urb_entries = brw->urb.nr_vs_entries >> 2;
-	 break;
-      default:
-         unreachable("not reached");
-      }
-   } else {
-      switch (brw->urb.nr_vs_entries) {
-      case 8:
-      case 12:
-      case 16:
-      case 32:
-	 break;
-      case 64:
-	 assert(brw->is_g4x);
-	 break;
-      default:
-         unreachable("not reached");
-      }
-      vs->thread4.nr_urb_entries = brw->urb.nr_vs_entries;
-   }
-
-   vs->thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
-
-   vs->thread4.max_threads = CLAMP(brw->urb.nr_vs_entries / 2,
-				   1, devinfo->max_vs_threads) - 1;
-
-   if (brw->gen == 5)
-      vs->vs5.sampler_count = 0; /* hardware requirement */
-   else {
-      vs->vs5.sampler_count = (stage_state->sampler_count + 3) / 4;
-   }
-
-
-   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
-      vs->thread4.stats_enable = 1;
-
-   /* Vertex program always enabled:
-    */
-   vs->vs6.vs_enable = 1;
-
-   /* Set the sampler state pointer, and its reloc
-    */
-   if (stage_state->sampler_count) {
-      /* BRW_NEW_SAMPLER_STATE_TABLE - reloc */
-      vs->vs5.sampler_state_pointer =
-         (brw->batch.bo->offset64 + stage_state->sampler_offset) >> 5;
-      brw_emit_reloc(&brw->batch,
-                     stage_state->state_offset +
-                     offsetof(struct brw_vs_unit_state, vs5),
-                     brw->batch.bo,
-                     (stage_state->sampler_offset | vs->vs5.sampler_count),
-                     I915_GEM_DOMAIN_INSTRUCTION, 0);
-   }
-
-   /* Emit scratch space relocation */
-   if (prog_data->total_scratch != 0) {
-      brw_emit_reloc(&brw->batch,
-                     stage_state->state_offset +
-                     offsetof(struct brw_vs_unit_state, thread2),
-                     stage_state->scratch_bo,
-                     vs->thread2.per_thread_scratch_space,
-                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
-   }
-
-   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
-}
-
-const struct brw_tracked_state brw_vs_unit = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CURBE_OFFSETS |
-               BRW_NEW_PROGRAM_CACHE |
-               BRW_NEW_SAMPLER_STATE_TABLE |
-               BRW_NEW_URB_FENCE |
-               BRW_NEW_VS_PROG_DATA,
-   },
-   .emit = brw_upload_vs_unit,
-};
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index 016da74..9c2184c 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -198,6 +198,7 @@
       .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
+             BRW_NEW_FAST_CLEAR_COLOR |
              BRW_NEW_IMAGE_UNITS |
              BRW_NEW_VERTEX_PROGRAM |
              BRW_NEW_VS_PROG_DATA,
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 59d503e..c9c4504 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -58,7 +58,7 @@
       brw_assign_common_binding_table_offsets(devinfo, prog, &prog_data->base,
                                               next_binding_table_offset);
 
-   if (prog->nir->info->outputs_read && !key->coherent_fb_fetch) {
+   if (prog->nir->info.outputs_read && !key->coherent_fb_fetch) {
       prog_data->binding_table.render_target_read_start =
          next_binding_table_offset;
       next_binding_table_offset += key->nr_color_regions;
@@ -106,6 +106,9 @@
                       old_key->alpha_test_func, key->alpha_test_func);
    found |= key_debug(brw, "mrt alpha test reference value",
                       old_key->alpha_test_ref, key->alpha_test_ref);
+   found |= key_debug(brw, "force dual color blending",
+                      old_key->force_dual_color_blend,
+                      key->force_dual_color_blend);
 
    found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
 
@@ -162,6 +165,8 @@
    if (!fp->program.is_arb_asm) {
       brw_nir_setup_glsl_uniforms(fp->program.nir, &fp->program,
                                   &prog_data.base, true);
+      brw_nir_analyze_ubo_ranges(brw->screen->compiler, fp->program.nir,
+                                 prog_data.base.ubo_ranges);
    } else {
       brw_nir_setup_arb_uniforms(fp->program.nir, &fp->program,
                                  &prog_data.base);
@@ -188,7 +193,7 @@
    program = brw_compile_fs(brw->screen->compiler, brw, mem_ctx,
                             key, &prog_data, fp->program.nir,
                             &fp->program, st_index8, st_index16,
-                            true, brw->use_rep_send, vue_map,
+                            true, false, vue_map,
                             &program_size, &error_str);
 
    if (program == NULL) {
@@ -267,6 +272,10 @@
    found |= key_debug(brw, "yx_xuxv image bound",
                       old_key->yx_xuxv_image_mask,
                       key->yx_xuxv_image_mask);
+   found |= key_debug(brw, "xy_uxvx image bound",
+                      old_key->xy_uxvx_image_mask,
+                      key->xy_uxvx_image_mask);
+
 
    for (unsigned int i = 0; i < MAX_SAMPLERS; i++) {
       found |= key_debug(brw, "textureGather workarounds",
@@ -335,7 +344,7 @@
          }
 
          /* gather4 for RG32* is broken in multiple ways on Gen7. */
-         if (brw->gen == 7 && prog->nir->info->uses_texture_gather) {
+         if (brw->gen == 7 && prog->nir->info.uses_texture_gather) {
             switch (img->InternalFormat) {
             case GL_RG32I:
             case GL_RG32UI: {
@@ -373,7 +382,7 @@
          /* Gen6's gather4 is broken for UINT/SINT; we treat them as
           * UNORM/FLOAT instead and fix it in the shader.
           */
-         if (brw->gen == 6 && prog->nir->info->uses_texture_gather) {
+         if (brw->gen == 6 && prog->nir->info.uses_texture_gather) {
             key->gen6_gather_wa[s] = gen6_gather_workaround(img->InternalFormat);
          }
 
@@ -387,12 +396,14 @@
          /* From gen9 onwards some single sampled buffers can also be
           * compressed. These don't need ld2dms sampling along with mcs fetch.
           */
-         if (brw->gen >= 7 &&
-             intel_tex->mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS &&
-             intel_tex->mt->num_samples > 1) {
+         if (intel_tex->mt->aux_usage == ISL_AUX_USAGE_MCS) {
+            assert(brw->gen >= 7);
+            assert(intel_tex->mt->surf.samples > 1);
+            assert(intel_tex->mt->mcs_buf);
+            assert(intel_tex->mt->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY);
             key->compressed_multisample_layout_mask |= 1 << s;
 
-            if (intel_tex->mt->num_samples >= 16) {
+            if (intel_tex->mt->surf.samples >= 16) {
                assert(brw->gen >= 9);
                key->msaa_16 |= 1 << s;
             }
@@ -409,6 +420,9 @@
             case __DRI_IMAGE_COMPONENTS_Y_XUXV:
                key->yx_xuxv_image_mask |= 1 << s;
                break;
+            case __DRI_IMAGE_COMPONENTS_Y_UXVX:
+               key->xy_uxvx_image_mask |= 1 << s;
+               break;
             default:
                break;
             }
@@ -471,7 +485,7 @@
          lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
 
       /* _NEW_STENCIL | _NEW_BUFFERS */
-      if (ctx->Stencil._Enabled) {
+      if (brw->stencil_enabled) {
          lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT;
 
          if (ctx->Stencil.WriteMask[0] ||
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 613172a..113cdf3 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -41,8 +41,6 @@
 extern "C" {
 #endif
 
-bool brw_color_buffer_write_enabled(struct brw_context *brw);
-
 void
 brw_upload_wm_prog(struct brw_context *brw);
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
deleted file mode 100644
index 5da7d6d..0000000
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
-
-
-#include "intel_batchbuffer.h"
-#include "intel_fbo.h"
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_wm.h"
-#include "compiler/nir/nir.h"
-
-/***********************************************************************
- * WM unit - fragment programs and rasterization
- */
-
-bool
-brw_color_buffer_write_enabled(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   const struct gl_program *fp = brw->fragment_program;
-   unsigned i;
-
-   /* _NEW_BUFFERS */
-   for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
-      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
-      uint64_t outputs_written = fp->info.outputs_written;
-
-      /* _NEW_COLOR */
-      if (rb && (outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR) ||
-	         outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) &&
-	  (ctx->Color.ColorMask[i][0] ||
-	   ctx->Color.ColorMask[i][1] ||
-	   ctx->Color.ColorMask[i][2] ||
-	   ctx->Color.ColorMask[i][3])) {
-	 return true;
-      }
-   }
-
-   return false;
-}
-
-/**
- * Setup wm hardware state.  See page 225 of Volume 2
- */
-static void
-brw_upload_wm_unit(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   const struct gl_program *fp = brw->fragment_program;
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-   struct brw_wm_unit_state *wm;
-
-   wm = brw_state_batch(brw, sizeof(*wm), 32, &brw->wm.base.state_offset);
-   memset(wm, 0, sizeof(*wm));
-
-   if (prog_data->dispatch_8 && prog_data->dispatch_16) {
-      /* These two fields should be the same pre-gen6, which is why we
-       * only have one hardware field to program for both dispatch
-       * widths.
-       */
-      assert(prog_data->base.dispatch_grf_start_reg ==
-	     prog_data->dispatch_grf_start_reg_2);
-   }
-
-   /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_FS_PROG_DATA */
-   wm->wm5.enable_8_pix = prog_data->dispatch_8;
-   wm->wm5.enable_16_pix = prog_data->dispatch_16;
-
-   if (prog_data->dispatch_8 || prog_data->dispatch_16) {
-      wm->thread0.grf_reg_count = prog_data->reg_blocks_0;
-      wm->thread0.kernel_start_pointer =
-         brw_program_reloc(brw,
-                           brw->wm.base.state_offset +
-                           offsetof(struct brw_wm_unit_state, thread0),
-                           brw->wm.base.prog_offset +
-                           (wm->thread0.grf_reg_count << 1)) >> 6;
-   }
-
-   if (prog_data->prog_offset_2) {
-      wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_2;
-      wm->wm9.kernel_start_pointer_2 =
-         brw_program_reloc(brw,
-                           brw->wm.base.state_offset +
-                           offsetof(struct brw_wm_unit_state, wm9),
-                           brw->wm.base.prog_offset +
-                           prog_data->prog_offset_2 +
-                           (wm->wm9.grf_reg_count_2 << 1)) >> 6;
-   }
-
-   wm->thread1.depth_coef_urb_read_offset = 1;
-   if (prog_data->base.use_alt_mode)
-      wm->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   else
-      wm->thread1.floating_point_mode = BRW_FLOATING_POINT_IEEE_754;
-
-   wm->thread1.binding_table_entry_count =
-      prog_data->base.binding_table.size_bytes / 4;
-
-   if (prog_data->base.total_scratch != 0) {
-      wm->thread2.scratch_space_base_pointer =
-	 brw->wm.base.scratch_bo->offset64 >> 10; /* reloc */
-      wm->thread2.per_thread_scratch_space =
-	 ffs(brw->wm.base.per_thread_scratch) - 11;
-   } else {
-      wm->thread2.scratch_space_base_pointer = 0;
-      wm->thread2.per_thread_scratch_space = 0;
-   }
-
-   wm->thread3.dispatch_grf_start_reg =
-      prog_data->base.dispatch_grf_start_reg;
-   wm->thread3.urb_entry_read_length =
-      prog_data->num_varying_inputs * 2;
-   wm->thread3.urb_entry_read_offset = 0;
-   wm->thread3.const_urb_entry_read_length =
-      prog_data->base.curb_read_length;
-   /* BRW_NEW_CURBE_OFFSETS */
-   wm->thread3.const_urb_entry_read_offset = brw->curbe.wm_start * 2;
-
-   if (brw->gen == 5)
-      wm->wm4.sampler_count = 0; /* hardware requirement */
-   else {
-      wm->wm4.sampler_count = (brw->wm.base.sampler_count + 1) / 4;
-   }
-
-   if (brw->wm.base.sampler_count) {
-      /* BRW_NEW_SAMPLER_STATE_TABLE - reloc */
-      wm->wm4.sampler_state_pointer = (brw->batch.bo->offset64 +
-				       brw->wm.base.sampler_offset) >> 5;
-   } else {
-      wm->wm4.sampler_state_pointer = 0;
-   }
-
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   wm->wm5.program_uses_depth = prog_data->uses_src_depth;
-   wm->wm5.program_computes_depth = (fp->info.outputs_written &
-				     BITFIELD64_BIT(FRAG_RESULT_DEPTH)) != 0;
-   /* _NEW_BUFFERS
-    * Override for NULL depthbuffer case, required by the Pixel Shader Computed
-    * Depth field.
-    */
-   if (!intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH))
-      wm->wm5.program_computes_depth = 0;
-
-   /* _NEW_COLOR */
-   wm->wm5.program_uses_killpixel =
-      prog_data->uses_kill || ctx->Color.AlphaEnabled;
-
-   wm->wm5.max_threads = devinfo->max_wm_threads - 1;
-
-   /* _NEW_BUFFERS | _NEW_COLOR */
-   if (brw_color_buffer_write_enabled(brw) ||
-       wm->wm5.program_uses_killpixel ||
-       wm->wm5.program_computes_depth) {
-      wm->wm5.thread_dispatch_enable = 1;
-   }
-
-   wm->wm5.legacy_line_rast = 0;
-   wm->wm5.legacy_global_depth_bias = 0;
-   wm->wm5.early_depth_test = 1;	        /* never need to disable */
-   wm->wm5.line_aa_region_width = 0;
-   wm->wm5.line_endcap_aa_region_width = 1;
-
-   /* _NEW_POLYGONSTIPPLE */
-   wm->wm5.polygon_stipple = ctx->Polygon.StippleFlag;
-
-   /* _NEW_POLYGON */
-   if (ctx->Polygon.OffsetFill) {
-      wm->wm5.depth_offset = 1;
-      /* Something weird going on with legacy_global_depth_bias,
-       * offset_constant, scaling and MRD.  This value passes glean
-       * but gives some odd results elsewere (eg. the
-       * quad-offset-units test).
-       */
-      wm->global_depth_offset_constant = ctx->Polygon.OffsetUnits * 2;
-
-      /* This is the only value that passes glean:
-       */
-      wm->global_depth_offset_scale = ctx->Polygon.OffsetFactor;
-   }
-
-   /* _NEW_LINE */
-   wm->wm5.line_stipple = ctx->Line.StippleFlag;
-
-   /* BRW_NEW_STATS_WM */
-   if (unlikely(INTEL_DEBUG & DEBUG_STATS) || brw->stats_wm)
-      wm->wm4.stats_enable = 1;
-
-   /* Emit scratch space relocation */
-   if (prog_data->base.total_scratch != 0) {
-      brw_emit_reloc(&brw->batch,
-                     brw->wm.base.state_offset +
-                     offsetof(struct brw_wm_unit_state, thread2),
-                     brw->wm.base.scratch_bo,
-                     wm->thread2.per_thread_scratch_space,
-                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
-   }
-
-   /* Emit sampler state relocation */
-   if (brw->wm.base.sampler_count != 0) {
-      brw_emit_reloc(&brw->batch,
-                     brw->wm.base.state_offset +
-                     offsetof(struct brw_wm_unit_state, wm4),
-                     brw->batch.bo,
-                     brw->wm.base.sampler_offset | wm->wm4.stats_enable |
-                     (wm->wm4.sampler_count << 2),
-                     I915_GEM_DOMAIN_INSTRUCTION, 0);
-   }
-
-   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
-
-   /* _NEW_POLGYON */
-   if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP << 16 | (2 - 2));
-      OUT_BATCH_F(ctx->Polygon.OffsetClamp);
-      ADVANCE_BATCH();
-
-      brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
-   }
-}
-
-const struct brw_tracked_state brw_wm_unit = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_COLOR |
-              _NEW_LINE |
-              _NEW_POLYGON |
-              _NEW_POLYGONSTIPPLE,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_CURBE_OFFSETS |
-             BRW_NEW_FRAGMENT_PROGRAM |
-             BRW_NEW_FS_PROG_DATA |
-             BRW_NEW_PROGRAM_CACHE |
-             BRW_NEW_SAMPLER_STATE_TABLE |
-             BRW_NEW_STATS_WM,
-   },
-   .emit = brw_upload_wm_unit,
-};
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 5806ca1..79a13eb 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -64,96 +64,114 @@
    [7] = GEN7_MOCS_L3,
    [8] = BDW_MOCS_WB,
    [9] = SKL_MOCS_WB,
+   [10] = CNL_MOCS_WB,
 };
 
 uint32_t rb_mocs[] = {
    [7] = GEN7_MOCS_L3,
    [8] = BDW_MOCS_PTE,
    [9] = SKL_MOCS_PTE,
+   [10] = CNL_MOCS_PTE,
 };
 
 static void
+get_isl_surf(struct brw_context *brw, struct intel_mipmap_tree *mt,
+             GLenum target, struct isl_view *view,
+             uint32_t *tile_x, uint32_t *tile_y,
+             uint32_t *offset, struct isl_surf *surf)
+{
+   *surf = mt->surf;
+
+   const enum isl_dim_layout dim_layout =
+      get_isl_dim_layout(&brw->screen->devinfo, mt->surf.tiling, target);
+
+   if (surf->dim_layout == dim_layout)
+      return;
+
+   /* The layout of the specified texture target is not compatible with the
+    * actual layout of the miptree structure in memory -- You're entering
+    * dangerous territory, this can only possibly work if you only intended
+    * to access a single level and slice of the texture, and the hardware
+    * supports the tile offset feature in order to allow non-tile-aligned
+    * base offsets, since we'll have to point the hardware to the first
+    * texel of the level instead of relying on the usual base level/layer
+    * controls.
+    */
+   assert(brw->has_surface_tile_offset);
+   assert(view->levels == 1 && view->array_len == 1);
+   assert(*tile_x == 0 && *tile_y == 0);
+
+   *offset += intel_miptree_get_tile_offsets(mt, view->base_level,
+                                             view->base_array_layer,
+                                             tile_x, tile_y);
+
+   /* Minify the logical dimensions of the texture. */
+   const unsigned l = view->base_level - mt->first_level;
+   surf->logical_level0_px.width = minify(surf->logical_level0_px.width, l);
+   surf->logical_level0_px.height = surf->dim <= ISL_SURF_DIM_1D ? 1 :
+      minify(surf->logical_level0_px.height, l);
+   surf->logical_level0_px.depth = surf->dim <= ISL_SURF_DIM_2D ? 1 :
+      minify(surf->logical_level0_px.depth, l);
+
+   /* Only the base level and layer can be addressed with the overridden
+    * layout.
+    */
+   surf->logical_level0_px.array_len = 1;
+   surf->levels = 1;
+   surf->dim_layout = dim_layout;
+
+   /* The requested slice of the texture is now at the base level and
+    * layer.
+    */
+   view->base_level = 0;
+   view->base_array_layer = 0;
+}
+
+static void
 brw_emit_surface_state(struct brw_context *brw,
-                       struct intel_mipmap_tree *mt, uint32_t flags,
+                       struct intel_mipmap_tree *mt,
                        GLenum target, struct isl_view view,
+                       enum isl_aux_usage aux_usage,
                        uint32_t mocs, uint32_t *surf_offset, int surf_index,
                        unsigned read_domains, unsigned write_domains)
 {
-   uint32_t tile_x = mt->level[0].slice[0].x_offset;
-   uint32_t tile_y = mt->level[0].slice[0].y_offset;
+   uint32_t tile_x = mt->level[0].level_x;
+   uint32_t tile_y = mt->level[0].level_y;
    uint32_t offset = mt->offset;
 
    struct isl_surf surf;
-   intel_miptree_get_isl_surf(brw, mt, &surf);
 
-   surf.dim = get_isl_surf_dim(target);
-
-   const enum isl_dim_layout dim_layout =
-      get_isl_dim_layout(&brw->screen->devinfo, mt->tiling, target);
-
-   if (surf.dim_layout != dim_layout) {
-      /* The layout of the specified texture target is not compatible with the
-       * actual layout of the miptree structure in memory -- You're entering
-       * dangerous territory, this can only possibly work if you only intended
-       * to access a single level and slice of the texture, and the hardware
-       * supports the tile offset feature in order to allow non-tile-aligned
-       * base offsets, since we'll have to point the hardware to the first
-       * texel of the level instead of relying on the usual base level/layer
-       * controls.
-       */
-      assert(brw->has_surface_tile_offset);
-      assert(view.levels == 1 && view.array_len == 1);
-      assert(tile_x == 0 && tile_y == 0);
-
-      offset += intel_miptree_get_tile_offsets(mt, view.base_level,
-                                               view.base_array_layer,
-                                               &tile_x, &tile_y);
-
-      /* Minify the logical dimensions of the texture. */
-      const unsigned l = view.base_level - mt->first_level;
-      surf.logical_level0_px.width = minify(surf.logical_level0_px.width, l);
-      surf.logical_level0_px.height = surf.dim <= ISL_SURF_DIM_1D ? 1 :
-         minify(surf.logical_level0_px.height, l);
-      surf.logical_level0_px.depth = surf.dim <= ISL_SURF_DIM_2D ? 1 :
-         minify(surf.logical_level0_px.depth, l);
-
-      /* Only the base level and layer can be addressed with the overridden
-       * layout.
-       */
-      surf.logical_level0_px.array_len = 1;
-      surf.levels = 1;
-      surf.dim_layout = dim_layout;
-
-      /* The requested slice of the texture is now at the base level and
-       * layer.
-       */
-      view.base_level = 0;
-      view.base_array_layer = 0;
-   }
+   get_isl_surf(brw, mt, target, &view, &tile_x, &tile_y, &offset, &surf);
 
    union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
 
    struct brw_bo *aux_bo;
-   struct isl_surf *aux_surf = NULL, aux_surf_s;
+   struct isl_surf *aux_surf = NULL;
    uint64_t aux_offset = 0;
-   enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
-   if ((mt->mcs_buf || intel_miptree_sample_with_hiz(brw, mt)) &&
-       !(flags & INTEL_AUX_BUFFER_DISABLED)) {
-      intel_miptree_get_aux_isl_surf(brw, mt, &aux_surf_s, &aux_usage);
-      aux_surf = &aux_surf_s;
+   switch (aux_usage) {
+   case ISL_AUX_USAGE_MCS:
+   case ISL_AUX_USAGE_CCS_D:
+   case ISL_AUX_USAGE_CCS_E:
+      aux_surf = &mt->mcs_buf->surf;
+      aux_bo = mt->mcs_buf->bo;
+      aux_offset = mt->mcs_buf->bo->offset64 + mt->mcs_buf->offset;
+      break;
 
-      if (mt->mcs_buf) {
-         aux_bo = mt->mcs_buf->bo;
-         aux_offset = mt->mcs_buf->bo->offset64 + mt->mcs_buf->offset;
-      } else {
-         aux_bo = mt->hiz_buf->aux_base.bo;
-         aux_offset = mt->hiz_buf->aux_base.bo->offset64;
-      }
+   case ISL_AUX_USAGE_HIZ:
+      aux_surf = &mt->hiz_buf->surf;
+      aux_bo = mt->hiz_buf->bo;
+      aux_offset = mt->hiz_buf->bo->offset64;
+      break;
 
+   case ISL_AUX_USAGE_NONE:
+      break;
+   }
+
+   if (aux_usage != ISL_AUX_USAGE_NONE) {
       /* We only really need a clear color if we also have an auxiliary
        * surface.  Without one, it does nothing.
        */
-      clear_color = intel_miptree_get_isl_clear_color(brw, mt);
+      clear_color = mt->fast_clear_color;
    }
 
    void *state = brw_state_batch(brw,
@@ -161,7 +179,7 @@
                                  brw->isl_dev.ss.align,
                                  surf_offset);
 
-   isl_surf_fill_state(&brw->isl_dev, state, .surf = &surf, .view = &view,
+   isl_surf_fill_state(&brw->isl_dev, state, .surf = &mt->surf, .view = &view,
                        .address = mt->bo->offset64 + offset,
                        .aux_surf = aux_surf, .aux_usage = aux_usage,
                        .aux_address = aux_offset,
@@ -190,42 +208,39 @@
 uint32_t
 brw_update_renderbuffer_surface(struct brw_context *brw,
                                 struct gl_renderbuffer *rb,
-                                uint32_t flags, unsigned unit /* unused */,
+                                uint32_t flags, unsigned unit,
                                 uint32_t surf_index)
 {
    struct gl_context *ctx = &brw->ctx;
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    struct intel_mipmap_tree *mt = irb->mt;
 
-   if (brw->gen < 9) {
-      assert(!(flags & INTEL_AUX_BUFFER_DISABLED));
-   }
-
    assert(brw_render_target_supported(brw, rb));
 
    mesa_format rb_format = _mesa_get_render_format(ctx, intel_rb_format(irb));
-   if (unlikely(!brw->format_supported_as_render_target[rb_format])) {
+   if (unlikely(!brw->mesa_format_supports_render[rb_format])) {
       _mesa_problem(ctx, "%s: renderbuffer format %s unsupported\n",
                     __func__, _mesa_get_format_name(rb_format));
    }
+   enum isl_format isl_format = brw->mesa_to_isl_render_format[rb_format];
 
-   const unsigned layer_multiplier =
-      (irb->mt->msaa_layout == INTEL_MSAA_LAYOUT_UMS ||
-       irb->mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) ?
-      MAX2(irb->mt->num_samples, 1) : 1;
+   enum isl_aux_usage aux_usage =
+      brw->draw_aux_buffer_disabled[unit] ? ISL_AUX_USAGE_NONE :
+      intel_miptree_render_aux_usage(brw, mt, isl_format,
+                                     ctx->Color.BlendEnabled & (1 << unit));
 
    struct isl_view view = {
-      .format = brw->render_target_format[rb_format],
+      .format = isl_format,
       .base_level = irb->mt_level - irb->mt->first_level,
       .levels = 1,
-      .base_array_layer = irb->mt_layer / layer_multiplier,
+      .base_array_layer = irb->mt_layer,
       .array_len = MAX2(irb->layer_count, 1),
       .swizzle = ISL_SWIZZLE_IDENTITY,
       .usage = ISL_SURF_USAGE_RENDER_TARGET_BIT,
    };
 
    uint32_t offset;
-   brw_emit_surface_state(brw, mt, flags, mt->target, view,
+   brw_emit_surface_state(brw, mt, mt->target, view, aux_usage,
                           rb_mocs[brw->gen],
                           &offset, surf_index,
                           I915_GEM_DOMAIN_RENDER,
@@ -264,12 +279,12 @@
 }
 
 uint32_t
-brw_get_surface_tiling_bits(uint32_t tiling)
+brw_get_surface_tiling_bits(enum isl_tiling tiling)
 {
    switch (tiling) {
-   case I915_TILING_X:
+   case ISL_TILING_X:
       return BRW_SURFACE_TILED;
-   case I915_TILING_Y:
+   case ISL_TILING_Y0:
       return BRW_SURFACE_TILED | BRW_SURFACE_TILED_Y;
    default:
       return 0;
@@ -423,95 +438,21 @@
    return (need_green_to_blue && scs == HSW_SCS_GREEN) ? HSW_SCS_BLUE : scs;
 }
 
-static unsigned
-brw_find_matching_rb(const struct gl_framebuffer *fb,
-                     const struct intel_mipmap_tree *mt)
+static bool
+brw_aux_surface_disabled(const struct brw_context *brw,
+                         const struct intel_mipmap_tree *mt)
 {
+   const struct gl_framebuffer *fb = brw->ctx.DrawBuffer;
+
    for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
       const struct intel_renderbuffer *irb =
          intel_renderbuffer(fb->_ColorDrawBuffers[i]);
 
       if (irb && irb->mt == mt)
-         return i;
+         return brw->draw_aux_buffer_disabled[i];
    }
 
-   return fb->_NumColorDrawBuffers;
-}
-
-static inline bool
-brw_texture_view_sane(const struct brw_context *brw,
-                      const struct intel_mipmap_tree *mt,
-                      const struct isl_view *view)
-{
-   /* There are special cases only for lossless compression. */
-   if (!intel_miptree_is_lossless_compressed(brw, mt))
-      return true;
-
-   if (isl_format_supports_ccs_e(&brw->screen->devinfo, view->format))
-      return true;
-
-   /* Logic elsewhere needs to take care to resolve the color buffer prior
-    * to sampling it as non-compressed.
-    */
-   if (intel_miptree_has_color_unresolved(mt, view->base_level, view->levels,
-                                          view->base_array_layer,
-                                          view->array_len))
-      return false;
-
-   const struct gl_framebuffer *fb = brw->ctx.DrawBuffer;
-   const unsigned rb_index = brw_find_matching_rb(fb, mt);
-
-   if (rb_index == fb->_NumColorDrawBuffers)
-      return true;
-
-   /* Underlying surface is compressed but it is sampled using a format that
-    * the sampling engine doesn't support as compressed. Compression must be
-    * disabled for both sampling engine and data port in case the same surface
-    * is used also as render target.
-    */
-   return brw->draw_aux_buffer_disabled[rb_index];
-}
-
-static bool
-brw_disable_aux_surface(const struct brw_context *brw,
-                        const struct intel_mipmap_tree *mt,
-                        const struct isl_view *view)
-{
-   /* Nothing to disable. */
-   if (!mt->mcs_buf)
-      return false;
-
-   const bool is_unresolved = intel_miptree_has_color_unresolved(
-                                 mt, view->base_level, view->levels,
-                                 view->base_array_layer, view->array_len);
-
-   /* There are special cases only for lossless compression. */
-   if (!intel_miptree_is_lossless_compressed(brw, mt))
-      return !is_unresolved;
-
-   const struct gl_framebuffer *fb = brw->ctx.DrawBuffer;
-   const unsigned rb_index = brw_find_matching_rb(fb, mt);
-
-   /* If we are drawing into this with compression enabled, then we must also
-    * enable compression when texturing from it regardless of
-    * fast_clear_state.  If we don't then, after the first draw call with
-    * this setup, there will be data in the CCS which won't get picked up by
-    * subsequent texturing operations as required by ARB_texture_barrier.
-    * Since we don't want to re-emit the binding table or do a resolve
-    * operation every draw call, the easiest thing to do is just enable
-    * compression on the texturing side.  This is completely safe to do
-    * since, if compressed texturing weren't allowed, we would have disabled
-    * compression of render targets in whatever_that_function_is_called().
-    */
-   if (rb_index < fb->_NumColorDrawBuffers) {
-      if (brw->draw_aux_buffer_disabled[rb_index]) {
-         assert(!is_unresolved);
-      }
-
-      return brw->draw_aux_buffer_disabled[rb_index];
-   }
-
-   return !is_unresolved;
+   return false;
 }
 
 void
@@ -541,9 +482,14 @@
       /* If this is a view with restricted NumLayers, then our effective depth
        * is not just the miptree depth.
        */
-      const unsigned view_num_layers =
-         (obj->Immutable && obj->Target != GL_TEXTURE_3D) ? obj->NumLayers :
-                                                            mt->logical_depth0;
+      unsigned view_num_layers;
+      if (obj->Immutable && obj->Target != GL_TEXTURE_3D) {
+         view_num_layers = obj->NumLayers;
+      } else {
+         view_num_layers = mt->surf.dim == ISL_SURF_DIM_3D ?
+                              mt->surf.logical_level0_px.depth :
+                              mt->surf.logical_level0_px.array_len;
+      }
 
       /* Handling GL_ALPHA as a surface format override breaks 1.30+ style
        * texturing functions that return a float, as our code generation always
@@ -557,8 +503,8 @@
                                 brw_get_texture_swizzle(&brw->ctx, obj));
 
       mesa_format mesa_fmt = plane == 0 ? intel_obj->_Format : mt->format;
-      unsigned format = translate_tex_format(brw, mesa_fmt,
-                                             sampler->sRGBDecode);
+      enum isl_format format = translate_tex_format(brw, mesa_fmt,
+                                                    sampler->sRGBDecode);
 
       /* Implement gen6 and gen7 gather work-around */
       bool need_green_to_blue = false;
@@ -633,11 +579,13 @@
           obj->Target == GL_TEXTURE_CUBE_MAP_ARRAY)
          view.usage |= ISL_SURF_USAGE_CUBE_BIT;
 
-      assert(brw_texture_view_sane(brw, mt, &view));
+      enum isl_aux_usage aux_usage =
+         intel_miptree_texture_aux_usage(brw, mt, format);
 
-      const int flags = brw_disable_aux_surface(brw, mt, &view) ?
-                           INTEL_AUX_BUFFER_DISABLED : 0;
-      brw_emit_surface_state(brw, mt, flags, mt->target, view,
+      if (brw_aux_surface_disabled(brw, mt))
+         aux_usage = ISL_AUX_USAGE_NONE;
+
+      brw_emit_surface_state(brw, mt, mt->target, view, aux_usage,
                              tex_mocs[brw->gen],
                              surf_offset, surf_index,
                              I915_GEM_DOMAIN_SAMPLER, 0);
@@ -686,12 +634,13 @@
    uint32_t size = tObj->BufferSize;
    struct brw_bo *bo = NULL;
    mesa_format format = tObj->_BufferObjectFormat;
-   uint32_t brw_format = brw_isl_format_for_mesa_format(format);
+   const enum isl_format isl_format = brw_isl_format_for_mesa_format(format);
    int texel_size = _mesa_get_format_bytes(format);
 
    if (intel_obj) {
       size = MIN2(size, intel_obj->Base.Size);
-      bo = intel_bufferobj_buffer(brw, intel_obj, tObj->BufferOffset, size);
+      bo = intel_bufferobj_buffer(brw, intel_obj, tObj->BufferOffset, size,
+                                  false);
    }
 
    /* The ARB_texture_buffer_specification says:
@@ -712,14 +661,14 @@
     */
    size = MIN2(size, ctx->Const.MaxTextureBufferSize * (unsigned) texel_size);
 
-   if (brw_format == 0 && format != MESA_FORMAT_RGBA_FLOAT32) {
+   if (isl_format == ISL_FORMAT_UNSUPPORTED) {
       _mesa_problem(NULL, "bad format %s for texture buffer\n",
 		    _mesa_get_format_name(format));
    }
 
    brw_emit_buffer_surface_state(brw, surf_offset, bo,
                                  tObj->BufferOffset,
-                                 brw_format,
+                                 isl_format,
                                  size,
                                  texel_size,
                                  false /* rw */);
@@ -779,7 +728,8 @@
    uint32_t offset_bytes = 4 * offset_dwords;
    struct brw_bo *bo = intel_bufferobj_buffer(brw, intel_bo,
                                              offset_bytes,
-                                             buffer_obj->Size - offset_bytes);
+                                             buffer_obj->Size - offset_bytes,
+                                             true);
    uint32_t *surf = brw_state_batch(brw, 6 * 4, 32, out_offset);
    uint32_t pitch_minus_1 = 4*stride_dwords - 1;
    size_t size_dwords = buffer_obj->Size / 4;
@@ -986,7 +936,7 @@
    struct intel_mipmap_tree *mt = irb->mt;
    uint32_t *surf;
    uint32_t tile_x, tile_y;
-   uint32_t format = 0;
+   enum isl_format format;
    uint32_t offset;
    /* _NEW_BUFFERS */
    mesa_format rb_format = _mesa_get_render_format(ctx, intel_rb_format(irb));
@@ -1006,14 +956,15 @@
 	  * miptree and render into that.
 	  */
 	 intel_renderbuffer_move_to_temp(brw, irb, false);
-	 mt = irb->mt;
+	 assert(irb->align_wa_mt);
+	 mt = irb->align_wa_mt;
       }
    }
 
    surf = brw_state_batch(brw, 6 * 4, 32, &offset);
 
-   format = brw->render_target_format[rb_format];
-   if (unlikely(!brw->format_supported_as_render_target[rb_format])) {
+   format = brw->mesa_to_isl_render_format[rb_format];
+   if (unlikely(!brw->mesa_format_supports_render[rb_format])) {
       _mesa_problem(ctx, "%s: renderbuffer format %s unsupported\n",
                     __func__, _mesa_get_format_name(rb_format));
    }
@@ -1029,10 +980,10 @@
    surf[2] = ((rb->Width - 1) << BRW_SURFACE_WIDTH_SHIFT |
 	      (rb->Height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
 
-   surf[3] = (brw_get_surface_tiling_bits(mt->tiling) |
-	      (mt->pitch - 1) << BRW_SURFACE_PITCH_SHIFT);
+   surf[3] = (brw_get_surface_tiling_bits(mt->surf.tiling) |
+	      (mt->surf.row_pitch - 1) << BRW_SURFACE_PITCH_SHIFT);
 
-   surf[4] = brw_get_surface_num_multisamples(mt->num_samples);
+   surf[4] = brw_get_surface_num_multisamples(mt->surf.samples);
 
    assert(brw->has_surface_tile_offset || (tile_x == 0 && tile_y == 0));
    /* Note that the low bits of these fields are missing, so
@@ -1042,7 +993,8 @@
    assert(tile_y % 2 == 0);
    surf[5] = ((tile_x / 4) << BRW_SURFACE_X_OFFSET_SHIFT |
 	      (tile_y / 2) << BRW_SURFACE_Y_OFFSET_SHIFT |
-	      (mt->valign == 4 ? BRW_SURFACE_VERTICAL_ALIGN_ENABLE : 0));
+	      (mt->surf.image_alignment_el.height == 4 ?
+                  BRW_SURFACE_VERTICAL_ALIGN_ENABLE : 0));
 
    if (brw->gen < 6) {
       /* _NEW_COLOR */
@@ -1144,7 +1096,8 @@
    .dirty = {
       .mesa = _NEW_BUFFERS,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP,
+             BRW_NEW_BLORP |
+             BRW_NEW_FAST_CLEAR_COLOR,
    },
    .emit = update_renderbuffer_surfaces,
 };
@@ -1172,7 +1125,7 @@
          uint32_t *surf_offset = &brw->wm.base.surf_offset[surf_index];
 
          if (irb) {
-            const unsigned format = brw->render_target_format[
+            const enum isl_format format = brw->mesa_to_isl_render_format[
                _mesa_get_render_format(ctx, intel_rb_format(irb))];
             assert(isl_format_supports_sampling(&brw->screen->devinfo,
                                                 format));
@@ -1192,29 +1145,22 @@
                irb->mt->target == GL_TEXTURE_1D_ARRAY ? GL_TEXTURE_2D_ARRAY :
                irb->mt->target;
 
-            /* intel_renderbuffer::mt_layer is expressed in sample units for
-             * the UMS and CMS multisample layouts, but
-             * intel_renderbuffer::layer_count is expressed in units of whole
-             * logical layers regardless of the multisample layout.
-             */
-            const unsigned mt_layer_unit =
-               (irb->mt->msaa_layout == INTEL_MSAA_LAYOUT_UMS ||
-                irb->mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) ?
-               MAX2(irb->mt->num_samples, 1) : 1;
-
             const struct isl_view view = {
                .format = format,
                .base_level = irb->mt_level - irb->mt->first_level,
                .levels = 1,
-               .base_array_layer = irb->mt_layer / mt_layer_unit,
+               .base_array_layer = irb->mt_layer,
                .array_len = irb->layer_count,
                .swizzle = ISL_SWIZZLE_IDENTITY,
                .usage = ISL_SURF_USAGE_TEXTURE_BIT,
             };
 
-            const int flags = brw->draw_aux_buffer_disabled[i] ?
-                                 INTEL_AUX_BUFFER_DISABLED : 0;
-            brw_emit_surface_state(brw, irb->mt, flags, target, view,
+            enum isl_aux_usage aux_usage =
+               intel_miptree_texture_aux_usage(brw, irb->mt, format);
+            if (brw->draw_aux_buffer_disabled[i])
+               aux_usage = ISL_AUX_USAGE_NONE;
+
+            brw_emit_surface_state(brw, irb->mt, target, view, aux_usage,
                                    tex_mocs[brw->gen],
                                    surf_offset, surf_index,
                                    I915_GEM_DOMAIN_SAMPLER, 0);
@@ -1234,6 +1180,7 @@
    .dirty = {
       .mesa = _NEW_BUFFERS,
       .brw = BRW_NEW_BATCH |
+             BRW_NEW_FAST_CLEAR_COLOR |
              BRW_NEW_FRAGMENT_PROGRAM |
              BRW_NEW_FS_PROG_DATA,
    },
@@ -1305,15 +1252,15 @@
     * allows the surface format to be overriden for only the
     * gather4 messages. */
    if (brw->gen < 8) {
-      if (vs && vs->nir->info->uses_texture_gather)
+      if (vs && vs->nir->info.uses_texture_gather)
          update_stage_texture_surfaces(brw, vs, &brw->vs.base, true, 0);
-      if (tcs && tcs->nir->info->uses_texture_gather)
+      if (tcs && tcs->nir->info.uses_texture_gather)
          update_stage_texture_surfaces(brw, tcs, &brw->tcs.base, true, 0);
-      if (tes && tes->nir->info->uses_texture_gather)
+      if (tes && tes->nir->info.uses_texture_gather)
          update_stage_texture_surfaces(brw, tes, &brw->tes.base, true, 0);
-      if (gs && gs->nir->info->uses_texture_gather)
+      if (gs && gs->nir->info.uses_texture_gather)
          update_stage_texture_surfaces(brw, gs, &brw->gs.base, true, 0);
-      if (fs && fs->nir->info->uses_texture_gather)
+      if (fs && fs->nir->info.uses_texture_gather)
          update_stage_texture_surfaces(brw, fs, &brw->wm.base, true, 0);
    }
 
@@ -1330,6 +1277,7 @@
       .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
+             BRW_NEW_FAST_CLEAR_COLOR |
              BRW_NEW_FRAGMENT_PROGRAM |
              BRW_NEW_FS_PROG_DATA |
              BRW_NEW_GEOMETRY_PROGRAM |
@@ -1358,7 +1306,7 @@
     * gather4 messages.
     */
    if (brw->gen < 8) {
-      if (cs && cs->nir->info->uses_texture_gather)
+      if (cs && cs->nir->info.uses_texture_gather)
          update_stage_texture_surfaces(brw, cs, &brw->cs.base, true, 0);
    }
 
@@ -1370,7 +1318,8 @@
       .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
-             BRW_NEW_COMPUTE_PROGRAM,
+             BRW_NEW_COMPUTE_PROGRAM |
+             BRW_NEW_FAST_CLEAR_COLOR,
    },
    .emit = brw_update_cs_texture_surfaces,
 };
@@ -1404,7 +1353,7 @@
          struct brw_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   size);
+                                   size, false);
          brw_create_constant_surface(brw, bo, binding->Offset,
                                      size,
                                      &ubo_surf_offsets[i]);
@@ -1429,13 +1378,15 @@
          struct brw_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   size);
+                                   size, true);
          brw_create_buffer_surface(brw, bo, binding->Offset,
                                    size,
                                    &ssbo_surf_offsets[i]);
       }
    }
 
+   stage_state->push_constants_dirty = true;
+
    if (prog->info.num_ubos || prog->info.num_ssbos)
       brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
 }
@@ -1445,7 +1396,7 @@
 {
    struct gl_context *ctx = &brw->ctx;
    /* _NEW_PROGRAM */
-   struct gl_program *prog = ctx->_Shader->_CurrentFragmentProgram;
+   struct gl_program *prog = ctx->FragmentProgram._Current;
 
    /* BRW_NEW_FS_PROG_DATA */
    brw_upload_ubo_surfaces(brw, prog, &brw->wm.base, brw->wm.base.prog_data);
@@ -1501,8 +1452,10 @@
             &ctx->AtomicBufferBindings[prog->sh.AtomicBuffers[i]->Binding];
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
-         struct brw_bo *bo = intel_bufferobj_buffer(
-            brw, intel_bo, binding->Offset, intel_bo->Base.Size - binding->Offset);
+         struct brw_bo *bo =
+            intel_bufferobj_buffer(brw, intel_bo, binding->Offset,
+                                   intel_bo->Base.Size - binding->Offset,
+                                   true);
 
          brw_emit_buffer_surface_state(brw, &surf_offsets[i], bo,
                                        binding->Offset, ISL_FORMAT_RAW,
@@ -1578,6 +1531,7 @@
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
              BRW_NEW_CS_PROG_DATA |
+             BRW_NEW_FAST_CLEAR_COLOR |
              BRW_NEW_IMAGE_UNITS
    },
    .emit = brw_upload_cs_image_surfaces,
@@ -1587,7 +1541,7 @@
 get_image_format(struct brw_context *brw, mesa_format format, GLenum access)
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   uint32_t hw_format = brw_isl_format_for_mesa_format(format);
+   enum isl_format hw_format = brw_isl_format_for_mesa_format(format);
    if (access == GL_WRITE_ONLY) {
       return hw_format;
    } else if (isl_has_matching_typed_storage_image_format(devinfo, hw_format)) {
@@ -1634,71 +1588,16 @@
    param->stride[0] = _mesa_get_format_bytes(u->_ActualFormat);
 }
 
-static void
-update_texture_image_param(struct brw_context *brw,
-                           struct gl_image_unit *u,
-                           unsigned surface_idx,
-                           struct brw_image_param *param)
+static unsigned
+get_image_num_layers(const struct intel_mipmap_tree *mt, GLenum target,
+                     unsigned level)
 {
-   struct intel_mipmap_tree *mt = intel_texture_object(u->TexObj)->mt;
+   if (target == GL_TEXTURE_CUBE_MAP)
+      return 6;
 
-   update_default_image_param(brw, u, surface_idx, param);
-
-   param->size[0] = minify(mt->logical_width0, u->Level);
-   param->size[1] = minify(mt->logical_height0, u->Level);
-   param->size[2] = (!u->Layered ? 1 :
-                     u->TexObj->Target == GL_TEXTURE_CUBE_MAP ? 6 :
-                     u->TexObj->Target == GL_TEXTURE_3D ?
-                     minify(mt->logical_depth0, u->Level) :
-                     mt->logical_depth0);
-
-   intel_miptree_get_image_offset(mt, u->Level, u->_Layer,
-                                  &param->offset[0],
-                                  &param->offset[1]);
-
-   param->stride[0] = mt->cpp;
-   param->stride[1] = mt->pitch / mt->cpp;
-   param->stride[2] =
-      brw_miptree_get_horizontal_slice_pitch(brw, mt, u->Level);
-   param->stride[3] =
-      brw_miptree_get_vertical_slice_pitch(brw, mt, u->Level);
-
-   if (mt->tiling == I915_TILING_X) {
-      /* An X tile is a rectangular block of 512x8 bytes. */
-      param->tiling[0] = _mesa_logbase2(512 / mt->cpp);
-      param->tiling[1] = _mesa_logbase2(8);
-
-      if (brw->has_swizzling) {
-         /* Right shifts required to swizzle bits 9 and 10 of the memory
-          * address with bit 6.
-          */
-         param->swizzling[0] = 3;
-         param->swizzling[1] = 4;
-      }
-   } else if (mt->tiling == I915_TILING_Y) {
-      /* The layout of a Y-tiled surface in memory isn't really fundamentally
-       * different to the layout of an X-tiled surface, we simply pretend that
-       * the surface is broken up in a number of smaller 16Bx32 tiles, each
-       * one arranged in X-major order just like is the case for X-tiling.
-       */
-      param->tiling[0] = _mesa_logbase2(16 / mt->cpp);
-      param->tiling[1] = _mesa_logbase2(32);
-
-      if (brw->has_swizzling) {
-         /* Right shift required to swizzle bit 9 of the memory address with
-          * bit 6.
-          */
-         param->swizzling[0] = 3;
-      }
-   }
-
-   /* 3D textures are arranged in 2D in memory with 2^lod slices per row.  The
-    * address calculation algorithm (emit_address_calculation() in
-    * brw_fs_surface_builder.cpp) handles this as a sort of tiling with
-    * modulus equal to the LOD.
-    */
-   param->tiling[2] = (u->TexObj->Target == GL_TEXTURE_3D ? u->Level :
-                       0);
+   return target == GL_TEXTURE_3D ?
+      minify(mt->surf.logical_level0_px.depth, level) :
+      mt->surf.logical_level0_px.array_len;
 }
 
 static void
@@ -1729,6 +1628,18 @@
       } else {
          struct intel_texture_object *intel_obj = intel_texture_object(obj);
          struct intel_mipmap_tree *mt = intel_obj->mt;
+         const unsigned num_layers = u->Layered ?
+            get_image_num_layers(mt, obj->Target, u->Level) : 1;
+
+         struct isl_view view = {
+            .format = format,
+            .base_level = obj->MinLevel + u->Level,
+            .levels = 1,
+            .base_array_layer = obj->MinLayer + u->_Layer,
+            .array_len = num_layers,
+            .swizzle = ISL_SWIZZLE_IDENTITY,
+            .usage = ISL_SURF_USAGE_STORAGE_BIT,
+         };
 
          if (format == ISL_FORMAT_RAW) {
             brw_emit_buffer_surface_state(
@@ -1737,34 +1648,21 @@
                access != GL_READ_ONLY);
 
          } else {
-            const unsigned num_layers = (!u->Layered ? 1 :
-                                         obj->Target == GL_TEXTURE_CUBE_MAP ? 6 :
-                                         mt->logical_depth0);
-
-            struct isl_view view = {
-               .format = format,
-               .base_level = obj->MinLevel + u->Level,
-               .levels = 1,
-               .base_array_layer = obj->MinLayer + u->_Layer,
-               .array_len = num_layers,
-               .swizzle = ISL_SWIZZLE_IDENTITY,
-               .usage = ISL_SURF_USAGE_STORAGE_BIT,
-            };
-
             const int surf_index = surf_offset - &brw->wm.base.surf_offset[0];
-            const bool unresolved = intel_miptree_has_color_unresolved(
-                                       mt, view.base_level, view.levels,
-                                       view.base_array_layer, view.array_len);
-            const int flags = unresolved ? 0 : INTEL_AUX_BUFFER_DISABLED;
-            brw_emit_surface_state(brw, mt, flags, mt->target, view,
-                                   tex_mocs[brw->gen],
+            assert(!intel_miptree_has_color_unresolved(mt,
+                                                       view.base_level, 1,
+                                                       view.base_array_layer,
+                                                       view.array_len));
+            brw_emit_surface_state(brw, mt, mt->target, view,
+                                   ISL_AUX_USAGE_NONE, tex_mocs[brw->gen],
                                    surf_offset, surf_index,
                                    I915_GEM_DOMAIN_SAMPLER,
                                    access == GL_READ_ONLY ? 0 :
                                              I915_GEM_DOMAIN_SAMPLER);
          }
 
-         update_texture_image_param(brw, u, surface_idx, param);
+         isl_surf_fill_image_param(&brw->isl_dev, param, &mt->surf, &view);
+         param->surface_idx = surface_idx;
       }
 
    } else {
@@ -1820,6 +1718,7 @@
       .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
+             BRW_NEW_FAST_CLEAR_COLOR |
              BRW_NEW_FRAGMENT_PROGRAM |
              BRW_NEW_FS_PROG_DATA |
              BRW_NEW_IMAGE_UNITS
diff --git a/src/mesa/drivers/dri/i965/gen4_blorp_exec.h b/src/mesa/drivers/dri/i965/gen4_blorp_exec.h
new file mode 100644
index 0000000..764b198
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen4_blorp_exec.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+static inline struct blorp_address
+dynamic_state_address(struct blorp_batch *batch, uint32_t offset)
+{
+   assert(batch->blorp->driver_ctx == batch->driver_batch);
+   struct brw_context *brw = batch->driver_batch;
+   /* Describe `offset` within the batch BO; no write domain is requested. */
+   return (struct blorp_address) {
+      .buffer = brw->batch.bo,
+      .offset = offset,
+      .write_domain = 0,
+      .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
+   };
+}
+
+static inline struct blorp_address
+instruction_state_address(struct blorp_batch *batch, uint32_t offset)
+{
+   assert(batch->blorp->driver_ctx == batch->driver_batch);
+   struct brw_context *brw = batch->driver_batch;
+   /* Describe `offset` within brw->cache.bo; used for kernel pointers. */
+   return (struct blorp_address) {
+      .buffer = brw->cache.bo,
+      .offset = offset,
+      .write_domain = 0,
+      .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
+   };
+}
+
+static struct blorp_address
+blorp_emit_vs_state(struct blorp_batch *batch,
+                    const struct blorp_params *params)
+{
+   assert(batch->blorp->driver_ctx == batch->driver_batch);
+   struct brw_context *brw = batch->driver_batch;
+   /* VS_STATE is emitted with the VS disabled; only URB sizing is set. */
+   uint32_t offset;
+   blorp_emit_dynamic(batch, GENX(VS_STATE), vs, 64, &offset) {
+      vs.Enable = false;
+      vs.URBEntryAllocationSize = brw->urb.vsize - 1;
+#if GEN_GEN == 5
+      vs.NumberofURBEntries = brw->urb.nr_vs_entries >> 2;
+#else
+      vs.NumberofURBEntries = brw->urb.nr_vs_entries;
+#endif
+   }
+
+   return dynamic_state_address(batch, offset);
+}
+
+static struct blorp_address
+blorp_emit_sf_state(struct blorp_batch *batch,
+                    const struct blorp_params *params)
+{
+   assert(batch->blorp->driver_ctx == batch->driver_batch);
+   struct brw_context *brw = batch->driver_batch;
+   const struct brw_sf_prog_data *prog_data = params->sf_prog_data;
+   /* Emit SF_STATE for the blorp SF kernel and return its batch address. */
+   uint32_t offset;
+   blorp_emit_dynamic(batch, GENX(SF_STATE), sf, 64, &offset) {
+#if GEN_GEN == 4
+      sf.KernelStartPointer =
+         instruction_state_address(batch, params->sf_prog_kernel);
+#else /* GEN_GEN != 4: the kernel pointer is written as a plain offset */
+      sf.KernelStartPointer = params->sf_prog_kernel;
+#endif
+      sf.GRFRegisterCount = DIV_ROUND_UP(prog_data->total_grf, 16) - 1;
+      sf.VertexURBEntryReadLength = prog_data->urb_read_length;
+      sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
+      sf.DispatchGRFStartRegisterForURBData = 3;
+
+      sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
+      sf.NumberofURBEntries = brw->urb.nr_sf_entries;
+      /* Cap threads at the SF URB entry count (max 48 on gen5, else 24). */
+#if GEN_GEN == 5
+      sf.MaximumNumberofThreads = MIN2(48, brw->urb.nr_sf_entries) - 1;
+#else
+      sf.MaximumNumberofThreads = MIN2(24, brw->urb.nr_sf_entries) - 1;
+#endif
+
+      sf.ViewportTransformEnable = false;
+
+      sf.CullMode = CULLMODE_NONE;
+   }
+
+   return dynamic_state_address(batch, offset);
+}
+
+static struct blorp_address
+blorp_emit_wm_state(struct blorp_batch *batch,
+                    const struct blorp_params *params)
+{
+   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
+   /* Emit WM_STATE; sampler and kernel fields are set only when present. */
+   uint32_t offset;
+   blorp_emit_dynamic(batch, GENX(WM_STATE), wm, 64, &offset) {
+      if (params->src.enabled) {
+         /* Iron Lake can't do sampler prefetch */
+         wm.SamplerCount = (GEN_GEN != 5);
+         wm.BindingTableEntryCount = 2;
+         uint32_t sampler = blorp_emit_sampler_state(batch, params);
+         wm.SamplerStatePointer = dynamic_state_address(batch, sampler);
+      }
+
+      if (prog_data) {
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            prog_data->base.dispatch_grf_start_reg;
+         wm.SetupURBEntryReadLength = prog_data->num_varying_inputs * 2;
+         wm.SetupURBEntryReadOffset = 0;
+
+         wm.DepthCoefficientURBReadOffset = 1;
+         wm.PixelShaderKillsPixel = prog_data->uses_kill;
+         wm.ThreadDispatchEnable = true;
+         wm.EarlyDepthTestEnable = true;
+
+         wm._8PixelDispatchEnable = prog_data->dispatch_8;
+         wm._16PixelDispatchEnable = prog_data->dispatch_16;
+
+#if GEN_GEN == 4
+         wm.KernelStartPointer0 =
+            instruction_state_address(batch, params->wm_prog_kernel);
+         wm.GRFRegisterCount0 = prog_data->reg_blocks_0;
+#else /* GEN_GEN != 4: plain offsets, and kernel slot 2 is also filled */
+         wm.KernelStartPointer0 = params->wm_prog_kernel;
+         wm.GRFRegisterCount0 = prog_data->reg_blocks_0;
+         wm.KernelStartPointer2 =
+            params->wm_prog_kernel + prog_data->prog_offset_2;
+         wm.GRFRegisterCount2 = prog_data->reg_blocks_2;
+#endif
+      }
+
+      wm.MaximumNumberofThreads =
+         batch->blorp->compiler->devinfo->max_wm_threads - 1;
+   }
+
+   return dynamic_state_address(batch, offset);
+}
+
+static struct blorp_address
+blorp_emit_color_calc_state(struct blorp_batch *batch,
+                            const struct blorp_params *params)
+{
+   uint32_t cc_viewport = blorp_emit_cc_viewport(batch, params);
+   /* The only field set in COLOR_CALC_STATE is the CC viewport pointer. */
+   uint32_t offset;
+   blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
+      cc.CCViewportStatePointer = dynamic_state_address(batch, cc_viewport);
+   }
+
+   return dynamic_state_address(batch, offset);
+}
+
+static void
+blorp_emit_pipeline(struct blorp_batch *batch,
+                    const struct blorp_params *params)
+{
+   assert(batch->blorp->driver_ctx == batch->driver_batch);
+   struct brw_context *brw = batch->driver_batch;
+   /* Emit the URB config, then the VS/SF/WM/CC state pointers packet. */
+   emit_urb_config(batch, params);
+
+   blorp_emit(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
+      pp.PointertoVSState = blorp_emit_vs_state(batch, params);
+      pp.GSEnable = false;
+      pp.ClipEnable = false;
+      pp.PointertoSFState = blorp_emit_sf_state(batch, params);
+      pp.PointertoWMState = blorp_emit_wm_state(batch, params);
+      pp.PointertoColorCalcState = blorp_emit_color_calc_state(batch, params);
+   }
+
+   brw_upload_urb_fence(brw);
+   /* Both packets below are emitted without setting any fields here. */
+   blorp_emit(batch, GENX(CS_URB_STATE), curb);
+   blorp_emit(batch, GENX(CONSTANT_BUFFER), curb);
+}
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
deleted file mode 100644
index 0e0d05e..0000000
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- * Copyright © 2009 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-#include "intel_batchbuffer.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "main/glformats.h"
-#include "main/stencil.h"
-
-static void
-gen6_upload_blend_state(struct brw_context *brw)
-{
-   bool is_buffer_zero_integer_format = false;
-   struct gl_context *ctx = &brw->ctx;
-   struct gen6_blend_state *blend;
-   int b;
-   int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
-   int size;
-
-   /* We need at least one BLEND_STATE written, because we might do
-    * thread dispatch even if _NumColorDrawBuffers is 0 (for example
-    * for computed depth or alpha test), which will do an FB write
-    * with render target 0, which will reference BLEND_STATE[0] for
-    * alpha test enable.
-    */
-   if (nr_draw_buffers == 0)
-      nr_draw_buffers = 1;
-
-   size = sizeof(*blend) * nr_draw_buffers;
-   blend = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
-
-   memset(blend, 0, size);
-
-   for (b = 0; b < nr_draw_buffers; b++) {
-      /* _NEW_BUFFERS */
-      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[b];
-      GLenum rb_type;
-      bool integer;
-
-      if (rb)
-	 rb_type = _mesa_get_format_datatype(rb->Format);
-      else
-	 rb_type = GL_UNSIGNED_NORMALIZED;
-
-      /* Used for implementing the following bit of GL_EXT_texture_integer:
-       *     "Per-fragment operations that require floating-point color
-       *      components, including multisample alpha operations, alpha test,
-       *      blending, and dithering, have no effect when the corresponding
-       *      colors are written to an integer color buffer."
-      */
-      integer = (rb_type == GL_INT || rb_type == GL_UNSIGNED_INT);
-
-      if(b == 0 && integer)
-         is_buffer_zero_integer_format = true;
-
-      /* _NEW_COLOR */
-      if (ctx->Color.ColorLogicOpEnabled) {
-	 /* Floating point RTs should have no effect from LogicOp,
-	  * except for disabling of blending, but other types should.
-	  *
-	  * However, from the Sandy Bridge PRM, Vol 2 Par 1, Section 8.1.11,
-	  * "Logic Ops",
-	  *
-	  *     "Logic Ops are only supported on *_UNORM surfaces (excluding
-	  *      _SRGB variants), otherwise Logic Ops must be DISABLED."
-	  */
-         WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
-                   rb_type != GL_UNSIGNED_NORMALIZED &&
-                   rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
-                   "renderbuffer\n",
-                   _mesa_enum_to_string(ctx->Color.LogicOp),
-                   _mesa_enum_to_string(rb_type));
-	 if (rb_type == GL_UNSIGNED_NORMALIZED) {
-	    blend[b].blend1.logic_op_enable = 1;
-	    blend[b].blend1.logic_op_func =
-	       intel_translate_logic_op(ctx->Color.LogicOp);
-	 }
-      } else if (ctx->Color.BlendEnabled & (1 << b) && !integer &&
-                 !ctx->Color._AdvancedBlendMode) {
-	 GLenum eqRGB = ctx->Color.Blend[b].EquationRGB;
-	 GLenum eqA = ctx->Color.Blend[b].EquationA;
-	 GLenum srcRGB = ctx->Color.Blend[b].SrcRGB;
-	 GLenum dstRGB = ctx->Color.Blend[b].DstRGB;
-	 GLenum srcA = ctx->Color.Blend[b].SrcA;
-	 GLenum dstA = ctx->Color.Blend[b].DstA;
-
-	 if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
-	    srcRGB = dstRGB = GL_ONE;
-	 }
-
-	 if (eqA == GL_MIN || eqA == GL_MAX) {
-	    srcA = dstA = GL_ONE;
-	 }
-
-         /* Due to hardware limitations, the destination may have information
-          * in an alpha channel even when the format specifies no alpha
-          * channel. In order to avoid getting any incorrect blending due to
-          * that alpha channel, coerce the blend factors to values that will
-          * not read the alpha channel, but will instead use the correct
-          * implicit value for alpha.
-          */
-         if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat, GL_TEXTURE_ALPHA_TYPE))
-         {
-            srcRGB = brw_fix_xRGB_alpha(srcRGB);
-            srcA = brw_fix_xRGB_alpha(srcA);
-            dstRGB = brw_fix_xRGB_alpha(dstRGB);
-            dstA = brw_fix_xRGB_alpha(dstA);
-         }
-
-	 blend[b].blend0.dest_blend_factor = brw_translate_blend_factor(dstRGB);
-	 blend[b].blend0.source_blend_factor = brw_translate_blend_factor(srcRGB);
-	 blend[b].blend0.blend_func = brw_translate_blend_equation(eqRGB);
-
-	 blend[b].blend0.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
-	 blend[b].blend0.ia_source_blend_factor = brw_translate_blend_factor(srcA);
-	 blend[b].blend0.ia_blend_func = brw_translate_blend_equation(eqA);
-
-	 blend[b].blend0.blend_enable = 1;
-	 blend[b].blend0.ia_blend_enable = (srcA != srcRGB ||
-					 dstA != dstRGB ||
-					 eqA != eqRGB);
-      }
-
-      /* See section 8.1.6 "Pre-Blend Color Clamping" of the
-       * SandyBridge PRM Volume 2 Part 1 for HW requirements.
-       *
-       * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
-       * clamping in the fragment shader.  For its clamping of
-       * blending, the spec says:
-       *
-       *     "RESOLVED: For fixed-point color buffers, the inputs and
-       *      the result of the blending equation are clamped.  For
-       *      floating-point color buffers, no clamping occurs."
-       *
-       * So, generally, we want clamping to the render target's range.
-       * And, good news, the hardware tables for both pre- and
-       * post-blend color clamping are either ignored, or any are
-       * allowed, or clamping is required but RT range clamping is a
-       * valid option.
-       */
-      blend[b].blend1.pre_blend_clamp_enable = 1;
-      blend[b].blend1.post_blend_clamp_enable = 1;
-      blend[b].blend1.clamp_range = BRW_RENDERTARGET_CLAMPRANGE_FORMAT;
-
-      /* _NEW_COLOR */
-      if (ctx->Color.AlphaEnabled && !integer) {
-	 blend[b].blend1.alpha_test_enable = 1;
-	 blend[b].blend1.alpha_test_func =
-	    intel_translate_compare_func(ctx->Color.AlphaFunc);
-
-      }
-
-      /* _NEW_COLOR */
-      if (ctx->Color.DitherFlag && !integer) {
-	 blend[b].blend1.dither_enable = 1;
-	 blend[b].blend1.y_dither_offset = 0;
-	 blend[b].blend1.x_dither_offset = 0;
-      }
-
-      blend[b].blend1.write_disable_r = !ctx->Color.ColorMask[b][0];
-      blend[b].blend1.write_disable_g = !ctx->Color.ColorMask[b][1];
-      blend[b].blend1.write_disable_b = !ctx->Color.ColorMask[b][2];
-      blend[b].blend1.write_disable_a = !ctx->Color.ColorMask[b][3];
-
-      /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
-       * "If drawbuffer zero is not NONE and the buffer it references has an
-       * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
-       * operations are skipped."
-       */
-      if(!is_buffer_zero_integer_format) {
-         /* _NEW_MULTISAMPLE */
-         blend[b].blend1.alpha_to_coverage =
-            _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage;
-
-	/* From SandyBridge PRM, volume 2 Part 1, section 8.2.3, BLEND_STATE:
-	 * DWord 1, Bit 30 (AlphaToOne Enable):
-	 * "If Dual Source Blending is enabled, this bit must be disabled"
-	 */
-         WARN_ONCE(ctx->Color.Blend[b]._UsesDualSrc &&
-                   _mesa_is_multisample_enabled(ctx) &&
-                   ctx->Multisample.SampleAlphaToOne,
-                   "HW workaround: disabling alpha to one with dual src "
-                   "blending\n");
-	 if (ctx->Color.Blend[b]._UsesDualSrc)
-            blend[b].blend1.alpha_to_one = false;
-	 else
-	    blend[b].blend1.alpha_to_one =
-	       _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToOne;
-
-         blend[b].blend1.alpha_to_coverage_dither = (brw->gen >= 7);
-      }
-      else {
-         blend[b].blend1.alpha_to_coverage = false;
-         blend[b].blend1.alpha_to_one = false;
-      }
-   }
-
-   /* Point the GPU at the new indirect state. */
-   if (brw->gen == 6) {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (4 - 2));
-      OUT_BATCH(brw->cc.blend_state_offset | 1);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_BLEND_STATE_POINTERS << 16 | (2 - 2));
-      OUT_BATCH(brw->cc.blend_state_offset | 1);
-      ADVANCE_BATCH();
-   }
-}
-
-const struct brw_tracked_state gen6_blend_state = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_COLOR |
-              _NEW_MULTISAMPLE,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_STATE_BASE_ADDRESS,
-   },
-   .emit = gen6_upload_blend_state,
-};
-
-static void
-gen6_upload_color_calc_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   struct gen6_color_calc_state *cc;
-
-   cc = brw_state_batch(brw, sizeof(*cc), 64, &brw->cc.state_offset);
-   memset(cc, 0, sizeof(*cc));
-
-   /* _NEW_COLOR */
-   cc->cc0.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
-   UNCLAMPED_FLOAT_TO_UBYTE(cc->cc1.alpha_ref_fi.ui, ctx->Color.AlphaRef);
-
-   if (brw->gen < 9) {
-      /* _NEW_STENCIL */
-      cc->cc0.stencil_ref = _mesa_get_stencil_ref(ctx, 0);
-      cc->cc0.bf_stencil_ref =
-         _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
-   }
-
-   /* _NEW_COLOR */
-   cc->constant_r = ctx->Color.BlendColorUnclamped[0];
-   cc->constant_g = ctx->Color.BlendColorUnclamped[1];
-   cc->constant_b = ctx->Color.BlendColorUnclamped[2];
-   cc->constant_a = ctx->Color.BlendColorUnclamped[3];
-
-   /* Point the GPU at the new indirect state. */
-   if (brw->gen == 6) {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(brw->cc.state_offset | 1);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
-      OUT_BATCH(brw->cc.state_offset | 1);
-      ADVANCE_BATCH();
-   }
-}
-
-const struct brw_tracked_state gen6_color_calc_state = {
-   .dirty = {
-      .mesa = _NEW_COLOR |
-              _NEW_STENCIL,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_CC_STATE |
-             BRW_NEW_STATE_BASE_ADDRESS,
-   },
-   .emit = gen6_upload_color_calc_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index 23d969b..2fffb67 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -88,142 +88,3 @@
    return false;
 }
 
-static void
-upload_clip_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_META_IN_PROGRESS */
-   uint32_t dw1 = brw->meta_in_progress ? 0 : GEN6_CLIP_STATISTICS_ENABLE;
-   uint32_t dw2 = 0;
-
-   /* _NEW_BUFFERS */
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-
-   /* BRW_NEW_FS_PROG_DATA */
-   if (brw_wm_prog_data(brw->wm.base.prog_data)->barycentric_interp_modes &
-       BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) {
-      dw2 |= GEN6_CLIP_NON_PERSPECTIVE_BARYCENTRIC_ENABLE;
-   }
-
-   /* BRW_NEW_VS_PROG_DATA */
-   dw1 |= brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
-
-   if (brw->gen >= 7)
-      dw1 |= GEN7_CLIP_EARLY_CULL;
-
-   if (brw->gen == 7) {
-      /* _NEW_POLYGON */
-      if (ctx->Polygon._FrontBit == _mesa_is_user_fbo(fb))
-         dw1 |= GEN7_CLIP_WINDING_CCW;
-
-      if (ctx->Polygon.CullFlag) {
-         switch (ctx->Polygon.CullFaceMode) {
-         case GL_FRONT:
-            dw1 |= GEN7_CLIP_CULLMODE_FRONT;
-            break;
-         case GL_BACK:
-            dw1 |= GEN7_CLIP_CULLMODE_BACK;
-            break;
-         case GL_FRONT_AND_BACK:
-            dw1 |= GEN7_CLIP_CULLMODE_BOTH;
-            break;
-         default:
-            unreachable("Should not get here: invalid CullFlag");
-         }
-      } else {
-         dw1 |= GEN7_CLIP_CULLMODE_NONE;
-      }
-   }
-
-   if (brw->gen < 8 && !ctx->Transform.DepthClamp)
-      dw2 |= GEN6_CLIP_Z_TEST;
-
-   /* _NEW_LIGHT */
-   if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
-      dw2 |=
-	 (0 << GEN6_CLIP_TRI_PROVOKE_SHIFT) |
-	 (1 << GEN6_CLIP_TRIFAN_PROVOKE_SHIFT) |
-	 (0 << GEN6_CLIP_LINE_PROVOKE_SHIFT);
-   } else {
-      dw2 |=
-	 (2 << GEN6_CLIP_TRI_PROVOKE_SHIFT) |
-	 (2 << GEN6_CLIP_TRIFAN_PROVOKE_SHIFT) |
-	 (1 << GEN6_CLIP_LINE_PROVOKE_SHIFT);
-   }
-
-   /* _NEW_TRANSFORM */
-   dw2 |= (ctx->Transform.ClipPlanesEnabled <<
-           GEN6_USER_CLIP_CLIP_DISTANCES_SHIFT);
-
-   /* Have the hardware use the user clip distance clip test enable bitmask
-    * specified here in 3DSTATE_CLIP rather than the one in 3DSTATE_VS/DS/GS.
-    * We already listen to _NEW_TRANSFORM here, but the other atoms don't
-    * need to other than this.
-    */
-   if (brw->gen >= 8)
-      dw1 |= GEN8_CLIP_FORCE_USER_CLIP_DISTANCE_BITMASK;
-
-   if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
-      dw2 |= GEN6_CLIP_API_D3D;
-   else
-      dw2 |= GEN6_CLIP_API_OGL;
-
-   dw2 |= GEN6_CLIP_GB_TEST;
-
-   /* BRW_NEW_VIEWPORT_COUNT */
-   const unsigned viewport_count = brw->clip.viewport_count;
-
-   /* BRW_NEW_RASTERIZER_DISCARD */
-   if (ctx->RasterDiscard) {
-      dw2 |= GEN6_CLIP_MODE_REJECT_ALL;
-      if (brw->gen == 6) {
-         perf_debug("Rasterizer discard is currently implemented via the "
-                    "clipper; having the GS not write primitives would "
-                    "likely be faster.\n");
-      }
-   }
-
-   uint32_t enable;
-   if (brw->primitive == _3DPRIM_RECTLIST)
-      enable = 0;
-   else
-      enable = GEN6_CLIP_ENABLE;
-
-   /* _NEW_POLYGON,
-    * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
-    */
-   if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
-      dw2 |= GEN6_CLIP_XY_TEST;
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(enable |
-	     GEN6_CLIP_MODE_NORMAL |
-	     dw2);
-   OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
-             U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
-             (_mesa_geometric_layers(fb) > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) |
-             ((viewport_count - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen6_clip_state = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_LIGHT |
-               _NEW_POLYGON |
-               _NEW_TRANSFORM,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_FS_PROG_DATA |
-               BRW_NEW_GS_PROG_DATA |
-               BRW_NEW_VS_PROG_DATA |
-               BRW_NEW_META_IN_PROGRESS |
-               BRW_NEW_PRIMITIVE |
-               BRW_NEW_RASTERIZER_DISCARD |
-               BRW_NEW_TES_PROG_DATA |
-               BRW_NEW_VIEWPORT_COUNT,
-   },
-   .emit = upload_clip_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen6_constant_state.c b/src/mesa/drivers/dri/i965/gen6_constant_state.c
index 40941c1..dd4e224 100644
--- a/src/mesa/drivers/dri/i965/gen6_constant_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_constant_state.c
@@ -25,81 +25,9 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
+#include "intel_buffer_objects.h"
 #include "program/prog_parameter.h"
 
-void
-gen7_upload_constant_state(struct brw_context *brw,
-                           const struct brw_stage_state *stage_state,
-                           bool active, unsigned opcode)
-{
-   uint32_t mocs = brw->gen < 8 ? GEN7_MOCS_L3 : 0;
-
-   /* Disable if the shader stage is inactive or there are no push constants. */
-   active = active && stage_state->push_const_size != 0;
-
-   int dwords = brw->gen >= 8 ? 11 : 7;
-   BEGIN_BATCH(dwords);
-   OUT_BATCH(opcode << 16 | (dwords - 2));
-
-   /* Workaround for SKL+ (we use option #2 until we have a need for more
-    * constant buffers). This comes from the documentation for 3DSTATE_CONSTANT_*
-    *
-    * The driver must ensure The following case does not occur without a flush
-    * to the 3D engine: 3DSTATE_CONSTANT_* with buffer 3 read length equal to
-    * zero committed followed by a 3DSTATE_CONSTANT_* with buffer 0 read length
-    * not equal to zero committed. Possible ways to avoid this condition
-    * include:
-    *     1. always force buffer 3 to have a non zero read length
-    *     2. always force buffer 0 to a zero read length
-    */
-   if (brw->gen >= 9 && active) {
-      OUT_BATCH(0);
-      OUT_BATCH(stage_state->push_const_size);
-   } else {
-      OUT_BATCH(active ? stage_state->push_const_size : 0);
-      OUT_BATCH(0);
-   }
-   /* Pointer to the constant buffer.  Covered by the set of state flags
-    * from gen6_prepare_wm_contants
-    */
-   if (brw->gen >= 9 && active) {
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      /* XXX: When using buffers other than 0, you need to specify the
-       * graphics virtual address regardless of INSPM/debug bits
-       */
-      OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_RENDER, 0,
-                  stage_state->push_const_offset);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-   } else if (brw->gen >= 8) {
-      OUT_BATCH(active ? (stage_state->push_const_offset | mocs) : 0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-   } else {
-      OUT_BATCH(active ? (stage_state->push_const_offset | mocs) : 0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-   }
-
-   ADVANCE_BATCH();
-
-   /* On SKL+ the new constants don't take effect until the next corresponding
-    * 3DSTATE_BINDING_TABLE_POINTER_* command is parsed so we need to ensure
-    * that is sent
-    */
-   if (brw->gen >= 9)
-      brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
-}
-
 /**
  * Creates a streamed BO containing the push constants for the VS or GS on
  * gen6+.
@@ -133,12 +61,17 @@
       if (prog)
          _mesa_load_state_parameters(ctx, prog->Parameters);
 
-      gl_constant_value *param;
       int i;
-
-      param = brw_state_batch(brw,
-                              prog_data->nr_params * sizeof(gl_constant_value),
-                              32, &stage_state->push_const_offset);
+      const int size = prog_data->nr_params * sizeof(gl_constant_value);
+      gl_constant_value *param;
+      if (brw->gen >= 8 || brw->is_haswell) {
+         param = intel_upload_space(brw, size, 32,
+                                    &stage_state->push_const_bo,
+                                    &stage_state->push_const_offset);
+      } else {
+         param = brw_state_batch(brw, size, 32,
+                                 &stage_state->push_const_offset);
+      }
 
       STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
 
@@ -186,4 +119,6 @@
        */
       assert(stage_state->push_const_size <= 32);
    }
+
+   stage_state->push_constants_dirty = true;
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_depth_state.c b/src/mesa/drivers/dri/i965/gen6_depth_state.c
index a77e461..cd04239 100644
--- a/src/mesa/drivers/dri/i965/gen6_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_depth_state.c
@@ -91,7 +91,7 @@
       break;
    case GL_TEXTURE_3D:
       assert(mt);
-      depth = MAX2(mt->logical_depth0, 1);
+      depth = mt->surf.logical_level0_px.depth;
       /* fallthrough */
    default:
       surftype = translate_tex_target(gl_target);
@@ -103,8 +103,8 @@
    lod = irb ? irb->mt_level - irb->mt->first_level : 0;
 
    if (mt) {
-      width = mt->logical_width0;
-      height = mt->logical_height0;
+      width = mt->surf.logical_level0_px.width;
+      height = mt->surf.logical_level0_px.height;
    }
 
    BEGIN_BATCH(7);
@@ -112,13 +112,12 @@
    OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
 
    /* 3DSTATE_DEPTH_BUFFER dw1 */
-   OUT_BATCH((depth_mt ? depth_mt->pitch - 1 : 0) |
+   OUT_BATCH((depth_mt ? depth_mt->surf.row_pitch - 1 : 0) |
              (depthbuffer_format << 18) |
              ((enable_hiz_ss ? 1 : 0) << 21) | /* separate stencil enable */
              ((enable_hiz_ss ? 1 : 0) << 22) | /* hiz enable */
              (BRW_TILEWALK_YMAJOR << 26) |
-             ((depth_mt ? depth_mt->tiling != I915_TILING_NONE : 1)
-              << 27) |
+             (1 << 27) |
              (surftype << 29));
 
    /* 3DSTATE_DEPTH_BUFFER dw2 */
@@ -161,20 +160,15 @@
       /* Emit hiz buffer. */
       if (hiz) {
          assert(depth_mt);
-         struct intel_mipmap_tree *hiz_mt = depth_mt->hiz_buf->mt;
-         uint32_t offset = 0;
 
-         if (hiz_mt->array_layout == GEN6_HIZ_STENCIL) {
-            offset = intel_miptree_get_aligned_offset(
-                        hiz_mt,
-                        hiz_mt->level[lod].level_x,
-                        hiz_mt->level[lod].level_y);
-         }
+         uint32_t offset;
+         isl_surf_get_image_offset_B_tile_sa(&depth_mt->hiz_buf->surf,
+                                             lod, 0, 0, &offset, NULL, NULL);
 
 	 BEGIN_BATCH(3);
 	 OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
-	 OUT_BATCH(depth_mt->hiz_buf->aux_base.pitch - 1);
-	 OUT_RELOC(depth_mt->hiz_buf->aux_base.bo,
+	 OUT_BATCH(depth_mt->hiz_buf->surf.row_pitch - 1);
+	 OUT_RELOC(depth_mt->hiz_buf->bo,
 		   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
 		   offset);
 	 ADVANCE_BATCH();
@@ -188,27 +182,16 @@
 
       /* Emit stencil buffer. */
       if (separate_stencil) {
-         uint32_t offset = 0;
+         assert(stencil_mt->format == MESA_FORMAT_S_UINT8);
+         assert(stencil_mt->surf.size > 0);
 
-         if (stencil_mt->array_layout == GEN6_HIZ_STENCIL) {
-            assert(stencil_mt->format == MESA_FORMAT_S_UINT8);
-
-            /* Note: we can't compute the stencil offset using
-             * intel_region_get_aligned_offset(), because stencil_region
-             * claims that the region is untiled even though it's W tiled.
-             */
-            offset = stencil_mt->level[lod].level_y * stencil_mt->pitch +
-                     stencil_mt->level[lod].level_x * 64;
-         }
+         uint32_t offset;
+         isl_surf_get_image_offset_B_tile_sa(&stencil_mt->surf,
+                                             lod, 0, 0, &offset, NULL, NULL);
 
 	 BEGIN_BATCH(3);
 	 OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
-         /* The stencil buffer has quirky pitch requirements.  From Vol 2a,
-          * 11.5.6.2.1 3DSTATE_STENCIL_BUFFER, field "Surface Pitch":
-          *    The pitch must be set to 2x the value computed based on width, as
-          *    the stencil buffer is stored with two rows interleaved.
-          */
-	 OUT_BATCH(2 * stencil_mt->pitch - 1);
+	 OUT_BATCH(stencil_mt->surf.row_pitch - 1);
 	 OUT_RELOC(stencil_mt->bo,
 		   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
 		   offset);
@@ -234,6 +217,11 @@
    OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 |
              GEN5_DEPTH_CLEAR_VALID |
              (2 - 2));
-   OUT_BATCH(depth_mt ? depth_mt->depth_clear_value : 0);
+   if (depth_mt) {
+      OUT_BATCH(brw_convert_depth_value(depth_mt->format,
+                                        depth_mt->fast_clear_color.f32[0]));
+   } else {
+      OUT_BATCH(0);
+   }
    ADVANCE_BATCH();
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_depthstencil.c b/src/mesa/drivers/dri/i965/gen6_depthstencil.c
deleted file mode 100644
index 0f9626c..0000000
--- a/src/mesa/drivers/dri/i965/gen6_depthstencil.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright © 2009 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *
- */
-
-#include "intel_batchbuffer.h"
-#include "intel_fbo.h"
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_state.h"
-
-static void
-gen6_upload_depth_stencil_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   struct gen6_depth_stencil_state *ds;
-   struct intel_renderbuffer *depth_irb;
-
-   /* _NEW_BUFFERS */
-   depth_irb = intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
-
-   ds = brw_state_batch(brw, sizeof(*ds), 64,
-			&brw->cc.depth_stencil_state_offset);
-   memset(ds, 0, sizeof(*ds));
-
-   /* _NEW_STENCIL | _NEW_BUFFERS */
-   if (ctx->Stencil._Enabled) {
-      int back = ctx->Stencil._BackFace;
-
-      ds->ds0.stencil_enable = 1;
-      ds->ds0.stencil_func =
-	 intel_translate_compare_func(ctx->Stencil.Function[0]);
-      ds->ds0.stencil_fail_op =
-	 intel_translate_stencil_op(ctx->Stencil.FailFunc[0]);
-      ds->ds0.stencil_pass_depth_fail_op =
-	 intel_translate_stencil_op(ctx->Stencil.ZFailFunc[0]);
-      ds->ds0.stencil_pass_depth_pass_op =
-	 intel_translate_stencil_op(ctx->Stencil.ZPassFunc[0]);
-      ds->ds1.stencil_write_mask = ctx->Stencil.WriteMask[0];
-      ds->ds1.stencil_test_mask = ctx->Stencil.ValueMask[0];
-
-      if (ctx->Stencil._TestTwoSide) {
-	 ds->ds0.bf_stencil_enable = 1;
-	 ds->ds0.bf_stencil_func =
-	    intel_translate_compare_func(ctx->Stencil.Function[back]);
-	 ds->ds0.bf_stencil_fail_op =
-	    intel_translate_stencil_op(ctx->Stencil.FailFunc[back]);
-	 ds->ds0.bf_stencil_pass_depth_fail_op =
-	    intel_translate_stencil_op(ctx->Stencil.ZFailFunc[back]);
-	 ds->ds0.bf_stencil_pass_depth_pass_op =
-	    intel_translate_stencil_op(ctx->Stencil.ZPassFunc[back]);
-	 ds->ds1.bf_stencil_write_mask = ctx->Stencil.WriteMask[back];
-	 ds->ds1.bf_stencil_test_mask = ctx->Stencil.ValueMask[back];
-      }
-
-      ds->ds0.stencil_write_enable = ctx->Stencil._WriteEnabled;
-   }
-
-   /* _NEW_DEPTH */
-   if (ctx->Depth.Test && depth_irb) {
-      ds->ds2.depth_test_enable = ctx->Depth.Test;
-      ds->ds2.depth_test_func = intel_translate_compare_func(ctx->Depth.Func);
-      ds->ds2.depth_write_enable = brw_depth_writes_enabled(brw);
-   }
-
-   /* Point the GPU at the new indirect state. */
-   if (brw->gen == 6) {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(brw->cc.depth_stencil_state_offset | 1);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_DEPTH_STENCIL_STATE_POINTERS << 16 | (2 - 2));
-      OUT_BATCH(brw->cc.depth_stencil_state_offset | 1);
-      ADVANCE_BATCH();
-   }
-}
-
-const struct brw_tracked_state gen6_depth_stencil_state = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_DEPTH |
-              _NEW_STENCIL,
-      .brw  = BRW_NEW_BATCH |
-              BRW_NEW_BLORP |
-              BRW_NEW_STATE_BASE_ADDRESS,
-   },
-   .emit = gen6_upload_depth_stencil_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
deleted file mode 100644
index 0cdfcf5..0000000
--- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright © 2009 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "main/shaderapi.h"
-
-static void
-gen6_upload_gs_push_constants(struct brw_context *brw)
-{
-   struct brw_stage_state *stage_state = &brw->gs.base;
-
-   /* BRW_NEW_GEOMETRY_PROGRAM */
-   const struct brw_program *gp = brw_program_const(brw->geometry_program);
-
-   if (gp) {
-      /* BRW_NEW_GS_PROG_DATA */
-      struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
-
-      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_GEOMETRY);
-      gen6_upload_push_constants(brw, &gp->program, prog_data, stage_state);
-   }
-
-   if (brw->gen >= 7)
-      gen7_upload_constant_state(brw, stage_state, gp, _3DSTATE_CONSTANT_GS);
-}
-
-const struct brw_tracked_state gen6_gs_push_constants = {
-   .dirty = {
-      .mesa  = _NEW_PROGRAM_CONSTANTS |
-               _NEW_TRANSFORM,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_GEOMETRY_PROGRAM |
-               BRW_NEW_GS_PROG_DATA |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION,
-   },
-   .emit = gen6_upload_gs_push_constants,
-};
-
-static void
-upload_gs_state_for_tf(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-
-   BEGIN_BATCH(7);
-   OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
-   OUT_BATCH(brw->ff_gs.prog_offset);
-   OUT_BATCH(GEN6_GS_SPF_MODE | GEN6_GS_VECTOR_MASK_ENABLE);
-   OUT_BATCH(0); /* no scratch space */
-   OUT_BATCH((2 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
-             (brw->ff_gs.prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT));
-   OUT_BATCH(((devinfo->max_gs_threads - 1) << GEN6_GS_MAX_THREADS_SHIFT) |
-             GEN6_GS_STATISTICS_ENABLE |
-             GEN6_GS_SO_STATISTICS_ENABLE |
-             GEN6_GS_RENDERING_ENABLE);
-   OUT_BATCH(GEN6_GS_SVBI_PAYLOAD_ENABLE |
-             GEN6_GS_SVBI_POSTINCREMENT_ENABLE |
-             (brw->ff_gs.prog_data->svbi_postincrement_value <<
-              GEN6_GS_SVBI_POSTINCREMENT_VALUE_SHIFT) |
-             GEN6_GS_ENABLE);
-   ADVANCE_BATCH();
-}
-
-static void
-upload_gs_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   /* BRW_NEW_GEOMETRY_PROGRAM */
-   bool active = brw->geometry_program;
-   /* BRW_NEW_GS_PROG_DATA */
-   const struct brw_stage_state *stage_state = &brw->gs.base;
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(stage_state->prog_data);
-
-   if (!active || stage_state->push_const_size == 0) {
-      /* Disable the push constant buffers. */
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_CONSTANT_GS << 16 | (5 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_CONSTANT_GS << 16 |
-		GEN6_CONSTANT_BUFFER_0_ENABLE |
-		(5 - 2));
-      /* Pointer to the GS constant buffer.  Covered by the set of
-       * state flags from gen6_upload_vs_constants
-       */
-      OUT_BATCH(stage_state->push_const_offset +
-                stage_state->push_const_size - 1);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   if (active) {
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
-      OUT_BATCH(stage_state->prog_offset);
-
-      /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
-       * was previously done for gen6.
-       *
-       * TODO: test with both disabled to see if the HW is behaving
-       * as expected, like in gen7.
-       */
-      OUT_BATCH(GEN6_GS_SPF_MODE | GEN6_GS_VECTOR_MASK_ENABLE |
-                ((ALIGN(stage_state->sampler_count, 4)/4) <<
-                 GEN6_GS_SAMPLER_COUNT_SHIFT) |
-                ((prog_data->binding_table.size_bytes / 4) <<
-                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-
-      if (prog_data->total_scratch) {
-         OUT_RELOC(stage_state->scratch_bo,
-                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(stage_state->per_thread_scratch) - 11);
-      } else {
-         OUT_BATCH(0); /* no scratch space */
-      }
-
-      OUT_BATCH((vue_prog_data->urb_read_length <<
-                 GEN6_GS_URB_READ_LENGTH_SHIFT) |
-                (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) |
-                (prog_data->dispatch_grf_start_reg <<
-                 GEN6_GS_DISPATCH_START_GRF_SHIFT));
-
-      OUT_BATCH(((devinfo->max_gs_threads - 1) << GEN6_GS_MAX_THREADS_SHIFT) |
-                GEN6_GS_STATISTICS_ENABLE |
-                GEN6_GS_SO_STATISTICS_ENABLE |
-                GEN6_GS_RENDERING_ENABLE);
-
-      if (brw->geometry_program->info.has_transform_feedback_varyings) {
-         /* GEN6_GS_REORDER is equivalent to GEN7_GS_REORDER_TRAILING
-          * in gen7. SNB and IVB specs are the same regarding the reordering of
-          * TRISTRIP/TRISTRIP_REV vertices and triangle orientation, so we do
-          * the same thing in both generations. For more details, see the
-          * comment in gen7_gs_state.c
-          */
-         OUT_BATCH(GEN6_GS_REORDER |
-                   GEN6_GS_SVBI_PAYLOAD_ENABLE |
-                   GEN6_GS_ENABLE);
-      } else {
-         OUT_BATCH(GEN6_GS_REORDER | GEN6_GS_ENABLE);
-      }
-      ADVANCE_BATCH();
-   } else if (brw->ff_gs.prog_active) {
-      /* In gen6, transform feedback for the VS stage is done with an ad-hoc GS
-       * program. This function provides the needed 3DSTATE_GS for this.
-       */
-      upload_gs_state_for_tf(brw);
-   } else {
-      /* No GS function required */
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
-      OUT_BATCH(0); /* prog_bo */
-      OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
-                (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-      OUT_BATCH(0); /* scratch space base offset */
-      OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
-                (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) |
-                (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT));
-      OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) |
-                GEN6_GS_STATISTICS_ENABLE |
-                GEN6_GS_RENDERING_ENABLE);
-                OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-   brw->gs.enabled = active;
-}
-
-const struct brw_tracked_state gen6_gs_state = {
-   .dirty = {
-      .mesa  = _NEW_PROGRAM_CONSTANTS |
-               _NEW_TRANSFORM,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_FF_GS_PROG_DATA |
-               BRW_NEW_GEOMETRY_PROGRAM |
-               BRW_NEW_GS_PROG_DATA |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION,
-   },
-   .emit = upload_gs_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index a59ffec..bfa84fb 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -118,106 +118,3 @@
    memcpy(ctx->Const.SampleMap8x, map_8x, sizeof(map_8x));
    memcpy(ctx->Const.SampleMap16x, map_16x, sizeof(map_16x));
 }
-
-/**
- * 3DSTATE_MULTISAMPLE
- */
-void
-gen6_emit_3dstate_multisample(struct brw_context *brw,
-                              unsigned num_samples)
-{
-   uint32_t number_of_multisamples = 0;
-   uint32_t sample_positions_3210 = 0;
-   uint32_t sample_positions_7654 = 0;
-
-   assert(brw->gen < 8);
-
-   switch (num_samples) {
-   case 0:
-   case 1:
-      number_of_multisamples = MS_NUMSAMPLES_1;
-      break;
-   case 4:
-      number_of_multisamples = MS_NUMSAMPLES_4;
-      sample_positions_3210 = brw_multisample_positions_4x;
-      break;
-   case 8:
-      number_of_multisamples = MS_NUMSAMPLES_8;
-      sample_positions_3210 = brw_multisample_positions_8x[0];
-      sample_positions_7654 = brw_multisample_positions_8x[1];
-      break;
-   default:
-      unreachable("Unrecognized num_samples in gen6_emit_3dstate_multisample");
-   }
-
-   int len = brw->gen >= 7 ? 4 : 3;
-   BEGIN_BATCH(len);
-   OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (len - 2));
-   OUT_BATCH(MS_PIXEL_LOCATION_CENTER | number_of_multisamples);
-   OUT_BATCH(sample_positions_3210);
-   if (brw->gen >= 7)
-      OUT_BATCH(sample_positions_7654);
-   ADVANCE_BATCH();
-}
-
-unsigned
-gen6_determine_sample_mask(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   float coverage = 1.0f;
-   float coverage_invert = false;
-   unsigned sample_mask = ~0u;
-
-   /* BRW_NEW_NUM_SAMPLES */
-   unsigned num_samples = brw->num_samples;
-
-   if (_mesa_is_multisample_enabled(ctx)) {
-      if (ctx->Multisample.SampleCoverage) {
-         coverage = ctx->Multisample.SampleCoverageValue;
-         coverage_invert = ctx->Multisample.SampleCoverageInvert;
-      }
-      if (ctx->Multisample.SampleMask) {
-         sample_mask = ctx->Multisample.SampleMaskValue;
-      }
-   }
-
-   if (num_samples > 1) {
-      int coverage_int = (int) (num_samples * coverage + 0.5f);
-      uint32_t coverage_bits = (1 << coverage_int) - 1;
-      if (coverage_invert)
-         coverage_bits ^= (1 << num_samples) - 1;
-      return coverage_bits & sample_mask;
-   } else {
-      return 1;
-   }
-}
-
-/**
- * 3DSTATE_SAMPLE_MASK
- */
-void
-gen6_emit_3dstate_sample_mask(struct brw_context *brw, unsigned mask)
-{
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_SAMPLE_MASK << 16 | (2 - 2));
-   OUT_BATCH(mask);
-   ADVANCE_BATCH();
-}
-
-static void
-upload_multisample_state(struct brw_context *brw)
-{
-   /* BRW_NEW_NUM_SAMPLES */
-   gen6_emit_3dstate_multisample(brw, brw->num_samples);
-   gen6_emit_3dstate_sample_mask(brw, gen6_determine_sample_mask(brw));
-}
-
-const struct brw_tracked_state gen6_multisample_state = {
-   .dirty = {
-      .mesa = _NEW_MULTISAMPLE,
-      .brw = BRW_NEW_BLORP |
-             BRW_NEW_CONTEXT |
-             BRW_NEW_NUM_SAMPLES,
-   },
-   .emit = upload_multisample_state
-};
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index 35b8859..8e639cf 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -60,8 +60,17 @@
     */
    if (brw->ctx.Extensions.ARB_query_buffer_object &&
        brw_is_query_pipelined(query)) {
-      brw_emit_pipe_control_write(brw,
-                                  PIPE_CONTROL_WRITE_IMMEDIATE,
+      unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
+
+      if (available) {
+         /* Order available *after* the query results. */
+         flags |= PIPE_CONTROL_FLUSH_ENABLE;
+      } else {
+         /* Make it unavailable *before* any pipelined reads. */
+         flags |= PIPE_CONTROL_CS_STALL;
+      }
+
+      brw_emit_pipe_control_write(brw, flags,
                                   query->bo, 2 * sizeof(uint64_t),
                                   available);
    }
@@ -212,8 +221,7 @@
    if (query->bo == NULL)
       return;
 
-   brw_bo_map(brw, query->bo, false);
-   uint64_t *results = query->bo->virtual;
+   uint64_t *results = brw_bo_map(brw, query->bo, MAP_READ);
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED:
       /* The query BO contains the starting and ending timestamps.
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
deleted file mode 100644
index 3407f6a..0000000
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright © 2009 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "main/fbobject.h"
-#include "main/framebuffer.h"
-
-static void
-gen6_upload_scissor_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   struct gen6_scissor_rect *scissor;
-   uint32_t scissor_state_offset;
-   const unsigned int fb_width= _mesa_geometric_width(ctx->DrawBuffer);
-   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
-
-   /* BRW_NEW_VIEWPORT_COUNT */
-   const unsigned viewport_count = brw->clip.viewport_count;
-
-   scissor = brw_state_batch(brw, sizeof(*scissor) * viewport_count, 32,
-                             &scissor_state_offset);
-
-   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
-
-   /* The scissor only needs to handle the intersection of drawable and
-    * scissor rect.  Clipping to the boundaries of static shared buffers
-    * for front/back/depth is covered by looping over cliprects in brw_draw.c.
-    *
-    * Note that the hardware's coordinates are inclusive, while Mesa's min is
-    * inclusive but max is exclusive.
-    */
-   for (unsigned i = 0; i < viewport_count; i++) {
-      int bbox[4];
-
-      bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
-      bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
-      bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
-      bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
-      _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
-
-      if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
-         /* If the scissor was out of bounds and got clamped to 0 width/height
-          * at the bounds, the subtraction of 1 from maximums could produce a
-          * negative number and thus not clip anything.  Instead, just provide
-          * a min > max scissor inside the bounds, which produces the expected
-          * no rendering.
-          */
-         scissor[i].xmin = 1;
-         scissor[i].xmax = 0;
-         scissor[i].ymin = 1;
-         scissor[i].ymax = 0;
-      } else if (render_to_fbo) {
-         /* texmemory: Y=0=bottom */
-         scissor[i].xmin = bbox[0];
-         scissor[i].xmax = bbox[1] - 1;
-         scissor[i].ymin = bbox[2];
-         scissor[i].ymax = bbox[3] - 1;
-      }
-      else {
-         /* memory: Y=0=top */
-         scissor[i].xmin = bbox[0];
-         scissor[i].xmax = bbox[1] - 1;
-         scissor[i].ymin = fb_height - bbox[3];
-         scissor[i].ymax = fb_height - bbox[2] - 1;
-      }
-   }
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_SCISSOR_STATE_POINTERS << 16 | (2 - 2));
-   OUT_BATCH(scissor_state_offset);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen6_scissor_state = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_SCISSOR |
-              _NEW_VIEWPORT,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VIEWPORT_COUNT,
-   },
-   .emit = gen6_upload_scissor_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
deleted file mode 100644
index 0f118b6..0000000
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- * Copyright © 2009 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-#include "compiler/nir/nir.h"
-#include "main/macros.h"
-#include "main/fbobject.h"
-#include "main/framebuffer.h"
-#include "intel_batchbuffer.h"
-
-/**
- * Determine the appropriate attribute override value to store into the
- * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
- * override value contains two pieces of information: the location of the
- * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
- * flag indicating whether to "swizzle" the attribute based on the direction
- * the triangle is facing.
- *
- * If an attribute is "swizzled", then the given VUE location is used for
- * front-facing triangles, and the VUE location that immediately follows is
- * used for back-facing triangles.  We use this to implement the mapping from
- * gl_FrontColor/gl_BackColor to gl_Color.
- *
- * urb_entry_read_offset is the offset into the VUE at which the SF unit is
- * being instructed to begin reading attribute data.  It can be set to a
- * nonzero value to prevent the SF unit from wasting time reading elements of
- * the VUE that are not needed by the fragment shader.  It is measured in
- * 256-bit increments.
- */
-static uint32_t
-get_attr_override(const struct brw_vue_map *vue_map, int urb_entry_read_offset,
-                  int fs_attr, bool two_side_color, uint32_t *max_source_attr)
-{
-   /* Find the VUE slot for this attribute. */
-   int slot = vue_map->varying_to_slot[fs_attr];
-
-   /* Viewport and Layer are stored in the VUE header.  We need to override
-    * them to zero if earlier stages didn't write them, as GL requires that
-    * they read back as zero when not explicitly set.
-    */
-   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
-      unsigned override =
-         ATTRIBUTE_0_OVERRIDE_X | ATTRIBUTE_0_OVERRIDE_W |
-         ATTRIBUTE_CONST_0000 << ATTRIBUTE_0_CONST_SOURCE_SHIFT;
-
-      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
-         override |= ATTRIBUTE_0_OVERRIDE_Y;
-      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
-         override |= ATTRIBUTE_0_OVERRIDE_Z;
-
-      return override;
-   }
-
-   /* If there was only a back color written but not front, use back
-    * as the color instead of undefined
-    */
-   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
-      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
-   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
-      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
-
-   if (slot == -1) {
-      /* This attribute does not exist in the VUE--that means that the vertex
-       * shader did not write to it.  This means that either:
-       *
-       * (a) This attribute is a texture coordinate, and it is going to be
-       * replaced with point coordinates (as a consequence of a call to
-       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
-       * hardware will ignore whatever attribute override we supply.
-       *
-       * (b) This attribute is read by the fragment shader but not written by
-       * the vertex shader, so its value is undefined.  Therefore the
-       * attribute override we supply doesn't matter.
-       *
-       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
-       * previous shader stage.
-       *
-       * Note that we don't have to worry about the cases where the attribute
-       * is gl_PointCoord or is undergoing point sprite coordinate
-       * replacement, because in those cases, this function isn't called.
-       *
-       * In case (c), we need to program the attribute overrides so that the
-       * primitive ID will be stored in this slot.  In every other case, the
-       * attribute override we supply doesn't matter.  So just go ahead and
-       * program primitive ID in every case.
-       */
-      return (ATTRIBUTE_0_OVERRIDE_W |
-              ATTRIBUTE_0_OVERRIDE_Z |
-              ATTRIBUTE_0_OVERRIDE_Y |
-              ATTRIBUTE_0_OVERRIDE_X |
-              (ATTRIBUTE_CONST_PRIM_ID << ATTRIBUTE_0_CONST_SOURCE_SHIFT));
-   }
-
-   /* Compute the location of the attribute relative to urb_entry_read_offset.
-    * Each increment of urb_entry_read_offset represents a 256-bit value, so
-    * it counts for two 128-bit VUE slots.
-    */
-   int source_attr = slot - 2 * urb_entry_read_offset;
-   assert(source_attr >= 0 && source_attr < 32);
-
-   /* If we are doing two-sided color, and the VUE slot following this one
-    * represents a back-facing color, then we need to instruct the SF unit to
-    * do back-facing swizzling.
-    */
-   bool swizzling = two_side_color &&
-      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
-        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
-       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
-        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
-
-   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
-   if (*max_source_attr < source_attr + swizzling)
-      *max_source_attr = source_attr + swizzling;
-
-   if (swizzling) {
-      return source_attr |
-         (ATTRIBUTE_SWIZZLE_INPUTATTR_FACING << ATTRIBUTE_SWIZZLE_SHIFT);
-   }
-
-   return source_attr;
-}
-
-
-/**
- * Create the mapping from the FS inputs we produce to the previous pipeline
- * stage (GS or VS) outputs they source from.
- */
-void
-calculate_attr_overrides(const struct brw_context *brw,
-                         uint16_t *attr_overrides,
-                         uint32_t *point_sprite_enables,
-                         uint32_t *urb_entry_read_length,
-                         uint32_t *urb_entry_read_offset)
-{
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *wm_prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-   uint32_t max_source_attr = 0;
-
-   *point_sprite_enables = 0;
-
-   *urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
-
-   /* BRW_NEW_FRAGMENT_PROGRAM
-    *
-    * If the fragment shader reads VARYING_SLOT_LAYER, then we need to pass in
-    * the full vertex header.  Otherwise, we can program the SF to start
-    * reading at an offset of 1 (2 varying slots) to skip unnecessary data:
-    * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
-    * - VARYING_SLOT_{PSIZ,LAYER} and VARYING_SLOT_POS on gen6+
-    */
-
-   bool fs_needs_vue_header = brw->fragment_program->info.inputs_read &
-      (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
-
-   *urb_entry_read_offset = fs_needs_vue_header ? 0 : 1;
-
-   /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
-    * description of dw10 Point Sprite Texture Coordinate Enable:
-    *
-    * "This field must be programmed to zero when non-point primitives
-    * are rendered."
-    *
-    * The SandyBridge PRM doesn't explicitly say that point sprite enables
-    * must be programmed to zero when rendering non-point primitives, but
-    * the IvyBridge PRM does, and if we don't, we get garbage.
-    *
-    * This is not required on Haswell, as the hardware ignores this state
-    * when drawing non-points -- although we do still need to be careful to
-    * correctly set the attr overrides.
-    *
-    * _NEW_POLYGON
-    * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
-    */
-   bool drawing_points = brw_is_drawing_points(brw);
-
-   /* Initialize all the attr_overrides to 0.  In the loop below we'll modify
-    * just the ones that correspond to inputs used by the fs.
-    */
-   memset(attr_overrides, 0, 16*sizeof(*attr_overrides));
-
-   for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
-      int input_index = wm_prog_data->urb_setup[attr];
-
-      if (input_index < 0)
-	 continue;
-
-      /* _NEW_POINT */
-      bool point_sprite = false;
-      if (drawing_points) {
-         if (brw->ctx.Point.PointSprite &&
-             (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
-             (brw->ctx.Point.CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
-            point_sprite = true;
-         }
-
-         if (attr == VARYING_SLOT_PNTC)
-            point_sprite = true;
-
-         if (point_sprite)
-            *point_sprite_enables |= (1 << input_index);
-      }
-
-      /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
-      uint16_t attr_override = point_sprite ? 0 :
-         get_attr_override(&brw->vue_map_geom_out,
-			   *urb_entry_read_offset, attr,
-                           brw->ctx.VertexProgram._TwoSideEnabled,
-                           &max_source_attr);
-
-      /* The hardware can only do the overrides on 16 overrides at a
-       * time, and the other up to 16 have to be lined up so that the
-       * input index = the output index.  We'll need to do some
-       * tweaking to make sure that's the case.
-       */
-      if (input_index < 16)
-         attr_overrides[input_index] = attr_override;
-      else
-         assert(attr_override == input_index);
-   }
-
-   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
-    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
-    *
-    * "This field should be set to the minimum length required to read the
-    *  maximum source attribute.  The maximum source attribute is indicated
-    *  by the maximum value of the enabled Attribute # Source Attribute if
-    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
-    *  enable is not set.
-    *  read_length = ceiling((max_source_attr + 1) / 2)
-    *
-    *  [errata] Corruption/Hang possible if length programmed larger than
-    *  recommended"
-    *
-    * Similar text exists for Ivy Bridge.
-    */
-   *urb_entry_read_length = ALIGN(max_source_attr + 1, 2) / 2;
-}
-
-
-static void
-upload_sf_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *wm_prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-   uint32_t num_outputs = wm_prog_data->num_varying_inputs;
-   uint32_t dw1, dw2, dw3, dw4;
-   uint32_t point_sprite_enables;
-   int i;
-   /* _NEW_BUFFER */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
-
-   float point_size;
-   uint16_t attr_overrides[16];
-   uint32_t point_sprite_origin;
-
-   dw1 = GEN6_SF_SWIZZLE_ENABLE | num_outputs << GEN6_SF_NUM_OUTPUTS_SHIFT;
-   dw2 = GEN6_SF_STATISTICS_ENABLE;
-   dw3 = GEN6_SF_SCISSOR_ENABLE | GEN6_SF_LINE_AA_MODE_TRUE;
-   dw4 = 0;
-
-   if (brw->sf.viewport_transform_enable)
-       dw2 |= GEN6_SF_VIEWPORT_TRANSFORM_ENABLE;
-
-   /* _NEW_POLYGON */
-   if (ctx->Polygon._FrontBit == render_to_fbo)
-      dw2 |= GEN6_SF_WINDING_CCW;
-
-   if (ctx->Polygon.OffsetFill)
-       dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_SOLID;
-
-   if (ctx->Polygon.OffsetLine)
-       dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_WIREFRAME;
-
-   if (ctx->Polygon.OffsetPoint)
-       dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_POINT;
-
-   switch (ctx->Polygon.FrontMode) {
-   case GL_FILL:
-       dw2 |= GEN6_SF_FRONT_SOLID;
-       break;
-
-   case GL_LINE:
-       dw2 |= GEN6_SF_FRONT_WIREFRAME;
-       break;
-
-   case GL_POINT:
-       dw2 |= GEN6_SF_FRONT_POINT;
-       break;
-
-   default:
-       unreachable("not reached");
-   }
-
-   switch (ctx->Polygon.BackMode) {
-   case GL_FILL:
-       dw2 |= GEN6_SF_BACK_SOLID;
-       break;
-
-   case GL_LINE:
-       dw2 |= GEN6_SF_BACK_WIREFRAME;
-       break;
-
-   case GL_POINT:
-       dw2 |= GEN6_SF_BACK_POINT;
-       break;
-
-   default:
-       unreachable("not reached");
-   }
-
-   /* _NEW_POLYGON */
-   if (ctx->Polygon.CullFlag) {
-      switch (ctx->Polygon.CullFaceMode) {
-      case GL_FRONT:
-	 dw3 |= GEN6_SF_CULL_FRONT;
-	 break;
-      case GL_BACK:
-	 dw3 |= GEN6_SF_CULL_BACK;
-	 break;
-      case GL_FRONT_AND_BACK:
-	 dw3 |= GEN6_SF_CULL_BOTH;
-	 break;
-      default:
-	 unreachable("not reached");
-      }
-   } else {
-      dw3 |= GEN6_SF_CULL_NONE;
-   }
-
-   /* _NEW_LINE */
-   {
-      uint32_t line_width_u3_7 = brw_get_line_width(brw);
-      dw3 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT;
-   }
-   if (ctx->Line.SmoothFlag) {
-      dw3 |= GEN6_SF_LINE_AA_ENABLE;
-      dw3 |= GEN6_SF_LINE_END_CAP_WIDTH_1_0;
-   }
-   /* _NEW_MULTISAMPLE */
-   if (multisampled_fbo && ctx->Multisample.Enabled)
-      dw3 |= GEN6_SF_MSRAST_ON_PATTERN;
-
-   /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
-   if (use_state_point_size(brw))
-      dw4 |= GEN6_SF_USE_STATE_POINT_WIDTH;
-
-   /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
-   point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
-
-   /* Clamp to the hardware limits and convert to fixed point */
-   dw4 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3);
-
-   /*
-    * Window coordinates in an FBO are inverted, which means point
-    * sprite origin must be inverted, too.
-    */
-   if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
-      point_sprite_origin = GEN6_SF_POINT_SPRITE_LOWERLEFT;
-   } else {
-      point_sprite_origin = GEN6_SF_POINT_SPRITE_UPPERLEFT;
-   }
-   dw1 |= point_sprite_origin;
-
-   /* _NEW_LIGHT */
-   if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
-      dw4 |=
-	 (2 << GEN6_SF_TRI_PROVOKE_SHIFT) |
-	 (2 << GEN6_SF_TRIFAN_PROVOKE_SHIFT) |
-	 (1 << GEN6_SF_LINE_PROVOKE_SHIFT);
-   } else {
-      dw4 |=
-	 (1 << GEN6_SF_TRIFAN_PROVOKE_SHIFT);
-   }
-
-   /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
-    * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
-    */
-   uint32_t urb_entry_read_length;
-   uint32_t urb_entry_read_offset;
-   calculate_attr_overrides(brw, attr_overrides, &point_sprite_enables,
-                            &urb_entry_read_length, &urb_entry_read_offset);
-   dw1 |= (urb_entry_read_length << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT |
-           urb_entry_read_offset << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT);
-
-   BEGIN_BATCH(20);
-   OUT_BATCH(_3DSTATE_SF << 16 | (20 - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(dw2);
-   OUT_BATCH(dw3);
-   OUT_BATCH(dw4);
-   OUT_BATCH_F(ctx->Polygon.OffsetUnits * 2); /* constant.  copied from gen4 */
-   OUT_BATCH_F(ctx->Polygon.OffsetFactor); /* scale */
-   OUT_BATCH_F(ctx->Polygon.OffsetClamp); /* global depth offset clamp */
-   for (i = 0; i < 8; i++) {
-      OUT_BATCH(attr_overrides[i * 2] | attr_overrides[i * 2 + 1] << 16);
-   }
-   OUT_BATCH(point_sprite_enables); /* dw16 */
-   OUT_BATCH(wm_prog_data->flat_inputs);
-   OUT_BATCH(0); /* wrapshortest enables 0-7 */
-   OUT_BATCH(0); /* wrapshortest enables 8-15 */
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen6_sf_state = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_LIGHT |
-               _NEW_LINE |
-               _NEW_MULTISAMPLE |
-               _NEW_POINT |
-               _NEW_POLYGON |
-               _NEW_PROGRAM,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_FRAGMENT_PROGRAM |
-               BRW_NEW_FS_PROG_DATA |
-               BRW_NEW_GS_PROG_DATA |
-               BRW_NEW_PRIMITIVE |
-               BRW_NEW_TES_PROG_DATA |
-               BRW_NEW_VUE_MAP_GEOM_OUT,
-   },
-   .emit = upload_sf_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index 436775a..b4824b6 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -247,8 +247,7 @@
    if (unlikely(brw->perf_debug && brw_bo_busy(obj->prim_count_bo)))
       perf_debug("Stalling for # of transform feedback primitives written.\n");
 
-   brw_bo_map(brw, obj->prim_count_bo, false);
-   uint64_t *prim_counts = obj->prim_count_bo->virtual;
+   uint64_t *prim_counts = brw_bo_map(brw, obj->prim_count_bo, MAP_READ);
 
    assert(obj->prim_count_buffer_index % (2 * streams) == 0);
    int pairs = obj->prim_count_buffer_index / (2 * streams);
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
deleted file mode 100644
index 41cc459..0000000
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright © 2009 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "main/fbobject.h"
-#include "main/framebuffer.h"
-#include "main/viewport.h"
-
-void
-brw_calculate_guardband_size(const struct gen_device_info *devinfo,
-                             uint32_t fb_width, uint32_t fb_height,
-                             float m00, float m11, float m30, float m31,
-                             float *xmin, float *xmax,
-                             float *ymin, float *ymax)
-{
-   /* According to the "Vertex X,Y Clamping and Quantization" section of the
-    * Strips and Fans documentation:
-    *
-    * "The vertex X and Y screen-space coordinates are also /clamped/ to the
-    *  fixed-point "guardband" range supported by the rasterization hardware"
-    *
-    * and
-    *
-    * "In almost all circumstances, if an object’s vertices are actually
-    *  modified by this clamping (i.e., had X or Y coordinates outside of
-    *  the guardband extent the rendered object will not match the intended
-    *  result.  Therefore software should take steps to ensure that this does
-    *  not happen - e.g., by clipping objects such that they do not exceed
-    *  these limits after the Drawing Rectangle is applied."
-    *
-    * I believe the fundamental restriction is that the rasterizer (in
-    * the SF/WM stages) have a limit on the number of pixels that can be
-    * rasterized.  We need to ensure any coordinates beyond the rasterizer
-    * limit are handled by the clipper.  So effectively that limit becomes
-    * the clipper's guardband size.
-    *
-    * It goes on to say:
-    *
-    * "In addition, in order to be correctly rendered, objects must have a
-    *  screenspace bounding box not exceeding 8K in the X or Y direction.
-    *  This additional restriction must also be comprehended by software,
-    *  i.e., enforced by use of clipping."
-    *
-    * This makes no sense.  Gen7+ hardware supports 16K render targets,
-    * and you definitely need to be able to draw polygons that fill the
-    * surface.  Our assumption is that the rasterizer was limited to 8K
-    * on Sandybridge, which only supports 8K surfaces, and it was actually
-    * increased to 16K on Ivybridge and later.
-    *
-    * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
-    */
-   const float gb_size = devinfo->gen >= 7 ? 16384.0f : 8192.0f;
-
-   if (m00 != 0 && m11 != 0) {
-      /* First, we compute the screen-space render area */
-      const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
-      const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
-      const float ss_ra_ymin = MIN3(        0, m31 + m11, m31 - m11);
-      const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
-
-      /* We want the guardband to be centered on that */
-      const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
-      const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
-      const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
-      const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
-
-      /* Now we need it in native device coordinates */
-      const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
-      const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
-      const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
-      const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
-
-      /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
-       * flipped upside-down.  X should be fine though.
-       */
-      assert(ndc_gb_xmin <= ndc_gb_xmax);
-      *xmin = ndc_gb_xmin;
-      *xmax = ndc_gb_xmax;
-      *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
-      *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
-   } else {
-      /* The viewport scales to 0, so nothing will be rendered. */
-      *xmin = 0.0f;
-      *xmax = 0.0f;
-      *ymin = 0.0f;
-      *ymax = 0.0f;
-   }
-}
-
-static void
-gen6_upload_sf_and_clip_viewports(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   struct gen6_sf_viewport *sfv;
-   struct brw_clipper_viewport *clv;
-   GLfloat y_scale, y_bias;
-
-   /* BRW_NEW_VIEWPORT_COUNT */
-   const unsigned viewport_count = brw->clip.viewport_count;
-
-   /* _NEW_BUFFERS */
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   const bool render_to_fbo = _mesa_is_user_fbo(fb);
-   const uint32_t fb_width = _mesa_geometric_width(ctx->DrawBuffer);
-   const uint32_t fb_height = _mesa_geometric_height(ctx->DrawBuffer);
-
-   sfv = brw_state_batch(brw, sizeof(*sfv) * viewport_count,
-                         32, &brw->sf.vp_offset);
-   memset(sfv, 0, sizeof(*sfv) * viewport_count);
-
-   clv = brw_state_batch(brw, sizeof(*clv) * viewport_count,
-                         32, &brw->clip.vp_offset);
-
-   if (render_to_fbo) {
-      y_scale = 1.0;
-      y_bias = 0.0;
-   } else {
-      y_scale = -1.0;
-      y_bias = (float)fb_height;
-   }
-
-   for (unsigned i = 0; i < viewport_count; i++) {
-      float scale[3], translate[3];
-
-      /* _NEW_VIEWPORT */
-      _mesa_get_viewport_xform(ctx, i, scale, translate);
-      sfv[i].m00 = scale[0];
-      sfv[i].m11 = scale[1] * y_scale;
-      sfv[i].m22 = scale[2];
-      sfv[i].m30 = translate[0];
-      sfv[i].m31 = translate[1] * y_scale + y_bias;
-      sfv[i].m32 = translate[2];
-
-      brw_calculate_guardband_size(devinfo, fb_width, fb_height,
-                                   sfv[i].m00, sfv[i].m11,
-                                   sfv[i].m30, sfv[i].m31,
-                                   &clv[i].xmin, &clv[i].xmax,
-                                   &clv[i].ymin, &clv[i].ymax);
-   }
-
-   brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
-}
-
-const struct brw_tracked_state gen6_sf_and_clip_viewports = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_VIEWPORT,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VIEWPORT_COUNT,
-   },
-   .emit = gen6_upload_sf_and_clip_viewports,
-};
-
-static void upload_viewport_state_pointers(struct brw_context *brw)
-{
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_VIEWPORT_STATE_POINTERS << 16 | (4 - 2) |
-	     GEN6_CC_VIEWPORT_MODIFY |
-	     GEN6_SF_VIEWPORT_MODIFY |
-	     GEN6_CLIP_VIEWPORT_MODIFY);
-   OUT_BATCH(brw->clip.vp_offset);
-   OUT_BATCH(brw->sf.vp_offset);
-   OUT_BATCH(brw->cc.vp_offset);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen6_viewport_state = {
-   .dirty = {
-      .mesa = 0,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_CC_VP |
-             BRW_NEW_CLIP_VP |
-             BRW_NEW_SF_VP |
-             BRW_NEW_STATE_BASE_ADDRESS,
-   },
-   .emit = upload_viewport_state_pointers,
-};
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
deleted file mode 100644
index 17b8118..0000000
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright © 2009 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-#include "program/prog_parameter.h"
-#include "program/prog_statevars.h"
-#include "main/shaderapi.h"
-#include "intel_batchbuffer.h"
-
-static void
-gen6_upload_vs_push_constants(struct brw_context *brw)
-{
-   struct brw_stage_state *stage_state = &brw->vs.base;
-
-   /* _BRW_NEW_VERTEX_PROGRAM */
-   const struct brw_program *vp = brw_program_const(brw->vertex_program);
-   /* BRW_NEW_VS_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
-
-   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_VERTEX);
-   gen6_upload_push_constants(brw, &vp->program, prog_data, stage_state);
-
-   if (brw->gen >= 7) {
-      if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
-         gen7_emit_vs_workaround_flush(brw);
-
-      gen7_upload_constant_state(brw, stage_state, true /* active */,
-                                 _3DSTATE_CONSTANT_VS);
-   }
-}
-
-const struct brw_tracked_state gen6_vs_push_constants = {
-   .dirty = {
-      .mesa  = _NEW_PROGRAM_CONSTANTS |
-               _NEW_TRANSFORM,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
-               BRW_NEW_VERTEX_PROGRAM |
-               BRW_NEW_VS_PROG_DATA,
-   },
-   .emit = gen6_upload_vs_push_constants,
-};
-
-static void
-upload_vs_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   const struct brw_stage_state *stage_state = &brw->vs.base;
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(stage_state->prog_data);
-   uint32_t floating_point_mode = 0;
-
-   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
-    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
-    *
-    *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
-    *   command that causes the VS Function Enable to toggle. Pipeline
-    *   flush can be executed by sending a PIPE_CONTROL command with CS
-    *   stall bit set and a post sync operation.
-    *
-    * We've already done such a flush at the start of state upload, so we
-    * don't need to do another one here.
-    */
-
-   if (stage_state->push_const_size == 0) {
-      /* Disable the push constant buffers. */
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (5 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 |
-		GEN6_CONSTANT_BUFFER_0_ENABLE |
-		(5 - 2));
-      /* Pointer to the VS constant buffer.  Covered by the set of
-       * state flags from gen6_upload_vs_constants
-       */
-      OUT_BATCH(stage_state->push_const_offset +
-		stage_state->push_const_size - 1);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   if (prog_data->use_alt_mode)
-      floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT;
-
-   BEGIN_BATCH(6);
-   OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
-   OUT_BATCH(stage_state->prog_offset);
-   OUT_BATCH(floating_point_mode |
-	     ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_VS_SAMPLER_COUNT_SHIFT) |
-             ((prog_data->binding_table.size_bytes / 4) <<
-              GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-
-   if (prog_data->total_scratch) {
-      OUT_RELOC(stage_state->scratch_bo,
-		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(stage_state->per_thread_scratch) - 11);
-   } else {
-      OUT_BATCH(0);
-   }
-
-   OUT_BATCH((prog_data->dispatch_grf_start_reg <<
-              GEN6_VS_DISPATCH_START_GRF_SHIFT) |
-	     (vue_prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
-	     (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
-
-   OUT_BATCH(((devinfo->max_vs_threads - 1) << GEN6_VS_MAX_THREADS_SHIFT) |
-	     GEN6_VS_STATISTICS_ENABLE |
-	     GEN6_VS_ENABLE);
-   ADVANCE_BATCH();
-
-   /* Based on my reading of the simulator, the VS constants don't get
-    * pulled into the VS FF unit until an appropriate pipeline flush
-    * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
-    * references to them into a little FIFO.  The flushes are common,
-    * but don't reliably happen between this and a 3DPRIMITIVE, causing
-    * the primitive to use the wrong constants.  Then the FIFO
-    * containing the constant setup gets added to again on the next
-    * constants change, and eventually when a flush does happen the
-    * unit is overwhelmed by constant changes and dies.
-    *
-    * To avoid this, send a PIPE_CONTROL down the line that will
-    * update the unit immediately loading the constants.  The flush
-    * type bits here were those set by the STATE_BASE_ADDRESS whose
-    * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
-    * bug reports that led to this workaround, and may be more than
-    * what is strictly required to avoid the issue.
-    */
-   brw_emit_pipe_control_flush(brw,
-                               PIPE_CONTROL_DEPTH_STALL |
-                               PIPE_CONTROL_INSTRUCTION_INVALIDATE |
-                               PIPE_CONTROL_STATE_CACHE_INVALIDATE);
-}
-
-const struct brw_tracked_state gen6_vs_state = {
-   .dirty = {
-      .mesa  = _NEW_PROGRAM_CONSTANTS |
-               _NEW_TRANSFORM,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
-               BRW_NEW_VERTEX_PROGRAM |
-               BRW_NEW_VS_PROG_DATA,
-   },
-   .emit = upload_vs_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
deleted file mode 100644
index aabae70..0000000
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- * Copyright © 2009 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "compiler/brw_eu_defines.h"
-#include "brw_util.h"
-#include "brw_wm.h"
-#include "program/program.h"
-#include "program/prog_parameter.h"
-#include "program/prog_statevars.h"
-#include "main/shaderapi.h"
-#include "main/framebuffer.h"
-#include "intel_batchbuffer.h"
-
-static void
-gen6_upload_wm_push_constants(struct brw_context *brw)
-{
-   struct brw_stage_state *stage_state = &brw->wm.base;
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   const struct brw_program *fp = brw_program_const(brw->fragment_program);
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
-
-   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_FRAGMENT);
-
-   gen6_upload_push_constants(brw, &fp->program, prog_data, stage_state);
-
-   if (brw->gen >= 7) {
-      gen7_upload_constant_state(brw, &brw->wm.base, true,
-                                 _3DSTATE_CONSTANT_PS);
-   }
-}
-
-const struct brw_tracked_state gen6_wm_push_constants = {
-   .dirty = {
-      .mesa  = _NEW_PROGRAM_CONSTANTS,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_FRAGMENT_PROGRAM |
-               BRW_NEW_FS_PROG_DATA |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION,
-   },
-   .emit = gen6_upload_wm_push_constants,
-};
-
-void
-gen6_upload_wm_state(struct brw_context *brw,
-                     const struct brw_wm_prog_data *prog_data,
-                     const struct brw_stage_state *stage_state,
-                     bool multisampled_fbo,
-                     bool dual_source_blend_enable, bool kill_enable,
-                     bool color_buffer_write_enable, bool msaa_enabled,
-                     bool line_stipple_enable, bool polygon_stipple_enable,
-                     bool statistic_enable)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   uint32_t dw2, dw4, dw5, dw6, ksp0, ksp2;
-
-   /* We can't fold this into gen6_upload_wm_push_constants(), because
-    * according to the SNB PRM, vol 2 part 1 section 7.2.2
-    * (3DSTATE_CONSTANT_PS [DevSNB]):
-    *
-    *     "[DevSNB]: This packet must be followed by WM_STATE."
-    */
-   if (prog_data->base.nr_params == 0) {
-      /* Disable the push constant buffers. */
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 | (5 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 |
-		GEN6_CONSTANT_BUFFER_0_ENABLE |
-		(5 - 2));
-      /* Pointer to the WM constant buffer.  Covered by the set of
-       * state flags from gen6_upload_wm_push_constants.
-       */
-      OUT_BATCH(stage_state->push_const_offset +
-                stage_state->push_const_size - 1);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   dw2 = dw4 = dw5 = dw6 = ksp2 = 0;
-
-   if (statistic_enable)
-      dw4 |= GEN6_WM_STATISTICS_ENABLE;
-
-   dw5 |= GEN6_WM_LINE_AA_WIDTH_1_0;
-   dw5 |= GEN6_WM_LINE_END_CAP_AA_WIDTH_0_5;
-
-   if (prog_data->base.use_alt_mode)
-      dw2 |= GEN6_WM_FLOATING_POINT_MODE_ALT;
-
-   dw2 |= (ALIGN(stage_state->sampler_count, 4) / 4) <<
-           GEN6_WM_SAMPLER_COUNT_SHIFT;
-
-   dw2 |= ((prog_data->base.binding_table.size_bytes / 4) <<
-           GEN6_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT);
-
-   dw5 |= (devinfo->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
-
-   if (prog_data->dispatch_8)
-      dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
-
-   if (prog_data->dispatch_16)
-      dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
-
-   dw4 |= prog_data->base.dispatch_grf_start_reg <<
-          GEN6_WM_DISPATCH_START_GRF_SHIFT_0;
-   dw4 |= prog_data->dispatch_grf_start_reg_2 <<
-          GEN6_WM_DISPATCH_START_GRF_SHIFT_2;
-
-   ksp0 = stage_state->prog_offset;
-   ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
-
-   if (dual_source_blend_enable)
-      dw5 |= GEN6_WM_DUAL_SOURCE_BLEND_ENABLE;
-
-   if (line_stipple_enable)
-      dw5 |= GEN6_WM_LINE_STIPPLE_ENABLE;
-
-   if (polygon_stipple_enable)
-      dw5 |= GEN6_WM_POLYGON_STIPPLE_ENABLE;
-
-   if (prog_data->uses_src_depth)
-      dw5 |= GEN6_WM_USES_SOURCE_DEPTH;
-   if (prog_data->uses_src_w)
-      dw5 |= GEN6_WM_USES_SOURCE_W;
-   if (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)
-      dw5 |= GEN6_WM_COMPUTED_DEPTH;
-   dw6 |= prog_data->barycentric_interp_modes <<
-      GEN6_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT;
-
-   if (kill_enable)
-      dw5 |= GEN6_WM_KILL_ENABLE;
-
-   if (color_buffer_write_enable ||
-       dw5 & (GEN6_WM_KILL_ENABLE | GEN6_WM_COMPUTED_DEPTH))
-      dw5 |= GEN6_WM_DISPATCH_ENABLE;
-
-   /* From the SNB PRM, volume 2 part 1, page 278:
-    * "This bit is inserted in the PS payload header and made available to
-    * the DataPort (either via the message header or via header bypass) to
-    * indicate that oMask data (one or two phases) is included in Render
-    * Target Write messages. If present, the oMask data is used to mask off
-    * samples."
-    */
-    if (prog_data->uses_omask)
-      dw5 |= GEN6_WM_OMASK_TO_RENDER_TARGET;
-
-   dw6 |= prog_data->num_varying_inputs <<
-      GEN6_WM_NUM_SF_OUTPUTS_SHIFT;
-   if (multisampled_fbo) {
-      if (msaa_enabled)
-         dw6 |= GEN6_WM_MSRAST_ON_PATTERN;
-      else
-         dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
-
-      if (prog_data->persample_dispatch)
-         dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
-      else {
-         dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL;
-      }
-   } else {
-      dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
-      dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
-   }
-
-   /* From the SNB PRM, volume 2 part 1, page 281:
-    * "If the PS kernel does not need the Position XY Offsets
-    * to compute a Position XY value, then this field should be
-    * programmed to POSOFFSET_NONE."
-    *
-    * "SW Recommendation: If the PS kernel needs the Position Offsets
-    * to compute a Position XY value, this field should match Position
-    * ZW Interpolation Mode to ensure a consistent position.xyzw
-    * computation."
-    * We only require XY sample offsets. So, this recommendation doesn't
-    * look useful at the moment. We might need this in future.
-    */
-   if (prog_data->uses_pos_offset)
-      dw6 |= GEN6_WM_POSOFFSET_SAMPLE;
-   else
-      dw6 |= GEN6_WM_POSOFFSET_NONE;
-
-   BEGIN_BATCH(9);
-   OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
-   OUT_BATCH(ksp0);
-   OUT_BATCH(dw2);
-   if (prog_data->base.total_scratch) {
-      OUT_RELOC(stage_state->scratch_bo,
-                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(stage_state->per_thread_scratch) - 11);
-   } else {
-      OUT_BATCH(0);
-   }
-   OUT_BATCH(dw4);
-   OUT_BATCH(dw5);
-   OUT_BATCH(dw6);
-   OUT_BATCH(0); /* kernel 1 pointer */
-   OUT_BATCH(ksp2);
-   ADVANCE_BATCH();
-}
-
-static void
-upload_wm_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-
-   /* _NEW_BUFFERS */
-   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
-
-   /* BRW_NEW_FS_PROG_DATA | _NEW_COLOR */
-   const bool dual_src_blend_enable = prog_data->dual_src_blend &&
-                                      (ctx->Color.BlendEnabled & 1) &&
-                                      ctx->Color.Blend[0]._UsesDualSrc;
-
-   /* _NEW_COLOR, _NEW_MULTISAMPLE _NEW_BUFFERS */
-   const bool kill_enable = prog_data->uses_kill ||
-                            _mesa_is_alpha_test_enabled(ctx) ||
-                            _mesa_is_alpha_to_coverage_enabled(ctx) ||
-                            prog_data->uses_omask;
-
-   /* Rendering against the gl-context is always taken into account. */
-   const bool statistic_enable = true;
-
-   /* _NEW_LINE | _NEW_POLYGON | _NEW_BUFFERS | _NEW_COLOR |
-    * _NEW_MULTISAMPLE
-    */
-   gen6_upload_wm_state(brw, prog_data, &brw->wm.base,
-                        multisampled_fbo,
-                        dual_src_blend_enable, kill_enable,
-                        brw_color_buffer_write_enabled(brw),
-                        ctx->Multisample.Enabled,
-                        ctx->Line.StippleFlag, ctx->Polygon.StippleFlag,
-                        statistic_enable);
-}
-
-const struct brw_tracked_state gen6_wm_state = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_COLOR |
-               _NEW_LINE |
-               _NEW_MULTISAMPLE |
-               _NEW_POLYGON |
-               _NEW_PROGRAM_CONSTANTS,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_FS_PROG_DATA |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION,
-   },
-   .emit = upload_wm_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index 48b3d87..26e4264 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -33,168 +33,6 @@
 #include "compiler/glsl/ir_uniform.h"
 #include "main/shaderapi.h"
 
-static void
-brw_upload_cs_state(struct brw_context *brw)
-{
-   if (!brw->cs.base.prog_data)
-      return;
-
-   uint32_t offset;
-   uint32_t *desc = (uint32_t*) brw_state_batch(brw, 8 * 4, 64, &offset);
-   struct brw_stage_state *stage_state = &brw->cs.base;
-   struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
-      brw_emit_buffer_surface_state(
-         brw, &stage_state->surf_offset[
-                 prog_data->binding_table.shader_time_start],
-         brw->shader_time.bo, 0, ISL_FORMAT_RAW,
-         brw->shader_time.bo->size, 1, true);
-   }
-
-   uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
-                                    32, &stage_state->bind_bo_offset);
-
-   uint32_t dwords = brw->gen < 8 ? 8 : 9;
-   BEGIN_BATCH(dwords);
-   OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));
-
-   if (prog_data->total_scratch) {
-      if (brw->gen >= 8) {
-         /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
-          * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
-          */
-         OUT_RELOC64(stage_state->scratch_bo,
-                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(stage_state->per_thread_scratch) - 11);
-      } else if (brw->is_haswell) {
-         /* Haswell's Per Thread Scratch Space is in the range [0, 10]
-          * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
-          */
-         OUT_RELOC(stage_state->scratch_bo,
-                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(stage_state->per_thread_scratch) - 12);
-      } else {
-         /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
-          * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
-          */
-         OUT_RELOC(stage_state->scratch_bo,
-                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   stage_state->per_thread_scratch / 1024 - 1);
-      }
-   } else {
-      OUT_BATCH(0);
-      if (brw->gen >= 8)
-         OUT_BATCH(0);
-   }
-
-   const uint32_t vfe_num_urb_entries = brw->gen >= 8 ? 2 : 0;
-   const uint32_t vfe_gpgpu_mode =
-      brw->gen == 7 ? SET_FIELD(1, GEN7_MEDIA_VFE_STATE_GPGPU_MODE) : 0;
-   const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
-   OUT_BATCH(SET_FIELD(devinfo->max_cs_threads * subslices - 1,
-                       MEDIA_VFE_STATE_MAX_THREADS) |
-             SET_FIELD(vfe_num_urb_entries, MEDIA_VFE_STATE_URB_ENTRIES) |
-             SET_FIELD(1, MEDIA_VFE_STATE_RESET_GTW_TIMER) |
-             SET_FIELD(1, MEDIA_VFE_STATE_BYPASS_GTW) |
-             vfe_gpgpu_mode);
-
-   OUT_BATCH(0);
-   const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0;
-
-   /* We are uploading duplicated copies of push constant uniforms for each
-    * thread. Although the local id data needs to vary per thread, it won't
-    * change for other uniform data. Unfortunately this duplication is
-    * required for gen7. As of Haswell, this duplication can be avoided, but
-    * this older mechanism with duplicated data continues to work.
-    *
-    * FINISHME: As of Haswell, we could make use of the
-    * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length" field
-    * to only store one copy of uniform data.
-    *
-    * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
-    * which is described in the GPGPU_WALKER command and in the Broadwell PRM
-    * Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
-    * Operations => GPGPU Mode => Indirect Payload Storage.
-    *
-    * Note: The constant data is built in brw_upload_cs_push_constants below.
-    */
-   const uint32_t vfe_curbe_allocation =
-      ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
-            cs_prog_data->push.cross_thread.regs, 2);
-   OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) |
-             SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-
-   if (cs_prog_data->push.total.size > 0) {
-      BEGIN_BATCH(4);
-      OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(ALIGN(cs_prog_data->push.total.size, 64));
-      OUT_BATCH(stage_state->push_const_offset);
-      ADVANCE_BATCH();
-   }
-
-   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
-   memcpy(bind, stage_state->surf_offset,
-          prog_data->binding_table.size_bytes);
-
-   memset(desc, 0, 8 * 4);
-
-   int dw = 0;
-   desc[dw++] = brw->cs.base.prog_offset;
-   if (brw->gen >= 8)
-      desc[dw++] = 0; /* Kernel Start Pointer High */
-   desc[dw++] = 0;
-   desc[dw++] = stage_state->sampler_offset |
-      ((stage_state->sampler_count + 3) / 4);
-   desc[dw++] = stage_state->bind_bo_offset;
-   desc[dw++] = SET_FIELD(cs_prog_data->push.per_thread.regs,
-                          MEDIA_CURBE_READ_LENGTH);
-   const uint32_t media_threads =
-      brw->gen >= 8 ?
-      SET_FIELD(cs_prog_data->threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
-      SET_FIELD(cs_prog_data->threads, MEDIA_GPGPU_THREAD_COUNT);
-   assert(cs_prog_data->threads <= devinfo->max_cs_threads);
-
-   const uint32_t slm_size =
-      encode_slm_size(devinfo->gen, prog_data->total_shared);
-
-   desc[dw++] =
-      SET_FIELD(cs_prog_data->uses_barrier, MEDIA_BARRIER_ENABLE) |
-      SET_FIELD(slm_size, MEDIA_SHARED_LOCAL_MEMORY_SIZE) |
-      media_threads;
-
-   desc[dw++] =
-      SET_FIELD(cs_prog_data->push.cross_thread.regs, CROSS_THREAD_READ_LENGTH);
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
-   OUT_BATCH(0);
-   OUT_BATCH(8 * 4);
-   OUT_BATCH(offset);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state brw_cs_state = {
-   .dirty = {
-      .mesa = _NEW_PROGRAM_CONSTANTS,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_CS_PROG_DATA |
-             BRW_NEW_PUSH_CONSTANT_ALLOCATION |
-             BRW_NEW_SAMPLER_STATE_TABLE |
-             BRW_NEW_SURFACES,
-   },
-   .emit = brw_upload_cs_state
-};
-
-
 /**
  * Creates a region containing the push constants for the CS on gen7+.
  *
@@ -295,8 +133,7 @@
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
              BRW_NEW_COMPUTE_PROGRAM |
-             BRW_NEW_CS_PROG_DATA |
-             BRW_NEW_PUSH_CONSTANT_ALLOCATION,
+             BRW_NEW_CS_PROG_DATA,
    },
    .emit = gen7_upload_cs_push_constants,
 };
diff --git a/src/mesa/drivers/dri/i965/gen7_ds_state.c b/src/mesa/drivers/dri/i965/gen7_ds_state.c
deleted file mode 100644
index 04d7a86..0000000
--- a/src/mesa/drivers/dri/i965/gen7_ds_state.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "main/shaderapi.h"
-
-static void
-gen7_upload_tes_push_constants(struct brw_context *brw)
-{
-   struct brw_stage_state *stage_state = &brw->tes.base;
-   /* BRW_NEW_TESS_PROGRAMS */
-   const struct brw_program *tep = brw_program_const(brw->tess_eval_program);
-
-   if (tep) {
-      /* BRW_NEW_TES_PROG_DATA */
-      const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
-      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_TESS_EVAL);
-      gen6_upload_push_constants(brw, &tep->program, prog_data, stage_state);
-   }
-
-   gen7_upload_constant_state(brw, stage_state, tep, _3DSTATE_CONSTANT_DS);
-}
-
-const struct brw_tracked_state gen7_tes_push_constants = {
-   .dirty = {
-      .mesa  = _NEW_PROGRAM_CONSTANTS,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
-               BRW_NEW_TESS_PROGRAMS |
-               BRW_NEW_TES_PROG_DATA,
-   },
-   .emit = gen7_upload_tes_push_constants,
-};
-
-static void
-gen7_upload_ds_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   const struct brw_stage_state *stage_state = &brw->tes.base;
-   /* BRW_NEW_TESS_PROGRAMS */
-   bool active = brw->tess_eval_program;
-
-   /* BRW_NEW_TES_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(stage_state->prog_data);
-   const struct brw_tes_prog_data *tes_prog_data =
-      brw_tes_prog_data(stage_state->prog_data);
-
-   const unsigned thread_count = (devinfo->max_tes_threads - 1) <<
-      (brw->is_haswell ? HSW_DS_MAX_THREADS_SHIFT : GEN7_DS_MAX_THREADS_SHIFT);
-
-   if (active) {
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
-      OUT_BATCH(stage_state->prog_offset);
-      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
-                          GEN7_DS_SAMPLER_COUNT) |
-                SET_FIELD(prog_data->binding_table.size_bytes / 4,
-                          GEN7_DS_BINDING_TABLE_ENTRY_COUNT));
-      if (prog_data->total_scratch) {
-         OUT_RELOC(stage_state->scratch_bo,
-                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(stage_state->per_thread_scratch) - 11);
-      } else {
-         OUT_BATCH(0);
-      }
-      OUT_BATCH(SET_FIELD(prog_data->dispatch_grf_start_reg,
-                          GEN7_DS_DISPATCH_START_GRF) |
-                SET_FIELD(vue_prog_data->urb_read_length,
-                          GEN7_DS_URB_READ_LENGTH));
-
-      OUT_BATCH(GEN7_DS_ENABLE |
-                GEN7_DS_STATISTICS_ENABLE |
-                thread_count |
-                (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ?
-                 GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0));
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-   brw->tes.enabled = active;
-}
-
-const struct brw_tracked_state gen7_ds_state = {
-   .dirty = {
-      .mesa  = _NEW_TRANSFORM,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_TESS_PROGRAMS |
-               BRW_NEW_TES_PROG_DATA,
-   },
-   .emit = gen7_upload_ds_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
deleted file mode 100644
index 1b5b782..0000000
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright © 2013 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-
-static void
-upload_gs_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   const struct brw_stage_state *stage_state = &brw->gs.base;
-   const int max_threads_shift = brw->is_haswell ?
-      HSW_GS_MAX_THREADS_SHIFT : GEN6_GS_MAX_THREADS_SHIFT;
-   /* BRW_NEW_GEOMETRY_PROGRAM */
-   bool active = brw->geometry_program;
-   /* BRW_NEW_GS_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(stage_state->prog_data);
-   const struct brw_gs_prog_data *gs_prog_data =
-      brw_gs_prog_data(stage_state->prog_data);
-
-   /**
-    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
-    * Geometry > Geometry Shader > State:
-    *
-    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
-    *     whole fixed function pipeline when the GS enable changes value in
-    *     the 3DSTATE_GS."
-    *
-    * The hardware architects have clarified that in this context "flush the
-    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
-    * Stall" bit set.
-    */
-   if (!brw->is_haswell && brw->gt == 2 && brw->gs.enabled != active)
-      gen7_emit_cs_stall_flush(brw);
-
-   if (active) {
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
-      OUT_BATCH(stage_state->prog_offset);
-      OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) <<
-                 GEN6_GS_SAMPLER_COUNT_SHIFT) |
-                ((prog_data->binding_table.size_bytes / 4) <<
-                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-
-      if (prog_data->total_scratch) {
-         OUT_RELOC(stage_state->scratch_bo,
-                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(stage_state->per_thread_scratch) - 11);
-      } else {
-         OUT_BATCH(0);
-      }
-
-      uint32_t dw4 =
-         ((gs_prog_data->output_vertex_size_hwords * 2 - 1) <<
-          GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT) |
-         (gs_prog_data->output_topology << GEN7_GS_OUTPUT_TOPOLOGY_SHIFT) |
-         (vue_prog_data->urb_read_length <<
-          GEN6_GS_URB_READ_LENGTH_SHIFT) |
-         (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) |
-         (prog_data->dispatch_grf_start_reg <<
-          GEN6_GS_DISPATCH_START_GRF_SHIFT);
-
-      /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
-       * Ivy Bridge and Haswell.
-       *
-       * On Ivy Bridge, setting this bit causes the vertices of a triangle
-       * strip to be delivered to the geometry shader in an order that does
-       * not strictly follow the OpenGL spec, but preserves triangle
-       * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
-       * the geometry shader sees triangles:
-       *
-       * (1, 2, 3), (2, 4, 3), (3, 4, 5)
-       *
-       * (Clearing the bit is even worse, because it fails to preserve
-       * orientation).
-       *
-       * Triangle strips with adjacency always ordered in a way that preserves
-       * triangle orientation but does not strictly follow the OpenGL spec,
-       * regardless of the setting of this bit.
-       *
-       * On Haswell, both triangle strips and triangle strips with adjacency
-       * are always ordered in a way that preserves triangle orientation.
-       * Setting this bit causes the ordering to strictly follow the OpenGL
-       * spec.
-       *
-       * So in either case we want to set the bit.  Unfortunately on Ivy
-       * Bridge this will get the order close to correct but not perfect.
-       */
-      uint32_t dw5 =
-         ((devinfo->max_gs_threads - 1) << max_threads_shift) |
-         (gs_prog_data->control_data_header_size_hwords <<
-          GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
-         ((gs_prog_data->invocations - 1) <<
-          GEN7_GS_INSTANCE_CONTROL_SHIFT) |
-         SET_FIELD(vue_prog_data->dispatch_mode, GEN7_GS_DISPATCH_MODE) |
-         GEN6_GS_STATISTICS_ENABLE |
-         (gs_prog_data->include_primitive_id ?
-          GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) |
-         GEN7_GS_REORDER_TRAILING |
-         GEN7_GS_ENABLE;
-      uint32_t dw6 = 0;
-
-      if (brw->is_haswell) {
-         dw6 |= gs_prog_data->control_data_format <<
-            HSW_GS_CONTROL_DATA_FORMAT_SHIFT;
-      } else {
-         dw5 |= gs_prog_data->control_data_format <<
-            IVB_GS_CONTROL_DATA_FORMAT_SHIFT;
-      }
-
-      OUT_BATCH(dw4);
-      OUT_BATCH(dw5);
-      OUT_BATCH(dw6);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
-      OUT_BATCH(0); /* prog_bo */
-      OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
-                (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-      OUT_BATCH(0); /* scratch space base offset */
-      OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
-                (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) |
-                GEN7_GS_INCLUDE_VERTEX_HANDLES |
-                (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT));
-      OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) |
-                GEN6_GS_STATISTICS_ENABLE);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-   brw->gs.enabled = active;
-}
-
-const struct brw_tracked_state gen7_gs_state = {
-   .dirty = {
-      .mesa  = _NEW_TRANSFORM,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_GEOMETRY_PROGRAM |
-               BRW_NEW_GS_PROG_DATA,
-   },
-   .emit = upload_gs_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen7_hs_state.c b/src/mesa/drivers/dri/i965/gen7_hs_state.c
deleted file mode 100644
index 765253f..0000000
--- a/src/mesa/drivers/dri/i965/gen7_hs_state.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "main/shaderapi.h"
-
-static void
-gen7_upload_tcs_push_constants(struct brw_context *brw)
-{
-   struct brw_stage_state *stage_state = &brw->tcs.base;
-   /* BRW_NEW_TESS_PROGRAMS */
-   const struct brw_program *tcp = brw_program_const(brw->tess_ctrl_program);
-   bool active = brw->tess_eval_program;
-
-   if (active) {
-      /* BRW_NEW_TCS_PROG_DATA */
-      const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
-
-      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_TESS_CTRL);
-      gen6_upload_push_constants(brw, &tcp->program, prog_data, stage_state);
-   }
-
-   gen7_upload_constant_state(brw, stage_state, active, _3DSTATE_CONSTANT_HS);
-}
-
-const struct brw_tracked_state gen7_tcs_push_constants = {
-   .dirty = {
-      .mesa  = _NEW_PROGRAM_CONSTANTS,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_DEFAULT_TESS_LEVELS |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
-               BRW_NEW_TESS_PROGRAMS |
-               BRW_NEW_TCS_PROG_DATA,
-   },
-   .emit = gen7_upload_tcs_push_constants,
-};
-
-static void
-gen7_upload_hs_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   const struct brw_stage_state *stage_state = &brw->tcs.base;
-   /* BRW_NEW_TESS_PROGRAMS */
-   bool active = brw->tess_eval_program;
-   /* BRW_NEW_TCS_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_tcs_prog_data *tcs_prog_data =
-      brw_tcs_prog_data(stage_state->prog_data);
-
-   if (active) {
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_HS << 16 | (7 - 2));
-      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
-                          GEN7_HS_SAMPLER_COUNT) |
-                SET_FIELD(prog_data->binding_table.size_bytes / 4,
-                          GEN7_HS_BINDING_TABLE_ENTRY_COUNT) |
-                (devinfo->max_tcs_threads - 1));
-      OUT_BATCH(GEN7_HS_ENABLE |
-                GEN7_HS_STATISTICS_ENABLE |
-                SET_FIELD(tcs_prog_data->instances - 1,
-                          GEN7_HS_INSTANCE_COUNT));
-      OUT_BATCH(stage_state->prog_offset);
-      if (prog_data->total_scratch) {
-         OUT_RELOC(stage_state->scratch_bo,
-                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(stage_state->per_thread_scratch) - 11);
-      } else {
-         OUT_BATCH(0);
-      }
-      OUT_BATCH(GEN7_HS_INCLUDE_VERTEX_HANDLES |
-                SET_FIELD(prog_data->dispatch_grf_start_reg,
-                          GEN7_HS_DISPATCH_START_GRF));
-      /* Ignore URB semaphores */
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(7);
-      OUT_BATCH(_3DSTATE_HS << 16 | (7 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-   brw->tcs.enabled = active;
-}
-
-const struct brw_tracked_state gen7_hs_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_TCS_PROG_DATA |
-               BRW_NEW_TESS_PROGRAMS,
-   },
-   .emit = gen7_upload_hs_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen7_l3_state.c b/src/mesa/drivers/dri/i965/gen7_l3_state.c
index 536c00c..53638eb 100644
--- a/src/mesa/drivers/dri/i965/gen7_l3_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_l3_state.c
@@ -204,6 +204,15 @@
    if (brw->urb.size != sz) {
       brw->urb.size = sz;
       brw->ctx.NewDriverState |= BRW_NEW_URB_SIZE;
+
+      /* If we change the total URB size, reset the individual stage sizes to
+       * zero so that, even if there is no URB size change, gen7_upload_urb
+       * still re-emits 3DSTATE_URB_*.
+       */
+      brw->urb.vsize = 0;
+      brw->urb.gsize = 0;
+      brw->urb.hsize = 0;
+      brw->urb.dsize = 0;
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen7_misc_state.c b/src/mesa/drivers/dri/i965/gen7_misc_state.c
index af9be66..1a9e645 100644
--- a/src/mesa/drivers/dri/i965/gen7_misc_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_misc_state.c
@@ -83,7 +83,7 @@
       break;
    case GL_TEXTURE_3D:
       assert(mt);
-      depth = MAX2(mt->logical_depth0, 1);
+      depth = mt->surf.logical_level0_px.depth;
       /* fallthrough */
    default:
       surftype = translate_tex_target(gl_target);
@@ -95,8 +95,8 @@
    lod = irb ? irb->mt_level - irb->mt->first_level : 0;
 
    if (mt) {
-      width = mt->logical_width0;
-      height = mt->logical_height0;
+      width = mt->surf.logical_level0_px.width;
+      height = mt->surf.logical_level0_px.height;
    }
 
    /* _NEW_DEPTH, _NEW_STENCIL, _NEW_BUFFERS */
@@ -105,10 +105,10 @@
    OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
 
    /* 3DSTATE_DEPTH_BUFFER dw1 */
-   OUT_BATCH((depth_mt ? depth_mt->pitch - 1 : 0) |
+   OUT_BATCH((depth_mt ? depth_mt->surf.row_pitch - 1 : 0) |
              (depthbuffer_format << 18) |
              ((hiz ? 1 : 0) << 22) |
-             ((stencil_mt != NULL && ctx->Stencil._WriteEnabled) << 27) |
+             ((stencil_mt != NULL && brw->stencil_write_enabled) << 27) |
              (brw_depth_writes_enabled(brw) << 28) |
              (surftype << 29));
 
@@ -146,13 +146,12 @@
       ADVANCE_BATCH();
    } else {
       assert(depth_mt);
-      struct intel_miptree_hiz_buffer *hiz_buf = depth_mt->hiz_buf;
 
       BEGIN_BATCH(3);
       OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (3 - 2));
       OUT_BATCH((mocs << 25) |
-                (hiz_buf->aux_base.pitch - 1));
-      OUT_RELOC(hiz_buf->aux_base.bo,
+                (depth_mt->hiz_buf->pitch - 1));
+      OUT_RELOC(depth_mt->hiz_buf->bo,
                 I915_GEM_DOMAIN_RENDER,
                 I915_GEM_DOMAIN_RENDER,
                 0);
@@ -171,19 +170,9 @@
 
       BEGIN_BATCH(3);
       OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (3 - 2));
-      /* The stencil buffer has quirky pitch requirements.  From the
-       * Sandybridge PRM, Volume 2 Part 1, page 329 (3DSTATE_STENCIL_BUFFER
-       * dword 1 bits 16:0 - Surface Pitch):
-       *
-       *    The pitch must be set to 2x the value computed based on width, as
-       *    the stencil buffer is stored with two rows interleaved.
-       *
-       * While the Ivybridge PRM lacks this comment, the BSpec contains the
-       * same text, and experiments indicate that this is necessary.
-       */
       OUT_BATCH(enabled |
                 mocs << 25 |
-	        (2 * stencil_mt->pitch - 1));
+	        (stencil_mt->surf.row_pitch - 1));
       OUT_RELOC(stencil_mt->bo,
 	        I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
 		0);
@@ -192,7 +181,12 @@
 
    BEGIN_BATCH(3);
    OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2));
-   OUT_BATCH(depth_mt ? depth_mt->depth_clear_value : 0);
+   if (depth_mt) {
+      OUT_BATCH(brw_convert_depth_value(depth_mt->format,
+                                        depth_mt->fast_clear_color.f32[0]));
+   } else {
+      OUT_BATCH(0);
+   }
    OUT_BATCH(1);
    ADVANCE_BATCH();
 
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
deleted file mode 100644
index d577a36..0000000
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-#include "main/macros.h"
-#include "main/fbobject.h"
-#include "main/framebuffer.h"
-#include "intel_batchbuffer.h"
-
-static void
-upload_sbe_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *wm_prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-   uint32_t num_outputs = wm_prog_data->num_varying_inputs;
-   uint32_t dw1;
-   uint32_t point_sprite_enables;
-   int i;
-   uint16_t attr_overrides[16];
-   /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   uint32_t point_sprite_origin;
-
-   /* FINISHME: Attribute Swizzle Control Mode? */
-   dw1 = GEN7_SBE_SWIZZLE_ENABLE | num_outputs << GEN7_SBE_NUM_OUTPUTS_SHIFT;
-
-   /* _NEW_POINT
-    *
-    * Window coordinates in an FBO are inverted, which means point
-    * sprite origin must be inverted.
-    */
-   if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
-      point_sprite_origin = GEN6_SF_POINT_SPRITE_LOWERLEFT;
-   } else {
-      point_sprite_origin = GEN6_SF_POINT_SPRITE_UPPERLEFT;
-   }
-   dw1 |= point_sprite_origin;
-
-   /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
-    * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
-    * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
-    * BRW_NEW_VUE_MAP_GEOM_OUT
-    */
-   uint32_t urb_entry_read_length;
-   uint32_t urb_entry_read_offset;
-   calculate_attr_overrides(brw, attr_overrides, &point_sprite_enables,
-                            &urb_entry_read_length, &urb_entry_read_offset);
-   dw1 |= urb_entry_read_length << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
-          urb_entry_read_offset << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT;
-
-   BEGIN_BATCH(14);
-   OUT_BATCH(_3DSTATE_SBE << 16 | (14 - 2));
-   OUT_BATCH(dw1);
-
-   /* Output dwords 2 through 9 */
-   for (i = 0; i < 8; i++) {
-      OUT_BATCH(attr_overrides[i * 2] | attr_overrides[i * 2 + 1] << 16);
-   }
-
-   OUT_BATCH(point_sprite_enables); /* dw10 */
-   OUT_BATCH(wm_prog_data->flat_inputs);
-   OUT_BATCH(0); /* wrapshortest enables 0-7 */
-   OUT_BATCH(0); /* wrapshortest enables 8-15 */
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen7_sbe_state = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_LIGHT |
-               _NEW_POINT |
-               _NEW_POLYGON |
-               _NEW_PROGRAM,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_FRAGMENT_PROGRAM |
-               BRW_NEW_FS_PROG_DATA |
-               BRW_NEW_GS_PROG_DATA |
-               BRW_NEW_TES_PROG_DATA |
-               BRW_NEW_PRIMITIVE |
-               BRW_NEW_VUE_MAP_GEOM_OUT,
-   },
-   .emit = upload_sbe_state,
-};
-
-static void
-upload_sf_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   uint32_t dw1, dw2, dw3;
-   float point_size;
-   /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
-
-   dw1 = GEN6_SF_STATISTICS_ENABLE;
-
-   if (brw->sf.viewport_transform_enable)
-       dw1 |= GEN6_SF_VIEWPORT_TRANSFORM_ENABLE;
-
-   /* _NEW_BUFFERS */
-   dw1 |= (brw_depthbuffer_format(brw) << GEN7_SF_DEPTH_BUFFER_SURFACE_FORMAT_SHIFT);
-
-   /* _NEW_POLYGON */
-   if (ctx->Polygon._FrontBit == render_to_fbo)
-      dw1 |= GEN6_SF_WINDING_CCW;
-
-   if (ctx->Polygon.OffsetFill)
-       dw1 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_SOLID;
-
-   if (ctx->Polygon.OffsetLine)
-       dw1 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_WIREFRAME;
-
-   if (ctx->Polygon.OffsetPoint)
-       dw1 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_POINT;
-
-   switch (ctx->Polygon.FrontMode) {
-   case GL_FILL:
-       dw1 |= GEN6_SF_FRONT_SOLID;
-       break;
-
-   case GL_LINE:
-       dw1 |= GEN6_SF_FRONT_WIREFRAME;
-       break;
-
-   case GL_POINT:
-       dw1 |= GEN6_SF_FRONT_POINT;
-       break;
-
-   default:
-       unreachable("not reached");
-   }
-
-   switch (ctx->Polygon.BackMode) {
-   case GL_FILL:
-       dw1 |= GEN6_SF_BACK_SOLID;
-       break;
-
-   case GL_LINE:
-       dw1 |= GEN6_SF_BACK_WIREFRAME;
-       break;
-
-   case GL_POINT:
-       dw1 |= GEN6_SF_BACK_POINT;
-       break;
-
-   default:
-       unreachable("not reached");
-   }
-
-   dw2 = GEN6_SF_SCISSOR_ENABLE;
-
-   if (ctx->Polygon.CullFlag) {
-      switch (ctx->Polygon.CullFaceMode) {
-      case GL_FRONT:
-	 dw2 |= GEN6_SF_CULL_FRONT;
-	 break;
-      case GL_BACK:
-	 dw2 |= GEN6_SF_CULL_BACK;
-	 break;
-      case GL_FRONT_AND_BACK:
-	 dw2 |= GEN6_SF_CULL_BOTH;
-	 break;
-      default:
-	 unreachable("not reached");
-      }
-   } else {
-      dw2 |= GEN6_SF_CULL_NONE;
-   }
-
-   /* _NEW_LINE */
-   {
-      uint32_t line_width_u3_7 = brw_get_line_width(brw);
-      dw2 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT;
-   }
-   if (ctx->Line.SmoothFlag) {
-      dw2 |= GEN6_SF_LINE_AA_ENABLE;
-      dw2 |= GEN6_SF_LINE_END_CAP_WIDTH_1_0;
-   }
-   if (ctx->Line.StippleFlag && brw->is_haswell) {
-      dw2 |= HSW_SF_LINE_STIPPLE_ENABLE;
-   }
-   /* _NEW_MULTISAMPLE */
-   if (multisampled_fbo && ctx->Multisample.Enabled)
-      dw2 |= GEN6_SF_MSRAST_ON_PATTERN;
-
-   /* FINISHME: Last Pixel Enable?  Vertex Sub Pixel Precision Select?
-    */
-
-   dw3 = GEN6_SF_LINE_AA_MODE_TRUE;
-
-   /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
-   if (use_state_point_size(brw))
-      dw3 |= GEN6_SF_USE_STATE_POINT_WIDTH;
-
-   /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
-   point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
-
-   /* Clamp to the hardware limits and convert to fixed point */
-   dw3 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3);
-
-   /* _NEW_LIGHT */
-   if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
-      dw3 |=
-	 (2 << GEN6_SF_TRI_PROVOKE_SHIFT) |
-	 (2 << GEN6_SF_TRIFAN_PROVOKE_SHIFT) |
-	 (1 << GEN6_SF_LINE_PROVOKE_SHIFT);
-   } else {
-      dw3 |= (1 << GEN6_SF_TRIFAN_PROVOKE_SHIFT);
-   }
-
-   BEGIN_BATCH(7);
-   OUT_BATCH(_3DSTATE_SF << 16 | (7 - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(dw2);
-   OUT_BATCH(dw3);
-   OUT_BATCH_F(ctx->Polygon.OffsetUnits * 2); /* constant.  copied from gen4 */
-   OUT_BATCH_F(ctx->Polygon.OffsetFactor); /* scale */
-   OUT_BATCH_F(ctx->Polygon.OffsetClamp); /* global depth offset clamp */
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen7_sf_state = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_LIGHT |
-               _NEW_LINE |
-               _NEW_MULTISAMPLE |
-               _NEW_POINT |
-               _NEW_POLYGON |
-               _NEW_PROGRAM,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_GS_PROG_DATA |
-               BRW_NEW_PRIMITIVE |
-               BRW_NEW_TES_PROG_DATA |
-               BRW_NEW_VUE_MAP_GEOM_OUT,
-   },
-   .emit = upload_sf_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index f1bd19c..f54b370 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -35,313 +35,6 @@
 #include "intel_buffer_objects.h"
 #include "main/transformfeedback.h"
 
-static void
-upload_3dstate_so_buffers(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   struct gl_transform_feedback_object *xfb_obj =
-      ctx->TransformFeedback.CurrentObject;
-   const struct gl_transform_feedback_info *linked_xfb_info =
-      xfb_obj->program->sh.LinkedTransformFeedback;
-   int i;
-
-   /* Set up the up to 4 output buffers.  These are the ranges defined in the
-    * gl_transform_feedback_object.
-    */
-   for (i = 0; i < 4; i++) {
-      struct intel_buffer_object *bufferobj =
-	 intel_buffer_object(xfb_obj->Buffers[i]);
-      struct brw_bo *bo;
-      uint32_t start, end;
-      uint32_t stride;
-
-      if (!xfb_obj->Buffers[i]) {
-	 /* The pitch of 0 in this command indicates that the buffer is
-	  * unbound and won't be written to.
-	  */
-	 BEGIN_BATCH(4);
-	 OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
-	 OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT));
-	 OUT_BATCH(0);
-	 OUT_BATCH(0);
-	 ADVANCE_BATCH();
-
-	 continue;
-      }
-
-      stride = linked_xfb_info->Buffers[i].Stride * 4;
-
-      start = xfb_obj->Offset[i];
-      assert(start % 4 == 0);
-      end = ALIGN(start + xfb_obj->Size[i], 4);
-      bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start);
-      assert(end <= bo->size);
-
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
-      OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT) | stride);
-      OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start);
-      OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, end);
-      ADVANCE_BATCH();
-   }
-}
-
-/**
- * Outputs the 3DSTATE_SO_DECL_LIST command.
- *
- * The data output is a series of 64-bit entries containing a SO_DECL per
- * stream.  We only have one stream of rendering coming out of the GS unit, so
- * we only emit stream 0 (low 16 bits) SO_DECLs.
- */
-void
-gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
-                                 const struct brw_vue_map *vue_map)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   struct gl_transform_feedback_object *xfb_obj =
-      ctx->TransformFeedback.CurrentObject;
-   const struct gl_transform_feedback_info *linked_xfb_info =
-      xfb_obj->program->sh.LinkedTransformFeedback;
-   uint16_t so_decl[MAX_VERTEX_STREAMS][128];
-   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
-   int next_offset[BRW_MAX_SOL_BUFFERS] = {0, 0, 0, 0};
-   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
-   int max_decls = 0;
-   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
-
-   memset(so_decl, 0, sizeof(so_decl));
-
-   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
-    * command is feels strange -- each dword pair contains a SO_DECL per stream.
-    */
-   for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
-      int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
-      uint16_t decl = 0;
-      int varying = linked_xfb_info->Outputs[i].OutputRegister;
-      const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
-      unsigned component_mask = (1 << components) - 1;
-      unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
-      unsigned decl_buffer_slot = buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT;
-      assert(stream_id < MAX_VERTEX_STREAMS);
-
-      /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
-       * gl_Layer is stored in VARYING_SLOT_PSIZ.y
-       * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
-       */
-      if (varying == VARYING_SLOT_PSIZ) {
-         assert(components == 1);
-         component_mask <<= 3;
-      } else if (varying == VARYING_SLOT_LAYER) {
-         assert(components == 1);
-         component_mask <<= 1;
-      } else if (varying == VARYING_SLOT_VIEWPORT) {
-         assert(components == 1);
-         component_mask <<= 2;
-      } else {
-         component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;
-      }
-
-      buffer_mask[stream_id] |= 1 << buffer;
-
-      decl |= decl_buffer_slot;
-      if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
-         decl |= vue_map->varying_to_slot[VARYING_SLOT_PSIZ] <<
-            SO_DECL_REGISTER_INDEX_SHIFT;
-      } else {
-         assert(vue_map->varying_to_slot[varying] >= 0);
-         decl |= vue_map->varying_to_slot[varying] <<
-            SO_DECL_REGISTER_INDEX_SHIFT;
-      }
-      decl |= component_mask << SO_DECL_COMPONENT_MASK_SHIFT;
-
-      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
-       * array.  Instead, it simply increments DstOffset for the following
-       * input by the number of components that should be skipped.
-       *
-       * Our hardware is unusual in that it requires us to program SO_DECLs
-       * for fake "hole" components, rather than simply taking the offset
-       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
-       * program as many size = 4 holes as we can, then a final hole to
-       * accommodate the final 1, 2, or 3 remaining.
-       */
-      int skip_components =
-         linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];
-
-      next_offset[buffer] += skip_components;
-
-      while (skip_components >= 4) {
-         so_decl[stream_id][decls[stream_id]++] =
-            SO_DECL_HOLE_FLAG | 0xf | decl_buffer_slot;
-         skip_components -= 4;
-      }
-      if (skip_components > 0)
-         so_decl[stream_id][decls[stream_id]++] =
-            SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1) |
-            decl_buffer_slot;
-
-      assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);
-
-      next_offset[buffer] += components;
-
-      so_decl[stream_id][decls[stream_id]++] = decl;
-
-      if (decls[stream_id] > max_decls)
-         max_decls = decls[stream_id];
-   }
-
-   BEGIN_BATCH(max_decls * 2 + 3);
-   OUT_BATCH(_3DSTATE_SO_DECL_LIST << 16 | (max_decls * 2 + 1));
-
-   OUT_BATCH((buffer_mask[0] << SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) |
-             (buffer_mask[1] << SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) |
-             (buffer_mask[2] << SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) |
-             (buffer_mask[3] << SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT));
-
-   OUT_BATCH((decls[0] << SO_NUM_ENTRIES_0_SHIFT) |
-             (decls[1] << SO_NUM_ENTRIES_1_SHIFT) |
-             (decls[2] << SO_NUM_ENTRIES_2_SHIFT) |
-             (decls[3] << SO_NUM_ENTRIES_3_SHIFT));
-
-   for (int i = 0; i < max_decls; i++) {
-      /* Stream 1 | Stream 0 */
-      OUT_BATCH(((uint32_t) so_decl[1][i]) << 16 | so_decl[0][i]);
-      /* Stream 3 | Stream 2 */
-      OUT_BATCH(((uint32_t) so_decl[3][i]) << 16 | so_decl[2][i]);
-   }
-
-   ADVANCE_BATCH();
-}
-
-static bool
-query_active(struct gl_query_object *q)
-{
-   return q && q->Active;
-}
-
-static void
-upload_3dstate_streamout(struct brw_context *brw, bool active,
-			 const struct brw_vue_map *vue_map)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   struct gl_transform_feedback_object *xfb_obj =
-      ctx->TransformFeedback.CurrentObject;
-   uint32_t dw1 = 0, dw2 = 0, dw3 = 0, dw4 = 0;
-   int i;
-
-   if (active) {
-      const struct gl_transform_feedback_info *linked_xfb_info =
-         xfb_obj->program->sh.LinkedTransformFeedback;
-      int urb_entry_read_offset = 0;
-      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
-	 urb_entry_read_offset;
-
-      dw1 |= SO_FUNCTION_ENABLE;
-      dw1 |= SO_STATISTICS_ENABLE;
-
-      /* BRW_NEW_RASTERIZER_DISCARD */
-      if (ctx->RasterDiscard) {
-         if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
-            dw1 |= SO_RENDERING_DISABLE;
-         } else {
-            perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
-                       "query active relies on the clipper.");
-         }
-      }
-
-      /* _NEW_LIGHT */
-      if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
-	 dw1 |= SO_REORDER_TRAILING;
-
-      if (brw->gen < 8) {
-         for (i = 0; i < 4; i++) {
-            if (xfb_obj->Buffers[i]) {
-               dw1 |= SO_BUFFER_ENABLE(i);
-            }
-         }
-      }
-
-      /* We always read the whole vertex.  This could be reduced at some
-       * point by reading less and offsetting the register index in the
-       * SO_DECLs.
-       */
-      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_0_VERTEX_READ_OFFSET);
-      dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_0_VERTEX_READ_LENGTH);
-
-      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_1_VERTEX_READ_OFFSET);
-      dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_1_VERTEX_READ_LENGTH);
-
-      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_2_VERTEX_READ_OFFSET);
-      dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_2_VERTEX_READ_LENGTH);
-
-      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_3_VERTEX_READ_OFFSET);
-      dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_3_VERTEX_READ_LENGTH);
-
-      if (brw->gen >= 8) {
-	 /* Set buffer pitches; 0 means unbound. */
-	 if (xfb_obj->Buffers[0])
-	    dw3 |= linked_xfb_info->Buffers[0].Stride * 4;
-	 if (xfb_obj->Buffers[1])
-	    dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16;
-	 if (xfb_obj->Buffers[2])
-	    dw4 |= linked_xfb_info->Buffers[2].Stride * 4;
-	 if (xfb_obj->Buffers[3])
-	    dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16;
-      }
-   }
-
-   const int dwords = brw->gen >= 8 ? 5 : 3;
-
-   BEGIN_BATCH(dwords);
-   OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (dwords - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(dw2);
-   if (dwords > 3) {
-      OUT_BATCH(dw3);
-      OUT_BATCH(dw4);
-   }
-   ADVANCE_BATCH();
-}
-
-static void
-upload_sol_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   bool active = _mesa_is_xfb_active_and_unpaused(ctx);
-
-   if (active) {
-      if (brw->gen >= 8)
-         gen8_upload_3dstate_so_buffers(brw);
-      else
-         upload_3dstate_so_buffers(brw);
-
-      /* BRW_NEW_VUE_MAP_GEOM_OUT */
-      gen7_upload_3dstate_so_decl_list(brw, &brw->vue_map_geom_out);
-   }
-
-   /* Finally, set up the SOL stage.  This command must always follow updates to
-    * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
-    * MMIO register updates (current performed by the kernel at each batch
-    * emit).
-    */
-   upload_3dstate_streamout(brw, active, &brw->vue_map_geom_out);
-}
-
-const struct brw_tracked_state gen7_sol_state = {
-   .dirty = {
-      .mesa  = _NEW_LIGHT,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_RASTERIZER_DISCARD |
-               BRW_NEW_VUE_MAP_GEOM_OUT |
-               BRW_NEW_TRANSFORM_FEEDBACK,
-   },
-   .emit = upload_sol_state,
-};
-
 void
 gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
                               struct gl_transform_feedback_object *obj)
diff --git a/src/mesa/drivers/dri/i965/gen7_te_state.c b/src/mesa/drivers/dri/i965/gen7_te_state.c
deleted file mode 100644
index e56fdcf..0000000
--- a/src/mesa/drivers/dri/i965/gen7_te_state.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-
-static void
-upload_te_state(struct brw_context *brw)
-{
-   /* BRW_NEW_TESS_PROGRAMS */
-   bool active = brw->tess_eval_program;
-
-   const struct brw_tes_prog_data *tes_prog_data =
-      brw_tes_prog_data(brw->tes.base.prog_data);
-
-   if (active) {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_TE << 16 | (4 - 2));
-      OUT_BATCH((tes_prog_data->partitioning << GEN7_TE_PARTITIONING_SHIFT) |
-                (tes_prog_data->output_topology << GEN7_TE_OUTPUT_TOPOLOGY_SHIFT) |
-                (tes_prog_data->domain << GEN7_TE_DOMAIN_SHIFT) |
-                GEN7_TE_ENABLE);
-      OUT_BATCH_F(63.0);
-      OUT_BATCH_F(64.0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_TE << 16 | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH_F(0);
-      OUT_BATCH_F(0);
-      ADVANCE_BATCH();
-   }
-}
-
-const struct brw_tracked_state gen7_te_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_TES_PROG_DATA |
-               BRW_NEW_TESS_PROGRAMS,
-   },
-   .emit = upload_te_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index 028161d..06113fa 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -101,7 +101,11 @@
     * Similar text exists for the other 3DSTATE_PUSH_CONSTANT_ALLOC_*
     * commands.
     */
-   brw->ctx.NewDriverState |= BRW_NEW_PUSH_CONSTANT_ALLOCATION;
+   brw->vs.base.push_constants_dirty = true;
+   brw->tcs.base.push_constants_dirty = true;
+   brw->tes.base.push_constants_dirty = true;
+   brw->gs.base.push_constants_dirty = true;
+   brw->wm.base.push_constants_dirty = true;
 }
 
 void
@@ -197,9 +201,7 @@
    /* If we're just switching between programs with the same URB requirements,
     * skip the rest of the logic.
     */
-   if (!(brw->ctx.NewDriverState & BRW_NEW_CONTEXT) &&
-       !(brw->ctx.NewDriverState & BRW_NEW_URB_SIZE) &&
-       brw->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
+   if (brw->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
        brw->urb.gs_present == gs_present &&
        brw->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
        brw->urb.tess_present == tess_present &&
@@ -224,6 +226,7 @@
 
    BEGIN_BATCH(8);
    for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+      assert(brw->gen != 10 || entry_size[i] % 3);
       OUT_BATCH((_3DSTATE_URB_VS + i) << 16 | (2 - 2));
       OUT_BATCH(entries[i] |
                 ((entry_size[i] - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
@@ -235,7 +238,8 @@
 const struct brw_tracked_state gen7_urb = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_CONTEXT |
+      .brw = BRW_NEW_BLORP |
+             BRW_NEW_CONTEXT |
              BRW_NEW_URB_SIZE |
              BRW_NEW_GS_PROG_DATA |
              BRW_NEW_TCS_PROG_DATA |
diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
deleted file mode 100644
index a3cb454..0000000
--- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "main/fbobject.h"
-#include "main/framebuffer.h"
-#include "main/viewport.h"
-
-static void
-gen7_upload_sf_clip_viewport(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   GLfloat y_scale, y_bias;
-   struct gen7_sf_clip_viewport *vp;
-
-   /* BRW_NEW_VIEWPORT_COUNT */
-   const unsigned viewport_count = brw->clip.viewport_count;
-
-   /* _NEW_BUFFERS */
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   const bool render_to_fbo = _mesa_is_user_fbo(fb);
-   const uint32_t fb_width = _mesa_geometric_width(ctx->DrawBuffer);
-   const uint32_t fb_height = _mesa_geometric_height(ctx->DrawBuffer);
-
-   vp = brw_state_batch(brw,
-                        sizeof(*vp) * viewport_count, 64,
-                        &brw->sf.vp_offset);
-   /* Also assign to clip.vp_offset in case something uses it. */
-   brw->clip.vp_offset = brw->sf.vp_offset;
-
-   /* _NEW_BUFFERS */
-   if (render_to_fbo) {
-      y_scale = 1.0;
-      y_bias = 0.0;
-   } else {
-      y_scale = -1.0;
-      y_bias = (float)fb_height;
-   }
-
-   for (unsigned i = 0; i < viewport_count; i++) {
-      float scale[3], translate[3];
-      _mesa_get_viewport_xform(ctx, i, scale, translate);
-
-      /* _NEW_VIEWPORT */
-      vp[i].viewport.m00 = scale[0];
-      vp[i].viewport.m11 = scale[1] * y_scale;
-      vp[i].viewport.m22 = scale[2];
-      vp[i].viewport.m30 = translate[0];
-      vp[i].viewport.m31 = translate[1] * y_scale + y_bias;
-      vp[i].viewport.m32 = translate[2];
-
-      brw_calculate_guardband_size(devinfo, fb_width, fb_height,
-                                   vp[i].viewport.m00, vp[i].viewport.m11,
-                                   vp[i].viewport.m30, vp[i].viewport.m31,
-                                   &vp[i].guardband.xmin,
-                                   &vp[i].guardband.xmax,
-                                   &vp[i].guardband.ymin,
-                                   &vp[i].guardband.ymax);
-   }
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL << 16 | (2 - 2));
-   OUT_BATCH(brw->sf.vp_offset);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen7_sf_clip_viewport = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_VIEWPORT,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VIEWPORT_COUNT,
-   },
-   .emit = gen7_upload_sf_clip_viewport,
-};
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
deleted file mode 100644
index 623c784..0000000
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-#include "program/prog_parameter.h"
-#include "program/prog_statevars.h"
-#include "intel_batchbuffer.h"
-
-static void
-upload_vs_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   const struct brw_stage_state *stage_state = &brw->vs.base;
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(stage_state->prog_data);
-   uint32_t floating_point_mode = 0;
-   const int max_threads_shift = brw->is_haswell ?
-      HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
-
-   if (!brw->is_haswell && !brw->is_baytrail)
-      gen7_emit_vs_workaround_flush(brw);
-
-   if (prog_data->use_alt_mode)
-      floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT;
-
-   BEGIN_BATCH(6);
-   OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
-   OUT_BATCH(stage_state->prog_offset);
-   OUT_BATCH(floating_point_mode |
-	     ((ALIGN(stage_state->sampler_count, 4)/4) <<
-              GEN6_VS_SAMPLER_COUNT_SHIFT) |
-             ((prog_data->binding_table.size_bytes / 4) <<
-              GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-
-   if (prog_data->total_scratch) {
-      OUT_RELOC(stage_state->scratch_bo,
-		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(stage_state->per_thread_scratch) - 11);
-   } else {
-      OUT_BATCH(0);
-   }
-
-   OUT_BATCH((prog_data->dispatch_grf_start_reg <<
-              GEN6_VS_DISPATCH_START_GRF_SHIFT) |
-	     (vue_prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
-	     (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
-
-   OUT_BATCH(((devinfo->max_vs_threads - 1) << max_threads_shift) |
-	     GEN6_VS_STATISTICS_ENABLE |
-	     GEN6_VS_ENABLE);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen7_vs_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_VS_PROG_DATA,
-   },
-   .emit = upload_vs_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
deleted file mode 100644
index 1c33db4..0000000
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <stdbool.h>
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "compiler/brw_eu_defines.h"
-#include "brw_util.h"
-#include "brw_wm.h"
-#include "program/program.h"
-#include "program/prog_parameter.h"
-#include "program/prog_statevars.h"
-#include "main/framebuffer.h"
-#include "intel_batchbuffer.h"
-
-static void
-upload_wm_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-   bool writes_depth = prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
-   uint32_t dw1, dw2;
-
-   /* _NEW_BUFFERS */
-   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
-
-   dw1 = dw2 = 0;
-   dw1 |= GEN7_WM_STATISTICS_ENABLE;
-   dw1 |= GEN7_WM_LINE_AA_WIDTH_1_0;
-   dw1 |= GEN7_WM_LINE_END_CAP_AA_WIDTH_0_5;
-
-   /* _NEW_LINE */
-   if (ctx->Line.StippleFlag)
-      dw1 |= GEN7_WM_LINE_STIPPLE_ENABLE;
-
-   /* _NEW_POLYGON */
-   if (ctx->Polygon.StippleFlag)
-      dw1 |= GEN7_WM_POLYGON_STIPPLE_ENABLE;
-
-   if (prog_data->uses_src_depth)
-      dw1 |= GEN7_WM_USES_SOURCE_DEPTH;
-
-   if (prog_data->uses_src_w)
-      dw1 |= GEN7_WM_USES_SOURCE_W;
-
-   dw1 |= prog_data->computed_depth_mode << GEN7_WM_COMPUTED_DEPTH_MODE_SHIFT;
-   dw1 |= prog_data->barycentric_interp_modes <<
-      GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT;
-
-   /* _NEW_COLOR, _NEW_MULTISAMPLE _NEW_BUFFERS */
-   /* Enable if the pixel shader kernel generates and outputs oMask.
-    */
-   if (prog_data->uses_kill ||
-       _mesa_is_alpha_test_enabled(ctx) ||
-       _mesa_is_alpha_to_coverage_enabled(ctx) ||
-       prog_data->uses_omask) {
-      dw1 |= GEN7_WM_KILL_ENABLE;
-   }
-
-   /* _NEW_BUFFERS | _NEW_COLOR */
-   if (brw_color_buffer_write_enabled(brw) || writes_depth ||
-       prog_data->has_side_effects || dw1 & GEN7_WM_KILL_ENABLE) {
-      dw1 |= GEN7_WM_DISPATCH_ENABLE;
-   }
-   if (multisampled_fbo) {
-      /* _NEW_MULTISAMPLE */
-      if (ctx->Multisample.Enabled)
-         dw1 |= GEN7_WM_MSRAST_ON_PATTERN;
-      else
-         dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
-
-      if (prog_data->persample_dispatch)
-         dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
-      else
-         dw2 |= GEN7_WM_MSDISPMODE_PERPIXEL;
-   } else {
-      dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
-      dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
-   }
-
-   if (prog_data->uses_sample_mask) {
-      dw1 |= GEN7_WM_USES_INPUT_COVERAGE_MASK;
-   }
-
-   /* BRW_NEW_FS_PROG_DATA */
-   if (prog_data->early_fragment_tests)
-      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
-   else if (prog_data->has_side_effects)
-      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
-
-   /* The "UAV access enable" bits are unnecessary on HSW because they only
-    * seem to have an effect on the HW-assisted coherency mechanism which we
-    * don't need, and the rasterization-related UAV_ONLY flag and the
-    * DISPATCH_ENABLE bit can be set independently from it.
-    * C.f. gen8_upload_ps_extra().
-    *
-    * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | _NEW_COLOR
-    */
-   if (brw->is_haswell &&
-       !(brw_color_buffer_write_enabled(brw) || writes_depth) &&
-       prog_data->has_side_effects)
-      dw2 |= HSW_WM_UAV_ONLY;
-
-   BEGIN_BATCH(3);
-   OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(dw2);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen7_wm_state = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_COLOR |
-               _NEW_LINE |
-               _NEW_MULTISAMPLE |
-               _NEW_POLYGON,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_FS_PROG_DATA,
-   },
-   .emit = upload_wm_state,
-};
-
-static void
-gen7_upload_ps_state(struct brw_context *brw,
-                     const struct brw_stage_state *stage_state,
-                     const struct brw_wm_prog_data *prog_data,
-                     bool enable_dual_src_blend, unsigned sample_mask,
-                     unsigned fast_clear_op)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   uint32_t dw2, dw4, dw5, ksp0, ksp2;
-   const int max_threads_shift = brw->is_haswell ?
-      HSW_PS_MAX_THREADS_SHIFT : IVB_PS_MAX_THREADS_SHIFT;
-
-   dw2 = dw4 = dw5 = ksp2 = 0;
-
-   const unsigned sampler_count =
-      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
-   dw2 |= SET_FIELD(sampler_count, GEN7_PS_SAMPLER_COUNT);
-
-   dw2 |= ((prog_data->base.binding_table.size_bytes / 4) <<
-           GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
-
-   if (prog_data->base.use_alt_mode)
-      dw2 |= GEN7_PS_FLOATING_POINT_MODE_ALT;
-
-   /* Haswell requires the sample mask to be set in this packet as well as
-    * in 3DSTATE_SAMPLE_MASK; the values should match. */
-   /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
-   if (brw->is_haswell)
-      dw4 |= SET_FIELD(sample_mask, HSW_PS_SAMPLE_MASK);
-
-   dw4 |= (devinfo->max_wm_threads - 1) << max_threads_shift;
-
-   if (prog_data->base.nr_params > 0)
-      dw4 |= GEN7_PS_PUSH_CONSTANT_ENABLE;
-
-   /* From the IVB PRM, volume 2 part 1, page 287:
-    * "This bit is inserted in the PS payload header and made available to
-    * the DataPort (either via the message header or via header bypass) to
-    * indicate that oMask data (one or two phases) is included in Render
-    * Target Write messages. If present, the oMask data is used to mask off
-    * samples."
-    */
-   if (prog_data->uses_omask)
-      dw4 |= GEN7_PS_OMASK_TO_RENDER_TARGET;
-
-   /* From the IVB PRM, volume 2 part 1, page 287:
-    * "If the PS kernel does not need the Position XY Offsets to
-    * compute a Position Value, then this field should be programmed
-    * to POSOFFSET_NONE."
-    * "SW Recommendation: If the PS kernel needs the Position Offsets
-    * to compute a Position XY value, this field should match Position
-    * ZW Interpolation Mode to ensure a consistent position.xyzw
-    * computation."
-    * We only require XY sample offsets. So, this recommendation doesn't
-    * look useful at the moment. We might need this in future.
-    */
-   if (prog_data->uses_pos_offset)
-      dw4 |= GEN7_PS_POSOFFSET_SAMPLE;
-   else
-      dw4 |= GEN7_PS_POSOFFSET_NONE;
-
-   /* The hardware wedges if you have this bit set but don't turn on any dual
-    * source blend factors.
-    */
-   if (enable_dual_src_blend)
-      dw4 |= GEN7_PS_DUAL_SOURCE_BLEND_ENABLE;
-
-   /* BRW_NEW_FS_PROG_DATA */
-   if (prog_data->num_varying_inputs != 0)
-      dw4 |= GEN7_PS_ATTRIBUTE_ENABLE;
-
-   dw4 |= fast_clear_op;
-
-   if (prog_data->dispatch_16)
-      dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
-
-   if (prog_data->dispatch_8)
-      dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
-
-   dw5 |= prog_data->base.dispatch_grf_start_reg <<
-          GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
-   dw5 |= prog_data->dispatch_grf_start_reg_2 <<
-          GEN7_PS_DISPATCH_START_GRF_SHIFT_2;
-
-   ksp0 = stage_state->prog_offset;
-   ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
-
-   BEGIN_BATCH(8);
-   OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
-   OUT_BATCH(ksp0);
-   OUT_BATCH(dw2);
-   if (prog_data->base.total_scratch) {
-      OUT_RELOC(brw->wm.base.scratch_bo,
-		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(stage_state->per_thread_scratch) - 11);
-   } else {
-      OUT_BATCH(0);
-   }
-   OUT_BATCH(dw4);
-   OUT_BATCH(dw5);
-   OUT_BATCH(0); /* kernel 1 pointer */
-   OUT_BATCH(ksp2);
-   ADVANCE_BATCH();
-}
-
-static void
-upload_ps_state(struct brw_context *brw)
-{
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-   const struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_FS_PROG_DATA | _NEW_COLOR */
-   const bool enable_dual_src_blend = prog_data->dual_src_blend &&
-                                      (ctx->Color.BlendEnabled & 1) &&
-                                      ctx->Color.Blend[0]._UsesDualSrc;
-   /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
-   const unsigned sample_mask =
-      brw->is_haswell ? gen6_determine_sample_mask(brw) : 0;
-
-   gen7_upload_ps_state(brw, &brw->wm.base, prog_data,
-                        enable_dual_src_blend, sample_mask,
-                        brw->wm.fast_clear_op);
-}
-
-const struct brw_tracked_state gen7_ps_state = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_COLOR |
-               _NEW_MULTISAMPLE,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_FS_PROG_DATA,
-   },
-   .emit = upload_ps_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_blend_state.c b/src/mesa/drivers/dri/i965/gen8_blend_state.c
deleted file mode 100644
index 1fa8ba2..0000000
--- a/src/mesa/drivers/dri/i965/gen8_blend_state.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-#include "brw_wm.h"
-#include "intel_batchbuffer.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "main/glformats.h"
-
-#define blend_factor(x) brw_translate_blend_factor(x)
-#define blend_eqn(x) brw_translate_blend_equation(x)
-
-static void
-gen8_upload_blend_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-
-   /* We need at least one BLEND_STATE written, because we might do
-    * thread dispatch even if _NumColorDrawBuffers is 0 (for example
-    * for computed depth or alpha test), which will do an FB write
-    * with render target 0, which will reference BLEND_STATE[0] for
-    * alpha test enable.
-    */
-   int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
-   if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
-      nr_draw_buffers = 1;
-
-   int size = 4 + 8 * nr_draw_buffers;
-   uint32_t *blend =
-      brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
-   memset(blend, 0, size);
-
-   /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
-    * "If drawbuffer zero is not NONE and the buffer it references has an
-    * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
-    * operations are skipped."
-    */
-   if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
-      /* _NEW_MULTISAMPLE */
-      if (_mesa_is_multisample_enabled(ctx)) {
-         if (ctx->Multisample.SampleAlphaToCoverage) {
-            blend[0] |= GEN8_BLEND_ALPHA_TO_COVERAGE_ENABLE;
-            blend[0] |= GEN8_BLEND_ALPHA_TO_COVERAGE_DITHER_ENABLE;
-         }
-         if (ctx->Multisample.SampleAlphaToOne)
-            blend[0] |= GEN8_BLEND_ALPHA_TO_ONE_ENABLE;
-      }
-
-      /* _NEW_COLOR */
-      if (ctx->Color.AlphaEnabled) {
-         blend[0] |=
-            GEN8_BLEND_ALPHA_TEST_ENABLE |
-            SET_FIELD(intel_translate_compare_func(ctx->Color.AlphaFunc),
-                      GEN8_BLEND_ALPHA_TEST_FUNCTION);
-      }
-
-      if (ctx->Color.DitherFlag) {
-         blend[0] |= GEN8_BLEND_COLOR_DITHER_ENABLE;
-      }
-   }
-
-   for (int i = 0; i < nr_draw_buffers; i++) {
-      /* _NEW_BUFFERS */
-      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
-
-      /* Used for implementing the following bit of GL_EXT_texture_integer:
-       * "Per-fragment operations that require floating-point color
-       *  components, including multisample alpha operations, alpha test,
-       *  blending, and dithering, have no effect when the corresponding
-       *  colors are written to an integer color buffer."
-      */
-      bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
-
-      /* _NEW_COLOR */
-      if (ctx->Color.ColorLogicOpEnabled) {
-         blend[1 + 2*i+1] |=
-            GEN8_BLEND_LOGIC_OP_ENABLE |
-            SET_FIELD(intel_translate_logic_op(ctx->Color.LogicOp),
-                      GEN8_BLEND_LOGIC_OP_FUNCTION);
-      } else if (ctx->Color.BlendEnabled & (1 << i) && !integer &&
-                 !ctx->Color._AdvancedBlendMode) {
-         GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
-         GLenum eqA = ctx->Color.Blend[i].EquationA;
-         GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
-         GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
-         GLenum srcA = ctx->Color.Blend[i].SrcA;
-         GLenum dstA = ctx->Color.Blend[i].DstA;
-
-         if (eqRGB == GL_MIN || eqRGB == GL_MAX)
-            srcRGB = dstRGB = GL_ONE;
-
-         if (eqA == GL_MIN || eqA == GL_MAX)
-            srcA = dstA = GL_ONE;
-
-         /* Due to hardware limitations, the destination may have information
-          * in an alpha channel even when the format specifies no alpha
-          * channel. In order to avoid getting any incorrect blending due to
-          * that alpha channel, coerce the blend factors to values that will
-          * not read the alpha channel, but will instead use the correct
-          * implicit value for alpha.
-          */
-         if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat, GL_TEXTURE_ALPHA_TYPE)) {
-            srcRGB = brw_fix_xRGB_alpha(srcRGB);
-            srcA = brw_fix_xRGB_alpha(srcA);
-            dstRGB = brw_fix_xRGB_alpha(dstRGB);
-            dstA = brw_fix_xRGB_alpha(dstA);
-         }
-
-         blend[1 + 2*i] |=
-            GEN8_BLEND_COLOR_BUFFER_BLEND_ENABLE |
-            SET_FIELD(blend_factor(dstRGB), GEN8_BLEND_DST_BLEND_FACTOR) |
-            SET_FIELD(blend_factor(srcRGB), GEN8_BLEND_SRC_BLEND_FACTOR) |
-            SET_FIELD(blend_factor(dstA), GEN8_BLEND_DST_ALPHA_BLEND_FACTOR) |
-            SET_FIELD(blend_factor(srcA), GEN8_BLEND_SRC_ALPHA_BLEND_FACTOR) |
-            SET_FIELD(blend_eqn(eqRGB), GEN8_BLEND_COLOR_BLEND_FUNCTION) |
-            SET_FIELD(blend_eqn(eqA), GEN8_BLEND_ALPHA_BLEND_FUNCTION);
-
-         if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
-            blend[0] |= GEN8_BLEND_INDEPENDENT_ALPHA_BLEND_ENABLE;
-      }
-
-      /* See section 8.1.6 "Pre-Blend Color Clamping" of the
-       * SandyBridge PRM Volume 2 Part 1 for HW requirements.
-       *
-       * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
-       * clamping in the fragment shader.  For its clamping of
-       * blending, the spec says:
-       *
-       *     "RESOLVED: For fixed-point color buffers, the inputs and
-       *      the result of the blending equation are clamped.  For
-       *      floating-point color buffers, no clamping occurs."
-       *
-       * So, generally, we want clamping to the render target's range.
-       * And, good news, the hardware tables for both pre- and
-       * post-blend color clamping are either ignored, or any are
-       * allowed, or clamping is required but RT range clamping is a
-       * valid option.
-       */
-      blend[1 + 2*i+1] |=
-         GEN8_BLEND_PRE_BLEND_COLOR_CLAMP_ENABLE |
-         GEN8_BLEND_POST_BLEND_COLOR_CLAMP_ENABLE |
-         GEN8_BLEND_COLOR_CLAMP_RANGE_RTFORMAT;
-
-      if (!ctx->Color.ColorMask[i][0])
-         blend[1 + 2*i] |= GEN8_BLEND_WRITE_DISABLE_RED;
-      if (!ctx->Color.ColorMask[i][1])
-         blend[1 + 2*i] |= GEN8_BLEND_WRITE_DISABLE_GREEN;
-      if (!ctx->Color.ColorMask[i][2])
-         blend[1 + 2*i] |= GEN8_BLEND_WRITE_DISABLE_BLUE;
-      if (!ctx->Color.ColorMask[i][3])
-         blend[1 + 2*i] |= GEN8_BLEND_WRITE_DISABLE_ALPHA;
-
-     /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
-      * "If Dual Source Blending is enabled, this bit must be disabled."
-      */
-      WARN_ONCE(ctx->Color.Blend[i]._UsesDualSrc &&
-                _mesa_is_multisample_enabled(ctx) &&
-                ctx->Multisample.SampleAlphaToOne,
-                "HW workaround: disabling alpha to one with dual src "
-                "blending\n");
-      if (ctx->Color.Blend[i]._UsesDualSrc)
-         blend[0] &= ~GEN8_BLEND_ALPHA_TO_ONE_ENABLE;
-   }
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_BLEND_STATE_POINTERS << 16 | (2 - 2));
-   OUT_BATCH(brw->cc.blend_state_offset | 1);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_blend_state = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_COLOR |
-              _NEW_MULTISAMPLE,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_STATE_BASE_ADDRESS,
-   },
-   .emit = gen8_upload_blend_state,
-};
-
-static void
-gen8_upload_ps_blend(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   uint32_t dw1 = 0;
-
-   /* _NEW_BUFFERS */
-   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
-   const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
-
-   /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
-   if (brw_color_buffer_write_enabled(brw))
-      dw1 |= GEN8_PS_BLEND_HAS_WRITEABLE_RT;
-
-   if (!buffer0_is_integer) {
-      /* _NEW_COLOR */
-      if (ctx->Color.AlphaEnabled)
-         dw1 |= GEN8_PS_BLEND_ALPHA_TEST_ENABLE;
-
-      /* _NEW_MULTISAMPLE */
-      if (_mesa_is_multisample_enabled(ctx) &&
-          ctx->Multisample.SampleAlphaToCoverage)
-         dw1 |= GEN8_PS_BLEND_ALPHA_TO_COVERAGE_ENABLE;
-   }
-
-   /* Used for implementing the following bit of GL_EXT_texture_integer:
-    * "Per-fragment operations that require floating-point color
-    *  components, including multisample alpha operations, alpha test,
-    *  blending, and dithering, have no effect when the corresponding
-    *  colors are written to an integer color buffer."
-    *
-    * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
-    * "If drawbuffer zero is not NONE and the buffer it references has an
-    *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
-    *  operations are skipped."
-    */
-   if (rb && !buffer0_is_integer && (ctx->Color.BlendEnabled & 1)) {
-      GLenum eqRGB = ctx->Color.Blend[0].EquationRGB;
-      GLenum eqA = ctx->Color.Blend[0].EquationA;
-      GLenum srcRGB = ctx->Color.Blend[0].SrcRGB;
-      GLenum dstRGB = ctx->Color.Blend[0].DstRGB;
-      GLenum srcA = ctx->Color.Blend[0].SrcA;
-      GLenum dstA = ctx->Color.Blend[0].DstA;
-
-      if (eqRGB == GL_MIN || eqRGB == GL_MAX)
-         srcRGB = dstRGB = GL_ONE;
-
-      if (eqA == GL_MIN || eqA == GL_MAX)
-         srcA = dstA = GL_ONE;
-
-      /* Due to hardware limitations, the destination may have information
-       * in an alpha channel even when the format specifies no alpha
-       * channel. In order to avoid getting any incorrect blending due to
-       * that alpha channel, coerce the blend factors to values that will
-       * not read the alpha channel, but will instead use the correct
-       * implicit value for alpha.
-       */
-      if (!_mesa_base_format_has_channel(rb->_BaseFormat, GL_TEXTURE_ALPHA_TYPE)) {
-         srcRGB = brw_fix_xRGB_alpha(srcRGB);
-         srcA = brw_fix_xRGB_alpha(srcA);
-         dstRGB = brw_fix_xRGB_alpha(dstRGB);
-         dstA = brw_fix_xRGB_alpha(dstA);
-      }
-
-      dw1 |=
-         GEN8_PS_BLEND_COLOR_BUFFER_BLEND_ENABLE |
-         SET_FIELD(blend_factor(dstRGB), GEN8_PS_BLEND_DST_BLEND_FACTOR) |
-         SET_FIELD(blend_factor(srcRGB), GEN8_PS_BLEND_SRC_BLEND_FACTOR) |
-         SET_FIELD(blend_factor(dstA), GEN8_PS_BLEND_DST_ALPHA_BLEND_FACTOR) |
-         SET_FIELD(blend_factor(srcA), GEN8_PS_BLEND_SRC_ALPHA_BLEND_FACTOR);
-
-      if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
-         dw1 |= GEN8_PS_BLEND_INDEPENDENT_ALPHA_BLEND_ENABLE;
-   }
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_PS_BLEND << 16 | (2 - 2));
-   OUT_BATCH(dw1);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_ps_blend = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_COLOR |
-              _NEW_MULTISAMPLE,
-      .brw = BRW_NEW_BLORP |
-             BRW_NEW_CONTEXT |
-             BRW_NEW_FRAGMENT_PROGRAM,
-   },
-   .emit = gen8_upload_ps_blend
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 1e9e709..429c4b0 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -24,7 +24,6 @@
 #include "intel_batchbuffer.h"
 #include "intel_mipmap_tree.h"
 #include "intel_fbo.h"
-#include "intel_resolve_map.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
@@ -68,7 +67,7 @@
              (stencil_mt != NULL && stencil_writable) << 27 |
              (hiz ? 1 : 0) << 22 |
              depthbuffer_format << 18 |
-             (depth_mt ? depth_mt->pitch - 1 : 0));
+             (depth_mt ? depth_mt->surf.row_pitch - 1 : 0));
    if (depth_mt) {
       OUT_RELOC64(depth_mt->bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
@@ -79,7 +78,8 @@
    OUT_BATCH(((width - 1) << 4) | ((height - 1) << 18) | lod);
    OUT_BATCH(((depth - 1) << 21) | (min_array_element << 10) | mocs_wb);
    OUT_BATCH(0);
-   OUT_BATCH(((depth - 1) << 21) | (depth_mt ? depth_mt->qpitch >> 2 : 0));
+   OUT_BATCH(((depth - 1) << 21) |
+              (depth_mt ? depth_mt->surf.array_pitch_el_rows >> 2 : 0));
    ADVANCE_BATCH();
 
    if (!hiz) {
@@ -94,10 +94,10 @@
       assert(depth_mt);
       BEGIN_BATCH(5);
       OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (5 - 2));
-      OUT_BATCH((depth_mt->hiz_buf->aux_base.pitch - 1) | mocs_wb << 25);
-      OUT_RELOC64(depth_mt->hiz_buf->aux_base.bo,
+      OUT_BATCH((depth_mt->hiz_buf->pitch - 1) | mocs_wb << 25);
+      OUT_RELOC64(depth_mt->hiz_buf->bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
-      OUT_BATCH(depth_mt->hiz_buf->aux_base.qpitch >> 2);
+      OUT_BATCH(depth_mt->hiz_buf->qpitch >> 2);
       ADVANCE_BATCH();
    }
 
@@ -112,31 +112,17 @@
    } else {
       BEGIN_BATCH(5);
       OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2));
-      /* The stencil buffer has quirky pitch requirements.  From the Graphics
-       * BSpec: vol2a.11 3D Pipeline Windower > Early Depth/Stencil Processing
-       * > Depth/Stencil Buffer State > 3DSTATE_STENCIL_BUFFER [DevIVB+],
-       * field "Surface Pitch":
-       *
-       *    The pitch must be set to 2x the value computed based on width, as
-       *    the stencil buffer is stored with two rows interleaved.
-       *
-       * (Note that it is not 100% clear whether this intended to apply to
-       * Gen7; the BSpec flags this comment as "DevILK,DevSNB" (which would
-       * imply that it doesn't), however the comment appears on a "DevIVB+"
-       * page (which would imply that it does).  Experiments with the hardware
-       * indicate that it does.
-       */
       OUT_BATCH(HSW_STENCIL_ENABLED | mocs_wb << 22 |
-                (2 * stencil_mt->pitch - 1));
+                (stencil_mt->surf.row_pitch - 1));
       OUT_RELOC64(stencil_mt->bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
-      OUT_BATCH(stencil_mt ? stencil_mt->qpitch >> 2 : 0);
+      OUT_BATCH(stencil_mt->surf.array_pitch_el_rows >> 2);
       ADVANCE_BATCH();
    }
 
    BEGIN_BATCH(3);
    OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2));
-   OUT_BATCH(depth_mt ? depth_mt->depth_clear_value : 0);
+   OUT_BATCH(depth_mt ? depth_mt->fast_clear_color.u32[0] : 0);
    OUT_BATCH(1);
    ADVANCE_BATCH();
 
@@ -190,7 +176,7 @@
       break;
    case GL_TEXTURE_3D:
       assert(mt);
-      depth = MAX2(mt->logical_depth0, 1);
+      depth = mt->surf.logical_level0_px.depth;
       surftype = translate_tex_target(gl_target);
       break;
    case GL_TEXTURE_1D_ARRAY:
@@ -214,13 +200,13 @@
    lod = irb ? irb->mt_level - irb->mt->first_level : 0;
 
    if (mt) {
-      width = mt->logical_width0;
-      height = mt->logical_height0;
+      width = mt->surf.logical_level0_px.width;
+      height = mt->surf.logical_level0_px.height;
    }
 
    emit_depth_packets(brw, depth_mt, brw_depthbuffer_format(brw), surftype,
                       brw_depth_writes_enabled(brw),
-                      stencil_mt, ctx->Stencil._WriteEnabled,
+                      stencil_mt, brw->stencil_write_enabled,
                       hiz, width, height, depth, lod, min_array_element);
 }
 
@@ -288,7 +274,7 @@
     * !3DSTATE_DEPTH_BUFFER::Stencil Buffer Enable ||
     * !3DSTATE_STENCIL_BUFFER::Stencil Buffer Enable
     */
-   const bool stencil_writes_enabled = ctx->Stencil._WriteEnabled;
+   const bool stencil_writes_enabled = brw->stencil_write_enabled;
 
    /* 3DSTATE_PS_EXTRA::Pixel Shader Computed Depth Mode != PSCDEPTH_OFF */
    const bool ps_computes_depth =
@@ -325,8 +311,6 @@
 void
 gen8_write_pma_stall_bits(struct brw_context *brw, uint32_t pma_stall_bits)
 {
-   struct gl_context *ctx = &brw->ctx;
-
    /* If we haven't actually changed the value, bail now to avoid unnecessary
     * pipeline stalls and register writes.
     */
@@ -341,7 +325,7 @@
     * Flush is also necessary.
     */
    const uint32_t render_cache_flush =
-      ctx->Stencil._WriteEnabled ? PIPE_CONTROL_RENDER_TARGET_FLUSH : 0;
+      brw->stencil_write_enabled ? PIPE_CONTROL_RENDER_TARGET_FLUSH : 0;
    brw_emit_pipe_control_flush(brw,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
@@ -391,163 +375,3 @@
    },
    .emit = gen8_emit_pma_stall_workaround
 };
-
-/**
- * Emit packets to perform a depth/HiZ resolve or fast depth/stencil clear.
- *
- * See the "Optimized Depth Buffer Clear and/or Stencil Buffer Clear" section
- * of the hardware documentation for details.
- */
-void
-gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
-              unsigned int level, unsigned int layer, enum blorp_hiz_op op)
-{
-   if (op == BLORP_HIZ_OP_NONE)
-      return;
-
-   /* Disable the PMA stall fix since we're about to do a HiZ operation. */
-   if (brw->gen == 8)
-      gen8_write_pma_stall_bits(brw, 0);
-
-   assert(mt->first_level == 0);
-   assert(mt->logical_depth0 >= 1);
-
-   /* If we're operating on LOD 0, align to 8x4 to meet the alignment
-    * requirements for most HiZ operations.  Otherwise, use the actual size
-    * to allow the hardware to calculate the miplevel offsets correctly.
-    */
-   uint32_t surface_width  = ALIGN(mt->logical_width0,  level == 0 ? 8 : 1);
-   uint32_t surface_height = ALIGN(mt->logical_height0, level == 0 ? 4 : 1);
-
-   /* From the documentation for 3DSTATE_WM_HZ_OP: "3DSTATE_MULTISAMPLE packet
-    * must be used prior to this packet to change the Number of Multisamples.
-    * This packet must not be used to change Number of Multisamples in a
-    * rendering sequence."
-    */
-   if (brw->num_samples != mt->num_samples) {
-      gen8_emit_3dstate_multisample(brw, mt->num_samples);
-      brw->NewGLState |= _NEW_MULTISAMPLE;
-   }
-
-   /* The basic algorithm is:
-    * - If needed, emit 3DSTATE_{DEPTH,HIER_DEPTH,STENCIL}_BUFFER and
-    *   3DSTATE_CLEAR_PARAMS packets to set up the relevant buffers.
-    * - If needed, emit 3DSTATE_DRAWING_RECTANGLE.
-    * - Emit 3DSTATE_WM_HZ_OP with a bit set for the particular operation.
-    * - Do a special PIPE_CONTROL to trigger an implicit rectangle primitive.
-    * - Emit 3DSTATE_WM_HZ_OP with no bits set to return to normal rendering.
-    */
-   emit_depth_packets(brw, mt,
-                      brw_depth_format(brw, mt->format),
-                      BRW_SURFACE_2D,
-                      true, /* depth writes */
-                      NULL, false, /* no stencil for now */
-                      true, /* hiz */
-                      surface_width,
-                      surface_height,
-                      mt->logical_depth0,
-                      level,
-                      layer); /* min_array_element */
-
-   /* Depth buffer clears and HiZ resolves must use an 8x4 aligned rectangle.
-    * Note that intel_miptree_level_enable_hiz disables HiZ for miplevels > 0
-    * which aren't 8x4 aligned, so expanding the size is safe - it'll just
-    * draw into empty padding space.
-    */
-   unsigned rect_width = ALIGN(minify(mt->logical_width0, level), 8);
-   unsigned rect_height = ALIGN(minify(mt->logical_height0, level), 4);
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
-   OUT_BATCH(0);
-   OUT_BATCH(((rect_width - 1) & 0xffff) | ((rect_height - 1) << 16));
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-
-   /* Emit 3DSTATE_WM_HZ_OP to override pipeline state for the particular
-    * resolve or clear operation we want to perform.
-    */
-   uint32_t dw1 = 0;
-
-   switch (op) {
-   case BLORP_HIZ_OP_DEPTH_RESOLVE:
-      dw1 |= GEN8_WM_HZ_DEPTH_RESOLVE;
-      break;
-   case BLORP_HIZ_OP_HIZ_RESOLVE:
-      dw1 |= GEN8_WM_HZ_HIZ_RESOLVE;
-      break;
-   case BLORP_HIZ_OP_DEPTH_CLEAR:
-      dw1 |= GEN8_WM_HZ_DEPTH_CLEAR;
-
-      /* The "Clear Rectangle X Max" (and Y Max) fields are exclusive,
-       * rather than inclusive, and limited to 16383.  This means that
-       * for a 16384x16384 render target, we would miss the last row
-       * or column of pixels along the edge.
-       *
-       * To work around this, we have to set the "Full Surface Depth
-       * and Stencil Clear" bit.  We can do this in all cases because
-       * we always clear the full rectangle anyway.  We'll need to
-       * change this if we ever add scissored clear support.
-       */
-      dw1 |= GEN8_WM_HZ_FULL_SURFACE_DEPTH_CLEAR;
-      break;
-   case BLORP_HIZ_OP_NONE:
-      unreachable("Should not get here.");
-   }
-
-   if (mt->num_samples > 0)
-      dw1 |= SET_FIELD(ffs(mt->num_samples) - 1, GEN8_WM_HZ_NUM_SAMPLES);
-
-   BEGIN_BATCH(5);
-   OUT_BATCH(_3DSTATE_WM_HZ_OP << 16 | (5 - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(0);
-   OUT_BATCH(SET_FIELD(rect_width, GEN8_WM_HZ_CLEAR_RECTANGLE_X_MAX) |
-             SET_FIELD(rect_height, GEN8_WM_HZ_CLEAR_RECTANGLE_Y_MAX));
-   OUT_BATCH(SET_FIELD(0xFFFF, GEN8_WM_HZ_SAMPLE_MASK));
-   ADVANCE_BATCH();
-
-   /* Emit a PIPE_CONTROL with "Post-Sync Operation" set to "Write Immediate
-    * Data", and no other bits set.  This causes 3DSTATE_WM_HZ_OP's state to
-    * take effect, and spawns a rectangle primitive.
-    */
-   brw_emit_pipe_control_write(brw,
-                               PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->workaround_bo, 0, 0);
-
-   /* Emit 3DSTATE_WM_HZ_OP again to disable the state overrides. */
-   BEGIN_BATCH(5);
-   OUT_BATCH(_3DSTATE_WM_HZ_OP << 16 | (5 - 2));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-
-   /*
-    * From the Broadwell PRM, volume 7, "Depth Buffer Clear":
-    *
-    *  Depth buffer clear pass using any of the methods (WM_STATE, 3DSTATE_WM
-    *  or 3DSTATE_WM_HZ_OP) must be followed by a PIPE_CONTROL command with
-    *  DEPTH_STALL bit and Depth FLUSH bits "set" before starting to render.
-    *  DepthStall and DepthFlush are not needed between consecutive depth
-    *  clear passes nor is it required if th e depth clear pass was done with
-    *  "full_surf_clear" bit set in the 3DSTATE_WM_HZ_OP.
-    *
-    *  TODO: Such as the spec says, this could be conditional.
-    */
-   brw_emit_pipe_control_flush(brw, 
-                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                               PIPE_CONTROL_DEPTH_STALL);
-
-   /* Mark this buffer as needing a TC flush, as we've rendered to it. */
-   brw_render_cache_set_add_bo(brw, mt->bo);
-
-   /* We've clobbered all of the depth packets, and the drawing rectangle,
-    * so we need to ensure those packets are re-emitted before the next
-    * primitive.
-    *
-    * Setting _NEW_DEPTH and _NEW_BUFFERS covers it, but is rather overkill.
-    */
-   brw->NewGLState |= _NEW_DEPTH | _NEW_BUFFERS;
-}
diff --git a/src/mesa/drivers/dri/i965/gen8_draw_upload.c b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
deleted file mode 100644
index 32e1447..0000000
--- a/src/mesa/drivers/dri/i965/gen8_draw_upload.c
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "main/bufferobj.h"
-#include "main/context.h"
-#include "main/enums.h"
-#include "main/macros.h"
-
-#include "brw_draw.h"
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_state.h"
-
-#include "intel_batchbuffer.h"
-#include "intel_buffer_objects.h"
-
-#ifndef NDEBUG
-static bool
-is_passthru_format(uint32_t format)
-{
-   switch (format) {
-   case ISL_FORMAT_R64_PASSTHRU:
-   case ISL_FORMAT_R64G64_PASSTHRU:
-   case ISL_FORMAT_R64G64B64_PASSTHRU:
-   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
-      return true;
-   default:
-      return false;
-   }
-}
-#endif
-
-static void
-gen8_emit_vertices(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   bool uses_edge_flag;
-
-   brw_prepare_vertices(brw);
-   brw_prepare_shader_draw_parameters(brw);
-
-   uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
-                     ctx->Polygon.BackMode != GL_FILL);
-
-   const struct brw_vs_prog_data *vs_prog_data =
-      brw_vs_prog_data(brw->vs.base.prog_data);
-
-   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
-      unsigned vue = brw->vb.nr_enabled;
-
-      /* The element for the edge flags must always be last, so we have to
-       * insert the SGVS before it in that case.
-       */
-      if (uses_edge_flag) {
-         assert(vue > 0);
-         vue--;
-      }
-
-      WARN_ONCE(vue >= 33,
-                "Trying to insert VID/IID past 33rd vertex element, "
-                "need to reorder the vertex attrbutes.");
-
-      unsigned dw1 = 0;
-      if (vs_prog_data->uses_vertexid) {
-         dw1 |= GEN8_SGVS_ENABLE_VERTEX_ID |
-                (2 << GEN8_SGVS_VERTEX_ID_COMPONENT_SHIFT) |  /* .z channel */
-                (vue << GEN8_SGVS_VERTEX_ID_ELEMENT_OFFSET_SHIFT);
-      }
-
-      if (vs_prog_data->uses_instanceid) {
-         dw1 |= GEN8_SGVS_ENABLE_INSTANCE_ID |
-                (3 << GEN8_SGVS_INSTANCE_ID_COMPONENT_SHIFT) | /* .w channel */
-                (vue << GEN8_SGVS_INSTANCE_ID_ELEMENT_OFFSET_SHIFT);
-      }
-
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2));
-      OUT_BATCH(dw1);
-      ADVANCE_BATCH();
-
-      BEGIN_BATCH(3);
-      OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
-      OUT_BATCH(vue | GEN8_VF_INSTANCING_ENABLE);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2));
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   /* Normally we don't need an element for the SGVS attribute because the
-    * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an
-    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if
-    * we're using draw parameters then we need an element for the those
-    * values.  Additionally if there is an edge flag element then the SGVS
-    * can't be inserted past that so we need a dummy element to ensure that
-    * the edge flag is the last one.
-    */
-   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
-                                    vs_prog_data->uses_baseinstance ||
-                                    ((vs_prog_data->uses_instanceid ||
-                                      vs_prog_data->uses_vertexid) &&
-                                     uses_edge_flag));
-   const unsigned nr_elements =
-      brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
-
-   /* If the VS doesn't read any inputs (calculating vertex position from
-    * a state variable for some reason, for example), emit a single pad
-    * VERTEX_ELEMENT struct and bail.
-    *
-    * The stale VB state stays in place, but they don't do anything unless
-    * a VE loads from them.
-    */
-   if (nr_elements == 0) {
-      BEGIN_BATCH(3);
-      OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (3 - 2));
-      OUT_BATCH((0 << GEN6_VE0_INDEX_SHIFT) |
-                GEN6_VE0_VALID |
-                (ISL_FORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
-                (0 << BRW_VE0_SRC_OFFSET_SHIFT));
-      OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
-      ADVANCE_BATCH();
-      return;
-   }
-
-   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
-   const bool uses_draw_params =
-      vs_prog_data->uses_basevertex ||
-      vs_prog_data->uses_baseinstance;
-   const unsigned nr_buffers = brw->vb.nr_buffers +
-      uses_draw_params + vs_prog_data->uses_drawid;
-
-   if (nr_buffers) {
-      assert(nr_buffers <= 33);
-
-      BEGIN_BATCH(1 + 4 * nr_buffers);
-      OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
-      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
-         const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
-         EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo,
-                                  buffer->offset,
-                                  buffer->offset + buffer->size,
-                                  buffer->stride, 0 /* unused */);
-      }
-
-      if (uses_draw_params) {
-         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
-                                  brw->draw.draw_params_bo,
-                                  brw->draw.draw_params_offset,
-                                  brw->draw.draw_params_bo->size,
-                                  0 /* stride */,
-                                  0 /* unused */);
-      }
-
-      if (vs_prog_data->uses_drawid) {
-         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers + 1,
-                                  brw->draw.draw_id_bo,
-                                  brw->draw.draw_id_offset,
-                                  brw->draw.draw_id_bo->size,
-                                  0 /* stride */,
-                                  0 /* unused */);
-      }
-      ADVANCE_BATCH();
-   }
-
-   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
-    * presumably for VertexID/InstanceID.
-    */
-   assert(nr_elements <= 34);
-
-   struct brw_vertex_element *gen6_edgeflag_input = NULL;
-
-   BEGIN_BATCH(1 + nr_elements * 2);
-   OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (2 * nr_elements - 1));
-   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
-      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
-      uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
-      uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
-      uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC;
-      uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC;
-
-      /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
-       * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
-       * element which has edge flag enabled."
-       */
-      assert(!(is_passthru_format(format) && uses_edge_flag));
-
-      /* The gen4 driver expects edgeflag to come in as a float, and passes
-       * that float on to the tests in the clipper.  Mesa's current vertex
-       * attribute value for EdgeFlag is stored as a float, which works out.
-       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
-       * integer ubyte.  Just rewrite that to convert to a float.
-       */
-      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
-         /* Gen6+ passes edgeflag as sideband along with the vertex, instead
-          * of in the VUE.  We have to upload it sideband as the last vertex
-          * element according to the B-Spec.
-          */
-         gen6_edgeflag_input = input;
-         continue;
-      }
-
-      switch (input->glarray->Size) {
-      case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
-      case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
-      case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
-      case 3:
-         if (input->glarray->Doubles) {
-            comp3 = BRW_VE1_COMPONENT_STORE_0;
-         } else if (input->glarray->Integer) {
-            comp3 = BRW_VE1_COMPONENT_STORE_1_INT;
-         } else {
-            comp3 = BRW_VE1_COMPONENT_STORE_1_FLT;
-         }
-
-         break;
-      }
-
-      /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
-       *
-       *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
-       *     formats, 64-bit components are stored in the URB without any
-       *     conversion. In this case, vertex elements must be written as 128
-       *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output
-       *     as required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
-       *     component into the URB, Component 1 must be specified as
-       *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE)
-       *     in order to output a 128-bit vertex element, or Components 1-3 must
-       *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
-       *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
-       *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
-       *     element."
-       */
-      if (input->glarray->Doubles && !input->is_dual_slot) {
-         /* Store vertex elements which correspond to double and dvec2 vertex
-          * shader inputs as 128-bit vertex elements, instead of 256-bits.
-          */
-         comp2 = BRW_VE1_COMPONENT_NOSTORE;
-         comp3 = BRW_VE1_COMPONENT_NOSTORE;
-      }
-
-      OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) |
-                GEN6_VE0_VALID |
-                (format << BRW_VE0_FORMAT_SHIFT) |
-                (input->offset << BRW_VE0_SRC_OFFSET_SHIFT));
-
-      OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
-                (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
-                (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
-                (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
-   }
-
-   if (needs_sgvs_element) {
-      if (vs_prog_data->uses_basevertex ||
-          vs_prog_data->uses_baseinstance) {
-         OUT_BATCH(GEN6_VE0_VALID |
-                   brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT |
-                   ISL_FORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT);
-         OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_1_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
-      } else {
-         OUT_BATCH(GEN6_VE0_VALID);
-         OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
-      }
-   }
-
-   if (vs_prog_data->uses_drawid) {
-      OUT_BATCH(GEN6_VE0_VALID |
-                ((brw->vb.nr_buffers + 1) << GEN6_VE0_INDEX_SHIFT) |
-                (ISL_FORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT));
-      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
-   }
-
-   if (gen6_edgeflag_input) {
-      uint32_t format =
-         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
-
-      OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) |
-                GEN6_VE0_VALID |
-                GEN6_VE0_EDGE_FLAG_ENABLE |
-                (format << BRW_VE0_FORMAT_SHIFT) |
-                (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT));
-      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
-   }
-   ADVANCE_BATCH();
-
-   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
-      const struct brw_vertex_element *input = brw->vb.enabled[i];
-      const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
-      unsigned element_index;
-
-      /* The edge flag element is reordered to be the last one in the code
-       * above so we need to compensate for that in the element indices used
-       * below.
-       */
-      if (input == gen6_edgeflag_input)
-         element_index = nr_elements - 1;
-      else
-         element_index = j++;
-
-      BEGIN_BATCH(3);
-      OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
-      OUT_BATCH(element_index |
-                (buffer->step_rate ? GEN8_VF_INSTANCING_ENABLE : 0));
-      OUT_BATCH(buffer->step_rate);
-      ADVANCE_BATCH();
-   }
-
-   if (vs_prog_data->uses_drawid) {
-      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
-      BEGIN_BATCH(3);
-      OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
-      OUT_BATCH(element);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-}
-
-const struct brw_tracked_state gen8_vertices = {
-   .dirty = {
-      .mesa = _NEW_POLYGON,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VERTICES |
-             BRW_NEW_VS_PROG_DATA,
-   },
-   .emit = gen8_emit_vertices,
-};
-
-static void
-gen8_emit_index_buffer(struct brw_context *brw)
-{
-   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
-   uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
-
-   if (index_buffer == NULL)
-      return;
-
-   BEGIN_BATCH(5);
-   OUT_BATCH(CMD_INDEX_BUFFER << 16 | (5 - 2));
-   OUT_BATCH(brw_get_index_type(index_buffer->type) | mocs_wb);
-   OUT_RELOC64(brw->ib.bo, I915_GEM_DOMAIN_VERTEX, 0, 0);
-   OUT_BATCH(brw->ib.size);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_index_buffer = {
-   .dirty = {
-      .mesa = 0,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_INDEX_BUFFER,
-   },
-   .emit = gen8_emit_index_buffer,
-};
-
-static void
-gen8_emit_vf_topology(struct brw_context *brw)
-{
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_VF_TOPOLOGY << 16 | (2 - 2));
-   OUT_BATCH(brw->primitive);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_vf_topology = {
-   .dirty = {
-      .mesa = 0,
-      .brw = BRW_NEW_BLORP |
-             BRW_NEW_PRIMITIVE,
-   },
-   .emit = gen8_emit_vf_topology,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_ds_state.c b/src/mesa/drivers/dri/i965/gen8_ds_state.c
deleted file mode 100644
index ee2f82e..0000000
--- a/src/mesa/drivers/dri/i965/gen8_ds_state.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-
-static void
-gen8_upload_ds_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   const struct brw_stage_state *stage_state = &brw->tes.base;
-   /* BRW_NEW_TESS_PROGRAMS */
-   bool active = brw->tess_eval_program;
-
-   /* BRW_NEW_TES_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(stage_state->prog_data);
-   const struct brw_tes_prog_data *tes_prog_data =
-      brw_tes_prog_data(stage_state->prog_data);
-   const int ds_pkt_len = brw->gen >= 9 ? 11 : 9;
-
-   if (active) {
-      BEGIN_BATCH(ds_pkt_len);
-      OUT_BATCH(_3DSTATE_DS << 16 | (ds_pkt_len - 2));
-      OUT_BATCH(stage_state->prog_offset);
-      OUT_BATCH(0);
-      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
-                          GEN7_DS_SAMPLER_COUNT) |
-                SET_FIELD(prog_data->binding_table.size_bytes / 4,
-                          GEN7_DS_BINDING_TABLE_ENTRY_COUNT));
-      if (prog_data->total_scratch) {
-         OUT_RELOC64(stage_state->scratch_bo,
-                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(stage_state->per_thread_scratch) - 11);
-      } else {
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-      }
-      OUT_BATCH(SET_FIELD(prog_data->dispatch_grf_start_reg,
-                          GEN7_DS_DISPATCH_START_GRF) |
-                SET_FIELD(vue_prog_data->urb_read_length,
-                          GEN7_DS_URB_READ_LENGTH));
-
-      OUT_BATCH(GEN7_DS_ENABLE |
-                GEN7_DS_STATISTICS_ENABLE |
-                (devinfo->max_tes_threads - 1) << HSW_DS_MAX_THREADS_SHIFT |
-                (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ?
-                 GEN7_DS_SIMD8_DISPATCH_ENABLE : 0) |
-                (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ?
-                 GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0));
-      OUT_BATCH(SET_FIELD(vue_prog_data->cull_distance_mask,
-                          GEN8_DS_USER_CULL_DISTANCE));
-
-
-      if (brw->gen >= 9) {
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-      }
-
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(ds_pkt_len);
-      OUT_BATCH(_3DSTATE_DS << 16 | (ds_pkt_len - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-
-      if (brw->gen >= 9) {
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-      }
-
-      ADVANCE_BATCH();
-   }
-
-   brw->tes.enabled = active;
-}
-
-const struct brw_tracked_state gen8_ds_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_TESS_PROGRAMS |
-               BRW_NEW_TES_PROG_DATA,
-   },
-   .emit = gen8_upload_ds_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
deleted file mode 100644
index 2b74f1b..0000000
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright © 2013 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-
-static void
-gen8_upload_gs_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   const struct brw_stage_state *stage_state = &brw->gs.base;
-   /* BRW_NEW_GEOMETRY_PROGRAM */
-   bool active = brw->geometry_program;
-   /* BRW_NEW_GS_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(stage_state->prog_data);
-   const struct brw_gs_prog_data *gs_prog_data =
-      brw_gs_prog_data(stage_state->prog_data);
-
-   if (active) {
-      int urb_entry_write_offset = 1;
-      uint32_t urb_entry_output_length =
-         ((vue_prog_data->vue_map.num_slots + 1) / 2 - urb_entry_write_offset);
-
-      if (urb_entry_output_length == 0)
-         urb_entry_output_length = 1;
-
-      BEGIN_BATCH(10);
-      OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2));
-      OUT_BATCH(stage_state->prog_offset);
-      OUT_BATCH(0);
-      OUT_BATCH(gs_prog_data->vertices_in |
-                ((ALIGN(stage_state->sampler_count, 4)/4) <<
-                 GEN6_GS_SAMPLER_COUNT_SHIFT) |
-                ((prog_data->binding_table.size_bytes / 4) <<
-                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-
-      if (prog_data->total_scratch) {
-         OUT_RELOC64(stage_state->scratch_bo,
-                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(stage_state->per_thread_scratch) - 11);
-      } else {
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-      }
-
-      /* DW6 */
-      OUT_BATCH(((gs_prog_data->output_vertex_size_hwords * 2 - 1) <<
-                 GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT) |
-                (gs_prog_data->output_topology <<
-                 GEN7_GS_OUTPUT_TOPOLOGY_SHIFT) |
-                (vue_prog_data->include_vue_handles ?
-                 GEN7_GS_INCLUDE_VERTEX_HANDLES : 0) |
-                (vue_prog_data->urb_read_length <<
-                 GEN6_GS_URB_READ_LENGTH_SHIFT) |
-                (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) |
-                (prog_data->dispatch_grf_start_reg <<
-                 GEN6_GS_DISPATCH_START_GRF_SHIFT));
-
-      uint32_t dw7 = (gs_prog_data->control_data_header_size_hwords <<
-                      GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
-                     SET_FIELD(vue_prog_data->dispatch_mode,
-                               GEN7_GS_DISPATCH_MODE) |
-                     ((gs_prog_data->invocations - 1) <<
-                      GEN7_GS_INSTANCE_CONTROL_SHIFT) |
-                      GEN6_GS_STATISTICS_ENABLE |
-                      (gs_prog_data->include_primitive_id ?
-                       GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) |
-                      GEN7_GS_REORDER_TRAILING |
-                      GEN7_GS_ENABLE;
-      uint32_t dw8 = gs_prog_data->control_data_format <<
-                     HSW_GS_CONTROL_DATA_FORMAT_SHIFT;
-
-      if (gs_prog_data->static_vertex_count != -1) {
-         dw8 |= GEN8_GS_STATIC_OUTPUT |
-                SET_FIELD(gs_prog_data->static_vertex_count,
-                          GEN8_GS_STATIC_VERTEX_COUNT);
-      }
-
-      if (brw->gen < 9)
-         dw7 |= (devinfo->max_gs_threads / 2 - 1) << HSW_GS_MAX_THREADS_SHIFT;
-      else
-         dw8 |= devinfo->max_gs_threads - 1;
-
-      /* DW7 */
-      OUT_BATCH(dw7);
-
-      /* DW8 */
-      OUT_BATCH(dw8);
-
-      /* DW9 */
-      OUT_BATCH(vue_prog_data->cull_distance_mask |
-                (urb_entry_output_length << GEN8_GS_URB_OUTPUT_LENGTH_SHIFT) |
-                (urb_entry_write_offset <<
-                 GEN8_GS_URB_ENTRY_OUTPUT_OFFSET_SHIFT));
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(10);
-      OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2));
-      OUT_BATCH(0); /* prog_bo */
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0); /* scratch space base offset */
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(GEN6_GS_STATISTICS_ENABLE);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-}
-
-const struct brw_tracked_state gen8_gs_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_GEOMETRY_PROGRAM |
-               BRW_NEW_GS_PROG_DATA,
-   },
-   .emit = gen8_upload_gs_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_hs_state.c b/src/mesa/drivers/dri/i965/gen8_hs_state.c
deleted file mode 100644
index ee47e5e..0000000
--- a/src/mesa/drivers/dri/i965/gen8_hs_state.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-
-static void
-gen8_upload_hs_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   const struct brw_stage_state *stage_state = &brw->tcs.base;
-   /* BRW_NEW_TESS_PROGRAMS */
-   bool active = brw->tess_eval_program;
-   /* BRW_NEW_TCS_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_tcs_prog_data *tcs_prog_data =
-      brw_tcs_prog_data(stage_state->prog_data);
-
-   if (active) {
-      BEGIN_BATCH(9);
-      OUT_BATCH(_3DSTATE_HS << 16 | (9 - 2));
-      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
-                          GEN7_HS_SAMPLER_COUNT) |
-                SET_FIELD(prog_data->binding_table.size_bytes / 4,
-                          GEN7_HS_BINDING_TABLE_ENTRY_COUNT));
-      OUT_BATCH(GEN7_HS_ENABLE |
-                GEN7_HS_STATISTICS_ENABLE |
-                (devinfo->max_tcs_threads - 1) << GEN8_HS_MAX_THREADS_SHIFT |
-                SET_FIELD(tcs_prog_data->instances - 1,
-                          GEN7_HS_INSTANCE_COUNT));
-      OUT_BATCH(stage_state->prog_offset);
-      OUT_BATCH(0);
-      if (prog_data->total_scratch) {
-         OUT_RELOC64(stage_state->scratch_bo,
-                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(stage_state->per_thread_scratch) - 11);
-      } else {
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-      }
-      OUT_BATCH(GEN7_HS_INCLUDE_VERTEX_HANDLES |
-                SET_FIELD(prog_data->dispatch_grf_start_reg,
-                          GEN7_HS_DISPATCH_START_GRF));
-      OUT_BATCH(0); /* MBZ */
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(9);
-      OUT_BATCH(_3DSTATE_HS << 16 | (9 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-   brw->tcs.enabled = active;
-}
-
-const struct brw_tracked_state gen8_hs_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_TCS_PROG_DATA |
-               BRW_NEW_TESS_PROGRAMS,
-   },
-   .emit = gen8_upload_hs_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_multisample_state.c b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
index e36d037..7a31a5d 100644
--- a/src/mesa/drivers/dri/i965/gen8_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
@@ -69,21 +69,3 @@
    OUT_BATCH(brw_multisample_positions_1x_2x);
    ADVANCE_BATCH();
 }
-
-
-static void
-upload_multisample_state(struct brw_context *brw)
-{
-   gen8_emit_3dstate_multisample(brw, brw->num_samples);
-   gen6_emit_3dstate_sample_mask(brw, gen6_determine_sample_mask(brw));
-}
-
-const struct brw_tracked_state gen8_multisample_state = {
-   .dirty = {
-      .mesa = _NEW_MULTISAMPLE,
-      .brw = BRW_NEW_BLORP |
-             BRW_NEW_CONTEXT |
-             BRW_NEW_NUM_SAMPLES,
-   },
-   .emit = upload_multisample_state
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
deleted file mode 100644
index 0346826..0000000
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <stdbool.h>
-#include "program/program.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_wm.h"
-#include "intel_batchbuffer.h"
-
-void
-gen8_upload_ps_extra(struct brw_context *brw,
-                     const struct brw_wm_prog_data *prog_data)
-{
-   struct gl_context *ctx = &brw->ctx;
-   uint32_t dw1 = 0;
-
-   dw1 |= GEN8_PSX_PIXEL_SHADER_VALID;
-   dw1 |= prog_data->computed_depth_mode << GEN8_PSX_COMPUTED_DEPTH_MODE_SHIFT;
-
-   if (prog_data->uses_kill)
-      dw1 |= GEN8_PSX_KILL_ENABLE;
-
-   if (prog_data->num_varying_inputs != 0)
-      dw1 |= GEN8_PSX_ATTRIBUTE_ENABLE;
-
-   if (prog_data->uses_src_depth)
-      dw1 |= GEN8_PSX_USES_SOURCE_DEPTH;
-
-   if (prog_data->uses_src_w)
-      dw1 |= GEN8_PSX_USES_SOURCE_W;
-
-   if (prog_data->persample_dispatch)
-      dw1 |= GEN8_PSX_SHADER_IS_PER_SAMPLE;
-
-   /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
-   if (prog_data->uses_sample_mask) {
-      if (brw->gen >= 9) {
-         if (prog_data->post_depth_coverage)
-            dw1 |= BRW_PCICMS_DEPTH << GEN9_PSX_SHADER_NORMAL_COVERAGE_MASK_SHIFT;
-         else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
-            dw1 |= BRW_PSICMS_INNER << GEN9_PSX_SHADER_NORMAL_COVERAGE_MASK_SHIFT;
-         else
-            dw1 |= BRW_PSICMS_NORMAL << GEN9_PSX_SHADER_NORMAL_COVERAGE_MASK_SHIFT;
-      }
-      else {
-         dw1 |= GEN8_PSX_SHADER_USES_INPUT_COVERAGE_MASK;
-      }
-   }
-
-   if (prog_data->uses_omask)
-      dw1 |= GEN8_PSX_OMASK_TO_RENDER_TARGET;
-
-   if (brw->gen >= 9 && prog_data->pulls_bary)
-      dw1 |= GEN9_PSX_SHADER_PULLS_BARY;
-
-   /* The stricter cross-primitive coherency guarantees that the hardware
-    * gives us with the "Accesses UAV" bit set for at least one shader stage
-    * and the "UAV coherency required" bit set on the 3DPRIMITIVE command are
-    * redundant within the current image, atomic counter and SSBO GL APIs,
-    * which all have very loose ordering and coherency requirements and
-    * generally rely on the application to insert explicit barriers when a
-    * shader invocation is expected to see the memory writes performed by the
-    * invocations of some previous primitive.  Regardless of the value of "UAV
-    * coherency required", the "Accesses UAV" bits will implicitly cause an in
-    * most cases useless DC flush when the lowermost stage with the bit set
-    * finishes execution.
-    *
-    * It would be nice to disable it, but in some cases we can't because on
-    * Gen8+ it also has an influence on rasterization via the PS UAV-only
-    * signal (which could be set independently from the coherency mechanism in
-    * the 3DSTATE_WM command on Gen7), and because in some cases it will
-    * determine whether the hardware skips execution of the fragment shader or
-    * not via the ThreadDispatchEnable signal.  However if we know that
-    * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
-    * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
-    * difference so we may just disable it here.
-    *
-    * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
-    * take into account KillPixels when no depth or stencil writes are enabled.
-    * In order for occlusion queries to work correctly with no attachments, we
-    * need to force-enable here.
-    *
-    * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR
-    */
-   if ((prog_data->has_side_effects || prog_data->uses_kill) &&
-       !brw_color_buffer_write_enabled(brw))
-      dw1 |= GEN8_PSX_SHADER_HAS_UAV;
-
-   if (prog_data->computed_stencil) {
-      assert(brw->gen >= 9);
-      dw1 |= GEN9_PSX_SHADER_COMPUTES_STENCIL;
-   }
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_PS_EXTRA << 16 | (2 - 2));
-   OUT_BATCH(dw1);
-   ADVANCE_BATCH();
-}
-
-static void
-upload_ps_extra(struct brw_context *brw)
-{
-   /* BRW_NEW_FS_PROG_DATA */
-   gen8_upload_ps_extra(brw, brw_wm_prog_data(brw->wm.base.prog_data));
-}
-
-const struct brw_tracked_state gen8_ps_extra = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS | _NEW_COLOR,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_FRAGMENT_PROGRAM |
-               BRW_NEW_FS_PROG_DATA |
-               BRW_NEW_CONSERVATIVE_RASTERIZATION,
-   },
-   .emit = upload_ps_extra,
-};
-
-static void
-upload_wm_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   uint32_t dw1 = 0;
-
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *wm_prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-
-   dw1 |= GEN7_WM_STATISTICS_ENABLE;
-   dw1 |= GEN7_WM_LINE_AA_WIDTH_1_0;
-   dw1 |= GEN7_WM_LINE_END_CAP_AA_WIDTH_0_5;
-   dw1 |= GEN7_WM_POINT_RASTRULE_UPPER_RIGHT;
-
-   /* _NEW_LINE */
-   if (ctx->Line.StippleFlag)
-      dw1 |= GEN7_WM_LINE_STIPPLE_ENABLE;
-
-   /* _NEW_POLYGON */
-   if (ctx->Polygon.StippleFlag)
-      dw1 |= GEN7_WM_POLYGON_STIPPLE_ENABLE;
-
-   dw1 |= wm_prog_data->barycentric_interp_modes <<
-      GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT;
-
-   /* BRW_NEW_FS_PROG_DATA */
-   if (wm_prog_data->early_fragment_tests)
-      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
-   else if (wm_prog_data->has_side_effects)
-      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_WM << 16 | (2 - 2));
-   OUT_BATCH(dw1);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_wm_state = {
-   .dirty = {
-      .mesa  = _NEW_LINE |
-               _NEW_POLYGON,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_FS_PROG_DATA,
-   },
-   .emit = upload_wm_state,
-};
-
-void
-gen8_upload_ps_state(struct brw_context *brw,
-                     const struct brw_stage_state *stage_state,
-                     const struct brw_wm_prog_data *prog_data,
-                     uint32_t fast_clear_op)
-{
-   uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0;
-
-   /* Initialize the execution mask with VMask.  Otherwise, derivatives are
-    * incorrect for subspans where some of the pixels are unlit.  We believe
-    * the bit just didn't take effect in previous generations.
-    */
-   dw3 |= GEN7_PS_VECTOR_MASK_ENABLE;
-
-   const unsigned sampler_count =
-      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
-   dw3 |= SET_FIELD(sampler_count, GEN7_PS_SAMPLER_COUNT);
-
-   /* BRW_NEW_FS_PROG_DATA */
-   dw3 |=
-      ((prog_data->base.binding_table.size_bytes / 4) <<
-       GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
-
-   if (prog_data->base.use_alt_mode)
-      dw3 |= GEN7_PS_FLOATING_POINT_MODE_ALT;
-
-   /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
-    * it implicitly scales for different GT levels (which have some # of PSDs).
-    *
-    * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
-    */
-   if (brw->gen >= 9)
-      dw6 |= (64 - 1) << HSW_PS_MAX_THREADS_SHIFT;
-   else
-      dw6 |= (64 - 2) << HSW_PS_MAX_THREADS_SHIFT;
-
-   if (prog_data->base.nr_params > 0)
-      dw6 |= GEN7_PS_PUSH_CONSTANT_ENABLE;
-
-   /* From the documentation for this packet:
-    * "If the PS kernel does not need the Position XY Offsets to
-    *  compute a Position Value, then this field should be programmed
-    *  to POSOFFSET_NONE."
-    *
-    * "SW Recommendation: If the PS kernel needs the Position Offsets
-    *  to compute a Position XY value, this field should match Position
-    *  ZW Interpolation Mode to ensure a consistent position.xyzw
-    *  computation."
-    *
-    * We only require XY sample offsets. So, this recommendation doesn't
-    * look useful at the moment. We might need this in future.
-    */
-   if (prog_data->uses_pos_offset)
-      dw6 |= GEN7_PS_POSOFFSET_SAMPLE;
-   else
-      dw6 |= GEN7_PS_POSOFFSET_NONE;
-
-   dw6 |= fast_clear_op;
-
-   if (prog_data->dispatch_8)
-      dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
-
-   if (prog_data->dispatch_16)
-      dw6 |= GEN7_PS_16_DISPATCH_ENABLE;
-
-   dw7 |= prog_data->base.dispatch_grf_start_reg <<
-          GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
-   dw7 |= prog_data->dispatch_grf_start_reg_2 <<
-          GEN7_PS_DISPATCH_START_GRF_SHIFT_2;
-
-   ksp0 = stage_state->prog_offset;
-   ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
-
-   BEGIN_BATCH(12);
-   OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2));
-   OUT_BATCH(ksp0);
-   OUT_BATCH(0);
-   OUT_BATCH(dw3);
-   if (prog_data->base.total_scratch) {
-      OUT_RELOC64(stage_state->scratch_bo,
-                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  ffs(stage_state->per_thread_scratch) - 11);
-   } else {
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-   }
-   OUT_BATCH(dw6);
-   OUT_BATCH(dw7);
-   OUT_BATCH(0); /* kernel 1 pointer */
-   OUT_BATCH(0);
-   OUT_BATCH(ksp2);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-}
-
-static void
-upload_ps_state(struct brw_context *brw)
-{
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-   gen8_upload_ps_state(brw, &brw->wm.base, prog_data, brw->wm.fast_clear_op);
-}
-
-const struct brw_tracked_state gen8_ps_state = {
-   .dirty = {
-      .mesa  = _NEW_MULTISAMPLE,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_FS_PROG_DATA,
-   },
-   .emit = upload_ps_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
deleted file mode 100644
index 41e94fb..0000000
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ /dev/null
@@ -1,351 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "compiler/nir/nir.h"
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-#include "main/macros.h"
-#include "main/fbobject.h"
-#include "intel_batchbuffer.h"
-
-static void
-upload_sbe(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_FS_PROG_DATA */
-   const struct brw_wm_prog_data *wm_prog_data =
-      brw_wm_prog_data(brw->wm.base.prog_data);
-   uint32_t num_outputs = wm_prog_data->num_varying_inputs;
-   uint16_t attr_overrides[VARYING_SLOT_MAX];
-   uint32_t urb_entry_read_length;
-   uint32_t urb_entry_read_offset;
-   uint32_t point_sprite_enables;
-   int sbe_cmd_length;
-
-   uint32_t dw1 =
-      GEN7_SBE_SWIZZLE_ENABLE |
-      num_outputs << GEN7_SBE_NUM_OUTPUTS_SHIFT;
-   uint32_t dw4 = 0;
-   uint32_t dw5 = 0;
-
-   /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-
-   /* _NEW_POINT
-    *
-    * Window coordinates in an FBO are inverted, which means point
-    * sprite origin must be inverted.
-    */
-   if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
-      dw1 |= GEN6_SF_POINT_SPRITE_LOWERLEFT;
-   else
-      dw1 |= GEN6_SF_POINT_SPRITE_UPPERLEFT;
-
-   /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
-    * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
-    * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
-    * BRW_NEW_VUE_MAP_GEOM_OUT
-    */
-   calculate_attr_overrides(brw, attr_overrides,
-                            &point_sprite_enables,
-                            &urb_entry_read_length,
-                            &urb_entry_read_offset);
-
-   /* Typically, the URB entry read length and offset should be programmed in
-    * 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active stage
-    * which produces geometry.  However, we don't know the proper value until
-    * we call calculate_attr_overrides().
-    *
-    * To fit with our existing code, we override the inherited values and
-    * specify it here directly, as we did on previous generations.
-    */
-   dw1 |=
-      urb_entry_read_length << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
-      urb_entry_read_offset << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT |
-      GEN8_SBE_FORCE_URB_ENTRY_READ_LENGTH |
-      GEN8_SBE_FORCE_URB_ENTRY_READ_OFFSET;
-
-   if (brw->gen == 8) {
-      sbe_cmd_length = 4;
-   } else {
-      sbe_cmd_length = 6;
-
-      /* prepare the active component dwords */
-      int input_index = 0;
-      for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
-         if (!(brw->fragment_program->info.inputs_read &
-               BITFIELD64_BIT(attr))) {
-            continue;
-         }
-
-         assert(input_index < 32);
-
-         if (input_index < 16)
-            dw4 |= (GEN9_SBE_ACTIVE_COMPONENT_XYZW << (input_index << 1));
-         else
-            dw5 |= (GEN9_SBE_ACTIVE_COMPONENT_XYZW << ((input_index - 16) << 1));
-
-         ++input_index;
-      }
-   }
-   BEGIN_BATCH(sbe_cmd_length);
-   OUT_BATCH(_3DSTATE_SBE << 16 | (sbe_cmd_length - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(point_sprite_enables);
-   OUT_BATCH(wm_prog_data->flat_inputs);
-   if (sbe_cmd_length >= 6) {
-      OUT_BATCH(dw4);
-      OUT_BATCH(dw5);
-   }
-   ADVANCE_BATCH();
-
-   BEGIN_BATCH(11);
-   OUT_BATCH(_3DSTATE_SBE_SWIZ << 16 | (11 - 2));
-
-   /* Output DWords 1 through 8: */
-   for (int i = 0; i < 8; i++) {
-      OUT_BATCH(attr_overrides[i * 2] | attr_overrides[i * 2 + 1] << 16);
-   }
-
-   OUT_BATCH(0); /* wrapshortest enables 0-7 */
-   OUT_BATCH(0); /* wrapshortest enables 8-15 */
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_sbe_state = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_LIGHT |
-               _NEW_POINT |
-               _NEW_POLYGON |
-               _NEW_PROGRAM,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_FRAGMENT_PROGRAM |
-               BRW_NEW_FS_PROG_DATA |
-               BRW_NEW_GS_PROG_DATA |
-               BRW_NEW_TES_PROG_DATA |
-               BRW_NEW_VUE_MAP_GEOM_OUT,
-   },
-   .emit = upload_sbe,
-};
-
-static void
-upload_sf(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   uint32_t dw1 = 0, dw2 = 0, dw3 = 0;
-   float point_size;
-
-   dw1 = GEN6_SF_STATISTICS_ENABLE;
-
-   if (brw->sf.viewport_transform_enable)
-       dw1 |= GEN6_SF_VIEWPORT_TRANSFORM_ENABLE;
-
-   /* _NEW_LINE */
-   uint32_t line_width_u3_7 = brw_get_line_width(brw);
-   if (brw->gen >= 9 || brw->is_cherryview) {
-      dw1 |= line_width_u3_7 << GEN9_SF_LINE_WIDTH_SHIFT;
-   } else {
-      dw2 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT;
-   }
-
-   if (ctx->Line.SmoothFlag) {
-      dw2 |= GEN6_SF_LINE_END_CAP_WIDTH_1_0;
-   }
-
-   /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
-   point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
-
-   /* Clamp to the hardware limits and convert to fixed point */
-   dw3 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3);
-
-   /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
-   if (use_state_point_size(brw))
-      dw3 |= GEN6_SF_USE_STATE_POINT_WIDTH;
-
-   /* _NEW_POINT | _NEW_MULTISAMPLE */
-   if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
-       !ctx->Point.PointSprite) {
-      dw3 |= GEN8_SF_SMOOTH_POINT_ENABLE;
-   }
-
-   dw3 |= GEN6_SF_LINE_AA_MODE_TRUE;
-
-   /* _NEW_LIGHT */
-   if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
-      dw3 |= (2 << GEN6_SF_TRI_PROVOKE_SHIFT) |
-             (2 << GEN6_SF_TRIFAN_PROVOKE_SHIFT) |
-             (1 << GEN6_SF_LINE_PROVOKE_SHIFT);
-   } else {
-      dw3 |= (1 << GEN6_SF_TRIFAN_PROVOKE_SHIFT);
-   }
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_SF << 16 | (4 - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(dw2);
-   OUT_BATCH(dw3);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_sf_state = {
-   .dirty = {
-      .mesa  = _NEW_LIGHT |
-               _NEW_PROGRAM |
-               _NEW_LINE |
-               _NEW_MULTISAMPLE |
-               _NEW_POINT,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_VUE_MAP_GEOM_OUT,
-   },
-   .emit = upload_sf,
-};
-
-static void
-upload_raster(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   uint32_t dw1 = 0;
-
-   /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(brw->ctx.DrawBuffer);
-
-   /* _NEW_POLYGON */
-   if (ctx->Polygon._FrontBit == render_to_fbo)
-      dw1 |= GEN8_RASTER_FRONT_WINDING_CCW;
-
-   if (ctx->Polygon.CullFlag) {
-      switch (ctx->Polygon.CullFaceMode) {
-      case GL_FRONT:
-         dw1 |= GEN8_RASTER_CULL_FRONT;
-         break;
-      case GL_BACK:
-         dw1 |= GEN8_RASTER_CULL_BACK;
-         break;
-      case GL_FRONT_AND_BACK:
-         dw1 |= GEN8_RASTER_CULL_BOTH;
-         break;
-      default:
-         unreachable("not reached");
-      }
-   } else {
-      dw1 |= GEN8_RASTER_CULL_NONE;
-   }
-
-   /* _NEW_POINT */
-   if (ctx->Point.SmoothFlag)
-      dw1 |= GEN8_RASTER_SMOOTH_POINT_ENABLE;
-
-   if (_mesa_is_multisample_enabled(ctx))
-      dw1 |= GEN8_RASTER_API_MULTISAMPLE_ENABLE;
-
-   if (ctx->Polygon.OffsetFill)
-      dw1 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_SOLID;
-
-   if (ctx->Polygon.OffsetLine)
-      dw1 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_WIREFRAME;
-
-   if (ctx->Polygon.OffsetPoint)
-      dw1 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_POINT;
-
-   switch (ctx->Polygon.FrontMode) {
-   case GL_FILL:
-      dw1 |= GEN6_SF_FRONT_SOLID;
-      break;
-   case GL_LINE:
-      dw1 |= GEN6_SF_FRONT_WIREFRAME;
-      break;
-   case GL_POINT:
-      dw1 |= GEN6_SF_FRONT_POINT;
-      break;
-
-   default:
-      unreachable("not reached");
-   }
-
-   switch (ctx->Polygon.BackMode) {
-   case GL_FILL:
-      dw1 |= GEN6_SF_BACK_SOLID;
-      break;
-   case GL_LINE:
-      dw1 |= GEN6_SF_BACK_WIREFRAME;
-      break;
-   case GL_POINT:
-      dw1 |= GEN6_SF_BACK_POINT;
-      break;
-   default:
-      unreachable("not reached");
-   }
-
-   /* _NEW_LINE */
-   if (ctx->Line.SmoothFlag)
-      dw1 |= GEN8_RASTER_LINE_AA_ENABLE;
-
-   /* _NEW_SCISSOR */
-   if (ctx->Scissor.EnableFlags)
-      dw1 |= GEN8_RASTER_SCISSOR_ENABLE;
-
-   /* _NEW_TRANSFORM */
-   if (!ctx->Transform.DepthClamp) {
-      if (brw->gen >= 9) {
-         dw1 |= GEN9_RASTER_VIEWPORT_Z_NEAR_CLIP_TEST_ENABLE |
-                GEN9_RASTER_VIEWPORT_Z_FAR_CLIP_TEST_ENABLE;
-      } else {
-         dw1 |= GEN8_RASTER_VIEWPORT_Z_CLIP_TEST_ENABLE;
-      }
-   }
-
-   /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
-   if (ctx->IntelConservativeRasterization) {
-      if (brw->gen >= 9)
-         dw1 |= GEN9_RASTER_CONSERVATIVE_RASTERIZATION_ENABLE;
-   }
-
-   BEGIN_BATCH(5);
-   OUT_BATCH(_3DSTATE_RASTER << 16 | (5 - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH_F(ctx->Polygon.OffsetUnits * 2); /* constant.  copied from gen4 */
-   OUT_BATCH_F(ctx->Polygon.OffsetFactor); /* scale */
-   OUT_BATCH_F(ctx->Polygon.OffsetClamp); /* global depth offset clamp */
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_raster_state = {
-   .dirty = {
-      .mesa  = _NEW_BUFFERS |
-               _NEW_LINE |
-               _NEW_MULTISAMPLE |
-               _NEW_POINT |
-               _NEW_POLYGON |
-               _NEW_SCISSOR |
-               _NEW_TRANSFORM,
-      .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_CONSERVATIVE_RASTERIZATION,
-   },
-   .emit = upload_raster,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c b/src/mesa/drivers/dri/i965/gen8_sol_state.c
deleted file mode 100644
index 6866539..0000000
--- a/src/mesa/drivers/dri/i965/gen8_sol_state.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-/**
- * @file gen8_sol_state.c
- *
- * Controls the stream output logic (SOL) stage of the gen8 hardware, which is
- * used to implement GL_EXT_transform_feedback.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "intel_buffer_objects.h"
-#include "main/transformfeedback.h"
-
-void
-gen8_upload_3dstate_so_buffers(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   struct gl_transform_feedback_object *xfb_obj =
-      ctx->TransformFeedback.CurrentObject;
-   struct brw_transform_feedback_object *brw_obj =
-      (struct brw_transform_feedback_object *) xfb_obj;
-   uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
-
-   /* Set up the up to 4 output buffers.  These are the ranges defined in the
-    * gl_transform_feedback_object.
-    */
-   for (int i = 0; i < 4; i++) {
-      struct intel_buffer_object *bufferobj =
-         intel_buffer_object(xfb_obj->Buffers[i]);
-
-      if (!bufferobj) {
-         BEGIN_BATCH(8);
-         OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2));
-         OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT));
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         ADVANCE_BATCH();
-         continue;
-      }
-
-      uint32_t start = xfb_obj->Offset[i];
-      assert(start % 4 == 0);
-      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
-      struct brw_bo *bo =
-         intel_bufferobj_buffer(brw, bufferobj, start, end - start);
-      assert(end <= bo->size);
-
-      BEGIN_BATCH(8);
-      OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2));
-      OUT_BATCH(GEN8_SO_BUFFER_ENABLE | (i << SO_BUFFER_INDEX_SHIFT) |
-                GEN8_SO_BUFFER_OFFSET_WRITE_ENABLE |
-                GEN8_SO_BUFFER_OFFSET_ADDRESS_ENABLE |
-                (mocs_wb << 22));
-      OUT_RELOC64(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start);
-      OUT_BATCH(xfb_obj->Size[i] / 4 - 1);
-      OUT_RELOC64(brw_obj->offset_bo,
-                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                  i * sizeof(uint32_t));
-      if (brw_obj->zero_offsets)
-         OUT_BATCH(0); /* Zero out the offset and write that to offset_bo */
-      else
-         OUT_BATCH(0xFFFFFFFF); /* Use offset_bo as the "Stream Offset." */
-      ADVANCE_BATCH();
-   }
-   brw_obj->zero_offsets = false;
-}
diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
deleted file mode 100644
index ffb1426..0000000
--- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "main/fbobject.h"
-#include "main/framebuffer.h"
-#include "main/viewport.h"
-
-static void
-gen8_upload_sf_clip_viewport(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   float y_scale, y_bias;
-
-   /* BRW_NEW_VIEWPORT_COUNT */
-   const unsigned viewport_count = brw->clip.viewport_count;
-
-   /* _NEW_BUFFERS */
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   const bool render_to_fbo = _mesa_is_user_fbo(fb);
-   const uint32_t fb_width = _mesa_geometric_width(ctx->DrawBuffer);
-   const uint32_t fb_height = _mesa_geometric_height(ctx->DrawBuffer);
-
-   float *vp = brw_state_batch(brw,
-                               16 * 4 * viewport_count,
-                               64, &brw->sf.vp_offset);
-   /* Also assign to clip.vp_offset in case something uses it. */
-   brw->clip.vp_offset = brw->sf.vp_offset;
-
-   /* _NEW_BUFFERS */
-   if (render_to_fbo) {
-      y_scale = 1.0;
-      y_bias = 0;
-   } else {
-      y_scale = -1.0;
-      y_bias = (float)fb_height;
-   }
-
-   for (unsigned i = 0; i < viewport_count; i++) {
-      float scale[3], translate[3];
-      _mesa_get_viewport_xform(ctx, i, scale, translate);
-
-      /* _NEW_VIEWPORT: Viewport Matrix Elements */
-      vp[0] = scale[0];                        /* m00 */
-      vp[1] = scale[1] * y_scale;              /* m11 */
-      vp[2] = scale[2];                        /* m22 */
-      vp[3] = translate[0];                    /* m30 */
-      vp[4] = translate[1] * y_scale + y_bias; /* m31 */
-      vp[5] = translate[2];                    /* m32 */
-
-      /* Reserved */
-      vp[6] = 0;
-      vp[7] = 0;
-
-      brw_calculate_guardband_size(devinfo, fb_width, fb_height,
-                                   vp[0], vp[1], vp[3], vp[4],
-                                   &vp[8], &vp[9], &vp[10], &vp[11]);
-
-      /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
-       * The hardware will take the intersection of the drawing rectangle,
-       * scissor rectangle, and the viewport extents. We don't need to be
-       * smart, and can therefore just program the viewport extents.
-       */
-      float viewport_Xmax = ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width;
-      float viewport_Ymax = ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height;
-      if (render_to_fbo) {
-         vp[12] = ctx->ViewportArray[i].X;
-         vp[13] = viewport_Xmax - 1;
-         vp[14] = ctx->ViewportArray[i].Y;
-         vp[15] = viewport_Ymax - 1;
-      } else {
-         vp[12] = ctx->ViewportArray[i].X;
-         vp[13] = viewport_Xmax - 1;
-         vp[14] = fb_height - viewport_Ymax;
-         vp[15] = fb_height - ctx->ViewportArray[i].Y - 1;
-      }
-
-      vp += 16;
-   }
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL << 16 | (2 - 2));
-   OUT_BATCH(brw->sf.vp_offset);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_sf_clip_viewport = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_VIEWPORT,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VIEWPORT_COUNT,
-   },
-   .emit = gen8_upload_sf_clip_viewport,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c
deleted file mode 100644
index 7b66da4..0000000
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-#include "program/prog_parameter.h"
-#include "program/prog_statevars.h"
-#include "intel_batchbuffer.h"
-
-static void
-upload_vs_state(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   const struct brw_stage_state *stage_state = &brw->vs.base;
-   uint32_t floating_point_mode = 0;
-
-   /* BRW_NEW_VS_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(stage_state->prog_data);
-
-   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
-          vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
-
-   if (prog_data->use_alt_mode)
-      floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT;
-
-   BEGIN_BATCH(9);
-   OUT_BATCH(_3DSTATE_VS << 16 | (9 - 2));
-   OUT_BATCH(stage_state->prog_offset);
-   OUT_BATCH(0);
-   OUT_BATCH(floating_point_mode |
-             ((ALIGN(stage_state->sampler_count, 4) / 4) <<
-               GEN6_VS_SAMPLER_COUNT_SHIFT) |
-             ((prog_data->binding_table.size_bytes / 4) <<
-               GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-
-   if (prog_data->total_scratch) {
-      OUT_RELOC64(stage_state->scratch_bo,
-                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  ffs(stage_state->per_thread_scratch) - 11);
-   } else {
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-   }
-
-   OUT_BATCH((prog_data->dispatch_grf_start_reg <<
-              GEN6_VS_DISPATCH_START_GRF_SHIFT) |
-             (vue_prog_data->urb_read_length <<
-              GEN6_VS_URB_READ_LENGTH_SHIFT) |
-             (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
-
-   uint32_t simd8_enable =
-      vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ?
-      GEN8_VS_SIMD8_ENABLE : 0;
-   OUT_BATCH(((devinfo->max_vs_threads - 1) << HSW_VS_MAX_THREADS_SHIFT) |
-             GEN6_VS_STATISTICS_ENABLE |
-             simd8_enable |
-             GEN6_VS_ENABLE);
-
-   OUT_BATCH(vue_prog_data->cull_distance_mask);
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_vs_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_VS_PROG_DATA,
-   },
-   .emit = upload_vs_state,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c b/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c
deleted file mode 100644
index 9a6c9e0..0000000
--- a/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "intel_batchbuffer.h"
-#include "intel_fbo.h"
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_state.h"
-#include "main/stencil.h"
-
-static void
-gen8_upload_wm_depth_stencil(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   uint32_t dw1 = 0, dw2 = 0, dw3 = 0;
-
-   /* _NEW_BUFFERS */
-   struct intel_renderbuffer *depth_irb =
-      intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
-
-   struct gl_stencil_attrib *stencil = &ctx->Stencil;
-
-   /* _NEW_STENCIL | _NEW_BUFFERS */
-   if (stencil->_Enabled) {
-      #define FUNC intel_translate_compare_func
-      #define OP intel_translate_stencil_op
-
-      dw1 |=
-         GEN8_WM_DS_STENCIL_TEST_ENABLE |
-         FUNC(stencil->Function[0]) << GEN8_WM_DS_STENCIL_FUNC_SHIFT |
-         OP(stencil->FailFunc[0])  << GEN8_WM_DS_STENCIL_FAIL_OP_SHIFT |
-         OP(stencil->ZFailFunc[0]) << GEN8_WM_DS_Z_FAIL_OP_SHIFT |
-         OP(stencil->ZPassFunc[0]) << GEN8_WM_DS_Z_PASS_OP_SHIFT;
-
-      if (stencil->_WriteEnabled)
-         dw1 |= GEN8_WM_DS_STENCIL_BUFFER_WRITE_ENABLE;
-
-      dw2 |=
-         SET_FIELD(stencil->WriteMask[0] & 0xff, GEN8_WM_DS_STENCIL_WRITE_MASK) |
-         SET_FIELD(stencil->ValueMask[0] & 0xff, GEN8_WM_DS_STENCIL_TEST_MASK);
-
-      if (stencil->_TestTwoSide) {
-         const int b = stencil->_BackFace;
-
-         dw1 |=
-            GEN8_WM_DS_DOUBLE_SIDED_STENCIL_ENABLE |
-            FUNC(stencil->Function[b]) << GEN8_WM_DS_BF_STENCIL_FUNC_SHIFT |
-            OP(stencil->FailFunc[b]) << GEN8_WM_DS_BF_STENCIL_FAIL_OP_SHIFT |
-            OP(stencil->ZFailFunc[b]) << GEN8_WM_DS_BF_Z_FAIL_OP_SHIFT |
-            OP(stencil->ZPassFunc[b]) << GEN8_WM_DS_BF_Z_PASS_OP_SHIFT;
-
-         dw2 |= SET_FIELD(stencil->WriteMask[b] & 0xff,
-                          GEN8_WM_DS_BF_STENCIL_WRITE_MASK) |
-                SET_FIELD(stencil->ValueMask[b] & 0xff,
-                          GEN8_WM_DS_BF_STENCIL_TEST_MASK);
-      }
-
-      if (brw->gen >= 9) {
-         int stencil_ref  = _mesa_get_stencil_ref(ctx, 0);
-         int backface_ref = _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
-
-         dw3 = SET_FIELD(stencil_ref, GEN9_WM_DS_STENCIL_REF) |
-               SET_FIELD(backface_ref, GEN9_WM_DS_BF_STENCIL_REF);
-      }
-   }
-
-   /* _NEW_DEPTH */
-   if (ctx->Depth.Test && depth_irb) {
-      dw1 |=
-         GEN8_WM_DS_DEPTH_TEST_ENABLE |
-         FUNC(ctx->Depth.Func) << GEN8_WM_DS_DEPTH_FUNC_SHIFT;
-
-      if (brw_depth_writes_enabled(brw))
-         dw1 |= GEN8_WM_DS_DEPTH_BUFFER_WRITE_ENABLE;
-   }
-
-   int pkt_len = brw->gen >= 9 ? 4 : 3;
-
-   BEGIN_BATCH(pkt_len);
-   OUT_BATCH(_3DSTATE_WM_DEPTH_STENCIL << 16 | (pkt_len - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(dw2);
-   if (pkt_len > 3) {
-      OUT_BATCH(dw3);
-   }
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state gen8_wm_depth_stencil = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_DEPTH |
-              _NEW_STENCIL,
-      .brw  = BRW_NEW_BLORP |
-              BRW_NEW_CONTEXT,
-   },
-   .emit = gen8_upload_wm_depth_stencil,
-};
diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
index 00720f8..62d5c4a 100644
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -32,6 +32,10 @@
 
 #include "blorp/blorp_genX_exec.h"
 
+#if GEN_GEN <= 5
+#include "gen4_blorp_exec.h"
+#endif
+
 #include "brw_blorp.h"
 
 static void *
@@ -69,10 +73,10 @@
    struct brw_context *brw = batch->driver_batch;
    struct brw_bo *bo = address.buffer;
 
-   brw_emit_reloc(&brw->batch, ss_offset, bo, address.offset + delta,
-                  address.read_domains, address.write_domain);
+   uint64_t reloc_val =
+      brw_emit_reloc(&brw->batch, ss_offset, bo, address.offset + delta,
+                     address.read_domains, address.write_domain);
 
-   uint64_t reloc_val = bo->offset64 + address.offset + delta;
    void *reloc_ptr = (void *)brw->batch.map + ss_offset;
 #if GEN_GEN >= 8
    *(uint64_t *)reloc_ptr = reloc_val;
@@ -146,6 +150,19 @@
    return data;
 }
 
+#if GEN_GEN >= 8
+static struct blorp_address
+blorp_get_workaround_page(struct blorp_batch *batch)
+{
+   assert(batch->blorp->driver_ctx == batch->driver_batch);
+   struct brw_context *brw = batch->driver_batch;
+
+   return (struct blorp_address) {
+      .buffer = brw->workaround_bo,
+   };
+}
+#endif
+
 static void
 blorp_flush_range(struct blorp_batch *batch, void *start, size_t size)
 {
@@ -155,21 +172,22 @@
 }
 
 static void
-blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size)
+blorp_emit_urb_config(struct blorp_batch *batch,
+                      unsigned vs_entry_size, unsigned sf_entry_size)
 {
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
 
 #if GEN_GEN >= 7
-   if (!(brw->ctx.NewDriverState & (BRW_NEW_CONTEXT | BRW_NEW_URB_SIZE)) &&
-       brw->urb.vsize >= vs_entry_size)
+   if (brw->urb.vsize >= vs_entry_size)
       return;
 
-   brw->ctx.NewDriverState |= BRW_NEW_URB_SIZE;
-
    gen7_upload_urb(brw, vs_entry_size, false, false);
-#else
+#elif GEN_GEN == 6
    gen6_upload_urb(brw, vs_entry_size, false, 0);
+#else
+   /* We calculate it now and emit later. */
+   brw_calculate_urb_fence(brw, 0, vs_entry_size, sf_entry_size);
 #endif
 }
 
@@ -214,7 +232,9 @@
    gen7_l3_state.emit(brw);
 #endif
 
+#if GEN_GEN >= 6
    brw_emit_depth_stall_flushes(brw);
+#endif
 
 #if GEN_GEN == 8
    gen8_write_pma_stall_bits(brw, 0);
@@ -263,8 +283,9 @@
     * rendering tracks for GL.
     */
    brw->ctx.NewDriverState |= BRW_NEW_BLORP;
-   brw->no_depth_or_stencil = false;
-   brw->ib.type = -1;
+   brw->no_depth_or_stencil = !params->depth.enabled &&
+                              !params->stencil.enabled;
+   brw->ib.index_size = -1;
 
    if (params->dst.enabled)
       brw_render_cache_set_add_bo(brw, params->dst.addr.buffer);
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
new file mode 100644
index 0000000..ef04603
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -0,0 +1,5648 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+
+#include "common/gen_device_info.h"
+#include "common/gen_sample_positions.h"
+#include "genxml/gen_macros.h"
+
+#include "main/bufferobj.h"
+#include "main/context.h"
+#include "main/enums.h"
+#include "main/macros.h"
+#include "main/state.h"
+
+#include "brw_context.h"
+#if GEN_GEN == 6
+#include "brw_defines.h"
+#endif
+#include "brw_draw.h"
+#include "brw_multisample_state.h"
+#include "brw_state.h"
+#include "brw_wm.h"
+#include "brw_util.h"
+
+#include "intel_batchbuffer.h"
+#include "intel_buffer_objects.h"
+#include "intel_fbo.h"
+
+#include "main/enums.h"
+#include "main/fbobject.h"
+#include "main/framebuffer.h"
+#include "main/glformats.h"
+#include "main/samplerobj.h"
+#include "main/shaderapi.h"
+#include "main/stencil.h"
+#include "main/transformfeedback.h"
+#include "main/varray.h"
+#include "main/viewport.h"
+#include "util/half_float.h"
+
+UNUSED static void *
+emit_dwords(struct brw_context *brw, unsigned n)
+{
+   intel_batchbuffer_begin(brw, n, RENDER_RING);
+   uint32_t *map = brw->batch.map_next;
+   brw->batch.map_next += n;
+   intel_batchbuffer_advance(brw);
+   return map;
+}
+
+struct brw_address {
+   struct brw_bo *bo;
+   uint32_t read_domains;
+   uint32_t write_domain;
+   uint32_t offset;
+};
+
+static uint64_t
+emit_reloc(struct brw_context *brw,
+           void *location, struct brw_address address, uint32_t delta)
+{
+   uint32_t offset = (char *) location - (char *) brw->batch.map;
+
+   return brw_emit_reloc(&brw->batch, offset, address.bo,
+                         address.offset + delta,
+                         address.read_domains,
+                         address.write_domain);
+}
+
+#define __gen_address_type struct brw_address
+#define __gen_user_data struct brw_context
+
+static uint64_t
+__gen_combine_address(struct brw_context *brw, void *location,
+                      struct brw_address address, uint32_t delta)
+{
+   if (address.bo == NULL) {
+      return address.offset + delta;
+   } else {
+      return emit_reloc(brw, location, address, delta);
+   }
+}
+
+static inline struct brw_address
+render_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .read_domains = I915_GEM_DOMAIN_RENDER,
+            .write_domain = I915_GEM_DOMAIN_RENDER,
+   };
+}
+
+static inline struct brw_address
+render_ro_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .read_domains = I915_GEM_DOMAIN_RENDER,
+            .write_domain = 0,
+   };
+}
+
+static inline struct brw_address
+instruction_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
+            .write_domain = I915_GEM_DOMAIN_INSTRUCTION,
+   };
+}
+
+static inline struct brw_address
+instruction_ro_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
+            .write_domain = 0,
+   };
+}
+
+static inline struct brw_address
+vertex_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .read_domains = I915_GEM_DOMAIN_VERTEX,
+            .write_domain = 0,
+   };
+}
+
+#if GEN_GEN == 4
+static inline struct brw_address
+KSP(struct brw_context *brw, uint32_t offset)
+{
+   return instruction_bo(brw->cache.bo, offset);
+}
+
+static inline struct brw_address
+KSP_ro(struct brw_context *brw, uint32_t offset)
+{
+   return instruction_ro_bo(brw->cache.bo, offset);
+}
+#else
+static inline uint32_t
+KSP(struct brw_context *brw, uint32_t offset)
+{
+   return offset;
+}
+
+#define KSP_ro KSP
+
+#endif
+
+#include "genxml/genX_pack.h"
+
+#define _brw_cmd_length(cmd) cmd ## _length
+#define _brw_cmd_length_bias(cmd) cmd ## _length_bias
+#define _brw_cmd_header(cmd) cmd ## _header
+#define _brw_cmd_pack(cmd) cmd ## _pack
+
+#define brw_batch_emit(brw, cmd, name)                  \
+   for (struct cmd name = { _brw_cmd_header(cmd) },     \
+        *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
+        __builtin_expect(_dst != NULL, 1);              \
+        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),   \
+        _dst = NULL)
+
+#define brw_batch_emitn(brw, cmd, n, ...) ({           \
+      uint32_t *_dw = emit_dwords(brw, n);             \
+      struct cmd template = {                          \
+         _brw_cmd_header(cmd),                         \
+         .DWordLength = n - _brw_cmd_length_bias(cmd), \
+         __VA_ARGS__                                   \
+      };                                               \
+      _brw_cmd_pack(cmd)(brw, _dw, &template);         \
+      _dw + 1; /* Array starts at dw[1] */             \
+   })
+
+#define brw_state_emit(brw, cmd, align, offset, name)              \
+   for (struct cmd name = { 0, },                                  \
+        *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4,     \
+                                align, offset);                    \
+        __builtin_expect(_dst != NULL, 1);                         \
+        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
+        _dst = NULL)
+
+/**
+ * Polygon stipple packet
+ */
+static void
+genX(upload_polygon_stipple)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* _NEW_POLYGON */
+   if (!ctx->Polygon.StippleFlag)
+      return;
+
+   brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
+      /* Polygon stipple is provided in OpenGL order, i.e. bottom
+       * row first.  If we're rendering to a window (i.e. the
+       * default frame buffer object, 0), then we need to invert
+       * it to match our pixel layout.  But if we're rendering
+       * to a FBO (i.e. any named frame buffer object), we *don't*
+       * need to invert - we already match the layout.
+       */
+      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+         for (unsigned i = 0; i < 32; i++)
+            poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
+      } else {
+         for (unsigned i = 0; i < 32; i++)
+            poly.PatternRow[i] = ctx->PolygonStipple[i];
+      }
+   }
+}
+
+static const struct brw_tracked_state genX(polygon_stipple) = {
+   .dirty = {
+      .mesa = _NEW_POLYGON |
+              _NEW_POLYGONSTIPPLE,
+      .brw = BRW_NEW_CONTEXT,
+   },
+   .emit = genX(upload_polygon_stipple),
+};
+
+/**
+ * Polygon stipple offset packet
+ */
+static void
+genX(upload_polygon_stipple_offset)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* _NEW_POLYGON */
+   if (!ctx->Polygon.StippleFlag)
+      return;
+
+   brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
+      /* _NEW_BUFFERS
+       *
+       * If we're drawing to a system window we have to invert the Y axis
+       * in order to match the OpenGL pixel coordinate system, and our
+       * offset must be matched to the window position.  If we're drawing
+       * to a user-created FBO then our native pixel coordinate system
+       * works just fine, and there's no window system to worry about.
+       */
+      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+         poly.PolygonStippleYOffset =
+            (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
+      }
+   }
+}
+
+static const struct brw_tracked_state genX(polygon_stipple_offset) = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS |
+              _NEW_POLYGON,
+      .brw = BRW_NEW_CONTEXT,
+   },
+   .emit = genX(upload_polygon_stipple_offset),
+};
+
+/**
+ * Line stipple packet
+ */
+static void
+genX(upload_line_stipple)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   if (!ctx->Line.StippleFlag)
+      return;
+
+   brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
+      line.LineStipplePattern = ctx->Line.StipplePattern;
+
+      line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
+      line.LineStippleRepeatCount = ctx->Line.StippleFactor;
+   }
+}
+
+static const struct brw_tracked_state genX(line_stipple) = {
+   .dirty = {
+      .mesa = _NEW_LINE,
+      .brw = BRW_NEW_CONTEXT,
+   },
+   .emit = genX(upload_line_stipple),
+};
+
+/* Constant single cliprect for framebuffer object or DRI2 drawing */
+static void
+genX(upload_drawing_rect)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   const struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const unsigned int fb_width = _mesa_geometric_width(fb);
+   const unsigned int fb_height = _mesa_geometric_height(fb);
+
+   brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+      rect.ClippedDrawingRectangleXMax = fb_width - 1;
+      rect.ClippedDrawingRectangleYMax = fb_height - 1;
+   }
+}
+
+static const struct brw_tracked_state genX(drawing_rect) = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS,
+      .brw = BRW_NEW_BLORP |
+             BRW_NEW_CONTEXT,
+   },
+   .emit = genX(upload_drawing_rect),
+};
+
+static uint32_t *
+genX(emit_vertex_buffer_state)(struct brw_context *brw,
+                               uint32_t *dw,
+                               unsigned buffer_nr,
+                               struct brw_bo *bo,
+                               unsigned start_offset,
+                               unsigned end_offset,
+                               unsigned stride,
+                               unsigned step_rate)
+{
+   struct GENX(VERTEX_BUFFER_STATE) buf_state = {
+      .VertexBufferIndex = buffer_nr,
+      .BufferPitch = stride,
+      .BufferStartingAddress = vertex_bo(bo, start_offset),
+#if GEN_GEN >= 8
+      .BufferSize = end_offset - start_offset,
+#endif
+
+#if GEN_GEN >= 7
+      .AddressModifyEnable = true,
+#endif
+
+#if GEN_GEN < 8
+      .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
+      .InstanceDataStepRate = step_rate,
+#if GEN_GEN >= 5
+      .EndAddress = vertex_bo(bo, end_offset - 1),
+#endif
+#endif
+
+#if GEN_GEN == 10
+      .VertexBufferMOCS = CNL_MOCS_WB,
+#elif GEN_GEN == 9
+      .VertexBufferMOCS = SKL_MOCS_WB,
+#elif GEN_GEN == 8
+      .VertexBufferMOCS = BDW_MOCS_WB,
+#elif GEN_GEN == 7
+      .VertexBufferMOCS = GEN7_MOCS_L3,
+#endif
+   };
+
+   GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
+   return dw + GENX(VERTEX_BUFFER_STATE_length);
+}
+
+UNUSED static bool
+is_passthru_format(uint32_t format)
+{ /* True only for the four ISL *64*_PASSTHRU formats listed below. */
+   switch (format) {
+   case ISL_FORMAT_R64_PASSTHRU:
+   case ISL_FORMAT_R64G64_PASSTHRU:
+   case ISL_FORMAT_R64G64B64_PASSTHRU:
+   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
+      return true;
+   default:
+      return false;
+   }
+}
+
+UNUSED static int
+uploads_needed(uint32_t format)
+{ /* Number of vertex elements (uploads) a format takes: 1 or 2. */
+   if (!is_passthru_format(format))
+      return 1;
+
+   switch (format) {
+   case ISL_FORMAT_R64_PASSTHRU:
+   case ISL_FORMAT_R64G64_PASSTHRU:
+      return 1;
+   case ISL_FORMAT_R64G64B64_PASSTHRU:
+   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
+      return 2; /* split across two uploads */
+   default:
+      unreachable("not reached");
+   }
+}
+
+/*
+ * Returns the format actually used when uploading a vertex element. Only the
+ * *64*_PASSTHRU formats change: the caller uses this on gen < 8, where they
+ * have to be split into one or two *32*_FLOAT uploads.
+ *
+ * @upload is the index of the current upload; valid values are 0 and 1.
+ */
+static uint32_t
+downsize_format_if_needed(uint32_t format,
+                          int upload)
+{
+   assert(upload == 0 || upload == 1);
+
+   if (!is_passthru_format(format))
+      return format;
+
+   switch (format) {
+   case ISL_FORMAT_R64_PASSTHRU:
+      return ISL_FORMAT_R32G32_FLOAT;
+   case ISL_FORMAT_R64G64_PASSTHRU:
+      return ISL_FORMAT_R32G32B32A32_FLOAT;
+   case ISL_FORMAT_R64G64B64_PASSTHRU: /* 4 floats first, then 2 */
+      return !upload ? ISL_FORMAT_R32G32B32A32_FLOAT
+                     : ISL_FORMAT_R32G32_FLOAT;
+   case ISL_FORMAT_R64G64B64A64_PASSTHRU: /* 4 floats in both uploads */
+      return ISL_FORMAT_R32G32B32A32_FLOAT;
+   default:
+      unreachable("not reached");
+   }
+}
+
+/*
+ * Returns the number of components of a format that results from a 64 to 32
+ * format split. See downsize_format_if_needed().
+ */
+static int
+upload_format_size(uint32_t upload_format)
+{
+   switch (upload_format) {
+   case ISL_FORMAT_R32G32_FLOAT:
+      return 2;
+   case ISL_FORMAT_R32G32B32A32_FLOAT:
+      return 4;
+   default: /* only the two split formats above are expected here */
+      unreachable("not reached");
+   }
+}
+
+static void
+genX(emit_vertices)(struct brw_context *brw)
+{ /* Emits 3DSTATE_VERTEX_BUFFERS/_ELEMENTS (+ SGVS/instancing on gen8+). */
+   uint32_t *dw;
+
+   brw_prepare_vertices(brw);
+   brw_prepare_shader_draw_parameters(brw);
+
+#if GEN_GEN < 6
+   brw_emit_query_begin(brw);
+#endif /* GEN_GEN < 6 */
+
+   const struct brw_vs_prog_data *vs_prog_data =
+      brw_vs_prog_data(brw->vs.base.prog_data);
+
+#if GEN_GEN >= 8
+   struct gl_context *ctx = &brw->ctx;
+   const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
+                                ctx->Polygon.BackMode != GL_FILL);
+
+   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
+      unsigned vue = brw->vb.nr_enabled;
+
+      /* The element for the edge flags must always be last, so we have to
+       * insert the SGVS before it in that case.
+       */
+      if (uses_edge_flag) {
+         assert(vue > 0);
+         vue--;
+      }
+
+      WARN_ONCE(vue >= 33,
+                "Trying to insert VID/IID past 33rd vertex element, "
+                "need to reorder the vertex attrbutes.");
+
+      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
+         if (vs_prog_data->uses_vertexid) {
+            vfs.VertexIDEnable = true;
+            vfs.VertexIDComponentNumber = 2;
+            vfs.VertexIDElementOffset = vue;
+         }
+
+         if (vs_prog_data->uses_instanceid) {
+            vfs.InstanceIDEnable = true;
+            vfs.InstanceIDComponentNumber = 3;
+            vfs.InstanceIDElementOffset = vue;
+         }
+      }
+
+      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
+         vfi.InstancingEnable = true;
+         vfi.VertexElementIndex = vue;
+      }
+   } else {
+      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
+   }
+
+   /* Normally we don't need an element for the SGVS attribute because the
+    * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an
+    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if
+    * we're using draw parameters then we need an element for those
+    * values.  Additionally if there is an edge flag element then the SGVS
+    * can't be inserted past that so we need a dummy element to ensure that
+    * the edge flag is the last one.
+    */
+   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
+                                    vs_prog_data->uses_baseinstance ||
+                                    ((vs_prog_data->uses_instanceid ||
+                                      vs_prog_data->uses_vertexid)
+                                     && uses_edge_flag));
+#else /* GEN_GEN < 8: VID/IID always go through a vertex element */
+   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
+                                    vs_prog_data->uses_baseinstance ||
+                                    vs_prog_data->uses_instanceid ||
+                                    vs_prog_data->uses_vertexid);
+#endif /* GEN_GEN >= 8 */
+   unsigned nr_elements =
+      brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
+
+#if GEN_GEN < 8
+   /* If any of the formats of vb.enabled needs more than one upload, we need
+    * to add it to nr_elements
+    */
+   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
+      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
+
+      if (uploads_needed(format) > 1)
+         nr_elements++;
+   }
+#endif /* GEN_GEN < 8 */
+
+   /* If the VS doesn't read any inputs (calculating vertex position from
+    * a state variable for some reason, for example), emit a single pad
+    * VERTEX_ELEMENT struct and bail.
+    *
+    * The stale VB state stays in place, but they don't do anything unless
+    * a VE loads from them.
+    */
+   if (nr_elements == 0) {
+      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
+                           1 + GENX(VERTEX_ELEMENT_STATE_length));
+      struct GENX(VERTEX_ELEMENT_STATE) elem = {
+         .Valid = true,
+         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+         .Component0Control = VFCOMP_STORE_0,
+         .Component1Control = VFCOMP_STORE_0,
+         .Component2Control = VFCOMP_STORE_0,
+         .Component3Control = VFCOMP_STORE_1_FP,
+      };
+      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
+      return;
+   }
+
+   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
+   const bool uses_draw_params =
+      vs_prog_data->uses_basevertex ||
+      vs_prog_data->uses_baseinstance;
+   const unsigned nr_buffers = brw->vb.nr_buffers +
+      uses_draw_params + vs_prog_data->uses_drawid;
+
+   if (nr_buffers) {
+      assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
+
+      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
+                           1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);
+
+      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
+         const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
+         /* Prior to Haswell and Bay Trail we have to use 4-component formats
+          * to fake 3-component ones.  In particular, we do this for
+          * half-float and 8 and 16-bit integer formats.  This means that the
+          * vertex element may poke over the end of the buffer by 2 bytes.
+          */
+         const unsigned padding =
+            (GEN_GEN <= 7 && !brw->is_baytrail && !brw->is_haswell) * 2;
+         const unsigned end = buffer->offset + buffer->size + padding;
+         dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
+                                             buffer->offset,
+                                             end,
+                                             buffer->stride,
+                                             buffer->step_rate);
+      }
+
+      if (uses_draw_params) {
+         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
+                                             brw->draw.draw_params_bo,
+                                             brw->draw.draw_params_offset,
+                                             brw->draw.draw_params_bo->size,
+                                             0 /* stride */,
+                                             0 /* step rate */);
+      }
+
+      if (vs_prog_data->uses_drawid) {
+         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
+                                             brw->draw.draw_id_bo,
+                                             brw->draw.draw_id_offset,
+                                             brw->draw.draw_id_bo->size,
+                                             0 /* stride */,
+                                             0 /* step rate */);
+      }
+   }
+
+   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
+    * presumably for VertexID/InstanceID.
+    */
+#if GEN_GEN >= 6
+   assert(nr_elements <= 34);
+   const struct brw_vertex_element *gen6_edgeflag_input = NULL;
+#else /* GEN_GEN < 6 */
+   assert(nr_elements <= 18);
+#endif /* GEN_GEN >= 6 */
+
+   dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
+                        1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
+   unsigned i; /* also read after the loop for the gen < 5 element offsets */
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      const struct brw_vertex_element *input = brw->vb.enabled[i];
+      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
+      uint32_t comp0 = VFCOMP_STORE_SRC;
+      uint32_t comp1 = VFCOMP_STORE_SRC;
+      uint32_t comp2 = VFCOMP_STORE_SRC;
+      uint32_t comp3 = VFCOMP_STORE_SRC;
+      const unsigned num_uploads = GEN_GEN < 8 ? uploads_needed(format) : 1;
+
+#if GEN_GEN >= 8
+      /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
+       * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
+       * element which has edge flag enabled."
+       */
+      assert(!(is_passthru_format(format) && uses_edge_flag));
+#endif /* GEN_GEN >= 8 */
+
+      /* The gen4 driver expects edgeflag to come in as a float, and passes
+       * that float on to the tests in the clipper.  Mesa's current vertex
+       * attribute value for EdgeFlag is stored as a float, which works out.
+       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
+       * integer ubyte.  Just rewrite that to convert to a float.
+       *
+       * Gen6+ passes edgeflag as sideband along with the vertex, instead
+       * of in the VUE.  We have to upload it sideband as the last vertex
+       * element according to the B-Spec.
+       */
+#if GEN_GEN >= 6
+      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
+         gen6_edgeflag_input = input;
+         continue;
+      }
+#endif /* GEN_GEN >= 6 */
+
+      for (unsigned c = 0; c < num_uploads; c++) {
+         const uint32_t upload_format = GEN_GEN >= 8 ? format :
+            downsize_format_if_needed(format, c);
+         /* If we need more than one upload, the offset stride would be 128
+          * bits (16 bytes), as for previous uploads we are using the full
+          * entry. */
+         const unsigned offset = input->offset + c * 16;
+
+         const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
+            upload_format_size(upload_format) : input->glarray->Size;
+
+         switch (size) {
+            case 0: comp0 = VFCOMP_STORE_0; /* fallthrough */
+            case 1: comp1 = VFCOMP_STORE_0; /* fallthrough */
+            case 2: comp2 = VFCOMP_STORE_0; /* fallthrough */
+            case 3:
+               if (GEN_GEN >= 8 && input->glarray->Doubles) {
+                  comp3 = VFCOMP_STORE_0;
+               } else if (input->glarray->Integer) {
+                  comp3 = VFCOMP_STORE_1_INT;
+               } else {
+                  comp3 = VFCOMP_STORE_1_FP;
+               }
+
+               break;
+         }
+
+#if GEN_GEN >= 8
+         /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
+          *
+          *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
+          *     formats, 64-bit components are stored in the URB without any
+          *     conversion. In this case, vertex elements must be written as 128
+          *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
+          *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
+          *     component into the URB, Component 1 must be specified as
+          *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
+          *     order to output a 128-bit vertex element, or Components 1-3 must
+          *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
+          *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
+          *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
+          *     vertex element."
+          */
+         if (input->glarray->Doubles && !input->is_dual_slot) {
+            /* Store vertex elements which correspond to double and dvec2 vertex
+             * shader inputs as 128-bit vertex elements, instead of 256-bits.
+             */
+            comp2 = VFCOMP_NOSTORE;
+            comp3 = VFCOMP_NOSTORE;
+         }
+#endif /* GEN_GEN >= 8 */
+
+         struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
+            .VertexBufferIndex = input->buffer,
+            .Valid = true,
+            .SourceElementFormat = upload_format,
+            .SourceElementOffset = offset,
+            .Component0Control = comp0,
+            .Component1Control = comp1,
+            .Component2Control = comp2,
+            .Component3Control = comp3,
+#if GEN_GEN < 5
+            .DestinationElementOffset = i * 4,
+#endif /* GEN_GEN < 5 */
+         };
+
+         GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
+         dw += GENX(VERTEX_ELEMENT_STATE_length);
+      }
+   }
+
+   if (needs_sgvs_element) {
+      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
+         .Valid = true,
+         .Component0Control = VFCOMP_STORE_0,
+         .Component1Control = VFCOMP_STORE_0,
+         .Component2Control = VFCOMP_STORE_0,
+         .Component3Control = VFCOMP_STORE_0,
+#if GEN_GEN < 5
+         .DestinationElementOffset = i * 4,
+#endif /* GEN_GEN < 5 */
+      };
+
+#if GEN_GEN >= 8
+      if (vs_prog_data->uses_basevertex ||
+          vs_prog_data->uses_baseinstance) {
+         elem_state.VertexBufferIndex = brw->vb.nr_buffers;
+         elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
+         elem_state.Component0Control = VFCOMP_STORE_SRC;
+         elem_state.Component1Control = VFCOMP_STORE_SRC;
+      }
+#else /* GEN_GEN < 8: this element also stores VertexID/InstanceID */
+      elem_state.VertexBufferIndex = brw->vb.nr_buffers;
+      elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
+      if (vs_prog_data->uses_basevertex)
+         elem_state.Component0Control = VFCOMP_STORE_SRC;
+
+      if (vs_prog_data->uses_baseinstance)
+         elem_state.Component1Control = VFCOMP_STORE_SRC;
+
+      if (vs_prog_data->uses_vertexid)
+         elem_state.Component2Control = VFCOMP_STORE_VID;
+
+      if (vs_prog_data->uses_instanceid)
+         elem_state.Component3Control = VFCOMP_STORE_IID;
+#endif /* GEN_GEN >= 8 */
+
+      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
+      dw += GENX(VERTEX_ELEMENT_STATE_length);
+   }
+
+   if (vs_prog_data->uses_drawid) {
+      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
+         .Valid = true,
+         .VertexBufferIndex = brw->vb.nr_buffers + 1,
+         .SourceElementFormat = ISL_FORMAT_R32_UINT,
+         .Component0Control = VFCOMP_STORE_SRC,
+         .Component1Control = VFCOMP_STORE_0,
+         .Component2Control = VFCOMP_STORE_0,
+         .Component3Control = VFCOMP_STORE_0,
+#if GEN_GEN < 5
+         .DestinationElementOffset = i * 4,
+#endif /* GEN_GEN < 5 */
+      };
+
+      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
+      dw += GENX(VERTEX_ELEMENT_STATE_length);
+   }
+
+#if GEN_GEN >= 6
+   if (gen6_edgeflag_input) {
+      const uint32_t format =
+         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
+
+      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
+         .Valid = true,
+         .VertexBufferIndex = gen6_edgeflag_input->buffer,
+         .EdgeFlagEnable = true,
+         .SourceElementFormat = format,
+         .SourceElementOffset = gen6_edgeflag_input->offset,
+         .Component0Control = VFCOMP_STORE_SRC,
+         .Component1Control = VFCOMP_STORE_0,
+         .Component2Control = VFCOMP_STORE_0,
+         .Component3Control = VFCOMP_STORE_0,
+      };
+
+      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
+      dw += GENX(VERTEX_ELEMENT_STATE_length);
+   }
+#endif /* GEN_GEN >= 6 */
+
+#if GEN_GEN >= 8
+   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
+      const struct brw_vertex_element *input = brw->vb.enabled[i];
+      const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
+      unsigned element_index;
+
+      /* The edge flag element is reordered to be the last one in the code
+       * above so we need to compensate for that in the element indices used
+       * below.
+       */
+      if (input == gen6_edgeflag_input)
+         element_index = nr_elements - 1;
+      else
+         element_index = j++;
+
+      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
+         vfi.VertexElementIndex = element_index;
+         vfi.InstancingEnable = buffer->step_rate != 0;
+         vfi.InstanceDataStepRate = buffer->step_rate;
+      }
+   }
+
+   if (vs_prog_data->uses_drawid) {
+      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
+
+      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
+         vfi.VertexElementIndex = element;
+      }
+   }
+#endif /* GEN_GEN >= 8 */
+}
+
+static const struct brw_tracked_state genX(vertices) = {
+   .dirty = {
+      .mesa = _NEW_POLYGON, /* presumably for Polygon.FrontMode/BackMode */
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_VERTICES |
+             BRW_NEW_VS_PROG_DATA,
+   },
+   .emit = genX(emit_vertices),
+};
+
+static void
+genX(emit_index_buffer)(struct brw_context *brw)
+{ /* Emits 3DSTATE_INDEX_BUFFER for brw->ib; does nothing without one. */
+   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
+
+   if (index_buffer == NULL)
+      return;
+
+   brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
+#if GEN_GEN < 8 && !GEN_IS_HASWELL
+      ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
+#endif /* GEN_GEN < 8 && !GEN_IS_HASWELL */
+      ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
+      ib.BufferStartingAddress = vertex_bo(brw->ib.bo, 0);
+#if GEN_GEN >= 8
+      ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+      ib.BufferSize = brw->ib.size;
+#else /* GEN_GEN < 8: inclusive end address instead of a size */
+      ib.BufferEndingAddress = vertex_bo(brw->ib.bo, brw->ib.size - 1);
+#endif /* GEN_GEN >= 8 */
+   }
+}
+
+static const struct brw_tracked_state genX(index_buffer) = {
+   .dirty = {
+      .mesa = 0, /* the emit function reads only brw-> fields */
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_INDEX_BUFFER,
+   },
+   .emit = genX(emit_index_buffer),
+};
+
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+static void
+genX(upload_cut_index)(struct brw_context *brw)
+{ /* Emits 3DSTATE_VF; cut index enabled only with restart + index buffer. */
+   const struct gl_context *ctx = &brw->ctx;
+
+   brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
+      if (ctx->Array._PrimitiveRestart && brw->ib.ib) {
+         vf.IndexedDrawCutIndexEnable = true;
+         vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size);
+      }
+   }
+}
+
+const struct brw_tracked_state genX(cut_index) = {
+   .dirty = {
+      .mesa  = _NEW_TRANSFORM, /* NOTE(review): covers _PrimitiveRestart? */
+      .brw   = BRW_NEW_INDEX_BUFFER,
+   },
+   .emit = genX(upload_cut_index),
+};
+#endif
+
+#if GEN_GEN >= 6
+/**
+ * Determine the appropriate attribute override value to store into the
+ * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
+ * override value contains two pieces of information: the location of the
+ * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
+ * flag indicating whether to "swizzle" the attribute based on the direction
+ * the triangle is facing.
+ *
+ * If an attribute is "swizzled", then the given VUE location is used for
+ * front-facing triangles, and the VUE location that immediately follows is
+ * used for back-facing triangles.  We use this to implement the mapping from
+ * gl_FrontColor/gl_BackColor to gl_Color.
+ *
+ * urb_entry_read_offset is the offset, in 256-bit increments, at which the
+ * SF unit begins reading attribute data from the VUE; a nonzero value skips
+ * leading VUE elements that the fragment shader does not need.
+ * *max_source_attr is raised to the highest source attribute (plus one when
+ * swizzling) that this call selects; it is never lowered here.
+ */
+static void
+genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
+                        const struct brw_vue_map *vue_map,
+                        int urb_entry_read_offset, int fs_attr,
+                        bool two_side_color, uint32_t *max_source_attr)
+{
+   /* Find the VUE slot for this attribute. */
+   int slot = vue_map->varying_to_slot[fs_attr];
+
+   /* Viewport and Layer are stored in the VUE header.  We need to override
+    * them to zero if earlier stages didn't write them, as GL requires that
+    * they read back as zero when not explicitly set.
+    */
+   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
+      attr->ComponentOverrideX = true;
+      attr->ComponentOverrideW = true;
+      attr->ConstantSource = CONST_0000;
+
+      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
+         attr->ComponentOverrideY = true;
+      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
+         attr->ComponentOverrideZ = true;
+
+      return;
+   }
+
+   /* If there was only a back color written but not front, use back
+    * as the color instead of undefined
+    */
+   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
+      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
+   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
+      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
+
+   if (slot == -1) {
+      /* This attribute does not exist in the VUE--that means that the vertex
+       * shader did not write to it.  This means that either:
+       *
+       * (a) This attribute is a texture coordinate, and it is going to be
+       * replaced with point coordinates (as a consequence of a call to
+       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
+       * hardware will ignore whatever attribute override we supply.
+       *
+       * (b) This attribute is read by the fragment shader but not written by
+       * the vertex shader, so its value is undefined.  Therefore the
+       * attribute override we supply doesn't matter.
+       *
+       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
+       * previous shader stage.
+       *
+       * Note that we don't have to worry about the cases where the attribute
+       * is gl_PointCoord or is undergoing point sprite coordinate
+       * replacement, because in those cases, this function isn't called.
+       *
+       * In case (c), we need to program the attribute overrides so that the
+       * primitive ID will be stored in this slot.  In every other case, the
+       * attribute override we supply doesn't matter.  So just go ahead and
+       * program primitive ID in every case.
+       */
+      attr->ComponentOverrideW = true;
+      attr->ComponentOverrideX = true;
+      attr->ComponentOverrideY = true;
+      attr->ComponentOverrideZ = true;
+      attr->ConstantSource = PRIM_ID;
+      return;
+   }
+
+   /* Compute the location of the attribute relative to urb_entry_read_offset.
+    * Each increment of urb_entry_read_offset represents a 256-bit value, so
+    * it counts for two 128-bit VUE slots.
+    */
+   int source_attr = slot - 2 * urb_entry_read_offset;
+   assert(source_attr >= 0 && source_attr < 32);
+
+   /* If we are doing two-sided color, and the VUE slot following this one
+    * represents a back-facing color, then we need to instruct the SF unit to
+    * do back-facing swizzling.
+    */
+   bool swizzling = two_side_color &&
+      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
+        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
+       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
+        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
+
+   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
+   if (*max_source_attr < source_attr + swizzling)
+      *max_source_attr = source_attr + swizzling;
+
+   attr->SourceAttribute = source_attr;
+   if (swizzling)
+      attr->SwizzleSelect = INPUTATTR_FACING;
+}
+
+
+static void
+genX(calculate_attr_overrides)(const struct brw_context *brw,
+                               struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
+                               uint32_t *point_sprite_enables,
+                               uint32_t *urb_entry_read_length,
+                               uint32_t *urb_entry_read_offset)
+{ /* Fills attr_overrides[0..15] and writes the three output parameters. */
+   const struct gl_context *ctx = &brw->ctx;
+
+   /* _NEW_POINT */
+   const struct gl_point_attrib *point = &ctx->Point;
+
+   /* BRW_NEW_FS_PROG_DATA */
+   const struct brw_wm_prog_data *wm_prog_data =
+      brw_wm_prog_data(brw->wm.base.prog_data);
+   uint32_t max_source_attr = 0;
+
+   *point_sprite_enables = 0;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM
+    *
+    * If the fragment shader reads VARYING_SLOT_LAYER or VARYING_SLOT_VIEWPORT,
+    * we need to pass in the full vertex header.  Otherwise, we can program the
+    * SF to start reading at an offset of 1 (2 varying slots) to skip:
+    * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
+    * - VARYING_SLOT_{PSIZ,LAYER} and VARYING_SLOT_POS on gen6+
+    */
+
+   bool fs_needs_vue_header = brw->fragment_program->info.inputs_read &
+      (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+
+   *urb_entry_read_offset = fs_needs_vue_header ? 0 : 1;
+
+   /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
+    * description of dw10 Point Sprite Texture Coordinate Enable:
+    *
+    * "This field must be programmed to zero when non-point primitives
+    * are rendered."
+    *
+    * The SandyBridge PRM doesn't explicitly say that point sprite enables
+    * must be programmed to zero when rendering non-point primitives, but
+    * the IvyBridge PRM does, and if we don't, we get garbage.
+    *
+    * This is not required on Haswell, as the hardware ignores this state
+    * when drawing non-points -- although we do still need to be careful to
+    * correctly set the attr overrides.
+    *
+    * _NEW_POLYGON
+    * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
+    */
+   bool drawing_points = brw_is_drawing_points(brw);
+
+   for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
+      int input_index = wm_prog_data->urb_setup[attr];
+
+      if (input_index < 0)
+         continue;
+
+      /* _NEW_POINT */
+      bool point_sprite = false;
+      if (drawing_points) {
+         if (point->PointSprite &&
+             (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
+             (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
+            point_sprite = true;
+         }
+
+         if (attr == VARYING_SLOT_PNTC)
+            point_sprite = true;
+
+         if (point_sprite)
+            *point_sprite_enables |= (1u << input_index); /* unsigned shift */
+      }
+
+      /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
+      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
+
+      if (!point_sprite) {
+         genX(get_attr_override)(&attribute,
+                                 &brw->vue_map_geom_out,
+                                 *urb_entry_read_offset, attr,
+                                 _mesa_vertex_program_two_side_enabled(ctx),
+                                 &max_source_attr);
+      }
+
+      /* The hardware can only apply overrides to 16 attributes at a
+       * time, and the other up to 16 have to be lined up so that the
+       * input index = the output index.  We'll need to do some
+       * tweaking to make sure that's the case.
+       */
+      if (input_index < 16)
+         attr_overrides[input_index] = attribute;
+      else
+         assert(attribute.SourceAttribute == input_index);
+   }
+
+   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
+    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
+    *
+    * "This field should be set to the minimum length required to read the
+    *  maximum source attribute.  The maximum source attribute is indicated
+    *  by the maximum value of the enabled Attribute # Source Attribute if
+    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
+    *  enable is not set.
+    *  read_length = ceiling((max_source_attr + 1) / 2)
+    *
+    *  [errata] Corruption/Hang possible if length programmed larger than
+    *  recommended"
+    *
+    * Similar text exists for Ivy Bridge.
+    */
+   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 8
+typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
+#elif GEN_GEN >= 6
+typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
+#else /* GEN_GEN <= 5 */
+typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
+#endif /* per-gen struct that carries the depth/stencil fields */
+
+static inline void
+set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
+{ /* Fills the depth and stencil fields of *ds from the current GL state. */
+   struct gl_context *ctx = &brw->ctx;
+
+   /* _NEW_BUFFERS */
+   struct intel_renderbuffer *depth_irb =
+      intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
+
+   /* _NEW_DEPTH */
+   struct gl_depthbuffer_attrib *depth = &ctx->Depth;
+
+   /* _NEW_STENCIL */
+   struct gl_stencil_attrib *stencil = &ctx->Stencil;
+   const int b = stencil->_BackFace; /* index of the back-face stencil state */
+
+   if (depth->Test && depth_irb) { /* depth test needs a depth buffer */
+      ds->DepthTestEnable = true;
+      ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
+      ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
+   }
+
+   if (brw->stencil_enabled) {
+      ds->StencilTestEnable = true;
+      ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
+      ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
+
+      ds->StencilTestFunction =
+         intel_translate_compare_func(stencil->Function[0]);
+      ds->StencilFailOp =
+         intel_translate_stencil_op(stencil->FailFunc[0]);
+      ds->StencilPassDepthPassOp =
+         intel_translate_stencil_op(stencil->ZPassFunc[0]);
+      ds->StencilPassDepthFailOp =
+         intel_translate_stencil_op(stencil->ZFailFunc[0]);
+
+      ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
+
+      if (brw->stencil_two_sided) {
+         ds->DoubleSidedStencilEnable = true;
+         ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
+         ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
+
+         ds->BackfaceStencilTestFunction =
+            intel_translate_compare_func(stencil->Function[b]);
+         ds->BackfaceStencilFailOp =
+            intel_translate_stencil_op(stencil->FailFunc[b]);
+         ds->BackfaceStencilPassDepthPassOp =
+            intel_translate_stencil_op(stencil->ZPassFunc[b]);
+         ds->BackfaceStencilPassDepthFailOp =
+            intel_translate_stencil_op(stencil->ZFailFunc[b]);
+      }
+
+#if GEN_GEN <= 5 || GEN_GEN >= 9
+      ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
+      ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
+#endif /* GEN_GEN <= 5 || GEN_GEN >= 9 */
+   }
+}
+
+#if GEN_GEN >= 6
+static void
+genX(upload_depth_stencil_state)(struct brw_context *brw)
+{ /* gen8+: one inline packet; gen6-7: indirect state plus a pointer to it. */
+#if GEN_GEN >= 8
+   brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
+      set_depth_stencil_bits(brw, &wmds);
+   }
+#else /* GEN_GEN 6-7 */
+   uint32_t ds_offset;
+   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
+      set_depth_stencil_bits(brw, &ds);
+   }
+
+   /* Now upload a pointer to the indirect state */
+#if GEN_GEN == 6
+   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+      ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
+      ptr.DEPTH_STENCIL_STATEChange = true;
+   }
+#else /* GEN_GEN == 7 */
+   brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
+      ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
+   }
+#endif /* GEN_GEN == 6 */
+#endif /* GEN_GEN >= 8 */
+}
+
+static const struct brw_tracked_state genX(depth_stencil_state) = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS |
+              _NEW_DEPTH |
+              _NEW_STENCIL,
+      .brw  = BRW_NEW_BLORP | /* gen8+ inline packet, else indirect state */
+              (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
+                            : BRW_NEW_BATCH |
+                              BRW_NEW_STATE_BASE_ADDRESS),
+   },
+   .emit = genX(upload_depth_stencil_state),
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN <= 5
+
+static void
+genX(upload_clip_state)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* Gen4-5: build the CLIP_STATE unit state at brw->clip.state_offset. */
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+   brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
+      clip.KernelStartPointer = KSP_ro(brw, brw->clip.prog_offset);
+      clip.GRFRegisterCount =
+         DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
+      clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+      clip.SingleProgramFlow = true;
+      clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
+      clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;
+
+      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
+      clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
+      clip.DispatchGRFStartRegisterForURBData = 1;
+      clip.VertexURBEntryReadOffset = 0;
+
+      /* BRW_NEW_URB_FENCE */
+      clip.NumberofURBEntries = brw->urb.nr_clip_entries;
+      clip.URBEntryAllocationSize = brw->urb.vsize - 1;
+
+      if (brw->urb.nr_clip_entries >= 10) {
+         /* Half of the URB entries go to each thread, and it has to be an
+          * even number.
+          */
+         assert(brw->urb.nr_clip_entries % 2 == 0);
+
+         /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
+          * only 2 threads can output VUEs at a time.
+          */
+         clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1;
+      } else {
+         assert(brw->urb.nr_clip_entries >= 5);
+         clip.MaximumNumberofThreads = 1 - 1;
+      }
+
+      clip.VertexPositionSpace = VPOS_NDCSPACE;
+      clip.UserClipFlagsMustClipEnable = true;
+      clip.GuardbandClipTestEnable = true;
+
+      clip.ClipperViewportStatePointer =
+         instruction_ro_bo(brw->batch.bo, brw->clip.vp_offset);
+
+      clip.ScreenSpaceViewportXMin = -1;
+      clip.ScreenSpaceViewportXMax = 1;
+      clip.ScreenSpaceViewportYMin = -1;
+      clip.ScreenSpaceViewportYMax = 1;
+
+      clip.ViewportXYClipTestEnable = true;
+      clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
+
+      /* _NEW_TRANSFORM */
+      if (GEN_GEN == 5 || GEN_IS_G4X) {
+         clip.UserClipDistanceClipTestEnableBitmask =
+            ctx->Transform.ClipPlanesEnabled;
+      } else {
+         /* Up to 6 actual clip flags, plus the 7th for the negative RHW
+          * workaround.
+          */
+         clip.UserClipDistanceClipTestEnableBitmask =
+            (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
+      }
+
+      if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
+         clip.APIMode = APIMODE_D3D;
+      else
+         clip.APIMode = APIMODE_OGL;
+
+      clip.GuardbandClipTestEnable = true; /* NOTE(review): repeats the store above */
+
+      clip.ClipMode = brw->clip.prog_data->clip_mode;
+
+#if GEN_IS_G4X
+      clip.NegativeWClipTestEnable = true;
+#endif
+   }
+}
+
+const struct brw_tracked_state genX(clip_state) = {
+   .dirty = { /* NOTE(review): not static, unlike the Gen6+ clip_state */
+      .mesa  = _NEW_TRANSFORM |
+               _NEW_VIEWPORT,
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_CLIP_PROG_DATA |
+               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
+               BRW_NEW_PROGRAM_CACHE |
+               BRW_NEW_URB_FENCE,
+   },
+   .emit = genX(upload_clip_state), /* the Gen4-5 CLIP_STATE emitter */
+};
+
+#else
+
+static void
+genX(upload_clip_state)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* Gen6+: the clipper is programmed with one 3DSTATE_CLIP packet. */
+   /* _NEW_BUFFERS */
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+
+   /* BRW_NEW_FS_PROG_DATA */
+   struct brw_wm_prog_data *wm_prog_data =
+      brw_wm_prog_data(brw->wm.base.prog_data);
+
+   brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
+      clip.StatisticsEnable = !brw->meta_in_progress;
+
+      if (wm_prog_data->barycentric_interp_modes &
+          BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
+         clip.NonPerspectiveBarycentricEnable = true;
+
+#if GEN_GEN >= 7
+      clip.EarlyCullEnable = true;
+#endif
+
+#if GEN_GEN == 7
+      clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
+
+      if (ctx->Polygon.CullFlag) {
+         switch (ctx->Polygon.CullFaceMode) {
+         case GL_FRONT:
+            clip.CullMode = CULLMODE_FRONT;
+            break;
+         case GL_BACK:
+            clip.CullMode = CULLMODE_BACK;
+            break;
+         case GL_FRONT_AND_BACK:
+            clip.CullMode = CULLMODE_BOTH;
+            break;
+         default:
+            unreachable("Should not get here: invalid CullFlag");
+         }
+      } else {
+         clip.CullMode = CULLMODE_NONE;
+      }
+#endif /* GEN_GEN == 7 */
+
+#if GEN_GEN < 8
+      clip.UserClipDistanceCullTestEnableBitmask =
+         brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
+
+      clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
+#endif /* GEN_GEN < 8 */
+
+      /* _NEW_LIGHT */
+      if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
+         clip.TriangleStripListProvokingVertexSelect = 0;
+         clip.TriangleFanProvokingVertexSelect = 1;
+         clip.LineStripListProvokingVertexSelect = 0;
+      } else {
+         clip.TriangleStripListProvokingVertexSelect = 2;
+         clip.TriangleFanProvokingVertexSelect = 2;
+         clip.LineStripListProvokingVertexSelect = 1;
+      }
+
+      /* _NEW_TRANSFORM */
+      clip.UserClipDistanceClipTestEnableBitmask =
+         ctx->Transform.ClipPlanesEnabled;
+
+#if GEN_GEN >= 8
+      clip.ForceUserClipDistanceClipTestEnableBitmask = true;
+#endif
+
+      if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
+         clip.APIMode = APIMODE_D3D;
+      else
+         clip.APIMode = APIMODE_OGL;
+
+      clip.GuardbandClipTestEnable = true;
+
+      /* BRW_NEW_VIEWPORT_COUNT */
+      const unsigned viewport_count = brw->clip.viewport_count;
+
+      if (ctx->RasterDiscard) {
+         clip.ClipMode = CLIPMODE_REJECT_ALL;
+#if GEN_GEN == 6
+         perf_debug("Rasterizer discard is currently implemented via the "
+                    "clipper; having the GS not write primitives would "
+                    "likely be faster.\n");
+#endif
+      } else {
+         clip.ClipMode = CLIPMODE_NORMAL;
+      }
+
+      clip.ClipEnable = true;
+
+      /* _NEW_POLYGON,
+       * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
+       */
+      if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
+         clip.ViewportXYClipTestEnable = true;
+
+      clip.MinimumPointWidth = 0.125; /* same limits upload_sf clamps to */
+      clip.MaximumPointWidth = 255.875;
+      clip.MaximumVPIndex = viewport_count - 1;
+      if (_mesa_geometric_layers(fb) == 0)
+         clip.ForceZeroRTAIndexEnable = true;
+   }
+}
+
+static const struct brw_tracked_state genX(clip_state) = {
+   .dirty = { /* flags paired with the Gen6+ genX(upload_clip_state) */
+      .mesa  = _NEW_BUFFERS |
+               _NEW_LIGHT |
+               _NEW_POLYGON |
+               _NEW_TRANSFORM,
+      .brw   = BRW_NEW_BLORP |
+               BRW_NEW_CONTEXT |
+               BRW_NEW_FS_PROG_DATA |
+               BRW_NEW_GS_PROG_DATA |
+               BRW_NEW_VS_PROG_DATA |
+               BRW_NEW_META_IN_PROGRESS |
+               BRW_NEW_PRIMITIVE |
+               BRW_NEW_RASTERIZER_DISCARD |
+               BRW_NEW_TES_PROG_DATA |
+               BRW_NEW_VIEWPORT_COUNT,
+   },
+   .emit = genX(upload_clip_state), /* the Gen6+ 3DSTATE_CLIP emitter */
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static void
+genX(upload_sf)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   float point_size;
+   /* Emits SF_STATE on Gen4-5 and 3DSTATE_SF on Gen6+, sharing one body. */
+#if GEN_GEN <= 7
+   /* _NEW_BUFFERS */
+   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   UNUSED const bool multisampled_fbo =
+      _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+#endif
+
+#if GEN_GEN < 6
+   const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;
+
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+
+   brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
+      sf.KernelStartPointer = KSP_ro(brw, brw->sf.prog_offset);
+      sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+      sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
+      sf.DispatchGRFStartRegisterForURBData = 3;
+      sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
+      sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
+      sf.NumberofURBEntries = brw->urb.nr_sf_entries;
+      sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
+
+      /* STATE_PREFETCH command description describes this state as being
+       * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
+       * domain.
+       */
+      sf.SetupViewportStateOffset =
+         instruction_ro_bo(brw->batch.bo, brw->sf.vp_offset);
+
+      sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+
+      /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
+      /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */
+
+      sf.MaximumNumberofThreads =
+         MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;
+
+      sf.SpritePointEnable = ctx->Point.PointSprite;
+
+      sf.DestinationOriginHorizontalBias = 0.5;
+      sf.DestinationOriginVerticalBias = 0.5;
+#else /* GEN_GEN >= 6 */
+   brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
+      sf.StatisticsEnable = true;
+#endif /* GEN_GEN < 6; the block opened above is closed once, at the end */
+      sf.ViewportTransformEnable = true;
+
+#if GEN_GEN == 7
+      /* _NEW_BUFFERS */
+      sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
+#endif
+
+#if GEN_GEN <= 7
+      /* _NEW_POLYGON */
+      sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
+#if GEN_GEN >= 6
+      sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
+      sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
+      sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
+
+      switch (ctx->Polygon.FrontMode) {
+         case GL_FILL:
+            sf.FrontFaceFillMode = FILL_MODE_SOLID;
+            break;
+         case GL_LINE:
+            sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
+            break;
+         case GL_POINT:
+            sf.FrontFaceFillMode = FILL_MODE_POINT;
+            break;
+         default:
+            unreachable("not reached");
+      }
+
+      switch (ctx->Polygon.BackMode) {
+         case GL_FILL:
+            sf.BackFaceFillMode = FILL_MODE_SOLID;
+            break;
+         case GL_LINE:
+            sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
+            break;
+         case GL_POINT:
+            sf.BackFaceFillMode = FILL_MODE_POINT;
+            break;
+         default:
+            unreachable("not reached");
+      }
+
+      if (multisampled_fbo && ctx->Multisample.Enabled)
+         sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+
+      sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
+      sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
+      sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
+#endif /* GEN_GEN >= 6 */
+
+      sf.ScissorRectangleEnable = true;
+
+      if (ctx->Polygon.CullFlag) {
+         switch (ctx->Polygon.CullFaceMode) {
+            case GL_FRONT:
+               sf.CullMode = CULLMODE_FRONT;
+               break;
+            case GL_BACK:
+               sf.CullMode = CULLMODE_BACK;
+               break;
+            case GL_FRONT_AND_BACK:
+               sf.CullMode = CULLMODE_BOTH;
+               break;
+            default:
+               unreachable("not reached");
+         }
+      } else {
+         sf.CullMode = CULLMODE_NONE;
+      }
+
+#if GEN_IS_HASWELL
+      sf.LineStippleEnable = ctx->Line.StippleFlag;
+#endif
+
+#endif /* GEN_GEN <= 7 */
+
+      /* _NEW_LINE */
+#if GEN_GEN == 8
+      if (brw->is_cherryview)
+         sf.CHVLineWidth = brw_get_line_width(brw);
+      else
+         sf.LineWidth = brw_get_line_width(brw);
+#else
+      sf.LineWidth = brw_get_line_width(brw);
+#endif
+
+      if (ctx->Line.SmoothFlag) {
+         sf.LineEndCapAntialiasingRegionWidth = _10pixels;
+#if GEN_GEN <= 7
+         sf.AntiAliasingEnable = true;
+#endif
+      }
+
+      /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
+      point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
+      /* Clamp to the hardware limits */
+      sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);
+
+      /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
+      if (use_state_point_size(brw))
+         sf.PointWidthSource = State;
+
+#if GEN_GEN >= 8
+      /* _NEW_POINT | _NEW_MULTISAMPLE */
+      if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
+          !ctx->Point.PointSprite)
+         sf.SmoothPointEnable = true;
+#endif
+
+#if GEN_IS_G4X || GEN_GEN >= 5
+      sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
+#endif
+
+      /* _NEW_LIGHT */
+      if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
+         sf.TriangleStripListProvokingVertexSelect = 2;
+         sf.TriangleFanProvokingVertexSelect = 2;
+         sf.LineStripListProvokingVertexSelect = 1;
+      } else {
+         sf.TriangleFanProvokingVertexSelect = 1;
+      }
+
+#if GEN_GEN == 6
+      /* BRW_NEW_FS_PROG_DATA */
+      const struct brw_wm_prog_data *wm_prog_data =
+         brw_wm_prog_data(brw->wm.base.prog_data);
+
+      sf.AttributeSwizzleEnable = true;
+      sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+
+      /*
+       * Window coordinates in an FBO are inverted, which means point
+       * sprite origin must be inverted, too.
+       */
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
+         sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
+      } else {
+         sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
+      }
+
+      /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
+       * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
+       */
+      uint32_t urb_entry_read_length;
+      uint32_t urb_entry_read_offset;
+      uint32_t point_sprite_enables;
+      genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
+                                     &urb_entry_read_length,
+                                     &urb_entry_read_offset);
+      sf.VertexURBEntryReadLength = urb_entry_read_length;
+      sf.VertexURBEntryReadOffset = urb_entry_read_offset;
+      sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
+      sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
+#endif /* GEN_GEN == 6 */
+   }
+}
+
+static const struct brw_tracked_state genX(sf_state) = {
+   .dirty = { /* per-generation flag sets; 0 where a group does not apply */
+      .mesa  = _NEW_LIGHT |
+               _NEW_LINE |
+               _NEW_POINT |
+               _NEW_PROGRAM |
+               (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
+               (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0),
+      .brw   = BRW_NEW_BLORP |
+               BRW_NEW_VUE_MAP_GEOM_OUT |
+               (GEN_GEN <= 5 ? BRW_NEW_BATCH |
+                               BRW_NEW_PROGRAM_CACHE |
+                               BRW_NEW_SF_PROG_DATA |
+                               BRW_NEW_SF_VP |
+                               BRW_NEW_URB_FENCE
+                             : 0) |
+               (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
+               (GEN_GEN >= 6 && GEN_GEN <= 7 ?
+                               BRW_NEW_GS_PROG_DATA |
+                               BRW_NEW_PRIMITIVE |
+                               BRW_NEW_TES_PROG_DATA
+                             : 0) |
+               (GEN_GEN == 6 ? BRW_NEW_FS_PROG_DATA |
+                               BRW_NEW_FRAGMENT_PROGRAM
+                             : 0),
+   },
+   .emit = genX(upload_sf), /* SF_STATE (Gen4-5) / 3DSTATE_SF (Gen6+) emitter */
+};
+
+/* ---------------------------------------------------------------------- */
+
+static bool
+brw_color_buffer_write_enabled(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   const struct gl_program *fp = brw->fragment_program;
+
+   /* _NEW_BUFFERS: look for any bound, shader-written, unmasked color buffer */
+   for (unsigned buf = 0; buf < ctx->DrawBuffer->_NumColorDrawBuffers; buf++) {
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[buf];
+      const uint64_t written = fp->info.outputs_written;
+      const uint64_t feeds_buf = BITFIELD64_BIT(FRAG_RESULT_COLOR) |
+                                 BITFIELD64_BIT(FRAG_RESULT_DATA0 + buf);
+
+      /* Nothing bound here, or the shader writes no output that reaches it. */
+      if (!rb || !(written & feeds_buf))
+         continue;
+
+      /* _NEW_COLOR: one enabled channel is enough. */
+      if (ctx->Color.ColorMask[buf][0] || ctx->Color.ColorMask[buf][1] ||
+          ctx->Color.ColorMask[buf][2] || ctx->Color.ColorMask[buf][3])
+         return true;
+   }
+
+   return false;
+}
+
+static void
+genX(upload_wm)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* Emits WM_STATE on Gen4-5 and 3DSTATE_WM on Gen6+, sharing one body. */
+   /* BRW_NEW_FS_PROG_DATA */
+   const struct brw_wm_prog_data *wm_prog_data =
+      brw_wm_prog_data(brw->wm.base.prog_data);
+
+   UNUSED bool writes_depth =
+      wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
+   UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+#if GEN_GEN == 6
+   /* We can't fold this into gen6_upload_wm_push_constants(), because
+    * according to the SNB PRM, vol 2 part 1 section 7.2.2
+    * (3DSTATE_CONSTANT_PS [DevSNB]):
+    *
+    *     "[DevSNB]: This packet must be followed by WM_STATE."
+    */
+   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
+      if (wm_prog_data->base.nr_params != 0) {
+         wmcp.Buffer0Valid = true;
+         /* Pointer to the WM constant buffer.  Covered by the set of
+          * state flags from gen6_upload_wm_push_constants.
+          */
+         wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
+         wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+      }
+   }
+#endif
+
+#if GEN_GEN >= 6
+   brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
+      wm.LineAntialiasingRegionWidth = _10pixels;
+      wm.LineEndCapAntialiasingRegionWidth = _05pixels;
+
+      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
+#else /* GEN_GEN < 6 */
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+   brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
+      if (wm_prog_data->dispatch_8 && wm_prog_data->dispatch_16) {
+         /* These two fields should be the same pre-gen6, which is why we
+          * only have one hardware field to program for both dispatch
+          * widths.
+          */
+         assert(wm_prog_data->base.dispatch_grf_start_reg ==
+                wm_prog_data->dispatch_grf_start_reg_2);
+      }
+
+      if (wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16)
+         wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0;
+
+      if (stage_state->sampler_count)
+         wm.SamplerStatePointer =
+            instruction_ro_bo(brw->batch.bo, stage_state->sampler_offset);
+#if GEN_GEN == 5
+      if (wm_prog_data->prog_offset_2)
+         wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2;
+#endif
+
+      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+      wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
+      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
+      wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
+      wm.EarlyDepthTestEnable = true;
+      wm.LineAntialiasingRegionWidth = _05pixels;
+      wm.LineEndCapAntialiasingRegionWidth = _10pixels;
+
+      /* _NEW_POLYGON */
+      if (ctx->Polygon.OffsetFill) {
+         wm.GlobalDepthOffsetEnable = true;
+         /* Something weird going on with legacy_global_depth_bias,
+          * offset_constant, scaling and MRD.  This value passes glean
+          * but gives some odd results elsewhere (eg. the
+          * quad-offset-units test).
+          */
+         wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
+
+         /* This is the only value that passes glean:
+         */
+         wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
+      }
+
+      wm.DepthCoefficientURBReadOffset = 1;
+#endif /* GEN_GEN >= 6; the block opened above is closed once, further down */
+
+      /* BRW_NEW_STATS_WM */
+      wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm;
+
+#if GEN_GEN < 7
+      if (wm_prog_data->base.use_alt_mode)
+         wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+
+      wm.SamplerCount = GEN_GEN == 5 ?
+         0 : DIV_ROUND_UP(stage_state->sampler_count, 4);
+
+      wm.BindingTableEntryCount =
+         wm_prog_data->base.binding_table.size_bytes / 4;
+      wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
+      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+      wm.DispatchGRFStartRegisterForConstantSetupData0 =
+         wm_prog_data->base.dispatch_grf_start_reg;
+      if (GEN_GEN == 6 ||
+          wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16) {
+         wm.KernelStartPointer0 = KSP_ro(brw,
+                                         stage_state->prog_offset);
+      }
+
+#if GEN_GEN >= 5
+      if (GEN_GEN == 6 || wm_prog_data->prog_offset_2) {
+         wm.KernelStartPointer2 =
+            KSP_ro(brw, stage_state->prog_offset +
+                   wm_prog_data->prog_offset_2);
+      }
+#endif
+
+#if GEN_GEN == 6
+      wm.DualSourceBlendEnable =
+         wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
+         ctx->Color.Blend[0]._UsesDualSrc;
+      wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
+      wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+
+      /* From the SNB PRM, volume 2 part 1, page 281:
+       * "If the PS kernel does not need the Position XY Offsets
+       * to compute a Position XY value, then this field should be
+       * programmed to POSOFFSET_NONE."
+       *
+       * "SW Recommendation: If the PS kernel needs the Position Offsets
+       * to compute a Position XY value, this field should match Position
+       * ZW Interpolation Mode to ensure a consistent position.xyzw
+       * computation."
+       * We only require XY sample offsets. So, this recommendation doesn't
+       * look useful at the moment. We might need this in future.
+       */
+      if (wm_prog_data->uses_pos_offset)
+         wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
+      else
+         wm.PositionXYOffsetSelect = POSOFFSET_NONE;
+
+      wm.DispatchGRFStartRegisterForConstantSetupData2 =
+         wm_prog_data->dispatch_grf_start_reg_2;
+#endif
+
+      if (wm_prog_data->base.total_scratch) {
+         wm.ScratchSpaceBasePointer =
+            render_bo(stage_state->scratch_bo, 0);
+         wm.PerThreadScratchSpace =
+            ffs(stage_state->per_thread_scratch) - 11;
+      }
+
+      wm.PixelShaderComputedDepth = writes_depth;
+#endif /* GEN_GEN < 7 */
+
+      /* _NEW_LINE */
+      wm.LineStippleEnable = ctx->Line.StippleFlag;
+
+      /* _NEW_POLYGON */
+      wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;
+
+#if GEN_GEN < 8
+
+#if GEN_GEN >= 6
+      wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
+
+      /* _NEW_BUFFERS */
+      const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+
+      if (multisampled_fbo) {
+         /* _NEW_MULTISAMPLE */
+         if (ctx->Multisample.Enabled)
+            wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+         else
+            wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
+
+         if (wm_prog_data->persample_dispatch)
+            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+         else
+            wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
+      } else {
+         wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
+         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+      }
+#endif
+      wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
+      if (wm_prog_data->uses_kill ||
+          _mesa_is_alpha_test_enabled(ctx) ||
+          _mesa_is_alpha_to_coverage_enabled(ctx) ||
+          (GEN_GEN >= 6 && wm_prog_data->uses_omask)) {
+         wm.PixelShaderKillsPixel = true;
+      }
+
+      /* _NEW_BUFFERS | _NEW_COLOR */
+      if (brw_color_buffer_write_enabled(brw) || writes_depth ||
+          wm.PixelShaderKillsPixel ||
+          (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) {
+         wm.ThreadDispatchEnable = true;
+      }
+
+#if GEN_GEN >= 7
+      wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
+      wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
+#endif
+
+      /* The "UAV access enable" bits are unnecessary on HSW because they only
+       * seem to have an effect on the HW-assisted coherency mechanism which we
+       * don't need, and the rasterization-related UAV_ONLY flag and the
+       * DISPATCH_ENABLE bit can be set independently from it.
+       * C.f. gen8_upload_ps_extra().
+       *
+       * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
+       * _NEW_COLOR
+       */
+#if GEN_IS_HASWELL
+      if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
+          wm_prog_data->has_side_effects)
+         wm.PSUAVonly = ON;
+#endif
+#endif /* GEN_GEN < 8 */
+
+#if GEN_GEN >= 7
+      /* BRW_NEW_FS_PROG_DATA */
+      if (wm_prog_data->early_fragment_tests)
+         wm.EarlyDepthStencilControl = EDSC_PREPS;
+      else if (wm_prog_data->has_side_effects)
+         wm.EarlyDepthStencilControl = EDSC_PSEXEC;
+#endif
+   }
+
+#if GEN_GEN <= 5
+   if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
+      brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
+         clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
+      }
+
+      brw->wm.offset_clamp = ctx->Polygon.OffsetClamp; /* remember what was sent */
+   }
+#endif
+}
+
+static const struct brw_tracked_state genX(wm_state) = {
+   .dirty = { /* per-generation flag sets; 0 where a group does not apply */
+      .mesa  = _NEW_LINE |
+               _NEW_POLYGON |
+               (GEN_GEN < 8 ? _NEW_BUFFERS |
+                              _NEW_COLOR :
+                              0) |
+               (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
+               (GEN_GEN < 6 ? _NEW_POLYGONSTIPPLE : 0) |
+               (GEN_GEN < 8 && GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0),
+      .brw   = BRW_NEW_BLORP |
+               BRW_NEW_FS_PROG_DATA |
+               (GEN_GEN < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
+                              BRW_NEW_FRAGMENT_PROGRAM |
+                              BRW_NEW_PROGRAM_CACHE |
+                              BRW_NEW_SAMPLER_STATE_TABLE |
+                              BRW_NEW_STATS_WM
+                            : 0) |
+               (GEN_GEN < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT),
+   },
+   .emit = genX(upload_wm), /* WM_STATE (Gen4-5) / 3DSTATE_WM (Gen6+) emitter */
+};
+
+/* ---------------------------------------------------------------------- */
+/* Shared packet setup; uses the caller's brw, stage_state, *_prog_data. */
+#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
+   pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
+   pkt.SamplerCount       =                                               \
+      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
+   pkt.BindingTableEntryCount =                                           \
+      stage_prog_data->binding_table.size_bytes / 4;                      \
+   pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
+   /* Scratch is programmed only when total_scratch is non-zero. */       \
+   if (stage_prog_data->total_scratch) {                                  \
+      pkt.ScratchSpaceBasePointer =                                       \
+         render_bo(stage_state->scratch_bo, 0);                           \
+      pkt.PerThreadScratchSpace =                                         \
+         ffs(stage_state->per_thread_scratch) - 11;                       \
+   }                                                                      \
+   /* prefix picks the <prefix>URBEntryRead* fields, e.g. Vertex. */      \
+   pkt.DispatchGRFStartRegisterForURBData =                               \
+      stage_prog_data->dispatch_grf_start_reg;                            \
+   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
+   pkt.prefix##URBEntryReadOffset = 0;                                    \
+   /* StatisticsEnable and Enable are set unconditionally here. */        \
+   pkt.StatisticsEnable = true;                                           \
+   pkt.Enable           = true;
+
+static void
+genX(upload_vs_state)(struct brw_context *brw)
+{
+   UNUSED struct gl_context *ctx = &brw->ctx;
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   struct brw_stage_state *stage_state = &brw->vs.base;
+   /* Emits VS_STATE on Gen4-5 and 3DSTATE_VS on Gen6+. */
+   /* BRW_NEW_VS_PROG_DATA */
+   const struct brw_vue_prog_data *vue_prog_data =
+      brw_vue_prog_data(brw->vs.base.prog_data);
+   const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;
+
+   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
+          vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
+
+#if GEN_GEN == 6
+   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
+    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
+    *
+    *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
+    *   command that causes the VS Function Enable to toggle. Pipeline
+    *   flush can be executed by sending a PIPE_CONTROL command with CS
+    *   stall bit set and a post sync operation.
+    *
+    * We've already done such a flush at the start of state upload, so we
+    * don't need to do another one here.
+    */
+   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
+      if (stage_state->push_const_size != 0) {
+         cvs.Buffer0Valid = true;
+         cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
+         cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+      }
+   }
+#endif
+
+   if (GEN_GEN == 7 && devinfo->is_ivybridge)
+      gen7_emit_vs_workaround_flush(brw);
+
+#if GEN_GEN >= 6
+   brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
+#else /* GEN_GEN < 6 */
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+   brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
+#endif /* either opening brace is closed once, after the shared body */
+      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
+
+      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
+
+#if GEN_GEN < 6
+      vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
+      vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
+      vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;
+
+      vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0);
+      vs.URBEntryAllocationSize = brw->urb.vsize - 1;
+      /* Overrides the devinfo-based thread count stored above. */
+      vs.MaximumNumberofThreads =
+         CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;
+
+      vs.StatisticsEnable = false; /* overrides INIT_THREAD_DISPATCH_FIELDS */
+      vs.SamplerStatePointer =
+         instruction_ro_bo(brw->batch.bo, stage_state->sampler_offset);
+#endif /* GEN_GEN < 6 */
+
+#if GEN_GEN == 5
+      /* Force single program flow on Ironlake.  We cannot reliably get
+       * all applications working without it.  See:
+       * https://bugs.freedesktop.org/show_bug.cgi?id=29172
+       *
+       * The most notable and reliably failing application is the Humus
+       * demo "CelShading"
+       */
+      vs.SingleProgramFlow = true;
+      vs.SamplerCount = 0; /* hardware requirement */
+#endif
+
+#if GEN_GEN >= 8
+      vs.SIMD8DispatchEnable =
+         vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
+
+      vs.UserClipDistanceCullTestEnableBitmask =
+         vue_prog_data->cull_distance_mask;
+#endif
+   }
+
+#if GEN_GEN == 6
+   /* Based on my reading of the simulator, the VS constants don't get
+    * pulled into the VS FF unit until an appropriate pipeline flush
+    * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
+    * references to them into a little FIFO.  The flushes are common,
+    * but don't reliably happen between this and a 3DPRIMITIVE, causing
+    * the primitive to use the wrong constants.  Then the FIFO
+    * containing the constant setup gets added to again on the next
+    * constants change, and eventually when a flush does happen the
+    * unit is overwhelmed by constant changes and dies.
+    *
+    * To avoid this, send a PIPE_CONTROL down the line that will
+    * update the unit immediately loading the constants.  The flush
+    * type bits here were those set by the STATE_BASE_ADDRESS whose
+    * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
+    * bug reports that led to this workaround, and may be more than
+    * what is strictly required to avoid the issue.
+    */
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_DEPTH_STALL |
+                               PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                               PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+#endif
+}
+
+static const struct brw_tracked_state genX(vs_state) = {
+   .emit = genX(upload_vs_state),
+   .dirty = { /* generation-specific groups fold to 0 on other gens */
+      .brw   = (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
+                               BRW_NEW_PROGRAM_CACHE |
+                               BRW_NEW_SAMPLER_STATE_TABLE |
+                               BRW_NEW_URB_FENCE
+                             : 0) |
+               (GEN_GEN == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
+               BRW_NEW_VS_PROG_DATA |
+               BRW_NEW_CONTEXT |
+               BRW_NEW_BLORP |
+               BRW_NEW_BATCH,
+      .mesa  = GEN_GEN == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0,
+   },
+};
+
+/* ---------------------------------------------------------------------- */
+
+static void
+genX(upload_cc_viewport)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+   /* _NEW_TRANSFORM */
+   const bool depth_clamp = ctx->Transform.DepthClamp;
+
+   /* One packed CC_VIEWPORT per viewport, stored back to back. */
+   uint32_t cc_vp_offset;
+   uint32_t *const cc_base =
+      brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count,
+                      32, &cc_vp_offset);
+
+   for (unsigned i = 0; i < viewport_count; i++) {
+      /* _NEW_VIEWPORT */
+      const struct gl_viewport_attrib *attrib = &ctx->ViewportArray[i];
+      struct GENX(CC_VIEWPORT) ccv;
+
+      /* Clamped: sorted [near, far]; otherwise the full [0, 1]. */
+      ccv.MinimumDepth = depth_clamp ? MIN2(attrib->Near, attrib->Far) : 0.0;
+      ccv.MaximumDepth = depth_clamp ? MAX2(attrib->Near, attrib->Far) : 1.0;
+      GENX(CC_VIEWPORT_pack)(NULL, cc_base + i * GENX(CC_VIEWPORT_length),
+                             &ccv);
+   }
+
+#if GEN_GEN >= 7
+   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
+      ptr.CCViewportPointer = cc_vp_offset;
+   }
+#elif GEN_GEN == 6
+   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
+      vp.CCViewportStateChange = 1;
+      vp.PointertoCC_VIEWPORT = cc_vp_offset;
+   }
+#else
+   brw->cc.vp_offset = cc_vp_offset;
+   ctx->NewDriverState |= BRW_NEW_CC_VP;
+#endif
+}
+
+const struct brw_tracked_state genX(cc_vp) = {
+   .emit = genX(upload_cc_viewport),
+   .dirty = { /* flags read by upload_cc_viewport */
+      .brw = BRW_NEW_VIEWPORT_COUNT |
+             BRW_NEW_BLORP |
+             BRW_NEW_BATCH,
+      .mesa = _NEW_VIEWPORT |
+              _NEW_TRANSFORM,
+   },
+};
+
+/* ---------------------------------------------------------------------- */
+
+static inline void
+set_scissor_bits(const struct gl_context *ctx, int i,
+                 bool render_to_fbo, unsigned fb_width, unsigned fb_height,
+                 struct GENX(SCISSOR_RECT) *sc)
+{
+   /* Viewport box clipped to the framebuffer: { xmin, xmax, ymin, ymax }. */
+   const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
+   int bbox[4];
+
+   bbox[0] = MAX2(vp->X, 0);
+   bbox[1] = MIN2(bbox[0] + vp->Width, fb_width);
+   bbox[2] = MAX2(vp->Y, 0);
+   bbox[3] = MIN2(bbox[2] + vp->Height, fb_height);
+   _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
+
+   if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
+      /* Empty box: an inclusive max of min - 1 could go negative and clip
+       * nothing, so program min > max, which produces no rendering.
+       */
+      sc->ScissorRectangleXMin = 1;
+      sc->ScissorRectangleXMax = 0;
+      sc->ScissorRectangleYMin = 1;
+      sc->ScissorRectangleYMax = 0;
+      return;
+   }
+
+   sc->ScissorRectangleXMin = bbox[0];
+   sc->ScissorRectangleXMax = bbox[1] - 1;
+   if (render_to_fbo) {
+      /* texmemory: Y=0=bottom */
+      sc->ScissorRectangleYMin = bbox[2];
+      sc->ScissorRectangleYMax = bbox[3] - 1;
+   } else {
+      /* memory: Y=0=top */
+      sc->ScissorRectangleYMin = fb_height - bbox[3];
+      sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
+   }
+}
+
+#if GEN_GEN >= 6
+/* Pack one SCISSOR_RECT per viewport into space from brw_state_batch() and
+ * emit 3DSTATE_SCISSOR_STATE_POINTERS with that space's offset. */
+static void
+genX(upload_scissor_state)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* _NEW_BUFFERS */
+   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
+   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
+
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
+   uint32_t scissor_state_offset;
+   uint32_t *rect_dw = brw_state_batch(
+      brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
+      32, &scissor_state_offset);
+
+   /* _NEW_SCISSOR | _NEW_VIEWPORT: only the intersection of drawable and
+    * scissor rect is handled here; clipping to the boundaries of static
+    * shared buffers for front/back/depth is covered by looping over
+    * cliprects in brw_draw.c.
+    *
+    * The hardware's coordinates are inclusive, while Mesa's min is
+    * inclusive but max is exclusive.
+    */
+   for (unsigned i = 0; i < viewport_count; i++) {
+      struct GENX(SCISSOR_RECT) rect;
+
+      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &rect);
+      GENX(SCISSOR_RECT_pack)(NULL, rect_dw, &rect);
+      rect_dw += GENX(SCISSOR_RECT_length);
+   }
+
+   brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
+      ptr.ScissorRectPointer = scissor_state_offset;
+   }
+}
+
+static const struct brw_tracked_state genX(scissor_state) = {
+   .emit = genX(upload_scissor_state),
+   .dirty = {
+      .brw = BRW_NEW_VIEWPORT_COUNT |
+             BRW_NEW_BLORP |
+             BRW_NEW_BATCH,
+      .mesa = _NEW_VIEWPORT | _NEW_SCISSOR | _NEW_BUFFERS,
+   },
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static void
+brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
+                             float m00, float m11, float m30, float m31,
+                             float *xmin, float *xmax,
+                             float *ymin, float *ymax)
+{
+   /* Compute the clip guardband in NDC for the viewport transform with scale
+    * m00/m11 and translation m30/m31; extents are returned via xmin..ymax.
+    *
+    * According to the "Vertex X,Y Clamping and Quantization" section of the
+    * Strips and Fans documentation:
+    *
+    * "The vertex X and Y screen-space coordinates are also /clamped/ to the
+    *  fixed-point "guardband" range supported by the rasterization hardware"
+    *
+    * and
+    *
+    * "In almost all circumstances, if an object's vertices are actually
+    *  modified by this clamping (i.e., had X or Y coordinates outside of
+    *  the guardband extent the rendered object will not match the intended
+    *  result.  Therefore software should take steps to ensure that this does
+    *  not happen - e.g., by clipping objects such that they do not exceed
+    *  these limits after the Drawing Rectangle is applied."
+    *
+    * I believe the fundamental restriction is that the rasterizer (in the
+    * SF/WM stages) has a limit on the number of pixels that can be
+    * rasterized.  We need to ensure any coordinates beyond the rasterizer
+    * limit are handled by the clipper.  So that limit is the guardband size.
+    *
+    * It goes on to say:
+    *
+    * "In addition, in order to be correctly rendered, objects must have a
+    *  screenspace bounding box not exceeding 8K in the X or Y direction.
+    *  This additional restriction must also be comprehended by software,
+    *  i.e., enforced by use of clipping."
+    *
+    * This makes no sense.  Gen7+ hardware supports 16K render targets, and
+    * you definitely need to be able to draw polygons that fill the surface.
+    * Our assumption is that the rasterizer was limited to 8K on Sandybridge,
+    * which only supports 8K surfaces, and was raised to 16K on Ivybridge+.
+    *
+    * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
+    */
+   const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
+
+   if (m00 != 0 && m11 != 0) {
+      /* First, we compute the screen-space render area */
+      const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
+      const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
+      const float ss_ra_ymin = MIN3(        0, m31 + m11, m31 - m11);
+      const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
+
+      /* We want the guardband to be centered on that */
+      const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
+      const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
+      const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
+      const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
+
+      /* Now we need it in normalized device coordinates */
+      const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
+      const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
+      const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
+      const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
+
+      /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
+       * flipped upside-down.  X should be fine though. */
+      assert(ndc_gb_xmin <= ndc_gb_xmax);
+      *xmin = ndc_gb_xmin;
+      *xmax = ndc_gb_xmax;
+      *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
+      *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
+   } else {
+      /* The viewport scales to 0, so nothing will be rendered. */
+      *xmin = 0.0f;
+      *xmax = 0.0f;
+      *ymin = 0.0f;
+      *ymax = 0.0f;
+   }
+}
+
+/* Upload the SF and CLIP viewport state (one combined SF_CLIP_VIEWPORT on
+ * Gen7+) for every active viewport: transform, guardband, scissor/extents. */
+static void
+genX(upload_sf_clip_viewport)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   float y_scale, y_bias;
+
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
+   /* _NEW_BUFFERS */
+   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const uint32_t fb_width = _mesa_geometric_width(ctx->DrawBuffer);
+   const uint32_t fb_height = _mesa_geometric_height(ctx->DrawBuffer);
+
+#if GEN_GEN >= 7
+#define clv sfv
+   struct GENX(SF_CLIP_VIEWPORT) sfv;
+   uint32_t sf_clip_vp_offset;
+   uint32_t *sf_clip_map =
+      brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
+                      64, &sf_clip_vp_offset);
+#else
+   struct GENX(SF_VIEWPORT) sfv;
+   struct GENX(CLIP_VIEWPORT) clv;
+   uint32_t sf_vp_offset, clip_vp_offset;
+   uint32_t *sf_map =
+      brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
+                      32, &sf_vp_offset);
+   uint32_t *clip_map =
+      brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
+                      32, &clip_vp_offset);
+#endif
+
+   /* _NEW_BUFFERS: flip Y unless rendering to a user FBO. */
+   if (render_to_fbo) {
+      y_scale = 1.0;
+      y_bias = 0;
+   } else {
+      y_scale = -1.0;
+      y_bias = (float)fb_height;
+   }
+
+   for (unsigned i = 0; i < viewport_count; i++) {
+      /* _NEW_VIEWPORT: Guardband Clipping */
+      float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
+      _mesa_get_viewport_xform(ctx, i, scale, translate);
+
+      sfv.ViewportMatrixElementm00 = scale[0];
+      sfv.ViewportMatrixElementm11 = scale[1] * y_scale;
+      sfv.ViewportMatrixElementm22 = scale[2];
+      sfv.ViewportMatrixElementm30 = translate[0];
+      sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias;
+      sfv.ViewportMatrixElementm32 = translate[2];
+      brw_calculate_guardband_size(fb_width, fb_height,
+                                   sfv.ViewportMatrixElementm00,
+                                   sfv.ViewportMatrixElementm11,
+                                   sfv.ViewportMatrixElementm30,
+                                   sfv.ViewportMatrixElementm31,
+                                   &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
+
+      /* clv is sfv itself on Gen7+ (see the #define above). */
+      clv.XMinClipGuardband = gb_xmin;
+      clv.XMaxClipGuardband = gb_xmax;
+      clv.YMinClipGuardband = gb_ymin;
+      clv.YMaxClipGuardband = gb_ymax;
+
+#if GEN_GEN < 6
+      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
+                       &sfv.ScissorRectangle);
+#elif GEN_GEN >= 8
+      /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
+       * The hardware will take the intersection of the drawing rectangle,
+       * scissor rectangle, and the viewport extents. We don't need to be
+       * smart, and can therefore just program the viewport extents.
+       */
+      const float viewport_Xmax =
+         ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width;
+      const float viewport_Ymax =
+         ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height;
+
+      sfv.XMinViewPort = ctx->ViewportArray[i].X;
+      sfv.XMaxViewPort = viewport_Xmax - 1;
+      if (render_to_fbo) {
+         sfv.YMinViewPort = ctx->ViewportArray[i].Y;
+         sfv.YMaxViewPort = viewport_Ymax - 1;
+      } else {
+         sfv.YMinViewPort = fb_height - viewport_Ymax;
+         sfv.YMaxViewPort = fb_height - ctx->ViewportArray[i].Y - 1;
+      }
+#endif
+
+#if GEN_GEN >= 7
+      GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
+      sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
+#else
+      GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
+      GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
+      sf_map += GENX(SF_VIEWPORT_length);
+      clip_map += GENX(CLIP_VIEWPORT_length);
+#endif
+   }
+
+#if GEN_GEN >= 7
+   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
+      ptr.SFClipViewportPointer = sf_clip_vp_offset;
+   }
+#elif GEN_GEN == 6
+   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
+      vp.SFViewportStateChange = 1;
+      vp.CLIPViewportStateChange = 1;
+      vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
+      vp.PointertoSF_VIEWPORT = sf_vp_offset;
+   }
+#else
+   brw->sf.vp_offset = sf_vp_offset;
+   brw->clip.vp_offset = clip_vp_offset;
+   brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
+#endif
+}
+
+static const struct brw_tracked_state genX(sf_clip_viewport) = {
+   .emit = genX(upload_sf_clip_viewport),
+   .dirty = {
+      .brw = BRW_NEW_VIEWPORT_COUNT |
+             BRW_NEW_BLORP |
+             BRW_NEW_BATCH,
+      .mesa = (GEN_GEN <= 5 ? _NEW_SCISSOR : 0) | /* SF_VIEWPORT scissor */
+              _NEW_VIEWPORT |
+              _NEW_BUFFERS,
+   },
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* Emit the GS unit state: 3DSTATE_GS on Gen6+, GS_STATE on Gen4-5. */
+static void
+genX(upload_gs_state)(struct brw_context *brw)
+{
+   UNUSED struct gl_context *ctx = &brw->ctx;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   const struct brw_stage_state *stage_state = &brw->gs.base;
+   /* BRW_NEW_GEOMETRY_PROGRAM */
+   bool active = GEN_GEN >= 6 && brw->geometry_program;
+
+   /* BRW_NEW_GS_PROG_DATA */
+   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
+   UNUSED const struct brw_vue_prog_data *vue_prog_data =
+      brw_vue_prog_data(stage_prog_data);
+#if GEN_GEN >= 7
+   const struct brw_gs_prog_data *gs_prog_data =
+      brw_gs_prog_data(stage_prog_data);
+#endif
+
+#if GEN_GEN == 6
+   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
+      if (active && stage_state->push_const_size != 0) {
+         cgs.Buffer0Valid = true;
+         cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
+         cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+      }
+   }
+#endif
+
+#if GEN_GEN == 7 && !GEN_IS_HASWELL
+   /* From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
+    * Geometry > Geometry Shader > State:
+    *
+    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
+    *     whole fixed function pipeline when the GS enable changes value in
+    *     the 3DSTATE_GS."
+    *
+    * The hardware architects have clarified that in this context "flush the
+    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
+    * Stall" bit set.
+    */
+   if (brw->gt == 2 && brw->gs.enabled != active)
+      gen7_emit_cs_stall_flush(brw);
+#endif
+
+#if GEN_GEN >= 6
+   brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
+#else
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+   brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
+#endif
+
+#if GEN_GEN >= 6
+      if (active) {
+         INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
+
+#if GEN_GEN >= 7
+         gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
+         gs.OutputTopology = gs_prog_data->output_topology;
+         gs.ControlDataHeaderSize =
+            gs_prog_data->control_data_header_size_hwords;
+
+         gs.InstanceControl = gs_prog_data->invocations - 1;
+         gs.DispatchMode = vue_prog_data->dispatch_mode;
+
+         gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
+
+         gs.ControlDataFormat = gs_prog_data->control_data_format;
+#endif
+
+         /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
+          * Ivy Bridge and Haswell.
+          *
+          * On Ivy Bridge, setting this bit causes the vertices of a triangle
+          * strip to be delivered to the geometry shader in an order that does
+          * not strictly follow the OpenGL spec, but preserves triangle
+          * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
+          * the geometry shader sees triangles:
+          *
+          * (1, 2, 3), (2, 4, 3), (3, 4, 5)
+          *
+          * (Clearing the bit is even worse, because it fails to preserve
+          * orientation).
+          *
+          * Triangle strips with adjacency are always ordered in a way that
+          * preserves triangle orientation but does not strictly follow the
+          * OpenGL spec, regardless of the setting of this bit.
+          *
+          * On Haswell, both triangle strips and triangle strips with adjacency
+          * are always ordered in a way that preserves triangle orientation.
+          * Setting this bit causes the ordering to strictly follow the OpenGL
+          * spec.
+          *
+          * So in either case we want to set the bit.  Unfortunately on Ivy
+          * Bridge this will get the order close to correct but not perfect.
+          */
+         gs.ReorderMode = TRAILING;
+         gs.MaximumNumberofThreads =
+            GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
+                         : (devinfo->max_gs_threads - 1);
+
+#if GEN_GEN < 7
+         gs.SOStatisticsEnable = true;
+         if (brw->geometry_program->info.has_transform_feedback_varyings)
+            gs.SVBIPayloadEnable = true;
+
+         /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
+          * was previously done for gen6.
+          * TODO: test with both disabled to see if the HW is behaving
+          * as expected, like in gen7.
+          */
+         gs.SingleProgramFlow = true;
+         gs.VectorMaskEnable = true;
+#endif
+
+#if GEN_GEN >= 8
+         gs.ExpectedVertexCount = gs_prog_data->vertices_in;
+
+         if (gs_prog_data->static_vertex_count != -1) {
+            gs.StaticOutput = true;
+            gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
+         }
+         gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
+
+         gs.UserClipDistanceCullTestEnableBitmask =
+            vue_prog_data->cull_distance_mask;
+
+         const int urb_entry_write_offset = 1;
+         const uint32_t urb_entry_output_length =
+            DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
+            urb_entry_write_offset;
+
+         gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
+         gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
+#endif
+      }
+#endif
+
+#if GEN_GEN <= 6
+      if (!active && brw->ff_gs.prog_active) {
+         /* In gen6, transform feedback for the VS stage is done with an
+          * ad-hoc GS program; this provides the needed 3DSTATE_GS for it.
+          */
+         gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
+         gs.SingleProgramFlow = true;
+         gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1;
+         gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;
+
+#if GEN_GEN <= 5
+         gs.GRFRegisterCount =
+            DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
+         /* BRW_NEW_URB_FENCE */
+         gs.NumberofURBEntries = brw->urb.nr_gs_entries;
+         gs.URBEntryAllocationSize = brw->urb.vsize - 1;
+         gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
+         gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+#else
+         gs.Enable = true;
+         gs.VectorMaskEnable = true;
+         gs.SVBIPayloadEnable = true;
+         gs.SVBIPostIncrementEnable = true;
+         gs.SVBIPostIncrementValue =
+            brw->ff_gs.prog_data->svbi_postincrement_value;
+         gs.SOStatisticsEnable = true;
+         gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
+#endif
+      }
+#endif
+      if (!active && !brw->ff_gs.prog_active) {
+#if GEN_GEN < 8
+         gs.DispatchGRFStartRegisterForURBData = 1;
+#if GEN_GEN >= 7
+         gs.IncludeVertexHandles = true;
+#endif
+#endif
+      }
+
+#if GEN_GEN >= 6
+      gs.StatisticsEnable = true;
+#endif
+#if GEN_GEN == 5 || GEN_GEN == 6
+      gs.RenderingEnabled = true;
+#endif
+#if GEN_GEN <= 5
+      gs.MaximumVPIndex = brw->clip.viewport_count - 1;
+#endif
+   }
+
+#if GEN_GEN == 6
+   /* NOTE(review): this function records gs.enabled only on Gen6, yet the
+    * Gen7 (IVB GT2) check above reads it -- confirm it is updated elsewhere. */
+   brw->gs.enabled = active;
+#endif
+}
+
+static const struct brw_tracked_state genX(gs_state) = {
+   .emit = genX(upload_gs_state),
+   .dirty = { /* generation-specific groups fold to 0 on other gens */
+      .brw   = (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0) |
+               (GEN_GEN >= 6 ? BRW_NEW_GS_PROG_DATA |
+                               BRW_NEW_GEOMETRY_PROGRAM |
+                               BRW_NEW_CONTEXT
+                             : 0) |
+               (GEN_GEN <= 5 ? BRW_NEW_VIEWPORT_COUNT |
+                               BRW_NEW_URB_FENCE |
+                               BRW_NEW_PROGRAM_CACHE |
+                               BRW_NEW_PUSH_CONSTANT_ALLOCATION
+                             : 0) |
+               BRW_NEW_BLORP |
+               BRW_NEW_BATCH,
+      .mesa  = GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0,
+   },
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* Map the SRC1 alpha blend factors to what they evaluate to when source-1
+ * alpha is forced to one; every other factor is returned unchanged. */
+UNUSED static GLenum
+fix_dual_blend_alpha_to_one(GLenum function)
+{
+   if (function == GL_SRC1_ALPHA)
+      return GL_ONE;
+
+   if (function == GL_ONE_MINUS_SRC1_ALPHA)
+      return GL_ZERO;
+
+   return function;
+}
+
+#define blend_factor(x) brw_translate_blend_factor(x)
+#define blend_eqn(x) brw_translate_blend_equation(x)
+
+/**
+ * Rewrite a blend factor for targets whose alpha must read as 1.0.
+ *
+ * A \c function that uses destination alpha is replaced by the constant
+ * factor it reduces to once destination alpha is hard-wired to 1.0; any
+ * other \c function is returned unchanged.  Used for xRGB render targets.
+ */
+static GLenum
+brw_fix_xRGB_alpha(GLenum function)
+{
+   if (function == GL_DST_ALPHA)
+      return GL_ONE;
+
+   /* With destination alpha at 1.0, 1 - Ad is zero and so is the
+    * min(As, 1 - Ad) that GL_SRC_ALPHA_SATURATE uses for color. */
+   if (function == GL_ONE_MINUS_DST_ALPHA ||
+       function == GL_SRC_ALPHA_SATURATE)
+      return GL_ZERO;
+
+   return function;
+}
+
+#if GEN_GEN >= 6
+typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
+#else
+typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
+#endif
+
+/* Fill the logic-op or blend fields of \p entry for color draw buffer \p i.
+ * Returns true if the alpha factors/equation differ from the RGB ones. */
+UNUSED static bool
+set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
+                     bool alpha_to_one)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* _NEW_BUFFERS */
+   const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+   bool independent_alpha_blend = false;
+
+   /* Used for implementing the following bit of GL_EXT_texture_integer:
+    * "Per-fragment operations that require floating-point color
+    *  components, including multisample alpha operations, alpha test,
+    *  blending, and dithering, have no effect when the corresponding
+    *  colors are written to an integer color buffer."
+    */
+   const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
+
+   const unsigned blend_enabled = GEN_GEN >= 6 ?
+      ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
+
+   /* _NEW_COLOR */
+   if (ctx->Color.ColorLogicOpEnabled) {
+      GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
+         : GL_UNSIGNED_NORMALIZED;
+      WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
+                rb_type != GL_UNSIGNED_NORMALIZED &&
+                rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
+                "renderbuffer\n",
+                _mesa_enum_to_string(ctx->Color.LogicOp),
+                _mesa_enum_to_string(rb_type));
+      if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
+         entry->LogicOpEnable = true;
+         entry->LogicOpFunction =
+            intel_translate_logic_op(ctx->Color.LogicOp);
+      }
+   } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
+              && (GEN_GEN <= 5 || !integer)) {
+      GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
+      GLenum eqA = ctx->Color.Blend[i].EquationA;
+      GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
+      GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
+      GLenum srcA = ctx->Color.Blend[i].SrcA;
+      GLenum dstA = ctx->Color.Blend[i].DstA;
+
+      if (eqRGB == GL_MIN || eqRGB == GL_MAX)
+         srcRGB = dstRGB = GL_ONE;
+
+      if (eqA == GL_MIN || eqA == GL_MAX)
+         srcA = dstA = GL_ONE;
+
+      /* Due to hardware limitations, the destination may have information
+       * in an alpha channel even when the format specifies no alpha
+       * channel.  To avoid incorrect blending from that channel, coerce the
+       * blend factors to values that do not read it and instead use the
+       * correct implicit value for alpha.
+       */
+      if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
+                                               GL_TEXTURE_ALPHA_TYPE)) {
+         srcRGB = brw_fix_xRGB_alpha(srcRGB);
+         srcA = brw_fix_xRGB_alpha(srcA);
+         dstRGB = brw_fix_xRGB_alpha(dstRGB);
+         dstA = brw_fix_xRGB_alpha(dstA);
+      }
+
+      /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
+       * "If Dual Source Blending is enabled, this bit must be disabled."
+       *
+       * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
+       * and leave it enabled anyway.
+       */
+      if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
+         srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
+         srcA = fix_dual_blend_alpha_to_one(srcA);
+         dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
+         dstA = fix_dual_blend_alpha_to_one(dstA);
+      }
+
+      entry->ColorBufferBlendEnable = true;
+      entry->DestinationBlendFactor = blend_factor(dstRGB);
+      entry->SourceBlendFactor = blend_factor(srcRGB);
+      entry->DestinationAlphaBlendFactor = blend_factor(dstA);
+      entry->SourceAlphaBlendFactor = blend_factor(srcA);
+      entry->ColorBlendFunction = blend_eqn(eqRGB);
+      entry->AlphaBlendFunction = blend_eqn(eqA);
+
+      if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
+         independent_alpha_blend = true;
+   }
+
+   return independent_alpha_blend;
+}
+
+#if GEN_GEN >= 6
+static void
+genX(upload_blend_state)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   int size;
+
+   /* We need at least one BLEND_STATE written, because we might do
+    * thread dispatch even if _NumColorDrawBuffers is 0 (for example
+    * for computed depth or alpha test), which will do an FB write
+    * with render target 0, which will reference BLEND_STATE[0] for
+    * alpha test enable.
+    */
+   int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
+   if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
+      nr_draw_buffers = 1;
+
+   size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
+#if GEN_GEN >= 8
+   size += GENX(BLEND_STATE_length) * 4;
+#endif
+
+   uint32_t *blend_map;
+   blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
+
+#if GEN_GEN >= 8
+   struct GENX(BLEND_STATE) blend = { 0 };
+   {
+#else
+   for (int i = 0; i < nr_draw_buffers; i++) {
+      struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
+#define blend entry
+#endif
+      /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
+       * "If drawbuffer zero is not NONE and the buffer it references has an
+       * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
+       * operations are skipped."
+       */
+      if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
+         /* _NEW_MULTISAMPLE */
+         if (_mesa_is_multisample_enabled(ctx)) {
+            if (ctx->Multisample.SampleAlphaToCoverage) {
+               blend.AlphaToCoverageEnable = true;
+               blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
+            }
+            if (ctx->Multisample.SampleAlphaToOne)
+               blend.AlphaToOneEnable = true;
+         }
+
+         /* _NEW_COLOR */
+         if (ctx->Color.AlphaEnabled) {
+            blend.AlphaTestEnable = true;
+            blend.AlphaTestFunction =
+               intel_translate_compare_func(ctx->Color.AlphaFunc);
+         }
+
+         if (ctx->Color.DitherFlag) {
+            blend.ColorDitherEnable = true;
+         }
+      }
+
+#if GEN_GEN >= 8
+      for (int i = 0; i < nr_draw_buffers; i++) {
+         struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
+#else
+      {
+#endif
+         blend.IndependentAlphaBlendEnable =
+            set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
+            blend.IndependentAlphaBlendEnable;
+
+         /* See section 8.1.6 "Pre-Blend Color Clamping" of the
+          * SandyBridge PRM Volume 2 Part 1 for HW requirements.
+          *
+          * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
+          * clamping in the fragment shader.  For its clamping of
+          * blending, the spec says:
+          *
+          *     "RESOLVED: For fixed-point color buffers, the inputs and
+          *      the result of the blending equation are clamped.  For
+          *      floating-point color buffers, no clamping occurs."
+          *
+          * So, generally, we want clamping to the render target's range.
+          * And, good news, the hardware tables for both pre- and
+          * post-blend color clamping are either ignored, or any are
+          * allowed, or clamping is required but RT range clamping is a
+          * valid option.
+          */
+         entry.PreBlendColorClampEnable = true;
+         entry.PostBlendColorClampEnable = true;
+         entry.ColorClampRange = COLORCLAMP_RTFORMAT;
+
+         entry.WriteDisableRed   = !ctx->Color.ColorMask[i][0];
+         entry.WriteDisableGreen = !ctx->Color.ColorMask[i][1];
+         entry.WriteDisableBlue  = !ctx->Color.ColorMask[i][2];
+         entry.WriteDisableAlpha = !ctx->Color.ColorMask[i][3];
+
+#if GEN_GEN >= 8
+         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
+#else
+         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
+#endif
+      }
+   }
+
+#if GEN_GEN >= 8
+   GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
+#endif
+
+#if GEN_GEN < 7
+   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+      ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
+      ptr.BLEND_STATEChange = true;
+   }
+#else
+   brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
+      ptr.BlendStatePointer = brw->cc.blend_state_offset;
+#if GEN_GEN >= 8
+      ptr.BlendStatePointerValid = true;
+#endif
+   }
+#endif
+}
+
+static const struct brw_tracked_state genX(blend_state) = {
+   .dirty = {   /* state read by upload_blend_state */
+      .mesa = _NEW_BUFFERS |      /* integer render target check */
+              _NEW_COLOR |        /* alpha test, dither, color masks */
+              _NEW_MULTISAMPLE,   /* alpha-to-coverage, alpha-to-one */
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_STATE_BASE_ADDRESS,
+   },
+   .emit = genX(upload_blend_state),   /* packs BLEND_STATE, emits pointer */
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 7
+UNUSED static const uint32_t push_constant_opcodes[] = { /* sub-opcodes */
+   [MESA_SHADER_VERTEX]                      = 21, /* VS */
+   [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
+   [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
+   [MESA_SHADER_GEOMETRY]                    = 22, /* GS */
+   [MESA_SHADER_FRAGMENT]                    = 23, /* PS */
+   [MESA_SHADER_COMPUTE]                     = 0,  /* not used by the loop */
+};
+
+static void
+genX(upload_push_constant_packets)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* Emit 3DSTATE_CONSTANT_* for each stage with dirty push constants. */
+   UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;
+   /* Indexed by shader stage, MESA_SHADER_VERTEX .. MESA_SHADER_FRAGMENT. */
+   struct brw_stage_state *stage_states[] = {
+      &brw->vs.base,
+      &brw->tcs.base,
+      &brw->tes.base,
+      &brw->gs.base,
+      &brw->wm.base,
+   };
+
+   if (GEN_GEN == 7 && !GEN_IS_HASWELL && !brw->is_baytrail &&
+       stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
+      gen7_emit_vs_workaround_flush(brw);
+
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      struct brw_stage_state *stage_state = stage_states[stage];
+      struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];
+
+      if (!stage_state->push_constants_dirty)
+         continue;
+      /* One packet layout serves all stages; only the sub-opcode differs. */
+      brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
+         pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
+         if (stage_state->prog_data) {
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+            /* The Skylake PRM contains the following restriction:
+             *
+             *    "The driver must ensure The following case does not occur
+             *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+             *     buffer 3 read length equal to zero committed followed by a
+             *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+             *     zero committed."
+             *
+             * To avoid this, we program the buffers in the highest slots.
+             * This way, slot 0 is only used if slot 3 is also used.
+             */
+            int n = 3;
+            /* n is the next free slot; slots are filled from 3 downwards. */
+            for (int i = 3; i >= 0; i--) {
+               const struct brw_ubo_range *range =
+                  &stage_state->prog_data->ubo_ranges[i];
+
+               if (range->length == 0)
+                  continue;
+
+               const struct gl_uniform_block *block =
+                  prog->sh.UniformBlocks[range->block];
+               const struct gl_uniform_buffer_binding *binding =
+                  &ctx->UniformBufferBindings[block->Binding];
+               /* An unbound UBO is reported and takes no slot. */
+               if (binding->BufferObject == ctx->Shared->NullBufferObj) {
+                  static unsigned msg_id = 0;
+                  _mesa_gl_debug(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
+                                 MESA_DEBUG_TYPE_UNDEFINED,
+                                 MESA_DEBUG_SEVERITY_HIGH,
+                                 "UBO %d unbound, %s shader uniform data "
+                                 "will be undefined.",
+                                 range->block,
+                                 _mesa_shader_stage_to_string(stage));
+                  continue;
+               }
+
+               assert(binding->Offset % 32 == 0);
+
+               struct brw_bo *bo = intel_bufferobj_buffer(brw,
+                  intel_buffer_object(binding->BufferObject),
+                  binding->Offset, range->length * 32, false);
+               /* range->start and range->length are scaled by 32 for bytes. */
+               pkt.ConstantBody.ReadLength[n] = range->length;
+               pkt.ConstantBody.Buffer[n] =
+                  render_ro_bo(bo, range->start * 32 + binding->Offset);
+               n--;
+            }
+            /* Regular push constants take the next slot below the UBOs. */
+            if (stage_state->push_const_size > 0) {
+               assert(n >= 0);
+               pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
+               pkt.ConstantBody.Buffer[n] =
+                  render_ro_bo(stage_state->push_const_bo,
+                               stage_state->push_const_offset);
+            }
+#else
+            pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
+            pkt.ConstantBody.Buffer[0].offset =
+               stage_state->push_const_offset | mocs;
+#endif
+         }
+      }
+
+      stage_state->push_constants_dirty = false;
+   }
+   /* On Gen9+, BRW_NEW_SURFACES is flagged on every call of this function. */
+   brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
+}
+
+const struct brw_tracked_state genX(push_constant_packets) = {
+   .dirty = {
+      .mesa  = 0,                   /* no core-Mesa dirty bits listed */
+      .brw   = BRW_NEW_DRAW_CALL,
+   },
+   .emit = genX(upload_push_constant_packets), /* skips non-dirty stages */
+};
+#endif
+
+#if GEN_GEN >= 6
+static void
+genX(upload_vs_push_constants)(struct brw_context *brw)
+{
+   /* Refresh subroutine indices, then upload the VS push constants. */
+   struct brw_stage_state *state = &brw->vs.base;
+   /* BRW_NEW_VS_PROG_DATA */
+   const struct brw_stage_prog_data *data = state->prog_data;
+   /* BRW_NEW_VERTEX_PROGRAM */
+   const struct brw_program *prog = brw_program_const(brw->vertex_program);
+
+   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_VERTEX);
+   gen6_upload_push_constants(brw, &prog->program, data, state);
+}
+
+static const struct brw_tracked_state genX(vs_push_constants) = {
+   .dirty = {   /* state consumed by upload_vs_push_constants */
+      .mesa  = _NEW_PROGRAM_CONSTANTS |
+               _NEW_TRANSFORM,
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_VERTEX_PROGRAM |   /* brw->vertex_program */
+               BRW_NEW_VS_PROG_DATA,      /* brw->vs.base.prog_data */
+   },
+   .emit = genX(upload_vs_push_constants),
+};
+
+static void
+genX(upload_gs_push_constants)(struct brw_context *brw)
+{
+   /* Upload GS push constants, if a geometry program is bound. */
+   /* BRW_NEW_GEOMETRY_PROGRAM */
+   const struct brw_program *prog = brw_program_const(brw->geometry_program);
+   if (!prog)
+      return;   /* no geometry program: nothing to upload */
+
+   struct brw_stage_state *state = &brw->gs.base;
+   /* BRW_NEW_GS_PROG_DATA */
+   struct brw_stage_prog_data *data = state->prog_data;
+
+   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_GEOMETRY);
+   gen6_upload_push_constants(brw, &prog->program, data, state);
+}
+
+static const struct brw_tracked_state genX(gs_push_constants) = {
+   .dirty = {   /* state consumed by upload_gs_push_constants */
+      .mesa  = _NEW_PROGRAM_CONSTANTS |
+               _NEW_TRANSFORM,
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_GEOMETRY_PROGRAM |   /* brw->geometry_program */
+               BRW_NEW_GS_PROG_DATA,        /* brw->gs.base.prog_data */
+   },
+   .emit = genX(upload_gs_push_constants),
+};
+
+static void
+genX(upload_wm_push_constants)(struct brw_context *brw)
+{
+   /* Refresh subroutine indices, then upload the FS push constants. */
+   struct brw_stage_state *state = &brw->wm.base;
+   /* BRW_NEW_FS_PROG_DATA */
+   const struct brw_stage_prog_data *data = state->prog_data;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   const struct brw_program *prog = brw_program_const(brw->fragment_program);
+
+   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_FRAGMENT);
+   gen6_upload_push_constants(brw, &prog->program, data, state);
+}
+
+static const struct brw_tracked_state genX(wm_push_constants) = {
+   .dirty = {   /* state consumed by upload_wm_push_constants */
+      .mesa  = _NEW_PROGRAM_CONSTANTS,
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_FRAGMENT_PROGRAM |   /* brw->fragment_program */
+               BRW_NEW_FS_PROG_DATA,        /* brw->wm.base.prog_data */
+   },
+   .emit = genX(upload_wm_push_constants),
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 6
+/* Combine GL sample coverage and sample mask into a hardware sample mask. */
+static unsigned
+genX(determine_sample_mask)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   float coverage = 1.0f;
+   bool coverage_invert = false;
+   unsigned sample_mask = ~0u;
+   /* BRW_NEW_NUM_SAMPLES */
+   unsigned num_samples = brw->num_samples;
+   /* _NEW_MULTISAMPLE: overrides only apply while multisampling is on. */
+   if (_mesa_is_multisample_enabled(ctx)) {
+      if (ctx->Multisample.SampleCoverage) {
+         coverage = ctx->Multisample.SampleCoverageValue;
+         coverage_invert = ctx->Multisample.SampleCoverageInvert;
+      }
+      if (ctx->Multisample.SampleMask) {
+         sample_mask = ctx->Multisample.SampleMaskValue;
+      }
+   }
+   /* Coverage selects the low round(n * coverage) bits, optionally flipped. */
+   if (num_samples > 1) {
+      int coverage_int = (int) (num_samples * coverage + 0.5f);
+      uint32_t coverage_bits = (1 << coverage_int) - 1;
+      if (coverage_invert)
+         coverage_bits ^= (1 << num_samples) - 1;
+      return coverage_bits & sample_mask;
+   } else {
+      return 1;
+   }
+}
+
+static void
+genX(emit_3dstate_multisample2)(struct brw_context *brw,
+                                unsigned num_samples)
+{
+   unsigned log2_samples = ffs(num_samples) - 1;
+   /* Emit 3DSTATE_MULTISAMPLE; the sample count is passed as ffs(n) - 1. */
+   brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
+      multi.PixelLocation = CENTER;
+      multi.NumberofMultisamples = log2_samples;
+#if GEN_GEN == 6
+      GEN_SAMPLE_POS_4X(multi.Sample);   /* Gen6: always the 4x positions */
+#elif GEN_GEN == 7
+      switch (num_samples) {
+      case 1:
+         GEN_SAMPLE_POS_1X(multi.Sample);
+         break;
+      case 2:
+         GEN_SAMPLE_POS_2X(multi.Sample);
+         break;
+      case 4:
+         GEN_SAMPLE_POS_4X(multi.Sample);
+         break;
+      case 8:
+         GEN_SAMPLE_POS_8X(multi.Sample);
+         break;
+      default:
+         break;   /* other counts: no positions are written here */
+      }
+#endif /* other gens: this function writes no sample positions */
+   }
+}
+
+static void
+genX(upload_multisample_state)(struct brw_context *brw)
+{
+   /* Emits 3DSTATE_MULTISAMPLE followed by 3DSTATE_SAMPLE_MASK. */
+   const unsigned samples = brw->num_samples;
+   assert(samples > 0 && samples <= 16);
+   genX(emit_3dstate_multisample2)(brw, samples);
+   brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), mask_pkt) {
+      mask_pkt.SampleMask = genX(determine_sample_mask)(brw);
+   }
+}
+
+static const struct brw_tracked_state genX(multisample_state) = {
+   .dirty = {
+      .mesa = _NEW_MULTISAMPLE,      /* sample coverage and sample mask */
+      .brw = BRW_NEW_BLORP |
+             BRW_NEW_CONTEXT |
+             BRW_NEW_NUM_SAMPLES,    /* brw->num_samples */
+   },
+   .emit = genX(upload_multisample_state)
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static void
+genX(upload_color_calc_state)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* Builds COLOR_CALC_STATE; Gen6+ also emits the pointer packet to it. */
+   brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
+#if GEN_GEN <= 5
+      cc.IndependentAlphaBlendEnable =
+         set_blend_entry_bits(brw, &cc, 0, false);
+      set_depth_stencil_bits(brw, &cc);
+      /* Alpha test is only enabled with at most one color draw buffer. */
+      if (ctx->Color.AlphaEnabled &&
+          ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
+         cc.AlphaTestEnable = true;
+         cc.AlphaTestFunction =
+            intel_translate_compare_func(ctx->Color.AlphaFunc);
+      }
+
+      cc.ColorDitherEnable = ctx->Color.DitherFlag;
+
+      cc.StatisticsEnable = brw->stats_wm;
+
+      cc.CCViewportStatePointer =
+         instruction_ro_bo(brw->batch.bo, brw->cc.vp_offset);
+#else
+      /* _NEW_COLOR */
+      cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
+      cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
+      cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
+      cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
+
+#if GEN_GEN < 9
+      /* _NEW_STENCIL */
+      cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
+      cc.BackfaceStencilReferenceValue =
+         _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
+#endif
+
+#endif
+
+      /* _NEW_COLOR */
+      UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
+                               ctx->Color.AlphaRef);
+   }
+   /* Gen6+ emits a pointer packet; older gens flag GEN4 unit state. */
+#if GEN_GEN >= 6
+   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+      ptr.ColorCalcStatePointer = brw->cc.state_offset;
+#if GEN_GEN != 7
+      ptr.ColorCalcStatePointerValid = true;
+#endif
+   }
+#else
+   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+#endif
+}
+
+static const struct brw_tracked_state genX(color_calc_state) = {
+   .dirty = {
+      .mesa = _NEW_COLOR |     /* blend color, alpha test, dither */
+              _NEW_STENCIL |   /* stencil reference values */
+              (GEN_GEN <= 5 ? _NEW_BUFFERS |   /* _NumColorDrawBuffers */
+                              _NEW_DEPTH
+                            : 0),
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             (GEN_GEN <= 5 ? BRW_NEW_CC_VP |      /* brw->cc.vp_offset */
+                             BRW_NEW_STATS_WM     /* brw->stats_wm */
+                           : BRW_NEW_CC_STATE |
+                             BRW_NEW_STATE_BASE_ADDRESS),
+   },
+   .emit = genX(upload_color_calc_state),
+};
+
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 7
+static void
+genX(upload_sbe)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_FS_PROG_DATA */
+   const struct brw_wm_prog_data *wm_prog_data =
+      brw_wm_prog_data(brw->wm.base.prog_data);
+#if GEN_GEN >= 8
+   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
+#else
+#define attr_overrides sbe.Attribute /* written straight into the packet */
+#endif
+   uint32_t urb_entry_read_length;
+   uint32_t urb_entry_read_offset;
+   uint32_t point_sprite_enables;
+   /* The three values above are filled by calculate_attr_overrides(). */
+   brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
+      sbe.AttributeSwizzleEnable = true;
+      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+
+      /* _NEW_BUFFERS */
+      bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+
+      /* _NEW_POINT
+       *
+       * Window coordinates in an FBO are inverted, which means point
+       * sprite origin must be inverted.
+       */
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
+         sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
+      else
+         sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
+
+      /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
+       * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
+       * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
+       * BRW_NEW_VUE_MAP_GEOM_OUT
+       */
+      genX(calculate_attr_overrides)(brw,
+                                     attr_overrides,
+                                     &point_sprite_enables,
+                                     &urb_entry_read_length,
+                                     &urb_entry_read_offset);
+
+      /* Typically, the URB entry read length and offset should be programmed
+       * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
+       * stage which produces geometry.  However, we don't know the proper
+       * value until we call calculate_attr_overrides().
+       *
+       * To fit with our existing code, we override the inherited values and
+       * specify it here directly, as we did on previous generations.
+       */
+      sbe.VertexURBEntryReadLength = urb_entry_read_length;
+      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
+      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
+      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
+
+#if GEN_GEN >= 8
+      sbe.ForceVertexURBEntryReadLength = true;
+      sbe.ForceVertexURBEntryReadOffset = true;
+#endif
+
+#if GEN_GEN >= 9
+      /* prepare the active component dwords */
+      int input_index = 0;
+      for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
+         if (!(brw->fragment_program->info.inputs_read &
+               BITFIELD64_BIT(attr))) {
+            continue;
+         }
+
+         assert(input_index < 32);
+         /* Every input the fragment program reads is marked as XYZW. */
+         sbe.AttributeActiveComponentFormat[input_index] = ACTIVE_COMPONENT_XYZW;
+         ++input_index;
+      }
+#endif
+   }
+   /* Gen8+ carries the attribute overrides in a separate packet. */
+#if GEN_GEN >= 8
+   brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
+      for (int i = 0; i < 16; i++)
+         sbes.Attribute[i] = attr_overrides[i];
+   }
+#endif
+
+#undef attr_overrides
+}
+
+static const struct brw_tracked_state genX(sbe_state) = {
+   .dirty = {   /* mirrors the annotations inside upload_sbe */
+      .mesa  = _NEW_BUFFERS |   /* user FBO check for sprite origin */
+               _NEW_LIGHT |
+               _NEW_POINT |     /* point sprite origin */
+               _NEW_POLYGON |
+               _NEW_PROGRAM,
+      .brw   = BRW_NEW_BLORP |
+               BRW_NEW_CONTEXT |
+               BRW_NEW_FRAGMENT_PROGRAM |
+               BRW_NEW_FS_PROG_DATA |   /* varying count, flat inputs */
+               BRW_NEW_GS_PROG_DATA |
+               BRW_NEW_TES_PROG_DATA |
+               BRW_NEW_VUE_MAP_GEOM_OUT |
+               (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE   /* listed on Gen7 only */
+                             : 0),
+   },
+   .emit = genX(upload_sbe),
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 7
+/**
+ * Outputs the 3DSTATE_SO_DECL_LIST command.
+ *
+ * The data output is a series of 64-bit entries, each holding one SO_DECL
+ * for each of the four vertex streams.  Streams with fewer declarations
+ * than the longest one are padded with zeroed SO_DECLs.
+ */
+static void
+genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
+                                  const struct brw_vue_map *vue_map)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_TRANSFORM_FEEDBACK */
+   struct gl_transform_feedback_object *xfb_obj =
+      ctx->TransformFeedback.CurrentObject;
+   const struct gl_transform_feedback_info *linked_xfb_info =
+      xfb_obj->program->sh.LinkedTransformFeedback;
+   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
+   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+   int max_decls = 0;
+   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
+
+   memset(so_decl, 0, sizeof(so_decl));
+
+   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
+    * command feels strange -- each dword pair contains a SO_DECL per stream.
+    */
+   for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
+      const struct gl_transform_feedback_output *output =
+         &linked_xfb_info->Outputs[i];
+      const int buffer = output->OutputBuffer;
+      const int varying = output->OutputRegister;
+      const unsigned stream_id = output->StreamId;
+      assert(stream_id < MAX_VERTEX_STREAMS);
+
+      buffer_mask[stream_id] |= 1 << buffer;
+
+      assert(vue_map->varying_to_slot[varying] >= 0);
+
+      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
+       * array.  Instead, it simply increments DstOffset for the following
+       * input by the number of components that should be skipped.
+       *
+       * Our hardware is unusual in that it requires us to program SO_DECLs
+       * for fake "hole" components, rather than simply taking the offset
+       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
+       * program as many size = 4 holes as we can, then a final hole to
+       * accommodate the final 1, 2, or 3 remaining.
+       */
+      int skip_components = output->DstOffset - next_offset[buffer];
+      /* NOTE(review): next_offset is sized by streams, indexed by buffer. */
+      while (skip_components > 0) {
+         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+            .HoleFlag = 1,
+            .OutputBufferSlot = output->OutputBuffer,
+            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
+         };
+         skip_components -= 4;
+      }
+
+      next_offset[buffer] = output->DstOffset + output->NumComponents;
+      /* The real declaration for this output follows any holes. */
+      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+         .OutputBufferSlot = output->OutputBuffer,
+         .RegisterIndex = vue_map->varying_to_slot[varying],
+         .ComponentMask =
+            ((1 << output->NumComponents) - 1) << output->ComponentOffset,
+      };
+      /* The packet length is driven by the longest per-stream list. */
+      if (decls[stream_id] > max_decls)
+         max_decls = decls[stream_id];
+   }
+
+   uint32_t *dw;
+   dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
+                        .StreamtoBufferSelects0 = buffer_mask[0],
+                        .StreamtoBufferSelects1 = buffer_mask[1],
+                        .StreamtoBufferSelects2 = buffer_mask[2],
+                        .StreamtoBufferSelects3 = buffer_mask[3],
+                        .NumEntries0 = decls[0],
+                        .NumEntries1 = decls[1],
+                        .NumEntries2 = decls[2],
+                        .NumEntries3 = decls[3]);
+   /* Pack one entry per index, combining the four streams' declarations. */
+   for (int i = 0; i < max_decls; i++) {
+      GENX(SO_DECL_ENTRY_pack)(
+         brw, dw + 2 + i * 2,
+         &(struct GENX(SO_DECL_ENTRY)) {
+            .Stream0Decl = so_decl[0][i],
+            .Stream1Decl = so_decl[1][i],
+            .Stream2Decl = so_decl[2][i],
+            .Stream3Decl = so_decl[3][i],
+         });
+   }
+}
+
+static void
+genX(upload_3dstate_so_buffers)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_TRANSFORM_FEEDBACK */
+   struct gl_transform_feedback_object *xfb_obj =
+      ctx->TransformFeedback.CurrentObject;
+#if GEN_GEN < 8
+   const struct gl_transform_feedback_info *linked_xfb_info =
+      xfb_obj->program->sh.LinkedTransformFeedback;
+#else
+   struct brw_transform_feedback_object *brw_obj =
+      (struct brw_transform_feedback_object *) xfb_obj;
+   uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+#endif
+
+   /* Set up the up to 4 output buffers.  These are the ranges defined in the
+    * gl_transform_feedback_object.
+    */
+   for (int i = 0; i < 4; i++) {
+      struct intel_buffer_object *bufferobj =
+         intel_buffer_object(xfb_obj->Buffers[i]);
+      /* Unbound slots still get a packet carrying only the buffer index. */
+      if (!bufferobj) {
+         brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
+            sob.SOBufferIndex = i;
+         }
+         continue;
+      }
+
+      uint32_t start = xfb_obj->Offset[i];
+      assert(start % 4 == 0);
+      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
+      struct brw_bo *bo =
+         intel_bufferobj_buffer(brw, bufferobj, start, end - start, true);
+      assert(end <= bo->size);
+
+      brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
+         sob.SOBufferIndex = i;
+
+         sob.SurfaceBaseAddress = render_bo(bo, start);
+#if GEN_GEN < 8
+         sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
+         sob.SurfaceEndAddress = render_bo(bo, end);
+#else
+         sob.SOBufferEnable = true;
+         sob.StreamOffsetWriteEnable = true;
+         sob.StreamOutputBufferOffsetAddressEnable = true;
+         sob.SOBufferMOCS = mocs_wb;
+         /* SurfaceSize is Size / 4 minus one, clamped so it is never < 0. */
+         sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
+         sob.StreamOutputBufferOffsetAddress =
+            instruction_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
+
+         if (brw_obj->zero_offsets) {
+            /* Zero out the offset and write that to offset_bo */
+            sob.StreamOffset = 0;
+         } else {
+            /* Use offset_bo as the "Stream Offset." */
+            sob.StreamOffset = 0xFFFFFFFF;
+         }
+#endif
+      }
+   }
+   /* The zero-offset request is one-shot: clear it after this upload. */
+#if GEN_GEN >= 8
+   brw_obj->zero_offsets = false;
+#endif
+}
+
+static inline bool
+query_active(struct gl_query_object *q)
+{
+   return q ? q->Active : false;   /* a NULL query counts as inactive */
+}
+
+static void
+genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
+                               const struct brw_vue_map *vue_map)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_TRANSFORM_FEEDBACK */
+   struct gl_transform_feedback_object *xfb_obj =
+      ctx->TransformFeedback.CurrentObject;
+   /* When !active the packet body is left untouched (presumably defaults). */
+   brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
+      if (active) {
+         int urb_entry_read_offset = 0;
+         int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
+            urb_entry_read_offset;
+
+         sos.SOFunctionEnable = true;
+         sos.SOStatisticsEnable = true;
+
+         /* BRW_NEW_RASTERIZER_DISCARD */
+         if (ctx->RasterDiscard) {
+            if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
+               sos.RenderingDisable = true;
+            } else {
+               perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
+                          "query active relies on the clipper.");
+            }
+         }
+
+         /* _NEW_LIGHT */
+         if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
+            sos.ReorderMode = TRAILING;
+
+#if GEN_GEN < 8
+         sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
+         sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
+         sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
+         sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
+#else
+         const struct gl_transform_feedback_info *linked_xfb_info =
+            xfb_obj->program->sh.LinkedTransformFeedback;
+         /* Set buffer pitches; 0 means unbound. */
+         if (xfb_obj->Buffers[0])
+            sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
+         if (xfb_obj->Buffers[1])
+            sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
+         if (xfb_obj->Buffers[2])
+            sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
+         if (xfb_obj->Buffers[3])
+            sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
+#endif
+
+         /* We always read the whole vertex.  This could be reduced at some
+          * point by reading less and offsetting the register index in the
+          * SO_DECLs.
+          */
+         sos.Stream0VertexReadOffset = urb_entry_read_offset;
+         sos.Stream0VertexReadLength = urb_entry_read_length - 1;
+         sos.Stream1VertexReadOffset = urb_entry_read_offset;
+         sos.Stream1VertexReadLength = urb_entry_read_length - 1;
+         sos.Stream2VertexReadOffset = urb_entry_read_offset;
+         sos.Stream2VertexReadLength = urb_entry_read_length - 1;
+         sos.Stream3VertexReadOffset = urb_entry_read_offset;
+         sos.Stream3VertexReadLength = urb_entry_read_length - 1;
+      }
+   }
+}
+
+static void
+genX(upload_sol)(struct brw_context *brw)
+{
+   /* BRW_NEW_VUE_MAP_GEOM_OUT */
+   const struct brw_vue_map *geom_vue_map = &brw->vue_map_geom_out;
+   /* BRW_NEW_TRANSFORM_FEEDBACK */
+   const bool xfb_running = _mesa_is_xfb_active_and_unpaused(&brw->ctx);
+
+   /* SO buffers and decls are only programmed while XFB is running. */
+   if (xfb_running) {
+      genX(upload_3dstate_so_buffers)(brw);
+      genX(upload_3dstate_so_decl_list)(brw, geom_vue_map);
+   }
+
+   /* 3DSTATE_STREAMOUT goes last: it must always follow updates to the
+    * nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
+    * MMIO register updates (currently performed by the kernel at each
+    * batch emit).
+    */
+   genX(upload_3dstate_streamout)(brw, xfb_running, geom_vue_map);
+}
+
+static const struct brw_tracked_state genX(sol_state) = {
+   .dirty = {
+      .mesa  = _NEW_LIGHT,                   /* provoking vertex convention */
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_RASTERIZER_DISCARD |  /* ctx->RasterDiscard */
+               BRW_NEW_VUE_MAP_GEOM_OUT |    /* brw->vue_map_geom_out */
+               BRW_NEW_TRANSFORM_FEEDBACK,   /* current XFB object/state */
+   },
+   .emit = genX(upload_sol),
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 7
+static void
+genX(upload_ps)(struct brw_context *brw)
+{
+   UNUSED const struct gl_context *ctx = &brw->ctx;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   /* BRW_NEW_FS_PROG_DATA */
+   const struct brw_wm_prog_data *prog_data =
+      brw_wm_prog_data(brw->wm.base.prog_data);
+   const struct brw_stage_state *stage_state = &brw->wm.base;
+
+   /* Emit 3DSTATE_PS: kernel pointers, dispatch modes, thread limits and
+    * per-generation pixel shader controls.
+    */
+   brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
+      /* Initialize the execution mask with VMask.  Otherwise, derivatives are
+       * incorrect for subspans where some of the pixels are unlit.  We believe
+       * the bit just didn't take effect in previous generations.
+       */
+      ps.VectorMaskEnable = GEN_GEN >= 8;
+
+      ps.SamplerCount =
+         DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
+
+      /* BRW_NEW_FS_PROG_DATA */
+      ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
+
+      if (prog_data->base.use_alt_mode)
+         ps.FloatingPointMode = Alternate;
+
+      /* Haswell requires the sample mask to be set in this packet as well as
+       * in 3DSTATE_SAMPLE_MASK; the values should match.
+       */
+
+      /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
+#if GEN_IS_HASWELL
+      ps.SampleMask = genX(determine_sample_mask)(brw);
+#endif
+
+      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
+       * it implicitly scales for different GT levels (which have some # of
+       * PSDs).
+       *
+       * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
+       */
+#if GEN_GEN >= 9
+      ps.MaximumNumberofThreadsPerPSD = 64 - 1;
+#elif GEN_GEN >= 8
+      ps.MaximumNumberofThreadsPerPSD = 64 - 2;
+#else
+      ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
+#endif
+
+      if (prog_data->base.nr_params > 0 ||
+          prog_data->base.ubo_ranges[0].length > 0)
+         ps.PushConstantEnable = true;
+
+#if GEN_GEN < 8
+      /* From the IVB PRM, volume 2 part 1, page 287:
+       * "This bit is inserted in the PS payload header and made available to
+       * the DataPort (either via the message header or via header bypass) to
+       * indicate that oMask data (one or two phases) is included in Render
+       * Target Write messages. If present, the oMask data is used to mask off
+       * samples."
+       */
+      ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
+
+      /* The hardware wedges if you have this bit set but don't turn on any
+       * dual source blend factors.
+       *
+       * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
+       */
+      ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
+                                 (ctx->Color.BlendEnabled & 1) &&
+                                 ctx->Color.Blend[0]._UsesDualSrc;
+
+      /* BRW_NEW_FS_PROG_DATA */
+      ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
+#endif
+
+      /* From the documentation for this packet:
+       * "If the PS kernel does not need the Position XY Offsets to
+       *  compute a Position Value, then this field should be programmed
+       *  to POSOFFSET_NONE."
+       *
+       * "SW Recommendation: If the PS kernel needs the Position Offsets
+       *  to compute a Position XY value, this field should match Position
+       *  ZW Interpolation Mode to ensure a consistent position.xyzw
+       *  computation."
+       *
+       * We only require XY sample offsets. So, this recommendation doesn't
+       * look useful at the moment. We might need this in future.
+       */
+      if (prog_data->uses_pos_offset)
+         ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
+      else
+         ps.PositionXYOffsetSelect = POSOFFSET_NONE;
+
+      ps.RenderTargetFastClearEnable = brw->wm.fast_clear_op;
+      ps._8PixelDispatchEnable = prog_data->dispatch_8;
+      ps._16PixelDispatchEnable = prog_data->dispatch_16;
+      ps.DispatchGRFStartRegisterForConstantSetupData0 =
+         prog_data->base.dispatch_grf_start_reg;
+      ps.DispatchGRFStartRegisterForConstantSetupData2 =
+         prog_data->dispatch_grf_start_reg_2;
+
+      ps.KernelStartPointer0 = stage_state->prog_offset;
+      ps.KernelStartPointer2 = stage_state->prog_offset +
+         prog_data->prog_offset_2;
+
+      if (prog_data->base.total_scratch) {
+         ps.ScratchSpaceBasePointer =
+            render_bo(stage_state->scratch_bo,
+                      ffs(stage_state->per_thread_scratch) - 11);
+      }
+   }
+}
+
+static const struct brw_tracked_state genX(ps_state) = {
+   .dirty = {
+      .mesa  = _NEW_MULTISAMPLE |
+               (GEN_GEN < 8 ? _NEW_BUFFERS | /* these two only before Gen8 */
+                              _NEW_COLOR
+                            : 0),
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_FS_PROG_DATA,
+   },
+   .emit = genX(upload_ps), /* callback paired with the flags above */
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 7
+static void
+genX(upload_hs_state)(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   struct brw_stage_state *stage_state = &brw->tcs.base;
+   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
+   const struct brw_vue_prog_data *vue_prog_data =
+      brw_vue_prog_data(stage_prog_data);
+   /* Emits 3DSTATE_HS from the tessellation control (TCS) stage state. */
+   /* BRW_NEW_TCS_PROG_DATA */
+   struct brw_tcs_prog_data *tcs_prog_data =
+      brw_tcs_prog_data(stage_prog_data);
+
+   if (!tcs_prog_data) { /* no TCS data: packet is emitted with no fields set */
+      brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
+   } else {
+      brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
+         INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
+         /* Instance count and thread limit are both written minus one. */
+         hs.InstanceCount = tcs_prog_data->instances - 1;
+         hs.IncludeVertexHandles = true;
+
+         hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+      }
+   }
+}
+
+static const struct brw_tracked_state genX(hs_state) = {
+   .dirty = {
+      .mesa  = 0, /* no core-Mesa _NEW_* flags */
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_TCS_PROG_DATA |
+               BRW_NEW_TESS_PROGRAMS,
+   },
+   .emit = genX(upload_hs_state), /* emits 3DSTATE_HS */
+};
+
+static void
+genX(upload_ds_state)(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   const struct brw_stage_state *stage_state = &brw->tes.base;
+   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
+   /* Emits 3DSTATE_DS from the tessellation evaluation (TES) stage state. */
+   /* BRW_NEW_TES_PROG_DATA */
+   const struct brw_tes_prog_data *tes_prog_data =
+      brw_tes_prog_data(stage_prog_data);
+   const struct brw_vue_prog_data *vue_prog_data =
+      brw_vue_prog_data(stage_prog_data);
+
+   if (!tes_prog_data) { /* no TES data: packet is emitted with no fields set */
+      brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
+   } else {
+      brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
+         INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
+
+        ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
+        ds.ComputeWCoordinateEnable =
+           tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+        /* Gen8+ only: SIMD8 dispatch selection and the cull-distance mask. */
+#if GEN_GEN >= 8
+        if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
+           ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
+        ds.UserClipDistanceCullTestEnableBitmask =
+            vue_prog_data->cull_distance_mask;
+#endif
+      }
+   }
+}
+
+static const struct brw_tracked_state genX(ds_state) = {
+   .dirty = {
+      .mesa  = 0, /* no core-Mesa _NEW_* flags */
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_TESS_PROGRAMS |
+               BRW_NEW_TES_PROG_DATA,
+   },
+   .emit = genX(upload_ds_state), /* emits 3DSTATE_DS */
+};
+
+/* ---------------------------------------------------------------------- */
+
+static void
+upload_te_state(struct brw_context *brw)
+{
+   /* BRW_NEW_TESS_PROGRAMS: TEEnable is set iff tess_eval_program is set. */
+   bool active = brw->tess_eval_program;
+
+   /* BRW_NEW_TES_PROG_DATA */
+   const struct brw_tes_prog_data *tes_prog_data =
+      brw_tes_prog_data(brw->tes.base.prog_data);
+   /* tes_prog_data is only dereferenced on the active path below. */
+   if (active) {
+      brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
+         te.Partitioning = tes_prog_data->partitioning;
+         te.OutputTopology = tes_prog_data->output_topology;
+         te.TEDomain = tes_prog_data->domain;
+         te.TEEnable = true;
+         te.MaximumTessellationFactorOdd = 63.0;
+         te.MaximumTessellationFactorNotOdd = 64.0;
+      }
+   } else {
+      brw_batch_emit(brw, GENX(3DSTATE_TE), te);
+   }
+}
+
+static const struct brw_tracked_state genX(te_state) = {
+   .dirty = {
+      .mesa  = 0, /* no core-Mesa _NEW_* flags */
+      .brw   = BRW_NEW_BLORP |
+               BRW_NEW_CONTEXT |
+               BRW_NEW_TES_PROG_DATA |
+               BRW_NEW_TESS_PROGRAMS,
+   },
+   .emit = upload_te_state, /* emits 3DSTATE_TE; not genX()-wrapped */
+};
+
+/* ---------------------------------------------------------------------- */
+
+static void
+genX(upload_tes_push_constants)(struct brw_context *brw)
+{
+   struct brw_stage_state *stage_state = &brw->tes.base;
+   /* BRW_NEW_TESS_PROGRAMS */
+   const struct brw_program *tep = brw_program_const(brw->tess_eval_program);
+   /* Nothing is uploaded when tep is NULL. */
+   if (tep) {
+      /* BRW_NEW_TES_PROG_DATA */
+      const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
+      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_TESS_EVAL);
+      gen6_upload_push_constants(brw, &tep->program, prog_data, stage_state);
+   }
+}
+
+static const struct brw_tracked_state genX(tes_push_constants) = {
+   .dirty = {
+      .mesa  = _NEW_PROGRAM_CONSTANTS,
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_TESS_PROGRAMS |
+               BRW_NEW_TES_PROG_DATA,
+   },
+   .emit = genX(upload_tes_push_constants), /* TES push-constant upload */
+};
+
+static void
+genX(upload_tcs_push_constants)(struct brw_context *brw)
+{
+   struct brw_stage_state *stage_state = &brw->tcs.base;
+   /* BRW_NEW_TESS_PROGRAMS */
+   const struct brw_program *tcp = brw_program_const(brw->tess_ctrl_program);
+   bool active = brw->tess_eval_program;
+   /* The upload is gated on tess_eval_program being set, not on tcp. */
+   if (active) {
+      /* BRW_NEW_TCS_PROG_DATA */
+      const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
+      /* NOTE(review): tcp is not NULL-checked before &tcp->program -- confirm. */
+      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_TESS_CTRL);
+      gen6_upload_push_constants(brw, &tcp->program, prog_data, stage_state);
+   }
+}
+
+static const struct brw_tracked_state genX(tcs_push_constants) = {
+   .dirty = {
+      .mesa  = _NEW_PROGRAM_CONSTANTS,
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_DEFAULT_TESS_LEVELS |
+               BRW_NEW_TESS_PROGRAMS |
+               BRW_NEW_TCS_PROG_DATA,
+   },
+   .emit = genX(upload_tcs_push_constants), /* TCS push-constant upload */
+};
+
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 7
+static void
+genX(upload_cs_state)(struct brw_context *brw)
+{
+   if (!brw->cs.base.prog_data)
+      return;
+   /* Reserve the interface descriptor now; it is packed near the end. */
+   uint32_t offset;
+   uint32_t *desc = (uint32_t*) brw_state_batch(
+      brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
+      &offset);
+
+   struct brw_stage_state *stage_state = &brw->cs.base;
+   struct brw_stage_prog_data *prog_data = stage_state->prog_data;
+   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+      brw_emit_buffer_surface_state(
+         brw, &stage_state->surf_offset[
+                 prog_data->binding_table.shader_time_start],
+         brw->shader_time.bo, 0, ISL_FORMAT_RAW,
+         brw->shader_time.bo->size, 1, true);
+   }
+   /* Binding table storage; filled from surf_offset by the memcpy below. */
+   uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
+                                    32, &stage_state->bind_bo_offset);
+
+   brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
+      if (prog_data->total_scratch) {
+         uint32_t bo_offset;
+
+         if (GEN_GEN >= 8) {
+            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
+             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
+             */
+            bo_offset = ffs(stage_state->per_thread_scratch) - 11;
+         } else if (GEN_IS_HASWELL) {
+            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
+             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
+             */
+            bo_offset = ffs(stage_state->per_thread_scratch) - 12;
+         } else {
+            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
+             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
+             */
+            bo_offset = stage_state->per_thread_scratch / 1024 - 1;
+         }
+         vfe.ScratchSpaceBasePointer =
+            render_bo(stage_state->scratch_bo, bo_offset);
+      }
+
+      const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
+      vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
+      vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
+      vfe.ResetGatewayTimer =
+         Resettingrelativetimerandlatchingtheglobaltimestamp;
+#if GEN_GEN < 9
+      vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
+#endif
+#if GEN_GEN == 7
+      vfe.GPGPUMode = 1;
+#endif
+
+      /* We are uploading duplicated copies of push constant uniforms for each
+       * thread. Although the local id data needs to vary per thread, it won't
+       * change for other uniform data. Unfortunately this duplication is
+       * required for gen7. As of Haswell, this duplication can be avoided,
+       * but this older mechanism with duplicated data continues to work.
+       *
+       * FINISHME: As of Haswell, we could make use of the
+       * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
+       * field to only store one copy of uniform data.
+       *
+       * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
+       * which is described in the GPGPU_WALKER command and in the Broadwell
+       * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
+       * Operations => GPGPU Mode => Indirect Payload Storage.
+       *
+       * Note: The constant data is built in brw_upload_cs_push_constants
+       * below.
+       */
+      vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;
+
+      const uint32_t vfe_curbe_allocation =
+         ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
+               cs_prog_data->push.cross_thread.regs, 2);
+      vfe.CURBEAllocationSize = vfe_curbe_allocation;
+   }
+   /* MEDIA_CURBE_LOAD is skipped entirely when there are no push constants. */
+   if (cs_prog_data->push.total.size > 0) {
+      brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
+         curbe.CURBETotalDataLength =
+            ALIGN(cs_prog_data->push.total.size, 64);
+         curbe.CURBEDataStartAddress = stage_state->push_const_offset;
+      }
+   }
+
+   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
+   memcpy(bind, stage_state->surf_offset,
+          prog_data->binding_table.size_bytes);
+   const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
+      .KernelStartPointer = brw->cs.base.prog_offset,
+      .SamplerStatePointer = stage_state->sampler_offset,
+      .SamplerCount = DIV_ROUND_UP(stage_state->sampler_count, 4) >> 2,
+      .BindingTablePointer = stage_state->bind_bo_offset,
+      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
+      .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
+      .SharedLocalMemorySize = encode_slm_size(devinfo->gen,
+                                               prog_data->total_shared),
+      .BarrierEnable = cs_prog_data->uses_barrier,
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+      .CrossThreadConstantDataReadLength =
+         cs_prog_data->push.cross_thread.regs,
+#endif
+   };
+   /* NOTE(review): SamplerCount above is (count / 4, rounded up) >> 2 -- confirm. */
+   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);
+
+   brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
+      load.InterfaceDescriptorTotalLength =
+         GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+      load.InterfaceDescriptorDataStartAddress = offset;
+   }
+}
+
+static const struct brw_tracked_state genX(cs_state) = {
+   .dirty = {
+      .mesa = _NEW_PROGRAM_CONSTANTS,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_CS_PROG_DATA |
+             BRW_NEW_SAMPLER_STATE_TABLE |
+             BRW_NEW_SURFACES,
+   },
+   .emit = genX(upload_cs_state) /* emits the MEDIA_* compute state packets */
+};
+
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 8
+static void
+genX(upload_raster)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* Fill 3DSTATE_RASTER from GL state; the GL context is only read here. */
+   /* _NEW_BUFFERS */
+   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+
+   /* _NEW_POLYGON */
+   struct gl_polygon_attrib *polygon = &ctx->Polygon;
+
+   /* _NEW_POINT */
+   const struct gl_point_attrib *point = &ctx->Point;
+
+   brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
+      if (brw->polygon_front_bit == render_to_fbo)
+         raster.FrontWinding = CounterClockwise;
+
+      if (polygon->CullFlag) {
+         switch (polygon->CullFaceMode) {
+         case GL_FRONT:
+            raster.CullMode = CULLMODE_FRONT;
+            break;
+         case GL_BACK:
+            raster.CullMode = CULLMODE_BACK;
+            break;
+         case GL_FRONT_AND_BACK:
+            raster.CullMode = CULLMODE_BOTH;
+            break;
+         default:
+            unreachable("not reached");
+         }
+      } else {
+         raster.CullMode = CULLMODE_NONE;
+      }
+      /* _NEW_POINT: copy GL state into the packet; the reverse clobbers ctx. */
+      raster.SmoothPointEnable = point->SmoothFlag;
+
+      raster.DXMultisampleRasterizationEnable =
+         _mesa_is_multisample_enabled(ctx);
+
+      raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
+      raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
+      raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
+
+      switch (polygon->FrontMode) {
+      case GL_FILL:
+         raster.FrontFaceFillMode = FILL_MODE_SOLID;
+         break;
+      case GL_LINE:
+         raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
+         break;
+      case GL_POINT:
+         raster.FrontFaceFillMode = FILL_MODE_POINT;
+         break;
+      default:
+         unreachable("not reached");
+      }
+
+      switch (polygon->BackMode) {
+      case GL_FILL:
+         raster.BackFaceFillMode = FILL_MODE_SOLID;
+         break;
+      case GL_LINE:
+         raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
+         break;
+      case GL_POINT:
+         raster.BackFaceFillMode = FILL_MODE_POINT;
+         break;
+      default:
+         unreachable("not reached");
+      }
+
+      /* _NEW_LINE */
+      raster.AntialiasingEnable = ctx->Line.SmoothFlag;
+
+      /* _NEW_SCISSOR */
+      raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
+
+      /* _NEW_TRANSFORM */
+      if (!ctx->Transform.DepthClamp) {
+#if GEN_GEN >= 9
+         raster.ViewportZFarClipTestEnable = true;
+         raster.ViewportZNearClipTestEnable = true;
+#else
+         raster.ViewportZClipTestEnable = true;
+#endif
+      }
+
+      /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
+#if GEN_GEN >= 9
+      raster.ConservativeRasterizationEnable =
+         ctx->IntelConservativeRasterization;
+#endif
+
+      raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
+      raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
+      /* OffsetUnits is doubled to form the hardware constant. */
+      raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
+   }
+}
+
+static const struct brw_tracked_state genX(raster_state) = {
+   .dirty = {
+      .mesa  = _NEW_BUFFERS |
+               _NEW_LINE |
+               _NEW_MULTISAMPLE |
+               _NEW_POINT |
+               _NEW_POLYGON |
+               _NEW_SCISSOR |
+               _NEW_TRANSFORM,
+      .brw   = BRW_NEW_BLORP |
+               BRW_NEW_CONTEXT |
+               BRW_NEW_CONSERVATIVE_RASTERIZATION,
+   },
+   .emit = genX(upload_raster), /* emits 3DSTATE_RASTER */
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 8
+static void
+genX(upload_ps_extra)(struct brw_context *brw)
+{
+   UNUSED struct gl_context *ctx = &brw->ctx;
+   /* ctx is only referenced in the GEN_GEN >= 9 coverage-mask path below. */
+   const struct brw_wm_prog_data *prog_data =
+      brw_wm_prog_data(brw->wm.base.prog_data);
+   /* Emit 3DSTATE_PS_EXTRA from the fragment shader's prog_data. */
+   brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
+      psx.PixelShaderValid = true;
+      psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
+      psx.PixelShaderKillsPixel = prog_data->uses_kill;
+      psx.AttributeEnable = prog_data->num_varying_inputs != 0;
+      psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
+      psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
+      psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
+
+      /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
+      if (prog_data->uses_sample_mask) {
+#if GEN_GEN >= 9
+         if (prog_data->post_depth_coverage)
+            psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
+         else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
+            psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
+         else
+            psx.InputCoverageMaskState = ICMS_NORMAL;
+#else
+         psx.PixelShaderUsesInputCoverageMask = true;
+#endif
+      }
+
+      psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
+#if GEN_GEN >= 9
+      psx.PixelShaderPullsBary = prog_data->pulls_bary;
+      psx.PixelShaderComputesStencil = prog_data->computed_stencil;
+#endif
+
+      /* The stricter cross-primitive coherency guarantees that the hardware
+       * gives us with the "Accesses UAV" bit set for at least one shader stage
+       * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
+       * are redundant within the current image, atomic counter and SSBO GL
+       * APIs, which all have very loose ordering and coherency requirements
+       * and generally rely on the application to insert explicit barriers when
+       * a shader invocation is expected to see the memory writes performed by
+       * the invocations of some previous primitive.  Regardless of the value
+       * of "UAV coherency required", the "Accesses UAV" bits will implicitly
+       * cause an in most cases useless DC flush when the lowermost stage with
+       * the bit set finishes execution.
+       *
+       * It would be nice to disable it, but in some cases we can't because on
+       * Gen8+ it also has an influence on rasterization via the PS UAV-only
+       * signal (which could be set independently from the coherency mechanism
+       * in the 3DSTATE_WM command on Gen7), and because in some cases it will
+       * determine whether the hardware skips execution of the fragment shader
+       * or not via the ThreadDispatchEnable signal.  However if we know that
+       * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
+       * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
+       * difference so we may just disable it here.
+       *
+       * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
+       * take into account KillPixels when no depth or stencil writes are
+       * enabled.  In order for occlusion queries to work correctly with no
+       * attachments, we need to force-enable here.
+       *
+       * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
+       * _NEW_COLOR
+       */
+      if ((prog_data->has_side_effects || prog_data->uses_kill) &&
+          !brw_color_buffer_write_enabled(brw))
+         psx.PixelShaderHasUAV = true;
+   }
+}
+
+const struct brw_tracked_state genX(ps_extra) = {
+   .dirty = { /* NOTE(review): this atom is not static, unlike its siblings */
+      .mesa  = _NEW_BUFFERS | _NEW_COLOR,
+      .brw   = BRW_NEW_BLORP |
+               BRW_NEW_CONTEXT |
+               BRW_NEW_FRAGMENT_PROGRAM |
+               BRW_NEW_FS_PROG_DATA |
+               BRW_NEW_CONSERVATIVE_RASTERIZATION,
+   },
+   .emit = genX(upload_ps_extra), /* emits 3DSTATE_PS_EXTRA */
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 8
+static void
+genX(upload_ps_blend)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* Emit 3DSTATE_PS_BLEND; blend factors come from draw buffer 0 only. */
+   /* _NEW_BUFFERS */
+   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
+   const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
+
+   /* _NEW_COLOR */
+   struct gl_colorbuffer_attrib *color = &ctx->Color;
+
+   brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
+      /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
+      pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
+
+      bool alpha_to_one = false;
+
+      if (!buffer0_is_integer) {
+         /* _NEW_MULTISAMPLE */
+
+         if (_mesa_is_multisample_enabled(ctx)) {
+            pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
+            alpha_to_one = ctx->Multisample.SampleAlphaToOne;
+         }
+
+         pb.AlphaTestEnable = color->AlphaEnabled;
+      }
+
+      /* Used for implementing the following bit of GL_EXT_texture_integer:
+       * "Per-fragment operations that require floating-point color
+       *  components, including multisample alpha operations, alpha test,
+       *  blending, and dithering, have no effect when the corresponding
+       *  colors are written to an integer color buffer."
+       *
+       * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
+       * "If drawbuffer zero is not NONE and the buffer it references has an
+       *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
+       *  operations are skipped."
+       */
+      if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
+         GLenum eqRGB = color->Blend[0].EquationRGB;
+         GLenum eqA = color->Blend[0].EquationA;
+         GLenum srcRGB = color->Blend[0].SrcRGB;
+         GLenum dstRGB = color->Blend[0].DstRGB;
+         GLenum srcA = color->Blend[0].SrcA;
+         GLenum dstA = color->Blend[0].DstA;
+         /* MIN/MAX equations have their factors forced to GL_ONE. */
+         if (eqRGB == GL_MIN || eqRGB == GL_MAX)
+            srcRGB = dstRGB = GL_ONE;
+
+         if (eqA == GL_MIN || eqA == GL_MAX)
+            srcA = dstA = GL_ONE;
+
+         /* Due to hardware limitations, the destination may have information
+          * in an alpha channel even when the format specifies no alpha
+          * channel. In order to avoid getting any incorrect blending due to
+          * that alpha channel, coerce the blend factors to values that will
+          * not read the alpha channel, but will instead use the correct
+          * implicit value for alpha.
+          */
+         if (!_mesa_base_format_has_channel(rb->_BaseFormat,
+                                            GL_TEXTURE_ALPHA_TYPE)) {
+            srcRGB = brw_fix_xRGB_alpha(srcRGB);
+            srcA = brw_fix_xRGB_alpha(srcA);
+            dstRGB = brw_fix_xRGB_alpha(dstRGB);
+            dstA = brw_fix_xRGB_alpha(dstA);
+         }
+
+         /* Alpha to One doesn't work with Dual Color Blending.  Override
+          * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
+          */
+         if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
+            srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
+            srcA = fix_dual_blend_alpha_to_one(srcA);
+            dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
+            dstA = fix_dual_blend_alpha_to_one(dstA);
+         }
+
+         pb.ColorBufferBlendEnable = true;
+         pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
+         pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
+         pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
+         pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);
+         /* Compared after the factor fix-ups above, not on the raw GL state. */
+         pb.IndependentAlphaBlendEnable =
+            srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
+      }
+   }
+}
+
+static const struct brw_tracked_state genX(ps_blend) = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS |
+              _NEW_COLOR |
+              _NEW_MULTISAMPLE,
+      .brw = BRW_NEW_BLORP |
+             BRW_NEW_CONTEXT |
+             BRW_NEW_FRAGMENT_PROGRAM,
+   },
+   .emit = genX(upload_ps_blend) /* emits 3DSTATE_PS_BLEND */
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 8
+static void
+genX(emit_vf_topology)(struct brw_context *brw)
+{ /* Emit 3DSTATE_VF_TOPOLOGY carrying brw->primitive. */
+   brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
+      vftopo.PrimitiveTopologyType = brw->primitive;
+   }
+}
+
+static const struct brw_tracked_state genX(vf_topology) = {
+   .dirty = {
+      .mesa = 0, /* no core-Mesa _NEW_* flags */
+      .brw = BRW_NEW_BLORP |
+             BRW_NEW_PRIMITIVE,
+   },
+   .emit = genX(emit_vf_topology), /* emits 3DSTATE_VF_TOPOLOGY */
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 7
+static void
+genX(emit_mi_report_perf_count)(struct brw_context *brw,
+                                struct brw_bo *bo,
+                                uint32_t offset_in_bytes,
+                                uint32_t report_id)
+{ /* Emit one MI_REPORT_PERF_COUNT; its address is built from bo + offset. */
+   brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
+      mi_rpc.MemoryAddress = instruction_bo(bo, offset_in_bytes);
+      mi_rpc.ReportID = report_id;
+   }
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+/**
+ * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet (Gen7+ only).
+ */
+static void
+genX(emit_sampler_state_pointers_xs)(struct brw_context *brw,
+                                     struct brw_stage_state *stage_state)
+{
+#if GEN_GEN >= 7
+   static const uint16_t packet_headers[] = {
+      [MESA_SHADER_VERTEX] = 43,
+      [MESA_SHADER_TESS_CTRL] = 44,
+      [MESA_SHADER_TESS_EVAL] = 45,
+      [MESA_SHADER_GEOMETRY] = 46,
+      [MESA_SHADER_FRAGMENT] = 47,
+   };
+
+   /* Ivybridge requires a workaround flush before VS packets. */
+   if (GEN_GEN == 7 && !GEN_IS_HASWELL &&
+       stage_state->stage == MESA_SHADER_VERTEX) {
+      gen7_emit_vs_workaround_flush(brw);
+   }
+   /* The VS packet layout is reused; only the sub-opcode differs per stage. */
+   brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
+      ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
+      ptr.PointertoVSSamplerState = stage_state->sampler_offset;
+   }
+#endif
+}
+
+UNUSED static bool
+has_component(mesa_format format, int i)
+{
+   /* Color formats are asked whether they carry channel i; depth and
+    * stencil formats have exactly one component, at index 0.
+    */
+   return _mesa_is_format_color_format(format)
+      ? _mesa_format_has_color_component(format, i) : i == 0;
+}
+
+/**
+ * Build and pack SAMPLER_BORDER_COLOR_STATE for one sampler into batch state.
+ */
+static void
+genX(upload_default_color)(struct brw_context *brw,
+                           const struct gl_sampler_object *sampler,
+                           mesa_format format, GLenum base_format,
+                           bool is_integer_format, bool is_stencil_sampling,
+                           uint32_t *sdc_offset)
+{
+   union gl_color_union color;
+   /* Swizzle the GL border color according to the texture's base format. */
+   switch (base_format) {
+   case GL_DEPTH_COMPONENT:
+      /* GL specifies that the border color for depth textures is taken
+       * from the R channel, while the hardware uses A.  Spam R into all
+       * the channels for safety.
+       */
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[0];
+      color.ui[2] = sampler->BorderColor.ui[0];
+      color.ui[3] = sampler->BorderColor.ui[0];
+      break;
+   case GL_ALPHA:
+      color.ui[0] = 0u;
+      color.ui[1] = 0u;
+      color.ui[2] = 0u;
+      color.ui[3] = sampler->BorderColor.ui[3];
+      break;
+   case GL_INTENSITY:
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[0];
+      color.ui[2] = sampler->BorderColor.ui[0];
+      color.ui[3] = sampler->BorderColor.ui[0];
+      break;
+   case GL_LUMINANCE:
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[0];
+      color.ui[2] = sampler->BorderColor.ui[0];
+      color.ui[3] = float_as_int(1.0);
+      break;
+   case GL_LUMINANCE_ALPHA:
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[0];
+      color.ui[2] = sampler->BorderColor.ui[0];
+      color.ui[3] = sampler->BorderColor.ui[3];
+      break;
+   default:
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[1];
+      color.ui[2] = sampler->BorderColor.ui[2];
+      color.ui[3] = sampler->BorderColor.ui[3];
+      break;
+   }
+
+   /* In some cases we use an RGBA surface format for GL RGB textures,
+    * where we've initialized the A channel to 1.0.  We also have to set
+    * the border color alpha to 1.0 in that case.
+    */
+   if (base_format == GL_RGB)
+      color.ui[3] = float_as_int(1.0);
+   /* Allocation alignment: 32 by default, 64 on Gen8+, 512 for HSW integer. */
+   int alignment = 32;
+   if (brw->gen >= 8) {
+      alignment = 64;
+   } else if (brw->is_haswell && (is_integer_format || is_stencil_sampling)) {
+      alignment = 512;
+   }
+
+   uint32_t *sdc = brw_state_batch(
+      brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
+      alignment, sdc_offset);
+
+   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
+   /* Per-channel store helpers, applied to R/G/B/A by BORDER_COLOR_ATTR. */
+#define ASSIGN(dst, src) \
+   do {                  \
+      dst = src;         \
+   } while (0)
+
+#define ASSIGNu16(dst, src) \
+   do {                     \
+      dst = (uint16_t)src;  \
+   } while (0)
+
+#define ASSIGNu8(dst, src) \
+   do {                    \
+      dst = (uint8_t)src;  \
+   } while (0)
+
+#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
+   macro(state.BorderColor ## _color_type ## Red, src[0]);   \
+   macro(state.BorderColor ## _color_type ## Green, src[1]);   \
+   macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
+   macro(state.BorderColor ## _color_type ## Alpha, src[3]);
+
+#if GEN_GEN >= 8
+   /* On Broadwell, the border color is represented as four 32-bit floats,
+    * integers, or unsigned values, interpreted according to the surface
+    * format.  This matches the sampler->BorderColor union exactly, so the
+    * four channels are copied across unchanged.
+    */
+   BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
+#elif GEN_IS_HASWELL
+   if (is_integer_format || is_stencil_sampling) {
+      bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
+      const int bits_per_channel =
+         _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
+
+      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
+       * "If any color channel is missing from the surface format,
+       *  corresponding border color should be programmed as zero and if
+       *  alpha channel is missing, corresponding Alpha border color should
+       *  be programmed as 1."
+       */
+      unsigned c[4] = { 0, 0, 0, 1 };
+      for (int i = 0; i < 4; i++) {
+         if (has_component(format, i))
+            c[i] = color.ui[i];
+      }
+
+      switch (bits_per_channel) {
+      case 8:
+         /* Copy RGBA in order. */
+         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
+         break;
+      case 10:
+         /* R10G10B10A2_UINT is treated like a 16-bit format. */
+      case 16:
+         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
+         break;
+      case 32:
+         if (base_format == GL_RG) {
+            /* Careful inspection of the tables reveals that for RG32 formats,
+             * the green channel needs to go where blue normally belongs.
+             */
+            state.BorderColor32bitRed = c[0];
+            state.BorderColor32bitBlue = c[1];
+            state.BorderColor32bitAlpha = 1;
+         } else {
+            /* Copy RGBA in order. */
+            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
+         }
+         break;
+      default:
+         assert(!"Invalid number of bits per channel in integer format.");
+         break;
+      }
+   } else {
+      BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
+   }
+#elif GEN_GEN == 5 || GEN_GEN == 6
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
+
+#define MESA_FLOAT_TO_HALF(dst, src) \
+   dst = _mesa_float_to_half(src);
+
+   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
+
+#undef MESA_FLOAT_TO_HALF
+   /* The Snorm8 channels are the high bytes of the Snorm16 values. */
+   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
+   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
+   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
+   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
+
+   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
+#elif GEN_GEN == 4
+   BORDER_COLOR_ATTR(ASSIGN, , color.f);
+#else
+   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
+#endif
+   /* NOTE(review): ASSIGNu16 and ASSIGNu8 stay defined past this point. */
+#undef ASSIGN
+#undef BORDER_COLOR_ATTR
+
+   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
+}
+
+/**
+ * Convert a GL texture wrap enum into the hardware TCM_* address mode.
+ * using_nearest only matters for GL_CLAMP before Gen8.  Unrecognized
+ * enums are treated like GL_REPEAT.
+ */
+static uint32_t
+translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
+{
+   switch (wrap) {
+   case GL_MIRRORED_REPEAT:
+      return TCM_MIRROR;
+   case GL_MIRROR_CLAMP_TO_EDGE:
+      return TCM_MIRROR_ONCE;
+   case GL_CLAMP_TO_BORDER:
+      return TCM_CLAMP_BORDER;
+   case GL_CLAMP_TO_EDGE:
+      return TCM_CLAMP;
+   case GL_CLAMP:
+#if GEN_GEN >= 8
+      /* GL_CLAMP is the odd mode that clamps coordinates to [0.0, 1.0],
+       * so linear filtering of coordinates outside that range yields
+       * half edge texel value and half border color.
+       *
+       * Gen8+ has a native address mode for it.
+       */
+      return TCM_HALF_BORDER;
+#else
+      /* Gen4-7.5 clamp the coordinates in the fragment shader instead;
+       * selecting clamp-to-border here then gives the desired result.
+       * Nearest filtering uses plain clamp (to edge), because clamping
+       * to 1.0 would pick the border color rather than the edge texel.
+       */
+      return using_nearest ? TCM_CLAMP : TCM_CLAMP_BORDER;
+#endif
+   case GL_REPEAT:
+   default:
+      return TCM_WRAP;
+   }
+}
+
+/**
+ * Report whether an address control mode can sample the border color.
+ */
+static bool
+wrap_mode_needs_border_color(unsigned wrap_mode)
+{
+   bool needs_border = wrap_mode == TCM_CLAMP_BORDER;
+#if GEN_GEN >= 8
+   /* Gen8+ additionally has the half-border mode. */
+   needs_border = needs_border || wrap_mode == TCM_HALF_BORDER;
+#endif
+   return needs_border;
+}
+
+/**
+ * Fill and pack one SAMPLER_STATE entry from a GL sampler object and the
+ * texture it samples; a border color is uploaded only if a wrap mode needs it.
+ */
+static void
+genX(update_sampler_state)(struct brw_context *brw,
+                           GLenum target, bool tex_cube_map_seamless,
+                           GLfloat tex_unit_lod_bias,
+                           mesa_format format, GLenum base_format,
+                           const struct gl_texture_object *texObj,
+                           const struct gl_sampler_object *sampler,
+                           uint32_t *sampler_state,
+                           uint32_t batch_offset_for_sampler_state)
+{
+   struct GENX(SAMPLER_STATE) samp_st = { 0 };
+
+   /* Select min and mip filters. */
+   switch (sampler->MinFilter) {
+   case GL_NEAREST:
+      samp_st.MinModeFilter = MAPFILTER_NEAREST;
+      samp_st.MipModeFilter = MIPFILTER_NONE;
+      break;
+   case GL_LINEAR:
+      samp_st.MinModeFilter = MAPFILTER_LINEAR;
+      samp_st.MipModeFilter = MIPFILTER_NONE;
+      break;
+   case GL_NEAREST_MIPMAP_NEAREST:
+      samp_st.MinModeFilter = MAPFILTER_NEAREST;
+      samp_st.MipModeFilter = MIPFILTER_NEAREST;
+      break;
+   case GL_LINEAR_MIPMAP_NEAREST:
+      samp_st.MinModeFilter = MAPFILTER_LINEAR;
+      samp_st.MipModeFilter = MIPFILTER_NEAREST;
+      break;
+   case GL_NEAREST_MIPMAP_LINEAR:
+      samp_st.MinModeFilter = MAPFILTER_NEAREST;
+      samp_st.MipModeFilter = MIPFILTER_LINEAR;
+      break;
+   case GL_LINEAR_MIPMAP_LINEAR:
+      samp_st.MinModeFilter = MAPFILTER_LINEAR;
+      samp_st.MipModeFilter = MIPFILTER_LINEAR;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   /* Select mag filter. */
+   samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ?
+      MAPFILTER_LINEAR : MAPFILTER_NEAREST;
+
+   /* Enable anisotropic filtering if desired. */
+   samp_st.MaximumAnisotropy = RATIO21;
+
+   if (sampler->MaxAnisotropy > 1.0f) {
+      if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
+         samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
+      if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
+         samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
+
+      if (sampler->MaxAnisotropy > 2.0f) {
+         samp_st.MaximumAnisotropy =
+            MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161);
+      }
+   }
+
+   /* Set address rounding bits if not using nearest filtering. */
+   if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
+      samp_st.UAddressMinFilterRoundingEnable = true;
+      samp_st.VAddressMinFilterRoundingEnable = true;
+      samp_st.RAddressMinFilterRoundingEnable = true;
+   }
+
+   if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
+      samp_st.UAddressMagFilterRoundingEnable = true;
+      samp_st.VAddressMagFilterRoundingEnable = true;
+      samp_st.RAddressMagFilterRoundingEnable = true;
+   }
+
+   bool either_nearest =
+      sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
+   unsigned wrap_s = translate_wrap_mode(brw, sampler->WrapS, either_nearest);
+   unsigned wrap_t = translate_wrap_mode(brw, sampler->WrapT, either_nearest);
+   unsigned wrap_r = translate_wrap_mode(brw, sampler->WrapR, either_nearest);
+
+   if (target == GL_TEXTURE_CUBE_MAP ||
+       target == GL_TEXTURE_CUBE_MAP_ARRAY) {
+      /* Cube maps must use the same wrap mode for all three coordinate
+       * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
+       *
+       * Ivybridge and Baytrail seem to have problems with CUBE mode and
+       * integer formats.  Fall back to CLAMP for now.
+       */
+      if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
+          !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) {
+         wrap_s = TCM_CUBE;
+         wrap_t = TCM_CUBE;
+         wrap_r = TCM_CUBE;
+      } else {
+         wrap_s = TCM_CLAMP;
+         wrap_t = TCM_CLAMP;
+         wrap_r = TCM_CLAMP;
+      }
+   } else if (target == GL_TEXTURE_1D) {
+      /* There's a bug in 1D texture sampling - it actually pays
+       * attention to the wrap_t value, though it should not.
+       * Override the wrap_t value here to GL_REPEAT to keep
+       * any nonexistent border pixels from floating in.
+       */
+      wrap_t = TCM_WRAP;
+   }
+
+   samp_st.TCXAddressControlMode = wrap_s;
+   samp_st.TCYAddressControlMode = wrap_t;
+   samp_st.TCZAddressControlMode = wrap_r;
+
+   samp_st.ShadowFunction =
+      sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
+      intel_translate_shadow_compare_func(sampler->CompareFunc) : 0;
+
+#if GEN_GEN >= 7
+   /* Choose the anisotropic filtering algorithm. */
+   samp_st.AnisotropicAlgorithm =
+      samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
+      EWAApproximation : LEGACY;
+#endif
+
+#if GEN_GEN >= 6
+   samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
+#endif
+
+   const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13;
+   samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod);
+   samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod);
+   samp_st.TextureLODBias =
+      CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15);
+
+#if GEN_GEN == 6
+   samp_st.BaseMipLevel =
+      CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod);
+   samp_st.MinandMagStateNotEqual =
+      samp_st.MinModeFilter != samp_st.MagModeFilter;
+#endif
+
+   /* Upload the border color if necessary.  If not, just point it at
+    * offset 0 (the start of the batch) - the color should be ignored,
+    * but that address won't fault in case something reads it anyway.
+    */
+   uint32_t border_color_offset = 0;
+   if (wrap_mode_needs_border_color(wrap_s) ||
+       wrap_mode_needs_border_color(wrap_t) ||
+       wrap_mode_needs_border_color(wrap_r)) {
+      genX(upload_default_color)(brw, sampler, format, base_format,
+                                 texObj->_IsIntegerFormat,
+                                 texObj->StencilSampling,
+                                 &border_color_offset);
+   }
+
+   samp_st.BorderColorPointer = border_color_offset;
+
+   if (GEN_GEN < 6) {
+      samp_st.BorderColorPointer += brw->batch.bo->offset64; /* reloc */
+      brw_emit_reloc(&brw->batch, batch_offset_for_sampler_state + 8,
+                     brw->batch.bo, border_color_offset,
+                     I915_GEM_DOMAIN_SAMPLER, 0);
+   }
+
+#if GEN_GEN >= 8
+   samp_st.LODPreClampMode = CLAMP_MODE_OGL;
+#else
+   samp_st.LODPreClampEnable = true;
+#endif
+
+   GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
+}
+
+/* Program the SAMPLER_STATE slot for the texture bound to a texture unit. */
+static void
+update_sampler_state(struct brw_context *brw,
+                     int unit,
+                     uint32_t *sampler_state,
+                     uint32_t batch_offset_for_sampler_state)
+{
+   struct gl_context *ctx = &brw->ctx;
+   const struct gl_texture_unit *tex_unit = &ctx->Texture.Unit[unit];
+   const struct gl_texture_object *tex_obj = tex_unit->_Current;
+   const struct gl_sampler_object *samp = _mesa_get_samplerobj(ctx, unit);
+
+   /* Buffer textures don't use samplers at all; nothing to program. */
+   if (tex_obj->Target != GL_TEXTURE_BUFFER) {
+      struct gl_texture_image *base_image =
+         tex_obj->Image[0][tex_obj->BaseLevel];
+      genX(update_sampler_state)(brw, tex_obj->Target,
+                                 ctx->Texture.CubeMapSeamless, tex_unit->LodBias,
+                                 base_image->TexFormat, base_image->_BaseFormat,
+                                 tex_obj, samp,
+                                 sampler_state, batch_offset_for_sampler_state);
+   }
+}
+
+/**
+ * Obtain space via brw_state_batch() and fill a stage's SAMPLER_STATE table.
+ * Slots whose SamplersUsed bit is clear or whose unit has no current texture
+ * are skipped and left unwritten.
+ */
+static void
+genX(upload_sampler_state_table)(struct brw_context *brw,
+                                 struct gl_program *prog,
+                                 struct brw_stage_state *stage_state)
+{
+   struct gl_context *ctx = &brw->ctx;
+   uint32_t sampler_count = stage_state->sampler_count;
+   GLbitfield SamplersUsed = prog->SamplersUsed;
+
+   if (sampler_count == 0)
+      return;
+
+   /* SAMPLER_STATE is 4 DWords on all platforms. */
+   const int dwords = GENX(SAMPLER_STATE_length);
+   const int size_in_bytes = dwords * sizeof(uint32_t);
+   uint32_t *sampler_state = brw_state_batch(brw,
+                                             sampler_count * size_in_bytes,
+                                             32, &stage_state->sampler_offset);
+   uint32_t batch_offset_for_sampler_state = stage_state->sampler_offset;
+
+   for (unsigned s = 0; s < sampler_count; s++) {
+      /* Shift an unsigned one: 1 << 31 would overflow a signed int. */
+      if (SamplersUsed & (1u << s)) {
+         const unsigned unit = prog->SamplerUnits[s];
+         if (ctx->Texture.Unit[unit]._Current) {
+            update_sampler_state(brw, unit, sampler_state,
+                                 batch_offset_for_sampler_state);
+         }
+      }
+      sampler_state += dwords;
+      batch_offset_for_sampler_state += size_in_bytes;
+   }
+
+   if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
+      /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
+      genX(emit_sampler_state_pointers_xs)(brw, stage_state);
+   } else {
+      /* Flag that the sampler state table pointer has changed; later atoms
+       * will handle it.
+       */
+      brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
+   }
+}
+
+/* Upload the fragment shader stage's sampler state table. */
+static void
+genX(upload_fs_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   genX(upload_sampler_state_table)(brw,
+                                    (struct gl_program *) brw->fragment_program,
+                                    &brw->wm.base);
+}
+
+static const struct brw_tracked_state genX(fs_samplers) = {
+   .emit = genX(upload_fs_samplers),
+   .dirty = {
+      .brw = BRW_NEW_BATCH | BRW_NEW_BLORP | BRW_NEW_FRAGMENT_PROGRAM,
+      .mesa = _NEW_TEXTURE,
+   },
+};
+
+/* Upload the vertex shader stage's sampler state table. */
+static void
+genX(upload_vs_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_VERTEX_PROGRAM */
+   genX(upload_sampler_state_table)(brw,
+                                    (struct gl_program *) brw->vertex_program,
+                                    &brw->vs.base);
+}
+
+static const struct brw_tracked_state genX(vs_samplers) = {
+   .emit = genX(upload_vs_samplers),
+   .dirty = {
+      .brw = BRW_NEW_BATCH | BRW_NEW_BLORP | BRW_NEW_VERTEX_PROGRAM,
+      .mesa = _NEW_TEXTURE,
+   },
+};
+
+#if GEN_GEN >= 6
+/* Upload the geometry shader stage's sampler state table; nothing is done
+ * when brw->geometry_program is NULL. */
+static void
+genX(upload_gs_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_GEOMETRY_PROGRAM */
+   struct gl_program *prog = (struct gl_program *) brw->geometry_program;
+
+   if (prog)
+      genX(upload_sampler_state_table)(brw, prog, &brw->gs.base);
+}
+
+static const struct brw_tracked_state genX(gs_samplers) = {
+   .emit = genX(upload_gs_samplers),
+   .dirty = {
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_GEOMETRY_PROGRAM,
+      .mesa = _NEW_TEXTURE,
+   },
+};
+#endif
+
+#if GEN_GEN >= 7
+/* Upload the TCS sampler state table, if a tess control program is set. */
+static void
+genX(upload_tcs_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_TESS_PROGRAMS */
+   struct gl_program *prog = (struct gl_program *) brw->tess_ctrl_program;
+
+   if (prog)
+      genX(upload_sampler_state_table)(brw, prog, &brw->tcs.base);
+}
+
+static const struct brw_tracked_state genX(tcs_samplers) = {
+   .emit = genX(upload_tcs_samplers),
+   .dirty = {
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_TESS_PROGRAMS,
+      .mesa = _NEW_TEXTURE,
+   },
+};
+#endif
+
+#if GEN_GEN >= 7
+/* Upload the TES sampler state table, if a tess eval program is set. */
+static void
+genX(upload_tes_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_TESS_PROGRAMS */
+   struct gl_program *prog = (struct gl_program *) brw->tess_eval_program;
+
+   if (prog)
+      genX(upload_sampler_state_table)(brw, prog, &brw->tes.base);
+}
+
+static const struct brw_tracked_state genX(tes_samplers) = {
+   .emit = genX(upload_tes_samplers),
+   .dirty = {
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_TESS_PROGRAMS,
+      .mesa = _NEW_TEXTURE,
+   },
+};
+#endif
+
+#if GEN_GEN >= 7
+/* Upload the compute shader sampler state table, if a CS program is set. */
+static void
+genX(upload_cs_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_COMPUTE_PROGRAM */
+   struct gl_program *prog = (struct gl_program *) brw->compute_program;
+
+   if (prog)
+      genX(upload_sampler_state_table)(brw, prog, &brw->cs.base);
+}
+
+const struct brw_tracked_state genX(cs_samplers) = {
+   .emit = genX(upload_cs_samplers),
+   .dirty = {
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_COMPUTE_PROGRAM,
+      .mesa = _NEW_TEXTURE,
+   },
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN <= 5
+
+/* Emit 3DSTATE_CONSTANT_COLOR with the context's unclamped blend color. */
+static void
+genX(upload_blend_constant_color)(struct brw_context *brw)
+{
+   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
+      blend_cc.BlendConstantColorRed = brw->ctx.Color.BlendColorUnclamped[0];
+      blend_cc.BlendConstantColorGreen = brw->ctx.Color.BlendColorUnclamped[1];
+      blend_cc.BlendConstantColorBlue = brw->ctx.Color.BlendColorUnclamped[2];
+      blend_cc.BlendConstantColorAlpha = brw->ctx.Color.BlendColorUnclamped[3];
+   }
+}
+
+static const struct brw_tracked_state genX(blend_constant_color) = {
+   .emit = genX(upload_blend_constant_color),
+   .dirty = {
+      .brw = BRW_NEW_CONTEXT |
+             BRW_NEW_BLORP,
+      .mesa = _NEW_COLOR,
+   },
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+void
+genX(init_atoms)(struct brw_context *brw)
+{
+#if GEN_GEN < 6 /* render atom list for pre-Gen6 builds */
+   static const struct brw_tracked_state *render_atoms[] =
+   {
+      /* Once all the programs are done, we know how large urb entry
+       * sizes need to be and can decide if we need to change the urb
+       * layout.
+       */
+      &brw_curbe_offsets,
+      &brw_recalculate_urb_fence,
+
+      &genX(cc_vp),
+      &genX(color_calc_state),
+
+      /* Surface state setup.  Must come before the VS/WM unit.  The binding
+       * table upload must be last.
+       */
+      &brw_vs_pull_constants,
+      &brw_wm_pull_constants,
+      &brw_renderbuffer_surfaces,
+      &brw_renderbuffer_read_surfaces,
+      &brw_texture_surfaces,
+      &brw_vs_binding_table,
+      &brw_wm_binding_table,
+
+      &genX(fs_samplers),
+      &genX(vs_samplers),
+
+      /* These set up state for brw_psp_urb_cbs */
+      &genX(wm_state),
+      &genX(sf_clip_viewport),
+      &genX(sf_state),
+      &genX(vs_state), /* always required, enabled or not */
+      &genX(clip_state),
+      &genX(gs_state),
+
+      /* Command packets:
+       */
+      &brw_invariant_state,
+
+      &brw_binding_table_pointers,
+      &genX(blend_constant_color),
+
+      &brw_depthbuffer,
+
+      &genX(polygon_stipple),
+      &genX(polygon_stipple_offset),
+
+      &genX(line_stipple),
+
+      &brw_psp_urb_cbs,
+
+      &genX(drawing_rect),
+      &brw_indices, /* must come before brw_vertices */
+      &genX(index_buffer),
+      &genX(vertices),
+
+      &brw_constant_buffer
+   };
+#elif GEN_GEN == 6 /* render atom list for Gen6 builds */
+   static const struct brw_tracked_state *render_atoms[] =
+   {
+      &genX(sf_clip_viewport),
+
+      /* Command packets: */
+
+      &genX(cc_vp),
+
+      &gen6_urb,
+      &genX(blend_state),		/* must do before cc unit */
+      &genX(color_calc_state),	/* must do before cc unit */
+      &genX(depth_stencil_state),	/* must do before cc unit */
+
+      &genX(vs_push_constants), /* Before vs_state */
+      &genX(gs_push_constants), /* Before gs_state */
+      &genX(wm_push_constants), /* Before wm_state */
+
+      /* Surface state setup.  Must come before the VS/WM unit.  The binding
+       * table upload must be last.
+       */
+      &brw_vs_pull_constants,
+      &brw_vs_ubo_surfaces,
+      &brw_gs_pull_constants,
+      &brw_gs_ubo_surfaces,
+      &brw_wm_pull_constants,
+      &brw_wm_ubo_surfaces,
+      &gen6_renderbuffer_surfaces,
+      &brw_renderbuffer_read_surfaces,
+      &brw_texture_surfaces,
+      &gen6_sol_surface,
+      &brw_vs_binding_table,
+      &gen6_gs_binding_table,
+      &brw_wm_binding_table,
+
+      &genX(fs_samplers),
+      &genX(vs_samplers),
+      &genX(gs_samplers),
+      &gen6_sampler_state,
+      &genX(multisample_state),
+
+      &genX(vs_state),
+      &genX(gs_state),
+      &genX(clip_state),
+      &genX(sf_state),
+      &genX(wm_state),
+
+      &genX(scissor_state),
+
+      &gen6_binding_table_pointers,
+
+      &brw_depthbuffer,
+
+      &genX(polygon_stipple),
+      &genX(polygon_stipple_offset),
+
+      &genX(line_stipple),
+
+      &genX(drawing_rect),
+
+      &brw_indices, /* must come before brw_vertices */
+      &genX(index_buffer),
+      &genX(vertices),
+   };
+#elif GEN_GEN == 7 /* render atom list for Gen7 builds (incl. Haswell) */
+   static const struct brw_tracked_state *render_atoms[] =
+   {
+      /* Command packets: */
+
+      &genX(cc_vp),
+      &genX(sf_clip_viewport),
+
+      &gen7_l3_state,
+      &gen7_push_constant_space,
+      &gen7_urb,
+      &genX(blend_state),		/* must do before cc unit */
+      &genX(color_calc_state),	/* must do before cc unit */
+      &genX(depth_stencil_state),	/* must do before cc unit */
+
+      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
+      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
+      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
+      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
+      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
+
+      &genX(vs_push_constants), /* Before vs_state */
+      &genX(tcs_push_constants),
+      &genX(tes_push_constants),
+      &genX(gs_push_constants), /* Before gs_state */
+      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
+
+      /* Surface state setup.  Must come before the VS/WM unit.  The binding
+       * table upload must be last.
+       */
+      &brw_vs_pull_constants,
+      &brw_vs_ubo_surfaces,
+      &brw_vs_abo_surfaces,
+      &brw_tcs_pull_constants,
+      &brw_tcs_ubo_surfaces,
+      &brw_tcs_abo_surfaces,
+      &brw_tes_pull_constants,
+      &brw_tes_ubo_surfaces,
+      &brw_tes_abo_surfaces,
+      &brw_gs_pull_constants,
+      &brw_gs_ubo_surfaces,
+      &brw_gs_abo_surfaces,
+      &brw_wm_pull_constants,
+      &brw_wm_ubo_surfaces,
+      &brw_wm_abo_surfaces,
+      &gen6_renderbuffer_surfaces,
+      &brw_renderbuffer_read_surfaces,
+      &brw_texture_surfaces,
+
+      &genX(push_constant_packets),
+
+      &brw_vs_binding_table,
+      &brw_tcs_binding_table,
+      &brw_tes_binding_table,
+      &brw_gs_binding_table,
+      &brw_wm_binding_table,
+
+      &genX(fs_samplers),
+      &genX(vs_samplers),
+      &genX(tcs_samplers),
+      &genX(tes_samplers),
+      &genX(gs_samplers),
+      &genX(multisample_state),
+
+      &genX(vs_state),
+      &genX(hs_state),
+      &genX(te_state),
+      &genX(ds_state),
+      &genX(gs_state),
+      &genX(sol_state),
+      &genX(clip_state),
+      &genX(sbe_state),
+      &genX(sf_state),
+      &genX(wm_state),
+      &genX(ps_state),
+
+      &genX(scissor_state),
+
+      &gen7_depthbuffer,
+
+      &genX(polygon_stipple),
+      &genX(polygon_stipple_offset),
+
+      &genX(line_stipple),
+
+      &genX(drawing_rect),
+
+      &brw_indices, /* must come before brw_vertices */
+      &genX(index_buffer),
+      &genX(vertices),
+
+#if GEN_IS_HASWELL
+      &genX(cut_index),
+#endif
+   };
+#elif GEN_GEN >= 8 /* render atom list for Gen8+ builds */
+   static const struct brw_tracked_state *render_atoms[] =
+   {
+      &genX(cc_vp),
+      &genX(sf_clip_viewport),
+
+      &gen7_l3_state,
+      &gen7_push_constant_space,
+      &gen7_urb,
+      &genX(blend_state),
+      &genX(color_calc_state),
+
+      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
+      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
+      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
+      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
+      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
+
+      &genX(vs_push_constants), /* Before vs_state */
+      &genX(tcs_push_constants),
+      &genX(tes_push_constants),
+      &genX(gs_push_constants), /* Before gs_state */
+      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
+
+      /* Surface state setup.  Must come before the VS/WM unit.  The binding
+       * table upload must be last.
+       */
+      &brw_vs_pull_constants,
+      &brw_vs_ubo_surfaces,
+      &brw_vs_abo_surfaces,
+      &brw_tcs_pull_constants,
+      &brw_tcs_ubo_surfaces,
+      &brw_tcs_abo_surfaces,
+      &brw_tes_pull_constants,
+      &brw_tes_ubo_surfaces,
+      &brw_tes_abo_surfaces,
+      &brw_gs_pull_constants,
+      &brw_gs_ubo_surfaces,
+      &brw_gs_abo_surfaces,
+      &brw_wm_pull_constants,
+      &brw_wm_ubo_surfaces,
+      &brw_wm_abo_surfaces,
+      &gen6_renderbuffer_surfaces,
+      &brw_renderbuffer_read_surfaces,
+      &brw_texture_surfaces,
+
+      &genX(push_constant_packets),
+
+      &brw_vs_binding_table,
+      &brw_tcs_binding_table,
+      &brw_tes_binding_table,
+      &brw_gs_binding_table,
+      &brw_wm_binding_table,
+
+      &genX(fs_samplers),
+      &genX(vs_samplers),
+      &genX(tcs_samplers),
+      &genX(tes_samplers),
+      &genX(gs_samplers),
+      &genX(multisample_state),
+
+      &genX(vs_state),
+      &genX(hs_state),
+      &genX(te_state),
+      &genX(ds_state),
+      &genX(gs_state),
+      &genX(sol_state),
+      &genX(clip_state),
+      &genX(raster_state),
+      &genX(sbe_state),
+      &genX(sf_state),
+      &genX(ps_blend),
+      &genX(ps_extra),
+      &genX(ps_state),
+      &genX(depth_stencil_state),
+      &genX(wm_state),
+
+      &genX(scissor_state),
+
+      &gen7_depthbuffer,
+
+      &genX(polygon_stipple),
+      &genX(polygon_stipple_offset),
+
+      &genX(line_stipple),
+
+      &genX(drawing_rect),
+
+      &genX(vf_topology),
+
+      &brw_indices, /* must come before brw_vertices */
+      &genX(index_buffer),
+      &genX(vertices),
+
+      &genX(cut_index),
+      &gen8_pma_fix,
+   };
+#endif
+
+   STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
+   brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
+                           render_atoms, ARRAY_SIZE(render_atoms));
+
+#if GEN_GEN >= 7 /* the compute atom list is only built for Gen7+ */
+   static const struct brw_tracked_state *compute_atoms[] =
+   {
+      &gen7_l3_state,
+      &brw_cs_image_surfaces,
+      &gen7_cs_push_constants,
+      &brw_cs_pull_constants,
+      &brw_cs_ubo_surfaces,
+      &brw_cs_abo_surfaces,
+      &brw_cs_texture_surfaces,
+      &brw_cs_work_groups_surface,
+      &genX(cs_samplers),
+      &genX(cs_state),
+   };
+
+   STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
+   brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
+                           compute_atoms, ARRAY_SIZE(compute_atoms));
+
+   brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
+#endif
+}
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 6e4b55c..e2f208a 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -78,8 +78,8 @@
    batch->exec_array_size = 100;
    batch->exec_bos =
       malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
-   batch->exec_objects =
-      malloc(batch->exec_array_size * sizeof(batch->exec_objects[0]));
+   batch->validation_list =
+      malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
 
    if (INTEL_DEBUG & DEBUG_BATCH) {
       batch->state_batch_sizes =
@@ -100,8 +100,7 @@
 
    batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
    if (has_llc) {
-      brw_bo_map(NULL, batch->bo, true);
-      batch->map = batch->bo->virtual;
+      batch->map = brw_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);
    }
    batch->map_next = batch->map;
 
@@ -163,7 +162,7 @@
    }
    free(batch->relocs);
    free(batch->exec_bos);
-   free(batch->exec_objects);
+   free(batch->validation_list);
 
    brw_bo_unreference(batch->last_bo);
    brw_bo_unreference(batch->bo);
@@ -240,16 +239,16 @@
    if (batch->ring != RENDER_RING)
       return;
 
-   int ret = brw_bo_map(brw, batch->bo, false);
-   if (ret != 0) {
+   void *map = brw_bo_map(brw, batch->bo, MAP_READ);
+   if (map == NULL) {
       fprintf(stderr,
-	      "WARNING: failed to map batchbuffer (%s), "
-	      "dumping uploaded data instead.\n", strerror(ret));
+	      "WARNING: failed to map batchbuffer, "
+	      "dumping uploaded data instead.\n");
    }
 
-   uint32_t *data = batch->bo->virtual ? batch->bo->virtual : batch->map;
+   uint32_t *data = map ? map : batch->map;
    uint32_t *end = data + USED_BATCH(*batch);
-   uint32_t gtt_offset = batch->bo->virtual ? batch->bo->offset64 : 0;
+   uint32_t gtt_offset = map ? batch->bo->offset64 : 0;
    int length;
 
    bool color = INTEL_DEBUG & DEBUG_COLOR;
@@ -275,7 +274,27 @@
 
       switch (gen_group_get_opcode(inst) >> 16) {
       case _3DSTATE_PIPELINED_POINTERS:
-         /* TODO: Decode Gen4-5 pipelined pointers */
+         /* Note: these Gen4-5 pointers are full relocations rather than
+          * offsets from the start of the batch.  So we need to subtract
+          * gtt_offset (the start of the batch) to obtain an offset we
+          * can add to the map and get at the data.
+          */
+         decode_struct(brw, spec, "VS_STATE", data, gtt_offset,
+                       (p[1] & ~0x1fu) - gtt_offset, color);
+         if (p[2] & 1) {
+            decode_struct(brw, spec, "GS_STATE", data, gtt_offset,
+                          (p[2] & ~0x1fu) - gtt_offset, color);
+         }
+         if (p[3] & 1) {
+            decode_struct(brw, spec, "CLIP_STATE", data, gtt_offset,
+                          (p[3] & ~0x1fu) - gtt_offset, color);
+         }
+         decode_struct(brw, spec, "SF_STATE", data, gtt_offset,
+                       (p[4] & ~0x1fu) - gtt_offset, color);
+         decode_struct(brw, spec, "WM_STATE", data, gtt_offset,
+                       (p[5] & ~0x1fu) - gtt_offset, color);
+         decode_struct(brw, spec, "COLOR_CALC_STATE", data, gtt_offset,
+                       (p[6] & ~0x3fu) - gtt_offset, color);
          break;
       case _3DSTATE_BINDING_TABLE_POINTERS_VS:
       case _3DSTATE_BINDING_TABLE_POINTERS_HS:
@@ -350,7 +369,7 @@
       }
    }
 
-   if (ret == 0) {
+   if (map != NULL) {
       brw_bo_unmap(batch->bo);
    }
 }
@@ -389,7 +408,7 @@
 
    brw->ctx.NewDriverState |= BRW_NEW_BATCH;
 
-   brw->ib.type = -1;
+   brw->ib.index_size = -1;
 
    /* We need to periodically reap the shader time results, because rollover
     * happens every few seconds.  We also want to see results every once in a
@@ -446,12 +465,6 @@
                                           PIPE_CONTROL_CS_STALL);
       }
    }
-
-   /* Mark that the current program cache BO has been used by the GPU.
-    * It will be reallocated if we need to put new programs in for the
-    * next batch.
-    */
-   brw->cache.bo_used_by_gpu = true;
 }
 
 static void
@@ -478,7 +491,7 @@
             /* Pass NULL rather than brw so we avoid perf_debug warnings;
              * stalling is common and expected here...
              */
-            brw_bo_wait_rendering(NULL, brw->throttle_batch[1]);
+            brw_bo_wait_rendering(brw->throttle_batch[1]);
          }
          brw_bo_unreference(brw->throttle_batch[1]);
       }
@@ -513,13 +526,13 @@
       batch->exec_bos =
          realloc(batch->exec_bos,
                  batch->exec_array_size * sizeof(batch->exec_bos[0]));
-      batch->exec_objects =
-         realloc(batch->exec_objects,
-                 batch->exec_array_size * sizeof(batch->exec_objects[0]));
+      batch->validation_list =
+         realloc(batch->validation_list,
+                 batch->exec_array_size * sizeof(batch->validation_list[0]));
    }
 
    struct drm_i915_gem_exec_object2 *validation_entry =
-      &batch->exec_objects[batch->exec_count];
+      &batch->validation_list[batch->exec_count];
    validation_entry->handle = bo->gem_handle;
    if (bo == batch->bo) {
       validation_entry->relocation_count = batch->reloc_count;
@@ -530,7 +543,7 @@
    }
    validation_entry->alignment = bo->align;
    validation_entry->offset = bo->offset64;
-   validation_entry->flags = 0;
+   validation_entry->flags = bo->kflags;
    validation_entry->rsvd1 = 0;
    validation_entry->rsvd2 = 0;
 
@@ -549,7 +562,7 @@
            int flags)
 {
    struct drm_i915_gem_execbuffer2 execbuf = {
-      .buffers_ptr = (uintptr_t) batch->exec_objects,
+      .buffers_ptr = (uintptr_t) batch->validation_list,
       .buffer_count = batch->exec_count,
       .batch_start_offset = 0,
       .batch_len = used,
@@ -580,10 +593,10 @@
       bo->idle = false;
 
       /* Update brw_bo::offset64 */
-      if (batch->exec_objects[i].offset != bo->offset64) {
+      if (batch->validation_list[i].offset != bo->offset64) {
          DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
-             bo->gem_handle, bo->offset64, batch->exec_objects[i].offset);
-         bo->offset64 = batch->exec_objects[i].offset;
+             bo->gem_handle, bo->offset64, batch->validation_list[i].offset);
+         bo->offset64 = batch->validation_list[i].offset;
       }
    }
 
@@ -704,7 +717,7 @@
 
    if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
       fprintf(stderr, "waiting for idle\n");
-      brw_bo_wait_rendering(brw, brw->batch.bo);
+      brw_bo_wait_rendering(brw->batch.bo);
    }
 
    /* Start a new batch buffer. */
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index 2783ba3..f1a5c1f 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -74,21 +74,6 @@
                         struct brw_bo *target, uint32_t target_offset,
                         uint32_t read_domains, uint32_t write_domain);
 
-static inline uint32_t
-brw_program_reloc(struct brw_context *brw, uint32_t state_offset,
-		  uint32_t prog_offset)
-{
-   if (brw->gen >= 5) {
-      /* Using state base address. */
-      return prog_offset;
-   }
-
-   brw_emit_reloc(&brw->batch, state_offset, brw->cache.bo, prog_offset,
-                  I915_GEM_DOMAIN_INSTRUCTION, 0);
-
-   return brw->cache.bo->offset64 + prog_offset;
-}
-
 #define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))
 
 static inline uint32_t float_as_int(float f)
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index b1e1eaa..a9cdf48 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -131,8 +131,8 @@
 static int
 blt_pitch(struct intel_mipmap_tree *mt)
 {
-   int pitch = mt->pitch;
-   if (mt->tiling)
+   int pitch = mt->surf.row_pitch;
+   if (mt->surf.tiling != ISL_TILING_LINEAR)
       pitch /= 4;
    return pitch;
 }
@@ -171,13 +171,12 @@
                              uint32_t *x_offset_el,
                              uint32_t *y_offset_el)
 {
-   enum isl_tiling tiling = intel_miptree_get_isl_tiling(mt);
-   isl_tiling_get_intratile_offset_el(&brw->isl_dev,
-                                      tiling, mt->cpp, mt->pitch,
+   isl_tiling_get_intratile_offset_el(mt->surf.tiling,
+                                      mt->cpp * 8, mt->surf.row_pitch,
                                       total_x_offset_el, total_y_offset_el,
                                       base_address_offset,
                                       x_offset_el, y_offset_el);
-   if (tiling == ISL_TILING_LINEAR) {
+   if (mt->surf.tiling == ISL_TILING_LINEAR) {
       /* From the Broadwell PRM docs for XY_SRC_COPY_BLT::SourceBaseAddress:
        *
        *    "Base address of the destination surface: X=0, Y=0. Lower 32bits
@@ -188,7 +187,6 @@
        * The offsets we get from ISL in the tiled case are already aligned.
        * In the linear case, we need to do some of our own aligning.
        */
-      assert(mt->pitch % 64 == 0);
       uint32_t delta = *base_address_offset & 63;
       assert(delta % mt->cpp == 0);
       *base_address_offset -= delta;
@@ -252,12 +250,13 @@
 
          if (!intelEmitCopyBlit(brw,
                                 src_mt->cpp,
-                                reverse ? -src_mt->pitch : src_mt->pitch,
+                                reverse ? -src_mt->surf.row_pitch :
+                                           src_mt->surf.row_pitch,
                                 src_mt->bo, src_mt->offset + src_offset,
-                                src_mt->tiling,
-                                dst_mt->pitch,
+                                src_mt->surf.tiling,
+                                dst_mt->surf.row_pitch,
                                 dst_mt->bo, dst_mt->offset + dst_offset,
-                                dst_mt->tiling,
+                                dst_mt->surf.tiling,
                                 src_tile_x, src_tile_y,
                                 dst_tile_x, dst_tile_y,
                                 chunk_w, chunk_h,
@@ -299,7 +298,7 @@
                    GLenum logicop)
 {
    /* The blitter doesn't understand multisampling at all. */
-   if (src_mt->num_samples > 0 || dst_mt->num_samples > 0)
+   if (src_mt->surf.samples > 1 || dst_mt->surf.samples > 1)
       return false;
 
    /* No sRGB decode or encode is done by the hardware blitter, which is
@@ -325,17 +324,18 @@
    /* The blitter has no idea about HiZ or fast color clears, so we need to
     * resolve the miptrees before we do anything.
     */
-   intel_miptree_slice_resolve_depth(brw, src_mt, src_level, src_slice);
-   intel_miptree_slice_resolve_depth(brw, dst_mt, dst_level, dst_slice);
-   intel_miptree_resolve_color(brw, src_mt, src_level, src_slice, 1, 0);
-   intel_miptree_resolve_color(brw, dst_mt, dst_level, dst_slice, 1, 0);
-   intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level, dst_slice);
+   intel_miptree_access_raw(brw, src_mt, src_level, src_slice, false);
+   intel_miptree_access_raw(brw, dst_mt, dst_level, dst_slice, true);
 
-   if (src_flip)
-      src_y = minify(src_mt->physical_height0, src_level - src_mt->first_level) - src_y - height;
-
-   if (dst_flip)
-      dst_y = minify(dst_mt->physical_height0, dst_level - dst_mt->first_level) - dst_y - height;
+   if (src_flip) {
+      const unsigned h0 = src_mt->surf.phys_level0_sa.height;
+      src_y = minify(h0, src_level - src_mt->first_level) - src_y - height;
+   }
+ 
+   if (dst_flip) {
+      const unsigned h0 = dst_mt->surf.phys_level0_sa.height;
+      dst_y = minify(h0, dst_level - dst_mt->first_level) - dst_y - height;
+   }
 
    uint32_t src_image_x, src_image_y, dst_image_x, dst_image_y;
    intel_miptree_get_image_offset(src_mt, src_level, src_slice,
@@ -375,7 +375,7 @@
                    uint32_t src_width, uint32_t src_height)
 {
    /* The blitter doesn't understand multisampling at all. */
-   if (src_mt->num_samples > 0 || dst_mt->num_samples > 0)
+   if (src_mt->surf.samples > 1 || dst_mt->surf.samples > 1)
       return false;
 
    if (src_mt->format == MESA_FORMAT_S_UINT8)
@@ -384,11 +384,8 @@
    /* The blitter has no idea about HiZ or fast color clears, so we need to
     * resolve the miptrees before we do anything.
     */
-   intel_miptree_slice_resolve_depth(brw, src_mt, src_level, src_slice);
-   intel_miptree_slice_resolve_depth(brw, dst_mt, dst_level, dst_slice);
-   intel_miptree_resolve_color(brw, src_mt, src_level, src_slice, 1, 0);
-   intel_miptree_resolve_color(brw, dst_mt, dst_level, dst_slice, 1, 0);
-   intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level, dst_slice);
+   intel_miptree_access_raw(brw, src_mt, src_level, src_slice, false);
+   intel_miptree_access_raw(brw, dst_mt, dst_level, dst_slice, true);
 
    uint32_t src_image_x, src_image_y;
    intel_miptree_get_image_offset(src_mt, src_level, src_slice,
@@ -406,10 +403,13 @@
        */
       assert(src_x % bw == 0);
       assert(src_y % bh == 0);
+
       assert(src_width % bw == 0 ||
-             src_x + src_width == minify(src_mt->logical_width0, src_level));
+             src_x + src_width ==
+             minify(src_mt->surf.logical_level0_px.width, src_level));
       assert(src_height % bh == 0 ||
-             src_y + src_height == minify(src_mt->logical_height0, src_level));
+             src_y + src_height ==
+             minify(src_mt->surf.logical_level0_px.height, src_level));
 
       src_x /= (int)bw;
       src_y /= (int)bh;
@@ -442,10 +442,11 @@
 }
 
 static bool
-alignment_valid(struct brw_context *brw, unsigned offset, uint32_t tiling)
+alignment_valid(struct brw_context *brw, unsigned offset,
+                enum isl_tiling tiling)
 {
    /* Tiled buffers must be page-aligned (4K). */
-   if (tiling != I915_TILING_NONE)
+   if (tiling != ISL_TILING_LINEAR)
       return (offset & 4095) == 0;
 
    /* On Gen8+, linear buffers must be cacheline-aligned. */
@@ -456,7 +457,8 @@
 }
 
 static uint32_t
-xy_blit_cmd(uint32_t src_tiling, uint32_t dst_tiling, uint32_t cpp)
+xy_blit_cmd(enum isl_tiling src_tiling, enum isl_tiling dst_tiling,
+            uint32_t cpp)
 {
    uint32_t CMD = 0;
 
@@ -473,10 +475,10 @@
       unreachable("not reached");
    }
 
-   if (dst_tiling != I915_TILING_NONE)
+   if (dst_tiling != ISL_TILING_LINEAR)
       CMD |= XY_DST_TILED;
 
-   if (src_tiling != I915_TILING_NONE)
+   if (src_tiling != ISL_TILING_LINEAR)
       CMD |= XY_SRC_TILED;
 
    return CMD;
@@ -490,11 +492,11 @@
 		  int32_t src_pitch,
 		  struct brw_bo *src_buffer,
 		  GLuint src_offset,
-		  uint32_t src_tiling,
+		  enum isl_tiling src_tiling,
 		  int32_t dst_pitch,
 		  struct brw_bo *dst_buffer,
 		  GLuint dst_offset,
-		  uint32_t dst_tiling,
+		  enum isl_tiling dst_tiling,
 		  GLshort src_x, GLshort src_y,
 		  GLshort dst_x, GLshort dst_y,
 		  GLshort w, GLshort h,
@@ -503,8 +505,8 @@
    GLuint CMD, BR13;
    int dst_y2 = dst_y + h;
    int dst_x2 = dst_x + w;
-   bool dst_y_tiled = dst_tiling == I915_TILING_Y;
-   bool src_y_tiled = src_tiling == I915_TILING_Y;
+   bool dst_y_tiled = dst_tiling == ISL_TILING_Y0;
+   bool src_y_tiled = src_tiling == ISL_TILING_Y0;
    uint32_t src_tile_w, src_tile_h;
    uint32_t dst_tile_w, dst_tile_h;
 
@@ -535,8 +537,8 @@
     * (X direction width of the Tile). This is ensured while allocating the
     * buffer object.
     */
-   assert(src_tiling == I915_TILING_NONE || (src_pitch % src_tile_w) == 0);
-   assert(dst_tiling == I915_TILING_NONE || (dst_pitch % dst_tile_w) == 0);
+   assert(src_tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0);
+   assert(dst_tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0);
 
    /* For big formats (such as floating point), do the copy using 16 or
     * 32bpp and multiply the coordinates.
@@ -576,10 +578,10 @@
    /* For tiled source and destination, pitch value should be specified
     * as a number of Dwords.
     */
-   if (dst_tiling != I915_TILING_NONE)
+   if (dst_tiling != ISL_TILING_LINEAR)
       dst_pitch /= 4;
 
-   if (src_tiling != I915_TILING_NONE)
+   if (src_tiling != ISL_TILING_LINEAR)
       src_pitch /= 4;
 
    if (dst_y2 <= dst_y || dst_x2 <= dst_x)
@@ -629,7 +631,7 @@
 				  GLshort dst_pitch,
 				  struct brw_bo *dst_buffer,
 				  GLuint dst_offset,
-				  uint32_t dst_tiling,
+				  enum isl_tiling dst_tiling,
 				  GLshort x, GLshort y,
 				  GLshort w, GLshort h,
 				  GLenum logic_op)
@@ -637,10 +639,10 @@
    int dwords = ALIGN(src_size, 8) / 4;
    uint32_t opcode, br13, blit_cmd;
 
-   if (dst_tiling != I915_TILING_NONE) {
+   if (dst_tiling != ISL_TILING_LINEAR) {
       if (dst_offset & 4095)
 	 return false;
-      if (dst_tiling == I915_TILING_Y)
+      if (dst_tiling == ISL_TILING_Y0)
 	 return false;
    }
 
@@ -661,7 +663,7 @@
    opcode = XY_SETUP_BLT_CMD;
    if (cpp == 4)
       opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-   if (dst_tiling != I915_TILING_NONE) {
+   if (dst_tiling != ISL_TILING_LINEAR) {
       opcode |= XY_DST_TILED;
       dst_pitch /= 4;
    }
@@ -670,7 +672,7 @@
    br13 |= br13_for_cpp(cpp);
 
    blit_cmd = XY_TEXT_IMMEDIATE_BLIT_CMD | XY_TEXT_BYTE_PACKED; /* packing? */
-   if (dst_tiling != I915_TILING_NONE)
+   if (dst_tiling != ISL_TILING_LINEAR)
       blit_cmd |= XY_DST_TILED;
 
    BEGIN_BATCH_BLT(xy_setup_blt_length + 3);
@@ -737,8 +739,10 @@
       assert(dst_x + pitch < 1 << 15);
 
       ok = intelEmitCopyBlit(brw, 1,
-                             pitch, src_bo, src_offset - src_x, I915_TILING_NONE,
-                             pitch, dst_bo, dst_offset - dst_x, I915_TILING_NONE,
+                             pitch, src_bo, src_offset - src_x,
+                             ISL_TILING_LINEAR,
+                             pitch, dst_bo, dst_offset - dst_x,
+                             ISL_TILING_LINEAR,
                              src_x, 0, /* src x/y */
                              dst_x, 0, /* dst x/y */
                              MIN2(size, pitch), height, /* w, h */
@@ -775,7 +779,7 @@
    uint32_t BR13, CMD;
    int pitch, cpp;
 
-   pitch = mt->pitch;
+   pitch = mt->surf.row_pitch;
    cpp = mt->cpp;
 
    DBG("%s dst:buf(%p)/%d %d,%d sz:%dx%d\n",
@@ -785,7 +789,7 @@
    CMD = XY_COLOR_BLT_CMD;
    CMD |= XY_BLT_WRITE_ALPHA;
 
-   if (mt->tiling != I915_TILING_NONE) {
+   if (mt->surf.tiling != ISL_TILING_LINEAR) {
       CMD |= XY_DST_TILED;
       pitch /= 4;
    }
@@ -796,7 +800,7 @@
       intel_batchbuffer_flush(brw);
 
    unsigned length = brw->gen >= 8 ? 7 : 6;
-   bool dst_y_tiled = mt->tiling == I915_TILING_Y;
+   const bool dst_y_tiled = mt->surf.tiling == ISL_TILING_Y0;
 
    /* We need to split the blit into chunks that each fit within the blitter's
     * restrictions.  We can't use a chunk size of 32768 because we need to
@@ -826,11 +830,11 @@
          if (brw->gen >= 8) {
             OUT_RELOC64(mt->bo,
                         I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                        offset);
+                        mt->offset + offset);
          } else {
             OUT_RELOC(mt->bo,
                       I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                      offset);
+                      mt->offset + offset);
          }
          OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
          ADVANCE_BATCH_TILED(dst_y_tiled, false);
diff --git a/src/mesa/drivers/dri/i965/intel_blit.h b/src/mesa/drivers/dri/i965/intel_blit.h
index 2604417..0a0c57d 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.h
+++ b/src/mesa/drivers/dri/i965/intel_blit.h
@@ -34,11 +34,11 @@
                   int32_t src_pitch,
                   struct brw_bo *src_buffer,
                   GLuint src_offset,
-                  uint32_t src_tiling,
+                  enum isl_tiling src_tiling,
                   int32_t dst_pitch,
                   struct brw_bo *dst_buffer,
                   GLuint dst_offset,
-                  uint32_t dst_tiling,
+                  enum isl_tiling dst_tiling,
                   GLshort srcx, GLshort srcy,
                   GLshort dstx, GLshort dsty,
                   GLshort w, GLshort h,
@@ -73,7 +73,7 @@
 				  GLshort dst_pitch,
 				  struct brw_bo *dst_buffer,
 				  GLuint dst_offset,
-				  uint32_t dst_tiling,
+                                  enum isl_tiling dst_tiling,
 				  GLshort x, GLshort y,
 				  GLshort w, GLshort h,
 				  GLenum logic_op);
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index 9f1f793..e932bad 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -54,13 +54,46 @@
    intel_obj->gpu_active_end = 0;
 }
 
+static void
+mark_buffer_valid_data(struct intel_buffer_object *intel_obj,
+                       uint32_t offset, uint32_t size)
+{
+   intel_obj->valid_data_start = MIN2(intel_obj->valid_data_start, offset);
+   intel_obj->valid_data_end = MAX2(intel_obj->valid_data_end, offset + size);
+}
+
+static void
+mark_buffer_invalid(struct intel_buffer_object *intel_obj)
+{
+   intel_obj->valid_data_start = ~0;
+   intel_obj->valid_data_end = 0;
+}
+
 /** Allocates a new brw_bo to store the data for the buffer object. */
 static void
 alloc_buffer_object(struct brw_context *brw,
                     struct intel_buffer_object *intel_obj)
 {
-   intel_obj->buffer = brw_bo_alloc(brw->bufmgr, "bufferobj",
-					  intel_obj->Base.Size, 64);
+   const struct gl_context *ctx = &brw->ctx;
+
+   uint64_t size = intel_obj->Base.Size;
+   if (ctx->Const.RobustAccess) {
+      /* Pad out buffer objects with an extra 2kB (half a page).
+       *
+       * When pushing UBOs, we need to safeguard against 3DSTATE_CONSTANT_*
+       * reading out of bounds memory.  The application might bind a UBO that's
+       * smaller than what the program expects.  Ideally, we'd bind an extra
+       * push buffer containing zeros, but we have a limited number of those,
+       * so it's not always viable.  Our only safe option is to pad all buffer
+       * objects by the maximum push data length, so that it will never read
+       * past the end of a BO.
+       *
+       * This is unfortunate, but it should result in at most 1 extra page,
+       * which probably isn't too terrible.
+       */
+      size += 64 * 32; /* max read length of 64 256-bit units */
+   }
+   intel_obj->buffer = brw_bo_alloc(brw->bufmgr, "bufferobj", size, 64);
 
    /* the buffer might be bound as a uniform buffer, need to update it
     */
@@ -74,6 +107,7 @@
       brw->ctx.NewDriverState |= BRW_NEW_ATOMIC_BUFFER;
 
    mark_buffer_inactive(intel_obj);
+   mark_buffer_invalid(intel_obj);
 }
 
 static void
@@ -99,6 +133,7 @@
    struct intel_buffer_object *obj = CALLOC_STRUCT(intel_buffer_object);
    if (!obj) {
       _mesa_error_no_memory(__func__);
+      return NULL;
    }
 
    _mesa_initialize_buffer_object(ctx, &obj->Base, name);
@@ -172,8 +207,10 @@
       if (!intel_obj->buffer)
          return false;
 
-      if (data != NULL)
+      if (data != NULL) {
 	 brw_bo_subdata(intel_obj->buffer, 0, size, data);
+         mark_buffer_valid_data(intel_obj, 0, size);
+      }
    }
 
    return true;
@@ -215,18 +252,18 @@
     * up with blitting all the time, at the cost of bandwidth)
     */
    if (offset + size <= intel_obj->gpu_active_start ||
-       intel_obj->gpu_active_end <= offset) {
-      if (brw->has_llc) {
-         brw_bo_map_unsynchronized(brw, intel_obj->buffer);
-         memcpy(intel_obj->buffer->virtual + offset, data, size);
-         brw_bo_unmap(intel_obj->buffer);
+       intel_obj->gpu_active_end <= offset ||
+       offset + size <= intel_obj->valid_data_start ||
+       intel_obj->valid_data_end <= offset) {
+      void *map = brw_bo_map(brw, intel_obj->buffer, MAP_WRITE | MAP_ASYNC);
+      memcpy(map + offset, data, size);
+      brw_bo_unmap(intel_obj->buffer);
 
-         if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
-            intel_obj->prefer_stall_to_blit = true;
-         return;
-      } else {
-         perf_debug("BufferSubData could be unsynchronized, but !LLC doesn't support it yet\n");
-      }
+      if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
+         intel_obj->prefer_stall_to_blit = true;
+
+      mark_buffer_valid_data(intel_obj, offset, size);
+      return;
    }
 
    busy =
@@ -234,17 +271,21 @@
       brw_batch_references(&brw->batch, intel_obj->buffer);
 
    if (busy) {
-      if (size == intel_obj->Base.Size) {
+      if (size == intel_obj->Base.Size ||
+          (intel_obj->valid_data_start >= offset &&
+           intel_obj->valid_data_end <= offset + size)) {
 	 /* Replace the current busy bo so the subdata doesn't stall. */
 	 brw_bo_unreference(intel_obj->buffer);
 	 alloc_buffer_object(brw, intel_obj);
       } else if (!intel_obj->prefer_stall_to_blit) {
          perf_debug("Using a blit copy to avoid stalling on "
                     "glBufferSubData(%ld, %ld) (%ldkb) to a busy "
-                    "(%d-%d) buffer object.\n",
+                    "(%d-%d) / valid (%d-%d) buffer object.\n",
                     (long)offset, (long)offset + size, (long)(size/1024),
                     intel_obj->gpu_active_start,
-                    intel_obj->gpu_active_end);
+                    intel_obj->gpu_active_end,
+                    intel_obj->valid_data_start,
+                    intel_obj->valid_data_end);
 	 struct brw_bo *temp_bo =
 	    brw_bo_alloc(brw->bufmgr, "subdata temp", size, 64);
 
@@ -256,6 +297,7 @@
 				size);
 
 	 brw_bo_unreference(temp_bo);
+         mark_buffer_valid_data(intel_obj, offset, size);
          return;
       } else {
          perf_debug("Stalling on glBufferSubData(%ld, %ld) (%ldkb) to a busy "
@@ -270,6 +312,7 @@
 
    brw_bo_subdata(intel_obj->buffer, offset, size, data);
    mark_buffer_inactive(intel_obj);
+   mark_buffer_valid_data(intel_obj, offset, size);
 }
 
 
@@ -293,7 +336,16 @@
    if (brw_batch_references(&brw->batch, intel_obj->buffer)) {
       intel_batchbuffer_flush(brw);
    }
-   brw_bo_get_subdata(intel_obj->buffer, offset, size, data);
+
+   void *map = brw_bo_map(brw, intel_obj->buffer, MAP_READ);
+
+   if (unlikely(!map)) {
+      _mesa_error_no_memory(__func__);
+      return;
+   }
+
+   memcpy(data, map + offset, size);
+   brw_bo_unmap(intel_obj->buffer);
 
    mark_buffer_inactive(intel_obj);
 }
@@ -328,6 +380,13 @@
 
    assert(intel_obj);
 
+   STATIC_ASSERT(GL_MAP_UNSYNCHRONIZED_BIT == MAP_ASYNC);
+   STATIC_ASSERT(GL_MAP_WRITE_BIT == MAP_WRITE);
+   STATIC_ASSERT(GL_MAP_READ_BIT == MAP_READ);
+   STATIC_ASSERT(GL_MAP_PERSISTENT_BIT == MAP_PERSISTENT);
+   STATIC_ASSERT(GL_MAP_COHERENT_BIT == MAP_COHERENT);
+   assert((access & MAP_INTERNAL_MASK) == 0);
+
    /* _mesa_MapBufferRange (GL entrypoint) sets these, but the vbo module also
     * internally uses our functions directly.
     */
@@ -365,6 +424,9 @@
       }
    }
 
+   if (access & MAP_WRITE)
+      mark_buffer_valid_data(intel_obj, offset, length);
+
    /* If the user is mapping a range of an active buffer object but
     * doesn't require the current contents of that range, make a new
     * BO, and we'll copy what they put in there out at unmap or
@@ -388,33 +450,17 @@
                                                           length +
                                                           intel_obj->map_extra[index],
                                                           alignment);
-      if (brw->has_llc) {
-         brw_bo_map(brw, intel_obj->range_map_bo[index],
-                    (access & GL_MAP_WRITE_BIT) != 0);
-      } else {
-         brw_bo_map_gtt(brw, intel_obj->range_map_bo[index]);
-      }
-      obj->Mappings[index].Pointer =
-         intel_obj->range_map_bo[index]->virtual + intel_obj->map_extra[index];
+      void *map = brw_bo_map(brw, intel_obj->range_map_bo[index], access);
+      obj->Mappings[index].Pointer = map + intel_obj->map_extra[index];
       return obj->Mappings[index].Pointer;
    }
 
-   if (access & GL_MAP_UNSYNCHRONIZED_BIT) {
-      if (!brw->has_llc && brw->perf_debug &&
-          brw_bo_busy(intel_obj->buffer)) {
-         perf_debug("MapBufferRange with GL_MAP_UNSYNCHRONIZED_BIT stalling (it's actually synchronized on non-LLC platforms)\n");
-      }
-      brw_bo_map_unsynchronized(brw, intel_obj->buffer);
-   } else if (!brw->has_llc && (!(access & GL_MAP_READ_BIT) ||
-                              (access & GL_MAP_PERSISTENT_BIT))) {
-      brw_bo_map_gtt(brw, intel_obj->buffer);
-      mark_buffer_inactive(intel_obj);
-   } else {
-      brw_bo_map(brw, intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
+   void *map = brw_bo_map(brw, intel_obj->buffer, access);
+   if (!(access & GL_MAP_UNSYNCHRONIZED_BIT)) {
       mark_buffer_inactive(intel_obj);
    }
 
-   obj->Mappings[index].Pointer = intel_obj->buffer->virtual + offset;
+   obj->Mappings[index].Pointer = map + offset;
    return obj->Mappings[index].Pointer;
 }
 
@@ -543,7 +589,7 @@
 struct brw_bo *
 intel_bufferobj_buffer(struct brw_context *brw,
                        struct intel_buffer_object *intel_obj,
-                       uint32_t offset, uint32_t size)
+                       uint32_t offset, uint32_t size, bool write)
 {
    /* This is needed so that things like transform feedback and texture buffer
     * objects that need a BO but don't want to check that they exist for
@@ -554,6 +600,10 @@
 
    mark_buffer_gpu_usage(intel_obj, offset, size);
 
+   /* If writing, (conservatively) mark this section as having valid data. */
+   if (write)
+      mark_buffer_valid_data(intel_obj, offset, size);
+
    return intel_obj->buffer;
 }
 
@@ -579,8 +629,8 @@
    if (size == 0)
       return;
 
-   dst_bo = intel_bufferobj_buffer(brw, intel_dst, write_offset, size);
-   src_bo = intel_bufferobj_buffer(brw, intel_src, read_offset, size);
+   dst_bo = intel_bufferobj_buffer(brw, intel_dst, write_offset, size, true);
+   src_bo = intel_bufferobj_buffer(brw, intel_src, read_offset, size, false);
 
    intel_emit_linear_blit(brw,
 			  dst_bo, write_offset,
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.h b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
index a1bfaa9..3b46d5c 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.h
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
@@ -70,6 +70,17 @@
    uint32_t gpu_active_start;
    uint32_t gpu_active_end;
 
+   /** @{
+    * Tracking for what range of the BO may contain valid data.
+    *
+    * Users may create a large buffer object and only fill part of it
+    * with valid data.  This is a conservative estimate of what part
+    * of the buffer contains valid data that we have to preserve.
+    */
+   uint32_t valid_data_start;
+   uint32_t valid_data_end;
+   /** @} */
+
    /**
     * If we've avoided stalls/blits using the active tracking, flag the buffer
     * for (occasional) stalling in the future to avoid getting stuck in a
@@ -83,9 +94,10 @@
 /* Get the bm buffer associated with a GL bufferobject:
  */
 struct brw_bo *intel_bufferobj_buffer(struct brw_context *brw,
-                                     struct intel_buffer_object *obj,
-                                     uint32_t offset,
-                                     uint32_t size);
+                                      struct intel_buffer_object *obj,
+                                      uint32_t offset,
+                                      uint32_t size,
+                                      bool write);
 
 void intel_upload_data(struct brw_context *brw,
                        const void *data,
diff --git a/src/mesa/drivers/dri/i965/intel_copy_image.c b/src/mesa/drivers/dri/i965/intel_copy_image.c
index 85585c7..2ebd8d7 100644
--- a/src/mesa/drivers/dri/i965/intel_copy_image.c
+++ b/src/mesa/drivers/dri/i965/intel_copy_image.c
@@ -35,82 +35,6 @@
 #include "drivers/common/meta.h"
 
 static void
-copy_image_with_memcpy(struct brw_context *brw,
-                       struct intel_mipmap_tree *src_mt, int src_level,
-                       int src_x, int src_y, int src_z,
-                       struct intel_mipmap_tree *dst_mt, int dst_level,
-                       int dst_x, int dst_y, int dst_z,
-                       int src_width, int src_height)
-{
-   bool same_slice;
-   void *mapped, *src_mapped, *dst_mapped;
-   ptrdiff_t src_stride, dst_stride, cpp;
-   int map_x1, map_y1, map_x2, map_y2;
-   GLuint src_bw, src_bh;
-
-   cpp = _mesa_get_format_bytes(src_mt->format);
-   _mesa_get_format_block_size(src_mt->format, &src_bw, &src_bh);
-
-   assert(src_width % src_bw == 0);
-   assert(src_height % src_bh == 0);
-   assert(src_x % src_bw == 0);
-   assert(src_y % src_bh == 0);
-
-   /* If we are on the same miptree, same level, and same slice, then
-    * intel_miptree_map won't let us map it twice.  We have to do things a
-    * bit differently.  In particular, we do a single map large enough for
-    * both portions and in read-write mode.
-    */
-   same_slice = src_mt == dst_mt && src_level == dst_level && src_z == dst_z;
-
-   if (same_slice) {
-      assert(dst_x % src_bw == 0);
-      assert(dst_y % src_bh == 0);
-
-      map_x1 = MIN2(src_x, dst_x);
-      map_y1 = MIN2(src_y, dst_y);
-      map_x2 = MAX2(src_x, dst_x) + src_width;
-      map_y2 = MAX2(src_y, dst_y) + src_height;
-
-      intel_miptree_map(brw, src_mt, src_level, src_z,
-                        map_x1, map_y1, map_x2 - map_x1, map_y2 - map_y1,
-                        GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
-                        &mapped, &src_stride);
-
-      dst_stride = src_stride;
-
-      /* Set the offsets here so we don't have to think about while looping */
-      src_mapped = mapped + ((src_y - map_y1) / src_bh) * src_stride +
-                            ((src_x - map_x1) / src_bw) * cpp;
-      dst_mapped = mapped + ((dst_y - map_y1) / src_bh) * dst_stride +
-                            ((dst_x - map_x1) / src_bw) * cpp;
-   } else {
-      intel_miptree_map(brw, src_mt, src_level, src_z,
-                        src_x, src_y, src_width, src_height,
-                        GL_MAP_READ_BIT, &src_mapped, &src_stride);
-      intel_miptree_map(brw, dst_mt, dst_level, dst_z,
-                        dst_x, dst_y, src_width, src_height,
-                        GL_MAP_WRITE_BIT, &dst_mapped, &dst_stride);
-   }
-
-   src_width /= (int)src_bw;
-   src_height /= (int)src_bh;
-
-   for (int i = 0; i < src_height; ++i) {
-      memcpy(dst_mapped, src_mapped, src_width * cpp);
-      src_mapped += src_stride;
-      dst_mapped += dst_stride;
-   }
-
-   if (same_slice) {
-      intel_miptree_unmap(brw, src_mt, src_level, src_z);
-   } else {
-      intel_miptree_unmap(brw, dst_mt, dst_level, dst_z);
-      intel_miptree_unmap(brw, src_mt, src_level, src_z);
-   }
-}
-
-static void
 copy_miptrees(struct brw_context *brw,
               struct intel_mipmap_tree *src_mt,
               int src_x, int src_y, int src_z, unsigned src_level,
@@ -118,55 +42,25 @@
               int dst_x, int dst_y, int dst_z, unsigned dst_level,
               int src_width, int src_height)
 {
-   unsigned bw, bh;
-
-   if (brw->gen >= 6) {
-      brw_blorp_copy_miptrees(brw,
-                              src_mt, src_level, src_z,
-                              dst_mt, dst_level, dst_z,
-                              src_x, src_y, dst_x, dst_y,
-                              src_width, src_height);
-      return;
+   if (brw->gen <= 5) {
+      /* On gen4-5, try BLT first.
+       *
+       * Gen4-5 have a single ring for both 3D and BLT operations, so there are
+       * no inter-ring synchronization issues like on Gen6+.  BLT is apparently
+       * faster than using the 3D pipeline.  Original Gen4 also has to rebase
+       * and copy miptree slices in order to render to unaligned locations.
+       */
+      if (intel_miptree_copy(brw, src_mt, src_level, src_z, src_x, src_y,
+                             dst_mt, dst_level, dst_z, dst_x, dst_y,
+                             src_width, src_height))
+         return;
    }
 
-   /* We are now going to try and copy the texture using the blitter.  If
-    * that fails, we will fall back mapping the texture and using memcpy.
-    * In either case, we need to do a full resolve.
-    */
-   intel_miptree_all_slices_resolve_hiz(brw, src_mt);
-   intel_miptree_all_slices_resolve_depth(brw, src_mt);
-   intel_miptree_all_slices_resolve_color(brw, src_mt, 0);
-
-   intel_miptree_all_slices_resolve_hiz(brw, dst_mt);
-   intel_miptree_all_slices_resolve_depth(brw, dst_mt);
-   intel_miptree_all_slices_resolve_color(brw, dst_mt, 0);
-
-   _mesa_get_format_block_size(src_mt->format, &bw, &bh);
-
-   /* It's legal to have a WxH that's smaller than a compressed block. This
-    * happens for example when you are using a higher level LOD. For this case,
-    * we still want to copy the entire block, or else the decompression will be
-    * incorrect.
-    */
-   if (src_width < bw)
-      src_width = ALIGN_NPOT(src_width, bw);
-
-   if (src_height < bh)
-      src_height = ALIGN_NPOT(src_height, bh);
-
-   if (intel_miptree_copy(brw, src_mt, src_level, src_z, src_x, src_y,
-                          dst_mt, dst_level, dst_z, dst_x, dst_y,
-                          src_width, src_height))
-      return;
-
-   /* This is a worst-case scenario software fallback that maps the two
-    * textures and does a memcpy between them.
-    */
-   copy_image_with_memcpy(brw, src_mt, src_level,
-                          src_x, src_y, src_z,
-                          dst_mt, dst_level,
-                          dst_x, dst_y, dst_z,
-                          src_width, src_height);
+   brw_blorp_copy_miptrees(brw,
+                           src_mt, src_level, src_z,
+                           dst_mt, dst_level, dst_z,
+                           src_x, src_y, dst_x, dst_y,
+                           src_width, src_height);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 0133fa1..b91bbdc 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -65,11 +65,11 @@
    ctx->Extensions.ARB_map_buffer_range = true;
    ctx->Extensions.ARB_occlusion_query = true;
    ctx->Extensions.ARB_occlusion_query2 = true;
-   ctx->Extensions.ARB_pipeline_statistics_query = true;
    ctx->Extensions.ARB_point_sprite = true;
    ctx->Extensions.ARB_seamless_cube_map = true;
    ctx->Extensions.ARB_shader_bit_encoding = true;
    ctx->Extensions.ARB_shader_draw_parameters = true;
+   ctx->Extensions.ARB_shader_group_vote = true;
    ctx->Extensions.ARB_shader_texture_lod = true;
    ctx->Extensions.ARB_shading_language_packing = true;
    ctx->Extensions.ARB_shadow = true;
@@ -172,6 +172,7 @@
       ctx->Extensions.ARB_enhanced_layouts = true;
       ctx->Extensions.ARB_ES3_compatibility = true;
       ctx->Extensions.ARB_fragment_layer_viewport = true;
+      ctx->Extensions.ARB_pipeline_statistics_query = true;
       ctx->Extensions.ARB_sample_shading = true;
       ctx->Extensions.ARB_shading_language_420pack = true;
       ctx->Extensions.ARB_texture_buffer_object = true;
@@ -196,7 +197,6 @@
        * slightly differently when the extension is enabled.
        */
       if (ctx->API == API_OPENGL_CORE) {
-         ctx->Extensions.ARB_shader_subroutine = true;
          ctx->Extensions.ARB_shader_viewport_layer_array = true;
          ctx->Extensions.ARB_viewport_array = true;
          ctx->Extensions.AMD_vertex_shader_viewport_index = true;
@@ -271,6 +271,7 @@
 
    if (brw->gen >= 8) {
       ctx->Extensions.ARB_gpu_shader_int64 = true;
+      ctx->Extensions.ARB_shader_ballot = true; /* requires ARB_gpu_shader_int64 */
       ctx->Extensions.ARB_ES3_2_compatibility = true;
    }
 
@@ -285,6 +286,9 @@
       ctx->Extensions.ARB_post_depth_coverage = true;
    }
 
+   if (brw->is_broxton)
+      ctx->Extensions.KHR_texture_compression_astc_hdr = true;
+
    if (brw->gen >= 6)
       ctx->Extensions.INTEL_performance_query = true;
 
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 8accbe9..ca80b96 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -143,7 +143,7 @@
          irb->singlesample_mt =
             intel_miptree_create_for_renderbuffer(brw, irb->mt->format,
                                                   rb->Width, rb->Height,
-                                                  0 /*num_samples*/);
+                                                  1 /*num_samples*/);
          if (!irb->singlesample_mt)
             goto fail;
          irb->singlesample_mt_is_tmp = true;
@@ -303,7 +303,7 @@
 
    irb->mt = intel_miptree_create_for_renderbuffer(brw, rb->Format,
 						   width, height,
-                                                   rb->NumSamples);
+                                                   MAX2(rb->NumSamples, 1));
    if (!irb->mt)
       return false;
 
@@ -348,7 +348,7 @@
    }
 
    /* __DRIimage is opaque to the core so it has to be checked here */
-   if (!brw->format_supported_as_render_target[image->format]) {
+   if (!brw->mesa_format_supports_render[image->format]) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
             "glEGLImageTargetRenderbufferStorage(unsupported image format)");
       return;
@@ -362,31 +362,11 @@
     * buffer's content to the main buffer nor for invalidating the aux buffer's
     * content.
     */
-   irb->mt = intel_miptree_create_for_bo(brw,
-                                         image->bo,
-                                         image->format,
-                                         image->offset,
-                                         image->width,
-                                         image->height,
-                                         1,
-                                         image->pitch,
-                                         MIPTREE_LAYOUT_DISABLE_AUX);
+   irb->mt = intel_miptree_create_for_dri_image(brw, image, GL_TEXTURE_2D,
+                                                ISL_COLORSPACE_NONE, false);
    if (!irb->mt)
       return;
 
-   /* Adjust the miptree's upper-left coordinate.
-    *
-    * FIXME: Adjusting the miptree's layout outside of
-    * intel_miptree_create_layout() is fragile. Plumb the adjustment through
-    * intel_miptree_create_layout() and brw_tex_layout().
-    */
-   irb->mt->level[0].level_x = image->tile_x;
-   irb->mt->level[0].level_y = image->tile_y;
-   irb->mt->level[0].slice[0].x_offset = image->tile_x;
-   irb->mt->level[0].slice[0].y_offset = image->tile_y;
-   irb->mt->total_width += image->tile_x;
-   irb->mt->total_height += image->tile_y;
-
    rb->InternalFormat = image->internal_format;
    rb->Width = image->width;
    rb->Height = image->height;
@@ -431,31 +411,49 @@
 }
 
 /**
- * Create a new intel_renderbuffer which corresponds to an on-screen window,
- * not a user-created renderbuffer.
+ * Create an intel_renderbuffer for a __DRIdrawable. This function is
+ * unrelated to GL renderbuffers (that is, those created by
+ * glGenRenderbuffers).
  *
  * \param num_samples must be quantized.
  */
 struct intel_renderbuffer *
-intel_create_renderbuffer(mesa_format format, unsigned num_samples)
+intel_create_winsys_renderbuffer(struct intel_screen *screen,
+                                 mesa_format format, unsigned num_samples)
 {
-   struct intel_renderbuffer *irb;
-   struct gl_renderbuffer *rb;
-
-   irb = CALLOC_STRUCT(intel_renderbuffer);
+   struct intel_renderbuffer *irb = CALLOC_STRUCT(intel_renderbuffer);
    if (!irb)
       return NULL;
 
-   rb = &irb->Base.Base;
+   struct gl_renderbuffer *rb = &irb->Base.Base;
    irb->layer_count = 1;
 
    _mesa_init_renderbuffer(rb, 0);
    rb->ClassID = INTEL_RB_CLASS;
-   rb->_BaseFormat = _mesa_get_format_base_format(format);
-   rb->Format = format;
-   rb->InternalFormat = rb->_BaseFormat;
    rb->NumSamples = num_samples;
 
+   /* The base format and internal format must be derived from the user-visible
+    * format (that is, the gl_config's format), even if we internally
+    * choose a different format for the renderbuffer. Otherwise, rendering may
+    * use incorrect channel write masks.
+    */
+   rb->_BaseFormat = _mesa_get_format_base_format(format);
+   rb->InternalFormat = rb->_BaseFormat;
+
+   rb->Format = format;
+   if (!screen->mesa_format_supports_render[rb->Format]) {
+      /* The glRenderbufferStorage paths in core Mesa detect if the driver
+       * does not support the user-requested format, and then search for
+       * a fallback format. The DRI code bypasses core Mesa, though. So we do
+       * the fallbacks here.
+       *
+       * We must support MESA_FORMAT_R8G8B8X8 on Android because the Android
+       * framework requires HAL_PIXEL_FORMAT_RGBX8888 winsys surfaces.
+       */
+      rb->Format = _mesa_format_fallback_rgbx_to_rgba(rb->Format);
+      assert(screen->mesa_format_supports_render[rb->Format]);
+   }
+
    /* intel-specific methods */
    rb->Delete = intel_delete_renderbuffer;
    rb->AllocStorage = intel_alloc_window_storage;
@@ -465,18 +463,19 @@
 
 /**
  * Private window-system buffers (as opposed to ones shared with the display
- * server created with intel_create_renderbuffer()) are most similar in their
+ * server created with intel_create_winsys_renderbuffer()) are most similar in their
  * handling to user-created renderbuffers, but they have a resize handler that
  * may be called at intel_update_renderbuffers() time.
  *
  * \param num_samples must be quantized.
  */
 struct intel_renderbuffer *
-intel_create_private_renderbuffer(mesa_format format, unsigned num_samples)
+intel_create_private_renderbuffer(struct intel_screen *screen,
+                                  mesa_format format, unsigned num_samples)
 {
    struct intel_renderbuffer *irb;
 
-   irb = intel_create_renderbuffer(format, num_samples);
+   irb = intel_create_winsys_renderbuffer(screen, format, num_samples);
    irb->Base.Base.AllocStorage = intel_alloc_private_renderbuffer_storage;
 
    return irb;
@@ -531,38 +530,22 @@
 
    intel_miptree_check_level_layer(mt, level, layer);
    irb->mt_level = level;
-
-   int layer_multiplier;
-   switch (mt->msaa_layout) {
-      case INTEL_MSAA_LAYOUT_UMS:
-      case INTEL_MSAA_LAYOUT_CMS:
-         layer_multiplier = MAX2(mt->num_samples, 1);
-         break;
-
-      default:
-         layer_multiplier = 1;
-   }
-
-   irb->mt_layer = layer_multiplier * layer;
+   irb->mt_layer = layer;
 
    if (!layered) {
       irb->layer_count = 1;
    } else if (mt->target != GL_TEXTURE_3D && image->TexObject->NumLayers > 0) {
       irb->layer_count = image->TexObject->NumLayers;
    } else {
-      irb->layer_count = mt->level[level].depth / layer_multiplier;
+      irb->layer_count = mt->surf.dim == ISL_SURF_DIM_3D ?
+                            minify(mt->surf.logical_level0_px.depth, level) :
+                            mt->surf.logical_level0_px.array_len;
    }
 
    intel_miptree_reference(&irb->mt, mt);
 
    intel_renderbuffer_set_draw_offset(irb);
 
-   if (intel_miptree_wants_hiz_buffer(brw, mt)) {
-      intel_miptree_alloc_hiz(brw, mt);
-      if (!mt->hiz_buf)
-	 return false;
-   }
-
    return true;
 }
 
@@ -672,14 +655,26 @@
 
    if (depth_mt && stencil_mt) {
       if (brw->gen >= 6) {
+         const unsigned d_width = depth_mt->surf.phys_level0_sa.width;
+         const unsigned d_height = depth_mt->surf.phys_level0_sa.height;
+         const unsigned d_depth = depth_mt->surf.dim == ISL_SURF_DIM_3D ?
+                                     depth_mt->surf.phys_level0_sa.depth :
+                                     depth_mt->surf.phys_level0_sa.array_len;
+
+         const unsigned s_width = stencil_mt->surf.phys_level0_sa.width;
+         const unsigned s_height = stencil_mt->surf.phys_level0_sa.height;
+         const unsigned s_depth = stencil_mt->surf.dim == ISL_SURF_DIM_3D ?
+                                     stencil_mt->surf.phys_level0_sa.depth :
+                                     stencil_mt->surf.phys_level0_sa.array_len;
+
          /* For gen >= 6, we are using the lod/minimum-array-element fields
           * and supporting layered rendering. This means that we must restrict
           * the depth & stencil attachments to match in various more retrictive
           * ways. (width, height, depth, LOD and layer)
           */
-	 if (depth_mt->physical_width0 != stencil_mt->physical_width0 ||
-             depth_mt->physical_height0 != stencil_mt->physical_height0 ||
-             depth_mt->physical_depth0 != stencil_mt->physical_depth0 ||
+	 if (d_width != s_width ||
+             d_height != s_height ||
+             d_depth != s_depth ||
              depthRb->mt_level != stencilRb->mt_level ||
 	     depthRb->mt_layer != stencilRb->mt_layer) {
 	    fbo_incomplete(fb,
@@ -882,6 +877,22 @@
    if (!_mesa_check_conditional_render(ctx))
       return;
 
+   if (brw->gen < 6) {
+      /* On gen4-5, try BLT first.
+       *
+       * Gen4-5 have a single ring for both 3D and BLT operations, so there's
+       * no inter-ring synchronization issues like on Gen6+.  It is apparently
+       * faster than using the 3D pipeline.  Original Gen4 also has to rebase
+       * and copy miptree slices in order to render to unaligned locations.
+       */
+      mask = intel_blit_framebuffer_with_blitter(ctx, readFb, drawFb,
+                                                 srcX0, srcY0, srcX1, srcY1,
+                                                 dstX0, dstY0, dstX1, dstY1,
+                                                 mask);
+      if (mask == 0x0)
+         return;
+   }
+
    mask = brw_blorp_framebuffer(brw, readFb, drawFb,
                                 srcX0, srcY0, srcX1, srcY1,
                                 dstX0, dstY0, dstX1, dstY1,
@@ -915,51 +926,6 @@
 }
 
 /**
- * Gen4-5 implementation of glBlitFrameBuffer().
- *
- * Tries BLT, Meta, then swrast.
- *
- * Gen4-5 have a single ring for both 3D and BLT operations, so there's no
- * inter-ring synchronization issues like on Gen6+.  It is apparently faster
- * than using the 3D pipeline.  Original Gen4 also has to rebase and copy
- * miptree slices in order to render to unaligned locations.
- */
-static void
-gen4_blit_framebuffer(struct gl_context *ctx,
-                      struct gl_framebuffer *readFb,
-                      struct gl_framebuffer *drawFb,
-                      GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
-                      GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                      GLbitfield mask, GLenum filter)
-{
-   /* Page 679 of OpenGL 4.4 spec says:
-    * "Added BlitFramebuffer to commands affected by conditional rendering in
-    *  section 10.10 (Bug 9562)."
-    */
-   if (!_mesa_check_conditional_render(ctx))
-      return;
-
-   mask = intel_blit_framebuffer_with_blitter(ctx, readFb, drawFb,
-                                              srcX0, srcY0, srcX1, srcY1,
-                                              dstX0, dstY0, dstX1, dstY1,
-                                              mask);
-   if (mask == 0x0)
-      return;
-
-   mask = _mesa_meta_BlitFramebuffer(ctx, readFb, drawFb,
-                                     srcX0, srcY0, srcX1, srcY1,
-                                     dstX0, dstY0, dstX1, dstY1,
-                                     mask, filter);
-   if (mask == 0x0)
-      return;
-
-   _swrast_BlitFramebuffer(ctx, readFb, drawFb,
-                           srcX0, srcY0, srcX1, srcY1,
-                           dstX0, dstY0, dstX1, dstY1,
-                           mask, filter);
-}
-
-/**
  * Does the renderbuffer have hiz enabled?
  */
 bool
@@ -968,47 +934,6 @@
    return intel_miptree_level_has_hiz(irb->mt, irb->mt_level);
 }
 
-bool
-intel_renderbuffer_resolve_hiz(struct brw_context *brw,
-			       struct intel_renderbuffer *irb)
-{
-   if (irb->mt)
-      return intel_miptree_slice_resolve_hiz(brw,
-                                             irb->mt,
-                                             irb->mt_level,
-                                             irb->mt_layer);
-
-   return false;
-}
-
-void
-intel_renderbuffer_att_set_needs_depth_resolve(struct gl_renderbuffer_attachment *att)
-{
-   struct intel_renderbuffer *irb = intel_renderbuffer(att->Renderbuffer);
-   if (irb->mt) {
-      if (att->Layered) {
-         intel_miptree_set_all_slices_need_depth_resolve(irb->mt, irb->mt_level);
-      } else {
-         intel_miptree_slice_set_needs_depth_resolve(irb->mt,
-                                                     irb->mt_level,
-                                                     irb->mt_layer);
-      }
-   }
-}
-
-bool
-intel_renderbuffer_resolve_depth(struct brw_context *brw,
-				 struct intel_renderbuffer *irb)
-{
-   if (irb->mt)
-      return intel_miptree_slice_resolve_depth(brw,
-                                               irb->mt,
-                                               irb->mt_level,
-                                               irb->mt_layer);
-
-   return false;
-}
-
 void
 intel_renderbuffer_move_to_temp(struct brw_context *brw,
                                 struct intel_renderbuffer *irb,
@@ -1019,28 +944,26 @@
    struct intel_mipmap_tree *new_mt;
    int width, height, depth;
 
-   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                           MIPTREE_LAYOUT_TILING_ANY;
-
    intel_get_image_dims(rb->TexImage, &width, &height, &depth);
 
-   new_mt = intel_miptree_create(brw, rb->TexImage->TexObject->Target,
+   assert(irb->align_wa_mt == NULL);
+   new_mt = intel_miptree_create(brw, GL_TEXTURE_2D,
                                  intel_image->base.Base.TexFormat,
-                                 intel_image->base.Base.Level,
-                                 intel_image->base.Base.Level,
-                                 width, height, depth,
-                                 irb->mt->num_samples,
-                                 layout_flags);
+                                 0, 0,
+                                 width, height, 1,
+                                 irb->mt->surf.samples,
+                                 MIPTREE_CREATE_BUSY);
 
-   if (intel_miptree_wants_hiz_buffer(brw, new_mt)) {
-      intel_miptree_alloc_hiz(brw, new_mt);
-   }
+   if (!invalidate)
+      intel_miptree_copy_slice(brw, intel_image->mt,
+                               intel_image->base.Base.Level, irb->mt_layer,
+                               new_mt, 0, 0);
 
-   intel_miptree_copy_teximage(brw, intel_image, new_mt, invalidate);
-
-   intel_miptree_reference(&irb->mt, intel_image->mt);
-   intel_renderbuffer_set_draw_offset(irb);
+   intel_miptree_reference(&irb->align_wa_mt, new_mt);
    intel_miptree_release(&new_mt);
+
+   irb->draw_x = 0;
+   irb->draw_y = 0;
 }
 
 void
@@ -1106,10 +1029,7 @@
    dd->UnmapRenderbuffer = intel_unmap_renderbuffer;
    dd->RenderTexture = intel_render_texture;
    dd->ValidateFramebuffer = intel_validate_framebuffer;
-   if (brw->gen >= 6)
-      dd->BlitFramebuffer = intel_blit_framebuffer;
-   else
-      dd->BlitFramebuffer = gen4_blit_framebuffer;
+   dd->BlitFramebuffer = intel_blit_framebuffer;
    dd->EGLImageTargetRenderbufferStorage =
       intel_image_target_renderbuffer_storage;
 
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.h b/src/mesa/drivers/dri/i965/intel_fbo.h
index 08b82e8..1e24942 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.h
+++ b/src/mesa/drivers/dri/i965/intel_fbo.h
@@ -67,6 +67,16 @@
     */
    struct intel_mipmap_tree *singlesample_mt;
 
+   /* Gen < 6 doesn't have layer specifier for render targets or depth. Driver
+    * needs to manually offset surfaces to correct level/layer. There are,
+    * however, alignment restrictions to respect as well and in some cases
+    * the only option is to use temporary single slice surface which driver
+    * copies after rendering to the full miptree.
+    *
+    * See intel_renderbuffer_move_to_temp().
+    */
+   struct intel_mipmap_tree *align_wa_mt;
+
    /**
     * \name Miptree view
     * \{
@@ -79,11 +89,6 @@
     *
     * For renderbuffers not created with glFramebufferTexture*(), mt_level and
     * mt_layer are 0.
-    *
-    * Note: for a 2D multisample array texture on Gen7+ using
-    * INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, mt_layer is the physical
-    * layer holding sample 0.  So, for example, if mt->num_samples == 4, then
-    * logical layer n corresponds to mt_layer == 4*n.
     */
    unsigned int mt_level;
    unsigned int mt_layer;
@@ -136,6 +141,14 @@
       return NULL;
 }
 
+static inline struct intel_mipmap_tree *
+intel_renderbuffer_get_mt(struct intel_renderbuffer *irb)
+{
+   if (!irb)
+      return NULL;
+
+   return (irb->align_wa_mt) ? irb->align_wa_mt : irb->mt;
+}
 
 /**
  * \brief Return the framebuffer attachment specified by attIndex.
@@ -167,10 +180,12 @@
 }
 
 extern struct intel_renderbuffer *
-intel_create_renderbuffer(mesa_format format, unsigned num_samples);
+intel_create_winsys_renderbuffer(struct intel_screen *screen,
+                                 mesa_format format, unsigned num_samples);
 
 struct intel_renderbuffer *
-intel_create_private_renderbuffer(mesa_format format, unsigned num_samples);
+intel_create_private_renderbuffer(struct intel_screen *screen,
+                                  mesa_format format, unsigned num_samples);
 
 struct gl_renderbuffer*
 intel_create_wrapped_renderbuffer(struct gl_context * ctx,
@@ -188,6 +203,12 @@
                                     uint32_t *tile_x,
                                     uint32_t *tile_y)
 {
+   if (irb->align_wa_mt) {
+      *tile_x = 0;
+      *tile_y = 0;
+      return 0;
+   }
+
    return intel_miptree_get_tile_offsets(irb->mt, irb->mt_level, irb->mt_layer,
                                          tile_x, tile_y);
 }
@@ -195,33 +216,6 @@
 bool
 intel_renderbuffer_has_hiz(struct intel_renderbuffer *irb);
 
-void
-intel_renderbuffer_att_set_needs_depth_resolve(struct gl_renderbuffer_attachment *att);
-
-
-/**
- * \brief Perform a HiZ resolve on the renderbuffer.
- *
- * It is safe to call this function on a renderbuffer without HiZ. In that
- * case, the function is a no-op.
- *
- * \return false if no resolve was needed
- */
-bool
-intel_renderbuffer_resolve_hiz(struct brw_context *brw,
-			       struct intel_renderbuffer *irb);
-
-/**
- * \brief Perform a depth resolve on the renderbuffer.
- *
- * It is safe to call this function on a renderbuffer without HiZ. In that
- * case, the function is a no-op.
- *
- * \return false if no resolve was needed
- */
-bool
-intel_renderbuffer_resolve_depth(struct brw_context *brw,
-				 struct intel_renderbuffer *irb);
 
 void intel_renderbuffer_move_to_temp(struct brw_context *brw,
                                      struct intel_renderbuffer *irb,
diff --git a/src/mesa/drivers/dri/i965/intel_image.h b/src/mesa/drivers/dri/i965/intel_image.h
index ad42691..cf06105 100644
--- a/src/mesa/drivers/dri/i965/intel_image.h
+++ b/src/mesa/drivers/dri/i965/intel_image.h
@@ -70,7 +70,7 @@
    uint32_t pitch; /**< in bytes */
    GLenum internal_format;
    uint32_t dri_format;
-   GLuint format;
+   GLuint format; /**< mesa_format or mesa_array_format */
    uint64_t modifier; /**< fb modifier (fourcc) */
    uint32_t offset;
 
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index e06ee59..34bfa8a 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -27,8 +27,8 @@
 #include <GL/internal/dri_interface.h>
 
 #include "intel_batchbuffer.h"
+#include "intel_image.h"
 #include "intel_mipmap_tree.h"
-#include "intel_resolve_map.h"
 #include "intel_tex.h"
 #include "intel_blit.h"
 #include "intel_fbo.h"
@@ -49,33 +49,28 @@
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
 static void *intel_miptree_map_raw(struct brw_context *brw,
-                                   struct intel_mipmap_tree *mt);
+                                   struct intel_mipmap_tree *mt,
+                                   GLbitfield mode);
 
 static void intel_miptree_unmap_raw(struct intel_mipmap_tree *mt);
 
 static bool
-intel_miptree_alloc_mcs(struct brw_context *brw,
-                        struct intel_mipmap_tree *mt,
-                        GLuint num_samples);
+intel_miptree_alloc_aux(struct brw_context *brw,
+                        struct intel_mipmap_tree *mt);
 
-/**
- * Determine which MSAA layout should be used by the MSAA surface being
- * created, based on the chip generation and the surface type.
- */
-static enum intel_msaa_layout
-compute_msaa_layout(struct brw_context *brw, mesa_format format,
-                    enum intel_aux_disable aux_disable)
+static bool
+is_mcs_supported(const struct brw_context *brw, mesa_format format)
 {
    /* Prior to Gen7, all MSAA surfaces used IMS layout. */
    if (brw->gen < 7)
-      return INTEL_MSAA_LAYOUT_IMS;
+      return false;
 
    /* In Gen7, IMS layout is only used for depth and stencil buffers. */
    switch (_mesa_get_format_base_format(format)) {
    case GL_DEPTH_COMPONENT:
    case GL_STENCIL_INDEX:
    case GL_DEPTH_STENCIL:
-      return INTEL_MSAA_LAYOUT_IMS;
+      return false;
    default:
       /* From the Ivy Bridge PRM, Vol4 Part1 p77 ("MCS Enable"):
        *
@@ -89,21 +84,16 @@
        * which is expensive.
        */
       if (brw->gen == 7 && _mesa_get_format_datatype(format) == GL_INT) {
-         return INTEL_MSAA_LAYOUT_UMS;
-      } else if (aux_disable & INTEL_AUX_DISABLE_MCS) {
-         /* We can't use the CMS layout because it uses an aux buffer, the MCS
-          * buffer. So fallback to UMS, which is identical to CMS without the
-          * MCS. */
-         return INTEL_MSAA_LAYOUT_UMS;
+         return false;
       } else {
-         return INTEL_MSAA_LAYOUT_CMS;
+         return true;
       }
    }
 }
 
-bool
-intel_tiling_supports_non_msrt_mcs(const struct brw_context *brw,
-                                   unsigned tiling)
+static bool
+intel_tiling_supports_ccs(const struct brw_context *brw,
+                          enum isl_tiling tiling)
 {
    /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
     * Target(s)", beneath the "Fast Color Clear" bullet (p326):
@@ -113,9 +103,9 @@
     * Gen9 changes the restriction to Y-tile only.
     */
    if (brw->gen >= 9)
-      return tiling == I915_TILING_Y;
+      return tiling == ISL_TILING_Y0;
    else if (brw->gen >= 7)
-      return tiling != I915_TILING_NONE;
+      return tiling != ISL_TILING_LINEAR;
    else
       return false;
 }
@@ -141,19 +131,16 @@
  *     - MCS and Lossless compression is supported for TiledY/TileYs/TileYf
  *     non-MSRTs only.
  */
-bool
-intel_miptree_supports_non_msrt_fast_clear(struct brw_context *brw,
-                                           const struct intel_mipmap_tree *mt)
+static bool
+intel_miptree_supports_ccs(struct brw_context *brw,
+                           const struct intel_mipmap_tree *mt)
 {
    /* MCS support does not exist prior to Gen7 */
    if (brw->gen < 7)
       return false;
 
-   if (mt->aux_disable & INTEL_AUX_DISABLE_MCS)
-      return false;
-
    /* This function applies only to non-multisampled render targets. */
-   if (mt->num_samples > 1)
+   if (mt->surf.samples > 1)
       return false;
 
    /* MCS is only supported for color buffers */
@@ -168,7 +155,8 @@
       return false;
 
    const bool mip_mapped = mt->first_level != 0 || mt->last_level != 0;
-   const bool arrayed = mt->physical_depth0 != 1;
+   const bool arrayed = mt->surf.logical_level0_px.array_len > 1 ||
+                        mt->surf.logical_level0_px.depth > 1;
 
    if (arrayed) {
        /* Multisample surfaces with the CMS layout are not layered surfaces,
@@ -176,7 +164,7 @@
         * accidentally reject a multisampled surface here. We should have
         * rejected it earlier by explicitly checking the sample count.
         */
-      assert(mt->num_samples <= 1);
+      assert(mt->surf.samples == 1);
    }
 
    /* Handle the hardware restrictions...
@@ -204,48 +192,69 @@
    /* There's no point in using an MCS buffer if the surface isn't in a
     * renderable format.
     */
-   if (!brw->format_supported_as_render_target[mt->format])
+   if (!brw->mesa_format_supports_render[mt->format])
       return false;
 
-   if (brw->gen >= 9) {
-      mesa_format linear_format = _mesa_get_srgb_format_linear(mt->format);
-      const uint32_t brw_format = brw_isl_format_for_mesa_format(linear_format);
-      return isl_format_supports_ccs_e(&brw->screen->devinfo, brw_format);
-   } else
-      return true;
+   return true;
 }
 
-/* On Gen9 support for color buffer compression was extended to single
- * sampled surfaces. This is a helper considering both auxiliary buffer
- * type and number of samples telling if the given miptree represents
- * the new single sampled case - also called lossless compression.
- */
-bool
-intel_miptree_is_lossless_compressed(const struct brw_context *brw,
-                                     const struct intel_mipmap_tree *mt)
+static bool
+intel_tiling_supports_hiz(const struct brw_context *brw,
+                          enum isl_tiling tiling)
 {
-   /* Only available from Gen9 onwards. */
+   if (brw->gen < 6)
+      return false;
+
+   return tiling == ISL_TILING_Y0;
+}
+
+static bool
+intel_miptree_supports_hiz(const struct brw_context *brw,
+                           const struct intel_mipmap_tree *mt)
+{
+   if (!brw->has_hiz)
+      return false;
+
+   switch (mt->format) {
+   case MESA_FORMAT_Z_FLOAT32:
+   case MESA_FORMAT_Z32_FLOAT_S8X24_UINT:
+   case MESA_FORMAT_Z24_UNORM_X8_UINT:
+   case MESA_FORMAT_Z24_UNORM_S8_UINT:
+   case MESA_FORMAT_Z_UNORM16:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Return true if the format that will be used to access the miptree is
+ * CCS_E-compatible with the miptree's linear/non-sRGB format.
+ *
+ * Why use the linear format? Well, although the miptree may be specified with
+ * an sRGB format, the usage of that color space/format can be toggled. Since
+ * our HW tends to support more linear formats than sRGB ones, we use this
+ * format variant to check for CCS_E compatibility.
+ */
+static bool
+format_ccs_e_compat_with_miptree(const struct gen_device_info *devinfo,
+                                 const struct intel_mipmap_tree *mt,
+                                 enum isl_format access_format)
+{
+   assert(mt->aux_usage == ISL_AUX_USAGE_CCS_E);
+
+   mesa_format linear_format = _mesa_get_srgb_format_linear(mt->format);
+   enum isl_format isl_format = brw_isl_format_for_mesa_format(linear_format);
+   return isl_formats_are_ccs_e_compatible(devinfo, isl_format, access_format);
+}
+
+static bool
+intel_miptree_supports_ccs_e(struct brw_context *brw,
+                             const struct intel_mipmap_tree *mt)
+{
    if (brw->gen < 9)
       return false;
 
-   /* Compression always requires auxiliary buffer. */
-   if (!mt->mcs_buf)
-      return false;
-
-   /* Single sample compression is represented re-using msaa compression
-    * layout type: "Compressed Multisampled Surfaces".
-    */
-   if (mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS)
-      return false;
-
-   /* And finally distinguish between msaa and single sample case. */
-   return mt->num_samples <= 1;
-}
-
-bool
-intel_miptree_supports_lossless_compressed(struct brw_context *brw,
-                                           const struct intel_mipmap_tree *mt)
-{
    /* For now compression is only enabled for integer formats even though
     * there exist supported floating point formats also. This is a heuristic
     * decision based on current public benchmarks. In none of the cases these
@@ -256,16 +265,16 @@
    if (_mesa_get_format_datatype(mt->format) == GL_FLOAT)
       return false;
 
-   /* Fast clear mechanism and lossless compression go hand in hand. */
-   if (!intel_miptree_supports_non_msrt_fast_clear(brw, mt))
+   if (!intel_miptree_supports_ccs(brw, mt))
       return false;
 
-   /* Fast clear can be also used to clear srgb surfaces by using equivalent
-    * linear format. This trick, however, can't be extended to be used with
-    * lossless compression and therefore a check is needed to see if the format
-    * really is linear.
+   /* Many window system buffers are sRGB even if they are never rendered as
+    * sRGB.  For those, we want CCS_E for when sRGBEncode is false.  When the
+    * surface is used as sRGB, we fall back to CCS_D.
     */
-   return _mesa_get_srgb_format_linear(mt->format) == mt->format;
+   mesa_format linear_format = _mesa_get_srgb_format_linear(mt->format);
+   enum isl_format isl_format = brw_isl_format_for_mesa_format(linear_format);
+   return isl_format_supports_ccs_e(&brw->screen->devinfo, isl_format);
 }
 
 /**
@@ -284,256 +293,74 @@
    }
 }
 
+static bool
+create_mapping_table(GLenum target, unsigned first_level, unsigned last_level,
+                     unsigned depth0, struct intel_mipmap_level *table)
+{
+   for (unsigned level = first_level; level <= last_level; level++) {
+      const unsigned d =
+         target == GL_TEXTURE_3D ? minify(depth0, level) : depth0;
+
+      table[level].slice = calloc(d, sizeof(*table[0].slice));
+      if (!table[level].slice)
+         goto unwind;
+   }
+
+   return true;
+
+unwind:
+   for (unsigned level = first_level; level <= last_level; level++)
+      free(table[level].slice);
+
+   return false;
+}
+
+static bool
+needs_separate_stencil(const struct brw_context *brw,
+                       struct intel_mipmap_tree *mt,
+                       mesa_format format)
+{
+   if (_mesa_get_format_base_format(format) != GL_DEPTH_STENCIL)
+      return false;
+
+   if (brw->must_use_separate_stencil)
+      return true;
+
+   return brw->has_separate_stencil &&
+          intel_miptree_supports_hiz(brw, mt);
+}
 
 /**
- * @param for_bo Indicates that the caller is
- *        intel_miptree_create_for_bo(). If true, then do not create
- *        \c stencil_mt.
+ * Choose the aux usage for this miptree.  This function must be called fairly
+ * late in the miptree create process after we have a tiling.
  */
-static struct intel_mipmap_tree *
-intel_miptree_create_layout(struct brw_context *brw,
-                            GLenum target,
-                            mesa_format format,
-                            GLuint first_level,
-                            GLuint last_level,
-                            GLuint width0,
-                            GLuint height0,
-                            GLuint depth0,
-                            GLuint num_samples,
-                            uint32_t layout_flags)
+static void
+intel_miptree_choose_aux_usage(struct brw_context *brw,
+                               struct intel_mipmap_tree *mt)
 {
-   struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
-   if (!mt)
-      return NULL;
+   assert(mt->aux_usage == ISL_AUX_USAGE_NONE);
 
-   DBG("%s target %s format %s level %d..%d slices %d <-- %p\n", __func__,
-       _mesa_enum_to_string(target),
-       _mesa_get_format_name(format),
-       first_level, last_level, depth0, mt);
-
-   if (target == GL_TEXTURE_1D_ARRAY)
-      assert(height0 == 1);
-
-   mt->target = target;
-   mt->format = format;
-   mt->first_level = first_level;
-   mt->last_level = last_level;
-   mt->logical_width0 = width0;
-   mt->logical_height0 = height0;
-   mt->logical_depth0 = depth0;
-   mt->aux_disable = (layout_flags & MIPTREE_LAYOUT_DISABLE_AUX) != 0 ?
-      INTEL_AUX_DISABLE_ALL : INTEL_AUX_DISABLE_NONE;
-   mt->aux_disable |= INTEL_AUX_DISABLE_CCS;
-   mt->is_scanout = (layout_flags & MIPTREE_LAYOUT_FOR_SCANOUT) != 0;
-   exec_list_make_empty(&mt->hiz_map);
-   exec_list_make_empty(&mt->color_resolve_map);
-   mt->cpp = _mesa_get_format_bytes(format);
-   mt->num_samples = num_samples;
-   mt->compressed = _mesa_is_format_compressed(format);
-   mt->msaa_layout = INTEL_MSAA_LAYOUT_NONE;
-   mt->refcount = 1;
-
-   int depth_multiply = 1;
-   if (num_samples > 1) {
-      /* Adjust width/height/depth for MSAA */
-      mt->msaa_layout = compute_msaa_layout(brw, format, mt->aux_disable);
-      if (mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS) {
-         /* From the Ivybridge PRM, Volume 1, Part 1, page 108:
-          * "If the surface is multisampled and it is a depth or stencil
-          *  surface or Multisampled Surface StorageFormat in SURFACE_STATE is
-          *  MSFMT_DEPTH_STENCIL, WL and HL must be adjusted as follows before
-          *  proceeding:
-          *
-          *  +----------------------------------------------------------------+
-          *  | Num Multisamples |        W_l =         |        H_l =         |
-          *  +----------------------------------------------------------------+
-          *  |         2        | ceiling(W_l / 2) * 4 | H_l (no adjustment)  |
-          *  |         4        | ceiling(W_l / 2) * 4 | ceiling(H_l / 2) * 4 |
-          *  |         8        | ceiling(W_l / 2) * 8 | ceiling(H_l / 2) * 4 |
-          *  |        16        | ceiling(W_l / 2) * 8 | ceiling(H_l / 2) * 8 |
-          *  +----------------------------------------------------------------+
-          * "
-          *
-          * Note that MSFMT_DEPTH_STENCIL just means the IMS (interleaved)
-          * format rather than UMS/CMS (array slices).  The Sandybridge PRM,
-          * Volume 1, Part 1, Page 111 has the same formula for 4x MSAA.
-          *
-          * Another more complicated explanation for these adjustments comes
-          * from the Sandybridge PRM, volume 4, part 1, page 31:
-          *
-          *     "Any of the other messages (sample*, LOD, load4) used with a
-          *      (4x) multisampled surface will in-effect sample a surface with
-          *      double the height and width as that indicated in the surface
-          *      state. Each pixel position on the original-sized surface is
-          *      replaced with a 2x2 of samples with the following arrangement:
-          *
-          *         sample 0 sample 2
-          *         sample 1 sample 3"
-          *
-          * Thus, when sampling from a multisampled texture, it behaves as
-          * though the layout in memory for (x,y,sample) is:
-          *
-          *      (0,0,0) (0,0,2)   (1,0,0) (1,0,2)
-          *      (0,0,1) (0,0,3)   (1,0,1) (1,0,3)
-          *
-          *      (0,1,0) (0,1,2)   (1,1,0) (1,1,2)
-          *      (0,1,1) (0,1,3)   (1,1,1) (1,1,3)
-          *
-          * However, the actual layout of multisampled data in memory is:
-          *
-          *      (0,0,0) (1,0,0)   (0,0,1) (1,0,1)
-          *      (0,1,0) (1,1,0)   (0,1,1) (1,1,1)
-          *
-          *      (0,0,2) (1,0,2)   (0,0,3) (1,0,3)
-          *      (0,1,2) (1,1,2)   (0,1,3) (1,1,3)
-          *
-          * This pattern repeats for each 2x2 pixel block.
-          *
-          * As a result, when calculating the size of our 4-sample buffer for
-          * an odd width or height, we have to align before scaling up because
-          * sample 3 is in that bottom right 2x2 block.
-          */
-         switch (num_samples) {
-         case 2:
-            assert(brw->gen >= 8);
-            width0 = ALIGN(width0, 2) * 2;
-            height0 = ALIGN(height0, 2);
-            break;
-         case 4:
-            width0 = ALIGN(width0, 2) * 2;
-            height0 = ALIGN(height0, 2) * 2;
-            break;
-         case 8:
-            width0 = ALIGN(width0, 2) * 4;
-            height0 = ALIGN(height0, 2) * 2;
-            break;
-         case 16:
-            width0 = ALIGN(width0, 2) * 4;
-            height0 = ALIGN(height0, 2) * 4;
-            break;
-         default:
-            /* num_samples should already have been quantized to 0, 1, 2, 4, 8
-             * or 16.
-             */
-            unreachable("not reached");
-         }
+   if (mt->surf.samples > 1 && is_mcs_supported(brw, mt->format)) {
+      assert(mt->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY);
+      mt->aux_usage = ISL_AUX_USAGE_MCS;
+   } else if (intel_tiling_supports_ccs(brw, mt->surf.tiling) &&
+              intel_miptree_supports_ccs(brw, mt)) {
+      if (!unlikely(INTEL_DEBUG & DEBUG_NO_RBC) &&
+          intel_miptree_supports_ccs_e(brw, mt)) {
+         mt->aux_usage = ISL_AUX_USAGE_CCS_E;
       } else {
-         /* Non-interleaved */
-         depth_multiply = num_samples;
-         depth0 *= depth_multiply;
+         mt->aux_usage = ISL_AUX_USAGE_CCS_D;
       }
+   } else if (intel_tiling_supports_hiz(brw, mt->surf.tiling) &&
+              intel_miptree_supports_hiz(brw, mt)) {
+      mt->aux_usage = ISL_AUX_USAGE_HIZ;
    }
 
-   /* Set array_layout to ALL_SLICES_AT_EACH_LOD when array_spacing_lod0 can
-    * be used. array_spacing_lod0 is only used for non-IMS MSAA surfaces on
-    * Gen 7 and 8. On Gen 8 and 9 this layout is not available but it is still
-    * used on Gen8 to make it pick a qpitch value which doesn't include space
-    * for the mipmaps. On Gen9 this is not necessary because it will
-    * automatically pick a packed qpitch value whenever mt->first_level ==
-    * mt->last_level.
-    * TODO: can we use it elsewhere?
-    * TODO: also disable this on Gen8 and pick the qpitch value like Gen9
+   /* We can do fast-clear on all auxiliary surface types that are
+    * allocated through the normal texture creation paths.
     */
-   if (brw->gen >= 9) {
-      mt->array_layout = ALL_LOD_IN_EACH_SLICE;
-   } else {
-      switch (mt->msaa_layout) {
-      case INTEL_MSAA_LAYOUT_NONE:
-      case INTEL_MSAA_LAYOUT_IMS:
-         mt->array_layout = ALL_LOD_IN_EACH_SLICE;
-         break;
-      case INTEL_MSAA_LAYOUT_UMS:
-      case INTEL_MSAA_LAYOUT_CMS:
-         mt->array_layout = ALL_SLICES_AT_EACH_LOD;
-         break;
-      }
-   }
-
-   if (target == GL_TEXTURE_CUBE_MAP)
-      assert(depth0 == 6 * depth_multiply);
-
-   mt->physical_width0 = width0;
-   mt->physical_height0 = height0;
-   mt->physical_depth0 = depth0;
-
-   if (!(layout_flags & MIPTREE_LAYOUT_FOR_BO) &&
-       _mesa_get_format_base_format(format) == GL_DEPTH_STENCIL &&
-       (brw->must_use_separate_stencil ||
-	(brw->has_separate_stencil &&
-         intel_miptree_wants_hiz_buffer(brw, mt)))) {
-      uint32_t stencil_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
-      if (brw->gen == 6) {
-         stencil_flags |= MIPTREE_LAYOUT_GEN6_HIZ_STENCIL |
-                          MIPTREE_LAYOUT_TILING_ANY;
-      }
-
-      mt->stencil_mt = intel_miptree_create(brw,
-                                            mt->target,
-                                            MESA_FORMAT_S_UINT8,
-                                            mt->first_level,
-                                            mt->last_level,
-                                            mt->logical_width0,
-                                            mt->logical_height0,
-                                            mt->logical_depth0,
-                                            num_samples,
-                                            stencil_flags);
-
-      if (!mt->stencil_mt) {
-	 intel_miptree_release(&mt);
-	 return NULL;
-      }
-      mt->stencil_mt->r8stencil_needs_update = true;
-
-      /* Fix up the Z miptree format for how we're splitting out separate
-       * stencil.  Gen7 expects there to be no stencil bits in its depth buffer.
-       */
-      mt->format = intel_depth_format_for_depthstencil_format(mt->format);
-      mt->cpp = 4;
-
-      if (format == mt->format) {
-         _mesa_problem(NULL, "Unknown format %s in separate stencil mt\n",
-                       _mesa_get_format_name(mt->format));
-      }
-   }
-
-   if (layout_flags & MIPTREE_LAYOUT_GEN6_HIZ_STENCIL)
-      mt->array_layout = GEN6_HIZ_STENCIL;
-
-   /*
-    * Obey HALIGN_16 constraints for Gen8 and Gen9 buffers which are
-    * multisampled or have an AUX buffer attached to it.
-    *
-    * GEN  |    MSRT        | AUX_CCS_* or AUX_MCS
-    *  -------------------------------------------
-    *  9   |  HALIGN_16     |    HALIGN_16
-    *  8   |  HALIGN_ANY    |    HALIGN_16
-    *  7   |      ?         |        ?
-    *  6   |      ?         |        ?
-    */
-   if (intel_miptree_supports_non_msrt_fast_clear(brw, mt)) {
-      if (brw->gen >= 9 || (brw->gen == 8 && num_samples <= 1))
-         layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
-   } else if (brw->gen >= 9 && num_samples > 1) {
-      layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
-   } else {
-      const UNUSED bool is_lossless_compressed_aux =
-         brw->gen >= 9 && num_samples == 1 &&
-         mt->format == MESA_FORMAT_R_UINT32;
-
-      /* For now, nothing else has this requirement */
-      assert(is_lossless_compressed_aux ||
-             (layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
-   }
-
-   if (!brw_miptree_layout(brw, mt, layout_flags)) {
-      intel_miptree_release(&mt);
-      return NULL;
-   }
-
-   if (mt->aux_disable & INTEL_AUX_DISABLE_MCS)
-      assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS);
-
-   return mt;
+   if (mt->aux_usage != ISL_AUX_USAGE_NONE)
+      mt->supports_fast_clear = true;
 }
 
 
@@ -576,6 +403,247 @@
    }
 }
 
+static unsigned
+get_num_logical_layers(const struct intel_mipmap_tree *mt, unsigned level)
+{
+   if (mt->surf.dim == ISL_SURF_DIM_3D)
+      return minify(mt->surf.logical_level0_px.depth, level);
+   else
+      return mt->surf.logical_level0_px.array_len;
+}
+
+static unsigned
+get_num_phys_layers(const struct isl_surf *surf, unsigned level)
+{
+   /* For physical dimensions the dimension layout must also be considered.
+    * See isl_calc_phys_level0_extent_sa().
+    */
+   if (surf->dim != ISL_SURF_DIM_3D)
+      return surf->phys_level0_sa.array_len;
+
+   if (surf->dim_layout == ISL_DIM_LAYOUT_GEN4_2D)
+      return minify(surf->phys_level0_sa.array_len, level);
+
+   return minify(surf->phys_level0_sa.depth, level);
+}
+
+/** \brief Assert that the level and layer are valid for the miptree. */
+void
+intel_miptree_check_level_layer(const struct intel_mipmap_tree *mt,
+                                uint32_t level,
+                                uint32_t layer)
+{
+   (void) mt;
+   (void) level;
+   (void) layer;
+
+   assert(level >= mt->first_level);
+   assert(level <= mt->last_level);
+   assert(layer < get_num_phys_layers(&mt->surf, level));
+}
+
+static enum isl_aux_state **
+create_aux_state_map(struct intel_mipmap_tree *mt,
+                     enum isl_aux_state initial)
+{
+   const uint32_t levels = mt->last_level + 1;
+
+   uint32_t total_slices = 0;
+   for (uint32_t level = 0; level < levels; level++)
+      total_slices += get_num_logical_layers(mt, level);
+
+   const size_t per_level_array_size = levels * sizeof(enum isl_aux_state *);
+
+   /* We're going to allocate a single chunk of data for both the per-level
+    * reference array and the arrays of aux_state.  This makes cleanup
+    * significantly easier.
+    */
+   const size_t total_size = per_level_array_size +
+                             total_slices * sizeof(enum isl_aux_state);
+   void *data = malloc(total_size);
+   if (data == NULL)
+      return NULL;
+
+   enum isl_aux_state **per_level_arr = data;
+   enum isl_aux_state *s = data + per_level_array_size;
+   for (uint32_t level = 0; level < levels; level++) {
+      per_level_arr[level] = s;
+      const unsigned level_layers = get_num_logical_layers(mt, level);
+      for (uint32_t a = 0; a < level_layers; a++)
+         *(s++) = initial;
+   }
+   assert((void *)s == data + total_size);
+
+   return per_level_arr;
+}
+
+static void
+free_aux_state_map(enum isl_aux_state **state)
+{
+   free(state);
+}
+
+static bool
+need_to_retile_as_linear(struct brw_context *brw, unsigned row_pitch,
+                         enum isl_tiling tiling, unsigned samples)
+{
+   if (samples > 1)
+      return false;
+
+   if (tiling == ISL_TILING_LINEAR)
+      return false;
+
+   /* If the row pitch is much smaller than a tile, don't bother tiling. */
+   if (row_pitch < 64)
+      return true;
+
+   if (ALIGN(row_pitch, 512) >= 32768) {
+      perf_debug("row pitch %u too large to blit, falling back to untiled",
+                 row_pitch);
+      return true;
+   }
+
+   return false;
+}
+
+static bool
+need_to_retile_as_x(const struct brw_context *brw, uint64_t size,
+                    enum isl_tiling tiling)
+{
+   /* If the BO is too large to fit in the aperture, we need to use the
+    * BLT engine to support it.  Prior to Sandybridge, the BLT paths can't
+    * handle Y-tiling, so we need to fall back to X.
+    */
+   if (brw->gen < 6 && size >= brw->max_gtt_map_object_size &&
+       tiling == ISL_TILING_Y0)
+      return true;
+
+   return false;
+}
+
+static struct intel_mipmap_tree *
+make_surface(struct brw_context *brw, GLenum target, mesa_format format,
+             unsigned first_level, unsigned last_level,
+             unsigned width0, unsigned height0, unsigned depth0,
+             unsigned num_samples, isl_tiling_flags_t tiling_flags,
+             isl_surf_usage_flags_t isl_usage_flags, uint32_t alloc_flags,
+             unsigned row_pitch, struct brw_bo *bo)
+{
+   struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
+   if (!mt)
+      return NULL;
+
+   if (!create_mapping_table(target, first_level, last_level, depth0,
+                             mt->level)) {
+      free(mt);
+      return NULL;
+   }
+
+   mt->refcount = 1;
+
+   if (target == GL_TEXTURE_CUBE_MAP ||
+       target == GL_TEXTURE_CUBE_MAP_ARRAY)
+      isl_usage_flags |= ISL_SURF_USAGE_CUBE_BIT;
+
+   DBG("%s: %s %s %ux %u:%u:%u %d..%d <-- %p\n",
+        __func__,
+       _mesa_enum_to_string(target),
+       _mesa_get_format_name(format),
+       num_samples, width0, height0, depth0,
+       first_level, last_level, mt);
+
+   struct isl_surf_init_info init_info = {
+      .dim = get_isl_surf_dim(target),
+      .format = translate_tex_format(brw, format, false),
+      .width = width0,
+      .height = height0,
+      .depth = target == GL_TEXTURE_3D ? depth0 : 1,
+      .levels = last_level - first_level + 1,
+      .array_len = target == GL_TEXTURE_3D ? 1 : depth0,
+      .samples = num_samples,
+      .row_pitch = row_pitch,
+      .usage = isl_usage_flags, 
+      .tiling_flags = tiling_flags,
+   };
+
+   if (!isl_surf_init_s(&brw->isl_dev, &mt->surf, &init_info))
+      goto fail;
+
+   /* In case caller doesn't specifically request Y-tiling (needed
+    * unconditionally for depth), check for corner cases needing special
+    * treatment.
+    */
+   if (tiling_flags & ~ISL_TILING_Y0_BIT) {
+      if (need_to_retile_as_linear(brw, mt->surf.row_pitch,
+                                   mt->surf.tiling, mt->surf.samples)) {
+         init_info.tiling_flags = 1u << ISL_TILING_LINEAR;
+         if (!isl_surf_init_s(&brw->isl_dev, &mt->surf, &init_info))
+            goto fail;
+      } else if (need_to_retile_as_x(brw, mt->surf.size, mt->surf.tiling)) {
+         init_info.tiling_flags = 1u << ISL_TILING_X;
+         if (!isl_surf_init_s(&brw->isl_dev, &mt->surf, &init_info))
+            goto fail;
+      }
+   }
+
+   /* Linear buffers get padded by a fixed 64 bytes, so the size may not be a
+    * multiple of row_pitch.
+    * See isl_apply_surface_padding().
+    */
+   if (mt->surf.tiling != ISL_TILING_LINEAR)
+      assert(mt->surf.size % mt->surf.row_pitch == 0);
+
+   if (!bo) {
+      mt->bo = brw_bo_alloc_tiled(brw->bufmgr, "isl-miptree",
+                                  mt->surf.size,
+                                  isl_tiling_to_i915_tiling(
+                                     mt->surf.tiling),
+                                  mt->surf.row_pitch, alloc_flags);
+      if (!mt->bo)
+         goto fail;
+   } else {
+      mt->bo = bo;
+   }
+
+   mt->first_level = first_level;
+   mt->last_level = last_level;
+   mt->target = target;
+   mt->format = format;
+   mt->aux_state = NULL;
+   mt->cpp = isl_format_get_layout(mt->surf.format)->bpb / 8;
+   mt->compressed = _mesa_is_format_compressed(format);
+
+   return mt;
+
+fail:
+   intel_miptree_release(&mt);
+   return NULL;
+}
+
+static bool
+make_separate_stencil_surface(struct brw_context *brw,
+                              struct intel_mipmap_tree *mt)
+{
+   mt->stencil_mt = make_surface(brw, mt->target, MESA_FORMAT_S_UINT8,
+                                 0, mt->surf.levels - 1,
+                                 mt->surf.logical_level0_px.width,
+                                 mt->surf.logical_level0_px.height,
+                                 mt->surf.dim == ISL_SURF_DIM_3D ?
+                                    mt->surf.logical_level0_px.depth :
+                                    mt->surf.logical_level0_px.array_len,
+                                 mt->surf.samples, ISL_TILING_W_BIT,
+                                 ISL_SURF_USAGE_STENCIL_BIT |
+                                 ISL_SURF_USAGE_TEXTURE_BIT,
+                                 BO_ALLOC_FOR_RENDER, 0, NULL);
+
+   if (!mt->stencil_mt)
+      return false;
+   
+   mt->stencil_mt->r8stencil_needs_update = true;
+
+   return true;
+}
+
 static struct intel_mipmap_tree *
 miptree_create(struct brw_context *brw,
                GLenum target,
@@ -586,9 +654,55 @@
                GLuint height0,
                GLuint depth0,
                GLuint num_samples,
-               uint32_t layout_flags)
+               enum intel_miptree_create_flags flags)
 {
-   struct intel_mipmap_tree *mt;
+   /* Stencil-only surfaces are always requested W-tiled; no aux usage is
+    * chosen for them on this path.
+    */
+   if (format == MESA_FORMAT_S_UINT8)
+      return make_surface(brw, target, format, first_level, last_level,
+                          width0, height0, depth0, num_samples,
+                          ISL_TILING_W_BIT,
+                          ISL_SURF_USAGE_STENCIL_BIT |
+                          ISL_SURF_USAGE_TEXTURE_BIT,
+                          BO_ALLOC_FOR_RENDER,
+                          0,
+                          NULL);
+
+   const GLenum base_format = _mesa_get_format_base_format(format);
+   if ((base_format == GL_DEPTH_COMPONENT ||
+        base_format == GL_DEPTH_STENCIL) &&
+       !(flags & MIPTREE_CREATE_LINEAR)) {
+      /* Fix up the Z miptree format for how we're splitting out separate
+       * stencil.  Gen7 expects there to be no stencil bits in its depth buffer.
+       */
+      const mesa_format depth_only_format =
+         intel_depth_format_for_depthstencil_format(format);
+      struct intel_mipmap_tree *mt = make_surface(
+         brw, target, brw->gen >= 6 ? depth_only_format : format,
+         first_level, last_level,
+         width0, height0, depth0, num_samples, ISL_TILING_Y0_BIT,
+         ISL_SURF_USAGE_DEPTH_BIT | ISL_SURF_USAGE_TEXTURE_BIT,
+         BO_ALLOC_FOR_RENDER, 0, NULL);
+
+      /* make_surface() returns NULL on allocation or layout failure; bail
+       * out before the helpers below dereference the miptree.
+       */
+      if (!mt)
+         return NULL;
+
+      if (needs_separate_stencil(brw, mt, format) &&
+          !make_separate_stencil_surface(brw, mt)) {
+         intel_miptree_release(&mt);
+         return NULL;
+      }
+
+      if (!(flags & MIPTREE_CREATE_NO_AUX))
+         intel_miptree_choose_aux_usage(brw, mt);
+
+      return mt;
+   }
+
    mesa_format tex_format = format;
    mesa_format etc_format = MESA_FORMAT_NONE;
    uint32_t alloc_flags = 0;
@@ -597,35 +702,31 @@
 
    etc_format = (format != tex_format) ? tex_format : MESA_FORMAT_NONE;
 
-   assert((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0);
-   mt = intel_miptree_create_layout(brw, target, format,
-                                    first_level, last_level, width0,
-                                    height0, depth0, num_samples,
-                                    layout_flags);
+   if (flags & MIPTREE_CREATE_BUSY)
+      alloc_flags |= BO_ALLOC_FOR_RENDER;
+
+   isl_tiling_flags_t tiling_flags = (flags & MIPTREE_CREATE_LINEAR) ?
+      ISL_TILING_LINEAR_BIT : ISL_TILING_ANY_MASK;
+
+   /* TODO: This used to be because there wasn't BLORP to handle Y-tiling. */
+   if (brw->gen < 6)
+      tiling_flags &= ~ISL_TILING_Y0_BIT;
+
+   struct intel_mipmap_tree *mt = make_surface(
+                                     brw, target, format,
+                                     first_level, last_level,
+                                     width0, height0, depth0,
+                                     num_samples, tiling_flags,
+                                     ISL_SURF_USAGE_RENDER_TARGET_BIT |
+                                     ISL_SURF_USAGE_TEXTURE_BIT,
+                                     alloc_flags, 0, NULL);
    if (!mt)
       return NULL;
 
-   if (mt->tiling == (I915_TILING_Y | I915_TILING_X))
-      mt->tiling = I915_TILING_Y;
-
-   if (layout_flags & MIPTREE_LAYOUT_ACCELERATED_UPLOAD)
-      alloc_flags |= BO_ALLOC_FOR_RENDER;
-
    mt->etc_format = etc_format;
 
-   if (format == MESA_FORMAT_S_UINT8) {
-      /* Align to size of W tile, 64x64. */
-      mt->bo = brw_bo_alloc_tiled(brw->bufmgr, "miptree",
-                                  ALIGN(mt->total_width, 64),
-                                  ALIGN(mt->total_height, 64),
-                                  mt->cpp, mt->tiling, &mt->pitch,
-                                  alloc_flags);
-   } else {
-      mt->bo = brw_bo_alloc_tiled(brw->bufmgr, "miptree",
-                                  mt->total_width, mt->total_height,
-                                  mt->cpp, mt->tiling, &mt->pitch,
-                                  alloc_flags);
-   }
+   if (!(flags & MIPTREE_CREATE_NO_AUX))
+      intel_miptree_choose_aux_usage(brw, mt);
 
    return mt;
 }
@@ -640,73 +741,23 @@
                      GLuint height0,
                      GLuint depth0,
                      GLuint num_samples,
-                     uint32_t layout_flags)
+                     enum intel_miptree_create_flags flags)
 {
+   assert(num_samples > 0);
+
    struct intel_mipmap_tree *mt = miptree_create(
                                      brw, target, format,
                                      first_level, last_level,
                                      width0, height0, depth0, num_samples,
-                                     layout_flags);
-
-   /* If the BO is too large to fit in the aperture, we need to use the
-    * BLT engine to support it.  Prior to Sandybridge, the BLT paths can't
-    * handle Y-tiling, so we need to fall back to X.
-    */
-   if (brw->gen < 6 && mt->bo->size >= brw->max_gtt_map_object_size &&
-       mt->tiling == I915_TILING_Y) {
-      const uint32_t alloc_flags =
-         (layout_flags & MIPTREE_LAYOUT_ACCELERATED_UPLOAD) ?
-         BO_ALLOC_FOR_RENDER : 0;
-      perf_debug("%dx%d miptree larger than aperture; falling back to X-tiled\n",
-                 mt->total_width, mt->total_height);
-
-      mt->tiling = I915_TILING_X;
-      brw_bo_unreference(mt->bo);
-      mt->bo = brw_bo_alloc_tiled(brw->bufmgr, "miptree",
-                                  mt->total_width, mt->total_height, mt->cpp,
-                                  mt->tiling, &mt->pitch, alloc_flags);
-   }
+                                     flags);
+   if (!mt)
+      return NULL;
 
    mt->offset = 0;
 
-   if (!mt->bo) {
-       intel_miptree_release(&mt);
-       return NULL;
-   }
-
-
-   if (mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) {
-      assert(mt->num_samples > 1);
-      if (!intel_miptree_alloc_mcs(brw, mt, num_samples)) {
-         intel_miptree_release(&mt);
-         return NULL;
-      }
-   }
-
-   /* If this miptree is capable of supporting fast color clears, set
-    * fast_clear_state appropriately to ensure that fast clears will occur.
-    * Allocation of the MCS miptree will be deferred until the first fast
-    * clear actually occurs or when compressed single sampled buffer is
-    * written by the GPU for the first time.
-    */
-   if (intel_tiling_supports_non_msrt_mcs(brw, mt->tiling) &&
-       intel_miptree_supports_non_msrt_fast_clear(brw, mt)) {
-      mt->aux_disable &= ~INTEL_AUX_DISABLE_CCS;
-      assert(brw->gen < 8 || mt->halign == 16 || num_samples <= 1);
-
-      /* On Gen9+ clients are not currently capable of consuming compressed
-       * single-sampled buffers. Disabling compression allows us to skip
-       * resolves.
-       */
-      const bool lossless_compression_disabled = INTEL_DEBUG & DEBUG_NO_RBC;
-      const bool is_lossless_compressed =
-         unlikely(!lossless_compression_disabled) &&
-         brw->gen >= 9 && !mt->is_scanout &&
-         intel_miptree_supports_lossless_compressed(brw, mt);
-
-      if (is_lossless_compressed) {
-         intel_miptree_alloc_non_msrt_mcs(brw, mt, is_lossless_compressed);
-      }
+   if (!intel_miptree_alloc_aux(brw, mt)) {
+      intel_miptree_release(&mt);
+      return NULL;
    }
 
    return mt;
@@ -721,11 +772,52 @@
                             uint32_t height,
                             uint32_t depth,
                             int pitch,
-                            uint32_t layout_flags)
+                            enum intel_miptree_create_flags flags)
 {
    struct intel_mipmap_tree *mt;
    uint32_t tiling, swizzle;
-   GLenum target;
+   const GLenum target = depth > 1 ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D;
+   const GLenum base_format = _mesa_get_format_base_format(format);
+
+   /* Depth and depth/stencil BOs are wrapped as Y-tiled depth surfaces; on
+    * gen6+ the depth-only variant of the format is used for the miptree.
+    */
+   if ((base_format == GL_DEPTH_COMPONENT ||
+        base_format == GL_DEPTH_STENCIL)) {
+      const mesa_format depth_only_format =
+         intel_depth_format_for_depthstencil_format(format);
+      mt = make_surface(brw, target,
+                        brw->gen >= 6 ? depth_only_format : format,
+                        0, 0, width, height, depth, 1, ISL_TILING_Y0_BIT,
+                        ISL_SURF_USAGE_DEPTH_BIT | ISL_SURF_USAGE_TEXTURE_BIT,
+                        BO_ALLOC_FOR_RENDER, pitch, bo);
+      /* Bail out on failure before referencing the caller's BO: otherwise the
+       * reference would leak and choosing aux usage would dereference NULL.
+       */
+      if (!mt)
+         return NULL;
+
+      brw_bo_reference(bo);
+
+      if (!(flags & MIPTREE_CREATE_NO_AUX))
+         intel_miptree_choose_aux_usage(brw, mt);
+
+      return mt;
+   } else if (format == MESA_FORMAT_S_UINT8) {
+      mt = make_surface(brw, target, MESA_FORMAT_S_UINT8,
+                        0, 0, width, height, depth, 1,
+                        ISL_TILING_W_BIT,
+                        ISL_SURF_USAGE_STENCIL_BIT |
+                        ISL_SURF_USAGE_TEXTURE_BIT,
+                        BO_ALLOC_FOR_RENDER, pitch, bo);
+      if (!mt)
+         return NULL;
+
+      assert(bo->size >= mt->surf.size);
+
+      brw_bo_reference(bo);
+      return mt;
+   }
 
    brw_bo_get_tiling(bo, &tiling, &swizzle);
 
@@ -740,27 +824,166 @@
     */
    assert(pitch >= 0);
 
-   target = depth > 1 ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D;
-
    /* The BO already has a tiling format and we shouldn't confuse the lower
     * layers by making it try to find a tiling format again.
     */
-   assert((layout_flags & MIPTREE_LAYOUT_TILING_ANY) == 0);
-   assert((layout_flags & MIPTREE_LAYOUT_TILING_NONE) == 0);
+   assert((flags & MIPTREE_CREATE_LINEAR) == 0);
 
-   layout_flags |= MIPTREE_LAYOUT_FOR_BO;
-   mt = intel_miptree_create_layout(brw, target, format,
-                                    0, 0,
-                                    width, height, depth, 0,
-                                    layout_flags);
+   mt = make_surface(brw, target, format,
+                     0, 0, width, height, depth, 1,
+                     1lu << isl_tiling_from_i915_tiling(tiling),
+                     ISL_SURF_USAGE_RENDER_TARGET_BIT |
+                     ISL_SURF_USAGE_TEXTURE_BIT,
+                     0, pitch, bo);
    if (!mt)
       return NULL;
 
    brw_bo_reference(bo);
    mt->bo = bo;
-   mt->pitch = pitch;
    mt->offset = offset;
-   mt->tiling = tiling;
+
+   if (!(flags & MIPTREE_CREATE_NO_AUX)) {
+      intel_miptree_choose_aux_usage(brw, mt);
+
+      if (!intel_miptree_alloc_aux(brw, mt)) {
+         intel_miptree_release(&mt);
+         return NULL;
+      }
+   }
+
+   return mt;
+}
+
+static struct intel_mipmap_tree *
+miptree_create_for_planar_image(struct brw_context *brw,
+                                __DRIimage *image, GLenum target)
+{
+   struct intel_image_format *f = image->planar_format;
+   struct intel_mipmap_tree *planar_mt = NULL;
+
+   for (int i = 0; i < f->nplanes; i++) {
+      const int index = f->planes[i].buffer_index;
+      const uint32_t dri_format = f->planes[i].dri_format;
+      const mesa_format format = driImageFormatToGLFormat(dri_format);
+      const uint32_t width = image->width >> f->planes[i].width_shift;
+      const uint32_t height = image->height >> f->planes[i].height_shift;
+
+      /* Disable creation of the texture's aux buffers because the driver
+       * exposes no EGL API to manage them. That is, there is no API for
+       * resolving the aux buffer's content to the main buffer nor for
+       * invalidating the aux buffer's content.
+       */
+      struct intel_mipmap_tree *mt =
+         intel_miptree_create_for_bo(brw, image->bo, format,
+                                     image->offsets[index],
+                                     width, height, 1,
+                                     image->strides[index],
+                                     MIPTREE_CREATE_NO_AUX);
+      if (mt == NULL)
+         return NULL;
+
+      mt->target = target;
+
+      if (i == 0)
+         planar_mt = mt;
+      else
+         planar_mt->plane[i - 1] = mt;
+   }
+
+   return planar_mt;
+}
+
+struct intel_mipmap_tree *
+intel_miptree_create_for_dri_image(struct brw_context *brw,
+                                   __DRIimage *image, GLenum target,
+                                   enum isl_colorspace colorspace,
+                                   bool is_winsys_image)
+{
+   if (image->planar_format && image->planar_format->nplanes > 0) {
+      assert(colorspace == ISL_COLORSPACE_NONE ||
+             colorspace == ISL_COLORSPACE_YUV);
+      return miptree_create_for_planar_image(brw, image, target);
+   }
+
+   mesa_format format = image->format;
+   switch (colorspace) {
+   case ISL_COLORSPACE_NONE:
+      /* Keep the image format unmodified */
+      break;
+
+   case ISL_COLORSPACE_LINEAR:
+      format =_mesa_get_srgb_format_linear(format);
+      break;
+
+   case ISL_COLORSPACE_SRGB:
+      format =_mesa_get_linear_format_srgb(format);
+      break;
+
+   default:
+      unreachable("Inalid colorspace for non-planar image");
+   }
+
+   if (!brw->ctx.TextureFormatSupported[format]) {
+      /* The texture storage paths in core Mesa detect if the driver does not
+       * support the user-requested format, and then searches for a
+       * fallback format. The DRIimage code bypasses core Mesa, though. So we
+       * do the fallbacks here for important formats.
+       *
+       * We must support DRM_FOURCC_XBGR8888 textures because the Android
+       * framework produces HAL_PIXEL_FORMAT_RGBX8888 winsys surfaces, which
+       * the Chrome OS compositor consumes as dma_buf EGLImages.
+       */
+      format = _mesa_format_fallback_rgbx_to_rgba(format);
+   }
+
+   if (!brw->ctx.TextureFormatSupported[format])
+      return NULL;
+
+   /* If this image comes in from a window system, we have different
+    * requirements than if it comes in via an EGL import operation.  Window
+    * system images can use any form of auxiliary compression we wish because
+    * they get "flushed" before being handed off to the window system and we
+    * have the opportunity to do resolves.  Window system buffers also may be
+    * used for scanout so we need to flag that appropriately.
+    */
+   const enum intel_miptree_create_flags mt_create_flags =
+      is_winsys_image ? 0 : MIPTREE_CREATE_NO_AUX;
+
+   /* For imported (non-winsys) images, aux buffers are disabled because the
+    * driver exposes no EGL API to manage them. That is, there is no API for
+    * resolving the aux buffer's content to the main buffer nor for
+    * invalidating the aux buffer's content.
+    */
+   struct intel_mipmap_tree *mt =
+      intel_miptree_create_for_bo(brw, image->bo, format,
+                                  image->offset, image->width, image->height, 1,
+                                  image->pitch, mt_create_flags);
+   if (mt == NULL)
+      return NULL;
+
+   mt->target = target;
+   mt->level[0].level_x = image->tile_x;
+   mt->level[0].level_y = image->tile_y;
+
+   /* From "OES_EGL_image" error reporting. We report GL_INVALID_OPERATION
+    * for EGL images from non-tile aligned surfaces in gen4 hw and earlier which has
+    * trouble resolving back to destination image due to alignment issues.
+    */
+   if (!brw->has_surface_tile_offset) {
+      uint32_t draw_x, draw_y;
+      intel_miptree_get_tile_offsets(mt, 0, 0, &draw_x, &draw_y);
+
+      if (draw_x != 0 || draw_y != 0) {
+         _mesa_error(&brw->ctx, GL_INVALID_OPERATION, __func__);
+         intel_miptree_release(&mt);
+         return NULL;
+      }
+   }
+
+   /* Don't assume coherency for imported EGLimages.  We don't know what
+    * external clients are going to do with it.  They may scan it out.
+    */
+   image->bo->cache_coherent = false;
 
    return mt;
 }
@@ -775,18 +998,17 @@
  * that will contain the actual rendering (which is lazily resolved to
  * irb->singlesample_mt).
  */
-void
+bool
 intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
                                          struct intel_renderbuffer *irb,
-                                         struct brw_bo *bo,
+                                         struct intel_mipmap_tree *singlesample_mt,
                                          uint32_t width, uint32_t height,
                                          uint32_t pitch)
 {
-   struct intel_mipmap_tree *singlesample_mt = NULL;
    struct intel_mipmap_tree *multisample_mt = NULL;
    struct gl_renderbuffer *rb = &irb->Base.Base;
    mesa_format format = rb->Format;
-   int num_samples = rb->NumSamples;
+   const unsigned num_samples = MAX2(rb->NumSamples, 1);
 
    /* Only the front and back buffers, which are color buffers, are allocated
     * through the image loader.
@@ -794,29 +1016,9 @@
    assert(_mesa_get_format_base_format(format) == GL_RGB ||
           _mesa_get_format_base_format(format) == GL_RGBA);
 
-   singlesample_mt = intel_miptree_create_for_bo(intel,
-                                                 bo,
-                                                 format,
-                                                 0,
-                                                 width,
-                                                 height,
-                                                 1,
-                                                 pitch,
-                                                 MIPTREE_LAYOUT_FOR_SCANOUT);
-   if (!singlesample_mt)
-      goto fail;
+   assert(singlesample_mt);
 
-   /* If this miptree is capable of supporting fast color clears, set
-    * mcs_state appropriately to ensure that fast clears will occur.
-    * Allocation of the MCS miptree will be deferred until the first fast
-    * clear actually occurs.
-    */
-   if (intel_tiling_supports_non_msrt_mcs(intel, singlesample_mt->tiling) &&
-       intel_miptree_supports_non_msrt_fast_clear(intel, singlesample_mt)) {
-      singlesample_mt->aux_disable &= ~INTEL_AUX_DISABLE_CCS;
-   }
-
-   if (num_samples == 0) {
+   if (num_samples == 1) {
       intel_miptree_release(&irb->mt);
       irb->mt = singlesample_mt;
 
@@ -826,8 +1028,8 @@
       irb->singlesample_mt = singlesample_mt;
 
       if (!irb->mt ||
-          irb->mt->logical_width0 != width ||
-          irb->mt->logical_height0 != height) {
+          irb->mt->surf.logical_level0_px.width != width ||
+          irb->mt->surf.logical_level0_px.height != height) {
          multisample_mt = intel_miptree_create_for_renderbuffer(intel,
                                                                 format,
                                                                 width,
@@ -841,12 +1043,11 @@
          irb->mt = multisample_mt;
       }
    }
-   return;
+   return true;
 
 fail:
-   intel_miptree_release(&irb->singlesample_mt);
    intel_miptree_release(&irb->mt);
-   return;
+   return false;
 }
 
 struct intel_mipmap_tree*
@@ -858,24 +1059,14 @@
 {
    struct intel_mipmap_tree *mt;
    uint32_t depth = 1;
-   bool ok;
    GLenum target = num_samples > 1 ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D;
-   const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                                 MIPTREE_LAYOUT_TILING_ANY |
-                                 MIPTREE_LAYOUT_FOR_SCANOUT;
 
    mt = intel_miptree_create(brw, target, format, 0, 0,
                              width, height, depth, num_samples,
-                             layout_flags);
+                             MIPTREE_CREATE_BUSY);
    if (!mt)
       goto fail;
 
-   if (intel_miptree_wants_hiz_buffer(brw, mt)) {
-      ok = intel_miptree_alloc_hiz(brw, mt);
-      if (!ok)
-         goto fail;
-   }
-
    return mt;
 
 fail:
@@ -901,17 +1092,14 @@
 }
 
 static void
-intel_miptree_hiz_buffer_free(struct intel_miptree_hiz_buffer *hiz_buf)
+intel_miptree_aux_buffer_free(struct intel_miptree_aux_buffer *aux_buf)
 {
-   if (hiz_buf == NULL)
+   if (aux_buf == NULL)
       return;
 
-   if (hiz_buf->mt)
-      intel_miptree_release(&hiz_buf->mt);
-   else
-      brw_bo_unreference(hiz_buf->aux_base.bo);
+   brw_bo_unreference(aux_buf->bo);
 
-   free(hiz_buf);
+   free(aux_buf);
 }
 
 void
@@ -929,13 +1117,9 @@
       brw_bo_unreference((*mt)->bo);
       intel_miptree_release(&(*mt)->stencil_mt);
       intel_miptree_release(&(*mt)->r8stencil_mt);
-      intel_miptree_hiz_buffer_free((*mt)->hiz_buf);
-      if ((*mt)->mcs_buf) {
-         brw_bo_unreference((*mt)->mcs_buf->bo);
-         free((*mt)->mcs_buf);
-      }
-      intel_resolve_map_clear(&(*mt)->hiz_map);
-      intel_resolve_map_clear(&(*mt)->color_resolve_map);
+      intel_miptree_aux_buffer_free((*mt)->hiz_buf);
+      intel_miptree_aux_buffer_free((*mt)->mcs_buf);
+      free_aux_state_map((*mt)->aux_state);
 
       intel_miptree_release(&(*mt)->plane[0]);
       intel_miptree_release(&(*mt)->plane[1]);
@@ -1019,73 +1203,18 @@
    if (mt->target == GL_TEXTURE_CUBE_MAP)
       depth = 6;
 
-   int level_depth = mt->level[level].depth;
-   if (mt->num_samples > 1) {
-      switch (mt->msaa_layout) {
-      case INTEL_MSAA_LAYOUT_NONE:
-      case INTEL_MSAA_LAYOUT_IMS:
-         break;
-      case INTEL_MSAA_LAYOUT_UMS:
-      case INTEL_MSAA_LAYOUT_CMS:
-         level_depth /= mt->num_samples;
-         break;
-      }
-   }
-
-   /* Test image dimensions against the base level image adjusted for
-    * minification.  This will also catch images not present in the
-    * tree, changed targets, etc.
-    */
-   if (width != minify(mt->logical_width0, level - mt->first_level) ||
-       height != minify(mt->logical_height0, level - mt->first_level) ||
-       depth != level_depth) {
-      return false;
-   }
-
-   if (image->NumSamples != mt->num_samples)
+   if (level >= mt->surf.levels)
       return false;
 
-   return true;
-}
+   const unsigned level_depth =
+      mt->surf.dim == ISL_SURF_DIM_3D ?
+         minify(mt->surf.logical_level0_px.depth, level) :
+         mt->surf.logical_level0_px.array_len;
 
-
-void
-intel_miptree_set_level_info(struct intel_mipmap_tree *mt,
-			     GLuint level,
-			     GLuint x, GLuint y, GLuint d)
-{
-   mt->level[level].depth = d;
-   mt->level[level].level_x = x;
-   mt->level[level].level_y = y;
-
-   DBG("%s level %d, depth %d, offset %d,%d\n", __func__,
-       level, d, x, y);
-
-   assert(mt->level[level].slice == NULL);
-
-   mt->level[level].slice = calloc(d, sizeof(*mt->level[0].slice));
-   mt->level[level].slice[0].x_offset = mt->level[level].level_x;
-   mt->level[level].slice[0].y_offset = mt->level[level].level_y;
-}
-
-
-void
-intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
-			       GLuint level, GLuint img,
-			       GLuint x, GLuint y)
-{
-   if (img == 0 && level == 0)
-      assert(x == 0 && y == 0);
-
-   assert(img < mt->level[level].depth);
-
-   mt->level[level].slice[img].x_offset = mt->level[level].level_x + x;
-   mt->level[level].slice[img].y_offset = mt->level[level].level_y + y;
-
-   DBG("%s level %d img %d pos %d,%d\n",
-       __func__, level, img,
-       mt->level[level].slice[img].x_offset,
-       mt->level[level].slice[img].y_offset);
+   return width == minify(mt->surf.logical_level0_px.width, level) &&
+          height == minify(mt->surf.logical_level0_px.height, level) &&
+          depth == level_depth &&
+          MAX2(image->NumSamples, 1) == mt->surf.samples;
 }
 
 void
@@ -1093,10 +1222,34 @@
 			       GLuint level, GLuint slice,
 			       GLuint *x, GLuint *y)
 {
-   assert(slice < mt->level[level].depth);
+   if (level == 0 && slice == 0) {
+      *x = mt->level[0].level_x;
+      *y = mt->level[0].level_y;
+      return;
+   }
 
-   *x = mt->level[level].slice[slice].x_offset;
-   *y = mt->level[level].slice[slice].y_offset;
+   uint32_t x_offset_sa, y_offset_sa;
+
+   /* Miptree itself can have an offset only if it represents a single
+    * slice in an imported buffer object.
+    * See intel_miptree_create_for_dri_image().
+    */
+   assert(mt->level[0].level_x == 0);
+   assert(mt->level[0].level_y == 0);
+
+   /* Given level is relative to level zero while the miptree may
+    * represent just a subset of all levels starting from 'first_level'.
+    */
+   assert(level >= mt->first_level);
+   level -= mt->first_level;
+
+   const unsigned z = mt->surf.dim == ISL_SURF_DIM_3D ? slice : 0;
+   slice = mt->surf.dim == ISL_SURF_DIM_3D ? 0 : slice;
+   isl_surf_get_image_offset_el(&mt->surf, level, slice, z,
+                                &x_offset_sa, &y_offset_sa);
+
+   *x = x_offset_sa;
+   *y = y_offset_sa;
 }
 
 
@@ -1106,19 +1259,19 @@
  * and tile_h is set to 1.
  */
 void
-intel_get_tile_dims(uint32_t tiling, uint32_t cpp,
+intel_get_tile_dims(enum isl_tiling tiling, uint32_t cpp,
                     uint32_t *tile_w, uint32_t *tile_h)
 {
    switch (tiling) {
-   case I915_TILING_X:
+   case ISL_TILING_X:
       *tile_w = 512;
       *tile_h = 8;
       break;
-   case I915_TILING_Y:
+   case ISL_TILING_Y0:
       *tile_w = 128;
       *tile_h = 32;
       break;
-   case I915_TILING_NONE:
+   case ISL_TILING_LINEAR:
       *tile_w = cpp;
       *tile_h = 1;
       break;
@@ -1134,7 +1287,7 @@
  * untiled, the masks are set to 0.
  */
 void
-intel_get_tile_masks(uint32_t tiling, uint32_t cpp,
+intel_get_tile_masks(enum isl_tiling tiling, uint32_t cpp,
                      uint32_t *mask_x, uint32_t *mask_y)
 {
    uint32_t tile_w_bytes, tile_h;
@@ -1155,19 +1308,18 @@
                                  uint32_t x, uint32_t y)
 {
    int cpp = mt->cpp;
-   uint32_t pitch = mt->pitch;
-   uint32_t tiling = mt->tiling;
+   uint32_t pitch = mt->surf.row_pitch;
 
-   switch (tiling) {
+   switch (mt->surf.tiling) {
    default:
       unreachable("not reached");
-   case I915_TILING_NONE:
+   case ISL_TILING_LINEAR:
       return y * pitch + x * cpp;
-   case I915_TILING_X:
+   case ISL_TILING_X:
       assert((x % (512 / cpp)) == 0);
       assert((y % 8) == 0);
       return y * pitch + x / (512 / cpp) * 4096;
-   case I915_TILING_Y:
+   case ISL_TILING_Y0:
       assert((x % (128 / cpp)) == 0);
       assert((y % 32) == 0);
       return y * pitch + x / (128 / cpp) * 4096;
@@ -1193,7 +1345,7 @@
    uint32_t x, y;
    uint32_t mask_x, mask_y;
 
-   intel_get_tile_masks(mt->tiling, mt->cpp, &mask_x, &mask_y);
+   intel_get_tile_masks(mt->surf.tiling, mt->cpp, &mask_x, &mask_y);
    intel_miptree_get_image_offset(mt, level, slice, &x, &y);
 
    *tile_x = x & mask_x;
@@ -1204,26 +1356,25 @@
 
 static void
 intel_miptree_copy_slice_sw(struct brw_context *brw,
-                            struct intel_mipmap_tree *dst_mt,
                             struct intel_mipmap_tree *src_mt,
-                            int level,
-                            int slice,
-                            int width,
-                            int height)
+                            unsigned src_level, unsigned src_layer,
+                            struct intel_mipmap_tree *dst_mt,
+                            unsigned dst_level, unsigned dst_layer,
+                            unsigned width, unsigned height)
 {
    void *src, *dst;
    ptrdiff_t src_stride, dst_stride;
-   int cpp = dst_mt->cpp;
+   const unsigned cpp = (isl_format_get_layout(dst_mt->surf.format)->bpb / 8);
 
    intel_miptree_map(brw, src_mt,
-                     level, slice,
+                     src_level, src_layer,
                      0, 0,
                      width, height,
                      GL_MAP_READ_BIT | BRW_MAP_DIRECT_BIT,
                      &src, &src_stride);
 
    intel_miptree_map(brw, dst_mt,
-                     level, slice,
+                     dst_level, dst_layer,
                      0, 0,
                      width, height,
                      GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT |
@@ -1249,8 +1400,8 @@
       }
    }
 
-   intel_miptree_unmap(brw, dst_mt, level, slice);
-   intel_miptree_unmap(brw, src_mt, level, slice);
+   intel_miptree_unmap(brw, dst_mt, dst_level, dst_layer);
+   intel_miptree_unmap(brw, src_mt, src_level, src_layer);
 
    /* Don't forget to copy the stencil data over, too.  We could have skipped
     * passing BRW_MAP_DIRECT_BIT, but that would have meant intel_miptree_map
@@ -1259,31 +1410,30 @@
     */
    if (dst_mt->stencil_mt) {
       assert(src_mt->stencil_mt);
-      intel_miptree_copy_slice_sw(brw, dst_mt->stencil_mt, src_mt->stencil_mt,
-                                  level, slice, width, height);
+      intel_miptree_copy_slice_sw(brw,
+                                  src_mt->stencil_mt, src_level, src_layer,
+                                  dst_mt->stencil_mt, dst_level, dst_layer,
+                                  width, height);
    }
 }
 
-static void
+void
 intel_miptree_copy_slice(struct brw_context *brw,
-			 struct intel_mipmap_tree *dst_mt,
-			 struct intel_mipmap_tree *src_mt,
-			 int level,
-			 int face,
-			 int depth)
+                         struct intel_mipmap_tree *src_mt,
+                         unsigned src_level, unsigned src_layer,
+                         struct intel_mipmap_tree *dst_mt,
+                         unsigned dst_level, unsigned dst_layer)
 
 {
    mesa_format format = src_mt->format;
-   uint32_t width = minify(src_mt->physical_width0, level - src_mt->first_level);
-   uint32_t height = minify(src_mt->physical_height0, level - src_mt->first_level);
-   int slice;
+   unsigned width = minify(src_mt->surf.phys_level0_sa.width,
+                           src_level - src_mt->first_level);
+   unsigned height = minify(src_mt->surf.phys_level0_sa.height,
+                            src_level - src_mt->first_level);
 
-   if (face > 0)
-      slice = face;
-   else
-      slice = depth;
+   assert(src_layer < get_num_phys_layers(&src_mt->surf,
+                                          src_level - src_mt->first_level));
 
-   assert(depth < src_mt->level[level].depth);
    assert(src_mt->format == dst_mt->format);
 
    if (dst_mt->compressed) {
@@ -1299,31 +1449,35 @@
     */
    if (src_mt->stencil_mt) {
       intel_miptree_copy_slice_sw(brw,
-                                  dst_mt, src_mt,
-                                  level, slice,
+                                  src_mt, src_level, src_layer,
+                                  dst_mt, dst_level, dst_layer,
                                   width, height);
       return;
    }
 
    uint32_t dst_x, dst_y, src_x, src_y;
-   intel_miptree_get_image_offset(dst_mt, level, slice, &dst_x, &dst_y);
-   intel_miptree_get_image_offset(src_mt, level, slice, &src_x, &src_y);
+   intel_miptree_get_image_offset(dst_mt, dst_level, dst_layer,
+                                  &dst_x, &dst_y);
+   intel_miptree_get_image_offset(src_mt, src_level, src_layer,
+                                  &src_x, &src_y);
 
    DBG("validate blit mt %s %p %d,%d/%d -> mt %s %p %d,%d/%d (%dx%d)\n",
        _mesa_get_format_name(src_mt->format),
-       src_mt, src_x, src_y, src_mt->pitch,
+       src_mt, src_x, src_y, src_mt->surf.row_pitch,
        _mesa_get_format_name(dst_mt->format),
-       dst_mt, dst_x, dst_y, dst_mt->pitch,
+       dst_mt, dst_x, dst_y, dst_mt->surf.row_pitch,
        width, height);
 
    if (!intel_miptree_blit(brw,
-                           src_mt, level, slice, 0, 0, false,
-                           dst_mt, level, slice, 0, 0, false,
+                           src_mt, src_level, src_layer, 0, 0, false,
+                           dst_mt, dst_level, dst_layer, 0, 0, false,
                            width, height, GL_COPY)) {
       perf_debug("miptree validate blit for %s failed\n",
                  _mesa_get_format_name(format));
 
-      intel_miptree_copy_slice_sw(brw, dst_mt, src_mt, level, slice,
+      intel_miptree_copy_slice_sw(brw,
+                                  src_mt, src_level, src_layer,
+                                  dst_mt, dst_level, dst_layer,
                                   width, height);
    }
 }
@@ -1346,17 +1500,28 @@
    struct intel_texture_object *intel_obj =
       intel_texture_object(intelImage->base.Base.TexObject);
    int level = intelImage->base.Base.Level;
-   int face = intelImage->base.Base.Face;
+   const unsigned face = intelImage->base.Base.Face;
+   unsigned start_layer, end_layer;
 
-   GLuint depth;
-   if (intel_obj->base.Target == GL_TEXTURE_1D_ARRAY)
-      depth = intelImage->base.Base.Height;
-   else
-      depth = intelImage->base.Base.Depth;
+   if (intel_obj->base.Target == GL_TEXTURE_1D_ARRAY) {
+      assert(face == 0);
+      assert(intelImage->base.Base.Height);
+      start_layer = 0;
+      end_layer = intelImage->base.Base.Height - 1;
+   } else if (face > 0) {
+      start_layer = face;
+      end_layer = face;
+   } else {
+      assert(intelImage->base.Base.Depth);
+      start_layer = 0;
+      end_layer = intelImage->base.Base.Depth - 1;
+   }
 
    if (!invalidate) {
-      for (int slice = 0; slice < depth; slice++) {
-         intel_miptree_copy_slice(brw, dst_mt, src_mt, level, face, slice);
+      for (unsigned i = start_layer; i <= end_layer; i++) {
+         intel_miptree_copy_slice(brw,
+                                  src_mt, level, i,
+                                  dst_mt, level, i);
       }
    }
 
@@ -1381,64 +1546,45 @@
     *
     * Note: the clear value for MCS buffers is all 1's, so we memset to 0xff.
     */
-   const int ret = brw_bo_map_gtt(brw, mt->mcs_buf->bo);
-   if (unlikely(ret)) {
+   void *map = brw_bo_map(brw, mt->mcs_buf->bo, MAP_WRITE);
+   if (unlikely(map == NULL)) {
       fprintf(stderr, "Failed to map mcs buffer into GTT\n");
       brw_bo_unreference(mt->mcs_buf->bo);
       free(mt->mcs_buf);
       return;
    }
-   void *data = mt->mcs_buf->bo->virtual;
+   void *data = map;
    memset(data, init_value, mt->mcs_buf->size);
    brw_bo_unmap(mt->mcs_buf->bo);
 }
 
 static struct intel_miptree_aux_buffer *
-intel_mcs_miptree_buf_create(struct brw_context *brw,
-                             struct intel_mipmap_tree *mt,
-                             mesa_format format,
-                             unsigned mcs_width,
-                             unsigned mcs_height,
-                             uint32_t layout_flags)
+intel_alloc_aux_buffer(struct brw_context *brw,
+                       const char *name,
+                       const struct isl_surf *aux_surf,
+                       uint32_t alloc_flags,
+                       struct intel_mipmap_tree *mt)
 {
    struct intel_miptree_aux_buffer *buf = calloc(sizeof(*buf), 1);
-   struct intel_mipmap_tree *temp_mt;
-
    if (!buf)
-      return NULL;
+      return false;
 
-   /* From the Ivy Bridge PRM, Vol4 Part1 p76, "MCS Base Address":
-    *
-    *     "The MCS surface must be stored as Tile Y."
+   buf->size = aux_surf->size;
+   buf->pitch = aux_surf->row_pitch;
+   buf->qpitch = isl_surf_get_array_pitch_sa_rows(aux_surf);
+
+   /* ISL has a stricter set of alignment rules than the drm allocator.
+    * Therefore one can pass the ISL dimensions in terms of bytes instead of
+    * trying to recalculate based on different format block sizes.
     */
-   layout_flags |= MIPTREE_LAYOUT_TILING_Y;
-   temp_mt = miptree_create(brw,
-                            mt->target,
-                            format,
-                            mt->first_level,
-                            mt->last_level,
-                            mcs_width,
-                            mcs_height,
-                            mt->logical_depth0,
-                            0 /* num_samples */,
-                            layout_flags);
-   if (!temp_mt) {
+   buf->bo = brw_bo_alloc_tiled(brw->bufmgr, name, buf->size,
+                                I915_TILING_Y, buf->pitch, alloc_flags);
+   if (!buf->bo) {
       free(buf);
       return NULL;
    }
 
-   buf->bo = temp_mt->bo;
-   buf->offset = temp_mt->offset;
-   buf->size = temp_mt->total_height * temp_mt->pitch;
-   buf->pitch = temp_mt->pitch;
-   buf->qpitch = temp_mt->qpitch;
-
-   /* Just hang on to the BO which backs the AUX buffer; the rest of the miptree
-    * structure should go away. We use miptree create simply as a means to make
-    * sure all the constraints for the buffer are satisfied.
-    */
-   brw_bo_reference(temp_mt->bo);
-   intel_miptree_release(&temp_mt);
+   buf->surf = *aux_surf;
 
    return buf;
 }
@@ -1450,127 +1596,84 @@
 {
    assert(brw->gen >= 7); /* MCS only used on Gen7+ */
    assert(mt->mcs_buf == NULL);
-   assert((mt->aux_disable & INTEL_AUX_DISABLE_MCS) == 0);
-
-   /* Choose the correct format for the MCS buffer.  All that really matters
-    * is that we allocate the right buffer size, since we'll always be
-    * accessing this miptree using MCS-specific hardware mechanisms, which
-    * infer the correct format based on num_samples.
-    */
-   mesa_format format;
-   switch (num_samples) {
-   case 2:
-   case 4:
-      /* 8 bits/pixel are required for MCS data when using 4x MSAA (2 bits for
-       * each sample).
-       */
-      format = MESA_FORMAT_R_UNORM8;
-      break;
-   case 8:
-      /* 32 bits/pixel are required for MCS data when using 8x MSAA (3 bits
-       * for each sample, plus 8 padding bits).
-       */
-      format = MESA_FORMAT_R_UINT32;
-      break;
-   case 16:
-      /* 64 bits/pixel are required for MCS data when using 16x MSAA (4 bits
-       * for each sample).
-       */
-      format = MESA_FORMAT_RG_UINT32;
-      break;
-   default:
-      unreachable("Unrecognized sample count in intel_miptree_alloc_mcs");
-   };
-
-   mt->mcs_buf =
-      intel_mcs_miptree_buf_create(brw, mt,
-                                   format,
-                                   mt->logical_width0,
-                                   mt->logical_height0,
-                                   MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
-   if (!mt->mcs_buf)
-      return false;
-
-   intel_miptree_init_mcs(brw, mt, 0xFF);
+   assert(mt->aux_usage == ISL_AUX_USAGE_MCS);
 
    /* Multisampled miptrees are only supported for single level. */
    assert(mt->first_level == 0);
-   intel_miptree_set_fast_clear_state(brw, mt, mt->first_level, 0,
-                                      mt->logical_depth0,
-                                      INTEL_FAST_CLEAR_STATE_CLEAR);
+   enum isl_aux_state **aux_state =
+      create_aux_state_map(mt, ISL_AUX_STATE_CLEAR);
+   if (!aux_state)
+      return false;
+
+   struct isl_surf temp_mcs_surf;
+
+   MAYBE_UNUSED bool ok =
+      isl_surf_get_mcs_surf(&brw->isl_dev, &mt->surf, &temp_mcs_surf);
+   assert(ok);
+
+   /* Buffer needs to be initialised requiring the buffer to be immediately
+    * mapped to cpu space for writing. Therefore do not use the gpu access
+    * flag which can cause an unnecessary delay if the backing pages happened
+    * to be just used by the GPU.
+    */
+   const uint32_t alloc_flags = 0;
+   mt->mcs_buf = intel_alloc_aux_buffer(brw, "mcs-miptree",
+                                        &temp_mcs_surf, alloc_flags, mt);
+   if (!mt->mcs_buf) {
+      free(aux_state);
+      return false;
+   }
+
+   mt->aux_state = aux_state;
+
+   intel_miptree_init_mcs(brw, mt, 0xFF);
 
    return true;
 }
 
-
 bool
-intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
-                                 struct intel_mipmap_tree *mt,
-                                 bool is_lossless_compressed)
+intel_miptree_alloc_ccs(struct brw_context *brw,
+                        struct intel_mipmap_tree *mt)
 {
    assert(mt->mcs_buf == NULL);
-   assert(!(mt->aux_disable & (INTEL_AUX_DISABLE_MCS | INTEL_AUX_DISABLE_CCS)));
+   assert(mt->aux_usage == ISL_AUX_USAGE_CCS_E ||
+          mt->aux_usage == ISL_AUX_USAGE_CCS_D);
 
-   struct isl_surf temp_main_surf;
    struct isl_surf temp_ccs_surf;
 
-   /* Create first an ISL presentation for the main color surface and let ISL
-    * calculate equivalent CCS surface against it.
-    */
-   intel_miptree_get_isl_surf(brw, mt, &temp_main_surf);
-   if (!isl_surf_get_ccs_surf(&brw->isl_dev, &temp_main_surf, &temp_ccs_surf))
+   if (!isl_surf_get_ccs_surf(&brw->isl_dev, &mt->surf, &temp_ccs_surf, 0))
       return false;
 
    assert(temp_ccs_surf.size &&
           (temp_ccs_surf.size % temp_ccs_surf.row_pitch == 0));
 
-   struct intel_miptree_aux_buffer *buf = calloc(sizeof(*buf), 1);
-   if (!buf)
+   enum isl_aux_state **aux_state =
+      create_aux_state_map(mt, ISL_AUX_STATE_PASS_THROUGH);
+   if (!aux_state)
       return false;
 
-   buf->size = temp_ccs_surf.size;
-   buf->pitch = temp_ccs_surf.row_pitch;
-   buf->qpitch = isl_surf_get_array_pitch_sa_rows(&temp_ccs_surf);
-
-   /* In case of compression mcs buffer needs to be initialised requiring the
-    * buffer to be immediately mapped to cpu space for writing. Therefore do
-    * not use the gpu access flag which can cause an unnecessary delay if the
-    * backing pages happened to be just used by the GPU.
+   /* When CCS_E is used, we need to ensure that the CCS starts off in a valid
+    * state.  From the Sky Lake PRM, "MCS Buffer for Render Target(s)":
+    *
+    *    "If Software wants to enable Color Compression without Fast clear,
+    *    Software needs to initialize MCS with zeros."
+    *
+    * A CCS value of 0 indicates that the corresponding block is in the
+    * pass-through state which is what we want.
+    *
+    * For CCS_D, on the other hand, we don't care as we're about to perform a
+    * fast-clear operation.  In that case, being hot in caches is more useful.
     */
-   const uint32_t alloc_flags =
-      is_lossless_compressed ? 0 : BO_ALLOC_FOR_RENDER;
-
-   /* ISL has stricter set of alignment rules then the drm allocator.
-    * Therefore one can pass the ISL dimensions in terms of bytes instead of
-    * trying to recalculate based on different format block sizes.
-    */
-   buf->bo = brw_bo_alloc_tiled(brw->bufmgr, "ccs-miptree",
-                                buf->pitch, buf->size / buf->pitch,
-                                1, I915_TILING_Y, &buf->pitch, alloc_flags);
-   if (!buf->bo) {
-      free(buf);
+   const uint32_t alloc_flags = mt->aux_usage == ISL_AUX_USAGE_CCS_E ?
+                                BO_ALLOC_ZEROED : BO_ALLOC_FOR_RENDER;
+   mt->mcs_buf = intel_alloc_aux_buffer(brw, "ccs-miptree",
+                                        &temp_ccs_surf, alloc_flags, mt);
+   if (!mt->mcs_buf) {
+      free(aux_state);
       return false;
    }
-
-   mt->mcs_buf = buf;
-
-   /* From Gen9 onwards single-sampled (non-msrt) auxiliary buffers are
-    * used for lossless compression which requires similar initialisation
-    * as multi-sample compression.
-    */
-   if (is_lossless_compressed) {
-      /* Hardware sets the auxiliary buffer to all zeroes when it does full
-       * resolve. Initialize it accordingly in case the first renderer is
-       * cpu (or other none compression aware party).
-       *
-       * This is also explicitly stated in the spec (MCS Buffer for Render
-       * Target(s)):
-       *   "If Software wants to enable Color Compression without Fast clear,
-       *    Software needs to initialize MCS with zeros."
-       */
-      intel_miptree_init_mcs(brw, mt, 0);
-      mt->msaa_layout = INTEL_MSAA_LAYOUT_CMS;
-   }
+  
+   mt->aux_state = aux_state;
 
    return true;
 }
@@ -1586,10 +1689,11 @@
                                uint32_t level)
 {
    assert(mt->hiz_buf);
+   assert(mt->surf.size > 0);
 
    if (brw->gen >= 8 || brw->is_haswell) {
-      uint32_t width = minify(mt->physical_width0, level);
-      uint32_t height = minify(mt->physical_height0, level);
+      uint32_t width = minify(mt->surf.phys_level0_sa.width, level);
+      uint32_t height = minify(mt->surf.phys_level0_sa.height, level);
 
       /* Disable HiZ for LOD > 0 unless the width is 8 aligned
        * and the height is 4 aligned. This allows our HiZ support
@@ -1608,284 +1712,88 @@
    return true;
 }
 
-
-/**
- * Helper for intel_miptree_alloc_hiz() that determines the required hiz
- * buffer dimensions and allocates a bo for the hiz buffer.
- */
-static struct intel_miptree_hiz_buffer *
-intel_gen7_hiz_buf_create(struct brw_context *brw,
-                          struct intel_mipmap_tree *mt)
-{
-   unsigned z_width = mt->logical_width0;
-   unsigned z_height = mt->logical_height0;
-   const unsigned z_depth = MAX2(mt->logical_depth0, 1);
-   unsigned hz_width, hz_height;
-   struct intel_miptree_hiz_buffer *buf = calloc(sizeof(*buf), 1);
-
-   if (!buf)
-      return NULL;
-
-   /* Gen7 PRM Volume 2, Part 1, 11.5.3 "Hierarchical Depth Buffer" documents
-    * adjustments required for Z_Height and Z_Width based on multisampling.
-    */
-   switch (mt->num_samples) {
-   case 0:
-   case 1:
-      break;
-   case 2:
-   case 4:
-      z_width *= 2;
-      z_height *= 2;
-      break;
-   case 8:
-      z_width *= 4;
-      z_height *= 2;
-      break;
-   default:
-      unreachable("unsupported sample count");
-   }
-
-   const unsigned vertical_align = 8; /* 'j' in the docs */
-   const unsigned H0 = z_height;
-   const unsigned h0 = ALIGN(H0, vertical_align);
-   const unsigned h1 = ALIGN(minify(H0, 1), vertical_align);
-   const unsigned Z0 = z_depth;
-
-   /* HZ_Width (bytes) = ceiling(Z_Width / 16) * 16 */
-   hz_width = ALIGN(z_width, 16);
-
-   if (mt->target == GL_TEXTURE_3D) {
-      unsigned H_i = H0;
-      unsigned Z_i = Z0;
-      hz_height = 0;
-      for (unsigned level = mt->first_level; level <= mt->last_level; ++level) {
-         unsigned h_i = ALIGN(H_i, vertical_align);
-         /* sum(i=0 to m; h_i * max(1, floor(Z_Depth/2**i))) */
-         hz_height += h_i * Z_i;
-         H_i = minify(H_i, 1);
-         Z_i = minify(Z_i, 1);
-      }
-      /* HZ_Height =
-       *    (1/2) * sum(i=0 to m; h_i * max(1, floor(Z_Depth/2**i)))
-       */
-      hz_height = DIV_ROUND_UP(hz_height, 2);
-   } else {
-      const unsigned hz_qpitch = h0 + h1 + (12 * vertical_align);
-      /* HZ_Height (rows) = Ceiling ( ( Q_pitch * Z_depth/2) /8 ) * 8 */
-      hz_height = DIV_ROUND_UP(hz_qpitch * Z0, 2 * 8) * 8;
-   }
-
-   buf->aux_base.bo = brw_bo_alloc_tiled(brw->bufmgr, "hiz",
-                                         hz_width, hz_height, 1,
-                                         I915_TILING_Y, &buf->aux_base.pitch,
-                                         BO_ALLOC_FOR_RENDER);
-   if (!buf->aux_base.bo) {
-      free(buf);
-      return NULL;
-   }
-
-   buf->aux_base.size = hz_width * hz_height;
-
-   return buf;
-}
-
-
-/**
- * Helper for intel_miptree_alloc_hiz() that determines the required hiz
- * buffer dimensions and allocates a bo for the hiz buffer.
- */
-static struct intel_miptree_hiz_buffer *
-intel_gen8_hiz_buf_create(struct brw_context *brw,
-                          struct intel_mipmap_tree *mt)
-{
-   unsigned z_width = mt->logical_width0;
-   unsigned z_height = mt->logical_height0;
-   const unsigned z_depth = MAX2(mt->logical_depth0, 1);
-   unsigned hz_width, hz_height;
-   struct intel_miptree_hiz_buffer *buf = calloc(sizeof(*buf), 1);
-
-   if (!buf)
-      return NULL;
-
-   /* Gen7 PRM Volume 2, Part 1, 11.5.3 "Hierarchical Depth Buffer" documents
-    * adjustments required for Z_Height and Z_Width based on multisampling.
-    */
-   if (brw->gen < 9) {
-      switch (mt->num_samples) {
-      case 0:
-      case 1:
-         break;
-      case 2:
-      case 4:
-         z_width *= 2;
-         z_height *= 2;
-         break;
-      case 8:
-         z_width *= 4;
-         z_height *= 2;
-         break;
-      default:
-         unreachable("unsupported sample count");
-      }
-   }
-
-   const unsigned vertical_align = 8; /* 'j' in the docs */
-   const unsigned H0 = z_height;
-   const unsigned h0 = ALIGN(H0, vertical_align);
-   const unsigned h1 = ALIGN(minify(H0, 1), vertical_align);
-   const unsigned Z0 = z_depth;
-
-   /* HZ_Width (bytes) = ceiling(Z_Width / 16) * 16 */
-   hz_width = ALIGN(z_width, 16);
-
-   unsigned H_i = H0;
-   unsigned Z_i = Z0;
-   unsigned sum_h_i = 0;
-   unsigned hz_height_3d_sum = 0;
-   for (unsigned level = mt->first_level; level <= mt->last_level; ++level) {
-      unsigned i = level - mt->first_level;
-      unsigned h_i = ALIGN(H_i, vertical_align);
-      /* sum(i=2 to m; h_i) */
-      if (i >= 2) {
-         sum_h_i += h_i;
-      }
-      /* sum(i=0 to m; h_i * max(1, floor(Z_Depth/2**i))) */
-      hz_height_3d_sum += h_i * Z_i;
-      H_i = minify(H_i, 1);
-      Z_i = minify(Z_i, 1);
-   }
-   /* HZ_QPitch = h0 + max(h1, sum(i=2 to m; h_i)) */
-   buf->aux_base.qpitch = h0 + MAX2(h1, sum_h_i);
-
-   if (mt->target == GL_TEXTURE_3D) {
-      /* (1/2) * sum(i=0 to m; h_i * max(1, floor(Z_Depth/2**i))) */
-      hz_height = DIV_ROUND_UP(hz_height_3d_sum, 2);
-   } else {
-      /* HZ_Height (rows) = ceiling( (HZ_QPitch/2)/8) *8 * Z_Depth */
-      hz_height = DIV_ROUND_UP(buf->aux_base.qpitch, 2 * 8) * 8 * Z0;
-   }
-
-   buf->aux_base.bo = brw_bo_alloc_tiled(brw->bufmgr, "hiz",
-                                         hz_width, hz_height, 1,
-                                         I915_TILING_Y, &buf->aux_base.pitch,
-                                         BO_ALLOC_FOR_RENDER);
-   if (!buf->aux_base.bo) {
-      free(buf);
-      return NULL;
-   }
-
-   buf->aux_base.size = hz_width * hz_height;
-
-   return buf;
-}
-
-
-static struct intel_miptree_hiz_buffer *
-intel_hiz_miptree_buf_create(struct brw_context *brw,
-                             struct intel_mipmap_tree *mt)
-{
-   struct intel_miptree_hiz_buffer *buf = calloc(sizeof(*buf), 1);
-   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
-
-   if (brw->gen == 6)
-      layout_flags |= MIPTREE_LAYOUT_GEN6_HIZ_STENCIL;
-
-   if (!buf)
-      return NULL;
-
-   layout_flags |= MIPTREE_LAYOUT_TILING_ANY;
-   buf->mt = intel_miptree_create(brw,
-                                  mt->target,
-                                  mt->format,
-                                  mt->first_level,
-                                  mt->last_level,
-                                  mt->logical_width0,
-                                  mt->logical_height0,
-                                  mt->logical_depth0,
-                                  mt->num_samples,
-                                  layout_flags);
-   if (!buf->mt) {
-      free(buf);
-      return NULL;
-   }
-
-   buf->aux_base.bo = buf->mt->bo;
-   buf->aux_base.size = buf->mt->total_height * buf->mt->pitch;
-   buf->aux_base.pitch = buf->mt->pitch;
-
-   /* On gen6 hiz is unconditionally laid out packing all slices
-    * at each level-of-detail (LOD). This means there is no valid qpitch
-    * setting. In fact, this is ignored when hardware is setup - there is no
-    * hardware qpitch setting of hiz on gen6.
-    */
-   buf->aux_base.qpitch = 0;
-
-   return buf;
-}
-
-bool
-intel_miptree_wants_hiz_buffer(struct brw_context *brw,
-                               struct intel_mipmap_tree *mt)
-{
-   if (!brw->has_hiz)
-      return false;
-
-   if (mt->hiz_buf != NULL)
-      return false;
-
-   if (mt->aux_disable & INTEL_AUX_DISABLE_HIZ)
-      return false;
-
-   switch (mt->format) {
-   case MESA_FORMAT_Z_FLOAT32:
-   case MESA_FORMAT_Z32_FLOAT_S8X24_UINT:
-   case MESA_FORMAT_Z24_UNORM_X8_UINT:
-   case MESA_FORMAT_Z24_UNORM_S8_UINT:
-   case MESA_FORMAT_Z_UNORM16:
-      return true;
-   default:
-      return false;
-   }
-}
-
 bool
 intel_miptree_alloc_hiz(struct brw_context *brw,
 			struct intel_mipmap_tree *mt)
 {
    assert(mt->hiz_buf == NULL);
-   assert((mt->aux_disable & INTEL_AUX_DISABLE_HIZ) == 0);
+   assert(mt->aux_usage == ISL_AUX_USAGE_HIZ);
 
-   if (brw->gen == 7) {
-      mt->hiz_buf = intel_gen7_hiz_buf_create(brw, mt);
-   } else if (brw->gen >= 8) {
-      mt->hiz_buf = intel_gen8_hiz_buf_create(brw, mt);
-   } else {
-      mt->hiz_buf = intel_hiz_miptree_buf_create(brw, mt);
-   }
-
-   if (!mt->hiz_buf)
+   enum isl_aux_state **aux_state =
+      create_aux_state_map(mt, ISL_AUX_STATE_AUX_INVALID);
+   if (!aux_state)
       return false;
 
-   /* Mark that all slices need a HiZ resolve. */
-   for (unsigned level = mt->first_level; level <= mt->last_level; ++level) {
-      if (!intel_miptree_level_enable_hiz(brw, mt, level))
-         continue;
+   struct isl_surf temp_hiz_surf;
 
-      for (unsigned layer = 0; layer < mt->level[level].depth; ++layer) {
-         struct intel_resolve_map *m = malloc(sizeof(struct intel_resolve_map));
-         exec_node_init(&m->link);
-         m->level = level;
-         m->layer = layer;
-         m->need = BLORP_HIZ_OP_HIZ_RESOLVE;
+   MAYBE_UNUSED bool ok =
+      isl_surf_get_hiz_surf(&brw->isl_dev, &mt->surf, &temp_hiz_surf);
+   assert(ok);
 
-         exec_list_push_tail(&mt->hiz_map, &m->link);
-      }
+   const uint32_t alloc_flags = BO_ALLOC_FOR_RENDER;
+   mt->hiz_buf = intel_alloc_aux_buffer(brw, "hiz-miptree",
+                                        &temp_hiz_surf, alloc_flags, mt);
+
+   if (!mt->hiz_buf) {
+      free(aux_state);
+      return false;
    }
 
+   for (unsigned level = mt->first_level; level <= mt->last_level; ++level)
+      intel_miptree_level_enable_hiz(brw, mt, level);
+
+   mt->aux_state = aux_state;
+
    return true;
 }
 
+
+/**
+ * Allocate the initial aux surface for a miptree based on mt->aux_usage
+ *
+ * Since MCS, HiZ, and CCS_E can compress more than just clear color, we
+ * create the auxiliary surfaces up-front.  CCS_D, on the other hand, can only
+ * compress clear color so we wait until an actual fast-clear to allocate it.
+ */
+static bool
+intel_miptree_alloc_aux(struct brw_context *brw,
+                        struct intel_mipmap_tree *mt)
+{
+   switch (mt->aux_usage) {
+   case ISL_AUX_USAGE_NONE:
+      return true;
+
+   case ISL_AUX_USAGE_HIZ:
+      assert(!_mesa_is_format_color_format(mt->format));
+      if (!intel_miptree_alloc_hiz(brw, mt))
+         return false;
+      return true;
+
+   case ISL_AUX_USAGE_MCS:
+      assert(_mesa_is_format_color_format(mt->format));
+      assert(mt->surf.samples > 1);
+      if (!intel_miptree_alloc_mcs(brw, mt, mt->surf.samples))
+         return false;
+      return true;
+
+   case ISL_AUX_USAGE_CCS_D:
+      /* CCS_D can only compress clear color, so we wait until an actual
+       * fast-clear to allocate it.
+       */
+      return true;
+
+   case ISL_AUX_USAGE_CCS_E:
+      assert(_mesa_is_format_color_format(mt->format));
+      assert(mt->surf.samples == 1);
+      if (!intel_miptree_alloc_ccs(brw, mt))
+         return false;
+      return true;
+   }
+
+   unreachable("Invalid aux usage");
+}
+
+
 /**
  * Can the miptree sample using the hiz buffer?
  */
@@ -1908,7 +1816,7 @@
     * mipmap levels aren't available in the HiZ buffer. So we need all levels
     * of the texture to be HiZ enabled.
     */
-   for (unsigned level = mt->first_level; level <= mt->last_level; ++level) {
+   for (unsigned level = 0; level < mt->surf.levels; ++level) {
       if (!intel_miptree_level_has_hiz(mt, level))
          return false;
    }
@@ -1925,7 +1833,7 @@
     * There is no such blurb for 1D textures, but there is sufficient evidence
     * that this is broken on SKL+.
     */
-   return (mt->num_samples <= 1 &&
+   return (mt->surf.samples == 1 &&
            mt->target != GL_TEXTURE_3D &&
            mt->target != GL_TEXTURE_1D /* gen9+ restriction */);
 }
@@ -1934,137 +1842,72 @@
  * Does the miptree slice have hiz enabled?
  */
 bool
-intel_miptree_level_has_hiz(struct intel_mipmap_tree *mt, uint32_t level)
+intel_miptree_level_has_hiz(const struct intel_mipmap_tree *mt, uint32_t level)
 {
    intel_miptree_check_level_layer(mt, level, 0);
    return mt->level[level].has_hiz;
 }
 
-void
-intel_miptree_slice_set_needs_hiz_resolve(struct intel_mipmap_tree *mt,
-					  uint32_t level,
-					  uint32_t layer)
+static inline uint32_t
+miptree_level_range_length(const struct intel_mipmap_tree *mt,
+                           uint32_t start_level, uint32_t num_levels)
 {
-   if (!intel_miptree_level_has_hiz(mt, level))
-      return;
+   assert(start_level >= mt->first_level);
+   assert(start_level <= mt->last_level);
 
-   intel_resolve_map_set(&mt->hiz_map,
-			 level, layer, BLORP_HIZ_OP_HIZ_RESOLVE);
+   if (num_levels == INTEL_REMAINING_LAYERS)
+      num_levels = mt->last_level - start_level + 1;
+   /* Check for overflow */
+   assert(start_level + num_levels >= start_level);
+   assert(start_level + num_levels <= mt->last_level + 1);
+
+   return num_levels;
 }
 
-
-void
-intel_miptree_slice_set_needs_depth_resolve(struct intel_mipmap_tree *mt,
-                                            uint32_t level,
-                                            uint32_t layer)
+static inline uint32_t
+miptree_layer_range_length(const struct intel_mipmap_tree *mt, uint32_t level,
+                           uint32_t start_layer, uint32_t num_layers)
 {
-   if (!intel_miptree_level_has_hiz(mt, level))
-      return;
+   assert(level <= mt->last_level);
 
-   intel_resolve_map_set(&mt->hiz_map,
-			 level, layer, BLORP_HIZ_OP_DEPTH_RESOLVE);
+   const uint32_t total_num_layers = get_num_logical_layers(mt, level);
+   assert(start_layer < total_num_layers);
+   if (num_layers == INTEL_REMAINING_LAYERS)
+      num_layers = total_num_layers - start_layer;
+   /* Check for overflow */
+   assert(start_layer + num_layers >= start_layer);
+   assert(start_layer + num_layers <= total_num_layers);
+
+   return num_layers;
 }
 
-void
-intel_miptree_set_all_slices_need_depth_resolve(struct intel_mipmap_tree *mt,
-                                                uint32_t level)
+bool
+intel_miptree_has_color_unresolved(const struct intel_mipmap_tree *mt,
+                                   unsigned start_level, unsigned num_levels,
+                                   unsigned start_layer, unsigned num_layers)
 {
-   uint32_t layer;
-   uint32_t end_layer = mt->level[level].depth;
+   assert(_mesa_is_format_color_format(mt->format));
 
-   for (layer = 0; layer < end_layer; layer++) {
-      intel_miptree_slice_set_needs_depth_resolve(mt, level, layer);
-   }
-}
-
-static bool
-intel_miptree_slice_resolve(struct brw_context *brw,
-			    struct intel_mipmap_tree *mt,
-			    uint32_t level,
-			    uint32_t layer,
-			    enum blorp_hiz_op need)
-{
-   intel_miptree_check_level_layer(mt, level, layer);
-
-   struct intel_resolve_map *item =
-	 intel_resolve_map_get(&mt->hiz_map, level, layer);
-
-   if (!item || item->need != need)
+   if (!mt->mcs_buf)
       return false;
 
-   intel_hiz_exec(brw, mt, level, layer, 1, need);
-   intel_resolve_map_remove(item);
-   return true;
-}
+   /* Clamp the level range to fit the miptree */
+   num_levels = miptree_level_range_length(mt, start_level, num_levels);
 
-bool
-intel_miptree_slice_resolve_hiz(struct brw_context *brw,
-				struct intel_mipmap_tree *mt,
-				uint32_t level,
-				uint32_t layer)
-{
-   return intel_miptree_slice_resolve(brw, mt, level, layer,
-				      BLORP_HIZ_OP_HIZ_RESOLVE);
-}
-
-bool
-intel_miptree_slice_resolve_depth(struct brw_context *brw,
-				  struct intel_mipmap_tree *mt,
-				  uint32_t level,
-				  uint32_t layer)
-{
-   return intel_miptree_slice_resolve(brw, mt, level, layer,
-				      BLORP_HIZ_OP_DEPTH_RESOLVE);
-}
-
-static bool
-intel_miptree_all_slices_resolve(struct brw_context *brw,
-				 struct intel_mipmap_tree *mt,
-				 enum blorp_hiz_op need)
-{
-   bool did_resolve = false;
-
-   foreach_list_typed_safe(struct intel_resolve_map, map, link, &mt->hiz_map) {
-      if (map->need != need)
-	 continue;
-
-      intel_hiz_exec(brw, mt, map->level, map->layer, 1, need);
-      intel_resolve_map_remove(map);
-      did_resolve = true;
+   for (uint32_t l = 0; l < num_levels; l++) {
+      const uint32_t level = start_level + l;
+      const uint32_t level_layers =
+         miptree_layer_range_length(mt, level, start_layer, num_layers);
+      for (unsigned a = 0; a < level_layers; a++) {
+         enum isl_aux_state aux_state =
+            intel_miptree_get_aux_state(mt, level, start_layer + a);
+         assert(aux_state != ISL_AUX_STATE_AUX_INVALID);
+         if (aux_state != ISL_AUX_STATE_PASS_THROUGH)
+            return true;
+      }
    }
 
-   return did_resolve;
-}
-
-bool
-intel_miptree_all_slices_resolve_hiz(struct brw_context *brw,
-				     struct intel_mipmap_tree *mt)
-{
-   return intel_miptree_all_slices_resolve(brw, mt,
-					   BLORP_HIZ_OP_HIZ_RESOLVE);
-}
-
-bool
-intel_miptree_all_slices_resolve_depth(struct brw_context *brw,
-				       struct intel_mipmap_tree *mt)
-{
-   return intel_miptree_all_slices_resolve(brw, mt,
-					   BLORP_HIZ_OP_DEPTH_RESOLVE);
-}
-
-enum intel_fast_clear_state
-intel_miptree_get_fast_clear_state(const struct intel_mipmap_tree *mt,
-                                   unsigned level, unsigned layer)
-{
-   intel_miptree_check_level_layer(mt, level, layer);
-
-   const struct intel_resolve_map *item =
-      intel_resolve_map_const_get(&mt->color_resolve_map, level, layer);
-
-   if (!item)
-      return INTEL_FAST_CLEAR_STATE_RESOLVED;
-
-   return item->fast_clear_state;
+   return false;
 }
 
 static void
@@ -2073,7 +1916,7 @@
                                   unsigned level, unsigned layer)
 {
 
-   if ((mt->aux_disable & INTEL_AUX_DISABLE_CCS) || !mt->mcs_buf)
+   if (!mt->mcs_buf)
       return;
 
    /* Fast color clear is supported for mipmapped surfaces only on Gen8+. */
@@ -2081,146 +1924,757 @@
           (level == 0 && mt->first_level == 0 && mt->last_level == 0));
 
    /* Compression of arrayed msaa surfaces is supported. */
-   if (mt->num_samples > 1)
+   if (mt->surf.samples > 1)
       return;
 
    /* Fast color clear is supported for non-msaa arrays only on Gen8+. */
-   assert(brw->gen >= 8 || (layer == 0 && mt->logical_depth0 == 1));
+   assert(brw->gen >= 8 ||
+          (layer == 0 &&
+           mt->surf.logical_level0_px.depth == 1 &&
+           mt->surf.logical_level0_px.array_len == 1));
 
    (void)level;
    (void)layer;
 }
 
-void
-intel_miptree_set_fast_clear_state(const struct brw_context *brw,
-                                   struct intel_mipmap_tree *mt,
-                                   unsigned level,
-                                   unsigned first_layer,
-                                   unsigned num_layers,
-                                   enum intel_fast_clear_state new_state)
+static enum blorp_fast_clear_op
+get_ccs_d_resolve_op(enum isl_aux_state aux_state,
+                     enum isl_aux_usage aux_usage,
+                     bool fast_clear_supported)
 {
-   /* Setting the state to resolved means removing the item from the list
-    * altogether.
-    */
-   assert(new_state != INTEL_FAST_CLEAR_STATE_RESOLVED);
+   assert(aux_usage == ISL_AUX_USAGE_NONE || aux_usage == ISL_AUX_USAGE_CCS_D);
 
-   intel_miptree_check_color_resolve(brw, mt, level, first_layer);
+   const bool ccs_supported = aux_usage == ISL_AUX_USAGE_CCS_D;
 
-   assert(first_layer + num_layers <= mt->physical_depth0);
+   assert(ccs_supported == fast_clear_supported);
 
-   for (unsigned i = 0; i < num_layers; i++)
-      intel_resolve_map_set(&mt->color_resolve_map, level,
-                            first_layer + i, new_state);
+   switch (aux_state) {
+   case ISL_AUX_STATE_CLEAR:
+   case ISL_AUX_STATE_PARTIAL_CLEAR:
+      if (!ccs_supported)
+         return BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
+      else
+         return BLORP_FAST_CLEAR_OP_NONE;
+
+   case ISL_AUX_STATE_PASS_THROUGH:
+      return BLORP_FAST_CLEAR_OP_NONE;
+
+   case ISL_AUX_STATE_RESOLVED:
+   case ISL_AUX_STATE_AUX_INVALID:
+   case ISL_AUX_STATE_COMPRESSED_CLEAR:
+   case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
+      break;
+   }
+
+   unreachable("Invalid aux state for CCS_D");
 }
 
-bool
-intel_miptree_has_color_unresolved(const struct intel_mipmap_tree *mt,
-                                   unsigned start_level, unsigned num_levels,
-                                   unsigned start_layer, unsigned num_layers)
+static enum blorp_fast_clear_op
+get_ccs_e_resolve_op(enum isl_aux_state aux_state,
+                     enum isl_aux_usage aux_usage,
+                     bool fast_clear_supported)
 {
-   return intel_resolve_map_find_any(&mt->color_resolve_map,
-                                     start_level, num_levels,
-                                     start_layer, num_layers) != NULL;
+   /* CCS_E surfaces can be accessed as CCS_D if we're careful. */
+   assert(aux_usage == ISL_AUX_USAGE_NONE ||
+          aux_usage == ISL_AUX_USAGE_CCS_D ||
+          aux_usage == ISL_AUX_USAGE_CCS_E);
+
+   if (aux_usage == ISL_AUX_USAGE_CCS_D)
+      assert(fast_clear_supported);
+
+   switch (aux_state) {
+   case ISL_AUX_STATE_CLEAR:
+   case ISL_AUX_STATE_PARTIAL_CLEAR:
+      if (fast_clear_supported)
+         return BLORP_FAST_CLEAR_OP_NONE;
+      else if (aux_usage == ISL_AUX_USAGE_CCS_E)
+         return BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL;
+      else
+         return BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
+
+   case ISL_AUX_STATE_COMPRESSED_CLEAR:
+      if (aux_usage != ISL_AUX_USAGE_CCS_E)
+         return BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
+      else if (!fast_clear_supported)
+         return BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL;
+      else
+         return BLORP_FAST_CLEAR_OP_NONE;
+
+   case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
+      if (aux_usage != ISL_AUX_USAGE_CCS_E)
+         return BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
+      else
+         return BLORP_FAST_CLEAR_OP_NONE;
+
+   case ISL_AUX_STATE_PASS_THROUGH:
+      return BLORP_FAST_CLEAR_OP_NONE;
+
+   case ISL_AUX_STATE_RESOLVED:
+   case ISL_AUX_STATE_AUX_INVALID:
+      break;
+   }
+
+   unreachable("Invalid aux state for CCS_E");
 }
 
-void
-intel_miptree_used_for_rendering(const struct brw_context *brw,
-                                 struct intel_mipmap_tree *mt, unsigned level,
-                                 unsigned start_layer, unsigned num_layers)
+static void
+intel_miptree_prepare_ccs_access(struct brw_context *brw,
+                                 struct intel_mipmap_tree *mt,
+                                 uint32_t level, uint32_t layer,
+                                 enum isl_aux_usage aux_usage,
+                                 bool fast_clear_supported)
 {
-   const bool is_lossless_compressed =
-      intel_miptree_is_lossless_compressed(brw, mt);
+   enum isl_aux_state aux_state = intel_miptree_get_aux_state(mt, level, layer);
 
-   for (unsigned i = 0; i < num_layers; ++i) {
-      const enum intel_fast_clear_state fast_clear_state =
-         intel_miptree_get_fast_clear_state(mt, level, start_layer + i);
+   enum blorp_fast_clear_op resolve_op;
+   if (mt->aux_usage == ISL_AUX_USAGE_CCS_E) {
+      resolve_op = get_ccs_e_resolve_op(aux_state, aux_usage,
+                                        fast_clear_supported);
+   } else {
+      assert(mt->aux_usage == ISL_AUX_USAGE_CCS_D);
+      resolve_op = get_ccs_d_resolve_op(aux_state, aux_usage,
+                                        fast_clear_supported);
+   }
 
-      /* If the buffer was previously in fast clear state, change it to
-       * unresolved state, since it won't be guaranteed to be clear after
-       * rendering occurs.
-       */
-      if (is_lossless_compressed ||
-          fast_clear_state == INTEL_FAST_CLEAR_STATE_CLEAR) {
-         intel_miptree_set_fast_clear_state(
-            brw, mt, level, start_layer + i, 1,
-            INTEL_FAST_CLEAR_STATE_UNRESOLVED);
+   if (resolve_op != BLORP_FAST_CLEAR_OP_NONE) {
+      intel_miptree_check_color_resolve(brw, mt, level, layer);
+      brw_blorp_resolve_color(brw, mt, level, layer, resolve_op);
+
+      switch (resolve_op) {
+      case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+         /* The CCS full resolve operation destroys the CCS and sets it to the
+          * pass-through state.  (You can also think of this as being both a
+          * resolve and an ambiguate in one operation.)
+          */
+         intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                     ISL_AUX_STATE_PASS_THROUGH);
+         break;
+
+      case BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL:
+         intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                     ISL_AUX_STATE_COMPRESSED_NO_CLEAR);
+         break;
+
+      default:
+         unreachable("Invalid resolve op");
       }
    }
 }
 
-static bool
-intel_miptree_needs_color_resolve(const struct brw_context *brw,
-                                  const struct intel_mipmap_tree *mt,
-                                  int flags)
+static void
+intel_miptree_finish_ccs_write(struct brw_context *brw,
+                               struct intel_mipmap_tree *mt,
+                               uint32_t level, uint32_t layer,
+                               enum isl_aux_usage aux_usage)
 {
-   if (mt->aux_disable & INTEL_AUX_DISABLE_CCS)
+   assert(aux_usage == ISL_AUX_USAGE_NONE ||
+          aux_usage == ISL_AUX_USAGE_CCS_D ||
+          aux_usage == ISL_AUX_USAGE_CCS_E);
+
+   enum isl_aux_state aux_state = intel_miptree_get_aux_state(mt, level, layer);
+
+   if (mt->aux_usage == ISL_AUX_USAGE_CCS_E) {
+      switch (aux_state) {
+      case ISL_AUX_STATE_CLEAR:
+      case ISL_AUX_STATE_PARTIAL_CLEAR:
+         assert(aux_usage == ISL_AUX_USAGE_CCS_E ||
+                aux_usage == ISL_AUX_USAGE_CCS_D);
+
+         if (aux_usage == ISL_AUX_USAGE_CCS_E) {
+            intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                        ISL_AUX_STATE_COMPRESSED_CLEAR);
+         } else if (aux_state != ISL_AUX_STATE_PARTIAL_CLEAR) {
+            intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                        ISL_AUX_STATE_PARTIAL_CLEAR);
+         }
+         break;
+
+      case ISL_AUX_STATE_COMPRESSED_CLEAR:
+      case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
+         assert(aux_usage == ISL_AUX_USAGE_CCS_E);
+         break; /* Nothing to do */
+
+      case ISL_AUX_STATE_PASS_THROUGH:
+         if (aux_usage == ISL_AUX_USAGE_CCS_E) {
+            intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                        ISL_AUX_STATE_COMPRESSED_NO_CLEAR);
+         } else {
+            /* Nothing to do */
+         }
+         break;
+
+      case ISL_AUX_STATE_RESOLVED:
+      case ISL_AUX_STATE_AUX_INVALID:
+         unreachable("Invalid aux state for CCS_E");
+      }
+   } else {
+      assert(mt->aux_usage == ISL_AUX_USAGE_CCS_D);
+      /* CCS_D is a bit simpler */
+      switch (aux_state) {
+      case ISL_AUX_STATE_CLEAR:
+         assert(aux_usage == ISL_AUX_USAGE_CCS_D);
+         intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                     ISL_AUX_STATE_PARTIAL_CLEAR);
+         break;
+
+      case ISL_AUX_STATE_PARTIAL_CLEAR:
+         assert(aux_usage == ISL_AUX_USAGE_CCS_D);
+         break; /* Nothing to do */
+
+      case ISL_AUX_STATE_PASS_THROUGH:
+         /* Nothing to do */
+         break;
+
+      case ISL_AUX_STATE_COMPRESSED_CLEAR:
+      case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
+      case ISL_AUX_STATE_RESOLVED:
+      case ISL_AUX_STATE_AUX_INVALID:
+         unreachable("Invalid aux state for CCS_D");
+      }
+   }
+}
+
+static void
+intel_miptree_prepare_mcs_access(struct brw_context *brw,
+                                 struct intel_mipmap_tree *mt,
+                                 uint32_t layer,
+                                 enum isl_aux_usage aux_usage,
+                                 bool fast_clear_supported)
+{
+   assert(aux_usage == ISL_AUX_USAGE_MCS);
+
+   switch (intel_miptree_get_aux_state(mt, 0, layer)) {
+   case ISL_AUX_STATE_CLEAR:
+   case ISL_AUX_STATE_COMPRESSED_CLEAR:
+      if (!fast_clear_supported) {
+         brw_blorp_mcs_partial_resolve(brw, mt, layer, 1);
+         intel_miptree_set_aux_state(brw, mt, 0, layer, 1,
+                                     ISL_AUX_STATE_COMPRESSED_NO_CLEAR);
+      }
+      break;
+
+   case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
+      break; /* Nothing to do */
+
+   case ISL_AUX_STATE_RESOLVED:
+   case ISL_AUX_STATE_PASS_THROUGH:
+   case ISL_AUX_STATE_AUX_INVALID:
+   case ISL_AUX_STATE_PARTIAL_CLEAR:
+      unreachable("Invalid aux state for MCS");
+   }
+}
+
+static void
+intel_miptree_finish_mcs_write(struct brw_context *brw,
+                               struct intel_mipmap_tree *mt,
+                               uint32_t layer,
+                               enum isl_aux_usage aux_usage)
+{
+   assert(aux_usage == ISL_AUX_USAGE_MCS);
+
+   switch (intel_miptree_get_aux_state(mt, 0, layer)) {
+   case ISL_AUX_STATE_CLEAR:
+      intel_miptree_set_aux_state(brw, mt, 0, layer, 1,
+                                  ISL_AUX_STATE_COMPRESSED_CLEAR);
+      break;
+
+   case ISL_AUX_STATE_COMPRESSED_CLEAR:
+   case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
+      break; /* Nothing to do */
+
+   case ISL_AUX_STATE_RESOLVED:
+   case ISL_AUX_STATE_PASS_THROUGH:
+   case ISL_AUX_STATE_AUX_INVALID:
+   case ISL_AUX_STATE_PARTIAL_CLEAR:
+      unreachable("Invalid aux state for MCS");
+   }
+}
+
+static void
+intel_miptree_prepare_hiz_access(struct brw_context *brw,
+                                 struct intel_mipmap_tree *mt,
+                                 uint32_t level, uint32_t layer,
+                                 enum isl_aux_usage aux_usage,
+                                 bool fast_clear_supported)
+{
+   assert(aux_usage == ISL_AUX_USAGE_NONE || aux_usage == ISL_AUX_USAGE_HIZ);
+
+   enum blorp_hiz_op hiz_op = BLORP_HIZ_OP_NONE;
+   switch (intel_miptree_get_aux_state(mt, level, layer)) {
+   case ISL_AUX_STATE_CLEAR:
+   case ISL_AUX_STATE_COMPRESSED_CLEAR:
+      if (aux_usage != ISL_AUX_USAGE_HIZ || !fast_clear_supported)
+         hiz_op = BLORP_HIZ_OP_DEPTH_RESOLVE;
+      break;
+
+   case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
+      if (aux_usage != ISL_AUX_USAGE_HIZ)
+         hiz_op = BLORP_HIZ_OP_DEPTH_RESOLVE;
+      break;
+
+   case ISL_AUX_STATE_PASS_THROUGH:
+   case ISL_AUX_STATE_RESOLVED:
+      break;
+
+   case ISL_AUX_STATE_AUX_INVALID:
+      if (aux_usage == ISL_AUX_USAGE_HIZ)
+         hiz_op = BLORP_HIZ_OP_HIZ_RESOLVE;
+      break;
+
+   case ISL_AUX_STATE_PARTIAL_CLEAR:
+      unreachable("Invalid HiZ state");
+   }
+
+   if (hiz_op != BLORP_HIZ_OP_NONE) {
+      intel_hiz_exec(brw, mt, level, layer, 1, hiz_op);
+
+      switch (hiz_op) {
+      case BLORP_HIZ_OP_DEPTH_RESOLVE:
+         intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                     ISL_AUX_STATE_RESOLVED);
+         break;
+
+      case BLORP_HIZ_OP_HIZ_RESOLVE:
+         /* The HiZ resolve operation is actually an ambiguate */
+         intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                     ISL_AUX_STATE_PASS_THROUGH);
+         break;
+
+      default:
+         unreachable("Invalid HiZ op");
+      }
+   }
+}
+
+static void
+intel_miptree_finish_hiz_write(struct brw_context *brw,
+                               struct intel_mipmap_tree *mt,
+                               uint32_t level, uint32_t layer,
+                               enum isl_aux_usage aux_usage)
+{
+   assert(aux_usage == ISL_AUX_USAGE_NONE || aux_usage == ISL_AUX_USAGE_HIZ);
+   /* Update the tracked aux state of this slice to reflect the write. */
+   switch (intel_miptree_get_aux_state(mt, level, layer)) {
+   case ISL_AUX_STATE_CLEAR: /* the write must have used HiZ (asserted) */
+      assert(aux_usage == ISL_AUX_USAGE_HIZ);
+      intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                  ISL_AUX_STATE_COMPRESSED_CLEAR);
+      break;
+
+   case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
+   case ISL_AUX_STATE_COMPRESSED_CLEAR:
+      assert(aux_usage == ISL_AUX_USAGE_HIZ);
+      break; /* Nothing to do */
+
+   case ISL_AUX_STATE_RESOLVED:
+      if (aux_usage == ISL_AUX_USAGE_HIZ) {
+         intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                     ISL_AUX_STATE_COMPRESSED_NO_CLEAR);
+      } else { /* written without HiZ: mark the aux data invalid */
+         intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                     ISL_AUX_STATE_AUX_INVALID);
+      }
+      break;
+
+   case ISL_AUX_STATE_PASS_THROUGH: /* only a HiZ write changes the state */
+      if (aux_usage == ISL_AUX_USAGE_HIZ) {
+         intel_miptree_set_aux_state(brw, mt, level, layer, 1,
+                                     ISL_AUX_STATE_COMPRESSED_NO_CLEAR);
+      }
+      break;
+
+   case ISL_AUX_STATE_AUX_INVALID: /* the state is left as it is */
+      assert(aux_usage != ISL_AUX_USAGE_HIZ);
+      break;
+
+   case ISL_AUX_STATE_PARTIAL_CLEAR: /* not a valid HiZ state */
+      unreachable("Invalid HiZ state");
+   }
+}
+
+void
+intel_miptree_prepare_access(struct brw_context *brw,
+                             struct intel_mipmap_tree *mt,
+                             uint32_t start_level, uint32_t num_levels,
+                             uint32_t start_layer, uint32_t num_layers,
+                             enum isl_aux_usage aux_usage,
+                             bool fast_clear_supported)
+{
+   num_levels = miptree_level_range_length(mt, start_level, num_levels);
+   /* Prepare each slice of the range according to the miptree's aux type. */
+   switch (mt->aux_usage) {
+   case ISL_AUX_USAGE_NONE:
+      /* Nothing to do */
+      break;
+
+   case ISL_AUX_USAGE_MCS:
+      assert(mt->mcs_buf);
+      assert(start_level == 0 && num_levels == 1); /* level 0 only */
+      const uint32_t level_layers =
+         miptree_layer_range_length(mt, 0, start_layer, num_layers);
+      for (uint32_t a = 0; a < level_layers; a++) {
+         intel_miptree_prepare_mcs_access(brw, mt, start_layer + a,
+                                          aux_usage, fast_clear_supported);
+      }
+      break;
+
+   case ISL_AUX_USAGE_CCS_D:
+   case ISL_AUX_USAGE_CCS_E:
+      if (!mt->mcs_buf)
+         return; /* no aux buffer: nothing to prepare */
+
+      for (uint32_t l = 0; l < num_levels; l++) {
+         const uint32_t level = start_level + l;
+         const uint32_t level_layers = /* layer count is computed per level */
+            miptree_layer_range_length(mt, level, start_layer, num_layers);
+         for (uint32_t a = 0; a < level_layers; a++) {
+            intel_miptree_prepare_ccs_access(brw, mt, level,
+                                             start_layer + a,
+                                             aux_usage, fast_clear_supported);
+         }
+      }
+      break;
+
+   case ISL_AUX_USAGE_HIZ:
+      assert(mt->hiz_buf);
+      for (uint32_t l = 0; l < num_levels; l++) {
+         const uint32_t level = start_level + l;
+         if (!intel_miptree_level_has_hiz(mt, level))
+            continue; /* skip levels without HiZ */
+
+         const uint32_t level_layers =
+            miptree_layer_range_length(mt, level, start_layer, num_layers);
+         for (uint32_t a = 0; a < level_layers; a++) {
+            intel_miptree_prepare_hiz_access(brw, mt, level, start_layer + a,
+                                             aux_usage, fast_clear_supported);
+         }
+      }
+      break;
+
+   default:
+      unreachable("Invalid aux usage");
+   }
+}
+
+void
+intel_miptree_finish_write(struct brw_context *brw,
+                           struct intel_mipmap_tree *mt, uint32_t level,
+                           uint32_t start_layer, uint32_t num_layers,
+                           enum isl_aux_usage aux_usage)
+{
+   num_layers = miptree_layer_range_length(mt, level, start_layer, num_layers);
+   /* Record the write in the aux state of every layer in the range. */
+   switch (mt->aux_usage) {
+   case ISL_AUX_USAGE_NONE:
+      /* Nothing to do */
+      break;
+
+   case ISL_AUX_USAGE_MCS:
+      assert(mt->mcs_buf);
+      for (uint32_t a = 0; a < num_layers; a++) {
+         intel_miptree_finish_mcs_write(brw, mt, start_layer + a,
+                                        aux_usage);
+      }
+      break;
+
+   case ISL_AUX_USAGE_CCS_D:
+   case ISL_AUX_USAGE_CCS_E:
+      if (!mt->mcs_buf)
+         return; /* no aux buffer: no state to update */
+
+      for (uint32_t a = 0; a < num_layers; a++) {
+         intel_miptree_finish_ccs_write(brw, mt, level, start_layer + a,
+                                        aux_usage);
+      }
+      break;
+
+   case ISL_AUX_USAGE_HIZ:
+      if (!intel_miptree_level_has_hiz(mt, level))
+         return; /* this level has no HiZ state to update */
+
+      for (uint32_t a = 0; a < num_layers; a++) {
+         intel_miptree_finish_hiz_write(brw, mt, level, start_layer + a,
+                                        aux_usage);
+      }
+      break;
+
+   default:
+      unreachable("Invalid aux usage");
+   }
+}
+
+enum isl_aux_state
+intel_miptree_get_aux_state(const struct intel_mipmap_tree *mt,
+                            uint32_t level, uint32_t layer)
+{
+   intel_miptree_check_level_layer(mt, level, layer);
+   /* Debug checks: the slice must belong to a miptree with aux data. */
+   if (_mesa_is_format_color_format(mt->format)) {
+      assert(mt->mcs_buf != NULL);
+      assert(mt->surf.samples == 1 ||
+             mt->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY);
+   } else if (mt->format == MESA_FORMAT_S_UINT8) {
+      unreachable("Cannot get aux state for stencil");
+   } else { /* any other format: the level must have HiZ */
+      assert(intel_miptree_level_has_hiz(mt, level));
+   }
+
+   return mt->aux_state[level][layer];
+}
+
+void
+intel_miptree_set_aux_state(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt, uint32_t level,
+                            uint32_t start_layer, uint32_t num_layers,
+                            enum isl_aux_state aux_state)
+{
+   num_layers = miptree_layer_range_length(mt, level, start_layer, num_layers);
+   /* Debug checks: the miptree must be one that tracks aux state. */
+   if (_mesa_is_format_color_format(mt->format)) {
+      assert(mt->mcs_buf != NULL);
+      assert(mt->surf.samples == 1 ||
+             mt->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY);
+   } else if (mt->format == MESA_FORMAT_S_UINT8) {
+      unreachable("Cannot set aux state for stencil");
+   } else { /* any other format: the level must have HiZ */
+      assert(intel_miptree_level_has_hiz(mt, level));
+   }
+   /* Store the same state for every layer of the range at this level. */
+   for (unsigned a = 0; a < num_layers; a++)
+      mt->aux_state[level][start_layer + a] = aux_state;
+}
+
+/* On Gen9 color buffers may be compressed by the hardware (lossless
+ * compression). There are, however, format restrictions, and care must be
+ * taken that the sampler engine is capable of re-interpreting a buffer in a
+ * format different from the one the buffer was originally written with.
+ *
+ * For example, SRGB formats are not compressible and the sampler engine isn't
+ * capable of treating RGBA_UNORM as SRGB_ALPHA. In such a case the underlying
+ * color buffer needs to be resolved so that the sampling surface can be
+ * sampled as non-compressed (i.e., without the auxiliary MCS buffer being
+ * set).  Returns true only for CCS_E miptrees with a compatible view_format.
+ */
+static bool
+can_texture_with_ccs(struct brw_context *brw,
+                     struct intel_mipmap_tree *mt,
+                     enum isl_format view_format)
+{
+   if (mt->aux_usage != ISL_AUX_USAGE_CCS_E)
       return false;
 
-   const bool is_lossless_compressed =
-      intel_miptree_is_lossless_compressed(brw, mt);
-
-   /* From gen9 onwards there is new compression scheme for single sampled
-    * surfaces called "lossless compressed". These don't need to be always
-    * resolved.
-    */
-   if ((flags & INTEL_MIPTREE_IGNORE_CCS_E) && is_lossless_compressed)
+   /* TODO: Replace with format_ccs_e_compat_with_miptree for better perf. */
+   if (!isl_formats_are_ccs_e_compatible(&brw->screen->devinfo,
+                                         mt->surf.format, view_format)) {
+      perf_debug("Incompatible sampling format (%s) for rbc (%s)\n",
+                 isl_format_get_layout(view_format)->name,
+                 _mesa_get_format_name(mt->format));
       return false;
-
-   /* Fast color clear resolves only make sense for non-MSAA buffers. */
-   if (mt->msaa_layout != INTEL_MSAA_LAYOUT_NONE && !is_lossless_compressed)
-      return false;
+   }
 
    return true;
 }
 
-bool
-intel_miptree_resolve_color(struct brw_context *brw,
-                            struct intel_mipmap_tree *mt, unsigned level,
-                            unsigned start_layer, unsigned num_layers,
-                            int flags)
+enum isl_aux_usage
+intel_miptree_texture_aux_usage(struct brw_context *brw,
+                                struct intel_mipmap_tree *mt,
+                                enum isl_format view_format)
 {
-   intel_miptree_check_color_resolve(brw, mt, level, start_layer);
+   switch (mt->aux_usage) { /* pick the aux usage sampling may use */
+   case ISL_AUX_USAGE_HIZ:
+      if (intel_miptree_sample_with_hiz(brw, mt))
+         return ISL_AUX_USAGE_HIZ;
+      break; /* otherwise falls through to ISL_AUX_USAGE_NONE below */
 
-   if (!intel_miptree_needs_color_resolve(brw, mt, flags))
-      return false;
+   case ISL_AUX_USAGE_MCS:
+      return ISL_AUX_USAGE_MCS;
 
-   /* Arrayed fast clear is only supported for gen8+. */
-   assert(brw->gen >= 8 || num_layers == 1);
-
-   bool resolved = false;
-   for (unsigned i = 0; i < num_layers; ++i) {
-      intel_miptree_check_level_layer(mt, level, start_layer + i);
-
-      struct intel_resolve_map *item =
-         intel_resolve_map_get(&mt->color_resolve_map, level,
-                               start_layer + i);
-
-      if (item) {
-         assert(item->fast_clear_state != INTEL_FAST_CLEAR_STATE_RESOLVED);
-
-         brw_blorp_resolve_color(brw, mt, level, start_layer);
-         intel_resolve_map_remove(item);
-         resolved = true;
+   case ISL_AUX_USAGE_CCS_D:
+   case ISL_AUX_USAGE_CCS_E:
+      if (!mt->mcs_buf) { /* no aux buffer: sample without aux */
+         assert(mt->aux_usage == ISL_AUX_USAGE_CCS_D);
+         return ISL_AUX_USAGE_NONE;
       }
+
+      /* If we don't have any unresolved color, report an aux usage of
+       * ISL_AUX_USAGE_NONE.  This way, texturing won't even look at the
+       * aux surface and we can save some bandwidth.
+       */
+      if (!intel_miptree_has_color_unresolved(mt, 0, INTEL_REMAINING_LEVELS,
+                                              0, INTEL_REMAINING_LAYERS))
+         return ISL_AUX_USAGE_NONE;
+
+      if (can_texture_with_ccs(brw, mt, view_format))
+         return ISL_AUX_USAGE_CCS_E;
+      break; /* CCS not usable for this view: ISL_AUX_USAGE_NONE below */
+
+   default:
+      break;
    }
 
-   return resolved;
+   return ISL_AUX_USAGE_NONE;
+}
+
+static bool
+isl_formats_are_fast_clear_compatible(enum isl_format a, enum isl_format b)
+{
+   /* On gen8 and earlier, the hardware was only capable of handling 0/1 clear
+    * values so sRGB curve application was a no-op for all fast-clearable
+    * formats.
+    *
+    * On gen9+, the hardware supports arbitrary clear values.  For sRGB clear
+    * values, the hardware interprets the floats, not as what would be
+    * returned from the sampler (or written by the shader), but as being
+    * between format conversion and sRGB curve application.  This means that
+    * we can switch between sRGB and UNORM without having to whack the clear
+    * color; formats are compatible if equal after the sRGB-to-linear mapping.
+    */
+   return isl_format_srgb_to_linear(a) == isl_format_srgb_to_linear(b);
+}
+
+static void
+intel_miptree_prepare_texture_slices(struct brw_context *brw,
+                                     struct intel_mipmap_tree *mt,
+                                     enum isl_format view_format,
+                                     uint32_t start_level, uint32_t num_levels,
+                                     uint32_t start_layer, uint32_t num_layers,
+                                     bool *aux_supported_out) /* optional */
+{
+   enum isl_aux_usage aux_usage =
+      intel_miptree_texture_aux_usage(brw, mt, view_format);
+   bool clear_supported = aux_usage != ISL_AUX_USAGE_NONE;
+
+   /* Clear color is specified as ints or floats and the conversion is done by
+    * the sampler.  If we have a texture view, we would have to perform the
+    * clear color conversion manually.  Just disable clear color.
+    */
+   if (!isl_formats_are_fast_clear_compatible(mt->surf.format, view_format))
+      clear_supported = false;
+
+   intel_miptree_prepare_access(brw, mt, start_level, num_levels,
+                                start_layer, num_layers,
+                                aux_usage, clear_supported);
+   if (aux_supported_out) /* report whether sampling will use aux */
+      *aux_supported_out = aux_usage != ISL_AUX_USAGE_NONE;
 }
 
 void
-intel_miptree_all_slices_resolve_color(struct brw_context *brw,
-                                       struct intel_mipmap_tree *mt,
-                                       int flags)
+intel_miptree_prepare_texture(struct brw_context *brw,
+                              struct intel_mipmap_tree *mt,
+                              enum isl_format view_format,
+                              bool *aux_supported_out) /* may be NULL */
 {
-   if (!intel_miptree_needs_color_resolve(brw, mt, flags))
-      return;
-      
-   foreach_list_typed_safe(struct intel_resolve_map, map, link,
-                           &mt->color_resolve_map) {
-      assert(map->fast_clear_state != INTEL_FAST_CLEAR_STATE_RESOLVED);
+   intel_miptree_prepare_texture_slices(brw, mt, view_format,
+                                        0, INTEL_REMAINING_LEVELS, /* levels */
+                                        0, INTEL_REMAINING_LAYERS, /* layers */
+                                        aux_supported_out);
+}
 
-      brw_blorp_resolve_color(brw, mt, map->level, map->layer);
-      intel_resolve_map_remove(map);
+void
+intel_miptree_prepare_image(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt)
+{
+   /* The data port understands no compression, so access without aux. */
+   intel_miptree_prepare_access(brw, mt, 0, INTEL_REMAINING_LEVELS,
+                                0, INTEL_REMAINING_LAYERS,
+                                ISL_AUX_USAGE_NONE, false);
+}
+
+void
+intel_miptree_prepare_fb_fetch(struct brw_context *brw,
+                               struct intel_mipmap_tree *mt, uint32_t level,
+                               uint32_t start_layer, uint32_t num_layers)
+{ /* Prepared like texturing from one level in the surface's own format. */
+   intel_miptree_prepare_texture_slices(brw, mt, mt->surf.format, level, 1,
+                                        start_layer, num_layers, NULL);
+}
+
+enum isl_aux_usage
+intel_miptree_render_aux_usage(struct brw_context *brw,
+                               struct intel_mipmap_tree *mt,
+                               enum isl_format render_format,
+                               bool blend_enabled)
+{ /* Select the aux usage for rendering to mt as render_format. */
+   switch (mt->aux_usage) {
+   case ISL_AUX_USAGE_MCS:
+      assert(mt->mcs_buf);
+      return ISL_AUX_USAGE_MCS;
+
+   case ISL_AUX_USAGE_CCS_D: /* usable only if the aux buffer exists */
+      return mt->mcs_buf ? ISL_AUX_USAGE_CCS_D : ISL_AUX_USAGE_NONE;
+
+   case ISL_AUX_USAGE_CCS_E: {
+      /* If the format supports CCS_E and is compatible with the miptree,
+       * then we can use it.
+       */
+      if (format_ccs_e_compat_with_miptree(&brw->screen->devinfo,
+                                           mt, render_format))
+         return ISL_AUX_USAGE_CCS_E;
+
+      /* Otherwise, we have to fall back to CCS_D */
+
+      /* gen9 hardware technically supports non-0/1 clear colors with sRGB
+       * formats.  However, there are issues with blending where it doesn't
+       * properly apply the sRGB curve to the clear color when blending.
+       */
+      if (blend_enabled && isl_format_is_srgb(render_format) &&
+          !isl_color_value_is_zero_one(mt->fast_clear_color, render_format))
+         return ISL_AUX_USAGE_NONE;
+
+      return ISL_AUX_USAGE_CCS_D;
+   }
+
+   default: /* every other aux type renders without aux */
+      return ISL_AUX_USAGE_NONE;
+   }
+}
+
+void
+intel_miptree_prepare_render(struct brw_context *brw,
+                             struct intel_mipmap_tree *mt, uint32_t level,
+                             uint32_t start_layer, uint32_t layer_count,
+                             enum isl_format render_format,
+                             bool blend_enabled)
+{ /* Fast clears are allowed whenever any aux usage is selected. */
+   enum isl_aux_usage aux_usage =
+      intel_miptree_render_aux_usage(brw, mt, render_format, blend_enabled);
+   intel_miptree_prepare_access(brw, mt, level, 1, start_layer, layer_count,
+                                aux_usage, aux_usage != ISL_AUX_USAGE_NONE);
+}
+
+void
+intel_miptree_finish_render(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt, uint32_t level,
+                            uint32_t start_layer, uint32_t layer_count,
+                            enum isl_format render_format,
+                            bool blend_enabled)
+{
+   assert(_mesa_is_format_color_format(mt->format));
+   /* Recompute the aux usage from the same inputs as prepare_render. */
+   enum isl_aux_usage aux_usage =
+      intel_miptree_render_aux_usage(brw, mt, render_format, blend_enabled);
+   intel_miptree_finish_write(brw, mt, level, start_layer, layer_count,
+                              aux_usage);
+}
+
+void
+intel_miptree_prepare_depth(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt, uint32_t level,
+                            uint32_t start_layer, uint32_t layer_count)
+{ /* Uses the miptree's own aux usage; fast clear needs a HiZ buffer. */
+   intel_miptree_prepare_access(brw, mt, level, 1, start_layer, layer_count,
+                                mt->aux_usage, mt->hiz_buf != NULL);
+}
+
+void
+intel_miptree_finish_depth(struct brw_context *brw,
+                           struct intel_mipmap_tree *mt, uint32_t level,
+                           uint32_t start_layer, uint32_t layer_count,
+                           bool depth_written)
+{ /* Only an actual depth write changes the tracked aux state. */
+   if (depth_written) { /* aux_usage is an enum: pass one, not a bool */
+      intel_miptree_finish_write(brw, mt, level, start_layer, layer_count,
+                                 mt->aux_usage); /* as in prepare_depth */
    }
 }
 
@@ -2243,11 +2697,14 @@
     * pixel data is stored.  Fortunately this code path should never be
     * reached for multisample buffers.
     */
-   assert(mt->msaa_layout == INTEL_MSAA_LAYOUT_NONE || mt->num_samples <= 1);
+   assert(mt->surf.msaa_layout == ISL_MSAA_LAYOUT_NONE ||
+          mt->surf.samples == 1);
+
+   intel_miptree_prepare_access(brw, mt, 0, INTEL_REMAINING_LEVELS,
+                                0, INTEL_REMAINING_LAYERS,
+                                ISL_AUX_USAGE_NONE, false);
 
    if (mt->mcs_buf) {
-      intel_miptree_all_slices_resolve_color(brw, mt, 0);
-      mt->aux_disable |= (INTEL_AUX_DISABLE_CCS | INTEL_AUX_DISABLE_MCS);
       brw_bo_unreference(mt->mcs_buf->bo);
       free(mt->mcs_buf);
       mt->mcs_buf = NULL;
@@ -2256,13 +2713,12 @@
        * execute any will likely crash due to the missing aux buffer. So let's
        * delete all pending ops.
        */
-      exec_list_make_empty(&mt->color_resolve_map);
+      free(mt->aux_state);
+      mt->aux_state = NULL;
    }
 
    if (mt->hiz_buf) {
-      mt->aux_disable |= INTEL_AUX_DISABLE_HIZ;
-      intel_miptree_all_slices_resolve_depth(brw, mt);
-      intel_miptree_hiz_buffer_free(mt->hiz_buf);
+      intel_miptree_aux_buffer_free(mt->hiz_buf);
       mt->hiz_buf = NULL;
 
       for (uint32_t l = mt->first_level; l <= mt->last_level; ++l) {
@@ -2273,8 +2729,12 @@
        * any will likely crash due to the missing aux buffer. So let's delete
        * all pending ops.
        */
-      exec_list_make_empty(&mt->hiz_map);
+      free(mt->aux_state);
+      mt->aux_state = NULL;
    }
+
+   mt->aux_usage = ISL_AUX_USAGE_NONE;
+   mt->supports_fast_clear = false;
 }
 
 
@@ -2300,7 +2760,7 @@
    uint32_t tile_size = 4096;
    uint32_t tile_width = 64;
    uint32_t tile_height = 64;
-   uint32_t row_size = 64 * stride;
+   uint32_t row_size = 64 * stride / 2; /* Two rows are interleaved. */
 
    uint32_t tile_x = x / tile_width;
    uint32_t tile_y = y / tile_height;
@@ -2339,27 +2799,33 @@
                            struct intel_mipmap_tree *src,
                            struct intel_mipmap_tree *dst)
 {
+   unsigned src_w = src->surf.logical_level0_px.width;
+   unsigned src_h = src->surf.logical_level0_px.height;
+   unsigned dst_w = dst->surf.logical_level0_px.width;
+   unsigned dst_h = dst->surf.logical_level0_px.height;
+
    brw_blorp_blit_miptrees(brw,
                            src, 0 /* level */, 0 /* layer */,
                            src->format, SWIZZLE_XYZW,
                            dst, 0 /* level */, 0 /* layer */, dst->format,
-                           0, 0,
-                           src->logical_width0, src->logical_height0,
-                           0, 0,
-                           dst->logical_width0, dst->logical_height0,
+                           0, 0, src_w, src_h,
+                           0, 0, dst_w, dst_h,
                            GL_NEAREST, false, false /*mirror x, y*/,
                            false, false);
 
    if (src->stencil_mt) {
+      src_w = src->stencil_mt->surf.logical_level0_px.width;
+      src_h = src->stencil_mt->surf.logical_level0_px.height;
+      dst_w = dst->stencil_mt->surf.logical_level0_px.width;
+      dst_h = dst->stencil_mt->surf.logical_level0_px.height;
+
       brw_blorp_blit_miptrees(brw,
                               src->stencil_mt, 0 /* level */, 0 /* layer */,
                               src->stencil_mt->format, SWIZZLE_XYZW,
                               dst->stencil_mt, 0 /* level */, 0 /* layer */,
                               dst->stencil_mt->format,
-                              0, 0,
-                              src->logical_width0, src->logical_height0,
-                              0, 0,
-                              dst->logical_width0, dst->logical_height0,
+                              0, 0, src_w, src_h,
+                              0, 0, dst_w, dst_h,
                               GL_NEAREST, false, false /*mirror x, y*/,
                               false, false /* decode/encode srgb */);
    }
@@ -2375,47 +2841,43 @@
    if (!src || brw->gen >= 8 || !src->r8stencil_needs_update)
       return;
 
+   assert(src->surf.size > 0);
+
    if (!mt->r8stencil_mt) {
-      const uint32_t r8stencil_flags =
-         MIPTREE_LAYOUT_ACCELERATED_UPLOAD | MIPTREE_LAYOUT_TILING_Y |
-         MIPTREE_LAYOUT_DISABLE_AUX;
       assert(brw->gen > 6); /* Handle MIPTREE_LAYOUT_GEN6_HIZ_STENCIL */
-      mt->r8stencil_mt = intel_miptree_create(brw,
-                                              src->target,
-                                              MESA_FORMAT_R_UINT8,
-                                              src->first_level,
-                                              src->last_level,
-                                              src->logical_width0,
-                                              src->logical_height0,
-                                              src->logical_depth0,
-                                              src->num_samples,
-                                              r8stencil_flags);
+      mt->r8stencil_mt = make_surface(
+                            brw,
+                            src->target,
+                            MESA_FORMAT_R_UINT8,
+                            src->first_level, src->last_level,
+                            src->surf.logical_level0_px.width,
+                            src->surf.logical_level0_px.height,
+                            src->surf.dim == ISL_SURF_DIM_3D ?
+                               src->surf.logical_level0_px.depth :
+                               src->surf.logical_level0_px.array_len,
+                            src->surf.samples,
+                            ISL_TILING_Y0_BIT,
+                            ISL_SURF_USAGE_TEXTURE_BIT,
+                            BO_ALLOC_FOR_RENDER, 0, NULL);
       assert(mt->r8stencil_mt);
    }
 
    struct intel_mipmap_tree *dst = mt->r8stencil_mt;
 
    for (int level = src->first_level; level <= src->last_level; level++) {
-      const unsigned depth = src->level[level].depth;
-      const int layers_per_blit =
-         (dst->msaa_layout == INTEL_MSAA_LAYOUT_UMS ||
-          dst->msaa_layout == INTEL_MSAA_LAYOUT_CMS) ?
-         dst->num_samples : 1;
+      const unsigned depth = src->surf.dim == ISL_SURF_DIM_3D ?
+         minify(src->surf.phys_level0_sa.depth, level) :
+         src->surf.phys_level0_sa.array_len;
 
       for (unsigned layer = 0; layer < depth; layer++) {
-         brw_blorp_blit_miptrees(brw,
+         brw_blorp_copy_miptrees(brw,
                                  src, level, layer,
-                                 src->format, SWIZZLE_X,
-                                 dst, level, layers_per_blit * layer,
-                                 MESA_FORMAT_R_UNORM8,
-                                 0, 0,
-                                 minify(src->logical_width0, level),
-                                 minify(src->logical_height0, level),
-                                 0, 0,
-                                 minify(dst->logical_width0, level),
-                                 minify(dst->logical_height0, level),
-                                 GL_NEAREST, false, false /*mirror x, y*/,
-                                 false, false /* decode/encode srgb */);
+                                 dst, level, layer,
+                                 0, 0, 0, 0,
+                                 minify(src->surf.logical_level0_px.width,
+                                        level),
+                                 minify(src->surf.logical_level0_px.height,
+                                        level));
       }
    }
 
@@ -2424,36 +2886,16 @@
 }
 
 static void *
-intel_miptree_map_raw(struct brw_context *brw, struct intel_mipmap_tree *mt)
+intel_miptree_map_raw(struct brw_context *brw,
+                      struct intel_mipmap_tree *mt,
+                      GLbitfield mode) /* forwarded to brw_bo_map() */
 {
-   /* CPU accesses to color buffers don't understand fast color clears, so
-    * resolve any pending fast color clears before we map.
-    */
-   intel_miptree_all_slices_resolve_color(brw, mt, 0);
-
    struct brw_bo *bo = mt->bo;
 
    if (brw_batch_references(&brw->batch, bo))
       intel_batchbuffer_flush(brw);
 
-   /* brw_bo_map() uses a WB mmaping of the buffer's backing storage. It
-    * will utilize the CPU cache even if the buffer is incoherent with the
-    * GPU (i.e. any writes will be stored in the cache and not flushed to
-    * memory and so will be invisible to the GPU or display engine). This
-    * is the majority of buffers on a !llc machine, but even on a llc
-    * almost all scanouts are incoherent with the CPU. A WB write into the
-    * backing storage of the current scanout will not be immediately
-    * visible on the screen. The transfer from cache to screen is slow and
-    * indeterministic causing visible glitching on the screen. Never use
-    * this WB mapping for writes to an active scanout (reads are fine, so
-    * long as cache consistency is maintained).
-    */
-   if (mt->tiling != I915_TILING_NONE || mt->is_scanout)
-      brw_bo_map_gtt(brw, bo);
-   else
-      brw_bo_map(brw, bo, true);
-
-   return bo->virtual;
+   return brw_bo_map(brw, bo, mode); /* callers check the result for NULL */
 }
 
 static void
@@ -2484,11 +2926,13 @@
    y /= bh;
    x /= bw;
 
-   base = intel_miptree_map_raw(brw, mt) + mt->offset;
+   base = intel_miptree_map_raw(brw, mt, map->mode);
 
    if (base == NULL)
       map->ptr = NULL;
    else {
+      base += mt->offset;
+
       /* Note that in the case of cube maps, the caller must have passed the
        * slice number referencing the face.
       */
@@ -2496,7 +2940,7 @@
       x += image_x;
       y += image_y;
 
-      map->stride = mt->pitch;
+      map->stride = mt->surf.row_pitch;
       map->ptr = base + y * map->stride + x * mt->cpp;
    }
 
@@ -2523,14 +2967,14 @@
                                          /* first_level */ 0,
                                          /* last_level */ 0,
                                          map->w, map->h, 1,
-                                         /* samples */ 0,
-                                         MIPTREE_LAYOUT_TILING_NONE);
+                                         /* samples */ 1,
+                                         MIPTREE_CREATE_LINEAR);
 
    if (!map->linear_mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");
       goto fail;
    }
-   map->stride = map->linear_mt->pitch;
+   map->stride = map->linear_mt->surf.row_pitch;
 
    /* One of either READ_BIT or WRITE_BIT or both is set.  READ_BIT implies no
     * INVALIDATE_RANGE_BIT.  WRITE_BIT needs the original values read in unless
@@ -2547,7 +2991,7 @@
       }
    }
 
-   map->ptr = intel_miptree_map_raw(brw, map->linear_mt);
+   map->ptr = intel_miptree_map_raw(brw, map->linear_mt, map->mode);
 
    DBG("%s: %d,%d %dx%d from mt %p (%s) %d,%d = %p/%d\n", __func__,
        map->x, map->y, map->w, map->h,
@@ -2609,13 +3053,13 @@
    image_x += map->x;
    image_y += map->y;
 
-   void *src = intel_miptree_map_raw(brw, mt);
+   void *src = intel_miptree_map_raw(brw, mt, map->mode);
    if (!src)
       return;
 
    src += mt->offset;
 
-   src += image_y * mt->pitch;
+   src += image_y * mt->surf.row_pitch;
    src += image_x * mt->cpp;
 
    /* Due to the pixel offsets for the particular image being mapped, our
@@ -2623,7 +3067,7 @@
     * divisible by 16, then the amount by which it's misaligned will remain
     * consistent from row to row.
     */
-   assert((mt->pitch % 16) == 0);
+   assert((mt->surf.row_pitch % 16) == 0);
    const int misalignment = ((uintptr_t) src) & 15;
 
    /* Create an untiled temporary buffer for the mapping. */
@@ -2639,7 +3083,7 @@
 
    for (uint32_t y = 0; y < map->h; y++) {
       void *dst_ptr = map->ptr + y * map->stride;
-      void *src_ptr = src + y * mt->pitch;
+      void *src_ptr = src + y * mt->surf.row_pitch;
 
       _mesa_streaming_load_memcpy(dst_ptr, src_ptr, width_bytes);
    }
@@ -2678,14 +3122,14 @@
     */
    if (!(map->mode & GL_MAP_INVALIDATE_RANGE_BIT)) {
       uint8_t *untiled_s8_map = map->ptr;
-      uint8_t *tiled_s8_map = intel_miptree_map_raw(brw, mt);
+      uint8_t *tiled_s8_map = intel_miptree_map_raw(brw, mt, GL_MAP_READ_BIT);
       unsigned int image_x, image_y;
 
       intel_miptree_get_image_offset(mt, level, slice, &image_x, &image_y);
 
       for (uint32_t y = 0; y < map->h; y++) {
 	 for (uint32_t x = 0; x < map->w; x++) {
-	    ptrdiff_t offset = intel_offset_S8(mt->pitch,
+	    ptrdiff_t offset = intel_offset_S8(mt->surf.row_pitch,
 	                                       x + image_x + map->x,
 	                                       y + image_y + map->y,
 					       brw->has_swizzling);
@@ -2715,13 +3159,13 @@
    if (map->mode & GL_MAP_WRITE_BIT) {
       unsigned int image_x, image_y;
       uint8_t *untiled_s8_map = map->ptr;
-      uint8_t *tiled_s8_map = intel_miptree_map_raw(brw, mt);
+      uint8_t *tiled_s8_map = intel_miptree_map_raw(brw, mt, GL_MAP_WRITE_BIT);
 
       intel_miptree_get_image_offset(mt, level, slice, &image_x, &image_y);
 
       for (uint32_t y = 0; y < map->h; y++) {
 	 for (uint32_t x = 0; x < map->w; x++) {
-	    ptrdiff_t offset = intel_offset_S8(mt->pitch,
+	    ptrdiff_t offset = intel_offset_S8(mt->surf.row_pitch,
 	                                       image_x + x + map->x,
 	                                       image_y + y + map->y,
 					       brw->has_swizzling);
@@ -2770,16 +3214,16 @@
    image_x += map->x;
    image_y += map->y;
 
-   uint8_t *dst = intel_miptree_map_raw(brw, mt)
-                + image_y * mt->pitch
+   uint8_t *dst = intel_miptree_map_raw(brw, mt, GL_MAP_WRITE_BIT)
+                + image_y * mt->surf.row_pitch
                 + image_x * mt->cpp;
 
    if (mt->etc_format == MESA_FORMAT_ETC1_RGB8)
-      _mesa_etc1_unpack_rgba8888(dst, mt->pitch,
+      _mesa_etc1_unpack_rgba8888(dst, mt->surf.row_pitch,
                                  map->ptr, map->stride,
                                  map->w, map->h);
    else
-      _mesa_unpack_etc2_format(dst, mt->pitch,
+      _mesa_unpack_etc2_format(dst, mt->surf.row_pitch,
                                map->ptr, map->stride,
                                map->w, map->h, mt->etc_format);
 
@@ -2821,8 +3265,8 @@
     */
    if (!(map->mode & GL_MAP_INVALIDATE_RANGE_BIT)) {
       uint32_t *packed_map = map->ptr;
-      uint8_t *s_map = intel_miptree_map_raw(brw, s_mt);
-      uint32_t *z_map = intel_miptree_map_raw(brw, z_mt);
+      uint8_t *s_map = intel_miptree_map_raw(brw, s_mt, GL_MAP_READ_BIT);
+      uint32_t *z_map = intel_miptree_map_raw(brw, z_mt, GL_MAP_READ_BIT);
       unsigned int s_image_x, s_image_y;
       unsigned int z_image_x, z_image_y;
 
@@ -2834,12 +3278,12 @@
       for (uint32_t y = 0; y < map->h; y++) {
 	 for (uint32_t x = 0; x < map->w; x++) {
 	    int map_x = map->x + x, map_y = map->y + y;
-	    ptrdiff_t s_offset = intel_offset_S8(s_mt->pitch,
+	    ptrdiff_t s_offset = intel_offset_S8(s_mt->surf.row_pitch,
 						 map_x + s_image_x,
 						 map_y + s_image_y,
 						 brw->has_swizzling);
 	    ptrdiff_t z_offset = ((map_y + z_image_y) *
-                                  (z_mt->pitch / 4) +
+                                  (z_mt->surf.row_pitch / 4) +
 				  (map_x + z_image_x));
 	    uint8_t s = s_map[s_offset];
 	    uint32_t z = z_map[z_offset];
@@ -2882,8 +3326,8 @@
 
    if (map->mode & GL_MAP_WRITE_BIT) {
       uint32_t *packed_map = map->ptr;
-      uint8_t *s_map = intel_miptree_map_raw(brw, s_mt);
-      uint32_t *z_map = intel_miptree_map_raw(brw, z_mt);
+      uint8_t *s_map = intel_miptree_map_raw(brw, s_mt, GL_MAP_WRITE_BIT);
+      uint32_t *z_map = intel_miptree_map_raw(brw, z_mt, GL_MAP_WRITE_BIT);
       unsigned int s_image_x, s_image_y;
       unsigned int z_image_x, z_image_y;
 
@@ -2894,12 +3338,12 @@
 
       for (uint32_t y = 0; y < map->h; y++) {
 	 for (uint32_t x = 0; x < map->w; x++) {
-	    ptrdiff_t s_offset = intel_offset_S8(s_mt->pitch,
+	    ptrdiff_t s_offset = intel_offset_S8(s_mt->surf.row_pitch,
 						 x + s_image_x + map->x,
 						 y + s_image_y + map->y,
 						 brw->has_swizzling);
 	    ptrdiff_t z_offset = ((y + z_image_y + map->y) *
-                                  (z_mt->pitch / 4) +
+                                  (z_mt->surf.row_pitch / 4) +
 				  (x + z_image_x + map->x));
 
 	    if (map_z32f_x24s8) {
@@ -2979,7 +3423,7 @@
                unsigned int level, unsigned int slice)
 {
    /* See intel_miptree_blit() for details on the 32k pitch limit. */
-   if (mt->pitch >= 32768)
+   if (mt->surf.row_pitch >= 32768)
       return false;
 
    return true;
@@ -2998,15 +3442,15 @@
        */
        !(mode & GL_MAP_WRITE_BIT) &&
        !mt->compressed &&
-       (mt->tiling == I915_TILING_X ||
+       (mt->surf.tiling == ISL_TILING_X ||
         /* Prior to Sandybridge, the blitter can't handle Y tiling */
-        (brw->gen >= 6 && mt->tiling == I915_TILING_Y) ||
+        (brw->gen >= 6 && mt->surf.tiling == ISL_TILING_Y0) ||
         /* Fast copy blit on skl+ supports all tiling formats. */
         brw->gen >= 9) &&
        can_blit_slice(mt, level, slice))
       return true;
 
-   if (mt->tiling != I915_TILING_NONE &&
+   if (mt->surf.tiling != ISL_TILING_LINEAR &&
        mt->bo->size >= brw->max_gtt_map_object_size) {
       assert(can_blit_slice(mt, level, slice));
       return true;
@@ -3041,7 +3485,7 @@
 {
    struct intel_miptree_map *map;
 
-   assert(mt->num_samples <= 1);
+   assert(mt->surf.samples == 1);
 
    map = intel_miptree_attach_map(mt, level, slice, x, y, w, h, mode);
    if (!map){
@@ -3050,10 +3494,8 @@
       return;
    }
 
-   intel_miptree_slice_resolve_depth(brw, mt, level, slice);
-   if (map->mode & GL_MAP_WRITE_BIT) {
-      intel_miptree_slice_set_needs_hiz_resolve(mt, level, slice);
-   }
+   intel_miptree_access_raw(brw, mt, level, slice,
+                            map->mode & GL_MAP_WRITE_BIT);
 
    if (mt->format == MESA_FORMAT_S_UINT8) {
       intel_miptree_map_s8(brw, mt, map, level, slice);
@@ -3067,7 +3509,7 @@
 #if defined(USE_SSE41)
    } else if (!(mode & GL_MAP_WRITE_BIT) &&
               !mt->compressed && cpu_has_sse4_1 &&
-              (mt->pitch % 16 == 0)) {
+              (mt->surf.row_pitch % 16 == 0)) {
       intel_miptree_map_movntdqa(brw, mt, map, level, slice);
 #endif
    } else {
@@ -3089,7 +3531,7 @@
 {
    struct intel_miptree_map *map = mt->level[level].slice[slice].map;
 
-   assert(mt->num_samples <= 1);
+   assert(mt->surf.samples == 1);
 
    if (!map)
       return;
@@ -3143,13 +3585,13 @@
 }
 
 enum isl_dim_layout
-get_isl_dim_layout(const struct gen_device_info *devinfo, uint32_t tiling,
-                   GLenum target)
+get_isl_dim_layout(const struct gen_device_info *devinfo,
+                   enum isl_tiling tiling, GLenum target)
 {
    switch (target) {
    case GL_TEXTURE_1D:
    case GL_TEXTURE_1D_ARRAY:
-      return (devinfo->gen >= 9 && tiling == I915_TILING_NONE ?
+      return (devinfo->gen >= 9 && tiling == ISL_TILING_LINEAR ?
               ISL_DIM_LAYOUT_GEN9_1D : ISL_DIM_LAYOUT_GEN4_2D);
 
    case GL_TEXTURE_2D:
@@ -3173,282 +3615,15 @@
    unreachable("Invalid texture target");
 }
 
-enum isl_tiling
-intel_miptree_get_isl_tiling(const struct intel_mipmap_tree *mt)
+enum isl_aux_usage
+intel_miptree_get_aux_isl_usage(const struct brw_context *brw,
+                                const struct intel_mipmap_tree *mt)
 {
-   if (mt->format == MESA_FORMAT_S_UINT8) {
-      return ISL_TILING_W;
-   } else {
-      switch (mt->tiling) {
-      case I915_TILING_NONE:
-         return ISL_TILING_LINEAR;
-      case I915_TILING_X:
-         return ISL_TILING_X;
-      case I915_TILING_Y:
-            return ISL_TILING_Y0;
-      default:
-         unreachable("Invalid tiling mode");
-      }
-   }
-}
+   if (mt->hiz_buf)
+      return ISL_AUX_USAGE_HIZ;
 
-void
-intel_miptree_get_isl_surf(struct brw_context *brw,
-                           const struct intel_mipmap_tree *mt,
-                           struct isl_surf *surf)
-{
-   surf->dim = get_isl_surf_dim(mt->target);
-   surf->dim_layout = get_isl_dim_layout(&brw->screen->devinfo,
-                                         mt->tiling, mt->target);
+   if (!mt->mcs_buf)
+      return ISL_AUX_USAGE_NONE;
 
-   if (mt->num_samples > 1) {
-      switch (mt->msaa_layout) {
-      case INTEL_MSAA_LAYOUT_IMS:
-         surf->msaa_layout = ISL_MSAA_LAYOUT_INTERLEAVED;
-         break;
-      case INTEL_MSAA_LAYOUT_UMS:
-      case INTEL_MSAA_LAYOUT_CMS:
-         surf->msaa_layout = ISL_MSAA_LAYOUT_ARRAY;
-         break;
-      default:
-         unreachable("Invalid MSAA layout");
-      }
-   } else {
-      surf->msaa_layout = ISL_MSAA_LAYOUT_NONE;
-   }
-
-   surf->tiling = intel_miptree_get_isl_tiling(mt);
-
-   if (mt->format == MESA_FORMAT_S_UINT8) {
-      /* The ISL definition of row_pitch matches the surface state pitch field
-       * a bit better than intel_mipmap_tree.  In particular, ISL incorporates
-       * the factor of 2 for W-tiling in row_pitch.
-       */
-      surf->row_pitch = 2 * mt->pitch;
-   } else {
-      surf->row_pitch = mt->pitch;
-   }
-
-   surf->format = translate_tex_format(brw, mt->format, false);
-
-   if (brw->gen >= 9) {
-      if (surf->dim == ISL_SURF_DIM_1D && surf->tiling == ISL_TILING_LINEAR) {
-         /* For gen9 1-D surfaces, intel_mipmap_tree has a bogus alignment. */
-         surf->image_alignment_el = isl_extent3d(64, 1, 1);
-      } else {
-         /* On gen9+, intel_mipmap_tree stores the horizontal and vertical
-          * alignment in terms of surface elements like we want.
-          */
-         surf->image_alignment_el = isl_extent3d(mt->halign, mt->valign, 1);
-      }
-   } else {
-      /* On earlier gens it's stored in pixels. */
-      unsigned bw, bh;
-      _mesa_get_format_block_size(mt->format, &bw, &bh);
-      surf->image_alignment_el =
-         isl_extent3d(mt->halign / bw, mt->valign / bh, 1);
-   }
-
-   surf->logical_level0_px.width = mt->logical_width0;
-   surf->logical_level0_px.height = mt->logical_height0;
-   if (surf->dim == ISL_SURF_DIM_3D) {
-      surf->logical_level0_px.depth = mt->logical_depth0;
-      surf->logical_level0_px.array_len = 1;
-   } else {
-      surf->logical_level0_px.depth = 1;
-      surf->logical_level0_px.array_len = mt->logical_depth0;
-   }
-
-   surf->phys_level0_sa.width = mt->physical_width0;
-   surf->phys_level0_sa.height = mt->physical_height0;
-   if (surf->dim == ISL_SURF_DIM_3D) {
-      surf->phys_level0_sa.depth = mt->physical_depth0;
-      surf->phys_level0_sa.array_len = 1;
-   } else {
-      surf->phys_level0_sa.depth = 1;
-      surf->phys_level0_sa.array_len = mt->physical_depth0;
-   }
-
-   surf->levels = mt->last_level + 1;
-   surf->samples = MAX2(mt->num_samples, 1);
-
-   surf->size = 0; /* TODO */
-   surf->alignment = 0; /* TODO */
-
-   switch (surf->dim_layout) {
-   case ISL_DIM_LAYOUT_GEN4_2D:
-   case ISL_DIM_LAYOUT_GEN4_3D:
-      if (brw->gen >= 9) {
-         surf->array_pitch_el_rows = mt->qpitch;
-      } else {
-         unsigned bw, bh;
-         _mesa_get_format_block_size(mt->format, &bw, &bh);
-         assert(mt->qpitch % bh == 0);
-         surf->array_pitch_el_rows = mt->qpitch / bh;
-      }
-      break;
-   case ISL_DIM_LAYOUT_GEN9_1D:
-      surf->array_pitch_el_rows = 1;
-      break;
-   }
-
-   switch (mt->array_layout) {
-   case ALL_LOD_IN_EACH_SLICE:
-      surf->array_pitch_span = ISL_ARRAY_PITCH_SPAN_FULL;
-      break;
-   case ALL_SLICES_AT_EACH_LOD:
-   case GEN6_HIZ_STENCIL:
-      surf->array_pitch_span = ISL_ARRAY_PITCH_SPAN_COMPACT;
-      break;
-   default:
-      unreachable("Invalid array layout");
-   }
-
-   GLenum base_format = _mesa_get_format_base_format(mt->format);
-   switch (base_format) {
-   case GL_DEPTH_COMPONENT:
-      surf->usage = ISL_SURF_USAGE_DEPTH_BIT | ISL_SURF_USAGE_TEXTURE_BIT;
-      break;
-   case GL_STENCIL_INDEX:
-      surf->usage = ISL_SURF_USAGE_STENCIL_BIT;
-      if (brw->gen >= 8)
-         surf->usage |= ISL_SURF_USAGE_TEXTURE_BIT;
-      break;
-   case GL_DEPTH_STENCIL:
-      /* In this case we only texture from the depth part */
-      surf->usage = ISL_SURF_USAGE_DEPTH_BIT | ISL_SURF_USAGE_STENCIL_BIT |
-                    ISL_SURF_USAGE_TEXTURE_BIT;
-      break;
-   default:
-      surf->usage = ISL_SURF_USAGE_TEXTURE_BIT;
-      if (brw->format_supported_as_render_target[mt->format])
-         surf->usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
-      break;
-   }
-
-   if (_mesa_is_cube_map_texture(mt->target))
-      surf->usage |= ISL_SURF_USAGE_CUBE_BIT;
-}
-
-/* WARNING: THE SURFACE CREATED BY THIS FUNCTION IS NOT COMPLETE AND CANNOT BE
- * USED FOR ANY REAL CALCULATIONS.  THE ONLY VALID USE OF SUCH A SURFACE IS TO
- * PASS IT INTO isl_surf_fill_state.
- */
-void
-intel_miptree_get_aux_isl_surf(struct brw_context *brw,
-                               const struct intel_mipmap_tree *mt,
-                               struct isl_surf *surf,
-                               enum isl_aux_usage *usage)
-{
-   uint32_t aux_pitch, aux_qpitch;
-   if (mt->mcs_buf) {
-      aux_pitch = mt->mcs_buf->pitch;
-      aux_qpitch = mt->mcs_buf->qpitch;
-
-      if (mt->num_samples > 1) {
-         assert(mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS);
-         *usage = ISL_AUX_USAGE_MCS;
-      } else if (intel_miptree_is_lossless_compressed(brw, mt)) {
-         assert(brw->gen >= 9);
-         *usage = ISL_AUX_USAGE_CCS_E;
-      } else if ((mt->aux_disable & INTEL_AUX_DISABLE_CCS) == 0) {
-         *usage = ISL_AUX_USAGE_CCS_D;
-      } else {
-         unreachable("Invalid MCS miptree");
-      }
-   } else if (mt->hiz_buf) {
-      aux_pitch = mt->hiz_buf->aux_base.pitch;
-      aux_qpitch = mt->hiz_buf->aux_base.qpitch;
-
-      *usage = ISL_AUX_USAGE_HIZ;
-   } else {
-      *usage = ISL_AUX_USAGE_NONE;
-      return;
-   }
-
-   /* Start with a copy of the original surface. */
-   intel_miptree_get_isl_surf(brw, mt, surf);
-
-   /* Figure out the format and tiling of the auxiliary surface */
-   switch (*usage) {
-   case ISL_AUX_USAGE_NONE:
-      unreachable("Invalid auxiliary usage");
-
-   case ISL_AUX_USAGE_HIZ:
-      isl_surf_get_hiz_surf(&brw->isl_dev, surf, surf);
-      break;
-
-   case ISL_AUX_USAGE_MCS:
-      /*
-       * From the SKL PRM:
-       *    "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E,
-       *    HALIGN 16 must be used."
-       */
-      if (brw->gen >= 9)
-         assert(mt->halign == 16);
-
-      isl_surf_get_mcs_surf(&brw->isl_dev, surf, surf);
-      break;
-
-   case ISL_AUX_USAGE_CCS_D:
-   case ISL_AUX_USAGE_CCS_E:
-      /*
-       * From the BDW PRM, Volume 2d, page 260 (RENDER_SURFACE_STATE):
-       *
-       *    "When MCS is enabled for non-MSRT, HALIGN_16 must be used"
-       *
-       * From the hardware spec for GEN9:
-       *
-       *    "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E,
-       *    HALIGN 16 must be used."
-       */
-      assert(mt->num_samples <= 1);
-      if (brw->gen >= 8)
-         assert(mt->halign == 16);
-
-      isl_surf_get_ccs_surf(&brw->isl_dev, surf, surf);
-      break;
-   }
-
-   /* We want the pitch of the actual aux buffer. */
-   surf->row_pitch = aux_pitch;
-
-   /* Auxiliary surfaces in ISL have compressed formats and array_pitch_el_rows
-    * is in elements.  This doesn't match intel_mipmap_tree::qpitch which is
-    * in elements of the primary color surface so we have to divide by the
-    * compression block height.
-    */
-   surf->array_pitch_el_rows =
-      aux_qpitch / isl_format_get_layout(surf->format)->bh;
-}
-
-union isl_color_value
-intel_miptree_get_isl_clear_color(struct brw_context *brw,
-                                  const struct intel_mipmap_tree *mt)
-{
-   union isl_color_value clear_color;
-
-   if (_mesa_get_format_base_format(mt->format) == GL_DEPTH_COMPONENT) {
-      clear_color.i32[0] = mt->depth_clear_value;
-      clear_color.i32[1] = 0;
-      clear_color.i32[2] = 0;
-      clear_color.i32[3] = 0;
-   } else if (brw->gen >= 9) {
-      clear_color.i32[0] = mt->gen9_fast_clear_color.i[0];
-      clear_color.i32[1] = mt->gen9_fast_clear_color.i[1];
-      clear_color.i32[2] = mt->gen9_fast_clear_color.i[2];
-      clear_color.i32[3] = mt->gen9_fast_clear_color.i[3];
-   } else if (_mesa_is_format_integer(mt->format)) {
-      clear_color.i32[0] = (mt->fast_clear_color_value & (1u << 31)) != 0;
-      clear_color.i32[1] = (mt->fast_clear_color_value & (1u << 30)) != 0;
-      clear_color.i32[2] = (mt->fast_clear_color_value & (1u << 29)) != 0;
-      clear_color.i32[3] = (mt->fast_clear_color_value & (1u << 28)) != 0;
-   } else {
-      clear_color.f32[0] = (mt->fast_clear_color_value & (1u << 31)) != 0;
-      clear_color.f32[1] = (mt->fast_clear_color_value & (1u << 30)) != 0;
-      clear_color.f32[2] = (mt->fast_clear_color_value & (1u << 29)) != 0;
-      clear_color.f32[3] = (mt->fast_clear_color_value & (1u << 28)) != 0;
-   }
-
-   return clear_color;
+   return mt->aux_usage;
 }
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 890378c..5f5110b 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -48,8 +48,8 @@
 
 #include "main/mtypes.h"
 #include "isl/isl.h"
+#include "blorp/blorp.h"
 #include "brw_bufmgr.h"
-#include "intel_resolve_map.h"
 #include <GL/internal/dri_interface.h>
 
 #ifdef __cplusplus
@@ -59,7 +59,6 @@
 struct brw_context;
 struct intel_renderbuffer;
 
-struct intel_resolve_map;
 struct intel_texture_image;
 
 /**
@@ -101,21 +100,6 @@
    GLuint level_y;
 
    /**
-    * \brief Number of 2D slices in this miplevel.
-    *
-    * The exact semantics of depth varies according to the texture target:
-    *    - For GL_TEXTURE_CUBE_MAP, depth is 6.
-    *    - For GL_TEXTURE_2D_ARRAY, depth is the number of array slices. It is
-    *      identical for all miplevels in the texture.
-    *    - For GL_TEXTURE_3D, it is the texture's depth at this miplevel. Its
-    *      value, like width and height, varies with miplevel.
-    *    - For other texture types, depth is 1.
-    *    - Additionally, for UMS and CMS miptrees, depth is multiplied by
-    *      sample count.
-    */
-   GLuint depth;
-
-   /**
     * \brief Is HiZ enabled for this level?
     *
     * If \c mt->level[l].has_hiz is set, then (1) \c mt->hiz_mt has been
@@ -132,26 +116,6 @@
     */
    struct intel_mipmap_slice {
       /**
-       * \name Offset to slice
-       * \{
-       *
-       * Hardware formats are so diverse that that there is no unified way to
-       * compute the slice offsets, so we store them in this table.
-       *
-       * The (x, y) offset to slice \c s at level \c l relative the miptrees
-       * base address is
-       * \code
-       *     x = mt->level[l].slice[s].x_offset
-       *     y = mt->level[l].slice[s].y_offset
-       *
-       * On some hardware generations, we program these offsets into
-       * RENDER_SURFACE_STATE.XOffset and RENDER_SURFACE_STATE.YOffset.
-       */
-      GLuint x_offset;
-      GLuint y_offset;
-      /** \} */
-
-      /**
        * Mapping information. Persistent for the duration of
        * intel_miptree_map/unmap on this slice.
        */
@@ -160,145 +124,6 @@
 };
 
 /**
- * Enum for keeping track of the different MSAA layouts supported by Gen7.
- */
-enum intel_msaa_layout
-{
-   /**
-    * Ordinary surface with no MSAA.
-    */
-   INTEL_MSAA_LAYOUT_NONE,
-
-   /**
-    * Interleaved Multisample Surface.  The additional samples are
-    * accommodated by scaling up the width and the height of the surface so
-    * that all the samples corresponding to a pixel are located at nearby
-    * memory locations.
-    *
-    * @see PRM section "Interleaved Multisampled Surfaces"
-    */
-   INTEL_MSAA_LAYOUT_IMS,
-
-   /**
-    * Uncompressed Multisample Surface.  The surface is stored as a 2D array,
-    * with array slice n containing all pixel data for sample n.
-    *
-    * @see PRM section "Uncompressed Multisampled Surfaces"
-    */
-   INTEL_MSAA_LAYOUT_UMS,
-
-   /**
-    * Compressed Multisample Surface.  The surface is stored as in
-    * INTEL_MSAA_LAYOUT_UMS, but there is an additional buffer called the MCS
-    * (Multisample Control Surface) buffer.  Each pixel in the MCS buffer
-    * indicates the mapping from sample number to array slice.  This allows
-    * the common case (where all samples constituting a pixel have the same
-    * color value) to be stored efficiently by just using a single array
-    * slice.
-    *
-    * @see PRM section "Compressed Multisampled Surfaces"
-    */
-   INTEL_MSAA_LAYOUT_CMS,
-};
-
-enum miptree_array_layout {
-   /* Each array slice contains all miplevels packed together.
-    *
-    * Gen hardware usually wants multilevel miptrees configured this way.
-    *
-    * A 2D Array texture with 2 slices and multiple LODs using
-    * ALL_LOD_IN_EACH_SLICE would look somewhat like this:
-    *
-    *   +----------+
-    *   |          |
-    *   |          |
-    *   +----------+
-    *   +---+ +-+
-    *   |   | +-+
-    *   +---+ *
-    *   +----------+
-    *   |          |
-    *   |          |
-    *   +----------+
-    *   +---+ +-+
-    *   |   | +-+
-    *   +---+ *
-    */
-   ALL_LOD_IN_EACH_SLICE,
-
-   /* Each LOD contains all slices of that LOD packed together.
-    *
-    * In some situations, Gen7+ hardware can use the array_spacing_lod0
-    * feature to save space when the surface only contains LOD 0.
-    *
-    * Gen6 uses this for separate stencil and hiz since gen6 does not support
-    * multiple LODs for separate stencil and hiz.
-    *
-    * A 2D Array texture with 2 slices and multiple LODs using
-    * ALL_SLICES_AT_EACH_LOD would look somewhat like this:
-    *
-    *   +----------+
-    *   |          |
-    *   |          |
-    *   +----------+
-    *   |          |
-    *   |          |
-    *   +----------+
-    *   +---+ +-+
-    *   |   | +-+
-    *   +---+ +-+
-    *   |   | :
-    *   +---+
-    */
-   ALL_SLICES_AT_EACH_LOD,
-
-   /* On Sandy Bridge, HiZ and stencil buffers work the same as on Ivy Bridge
-    * except that they don't technically support mipmapping.  That does not,
-    * however, stop us from doing it.  As far as Sandy Bridge hardware is
-    * concerned, HiZ and stencil always operates on a single miplevel 2D
-    * (possibly array) image.  The dimensions of that image are NOT minified.
-    *
-    * In order to implement HiZ and stencil on Sandy Bridge, we create one
-    * full-sized 2D (possibly array) image for every LOD with every image
-    * aligned to a page boundary.  In order to save memory, we pretend that
-    * the width of each miplevel is minified and we place LOD1 and above below
-    * LOD0 but horizontally adjacent to each other.  When considered as
-    * full-sized images, LOD1 and above technically overlap.  However, since
-    * we only write to part of that image, the hardware will never notice the
-    * overlap.
-    *
-    * This layout looks something like this:
-    *
-    *   +---------+
-    *   |         |
-    *   |         |
-    *   +---------+
-    *   |         |
-    *   |         |
-    *   +---------+
-    *
-    *   +----+ +-+ .
-    *   |    | +-+
-    *   +----+
-    *
-    *   +----+ +-+ .
-    *   |    | +-+
-    *   +----+
-    */
-   GEN6_HIZ_STENCIL,
-};
-
-enum intel_aux_disable {
-   INTEL_AUX_DISABLE_NONE = 0,
-   INTEL_AUX_DISABLE_HIZ  = 1 << 1,
-   INTEL_AUX_DISABLE_MCS  = 1 << 2,
-   INTEL_AUX_DISABLE_CCS  = 1 << 3,
-   INTEL_AUX_DISABLE_ALL  = INTEL_AUX_DISABLE_HIZ |
-                            INTEL_AUX_DISABLE_MCS |
-                            INTEL_AUX_DISABLE_CCS
-};
-
-/**
  * Miptree aux buffer. These buffers are associated with a miptree, but the
  * format is managed by the hardware.
  *
@@ -308,6 +133,8 @@
  */
 struct intel_miptree_aux_buffer
 {
+   struct isl_surf surf;
+
    /**
     * Buffer object containing the pixel data.
     *
@@ -353,23 +180,11 @@
     */
    uint32_t qpitch;
 };
-/**
- * The HiZ buffer requires extra attributes on earlier GENs. This is easily
- * contained within an intel_mipmap_tree. To make sure we do not abuse this, we
- * keep the hiz datastructure separate.
- */
-struct intel_miptree_hiz_buffer
-{
-   struct intel_miptree_aux_buffer aux_base;
-
-   /**
-    * Hiz miptree. Used only by Gen6.
-    */
-   struct intel_mipmap_tree *mt;
-};
 
 struct intel_mipmap_tree
 {
+   struct isl_surf surf;
+
    /**
     * Buffer object containing the surface.
     *
@@ -383,25 +198,6 @@
    struct brw_bo *bo;
 
    /**
-    * Pitch in bytes.
-    *
-    * @see RENDER_SURFACE_STATE.SurfacePitch
-    * @see RENDER_SURFACE_STATE.AuxiliarySurfacePitch
-    * @see 3DSTATE_DEPTH_BUFFER.SurfacePitch
-    * @see 3DSTATE_HIER_DEPTH_BUFFER.SurfacePitch
-    * @see 3DSTATE_STENCIL_BUFFER.SurfacePitch
-    */
-   uint32_t pitch;
-
-   /**
-    * One of the I915_TILING_* flags.
-    *
-    * @see RENDER_SURFACE_STATE.TileMode
-    * @see 3DSTATE_DEPTH_BUFFER.TileMode
-    */
-   uint32_t tiling;
-
-   /**
     * @brief One of GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY, etc.
     *
     * @see RENDER_SURFACE_STATE.SurfaceType
@@ -436,130 +232,14 @@
     */
    mesa_format etc_format;
 
-   /**
-    * @name Surface Alignment
-    * @{
-    *
-    * This defines the alignment of the upperleft pixel of each "slice" in the
-    * surface. The alignment is in pixel coordinates relative to the surface's
-    * most upperleft pixel, which is the pixel at (x=0, y=0, layer=0,
-    * level=0).
-    *
-    * The hardware docs do not use the term "slice".  We use "slice" to mean
-    * the pixels at a given miplevel and layer. For 2D surfaces, the layer is
-    * the array slice; for 3D surfaces, the layer is the z offset.
-    *
-    * In the surface layout equations found in the hardware docs, the
-    * horizontal and vertical surface alignments often appear as variables 'i'
-    * and 'j'.
-    */
-
-   /** @see RENDER_SURFACE_STATE.SurfaceHorizontalAlignment */
-   uint32_t halign;
-
-   /** @see RENDER_SURFACE_STATE.SurfaceVerticalAlignment */
-   uint32_t valign;
-   /** @} */
-
    GLuint first_level;
    GLuint last_level;
 
-   /**
-    * Level zero image dimensions.  These dimensions correspond to the
-    * physical layout of data in memory.  Accordingly, they account for the
-    * extra width, height, and or depth that must be allocated in order to
-    * accommodate multisample formats, and they account for the extra factor
-    * of 6 in depth that must be allocated in order to accommodate cubemap
-    * textures.
-    */
-   GLuint physical_width0, physical_height0, physical_depth0;
-
    /** Bytes per pixel (or bytes per block if compressed) */
    GLuint cpp;
 
-   /**
-    * @see RENDER_SURFACE_STATE.NumberOfMultisamples
-    * @see 3DSTATE_MULTISAMPLE.NumberOfMultisamples
-    */
-   GLuint num_samples;
-
    bool compressed;
 
-   /**
-    * @name Level zero image dimensions
-    * @{
-    *
-    * These dimensions correspond to the
-    * logical width, height, and depth of the texture as seen by client code.
-    * Accordingly, they do not account for the extra width, height, and/or
-    * depth that must be allocated in order to accommodate multisample
-    * formats, nor do they account for the extra factor of 6 in depth that
-    * must be allocated in order to accommodate cubemap textures.
-    */
-
-   /**
-    * @see RENDER_SURFACE_STATE.Width
-    * @see 3DSTATE_DEPTH_BUFFER.Width
-    */
-   uint32_t logical_width0;
-
-   /**
-    * @see RENDER_SURFACE_STATE.Height
-    * @see 3DSTATE_DEPTH_BUFFER.Height
-    */
-   uint32_t logical_height0;
-
-   /**
-    * @see RENDER_SURFACE_STATE.Depth
-    * @see 3DSTATE_DEPTH_BUFFER.Depth
-    */
-   uint32_t logical_depth0;
-   /** @} */
-
-   /**
-    * Indicates if we use the standard miptree layout (ALL_LOD_IN_EACH_SLICE),
-    * or if we tightly pack array slices at each LOD (ALL_SLICES_AT_EACH_LOD).
-    */
-   enum miptree_array_layout array_layout;
-
-   /**
-    * The distance in between array slices.
-    *
-    * The value is the one that is sent in the surface state. The actual
-    * meaning depends on certain criteria. Usually it is simply the number of
-    * uncompressed rows between each slice. However on Gen9+ for compressed
-    * surfaces it is the number of blocks. For 1D array surfaces that have the
-    * mipmap tree stored horizontally it is the number of pixels between each
-    * slice.
-    *
-    * @see RENDER_SURFACE_STATE.SurfaceQPitch
-    * @see 3DSTATE_DEPTH_BUFFER.SurfaceQPitch
-    * @see 3DSTATE_HIER_DEPTH_BUFFER.SurfaceQPitch
-    * @see 3DSTATE_STENCIL_BUFFER.SurfaceQPitch
-    */
-   uint32_t qpitch;
-
-   /**
-    * MSAA layout used by this buffer.
-    *
-    * @see RENDER_SURFACE_STATE.MultisampledSurfaceStorageFormat
-    */
-   enum intel_msaa_layout msaa_layout;
-
-   /* Derived from the above:
-    */
-   GLuint total_width;
-   GLuint total_height;
-
-   /**
-    * The depth value used during the most recent fast depth clear performed
-    * on the surface. This field is invalid only if surface has never
-    * underwent a fast depth clear.
-    *
-    * @see 3DSTATE_CLEAR_PARAMS.DepthClearValue
-    */
-   uint32_t depth_clear_value;
-
    /* Includes image offset tables: */
    struct intel_mipmap_level level[MAX_TEXTURE_LEVELS];
 
@@ -581,27 +261,36 @@
     * To allocate the hiz buffer, use intel_miptree_alloc_hiz().
     *
     * To determine if hiz is enabled, do not check this pointer. Instead, use
-    * intel_miptree_slice_has_hiz().
+    * intel_miptree_level_has_hiz().
     */
-   struct intel_miptree_hiz_buffer *hiz_buf;
+   struct intel_miptree_aux_buffer *hiz_buf;
 
    /**
-    * \brief Maps of miptree slices to needed resolves.
+    * \brief The type of auxiliary compression used by this miptree.
     *
-    * hiz_map is used only when the miptree has a child HiZ miptree.
-    *
-    * Let \c mt be a depth miptree with HiZ enabled. Then the resolve map is
-    * \c mt->hiz_map. The resolve map of the child HiZ miptree, \c
-    * mt->hiz_mt->hiz_map, is unused.
-    *
-    *
-    * color_resolve_map is used only when the miptree uses fast clear (Gen7+)
-    * lossless compression (Gen9+). It should be noted that absence in the
-    * map means implicitly RESOLVED state. If item is found it always
-    * indicates state other than RESOLVED.
+    * This describes the type of auxiliary compression that is intended to be
+    * used by this miptree.  An aux usage of ISL_AUX_USAGE_NONE means that
+    * auxiliary compression is permanently disabled.  An aux usage other than
+    * ISL_AUX_USAGE_NONE does not imply that the auxiliary buffer has actually
+    * been allocated nor does it imply that auxiliary compression will always
+    * be enabled for this surface.  For instance, with CCS_D, we may allocate
+    * the CCS on-the-fly and it may not be used for texturing if the miptree
+    * is fully resolved.
     */
-   struct exec_list hiz_map; /* List of intel_resolve_map. */
-   struct exec_list color_resolve_map; /* List of intel_resolve_map. */
+   enum isl_aux_usage aux_usage;
+
+   /**
+    * \brief Whether or not this miptree supports fast clears.
+    */
+   bool supports_fast_clear;
+
+   /**
+    * \brief Maps miptree slices to their current aux state
+    *
+    * This two-dimensional array is indexed as [level][layer] and stores an
+    * aux state for each slice.
+    */
+   enum isl_aux_state **aux_state;
 
    /**
     * \brief Stencil miptree for depthstencil textures.
@@ -645,39 +334,10 @@
    struct intel_mipmap_tree *plane[2];
 
    /**
-    * The SURFACE_STATE bits associated with the last fast color clear to this
-    * color mipmap tree, if any.
-    *
-    * Prior to GEN9 there is a single bit for RGBA clear values which gives you
-    * the option of 2^4 clear colors. Each bit determines if the color channel
-    * is fully saturated or unsaturated (Cherryview does add a 32b value per
-    * channel, but it is globally applied instead of being part of the render
-    * surface state). Starting with GEN9, the surface state accepts a 32b value
-    * for each color channel.
-    *
-    * @see RENDER_SURFACE_STATE.RedClearColor
-    * @see RENDER_SURFACE_STATE.GreenClearColor
-    * @see RENDER_SURFACE_STATE.BlueClearColor
-    * @see RENDER_SURFACE_STATE.AlphaClearColor
+    * Fast clear color for this surface.  For depth surfaces, the clear value
+    * is stored as a float32 in the red component.
     */
-   union {
-      uint32_t fast_clear_color_value;
-      union gl_color_union gen9_fast_clear_color;
-   };
-
-   /**
-    * Disable allocation of auxiliary buffers, such as the HiZ buffer and MCS
-    * buffer. This is useful for sharing the miptree bo with an external client
-    * that doesn't understand auxiliary buffers.
-    */
-   enum intel_aux_disable aux_disable;
-
-   /**
-    * Tells if the underlying buffer is to be also consumed by entities other
-    * than the driver. This allows logic to turn off features such as lossless
-    * compression which is not currently understood by client applications.
-    */
-   bool is_scanout;
+   union isl_color_value fast_clear_color;
 
    /* These are also refcounted:
     */
@@ -685,39 +345,32 @@
 };
 
 bool
-intel_miptree_is_lossless_compressed(const struct brw_context *brw,
-                                     const struct intel_mipmap_tree *mt);
+intel_miptree_alloc_ccs(struct brw_context *brw,
+                        struct intel_mipmap_tree *mt);
 
-bool
-intel_tiling_supports_non_msrt_mcs(const struct brw_context *brw,
-                                   unsigned tiling);
+enum intel_miptree_create_flags {
+   /** No miptree create flags */
+   MIPTREE_CREATE_DEFAULT  = 0,
 
-bool
-intel_miptree_supports_non_msrt_fast_clear(struct brw_context *brw,
-                                           const struct intel_mipmap_tree *mt);
+   /** Miptree creation should try to allocate a currently busy BO
+    *
+    * This may be advantageous if we know the next thing to touch the BO will
+    * be the GPU because the BO will likely already be in the GTT and maybe
+    * even in some caches.  If there is a chance that the next thing to touch
+    * the miptree BO will be the CPU, this flag should not be set.
+    */
+   MIPTREE_CREATE_BUSY     = 1 << 0,
 
-bool
-intel_miptree_supports_lossless_compressed(struct brw_context *brw,
-                                           const struct intel_mipmap_tree *mt);
+   /** Create a linear (not tiled) miptree */
+   MIPTREE_CREATE_LINEAR   = 1 << 1,
 
-bool
-intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
-                                 struct intel_mipmap_tree *mt,
-                                 bool is_lossless_compressed);
-
-enum {
-   MIPTREE_LAYOUT_ACCELERATED_UPLOAD       = 1 << 0,
-   MIPTREE_LAYOUT_GEN6_HIZ_STENCIL         = 1 << 1,
-   MIPTREE_LAYOUT_FOR_BO                   = 1 << 2,
-   MIPTREE_LAYOUT_DISABLE_AUX              = 1 << 3,
-   MIPTREE_LAYOUT_FORCE_HALIGN16           = 1 << 4,
-
-   MIPTREE_LAYOUT_TILING_Y                 = 1 << 5,
-   MIPTREE_LAYOUT_TILING_NONE              = 1 << 6,
-   MIPTREE_LAYOUT_TILING_ANY               = MIPTREE_LAYOUT_TILING_Y |
-                                             MIPTREE_LAYOUT_TILING_NONE,
-
-   MIPTREE_LAYOUT_FOR_SCANOUT              = 1 << 7,
+   /** Create the miptree with auxiliary compression disabled
+    *
+    * This does not prevent the caller of intel_miptree_create from coming
+    * along later and turning auxiliary compression back on but it does mean
+    * that the miptree will be created with mt->aux_usage == NONE.
+    */
+   MIPTREE_CREATE_NO_AUX   = 1 << 2,
 };
 
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
@@ -729,7 +382,7 @@
                                                GLuint height0,
                                                GLuint depth0,
                                                GLuint num_samples,
-                                               uint32_t flags);
+                                               enum intel_miptree_create_flags flags);
 
 struct intel_mipmap_tree *
 intel_miptree_create_for_bo(struct brw_context *brw,
@@ -740,12 +393,19 @@
                             uint32_t height,
                             uint32_t depth,
                             int pitch,
-                            uint32_t layout_flags);
+                            enum intel_miptree_create_flags flags);
 
-void
+struct intel_mipmap_tree *
+intel_miptree_create_for_dri_image(struct brw_context *brw,
+                                   __DRIimage *image,
+                                   GLenum target,
+                                   enum isl_colorspace colorspace,
+                                   bool is_winsys_image);
+
+bool
 intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
                                          struct intel_renderbuffer *irb,
-                                         struct brw_bo *bo,
+                                         struct intel_mipmap_tree *singlesample_mt,
                                          uint32_t width, uint32_t height,
                                          uint32_t pitch);
 
@@ -770,19 +430,10 @@
 intel_lower_compressed_format(struct brw_context *brw, mesa_format format);
 
 /** \brief Assert that the level and layer are valid for the miptree. */
-static inline void
+void
 intel_miptree_check_level_layer(const struct intel_mipmap_tree *mt,
                                 uint32_t level,
-                                uint32_t layer)
-{
-   (void) mt;
-   (void) level;
-   (void) layer;
-
-   assert(level >= mt->first_level);
-   assert(level <= mt->last_level);
-   assert(layer < mt->level[level].depth);
-}
+                                uint32_t layer);
 
 void intel_miptree_reference(struct intel_mipmap_tree **dst,
                              struct intel_mipmap_tree *src);
@@ -803,36 +454,23 @@
 get_isl_surf_dim(GLenum target);
 
 enum isl_dim_layout
-get_isl_dim_layout(const struct gen_device_info *devinfo, uint32_t tiling,
-                   GLenum target);
+get_isl_dim_layout(const struct gen_device_info *devinfo,
+                   enum isl_tiling tiling, GLenum target);
 
-enum isl_tiling
-intel_miptree_get_isl_tiling(const struct intel_mipmap_tree *mt);
-
-void
-intel_miptree_get_isl_surf(struct brw_context *brw,
-                           const struct intel_mipmap_tree *mt,
-                           struct isl_surf *surf);
-void
-intel_miptree_get_aux_isl_surf(struct brw_context *brw,
-                               const struct intel_mipmap_tree *mt,
-                               struct isl_surf *surf,
-                               enum isl_aux_usage *usage);
-
-union isl_color_value
-intel_miptree_get_isl_clear_color(struct brw_context *brw,
-                                  const struct intel_mipmap_tree *mt);
+enum isl_aux_usage
+intel_miptree_get_aux_isl_usage(const struct brw_context *brw,
+                                const struct intel_mipmap_tree *mt);
 
 void
 intel_get_image_dims(struct gl_texture_image *image,
                      int *width, int *height, int *depth);
 
 void
-intel_get_tile_masks(uint32_t tiling, uint32_t cpp,
+intel_get_tile_masks(enum isl_tiling tiling, uint32_t cpp,
                      uint32_t *mask_x, uint32_t *mask_y);
 
 void
-intel_get_tile_dims(uint32_t tiling, uint32_t cpp,
+intel_get_tile_dims(enum isl_tiling tiling, uint32_t cpp,
                     uint32_t *tile_w, uint32_t *tile_h);
 
 uint32_t
@@ -844,13 +482,12 @@
 intel_miptree_get_aligned_offset(const struct intel_mipmap_tree *mt,
                                  uint32_t x, uint32_t y);
 
-void intel_miptree_set_level_info(struct intel_mipmap_tree *mt,
-                                  GLuint level,
-                                  GLuint x, GLuint y, GLuint d);
-
-void intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
-                                    GLuint level,
-                                    GLuint img, GLuint x, GLuint y);
+void
+intel_miptree_copy_slice(struct brw_context *brw,
+                         struct intel_mipmap_tree *src_mt,
+                         unsigned src_level, unsigned src_layer,
+                         struct intel_mipmap_tree *dst_mt,
+                         unsigned dst_level, unsigned dst_layer);
 
 void
 intel_miptree_copy_teximage(struct brw_context *brw,
@@ -865,10 +502,6 @@
  * functions on a miptree without HiZ. In that case, each function is a no-op.
  */
 
-bool
-intel_miptree_wants_hiz_buffer(struct brw_context *brw,
-			       struct intel_mipmap_tree *mt);
-
 /**
  * \brief Allocate the miptree's embedded HiZ miptree.
  * \see intel_mipmap_tree:hiz_mt
@@ -879,100 +512,157 @@
 			struct intel_mipmap_tree *mt);
 
 bool
-intel_miptree_level_has_hiz(struct intel_mipmap_tree *mt, uint32_t level);
-
-void
-intel_miptree_slice_set_needs_hiz_resolve(struct intel_mipmap_tree *mt,
-                                          uint32_t level,
-					  uint32_t depth);
-void
-intel_miptree_slice_set_needs_depth_resolve(struct intel_mipmap_tree *mt,
-                                            uint32_t level,
-					    uint32_t depth);
-
-void
-intel_miptree_set_all_slices_need_depth_resolve(struct intel_mipmap_tree *mt,
-                                                uint32_t level);
-
-/**
- * \return false if no resolve was needed
- */
-bool
-intel_miptree_slice_resolve_hiz(struct brw_context *brw,
-				struct intel_mipmap_tree *mt,
-				unsigned int level,
-				unsigned int depth);
-
-/**
- * \return false if no resolve was needed
- */
-bool
-intel_miptree_slice_resolve_depth(struct brw_context *brw,
-				  struct intel_mipmap_tree *mt,
-				  unsigned int level,
-				  unsigned int depth);
-
-/**
- * \return false if no resolve was needed
- */
-bool
-intel_miptree_all_slices_resolve_hiz(struct brw_context *brw,
-				     struct intel_mipmap_tree *mt);
-
-/**
- * \return false if no resolve was needed
- */
-bool
-intel_miptree_all_slices_resolve_depth(struct brw_context *brw,
-				       struct intel_mipmap_tree *mt);
+intel_miptree_level_has_hiz(const struct intel_mipmap_tree *mt, uint32_t level);
 
 /**\}*/
 
-enum intel_fast_clear_state
-intel_miptree_get_fast_clear_state(const struct intel_mipmap_tree *mt,
-                                   unsigned level, unsigned layer);
-
-void
-intel_miptree_set_fast_clear_state(const struct brw_context *brw,
-                                   struct intel_mipmap_tree *mt,
-                                   unsigned level,
-                                   unsigned first_layer,
-                                   unsigned num_layers,
-                                   enum intel_fast_clear_state new_state);
-
 bool
 intel_miptree_has_color_unresolved(const struct intel_mipmap_tree *mt,
                                    unsigned start_level, unsigned num_levels,
                                    unsigned start_layer, unsigned num_layers);
 
-/**
- * Update the fast clear state for a miptree to indicate that it has been used
- * for rendering.
- */
-void
-intel_miptree_used_for_rendering(const struct brw_context *brw,
-                                 struct intel_mipmap_tree *mt, unsigned level,
-                                 unsigned start_layer, unsigned num_layers);
 
-/**
- * Flag values telling color resolve pass which special types of buffers
- * can be ignored.
+#define INTEL_REMAINING_LAYERS UINT32_MAX
+#define INTEL_REMAINING_LEVELS UINT32_MAX
+
+/** Prepare a miptree for access
  *
- * INTEL_MIPTREE_IGNORE_CCS_E:   Lossless compressed (single-sample
- *                               compression scheme since gen9)
+ * This function should be called prior to any access to miptree in order to
+ * perform any needed resolves.
+ *
+ * \param[in]  start_level    The first mip level to be accessed
+ *
+ * \param[in]  num_levels     The number of miplevels to be accessed or
+ *                            INTEL_REMAINING_LEVELS to indicate every level
+ *                            above start_level will be accessed
+ *
+ * \param[in]  start_layer    The first array slice or 3D layer to be accessed
+ *
+ * \param[in]  num_layers     The number of array slices or 3D layers to be
+ *                            accessed or INTEL_REMAINING_LAYERS to indicate
+ *                            every layer above start_layer will be accessed
+ *
+ * \param[in]  aux_usage      The auxiliary compression usage the access
+ *                            will support;  this must be ISL_AUX_USAGE_NONE
+ *                            for uncompressed miptrees
+ *
+ * \param[in]  fast_clear_supported Whether or not the access will support
+ *                                  fast clears in the miptree's auxiliary
+ *                                  compression format
  */
-#define INTEL_MIPTREE_IGNORE_CCS_E (1 << 0)
-
-bool
-intel_miptree_resolve_color(struct brw_context *brw,
-                            struct intel_mipmap_tree *mt, unsigned level,
-                            unsigned start_layer, unsigned num_layers,
-                            int flags);
-
 void
-intel_miptree_all_slices_resolve_color(struct brw_context *brw,
-                                       struct intel_mipmap_tree *mt,
-                                       int flags);
+intel_miptree_prepare_access(struct brw_context *brw,
+                             struct intel_mipmap_tree *mt,
+                             uint32_t start_level, uint32_t num_levels,
+                             uint32_t start_layer, uint32_t num_layers,
+                             enum isl_aux_usage aux_usage,
+                             bool fast_clear_supported);
+
+/** Complete a write operation
+ *
+ * This function should be called after any operation writes to a miptree.
+ * This will update the miptree's compression state so that future resolves
+ * happen correctly.  Technically, this function can be called before the
+ * write occurs but the caller must ensure that they don't interlace
+ * intel_miptree_prepare_access and intel_miptree_finish_write calls to
+ * overlapping layer/level ranges.
+ *
+ * \param[in]  level             The mip level that was written
+ *
+ * \param[in]  start_layer       The first array slice or 3D layer written
+ *
+ * \param[in]  num_layers        The number of array slices or 3D layers
+ *                               written or INTEL_REMAINING_LAYERS to indicate
+ *                               every layer above start_layer was written
+ *
+ * \param[in]  aux_usage         The auxiliary compression usage that the
+ *                               write was done with
+ */
+void
+intel_miptree_finish_write(struct brw_context *brw,
+                           struct intel_mipmap_tree *mt, uint32_t level,
+                           uint32_t start_layer, uint32_t num_layers,
+                           enum isl_aux_usage aux_usage);
+
+/** Get the auxiliary compression state of a miptree slice */
+enum isl_aux_state
+intel_miptree_get_aux_state(const struct intel_mipmap_tree *mt,
+                            uint32_t level, uint32_t layer);
+
+/** Set the auxiliary compression state of a miptree slice range
+ *
+ * This function directly sets the auxiliary compression state of a slice
+ * range of a miptree.  It only modifies data structures and does not do any
+ * resolves.  This should only be called by code which directly performs
+ * compression operations such as fast clears and resolves.  Most code should
+ * use intel_miptree_prepare_access or intel_miptree_finish_write.
+ */
+void
+intel_miptree_set_aux_state(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt, uint32_t level,
+                            uint32_t start_layer, uint32_t num_layers,
+                            enum isl_aux_state aux_state);
+
+/** Prepare a single miptree slice for raw (compression-unaware) access
+ *
+ * Performs any needed resolves so the slice can be accessed by something that
+ * knows nothing about auxiliary compression, e.g. when mapping the surface or
+ * using the blitter.  If \p write is true, the write is also recorded.
+ */
+static inline void
+intel_miptree_access_raw(struct brw_context *brw,
+                         struct intel_mipmap_tree *mt,
+                         uint32_t level, uint32_t layer,
+                         bool write)
+{
+   intel_miptree_prepare_access(brw, mt, level, 1, layer, 1,
+                                ISL_AUX_USAGE_NONE, false);
+   if (write)
+      intel_miptree_finish_write(brw, mt, level, layer, 1, ISL_AUX_USAGE_NONE);
+}
+
+enum isl_aux_usage
+intel_miptree_texture_aux_usage(struct brw_context *brw,
+                                struct intel_mipmap_tree *mt,
+                                enum isl_format view_format);
+void
+intel_miptree_prepare_texture(struct brw_context *brw,
+                              struct intel_mipmap_tree *mt,
+                              enum isl_format view_format,
+                              bool *aux_supported_out);
+void
+intel_miptree_prepare_image(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt);
+void
+intel_miptree_prepare_fb_fetch(struct brw_context *brw,
+                               struct intel_mipmap_tree *mt, uint32_t level,
+                               uint32_t start_layer, uint32_t num_layers);
+enum isl_aux_usage
+intel_miptree_render_aux_usage(struct brw_context *brw,
+                               struct intel_mipmap_tree *mt,
+                               enum isl_format render_format,
+                               bool blend_enabled);
+void
+intel_miptree_prepare_render(struct brw_context *brw,
+                             struct intel_mipmap_tree *mt, uint32_t level,
+                             uint32_t start_layer, uint32_t layer_count,
+                             enum isl_format render_format,
+                             bool blend_enabled);
+void
+intel_miptree_finish_render(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt, uint32_t level,
+                            uint32_t start_layer, uint32_t layer_count,
+                            enum isl_format render_format,
+                            bool blend_enabled);
+void
+intel_miptree_prepare_depth(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt, uint32_t level,
+                            uint32_t start_layer, uint32_t layer_count);
+void
+intel_miptree_finish_depth(struct brw_context *brw,
+                           struct intel_mipmap_tree *mt, uint32_t level,
+                           uint32_t start_layer, uint32_t layer_count,
+                           bool depth_written);
 
 void
 intel_miptree_make_shareable(struct brw_context *brw,
@@ -987,24 +677,6 @@
 intel_update_r8stencil(struct brw_context *brw,
                        struct intel_mipmap_tree *mt);
 
-/**
- * Horizontal distance from one slice to the next in the two-dimensional
- * miptree layout.
- */
-unsigned
-brw_miptree_get_horizontal_slice_pitch(const struct brw_context *brw,
-                                       const struct intel_mipmap_tree *mt,
-                                       unsigned level);
-
-/**
- * Vertical distance from one slice to the next in the two-dimensional miptree
- * layout.
- */
-unsigned
-brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw,
-                                     const struct intel_mipmap_tree *mt,
-                                     unsigned level);
-
 bool
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
diff --git a/src/mesa/drivers/dri/i965/intel_pixel.c b/src/mesa/drivers/dri/i965/intel_pixel.c
index d4f86fd..c69c3cc 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel.c
@@ -26,6 +26,7 @@
 #include "main/accum.h"
 #include "main/enums.h"
 #include "main/state.h"
+#include "main/stencil.h"
 #include "main/bufferobj.h"
 #include "main/context.h"
 #include "swrast/swrast.h"
@@ -58,7 +59,7 @@
    if (ctx->NewState)
       _mesa_update_state(ctx);
 
-   if (ctx->FragmentProgram._Enabled) {
+   if (_mesa_arb_fragment_program_enabled(ctx)) {
       DBG("fallback due to fragment program\n");
       return false;
    }
@@ -107,7 +108,7 @@
       return false;
    }
 
-   if (ctx->Stencil._Enabled) {
+   if (_mesa_stencil_is_enabled(ctx)) {
       DBG("fallback due to image stencil\n");
       return false;
    }
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c b/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
index 4522d28..8d467ad 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
@@ -256,7 +256,7 @@
    /* The blitter has no idea about fast color clears, so we need to resolve
     * the miptree before we do anything.
     */
-   intel_miptree_all_slices_resolve_color(brw, irb->mt, 0);
+   intel_miptree_access_raw(brw, irb->mt, irb->mt_level, irb->mt_layer, true);
 
    /* Chop it all into chunks that can be digested by hardware: */
    for (py = 0; py < height; py += DY) {
@@ -292,10 +292,10 @@
 						(GLubyte *)stipple,
 						sz,
 						color,
-						irb->mt->pitch,
+						irb->mt->surf.row_pitch,
 						irb->mt->bo,
-						0,
-						irb->mt->tiling,
+						irb->mt->offset,
+						irb->mt->surf.tiling,
 						dstx + px,
 						dsty + py,
 						w, h,
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_copy.c b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
index 05c35bd..3f8df30 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
@@ -25,6 +25,7 @@
 
 #include "main/image.h"
 #include "main/state.h"
+#include "main/stencil.h"
 #include "main/mtypes.h"
 #include "main/condrender.h"
 #include "main/fbobject.h"
@@ -100,7 +101,7 @@
       return false;
    }
 
-   if (draw_irb->mt->num_samples > 1 || read_irb->mt->num_samples > 1) {
+   if (draw_irb->mt->surf.samples > 1 || read_irb->mt->surf.samples > 1) {
       perf_debug("glCopyPixels() fallback: multisampled buffers\n");
       return false;
    }
@@ -115,14 +116,14 @@
       return false;
    }
 
-   if (ctx->Stencil._Enabled) {
+   if (brw->stencil_enabled) {
       perf_debug("glCopyPixels(): Unsupported stencil test state\n");
       return false;
    }
 
    if (ctx->Fog.Enabled ||
        ctx->Texture._MaxEnabledTexImageUnit != -1 ||
-       ctx->FragmentProgram._Enabled) {
+       _mesa_arb_fragment_program_enabled(ctx)) {
       perf_debug("glCopyPixels(): Unsupported fragment shader state\n");
       return false;
    }
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_draw.c b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
index e84e473..81299da 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
@@ -109,7 +109,7 @@
 				    format, type, 0, 0, 0);
 
    src_buffer = intel_bufferobj_buffer(brw, src, src_offset,
-                                       height * src_stride);
+                                       height * src_stride, false);
 
    struct intel_mipmap_tree *pbo_mt =
       intel_miptree_create_for_bo(brw,
@@ -118,7 +118,7 @@
                                   src_offset,
                                   width, height, 1,
                                   src_stride,
-                                  0);
+                                  MIPTREE_CREATE_DEFAULT);
    if (!pbo_mt)
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index 8793c3e..cd4fbab 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -84,8 +84,6 @@
    /* The miptree's buffer. */
    struct brw_bo *bo;
 
-   int error = 0;
-
    uint32_t cpp;
    mem_copy_fn mem_copy = NULL;
 
@@ -129,16 +127,27 @@
       return false;
 
    if (!irb->mt ||
-       (irb->mt->tiling != I915_TILING_X &&
-       irb->mt->tiling != I915_TILING_Y)) {
+       (irb->mt->surf.tiling != ISL_TILING_X &&
+        irb->mt->surf.tiling != ISL_TILING_Y0)) {
       /* The algorithm is written only for X- or Y-tiled memory. */
       return false;
    }
 
+   /* tiled_to_linear() assumes that if the object is swizzled, it is using
+    * I915_BIT6_SWIZZLE_9_10 for X and I915_BIT6_SWIZZLE_9 for Y.  This is only
+    * true on gen5 and above.
+    *
+    * The killer on top is that some gen4 have an L-shaped swizzle mode, where
+    * parts of the memory aren't swizzled at all. Userspace just can't handle
+    * that.
+    */
+   if (brw->gen < 5 && brw->has_swizzling)
+      return false;
+
    /* Since we are going to read raw data to the miptree, we need to resolve
     * any pending fast color clears before we start.
     */
-   intel_miptree_all_slices_resolve_color(brw, irb->mt, 0);
+   intel_miptree_access_raw(brw, irb->mt, irb->mt_level, irb->mt_layer, false);
 
    bo = irb->mt->bo;
 
@@ -147,14 +156,17 @@
       intel_batchbuffer_flush(brw);
    }
 
-   error = brw_bo_map(brw, bo, false /* write enable */);
-   if (error) {
+   void *map = brw_bo_map(brw, bo, MAP_READ | MAP_RAW);
+   if (map == NULL) {
       DBG("%s: failed to map bo\n", __func__);
       return false;
    }
 
-   xoffset += irb->mt->level[irb->mt_level].slice[irb->mt_layer].x_offset;
-   yoffset += irb->mt->level[irb->mt_level].slice[irb->mt_layer].y_offset;
+   unsigned slice_offset_x, slice_offset_y;
+   intel_miptree_get_image_offset(irb->mt, irb->mt_level, irb->mt_layer,
+                                  &slice_offset_x, &slice_offset_y);
+   xoffset += slice_offset_x;
+   yoffset += slice_offset_y;
 
    dst_pitch = _mesa_image_row_stride(pack, width, format, type);
 
@@ -180,7 +192,7 @@
        "mesa_format=0x%x tiling=%d "
        "pack=(alignment=%d row_length=%d skip_pixels=%d skip_rows=%d)\n",
        __func__, xoffset, yoffset, width, height,
-       format, type, rb->Format, irb->mt->tiling,
+       format, type, rb->Format, irb->mt->surf.tiling,
        pack->Alignment, pack->RowLength, pack->SkipPixels,
        pack->SkipRows);
 
@@ -188,10 +200,10 @@
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
       pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
-      bo->virtual + irb->mt->offset,
-      dst_pitch, irb->mt->pitch,
+      map + irb->mt->offset,
+      dst_pitch, irb->mt->surf.row_pitch,
       brw->has_swizzling,
-      irb->mt->tiling,
+      irb->mt->surf.tiling,
       mem_copy
    );
 
diff --git a/src/mesa/drivers/dri/i965/intel_resolve_map.c b/src/mesa/drivers/dri/i965/intel_resolve_map.c
deleted file mode 100644
index 5b132ca..0000000
--- a/src/mesa/drivers/dri/i965/intel_resolve_map.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "intel_resolve_map.h"
-
-#include <stdlib.h>
-
-/**
- * \brief Set that the miptree slice at (level, layer) needs a resolve.
- *
- * If a map element already exists with the given key, then the value is
- * changed to the given value of \c need.
- */
-void
-intel_resolve_map_set(struct exec_list *resolve_map,
-                      uint32_t level,
-                      uint32_t layer,
-                      unsigned need)
-{
-   foreach_list_typed(struct intel_resolve_map, map, link, resolve_map) {
-      if (map->level == level && map->layer == layer) {
-         map->need = need;
-	 return;
-      }
-   }
-
-   struct intel_resolve_map *m = malloc(sizeof(struct intel_resolve_map));
-   exec_node_init(&m->link);
-   m->level = level;
-   m->layer = layer;
-   m->need = need;
-
-   exec_list_push_tail(resolve_map, &m->link);
-}
-
-/**
- * \brief Get an element from the map.
- * \return null if element is not contained in map.
- */
-const struct intel_resolve_map *
-intel_resolve_map_find_any(const struct exec_list *resolve_map,
-                           uint32_t start_level, uint32_t num_levels,
-                           uint32_t start_layer, uint32_t num_layers)
-{
-   foreach_list_typed(struct intel_resolve_map, map, link, resolve_map) {
-      if (map->level >= start_level &&
-          map->level < (start_level + num_levels) &&
-          map->layer >= start_layer &&
-          map->layer < (start_layer + num_layers))
-         return map;
-   }
-
-   return NULL;
-}
-
-/**
- * \brief Remove and free an element from the map.
- */
-void
-intel_resolve_map_remove(struct intel_resolve_map *elem)
-{
-   exec_node_remove(&elem->link);
-   free(elem);
-}
-
-/**
- * \brief Remove and free all elements of the map.
- */
-void
-intel_resolve_map_clear(struct exec_list *resolve_map)
-{
-   foreach_in_list_safe(struct exec_node, node, resolve_map) {
-      free(node);
-   }
-
-   exec_list_make_empty(resolve_map);
-}
diff --git a/src/mesa/drivers/dri/i965/intel_resolve_map.h b/src/mesa/drivers/dri/i965/intel_resolve_map.h
deleted file mode 100644
index 17d3983..0000000
--- a/src/mesa/drivers/dri/i965/intel_resolve_map.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef INTEL_RESLVE_MAP_H
-#define INTEL_RESLVE_MAP_H
-
-#include <stdint.h>
-#include "blorp/blorp.h"
-#include "compiler/glsl/list.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Enum for keeping track of the fast clear state of a buffer associated with
- * a miptree.
- *
- * Fast clear works by deferring the memory writes that would be used to clear
- * the buffer, so that instead of performing them at the time of the clear
- * operation, the hardware automatically performs them at the time that the
- * buffer is later accessed for rendering.  The MCS buffer keeps track of
- * which regions of the buffer still have pending clear writes.
- *
- * This enum keeps track of the driver's knowledge of pending fast clears in
- * the MCS buffer.
- *
- * MCS buffers only exist on Gen7+.
- */
-enum intel_fast_clear_state
-{
-   /**
-    * No deferred clears are pending for this miptree, and the contents of the
-    * color buffer are entirely correct.  An MCS buffer may or may not exist
-    * for this miptree.  If it does exist, it is entirely in the "no deferred
-    * clears pending" state.  If it does not exist, it will be created the
-    * first time a fast color clear is executed.
-    *
-    * In this state, the color buffer can be used for purposes other than
-    * rendering without needing a render target resolve.
-    *
-    * Since there is no such thing as a "fast color clear resolve" for MSAA
-    * buffers, an MSAA buffer will never be in this state.
-    */
-   INTEL_FAST_CLEAR_STATE_RESOLVED,
-
-   /**
-    * An MCS buffer exists for this miptree, and deferred clears are pending
-    * for some regions of the color buffer, as indicated by the MCS buffer.
-    * The contents of the color buffer are only correct for the regions where
-    * the MCS buffer doesn't indicate a deferred clear.
-    *
-    * If a single-sample buffer is in this state, a render target resolve must
-    * be performed before it can be used for purposes other than rendering.
-    */
-   INTEL_FAST_CLEAR_STATE_UNRESOLVED,
-
-   /**
-    * An MCS buffer exists for this miptree, and deferred clears are pending
-    * for the entire color buffer, and the contents of the MCS buffer reflect
-    * this.  The contents of the color buffer are undefined.
-    *
-    * If a single-sample buffer is in this state, a render target resolve must
-    * be performed before it can be used for purposes other than rendering.
-    *
-    * If the client attempts to clear a buffer which is already in this state,
-    * the clear can be safely skipped, since the buffer is already clear.
-    */
-   INTEL_FAST_CLEAR_STATE_CLEAR,
-};
-
-/**
- * \brief Map of miptree slices to needed resolves.
- *
- * The map is implemented as a linear doubly-linked list.
- *
- * In the intel_resolve_map*() functions, the \c head argument is not
- * inspected for its data. It only serves as an anchor for the list.
- *
- * \par Design Discussion
- *
- *     There are two possible ways to record which miptree slices need
- *     resolves. 1) Maintain a flag for every miptree slice in the texture,
- *     likely in intel_mipmap_level::slice, or 2) maintain a list of only
- *     those slices that need a resolve.
- *
- *     Immediately before drawing, a full depth resolve performed on each
- *     enabled depth texture. If design 1 were chosen, then at each draw call
- *     it would be necessary to iterate over each miptree slice of each
- *     enabled depth texture in order to query if each slice needed a resolve.
- *     In the worst case, this would require 2^16 iterations: 16 texture
- *     units, 16 miplevels, and 256 depth layers (assuming maximums for OpenGL
- *     2.1).
- *
- *     By choosing design 2, the number of iterations is exactly the minimum
- *     necessary.
- */
-struct intel_resolve_map {
-   struct exec_node link;
-
-   uint32_t level;
-   uint32_t layer;
-
-   union {
-      enum blorp_hiz_op need;
-      enum intel_fast_clear_state fast_clear_state;
-   };
-};
-
-void
-intel_resolve_map_set(struct exec_list *resolve_map,
-                      uint32_t level,
-                      uint32_t layer,
-                      unsigned new_state);
-
-const struct intel_resolve_map *
-intel_resolve_map_find_any(const struct exec_list *resolve_map,
-                           uint32_t start_level, uint32_t num_levels,
-                           uint32_t start_layer, uint32_t num_layers);
-
-static inline const struct intel_resolve_map *
-intel_resolve_map_const_get(const struct exec_list *resolve_map,
-                            uint32_t level,
-                            uint32_t layer)
-{
-   return intel_resolve_map_find_any(resolve_map, level, 1, layer, 1);
-}
-
-static inline struct intel_resolve_map *
-intel_resolve_map_get(struct exec_list *resolve_map,
-		      uint32_t level,
-		      uint32_t layer)
-{
-   return (struct intel_resolve_map *)intel_resolve_map_find_any(
-                                         resolve_map, level, 1, layer, 1);
-}
-
-void
-intel_resolve_map_remove(struct intel_resolve_map *resolve_map);
-
-void
-intel_resolve_map_clear(struct exec_list *resolve_map);
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* INTEL_RESLVE_MAP_H */
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 0aba1be..d7f2a31 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -37,6 +37,7 @@
 #include "swrast/s_renderbuffer.h"
 #include "util/ralloc.h"
 #include "brw_defines.h"
+#include "brw_state.h"
 #include "compiler/nir/nir.h"
 
 #include "utils.h"
@@ -55,7 +56,6 @@
    .xml =
 DRI_CONF_BEGIN
    DRI_CONF_SECTION_PERFORMANCE
-      DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_ALWAYS_SYNC)
       /* Options correspond to DRI_CONF_BO_REUSE_DISABLED,
        * DRI_CONF_BO_REUSE_ALL
        */
@@ -65,6 +65,7 @@
 	    DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects")
 	 DRI_CONF_DESC_END
       DRI_CONF_OPT_END
+      DRI_CONF_MESA_NO_ERROR("false")
    DRI_CONF_SECTION_END
 
    DRI_CONF_SECTION_QUALITY
@@ -89,6 +90,7 @@
       DRI_CONF_DISABLE_BLEND_FUNC_EXTENDED("false")
       DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION("false")
       DRI_CONF_ALLOW_GLSL_EXTENSION_DIRECTIVE_MIDSHADER("false")
+      DRI_CONF_ALLOW_GLSL_BUILTIN_VARIABLE_REDECLARATION("false")
       DRI_CONF_ALLOW_HIGHER_COMPAT_VERSION("false")
       DRI_CONF_FORCE_GLSL_ABS_SQRT("false")
 
@@ -277,52 +279,59 @@
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
        { 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } },
 
-   /* For YUYV buffers, we set up two overlapping DRI images and treat
-    * them as planar buffers in the compositors.  Plane 0 is GR88 and
-    * samples YU or YV pairs and places Y into the R component, while
-    * plane 1 is ARGB and samples YUYV clusters and places pairs and
-    * places U into the G component and V into A.  This lets the
-    * texture sampler interpolate the Y components correctly when
-    * sampling from plane 0, and interpolate U and V correctly when
-    * sampling from plane 1. */
+   /* For YUYV and UYVY buffers, we set up two overlapping DRI images
+    * and treat them as planar buffers in the compositors.
+    * Plane 0 is GR88 and samples YU or YV pairs and places Y into
+    * the R component, while plane 1 is ARGB/ABGR and samples YUYV/UYVY
+    * clusters, placing U into the G component and
+    * V into A.  This lets the texture sampler interpolate the Y
+    * components correctly when sampling from plane 0, and interpolate
+    * U and V correctly when sampling from plane 1. */
    { __DRI_IMAGE_FOURCC_YUYV, __DRI_IMAGE_COMPONENTS_Y_XUXV, 2,
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 },
-       { 0, 1, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } }
+       { 0, 1, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } },
+   { __DRI_IMAGE_FOURCC_UYVY, __DRI_IMAGE_COMPONENTS_Y_UXVX, 2,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 },
+       { 0, 1, 0, __DRI_IMAGE_FORMAT_ABGR8888, 4 } } }
 };
 
 static const struct {
-   uint32_t tiling;
    uint64_t modifier;
-} tiling_modifier_map[] = {
-   { .tiling = I915_TILING_NONE, .modifier = DRM_FORMAT_MOD_LINEAR },
-   { .tiling = I915_TILING_X, .modifier = I915_FORMAT_MOD_X_TILED },
-   { .tiling = I915_TILING_Y, .modifier = I915_FORMAT_MOD_Y_TILED },
+   unsigned since_gen;
+} supported_modifiers[] = {
+   { .modifier = DRM_FORMAT_MOD_LINEAR       , .since_gen = 1 },
+   { .modifier = I915_FORMAT_MOD_X_TILED     , .since_gen = 1 },
+   { .modifier = I915_FORMAT_MOD_Y_TILED     , .since_gen = 6 },
 };
 
-static uint32_t
-modifier_to_tiling(uint64_t modifier)
+static bool
+modifier_is_supported(const struct gen_device_info *devinfo,
+                      uint64_t modifier)
 {
    int i;
 
-   for (i = 0; i < ARRAY_SIZE(tiling_modifier_map); i++) {
-      if (tiling_modifier_map[i].modifier == modifier)
-         return tiling_modifier_map[i].tiling;
+   for (i = 0; i < ARRAY_SIZE(supported_modifiers); i++) {
+      if (supported_modifiers[i].modifier != modifier)
+         continue;
+
+      return supported_modifiers[i].since_gen <= devinfo->gen;
    }
 
-   unreachable("modifier_to_tiling should only receive known modifiers");
+   return false;
 }
 
 static uint64_t
 tiling_to_modifier(uint32_t tiling)
 {
-   int i;
+   static const uint64_t map[] = {
+      [I915_TILING_NONE]   = DRM_FORMAT_MOD_LINEAR,
+      [I915_TILING_X]      = I915_FORMAT_MOD_X_TILED,
+      [I915_TILING_Y]      = I915_FORMAT_MOD_Y_TILED,
+   };
 
-   for (i = 0; i < ARRAY_SIZE(tiling_modifier_map); i++) {
-      if (tiling_modifier_map[i].tiling == tiling)
-         return tiling_modifier_map[i].modifier;
-   }
+   assert(tiling < ARRAY_SIZE(map));
 
-   unreachable("tiling_to_modifier received unknown tiling mode");
+   return map[tiling];
 }
 
 static void
@@ -402,9 +411,11 @@
 
    intel_miptree_check_level_layer(mt, level, zoffset);
 
-   image->width = minify(mt->physical_width0, level - mt->first_level);
-   image->height = minify(mt->physical_height0, level - mt->first_level);
-   image->pitch = mt->pitch;
+   image->width = minify(mt->surf.phys_level0_sa.width,
+                         level - mt->first_level);
+   image->height = minify(mt->surf.phys_level0_sa.height,
+                          level - mt->first_level);
+   image->pitch = mt->surf.row_pitch;
 
    image->offset = intel_miptree_get_tile_offsets(mt, level, zoffset,
                                                   &image->tile_x,
@@ -471,7 +482,8 @@
 
    image->internal_format = rb->InternalFormat;
    image->format = rb->Format;
-   image->modifier = tiling_to_modifier(irb->mt->tiling);
+   image->modifier = tiling_to_modifier(
+                        isl_tiling_to_i915_tiling(irb->mt->surf.tiling));
    image->offset = 0;
    image->data = loaderPrivate;
    brw_bo_unreference(image->bo);
@@ -479,7 +491,7 @@
    brw_bo_reference(irb->mt->bo);
    image->width = rb->Width;
    image->height = rb->Height;
-   image->pitch = irb->mt->pitch;
+   image->pitch = irb->mt->surf.row_pitch;
    image->dri_format = driGLFormatToImageFormat(image->format);
    image->has_depthstencil = irb->mt->stencil_mt? true : false;
 
@@ -533,7 +545,8 @@
 
    image->internal_format = obj->Image[face][level]->InternalFormat;
    image->format = obj->Image[face][level]->TexFormat;
-   image->modifier = tiling_to_modifier(iobj->mt->tiling);
+   image->modifier = tiling_to_modifier(
+                        isl_tiling_to_i915_tiling(iobj->mt->surf.tiling));
    image->data = loaderPrivate;
    intel_setup_image_from_mipmap_tree(brw, image, iobj->mt, level, zoffset);
    image->dri_format = driGLFormatToImageFormat(image->format);
@@ -577,6 +590,9 @@
    enum modifier_priority prio = MODIFIER_PRIORITY_INVALID;
 
    for (int i = 0; i < count; i++) {
+      if (!modifier_is_supported(devinfo, modifiers[i]))
+         continue;
+
       switch (modifiers[i]) {
       case I915_FORMAT_MOD_Y_TILED:
          prio = MAX2(prio, MODIFIER_PRIORITY_Y);
@@ -606,8 +622,8 @@
 {
    __DRIimage *image;
    struct intel_screen *screen = dri_screen->driverPrivate;
-   uint32_t tiling;
-   int cpp;
+   uint64_t modifier = DRM_FORMAT_MOD_INVALID;
+   bool ok;
 
    /* Callers of this may specify a modifier, or a dri usage, but not both. The
     * newer modifier interface deprecates the older usage flags newer modifier
@@ -615,44 +631,70 @@
     */
    assert(!(use && count));
 
-   uint64_t modifier = select_best_modifier(&screen->devinfo, modifiers, count);
-   if (modifier == DRM_FORMAT_MOD_INVALID) {
-      /* User requested specific modifiers, none of which work */
-      if (modifiers)
-         return NULL;
-
-      /* Historically, X-tiled was the default, and so lack of modifier means
-       * X-tiled.
-       */
-      tiling = I915_TILING_X;
-   } else {
-      /* select_best_modifier has found a modifier we support */
-      tiling = modifier_to_tiling(modifier);
-   }
-
    if (use & __DRI_IMAGE_USE_CURSOR) {
       if (width != 64 || height != 64)
 	 return NULL;
-      tiling = I915_TILING_NONE;
+      modifier = DRM_FORMAT_MOD_LINEAR;
    }
 
    if (use & __DRI_IMAGE_USE_LINEAR)
-      tiling = I915_TILING_NONE;
+      modifier = DRM_FORMAT_MOD_LINEAR;
+
+   if (modifier == DRM_FORMAT_MOD_INVALID) {
+      if (modifiers) {
+         /* User requested specific modifiers */
+         modifier = select_best_modifier(&screen->devinfo, modifiers, count);
+         if (modifier == DRM_FORMAT_MOD_INVALID)
+            return NULL;
+      } else {
+         /* Historically, X-tiled was the default, and so lack of modifier means
+          * X-tiled.
+          */
+         modifier = I915_FORMAT_MOD_X_TILED;
+      }
+   }
 
    image = intel_allocate_image(screen, format, loaderPrivate);
    if (image == NULL)
       return NULL;
 
-   cpp = _mesa_get_format_bytes(image->format);
-   image->bo = brw_bo_alloc_tiled(screen->bufmgr, "image",
-                                  width, height, cpp, tiling,
-                                  &image->pitch, 0);
+   const struct isl_drm_modifier_info *mod_info =
+      isl_drm_modifier_get_info(modifier);
+
+   struct isl_surf surf;
+   ok = isl_surf_init(&screen->isl_dev, &surf,
+                      .dim = ISL_SURF_DIM_2D,
+                      .format = brw_isl_format_for_mesa_format(image->format),
+                      .width = width,
+                      .height = height,
+                      .depth = 1,
+                      .levels = 1,
+                      .array_len = 1,
+                      .samples = 1,
+                      .usage = ISL_SURF_USAGE_RENDER_TARGET_BIT |
+                               ISL_SURF_USAGE_TEXTURE_BIT |
+                               ISL_SURF_USAGE_STORAGE_BIT,
+                      .tiling_flags = (1 << mod_info->tiling));
+   assert(ok);
+   if (!ok) {
+      free(image);
+      return NULL;
+   }
+
+   /* We request that the bufmgr zero because, if a buffer gets re-used from
+    * the pool, we don't want to leak random garbage from our process to some
+    * other.
+    */
+   image->bo = brw_bo_alloc_tiled(screen->bufmgr, "image", surf.size,
+                                  isl_tiling_to_i915_tiling(mod_info->tiling),
+                                  surf.row_pitch, BO_ALLOC_ZEROED);
    if (image->bo == NULL) {
       free(image);
       return NULL;
    }
    image->width = width;
    image->height = height;
+   image->pitch = surf.row_pitch;
    image->modifier = modifier;
 
    return image;
@@ -806,28 +848,29 @@
 }
 
 static __DRIimage *
-intel_create_image_from_fds(__DRIscreen *dri_screen,
-                            int width, int height, int fourcc,
-                            int *fds, int num_fds, int *strides, int *offsets,
-                            void *loaderPrivate)
+intel_create_image_from_fds_common(__DRIscreen *dri_screen,
+                                   int width, int height, int fourcc,
+                                   uint64_t modifier, int *fds, int num_fds,
+                                   int *strides, int *offsets,
+                                   void *loaderPrivate)
 {
    struct intel_screen *screen = dri_screen->driverPrivate;
    struct intel_image_format *f;
    __DRIimage *image;
    int i, index;
+   bool ok;
 
    if (fds == NULL || num_fds < 1)
       return NULL;
 
-   /* We only support all planes from the same bo */
-   for (i = 0; i < num_fds; i++)
-      if (fds[0] != fds[i])
-         return NULL;
-
    f = intel_image_format_lookup(fourcc);
    if (f == NULL)
       return NULL;
 
+   if (modifier != DRM_FORMAT_MOD_INVALID &&
+       !modifier_is_supported(&screen->devinfo, modifier))
+      return NULL;
+
    if (f->nplanes == 1)
       image = intel_allocate_image(screen, f->planes[0].dri_format,
                                    loaderPrivate);
@@ -843,25 +886,77 @@
    image->pitch = strides[0];
 
    image->planar_format = f;
+
+   image->bo = brw_bo_gem_create_from_prime(screen->bufmgr, fds[0]);
+   if (image->bo == NULL) {
+      free(image);
+      return NULL;
+   }
+
+   /* We only support all planes from the same bo.
+    * brw_bo_gem_create_from_prime() should return the same pointer for all
+    * fds received here */
+   for (i = 1; i < num_fds; i++) {
+      struct brw_bo *aux = brw_bo_gem_create_from_prime(screen->bufmgr, fds[i]);
+      brw_bo_unreference(aux);
+      if (aux != image->bo) {
+         brw_bo_unreference(image->bo);
+         free(image);
+         return NULL;
+      }
+   }
+
+   if (modifier != DRM_FORMAT_MOD_INVALID)
+      image->modifier = modifier;
+   else
+      image->modifier = tiling_to_modifier(image->bo->tiling_mode);
+
    int size = 0;
    for (i = 0; i < f->nplanes; i++) {
       index = f->planes[i].buffer_index;
       image->offsets[index] = offsets[index];
       image->strides[index] = strides[index];
 
-      const int plane_height = height >> f->planes[i].height_shift;
-      const int end = offsets[index] + plane_height * strides[index];
+      const struct isl_drm_modifier_info *mod_info =
+         isl_drm_modifier_get_info(image->modifier);
+
+      mesa_format format = driImageFormatToGLFormat(f->planes[i].dri_format);
+
+      struct isl_surf surf;
+      ok = isl_surf_init(&screen->isl_dev, &surf,
+                         .dim = ISL_SURF_DIM_2D,
+                         .format = brw_isl_format_for_mesa_format(format),
+                         .width = image->width >> f->planes[i].width_shift,
+                         .height = image->height >> f->planes[i].height_shift,
+                         .depth = 1,
+                         .levels = 1,
+                         .array_len = 1,
+                         .samples = 1,
+                         .row_pitch = strides[index],
+                         .usage = ISL_SURF_USAGE_RENDER_TARGET_BIT |
+                                  ISL_SURF_USAGE_TEXTURE_BIT |
+                                  ISL_SURF_USAGE_STORAGE_BIT,
+                         .tiling_flags = (1 << mod_info->tiling));
+      if (!ok) {
+         brw_bo_unreference(image->bo);
+         free(image);
+         return NULL;
+      }
+
+      const int end = offsets[index] + surf.size;
       if (size < end)
          size = end;
    }
 
-   image->bo = brw_bo_gem_create_from_prime(screen->bufmgr,
-                                                  fds[0], size);
-   if (image->bo == NULL) {
+   /* Check that the requested image actually fits within the BO. 'size'
+    * is already relative to the offsets, so we don't need to add that. */
+   if (image->bo->size == 0) {
+      image->bo->size = size;
+   } else if (size > image->bo->size) {
+      brw_bo_unreference(image->bo);
       free(image);
       return NULL;
    }
-   image->modifier = tiling_to_modifier(image->bo->tiling_mode);
 
    if (f->nplanes == 1) {
       image->offset = image->offsets[0];
@@ -872,16 +967,29 @@
 }
 
 static __DRIimage *
-intel_create_image_from_dma_bufs(__DRIscreen *dri_screen,
-                                 int width, int height, int fourcc,
-                                 int *fds, int num_fds,
-                                 int *strides, int *offsets,
-                                 enum __DRIYUVColorSpace yuv_color_space,
-                                 enum __DRISampleRange sample_range,
-                                 enum __DRIChromaSiting horizontal_siting,
-                                 enum __DRIChromaSiting vertical_siting,
-                                 unsigned *error,
-                                 void *loaderPrivate)
+intel_create_image_from_fds(__DRIscreen *dri_screen,
+                            int width, int height, int fourcc,
+                            int *fds, int num_fds, int *strides, int *offsets,
+                            void *loaderPrivate)
+{
+   return intel_create_image_from_fds_common(dri_screen, width, height, fourcc,
+                                             DRM_FORMAT_MOD_INVALID,
+                                             fds, num_fds, strides, offsets,
+                                             loaderPrivate);
+}
+
+static __DRIimage *
+intel_create_image_from_dma_bufs2(__DRIscreen *dri_screen,
+                                  int width, int height,
+                                  int fourcc, uint64_t modifier,
+                                  int *fds, int num_fds,
+                                  int *strides, int *offsets,
+                                  enum __DRIYUVColorSpace yuv_color_space,
+                                  enum __DRISampleRange sample_range,
+                                  enum __DRIChromaSiting horizontal_siting,
+                                  enum __DRIChromaSiting vertical_siting,
+                                  unsigned *error,
+                                  void *loaderPrivate)
 {
    __DRIimage *image;
    struct intel_image_format *f = intel_image_format_lookup(fourcc);
@@ -891,9 +999,10 @@
       return NULL;
    }
 
-   image = intel_create_image_from_fds(dri_screen, width, height, fourcc, fds,
-                                       num_fds, strides, offsets,
-                                       loaderPrivate);
+   image = intel_create_image_from_fds_common(dri_screen, width, height,
+                                              fourcc, modifier,
+                                              fds, num_fds, strides, offsets,
+                                              loaderPrivate);
 
    /*
     * Invalid parameters and any inconsistencies between are assumed to be
@@ -916,6 +1025,95 @@
 }
 
 static __DRIimage *
+intel_create_image_from_dma_bufs(__DRIscreen *dri_screen,
+                                 int width, int height, int fourcc,
+                                 int *fds, int num_fds,
+                                 int *strides, int *offsets,
+                                 enum __DRIYUVColorSpace yuv_color_space,
+                                 enum __DRISampleRange sample_range,
+                                 enum __DRIChromaSiting horizontal_siting,
+                                 enum __DRIChromaSiting vertical_siting,
+                                 unsigned *error,
+                                 void *loaderPrivate)
+{
+   return intel_create_image_from_dma_bufs2(dri_screen, width, height,
+                                            fourcc, DRM_FORMAT_MOD_INVALID,
+                                            fds, num_fds, strides, offsets,
+                                            yuv_color_space,
+                                            sample_range,
+                                            horizontal_siting,
+                                            vertical_siting,
+                                            error,
+                                            loaderPrivate);
+}
+
+static GLboolean
+intel_query_dma_buf_formats(__DRIscreen *screen, int max,
+                            int *formats, int *count)
+{
+   int i, j = 0;
+
+   if (max == 0) {
+      *count = ARRAY_SIZE(intel_image_formats) - 1; /* not SARGB */
+      return true;
+   }
+
+   for (i = 0; i < (ARRAY_SIZE(intel_image_formats)) && j < max; i++) {
+     if (intel_image_formats[i].fourcc == __DRI_IMAGE_FOURCC_SARGB8888)
+       continue;
+     formats[j++] = intel_image_formats[i].fourcc;
+   }
+
+   *count = j;
+   return true;
+}
+
+static GLboolean
+intel_query_dma_buf_modifiers(__DRIscreen *_screen, int fourcc, int max,
+                              uint64_t *modifiers,
+                              unsigned int *external_only,
+                              int *count)
+{
+   struct intel_screen *screen = _screen->driverPrivate;
+   struct intel_image_format *f;
+   int num_mods = 0, i;
+
+   f = intel_image_format_lookup(fourcc);
+   if (f == NULL)
+      return false;
+
+   for (i = 0; i < ARRAY_SIZE(supported_modifiers); i++) {
+      uint64_t modifier = supported_modifiers[i].modifier;
+      if (!modifier_is_supported(&screen->devinfo, modifier))
+         continue;
+
+      num_mods++;
+      if (max == 0)
+         continue;
+
+      modifiers[num_mods - 1] = modifier;
+      if (num_mods >= max)
+        break;
+   }
+
+   if (external_only != NULL) {
+      for (i = 0; i < num_mods && i < max; i++) {
+         if (f->components == __DRI_IMAGE_COMPONENTS_Y_U_V ||
+             f->components == __DRI_IMAGE_COMPONENTS_Y_UV ||
+             f->components == __DRI_IMAGE_COMPONENTS_Y_XUXV) {
+            external_only[i] = GL_TRUE;
+         }
+         else {
+            external_only[i] = GL_FALSE;
+         }
+      }
+   }
+
+   *count = num_mods;
+   return true;
+}
+
+static __DRIimage *
 intel_from_planar(__DRIimage *parent, int plane, void *loaderPrivate)
 {
     int width, height, offset, stride, dri_format, index;
@@ -962,7 +1160,7 @@
 }
 
 static const __DRIimageExtension intelImageExtension = {
-    .base = { __DRI_IMAGE, 14 },
+    .base = { __DRI_IMAGE, 15 },
 
     .createImageFromName                = intel_create_image_from_name,
     .createImageFromRenderbuffer        = intel_create_image_from_renderbuffer,
@@ -981,6 +1179,9 @@
     .mapImage                           = NULL,
     .unmapImage                         = NULL,
     .createImageWithModifiers           = intel_create_image_with_modifiers,
+    .createImageFromDmaBufs2            = intel_create_image_from_dma_bufs2,
+    .queryDmaBufFormats                 = intel_query_dma_buf_formats,
+    .queryDmaBufModifiers               = intel_query_dma_buf_modifiers,
 };
 
 static uint64_t
@@ -1086,6 +1287,7 @@
     &intelImageExtension.base,
     &intelRendererQueryExtension.base,
     &dri2ConfigQueryExtension.base,
+    &dri2NoErrorExtension.base,
     NULL
 };
 
@@ -1097,6 +1299,7 @@
     &intelRendererQueryExtension.base,
     &dri2ConfigQueryExtension.base,
     &dri2Robustness.base,
+    &dri2NoErrorExtension.base,
     NULL
 };
 
@@ -1151,7 +1354,11 @@
 
 
 /**
- * This is called when we need to set up GL rendering to a new X window.
+ * Create a gl_framebuffer and attach it to __DRIdrawable::driverPrivate.
+ *
+ * This implements driDriverAPI::createNewDrawable, which the DRI layer calls
+ * when creating an EGLSurface, GLXDrawable, or GLXPixmap. Despite the name,
+ * this does not allocate GPU memory.
  */
 static GLboolean
 intelCreateBuffer(__DRIscreen *dri_screen,
@@ -1164,12 +1371,11 @@
    mesa_format rgbFormat;
    unsigned num_samples =
       intel_quantize_num_samples(screen, mesaVis->samples);
-   struct gl_framebuffer *fb;
 
    if (isPixmap)
       return false;
 
-   fb = CALLOC_STRUCT(gl_framebuffer);
+   struct gl_framebuffer *fb = CALLOC_STRUCT(gl_framebuffer);
    if (!fb)
       return false;
 
@@ -1196,12 +1402,12 @@
    }
 
    /* setup the hardware-based renderbuffers */
-   rb = intel_create_renderbuffer(rgbFormat, num_samples);
-   _mesa_add_renderbuffer_without_ref(fb, BUFFER_FRONT_LEFT, &rb->Base.Base);
+   rb = intel_create_winsys_renderbuffer(screen, rgbFormat, num_samples);
+   _mesa_attach_and_own_rb(fb, BUFFER_FRONT_LEFT, &rb->Base.Base);
 
    if (mesaVis->doubleBufferMode) {
-      rb = intel_create_renderbuffer(rgbFormat, num_samples);
-      _mesa_add_renderbuffer_without_ref(fb, BUFFER_BACK_LEFT, &rb->Base.Base);
+      rb = intel_create_winsys_renderbuffer(screen, rgbFormat, num_samples);
+      _mesa_attach_and_own_rb(fb, BUFFER_BACK_LEFT, &rb->Base.Base);
    }
 
    /*
@@ -1213,29 +1419,30 @@
       assert(mesaVis->stencilBits == 8);
 
       if (screen->devinfo.has_hiz_and_separate_stencil) {
-         rb = intel_create_private_renderbuffer(MESA_FORMAT_Z24_UNORM_X8_UINT,
+         rb = intel_create_private_renderbuffer(screen,
+                                                MESA_FORMAT_Z24_UNORM_X8_UINT,
                                                 num_samples);
-         _mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, &rb->Base.Base);
-         rb = intel_create_private_renderbuffer(MESA_FORMAT_S_UINT8,
+         _mesa_attach_and_own_rb(fb, BUFFER_DEPTH, &rb->Base.Base);
+         rb = intel_create_private_renderbuffer(screen, MESA_FORMAT_S_UINT8,
                                                 num_samples);
-         _mesa_add_renderbuffer_without_ref(fb, BUFFER_STENCIL,
-                                            &rb->Base.Base);
+         _mesa_attach_and_own_rb(fb, BUFFER_STENCIL, &rb->Base.Base);
       } else {
          /*
           * Use combined depth/stencil. Note that the renderbuffer is
           * attached to two attachment points.
           */
-         rb = intel_create_private_renderbuffer(MESA_FORMAT_Z24_UNORM_S8_UINT,
+         rb = intel_create_private_renderbuffer(screen,
+                                                MESA_FORMAT_Z24_UNORM_S8_UINT,
                                                 num_samples);
-         _mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, &rb->Base.Base);
-         _mesa_add_renderbuffer(fb, BUFFER_STENCIL, &rb->Base.Base);
+         _mesa_attach_and_own_rb(fb, BUFFER_DEPTH, &rb->Base.Base);
+         _mesa_attach_and_reference_rb(fb, BUFFER_STENCIL, &rb->Base.Base);
       }
    }
    else if (mesaVis->depthBits == 16) {
       assert(mesaVis->stencilBits == 0);
-      rb = intel_create_private_renderbuffer(MESA_FORMAT_Z_UNORM16,
+      rb = intel_create_private_renderbuffer(screen, MESA_FORMAT_Z_UNORM16,
                                              num_samples);
-      _mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, &rb->Base.Base);
+      _mesa_attach_and_own_rb(fb, BUFFER_DEPTH, &rb->Base.Base);
    }
    else {
       assert(mesaVis->depthBits == 0);
@@ -1330,8 +1537,8 @@
    uint32_t tiling = I915_TILING_X;
    uint32_t swizzle_mode = 0;
 
-   buffer = brw_bo_alloc_tiled(screen->bufmgr, "swizzle test",
-                               64, 64, 4, tiling, &aligned_pitch, flags);
+   buffer = brw_bo_alloc_tiled_2d(screen->bufmgr, "swizzle test",
+                                  64, 64, 4, tiling, &aligned_pitch, flags);
    if (buffer == NULL)
       return false;
 
@@ -1405,6 +1612,7 @@
    struct brw_bo *results, *bo;
    uint32_t *batch;
    uint32_t offset = 0;
+   void *map;
    bool success = false;
 
    /* Create a zero'ed temporary buffer for reading our results */
@@ -1416,10 +1624,11 @@
    if (bo == NULL)
       goto err_results;
 
-   if (brw_bo_map(NULL, bo, 1))
+   map = brw_bo_map(NULL, bo, MAP_WRITE);
+   if (!map)
       goto err_batch;
 
-   batch = bo->virtual;
+   batch = map;
 
    /* Write the register. */
    *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
@@ -1430,7 +1639,7 @@
    *batch++ = MI_STORE_REGISTER_MEM | (3 - 2);
    *batch++ = reg;
    struct drm_i915_gem_relocation_entry reloc = {
-      .offset = (char *) batch - (char *) bo->virtual,
+      .offset = (char *) batch - (char *) map,
       .delta = offset * sizeof(uint32_t),
       .target_handle = results->gem_handle,
       .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
@@ -1461,7 +1670,7 @@
    struct drm_i915_gem_execbuffer2 execbuf = {
       .buffers_ptr = (uintptr_t) exec_objects,
       .buffer_count = 2,
-      .batch_len = ALIGN((char *) batch - (char *) bo->virtual, 8),
+      .batch_len = ALIGN((char *) batch - (char *) map, 8),
       .flags = I915_EXEC_RENDER,
    };
 
@@ -1472,8 +1681,9 @@
    drmIoctl(dri_screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 
    /* Check whether the value got written. */
-   if (brw_bo_map(NULL, results, false) == 0) {
-      success = *((uint32_t *)results->virtual + offset) == expected_value;
+   void *results_map = brw_bo_map(NULL, results, MAP_READ);
+   if (results_map) {
+      success = *((uint32_t *)results_map + offset) == expected_value;
       brw_bo_unmap(results);
    }
 
@@ -1542,7 +1752,28 @@
    static const mesa_format formats[] = {
       MESA_FORMAT_B5G6R5_UNORM,
       MESA_FORMAT_B8G8R8A8_UNORM,
-      MESA_FORMAT_B8G8R8X8_UNORM
+      MESA_FORMAT_B8G8R8X8_UNORM,
+
+      /* The 32-bit RGBA format must not precede the 32-bit BGRA format.
+       * Likewise for RGBX and BGRX.  Otherwise, the GLX client and the GLX
+       * server may disagree on which format the GLXFBConfig represents,
+       * resulting in swapped color channels.
+       *
+       * The problem, as of 2017-05-30:
+       * When matching a GLXFBConfig to a __DRIconfig, GLX ignores the channel
+       * order and chooses the first __DRIconfig with the expected channel
+       * sizes. Specifically, GLX compares the GLXFBConfig's and __DRIconfig's
+       * __DRI_ATTRIB_{CHANNEL}_SIZE but ignores __DRI_ATTRIB_{CHANNEL}_MASK.
+       *
+       * EGL does not suffer from this problem. It correctly compares the
+       * channel masks when matching EGLConfig to __DRIconfig.
+       */
+
+      /* Required by Android, for HAL_PIXEL_FORMAT_RGBA_8888. */
+      MESA_FORMAT_R8G8B8A8_UNORM,
+
+      /* Required by Android, for HAL_PIXEL_FORMAT_RGBX_8888. */
+      MESA_FORMAT_R8G8B8X8_UNORM,
    };
 
    /* GLX_SWAP_COPY_OML is not supported due to page flipping. */
@@ -1679,6 +1910,7 @@
    const bool has_astc = screen->devinfo.gen >= 9;
 
    switch (screen->devinfo.gen) {
+   case 10:
    case 9:
    case 8:
       dri_screen->max_gl_core_version = 45;
@@ -1801,6 +2033,7 @@
       { "bdw", 0x162e },
       { "skl", 0x1912 },
       { "kbl", 0x5912 },
+      { "cnl", 0x5a52 },
    };
 
    for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) {
@@ -1808,7 +2041,7 @@
          return name_map[i].pci_id;
    }
 
-   return strtod(devid_override, NULL);
+   return strtol(devid_override, NULL, 0);
 }
 
 /**
@@ -1920,6 +2153,9 @@
    screen->hw_has_swizzling = intel_detect_swizzling(screen);
    screen->hw_has_timestamp = intel_detect_timestamp(screen);
 
+   isl_device_init(&screen->isl_dev, &screen->devinfo,
+                   screen->hw_has_swizzling);
+
    /* GENs prior to 8 do not support EU/Subslice info */
    if (devinfo->gen >= 8) {
       intel_detect_sseu(screen);
@@ -2030,6 +2266,14 @@
       screen->cmd_parser_version = 0;
    }
 
+   /* Kernel 4.13 required for exec object capture */
+#ifndef I915_PARAM_HAS_EXEC_CAPTURE
+#define I915_PARAM_HAS_EXEC_CAPTURE 45
+#endif
+   if (intel_get_boolean(screen, I915_PARAM_HAS_EXEC_CAPTURE)) {
+      screen->kernel_features |= KERNEL_ALLOWS_EXEC_CAPTURE;
+   }
+
    if (!intel_detect_pipelined_so(screen)) {
       /* We can't do anything, so the effective version is 0. */
       screen->cmd_parser_version = 0;
@@ -2098,11 +2342,14 @@
    screen->compiler = brw_compiler_create(screen, devinfo);
    screen->compiler->shader_debug_log = shader_debug_log_mesa;
    screen->compiler->shader_perf_log = shader_perf_log_mesa;
+   screen->compiler->constant_buffer_0_is_relative = true;
    screen->program_id = 1;
 
    screen->has_exec_fence =
      intel_get_boolean(screen, I915_PARAM_HAS_EXEC_FENCE);
 
+   intel_screen_init_surface_formats(screen);
+
    return (const __DRIconfig**) intel_screen_make_configs(dri_screen);
 }
 
@@ -2131,13 +2378,13 @@
     * through to here. */
    uint32_t pitch;
    int cpp = format / 8;
-   intelBuffer->bo = brw_bo_alloc_tiled(screen->bufmgr,
-                                        "intelAllocateBuffer",
-                                        width,
-                                        height,
-                                        cpp,
-                                        I915_TILING_X, &pitch,
-                                        BO_ALLOC_FOR_RENDER);
+   intelBuffer->bo = brw_bo_alloc_tiled_2d(screen->bufmgr,
+                                           "intelAllocateBuffer",
+                                           width,
+                                           height,
+                                           cpp,
+                                           I915_TILING_X, &pitch,
+                                           BO_ALLOC_FOR_RENDER);
 
    if (intelBuffer->bo == NULL) {
 	   free(intelBuffer);
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index fe0e044..0980c8f 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -31,12 +31,15 @@
 
 #include <GL/internal/dri_interface.h>
 
+#include "isl/isl.h"
 #include "dri_util.h"
 #include "brw_bufmgr.h"
 #include "common/gen_device_info.h"
 #include "i915_drm.h"
 #include "xmlconfig.h"
 
+#include "isl/isl.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -59,6 +62,8 @@
 
    int hw_has_timestamp;
 
+   struct isl_device isl_dev;
+
    /**
     * Does the kernel support context reset notifications?
     */
@@ -74,6 +79,7 @@
 #define KERNEL_ALLOWS_MI_MATH_AND_LRR               (1<<2)
 #define KERNEL_ALLOWS_HSW_SCRATCH1_AND_ROW_CHICKEN3 (1<<3)
 #define KERNEL_ALLOWS_COMPUTE_DISPATCH              (1<<4)
+#define KERNEL_ALLOWS_EXEC_CAPTURE                  (1<<5)
 
    struct brw_bufmgr *bufmgr;
 
@@ -106,6 +112,10 @@
     * Number of EUs reported by the I915_PARAM_EU_TOTAL parameter
     */
    int eu_total;
+
+   bool mesa_format_supports_texture[MESA_FORMAT_COUNT];
+   bool mesa_format_supports_render[MESA_FORMAT_COUNT];
+   enum isl_format mesa_to_isl_render_format[MESA_FORMAT_COUNT];
 };
 
 extern void intelDestroyContext(__DRIcontext * driContextPriv);
@@ -155,6 +165,12 @@
    return screen->kernel_features & KERNEL_ALLOWS_PREDICATE_WRITES;
 }
 
+static inline bool
+can_do_exec_capture(const struct intel_screen *screen)
+{
+   return screen->kernel_features & KERNEL_ALLOWS_EXEC_CAPTURE;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index 6da666c..94a7ad3 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -94,7 +94,9 @@
    } else {
       intel_image->mt = intel_miptree_create_for_teximage(brw, intel_texobj,
                                                           intel_image,
-                                                          0);
+                                                          MIPTREE_CREATE_DEFAULT);
+      if (!intel_image->mt)
+         return false;
 
       /* Even if the object currently has a mipmap tree associated
        * with it, this one is a more likely candidate to represent the
@@ -147,8 +149,8 @@
                                               first_image->TexFormat,
                                               0, levels - 1,
                                               width, height, depth,
-                                              num_samples,
-                                              MIPTREE_LAYOUT_TILING_ANY);
+                                              MAX2(num_samples, 1),
+                                              MIPTREE_CREATE_DEFAULT);
 
       if (intel_texobj->mt == NULL) {
          return false;
@@ -325,7 +327,7 @@
          return false;
       }
 
-      if (!brw->format_supported_as_render_target[image->TexFormat]) {
+      if (!brw->mesa_format_supports_render[image->TexFormat]) {
          perf_debug("Non-renderable PBO format; fallback to CPU mapping\n");
          return false;
       }
@@ -335,14 +337,15 @@
 
    struct brw_bo *bo = intel_bufferobj_buffer(brw, intel_buffer_obj,
                                              buffer_offset,
-                                             row_stride * image->Height);
+                                             row_stride * image->Height,
+                                             !read_only);
    intel_texobj->mt =
       intel_miptree_create_for_bo(brw, bo,
                                   image->TexFormat,
                                   buffer_offset,
                                   image->Width, image->Height, image->Depth,
                                   row_stride,
-                                  0);
+                                  MIPTREE_CREATE_DEFAULT);
    if (!intel_texobj->mt)
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_copy.c b/src/mesa/drivers/dri/i965/intel_tex_copy.c
index 9c255ae..4fe3585 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_copy.c
@@ -51,7 +51,13 @@
                        GLint x, GLint y, GLsizei width, GLsizei height)
 {
    const GLenum internalFormat = intelImage->base.Base.InternalFormat;
-   bool ret;
+
+   if (!intelImage->mt || !irb || !irb->mt) {
+      if (unlikely(INTEL_DEBUG & DEBUG_PERF))
+	 fprintf(stderr, "%s fail %p %p (0x%08x)\n",
+		 __func__, intelImage->mt, irb, internalFormat);
+      return false;
+   }
 
    /* No pixel transfer operations (zoom, bias, mapping), just a blit */
    if (brw->ctx._ImageTransferState)
@@ -70,32 +76,19 @@
    /* glCopyTexSubImage() can't be called on a multisampled texture. */
    assert(intelImage->base.Base.NumSamples == 0);
 
-   if (!intelImage->mt || !irb || !irb->mt) {
-      if (unlikely(INTEL_DEBUG & DEBUG_PERF))
-	 fprintf(stderr, "%s fail %p %p (0x%08x)\n",
-		 __func__, intelImage->mt, irb, internalFormat);
-      return false;
-   }
-
    /* account for view parameters and face index */
    int dst_level = intelImage->base.Base.Level +
                    intelImage->base.Base.TexObject->MinLevel;
    int dst_slice = slice + intelImage->base.Base.Face +
                    intelImage->base.Base.TexObject->MinLayer;
 
-   _mesa_unlock_texture(&brw->ctx, intelImage->base.Base.TexObject);
-
    /* blit from src buffer to texture */
-   ret = intel_miptree_blit(brw,
-                            irb->mt, irb->mt_level, irb->mt_layer,
-                            x, y, irb->Base.Base.Name == 0,
-                            intelImage->mt, dst_level, dst_slice,
-                            dstx, dsty, false,
-                            width, height, GL_COPY);
-
-   _mesa_lock_texture(&brw->ctx, intelImage->base.Base.TexObject);
-
-   return ret;
+   return intel_miptree_blit(brw,
+                             irb->mt, irb->mt_level, irb->mt_layer,
+                             x, y, irb->Base.Base.Name == 0,
+                             intelImage->mt, dst_level, dst_slice,
+                             dstx, dsty, false,
+                             width, height, GL_COPY);
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 7208d8e..298a256 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -52,15 +52,24 @@
 intel_miptree_create_for_teximage(struct brw_context *brw,
 				  struct intel_texture_object *intelObj,
 				  struct intel_texture_image *intelImage,
-                                  uint32_t layout_flags)
+                                  enum intel_miptree_create_flags flags)
 {
    GLuint lastLevel;
    int width, height, depth;
+   unsigned old_width = 0, old_height = 0, old_depth = 0;
    const struct intel_mipmap_tree *old_mt = intelObj->mt;
    const unsigned level = intelImage->base.Base.Level;
 
    intel_get_image_dims(&intelImage->base.Base, &width, &height, &depth);
 
+   if (old_mt) {
+      old_width = old_mt->surf.logical_level0_px.width;
+      old_height = old_mt->surf.logical_level0_px.height;
+      old_depth = old_mt->surf.dim == ISL_SURF_DIM_3D ?
+                     old_mt->surf.logical_level0_px.depth :
+                     old_mt->surf.logical_level0_px.array_len;
+   }
+
    DBG("%s\n", __func__);
 
    /* Figure out image dimensions at start level. */
@@ -72,19 +81,19 @@
       assert(level == 0);
       break;
    case GL_TEXTURE_3D:
-      depth = old_mt ? get_base_dim(old_mt->logical_depth0, depth, level) :
+      depth = old_mt ? get_base_dim(old_depth, depth, level) :
                        depth << level;
       /* Fall through */
    case GL_TEXTURE_2D:
    case GL_TEXTURE_2D_ARRAY:
    case GL_TEXTURE_CUBE_MAP:
    case GL_TEXTURE_CUBE_MAP_ARRAY:
-      height = old_mt ? get_base_dim(old_mt->logical_height0, height, level) :
+      height = old_mt ? get_base_dim(old_height, height, level) :
                         height << level;
       /* Fall through */
    case GL_TEXTURE_1D:
    case GL_TEXTURE_1D_ARRAY:
-      width = old_mt ? get_base_dim(old_mt->logical_width0, width, level) :
+      width = old_mt ? get_base_dim(old_width, width, level) :
                        width << level;
       break;
    default:
@@ -114,8 +123,8 @@
 			       width,
 			       height,
 			       depth,
-                               intelImage->base.Base.NumSamples,
-                               layout_flags | MIPTREE_LAYOUT_TILING_ANY);
+                               MAX2(intelImage->base.Base.NumSamples, 1),
+                               flags);
 }
 
 static void
@@ -186,14 +195,15 @@
    struct intel_texture_image *intel_image = intel_texture_image(image);
 
    _mesa_init_teximage_fields(&brw->ctx, image,
-			      mt->logical_width0, mt->logical_height0, 1,
-			      0, internal_format, mt->format);
+                              mt->surf.logical_level0_px.width,
+                              mt->surf.logical_level0_px.height, 1,
+                              0, internal_format, mt->format);
 
    brw->ctx.Driver.FreeTextureImageBuffer(&brw->ctx, image);
 
    intel_texobj->needs_validate = true;
-   intel_image->base.RowStride = mt->pitch / mt->cpp;
-   assert(mt->pitch % mt->cpp == 0);
+   intel_image->base.RowStride = mt->surf.row_pitch / mt->cpp;
+   assert(mt->surf.row_pitch % mt->cpp == 0);
 
    intel_miptree_reference(&intel_image->mt, mt);
 
@@ -201,94 +211,6 @@
    intel_miptree_reference(&intel_texobj->mt, mt);
 }
 
-static struct intel_mipmap_tree *
-create_mt_for_planar_dri_image(struct brw_context *brw,
-                               GLenum target, __DRIimage *image)
-{
-   struct intel_image_format *f = image->planar_format;
-   struct intel_mipmap_tree *planar_mt;
-
-   for (int i = 0; i < f->nplanes; i++) {
-      const int index = f->planes[i].buffer_index;
-      const uint32_t dri_format = f->planes[i].dri_format;
-      const mesa_format format = driImageFormatToGLFormat(dri_format);
-      const uint32_t width = image->width >> f->planes[i].width_shift;
-      const uint32_t height = image->height >> f->planes[i].height_shift;
-
-      /* Disable creation of the texture's aux buffers because the driver
-       * exposes no EGL API to manage them. That is, there is no API for
-       * resolving the aux buffer's content to the main buffer nor for
-       * invalidating the aux buffer's content.
-       */
-      struct intel_mipmap_tree *mt =
-         intel_miptree_create_for_bo(brw, image->bo, format,
-                                     image->offsets[index],
-                                     width, height, 1,
-                                     image->strides[index],
-                                     MIPTREE_LAYOUT_DISABLE_AUX);
-      if (mt == NULL)
-         return NULL;
-
-      mt->target = target;
-      mt->total_width = width;
-      mt->total_height = height;
-
-      if (i == 0)
-         planar_mt = mt;
-      else
-         planar_mt->plane[i - 1] = mt;
-   }
-
-   return planar_mt;
-}
-
-/**
- * Binds a BO to a texture image, as if it was uploaded by glTexImage2D().
- *
- * Used for GLX_EXT_texture_from_pixmap and EGL image extensions,
- */
-static struct intel_mipmap_tree *
-create_mt_for_dri_image(struct brw_context *brw,
-                        GLenum target, __DRIimage *image)
-{
-   struct intel_mipmap_tree *mt;
-   uint32_t draw_x, draw_y;
-
-   /* Disable creation of the texture's aux buffers because the driver exposes
-    * no EGL API to manage them. That is, there is no API for resolving the aux
-    * buffer's content to the main buffer nor for invalidating the aux buffer's
-    * content.
-    */
-   mt = intel_miptree_create_for_bo(brw, image->bo, image->format,
-                                    0, image->width, image->height, 1,
-                                    image->pitch,
-                                    MIPTREE_LAYOUT_DISABLE_AUX);
-   if (mt == NULL)
-      return NULL;
-
-   mt->target = target;
-   mt->total_width = image->width;
-   mt->total_height = image->height;
-   mt->level[0].slice[0].x_offset = image->tile_x;
-   mt->level[0].slice[0].y_offset = image->tile_y;
-
-   intel_miptree_get_tile_offsets(mt, 0, 0, &draw_x, &draw_y);
-
-   /* From "OES_EGL_image" error reporting. We report GL_INVALID_OPERATION
-    * for EGL images from non-tile aligned sufaces in gen4 hw and earlier which has
-    * trouble resolving back to destination image due to alignment issues.
-    */
-   if (!brw->has_surface_tile_offset &&
-       (draw_x != 0 || draw_y != 0)) {
-      _mesa_error(&brw->ctx, GL_INVALID_OPERATION, __func__);
-      intel_miptree_release(&mt);
-      return NULL;
-   }
-
-   mt->offset = image->offset;
-
-   return mt;
-}
 
 void
 intelSetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
@@ -339,12 +261,11 @@
    mt = intel_miptree_create_for_bo(brw, rb->mt->bo, texFormat, 0,
                                     rb->Base.Base.Width,
                                     rb->Base.Base.Height,
-                                    1, rb->mt->pitch, 0);
+                                    1, rb->mt->surf.row_pitch,
+                                    MIPTREE_CREATE_DEFAULT);
    if (mt == NULL)
        return;
    mt->target = target;
-   mt->total_width = rb->Base.Base.Width;
-   mt->total_height = rb->Base.Base.Height;
 
    _mesa_lock_texture(&brw->ctx, texObj);
    texImage = _mesa_get_tex_image(ctx, texObj, target, 0);
@@ -431,10 +352,8 @@
       return;
    }
 
-   if (image->planar_format && image->planar_format->nplanes > 0)
-      mt = create_mt_for_planar_dri_image(brw, target, image);
-   else
-      mt = create_mt_for_dri_image(brw, target, image);
+   mt = intel_miptree_create_for_dri_image(brw, image, target,
+                                           ISL_COLORSPACE_NONE, false);
    if (mt == NULL)
       return;
 
@@ -469,8 +388,6 @@
    /* The miptree's buffer. */
    struct brw_bo *bo;
 
-   int error = 0;
-
    uint32_t cpp;
    mem_copy_fn mem_copy = NULL;
 
@@ -514,16 +431,32 @@
       return false;
 
    if (!image->mt ||
-       (image->mt->tiling != I915_TILING_X &&
-       image->mt->tiling != I915_TILING_Y)) {
+       (image->mt->surf.tiling != ISL_TILING_X &&
+        image->mt->surf.tiling != ISL_TILING_Y0)) {
       /* The algorithm is written only for X- or Y-tiled memory. */
       return false;
    }
 
+   /* tiled_to_linear() assumes that if the object is swizzled, it is using
+    * I915_BIT6_SWIZZLE_9_10 for X and I915_BIT6_SWIZZLE_9 for Y.  This is only
+    * true on gen5 and above.
+    *
+    * The killer on top is that some gen4 have an L-shaped swizzle mode, where
+    * parts of the memory aren't swizzled at all. Userspace just can't handle
+    * that.
+    */
+   if (brw->gen < 5 && brw->has_swizzling)
+      return false;
+
+   int level = texImage->Level + texImage->TexObject->MinLevel;
+
    /* Since we are going to write raw data to the miptree, we need to resolve
     * any pending fast color clears before we start.
     */
-   intel_miptree_all_slices_resolve_color(brw, image->mt, 0);
+   assert(image->mt->surf.logical_level0_px.depth == 1);
+   assert(image->mt->surf.logical_level0_px.array_len == 1);
+
+   intel_miptree_access_raw(brw, image->mt, level, 0, true);
 
    bo = image->mt->bo;
 
@@ -532,8 +465,8 @@
       intel_batchbuffer_flush(brw);
    }
 
-   error = brw_bo_map(brw, bo, false /* write enable */);
-   if (error) {
+   void *map = brw_bo_map(brw, bo, MAP_READ | MAP_RAW);
+   if (map == NULL) {
       DBG("%s: failed to map bo\n", __func__);
       return false;
    }
@@ -544,24 +477,24 @@
        "mesa_format=0x%x tiling=%d "
        "packing=(alignment=%d row_length=%d skip_pixels=%d skip_rows=%d)\n",
        __func__, texImage->Level, xoffset, yoffset, width, height,
-       format, type, texImage->TexFormat, image->mt->tiling,
+       format, type, texImage->TexFormat, image->mt->surf.tiling,
        packing->Alignment, packing->RowLength, packing->SkipPixels,
        packing->SkipRows);
 
-   int level = texImage->Level + texImage->TexObject->MinLevel;
-
    /* Adjust x and y offset based on miplevel */
-   xoffset += image->mt->level[level].level_x;
-   yoffset += image->mt->level[level].level_y;
+   unsigned level_x, level_y;
+   intel_miptree_get_image_offset(image->mt, level, 0, &level_x, &level_y);
+   xoffset += level_x;
+   yoffset += level_y;
 
    tiled_to_linear(
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
       pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
-      bo->virtual,
-      dst_pitch, image->mt->pitch,
+      map,
+      dst_pitch, image->mt->surf.row_pitch,
       brw->has_swizzling,
-      image->mt->tiling,
+      image->mt->surf.tiling,
       mem_copy
    );
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index 9126222..5953e61 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -86,8 +86,6 @@
    /* The miptree's buffer. */
    struct brw_bo *bo;
 
-   int error = 0;
-
    uint32_t cpp;
    mem_copy_fn mem_copy = NULL;
 
@@ -130,16 +128,32 @@
       ctx->Driver.AllocTextureImageBuffer(ctx, texImage);
 
    if (!image->mt ||
-       (image->mt->tiling != I915_TILING_X &&
-       image->mt->tiling != I915_TILING_Y)) {
+       (image->mt->surf.tiling != ISL_TILING_X &&
+        image->mt->surf.tiling != ISL_TILING_Y0)) {
       /* The algorithm is written only for X- or Y-tiled memory. */
       return false;
    }
 
+   /* linear_to_tiled() assumes that if the object is swizzled, it is using
+    * I915_BIT6_SWIZZLE_9_10 for X and I915_BIT6_SWIZZLE_9 for Y.  This is only
+    * true on gen5 and above.
+    *
+    * The killer on top is that some gen4 have an L-shaped swizzle mode, where
+    * parts of the memory aren't swizzled at all. Userspace just can't handle
+    * that.
+    */
+   if (brw->gen < 5 && brw->has_swizzling)
+      return false;
+
+   int level = texImage->Level + texImage->TexObject->MinLevel;
+
    /* Since we are going to write raw data to the miptree, we need to resolve
     * any pending fast color clears before we start.
     */
-   intel_miptree_all_slices_resolve_color(brw, image->mt, 0);
+   assert(image->mt->surf.logical_level0_px.depth == 1);
+   assert(image->mt->surf.logical_level0_px.array_len == 1);
+
+   intel_miptree_access_raw(brw, image->mt, level, 0, true);
 
    bo = image->mt->bo;
 
@@ -148,8 +162,8 @@
       intel_batchbuffer_flush(brw);
    }
 
-   error = brw_bo_map(brw, bo, true /* write enable */);
-   if (error || bo->virtual == NULL) {
+   void *map = brw_bo_map(brw, bo, MAP_WRITE | MAP_RAW);
+   if (map == NULL) {
       DBG("%s: failed to map bo\n", __func__);
       return false;
    }
@@ -164,24 +178,24 @@
        "packing=(alignment=%d row_length=%d skip_pixels=%d skip_rows=%d) "
        "for_glTexImage=%d\n",
        __func__, texImage->Level, xoffset, yoffset, width, height,
-       format, type, texImage->TexFormat, image->mt->tiling,
+       format, type, texImage->TexFormat, image->mt->surf.tiling,
        packing->Alignment, packing->RowLength, packing->SkipPixels,
        packing->SkipRows, for_glTexImage);
 
-   int level = texImage->Level + texImage->TexObject->MinLevel;
-
    /* Adjust x and y offset based on miplevel */
-   xoffset += image->mt->level[level].level_x;
-   yoffset += image->mt->level[level].level_y;
+   unsigned level_x, level_y;
+   intel_miptree_get_image_offset(image->mt, level, 0, &level_x, &level_y);
+   xoffset += level_x;
+   yoffset += level_y;
 
    linear_to_tiled(
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
-      bo->virtual,
+      map,
       pixels - (ptrdiff_t) yoffset * src_pitch - (ptrdiff_t) xoffset * cpp,
-      image->mt->pitch, src_pitch,
+      image->mt->surf.row_pitch, src_pitch,
       brw->has_swizzling,
-      image->mt->tiling,
+      image->mt->surf.tiling,
       mem_copy
    );
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c
index 08cf3bf..16354d2 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c
@@ -136,8 +136,6 @@
                  _mesa_get_format_name(firstImage->base.Base.TexFormat),
                  width, height, depth, validate_last_level + 1);
 
-      const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
-                                    MIPTREE_LAYOUT_TILING_ANY;
       intelObj->mt = intel_miptree_create(brw,
                                           intelObj->base.Target,
 					  firstImage->base.Base.TexFormat,
@@ -146,8 +144,8 @@
                                           width,
                                           height,
                                           depth,
-                                          0 /* num_samples */,
-                                          layout_flags);
+                                          1 /* num_samples */,
+                                          MIPTREE_CREATE_BUSY);
       if (!intelObj->mt)
          return;
    }
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index c888e46..53a5679 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -633,7 +633,7 @@
                 char *dst, const char *src,
                 uint32_t dst_pitch, int32_t src_pitch,
                 bool has_swizzling,
-                uint32_t tiling,
+                enum isl_tiling tiling,
                 mem_copy_fn mem_copy)
 {
    tile_copy_fn tile_copy;
@@ -643,12 +643,12 @@
    uint32_t tw, th, span;
    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
 
-   if (tiling == I915_TILING_X) {
+   if (tiling == ISL_TILING_X) {
       tw = xtile_width;
       th = xtile_height;
       span = xtile_span;
       tile_copy = linear_to_xtiled_faster;
-   } else if (tiling == I915_TILING_Y) {
+   } else if (tiling == ISL_TILING_Y0) {
       tw = ytile_width;
       th = ytile_height;
       span = ytile_span;
@@ -724,7 +724,7 @@
                 char *dst, const char *src,
                 int32_t dst_pitch, uint32_t src_pitch,
                 bool has_swizzling,
-                uint32_t tiling,
+                enum isl_tiling tiling,
                 mem_copy_fn mem_copy)
 {
    tile_copy_fn tile_copy;
@@ -734,12 +734,12 @@
    uint32_t tw, th, span;
    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
 
-   if (tiling == I915_TILING_X) {
+   if (tiling == ISL_TILING_X) {
       tw = xtile_width;
       th = xtile_height;
       span = xtile_span;
       tile_copy = xtiled_to_linear_faster;
-   } else if (tiling == I915_TILING_Y) {
+   } else if (tiling == ISL_TILING_Y0) {
       tw = ytile_width;
       th = ytile_height;
       span = ytile_span;
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
index d9148bb..62ec884 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
@@ -43,7 +43,7 @@
                 char *dst, const char *src,
                 uint32_t dst_pitch, int32_t src_pitch,
                 bool has_swizzling,
-                uint32_t tiling,
+                enum isl_tiling tiling,
                 mem_copy_fn mem_copy);
 
 void
@@ -52,7 +52,7 @@
                 char *dst, const char *src,
                 int32_t dst_pitch, uint32_t src_pitch,
                 bool has_swizzling,
-                uint32_t tiling,
+                enum isl_tiling tiling,
                 mem_copy_fn mem_copy);
 
 bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
diff --git a/src/mesa/drivers/dri/i965/intel_upload.c b/src/mesa/drivers/dri/i965/intel_upload.c
index bb1e7d0..4b5d880 100644
--- a/src/mesa/drivers/dri/i965/intel_upload.c
+++ b/src/mesa/drivers/dri/i965/intel_upload.c
@@ -47,12 +47,14 @@
 void
 intel_upload_finish(struct brw_context *brw)
 {
+   assert((brw->upload.bo == NULL) == (brw->upload.map == NULL));
    if (!brw->upload.bo)
       return;
 
    brw_bo_unmap(brw->upload.bo);
    brw_bo_unreference(brw->upload.bo);
    brw->upload.bo = NULL;
+   brw->upload.map = NULL;
    brw->upload.next_offset = 0;
 }
 
@@ -94,13 +96,11 @@
       offset = 0;
    }
 
+   assert((brw->upload.bo == NULL) == (brw->upload.map == NULL));
    if (!brw->upload.bo) {
       brw->upload.bo = brw_bo_alloc(brw->bufmgr, "streamed data",
                                     MAX2(INTEL_UPLOAD_SIZE, size), 4096);
-      if (brw->has_llc)
-         brw_bo_map(brw, brw->upload.bo, true);
-      else
-         brw_bo_map_gtt(brw, brw->upload.bo);
+      brw->upload.map = brw_bo_map(brw, brw->upload.bo, MAP_READ | MAP_WRITE);
    }
 
    brw->upload.next_offset = offset + size;
@@ -112,7 +112,7 @@
       brw_bo_reference(brw->upload.bo);
    }
 
-   return brw->upload.bo->virtual + offset;
+   return brw->upload.map + offset;
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/libdrm_macros.h b/src/mesa/drivers/dri/i965/libdrm_macros.h
index 8be103c..2cb76d5 100644
--- a/src/mesa/drivers/dri/i965/libdrm_macros.h
+++ b/src/mesa/drivers/dri/i965/libdrm_macros.h
@@ -29,26 +29,13 @@
 #include <sys/mman.h>
 
 #if defined(ANDROID) && !defined(__LP64__)
-#include <errno.h> /* for EINVAL */
-
-extern void *__mmap2(void *, size_t, int, int, int, size_t);
-
-static inline void *drm_mmap(void *addr, size_t length, int prot, int flags,
-                             int fd, loff_t offset)
-{
-   /* offset must be aligned to 4096 (not necessarily the page size) */
-   if (offset & 4095) {
-      errno = EINVAL;
-      return MAP_FAILED;
-   }
-
-   return __mmap2(addr, length, prot, flags, fd, (size_t) (offset >> 12));
-}
+/* 32-bit needs mmap64 for 64-bit offsets */
+#  define drm_mmap(addr, length, prot, flags, fd, offset) \
+              mmap64(addr, length, prot, flags, fd, offset)
 
 #  define drm_munmap(addr, length) \
               munmap(addr, length)
 
-
 #else
 
 /* assume large file support exists */
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_context.c b/src/mesa/drivers/dri/nouveau/nouveau_context.c
index 6ddcadc..d6f9e53 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_context.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_context.c
@@ -63,7 +63,7 @@
 	struct nouveau_context *nctx;
 	struct gl_context *ctx;
 
-	if (flags & ~__DRI_CTX_FLAG_DEBUG) {
+	if (flags & ~(__DRI_CTX_FLAG_DEBUG | __DRI_CTX_FLAG_NO_ERROR)) {
 		*error = __DRI_CTX_ERROR_UNKNOWN_FLAG;
 		return false;
 	}
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_gldefs.h b/src/mesa/drivers/dri/nouveau/nouveau_gldefs.h
index 46ec14e..7df04c1 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_gldefs.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_gldefs.h
@@ -239,6 +239,25 @@
 }
 
 static inline unsigned
+nvgl_wrap_mode_nv20(unsigned wrap)
+{
+	switch (wrap) {
+	case GL_REPEAT:
+		return 0x1;
+	case GL_MIRRORED_REPEAT:
+		return 0x2;
+	case GL_CLAMP:
+		return 0x5;
+	case GL_CLAMP_TO_EDGE:
+		return 0x3;
+	case GL_CLAMP_TO_BORDER:
+		return 0x4;
+	default:
+		unreachable("Bad GL texture wrap mode");
+	}
+}
+
+static inline unsigned
 nvgl_filter_mode(unsigned filter)
 {
 	switch (filter) {
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_render_t.c b/src/mesa/drivers/dri/nouveau/nouveau_render_t.c
index 1625a87..db60b59 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_render_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_render_t.c
@@ -158,16 +158,16 @@
 		unsigned max_out;
 
 		if (ib) {
-			switch (ib->type) {
-			case GL_UNSIGNED_INT:
+			switch (ib->index_size) {
+			case 4:
 				max_out = MAX_OUT_I32;
 				break;
 
-			case GL_UNSIGNED_SHORT:
+			case 2:
 				max_out = MAX_OUT_I16;
 				break;
 
-			case GL_UNSIGNED_BYTE:
+			case 1:
 				max_out = MAX_OUT_I16;
 				break;
 
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_screen.c b/src/mesa/drivers/dri/nouveau/nouveau_screen.c
index 2dbd9d1..65caec2 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_screen.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_screen.c
@@ -259,27 +259,27 @@
 
 	/* Front buffer. */
 	rb = nouveau_renderbuffer_dri_new(color_format, drawable);
-	_mesa_add_renderbuffer_without_ref(fb, BUFFER_FRONT_LEFT, rb);
+	_mesa_attach_and_own_rb(fb, BUFFER_FRONT_LEFT, rb);
 
 	/* Back buffer */
 	if (visual->doubleBufferMode) {
 		rb = nouveau_renderbuffer_dri_new(color_format, drawable);
-		_mesa_add_renderbuffer_without_ref(fb, BUFFER_BACK_LEFT, rb);
+		_mesa_attach_and_own_rb(fb, BUFFER_BACK_LEFT, rb);
 	}
 
 	/* Depth/stencil buffer. */
 	if (visual->depthBits == 24 && visual->stencilBits == 8) {
 		rb = nouveau_renderbuffer_dri_new(GL_DEPTH24_STENCIL8_EXT, drawable);
-		_mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, rb);
-		_mesa_add_renderbuffer(fb, BUFFER_STENCIL, rb);
+		_mesa_attach_and_own_rb(fb, BUFFER_DEPTH, rb);
+		_mesa_attach_and_reference_rb(fb, BUFFER_STENCIL, rb);
 
 	} else if (visual->depthBits == 24) {
 		rb = nouveau_renderbuffer_dri_new(GL_DEPTH_COMPONENT24, drawable);
-		_mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, rb);
+		_mesa_attach_and_own_rb(fb, BUFFER_DEPTH, rb);
 
 	} else if (visual->depthBits == 16) {
 		rb = nouveau_renderbuffer_dri_new(GL_DEPTH_COMPONENT16, drawable);
-		_mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, rb);
+		_mesa_attach_and_own_rb(fb, BUFFER_DEPTH, rb);
 	}
 
 	/* Software renderbuffers. */
@@ -324,6 +324,7 @@
     &nouveau_texbuffer_extension.base,
     &nouveau_renderer_query_extension.base,
     &dri2ConfigQueryExtension.base,
+    &dri2NoErrorExtension.base,
     NULL
 };
 
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_state.c b/src/mesa/drivers/dri/nouveau/nouveau_state.c
index de36fa4..1aa26e9 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_state.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_state.c
@@ -32,6 +32,7 @@
 #include "swrast/swrast.h"
 #include "tnl/tnl.h"
 #include "util/bitscan.h"
+#include "main/framebuffer.h"
 
 static void
 nouveau_alpha_func(struct gl_context *ctx, GLenum func, GLfloat ref)
@@ -451,10 +452,14 @@
 }
 
 static void
-nouveau_update_state(struct gl_context *ctx, GLbitfield new_state)
+nouveau_update_state(struct gl_context *ctx)
 {
+	GLbitfield new_state = ctx->NewState;
 	int i;
 
+	if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
+		_mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
 	if (new_state & (_NEW_PROJECTION | _NEW_MODELVIEW))
 		context_dirty(ctx, PROJECTION);
 
@@ -493,7 +498,6 @@
 
 	_swrast_InvalidateState(ctx, new_state);
 	_tnl_InvalidateState(ctx, new_state);
-	_vbo_InvalidateState(ctx, new_state);
 
 	nouveau_state_emit(ctx);
 }
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
index 51ffd5a..fdd135c 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
@@ -59,9 +59,19 @@
 	GLboolean imm = (render->mode == IMM);
 	int i, attr;
 
-	if (ib)
-		nouveau_init_array(&render->ib, 0, 0, ib->count, ib->type,
+	if (ib) {
+		GLenum ib_type;
+
+		if (ib->index_size == 4)
+			ib_type = GL_UNSIGNED_INT;
+		else if (ib->index_size == 2)
+			ib_type = GL_UNSIGNED_SHORT;
+		else
+			ib_type = GL_UNSIGNED_BYTE;
+
+		nouveau_init_array(&render->ib, 0, 0, ib->count, ib_type,
 				   ib->obj, ib->ptr, GL_TRUE, ctx);
+	}
 
 	FOR_EACH_BOUND_ATTR(render, i, attr) {
 		const struct gl_vertex_array *array = arrays[attr];
diff --git a/src/mesa/drivers/dri/nouveau/nv04_state_raster.c b/src/mesa/drivers/dri/nouveau/nv04_state_raster.c
index fbcc840..69664fb 100644
--- a/src/mesa/drivers/dri/nouveau/nv04_state_raster.c
+++ b/src/mesa/drivers/dri/nouveau/nv04_state_raster.c
@@ -175,7 +175,7 @@
 	if (ctx->Stencil.WriteMask[0])
 		nv04->ctrl[0] |= NV04_MULTITEX_TRIANGLE_CONTROL0_STENCIL_WRITE;
 
-	if (ctx->Stencil._Enabled)
+	if (_mesa_stencil_is_enabled(ctx))
 		nv04->ctrl[1] |= NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_ENABLE;
 
 	nv04->ctrl[1] |= get_comparison_op(ctx->Stencil.Function[0]) << 4 |
diff --git a/src/mesa/drivers/dri/nouveau/nv10_state_raster.c b/src/mesa/drivers/dri/nouveau/nv10_state_raster.c
index ffde87a..d537f7b 100644
--- a/src/mesa/drivers/dri/nouveau/nv10_state_raster.c
+++ b/src/mesa/drivers/dri/nouveau/nv10_state_raster.c
@@ -145,7 +145,7 @@
 	struct nouveau_pushbuf *push = context_push(ctx);
 
 	BEGIN_NV04(push, NV10_3D(STENCIL_ENABLE), 1);
-	PUSH_DATAb(push, ctx->Stencil._Enabled);
+	PUSH_DATAb(push, _mesa_stencil_is_enabled(ctx));
 
 	BEGIN_NV04(push, NV10_3D(STENCIL_FUNC_FUNC), 3);
 	PUSH_DATA (push, nvgl_comparison_op(ctx->Stencil.Function[0]));
diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_tex.c b/src/mesa/drivers/dri/nouveau/nv20_state_tex.c
index b0a4c9f..7972069 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_state_tex.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_state_tex.c
@@ -193,9 +193,19 @@
 		| NV20_3D_TEX_FORMAT_NO_BORDER
 		| 1 << 16;
 
-	tx_wrap = nvgl_wrap_mode(sa->WrapR) << 16
-		| nvgl_wrap_mode(sa->WrapT) << 8
-		| nvgl_wrap_mode(sa->WrapS) << 0;
+	switch (t->Target) {
+	case GL_TEXTURE_1D:
+		tx_wrap = NV20_3D_TEX_WRAP_R_CLAMP_TO_EDGE
+			| NV20_3D_TEX_WRAP_T_CLAMP_TO_EDGE
+			| nvgl_wrap_mode_nv20(sa->WrapS) << 0;
+		break;
+
+	default:
+		tx_wrap = nvgl_wrap_mode_nv20(sa->WrapR) << 16
+			| nvgl_wrap_mode_nv20(sa->WrapT) << 8
+			| nvgl_wrap_mode_nv20(sa->WrapS) << 0;
+		break;
+	}
 
 	tx_filter = nvgl_filter_mode(sa->MagFilter) << 24
 		| nvgl_filter_mode(sa->MinFilter) << 16
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index aaa9b93..5a7f334 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -189,7 +189,7 @@
    int i;
    int tcl_mode;
 
-   if (flags & ~__DRI_CTX_FLAG_DEBUG) {
+   if (flags & ~(__DRI_CTX_FLAG_DEBUG | __DRI_CTX_FLAG_NO_ERROR)) {
       *error = __DRI_CTX_ERROR_UNKNOWN_FLAG;
       return false;
    }
diff --git a/src/mesa/drivers/dri/r200/r200_maos_arrays.c b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
index 9b16cf8..60f851b 100644
--- a/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+++ b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
@@ -36,6 +36,7 @@
 #include "main/mtypes.h"
 #include "main/imports.h"
 #include "main/macros.h"
+#include "main/state.h"
 
 #include "swrast_setup/swrast_setup.h"
 #include "math/m_translate.h"
@@ -114,7 +115,7 @@
 	    /* special handling to fix up fog. Will get us into trouble with vbos...*/
 	    assert(attrib == VERT_ATTRIB_FOG);
 	    if (!rmesa->radeon.tcl.aos[i].bo) {
-	       if (ctx->VertexProgram._Enabled)
+	       if (_mesa_arb_vertex_program_enabled(ctx))
 		  rcommon_emit_vector( ctx,
 				       &(rmesa->radeon.tcl.aos[nr]),
 				       (char *)VB->AttribPtr[attrib]->data,
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index 86733a8..2705d22 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -35,11 +35,11 @@
 
 #include "main/glheader.h"
 #include "main/imports.h"
-#include "main/api_arrayelt.h"
 #include "main/enums.h"
 #include "main/light.h"
 #include "main/framebuffer.h"
 #include "main/fbobject.h"
+#include "main/state.h"
 #include "main/stencil.h"
 #include "main/viewport.h"
 
@@ -2266,7 +2266,7 @@
 	_NEW_MODELVIEW|_NEW_PROJECTION|_NEW_TRANSFORM|
 	_NEW_LIGHT|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX|
 	_NEW_FOG|_NEW_POINT|_NEW_TRACK_MATRIX)) {
-      if (ctx->VertexProgram._Enabled) {
+      if (_mesa_arb_vertex_program_enabled(ctx)) {
 	 r200SetupVertexProg( ctx );
       }
       else TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, 0);
@@ -2277,15 +2277,18 @@
 }
 
 
-static void r200InvalidateState( struct gl_context *ctx, GLuint new_state )
+static void r200InvalidateState(struct gl_context *ctx)
 {
+   GLuint new_state = ctx->NewState;
+
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
+   if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    _swrast_InvalidateState( ctx, new_state );
    _swsetup_InvalidateState( ctx, new_state );
-   _vbo_InvalidateState( ctx, new_state );
    _tnl_InvalidateState( ctx, new_state );
-   _ae_invalidate_state( ctx, new_state );
    R200_CONTEXT(ctx)->radeon.NewGLState |= new_state;
 
    if (new_state & _NEW_PROGRAM)
@@ -2326,7 +2329,8 @@
       if (!r200ValidateState( ctx ))
 	 FALLBACK(rmesa, RADEON_FALLBACK_TEXTURE, GL_TRUE);
 
-   has_material = !ctx->VertexProgram._Enabled && ctx->Light.Enabled && check_material( ctx );
+   has_material = !_mesa_arb_vertex_program_enabled(ctx) &&
+                  ctx->Light.Enabled && check_material( ctx );
 
    if (has_material) {
       TCL_FALLBACK( ctx, R200_TCL_FALLBACK_MATERIAL, GL_TRUE );
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index 8e14ba7..4f584d3 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -35,6 +35,7 @@
 #include "main/imports.h"
 #include "main/enums.h"
 #include "main/api_arrayelt.h"
+#include "main/state.h"
 
 #include "swrast/swrast.h"
 #include "vbo/vbo.h"
@@ -229,7 +230,7 @@
 static int check_##NM( struct gl_context *ctx, struct radeon_state_atom *atom) \
 {									\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);				\
-   return (!rmesa->radeon.TclFallback && !ctx->VertexProgram._Enabled && (FLAG)) ? atom->cmd_size + (ADD) : 0; \
+   return (!rmesa->radeon.TclFallback && !_mesa_arb_vertex_program_enabled(ctx) && (FLAG)) ? atom->cmd_size + (ADD) : 0; \
 }
 
 #define TCL_OR_VP_CHECK( NM, FLAG, ADD )			\
@@ -244,18 +245,18 @@
 {									\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);				\
    (void) atom;								\
-   return (!rmesa->radeon.TclFallback && ctx->VertexProgram._Enabled && (FLAG)) ? atom->cmd_size + (ADD) : 0; \
+   return (!rmesa->radeon.TclFallback && _mesa_arb_vertex_program_enabled(ctx) && (FLAG)) ? atom->cmd_size + (ADD) : 0; \
 }
 
 CHECK( always, GL_TRUE, 0 )
 CHECK( always_add4, GL_TRUE, 4 )
 CHECK( never, GL_FALSE, 0 )
 CHECK( tex_any, ctx->Texture._MaxEnabledTexImageUnit != -1, 0 )
-CHECK( tf, (ctx->Texture._MaxEnabledTexImageUnit != -1 && !ctx->ATIFragmentShader._Enabled), 0 );
-CHECK( pix_zero, !ctx->ATIFragmentShader._Enabled, 0 )
-CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled), 0 )
-CHECK( afs_pass1, (ctx->ATIFragmentShader._Enabled && (ctx->ATIFragmentShader.Current->NumPasses > 1)), 0 )
-CHECK( afs, ctx->ATIFragmentShader._Enabled, 0 )
+CHECK( tf, (ctx->Texture._MaxEnabledTexImageUnit != -1 && !_mesa_ati_fragment_shader_enabled(ctx)), 0 );
+CHECK( pix_zero, !_mesa_ati_fragment_shader_enabled(ctx), 0 )
+CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !_mesa_ati_fragment_shader_enabled(ctx)), 0 )
+CHECK( afs_pass1, (_mesa_ati_fragment_shader_enabled(ctx) && (ctx->ATIFragmentShader.Current->NumPasses > 1)), 0 )
+CHECK( afs, _mesa_ati_fragment_shader_enabled(ctx), 0 )
 CHECK( tex_cube, rmesa->state.texture.unit[atom->idx].unitneeded & TEXTURE_CUBE_BIT, 3 + 3*5 - CUBE_STATE_SIZE )
 CHECK( tex_cube_cs, rmesa->state.texture.unit[atom->idx].unitneeded & TEXTURE_CUBE_BIT, 2 + 4*5 - CUBE_STATE_SIZE )
 TCL_CHECK( tcl_fog_add4, ctx->Fog.Enabled, 4 )
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c
index 6ca85f5..0fda586 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.c
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.c
@@ -38,6 +38,7 @@
 #include "main/image.h"
 #include "main/imports.h"
 #include "main/macros.h"
+#include "main/state.h"
 
 #include "swrast/s_context.h"
 #include "swrast/s_fog.h"
diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c
index 26968af..662c041 100644
--- a/src/mesa/drivers/dri/r200/r200_tcl.c
+++ b/src/mesa/drivers/dri/r200/r200_tcl.c
@@ -383,7 +383,7 @@
       if (!r200ValidateState( ctx ))
          return GL_TRUE; /* fallback to sw t&l */
 
-   if (!ctx->VertexProgram._Enabled) {
+   if (!_mesa_arb_vertex_program_enabled(ctx)) {
    /* NOTE: inputs != tnl->render_inputs - these are the untransformed
     * inputs.
     */
@@ -553,7 +553,7 @@
    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_TCL_ENABLE;
    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_FORCE_W_TO_ONE;
 
-   if (ctx->VertexProgram._Enabled) {
+   if (_mesa_arb_vertex_program_enabled(ctx)) {
       rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE;
    }
 
diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
index 441ac73..dcf211f 100644
--- a/src/mesa/drivers/dri/r200/r200_texstate.c
+++ b/src/mesa/drivers/dri/r200/r200_texstate.c
@@ -36,6 +36,7 @@
 #include "main/imports.h"
 #include "main/context.h"
 #include "main/macros.h"
+#include "main/state.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/enums.h"
@@ -1480,7 +1481,7 @@
       atoms. */
    R200_NEWPRIM( rmesa );
 
-   if (ctx->ATIFragmentShader._Enabled) {
+   if (_mesa_ati_fragment_shader_enabled(ctx)) {
       GLuint i;
       for (i = 0; i < R200_MAX_TEXTURE_UNITS; i++) {
          if (ctx->Texture.Unit[i]._Current)
@@ -1502,7 +1503,7 @@
 	 r200UpdateTextureUnit( ctx, 5 ));
    }
 
-   if (ok && ctx->ATIFragmentShader._Enabled) {
+   if (ok && _mesa_ati_fragment_shader_enabled(ctx)) {
       r200UpdateFragmentShader(ctx);
    }
 
@@ -1528,7 +1529,7 @@
 	   rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
 	 rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] |= R200_TXFORMAT_LOOKUP_DISABLE;
       }
-      else if (!ctx->ATIFragmentShader._Enabled) {
+      else if (!_mesa_ati_fragment_shader_enabled(ctx)) {
 	 if ((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & R200_TEX_1_ENABLE) &&
 	    (rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] & R200_TXFORMAT_LOOKUP_DISABLE)) {
 	    R200_STATECHANGE(rmesa, tex[1]);
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
index 100b715..bb85503 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -350,7 +350,7 @@
 		       ((t_src_class(a.File) == VSF_IN_CLASS_PARAM && \
 			 t_src_class(b.File) == VSF_IN_CLASS_PARAM) || \
 			(t_src_class(a.File) == VSF_IN_CLASS_ATTR && \
-			 t_src_class(b.File) == VSF_IN_CLASS_ATTR))) \
+			 t_src_class(b.File) == VSF_IN_CLASS_ATTR)))
 
 /* fglrx on rv250 codes up unused sources as follows:
    unused but necessary sources are same as previous source, zero-ed out.
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index ee4d5f8..6e4b4c4 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -306,7 +306,6 @@
 	if (ctx->Driver.Enable) {
 		ctx->Driver.Enable(ctx, GL_DEPTH_TEST,
 				   (ctx->Depth.Test && fb->Visual.depthBits > 0));
-		/* Need to update the derived ctx->Stencil._Enabled first */
 		ctx->Driver.Enable(ctx, GL_STENCIL_TEST,
 				   (ctx->Stencil.Enabled && fb->Visual.stencilBits > 0));
 	} else {
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index 11afe20..5ef3467 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -155,7 +155,7 @@
    int i;
    int tcl_mode, fthrottle_mode;
 
-   if (flags & ~__DRI_CTX_FLAG_DEBUG) {
+   if (flags & ~(__DRI_CTX_FLAG_DEBUG | __DRI_CTX_FLAG_NO_ERROR)) {
       *error = __DRI_CTX_ERROR_UNKNOWN_FLAG;
       return false;
    }
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index 89ea776..37c9c3f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -144,8 +144,7 @@
 #endif
 
 static void
-radeon_map_renderbuffer_s8z24(struct gl_context *ctx,
-		       struct gl_renderbuffer *rb,
+radeon_map_renderbuffer_s8z24(struct gl_renderbuffer *rb,
 		       GLuint x, GLuint y, GLuint w, GLuint h,
 		       GLbitfield mode,
 		       GLubyte **out_map,
@@ -183,8 +182,7 @@
 }
 
 static void
-radeon_map_renderbuffer_z16(struct gl_context *ctx,
-			    struct gl_renderbuffer *rb,
+radeon_map_renderbuffer_z16(struct gl_renderbuffer *rb,
 			    GLuint x, GLuint y, GLuint w, GLuint h,
 			    GLbitfield mode,
 			    GLubyte **out_map,
@@ -307,12 +305,12 @@
 
    if ((rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_DEPTH_ALWAYS_TILED) && !rrb->has_surface) {
        if (rb->Format == MESA_FORMAT_Z24_UNORM_S8_UINT || rb->Format == MESA_FORMAT_Z24_UNORM_X8_UINT) {
-	   radeon_map_renderbuffer_s8z24(ctx, rb, x, y, w, h,
+	   radeon_map_renderbuffer_s8z24(rb, x, y, w, h,
 					 mode, out_map, out_stride);
 	   return;
        }
        if (rb->Format == MESA_FORMAT_Z_UNORM16) {
-	   radeon_map_renderbuffer_z16(ctx, rb, x, y, w, h,
+	   radeon_map_renderbuffer_z16(rb, x, y, w, h,
 				       mode, out_map, out_stride);
 	   return;
        }
@@ -621,8 +619,11 @@
 
 /** Dummy function for gl_renderbuffer::AllocStorage() */
 static GLboolean
-radeon_nop_alloc_storage(struct gl_context * ctx, struct gl_renderbuffer *rb,
-			 GLenum internalFormat, GLuint width, GLuint height)
+radeon_nop_alloc_storage(struct gl_context * ctx,
+			 UNUSED struct gl_renderbuffer *rb,
+			 UNUSED GLenum internalFormat,
+			 UNUSED GLuint width,
+			 UNUSED GLuint height)
 {
    _mesa_problem(ctx, "radeon_op_alloc_storage should never be called.");
    return GL_FALSE;
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index fab2eea..0f072af 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -78,7 +78,6 @@
     DRI_CONF_SECTION_PERFORMANCE
         DRI_CONF_TCL_MODE(DRI_CONF_TCL_CODEGEN)
         DRI_CONF_FTHROTTLE_MODE(DRI_CONF_FTHROTTLE_IRQS)
-        DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
         DRI_CONF_MAX_TEXTURE_UNITS(3,2,3)
         DRI_CONF_HYPERZ("false")
         DRI_CONF_COMMAND_BUFFER_SIZE(8, 8, 32)
@@ -106,7 +105,6 @@
     DRI_CONF_SECTION_PERFORMANCE
         DRI_CONF_TCL_MODE(DRI_CONF_TCL_CODEGEN)
         DRI_CONF_FTHROTTLE_MODE(DRI_CONF_FTHROTTLE_IRQS)
-        DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
         DRI_CONF_MAX_TEXTURE_UNITS(6,2,6)
         DRI_CONF_HYPERZ("false")
         DRI_CONF_COMMAND_BUFFER_SIZE(8, 8, 32)
@@ -544,6 +542,7 @@
     &radeonFlushExtension.base,
     &radeonImageExtension.base,
     &radeonRendererQueryExtension.base,
+    &dri2NoErrorExtension.base,
     NULL
 };
 
@@ -677,13 +676,13 @@
 
     /* front color renderbuffer */
     rfb->color_rb[0] = radeon_create_renderbuffer(rgbFormat, driDrawPriv);
-    _mesa_add_renderbuffer_without_ref(&rfb->base, BUFFER_FRONT_LEFT, &rfb->color_rb[0]->base.Base);
+    _mesa_attach_and_own_rb(&rfb->base, BUFFER_FRONT_LEFT, &rfb->color_rb[0]->base.Base);
     rfb->color_rb[0]->has_surface = 1;
 
     /* back color renderbuffer */
     if (mesaVis->doubleBufferMode) {
       rfb->color_rb[1] = radeon_create_renderbuffer(rgbFormat, driDrawPriv);
-	_mesa_add_renderbuffer_without_ref(&rfb->base, BUFFER_BACK_LEFT, &rfb->color_rb[1]->base.Base);
+	_mesa_attach_and_own_rb(&rfb->base, BUFFER_BACK_LEFT, &rfb->color_rb[1]->base.Base);
 	rfb->color_rb[1]->has_surface = 1;
     }
 
@@ -691,21 +690,21 @@
       if (mesaVis->stencilBits == 8) {
 	struct radeon_renderbuffer *depthStencilRb =
            radeon_create_renderbuffer(MESA_FORMAT_Z24_UNORM_S8_UINT, driDrawPriv);
-	_mesa_add_renderbuffer_without_ref(&rfb->base, BUFFER_DEPTH, &depthStencilRb->base.Base);
-	_mesa_add_renderbuffer(&rfb->base, BUFFER_STENCIL, &depthStencilRb->base.Base);
+	_mesa_attach_and_own_rb(&rfb->base, BUFFER_DEPTH, &depthStencilRb->base.Base);
+	_mesa_attach_and_reference_rb(&rfb->base, BUFFER_STENCIL, &depthStencilRb->base.Base);
 	depthStencilRb->has_surface = screen->depthHasSurface;
       } else {
 	/* depth renderbuffer */
 	struct radeon_renderbuffer *depth =
            radeon_create_renderbuffer(MESA_FORMAT_Z24_UNORM_X8_UINT, driDrawPriv);
-	_mesa_add_renderbuffer_without_ref(&rfb->base, BUFFER_DEPTH, &depth->base.Base);
+	_mesa_attach_and_own_rb(&rfb->base, BUFFER_DEPTH, &depth->base.Base);
 	depth->has_surface = screen->depthHasSurface;
       }
     } else if (mesaVis->depthBits == 16) {
         /* just 16-bit depth buffer, no hw stencil */
 	struct radeon_renderbuffer *depth =
            radeon_create_renderbuffer(MESA_FORMAT_Z_UNORM16, driDrawPriv);
-	_mesa_add_renderbuffer_without_ref(&rfb->base, BUFFER_DEPTH, &depth->base.Base);
+	_mesa_attach_and_own_rb(&rfb->base, BUFFER_DEPTH, &depth->base.Base);
 	depth->has_surface = screen->depthHasSurface;
     }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index c6b1f38..d2ca812 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -34,7 +34,6 @@
 
 #include "main/glheader.h"
 #include "main/imports.h"
-#include "main/api_arrayelt.h"
 #include "main/enums.h"
 #include "main/light.h"
 #include "main/context.h"
@@ -2045,13 +2044,16 @@
 }
 
 
-static void radeonInvalidateState( struct gl_context *ctx, GLuint new_state )
+static void radeonInvalidateState(struct gl_context *ctx)
 {
+   GLuint new_state = ctx->NewState;
+
+   if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    _swrast_InvalidateState( ctx, new_state );
    _swsetup_InvalidateState( ctx, new_state );
-   _vbo_InvalidateState( ctx, new_state );
    _tnl_InvalidateState( ctx, new_state );
-   _ae_invalidate_state( ctx, new_state );
    R100_CONTEXT(ctx)->radeon.NewGLState |= new_state;
 }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index f2bc462..d5365cd 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -37,6 +37,7 @@
 #include "main/enums.h"
 #include "main/imports.h"
 #include "main/macros.h"
+#include "main/state.h"
 
 #include "math/m_xform.h"
 
diff --git a/src/mesa/drivers/dri/swrast/swrast.c b/src/mesa/drivers/dri/swrast/swrast.c
index f43ac60..7de90d3 100644
--- a/src/mesa/drivers/dri/swrast/swrast.c
+++ b/src/mesa/drivers/dri/swrast/swrast.c
@@ -208,6 +208,8 @@
 static const __DRIextension *dri_screen_extensions[] = {
     &swrastTexBufferExtension.base,
     &swrast_query_renderer_extension.base,
+    &dri2ConfigQueryExtension.base,
+    &dri2NoErrorExtension.base,
     NULL
 };
 
@@ -569,14 +571,12 @@
 
     /* add front renderbuffer */
     frontrb = swrast_new_renderbuffer(visual, dPriv, GL_TRUE);
-    _mesa_add_renderbuffer_without_ref(fb, BUFFER_FRONT_LEFT,
-                                       &frontrb->Base.Base);
+    _mesa_attach_and_own_rb(fb, BUFFER_FRONT_LEFT, &frontrb->Base.Base);
 
     /* add back renderbuffer */
     if (visual->doubleBufferMode) {
 	backrb = swrast_new_renderbuffer(visual, dPriv, GL_FALSE);
-        _mesa_add_renderbuffer_without_ref(fb, BUFFER_BACK_LEFT,
-                                           &backrb->Base.Base);
+        _mesa_attach_and_own_rb(fb, BUFFER_BACK_LEFT, &backrb->Base.Base);
     }
 
     /* add software renderbuffers */
@@ -699,12 +699,16 @@
 }
 
 static void
-update_state( struct gl_context *ctx, GLuint new_state )
+update_state(struct gl_context *ctx)
 {
+    GLuint new_state = ctx->NewState;
+
+    if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
     /* not much to do here - pass it on */
     _swrast_InvalidateState( ctx, new_state );
     _swsetup_InvalidateState( ctx, new_state );
-    _vbo_InvalidateState( ctx, new_state );
     _tnl_InvalidateState( ctx, new_state );
 }
 
@@ -962,7 +966,6 @@
     &driCoreExtension.base,
     &driSWRastExtension.base,
     &driCopySubBufferExtension.base,
-    &dri2ConfigQueryExtension.base,
     &swrast_vtable.base,
     NULL
 };
diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c
index 68fca3e..734a4e8 100644
--- a/src/mesa/drivers/osmesa/osmesa.c
+++ b/src/mesa/drivers/osmesa/osmesa.c
@@ -117,15 +117,22 @@
 
 
 static void
-osmesa_update_state( struct gl_context *ctx, GLuint new_state )
+osmesa_update_state(struct gl_context *ctx, GLuint new_state)
 {
+   if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    /* easy - just propogate */
    _swrast_InvalidateState( ctx, new_state );
    _swsetup_InvalidateState( ctx, new_state );
    _tnl_InvalidateState( ctx, new_state );
-   _vbo_InvalidateState( ctx, new_state );
 }
 
+static void
+osmesa_update_state_wrapper(struct gl_context *ctx)
+{
+   osmesa_update_state(ctx, ctx->NewState);
+}
 
 
 /**
@@ -828,7 +835,7 @@
       _mesa_init_driver_functions(&functions);
       /* override with our functions */
       functions.GetString = get_string;
-      functions.UpdateState = osmesa_update_state;
+      functions.UpdateState = osmesa_update_state_wrapper;
 
       if (!_mesa_initialize_context(&osmesa->mesa,
                                     api_profile,
@@ -1020,15 +1027,15 @@
     * There is no back color buffer.
     * If the user tries to use a 8, 16 or 32-bit/channel buffer that
     * doesn't match what Mesa was compiled for (CHAN_BITS) the
-    * _mesa_add_renderbuffer() function will create a "wrapper" renderbuffer
-    * that converts rendering from CHAN_BITS to the user-requested channel
-    * size.
+    * _mesa_attach_and_reference_rb() function will create a "wrapper"
+    * renderbuffer that converts rendering from CHAN_BITS to the
+    * user-requested channel size.
     */
    if (!osmesa->srb) {
       osmesa->srb = new_osmesa_renderbuffer(&osmesa->mesa, osmesa->format, type);
       _mesa_remove_renderbuffer(osmesa->gl_buffer, BUFFER_FRONT_LEFT);
-      _mesa_add_renderbuffer(osmesa->gl_buffer, BUFFER_FRONT_LEFT,
-                             &osmesa->srb->Base);
+      _mesa_attach_and_reference_rb(osmesa->gl_buffer, BUFFER_FRONT_LEFT,
+                                    &osmesa->srb->Base);
       assert(osmesa->srb->Base.RefCount == 2);
    }
 
@@ -1051,8 +1058,8 @@
     * renderbuffer adaptor/wrapper if needed (for bpp conversion).
     */
    _mesa_remove_renderbuffer(osmesa->gl_buffer, BUFFER_FRONT_LEFT);
-   _mesa_add_renderbuffer(osmesa->gl_buffer, BUFFER_FRONT_LEFT,
-                          &osmesa->srb->Base);
+   _mesa_attach_and_reference_rb(osmesa->gl_buffer, BUFFER_FRONT_LEFT,
+                                 &osmesa->srb->Base);
 
 
    /* this updates the visual's red/green/blue/alphaBits fields */
diff --git a/src/mesa/drivers/x11/Makefile.am b/src/mesa/drivers/x11/Makefile.am
index d5ca69d..6e123ba 100644
--- a/src/mesa/drivers/x11/Makefile.am
+++ b/src/mesa/drivers/x11/Makefile.am
@@ -26,7 +26,6 @@
 EXTRA_DIST = SConscript
 
 if HAVE_SHARED_GLAPI
-SHARED_GLAPI_CFLAGS = -DGLX_SHARED_GLAPI
 SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
 endif
 
@@ -39,7 +38,6 @@
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/mesa/main \
 	$(X11_INCLUDES) \
-	$(SHARED_GLAPI_CFLAGS) \
 	$(DEFINES)
 
 lib_LTLIBRARIES = lib@GL_LIB@.la
diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c
index 155d273..a0695c3 100644
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -311,8 +311,8 @@
    b->frontxrb->Parent = b;
    b->frontxrb->drawable = d;
    b->frontxrb->pixmap = (XMesaPixmap) d;
-   _mesa_add_renderbuffer_without_ref(&b->mesa_buffer, BUFFER_FRONT_LEFT,
-                                      &b->frontxrb->Base.Base);
+   _mesa_attach_and_own_rb(&b->mesa_buffer, BUFFER_FRONT_LEFT,
+                           &b->frontxrb->Base.Base);
 
    /*
     * Back renderbuffer
@@ -328,8 +328,8 @@
       /* determine back buffer implementation */
       b->db_mode = vis->ximage_flag ? BACK_XIMAGE : BACK_PIXMAP;
       
-      _mesa_add_renderbuffer_without_ref(&b->mesa_buffer, BUFFER_BACK_LEFT,
-                                         &b->backxrb->Base.Base);
+      _mesa_attach_and_own_rb(&b->mesa_buffer, BUFFER_BACK_LEFT,
+                              &b->backxrb->Base.Base);
    }
 
    /*
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index cd5809e..3e61342 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -33,6 +33,7 @@
 #include "main/context.h"
 #include "main/colormac.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "main/macros.h"
 #include "main/mipmap.h"
 #include "main/image.h"
@@ -678,17 +679,20 @@
  * Called when the driver should update its state, based on the new_state
  * flags.
  */
-void
-xmesa_update_state( struct gl_context *ctx, GLbitfield new_state )
+static void
+xmesa_update_state(struct gl_context *ctx)
 {
+   GLbitfield new_state = ctx->NewState;
    const XMesaContext xmesa = XMESA_CONTEXT(ctx);
 
+   if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    /* Propagate statechange information to swrast and swrast_setup
     * modules.  The X11 driver has no internal GL-dependent state.
     */
    _swrast_InvalidateState( ctx, new_state );
    _tnl_InvalidateState( ctx, new_state );
-   _vbo_InvalidateState( ctx, new_state );
    _swsetup_InvalidateState( ctx, new_state );
 
    if (_mesa_is_user_fbo(ctx->DrawBuffer))
diff --git a/src/mesa/drivers/x11/xmesaP.h b/src/mesa/drivers/x11/xmesaP.h
index 6cd020f..ff3ddc4 100644
--- a/src/mesa/drivers/x11/xmesaP.h
+++ b/src/mesa/drivers/x11/xmesaP.h
@@ -249,7 +249,7 @@
 #define PACK_TRUECOLOR( PIXEL, R, G, B )	\
    PIXEL = xmesa->xm_visual->RtoPixel[R]	\
          | xmesa->xm_visual->GtoPixel[G]	\
-         | xmesa->xm_visual->BtoPixel[B];	\
+         | xmesa->xm_visual->BtoPixel[B];
 
 
 /**
@@ -354,10 +354,6 @@
                              struct dd_function_table *driver );
 
 extern void
-xmesa_update_state( struct gl_context *ctx, GLbitfield new_state );
-
-
-extern void
 xmesa_MapRenderbuffer(struct gl_context *ctx,
                       struct gl_renderbuffer *rb,
                       GLuint x, GLuint y, GLuint w, GLuint h,
diff --git a/src/mesa/main/.gitignore b/src/mesa/main/.gitignore
index 836d8f1..8cc33cf 100644
--- a/src/mesa/main/.gitignore
+++ b/src/mesa/main/.gitignore
@@ -4,6 +4,7 @@
 remap_helper.h
 get_hash.h
 get_hash.h.tmp
+format_fallback.c
 format_info.h
 format_info.c
 format_pack.c
diff --git a/src/mesa/main/accum.c b/src/mesa/main/accum.c
index ef74468..2b15b6e 100644
--- a/src/mesa/main/accum.c
+++ b/src/mesa/main/accum.c
@@ -28,6 +28,7 @@
 #include "context.h"
 #include "format_unpack.h"
 #include "format_pack.h"
+#include "framebuffer.h"
 #include "imports.h"
 #include "macros.h"
 #include "state.h"
@@ -53,57 +54,6 @@
 }
 
 
-void GLAPIENTRY
-_mesa_Accum( GLenum op, GLfloat value )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
-
-   switch (op) {
-   case GL_ADD:
-   case GL_MULT:
-   case GL_ACCUM:
-   case GL_LOAD:
-   case GL_RETURN:
-      /* OK */
-      break;
-   default:
-      _mesa_error(ctx, GL_INVALID_ENUM, "glAccum(op)");
-      return;
-   }
-
-   if (ctx->DrawBuffer->Visual.haveAccumBuffer == 0) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glAccum(no accum buffer)");
-      return;
-   }
-
-   if (ctx->DrawBuffer != ctx->ReadBuffer) {
-      /* See GLX_SGI_make_current_read or WGL_ARB_make_current_read,
-       * or GL_EXT_framebuffer_blit.
-       */
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glAccum(different read/draw buffers)");
-      return;
-   }
-
-   if (ctx->NewState)
-      _mesa_update_state(ctx);
-
-   if (ctx->DrawBuffer->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
-      _mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
-                  "glAccum(incomplete framebuffer)");
-      return;
-   }
-
-   if (ctx->RasterDiscard)
-      return;
-
-   if (ctx->RenderMode == GL_RENDER) {
-      _mesa_accum(ctx, op, value);
-   }
-}
-
-
 /**
  * Clear the accumulation buffer by mapping the renderbuffer and
  * writing the clear color to it.  Called by the driver's implementation
@@ -124,6 +74,8 @@
    if (!accRb)
       return;   /* missing accum buffer, not an error */
 
+   _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    /* bounds, with scissor */
    x = ctx->DrawBuffer->_Xmin;
    y = ctx->DrawBuffer->_Ymin;
@@ -436,8 +388,8 @@
  * signed 16-bit color channels could implement hardware accumulation
  * operations, but no driver does so at this time.
  */
-void
-_mesa_accum(struct gl_context *ctx, GLenum op, GLfloat value)
+static void
+accum(struct gl_context *ctx, GLenum op, GLfloat value)
 {
    GLint xpos, ypos, width, height;
 
@@ -449,6 +401,8 @@
    if (!_mesa_check_conditional_render(ctx))
       return;
 
+   _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    xpos = ctx->DrawBuffer->_Xmin;
    ypos = ctx->DrawBuffer->_Ymin;
    width =  ctx->DrawBuffer->_Xmax - ctx->DrawBuffer->_Xmin;
@@ -477,7 +431,7 @@
       accum_return(ctx, value, xpos, ypos, width, height);
       break;
    default:
-      _mesa_problem(ctx, "invalid mode in _mesa_accum()");
+      unreachable("invalid mode in _mesa_Accum()");
       break;
    }
 }
@@ -489,3 +443,54 @@
    /* Accumulate buffer group */
    ASSIGN_4V( ctx->Accum.ClearColor, 0.0, 0.0, 0.0, 0.0 );
 }
+
+
+void GLAPIENTRY
+_mesa_Accum( GLenum op, GLfloat value )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   FLUSH_VERTICES(ctx, 0);
+
+   switch (op) {
+   case GL_ADD:
+   case GL_MULT:
+   case GL_ACCUM:
+   case GL_LOAD:
+   case GL_RETURN:
+      /* OK */
+      break;
+   default:
+      _mesa_error(ctx, GL_INVALID_ENUM, "glAccum(op)");
+      return;
+   }
+
+   if (ctx->DrawBuffer->Visual.haveAccumBuffer == 0) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "glAccum(no accum buffer)");
+      return;
+   }
+
+   if (ctx->DrawBuffer != ctx->ReadBuffer) {
+      /* See GLX_SGI_make_current_read or WGL_ARB_make_current_read,
+       * or GL_EXT_framebuffer_blit.
+       */
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glAccum(different read/draw buffers)");
+      return;
+   }
+
+   if (ctx->NewState)
+      _mesa_update_state(ctx);
+
+   if (ctx->DrawBuffer->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+      _mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
+                  "glAccum(incomplete framebuffer)");
+      return;
+   }
+
+   if (ctx->RasterDiscard)
+      return;
+
+   if (ctx->RenderMode == GL_RENDER) {
+      accum(ctx, op, value);
+   }
+}
diff --git a/src/mesa/main/accum.h b/src/mesa/main/accum.h
index a5665c7..fe253a2 100644
--- a/src/mesa/main/accum.h
+++ b/src/mesa/main/accum.h
@@ -39,9 +39,7 @@
 
 #include "main/glheader.h"
 
-struct _glapi_table;
 struct gl_context;
-struct gl_renderbuffer;
 
 extern void GLAPIENTRY
 _mesa_ClearAccum( GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha );
@@ -49,9 +47,6 @@
 _mesa_Accum( GLenum op, GLfloat value );
 
 extern void
-_mesa_accum(struct gl_context *ctx, GLenum op, GLfloat value);
-
-extern void
 _mesa_clear_accum_buffer(struct gl_context *ctx);
 
 extern void
diff --git a/src/mesa/main/api_arrayelt.c b/src/mesa/main/api_arrayelt.c
index 46175e4..2dfa74f 100644
--- a/src/mesa/main/api_arrayelt.c
+++ b/src/mesa/main/api_arrayelt.c
@@ -65,12 +65,13 @@
 typedef struct {
    AEarray arrays[32];
    AEattrib attribs[VERT_ATTRIB_MAX + 1];
-   GLbitfield NewState;
 
    /* List of VBOs we need to map before executing ArrayElements */
    struct gl_buffer_object *vbo[VERT_ATTRIB_MAX];
    GLuint nr_vbos;
    GLboolean mapped_vbos;  /**< Any currently mapped VBOs? */
+
+   bool dirty_state;
 } AEcontext;
 
 
@@ -94,6 +95,13 @@
 }
 
 
+bool
+_ae_is_state_dirty(struct gl_context *ctx)
+{
+   return AE_CONTEXT(ctx)->dirty_state;
+}
+
+
 #define NUM_TYPES 8
 
 
@@ -1511,7 +1519,7 @@
    if (!ctx->aelt_context)
       return GL_FALSE;
 
-   AE_CONTEXT(ctx)->NewState = ~0;
+   AE_CONTEXT(ctx)->dirty_state = true;
    return GL_TRUE;
 }
 
@@ -1690,7 +1698,7 @@
    at->func = NULL;  /* terminate the list */
    aa->offset = -1;  /* terminate the list */
 
-   actx->NewState = 0;
+   actx->dirty_state = false;
 }
 
 
@@ -1707,7 +1715,7 @@
    if (actx->mapped_vbos)
       return;
 
-   if (actx->NewState)
+   if (actx->dirty_state)
       _ae_update_state(ctx);
 
    for (i = 0; i < actx->nr_vbos; i++)
@@ -1734,7 +1742,7 @@
    if (!actx->mapped_vbos)
       return;
 
-   assert (!actx->NewState);
+   assert (!actx->dirty_state);
 
    for (i = 0; i < actx->nr_vbos; i++)
       ctx->Driver.UnmapBuffer(ctx, actx->vbo[i], MAP_INTERNAL);
@@ -1767,7 +1775,7 @@
       return;
    }
 
-   if (actx->NewState) {
+   if (actx->dirty_state) {
       assert(!actx->mapped_vbos);
       _ae_update_state(ctx);
    }
@@ -1802,7 +1810,7 @@
 
 
 void
-_ae_invalidate_state(struct gl_context *ctx, GLbitfield new_state)
+_ae_invalidate_state(struct gl_context *ctx)
 {
    AEcontext *actx = AE_CONTEXT(ctx);
 
@@ -1815,11 +1823,10 @@
     * Luckily, neither the drivers nor tnl muck with the state that
     * concerns us here:
     */
-   new_state &= _NEW_ARRAY | _NEW_PROGRAM;
-   if (new_state) {
-      assert(!actx->mapped_vbos);
-      actx->NewState |= new_state;
-   }
+   assert(ctx->NewState & (_NEW_ARRAY | _NEW_PROGRAM));
+
+   assert(!actx->mapped_vbos);
+   actx->dirty_state = true;
 }
 
 
diff --git a/src/mesa/main/api_arrayelt.h b/src/mesa/main/api_arrayelt.h
index 03cd9ec..965e0ad 100644
--- a/src/mesa/main/api_arrayelt.h
+++ b/src/mesa/main/api_arrayelt.h
@@ -33,7 +33,8 @@
 
 extern GLboolean _ae_create_context( struct gl_context *ctx );
 extern void _ae_destroy_context( struct gl_context *ctx );
-extern void _ae_invalidate_state( struct gl_context *ctx, GLbitfield new_state );
+extern void _ae_invalidate_state(struct gl_context *ctx);
+extern bool _ae_is_state_dirty(struct gl_context *ctx);
 extern void GLAPIENTRY _ae_ArrayElement( GLint elt );
 
 /* May optionally be called before a batch of element calls:
diff --git a/src/mesa/main/api_loopback.c b/src/mesa/main/api_loopback.c
index 59b59d3..b552d17 100644
--- a/src/mesa/main/api_loopback.c
+++ b/src/mesa/main/api_loopback.c
@@ -89,6 +89,8 @@
 #define ATTRIB3_D(index,x,y,z)     CALL_VertexAttribL3d(GET_DISPATCH(), (index,x,y,z))
 #define ATTRIB4_D(index,x,y,z,w)    CALL_VertexAttribL4d(GET_DISPATCH(), (index,x,y,z,w))
 
+#define ATTRIB1_UI64(index, x)     CALL_VertexAttribL1ui64ARB(GET_DISPATCH(), (index, x))
+
 void GLAPIENTRY
 _mesa_Color3b( GLbyte red, GLbyte green, GLbyte blue )
 {
@@ -1529,6 +1531,18 @@
 }
 
 void GLAPIENTRY
+_mesa_VertexAttribL1ui64ARB(GLuint index, GLuint64EXT x)
+{
+   ATTRIB1_UI64(index, x);
+}
+
+void GLAPIENTRY
+_mesa_VertexAttribL1ui64vARB(GLuint index, const GLuint64EXT *v)
+{
+   ATTRIB1_UI64(index, v[0]);
+}
+
+void GLAPIENTRY
 _mesa_VertexAttribL2dv(GLuint index, const GLdouble *v)
 {
    ATTRIB2_D(index, v[0], v[1]);
@@ -1789,5 +1803,9 @@
       SET_VertexAttribL2dv(dest, _mesa_VertexAttribL2dv);
       SET_VertexAttribL3dv(dest, _mesa_VertexAttribL3dv);
       SET_VertexAttribL4dv(dest, _mesa_VertexAttribL4dv);
+
+      /* GL_ARB_bindless_texture */
+      SET_VertexAttribL1ui64ARB(dest, _mesa_VertexAttribL1ui64ARB);
+      SET_VertexAttribL1ui64vARB(dest, _mesa_VertexAttribL1ui64vARB);
    }
 }
diff --git a/src/mesa/main/api_loopback.h b/src/mesa/main/api_loopback.h
index 026bfd6..c1e7b24 100644
--- a/src/mesa/main/api_loopback.h
+++ b/src/mesa/main/api_loopback.h
@@ -481,4 +481,10 @@
 _mesa_VertexAttribL3dv(GLuint index, const GLdouble *v);
 void GLAPIENTRY
 _mesa_VertexAttribL4dv(GLuint index, const GLdouble *v);
+
+void GLAPIENTRY
+_mesa_VertexAttribL1ui64ARB(GLuint index, GLuint64EXT x);
+void GLAPIENTRY
+_mesa_VertexAttribL1ui64vARB(GLuint index, const GLuint64EXT *v);
+
 #endif /* API_LOOPBACK_H */
diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c
index e23be60..6ccb9e7 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/api_validate.c
@@ -99,7 +99,7 @@
        *     the blend equation or "blend_support_all_equations", the error
        *     INVALID_OPERATION is generated [...]"
        */
-      const struct gl_program *prog = ctx->_Shader->_CurrentFragmentProgram;
+      const struct gl_program *prog = ctx->FragmentProgram._Current;
       const GLbitfield blend_support = !prog ? 0 : prog->sh.fs.BlendSupport;
 
       if ((blend_support & ctx->Color._AdvancedBlendMode) == 0) {
@@ -133,15 +133,17 @@
       /* Any shader stages that are not supplied by the GLSL shader and have
        * assembly shaders enabled must now be validated.
        */
-      if (!ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX]
-          && ctx->VertexProgram.Enabled && !ctx->VertexProgram._Enabled) {
+      if (!ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX] &&
+          ctx->VertexProgram.Enabled &&
+          !_mesa_arb_vertex_program_enabled(ctx)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "%s(vertex program not valid)", where);
          return GL_FALSE;
       }
 
       if (!ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT]) {
-         if (ctx->FragmentProgram.Enabled && !ctx->FragmentProgram._Enabled) {
+         if (ctx->FragmentProgram.Enabled &&
+             !_mesa_arb_fragment_program_enabled(ctx)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(fragment program not valid)", where);
             return GL_FALSE;
@@ -243,7 +245,20 @@
       return false;
    }
 
-   if (!_mesa_all_buffers_are_unmapped(ctx->Array.VAO)) {
+   /* Section 6.3.2 from the GL 4.5:
+    * "Any GL command which attempts to read from, write to, or change
+    *  the state of a buffer object may generate an INVALID_OPERATION error if
+    *  all or part of the buffer object is mapped ... However, only commands
+    *  which explicitly describe this error are required to do so. If an error
+    *  is not generated, such commands will have undefined results and may
+    *  result in GL interruption or termination."
+    *
+    * Only some buffer API functions require INVALID_OPERATION with mapped
+    * buffers. No other functions list such an error, thus it's not required
+    * to report INVALID_OPERATION for draw calls with mapped buffers.
+    */
+   if (!ctx->Const.AllowMappedBuffersDuringExecution &&
+       !_mesa_all_buffers_are_unmapped(ctx->Array.VAO)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(vertex buffers are mapped)", function);
       return false;
@@ -290,15 +305,6 @@
                      "%s(tess ctrl shader is missing)", function);
          return false;
       }
-
-      /* For ES2, we can draw if we have a vertex program/shader). */
-      return ctx->VertexProgram._Current != NULL;
-
-   case API_OPENGLES:
-      /* For OpenGL ES, only draw if we have vertex positions
-       */
-      if (!ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_POS].Enabled)
-         return false;
       break;
 
    case API_OPENGL_CORE:
@@ -312,32 +318,10 @@
          _mesa_error(ctx, GL_INVALID_OPERATION, "%s(no VAO bound)", function);
          return false;
       }
+      break;
 
-      /* Section 7.3 (Program Objects) of the OpenGL 4.5 Core Profile spec
-       * says:
-       *
-       *     "If there is no active program for the vertex or fragment shader
-       *     stages, the results of vertex and/or fragment processing will be
-       *     undefined. However, this is not an error."
-       *
-       * The fragment shader is not tested here because other state (e.g.,
-       * GL_RASTERIZER_DISCARD) affects whether or not we actually care.
-       */
-      return ctx->VertexProgram._Current != NULL;
-
+   case API_OPENGLES:
    case API_OPENGL_COMPAT:
-      if (ctx->VertexProgram._Current != NULL) {
-         /* Draw regardless of whether or not we have any vertex arrays.
-          * (Ex: could draw a point using a constant vertex pos)
-          */
-         return true;
-      } else {
-         /* Draw if we have vertex positions (GL_VERTEX_ARRAY or generic
-          * array [0]).
-          */
-         return (ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_POS].Enabled ||
-                 ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_GENERIC0].Enabled);
-      }
       break;
 
    default:
@@ -355,7 +339,7 @@
  * Note: This may be called during display list compilation.
  */
 bool
-_mesa_is_valid_prim_mode(struct gl_context *ctx, GLenum mode)
+_mesa_is_valid_prim_mode(const struct gl_context *ctx, GLenum mode)
 {
    /* The overwhelmingly common case is (mode <= GL_TRIANGLE_FAN).  Test that
     * first and exit.  You would think that a switch-statement would be the
@@ -700,14 +684,6 @@
    if (!check_valid_to_render(ctx, caller))
       return false;
 
-   /* Not using a VBO for indices, so avoid NULL pointer derefs later.
-    */
-   if (!_mesa_is_bufferobj(ctx->Array.VAO->IndexBufferObj) && indices == NULL)
-      return false;
-
-   if (count == 0)
-      return false;
-
    return true;
 }
 
@@ -1402,237 +1378,3 @@
    return valid_draw_indirect_parameters(
          ctx, "glMultiDrawElementsIndirectCountARB", drawcount);
 }
-
-static bool
-check_valid_to_compute(struct gl_context *ctx, const char *function)
-{
-   if (!_mesa_has_compute_shaders(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "unsupported function (%s) called",
-                  function);
-      return false;
-   }
-
-   /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
-    *
-    * "An INVALID_OPERATION error is generated if there is no active program
-    *  for the compute shader stage."
-    */
-   if (ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE] == NULL) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "%s(no active compute shader)",
-                  function);
-      return false;
-   }
-
-   return true;
-}
-
-GLboolean
-_mesa_validate_DispatchCompute(struct gl_context *ctx,
-                               const GLuint *num_groups)
-{
-   int i;
-   FLUSH_CURRENT(ctx, 0);
-
-   if (!check_valid_to_compute(ctx, "glDispatchCompute"))
-      return GL_FALSE;
-
-   for (i = 0; i < 3; i++) {
-      /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
-       *
-       * "An INVALID_VALUE error is generated if any of num_groups_x,
-       *  num_groups_y and num_groups_z are greater than or equal to the
-       *  maximum work group count for the corresponding dimension."
-       *
-       * However, the "or equal to" portions appears to be a specification
-       * bug. In all other areas, the specification appears to indicate that
-       * the number of workgroups can match the MAX_COMPUTE_WORK_GROUP_COUNT
-       * value. For example, under DispatchComputeIndirect:
-       *
-       * "If any of num_groups_x, num_groups_y or num_groups_z is greater than
-       *  the value of MAX_COMPUTE_WORK_GROUP_COUNT for the corresponding
-       *  dimension then the results are undefined."
-       *
-       * Additionally, the OpenGLES 3.1 specification does not contain "or
-       * equal to" as an error condition.
-       */
-      if (num_groups[i] > ctx->Const.MaxComputeWorkGroupCount[i]) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glDispatchCompute(num_groups_%c)", 'x' + i);
-         return GL_FALSE;
-      }
-   }
-
-   /* The ARB_compute_variable_group_size spec says:
-    *
-    * "An INVALID_OPERATION error is generated by DispatchCompute if the active
-    *  program for the compute shader stage has a variable work group size."
-    */
-   struct gl_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
-   if (prog->info.cs.local_size_variable) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glDispatchCompute(variable work group size forbidden)");
-      return GL_FALSE;
-   }
-
-   return GL_TRUE;
-}
-
-GLboolean
-_mesa_validate_DispatchComputeGroupSizeARB(struct gl_context *ctx,
-                                           const GLuint *num_groups,
-                                           const GLuint *group_size)
-{
-   GLuint total_invocations = 1;
-   int i;
-
-   FLUSH_CURRENT(ctx, 0);
-
-   if (!check_valid_to_compute(ctx, "glDispatchComputeGroupSizeARB"))
-      return GL_FALSE;
-
-   /* The ARB_compute_variable_group_size spec says:
-    *
-    * "An INVALID_OPERATION error is generated by
-    *  DispatchComputeGroupSizeARB if the active program for the compute
-    *  shader stage has a fixed work group size."
-    */
-   struct gl_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
-   if (!prog->info.cs.local_size_variable) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glDispatchComputeGroupSizeARB(fixed work group size "
-                  "forbidden)");
-      return GL_FALSE;
-   }
-
-   for (i = 0; i < 3; i++) {
-      /* The ARB_compute_variable_group_size spec says:
-       *
-       * "An INVALID_VALUE error is generated if any of num_groups_x,
-       *  num_groups_y and num_groups_z are greater than or equal to the
-       *  maximum work group count for the corresponding dimension."
-       */
-      if (num_groups[i] > ctx->Const.MaxComputeWorkGroupCount[i]) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glDispatchComputeGroupSizeARB(num_groups_%c)", 'x' + i);
-         return GL_FALSE;
-      }
-
-      /* The ARB_compute_variable_group_size spec says:
-       *
-       * "An INVALID_VALUE error is generated by DispatchComputeGroupSizeARB if
-       *  any of <group_size_x>, <group_size_y>, or <group_size_z> is less than
-       *  or equal to zero or greater than the maximum local work group size
-       *  for compute shaders with variable group size
-       *  (MAX_COMPUTE_VARIABLE_GROUP_SIZE_ARB) in the corresponding
-       *  dimension."
-       *
-       * However, the "less than" is a spec bug because they are declared as
-       * unsigned integers.
-       */
-      if (group_size[i] == 0 ||
-          group_size[i] > ctx->Const.MaxComputeVariableGroupSize[i]) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glDispatchComputeGroupSizeARB(group_size_%c)", 'x' + i);
-         return GL_FALSE;
-      }
-
-      total_invocations *= group_size[i];
-   }
-
-   /* The ARB_compute_variable_group_size spec says:
-    *
-    * "An INVALID_VALUE error is generated by DispatchComputeGroupSizeARB if
-    *  the product of <group_size_x>, <group_size_y>, and <group_size_z> exceeds
-    *  the implementation-dependent maximum local work group invocation count
-    *  for compute shaders with variable group size
-    *  (MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB)."
-    */
-   if (total_invocations > ctx->Const.MaxComputeVariableGroupInvocations) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glDispatchComputeGroupSizeARB(product of local_sizes "
-                  "exceeds MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB "
-                  "(%d > %d))", total_invocations,
-                  ctx->Const.MaxComputeVariableGroupInvocations);
-      return GL_FALSE;
-   }
-
-   return GL_TRUE;
-}
-
-static GLboolean
-valid_dispatch_indirect(struct gl_context *ctx,
-                        GLintptr indirect,
-                        GLsizei size, const char *name)
-{
-   const uint64_t end = (uint64_t) indirect + size;
-
-   if (!check_valid_to_compute(ctx, name))
-      return GL_FALSE;
-
-   /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
-    *
-    * "An INVALID_VALUE error is generated if indirect is negative or is not a
-    *  multiple of four."
-    */
-   if (indirect & (sizeof(GLuint) - 1)) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(indirect is not aligned)", name);
-      return GL_FALSE;
-   }
-
-   if (indirect < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(indirect is less than zero)", name);
-      return GL_FALSE;
-   }
-
-   /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
-    *
-    * "An INVALID_OPERATION error is generated if no buffer is bound to the
-    *  DRAW_INDIRECT_BUFFER binding, or if the command would source data
-    *  beyond the end of the buffer object."
-    */
-   if (!_mesa_is_bufferobj(ctx->DispatchIndirectBuffer)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "%s: no buffer bound to DISPATCH_INDIRECT_BUFFER", name);
-      return GL_FALSE;
-   }
-
-   if (_mesa_check_disallowed_mapping(ctx->DispatchIndirectBuffer)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "%s(DISPATCH_INDIRECT_BUFFER is mapped)", name);
-      return GL_FALSE;
-   }
-
-   if (ctx->DispatchIndirectBuffer->Size < end) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "%s(DISPATCH_INDIRECT_BUFFER too small)", name);
-      return GL_FALSE;
-   }
-
-   /* The ARB_compute_variable_group_size spec says:
-    *
-    * "An INVALID_OPERATION error is generated if the active program for the
-    *  compute shader stage has a variable work group size."
-    */
-   struct gl_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
-   if (prog->info.cs.local_size_variable) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "%s(variable work group size forbidden)", name);
-      return GL_FALSE;
-   }
-
-   return GL_TRUE;
-}
-
-GLboolean
-_mesa_validate_DispatchComputeIndirect(struct gl_context *ctx,
-                                       GLintptr indirect)
-{
-   FLUSH_CURRENT(ctx, 0);
-
-   return valid_dispatch_indirect(ctx, indirect, 3 * sizeof(GLuint),
-                                  "glDispatchComputeIndirect");
-}
diff --git a/src/mesa/main/api_validate.h b/src/mesa/main/api_validate.h
index 93ec93d..7a18115 100644
--- a/src/mesa/main/api_validate.h
+++ b/src/mesa/main/api_validate.h
@@ -39,7 +39,7 @@
 _mesa_valid_to_render(struct gl_context *ctx, const char *where);
 
 extern bool
-_mesa_is_valid_prim_mode(struct gl_context *ctx, GLenum mode);
+_mesa_is_valid_prim_mode(const struct gl_context *ctx, GLenum mode);
 
 extern GLboolean
 _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name);
@@ -128,17 +128,4 @@
                                               GLsizei maxdrawcount,
                                               GLsizei stride);
 
-extern GLboolean
-_mesa_validate_DispatchCompute(struct gl_context *ctx,
-                               const GLuint *num_groups);
-
-extern GLboolean
-_mesa_validate_DispatchComputeIndirect(struct gl_context *ctx,
-                                       GLintptr indirect);
-
-extern GLboolean
-_mesa_validate_DispatchComputeGroupSizeARB(struct gl_context *ctx,
-                                           const GLuint *num_groups,
-                                           const GLuint *group_size);
-
 #endif
diff --git a/src/mesa/main/arbprogram.c b/src/mesa/main/arbprogram.c
index f3a0a54c..625dc66 100644
--- a/src/mesa/main/arbprogram.c
+++ b/src/mesa/main/arbprogram.c
@@ -41,6 +41,23 @@
 #include "program/program.h"
 #include "program/prog_print.h"
 
+static void
+flush_vertices_for_program_constants(struct gl_context *ctx, GLenum target)
+{
+   uint64_t new_driver_state;
+
+   if (target == GL_FRAGMENT_PROGRAM_ARB) {
+      new_driver_state =
+         ctx->DriverFlags.NewShaderConstants[MESA_SHADER_FRAGMENT];
+   } else {
+      new_driver_state =
+         ctx->DriverFlags.NewShaderConstants[MESA_SHADER_VERTEX];
+   }
+
+   FLUSH_VERTICES(ctx, new_driver_state ? 0 : _NEW_PROGRAM_CONSTANTS);
+   ctx->NewDriverState |= new_driver_state;
+}
+
 /**
  * Bind a program (make it current)
  * \note Called from the GL API dispatcher by both glBindProgramNV
@@ -105,7 +122,8 @@
    }
 
    /* signal new program (and its new constants) */
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   flush_vertices_for_program_constants(ctx, target);
 
    /* bind newProg */
    if (target == GL_VERTEX_PROGRAM_ARB) {
@@ -434,7 +452,7 @@
 
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
+   flush_vertices_for_program_constants(ctx, target);
 
    if (get_env_param_pointer(ctx, "glProgramEnvParameter",
 			     target, index, &param)) {
@@ -456,7 +474,7 @@
 
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
+   flush_vertices_for_program_constants(ctx, target);
 
    if (get_env_param_pointer(ctx, "glProgramEnvParameter4fv",
 			      target, index, &param)) {
@@ -472,7 +490,7 @@
    GET_CURRENT_CONTEXT(ctx);
    GLfloat * dest;
 
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
+   flush_vertices_for_program_constants(ctx, target);
 
    if (count <= 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glProgramEnvParameters4fv(count)");
@@ -539,7 +557,7 @@
    GET_CURRENT_CONTEXT(ctx);
    GLfloat *param;
 
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
+   flush_vertices_for_program_constants(ctx, target);
 
    if (get_local_param_pointer(ctx, "glProgramLocalParameterARB",
 			       target, index, &param)) {
@@ -565,7 +583,7 @@
    GET_CURRENT_CONTEXT(ctx);
    GLfloat *dest;
 
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
+   flush_vertices_for_program_constants(ctx, target);
 
    if (count <= 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glProgramLocalParameters4fv(count)");
diff --git a/src/mesa/main/arrayobj.c b/src/mesa/main/arrayobj.c
index ab1b834..6e23115 100644
--- a/src/mesa/main/arrayobj.c
+++ b/src/mesa/main/arrayobj.c
@@ -28,9 +28,8 @@
 /**
  * \file arrayobj.c
  *
- * Implementation of Vertex Array Objects (VAOs), from OpenGL 3.1+,
- * the GL_ARB_vertex_array_object extension, or the older
- * GL_APPLE_vertex_array_object extension.
+ * Implementation of Vertex Array Objects (VAOs), from OpenGL 3.1+ /
+ * the GL_ARB_vertex_array_object extension.
  *
  * \todo
  * The code in this file borrows a lot from bufferobj.c.  There's a certain
@@ -71,7 +70,7 @@
       return NULL;
    else
       return (struct gl_vertex_array_object *)
-         _mesa_HashLookup(ctx->Array.Objects, id);
+         _mesa_HashLookupLocked(ctx->Array.Objects, id);
 }
 
 
@@ -108,7 +107,7 @@
          vao = ctx->Array.LastLookedUpVAO;
       } else {
          vao = (struct gl_vertex_array_object *)
-            _mesa_HashLookup(ctx->Array.Objects, id);
+            _mesa_HashLookupLocked(ctx->Array.Objects, id);
 
          /* The ARB_direct_state_access specification says:
           *
@@ -169,7 +168,6 @@
 {
    unbind_array_object_vbos(ctx, obj);
    _mesa_reference_buffer_object(ctx, &obj->IndexBufferObj, NULL);
-   mtx_destroy(&obj->Mutex);
    free(obj->Label);
    free(obj);
 }
@@ -189,16 +187,12 @@
 
    if (*ptr) {
       /* Unreference the old array object */
-      GLboolean deleteFlag = GL_FALSE;
       struct gl_vertex_array_object *oldObj = *ptr;
 
-      mtx_lock(&oldObj->Mutex);
       assert(oldObj->RefCount > 0);
       oldObj->RefCount--;
-      deleteFlag = (oldObj->RefCount == 0);
-      mtx_unlock(&oldObj->Mutex);
 
-      if (deleteFlag)
+      if (oldObj->RefCount == 0)
          _mesa_delete_vao(ctx, oldObj);
 
       *ptr = NULL;
@@ -207,18 +201,10 @@
 
    if (vao) {
       /* reference new array object */
-      mtx_lock(&vao->Mutex);
-      if (vao->RefCount == 0) {
-         /* this array's being deleted (look just above) */
-         /* Not sure this can every really happen.  Warn if it does. */
-         _mesa_problem(NULL, "referencing deleted array object");
-         *ptr = NULL;
-      }
-      else {
-         vao->RefCount++;
-         *ptr = vao;
-      }
-      mtx_unlock(&vao->Mutex);
+      assert(vao->RefCount > 0);
+
+      vao->RefCount++;
+      *ptr = vao;
    }
 }
 
@@ -274,7 +260,6 @@
 
    vao->Name = name;
 
-   mtx_init(&vao->Mutex, mtx_plain);
    vao->RefCount = 1;
 
    /* Init the individual arrays */
@@ -313,33 +298,6 @@
 
 
 /**
- * Add the given array object to the array object pool.
- */
-static void
-save_array_object(struct gl_context *ctx, struct gl_vertex_array_object *vao)
-{
-   if (vao->Name > 0) {
-      /* insert into hash table */
-      _mesa_HashInsert(ctx->Array.Objects, vao->Name, vao);
-   }
-}
-
-
-/**
- * Remove the given array object from the array object pool.
- * Do not deallocate the array object though.
- */
-static void
-remove_array_object(struct gl_context *ctx, struct gl_vertex_array_object *vao)
-{
-   if (vao->Name > 0) {
-      /* remove from hash table */
-      _mesa_HashRemove(ctx->Array.Objects, vao->Name);
-   }
-}
-
-
-/**
  * Updates the derived gl_vertex_arrays when a gl_vertex_attrib_array
  * or a gl_vertex_buffer_binding has changed.
  */
@@ -432,19 +390,19 @@
 
 
 /**
- * Helper for _mesa_BindVertexArray() and _mesa_BindVertexArrayAPPLE().
- * \param genRequired  specifies behavour when id was not generated with
- *                     glGenVertexArrays().
+ * ARB version of glBindVertexArray()
  */
-static void
-bind_vertex_array(struct gl_context *ctx, GLuint id, GLboolean genRequired)
+void GLAPIENTRY
+_mesa_BindVertexArray( GLuint id )
 {
+   GET_CURRENT_CONTEXT(ctx);
+
    struct gl_vertex_array_object * const oldObj = ctx->Array.VAO;
    struct gl_vertex_array_object *newObj = NULL;
 
    assert(oldObj != NULL);
 
-   if ( oldObj->Name == id )
+   if (oldObj->Name == id)
       return;   /* rebinding the same array object- no change */
 
    /*
@@ -460,32 +418,12 @@
       /* non-default array object */
       newObj = _mesa_lookup_vao(ctx, id);
       if (!newObj) {
-         if (genRequired) {
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBindVertexArray(non-gen name)");
-            return;
-         }
-
-         /* For APPLE version, generate a new array object now */
-	 newObj = _mesa_new_vao(ctx, id);
-         if (!newObj) {
-            _mesa_error(ctx, GL_OUT_OF_MEMORY, "glBindVertexArrayAPPLE");
-            return;
-         }
-
-         save_array_object(ctx, newObj);
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glBindVertexArray(non-gen name)");
+         return;
       }
 
-      if (!newObj->EverBound) {
-         /* The "Interactions with APPLE_vertex_array_object" section of the
-          * GL_ARB_vertex_array_object spec says:
-          *
-          *     "The first bind call, either BindVertexArray or
-          *     BindVertexArrayAPPLE, determines the semantic of the object."
-          */
-         newObj->ARBsemantics = genRequired;
-         newObj->EverBound = GL_TRUE;
-      }
+      newObj->EverBound = GL_TRUE;
    }
 
    if (ctx->Array.DrawMethod == DRAW_ARRAYS) {
@@ -509,36 +447,6 @@
 
 
 /**
- * ARB version of glBindVertexArray()
- * This function behaves differently from glBindVertexArrayAPPLE() in
- * that this function requires all ids to have been previously generated
- * by glGenVertexArrays[APPLE]().
- */
-void GLAPIENTRY
-_mesa_BindVertexArray( GLuint id )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   bind_vertex_array(ctx, id, GL_TRUE);
-}
-
-
-/**
- * Bind a new array.
- *
- * \todo
- * The binding could be done more efficiently by comparing the non-NULL
- * pointers in the old and new objects.  The only arrays that are "dirty" are
- * the ones that are non-NULL in either object.
- */
-void GLAPIENTRY
-_mesa_BindVertexArrayAPPLE( GLuint id )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   bind_vertex_array(ctx, id, GL_FALSE);
-}
-
-
-/**
  * Delete a set of array objects.
  *
  * \param n      Number of array objects to delete.
@@ -558,19 +466,18 @@
    for (i = 0; i < n; i++) {
       struct gl_vertex_array_object *obj = _mesa_lookup_vao(ctx, ids[i]);
 
-      if ( obj != NULL ) {
-	 assert( obj->Name == ids[i] );
+      if (obj) {
+         assert(obj->Name == ids[i]);
 
-	 /* If the array object is currently bound, the spec says "the binding
-	  * for that object reverts to zero and the default vertex array
-	  * becomes current."
-	  */
-	 if ( obj == ctx->Array.VAO ) {
-	    _mesa_BindVertexArray(0);
-	 }
+         /* If the array object is currently bound, the spec says "the binding
+          * for that object reverts to zero and the default vertex array
+          * becomes current."
+          */
+         if (obj == ctx->Array.VAO)
+            _mesa_BindVertexArray(0);
 
-	 /* The ID is immediately freed for re-use */
-	 remove_array_object(ctx, obj);
+         /* The ID is immediately freed for re-use */
+         _mesa_HashRemoveLocked(ctx->Array.Objects, obj->Name);
 
          if (ctx->Array.LastLookedUpVAO == obj)
             _mesa_reference_vao(ctx, &ctx->Array.LastLookedUpVAO, NULL);
@@ -586,7 +493,7 @@
 
 /**
  * Generate a set of unique array object IDs and store them in \c arrays.
- * Helper for _mesa_GenVertexArrays[APPLE]() and _mesa_CreateVertexArrays()
+ * Helper for _mesa_GenVertexArrays() and _mesa_CreateVertexArrays()
  * below.
  *
  * \param n       Number of IDs to generate.
@@ -626,7 +533,7 @@
          return;
       }
       obj->EverBound = create;
-      save_array_object(ctx, obj);
+      _mesa_HashInsertLocked(ctx->Array.Objects, obj->Name, obj);
       arrays[i] = first + i;
    }
 }
@@ -645,18 +552,6 @@
 
 
 /**
- * APPLE version of glGenVertexArraysAPPLE()
- * Arrays may live in VBOs or ordinary memory.
- */
-void GLAPIENTRY
-_mesa_GenVertexArraysAPPLE(GLsizei n, GLuint *arrays)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   gen_vertex_arrays(ctx, n, arrays, false, "glGenVertexArraysAPPLE");
-}
-
-
-/**
  * ARB_direct_state_access
  * Generates ID's and creates the array objects.
  */
@@ -682,14 +577,9 @@
    GET_CURRENT_CONTEXT(ctx);
    ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_FALSE);
 
-   if (id == 0)
-      return GL_FALSE;
-
    obj = _mesa_lookup_vao(ctx, id);
-   if (obj == NULL)
-      return GL_FALSE;
 
-   return obj->EverBound;
+   return obj != NULL && obj->EverBound;
 }
 
 
diff --git a/src/mesa/main/arrayobj.h b/src/mesa/main/arrayobj.h
index 830502e..1794968 100644
--- a/src/mesa/main/arrayobj.h
+++ b/src/mesa/main/arrayobj.h
@@ -35,7 +35,7 @@
 
 /**
  * \file arrayobj.h
- * Functions for the GL_APPLE_vertex_array_object extension.
+ * Functions for the GL_ARB_vertex_array_object extension.
  *
  * \author Ian Romanick <idr@us.ibm.com>
  * \author Brian Paul
@@ -96,14 +96,10 @@
 
 void GLAPIENTRY _mesa_BindVertexArray( GLuint id );
 
-void GLAPIENTRY _mesa_BindVertexArrayAPPLE( GLuint id );
-
 void GLAPIENTRY _mesa_DeleteVertexArrays(GLsizei n, const GLuint *ids);
 
 void GLAPIENTRY _mesa_GenVertexArrays(GLsizei n, GLuint *arrays);
 
-void GLAPIENTRY _mesa_GenVertexArraysAPPLE(GLsizei n, GLuint *buffer);
-
 void GLAPIENTRY _mesa_CreateVertexArrays(GLsizei n, GLuint *arrays);
 
 GLboolean GLAPIENTRY _mesa_IsVertexArray( GLuint id );
diff --git a/src/mesa/main/attrib.c b/src/mesa/main/attrib.c
index 8e738c9..43b5856 100644
--- a/src/mesa/main/attrib.c
+++ b/src/mesa/main/attrib.c
@@ -1105,6 +1105,20 @@
                enable = (const struct gl_enable_attrib *) attr->data;
                pop_enable_group(ctx, enable);
 	       ctx->NewState |= _NEW_ALL;
+               ctx->NewDriverState |= ctx->DriverFlags.NewAlphaTest |
+                                      ctx->DriverFlags.NewBlend |
+                                      ctx->DriverFlags.NewClipPlaneEnable |
+                                      ctx->DriverFlags.NewDepth |
+                                      ctx->DriverFlags.NewDepthClamp |
+                                      ctx->DriverFlags.NewFramebufferSRGB |
+                                      ctx->DriverFlags.NewLineState |
+                                      ctx->DriverFlags.NewLogicOp |
+                                      ctx->DriverFlags.NewMultisampleEnable |
+                                      ctx->DriverFlags.NewPolygonState |
+                                      ctx->DriverFlags.NewSampleAlphaToXEnable |
+                                      ctx->DriverFlags.NewSampleMask |
+                                      ctx->DriverFlags.NewScissorTest |
+                                      ctx->DriverFlags.NewStencil;
             }
             break;
          case GL_EVAL_BIT:
@@ -1287,7 +1301,12 @@
             break;
 	 case GL_POLYGON_STIPPLE_BIT:
 	    memcpy( ctx->PolygonStipple, attr->data, 32*sizeof(GLuint) );
-	    ctx->NewState |= _NEW_POLYGONSTIPPLE;
+
+            if (ctx->DriverFlags.NewPolygonStipple)
+               ctx->NewDriverState |= ctx->DriverFlags.NewPolygonStipple;
+            else
+               ctx->NewState |= _NEW_POLYGONSTIPPLE;
+
 	    if (ctx->Driver.PolygonStipple)
 	       ctx->Driver.PolygonStipple( ctx, (const GLubyte *) attr->data );
 	    break;
@@ -1428,8 +1447,7 @@
             break;
 
          default:
-            _mesa_problem( ctx, "Bad attrib flag in PopAttrib");
-            break;
+            unreachable("Bad attrib flag in PopAttrib");
       }
 
       next = attr->next;
@@ -1479,9 +1497,6 @@
    /* skip Name */
    /* skip RefCount */
 
-   /* In theory must be the same anyway, but on recreate make sure it matches */
-   dest->ARBsemantics = src->ARBsemantics;
-
    for (i = 0; i < ARRAY_SIZE(src->VertexAttrib); i++) {
       _mesa_copy_client_array(ctx, &dest->_VertexAttrib[i], &src->_VertexAttrib[i]);
       _mesa_copy_vertex_attrib_array(ctx, &dest->VertexAttrib[i], &src->VertexAttrib[i]);
@@ -1557,6 +1572,8 @@
                      struct gl_array_attrib *dest,
                      struct gl_array_attrib *src)
 {
+   bool is_vao_name_zero = src->VAO->Name == 0;
+
    /* The ARB_vertex_array_object spec says:
     *
     *     "BindVertexArray fails and an INVALID_OPERATION error is generated
@@ -1565,22 +1582,15 @@
     *     DeleteVertexArrays."
     *
     * Therefore popping a deleted VAO cannot magically recreate it.
-    *
-    * The semantics of objects created using APPLE_vertex_array_objects behave
-    * differently.  These objects expect to be recreated by pop.  Alas.
     */
-   const bool arb_vao = (src->VAO->Name != 0
-			 && src->VAO->ARBsemantics);
-
-   if (arb_vao && !_mesa_IsVertexArray(src->VAO->Name))
+   if (!is_vao_name_zero && !_mesa_IsVertexArray(src->VAO->Name))
       return;
 
-   _mesa_BindVertexArrayAPPLE(src->VAO->Name);
+   _mesa_BindVertexArray(src->VAO->Name);
 
    /* Restore or recreate the buffer objects by the names ... */
-   if (!arb_vao
-       || src->ArrayBufferObj->Name == 0
-       || _mesa_IsBuffer(src->ArrayBufferObj->Name)) {
+   if (is_vao_name_zero || src->ArrayBufferObj->Name == 0 ||
+       _mesa_IsBuffer(src->ArrayBufferObj->Name)) {
       /* ... and restore its content */
       copy_array_attrib(ctx, dest, src, false);
 
@@ -1590,9 +1600,8 @@
       copy_array_attrib(ctx, dest, src, true);
    }
 
-   if (!arb_vao
-       || src->VAO->IndexBufferObj->Name == 0
-       || _mesa_IsBuffer(src->VAO->IndexBufferObj->Name))
+   if (is_vao_name_zero || src->VAO->IndexBufferObj->Name == 0 ||
+       _mesa_IsBuffer(src->VAO->IndexBufferObj->Name))
       _mesa_BindBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB,
 			  src->VAO->IndexBufferObj->Name);
 }
@@ -1762,8 +1771,7 @@
             break;
 	 }
          default:
-            _mesa_problem( ctx, "Bad attrib flag in PopClientAttrib");
-            break;
+            unreachable("Bad attrib flag in PopClientAttrib");
       }
 
       next = node->next;
diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index 955fda1..3265628 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -157,20 +157,6 @@
 }
 
 
-/**
- * Specify the blending operation.
- *
- * \param sfactor source factor operator.
- * \param dfactor destination factor operator.
- *
- * \sa glBlendFunc, glBlendFuncSeparateEXT
- */
-void GLAPIENTRY
-_mesa_BlendFunc( GLenum sfactor, GLenum dfactor )
-{
-   _mesa_BlendFuncSeparate(sfactor, dfactor, sfactor, dfactor);
-}
-
 static GLboolean
 blend_factor_is_dual_src(GLenum factor)
 {
@@ -203,40 +189,23 @@
 }
 
 
-/**
- * Set the separate blend source/dest factors for all draw buffers.
- *
- * \param sfactorRGB RGB source factor operator.
- * \param dfactorRGB RGB destination factor operator.
- * \param sfactorA alpha source factor operator.
- * \param dfactorA alpha destination factor operator.
- */
-void GLAPIENTRY
-_mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB,
-                            GLenum sfactorA, GLenum dfactorA )
+/* Returns true if the requested factors already match the current state */
+static bool
+skip_blend_state_update(const struct gl_context *ctx,
+                        GLenum sfactorRGB, GLenum dfactorRGB,
+                        GLenum sfactorA, GLenum dfactorA)
 {
-   GET_CURRENT_CONTEXT(ctx);
-   const unsigned numBuffers = num_buffers(ctx);
-   unsigned buf;
-   bool changed = false;
-
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glBlendFuncSeparate %s %s %s %s\n",
-                  _mesa_enum_to_string(sfactorRGB),
-                  _mesa_enum_to_string(dfactorRGB),
-                  _mesa_enum_to_string(sfactorA),
-                  _mesa_enum_to_string(dfactorA));
-
    /* Check if we're really changing any state.  If not, return early. */
    if (ctx->Color._BlendFuncPerBuffer) {
+      const unsigned numBuffers = num_buffers(ctx);
+
       /* Check all per-buffer states */
-      for (buf = 0; buf < numBuffers; buf++) {
+      for (unsigned buf = 0; buf < numBuffers; buf++) {
          if (ctx->Color.Blend[buf].SrcRGB != sfactorRGB ||
              ctx->Color.Blend[buf].DstRGB != dfactorRGB ||
              ctx->Color.Blend[buf].SrcA != sfactorA ||
              ctx->Color.Blend[buf].DstA != dfactorA) {
-            changed = true;
-            break;
+            return false;
          }
       }
    }
@@ -246,22 +215,24 @@
           ctx->Color.Blend[0].DstRGB != dfactorRGB ||
           ctx->Color.Blend[0].SrcA != sfactorA ||
           ctx->Color.Blend[0].DstA != dfactorA) {
-         changed = true;
+         return false;
       }
    }
 
-   if (!changed)
-      return;
+   return true;
+}
 
-   if (!validate_blend_factors(ctx, "glBlendFuncSeparate",
-                               sfactorRGB, dfactorRGB,
-                               sfactorA, dfactorA)) {
-      return;
-   }
 
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
+static void
+blend_func_separate(struct gl_context *ctx,
+                    GLenum sfactorRGB, GLenum dfactorRGB,
+                    GLenum sfactorA, GLenum dfactorA)
+{
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewBlend ? 0 : _NEW_COLOR);
+   ctx->NewDriverState |= ctx->DriverFlags.NewBlend;
 
-   for (buf = 0; buf < numBuffers; buf++) {
+   const unsigned numBuffers = num_buffers(ctx);
+   for (unsigned buf = 0; buf < numBuffers; buf++) {
       ctx->Color.Blend[buf].SrcRGB = sfactorRGB;
       ctx->Color.Blend[buf].DstRGB = dfactorRGB;
       ctx->Color.Blend[buf].SrcA = sfactorA;
@@ -269,7 +240,7 @@
    }
 
    update_uses_dual_src(ctx, 0);
-   for (buf = 1; buf < numBuffers; buf++) {
+   for (unsigned buf = 1; buf < numBuffers; buf++) {
       ctx->Color.Blend[buf]._UsesDualSrc = ctx->Color.Blend[0]._UsesDualSrc;
    }
 
@@ -283,6 +254,100 @@
 
 
 /**
+ * Specify the blending operation.
+ *
+ * \param sfactor source factor operator.
+ * \param dfactor destination factor operator.
+ *
+ * \sa glBlendFunc, glBlendFuncSeparateEXT
+ */
+void GLAPIENTRY
+_mesa_BlendFunc( GLenum sfactor, GLenum dfactor )
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (skip_blend_state_update(ctx, sfactor, dfactor, sfactor, dfactor))
+      return;
+
+   if (!validate_blend_factors(ctx, "glBlendFunc",
+                               sfactor, dfactor, sfactor, dfactor)) {
+      return;
+   }
+
+   blend_func_separate(ctx, sfactor, dfactor, sfactor, dfactor);
+}
+
+
+void GLAPIENTRY
+_mesa_BlendFunc_no_error(GLenum sfactor, GLenum dfactor)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (skip_blend_state_update(ctx, sfactor, dfactor, sfactor, dfactor))
+      return;
+
+   blend_func_separate(ctx, sfactor, dfactor, sfactor, dfactor);
+}
+
+
+/**
+ * Set the separate blend source/dest factors for all draw buffers.
+ *
+ * \param sfactorRGB RGB source factor operator.
+ * \param dfactorRGB RGB destination factor operator.
+ * \param sfactorA alpha source factor operator.
+ * \param dfactorA alpha destination factor operator.
+ */
+void GLAPIENTRY
+_mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB,
+                            GLenum sfactorA, GLenum dfactorA )
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glBlendFuncSeparate %s %s %s %s\n",
+                  _mesa_enum_to_string(sfactorRGB),
+                  _mesa_enum_to_string(dfactorRGB),
+                  _mesa_enum_to_string(sfactorA),
+                  _mesa_enum_to_string(dfactorA));
+
+
+
+   if (skip_blend_state_update(ctx, sfactorRGB, dfactorRGB, sfactorA, dfactorA))
+      return;
+
+   if (!validate_blend_factors(ctx, "glBlendFuncSeparate",
+                               sfactorRGB, dfactorRGB,
+                               sfactorA, dfactorA)) {
+      return;
+   }
+
+   blend_func_separate(ctx, sfactorRGB, dfactorRGB, sfactorA, dfactorA);
+}
+
+
+void GLAPIENTRY
+_mesa_BlendFuncSeparate_no_error(GLenum sfactorRGB, GLenum dfactorRGB,
+                                 GLenum sfactorA, GLenum dfactorA)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (skip_blend_state_update(ctx, sfactorRGB, dfactorRGB, sfactorA, dfactorA))
+      return;
+
+   blend_func_separate(ctx, sfactorRGB, dfactorRGB, sfactorA, dfactorA);
+}
+
+
+void GLAPIENTRY
+_mesa_BlendFunciARB_no_error(GLuint buf, GLenum sfactor, GLenum dfactor)
+{
+   _mesa_BlendFuncSeparateiARB_no_error(buf, sfactor, dfactor, sfactor,
+                                        dfactor);
+}
+
+
+/**
  * Set blend source/dest factors for one color buffer/target.
  */
 void GLAPIENTRY
@@ -292,24 +357,23 @@
 }
 
 
-/**
- * Set separate blend source/dest factors for one color buffer/target.
- */
-void GLAPIENTRY
-_mesa_BlendFuncSeparateiARB(GLuint buf, GLenum sfactorRGB, GLenum dfactorRGB,
-                         GLenum sfactorA, GLenum dfactorA)
+static ALWAYS_INLINE void
+blend_func_separatei(GLuint buf, GLenum sfactorRGB, GLenum dfactorRGB,
+                     GLenum sfactorA, GLenum dfactorA, bool no_error)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (!ctx->Extensions.ARB_draw_buffers_blend) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glBlendFunc[Separate]i()");
-      return;
-   }
+   if (!no_error) {
+      if (!ctx->Extensions.ARB_draw_buffers_blend) {
+         _mesa_error(ctx, GL_INVALID_OPERATION, "glBlendFunc[Separate]i()");
+         return;
+      }
 
-   if (buf >= ctx->Const.MaxDrawBuffers) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glBlendFuncSeparatei(buffer=%u)",
-                  buf);
-      return;
+      if (buf >= ctx->Const.MaxDrawBuffers) {
+         _mesa_error(ctx, GL_INVALID_VALUE, "glBlendFuncSeparatei(buffer=%u)",
+                     buf);
+         return;
+      }
    }
 
    if (ctx->Color.Blend[buf].SrcRGB == sfactorRGB &&
@@ -318,13 +382,14 @@
        ctx->Color.Blend[buf].DstA == dfactorA)
       return; /* no change */
 
-   if (!validate_blend_factors(ctx, "glBlendFuncSeparatei",
-                               sfactorRGB, dfactorRGB,
-                               sfactorA, dfactorA)) {
+   if (!no_error && !validate_blend_factors(ctx, "glBlendFuncSeparatei",
+                                            sfactorRGB, dfactorRGB,
+                                            sfactorA, dfactorA)) {
       return;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewBlend ? 0 : _NEW_COLOR);
+   ctx->NewDriverState |= ctx->DriverFlags.NewBlend;
 
    ctx->Color.Blend[buf].SrcRGB = sfactorRGB;
    ctx->Color.Blend[buf].DstRGB = dfactorRGB;
@@ -335,6 +400,28 @@
 }
 
 
+void GLAPIENTRY
+_mesa_BlendFuncSeparateiARB_no_error(GLuint buf, GLenum sfactorRGB,
+                                     GLenum dfactorRGB, GLenum sfactorA,
+                                     GLenum dfactorA)
+{
+   blend_func_separatei(buf, sfactorRGB, dfactorRGB, sfactorA, dfactorA,
+                        true);
+}
+
+
+/**
+ * Set separate blend source/dest factors for one color buffer/target.
+ */
+void GLAPIENTRY
+_mesa_BlendFuncSeparateiARB(GLuint buf, GLenum sfactorRGB, GLenum dfactorRGB,
+                            GLenum sfactorA, GLenum dfactorA)
+{
+   blend_func_separatei(buf, sfactorRGB, dfactorRGB, sfactorA, dfactorA,
+                        false);
+}
+
+
 /**
  * Return true if \p mode is a legal blending equation, excluding
  * GL_KHR_blend_equation_advanced modes.
@@ -448,7 +535,7 @@
       return;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
+   _mesa_flush_vertices_for_blend_state(ctx);
 
    for (buf = 0; buf < numBuffers; buf++) {
       ctx->Color.Blend[buf].EquationRGB = mode;
@@ -490,7 +577,7 @@
        ctx->Color.Blend[buf].EquationA == mode)
       return;  /* no change */
 
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
+   _mesa_flush_vertices_for_blend_state(ctx);
    ctx->Color.Blend[buf].EquationRGB = mode;
    ctx->Color.Blend[buf].EquationA = mode;
    ctx->Color._BlendEquationPerBuffer = GL_TRUE;
@@ -556,7 +643,7 @@
       return;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
+   _mesa_flush_vertices_for_blend_state(ctx);
 
    for (buf = 0; buf < numBuffers; buf++) {
       ctx->Color.Blend[buf].EquationRGB = modeRGB;
@@ -570,6 +657,31 @@
 }
 
 
+static void
+blend_equation_separatei(struct gl_context *ctx, GLuint buf, GLenum modeRGB,
+                         GLenum modeA)
+{
+   if (ctx->Color.Blend[buf].EquationRGB == modeRGB &&
+       ctx->Color.Blend[buf].EquationA == modeA)
+      return;  /* no change */
+
+   _mesa_flush_vertices_for_blend_state(ctx);
+   ctx->Color.Blend[buf].EquationRGB = modeRGB;
+   ctx->Color.Blend[buf].EquationA = modeA;
+   ctx->Color._BlendEquationPerBuffer = GL_TRUE;
+   ctx->Color._AdvancedBlendMode = BLEND_NONE;
+}
+
+
+void GLAPIENTRY
+_mesa_BlendEquationSeparateiARB_no_error(GLuint buf, GLenum modeRGB,
+                                         GLenum modeA)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   blend_equation_separatei(ctx, buf, modeRGB, modeA);
+}
+
+
 /**
  * Set separate blend equations for one color buffer/target.
  */
@@ -605,15 +717,7 @@
       return;
    }
 
-   if (ctx->Color.Blend[buf].EquationRGB == modeRGB &&
-       ctx->Color.Blend[buf].EquationA == modeA)
-      return;  /* no change */
-
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
-   ctx->Color.Blend[buf].EquationRGB = modeRGB;
-   ctx->Color.Blend[buf].EquationA = modeA;
-   ctx->Color._BlendEquationPerBuffer = GL_TRUE;
-   ctx->Color._AdvancedBlendMode = BLEND_NONE;
+   blend_equation_separatei(ctx, buf, modeRGB, modeA);
 }
 
 
@@ -645,7 +749,8 @@
    if (TEST_EQ_4V(tmp, ctx->Color.BlendColorUnclamped))
       return;
 
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewBlendColor ? 0 : _NEW_COLOR);
+   ctx->NewDriverState |= ctx->DriverFlags.NewBlendColor;
    COPY_4FV( ctx->Color.BlendColorUnclamped, tmp );
 
    ctx->Color.BlendColor[0] = CLAMP(tmp[0], 0.0F, 1.0F);
@@ -689,7 +794,8 @@
    case GL_NOTEQUAL:
    case GL_GEQUAL:
    case GL_ALWAYS:
-      FLUSH_VERTICES(ctx, _NEW_COLOR);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewAlphaTest ? 0 : _NEW_COLOR);
+      ctx->NewDriverState |= ctx->DriverFlags.NewAlphaTest;
       ctx->Color.AlphaFunc = func;
       ctx->Color.AlphaRefUnclamped = ref;
       ctx->Color.AlphaRef = CLAMP(ref, 0.0F, 1.0F);
@@ -705,6 +811,21 @@
 }
 
 
+static void
+logic_op(struct gl_context *ctx, GLenum opcode)
+{
+   if (ctx->Color.LogicOp == opcode)
+      return;
+
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewLogicOp ? 0 : _NEW_COLOR);
+   ctx->NewDriverState |= ctx->DriverFlags.NewLogicOp;
+   ctx->Color.LogicOp = opcode;
+
+   if (ctx->Driver.LogicOpcode)
+      ctx->Driver.LogicOpcode(ctx, opcode);
+}
+
+
 /**
  * Specify a logic pixel operation for color index rendering.
  *
@@ -746,14 +867,15 @@
 	 return;
    }
 
-   if (ctx->Color.LogicOp == opcode)
-      return;
+   logic_op(ctx, opcode);
+}
 
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
-   ctx->Color.LogicOp = opcode;
 
-   if (ctx->Driver.LogicOpcode)
-      ctx->Driver.LogicOpcode( ctx, opcode );
+void GLAPIENTRY
+_mesa_LogicOp_no_error(GLenum opcode)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   logic_op(ctx, opcode);
 }
 
 
@@ -765,7 +887,8 @@
    if (ctx->Color.IndexMask == mask)
       return;
 
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewColorMask ? 0 : _NEW_COLOR);
+   ctx->NewDriverState |= ctx->DriverFlags.NewColorMask;
    ctx->Color.IndexMask = mask;
 }
 
@@ -809,7 +932,8 @@
    for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
       if (!TEST_EQ_4V(tmp, ctx->Color.ColorMask[i])) {
          if (!flushed) {
-            FLUSH_VERTICES(ctx, _NEW_COLOR);
+            FLUSH_VERTICES(ctx, ctx->DriverFlags.NewColorMask ? 0 : _NEW_COLOR);
+            ctx->NewDriverState |= ctx->DriverFlags.NewColorMask;
          }
          flushed = GL_TRUE;
          COPY_4UBV(ctx->Color.ColorMask[i], tmp);
@@ -851,7 +975,8 @@
    if (TEST_EQ_4V(tmp, ctx->Color.ColorMask[buf]))
       return;
 
-   FLUSH_VERTICES(ctx, _NEW_COLOR);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewColorMask ? 0 : _NEW_COLOR);
+   ctx->NewDriverState |= ctx->DriverFlags.NewColorMask;
    COPY_4UBV(ctx->Color.ColorMask[buf], tmp);
 }
 
diff --git a/src/mesa/main/blend.h b/src/mesa/main/blend.h
index 8ab9e02..0f0bb62 100644
--- a/src/mesa/main/blend.h
+++ b/src/mesa/main/blend.h
@@ -34,7 +34,9 @@
 
 
 #include "glheader.h"
+#include "context.h"
 #include "formats.h"
+#include "extensions.h"
 
 struct gl_context;
 struct gl_framebuffer;
@@ -43,17 +45,28 @@
 extern void GLAPIENTRY
 _mesa_BlendFunc( GLenum sfactor, GLenum dfactor );
 
+extern void GLAPIENTRY
+_mesa_BlendFunc_no_error(GLenum sfactor, GLenum dfactor);
 
 extern void GLAPIENTRY
 _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB,
                             GLenum sfactorA, GLenum dfactorA );
 
+extern void GLAPIENTRY
+_mesa_BlendFuncSeparate_no_error(GLenum sfactorRGB, GLenum dfactorRGB,
+                                 GLenum sfactorA, GLenum dfactorA);
 
 extern void GLAPIENTRY
+_mesa_BlendFunciARB_no_error(GLuint buf, GLenum sfactor, GLenum dfactor);
+extern void GLAPIENTRY
 _mesa_BlendFunciARB(GLuint buf, GLenum sfactor, GLenum dfactor);
 
 
 extern void GLAPIENTRY
+_mesa_BlendFuncSeparateiARB_no_error(GLuint buf, GLenum sfactorRGB,
+                                     GLenum dfactorRGB, GLenum sfactorA,
+                                     GLenum dfactorA);
+extern void GLAPIENTRY
 _mesa_BlendFuncSeparateiARB(GLuint buf, GLenum sfactorRGB, GLenum dfactorRGB,
                          GLenum sfactorA, GLenum dfactorA);
 
@@ -71,6 +84,9 @@
 
 
 extern void GLAPIENTRY
+_mesa_BlendEquationSeparateiARB_no_error(GLuint buf, GLenum modeRGB,
+                                         GLenum modeA);
+extern void GLAPIENTRY
 _mesa_BlendEquationSeparateiARB(GLuint buf, GLenum modeRGB, GLenum modeA);
 
 
@@ -87,6 +103,10 @@
 
 
 extern void GLAPIENTRY
+_mesa_LogicOp_no_error(GLenum opcode);
+
+
+extern void GLAPIENTRY
 _mesa_IndexMask( GLuint mask );
 
 extern void GLAPIENTRY
@@ -127,4 +147,22 @@
 extern void  
 _mesa_init_color( struct gl_context * ctx );
 
+
+static inline void
+_mesa_flush_vertices_for_blend_state(struct gl_context *ctx)
+{
+   /* The advanced blend mode needs _NEW_COLOR to update the state constant,
+    * so we have to set it. This is inefficient.
+    * This should only be done for states that affect the state constant.
+    * It shouldn't be done for other blend states.
+    */
+   if (_mesa_has_KHR_blend_equation_advanced(ctx) ||
+       !ctx->DriverFlags.NewBlend) {
+      FLUSH_VERTICES(ctx, _NEW_COLOR);
+   } else {
+      FLUSH_VERTICES(ctx, 0);
+   }
+   ctx->NewDriverState |= ctx->DriverFlags.NewBlend;
+}
+
 #endif
diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c
index b41a21f..8bb3ba3 100644
--- a/src/mesa/main/blit.c
+++ b/src/mesa/main/blit.c
@@ -177,18 +177,176 @@
 }
 
 
-void
-_mesa_blit_framebuffer(struct gl_context *ctx,
-                       struct gl_framebuffer *readFb,
-                       struct gl_framebuffer *drawFb,
-                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
-                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                       GLbitfield mask, GLenum filter, const char *func)
+static bool
+validate_color_buffer(struct gl_context *ctx, struct gl_framebuffer *readFb,
+                      struct gl_framebuffer *drawFb, GLenum filter,
+                      const char *func)
 {
-   const GLbitfield legalMaskBits = (GL_COLOR_BUFFER_BIT |
-                                     GL_DEPTH_BUFFER_BIT |
-                                     GL_STENCIL_BUFFER_BIT);
+   const GLuint numColorDrawBuffers = drawFb->_NumColorDrawBuffers;
+   const struct gl_renderbuffer *colorReadRb = readFb->_ColorReadBuffer;
+   const struct gl_renderbuffer *colorDrawRb = NULL;
+   GLuint i;
 
+   for (i = 0; i < numColorDrawBuffers; i++) {
+      colorDrawRb = drawFb->_ColorDrawBuffers[i];
+      if (!colorDrawRb)
+         continue;
+
+      /* Page 193 (page 205 of the PDF) in section 4.3.2 of the OpenGL
+       * ES 3.0.1 spec says:
+       *
+       *     "If the source and destination buffers are identical, an
+       *     INVALID_OPERATION error is generated. Different mipmap levels of a
+       *     texture, different layers of a three-dimensional texture or
+       *     two-dimensional array texture, and different faces of a cube map
+       *     texture do not constitute identical buffers."
+       */
+      if (_mesa_is_gles3(ctx) && (colorDrawRb == colorReadRb)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(source and destination color buffer cannot be the "
+                     "same)", func);
+         return false;
+      }
+
+      if (!compatible_color_datatypes(colorReadRb->Format,
+                                      colorDrawRb->Format)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(color buffer datatypes mismatch)", func);
+         return false;
+      }
+
+      /* extra checks for multisample copies... */
+      if (readFb->Visual.samples > 0 || drawFb->Visual.samples > 0) {
+         /* color formats must match on GLES. This isn't checked on desktop GL
+          * because the GL 4.4 spec was changed to allow it.  In the section
+          * entitled “Changes in the released
+          * Specification of July 22, 2013” it says:
+          *
+          * “Relax BlitFramebuffer in section 18.3.1 so that format conversion
+          * can take place during multisample blits, since drivers already
+          * allow this and some apps depend on it.”
+          */
+         if (_mesa_is_gles(ctx) &&
+             !compatible_resolve_formats(colorReadRb, colorDrawRb)) {
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(bad src/dst multisample pixel formats)", func);
+            return false;
+         }
+      }
+
+   }
+
+   if (filter != GL_NEAREST) {
+      /* From EXT_framebuffer_multisample_blit_scaled specification:
+       * "Calling BlitFramebuffer will result in an INVALID_OPERATION error if
+       * filter is not NEAREST and read buffer contains integer data."
+       */
+      GLenum type = _mesa_get_format_datatype(colorReadRb->Format);
+      if (type == GL_INT || type == GL_UNSIGNED_INT) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(integer color type)", func);
+         return false;
+      }
+   }
+   return true;
+}
+
+
+static bool
+validate_stencil_buffer(struct gl_context *ctx, struct gl_framebuffer *readFb,
+                        struct gl_framebuffer *drawFb, const char *func)
+{
+   struct gl_renderbuffer *readRb =
+      readFb->Attachment[BUFFER_STENCIL].Renderbuffer;
+   struct gl_renderbuffer *drawRb =
+      drawFb->Attachment[BUFFER_STENCIL].Renderbuffer;
+   int read_z_bits, draw_z_bits;
+
+   if (_mesa_is_gles3(ctx) && (drawRb == readRb)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(source and destination stencil buffer cannot be the "
+                  "same)", func);
+      return false;
+   }
+
+   if (_mesa_get_format_bits(readRb->Format, GL_STENCIL_BITS) !=
+       _mesa_get_format_bits(drawRb->Format, GL_STENCIL_BITS)) {
+      /* There is no need to check the stencil datatype here, because
+       * there is only one: GL_UNSIGNED_INT.
+       */
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(stencil attachment format mismatch)", func);
+      return false;
+   }
+
+   read_z_bits = _mesa_get_format_bits(readRb->Format, GL_DEPTH_BITS);
+   draw_z_bits = _mesa_get_format_bits(drawRb->Format, GL_DEPTH_BITS);
+
+   /* If both buffers also have depth data, the depth formats must match
+    * as well.  If one doesn't have depth, it's not blitted, so we should
+    * ignore the depth format check.
+    */
+   if (read_z_bits > 0 && draw_z_bits > 0 &&
+       (read_z_bits != draw_z_bits ||
+        _mesa_get_format_datatype(readRb->Format) !=
+        _mesa_get_format_datatype(drawRb->Format))) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(stencil attachment depth format mismatch)", func);
+      return false;
+   }
+   return true;
+}
+
+
+static bool
+validate_depth_buffer(struct gl_context *ctx, struct gl_framebuffer *readFb,
+                      struct gl_framebuffer *drawFb, const char *func)
+{
+   struct gl_renderbuffer *readRb =
+      readFb->Attachment[BUFFER_DEPTH].Renderbuffer;
+   struct gl_renderbuffer *drawRb =
+      drawFb->Attachment[BUFFER_DEPTH].Renderbuffer;
+   int read_s_bit, draw_s_bit;
+
+   if (_mesa_is_gles3(ctx) && (drawRb == readRb)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(source and destination depth buffer cannot be the same)",
+                  func);
+      return false;
+   }
+
+   if ((_mesa_get_format_bits(readRb->Format, GL_DEPTH_BITS) !=
+        _mesa_get_format_bits(drawRb->Format, GL_DEPTH_BITS)) ||
+       (_mesa_get_format_datatype(readRb->Format) !=
+        _mesa_get_format_datatype(drawRb->Format))) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(depth attachment format mismatch)", func);
+      return false;
+   }
+
+   read_s_bit = _mesa_get_format_bits(readRb->Format, GL_STENCIL_BITS);
+   draw_s_bit = _mesa_get_format_bits(drawRb->Format, GL_STENCIL_BITS);
+
+   /* If both buffers also have stencil data, the stencil formats must match as
+    * well.  If one doesn't have stencil, it's not blitted, so we should ignore
+    * the stencil format check.
+    */
+   if (read_s_bit > 0 && draw_s_bit > 0 && read_s_bit != draw_s_bit) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(depth attachment stencil bits mismatch)", func);
+      return false;
+   }
+   return true;
+}
+
+
+static ALWAYS_INLINE void
+blit_framebuffer(struct gl_context *ctx,
+                 struct gl_framebuffer *readFb, struct gl_framebuffer *drawFb,
+                 GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                 GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                 GLbitfield mask, GLenum filter, bool no_error, const char *func)
+{
    FLUSH_VERTICES(ctx, 0);
 
    if (!readFb || !drawFb) {
@@ -204,47 +362,105 @@
    /* Make sure drawFb has an initialized bounding box. */
    _mesa_update_draw_buffer_bounds(ctx, drawFb);
 
-   /* check for complete framebuffers */
-   if (drawFb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT ||
-       readFb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
-      _mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
-                  "%s(incomplete draw/read buffers)", func);
-      return;
-   }
+   if (!no_error) {
+      const GLbitfield legalMaskBits = (GL_COLOR_BUFFER_BIT |
+                                        GL_DEPTH_BUFFER_BIT |
+                                        GL_STENCIL_BUFFER_BIT);
 
-   if (!is_valid_blit_filter(ctx, filter)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid filter %s)", func,
-                  _mesa_enum_to_string(filter));
-      return;
-   }
+      /* check for complete framebuffers */
+      if (drawFb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT ||
+          readFb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+         _mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
+                     "%s(incomplete draw/read buffers)", func);
+         return;
+      }
 
-   if ((filter == GL_SCALED_RESOLVE_FASTEST_EXT ||
-        filter == GL_SCALED_RESOLVE_NICEST_EXT) &&
-        (readFb->Visual.samples == 0 || drawFb->Visual.samples > 0)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s: invalid samples)", func,
-                  _mesa_enum_to_string(filter));
-      return;
-   }
+      if (!is_valid_blit_filter(ctx, filter)) {
+         _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid filter %s)", func,
+                     _mesa_enum_to_string(filter));
+         return;
+      }
 
-   if (mask & ~legalMaskBits) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s(invalid mask bits set)", func);
-      return;
-   }
+      if ((filter == GL_SCALED_RESOLVE_FASTEST_EXT ||
+           filter == GL_SCALED_RESOLVE_NICEST_EXT) &&
+           (readFb->Visual.samples == 0 || drawFb->Visual.samples > 0)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s: invalid samples)", func,
+                     _mesa_enum_to_string(filter));
+         return;
+      }
 
-   /* depth/stencil must be blitted with nearest filtering */
-   if ((mask & (GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT))
-        && filter != GL_NEAREST) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-             "%s(depth/stencil requires GL_NEAREST filter)", func);
-      return;
+      if (mask & ~legalMaskBits) {
+         _mesa_error(ctx, GL_INVALID_VALUE, "%s(invalid mask bits set)", func);
+         return;
+      }
+
+      /* depth/stencil must be blitted with nearest filtering */
+      if ((mask & (GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT))
+           && filter != GL_NEAREST) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                "%s(depth/stencil requires GL_NEAREST filter)", func);
+         return;
+      }
+
+      if (_mesa_is_gles3(ctx)) {
+         /* Page 194 (page 206 of the PDF) in section 4.3.2 of the OpenGL ES
+          * 3.0.1 spec says:
+          *
+          *     "If SAMPLE_BUFFERS for the draw framebuffer is greater than
+          *     zero, an INVALID_OPERATION error is generated."
+          */
+         if (drawFb->Visual.samples > 0) {
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(destination samples must be 0)", func);
+            return;
+         }
+
+         /* Page 194 (page 206 of the PDF) in section 4.3.2 of the OpenGL ES
+          * 3.0.1 spec says:
+          *
+          *     "If SAMPLE_BUFFERS for the read framebuffer is greater than
+          *     zero, no copy is performed and an INVALID_OPERATION error is
+          *     generated if the formats of the read and draw framebuffers are
+          *     not identical or if the source and destination rectangles are
+          *     not defined with the same (X0, Y0) and (X1, Y1) bounds."
+          *
+          * The format check was made above because desktop OpenGL has the same
+          * requirement.
+          */
+         if (readFb->Visual.samples > 0
+             && (srcX0 != dstX0 || srcY0 != dstY0
+                 || srcX1 != dstX1 || srcY1 != dstY1)) {
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(bad src/dst multisample region)", func);
+            return;
+         }
+      } else {
+         if (readFb->Visual.samples > 0 &&
+             drawFb->Visual.samples > 0 &&
+             readFb->Visual.samples != drawFb->Visual.samples) {
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(mismatched samples)", func);
+            return;
+         }
+
+         /* extra checks for multisample copies... */
+         if ((readFb->Visual.samples > 0 || drawFb->Visual.samples > 0) &&
+             (filter == GL_NEAREST || filter == GL_LINEAR)) {
+            /* src and dest region sizes must be the same */
+            if (abs(srcX1 - srcX0) != abs(dstX1 - dstX0) ||
+                abs(srcY1 - srcY0) != abs(dstY1 - dstY0)) {
+               _mesa_error(ctx, GL_INVALID_OPERATION,
+                           "%s(bad src/dst multisample region sizes)", func);
+               return;
+            }
+         }
+      }
    }
 
    /* get color read/draw renderbuffers */
    if (mask & GL_COLOR_BUFFER_BIT) {
       const GLuint numColorDrawBuffers = drawFb->_NumColorDrawBuffers;
       const struct gl_renderbuffer *colorReadRb = readFb->_ColorReadBuffer;
-      const struct gl_renderbuffer *colorDrawRb = NULL;
-      GLuint i;
 
       /* From the EXT_framebuffer_object spec:
        *
@@ -254,67 +470,9 @@
        */
       if (!colorReadRb || numColorDrawBuffers == 0) {
          mask &= ~GL_COLOR_BUFFER_BIT;
-      }
-      else {
-         for (i = 0; i < numColorDrawBuffers; i++) {
-            colorDrawRb = drawFb->_ColorDrawBuffers[i];
-            if (!colorDrawRb)
-               continue;
-
-            /* Page 193 (page 205 of the PDF) in section 4.3.2 of the OpenGL
-             * ES 3.0.1 spec says:
-             *
-             *     "If the source and destination buffers are identical, an
-             *     INVALID_OPERATION error is generated. Different mipmap
-             *     levels of a texture, different layers of a three-
-             *     dimensional texture or two-dimensional array texture, and
-             *     different faces of a cube map texture do not constitute
-             *     identical buffers."
-             */
-            if (_mesa_is_gles3(ctx) && (colorDrawRb == colorReadRb)) {
-               _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "%s(source and destination color "
-                           "buffer cannot be the same)", func);
-               return;
-            }
-
-            if (!compatible_color_datatypes(colorReadRb->Format,
-                                            colorDrawRb->Format)) {
-               _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "%s(color buffer datatypes mismatch)", func);
-               return;
-            }
-            /* extra checks for multisample copies... */
-            if (readFb->Visual.samples > 0 || drawFb->Visual.samples > 0) {
-               /* color formats must match on GLES. This isn't checked on
-                * desktop GL because the GL 4.4 spec was changed to allow it.
-                * In the section entitled “Changes in the released
-                * Specification of July 22, 2013” it says:
-                *
-                * “Relax BlitFramebuffer in section 18.3.1 so that format
-                *  conversion can take place during multisample blits, since
-                *  drivers already allow this and some apps depend on it.”
-                */
-               if (_mesa_is_gles(ctx) &&
-                   !compatible_resolve_formats(colorReadRb, colorDrawRb)) {
-                  _mesa_error(ctx, GL_INVALID_OPERATION,
-                         "%s(bad src/dst multisample pixel formats)", func);
-                  return;
-               }
-            }
-         }
-         if (filter != GL_NEAREST) {
-            /* From EXT_framebuffer_multisample_blit_scaled specification:
-             * "Calling BlitFramebuffer will result in an INVALID_OPERATION error
-             * if filter is not NEAREST and read buffer contains integer data."
-             */
-            GLenum type = _mesa_get_format_datatype(colorReadRb->Format);
-            if (type == GL_INT || type == GL_UNSIGNED_INT) {
-               _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "%s(integer color type)", func);
-               return;
-            }
-         }
+      } else if (!no_error) {
+         if (!validate_color_buffer(ctx, readFb, drawFb, filter, func))
+            return;
       }
    }
 
@@ -332,43 +490,9 @@
        */
       if ((readRb == NULL) || (drawRb == NULL)) {
          mask &= ~GL_STENCIL_BUFFER_BIT;
-      }
-      else {
-         int read_z_bits, draw_z_bits;
-
-         if (_mesa_is_gles3(ctx) && (drawRb == readRb)) {
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "%s(source and destination stencil "
-                        "buffer cannot be the same)", func);
+      } else if (!no_error) {
+         if (!validate_stencil_buffer(ctx, readFb, drawFb, func))
             return;
-         }
-
-         if (_mesa_get_format_bits(readRb->Format, GL_STENCIL_BITS) !=
-             _mesa_get_format_bits(drawRb->Format, GL_STENCIL_BITS)) {
-            /* There is no need to check the stencil datatype here, because
-             * there is only one: GL_UNSIGNED_INT.
-             */
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "%s(stencil attachment format mismatch)", func);
-            return;
-         }
-
-         read_z_bits = _mesa_get_format_bits(readRb->Format, GL_DEPTH_BITS);
-         draw_z_bits = _mesa_get_format_bits(drawRb->Format, GL_DEPTH_BITS);
-
-         /* If both buffers also have depth data, the depth formats must match
-          * as well.  If one doesn't have depth, it's not blitted, so we should
-          * ignore the depth format check.
-          */
-         if (read_z_bits > 0 && draw_z_bits > 0 &&
-             (read_z_bits != draw_z_bits ||
-              _mesa_get_format_datatype(readRb->Format) !=
-              _mesa_get_format_datatype(drawRb->Format))) {
-
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "%s(stencil attachment depth format mismatch)", func);
-            return;
-         }
       }
    }
 
@@ -386,93 +510,9 @@
        */
       if ((readRb == NULL) || (drawRb == NULL)) {
          mask &= ~GL_DEPTH_BUFFER_BIT;
-      }
-      else {
-         int read_s_bit, draw_s_bit;
-
-         if (_mesa_is_gles3(ctx) && (drawRb == readRb)) {
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "%s(source and destination depth "
-                        "buffer cannot be the same)", func);
+      } else if (!no_error) {
+         if (!validate_depth_buffer(ctx, readFb, drawFb, func))
             return;
-         }
-
-         if ((_mesa_get_format_bits(readRb->Format, GL_DEPTH_BITS) !=
-              _mesa_get_format_bits(drawRb->Format, GL_DEPTH_BITS)) ||
-             (_mesa_get_format_datatype(readRb->Format) !=
-              _mesa_get_format_datatype(drawRb->Format))) {
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "%s(depth attachment format mismatch)", func);
-            return;
-         }
-
-         read_s_bit = _mesa_get_format_bits(readRb->Format, GL_STENCIL_BITS);
-         draw_s_bit = _mesa_get_format_bits(drawRb->Format, GL_STENCIL_BITS);
-
-         /* If both buffers also have stencil data, the stencil formats must
-          * match as well.  If one doesn't have stencil, it's not blitted, so
-          * we should ignore the stencil format check.
-          */
-         if (read_s_bit > 0 && draw_s_bit > 0 && read_s_bit != draw_s_bit) {
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "%s(depth attachment stencil bits mismatch)", func);
-            return;
-         }
-      }
-   }
-
-
-   if (_mesa_is_gles3(ctx)) {
-      /* Page 194 (page 206 of the PDF) in section 4.3.2 of the OpenGL ES
-       * 3.0.1 spec says:
-       *
-       *     "If SAMPLE_BUFFERS for the draw framebuffer is greater than zero,
-       *     an INVALID_OPERATION error is generated."
-       */
-      if (drawFb->Visual.samples > 0) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "%s(destination samples must be 0)", func);
-         return;
-      }
-
-      /* Page 194 (page 206 of the PDF) in section 4.3.2 of the OpenGL ES
-       * 3.0.1 spec says:
-       *
-       *     "If SAMPLE_BUFFERS for the read framebuffer is greater than zero,
-       *     no copy is performed and an INVALID_OPERATION error is generated
-       *     if the formats of the read and draw framebuffers are not
-       *     identical or if the source and destination rectangles are not
-       *     defined with the same (X0, Y0) and (X1, Y1) bounds."
-       *
-       * The format check was made above because desktop OpenGL has the same
-       * requirement.
-       */
-      if (readFb->Visual.samples > 0
-          && (srcX0 != dstX0 || srcY0 != dstY0
-              || srcX1 != dstX1 || srcY1 != dstY1)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "%s(bad src/dst multisample region)", func);
-         return;
-      }
-   } else {
-      if (readFb->Visual.samples > 0 &&
-          drawFb->Visual.samples > 0 &&
-          readFb->Visual.samples != drawFb->Visual.samples) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "%s(mismatched samples)", func);
-         return;
-      }
-
-      /* extra checks for multisample copies... */
-      if ((readFb->Visual.samples > 0 || drawFb->Visual.samples > 0) &&
-          (filter == GL_NEAREST || filter == GL_LINEAR)) {
-         /* src and dest region sizes must be the same */
-         if (abs(srcX1 - srcX0) != abs(dstX1 - dstX0) ||
-             abs(srcY1 - srcY0) != abs(dstY1 - dstY0)) {
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "%s(bad src/dst multisample region sizes)", func);
-            return;
-         }
       }
    }
 
@@ -540,6 +580,22 @@
 }
 
 
+static void
+blit_framebuffer_err(struct gl_context *ctx,
+                     struct gl_framebuffer *readFb,
+                     struct gl_framebuffer *drawFb,
+                     GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                     GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                     GLbitfield mask, GLenum filter, const char *func)
+{
+   /* We are wrapping the err variant of the always inlined
+    * blit_framebuffer() to avoid inlining it in every caller.
+    */
+   blit_framebuffer(ctx, readFb, drawFb, srcX0, srcY0, srcX1, srcY1,
+                    dstX0, dstY0, dstX1, dstY1, mask, filter, false, func);
+}
+
+
 /**
  * Blit rectangular region, optionally from one framebuffer to another.
  *
@@ -547,6 +603,21 @@
  * when the samples must be resolved to a single color.
  */
 void GLAPIENTRY
+_mesa_BlitFramebuffer_no_error(GLint srcX0, GLint srcY0, GLint srcX1,
+                               GLint srcY1, GLint dstX0, GLint dstY0,
+                               GLint dstX1, GLint dstY1,
+                               GLbitfield mask, GLenum filter)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   blit_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
+                    srcX0, srcY0, srcX1, srcY1,
+                    dstX0, dstY0, dstX1, dstY1,
+                    mask, filter, true, "glBlitFramebuffer");
+}
+
+
+void GLAPIENTRY
 _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                       GLbitfield mask, GLenum filter)
@@ -561,31 +632,22 @@
                   dstX0, dstY0, dstX1, dstY1,
                   mask, _mesa_enum_to_string(filter));
 
-   _mesa_blit_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
-                          srcX0, srcY0, srcX1, srcY1,
-                          dstX0, dstY0, dstX1, dstY1,
-                          mask, filter, "glBlitFramebuffer");
+   blit_framebuffer_err(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
+                        srcX0, srcY0, srcX1, srcY1,
+                        dstX0, dstY0, dstX1, dstY1,
+                        mask, filter, "glBlitFramebuffer");
 }
 
 
-void GLAPIENTRY
-_mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer,
-                           GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
-                           GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                           GLbitfield mask, GLenum filter)
+static ALWAYS_INLINE void
+blit_named_framebuffer(struct gl_context *ctx,
+                       GLuint readFramebuffer, GLuint drawFramebuffer,
+                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                       GLbitfield mask, GLenum filter, bool no_error)
 {
-   GET_CURRENT_CONTEXT(ctx);
    struct gl_framebuffer *readFb, *drawFb;
 
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx,
-                  "glBlitNamedFramebuffer(%u %u %d, %d, %d, %d, "
-                  " %d, %d, %d, %d, 0x%x, %s)\n",
-                  readFramebuffer, drawFramebuffer,
-                  srcX0, srcY0, srcX1, srcY1,
-                  dstX0, dstY0, dstX1, dstY1,
-                  mask, _mesa_enum_to_string(filter));
-
    /*
     * According to PDF page 533 of the OpenGL 4.5 core spec (30.10.2014,
     * Section 18.3 Copying Pixels):
@@ -595,25 +657,75 @@
     *   respectively."
     */
    if (readFramebuffer) {
-      readFb = _mesa_lookup_framebuffer_err(ctx, readFramebuffer,
-                                            "glBlitNamedFramebuffer");
-      if (!readFb)
-         return;
-   }
-   else
+      if (no_error) {
+         readFb = _mesa_lookup_framebuffer(ctx, readFramebuffer);
+      } else {
+         readFb = _mesa_lookup_framebuffer_err(ctx, readFramebuffer,
+                                               "glBlitNamedFramebuffer");
+         if (!readFb)
+            return;
+      }
+   } else {
       readFb = ctx->WinSysReadBuffer;
+   }
 
    if (drawFramebuffer) {
-      drawFb = _mesa_lookup_framebuffer_err(ctx, drawFramebuffer,
-                                            "glBlitNamedFramebuffer");
-      if (!drawFb)
-         return;
-   }
-   else
+      if (no_error) {
+         drawFb = _mesa_lookup_framebuffer(ctx, drawFramebuffer);
+      } else {
+         drawFb = _mesa_lookup_framebuffer_err(ctx, drawFramebuffer,
+                                               "glBlitNamedFramebuffer");
+         if (!drawFb)
+            return;
+      }
+   } else {
       drawFb = ctx->WinSysDrawBuffer;
+   }
 
-   _mesa_blit_framebuffer(ctx, readFb, drawFb,
+   blit_framebuffer(ctx, readFb, drawFb,
+                    srcX0, srcY0, srcX1, srcY1,
+                    dstX0, dstY0, dstX1, dstY1,
+                    mask, filter, no_error, "glBlitNamedFramebuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_BlitNamedFramebuffer_no_error(GLuint readFramebuffer,
+                                    GLuint drawFramebuffer,
+                                    GLint srcX0, GLint srcY0,
+                                    GLint srcX1, GLint srcY1,
+                                    GLint dstX0, GLint dstY0,
+                                    GLint dstX1, GLint dstY1,
+                                    GLbitfield mask, GLenum filter)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   blit_named_framebuffer(ctx, readFramebuffer, drawFramebuffer,
                           srcX0, srcY0, srcX1, srcY1,
                           dstX0, dstY0, dstX1, dstY1,
-                          mask, filter, "glBlitNamedFramebuffer");
+                          mask, filter, true);
+}
+
+
+void GLAPIENTRY
+_mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer,
+                           GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                           GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                           GLbitfield mask, GLenum filter)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx,
+                  "glBlitNamedFramebuffer(%u %u %d, %d, %d, %d, "
+                  " %d, %d, %d, %d, 0x%x, %s)\n",
+                  readFramebuffer, drawFramebuffer,
+                  srcX0, srcY0, srcX1, srcY1,
+                  dstX0, dstY0, dstX1, dstY1,
+                  mask, _mesa_enum_to_string(filter));
+
+   blit_named_framebuffer(ctx, readFramebuffer, drawFramebuffer,
+                          srcX0, srcY0, srcX1, srcY1,
+                          dstX0, dstY0, dstX1, dstY1,
+                          mask, filter, false);
 }
diff --git a/src/mesa/main/blit.h b/src/mesa/main/blit.h
index 88dd4a9..39021e7 100644
--- a/src/mesa/main/blit.h
+++ b/src/mesa/main/blit.h
@@ -34,19 +34,26 @@
                       int dstX0, int dstY0,
                       int dstX1, int dstY1);
 
-extern void
-_mesa_blit_framebuffer(struct gl_context *ctx,
-                       struct gl_framebuffer *readFb,
-                       struct gl_framebuffer *drawFb,
-                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
-                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                       GLbitfield mask, GLenum filter, const char *func);
+void GLAPIENTRY
+_mesa_BlitFramebuffer_no_error(GLint srcX0, GLint srcY0, GLint srcX1,
+                               GLint srcY1, GLint dstX0, GLint dstY0,
+                               GLint dstX1, GLint dstY1,
+                               GLbitfield mask, GLenum filter);
 
 extern void GLAPIENTRY
 _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                          GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                          GLbitfield mask, GLenum filter);
 
+void GLAPIENTRY
+_mesa_BlitNamedFramebuffer_no_error(GLuint readFramebuffer,
+                                    GLuint drawFramebuffer,
+                                    GLint srcX0, GLint srcY0,
+                                    GLint srcX1, GLint srcY1,
+                                    GLint dstX0, GLint dstY0,
+                                    GLint dstX1, GLint dstY1,
+                                    GLbitfield mask, GLenum filter);
+
 extern void GLAPIENTRY
 _mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer,
                            GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 922c7d8..419972e 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -511,16 +511,10 @@
    if (bufObj) {
       /* reference new buffer */
       mtx_lock(&bufObj->Mutex);
-      if (bufObj->RefCount == 0) {
-         /* this buffer's being deleted (look just above) */
-         /* Not sure this can every really happen.  Warn if it does. */
-         _mesa_problem(NULL, "referencing deleted buffer object");
-         *ptr = NULL;
-      }
-      else {
-         bufObj->RefCount++;
-         *ptr = bufObj;
-      }
+      assert(bufObj->RefCount > 0);
+
+      bufObj->RefCount++;
+      *ptr = bufObj;
       mtx_unlock(&bufObj->Mutex);
    }
 }
@@ -693,9 +687,9 @@
  * \sa glBufferGetSubDataARB, dd_function_table::GetBufferSubData.
  */
 static void
-_mesa_buffer_get_subdata( struct gl_context *ctx, GLintptrARB offset,
-			  GLsizeiptrARB size, GLvoid * data,
-			  struct gl_buffer_object * bufObj )
+buffer_get_subdata(struct gl_context *ctx, GLintptrARB offset,
+                   GLsizeiptrARB size, GLvoid *data,
+                   struct gl_buffer_object *bufObj )
 {
    (void) ctx;
 
@@ -1201,7 +1195,7 @@
    driver->DeleteBuffer = _mesa_delete_buffer_object;
    driver->BufferData = buffer_data_fallback;
    driver->BufferSubData = buffer_sub_data_fallback;
-   driver->GetBufferSubData = _mesa_buffer_get_subdata;
+   driver->GetBufferSubData = buffer_get_subdata;
    driver->UnmapBuffer = unmap_buffer_fallback;
 
    /* GL_ARB_clear_buffer_object */
@@ -1220,9 +1214,7 @@
 _mesa_buffer_unmap_all_mappings(struct gl_context *ctx,
                                 struct gl_buffer_object *bufObj)
 {
-   int i;
-
-   for (i = 0; i < MAP_COUNT; i++) {
+   for (int i = 0; i < MAP_COUNT; i++) {
       if (_mesa_bufferobj_mapped(bufObj, i)) {
          ctx->Driver.UnmapBuffer(ctx, bufObj, i);
          assert(bufObj->Mappings[i].Pointer == NULL);
@@ -1237,6 +1229,16 @@
 /**********************************************************************/
 
 void GLAPIENTRY
+_mesa_BindBuffer_no_error(GLenum target, GLuint buffer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_buffer_object **bindTarget = get_buffer_target(ctx, target);
+   bind_buffer_object(ctx, bindTarget, buffer);
+}
+
+
+void GLAPIENTRY
 _mesa_BindBuffer(GLenum target, GLuint buffer)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -1414,26 +1416,13 @@
  * driver internals.
  */
 static void
-create_buffers(GLsizei n, GLuint *buffers, bool dsa)
+create_buffers(struct gl_context *ctx, GLsizei n, GLuint *buffers, bool dsa)
 {
-   GET_CURRENT_CONTEXT(ctx);
    GLuint first;
-   GLint i;
    struct gl_buffer_object *buf;
 
-   const char *func = dsa ? "glCreateBuffers" : "glGenBuffers";
-
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "%s(%d)\n", func, n);
-
-   if (n < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n %d < 0)", func, n);
+   if (!buffers)
       return;
-   }
-
-   if (!buffers) {
-      return;
-   }
 
    /*
     * This must be atomic (generation and allocation of buffer object IDs)
@@ -1446,13 +1435,13 @@
     * DummyBufferObject.  Otherwise, create a new buffer object and insert
     * it.
     */
-   for (i = 0; i < n; i++) {
+   for (int i = 0; i < n; i++) {
       buffers[i] = first + i;
       if (dsa) {
          assert(ctx->Driver.NewBufferObject);
          buf = ctx->Driver.NewBufferObject(ctx, buffers[i]);
          if (!buf) {
-            _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s", func);
+            _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCreateBuffers");
             _mesa_HashUnlockMutex(ctx->Shared->BufferObjects);
             return;
          }
@@ -1466,6 +1455,23 @@
    _mesa_HashUnlockMutex(ctx->Shared->BufferObjects);
 }
 
+
+static void
+create_buffers_err(struct gl_context *ctx, GLsizei n, GLuint *buffers, bool dsa)
+{
+   const char *func = dsa ? "glCreateBuffers" : "glGenBuffers";
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "%s(%d)\n", func, n);
+
+   if (n < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n %d < 0)", func, n);
+      return;
+   }
+
+   create_buffers(ctx, n, buffers, dsa);
+}
+
 /**
  * Generate a set of unique buffer object IDs and store them in \c buffers.
  *
@@ -1473,9 +1479,18 @@
  * \param buffers  Array of \c n locations to store the IDs.
  */
 void GLAPIENTRY
+_mesa_GenBuffers_no_error(GLsizei n, GLuint *buffers)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_buffers(ctx, n, buffers, false);
+}
+
+
+void GLAPIENTRY
 _mesa_GenBuffers(GLsizei n, GLuint *buffers)
 {
-   create_buffers(n, buffers, false);
+   GET_CURRENT_CONTEXT(ctx);
+   create_buffers_err(ctx, n, buffers, false);
 }
 
 /**
@@ -1485,9 +1500,18 @@
  * \param buffers  Array of \c n locations to store the IDs.
  */
 void GLAPIENTRY
+_mesa_CreateBuffers_no_error(GLsizei n, GLuint *buffers)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_buffers(ctx, n, buffers, true);
+}
+
+
+void GLAPIENTRY
 _mesa_CreateBuffers(GLsizei n, GLuint *buffers)
 {
-   create_buffers(n, buffers, true);
+   GET_CURRENT_CONTEXT(ctx);
+   create_buffers_err(ctx, n, buffers, true);
 }
 
 
@@ -1511,14 +1535,14 @@
 }
 
 
-void
-_mesa_buffer_storage(struct gl_context *ctx, struct gl_buffer_object *bufObj,
-                     GLenum target, GLsizeiptr size, const GLvoid *data,
-                     GLbitfield flags, const char *func)
+static bool
+validate_buffer_storage(struct gl_context *ctx,
+                        struct gl_buffer_object *bufObj, GLsizeiptr size,
+                        GLbitfield flags, const char *func)
 {
    if (size <= 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(size <= 0)", func);
-      return;
+      return false;
    }
 
    GLbitfield valid_flags = GL_MAP_READ_BIT |
@@ -1533,7 +1557,7 @@
 
    if (flags & ~valid_flags) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(invalid flag bits set)", func);
-      return;
+      return false;
    }
 
    /* The Errors section of the GL_ARB_sparse_buffer spec says:
@@ -1545,31 +1569,40 @@
    if (flags & GL_SPARSE_STORAGE_BIT_ARB &&
        flags & (GL_MAP_READ_BIT | GL_MAP_WRITE_BIT)) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(SPARSE_STORAGE and READ/WRITE)", func);
-      return;
+      return false;
    }
 
    if (flags & GL_MAP_PERSISTENT_BIT &&
        !(flags & (GL_MAP_READ_BIT | GL_MAP_WRITE_BIT))) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(PERSISTENT and flags!=READ/WRITE)", func);
-      return;
+      return false;
    }
 
    if (flags & GL_MAP_COHERENT_BIT && !(flags & GL_MAP_PERSISTENT_BIT)) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(COHERENT and flags!=PERSISTENT)", func);
-      return;
+      return false;
    }
 
-   if (bufObj->Immutable) {
+   if (bufObj->Immutable || bufObj->HandleAllocated) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(immutable)", func);
-      return;
+      return false;
    }
 
+   return true;
+}
+
+
+static void
+buffer_storage(struct gl_context *ctx, struct gl_buffer_object *bufObj,
+               GLenum target, GLsizeiptr size, const GLvoid *data,
+               GLbitfield flags, const char *func)
+{
    /* Unmap the existing buffer.  We'll replace it now.  Not an error. */
    _mesa_buffer_unmap_all_mappings(ctx, bufObj);
 
-   FLUSH_VERTICES(ctx, _NEW_BUFFER_OBJECT);
+   FLUSH_VERTICES(ctx, 0);
 
    bufObj->Written = GL_TRUE;
    bufObj->Immutable = GL_TRUE;
@@ -1591,38 +1624,78 @@
    }
 }
 
-void GLAPIENTRY
-_mesa_BufferStorage(GLenum target, GLsizeiptr size, const GLvoid *data,
-                    GLbitfield flags)
+
+static ALWAYS_INLINE void
+inlined_buffer_storage(GLenum target, GLuint buffer, GLsizeiptr size,
+                       const GLvoid *data, GLbitfield flags, bool dsa,
+                       bool no_error, const char *func)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
 
-   bufObj = get_buffer(ctx, "glBufferStorage", target, GL_INVALID_OPERATION);
-   if (!bufObj)
-      return;
+   if (dsa) {
+      if (no_error) {
+         bufObj = _mesa_lookup_bufferobj(ctx, buffer);
+      } else {
+         bufObj = _mesa_lookup_bufferobj_err(ctx, buffer, func);
+         if (!bufObj)
+            return;
+      }
+   } else {
+      if (no_error) {
+         struct gl_buffer_object **bufObjPtr = get_buffer_target(ctx, target);
+         bufObj = *bufObjPtr;
+      } else {
+         bufObj = get_buffer(ctx, func, target, GL_INVALID_OPERATION);
+         if (!bufObj)
+            return;
+      }
+   }
 
-   _mesa_buffer_storage(ctx, bufObj, target, size, data, flags,
-                        "glBufferStorage");
+   if (no_error || validate_buffer_storage(ctx, bufObj, size, flags, func))
+      buffer_storage(ctx, bufObj, target, size, data, flags, func);
 }
 
+/** glBufferStorage entry point that bypasses validate_buffer_storage(). */
+void GLAPIENTRY
+_mesa_BufferStorage_no_error(GLenum target, GLsizeiptr size,
+                             const GLvoid *data, GLbitfield flags)
+{
+   inlined_buffer_storage(target, 0, size, data, flags, false, true,
+                          "glBufferStorage");
+}
+
+/** glBufferStorage entry point; arguments are validated before storage. */
+void GLAPIENTRY
+_mesa_BufferStorage(GLenum target, GLsizeiptr size, const GLvoid *data,
+                    GLbitfield flags)
+{
+   inlined_buffer_storage(target, 0, size, data, flags, false, false,
+                          "glBufferStorage");
+}
+
+/** glNamedBufferStorage entry point that bypasses validation. */
+void GLAPIENTRY
+_mesa_NamedBufferStorage_no_error(GLuint buffer, GLsizeiptr size,
+                                  const GLvoid *data, GLbitfield flags)
+{
+   /* In direct state access, buffer objects have an unspecified target
+    * since they are not required to be bound.
+    */
+   inlined_buffer_storage(GL_NONE, buffer, size, data, flags, true, true,
+                          "glNamedBufferStorage");
+}
+
+
 void GLAPIENTRY
 _mesa_NamedBufferStorage(GLuint buffer, GLsizeiptr size, const GLvoid *data,
                          GLbitfield flags)
 {
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_buffer_object *bufObj;
-
-   bufObj = _mesa_lookup_bufferobj_err(ctx, buffer, "glNamedBufferStorage");
-   if (!bufObj)
-      return;
-
-   /*
-    * In direct state access, buffer objects have an unspecified target since
-    * they are not required to be bound.
+   /* In direct state access, buffer objects have an unspecified target
+    * since they are not required to be bound.
     */
-   _mesa_buffer_storage(ctx, bufObj, GL_NONE, size, data, flags,
-                        "glNamedBufferStorage");
+   inlined_buffer_storage(GL_NONE, buffer, size, data, flags, true, false,
+                          "glNamedBufferStorage");
 }
 
 
@@ -1676,7 +1749,7 @@
       return;
    }
 
-   if (bufObj->Immutable) {
+   if (bufObj->Immutable || bufObj->HandleAllocated) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(immutable)", func);
       return;
    }
@@ -1684,7 +1757,7 @@
    /* Unmap the existing buffer.  We'll replace it now.  Not an error. */
    _mesa_buffer_unmap_all_mappings(ctx, bufObj);
 
-   FLUSH_VERTICES(ctx, _NEW_BUFFER_OBJECT);
+   FLUSH_VERTICES(ctx, 0);
 
    bufObj->Written = GL_TRUE;
    bufObj->MinMaxCacheDirty = true;
@@ -1753,6 +1826,41 @@
 }
 
 
+/**
+ * Validate a gl[Named]BufferSubData update.  May also emit the usage warning
+ * for frequently updated static buffers.  \return false if an error was set.
+ */
+static bool
+validate_buffer_sub_data(struct gl_context *ctx,
+                         struct gl_buffer_object *bufObj, GLintptr offset,
+                         GLsizeiptr size, const char *func)
+{
+   if (!buffer_object_subdata_range_good(ctx, bufObj, offset, size,
+                                         true, func)) {
+      return false; /* error already recorded */
+   }
+
+   if (bufObj->Immutable &&
+       !(bufObj->StorageFlags & GL_DYNAMIC_STORAGE_BIT)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", func);
+      return false;
+   }
+
+   if ((bufObj->Usage == GL_STATIC_DRAW ||
+        bufObj->Usage == GL_STATIC_COPY) &&
+       bufObj->NumSubDataCalls >= BUFFER_WARNING_CALL_COUNT - 1) {
+      /* A buffer declared static draw/copy should not be updated frequently. */
+      BUFFER_USAGE_WARNING(ctx,
+                           "using %s(buffer %u, offset %ld, size %ld) to "
+                           "update a %s buffer",
+                           func, bufObj->Name, (long) offset, (long) size,
+                           _mesa_enum_to_string(bufObj->Usage));
+   }
+
+   return true;
+}
+
+
 /**
  * Implementation for glBufferSubData and glNamedBufferSubData.
  *
@@ -1766,39 +1874,12 @@
  */
 void
 _mesa_buffer_sub_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
-                      GLintptr offset, GLsizeiptr size, const GLvoid *data,
-                      const char *func)
+                      GLintptr offset, GLsizeiptr size, const GLvoid *data)
 {
-   if (!buffer_object_subdata_range_good(ctx, bufObj, offset, size,
-                                         true, func)) {
-      /* error already recorded */
-      return;
-   }
-
-   if (bufObj->Immutable &&
-       !(bufObj->StorageFlags & GL_DYNAMIC_STORAGE_BIT)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", func);
-      return;
-   }
-
    if (size == 0)
       return;
 
    bufObj->NumSubDataCalls++;
-
-   if ((bufObj->Usage == GL_STATIC_DRAW ||
-        bufObj->Usage == GL_STATIC_COPY) &&
-       bufObj->NumSubDataCalls >= BUFFER_WARNING_CALL_COUNT) {
-      /* If the application declared the buffer as static draw/copy or stream
-       * draw, it should not be frequently modified with glBufferSubData.
-       */
-      BUFFER_USAGE_WARNING(ctx,
-                           "using %s(buffer %u, offset %u, size %u) to "
-                           "update a %s buffer",
-                           func, bufObj->Name, offset, size,
-                           _mesa_enum_to_string(bufObj->Usage));
-   }
-
    bufObj->Written = GL_TRUE;
    bufObj->MinMaxCacheDirty = true;
 
@@ -1806,33 +1887,70 @@
    ctx->Driver.BufferSubData(ctx, offset, size, data, bufObj);
 }
 
-void GLAPIENTRY
-_mesa_BufferSubData(GLenum target, GLintptr offset,
-                    GLsizeiptr size, const GLvoid *data)
+
+static ALWAYS_INLINE void
+buffer_sub_data(GLenum target, GLuint buffer, GLintptr offset,
+                GLsizeiptr size, const GLvoid *data,
+                bool dsa, bool no_error, const char *func)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
 
-   bufObj = get_buffer(ctx, "glBufferSubData", target, GL_INVALID_OPERATION);
-   if (!bufObj)
-      return;
+   if (dsa) {
+      if (no_error) {
+         bufObj = _mesa_lookup_bufferobj(ctx, buffer);
+      } else {
+         bufObj = _mesa_lookup_bufferobj_err(ctx, buffer, func);
+         if (!bufObj)
+            return;
+      }
+   } else {
+      if (no_error) {
+         struct gl_buffer_object **bufObjPtr = get_buffer_target(ctx, target);
+         bufObj = *bufObjPtr;
+      } else {
+         bufObj = get_buffer(ctx, func, target, GL_INVALID_OPERATION);
+         if (!bufObj)
+            return;
+      }
+   }
 
-   _mesa_buffer_sub_data(ctx, bufObj, offset, size, data, "glBufferSubData");
+   if (no_error || validate_buffer_sub_data(ctx, bufObj, offset, size, func))
+      _mesa_buffer_sub_data(ctx, bufObj, offset, size, data);
+}
+
+/** glBufferSubData entry point that bypasses validate_buffer_sub_data(). */
+void GLAPIENTRY
+_mesa_BufferSubData_no_error(GLenum target, GLintptr offset,
+                             GLsizeiptr size, const GLvoid *data)
+{
+   buffer_sub_data(target, 0, offset, size, data, false, true,
+                   "glBufferSubData");
+}
+
+/** glBufferSubData entry point; the update is validated before it is done. */
+void GLAPIENTRY
+_mesa_BufferSubData(GLenum target, GLintptr offset,
+                    GLsizeiptr size, const GLvoid *data)
+{
+   buffer_sub_data(target, 0, offset, size, data, false, false,
+                   "glBufferSubData");
+}
+
+void GLAPIENTRY
+_mesa_NamedBufferSubData_no_error(GLuint buffer, GLintptr offset,
+                                  GLsizeiptr size, const GLvoid *data)
+{
+   buffer_sub_data(0, buffer, offset, size, data, true, true,
+                   "glNamedBufferSubData");
 }
 
 void GLAPIENTRY
 _mesa_NamedBufferSubData(GLuint buffer, GLintptr offset,
                          GLsizeiptr size, const GLvoid *data)
 {
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_buffer_object *bufObj;
-
-   bufObj = _mesa_lookup_bufferobj_err(ctx, buffer, "glNamedBufferSubData");
-   if (!bufObj)
-      return;
-
-   _mesa_buffer_sub_data(ctx, bufObj, offset, size, data,
-                         "glNamedBufferSubData");
+   buffer_sub_data(0, buffer, offset, size, data, true, false,
+                   "glNamedBufferSubData");
 }
 
 
@@ -1882,14 +2000,11 @@
 /**
  * \param subdata   true if caller is *SubData, false if *Data
  */
-void
-_mesa_clear_buffer_sub_data(struct gl_context *ctx,
-                            struct gl_buffer_object *bufObj,
-                            GLenum internalformat,
-                            GLintptr offset, GLsizeiptr size,
-                            GLenum format, GLenum type,
-                            const GLvoid *data,
-                            const char *func, bool subdata)
+static void
+clear_buffer_sub_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
+                      GLenum internalformat, GLintptr offset, GLsizeiptr size,
+                      GLenum format, GLenum type, const GLvoid *data,
+                      const char *func, bool subdata)
 {
    mesa_format mesaFormat;
    GLubyte clearValue[MAX_PIXEL_BYTES];
@@ -1949,9 +2064,8 @@
    if (!bufObj)
       return;
 
-   _mesa_clear_buffer_sub_data(ctx, bufObj, internalformat, 0, bufObj->Size,
-                               format, type, data,
-                               "glClearBufferData", false);
+   clear_buffer_sub_data(ctx, bufObj, internalformat, 0, bufObj->Size,
+                         format, type, data, "glClearBufferData", false);
 }
 
 void GLAPIENTRY
@@ -1965,9 +2079,8 @@
    if (!bufObj)
       return;
 
-   _mesa_clear_buffer_sub_data(ctx, bufObj, internalformat, 0, bufObj->Size,
-                               format, type, data,
-                               "glClearNamedBufferData", false);
+   clear_buffer_sub_data(ctx, bufObj, internalformat, 0, bufObj->Size,
+                         format, type, data, "glClearNamedBufferData", false);
 }
 
 
@@ -1984,9 +2097,8 @@
    if (!bufObj)
       return;
 
-   _mesa_clear_buffer_sub_data(ctx, bufObj, internalformat, offset, size,
-                               format, type, data,
-                               "glClearBufferSubData", true);
+   clear_buffer_sub_data(ctx, bufObj, internalformat, offset, size,
+                         format, type, data, "glClearBufferSubData", true);
 }
 
 void GLAPIENTRY
@@ -2003,17 +2115,27 @@
    if (!bufObj)
       return;
 
-   _mesa_clear_buffer_sub_data(ctx, bufObj, internalformat, offset, size,
-                               format, type, data,
-                               "glClearNamedBufferSubData", true);
+   clear_buffer_sub_data(ctx, bufObj, internalformat, offset, size, format,
+                         type, data, "glClearNamedBufferSubData", true);
 }
 
-
-GLboolean
-_mesa_unmap_buffer(struct gl_context *ctx, struct gl_buffer_object *bufObj,
-                   const char *func)
+static GLboolean
+unmap_buffer(struct gl_context *ctx, struct gl_buffer_object *bufObj)
 {
-   GLboolean status = GL_TRUE;
+   GLboolean status = ctx->Driver.UnmapBuffer(ctx, bufObj, MAP_USER);
+   bufObj->Mappings[MAP_USER].AccessFlags = 0;
+   assert(bufObj->Mappings[MAP_USER].Pointer == NULL);
+   assert(bufObj->Mappings[MAP_USER].Offset == 0);
+   assert(bufObj->Mappings[MAP_USER].Length == 0);
+
+   return status;
+}
+
+static GLboolean
+validate_and_unmap_buffer(struct gl_context *ctx,
+                          struct gl_buffer_object *bufObj,
+                          const char *func)
+{
    ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_FALSE);
 
    if (!_mesa_bufferobj_mapped(bufObj, MAP_USER)) {
@@ -2058,13 +2180,17 @@
    }
 #endif
 
-   status = ctx->Driver.UnmapBuffer(ctx, bufObj, MAP_USER);
-   bufObj->Mappings[MAP_USER].AccessFlags = 0;
-   assert(bufObj->Mappings[MAP_USER].Pointer == NULL);
-   assert(bufObj->Mappings[MAP_USER].Offset == 0);
-   assert(bufObj->Mappings[MAP_USER].Length == 0);
+   return unmap_buffer(ctx, bufObj);
+}
 
-   return status;
+GLboolean GLAPIENTRY
+_mesa_UnmapBuffer_no_error(GLenum target)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_buffer_object **bufObjPtr = get_buffer_target(ctx, target);
+   struct gl_buffer_object *bufObj = *bufObjPtr;
+
+   return unmap_buffer(ctx, bufObj);
 }
 
 GLboolean GLAPIENTRY
@@ -2077,7 +2203,16 @@
    if (!bufObj)
       return GL_FALSE;
 
-   return _mesa_unmap_buffer(ctx, bufObj, "glUnmapBuffer");
+   return validate_and_unmap_buffer(ctx, bufObj, "glUnmapBuffer");
+}
+
+GLboolean GLAPIENTRY
+_mesa_UnmapNamedBuffer_no_error(GLuint buffer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_buffer_object *bufObj = _mesa_lookup_bufferobj(ctx, buffer);
+
+   return unmap_buffer(ctx, bufObj);
 }
 
 GLboolean GLAPIENTRY
@@ -2090,7 +2225,7 @@
    if (!bufObj)
       return GL_FALSE;
 
-   return _mesa_unmap_buffer(ctx, bufObj, "glUnmapNamedBuffer");
+   return validate_and_unmap_buffer(ctx, bufObj, "glUnmapNamedBuffer");
 }
 
 
@@ -2269,12 +2404,10 @@
 }
 
 
-void
-_mesa_copy_buffer_sub_data(struct gl_context *ctx,
-                           struct gl_buffer_object *src,
-                           struct gl_buffer_object *dst,
-                           GLintptr readOffset, GLintptr writeOffset,
-                           GLsizeiptr size, const char *func)
+static void
+copy_buffer_sub_data(struct gl_context *ctx, struct gl_buffer_object *src,
+                     struct gl_buffer_object *dst, GLintptr readOffset,
+                     GLintptr writeOffset, GLsizeiptr size, const char *func)
 {
    if (_mesa_check_disallowed_mapping(src)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -2341,6 +2474,24 @@
 }
 
 void GLAPIENTRY
+_mesa_CopyBufferSubData_no_error(GLenum readTarget, GLenum writeTarget,
+                                 GLintptr readOffset, GLintptr writeOffset,
+                                 GLsizeiptr size)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_buffer_object **src_ptr = get_buffer_target(ctx, readTarget);
+   struct gl_buffer_object *src = *src_ptr;
+
+   struct gl_buffer_object **dst_ptr = get_buffer_target(ctx, writeTarget);
+   struct gl_buffer_object *dst = *dst_ptr;
+
+   dst->MinMaxCacheDirty = true;
+   ctx->Driver.CopyBufferSubData(ctx, src, dst, readOffset, writeOffset,
+                                 size);
+}
+
+void GLAPIENTRY
 _mesa_CopyBufferSubData(GLenum readTarget, GLenum writeTarget,
                         GLintptr readOffset, GLintptr writeOffset,
                         GLsizeiptr size)
@@ -2358,8 +2509,23 @@
    if (!dst)
       return;
 
-   _mesa_copy_buffer_sub_data(ctx, src, dst, readOffset, writeOffset, size,
-                              "glCopyBufferSubData");
+   copy_buffer_sub_data(ctx, src, dst, readOffset, writeOffset, size,
+                        "glCopyBufferSubData");
+}
+
+void GLAPIENTRY
+_mesa_CopyNamedBufferSubData_no_error(GLuint readBuffer, GLuint writeBuffer,
+                                      GLintptr readOffset,
+                                      GLintptr writeOffset, GLsizeiptr size)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_buffer_object *src = _mesa_lookup_bufferobj(ctx, readBuffer);
+   struct gl_buffer_object *dst = _mesa_lookup_bufferobj(ctx, writeBuffer);
+
+   dst->MinMaxCacheDirty = true;
+   ctx->Driver.CopyBufferSubData(ctx, src, dst, readOffset, writeOffset,
+                                 size);
 }
 
 void GLAPIENTRY
@@ -2380,32 +2546,30 @@
    if (!dst)
       return;
 
-   _mesa_copy_buffer_sub_data(ctx, src, dst, readOffset, writeOffset, size,
-                              "glCopyNamedBufferSubData");
+   copy_buffer_sub_data(ctx, src, dst, readOffset, writeOffset, size,
+                        "glCopyNamedBufferSubData");
 }
 
-
-void *
-_mesa_map_buffer_range(struct gl_context *ctx,
-                       struct gl_buffer_object *bufObj,
-                       GLintptr offset, GLsizeiptr length,
-                       GLbitfield access, const char *func)
+static bool
+validate_map_buffer_range(struct gl_context *ctx,
+                          struct gl_buffer_object *bufObj, GLintptr offset,
+                          GLsizeiptr length, GLbitfield access,
+                          const char *func)
 {
-   void *map;
    GLbitfield allowed_access;
 
-   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, NULL);
+   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, false);
 
    if (offset < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(offset %ld < 0)", func, (long) offset);
-      return NULL;
+      return false;
    }
 
    if (length < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(length %ld < 0)", func, (long) length);
-      return NULL;
+      return false;
    }
 
    /* Page 38 of the PDF of the OpenGL ES 3.0 spec says:
@@ -2421,7 +2585,7 @@
     */
    if (length == 0) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(length = 0)", func);
-      return NULL;
+      return false;
    }
 
    allowed_access = GL_MAP_READ_BIT |
@@ -2440,13 +2604,13 @@
       /* generate an error if any bits other than those allowed are set */
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(access has undefined bits set)", func);
-      return NULL;
+      return false;
    }
 
    if ((access & (GL_MAP_READ_BIT | GL_MAP_WRITE_BIT)) == 0) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(access indicates neither read or write)", func);
-      return NULL;
+      return false;
    }
 
    if ((access & GL_MAP_READ_BIT) &&
@@ -2455,42 +2619,42 @@
                   GL_MAP_UNSYNCHRONIZED_BIT))) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(read access with disallowed bits)", func);
-      return NULL;
+      return false;
    }
 
    if ((access & GL_MAP_FLUSH_EXPLICIT_BIT) &&
        ((access & GL_MAP_WRITE_BIT) == 0)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(access has flush explicit without write)", func);
-      return NULL;
+      return false;
    }
 
    if (access & GL_MAP_READ_BIT &&
        !(bufObj->StorageFlags & GL_MAP_READ_BIT)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(buffer does not allow read access)", func);
-      return NULL;
+      return false;
    }
 
    if (access & GL_MAP_WRITE_BIT &&
        !(bufObj->StorageFlags & GL_MAP_WRITE_BIT)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(buffer does not allow write access)", func);
-      return NULL;
+      return false;
    }
 
    if (access & GL_MAP_COHERENT_BIT &&
        !(bufObj->StorageFlags & GL_MAP_COHERENT_BIT)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(buffer does not allow coherent access)", func);
-      return NULL;
+      return false;
    }
 
    if (access & GL_MAP_PERSISTENT_BIT &&
        !(bufObj->StorageFlags & GL_MAP_PERSISTENT_BIT)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(buffer does not allow persistent access)", func);
-      return NULL;
+      return false;
    }
 
    if (offset + length > bufObj->Size) {
@@ -2498,18 +2662,13 @@
                   "%s(offset %lu + length %lu > buffer_size %lu)", func,
                   (unsigned long) offset, (unsigned long) length,
                   (unsigned long) bufObj->Size);
-      return NULL;
+      return false;
    }
 
    if (_mesa_bufferobj_mapped(bufObj, MAP_USER)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(buffer already mapped)", func);
-      return NULL;
-   }
-
-   if (!bufObj->Size) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s(buffer size = 0)", func);
-      return NULL;
+      return false;
    }
 
    if (access & GL_MAP_WRITE_BIT) {
@@ -2525,9 +2684,22 @@
       }
    }
 
+   return true;
+}
+
+static void *
+map_buffer_range(struct gl_context *ctx, struct gl_buffer_object *bufObj,
+                 GLintptr offset, GLsizeiptr length, GLbitfield access,
+                 const char *func)
+{
+   if (!bufObj->Size) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s(buffer size = 0)", func);
+      return NULL;
+   }
+
    assert(ctx->Driver.MapBufferRange);
-   map = ctx->Driver.MapBufferRange(ctx, offset, length, access, bufObj,
-                                    MAP_USER);
+   void *map = ctx->Driver.MapBufferRange(ctx, offset, length, access, bufObj,
+                                          MAP_USER);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s(map failed)", func);
    }
@@ -2576,6 +2748,19 @@
 }
 
 void * GLAPIENTRY
+_mesa_MapBufferRange_no_error(GLenum target, GLintptr offset,
+                              GLsizeiptr length, GLbitfield access)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_buffer_object **bufObjPtr = get_buffer_target(ctx, target);
+   struct gl_buffer_object *bufObj = *bufObjPtr;
+
+   return map_buffer_range(ctx, bufObj, offset, length, access,
+                           "glMapBufferRange");
+}
+
+void * GLAPIENTRY
 _mesa_MapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length,
                      GLbitfield access)
 {
@@ -2592,8 +2777,23 @@
    if (!bufObj)
       return NULL;
 
-   return _mesa_map_buffer_range(ctx, bufObj, offset, length, access,
-                                 "glMapBufferRange");
+   if (!validate_map_buffer_range(ctx, bufObj, offset, length, access,
+                                  "glMapBufferRange"))
+      return NULL;
+
+   return map_buffer_range(ctx, bufObj, offset, length, access,
+                           "glMapBufferRange");
+}
+
+void * GLAPIENTRY
+_mesa_MapNamedBufferRange_no_error(GLuint buffer, GLintptr offset,
+                                   GLsizeiptr length, GLbitfield access)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_buffer_object *bufObj = _mesa_lookup_bufferobj(ctx, buffer);
+
+   return map_buffer_range(ctx, bufObj, offset, length, access,
+                           "glMapNamedBufferRange");
 }
 
 void * GLAPIENTRY
@@ -2614,13 +2814,17 @@
    if (!bufObj)
       return NULL;
 
-   return _mesa_map_buffer_range(ctx, bufObj, offset, length, access,
-                                 "glMapNamedBufferRange");
+   if (!validate_map_buffer_range(ctx, bufObj, offset, length, access,
+                                  "glMapNamedBufferRange"))
+      return NULL;
+
+   return map_buffer_range(ctx, bufObj, offset, length, access,
+                           "glMapNamedBufferRange");
 }
 
 /**
  * Converts GLenum access from MapBuffer and MapNamedBuffer into
- * flags for input to _mesa_map_buffer_range.
+ * flags for input to map_buffer_range.
  *
  * \return true if the type of requested access is permissible.
  */
@@ -2644,6 +2848,21 @@
 }
 
 void * GLAPIENTRY
+_mesa_MapBuffer_no_error(GLenum target, GLenum access)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   GLbitfield accessFlags;
+   get_map_buffer_access_flags(ctx, access, &accessFlags);
+
+   struct gl_buffer_object **bufObjPtr = get_buffer_target(ctx, target);
+   struct gl_buffer_object *bufObj = *bufObjPtr;
+
+   return map_buffer_range(ctx, bufObj, 0, bufObj->Size, accessFlags,
+                           "glMapBuffer");
+}
+
+void * GLAPIENTRY
 _mesa_MapBuffer(GLenum target, GLenum access)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -2659,8 +2878,26 @@
    if (!bufObj)
       return NULL;
 
-   return _mesa_map_buffer_range(ctx, bufObj, 0, bufObj->Size, accessFlags,
-                                 "glMapBuffer");
+   if (!validate_map_buffer_range(ctx, bufObj, 0, bufObj->Size, accessFlags,
+                                  "glMapBuffer"))
+      return NULL;
+
+   return map_buffer_range(ctx, bufObj, 0, bufObj->Size, accessFlags,
+                           "glMapBuffer");
+}
+
+void * GLAPIENTRY
+_mesa_MapNamedBuffer_no_error(GLuint buffer, GLenum access)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   GLbitfield accessFlags;
+   get_map_buffer_access_flags(ctx, access, &accessFlags);
+
+   struct gl_buffer_object *bufObj = _mesa_lookup_bufferobj(ctx, buffer);
+
+   return map_buffer_range(ctx, bufObj, 0, bufObj->Size, accessFlags,
+                           "glMapNamedBuffer");
 }
 
 void * GLAPIENTRY
@@ -2679,16 +2916,20 @@
    if (!bufObj)
       return NULL;
 
-   return _mesa_map_buffer_range(ctx, bufObj, 0, bufObj->Size, accessFlags,
-                                 "glMapNamedBuffer");
+   if (!validate_map_buffer_range(ctx, bufObj, 0, bufObj->Size, accessFlags,
+                                  "glMapNamedBuffer"))
+      return NULL;
+
+   return map_buffer_range(ctx, bufObj, 0, bufObj->Size, accessFlags,
+                           "glMapNamedBuffer");
 }
 
 
-void
-_mesa_flush_mapped_buffer_range(struct gl_context *ctx,
-                                struct gl_buffer_object *bufObj,
-                                GLintptr offset, GLsizeiptr length,
-                                const char *func)
+static void
+flush_mapped_buffer_range(struct gl_context *ctx,
+                          struct gl_buffer_object *bufObj,
+                          GLintptr offset, GLsizeiptr length,
+                          const char *func)
 {
    if (!ctx->Extensions.ARB_map_buffer_range) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -2738,6 +2979,19 @@
 }
 
 void GLAPIENTRY
+_mesa_FlushMappedBufferRange_no_error(GLenum target, GLintptr offset,
+                                      GLsizeiptr length)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_buffer_object **bufObjPtr = get_buffer_target(ctx, target);
+   struct gl_buffer_object *bufObj = *bufObjPtr;
+
+   if (ctx->Driver.FlushMappedBufferRange)
+      ctx->Driver.FlushMappedBufferRange(ctx, offset, length, bufObj,
+                                         MAP_USER);
+}
+
+void GLAPIENTRY
 _mesa_FlushMappedBufferRange(GLenum target, GLintptr offset,
                              GLsizeiptr length)
 {
@@ -2749,8 +3003,20 @@
    if (!bufObj)
       return;
 
-   _mesa_flush_mapped_buffer_range(ctx, bufObj, offset, length,
-                                   "glFlushMappedBufferRange");
+   flush_mapped_buffer_range(ctx, bufObj, offset, length,
+                             "glFlushMappedBufferRange");
+}
+
+void GLAPIENTRY
+_mesa_FlushMappedNamedBufferRange_no_error(GLuint buffer, GLintptr offset,
+                                           GLsizeiptr length)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_buffer_object *bufObj = _mesa_lookup_bufferobj(ctx, buffer);
+
+   if (ctx->Driver.FlushMappedBufferRange)
+      ctx->Driver.FlushMappedBufferRange(ctx, offset, length, bufObj,
+                                         MAP_USER);
 }
 
 void GLAPIENTRY
@@ -2765,8 +3031,8 @@
    if (!bufObj)
       return;
 
-   _mesa_flush_mapped_buffer_range(ctx, bufObj, offset, length,
-                                   "glFlushMappedNamedBufferRange");
+   flush_mapped_buffer_range(ctx, bufObj, offset, length,
+                             "glFlushMappedNamedBufferRange");
 }
 
 
@@ -2886,6 +3152,20 @@
    set_ssbo_binding(ctx, binding, bufObj, offset, size, autoSize);
 }
 
+/* Bind bufObj to a UBO binding point; the null buffer gets offset/size -1. */
+static void
+bind_buffer_range_uniform_buffer(struct gl_context *ctx, GLuint index,
+                                 struct gl_buffer_object *bufObj,
+                                 GLintptr offset, GLsizeiptr size)
+{
+   const bool unbinding = (bufObj == ctx->Shared->NullBufferObj);
+   const GLintptr bindOffset = unbinding ? -1 : offset;
+   const GLsizeiptr bindSize = unbinding ? -1 : size;
+
+   _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer, bufObj);
+   bind_uniform_buffer(ctx, index, bufObj, bindOffset, bindSize, GL_FALSE);
+}
+
 /**
  * Bind a region of a buffer object to a uniform block binding point.
  * \param index  the uniform buffer binding point index
@@ -2894,11 +3174,9 @@
  * \param size  size of the buffer object region
  */
 static void
-bind_buffer_range_uniform_buffer(struct gl_context *ctx,
-				 GLuint index,
-				 struct gl_buffer_object *bufObj,
-				 GLintptr offset,
-				 GLsizeiptr size)
+bind_buffer_range_uniform_buffer_err(struct gl_context *ctx, GLuint index,
+                                     struct gl_buffer_object *bufObj,
+                                     GLintptr offset, GLsizeiptr size)
 {
    if (index >= ctx->Const.MaxUniformBufferBindings) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferRange(index=%d)", index);
@@ -2912,13 +3190,23 @@
       return;
    }
 
+   bind_buffer_range_uniform_buffer(ctx, index, bufObj, offset, size);
+}
+
+static void
+bind_buffer_range_shader_storage_buffer(struct gl_context *ctx,
+                                        GLuint index,
+                                        struct gl_buffer_object *bufObj,
+                                        GLintptr offset,
+                                        GLsizeiptr size)
+{
    if (bufObj == ctx->Shared->NullBufferObj) {
       offset = -1;
       size = -1;
    }
 
-   _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer, bufObj);
-   bind_uniform_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
+   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj);
+   bind_shader_storage_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
 }
 
 /**
@@ -2929,11 +3217,10 @@
  * \param size  size of the buffer object region
  */
 static void
-bind_buffer_range_shader_storage_buffer(struct gl_context *ctx,
-                                        GLuint index,
-                                        struct gl_buffer_object *bufObj,
-                                        GLintptr offset,
-                                        GLsizeiptr size)
+bind_buffer_range_shader_storage_buffer_err(struct gl_context *ctx,
+                                            GLuint index,
+                                            struct gl_buffer_object *bufObj,
+                                            GLintptr offset, GLsizeiptr size)
 {
    if (index >= ctx->Const.MaxShaderStorageBufferBindings) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferRange(index=%d)", index);
@@ -2947,13 +3234,7 @@
       return;
    }
 
-   if (bufObj == ctx->Shared->NullBufferObj) {
-      offset = -1;
-      size = -1;
-   }
-
-   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj);
-   bind_shader_storage_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
+   bind_buffer_range_shader_storage_buffer(ctx, index, bufObj, offset, size);
 }
 
 /**
@@ -3025,6 +3306,27 @@
    }
 }
 
+/* Bind a buffer range to atomic binding `index`, flushing only on change. */
+static void
+bind_atomic_buffer(struct gl_context *ctx, unsigned index,
+                   struct gl_buffer_object *bufObj, GLintptr offset,
+                   GLsizeiptr size)
+{
+   struct gl_atomic_buffer_binding *slot = &ctx->AtomicBufferBindings[index];
+
+   _mesa_reference_buffer_object(ctx, &ctx->AtomicBuffer, bufObj);
+
+   /* Skip the flush and state update when the slot already has this range. */
+   if (slot->BufferObject == bufObj && slot->Offset == offset &&
+       slot->Size == size)
+      return;
+
+   FLUSH_VERTICES(ctx, 0);
+   ctx->NewDriverState |= ctx->DriverFlags.NewAtomicBuffer;
+
+   set_atomic_buffer_binding(ctx, slot, bufObj, offset, size);
+}
+
 /**
  * Binds a buffer object to an atomic buffer binding point.
  *
@@ -3034,15 +3336,10 @@
  * updating it.
  */
 static void
-bind_atomic_buffer(struct gl_context *ctx,
-                   unsigned index,
-                   struct gl_buffer_object *bufObj,
-                   GLintptr offset,
-                   GLsizeiptr size,
-                   const char *name)
+bind_atomic_buffer_err(struct gl_context *ctx, unsigned index,
+                       struct gl_buffer_object *bufObj, GLintptr offset,
+                       GLsizeiptr size, const char *name)
 {
-   struct gl_atomic_buffer_binding *binding;
-
    if (index >= ctx->Const.MaxAtomicBufferBindings) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(index=%d)", name, index);
       return;
@@ -3055,19 +3352,7 @@
       return;
    }
 
-   _mesa_reference_buffer_object(ctx, &ctx->AtomicBuffer, bufObj);
-
-   binding = &ctx->AtomicBufferBindings[index];
-   if (binding->BufferObject == bufObj &&
-       binding->Offset == offset &&
-       binding->Size == size) {
-      return;
-   }
-
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewAtomicBuffer;
-
-   set_atomic_buffer_binding(ctx, binding, bufObj, offset, size);
+   bind_atomic_buffer(ctx, index, bufObj, offset, size);
 }
 
 static inline bool
@@ -3169,9 +3454,8 @@
 unbind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count)
 {
    struct gl_buffer_object *bufObj = ctx->Shared->NullBufferObj;
-   GLint i;
 
-   for (i = 0; i < count; i++)
+   for (int i = 0; i < count; i++)
       set_ubo_binding(ctx, &ctx->UniformBufferBindings[first + i],
                       bufObj, -1, -1, GL_TRUE);
 }
@@ -3185,9 +3469,8 @@
                               GLsizei count)
 {
    struct gl_buffer_object *bufObj = ctx->Shared->NullBufferObj;
-   GLint i;
 
-   for (i = 0; i < count; i++)
+   for (int i = 0; i < count; i++)
       set_ssbo_binding(ctx, &ctx->ShaderStorageBufferBindings[first + i],
                        bufObj, -1, -1, GL_TRUE);
 }
@@ -3199,8 +3482,6 @@
                      const GLintptr *offsets, const GLsizeiptr *sizes,
                      const char *caller)
 {
-   GLint i;
-
    if (!error_check_bind_uniform_buffers(ctx, first, count, caller))
       return;
 
@@ -3242,7 +3523,7 @@
 
    _mesa_HashLockMutex(ctx->Shared->BufferObjects);
 
-   for (i = 0; i < count; i++) {
+   for (int i = 0; i < count; i++) {
       struct gl_uniform_buffer_binding *binding =
          &ctx->UniformBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
@@ -3311,8 +3592,6 @@
                             const GLsizeiptr *sizes,
                             const char *caller)
 {
-   GLint i;
-
    if (!error_check_bind_shader_storage_buffers(ctx, first, count, caller))
       return;
 
@@ -3354,7 +3633,7 @@
 
    _mesa_HashLockMutex(ctx->Shared->BufferObjects);
 
-   for (i = 0; i < count; i++) {
+   for (int i = 0; i < count; i++) {
       struct gl_shader_storage_buffer_binding *binding =
          &ctx->ShaderStorageBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
@@ -3472,9 +3751,8 @@
                    GLuint first, GLsizei count)
 {
    struct gl_buffer_object * const bufObj = ctx->Shared->NullBufferObj;
-   GLint i;
 
-   for (i = 0; i < count; i++)
+   for (int i = 0; i < count; i++)
       _mesa_set_transform_feedback_binding(ctx, tfObj, first + i,
                                            bufObj, 0, 0);
 }
@@ -3490,7 +3768,6 @@
 {
    struct gl_transform_feedback_object *tfObj =
        ctx->TransformFeedback.CurrentObject;
-   GLint i;
 
    if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count, caller))
       return;
@@ -3533,7 +3810,7 @@
 
    _mesa_HashLockMutex(ctx->Shared->BufferObjects);
 
-   for (i = 0; i < count; i++) {
+   for (int i = 0; i < count; i++) {
       const GLuint index = first + i;
       struct gl_buffer_object * const boundBufObj = tfObj->Buffers[index];
       struct gl_buffer_object *bufObj;
@@ -3636,9 +3913,8 @@
 unbind_atomic_buffers(struct gl_context *ctx, GLuint first, GLsizei count)
 {
    struct gl_buffer_object * const bufObj = ctx->Shared->NullBufferObj;
-   GLint i;
 
-   for (i = 0; i < count; i++)
+   for (int i = 0; i < count; i++)
       set_atomic_buffer_binding(ctx, &ctx->AtomicBufferBindings[first + i],
                                 bufObj, -1, -1);
 }
@@ -3653,8 +3929,6 @@
                     const GLsizeiptr *sizes,
                     const char *caller)
 {
-   GLint i;
-
    if (!error_check_bind_atomic_buffers(ctx, first, count, caller))
      return;
 
@@ -3696,7 +3970,7 @@
 
    _mesa_HashLockMutex(ctx->Shared->BufferObjects);
 
-   for (i = 0; i < count; i++) {
+   for (int i = 0; i < count; i++) {
       struct gl_atomic_buffer_binding *binding =
          &ctx->AtomicBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
@@ -3750,9 +4024,9 @@
    _mesa_HashUnlockMutex(ctx->Shared->BufferObjects);
 }
 
-void GLAPIENTRY
-_mesa_BindBufferRange(GLenum target, GLuint index,
-                      GLuint buffer, GLintptr offset, GLsizeiptr size)
+static ALWAYS_INLINE void
+bind_buffer_range(GLenum target, GLuint index, GLuint buffer, GLintptr offset,
+                  GLsizeiptr size, bool no_error)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
@@ -3772,41 +4046,82 @@
                                      &bufObj, "glBindBufferRange"))
       return;
 
-   if (!bufObj) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glBindBufferRange(invalid buffer=%u)", buffer);
-      return;
-   }
+   if (no_error) {
+      switch (target) {
+      case GL_TRANSFORM_FEEDBACK_BUFFER:
+         _mesa_bind_buffer_range_xfb(ctx, ctx->TransformFeedback.CurrentObject,
+                                     index, bufObj, offset, size);
+         return;
+      case GL_UNIFORM_BUFFER:
+         bind_buffer_range_uniform_buffer(ctx, index, bufObj, offset, size);
+         return;
+      case GL_SHADER_STORAGE_BUFFER:
+         bind_buffer_range_shader_storage_buffer(ctx, index, bufObj, offset,
+                                                 size);
+         return;
+      case GL_ATOMIC_COUNTER_BUFFER:
+         bind_atomic_buffer(ctx, index, bufObj, offset, size);
+         return;
+      default:
+         unreachable("invalid BindBufferRange target with KHR_no_error");
+      }
+   } else {
+      if (!bufObj) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glBindBufferRange(invalid buffer=%u)", buffer);
+         return;
+      }
 
-   if (buffer != 0) {
-      if (size <= 0) {
-         _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferRange(size=%d)",
-                     (int) size);
+      if (buffer != 0) {
+         if (size <= 0) {
+            _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferRange(size=%d)",
+                        (int) size);
+            return;
+         }
+      }
+
+      switch (target) {
+      case GL_TRANSFORM_FEEDBACK_BUFFER:
+         if (!_mesa_validate_buffer_range_xfb(ctx,
+                                              ctx->TransformFeedback.CurrentObject,
+                                              index, bufObj, offset, size,
+                                              false))
+            return;
+
+         _mesa_bind_buffer_range_xfb(ctx, ctx->TransformFeedback.CurrentObject,
+                                     index, bufObj, offset, size);
+         return;
+      case GL_UNIFORM_BUFFER:
+         bind_buffer_range_uniform_buffer_err(ctx, index, bufObj, offset,
+                                              size);
+         return;
+      case GL_SHADER_STORAGE_BUFFER:
+         bind_buffer_range_shader_storage_buffer_err(ctx, index, bufObj,
+                                                     offset, size);
+         return;
+      case GL_ATOMIC_COUNTER_BUFFER:
+         bind_atomic_buffer_err(ctx, index, bufObj, offset, size,
+                                "glBindBufferRange");
+         return;
+      default:
+         _mesa_error(ctx, GL_INVALID_ENUM, "glBindBufferRange(target)");
          return;
       }
    }
+}
 
-   switch (target) {
-   case GL_TRANSFORM_FEEDBACK_BUFFER:
-      _mesa_bind_buffer_range_transform_feedback(ctx,
-                                                 ctx->TransformFeedback.CurrentObject,
-                                                 index, bufObj, offset, size,
-                                                 false);
-      return;
-   case GL_UNIFORM_BUFFER:
-      bind_buffer_range_uniform_buffer(ctx, index, bufObj, offset, size);
-      return;
-   case GL_SHADER_STORAGE_BUFFER:
-      bind_buffer_range_shader_storage_buffer(ctx, index, bufObj, offset, size);
-      return;
-   case GL_ATOMIC_COUNTER_BUFFER:
-      bind_atomic_buffer(ctx, index, bufObj, offset, size,
-                         "glBindBufferRange");
-      return;
-   default:
-      _mesa_error(ctx, GL_INVALID_ENUM, "glBindBufferRange(target)");
-      return;
-   }
+void GLAPIENTRY
+_mesa_BindBufferRange_no_error(GLenum target, GLuint index, GLuint buffer,
+                               GLintptr offset, GLsizeiptr size)
+{
+   bind_buffer_range(target, index, buffer, offset, size, true);
+}
+
+void GLAPIENTRY
+_mesa_BindBufferRange(GLenum target, GLuint index,
+                      GLuint buffer, GLintptr offset, GLsizeiptr size)
+{
+   bind_buffer_range(target, index, buffer, offset, size, false);
 }
 
 void GLAPIENTRY
@@ -3874,8 +4189,8 @@
       bind_buffer_base_shader_storage_buffer(ctx, index, bufObj);
       return;
    case GL_ATOMIC_COUNTER_BUFFER:
-      bind_atomic_buffer(ctx, index, bufObj, 0, 0,
-                         "glBindBufferBase");
+      bind_atomic_buffer_err(ctx, index, bufObj, 0, 0,
+                             "glBindBufferBase");
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBufferBase(target)");
@@ -3955,6 +4270,25 @@
    }
 }
 
+static ALWAYS_INLINE void
+invalidate_buffer_subdata(struct gl_context *ctx,
+                          struct gl_buffer_object *bufObj, GLintptr offset,
+                          GLsizeiptr length)
+{
+   if (ctx->Driver.InvalidateBufferSubData)
+      ctx->Driver.InvalidateBufferSubData(ctx, bufObj, offset, length);
+}
+
+void GLAPIENTRY
+_mesa_InvalidateBufferSubData_no_error(GLuint buffer, GLintptr offset,
+                                       GLsizeiptr length)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   /* No validation: whatever the name lookup returns is passed on as-is. */
+   invalidate_buffer_subdata(ctx, _mesa_lookup_bufferobj(ctx, buffer),
+                             offset, length);
+}
+
 void GLAPIENTRY
 _mesa_InvalidateBufferSubData(GLuint buffer, GLintptr offset,
                               GLsizeiptr length)
@@ -4004,8 +4338,16 @@
       return;
    }
 
-   if (ctx->Driver.InvalidateBufferSubData)
-      ctx->Driver.InvalidateBufferSubData(ctx, bufObj, offset, length);
+   invalidate_buffer_subdata(ctx, bufObj, offset, length);
+}
+
+void GLAPIENTRY
+_mesa_InvalidateBufferData_no_error(GLuint buffer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   /* Unchecked path: bufObj is dereferenced below without a NULL test. */
+   struct gl_buffer_object *bufObj = _mesa_lookup_bufferobj(ctx, buffer);
+   invalidate_buffer_subdata(ctx, bufObj, 0, bufObj->Size);
 }
 
 void GLAPIENTRY
@@ -4042,8 +4384,7 @@
       return;
    }
 
-   if (ctx->Driver.InvalidateBufferSubData)
-      ctx->Driver.InvalidateBufferSubData(ctx, bufObj, 0, bufObj->Size);
+   invalidate_buffer_subdata(ctx, bufObj, 0, bufObj->Size);
 }
 
 static void
diff --git a/src/mesa/main/bufferobj.h b/src/mesa/main/bufferobj.h
index 259de94..662ceba 100644
--- a/src/mesa/main/bufferobj.h
+++ b/src/mesa/main/bufferobj.h
@@ -133,86 +133,63 @@
 _mesa_init_buffer_object_functions(struct dd_function_table *driver);
 
 extern void
-_mesa_buffer_storage(struct gl_context *ctx, struct gl_buffer_object *bufObj,
-                     GLenum target, GLsizeiptr size, const GLvoid *data,
-                     GLbitfield flags, const char *func);
-
-extern void
 _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
                   GLenum target, GLsizeiptr size, const GLvoid *data,
                   GLenum usage, const char *func);
 
 extern void
 _mesa_buffer_sub_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
-                      GLintptr offset, GLsizeiptr size, const GLvoid *data,
-                      const char *func);
+                      GLintptr offset, GLsizeiptr size, const GLvoid *data);
 
 extern void
 _mesa_buffer_unmap_all_mappings(struct gl_context *ctx,
                                 struct gl_buffer_object *bufObj);
 
 extern void
-_mesa_copy_buffer_sub_data(struct gl_context *ctx,
-                           struct gl_buffer_object *src,
-                           struct gl_buffer_object *dst,
-                           GLintptr readOffset, GLintptr writeOffset,
-                           GLsizeiptr size, const char *func);
-
-extern void *
-_mesa_map_buffer_range(struct gl_context *ctx,
-                       struct gl_buffer_object *bufObj,
-                       GLintptr offset, GLsizeiptr length,
-                       GLbitfield access, const char *func);
-
-extern void
-_mesa_flush_mapped_buffer_range(struct gl_context *ctx,
-                                struct gl_buffer_object *bufObj,
-                                GLintptr offset, GLsizeiptr length,
-                                const char *func);
-
-extern void
 _mesa_ClearBufferSubData_sw(struct gl_context *ctx,
                             GLintptr offset, GLsizeiptr size,
                             const GLvoid *clearValue,
                             GLsizeiptr clearValueSize,
                             struct gl_buffer_object *bufObj);
 
-extern void
-_mesa_clear_buffer_sub_data(struct gl_context *ctx,
-                            struct gl_buffer_object *bufObj,
-                            GLenum internalformat,
-                            GLintptr offset, GLsizeiptr size,
-                            GLenum format, GLenum type,
-                            const GLvoid *data,
-                            const char *func, bool subdata);
-
-extern GLboolean
-_mesa_unmap_buffer(struct gl_context *ctx, struct gl_buffer_object *bufObj,
-                   const char *func);
-
 /*
  * API functions
  */
 void GLAPIENTRY
+_mesa_BindBuffer_no_error(GLenum target, GLuint buffer);
+
+void GLAPIENTRY
 _mesa_BindBuffer(GLenum target, GLuint buffer);
 
 void GLAPIENTRY
 _mesa_DeleteBuffers(GLsizei n, const GLuint * buffer);
 
 void GLAPIENTRY
+_mesa_GenBuffers_no_error(GLsizei n, GLuint *buffers);
+
+void GLAPIENTRY
 _mesa_GenBuffers(GLsizei n, GLuint *buffers);
 
 void GLAPIENTRY
+_mesa_CreateBuffers_no_error(GLsizei n, GLuint *buffers);
+
+void GLAPIENTRY
 _mesa_CreateBuffers(GLsizei n, GLuint *buffers);
 
 GLboolean GLAPIENTRY
 _mesa_IsBuffer(GLuint buffer);
 
 void GLAPIENTRY
+_mesa_BufferStorage_no_error(GLenum target, GLsizeiptr size,
+                             const GLvoid *data, GLbitfield flags);
+void GLAPIENTRY
 _mesa_BufferStorage(GLenum target, GLsizeiptr size, const GLvoid *data,
                     GLbitfield flags);
 
 void GLAPIENTRY
+_mesa_NamedBufferStorage_no_error(GLuint buffer, GLsizeiptr size,
+                                  const GLvoid *data, GLbitfield flags);
+void GLAPIENTRY
 _mesa_NamedBufferStorage(GLuint buffer, GLsizeiptr size, const GLvoid *data,
                          GLbitfield flags);
 
@@ -225,10 +202,16 @@
                       const GLvoid *data, GLenum usage);
 
 void GLAPIENTRY
+_mesa_BufferSubData_no_error(GLenum target, GLintptr offset,
+                             GLsizeiptr size, const GLvoid *data);
+void GLAPIENTRY
 _mesa_BufferSubData(GLenum target, GLintptr offset,
                     GLsizeiptr size, const GLvoid *data);
 
 void GLAPIENTRY
+_mesa_NamedBufferSubData_no_error(GLuint buffer, GLintptr offset,
+                                  GLsizeiptr size, const GLvoid *data);
+void GLAPIENTRY
 _mesa_NamedBufferSubData(GLuint buffer, GLintptr offset,
                          GLsizeiptr size, const GLvoid *data);
 
@@ -263,9 +246,13 @@
                               const GLvoid *data);
 
 GLboolean GLAPIENTRY
+_mesa_UnmapBuffer_no_error(GLenum target);
+GLboolean GLAPIENTRY
 _mesa_UnmapBuffer(GLenum target);
 
 GLboolean GLAPIENTRY
+_mesa_UnmapNamedBuffer_no_error(GLuint buffer);
+GLboolean GLAPIENTRY
 _mesa_UnmapNamedBuffer(GLuint buffer);
 
 void GLAPIENTRY
@@ -287,41 +274,66 @@
 void GLAPIENTRY
 _mesa_GetNamedBufferPointerv(GLuint buffer, GLenum pname, GLvoid **params);
 
-
+void GLAPIENTRY
+_mesa_CopyBufferSubData_no_error(GLenum readTarget, GLenum writeTarget,
+                                 GLintptr readOffset, GLintptr writeOffset,
+                                 GLsizeiptr size);
 void GLAPIENTRY
 _mesa_CopyBufferSubData(GLenum readTarget, GLenum writeTarget,
                         GLintptr readOffset, GLintptr writeOffset,
                         GLsizeiptr size);
 
 void GLAPIENTRY
+_mesa_CopyNamedBufferSubData_no_error(GLuint readBuffer, GLuint writeBuffer,
+                                      GLintptr readOffset,
+                                      GLintptr writeOffset, GLsizeiptr size);
+void GLAPIENTRY
 _mesa_CopyNamedBufferSubData(GLuint readBuffer, GLuint writeBuffer,
                              GLintptr readOffset, GLintptr writeOffset,
                              GLsizeiptr size);
 
 void * GLAPIENTRY
+_mesa_MapBufferRange_no_error(GLenum target, GLintptr offset,
+                              GLsizeiptr length, GLbitfield access);
+void * GLAPIENTRY
 _mesa_MapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length,
                      GLbitfield access);
 
 void * GLAPIENTRY
+_mesa_MapNamedBufferRange_no_error(GLuint buffer, GLintptr offset,
+                                   GLsizeiptr length, GLbitfield access);
+void * GLAPIENTRY
 _mesa_MapNamedBufferRange(GLuint buffer, GLintptr offset, GLsizeiptr length,
                           GLbitfield access);
 
 void * GLAPIENTRY
+_mesa_MapBuffer_no_error(GLenum target, GLenum access);
+void * GLAPIENTRY
 _mesa_MapBuffer(GLenum target, GLenum access);
 
 void * GLAPIENTRY
+_mesa_MapNamedBuffer_no_error(GLuint buffer, GLenum access);
+void * GLAPIENTRY
 _mesa_MapNamedBuffer(GLuint buffer, GLenum access);
 
-
+void GLAPIENTRY
+_mesa_FlushMappedBufferRange_no_error(GLenum target, GLintptr offset,
+                                      GLsizeiptr length);
 void GLAPIENTRY
 _mesa_FlushMappedBufferRange(GLenum target,
                              GLintptr offset, GLsizeiptr length);
 
 void GLAPIENTRY
+_mesa_FlushMappedNamedBufferRange_no_error(GLuint buffer, GLintptr offset,
+                                           GLsizeiptr length);
+void GLAPIENTRY
 _mesa_FlushMappedNamedBufferRange(GLuint buffer, GLintptr offset,
                                   GLsizeiptr length);
 
 void GLAPIENTRY
+_mesa_BindBufferRange_no_error(GLenum target, GLuint index, GLuint buffer,
+                               GLintptr offset, GLsizeiptr size);
+void GLAPIENTRY
 _mesa_BindBufferRange(GLenum target, GLuint index,
                       GLuint buffer, GLintptr offset, GLsizeiptr size);
 
@@ -335,11 +347,19 @@
 void GLAPIENTRY
 _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
                       const GLuint *buffers);
+
+void GLAPIENTRY
+_mesa_InvalidateBufferSubData_no_error(GLuint buffer, GLintptr offset,
+                                       GLsizeiptr length);
+
 void GLAPIENTRY
 _mesa_InvalidateBufferSubData(GLuint buffer, GLintptr offset,
                               GLsizeiptr length);
 
 void GLAPIENTRY
+_mesa_InvalidateBufferData_no_error(GLuint buffer);
+
+void GLAPIENTRY
 _mesa_InvalidateBufferData(GLuint buffer);
 
 void GLAPIENTRY
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 7d17ae9..6359e1b 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -231,7 +231,7 @@
          if (buffer >= GL_COLOR_ATTACHMENT8 && buffer <= GL_COLOR_ATTACHMENT31)
             return BUFFER_COUNT;
          /* error */
-         return -1;
+         return BUFFER_NONE;
    }
 }
 
@@ -738,11 +738,10 @@
  * renderbuffer for reading pixels.
  * \param mode color buffer such as GL_FRONT, GL_BACK, etc.
  */
-static void
+static ALWAYS_INLINE void
 read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
-            GLenum buffer, const char *caller)
+            GLenum buffer, const char *caller, bool no_error)
 {
-   GLbitfield supportedMask;
    gl_buffer_index srcBuffer;
 
    FLUSH_VERTICES(ctx, 0);
@@ -752,27 +751,33 @@
 
    if (buffer == GL_NONE) {
       /* This is legal--it means that no buffer should be bound for reading. */
-      srcBuffer = -1;
+      srcBuffer = BUFFER_NONE;
    }
    else {
       /* general case / window-system framebuffer */
-      if (_mesa_is_gles3(ctx) && !is_legal_es3_readbuffer_enum(buffer))
-         srcBuffer = -1;
+      if (!no_error &&_mesa_is_gles3(ctx) &&
+          !is_legal_es3_readbuffer_enum(buffer))
+         srcBuffer = BUFFER_NONE;
       else
          srcBuffer = read_buffer_enum_to_index(ctx, buffer);
 
-      if (srcBuffer == -1) {
-         _mesa_error(ctx, GL_INVALID_ENUM,
-                     "%s(invalid buffer %s)", caller,
-                     _mesa_enum_to_string(buffer));
-         return;
-      }
-      supportedMask = supported_buffer_bitmask(ctx, fb);
-      if (((1 << srcBuffer) & supportedMask) == 0) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "%s(invalid buffer %s)", caller,
-                     _mesa_enum_to_string(buffer));
-         return;
+      if (!no_error) {
+         GLbitfield supportedMask;
+
+         if (srcBuffer == BUFFER_NONE) {
+            _mesa_error(ctx, GL_INVALID_ENUM,
+                        "%s(invalid buffer %s)", caller,
+                        _mesa_enum_to_string(buffer));
+            return;
+         }
+
+         supportedMask = supported_buffer_bitmask(ctx, fb);
+         if (((1 << srcBuffer) & supportedMask) == 0) {
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(invalid buffer %s)", caller,
+                        _mesa_enum_to_string(buffer));
+            return;
+         }
       }
    }
 
@@ -788,11 +793,52 @@
 }
 
 
+static void
+read_buffer_err(struct gl_context *ctx, struct gl_framebuffer *fb,
+                GLenum buffer, const char *caller)
+{
+   read_buffer(ctx, fb, buffer, caller, false);
+}
+
+
+static void
+read_buffer_no_error(struct gl_context *ctx, struct gl_framebuffer *fb,
+                     GLenum buffer, const char *caller)
+{
+   read_buffer(ctx, fb, buffer, caller, true);
+}
+
+
+void GLAPIENTRY
+_mesa_ReadBuffer_no_error(GLenum buffer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   read_buffer_no_error(ctx, ctx->ReadBuffer, buffer, "glReadBuffer");
+}
+
+
 void GLAPIENTRY
 _mesa_ReadBuffer(GLenum buffer)
 {
    GET_CURRENT_CONTEXT(ctx);
-   read_buffer(ctx, ctx->ReadBuffer, buffer, "glReadBuffer");
+   read_buffer_err(ctx, ctx->ReadBuffer, buffer, "glReadBuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferReadBuffer_no_error(GLuint framebuffer, GLenum src)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* Name 0 selects the window-system read framebuffer; any other name is
+    * looked up and handed on without a NULL check here.
+    */
+   struct gl_framebuffer *fb = ctx->WinSysReadBuffer;
+
+   if (framebuffer != 0)
+      fb = _mesa_lookup_framebuffer(ctx, framebuffer);
+
+   read_buffer_no_error(ctx, fb, src, "glNamedFramebufferReadBuffer");
 }
 
 
@@ -811,5 +857,5 @@
    else
       fb = ctx->WinSysReadBuffer;
 
-   read_buffer(ctx, fb, src, "glNamedFramebufferReadBuffer");
+   read_buffer_err(ctx, fb, src, "glNamedFramebufferReadBuffer");
 }
diff --git a/src/mesa/main/buffers.h b/src/mesa/main/buffers.h
index 9df0815..ba06138 100644
--- a/src/mesa/main/buffers.h
+++ b/src/mesa/main/buffers.h
@@ -65,10 +65,15 @@
 extern void
 _mesa_update_draw_buffers(struct gl_context *ctx);
 
+void GLAPIENTRY
+_mesa_ReadBuffer_no_error(GLenum mode);
 
 extern void GLAPIENTRY
 _mesa_ReadBuffer( GLenum mode );
 
+void GLAPIENTRY
+_mesa_NamedFramebufferReadBuffer_no_error(GLuint framebuffer, GLenum src);
+
 extern void GLAPIENTRY
 _mesa_NamedFramebufferReadBuffer(GLuint framebuffer, GLenum src);
 
diff --git a/src/mesa/main/clear.c b/src/mesa/main/clear.c
index a1bb36e..3adbe38 100644
--- a/src/mesa/main/clear.c
+++ b/src/mesa/main/clear.c
@@ -115,16 +115,17 @@
 {
    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[idx];
    GLuint c;
-   GLubyte colorMask = 0;
 
    if (rb) {
       for (c = 0; c < 4; c++) {
-         if (_mesa_format_has_color_component(rb->Format, c))
-            colorMask |= ctx->Color.ColorMask[idx][c];
+         if (ctx->Color.ColorMask[idx][c] &&
+             _mesa_format_has_color_component(rb->Format, c)) {
+            return true;
+         }
       }
    }
 
-   return colorMask != 0;
+   return false;
 }
 
 
@@ -139,40 +140,36 @@
  * GL_RENDER then requests the driver to clear the buffers, via the
  * dd_function_table::Clear callback.
  */
-void GLAPIENTRY
-_mesa_Clear( GLbitfield mask )
+static ALWAYS_INLINE void
+clear(struct gl_context *ctx, GLbitfield mask, bool no_error)
 {
-   GET_CURRENT_CONTEXT(ctx);
    FLUSH_VERTICES(ctx, 0);
-
    FLUSH_CURRENT(ctx, 0);
 
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glClear 0x%x\n", mask);
+   if (!no_error) {
+      if (mask & ~(GL_COLOR_BUFFER_BIT |
+                   GL_DEPTH_BUFFER_BIT |
+                   GL_STENCIL_BUFFER_BIT |
+                   GL_ACCUM_BUFFER_BIT)) {
+         _mesa_error( ctx, GL_INVALID_VALUE, "glClear(0x%x)", mask);
+         return;
+      }
 
-   if (mask & ~(GL_COLOR_BUFFER_BIT |
-                GL_DEPTH_BUFFER_BIT |
-                GL_STENCIL_BUFFER_BIT |
-                GL_ACCUM_BUFFER_BIT)) {
-      /* invalid bit set */
-      _mesa_error( ctx, GL_INVALID_VALUE, "glClear(0x%x)", mask);
-      return;
-   }
-
-   /* Accumulation buffers were removed in core contexts, and they never
-    * existed in OpenGL ES.
-    */
-   if ((mask & GL_ACCUM_BUFFER_BIT) != 0
-       && (ctx->API == API_OPENGL_CORE || _mesa_is_gles(ctx))) {
-      _mesa_error( ctx, GL_INVALID_VALUE, "glClear(GL_ACCUM_BUFFER_BIT)");
-      return;
+      /* Accumulation buffers were removed in core contexts, and they never
+       * existed in OpenGL ES.
+       */
+      if ((mask & GL_ACCUM_BUFFER_BIT) != 0
+          && (ctx->API == API_OPENGL_CORE || _mesa_is_gles(ctx))) {
+         _mesa_error( ctx, GL_INVALID_VALUE, "glClear(GL_ACCUM_BUFFER_BIT)");
+         return;
+      }
    }
 
    if (ctx->NewState) {
       _mesa_update_state( ctx );	/* update _Xmin, etc */
    }
 
-   if (ctx->DrawBuffer->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+   if (!no_error && ctx->DrawBuffer->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
       _mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
                   "glClear(incomplete framebuffer)");
       return;
@@ -226,6 +223,26 @@
 }
 
 
+void GLAPIENTRY
+_mesa_Clear_no_error(GLbitfield mask)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   clear(ctx, mask, true);
+}
+
+
+void GLAPIENTRY
+_mesa_Clear(GLbitfield mask)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glClear 0x%x\n", mask);
+
+   clear(ctx, mask, false);
+}
+
+
 /** Returned by make_color_buffer_mask() for errors */
 #define INVALID_MASK ~0x0U
 
diff --git a/src/mesa/main/clear.h b/src/mesa/main/clear.h
index fb3bcde..6ae63ac 100644
--- a/src/mesa/main/clear.h
+++ b/src/mesa/main/clear.h
@@ -43,6 +43,8 @@
 extern void GLAPIENTRY
 _mesa_ClearColorIuiEXT(GLuint r, GLuint g, GLuint b, GLuint a);
 
+void GLAPIENTRY
+_mesa_Clear_no_error(GLbitfield mask);
 
 extern void GLAPIENTRY
 _mesa_Clear( GLbitfield mask );
diff --git a/src/mesa/main/clip.c b/src/mesa/main/clip.c
index f994728..0950283 100644
--- a/src/mesa/main/clip.c
+++ b/src/mesa/main/clip.c
@@ -83,7 +83,9 @@
    if (TEST_EQ_4V(ctx->Transform.EyeUserPlane[p], equation))
       return;
 
+   /* EyeUserPlane is used by program state constants. */
    FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
+   ctx->NewDriverState |= ctx->DriverFlags.NewClipPlane;
    COPY_4FV(ctx->Transform.EyeUserPlane[p], equation);
 
    if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
diff --git a/src/mesa/main/compute.c b/src/mesa/main/compute.c
index bb62539..cbd166b 100644
--- a/src/mesa/main/compute.c
+++ b/src/mesa/main/compute.c
@@ -22,23 +22,241 @@
  */
 
 #include "glheader.h"
+#include "bufferobj.h"
 #include "compute.h"
 #include "context.h"
-#include "api_validate.h"
 
-void GLAPIENTRY
-_mesa_DispatchCompute(GLuint num_groups_x,
-                      GLuint num_groups_y,
-                      GLuint num_groups_z)
+static bool
+check_valid_to_compute(struct gl_context *ctx, const char *function)
+{
+   if (!_mesa_has_compute_shaders(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "unsupported function (%s) called",
+                  function);
+      return false;
+   }
+
+   /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
+    *
+    * "An INVALID_OPERATION error is generated if there is no active program
+    *  for the compute shader stage."
+    */
+   if (ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE] == NULL) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(no active compute shader)",
+                  function);
+      return false;
+   }
+
+   return true;
+}
+
+static bool
+validate_DispatchCompute(struct gl_context *ctx, const GLuint *num_groups)
+{
+   if (!check_valid_to_compute(ctx, "glDispatchCompute"))
+      return false;
+
+   for (int i = 0; i < 3; i++) {
+      /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
+       *
+       * "An INVALID_VALUE error is generated if any of num_groups_x,
+       *  num_groups_y and num_groups_z are greater than or equal to the
+       *  maximum work group count for the corresponding dimension."
+       *
+       * However, the "or equal to" portion appears to be a specification
+       * bug. In all other areas, the specification appears to indicate that
+       * the number of workgroups can match the MAX_COMPUTE_WORK_GROUP_COUNT
+       * value. For example, under DispatchComputeIndirect:
+       *
+       * "If any of num_groups_x, num_groups_y or num_groups_z is greater than
+       *  the value of MAX_COMPUTE_WORK_GROUP_COUNT for the corresponding
+       *  dimension then the results are undefined."
+       *
+       * Additionally, the OpenGLES 3.1 specification does not contain "or
+       * equal to" as an error condition.
+       */
+      if (num_groups[i] > ctx->Const.MaxComputeWorkGroupCount[i]) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glDispatchCompute(num_groups_%c)", 'x' + i);
+         return false;
+      }
+   }
+
+   /* The ARB_compute_variable_group_size spec says:
+    *
+    * "An INVALID_OPERATION error is generated by DispatchCompute if the active
+    *  program for the compute shader stage has a variable work group size."
+    */
+   struct gl_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
+   if (prog->info.cs.local_size_variable) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glDispatchCompute(variable work group size forbidden)");
+      return false;
+   }
+
+   return true;
+}
+
+static bool
+validate_DispatchComputeGroupSizeARB(struct gl_context *ctx,
+                                     const GLuint *num_groups,
+                                     const GLuint *group_size)
+{
+   uint64_t total_invocations = 1; /* 64-bit: a 32-bit product could wrap */
+
+   if (!check_valid_to_compute(ctx, "glDispatchComputeGroupSizeARB"))
+      return false;
+
+   /* The ARB_compute_variable_group_size spec says:
+    *
+    * "An INVALID_OPERATION error is generated by
+    *  DispatchComputeGroupSizeARB if the active program for the compute
+    *  shader stage has a fixed work group size."
+    */
+   struct gl_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
+   if (!prog->info.cs.local_size_variable) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glDispatchComputeGroupSizeARB(fixed work group size "
+                  "forbidden)");
+      return false;
+   }
+
+   for (int i = 0; i < 3; i++) {
+      /* The ARB_compute_variable_group_size spec says:
+       *
+       * "An INVALID_VALUE error is generated if any of num_groups_x,
+       *  num_groups_y and num_groups_z are greater than or equal to the
+       *  maximum work group count for the corresponding dimension."
+       */
+      if (num_groups[i] > ctx->Const.MaxComputeWorkGroupCount[i]) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glDispatchComputeGroupSizeARB(num_groups_%c)", 'x' + i);
+         return false;
+      }
+
+      /* The ARB_compute_variable_group_size spec says:
+       *
+       * "An INVALID_VALUE error is generated by DispatchComputeGroupSizeARB if
+       *  any of <group_size_x>, <group_size_y>, or <group_size_z> is less than
+       *  or equal to zero or greater than the maximum local work group size
+       *  for compute shaders with variable group size
+       *  (MAX_COMPUTE_VARIABLE_GROUP_SIZE_ARB) in the corresponding
+       *  dimension."
+       *
+       * However, the "less than" is a spec bug because they are declared as
+       * unsigned integers.
+       */
+      if (group_size[i] == 0 ||
+          group_size[i] > ctx->Const.MaxComputeVariableGroupSize[i]) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glDispatchComputeGroupSizeARB(group_size_%c)", 'x' + i);
+         return false;
+      }
+
+      total_invocations *= group_size[i];
+   }
+
+   /* The ARB_compute_variable_group_size spec says:
+    *
+    * "An INVALID_VALUE error is generated by DispatchComputeGroupSizeARB if
+    *  the product of <group_size_x>, <group_size_y>, and <group_size_z> exceeds
+    *  the implementation-dependent maximum local work group invocation count
+    *  for compute shaders with variable group size
+    *  (MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB)."
+    */
+   if (total_invocations > ctx->Const.MaxComputeVariableGroupInvocations) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "glDispatchComputeGroupSizeARB(product of local_sizes "
+                  "exceeds MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB "
+                  "(%llu > %u))", (unsigned long long) total_invocations,
+                  ctx->Const.MaxComputeVariableGroupInvocations);
+      return false;
+   }
+
+   return true;
+}
+
+static bool
+valid_dispatch_indirect(struct gl_context *ctx, GLintptr indirect)
+{
+   GLsizei size = 3 * sizeof(GLuint);
+   const uint64_t end = (uint64_t) indirect + size;
+   const char *name = "glDispatchComputeIndirect";
+
+   if (!check_valid_to_compute(ctx, name))
+      return false;
+
+   /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
+    *
+    * "An INVALID_VALUE error is generated if indirect is negative or is not a
+    *  multiple of four."
+    */
+   if (indirect & (sizeof(GLuint) - 1)) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(indirect is not aligned)", name);
+      return false;
+   }
+
+   if (indirect < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(indirect is less than zero)", name);
+      return false;
+   }
+
+   /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
+    *
+    * "An INVALID_OPERATION error is generated if no buffer is bound to the
+    *  DRAW_INDIRECT_BUFFER binding, or if the command would source data
+    *  beyond the end of the buffer object."
+    */
+   if (!_mesa_is_bufferobj(ctx->DispatchIndirectBuffer)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s: no buffer bound to DISPATCH_INDIRECT_BUFFER", name);
+      return false;
+   }
+
+   if (_mesa_check_disallowed_mapping(ctx->DispatchIndirectBuffer)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(DISPATCH_INDIRECT_BUFFER is mapped)", name);
+      return false;
+   }
+
+   if (ctx->DispatchIndirectBuffer->Size < end) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(DISPATCH_INDIRECT_BUFFER too small)", name);
+      return false;
+   }
+
+   /* The ARB_compute_variable_group_size spec says:
+    *
+    * "An INVALID_OPERATION error is generated if the active program for the
+    *  compute shader stage has a variable work group size."
+    */
+   struct gl_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
+   if (prog->info.cs.local_size_variable) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(variable work group size forbidden)", name);
+      return false;
+   }
+
+   return true;
+}
+
+static ALWAYS_INLINE void
+dispatch_compute(GLuint num_groups_x, GLuint num_groups_y,
+                 GLuint num_groups_z, bool no_error)
 {
    GET_CURRENT_CONTEXT(ctx);
    const GLuint num_groups[3] = { num_groups_x, num_groups_y, num_groups_z };
 
+   FLUSH_CURRENT(ctx, 0);
+
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glDispatchCompute(%d, %d, %d)\n",
                   num_groups_x, num_groups_y, num_groups_z);
 
-   if (!_mesa_validate_DispatchCompute(ctx, num_groups))
+   if (!no_error && !validate_DispatchCompute(ctx, num_groups))
       return;
 
    if (num_groups_x == 0u || num_groups_y == 0u || num_groups_z == 0u)
@@ -47,37 +265,69 @@
    ctx->Driver.DispatchCompute(ctx, num_groups);
 }
 
-extern void GLAPIENTRY
-_mesa_DispatchComputeIndirect(GLintptr indirect)
+void GLAPIENTRY
+_mesa_DispatchCompute_no_error(GLuint num_groups_x, GLuint num_groups_y,
+                               GLuint num_groups_z)
+{
+   dispatch_compute(num_groups_x, num_groups_y, num_groups_z, true);
+}
+
+void GLAPIENTRY
+_mesa_DispatchCompute(GLuint num_groups_x,
+                      GLuint num_groups_y,
+                      GLuint num_groups_z)
+{
+   dispatch_compute(num_groups_x, num_groups_y, num_groups_z, false);
+}
+
+static ALWAYS_INLINE void
+dispatch_compute_indirect(GLintptr indirect, bool no_error)
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   FLUSH_CURRENT(ctx, 0);
+
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glDispatchComputeIndirect(%ld)\n", (long) indirect);
 
-   if (!_mesa_validate_DispatchComputeIndirect(ctx, indirect))
+   if (!no_error && !valid_dispatch_indirect(ctx, indirect))
       return;
 
    ctx->Driver.DispatchComputeIndirect(ctx, indirect);
 }
 
-void GLAPIENTRY
-_mesa_DispatchComputeGroupSizeARB(GLuint num_groups_x, GLuint num_groups_y,
-                                  GLuint num_groups_z, GLuint group_size_x,
-                                  GLuint group_size_y, GLuint group_size_z)
+void GLAPIENTRY
+_mesa_DispatchComputeIndirect_no_error(GLintptr indirect)
+{
+   dispatch_compute_indirect(indirect, true);
+}
+
+void GLAPIENTRY
+_mesa_DispatchComputeIndirect(GLintptr indirect)
+{
+   dispatch_compute_indirect(indirect, false);
+}
+
+static ALWAYS_INLINE void
+dispatch_compute_group_size(GLuint num_groups_x, GLuint num_groups_y,
+                            GLuint num_groups_z, GLuint group_size_x,
+                            GLuint group_size_y, GLuint group_size_z,
+                            bool no_error)
 {
    GET_CURRENT_CONTEXT(ctx);
    const GLuint num_groups[3] = { num_groups_x, num_groups_y, num_groups_z };
    const GLuint group_size[3] = { group_size_x, group_size_y, group_size_z };
 
+   FLUSH_CURRENT(ctx, 0);
+
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx,
                   "glDispatchComputeGroupSizeARB(%d, %d, %d, %d, %d, %d)\n",
                   num_groups_x, num_groups_y, num_groups_z,
                   group_size_x, group_size_y, group_size_z);
 
-   if (!_mesa_validate_DispatchComputeGroupSizeARB(ctx, num_groups,
-                                                   group_size))
+   if (!no_error &&
+       !validate_DispatchComputeGroupSizeARB(ctx, num_groups, group_size))
       return;
 
    if (num_groups_x == 0u || num_groups_y == 0u || num_groups_z == 0u)
@@ -85,3 +335,26 @@
 
    ctx->Driver.DispatchComputeGroupSize(ctx, num_groups, group_size);
 }
+
+void GLAPIENTRY
+_mesa_DispatchComputeGroupSizeARB_no_error(GLuint num_groups_x,
+                                           GLuint num_groups_y,
+                                           GLuint num_groups_z,
+                                           GLuint group_size_x,
+                                           GLuint group_size_y,
+                                           GLuint group_size_z)
+{
+   dispatch_compute_group_size(num_groups_x, num_groups_y, num_groups_z,
+                               group_size_x, group_size_y, group_size_z,
+                               true);
+}
+
+void GLAPIENTRY
+_mesa_DispatchComputeGroupSizeARB(GLuint num_groups_x, GLuint num_groups_y,
+                                  GLuint num_groups_z, GLuint group_size_x,
+                                  GLuint group_size_y, GLuint group_size_z)
+{
+   dispatch_compute_group_size(num_groups_x, num_groups_y, num_groups_z,
+                               group_size_x, group_size_y, group_size_z,
+                               false);
+}
diff --git a/src/mesa/main/compute.h b/src/mesa/main/compute.h
index 8018bbb..bfb3223d 100644
--- a/src/mesa/main/compute.h
+++ b/src/mesa/main/compute.h
@@ -28,14 +28,26 @@
 #include "glheader.h"
 
 extern void GLAPIENTRY
+_mesa_DispatchCompute_no_error(GLuint num_groups_x, GLuint num_groups_y,
+                               GLuint num_groups_z);
+extern void GLAPIENTRY
 _mesa_DispatchCompute(GLuint num_groups_x,
                       GLuint num_groups_y,
                       GLuint num_groups_z);
 
 extern void GLAPIENTRY
+_mesa_DispatchComputeIndirect_no_error(GLintptr indirect);
+extern void GLAPIENTRY
 _mesa_DispatchComputeIndirect(GLintptr indirect);
 
 extern void GLAPIENTRY
+_mesa_DispatchComputeGroupSizeARB_no_error(GLuint num_groups_x,
+                                           GLuint num_groups_y,
+                                           GLuint num_groups_z,
+                                           GLuint group_size_x,
+                                           GLuint group_size_y,
+                                           GLuint group_size_z);
+extern void GLAPIENTRY
 _mesa_DispatchComputeGroupSizeARB(GLuint num_groups_x, GLuint num_groups_y,
                                   GLuint num_groups_z, GLuint group_size_x,
                                   GLuint group_size_y, GLuint group_size_z);
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 77e11d2..3aabdc9 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -133,6 +133,7 @@
 #include "varray.h"
 #include "version.h"
 #include "viewport.h"
+#include "texturebindless.h"
 #include "program/program.h"
 #include "math/m_matrix.h"
 #include "main/dispatch.h" /* for _gloffset_COUNT */
@@ -855,6 +856,7 @@
    _mesa_init_transform_feedback( ctx );
    _mesa_init_varray( ctx );
    _mesa_init_viewport( ctx );
+   _mesa_init_resident_handles( ctx );
 
    if (!_mesa_init_texture( ctx ))
       return GL_FALSE;
@@ -1208,6 +1210,16 @@
    if (!init_attrib_groups( ctx ))
       goto fail;
 
+   /* KHR_no_error is likely to crash, overflow memory, etc if an application
+    * has errors so don't enable it for setuid processes.
+    */
+   if (getenv("MESA_NO_ERROR")) {
+#if !defined(_WIN32)
+      if (geteuid() == getuid())
+#endif
+         ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR;
+   }
+
    /* setup the API dispatch tables with all nop functions */
    ctx->OutsideBeginEnd = _mesa_alloc_dispatch_table();
    if (!ctx->OutsideBeginEnd)
@@ -1312,6 +1324,8 @@
    _mesa_reference_program(ctx, &ctx->FragmentProgram._Current, NULL);
    _mesa_reference_program(ctx, &ctx->FragmentProgram._TexEnvProgram, NULL);
 
+   _mesa_reference_program(ctx, &ctx->ComputeProgram._Current, NULL);
+
    _mesa_reference_vao(ctx, &ctx->Array.VAO, NULL);
    _mesa_reference_vao(ctx, &ctx->Array.DefaultVAO, NULL);
 
@@ -1329,6 +1343,7 @@
    _mesa_free_transform_feedback(ctx);
    _mesa_free_performance_monitors(ctx);
    _mesa_free_performance_queries(ctx);
+   _mesa_free_resident_handles(ctx);
 
    _mesa_reference_buffer_object(ctx, &ctx->Pack.BufferObj, NULL);
    _mesa_reference_buffer_object(ctx, &ctx->Unpack.BufferObj, NULL);
@@ -1650,8 +1665,9 @@
        /* make sure this context is valid for flushing */
        curCtx != newCtx &&
        curCtx->Const.ContextReleaseBehavior ==
-       GL_CONTEXT_RELEASE_BEHAVIOR_FLUSH)
+       GL_CONTEXT_RELEASE_BEHAVIOR_FLUSH) {
       _mesa_flush(curCtx);
+   }
 
    /* We used to call _glapi_check_multithread() here.  Now do it in drivers */
 
@@ -1821,20 +1837,6 @@
 
 
 /**
- * Flush commands and wait for completion.
- */
-void
-_mesa_finish(struct gl_context *ctx)
-{
-   FLUSH_VERTICES( ctx, 0 );
-   FLUSH_CURRENT( ctx, 0 );
-   if (ctx->Driver.Finish) {
-      ctx->Driver.Finish(ctx);
-   }
-}
-
-
-/**
  * Flush commands.
  */
 void
@@ -1850,7 +1852,7 @@
 
 
 /**
- * Execute glFinish().
+ * Flush commands and wait for completion.
  *
  * Calls the #ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH macro and the
  * dd_function_table::Finish driver callback, if not NULL.
@@ -1860,7 +1862,13 @@
 {
    GET_CURRENT_CONTEXT(ctx);
    ASSERT_OUTSIDE_BEGIN_END(ctx);
-   _mesa_finish(ctx);
+
+   FLUSH_VERTICES(ctx, 0);
+   FLUSH_CURRENT(ctx, 0);
+
+   if (ctx->Driver.Finish) {
+      ctx->Driver.Finish(ctx);
+   }
 }
 
 
diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h
index 9704a96..4f75f57 100644
--- a/src/mesa/main/context.h
+++ b/src/mesa/main/context.h
@@ -162,9 +162,6 @@
 
 
 extern void
-_mesa_finish(struct gl_context *ctx);
-
-extern void
 _mesa_flush(struct gl_context *ctx);
 
 extern void GLAPIENTRY
@@ -324,6 +321,13 @@
 }
 
 
+static inline bool
+_mesa_is_no_error_enabled(const struct gl_context *ctx)
+{
+   return ctx->Const.ContextFlags & GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR;
+}
+
+
 /**
  * Checks if the context supports geometry shaders.
  */
diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c
index cf25159..10777cf 100644
--- a/src/mesa/main/copyimage.c
+++ b/src/mesa/main/copyimage.c
@@ -57,16 +57,16 @@
  * \return true if success, false if error
  */
 static bool
-prepare_target(struct gl_context *ctx, GLuint name, GLenum target,
-               int level, int z, int depth,
-               struct gl_texture_image **tex_image,
-               struct gl_renderbuffer **renderbuffer,
-               mesa_format *format,
-               GLenum *internalFormat,
-               GLuint *width,
-               GLuint *height,
-               GLuint *num_samples,
-               const char *dbg_prefix)
+prepare_target_err(struct gl_context *ctx, GLuint name, GLenum target,
+                   int level, int z, int depth,
+                   struct gl_texture_image **tex_image,
+                   struct gl_renderbuffer **renderbuffer,
+                   mesa_format *format,
+                   GLenum *internalFormat,
+                   GLuint *width,
+                   GLuint *height,
+                   GLuint *num_samples,
+                   const char *dbg_prefix)
 {
    if (name == 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
@@ -149,9 +149,64 @@
          return false;
       }
 
+      /* The ARB_copy_image specification says:
+       *
+       *    "INVALID_OPERATION is generated if either object is a texture and
+       *     the texture is not complete (as defined in section 3.9.14)"
+       *
+       * The cited section says:
+       *
+       *    "Using the preceding definitions, a texture is complete unless any
+       *     of the following conditions hold true: [...]
+       *
+       *     * The minification filter requires a mipmap (is neither NEAREST
+       *       nor LINEAR), and the texture is not mipmap complete."
+       *
+       * This imposes the bizarre restriction that glCopyImageSubData requires
+       * mipmap completion based on the sampler minification filter, even
+       * though the call fundamentally ignores the sampler.  Additionally, it
+       * doesn't work with texture units, so it can't consider any bound
+       * separate sampler objects.  It appears that you're supposed to use
+       * the sampler object which is built-in to the texture object.
+       *
+       * dEQP and the Android CTS mandate this behavior, and the Khronos
+       * GL and ES working groups both affirmed that this is unfortunate but
+       * correct.  See https://cvs.khronos.org/bugzilla/show_bug.cgi?id=16224.
+       *
+       * Integer textures with filtering cause another completeness snag:
+       *
+       *    "Any of:
+       *     – The internal format of the texture is integer (see table 8.12).
+       *     – The internal format is STENCIL_INDEX.
+       *     – The internal format is DEPTH_STENCIL, and the value of
+       *       DEPTH_STENCIL_TEXTURE_MODE for the texture is STENCIL_INDEX.
+       *     and either the magnification filter is not NEAREST, or the
+       *     minification filter is neither NEAREST nor
+       *     NEAREST_MIPMAP_NEAREST."
+       *
+       * However, applications in the wild (such as "Total War: WARHAMMER")
+       * appear to call glCopyImageSubData with integer textures and the
+       * default mipmap filters of GL_LINEAR and GL_NEAREST_MIPMAP_LINEAR,
+       * which would be considered incomplete, but expect this to work.  In
+       * fact, until VK-GL-CTS commit fef80039ff875a51806b54d151c5f2d0c12da,
+       * the GL 4.5 CTS contained three tests which did the exact same thing
+       * by accident, and all conformant implementations allowed it.
+       *
+       * A proposal was made to amend the spec to say "is not complete (as
+       * defined in section <X>, but ignoring format-based completeness
+       * rules)" to allow this case.  It makes some sense, given that
+       * glCopyImageSubData copies raw data without considering format.
+       * While the official edits have not yet been made, the OpenGL
+       * working group agreed with the idea of allowing this behavior.
+       *
+       * To ignore formats, we check texObj->_MipmapComplete directly
+       * rather than calling _mesa_is_texture_complete().
+       */
       _mesa_test_texobj_completeness(ctx, texObj);
-      if (!texObj->_BaseComplete ||
-          (level != 0 && !texObj->_MipmapComplete)) {
+      const bool texture_complete_aside_from_formats =
+         _mesa_is_mipmap_filter(&texObj->Sampler) ? texObj->_MipmapComplete
+                                                  : texObj->_BaseComplete;
+      if (!texture_complete_aside_from_formats) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glCopyImageSubData(%sName incomplete)", dbg_prefix);
          return false;
@@ -214,6 +269,30 @@
    return true;
 }
 
+static void
+prepare_target(struct gl_context *ctx, GLuint name, GLenum target,
+               int level, int z,
+               struct gl_texture_image **texImage,
+               struct gl_renderbuffer **renderbuffer)
+{
+   if (target == GL_RENDERBUFFER) {
+      struct gl_renderbuffer *rb = _mesa_lookup_renderbuffer(ctx, name);
+
+      *renderbuffer = rb;
+      *texImage = NULL;
+   } else {
+      struct gl_texture_object *texObj = _mesa_lookup_texture(ctx, name);
+
+      if (target == GL_TEXTURE_CUBE_MAP) {
+         *texImage = texObj->Image[z][level];
+      }
+      else {
+         *texImage = _mesa_select_tex_image(texObj, target, level);
+      }
+
+      *renderbuffer = NULL;
+   }
+}
 
 /**
  * Check that the x,y,z,width,height,region is within the texture image
@@ -450,6 +529,71 @@
    return false;
 }
 
+static void
+copy_image_subdata(struct gl_context *ctx,
+                   struct gl_texture_image *srcTexImage,
+                   struct gl_renderbuffer *srcRenderbuffer,
+                   int srcX, int srcY, int srcZ, int srcLevel,
+                   struct gl_texture_image *dstTexImage,
+                   struct gl_renderbuffer *dstRenderbuffer,
+                   int dstX, int dstY, int dstZ, int dstLevel,
+                   int srcWidth, int srcHeight, int srcDepth)
+{
+   /* loop over 2D slices/faces/layers */
+   for (int i = 0; i < srcDepth; ++i) {
+      int newSrcZ = srcZ + i;
+      int newDstZ = dstZ + i;
+
+      if (srcTexImage &&
+          srcTexImage->TexObject->Target == GL_TEXTURE_CUBE_MAP) {
+         /* need to update srcTexImage pointer for the cube face */
+         assert(srcZ + i < MAX_FACES);
+         srcTexImage = srcTexImage->TexObject->Image[srcZ + i][srcLevel];
+         assert(srcTexImage);
+         newSrcZ = 0;
+      }
+
+      if (dstTexImage &&
+          dstTexImage->TexObject->Target == GL_TEXTURE_CUBE_MAP) {
+         /* need to update dstTexImage pointer for the cube face */
+         assert(dstZ + i < MAX_FACES);
+         dstTexImage = dstTexImage->TexObject->Image[dstZ + i][dstLevel];
+         assert(dstTexImage);
+         newDstZ = 0;
+      }
+
+      ctx->Driver.CopyImageSubData(ctx,
+                                   srcTexImage, srcRenderbuffer,
+                                   srcX, srcY, newSrcZ,
+                                   dstTexImage, dstRenderbuffer,
+                                   dstX, dstY, newDstZ,
+                                   srcWidth, srcHeight);
+   }
+}
+
+void GLAPIENTRY
+_mesa_CopyImageSubData_no_error(GLuint srcName, GLenum srcTarget, GLint srcLevel,
+                                GLint srcX, GLint srcY, GLint srcZ,
+                                GLuint dstName, GLenum dstTarget, GLint dstLevel,
+                                GLint dstX, GLint dstY, GLint dstZ,
+                                GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth)
+{
+   struct gl_texture_image *srcTexImage, *dstTexImage;
+   struct gl_renderbuffer *srcRenderbuffer, *dstRenderbuffer;
+
+   GET_CURRENT_CONTEXT(ctx);
+
+   prepare_target(ctx, srcName, srcTarget, srcLevel, srcZ, &srcTexImage,
+                  &srcRenderbuffer);
+
+   prepare_target(ctx, dstName, dstTarget, dstLevel, dstZ, &dstTexImage,
+                  &dstRenderbuffer);
+
+   copy_image_subdata(ctx, srcTexImage, srcRenderbuffer, srcX, srcY, srcZ,
+                      srcLevel, dstTexImage, dstRenderbuffer, dstX, dstY, dstZ,
+                      dstLevel, srcWidth, srcHeight, srcDepth);
+}
+
 void GLAPIENTRY
 _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
                        GLint srcX, GLint srcY, GLint srcZ,
@@ -466,7 +610,6 @@
    GLuint src_bw, src_bh, dst_bw, dst_bh;
    GLuint src_num_samples, dst_num_samples;
    int dstWidth, dstHeight, dstDepth;
-   int i;
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glCopyImageSubData(%u, %s, %d, %d, %d, %d, "
@@ -484,14 +627,16 @@
       return;
    }
 
-   if (!prepare_target(ctx, srcName, srcTarget, srcLevel, srcZ, srcDepth,
-                       &srcTexImage, &srcRenderbuffer, &srcFormat,
-                       &srcIntFormat, &src_w, &src_h, &src_num_samples, "src"))
+   if (!prepare_target_err(ctx, srcName, srcTarget, srcLevel, srcZ, srcDepth,
+                           &srcTexImage, &srcRenderbuffer, &srcFormat,
+                           &srcIntFormat, &src_w, &src_h, &src_num_samples,
+                           "src"))
       return;
 
-   if (!prepare_target(ctx, dstName, dstTarget, dstLevel, dstZ, srcDepth,
-                       &dstTexImage, &dstRenderbuffer, &dstFormat,
-                       &dstIntFormat, &dst_w, &dst_h, &dst_num_samples, "dst"))
+   if (!prepare_target_err(ctx, dstName, dstTarget, dstLevel, dstZ, srcDepth,
+                           &dstTexImage, &dstRenderbuffer, &dstFormat,
+                           &dstIntFormat, &dst_w, &dst_h, &dst_num_samples,
+                           "dst"))
       return;
 
    _mesa_get_format_block_size(srcFormat, &src_bw, &src_bh);
@@ -580,34 +725,7 @@
       return;
    }
 
-   /* loop over 2D slices/faces/layers */
-   for (i = 0; i < srcDepth; ++i) {
-      int newSrcZ = srcZ + i;
-      int newDstZ = dstZ + i;
-
-      if (srcTexImage &&
-          srcTexImage->TexObject->Target == GL_TEXTURE_CUBE_MAP) {
-         /* need to update srcTexImage pointer for the cube face */
-         assert(srcZ + i < MAX_FACES);
-         srcTexImage = srcTexImage->TexObject->Image[srcZ + i][srcLevel];
-         assert(srcTexImage);
-         newSrcZ = 0;
-      }
-
-      if (dstTexImage &&
-          dstTexImage->TexObject->Target == GL_TEXTURE_CUBE_MAP) {
-         /* need to update dstTexImage pointer for the cube face */
-         assert(dstZ + i < MAX_FACES);
-         dstTexImage = dstTexImage->TexObject->Image[dstZ + i][dstLevel];
-         assert(dstTexImage);
-         newDstZ = 0;
-      }
-
-      ctx->Driver.CopyImageSubData(ctx,
-                                   srcTexImage, srcRenderbuffer,
-                                   srcX, srcY, newSrcZ,
-                                   dstTexImage, dstRenderbuffer,
-                                   dstX, dstY, newDstZ,
-                                   srcWidth, srcHeight);
-   }
+   copy_image_subdata(ctx, srcTexImage, srcRenderbuffer, srcX, srcY, srcZ,
+                      srcLevel, dstTexImage, dstRenderbuffer, dstX, dstY, dstZ,
+                      dstLevel, srcWidth, srcHeight, srcDepth);
 }
diff --git a/src/mesa/main/copyimage.h b/src/mesa/main/copyimage.h
index 40e95b6..ea2f15b4 100644
--- a/src/mesa/main/copyimage.h
+++ b/src/mesa/main/copyimage.h
@@ -35,6 +35,13 @@
 extern "C" {
 #endif
 
+void GLAPIENTRY
+_mesa_CopyImageSubData_no_error(GLuint srcName, GLenum srcTarget, GLint srcLevel,
+                                GLint srcX, GLint srcY, GLint srcZ,
+                                GLuint destName, GLenum destTarget, GLint destLevel,
+                                GLint destX, GLint destY, GLint destZ,
+                                GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth);
+
 extern void GLAPIENTRY
 _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
                        GLint srcX, GLint srcY, GLint srcZ,
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 3f31025..8e382e1 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -50,6 +50,7 @@
 struct gl_texture_image;
 struct gl_texture_object;
 struct gl_memory_info;
+struct util_queue_monitoring;
 
 /* GL_ARB_vertex_buffer_object */
 /* Modifies GL_MAP_UNSYNCHRONIZED_BIT to allow driver to fail (return
@@ -93,7 +94,7 @@
     * This is in addition to any state change callbacks Mesa may already have
     * made.
     */
-   void (*UpdateState)( struct gl_context *ctx, GLbitfield new_state );
+   void (*UpdateState)(struct gl_context *ctx);
 
    /**
     * This is called whenever glFinish() is called.
@@ -1039,7 +1040,8 @@
     *
     * Mesa will only call this function if GL multithreading is enabled.
     */
-   void (*SetBackgroundContext)(struct gl_context *ctx);
+   void (*SetBackgroundContext)(struct gl_context *ctx,
+                                struct util_queue_monitoring *queue_info);
 
    /**
     * \name GL_ARB_sparse_buffer interface
@@ -1050,6 +1052,23 @@
                                 GLintptr offset, GLsizeiptr size,
                                 GLboolean commit);
    /*@}*/
+
+   /**
+    * \name GL_ARB_bindless_texture interface
+    */
+   /*@{*/
+   GLuint64 (*NewTextureHandle)(struct gl_context *ctx,
+                                struct gl_texture_object *texObj,
+                                struct gl_sampler_object *sampObj);
+   void (*DeleteTextureHandle)(struct gl_context *ctx, GLuint64 handle);
+   void (*MakeTextureHandleResident)(struct gl_context *ctx, GLuint64 handle,
+                                     bool resident);
+   GLuint64 (*NewImageHandle)(struct gl_context *ctx,
+                              struct gl_image_unit *imgObj);
+   void (*DeleteImageHandle)(struct gl_context *ctx, GLuint64 handle);
+   void (*MakeImageHandleResident)(struct gl_context *ctx, GLuint64 handle,
+                                   GLenum access, bool resident);
+   /*@}*/
 };
 
 
@@ -1225,6 +1244,8 @@
    void (GLAPIENTRYP VertexAttribL3dv)( GLuint index, const GLdouble *v);
    void (GLAPIENTRYP VertexAttribL4dv)( GLuint index, const GLdouble *v);
 
+   void (GLAPIENTRYP VertexAttribL1ui64ARB)( GLuint index, GLuint64EXT x);
+   void (GLAPIENTRYP VertexAttribL1ui64vARB)( GLuint index, const GLuint64EXT *v);
 } GLvertexformat;
 
 
diff --git a/src/mesa/main/debug.c b/src/mesa/main/debug.c
index 7b76a94..d7e0143 100644
--- a/src/mesa/main/debug.c
+++ b/src/mesa/main/debug.c
@@ -235,6 +235,11 @@
       fprintf(f,"255\n");
       fclose(f);
       f = fopen( filename, "ab" );  /* reopen in binary append mode */
+      if (!f) {
+         fprintf(stderr, "Error while reopening %s in write_ppm()\n",
+                 filename);
+         return;
+      }
       for (y=0; y < height; y++) {
          for (x = 0; x < width; x++) {
             int yy = invert ? (height - 1 - y) : y;
diff --git a/src/mesa/main/depth.c b/src/mesa/main/depth.c
index c353440..930f5e8 100644
--- a/src/mesa/main/depth.c
+++ b/src/mesa/main/depth.c
@@ -57,37 +57,55 @@
 }
 
 
+static ALWAYS_INLINE void
+depth_func(struct gl_context *ctx, GLenum func, bool no_error)
+{
+   if (ctx->Depth.Func == func)
+      return;
+
+   if (!no_error) {
+      switch (func) {
+      case GL_LESS:    /* (default) pass if incoming z < stored z */
+      case GL_GEQUAL:
+      case GL_LEQUAL:
+      case GL_GREATER:
+      case GL_NOTEQUAL:
+      case GL_EQUAL:
+      case GL_ALWAYS:
+      case GL_NEVER:
+         break;
+      default:
+         _mesa_error(ctx, GL_INVALID_ENUM, "glDepth.Func");
+         return;
+      }
+   }
+
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewDepth ? 0 : _NEW_DEPTH);
+   ctx->NewDriverState |= ctx->DriverFlags.NewDepth;
+   ctx->Depth.Func = func;
+
+   if (ctx->Driver.DepthFunc)
+      ctx->Driver.DepthFunc(ctx, func);
+}
+
+
 void GLAPIENTRY
-_mesa_DepthFunc( GLenum func )
+_mesa_DepthFunc_no_error(GLenum func)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   depth_func(ctx, func, true);
+}
+
+
+void GLAPIENTRY
+_mesa_DepthFunc(GLenum func)
 {
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glDepthFunc %s\n", _mesa_enum_to_string(func));
 
-   if (ctx->Depth.Func == func)
-      return;
-
-   switch (func) {
-   case GL_LESS:    /* (default) pass if incoming z < stored z */
-   case GL_GEQUAL:
-   case GL_LEQUAL:
-   case GL_GREATER:
-   case GL_NOTEQUAL:
-   case GL_EQUAL:
-   case GL_ALWAYS:
-   case GL_NEVER:
-      break;
-   default:
-      _mesa_error( ctx, GL_INVALID_ENUM, "glDepth.Func" );
-      return;
-   }
-
-   FLUSH_VERTICES(ctx, _NEW_DEPTH);
-   ctx->Depth.Func = func;
-
-   if (ctx->Driver.DepthFunc)
-      ctx->Driver.DepthFunc( ctx, func );
+   depth_func(ctx, func, false);
 }
 
 
@@ -107,7 +125,8 @@
    if (ctx->Depth.Mask == flag)
       return;
 
-   FLUSH_VERTICES(ctx, _NEW_DEPTH);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewDepth ? 0 : _NEW_DEPTH);
+   ctx->NewDriverState |= ctx->DriverFlags.NewDepth;
    ctx->Depth.Mask = flag;
 
    if (ctx->Driver.DepthMask)
@@ -138,7 +157,8 @@
    if (ctx->Depth.BoundsMin == zmin && ctx->Depth.BoundsMax == zmax)
       return;
 
-   FLUSH_VERTICES(ctx, _NEW_DEPTH);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewDepth ? 0 : _NEW_DEPTH);
+   ctx->NewDriverState |= ctx->DriverFlags.NewDepth;
    ctx->Depth.BoundsMin = (GLfloat) zmin;
    ctx->Depth.BoundsMax = (GLfloat) zmax;
 }
diff --git a/src/mesa/main/depth.h b/src/mesa/main/depth.h
index 5ff7a5e..478249f 100644
--- a/src/mesa/main/depth.h
+++ b/src/mesa/main/depth.h
@@ -43,8 +43,11 @@
 extern void GLAPIENTRY
 _mesa_ClearDepthf( GLclampf depth );
 
+void GLAPIENTRY
+_mesa_DepthFunc_no_error(GLenum func);
+
 extern void GLAPIENTRY
-_mesa_DepthFunc( GLenum func );
+_mesa_DepthFunc(GLenum func);
 
 extern void GLAPIENTRY
 _mesa_DepthMask( GLboolean flag );
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index 7e44054..208471a 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -325,7 +325,8 @@
    OPCODE_STENCIL_FUNC_SEPARATE,
    OPCODE_STENCIL_OP_SEPARATE,
    OPCODE_STENCIL_MASK_SEPARATE,
-
+   /* GL_NV_primitive_restart */
+   OPCODE_PRIMITIVE_RESTART_NV,
    /* GL_ARB_shader_objects */
    OPCODE_USE_PROGRAM,
    OPCODE_UNIFORM_1F,
@@ -5766,25 +5767,9 @@
       _mesa_compile_error(ctx, GL_INVALID_OPERATION, "recursive glBegin");
    }
    else {
-      Node *n;
-
       ctx->Driver.CurrentSavePrimitive = mode;
 
-      /* Give the driver an opportunity to hook in an optimized
-       * display list compiler.
-       */
-      if (vbo_save_NotifyBegin(ctx, mode))
-         return;
-
-      SAVE_FLUSH_VERTICES(ctx);
-      n = alloc_instruction(ctx, OPCODE_BEGIN, 1);
-      if (n) {
-         n[1].e = mode;
-      }
-
-      if (ctx->ExecuteFlag) {
-         CALL_Begin(ctx->Exec, (mode));
-      }
+      vbo_save_NotifyBegin(ctx, mode);
    }
 }
 
@@ -6111,6 +6096,19 @@
 }
 
 static void GLAPIENTRY
+save_PrimitiveRestartNV(void)
+{
+   /* Note: this is used when outside a glBegin/End pair in a display list */
+   GET_CURRENT_CONTEXT(ctx);
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   (void) alloc_instruction(ctx, OPCODE_PRIMITIVE_RESTART_NV, 0);
+   if (ctx->ExecuteFlag) {
+      CALL_PrimitiveRestartNV(ctx->Exec, ());
+   }
+}
+
+
+static void GLAPIENTRY
 save_BlitFramebufferEXT(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                         GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                         GLbitfield mask, GLenum filter)
@@ -8686,6 +8684,10 @@
                                                 n[5].i, n[6].i, n[7].i, n[8].i,
                                                 n[9].i, n[10].e));
             break;
+         case OPCODE_PRIMITIVE_RESTART_NV:
+            CALL_PrimitiveRestartNV(ctx->Exec, ());
+            break;
+
          case OPCODE_USE_PROGRAM:
             CALL_UseProgram(ctx->Exec, (n[1].ui));
             break;
@@ -10476,6 +10478,8 @@
    vfmt->VertexAttrib3fvARB = save_VertexAttrib3fvARB;
    vfmt->VertexAttrib4fARB = save_VertexAttrib4fARB;
    vfmt->VertexAttrib4fvARB = save_VertexAttrib4fvARB;
+
+   vfmt->PrimitiveRestartNV = save_PrimitiveRestartNV;
 }
 
 
diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index ef278a3..2e5fb00 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -29,6 +29,7 @@
 
 
 #include "glheader.h"
+#include "blend.h"
 #include "clip.h"
 #include "context.h"
 #include "debug_output.h"
@@ -37,7 +38,6 @@
 #include "light.h"
 #include "mtypes.h"
 #include "enums.h"
-#include "api_arrayelt.h"
 #include "texstate.h"
 
 
@@ -110,12 +110,15 @@
 
       /* GL_NV_primitive_restart */
       case GL_PRIMITIVE_RESTART_NV:
-	 if (!ctx->Extensions.NV_primitive_restart) {
+         if (!ctx->Extensions.NV_primitive_restart)
             goto invalid_enum_error;
-         }
-         var = &ctx->Array.PrimitiveRestart;
-         flag = 0;
-         break;
+         if (ctx->Array.PrimitiveRestart == state)
+            return;
+
+         FLUSH_VERTICES(ctx, 0);
+         ctx->Array.PrimitiveRestart = state;
+         update_derived_primitive_restart_state(ctx);
+         return;
 
       default:
          goto invalid_enum_error;
@@ -126,12 +129,8 @@
 
    FLUSH_VERTICES(ctx, _NEW_ARRAY);
 
-   _ae_invalidate_state(ctx, _NEW_ARRAY);
-
    *var = state;
 
-   update_derived_primitive_restart_state(ctx);
-
    if (state)
       vao->_Enabled |= flag;
    else
@@ -241,7 +240,18 @@
 {
    if (ctx->Multisample.Enabled == state)
       return;
-   FLUSH_VERTICES(ctx, _NEW_MULTISAMPLE);
+
+   /* GL compatibility needs Multisample.Enable to determine program state
+    * constants.
+    */
+   if (ctx->API == API_OPENGL_COMPAT || ctx->API == API_OPENGLES ||
+       !ctx->DriverFlags.NewMultisampleEnable) {
+      FLUSH_VERTICES(ctx, _NEW_MULTISAMPLE);
+   } else {
+      FLUSH_VERTICES(ctx, 0);
+   }
+
+   ctx->NewDriverState |= ctx->DriverFlags.NewMultisampleEnable;
    ctx->Multisample.Enabled = state;
 
    if (ctx->Driver.Enable) {
@@ -258,7 +268,10 @@
 {
    if (ctx->Color.sRGBEnabled == state)
       return;
-   FLUSH_VERTICES(ctx, _NEW_BUFFERS);
+
+   /* TODO: Switch i965 to the new flag and remove the conditional */
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewFramebufferSRGB ? 0 : _NEW_BUFFERS);
+   ctx->NewDriverState |= ctx->DriverFlags.NewFramebufferSRGB;
    ctx->Color.sRGBEnabled = state;
 
    if (ctx->Driver.Enable) {
@@ -293,7 +306,9 @@
             goto invalid_enum_error;
          if (ctx->Color.AlphaEnabled == state)
             return;
+         /* AlphaEnabled is used by the fixed-func fragment program */
          FLUSH_VERTICES(ctx, _NEW_COLOR);
+         ctx->NewDriverState |= ctx->DriverFlags.NewAlphaTest;
          ctx->Color.AlphaEnabled = state;
          break;
       case GL_AUTO_NORMAL:
@@ -309,7 +324,7 @@
             GLbitfield newEnabled =
                state * ((1 << ctx->Const.MaxDrawBuffers) - 1);
             if (newEnabled != ctx->Color.BlendEnabled) {
-               FLUSH_VERTICES(ctx, _NEW_COLOR);
+               _mesa_flush_vertices_for_blend_state(ctx);
                ctx->Color.BlendEnabled = newEnabled;
             }
          }
@@ -332,15 +347,30 @@
                 == ((GLuint) state << p))
                return;
 
-            FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
+            /* The compatibility profile needs _NEW_TRANSFORM to transform
+             * clip planes according to the projection matrix.
+             */
+            if (ctx->API == API_OPENGL_COMPAT || ctx->API == API_OPENGLES ||
+                !ctx->DriverFlags.NewClipPlaneEnable) {
+               FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
+            } else {
+               FLUSH_VERTICES(ctx, 0);
+            }
+            ctx->NewDriverState |= ctx->DriverFlags.NewClipPlaneEnable;
 
             if (state) {
                ctx->Transform.ClipPlanesEnabled |= (1 << p);
-               _mesa_update_clip_plane(ctx, p);
+
+               /* The projection matrix transforms the clip plane. */
+               /* TODO: glEnable might not be the best place to do it. */
+               if (ctx->API == API_OPENGL_COMPAT || ctx->API == API_OPENGLES) {
+                  _mesa_update_clip_plane(ctx, p);
+                  ctx->NewDriverState |= ctx->DriverFlags.NewClipPlane;
+               }
             }
             else {
                ctx->Transform.ClipPlanesEnabled &= ~(1 << p);
-            }               
+            }
          }
          break;
       case GL_COLOR_MATERIAL:
@@ -359,13 +389,16 @@
       case GL_CULL_FACE:
          if (ctx->Polygon.CullFlag == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_POLYGON);
+         FLUSH_VERTICES(ctx,
+                        ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+         ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
          ctx->Polygon.CullFlag = state;
          break;
       case GL_DEPTH_TEST:
          if (ctx->Depth.Test == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_DEPTH);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewDepth ? 0 : _NEW_DEPTH);
+         ctx->NewDriverState |= ctx->DriverFlags.NewDepth;
          ctx->Depth.Test = state;
          break;
       case GL_DEBUG_OUTPUT:
@@ -375,7 +408,8 @@
       case GL_DITHER:
          if (ctx->Color.DitherFlag == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_COLOR);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewBlend ? 0 : _NEW_COLOR);
+         ctx->NewDriverState |= ctx->DriverFlags.NewBlend;
          ctx->Color.DitherFlag = state;
          break;
       case GL_FOG:
@@ -421,7 +455,8 @@
             goto invalid_enum_error;
          if (ctx->Line.SmoothFlag == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_LINE);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewLineState ? 0 : _NEW_LINE);
+         ctx->NewDriverState |= ctx->DriverFlags.NewLineState;
          ctx->Line.SmoothFlag = state;
          break;
       case GL_LINE_STIPPLE:
@@ -429,7 +464,8 @@
             goto invalid_enum_error;
          if (ctx->Line.StippleFlag == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_LINE);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewLineState ? 0 : _NEW_LINE);
+         ctx->NewDriverState |= ctx->DriverFlags.NewLineState;
          ctx->Line.StippleFlag = state;
          break;
       case GL_INDEX_LOGIC_OP:
@@ -437,7 +473,8 @@
             goto invalid_enum_error;
          if (ctx->Color.IndexLogicOpEnabled == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_COLOR);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewLogicOp ? 0 : _NEW_COLOR);
+         ctx->NewDriverState |= ctx->DriverFlags.NewLogicOp;
          ctx->Color.IndexLogicOpEnabled = state;
          break;
       case GL_CONSERVATIVE_RASTERIZATION_INTEL:
@@ -455,7 +492,8 @@
             goto invalid_enum_error;
          if (ctx->Color.ColorLogicOpEnabled == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_COLOR);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewLogicOp ? 0 : _NEW_COLOR);
+         ctx->NewDriverState |= ctx->DriverFlags.NewLogicOp;
          ctx->Color.ColorLogicOpEnabled = state;
          break;
       case GL_MAP1_COLOR_4:
@@ -623,7 +661,9 @@
             goto invalid_enum_error;
          if (ctx->Polygon.SmoothFlag == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_POLYGON);
+         FLUSH_VERTICES(ctx,
+                        ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+         ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
          ctx->Polygon.SmoothFlag = state;
          break;
       case GL_POLYGON_STIPPLE:
@@ -631,7 +671,9 @@
             goto invalid_enum_error;
          if (ctx->Polygon.StippleFlag == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_POLYGON);
+         FLUSH_VERTICES(ctx,
+                        ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+         ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
          ctx->Polygon.StippleFlag = state;
          break;
       case GL_POLYGON_OFFSET_POINT:
@@ -639,7 +681,9 @@
             goto invalid_enum_error;
          if (ctx->Polygon.OffsetPoint == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_POLYGON);
+         FLUSH_VERTICES(ctx,
+                        ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+         ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
          ctx->Polygon.OffsetPoint = state;
          break;
       case GL_POLYGON_OFFSET_LINE:
@@ -647,13 +691,17 @@
             goto invalid_enum_error;
          if (ctx->Polygon.OffsetLine == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_POLYGON);
+         FLUSH_VERTICES(ctx,
+                        ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+         ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
          ctx->Polygon.OffsetLine = state;
          break;
       case GL_POLYGON_OFFSET_FILL:
          if (ctx->Polygon.OffsetFill == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_POLYGON);
+         FLUSH_VERTICES(ctx,
+                        ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+         ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
          ctx->Polygon.OffsetFill = state;
          break;
       case GL_RESCALE_NORMAL_EXT:
@@ -670,7 +718,9 @@
             GLbitfield newEnabled =
                state * ((1 << ctx->Const.MaxViewports) - 1);
             if (newEnabled != ctx->Scissor.EnableFlags) {
-               FLUSH_VERTICES(ctx, _NEW_SCISSOR);
+               FLUSH_VERTICES(ctx, ctx->DriverFlags.NewScissorTest ? 0 :
+                                                                _NEW_SCISSOR);
+               ctx->NewDriverState |= ctx->DriverFlags.NewScissorTest;
                ctx->Scissor.EnableFlags = newEnabled;
             }
          }
@@ -678,7 +728,8 @@
       case GL_STENCIL_TEST:
          if (ctx->Stencil.Enabled == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_STENCIL);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+         ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
          ctx->Stencil.Enabled = state;
          break;
       case GL_TEXTURE_1D:
@@ -726,8 +777,8 @@
          break;
 
       case GL_TEXTURE_GEN_STR_OES:
-	 /* disable S, T, and R at the same time */
-	 {
+         /* disable S, T, and R at the same time */
+         {
             struct gl_texture_unit *texUnit = get_texcoord_unit(ctx);
 
             if (ctx->API != API_OPENGLES)
@@ -735,7 +786,7 @@
 
             if (texUnit) {
                GLuint newenabled =
-		  texUnit->TexGenEnabled & ~STR_BITS;
+                  texUnit->TexGenEnabled & ~STR_BITS;
                if (state)
                   newenabled |= STR_BITS;
                if (texUnit->TexGenEnabled == newenabled)
@@ -798,7 +849,9 @@
       case GL_SAMPLE_ALPHA_TO_COVERAGE_ARB:
          if (ctx->Multisample.SampleAlphaToCoverage == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_MULTISAMPLE);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewSampleAlphaToXEnable ? 0 :
+                                                         _NEW_MULTISAMPLE);
+         ctx->NewDriverState |= ctx->DriverFlags.NewSampleAlphaToXEnable;
          ctx->Multisample.SampleAlphaToCoverage = state;
          break;
       case GL_SAMPLE_ALPHA_TO_ONE_ARB:
@@ -806,13 +859,17 @@
             goto invalid_enum_error;
          if (ctx->Multisample.SampleAlphaToOne == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_MULTISAMPLE);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewSampleAlphaToXEnable ? 0 :
+                                                         _NEW_MULTISAMPLE);
+         ctx->NewDriverState |= ctx->DriverFlags.NewSampleAlphaToXEnable;
          ctx->Multisample.SampleAlphaToOne = state;
          break;
       case GL_SAMPLE_COVERAGE_ARB:
          if (ctx->Multisample.SampleCoverage == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_MULTISAMPLE);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewSampleMask ? 0 :
+                                                         _NEW_MULTISAMPLE);
+         ctx->NewDriverState |= ctx->DriverFlags.NewSampleMask;
          ctx->Multisample.SampleCoverage = state;
          break;
       case GL_SAMPLE_COVERAGE_INVERT_ARB:
@@ -820,7 +877,9 @@
             goto invalid_enum_error;
          if (ctx->Multisample.SampleCoverageInvert == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_MULTISAMPLE);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewSampleMask ? 0 :
+                                                         _NEW_MULTISAMPLE);
+         ctx->NewDriverState |= ctx->DriverFlags.NewSampleMask;
          ctx->Multisample.SampleCoverageInvert = state;
          break;
 
@@ -831,7 +890,9 @@
          CHECK_EXTENSION(ARB_sample_shading, cap);
          if (ctx->Multisample.SampleShading == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_MULTISAMPLE);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewSampleShading ? 0 :
+                                                         _NEW_MULTISAMPLE);
+         ctx->NewDriverState |= ctx->DriverFlags.NewSampleShading;
          ctx->Multisample.SampleShading = state;
          break;
 
@@ -841,7 +902,7 @@
             goto invalid_enum_error;
          if (ctx->Transform.RasterPositionUnclipped == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
+         FLUSH_VERTICES(ctx, 0);
          ctx->Transform.RasterPositionUnclipped = state;
          break;
 
@@ -862,7 +923,7 @@
          CHECK_EXTENSION(ARB_vertex_program, cap);
          if (ctx->VertexProgram.Enabled == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_PROGRAM); 
+         FLUSH_VERTICES(ctx, _NEW_PROGRAM);
          ctx->VertexProgram.Enabled = state;
          break;
       case GL_VERTEX_PROGRAM_POINT_SIZE_ARB:
@@ -883,7 +944,7 @@
          CHECK_EXTENSION(ARB_vertex_program, cap);
          if (ctx->VertexProgram.TwoSideEnabled == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_PROGRAM); 
+         FLUSH_VERTICES(ctx, _NEW_PROGRAM);
          ctx->VertexProgram.TwoSideEnabled = state;
          break;
 
@@ -904,7 +965,8 @@
          CHECK_EXTENSION(EXT_stencil_two_side, cap);
          if (ctx->Stencil.TestTwoSide == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_STENCIL);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+         ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
          ctx->Stencil.TestTwoSide = state;
          if (state) {
             ctx->Stencil._BackFace = 2;
@@ -930,44 +992,47 @@
          CHECK_EXTENSION(EXT_depth_bounds_test, cap);
          if (ctx->Depth.BoundsTest == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_DEPTH);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewDepth ? 0 : _NEW_DEPTH);
+         ctx->NewDriverState |= ctx->DriverFlags.NewDepth;
          ctx->Depth.BoundsTest = state;
          break;
 
       case GL_DEPTH_CLAMP:
          if (!_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-	 CHECK_EXTENSION(ARB_depth_clamp, cap);
+         CHECK_EXTENSION(ARB_depth_clamp, cap);
          if (ctx->Transform.DepthClamp == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
-	 ctx->Transform.DepthClamp = state;
-	 break;
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewDepthClamp ? 0 :
+                                                           _NEW_TRANSFORM);
+         ctx->NewDriverState |= ctx->DriverFlags.NewDepthClamp;
+         ctx->Transform.DepthClamp = state;
+         break;
 
       case GL_FRAGMENT_SHADER_ATI:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
         CHECK_EXTENSION(ATI_fragment_shader, cap);
-	if (ctx->ATIFragmentShader.Enabled == state)
-	  return;
-	FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-	ctx->ATIFragmentShader.Enabled = state;
+        if (ctx->ATIFragmentShader.Enabled == state)
+           return;
+        FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+        ctx->ATIFragmentShader.Enabled = state;
         break;
 
       case GL_TEXTURE_CUBE_MAP_SEAMLESS:
          if (!_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-	 CHECK_EXTENSION(ARB_seamless_cube_map, cap);
-	 if (ctx->Texture.CubeMapSeamless != state) {
-	    FLUSH_VERTICES(ctx, _NEW_TEXTURE_OBJECT);
-	    ctx->Texture.CubeMapSeamless = state;
-	 }
-	 break;
+         CHECK_EXTENSION(ARB_seamless_cube_map, cap);
+         if (ctx->Texture.CubeMapSeamless != state) {
+            FLUSH_VERTICES(ctx, _NEW_TEXTURE_OBJECT);
+            ctx->Texture.CubeMapSeamless = state;
+         }
+         break;
 
       case GL_RASTERIZER_DISCARD:
          if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
-	 CHECK_EXTENSION(EXT_transform_feedback, cap);
+         CHECK_EXTENSION(EXT_transform_feedback, cap);
          if (ctx->RasterDiscard != state) {
             FLUSH_VERTICES(ctx, 0);
             ctx->NewDriverState |= ctx->DriverFlags.NewRasterizerDiscard;
@@ -983,17 +1048,17 @@
             goto invalid_enum_error;
          }
          if (ctx->Array.PrimitiveRestart != state) {
-            FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
+            FLUSH_VERTICES(ctx, 0);
             ctx->Array.PrimitiveRestart = state;
             update_derived_primitive_restart_state(ctx);
          }
          break;
 
       case GL_PRIMITIVE_RESTART_FIXED_INDEX:
-	 if (!_mesa_is_gles3(ctx) && !ctx->Extensions.ARB_ES3_compatibility)
+         if (!_mesa_is_gles3(ctx) && !ctx->Extensions.ARB_ES3_compatibility)
             goto invalid_enum_error;
          if (ctx->Array.PrimitiveRestartFixedIndex != state) {
-            FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
+            FLUSH_VERTICES(ctx, 0);
             ctx->Array.PrimitiveRestartFixedIndex = state;
             update_derived_primitive_restart_state(ctx);
          }
@@ -1024,7 +1089,9 @@
          CHECK_EXTENSION(ARB_texture_multisample, cap);
          if (ctx->Multisample.SampleMask == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_MULTISAMPLE);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewSampleMask ? 0 :
+                                                         _NEW_MULTISAMPLE);
+         ctx->NewDriverState |= ctx->DriverFlags.NewSampleMask;
          ctx->Multisample.SampleMask = state;
          break;
 
@@ -1032,7 +1099,8 @@
          CHECK_EXTENSION(KHR_blend_equation_advanced_coherent, cap);
          if (ctx->Color.BlendCoherent == state)
             return;
-         FLUSH_VERTICES(ctx, _NEW_COLOR);
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewBlend ? 0 : _NEW_COLOR);
+         ctx->NewDriverState |= ctx->DriverFlags.NewBlend;
          ctx->Color.BlendCoherent = state;
          break;
 
@@ -1098,7 +1166,7 @@
          return;
       }
       if (((ctx->Color.BlendEnabled >> index) & 1) != state) {
-         FLUSH_VERTICES(ctx, _NEW_COLOR);
+         _mesa_flush_vertices_for_blend_state(ctx);
          if (state)
             ctx->Color.BlendEnabled |= (1 << index);
          else
@@ -1112,7 +1180,9 @@
          return;
       }
       if (((ctx->Scissor.EnableFlags >> index) & 1) != state) {
-         FLUSH_VERTICES(ctx, _NEW_SCISSOR);
+         FLUSH_VERTICES(ctx,
+                        ctx->DriverFlags.NewScissorTest ? 0 : _NEW_SCISSOR);
+         ctx->NewDriverState |= ctx->DriverFlags.NewScissorTest;
          if (state)
             ctx->Scissor.EnableFlags |= (1 << index);
          else
@@ -1225,7 +1295,7 @@
       case GL_AUTO_NORMAL:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.AutoNormal;
+         return ctx->Eval.AutoNormal;
       case GL_BLEND:
          return ctx->Color.BlendEnabled & 1;  /* return state for buffer[0] */
       case GL_CLIP_DISTANCE0: /* aka GL_CLIP_PLANE0 */
@@ -1241,12 +1311,12 @@
          if (p >= ctx->Const.MaxClipPlanes)
             goto invalid_enum_error;
 
-	 return (ctx->Transform.ClipPlanesEnabled >> p) & 1;
+         return (ctx->Transform.ClipPlanesEnabled >> p) & 1;
       }
       case GL_COLOR_MATERIAL:
          if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES)
             goto invalid_enum_error;
-	 return ctx->Light.ColorMaterialEnabled;
+         return ctx->Light.ColorMaterialEnabled;
       case GL_CULL_FACE:
          return ctx->Polygon.CullFlag;
       case GL_DEBUG_OUTPUT:
@@ -1255,11 +1325,11 @@
       case GL_DEPTH_TEST:
          return ctx->Depth.Test;
       case GL_DITHER:
-	 return ctx->Color.DitherFlag;
+         return ctx->Color.DitherFlag;
       case GL_FOG:
          if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES)
             goto invalid_enum_error;
-	 return ctx->Fog.Enabled;
+         return ctx->Fog.Enabled;
       case GL_LIGHTING:
          if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES)
             goto invalid_enum_error;
@@ -1278,125 +1348,125 @@
       case GL_LINE_SMOOTH:
          if (!_mesa_is_desktop_gl(ctx) && ctx->API != API_OPENGLES)
             goto invalid_enum_error;
-	 return ctx->Line.SmoothFlag;
+         return ctx->Line.SmoothFlag;
       case GL_LINE_STIPPLE:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Line.StippleFlag;
+         return ctx->Line.StippleFlag;
       case GL_INDEX_LOGIC_OP:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Color.IndexLogicOpEnabled;
+         return ctx->Color.IndexLogicOpEnabled;
       case GL_COLOR_LOGIC_OP:
          if (!_mesa_is_desktop_gl(ctx) && ctx->API != API_OPENGLES)
             goto invalid_enum_error;
-	 return ctx->Color.ColorLogicOpEnabled;
+         return ctx->Color.ColorLogicOpEnabled;
       case GL_MAP1_COLOR_4:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map1Color4;
+         return ctx->Eval.Map1Color4;
       case GL_MAP1_INDEX:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map1Index;
+         return ctx->Eval.Map1Index;
       case GL_MAP1_NORMAL:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map1Normal;
+         return ctx->Eval.Map1Normal;
       case GL_MAP1_TEXTURE_COORD_1:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map1TextureCoord1;
+         return ctx->Eval.Map1TextureCoord1;
       case GL_MAP1_TEXTURE_COORD_2:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map1TextureCoord2;
+         return ctx->Eval.Map1TextureCoord2;
       case GL_MAP1_TEXTURE_COORD_3:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map1TextureCoord3;
+         return ctx->Eval.Map1TextureCoord3;
       case GL_MAP1_TEXTURE_COORD_4:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map1TextureCoord4;
+         return ctx->Eval.Map1TextureCoord4;
       case GL_MAP1_VERTEX_3:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map1Vertex3;
+         return ctx->Eval.Map1Vertex3;
       case GL_MAP1_VERTEX_4:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map1Vertex4;
+         return ctx->Eval.Map1Vertex4;
       case GL_MAP2_COLOR_4:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map2Color4;
+         return ctx->Eval.Map2Color4;
       case GL_MAP2_INDEX:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map2Index;
+         return ctx->Eval.Map2Index;
       case GL_MAP2_NORMAL:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map2Normal;
+         return ctx->Eval.Map2Normal;
       case GL_MAP2_TEXTURE_COORD_1:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map2TextureCoord1;
+         return ctx->Eval.Map2TextureCoord1;
       case GL_MAP2_TEXTURE_COORD_2:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map2TextureCoord2;
+         return ctx->Eval.Map2TextureCoord2;
       case GL_MAP2_TEXTURE_COORD_3:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map2TextureCoord3;
+         return ctx->Eval.Map2TextureCoord3;
       case GL_MAP2_TEXTURE_COORD_4:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map2TextureCoord4;
+         return ctx->Eval.Map2TextureCoord4;
       case GL_MAP2_VERTEX_3:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map2Vertex3;
+         return ctx->Eval.Map2Vertex3;
       case GL_MAP2_VERTEX_4:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Eval.Map2Vertex4;
+         return ctx->Eval.Map2Vertex4;
       case GL_NORMALIZE:
          if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES)
             goto invalid_enum_error;
-	 return ctx->Transform.Normalize;
+         return ctx->Transform.Normalize;
       case GL_POINT_SMOOTH:
          if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES)
             goto invalid_enum_error;
-	 return ctx->Point.SmoothFlag;
+         return ctx->Point.SmoothFlag;
       case GL_POLYGON_SMOOTH:
          if (!_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-	 return ctx->Polygon.SmoothFlag;
+         return ctx->Polygon.SmoothFlag;
       case GL_POLYGON_STIPPLE:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 return ctx->Polygon.StippleFlag;
+         return ctx->Polygon.StippleFlag;
       case GL_POLYGON_OFFSET_POINT:
          if (!_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-	 return ctx->Polygon.OffsetPoint;
+         return ctx->Polygon.OffsetPoint;
       case GL_POLYGON_OFFSET_LINE:
          if (!_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-	 return ctx->Polygon.OffsetLine;
+         return ctx->Polygon.OffsetLine;
       case GL_POLYGON_OFFSET_FILL:
-	 return ctx->Polygon.OffsetFill;
+         return ctx->Polygon.OffsetFill;
       case GL_RESCALE_NORMAL_EXT:
          if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES)
             goto invalid_enum_error;
          return ctx->Transform.RescaleNormals;
       case GL_SCISSOR_TEST:
-	 return ctx->Scissor.EnableFlags & 1;  /* return state for index 0 */
+         return ctx->Scissor.EnableFlags & 1;  /* return state for index 0 */
       case GL_STENCIL_TEST:
-	 return ctx->Stencil.Enabled;
+         return ctx->Stencil.Enabled;
       case GL_TEXTURE_1D:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
@@ -1426,7 +1496,7 @@
          }
          return GL_FALSE;
       case GL_TEXTURE_GEN_STR_OES:
-	 {
+         {
             const struct gl_texture_unit *texUnit = get_texcoord_unit(ctx);
 
             if (ctx->API != API_OPENGLES)
@@ -1575,19 +1645,19 @@
       case GL_FRAGMENT_SHADER_ATI:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_enum_error;
-	 CHECK_EXTENSION(ATI_fragment_shader);
-	 return ctx->ATIFragmentShader.Enabled;
+         CHECK_EXTENSION(ATI_fragment_shader);
+         return ctx->ATIFragmentShader.Enabled;
 
       case GL_TEXTURE_CUBE_MAP_SEAMLESS:
          if (!_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-	 CHECK_EXTENSION(ARB_seamless_cube_map);
-	 return ctx->Texture.CubeMapSeamless;
+         CHECK_EXTENSION(ARB_seamless_cube_map);
+         return ctx->Texture.CubeMapSeamless;
 
       case GL_RASTERIZER_DISCARD:
          if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
-	 CHECK_EXTENSION(EXT_transform_feedback);
+         CHECK_EXTENSION(EXT_transform_feedback);
          return ctx->RasterDiscard;
 
       /* GL_NV_primitive_restart */
@@ -1605,7 +1675,7 @@
          return ctx->Array.PrimitiveRestart;
 
       case GL_PRIMITIVE_RESTART_FIXED_INDEX:
-	 if (!_mesa_is_gles3(ctx) && !ctx->Extensions.ARB_ES3_compatibility) {
+         if (!_mesa_is_gles3(ctx) && !ctx->Extensions.ARB_ES3_compatibility) {
             goto invalid_enum_error;
          }
          return ctx->Array.PrimitiveRestartFixedIndex;
@@ -1614,14 +1684,14 @@
       case GL_FRAMEBUFFER_SRGB_EXT:
          if (!_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-	 CHECK_EXTENSION(EXT_framebuffer_sRGB);
-	 return ctx->Color.sRGBEnabled;
+         CHECK_EXTENSION(EXT_framebuffer_sRGB);
+         return ctx->Color.sRGBEnabled;
 
       /* GL_OES_EGL_image_external */
       case GL_TEXTURE_EXTERNAL_OES:
          if (!_mesa_is_gles(ctx))
             goto invalid_enum_error;
-	 CHECK_EXTENSION(OES_EGL_image_external);
+         CHECK_EXTENSION(OES_EGL_image_external);
          return is_texture_enabled(ctx, TEXTURE_EXTERNAL_BIT);
 
       /* ARB_texture_multisample */
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index d11cb0f..757b7bf 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -26,7 +26,6 @@
 EXT(APPLE_object_purgeable                  , APPLE_object_purgeable                 , GLL, GLC,  x ,  x , 2006)
 EXT(APPLE_packed_pixels                     , dummy_true                             , GLL,  x ,  x ,  x , 2002)
 EXT(APPLE_texture_max_level                 , dummy_true                             ,  x ,  x , ES1, ES2, 2009)
-EXT(APPLE_vertex_array_object               , dummy_true                             , GLL,  x ,  x ,  x , 2002)
 
 EXT(ARB_ES2_compatibility                   , ARB_ES2_compatibility                  , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_ES3_1_compatibility                 , ARB_ES3_1_compatibility                ,  x , GLC,  x ,  x , 2014)
@@ -34,6 +33,7 @@
 EXT(ARB_ES3_compatibility                   , ARB_ES3_compatibility                  , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_arrays_of_arrays                    , ARB_arrays_of_arrays                   , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_base_instance                       , ARB_base_instance                      , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_bindless_texture                    , ARB_bindless_texture                   , GLL, GLC,  x ,  x , 2013)
 EXT(ARB_blend_func_extended                 , ARB_blend_func_extended                , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_buffer_storage                      , ARB_buffer_storage                     , GLL, GLC,  x ,  x , 2013)
 EXT(ARB_clear_buffer_object                 , dummy_true                             , GLL, GLC,  x ,  x , 2012)
@@ -118,7 +118,7 @@
 EXT(ARB_shader_precision                    , ARB_shader_precision                   , GLL, GLC,  x ,  x , 2010)
 EXT(ARB_shader_stencil_export               , ARB_shader_stencil_export              , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_shader_storage_buffer_object        , ARB_shader_storage_buffer_object       , GLL, GLC,  x ,  x , 2012)
-EXT(ARB_shader_subroutine                   , ARB_shader_subroutine                  ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_shader_subroutine                   , dummy_true                             ,  x , GLC,  x ,  x , 2010)
 EXT(ARB_shader_texture_image_samples        , ARB_shader_texture_image_samples       , GLL, GLC,  x ,  x , 2014)
 EXT(ARB_shader_texture_lod                  , ARB_shader_texture_lod                 , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_shader_viewport_layer_array         , ARB_shader_viewport_layer_array        ,  x , GLC,  x ,  x , 2015)
@@ -303,6 +303,7 @@
 EXT(KHR_blend_equation_advanced_coherent    , KHR_blend_equation_advanced_coherent   , GLL, GLC,  x , ES2, 2014)
 EXT(KHR_context_flush_control               , dummy_true                             , GLL, GLC,  x , ES2, 2014)
 EXT(KHR_debug                               , dummy_true                             , GLL, GLC,  11, ES2, 2012)
+EXT(KHR_no_error                            , dummy_true                             , GLL, GLC, ES1, ES2, 2015)
 EXT(KHR_robust_buffer_access_behavior       , ARB_robust_buffer_access_behavior      , GLL, GLC,  x , ES2, 2014)
 EXT(KHR_robustness                          , KHR_robustness                         , GLL, GLC,  x , ES2, 2012)
 EXT(KHR_texture_compression_astc_hdr        , KHR_texture_compression_astc_hdr       , GLL, GLC,  x , ES2, 2012)
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index d486d01..2e2300e 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -285,8 +285,8 @@
  * window-system framebuffer (not user-created framebuffer objects).
  */
 static struct gl_renderbuffer_attachment *
-_mesa_get_fb0_attachment(struct gl_context *ctx, struct gl_framebuffer *fb,
-                         GLenum attachment)
+get_fb0_attachment(struct gl_context *ctx, struct gl_framebuffer *fb,
+                   GLenum attachment)
 {
    assert(_mesa_is_winsys_fbo(fb));
 
@@ -303,7 +303,7 @@
             return &fb->Attachment[BUFFER_BACK_LEFT];
          return &fb->Attachment[BUFFER_FRONT_LEFT];
       case GL_DEPTH:
-      return &fb->Attachment[BUFFER_DEPTH];
+         return &fb->Attachment[BUFFER_DEPTH];
       case GL_STENCIL:
          return &fb->Attachment[BUFFER_STENCIL];
       }
@@ -330,6 +330,15 @@
       return &fb->Attachment[BUFFER_BACK_LEFT];
    case GL_BACK_RIGHT:
       return &fb->Attachment[BUFFER_BACK_RIGHT];
+   case GL_BACK:
+      /* The ARB_ES3_1_compatibility spec says:
+       *
+       *    "Since this command can only query a single framebuffer
+       *     attachment, BACK is equivalent to BACK_LEFT."
+       */
+      if (ctx->Extensions.ARB_ES3_1_compatibility)
+         return &fb->Attachment[BUFFER_BACK_LEFT];
+      return NULL;
    case GL_AUX0:
       if (fb->Visual.numAuxBuffers == 1) {
          return &fb->Attachment[BUFFER_AUX0];
@@ -1705,11 +1714,6 @@
    GLuint first;
    GLint i;
 
-   if (n < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n<0)", func);
-      return;
-   }
-
    if (!renderbuffers)
       return;
 
@@ -1734,8 +1738,23 @@
 }
 
 
+static void
+create_render_buffers_err(struct gl_context *ctx, GLsizei n,
+                          GLuint *renderbuffers, bool dsa)
+{
+   const char *func = dsa ? "glCreateRenderbuffers" : "glGenRenderbuffers";
+   /* Reject n < 0 here, then use the helper the _no_error paths call. */
+   if (n < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n<0)", func);
+      return;
+   }
+
+   create_render_buffers(ctx, n, renderbuffers, dsa);
+}
+
+
 void GLAPIENTRY
-_mesa_GenRenderbuffers(GLsizei n, GLuint *renderbuffers)
+_mesa_GenRenderbuffers_no_error(GLsizei n, GLuint *renderbuffers)
 {
    GET_CURRENT_CONTEXT(ctx);
    create_render_buffers(ctx, n, renderbuffers, false);
@@ -1743,13 +1762,29 @@
 
 
 void GLAPIENTRY
-_mesa_CreateRenderbuffers(GLsizei n, GLuint *renderbuffers)
+_mesa_GenRenderbuffers(GLsizei n, GLuint *renderbuffers)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_render_buffers_err(ctx, n, renderbuffers, false);
+}
+
+
+void GLAPIENTRY
+_mesa_CreateRenderbuffers_no_error(GLsizei n, GLuint *renderbuffers)
 {
    GET_CURRENT_CONTEXT(ctx);
    create_render_buffers(ctx, n, renderbuffers, true);
 }
 
 
+void GLAPIENTRY
+_mesa_CreateRenderbuffers(GLsizei n, GLuint *renderbuffers)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_render_buffers_err(ctx, n, renderbuffers, true);
+}
+
+
 /**
  * Given an internal format token for a render buffer, return the
  * corresponding base format (one of GL_RGB, GL_RGBA, GL_STENCIL_INDEX,
@@ -2823,6 +2858,16 @@
 
 
 GLenum GLAPIENTRY
+_mesa_CheckFramebufferStatus_no_error(GLenum target)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   /* No target validation: fb is forwarded without a NULL check. */
+   struct gl_framebuffer *fb = get_framebuffer_target(ctx, target);
+   return _mesa_check_framebuffer_status(ctx, fb);
+}
+
+
+GLenum GLAPIENTRY
 _mesa_CheckFramebufferStatus(GLenum target)
 {
    struct gl_framebuffer *fb;
@@ -2910,6 +2955,16 @@
 }
 
 
+static struct gl_texture_object *
+get_texture_for_framebuffer(struct gl_context *ctx, GLuint texture)
+{
+   if (!texture)
+      return NULL;
+   /* Name 0 yields NULL above; otherwise a plain lookup, no validation. */
+   return _mesa_lookup_texture(ctx, texture);
+}
+
+
 /**
  * Common code called by gl*FramebufferTexture*() to retrieve the correct
  * texture object pointer.
@@ -2920,9 +2975,9 @@
  * \return true if no errors, false if errors
  */
 static bool
-get_texture_for_framebuffer(struct gl_context *ctx, GLuint texture,
-                            bool layered, const char *caller,
-                            struct gl_texture_object **texObj)
+get_texture_for_framebuffer_err(struct gl_context *ctx, GLuint texture,
+                                bool layered, const char *caller,
+                                struct gl_texture_object **texObj)
 {
    *texObj = NULL; /* This will get returned if texture = 0. */
 
@@ -3188,25 +3243,22 @@
 }
 
 
-void
-_mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb,
-                          GLenum attachment,
-                          struct gl_texture_object *texObj, GLenum textarget,
-                          GLint level, GLuint layer, GLboolean layered,
-                          const char *caller)
+struct gl_renderbuffer_attachment *
+_mesa_get_and_validate_attachment(struct gl_context *ctx,
+                                  struct gl_framebuffer *fb,
+                                  GLenum attachment, const char *caller)
 {
-   struct gl_renderbuffer_attachment *att;
-   bool is_color_attachment;
-
    /* The window-system framebuffer object is immutable */
    if (_mesa_is_winsys_fbo(fb)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(window-system framebuffer)",
                   caller);
-      return;
+      return NULL;
    }
 
    /* Not a hash lookup, so we can afford to get the attachment here. */
-   att = get_attachment(ctx, fb, attachment, &is_color_attachment);
+   bool is_color_attachment;
+   struct gl_renderbuffer_attachment *att =
+      get_attachment(ctx, fb, attachment, &is_color_attachment);
    if (att == NULL) {
       if (is_color_attachment) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -3217,9 +3269,20 @@
                      "%s(invalid attachment %s)", caller,
                      _mesa_enum_to_string(attachment));
       }
-      return;
+      return NULL;
    }
 
+   return att;
+}
+
+
+void
+_mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb,
+                          GLenum attachment,
+                          struct gl_renderbuffer_attachment *att,
+                          struct gl_texture_object *texObj, GLenum textarget,
+                          GLint level, GLuint layer, GLboolean layered)
+{
    FLUSH_VERTICES(ctx, _NEW_BUFFERS);
 
    mtx_lock(&fb->Mutex);
@@ -3286,6 +3349,28 @@
 
 
 static void
+framebuffer_texture_with_dims_no_error(GLenum target, GLenum attachment,
+                                       GLenum textarget, GLuint texture,
+                                       GLint level, GLint layer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* Get the framebuffer object; it is used below without a NULL check */
+   struct gl_framebuffer *fb = get_framebuffer_target(ctx, target);
+
+   /* Get the texture object (NULL when texture is 0) */
+   struct gl_texture_object *texObj =
+      get_texture_for_framebuffer(ctx, texture);
+
+   struct gl_renderbuffer_attachment *att =
+      get_attachment(ctx, fb, attachment, NULL);
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, att, texObj, textarget,
+                             level, layer, GL_FALSE);
+}
+
+
+static void
 framebuffer_texture_with_dims(int dims, GLenum target,
                               GLenum attachment, GLenum textarget,
                               GLuint texture, GLint level, GLint layer,
@@ -3304,7 +3389,7 @@
    }
 
    /* Get the texture object */
-   if (!get_texture_for_framebuffer(ctx, texture, false, caller, &texObj))
+   if (!get_texture_for_framebuffer_err(ctx, texture, false, caller, &texObj))
       return;
 
    if (texObj) {
@@ -3318,8 +3403,23 @@
          return;
    }
 
-   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, textarget, level,
-                             layer, GL_FALSE, caller);
+   struct gl_renderbuffer_attachment *att =
+      _mesa_get_and_validate_attachment(ctx, fb, attachment, caller);
+   if (!att)
+      return;
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, att, texObj, textarget,
+                             level, layer, GL_FALSE);
+}
+
+
+void GLAPIENTRY
+_mesa_FramebufferTexture1D_no_error(GLenum target, GLenum attachment,
+                                    GLenum textarget, GLuint texture,
+                                    GLint level)
+{
+   framebuffer_texture_with_dims_no_error(target, attachment, textarget,
+                                          texture, level, 0 /* layer */);
 }
 
 
@@ -3333,6 +3433,16 @@
 
 
 void GLAPIENTRY
+_mesa_FramebufferTexture2D_no_error(GLenum target, GLenum attachment,
+                                    GLenum textarget, GLuint texture,
+                                    GLint level)
+{
+   framebuffer_texture_with_dims_no_error(target, attachment, textarget,
+                                          texture, level, 0 /* layer */);
+}
+
+
+void GLAPIENTRY
 _mesa_FramebufferTexture2D(GLenum target, GLenum attachment,
                            GLenum textarget, GLuint texture, GLint level)
 {
@@ -3342,6 +3452,16 @@
 
 
 void GLAPIENTRY
+_mesa_FramebufferTexture3D_no_error(GLenum target, GLenum attachment,
+                                    GLenum textarget, GLuint texture,
+                                    GLint level, GLint layer)
+{
+   framebuffer_texture_with_dims_no_error(target, attachment, textarget,
+                                          texture, level, layer);
+}
+
+
+void GLAPIENTRY
 _mesa_FramebufferTexture3D(GLenum target, GLenum attachment,
                            GLenum textarget, GLuint texture,
                            GLint level, GLint layer)
@@ -3351,49 +3471,122 @@
 }
 
 
-void GLAPIENTRY
-_mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
-                              GLuint texture, GLint level, GLint layer)
+static ALWAYS_INLINE void
+frame_buffer_texture(GLuint framebuffer, GLenum target,
+                     GLenum attachment, GLuint texture,
+                     GLint level, GLint layer, const char *func,
+                     bool dsa, bool no_error, bool check_layered)
 {
    GET_CURRENT_CONTEXT(ctx);
-   struct gl_framebuffer *fb;
-   struct gl_texture_object *texObj;
-   GLenum textarget = 0;
+   GLboolean layered = GL_FALSE;
 
-   const char *func = "glFramebufferTextureLayer";
-
-   /* Get the framebuffer object */
-   fb = get_framebuffer_target(ctx, target);
-   if (!fb) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferTextureLayer(invalid target %s)",
-                  _mesa_enum_to_string(target));
-      return;
+   if (!no_error && check_layered) {
+      if (!_mesa_has_geometry_shaders(ctx)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "unsupported function (%s) called", func);
+         return;
+      }
    }
 
-   /* Get the texture object */
-   if (!get_texture_for_framebuffer(ctx, texture, false, func, &texObj))
-      return;
+   /* Get the framebuffer object */
+   struct gl_framebuffer *fb;
+   if (no_error) {
+      if (dsa) {
+         fb = _mesa_lookup_framebuffer(ctx, framebuffer);
+      } else {
+         fb = get_framebuffer_target(ctx, target);
+      }
+   } else {
+      if (dsa) {
+         fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
+         if (!fb)
+            return;
+      } else {
+         fb = get_framebuffer_target(ctx, target);
+         if (!fb) {
+            _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)",
+                        func, _mesa_enum_to_string(target));
+            return;
+         }
+      }
+   }
 
+   /* Get the texture object and framebuffer attachment */
+   struct gl_renderbuffer_attachment *att;
+   struct gl_texture_object *texObj;
+   if (no_error) {
+      texObj = get_texture_for_framebuffer(ctx, texture);
+      att = get_attachment(ctx, fb, attachment, NULL);
+   } else {
+      if (!get_texture_for_framebuffer_err(ctx, texture, check_layered, func,
+                                           &texObj))
+         return;
+
+      att = _mesa_get_and_validate_attachment(ctx, fb, attachment, func);
+      if (!att)
+         return;
+   }
+
+   GLenum textarget = 0;
    if (texObj) {
-      if (!check_texture_target(ctx, texObj->Target, func))
-         return;
+      if (check_layered) {
+         /* We do this regardless of no_error because this sets layered */
+         if (!check_layered_texture_target(ctx, texObj->Target, func,
+                                           &layered))
+            return;
+      }
 
-      if (!check_layer(ctx, texObj->Target, layer, func))
-         return;
+      if (!no_error) {
+         if (!check_layered) {
+            if (!check_texture_target(ctx, texObj->Target, func))
+               return;
 
-      if (!check_level(ctx, texObj->Target, level, func))
-         return;
+            if (!check_layer(ctx, texObj->Target, layer, func))
+               return;
+         }
 
-      if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
+         if (!check_level(ctx, texObj->Target, level, func))
+            return;
+      }
+
+      if (!check_layered && texObj->Target == GL_TEXTURE_CUBE_MAP) {
          assert(layer >= 0 && layer < 6);
          textarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X + layer;
          layer = 0;
       }
    }
 
-   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, textarget, level,
-                             layer, GL_FALSE, func);
+   _mesa_framebuffer_texture(ctx, fb, attachment, att, texObj, textarget,
+                             level, layer, layered);
+}
+
+void GLAPIENTRY
+_mesa_FramebufferTextureLayer_no_error(GLenum target, GLenum attachment,
+                                       GLuint texture, GLint level,
+                                       GLint layer)
+{
+   frame_buffer_texture(0, target, attachment, texture, level, layer,
+                        "glFramebufferTextureLayer", false, true, false);
+}
+
+
+void GLAPIENTRY
+_mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
+                              GLuint texture, GLint level, GLint layer)
+{
+   frame_buffer_texture(0, target, attachment, texture, level, layer,
+                        "glFramebufferTextureLayer", false, false, false);
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferTextureLayer_no_error(GLuint framebuffer,
+                                            GLenum attachment,
+                                            GLuint texture, GLint level,
+                                            GLint layer)
+{
+   frame_buffer_texture(framebuffer, 0, attachment, texture, level, layer,
+                        "glNamedFramebufferTextureLayer", true, true, false);
 }
 
 
@@ -3401,41 +3594,17 @@
 _mesa_NamedFramebufferTextureLayer(GLuint framebuffer, GLenum attachment,
                                    GLuint texture, GLint level, GLint layer)
 {
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_framebuffer *fb;
-   struct gl_texture_object *texObj;
-   GLenum textarget = 0;
+   frame_buffer_texture(framebuffer, 0, attachment, texture, level, layer,
+                        "glNamedFramebufferTextureLayer", true, false, false);
+}
 
-   const char *func = "glNamedFramebufferTextureLayer";
 
-   /* Get the framebuffer object */
-   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
-   if (!fb)
-      return;
-
-   /* Get the texture object */
-   if (!get_texture_for_framebuffer(ctx, texture, false, func, &texObj))
-      return;
-
-   if (texObj) {
-      if (!check_texture_target(ctx, texObj->Target, func))
-         return;
-
-      if (!check_layer(ctx, texObj->Target, layer, func))
-         return;
-
-      if (!check_level(ctx, texObj->Target, level, func))
-         return;
-
-      if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
-         assert(layer >= 0 && layer < 6);
-         textarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X + layer;
-         layer = 0;
-      }
-   }
-
-   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, textarget, level,
-                             layer, GL_FALSE, func);
+void GLAPIENTRY
+_mesa_FramebufferTexture_no_error(GLenum target, GLenum attachment,
+                                  GLuint texture, GLint level)
+{
+   frame_buffer_texture(0, target, attachment, texture, level, 0,
+                        "glFramebufferTexture", false, true, true);
 }
 
 
@@ -3443,42 +3612,16 @@
 _mesa_FramebufferTexture(GLenum target, GLenum attachment,
                          GLuint texture, GLint level)
 {
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_framebuffer *fb;
-   struct gl_texture_object *texObj;
-   GLboolean layered = GL_FALSE;
+   frame_buffer_texture(0, target, attachment, texture, level, 0,
+                        "glFramebufferTexture", false, false, true);
+}
 
-   const char *func = "FramebufferTexture";
-
-   if (!_mesa_has_geometry_shaders(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "unsupported function (glFramebufferTexture) called");
-      return;
-   }
-
-   /* Get the framebuffer object */
-   fb = get_framebuffer_target(ctx, target);
-   if (!fb) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferTexture(invalid target %s)",
-                  _mesa_enum_to_string(target));
-      return;
-   }
-
-   /* Get the texture object */
-   if (!get_texture_for_framebuffer(ctx, texture, true, func, &texObj))
-      return;
-
-   if (texObj) {
-      if (!check_layered_texture_target(ctx, texObj->Target, func, &layered))
-         return;
-
-      if (!check_level(ctx, texObj->Target, level, func))
-         return;
-   }
-
-   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, 0, level,
-                             0, layered, func);
+void GLAPIENTRY
+_mesa_NamedFramebufferTexture_no_error(GLuint framebuffer, GLenum attachment,
+                                       GLuint texture, GLint level)
+{
+   frame_buffer_texture(framebuffer, 0, attachment, texture, level, 0,
+                        "glNamedFramebufferTexture", true, true, true);
 }
 
 
@@ -3486,39 +3629,8 @@
 _mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment,
                               GLuint texture, GLint level)
 {
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_framebuffer *fb;
-   struct gl_texture_object *texObj;
-   GLboolean layered = GL_FALSE;
-
-   const char *func = "glNamedFramebufferTexture";
-
-   if (!_mesa_has_geometry_shaders(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "unsupported function (glNamedFramebufferTexture) called");
-      return;
-   }
-
-   /* Get the framebuffer object */
-   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
-   if (!fb)
-      return;
-
-   /* Get the texture object */
-   if (!get_texture_for_framebuffer(ctx, texture, true, func, &texObj))
-      return;
-
-   if (texObj) {
-      if (!check_layered_texture_target(ctx, texObj->Target, func,
-                                        &layered))
-         return;
-
-      if (!check_level(ctx, texObj->Target, level, func))
-         return;
-   }
-
-   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, 0, level,
-                             0, layered, func);
+   frame_buffer_texture(framebuffer, 0, attachment, texture, level, 0,
+                        "glNamedFramebufferTexture", true, false, true);
 }
 
 
@@ -3542,15 +3654,29 @@
 }
 
 static void
-framebuffer_renderbuffer(struct gl_context *ctx,
-                         struct gl_framebuffer *fb,
-                         GLenum attachment,
-                         struct gl_renderbuffer *rb,
-                         const char *func)
+framebuffer_renderbuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                         GLenum attachment, GLenum renderbuffertarget,
+                         GLuint renderbuffer, const char *func)
 {
    struct gl_renderbuffer_attachment *att;
+   struct gl_renderbuffer *rb;
    bool is_color_attachment;
 
+   if (renderbuffertarget != GL_RENDERBUFFER) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "%s(renderbuffertarget is not GL_RENDERBUFFER)", func);
+      return;
+   }
+
+   if (renderbuffer) {
+      rb = _mesa_lookup_renderbuffer_err(ctx, renderbuffer, func);
+      if (!rb)
+         return;
+   } else {
+      /* remove renderbuffer attachment */
+      rb = NULL;
+   }
+
    if (_mesa_is_winsys_fbo(fb)) {
       /* Can't attach new renderbuffers to a window system framebuffer */
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -3604,7 +3730,6 @@
                               GLuint renderbuffer)
 {
    struct gl_framebuffer *fb;
-   struct gl_renderbuffer *rb;
    GET_CURRENT_CONTEXT(ctx);
 
    fb = get_framebuffer_target(ctx, target);
@@ -3615,26 +3740,8 @@
       return;
    }
 
-   if (renderbuffertarget != GL_RENDERBUFFER) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferRenderbuffer(renderbuffertarget is not "
-                  "GL_RENDERBUFFER)");
-      return;
-   }
-
-   if (renderbuffer) {
-      rb = _mesa_lookup_renderbuffer_err(ctx, renderbuffer,
-                                         "glFramebufferRenderbuffer");
-      if (!rb)
-         return;
-   }
-   else {
-      /* remove renderbuffer attachment */
-      rb = NULL;
-   }
-
-   framebuffer_renderbuffer(ctx, fb, attachment, rb,
-                            "glFramebufferRenderbuffer");
+   framebuffer_renderbuffer(ctx, fb, attachment, renderbuffertarget,
+                            renderbuffer, "glFramebufferRenderbuffer");
 }
 
 
@@ -3644,7 +3751,6 @@
                                    GLuint renderbuffer)
 {
    struct gl_framebuffer *fb;
-   struct gl_renderbuffer *rb;
    GET_CURRENT_CONTEXT(ctx);
 
    fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
@@ -3652,34 +3758,16 @@
    if (!fb)
       return;
 
-   if (renderbuffertarget != GL_RENDERBUFFER) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glNamedFramebufferRenderbuffer(renderbuffertarget is not "
-                  "GL_RENDERBUFFER)");
-      return;
-   }
-
-   if (renderbuffer) {
-      rb = _mesa_lookup_renderbuffer_err(ctx, renderbuffer,
-                                         "glNamedFramebufferRenderbuffer");
-      if (!rb)
-         return;
-   }
-   else {
-      /* remove renderbuffer attachment */
-      rb = NULL;
-   }
-
-   framebuffer_renderbuffer(ctx, fb, attachment, rb,
-                            "glNamedFramebufferRenderbuffer");
+   framebuffer_renderbuffer(ctx, fb, attachment, renderbuffertarget,
+                            renderbuffer, "glNamedFramebufferRenderbuffer");
 }
 
 
-void
-_mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
-                                           struct gl_framebuffer *buffer,
-                                           GLenum attachment, GLenum pname,
-                                           GLint *params, const char *caller)
+static void
+get_framebuffer_attachment_parameter(struct gl_context *ctx,
+                                     struct gl_framebuffer *buffer,
+                                     GLenum attachment, GLenum pname,
+                                     GLint *params, const char *caller)
 {
    const struct gl_renderbuffer_attachment *att;
    bool is_color_attachment = false;
@@ -3746,7 +3834,7 @@
       }
 
       /* the default / window-system FBO */
-      att = _mesa_get_fb0_attachment(ctx, buffer, attachment);
+      att = get_fb0_attachment(ctx, buffer, attachment);
    }
    else {
       /* user-created framebuffer FBO */
@@ -3966,10 +4054,6 @@
           && !_mesa_is_gles3(ctx)) {
          goto invalid_pname_enum;
       }
-      else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_enum_to_string(pname));
-      }
       else if (att->Texture) {
          const struct gl_texture_image *texImage =
             _mesa_select_tex_image(att->Texture, att->Texture->Target,
@@ -3987,7 +4071,9 @@
                                       att->Renderbuffer->Format);
       }
       else {
-         _mesa_problem(ctx, "%s: invalid FBO attachment structure", caller);
+         assert(att->Type == GL_NONE);
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_enum_to_string(pname));
       }
       return;
    case GL_FRAMEBUFFER_ATTACHMENT_LAYERED:
@@ -4030,8 +4116,8 @@
       return;
    }
 
-   _mesa_get_framebuffer_attachment_parameter(ctx, buffer, attachment, pname,
-                                              params,
+   get_framebuffer_attachment_parameter(ctx, buffer, attachment, pname,
+                                        params,
                                     "glGetFramebufferAttachmentParameteriv");
 }
 
@@ -4060,8 +4146,8 @@
       buffer = ctx->WinSysDrawBuffer;
    }
 
-   _mesa_get_framebuffer_attachment_parameter(ctx, buffer, attachment, pname,
-                                              params,
+   get_framebuffer_attachment_parameter(ctx, buffer, attachment, pname,
+                                        params,
                               "glGetNamedFramebufferAttachmentParameteriv");
 }
 
diff --git a/src/mesa/main/fbobject.h b/src/mesa/main/fbobject.h
index 540bd9d..6e10c18 100644
--- a/src/mesa/main/fbobject.h
+++ b/src/mesa/main/fbobject.h
@@ -119,25 +119,23 @@
                           struct gl_framebuffer *fb,
                           const void *att);
 
+extern struct gl_renderbuffer_attachment *
+_mesa_get_and_validate_attachment(struct gl_context *ctx,
+                                  struct gl_framebuffer *fb,
+                                  GLenum attachment, const char *caller);
+
 extern void
 _mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb,
                           GLenum attachment,
+                          struct gl_renderbuffer_attachment *att,
                           struct gl_texture_object *texObj, GLenum textarget,
-                          GLint level, GLuint layer, GLboolean layered,
-                          const char *caller);
+                          GLint level, GLuint layer, GLboolean layered);
 
 extern GLenum
 _mesa_check_framebuffer_status(struct gl_context *ctx,
                                struct gl_framebuffer *fb);
 
 extern void
-_mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
-                                           struct gl_framebuffer *buffer,
-                                           GLenum attachment, GLenum pname,
-                                           GLint *params, const char *caller);
-
-
-extern void
 _mesa_bind_framebuffers(struct gl_context *ctx,
                         struct gl_framebuffer *newDrawFb,
                         struct gl_framebuffer *newReadFb);
@@ -154,9 +152,15 @@
 extern void GLAPIENTRY
 _mesa_DeleteRenderbuffers(GLsizei n, const GLuint *renderbuffers);
 
+void GLAPIENTRY
+_mesa_GenRenderbuffers_no_error(GLsizei n, GLuint *renderbuffers);
+
 extern void GLAPIENTRY
 _mesa_GenRenderbuffers(GLsizei n, GLuint *renderbuffers);
 
+void GLAPIENTRY
+_mesa_CreateRenderbuffers_no_error(GLsizei n, GLuint *renderbuffers);
+
 extern void GLAPIENTRY
 _mesa_CreateRenderbuffers(GLsizei n, GLuint *renderbuffers);
 
@@ -211,6 +215,9 @@
 extern void GLAPIENTRY
 _mesa_CreateFramebuffers(GLsizei n, GLuint *framebuffers);
 
+GLenum GLAPIENTRY
+_mesa_CheckFramebufferStatus_no_error(GLenum target);
+
 extern GLenum GLAPIENTRY
 _mesa_CheckFramebufferStatus(GLenum target);
 
@@ -218,31 +225,58 @@
 _mesa_CheckNamedFramebufferStatus(GLuint framebuffer, GLenum target);
 
 extern void GLAPIENTRY
+_mesa_FramebufferTexture1D_no_error(GLenum target, GLenum attachment,
+                                    GLenum textarget, GLuint texture,
+                                    GLint level);
+extern void GLAPIENTRY
 _mesa_FramebufferTexture1D(GLenum target, GLenum attachment,
                               GLenum textarget, GLuint texture, GLint level);
 
 extern void GLAPIENTRY
+_mesa_FramebufferTexture2D_no_error(GLenum target, GLenum attachment,
+                                    GLenum textarget, GLuint texture,
+                                    GLint level);
+extern void GLAPIENTRY
 _mesa_FramebufferTexture2D(GLenum target, GLenum attachment,
                               GLenum textarget, GLuint texture, GLint level);
 
 extern void GLAPIENTRY
+_mesa_FramebufferTexture3D_no_error(GLenum target, GLenum attachment,
+                                    GLenum textarget, GLuint texture,
+                                    GLint level, GLint layer);
+extern void GLAPIENTRY
 _mesa_FramebufferTexture3D(GLenum target, GLenum attachment,
                               GLenum textarget, GLuint texture,
                               GLint level, GLint layer);
 
 extern void GLAPIENTRY
+_mesa_FramebufferTextureLayer_no_error(GLenum target, GLenum attachment,
+                                       GLuint texture, GLint level,
+                                       GLint layer);
+extern void GLAPIENTRY
 _mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
                                  GLuint texture, GLint level, GLint layer);
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferTextureLayer_no_error(GLuint framebuffer,
+                                            GLenum attachment,
+                                            GLuint texture, GLint level,
+                                            GLint layer);
+extern void GLAPIENTRY
 _mesa_NamedFramebufferTextureLayer(GLuint framebuffer, GLenum attachment,
                                    GLuint texture, GLint level, GLint layer);
 
 extern void GLAPIENTRY
+_mesa_FramebufferTexture_no_error(GLenum target, GLenum attachment,
+                                  GLuint texture, GLint level);
+extern void GLAPIENTRY
 _mesa_FramebufferTexture(GLenum target, GLenum attachment,
                          GLuint texture, GLint level);
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferTexture_no_error(GLuint framebuffer, GLenum attachment,
+                                       GLuint texture, GLint level);
+extern void GLAPIENTRY
 _mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment,
                               GLuint texture, GLint level);
 
diff --git a/src/mesa/main/ff_fragment_shader.cpp b/src/mesa/main/ff_fragment_shader.cpp
index aac9de7..2b924f6 100644
--- a/src/mesa/main/ff_fragment_shader.cpp
+++ b/src/mesa/main/ff_fragment_shader.cpp
@@ -33,6 +33,7 @@
 #include "main/macros.h"
 #include "main/samplerobj.h"
 #include "main/shaderobj.h"
+#include "main/state.h"
 #include "main/texenvprogram.h"
 #include "main/texobj.h"
 #include "main/uniforms.h"
@@ -172,7 +173,7 @@
    /* _NEW_PROGRAM */
    const GLboolean vertexShader =
          ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX] != NULL;
-   const GLboolean vertexProgram = ctx->VertexProgram._Enabled;
+   const GLboolean vertexProgram = _mesa_arb_vertex_program_enabled(ctx);
 
    if (!(vertexProgram || vertexShader)) {
       /* Fixed function vertex logic */
diff --git a/src/mesa/main/format_fallback.py b/src/mesa/main/format_fallback.py
new file mode 100644
index 0000000..2f02d0d
--- /dev/null
+++ b/src/mesa/main/format_fallback.py
@@ -0,0 +1,172 @@
+COPYRIGHT = """\
+/*
+ * Copyright 2017 Google
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+"""
+
+# stdlib
+import argparse
+from sys import stdout
+from mako.template import Template
+
+# local
+import format_parser
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    for positional in ("csv", "out"):  # both are required positionals
+        parser.add_argument(positional)
+    return parser.parse_args()
+
+def get_unorm_to_srgb_map(formats):
+    """Yield (linear_name, srgb_name) for every sRGB format in formats."""
+    names = set(fmt.name for fmt in formats)
+
+    # (sRGB part, linear part) pairs; the first producing a known name wins.
+    replacements = [
+        ('SRGB', 'RGB'),
+        ('SRGB', 'UNORM'),
+        ('SRGB8_ALPHA8', 'RGBA'),
+        ('SRGB8_ALPHA8', 'RGBA8'),
+        ('SRGB_ALPHA_UNORM', 'RGBA_UNORM'),
+    ]
+
+    for fmt in formats:
+        if fmt.colorspace != 'srgb':
+            continue
+
+        found_unorm_name = False
+        for srgb_part, linear_part in replacements:
+            unorm_name = fmt.name.replace(srgb_part, linear_part)
+            if srgb_part in fmt.name and unorm_name in names:
+                yield unorm_name, fmt.name
+                found_unorm_name = True
+                break
+
+        # Every sRGB format MUST have a UNORM equivalent
+        assert found_unorm_name
+
+def get_rgbx_to_rgba_map(formats):
+    """Yield (rgbx_name, rgba_name) for RGBX formats with an RGBA twin."""
+    names = set(fmt.name for fmt in formats)
+
+    for fmt in formats:
+        if not (fmt.has_channel('r') and fmt.has_channel('x')):
+            continue
+
+        # The condition above will still let MESA_FORMAT_R9G9B9E5_FLOAT
+        # through.  We need to ensure it actually has an X in the name.
+        if 'X' not in fmt.name:
+            continue
+
+        # The RGBA twin is looked up purely by name: swap every X for A.
+        rgbx_name = fmt.name
+        rgba_name = rgbx_name.replace("X", "A")
+        if rgba_name in names:
+            yield rgbx_name, rgba_name
+
+TEMPLATE = Template(COPYRIGHT + """
+#include "formats.h"
+
+/**
+ * For an sRGB format, return the corresponding linear color space format.
+ * For non-sRGB formats, return the format as-is.
+ */
+mesa_format
+_mesa_get_srgb_format_linear(mesa_format format)
+{
+   switch (format) {
+%for unorm, srgb in unorm_to_srgb_map:
+   case ${srgb}:
+      return ${unorm};
+%endfor
+   default:
+      return format;
+   }
+}
+
+/**
+ * For a linear format, return the corresponding sRGB color space format.
+ * For an sRGB format, return the format as-is.
+ * Assert-fails if the format is not sRGB and does not have an sRGB equivalent.
+ */
+mesa_format
+_mesa_get_linear_format_srgb(mesa_format format)
+{
+   switch (format) {
+%for unorm, srgb in unorm_to_srgb_map:
+   case ${unorm}:
+      return ${srgb};
+%endfor
+%for unorm, srgb in unorm_to_srgb_map:
+   case ${srgb}:
+%endfor
+      return format;
+   default:
+      unreachable("Given format does not have an sRGB equivalent");
+   }
+}
+
+/**
+ * If the format has an unused X (padding) channel, and there exists an
+ * RGBA variant of the format with an identical bit layout, then return
+ * the RGBA format. Otherwise return the original format.
+ *
+ * Examples:
+ *    Fallback exists:
+ *       MESA_FORMAT_R8G8B8X8_UNORM -> MESA_FORMAT_R8G8B8A8_UNORM
+ *       MESA_FORMAT_RGBX_UNORM16 -> MESA_FORMAT_RGBA_UNORM16
+ *
+ *    No fallback:
+ *       MESA_FORMAT_R8G8B8A8_UNORM -> MESA_FORMAT_R8G8B8A8_UNORM
+ *       MESA_FORMAT_Z_FLOAT32 -> MESA_FORMAT_Z_FLOAT32
+ */
+mesa_format
+_mesa_format_fallback_rgbx_to_rgba(mesa_format format)
+{
+   switch (format) {
+%for rgbx, rgba in rgbx_to_rgba_map:
+   case ${rgbx}:
+      return ${rgba};
+%endfor
+   default:
+      return format;
+   }
+}
+""");
+
+def main():
+    """Render TEMPLATE from the parsed `csv` input and write it to `out`."""
+    args = parse_args()
+
+    parsed = list(format_parser.parse(args.csv))
+    unorm_srgb_pairs = list(get_unorm_to_srgb_map(parsed))
+    rgbx_rgba_pairs = list(get_rgbx_to_rgba_map(parsed))
+
+    # The output file is opened before the template is rendered.
+    with open(args.out, 'w') as out_file:
+        out_file.write(TEMPLATE.render(unorm_to_srgb_map=unorm_srgb_pairs,
+                                       rgbx_to_rgba_map=rgbx_rgba_pairs))
+
+if __name__ == "__main__":
+    main()
diff --git a/src/mesa/main/format_info.py b/src/mesa/main/format_info.py
index 780dc0b..b0308ef 100644
--- a/src/mesa/main/format_info.py
+++ b/src/mesa/main/format_info.py
@@ -165,34 +165,37 @@
   * manually or commit it into version control.
   */
 
-static struct gl_format_info format_info[MESA_FORMAT_COUNT] =
+static const struct gl_format_info format_info[MESA_FORMAT_COUNT] =
 {
 '''
 
+def format_channel_bits(fmat, tuple_list):
+   return ['.{0} = {1}'.format(field, get_channel_bits(fmat, name)) for field, name in tuple_list]
+
+
 for fmat in formats:
    print '   {'
-   print '      {0},'.format(fmat.name)
-   print '      "{0}",'.format(fmat.name)
-   print '      {0},'.format('MESA_FORMAT_LAYOUT_' + fmat.layout.upper())
-   print '      {0},'.format(get_gl_base_format(fmat))
-   print '      {0},'.format(get_gl_data_type(fmat))
+   print '      .Name = {0},'.format(fmat.name)
+   print '      .StrName = "{0}",'.format(fmat.name)
+   print '      .Layout = {0},'.format('MESA_FORMAT_LAYOUT_' + fmat.layout.upper())
+   print '      .BaseFormat = {0},'.format(get_gl_base_format(fmat))
+   print '      .DataType = {0},'.format(get_gl_data_type(fmat))
 
-   bits = [ get_channel_bits(fmat, name) for name in ['r', 'g', 'b', 'a']]
-   print '      {0},'.format(', '.join(map(str, bits)))
-   bits = [ get_channel_bits(fmat, name) for name in ['l', 'i', 'z', 's']]
-   print '      {0},'.format(', '.join(map(str, bits)))
+   bits = [('RedBits', 'r'), ('GreenBits', 'g'), ('BlueBits', 'b'), ('AlphaBits', 'a')]
+   print '      {0},'.format(', '.join(format_channel_bits(fmat, bits)))
+   bits = [('LuminanceBits', 'l'), ('IntensityBits', 'i'), ('DepthBits', 'z'), ('StencilBits', 's')]
+   print '      {0},'.format(', '.join(format_channel_bits(fmat, bits)))
 
-   print '      {0:d},'.format(fmat.colorspace == 'srgb')
+   print '      .IsSRGBFormat = {0:d},'.format(fmat.colorspace == 'srgb')
 
-   print '      {0}, {1}, {2}, {3},'.format(fmat.block_width, fmat.block_height,
-                                            fmat.block_depth,
-                                            int(fmat.block_size() / 8))
+   print '      .BlockWidth = {0}, .BlockHeight = {1}, .BlockDepth = {2},'.format(fmat.block_width, fmat.block_height, fmat.block_depth)
+   print '      .BytesPerBlock = {0},'.format(int(fmat.block_size() / 8))
 
-   print '      {{ {0} }},'.format(', '.join(map(str, fmat.swizzle)))
+   print '      .Swizzle = {{ {0} }},'.format(', '.join(map(str, fmat.swizzle)))
    if fmat.is_array():
       chan = fmat.array_element()
       norm = chan.norm or chan.type == parser.FLOAT
-      print '      MESA_ARRAY_FORMAT({0}),'.format(', '.join([
+      print '      .ArrayFormat = MESA_ARRAY_FORMAT({0}),'.format(', '.join([
          str(chan.size / 8),
          str(int(chan.sign)),
          str(int(chan.type == parser.FLOAT)),
@@ -204,7 +207,7 @@
          str(fmat.swizzle[3]),
       ]))
    else:
-      print '      0,'
+      print '      .ArrayFormat = 0,'
    print '   },'
 
 print '};'
diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index 9d9830f..5c29d37 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -622,123 +622,6 @@
 
 
 /**
- * For an sRGB format, return the corresponding linear color space format.
- * For non-sRGB formats, return the format as-is.
- */
-mesa_format
-_mesa_get_srgb_format_linear(mesa_format format)
-{
-   switch (format) {
-   case MESA_FORMAT_BGR_SRGB8:
-      format = MESA_FORMAT_BGR_UNORM8;
-      break;
-   case MESA_FORMAT_A8B8G8R8_SRGB:
-      format = MESA_FORMAT_A8B8G8R8_UNORM;
-      break;
-   case MESA_FORMAT_B8G8R8A8_SRGB:
-      format = MESA_FORMAT_B8G8R8A8_UNORM;
-      break;
-   case MESA_FORMAT_A8R8G8B8_SRGB:
-      format = MESA_FORMAT_A8R8G8B8_UNORM;
-      break;
-   case MESA_FORMAT_R8G8B8A8_SRGB:
-      format = MESA_FORMAT_R8G8B8A8_UNORM;
-      break;
-   case MESA_FORMAT_L_SRGB8:
-      format = MESA_FORMAT_L_UNORM8;
-      break;
-   case MESA_FORMAT_L8A8_SRGB:
-      format = MESA_FORMAT_L8A8_UNORM;
-      break;
-   case MESA_FORMAT_A8L8_SRGB:
-      format = MESA_FORMAT_A8L8_UNORM;
-      break;
-   case MESA_FORMAT_SRGB_DXT1:
-      format = MESA_FORMAT_RGB_DXT1;
-      break;
-   case MESA_FORMAT_SRGBA_DXT1:
-      format = MESA_FORMAT_RGBA_DXT1;
-      break;
-   case MESA_FORMAT_SRGBA_DXT3:
-      format = MESA_FORMAT_RGBA_DXT3;
-      break;
-   case MESA_FORMAT_SRGBA_DXT5:
-      format = MESA_FORMAT_RGBA_DXT5;
-      break;
-   case MESA_FORMAT_R8G8B8X8_SRGB:
-      format = MESA_FORMAT_R8G8B8X8_UNORM;
-      break;
-   case MESA_FORMAT_X8B8G8R8_SRGB:
-      format = MESA_FORMAT_X8B8G8R8_UNORM;
-      break;
-   case MESA_FORMAT_ETC2_SRGB8:
-      format = MESA_FORMAT_ETC2_RGB8;
-      break;
-   case MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC:
-      format = MESA_FORMAT_ETC2_RGBA8_EAC;
-      break;
-   case MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1:
-      format = MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1;
-      break;
-   case MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM:
-      format = MESA_FORMAT_BPTC_RGBA_UNORM;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4:
-      format = MESA_FORMAT_RGBA_ASTC_4x4;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4:
-      format = MESA_FORMAT_RGBA_ASTC_5x4;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5:
-      format = MESA_FORMAT_RGBA_ASTC_5x5;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5:
-      format = MESA_FORMAT_RGBA_ASTC_6x5;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6:
-      format = MESA_FORMAT_RGBA_ASTC_6x6;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5:
-      format = MESA_FORMAT_RGBA_ASTC_8x5;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6:
-      format = MESA_FORMAT_RGBA_ASTC_8x6;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8:
-      format = MESA_FORMAT_RGBA_ASTC_8x8;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5:
-      format = MESA_FORMAT_RGBA_ASTC_10x5;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6:
-      format = MESA_FORMAT_RGBA_ASTC_10x6;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8:
-      format = MESA_FORMAT_RGBA_ASTC_10x8;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10:
-      format = MESA_FORMAT_RGBA_ASTC_10x10;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10:
-      format = MESA_FORMAT_RGBA_ASTC_12x10;
-      break;
-   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12:
-      format = MESA_FORMAT_RGBA_ASTC_12x12;
-      break;
-   case MESA_FORMAT_B8G8R8X8_SRGB:
-      format = MESA_FORMAT_B8G8R8X8_UNORM;
-      break;
-   case MESA_FORMAT_X8R8G8B8_SRGB:
-      format = MESA_FORMAT_X8R8G8B8_UNORM;
-      break;
-   default:
-      break;
-   }
-   return format;
-}
-
-
-/**
  * If the given format is a compressed format, return a corresponding
  * uncompressed format.
  */
diff --git a/src/mesa/main/formats.h b/src/mesa/main/formats.h
index b88466f..fbcbe36 100644
--- a/src/mesa/main/formats.h
+++ b/src/mesa/main/formats.h
@@ -749,6 +749,9 @@
 _mesa_get_srgb_format_linear(mesa_format format);
 
 extern mesa_format
+_mesa_get_linear_format_srgb(mesa_format format);
+
+extern mesa_format
 _mesa_get_uncompressed_format(mesa_format format);
 
 extern GLuint
@@ -762,6 +765,9 @@
 				     GLenum format, GLenum type,
 				     GLboolean swapBytes, GLenum *error);
 
+mesa_format
+_mesa_format_fallback_rgbx_to_rgba(mesa_format format);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index 9002020..039762a 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -251,7 +251,7 @@
       oldFb->RefCount--;
       deleteFlag = (oldFb->RefCount == 0);
       mtx_unlock(&oldFb->Mutex);
-      
+
       if (deleteFlag)
          oldFb->Delete(oldFb);
 
@@ -321,43 +321,6 @@
 }
 
 /**
- * Examine all the framebuffer's renderbuffers to update the Width/Height
- * fields of the framebuffer.  If we have renderbuffers with different
- * sizes, set the framebuffer's width and height to the min size.
- * Note: this is only intended for user-created framebuffers, not
- * window-system framebuffes.
- */
-static void
-update_framebuffer_size(struct gl_context *ctx, struct gl_framebuffer *fb)
-{
-   GLuint minWidth = ~0, minHeight = ~0;
-   GLuint i;
-
-   /* user-created framebuffers only */
-   assert(_mesa_is_user_fbo(fb));
-
-   for (i = 0; i < BUFFER_COUNT; i++) {
-      struct gl_renderbuffer_attachment *att = &fb->Attachment[i];
-      const struct gl_renderbuffer *rb = att->Renderbuffer;
-      if (rb) {
-         minWidth = MIN2(minWidth, rb->Width);
-         minHeight = MIN2(minHeight, rb->Height);
-      }
-   }
-
-   if (minWidth != ~0U) {
-      fb->Width = minWidth;
-      fb->Height = minHeight;
-   }
-   else {
-      fb->Width = 0;
-      fb->Height = 0;
-   }
-}
-
-
-
-/**
  * Given a bounding box, intersect the bounding box with the scissor of
  * a specified vieport.
  *
@@ -403,14 +366,14 @@
  *                xmax, ymin, ymax.
  *
  * \warning This function assumes that the framebuffer dimensions are up to
- * date (e.g., update_framebuffer_size has been recently called on \c buffer).
+ * date.
  *
  * \sa _mesa_clip_to_region
  */
-void
-_mesa_scissor_bounding_box(const struct gl_context *ctx,
-                           const struct gl_framebuffer *buffer,
-                           unsigned idx, int *bbox)
+static void
+scissor_bounding_box(const struct gl_context *ctx,
+                     const struct gl_framebuffer *buffer,
+                     unsigned idx, int *bbox)
 {
    bbox[0] = 0;
    bbox[2] = 0;
@@ -438,13 +401,8 @@
    if (!buffer)
       return;
 
-   if (_mesa_is_user_fbo(buffer)) {
-      /* user-created framebuffer size depends on the renderbuffers */
-      update_framebuffer_size(ctx, buffer);
-   }
-
    /* Default to the first scissor as that's always valid */
-   _mesa_scissor_bounding_box(ctx, buffer, 0, bbox);
+   scissor_bounding_box(ctx, buffer, 0, bbox);
    buffer->_Xmin = bbox[0];
    buffer->_Ymin = bbox[2];
    buffer->_Xmax = bbox[1];
@@ -474,13 +432,6 @@
    memset(&fb->Visual, 0, sizeof(fb->Visual));
    fb->Visual.rgbMode = GL_TRUE; /* assume this */
 
-#if 0 /* this _might_ be needed */
-   if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
-      /* leave visual fields zero'd */
-      return;
-   }
-#endif
-
    /* find first RGB renderbuffer */
    for (i = 0; i < BUFFER_COUNT; i++) {
       if (fb->Attachment[i].Renderbuffer) {
@@ -631,7 +582,7 @@
 update_color_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb)
 {
    (void) ctx;
-   if (fb->_ColorReadBufferIndex == -1 ||
+   if (fb->_ColorReadBufferIndex == BUFFER_NONE ||
        fb->DeletePending ||
        fb->Width == 0 ||
        fb->Height == 0) {
diff --git a/src/mesa/main/framebuffer.h b/src/mesa/main/framebuffer.h
index ee0690b..bc6e7bc 100644
--- a/src/mesa/main/framebuffer.h
+++ b/src/mesa/main/framebuffer.h
@@ -72,10 +72,6 @@
 _mesa_resizebuffers( struct gl_context *ctx );
 
 extern void
-_mesa_scissor_bounding_box(const struct gl_context *ctx,
-                           const struct gl_framebuffer *buffer,
-                           unsigned idx, int *bbox);
-extern void
 _mesa_intersect_scissor_bounding_box(const struct gl_context *ctx,
                                      unsigned idx, int *bbox);
 
diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c
index 6021c02..be49136 100644
--- a/src/mesa/main/genmipmap.c
+++ b/src/mesa/main/genmipmap.c
@@ -107,10 +107,10 @@
  * Implements glGenerateMipmap and glGenerateTextureMipmap.
  * Generates all the mipmap levels below the base level.
  */
-void
-_mesa_generate_texture_mipmap(struct gl_context *ctx,
-                              struct gl_texture_object *texObj, GLenum target,
-                              bool dsa)
+static void
+generate_texture_mipmap(struct gl_context *ctx,
+                        struct gl_texture_object *texObj, GLenum target,
+                        bool dsa)
 {
    struct gl_texture_image *srcImage;
    const char *suffix = dsa ? "Texture" : "";
@@ -187,7 +187,7 @@
    if (!texObj)
       return;
 
-   _mesa_generate_texture_mipmap(ctx, texObj, target, false);
+   generate_texture_mipmap(ctx, texObj, target, false);
 }
 
 /**
@@ -209,5 +209,5 @@
       return;
    }
 
-   _mesa_generate_texture_mipmap(ctx, texObj, texObj->Target, true);
+   generate_texture_mipmap(ctx, texObj, texObj->Target, true);
 }
diff --git a/src/mesa/main/genmipmap.h b/src/mesa/main/genmipmap.h
index 40b7f36..94f7f7a 100644
--- a/src/mesa/main/genmipmap.h
+++ b/src/mesa/main/genmipmap.h
@@ -28,10 +28,6 @@
 
 #include "glheader.h"
 
-extern void
-_mesa_generate_texture_mipmap(struct gl_context *ctx,
-                              struct gl_texture_object *texObj, GLenum target,
-                              bool dsa);
 bool
 _mesa_is_valid_generate_texture_mipmap_target(struct gl_context *ctx,
                                               GLenum target);
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index cf3ee63..68f520f 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -147,7 +147,7 @@
    EXTRA_API_ES3,
    EXTRA_API_ES31,
    EXTRA_API_ES32,
-   EXTRA_NEW_BUFFERS, 
+   EXTRA_NEW_BUFFERS,
    EXTRA_NEW_FRAG_CLAMP,
    EXTRA_VALID_DRAW_BUFFER,
    EXTRA_VALID_TEXTURE_UNIT,
@@ -162,6 +162,7 @@
    EXTRA_EXT_SSBO_GS,
    EXTRA_EXT_FB_NO_ATTACH_GS,
    EXTRA_EXT_ES_GS,
+   EXTRA_EXT_PROVOKING_VERTEX_32,
 };
 
 #define NO_EXTRA NULL
@@ -490,7 +491,6 @@
 EXTRA_EXT(EXT_polygon_offset_clamp);
 EXTRA_EXT(ARB_framebuffer_no_attachments);
 EXTRA_EXT(ARB_tessellation_shader);
-EXTRA_EXT(ARB_shader_subroutine);
 EXTRA_EXT(ARB_shader_storage_buffer_object);
 EXTRA_EXT(ARB_indirect_parameters);
 EXTRA_EXT(ATI_meminfo);
@@ -574,6 +574,12 @@
    EXTRA_END
 };
 
+static const int extra_EXT_provoking_vertex_32[] = {
+   EXTRA_EXT_PROVOKING_VERTEX_32,
+   EXTRA_END
+};
+
+
 /* This is the big table describing all the enums we accept in
  * glGet*v().  The table is partitioned into six parts: enums
  * understood by all GL APIs (OpenGL, GLES and GLES2), enums shared
@@ -731,15 +737,15 @@
 
    case GL_LIST_INDEX:
       v->value_int =
-	 ctx->ListState.CurrentList ? ctx->ListState.CurrentList->Name : 0;
+         ctx->ListState.CurrentList ? ctx->ListState.CurrentList->Name : 0;
       break;
    case GL_LIST_MODE:
       if (!ctx->CompileFlag)
-	 v->value_enum = 0;
+         v->value_enum = 0;
       else if (ctx->ExecuteFlag)
-	 v->value_enum = GL_COMPILE_AND_EXECUTE;
+         v->value_enum = GL_COMPILE_AND_EXECUTE;
       else
-	 v->value_enum = GL_COMPILE;
+         v->value_enum = GL_COMPILE;
       break;
 
    case GL_VIEWPORT:
@@ -806,8 +812,8 @@
       v->value_int = _mesa_get_compressed_formats(ctx, NULL);
       break;
    case GL_COMPRESSED_TEXTURE_FORMATS_ARB:
-      v->value_int_n.n = 
-	 _mesa_get_compressed_formats(ctx, v->value_int_n.ints);
+      v->value_int_n.n =
+         _mesa_get_compressed_formats(ctx, v->value_int_n.ints);
       assert(v->value_int_n.n <= (int) ARRAY_SIZE(v->value_int_n.ints));
       break;
 
@@ -830,7 +836,7 @@
    case GL_TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY:
       unit = ctx->Texture.CurrentUnit;
       v->value_int =
-	 ctx->Texture.Unit[unit].CurrentTex[d->offset]->Name;
+         ctx->Texture.Unit[unit].CurrentTex[d->offset]->Name;
       break;
 
    /* GL_EXT_packed_float */
@@ -881,7 +887,7 @@
    case GL_SECONDARY_COLOR_ARRAY_BUFFER_BINDING_ARB:
    case GL_FOG_COORDINATE_ARRAY_BUFFER_BINDING_ARB:
       buffer_obj = (struct gl_buffer_object **)
-	 ((char *) ctx->Array.VAO + d->offset);
+         ((char *) ctx->Array.VAO + d->offset);
       v->value_int = (*buffer_obj)->Name;
       break;
    case GL_ARRAY_BUFFER_BINDING_ARB:
@@ -889,7 +895,7 @@
       break;
    case GL_TEXTURE_COORD_ARRAY_BUFFER_BINDING_ARB:
       v->value_int =
-	 ctx->Array.VAO->BufferBinding[VERT_ATTRIB_TEX(ctx->Array.ActiveTexture)].BufferObj->Name;
+         ctx->Array.VAO->BufferBinding[VERT_ATTRIB_TEX(ctx->Array.ActiveTexture)].BufferObj->Name;
       break;
    case GL_ELEMENT_ARRAY_BUFFER_BINDING_ARB:
       v->value_int = ctx->Array.VAO->IndexBufferObj->Name;
@@ -945,14 +951,14 @@
        *                         the last program set by UseProgram (bug 7822).
        */
       v->value_int =
-	 ctx->Shader.ActiveProgram ? ctx->Shader.ActiveProgram->Name : 0;
+         ctx->Shader.ActiveProgram ? ctx->Shader.ActiveProgram->Name : 0;
       break;
    case GL_READ_FRAMEBUFFER_BINDING_EXT:
       v->value_int = ctx->ReadBuffer->Name;
       break;
    case GL_RENDERBUFFER_BINDING_EXT:
       v->value_int =
-	 ctx->CurrentRenderbuffer ? ctx->CurrentRenderbuffer->Name : 0;
+         ctx->CurrentRenderbuffer ? ctx->CurrentRenderbuffer->Name : 0;
       break;
    case GL_POINT_SIZE_ARRAY_BUFFER_BINDING_OES:
       v->value_int = ctx->Array.VAO->BufferBinding[VERT_ATTRIB_POINT_SIZE].BufferObj->Name;
@@ -1170,17 +1176,17 @@
          api_check = GL_TRUE;
          if (version >= 30)
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_VERSION_31:
          api_check = GL_TRUE;
          if (version >= 31)
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_VERSION_32:
          api_check = GL_TRUE;
          if (version >= 32)
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_NEW_FRAG_CLAMP:
          if (ctx->NewState & (_NEW_BUFFERS | _NEW_FRAG_CLAMP))
             _mesa_update_state(ctx);
@@ -1189,65 +1195,65 @@
          api_check = GL_TRUE;
          if (ctx->API == API_OPENGLES2)
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_API_ES3:
          api_check = GL_TRUE;
          if (_mesa_is_gles3(ctx))
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_API_ES31:
          api_check = GL_TRUE;
          if (_mesa_is_gles31(ctx))
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_API_ES32:
          api_check = GL_TRUE;
          if (_mesa_is_gles32(ctx))
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_API_GL:
          api_check = GL_TRUE;
          if (_mesa_is_desktop_gl(ctx))
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_API_GL_CORE:
          api_check = GL_TRUE;
          if (ctx->API == API_OPENGL_CORE)
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_NEW_BUFFERS:
-	 if (ctx->NewState & _NEW_BUFFERS)
-	    _mesa_update_state(ctx);
-	 break;
+         if (ctx->NewState & _NEW_BUFFERS)
+            _mesa_update_state(ctx);
+         break;
       case EXTRA_FLUSH_CURRENT:
-	 FLUSH_CURRENT(ctx, 0);
-	 break;
+         FLUSH_CURRENT(ctx, 0);
+         break;
       case EXTRA_VALID_DRAW_BUFFER:
-	 if (d->pname - GL_DRAW_BUFFER0_ARB >= ctx->Const.MaxDrawBuffers) {
-	    _mesa_error(ctx, GL_INVALID_OPERATION, "%s(draw buffer %u)",
-			func, d->pname - GL_DRAW_BUFFER0_ARB);
-	    return GL_FALSE;
-	 }
-	 break;
+         if (d->pname - GL_DRAW_BUFFER0_ARB >= ctx->Const.MaxDrawBuffers) {
+            _mesa_error(ctx, GL_INVALID_OPERATION, "%s(draw buffer %u)",
+                        func, d->pname - GL_DRAW_BUFFER0_ARB);
+            return GL_FALSE;
+         }
+         break;
       case EXTRA_VALID_TEXTURE_UNIT:
-	 if (ctx->Texture.CurrentUnit >= ctx->Const.MaxTextureCoordUnits) {
-	    _mesa_error(ctx, GL_INVALID_OPERATION, "%s(texture %u)",
-			func, ctx->Texture.CurrentUnit);
-	    return GL_FALSE;
-	 }
-	 break;
+         if (ctx->Texture.CurrentUnit >= ctx->Const.MaxTextureCoordUnits) {
+            _mesa_error(ctx, GL_INVALID_OPERATION, "%s(texture %u)",
+                        func, ctx->Texture.CurrentUnit);
+            return GL_FALSE;
+         }
+         break;
       case EXTRA_VALID_CLIP_DISTANCE:
-	 if (d->pname - GL_CLIP_DISTANCE0 >= ctx->Const.MaxClipPlanes) {
-	    _mesa_error(ctx, GL_INVALID_ENUM, "%s(clip distance %u)",
-			func, d->pname - GL_CLIP_DISTANCE0);
-	    return GL_FALSE;
-	 }
-	 break;
+         if (d->pname - GL_CLIP_DISTANCE0 >= ctx->Const.MaxClipPlanes) {
+            _mesa_error(ctx, GL_INVALID_ENUM, "%s(clip distance %u)",
+                        func, d->pname - GL_CLIP_DISTANCE0);
+            return GL_FALSE;
+         }
+         break;
       case EXTRA_GLSL_130:
          api_check = GL_TRUE;
          if (ctx->Const.GLSLVersion >= 130)
             api_found = GL_TRUE;
-	 break;
+         break;
       case EXTRA_EXT_UBO_GS:
          api_check = GL_TRUE;
          if (ctx->Extensions.ARB_uniform_buffer_object &&
@@ -1294,13 +1300,18 @@
          if (_mesa_has_OES_geometry_shader(ctx))
             api_found = GL_TRUE;
          break;
+      case EXTRA_EXT_PROVOKING_VERTEX_32:
+         api_check = TRUE;
+         if (ctx->API == API_OPENGL_COMPAT || version == 32)
+            api_found = ctx->Extensions.EXT_provoking_vertex;
+         break;
       case EXTRA_END:
-	 break;
+         break;
       default: /* *e is a offset into the extension struct */
-	 api_check = GL_TRUE;
-	 if (*(GLboolean *) ((char *) &ctx->Extensions + *e))
-	    api_found = GL_TRUE;
-	 break;
+         api_check = GL_TRUE;
+         if (*(GLboolean *) ((char *) &ctx->Extensions + *e))
+            api_found = GL_TRUE;
+         break;
       }
    }
 
@@ -1318,7 +1329,7 @@
 
 /**
  * Find the struct value_desc corresponding to the enum 'pname'.
- * 
+ *
  * We hash the enum value to get an index into the 'table' array,
  * which holds the index in the 'values' array of struct value_desc.
  * Once we've found the entry, we do the extra checks, if any, then
@@ -1327,7 +1338,7 @@
  * If the value has to be computed (for example, it's the result of a
  * function call or we need to add 1 to it), we use the tmp 'v' to
  * store the result.
- * 
+ *
  * \param func name of glGet*v() func for error reporting
  * \param pname the enum value we're looking up
  * \param p is were we return the pointer to the value
@@ -1480,7 +1491,7 @@
 
    case TYPE_INT_N:
       for (i = 0; i < v.value_int_n.n; i++)
-	 params[i] = INT_TO_BOOLEAN(v.value_int_n.ints[i]);
+         params[i] = INT_TO_BOOLEAN(v.value_int_n.ints[i]);
       break;
 
    case TYPE_INT64:
@@ -1489,18 +1500,18 @@
 
    case TYPE_BOOLEAN:
       params[0] = ((GLboolean*) p)[0];
-      break;		
+      break;
 
    case TYPE_MATRIX:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = FLOAT_TO_BOOLEAN(m->m[i]);
+         params[i] = FLOAT_TO_BOOLEAN(m->m[i]);
       break;
 
    case TYPE_MATRIX_T:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = FLOAT_TO_BOOLEAN(m->m[transpose[i]]);
+         params[i] = FLOAT_TO_BOOLEAN(m->m[transpose[i]]);
       break;
 
    case TYPE_BIT_0:
@@ -1573,7 +1584,7 @@
 
    case TYPE_INT_N:
       for (i = 0; i < v.value_int_n.n; i++)
-	 params[i] = (GLfloat) v.value_int_n.ints[i];
+         params[i] = (GLfloat) v.value_int_n.ints[i];
       break;
 
    case TYPE_UINT_4:
@@ -1592,18 +1603,18 @@
 
    case TYPE_BOOLEAN:
       params[0] = BOOLEAN_TO_FLOAT(*(GLboolean*) p);
-      break;		
+      break;
 
    case TYPE_MATRIX:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = m->m[i];
+         params[i] = m->m[i];
       break;
 
    case TYPE_MATRIX_T:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = m->m[transpose[i]];
+         params[i] = m->m[transpose[i]];
       break;
 
    case TYPE_BIT_0:
@@ -1686,7 +1697,7 @@
 
    case TYPE_INT_N:
       for (i = 0; i < v.value_int_n.n; i++)
-	 params[i] = v.value_int_n.ints[i];
+         params[i] = v.value_int_n.ints[i];
       break;
 
    case TYPE_INT64:
@@ -1695,18 +1706,18 @@
 
    case TYPE_BOOLEAN:
       params[0] = BOOLEAN_TO_INT(*(GLboolean*) p);
-      break;		
+      break;
 
    case TYPE_MATRIX:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = FLOAT_TO_INT(m->m[i]);
+         params[i] = FLOAT_TO_INT(m->m[i]);
       break;
 
    case TYPE_MATRIX_T:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = FLOAT_TO_INT(m->m[transpose[i]]);
+         params[i] = FLOAT_TO_INT(m->m[transpose[i]]);
       break;
 
    case TYPE_BIT_0:
@@ -1785,7 +1796,7 @@
 
    case TYPE_INT_N:
       for (i = 0; i < v.value_int_n.n; i++)
-	 params[i] = INT_TO_BOOLEAN(v.value_int_n.ints[i]);
+         params[i] = INT_TO_BOOLEAN(v.value_int_n.ints[i]);
       break;
 
    case TYPE_UINT_4:
@@ -1804,18 +1815,18 @@
 
    case TYPE_BOOLEAN:
       params[0] = ((GLboolean*) p)[0];
-      break;		
+      break;
 
    case TYPE_MATRIX:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = FLOAT_TO_INT64(m->m[i]);
+         params[i] = FLOAT_TO_INT64(m->m[i]);
       break;
 
    case TYPE_MATRIX_T:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = FLOAT_TO_INT64(m->m[transpose[i]]);
+         params[i] = FLOAT_TO_INT64(m->m[transpose[i]]);
       break;
 
    case TYPE_BIT_0:
@@ -1888,7 +1899,7 @@
 
    case TYPE_INT_N:
       for (i = 0; i < v.value_int_n.n; i++)
-	 params[i] = v.value_int_n.ints[i];
+         params[i] = v.value_int_n.ints[i];
       break;
 
    case TYPE_UINT_4:
@@ -1907,18 +1918,18 @@
 
    case TYPE_BOOLEAN:
       params[0] = *(GLboolean*) p;
-      break;		
+      break;
 
    case TYPE_MATRIX:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = m->m[i];
+         params[i] = m->m[i];
       break;
 
    case TYPE_MATRIX_T:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = m->m[transpose[i]];
+         params[i] = m->m[transpose[i]];
       break;
 
    case TYPE_BIT_0:
@@ -1990,9 +2001,9 @@
 
    case GL_BLEND:
       if (index >= ctx->Const.MaxDrawBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.EXT_draw_buffers2)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = (ctx->Color.BlendEnabled >> index) & 1;
       return TYPE_INT;
 
@@ -2000,54 +2011,54 @@
       /* fall-through */
    case GL_BLEND_SRC_RGB:
       if (index >= ctx->Const.MaxDrawBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.ARB_draw_buffers_blend)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->Color.Blend[index].SrcRGB;
       return TYPE_INT;
    case GL_BLEND_SRC_ALPHA:
       if (index >= ctx->Const.MaxDrawBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.ARB_draw_buffers_blend)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->Color.Blend[index].SrcA;
       return TYPE_INT;
    case GL_BLEND_DST:
       /* fall-through */
    case GL_BLEND_DST_RGB:
       if (index >= ctx->Const.MaxDrawBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.ARB_draw_buffers_blend)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->Color.Blend[index].DstRGB;
       return TYPE_INT;
    case GL_BLEND_DST_ALPHA:
       if (index >= ctx->Const.MaxDrawBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.ARB_draw_buffers_blend)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->Color.Blend[index].DstA;
       return TYPE_INT;
    case GL_BLEND_EQUATION_RGB:
       if (index >= ctx->Const.MaxDrawBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.ARB_draw_buffers_blend)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->Color.Blend[index].EquationRGB;
       return TYPE_INT;
    case GL_BLEND_EQUATION_ALPHA:
       if (index >= ctx->Const.MaxDrawBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.ARB_draw_buffers_blend)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->Color.Blend[index].EquationA;
       return TYPE_INT;
 
    case GL_COLOR_WRITEMASK:
       if (index >= ctx->Const.MaxDrawBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.EXT_draw_buffers2)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int_4[0] = ctx->Color.ColorMask[index][RCOMP] ? 1 : 0;
       v->value_int_4[1] = ctx->Color.ColorMask[index][GCOMP] ? 1 : 0;
       v->value_int_4[2] = ctx->Color.ColorMask[index][BCOMP] ? 1 : 0;
@@ -2092,51 +2103,51 @@
 
    case GL_TRANSFORM_FEEDBACK_BUFFER_START:
       if (index >= ctx->Const.MaxTransformFeedbackBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.EXT_transform_feedback)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int64 = ctx->TransformFeedback.CurrentObject->Offset[index];
       return TYPE_INT64;
 
    case GL_TRANSFORM_FEEDBACK_BUFFER_SIZE:
       if (index >= ctx->Const.MaxTransformFeedbackBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.EXT_transform_feedback)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int64
          = ctx->TransformFeedback.CurrentObject->RequestedSize[index];
       return TYPE_INT64;
 
    case GL_TRANSFORM_FEEDBACK_BUFFER_BINDING:
       if (index >= ctx->Const.MaxTransformFeedbackBuffers)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.EXT_transform_feedback)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->TransformFeedback.CurrentObject->BufferNames[index];
       return TYPE_INT;
 
    case GL_UNIFORM_BUFFER_BINDING:
       if (index >= ctx->Const.MaxUniformBufferBindings)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.ARB_uniform_buffer_object)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->UniformBufferBindings[index].BufferObject->Name;
       return TYPE_INT;
 
    case GL_UNIFORM_BUFFER_START:
       if (index >= ctx->Const.MaxUniformBufferBindings)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.ARB_uniform_buffer_object)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->UniformBufferBindings[index].Offset < 0 ? 0 :
                      ctx->UniformBufferBindings[index].Offset;
       return TYPE_INT;
 
    case GL_UNIFORM_BUFFER_SIZE:
       if (index >= ctx->Const.MaxUniformBufferBindings)
-	 goto invalid_value;
+         goto invalid_value;
       if (!ctx->Extensions.ARB_uniform_buffer_object)
-	 goto invalid_enum;
+         goto invalid_enum;
       v->value_int = ctx->UniformBufferBindings[index].Size < 0 ? 0 :
                      ctx->UniformBufferBindings[index].Size;
       return TYPE_INT;
@@ -2520,7 +2531,7 @@
 
    case TYPE_INT_N:
       for (i = 0; i < v.value_int_n.n; i++)
-	 params[i] = (GLfloat) v.value_int_n.ints[i];
+         params[i] = (GLfloat) v.value_int_n.ints[i];
       break;
 
    case TYPE_UINT_4:
@@ -2544,13 +2555,13 @@
    case TYPE_MATRIX:
       m = *(GLmatrix **) &v;
       for (i = 0; i < 16; i++)
-	 params[i] = m->m[i];
+         params[i] = m->m[i];
       break;
 
    case TYPE_MATRIX_T:
       m = *(GLmatrix **) &v;
       for (i = 0; i < 16; i++)
-	 params[i] = m->m[transpose[i]];
+         params[i] = m->m[transpose[i]];
       break;
 
    default:
@@ -2602,7 +2613,7 @@
 
    case TYPE_INT_N:
       for (i = 0; i < v.value_int_n.n; i++)
-	 params[i] = (GLdouble) v.value_int_n.ints[i];
+         params[i] = (GLdouble) v.value_int_n.ints[i];
       break;
 
    case TYPE_UINT_4:
@@ -2626,13 +2637,13 @@
    case TYPE_MATRIX:
       m = *(GLmatrix **) &v;
       for (i = 0; i < 16; i++)
-	 params[i] = (GLdouble) m->m[i];
+         params[i] = (GLdouble) m->m[i];
       break;
 
    case TYPE_MATRIX_T:
       m = *(GLmatrix **) &v;
       for (i = 0; i < 16; i++)
-	 params[i] = (GLdouble) m->m[transpose[i]];
+         params[i] = (GLdouble) m->m[transpose[i]];
       break;
 
    default:
@@ -2695,7 +2706,7 @@
 
    case TYPE_INT_N:
       for (i = 0; i < v.value_int_n.n; i++)
-	 params[i] = INT_TO_FIXED(v.value_int_n.ints[i]);
+         params[i] = INT_TO_FIXED(v.value_int_n.ints[i]);
       break;
 
    case TYPE_INT64:
@@ -2704,18 +2715,18 @@
 
    case TYPE_BOOLEAN:
       params[0] = BOOLEAN_TO_FIXED(((GLboolean*) p)[0]);
-      break;		
+      break;
 
    case TYPE_MATRIX:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = FLOAT_TO_FIXED(m->m[i]);
+         params[i] = FLOAT_TO_FIXED(m->m[i]);
       break;
 
    case TYPE_MATRIX_T:
       m = *(GLmatrix **) p;
       for (i = 0; i < 16; i++)
-	 params[i] = FLOAT_TO_FIXED(m->m[transpose[i]]);
+         params[i] = FLOAT_TO_FIXED(m->m[transpose[i]]);
       break;
 
    case TYPE_BIT_0:
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index f6ffb4c..9d67ca4 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -119,8 +119,8 @@
 # GLSL:
   [ "MAX_CLIP_PLANES", "CONTEXT_INT(Const.MaxClipPlanes), NO_EXTRA" ],
 
-# GL_{APPLE,ARB,OES}_vertex_array_object
-  [ "VERTEX_ARRAY_BINDING_APPLE", "ARRAY_INT(Name), NO_EXTRA" ],
+# GL_{ARB,OES}_vertex_array_object
+  [ "VERTEX_ARRAY_BINDING", "ARRAY_INT(Name), NO_EXTRA" ],
 
 # GL_EXT_texture_filter_anisotropic
   [ "MAX_TEXTURE_MAX_ANISOTROPY_EXT", "CONTEXT_FLOAT(Const.MaxTextureMaxAnisotropy), extra_EXT_texture_filter_anisotropic" ],
@@ -888,7 +888,7 @@
 
 # GL_EXT_provoking_vertex
   [ "PROVOKING_VERTEX_EXT", "CONTEXT_ENUM(Light.ProvokingVertex), extra_EXT_provoking_vertex" ],
-  [ "QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION_EXT", "CONTEXT_BOOL(Const.QuadsFollowProvokingVertexConvention), extra_EXT_provoking_vertex" ],
+  [ "QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION_EXT", "CONTEXT_BOOL(Const.QuadsFollowProvokingVertexConvention), extra_EXT_provoking_vertex_32" ],
 
 # GL_ARB_seamless_cube_map
   [ "TEXTURE_CUBE_MAP_SEAMLESS", "CONTEXT_BOOL(Texture.CubeMapSeamless), extra_ARB_seamless_cube_map" ],
@@ -957,8 +957,8 @@
 # Enums restricted to OpenGL Core profile
 { "apis": ["GL_CORE"], "params": [
 # GL_ARB_shader_subroutine
-  [ "MAX_SUBROUTINES", "CONST(MAX_SUBROUTINES), extra_ARB_shader_subroutine" ],
-  [ "MAX_SUBROUTINE_UNIFORM_LOCATIONS", "CONST(MAX_SUBROUTINE_UNIFORM_LOCATIONS), extra_ARB_shader_subroutine" ],
+  [ "MAX_SUBROUTINES", "CONST(MAX_SUBROUTINES), NO_EXTRA" ],
+  [ "MAX_SUBROUTINE_UNIFORM_LOCATIONS", "CONST(MAX_SUBROUTINE_UNIFORM_LOCATIONS), NO_EXTRA" ],
 
 # GL_ARB_indirect_parameters
   [ "PARAMETER_BUFFER_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_indirect_parameters" ],
diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
index 6e90511..5da405d 100644
--- a/src/mesa/main/getstring.c
+++ b/src/mesa/main/getstring.c
@@ -304,6 +304,17 @@
    GLenum e = ctx->ErrorValue;
    ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
 
+   /* From Issue (3) of the KHR_no_error spec:
+    *
+    *    "Should glGetError() always return NO_ERROR or have undefined
+    *    results?
+    *
+    *    RESOLVED: It should for all errors except OUT_OF_MEMORY."
+    */
+   if (_mesa_is_no_error_enabled(ctx) && e != GL_OUT_OF_MEMORY) {
+      e = GL_NO_ERROR;
+   }
+
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetError <-- %s\n", _mesa_enum_to_string(e));
 
diff --git a/src/mesa/main/glthread.c b/src/mesa/main/glthread.c
index 455b829..c71c037 100644
--- a/src/mesa/main/glthread.c
+++ b/src/mesa/main/glthread.c
@@ -36,26 +36,15 @@
 #include "main/glthread.h"
 #include "main/marshal.h"
 #include "main/marshal_generated.h"
+#include "util/u_atomic.h"
 #include "util/u_thread.h"
 
-#ifdef HAVE_PTHREAD
 
 static void
-glthread_allocate_batch(struct gl_context *ctx)
+glthread_unmarshal_batch(void *job, int thread_index)
 {
-   struct glthread_state *glthread = ctx->GLThread;
-
-   /* TODO: handle memory allocation failure. */
-   glthread->batch = malloc(sizeof(*glthread->batch));
-   if (!glthread->batch)
-      return;
-   memset(glthread->batch, 0, offsetof(struct glthread_batch, buffer));
-}
-
-static void
-glthread_unmarshal_batch(struct gl_context *ctx, struct glthread_batch *batch,
-                         const bool release_batch)
-{
+   struct glthread_batch *batch = (struct glthread_batch*)job;
+   struct gl_context *ctx = batch->ctx;
    size_t pos = 0;
 
    _glapi_set_dispatch(ctx->CurrentServerDispatch);
@@ -64,57 +53,16 @@
       pos += _mesa_unmarshal_dispatch_cmd(ctx, &batch->buffer[pos]);
 
    assert(pos == batch->used);
-
-   if (release_batch)
-      free(batch);
-   else
-      batch->used = 0;
+   batch->used = 0;
 }
 
-static void *
-glthread_worker(void *data)
+static void
+glthread_thread_initialization(void *job, int thread_index)
 {
-   struct gl_context *ctx = data;
-   struct glthread_state *glthread = ctx->GLThread;
+   struct gl_context *ctx = (struct gl_context*)job;
 
-   ctx->Driver.SetBackgroundContext(ctx);
+   ctx->Driver.SetBackgroundContext(ctx, &ctx->GLThread->stats);
    _glapi_set_context(ctx);
-
-   u_thread_setname("mesa_glthread");
-
-   pthread_mutex_lock(&glthread->mutex);
-
-   while (true) {
-      struct glthread_batch *batch;
-
-      /* Block (dropping the lock) until new work arrives for us. */
-      while (!glthread->batch_queue && !glthread->shutdown) {
-         pthread_cond_broadcast(&glthread->work_done);
-         pthread_cond_wait(&glthread->new_work, &glthread->mutex);
-      }
-
-      batch = glthread->batch_queue;
-
-      if (glthread->shutdown && !batch) {
-         pthread_cond_broadcast(&glthread->work_done);
-         pthread_mutex_unlock(&glthread->mutex);
-         return NULL;
-      }
-      glthread->batch_queue = batch->next;
-      if (glthread->batch_queue_tail == &batch->next)
-         glthread->batch_queue_tail = &glthread->batch_queue;
-
-      glthread->busy = true;
-      pthread_mutex_unlock(&glthread->mutex);
-
-      glthread_unmarshal_batch(ctx, batch, true);
-
-      pthread_mutex_lock(&glthread->mutex);
-      glthread->busy = false;
-   }
-
-   /* UNREACHED */
-   return NULL;
 }
 
 void
@@ -125,24 +73,35 @@
    if (!glthread)
       return;
 
-   ctx->MarshalExec = _mesa_create_marshal_table(ctx);
-   if (!ctx->MarshalExec) {
+   if (!util_queue_init(&glthread->queue, "glthread", MARSHAL_MAX_BATCHES - 2,
+                        1, 0)) {
       free(glthread);
       return;
    }
 
+   ctx->MarshalExec = _mesa_create_marshal_table(ctx);
+   if (!ctx->MarshalExec) {
+      util_queue_destroy(&glthread->queue);
+      free(glthread);
+      return;
+   }
+
+   for (unsigned i = 0; i < MARSHAL_MAX_BATCHES; i++) {
+      glthread->batches[i].ctx = ctx;
+      util_queue_fence_init(&glthread->batches[i].fence);
+   }
+
+   glthread->stats.queue = &glthread->queue;
    ctx->CurrentClientDispatch = ctx->MarshalExec;
-
-   pthread_mutex_init(&glthread->mutex, NULL);
-   pthread_cond_init(&glthread->new_work, NULL);
-   pthread_cond_init(&glthread->work_done, NULL);
-
-   glthread->batch_queue_tail = &glthread->batch_queue;
    ctx->GLThread = glthread;
 
-   glthread_allocate_batch(ctx);
-
-   pthread_create(&glthread->thread, NULL, glthread_worker, ctx);
+   /* Execute the thread initialization function in the thread. */
+   struct util_queue_fence fence;
+   util_queue_fence_init(&fence);
+   util_queue_add_job(&glthread->queue, ctx, &fence,
+                      glthread_thread_initialization, NULL);
+   util_queue_fence_wait(&fence);
+   util_queue_fence_destroy(&fence);
 }
 
 void
@@ -153,29 +112,11 @@
    if (!glthread)
       return;
 
-   _mesa_glthread_flush_batch(ctx);
+   _mesa_glthread_finish(ctx);
+   util_queue_destroy(&glthread->queue);
 
-   pthread_mutex_lock(&glthread->mutex);
-   glthread->shutdown = true;
-   pthread_cond_broadcast(&glthread->new_work);
-   pthread_mutex_unlock(&glthread->mutex);
-
-   /* Since this waits for the thread to exit, it means that all queued work
-    * will have been completed.
-    */
-   pthread_join(glthread->thread, NULL);
-
-   pthread_cond_destroy(&glthread->new_work);
-   pthread_cond_destroy(&glthread->work_done);
-   pthread_mutex_destroy(&glthread->mutex);
-
-   /* Due to the join above, there should be one empty batch allocated at this
-    * point, and no batches queued.
-    */
-   assert(!glthread->batch->used);
-   assert(!glthread->batch->next);
-   free(glthread->batch);
-   assert(!glthread->batch_queue);
+   for (unsigned i = 0; i < MARSHAL_MAX_BATCHES; i++)
+      util_queue_fence_destroy(&glthread->batches[i].fence);
 
    free(glthread);
    ctx->GLThread = NULL;
@@ -198,19 +139,16 @@
    }
 }
 
-static void
-_mesa_glthread_flush_batch_locked(struct gl_context *ctx)
+void
+_mesa_glthread_flush_batch(struct gl_context *ctx)
 {
    struct glthread_state *glthread = ctx->GLThread;
-   struct glthread_batch *batch = glthread->batch;
-
-   if (!batch->used)
+   if (!glthread)
       return;
 
-   /* Immediately reallocate a new batch, since the next marshalled call would
-    * just do it.
-    */
-   glthread_allocate_batch(ctx);
+   struct glthread_batch *next = &glthread->batches[glthread->next];
+   if (!next->used)
+      return;
 
    /* Debug: execute the batch immediately from this thread.
     *
@@ -218,32 +156,17 @@
     * need to restore it when it returns.
     */
    if (false) {
-      glthread_unmarshal_batch(ctx, batch, true);
+      glthread_unmarshal_batch(next, 0);
       _glapi_set_dispatch(ctx->CurrentClientDispatch);
       return;
    }
 
-   *glthread->batch_queue_tail = batch;
-   glthread->batch_queue_tail = &batch->next;
-   pthread_cond_broadcast(&glthread->new_work);
-}
+   p_atomic_add(&glthread->stats.num_offloaded_items, next->used);
 
-void
-_mesa_glthread_flush_batch(struct gl_context *ctx)
-{
-   struct glthread_state *glthread = ctx->GLThread;
-   struct glthread_batch *batch;
-
-   if (!glthread)
-      return;
-
-   batch = glthread->batch;
-   if (!batch->used)
-      return;
-
-   pthread_mutex_lock(&glthread->mutex);
-   _mesa_glthread_flush_batch_locked(ctx);
-   pthread_mutex_unlock(&glthread->mutex);
+   util_queue_add_job(&glthread->queue, next, &next->fence,
+                      glthread_unmarshal_batch, NULL);
+   glthread->last = glthread->next;
+   glthread->next = (glthread->next + 1) % MARSHAL_MAX_BATCHES;
 }
 
 /**
@@ -256,7 +179,6 @@
 _mesa_glthread_finish(struct gl_context *ctx)
 {
    struct glthread_state *glthread = ctx->GLThread;
-
    if (!glthread)
       return;
 
@@ -265,24 +187,34 @@
     * dri interface entrypoints), in which case we don't need to actually
     * synchronize against ourself.
     */
-   if (pthread_equal(pthread_self(), glthread->thread))
+   if (u_thread_is_self(glthread->queue.threads[0]))
       return;
 
-   pthread_mutex_lock(&glthread->mutex);
+   struct glthread_batch *last = &glthread->batches[glthread->last];
+   struct glthread_batch *next = &glthread->batches[glthread->next];
+   bool synced = false;
 
-   if (!(glthread->batch_queue || glthread->busy)) {
-      if (glthread->batch && glthread->batch->used) {
-         struct _glapi_table *dispatch = _glapi_get_dispatch();
-         glthread_unmarshal_batch(ctx, glthread->batch, false);
-         _glapi_set_dispatch(dispatch);
-      }
-   } else {
-      _mesa_glthread_flush_batch_locked(ctx);
-      while (glthread->batch_queue || glthread->busy)
-         pthread_cond_wait(&glthread->work_done, &glthread->mutex);
+   if (!util_queue_fence_is_signalled(&last->fence)) {
+      util_queue_fence_wait(&last->fence);
+      synced = true;
    }
 
-   pthread_mutex_unlock(&glthread->mutex);
-}
+   if (next->used) {
+      p_atomic_add(&glthread->stats.num_direct_items, next->used);
 
-#endif
+      /* Since glthread_unmarshal_batch changes the dispatch to direct,
+       * restore it after it's done.
+       */
+      struct _glapi_table *dispatch = _glapi_get_dispatch();
+      glthread_unmarshal_batch(next, 0);
+      _glapi_set_dispatch(dispatch);
+
+      /* It's not a sync because we don't enqueue partial batches, but
+       * it would be a sync if we did. So count it anyway.
+       */
+      synced = true;
+   }
+
+   if (synced)
+      p_atomic_inc(&glthread->stats.num_syncs);
+}
diff --git a/src/mesa/main/glthread.h b/src/mesa/main/glthread.h
index 50c1db2..306246c 100644
--- a/src/mesa/main/glthread.h
+++ b/src/mesa/main/glthread.h
@@ -26,60 +26,63 @@
 
 #include "main/mtypes.h"
 
-/* Command size is a number of bytes stored in a short. */
-#define MARSHAL_MAX_CMD_SIZE 65535
+/* The size of one batch and the maximum size of one call.
+ *
+ * This should be as low as possible, so that:
+ * - multiple synchronizations within a frame don't slow us down much
+ * - a smaller number of calls per frame can still get decent parallelism
+ * - the memory footprint of the queue is low, and with that comes a lower
+ *   chance of experiencing CPU cache thrashing
+ * but it should be high enough so that u_queue overhead remains negligible.
+ */
+#define MARSHAL_MAX_CMD_SIZE (8 * 1024)
 
-#ifdef HAVE_PTHREAD
+/* The number of batch slots in memory.
+ *
+ * One batch is being executed, one batch is being filled, the rest are
+ * waiting batches. There must be at least 1 slot for a waiting batch,
+ * so the minimum number of batches is 3.
+ */
+#define MARSHAL_MAX_BATCHES 8
 
 #include <inttypes.h>
 #include <stdbool.h>
-#include <pthread.h>
+#include "util/u_queue.h"
 
 enum marshal_dispatch_cmd_id;
 
+/** A single batch of commands queued up for execution. */
+struct glthread_batch
+{
+   /** Batch fence for waiting for the execution to finish. */
+   struct util_queue_fence fence;
+
+   /** The worker thread will access the context with this. */
+   struct gl_context *ctx;
+
+   /** Amount of data used by batch commands, in bytes. */
+   size_t used;
+
+   /** Data contained in the command buffer. */
+   uint8_t buffer[MARSHAL_MAX_CMD_SIZE];
+};
+
 struct glthread_state
 {
-   /** The worker thread that asynchronously processes our GL commands. */
-   pthread_t thread;
+   /** Multithreaded queue. */
+   struct util_queue queue;
 
-   /**
-    * Mutex used for synchronizing between the main thread and the worker
-    * thread.
-    */
-   pthread_mutex_t mutex;
+   /** This is sent to the driver for framebuffer overlay / HUD. */
+   struct util_queue_monitoring stats;
 
-   /** Condvar used for waking the worker thread. */
-   pthread_cond_t new_work;
+   /** The ring of batches in memory. */
+   struct glthread_batch batches[MARSHAL_MAX_BATCHES];
 
-   /** Condvar used for waking the main thread. */
-   pthread_cond_t work_done;
+   /** Index of the last submitted batch. */
+   unsigned last;
 
-   /** Used to tell the worker thread to quit */
-   bool shutdown;
-
-   /** Indicates that the worker thread is currently processing a batch */
-   bool busy;
-
-   /**
-    * Singly-linked list of command batches that are awaiting execution by
-    * a thread pool task.  NULL if empty.
-    */
-   struct glthread_batch *batch_queue;
-
-   /**
-    * Tail pointer for appending batches to the end of batch_queue.  If the
-    * queue is empty, this points to batch_queue.
-    */
-   struct glthread_batch **batch_queue_tail;
-
-   /**
-    * Batch containing commands that are being prepared for insertion into
-    * batch_queue.  NULL if there are no such commands.
-    *
-    * Since this is only used by the main thread, it doesn't need the mutex to
-    * be accessed.
-    */
-   struct glthread_batch *batch;
+   /** Index of the batch being filled and about to be submitted. */
+   unsigned next;
 
    /**
     * Tracks on the main thread side whether the current vertex array binding
@@ -94,29 +97,6 @@
    bool element_array_is_vbo;
 };
 
-/**
- * A single batch of commands queued up for later execution by a thread pool
- * task.
- */
-struct glthread_batch
-{
-   /**
-    * Next batch of commands to execute after this batch, or NULL if this is
-    * the last set of commands queued.  Protected by ctx->Marshal.Mutex.
-    */
-   struct glthread_batch *next;
-
-   /**
-    * Amount of data used by batch commands, in bytes.
-    */
-   size_t used;
-
-   /**
-    * Data contained in the command buffer.
-    */
-   uint8_t buffer[MARSHAL_MAX_CMD_SIZE];
-};
-
 void _mesa_glthread_init(struct gl_context *ctx);
 void _mesa_glthread_destroy(struct gl_context *ctx);
 
@@ -124,32 +104,4 @@
 void _mesa_glthread_flush_batch(struct gl_context *ctx);
 void _mesa_glthread_finish(struct gl_context *ctx);
 
-#else /* HAVE_PTHREAD */
-
-static inline void
-_mesa_glthread_init(struct gl_context *ctx)
-{
-}
-
-static inline void
-_mesa_glthread_destroy(struct gl_context *ctx)
-{
-}
-
-static inline void
-_mesa_glthread_finish(struct gl_context *ctx)
-{
-}
-
-static inline void
-_mesa_glthread_restore_dispatch(struct gl_context *ctx)
-{
-}
-
-static inline void
-_mesa_glthread_flush_batch(struct gl_context *ctx)
-{
-}
-
-#endif /* !HAVE_PTHREAD */
 #endif /* _GLTHREAD_H*/
diff --git a/src/mesa/main/hash.c b/src/mesa/main/hash.c
index b7a7bd9..d0e575e 100644
--- a/src/mesa/main/hash.c
+++ b/src/mesa/main/hash.c
@@ -35,78 +35,9 @@
  */
 
 #include "glheader.h"
-#include "imports.h"
 #include "hash.h"
 #include "util/hash_table.h"
 
-/**
- * Magic GLuint object name that gets stored outside of the struct hash_table.
- *
- * The hash table needs a particular pointer to be the marker for a key that
- * was deleted from the table, along with NULL for the "never allocated in the
- * table" marker.  Legacy GL allows any GLuint to be used as a GL object name,
- * and we use a 1:1 mapping from GLuints to key pointers, so we need to be
- * able to track a GLuint that happens to match the deleted key outside of
- * struct hash_table.  We tell the hash table to use "1" as the deleted key
- * value, so that we test the deleted-key-in-the-table path as best we can.
- */
-#define DELETED_KEY_VALUE 1
-
-/**
- * The hash table data structure.  
- */
-struct _mesa_HashTable {
-   struct hash_table *ht;
-   GLuint MaxKey;                        /**< highest key inserted so far */
-   mtx_t Mutex;                /**< mutual exclusion lock */
-   GLboolean InDeleteAll;                /**< Debug check */
-   /** Value that would be in the table for DELETED_KEY_VALUE. */
-   void *deleted_key_data;
-};
-
-/** @{
- * Mapping from our use of GLuint as both the key and the hash value to the
- * hash_table.h API
- *
- * There exist many integer hash functions, designed to avoid collisions when
- * the integers are spread across key space with some patterns.  In GL, the
- * pattern (in the case of glGen*()ed object IDs) is that the keys are unique
- * contiguous integers starting from 1.  Because of that, we just use the key
- * as the hash value, to minimize the cost of the hash function.  If objects
- * are never deleted, we will never see a collision in the table, because the
- * table resizes itself when it approaches full, and thus key % table_size ==
- * key.
- *
- * The case where we could have collisions for genned objects would be
- * something like: glGenBuffers(&a, 100); glDeleteBuffers(&a + 50, 50);
- * glGenBuffers(&b, 100), because objects 1-50 and 101-200 are allocated at
- * the end of that sequence, instead of 1-150.  So far it doesn't appear to be
- * a problem.
- */
-static bool
-uint_key_compare(const void *a, const void *b)
-{
-   return a == b;
-}
-
-static uint32_t
-uint_hash(GLuint id)
-{
-   return id;
-}
-
-static uint32_t
-uint_key_hash(const void *key)
-{
-   return uint_hash((uintptr_t)key);
-}
-
-static void *
-uint_key(GLuint id)
-{
-   return (void *)(uintptr_t) id;
-}
-/** @} */
 
 /**
  * Create a new hash table.
@@ -205,10 +136,9 @@
 _mesa_HashLookup(struct _mesa_HashTable *table, GLuint key)
 {
    void *res;
-   assert(table);
-   mtx_lock(&table->Mutex);
+   _mesa_HashLockMutex(table);
    res = _mesa_HashLookup_unlocked(table, key);
-   mtx_unlock(&table->Mutex);
+   _mesa_HashUnlockMutex(table);
    return res;
 }
 
@@ -231,36 +161,6 @@
 }
 
 
-/**
- * Lock the hash table mutex.
- *
- * This function should be used when multiple objects need
- * to be looked up in the hash table, to avoid having to lock
- * and unlock the mutex each time.
- *
- * \param table the hash table.
- */
-void
-_mesa_HashLockMutex(struct _mesa_HashTable *table)
-{
-   assert(table);
-   mtx_lock(&table->Mutex);
-}
-
-
-/**
- * Unlock the hash table mutex.
- *
- * \param table the hash table.
- */
-void
-_mesa_HashUnlockMutex(struct _mesa_HashTable *table)
-{
-   assert(table);
-   mtx_unlock(&table->Mutex);
-}
-
-
 static inline void
 _mesa_HashInsert_unlocked(struct _mesa_HashTable *table, GLuint key, void *data)
 {
@@ -315,10 +215,9 @@
 void
 _mesa_HashInsert(struct _mesa_HashTable *table, GLuint key, void *data)
 {
-   assert(table);
-   mtx_lock(&table->Mutex);
+   _mesa_HashLockMutex(table);
    _mesa_HashInsert_unlocked(table, key, data);
-   mtx_unlock(&table->Mutex);
+   _mesa_HashUnlockMutex(table);
 }
 
 
@@ -339,12 +238,10 @@
    assert(table);
    assert(key);
 
-   /* have to check this outside of mutex lock */
-   if (table->InDeleteAll) {
-      _mesa_problem(NULL, "_mesa_HashRemove illegally called from "
-                    "_mesa_HashDeleteAll callback function");
-      return;
-   }
+   /* Assert that _mesa_HashRemove is not called from a _mesa_HashDeleteAll
+    * callback function. This has to be checked outside of the mutex lock.
+    */
+   assert(!table->InDeleteAll);
 
    if (key == DELETED_KEY_VALUE) {
       table->deleted_key_data = NULL;
@@ -366,9 +263,9 @@
 void
 _mesa_HashRemove(struct _mesa_HashTable *table, GLuint key)
 {
-   mtx_lock(&table->Mutex);
+   _mesa_HashLockMutex(table);
    _mesa_HashRemove_unlocked(table, key);
-   mtx_unlock(&table->Mutex);
+   _mesa_HashUnlockMutex(table);
 }
 
 /**
@@ -387,9 +284,8 @@
 {
    struct hash_entry *entry;
 
-   assert(table);
    assert(callback);
-   mtx_lock(&table->Mutex);
+   _mesa_HashLockMutex(table);
    table->InDeleteAll = GL_TRUE;
    hash_table_foreach(table->ht, entry) {
       callback((uintptr_t)entry->key, entry->data, userData);
@@ -400,7 +296,7 @@
       table->deleted_key_data = NULL;
    }
    table->InDeleteAll = GL_FALSE;
-   mtx_unlock(&table->Mutex);
+   _mesa_HashUnlockMutex(table);
 }
 
 
@@ -411,6 +307,23 @@
  * \param userData  arbitrary pointer to pass along to the callback
  *                  (this is typically a struct gl_context pointer)
  */
+static void
+hash_walk_unlocked(const struct _mesa_HashTable *table,
+                   void (*callback)(GLuint key, void *data, void *userData),
+                   void *userData)
+{
+   assert(table);
+   assert(callback);
+
+   struct hash_entry *entry;
+   hash_table_foreach(table->ht, entry) {
+      callback((uintptr_t)entry->key, entry->data, userData);
+   }
+   if (table->deleted_key_data)
+      callback(DELETED_KEY_VALUE, table->deleted_key_data, userData);
+}
+
+
 void
 _mesa_HashWalk(const struct _mesa_HashTable *table,
                void (*callback)(GLuint key, void *data, void *userData),
@@ -418,17 +331,18 @@
 {
    /* cast-away const */
    struct _mesa_HashTable *table2 = (struct _mesa_HashTable *) table;
-   struct hash_entry *entry;
 
-   assert(table);
-   assert(callback);
-   mtx_lock(&table2->Mutex);
-   hash_table_foreach(table->ht, entry) {
-      callback((uintptr_t)entry->key, entry->data, userData);
-   }
-   if (table->deleted_key_data)
-      callback(DELETED_KEY_VALUE, table->deleted_key_data, userData);
-   mtx_unlock(&table2->Mutex);
+   _mesa_HashLockMutex(table2);
+   hash_walk_unlocked(table, callback, userData);
+   _mesa_HashUnlockMutex(table2);
+}
+
+void
+_mesa_HashWalkLocked(const struct _mesa_HashTable *table,
+               void (*callback)(GLuint key, void *data, void *userData),
+               void *userData)
+{
+   hash_walk_unlocked(table, callback, userData);
 }
 
 static void
diff --git a/src/mesa/main/hash.h b/src/mesa/main/hash.h
index 52a6c5d..02960e3 100644
--- a/src/mesa/main/hash.h
+++ b/src/mesa/main/hash.h
@@ -33,7 +33,76 @@
 
 
 #include "glheader.h"
+#include "imports.h"
 
+/**
+ * Magic GLuint object name that gets stored outside of the struct hash_table.
+ *
+ * The hash table needs a particular pointer to be the marker for a key that
+ * was deleted from the table, along with NULL for the "never allocated in the
+ * table" marker.  Legacy GL allows any GLuint to be used as a GL object name,
+ * and we use a 1:1 mapping from GLuints to key pointers, so we need to be
+ * able to track a GLuint that happens to match the deleted key outside of
+ * struct hash_table.  We tell the hash table to use "1" as the deleted key
+ * value, so that we test the deleted-key-in-the-table path as best we can.
+ */
+#define DELETED_KEY_VALUE 1
+
+/** @{
+ * Mapping from our use of GLuint as both the key and the hash value to the
+ * hash_table.h API
+ *
+ * There exist many integer hash functions, designed to avoid collisions when
+ * the integers are spread across key space with some patterns.  In GL, the
+ * pattern (in the case of glGen*()ed object IDs) is that the keys are unique
+ * contiguous integers starting from 1.  Because of that, we just use the key
+ * as the hash value, to minimize the cost of the hash function.  If objects
+ * are never deleted, we will never see a collision in the table, because the
+ * table resizes itself when it approaches full, and thus key % table_size ==
+ * key.
+ *
+ * The case where we could have collisions for genned objects would be
+ * something like: glGenBuffers(&a, 100); glDeleteBuffers(&a + 50, 50);
+ * glGenBuffers(&b, 100), because objects 1-50 and 101-200 are allocated at
+ * the end of that sequence, instead of 1-150.  So far it doesn't appear to be
+ * a problem.
+ */
+static inline bool
+uint_key_compare(const void *a, const void *b)
+{
+   return a == b;
+}
+
+static inline uint32_t
+uint_hash(GLuint id)
+{
+   return id;
+}
+
+static inline uint32_t
+uint_key_hash(const void *key)
+{
+   return uint_hash((uintptr_t)key);
+}
+
+static inline void *
+uint_key(GLuint id)
+{
+   return (void *)(uintptr_t) id;
+}
+/** @} */
+
+/**
+ * The hash table data structure.
+ */
+struct _mesa_HashTable {
+   struct hash_table *ht;
+   GLuint MaxKey;                        /**< highest key inserted so far */
+   mtx_t Mutex;                          /**< mutual exclusion lock */
+   GLboolean InDeleteAll;                /**< Debug check */
+   /** Value that would be in the table for DELETED_KEY_VALUE. */
+   void *deleted_key_data;
+};
 
 extern struct _mesa_HashTable *_mesa_NewHashTable(void);
 
@@ -45,9 +114,34 @@
 
 extern void _mesa_HashRemove(struct _mesa_HashTable *table, GLuint key);
 
-extern void _mesa_HashLockMutex(struct _mesa_HashTable *table);
+/**
+ * Lock the hash table mutex.
+ *
+ * This function should be used when multiple objects need
+ * to be looked up in the hash table, to avoid having to lock
+ * and unlock the mutex each time.
+ *
+ * \param table the hash table.
+ */
+static inline void
+_mesa_HashLockMutex(struct _mesa_HashTable *table)
+{
+   assert(table);
+   mtx_lock(&table->Mutex);
+}
 
-extern void _mesa_HashUnlockMutex(struct _mesa_HashTable *table);
+
+/**
+ * Unlock the hash table mutex.
+ *
+ * \param table the hash table.
+ */
+static inline void
+_mesa_HashUnlockMutex(struct _mesa_HashTable *table)
+{
+   assert(table);
+   mtx_unlock(&table->Mutex);
+}
 
 extern void *_mesa_HashLookupLocked(struct _mesa_HashTable *table, GLuint key);
 
@@ -66,6 +160,11 @@
                void (*callback)(GLuint key, void *data, void *userData),
                void *userData);
 
+extern void
+_mesa_HashWalkLocked(const struct _mesa_HashTable *table,
+                     void (*callback)(GLuint key, void *data, void *userData),
+                     void *userData);
+
 extern void _mesa_HashPrint(const struct _mesa_HashTable *table);
 
 extern GLuint _mesa_HashFindFreeKeyBlock(struct _mesa_HashTable *table, GLuint numKeys);
diff --git a/src/mesa/main/image.c b/src/mesa/main/image.c
index a039b51..ad6b378 100644
--- a/src/mesa/main/image.c
+++ b/src/mesa/main/image.c
@@ -581,7 +581,7 @@
       }
       break;
    default:
-      _mesa_problem(NULL, "Invalid datatype in _mesa_convert_colors");
+      unreachable("Invalid datatype in _mesa_convert_colors");
    }
 
    free(tempBuffer);
diff --git a/src/mesa/main/light.c b/src/mesa/main/light.c
index 87a06db..245692a 100644
--- a/src/mesa/main/light.c
+++ b/src/mesa/main/light.c
@@ -186,8 +186,7 @@
       light->QuadraticAttenuation = params[0];
       break;
    default:
-      _mesa_problem(ctx, "Unexpected pname in _mesa_light()");
-      return;
+      unreachable("Unexpected pname in _mesa_light()");
    }
 
    if (ctx->Driver.Lightfv)
diff --git a/src/mesa/main/lines.c b/src/mesa/main/lines.c
index 93b80af..d83a7d8 100644
--- a/src/mesa/main/lines.c
+++ b/src/mesa/main/lines.c
@@ -72,7 +72,8 @@
       return;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_LINE);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewLineState ? 0 : _NEW_LINE);
+   ctx->NewDriverState |= ctx->DriverFlags.NewLineState;
    ctx->Line.Width = width;
 
    if (ctx->Driver.LineWidth)
@@ -106,7 +107,8 @@
        ctx->Line.StipplePattern == pattern)
       return;
 
-   FLUSH_VERTICES(ctx, _NEW_LINE);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewLineState ? 0 : _NEW_LINE);
+   ctx->NewDriverState |= ctx->DriverFlags.NewLineState;
    ctx->Line.StippleFactor = factor;
    ctx->Line.StipplePattern = pattern;
 
diff --git a/src/mesa/main/marshal.c b/src/mesa/main/marshal.c
index ae4efb5..8f8e8c7 100644
--- a/src/mesa/main/marshal.c
+++ b/src/mesa/main/marshal.c
@@ -33,8 +33,6 @@
 #include "dispatch.h"
 #include "marshal_generated.h"
 
-#ifdef HAVE_PTHREAD
-
 struct marshal_cmd_Flush
 {
    struct marshal_cmd_base cmd_base;
@@ -410,8 +408,125 @@
    }
 }
 
-/* ClearBufferfv: marshalled asynchronously */
-struct marshal_cmd_ClearBufferfv
+/* NamedBufferData: marshalled asynchronously */
+struct marshal_cmd_NamedBufferData
+{
+   struct marshal_cmd_base cmd_base;
+   GLuint name;
+   GLsizei size;
+   GLenum usage;
+   bool data_null; /* If set, no data follows for "data" */
+   /* Next size bytes are GLubyte data[size] */
+};
+
+void
+_mesa_unmarshal_NamedBufferData(struct gl_context *ctx,
+                                const struct marshal_cmd_NamedBufferData *cmd)
+{
+   const GLuint name = cmd->name;
+   const GLsizei size = cmd->size;
+   const GLenum usage = cmd->usage;
+   const void *data;
+
+   if (cmd->data_null)
+      data = NULL;
+   else
+      data = (const void *) (cmd + 1);
+
+   CALL_NamedBufferData(ctx->CurrentServerDispatch,
+                        (name, size, data, usage));
+}
+
+void GLAPIENTRY
+_mesa_marshal_NamedBufferData(GLuint buffer, GLsizeiptr size,
+                              const GLvoid * data, GLenum usage)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   size_t cmd_size = sizeof(struct marshal_cmd_NamedBufferData) + (data ? size : 0);
+
+   debug_print_marshal("NamedBufferData");
+   if (unlikely(size < 0)) {
+      _mesa_glthread_finish(ctx);
+      _mesa_error(ctx, GL_INVALID_VALUE, "NamedBufferData(size < 0)");
+      return;
+   }
+
+   if (buffer > 0 && cmd_size <= MARSHAL_MAX_CMD_SIZE) {
+      struct marshal_cmd_NamedBufferData *cmd =
+         _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_NamedBufferData,
+                                         cmd_size);
+      cmd->name = buffer;
+      cmd->size = size;
+      cmd->usage = usage;
+      cmd->data_null = !data;
+      if (data) {
+         char *variable_data = (char *) (cmd + 1);
+         memcpy(variable_data, data, size);
+      }
+      _mesa_post_marshal_hook(ctx);
+   } else {
+      _mesa_glthread_finish(ctx);
+      CALL_NamedBufferData(ctx->CurrentServerDispatch,
+                           (buffer, size, data, usage));
+   }
+}
+
+/* NamedBufferSubData: marshalled asynchronously */
+struct marshal_cmd_NamedBufferSubData
+{
+   struct marshal_cmd_base cmd_base;
+   GLuint name;
+   GLintptr offset;
+   GLsizei size;
+   /* Next size bytes are GLubyte data[size] */
+};
+
+void
+_mesa_unmarshal_NamedBufferSubData(struct gl_context *ctx,
+                                   const struct marshal_cmd_NamedBufferSubData *cmd)
+{
+   const GLuint name = cmd->name;
+   const GLintptr offset = cmd->offset;
+   const GLsizei size = cmd->size;
+   const void *data = (const void *) (cmd + 1);
+
+   CALL_NamedBufferSubData(ctx->CurrentServerDispatch,
+                           (name, offset, size, data));
+}
+
+void GLAPIENTRY
+_mesa_marshal_NamedBufferSubData(GLuint buffer, GLintptr offset,
+                                 GLsizeiptr size, const GLvoid * data)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   size_t cmd_size = sizeof(struct marshal_cmd_NamedBufferSubData) + size;
+
+   debug_print_marshal("NamedBufferSubData");
+   if (unlikely(size < 0)) {
+      _mesa_glthread_finish(ctx);
+      _mesa_error(ctx, GL_INVALID_VALUE, "NamedBufferSubData(size < 0)");
+      return;
+   }
+
+   if (buffer > 0 && cmd_size <= MARSHAL_MAX_CMD_SIZE) {
+      struct marshal_cmd_NamedBufferSubData *cmd =
+         _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_NamedBufferSubData,
+                                         cmd_size);
+      cmd->name = buffer;
+      cmd->offset = offset;
+      cmd->size = size;
+      char *variable_data = (char *) (cmd + 1);
+      memcpy(variable_data, data, size);
+      _mesa_post_marshal_hook(ctx);
+   } else {
+      _mesa_glthread_finish(ctx);
+      CALL_NamedBufferSubData(ctx->CurrentServerDispatch,
+                              (buffer, offset, size, data));
+   }
+}
+
+/* ClearBuffer* (all variants): marshalled asynchronously */
+struct marshal_cmd_ClearBuffer
 {
    struct marshal_cmd_base cmd_base;
    GLenum buffer;
@@ -420,7 +535,7 @@
 
 void
 _mesa_unmarshal_ClearBufferfv(struct gl_context *ctx,
-                              const struct marshal_cmd_ClearBufferfv *cmd)
+                              const struct marshal_cmd_ClearBuffer *cmd)
 {
    const GLenum buffer = cmd->buffer;
    const GLint drawbuffer = cmd->drawbuffer;
@@ -431,6 +546,87 @@
                       (buffer, drawbuffer, value));
 }
 
+void
+_mesa_unmarshal_ClearBufferiv(struct gl_context *ctx,
+                              const struct marshal_cmd_ClearBuffer *cmd)
+{
+   const GLenum buffer = cmd->buffer;
+   const GLint drawbuffer = cmd->drawbuffer;
+   const char *variable_data = (const char *) (cmd + 1);
+   const GLint *value = (const GLint *) variable_data;
+
+   CALL_ClearBufferiv(ctx->CurrentServerDispatch,
+                      (buffer, drawbuffer, value));
+}
+
+void
+_mesa_unmarshal_ClearBufferuiv(struct gl_context *ctx,
+                               const struct marshal_cmd_ClearBuffer *cmd)
+{
+   const GLenum buffer = cmd->buffer;
+   const GLint drawbuffer = cmd->drawbuffer;
+   const char *variable_data = (const char *) (cmd + 1);
+   const GLuint *value = (const GLuint *) variable_data;
+
+   CALL_ClearBufferuiv(ctx->CurrentServerDispatch,
+                       (buffer, drawbuffer, value));
+}
+
+void
+_mesa_unmarshal_ClearBufferfi(struct gl_context *ctx,
+                              const struct marshal_cmd_ClearBuffer *cmd)
+{
+   const GLenum buffer = cmd->buffer;
+   const GLint drawbuffer = cmd->drawbuffer;
+   const char *variable_data = (const char *) (cmd + 1);
+   const GLfloat *depth = (const GLfloat *) variable_data;
+   const GLint *stencil = (const GLint *) (variable_data + 4);
+
+   CALL_ClearBufferfi(ctx->CurrentServerDispatch,
+                      (buffer, drawbuffer, *depth, *stencil));
+}
+
+static inline size_t buffer_to_size(GLenum buffer)
+{
+   switch (buffer) {
+   case GL_COLOR:
+      return 4;
+   case GL_DEPTH_STENCIL:
+      return 2;
+   case GL_STENCIL:
+   case GL_DEPTH:
+      return 1;
+   default:
+      return 0;
+   }
+}
+
+static inline bool clear_buffer_add_command(struct gl_context *ctx, uint16_t id,
+                                            GLenum buffer, GLint drawbuffer,
+                                            const GLuint *value, size_t size)
+{
+   size_t cmd_size = sizeof(struct marshal_cmd_ClearBuffer) + 4 * size;
+   if (cmd_size <= MARSHAL_MAX_CMD_SIZE) {
+      struct marshal_cmd_ClearBuffer *cmd =
+         _mesa_glthread_allocate_command(ctx, id,
+                                         cmd_size);
+      cmd->buffer = buffer;
+      cmd->drawbuffer = drawbuffer;
+      GLuint *variable_data = (GLuint *) (cmd + 1);
+      if (size == 4)
+         COPY_4V(variable_data,  value);
+      else if (size == 2)
+         COPY_2V(variable_data, value);
+      else
+         *variable_data = *value;
+
+      _mesa_post_marshal_hook(ctx);
+      return true;
+   }
+
+   return false;
+}
+
 void GLAPIENTRY
 _mesa_marshal_ClearBufferfv(GLenum buffer, GLint drawbuffer,
                             const GLfloat *value)
@@ -438,15 +634,7 @@
    GET_CURRENT_CONTEXT(ctx);
    debug_print_marshal("ClearBufferfv");
 
-   size_t size;
-   switch (buffer) {
-   case GL_DEPTH:
-      size = sizeof(GLfloat);
-      break;
-   case GL_COLOR:
-      size = sizeof(GLfloat) * 4;
-      break;
-   default:
+   if (!(buffer == GL_DEPTH || buffer == GL_COLOR)) {
       _mesa_glthread_finish(ctx);
 
       /* Page 498 of the PDF, section '17.4.3.1 Clearing Individual Buffers'
@@ -457,24 +645,12 @@
        */
       _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferfv(buffer=%s)",
                   _mesa_enum_to_string(buffer));
       return;
    }
 
-   size_t cmd_size = sizeof(struct marshal_cmd_ClearBufferfv) + size;
-   if (cmd_size <= MARSHAL_MAX_CMD_SIZE) {
-      struct marshal_cmd_ClearBufferfv *cmd =
-         _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_ClearBufferfv,
-                                         cmd_size);
-      cmd->buffer = buffer;
-      cmd->drawbuffer = drawbuffer;
-      GLfloat *variable_data = (GLfloat *) (cmd + 1);
-      if (buffer == GL_COLOR)
-         COPY_4V(variable_data, value);
-      else
-         *variable_data = *value;
-
-      _mesa_post_marshal_hook(ctx);
-   } else {
+   size_t size = buffer_to_size(buffer);
+   if (!clear_buffer_add_command(ctx, DISPATCH_CMD_ClearBufferfv, buffer,
+                                 drawbuffer, (GLuint *)value, size)) {
       debug_print_sync("ClearBufferfv");
       _mesa_glthread_finish(ctx);
       CALL_ClearBufferfv(ctx->CurrentServerDispatch,
@@ -482,4 +658,111 @@
    }
 }
 
-#endif
+/**
+ * Marshal glClearBufferiv.  An invalid \p buffer is reported here and the
+ * call is dropped; otherwise it is queued, or executed synchronously when
+ * clear_buffer_add_command() declines to queue it.
+ */
+void GLAPIENTRY
+_mesa_marshal_ClearBufferiv(GLenum buffer, GLint drawbuffer,
+                            const GLint *value)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   debug_print_marshal("ClearBufferiv");
+
+   if (!(buffer == GL_STENCIL || buffer == GL_COLOR)) {
+      _mesa_glthread_finish(ctx);
+
+      /* Page 498 of the PDF, section '17.4.3.1 Clearing Individual Buffers'
+       * of the OpenGL 4.5 spec states:
+       *
+       *    "An INVALID_ENUM error is generated by ClearBufferiv and
+       *     ClearNamedFramebufferiv if buffer is not COLOR or STENCIL."
+       */
+      _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferiv(buffer=%s)",
+                  _mesa_enum_to_string(buffer));
+      return;
+   }
+
+   size_t size = buffer_to_size(buffer);
+   if (!clear_buffer_add_command(ctx, DISPATCH_CMD_ClearBufferiv, buffer,
+                                 drawbuffer, (GLuint *)value, size)) {
+      debug_print_sync("ClearBufferiv");
+      _mesa_glthread_finish(ctx);
+      CALL_ClearBufferiv(ctx->CurrentServerDispatch,
+                         (buffer, drawbuffer, value));
+   }
+}
+
+/**
+ * Marshal glClearBufferuiv.  An invalid \p buffer is reported here and the
+ * call is dropped; otherwise it is queued, or executed synchronously when
+ * clear_buffer_add_command() declines to queue it.
+ */
+void GLAPIENTRY
+_mesa_marshal_ClearBufferuiv(GLenum buffer, GLint drawbuffer,
+                             const GLuint *value)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   debug_print_marshal("ClearBufferuiv");
+
+   if (buffer != GL_COLOR) {
+      _mesa_glthread_finish(ctx);
+
+      /* Page 498 of the PDF, section '17.4.3.1 Clearing Individual Buffers'
+       * of the OpenGL 4.5 spec states:
+       *
+       *    "An INVALID_ENUM error is generated by ClearBufferuiv and
+       *     ClearNamedFramebufferuiv if buffer is not COLOR."
+       */
+      _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferuiv(buffer=%s)",
+                  _mesa_enum_to_string(buffer));
+      return;
+   }
+
+   if (!clear_buffer_add_command(ctx, DISPATCH_CMD_ClearBufferuiv, buffer,
+                                 drawbuffer, (GLuint *)value, 4)) {
+      debug_print_sync("ClearBufferuiv");
+      _mesa_glthread_finish(ctx);
+      CALL_ClearBufferuiv(ctx->CurrentServerDispatch,
+                          (buffer, drawbuffer, value));
+   }
+}
+
+/**
+ * Marshal glClearBufferfi.  An invalid \p buffer is reported here and the
+ * call is dropped; otherwise depth and stencil are packed into two 32-bit
+ * slots and queued, or the call is executed synchronously as a fallback.
+ */
+void GLAPIENTRY
+_mesa_marshal_ClearBufferfi(GLenum buffer, GLint drawbuffer,
+                            const GLfloat depth, const GLint stencil)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   debug_print_marshal("ClearBufferfi");
+
+   if (buffer != GL_DEPTH_STENCIL) {
+      _mesa_glthread_finish(ctx);
+
+      /* Page 498 of the PDF, section '17.4.3.1 Clearing Individual Buffers'
+       * of the OpenGL 4.5 spec states:
+       *
+       *    "An INVALID_ENUM error is generated by ClearBufferfi and
+       *     ClearNamedFramebufferfi if buffer is not DEPTH_STENCIL."
+       */
+      _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferfi(buffer=%s)",
+                  _mesa_enum_to_string(buffer));
+      return;
+   }
+
+   fi_type value[2];
+   value[0].f = depth;
+   value[1].i = stencil;
+   if (!clear_buffer_add_command(ctx, DISPATCH_CMD_ClearBufferfi, buffer,
+                                 drawbuffer, (GLuint *)value, 2)) {
+      debug_print_sync("ClearBufferfi");
+      _mesa_glthread_finish(ctx);
+      CALL_ClearBufferfi(ctx->CurrentServerDispatch,
+                         (buffer, drawbuffer, depth, stencil));
+   }
+}
diff --git a/src/mesa/main/marshal.h b/src/mesa/main/marshal.h
index 4842d27..63e0295 100644
--- a/src/mesa/main/marshal.h
+++ b/src/mesa/main/marshal.h
@@ -47,23 +47,23 @@
    uint16_t cmd_size;
 };
 
-#ifdef HAVE_PTHREAD
-
 static inline void *
 _mesa_glthread_allocate_command(struct gl_context *ctx,
                                 uint16_t cmd_id,
                                 size_t size)
 {
    struct glthread_state *glthread = ctx->GLThread;
+   struct glthread_batch *next = &glthread->batches[glthread->next];
    struct marshal_cmd_base *cmd_base;
    const size_t aligned_size = ALIGN(size, 8);
 
-   if (unlikely(glthread->batch->used + size > MARSHAL_MAX_CMD_SIZE))
+   if (unlikely(next->used + size > MARSHAL_MAX_CMD_SIZE)) {
       _mesa_glthread_flush_batch(ctx);
+      next = &glthread->batches[glthread->next];
+   }
 
-   cmd_base = (struct marshal_cmd_base *)
-      &glthread->batch->buffer[glthread->batch->used];
-   glthread->batch->used += aligned_size;
+   cmd_base = (struct marshal_cmd_base *)&next->buffer[next->used];
+   next->used += aligned_size;
    cmd_base->cmd_id = cmd_id;
    cmd_base->cmd_size = aligned_size;
    return cmd_base;
@@ -94,31 +94,6 @@
    return ctx->API != API_OPENGL_CORE && !glthread->element_array_is_vbo;
 }
 
-#else
-
-/* FIXME: dummy functions for non PTHREAD platforms */
-static inline void *
-_mesa_glthread_allocate_command(struct gl_context *ctx,
-                                uint16_t cmd_id,
-                                size_t size)
-{
-   return NULL;
-}
-
-static inline bool
-_mesa_glthread_is_non_vbo_vertex_attrib_pointer(const struct gl_context *ctx)
-{
-   return false;
-}
-
-static inline bool
-_mesa_glthread_is_non_vbo_draw_elements(const struct gl_context *ctx)
-{
-   return false;
-}
-
-#endif
-
 #define DEBUG_MARSHAL_PRINT_CALLS 0
 
 /**
@@ -205,7 +180,13 @@
 struct marshal_cmd_BindBuffer;
 struct marshal_cmd_BufferData;
 struct marshal_cmd_BufferSubData;
-struct marshal_cmd_ClearBufferfv;
+struct marshal_cmd_NamedBufferData;
+struct marshal_cmd_NamedBufferSubData;
+struct marshal_cmd_ClearBuffer;
+#define marshal_cmd_ClearBufferfv   marshal_cmd_ClearBuffer
+#define marshal_cmd_ClearBufferiv   marshal_cmd_ClearBuffer
+#define marshal_cmd_ClearBufferuiv  marshal_cmd_ClearBuffer
+#define marshal_cmd_ClearBufferfi   marshal_cmd_ClearBuffer
 
 void
 _mesa_unmarshal_Enable(struct gl_context *ctx,
@@ -253,11 +234,51 @@
                             const GLvoid * data);
 
 void
+_mesa_unmarshal_NamedBufferData(struct gl_context *ctx,
+                                const struct marshal_cmd_NamedBufferData *cmd);
+
+void GLAPIENTRY
+_mesa_marshal_NamedBufferData(GLuint buffer, GLsizeiptr size,
+                              const GLvoid * data, GLenum usage);
+
+void
+_mesa_unmarshal_NamedBufferSubData(struct gl_context *ctx,
+                                   const struct marshal_cmd_NamedBufferSubData *cmd);
+
+void GLAPIENTRY
+_mesa_marshal_NamedBufferSubData(GLuint buffer, GLintptr offset, GLsizeiptr size,
+                                 const GLvoid * data);
+
+void
 _mesa_unmarshal_ClearBufferfv(struct gl_context *ctx,
-                              const struct marshal_cmd_ClearBufferfv *cmd);
+                              const struct marshal_cmd_ClearBuffer *cmd);
 
 void GLAPIENTRY
 _mesa_marshal_ClearBufferfv(GLenum buffer, GLint drawbuffer,
                             const GLfloat *value);
 
+void
+_mesa_unmarshal_ClearBufferiv(struct gl_context *ctx,
+                              const struct marshal_cmd_ClearBuffer *cmd);
+
+void GLAPIENTRY
+_mesa_marshal_ClearBufferiv(GLenum buffer, GLint drawbuffer,
+                            const GLint *value);
+
+void
+_mesa_unmarshal_ClearBufferuiv(struct gl_context *ctx,
+                               const struct marshal_cmd_ClearBuffer *cmd);
+
+void GLAPIENTRY
+_mesa_marshal_ClearBufferuiv(GLenum buffer, GLint drawbuffer,
+                             const GLuint *value);
+
+void
+_mesa_unmarshal_ClearBufferfi(struct gl_context *ctx,
+                              const struct marshal_cmd_ClearBuffer *cmd);
+
+void GLAPIENTRY
+_mesa_marshal_ClearBufferfi(GLenum buffer, GLint drawbuffer,
+                            const GLfloat depth, const GLint stencil);
+
 #endif /* MARSHAL_H */
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 8745dd9..fc36d40 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -757,7 +757,7 @@
    }
 
    else {
-      _mesa_problem(NULL, "bad format in do_row()");
+      unreachable("bad format in do_row()");
    }
 }
 
@@ -1401,7 +1401,7 @@
    }
 
    else {
-      _mesa_problem(NULL, "bad format in do_row()");
+      unreachable("bad format in do_row()");
    }
 }
 
@@ -1754,8 +1754,7 @@
       /* no mipmaps, do nothing */
       break;
    default:
-      _mesa_problem(NULL, "bad tex target in _mesa_generate_mipmaps");
-      return;
+      unreachable("bad tex target in _mesa_generate_mipmaps");
    }
 }
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index b76c530..22c8799 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -46,6 +46,7 @@
 #include "main/formats.h"       /* MESA_FORMAT_COUNT */
 #include "compiler/glsl/list.h"
 #include "util/bitscan.h"
+#include "util/u_dynarray.h"
 
 
 #ifdef __cplusplus
@@ -143,7 +144,8 @@
    BUFFER_COLOR5,
    BUFFER_COLOR6,
    BUFFER_COLOR7,
-   BUFFER_COUNT
+   BUFFER_COUNT,
+   BUFFER_NONE = -1,
 } gl_buffer_index;
 
 /**
@@ -186,6 +188,9 @@
                             BUFFER_BIT_COLOR6 | \
                             BUFFER_BIT_COLOR7)
 
+/* Mask of bits for depth+stencil buffers */
+#define BUFFER_BITS_DEPTH_STENCIL (BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL)
+
 /**
  * Framebuffer configuration (aka visual / pixelformat)
  * Note: some of these fields should be boolean, but it appears that
@@ -703,8 +708,8 @@
    /* ARB_texture_multisample / GL3.2 additions */
    GLboolean SampleMask;
 
-   GLfloat SampleCoverageValue;
-   GLfloat MinSampleShadingValue;
+   GLfloat SampleCoverageValue;  /**< In range [0, 1] */
+   GLfloat MinSampleShadingValue;  /**< In range [0, 1] */
 
    /** The GL spec defines this as an array but >32x MSAA is madness */
    GLbitfield SampleMaskValue;
@@ -797,7 +802,6 @@
    GLenum FrontFace;		/**< Either GL_CW or GL_CCW */
    GLenum FrontMode;		/**< Either GL_POINT, GL_LINE or GL_FILL */
    GLenum BackMode;		/**< Either GL_POINT, GL_LINE or GL_FILL */
-   GLboolean _FrontBit;		/**< 0=GL_CCW, 1=GL_CW */
    GLboolean CullFlag;		/**< Culling on/off flag */
    GLboolean SmoothFlag;	/**< True if GL_POLYGON_SMOOTH is enabled */
    GLboolean StippleFlag;	/**< True if GL_POLYGON_STIPPLE is enabled */
@@ -850,9 +854,6 @@
    GLboolean Enabled;		/**< Enabled flag */
    GLboolean TestTwoSide;	/**< GL_EXT_stencil_two_side */
    GLubyte ActiveFace;		/**< GL_EXT_stencil_two_side (0 or 2) */
-   GLboolean _Enabled;          /**< Enabled and stencil buffer present */
-   GLboolean _WriteEnabled;     /**< _Enabled and non-zero writemasks */
-   GLboolean _TestTwoSide;
    GLubyte _BackFace;           /**< Current back stencil state (1 or 2) */
    GLenum Function[3];		/**< Stencil function */
    GLenum FailFunc[3];		/**< Fail function */
@@ -986,6 +987,10 @@
    GLenum CompareFunc;		/**< GL_ARB_shadow */
    GLenum sRGBDecode;           /**< GL_DECODE_EXT or GL_SKIP_DECODE_EXT */
    GLboolean CubeMapSeamless;   /**< GL_AMD_seamless_cubemap_per_texture */
+
+   /** GL_ARB_bindless_texture */
+   struct util_dynarray Handles;
+   bool HandleAllocated;
 };
 
 
@@ -1050,6 +1055,11 @@
 
    /** GL_ARB_shader_image_load_store */
    GLenum ImageFormatCompatibilityType;
+
+   /** GL_ARB_bindless_texture */
+   struct util_dynarray SamplerHandles;
+   struct util_dynarray ImageHandles;
+   bool HandleAllocated;
 };
 
 
@@ -1402,6 +1412,8 @@
    unsigned MinMaxCacheHitIndices;
    unsigned MinMaxCacheMissIndices;
    bool MinMaxCacheDirty;
+
+   bool HandleAllocated; /**< GL_ARB_bindless_texture */
 };
 
 
@@ -1495,9 +1507,8 @@
 
 
 /**
- * A representation of "Vertex Array Objects" (VAOs) from OpenGL 3.1+,
- * GL_ARB_vertex_array_object, or the original GL_APPLE_vertex_array_object
- * extension.
+ * A representation of "Vertex Array Objects" (VAOs) from OpenGL 3.1+ /
+ * the GL_ARB_vertex_array_object extension.
  */
 struct gl_vertex_array_object
 {
@@ -1508,24 +1519,6 @@
 
    GLchar *Label;       /**< GL_KHR_debug */
 
-   mtx_t Mutex;
-
-   /**
-    * Does the VAO use ARB semantics or Apple semantics?
-    *
-    * There are several ways in which ARB_vertex_array_object and
-    * APPLE_vertex_array_object VAOs have differing semantics.  At the very
-    * least,
-    *
-    *     - ARB VAOs require that all array data be sourced from vertex buffer
-    *       objects, but Apple VAOs do not.
-    *
-    *     - ARB VAOs require that names come from GenVertexArrays.
-    *
-    * This flag notes which behavior governs this VAO.
-    */
-   GLboolean ARBsemantics;
-
    /**
     * Has this array object been bound?
     */
@@ -1589,7 +1582,7 @@
  */
 struct gl_array_attrib
 {
-   /** Currently bound array object. See _mesa_BindVertexArrayAPPLE() */
+   /** Currently bound array object. */
    struct gl_vertex_array_object *VAO;
 
    /** The default vertex array object */
@@ -1598,7 +1591,7 @@
    /** The last VAO accessed by a DSA function */
    struct gl_vertex_array_object *LastLookedUpVAO;
 
-   /** Array objects (GL_ARB/APPLE_vertex_array_object) */
+   /** Array objects (GL_ARB_vertex_array_object) */
    struct _mesa_HashTable *Objects;
 
    GLint ActiveTexture;		/**< Client Active Texture */
@@ -1996,6 +1989,42 @@
 
 
 /**
+ * A bindless sampler object.
+ */
+struct gl_bindless_sampler
+{
+   /** Texture unit (set by glUniform1()). */
+   GLubyte unit;
+
+   /** Texture Target (TEXTURE_1D/2D/3D/etc_INDEX). */
+   gl_texture_index target;
+
+   /** Whether this bindless sampler is bound to a unit. */
+   GLboolean bound;
+
+   /** Pointer to the base of the data. */
+   GLvoid *data;
+};
+
+/**
+ * A bindless image object.
+ */
+struct gl_bindless_image
+{
+   /** Image unit (set by glUniform1()). */
+   GLubyte unit;
+
+   /** Access qualifier (GL_READ_WRITE, GL_READ_ONLY, GL_WRITE_ONLY) */
+   GLenum access;
+
+   /** Whether this bindless image is bound to a unit. */
+   GLboolean bound;
+
+   /** Pointer to the base of the data. */
+   GLvoid *data;
+};
+
+/**
  * Names of the various vertex/fragment program register files, etc.
  *
  * NOTE: first four tokens must fit into 2 bits (see t_vb_arbprogram.c)
@@ -2129,6 +2158,22 @@
           */
          gl_texture_index SamplerTargets[MAX_SAMPLERS];
 
+         /**
+          * Number of samplers declared with the bindless_sampler layout
+          * qualifier as specified by ARB_bindless_texture.
+          */
+         GLuint NumBindlessSamplers;
+         GLboolean HasBoundBindlessSampler;
+         struct gl_bindless_sampler *BindlessSamplers;
+
+         /**
+          * Number of images declared with the bindless_image layout qualifier
+          * as specified by ARB_bindless_texture.
+          */
+         GLuint NumBindlessImages;
+         GLboolean HasBoundBindlessImage;
+         struct gl_bindless_image *BindlessImages;
+
          union {
             struct {
                /**
@@ -2205,11 +2250,8 @@
 struct gl_vertex_program_state
 {
    GLboolean Enabled;            /**< User-set GL_VERTEX_PROGRAM_ARB/NV flag */
-   GLboolean _Enabled;           /**< Enabled and _valid_ user program? */
    GLboolean PointSizeEnabled;   /**< GL_VERTEX_PROGRAM_POINT_SIZE_ARB/NV */
    GLboolean TwoSideEnabled;     /**< GL_VERTEX_PROGRAM_TWO_SIDE_ARB/NV */
-   /** Computed two sided lighting for fixed function/programs. */
-   GLboolean _TwoSideEnabled;
    struct gl_program *Current;  /**< User-bound vertex program */
 
    /** Currently enabled and valid vertex program (including internal
@@ -2271,7 +2313,6 @@
 struct gl_fragment_program_state
 {
    GLboolean Enabled;     /**< User-set fragment program enable flag */
-   GLboolean _Enabled;    /**< Enabled and _valid_ user program? */
    struct gl_program *Current;  /**< User-bound fragment program */
 
    /** Currently enabled and valid fragment program (including internal
@@ -2340,7 +2381,6 @@
 struct gl_ati_fragment_shader_state
 {
    GLboolean Enabled;
-   GLboolean _Enabled;                  /**< enabled and valid shader? */
    GLboolean Compiling;
    GLfloat GlobalConstants[8][4];
    struct ati_fragment_shader *Current;
@@ -2565,6 +2605,16 @@
    bool origin_upper_left;
    bool pixel_center_integer;
 
+   /**
+    * Whether bindless_sampler/bindless_image, and respectively
+    * bound_sampler/bound_image are declared at global scope as defined by
+    * ARB_bindless_texture.
+    */
+   bool bindless_sampler;
+   bool bindless_image;
+   bool bound_sampler;
+   bool bound_image;
+
    /** Global xfb_stride out qualifier if any */
    GLuint TransformFeedbackBufferStride[MAX_FEEDBACK_BUFFERS];
 
@@ -2614,8 +2664,7 @@
 
    /**
     * Index (GL_UNIFORM_BLOCK_BINDING) into ctx->UniformBufferBindings[] to use
-    * with glBindBufferBase to bind a buffer object to this uniform block.  When
-    * updated in the program, _NEW_BUFFER_OBJECT will be set.
+    * with glBindBufferBase to bind a buffer object to this uniform block.
     */
    GLuint Binding;
 
@@ -2992,6 +3041,7 @@
 #define GLSL_REPORT_ERRORS 0x40  /**< Print compilation errors */
 #define GLSL_DUMP_ON_ERROR 0x80 /**< Dump shaders to stderr on compile error */
 #define GLSL_CACHE_INFO 0x100 /**< Print debug information about shader cache */
+#define GLSL_CACHE_FALLBACK 0x200 /**< Force shader cache fallback paths */
 
 
 /**
@@ -3007,8 +3057,6 @@
 
    GLint RefCount;
 
-   mtx_t Mutex;
-
    GLchar *Label;   /**< GL_KHR_debug */
 
    /**
@@ -3020,8 +3068,6 @@
 
    struct gl_shader_program *ReferencedPrograms[MESA_SHADER_STAGES];
 
-   struct gl_program *_CurrentFragmentProgram;
-
    /**
     * Program used by glUniform calls.
     *
@@ -3060,7 +3106,6 @@
 {
    /** Driver-selectable options: */
    GLboolean EmitNoLoops;
-   GLboolean EmitNoFunctions;
    GLboolean EmitNoCont;                  /**< Emit CONT opcode? */
    GLboolean EmitNoMainReturn;            /**< Emit CONT/RET opcodes? */
    GLboolean EmitNoPow;                   /**< Emit POW opcodes? */
@@ -3223,6 +3268,11 @@
    /** GL_ARB_sampler_objects */
    struct _mesa_HashTable *SamplerObjects;
 
+   /* GL_ARB_bindless_texture */
+   struct hash_table_u64 *TextureHandles;
+   struct hash_table_u64 *ImageHandles;
+   mtx_t HandlesMutex; /**< For thread-safe access to texture/image handles */
+
    /**
     * Some context in this share group was affected by a GPU reset
     *
@@ -3650,6 +3700,11 @@
    GLboolean AllowGLSLExtensionDirectiveMidShader;
 
    /**
+    * Allow GLSL built-in variables to be redeclared verbatim
+    */
+   GLboolean AllowGLSLBuiltinVariableRedeclaration;
+
+   /**
     * Allow creating a higher compat profile (version 3.1+) for apps that
     * request it. Be careful when adding that driconf option because some
     * features are unimplemented and might not work correctly.
@@ -3928,6 +3983,9 @@
 
    /** Used as an input for sha1 generation in the on-disk shader cache */
    unsigned char *dri_config_options_sha1;
+
+   /** Whether drivers allow mapped buffers during draw and other calls. */
+   bool AllowMappedBuffersDuringExecution;
 };
 
 
@@ -3947,6 +4005,7 @@
    GLboolean ARB_ES3_2_compatibility;
    GLboolean ARB_arrays_of_arrays;
    GLboolean ARB_base_instance;
+   GLboolean ARB_bindless_texture;
    GLboolean ARB_blend_func_extended;
    GLboolean ARB_buffer_storage;
    GLboolean ARB_clear_texture;
@@ -4006,7 +4065,6 @@
    GLboolean ARB_shader_precision;
    GLboolean ARB_shader_stencil_export;
    GLboolean ARB_shader_storage_buffer_object;
-   GLboolean ARB_shader_subroutine;
    GLboolean ARB_shader_texture_image_samples;
    GLboolean ARB_shader_texture_lod;
    GLboolean ARB_shader_viewport_layer_array;
@@ -4219,7 +4277,7 @@
 #define _NEW_TRACK_MATRIX      (1u << 25)  /**< gl_context::VertexProgram */
 #define _NEW_PROGRAM           (1u << 26)  /**< New program/shader state */
 #define _NEW_PROGRAM_CONSTANTS (1u << 27)
-#define _NEW_BUFFER_OBJECT     (1u << 28)
+/* gap */
 #define _NEW_FRAG_CLAMP        (1u << 29)
 /* gap, re-use for core Mesa state only; use ctx->DriverFlags otherwise */
 #define _NEW_VARYING_VP_INPUTS (1u << 31) /**< gl_context::varying_vp_inputs */
@@ -4401,6 +4459,80 @@
     * gl_context::IntelConservativeRasterization
     */
    uint64_t NewIntelConservativeRasterization;
+
+   /**
+    * gl_context::Scissor::WindowRects
+    */
+   uint64_t NewWindowRectangles;
+
+   /** gl_context::Color::sRGBEnabled */
+   uint64_t NewFramebufferSRGB;
+
+   /** gl_context::Scissor::EnableFlags */
+   uint64_t NewScissorTest;
+
+   /** gl_context::Scissor::ScissorArray */
+   uint64_t NewScissorRect;
+
+   /** gl_context::Color::Alpha* */
+   uint64_t NewAlphaTest;
+
+   /** gl_context::Color::Blend/Dither */
+   uint64_t NewBlend;
+
+   /** gl_context::Color::BlendColor */
+   uint64_t NewBlendColor;
+
+   /** gl_context::Color::Color/Index */
+   uint64_t NewColorMask;
+
+   /** gl_context::Depth */
+   uint64_t NewDepth;
+
+   /** gl_context::Color::LogicOp/ColorLogicOp/IndexLogicOp */
+   uint64_t NewLogicOp;
+
+   /** gl_context::Multisample::Enabled */
+   uint64_t NewMultisampleEnable;
+
+   /** gl_context::Multisample::SampleAlphaTo* */
+   uint64_t NewSampleAlphaToXEnable;
+
+   /** gl_context::Multisample::SampleCoverage/SampleMaskValue */
+   uint64_t NewSampleMask;
+
+   /** gl_context::Multisample::(Min)SampleShading */
+   uint64_t NewSampleShading;
+
+   /** gl_context::Stencil */
+   uint64_t NewStencil;
+
+   /** gl_context::Transform::ClipOrigin/ClipDepthMode */
+   uint64_t NewClipControl;
+
+   /** gl_context::Transform::EyeUserPlane */
+   uint64_t NewClipPlane;
+
+   /** gl_context::Transform::ClipPlanesEnabled */
+   uint64_t NewClipPlaneEnable;
+
+   /** gl_context::Transform::DepthClamp */
+   uint64_t NewDepthClamp;
+
+   /** gl_context::Line */
+   uint64_t NewLineState;
+
+   /** gl_context::Polygon */
+   uint64_t NewPolygonState;
+
+   /** gl_context::PolygonStipple */
+   uint64_t NewPolygonStipple;
+
+   /** gl_context::ViewportArray */
+   uint64_t NewViewport;
+
+   /** Shader constants (uniforms, program parameters, state constants) */
+   uint64_t NewShaderConstants[MESA_SHADER_STAGES];
 };
 
 struct gl_uniform_buffer_binding
@@ -4504,6 +4636,19 @@
    GLuint *IndexPtr;
 };
 
+struct gl_texture_handle_object
+{
+   struct gl_texture_object *texObj;
+   struct gl_sampler_object *sampObj;
+   GLuint64 handle;
+};
+
+struct gl_image_handle_object
+{
+   struct gl_image_unit imgObj;
+   GLuint64 handle;
+};
+
 /**
  * Mesa rendering context.
  *
@@ -4858,6 +5003,14 @@
    GLfloat PrimitiveBoundingBox[8];
 
    struct disk_cache *Cache;
+
+   /**
+    * \name GL_ARB_bindless_texture
+    */
+   /*@{*/
+   struct hash_table_u64 *ResidentTextureHandles;
+   struct hash_table_u64 *ResidentImageHandles;
+   /*@}*/
 };
 
 /**
diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c
index 5453e38..38d91f7 100644
--- a/src/mesa/main/multisample.c
+++ b/src/mesa/main/multisample.c
@@ -41,11 +41,16 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
+   value = CLAMP(value, 0.0f, 1.0f);
 
-   ctx->Multisample.SampleCoverageValue = CLAMP(value, 0.0f, 1.0f);
+   if (ctx->Multisample.SampleCoverageInvert == invert &&
+       ctx->Multisample.SampleCoverageValue == value)
+      return;
+
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewSampleMask ? 0 : _NEW_MULTISAMPLE);
+   ctx->NewDriverState |= ctx->DriverFlags.NewSampleMask;
+   ctx->Multisample.SampleCoverageValue = value;
    ctx->Multisample.SampleCoverageInvert = invert;
-   ctx->NewState |= _NEW_MULTISAMPLE;
 }
 
 
@@ -115,7 +120,11 @@
       return;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_MULTISAMPLE);
+   if (ctx->Multisample.SampleMaskValue == mask)
+      return;
+
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewSampleMask ? 0 : _NEW_MULTISAMPLE);
+   ctx->NewDriverState |= ctx->DriverFlags.NewSampleMask;
    ctx->Multisample.SampleMaskValue = mask;
 }
 
@@ -133,10 +142,15 @@
       return;
    }
 
-   FLUSH_VERTICES(ctx, 0);
+   value = CLAMP(value, 0.0f, 1.0f);
 
-   ctx->Multisample.MinSampleShadingValue = CLAMP(value, 0.0f, 1.0f);
-   ctx->NewState |= _NEW_MULTISAMPLE;
+   if (ctx->Multisample.MinSampleShadingValue == value)
+      return;
+
+   FLUSH_VERTICES(ctx,
+                  ctx->DriverFlags.NewSampleShading ? 0 : _NEW_MULTISAMPLE);
+   ctx->NewDriverState |= ctx->DriverFlags.NewSampleShading;
+   ctx->Multisample.MinSampleShadingValue = value;
 }
 
 /**
diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c
index 760c46a..94a6d28 100644
--- a/src/mesa/main/pack.c
+++ b/src/mesa/main/pack.c
@@ -460,8 +460,7 @@
          break;
 
       default:
-         _mesa_problem(NULL, "bad srcType in extract_uint_indexes");
-         return;
+         unreachable("bad srcType in extract_uint_indexes");
    }
 }
 
@@ -585,7 +584,7 @@
             }
             break;
          default:
-            _mesa_problem(ctx, "bad dstType in _mesa_unpack_stencil_span");
+            unreachable("bad dstType in _mesa_unpack_stencil_span");
       }
 
       free(indexes);
@@ -732,7 +731,7 @@
       }
       break;
    default:
-      _mesa_problem(ctx, "bad type in _mesa_pack_index_span");
+      unreachable("bad type in _mesa_pack_index_span");
    }
 
    free(stencil);
@@ -1123,8 +1122,7 @@
       }
       break;
    default:
-      _mesa_problem(ctx, "bad type in _mesa_pack_depth_span (%s)",
-                    _mesa_enum_to_string(dstType));
+      unreachable("bad type in _mesa_pack_depth_span()");
    }
 
    free(depthCopy);
diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index 721a15e..f401111 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -58,15 +58,12 @@
 {
    unsigned i;
 
-   _mesa_reference_program(ctx, &obj->_CurrentFragmentProgram, NULL);
-
    for (i = 0; i < MESA_SHADER_STAGES; i++) {
       _mesa_reference_program(ctx, &obj->CurrentProgram[i], NULL);
       _mesa_reference_shader_program(ctx, &obj->ReferencedPrograms[i], NULL);
    }
 
    _mesa_reference_shader_program(ctx, &obj->ActiveProgram, NULL);
-   mtx_destroy(&obj->Mutex);
    free(obj->Label);
    ralloc_free(obj);
 }
@@ -80,7 +77,6 @@
    struct gl_pipeline_object *obj = rzalloc(NULL, struct gl_pipeline_object);
    if (obj) {
       obj->Name = name;
-      mtx_init(&obj->Mutex, mtx_plain);
       obj->RefCount = 1;
       obj->Flags = _mesa_get_shader_flags();
       obj->InfoLog = NULL;
@@ -146,7 +142,7 @@
       return NULL;
    else
       return (struct gl_pipeline_object *)
-         _mesa_HashLookup(ctx->Pipeline.Objects, id);
+         _mesa_HashLookupLocked(ctx->Pipeline.Objects, id);
 }
 
 /**
@@ -156,7 +152,7 @@
 save_pipeline_object(struct gl_context *ctx, struct gl_pipeline_object *obj)
 {
    if (obj->Name > 0) {
-      _mesa_HashInsert(ctx->Pipeline.Objects, obj->Name, obj);
+      _mesa_HashInsertLocked(ctx->Pipeline.Objects, obj->Name, obj);
    }
 }
 
@@ -168,7 +164,7 @@
 remove_pipeline_object(struct gl_context *ctx, struct gl_pipeline_object *obj)
 {
    if (obj->Name > 0) {
-      _mesa_HashRemove(ctx->Pipeline.Objects, obj->Name);
+      _mesa_HashRemoveLocked(ctx->Pipeline.Objects, obj->Name);
    }
 }
 
@@ -186,16 +182,12 @@
 
    if (*ptr) {
       /* Unreference the old pipeline object */
-      GLboolean deleteFlag = GL_FALSE;
       struct gl_pipeline_object *oldObj = *ptr;
 
-      mtx_lock(&oldObj->Mutex);
       assert(oldObj->RefCount > 0);
       oldObj->RefCount--;
-      deleteFlag = (oldObj->RefCount == 0);
-      mtx_unlock(&oldObj->Mutex);
 
-      if (deleteFlag) {
+      if (oldObj->RefCount == 0) {
          _mesa_delete_pipeline_object(ctx, oldObj);
       }
 
@@ -205,18 +197,10 @@
 
    if (obj) {
       /* reference new pipeline object */
-      mtx_lock(&obj->Mutex);
-      if (obj->RefCount == 0) {
-         /* this pipeline's being deleted (look just above) */
-         /* Not sure this can ever really happen.  Warn if it does. */
-         _mesa_problem(NULL, "referencing deleted pipeline object");
-         *ptr = NULL;
-      }
-      else {
-         obj->RefCount++;
-         *ptr = obj;
-      }
-      mtx_unlock(&obj->Mutex);
+      assert(obj->RefCount > 0);
+
+      obj->RefCount++;
+      *ptr = obj;
    }
 }
 
@@ -232,6 +216,65 @@
    _mesa_use_program(ctx, stage, shProg, prog, pipe);
 }
 
+static void
+use_program_stages(struct gl_context *ctx, struct gl_shader_program *shProg,
+                   GLbitfield stages, struct gl_pipeline_object *pipe) {
+
+   /* Enable individual stages from the program as requested by the
+    * application.  If there is no shader for a requested stage in the
+    * program, _mesa_use_shader_program will enable fixed-function processing
+    * as dictated by the spec.
+    *
+    * Section 2.11.4 (Program Pipeline Objects) of the OpenGL 4.1 spec
+    * says:
+    *
+    *     "If UseProgramStages is called with program set to zero or with a
+    *     program object that contains no executable code for the given
+    *     stages, it is as if the pipeline object has no programmable stage
+    *     configured for the indicated shader stages."
+    */
+   if ((stages & GL_VERTEX_SHADER_BIT) != 0)
+      use_program_stage(ctx, GL_VERTEX_SHADER, shProg, pipe);
+
+   if ((stages & GL_FRAGMENT_SHADER_BIT) != 0)
+      use_program_stage(ctx, GL_FRAGMENT_SHADER, shProg, pipe);
+
+   if ((stages & GL_GEOMETRY_SHADER_BIT) != 0)
+      use_program_stage(ctx, GL_GEOMETRY_SHADER, shProg, pipe);
+
+   if ((stages & GL_TESS_CONTROL_SHADER_BIT) != 0)
+      use_program_stage(ctx, GL_TESS_CONTROL_SHADER, shProg, pipe);
+
+   if ((stages & GL_TESS_EVALUATION_SHADER_BIT) != 0)
+      use_program_stage(ctx, GL_TESS_EVALUATION_SHADER, shProg, pipe);
+
+   if ((stages & GL_COMPUTE_SHADER_BIT) != 0)
+      use_program_stage(ctx, GL_COMPUTE_SHADER, shProg, pipe);
+
+   pipe->Validated = false;
+}
+
+void GLAPIENTRY
+_mesa_UseProgramStages_no_error(GLuint pipeline, GLbitfield stages,
+                                GLuint prog)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_pipeline_object *pipe =
+      _mesa_lookup_pipeline_object(ctx, pipeline);
+   struct gl_shader_program *shProg = NULL;
+
+   if (prog)
+      shProg = _mesa_lookup_shader_program(ctx, prog);
+
+   /* Object is created by any Pipeline call except glGenProgramPipelines,
+    * glIsProgramPipeline and glGetProgramPipelineInfoLog
+    */
+   pipe->EverBound = GL_TRUE;
+
+   use_program_stages(ctx, shProg, stages, pipe);
+}
+
 /**
  * Bound program to severals stages of the pipeline
  */
@@ -325,38 +368,25 @@
       }
    }
 
-   /* Enable individual stages from the program as requested by the
-    * application.  If there is no shader for a requested stage in the
-    * program, _mesa_use_shader_program will enable fixed-function processing
-    * as dictated by the spec.
-    *
-    * Section 2.11.4 (Program Pipeline Objects) of the OpenGL 4.1 spec
-    * says:
-    *
-    *     "If UseProgramStages is called with program set to zero or with a
-    *     program object that contains no executable code for the given
-    *     stages, it is as if the pipeline object has no programmable stage
-    *     configured for the indicated shader stages."
+   use_program_stages(ctx, shProg, stages, pipe);
+}
+
+void GLAPIENTRY
+_mesa_ActiveShaderProgram_no_error(GLuint pipeline, GLuint program)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_shader_program *shProg = NULL;
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
+
+   if (program)
+      shProg = _mesa_lookup_shader_program(ctx, program);
+
+   /* Object is created by any Pipeline call except glGenProgramPipelines,
+    * glIsProgramPipeline and glGetProgramPipelineInfoLog
     */
-   if ((stages & GL_VERTEX_SHADER_BIT) != 0)
-      use_program_stage(ctx, GL_VERTEX_SHADER, shProg, pipe);
+   pipe->EverBound = GL_TRUE;
 
-   if ((stages & GL_FRAGMENT_SHADER_BIT) != 0)
-      use_program_stage(ctx, GL_FRAGMENT_SHADER, shProg, pipe);
-
-   if ((stages & GL_GEOMETRY_SHADER_BIT) != 0)
-      use_program_stage(ctx, GL_GEOMETRY_SHADER, shProg, pipe);
-
-   if ((stages & GL_TESS_CONTROL_SHADER_BIT) != 0)
-      use_program_stage(ctx, GL_TESS_CONTROL_SHADER, shProg, pipe);
-
-   if ((stages & GL_TESS_EVALUATION_SHADER_BIT) != 0)
-      use_program_stage(ctx, GL_TESS_EVALUATION_SHADER, shProg, pipe);
-
-   if ((stages & GL_COMPUTE_SHADER_BIT) != 0)
-      use_program_stage(ctx, GL_COMPUTE_SHADER, shProg, pipe);
-
-   pipe->Validated = false;
+   _mesa_reference_shader_program(ctx, &pipe->ActiveProgram, shProg);
 }
 
 /**
@@ -399,6 +429,32 @@
    _mesa_reference_shader_program(ctx, &pipe->ActiveProgram, shProg);
 }
 
+void GLAPIENTRY
+_mesa_BindProgramPipeline_no_error(GLuint pipeline)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_pipeline_object *newObj = NULL;
+
+   /* Rebinding the same pipeline object: no change.
+    */
+   if (ctx->_Shader->Name == pipeline)
+      return;
+
+   /* Get pointer to new pipeline object (newObj)
+    */
+   if (pipeline) {
+      /* non-default pipeline object */
+      newObj = _mesa_lookup_pipeline_object(ctx, pipeline);
+
+      /* Object is created by any Pipeline call except glGenProgramPipelines,
+       * glIsProgramPipeline and glGetProgramPipelineInfoLog
+       */
+      newObj->EverBound = GL_TRUE;
+   }
+
+   _mesa_bind_pipeline(ctx, newObj);
+}
+
 /**
  * Make program of the pipeline current
  */
@@ -547,20 +603,12 @@
 create_program_pipelines(struct gl_context *ctx, GLsizei n, GLuint *pipelines,
                          bool dsa)
 {
-   const char *func;
+   const char *func = dsa ? "glCreateProgramPipelines" : "glGenProgramPipelines";
    GLuint first;
    GLint i;
 
-   func = dsa ? "glCreateProgramPipelines" : "glGenProgramPipelines";
-
-   if (n < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s (n < 0)", func);
+   if (!pipelines)
       return;
-   }
-
-   if (!pipelines) {
-      return;
-   }
 
    first = _mesa_HashFindFreeKeyBlock(ctx->Pipeline.Objects, n);
 
@@ -582,7 +630,27 @@
       save_pipeline_object(ctx, obj);
       pipelines[i] = first + i;
    }
+}
 
+static void
+create_program_pipelines_err(struct gl_context *ctx, GLsizei n,
+                             GLuint *pipelines, bool dsa)
+{
+   const char *func = dsa ? "glCreateProgramPipelines" : "glGenProgramPipelines";
+
+   if (n < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s (n < 0)", func);
+      return;
+   }
+
+   create_program_pipelines(ctx, n, pipelines, dsa);
+}
+
+void GLAPIENTRY
+_mesa_GenProgramPipelines_no_error(GLsizei n, GLuint *pipelines)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_program_pipelines(ctx, n, pipelines, false);
 }
 
 void GLAPIENTRY
@@ -593,7 +661,14 @@
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGenProgramPipelines(%d, %p)\n", n, pipelines);
 
-   create_program_pipelines(ctx, n, pipelines, false);
+   create_program_pipelines_err(ctx, n, pipelines, false);
+}
+
+void GLAPIENTRY
+_mesa_CreateProgramPipelines_no_error(GLsizei n, GLuint *pipelines)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_program_pipelines(ctx, n, pipelines, true);
 }
 
 void GLAPIENTRY
@@ -604,7 +679,7 @@
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glCreateProgramPipelines(%d, %p)\n", n, pipelines);
 
-   create_program_pipelines(ctx, n, pipelines, true);
+   create_program_pipelines_err(ctx, n, pipelines, true);
 }
 
 /**
diff --git a/src/mesa/main/pipelineobj.h b/src/mesa/main/pipelineobj.h
index fbcb765..8cc5954 100644
--- a/src/mesa/main/pipelineobj.h
+++ b/src/mesa/main/pipelineobj.h
@@ -71,22 +71,35 @@
                                 struct gl_pipeline_object *pipe);
 
 
+void GLAPIENTRY
+_mesa_UseProgramStages_no_error(GLuint pipeline, GLbitfield stages,
+                                GLuint prog);
 extern void GLAPIENTRY
 _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program);
 
+void GLAPIENTRY
+_mesa_ActiveShaderProgram_no_error(GLuint pipeline, GLuint program);
 extern void GLAPIENTRY
 _mesa_ActiveShaderProgram(GLuint pipeline, GLuint program);
 
+void GLAPIENTRY
+_mesa_BindProgramPipeline_no_error(GLuint pipeline);
 extern void GLAPIENTRY
 _mesa_BindProgramPipeline(GLuint pipeline);
 
 extern void GLAPIENTRY
 _mesa_DeleteProgramPipelines(GLsizei n, const GLuint *pipelines);
 
+void GLAPIENTRY
+_mesa_GenProgramPipelines_no_error(GLsizei n, GLuint *pipelines);
+
 extern void GLAPIENTRY
 _mesa_GenProgramPipelines(GLsizei n, GLuint *pipelines);
 
 void GLAPIENTRY
+_mesa_CreateProgramPipelines_no_error(GLsizei n, GLuint *pipelines);
+
+void GLAPIENTRY
 _mesa_CreateProgramPipelines(GLsizei n, GLuint *pipelines);
 
 extern GLboolean GLAPIENTRY
diff --git a/src/mesa/main/pixel.c b/src/mesa/main/pixel.c
index 608a545..345c5d1 100644
--- a/src/mesa/main/pixel.c
+++ b/src/mesa/main/pixel.c
@@ -598,12 +598,13 @@
 /*****                    State Management                        *****/
 /**********************************************************************/
 
-/*
- * Return a bitmask of IMAGE_*_BIT flags which to indicate which
- * pixel transfer operations are enabled.
+
+/**
+ * Update mesa pixel transfer derived state to indicate which operations are
+ * enabled.
  */
-static void
-update_image_transfer_state(struct gl_context *ctx)
+void
+_mesa_update_pixel( struct gl_context *ctx )
 {
    GLuint mask = 0;
 
@@ -623,16 +624,6 @@
 }
 
 
-/**
- * Update mesa pixel transfer derived state.
- */
-void _mesa_update_pixel( struct gl_context *ctx, GLuint new_state )
-{
-   if (new_state & _NEW_PIXEL)
-      update_image_transfer_state(ctx);
-}
-
-
 /**********************************************************************/
 /*****                      Initialization                        *****/
 /**********************************************************************/
diff --git a/src/mesa/main/pixel.h b/src/mesa/main/pixel.h
index fd1782e..17e7376 100644
--- a/src/mesa/main/pixel.h
+++ b/src/mesa/main/pixel.h
@@ -64,7 +64,7 @@
 _mesa_PixelTransferi( GLenum pname, GLint param );
 
 extern void 
-_mesa_update_pixel( struct gl_context *ctx, GLuint newstate );
+_mesa_update_pixel( struct gl_context *ctx );
 
 extern void 
 _mesa_init_pixel( struct gl_context * ctx );
diff --git a/src/mesa/main/pixelstore.c b/src/mesa/main/pixelstore.c
index fc81533..6123da9 100644
--- a/src/mesa/main/pixelstore.c
+++ b/src/mesa/main/pixelstore.c
@@ -35,204 +35,228 @@
 #include "mtypes.h"
 
 
-void GLAPIENTRY
-_mesa_PixelStorei( GLenum pname, GLint param )
+static ALWAYS_INLINE void
+pixel_storei(GLenum pname, GLint param, bool no_error)
 {
    /* NOTE: this call can't be compiled into the display list */
    GET_CURRENT_CONTEXT(ctx);
 
    switch (pname) {
       case GL_PACK_SWAP_BYTES:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
          ctx->Pack.SwapBytes = param ? GL_TRUE : GL_FALSE;
          break;
       case GL_PACK_LSB_FIRST:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
          ctx->Pack.LsbFirst = param ? GL_TRUE : GL_FALSE;
          break;
       case GL_PACK_ROW_LENGTH:
-         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Pack.RowLength = param;
          break;
       case GL_PACK_IMAGE_HEIGHT:
-         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Pack.ImageHeight = param;
          break;
       case GL_PACK_SKIP_PIXELS:
-         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Pack.SkipPixels = param;
          break;
       case GL_PACK_SKIP_ROWS:
-         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Pack.SkipRows = param;
          break;
       case GL_PACK_SKIP_IMAGES:
-         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Pack.SkipImages = param;
          break;
       case GL_PACK_ALIGNMENT:
-         if (param!=1 && param!=2 && param!=4 && param!=8)
+         if (!no_error && param!=1 && param!=2 && param!=4 && param!=8)
             goto invalid_value_error;
          ctx->Pack.Alignment = param;
          break;
       case GL_PACK_INVERT_MESA:
-         if (!_mesa_is_desktop_gl(ctx) || !ctx->Extensions.MESA_pack_invert)
+         if (!no_error &&
+             (!_mesa_is_desktop_gl(ctx) || !ctx->Extensions.MESA_pack_invert))
             goto invalid_enum_error;
          ctx->Pack.Invert = param;
          break;
       case GL_PACK_COMPRESSED_BLOCK_WIDTH:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Pack.CompressedBlockWidth = param;
          break;
       case GL_PACK_COMPRESSED_BLOCK_HEIGHT:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Pack.CompressedBlockHeight = param;
          break;
       case GL_PACK_COMPRESSED_BLOCK_DEPTH:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Pack.CompressedBlockDepth = param;
          break;
       case GL_PACK_COMPRESSED_BLOCK_SIZE:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Pack.CompressedBlockSize = param;
          break;
 
       case GL_UNPACK_SWAP_BYTES:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
          ctx->Unpack.SwapBytes = param ? GL_TRUE : GL_FALSE;
          break;
       case GL_UNPACK_LSB_FIRST:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
          ctx->Unpack.LsbFirst = param ? GL_TRUE : GL_FALSE;
          break;
       case GL_UNPACK_ROW_LENGTH:
-         if (ctx->API == API_OPENGLES)
+         if (!no_error && ctx->API == API_OPENGLES)
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Unpack.RowLength = param;
          break;
       case GL_UNPACK_IMAGE_HEIGHT:
-         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Unpack.ImageHeight = param;
          break;
       case GL_UNPACK_SKIP_PIXELS:
-         if (ctx->API == API_OPENGLES)
+         if (!no_error && ctx->API == API_OPENGLES)
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Unpack.SkipPixels = param;
          break;
       case GL_UNPACK_SKIP_ROWS:
-         if (ctx->API == API_OPENGLES)
+         if (!no_error && ctx->API == API_OPENGLES)
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Unpack.SkipRows = param;
          break;
       case GL_UNPACK_SKIP_IMAGES:
-         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
-         if (param < 0)
+         if (!no_error && param < 0)
             goto invalid_value_error;
          ctx->Unpack.SkipImages = param;
          break;
       case GL_UNPACK_ALIGNMENT:
-         if (param!=1 && param!=2 && param!=4 && param!=8)
+         if (!no_error && param!=1 && param!=2 && param!=4 && param!=8)
             goto invalid_value_error;
          ctx->Unpack.Alignment = param;
          break;
       case GL_UNPACK_COMPRESSED_BLOCK_WIDTH:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Unpack.CompressedBlockWidth = param;
          break;
       case GL_UNPACK_COMPRESSED_BLOCK_HEIGHT:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Unpack.CompressedBlockHeight = param;
          break;
       case GL_UNPACK_COMPRESSED_BLOCK_DEPTH:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Unpack.CompressedBlockDepth = param;
          break;
       case GL_UNPACK_COMPRESSED_BLOCK_SIZE:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!no_error && !_mesa_is_desktop_gl(ctx))
             goto invalid_enum_error;
-         if (param<0)
+         if (!no_error && param<0)
             goto invalid_value_error;
          ctx->Unpack.CompressedBlockSize = param;
          break;
       default:
-         goto invalid_enum_error;
+         if (!no_error)
+            goto invalid_enum_error;
+         else
+            unreachable("invalid pixel store enum");
    }
 
    return;
 
 invalid_enum_error:
-   _mesa_error( ctx, GL_INVALID_ENUM, "glPixelStore" );
+   _mesa_error(ctx, GL_INVALID_ENUM, "glPixelStore");
    return;
 
 invalid_value_error:
-   _mesa_error( ctx, GL_INVALID_VALUE, "glPixelStore(param)" );
+   _mesa_error(ctx, GL_INVALID_VALUE, "glPixelStore(param)");
    return;
 }
 
 
 void GLAPIENTRY
-_mesa_PixelStoref( GLenum pname, GLfloat param )
+_mesa_PixelStorei(GLenum pname, GLint param)
 {
-   _mesa_PixelStorei( pname, IROUND(param) );
+   pixel_storei(pname, param, false);
 }
 
 
+void GLAPIENTRY
+_mesa_PixelStoref(GLenum pname, GLfloat param)
+{
+   _mesa_PixelStorei(pname, IROUND(param));
+}
+
+
+void GLAPIENTRY
+_mesa_PixelStorei_no_error(GLenum pname, GLint param)
+{
+   pixel_storei(pname, param, true);
+}
+
+
+void GLAPIENTRY
+_mesa_PixelStoref_no_error(GLenum pname, GLfloat param)
+{
+   _mesa_PixelStorei_no_error(pname, IROUND(param));
+}
+
 
 /**
  * Initialize the context's pixel store state.
  */
 void
-_mesa_init_pixelstore( struct gl_context *ctx )
+_mesa_init_pixelstore(struct gl_context *ctx)
 {
    /* Pixel transfer */
    ctx->Pack.Alignment = 4;
diff --git a/src/mesa/main/pixelstore.h b/src/mesa/main/pixelstore.h
index 6838454..d21c6fe 100644
--- a/src/mesa/main/pixelstore.h
+++ b/src/mesa/main/pixelstore.h
@@ -45,6 +45,14 @@
 _mesa_PixelStoref( GLenum pname, GLfloat param );
 
 
+extern void GLAPIENTRY
+_mesa_PixelStorei_no_error(GLenum pname, GLint param);
+
+
+extern void GLAPIENTRY
+_mesa_PixelStoref_no_error(GLenum pname, GLfloat param);
+
+
 extern void
 _mesa_init_pixelstore( struct gl_context *ctx );
 
diff --git a/src/mesa/main/polygon.c b/src/mesa/main/polygon.c
index 1bb7190..f447993 100644
--- a/src/mesa/main/polygon.c
+++ b/src/mesa/main/polygon.c
@@ -50,27 +50,43 @@
  * change, flushes the vertices and notifies the driver via
  * the dd_function_table::CullFace callback.
  */
-void GLAPIENTRY
-_mesa_CullFace( GLenum mode )
+static void
+cull_face(struct gl_context *ctx, GLenum mode)
 {
-   GET_CURRENT_CONTEXT(ctx);
-
-   if (MESA_VERBOSE&VERBOSE_API)
-      _mesa_debug(ctx, "glCullFace %s\n", _mesa_enum_to_string(mode));
-
-   if (mode!=GL_FRONT && mode!=GL_BACK && mode!=GL_FRONT_AND_BACK) {
-      _mesa_error( ctx, GL_INVALID_ENUM, "glCullFace" );
-      return;
-   }
-
    if (ctx->Polygon.CullFaceMode == mode)
       return;
 
-   FLUSH_VERTICES(ctx, _NEW_POLYGON);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+   ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
    ctx->Polygon.CullFaceMode = mode;
 
    if (ctx->Driver.CullFace)
-      ctx->Driver.CullFace( ctx, mode );
+      ctx->Driver.CullFace(ctx, mode);
+}
+
+
+void GLAPIENTRY
+_mesa_CullFace_no_error(GLenum mode)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   cull_face(ctx, mode);
+}
+
+
+void GLAPIENTRY
+_mesa_CullFace(GLenum mode)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glCullFace %s\n", _mesa_enum_to_string(mode));
+
+   if (mode != GL_FRONT && mode != GL_BACK && mode != GL_FRONT_AND_BACK) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glCullFace");
+      return;
+   }
+
+   cull_face(ctx, mode);
 }
 
 
@@ -85,27 +101,43 @@
  * flushes the vertices and notifies the driver via
  * the dd_function_table::FrontFace callback.
  */
-void GLAPIENTRY
-_mesa_FrontFace( GLenum mode )
+static ALWAYS_INLINE void
+front_face(struct gl_context *ctx, GLenum mode, bool no_error)
 {
-   GET_CURRENT_CONTEXT(ctx);
-
-   if (MESA_VERBOSE&VERBOSE_API)
-      _mesa_debug(ctx, "glFrontFace %s\n", _mesa_enum_to_string(mode));
-
    if (ctx->Polygon.FrontFace == mode)
       return;
 
-   if (mode!=GL_CW && mode!=GL_CCW) {
-      _mesa_error( ctx, GL_INVALID_ENUM, "glFrontFace" );
+   if (!no_error && mode != GL_CW && mode != GL_CCW) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glFrontFace");
       return;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_POLYGON);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+   ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
    ctx->Polygon.FrontFace = mode;
 
    if (ctx->Driver.FrontFace)
-      ctx->Driver.FrontFace( ctx, mode );
+      ctx->Driver.FrontFace(ctx, mode);
+}
+
+
+void GLAPIENTRY
+_mesa_FrontFace_no_error(GLenum mode)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   front_face(ctx, mode, true);
+}
+
+
+void GLAPIENTRY
+_mesa_FrontFace(GLenum mode)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glFrontFace %s\n", _mesa_enum_to_string(mode));
+
+   front_face(ctx, mode, false);
 }
 
 
@@ -153,13 +185,15 @@
       }
       if (ctx->Polygon.FrontMode == mode)
          return;
-      FLUSH_VERTICES(ctx, _NEW_POLYGON);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+      ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
       ctx->Polygon.FrontMode = mode;
       break;
    case GL_FRONT_AND_BACK:
       if (ctx->Polygon.FrontMode == mode && ctx->Polygon.BackMode == mode)
          return;
-      FLUSH_VERTICES(ctx, _NEW_POLYGON);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+      ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
       ctx->Polygon.FrontMode = mode;
       ctx->Polygon.BackMode = mode;
       break;
@@ -170,7 +204,8 @@
       }
       if (ctx->Polygon.BackMode == mode)
          return;
-      FLUSH_VERTICES(ctx, _NEW_POLYGON);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+      ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
       ctx->Polygon.BackMode = mode;
       break;
    default:
@@ -194,7 +229,9 @@
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glPolygonStipple\n");
 
-   FLUSH_VERTICES(ctx, _NEW_POLYGONSTIPPLE);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewPolygonStipple ? 0 :
+                                                      _NEW_POLYGONSTIPPLE);
+   ctx->NewDriverState |= ctx->DriverFlags.NewPolygonStipple;
 
    pattern = _mesa_map_validate_pbo_source(ctx, 2,
                                            &ctx->Unpack, 32, 32, 1,
@@ -252,7 +289,8 @@
        ctx->Polygon.OffsetClamp == clamp)
       return;
 
-   FLUSH_VERTICES(ctx, _NEW_POLYGON);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewPolygonState ? 0 : _NEW_POLYGON);
+   ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
    ctx->Polygon.OffsetFactor = factor;
    ctx->Polygon.OffsetUnits = units;
    ctx->Polygon.OffsetClamp = clamp;
@@ -318,7 +356,6 @@
    ctx->Polygon.CullFlag = GL_FALSE;
    ctx->Polygon.CullFaceMode = GL_BACK;
    ctx->Polygon.FrontFace = GL_CCW;
-   ctx->Polygon._FrontBit = 0;
    ctx->Polygon.FrontMode = GL_FILL;
    ctx->Polygon.BackMode = GL_FILL;
    ctx->Polygon.SmoothFlag = GL_FALSE;
diff --git a/src/mesa/main/polygon.h b/src/mesa/main/polygon.h
index 41344a2..39b7a73 100644
--- a/src/mesa/main/polygon.h
+++ b/src/mesa/main/polygon.h
@@ -39,11 +39,17 @@
 extern void GLAPIENTRY
 _mesa_GetnPolygonStippleARB( GLsizei bufSize, GLubyte *dest );
 
-extern void GLAPIENTRY
-_mesa_CullFace( GLenum mode );
+void GLAPIENTRY
+_mesa_CullFace_no_error(GLenum mode);
 
 extern void GLAPIENTRY
-_mesa_FrontFace( GLenum mode );
+_mesa_CullFace(GLenum mode);
+
+void GLAPIENTRY
+_mesa_FrontFace_no_error(GLenum mode);
+
+extern void GLAPIENTRY
+_mesa_FrontFace(GLenum mode);
 
 extern void GLAPIENTRY
 _mesa_PolygonMode( GLenum face, GLenum mode );
diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index e4edb51..46535d7 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -278,7 +278,7 @@
             q->EverBound = GL_TRUE;
          }
          ids[i] = first + i;
-         _mesa_HashInsert(ctx->Query.QueryObjects, first + i, q);
+         _mesa_HashInsertLocked(ctx->Query.QueryObjects, first + i, q);
       }
    }
 }
@@ -345,7 +345,7 @@
                q->Active = GL_FALSE;
                ctx->Driver.EndQuery(ctx, q);
             }
-            _mesa_HashRemove(ctx->Query.QueryObjects, ids[i]);
+            _mesa_HashRemoveLocked(ctx->Query.QueryObjects, ids[i]);
             ctx->Driver.DeleteQuery(ctx, q);
          }
       }
@@ -448,7 +448,7 @@
             _mesa_error(ctx, GL_OUT_OF_MEMORY, "glBeginQuery{Indexed}");
             return;
          }
-         _mesa_HashInsert(ctx->Query.QueryObjects, id, q);
+         _mesa_HashInsertLocked(ctx->Query.QueryObjects, id, q);
       }
    }
    else {
@@ -590,7 +590,7 @@
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glQueryCounter");
          return;
       }
-      _mesa_HashInsert(ctx->Query.QueryObjects, id, q);
+      _mesa_HashInsertLocked(ctx->Query.QueryObjects, id, q);
    }
    else {
       if (q->Target && q->Target != GL_TIMESTAMP) {
diff --git a/src/mesa/main/queryobj.h b/src/mesa/main/queryobj.h
index d1036fc..24a8257 100644
--- a/src/mesa/main/queryobj.h
+++ b/src/mesa/main/queryobj.h
@@ -35,7 +35,7 @@
 _mesa_lookup_query_object(struct gl_context *ctx, GLuint id)
 {
    return (struct gl_query_object *)
-      _mesa_HashLookup(ctx->Query.QueryObjects, id);
+      _mesa_HashLookupLocked(ctx->Query.QueryObjects, id);
 }
 
 
diff --git a/src/mesa/main/rastpos.c b/src/mesa/main/rastpos.c
index 4fddad1..43f6759 100644
--- a/src/mesa/main/rastpos.c
+++ b/src/mesa/main/rastpos.c
@@ -372,7 +372,7 @@
 void
 _mesa_RasterPos(struct gl_context *ctx, const GLfloat vObj[4])
 {
-   if (ctx->VertexProgram._Enabled) {
+   if (_mesa_arb_vertex_program_enabled(ctx)) {
       /* XXX implement this */
       _mesa_problem(ctx, "Vertex programs not implemented for glRasterPos");
       return;
diff --git a/src/mesa/main/renderbuffer.c b/src/mesa/main/renderbuffer.c
index b58b3f3..5046f9d 100644
--- a/src/mesa/main/renderbuffer.c
+++ b/src/mesa/main/renderbuffer.c
@@ -144,9 +144,9 @@
  * used with a freshly created renderbuffer.
  */
 void
-_mesa_add_renderbuffer_without_ref(struct gl_framebuffer *fb,
-                                   gl_buffer_index bufferName,
-                                   struct gl_renderbuffer *rb)
+_mesa_attach_and_own_rb(struct gl_framebuffer *fb,
+                        gl_buffer_index bufferName,
+                        struct gl_renderbuffer *rb)
 {
    assert(rb->RefCount == 1);
 
@@ -162,8 +162,9 @@
  * \param bufferName  one of the BUFFER_x tokens
  */
 void
-_mesa_add_renderbuffer(struct gl_framebuffer *fb,
-                       gl_buffer_index bufferName, struct gl_renderbuffer *rb)
+_mesa_attach_and_reference_rb(struct gl_framebuffer *fb,
+                              gl_buffer_index bufferName,
+                              struct gl_renderbuffer *rb)
 {
    validate_and_init_renderbuffer_attachment(fb, bufferName, rb);
    _mesa_reference_renderbuffer(&fb->Attachment[bufferName].Renderbuffer, rb);
diff --git a/src/mesa/main/renderbuffer.h b/src/mesa/main/renderbuffer.h
index a6f1439..f9a6462 100644
--- a/src/mesa/main/renderbuffer.h
+++ b/src/mesa/main/renderbuffer.h
@@ -47,13 +47,14 @@
 _mesa_delete_renderbuffer(struct gl_context *ctx, struct gl_renderbuffer *rb);
 
 extern void
-_mesa_add_renderbuffer_without_ref(struct gl_framebuffer *fb,
-                                   gl_buffer_index bufferName,
-                                   struct gl_renderbuffer *rb);
+_mesa_attach_and_own_rb(struct gl_framebuffer *fb,
+                        gl_buffer_index bufferName,
+                        struct gl_renderbuffer *rb);
 
 extern void
-_mesa_add_renderbuffer(struct gl_framebuffer *fb,
-                       gl_buffer_index bufferName, struct gl_renderbuffer *rb);
+_mesa_attach_and_reference_rb(struct gl_framebuffer *fb,
+                              gl_buffer_index bufferName,
+                              struct gl_renderbuffer *rb);
 
 extern void
 _mesa_remove_renderbuffer(struct gl_framebuffer *fb,
diff --git a/src/mesa/main/samplerobj.c b/src/mesa/main/samplerobj.c
index 183f1d2..26e7725 100644
--- a/src/mesa/main/samplerobj.c
+++ b/src/mesa/main/samplerobj.c
@@ -38,6 +38,7 @@
 #include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/samplerobj.h"
+#include "main/texturebindless.h"
 
 
 struct gl_sampler_object *
@@ -61,6 +62,7 @@
 delete_sampler_object(struct gl_context *ctx,
                       struct gl_sampler_object *sampObj)
 {
+   _mesa_delete_sampler_handles(ctx, sampObj);
    mtx_destroy(&sampObj->Mutex);
    free(sampObj->Label);
    free(sampObj);
@@ -97,16 +99,10 @@
    if (samp) {
       /* reference new sampler */
       mtx_lock(&samp->Mutex);
-      if (samp->RefCount == 0) {
-         /* this sampler's being deleted (look just above) */
-         /* Not sure this can every really happen.  Warn if it does. */
-         _mesa_problem(NULL, "referencing deleted sampler object");
-         *ptr = NULL;
-      }
-      else {
-         samp->RefCount++;
-         *ptr = samp;
-      }
+      assert(samp->RefCount > 0);
+
+      samp->RefCount++;
+      *ptr = samp;
       mtx_unlock(&samp->Mutex);
    }
 }
@@ -138,6 +134,10 @@
    sampObj->CompareFunc = GL_LEQUAL;
    sampObj->sRGBDecode = GL_DECODE_EXT;
    sampObj->CubeMapSeamless = GL_FALSE;
+   sampObj->HandleAllocated = GL_FALSE;
+
+   /* GL_ARB_bindless_texture */
+   _mesa_init_sampler_handles(sampObj);
 }
 
 /**
@@ -154,20 +154,11 @@
 }
 
 static void
-create_samplers(struct gl_context *ctx, GLsizei count, GLuint *samplers,
-                const char *caller)
+create_samplers(struct gl_context *ctx, GLsizei count, GLuint *samplers)
 {
    GLuint first;
    GLint i;
 
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "%s(%d)\n", caller, count);
-
-   if (count < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n<0)", caller);
-      return;
-   }
-
    if (!samplers)
       return;
 
@@ -186,18 +177,48 @@
    _mesa_HashUnlockMutex(ctx->Shared->SamplerObjects);
 }
 
+/* Checked wrapper for create_samplers(): GL_INVALID_VALUE if count < 0. */
+static void
+create_samplers_err(struct gl_context *ctx, GLsizei count, GLuint *samplers,
+                    const char *caller)
+{
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "%s(%d)\n", caller, count);
+
+   if (count >= 0) {
+      create_samplers(ctx, count, samplers);
+      return;
+   }
+
+   _mesa_error(ctx, GL_INVALID_VALUE, "%s(n<0)", caller);
+}
+
+void GLAPIENTRY
+_mesa_GenSamplers_no_error(GLsizei count, GLuint *samplers)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_samplers(ctx, count, samplers);
+}
+
 void GLAPIENTRY
 _mesa_GenSamplers(GLsizei count, GLuint *samplers)
 {
    GET_CURRENT_CONTEXT(ctx);
-   create_samplers(ctx, count, samplers, "glGenSamplers");
+   create_samplers_err(ctx, count, samplers, "glGenSamplers");
+}
+
+void GLAPIENTRY
+_mesa_CreateSamplers_no_error(GLsizei count, GLuint *samplers)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_samplers(ctx, count, samplers);
 }
 
 void GLAPIENTRY
 _mesa_CreateSamplers(GLsizei count, GLuint *samplers)
 {
    GET_CURRENT_CONTEXT(ctx);
-   create_samplers(ctx, count, samplers, "glCreateSamplers");
+   create_samplers_err(ctx, count, samplers, "glCreateSamplers");
 }
 
 
@@ -759,8 +780,19 @@
    if (samp->sRGBDecode == param)
       return GL_FALSE;
 
+   /* The EXT_texture_sRGB_decode spec says:
+    *
+    *    "INVALID_ENUM is generated if the <pname> parameter of
+    *     TexParameter[i,f,Ii,Iui][v][EXT],
+    *     MultiTexParameter[i,f,Ii,Iui][v]EXT,
+    *     TextureParameter[i,f,Ii,Iui][v]EXT, SamplerParameter[i,f,Ii,Iui][v]
+    *     is TEXTURE_SRGB_DECODE_EXT when the <param> parameter is not one of
+    *     DECODE_EXT or SKIP_DECODE_EXT."
+    *
+    * Returning INVALID_PARAM makes that happen.
+    */
    if (param != GL_DECODE_EXT && param != GL_SKIP_DECODE_EXT)
-      return INVALID_VALUE;
+      return INVALID_PARAM;
 
    flush(ctx);
    samp->sRGBDecode = param;
@@ -769,7 +801,7 @@
 
 static struct gl_sampler_object *
 sampler_parameter_error_check(struct gl_context *ctx, GLuint sampler,
-                              const char *name)
+                              bool get, const char *name)
 {
    struct gl_sampler_object *sampObj;
 
@@ -786,6 +818,17 @@
       return NULL;
    }
 
+   if (!get && sampObj->HandleAllocated) {
+      /* The ARB_bindless_texture spec says:
+       *
+       * "The error INVALID_OPERATION is generated by SamplerParameter* if
+       *  <sampler> identifies a sampler object referenced by one or more
+       *  texture handles."
+       */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(immutable sampler)", name);
+      return NULL;
+   }
+
    return sampObj;
 }
 
@@ -796,7 +839,7 @@
    GLuint res;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, false,
                                            "glSamplerParameteri");
    if (!sampObj)
       return;
@@ -879,7 +922,7 @@
    GLuint res;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, false,
                                            "glSamplerParameterf");
    if (!sampObj)
       return;
@@ -961,7 +1004,7 @@
    GLuint res;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, false,
                                            "glSamplerParameteriv");
    if (!sampObj)
       return;
@@ -1051,7 +1094,7 @@
    GLuint res;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, false,
                                            "glSamplerParameterfv");
    if (!sampObj)
       return;
@@ -1134,7 +1177,7 @@
    GLuint res;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, false,
                                            "glSamplerParameterIiv");
    if (!sampObj)
       return;
@@ -1218,7 +1261,7 @@
    GLuint res;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, false,
                                            "glSamplerParameterIuiv");
    if (!sampObj)
       return;
@@ -1301,7 +1344,7 @@
    struct gl_sampler_object *sampObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, true,
                                            "glGetSamplerParameteriv");
    if (!sampObj)
       return;
@@ -1385,7 +1428,7 @@
    struct gl_sampler_object *sampObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, true,
                                            "glGetSamplerParameterfv");
    if (!sampObj)
       return;
@@ -1457,7 +1500,7 @@
    struct gl_sampler_object *sampObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, true,
                                            "glGetSamplerParameterIiv");
    if (!sampObj)
       return;
@@ -1529,7 +1572,7 @@
    struct gl_sampler_object *sampObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   sampObj = sampler_parameter_error_check(ctx, sampler,
+   sampObj = sampler_parameter_error_check(ctx, sampler, true,
                                            "glGetSamplerParameterIuiv");
    if (!sampObj)
       return;
diff --git a/src/mesa/main/samplerobj.h b/src/mesa/main/samplerobj.h
index 8e9539d..26b5dd6 100644
--- a/src/mesa/main/samplerobj.h
+++ b/src/mesa/main/samplerobj.h
@@ -96,7 +96,14 @@
                    struct gl_sampler_object *sampObj);
 
 void GLAPIENTRY
+_mesa_GenSamplers_no_error(GLsizei count, GLuint *samplers);
+
+void GLAPIENTRY
 _mesa_GenSamplers(GLsizei count, GLuint *samplers);
+
+void GLAPIENTRY
+_mesa_CreateSamplers_no_error(GLsizei count, GLuint *samplers);
+
 void GLAPIENTRY
 _mesa_CreateSamplers(GLsizei count, GLuint *samplers);
 void GLAPIENTRY
diff --git a/src/mesa/main/scissor.c b/src/mesa/main/scissor.c
index 631ea4d..b38db06 100644
--- a/src/mesa/main/scissor.c
+++ b/src/mesa/main/scissor.c
@@ -48,29 +48,19 @@
        height == ctx->Scissor.ScissorArray[idx].Height)
       return;
 
-   FLUSH_VERTICES(ctx, _NEW_SCISSOR);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewScissorRect ? 0 : _NEW_SCISSOR);
+   ctx->NewDriverState |= ctx->DriverFlags.NewScissorRect;
+
    ctx->Scissor.ScissorArray[idx].X = x;
    ctx->Scissor.ScissorArray[idx].Y = y;
    ctx->Scissor.ScissorArray[idx].Width = width;
    ctx->Scissor.ScissorArray[idx].Height = height;
 }
 
-/**
- * Called via glScissor
- */
-void GLAPIENTRY
-_mesa_Scissor( GLint x, GLint y, GLsizei width, GLsizei height )
+static void
+scissor(struct gl_context *ctx, GLint x, GLint y, GLsizei width, GLsizei height)
 {
    unsigned i;
-   GET_CURRENT_CONTEXT(ctx);
-
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glScissor %d %d %d %d\n", x, y, width, height);
-
-   if (width < 0 || height < 0) {
-      _mesa_error( ctx, GL_INVALID_VALUE, "glScissor" );
-      return;
-   }
 
    /* The GL_ARB_viewport_array spec says:
     *
@@ -91,6 +81,32 @@
       ctx->Driver.Scissor(ctx);
 }
 
+/**
+ * Called via glScissor
+ */
+void GLAPIENTRY
+_mesa_Scissor_no_error(GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   scissor(ctx, x, y, width, height);
+}
+
+/* Validating glScissor entry point: negative extents are GL_INVALID_VALUE. */
+void GLAPIENTRY
+_mesa_Scissor(GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glScissor %d %d %d %d\n", x, y, width, height);
+
+   if (width >= 0 && height >= 0)
+      scissor(ctx, x, y, width, height);
+   else
+      _mesa_error(ctx, GL_INVALID_VALUE, "glScissor");
+}
+
+
 
 /**
  * Define the scissor box.
@@ -115,6 +131,19 @@
       ctx->Driver.Scissor(ctx);
 }
 
+/* Set scissor boxes [first, first + count) from rect; notify driver once. */
+static void
+scissor_array(struct gl_context *ctx, GLuint first, GLsizei count,
+              const struct gl_scissor_rect *rect)
+{
+   for (GLsizei i = 0; i < count; i++) {
+      set_scissor_no_notify(ctx, i + first, rect[i].X, rect[i].Y,
+                            rect[i].Width, rect[i].Height);
+   }
+   if (ctx->Driver.Scissor)
+      ctx->Driver.Scissor(ctx);
+}
+
 /**
  * Define count scissor boxes starting at index.
  *
@@ -127,6 +156,15 @@
  * Verifies the parameters and call set_scissor_no_notify to do the work.
  */
 void GLAPIENTRY
+_mesa_ScissorArrayv_no_error(GLuint first, GLsizei count, const GLint *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_scissor_rect *p = (struct gl_scissor_rect *)v;
+   scissor_array(ctx, first, count, p);
+}
+
+void GLAPIENTRY
 _mesa_ScissorArrayv(GLuint first, GLsizei count, const GLint *v)
 {
    int i;
@@ -150,12 +188,7 @@
       }
    }
 
-   for (i = 0; i < count; i++)
-      set_scissor_no_notify(ctx, i + first,
-                            p[i].X, p[i].Y, p[i].Width, p[i].Height);
-
-   if (ctx->Driver.Scissor)
-      ctx->Driver.Scissor(ctx);
+   scissor_array(ctx, first, count, p);
 }
 
 /**
@@ -169,11 +202,10 @@
  * Verifies the parameters call set_scissor_no_notify to do the work.
  */
 static void
-ScissorIndexed(GLuint index, GLint left, GLint bottom,
-               GLsizei width, GLsizei height, const char *function)
+scissor_indexed_err(struct gl_context *ctx, GLuint index, GLint left,
+                    GLint bottom, GLsizei width, GLsizei height,
+                    const char *function)
 {
-   GET_CURRENT_CONTEXT(ctx);
-
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "%s(%d, %d, %d, %d, %d)\n",
                   function, index, left, bottom, width, height);
@@ -192,23 +224,39 @@
       return;
    }
 
-   set_scissor_no_notify(ctx, index, left, bottom, width, height);
+   _mesa_set_scissor(ctx, index, left, bottom, width, height);
+}
 
-   if (ctx->Driver.Scissor)
-      ctx->Driver.Scissor(ctx);
+void GLAPIENTRY
+_mesa_ScissorIndexed_no_error(GLuint index, GLint left, GLint bottom,
+                              GLsizei width, GLsizei height)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_set_scissor(ctx, index, left, bottom, width, height);
 }
 
 void GLAPIENTRY
 _mesa_ScissorIndexed(GLuint index, GLint left, GLint bottom,
                      GLsizei width, GLsizei height)
 {
-   ScissorIndexed(index, left, bottom, width, height, "glScissorIndexed");
+   GET_CURRENT_CONTEXT(ctx);
+   scissor_indexed_err(ctx, index, left, bottom, width, height,
+                       "glScissorIndexed");
+}
+
+void GLAPIENTRY
+_mesa_ScissorIndexedv_no_error(GLuint index, const GLint *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_set_scissor(ctx, index, v[0], v[1], v[2], v[3]);
 }
 
 void GLAPIENTRY
 _mesa_ScissorIndexedv(GLuint index, const GLint *v)
 {
-   ScissorIndexed(index, v[0], v[1], v[2], v[3], "glScissorIndexedv");
+   GET_CURRENT_CONTEXT(ctx);
+   scissor_indexed_err(ctx, index, v[0], v[1], v[2], v[3],
+                       "glScissorIndexedv");
 }
 
 void GLAPIENTRY
@@ -253,14 +301,13 @@
       box += 4;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_SCISSOR);
+   FLUSH_VERTICES(ctx, 0);
+   ctx->NewDriverState |= ctx->DriverFlags.NewWindowRectangles;
+
    memcpy(ctx->Scissor.WindowRects, newval,
           sizeof(struct gl_scissor_rect) * count);
    ctx->Scissor.NumWindowRects = count;
    ctx->Scissor.WindowRectMode = mode;
-
-   if (ctx->Driver.Scissor)
-      ctx->Driver.Scissor(ctx);
 }
 
 
diff --git a/src/mesa/main/scissor.h b/src/mesa/main/scissor.h
index 1d0fac8..264873e 100644
--- a/src/mesa/main/scissor.h
+++ b/src/mesa/main/scissor.h
@@ -31,15 +31,28 @@
 
 struct gl_context;
 
+void GLAPIENTRY
+_mesa_Scissor_no_error(GLint x, GLint y, GLsizei width, GLsizei height);
+
 extern void GLAPIENTRY
 _mesa_Scissor( GLint x, GLint y, GLsizei width, GLsizei height );
 
+void GLAPIENTRY
+_mesa_ScissorArrayv_no_error(GLuint first, GLsizei count, const GLint * v);
+
 extern void GLAPIENTRY
 _mesa_ScissorArrayv(GLuint first, GLsizei count, const GLint * v);
 
+void GLAPIENTRY
+_mesa_ScissorIndexed_no_error(GLuint index, GLint left, GLint bottom,
+                              GLsizei width, GLsizei height);
+
 extern void GLAPIENTRY
 _mesa_ScissorIndexed(GLuint index, GLint left, GLint bottom, GLsizei width, GLsizei height);
 
+void GLAPIENTRY
+_mesa_ScissorIndexedv_no_error(GLuint index, const GLint * v);
+
 extern void GLAPIENTRY
 _mesa_ScissorIndexedv(GLuint index, const GLint * v);
 
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 6efbc37..64e68b4 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -37,7 +37,7 @@
 #include "compiler/glsl/glsl_symbol_table.h"
 #include "compiler/glsl/ir.h"
 #include "compiler/glsl/program.h"
-#include "util/string_to_uint_map.h"
+#include "compiler/glsl/string_to_uint_map.h"
 
 
 static GLint
@@ -62,30 +62,26 @@
 DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_buffer);
 DECL_RESOURCE_FUNC(SUB, gl_subroutine_function);
 
-void GLAPIENTRY
-_mesa_BindAttribLocation(GLuint program, GLuint index,
-                         const GLchar *name)
+static void
+bind_attrib_location(struct gl_context *ctx,
+                     struct gl_shader_program *const shProg, GLuint index,
+                     const GLchar *name, bool no_error)
 {
-   GET_CURRENT_CONTEXT(ctx);
-
-   struct gl_shader_program *const shProg =
-      _mesa_lookup_shader_program_err(ctx, program, "glBindAttribLocation");
-   if (!shProg)
-      return;
-
    if (!name)
       return;
 
-   if (strncmp(name, "gl_", 3) == 0) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glBindAttribLocation(illegal name)");
-      return;
-   }
+   if (!no_error) {
+      if (strncmp(name, "gl_", 3) == 0) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glBindAttribLocation(illegal name)");
+         return;
+      }
 
-   if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glBindAttribLocation(%u >= %u)",
-                  index, ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs);
-      return;
+      if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
+         _mesa_error(ctx, GL_INVALID_VALUE, "glBindAttribLocation(%u >= %u)",
+                     index, ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs);
+         return;
+      }
    }
 
    /* Replace the current value if it's already in the list.  Add
@@ -101,6 +97,31 @@
 }
 
 void GLAPIENTRY
+_mesa_BindAttribLocation_no_error(GLuint program, GLuint index,
+                                  const GLchar *name)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_shader_program *const shProg =
+      _mesa_lookup_shader_program(ctx, program);
+   bind_attrib_location(ctx, shProg, index, name, true);
+}
+
+void GLAPIENTRY
+_mesa_BindAttribLocation(GLuint program, GLuint index,
+                         const GLchar *name)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_shader_program *const shProg =
+      _mesa_lookup_shader_program_err(ctx, program, "glBindAttribLocation");
+   if (!shProg)
+      return;
+
+   bind_attrib_location(ctx, shProg, index, name, false);
+}
+
+void GLAPIENTRY
 _mesa_GetActiveAttrib(GLuint program, GLuint desired_index,
                       GLsizei maxLength, GLsizei * length, GLint * size,
                       GLenum * type, GLchar * name)
@@ -233,6 +254,24 @@
    return longest;
 }
 
+/**
+ * Store colorNumber (offset by FRAG_RESULT_DATA0) and index under \p name
+ * in shProg's binding tables.  Note that this binding won't go into effect
+ * until glLinkProgram is called again.
+ */
+static void
+bind_frag_data_location(struct gl_shader_program *const shProg,
+                        const char *name, unsigned colorNumber,
+                        unsigned index)
+{
+   /* Replace the current value if it's already in the list.  Add
+    * FRAG_RESULT_DATA0 because that's how the linker differentiates
+    * between built-in attributes and user-defined attributes.
+    */
+   shProg->FragDataBindings->put(colorNumber + FRAG_RESULT_DATA0, name);
+   shProg->FragDataIndexBindings->put(index, name);
+}
+
 void GLAPIENTRY
 _mesa_BindFragDataLocation(GLuint program, GLuint colorNumber,
 			   const GLchar *name)
@@ -241,6 +280,21 @@
 }
 
 void GLAPIENTRY
+_mesa_BindFragDataLocation_no_error(GLuint program, GLuint colorNumber,
+                                    const GLchar *name)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!name)
+      return;
+
+   struct gl_shader_program *const shProg =
+      _mesa_lookup_shader_program(ctx, program);
+
+   bind_frag_data_location(shProg, name, colorNumber, 0);
+}
+
+void GLAPIENTRY
 _mesa_BindFragDataLocationIndexed(GLuint program, GLuint colorNumber,
                                   GLuint index, const GLchar *name)
 {
@@ -274,17 +328,22 @@
       return;
    }
 
-   /* Replace the current value if it's already in the list.  Add
-    * FRAG_RESULT_DATA0 because that's how the linker differentiates
-    * between built-in attributes and user-defined attributes.
-    */
-   shProg->FragDataBindings->put(colorNumber + FRAG_RESULT_DATA0, name);
-   shProg->FragDataIndexBindings->put(index, name);
-   /*
-    * Note that this binding won't go into effect until
-    * glLinkProgram is called again.
-    */
+   bind_frag_data_location(shProg, name, colorNumber, index);
+}
 
+void GLAPIENTRY
+_mesa_BindFragDataLocationIndexed_no_error(GLuint program, GLuint colorNumber,
+                                           GLuint index, const GLchar *name)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!name)
+      return;
+
+   struct gl_shader_program *const shProg =
+      _mesa_lookup_shader_program(ctx, program);
+
+   bind_frag_data_location(shProg, name, colorNumber, index);
 }
 
 GLint GLAPIENTRY
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 187475f..84189f0 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -77,6 +77,8 @@
          flags |= GLSL_DUMP;
       if (strstr(env, "log"))
          flags |= GLSL_LOG;
+      if (strstr(env, "cache_fb"))
+         flags |= GLSL_CACHE_FALLBACK;
       if (strstr(env, "cache_info"))
          flags |= GLSL_CACHE_INFO;
       if (strstr(env, "nopvert"))
@@ -138,8 +140,6 @@
 
    /* Extended for ARB_separate_shader_objects */
    ctx->Shader.RefCount = 1;
-   mtx_init(&ctx->Shader.Mutex, mtx_plain);
-
    ctx->TessCtrlProgram.patch_vertices = 3;
    for (i = 0; i < 4; ++i)
       ctx->TessCtrlProgram.patch_default_outer_level[i] = 1.0;
@@ -157,14 +157,12 @@
    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
       _mesa_reference_program(ctx, &ctx->Shader.CurrentProgram[i], NULL);
    }
-   _mesa_reference_program(ctx, &ctx->Shader._CurrentFragmentProgram, NULL);
    _mesa_reference_shader_program(ctx, &ctx->Shader.ActiveProgram, NULL);
 
    /* Extended for ARB_separate_shader_objects */
    _mesa_reference_pipeline_object(ctx, &ctx->_Shader, NULL);
 
    assert(ctx->Shader.RefCount == 1);
-   mtx_destroy(&ctx->Shader.Mutex);
 }
 
 
@@ -246,50 +244,11 @@
  * Attach shader to a shader program.
  */
 static void
-attach_shader(struct gl_context *ctx, GLuint program, GLuint shader)
+attach_shader(struct gl_context *ctx, struct gl_shader_program *shProg,
+              struct gl_shader *sh)
 {
-   struct gl_shader_program *shProg;
-   struct gl_shader *sh;
-   GLuint i, n;
+   GLuint n = shProg->NumShaders;
 
-   const bool same_type_disallowed = _mesa_is_gles(ctx);
-
-   shProg = _mesa_lookup_shader_program_err(ctx, program, "glAttachShader");
-   if (!shProg)
-      return;
-
-   sh = _mesa_lookup_shader_err(ctx, shader, "glAttachShader");
-   if (!sh) {
-      return;
-   }
-
-   n = shProg->NumShaders;
-   for (i = 0; i < n; i++) {
-      if (shProg->Shaders[i] == sh) {
-         /* The shader is already attched to this program.  The
-          * GL_ARB_shader_objects spec says:
-          *
-          *     "The error INVALID_OPERATION is generated by AttachObjectARB
-          *     if <obj> is already attached to <containerObj>."
-          */
-         _mesa_error(ctx, GL_INVALID_OPERATION, "glAttachShader");
-         return;
-      } else if (same_type_disallowed &&
-                 shProg->Shaders[i]->Stage == sh->Stage) {
-        /* Shader with the same type is already attached to this program,
-         * OpenGL ES 2.0 and 3.0 specs say:
-         *
-         *      "Multiple shader objects of the same type may not be attached
-         *      to a single program object. [...] The error INVALID_OPERATION
-         *      is generated if [...] another shader object of the same type
-         *      as shader is already attached to program."
-         */
-         _mesa_error(ctx, GL_INVALID_OPERATION, "glAttachShader");
-         return;
-      }
-   }
-
-   /* grow list */
    shProg->Shaders = realloc(shProg->Shaders,
                              (n + 1) * sizeof(struct gl_shader *));
    if (!shProg->Shaders) {
@@ -303,6 +262,65 @@
    shProg->NumShaders++;
 }
 
+static void
+attach_shader_err(struct gl_context *ctx, GLuint program, GLuint shader,
+                  const char *caller)
+{  /* Validating attach; 'caller' is the name reported in GL errors. */
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+   GLuint i, n;
+
+   const bool same_type_disallowed = _mesa_is_gles(ctx);
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, caller);
+   if (!shProg)
+      return;
+
+   sh = _mesa_lookup_shader_err(ctx, shader, caller);
+   if (!sh) {
+      return;
+   }
+
+   n = shProg->NumShaders;
+   for (i = 0; i < n; i++) {
+      if (shProg->Shaders[i] == sh) {
+         /* The shader is already attached to this program.  The
+          * GL_ARB_shader_objects spec says:
+          *
+          *     "The error INVALID_OPERATION is generated by AttachObjectARB
+          *     if <obj> is already attached to <containerObj>."
+          */
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s", caller);
+         return;
+      } else if (same_type_disallowed &&
+                 shProg->Shaders[i]->Stage == sh->Stage) {
+        /* Shader with the same type is already attached to this program,
+         * OpenGL ES 2.0 and 3.0 specs say:
+         *
+         *      "Multiple shader objects of the same type may not be attached
+         *      to a single program object. [...] The error INVALID_OPERATION
+         *      is generated if [...] another shader object of the same type
+         *      as shader is already attached to program."
+         */
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s", caller);
+         return;
+      }
+   }
+
+   attach_shader(ctx, shProg, sh);   /* reached only when no check failed */
+}
+
+static void
+attach_shader_no_error(struct gl_context *ctx, GLuint program, GLuint shader)
+{  /* Attaches without validation; neither lookup result is checked. */
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+
+   shProg = _mesa_lookup_shader_program(ctx, program);   /* unchecked */
+   sh = _mesa_lookup_shader(ctx, shader);                /* unchecked */
+
+   attach_shader(ctx, shProg, sh);
+}
 
 static GLuint
 create_shader(struct gl_context *ctx, GLenum type)
@@ -310,12 +328,6 @@
    struct gl_shader *sh;
    GLuint name;
 
-   if (!_mesa_validate_shader_target(ctx, type)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "CreateShader(%s)",
-                  _mesa_enum_to_string(type));
-      return 0;
-   }
-
    _mesa_HashLockMutex(ctx->Shared->ShaderObjects);
    name = _mesa_HashFindFreeKeyBlock(ctx->Shared->ShaderObjects, 1);
    sh = _mesa_new_shader(name, _mesa_shader_enum_to_shader_stage(type));
@@ -328,6 +340,19 @@
 
 
 static GLuint
+create_shader_err(struct gl_context *ctx, GLenum type, const char *caller)
+{  /* Validates 'type' before creating; 'caller' prefixes the error text. */
+   if (!_mesa_validate_shader_target(ctx, type)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(%s)",
+                  caller, _mesa_enum_to_string(type));
+      return 0;   /* no shader is created for a rejected type */
+   }
+
+   return create_shader(ctx, type);
+}
+
+
+static GLuint
 create_shader_program(struct gl_context *ctx)
 {
    GLuint name;
@@ -1246,54 +1271,6 @@
 }
 
 
-static void
-use_program(struct gl_context *ctx, gl_shader_stage stage,
-            struct gl_shader_program *shProg, struct gl_program *new_prog,
-            struct gl_pipeline_object *shTarget)
-{
-   struct gl_program **target;
-
-   target = &shTarget->CurrentProgram[stage];
-   if (new_prog) {
-      _mesa_program_init_subroutine_defaults(ctx, new_prog);
-   }
-
-   if (*target != new_prog) {
-      /* Program is current, flush it */
-      if (shTarget == ctx->_Shader) {
-         FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
-      }
-
-      /* If the shader is also bound as the current rendering shader, unbind
-       * it from that binding point as well.  This ensures that the correct
-       * semantics of glDeleteProgram are maintained.
-       */
-      switch (stage) {
-      case MESA_SHADER_VERTEX:
-      case MESA_SHADER_TESS_CTRL:
-      case MESA_SHADER_TESS_EVAL:
-      case MESA_SHADER_GEOMETRY:
-      case MESA_SHADER_COMPUTE:
-         /* Empty for now. */
-         break;
-      case MESA_SHADER_FRAGMENT:
-         if (*target == ctx->_Shader->_CurrentFragmentProgram) {
-	    _mesa_reference_program(ctx,
-                                    &ctx->_Shader->_CurrentFragmentProgram,
-                                    NULL);
-	 }
-	 break;
-      }
-
-      _mesa_reference_shader_program(ctx,
-                                     &shTarget->ReferencedPrograms[stage],
-                                     shProg);
-      _mesa_reference_program(ctx, target, new_prog);
-      return;
-   }
-}
-
-
 /**
  * Use the named shader program for subsequent rendering.
  */
@@ -1305,7 +1282,7 @@
       struct gl_program *new_prog = NULL;
       if (shProg && shProg->_LinkedShaders[i])
          new_prog = shProg->_LinkedShaders[i]->Program;
-      use_program(ctx, i, shProg, new_prog, &ctx->Shader);
+      _mesa_use_program(ctx, i, shProg, new_prog, &ctx->Shader);
    }
    _mesa_active_program(ctx, shProg, "glUseProgram");
 }
@@ -1375,12 +1352,27 @@
 }
 
 
+void GLAPIENTRY
+_mesa_AttachObjectARB_no_error(GLhandleARB program, GLhandleARB shader)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   attach_shader_no_error(ctx, program, shader);
+}
+
 
 void GLAPIENTRY
 _mesa_AttachObjectARB(GLhandleARB program, GLhandleARB shader)
 {
    GET_CURRENT_CONTEXT(ctx);
-   attach_shader(ctx, program, shader);
+   attach_shader_err(ctx, program, shader, "glAttachObjectARB");
+}
+
+
+void GLAPIENTRY
+_mesa_AttachShader_no_error(GLuint program, GLuint shader)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   attach_shader_no_error(ctx, program, shader);
 }
 
 
@@ -1388,7 +1380,7 @@
 _mesa_AttachShader(GLuint program, GLuint shader)
 {
    GET_CURRENT_CONTEXT(ctx);
-   attach_shader(ctx, program, shader);
+   attach_shader_err(ctx, program, shader, "glAttachShader");
 }
 
 
@@ -1404,11 +1396,29 @@
 
 
 GLuint GLAPIENTRY
+_mesa_CreateShader_no_error(GLenum type)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   return create_shader(ctx, type);
+}
+
+
+GLuint GLAPIENTRY
 _mesa_CreateShader(GLenum type)
 {
    GET_CURRENT_CONTEXT(ctx);
+
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glCreateShader %s\n", _mesa_enum_to_string(type));
+
+   return create_shader_err(ctx, type, "glCreateShader");
+}
+
+
+GLhandleARB GLAPIENTRY
+_mesa_CreateShaderObjectARB_no_error(GLenum type)
+{
+   GET_CURRENT_CONTEXT(ctx);
    return create_shader(ctx, type);
 }
 
@@ -1417,7 +1427,7 @@
 _mesa_CreateShaderObjectARB(GLenum type)
 {
    GET_CURRENT_CONTEXT(ctx);
-   return create_shader(ctx, type);
+   return create_shader_err(ctx, type, "glCreateShaderObjectARB");
 }
 
 
@@ -1855,40 +1865,44 @@
 }
 
 
-void GLAPIENTRY
-_mesa_UseProgram(GLuint program)
+static ALWAYS_INLINE void
+use_program(GLuint program, bool no_error)
 {
    GET_CURRENT_CONTEXT(ctx);
-   struct gl_shader_program *shProg;
+   struct gl_shader_program *shProg = NULL;
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glUseProgram %u\n", program);
 
-   if (_mesa_is_xfb_active_and_unpaused(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glUseProgram(transform feedback active)");
-      return;
-   }
-
-   if (program) {
-      shProg = _mesa_lookup_shader_program_err(ctx, program, "glUseProgram");
-      if (!shProg) {
-         return;
+   if (no_error) {
+      if (program) {
+         shProg = _mesa_lookup_shader_program(ctx, program);
       }
-      if (!shProg->data->LinkStatus) {
+   } else {
+      if (_mesa_is_xfb_active_and_unpaused(ctx)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glUseProgram(program %u not linked)", program);
+                     "glUseProgram(transform feedback active)");
          return;
       }
 
-      /* debug code */
-      if (ctx->_Shader->Flags & GLSL_USE_PROG) {
-         print_shader_info(shProg);
+      if (program) {
+         shProg =
+            _mesa_lookup_shader_program_err(ctx, program, "glUseProgram");
+         if (!shProg)
+            return;
+
+         if (!shProg->data->LinkStatus) {
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "glUseProgram(program %u not linked)", program);
+            return;
+         }
+
+         /* debug code */
+         if (ctx->_Shader->Flags & GLSL_USE_PROG) {
+            print_shader_info(shProg);
+         }
       }
    }
-   else {
-      shProg = NULL;
-   }
 
    /* The ARB_separate_shader_object spec says:
     *
@@ -1899,7 +1913,7 @@
     *     object (section 2.14.PPO), the program bound to the appropriate
     *     stage of the pipeline object is considered current."
     */
-   if (program) {
+   if (shProg) {
       /* Attach shader state to the binding point */
       _mesa_reference_pipeline_object(ctx, &ctx->_Shader, &ctx->Shader);
       /* Update the program */
@@ -1908,16 +1922,34 @@
       /* Must be done first: detach the progam */
       _mesa_use_shader_program(ctx, shProg);
       /* Unattach shader_state binding point */
-      _mesa_reference_pipeline_object(ctx, &ctx->_Shader, ctx->Pipeline.Default);
+      _mesa_reference_pipeline_object(ctx, &ctx->_Shader,
+                                      ctx->Pipeline.Default);
       /* If a pipeline was bound, rebind it */
       if (ctx->Pipeline.Current) {
-         _mesa_BindProgramPipeline(ctx->Pipeline.Current->Name);
+         if (no_error)
+            _mesa_BindProgramPipeline_no_error(ctx->Pipeline.Current->Name);
+         else
+            _mesa_BindProgramPipeline(ctx->Pipeline.Current->Name);
       }
    }
 }
 
 
 void GLAPIENTRY
+_mesa_UseProgram_no_error(GLuint program)
+{
+   use_program(program, true);   /* no_error = true */
+}
+
+
+void GLAPIENTRY
+_mesa_UseProgram(GLuint program)
+{
+   use_program(program, false);   /* no_error = false */
+}
+
+
+void GLAPIENTRY
 _mesa_ValidateProgram(GLuint program)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -2192,7 +2224,26 @@
                   struct gl_shader_program *shProg, struct gl_program *prog,
                   struct gl_pipeline_object *shTarget)
 {
-   use_program(ctx, stage, shProg, prog, shTarget);
+   struct gl_program **target;
+
+   target = &shTarget->CurrentProgram[stage];
+   if (prog) {
+      _mesa_program_init_subroutine_defaults(ctx, prog);
+   }
+
+   if (*target != prog) {
+      /* Program is current, flush it */
+      if (shTarget == ctx->_Shader) {
+         FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+      }
+
+      _mesa_reference_shader_program(ctx,
+                                     &shTarget->ReferencedPrograms[stage],
+                                     shProg);
+      _mesa_reference_program(ctx, target, prog);
+      return;
+   }
+
 }
 
 
@@ -2242,7 +2293,7 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   const GLuint shader = create_shader(ctx, type);
+   const GLuint shader = create_shader_err(ctx, type, "glCreateShaderProgramv");
    GLuint program = 0;
 
    /*
@@ -2271,7 +2322,7 @@
 
 	 get_shaderiv(ctx, shader, GL_COMPILE_STATUS, &compiled);
 	 if (compiled) {
-	    attach_shader(ctx, program, shader);
+	    attach_shader_err(ctx, program, shader, "glCreateShaderProgramv");
 	    _mesa_link_program(ctx, shProg);
 	    detach_shader(ctx, program, shader);
 
@@ -2363,11 +2414,6 @@
    GLenum resource_type;
    gl_shader_stage stage;
 
-   if (!_mesa_has_ARB_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
-      return -1;
-   }
-
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return -1;
@@ -2398,11 +2444,6 @@
    GLenum resource_type;
    gl_shader_stage stage;
 
-   if (!_mesa_has_ARB_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
-      return -1;
-   }
-
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return -1;
@@ -2442,11 +2483,6 @@
    GLenum resource_type;
    int count, i, j;
 
-   if (!_mesa_has_ARB_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
-      return;
-   }
-
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
@@ -2529,11 +2565,6 @@
    GLenum resource_type;
    gl_shader_stage stage;
 
-   if (!_mesa_has_ARB_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
-      return;
-   }
-
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
@@ -2568,11 +2599,6 @@
    GLenum resource_type;
    gl_shader_stage stage;
 
-   if (!_mesa_has_ARB_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
-      return;
-   }
-
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
@@ -2602,11 +2628,6 @@
    gl_shader_stage stage;
    int i;
 
-   if (!_mesa_has_ARB_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
-      return;
-   }
-
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
@@ -2625,6 +2646,7 @@
    }
 
    i = 0;
+   bool flushed = false;
    do {
       struct gl_uniform_storage *uni = p->sh.SubroutineUniformRemapTable[i];
       if (uni == NULL) {
@@ -2632,6 +2654,11 @@
          continue;
       }
 
+      if (!flushed) {
+         _mesa_flush_vertices_for_uniforms(ctx, uni);
+         flushed = true;
+      }
+
       int uni_count = uni->array_elements ? uni->array_elements : 1;
       int j, k, f;
 
@@ -2664,8 +2691,6 @@
       }
       i += uni_count;
    } while(i < count);
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
 }
 
 
@@ -2677,11 +2702,6 @@
    const char *api_name = "glGetUniformSubroutineuiv";
    gl_shader_stage stage;
 
-   if (!_mesa_has_ARB_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
-      return;
-   }
-
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
@@ -2713,11 +2733,6 @@
    struct gl_linked_shader *sh;
    gl_shader_stage stage;
 
-   if (!_mesa_has_ARB_shader_subroutine(ctx)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
-      return;
-   }
-
    if (!_mesa_validate_shader_target(ctx, shadertype)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
       return;
diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h
index 99b4fe8..b7ba7ab 100644
--- a/src/mesa/main/shaderapi.h
+++ b/src/mesa/main/shaderapi.h
@@ -73,6 +73,10 @@
 extern void
 _mesa_shader_write_subroutine_indices(struct gl_context *ctx,
                                       gl_shader_stage stage);
+
+void GLAPIENTRY
+_mesa_AttachObjectARB_no_error(GLhandleARB, GLhandleARB);
+
 extern void GLAPIENTRY
 _mesa_AttachObjectARB(GLhandleARB, GLhandleARB);
 
@@ -82,6 +86,9 @@
 extern GLhandleARB GLAPIENTRY
 _mesa_CreateProgramObjectARB(void);
 
+GLhandleARB GLAPIENTRY
+_mesa_CreateShaderObjectARB_no_error(GLenum type);
+
 extern GLhandleARB GLAPIENTRY
 _mesa_CreateShaderObjectARB(GLenum type);
 
@@ -127,6 +134,8 @@
 extern void GLAPIENTRY
 _mesa_ShaderSource(GLuint, GLsizei, const GLchar* const *, const GLint *);
 
+void GLAPIENTRY
+_mesa_UseProgram_no_error(GLuint);
 extern void GLAPIENTRY
 _mesa_UseProgram(GLuint);
 
@@ -134,6 +143,9 @@
 _mesa_ValidateProgram(GLuint);
 
 
+void GLAPIENTRY
+_mesa_BindAttribLocation_no_error(GLuint program, GLuint, const GLchar *);
+
 extern void GLAPIENTRY
 _mesa_BindAttribLocation(GLuint program, GLuint, const GLchar *);
 
@@ -146,17 +158,29 @@
                                   GLuint index, const GLchar *name);
 
 extern void GLAPIENTRY
+_mesa_BindFragDataLocation_no_error(GLuint program, GLuint colorNumber,
+                                    const GLchar *name);
+
+extern void GLAPIENTRY
+_mesa_BindFragDataLocationIndexed_no_error(GLuint program, GLuint colorNumber,
+                                           GLuint index, const GLchar *name);
+
+extern void GLAPIENTRY
 _mesa_GetActiveAttrib(GLuint, GLuint, GLsizei, GLsizei *, GLint *,
                          GLenum *, GLchar *);
 
 extern GLint GLAPIENTRY
 _mesa_GetAttribLocation(GLuint, const GLchar *);
 
-
+void GLAPIENTRY
+_mesa_AttachShader_no_error(GLuint program, GLuint shader);
 
 extern void GLAPIENTRY
 _mesa_AttachShader(GLuint program, GLuint shader);
 
+GLuint GLAPIENTRY
+_mesa_CreateShader_no_error(GLenum);
+
 extern GLuint GLAPIENTRY
 _mesa_CreateShader(GLenum);
 
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index 45b72c9..c722b32 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -477,6 +477,13 @@
    if (!t)
       return GL_FALSE;
 
+   /* The GL 4.5 Core spec doesn't say anything about buffers. In practice,
+    * the image buffer format is always compatible with the underlying
+    * buffer storage.
+    */
+   if (t->Target == GL_TEXTURE_BUFFER)
+      return GL_TRUE;
+
    if (!t->_BaseComplete && !t->_MipmapComplete)
        _mesa_test_texobj_completeness(ctx, t);
 
@@ -490,20 +497,14 @@
        u->_Layer >= _mesa_get_texture_layers(t, u->Level))
       return GL_FALSE;
 
-   if (t->Target == GL_TEXTURE_BUFFER) {
-      tex_format = _mesa_get_shader_image_format(t->BufferObjectFormat);
+   struct gl_texture_image *img = (t->Target == GL_TEXTURE_CUBE_MAP ?
+                                   t->Image[u->_Layer][u->Level] :
+                                   t->Image[0][u->Level]);
 
-   } else {
-      struct gl_texture_image *img = (t->Target == GL_TEXTURE_CUBE_MAP ?
-                                      t->Image[u->_Layer][u->Level] :
-                                      t->Image[0][u->Level]);
+   if (!img || img->Border || img->NumSamples > ctx->Const.MaxImageSamples)
+      return GL_FALSE;
 
-      if (!img || img->Border || img->NumSamples > ctx->Const.MaxImageSamples)
-         return GL_FALSE;
-
-      tex_format = _mesa_get_shader_image_format(img->InternalFormat);
-   }
-
+   tex_format = _mesa_get_shader_image_format(img->InternalFormat);
    if (!tex_format)
       return GL_FALSE;
 
@@ -529,8 +530,8 @@
 
 static GLboolean
 validate_bind_image_texture(struct gl_context *ctx, GLuint unit,
-                            GLuint texture, GLint level, GLboolean layered,
-                            GLint layer, GLenum access, GLenum format)
+                            GLuint texture, GLint level, GLint layer,
+                            GLenum access, GLenum format)
 {
    assert(ctx->Const.MaxImageUnits <= MAX_IMAGE_UNITS);
 
@@ -564,27 +565,75 @@
    return GL_TRUE;
 }
 
-void GLAPIENTRY
-_mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
-                       GLboolean layered, GLint layer, GLenum access,
-                       GLenum format)
+static void
+set_image_binding(struct gl_image_unit *u, struct gl_texture_object *texObj,
+                  GLint level, GLboolean layered, GLint layer, GLenum access,
+                  GLenum format)
 {
-   GET_CURRENT_CONTEXT(ctx);
-   struct gl_image_unit *u;
+   u->Level = level;   /* copy the requested binding into the unit */
+   u->Access = access;
+   u->Format = format;
+   u->_ActualFormat = _mesa_get_shader_image_format(format);
 
-   if (!validate_bind_image_texture(ctx, unit, texture, level,
-                                    layered, layer, access, format))
-      return;
+   if (texObj && _mesa_tex_target_is_layered(texObj->Target)) {
+      u->Layered = layered;
+      u->Layer = layer;
+   } else {   /* NULL texObj or a target without layers */
+      u->Layered = GL_FALSE;
+      u->Layer = 0;
+   }
+   u->_Layer = (u->Layered ? 0 : u->Layer);   /* derived on every path */
+
+   _mesa_reference_texobj(&u->TexObj, texObj);
+}
+
+static void
+bind_image_texture(struct gl_context *ctx, struct gl_texture_object *texObj,
+                   GLuint unit, GLint level, GLboolean layered, GLint layer,
+                   GLenum access, GLenum format)
+{
+   struct gl_image_unit *u;
 
    u = &ctx->ImageUnits[unit];
 
    FLUSH_VERTICES(ctx, 0);
    ctx->NewDriverState |= ctx->DriverFlags.NewImageUnits;
 
-   if (texture) {
-      struct gl_texture_object *t = _mesa_lookup_texture(ctx, texture);
+   set_image_binding(u, texObj, level, layered, layer, access, format);
+}
 
-      if (!t) {
+void GLAPIENTRY
+_mesa_BindImageTexture_no_error(GLuint unit, GLuint texture, GLint level,
+                                GLboolean layered, GLint layer, GLenum access,
+                                GLenum format)
+{  /* Binds directly; no argument validation is performed here. */
+   struct gl_texture_object *texObj = NULL;   /* stays NULL for texture 0 */
+
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (texture)
+      texObj = _mesa_lookup_texture(ctx, texture);   /* not NULL-checked */
+
+   bind_image_texture(ctx, texObj, unit, level, layered, layer, access, format);
+}
+
+void GLAPIENTRY
+_mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
+                       GLboolean layered, GLint layer, GLenum access,
+                       GLenum format)
+{
+   struct gl_texture_object *texObj = NULL;
+
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!validate_bind_image_texture(ctx, unit, texture, level, layer, access,
+                                    format))
+      return;
+
+   if (texture) {
+      texObj = _mesa_lookup_texture(ctx, texture);
+
+      if (!texObj) {
          _mesa_error(ctx, GL_INVALID_VALUE, "glBindImageTexture(texture)");
          return;
       }
@@ -599,58 +648,23 @@
        * recognizes that there is no way to create immutable buffer textures,
        * so those are excluded from this requirement.
        */
-      if (_mesa_is_gles(ctx) && !t->Immutable &&
-          t->Target != GL_TEXTURE_BUFFER) {
+      if (_mesa_is_gles(ctx) && !texObj->Immutable &&
+          texObj->Target != GL_TEXTURE_BUFFER) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glBindImageTexture(!immutable)");
          return;
       }
-
-      _mesa_reference_texobj(&u->TexObj, t);
-   } else {
-      _mesa_reference_texobj(&u->TexObj, NULL);
    }
 
-   u->Level = level;
-   u->Access = access;
-   u->Format = format;
-   u->_ActualFormat = _mesa_get_shader_image_format(format);
-
-   if (u->TexObj && _mesa_tex_target_is_layered(u->TexObj->Target)) {
-      u->Layered = layered;
-      u->Layer = layer;
-      u->_Layer = (u->Layered ? 0 : u->Layer);
-   } else {
-      u->Layered = GL_FALSE;
-      u->Layer = 0;
-   }
+   bind_image_texture(ctx, texObj, unit, level, layered, layer, access, format);
 }
 
-void GLAPIENTRY
-_mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures)
+static ALWAYS_INLINE void
+bind_image_textures(struct gl_context *ctx, GLuint first, GLuint count,
+                    const GLuint *textures, bool no_error)
 {
-   GET_CURRENT_CONTEXT(ctx);
    int i;
 
-   if (!ctx->Extensions.ARB_shader_image_load_store) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glBindImageTextures()");
-      return;
-   }
-
-   if (first + count > ctx->Const.MaxImageUnits) {
-      /* The ARB_multi_bind spec says:
-       *
-       *    "An INVALID_OPERATION error is generated if <first> + <count>
-       *     is greater than the number of image units supported by
-       *     the implementation."
-       */
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glBindImageTextures(first=%u + count=%d > the value of "
-                  "GL_MAX_IMAGE_UNITS=%u)",
-                  first, count, ctx->Const.MaxImageUnits);
-      return;
-   }
-
    /* Assume that at least one binding will be changed */
    FLUSH_VERTICES(ctx, 0);
    ctx->NewDriverState |= ctx->DriverFlags.NewImageUnits;
@@ -680,13 +694,13 @@
       struct gl_image_unit *u = &ctx->ImageUnits[first + i];
       const GLuint texture = textures ? textures[i] : 0;
 
-      if (texture != 0) {
-         struct gl_texture_object *texObj;
+      if (texture) {
+         struct gl_texture_object *texObj = u->TexObj;
          GLenum tex_format;
 
-         if (!u->TexObj || u->TexObj->Name != texture) {
+         if (!texObj || texObj->Name != texture) {
             texObj = _mesa_lookup_texture_locked(ctx, texture);
-            if (!texObj) {
+            if (!no_error && !texObj) {
                /* The ARB_multi_bind spec says:
                 *
                 *    "An INVALID_OPERATION error is generated if any value
@@ -699,8 +713,6 @@
                            "object)", i, texture);
                continue;
             }
-         } else {
-            texObj = u->TexObj;
          }
 
          if (texObj->Target == GL_TEXTURE_BUFFER) {
@@ -708,8 +720,8 @@
          } else {
             struct gl_texture_image *image = texObj->Image[0][0];
 
-            if (!image || image->Width == 0 || image->Height == 0 ||
-                image->Depth == 0) {
+            if (!no_error && (!image || image->Width == 0 ||
+                              image->Height == 0 || image->Depth == 0)) {
                /* The ARB_multi_bind spec says:
                 *
                 *    "An INVALID_OPERATION error is generated if the width,
@@ -726,7 +738,8 @@
             tex_format = image->InternalFormat;
          }
 
-         if (!_mesa_is_shader_image_format_supported(ctx, tex_format)) {
+         if (!no_error &&
+             !_mesa_is_shader_image_format_supported(ctx, tex_format)) {
             /* The ARB_multi_bind spec says:
              *
              *   "An INVALID_OPERATION error is generated if the internal
@@ -743,24 +756,50 @@
          }
 
          /* Update the texture binding */
-         _mesa_reference_texobj(&u->TexObj, texObj);
-         u->Level = 0;
-         u->Layered = _mesa_tex_target_is_layered(texObj->Target);
-         u->_Layer = u->Layer = 0;
-         u->Access = GL_READ_WRITE;
-         u->Format = tex_format;
-         u->_ActualFormat = _mesa_get_shader_image_format(tex_format);
+         set_image_binding(u, texObj, 0,
+                           _mesa_tex_target_is_layered(texObj->Target),
+                           0, GL_READ_WRITE, tex_format);
       } else {
          /* Unbind the texture from the unit */
-         _mesa_reference_texobj(&u->TexObj, NULL);
-         u->Level = 0;
-         u->Layered = GL_FALSE;
-         u->_Layer = u->Layer = 0;
-         u->Access = GL_READ_ONLY;
-         u->Format = GL_R8;
-         u->_ActualFormat = MESA_FORMAT_R_UNORM8;
+         set_image_binding(u, NULL, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
       }
    }
 
    _mesa_HashUnlockMutex(ctx->Shared->TexObjects);
 }
+
+void GLAPIENTRY
+_mesa_BindImageTextures_no_error(GLuint first, GLsizei count,
+                                 const GLuint *textures)
+{  /* No extension or range check is made before binding. */
+   GET_CURRENT_CONTEXT(ctx);
+
+   bind_image_textures(ctx, first, count, textures, true);   /* no_error */
+}
+
+void GLAPIENTRY
+_mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures)
+{  /* Checks extension support and the unit range, then binds. */
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!ctx->Extensions.ARB_shader_image_load_store) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "glBindImageTextures()");
+      return;
+   }
+
+   if (first + count > ctx->Const.MaxImageUnits) {
+      /* NOTE(review): count is GLsizei; confirm negative values are
+       * rejected before this unsigned sum.  The ARB_multi_bind spec says:
+       *    "An INVALID_OPERATION error is generated if <first> + <count>
+       *     is greater than the number of image units supported by
+       *     the implementation."
+       */
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glBindImageTextures(first=%u + count=%d > the value of "
+                  "GL_MAX_IMAGE_UNITS=%u)",
+                  first, count, ctx->Const.MaxImageUnits);
+      return;
+   }
+
+   bind_image_textures(ctx, first, count, textures, false);   /* validating */
+}
diff --git a/src/mesa/main/shaderimage.h b/src/mesa/main/shaderimage.h
index 99dddb7..6a9e3d6 100644
--- a/src/mesa/main/shaderimage.h
+++ b/src/mesa/main/shaderimage.h
@@ -80,11 +80,20 @@
 _mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u);
 
 void GLAPIENTRY
+_mesa_BindImageTexture_no_error(GLuint unit, GLuint texture, GLint level,
+                                GLboolean layered, GLint layer, GLenum access,
+                                GLenum format);
+
+void GLAPIENTRY
 _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
                        GLboolean layered, GLint layer, GLenum access,
                        GLenum format);
 
 void GLAPIENTRY
+_mesa_BindImageTextures_no_error(GLuint first, GLsizei count,
+                                 const GLuint *textures);
+
+void GLAPIENTRY
 _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures);
 
 #ifdef __cplusplus
diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c
index 8a5fa5e..b9d1079 100644
--- a/src/mesa/main/shaderobj.c
+++ b/src/mesa/main/shaderobj.c
@@ -30,6 +30,7 @@
  */
 
 
+#include "compiler/glsl/string_to_uint_map.h"
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/hash.h"
@@ -40,7 +41,6 @@
 #include "program/program.h"
 #include "program/prog_parameter.h"
 #include "util/ralloc.h"
-#include "util/string_to_uint_map.h"
 #include "util/u_atomic.h"
 
 /**********************************************************************/
diff --git a/src/mesa/main/shaderobj.h b/src/mesa/main/shaderobj.h
index b7dae62..97b8ce7 100644
--- a/src/mesa/main/shaderobj.h
+++ b/src/mesa/main/shaderobj.h
@@ -216,6 +216,8 @@
       return GL_TESS_CONTROL_SUBROUTINE;
    case MESA_SHADER_TESS_EVAL:
       return GL_TESS_EVALUATION_SUBROUTINE;
+   case MESA_SHADER_NONE:
+      break;
    }
    unreachable("not reached");
 }
@@ -236,6 +238,8 @@
       return GL_TESS_CONTROL_SUBROUTINE_UNIFORM;
    case MESA_SHADER_TESS_EVAL:
       return GL_TESS_EVALUATION_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_NONE:
+      break;
    }
    unreachable("not reached");
 }
diff --git a/src/mesa/main/shared.c b/src/mesa/main/shared.c
index 5344812..6926d40 100644
--- a/src/mesa/main/shared.c
+++ b/src/mesa/main/shared.c
@@ -39,6 +39,7 @@
 #include "shaderapi.h"
 #include "shaderobj.h"
 #include "syncobj.h"
+#include "texturebindless.h"
 
 #include "util/hash_table.h"
 #include "util/set.h"
@@ -84,6 +85,9 @@
    /* GL_ARB_sampler_objects */
    shared->SamplerObjects = _mesa_NewHashTable();
 
+   /* GL_ARB_bindless_texture */
+   _mesa_init_shared_handles(shared);
+
    /* Allocate the default buffer object */
    shared->NullBufferObj = ctx->Driver.NewBufferObject(ctx, 0);
 
@@ -373,6 +377,8 @@
    _mesa_HashDeleteAll(shared->TexObjects, delete_texture_cb, ctx);
    _mesa_DeleteHashTable(shared->TexObjects);
 
+   _mesa_free_shared_handles(shared);
+
    mtx_destroy(&shared->Mutex);
    mtx_destroy(&shared->TexMutex);
 
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 5a760f5..7aec98e 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -51,34 +51,12 @@
 #include "texobj.h"
 #include "texstate.h"
 #include "varray.h"
+#include "vbo/vbo_context.h"
 #include "viewport.h"
 #include "blend.h"
 
 
 /**
- * Update the following fields:
- *   ctx->VertexProgram._Enabled
- *   ctx->FragmentProgram._Enabled
- *   ctx->ATIFragmentShader._Enabled
- * This needs to be done before texture state validation.
- */
-static void
-update_program_enables(struct gl_context *ctx)
-{
-   /* These _Enabled flags indicate if the user-defined ARB/NV vertex/fragment
-    * program is enabled AND valid.  Similarly for ATI fragment shaders.
-    * GLSL shaders not relevant here.
-    */
-   ctx->VertexProgram._Enabled = ctx->VertexProgram.Enabled
-      && ctx->VertexProgram.Current->arb.Instructions;
-   ctx->FragmentProgram._Enabled = ctx->FragmentProgram.Enabled
-      && ctx->FragmentProgram.Current->arb.Instructions;
-   ctx->ATIFragmentShader._Enabled = ctx->ATIFragmentShader.Enabled
-      && ctx->ATIFragmentShader.Current->Instructions[0];
-}
-
-
-/**
  * Update the ctx->*Program._Current pointers to point to the
  * current/active programs.
  *
@@ -112,7 +90,6 @@
    const struct gl_program *prevTCP = ctx->TessCtrlProgram._Current;
    const struct gl_program *prevTEP = ctx->TessEvalProgram._Current;
    const struct gl_program *prevCP = ctx->ComputeProgram._Current;
-   GLbitfield new_state = 0x0;
 
    /*
     * Set the ctx->VertexProgram._Current and ctx->FragmentProgram._Current
@@ -133,26 +110,20 @@
 
    if (fsProg) {
       /* Use GLSL fragment shader */
-      _mesa_reference_program(ctx, &ctx->_Shader->_CurrentFragmentProgram,
-                              fsProg);
       _mesa_reference_program(ctx, &ctx->FragmentProgram._Current, fsProg);
       _mesa_reference_program(ctx, &ctx->FragmentProgram._TexEnvProgram,
                               NULL);
    }
-   else if (ctx->FragmentProgram._Enabled) {
+   else if (_mesa_arb_fragment_program_enabled(ctx)) {
       /* Use user-defined fragment program */
-      _mesa_reference_program(ctx, &ctx->_Shader->_CurrentFragmentProgram,
-                              NULL);
       _mesa_reference_program(ctx, &ctx->FragmentProgram._Current,
                               ctx->FragmentProgram.Current);
       _mesa_reference_program(ctx, &ctx->FragmentProgram._TexEnvProgram,
 			      NULL);
    }
-   else if (ctx->ATIFragmentShader._Enabled &&
+   else if (_mesa_ati_fragment_shader_enabled(ctx) &&
             ctx->ATIFragmentShader.Current->Program) {
        /* Use the enabled ATI fragment shader's associated program */
-      _mesa_reference_program(ctx, &ctx->_Shader->_CurrentFragmentProgram,
-                              NULL);
       _mesa_reference_program(ctx, &ctx->FragmentProgram._Current,
                               ctx->ATIFragmentShader.Current->Program);
       _mesa_reference_program(ctx, &ctx->FragmentProgram._TexEnvProgram,
@@ -162,8 +133,6 @@
       /* Use fragment program generated from fixed-function state */
       struct gl_shader_program *f = _mesa_get_fixed_func_fragment_program(ctx);
 
-      _mesa_reference_program(ctx, &ctx->_Shader->_CurrentFragmentProgram,
-                              f->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program);
       _mesa_reference_program(ctx, &ctx->FragmentProgram._Current,
 			      f->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program);
       _mesa_reference_program(ctx, &ctx->FragmentProgram._TexEnvProgram,
@@ -210,7 +179,7 @@
       /* Use GLSL vertex shader */
       _mesa_reference_program(ctx, &ctx->VertexProgram._Current, vsProg);
    }
-   else if (ctx->VertexProgram._Enabled) {
+   else if (_mesa_arb_vertex_program_enabled(ctx)) {
       /* Use user-defined vertex program */
       _mesa_reference_program(ctx, &ctx->VertexProgram._Current,
                               ctx->VertexProgram.Current);
@@ -243,9 +212,9 @@
        ctx->TessEvalProgram._Current != prevTEP ||
        ctx->TessCtrlProgram._Current != prevTCP ||
        ctx->ComputeProgram._Current != prevCP)
-      new_state |= _NEW_PROGRAM;
+      return _NEW_PROGRAM;
 
-   return new_state;
+   return 0;
 }
 
 
@@ -261,7 +230,12 @@
       const struct gl_program_parameter_list *params =
          ctx->FragmentProgram._Current->Parameters;
       if (params && params->StateFlags & ctx->NewState) {
-         new_state |= _NEW_PROGRAM_CONSTANTS;
+         if (ctx->DriverFlags.NewShaderConstants[MESA_SHADER_FRAGMENT]) {
+            ctx->NewDriverState |=
+               ctx->DriverFlags.NewShaderConstants[MESA_SHADER_FRAGMENT];
+         } else {
+            new_state |= _NEW_PROGRAM_CONSTANTS;
+         }
       }
    }
 
@@ -273,7 +247,12 @@
       const struct gl_program_parameter_list *params =
          ctx->VertexProgram._Current->Parameters;
       if (params && params->StateFlags & ctx->NewState) {
-         new_state |= _NEW_PROGRAM_CONSTANTS;
+         if (ctx->DriverFlags.NewShaderConstants[MESA_SHADER_VERTEX]) {
+            ctx->NewDriverState |=
+               ctx->DriverFlags.NewShaderConstants[MESA_SHADER_VERTEX];
+         } else {
+            new_state |= _NEW_PROGRAM_CONSTANTS;
+         }
       }
    }
 
@@ -281,37 +260,6 @@
 }
 
 
-
-
-/**
- * Update the ctx->Polygon._FrontBit flag.
- */
-static void
-update_frontbit(struct gl_context *ctx)
-{
-   if (ctx->Transform.ClipOrigin == GL_LOWER_LEFT)
-      ctx->Polygon._FrontBit = (ctx->Polygon.FrontFace == GL_CW);
-   else
-      ctx->Polygon._FrontBit = (ctx->Polygon.FrontFace == GL_CCW);
-}
-
-
-/**
- * Update the ctx->VertexProgram._TwoSideEnabled flag.
- */
-static void
-update_twoside(struct gl_context *ctx)
-{
-   if (ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX] ||
-       ctx->VertexProgram._Enabled) {
-      ctx->VertexProgram._TwoSideEnabled = ctx->VertexProgram.TwoSideEnabled;
-   } else {
-      ctx->VertexProgram._TwoSideEnabled = (ctx->Light.Enabled &&
-					    ctx->Light.Model.TwoSide);
-   }
-}
-
-
 /**
  * Compute derived GL state.
  * If __struct gl_contextRec::NewState is non-zero then this function \b must
@@ -328,7 +276,6 @@
 _mesa_update_state_locked( struct gl_context *ctx )
 {
    GLbitfield new_state = ctx->NewState;
-   GLbitfield prog_flags = _NEW_PROGRAM;
    GLbitfield new_prog_state = 0x0;
    const GLbitfield computed_states = ~(_NEW_CURRENT_ATTRIB | _NEW_LINE);
 
@@ -341,75 +288,72 @@
    if (MESA_VERBOSE & VERBOSE_STATE)
       _mesa_print_state("_mesa_update_state", new_state);
 
-   /* Determine which state flags effect vertex/fragment program state */
-   if (ctx->FragmentProgram._MaintainTexEnvProgram) {
-      prog_flags |= (_NEW_BUFFERS | _NEW_TEXTURE_OBJECT | _NEW_FOG |
-		     _NEW_VARYING_VP_INPUTS | _NEW_LIGHT | _NEW_POINT |
-		     _NEW_RENDERMODE | _NEW_PROGRAM | _NEW_FRAG_CLAMP |
-		     _NEW_COLOR | _NEW_TEXTURE_STATE);
-   }
-   if (ctx->VertexProgram._MaintainTnlProgram) {
-      prog_flags |= (_NEW_VARYING_VP_INPUTS | _NEW_TEXTURE_OBJECT |
-                     _NEW_TEXTURE_MATRIX | _NEW_TRANSFORM | _NEW_POINT |
-                     _NEW_FOG | _NEW_LIGHT | _NEW_TEXTURE_STATE |
-                     _MESA_NEW_NEED_EYE_COORDS);
-   }
-
-   /*
-    * Now update derived state info
-    */
-
-   if (new_state & prog_flags)
-      update_program_enables( ctx );
-
-   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION))
-      _mesa_update_modelview_project( ctx, new_state );
-
-   if (new_state & _NEW_TEXTURE_MATRIX)
-      _mesa_update_texture_matrices(ctx);
-
-   if (new_state & (_NEW_TEXTURE_OBJECT | _NEW_TEXTURE_STATE | _NEW_PROGRAM))
-      _mesa_update_texture_state(ctx);
-
-   if (new_state & _NEW_POLYGON)
-      update_frontbit( ctx );
-
    if (new_state & _NEW_BUFFERS)
       _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
 
-   if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
-      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+   /* Handle Core and Compatibility contexts separately. */
+   if (ctx->API == API_OPENGL_COMPAT ||
+       ctx->API == API_OPENGLES) {
+      GLbitfield prog_flags = _NEW_PROGRAM;
 
-   if (new_state & _NEW_LIGHT)
-      _mesa_update_lighting( ctx );
+      /* Determine which state flags affect vertex/fragment program state */
+      if (ctx->FragmentProgram._MaintainTexEnvProgram) {
+         prog_flags |= (_NEW_BUFFERS | _NEW_TEXTURE_OBJECT | _NEW_FOG |
+                        _NEW_VARYING_VP_INPUTS | _NEW_LIGHT | _NEW_POINT |
+                        _NEW_RENDERMODE | _NEW_PROGRAM | _NEW_FRAG_CLAMP |
+                        _NEW_COLOR | _NEW_TEXTURE_STATE);
+      }
+      if (ctx->VertexProgram._MaintainTnlProgram) {
+         prog_flags |= (_NEW_VARYING_VP_INPUTS | _NEW_TEXTURE_OBJECT |
+                        _NEW_TEXTURE_MATRIX | _NEW_TRANSFORM | _NEW_POINT |
+                        _NEW_FOG | _NEW_LIGHT | _NEW_TEXTURE_STATE |
+                        _MESA_NEW_NEED_EYE_COORDS);
+      }
 
-   if (new_state & (_NEW_LIGHT | _NEW_PROGRAM))
-      update_twoside( ctx );
-
-   if (new_state & (_NEW_STENCIL | _NEW_BUFFERS))
-      _mesa_update_stencil( ctx );
-
-   if (new_state & _NEW_PIXEL)
-      _mesa_update_pixel( ctx, new_state );
-
-   /* ctx->_NeedEyeCoords is now up to date.
-    *
-    * If the truth value of this variable has changed, update for the
-    * new lighting space and recompute the positions of lights and the
-    * normal transform.
-    *
-    * If the lighting space hasn't changed, may still need to recompute
-    * light positions & normal transforms for other reasons.
-    */
-   if (new_state & _MESA_NEW_NEED_EYE_COORDS) 
-      _mesa_update_tnl_spaces( ctx, new_state );
-
-   if (new_state & prog_flags) {
-      /* When we generate programs from fixed-function vertex/fragment state
-       * this call may generate/bind a new program.  If so, we need to
-       * propogate the _NEW_PROGRAM flag to the driver.
+      /*
+       * Now update derived state info
        */
-      new_prog_state |= update_program( ctx );
+      if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION))
+         _mesa_update_modelview_project( ctx, new_state );
+
+      if (new_state & _NEW_TEXTURE_MATRIX)
+         _mesa_update_texture_matrices(ctx);
+
+      if (new_state & (_NEW_TEXTURE_OBJECT | _NEW_TEXTURE_STATE | _NEW_PROGRAM))
+         _mesa_update_texture_state(ctx);
+
+      if (new_state & _NEW_LIGHT)
+         _mesa_update_lighting(ctx);
+
+      if (new_state & _NEW_PIXEL)
+         _mesa_update_pixel( ctx );
+
+      /* ctx->_NeedEyeCoords is now up to date.
+       *
+       * If the truth value of this variable has changed, update for the
+       * new lighting space and recompute the positions of lights and the
+       * normal transform.
+       *
+       * If the lighting space hasn't changed, may still need to recompute
+       * light positions & normal transforms for other reasons.
+       */
+      if (new_state & _MESA_NEW_NEED_EYE_COORDS)
+         _mesa_update_tnl_spaces( ctx, new_state );
+
+      if (new_state & prog_flags) {
+         /* When we generate programs from fixed-function vertex/fragment state
+          * this call may generate/bind a new program.  If so, we need to
+          * propagate the _NEW_PROGRAM flag to the driver.
+          */
+         new_prog_state |= update_program(ctx);
+      }
+   } else {
+      /* GL Core and GLES 2/3 contexts */
+      if (new_state & (_NEW_TEXTURE_OBJECT | _NEW_PROGRAM))
+         _mesa_update_texture_state(ctx);
+
+      if (new_state & _NEW_PROGRAM)
+         update_program(ctx);
    }
 
    if (new_state & _NEW_ARRAY)
@@ -418,18 +362,17 @@
  out:
    new_prog_state |= update_program_constants(ctx);
 
+   ctx->NewState |= new_prog_state;
+   vbo_exec_invalidate_state(ctx);
+
    /*
     * Give the driver a chance to act upon the new_state flags.
     * The driver might plug in different span functions, for example.
     * Also, this is where the driver can invalidate the state of any
     * active modules (such as swrast_setup, swrast, tnl, etc).
-    *
-    * Set ctx->NewState to zero to avoid recursion if
-    * Driver.UpdateState() has to call FLUSH_VERTICES().  (fixed?)
     */
-   new_state = ctx->NewState | new_prog_state;
+   ctx->Driver.UpdateState(ctx);
    ctx->NewState = 0;
-   ctx->Driver.UpdateState(ctx, new_state);
    ctx->Array.VAO->NewArrays = 0x0;
 }
 
@@ -473,6 +416,10 @@
 _mesa_set_varying_vp_inputs( struct gl_context *ctx,
                              GLbitfield64 varying_inputs )
 {
+   if (ctx->API != API_OPENGL_COMPAT &&
+       ctx->API != API_OPENGLES)
+      return;
+
    if (ctx->varying_vp_inputs != varying_inputs) {
       ctx->varying_vp_inputs = varying_inputs;
 
diff --git a/src/mesa/main/state.h b/src/mesa/main/state.h
index 7a6cdac..b719f39 100644
--- a/src/mesa/main/state.h
+++ b/src/mesa/main/state.h
@@ -72,4 +72,46 @@
    return GL_FALSE;
 }
 
+static inline bool
+_mesa_arb_vertex_program_enabled(const struct gl_context *ctx)
+{
+   return ctx->VertexProgram.Enabled &&
+          ctx->VertexProgram.Current->arb.Instructions;
+}
+
+/** Compute two sided lighting state for fixed function or programs. */
+static inline bool
+_mesa_vertex_program_two_side_enabled(const struct gl_context *ctx)
+{
+   if (ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX] ||
+       _mesa_arb_vertex_program_enabled(ctx))
+      return ctx->VertexProgram.TwoSideEnabled;
+
+   return ctx->Light.Enabled && ctx->Light.Model.TwoSide;
+}
+
+/** Return 0=GL_CCW or 1=GL_CW; inverted when ClipOrigin != GL_LOWER_LEFT. */
+static inline bool
+_mesa_polygon_get_front_bit(const struct gl_context *ctx)
+{
+   if (ctx->Transform.ClipOrigin == GL_LOWER_LEFT)
+      return ctx->Polygon.FrontFace == GL_CW;
+
+   return ctx->Polygon.FrontFace == GL_CCW;
+}
+
+static inline bool
+_mesa_arb_fragment_program_enabled(const struct gl_context *ctx)
+{
+   return ctx->FragmentProgram.Enabled &&
+          ctx->FragmentProgram.Current->arb.Instructions;
+}
+
+static inline bool
+_mesa_ati_fragment_shader_enabled(const struct gl_context *ctx)
+{
+   return ctx->ATIFragmentShader.Enabled &&
+          ctx->ATIFragmentShader.Current->Instructions[0];
+}
+
 #endif
diff --git a/src/mesa/main/stencil.c b/src/mesa/main/stencil.c
index b303bb7..d89312c 100644
--- a/src/mesa/main/stencil.c
+++ b/src/mesa/main/stencil.c
@@ -157,7 +157,8 @@
        ctx->Stencil.Ref[0] == ref &&
        ctx->Stencil.Ref[1] == ref)
       return;
-   FLUSH_VERTICES(ctx, _NEW_STENCIL);
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+   ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
    ctx->Stencil.Function[0]  = frontfunc;
    ctx->Stencil.Function[1]  = backfunc;
    ctx->Stencil.Ref[0]       = ctx->Stencil.Ref[1]       = ref;
@@ -184,26 +185,18 @@
  * __struct gl_contextRec::Stencil. On change flushes the vertices and notifies
  * the driver via the dd_function_table::StencilFunc callback.
  */
-void GLAPIENTRY
-_mesa_StencilFunc( GLenum func, GLint ref, GLuint mask )
+static void
+stencil_func(struct gl_context *ctx, GLenum func, GLint ref, GLuint mask)
 {
-   GET_CURRENT_CONTEXT(ctx);
    const GLint face = ctx->Stencil.ActiveFace;
 
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glStencilFunc()\n");
-
-   if (!validate_stencil_func(ctx, func)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glStencilFunc(func)");
-      return;
-   }
-
    if (face != 0) {
       if (ctx->Stencil.Function[face] == func &&
           ctx->Stencil.ValueMask[face] == mask &&
           ctx->Stencil.Ref[face] == ref)
          return;
-      FLUSH_VERTICES(ctx, _NEW_STENCIL);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+      ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
       ctx->Stencil.Function[face] = func;
       ctx->Stencil.Ref[face] = ref;
       ctx->Stencil.ValueMask[face] = mask;
@@ -224,7 +217,8 @@
           ctx->Stencil.Ref[0] == ref &&
           ctx->Stencil.Ref[1] == ref)
          return;
-      FLUSH_VERTICES(ctx, _NEW_STENCIL);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+      ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
       ctx->Stencil.Function[0]  = ctx->Stencil.Function[1]  = func;
       ctx->Stencil.Ref[0]       = ctx->Stencil.Ref[1]       = ref;
       ctx->Stencil.ValueMask[0] = ctx->Stencil.ValueMask[1] = mask;
@@ -238,6 +232,31 @@
 }
 
 
+void GLAPIENTRY
+_mesa_StencilFunc_no_error(GLenum func, GLint ref, GLuint mask)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   stencil_func(ctx, func, ref, mask);
+}
+
+
+void GLAPIENTRY
+_mesa_StencilFunc(GLenum func, GLint ref, GLuint mask)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glStencilFunc()\n");
+
+   if (!validate_stencil_func(ctx, func)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glStencilFunc(func)");
+      return;
+   }
+
+   stencil_func(ctx, func, ref, mask);
+}
+
+
 /**
  * Set the stencil writing mask.
  *
@@ -263,7 +282,8 @@
        */
       if (ctx->Stencil.WriteMask[face] == mask)
          return;
-      FLUSH_VERTICES(ctx, _NEW_STENCIL);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+      ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
       ctx->Stencil.WriteMask[face] = mask;
 
       /* Only propagate the change to the driver if EXT_stencil_two_side
@@ -278,7 +298,8 @@
       if (ctx->Stencil.WriteMask[0] == mask &&
           ctx->Stencil.WriteMask[1] == mask)
          return;
-      FLUSH_VERTICES(ctx, _NEW_STENCIL);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+      ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
       ctx->Stencil.WriteMask[0] = ctx->Stencil.WriteMask[1] = mask;
       if (ctx->Driver.StencilMaskSeparate) {
          ctx->Driver.StencilMaskSeparate(ctx,
@@ -304,35 +325,19 @@
  * __struct gl_contextRec::Stencil. On change flushes the vertices and notifies
  * the driver via the dd_function_table::StencilOp callback.
  */
-void GLAPIENTRY
-_mesa_StencilOp(GLenum fail, GLenum zfail, GLenum zpass)
+static void
+stencil_op(struct gl_context *ctx, GLenum fail, GLenum zfail, GLenum zpass)
 {
-   GET_CURRENT_CONTEXT(ctx);
    const GLint face = ctx->Stencil.ActiveFace;
 
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glStencilOp()\n");
-
-   if (!validate_stencil_op(ctx, fail)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOp(sfail)");
-      return;
-   }
-   if (!validate_stencil_op(ctx, zfail)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOp(zfail)");
-      return;
-   }
-   if (!validate_stencil_op(ctx, zpass)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOp(zpass)");
-      return;
-   }
-
    if (face != 0) {
       /* only set active face state */
       if (ctx->Stencil.ZFailFunc[face] == zfail &&
           ctx->Stencil.ZPassFunc[face] == zpass &&
           ctx->Stencil.FailFunc[face] == fail)
          return;
-      FLUSH_VERTICES(ctx, _NEW_STENCIL);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+      ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
       ctx->Stencil.ZFailFunc[face] = zfail;
       ctx->Stencil.ZPassFunc[face] = zpass;
       ctx->Stencil.FailFunc[face] = fail;
@@ -353,7 +358,8 @@
           ctx->Stencil.FailFunc[0] == fail &&
           ctx->Stencil.FailFunc[1] == fail)
          return;
-      FLUSH_VERTICES(ctx, _NEW_STENCIL);
+      FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+      ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
       ctx->Stencil.ZFailFunc[0] = ctx->Stencil.ZFailFunc[1] = zfail;
       ctx->Stencil.ZPassFunc[0] = ctx->Stencil.ZPassFunc[1] = zpass;
       ctx->Stencil.FailFunc[0]  = ctx->Stencil.FailFunc[1]  = fail;
@@ -367,6 +373,40 @@
 }
 
 
+void GLAPIENTRY
+_mesa_StencilOp_no_error(GLenum fail, GLenum zfail, GLenum zpass)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   stencil_op(ctx, fail, zfail, zpass);
+}
+
+
+void GLAPIENTRY
+_mesa_StencilOp(GLenum fail, GLenum zfail, GLenum zpass)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glStencilOp()\n");
+
+   if (!validate_stencil_op(ctx, fail)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOp(sfail)");
+      return;
+   }
+
+   if (!validate_stencil_op(ctx, zfail)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOp(zfail)");
+      return;
+   }
+
+   if (!validate_stencil_op(ctx, zpass)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOp(zpass)");
+      return;
+   }
+
+   stencil_op(ctx, fail, zfail, zpass);
+}
+
 
 /* GL_EXT_stencil_two_side */
 void GLAPIENTRY
@@ -391,11 +431,58 @@
 }
 
 
+static void
+stencil_op_separate(struct gl_context *ctx, GLenum face, GLenum sfail,
+                    GLenum zfail, GLenum zpass)
+{
+   GLboolean set = GL_FALSE;
+
+   if (face != GL_BACK) {
+      /* set front */
+      if (ctx->Stencil.ZFailFunc[0] != zfail ||
+          ctx->Stencil.ZPassFunc[0] != zpass ||
+          ctx->Stencil.FailFunc[0] != sfail){
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+         ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
+         ctx->Stencil.ZFailFunc[0] = zfail;
+         ctx->Stencil.ZPassFunc[0] = zpass;
+         ctx->Stencil.FailFunc[0] = sfail;
+         set = GL_TRUE;
+      }
+   }
+
+   if (face != GL_FRONT) {
+      /* set back */
+      if (ctx->Stencil.ZFailFunc[1] != zfail ||
+          ctx->Stencil.ZPassFunc[1] != zpass ||
+          ctx->Stencil.FailFunc[1] != sfail) {
+         FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+         ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
+         ctx->Stencil.ZFailFunc[1] = zfail;
+         ctx->Stencil.ZPassFunc[1] = zpass;
+         ctx->Stencil.FailFunc[1] = sfail;
+         set = GL_TRUE;
+      }
+   }
+
+   if (set && ctx->Driver.StencilOpSeparate) {
+      ctx->Driver.StencilOpSeparate(ctx, face, sfail, zfail, zpass);
+   }
+}
+
+
+void GLAPIENTRY
+_mesa_StencilOpSeparate_no_error(GLenum face, GLenum sfail, GLenum zfail,
+                                 GLenum zpass)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   stencil_op_separate(ctx, face, sfail, zfail, zpass);
+}
+
 
 void GLAPIENTRY
 _mesa_StencilOpSeparate(GLenum face, GLenum sfail, GLenum zfail, GLenum zpass)
 {
-   GLboolean set = GL_FALSE;
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_API)
@@ -405,51 +492,64 @@
       _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOpSeparate(sfail)");
       return;
    }
+
    if (!validate_stencil_op(ctx, zfail)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOpSeparate(zfail)");
       return;
    }
+
    if (!validate_stencil_op(ctx, zpass)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOpSeparate(zpass)");
       return;
    }
+
    if (face != GL_FRONT && face != GL_BACK && face != GL_FRONT_AND_BACK) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glStencilOpSeparate(face)");
       return;
    }
 
+   stencil_op_separate(ctx, face, sfail, zfail, zpass);
+}
+
+
+static void
+stencil_func_separate(struct gl_context *ctx, GLenum face, GLenum func,
+                      GLint ref, GLuint mask)
+{
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+   ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
+
    if (face != GL_BACK) {
       /* set front */
-      if (ctx->Stencil.ZFailFunc[0] != zfail ||
-          ctx->Stencil.ZPassFunc[0] != zpass ||
-          ctx->Stencil.FailFunc[0] != sfail){
-         FLUSH_VERTICES(ctx, _NEW_STENCIL);
-         ctx->Stencil.ZFailFunc[0] = zfail;
-         ctx->Stencil.ZPassFunc[0] = zpass;
-         ctx->Stencil.FailFunc[0] = sfail;
-         set = GL_TRUE;
-      }
+      ctx->Stencil.Function[0] = func;
+      ctx->Stencil.Ref[0] = ref;
+      ctx->Stencil.ValueMask[0] = mask;
    }
+
    if (face != GL_FRONT) {
       /* set back */
-      if (ctx->Stencil.ZFailFunc[1] != zfail ||
-          ctx->Stencil.ZPassFunc[1] != zpass ||
-          ctx->Stencil.FailFunc[1] != sfail) {
-         FLUSH_VERTICES(ctx, _NEW_STENCIL);
-         ctx->Stencil.ZFailFunc[1] = zfail;
-         ctx->Stencil.ZPassFunc[1] = zpass;
-         ctx->Stencil.FailFunc[1] = sfail;
-         set = GL_TRUE;
-      }
+      ctx->Stencil.Function[1] = func;
+      ctx->Stencil.Ref[1] = ref;
+      ctx->Stencil.ValueMask[1] = mask;
    }
-   if (set && ctx->Driver.StencilOpSeparate) {
-      ctx->Driver.StencilOpSeparate(ctx, face, sfail, zfail, zpass);
+
+   if (ctx->Driver.StencilFuncSeparate) {
+      ctx->Driver.StencilFuncSeparate(ctx, face, func, ref, mask);
    }
 }
 
 
 /* OpenGL 2.0 */
 void GLAPIENTRY
+_mesa_StencilFuncSeparate_no_error(GLenum face, GLenum func, GLint ref,
+                                   GLuint mask)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   stencil_func_separate(ctx, face, func, ref, mask);
+}
+
+
+void GLAPIENTRY
 _mesa_StencilFuncSeparate(GLenum face, GLenum func, GLint ref, GLuint mask)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -461,33 +561,46 @@
       _mesa_error(ctx, GL_INVALID_ENUM, "glStencilFuncSeparate(face)");
       return;
    }
+
    if (!validate_stencil_func(ctx, func)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glStencilFuncSeparate(func)");
       return;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_STENCIL);
+   stencil_func_separate(ctx, face, func, ref, mask);
+}
+
+
+static void
+stencil_mask_separate(struct gl_context *ctx, GLenum face, GLuint mask)
+{
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewStencil ? 0 : _NEW_STENCIL);
+   ctx->NewDriverState |= ctx->DriverFlags.NewStencil;
 
    if (face != GL_BACK) {
-      /* set front */
-      ctx->Stencil.Function[0] = func;
-      ctx->Stencil.Ref[0] = ref;
-      ctx->Stencil.ValueMask[0] = mask;
+      ctx->Stencil.WriteMask[0] = mask;
    }
+
    if (face != GL_FRONT) {
-      /* set back */
-      ctx->Stencil.Function[1] = func;
-      ctx->Stencil.Ref[1] = ref;
-      ctx->Stencil.ValueMask[1] = mask;
+      ctx->Stencil.WriteMask[1] = mask;
    }
-   if (ctx->Driver.StencilFuncSeparate) {
-      ctx->Driver.StencilFuncSeparate(ctx, face, func, ref, mask);
+
+   if (ctx->Driver.StencilMaskSeparate) {
+      ctx->Driver.StencilMaskSeparate(ctx, face, mask);
    }
 }
 
 
 /* OpenGL 2.0 */
 void GLAPIENTRY
+_mesa_StencilMaskSeparate_no_error(GLenum face, GLuint mask)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   stencil_mask_separate(ctx, face, mask);
+}
+
+
+void GLAPIENTRY
 _mesa_StencilMaskSeparate(GLenum face, GLuint mask)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -500,45 +613,7 @@
       return;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_STENCIL);
-
-   if (face != GL_BACK) {
-      ctx->Stencil.WriteMask[0] = mask;
-   }
-   if (face != GL_FRONT) {
-      ctx->Stencil.WriteMask[1] = mask;
-   }
-   if (ctx->Driver.StencilMaskSeparate) {
-      ctx->Driver.StencilMaskSeparate(ctx, face, mask);
-   }
-}
-
-
-/**
- * Update derived stencil state.
- */
-void
-_mesa_update_stencil(struct gl_context *ctx)
-{
-   const GLint face = ctx->Stencil._BackFace;
-
-   ctx->Stencil._Enabled = (ctx->Stencil.Enabled &&
-                            ctx->DrawBuffer->Visual.stencilBits > 0);
-
-    ctx->Stencil._TestTwoSide =
-       ctx->Stencil._Enabled &&
-       (ctx->Stencil.Function[0] != ctx->Stencil.Function[face] ||
-	ctx->Stencil.FailFunc[0] != ctx->Stencil.FailFunc[face] ||
-	ctx->Stencil.ZPassFunc[0] != ctx->Stencil.ZPassFunc[face] ||
-	ctx->Stencil.ZFailFunc[0] != ctx->Stencil.ZFailFunc[face] ||
-	ctx->Stencil.Ref[0] != ctx->Stencil.Ref[face] ||
-	ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[face] ||
-	ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[face]);
-
-   ctx->Stencil._WriteEnabled =
-      ctx->Stencil._Enabled &&
-      (ctx->Stencil.WriteMask[0] != 0 ||
-       (ctx->Stencil._TestTwoSide && ctx->Stencil.WriteMask[face] != 0));
+   stencil_mask_separate(ctx, face, mask);
 }
 
 
diff --git a/src/mesa/main/stencil.h b/src/mesa/main/stencil.h
index 3302cb9..dc371ec 100644
--- a/src/mesa/main/stencil.h
+++ b/src/mesa/main/stencil.h
@@ -41,26 +41,38 @@
 _mesa_ClearStencil( GLint s );
 
 
+void GLAPIENTRY
+_mesa_StencilFunc_no_error(GLenum func, GLint ref, GLuint mask);
+
 extern void GLAPIENTRY
-_mesa_StencilFunc( GLenum func, GLint ref, GLuint mask );
+_mesa_StencilFunc(GLenum func, GLint ref, GLuint mask);
 
 
 extern void GLAPIENTRY
 _mesa_StencilMask( GLuint mask );
 
+void GLAPIENTRY
+_mesa_StencilOp_no_error(GLenum fail, GLenum zfail, GLenum zpass);
 
 extern void GLAPIENTRY
-_mesa_StencilOp( GLenum fail, GLenum zfail, GLenum zpass );
+_mesa_StencilOp(GLenum fail, GLenum zfail, GLenum zpass);
 
 
 extern void GLAPIENTRY
 _mesa_ActiveStencilFaceEXT(GLenum face);
 
+void GLAPIENTRY
+_mesa_StencilOpSeparate_no_error(GLenum face, GLenum fail, GLenum zfail,
+                                 GLenum zpass);
 
 extern void GLAPIENTRY
 _mesa_StencilOpSeparate(GLenum face, GLenum fail, GLenum zfail, GLenum zpass);
 
 
+void GLAPIENTRY
+_mesa_StencilFuncSeparate_no_error(GLenum face, GLenum func, GLint ref,
+                                   GLuint mask);
+
 extern void GLAPIENTRY
 _mesa_StencilFuncSeparate(GLenum face, GLenum func, GLint ref, GLuint mask);
 
@@ -68,14 +80,12 @@
 extern void GLAPIENTRY
 _mesa_StencilFuncSeparateATI(GLenum frontfunc, GLenum backfunc, GLint ref, GLuint mask);
 
+void GLAPIENTRY
+_mesa_StencilMaskSeparate_no_error(GLenum face, GLuint mask);
+
 extern void GLAPIENTRY
 _mesa_StencilMaskSeparate(GLenum face, GLuint mask);
 
-
-extern void
-_mesa_update_stencil(struct gl_context *ctx);
-
-
 extern void 
 _mesa_init_stencil( struct gl_context * ctx );
 
@@ -93,4 +103,35 @@
    return CLAMP(ref, 0, stencilMax);
 }
 
+static inline bool
+_mesa_stencil_is_enabled(const struct gl_context *ctx)
+{
+   return ctx->Stencil.Enabled &&
+          ctx->DrawBuffer->Visual.stencilBits > 0;
+}
+
+static inline bool
+_mesa_stencil_is_two_sided(const struct gl_context *ctx)
+{
+   const int face = ctx->Stencil._BackFace;
+
+   return _mesa_stencil_is_enabled(ctx) &&
+          (ctx->Stencil.Function[0] != ctx->Stencil.Function[face] ||
+           ctx->Stencil.FailFunc[0] != ctx->Stencil.FailFunc[face] ||
+           ctx->Stencil.ZPassFunc[0] != ctx->Stencil.ZPassFunc[face] ||
+           ctx->Stencil.ZFailFunc[0] != ctx->Stencil.ZFailFunc[face] ||
+           ctx->Stencil.Ref[0] != ctx->Stencil.Ref[face] ||
+           ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[face] ||
+           ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[face]);
+}
+
+static inline bool
+_mesa_stencil_is_write_enabled(const struct gl_context *ctx, bool is_two_sided)
+{
+   return _mesa_stencil_is_enabled(ctx) &&
+          (ctx->Stencil.WriteMask[0] != 0 ||
+           (is_two_sided &&
+            ctx->Stencil.WriteMask[ctx->Stencil._BackFace] != 0));
+}
+
 #endif
diff --git a/src/mesa/main/syncobj.c b/src/mesa/main/syncobj.c
index a3124e4..9c16531 100644
--- a/src/mesa/main/syncobj.c
+++ b/src/mesa/main/syncobj.c
@@ -258,24 +258,10 @@
 }
 
 
-GLsync GLAPIENTRY
-_mesa_FenceSync(GLenum condition, GLbitfield flags)
+static GLsync
+fence_sync(struct gl_context *ctx, GLenum condition, GLbitfield flags)
 {
-   GET_CURRENT_CONTEXT(ctx);
    struct gl_sync_object *syncObj;
-   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
-
-   if (condition != GL_SYNC_GPU_COMMANDS_COMPLETE) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glFenceSync(condition=0x%x)",
-		  condition);
-      return 0;
-   }
-
-   if (flags != 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glFenceSync(flags=0x%x)",
-		  condition);
-      return 0;
-   }
 
    syncObj = ctx->Driver.NewSyncObject(ctx, GL_SYNC_FENCE);
    if (syncObj != NULL) {
@@ -298,31 +284,48 @@
       _mesa_set_add(ctx->Shared->SyncObjects, syncObj);
       mtx_unlock(&ctx->Shared->Mutex);
 
-      return (GLsync) syncObj;
+      return (GLsync)syncObj;
    }
 
    return NULL;
 }
 
 
-GLenum GLAPIENTRY
-_mesa_ClientWaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
+GLsync GLAPIENTRY
+_mesa_FenceSync_no_error(GLenum condition, GLbitfield flags)
 {
    GET_CURRENT_CONTEXT(ctx);
-   struct gl_sync_object *syncObj;
+   return fence_sync(ctx, condition, flags);
+}
+
+
+/** glFenceSync entry point with argument validation; returns 0 on error. */
+GLsync GLAPIENTRY
+_mesa_FenceSync(GLenum condition, GLbitfield flags)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
+
+   if (condition != GL_SYNC_GPU_COMMANDS_COMPLETE) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glFenceSync(condition=0x%x)",
+		  condition);
+      return 0;
+   }
+
+   if (flags != 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glFenceSync(flags=0x%x)", flags);
+      return 0;
+   }
+
+   return fence_sync(ctx, condition, flags);
+}
+
+
+static GLenum
+client_wait_sync(struct gl_context *ctx, struct gl_sync_object *syncObj,
+                 GLbitfield flags, GLuint64 timeout)
+{
    GLenum ret;
-   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_WAIT_FAILED);
-
-   if ((flags & ~GL_SYNC_FLUSH_COMMANDS_BIT) != 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync(flags=0x%x)", flags);
-      return GL_WAIT_FAILED;
-   }
-
-   syncObj = _mesa_get_and_ref_sync(ctx, sync, true);
-   if (!syncObj) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync (not a valid sync object)");
-      return GL_WAIT_FAILED;
-   }
 
    /* From the GL_ARB_sync spec:
     *
@@ -349,6 +352,39 @@
 }
 
 
+GLenum GLAPIENTRY
+_mesa_ClientWaitSync_no_error(GLsync sync, GLbitfield flags, GLuint64 timeout)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   /* No-error variant: flags and the looked-up object are not validated. */
+   struct gl_sync_object *syncObj = _mesa_get_and_ref_sync(ctx, sync, true);
+   return client_wait_sync(ctx, syncObj, flags, timeout);
+}
+
+/* Validating glClientWaitSync: checks flags and the sync handle first. */
+GLenum GLAPIENTRY
+_mesa_ClientWaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_sync_object *syncObj;
+
+   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_WAIT_FAILED);
+
+   if ((flags & ~GL_SYNC_FLUSH_COMMANDS_BIT) != 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync(flags=0x%x)", flags);
+      return GL_WAIT_FAILED;
+   }
+
+   syncObj = _mesa_get_and_ref_sync(ctx, sync, true);
+   if (!syncObj) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync (not a valid sync object)");
+      return GL_WAIT_FAILED;
+   }
+
+   return client_wait_sync(ctx, syncObj, flags, timeout);
+}
+
+
 void GLAPIENTRY
 _mesa_WaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
 {
diff --git a/src/mesa/main/syncobj.h b/src/mesa/main/syncobj.h
index ea4a712..2b50d06 100644
--- a/src/mesa/main/syncobj.h
+++ b/src/mesa/main/syncobj.h
@@ -60,9 +60,15 @@
 extern void GLAPIENTRY
 _mesa_DeleteSync(GLsync sync);
 
+GLsync GLAPIENTRY
+_mesa_FenceSync_no_error(GLenum condition, GLbitfield flags);
+
 extern GLsync GLAPIENTRY
 _mesa_FenceSync(GLenum condition, GLbitfield flags);
 
+GLenum GLAPIENTRY
+_mesa_ClientWaitSync_no_error(GLsync sync, GLbitfield flags, GLuint64 timeout);
+
 extern GLenum GLAPIENTRY
 _mesa_ClientWaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout);
 
diff --git a/src/mesa/main/tests/Makefile.am b/src/mesa/main/tests/Makefile.am
index 8b4598d..47fce8a 100644
--- a/src/mesa/main/tests/Makefile.am
+++ b/src/mesa/main/tests/Makefile.am
@@ -23,8 +23,6 @@
 	$(CLOCK_LIB)
 
 if HAVE_SHARED_GLAPI
-AM_CPPFLAGS += -DHAVE_SHARED_GLAPI
-
 main_test_SOURCES +=			\
 	dispatch_sanity.cpp		\
 	mesa_formats.cpp			\
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 12a9ee7..724c22e 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -965,12 +965,28 @@
    { "glBufferPageCommitmentARB", 43, -1 },
    { "glNamedBufferPageCommitmentARB", 43, -1 },
 
+   /* GL_ARB_bindless_texture */
+   { "glGetTextureHandleARB", 40, -1 },
+   { "glGetTextureSamplerHandleARB", 40, -1 },
+   { "glMakeTextureHandleResidentARB", 40, -1 },
+   { "glMakeTextureHandleNonResidentARB", 40, -1 },
+   { "glIsTextureHandleResidentARB", 40, -1 },
+   { "glGetImageHandleARB", 40, -1 },
+   { "glMakeImageHandleResidentARB", 40, -1 },
+   { "glMakeImageHandleNonResidentARB", 40, -1 },
+   { "glIsImageHandleResidentARB", 40, -1 },
+   { "glUniformHandleui64ARB", 40, -1 },
+   { "glUniformHandleui64vARB", 40, -1 },
+   { "glProgramUniformHandleui64ARB", 40, -1 },
+   { "glProgramUniformHandleui64vARB", 40, -1 },
+   { "glVertexAttribL1ui64ARB", 40, -1 },
+   { "glVertexAttribL1ui64vARB", 40, -1 },
+   { "glGetVertexAttribLui64vARB", 40, -1 },
+
    { NULL, 0, -1 }
 };
 
 const struct function gl_compatibility_functions_possible[] = {
-   { "glBindVertexArrayAPPLE", 10, -1 },
-   { "glGenVertexArraysAPPLE", 10, -1 },
    { "glBindRenderbufferEXT", 10, -1 },
    { "glBindFramebufferEXT", 10, -1 },
    { "glNewList", 10, _gloffset_NewList },
diff --git a/src/mesa/main/texenv.c b/src/mesa/main/texenv.c
index ee5171c..2fcaf7c 100644
--- a/src/mesa/main/texenv.c
+++ b/src/mesa/main/texenv.c
@@ -444,7 +444,7 @@
       if (pname == GL_TEXTURE_LOD_BIAS_EXT) {
 	 if (texUnit->LodBias == param[0])
 	    return;
-	 FLUSH_VERTICES(ctx, _NEW_TEXTURE_STATE);
+	 FLUSH_VERTICES(ctx, _NEW_TEXTURE_OBJECT);
          texUnit->LodBias = param[0];
       }
       else {
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 658b0e5..715bc24 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -1458,13 +1458,10 @@
 {
    GET_CURRENT_CONTEXT(ctx);
    static const char *caller = "glGetTextureSubImage";
-   struct gl_texture_object *texObj = NULL;
-
-   if (texture > 0)
-      texObj = _mesa_lookup_texture(ctx, texture);
+   struct gl_texture_object *texObj =
+      _mesa_lookup_texture_err(ctx, texture, caller);
 
    if (!texObj) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s(texture)", caller);
       return;
    }
 
@@ -1778,11 +1775,8 @@
    static const char *caller = "glGetCompressedTextureImage";
    struct gl_texture_object *texObj = NULL;
 
-   if (texture > 0)
-      texObj = _mesa_lookup_texture(ctx, texture);
-
+   texObj = _mesa_lookup_texture_err(ctx, texture, caller);
    if (!texObj) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s(texture)", caller);
       return;
    }
 
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 1a00d25..5509d80 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -813,9 +813,6 @@
    img->Width2 = width - 2 * border;   /* == 1 << img->WidthLog2; */
    img->WidthLog2 = _mesa_logbase2(img->Width2);
 
-   img->NumSamples = 0;
-   img->FixedSampleLocations = GL_TRUE;
-
    switch(target) {
    case GL_TEXTURE_1D:
    case GL_TEXTURE_BUFFER:
@@ -1578,7 +1575,7 @@
 
 /**
  * Helper function to determine if a texture object is mutable (in terms
- * of GL_ARB_texture_storage).
+ * of GL_ARB_texture_storage/GL_ARB_bindless_texture).
  */
 static GLboolean
 mutable_tex_object(struct gl_context *ctx, GLenum target)
@@ -1587,6 +1584,17 @@
    if (!texObj)
       return GL_FALSE;
 
+   if (texObj->HandleAllocated) {
+      /* The ARB_bindless_texture spec says:
+       *
+       * "The error INVALID_OPERATION is generated by TexImage*, CopyTexImage*,
+       *  CompressedTexImage*, TexBuffer*, TexParameter*, as well as other
+       *  functions defined in terms of these, if the texture object to be
+       *  modified is referenced by one or more texture or image handles."
+       */
+      return GL_FALSE;
+   }
+
    return !texObj->Immutable;
 }
 
@@ -2493,6 +2501,8 @@
                             GLint xoffset, GLint yoffset, GLint zoffset,
                             GLint width, GLint height, const char *caller)
 {
+   assert(texObj);
+
    struct gl_texture_image *texImage;
 
    /* Check that the source buffer is complete */
@@ -2519,12 +2529,6 @@
       return GL_TRUE;
    }
 
-   /* Get dest image pointers */
-   if (!texObj) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s()", caller);
-      return GL_TRUE;
-   }
-
    texImage = _mesa_select_tex_image(texObj, target, level);
    if (!texImage) {
       /* destination image does not exist */
@@ -2854,19 +2858,19 @@
  * \param type  the user's image type (only used if !compressed)
  * \param imageSize  only used for glCompressedTexImage1D/2D/3D calls.
  */
-static void
+static ALWAYS_INLINE void
 teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims,
          GLenum target, GLint level, GLint internalFormat,
          GLsizei width, GLsizei height, GLsizei depth,
          GLint border, GLenum format, GLenum type,
-         GLsizei imageSize, const GLvoid *pixels)
+         GLsizei imageSize, const GLvoid *pixels, bool no_error)
 {
    const char *func = compressed ? "glCompressedTexImage" : "glTexImage";
    struct gl_pixelstore_attrib unpack_no_border;
    const struct gl_pixelstore_attrib *unpack = &ctx->Unpack;
    struct gl_texture_object *texObj;
    mesa_format texFormat;
-   GLboolean dimensionsOK, sizeOK;
+   bool dimensionsOK = true, sizeOK = true;
 
    FLUSH_VERTICES(ctx, 0);
 
@@ -2891,26 +2895,27 @@
 
    internalFormat = override_internal_format(internalFormat, width, height);
 
-   /* target error checking */
-   if (!legal_teximage_target(ctx, dims, target)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "%s%uD(target=%s)",
-                  func, dims, _mesa_enum_to_string(target));
-      return;
-   }
+   if (!no_error) {
+      /* target error checking */
+      if (!legal_teximage_target(ctx, dims, target)) {
+         _mesa_error(ctx, GL_INVALID_ENUM, "%s%uD(target=%s)",
+                     func, dims, _mesa_enum_to_string(target));
+         return;
+      }
 
-   /* general error checking */
-   if (compressed) {
-      if (compressed_texture_error_check(ctx, dims, target, level,
-                                         internalFormat,
-                                         width, height, depth,
-                                         border, imageSize, pixels))
-         return;
-   }
-   else {
-      if (texture_error_check(ctx, dims, target, level, internalFormat,
-                              format, type, width, height, depth, border,
-                              pixels))
-         return;
+      /* general error checking */
+      if (compressed) {
+         if (compressed_texture_error_check(ctx, dims, target, level,
+                                            internalFormat,
+                                            width, height, depth,
+                                            border, imageSize, pixels))
+            return;
+      } else {
+         if (texture_error_check(ctx, dims, target, level, internalFormat,
+                                 format, type, width, height, depth, border,
+                                 pixels))
+            return;
+      }
    }
 
    /* Here we convert a cpal compressed image into a regular glTexImage2D
@@ -2965,14 +2970,16 @@
 
    assert(texFormat != MESA_FORMAT_NONE);
 
-   /* check that width, height, depth are legal for the mipmap level */
-   dimensionsOK = _mesa_legal_texture_dimensions(ctx, target, level, width,
-                                                 height, depth, border);
+   if (!no_error) {
+      /* check that width, height, depth are legal for the mipmap level */
+      dimensionsOK = _mesa_legal_texture_dimensions(ctx, target, level, width,
+                                                    height, depth, border);
 
-   /* check that the texture won't take too much memory, etc */
-   sizeOK = ctx->Driver.TestProxyTexImage(ctx, proxy_target(target),
-                                          0, level, texFormat, 1,
-                                          width, height, depth);
+      /* check that the texture won't take too much memory, etc */
+      sizeOK = ctx->Driver.TestProxyTexImage(ctx, proxy_target(target),
+                                             0, level, texFormat, 1,
+                                             width, height, depth);
+   }
 
    if (_mesa_is_proxy_texture(target)) {
       /* Proxy texture: just clear or set state depending on error checking */
@@ -2997,8 +3004,8 @@
 
       if (!dimensionsOK) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-                     "%s%uD(invalid width or height or depth)",
-                     func, dims);
+                     "%s%uD(invalid width=%d or height=%d or depth=%d)",
+                     func, dims, width, height, depth);
          return;
       }
 
@@ -3061,6 +3068,31 @@
    }
 }
 
+/* This is a wrapper around teximage() so that we can force the KHR_no_error
+ * logic to be inlined without inlining the function into all the callers.
+ */
+static void
+teximage_err(struct gl_context *ctx, GLboolean compressed, GLuint dims,
+             GLenum target, GLint level, GLint internalFormat,
+             GLsizei width, GLsizei height, GLsizei depth,
+             GLint border, GLenum format, GLenum type,
+             GLsizei imageSize, const GLvoid *pixels)
+{
+   teximage(ctx, compressed, dims, target, level, internalFormat, width, height,
+            depth, border, format, type, imageSize, pixels, false);
+}
+
+
+static void
+teximage_no_error(struct gl_context *ctx, GLboolean compressed, GLuint dims,
+                  GLenum target, GLint level, GLint internalFormat,
+                  GLsizei width, GLsizei height, GLsizei depth,
+                  GLint border, GLenum format, GLenum type,
+                  GLsizei imageSize, const GLvoid *pixels)
+{
+   teximage(ctx, compressed, dims, target, level, internalFormat, width, height,
+            depth, border, format, type, imageSize, pixels, true);
+}
 
 
 /*
@@ -3072,8 +3104,8 @@
                   GLenum type, const GLvoid *pixels )
 {
    GET_CURRENT_CONTEXT(ctx);
-   teximage(ctx, GL_FALSE, 1, target, level, internalFormat, width, 1, 1,
-            border, format, type, 0, pixels);
+   teximage_err(ctx, GL_FALSE, 1, target, level, internalFormat, width, 1, 1,
+                border, format, type, 0, pixels);
 }
 
 
@@ -3084,8 +3116,8 @@
                   const GLvoid *pixels )
 {
    GET_CURRENT_CONTEXT(ctx);
-   teximage(ctx, GL_FALSE, 2, target, level, internalFormat, width, height, 1,
-            border, format, type, 0, pixels);
+   teximage_err(ctx, GL_FALSE, 2, target, level, internalFormat, width, height, 1,
+                border, format, type, 0, pixels);
 }
 
 
@@ -3100,9 +3132,8 @@
                   const GLvoid *pixels )
 {
    GET_CURRENT_CONTEXT(ctx);
-   teximage(ctx, GL_FALSE, 3, target, level, internalFormat,
-            width, height, depth,
-            border, format, type, 0, pixels);
+   teximage_err(ctx, GL_FALSE, 3, target, level, internalFormat,
+                width, height, depth, border, format, type, 0, pixels);
 }
 
 
@@ -3118,6 +3149,40 @@
 
 
 void GLAPIENTRY
+_mesa_TexImage1D_no_error(GLenum target, GLint level, GLint internalFormat,
+                          GLsizei width, GLint border, GLenum format,
+                          GLenum type, const GLvoid *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   teximage_no_error(ctx, GL_FALSE, 1, target, level, internalFormat, width, 1,
+                     1, border, format, type, 0, pixels);
+}
+
+
+void GLAPIENTRY
+_mesa_TexImage2D_no_error(GLenum target, GLint level, GLint internalFormat,
+                          GLsizei width, GLsizei height, GLint border,
+                          GLenum format, GLenum type, const GLvoid *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   teximage_no_error(ctx, GL_FALSE, 2, target, level, internalFormat, width,
+                     height, 1, border, format, type, 0, pixels);
+}
+
+
+void GLAPIENTRY
+_mesa_TexImage3D_no_error(GLenum target, GLint level, GLint internalFormat,
+                          GLsizei width, GLsizei height, GLsizei depth,
+                          GLint border, GLenum format, GLenum type,
+                          const GLvoid *pixels )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   teximage_no_error(ctx, GL_FALSE, 3, target, level, internalFormat,
+                     width, height, depth, border, format, type, 0, pixels);
+}
+
+
+void GLAPIENTRY
 _mesa_EGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image)
 {
    struct gl_texture_object *texObj;
@@ -3186,15 +3251,15 @@
  * Helper that implements the glTexSubImage1/2/3D()
  * and glTextureSubImage1/2/3D() functions.
  */
-void
-_mesa_texture_sub_image(struct gl_context *ctx, GLuint dims,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage,
-                        GLenum target, GLint level,
-                        GLint xoffset, GLint yoffset, GLint zoffset,
-                        GLsizei width, GLsizei height, GLsizei depth,
-                        GLenum format, GLenum type, const GLvoid *pixels,
-                        bool dsa)
+static void
+texture_sub_image(struct gl_context *ctx, GLuint dims,
+                  struct gl_texture_object *texObj,
+                  struct gl_texture_image *texImage,
+                  GLenum target, GLint level,
+                  GLint xoffset, GLint yoffset, GLint zoffset,
+                  GLsizei width, GLsizei height, GLsizei depth,
+                  GLenum format, GLenum type, const GLvoid *pixels,
+                  bool dsa)
 {
    FLUSH_VERTICES(ctx, 0);
 
@@ -3238,11 +3303,11 @@
  * Must split this out this way because of GL_TEXTURE_CUBE_MAP.
  */
 static void
-texsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
-            GLint xoffset, GLint yoffset, GLint zoffset,
-            GLsizei width, GLsizei height, GLsizei depth,
-            GLenum format, GLenum type, const GLvoid *pixels,
-            const char *callerName)
+texsubimage_err(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
+                GLint xoffset, GLint yoffset, GLint zoffset,
+                GLsizei width, GLsizei height, GLsizei depth,
+                GLenum format, GLenum type, const GLvoid *pixels,
+                const char *callerName)
 {
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
@@ -3276,9 +3341,27 @@
                   _mesa_enum_to_string(format),
                   _mesa_enum_to_string(type), pixels);
 
-   _mesa_texture_sub_image(ctx, dims, texObj, texImage, target, level,
-                           xoffset, yoffset, zoffset, width, height, depth,
-                           format, type, pixels, false);
+   texture_sub_image(ctx, dims, texObj, texImage, target, level,
+                     xoffset, yoffset, zoffset, width, height, depth,
+                     format, type, pixels, false);
+}
+
+
+/* Unchecked path used by the _mesa_TexSubImage*D_no_error entry points. */
+static void
+texsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
+            GLint xoffset, GLint yoffset, GLint zoffset,
+            GLsizei width, GLsizei height, GLsizei depth,
+            GLenum format, GLenum type, const GLvoid *pixels)
+{
+   struct gl_texture_object *texObj =
+      _mesa_get_current_tex_object(ctx, target);
+   struct gl_texture_image *texImage =
+      _mesa_select_tex_image(texObj, target, level);
+
+   texture_sub_image(ctx, dims, texObj, texImage, target, level,
+                     xoffset, yoffset, zoffset, width, height, depth,
+                     format, type, pixels, false);
+}
 
 
@@ -3307,12 +3390,9 @@
                   _mesa_enum_to_string(type), pixels);
 
    /* Get the texture object by Name. */
-   texObj = _mesa_lookup_texture(ctx, texture);
-   if (!texObj) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureSubImage%uD(texture)",
-                  dims);
+   texObj = _mesa_lookup_texture_err(ctx, texture, callerName);
+   if (!texObj)
       return;
-   }
 
    /* check target (proxies not allowed) */
    if (!legal_texsubimage_target(ctx, dims, texObj->Target, true)) {
@@ -3376,10 +3456,10 @@
          texImage = texObj->Image[i][level];
          assert(texImage);
 
-         _mesa_texture_sub_image(ctx, 3, texObj, texImage, texObj->Target,
-                                 level, xoffset, yoffset, 0,
-                                 width, height, 1, format,
-                                 type, pixels, true);
+         texture_sub_image(ctx, 3, texObj, texImage, texObj->Target,
+                           level, xoffset, yoffset, 0,
+                           width, height, 1, format,
+                           type, pixels, true);
          pixels = (GLubyte *) pixels + imageStride;
       }
    }
@@ -3387,25 +3467,54 @@
       texImage = _mesa_select_tex_image(texObj, texObj->Target, level);
       assert(texImage);
 
-      _mesa_texture_sub_image(ctx, dims, texObj, texImage, texObj->Target,
-                              level, xoffset, yoffset, zoffset,
-                              width, height, depth, format,
-                              type, pixels, true);
+      texture_sub_image(ctx, dims, texObj, texImage, texObj->Target,
+                        level, xoffset, yoffset, zoffset,
+                        width, height, depth, format,
+                        type, pixels, true);
    }
 }
 
 
 void GLAPIENTRY
+_mesa_TexSubImage1D_no_error(GLenum target, GLint level,
+                             GLint xoffset, GLsizei width,
+                             GLenum format, GLenum type,
+                             const GLvoid *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   texsubimage(ctx, 1, target, level,
+               xoffset, 0, 0,
+               width, 1, 1,
+               format, type, pixels);
+}
+
+
+void GLAPIENTRY
 _mesa_TexSubImage1D( GLenum target, GLint level,
                      GLint xoffset, GLsizei width,
                      GLenum format, GLenum type,
                      const GLvoid *pixels )
 {
    GET_CURRENT_CONTEXT(ctx);
-   texsubimage(ctx, 1, target, level,
-               xoffset, 0, 0,
-               width, 1, 1,
-               format, type, pixels, "glTexSubImage1D");
+   texsubimage_err(ctx, 1, target, level,
+                   xoffset, 0, 0,
+                   width, 1, 1,
+                   format, type, pixels, "glTexSubImage1D");
+}
+
+
+void GLAPIENTRY
+_mesa_TexSubImage2D_no_error(GLenum target, GLint level,
+                             GLint xoffset, GLint yoffset,
+                             GLsizei width, GLsizei height,
+                             GLenum format, GLenum type,
+                             const GLvoid *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   texsubimage(ctx, 2, target, level,
+               xoffset, yoffset, 0,
+               width, height, 1,
+               format, type, pixels);
 }
 
 
@@ -3417,13 +3526,27 @@
                      const GLvoid *pixels )
 {
    GET_CURRENT_CONTEXT(ctx);
-   texsubimage(ctx, 2, target, level,
-               xoffset, yoffset, 0,
-               width, height, 1,
-               format, type, pixels, "glTexSubImage2D");
+   texsubimage_err(ctx, 2, target, level,
+                   xoffset, yoffset, 0,
+                   width, height, 1,
+                   format, type, pixels, "glTexSubImage2D");
 }
 
 
+void GLAPIENTRY
+_mesa_TexSubImage3D_no_error(GLenum target, GLint level,
+                             GLint xoffset, GLint yoffset, GLint zoffset,
+                             GLsizei width, GLsizei height, GLsizei depth,
+                             GLenum format, GLenum type,
+                             const GLvoid *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   texsubimage(ctx, 3, target, level,
+               xoffset, yoffset, zoffset,
+               width, height, depth,
+               format, type, pixels);
+}
+
 
 void GLAPIENTRY
 _mesa_TexSubImage3D( GLenum target, GLint level,
@@ -3433,10 +3556,10 @@
                      const GLvoid *pixels )
 {
    GET_CURRENT_CONTEXT(ctx);
-   texsubimage(ctx, 3, target, level,
-               xoffset, yoffset, zoffset,
-               width, height, depth,
-               format, type, pixels, "glTexSubImage3D");
+   texsubimage_err(ctx, 3, target, level,
+                   xoffset, yoffset, zoffset,
+                   width, height, depth,
+                   format, type, pixels, "glTexSubImage3D");
 }
 
 void GLAPIENTRY
@@ -3578,19 +3701,114 @@
    return true;
 }
 
+
+/**
+ * Implementation for glCopyTex(ture)SubImage1/2/3D() functions.
+ */
+static void
+copy_texture_sub_image(struct gl_context *ctx, GLuint dims,
+                       struct gl_texture_object *texObj,
+                       GLenum target, GLint level,
+                       GLint xoffset, GLint yoffset, GLint zoffset,
+                       GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   struct gl_texture_image *texImage;
+
+   _mesa_lock_texture(ctx, texObj);
+
+   texImage = _mesa_select_tex_image(texObj, target, level);
+
+   /* If we have a border, offset=-1 is legal.  Bias by border width. */
+   switch (dims) {
+   case 3:
+      if (target != GL_TEXTURE_2D_ARRAY)
+         zoffset += texImage->Border;
+      /* fall-through */
+   case 2:
+      if (target != GL_TEXTURE_1D_ARRAY)
+         yoffset += texImage->Border;
+      /* fall-through */
+   case 1:
+      xoffset += texImage->Border;
+   }
+
+   if (_mesa_clip_copytexsubimage(ctx, &xoffset, &yoffset, &x, &y,
+                                  &width, &height)) {
+      struct gl_renderbuffer *srcRb =
+         get_copy_tex_image_source(ctx, texImage->TexFormat);
+
+      copytexsubimage_by_slice(ctx, texImage, dims, xoffset, yoffset, zoffset,
+                               srcRb, x, y, width, height);
+
+      check_gen_mipmap(ctx, target, texObj, level);
+
+      /* NOTE: Don't signal _NEW_TEXTURE_OBJECT since we've only changed
+       * the texel data, not the texture format, size, etc.
+       */
+   }
+
+   _mesa_unlock_texture(ctx, texObj);
+}
+
+/* Runs copytexsubimage_error_check() and copies only if it reports no error. */
+static void
+copy_texture_sub_image_err(struct gl_context *ctx, GLuint dims,
+                           struct gl_texture_object *texObj,
+                           GLenum target, GLint level,
+                           GLint xoffset, GLint yoffset, GLint zoffset,
+                           GLint x, GLint y, GLsizei width, GLsizei height,
+                           const char *caller)
+{
+   FLUSH_VERTICES(ctx, 0);
+
+   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
+      _mesa_debug(ctx, "%s %s %d %d %d %d %d %d %d %d\n", caller,
+                  _mesa_enum_to_string(target),
+                  level, xoffset, yoffset, zoffset, x, y, width, height);
+
+   if (ctx->NewState & NEW_COPY_TEX_STATE)
+      _mesa_update_state(ctx);
+
+   if (copytexsubimage_error_check(ctx, dims, texObj, target, level,
+                                   xoffset, yoffset, zoffset,
+                                   width, height, caller)) {
+      return;
+   }
+
+   copy_texture_sub_image(ctx, dims, texObj, target, level, xoffset, yoffset,
+                          zoffset, x, y, width, height);
+}
+
+/* As copy_texture_sub_image_err() but without logging or error checks. */
+static void
+copy_texture_sub_image_no_error(struct gl_context *ctx, GLuint dims,
+                                struct gl_texture_object *texObj,
+                                GLenum target, GLint level,
+                                GLint xoffset, GLint yoffset, GLint zoffset,
+                                GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   FLUSH_VERTICES(ctx, 0);
+
+   if (ctx->NewState & NEW_COPY_TEX_STATE)
+      _mesa_update_state(ctx);
+
+   copy_texture_sub_image(ctx, dims, texObj, target, level, xoffset, yoffset,
+                          zoffset, x, y, width, height);
+}
+
+
 /**
  * Implement the glCopyTexImage1/2D() functions.
  */
-static void
+static ALWAYS_INLINE void
 copyteximage(struct gl_context *ctx, GLuint dims,
              GLenum target, GLint level, GLenum internalFormat,
-             GLint x, GLint y, GLsizei width, GLsizei height, GLint border )
+             GLint x, GLint y, GLsizei width, GLsizei height, GLint border,
+             bool no_error)
 {
-   struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
-   const GLuint face = _mesa_tex_target_to_face(target);
+   struct gl_texture_object *texObj;
    mesa_format texFormat;
-   struct gl_renderbuffer *rb;
 
    FLUSH_VERTICES(ctx, 0);
 
@@ -3604,15 +3822,18 @@
    if (ctx->NewState & NEW_COPY_TEX_STATE)
       _mesa_update_state(ctx);
 
-   if (copytexture_error_check(ctx, dims, target, level, internalFormat,
-                               width, height, border))
-      return;
+   if (!no_error) {
+      if (copytexture_error_check(ctx, dims, target, level, internalFormat,
+                                  width, height, border))
+         return;
 
-   if (!_mesa_legal_texture_dimensions(ctx, target, level, width, height,
-                                       1, border)) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glCopyTexImage%uD(invalid width or height)", dims);
-      return;
+      if (!_mesa_legal_texture_dimensions(ctx, target, level, width, height,
+                                          1, border)) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glCopyTexImage%uD(invalid width=%d or height=%d)",
+                     dims, width, height);
+         return;
+      }
    }
 
    texObj = _mesa_get_current_tex_object(ctx, target);
@@ -3630,9 +3851,13 @@
       if (texImage && can_avoid_reallocation(texImage, internalFormat, texFormat,
                                              x, y, width, height, border)) {
          _mesa_unlock_texture(ctx, texObj);
-         _mesa_copy_texture_sub_image(ctx, dims, texObj, target, level,
-                                      0, 0, 0, x, y, width, height,
-                                      "CopyTexImage");
+         if (no_error) {
+            copy_texture_sub_image_no_error(ctx, dims, texObj, target, level, 0,
+                                            0, 0, x, y, width, height);
+         } else {
+            copy_texture_sub_image_err(ctx, dims, texObj, target, level, 0, 0,
+                                       0, x, y, width, height,"CopyTexImage");
+         }
          return;
       }
    }
@@ -3640,9 +3865,10 @@
    _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_LOW, "glCopyTexImage "
                     "can't avoid reallocating texture storage\n");
 
-   rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat);
+   if (!no_error && _mesa_is_gles3(ctx)) {
+      struct gl_renderbuffer *rb =
+         _mesa_get_read_renderbuffer_for_format(ctx, internalFormat);
 
-   if (_mesa_is_gles3(ctx)) {
       if (_mesa_is_enum_format_unsized(internalFormat)) {
       /* Conversion from GL_RGB10_A2 source buffer format is not allowed in
        * OpenGL ES 3.0. Khronos bug# 9807.
@@ -3702,6 +3928,7 @@
       }
       else {
          GLint srcX = x, srcY = y, dstX = 0, dstY = 0, dstZ = 0;
+         const GLuint face = _mesa_tex_target_to_face(target);
 
          /* Free old texture image */
          ctx->Driver.FreeTextureImageBuffer(ctx, texImage);
@@ -3735,6 +3962,24 @@
 }
 
 
+static void
+copyteximage_err(struct gl_context *ctx, GLuint dims, GLenum target,
+                 GLint level, GLenum internalFormat, GLint x, GLint y,
+                 GLsizei width, GLsizei height, GLint border)
+{
+   copyteximage(ctx, dims, target, level, internalFormat, x, y, width, height,
+                border, false);
+}
+
+static void
+copyteximage_no_error(struct gl_context *ctx, GLuint dims, GLenum target,
+                      GLint level, GLenum internalFormat, GLint x, GLint y,
+                      GLsizei width, GLsizei height, GLint border)
+{
+   copyteximage(ctx, dims, target, level, internalFormat, x, y, width, height,
+                border, true);
+}
+
 
 void GLAPIENTRY
 _mesa_CopyTexImage1D( GLenum target, GLint level,
@@ -3743,7 +3988,8 @@
                       GLsizei width, GLint border )
 {
    GET_CURRENT_CONTEXT(ctx);
-   copyteximage(ctx, 1, target, level, internalFormat, x, y, width, 1, border);
+   copyteximage_err(ctx, 1, target, level, internalFormat, x, y, width, 1,
+                    border);
 }
 
 
@@ -3754,77 +4000,32 @@
                       GLint border )
 {
    GET_CURRENT_CONTEXT(ctx);
-   copyteximage(ctx, 2, target, level, internalFormat,
-                x, y, width, height, border);
+   copyteximage_err(ctx, 2, target, level, internalFormat,
+                    x, y, width, height, border);
 }
 
-/**
- * Implementation for glCopyTex(ture)SubImage1/2/3D() functions.
- */
-void
-_mesa_copy_texture_sub_image(struct gl_context *ctx, GLuint dims,
-                             struct gl_texture_object *texObj,
-                             GLenum target, GLint level,
-                             GLint xoffset, GLint yoffset, GLint zoffset,
-                             GLint x, GLint y,
-                             GLsizei width, GLsizei height,
-                             const char *caller)
+
+void GLAPIENTRY
+_mesa_CopyTexImage1D_no_error(GLenum target, GLint level, GLenum internalFormat,
+                              GLint x, GLint y, GLsizei width, GLint border)
 {
-   struct gl_texture_image *texImage;
-
-   FLUSH_VERTICES(ctx, 0);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "%s %s %d %d %d %d %d %d %d %d\n", caller,
-                  _mesa_enum_to_string(target),
-                  level, xoffset, yoffset, zoffset, x, y, width, height);
-
-   if (ctx->NewState & NEW_COPY_TEX_STATE)
-      _mesa_update_state(ctx);
-
-   if (copytexsubimage_error_check(ctx, dims, texObj, target, level,
-                                   xoffset, yoffset, zoffset,
-                                   width, height, caller)) {
-      return;
-   }
-
-   _mesa_lock_texture(ctx, texObj);
-   {
-      texImage = _mesa_select_tex_image(texObj, target, level);
-
-      /* If we have a border, offset=-1 is legal.  Bias by border width. */
-      switch (dims) {
-      case 3:
-         if (target != GL_TEXTURE_2D_ARRAY)
-            zoffset += texImage->Border;
-         /* fall-through */
-      case 2:
-         if (target != GL_TEXTURE_1D_ARRAY)
-            yoffset += texImage->Border;
-         /* fall-through */
-      case 1:
-         xoffset += texImage->Border;
-      }
-
-      if (_mesa_clip_copytexsubimage(ctx, &xoffset, &yoffset, &x, &y,
-                                     &width, &height)) {
-         struct gl_renderbuffer *srcRb =
-            get_copy_tex_image_source(ctx, texImage->TexFormat);
-
-         copytexsubimage_by_slice(ctx, texImage, dims,
-                                  xoffset, yoffset, zoffset,
-                                  srcRb, x, y, width, height);
-
-         check_gen_mipmap(ctx, target, texObj, level);
-
-         /* NOTE: Don't signal _NEW_TEXTURE_OBJECT since we've only changed
-          * the texel data, not the texture format, size, etc.
-          */
-      }
-   }
-   _mesa_unlock_texture(ctx, texObj);
+   GET_CURRENT_CONTEXT(ctx);
+   copyteximage_no_error(ctx, 1, target, level, internalFormat, x, y, width, 1,
+                         border);
 }
 
+
+void GLAPIENTRY
+_mesa_CopyTexImage2D_no_error(GLenum target, GLint level,
+                              GLenum internalFormat, GLint x, GLint y,
+                              GLsizei width, GLsizei height, GLint border)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   copyteximage_no_error(ctx, 2 /* dims */, target, level, internalFormat,
+                         x, y, width, height, border);
+}
+
+
 void GLAPIENTRY
 _mesa_CopyTexSubImage1D( GLenum target, GLint level,
                          GLint xoffset, GLint x, GLint y, GLsizei width )
@@ -3846,12 +4047,10 @@
    if (!texObj)
       return;
 
-   _mesa_copy_texture_sub_image(ctx, 1, texObj, target, level, xoffset, 0, 0,
-                                x, y, width, 1, self);
+   copy_texture_sub_image_err(ctx, 1, texObj, target, level, xoffset, 0, 0,
+                              x, y, width, 1, self);
 }
 
-
-
 void GLAPIENTRY
 _mesa_CopyTexSubImage2D( GLenum target, GLint level,
                          GLint xoffset, GLint yoffset,
@@ -3874,9 +4073,8 @@
    if (!texObj)
       return;
 
-   _mesa_copy_texture_sub_image(ctx, 2, texObj, target, level,
-                                xoffset, yoffset, 0,
-                                x, y, width, height, self);
+   copy_texture_sub_image_err(ctx, 2, texObj, target, level, xoffset, yoffset,
+                              0, x, y, width, height, self);
 }
 
 
@@ -3903,9 +4101,8 @@
    if (!texObj)
       return;
 
-   _mesa_copy_texture_sub_image(ctx, 3, texObj, target, level,
-                                xoffset, yoffset, zoffset,
-                                x, y, width, height, self);
+   copy_texture_sub_image_err(ctx, 3, texObj, target, level, xoffset, yoffset,
+                              zoffset, x, y, width, height, self);
 }
 
 void GLAPIENTRY
@@ -3927,8 +4124,8 @@
       return;
    }
 
-   _mesa_copy_texture_sub_image(ctx, 1, texObj, texObj->Target, level,
-                                xoffset, 0, 0, x, y, width, 1, self);
+   copy_texture_sub_image_err(ctx, 1, texObj, texObj->Target, level, xoffset, 0,
+                              0, x, y, width, 1, self);
 }
 
 void GLAPIENTRY
@@ -3951,9 +4148,8 @@
       return;
    }
 
-   _mesa_copy_texture_sub_image(ctx, 2, texObj, texObj->Target, level,
-                                xoffset, yoffset, 0,
-                                x, y, width, height, self);
+   copy_texture_sub_image_err(ctx, 2, texObj, texObj->Target, level, xoffset,
+                              yoffset, 0, x, y, width, height, self);
 }
 
 
@@ -3980,17 +4176,97 @@
 
    if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
       /* Act like CopyTexSubImage2D */
-      _mesa_copy_texture_sub_image(ctx, 2, texObj,
-                                   GL_TEXTURE_CUBE_MAP_POSITIVE_X + zoffset,
-                                   level, xoffset, yoffset, 0,
-                                   x, y, width, height, self);
+      copy_texture_sub_image_err(ctx, 2, texObj,
+                                GL_TEXTURE_CUBE_MAP_POSITIVE_X + zoffset,
+                                level, xoffset, yoffset, 0, x, y, width, height,
+                                self);
    }
    else
-      _mesa_copy_texture_sub_image(ctx, 3, texObj, texObj->Target, level,
-                                   xoffset, yoffset, zoffset,
-                                   x, y, width, height, self);
+      copy_texture_sub_image_err(ctx, 3, texObj, texObj->Target, level, xoffset,
+                                 yoffset, zoffset, x, y, width, height, self);
 }
 
+
+void GLAPIENTRY
+_mesa_CopyTexSubImage1D_no_error(GLenum target, GLint level, GLint xoffset,
+                                 GLint x, GLint y, GLsizei width)
+{
+   /* No-error path: the current texture for 'target' is not NULL-checked. */
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex = _mesa_get_current_tex_object(ctx, target);
+   copy_texture_sub_image_no_error(ctx, 1, tex, target, level,
+                                   xoffset, 0, 0, x, y, width, 1);
+}
+
+void GLAPIENTRY
+_mesa_CopyTexSubImage2D_no_error(GLenum target, GLint level, GLint xoffset,
+                                 GLint yoffset, GLint x, GLint y, GLsizei width,
+                                 GLsizei height)
+{
+   /* No-error path: the current texture for 'target' is not NULL-checked. */
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex = _mesa_get_current_tex_object(ctx, target);
+   copy_texture_sub_image_no_error(ctx, 2, tex, target, level,
+                                   xoffset, yoffset, 0, x, y, width, height);
+}
+
+void GLAPIENTRY
+_mesa_CopyTexSubImage3D_no_error(GLenum target, GLint level, GLint xoffset,
+                                 GLint yoffset, GLint zoffset, GLint x, GLint y,
+                                 GLsizei width, GLsizei height)
+{
+   /* No-error path: the current texture for 'target' is not NULL-checked. */
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex = _mesa_get_current_tex_object(ctx, target);
+   copy_texture_sub_image_no_error(ctx, 3, tex, target, level, xoffset,
+                                   yoffset, zoffset, x, y, width, height);
+}
+
+void GLAPIENTRY
+_mesa_CopyTextureSubImage1D_no_error(GLuint texture, GLint level, GLint xoffset,
+                                     GLint x, GLint y, GLsizei width)
+{
+   /* No-error path: the looked-up object is dereferenced without a check. */
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex = _mesa_lookup_texture(ctx, texture);
+   copy_texture_sub_image_no_error(ctx, 1, tex, tex->Target, level, xoffset,
+                                   0, 0, x, y, width, 1);
+}
+
+void GLAPIENTRY
+_mesa_CopyTextureSubImage2D_no_error(GLuint texture, GLint level, GLint xoffset,
+                                     GLint yoffset, GLint x, GLint y,
+                                     GLsizei width, GLsizei height)
+{
+   /* No-error path: the looked-up object is dereferenced without a check. */
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex = _mesa_lookup_texture(ctx, texture);
+   copy_texture_sub_image_no_error(ctx, 2, tex, tex->Target, level, xoffset,
+                                   yoffset, 0, x, y, width, height);
+}
+
+void GLAPIENTRY
+_mesa_CopyTextureSubImage3D_no_error(GLuint texture, GLint level, GLint xoffset,
+                                     GLint yoffset, GLint zoffset, GLint x,
+                                     GLint y, GLsizei width, GLsizei height)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex = _mesa_lookup_texture(ctx, texture);
+
+   if (tex->Target != GL_TEXTURE_CUBE_MAP) {
+      /* Ordinary case: zoffset is passed through as the z offset. */
+      copy_texture_sub_image_no_error(ctx, 3, tex, tex->Target, level, xoffset,
+                                      yoffset, zoffset, x, y, width, height);
+      return;
+   }
+
+   /* Cube map: zoffset picks the face, so this acts like CopyTexSubImage2D. */
+   const GLenum face = GL_TEXTURE_CUBE_MAP_POSITIVE_X + zoffset;
+   copy_texture_sub_image_no_error(ctx, 2, tex, face, level, xoffset, yoffset,
+                                   0, x, y, width, height);
+}
+
+
 static bool
 check_clear_tex_image(struct gl_context *ctx,
                       const char *function,
@@ -4071,17 +4347,9 @@
 {
    struct gl_texture_object *texObj;
 
-   if (texture == 0) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(zero texture)", function);
+   texObj = _mesa_lookup_texture_err(ctx, texture, function);
+   if (!texObj)
       return NULL;
-   }
-
-   texObj = _mesa_lookup_texture(ctx, texture);
-
-   if (texObj == NULL) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-gen name)", function);
-      return NULL;
-   }
 
    if (texObj->Target == 0) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unbound tex)", function);
@@ -4091,6 +4359,15 @@
    return texObj;
 }
 
+
+/**
+ * For clearing cube textures, the zoffset and depth parameters indicate
+ * which cube map faces are to be cleared.  This is the one case where we
+ * need to be concerned with multiple gl_texture_images.  This function
+ * returns the array of texture images to clear for cube maps, or one
+ * texture image otherwise.
+ * \return number of texture images, 0 for error, 6 for cube, 1 otherwise.
+ */
 static int
 get_tex_images_for_clear(struct gl_context *ctx,
                          const char *function,
@@ -4099,7 +4376,7 @@
                          struct gl_texture_image **texImages)
 {
    GLenum target;
-   int i;
+   int numFaces, i;
 
    if (level < 0 || level >= MAX_TEXTURE_LEVELS) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid level)", function);
@@ -4107,28 +4384,23 @@
    }
 
    if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
-      for (i = 0; i < MAX_FACES; i++) {
-         target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + i;
+      target = GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+      numFaces = MAX_FACES;
+   }
+   else {
+      target = texObj->Target;
+      numFaces = 1;
+   }
 
-         texImages[i] = _mesa_select_tex_image(texObj, target, level);
-         if (texImages[i] == NULL) {
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "%s(invalid level)", function);
-            return 0;
-         }
+   for (i = 0; i < numFaces; i++) {
+      texImages[i] = _mesa_select_tex_image(texObj, target + i, level);
+      if (texImages[i] == NULL) {
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid level)", function);
+         return 0;
       }
-
-      return MAX_FACES;
    }
 
-   texImages[0] = _mesa_select_tex_image(texObj, texObj->Target, level);
-
-   if (texImages[0] == NULL) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid level)", function);
-      return 0;
-   }
-
-   return 1;
+   return numFaces;
 }
 
 void GLAPIENTRY
@@ -4160,6 +4432,7 @@
       minDepth = -(int) texImages[0]->Border;
       maxDepth = texImages[0]->Depth;
    } else {
+      assert(numImages == MAX_FACES);
       minDepth = 0;
       maxDepth = numImages;
    }
@@ -4189,7 +4462,9 @@
                                       data ? clearValue[0] : NULL);
       }
    } else {
+      /* loop over cube face images */
       for (i = zoffset; i < zoffset + depth; i++) {
+         assert(i < MAX_FACES);
          if (!check_clear_tex_image(ctx, "glClearTexSubImage",
                                     texImages[i],
                                     format, type, data, clearValue[i]))
@@ -4466,8 +4741,8 @@
                               const GLvoid *data)
 {
    GET_CURRENT_CONTEXT(ctx);
-   teximage(ctx, GL_TRUE, 1, target, level, internalFormat,
-            width, 1, 1, border, GL_NONE, GL_NONE, imageSize, data);
+   teximage_err(ctx, GL_TRUE, 1, target, level, internalFormat,
+                width, 1, 1, border, GL_NONE, GL_NONE, imageSize, data);
 }
 
 
@@ -4478,8 +4753,8 @@
                               const GLvoid *data)
 {
    GET_CURRENT_CONTEXT(ctx);
-   teximage(ctx, GL_TRUE, 2, target, level, internalFormat,
-            width, height, 1, border, GL_NONE, GL_NONE, imageSize, data);
+   teximage_err(ctx, GL_TRUE, 2, target, level, internalFormat,
+                width, height, 1, border, GL_NONE, GL_NONE, imageSize, data);
 }
 
 
@@ -4490,8 +4765,44 @@
                               GLsizei imageSize, const GLvoid *data)
 {
    GET_CURRENT_CONTEXT(ctx);
-   teximage(ctx, GL_TRUE, 3, target, level, internalFormat,
-            width, height, depth, border, GL_NONE, GL_NONE, imageSize, data);
+   teximage_err(ctx, GL_TRUE, 3, target, level, internalFormat, width, height,
+                depth, border, GL_NONE, GL_NONE, imageSize, data);
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTexImage1D_no_error(GLenum target, GLint level,
+                                    GLenum internalFormat, GLsizei width,
+                                    GLint border, GLsizei imageSize,
+                                    const GLvoid *data)
+{
+   GET_CURRENT_CONTEXT(ctx); /* unchecked forward to teximage_no_error() */
+   teximage_no_error(ctx, GL_TRUE, 1, target, level, internalFormat, width, 1,
+                     1, border, GL_NONE, GL_NONE, imageSize, data);
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTexImage2D_no_error(GLenum target, GLint level,
+                                    GLenum internalFormat, GLsizei width,
+                                    GLsizei height, GLint border,
+                                    GLsizei imageSize, const GLvoid *data)
+{
+   GET_CURRENT_CONTEXT(ctx); /* unchecked forward to teximage_no_error() */
+   teximage_no_error(ctx, GL_TRUE, 2, target, level, internalFormat, width,
+                     height, 1, border, GL_NONE, GL_NONE, imageSize, data);
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTexImage3D_no_error(GLenum target, GLint level,
+                                    GLenum internalFormat, GLsizei width,
+                                    GLsizei height, GLsizei depth, GLint border,
+                                    GLsizei imageSize, const GLvoid *data)
+{
+   GET_CURRENT_CONTEXT(ctx); /* unchecked forward to teximage_no_error() */
+   teximage_no_error(ctx, GL_TRUE, 3, target, level, internalFormat, width,
+                     height, depth, border, GL_NONE, GL_NONE, imageSize, data);
 }
 
 
@@ -4499,17 +4810,14 @@
  * Common helper for glCompressedTexSubImage1/2/3D() and
  * glCompressedTextureSubImage1/2/3D().
  */
-void
-_mesa_compressed_texture_sub_image(struct gl_context *ctx, GLuint dims,
-                                   struct gl_texture_object *texObj,
-                                   struct gl_texture_image *texImage,
-                                   GLenum target, GLint level,
-                                   GLint xoffset, GLint yoffset,
-                                   GLint zoffset,
-                                   GLsizei width, GLsizei height,
-                                   GLsizei depth,
-                                   GLenum format, GLsizei imageSize,
-                                   const GLvoid *data)
+static void
+compressed_texture_sub_image(struct gl_context *ctx, GLuint dims,
+                             struct gl_texture_object *texObj,
+                             struct gl_texture_image *texImage,
+                             GLenum target, GLint level, GLint xoffset,
+                             GLint yoffset, GLint zoffset, GLsizei width,
+                             GLsizei height, GLsizei depth, GLenum format,
+                             GLsizei imageSize, const GLvoid *data)
 {
    FLUSH_VERTICES(ctx, 0);
 
@@ -4532,249 +4840,75 @@
 }
 
 
-void GLAPIENTRY
-_mesa_CompressedTexSubImage1D(GLenum target, GLint level, GLint xoffset,
-                              GLsizei width, GLenum format,
-                              GLsizei imageSize, const GLvoid *data)
+static ALWAYS_INLINE void
+compressed_tex_sub_image(unsigned dim, GLenum target, GLuint texture,
+                         GLint level, GLint xoffset, GLint yoffset,
+                         GLint zoffset, GLsizei width, GLsizei height,
+                         GLsizei depth, GLenum format, GLsizei imageSize,
+                         const GLvoid *data, bool dsa, bool no_error,
+                         const char *caller)
 {
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
 
    GET_CURRENT_CONTEXT(ctx);
 
-   if (compressed_subtexture_target_check(ctx, target, 1, format, false,
-                                          "glCompressedTexSubImage1D")) {
+   if (dsa) {
+      if (no_error) {
+         texObj = _mesa_lookup_texture(ctx, texture);
+      } else {
+         texObj = _mesa_lookup_texture_err(ctx, texture, caller);
+         if (!texObj)
+            return;
+      }
+
+      target = texObj->Target;
+   }
+
+   if (!no_error &&
+       compressed_subtexture_target_check(ctx, target, dim, format, dsa,
+                                          caller)) {
       return;
    }
 
-   texObj = _mesa_get_current_tex_object(ctx, target);
-   if (!texObj)
-      return;
-
-   if (compressed_subtexture_error_check(ctx, 1, texObj, target,
-                                         level, xoffset, 0, 0,
-                                         width, 1, 1,
-                                         format, imageSize, data,
-                                         "glCompressedTexSubImage1D")) {
-      return;
+   if (!dsa) {
+      texObj = _mesa_get_current_tex_object(ctx, target);
+      if (!no_error && !texObj)
+         return;
    }
 
-   texImage = _mesa_select_tex_image(texObj, target, level);
-   assert(texImage);
-
-   _mesa_compressed_texture_sub_image(ctx, 1, texObj, texImage, target, level,
-                                      xoffset, 0, 0, width, 1, 1,
-                                      format, imageSize, data);
-}
-
-void GLAPIENTRY
-_mesa_CompressedTextureSubImage1D(GLuint texture, GLint level, GLint xoffset,
-                                  GLsizei width, GLenum format,
-                                  GLsizei imageSize, const GLvoid *data)
-{
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-
-   GET_CURRENT_CONTEXT(ctx);
-
-   texObj = _mesa_lookup_texture_err(ctx, texture,
-                                     "glCompressedTextureSubImage1D");
-   if (!texObj)
-      return;
-
-   if (compressed_subtexture_target_check(ctx, texObj->Target, 1, format, true,
-                                          "glCompressedTextureSubImage1D")) {
-      return;
-   }
-
-   if (compressed_subtexture_error_check(ctx, 1, texObj, texObj->Target,
-                                         level, xoffset, 0, 0,
-                                         width, 1, 1,
-                                         format, imageSize, data,
-                                         "glCompressedTextureSubImage1D")) {
-      return;
-   }
-
-   texImage = _mesa_select_tex_image(texObj, texObj->Target, level);
-   assert(texImage);
-
-   _mesa_compressed_texture_sub_image(ctx, 1, texObj, texImage,
-                                      texObj->Target, level,
-                                      xoffset, 0, 0, width, 1, 1,
-                                      format, imageSize, data);
-}
-
-
-void GLAPIENTRY
-_mesa_CompressedTexSubImage2D(GLenum target, GLint level, GLint xoffset,
-                              GLint yoffset, GLsizei width, GLsizei height,
-                              GLenum format, GLsizei imageSize,
-                              const GLvoid *data)
-{
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-
-   GET_CURRENT_CONTEXT(ctx);
-
-   if (compressed_subtexture_target_check(ctx, target, 2, format, false,
-                                          "glCompressedTexSubImage2D")) {
-      return;
-   }
-
-   texObj = _mesa_get_current_tex_object(ctx, target);
-   if (!texObj)
-      return;
-
-   if (compressed_subtexture_error_check(ctx, 2, texObj, target,
-                                         level, xoffset, yoffset, 0,
-                                         width, height, 1,
-                                         format, imageSize, data,
-                                         "glCompressedTexSubImage2D")) {
-      return;
-   }
-
-
-   texImage = _mesa_select_tex_image(texObj, target, level);
-   assert(texImage);
-
-   _mesa_compressed_texture_sub_image(ctx, 2, texObj, texImage, target, level,
-                                      xoffset, yoffset, 0, width, height, 1,
-                                      format, imageSize, data);
-}
-
-void GLAPIENTRY
-_mesa_CompressedTextureSubImage2D(GLuint texture, GLint level, GLint xoffset,
-                                  GLint yoffset,
-                                  GLsizei width, GLsizei height,
-                                  GLenum format, GLsizei imageSize,
-                                  const GLvoid *data)
-{
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-
-   GET_CURRENT_CONTEXT(ctx);
-
-   texObj = _mesa_lookup_texture_err(ctx, texture,
-                                 "glCompressedTextureSubImage2D");
-   if (!texObj)
-      return;
-
-   if (compressed_subtexture_target_check(ctx, texObj->Target, 2, format, true,
-                                          "glCompressedTextureSubImage2D")) {
-      return;
-   }
-
-   if (compressed_subtexture_error_check(ctx, 2, texObj, texObj->Target,
-                                         level, xoffset, yoffset, 0,
-                                         width, height, 1,
-                                         format, imageSize, data,
-                                         "glCompressedTextureSubImage2D")) {
-      return;
-   }
-
-   texImage = _mesa_select_tex_image(texObj, texObj->Target, level);
-   assert(texImage);
-
-   _mesa_compressed_texture_sub_image(ctx, 2, texObj, texImage,
-                                      texObj->Target, level,
-                                      xoffset, yoffset, 0, width, height, 1,
-                                      format, imageSize, data);
-}
-
-void GLAPIENTRY
-_mesa_CompressedTexSubImage3D(GLenum target, GLint level, GLint xoffset,
-                              GLint yoffset, GLint zoffset, GLsizei width,
-                              GLsizei height, GLsizei depth, GLenum format,
-                              GLsizei imageSize, const GLvoid *data)
-{
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-
-   GET_CURRENT_CONTEXT(ctx);
-
-   if (compressed_subtexture_target_check(ctx, target, 3, format, false,
-                                          "glCompressedTexSubImage3D")) {
-      return;
-   }
-
-   texObj = _mesa_get_current_tex_object(ctx, target);
-   if (!texObj)
-      return;
-
-   if (compressed_subtexture_error_check(ctx, 3, texObj, target,
-                                         level, xoffset, yoffset, zoffset,
-                                         width, height, depth,
-                                         format, imageSize, data,
-                                         "glCompressedTexSubImage3D")) {
-      return;
-   }
-
-
-   texImage = _mesa_select_tex_image(texObj, target, level);
-   assert(texImage);
-
-   _mesa_compressed_texture_sub_image(ctx, 3, texObj, texImage, target, level,
-                                      xoffset, yoffset, zoffset,
-                                      width, height, depth,
-                                      format, imageSize, data);
-}
-
-void GLAPIENTRY
-_mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset,
-                                  GLint yoffset, GLint zoffset, GLsizei width,
-                                  GLsizei height, GLsizei depth,
-                                  GLenum format, GLsizei imageSize,
-                                  const GLvoid *data)
-{
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-
-   GET_CURRENT_CONTEXT(ctx);
-
-   texObj = _mesa_lookup_texture_err(ctx, texture,
-                                     "glCompressedTextureSubImage3D");
-   if (!texObj)
-      return;
-
-   if (compressed_subtexture_target_check(ctx, texObj->Target, 3, format, true,
-                                          "glCompressedTextureSubImage3D")) {
-      return;
-   }
-
-   if (compressed_subtexture_error_check(ctx, 3, texObj, texObj->Target,
-                                         level, xoffset, yoffset, zoffset,
-                                         width, height, depth,
-                                         format, imageSize, data,
-                                         "glCompressedTextureSubImage3D")) {
+   if (!no_error &&
+       compressed_subtexture_error_check(ctx, dim, texObj, target, level,
+                                         xoffset, yoffset, zoffset, width,
+                                         height, depth, format,
+                                         imageSize, data, caller)) {
       return;
    }
 
    /* Must handle special case GL_TEXTURE_CUBE_MAP. */
-   if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
+   if (dim == 3 && dsa && texObj->Target == GL_TEXTURE_CUBE_MAP) {
       const char *pixels = data;
-      int i;
       GLint image_stride;
 
       /* Make sure the texture object is a proper cube.
        * (See texturesubimage in teximage.c for details on why this check is
        * performed.)
        */
-      if (!_mesa_cube_level_complete(texObj, level)) {
+      if (!no_error && !_mesa_cube_level_complete(texObj, level)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glCompressedTextureSubImage3D(cube map incomplete)");
          return;
       }
 
       /* Copy in each face. */
-      for (i = 0; i < 6; ++i) {
+      for (int i = zoffset; i < zoffset + depth; ++i) {
          texImage = texObj->Image[i][level];
          assert(texImage);
 
-         _mesa_compressed_texture_sub_image(ctx, 3, texObj, texImage,
-                                            texObj->Target, level,
-                                            xoffset, yoffset, zoffset,
-                                            width, height, 1,
-                                            format, imageSize, pixels);
+         compressed_texture_sub_image(ctx, 3, texObj, texImage,
+                                      texObj->Target, level, xoffset, yoffset,
+                                      0, width, height, 1, format,
+                                      imageSize, pixels);
 
          /* Compressed images don't have a client format */
          image_stride = _mesa_format_image_size(texImage->TexFormat,
@@ -4784,19 +4918,161 @@
          pixels += image_stride;
          imageSize -= image_stride;
       }
-   }
-   else {
-      texImage = _mesa_select_tex_image(texObj, texObj->Target, level);
+   } else {
+      texImage = _mesa_select_tex_image(texObj, target, level);
       assert(texImage);
 
-      _mesa_compressed_texture_sub_image(ctx, 3, texObj, texImage,
-                                         texObj->Target, level,
-                                         xoffset, yoffset, zoffset,
-                                         width, height, depth,
-                                         format, imageSize, data);
+      compressed_texture_sub_image(ctx, dim, texObj, texImage, target, level,
+                                   xoffset, yoffset, zoffset, width, height,
+                                   depth, format, imageSize, data);
    }
 }
 
+
+void GLAPIENTRY
+_mesa_CompressedTexSubImage1D_no_error(GLenum target, GLint level,
+                                       GLint xoffset, GLsizei width,
+                                       GLenum format, GLsizei imageSize,
+                                       const GLvoid *data)
+{
+   compressed_tex_sub_image(1, target, 0, level, xoffset, 0, 0, width,
+                            1, 1, format, imageSize, data, false, true,
+                            "glCompressedTexSubImage1D");
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTexSubImage1D(GLenum target, GLint level, GLint xoffset,
+                              GLsizei width, GLenum format,
+                              GLsizei imageSize, const GLvoid *data)
+{
+   compressed_tex_sub_image(1, target, 0, level, xoffset, 0, 0, width, 1, 1,
+                            format, imageSize, data, false, false,
+                            "glCompressedTexSubImage1D");
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTextureSubImage1D_no_error(GLuint texture, GLint level,
+                                           GLint xoffset, GLsizei width,
+                                           GLenum format, GLsizei imageSize,
+                                           const GLvoid *data)
+{
+   compressed_tex_sub_image(1, 0, texture, level, xoffset, 0, 0, width, 1, 1,
+                            format, imageSize, data, true, true,
+                            "glCompressedTextureSubImage1D");
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTextureSubImage1D(GLuint texture, GLint level, GLint xoffset,
+                                  GLsizei width, GLenum format,
+                                  GLsizei imageSize, const GLvoid *data)
+{
+   compressed_tex_sub_image(1, 0, texture, level, xoffset, 0, 0, width, 1, 1,
+                            format, imageSize, data, true, false,
+                            "glCompressedTextureSubImage1D");
+}
+
+void GLAPIENTRY
+_mesa_CompressedTexSubImage2D_no_error(GLenum target, GLint level,
+                                       GLint xoffset, GLint yoffset,
+                                       GLsizei width, GLsizei height,
+                                       GLenum format, GLsizei imageSize,
+                                       const GLvoid *data)
+{
+   compressed_tex_sub_image(2, target, 0, level, xoffset, yoffset, 0, width,
+                            height, 1, format, imageSize, data, false, true,
+                            "glCompressedTexSubImage2D");
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTexSubImage2D(GLenum target, GLint level, GLint xoffset,
+                              GLint yoffset, GLsizei width, GLsizei height,
+                              GLenum format, GLsizei imageSize,
+                              const GLvoid *data)
+{
+   compressed_tex_sub_image(2, target, 0, level, xoffset, yoffset, 0, width,
+                            height, 1, format, imageSize, data, false, false,
+                            "glCompressedTexSubImage2D");
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTextureSubImage2D_no_error(GLuint texture, GLint level,
+                                           GLint xoffset, GLint yoffset,
+                                           GLsizei width, GLsizei height,
+                                           GLenum format, GLsizei imageSize,
+                                           const GLvoid *data)
+{
+   compressed_tex_sub_image(2, 0, texture, level, xoffset, yoffset, 0, width,
+                            height, 1, format, imageSize, data, true, true,
+                            "glCompressedTextureSubImage2D");
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTextureSubImage2D(GLuint texture, GLint level, GLint xoffset,
+                                  GLint yoffset,
+                                  GLsizei width, GLsizei height,
+                                  GLenum format, GLsizei imageSize,
+                                  const GLvoid *data)
+{
+   compressed_tex_sub_image(2, 0, texture, level, xoffset, yoffset, 0, width,
+                            height, 1, format, imageSize, data, true, false,
+                            "glCompressedTextureSubImage2D");
+}
+
+void GLAPIENTRY
+_mesa_CompressedTexSubImage3D_no_error(GLenum target, GLint level,
+                                       GLint xoffset, GLint yoffset,
+                                       GLint zoffset, GLsizei width,
+                                       GLsizei height, GLsizei depth,
+                                       GLenum format, GLsizei imageSize,
+                                       const GLvoid *data)
+{
+   compressed_tex_sub_image(3, target, 0, level, xoffset, yoffset, zoffset,
+                            width, height, depth, format, imageSize, data,
+                            false, true, "glCompressedTexSubImage3D");
+}
+
+void GLAPIENTRY
+_mesa_CompressedTexSubImage3D(GLenum target, GLint level, GLint xoffset,
+                              GLint yoffset, GLint zoffset, GLsizei width,
+                              GLsizei height, GLsizei depth, GLenum format,
+                              GLsizei imageSize, const GLvoid *data)
+{
+   compressed_tex_sub_image(3, target, 0, level, xoffset, yoffset, zoffset,
+                            width, height, depth, format, imageSize, data,
+                            false, false, "glCompressedTexSubImage3D");
+}
+
+void GLAPIENTRY
+_mesa_CompressedTextureSubImage3D_no_error(GLuint texture, GLint level,
+                                           GLint xoffset, GLint yoffset,
+                                           GLint zoffset, GLsizei width,
+                                           GLsizei height, GLsizei depth,
+                                           GLenum format, GLsizei imageSize,
+                                           const GLvoid *data)
+{
+   compressed_tex_sub_image(3, 0, texture, level, xoffset, yoffset, zoffset,
+                            width, height, depth, format, imageSize, data,
+                            true, true, "glCompressedTextureSubImage3D");
+}
+
+void GLAPIENTRY
+_mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset,
+                                  GLint yoffset, GLint zoffset, GLsizei width,
+                                  GLsizei height, GLsizei depth,
+                                  GLenum format, GLsizei imageSize,
+                                  const GLvoid *data)
+{
+   compressed_tex_sub_image(3, 0, texture, level, xoffset, yoffset, zoffset,
+                            width, height, depth, format, imageSize, data,
+                            true, false, "glCompressedTextureSubImage3D");
+}
+
 static mesa_format
 get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
 {
@@ -5044,6 +5320,18 @@
       return;
    }
 
+   if (texObj->HandleAllocated) {
+      /* The ARB_bindless_texture spec says:
+       *
+       * "The error INVALID_OPERATION is generated by TexImage*, CopyTexImage*,
+       *  CompressedTexImage*, TexBuffer*, TexParameter*, as well as other
+       *  functions defined in terms of these, if the texture object to be
+       *  modified is referenced by one or more texture or image handles."
+       */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(immutable texture)", caller);
+      return;
+   }
+
    format = _mesa_validate_texbuffer_format(ctx, internalFormat);
    if (format == MESA_FORMAT_NONE) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(internalFormat %s)",
@@ -5453,7 +5741,7 @@
    else {
       if (!dimensionsOK) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-                     "%s(invalid width or height)", func);
+                     "%s(invalid width=%d or height=%d)", func, width, height);
          return;
       }
 
@@ -5482,7 +5770,7 @@
              * like, but being tidy is good.
              */
             _mesa_init_teximage_fields(ctx, texImage,
-                  0, 0, 0, 0, GL_NONE, MESA_FORMAT_NONE);
+                  0, 0, 0, 0, internalformat, texFormat);
          }
       }
 
diff --git a/src/mesa/main/teximage.h b/src/mesa/main/teximage.h
index 53c2f59..04ddbb9 100644
--- a/src/mesa/main/teximage.h
+++ b/src/mesa/main/teximage.h
@@ -226,27 +226,6 @@
                         GLenum format, GLenum type, const GLvoid *pixels,
                         bool dsa);
 
-extern void
-_mesa_compressed_texture_sub_image(struct gl_context *ctx, GLuint dims, 
-                                   struct gl_texture_object *texObj, 
-                                   struct gl_texture_image *texImage,
-                                   GLenum target, GLint level,
-                                   GLint xoffset, GLint yoffset,
-                                   GLint zoffset,
-                                   GLsizei width, GLsizei height,
-                                   GLsizei depth,
-                                   GLenum format, GLsizei imageSize,
-                                   const GLvoid *data);
-
-extern void
-_mesa_copy_texture_sub_image(struct gl_context *ctx, GLuint dims,
-                             struct gl_texture_object *texObj,
-                             GLenum target, GLint level,
-                             GLint xoffset, GLint yoffset, GLint zoffset,
-                             GLint x, GLint y,
-                             GLsizei width, GLsizei height,
-                             const char *caller);
-
 bool
 _mesa_is_cube_map_texture(GLenum target);
 
@@ -281,14 +260,42 @@
                      const GLvoid *pixels );
 
 extern void GLAPIENTRY
+_mesa_TexImage1D_no_error(GLenum target, GLint level, GLint internalformat,
+                          GLsizei width, GLint border,
+                          GLenum format, GLenum type, const GLvoid *pixels);
+
+extern void GLAPIENTRY
+_mesa_TexImage2D_no_error(GLenum target, GLint level, GLint internalformat,
+                          GLsizei width, GLsizei height, GLint border,
+                          GLenum format, GLenum type, const GLvoid *pixels);
+
+extern void GLAPIENTRY
+_mesa_TexImage3D_no_error(GLenum target, GLint level, GLint internalformat,
+                          GLsizei width, GLsizei height, GLsizei depth,
+                          GLint border, GLenum format, GLenum type,
+                          const GLvoid *pixels);
+
+extern void GLAPIENTRY
 _mesa_EGLImageTargetTexture2DOES( GLenum target, GLeglImageOES image );
 
+void GLAPIENTRY
+_mesa_TexSubImage1D_no_error(GLenum target, GLint level, GLint xoffset,
+                             GLsizei width,
+                             GLenum format, GLenum type,
+                             const GLvoid *pixels);
+
 extern void GLAPIENTRY
 _mesa_TexSubImage1D( GLenum target, GLint level, GLint xoffset,
                      GLsizei width,
                      GLenum format, GLenum type,
                      const GLvoid *pixels );
 
+void GLAPIENTRY
+_mesa_TexSubImage2D_no_error(GLenum target, GLint level,
+                             GLint xoffset, GLint yoffset,
+                             GLsizei width, GLsizei height,
+                             GLenum format, GLenum type,
+                             const GLvoid *pixels);
 
 extern void GLAPIENTRY
 _mesa_TexSubImage2D( GLenum target, GLint level,
@@ -297,6 +304,12 @@
                      GLenum format, GLenum type,
                      const GLvoid *pixels );
 
+void GLAPIENTRY
+_mesa_TexSubImage3D_no_error(GLenum target, GLint level,
+                             GLint xoffset, GLint yoffset, GLint zoffset,
+                             GLsizei width, GLsizei height, GLsizei depth,
+                             GLenum format, GLenum type,
+                             const GLvoid *pixels);
 
 extern void GLAPIENTRY
 _mesa_TexSubImage3D( GLenum target, GLint level,
@@ -339,6 +352,17 @@
 
 
 extern void GLAPIENTRY
+_mesa_CopyTexImage1D_no_error(GLenum target, GLint level, GLenum internalformat,
+                              GLint x, GLint y, GLsizei width, GLint border);
+
+
+extern void GLAPIENTRY
+_mesa_CopyTexImage2D_no_error(GLenum target, GLint level, GLenum internalformat,
+                              GLint x, GLint y, GLsizei width, GLsizei height,
+                              GLint border );
+
+
+extern void GLAPIENTRY
 _mesa_CopyTexSubImage1D( GLenum target, GLint level, GLint xoffset,
                          GLint x, GLint y, GLsizei width );
 
@@ -371,6 +395,34 @@
                             GLsizei width, GLsizei height);
 
 extern void GLAPIENTRY
+_mesa_CopyTexSubImage1D_no_error(GLenum target, GLint level, GLint xoffset,
+                                 GLint x, GLint y, GLsizei width );
+
+extern void GLAPIENTRY
+_mesa_CopyTexSubImage2D_no_error(GLenum target, GLint level, GLint xoffset,
+                                 GLint yoffset, GLint x, GLint y, GLsizei width,
+                                 GLsizei height);
+
+extern void GLAPIENTRY
+_mesa_CopyTexSubImage3D_no_error(GLenum target, GLint level, GLint xoffset,
+                                 GLint yoffset, GLint zoffset, GLint x, GLint y,
+                                 GLsizei width, GLsizei height);
+
+extern void GLAPIENTRY
+_mesa_CopyTextureSubImage1D_no_error(GLuint texture, GLint level, GLint xoffset,
+                                     GLint x, GLint y, GLsizei width);
+
+extern void GLAPIENTRY
+_mesa_CopyTextureSubImage2D_no_error(GLuint texture, GLint level, GLint xoffset,
+                                     GLint yoffset, GLint x, GLint y,
+                                     GLsizei width, GLsizei height);
+
+extern void GLAPIENTRY
+_mesa_CopyTextureSubImage3D_no_error(GLuint texture, GLint level, GLint xoffset,
+                                     GLint yoffset, GLint zoffset, GLint x,
+                                     GLint y, GLsizei width, GLsizei height);
+
+extern void GLAPIENTRY
 _mesa_ClearTexSubImage( GLuint texture, GLint level,
                         GLint xoffset, GLint yoffset, GLint zoffset,
                         GLsizei width, GLsizei height, GLsizei depth,
@@ -399,22 +451,63 @@
                               GLsizei imageSize, const GLvoid *data);
 
 extern void GLAPIENTRY
+_mesa_CompressedTexImage1D_no_error(GLenum target, GLint level,
+                                    GLenum internalformat, GLsizei width,
+                                    GLint border, GLsizei imageSize,
+                                    const GLvoid *data);
+
+extern void GLAPIENTRY
+_mesa_CompressedTexImage2D_no_error(GLenum target, GLint level,
+                                    GLenum internalformat, GLsizei width,
+                                    GLsizei height, GLint border,
+                                    GLsizei imageSize, const GLvoid *data);
+
+extern void GLAPIENTRY
+_mesa_CompressedTexImage3D_no_error(GLenum target, GLint level,
+                                    GLenum internalformat, GLsizei width,
+                                    GLsizei height, GLsizei depth, GLint border,
+                                    GLsizei imageSize, const GLvoid *data);
+
+
+extern void GLAPIENTRY
+_mesa_CompressedTexSubImage1D_no_error(GLenum target, GLint level,
+                                       GLint xoffset, GLsizei width,
+                                       GLenum format, GLsizei imageSize,
+                                       const GLvoid *data);
+extern void GLAPIENTRY
 _mesa_CompressedTexSubImage1D(GLenum target, GLint level, GLint xoffset,
                                  GLsizei width, GLenum format,
                                  GLsizei imageSize, const GLvoid *data);
 
 extern void GLAPIENTRY
+_mesa_CompressedTextureSubImage1D_no_error(GLuint texture, GLint level,
+                                           GLint xoffset, GLsizei width,
+                                           GLenum format, GLsizei imageSize,
+                                           const GLvoid *data);
+extern void GLAPIENTRY
 _mesa_CompressedTextureSubImage1D(GLuint texture, GLint level, GLint xoffset,
                                   GLsizei width, GLenum format,
                                   GLsizei imageSize, const GLvoid *data);
 
 extern void GLAPIENTRY
+_mesa_CompressedTexSubImage2D_no_error(GLenum target, GLint level,
+                                       GLint xoffset, GLint yoffset,
+                                       GLsizei width, GLsizei height,
+                                       GLenum format, GLsizei imageSize,
+                                       const GLvoid *data);
+extern void GLAPIENTRY
 _mesa_CompressedTexSubImage2D(GLenum target, GLint level, GLint xoffset,
                                  GLint yoffset, GLsizei width, GLsizei height,
                                  GLenum format, GLsizei imageSize,
                                  const GLvoid *data);
 
 extern void GLAPIENTRY
+_mesa_CompressedTextureSubImage2D_no_error(GLuint texture, GLint level,
+                                           GLint xoffset, GLint yoffset,
+                                           GLsizei width, GLsizei height,
+                                           GLenum format, GLsizei imageSize,
+                                           const GLvoid *data);
+extern void GLAPIENTRY
 _mesa_CompressedTextureSubImage2D(GLuint texture, GLint level, GLint xoffset,
                                   GLint yoffset,
                                   GLsizei width, GLsizei height,
@@ -422,12 +515,26 @@
                                   const GLvoid *data);
 
 extern void GLAPIENTRY
+_mesa_CompressedTexSubImage3D_no_error(GLenum target, GLint level,
+                                       GLint xoffset, GLint yoffset,
+                                       GLint zoffset, GLsizei width,
+                                       GLsizei height, GLsizei depth,
+                                       GLenum format, GLsizei imageSize,
+                                       const GLvoid *data);
+extern void GLAPIENTRY
 _mesa_CompressedTexSubImage3D(GLenum target, GLint level, GLint xoffset,
                                  GLint yoffset, GLint zoffset, GLsizei width,
                                  GLsizei height, GLsizei depth, GLenum format,
                                  GLsizei imageSize, const GLvoid *data);
 
 extern void GLAPIENTRY
+_mesa_CompressedTextureSubImage3D_no_error(GLuint texture, GLint level,
+                                           GLint xoffset, GLint yoffset,
+                                           GLint zoffset, GLsizei width,
+                                           GLsizei height, GLsizei depth,
+                                           GLenum format, GLsizei imageSize,
+                                           const GLvoid *data);
+extern void GLAPIENTRY
 _mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset,
                                   GLint yoffset, GLint zoffset,
                                   GLsizei width, GLsizei height,
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index ad644ca..e52ad22 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -43,6 +43,7 @@
 #include "texstate.h"
 #include "mtypes.h"
 #include "program/prog_instruction.h"
+#include "texturebindless.h"
 
 
 
@@ -233,11 +234,14 @@
  * \return pointer to new texture object.
  */
 struct gl_texture_object *
-_mesa_new_texture_object( struct gl_context *ctx, GLuint name, GLenum target )
+_mesa_new_texture_object(struct gl_context *ctx, GLuint name, GLenum target)
 {
    struct gl_texture_object *obj;
-   (void) ctx;
+
    obj = MALLOC_STRUCT(gl_texture_object);
+   if (!obj)
+      return NULL;
+
    _mesa_initialize_texture_object(ctx, obj, name, target);
    return obj;
 }
@@ -311,6 +315,7 @@
    obj->DepthMode = ctx->API == API_OPENGL_CORE ? GL_RED : GL_LUMINANCE;
    obj->StencilSampling = false;
    obj->Sampler.CubeMapSeamless = GL_FALSE;
+   obj->Sampler.HandleAllocated = GL_FALSE;
    obj->Swizzle[0] = GL_RED;
    obj->Swizzle[1] = GL_GREEN;
    obj->Swizzle[2] = GL_BLUE;
@@ -320,6 +325,9 @@
    obj->BufferObjectFormat = GL_R8;
    obj->_BufferObjectFormat = MESA_FORMAT_R_UNORM8;
    obj->ImageFormatCompatibilityType = GL_IMAGE_FORMAT_COMPATIBILITY_BY_SIZE;
+
+   /* GL_ARB_bindless_texture */
+   _mesa_init_texture_handles(obj);
 }
 
 
@@ -329,13 +337,13 @@
  */
 static void
 finish_texture_init(struct gl_context *ctx, GLenum target,
-                    struct gl_texture_object *obj)
+                    struct gl_texture_object *obj, int targetIndex)
 {
    GLenum filter = GL_LINEAR;
    assert(obj->Target == 0);
 
    obj->Target = target;
-   obj->TargetIndex = _mesa_tex_target_to_index(ctx, target);
+   obj->TargetIndex = targetIndex;
    assert(obj->TargetIndex < NUM_TEXTURE_TARGETS);
 
    switch (target) {
@@ -397,6 +405,9 @@
       }
    }
 
+   /* Delete all texture/image handles. */
+   _mesa_delete_texture_handles(ctx, texObj);
+
    _mesa_reference_buffer_object(ctx, &texObj->BufferObject, NULL);
 
    /* destroy the mutex -- it may have allocated memory (eg on bsd) */
@@ -566,16 +577,10 @@
       /* reference new texture */
       assert(valid_texture_object(tex));
       mtx_lock(&tex->Mutex);
-      if (tex->RefCount == 0) {
-         /* this texture's being deleted (look just above) */
-         /* Not sure this can every really happen.  Warn if it does. */
-         _mesa_problem(NULL, "referencing deleted texture object");
-         *ptr = NULL;
-      }
-      else {
-         tex->RefCount++;
-         *ptr = tex;
-      }
+      assert(tex->RefCount > 0);
+
+      tex->RefCount++;
+      *ptr = tex;
       mtx_unlock(&tex->Mutex);
    }
 }
@@ -1190,14 +1195,6 @@
    GLuint first;
    GLint i;
 
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "%s %d\n", caller, n);
-
-   if (n < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n < 0)", caller);
-      return;
-   }
-
    if (!textures)
       return;
 
@@ -1215,7 +1212,7 @@
       texObj = ctx->Driver.NewTextureObject(ctx, name, target);
       if (!texObj) {
          _mesa_HashUnlockMutex(ctx->Shared->TexObjects);
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sTextures", caller);
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s", caller);
          return;
       }
 
@@ -1228,6 +1225,22 @@
    _mesa_HashUnlockMutex(ctx->Shared->TexObjects);
 }
 
+
+static void
+create_textures_err(struct gl_context *ctx, GLenum target,
+                    GLsizei n, GLuint *textures, const char *caller)
+{
+   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
+      _mesa_debug(ctx, "%s %d\n", caller, n);
+
+   if (n < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n < 0)", caller);
+      return;
+   }
+
+   create_textures(ctx, target, n, textures, caller);
+}
+
 /*@}*/
 
 
@@ -1249,12 +1262,20 @@
  * objects are also generated.
  */
 void GLAPIENTRY
-_mesa_GenTextures(GLsizei n, GLuint *textures)
+_mesa_GenTextures_no_error(GLsizei n, GLuint *textures)
 {
    GET_CURRENT_CONTEXT(ctx);
    create_textures(ctx, 0, n, textures, "glGenTextures");
 }
 
+
+void GLAPIENTRY
+_mesa_GenTextures(GLsizei n, GLuint *textures)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_textures_err(ctx, 0, n, textures, "glGenTextures");
+}
+
 /**
  * Create texture objects.
  *
@@ -1269,6 +1290,14 @@
  * objects are also generated.
  */
 void GLAPIENTRY
+_mesa_CreateTextures_no_error(GLenum target, GLsizei n, GLuint *textures)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   create_textures(ctx, target, n, textures, "glCreateTextures");
+}
+
+
+void GLAPIENTRY
 _mesa_CreateTextures(GLenum target, GLsizei n, GLuint *textures)
 {
    GLint targetIndex;
@@ -1285,7 +1314,7 @@
       return;
    }
 
-   create_textures(ctx, target, n, textures, "glCreateTextures");
+   create_textures_err(ctx, target, n, textures, "glCreateTextures");
 }
 
 /**
@@ -1434,11 +1463,6 @@
 
    FLUSH_VERTICES(ctx, 0); /* too complex */
 
-   if (n < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glDeleteTextures(n)");
-      return;
-   }
-
    if (!textures)
       return;
 
@@ -1467,6 +1491,11 @@
              */
             unbind_texobj_from_image_units(ctx, delObj);
 
+            /* Make all handles that reference this texture object non-resident
+             * in the current context.
+             */
+            _mesa_make_texture_handles_non_resident(ctx, delObj);
+
             _mesa_unlock_texture(ctx, delObj);
 
             ctx->NewState |= _NEW_TEXTURE_OBJECT;
@@ -1485,47 +1514,6 @@
    }
 }
 
-/**
- * This deletes a texObj without altering the hash table.
- */
-void
-_mesa_delete_nameless_texture(struct gl_context *ctx,
-                              struct gl_texture_object *texObj)
-{
-   if (!texObj)
-      return;
-
-   FLUSH_VERTICES(ctx, 0);
-
-   _mesa_lock_texture(ctx, texObj);
-   {
-      /* Check if texture is bound to any framebuffer objects.
-       * If so, unbind.
-       * See section 4.4.2.3 of GL_EXT_framebuffer_object.
-       */
-      unbind_texobj_from_fbo(ctx, texObj);
-
-      /* Check if this texture is currently bound to any texture units.
-       * If so, unbind it.
-       */
-      unbind_texobj_from_texunits(ctx, texObj);
-
-      /* Check if this texture is currently bound to any shader
-       * image unit.  If so, unbind it.
-       * See section 3.9.X of GL_ARB_shader_image_load_store.
-       */
-      unbind_texobj_from_image_units(ctx, texObj);
-   }
-   _mesa_unlock_texture(ctx, texObj);
-
-   ctx->NewState |= _NEW_TEXTURE_OBJECT;
-
-   /* Unreference the texobj.  If refcount hits zero, the texture
-    * will be deleted.
-    */
-   _mesa_reference_texobj(&texObj, NULL);
-}
-
 
 /**
  * Convert a GL texture target enum such as GL_TEXTURE_2D or GL_TEXTURE_3D
@@ -1655,15 +1643,15 @@
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_texture_object *newTexObj = NULL;
-   GLint targetIndex;
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glBindTexture %s %d\n",
                   _mesa_enum_to_string(target), (GLint) texName);
 
-   targetIndex = _mesa_tex_target_to_index(ctx, target);
+   int targetIndex = _mesa_tex_target_to_index(ctx, target);
    if (targetIndex < 0) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glBindTexture(target)");
+      _mesa_error(ctx, GL_INVALID_ENUM, "glBindTexture(target = %s)",
+                  _mesa_enum_to_string(target));
       return;
    }
    assert(targetIndex < NUM_TEXTURE_TARGETS);
@@ -1689,7 +1677,7 @@
             return;
          }
          if (newTexObj->Target == 0) {
-            finish_texture_init(ctx, target, newTexObj);
+            finish_texture_init(ctx, target, newTexObj, targetIndex);
          }
       }
       else {
@@ -1731,21 +1719,12 @@
  * If the named texture is not 0 or a recognized texture name, this throws
  * GL_INVALID_OPERATION.
  */
-void GLAPIENTRY
-_mesa_BindTextureUnit(GLuint unit, GLuint texture)
+static ALWAYS_INLINE void
+bind_texture_unit(struct gl_context *ctx, GLuint unit, GLuint texture,
+                  bool no_error)
 {
-   GET_CURRENT_CONTEXT(ctx);
    struct gl_texture_object *texObj;
 
-   if (unit >= _mesa_max_tex_unit(ctx)) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glBindTextureUnit(unit=%u)", unit);
-      return;
-   }
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glBindTextureUnit %s %d\n",
-                  _mesa_enum_to_string(GL_TEXTURE0+unit), (GLint) texture);
-
    /* Section 8.1 (Texture Objects) of the OpenGL 4.5 core profile spec
     * (20141030) says:
     *    "When texture is zero, each of the targets enumerated at the
@@ -1759,24 +1738,53 @@
 
    /* Get the non-default texture object */
    texObj = _mesa_lookup_texture(ctx, texture);
+   if (!no_error) {
+      /* Error checking */
+      if (!texObj) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glBindTextureUnit(non-gen name)");
+         return;
+      }
 
-   /* Error checking */
-   if (!texObj) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glBindTextureUnit(non-gen name)");
-      return;
+      if (texObj->Target == 0) {
+         /* Texture object was gen'd but never bound so the target is not set */
+         _mesa_error(ctx, GL_INVALID_OPERATION, "glBindTextureUnit(target)");
+         return;
+      }
    }
-   if (texObj->Target == 0) {
-      /* Texture object was gen'd but never bound so the target is not set */
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glBindTextureUnit(target)");
-      return;
-   }
+
    assert(valid_texture_object(texObj));
 
    bind_texture(ctx, unit, texObj);
 }
 
 
+void GLAPIENTRY
+_mesa_BindTextureUnit_no_error(GLuint unit, GLuint texture)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   bind_texture_unit(ctx, unit, texture, true);
+}
+
+
+void GLAPIENTRY
+_mesa_BindTextureUnit(GLuint unit, GLuint texture)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (unit >= _mesa_max_tex_unit(ctx)) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glBindTextureUnit(unit=%u)", unit);
+      return;
+   }
+
+   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
+      _mesa_debug(ctx, "glBindTextureUnit %s %d\n",
+                  _mesa_enum_to_string(GL_TEXTURE0+unit), (GLint) texture);
+
+   bind_texture_unit(ctx, unit, texture, false);
+}
+
+
 /**
  * OpenGL 4.4 / GL_ARB_multi_bind glBindTextures().
  */
diff --git a/src/mesa/main/texobj.h b/src/mesa/main/texobj.h
index a9db167..f106238 100644
--- a/src/mesa/main/texobj.h
+++ b/src/mesa/main/texobj.h
@@ -170,11 +170,6 @@
 extern void
 _mesa_lock_context_textures( struct gl_context *ctx );
 
-extern void
-_mesa_delete_nameless_texture(struct gl_context *ctx,
-                              struct gl_texture_object *texObj);
-
-
 /*@}*/
 
 /**
@@ -182,9 +177,15 @@
  */
 /*@{*/
 
+void GLAPIENTRY
+_mesa_GenTextures_no_error(GLsizei n, GLuint *textures);
+
 extern void GLAPIENTRY
 _mesa_GenTextures(GLsizei n, GLuint *textures);
 
+void GLAPIENTRY
+_mesa_CreateTextures_no_error(GLenum target, GLsizei n, GLuint *textures);
+
 extern void GLAPIENTRY
 _mesa_CreateTextures(GLenum target, GLsizei n, GLuint *textures);
 
@@ -195,6 +196,9 @@
 extern void GLAPIENTRY
 _mesa_BindTexture( GLenum target, GLuint texture );
 
+void GLAPIENTRY
+_mesa_BindTextureUnit_no_error(GLuint unit, GLuint texture);
+
 extern void GLAPIENTRY
 _mesa_BindTextureUnit(GLuint unit, GLuint texture);
 
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 25165e4..d8bbabf 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -153,18 +153,13 @@
  * Only the glGetTexLevelParameter() functions accept proxy targets.
  */
 static struct gl_texture_object *
-get_texobj_by_name(struct gl_context *ctx, GLuint texture, GLboolean get)
+get_texobj_by_name(struct gl_context *ctx, GLuint texture, const char *name)
 {
    struct gl_texture_object *texObj;
 
-   texObj = _mesa_lookup_texture(ctx, texture);
-   if (!texObj) {
-      /*
-       * User passed a non-generated name.
-       * Throw the error in the caller.
-       */
+   texObj = _mesa_lookup_texture_err(ctx, texture, name);
+   if (!texObj)
       return NULL;
-   }
 
    switch (texObj->Target) {
    case GL_TEXTURE_1D:
@@ -179,8 +174,7 @@
    case GL_TEXTURE_RECTANGLE:
       return texObj;
    default:
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "gl%sTextureParameter(target)", get ? "Get" : "");
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(target)", name);
       return NULL;
    }
 
@@ -277,6 +271,19 @@
 {
    const char *suffix = dsa ? "ture" : "";
 
+   if (texObj->HandleAllocated) {
+      /* The ARB_bindless_texture spec says:
+       *
+       * "The error INVALID_OPERATION is generated by TexImage*, CopyTexImage*,
+       * CompressedTexImage*, TexBuffer*, TexParameter*, as well as other
+       * functions defined in terms of these, if the texture object to be
+       * modified is referenced by one or more texture or image handles."
+       */
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTex%sParameter(immutable texture)", suffix);
+      return GL_FALSE;
+   }
+
    switch (pname) {
    case GL_TEXTURE_MIN_FILTER:
       if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target))
@@ -369,8 +376,26 @@
       if (texObj->BaseLevel == params[0])
          return GL_FALSE;
 
+      /* Section 8.10 (Texture Parameters) of the OpenGL 4.5 Core Profile spec
+       * says:
+       *
+       *    An INVALID_OPERATION error is generated if the effective target is
+       *    TEXTURE_2D_MULTISAMPLE, TEXTURE_2D_MULTISAMPLE_ARRAY, or
+       *    TEXTURE_RECTANGLE, and pname TEXTURE_BASE_LEVEL is set to a value
+       *    other than zero.
+       *
+       * Note that section 3.8.8 (Texture Parameters) of the OpenGL 3.3 Core
+       * Profile spec said:
+       *
+       *    The error INVALID_VALUE is generated if TEXTURE_BASE_LEVEL is set
+       *    to any value other than zero.
+       *
+       * We take the 4.5 language as a correction to 3.3, and we implement
+       * that on all GL versions.
+       */
       if ((texObj->Target == GL_TEXTURE_2D_MULTISAMPLE ||
-           texObj->Target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY) && params[0] != 0)
+           texObj->Target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY ||
+           texObj->Target == GL_TEXTURE_RECTANGLE) && params[0] != 0)
          goto invalid_operation;
 
       if (params[0] < 0) {
@@ -378,12 +403,6 @@
                      "glTex%sParameter(param=%d)", suffix, params[0]);
          return GL_FALSE;
       }
-      if (texObj->Target == GL_TEXTURE_RECTANGLE_ARB && params[0] != 0) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glTex%sParameter(target=%s, param=%d)", suffix,
-                     _mesa_enum_to_string(texObj->Target), params[0]);
-         return GL_FALSE;
-      }
       incomplete(ctx, texObj);
 
       /** See note about ARB_texture_storage below */
@@ -640,6 +659,19 @@
 {
    const char *suffix = dsa ? "ture" : "";
 
+   if (texObj->HandleAllocated) {
+      /* The ARB_bindless_texture spec says:
+       *
+       * "The error INVALID_OPERATION is generated by TexImage*, CopyTexImage*,
+       * CompressedTexImage*, TexBuffer*, TexParameter*, as well as other
+       * functions defined in terms of these, if the texture object to be
+       * modified is referenced by one or more texture or image handles."
+       */
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTex%sParameter(immutable texture)", suffix);
+      return GL_FALSE;
+   }
+
    switch (pname) {
    case GL_TEXTURE_MIN_LOD:
       if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
@@ -716,8 +748,16 @@
       break;
 
    case GL_TEXTURE_BORDER_COLOR:
+      /* Border color exists in desktop OpenGL since 1.0 for GL_CLAMP.  In
+       * OpenGL ES 2.0+, it only exists when GL_OES_texture_border_clamp is
+       * enabled.  It is never available in OpenGL ES 1.x.
+       *
+       * FIXME: Every driver that supports GLES2 has this extension.  Elide
+       * the check?
+       */
       if (ctx->API == API_OPENGLES ||
-          !ctx->Extensions.ARB_texture_border_clamp)
+          (ctx->API == API_OPENGLES2 &&
+           !ctx->Extensions.ARB_texture_border_clamp))
          goto invalid_pname;
 
       if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target))
@@ -934,10 +974,6 @@
    switch (pname) {
    case GL_TEXTURE_BORDER_COLOR:
       {
-         if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "glTextureParameteriv(texture)");
-            return;
-         }
          /* convert int params to float */
          GLfloat fparams[4];
          fparams[0] = INT_TO_FLOAT(params[0]);
@@ -978,6 +1014,12 @@
 {
    switch (pname) {
    case GL_TEXTURE_BORDER_COLOR:
+      if (texObj->HandleAllocated) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glTextureParameterIiv(immutable texture)");
+         return;
+      }
+
       if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) {
          _mesa_error(ctx, GL_INVALID_ENUM, "glTextureParameterIiv(texture)");
          return;
@@ -1000,6 +1042,12 @@
 {
    switch (pname) {
    case GL_TEXTURE_BORDER_COLOR:
+      if (texObj->HandleAllocated) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glTextureParameterIuiv(immutable texture)");
+         return;
+      }
+
       if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) {
          _mesa_error(ctx, GL_INVALID_ENUM, "glTextureParameterIuiv(texture)");
          return;
@@ -1111,12 +1159,9 @@
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   texObj = get_texobj_by_name(ctx, texture, GL_FALSE);
-   if (!texObj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureParameterfv(texture)");
+   texObj = get_texobj_by_name(ctx, texture, "glTextureParameterfv");
+   if (!texObj)
       return;
-   }
 
    _mesa_texture_parameterfv(ctx, texObj, pname, params, true);
 }
@@ -1127,12 +1172,9 @@
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   texObj = get_texobj_by_name(ctx, texture, GL_FALSE);
-   if (!texObj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureParameterf(texture)");
+   texObj = get_texobj_by_name(ctx, texture, "glTextureParameterf");
+   if (!texObj)
       return;
-   }
 
    _mesa_texture_parameterf(ctx, texObj, pname, param, true);
 }
@@ -1143,12 +1185,9 @@
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   texObj = get_texobj_by_name(ctx, texture, GL_FALSE);
-   if (!texObj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureParameteri(texture)");
+   texObj = get_texobj_by_name(ctx, texture, "glTextureParameteri");
+   if (!texObj)
       return;
-   }
 
    _mesa_texture_parameteri(ctx, texObj, pname, param, true);
 }
@@ -1160,12 +1199,9 @@
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   texObj = get_texobj_by_name(ctx, texture, GL_FALSE);
-   if (!texObj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureParameteriv(texture)");
+   texObj = get_texobj_by_name(ctx, texture, "glTextureParameteriv");
+   if (!texObj)
       return;
-   }
 
    _mesa_texture_parameteriv(ctx, texObj, pname, params, true);
 }
@@ -1177,13 +1213,9 @@
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   texObj = get_texobj_by_name(ctx, texture, GL_FALSE);
-   if (!texObj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glTextureParameterIiv(texture)");
+   texObj = get_texobj_by_name(ctx, texture, "glTextureParameterIiv");
+   if (!texObj)
       return;
-   }
 
    _mesa_texture_parameterIiv(ctx, texObj, pname, params, true);
 }
@@ -1194,13 +1226,9 @@
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   texObj = get_texobj_by_name(ctx, texture, GL_FALSE);
-   if (!texObj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glTextureParameterIuiv(texture)");
+   texObj = get_texobj_by_name(ctx, texture, "glTextureParameterIuiv");
+   if (!texObj)
       return;
-   }
 
    _mesa_texture_parameterIuiv(ctx, texObj, pname, params, true);
 }
@@ -1405,16 +1433,15 @@
 
       /* GL_ARB_texture_compression */
       case GL_TEXTURE_COMPRESSED_IMAGE_SIZE:
-	 if (_mesa_is_format_compressed(texFormat) &&
+         if (_mesa_is_format_compressed(texFormat) &&
              !_mesa_is_proxy_texture(target)) {
             *params = _mesa_format_image_size(texFormat, img->Width,
                                               img->Height, img->Depth);
-    }
-    else {
-       _mesa_error(ctx, GL_INVALID_OPERATION,
-                   "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
-                   _mesa_enum_to_string(pname));
-    }
+         } else {
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
+                        _mesa_enum_to_string(pname));
+         }
          break;
       case GL_TEXTURE_COMPRESSED:
          *params = (GLint) _mesa_is_format_compressed(texFormat);
@@ -2338,13 +2365,9 @@
    struct gl_texture_object *obj;
    GET_CURRENT_CONTEXT(ctx);
 
-   obj = get_texobj_by_name(ctx, texture, GL_TRUE);
-   if (!obj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTextureParameterfv(texture)");
+   obj = get_texobj_by_name(ctx, texture, "glGetTextureParameterfv");
+   if (!obj)
       return;
-   }
 
    get_tex_parameterfv(ctx, obj, pname, params, true);
 }
@@ -2355,13 +2378,9 @@
    struct gl_texture_object *obj;
    GET_CURRENT_CONTEXT(ctx);
 
-   obj = get_texobj_by_name(ctx, texture, GL_TRUE);
-   if (!obj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTextureParameteriv(texture)");
+   obj = get_texobj_by_name(ctx, texture, "glGetTextureParameteriv");
+   if (!obj)
       return;
-   }
 
    get_tex_parameteriv(ctx, obj, pname, params, true);
 }
@@ -2372,13 +2391,9 @@
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   texObj = get_texobj_by_name(ctx, texture, GL_TRUE);
-   if (!texObj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTextureParameterIiv(texture)");
+   texObj = get_texobj_by_name(ctx, texture, "glGetTextureParameterIiv");
+   if (!texObj)
       return;
-   }
 
    get_tex_parameterIiv(ctx, texObj, pname, params, true);
 }
@@ -2390,13 +2405,9 @@
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   texObj = get_texobj_by_name(ctx, texture, GL_TRUE);
-   if (!texObj) {
-      /* User passed a non-generated name. */
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTextureParameterIuiv(texture)");
+   texObj = get_texobj_by_name(ctx, texture, "glGetTextureParameterIuiv");
+   if (!texObj)
       return;
-   }
 
    get_tex_parameterIuiv(ctx, texObj, pname, params, true);
 }
diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c
index 830b230..269e291 100644
--- a/src/mesa/main/texstate.c
+++ b/src/mesa/main/texstate.c
@@ -38,6 +38,7 @@
 #include "teximage.h"
 #include "texstate.h"
 #include "mtypes.h"
+#include "state.h"
 #include "util/bitscan.h"
 #include "util/bitset.h"
 
@@ -279,14 +280,12 @@
 }
 
 
-
-
 /* GL_ARB_multitexture */
-void GLAPIENTRY
-_mesa_ActiveTexture(GLenum texture)
+static ALWAYS_INLINE void
+active_texture(GLenum texture, bool no_error)
 {
    const GLuint texUnit = texture - GL_TEXTURE0;
-   GLuint k;
+
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
@@ -296,18 +295,18 @@
    if (ctx->Texture.CurrentUnit == texUnit)
       return;
 
-   k = _mesa_max_tex_unit(ctx);
+   if (!no_error) {
+      GLuint k = _mesa_max_tex_unit(ctx);
 
-   assert(k <= ARRAY_SIZE(ctx->Texture.Unit));
+      assert(k <= ARRAY_SIZE(ctx->Texture.Unit));
 
-   if (texUnit >= k) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glActiveTexture(texture=%s)",
-                  _mesa_enum_to_string(texture));
-      return;
+      if (texUnit >= k) {
+         _mesa_error(ctx, GL_INVALID_ENUM, "glActiveTexture(texture=%s)",
+                     _mesa_enum_to_string(texture));
+         return;
+      }
    }
 
-   FLUSH_VERTICES(ctx, _NEW_TEXTURE_STATE);
-
    ctx->Texture.CurrentUnit = texUnit;
    if (ctx->Transform.MatrixMode == GL_TEXTURE) {
       /* update current stack pointer */
@@ -316,6 +315,20 @@
 }
 
 
+void GLAPIENTRY
+_mesa_ActiveTexture_no_error(GLenum texture)
+{
+   active_texture(texture, true); /* true = no_error: no unit range check */
+}
+
+
+void GLAPIENTRY
+_mesa_ActiveTexture(GLenum texture)
+{
+   active_texture(texture, false); /* false: validate the unit first */
+}
+
+
 /* GL_ARB_multitexture */
 void GLAPIENTRY
 _mesa_ClientActiveTexture(GLenum texture)
@@ -336,7 +349,7 @@
       return;
    }
 
-   FLUSH_VERTICES(ctx, _NEW_ARRAY);
+   /* Don't flush vertices. This is a "latched" state. */
    ctx->Array.ActiveTexture = texUnit;
 }
 
@@ -612,18 +625,13 @@
 
 static struct gl_texture_object *
 update_single_program_texture(struct gl_context *ctx, struct gl_program *prog,
-                              int s)
+                              int unit)
 {
    gl_texture_index target_index;
    struct gl_texture_unit *texUnit;
    struct gl_texture_object *texObj;
    struct gl_sampler_object *sampler;
-   int unit;
 
-   if (!(prog->SamplersUsed & (1 << s)))
-      return NULL;
-
-   unit = prog->SamplerUnits[s];
    texUnit = &ctx->Texture.Unit[unit];
 
    /* Note: If more than one bit was set in TexturesUsed[unit], then we should
@@ -669,6 +677,24 @@
    return texObj;
 }
 
+static inline void
+update_single_program_texture_state(struct gl_context *ctx,
+                                    struct gl_program *prog,
+                                    int unit,
+                                    BITSET_WORD *enabled_texture_units)
+{
+   struct gl_texture_object *texObj;
+
+   texObj = update_single_program_texture(ctx, prog, unit);
+   if (!texObj)
+      return; /* nothing resolved for this unit: leave all state untouched */
+
+   _mesa_reference_texobj(&ctx->Texture.Unit[unit]._Current, texObj);
+   BITSET_SET(enabled_texture_units, unit); /* mark the unit as enabled */
+   ctx->Texture._MaxEnabledTexImageUnit =
+      MAX2(ctx->Texture._MaxEnabledTexImageUnit, (int)unit); /* running max */
+}
+
 static void
 update_program_texture_state(struct gl_context *ctx, struct gl_program **prog,
                              BITSET_WORD *enabled_texture_units)
@@ -676,25 +702,34 @@
    int i;
 
    for (i = 0; i < MESA_SHADER_STAGES; i++) {
-      int s;
+      GLbitfield mask;
+      GLuint s;
 
       if (!prog[i])
          continue;
 
-      /* We can't only do the shifting trick as the loop condition because if
-       * sampler 31 is active, the next iteration tries to shift by 32, which is
-       * undefined.
-       */
-      for (s = 0; s < MAX_SAMPLERS && (1 << s) <= prog[i]->SamplersUsed; s++) {
-         struct gl_texture_object *texObj;
+      mask = prog[i]->SamplersUsed;
 
-         texObj = update_single_program_texture(ctx, prog[i], s);
-         if (texObj) {
-            int unit = prog[i]->SamplerUnits[s];
-            _mesa_reference_texobj(&ctx->Texture.Unit[unit]._Current, texObj);
-            BITSET_SET(enabled_texture_units, unit);
-            ctx->Texture._MaxEnabledTexImageUnit =
-               MAX2(ctx->Texture._MaxEnabledTexImageUnit, (int)unit);
+      while (mask) {
+         s = u_bit_scan(&mask);
+
+         update_single_program_texture_state(ctx, prog[i],
+                                             prog[i]->SamplerUnits[s],
+                                             enabled_texture_units);
+      }
+
+      if (unlikely(prog[i]->sh.HasBoundBindlessSampler)) {
+         /* Loop over bindless samplers bound to texture units.
+          */
+         for (s = 0; s < prog[i]->sh.NumBindlessSamplers; s++) {
+            struct gl_bindless_sampler *sampler =
+               &prog[i]->sh.BindlessSamplers[s];
+
+            if (!sampler->bound)
+               continue;
+
+            update_single_program_texture_state(ctx, prog[i], sampler->unit,
+                                                enabled_texture_units);
          }
       }
    }
@@ -803,15 +838,10 @@
    int old_max_unit = ctx->Texture._MaxEnabledTexImageUnit;
    BITSET_DECLARE(enabled_texture_units, MAX_COMBINED_TEXTURE_IMAGE_UNITS);
 
-   for (i = 0; i < MESA_SHADER_STAGES; i++) {
-      if (ctx->_Shader->CurrentProgram[i]) {
-         prog[i] = ctx->_Shader->CurrentProgram[i];
-      } else {
-         prog[i] = NULL;
-      }
-   }
+   memcpy(prog, ctx->_Shader->CurrentProgram, sizeof(prog));
 
-   if (prog[MESA_SHADER_FRAGMENT] == NULL && ctx->FragmentProgram._Enabled) {
+   if (prog[MESA_SHADER_FRAGMENT] == NULL &&
+       _mesa_arb_fragment_program_enabled(ctx)) {
       prog[MESA_SHADER_FRAGMENT] = ctx->FragmentProgram.Current;
    }
 
diff --git a/src/mesa/main/texstate.h b/src/mesa/main/texstate.h
index cb329b0..7c74329 100644
--- a/src/mesa/main/texstate.h
+++ b/src/mesa/main/texstate.h
@@ -78,6 +78,9 @@
 /*@{*/
 
 extern void GLAPIENTRY
+_mesa_ActiveTexture_no_error( GLenum target );
+
+extern void GLAPIENTRY
 _mesa_ActiveTexture( GLenum target );
 
 extern void GLAPIENTRY
diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c
index 817a746..ef4fe58 100644
--- a/src/mesa/main/texstorage.c
+++ b/src/mesa/main/texstorage.c
@@ -386,12 +386,12 @@
  * Helper that does the storage allocation for _mesa_TexStorage1/2/3D()
  * and _mesa_TextureStorage1/2/3D().
  */
-void
-_mesa_texture_storage(struct gl_context *ctx, GLuint dims,
-                      struct gl_texture_object *texObj,
-                      GLenum target, GLsizei levels,
-                      GLenum internalformat, GLsizei width,
-                      GLsizei height, GLsizei depth, bool dsa)
+static void
+texture_storage(struct gl_context *ctx, GLuint dims,
+                struct gl_texture_object *texObj,
+                GLenum target, GLsizei levels,
+                GLenum internalformat, GLsizei width,
+                GLsizei height, GLsizei depth, bool dsa)
 {
    GLboolean sizeOK, dimensionsOK;
    mesa_format texFormat;
@@ -406,7 +406,6 @@
 
    texFormat = _mesa_choose_texture_format(ctx, texObj, target, 0,
                                            internalformat, GL_NONE, GL_NONE);
-   assert(texFormat != MESA_FORMAT_NONE);
 
    /* check that width, height, depth are legal for the mipmap level */
    dimensionsOK = _mesa_legal_texture_dimensions(ctx, target, 0,
@@ -475,24 +474,23 @@
  */
 static void
 texstorage(GLuint dims, GLenum target, GLsizei levels, GLenum internalformat,
-           GLsizei width, GLsizei height, GLsizei depth)
+           GLsizei width, GLsizei height, GLsizei depth, const char *caller)
 {
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
-   /* Check target.  This is done here so that _mesa_texture_storage
+   /* Check target.  This is done here so that texture_storage
     * can receive unsized formats.
     */
    if (!legal_texobj_target(ctx, dims, target)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glTexStorage%uD(illegal target=%s)",
-                  dims, _mesa_enum_to_string(target));
+                  "%s(illegal target=%s)",
+                  caller, _mesa_enum_to_string(target));
       return;
    }
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glTexStorage%uD %s %d %s %d %d %d\n",
-                  dims,
+      _mesa_debug(ctx, "%s %s %d %s %d %d %d\n", caller,
                   _mesa_enum_to_string(target), levels,
                   _mesa_enum_to_string(internalformat),
                   width, height, depth);
@@ -500,7 +498,7 @@
    /* Check the format to make sure it is sized. */
    if (!_mesa_is_legal_tex_storage_format(ctx, internalformat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glTexStorage%uD(internalformat = %s)", dims,
+                  "%s(internalformat = %s)", caller,
                   _mesa_enum_to_string(internalformat));
       return;
    }
@@ -509,8 +507,8 @@
    if (!texObj)
       return;
 
-   _mesa_texture_storage(ctx, dims, texObj, target, levels,
-                         internalformat, width, height, depth, false);
+   texture_storage(ctx, dims, texObj, target, levels,
+                   internalformat, width, height, depth, false);
 }
 
 
@@ -520,45 +518,41 @@
 static void
 texturestorage(GLuint dims, GLuint texture, GLsizei levels,
                GLenum internalformat, GLsizei width, GLsizei height,
-               GLsizei depth)
+               GLsizei depth, const char *caller)
 {
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glTextureStorage%uD %d %d %s %d %d %d\n",
-                  dims, texture, levels,
+      _mesa_debug(ctx, "%s %d %d %s %d %d %d\n",
+                  caller, texture, levels,
                   _mesa_enum_to_string(internalformat),
                   width, height, depth);
 
    /* Check the format to make sure it is sized. */
    if (!_mesa_is_legal_tex_storage_format(ctx, internalformat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glTextureStorage%uD(internalformat = %s)", dims,
+                  "%s(internalformat = %s)", caller,
                   _mesa_enum_to_string(internalformat));
       return;
    }
 
-   /* Get the texture object by Name. */
-   texObj = _mesa_lookup_texture(ctx, texture);
-   if (!texObj) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glTextureStorage%uD(texture = %d)", dims, texture);
+   texObj = _mesa_lookup_texture_err(ctx, texture, caller);
+   if (!texObj)
       return;
-   }
 
-   /* Check target.  This is done here so that _mesa_texture_storage
+   /* Check target.  This is done here so that texture_storage
     * can receive unsized formats.
     */
    if (!legal_texobj_target(ctx, dims, texObj->Target)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glTextureStorage%uD(illegal target=%s)",
-                  dims, _mesa_enum_to_string(texObj->Target));
+                  "%s(illegal target=%s)", caller,
+                  _mesa_enum_to_string(texObj->Target));
       return;
    }
 
-   _mesa_texture_storage(ctx, dims, texObj, texObj->Target,
-                         levels, internalformat, width, height, depth, true);
+   texture_storage(ctx, dims, texObj, texObj->Target,
+                   levels, internalformat, width, height, depth, true);
 }
 
 
@@ -566,7 +560,8 @@
 _mesa_TexStorage1D(GLenum target, GLsizei levels, GLenum internalformat,
                    GLsizei width)
 {
-   texstorage(1, target, levels, internalformat, width, 1, 1);
+   texstorage(1, target, levels, internalformat, width, 1, 1,
+              "glTexStorage1D");
 }
 
 
@@ -574,7 +569,8 @@
 _mesa_TexStorage2D(GLenum target, GLsizei levels, GLenum internalformat,
                    GLsizei width, GLsizei height)
 {
-   texstorage(2, target, levels, internalformat, width, height, 1);
+   texstorage(2, target, levels, internalformat, width, height, 1,
+              "glTexStorage2D");
 }
 
 
@@ -582,7 +578,8 @@
 _mesa_TexStorage3D(GLenum target, GLsizei levels, GLenum internalformat,
                    GLsizei width, GLsizei height, GLsizei depth)
 {
-   texstorage(3, target, levels, internalformat, width, height, depth);
+   texstorage(3, target, levels, internalformat, width, height, depth,
+              "glTexStorage3D");
 }
 
 
@@ -590,7 +587,8 @@
 _mesa_TextureStorage1D(GLuint texture, GLsizei levels, GLenum internalformat,
                        GLsizei width)
 {
-   texturestorage(1, texture, levels, internalformat, width, 1, 1);
+   texturestorage(1, texture, levels, internalformat, width, 1, 1,
+                  "glTextureStorage1D");
 }
 
 
@@ -599,7 +597,8 @@
                        GLenum internalformat,
                        GLsizei width, GLsizei height)
 {
-   texturestorage(2, texture, levels, internalformat, width, height, 1);
+   texturestorage(2, texture, levels, internalformat, width, height, 1,
+                  "glTextureStorage2D");
 }
 
 
@@ -607,7 +606,8 @@
 _mesa_TextureStorage3D(GLuint texture, GLsizei levels, GLenum internalformat,
                        GLsizei width, GLsizei height, GLsizei depth)
 {
-   texturestorage(3, texture, levels, internalformat, width, height, depth);
+   texturestorage(3, texture, levels, internalformat, width, height, depth,
+                  "glTextureStorage3D");
 }
 
 
diff --git a/src/mesa/main/texstorage.h b/src/mesa/main/texstorage.h
index e80a9ff..526c61e 100644
--- a/src/mesa/main/texstorage.h
+++ b/src/mesa/main/texstorage.h
@@ -31,13 +31,6 @@
  */
 /*@{*/
 
-extern void
-_mesa_texture_storage(struct gl_context *ctx, GLuint dims,
-                      struct gl_texture_object *texObj,
-                      GLenum target, GLsizei levels,
-                      GLenum internalformat, GLsizei width,
-                      GLsizei height, GLsizei depth, bool dsa);
-
 /**
  * Texture width, height and depth check shared with the
  * multisample variants of TexStorage functions.
diff --git a/src/mesa/main/texturebindless.c b/src/mesa/main/texturebindless.c
new file mode 100644
index 0000000..cb95ed0
--- /dev/null
+++ b/src/mesa/main/texturebindless.c
@@ -0,0 +1,1027 @@
+/*
+ * Copyright © 2017 Valve Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "glheader.h"
+#include "context.h"
+#include "enums.h"
+#include "imports.h"
+#include "hash.h"
+#include "mtypes.h"
+#include "shaderimage.h"
+#include "teximage.h"
+#include "texobj.h"
+#include "texturebindless.h"
+
+#include "util/hash_table.h"
+
+/**
+ * Return the gl_texture_handle_object stored for a given 64-bit handle in
+ * ctx->Shared->TextureHandles; the table is searched under HandlesMutex. */
+static struct gl_texture_handle_object *
+lookup_texture_handle(struct gl_context *ctx, GLuint64 id)
+{
+   struct gl_texture_handle_object *texHandleObj;
+
+   mtx_lock(&ctx->Shared->HandlesMutex);
+   texHandleObj = (struct gl_texture_handle_object *)
+      _mesa_hash_table_u64_search(ctx->Shared->TextureHandles, id);
+   mtx_unlock(&ctx->Shared->HandlesMutex);
+
+   return texHandleObj; /* search result as-is; presumably NULL if unknown */
+}
+
+/**
+ * Return the gl_image_handle_object stored for a given 64-bit handle in
+ * ctx->Shared->ImageHandles; the table is searched under HandlesMutex. */
+static struct gl_image_handle_object *
+lookup_image_handle(struct gl_context *ctx, GLuint64 id)
+{
+   struct gl_image_handle_object *imgHandleObj;
+
+   mtx_lock(&ctx->Shared->HandlesMutex);
+   imgHandleObj = (struct gl_image_handle_object *)
+      _mesa_hash_table_u64_search(ctx->Shared->ImageHandles, id);
+   mtx_unlock(&ctx->Shared->HandlesMutex);
+
+   return imgHandleObj; /* search result as-is; presumably NULL if unknown */
+}
+
+/**
+ * Remove a texture handle from the shared TextureHandles table (under
+ * HandlesMutex), then ask the driver to delete the handle itself. */
+static void
+delete_texture_handle(struct gl_context *ctx, GLuint64 id)
+{
+   mtx_lock(&ctx->Shared->HandlesMutex);
+   _mesa_hash_table_u64_remove(ctx->Shared->TextureHandles, id);
+   mtx_unlock(&ctx->Shared->HandlesMutex);
+
+   ctx->Driver.DeleteTextureHandle(ctx, id); /* called after the unlock */
+}
+
+/**
+ * Remove an image handle from the shared ImageHandles table (under
+ * HandlesMutex), then ask the driver to delete the handle itself. */
+static void
+delete_image_handle(struct gl_context *ctx, GLuint64 id)
+{
+   mtx_lock(&ctx->Shared->HandlesMutex);
+   _mesa_hash_table_u64_remove(ctx->Shared->ImageHandles, id);
+   mtx_unlock(&ctx->Shared->HandlesMutex);
+
+   ctx->Driver.DeleteImageHandle(ctx, id); /* called after the unlock */
+}
+
+/**
+ * Return TRUE if the texture handle is resident in the current context, i.e.
+ * if ctx->ResidentTextureHandles holds an entry for it. */
+static inline bool
+is_texture_handle_resident(struct gl_context *ctx, GLuint64 handle)
+{
+   return _mesa_hash_table_u64_search(ctx->ResidentTextureHandles,
+                                      handle) != NULL;
+}
+
+/**
+ * Return TRUE if the image handle is resident in the current context, i.e.
+ * if ctx->ResidentImageHandles holds an entry for it. */
+static inline bool
+is_image_handle_resident(struct gl_context *ctx, GLuint64 handle)
+{
+   return _mesa_hash_table_u64_search(ctx->ResidentImageHandles,
+                                      handle) != NULL;
+}
+
+/**
+ * Make a texture handle resident (resident == true) or non-resident in ctx:
+ * updates ctx->ResidentTextureHandles, notifies the driver, adjusts refs. */
+static void
+make_texture_handle_resident(struct gl_context *ctx,
+                             struct gl_texture_handle_object *texHandleObj,
+                             bool resident)
+{
+   struct gl_sampler_object *sampObj = NULL;
+   struct gl_texture_object *texObj = NULL;
+   GLuint64 handle = texHandleObj->handle;
+
+   if (resident) {
+      assert(!is_texture_handle_resident(ctx, handle));
+
+      _mesa_hash_table_u64_insert(ctx->ResidentTextureHandles, handle,
+                                  texHandleObj);
+
+      ctx->Driver.MakeTextureHandleResident(ctx, handle, GL_TRUE);
+
+      /* Reference the texture object (and the separate sampler if needed) to
+       * be sure it won't be deleted until it is not bound anywhere and there
+       * are no handles using the object that are resident in any context.
+       */
+      _mesa_reference_texobj(&texObj, texHandleObj->texObj);
+      if (texHandleObj->sampObj)
+         _mesa_reference_sampler_object(ctx, &sampObj, texHandleObj->sampObj);
+   } else {
+      assert(is_texture_handle_resident(ctx, handle));
+
+      _mesa_hash_table_u64_remove(ctx->ResidentTextureHandles, handle);
+
+      ctx->Driver.MakeTextureHandleResident(ctx, handle, GL_FALSE);
+
+      /* Unreference the texture object through a local copy so the pointer in
+       * texHandleObj stays intact; if the refcount hits zero, the texture and
+       * all handles will be deleted. */
+      texObj = texHandleObj->texObj;
+      _mesa_reference_texobj(&texObj, NULL);
+
+      /* Unreference the separate sampler object the same way, keeping the
+       * pointer in texHandleObj intact; if the refcount hits zero, the
+       * sampler and all handles will be deleted. */
+      if (texHandleObj->sampObj) {
+         sampObj = texHandleObj->sampObj;
+         _mesa_reference_sampler_object(ctx, &sampObj, NULL);
+      }
+   }
+}
+
+/**
+ * Make an image handle resident (resident == true) or non-resident in ctx;
+ * \p access is passed through unchanged to Driver.MakeImageHandleResident. */
+static void
+make_image_handle_resident(struct gl_context *ctx,
+                           struct gl_image_handle_object *imgHandleObj,
+                           GLenum access, bool resident)
+{
+   struct gl_texture_object *texObj = NULL;
+   GLuint64 handle = imgHandleObj->handle;
+
+   if (resident) {
+      assert(!is_image_handle_resident(ctx, handle));
+
+      _mesa_hash_table_u64_insert(ctx->ResidentImageHandles, handle,
+                                  imgHandleObj);
+
+      ctx->Driver.MakeImageHandleResident(ctx, handle, access, GL_TRUE);
+
+      /* Reference the texture object to be sure it won't be deleted until it
+       * is not bound anywhere and there are no handles using the object that
+       * are resident in any context.
+       */
+      _mesa_reference_texobj(&texObj, imgHandleObj->imgObj.TexObj);
+   } else {
+      assert(is_image_handle_resident(ctx, handle));
+
+      _mesa_hash_table_u64_remove(ctx->ResidentImageHandles, handle);
+
+      ctx->Driver.MakeImageHandleResident(ctx, handle, access, GL_FALSE);
+
+      /* Unreference the texture object through a local copy so the pointer in
+       * imgHandleObj stays intact; if the refcount hits zero, the texture and
+       * all handles will be deleted. */
+      texObj = imgHandleObj->imgObj.TexObj;
+      _mesa_reference_texobj(&texObj, NULL);
+   }
+}
+
+/* Find the handle object of texObj created for sampObj (NULL: no sampler). */
+static struct gl_texture_handle_object *
+find_texhandleobj(struct gl_texture_object *texObj,
+                  struct gl_sampler_object *sampObj)
+{
+   util_dynarray_foreach(&texObj->SamplerHandles,
+                         struct gl_texture_handle_object *, entry) {
+      if (sampObj == (*entry)->sampObj) return *entry;
+   }
+   return NULL;
+}
+
+/**
+ * Return the handle for texObj/sampObj, creating it on first use (0 on OOM).
+ */
+static GLuint64
+get_texture_handle(struct gl_context *ctx, struct gl_texture_object *texObj,
+                   struct gl_sampler_object *sampObj)
+{
+   bool separate_sampler = &texObj->Sampler != sampObj;
+   struct gl_texture_handle_object *texHandleObj;
+   GLuint64 handle = 0;
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The handle for each texture or texture/sampler pair is unique; the same
+    *  handle will be returned if GetTextureHandleARB is called multiple times
+    *  for the same texture or if GetTextureSamplerHandleARB is called multiple
+    *  times for the same texture/sampler pair."
+    */
+   mtx_lock(&ctx->Shared->HandlesMutex);
+   texHandleObj = find_texhandleobj(texObj, separate_sampler ? sampObj : NULL);
+   if (texHandleObj) {
+      mtx_unlock(&ctx->Shared->HandlesMutex);
+      return texHandleObj->handle;
+   }
+
+   /* Allocate the wrapper first so that a failed allocation can never leak
+    * a handle already created by the driver. */
+   texHandleObj = CALLOC_STRUCT(gl_texture_handle_object);
+   if (texHandleObj)
+      handle = ctx->Driver.NewTextureHandle(ctx, texObj, sampObj);
+   if (!handle) {
+      free(texHandleObj);
+      mtx_unlock(&ctx->Shared->HandlesMutex);
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexture*HandleARB()");
+      return 0;
+   }
+
+   /* Store the handle into the texture object. */
+   texHandleObj->texObj = texObj;
+   texHandleObj->sampObj = separate_sampler ? sampObj : NULL;
+   texHandleObj->handle = handle;
+   util_dynarray_append(&texObj->SamplerHandles,
+                        struct gl_texture_handle_object *, texHandleObj);
+
+   if (separate_sampler) {
+      /* Store the handle into the separate sampler if needed. */
+      util_dynarray_append(&sampObj->Handles,
+                           struct gl_texture_handle_object *, texHandleObj);
+   }
+
+   /* When referenced by one or more handles, texture objects are immutable. */
+   texObj->HandleAllocated = true;
+   if (texObj->Target == GL_TEXTURE_BUFFER)
+      texObj->BufferObject->HandleAllocated = true;
+   sampObj->HandleAllocated = true;
+
+   /* Store the handle in the shared state for all contexts. */
+   _mesa_hash_table_u64_insert(ctx->Shared->TextureHandles, handle,
+                               texHandleObj);
+   mtx_unlock(&ctx->Shared->HandlesMutex);
+
+   return handle;
+}
+
+/* Find the image handle of texObj created with exactly these parameters. */
+static struct gl_image_handle_object *
+find_imghandleobj(struct gl_texture_object *texObj, GLint level,
+                  GLboolean layered, GLint layer, GLenum format)
+{
+   util_dynarray_foreach(&texObj->ImageHandles,
+                         struct gl_image_handle_object *, it) {
+      const struct gl_image_unit *iu = &(*it)->imgObj;
+      if (iu->Format == format && iu->Level == level && iu->Layer == layer &&
+          iu->Layered == layered && iu->TexObj == texObj)
+         return *it;
+   }
+   return NULL;
+}
+
+/**
+ * Look up, or create, the image handle for these parameters (0 on OOM).
+ */
+static GLuint64
+get_image_handle(struct gl_context *ctx, struct gl_texture_object *texObj,
+                 GLint level, GLboolean layered, GLint layer, GLenum format)
+{
+   struct gl_image_handle_object *imgHandleObj;
+   struct gl_image_unit imgObj = {0}; /* no field may stay indeterminate */
+   GLuint64 handle = 0;
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The handle returned for each combination of <texture>, <level>,
+    * <layered>, <layer>, and <format> is unique; the same handle will be
+    * returned if GetImageHandleARB is called multiple times with the same
+    * parameters."
+    */
+   mtx_lock(&ctx->Shared->HandlesMutex);
+   imgHandleObj = find_imghandleobj(texObj, level, layered, layer, format);
+   if (imgHandleObj) {
+      mtx_unlock(&ctx->Shared->HandlesMutex);
+      return imgHandleObj->handle;
+   }
+
+   imgObj.TexObj = texObj; /* weak reference */
+   imgObj.Level = level;
+   imgObj.Access = GL_READ_WRITE;
+   imgObj.Format = format;
+   imgObj._ActualFormat = _mesa_get_shader_image_format(format);
+
+   if (_mesa_tex_target_is_layered(texObj->Target)) {
+      imgObj.Layered = layered;
+      imgObj.Layer = layer;
+      imgObj._Layer = (imgObj.Layered ? 0 : imgObj.Layer);
+   } else {
+      imgObj.Layered = GL_FALSE;
+      imgObj.Layer = 0;
+   }
+
+   /* Allocate the wrapper first so that a failed allocation can never leak
+    * a handle already created by the driver. */
+   imgHandleObj = CALLOC_STRUCT(gl_image_handle_object);
+   if (imgHandleObj)
+      handle = ctx->Driver.NewImageHandle(ctx, &imgObj);
+   if (!handle) {
+      free(imgHandleObj);
+      mtx_unlock(&ctx->Shared->HandlesMutex);
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetImageHandleARB()");
+      return 0;
+   }
+
+   /* Store the handle into the texture object. */
+   memcpy(&imgHandleObj->imgObj, &imgObj, sizeof(struct gl_image_unit));
+   imgHandleObj->handle = handle;
+   util_dynarray_append(&texObj->ImageHandles,
+                        struct gl_image_handle_object *, imgHandleObj);
+
+   /* When referenced by one or more handles, texture objects are immutable. */
+   texObj->HandleAllocated = true;
+   if (texObj->Target == GL_TEXTURE_BUFFER)
+      texObj->BufferObject->HandleAllocated = true;
+   texObj->Sampler.HandleAllocated = true;
+
+   /* Store the handle in the shared state for all contexts. */
+   _mesa_hash_table_u64_insert(ctx->Shared->ImageHandles, handle, imgHandleObj);
+   mtx_unlock(&ctx->Shared->HandlesMutex);
+
+   return handle;
+}
+
+/**
+ * Init/free the per-context tables of resident texture and image handles.
+ */
+void
+_mesa_init_resident_handles(struct gl_context *ctx)
+{
+   ctx->ResidentImageHandles = _mesa_hash_table_u64_create(NULL);
+   ctx->ResidentTextureHandles = _mesa_hash_table_u64_create(NULL);
+}
+
+void
+_mesa_free_resident_handles(struct gl_context *ctx)
+{
+   _mesa_hash_table_u64_destroy(ctx->ResidentImageHandles, NULL);
+   _mesa_hash_table_u64_destroy(ctx->ResidentTextureHandles, NULL);
+}
+
+/**
+ * Init/free the handle tables shared by all contexts, and their mutex.
+ */
+void
+_mesa_init_shared_handles(struct gl_shared_state *shared)
+{
+   mtx_init(&shared->HandlesMutex, mtx_recursive);
+   shared->ImageHandles = _mesa_hash_table_u64_create(NULL);
+   shared->TextureHandles = _mesa_hash_table_u64_create(NULL);
+}
+
+void
+_mesa_free_shared_handles(struct gl_shared_state *shared)
+{
+   _mesa_hash_table_u64_destroy(shared->ImageHandles, NULL);
+   _mesa_hash_table_u64_destroy(shared->TextureHandles, NULL);
+   mtx_destroy(&shared->HandlesMutex);
+}
+
+/**
+ * Init/free the per-texture lists of texture (sampler) and image handles.
+ */
+void
+_mesa_init_texture_handles(struct gl_texture_object *texObj)
+{
+   util_dynarray_init(&texObj->ImageHandles, NULL);
+   util_dynarray_init(&texObj->SamplerHandles, NULL);
+}
+
+/* Make every handle of texObj that is resident in ctx non-resident. */
+void
+_mesa_make_texture_handles_non_resident(struct gl_context *ctx,
+                                        struct gl_texture_object *texObj)
+{
+   mtx_lock(&ctx->Shared->HandlesMutex);
+   util_dynarray_foreach(&texObj->SamplerHandles,
+                         struct gl_texture_handle_object *, texSlot) {
+      struct gl_texture_handle_object *th = *texSlot;
+      if (is_texture_handle_resident(ctx, th->handle))
+         make_texture_handle_resident(ctx, th, false);
+   }
+
+   util_dynarray_foreach(&texObj->ImageHandles,
+                         struct gl_image_handle_object *, imgSlot) {
+      struct gl_image_handle_object *ih = *imgSlot;
+      if (is_image_handle_resident(ctx, ih->handle))
+         make_image_handle_resident(ctx, ih, GL_READ_ONLY, false);
+   }
+
+   mtx_unlock(&ctx->Shared->HandlesMutex);
+}
+
+/* Delete all handles of texObj, unlinking them from separate samplers. */
+void
+_mesa_delete_texture_handles(struct gl_context *ctx,
+                             struct gl_texture_object *texObj)
+{
+   util_dynarray_foreach(&texObj->SamplerHandles,
+                         struct gl_texture_handle_object *, texSlot) {
+      struct gl_texture_handle_object *th = *texSlot;
+
+      /* A handle made with a separate sampler is listed there as well. */
+      if (th->sampObj) {
+         util_dynarray_delete_unordered(&th->sampObj->Handles,
+                                        struct gl_texture_handle_object *,
+                                        th);
+      }
+      delete_texture_handle(ctx, th->handle);
+      free(th);
+   }
+   util_dynarray_fini(&texObj->SamplerHandles);
+
+   util_dynarray_foreach(&texObj->ImageHandles,
+                         struct gl_image_handle_object *, imgSlot) {
+      struct gl_image_handle_object *ih = *imgSlot;
+      delete_image_handle(ctx, ih->handle);
+      free(ih);
+   }
+   util_dynarray_fini(&texObj->ImageHandles);
+}
+
+/**
+ * Init/free the per-sampler list of texture handles that use this sampler.
+ */
+void
+_mesa_init_sampler_handles(struct gl_sampler_object *sampObj)
+{
+   util_dynarray_init(&sampObj->Handles, NULL);
+}
+
+/* Delete every texture handle that was created with sampObj. */
+void
+_mesa_delete_sampler_handles(struct gl_context *ctx,
+                             struct gl_sampler_object *sampObj)
+{
+   util_dynarray_foreach(&sampObj->Handles,
+                         struct gl_texture_handle_object *, slot) {
+      struct gl_texture_handle_object *th = *slot;
+
+      /* Unlink the handle from its texture before destroying it. */
+      util_dynarray_delete_unordered(&th->texObj->SamplerHandles,
+                                     struct gl_texture_handle_object *, th);
+
+      delete_texture_handle(ctx, th->handle);
+      free(th);
+   }
+   util_dynarray_fini(&sampObj->Handles);
+}
+
+/**
+ * Tell whether the border color of a sampler is one of the values that the
+ * ARB_bindless_texture spec allows when a texture handle is created.
+ */
+static GLboolean
+is_sampler_border_color_valid(struct gl_sampler_object *samp)
+{
+   static const GLfloat float_colors[4][4] = {
+      { 0.0, 0.0, 0.0, 0.0 },
+      { 0.0, 0.0, 0.0, 1.0 },
+      { 1.0, 1.0, 1.0, 0.0 },
+      { 1.0, 1.0, 1.0, 1.0 },
+   };
+   static const GLint integer_colors[4][4] = {
+      { 0, 0, 0, 0 },
+      { 0, 0, 0, 1 },
+      { 1, 1, 1, 0 },
+      { 1, 1, 1, 1 },
+   };
+   const size_t nbytes = sizeof(samp->BorderColor.ui);
+   unsigned i;
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION is generated if the border color (taken from
+    *  the embedded sampler for GetTextureHandleARB or from the <sampler> for
+    *  GetTextureSamplerHandleARB) is not one of the following allowed values.
+    *  If the texture's base internal format is signed or unsigned integer,
+    *  allowed values are (0,0,0,0), (0,0,0,1), (1,1,1,0), and (1,1,1,1). If
+    *  the base internal format is not integer, allowed values are
+    *  (0.0,0.0,0.0,0.0), (0.0,0.0,0.0,1.0), (1.0,1.0,1.0,0.0), and
+    *  (1.0,1.0,1.0,1.0)."
+    */
+   for (i = 0; i < 4; i++) {
+      /* Byte-wise match against the float, then the integer, encoding. */
+      if (memcmp(samp->BorderColor.f, float_colors[i], nbytes) == 0 ||
+          memcmp(samp->BorderColor.ui, integer_colors[i], nbytes) == 0)
+         return GL_TRUE;
+   }
+
+   return GL_FALSE;
+}
+
+GLuint64 GLAPIENTRY
+_mesa_GetTextureHandleARB_no_error(GLuint texture)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex = _mesa_lookup_texture(ctx, texture);
+   struct gl_sampler_object *samp = &tex->Sampler;
+
+   /* Unchecked variant: the arguments are taken to be valid. */
+   if (!_mesa_is_texture_complete(tex, samp))
+      _mesa_test_texobj_completeness(ctx, tex);
+
+   return get_texture_handle(ctx, tex, samp);
+}
+
+GLuint64 GLAPIENTRY
+_mesa_GetTextureHandleARB(GLuint texture)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex;
+   struct gl_sampler_object *samp;
+
+   if (!_mesa_has_ARB_bindless_texture(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetTextureHandleARB(unsupported)");
+      return 0;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_VALUE is generated by GetTextureHandleARB or
+    *  GetTextureSamplerHandleARB if <texture> is zero or not the name of an
+    *  existing texture object."
+    */
+   tex = texture > 0 ? _mesa_lookup_texture(ctx, texture) : NULL;
+   if (!tex) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glGetTextureHandleARB(texture)");
+      return 0;
+   }
+   samp = &tex->Sampler; /* sampler state embedded in the texture */
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION is generated by GetTextureHandleARB or
+    *  GetTextureSamplerHandleARB if the texture object specified by <texture>
+    *  is not complete."
+    */
+   if (!_mesa_is_texture_complete(tex, samp)) {
+      _mesa_test_texobj_completeness(ctx, tex);
+      if (!_mesa_is_texture_complete(tex, samp)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glGetTextureHandleARB(incomplete texture)");
+         return 0;
+      }
+   }
+
+   if (!is_sampler_border_color_valid(samp)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetTextureHandleARB(invalid border color)");
+      return 0;
+   }
+
+   /* Every check passed: return the existing handle or create one. */
+   return get_texture_handle(ctx, tex, samp);
+}
+
+GLuint64 GLAPIENTRY
+_mesa_GetTextureSamplerHandleARB_no_error(GLuint texture, GLuint sampler)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex = _mesa_lookup_texture(ctx, texture);
+   struct gl_sampler_object *samp = _mesa_lookup_samplerobj(ctx, sampler);
+
+   /* Unchecked variant: both names are taken to be valid, so the lookup
+    * results are used without any NULL test.
+    */
+   if (!_mesa_is_texture_complete(tex, samp)) {
+      _mesa_test_texobj_completeness(ctx, tex);
+   }
+
+   return get_texture_handle(ctx, tex, samp);
+}
+
+GLuint64 GLAPIENTRY
+_mesa_GetTextureSamplerHandleARB(GLuint texture, GLuint sampler)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex;
+   struct gl_sampler_object *samp;
+
+   if (!_mesa_has_ARB_bindless_texture(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetTextureSamplerHandleARB(unsupported)");
+      return 0;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_VALUE is generated by GetTextureHandleARB or
+    *  GetTextureSamplerHandleARB if <texture> is zero or not the name of an
+    *  existing texture object."
+    */
+   tex = texture > 0 ? _mesa_lookup_texture(ctx, texture) : NULL;
+   if (!tex) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "glGetTextureSamplerHandleARB(texture)");
+      return 0;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_VALUE is generated by GetTextureSamplerHandleARB if
+    *  <sampler> is zero or is not the name of an existing sampler object."
+    */
+   samp = _mesa_lookup_samplerobj(ctx, sampler);
+   if (!samp) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "glGetTextureSamplerHandleARB(sampler)");
+      return 0;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION is generated by GetTextureHandleARB or
+    *  GetTextureSamplerHandleARB if the texture object specified by <texture>
+    *  is not complete."
+    */
+   if (!_mesa_is_texture_complete(tex, samp)) {
+      _mesa_test_texobj_completeness(ctx, tex);
+      if (!_mesa_is_texture_complete(tex, samp)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glGetTextureSamplerHandleARB(incomplete texture)");
+         return 0;
+      }
+   }
+
+   /* Bindless handles accept only a few border colors; for this entry
+    * point the color is taken from the separate sampler object. */
+   if (!is_sampler_border_color_valid(samp)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetTextureSamplerHandleARB(invalid border color)");
+      return 0;
+   }
+
+   /* Every check passed: return the existing handle or create one. */
+   return get_texture_handle(ctx, tex, samp);
+}
+
+void GLAPIENTRY
+_mesa_MakeTextureHandleResidentARB_no_error(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* Unchecked variant: the handle is taken to be valid and the lookup
+    * result is passed on as is. */
+   make_texture_handle_resident(ctx, lookup_texture_handle(ctx, handle),
+                                true);
+}
+
+void GLAPIENTRY
+_mesa_MakeTextureHandleResidentARB(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_handle_object *handleObj;
+
+   if (!_mesa_has_ARB_bindless_texture(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeTextureHandleResidentARB(unsupported)");
+      return;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION is generated by MakeTextureHandleResidentARB
+    *  if <handle> is not a valid texture handle, or if <handle> is already
+    *  resident in the current GL context."
+    */
+   handleObj = lookup_texture_handle(ctx, handle);
+   if (handleObj == NULL) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeTextureHandleResidentARB(handle)");
+      return;
+   }
+
+   if (is_texture_handle_resident(ctx, handle)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeTextureHandleResidentARB(already resident)");
+      return;
+   }
+
+   /* Valid handle that is not yet resident in this context. */
+   make_texture_handle_resident(ctx, handleObj, true);
+}
+
+void GLAPIENTRY
+_mesa_MakeTextureHandleNonResidentARB_no_error(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* Unchecked variant: the handle is taken to be valid and the lookup
+    * result is passed on as is. */
+   make_texture_handle_resident(ctx, lookup_texture_handle(ctx, handle),
+                                false);
+}
+
+void GLAPIENTRY
+_mesa_MakeTextureHandleNonResidentARB(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_handle_object *handleObj;
+
+   if (!_mesa_has_ARB_bindless_texture(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeTextureHandleNonResidentARB(unsupported)");
+      return;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION is generated by
+    *  MakeTextureHandleNonResidentARB if <handle> is not a valid texture
+    *  handle, or if <handle> is not resident in the current GL context."
+    */
+   handleObj = lookup_texture_handle(ctx, handle);
+   if (handleObj == NULL) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeTextureHandleNonResidentARB(handle)");
+      return;
+   }
+
+   if (!is_texture_handle_resident(ctx, handle)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeTextureHandleNonResidentARB(not resident)");
+      return;
+   }
+
+   /* Valid handle that is currently resident in this context. */
+   make_texture_handle_resident(ctx, handleObj, false);
+}
+
+GLuint64 GLAPIENTRY
+_mesa_GetImageHandleARB_no_error(GLuint texture, GLint level, GLboolean layered,
+                                 GLint layer, GLenum format)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex = _mesa_lookup_texture(ctx, texture);
+
+   /* Unchecked variant: the arguments are taken to be valid. */
+   if (!_mesa_is_texture_complete(tex, &tex->Sampler)) {
+      _mesa_test_texobj_completeness(ctx, tex);
+   }
+
+   return get_image_handle(ctx, tex, level, layered, layer, format);
+}
+
+GLuint64 GLAPIENTRY
+_mesa_GetImageHandleARB(GLuint texture, GLint level, GLboolean layered,
+                        GLint layer, GLenum format)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_texture_object *tex;
+
+   if (!_mesa_has_ARB_bindless_texture(ctx) ||
+       !_mesa_has_ARB_shader_image_load_store(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetImageHandleARB(unsupported)");
+      return 0;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_VALUE is generated by GetImageHandleARB if <texture>
+    *  is zero or not the name of an existing texture object, if the image for
+    *  <level> does not existing in <texture>, or if <layered> is FALSE and
+    *  <layer> is greater than or equal to the number of layers in the image at
+    *  <level>."
+    */
+   tex = texture > 0 ? _mesa_lookup_texture(ctx, texture) : NULL;
+   if (!tex) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glGetImageHandleARB(texture)");
+      return 0;
+   }
+
+   if (level < 0 || level >= _mesa_max_texture_levels(ctx, tex->Target)) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glGetImageHandleARB(level)");
+      return 0;
+   }
+
+   /* NOTE(review): the spec text says "greater than or equal to" while the
+    * test below uses '>' - confirm against _mesa_get_texture_layers(). */
+   if (!layered && layer > _mesa_get_texture_layers(tex, level)) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glGetImageHandleARB(layer)");
+      return 0;
+   }
+
+   if (!_mesa_is_shader_image_format_supported(ctx, format)) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glGetImageHandleARB(format)");
+      return 0;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION is generated by GetImageHandleARB if the
+    *  texture object <texture> is not complete or if <layered> is TRUE and
+    *  <texture> is not a three-dimensional, one-dimensional array, two
+    *  dimensional array, cube map, or cube map array texture."
+    */
+   if (!_mesa_is_texture_complete(tex, &tex->Sampler)) {
+      _mesa_test_texobj_completeness(ctx, tex);
+      if (!_mesa_is_texture_complete(tex, &tex->Sampler)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glGetImageHandleARB(incomplete texture)");
+         return 0;
+      }
+   }
+
+   if (layered && !_mesa_tex_target_is_layered(tex->Target)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetImageHandleARB(not layered)");
+      return 0;
+   }
+
+   /* Every check passed: return the existing handle or create one. */
+   return get_image_handle(ctx, tex, level, layered, layer, format);
+}
+
+void GLAPIENTRY
+_mesa_MakeImageHandleResidentARB_no_error(GLuint64 handle, GLenum access)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* Unchecked variant: the handle is taken to be valid and the lookup
+    * result is passed on as is. */
+   make_image_handle_resident(ctx, lookup_image_handle(ctx, handle), access,
+                              true);
+}
+
+void GLAPIENTRY
+_mesa_MakeImageHandleResidentARB(GLuint64 handle, GLenum access)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_image_handle_object *handleObj;
+
+   if (!_mesa_has_ARB_bindless_texture(ctx) ||
+       !_mesa_has_ARB_shader_image_load_store(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeImageHandleResidentARB(unsupported)");
+      return;
+   }
+
+   /* <access> has to be one of the three image access enums. */
+   if (!(access == GL_READ_ONLY || access == GL_WRITE_ONLY ||
+         access == GL_READ_WRITE)) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glMakeImageHandleResidentARB(access)");
+      return;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION is generated by MakeImageHandleResidentARB
+    *  if <handle> is not a valid image handle, or if <handle> is already
+    *  resident in the current GL context."
+    */
+   handleObj = lookup_image_handle(ctx, handle);
+   if (handleObj == NULL) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeImageHandleResidentARB(handle)");
+      return;
+   }
+
+   if (is_image_handle_resident(ctx, handle)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeImageHandleResidentARB(already resident)");
+      return;
+   }
+
+   /* Valid handle that is not yet resident in this context. */
+   make_image_handle_resident(ctx, handleObj, access, true);
+}
+
+void GLAPIENTRY
+_mesa_MakeImageHandleNonResidentARB_no_error(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* Unchecked variant: the handle is taken to be valid.  GL_READ_ONLY is
+    * the access value passed along when residency is removed. */
+   make_image_handle_resident(ctx, lookup_image_handle(ctx, handle),
+                              GL_READ_ONLY, false);
+}
+
+void GLAPIENTRY
+_mesa_MakeImageHandleNonResidentARB(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_image_handle_object *handleObj;
+
+   if (!_mesa_has_ARB_bindless_texture(ctx) ||
+       !_mesa_has_ARB_shader_image_load_store(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeImageHandleNonResidentARB(unsupported)");
+      return;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION is generated by
+    *  MakeImageHandleNonResidentARB if <handle> is not a valid image handle,
+    *  or if <handle> is not resident in the current GL context."
+    */
+   handleObj = lookup_image_handle(ctx, handle);
+   if (handleObj == NULL) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeImageHandleNonResidentARB(handle)");
+      return;
+   }
+
+   if (!is_image_handle_resident(ctx, handle)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glMakeImageHandleNonResidentARB(not resident)");
+      return;
+   }
+
+   /* Valid, resident handle; GL_READ_ONLY is the access value passed along. */
+   make_image_handle_resident(ctx, handleObj, GL_READ_ONLY, false);
+}
+
+GLboolean GLAPIENTRY
+_mesa_IsTextureHandleResidentARB_no_error(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   return is_texture_handle_resident(ctx, handle); /* handle not validated */
+}
+
+GLboolean GLAPIENTRY
+_mesa_IsTextureHandleResidentARB(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!_mesa_has_ARB_bindless_texture(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glIsTextureHandleResidentARB(unsupported)");
+      return GL_FALSE;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION will be generated by
+    *  IsTextureHandleResidentARB and IsImageHandleResidentARB if <handle> is
+    *  not a valid texture or image handle, respectively."
+    */
+   if (!lookup_texture_handle(ctx, handle)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glIsTextureHandleResidentARB(handle)");
+      return GL_FALSE;
+   }
+   /* Known handle: report whether it is resident in the current context. */
+   return is_texture_handle_resident(ctx, handle);
+}
+
+GLboolean GLAPIENTRY
+_mesa_IsImageHandleResidentARB_no_error(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   return is_image_handle_resident(ctx, handle); /* handle not validated */
+}
+
+GLboolean GLAPIENTRY
+_mesa_IsImageHandleResidentARB(GLuint64 handle)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!_mesa_has_ARB_bindless_texture(ctx) ||
+       !_mesa_has_ARB_shader_image_load_store(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glIsImageHandleResidentARB(unsupported)");
+      return GL_FALSE;
+   }
+
+   /* The ARB_bindless_texture spec says:
+    *
+    * "The error INVALID_OPERATION will be generated by
+    *  IsTextureHandleResidentARB and IsImageHandleResidentARB if <handle> is
+    *  not a valid texture or image handle, respectively."
+    */
+   if (!lookup_image_handle(ctx, handle)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glIsImageHandleResidentARB(handle)");
+      return GL_FALSE;
+   }
+   /* Known handle: report whether it is resident in the current context. */
+   return is_image_handle_resident(ctx, handle);
+}
diff --git a/src/mesa/main/texturebindless.h b/src/mesa/main/texturebindless.h
new file mode 100644
index 0000000..8a9fff6
--- /dev/null
+++ b/src/mesa/main/texturebindless.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright © 2017 Valve Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef TEXTUREBINDLESS_H
+#define TEXTUREBINDLESS_H
+
+#include "mtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \name Internal functions
+ */
+/*@{*/
+
+void
+_mesa_init_resident_handles(struct gl_context *ctx);
+void
+_mesa_free_resident_handles(struct gl_context *ctx);
+
+void
+_mesa_init_shared_handles(struct gl_shared_state *shared);
+void
+_mesa_free_shared_handles(struct gl_shared_state *shared);
+
+void
+_mesa_init_texture_handles(struct gl_texture_object *texObj);
+void
+_mesa_make_texture_handles_non_resident(struct gl_context *ctx,
+                                        struct gl_texture_object *texObj);
+void
+_mesa_delete_texture_handles(struct gl_context *ctx,
+                             struct gl_texture_object *texObj);
+
+void
+_mesa_init_sampler_handles(struct gl_sampler_object *sampObj);
+void
+_mesa_delete_sampler_handles(struct gl_context *ctx,
+                             struct gl_sampler_object *sampObj);
+
+/*@}*/
+
+/**
+ * \name API functions
+ */
+/*@{*/
+
+GLuint64 GLAPIENTRY
+_mesa_GetTextureHandleARB_no_error(GLuint texture);
+
+GLuint64 GLAPIENTRY
+_mesa_GetTextureHandleARB(GLuint texture);
+
+GLuint64 GLAPIENTRY
+_mesa_GetTextureSamplerHandleARB_no_error(GLuint texture, GLuint sampler);
+
+GLuint64 GLAPIENTRY
+_mesa_GetTextureSamplerHandleARB(GLuint texture, GLuint sampler);
+
+void GLAPIENTRY
+_mesa_MakeTextureHandleResidentARB_no_error(GLuint64 handle);
+
+void GLAPIENTRY
+_mesa_MakeTextureHandleResidentARB(GLuint64 handle);
+
+void GLAPIENTRY
+_mesa_MakeTextureHandleNonResidentARB_no_error(GLuint64 handle);
+
+void GLAPIENTRY
+_mesa_MakeTextureHandleNonResidentARB(GLuint64 handle);
+
+GLuint64 GLAPIENTRY
+_mesa_GetImageHandleARB_no_error(GLuint texture, GLint level, GLboolean layered,
+                                 GLint layer, GLenum format);
+
+GLuint64 GLAPIENTRY
+_mesa_GetImageHandleARB(GLuint texture, GLint level, GLboolean layered,
+                        GLint layer, GLenum format);
+
+void GLAPIENTRY
+_mesa_MakeImageHandleResidentARB_no_error(GLuint64 handle, GLenum access);
+
+void GLAPIENTRY
+_mesa_MakeImageHandleResidentARB(GLuint64 handle, GLenum access);
+
+void GLAPIENTRY
+_mesa_MakeImageHandleNonResidentARB_no_error(GLuint64 handle);
+
+void GLAPIENTRY
+_mesa_MakeImageHandleNonResidentARB(GLuint64 handle);
+
+GLboolean GLAPIENTRY
+_mesa_IsTextureHandleResidentARB_no_error(GLuint64 handle);
+
+GLboolean GLAPIENTRY
+_mesa_IsTextureHandleResidentARB(GLuint64 handle);
+
+GLboolean GLAPIENTRY
+_mesa_IsImageHandleResidentARB_no_error(GLuint64 handle);
+
+GLboolean GLAPIENTRY
+_mesa_IsImageHandleResidentARB(GLuint64 handle);
+
+/*@}*/
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c
index ed66c179..cef9caf 100644
--- a/src/mesa/main/textureview.c
+++ b/src/mesa/main/textureview.c
@@ -633,7 +633,6 @@
 
    texFormat = _mesa_choose_texture_format(ctx, texObj, target, 0,
                                            internalformat, GL_NONE, GL_NONE);
-   assert(texFormat != MESA_FORMAT_NONE);
    if (texFormat == MESA_FORMAT_NONE) return;
 
    newViewNumLevels = MIN2(numlevels, origTexObj->NumLevels - minlevel);
diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c
index 96f3df1..fdc3152 100644
--- a/src/mesa/main/transformfeedback.c
+++ b/src/mesa/main/transformfeedback.c
@@ -72,8 +72,8 @@
    callback_data.found = false;
    callback_data.prog = shProg->last_vert_prog;
 
-   _mesa_HashWalk(ctx->TransformFeedback.Objects,
-                  active_xfb_object_references_program, &callback_data);
+   _mesa_HashWalkLocked(ctx->TransformFeedback.Objects,
+                        active_xfb_object_references_program, &callback_data);
 
    /* Also check DefaultObject, as it's not in the Objects hash table. */
    active_xfb_object_references_program(0, ctx->TransformFeedback.DefaultObject,
@@ -110,16 +110,12 @@
    assert(!*ptr);
 
    if (obj) {
+      assert(obj->RefCount > 0);
+
       /* reference new object */
-      if (obj->RefCount == 0) {
-         _mesa_problem(NULL, "referencing deleted transform feedback object");
-         *ptr = NULL;
-      }
-      else {
-         obj->RefCount++;
-         obj->EverBound = GL_TRUE;
-         *ptr = obj;
-      }
+      obj->RefCount++;
+      obj->EverBound = GL_TRUE;
+      *ptr = obj;
    }
 }
 
@@ -545,19 +541,16 @@
 
 
 /**
- * Specify a buffer object to receive transform feedback results.  Plus,
- * specify the starting offset to place the results, and max size.
+ * Validate the buffer object that will receive transform feedback results,
+ * as well as the starting offset for the results and the maximum size.
  * Called from the glBindBufferRange() and glTransformFeedbackBufferRange
  * functions.
  */
-void
-_mesa_bind_buffer_range_transform_feedback(struct gl_context *ctx,
-                                           struct gl_transform_feedback_object *obj,
-                                           GLuint index,
-                                           struct gl_buffer_object *bufObj,
-                                           GLintptr offset,
-                                           GLsizeiptr size,
-                                           bool dsa)
+bool
+_mesa_validate_buffer_range_xfb(struct gl_context *ctx,
+                                struct gl_transform_feedback_object *obj,
+                                GLuint index, struct gl_buffer_object *bufObj,
+                                GLintptr offset, GLsizeiptr size, bool dsa)
 {
    const char *gl_methd_name;
    if (dsa)
@@ -569,7 +562,7 @@
    if (obj->Active) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(transform feedback active)",
                   gl_methd_name);
-      return;
+      return false;
    }
 
    if (index >= ctx->Const.MaxTransformFeedbackBuffers) {
@@ -579,21 +572,21 @@
        */
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(index=%d out of bounds)",
                   gl_methd_name, index);
-      return;
+      return false;
    }
 
    if (size & 0x3) {
       /* OpenGL 4.5 core profile, 6.7, pdf page 103: multiple of 4 */
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(size=%d must be a multiple of "
                   "four)", gl_methd_name, (int) size);
-      return;
+      return false;
    }  
 
    if (offset & 0x3) {
       /* OpenGL 4.5 core profile, 6.7, pdf page 103: multiple of 4 */
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(offset=%d must be a multiple "
                   "of four)", gl_methd_name, (int) offset);
-      return;
+      return false;
    }
 
    if (offset < 0) {
@@ -606,7 +599,7 @@
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(offset=%d must be >= 0)",
                   gl_methd_name,
                   (int) offset);
-      return;
+      return false;
    }
 
    if (size <= 0 && (dsa || bufObj != ctx->Shared->NullBufferObj)) {
@@ -620,10 +613,10 @@
        */
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(size=%d must be > 0)",
                   gl_methd_name, (int) size);
-      return;
+      return false;
    }
 
-   bind_buffer_range(ctx, obj, index, bufObj, offset, size, dsa);
+   return true;
 }
 
 
@@ -747,8 +740,13 @@
       return;
    }
 
-   _mesa_bind_buffer_range_transform_feedback(ctx, obj, index, bufObj, offset,
-                                              size, true);
+   if (!_mesa_validate_buffer_range_xfb(ctx, obj, index, bufObj, offset,
+                                        size, true))
+      return;
+
+   /* The per-attribute binding point */
+   _mesa_set_transform_feedback_binding(ctx, obj, index, bufObj, offset,
+                                        size);
 }
 
 /**
@@ -802,7 +800,7 @@
       return;
    }
 
-   bind_buffer_range(ctx, obj, index, bufObj, offset, 0, false);
+   _mesa_bind_buffer_range_xfb(ctx, obj, index, bufObj, offset, 0);
 }
 
 
@@ -969,7 +967,7 @@
    }
    else
       return (struct gl_transform_feedback_object *)
-         _mesa_HashLookup(ctx->TransformFeedback.Objects, name);
+         _mesa_HashLookupLocked(ctx->TransformFeedback.Objects, name);
 }
 
 static void
@@ -1004,7 +1002,8 @@
             return;
          }
          ids[i] = first + i;
-         _mesa_HashInsert(ctx->TransformFeedback.Objects, first + i, obj);
+         _mesa_HashInsertLocked(ctx->TransformFeedback.Objects, first + i,
+                                obj);
          if (dsa) {
             /* this is normally done at bind time in the non-dsa case */
             obj->EverBound = GL_TRUE;
@@ -1136,7 +1135,7 @@
                            names[i]);
                return;
             }
-            _mesa_HashRemove(ctx->TransformFeedback.Objects, names[i]);
+            _mesa_HashRemoveLocked(ctx->TransformFeedback.Objects, names[i]);
             /* unref, but object may not be deleted until later */
             if (obj == ctx->TransformFeedback.CurrentObject) {
                reference_transform_feedback_object(
diff --git a/src/mesa/main/transformfeedback.h b/src/mesa/main/transformfeedback.h
index c83f917..cb3bb67 100644
--- a/src/mesa/main/transformfeedback.h
+++ b/src/mesa/main/transformfeedback.h
@@ -63,13 +63,11 @@
 extern void GLAPIENTRY
 _mesa_EndTransformFeedback(void);
 
-extern void
-_mesa_bind_buffer_range_transform_feedback(struct gl_context *ctx,
-					   struct gl_transform_feedback_object *obj,
-					   GLuint index,
-					   struct gl_buffer_object *bufObj,
-					   GLintptr offset,
-					   GLsizeiptr size, bool dsa);
+extern bool
+_mesa_validate_buffer_range_xfb(struct gl_context *ctx,
+                                struct gl_transform_feedback_object *obj,
+                                GLuint index, struct gl_buffer_object *bufObj,
+                                GLintptr offset, GLsizeiptr size, bool dsa);
 
 extern void
 _mesa_bind_buffer_base_transform_feedback(struct gl_context *ctx,
@@ -150,6 +148,26 @@
       bufObj->UsageHistory |= USAGE_TRANSFORM_FEEDBACK_BUFFER;
 }
 
+static inline void
+_mesa_bind_buffer_range_xfb(struct gl_context *ctx,
+                            struct gl_transform_feedback_object *obj,
+                            GLuint index, struct gl_buffer_object *bufObj,
+                            GLintptr offset, GLsizeiptr size)
+{
+   /* Note: no need to FLUSH_VERTICES or flag NewTransformFeedback, because
+    * transform feedback buffers can't be changed while transform feedback is
+    * active.
+    */
+
+   /* The general binding point */
+   _mesa_reference_buffer_object(ctx,
+                                 &ctx->TransformFeedback.CurrentBuffer,
+                                 bufObj);
+
+   /* The per-attribute binding point */
+   _mesa_set_transform_feedback_binding(ctx, obj, index, bufObj, offset, size);
+}
+
 /*** GL_ARB_direct_state_access ***/
 
 extern void GLAPIENTRY
diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index 85bb7a4..a48b6d2 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -321,16 +321,23 @@
    }
 
    {
-      unsigned elements = (uni->type->is_sampler())
-	 ? 1 : uni->type->components();
-      const int dmul = uni->type->is_64bit() ? 2 : 1;
+      unsigned elements = uni->type->components();
       const int rmul = glsl_base_type_is_64bit(returnType) ? 2 : 1;
+      int dmul = (uni->type->is_64bit()) ? 2 : 1;
+
+      if ((uni->type->is_sampler() || uni->type->is_image()) &&
+          !uni->is_bindless) {
+         /* Non-bindless samplers/images are represented as 32-bit unsigned
+          * integers, while bindless handles are 64-bit.
+          */
+         dmul = 1;
+      }
 
       /* Calculate the source base address *BEFORE* modifying elements to
        * account for the size of the user's buffer.
        */
       const union gl_constant_value *const src =
-	 &uni->storage[offset * elements * dmul];
+         &uni->storage[offset * elements * dmul];
 
       assert(returnType == GLSL_TYPE_FLOAT || returnType == GLSL_TYPE_INT ||
              returnType == GLSL_TYPE_UINT || returnType == GLSL_TYPE_DOUBLE ||
@@ -339,100 +346,93 @@
       /* doubles have a different size than the other 3 types */
       unsigned bytes = sizeof(src[0]) * elements * rmul;
       if (bufSize < 0 || bytes > (unsigned) bufSize) {
-	 _mesa_error( ctx, GL_INVALID_OPERATION,
-	             "glGetnUniform*vARB(out of bounds: bufSize is %d,"
-	             " but %u bytes are required)", bufSize, bytes );
-	 return;
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glGetnUniform*vARB(out of bounds: bufSize is %d,"
+                     " but %u bytes are required)", bufSize, bytes);
+         return;
       }
 
       /* If the return type and the uniform's native type are "compatible,"
        * just memcpy the data.  If the types are not compatible, perform a
        * slower convert-and-copy process.
        */
-      if (returnType == uni->type->base_type
-	  || ((returnType == GLSL_TYPE_INT
-	       || returnType == GLSL_TYPE_UINT)
-	      &&
-	      (uni->type->base_type == GLSL_TYPE_INT
-	       || uni->type->base_type == GLSL_TYPE_UINT
-               || uni->type->is_sampler()
-               || uni->type->is_image()))
-          || ((returnType == GLSL_TYPE_UINT64 ||
-               returnType == GLSL_TYPE_INT64 ) &&
-              (uni->type->base_type == GLSL_TYPE_UINT64 ||
-               uni->type->base_type == GLSL_TYPE_INT64))) {
-	 memcpy(paramsOut, src, bytes);
+      if (returnType == uni->type->base_type ||
+          ((returnType == GLSL_TYPE_INT || returnType == GLSL_TYPE_UINT) &&
+           (uni->type->is_sampler() || uni->type->is_image())) ||
+          (returnType == GLSL_TYPE_UINT64 && uni->is_bindless)) {
+         memcpy(paramsOut, src, bytes);
       } else {
-	 union gl_constant_value *const dst =
-	    (union gl_constant_value *) paramsOut;
-	 /* This code could be optimized by putting the loop inside the switch
-	  * statements.  However, this is not expected to be
-	  * performance-critical code.
-	  */
-	 for (unsigned i = 0; i < elements; i++) {
-	   int sidx = i * dmul;
-	   int didx = i * rmul;
+         union gl_constant_value *const dst =
+            (union gl_constant_value *) paramsOut;
+         /* This code could be optimized by putting the loop inside the switch
+          * statements.  However, this is not expected to be
+          * performance-critical code.
+          */
+         for (unsigned i = 0; i < elements; i++) {
+            int sidx = i * dmul;
+            int didx = i * rmul;
 
-	    switch (returnType) {
-	    case GLSL_TYPE_FLOAT:
-	       switch (uni->type->base_type) {
-	       case GLSL_TYPE_UINT:
-		  dst[didx].f = (float) src[sidx].u;
-		  break;
-	       case GLSL_TYPE_INT:
-	       case GLSL_TYPE_SAMPLER:
+            switch (returnType) {
+            case GLSL_TYPE_FLOAT:
+               switch (uni->type->base_type) {
+               case GLSL_TYPE_UINT:
+                  dst[didx].f = (float) src[sidx].u;
+                  break;
+               case GLSL_TYPE_INT:
+               case GLSL_TYPE_SAMPLER:
                case GLSL_TYPE_IMAGE:
-		  dst[didx].f = (float) src[sidx].i;
-		  break;
-	       case GLSL_TYPE_BOOL:
-		  dst[didx].f = src[sidx].i ? 1.0f : 0.0f;
-		  break;
+                  dst[didx].f = (float) src[sidx].i;
+                  break;
+               case GLSL_TYPE_BOOL:
+                  dst[didx].f = src[sidx].i ? 1.0f : 0.0f;
+                  break;
                case GLSL_TYPE_DOUBLE: {
                   double tmp;
                   memcpy(&tmp, &src[sidx].f, sizeof(tmp));
                   dst[didx].f = tmp;
-		  break;
+                  break;
                }
                case GLSL_TYPE_UINT64: {
                   uint64_t tmp;
                   memcpy(&tmp, &src[sidx].u, sizeof(tmp));
                   dst[didx].f = tmp;
                   break;
-               }
+                }
                case GLSL_TYPE_INT64: {
                   uint64_t tmp;
                   memcpy(&tmp, &src[sidx].i, sizeof(tmp));
                   dst[didx].f = tmp;
                   break;
                }
-	       default:
-		  assert(!"Should not get here.");
-		  break;
-	       }
-	       break;
-	    case GLSL_TYPE_DOUBLE:
-	       switch (uni->type->base_type) {
+               default:
+                  assert(!"Should not get here.");
+                  break;
+               }
+               break;
+
+            case GLSL_TYPE_DOUBLE:
+               switch (uni->type->base_type) {
                case GLSL_TYPE_UINT: {
                   double tmp = src[sidx].u;
                   memcpy(&dst[didx].f, &tmp, sizeof(tmp));
-		  break;
+                  break;
                }
-	       case GLSL_TYPE_INT:
-	       case GLSL_TYPE_SAMPLER:
+               case GLSL_TYPE_INT:
+               case GLSL_TYPE_SAMPLER:
                case GLSL_TYPE_IMAGE: {
                   double tmp = src[sidx].i;
                   memcpy(&dst[didx].f, &tmp, sizeof(tmp));
-		  break;
+                  break;
                }
                case GLSL_TYPE_BOOL: {
                   double tmp = src[sidx].i ? 1.0 : 0.0;
                   memcpy(&dst[didx].f, &tmp, sizeof(tmp));
-		  break;
+                  break;
                }
                case GLSL_TYPE_FLOAT: {
                   double tmp = src[sidx].f;
                   memcpy(&dst[didx].f, &tmp, sizeof(tmp));
-		  break;
+                  break;
                }
                case GLSL_TYPE_UINT64: {
                   uint64_t tmpu;
@@ -450,42 +450,45 @@
                   memcpy(&dst[didx].f, &tmp, sizeof(tmp));
                   break;
                }
-	       default:
-		  assert(!"Should not get here.");
-		  break;
-	       }
-	       break;
-	    case GLSL_TYPE_INT:
-	    case GLSL_TYPE_UINT:
-	       switch (uni->type->base_type) {
-	       case GLSL_TYPE_FLOAT:
-		  /* While the GL 3.2 core spec doesn't explicitly
-		   * state how conversion of float uniforms to integer
-		   * values works, in section 6.2 "State Tables" on
-		   * page 267 it says:
-		   *
-		   *     "Unless otherwise specified, when floating
-		   *      point state is returned as integer values or
-		   *      integer state is returned as floating-point
-		   *      values it is converted in the fashion
-		   *      described in section 6.1.2"
-		   *
-		   * That section, on page 248, says:
-		   *
-		   *     "If GetIntegerv or GetInteger64v are called,
-		   *      a floating-point value is rounded to the
-		   *      nearest integer..."
-		   */
-		  dst[didx].i = IROUND(src[sidx].f);
-		  break;
-	       case GLSL_TYPE_BOOL:
-		  dst[didx].i = src[sidx].i ? 1 : 0;
-		  break;
+               default:
+                  assert(!"Should not get here.");
+                  break;
+               }
+               break;
+
+            case GLSL_TYPE_INT:
+               switch (uni->type->base_type) {
+               case GLSL_TYPE_FLOAT:
+                  /* While the GL 3.2 core spec doesn't explicitly
+                   * state how conversion of float uniforms to integer
+                   * values works, in section 6.2 "State Tables" on
+                   * page 267 it says:
+                   *
+                   *     "Unless otherwise specified, when floating
+                   *      point state is returned as integer values or
+                   *      integer state is returned as floating-point
+                   *      values it is converted in the fashion
+                   *      described in section 6.1.2"
+                   *
+                   * That section, on page 248, says:
+                   *
+                   *     "If GetIntegerv or GetInteger64v are called,
+                   *      a floating-point value is rounded to the
+                   *      nearest integer..."
+                   */
+                  dst[didx].i = (int64_t) roundf(src[sidx].f);
+                  break;
+               case GLSL_TYPE_BOOL:
+                  dst[didx].i = src[sidx].i ? 1 : 0;
+                  break;
+               case GLSL_TYPE_UINT:
+                  dst[didx].i = MIN2(src[sidx].i, INT_MAX);
+                  break;
                case GLSL_TYPE_DOUBLE: {
                   double tmp;
                   memcpy(&tmp, &src[sidx].f, sizeof(tmp));
-                  dst[didx].i = IROUNDD(tmp);
-		  break;
+                  dst[didx].i = (int64_t) round(tmp);
+                  break;
                }
                case GLSL_TYPE_UINT64: {
                   uint64_t tmp;
@@ -499,13 +502,59 @@
                   dst[didx].i = tmp;
                   break;
                }
-	       default:
-		  assert(!"Should not get here.");
-		  break;
-	       }
-	       break;
+               default:
+                  assert(!"Should not get here.");
+                  break;
+               }
+               break;
+
+            case GLSL_TYPE_UINT:
+               switch (uni->type->base_type) {
+               case GLSL_TYPE_FLOAT:
+                  /* The spec isn't terribly clear how to handle negative
+                   * values with an unsigned return type.
+                   *
+                   * GL 4.5 section 2.2.2 ("Data Conversions for State
+                   * Query Commands") says:
+                   *
+                   * "If a value is so large in magnitude that it cannot be
+                   *  represented by the returned data type, then the nearest
+                   *  value representable using the requested type is
+                   *  returned."
+                   */
+                  dst[didx].u = src[sidx].f < 0.0f ?
+                     0u : (uint32_t) roundf(src[sidx].f);
+                  break;
+               case GLSL_TYPE_BOOL:
+                  dst[didx].i = src[sidx].i ? 1 : 0;
+                  break;
+               case GLSL_TYPE_INT:
+                  dst[didx].i = MAX2(src[sidx].i, 0);
+                  break;
+               case GLSL_TYPE_DOUBLE: {
+                  double tmp;
+                  memcpy(&tmp, &src[sidx].f, sizeof(tmp));
+                  dst[didx].u = tmp < 0.0 ? 0u : (uint32_t) round(tmp);
+                  break;
+               }
+               case GLSL_TYPE_UINT64: {
+                  uint64_t tmp;
+                  memcpy(&tmp, &src[sidx].u, sizeof(tmp));
+                  dst[didx].i = MIN2(tmp, INT_MAX);
+                  break;
+               }
+               case GLSL_TYPE_INT64: {
+                  int64_t tmp;
+                  memcpy(&tmp, &src[sidx].i, sizeof(tmp));
+                  dst[didx].i = MAX2(tmp, 0);
+                  break;
+               }
+               default:
+                  unreachable("invalid uniform type");
+               }
+               break;
+
             case GLSL_TYPE_INT64:
-            case GLSL_TYPE_UINT64:
                switch (uni->type->base_type) {
                case GLSL_TYPE_UINT: {
                   uint64_t tmp = src[sidx].u;
@@ -524,8 +573,22 @@
                   memcpy(&dst[didx].u, &tmp, sizeof(tmp));
                   break;
                }
+               case GLSL_TYPE_UINT64: {
+                  uint64_t u64;
+                  memcpy(&u64, &src[sidx].u, sizeof(u64));
+                  int64_t tmp = MIN2(u64, INT_MAX);
+                  memcpy(&dst[didx].u, &tmp, sizeof(tmp));
+                  break;
+               }
                case GLSL_TYPE_FLOAT: {
-                  int64_t tmp = src[sidx].f;
+                  int64_t tmp = (int64_t) roundf(src[sidx].f);
+                  memcpy(&dst[didx].u, &tmp, sizeof(tmp));
+                  break;
+               }
+               case GLSL_TYPE_DOUBLE: {
+                  double d;
+                  memcpy(&d, &src[sidx].f, sizeof(d));
+                  int64_t tmp = (int64_t) round(d);
                   memcpy(&dst[didx].u, &tmp, sizeof(tmp));
                   break;
                }
@@ -534,11 +597,57 @@
                   break;
                }
                break;
-	    default:
-	       assert(!"Should not get here.");
-	       break;
-	    }
-	 }
+
+            case GLSL_TYPE_UINT64:
+               switch (uni->type->base_type) {
+               case GLSL_TYPE_UINT: {
+                  uint64_t tmp = src[sidx].u;
+                  memcpy(&dst[didx].u, &tmp, sizeof(tmp));
+                  break;
+               }
+               case GLSL_TYPE_INT:
+               case GLSL_TYPE_SAMPLER:
+               case GLSL_TYPE_IMAGE: {
+                  int64_t tmp = MAX2(src[sidx].i, 0);
+                  memcpy(&dst[didx].u, &tmp, sizeof(tmp));
+                  break;
+               }
+               case GLSL_TYPE_BOOL: {
+                  int64_t tmp = src[sidx].i ? 1.0f : 0.0f;
+                  memcpy(&dst[didx].u, &tmp, sizeof(tmp));
+                  break;
+               }
+               case GLSL_TYPE_INT64: {
+                  uint64_t i64;
+                  memcpy(&i64, &src[sidx].i, sizeof(i64));
+                  uint64_t tmp = MAX2(i64, 0);
+                  memcpy(&dst[didx].u, &tmp, sizeof(tmp));
+                  break;
+               }
+               case GLSL_TYPE_FLOAT: {
+                  uint64_t tmp = src[sidx].f < 0.0f ?
+                     0ull : (uint64_t) roundf(src[sidx].f);
+                  memcpy(&dst[didx].u, &tmp, sizeof(tmp));
+                  break;
+               }
+               case GLSL_TYPE_DOUBLE: {
+                  double d;
+                  memcpy(&d, &src[sidx].f, sizeof(d));
+                  uint64_t tmp = (d < 0.0) ? 0ull : (uint64_t) round(d);
+                  memcpy(&dst[didx].u, &tmp, sizeof(tmp));
+                  break;
+               }
+               default:
+                  assert(!"Should not get here.");
+                  break;
+               }
+               break;
+
+            default:
+               assert(!"Should not get here.");
+               break;
+            }
+         }
       }
    }
 }
@@ -648,10 +757,8 @@
 {
    unsigned i;
 
-   /* vector_elements and matrix_columns can be 0 for samplers.
-    */
-   const unsigned components = MAX2(1, uni->type->vector_elements);
-   const unsigned vectors = MAX2(1, uni->type->matrix_columns);
+   const unsigned components = uni->type->vector_elements;
+   const unsigned vectors = uni->type->matrix_columns;
    const int dmul = uni->type->is_64bit() ? 2 : 1;
 
    /* Store the data in the driver's requested type in the driver's storage
@@ -782,35 +889,28 @@
 }
 
 
-/**
- * Called via glUniform*() functions.
- */
-extern "C" void
-_mesa_uniform(GLint location, GLsizei count, const GLvoid *values,
-              struct gl_context *ctx, struct gl_shader_program *shProg,
-              enum glsl_base_type basicType, unsigned src_components)
+static struct gl_uniform_storage *
+validate_uniform(GLint location, GLsizei count, const GLvoid *values,
+                 unsigned *offset, struct gl_context *ctx,
+                 struct gl_shader_program *shProg,
+                 enum glsl_base_type basicType, unsigned src_components)
 {
-   unsigned offset;
-   int size_mul = glsl_base_type_is_64bit(basicType) ? 2 : 1;
-
    struct gl_uniform_storage *const uni =
-      validate_uniform_parameters(location, count, &offset,
+      validate_uniform_parameters(location, count, offset,
                                   ctx, shProg, "glUniform");
    if (uni == NULL)
-      return;
+      return NULL;
 
    if (uni->type->is_matrix()) {
       /* Can't set matrix uniforms (like mat4) with glUniform */
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glUniform%u(uniform \"%s\"@%d is matrix)",
                   src_components, uni->name, location);
-      return;
+      return NULL;
    }
 
-   /* Verify that the types are compatible.
-    */
-   const unsigned components = uni->type->is_sampler()
-      ? 1 : uni->type->vector_elements;
+   /* Verify that the types are compatible. */
+   const unsigned components = uni->type->vector_elements;
 
    if (components != src_components) {
       /* glUniformN() must match float/vecN type */
@@ -818,7 +918,7 @@
                   "glUniform%u(\"%s\"@%u has %u components, not %u)",
                   src_components, uni->name, location,
                   components, src_components);
-      return;
+      return NULL;
    }
 
    bool match;
@@ -843,12 +943,12 @@
                   src_components, uni->name, location,
                   glsl_type_name(uni->type->base_type),
                   glsl_type_name(basicType));
-      return;
+      return NULL;
    }
 
    if (unlikely(ctx->_Shader->Flags & GLSL_UNIFORMS)) {
       log_uniform(values, basicType, components, 1, count,
-		  false, shProg, location, uni);
+                  false, shProg, location, uni);
    }
 
    /* Page 100 (page 116 of the PDF) of the OpenGL 3.0 spec says:
@@ -870,15 +970,14 @@
     */
    if (uni->type->is_sampler()) {
       for (int i = 0; i < count; i++) {
-	 const unsigned texUnit = ((unsigned *) values)[i];
+         const unsigned texUnit = ((unsigned *) values)[i];
 
          /* check that the sampler (tex unit index) is legal */
          if (texUnit >= ctx->Const.MaxCombinedTextureImageUnits) {
             _mesa_error(ctx, GL_INVALID_VALUE,
                         "glUniform1i(invalid sampler/tex unit index for "
-			"uniform %d)",
-                        location);
-            return;
+                        "uniform %d)", location);
+            return NULL;
          }
       }
       /* We need to reset the validate flag on changes to samplers in case
@@ -896,11 +995,76 @@
             _mesa_error(ctx, GL_INVALID_VALUE,
                         "glUniform1i(invalid image unit index for uniform %d)",
                         location);
-            return;
+            return NULL;
          }
       }
    }
 
+   return uni;
+}
+
+void
+_mesa_flush_vertices_for_uniforms(struct gl_context *ctx,
+                                  const struct gl_uniform_storage *uni)
+{
+   /* Opaque uniforms have no storage unless they are bindless */
+   if (!uni->is_bindless && uni->type->contains_opaque()) {
+      FLUSH_VERTICES(ctx, 0);
+      return;
+   }
+
+   uint64_t new_driver_state = 0;
+   unsigned mask = uni->active_shader_mask;
+
+   while (mask) {
+      unsigned index = u_bit_scan(&mask);
+
+      assert(index < MESA_SHADER_STAGES);
+      new_driver_state |= ctx->DriverFlags.NewShaderConstants[index];
+   }
+
+   FLUSH_VERTICES(ctx, new_driver_state ? 0 : _NEW_PROGRAM_CONSTANTS);
+   ctx->NewDriverState |= new_driver_state;
+}
+
+/**
+ * Called via glUniform*() functions.
+ */
+extern "C" void
+_mesa_uniform(GLint location, GLsizei count, const GLvoid *values,
+              struct gl_context *ctx, struct gl_shader_program *shProg,
+              enum glsl_base_type basicType, unsigned src_components)
+{
+   unsigned offset;
+   int size_mul = glsl_base_type_is_64bit(basicType) ? 2 : 1;
+
+   struct gl_uniform_storage *uni;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      /* From Section 7.6 (UNIFORM VARIABLES) of the OpenGL 4.5 spec:
+       *
+       *   "If the value of location is -1, the Uniform* commands will
+       *   silently ignore the data passed in, and the current uniform values
+       *   will not be changed."
+       */
+      if (location == -1)
+         return;
+
+      uni = shProg->UniformRemapTable[location];
+
+      /* The array index specified by the uniform location is just the
+       * uniform location minus the base location of the uniform.
+       */
+      assert(uni->array_elements > 0 || location == (int)uni->remap_location);
+      offset = location - uni->remap_location;
+   } else {
+      uni = validate_uniform(location, count, values, &offset, ctx, shProg,
+                             basicType, src_components);
+      if (!uni)
+         return;
+   }
+
+   const unsigned components = uni->type->vector_elements;
+
    /* Page 82 (page 96 of the PDF) of the OpenGL 2.1 spec says:
     *
     *     "When loading N elements starting at an arbitrary position k in a
@@ -916,25 +1080,39 @@
       count = MIN2(count, (int) (uni->array_elements - offset));
    }
 
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
+   /* We check samplers for changes and flush if needed in the sampler
+    * handling code further down, so just skip them here.
+    */
+   if (!uni->type->is_sampler()) {
+       _mesa_flush_vertices_for_uniforms(ctx, uni);
+   }
 
    /* Store the data in the "actual type" backing storage for the uniform.
     */
-   if (!uni->type->is_boolean()) {
+   if (!uni->type->is_boolean() && !uni->is_bindless) {
       memcpy(&uni->storage[size_mul * components * offset], values,
-	     sizeof(uni->storage[0]) * components * count * size_mul);
+             sizeof(uni->storage[0]) * components * count * size_mul);
+   } else if (uni->is_bindless) {
+      const union gl_constant_value *src =
+         (const union gl_constant_value *) values;
+      GLuint64 *dst = (GLuint64 *)&uni->storage[components * offset].i;
+      const unsigned elems = components * count;
+
+      for (unsigned i = 0; i < elems; i++) {
+         dst[i] = src[i].i;
+      }
    } else {
       const union gl_constant_value *src =
-	 (const union gl_constant_value *) values;
+         (const union gl_constant_value *) values;
       union gl_constant_value *dst = &uni->storage[components * offset];
       const unsigned elems = components * count;
 
       for (unsigned i = 0; i < elems; i++) {
-	 if (basicType == GLSL_TYPE_FLOAT) {
+         if (basicType == GLSL_TYPE_FLOAT) {
             dst[i].i = src[i].f != 0.0f ? ctx->Const.UniformBooleanTrue : 0;
-	 } else {
+         } else {
             dst[i].i = src[i].i != 0    ? ctx->Const.UniformBooleanTrue : 0;
-	 }
+         }
       }
    }
 
@@ -945,35 +1123,52 @@
     */
    if (uni->type->is_sampler()) {
       bool flushed = false;
+
       shProg->SamplersValidated = GL_TRUE;
 
       for (int i = 0; i < MESA_SHADER_STAGES; i++) {
-	 struct gl_linked_shader *const sh = shProg->_LinkedShaders[i];
+         struct gl_linked_shader *const sh = shProg->_LinkedShaders[i];
 
-	 /* If the shader stage doesn't use the sampler uniform, skip this. */
-	 if (!uni->opaque[i].active)
-	    continue;
+         /* If the shader stage doesn't use the sampler uniform, skip this. */
+         if (!uni->opaque[i].active)
+            continue;
 
          bool changed = false;
          for (int j = 0; j < count; j++) {
             unsigned unit = uni->opaque[i].index + offset + j;
-            if (sh->Program->SamplerUnits[unit] != ((unsigned *) values)[j]) {
-               sh->Program->SamplerUnits[unit] = ((unsigned *) values)[j];
-               changed = true;
+            unsigned value = ((unsigned *)values)[j];
+
+            if (uni->is_bindless) {
+               struct gl_bindless_sampler *sampler =
+                  &sh->Program->sh.BindlessSamplers[unit];
+
+               /* Mark this bindless sampler as bound to a texture unit.
+                */
+               if (sampler->unit != value || !sampler->bound) {
+                  sampler->unit = value;
+                  changed = true;
+               }
+               sampler->bound = true;
+               sh->Program->sh.HasBoundBindlessSampler = true;
+            } else {
+               if (sh->Program->SamplerUnits[unit] != value) {
+                  sh->Program->SamplerUnits[unit] = value;
+                  changed = true;
+               }
             }
          }
 
-	 if (changed) {
-	    if (!flushed) {
-	       FLUSH_VERTICES(ctx, _NEW_TEXTURE_OBJECT | _NEW_PROGRAM);
-	       flushed = true;
-	    }
+         if (changed) {
+            if (!flushed) {
+               FLUSH_VERTICES(ctx, _NEW_TEXTURE_OBJECT | _NEW_PROGRAM);
+               flushed = true;
+            }
 
             struct gl_program *const prog = sh->Program;
-	    _mesa_update_shader_textures_used(shProg, prog);
+            _mesa_update_shader_textures_used(shProg, prog);
             if (ctx->Driver.SamplerUniformChange)
-	       ctx->Driver.SamplerUniformChange(ctx, prog->Target, prog);
-	 }
+               ctx->Driver.SamplerUniformChange(ctx, prog->Target, prog);
+         }
       }
    }
 
@@ -982,12 +1177,28 @@
     */
    if (uni->type->is_image()) {
       for (int i = 0; i < MESA_SHADER_STAGES; i++) {
-	 if (uni->opaque[i].active) {
-            struct gl_linked_shader *sh = shProg->_LinkedShaders[i];
+         struct gl_linked_shader *sh = shProg->_LinkedShaders[i];
 
-            for (int j = 0; j < count; j++)
-               sh->Program->sh.ImageUnits[uni->opaque[i].index + offset + j] =
-                  ((GLint *) values)[j];
+         /* If the shader stage doesn't use the image uniform, skip this. */
+         if (!uni->opaque[i].active)
+            continue;
+
+         for (int j = 0; j < count; j++) {
+            unsigned unit = uni->opaque[i].index + offset + j;
+            unsigned value = ((unsigned *)values)[j];
+
+            if (uni->is_bindless) {
+               struct gl_bindless_image *image =
+                  &sh->Program->sh.BindlessImages[unit];
+
+               /* Mark this bindless image as bound to an image unit.
+                */
+               image->unit = value;
+               image->bound = true;
+               sh->Program->sh.HasBoundBindlessImage = true;
+            } else {
+               sh->Program->sh.ImageUnits[unit] = value;
+            }
          }
       }
 
@@ -1090,7 +1301,7 @@
       count = MIN2(count, (int) (uni->array_elements - offset));
    }
 
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
+   _mesa_flush_vertices_for_uniforms(ctx, uni);
 
    /* Store the data in the "actual type" backing storage for the uniform.
     */
@@ -1135,6 +1346,170 @@
    _mesa_propagate_uniforms_to_driver_storage(uni, offset, count);
 }
 
+static void
+update_bound_bindless_sampler_flag(struct gl_program *prog)
+{
+   unsigned i;
+
+   if (likely(!prog->sh.HasBoundBindlessSampler))
+      return;
+
+   for (i = 0; i < prog->sh.NumBindlessSamplers; i++) {
+      struct gl_bindless_sampler *sampler = &prog->sh.BindlessSamplers[i];
+
+      if (sampler->bound)
+         return;
+   }
+   prog->sh.HasBoundBindlessSampler = false;
+}
+
+static void
+update_bound_bindless_image_flag(struct gl_program *prog)
+{
+   unsigned i;
+
+   if (likely(!prog->sh.HasBoundBindlessImage))
+      return;
+
+   for (i = 0; i < prog->sh.NumBindlessImages; i++) {
+      struct gl_bindless_image *image = &prog->sh.BindlessImages[i];
+
+      if (image->bound)
+         return;
+   }
+   prog->sh.HasBoundBindlessImage = false;
+}
+
+/**
+ * Called via glUniformHandleui64*ARB() functions.
+ */
+extern "C" void
+_mesa_uniform_handle(GLint location, GLsizei count, const GLvoid *values,
+                     struct gl_context *ctx, struct gl_shader_program *shProg)
+{
+   unsigned offset;
+   struct gl_uniform_storage *uni;
+
+   if (_mesa_is_no_error_enabled(ctx)) {
+      /* From Section 7.6 (UNIFORM VARIABLES) of the OpenGL 4.5 spec:
+       *
+       *   "If the value of location is -1, the Uniform* commands will
+       *   silently ignore the data passed in, and the current uniform values
+       *   will not be changed."
+       */
+      if (location == -1)
+         return;
+
+      uni = shProg->UniformRemapTable[location];
+
+      /* The array index specified by the uniform location is just the
+       * uniform location minus the base location of the uniform.
+       */
+      assert(uni->array_elements > 0 || location == (int)uni->remap_location);
+      offset = location - uni->remap_location;
+   } else {
+      uni = validate_uniform_parameters(location, count, &offset,
+                                        ctx, shProg, "glUniformHandleui64*ARB");
+      if (!uni)
+         return;
+
+      if (!uni->is_bindless) {
+         /* From section "Errors" of the ARB_bindless_texture spec:
+          *
+          * "The error INVALID_OPERATION is generated by
+          *  UniformHandleui64{v}ARB if the sampler or image uniform being
+          *  updated has the "bound_sampler" or "bound_image" layout qualifier."
+          *
+          * From section 4.4.6 of the ARB_bindless_texture spec:
+          *
+          * "In the absence of these qualifiers, sampler and image uniforms are
+          *  considered "bound". Additionally, if GL_ARB_bindless_texture is
+          *  not enabled, these uniforms are considered "bound"."
+          */
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glUniformHandleui64*ARB(non-bindless sampler/image uniform)");
+         return;
+      }
+   }
+
+   const unsigned components = uni->type->vector_elements;
+   const int size_mul = 2;
+
+   if (unlikely(ctx->_Shader->Flags & GLSL_UNIFORMS)) {
+      log_uniform(values, GLSL_TYPE_UINT64, components, 1, count,
+                  false, shProg, location, uni);
+   }
+
+   /* Page 82 (page 96 of the PDF) of the OpenGL 2.1 spec says:
+    *
+    *     "When loading N elements starting at an arbitrary position k in a
+    *     uniform declared as an array, elements k through k + N - 1 in the
+    *     array will be replaced with the new values. Values for any array
+    *     element that exceeds the highest array element index used, as
+    *     reported by GetActiveUniform, will be ignored by the GL."
+    *
+    * Clamp 'count' to a valid value.  Note that for non-arrays a count > 1
+    * will have already generated an error.
+    */
+   if (uni->array_elements != 0) {
+      count = MIN2(count, (int) (uni->array_elements - offset));
+   }
+
+   _mesa_flush_vertices_for_uniforms(ctx, uni);
+
+   /* Store the data in the "actual type" backing storage for the uniform.
+    */
+   memcpy(&uni->storage[size_mul * components * offset], values,
+          sizeof(uni->storage[0]) * components * count * size_mul);
+
+   _mesa_propagate_uniforms_to_driver_storage(uni, offset, count);
+
+   if (uni->type->is_sampler()) {
+      /* Mark this bindless sampler as not bound to a texture unit because
+       * it refers to a texture handle.
+       */
+      for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+         struct gl_linked_shader *const sh = shProg->_LinkedShaders[i];
+
+         /* If the shader stage doesn't use the sampler uniform, skip this. */
+         if (!uni->opaque[i].active)
+            continue;
+
+         for (int j = 0; j < count; j++) {
+            unsigned unit = uni->opaque[i].index + offset + j;
+            struct gl_bindless_sampler *sampler =
+               &sh->Program->sh.BindlessSamplers[unit];
+
+            sampler->bound = false;
+         }
+
+         update_bound_bindless_sampler_flag(sh->Program);
+      }
+   }
+
+   if (uni->type->is_image()) {
+      /* Mark this bindless image as not bound to an image unit because it
+       * refers to a texture handle.
+       */
+      for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+         struct gl_linked_shader *sh = shProg->_LinkedShaders[i];
+
+         /* If the shader stage doesn't use the image uniform, skip this. */
+         if (!uni->opaque[i].active)
+            continue;
+
+         for (int j = 0; j < count; j++) {
+            unsigned unit = uni->opaque[i].index + offset + j;
+            struct gl_bindless_image *image =
+               &sh->Program->sh.BindlessImages[unit];
+
+            image->bound = false;
+         }
+
+         update_bound_bindless_image_flag(sh->Program);
+      }
+   }
+}
 
 extern "C" bool
 _mesa_sampler_uniforms_are_valid(const struct gl_shader_program *shProg,
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index 8869b6e..ddfe906 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -63,6 +63,40 @@
  * TEXTURE_2D_INDEX, TEXTURE_3D_INDEX, etc.
  * We'll use that info for state validation before rendering.
  */
+static inline void
+update_single_shader_texture_used(struct gl_shader_program *shProg,
+                                  struct gl_program *prog,
+                                  GLuint unit, GLuint target)
+{
+   gl_shader_stage prog_stage =
+      _mesa_program_enum_to_shader_stage(prog->Target);
+
+   assert(unit < ARRAY_SIZE(prog->TexturesUsed));
+   assert(target < NUM_TEXTURE_TARGETS);
+
+   /* From section 7.10 (Samplers) of the OpenGL 4.5 spec:
+    *
+    * "It is not allowed to have variables of different sampler types pointing
+    *  to the same texture image unit within a program object."
+    */
+   unsigned stages_mask = shProg->data->linked_stages;
+   while (stages_mask) {
+      const int stage = u_bit_scan(&stages_mask);
+
+      /* Skip validation if we are yet to update textures used in this
+       * stage.
+       */
+      if (prog_stage < stage)
+         break;
+
+      struct gl_program *glprog = shProg->_LinkedShaders[stage]->Program;
+      if (glprog->TexturesUsed[unit] & ~(1 << target))
+         shProg->SamplersValidated = GL_FALSE;
+   }
+
+   prog->TexturesUsed[unit] |= (1 << target);
+}
+
 void
 _mesa_update_shader_textures_used(struct gl_shader_program *shProg,
                                   struct gl_program *prog)
@@ -70,43 +104,34 @@
    GLbitfield mask = prog->SamplersUsed;
    gl_shader_stage prog_stage =
       _mesa_program_enum_to_shader_stage(prog->Target);
-   struct gl_linked_shader *shader = shProg->_LinkedShaders[prog_stage];
+   MAYBE_UNUSED struct gl_linked_shader *shader =
+      shProg->_LinkedShaders[prog_stage];
+   GLuint s;
 
    assert(shader);
 
    memset(prog->TexturesUsed, 0, sizeof(prog->TexturesUsed));
 
    while (mask) {
-      const int s = u_bit_scan(&mask);
-      GLuint unit = prog->SamplerUnits[s];
-      GLuint tgt = prog->sh.SamplerTargets[s];
-      assert(unit < ARRAY_SIZE(prog->TexturesUsed));
-      assert(tgt < NUM_TEXTURE_TARGETS);
+      s = u_bit_scan(&mask);
 
-      /* The types of the samplers associated with a particular texture
-       * unit must be an exact match.  Page 74 (page 89 of the PDF) of the
-       * OpenGL 3.3 core spec says:
-       *
-       *     "It is not allowed to have variables of different sampler
-       *     types pointing to the same texture image unit within a program
-       *     object."
+      update_single_shader_texture_used(shProg, prog,
+                                        prog->SamplerUnits[s],
+                                        prog->sh.SamplerTargets[s]);
+   }
+
+   if (unlikely(prog->sh.HasBoundBindlessSampler)) {
+      /* Loop over bindless samplers bound to texture units.
        */
-      unsigned stages_mask = shProg->data->linked_stages;
-      while (stages_mask) {
-         const int stage = u_bit_scan(&stages_mask);
+      for (s = 0; s < prog->sh.NumBindlessSamplers; s++) {
+         struct gl_bindless_sampler *sampler = &prog->sh.BindlessSamplers[s];
 
-         /* Skip validation if we are yet to update textures used in this
-          * stage.
-          */
-         if (prog_stage < stage)
-            break;
+         if (!sampler->bound)
+            continue;
 
-         struct gl_program *glprog = shProg->_LinkedShaders[stage]->Program;
-         if (glprog->TexturesUsed[unit] & ~(1 << tgt))
-            shProg->SamplersValidated = GL_FALSE;
+         update_single_shader_texture_used(shProg, prog, sampler->unit,
+                                           sampler->target);
       }
-
-      prog->TexturesUsed[unit] |= (1 << tgt);
    }
 }
 
@@ -294,6 +319,23 @@
    _mesa_uniform(location, count, value, ctx, ctx->_Shader->ActiveProgram, GLSL_TYPE_INT, 4);
 }
 
+void GLAPIENTRY
+_mesa_UniformHandleui64ARB(GLint location, GLuint64 value)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_uniform_handle(location, 1, &value, ctx, ctx->_Shader->ActiveProgram);
+}
+
+void GLAPIENTRY
+_mesa_UniformHandleui64vARB(GLint location, GLsizei count,
+                            const GLuint64 *value)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_uniform_handle(location, count, value, ctx,
+                        ctx->_Shader->ActiveProgram);
+}
+
+
 /** Same as above with direct state access **/
 void GLAPIENTRY
 _mesa_ProgramUniform1f(GLuint program, GLint location, GLfloat v0)
@@ -485,6 +527,28 @@
    _mesa_uniform(location, count, value, ctx, shProg, GLSL_TYPE_INT, 4);
 }
 
+void GLAPIENTRY
+_mesa_ProgramUniformHandleui64ARB(GLuint program, GLint location,
+                                  GLuint64 value)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_shader_program *shProg =
+      _mesa_lookup_shader_program_err(ctx, program,
+            "glProgramUniformHandleui64ARB");
+   _mesa_uniform_handle(location, 1, &value, ctx, shProg);
+}
+
+void GLAPIENTRY
+_mesa_ProgramUniformHandleui64vARB(GLuint program, GLint location,
+                                   GLsizei count, const GLuint64 *values)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_shader_program *shProg =
+      _mesa_lookup_shader_program_err(ctx, program,
+            "glProgramUniformHandleui64vARB");
+   _mesa_uniform_handle(location, count, values, ctx, shProg);
+}
+
 
 /** OpenGL 3.0 GLuint-valued functions **/
 void GLAPIENTRY
@@ -961,6 +1025,17 @@
    return _mesa_program_resource_location(shProg, GL_UNIFORM, name);
 }
 
+GLint GLAPIENTRY
+_mesa_GetUniformLocation_no_error(GLuint programObj, const GLcharARB *name)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_shader_program *shProg =
+      _mesa_lookup_shader_program(ctx, programObj);
+
+   return _mesa_program_resource_location(shProg, GL_UNIFORM, name);
+}
+
 GLuint GLAPIENTRY
 _mesa_GetUniformBlockIndex(GLuint program,
 			   const GLchar *uniformBlockName)
@@ -1021,6 +1096,31 @@
    }
 }
 
+static void
+uniform_block_binding(struct gl_context *ctx, struct gl_shader_program *shProg,
+                      GLuint uniformBlockIndex, GLuint uniformBlockBinding)
+{
+   if (shProg->data->UniformBlocks[uniformBlockIndex].Binding !=
+       uniformBlockBinding) {
+
+      FLUSH_VERTICES(ctx, 0);
+      ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
+
+      shProg->data->UniformBlocks[uniformBlockIndex].Binding =
+         uniformBlockBinding;
+   }
+}
+
+void GLAPIENTRY
+_mesa_UniformBlockBinding_no_error(GLuint program, GLuint uniformBlockIndex,
+                                   GLuint uniformBlockBinding)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_shader_program *shProg = _mesa_lookup_shader_program(ctx, program);
+   uniform_block_binding(ctx, shProg, uniformBlockIndex, uniformBlockBinding);
+}
+
 void GLAPIENTRY
 _mesa_UniformBlockBinding(GLuint program,
 			  GLuint uniformBlockIndex,
@@ -1053,18 +1153,39 @@
       return;
    }
 
-   if (shProg->data->UniformBlocks[uniformBlockIndex].Binding !=
-       uniformBlockBinding) {
+   uniform_block_binding(ctx, shProg, uniformBlockIndex, uniformBlockBinding);
+}
+
+static void
+shader_storage_block_binding(struct gl_context *ctx,
+                             struct gl_shader_program *shProg,
+                             GLuint shaderStorageBlockIndex,
+                             GLuint shaderStorageBlockBinding)
+{
+   if (shProg->data->ShaderStorageBlocks[shaderStorageBlockIndex].Binding !=
+       shaderStorageBlockBinding) {
 
       FLUSH_VERTICES(ctx, 0);
-      ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
+      ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
 
-      shProg->data->UniformBlocks[uniformBlockIndex].Binding =
-         uniformBlockBinding;
+      shProg->data->ShaderStorageBlocks[shaderStorageBlockIndex].Binding =
+         shaderStorageBlockBinding;
    }
 }
 
 void GLAPIENTRY
+_mesa_ShaderStorageBlockBinding_no_error(GLuint program,
+                                         GLuint shaderStorageBlockIndex,
+                                         GLuint shaderStorageBlockBinding)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_shader_program *shProg = _mesa_lookup_shader_program(ctx, program);
+   shader_storage_block_binding(ctx, shProg, shaderStorageBlockIndex,
+                                shaderStorageBlockBinding);
+}
+
+void GLAPIENTRY
 _mesa_ShaderStorageBlockBinding(GLuint program,
 			        GLuint shaderStorageBlockIndex,
 			        GLuint shaderStorageBlockBinding)
@@ -1098,15 +1219,8 @@
       return;
    }
 
-   if (shProg->data->ShaderStorageBlocks[shaderStorageBlockIndex].Binding !=
-       shaderStorageBlockBinding) {
-
-      FLUSH_VERTICES(ctx, 0);
-      ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
-
-      shProg->data->ShaderStorageBlocks[shaderStorageBlockIndex].Binding =
-         shaderStorageBlockBinding;
-   }
+   shader_storage_block_binding(ctx, shProg, shaderStorageBlockIndex,
+                                shaderStorageBlockBinding);
 }
 
 /**
diff --git a/src/mesa/main/uniforms.h b/src/mesa/main/uniforms.h
index 0656b3a..9236db9 100644
--- a/src/mesa/main/uniforms.h
+++ b/src/mesa/main/uniforms.h
@@ -113,6 +113,18 @@
                          const GLfloat *value);
 
 void GLAPIENTRY
+_mesa_UniformHandleui64ARB(GLint location, GLuint64 value);
+void GLAPIENTRY
+_mesa_UniformHandleui64vARB(GLint location, GLsizei count,
+                            const GLuint64 *value);
+void GLAPIENTRY
+_mesa_ProgramUniformHandleui64ARB(GLuint program, GLint location,
+                                  GLuint64 value);
+void GLAPIENTRY
+_mesa_ProgramUniformHandleui64vARB(GLuint program, GLint location,
+                                   GLsizei count, const GLuint64 *values);
+
+void GLAPIENTRY
 _mesa_ProgramUniform1f(GLuint program, GLint, GLfloat);
 void GLAPIENTRY
 _mesa_ProgramUniform2f(GLuint program, GLint, GLfloat, GLfloat);
@@ -212,6 +224,8 @@
 _mesa_GetUniformdv(GLuint, GLint, GLdouble *);
 GLint GLAPIENTRY
 _mesa_GetUniformLocation(GLuint, const GLcharARB *);
+GLint GLAPIENTRY
+_mesa_GetUniformLocation_no_error(GLuint, const GLcharARB *);
 GLuint GLAPIENTRY
 _mesa_GetUniformBlockIndex(GLuint program,
 			   const GLchar *uniformBlockName);
@@ -220,10 +234,21 @@
 			GLsizei uniformCount,
 			const GLchar * const *uniformNames,
 			GLuint *uniformIndices);
+
+void GLAPIENTRY
+_mesa_UniformBlockBinding_no_error(GLuint program, GLuint uniformBlockIndex,
+                                   GLuint uniformBlockBinding);
+
 void GLAPIENTRY
 _mesa_UniformBlockBinding(GLuint program,
 			  GLuint uniformBlockIndex,
 			  GLuint uniformBlockBinding);
+
+void GLAPIENTRY
+_mesa_ShaderStorageBlockBinding_no_error(GLuint program,
+                                         GLuint shaderStorageBlockIndex,
+                                         GLuint shaderStorageBlockBinding);
+
 void GLAPIENTRY
 _mesa_ShaderStorageBlockBinding(GLuint program,
                                 GLuint shaderStorageBlockIndex,
@@ -441,6 +466,10 @@
                      GLuint cols, GLuint rows, enum glsl_base_type basicType);
 
 void
+_mesa_uniform_handle(GLint location, GLsizei count, const GLvoid *values,
+                     struct gl_context *, struct gl_shader_program *);
+
+void
 _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location,
 		  GLsizei bufSize, enum glsl_base_type returnType,
 		  GLvoid *paramsOut);
@@ -470,8 +499,9 @@
 extern bool
 _mesa_sampler_uniforms_pipeline_are_valid(struct gl_pipeline_object *);
 
-extern const struct gl_program_parameter *
-get_uniform_parameter(struct gl_shader_program *shProg, GLint index);
+extern void
+_mesa_flush_vertices_for_uniforms(struct gl_context *ctx,
+                                  const struct gl_uniform_storage *uni);
 
 struct gl_builtin_uniform_element {
    const char *field;
diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c
index 2054985..c67396d 100644
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -258,6 +258,24 @@
    return legalTypesMask;
 }
 
+static GLenum
+get_array_format(const struct gl_context *ctx, GLint sizeMax, GLint *size)
+{
+   GLenum format = GL_RGBA;
+
+   /* Do size parameter checking.
+    * If sizeMax = BGRA_OR_4 it means that size = GL_BGRA is legal and
+    * must be handled specially.
+    */
+   if (ctx->Extensions.EXT_vertex_array_bgra && sizeMax == BGRA_OR_4 &&
+       *size == GL_BGRA) {
+      format = GL_BGRA;
+      *size = 4;
+   }
+
+   return format;
+}
+
 
 /**
  * \param attrib         The index of the attribute array
@@ -302,9 +320,9 @@
 }
 
 /**
- * Does error checking and updates the format in an attrib array.
+ * Does error checking of the format in an attrib array.
  *
- * Called by update_array() and VertexAttrib*Format().
+ * Called by *Pointer() and VertexAttrib*Format().
  *
  * \param func         Name of calling function used for error reporting
  * \param attrib       The index of the attribute array
@@ -317,19 +335,18 @@
  * \param integer      Integer-valued values (will not be normalized to [-1, 1])
  * \param doubles      Double values not reduced to floats
  * \param relativeOffset Offset of the first element relative to the binding offset.
+ * \return bool True if validation is successful, False otherwise.
  */
 static bool
-update_array_format(struct gl_context *ctx,
-                    const char *func,
-                    struct gl_vertex_array_object *vao,
-                    GLuint attrib, GLbitfield legalTypesMask,
-                    GLint sizeMin, GLint sizeMax,
-                    GLint size, GLenum type,
-                    GLboolean normalized, GLboolean integer, GLboolean doubles,
-                    GLuint relativeOffset)
+validate_array_format(struct gl_context *ctx, const char *func,
+                      struct gl_vertex_array_object *vao,
+                      GLuint attrib, GLbitfield legalTypesMask,
+                      GLint sizeMin, GLint sizeMax,
+                      GLint size, GLenum type, GLboolean normalized,
+                      GLboolean integer, GLboolean doubles,
+                      GLuint relativeOffset, GLenum format)
 {
    GLbitfield typeBit;
-   GLenum format = GL_RGBA;
 
    /* at most, one of these bools can be true */
    assert((int) normalized + (int) integer + (int) doubles <= 1);
@@ -359,13 +376,7 @@
       return false;
    }
 
-   /* Do size parameter checking.
-    * If sizeMax = BGRA_OR_4 it means that size = GL_BGRA is legal and
-    * must be handled specially.
-    */
-   if (ctx->Extensions.EXT_vertex_array_bgra &&
-       sizeMax == BGRA_OR_4 &&
-       size == GL_BGRA) {
+   if (format == GL_BGRA) {
       /* Page 298 of the PDF of the OpenGL 4.3 (Core Profile) spec says:
        *
        * "An INVALID_OPERATION error is generated under any of the following
@@ -397,9 +408,6 @@
                      "%s(size=GL_BGRA and normalized=GL_FALSE)", func);
          return false;
       }
-
-      format = GL_BGRA;
-      size = 4;
    }
    else if (size < sizeMin || size > sizeMax || size > 4) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(size=%d)", func, size);
@@ -432,16 +440,11 @@
       return false;
    }
 
-   _mesa_update_array_format(ctx, vao, attrib, size, type, format,
-                             normalized, integer, doubles, relativeOffset);
-
    return true;
 }
 
-
 /**
- * Do error checking and update state for glVertex/Color/TexCoord/...Pointer
- * functions.
+ * Do error checking for glVertex/Color/TexCoord/...Pointer functions.
  *
  * \param func  name of calling function used for error reporting
  * \param attrib  the attribute array index to update
@@ -457,17 +460,14 @@
  * \param ptr  the address (or offset inside VBO) of the array data
  */
 static void
-update_array(struct gl_context *ctx,
-             const char *func,
-             GLuint attrib, GLbitfield legalTypesMask,
-             GLint sizeMin, GLint sizeMax,
-             GLint size, GLenum type, GLsizei stride,
-             GLboolean normalized, GLboolean integer, GLboolean doubles,
-             const GLvoid *ptr)
+validate_array(struct gl_context *ctx, const char *func,
+               GLuint attrib, GLbitfield legalTypesMask,
+               GLint sizeMin, GLint sizeMax,
+               GLint size, GLenum type, GLsizei stride,
+               GLboolean normalized, GLboolean integer, GLboolean doubles,
+               const GLvoid *ptr)
 {
    struct gl_vertex_array_object *vao = ctx->Array.VAO;
-   struct gl_array_attributes *array;
-   GLsizei effectiveStride;
 
    /* Page 407 (page 423 of the PDF) of the OpenGL 3.0 spec says:
     *
@@ -509,38 +509,94 @@
     *       to the ARRAY_BUFFER buffer object binding point (see section
     *       2.9.6), and the pointer argument is not NULL."
     */
-   if (ptr != NULL && vao->ARBsemantics &&
+   if (ptr != NULL && vao != ctx->Array.DefaultVAO &&
        !_mesa_is_bufferobj(ctx->Array.ArrayBufferObj)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-VBO array)", func);
       return;
    }
+}
 
-   if (!update_array_format(ctx, func, vao, attrib,
-                            legalTypesMask, sizeMin, sizeMax,
-                            size, type, normalized, integer, doubles, 0)) {
-      return;
-   }
+
+static bool
+validate_array_and_format(struct gl_context *ctx, const char *func,
+                          GLuint attrib, GLbitfield legalTypes,
+                          GLint sizeMin, GLint sizeMax,
+                          GLint size, GLenum type, GLsizei stride,
+                          GLboolean normalized, GLboolean integer,
+                          GLboolean doubles, GLenum format, const GLvoid *ptr,
+                          struct gl_vertex_array_object *vao)
+{
+   validate_array(ctx, func, attrib, legalTypes, sizeMin, sizeMax, size,
+                  type, stride, normalized, integer, doubles, ptr);
+
+   return validate_array_format(ctx, func, vao, attrib, legalTypes, sizeMin,
+                                sizeMax, size, type, normalized, integer,
+                                doubles, 0, format);
+}
+
+
+/**
+ * Update state for glVertex/Color/TexCoord/...Pointer functions.
+ *
+ * \param attrib  the attribute array index to update
+ * \param format  Either GL_RGBA or GL_BGRA.
+ * \param sizeMax  max allowable size value (may also be BGRA_OR_4)
+ * \param size  components per element (1, 2, 3 or 4)
+ * \param type  datatype of each component (GL_FLOAT, GL_INT, etc)
+ * \param stride  stride between elements, in bytes
+ * \param normalized  are integer types converted to floats in [-1, 1]?
+ * \param integer  integer-valued values (will not be normalized to [-1,1])
+ * \param doubles  Double values not reduced to floats
+ * \param ptr  the address (or offset inside VBO) of the array data
+ */
+static void
+update_array(struct gl_context *ctx,
+             GLuint attrib, GLenum format,
+             GLint sizeMax,
+             GLint size, GLenum type, GLsizei stride,
+             GLboolean normalized, GLboolean integer, GLboolean doubles,
+             const GLvoid *ptr)
+{
+   struct gl_vertex_array_object *vao = ctx->Array.VAO;
+
+   _mesa_update_array_format(ctx, vao, attrib, size, type, format,
+                             normalized, integer, doubles, 0);
 
    /* Reset the vertex attrib binding */
    vertex_attrib_binding(ctx, vao, attrib, attrib);
 
    /* The Stride and Ptr fields are not set by update_array_format() */
-   array = &vao->VertexAttrib[attrib];
+   struct gl_array_attributes *array = &vao->VertexAttrib[attrib];
    array->Stride = stride;
    array->Ptr = ptr;
 
    /* Update the vertex buffer binding */
-   effectiveStride = stride != 0 ? stride : array->_ElementSize;
+   GLsizei effectiveStride = stride != 0 ? stride : array->_ElementSize;
    _mesa_bind_vertex_buffer(ctx, vao, attrib,
                             ctx->Array.ArrayBufferObj, (GLintptr) ptr,
                             effectiveStride);
 }
 
+void GLAPIENTRY
+_mesa_VertexPointer_no_error(GLint size, GLenum type, GLsizei stride,
+                             const GLvoid *ptr)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   FLUSH_VERTICES(ctx, 0);
+
+   update_array(ctx, VERT_ATTRIB_POS, GL_RGBA, 4, size, type, stride,
+                GL_FALSE, GL_FALSE, GL_FALSE, ptr);
+}
+
 
 void GLAPIENTRY
 _mesa_VertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   FLUSH_VERTICES(ctx, 0);
+
+   GLenum format = GL_RGBA;
    GLbitfield legalTypes = (ctx->API == API_OPENGLES)
       ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT)
       : (SHORT_BIT | INT_BIT | FLOAT_BIT |
@@ -548,11 +604,25 @@
          UNSIGNED_INT_2_10_10_10_REV_BIT |
          INT_2_10_10_10_REV_BIT);
 
+   if (!validate_array_and_format(ctx, "glVertexPointer", VERT_ATTRIB_POS,
+                                  legalTypes, 2, 4, size, type, stride,
+                                  GL_FALSE, GL_FALSE, GL_FALSE, format,
+                                  ptr, ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_POS, format, 4, size, type, stride,
+                GL_FALSE, GL_FALSE, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_NormalPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr )
+{
+   GET_CURRENT_CONTEXT(ctx);
    FLUSH_VERTICES(ctx, 0);
 
-   update_array(ctx, "glVertexPointer", VERT_ATTRIB_POS,
-                legalTypes, 2, 4,
-                size, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr);
+   update_array(ctx, VERT_ATTRIB_NORMAL, GL_RGBA, 3, 3, type, stride, GL_TRUE,
+                GL_FALSE, GL_FALSE, ptr);
 }
 
 
@@ -560,6 +630,10 @@
 _mesa_NormalPointer(GLenum type, GLsizei stride, const GLvoid *ptr )
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   FLUSH_VERTICES(ctx, 0);
+
+   GLenum format = GL_RGBA;
    const GLbitfield legalTypes = (ctx->API == API_OPENGLES)
       ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT)
       : (BYTE_BIT | SHORT_BIT | INT_BIT |
@@ -567,11 +641,27 @@
          UNSIGNED_INT_2_10_10_10_REV_BIT |
          INT_2_10_10_10_REV_BIT);
 
+   if (!validate_array_and_format(ctx, "glNormalPointer",
+                                  VERT_ATTRIB_NORMAL, legalTypes, 3, 3, 3,
+                                  type, stride, GL_TRUE, GL_FALSE,
+                                  GL_FALSE, format, ptr, ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_NORMAL, format, 3, 3, type, stride, GL_TRUE,
+                GL_FALSE, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_ColorPointer_no_error(GLint size, GLenum type, GLsizei stride,
+                            const GLvoid *ptr)
+{
+   GET_CURRENT_CONTEXT(ctx);
    FLUSH_VERTICES(ctx, 0);
 
-   update_array(ctx, "glNormalPointer", VERT_ATTRIB_NORMAL,
-                legalTypes, 3, 3,
-                3, type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr);
+   GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
+   update_array(ctx, VERT_ATTRIB_COLOR0, format, BGRA_OR_4, size,
+                type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr);
 }
 
 
@@ -579,6 +669,11 @@
 _mesa_ColorPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
+   const GLint sizeMin = (ctx->API == API_OPENGLES) ? 4 : 3;
+
+   FLUSH_VERTICES(ctx, 0);
+
+   GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
    const GLbitfield legalTypes = (ctx->API == API_OPENGLES)
       ? (UNSIGNED_BYTE_BIT | HALF_BIT | FLOAT_BIT | FIXED_ES_BIT)
       : (BYTE_BIT | UNSIGNED_BYTE_BIT |
@@ -587,42 +682,95 @@
          HALF_BIT | FLOAT_BIT | DOUBLE_BIT |
          UNSIGNED_INT_2_10_10_10_REV_BIT |
          INT_2_10_10_10_REV_BIT);
-   const GLint sizeMin = (ctx->API == API_OPENGLES) ? 4 : 3;
 
+   if (!validate_array_and_format(ctx, "glColorPointer",
+                                  VERT_ATTRIB_COLOR0, legalTypes, sizeMin,
+                                  BGRA_OR_4, size, type, stride, GL_TRUE,
+                                  GL_FALSE, GL_FALSE, format, ptr,
+                                  ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_COLOR0, format, BGRA_OR_4, size,
+                type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_FogCoordPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr)
+{
+   GET_CURRENT_CONTEXT(ctx);
    FLUSH_VERTICES(ctx, 0);
 
-   update_array(ctx, "glColorPointer", VERT_ATTRIB_COLOR0,
-                legalTypes, sizeMin, BGRA_OR_4,
-                size, type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr);
+   update_array(ctx, VERT_ATTRIB_FOG, GL_RGBA, 1, 1, type, stride, GL_FALSE,
+                GL_FALSE, GL_FALSE, ptr);
 }
 
 
 void GLAPIENTRY
 _mesa_FogCoordPointer(GLenum type, GLsizei stride, const GLvoid *ptr)
 {
-   const GLbitfield legalTypes = (HALF_BIT | FLOAT_BIT | DOUBLE_BIT);
    GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
-   update_array(ctx, "glFogCoordPointer", VERT_ATTRIB_FOG,
-                legalTypes, 1, 1,
-                1, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr);
+   GLenum format = GL_RGBA;
+   const GLbitfield legalTypes = (HALF_BIT | FLOAT_BIT | DOUBLE_BIT);
+
+   if (!validate_array_and_format(ctx, "glFogCoordPointer",
+                                  VERT_ATTRIB_FOG, legalTypes, 1, 1, 1,
+                                  type, stride, GL_FALSE, GL_FALSE,
+                                  GL_FALSE, format, ptr, ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_FOG, format, 1, 1, type, stride, GL_FALSE,
+                GL_FALSE, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_IndexPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   FLUSH_VERTICES(ctx, 0);
+
+   update_array(ctx, VERT_ATTRIB_COLOR_INDEX, GL_RGBA, 1, 1, type, stride,
+                GL_FALSE, GL_FALSE, GL_FALSE, ptr);
 }
 
 
 void GLAPIENTRY
 _mesa_IndexPointer(GLenum type, GLsizei stride, const GLvoid *ptr)
 {
-   const GLbitfield legalTypes = (UNSIGNED_BYTE_BIT | SHORT_BIT | INT_BIT |
-                                  FLOAT_BIT | DOUBLE_BIT);
    GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
-   update_array(ctx, "glIndexPointer", VERT_ATTRIB_COLOR_INDEX,
-                legalTypes, 1, 1,
-                1, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr);
+   GLenum format = GL_RGBA;
+   const GLbitfield legalTypes = (UNSIGNED_BYTE_BIT | SHORT_BIT | INT_BIT |
+                                     FLOAT_BIT | DOUBLE_BIT);
+
+   if (!validate_array_and_format(ctx, "glIndexPointer",
+                                  VERT_ATTRIB_COLOR_INDEX,
+                                  legalTypes, 1, 1, 1, type, stride,
+                                  GL_FALSE, GL_FALSE, GL_FALSE, format,
+                                  ptr, ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_COLOR_INDEX, format, 1, 1, type, stride,
+                GL_FALSE, GL_FALSE, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_SecondaryColorPointer_no_error(GLint size, GLenum type,
+                                     GLsizei stride, const GLvoid *ptr)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   FLUSH_VERTICES(ctx, 0);
+
+   GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
+   update_array(ctx, VERT_ATTRIB_COLOR1, format, BGRA_OR_4, size, type,
+                stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr);
 }
 
 
@@ -630,19 +778,40 @@
 _mesa_SecondaryColorPointer(GLint size, GLenum type,
 			       GLsizei stride, const GLvoid *ptr)
 {
+   GET_CURRENT_CONTEXT(ctx);
+
+   FLUSH_VERTICES(ctx, 0);
+
+   GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
    const GLbitfield legalTypes = (BYTE_BIT | UNSIGNED_BYTE_BIT |
                                   SHORT_BIT | UNSIGNED_SHORT_BIT |
                                   INT_BIT | UNSIGNED_INT_BIT |
                                   HALF_BIT | FLOAT_BIT | DOUBLE_BIT |
                                   UNSIGNED_INT_2_10_10_10_REV_BIT |
                                   INT_2_10_10_10_REV_BIT);
-   GET_CURRENT_CONTEXT(ctx);
 
+   if (!validate_array_and_format(ctx, "glSecondaryColorPointer",
+                                  VERT_ATTRIB_COLOR1, legalTypes, 3,
+                                  BGRA_OR_4, size, type, stride,
+                                  GL_TRUE, GL_FALSE, GL_FALSE, format, ptr,
+                                  ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_COLOR1, format, BGRA_OR_4, size, type,
+                stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_TexCoordPointer_no_error(GLint size, GLenum type, GLsizei stride,
+                               const GLvoid *ptr)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const GLuint unit = ctx->Array.ActiveTexture;
    FLUSH_VERTICES(ctx, 0);
 
-   update_array(ctx, "glSecondaryColorPointer", VERT_ATTRIB_COLOR1,
-                legalTypes, 3, BGRA_OR_4,
-                size, type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr);
+   update_array(ctx, VERT_ATTRIB_TEX(unit), GL_RGBA, 4, size, type,
+                stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr);
 }
 
 
@@ -651,57 +820,117 @@
                       const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
-   GLbitfield legalTypes = (ctx->API == API_OPENGLES)
-      ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT)
-      : (SHORT_BIT | INT_BIT |
-         HALF_BIT | FLOAT_BIT | DOUBLE_BIT |
-         UNSIGNED_INT_2_10_10_10_REV_BIT |
-         INT_2_10_10_10_REV_BIT);
    const GLint sizeMin = (ctx->API == API_OPENGLES) ? 2 : 1;
    const GLuint unit = ctx->Array.ActiveTexture;
 
    FLUSH_VERTICES(ctx, 0);
 
-   update_array(ctx, "glTexCoordPointer", VERT_ATTRIB_TEX(unit),
-                legalTypes, sizeMin, 4,
-                size, type, stride, GL_FALSE, GL_FALSE, GL_FALSE,
-                ptr);
+   GLenum format = GL_RGBA;
+   const GLbitfield legalTypes = (ctx->API == API_OPENGLES)
+      ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT)
+      : (SHORT_BIT | INT_BIT |
+         HALF_BIT | FLOAT_BIT | DOUBLE_BIT |
+         UNSIGNED_INT_2_10_10_10_REV_BIT |
+         INT_2_10_10_10_REV_BIT);
+
+   if (!validate_array_and_format(ctx, "glTexCoordPointer",
+                                  VERT_ATTRIB_TEX(unit), legalTypes,
+                                  sizeMin, 4, size, type, stride,
+                                  GL_FALSE, GL_FALSE, GL_FALSE, format, ptr,
+                                  ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_TEX(unit), format, 4, size, type,
+                stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_EdgeFlagPointer_no_error(GLsizei stride, const GLvoid *ptr)
+{
+   /* this is the same type that glEdgeFlag uses */
+   const GLboolean integer = GL_FALSE;
+   GET_CURRENT_CONTEXT(ctx);
+   FLUSH_VERTICES(ctx, 0);
+
+   update_array(ctx, VERT_ATTRIB_EDGEFLAG, GL_RGBA, 1, 1, GL_UNSIGNED_BYTE,
+                stride, GL_FALSE, integer, GL_FALSE, ptr);
 }
 
 
 void GLAPIENTRY
 _mesa_EdgeFlagPointer(GLsizei stride, const GLvoid *ptr)
 {
-   const GLbitfield legalTypes = UNSIGNED_BYTE_BIT;
    /* this is the same type that glEdgeFlag uses */
    const GLboolean integer = GL_FALSE;
    GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
-   update_array(ctx, "glEdgeFlagPointer", VERT_ATTRIB_EDGEFLAG,
-                legalTypes, 1, 1,
-                1, GL_UNSIGNED_BYTE, stride, GL_FALSE, integer, GL_FALSE, ptr);
+   GLenum format = GL_RGBA;
+   const GLbitfield legalTypes = UNSIGNED_BYTE_BIT;
+
+   if (!validate_array_and_format(ctx, "glEdgeFlagPointer",
+                                  VERT_ATTRIB_EDGEFLAG, legalTypes,
+                                  1, 1, 1, GL_UNSIGNED_BYTE, stride,
+                                  GL_FALSE, integer, GL_FALSE, format, ptr,
+                                  ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_EDGEFLAG, format, 1, 1, GL_UNSIGNED_BYTE,
+                stride, GL_FALSE, integer, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_PointSizePointerOES_no_error(GLenum type, GLsizei stride,
+                                   const GLvoid *ptr)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   FLUSH_VERTICES(ctx, 0);
+
+   update_array(ctx, VERT_ATTRIB_POINT_SIZE, GL_RGBA, 1, 1, type, stride,
+                GL_FALSE, GL_FALSE, GL_FALSE, ptr);
 }
 
 
 void GLAPIENTRY
 _mesa_PointSizePointerOES(GLenum type, GLsizei stride, const GLvoid *ptr)
 {
-   const GLbitfield legalTypes = (FLOAT_BIT | FIXED_ES_BIT);
    GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
+   GLenum format = GL_RGBA;
    if (ctx->API != API_OPENGLES) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glPointSizePointer(ES 1.x only)");
       return;
    }
 
-   update_array(ctx, "glPointSizePointer", VERT_ATTRIB_POINT_SIZE,
-                legalTypes, 1, 1,
-                1, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr);
+   const GLbitfield legalTypes = (FLOAT_BIT | FIXED_ES_BIT);
+
+   if (!validate_array_and_format(ctx, "glPointSizePointer",
+                                  VERT_ATTRIB_POINT_SIZE, legalTypes,
+                                  1, 1, 1, type, stride, GL_FALSE, GL_FALSE,
+                                  GL_FALSE, format, ptr, ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_POINT_SIZE, format, 1, 1, type, stride,
+                GL_FALSE, GL_FALSE, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_VertexAttribPointer_no_error(GLuint index, GLint size, GLenum type,
+                                   GLboolean normalized,
+                                   GLsizei stride, const GLvoid *ptr)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
+   update_array(ctx, VERT_ATTRIB_GENERIC(index), format, BGRA_OR_4,
+                size, type, stride, normalized, GL_FALSE, GL_FALSE, ptr);
 }
 
 
@@ -715,6 +944,14 @@
                              GLboolean normalized,
                              GLsizei stride, const GLvoid *ptr)
 {
+   GET_CURRENT_CONTEXT(ctx);
+
+   GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
+   if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glVertexAttribPointerARB(idx)");
+      return;
+   }
+
    const GLbitfield legalTypes = (BYTE_BIT | UNSIGNED_BYTE_BIT |
                                   SHORT_BIT | UNSIGNED_SHORT_BIT |
                                   INT_BIT | UNSIGNED_INT_BIT |
@@ -723,16 +960,29 @@
                                   UNSIGNED_INT_2_10_10_10_REV_BIT |
                                   INT_2_10_10_10_REV_BIT |
                                   UNSIGNED_INT_10F_11F_11F_REV_BIT);
+
+   if (!validate_array_and_format(ctx, "glVertexAttribPointer",
+                                  VERT_ATTRIB_GENERIC(index), legalTypes,
+                                  1, BGRA_OR_4, size, type, stride,
+                                  normalized, GL_FALSE, GL_FALSE, format,
+                                  ptr, ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_GENERIC(index), format, BGRA_OR_4,
+                size, type, stride, normalized, GL_FALSE, GL_FALSE, ptr);
+}
+
+
+void GLAPIENTRY
+_mesa_VertexAttribIPointer_no_error(GLuint index, GLint size, GLenum type,
+                                    GLsizei stride, const GLvoid *ptr)
+{
+   const GLboolean normalized = GL_FALSE;
+   const GLboolean integer = GL_TRUE;
    GET_CURRENT_CONTEXT(ctx);
 
-   if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glVertexAttribPointerARB(index)");
-      return;
-   }
-
-   update_array(ctx, "glVertexAttribPointer", VERT_ATTRIB_GENERIC(index),
-                legalTypes, 1, BGRA_OR_4,
-                size, type, stride, normalized, GL_FALSE, GL_FALSE, ptr);
+   update_array(ctx, VERT_ATTRIB_GENERIC(index), GL_RGBA, 4,  size, type,
+                stride, normalized, integer, GL_FALSE, ptr);
 }
 
 
@@ -746,37 +996,66 @@
 _mesa_VertexAttribIPointer(GLuint index, GLint size, GLenum type,
                            GLsizei stride, const GLvoid *ptr)
 {
-   const GLbitfield legalTypes = (BYTE_BIT | UNSIGNED_BYTE_BIT |
-                                  SHORT_BIT | UNSIGNED_SHORT_BIT |
-                                  INT_BIT | UNSIGNED_INT_BIT);
    const GLboolean normalized = GL_FALSE;
    const GLboolean integer = GL_TRUE;
    GET_CURRENT_CONTEXT(ctx);
 
+   GLenum format = GL_RGBA;
    if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glVertexAttribIPointer(index)");
       return;
    }
 
-   update_array(ctx, "glVertexAttribIPointer", VERT_ATTRIB_GENERIC(index),
-                legalTypes, 1, 4,
-                size, type, stride, normalized, integer, GL_FALSE, ptr);
+   const GLbitfield legalTypes = (BYTE_BIT | UNSIGNED_BYTE_BIT |
+                                  SHORT_BIT | UNSIGNED_SHORT_BIT |
+                                  INT_BIT | UNSIGNED_INT_BIT);
+
+   if (!validate_array_and_format(ctx, "glVertexAttribIPointer",
+                                  VERT_ATTRIB_GENERIC(index), legalTypes,
+                                  1, 4, size, type, stride,
+                                  normalized, integer, GL_FALSE, format,
+                                  ptr, ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_GENERIC(index), format, 4,  size, type,
+                stride, normalized, integer, GL_FALSE, ptr);
 }
 
+
+void GLAPIENTRY
+_mesa_VertexAttribLPointer_no_error(GLuint index, GLint size, GLenum type,
+                                    GLsizei stride, const GLvoid *ptr)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   update_array(ctx, VERT_ATTRIB_GENERIC(index), GL_RGBA, 4, size, type,
+                stride, GL_FALSE, GL_FALSE, GL_TRUE, ptr);
+}
+
+
 void GLAPIENTRY
 _mesa_VertexAttribLPointer(GLuint index, GLint size, GLenum type,
                            GLsizei stride, const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
-   const GLbitfield legalTypes = (DOUBLE_BIT);
+
+   GLenum format = GL_RGBA;
    if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glVertexAttribLPointer(index)");
       return;
    }
 
-   update_array(ctx, "glVertexAttribLPointer", VERT_ATTRIB_GENERIC(index),
-                legalTypes, 1, 4,
-                size, type, stride, GL_FALSE, GL_FALSE, GL_TRUE, ptr);
+   const GLbitfield legalTypes = DOUBLE_BIT;
+
+   if (!validate_array_and_format(ctx, "glVertexAttribLPointer",
+                                  VERT_ATTRIB_GENERIC(index), legalTypes,
+                                  1, 4, size, type, stride,
+                                  GL_FALSE, GL_FALSE, GL_TRUE, format,
+                                  ptr, ctx->Array.VAO))
+      return;
+
+   update_array(ctx, VERT_ATTRIB_GENERIC(index), format, 4, size, type,
+                stride, GL_FALSE, GL_FALSE, GL_TRUE, ptr);
 }
 
 
@@ -821,6 +1100,15 @@
 
 
 void GLAPIENTRY
+_mesa_EnableVertexAttribArray_no_error(GLuint index)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_enable_vertex_array_attrib(ctx, ctx->Array.VAO,
+                                    VERT_ATTRIB_GENERIC(index));
+}
+
+
+void GLAPIENTRY
 _mesa_EnableVertexArrayAttrib(GLuint vaobj, GLuint index)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -841,17 +1129,20 @@
 }
 
 
+void GLAPIENTRY
+_mesa_EnableVertexArrayAttrib_no_error(GLuint vaobj, GLuint index)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_vertex_array_object *vao = _mesa_lookup_vao(ctx, vaobj);
+   _mesa_enable_vertex_array_attrib(ctx, vao, VERT_ATTRIB_GENERIC(index));
+}
+
+
 static void
 disable_vertex_array_attrib(struct gl_context *ctx,
                             struct gl_vertex_array_object *vao,
-                            GLuint index,
-                            const char *func)
+                            GLuint index)
 {
-   if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s(index)", func);
-      return;
-   }
-
    assert(VERT_ATTRIB_GENERIC(index) < ARRAY_SIZE(vao->VertexAttrib));
 
    if (vao->VertexAttrib[VERT_ATTRIB_GENERIC(index)].Enabled) {
@@ -868,8 +1159,21 @@
 _mesa_DisableVertexAttribArray(GLuint index)
 {
    GET_CURRENT_CONTEXT(ctx);
-   disable_vertex_array_attrib(ctx, ctx->Array.VAO, index,
-                               "glDisableVertexAttribArray");
+
+   if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glDisableVertexAttribArray(index)");
+      return;
+   }
+
+   disable_vertex_array_attrib(ctx, ctx->Array.VAO, index);
+}
+
+
+void GLAPIENTRY
+_mesa_DisableVertexAttribArray_no_error(GLuint index)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   disable_vertex_array_attrib(ctx, ctx->Array.VAO, index);
 }
 
 
@@ -890,7 +1194,21 @@
    if (!vao)
       return;
 
-   disable_vertex_array_attrib(ctx, vao, index, "glDisableVertexArrayAttrib");
+   if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glDisableVertexArrayAttrib(index)");
+      return;
+   }
+
+   disable_vertex_array_attrib(ctx, vao, index);
+}
+
+
+void GLAPIENTRY
+_mesa_DisableVertexArrayAttrib_no_error(GLuint vaobj, GLuint index)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_vertex_array_object *vao = _mesa_lookup_vao(ctx, vaobj);
+   disable_vertex_array_attrib(ctx, vao, index);
 }
 
 
@@ -1074,6 +1392,29 @@
    }
 }
 
+void GLAPIENTRY
+_mesa_GetVertexAttribLui64vARB(GLuint index, GLenum pname, GLuint64EXT *params)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (pname == GL_CURRENT_VERTEX_ATTRIB_ARB) {
+      const GLuint64 *v =
+         (const GLuint64 *)get_current_attrib(ctx, index,
+                                              "glGetVertexAttribLui64vARB");
+      if (v != NULL) {
+         params[0] = v[0];
+         params[1] = v[1];
+         params[2] = v[2];
+         params[3] = v[3];
+      }
+   }
+   else {
+      params[0] = (GLuint64) get_vertex_array_attrib(ctx, ctx->Array.VAO,
+                                                     index, pname,
+                                                     "glGetVertexAttribLui64vARB");
+   }
+}
+
 
 /** GL 3.0 */
 void GLAPIENTRY
@@ -1596,12 +1937,38 @@
    }
 
    if (ctx->Array.RestartIndex != index) {
-      FLUSH_VERTICES(ctx, _NEW_TRANSFORM);
+      FLUSH_VERTICES(ctx, 0);
       ctx->Array.RestartIndex = index;
    }
 }
 
 
+void GLAPIENTRY
+_mesa_VertexAttribDivisor_no_error(GLuint index, GLuint divisor)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   const GLuint genericIndex = VERT_ATTRIB_GENERIC(index);
+   struct gl_vertex_array_object * const vao = ctx->Array.VAO;
+
+   assert(genericIndex < ARRAY_SIZE(vao->VertexAttrib));
+
+   /* The ARB_vertex_attrib_binding spec says:
+    *
+    *    "The command
+    *
+    *       void VertexAttribDivisor(uint index, uint divisor);
+    *
+    *     is equivalent to (assuming no errors are generated):
+    *
+    *       VertexAttribBinding(index, index);
+    *       VertexBindingDivisor(index, divisor);"
+    */
+   vertex_attrib_binding(ctx, vao, genericIndex, genericIndex);
+   vertex_binding_divisor(ctx, vao, genericIndex, divisor);
+}
+
+
 /**
  * See GL_ARB_instanced_arrays.
  * Note that the instance divisor only applies to generic arrays, not
@@ -1621,8 +1988,8 @@
    }
 
    if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glVertexAttribDivisor(index = %u)",
-                  index);
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "glVertexAttribDivisor(index = %u)", index);
       return;
    }
 
@@ -1644,28 +2011,46 @@
 }
 
 
-unsigned
-_mesa_primitive_restart_index(const struct gl_context *ctx, GLenum ib_type)
+static ALWAYS_INLINE void
+vertex_array_vertex_buffer(struct gl_context *ctx,
+                           struct gl_vertex_array_object *vao,
+                           GLuint bindingIndex, GLuint buffer, GLintptr offset,
+                           GLsizei stride, bool no_error, const char *func)
 {
-   /* From the OpenGL 4.3 core specification, page 302:
-    * "If both PRIMITIVE_RESTART and PRIMITIVE_RESTART_FIXED_INDEX are
-    *  enabled, the index value determined by PRIMITIVE_RESTART_FIXED_INDEX
-    *  is used."
-    */
-   if (ctx->Array.PrimitiveRestartFixedIndex) {
-      switch (ib_type) {
-      case GL_UNSIGNED_BYTE:
-         return 0xff;
-      case GL_UNSIGNED_SHORT:
-         return 0xffff;
-      case GL_UNSIGNED_INT:
-         return 0xffffffff;
-      default:
-         assert(!"_mesa_primitive_restart_index: Invalid index buffer type.");
+   struct gl_buffer_object *vbo;
+   if (buffer ==
+       vao->BufferBinding[VERT_ATTRIB_GENERIC(bindingIndex)].BufferObj->Name) {
+      vbo = vao->BufferBinding[VERT_ATTRIB_GENERIC(bindingIndex)].BufferObj;
+   } else if (buffer != 0) {
+      vbo = _mesa_lookup_bufferobj(ctx, buffer);
+
+      if (!no_error && !vbo && _mesa_is_gles31(ctx)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-gen name)", func);
+         return;
       }
+      /* From the GL_ARB_vertex_attrib_array spec:
+       *
+       *   "[Core profile only:]
+       *    An INVALID_OPERATION error is generated if buffer is not zero or a
+       *    name returned from a previous call to GenBuffers, or if such a name
+       *    has since been deleted with DeleteBuffers.
+       *
+       * Otherwise, we fall back to the same compat profile behavior as other
+       * object references (automatically gen it).
+       */
+      if (!_mesa_handle_bind_buffer_gen(ctx, buffer, &vbo, func))
+         return;
+   } else {
+      /* The ARB_vertex_attrib_binding spec says:
+       *
+       *    "If <buffer> is zero, any buffer object attached to this
+       *     bindpoint is detached."
+       */
+      vbo = ctx->Shared->NullBufferObj;
    }
 
-   return ctx->Array.RestartIndex;
+   _mesa_bind_vertex_buffer(ctx, vao, VERT_ATTRIB_GENERIC(bindingIndex),
+                            vbo, offset, stride);
 }
 
 
@@ -1673,13 +2058,12 @@
  * GL_ARB_vertex_attrib_binding
  */
 static void
-vertex_array_vertex_buffer(struct gl_context *ctx,
-                           struct gl_vertex_array_object *vao,
-                           GLuint bindingIndex, GLuint buffer, GLintptr offset,
-                           GLsizei stride, const char *func)
+vertex_array_vertex_buffer_err(struct gl_context *ctx,
+                               struct gl_vertex_array_object *vao,
+                               GLuint bindingIndex, GLuint buffer,
+                               GLintptr offset, GLsizei stride,
+                               const char *func)
 {
-   struct gl_buffer_object *vbo;
-
    ASSERT_OUTSIDE_BEGIN_END(ctx);
 
    /* The ARB_vertex_attrib_binding spec says:
@@ -1720,39 +2104,19 @@
       return;
    }
 
-   if (buffer ==
-       vao->BufferBinding[VERT_ATTRIB_GENERIC(bindingIndex)].BufferObj->Name) {
-      vbo = vao->BufferBinding[VERT_ATTRIB_GENERIC(bindingIndex)].BufferObj;
-   } else if (buffer != 0) {
-      vbo = _mesa_lookup_bufferobj(ctx, buffer);
+   vertex_array_vertex_buffer(ctx, vao, bindingIndex, buffer, offset,
+                              stride, false, func);
+}
 
-      if (!vbo && _mesa_is_gles31(ctx)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-gen name)", func);
-         return;
-      }
-      /* From the GL_ARB_vertex_attrib_array spec:
-       *
-       *   "[Core profile only:]
-       *    An INVALID_OPERATION error is generated if buffer is not zero or a
-       *    name returned from a previous call to GenBuffers, or if such a name
-       *    has since been deleted with DeleteBuffers.
-       *
-       * Otherwise, we fall back to the same compat profile behavior as other
-       * object references (automatically gen it).
-       */
-      if (!_mesa_handle_bind_buffer_gen(ctx, buffer, &vbo, func))
-         return;
-   } else {
-      /* The ARB_vertex_attrib_binding spec says:
-       *
-       *    "If <buffer> is zero, any buffer object attached to this
-       *     bindpoint is detached."
-       */
-      vbo = ctx->Shared->NullBufferObj;
-   }
 
-   _mesa_bind_vertex_buffer(ctx, vao, VERT_ATTRIB_GENERIC(bindingIndex),
-                            vbo, offset, stride);
+void GLAPIENTRY
+_mesa_BindVertexBuffer_no_error(GLuint bindingIndex, GLuint buffer,
+                                GLintptr offset, GLsizei stride)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   vertex_array_vertex_buffer(ctx, ctx->Array.VAO, bindingIndex,
+                              buffer, offset, stride, true,
+                              "glBindVertexBuffer");
 }
 
 
@@ -1774,8 +2138,22 @@
       return;
    }
 
-   vertex_array_vertex_buffer(ctx, ctx->Array.VAO, bindingIndex,
-                              buffer, offset, stride, "glBindVertexBuffer");
+   vertex_array_vertex_buffer_err(ctx, ctx->Array.VAO, bindingIndex,
+                                  buffer, offset, stride,
+                                  "glBindVertexBuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_VertexArrayVertexBuffer_no_error(GLuint vaobj, GLuint bindingIndex,
+                                       GLuint buffer, GLintptr offset,
+                                       GLsizei stride)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_vertex_array_object *vao = _mesa_lookup_vao(ctx, vaobj);
+   vertex_array_vertex_buffer(ctx, vao, bindingIndex, buffer, offset,
+                              stride, true, "glVertexArrayVertexBuffer");
 }
 
 
@@ -1796,35 +2174,19 @@
    if (!vao)
       return;
 
-   vertex_array_vertex_buffer(ctx, vao, bindingIndex,
-                              buffer, offset, stride,
-                              "glVertexArrayVertexBuffer");
+   vertex_array_vertex_buffer_err(ctx, vao, bindingIndex, buffer, offset,
+                                  stride, "glVertexArrayVertexBuffer");
 }
 
 
-static void
+static ALWAYS_INLINE void
 vertex_array_vertex_buffers(struct gl_context *ctx,
                             struct gl_vertex_array_object *vao,
                             GLuint first, GLsizei count, const GLuint *buffers,
                             const GLintptr *offsets, const GLsizei *strides,
-                            const char *func)
+                            bool no_error, const char *func)
 {
-   GLuint i;
-
-   ASSERT_OUTSIDE_BEGIN_END(ctx);
-
-   /* The ARB_multi_bind spec says:
-    *
-    *    "An INVALID_OPERATION error is generated if <first> + <count>
-    *     is greater than the value of MAX_VERTEX_ATTRIB_BINDINGS."
-    */
-   if (first + count > ctx->Const.MaxVertexAttribBindings) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "%s(first=%u + count=%d > the value of "
-                  "GL_MAX_VERTEX_ATTRIB_BINDINGS=%u)",
-                  func, first, count, ctx->Const.MaxVertexAttribBindings);
-      return;
-   }
+   GLint i;
 
    if (!buffers) {
       /**
@@ -1869,31 +2231,33 @@
    for (i = 0; i < count; i++) {
       struct gl_buffer_object *vbo;
 
-      /* The ARB_multi_bind spec says:
-       *
-       *    "An INVALID_VALUE error is generated if any value in
-       *     <offsets> or <strides> is negative (per binding)."
-       */
-      if (offsets[i] < 0) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "%s(offsets[%u]=%" PRId64 " < 0)",
-                     func, i, (int64_t) offsets[i]);
-         continue;
-      }
+      if (!no_error) {
+         /* The ARB_multi_bind spec says:
+          *
+          *    "An INVALID_VALUE error is generated if any value in
+          *     <offsets> or <strides> is negative (per binding)."
+          */
+         if (offsets[i] < 0) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "%s(offsets[%u]=%" PRId64 " < 0)",
+                        func, i, (int64_t) offsets[i]);
+            continue;
+         }
 
-      if (strides[i] < 0) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "%s(strides[%u]=%d < 0)",
-                     func, i, strides[i]);
-         continue;
-      }
+         if (strides[i] < 0) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "%s(strides[%u]=%d < 0)",
+                        func, i, strides[i]);
+            continue;
+         }
 
-      if (ctx->API == API_OPENGL_CORE && ctx->Version >= 44 &&
-          strides[i] > ctx->Const.MaxVertexAttribStride) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "%s(strides[%u]=%d > "
-                     "GL_MAX_VERTEX_ATTRIB_STRIDE)", func, i, strides[i]);
-         continue;
+         if (ctx->API == API_OPENGL_CORE && ctx->Version >= 44 &&
+             strides[i] > ctx->Const.MaxVertexAttribStride) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "%s(strides[%u]=%d > "
+                        "GL_MAX_VERTEX_ATTRIB_STRIDE)", func, i, strides[i]);
+            continue;
+         }
       }
 
       if (buffers[i]) {
@@ -1919,6 +2283,46 @@
 }
 
 
+static void
+vertex_array_vertex_buffers_err(struct gl_context *ctx,
+                                struct gl_vertex_array_object *vao,
+                                GLuint first, GLsizei count,
+                                const GLuint *buffers, const GLintptr *offsets,
+                                const GLsizei *strides, const char *func)
+{
+   ASSERT_OUTSIDE_BEGIN_END(ctx);
+
+   /* The ARB_multi_bind spec says:
+    *
+    *    "An INVALID_OPERATION error is generated if <first> + <count>
+    *     is greater than the value of MAX_VERTEX_ATTRIB_BINDINGS."
+    */
+   if (first + count > ctx->Const.MaxVertexAttribBindings) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(first=%u + count=%d > the value of "
+                  "GL_MAX_VERTEX_ATTRIB_BINDINGS=%u)",
+                  func, first, count, ctx->Const.MaxVertexAttribBindings);
+      return;
+   }
+
+   vertex_array_vertex_buffers(ctx, vao, first, count, buffers, offsets,
+                               strides, false, func);
+}
+
+
+void GLAPIENTRY
+_mesa_BindVertexBuffers_no_error(GLuint first, GLsizei count,
+                                 const GLuint *buffers, const GLintptr *offsets,
+                                 const GLsizei *strides)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   vertex_array_vertex_buffers(ctx, ctx->Array.VAO, first, count,
+                               buffers, offsets, strides, true,
+                               "glBindVertexBuffers");
+}
+
+
 void GLAPIENTRY
 _mesa_BindVertexBuffers(GLuint first, GLsizei count, const GLuint *buffers,
                         const GLintptr *offsets, const GLsizei *strides)
@@ -1937,9 +2341,24 @@
       return;
    }
 
-   vertex_array_vertex_buffers(ctx, ctx->Array.VAO, first, count,
-                               buffers, offsets, strides,
-                               "glBindVertexBuffers");
+   vertex_array_vertex_buffers_err(ctx, ctx->Array.VAO, first, count,
+                                   buffers, offsets, strides,
+                                   "glBindVertexBuffers");
+}
+
+
+void GLAPIENTRY
+_mesa_VertexArrayVertexBuffers_no_error(GLuint vaobj, GLuint first,
+                                        GLsizei count, const GLuint *buffers,
+                                        const GLintptr *offsets,
+                                        const GLsizei *strides)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_vertex_array_object *vao = _mesa_lookup_vao(ctx, vaobj);
+   vertex_array_vertex_buffers(ctx, vao, first, count,
+                               buffers, offsets, strides, true,
+                               "glVertexArrayVertexBuffers");
 }
 
 
@@ -1961,9 +2380,9 @@
    if (!vao)
       return;
 
-   vertex_array_vertex_buffers(ctx, vao, first, count,
-                               buffers, offsets, strides,
-                               "glVertexArrayVertexBuffers");
+   vertex_array_vertex_buffers_err(ctx, vao, first, count,
+                                   buffers, offsets, strides,
+                                   "glVertexArrayVertexBuffers");
 }
 
 
@@ -1971,50 +2390,63 @@
 vertex_attrib_format(GLuint attribIndex, GLint size, GLenum type,
                      GLboolean normalized, GLboolean integer,
                      GLboolean doubles, GLbitfield legalTypes,
-                     GLsizei maxSize, GLuint relativeOffset,
+                     GLsizei sizeMax, GLuint relativeOffset,
                      const char *func)
 {
    GET_CURRENT_CONTEXT(ctx);
    ASSERT_OUTSIDE_BEGIN_END(ctx);
 
-   /* The ARB_vertex_attrib_binding spec says:
-    *
-    *    "An INVALID_OPERATION error is generated under any of the following
-    *     conditions:
-    *     - if no vertex array object is currently bound (see section 2.10);
-    *     - ..."
-    *
-    * This error condition only applies to VertexAttribFormat and
-    * VertexAttribIFormat in the extension spec, but we assume that this
-    * is an oversight.  In the OpenGL 4.3 (Core Profile) spec, it applies
-    * to all three functions.
-    */
-   if ((ctx->API == API_OPENGL_CORE || _mesa_is_gles31(ctx)) &&
-       ctx->Array.VAO == ctx->Array.DefaultVAO) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "%s(No array object bound)", func);
-      return;
-   }
+   GLenum format = get_array_format(ctx, sizeMax, &size);
 
-   /* The ARB_vertex_attrib_binding spec says:
-    *
-    *   "The error INVALID_VALUE is generated if index is greater than or equal
-    *     to the value of MAX_VERTEX_ATTRIBS."
-    */
-   if (attribIndex >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(attribindex=%u > "
-                  "GL_MAX_VERTEX_ATTRIBS)",
-                  func, attribIndex);
-      return;
+   if (!_mesa_is_no_error_enabled(ctx)) {
+      /* The ARB_vertex_attrib_binding spec says:
+       *
+       *    "An INVALID_OPERATION error is generated under any of the
+       *    following conditions:
+       *     - if no vertex array object is currently bound (see section
+       *       2.10);
+       *     - ..."
+       *
+       * This error condition only applies to VertexAttribFormat and
+       * VertexAttribIFormat in the extension spec, but we assume that this
+       * is an oversight.  In the OpenGL 4.3 (Core Profile) spec, it applies
+       * to all three functions.
+       */
+      if ((ctx->API == API_OPENGL_CORE || _mesa_is_gles31(ctx)) &&
+          ctx->Array.VAO == ctx->Array.DefaultVAO) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(No array object bound)", func);
+         return;
+      }
+
+      /* The ARB_vertex_attrib_binding spec says:
+       *
+       *   "The error INVALID_VALUE is generated if index is greater than or
+       *   equal to the value of MAX_VERTEX_ATTRIBS."
+       */
+      if (attribIndex >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(attribindex=%u > "
+                     "GL_MAX_VERTEX_ATTRIBS)",
+                     func, attribIndex);
+         return;
+      }
+
+      if (!validate_array_format(ctx, func, ctx->Array.VAO,
+                                 VERT_ATTRIB_GENERIC(attribIndex),
+                                 legalTypes, 1, sizeMax, size, type,
+                                 normalized, integer, doubles, relativeOffset,
+                                 format)) {
+         return;
+      }
    }
 
    FLUSH_VERTICES(ctx, 0);
 
-   update_array_format(ctx, func, ctx->Array.VAO,
-                       VERT_ATTRIB_GENERIC(attribIndex),
-                       legalTypes, 1, maxSize, size, type,
-                       normalized, integer, doubles, relativeOffset);
+   _mesa_update_array_format(ctx, ctx->Array.VAO,
+                             VERT_ATTRIB_GENERIC(attribIndex), size, type,
+                             format, normalized, integer, doubles,
+                             relativeOffset);
 }
 
 
@@ -2053,7 +2485,7 @@
 vertex_array_attrib_format(GLuint vaobj, GLuint attribIndex, GLint size,
                            GLenum type, GLboolean normalized,
                            GLboolean integer, GLboolean doubles,
-                           GLbitfield legalTypes, GLsizei maxSize,
+                           GLbitfield legalTypes, GLsizei sizeMax,
                            GLuint relativeOffset, const char *func)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -2061,34 +2493,49 @@
 
    ASSERT_OUTSIDE_BEGIN_END(ctx);
 
-   /* The ARB_direct_state_access spec says:
-    *
-    *   "An INVALID_OPERATION error is generated by VertexArrayAttrib*Format
-    *    if <vaobj> is not [compatibility profile: zero or] the name of an
-    *    existing vertex array object."
-    */
-   vao = _mesa_lookup_vao_err(ctx, vaobj, func);
-   if (!vao)
-      return;
+   GLenum format = get_array_format(ctx, sizeMax, &size);
 
-   /* The ARB_vertex_attrib_binding spec says:
-    *
-    *   "The error INVALID_VALUE is generated if index is greater than or equal
-    *    to the value of MAX_VERTEX_ATTRIBS."
-    */
-   if (attribIndex >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(attribindex=%u > GL_MAX_VERTEX_ATTRIBS)",
-                  func, attribIndex);
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      vao = _mesa_lookup_vao(ctx, vaobj);
+      if (!vao)
+         return;
+   } else {
+      /* The ARB_direct_state_access spec says:
+       *
+       *   "An INVALID_OPERATION error is generated by
+       *   VertexArrayAttrib*Format if <vaobj> is not [compatibility profile:
+       *   zero or] the name of an existing vertex array object."
+       */
+      vao = _mesa_lookup_vao_err(ctx, vaobj, func);
+      if (!vao)
+         return;
+
+      /* The ARB_vertex_attrib_binding spec says:
+       *
+       *   "The error INVALID_VALUE is generated if index is greater than or
+       *   equal to the value of MAX_VERTEX_ATTRIBS."
+       */
+      if (attribIndex >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(attribindex=%u > GL_MAX_VERTEX_ATTRIBS)",
+                     func, attribIndex);
+         return;
+      }
+
+      if (!validate_array_format(ctx, func, vao,
+                                 VERT_ATTRIB_GENERIC(attribIndex),
+                                 legalTypes, 1, sizeMax, size, type,
+                                 normalized, integer, doubles, relativeOffset,
+                                 format)) {
+         return;
+      }
    }
 
    FLUSH_VERTICES(ctx, 0);
 
-   update_array_format(ctx, func, vao,
-                       VERT_ATTRIB_GENERIC(attribIndex),
-                       legalTypes, 1, maxSize, size, type, normalized,
-                       integer, doubles, relativeOffset);
+   _mesa_update_array_format(ctx, vao, VERT_ATTRIB_GENERIC(attribIndex), size,
+                             type, format, normalized, integer, doubles,
+                             relativeOffset);
 }
 
 
@@ -2168,6 +2615,16 @@
 
 
 void GLAPIENTRY
+_mesa_VertexAttribBinding_no_error(GLuint attribIndex, GLuint bindingIndex)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   vertex_attrib_binding(ctx, ctx->Array.VAO,
+                         VERT_ATTRIB_GENERIC(attribIndex),
+                         VERT_ATTRIB_GENERIC(bindingIndex));
+}
+
+
+void GLAPIENTRY
 _mesa_VertexAttribBinding(GLuint attribIndex, GLuint bindingIndex)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -2191,6 +2648,19 @@
 
 
 void GLAPIENTRY
+_mesa_VertexArrayAttribBinding_no_error(GLuint vaobj, GLuint attribIndex,
+                                        GLuint bindingIndex)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   struct gl_vertex_array_object *vao = _mesa_lookup_vao(ctx, vaobj);
+   vertex_attrib_binding(ctx, vao,
+                         VERT_ATTRIB_GENERIC(attribIndex),
+                         VERT_ATTRIB_GENERIC(bindingIndex));
+}
+
+
+void GLAPIENTRY
 _mesa_VertexArrayAttribBinding(GLuint vaobj, GLuint attribIndex, GLuint bindingIndex)
 {
    GET_CURRENT_CONTEXT(ctx);
diff --git a/src/mesa/main/varray.h b/src/mesa/main/varray.h
index 9216571..ca49042 100644
--- a/src/mesa/main/varray.h
+++ b/src/mesa/main/varray.h
@@ -109,28 +109,41 @@
                          GLintptr offset, GLsizei stride);
 
 extern void GLAPIENTRY
+_mesa_VertexPointer_no_error(GLint size, GLenum type, GLsizei stride,
+                             const GLvoid *ptr);
+extern void GLAPIENTRY
 _mesa_VertexPointer(GLint size, GLenum type, GLsizei stride,
                     const GLvoid *ptr);
 
-
+extern void GLAPIENTRY
+_mesa_NormalPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr);
 extern void GLAPIENTRY
 _mesa_NormalPointer(GLenum type, GLsizei stride, const GLvoid *ptr);
 
-
+extern void GLAPIENTRY
+_mesa_ColorPointer_no_error(GLint size, GLenum type, GLsizei stride,
+                            const GLvoid *ptr);
 extern void GLAPIENTRY
 _mesa_ColorPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr);
 
 
 extern void GLAPIENTRY
+_mesa_IndexPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr);
+extern void GLAPIENTRY
 _mesa_IndexPointer(GLenum type, GLsizei stride, const GLvoid *ptr);
 
 
 extern void GLAPIENTRY
+_mesa_TexCoordPointer_no_error(GLint size, GLenum type, GLsizei stride,
+                               const GLvoid *ptr);
+extern void GLAPIENTRY
 _mesa_TexCoordPointer(GLint size, GLenum type, GLsizei stride,
                       const GLvoid *ptr);
 
 
 extern void GLAPIENTRY
+_mesa_EdgeFlagPointer_no_error(GLsizei stride, const GLvoid *ptr);
+extern void GLAPIENTRY
 _mesa_EdgeFlagPointer(GLsizei stride, const GLvoid *ptr);
 
 
@@ -162,48 +175,79 @@
 extern void GLAPIENTRY
 _mesa_EdgeFlagPointerEXT(GLsizei stride, GLsizei count, const GLboolean *ptr);
 
-
+extern void GLAPIENTRY
+_mesa_FogCoordPointer_no_error(GLenum type, GLsizei stride,
+                               const GLvoid *ptr);
 extern void GLAPIENTRY
 _mesa_FogCoordPointer(GLenum type, GLsizei stride, const GLvoid *ptr);
 
 
 extern void GLAPIENTRY
+_mesa_SecondaryColorPointer_no_error(GLint size, GLenum type,
+                                     GLsizei stride, const GLvoid *ptr);
+extern void GLAPIENTRY
 _mesa_SecondaryColorPointer(GLint size, GLenum type,
 			       GLsizei stride, const GLvoid *ptr);
 
 
 extern void GLAPIENTRY
+_mesa_PointSizePointerOES_no_error(GLenum type, GLsizei stride,
+                                   const GLvoid *ptr);
+extern void GLAPIENTRY
 _mesa_PointSizePointerOES(GLenum type, GLsizei stride, const GLvoid *ptr);
 
 
 extern void GLAPIENTRY
+_mesa_VertexAttribPointer_no_error(GLuint index, GLint size, GLenum type,
+                                   GLboolean normalized, GLsizei stride,
+                                   const GLvoid *pointer);
+extern void GLAPIENTRY
 _mesa_VertexAttribPointer(GLuint index, GLint size, GLenum type,
                              GLboolean normalized, GLsizei stride,
                              const GLvoid *pointer);
 
 void GLAPIENTRY
+_mesa_VertexAttribIPointer_no_error(GLuint index, GLint size, GLenum type,
+                                    GLsizei stride, const GLvoid *ptr);
+void GLAPIENTRY
 _mesa_VertexAttribIPointer(GLuint index, GLint size, GLenum type,
                            GLsizei stride, const GLvoid *ptr);
 
 extern void GLAPIENTRY
+_mesa_VertexAttribLPointer_no_error(GLuint index, GLint size, GLenum type,
+                                    GLsizei stride, const GLvoid *pointer);
+extern void GLAPIENTRY
 _mesa_VertexAttribLPointer(GLuint index, GLint size, GLenum type,
                            GLsizei stride, const GLvoid *pointer);
 
 extern void GLAPIENTRY
 _mesa_EnableVertexAttribArray(GLuint index);
 
+extern void GLAPIENTRY
+_mesa_EnableVertexAttribArray_no_error(GLuint index);
+
 
 extern void GLAPIENTRY
 _mesa_EnableVertexArrayAttrib(GLuint vaobj, GLuint index);
 
+extern void GLAPIENTRY
+_mesa_EnableVertexArrayAttrib_no_error(GLuint vaobj, GLuint index);
+
+
 
 extern void GLAPIENTRY
 _mesa_DisableVertexAttribArray(GLuint index);
 
+extern void GLAPIENTRY
+_mesa_DisableVertexAttribArray_no_error(GLuint index);
+
 
 extern void GLAPIENTRY
 _mesa_DisableVertexArrayAttrib(GLuint vaobj, GLuint index);
 
+extern void GLAPIENTRY
+_mesa_DisableVertexArrayAttrib_no_error(GLuint vaobj, GLuint index);
+
 
 extern void GLAPIENTRY
 _mesa_GetVertexAttribdv(GLuint index, GLenum pname, GLdouble *params);
@@ -217,6 +261,9 @@
 extern void GLAPIENTRY
 _mesa_GetVertexAttribiv(GLuint index, GLenum pname, GLint *params);
 
+extern void GLAPIENTRY
+_mesa_GetVertexAttribLui64vARB(GLuint index, GLenum pname, GLuint64EXT *params);
+
 
 extern void GLAPIENTRY
 _mesa_GetVertexAttribIiv(GLuint index, GLenum pname, GLint *params);
@@ -307,25 +354,58 @@
 extern void GLAPIENTRY
 _mesa_PrimitiveRestartIndex(GLuint index);
 
-
+extern void GLAPIENTRY
+_mesa_VertexAttribDivisor_no_error(GLuint index, GLuint divisor);
 extern void GLAPIENTRY
 _mesa_VertexAttribDivisor(GLuint index, GLuint divisor);
 
-extern unsigned
-_mesa_primitive_restart_index(const struct gl_context *ctx, GLenum ib_type);
+static inline unsigned
+_mesa_primitive_restart_index(const struct gl_context *ctx,
+                              unsigned index_size)
+{
+   /* From the OpenGL 4.3 core specification, page 302:
+    * "If both PRIMITIVE_RESTART and PRIMITIVE_RESTART_FIXED_INDEX are
+    *  enabled, the index value determined by PRIMITIVE_RESTART_FIXED_INDEX
+    *  is used."
+    */
+   if (ctx->Array.PrimitiveRestartFixedIndex) {
+      /* 1 -> 0xff, 2 -> 0xffff, 4 -> 0xffffffff */
+      return 0xffffffffu >> 8 * (4 - index_size);
+   }
 
+   return ctx->Array.RestartIndex;
+}
+
+extern void GLAPIENTRY
+_mesa_BindVertexBuffer_no_error(GLuint bindingIndex, GLuint buffer,
+                                GLintptr offset, GLsizei stride);
 extern void GLAPIENTRY
 _mesa_BindVertexBuffer(GLuint bindingIndex, GLuint buffer, GLintptr offset,
                        GLsizei stride);
 
+void GLAPIENTRY
+_mesa_VertexArrayVertexBuffer_no_error(GLuint vaobj, GLuint bindingIndex,
+                                       GLuint buffer, GLintptr offset,
+                                       GLsizei stride);
 extern void GLAPIENTRY
 _mesa_VertexArrayVertexBuffer(GLuint vaobj, GLuint bindingIndex, GLuint buffer,
                               GLintptr offset, GLsizei stride);
 
+void GLAPIENTRY
+_mesa_BindVertexBuffers_no_error(GLuint first, GLsizei count,
+                                 const GLuint *buffers, const GLintptr *offsets,
+                                 const GLsizei *strides);
+
 extern void GLAPIENTRY
 _mesa_BindVertexBuffers(GLuint first, GLsizei count, const GLuint *buffers,
                         const GLintptr *offsets, const GLsizei *strides);
 
+void GLAPIENTRY
+_mesa_VertexArrayVertexBuffers_no_error(GLuint vaobj, GLuint first,
+                                        GLsizei count, const GLuint *buffers,
+                                        const GLintptr *offsets,
+                                        const GLsizei *strides);
+
 extern void GLAPIENTRY
 _mesa_VertexArrayVertexBuffers(GLuint vaobj, GLuint first, GLsizei count,
                                const GLuint *buffers,
@@ -358,9 +438,16 @@
                                GLint size, GLenum type,
                                GLuint relativeOffset);
 
+void GLAPIENTRY
+_mesa_VertexAttribBinding_no_error(GLuint attribIndex, GLuint bindingIndex);
+
 extern void GLAPIENTRY
 _mesa_VertexAttribBinding(GLuint attribIndex, GLuint bindingIndex);
 
+void GLAPIENTRY
+_mesa_VertexArrayAttribBinding_no_error(GLuint vaobj, GLuint attribIndex,
+                                        GLuint bindingIndex);
+
 extern void GLAPIENTRY
 _mesa_VertexArrayAttribBinding(GLuint vaobj, GLuint attribIndex,
                                GLuint bindingIndex);
diff --git a/src/mesa/main/vdpau.c b/src/mesa/main/vdpau.c
index 44be3a3..051071b 100644
--- a/src/mesa/main/vdpau.c
+++ b/src/mesa/main/vdpau.c
@@ -145,11 +145,11 @@
    surf->output = isOutput;
    for (i = 0; i < numTextureNames; ++i) {
       struct gl_texture_object *tex;
-      tex  = _mesa_lookup_texture(ctx, textureNames[i]);
+
+      tex = _mesa_lookup_texture_err(ctx, textureNames[i],
+                                     "VDPAURegisterSurfaceNV");
       if (tex == NULL) {
          free(surf);
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "VDPAURegisterSurfaceNV(texture ID not found)");
          return (GLintptr)NULL;
       }
 
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 77ac9bc..7bed569 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -326,7 +326,6 @@
                          extensions->ARB_gpu_shader5 &&
                          extensions->ARB_gpu_shader_fp64 &&
                          extensions->ARB_sample_shading &&
-                         extensions->ARB_shader_subroutine &&
                          extensions->ARB_tessellation_shader &&
                          extensions->ARB_texture_buffer_object_rgb32 &&
                          extensions->ARB_texture_cube_map_array &&
diff --git a/src/mesa/main/viewport.c b/src/mesa/main/viewport.c
index 6d3e576..0a5caf0 100644
--- a/src/mesa/main/viewport.c
+++ b/src/mesa/main/viewport.c
@@ -40,8 +40,6 @@
                        GLfloat x, GLfloat y,
                        GLfloat width, GLfloat height)
 {
-   FLUSH_VERTICES(ctx, _NEW_VIEWPORT);
-
    /* clamp width and height to the implementation dependent range */
    width  = MIN2(width, (GLfloat) ctx->Const.MaxViewportWidth);
    height = MIN2(height, (GLfloat) ctx->Const.MaxViewportHeight);
@@ -69,6 +67,9 @@
        ctx->ViewportArray[idx].Height == height)
       return;
 
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewViewport ? 0 : _NEW_VIEWPORT);
+   ctx->NewDriverState |= ctx->DriverFlags.NewViewport;
+
    ctx->ViewportArray[idx].X = x;
    ctx->ViewportArray[idx].Width = width;
    ctx->ViewportArray[idx].Y = y;
@@ -84,29 +85,10 @@
    GLdouble Near, Far;          /**< Depth buffer range */
 };
 
-/**
- * Set the viewport.
- * \sa Called via glViewport() or display list execution.
- *
- * Flushes the vertices and calls _mesa_set_viewport() with the given
- * parameters.
- */
-void GLAPIENTRY
-_mesa_Viewport(GLint x, GLint y, GLsizei width, GLsizei height)
+static void
+viewport(struct gl_context *ctx, GLint x, GLint y, GLsizei width,
+         GLsizei height)
 {
-   unsigned i;
-   GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
-
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glViewport %d %d %d %d\n", x, y, width, height);
-
-   if (width < 0 || height < 0) {
-      _mesa_error(ctx,  GL_INVALID_VALUE,
-                   "glViewport(%d, %d, %d, %d)", x, y, width, height);
-      return;
-   }
-
    /* The GL_ARB_viewport_array spec says:
     *
     *     "Viewport sets the parameters for all viewports to the same values
@@ -118,15 +100,42 @@
     * Set all of the viewports supported by the implementation, but only
     * signal the driver once at the end.
     */
-   for (i = 0; i < ctx->Const.MaxViewports; i++)
+   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++)
       set_viewport_no_notify(ctx, i, x, y, width, height);
 
-   if (ctx->Driver.Viewport) {
-      /* Many drivers will use this call to check for window size changes
-       * and reallocate the z/stencil/accum/etc buffers if needed.
-       */
+   if (ctx->Driver.Viewport)
       ctx->Driver.Viewport(ctx);
+}
+
+/**
+ * Set the viewport, skipping the argument checks done by _mesa_Viewport().
+ * \sa Called via glViewport() or display list execution.
+ *
+ * Passes the given parameters to viewport(), which updates every
+ * viewport and then calls the driver's Viewport hook, if any.
+ */
+void GLAPIENTRY
+_mesa_Viewport_no_error(GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   viewport(ctx, x, y, width, height);
+}
+
+void GLAPIENTRY
+_mesa_Viewport(GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glViewport %d %d %d %d\n", x, y, width, height);
+
+   if (width < 0 || height < 0) {
+      _mesa_error(ctx,  GL_INVALID_VALUE,
+                   "glViewport(%d, %d, %d, %d)", x, y, width, height);
+      return;
    }
+
+   viewport(ctx, x, y, width, height);
 }
 
 
@@ -146,12 +155,30 @@
 {
    set_viewport_no_notify(ctx, idx, x, y, width, height);
 
-   if (ctx->Driver.Viewport) {
-      /* Many drivers will use this call to check for window size changes
-       * and reallocate the z/stencil/accum/etc buffers if needed.
-       */
+   if (ctx->Driver.Viewport)
       ctx->Driver.Viewport(ctx);
+}
+
+static void
+viewport_array(struct gl_context *ctx, GLuint first, GLsizei count,
+               const struct gl_viewport_inputs *inputs)
+{
+   for (GLsizei i = 0; i < count; i++) {
+      set_viewport_no_notify(ctx, i + first, inputs[i].X, inputs[i].Y,
+                             inputs[i].Width, inputs[i].Height);
    }
+
+   if (ctx->Driver.Viewport)
+      ctx->Driver.Viewport(ctx);
+}
+
+void GLAPIENTRY
+_mesa_ViewportArrayv_no_error(GLuint first, GLsizei count, const GLfloat *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   const struct gl_viewport_inputs *const p = (struct gl_viewport_inputs *)v;
+   viewport_array(ctx, first, count, p);
 }
 
 void GLAPIENTRY
@@ -183,21 +210,13 @@
       }
    }
 
-   for (i = 0; i < count; i++)
-      set_viewport_no_notify(ctx, i + first,
-                             p[i].X, p[i].Y,
-                             p[i].Width, p[i].Height);
-
-   if (ctx->Driver.Viewport)
-      ctx->Driver.Viewport(ctx);
+   viewport_array(ctx, first, count, p);
 }
 
 static void
-ViewportIndexedf(GLuint index, GLfloat x, GLfloat y,
-                 GLfloat w, GLfloat h, const char *function)
+viewport_indexed_err(struct gl_context *ctx, GLuint index, GLfloat x, GLfloat y,
+                     GLfloat w, GLfloat h, const char *function)
 {
-   GET_CURRENT_CONTEXT(ctx);
-
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "%s(%d, %f, %f, %f, %f)\n",
                   function, index, x, y, w, h);
@@ -221,16 +240,34 @@
 }
 
 void GLAPIENTRY
+_mesa_ViewportIndexedf_no_error(GLuint index, GLfloat x, GLfloat y,
+                                GLfloat w, GLfloat h)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_set_viewport(ctx, index, x, y, w, h);
+}
+
+void GLAPIENTRY
 _mesa_ViewportIndexedf(GLuint index, GLfloat x, GLfloat y,
                        GLfloat w, GLfloat h)
 {
-   ViewportIndexedf(index, x, y, w, h, "glViewportIndexedf");
+   GET_CURRENT_CONTEXT(ctx);
+   viewport_indexed_err(ctx, index, x, y, w, h, "glViewportIndexedf");
+}
+
+void GLAPIENTRY
+_mesa_ViewportIndexedfv_no_error(GLuint index, const GLfloat *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_set_viewport(ctx, index, v[0], v[1], v[2], v[3]);
 }
 
 void GLAPIENTRY
 _mesa_ViewportIndexedfv(GLuint index, const GLfloat *v)
 {
-   ViewportIndexedf(index, v[0], v[1], v[2], v[3], "glViewportIndexedfv");
+   GET_CURRENT_CONTEXT(ctx);
+   viewport_indexed_err(ctx, index, v[0], v[1], v[2], v[3],
+                        "glViewportIndexedfv");
 }
 
 static void
@@ -241,7 +278,9 @@
        ctx->ViewportArray[idx].Far == farval)
       return;
 
+   /* The depth range is needed by program state constants. */
    FLUSH_VERTICES(ctx, _NEW_VIEWPORT);
+   ctx->NewDriverState |= ctx->DriverFlags.NewViewport;
 
    ctx->ViewportArray[idx].Near = CLAMP(nearval, 0.0, 1.0);
    ctx->ViewportArray[idx].Far = CLAMP(farval, 0.0, 1.0);
@@ -271,8 +310,6 @@
    unsigned i;
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glDepthRange %f %f\n", nearval, farval);
 
@@ -367,6 +404,15 @@
  *                 the far clip plane
  */
 void GLAPIENTRY
+_mesa_DepthRangeIndexed_no_error(GLuint index, GLclampd nearval,
+                                 GLclampd farval)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_set_depth_range(ctx, index, nearval, farval);
+}
+
+
+void GLAPIENTRY
 _mesa_DepthRangeIndexed(GLuint index, GLclampd nearval, GLclampd farval)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -417,12 +463,54 @@
 }
 
 
-extern void GLAPIENTRY
+static void
+clip_control(struct gl_context *ctx, GLenum origin, GLenum depth)
+{
+   if (ctx->Transform.ClipOrigin == origin &&
+       ctx->Transform.ClipDepthMode == depth)
+      return;
+
+   /* Affects transform state and the viewport transform */
+   FLUSH_VERTICES(ctx, ctx->DriverFlags.NewClipControl ? 0 :
+                  _NEW_TRANSFORM | _NEW_VIEWPORT);
+   ctx->NewDriverState |= ctx->DriverFlags.NewClipControl;
+
+   if (ctx->Transform.ClipOrigin != origin) {
+      ctx->Transform.ClipOrigin = origin;
+
+      /* Affects the winding order of the front face. */
+      if (ctx->DriverFlags.NewPolygonState)
+         ctx->NewDriverState |= ctx->DriverFlags.NewPolygonState;
+      else
+         ctx->NewState |= _NEW_POLYGON;
+
+      if (ctx->Driver.FrontFace)
+         ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
+   }
+
+   if (ctx->Transform.ClipDepthMode != depth) {
+      ctx->Transform.ClipDepthMode = depth;
+
+      if (ctx->Driver.DepthRange)
+         ctx->Driver.DepthRange(ctx);
+   }
+}
+
+
+void GLAPIENTRY
+_mesa_ClipControl_no_error(GLenum origin, GLenum depth)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   clip_control(ctx, origin, depth);
+}
+
+
+void GLAPIENTRY
 _mesa_ClipControl(GLenum origin, GLenum depth)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (MESA_VERBOSE&VERBOSE_API)
+   if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glClipControl(%s, %s)\n",
 	          _mesa_enum_to_string(origin),
                   _mesa_enum_to_string(depth));
@@ -444,29 +532,7 @@
       return;
    }
 
-   if (ctx->Transform.ClipOrigin == origin &&
-       ctx->Transform.ClipDepthMode == depth)
-      return;
-
-   /* Affects transform state and the viewport transform */
-   FLUSH_VERTICES(ctx, _NEW_TRANSFORM | _NEW_VIEWPORT);
-
-   if (ctx->Transform.ClipOrigin != origin) {
-      ctx->Transform.ClipOrigin = origin;
-
-      /* Affects the winding order of the front face. */
-      ctx->NewState |= _NEW_POLYGON;
-
-      if (ctx->Driver.FrontFace)
-         ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
-   }
-
-   if (ctx->Transform.ClipDepthMode != depth) {
-      ctx->Transform.ClipDepthMode = depth;
-
-      if (ctx->Driver.DepthRange)
-         ctx->Driver.DepthRange(ctx);
-   }
+   clip_control(ctx, origin, depth);
 }
 
 /**
diff --git a/src/mesa/main/viewport.h b/src/mesa/main/viewport.h
index 3951319..f974da5 100644
--- a/src/mesa/main/viewport.h
+++ b/src/mesa/main/viewport.h
@@ -31,15 +31,28 @@
 
 struct gl_context;
 
+void GLAPIENTRY
+_mesa_Viewport_no_error(GLint x, GLint y, GLsizei width, GLsizei height);
+
 extern void GLAPIENTRY
 _mesa_Viewport(GLint x, GLint y, GLsizei width, GLsizei height);
 
+void GLAPIENTRY
+_mesa_ViewportArrayv_no_error(GLuint first, GLsizei count, const GLfloat * v);
+
 extern void GLAPIENTRY
 _mesa_ViewportArrayv(GLuint first, GLsizei count, const GLfloat * v);
 
+void GLAPIENTRY
+_mesa_ViewportIndexedf_no_error(GLuint index, GLfloat x, GLfloat y, GLfloat w,
+                                GLfloat h);
+
 extern void GLAPIENTRY
 _mesa_ViewportIndexedf(GLuint index, GLfloat x, GLfloat y, GLfloat w, GLfloat h);
 
+void GLAPIENTRY
+_mesa_ViewportIndexedfv_no_error(GLuint index, const GLfloat * v);
+
 extern void GLAPIENTRY
 _mesa_ViewportIndexedfv(GLuint index, const GLfloat * v);
 
@@ -60,6 +73,9 @@
 extern void GLAPIENTRY
 _mesa_DepthRangeArrayfvOES(GLuint first, GLsizei count, const GLfloat * v);
 
+void GLAPIENTRY
+_mesa_DepthRangeIndexed_no_error(GLuint index, GLclampd n, GLclampd f);
+
 extern void GLAPIENTRY
 _mesa_DepthRangeIndexed(GLuint index, GLclampd n, GLclampd f);
 
@@ -74,6 +90,9 @@
 _mesa_init_viewport(struct gl_context *ctx);
 
 
+void GLAPIENTRY
+_mesa_ClipControl_no_error(GLenum origin, GLenum depth);
+
 extern void GLAPIENTRY
 _mesa_ClipControl(GLenum origin, GLenum depth);
 
diff --git a/src/mesa/main/vtxfmt.c b/src/mesa/main/vtxfmt.c
index 81bf4c5..d3c83e5 100644
--- a/src/mesa/main/vtxfmt.c
+++ b/src/mesa/main/vtxfmt.c
@@ -217,6 +217,10 @@
       SET_VertexAttribL2dv(tab, vfmt->VertexAttribL2dv);
       SET_VertexAttribL3dv(tab, vfmt->VertexAttribL3dv);
       SET_VertexAttribL4dv(tab, vfmt->VertexAttribL4dv);
+
+      /* GL_ARB_bindless_texture */
+      SET_VertexAttribL1ui64ARB(tab, vfmt->VertexAttribL1ui64ARB);
+      SET_VertexAttribL1ui64vARB(tab, vfmt->VertexAttribL1ui64vARB);
    }
 }
 
diff --git a/src/mesa/math/m_debug_util.h b/src/mesa/math/m_debug_util.h
index 25ee029..4959785 100644
--- a/src/mesa/math/m_debug_util.h
+++ b/src/mesa/math/m_debug_util.h
@@ -216,7 +216,7 @@
    x = LONG_MAX;							\
    for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) {			\
       unsigned long cycle_tmp1, cycle_tmp2;				\
-      rdtscll(cycle_tmp1);						\
+      rdtscll(cycle_tmp1);
 
 #define END_RACE(x)							\
       rdtscll(cycle_tmp2);						\
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 674905d..eb87fc5 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -47,12 +47,12 @@
 #include "compiler/glsl/linker.h"
 #include "compiler/glsl/program.h"
 #include "compiler/glsl/shader_cache.h"
+#include "compiler/glsl/string_to_uint_map.h"
 #include "program/prog_instruction.h"
 #include "program/prog_optimize.h"
 #include "program/prog_print.h"
 #include "program/program.h"
 #include "program/prog_parameter.h"
-#include "util/string_to_uint_map.h"
 
 
 static int swizzle_for_size(int size);
@@ -1389,12 +1389,6 @@
    case ir_unop_dFdy_fine:
    case ir_unop_subroutine_to_int:
    case ir_unop_get_buffer_size:
-   case ir_unop_ballot:
-   case ir_binop_read_invocation:
-   case ir_unop_read_first_invocation:
-   case ir_unop_vote_any:
-   case ir_unop_vote_all:
-   case ir_unop_vote_eq:
    case ir_unop_bitcast_u642d:
    case ir_unop_bitcast_i642d:
    case ir_unop_bitcast_d2u64:
@@ -1423,6 +1417,10 @@
    case ir_unop_unpack_int_2x32:
    case ir_unop_pack_uint_2x32:
    case ir_unop_unpack_uint_2x32:
+   case ir_unop_pack_sampler_2x32:
+   case ir_unop_unpack_sampler_2x32:
+   case ir_unop_pack_image_2x32:
+   case ir_unop_unpack_image_2x32:
       assert(!"not supported");
       break;
 
@@ -1903,7 +1901,7 @@
     * get lucky, copy propagation will eliminate the extra moves.
     */
 
-   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
+   if (ir->type->is_record()) {
       src_reg temp_base = get_temp(ir->type);
       dst_reg temp = dst_reg(temp_base);
 
@@ -1952,7 +1950,7 @@
       dst_reg mat_column = dst_reg(mat);
 
       for (i = 0; i < ir->type->matrix_columns; i++) {
-	 assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+	 assert(ir->type->is_float());
 	 values = &ir->value.f[i * ir->type->vector_elements];
 
 	 src = src_reg(PROGRAM_CONSTANT, -1, NULL);
@@ -2422,6 +2420,7 @@
    void process(ir_variable *var)
    {
       this->idx = -1;
+      this->var = var;
       this->program_resource_visitor::process(var);
       var->data.param_index = this->idx;
    }
@@ -2435,6 +2434,7 @@
    struct gl_shader_program *shader_program;
    struct gl_program_parameter_list *params;
    int idx;
+   ir_variable *var;
    gl_shader_stage shader_type;
 };
 
@@ -2447,22 +2447,12 @@
                                    const enum glsl_interface_packing,
                                    bool /* last_field */)
 {
-   unsigned int size;
-
    /* atomics don't get real storage */
    if (type->contains_atomic())
       return;
 
-   if (type->is_vector() || type->is_scalar()) {
-      size = type->vector_elements;
-      if (type->is_64bit())
-         size *= 2;
-   } else {
-      size = type_size(type) * 4;
-   }
-
    gl_register_file file;
-   if (type->without_array()->is_sampler()) {
+   if (type->without_array()->is_sampler() && !var->data.bindless) {
       file = PROGRAM_SAMPLER;
    } else {
       file = PROGRAM_UNIFORM;
@@ -2470,6 +2460,8 @@
 
    int index = _mesa_lookup_parameter_index(params, name);
    if (index < 0) {
+      unsigned size = type_size(type) * 4;
+
       index = _mesa_add_parameter(params, file, name, size, type->gl_type,
 				  NULL, NULL);
 
@@ -2536,10 +2528,13 @@
 
 void
 _mesa_associate_uniform_storage(struct gl_context *ctx,
-				struct gl_shader_program *shader_program,
-                                struct gl_program_parameter_list *params,
+                                struct gl_shader_program *shader_program,
+                                struct gl_program *prog,
                                 bool propagate_to_storage)
 {
+   struct gl_program_parameter_list *params = prog->Parameters;
+   gl_shader_stage shader_type = prog->info.stage;
+
    /* After adding each uniform to the parameter list, connect the storage for
     * the parameter with the tracking structure used by the API for the
     * uniform.
@@ -2547,15 +2542,15 @@
    unsigned last_location = unsigned(~0);
    for (unsigned i = 0; i < params->NumParameters; i++) {
       if (params->Parameters[i].Type != PROGRAM_UNIFORM)
-	 continue;
+         continue;
 
       unsigned location;
       const bool found =
-	 shader_program->UniformHash->get(location, params->Parameters[i].Name);
+         shader_program->UniformHash->get(location, params->Parameters[i].Name);
       assert(found);
 
       if (!found)
-	 continue;
+         continue;
 
       struct gl_uniform_storage *storage =
          &shader_program->data->UniformStorage[location];
@@ -2565,48 +2560,47 @@
          continue;
 
       if (location != last_location) {
-	 enum gl_uniform_driver_format format = uniform_native;
+         enum gl_uniform_driver_format format = uniform_native;
+         unsigned columns = 0;
+         int dmul = 4 * sizeof(float);
 
-	 unsigned columns = 0;
-	 int dmul = 4 * sizeof(float);
-	 switch (storage->type->base_type) {
+         switch (storage->type->base_type) {
          case GLSL_TYPE_UINT64:
-	    if (storage->type->vector_elements > 2)
+            if (storage->type->vector_elements > 2)
                dmul *= 2;
-	    /* fallthrough */
-	 case GLSL_TYPE_UINT:
-	    assert(ctx->Const.NativeIntegers);
-	    format = uniform_native;
-	    columns = 1;
-	    break;
+            /* fallthrough */
+         case GLSL_TYPE_UINT:
+            assert(ctx->Const.NativeIntegers);
+            format = uniform_native;
+            columns = 1;
+            break;
          case GLSL_TYPE_INT64:
-	    if (storage->type->vector_elements > 2)
+            if (storage->type->vector_elements > 2)
                dmul *= 2;
-	    /* fallthrough */
-	 case GLSL_TYPE_INT:
-	    format =
-	       (ctx->Const.NativeIntegers) ? uniform_native : uniform_int_float;
-	    columns = 1;
-	    break;
-
-	 case GLSL_TYPE_DOUBLE:
-	    if (storage->type->vector_elements > 2)
+            /* fallthrough */
+         case GLSL_TYPE_INT:
+            format =
+               (ctx->Const.NativeIntegers) ? uniform_native : uniform_int_float;
+            columns = 1;
+            break;
+         case GLSL_TYPE_DOUBLE:
+            if (storage->type->vector_elements > 2)
                dmul *= 2;
-	    /* fallthrough */
-	 case GLSL_TYPE_FLOAT:
-	    format = uniform_native;
-	    columns = storage->type->matrix_columns;
-	    break;
-	 case GLSL_TYPE_BOOL:
-	    format = uniform_native;
-	    columns = 1;
-	    break;
-	 case GLSL_TYPE_SAMPLER:
-	 case GLSL_TYPE_IMAGE:
+            /* fallthrough */
+         case GLSL_TYPE_FLOAT:
+            format = uniform_native;
+            columns = storage->type->matrix_columns;
+            break;
+         case GLSL_TYPE_BOOL:
+            format = uniform_native;
+            columns = 1;
+            break;
+         case GLSL_TYPE_SAMPLER:
+         case GLSL_TYPE_IMAGE:
          case GLSL_TYPE_SUBROUTINE:
-	    format = uniform_native;
-	    columns = 1;
-	    break;
+            format = uniform_native;
+            columns = 1;
+            break;
          case GLSL_TYPE_ATOMIC_UINT:
          case GLSL_TYPE_ARRAY:
          case GLSL_TYPE_VOID:
@@ -2614,27 +2608,49 @@
          case GLSL_TYPE_ERROR:
          case GLSL_TYPE_INTERFACE:
          case GLSL_TYPE_FUNCTION:
-	    assert(!"Should not get here.");
-	    break;
-	 }
+            assert(!"Should not get here.");
+            break;
+         }
 
-	 _mesa_uniform_attach_driver_storage(storage,
-					     dmul * columns,
-					     dmul,
-					     format,
-					     &params->ParameterValues[i]);
+         _mesa_uniform_attach_driver_storage(storage, dmul * columns, dmul,
+                                             format,
+                                             &params->ParameterValues[i]);
 
-	 /* After attaching the driver's storage to the uniform, propagate any
-	  * data from the linker's backing store.  This will cause values from
-	  * initializers in the source code to be copied over.
-	  */
+         /* When a bindless sampler/image is bound to a texture/image unit, we
+          * have to overwrite the constant value with the resident handle
+          * directly in the constant buffer before the next draw. One solution
+          * is to keep track of a pointer to the base of the data.
+          */
+         if (storage->is_bindless && (prog->sh.NumBindlessSamplers ||
+                                      prog->sh.NumBindlessImages)) {
+            unsigned array_elements = MAX2(1, storage->array_elements);
+
+            for (unsigned j = 0; j < array_elements; ++j) {
+               unsigned unit = storage->opaque[shader_type].index + j;
+
+               if (storage->type->without_array()->is_sampler()) {
+                  assert(unit < prog->sh.NumBindlessSamplers);
+                  prog->sh.BindlessSamplers[unit].data =
+                     &params->ParameterValues[i] + j;
+               } else if (storage->type->without_array()->is_image()) {
+                  assert(unit < prog->sh.NumBindlessImages);
+                  prog->sh.BindlessImages[unit].data =
+                     &params->ParameterValues[i] + j;
+               }
+            }
+         }
+
+         /* After attaching the driver's storage to the uniform, propagate any
+          * data from the linker's backing store.  This will cause values from
+          * initializers in the source code to be copied over.
+          */
          if (propagate_to_storage) {
             unsigned array_elements = MAX2(1, storage->array_elements);
             _mesa_propagate_uniforms_to_driver_storage(storage, 0,
                                                        array_elements);
          }
 
-	 last_location = location;
+         last_location = location;
       }
    }
 }
@@ -2990,8 +3006,7 @@
     * prog->ParameterValues to get reallocated (e.g., anything that adds a
     * program constant) has to happen before creating this linkage.
     */
-   _mesa_associate_uniform_storage(ctx, shader_program, prog->Parameters,
-                                   true);
+   _mesa_associate_uniform_storage(ctx, shader_program, prog, true);
    if (!shader_program->data->LinkStatus) {
       goto fail_exit;
    }
diff --git a/src/mesa/program/ir_to_mesa.h b/src/mesa/program/ir_to_mesa.h
index 0944619..e3d3644 100644
--- a/src/mesa/program/ir_to_mesa.h
+++ b/src/mesa/program/ir_to_mesa.h
@@ -45,8 +45,8 @@
 					    *params);
 void
 _mesa_associate_uniform_storage(struct gl_context *ctx,
-				struct gl_shader_program *shader_program,
-                                struct gl_program_parameter_list *params,
+                                struct gl_shader_program *shader_program,
+                                struct gl_program *prog,
                                 bool propagate_to_storage);
 
 #ifdef __cplusplus
diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index 6689c71..40bc47d 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -267,9 +267,8 @@
             COPY_4V(paramList->ParameterValues[oldNum + i], values);
          } else {
             /* copy 1, 2 or 3 values */
-            GLuint remaining = size % 4;
-            assert(remaining < 4);
-            for (j = 0; j < remaining; j++) {
+            assert(size < 4);
+            for (j = 0; j < size; j++) {
                paramList->ParameterValues[oldNum + i][j].f = values[j].f;
             }
             /* fill in remaining positions with zeros */
@@ -278,7 +277,6 @@
             }
          }
          values += 4;
-         p->Initialized = GL_TRUE;
       } else {
          /* silence valgrind */
          for (j = 0; j < 4; j++)
diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index 320f64f..f50e99c 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -69,7 +69,6 @@
     * The next program parameter's Size will be Size-4 of this parameter.
     */
    GLuint Size;
-   GLboolean Initialized;   /**< debug: Has the ParameterValue[] been set? */
    /**
     * A sequence of STATE_* tokens and integers to identify GL state.
     */
diff --git a/src/mesa/program/prog_print.c b/src/mesa/program/prog_print.c
index 6d04a38..4f85d14 100644
--- a/src/mesa/program/prog_print.c
+++ b/src/mesa/program/prog_print.c
@@ -153,6 +153,7 @@
       "fragment.(twenty-seven)", /* VARYING_SLOT_CULL_DIST1 */
       "fragment.(twenty-eight)", /* VARYING_SLOT_BOUNDING_BOX0 */
       "fragment.(twenty-nine)", /* VARYING_SLOT_BOUNDING_BOX1 */
+      "fragment.(thirty)", /* VARYING_SLOT_VIEW_INDEX */
       "fragment.varying[0]",
       "fragment.varying[1]",
       "fragment.varying[2]",
@@ -284,6 +285,7 @@
       "result.(twenty-seven)", /* VARYING_SLOT_CULL_DIST1 */
       "result.(twenty-eight)", /* VARYING_SLOT_BOUNDING_BOX0 */
       "result.(twenty-nine)", /* VARYING_SLOT_BOUNDING_BOX1 */
+      "result.(thirty)", /* VARYING_SLOT_VIEW_INDEX */
       "result.varying[0]",
       "result.varying[1]",
       "result.varying[2]",
@@ -967,6 +969,8 @@
    case MESA_SHADER_COMPUTE:
       type = "comp";
       break;
+   default:
+      break;
    }
 
    _mesa_snprintf(filename, sizeof(filename), "shader_%u.%s", shader->Name, type);
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index a1c5ba6..851b3f2 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -1018,10 +1018,8 @@
 
    nir_builder_init_simple_shader(&c->build, NULL, stage, options);
 
-   /* Use the shader_info from gl_program rather than the one nir_builder
-    * created for us. nir_sweep should clean up the other one for us.
-    */
-   c->build.shader->info = (shader_info *) &prog->info;
+   /* Copy the shader_info from the gl_program */
+   c->build.shader->info = prog->info;
 
    s = c->build.shader;
 
@@ -1048,16 +1046,16 @@
 
    ptn_add_output_stores(c);
 
-   s->info->name = ralloc_asprintf(s, "ARB%d", prog->Id);
-   s->info->num_textures = util_last_bit(prog->SamplersUsed);
-   s->info->num_ubos = 0;
-   s->info->num_abos = 0;
-   s->info->num_ssbos = 0;
-   s->info->num_images = 0;
-   s->info->uses_texture_gather = false;
-   s->info->clip_distance_array_size = 0;
-   s->info->cull_distance_array_size = 0;
-   s->info->separate_shader = false;
+   s->info.name = ralloc_asprintf(s, "ARB%d", prog->Id);
+   s->info.num_textures = util_last_bit(prog->SamplersUsed);
+   s->info.num_ubos = 0;
+   s->info.num_abos = 0;
+   s->info.num_ssbos = 0;
+   s->info.num_images = 0;
+   s->info.uses_texture_gather = false;
+   s->info.clip_distance_array_size = 0;
+   s->info.cull_distance_array_size = 0;
+   s->info.separate_shader = false;
 
 fail:
    if (c->error) {
diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index 277e6ce..0defa01 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -271,6 +271,14 @@
       ralloc_free(prog->nir);
    }
 
+   if (prog->sh.BindlessSamplers) {
+      ralloc_free(prog->sh.BindlessSamplers);
+   }
+
+   if (prog->sh.BindlessImages) {
+      ralloc_free(prog->sh.BindlessImages);
+   }
+
    ralloc_free(prog);
 }
 
diff --git a/src/mesa/program/programopt.c b/src/mesa/program/programopt.c
index 501acde..f560bce 100644
--- a/src/mesa/program/programopt.c
+++ b/src/mesa/program/programopt.c
@@ -46,7 +46,7 @@
  * May be used to implement the position_invariant option.
  */
 static void
-_mesa_insert_mvp_dp4_code(struct gl_context *ctx, struct gl_program *vprog)
+insert_mvp_dp4_code(struct gl_context *ctx, struct gl_program *vprog)
 {
    struct prog_instruction *newInst;
    const GLuint origLen = vprog->arb.NumInstructions;
@@ -113,7 +113,7 @@
 
 
 static void
-_mesa_insert_mvp_mad_code(struct gl_context *ctx, struct gl_program *vprog)
+insert_mvp_mad_code(struct gl_context *ctx, struct gl_program *vprog)
 {
    struct prog_instruction *newInst;
    const GLuint origLen = vprog->arb.NumInstructions;
@@ -217,9 +217,9 @@
 _mesa_insert_mvp_code(struct gl_context *ctx, struct gl_program *vprog)
 {
    if (ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS)
-      _mesa_insert_mvp_dp4_code( ctx, vprog );
+      insert_mvp_dp4_code( ctx, vprog );
    else
-      _mesa_insert_mvp_mad_code( ctx, vprog );
+      insert_mvp_mad_code( ctx, vprog );
 }
       
 
diff --git a/src/mesa/sparc/sparc_matrix.h b/src/mesa/sparc/sparc_matrix.h
index f677d9b..6ef0acd 100644
--- a/src/mesa/sparc/sparc_matrix.h
+++ b/src/mesa/sparc/sparc_matrix.h
@@ -151,7 +151,7 @@
 #define LDMATRIX_0_5_10(BASE) 			\
 	ld	[BASE + ( 0 * 0x4)], M0;	\
 	ld	[BASE + ( 5 * 0x4)], M5;	\
-	ld	[BASE + (10 * 0x4)], M10;	\
+	ld	[BASE + (10 * 0x4)], M10;
 
 #define LDMATRIX_0_5_10_12_13_14(BASE) 		\
 	ld	[BASE + ( 0 * 0x4)], M0;	\
diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.c b/src/mesa/state_tracker/st_atifs_to_tgsi.c
index 90286a1..13e013c 100644
--- a/src/mesa/state_tracker/st_atifs_to_tgsi.c
+++ b/src/mesa/state_tracker/st_atifs_to_tgsi.c
@@ -45,8 +45,8 @@
    struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS];
    struct ureg_src samplers[PIPE_MAX_SAMPLERS];
 
-   const GLuint *inputMapping;
-   const GLuint *outputMapping;
+   const ubyte *inputMapping;
+   const ubyte *outputMapping;
 
    unsigned current_pass;
 
@@ -105,18 +105,18 @@
       imm[0] = src;
       imm[1] = ureg_imm4f(t->ureg, 1.0f, 1.0f, 0.0f, 0.0f);
       imm[2] = ureg_imm4f(t->ureg, 0.0f, 0.0f, 1.0f, 1.0f);
-      ureg_insn(t->ureg, TGSI_OPCODE_MAD, &tmp[0], 1, imm, 3);
+      ureg_insn(t->ureg, TGSI_OPCODE_MAD, &tmp[0], 1, imm, 3, 0);
 
       if (swizzle == GL_SWIZZLE_STR_DR_ATI) {
          imm[0] = ureg_scalar(src, TGSI_SWIZZLE_Z);
       } else {
          imm[0] = ureg_scalar(src, TGSI_SWIZZLE_W);
       }
-      ureg_insn(t->ureg, TGSI_OPCODE_RCP, &tmp[1], 1, &imm[0], 1);
+      ureg_insn(t->ureg, TGSI_OPCODE_RCP, &tmp[1], 1, &imm[0], 1, 0);
 
       imm[0] = ureg_src(tmp[0]);
       imm[1] = ureg_src(tmp[1]);
-      ureg_insn(t->ureg, TGSI_OPCODE_MUL, &tmp[0], 1, imm, 2);
+      ureg_insn(t->ureg, TGSI_OPCODE_MUL, &tmp[0], 1, imm, 2, 0);
 
       return ureg_src(tmp[0]);
    }
@@ -170,35 +170,35 @@
       src = ureg_scalar(src, TGSI_SWIZZLE_W);
       break;
    }
-   ureg_insn(t->ureg, TGSI_OPCODE_MOV, &arg, 1, &src, 1);
+   ureg_insn(t->ureg, TGSI_OPCODE_MOV, &arg, 1, &src, 1, 0);
 
    if (srcReg->argMod & GL_COMP_BIT_ATI) {
       struct ureg_src modsrc[2];
       modsrc[0] = ureg_imm1f(t->ureg, 1.0f);
       modsrc[1] = ureg_negate(ureg_src(arg));
 
-      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2);
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2, 0);
    }
    if (srcReg->argMod & GL_BIAS_BIT_ATI) {
       struct ureg_src modsrc[2];
       modsrc[0] = ureg_src(arg);
       modsrc[1] = ureg_imm1f(t->ureg, -0.5f);
 
-      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2);
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2, 0);
    }
    if (srcReg->argMod & GL_2X_BIT_ATI) {
       struct ureg_src modsrc[2];
       modsrc[0] = ureg_src(arg);
       modsrc[1] = ureg_src(arg);
 
-      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2);
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2, 0);
    }
    if (srcReg->argMod & GL_NEGATE_BIT_ATI) {
       struct ureg_src modsrc[2];
       modsrc[0] = ureg_src(arg);
       modsrc[1] = ureg_imm1f(t->ureg, -1.0f);
 
-      ureg_insn(t->ureg, TGSI_OPCODE_MUL, &arg, 1, modsrc, 2);
+      ureg_insn(t->ureg, TGSI_OPCODE_MUL, &arg, 1, modsrc, 2, 0);
    }
    return  ureg_src(arg);
 }
@@ -217,25 +217,25 @@
       tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 2); /* re-purpose a3 */
       src[0] = ureg_imm1f(t->ureg, 0.5f);
       src[1] = ureg_negate(args[2]);
-      ureg_insn(t->ureg, TGSI_OPCODE_ADD, tmp, 1, src, 2);
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, tmp, 1, src, 2, 0);
       src[0] = ureg_src(tmp[0]);
       src[1] = args[0];
       src[2] = args[1];
-      ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3);
+      ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3, 0);
    } else if (!strcmp(desc->name, "CND0")) {
       src[0] = args[2];
       src[1] = args[1];
       src[2] = args[0];
-      ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3);
+      ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3, 0);
    } else if (!strcmp(desc->name, "DOT2_ADD")) {
       /* note: DP2A is not implemented in most pipe drivers */
       tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI); /* re-purpose a1 */
       src[0] = args[0];
       src[1] = args[1];
-      ureg_insn(t->ureg, TGSI_OPCODE_DP2, tmp, 1, src, 2);
+      ureg_insn(t->ureg, TGSI_OPCODE_DP2, tmp, 1, src, 2, 0);
       src[0] = ureg_src(tmp[0]);
       src[1] = ureg_scalar(args[2], TGSI_SWIZZLE_Z);
-      ureg_insn(t->ureg, TGSI_OPCODE_ADD, dst, 1, src, 2);
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, dst, 1, src, 2, 0);
    }
 }
 
@@ -249,7 +249,7 @@
       return;
    }
 
-   ureg_insn(t->ureg, desc->TGSI_opcode, dst, 1, args, argcount);
+   ureg_insn(t->ureg, desc->TGSI_opcode, dst, 1, args, argcount, 0);
 }
 
 static void
@@ -292,7 +292,7 @@
    if (dstMod & GL_SATURATE_BIT_ATI) {
       dst = ureg_saturate(dst);
    }
-   ureg_insn(t->ureg, TGSI_OPCODE_MUL, &dst, 1, src, 2);
+   ureg_insn(t->ureg, TGSI_OPCODE_MUL, &dst, 1, src, 2, 0);
 }
 
 /**
@@ -334,9 +334,9 @@
       src[1] = t->samplers[r];
       /* the texture target is still unknown, it will be fixed in the draw call */
       ureg_tex_insn(t->ureg, TGSI_OPCODE_TEX, dst, 1, TGSI_TEXTURE_2D,
-                    NULL, 0, src, 2);
+                    TGSI_RETURN_TYPE_FLOAT, NULL, 0, src, 2);
    } else if (texinst->Opcode == ATI_FRAGMENT_SHADER_PASS_OP) {
-      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1);
+      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1, 0);
    }
 
    t->regs_written[t->current_pass][r] = true;
@@ -408,11 +408,11 @@
       /* copy the result into the OUT slot */
       dst[0] = t->outputs[t->outputMapping[FRAG_RESULT_COLOR]];
       src[0] = ureg_src(t->temps[0]);
-      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1);
+      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1, 0);
    }
 
    /* signal the end of the program */
-   ureg_insn(t->ureg, TGSI_OPCODE_END, dst, 0, src, 0);
+   ureg_insn(t->ureg, TGSI_OPCODE_END, dst, 0, src, 0, 0);
 }
 
 /**
@@ -425,12 +425,12 @@
    struct ati_fragment_shader *atifs,
    struct gl_program *program,
    GLuint numInputs,
-   const GLuint inputMapping[],
+   const ubyte inputMapping[],
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
-   const GLuint interpMode[],
+   const ubyte interpMode[],
    GLuint numOutputs,
-   const GLuint outputMapping[],
+   const ubyte outputMapping[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[])
 {
diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.h b/src/mesa/state_tracker/st_atifs_to_tgsi.h
index 1422702..ce54791 100644
--- a/src/mesa/state_tracker/st_atifs_to_tgsi.h
+++ b/src/mesa/state_tracker/st_atifs_to_tgsi.h
@@ -43,12 +43,12 @@
     struct ati_fragment_shader *atifs,
     struct gl_program *program,
     GLuint numInputs,
-    const GLuint inputMapping[],
+    const ubyte inputMapping[],
     const ubyte inputSemanticName[],
     const ubyte inputSemanticIndex[],
-    const GLuint interpMode[],
+    const ubyte interpMode[],
     GLuint numOutputs,
-    const GLuint outputMapping[],
+    const ubyte outputMapping[],
     const ubyte outputSemanticName[],
     const ubyte outputSemanticIndex[]);
 
diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index abbbd4d..253b508 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -36,11 +36,12 @@
 #include "st_program.h"
 #include "st_manager.h"
 
+typedef void (*update_func_t)(struct st_context *st);
 
 /* The list state update functions. */
-static const struct st_tracked_state *atoms[] =
+static const update_func_t update_functions[] =
 {
-#define ST_STATE(FLAG, st_update) &st_update,
+#define ST_STATE(FLAG, st_update) st_update,
 #include "st_atom_list.h"
 #undef ST_STATE
 };
@@ -48,7 +49,7 @@
 
 void st_init_atoms( struct st_context *st )
 {
-   STATIC_ASSERT(ARRAY_SIZE(atoms) <= 64);
+   STATIC_ASSERT(ARRAY_SIZE(update_functions) <= 64);
 }
 
 
@@ -64,9 +65,9 @@
 {
    struct gl_context *ctx = st->ctx;
    struct st_vertex_program *old_vp = st->vp;
-   struct st_tessctrl_program *old_tcp = st->tcp;
-   struct st_tesseval_program *old_tep = st->tep;
-   struct st_geometry_program *old_gp = st->gp;
+   struct st_common_program *old_tcp = st->tcp;
+   struct st_common_program *old_tep = st->tep;
+   struct st_common_program *old_gp = st->gp;
    struct st_fragment_program *old_fp = st->fp;
 
    struct gl_program *new_vp = ctx->VertexProgram._Current;
@@ -75,6 +76,7 @@
    struct gl_program *new_gp = ctx->GeometryProgram._Current;
    struct gl_program *new_fp = ctx->FragmentProgram._Current;
    uint64_t dirty = 0;
+   unsigned num_viewports = 1;
 
    /* Flag states used by both new and old shaders to unbind shader resources
     * properly when transitioning to shaders that don't use them.
@@ -90,21 +92,21 @@
       if (old_tcp)
          dirty |= old_tcp->affected_states;
       if (new_tcp)
-         dirty |= st_tessctrl_program(new_tcp)->affected_states;
+         dirty |= st_common_program(new_tcp)->affected_states;
    }
 
    if (unlikely(new_tep != &old_tep->Base)) {
       if (old_tep)
          dirty |= old_tep->affected_states;
       if (new_tep)
-         dirty |= st_tesseval_program(new_tep)->affected_states;
+         dirty |= st_common_program(new_tep)->affected_states;
    }
 
    if (unlikely(new_gp != &old_gp->Base)) {
       if (old_gp)
          dirty |= old_gp->affected_states;
       if (new_gp)
-         dirty |= st_geometry_program(new_gp)->affected_states;
+         dirty |= st_common_program(new_gp)->affected_states;
    }
 
    if (unlikely(new_fp != &old_fp->Base)) {
@@ -114,8 +116,24 @@
          dirty |= st_fragment_program(new_fp)->affected_states;
    }
 
+   /* Find out the number of viewports. This determines how many scissors
+    * and viewport states we need to update.
+    */
+   struct gl_program *last_prim_shader = new_gp ? new_gp :
+                                         new_tep ? new_tep : new_vp;
+   if (last_prim_shader &&
+       last_prim_shader->info.outputs_written & VARYING_BIT_VIEWPORT)
+      num_viewports = ctx->Const.MaxViewports;
+
+   if (st->state.num_viewports != num_viewports) {
+      st->state.num_viewports = num_viewports;
+      dirty |= ST_NEW_VIEWPORT;
+
+      if (ctx->Scissor.EnableFlags & u_bit_consecutive(0, num_viewports))
+         dirty |= ST_NEW_SCISSOR;
+   }
+
    st->dirty |= dirty;
-   st->gfx_shaders_may_be_dirty = false;
 }
 
 static void check_attrib_edgeflag(struct st_context *st)
@@ -170,7 +188,11 @@
       if (st->ctx->API == API_OPENGL_COMPAT)
          check_attrib_edgeflag(st);
 
-      check_program_state(st);
+      if (st->gfx_shaders_may_be_dirty) {
+         check_program_state(st);
+         st->gfx_shaders_may_be_dirty = false;
+      }
+
       st_manager_validate_framebuffers(st);
 
       pipeline_mask = ST_PIPELINE_RENDER_STATE_MASK;
@@ -226,9 +248,9 @@
     * Don't use u_bit_scan64, it may be slower on 32-bit.
     */
    while (dirty_lo)
-      atoms[u_bit_scan(&dirty_lo)]->update(st);
+      update_functions[u_bit_scan(&dirty_lo)](st);
    while (dirty_hi)
-      atoms[32 + u_bit_scan(&dirty_hi)]->update(st);
+      update_functions[32 + u_bit_scan(&dirty_hi)](st);
 
    /* Clear the render or compute state bits. */
    st->dirty &= ~pipeline_mask;
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index 0145cef..f9711d5 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -48,11 +48,6 @@
    ST_PIPELINE_COMPUTE,
 };
 
-struct st_tracked_state {
-   void (*update)( struct st_context *st );
-};
-
-
 void st_init_atoms( struct st_context *st );
 void st_destroy_atoms( struct st_context *st );
 void st_validate_state( struct st_context *st, enum st_pipeline pipeline );
@@ -73,12 +68,12 @@
 /* Define ST_NEW_xxx values as static const uint64_t values.
  * We can't use an enum type because MSVC doesn't allow 64-bit enum values.
  */
-#define ST_STATE(FLAG, st_update) static const uint64_t FLAG = 1llu << FLAG##_INDEX;
+#define ST_STATE(FLAG, st_update) static const uint64_t FLAG = 1ull << FLAG##_INDEX;
 #include "st_atom_list.h"
 #undef ST_STATE
 
-/* Add extern struct declarations. */
-#define ST_STATE(FLAG, st_update) extern const struct st_tracked_state st_update;
+/* Declare function prototypes. */
+#define ST_STATE(FLAG, st_update) void st_update(struct st_context *st);
 #include "st_atom_list.h"
 #undef ST_STATE
 
@@ -150,7 +145,7 @@
 
 /* All state flags within each group: */
 #define ST_PIPELINE_RENDER_STATE_MASK  (ST_NEW_CS_STATE - 1)
-#define ST_PIPELINE_COMPUTE_STATE_MASK (0xffllu << ST_NEW_CS_STATE_INDEX)
+#define ST_PIPELINE_COMPUTE_STATE_MASK (0xffull << ST_NEW_CS_STATE_INDEX)
 #define ST_PIPELINE_CLEAR_STATE_MASK (ST_NEW_FB_STATE | \
                                       ST_NEW_SCISSOR | \
                                       ST_NEW_WINDOW_RECTANGLES)
diff --git a/src/mesa/state_tracker/st_atom_array.c b/src/mesa/state_tracker/st_atom_array.c
index 221b2c7..6af1355 100644
--- a/src/mesa/state_tracker/st_atom_array.c
+++ b/src/mesa/state_tracker/st_atom_array.c
@@ -44,162 +44,191 @@
 
 #include "cso_cache/cso_context.h"
 #include "util/u_math.h"
+#include "util/u_upload_mgr.h"
 #include "main/bufferobj.h"
 #include "main/glformats.h"
 
-
-static GLuint double_types[4] = {
-   PIPE_FORMAT_R64_FLOAT,
-   PIPE_FORMAT_R64G64_FLOAT,
-   PIPE_FORMAT_R64G64B64_FLOAT,
-   PIPE_FORMAT_R64G64B64A64_FLOAT
-};
-
-static GLuint float_types[4] = {
-   PIPE_FORMAT_R32_FLOAT,
-   PIPE_FORMAT_R32G32_FLOAT,
-   PIPE_FORMAT_R32G32B32_FLOAT,
-   PIPE_FORMAT_R32G32B32A32_FLOAT
-};
-
-static GLuint half_float_types[4] = {
-   PIPE_FORMAT_R16_FLOAT,
-   PIPE_FORMAT_R16G16_FLOAT,
-   PIPE_FORMAT_R16G16B16_FLOAT,
-   PIPE_FORMAT_R16G16B16A16_FLOAT
-};
-
-static GLuint uint_types_norm[4] = {
-   PIPE_FORMAT_R32_UNORM,
-   PIPE_FORMAT_R32G32_UNORM,
-   PIPE_FORMAT_R32G32B32_UNORM,
-   PIPE_FORMAT_R32G32B32A32_UNORM
-};
-
-static GLuint uint_types_scale[4] = {
-   PIPE_FORMAT_R32_USCALED,
-   PIPE_FORMAT_R32G32_USCALED,
-   PIPE_FORMAT_R32G32B32_USCALED,
-   PIPE_FORMAT_R32G32B32A32_USCALED
-};
-
-static GLuint uint_types_int[4] = {
-   PIPE_FORMAT_R32_UINT,
-   PIPE_FORMAT_R32G32_UINT,
-   PIPE_FORMAT_R32G32B32_UINT,
-   PIPE_FORMAT_R32G32B32A32_UINT
-};
-
-static GLuint int_types_norm[4] = {
-   PIPE_FORMAT_R32_SNORM,
-   PIPE_FORMAT_R32G32_SNORM,
-   PIPE_FORMAT_R32G32B32_SNORM,
-   PIPE_FORMAT_R32G32B32A32_SNORM
-};
-
-static GLuint int_types_scale[4] = {
-   PIPE_FORMAT_R32_SSCALED,
-   PIPE_FORMAT_R32G32_SSCALED,
-   PIPE_FORMAT_R32G32B32_SSCALED,
-   PIPE_FORMAT_R32G32B32A32_SSCALED
-};
-
-static GLuint int_types_int[4] = {
-   PIPE_FORMAT_R32_SINT,
-   PIPE_FORMAT_R32G32_SINT,
-   PIPE_FORMAT_R32G32B32_SINT,
-   PIPE_FORMAT_R32G32B32A32_SINT
-};
-
-static GLuint ushort_types_norm[4] = {
-   PIPE_FORMAT_R16_UNORM,
-   PIPE_FORMAT_R16G16_UNORM,
-   PIPE_FORMAT_R16G16B16_UNORM,
-   PIPE_FORMAT_R16G16B16A16_UNORM
-};
-
-static GLuint ushort_types_scale[4] = {
-   PIPE_FORMAT_R16_USCALED,
-   PIPE_FORMAT_R16G16_USCALED,
-   PIPE_FORMAT_R16G16B16_USCALED,
-   PIPE_FORMAT_R16G16B16A16_USCALED
-};
-
-static GLuint ushort_types_int[4] = {
-   PIPE_FORMAT_R16_UINT,
-   PIPE_FORMAT_R16G16_UINT,
-   PIPE_FORMAT_R16G16B16_UINT,
-   PIPE_FORMAT_R16G16B16A16_UINT
-};
-
-static GLuint short_types_norm[4] = {
-   PIPE_FORMAT_R16_SNORM,
-   PIPE_FORMAT_R16G16_SNORM,
-   PIPE_FORMAT_R16G16B16_SNORM,
-   PIPE_FORMAT_R16G16B16A16_SNORM
-};
-
-static GLuint short_types_scale[4] = {
-   PIPE_FORMAT_R16_SSCALED,
-   PIPE_FORMAT_R16G16_SSCALED,
-   PIPE_FORMAT_R16G16B16_SSCALED,
-   PIPE_FORMAT_R16G16B16A16_SSCALED
-};
-
-static GLuint short_types_int[4] = {
-   PIPE_FORMAT_R16_SINT,
-   PIPE_FORMAT_R16G16_SINT,
-   PIPE_FORMAT_R16G16B16_SINT,
-   PIPE_FORMAT_R16G16B16A16_SINT
-};
-
-static GLuint ubyte_types_norm[4] = {
-   PIPE_FORMAT_R8_UNORM,
-   PIPE_FORMAT_R8G8_UNORM,
-   PIPE_FORMAT_R8G8B8_UNORM,
-   PIPE_FORMAT_R8G8B8A8_UNORM
-};
-
-static GLuint ubyte_types_scale[4] = {
-   PIPE_FORMAT_R8_USCALED,
-   PIPE_FORMAT_R8G8_USCALED,
-   PIPE_FORMAT_R8G8B8_USCALED,
-   PIPE_FORMAT_R8G8B8A8_USCALED
-};
-
-static GLuint ubyte_types_int[4] = {
-   PIPE_FORMAT_R8_UINT,
-   PIPE_FORMAT_R8G8_UINT,
-   PIPE_FORMAT_R8G8B8_UINT,
-   PIPE_FORMAT_R8G8B8A8_UINT
-};
-
-static GLuint byte_types_norm[4] = {
-   PIPE_FORMAT_R8_SNORM,
-   PIPE_FORMAT_R8G8_SNORM,
-   PIPE_FORMAT_R8G8B8_SNORM,
-   PIPE_FORMAT_R8G8B8A8_SNORM
-};
-
-static GLuint byte_types_scale[4] = {
-   PIPE_FORMAT_R8_SSCALED,
-   PIPE_FORMAT_R8G8_SSCALED,
-   PIPE_FORMAT_R8G8B8_SSCALED,
-   PIPE_FORMAT_R8G8B8A8_SSCALED
-};
-
-static GLuint byte_types_int[4] = {
-   PIPE_FORMAT_R8_SINT,
-   PIPE_FORMAT_R8G8_SINT,
-   PIPE_FORMAT_R8G8B8_SINT,
-   PIPE_FORMAT_R8G8B8A8_SINT
-};
-
-static GLuint fixed_types[4] = {
-   PIPE_FORMAT_R32_FIXED,
-   PIPE_FORMAT_R32G32_FIXED,
-   PIPE_FORMAT_R32G32B32_FIXED,
-   PIPE_FORMAT_R32G32B32A32_FIXED
+/* vertex_formats[gltype - GL_BYTE][integer*2 + normalized][size - 1] */
+static const uint16_t vertex_formats[][4][4] = {
+   { /* GL_BYTE */
+      {
+         PIPE_FORMAT_R8_SSCALED,
+         PIPE_FORMAT_R8G8_SSCALED,
+         PIPE_FORMAT_R8G8B8_SSCALED,
+         PIPE_FORMAT_R8G8B8A8_SSCALED
+      },
+      {
+         PIPE_FORMAT_R8_SNORM,
+         PIPE_FORMAT_R8G8_SNORM,
+         PIPE_FORMAT_R8G8B8_SNORM,
+         PIPE_FORMAT_R8G8B8A8_SNORM
+      },
+      {
+         PIPE_FORMAT_R8_SINT,
+         PIPE_FORMAT_R8G8_SINT,
+         PIPE_FORMAT_R8G8B8_SINT,
+         PIPE_FORMAT_R8G8B8A8_SINT
+      },
+   },
+   { /* GL_UNSIGNED_BYTE */
+      {
+         PIPE_FORMAT_R8_USCALED,
+         PIPE_FORMAT_R8G8_USCALED,
+         PIPE_FORMAT_R8G8B8_USCALED,
+         PIPE_FORMAT_R8G8B8A8_USCALED
+      },
+      {
+         PIPE_FORMAT_R8_UNORM,
+         PIPE_FORMAT_R8G8_UNORM,
+         PIPE_FORMAT_R8G8B8_UNORM,
+         PIPE_FORMAT_R8G8B8A8_UNORM
+      },
+      {
+         PIPE_FORMAT_R8_UINT,
+         PIPE_FORMAT_R8G8_UINT,
+         PIPE_FORMAT_R8G8B8_UINT,
+         PIPE_FORMAT_R8G8B8A8_UINT
+      },
+   },
+   { /* GL_SHORT */
+      {
+         PIPE_FORMAT_R16_SSCALED,
+         PIPE_FORMAT_R16G16_SSCALED,
+         PIPE_FORMAT_R16G16B16_SSCALED,
+         PIPE_FORMAT_R16G16B16A16_SSCALED
+      },
+      {
+         PIPE_FORMAT_R16_SNORM,
+         PIPE_FORMAT_R16G16_SNORM,
+         PIPE_FORMAT_R16G16B16_SNORM,
+         PIPE_FORMAT_R16G16B16A16_SNORM
+      },
+      {
+         PIPE_FORMAT_R16_SINT,
+         PIPE_FORMAT_R16G16_SINT,
+         PIPE_FORMAT_R16G16B16_SINT,
+         PIPE_FORMAT_R16G16B16A16_SINT
+      },
+   },
+   { /* GL_UNSIGNED_SHORT */
+      {
+         PIPE_FORMAT_R16_USCALED,
+         PIPE_FORMAT_R16G16_USCALED,
+         PIPE_FORMAT_R16G16B16_USCALED,
+         PIPE_FORMAT_R16G16B16A16_USCALED
+      },
+      {
+         PIPE_FORMAT_R16_UNORM,
+         PIPE_FORMAT_R16G16_UNORM,
+         PIPE_FORMAT_R16G16B16_UNORM,
+         PIPE_FORMAT_R16G16B16A16_UNORM
+      },
+      {
+         PIPE_FORMAT_R16_UINT,
+         PIPE_FORMAT_R16G16_UINT,
+         PIPE_FORMAT_R16G16B16_UINT,
+         PIPE_FORMAT_R16G16B16A16_UINT
+      },
+   },
+   { /* GL_INT */
+      {
+         PIPE_FORMAT_R32_SSCALED,
+         PIPE_FORMAT_R32G32_SSCALED,
+         PIPE_FORMAT_R32G32B32_SSCALED,
+         PIPE_FORMAT_R32G32B32A32_SSCALED
+      },
+      {
+         PIPE_FORMAT_R32_SNORM,
+         PIPE_FORMAT_R32G32_SNORM,
+         PIPE_FORMAT_R32G32B32_SNORM,
+         PIPE_FORMAT_R32G32B32A32_SNORM
+      },
+      {
+         PIPE_FORMAT_R32_SINT,
+         PIPE_FORMAT_R32G32_SINT,
+         PIPE_FORMAT_R32G32B32_SINT,
+         PIPE_FORMAT_R32G32B32A32_SINT
+      },
+   },
+   { /* GL_UNSIGNED_INT */
+      {
+         PIPE_FORMAT_R32_USCALED,
+         PIPE_FORMAT_R32G32_USCALED,
+         PIPE_FORMAT_R32G32B32_USCALED,
+         PIPE_FORMAT_R32G32B32A32_USCALED
+      },
+      {
+         PIPE_FORMAT_R32_UNORM,
+         PIPE_FORMAT_R32G32_UNORM,
+         PIPE_FORMAT_R32G32B32_UNORM,
+         PIPE_FORMAT_R32G32B32A32_UNORM
+      },
+      {
+         PIPE_FORMAT_R32_UINT,
+         PIPE_FORMAT_R32G32_UINT,
+         PIPE_FORMAT_R32G32B32_UINT,
+         PIPE_FORMAT_R32G32B32A32_UINT
+      },
+   },
+   { /* GL_FLOAT */
+      {
+         PIPE_FORMAT_R32_FLOAT,
+         PIPE_FORMAT_R32G32_FLOAT,
+         PIPE_FORMAT_R32G32B32_FLOAT,
+         PIPE_FORMAT_R32G32B32A32_FLOAT
+      },
+      {
+         PIPE_FORMAT_R32_FLOAT,
+         PIPE_FORMAT_R32G32_FLOAT,
+         PIPE_FORMAT_R32G32B32_FLOAT,
+         PIPE_FORMAT_R32G32B32A32_FLOAT
+      },
+   },
+   {{0}}, /* GL_2_BYTES */
+   {{0}}, /* GL_3_BYTES */
+   {{0}}, /* GL_4_BYTES */
+   { /* GL_DOUBLE */
+      {
+         PIPE_FORMAT_R64_FLOAT,
+         PIPE_FORMAT_R64G64_FLOAT,
+         PIPE_FORMAT_R64G64B64_FLOAT,
+         PIPE_FORMAT_R64G64B64A64_FLOAT
+      },
+      {
+         PIPE_FORMAT_R64_FLOAT,
+         PIPE_FORMAT_R64G64_FLOAT,
+         PIPE_FORMAT_R64G64B64_FLOAT,
+         PIPE_FORMAT_R64G64B64A64_FLOAT
+      },
+   },
+   { /* GL_HALF_FLOAT */
+      {
+         PIPE_FORMAT_R16_FLOAT,
+         PIPE_FORMAT_R16G16_FLOAT,
+         PIPE_FORMAT_R16G16B16_FLOAT,
+         PIPE_FORMAT_R16G16B16A16_FLOAT
+      },
+      {
+         PIPE_FORMAT_R16_FLOAT,
+         PIPE_FORMAT_R16G16_FLOAT,
+         PIPE_FORMAT_R16G16B16_FLOAT,
+         PIPE_FORMAT_R16G16B16A16_FLOAT
+      },
+   },
+   { /* GL_FIXED */
+      {
+         PIPE_FORMAT_R32_FIXED,
+         PIPE_FORMAT_R32G32_FIXED,
+         PIPE_FORMAT_R32G32B32_FIXED,
+         PIPE_FORMAT_R32G32B32A32_FIXED
+      },
+      {
+         PIPE_FORMAT_R32_FIXED,
+         PIPE_FORMAT_R32G32_FIXED,
+         PIPE_FORMAT_R32G32B32_FIXED,
+         PIPE_FORMAT_R32G32B32A32_FIXED
+      },
+   },
 };
 
 
@@ -210,115 +239,71 @@
 st_pipe_vertex_format(GLenum type, GLuint size, GLenum format,
                       GLboolean normalized, GLboolean integer)
 {
-   assert((type >= GL_BYTE && type <= GL_DOUBLE) ||
-          type == GL_FIXED || type == GL_HALF_FLOAT ||
-          type == GL_HALF_FLOAT_OES ||
-          type == GL_INT_2_10_10_10_REV ||
-          type == GL_UNSIGNED_INT_2_10_10_10_REV ||
-          type == GL_UNSIGNED_INT_10F_11F_11F_REV);
-   assert(size >= 1);
-   assert(size <= 4);
+   unsigned index;
+
+   assert(size >= 1 && size <= 4);
    assert(format == GL_RGBA || format == GL_BGRA);
 
-   if (type == GL_INT_2_10_10_10_REV ||
-       type == GL_UNSIGNED_INT_2_10_10_10_REV) {
-      assert(size == 4);
-      assert(!integer);
+   switch (type) {
+   case GL_HALF_FLOAT_OES:
+      type = GL_HALF_FLOAT;
+      break;
+
+   case GL_INT_2_10_10_10_REV:
+      assert(size == 4 && !integer);
 
       if (format == GL_BGRA) {
-         if (type == GL_INT_2_10_10_10_REV) {
-            if (normalized)
-               return PIPE_FORMAT_B10G10R10A2_SNORM;
-            else
-               return PIPE_FORMAT_B10G10R10A2_SSCALED;
-         } else {
-            if (normalized)
-               return PIPE_FORMAT_B10G10R10A2_UNORM;
-            else
-               return PIPE_FORMAT_B10G10R10A2_USCALED;
-         }
+         if (normalized)
+            return PIPE_FORMAT_B10G10R10A2_SNORM;
+         else
+            return PIPE_FORMAT_B10G10R10A2_SSCALED;
       } else {
-         if (type == GL_INT_2_10_10_10_REV) {
-            if (normalized)
-               return PIPE_FORMAT_R10G10B10A2_SNORM;
-            else
-               return PIPE_FORMAT_R10G10B10A2_SSCALED;
-         } else {
-            if (normalized)
-               return PIPE_FORMAT_R10G10B10A2_UNORM;
-            else
-               return PIPE_FORMAT_R10G10B10A2_USCALED;
-         }
+         if (normalized)
+            return PIPE_FORMAT_R10G10B10A2_SNORM;
+         else
+            return PIPE_FORMAT_R10G10B10A2_SSCALED;
       }
-   }
+      break;
 
-   if (type == GL_UNSIGNED_INT_10F_11F_11F_REV) {
-      assert(size == 3);
-      assert(!integer);
-      assert(format == GL_RGBA);
+   case GL_UNSIGNED_INT_2_10_10_10_REV:
+      assert(size == 4 && !integer);
 
+      if (format == GL_BGRA) {
+         if (normalized)
+            return PIPE_FORMAT_B10G10R10A2_UNORM;
+         else
+            return PIPE_FORMAT_B10G10R10A2_USCALED;
+      } else {
+         if (normalized)
+            return PIPE_FORMAT_R10G10B10A2_UNORM;
+         else
+            return PIPE_FORMAT_R10G10B10A2_USCALED;
+      }
+      break;
+
+   case GL_UNSIGNED_INT_10F_11F_11F_REV:
+      assert(size == 3 && !integer && format == GL_RGBA);
       return PIPE_FORMAT_R11G11B10_FLOAT;
+
+   case GL_UNSIGNED_BYTE:
+      if (format == GL_BGRA) {
+         /* this is an odd-ball case */
+         assert(normalized);
+         return PIPE_FORMAT_B8G8R8A8_UNORM;
+      }
+      break;
    }
 
-   if (format == GL_BGRA) {
-      /* this is an odd-ball case */
-      assert(type == GL_UNSIGNED_BYTE);
-      assert(normalized);
-      return PIPE_FORMAT_B8G8R8A8_UNORM;
-   }
-
-   if (integer) {
-      switch (type) {
-      case GL_INT: return int_types_int[size-1];
-      case GL_SHORT: return short_types_int[size-1];
-      case GL_BYTE: return byte_types_int[size-1];
-      case GL_UNSIGNED_INT: return uint_types_int[size-1];
-      case GL_UNSIGNED_SHORT: return ushort_types_int[size-1];
-      case GL_UNSIGNED_BYTE: return ubyte_types_int[size-1];
-      default: assert(0); return 0;
-      }
-   }
-   else if (normalized) {
-      switch (type) {
-      case GL_DOUBLE: return double_types[size-1];
-      case GL_FLOAT: return float_types[size-1];
-      case GL_HALF_FLOAT:
-      case GL_HALF_FLOAT_OES: return half_float_types[size-1];
-      case GL_INT: return int_types_norm[size-1];
-      case GL_SHORT: return short_types_norm[size-1];
-      case GL_BYTE: return byte_types_norm[size-1];
-      case GL_UNSIGNED_INT: return uint_types_norm[size-1];
-      case GL_UNSIGNED_SHORT: return ushort_types_norm[size-1];
-      case GL_UNSIGNED_BYTE: return ubyte_types_norm[size-1];
-      case GL_FIXED: return fixed_types[size-1];
-      default: assert(0); return 0;
-      }
-   }
-   else {
-      switch (type) {
-      case GL_DOUBLE: return double_types[size-1];
-      case GL_FLOAT: return float_types[size-1];
-      case GL_HALF_FLOAT:
-      case GL_HALF_FLOAT_OES: return half_float_types[size-1];
-      case GL_INT: return int_types_scale[size-1];
-      case GL_SHORT: return short_types_scale[size-1];
-      case GL_BYTE: return byte_types_scale[size-1];
-      case GL_UNSIGNED_INT: return uint_types_scale[size-1];
-      case GL_UNSIGNED_SHORT: return ushort_types_scale[size-1];
-      case GL_UNSIGNED_BYTE: return ubyte_types_scale[size-1];
-      case GL_FIXED: return fixed_types[size-1];
-      default: assert(0); return 0;
-      }
-   }
-   return PIPE_FORMAT_NONE; /* silence compiler warning */
+   index = integer*2 + normalized;
+   assert(index <= 2);
+   assert(type >= GL_BYTE && type <= GL_FIXED);
+   return vertex_formats[type - GL_BYTE][index][size-1];
 }
 
 static const struct gl_vertex_array *
-get_client_array(const struct st_vertex_program *vp,
-                 const struct gl_vertex_array **arrays,
-                 int attr)
+get_client_array(const struct gl_vertex_array **arrays,
+                 unsigned mesaAttr)
 {
-   const GLuint mesaAttr = vp->index_to_input[attr];
    /* st_program uses 0xffffffff to denote a double placeholder attribute */
    if (mesaAttr == ST_DOUBLE_ATTRIB_PLACEHOLDER)
       return NULL;
@@ -331,8 +316,8 @@
  */
 static GLboolean
 is_interleaved_arrays(const struct st_vertex_program *vp,
-                      const struct st_vp_variant *vpv,
-                      const struct gl_vertex_array **arrays)
+                      const struct gl_vertex_array **arrays,
+                      unsigned num_inputs)
 {
    GLuint attr;
    const struct gl_buffer_object *firstBufObj = NULL;
@@ -340,16 +325,21 @@
    const GLubyte *firstPtr = NULL;
    GLboolean userSpaceBuffer = GL_FALSE;
 
-   for (attr = 0; attr < vpv->num_inputs; attr++) {
+   for (attr = 0; attr < num_inputs; attr++) {
       const struct gl_vertex_array *array;
       const struct gl_buffer_object *bufObj;
       GLsizei stride;
 
-      array = get_client_array(vp, arrays, attr);
+      array = get_client_array(arrays, vp->index_to_input[attr]);
       if (!array)
 	 continue;
 
       stride = array->StrideB; /* in bytes */
+
+      /* To keep things simple, don't allow interleaved zero-stride attribs. */
+      if (stride == 0)
+         return false;
+
       bufObj = array->BufferObj;
       if (attr == 0) {
          /* save info about the first array */
@@ -388,8 +378,7 @@
    assert(velement->src_format);
 }
 
-static void init_velement_lowered(struct st_context *st,
-                                  const struct st_vertex_program *vp,
+static void init_velement_lowered(const struct st_vertex_program *vp,
                                   struct pipe_vertex_element *velements,
                                   int src_offset, int format,
                                   int instance_divisor, int vbo_index,
@@ -437,20 +426,39 @@
    *attr_idx = idx;
 }
 
+static void
+set_vertex_attribs(struct st_context *st,
+                   struct pipe_vertex_buffer *vbuffers,
+                   unsigned num_vbuffers,
+                   struct pipe_vertex_element *velements,
+                   unsigned num_velements)
+{
+   struct cso_context *cso = st->cso_context;
+
+   cso_set_vertex_buffers(cso, 0, num_vbuffers, vbuffers);
+   if (st->last_num_vbuffers > num_vbuffers) {
+      /* Unbind remaining buffers, if any. */
+      cso_set_vertex_buffers(cso, num_vbuffers,
+                             st->last_num_vbuffers - num_vbuffers, NULL);
+   }
+   st->last_num_vbuffers = num_vbuffers;
+   cso_set_vertex_elements(cso, num_velements, velements);
+}
+
 /**
  * Set up for drawing interleaved arrays that all live in one VBO
  * or all live in user space.
  * \param vbuffer  returns vertex buffer info
  * \param velements  returns vertex element info
  */
-static boolean
+static void
 setup_interleaved_attribs(struct st_context *st,
                           const struct st_vertex_program *vp,
-                          const struct st_vp_variant *vpv,
                           const struct gl_vertex_array **arrays,
-                          struct pipe_vertex_buffer *vbuffer,
-                          struct pipe_vertex_element velements[])
+                          unsigned num_inputs)
 {
+   struct pipe_vertex_buffer vbuffer;
+   struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS] = {{0}};
    GLuint attr;
    const GLubyte *low_addr = NULL;
    GLboolean usingVBO;      /* all arrays in a VBO? */
@@ -460,10 +468,10 @@
    /* Find the lowest address of the arrays we're drawing,
     * Init bufobj and stride.
     */
-   if (vpv->num_inputs) {
+   if (num_inputs) {
       const struct gl_vertex_array *array;
 
-      array = get_client_array(vp, arrays, 0);
+      array = get_client_array(arrays, vp->index_to_input[0]);
       assert(array);
 
       /* Since we're doing interleaved arrays, we know there'll be at most
@@ -475,9 +483,9 @@
 
       low_addr = arrays[vp->index_to_input[0]]->Ptr;
 
-      for (attr = 1; attr < vpv->num_inputs; attr++) {
+      for (attr = 1; attr < num_inputs; attr++) {
          const GLubyte *start;
-         array = get_client_array(vp, arrays, attr);
+         array = get_client_array(arrays, vp->index_to_input[attr]);
          if (!array)
             continue;
          start = array->Ptr;
@@ -494,12 +502,12 @@
    /* are the arrays in user space? */
    usingVBO = _mesa_is_bufferobj(bufobj);
 
-   for (attr = 0; attr < vpv->num_inputs;) {
+   for (attr = 0; attr < num_inputs;) {
       const struct gl_vertex_array *array;
       unsigned src_offset;
       unsigned src_format;
 
-      array = get_client_array(vp, arrays, attr);
+      array = get_client_array(arrays, vp->index_to_input[attr]);
       assert(array);
 
       src_offset = (unsigned) (array->Ptr - low_addr);
@@ -512,7 +520,7 @@
                                          array->Normalized,
                                          array->Integer);
 
-      init_velement_lowered(st, vp, velements, src_offset, src_format,
+      init_velement_lowered(vp, velements, src_offset, src_format,
                             array->InstanceDivisor, 0,
                             array->Size, array->Doubles, &attr);
    }
@@ -520,34 +528,40 @@
    /*
     * Return the vbuffer info and setup user-space attrib info, if needed.
     */
-   if (vpv->num_inputs == 0) {
+   if (num_inputs == 0) {
       /* just defensive coding here */
-      vbuffer->buffer = NULL;
-      vbuffer->user_buffer = NULL;
-      vbuffer->buffer_offset = 0;
-      vbuffer->stride = 0;
+      vbuffer.buffer.resource = NULL;
+      vbuffer.is_user_buffer = false;
+      vbuffer.buffer_offset = 0;
+      vbuffer.stride = 0;
    }
    else if (usingVBO) {
       /* all interleaved arrays in a VBO */
       struct st_buffer_object *stobj = st_buffer_object(bufobj);
 
       if (!stobj || !stobj->buffer) {
-         return FALSE; /* out-of-memory error probably */
+         st->vertex_array_out_of_memory = true;
+         return; /* out-of-memory error probably */
       }
 
-      vbuffer->buffer = stobj->buffer;
-      vbuffer->user_buffer = NULL;
-      vbuffer->buffer_offset = pointer_to_offset(low_addr);
-      vbuffer->stride = stride;
+      vbuffer.buffer.resource = stobj->buffer;
+      vbuffer.is_user_buffer = false;
+      vbuffer.buffer_offset = pointer_to_offset(low_addr);
+      vbuffer.stride = stride;
    }
    else {
       /* all interleaved arrays in user memory */
-      vbuffer->buffer = NULL;
-      vbuffer->user_buffer = low_addr;
-      vbuffer->buffer_offset = 0;
-      vbuffer->stride = stride;
+      vbuffer.buffer.user = low_addr;
+      vbuffer.is_user_buffer = !!low_addr; /* if NULL, then unbind */
+      vbuffer.buffer_offset = 0;
+      vbuffer.stride = stride;
+
+      if (low_addr)
+         st->draw_needs_minmax_index = true;
    }
-   return TRUE;
+
+   set_vertex_attribs(st, &vbuffer, num_inputs ? 1 : 0,
+                      velements, num_inputs);
 }
 
 /**
@@ -556,32 +570,31 @@
  * \param vbuffer  returns vertex buffer info
  * \param velements  returns vertex element info
  */
-static boolean
+static void
 setup_non_interleaved_attribs(struct st_context *st,
                               const struct st_vertex_program *vp,
-                              const struct st_vp_variant *vpv,
                               const struct gl_vertex_array **arrays,
-                              struct pipe_vertex_buffer vbuffer[],
-                              struct pipe_vertex_element velements[],
-                              unsigned *num_vbuffers)
+                              unsigned num_inputs)
 {
    struct gl_context *ctx = st->ctx;
+   struct pipe_vertex_buffer vbuffer[PIPE_MAX_ATTRIBS];
+   struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS] = {{0}};
+   unsigned num_vbuffers = 0;
+   unsigned unref_buffers = 0;
    GLuint attr;
 
-   *num_vbuffers = 0;
-
-   for (attr = 0; attr < vpv->num_inputs;) {
-      const GLuint mesaAttr = vp->index_to_input[attr];
+   for (attr = 0; attr < num_inputs;) {
+      const unsigned mesaAttr = vp->index_to_input[attr];
       const struct gl_vertex_array *array;
       struct gl_buffer_object *bufobj;
       GLsizei stride;
       unsigned src_format;
       unsigned bufidx;
 
-      array = get_client_array(vp, arrays, attr);
+      array = get_client_array(arrays, mesaAttr);
       assert(array);
 
-      bufidx = (*num_vbuffers)++;
+      bufidx = num_vbuffers++;
 
       stride = array->StrideB;
       bufobj = array->BufferObj;
@@ -596,31 +609,51 @@
          struct st_buffer_object *stobj = st_buffer_object(bufobj);
 
          if (!stobj || !stobj->buffer) {
-            return FALSE; /* out-of-memory error probably */
+            st->vertex_array_out_of_memory = true;
+            return; /* out-of-memory error probably */
          }
 
-         vbuffer[bufidx].buffer = stobj->buffer;
-         vbuffer[bufidx].user_buffer = NULL;
+         vbuffer[bufidx].buffer.resource = stobj->buffer;
+         vbuffer[bufidx].is_user_buffer = false;
          vbuffer[bufidx].buffer_offset = pointer_to_offset(array->Ptr);
       }
       else {
-         /* wrap user data */
-         void *ptr;
+         if (stride == 0) {
+            unsigned size = array->_ElementSize;
+            /* This is optimal for GPU cache line usage if the upload size
+             * is <= cache line size.
+             */
+            unsigned alignment = util_next_power_of_two(size);
+            void *ptr = array->Ptr ? (void*)array->Ptr :
+                                     (void*)ctx->Current.Attrib[mesaAttr];
 
-         if (array->Ptr) {
-            ptr = (void *) array->Ptr;
+            vbuffer[bufidx].is_user_buffer = false;
+            vbuffer[bufidx].buffer.resource = NULL;
+
+            /* Use const_uploader for zero-stride vertex attributes, because
+             * it may use a better memory placement than stream_uploader.
+             * The reason is that zero-stride attributes can be fetched many
+             * times (thousands of times), so a better placement is going to
+             * perform better.
+             *
+             * Upload only the element size; the maximum is 4x GLdouble = 32.
+             */
+            u_upload_data(st->can_bind_const_buffer_as_vertex ?
+                             st->pipe->const_uploader :
+                             st->pipe->stream_uploader,
+                          0, size, alignment, ptr,
+                          &vbuffer[bufidx].buffer_offset,
+                          &vbuffer[bufidx].buffer.resource);
+            unref_buffers |= 1u << bufidx;
+         } else {
+            assert(array->Ptr);
+            vbuffer[bufidx].buffer.user = array->Ptr;
+            vbuffer[bufidx].is_user_buffer = true;
+            vbuffer[bufidx].buffer_offset = 0;
+
+            if (!array->InstanceDivisor)
+               st->draw_needs_minmax_index = true;
          }
-         else {
-            /* no array, use ctx->Current.Attrib[] value */
-            ptr = (void *) ctx->Current.Attrib[mesaAttr];
-            stride = 0;
-         }
-
-         assert(ptr);
-
-         vbuffer[bufidx].buffer = NULL;
-         vbuffer[bufidx].user_buffer = ptr;
-         vbuffer[bufidx].buffer_offset = 0;
       }
 
       /* common-case setup */
@@ -632,25 +665,33 @@
                                          array->Normalized,
                                          array->Integer);
 
-      init_velement_lowered(st, vp, velements, 0, src_format,
+      init_velement_lowered(vp, velements, 0, src_format,
                             array->InstanceDivisor, bufidx,
                             array->Size, array->Doubles, &attr);
    }
 
-   return TRUE;
+   if (!ctx->Const.AllowMappedBuffersDuringExecution) {
+      u_upload_unmap(st->pipe->stream_uploader);
+   }
+
+   set_vertex_attribs(st, vbuffer, num_vbuffers, velements, num_inputs);
+
+   /* Unreference uploaded zero-stride vertex buffers. */
+   while (unref_buffers) {
+      unsigned i = u_bit_scan(&unref_buffers);
+      pipe_resource_reference(&vbuffer[i].buffer.resource, NULL);
+   }
 }
 
-static void update_array(struct st_context *st)
+void st_update_array(struct st_context *st)
 {
    struct gl_context *ctx = st->ctx;
    const struct gl_vertex_array **arrays = ctx->Array._DrawArrays;
    const struct st_vertex_program *vp;
-   const struct st_vp_variant *vpv;
-   struct pipe_vertex_buffer vbuffer[PIPE_MAX_SHADER_INPUTS];
-   struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS];
-   unsigned num_vbuffers;
+   unsigned num_inputs;
 
    st->vertex_array_out_of_memory = FALSE;
+   st->draw_needs_minmax_index = false;
 
    /* No drawing has been done yet, so do nothing. */
    if (!arrays)
@@ -658,42 +699,10 @@
 
    /* vertex program validation must be done before this */
    vp = st->vp;
-   vpv = st->vp_variant;
+   num_inputs = st->vp_variant->num_inputs;
 
-   memset(velements, 0, sizeof(struct pipe_vertex_element) * vpv->num_inputs);
-
-   /*
-    * Setup the vbuffer[] and velements[] arrays.
-    */
-   if (is_interleaved_arrays(vp, vpv, arrays)) {
-      if (!setup_interleaved_attribs(st, vp, vpv, arrays, vbuffer, velements)) {
-         st->vertex_array_out_of_memory = TRUE;
-         return;
-      }
-
-      num_vbuffers = 1;
-      if (vpv->num_inputs == 0)
-         num_vbuffers = 0;
-   }
-   else {
-      if (!setup_non_interleaved_attribs(st, vp, vpv, arrays, vbuffer,
-                                         velements, &num_vbuffers)) {
-         st->vertex_array_out_of_memory = TRUE;
-         return;
-      }
-   }
-
-   cso_set_vertex_buffers(st->cso_context, 0, num_vbuffers, vbuffer);
-   if (st->last_num_vbuffers > num_vbuffers) {
-      /* Unbind remaining buffers, if any. */
-      cso_set_vertex_buffers(st->cso_context, num_vbuffers,
-                             st->last_num_vbuffers - num_vbuffers, NULL);
-   }
-   st->last_num_vbuffers = num_vbuffers;
-   cso_set_vertex_elements(st->cso_context, vpv->num_inputs, velements);
+   if (is_interleaved_arrays(vp, arrays, num_inputs))
+      setup_interleaved_attribs(st, vp, arrays, num_inputs);
+   else
+      setup_non_interleaved_attribs(st, vp, arrays, num_inputs);
 }
-
-
-const struct st_tracked_state st_update_array = {
-   update_array						/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_atomicbuf.c b/src/mesa/state_tracker/st_atom_atomicbuf.c
index 11fb84f..171fecc 100644
--- a/src/mesa/state_tracker/st_atom_atomicbuf.c
+++ b/src/mesa/state_tracker/st_atom_atomicbuf.c
@@ -69,8 +69,8 @@
    }
 }
 
-static void
-bind_vs_atomics(struct st_context *st)
+void
+st_bind_vs_atomics(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
@@ -78,12 +78,8 @@
    st_bind_atomics(st, prog, PIPE_SHADER_VERTEX);
 }
 
-const struct st_tracked_state st_bind_vs_atomics = {
-   bind_vs_atomics
-};
-
-static void
-bind_fs_atomics(struct st_context *st)
+void
+st_bind_fs_atomics(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT];
@@ -91,12 +87,8 @@
    st_bind_atomics(st, prog, PIPE_SHADER_FRAGMENT);
 }
 
-const struct st_tracked_state st_bind_fs_atomics = {
-   bind_fs_atomics
-};
-
-static void
-bind_gs_atomics(struct st_context *st)
+void
+st_bind_gs_atomics(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
@@ -104,12 +96,8 @@
    st_bind_atomics(st, prog, PIPE_SHADER_GEOMETRY);
 }
 
-const struct st_tracked_state st_bind_gs_atomics = {
-   bind_gs_atomics
-};
-
-static void
-bind_tcs_atomics(struct st_context *st)
+void
+st_bind_tcs_atomics(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
@@ -117,12 +105,8 @@
    st_bind_atomics(st, prog, PIPE_SHADER_TESS_CTRL);
 }
 
-const struct st_tracked_state st_bind_tcs_atomics = {
-   bind_tcs_atomics
-};
-
-static void
-bind_tes_atomics(struct st_context *st)
+void
+st_bind_tes_atomics(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
@@ -130,19 +114,11 @@
    st_bind_atomics(st, prog, PIPE_SHADER_TESS_EVAL);
 }
 
-const struct st_tracked_state st_bind_tes_atomics = {
-   bind_tes_atomics
-};
-
-static void
-bind_cs_atomics(struct st_context *st)
+void
+st_bind_cs_atomics(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
 
    st_bind_atomics(st, prog, PIPE_SHADER_COMPUTE);
 }
-
-const struct st_tracked_state st_bind_cs_atomics = {
-   bind_cs_atomics
-};
diff --git a/src/mesa/state_tracker/st_atom_blend.c b/src/mesa/state_tracker/st_atom_blend.c
index f76cfab..103c500 100644
--- a/src/mesa/state_tracker/st_atom_blend.c
+++ b/src/mesa/state_tracker/st_atom_blend.c
@@ -39,6 +39,7 @@
 #include "pipe/p_defines.h"
 #include "cso_cache/cso_context.h"
 
+#include "framebuffer.h"
 #include "main/macros.h"
 
 /**
@@ -186,8 +187,8 @@
    return GL_FALSE;
 }
 
-static void 
-update_blend( struct st_context *st )
+void
+st_update_blend( struct st_context *st )
 {
    struct pipe_blend_state *blend = &st->state.blend;
    const struct gl_context *ctx = st->ctx;
@@ -265,8 +266,7 @@
 
    blend->dither = ctx->Color.DitherFlag;
 
-   if (ctx->Multisample.Enabled &&
-       ctx->DrawBuffer->Visual.sampleBuffers > 0 &&
+   if (_mesa_is_multisample_enabled(ctx) &&
        !(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
       /* Unlike in gallium/d3d10 these operations are only performed
        * if both msaa is enabled and we have a multisample buffer.
@@ -276,15 +276,13 @@
    }
 
    cso_set_blend(st->cso_context, blend);
-
-   {
-      struct pipe_blend_color bc;
-      COPY_4FV(bc.color, ctx->Color.BlendColorUnclamped);
-      cso_set_blend_color(st->cso_context, &bc);
-   }
 }
 
+void
+st_update_blend_color(struct st_context *st)
+{
+   struct pipe_blend_color bc;
 
-const struct st_tracked_state st_update_blend = {
-   update_blend,					/* update */
-};
+   COPY_4FV(bc.color, st->ctx->Color.BlendColorUnclamped);
+   cso_set_blend_color(st->cso_context, &bc);
+}
diff --git a/src/mesa/state_tracker/st_atom_clip.c b/src/mesa/state_tracker/st_atom_clip.c
index 0df7985..0db3a5d 100644
--- a/src/mesa/state_tracker/st_atom_clip.c
+++ b/src/mesa/state_tracker/st_atom_clip.c
@@ -41,7 +41,7 @@
 
 /* Second state atom for user clip planes:
  */
-static void update_clip( struct st_context *st )
+void st_update_clip( struct st_context *st )
 {
    struct pipe_clip_state clip;
    const struct gl_context *ctx = st->ctx;
@@ -66,8 +66,3 @@
       st->pipe->set_clip_state(st->pipe, &clip);
    }
 }
-
-
-const struct st_tracked_state st_update_clip = {
-   update_clip						/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index d16f92e..e4b5851 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -51,12 +51,11 @@
 /**
  * Pass the given program parameters to the graphics pipe as a
  * constant buffer.
- * \param shader_type  either PIPE_SHADER_VERTEX or PIPE_SHADER_FRAGMENT
  */
-void st_upload_constants( struct st_context *st,
-                          struct gl_program_parameter_list *params,
-                          gl_shader_stage stage)
+void st_upload_constants(struct st_context *st, struct gl_program *prog)
 {
+   gl_shader_stage stage = prog->info.stage;
+   struct gl_program_parameter_list *params = prog->Parameters;
    enum pipe_shader_type shader_type = st_shader_stage_to_ptarget(stage);
 
    assert(shader_type == PIPE_SHADER_VERTEX ||
@@ -81,6 +80,12 @@
       }
    }
 
+   /* Make all bindless samplers/images bound to texture/image units
+    * resident in the context.
+    */
+   st_make_bound_samplers_resident(st, prog);
+   st_make_bound_images_resident(st, prog);
+
    /* update constants */
    if (params && params->NumParameters) {
       struct pipe_constant_buffer cb;
@@ -139,105 +144,60 @@
 /**
  * Vertex shader:
  */
-static void update_vs_constants(struct st_context *st )
+void st_update_vs_constants(struct st_context *st )
 {
-   struct st_vertex_program *vp = st->vp;
-   struct gl_program_parameter_list *params = vp->Base.Parameters;
-
-   st_upload_constants( st, params, MESA_SHADER_VERTEX );
+   st_upload_constants(st, &st->vp->Base);
 }
 
-
-const struct st_tracked_state st_update_vs_constants = {
-   update_vs_constants					/* update */
-};
-
-
-
 /**
  * Fragment shader:
  */
-static void update_fs_constants(struct st_context *st )
+void st_update_fs_constants(struct st_context *st )
 {
-   struct st_fragment_program *fp = st->fp;
-   struct gl_program_parameter_list *params = fp->Base.Parameters;
-
-   st_upload_constants( st, params, MESA_SHADER_FRAGMENT );
+   st_upload_constants(st, &st->fp->Base);
 }
 
 
-const struct st_tracked_state st_update_fs_constants = {
-   update_fs_constants					/* update */
-};
-
 /* Geometry shader:
  */
-static void update_gs_constants(struct st_context *st )
+void st_update_gs_constants(struct st_context *st )
 {
-   struct st_geometry_program *gp = st->gp;
-   struct gl_program_parameter_list *params;
+   struct st_common_program *gp = st->gp;
 
-   if (gp) {
-      params = gp->Base.Parameters;
-      st_upload_constants( st, params, MESA_SHADER_GEOMETRY );
-   }
+   if (gp)
+      st_upload_constants(st, &gp->Base);
 }
 
-const struct st_tracked_state st_update_gs_constants = {
-   update_gs_constants					/* update */
-};
-
 /* Tessellation control shader:
  */
-static void update_tcs_constants(struct st_context *st )
+void st_update_tcs_constants(struct st_context *st )
 {
-   struct st_tessctrl_program *tcp = st->tcp;
-   struct gl_program_parameter_list *params;
+   struct st_common_program *tcp = st->tcp;
 
-   if (tcp) {
-      params = tcp->Base.Parameters;
-      st_upload_constants( st, params, MESA_SHADER_TESS_CTRL );
-   }
+   if (tcp)
+      st_upload_constants(st, &tcp->Base);
 }
 
-const struct st_tracked_state st_update_tcs_constants = {
-   update_tcs_constants					/* update */
-};
-
 /* Tessellation evaluation shader:
  */
-static void update_tes_constants(struct st_context *st )
+void st_update_tes_constants(struct st_context *st )
 {
-   struct st_tesseval_program *tep = st->tep;
-   struct gl_program_parameter_list *params;
+   struct st_common_program *tep = st->tep;
 
-   if (tep) {
-      params = tep->Base.Parameters;
-      st_upload_constants( st, params, MESA_SHADER_TESS_EVAL );
-   }
+   if (tep)
+      st_upload_constants(st, &tep->Base);
 }
 
-const struct st_tracked_state st_update_tes_constants = {
-   update_tes_constants					/* update */
-};
-
 /* Compute shader:
  */
-static void update_cs_constants(struct st_context *st )
+void st_update_cs_constants(struct st_context *st )
 {
    struct st_compute_program *cp = st->cp;
-   struct gl_program_parameter_list *params;
 
-   if (cp) {
-      params = cp->Base.Parameters;
-      st_upload_constants( st, params, MESA_SHADER_COMPUTE );
-   }
+   if (cp)
+      st_upload_constants(st, &cp->Base);
 }
 
-const struct st_tracked_state st_update_cs_constants = {
-   update_cs_constants					/* update */
-};
-
 static void st_bind_ubos(struct st_context *st, struct gl_program *prog,
                          unsigned shader_type)
 {
@@ -276,7 +236,7 @@
    }
 }
 
-static void bind_vs_ubos(struct st_context *st)
+void st_bind_vs_ubos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
@@ -284,11 +244,7 @@
    st_bind_ubos(st, prog, PIPE_SHADER_VERTEX);
 }
 
-const struct st_tracked_state st_bind_vs_ubos = {
-   bind_vs_ubos
-};
-
-static void bind_fs_ubos(struct st_context *st)
+void st_bind_fs_ubos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT];
@@ -296,11 +252,7 @@
    st_bind_ubos(st, prog, PIPE_SHADER_FRAGMENT);
 }
 
-const struct st_tracked_state st_bind_fs_ubos = {
-   bind_fs_ubos
-};
-
-static void bind_gs_ubos(struct st_context *st)
+void st_bind_gs_ubos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
@@ -308,11 +260,7 @@
    st_bind_ubos(st, prog, PIPE_SHADER_GEOMETRY);
 }
 
-const struct st_tracked_state st_bind_gs_ubos = {
-   bind_gs_ubos
-};
-
-static void bind_tcs_ubos(struct st_context *st)
+void st_bind_tcs_ubos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
@@ -320,11 +268,7 @@
    st_bind_ubos(st, prog, PIPE_SHADER_TESS_CTRL);
 }
 
-const struct st_tracked_state st_bind_tcs_ubos = {
-   bind_tcs_ubos
-};
-
-static void bind_tes_ubos(struct st_context *st)
+void st_bind_tes_ubos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
@@ -332,18 +276,10 @@
    st_bind_ubos(st, prog, PIPE_SHADER_TESS_EVAL);
 }
 
-const struct st_tracked_state st_bind_tes_ubos = {
-   bind_tes_ubos
-};
-
-static void bind_cs_ubos(struct st_context *st)
+void st_bind_cs_ubos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
 
    st_bind_ubos(st, prog, PIPE_SHADER_COMPUTE);
 }
-
-const struct st_tracked_state st_bind_cs_ubos = {
-   bind_cs_ubos
-};
diff --git a/src/mesa/state_tracker/st_atom_constbuf.h b/src/mesa/state_tracker/st_atom_constbuf.h
index df60a62..e810a24 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.h
+++ b/src/mesa/state_tracker/st_atom_constbuf.h
@@ -35,9 +35,7 @@
 struct st_context;
 
 
-void st_upload_constants( struct st_context *st,
-                          struct gl_program_parameter_list *params,
-                          gl_shader_stage stage);
+void st_upload_constants(struct st_context *st, struct gl_program *prog);
 
 
 #endif /* ST_ATOM_CONSTBUF_H */
diff --git a/src/mesa/state_tracker/st_atom_depth.c b/src/mesa/state_tracker/st_atom_depth.c
index 7092c3f..3b2beaf 100644
--- a/src/mesa/state_tracker/st_atom_depth.c
+++ b/src/mesa/state_tracker/st_atom_depth.c
@@ -95,8 +95,8 @@
    }
 }
 
-static void
-update_depth_stencil_alpha(struct st_context *st)
+void
+st_update_depth_stencil_alpha(struct st_context *st)
 {
    struct pipe_depth_stencil_alpha_state *dsa = &st->state.depth_stencil;
    struct pipe_stencil_ref sr;
@@ -128,7 +128,7 @@
       dsa->stencil[0].writemask = ctx->Stencil.WriteMask[0] & 0xff;
       sr.ref_value[0] = _mesa_get_stencil_ref(ctx, 0);
 
-      if (ctx->Stencil._TestTwoSide) {
+      if (_mesa_stencil_is_two_sided(ctx)) {
          const GLuint back = ctx->Stencil._BackFace;
          dsa->stencil[1].enabled = 1;
          dsa->stencil[1].func = st_compare_func_to_pipe(ctx->Stencil.Function[back]);
@@ -159,8 +159,3 @@
    cso_set_depth_stencil_alpha(st->cso_context, dsa);
    cso_set_stencil_ref(st->cso_context, &sr);
 }
-
-
-const struct st_tracked_state st_update_depth_stencil_alpha = {
-   update_depth_stencil_alpha				/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_framebuffer.c b/src/mesa/state_tracker/st_atom_framebuffer.c
index 7435c00..acbe980 100644
--- a/src/mesa/state_tracker/st_atom_framebuffer.c
+++ b/src/mesa/state_tracker/st_atom_framebuffer.c
@@ -43,6 +43,7 @@
 #include "util/u_math.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
+#include "util/u_framebuffer.h"
 #include "main/framebuffer.h"
 
 
@@ -104,10 +105,10 @@
 /**
  * Update framebuffer state (color, depth, stencil, etc. buffers)
  */
-static void
-update_framebuffer_state( struct st_context *st )
+void
+st_update_framebuffer_state( struct st_context *st )
 {
-   struct pipe_framebuffer_state *framebuffer = &st->state.framebuffer;
+   struct pipe_framebuffer_state framebuffer;
    struct gl_framebuffer *fb = st->ctx->DrawBuffer;
    struct st_renderbuffer *strb;
    GLuint i;
@@ -128,19 +129,18 @@
    fb->DefaultGeometry._NumSamples =
       framebuffer_quantize_num_samples(st, fb->DefaultGeometry.NumSamples);
 
-   framebuffer->width  = _mesa_geometric_width(fb);
-   framebuffer->height = _mesa_geometric_height(fb);
-   framebuffer->samples = _mesa_geometric_samples(fb);
-   framebuffer->layers = _mesa_geometric_layers(fb);
+   framebuffer.width  = _mesa_geometric_width(fb);
+   framebuffer.height = _mesa_geometric_height(fb);
+   framebuffer.samples = _mesa_geometric_samples(fb);
+   framebuffer.layers = _mesa_geometric_layers(fb);
 
    /* Examine Mesa's ctx->DrawBuffer->_ColorDrawBuffers state
     * to determine which surfaces to draw to
     */
-   framebuffer->nr_cbufs = fb->_NumColorDrawBuffers;
+   framebuffer.nr_cbufs = fb->_NumColorDrawBuffers;
 
    for (i = 0; i < fb->_NumColorDrawBuffers; i++) {
-      pipe_surface_reference(&framebuffer->cbufs[i], NULL);
-
+      framebuffer.cbufs[i] = NULL;
       strb = st_renderbuffer(fb->_ColorDrawBuffers[i]);
 
       if (strb) {
@@ -151,21 +151,21 @@
          }
 
          if (strb->surface) {
-            pipe_surface_reference(&framebuffer->cbufs[i], strb->surface);
-            update_framebuffer_size(framebuffer, strb->surface);
+            framebuffer.cbufs[i] = strb->surface;
+            update_framebuffer_size(&framebuffer, strb->surface);
          }
          strb->defined = GL_TRUE; /* we'll be drawing something */
       }
    }
 
-   for (i = framebuffer->nr_cbufs; i < PIPE_MAX_COLOR_BUFS; i++) {
-      pipe_surface_reference(&framebuffer->cbufs[i], NULL);
+   for (i = framebuffer.nr_cbufs; i < PIPE_MAX_COLOR_BUFS; i++) {
+      framebuffer.cbufs[i] = NULL;
    }
 
    /* Remove trailing GL_NONE draw buffers. */
-   while (framebuffer->nr_cbufs &&
-          !framebuffer->cbufs[framebuffer->nr_cbufs-1]) {
-      framebuffer->nr_cbufs--;
+   while (framebuffer.nr_cbufs &&
+          !framebuffer.cbufs[framebuffer.nr_cbufs-1]) {
+      framebuffer.nr_cbufs--;
    }
 
    /*
@@ -177,8 +177,8 @@
          /* rendering to a GL texture, may have to update surface */
          st_update_renderbuffer_surface(st, strb);
       }
-      pipe_surface_reference(&framebuffer->zsbuf, strb->surface);
-      update_framebuffer_size(framebuffer, strb->surface);
+      framebuffer.zsbuf = strb->surface;
+      update_framebuffer_size(&framebuffer, strb->surface);
    }
    else {
       strb = st_renderbuffer(fb->Attachment[BUFFER_STENCIL].Renderbuffer);
@@ -187,34 +187,33 @@
             /* rendering to a GL texture, may have to update surface */
             st_update_renderbuffer_surface(st, strb);
          }
-         pipe_surface_reference(&framebuffer->zsbuf, strb->surface);
-         update_framebuffer_size(framebuffer, strb->surface);
+         framebuffer.zsbuf = strb->surface;
+         update_framebuffer_size(&framebuffer, strb->surface);
       }
       else
-         pipe_surface_reference(&framebuffer->zsbuf, NULL);
+         framebuffer.zsbuf = NULL;
    }
 
 #ifdef DEBUG
    /* Make sure the resource binding flags were set properly */
-   for (i = 0; i < framebuffer->nr_cbufs; i++) {
-      assert(!framebuffer->cbufs[i] ||
-             framebuffer->cbufs[i]->texture->bind & PIPE_BIND_RENDER_TARGET);
+   for (i = 0; i < framebuffer.nr_cbufs; i++) {
+      assert(!framebuffer.cbufs[i] ||
+             framebuffer.cbufs[i]->texture->bind & PIPE_BIND_RENDER_TARGET);
    }
-   if (framebuffer->zsbuf) {
-      assert(framebuffer->zsbuf->texture->bind & PIPE_BIND_DEPTH_STENCIL);
+   if (framebuffer.zsbuf) {
+      assert(framebuffer.zsbuf->texture->bind & PIPE_BIND_DEPTH_STENCIL);
    }
 #endif
 
-   if (framebuffer->width == USHRT_MAX)
-      framebuffer->width = 0;
-   if (framebuffer->height == USHRT_MAX)
-      framebuffer->height = 0;
+   if (framebuffer.width == USHRT_MAX)
+      framebuffer.width = 0;
+   if (framebuffer.height == USHRT_MAX)
+      framebuffer.height = 0;
 
-   cso_set_framebuffer(st->cso_context, framebuffer);
+   cso_set_framebuffer(st->cso_context, &framebuffer);
+
+   st->state.fb_width = framebuffer.width;
+   st->state.fb_height = framebuffer.height;
+   st->state.fb_num_samples = util_framebuffer_get_num_samples(&framebuffer);
+   st->state.fb_num_layers = util_framebuffer_get_num_layers(&framebuffer);
 }
-
-
-const struct st_tracked_state st_update_framebuffer = {
-   update_framebuffer_state				/* update */
-};
-
diff --git a/src/mesa/state_tracker/st_atom_image.c b/src/mesa/state_tracker/st_atom_image.c
index 077bafd..1c49801 100644
--- a/src/mesa/state_tracker/st_atom_image.c
+++ b/src/mesa/state_tracker/st_atom_image.c
@@ -36,6 +36,7 @@
 #include "util/u_surface.h"
 #include "cso_cache/cso_context.h"
 
+#include "st_cb_bufferobjects.h"
 #include "st_cb_texture.h"
 #include "st_debug.h"
 #include "st_texture.h"
@@ -53,7 +54,6 @@
 {
    struct st_texture_object *stObj = st_texture_object(u->TexObj);
 
-   img->resource = stObj->pt;
    img->format = st_mesa_format_to_pipe_format(st, u->_ActualFormat);
 
    switch (u->Access) {
@@ -70,16 +70,32 @@
       unreachable("bad gl_image_unit::Access");
    }
 
-   if (stObj->pt->target == PIPE_BUFFER) {
+   if (stObj->base.Target == GL_TEXTURE_BUFFER) {
+      struct st_buffer_object *stbuf =
+         st_buffer_object(stObj->base.BufferObject);
       unsigned base, size;
 
-      base = stObj->base.BufferOffset;
-      assert(base < stObj->pt->width0);
-      size = MIN2(stObj->pt->width0 - base, (unsigned)stObj->base.BufferSize);
+      if (!stbuf || !stbuf->buffer) {
+         memset(img, 0, sizeof(*img));
+         return;
+      }
+      struct pipe_resource *buf = stbuf->buffer;
 
+      base = stObj->base.BufferOffset;
+      assert(base < buf->width0);
+      size = MIN2(buf->width0 - base, (unsigned)stObj->base.BufferSize);
+
+      img->resource = stbuf->buffer;
       img->u.buf.offset = base;
       img->u.buf.size = size;
    } else {
+      if (!st_finalize_texture(st->ctx, st->pipe, u->TexObj, 0) ||
+          !stObj->pt) {
+         memset(img, 0, sizeof(*img));
+         return;
+      }
+
+      img->resource = stObj->pt;
       img->u.tex.level = u->Level + stObj->base.MinLevel;
       if (stObj->pt->target == PIPE_TEXTURE_3D) {
          if (u->Layered) {
@@ -102,6 +118,24 @@
    }
 }
 
+/**
+ * Get a pipe_image_view object from an image unit.
+ */
+void
+st_convert_image_from_unit(const struct st_context *st,
+                           struct pipe_image_view *img,
+                           GLuint imgUnit)
+{
+   struct gl_image_unit *u = &st->ctx->ImageUnits[imgUnit];
+
+   if (!_mesa_is_image_unit_valid(st->ctx, u)) {
+      memset(img, 0, sizeof(*img));
+      return;
+   }
+
+   st_convert_image(st, u, img);
+}
+
 static void
 st_bind_images(struct st_context *st, struct gl_program *prog,
                enum pipe_shader_type shader_type)
@@ -116,19 +150,9 @@
    c = &st->ctx->Const.Program[prog->info.stage];
 
    for (i = 0; i < prog->info.num_images; i++) {
-      struct gl_image_unit *u =
-         &st->ctx->ImageUnits[prog->sh.ImageUnits[i]];
-      struct st_texture_object *stObj = st_texture_object(u->TexObj);
       struct pipe_image_view *img = &images[i];
 
-      if (!_mesa_is_image_unit_valid(st->ctx, u) ||
-          !st_finalize_texture(st->ctx, st->pipe, u->TexObj, 0) ||
-          !stObj->pt) {
-         memset(img, 0, sizeof(*img));
-         continue;
-      }
-
-      st_convert_image(st, u, img);
+      st_convert_image_from_unit(st, img, prog->sh.ImageUnits[i]);
    }
    cso_set_shader_images(st->cso_context, shader_type, 0,
                          prog->info.num_images, images);
@@ -139,7 +163,7 @@
             c->MaxImageUniforms - prog->info.num_images, NULL);
 }
 
-static void bind_vs_images(struct st_context *st)
+void st_bind_vs_images(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
@@ -147,11 +171,7 @@
    st_bind_images(st, prog, PIPE_SHADER_VERTEX);
 }
 
-const struct st_tracked_state st_bind_vs_images = {
-   bind_vs_images
-};
-
-static void bind_fs_images(struct st_context *st)
+void st_bind_fs_images(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT];
@@ -159,11 +179,7 @@
    st_bind_images(st, prog, PIPE_SHADER_FRAGMENT);
 }
 
-const struct st_tracked_state st_bind_fs_images = {
-   bind_fs_images
-};
-
-static void bind_gs_images(struct st_context *st)
+void st_bind_gs_images(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
@@ -171,11 +187,7 @@
    st_bind_images(st, prog, PIPE_SHADER_GEOMETRY);
 }
 
-const struct st_tracked_state st_bind_gs_images = {
-   bind_gs_images
-};
-
-static void bind_tcs_images(struct st_context *st)
+void st_bind_tcs_images(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
@@ -183,11 +195,7 @@
    st_bind_images(st, prog, PIPE_SHADER_TESS_CTRL);
 }
 
-const struct st_tracked_state st_bind_tcs_images = {
-   bind_tcs_images
-};
-
-static void bind_tes_images(struct st_context *st)
+void st_bind_tes_images(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
@@ -195,18 +203,10 @@
    st_bind_images(st, prog, PIPE_SHADER_TESS_EVAL);
 }
 
-const struct st_tracked_state st_bind_tes_images = {
-   bind_tes_images
-};
-
-static void bind_cs_images(struct st_context *st)
+void st_bind_cs_images(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
 
    st_bind_images(st, prog, PIPE_SHADER_COMPUTE);
 }
-
-const struct st_tracked_state st_bind_cs_images = {
-   bind_cs_images
-};
diff --git a/src/mesa/state_tracker/st_atom_list.h b/src/mesa/state_tracker/st_atom_list.h
index 4212dac..b76854e 100644
--- a/src/mesa/state_tracker/st_atom_list.h
+++ b/src/mesa/state_tracker/st_atom_list.h
@@ -8,25 +8,23 @@
 ST_STATE(ST_NEW_TCS_STATE, st_update_tcp)
 ST_STATE(ST_NEW_VS_STATE, st_update_vp)
 
-ST_STATE(ST_NEW_RASTERIZER, st_update_rasterizer)
 ST_STATE(ST_NEW_POLY_STIPPLE, st_update_polygon_stipple)
-ST_STATE(ST_NEW_VIEWPORT, st_update_viewport)
-ST_STATE(ST_NEW_SCISSOR, st_update_scissor)
 ST_STATE(ST_NEW_WINDOW_RECTANGLES, st_update_window_rectangles)
 ST_STATE(ST_NEW_BLEND, st_update_blend)
+ST_STATE(ST_NEW_BLEND_COLOR, st_update_blend_color)
 
-ST_STATE(ST_NEW_VS_SAMPLER_VIEWS, st_update_vertex_texture)
-ST_STATE(ST_NEW_FS_SAMPLER_VIEWS, st_update_fragment_texture)
-ST_STATE(ST_NEW_GS_SAMPLER_VIEWS, st_update_geometry_texture)
-ST_STATE(ST_NEW_TCS_SAMPLER_VIEWS, st_update_tessctrl_texture)
-ST_STATE(ST_NEW_TES_SAMPLER_VIEWS, st_update_tesseval_texture)
+ST_STATE(ST_NEW_VS_SAMPLER_VIEWS, st_update_vertex_textures)
+ST_STATE(ST_NEW_FS_SAMPLER_VIEWS, st_update_fragment_textures)
+ST_STATE(ST_NEW_GS_SAMPLER_VIEWS, st_update_geometry_textures)
+ST_STATE(ST_NEW_TCS_SAMPLER_VIEWS, st_update_tessctrl_textures)
+ST_STATE(ST_NEW_TES_SAMPLER_VIEWS, st_update_tesseval_textures)
 
 /* Non-compute samplers. */
-ST_STATE(ST_NEW_VS_SAMPLERS, st_update_vertex_sampler) /* depends on update_*_texture for swizzle */
-ST_STATE(ST_NEW_TCS_SAMPLERS, st_update_tessctrl_sampler) /* depends on update_*_texture for swizzle */
-ST_STATE(ST_NEW_TES_SAMPLERS, st_update_tesseval_sampler) /* depends on update_*_texture for swizzle */
-ST_STATE(ST_NEW_GS_SAMPLERS, st_update_geometry_sampler) /* depends on update_*_texture for swizzle */
-ST_STATE(ST_NEW_FS_SAMPLERS, st_update_fragment_sampler) /* depends on update_*_texture for swizzle */
+ST_STATE(ST_NEW_VS_SAMPLERS, st_update_vertex_samplers) /* depends on update_*_textures for swizzle */
+ST_STATE(ST_NEW_TCS_SAMPLERS, st_update_tessctrl_samplers) /* depends on update_*_textures for swizzle */
+ST_STATE(ST_NEW_TES_SAMPLERS, st_update_tesseval_samplers) /* depends on update_*_textures for swizzle */
+ST_STATE(ST_NEW_GS_SAMPLERS, st_update_geometry_samplers) /* depends on update_*_textures for swizzle */
+ST_STATE(ST_NEW_FS_SAMPLERS, st_update_fragment_samplers) /* depends on update_*_textures for swizzle */
 
 ST_STATE(ST_NEW_VS_IMAGES, st_bind_vs_images)
 ST_STATE(ST_NEW_TCS_IMAGES, st_bind_tcs_images)
@@ -34,9 +32,12 @@
 ST_STATE(ST_NEW_GS_IMAGES, st_bind_gs_images)
 ST_STATE(ST_NEW_FS_IMAGES, st_bind_fs_images)
 
-ST_STATE(ST_NEW_FB_STATE, st_update_framebuffer) /* depends on update_*_texture and bind_*_images */
-ST_STATE(ST_NEW_SAMPLE_MASK, st_update_msaa)
+ST_STATE(ST_NEW_FB_STATE, st_update_framebuffer_state) /* depends on update_*_textures and bind_*_images */
+ST_STATE(ST_NEW_RASTERIZER, st_update_rasterizer) /* depends on update_framebuffer_state */
+ST_STATE(ST_NEW_SAMPLE_MASK, st_update_sample_mask) /* depends on update_framebuffer_state */
 ST_STATE(ST_NEW_SAMPLE_SHADING, st_update_sample_shading)
+ST_STATE(ST_NEW_SCISSOR, st_update_scissor) /* depends on update_framebuffer_state */
+ST_STATE(ST_NEW_VIEWPORT, st_update_viewport) /* depends on update_framebuffer_state */
 
 ST_STATE(ST_NEW_VS_CONSTANTS, st_update_vs_constants)
 ST_STATE(ST_NEW_TCS_CONSTANTS, st_update_tcs_constants)
@@ -70,8 +71,8 @@
 
 /* Compute states must be last. */
 ST_STATE(ST_NEW_CS_STATE, st_update_cp)
-ST_STATE(ST_NEW_CS_SAMPLER_VIEWS, st_update_compute_texture)
-ST_STATE(ST_NEW_CS_SAMPLERS, st_update_compute_sampler) /* depends on update_compute_texture for swizzle */
+ST_STATE(ST_NEW_CS_SAMPLER_VIEWS, st_update_compute_textures)
+ST_STATE(ST_NEW_CS_SAMPLERS, st_update_compute_samplers) /* depends on update_compute_textures for swizzle */
 ST_STATE(ST_NEW_CS_CONSTANTS, st_update_cs_constants)
 ST_STATE(ST_NEW_CS_UBOS, st_bind_cs_ubos)
 ST_STATE(ST_NEW_CS_ATOMICS, st_bind_cs_atomics)
diff --git a/src/mesa/state_tracker/st_atom_msaa.c b/src/mesa/state_tracker/st_atom_msaa.c
index 69aea69..814077f 100644
--- a/src/mesa/state_tracker/st_atom_msaa.c
+++ b/src/mesa/state_tracker/st_atom_msaa.c
@@ -33,19 +33,17 @@
 #include "st_program.h"
 
 #include "cso_cache/cso_context.h"
-#include "util/u_framebuffer.h"
+#include "main/framebuffer.h"
 
 
 /* Update the sample mask for MSAA.
  */
-static void update_sample_mask( struct st_context *st )
+void st_update_sample_mask( struct st_context *st )
 {
    unsigned sample_mask = 0xffffffff;
-   struct pipe_framebuffer_state *framebuffer = &st->state.framebuffer;
-   /* dependency here on bound surface (or rather, sample count) is worrying */
-   unsigned sample_count = util_framebuffer_get_num_samples(framebuffer);
+   unsigned sample_count = st->state.fb_num_samples;
 
-   if (st->ctx->Multisample.Enabled && sample_count > 1) {
+   if (_mesa_is_multisample_enabled(st->ctx) && sample_count > 1) {
       /* unlike in gallium/d3d10 the mask is only active if msaa is enabled */
       if (st->ctx->Multisample.SampleCoverage) {
          unsigned nr_bits;
@@ -64,15 +62,10 @@
          sample_mask &= st->ctx->Multisample.SampleMaskValue;
    }
 
-   /* mask off unused bits or don't care? */
-
-   if (sample_mask != st->state.sample_mask) {
-      st->state.sample_mask = sample_mask;
-      cso_set_sample_mask(st->cso_context, sample_mask);
-   }
+   cso_set_sample_mask(st->cso_context, sample_mask);
 }
 
-static void update_sample_shading( struct st_context *st )
+void st_update_sample_shading( struct st_context *st )
 {
    if (!st->fp)
       return;
@@ -84,11 +77,3 @@
 	 st->cso_context,
          _mesa_get_min_invocations_per_fragment(st->ctx, &st->fp->Base, false));
 }
-
-const struct st_tracked_state st_update_msaa = {
-   update_sample_mask					/* update */
-};
-
-const struct st_tracked_state st_update_sample_shading = {
-   update_sample_shading				/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_pixeltransfer.c b/src/mesa/state_tracker/st_atom_pixeltransfer.c
index a2951a1..9b99036 100644
--- a/src/mesa/state_tracker/st_atom_pixeltransfer.c
+++ b/src/mesa/state_tracker/st_atom_pixeltransfer.c
@@ -84,8 +84,8 @@
 /**
  * Upload the pixel transfer color map texture.
  */
-static void
-update_pixel_transfer(struct st_context *st)
+void
+st_update_pixel_transfer(struct st_context *st)
 {
    struct gl_context *ctx = st->ctx;
 
@@ -100,8 +100,3 @@
       load_color_map_texture(ctx, st->pixel_xfer.pixelmap_texture);
    }
 }
-
-
-const struct st_tracked_state st_update_pixel_transfer = {
-   update_pixel_transfer				/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index 2974fe5..39be6b1 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -32,6 +32,7 @@
  
 #include "main/macros.h"
 #include "main/framebuffer.h"
+#include "main/state.h"
 #include "st_context.h"
 #include "st_atom.h"
 #include "st_debug.h"
@@ -60,7 +61,7 @@
 
 
 
-static void update_raster_state( struct st_context *st )
+void st_update_rasterizer( struct st_context *st )
 {
    struct gl_context *ctx = st->ctx;
    struct pipe_rasterizer_state *raster = &st->state.rasterizer;
@@ -85,7 +86,7 @@
        * must match OpenGL conventions so FBOs use Y=0=BOTTOM.  In that
        * case, we must invert Y and flip the notion of front vs. back.
        */
-      if (st_fb_orientation(ctx->DrawBuffer) == Y_0_BOTTOM) {
+      if (st->state.fb_orientation == Y_0_BOTTOM) {
          /* Drawing to an FBO.  The viewport will be inverted. */
          raster->front_ccw ^= 1;
       }
@@ -99,7 +100,7 @@
                              GL_FIRST_VERTEX_CONVENTION_EXT;
 
    /* _NEW_LIGHT | _NEW_PROGRAM */
-   raster->light_twoside = ctx->VertexProgram._TwoSideEnabled;
+   raster->light_twoside = _mesa_vertex_program_two_side_enabled(ctx);
 
    /*_NEW_LIGHT | _NEW_BUFFERS */
    raster->clamp_vertex_color = !st->clamp_vert_color_in_shader &&
@@ -173,7 +174,7 @@
    if (ctx->Point.PointSprite) {
       /* origin */
       if ((ctx->Point.SpriteOrigin == GL_UPPER_LEFT) ^
-          (st_fb_orientation(ctx->DrawBuffer) == Y_0_BOTTOM))
+          (st->state.fb_orientation == Y_0_BOTTOM))
          raster->sprite_coord_mode = PIPE_SPRITE_COORD_UPPER_LEFT;
       else 
          raster->sprite_coord_mode = PIPE_SPRITE_COORD_LOWER_LEFT;
@@ -268,7 +269,7 @@
                                   ctx->Color._ClampFragmentColor;
 
    raster->half_pixel_center = 1;
-   if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP)
+   if (st->state.fb_orientation == Y_0_TOP)
       raster->bottom_edge_rule = 1;
    /* _NEW_TRANSFORM */
    if (ctx->Transform.ClipOrigin == GL_UPPER_LEFT)
@@ -292,7 +293,3 @@
 
    cso_set_rasterizer(st->cso_context, raster);
 }
-
-const struct st_tracked_state st_update_rasterizer = {
-   update_raster_state     /* update function */
-};
diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index 820a57d..208b6f7 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -58,90 +58,55 @@
 static GLuint
 gl_wrap_xlate(GLenum wrap)
 {
-   switch (wrap) {
-   case GL_REPEAT:
-      return PIPE_TEX_WRAP_REPEAT;
-   case GL_CLAMP:
-      return PIPE_TEX_WRAP_CLAMP;
-   case GL_CLAMP_TO_EDGE:
-      return PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-   case GL_CLAMP_TO_BORDER:
-      return PIPE_TEX_WRAP_CLAMP_TO_BORDER;
-   case GL_MIRRORED_REPEAT:
-      return PIPE_TEX_WRAP_MIRROR_REPEAT;
-   case GL_MIRROR_CLAMP_EXT:
-      return PIPE_TEX_WRAP_MIRROR_CLAMP;
-   case GL_MIRROR_CLAMP_TO_EDGE_EXT:
-      return PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE;
-   case GL_MIRROR_CLAMP_TO_BORDER_EXT:
-      return PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER;
-   default:
-      assert(0);
-      return 0;
-   }
+   /* Take advantage of how the enums are defined. */
+   static const unsigned table[32] = {
+      [GL_REPEAT & 0x1f] = PIPE_TEX_WRAP_REPEAT,
+      [GL_CLAMP & 0x1f] = PIPE_TEX_WRAP_CLAMP,
+      [GL_CLAMP_TO_EDGE & 0x1f] = PIPE_TEX_WRAP_CLAMP_TO_EDGE,
+      [GL_CLAMP_TO_BORDER & 0x1f] = PIPE_TEX_WRAP_CLAMP_TO_BORDER,
+      [GL_MIRRORED_REPEAT & 0x1f] = PIPE_TEX_WRAP_MIRROR_REPEAT,
+      [GL_MIRROR_CLAMP_EXT & 0x1f] = PIPE_TEX_WRAP_MIRROR_CLAMP,
+      [GL_MIRROR_CLAMP_TO_EDGE & 0x1f] = PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE,
+      [GL_MIRROR_CLAMP_TO_BORDER_EXT & 0x1f] = PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER,
+   };
+
+   return table[wrap & 0x1f];
 }
 
 
 static GLuint
 gl_filter_to_mip_filter(GLenum filter)
 {
-   switch (filter) {
-   case GL_NEAREST:
-   case GL_LINEAR:
+   /* Take advantage of how the enums are defined. */
+   if (filter <= GL_LINEAR)
       return PIPE_TEX_MIPFILTER_NONE;
-
-   case GL_NEAREST_MIPMAP_NEAREST:
-   case GL_LINEAR_MIPMAP_NEAREST:
+   if (filter <= GL_LINEAR_MIPMAP_NEAREST)
       return PIPE_TEX_MIPFILTER_NEAREST;
 
-   case GL_NEAREST_MIPMAP_LINEAR:
-   case GL_LINEAR_MIPMAP_LINEAR:
-      return PIPE_TEX_MIPFILTER_LINEAR;
-
-   default:
-      assert(0);
-      return PIPE_TEX_MIPFILTER_NONE;
-   }
+   return PIPE_TEX_MIPFILTER_LINEAR;
 }
 
 
 static GLuint
 gl_filter_to_img_filter(GLenum filter)
 {
-   switch (filter) {
-   case GL_NEAREST:
-   case GL_NEAREST_MIPMAP_NEAREST:
-   case GL_NEAREST_MIPMAP_LINEAR:
-      return PIPE_TEX_FILTER_NEAREST;
-
-   case GL_LINEAR:
-   case GL_LINEAR_MIPMAP_NEAREST:
-   case GL_LINEAR_MIPMAP_LINEAR:
+   /* Take advantage of how the enums are defined. */
+   if (filter & 1)
       return PIPE_TEX_FILTER_LINEAR;
 
-   default:
-      assert(0);
-      return PIPE_TEX_FILTER_NEAREST;
-   }
+   return PIPE_TEX_FILTER_NEAREST;
 }
 
 
-static void
-convert_sampler(struct st_context *st,
-                struct pipe_sampler_state *sampler,
-                GLuint texUnit)
+/**
+ * Convert a gl_sampler_object to a pipe_sampler_state object.
+ */
+void
+st_convert_sampler(const struct st_context *st,
+                   const struct gl_texture_object *texobj,
+                   const struct gl_sampler_object *msamp,
+                   struct pipe_sampler_state *sampler)
 {
-   const struct gl_texture_object *texobj;
-   struct gl_context *ctx = st->ctx;
-   const struct gl_sampler_object *msamp;
-   GLenum texBaseFormat;
-
-   texobj = ctx->Texture.Unit[texUnit]._Current;
-   assert(texobj);
-
-   msamp = _mesa_get_samplerobj(ctx, texUnit);
-   texBaseFormat = _mesa_texture_base_format(texobj);
-
    memset(sampler, 0, sizeof(*sampler));
    sampler->wrap_s = gl_wrap_xlate(msamp->WrapS);
    sampler->wrap_t = gl_wrap_xlate(msamp->WrapT);
@@ -154,7 +119,7 @@
    if (texobj->Target != GL_TEXTURE_RECTANGLE_ARB)
       sampler->normalized_coords = 1;
 
-   sampler->lod_bias = ctx->Texture.Unit[texUnit].LodBias + msamp->LodBias;
+   sampler->lod_bias = msamp->LodBias;
    /* Reduce the number of states by allowing only the values that AMD GCN
     * can represent. Apps use lod_bias for smooth transitions to bigger mipmap
     * levels.
@@ -174,42 +139,62 @@
       assert(sampler->min_lod <= sampler->max_lod);
    }
 
+   /* Check that exactly the wrap modes using the border color have bit 0
+    * set, so the test below can OR the wrap modes and mask with 0x1.
+    */
+   STATIC_ASSERT(PIPE_TEX_WRAP_CLAMP & 0x1);
+   STATIC_ASSERT(PIPE_TEX_WRAP_CLAMP_TO_BORDER & 0x1);
+   STATIC_ASSERT(PIPE_TEX_WRAP_MIRROR_CLAMP & 0x1);
+   STATIC_ASSERT(PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER & 0x1);
+   STATIC_ASSERT(((PIPE_TEX_WRAP_REPEAT |
+                   PIPE_TEX_WRAP_CLAMP_TO_EDGE |
+                   PIPE_TEX_WRAP_MIRROR_REPEAT |
+                   PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE) & 0x1) == 0);
+
    /* For non-black borders... */
-   if (msamp->BorderColor.ui[0] ||
-       msamp->BorderColor.ui[1] ||
-       msamp->BorderColor.ui[2] ||
-       msamp->BorderColor.ui[3]) {
-      const struct st_texture_object *stobj = st_texture_object_const(texobj);
+   if (/* This is true if wrap modes are using the border color: */
+       (sampler->wrap_s | sampler->wrap_t | sampler->wrap_r) & 0x1 &&
+       (msamp->BorderColor.ui[0] ||
+        msamp->BorderColor.ui[1] ||
+        msamp->BorderColor.ui[2] ||
+        msamp->BorderColor.ui[3])) {
       const GLboolean is_integer = texobj->_IsIntegerFormat;
-      const struct pipe_sampler_view *sv = NULL;
-      union pipe_color_union border_color;
-      GLuint i;
+      GLenum texBaseFormat = _mesa_base_tex_image(texobj)->_BaseFormat;
 
-      /* Just search for the first used view. We can do this because the
-         swizzle is per-texture, not per context. */
-      /* XXX: clean that up to not use the sampler view at all */
-      for (i = 0; i < stobj->num_sampler_views; ++i) {
-         if (stobj->sampler_views[i]) {
-            sv = stobj->sampler_views[i];
-            break;
+      if (st->apply_texture_swizzle_to_border_color) {
+         const struct st_texture_object *stobj = st_texture_object_const(texobj);
+         const struct pipe_sampler_view *sv = NULL;
+
+         /* Just search for the first used view. We can do this because the
+            swizzle is per-texture, not per context. */
+         /* XXX: clean that up to not use the sampler view at all */
+         for (unsigned i = 0; i < stobj->num_sampler_views; ++i) {
+            if (stobj->sampler_views[i]) {
+               sv = stobj->sampler_views[i];
+               break;
+            }
          }
-      }
 
-      if (st->apply_texture_swizzle_to_border_color && sv) {
-         const unsigned char swz[4] =
-         {
-            sv->swizzle_r,
-            sv->swizzle_g,
-            sv->swizzle_b,
-            sv->swizzle_a,
-         };
+         if (sv) {
+            union pipe_color_union tmp;
+            const unsigned char swz[4] =
+            {
+               sv->swizzle_r,
+               sv->swizzle_g,
+               sv->swizzle_b,
+               sv->swizzle_a,
+            };
 
-         st_translate_color(&msamp->BorderColor,
-                            &border_color,
-                            texBaseFormat, is_integer);
+            st_translate_color(&msamp->BorderColor, &tmp,
+                               texBaseFormat, is_integer);
 
-         util_format_apply_color_swizzle(&sampler->border_color,
-                                         &border_color, swz, is_integer);
+            util_format_apply_color_swizzle(&sampler->border_color,
+                                            &tmp, swz, is_integer);
+         } else {
+            st_translate_color(&msamp->BorderColor,
+                               &sampler->border_color,
+                               texBaseFormat, is_integer);
+         }
       } else {
          st_translate_color(&msamp->BorderColor,
                             &sampler->border_color,
@@ -221,15 +206,45 @@
                               0 : (GLuint) msamp->MaxAnisotropy);
 
    /* If sampling a depth texture and using shadow comparison */
-   if ((texBaseFormat == GL_DEPTH_COMPONENT ||
-        (texBaseFormat == GL_DEPTH_STENCIL && !texobj->StencilSampling)) &&
-       msamp->CompareMode == GL_COMPARE_R_TO_TEXTURE) {
-      sampler->compare_mode = PIPE_TEX_COMPARE_R_TO_TEXTURE;
-      sampler->compare_func = st_compare_func_to_pipe(msamp->CompareFunc);
+   if (msamp->CompareMode == GL_COMPARE_R_TO_TEXTURE) {
+      GLenum texBaseFormat = _mesa_base_tex_image(texobj)->_BaseFormat;
+
+      if (texBaseFormat == GL_DEPTH_COMPONENT ||
+          (texBaseFormat == GL_DEPTH_STENCIL && !texobj->StencilSampling)) {
+         sampler->compare_mode = PIPE_TEX_COMPARE_R_TO_TEXTURE;
+         sampler->compare_func = st_compare_func_to_pipe(msamp->CompareFunc);
+      }
    }
 
-   sampler->seamless_cube_map =
-      ctx->Texture.CubeMapSeamless || msamp->CubeMapSeamless;
+   /* Only set the seamless cube map texture parameter because the per-context
+    * enable should be ignored and treated as disabled when using texture
+    * handles, as specified by ARB_bindless_texture.
+    */
+   sampler->seamless_cube_map = msamp->CubeMapSeamless;
+}
+
+/**
+ * Get a pipe_sampler_state object from a texture unit.
+ */
+void
+st_convert_sampler_from_unit(const struct st_context *st,
+                             struct pipe_sampler_state *sampler,
+                             GLuint texUnit)
+{
+   const struct gl_texture_object *texobj;
+   struct gl_context *ctx = st->ctx;
+   const struct gl_sampler_object *msamp;
+
+   texobj = ctx->Texture.Unit[texUnit]._Current;
+   assert(texobj);
+   assert(texobj->Target != GL_TEXTURE_BUFFER);
+
+   msamp = _mesa_get_samplerobj(ctx, texUnit);
+
+   st_convert_sampler(st, texobj, msamp, sampler);
+
+   sampler->lod_bias += ctx->Texture.Unit[texUnit].LodBias;
+   sampler->seamless_cube_map |= ctx->Texture.CubeMapSeamless;
 }
 
 
@@ -241,40 +256,38 @@
 update_shader_samplers(struct st_context *st,
                        enum pipe_shader_type shader_stage,
                        const struct gl_program *prog,
-                       unsigned max_units,
                        struct pipe_sampler_state *samplers,
-                       unsigned *num_samplers)
+                       unsigned *out_num_samplers)
 {
+   struct gl_context *ctx = st->ctx;
    GLbitfield samplers_used = prog->SamplersUsed;
    GLbitfield free_slots = ~prog->SamplersUsed;
    GLbitfield external_samplers_used = prog->ExternalSamplersUsed;
-   GLuint unit;
-   const GLuint old_max = *num_samplers;
+   unsigned unit, num_samplers;
    const struct pipe_sampler_state *states[PIPE_MAX_SAMPLERS];
 
-   if (*num_samplers == 0 && samplers_used == 0x0)
+   if (samplers_used == 0x0) {
+      *out_num_samplers = 0;
       return;
+   }
 
-   *num_samplers = 0;
+   num_samplers = util_last_bit(samplers_used);
 
    /* loop over sampler units (aka tex image units) */
-   for (unit = 0; unit < max_units; unit++, samplers_used >>= 1) {
+   for (unit = 0; samplers_used; unit++, samplers_used >>= 1) {
       struct pipe_sampler_state *sampler = samplers + unit;
+      unsigned tex_unit = prog->SamplerUnits[unit];
 
-      if (samplers_used & 1) {
-         const GLuint texUnit = prog->SamplerUnits[unit];
-
-         convert_sampler(st, sampler, texUnit);
+      /* Don't update the sampler for TBOs. cso_context will not bind sampler
+       * states that are NULL.
+       */
+      if (samplers_used & 1 &&
+          ctx->Texture.Unit[tex_unit]._Current->Target != GL_TEXTURE_BUFFER) {
+         st_convert_sampler_from_unit(st, sampler, tex_unit);
          states[unit] = sampler;
-         *num_samplers = unit + 1;
-      }
-      else if (samplers_used != 0 || unit < old_max) {
+      } else {
          states[unit] = NULL;
       }
-      else {
-         /* if we've reset all the old samplers and we have no more new ones */
-         break;
-      }
    }
 
    /* For any external samplers with multiplaner YUV, stuff the additional
@@ -309,29 +322,29 @@
          break;
       }
 
-      *num_samplers = MAX2(*num_samplers, extra + 1);
+      num_samplers = MAX2(num_samplers, extra + 1);
    }
 
-   cso_set_samplers(st->cso_context, shader_stage, *num_samplers, states);
+   cso_set_samplers(st->cso_context, shader_stage, num_samplers, states);
+   *out_num_samplers = num_samplers;
 }
 
 
-static void
-update_vertex_samplers(struct st_context *st)
+void
+st_update_vertex_samplers(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
    update_shader_samplers(st,
                           PIPE_SHADER_VERTEX,
                           ctx->VertexProgram._Current,
-                          ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits,
                           st->state.samplers[PIPE_SHADER_VERTEX],
                           &st->state.num_samplers[PIPE_SHADER_VERTEX]);
 }
 
 
-static void
-update_tessctrl_samplers(struct st_context *st)
+void
+st_update_tessctrl_samplers(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
@@ -339,15 +352,14 @@
       update_shader_samplers(st,
                              PIPE_SHADER_TESS_CTRL,
                              ctx->TessCtrlProgram._Current,
-                             ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits,
                              st->state.samplers[PIPE_SHADER_TESS_CTRL],
                              &st->state.num_samplers[PIPE_SHADER_TESS_CTRL]);
    }
 }
 
 
-static void
-update_tesseval_samplers(struct st_context *st)
+void
+st_update_tesseval_samplers(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
@@ -355,15 +367,14 @@
       update_shader_samplers(st,
                              PIPE_SHADER_TESS_EVAL,
                              ctx->TessEvalProgram._Current,
-                             ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits,
                              st->state.samplers[PIPE_SHADER_TESS_EVAL],
                              &st->state.num_samplers[PIPE_SHADER_TESS_EVAL]);
    }
 }
 
 
-static void
-update_geometry_samplers(struct st_context *st)
+void
+st_update_geometry_samplers(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
@@ -371,29 +382,27 @@
       update_shader_samplers(st,
                              PIPE_SHADER_GEOMETRY,
                              ctx->GeometryProgram._Current,
-                             ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits,
                              st->state.samplers[PIPE_SHADER_GEOMETRY],
                              &st->state.num_samplers[PIPE_SHADER_GEOMETRY]);
    }
 }
 
 
-static void
-update_fragment_samplers(struct st_context *st)
+void
+st_update_fragment_samplers(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
    update_shader_samplers(st,
                           PIPE_SHADER_FRAGMENT,
                           ctx->FragmentProgram._Current,
-                          ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits,
                           st->state.samplers[PIPE_SHADER_FRAGMENT],
                           &st->state.num_samplers[PIPE_SHADER_FRAGMENT]);
 }
 
 
-static void
-update_compute_samplers(struct st_context *st)
+void
+st_update_compute_samplers(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
@@ -401,33 +410,7 @@
       update_shader_samplers(st,
                              PIPE_SHADER_COMPUTE,
                              ctx->ComputeProgram._Current,
-                             ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits,
                              st->state.samplers[PIPE_SHADER_COMPUTE],
                              &st->state.num_samplers[PIPE_SHADER_COMPUTE]);
    }
 }
-
-
-const struct st_tracked_state st_update_vertex_sampler = {
-   update_vertex_samplers				/* update */
-};
-
-const struct st_tracked_state st_update_tessctrl_sampler = {
-   update_tessctrl_samplers				/* update */
-};
-
-const struct st_tracked_state st_update_tesseval_sampler = {
-   update_tesseval_samplers				/* update */
-};
-
-const struct st_tracked_state st_update_geometry_sampler = {
-   update_geometry_samplers				/* update */
-};
-
-const struct st_tracked_state st_update_fragment_sampler = {
-   update_fragment_samplers				/* update */
-};
-
-const struct st_tracked_state st_update_compute_sampler = {
-   update_compute_samplers				/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_scissor.c b/src/mesa/state_tracker/st_atom_scissor.c
index fb478a3..a87d029 100644
--- a/src/mesa/state_tracker/st_atom_scissor.c
+++ b/src/mesa/state_tracker/st_atom_scissor.c
@@ -41,8 +41,8 @@
 /**
  * Scissor depends on the scissor box, and the framebuffer dimensions.
  */
-static void
-update_scissor( struct st_context *st )
+void
+st_update_scissor( struct st_context *st )
 {
    struct pipe_scissor_state scissor[PIPE_MAX_VIEWPORTS];
    const struct gl_context *ctx = st->ctx;
@@ -53,7 +53,7 @@
    unsigned i;
    bool changed = false;
 
-   for (i = 0 ; i < ctx->Const.MaxViewports; i++) {
+   for (i = 0 ; i < st->state.num_viewports; i++) {
       scissor[i].minx = 0;
       scissor[i].miny = 0;
       scissor[i].maxx = fb_width;
@@ -82,7 +82,7 @@
       /* Now invert Y if needed.
        * Gallium drivers use the convention Y=0=top for surfaces.
        */
-      if (st_fb_orientation(fb) == Y_0_TOP) {
+      if (st->state.fb_orientation == Y_0_TOP) {
          miny = fb->Height - scissor[i].maxy;
          maxy = fb->Height - scissor[i].miny;
          scissor[i].miny = miny;
@@ -95,12 +95,16 @@
          changed = true;
       }
    }
-   if (changed)
-      st->pipe->set_scissor_states(st->pipe, 0, ctx->Const.MaxViewports, scissor); /* activate */
+
+   if (changed) {
+      struct pipe_context *pipe = st->pipe;
+
+      pipe->set_scissor_states(pipe, 0, st->state.num_viewports, scissor);
+   }
 }
 
-static void
-update_window_rectangles(struct st_context *st)
+void
+st_update_window_rectangles(struct st_context *st)
 {
    struct pipe_scissor_state new_rects[PIPE_MAX_WINDOW_RECTANGLES];
    const struct gl_context *ctx = st->ctx;
@@ -139,11 +143,3 @@
       st->pipe->set_window_rectangles(
             st->pipe, include, num_rects, new_rects);
 }
-
-const struct st_tracked_state st_update_scissor = {
-   update_scissor					/* update */
-};
-
-const struct st_tracked_state st_update_window_rectangles = {
-   update_window_rectangles				/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index ee97c69..b5ba33a 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -93,8 +93,8 @@
  * Update fragment program state/atom.  This involves translating the
  * Mesa fragment program into a gallium fragment program and binding it.
  */
-static void
-update_fp( struct st_context *st )
+void
+st_update_fp( struct st_context *st )
 {
    struct st_fragment_program *stfp;
    struct st_fp_variant_key key;
@@ -103,52 +103,56 @@
    stfp = st_fragment_program(st->ctx->FragmentProgram._Current);
    assert(stfp->Base.Target == GL_FRAGMENT_PROGRAM_ARB);
 
-   memset(&key, 0, sizeof(key));
-   key.st = st->has_shareable_shaders ? NULL : st;
+   void *shader;
 
-   /* _NEW_FRAG_CLAMP */
-   key.clamp_color = st->clamp_frag_color_in_shader &&
-                     st->ctx->Color._ClampFragmentColor;
+   if (st->shader_has_one_variant[MESA_SHADER_FRAGMENT] &&
+       !stfp->ati_fs && /* ATI_fragment_shader always has multiple variants */
+       !stfp->Base.ExternalSamplersUsed && /* external samplers need variants */
+       stfp->variants &&
+       !stfp->variants->key.drawpixels &&
+       !stfp->variants->key.bitmap) {
+      shader = stfp->variants->driver_shader;
+   } else {
+      memset(&key, 0, sizeof(key));
+      key.st = st->has_shareable_shaders ? NULL : st;
 
-   /* _NEW_MULTISAMPLE | _NEW_BUFFERS */
-   key.persample_shading =
-      st->force_persample_in_shader &&
-      _mesa_is_multisample_enabled(st->ctx) &&
-      st->ctx->Multisample.SampleShading &&
-      st->ctx->Multisample.MinSampleShadingValue *
-      _mesa_geometric_samples(st->ctx->DrawBuffer) > 1;
+      /* _NEW_FRAG_CLAMP */
+      key.clamp_color = st->clamp_frag_color_in_shader &&
+                        st->ctx->Color._ClampFragmentColor;
 
-   if (stfp->ati_fs) {
-      key.fog = st->ctx->Fog._PackedEnabledMode;
+      /* _NEW_MULTISAMPLE | _NEW_BUFFERS */
+      key.persample_shading =
+         st->force_persample_in_shader &&
+         _mesa_is_multisample_enabled(st->ctx) &&
+         st->ctx->Multisample.SampleShading &&
+         st->ctx->Multisample.MinSampleShadingValue *
+         _mesa_geometric_samples(st->ctx->DrawBuffer) > 1;
 
-      for (unsigned u = 0; u < MAX_NUM_FRAGMENT_REGISTERS_ATI; u++) {
-         key.texture_targets[u] = get_texture_target(st->ctx, u);
+      if (stfp->ati_fs) {
+         key.fog = st->ctx->Fog._PackedEnabledMode;
+
+         for (unsigned u = 0; u < MAX_NUM_FRAGMENT_REGISTERS_ATI; u++) {
+            key.texture_targets[u] = get_texture_target(st->ctx, u);
+         }
       }
+
+      key.external = st_get_external_sampler_key(st, &stfp->Base);
+
+      shader = st_get_fp_variant(st, stfp, &key)->driver_shader;
    }
 
-   key.external = st_get_external_sampler_key(st, &stfp->Base);
-
-   st->fp_variant = st_get_fp_variant(st, stfp, &key);
-
    st_reference_fragprog(st, &st->fp, stfp);
 
-   cso_set_fragment_shader_handle(st->cso_context,
-                                  st->fp_variant->driver_shader);
+   cso_set_fragment_shader_handle(st->cso_context, shader);
 }
 
 
-const struct st_tracked_state st_update_fp = {
-   update_fp  					/* update */
-};
-
-
-
 /**
  * Update vertex program state/atom.  This involves translating the
  * Mesa vertex program into a gallium fragment program and binding it.
  */
-static void
-update_vp( struct st_context *st )
+void
+st_update_vp( struct st_context *st )
 {
    struct st_vertex_program *stvp;
    struct st_vp_variant_key key;
@@ -160,131 +164,94 @@
    stvp = st_vertex_program(st->ctx->VertexProgram._Current);
    assert(stvp->Base.Target == GL_VERTEX_PROGRAM_ARB);
 
-   memset(&key, 0, sizeof key);
-   key.st = st->has_shareable_shaders ? NULL : st;
+   if (st->shader_has_one_variant[MESA_SHADER_VERTEX] &&
+       stvp->variants &&
+       stvp->variants->key.passthrough_edgeflags == st->vertdata_edgeflags) {
+      st->vp_variant = stvp->variants;
+   } else {
+      memset(&key, 0, sizeof key);
+      key.st = st->has_shareable_shaders ? NULL : st;
 
-   /* When this is true, we will add an extra input to the vertex
-    * shader translation (for edgeflags), an extra output with
-    * edgeflag semantics, and extend the vertex shader to pass through
-    * the input to the output.  We'll need to use similar logic to set
-    * up the extra vertex_element input for edgeflags.
-    */
-   key.passthrough_edgeflags = st->vertdata_edgeflags;
+      /* When this is true, we will add an extra input to the vertex
+       * shader translation (for edgeflags), an extra output with
+       * edgeflag semantics, and extend the vertex shader to pass through
+       * the input to the output.  We'll need to use similar logic to set
+       * up the extra vertex_element input for edgeflags.
+       */
+      key.passthrough_edgeflags = st->vertdata_edgeflags;
 
-   key.clamp_color = st->clamp_vert_color_in_shader &&
-                     st->ctx->Light._ClampVertexColor &&
-                     (stvp->Base.info.outputs_written &
-                      (VARYING_SLOT_COL0 |
-                       VARYING_SLOT_COL1 |
-                       VARYING_SLOT_BFC0 |
-                       VARYING_SLOT_BFC1));
+      key.clamp_color = st->clamp_vert_color_in_shader &&
+                        st->ctx->Light._ClampVertexColor &&
+                        (stvp->Base.info.outputs_written &
+                         (VARYING_SLOT_COL0 |
+                          VARYING_SLOT_COL1 |
+                          VARYING_SLOT_BFC0 |
+                          VARYING_SLOT_BFC1));
 
-   st->vp_variant = st_get_vp_variant(st, stvp, &key);
+      st->vp_variant = st_get_vp_variant(st, stvp, &key);
+   }
 
    st_reference_vertprog(st, &st->vp, stvp);
 
    cso_set_vertex_shader_handle(st->cso_context, 
                                 st->vp_variant->driver_shader);
-
-   st->vertex_result_to_slot = stvp->result_to_output;
 }
 
 
-const struct st_tracked_state st_update_vp = {
-   update_vp						/* update */
-};
-
-
-
-static void
-update_gp( struct st_context *st )
+static void *
+st_update_common_program(struct st_context *st, struct gl_program *prog,
+                         unsigned pipe_shader, struct st_common_program **dst)
 {
-   struct st_geometry_program *stgp;
+   struct st_common_program *stp;
 
-   if (!st->ctx->GeometryProgram._Current) {
-      cso_set_geometry_shader_handle(st->cso_context, NULL);
-      st_reference_geomprog(st, &st->gp, NULL);
-      return;
+   if (!prog) {
+      st_reference_prog(st, dst, NULL);
+      return NULL;
    }
 
-   stgp = st_geometry_program(st->ctx->GeometryProgram._Current);
-   assert(stgp->Base.Target == GL_GEOMETRY_PROGRAM_NV);
+   stp = st_common_program(prog);
+   st_reference_prog(st, dst, stp);
 
-   st->gp_variant = st_get_basic_variant(st, PIPE_SHADER_GEOMETRY,
-                                         &stgp->tgsi, &stgp->variants);
+   if (st->shader_has_one_variant[prog->info.stage] && stp->variants)
+      return stp->variants->driver_shader;
 
-   st_reference_geomprog(st, &st->gp, stgp);
-
-   cso_set_geometry_shader_handle(st->cso_context,
-                                  st->gp_variant->driver_shader);
+   return st_get_basic_variant(st, pipe_shader, &stp->tgsi,
+                               &stp->variants)->driver_shader;
 }
 
-const struct st_tracked_state st_update_gp = {
-   update_gp  				/* update */
-};
 
-
-
-static void
-update_tcp( struct st_context *st )
+void
+st_update_gp(struct st_context *st)
 {
-   struct st_tessctrl_program *sttcp;
-
-   if (!st->ctx->TessCtrlProgram._Current) {
-      cso_set_tessctrl_shader_handle(st->cso_context, NULL);
-      st_reference_tesscprog(st, &st->tcp, NULL);
-      return;
-   }
-
-   sttcp = st_tessctrl_program(st->ctx->TessCtrlProgram._Current);
-   assert(sttcp->Base.Target == GL_TESS_CONTROL_PROGRAM_NV);
-
-   st->tcp_variant = st_get_basic_variant(st, PIPE_SHADER_TESS_CTRL,
-                                          &sttcp->tgsi, &sttcp->variants);
-
-   st_reference_tesscprog(st, &st->tcp, sttcp);
-
-   cso_set_tessctrl_shader_handle(st->cso_context,
-                                  st->tcp_variant->driver_shader);
+   void *shader = st_update_common_program(st,
+                                           st->ctx->GeometryProgram._Current,
+                                           PIPE_SHADER_GEOMETRY, &st->gp);
+   cso_set_geometry_shader_handle(st->cso_context, shader);
 }
 
-const struct st_tracked_state st_update_tcp = {
-   update_tcp  				/* update */
-};
 
-
-
-static void
-update_tep( struct st_context *st )
+void
+st_update_tcp(struct st_context *st)
 {
-   struct st_tesseval_program *sttep;
-
-   if (!st->ctx->TessEvalProgram._Current) {
-      cso_set_tesseval_shader_handle(st->cso_context, NULL);
-      st_reference_tesseprog(st, &st->tep, NULL);
-      return;
-   }
-
-   sttep = st_tesseval_program(st->ctx->TessEvalProgram._Current);
-   assert(sttep->Base.Target == GL_TESS_EVALUATION_PROGRAM_NV);
-
-   st->tep_variant = st_get_basic_variant(st, PIPE_SHADER_TESS_EVAL,
-                                          &sttep->tgsi, &sttep->variants);
-
-   st_reference_tesseprog(st, &st->tep, sttep);
-
-   cso_set_tesseval_shader_handle(st->cso_context,
-                                  st->tep_variant->driver_shader);
+   void *shader = st_update_common_program(st,
+                                           st->ctx->TessCtrlProgram._Current,
+                                           PIPE_SHADER_TESS_CTRL, &st->tcp);
+   cso_set_tessctrl_shader_handle(st->cso_context, shader);
 }
 
-const struct st_tracked_state st_update_tep = {
-   update_tep  				/* update */
-};
+
+void
+st_update_tep(struct st_context *st)
+{
+   void *shader = st_update_common_program(st,
+                                           st->ctx->TessEvalProgram._Current,
+                                           PIPE_SHADER_TESS_EVAL, &st->tep);
+   cso_set_tesseval_shader_handle(st->cso_context, shader);
+}
 
 
-
-static void
-update_cp( struct st_context *st )
+void
+st_update_cp( struct st_context *st )
 {
    struct st_compute_program *stcp;
 
@@ -297,14 +264,16 @@
    stcp = st_compute_program(st->ctx->ComputeProgram._Current);
    assert(stcp->Base.Target == GL_COMPUTE_PROGRAM_NV);
 
-   st->cp_variant = st_get_cp_variant(st, &stcp->tgsi, &stcp->variants);
+   void *shader;
+
+   if (st->shader_has_one_variant[MESA_SHADER_COMPUTE] && stcp->variants) {
+      shader = stcp->variants->driver_shader;
+   } else {
+      shader = st_get_cp_variant(st, &stcp->tgsi,
+                                 &stcp->variants)->driver_shader;
+   }
 
    st_reference_compprog(st, &st->cp, stcp);
 
-   cso_set_compute_shader_handle(st->cso_context,
-                                 st->cp_variant->driver_shader);
+   cso_set_compute_shader_handle(st->cso_context, shader);
 }
-
-const struct st_tracked_state st_update_cp = {
-   update_cp  				/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_stipple.c b/src/mesa/state_tracker/st_atom_stipple.c
index 5f7bf82..87599f9 100644
--- a/src/mesa/state_tracker/st_atom_stipple.c
+++ b/src/mesa/state_tracker/st_atom_stipple.c
@@ -60,9 +60,9 @@
 }
 
 
-
-static void
-update_stipple( struct st_context *st )
+/** Update the stipple when the pattern or window height changes */
+void
+st_update_polygon_stipple( struct st_context *st )
 {
    const struct gl_context *ctx = st->ctx;
    const GLuint sz = sizeof(st->state.poly_stipple);
@@ -84,9 +84,3 @@
       st->pipe->set_polygon_stipple(st->pipe, &newStipple);
    }
 }
-
-
-/** Update the stipple when the pattern or window height changes */
-const struct st_tracked_state st_update_polygon_stipple = {
-   update_stipple					/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_storagebuf.c b/src/mesa/state_tracker/st_atom_storagebuf.c
index ec89f16..43dd300 100644
--- a/src/mesa/state_tracker/st_atom_storagebuf.c
+++ b/src/mesa/state_tracker/st_atom_storagebuf.c
@@ -90,7 +90,7 @@
             NULL);
 }
 
-static void bind_vs_ssbos(struct st_context *st)
+void st_bind_vs_ssbos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
@@ -98,11 +98,7 @@
    st_bind_ssbos(st, prog, PIPE_SHADER_VERTEX);
 }
 
-const struct st_tracked_state st_bind_vs_ssbos = {
-   bind_vs_ssbos
-};
-
-static void bind_fs_ssbos(struct st_context *st)
+void st_bind_fs_ssbos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT];
@@ -110,11 +106,7 @@
    st_bind_ssbos(st, prog, PIPE_SHADER_FRAGMENT);
 }
 
-const struct st_tracked_state st_bind_fs_ssbos = {
-   bind_fs_ssbos
-};
-
-static void bind_gs_ssbos(struct st_context *st)
+void st_bind_gs_ssbos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
@@ -122,11 +114,7 @@
    st_bind_ssbos(st, prog, PIPE_SHADER_GEOMETRY);
 }
 
-const struct st_tracked_state st_bind_gs_ssbos = {
-   bind_gs_ssbos
-};
-
-static void bind_tcs_ssbos(struct st_context *st)
+void st_bind_tcs_ssbos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
@@ -134,11 +122,7 @@
    st_bind_ssbos(st, prog, PIPE_SHADER_TESS_CTRL);
 }
 
-const struct st_tracked_state st_bind_tcs_ssbos = {
-   bind_tcs_ssbos
-};
-
-static void bind_tes_ssbos(struct st_context *st)
+void st_bind_tes_ssbos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
@@ -146,18 +130,10 @@
    st_bind_ssbos(st, prog, PIPE_SHADER_TESS_EVAL);
 }
 
-const struct st_tracked_state st_bind_tes_ssbos = {
-   bind_tes_ssbos
-};
-
-static void bind_cs_ssbos(struct st_context *st)
+void st_bind_cs_ssbos(struct st_context *st)
 {
    struct gl_program *prog =
       st->ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
 
    st_bind_ssbos(st, prog, PIPE_SHADER_COMPUTE);
 }
-
-const struct st_tracked_state st_bind_cs_ssbos = {
-   bind_cs_ssbos
-};
diff --git a/src/mesa/state_tracker/st_atom_tess.c b/src/mesa/state_tracker/st_atom_tess.c
index 103e41d..6cf3ff7 100644
--- a/src/mesa/state_tracker/st_atom_tess.c
+++ b/src/mesa/state_tracker/st_atom_tess.c
@@ -37,8 +37,8 @@
 #include "st_atom.h"
 
 
-static void
-update_tess(struct st_context *st)
+void
+st_update_tess(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
    struct pipe_context *pipe = st->pipe;
@@ -50,8 +50,3 @@
                         ctx->TessCtrlProgram.patch_default_outer_level,
                         ctx->TessCtrlProgram.patch_default_inner_level);
 }
-
-
-const struct st_tracked_state st_update_tess = {
-   update_tess                  /* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index fa4b644..81bf629 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -52,16 +52,18 @@
 #include "cso_cache/cso_context.h"
 
 
-static GLboolean
-update_single_texture(struct st_context *st,
-                      struct pipe_sampler_view **sampler_view,
-		      GLuint texUnit, unsigned glsl_version)
+/**
+ * Get a pipe_sampler_view object from a texture unit.
+ */
+void
+st_update_single_texture(struct st_context *st,
+                         struct pipe_sampler_view **sampler_view,
+                         GLuint texUnit, bool glsl130_or_later)
 {
    struct gl_context *ctx = st->ctx;
    const struct gl_sampler_object *samp;
    struct gl_texture_object *texObj;
    struct st_texture_object *stObj;
-   GLboolean retval;
 
    samp = _mesa_get_samplerobj(ctx, texUnit);
 
@@ -70,21 +72,27 @@
 
    stObj = st_texture_object(texObj);
 
-   retval = st_finalize_texture(ctx, st->pipe, texObj, 0);
-   if (!retval) {
+   if (unlikely(texObj->Target == GL_TEXTURE_BUFFER)) {
+      *sampler_view = st_get_buffer_sampler_view_from_stobj(st, stObj);
+      return;
+   }
+
+   if (!st_finalize_texture(ctx, st->pipe, texObj, 0) ||
+       !stObj->pt) {
       /* out of mem */
-      return GL_FALSE;
+      *sampler_view = NULL;
+      return;
    }
 
    /* Check a few pieces of state outside the texture object to see if we
     * need to force revalidation.
     */
-   if (stObj->prev_glsl_version != glsl_version ||
+   if (stObj->prev_glsl130_or_later != glsl130_or_later ||
        stObj->prev_sRGBDecode != samp->sRGBDecode) {
 
       st_texture_release_all_sampler_views(st, stObj);
 
-      stObj->prev_glsl_version = glsl_version;
+      stObj->prev_glsl130_or_later = glsl130_or_later;
       stObj->prev_sRGBDecode = samp->sRGBDecode;
    }
 
@@ -93,52 +101,43 @@
          stObj->pt->screen->resource_changed(stObj->pt->screen, stObj->pt);
 
    *sampler_view =
-      st_get_texture_sampler_view_from_stobj(st, stObj, samp, glsl_version);
-   return GL_TRUE;
+      st_get_texture_sampler_view_from_stobj(st, stObj, samp,
+                                             glsl130_or_later);
 }
 
 
 
 static void
 update_textures(struct st_context *st,
-                gl_shader_stage mesa_shader,
+                enum pipe_shader_type shader_stage,
                 const struct gl_program *prog,
-                unsigned max_units,
                 struct pipe_sampler_view **sampler_views,
-                unsigned *num_textures)
+                unsigned *out_num_textures)
 {
-   const GLuint old_max = *num_textures;
+   const GLuint old_max = *out_num_textures;
    GLbitfield samplers_used = prog->SamplersUsed;
    GLbitfield free_slots = ~prog->SamplersUsed;
    GLbitfield external_samplers_used = prog->ExternalSamplersUsed;
    GLuint unit;
-   enum pipe_shader_type shader_stage = st_shader_stage_to_ptarget(mesa_shader);
 
    if (samplers_used == 0x0 && old_max == 0)
       return;
 
-   *num_textures = 0;
+   unsigned num_textures = 0;
+
+   /* prog->sh.data is NULL if it's ARB_fragment_program */
+   bool glsl130 = (prog->sh.data ? prog->sh.data->Version : 0) >= 130;
 
    /* loop over sampler units (aka tex image units) */
-   for (unit = 0; unit < max_units; unit++, samplers_used >>= 1) {
+   for (unit = 0; samplers_used || unit < old_max;
+        unit++, samplers_used >>= 1) {
       struct pipe_sampler_view *sampler_view = NULL;
 
       if (samplers_used & 1) {
-         /* prog->sh.data is NULL if it's ARB_fragment_program */
-         unsigned glsl_version = prog->sh.data ? prog->sh.data->Version : 0;
          const GLuint texUnit = prog->SamplerUnits[unit];
-         GLboolean retval;
 
-         retval = update_single_texture(st, &sampler_view, texUnit,
-                                        glsl_version);
-         if (retval == GL_FALSE)
-            continue;
-
-         *num_textures = unit + 1;
-      }
-      else if (samplers_used == 0 && unit >= old_max) {
-         /* if we've reset all the old views and we have no more new ones */
-         break;
+         st_update_single_texture(st, &sampler_view, texUnit, glsl130);
+         num_textures = unit + 1;
       }
 
       pipe_sampler_view_reference(&(sampler_views[unit]), sampler_view);
@@ -188,136 +187,101 @@
          break;
       }
 
-      *num_textures = MAX2(*num_textures, extra + 1);
+      num_textures = MAX2(num_textures, extra + 1);
    }
 
    cso_set_sampler_views(st->cso_context,
                          shader_stage,
-                         *num_textures,
+                         num_textures,
                          sampler_views);
+   *out_num_textures = num_textures;
 }
 
 
 
-static void
-update_vertex_textures(struct st_context *st)
+void
+st_update_vertex_textures(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
    if (ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits > 0) {
       update_textures(st,
-                      MESA_SHADER_VERTEX,
+                      PIPE_SHADER_VERTEX,
                       ctx->VertexProgram._Current,
-                      ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_VERTEX],
                       &st->state.num_sampler_views[PIPE_SHADER_VERTEX]);
    }
 }
 
 
-static void
-update_fragment_textures(struct st_context *st)
+void
+st_update_fragment_textures(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
    update_textures(st,
-                   MESA_SHADER_FRAGMENT,
+                   PIPE_SHADER_FRAGMENT,
                    ctx->FragmentProgram._Current,
-                   ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits,
                    st->state.sampler_views[PIPE_SHADER_FRAGMENT],
                    &st->state.num_sampler_views[PIPE_SHADER_FRAGMENT]);
 }
 
 
-static void
-update_geometry_textures(struct st_context *st)
+void
+st_update_geometry_textures(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
    if (ctx->GeometryProgram._Current) {
       update_textures(st,
-                      MESA_SHADER_GEOMETRY,
+                      PIPE_SHADER_GEOMETRY,
                       ctx->GeometryProgram._Current,
-                      ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_GEOMETRY],
                       &st->state.num_sampler_views[PIPE_SHADER_GEOMETRY]);
    }
 }
 
 
-static void
-update_tessctrl_textures(struct st_context *st)
+void
+st_update_tessctrl_textures(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
    if (ctx->TessCtrlProgram._Current) {
       update_textures(st,
-                      MESA_SHADER_TESS_CTRL,
+                      PIPE_SHADER_TESS_CTRL,
                       ctx->TessCtrlProgram._Current,
-                      ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_TESS_CTRL],
                       &st->state.num_sampler_views[PIPE_SHADER_TESS_CTRL]);
    }
 }
 
 
-static void
-update_tesseval_textures(struct st_context *st)
+void
+st_update_tesseval_textures(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
    if (ctx->TessEvalProgram._Current) {
       update_textures(st,
-                      MESA_SHADER_TESS_EVAL,
+                      PIPE_SHADER_TESS_EVAL,
                       ctx->TessEvalProgram._Current,
-                      ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_TESS_EVAL],
                       &st->state.num_sampler_views[PIPE_SHADER_TESS_EVAL]);
    }
 }
 
 
-static void
-update_compute_textures(struct st_context *st)
+void
+st_update_compute_textures(struct st_context *st)
 {
    const struct gl_context *ctx = st->ctx;
 
    if (ctx->ComputeProgram._Current) {
       update_textures(st,
-                      MESA_SHADER_COMPUTE,
+                      PIPE_SHADER_COMPUTE,
                       ctx->ComputeProgram._Current,
-                      ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_COMPUTE],
                       &st->state.num_sampler_views[PIPE_SHADER_COMPUTE]);
    }
 }
-
-
-const struct st_tracked_state st_update_fragment_texture = {
-   update_fragment_textures				/* update */
-};
-
-
-const struct st_tracked_state st_update_vertex_texture = {
-   update_vertex_textures				/* update */
-};
-
-
-const struct st_tracked_state st_update_geometry_texture = {
-   update_geometry_textures				/* update */
-};
-
-
-const struct st_tracked_state st_update_tessctrl_texture = {
-   update_tessctrl_textures				/* update */
-};
-
-
-const struct st_tracked_state st_update_tesseval_texture = {
-   update_tesseval_textures				/* update */
-};
-
-
-const struct st_tracked_state st_update_compute_texture = {
-   update_compute_textures				/* update */
-};
diff --git a/src/mesa/state_tracker/st_atom_viewport.c b/src/mesa/state_tracker/st_atom_viewport.c
index 8f750a9..6e3347e 100644
--- a/src/mesa/state_tracker/st_atom_viewport.c
+++ b/src/mesa/state_tracker/st_atom_viewport.c
@@ -39,49 +39,34 @@
  *  - depthrange
  *  - window pos/size or FBO size
  */
-static void
-update_viewport( struct st_context *st )
+void
+st_update_viewport( struct st_context *st )
 {
    struct gl_context *ctx = st->ctx;
-   GLfloat yScale, yBias;
    unsigned i;
-   /* _NEW_BUFFERS
-    */
-   if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) {
-      /* Drawing to a window.  The corresponding gallium surface uses
-       * Y=0=TOP but OpenGL is Y=0=BOTTOM.  So we need to invert the viewport.
-       */
-      yScale = -1;
-      yBias = (GLfloat)ctx->DrawBuffer->Height;
-   }
-   else {
-      /* Drawing to an FBO where Y=0=BOTTOM, like OpenGL - don't invert */
-      yScale = 1.0;
-      yBias = 0.0;
-   }
 
    /* _NEW_VIEWPORT 
     */
-   for (i = 0; i < ctx->Const.MaxViewports; i++)
-   {
-      float scale[3], translate[3];
+   for (i = 0; i < st->state.num_viewports; i++) {
+      float *scale = st->state.viewport[i].scale;
+      float *translate = st->state.viewport[i].translate;
+
       _mesa_get_viewport_xform(ctx, i, scale, translate);
 
-      st->state.viewport[i].scale[0] = scale[0];
-      st->state.viewport[i].scale[1] = scale[1] * yScale;
-      st->state.viewport[i].scale[2] = scale[2];
-
-      st->state.viewport[i].translate[0] = translate[0];
-      st->state.viewport[i].translate[1] = translate[1] * yScale + yBias;
-      st->state.viewport[i].translate[2] = translate[2];
+      /* _NEW_BUFFERS */
+      /* Drawing to a window where the coordinate system is upside down. */
+      if (st->state.fb_orientation == Y_0_TOP) {
+         scale[1] *= -1;
+         translate[1] = st->state.fb_height - translate[1];
+      }
    }
 
    cso_set_viewport(st->cso_context, &st->state.viewport[0]);
-   if (ctx->Const.MaxViewports > 1)
-      st->pipe->set_viewport_states(st->pipe, 1, ctx->Const.MaxViewports - 1, &st->state.viewport[1]);
+
+   if (st->state.num_viewports > 1) {
+      struct pipe_context *pipe = st->pipe;
+
+      pipe->set_viewport_states(pipe, 1, st->state.num_viewports - 1,
+                                &st->state.viewport[1]);
+   }
 }
-
-
-const struct st_tracked_state st_update_viewport = {
-   update_viewport					/* update */
-};
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index cf820e4..7ba6d82 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -83,27 +83,6 @@
 #define BITMAP_CACHE_WIDTH  512
 #define BITMAP_CACHE_HEIGHT 32
 
-struct bitmap_cache
-{
-   /** Window pos to render the cached image */
-   GLint xpos, ypos;
-   /** Bounds of region used in window coords */
-   GLint xmin, ymin, xmax, ymax;
-
-   GLfloat color[4];
-
-   /** Bitmap's Z position */
-   GLfloat zpos;
-
-   struct pipe_resource *texture;
-   struct pipe_transfer *trans;
-
-   GLboolean empty;
-
-   /** An I8 texture image: */
-   ubyte *buffer;
-};
-
 
 /** Epsilon for Z comparisons */
 #define Z_EPSILON 1e-06
@@ -212,7 +191,7 @@
       GLfloat colorSave[4];
       COPY_4V(colorSave, ctx->Current.Attrib[VERT_ATTRIB_COLOR0]);
       COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], color);
-      st_upload_constants(st, st->fp->Base.Parameters, MESA_SHADER_FRAGMENT);
+      st_upload_constants(st, &st->fp->Base);
       COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], colorSave);
    }
 
@@ -270,8 +249,8 @@
    }
 
    /* viewport state: viewport matching window dims */
-   cso_set_viewport_dims(cso, st->state.framebuffer.width,
-                         st->state.framebuffer.height,
+   cso_set_viewport_dims(cso, st->state.fb_width,
+                         st->state.fb_height,
                          st->state.fb_orientation == Y_0_TOP);
 
    cso_set_vertex_elements(cso, 3, st->util_velems);
@@ -304,8 +283,8 @@
 {
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
-   const float fb_width = (float) st->state.framebuffer.width;
-   const float fb_height = (float) st->state.framebuffer.height;
+   const float fb_width = (float) st->state.fb_width;
+   const float fb_height = (float) st->state.fb_height;
    const float x0 = (float) x;
    const float x1 = (float) (x + width);
    const float y0 = (float) y;
@@ -354,7 +333,7 @@
 static void
 reset_cache(struct st_context *st)
 {
-   struct bitmap_cache *cache = st->bitmap.cache;
+   struct st_bitmap_cache *cache = &st->bitmap.cache;
 
    /*memset(cache->buffer, 0xff, sizeof(cache->buffer));*/
    cache->empty = GL_TRUE;
@@ -377,7 +356,7 @@
 
 /** Print bitmap image to stdout (debug) */
 static void
-print_cache(const struct bitmap_cache *cache)
+print_cache(const struct st_bitmap_cache *cache)
 {
    int i, j, k;
 
@@ -402,7 +381,7 @@
 create_cache_trans(struct st_context *st)
 {
    struct pipe_context *pipe = st->pipe;
-   struct bitmap_cache *cache = st->bitmap.cache;
+   struct st_bitmap_cache *cache = &st->bitmap.cache;
 
    if (cache->trans)
       return;
@@ -426,9 +405,9 @@
 void
 st_flush_bitmap_cache(struct st_context *st)
 {
-   struct bitmap_cache *cache = st->bitmap.cache;
+   struct st_bitmap_cache *cache = &st->bitmap.cache;
 
-   if (cache && !cache->empty) {
+   if (!cache->empty) {
       struct pipe_context *pipe = st->pipe;
       struct pipe_sampler_view *sv;
 
@@ -483,7 +462,7 @@
              const GLubyte *bitmap )
 {
    struct st_context *st = ctx->st;
-   struct bitmap_cache *cache = st->bitmap.cache;
+   struct st_bitmap_cache *cache = &st->bitmap.cache;
    int px = -999, py = -999;
    const GLfloat z = ctx->Current.RasterPos[2];
 
@@ -557,14 +536,11 @@
    struct pipe_screen *screen = pipe->screen;
 
    /* This function should only be called once */
-   assert(st->bitmap.cache == NULL);
+   assert(st->bitmap.vs == NULL);
 
    assert(st->internal_target == PIPE_TEXTURE_2D ||
           st->internal_target == PIPE_TEXTURE_RECT);
 
-   /* alloc bitmap cache object */
-   st->bitmap.cache = ST_CALLOC_STRUCT(bitmap_cache);
-
    /* init sampler state once */
    memset(&st->bitmap.sampler, 0, sizeof(st->bitmap.sampler));
    st->bitmap.sampler.wrap_s = PIPE_TEX_WRAP_CLAMP;
@@ -638,7 +614,7 @@
 
    st_invalidate_readpix_cache(st);
 
-   if (!st->bitmap.cache) {
+   if (!st->bitmap.vs) {
       init_bitmap_state(st);
    }
 
@@ -690,15 +666,15 @@
    /* convert Z from [0,1] to [-1,-1] to match viewport Z scale/bias */
    const float z = ctx->Current.RasterPos[2] * 2.0f - 1.0f;
    const float *color = ctx->Current.RasterColor;
-   const float clip_x_scale = 2.0f / st->state.framebuffer.width;
-   const float clip_y_scale = 2.0f / st->state.framebuffer.height;
+   const float clip_x_scale = 2.0f / st->state.fb_width;
+   const float clip_y_scale = 2.0f / st->state.fb_height;
    const unsigned num_verts = count * 4;
    const unsigned num_vert_bytes = num_verts * sizeof(struct st_util_vertex);
    struct st_util_vertex *verts;
    struct pipe_vertex_buffer vb = {0};
    unsigned i;
 
-   if (!st->bitmap.cache) {
+   if (!st->bitmap.vs) {
       init_bitmap_state(st);
    }
 
@@ -718,7 +694,7 @@
    vb.stride = sizeof(struct st_util_vertex);
 
    u_upload_alloc(pipe->stream_uploader, 0, num_vert_bytes, 4,
-                  &vb.buffer_offset, &vb.buffer, (void **) &verts);
+                  &vb.buffer_offset, &vb.buffer.resource, (void **) &verts);
 
    if (unlikely(!verts)) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCallLists(bitmap text)");
@@ -805,7 +781,7 @@
 out:
    restore_render_state(ctx);
 
-   pipe_resource_reference(&vb.buffer, NULL);
+   pipe_resource_reference(&vb.buffer.resource, NULL);
 
    pipe_sampler_view_reference(&sv, NULL);
 
@@ -829,19 +805,15 @@
 st_destroy_bitmap(struct st_context *st)
 {
    struct pipe_context *pipe = st->pipe;
-   struct bitmap_cache *cache = st->bitmap.cache;
+   struct st_bitmap_cache *cache = &st->bitmap.cache;
 
    if (st->bitmap.vs) {
       cso_delete_vertex_shader(st->cso_context, st->bitmap.vs);
       st->bitmap.vs = NULL;
    }
 
-   if (cache) {
-      if (cache->trans && cache->buffer) {
-         pipe_transfer_unmap(pipe, cache->trans);
-      }
-      pipe_resource_reference(&st->bitmap.cache->texture, NULL);
-      free(st->bitmap.cache);
-      st->bitmap.cache = NULL;
+   if (cache->trans && cache->buffer) {
+      pipe_transfer_unmap(pipe, cache->trans);
    }
+   pipe_resource_reference(&st->bitmap.cache.texture, NULL);
 }
diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c
index 5911d1e..c1994d5 100644
--- a/src/mesa/state_tracker/st_cb_bufferobjects.c
+++ b/src/mesa/state_tracker/st_cb_bufferobjects.c
@@ -583,9 +583,6 @@
 st_init_bufferobject_functions(struct pipe_screen *screen,
                                struct dd_function_table *functions)
 {
-   /* plug in default driver fallbacks (such as for ClearBufferSubData) */
-   _mesa_init_buffer_object_functions(functions);
-
    functions->NewBufferObject = st_bufferobj_alloc;
    functions->DeleteBuffer = st_bufferobj_free;
    functions->BufferData = st_bufferobj_data;
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index f507775..cda9c71 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -36,6 +36,7 @@
 #include "main/glheader.h"
 #include "main/accum.h"
 #include "main/formats.h"
+#include "main/framebuffer.h"
 #include "main/macros.h"
 #include "main/glformats.h"
 #include "program/prog_instruction.h"
@@ -53,7 +54,6 @@
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
 #include "util/u_format.h"
-#include "util/u_framebuffer.h"
 #include "util/u_inlines.h"
 #include "util/u_simple_shaders.h"
 
@@ -180,12 +180,14 @@
    const struct gl_framebuffer *fb = ctx->DrawBuffer;
    const GLfloat fb_width = (GLfloat) fb->Width;
    const GLfloat fb_height = (GLfloat) fb->Height;
+
+   _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    const GLfloat x0 = (GLfloat) ctx->DrawBuffer->_Xmin / fb_width * 2.0f - 1.0f;
    const GLfloat x1 = (GLfloat) ctx->DrawBuffer->_Xmax / fb_width * 2.0f - 1.0f;
    const GLfloat y0 = (GLfloat) ctx->DrawBuffer->_Ymin / fb_height * 2.0f - 1.0f;
    const GLfloat y1 = (GLfloat) ctx->DrawBuffer->_Ymax / fb_height * 2.0f - 1.0f;
-   unsigned num_layers =
-      util_framebuffer_get_num_layers(&st->state.framebuffer);
+   unsigned num_layers = st->state.fb_num_layers;
 
    /*
    printf("%s %s%s%s %f,%f %f,%f\n", __func__,
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index bc4e533..384f965 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -41,6 +41,7 @@
 #include "main/pack.h"
 #include "main/pbo.h"
 #include "main/readpix.h"
+#include "main/state.h"
 #include "main/texformat.h"
 #include "main/teximage.h"
 #include "main/texstore.h"
@@ -1072,6 +1073,8 @@
    /* Mesa state should be up to date by now */
    assert(ctx->NewState == 0x0);
 
+   _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    st_flush_bitmap_cache(st);
    st_invalidate_readpix_cache(st);
 
@@ -1123,7 +1126,7 @@
       /* compiling a new fragment shader variant added new state constants
        * into the constant buffer, we need to update them
        */
-      st_upload_constants(st, st->fp->Base.Parameters, MESA_SHADER_FRAGMENT);
+      st_upload_constants(st, &st->fp->Base);
    }
 
    /* Put glDrawPixels image into a texture */
@@ -1317,7 +1320,7 @@
        !ctx->FragmentProgram.Enabled &&
        !ctx->VertexProgram.Enabled &&
        !ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT] &&
-       !ctx->ATIFragmentShader._Enabled &&
+       !_mesa_ati_fragment_shader_enabled(ctx) &&
        ctx->DrawBuffer->_NumColorDrawBuffers == 1 &&
        !ctx->Query.CondRenderQuery &&
        !ctx->Query.CurrentOcclusionObject) {
@@ -1437,6 +1440,8 @@
    GLint readX, readY, readW, readH;
    struct gl_pixelstore_attrib pack = ctx->DefaultPacking;
 
+   _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
+
    st_flush_bitmap_cache(st);
    st_invalidate_readpix_cache(st);
 
@@ -1486,7 +1491,7 @@
       /* compiling a new fragment shader variant added new state constants
        * into the constant buffer, we need to update them
        */
-      st_upload_constants(st, st->fp->Base.Parameters, MESA_SHADER_FRAGMENT);
+      st_upload_constants(st, &st->fp->Base);
    }
    else {
       assert(type == GL_DEPTH);
diff --git a/src/mesa/state_tracker/st_cb_eglimage.c b/src/mesa/state_tracker/st_cb_eglimage.c
index 972bdf7..cca2c02 100644
--- a/src/mesa/state_tracker/st_cb_eglimage.c
+++ b/src/mesa/state_tracker/st_cb_eglimage.c
@@ -70,46 +70,35 @@
 }
 
 /**
- * Return the surface of an EGLImage.
- * FIXME: I think this should operate on resources, not surfaces
+ * Return the gallium texture of an EGLImage.
  */
-static struct pipe_surface *
-st_egl_image_get_surface(struct gl_context *ctx, GLeglImageOES image_handle,
-                         unsigned usage, const char *error)
+static bool
+st_get_egl_image(struct gl_context *ctx, GLeglImageOES image_handle,
+                 unsigned usage, const char *error, struct st_egl_image *out)
 {
    struct st_context *st = st_context(ctx);
    struct pipe_screen *screen = st->pipe->screen;
    struct st_manager *smapi =
       (struct st_manager *) st->iface.st_context_private;
-   struct st_egl_image stimg;
-   struct pipe_surface *ps, surf_tmpl;
 
    if (!smapi || !smapi->get_egl_image)
-      return NULL;
+      return false;
 
-   memset(&stimg, 0, sizeof(stimg));
-   if (!smapi->get_egl_image(smapi, (void *) image_handle, &stimg)) {
+   memset(out, 0, sizeof(*out));
+   if (!smapi->get_egl_image(smapi, (void *) image_handle, out)) {
       /* image_handle does not refer to a valid EGL image object */
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(image handle not found)", error);
-      return NULL;
+      return false;
    }
 
-   if (!is_format_supported(screen, stimg.format, stimg.texture->nr_samples, usage)) {
+   if (!is_format_supported(screen, out->format, out->texture->nr_samples, usage)) {
       /* unable to specify a texture object using the specified EGL image */
-      pipe_resource_reference(&stimg.texture, NULL);
+      pipe_resource_reference(&out->texture, NULL);
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(format not supported)", error);
-      return NULL;
+      return false;
    }
 
-   u_surface_default_template(&surf_tmpl, stimg.texture);
-   surf_tmpl.format = stimg.format;
-   surf_tmpl.u.tex.level = stimg.level;
-   surf_tmpl.u.tex.first_layer = stimg.layer;
-   surf_tmpl.u.tex.last_layer = stimg.layer;
-   ps = st->pipe->create_surface(st->pipe, stimg.texture, &surf_tmpl);
-   pipe_resource_reference(&stimg.texture, NULL);
-
-   return ps;
+   return true;
 }
 
 /**
@@ -148,18 +137,37 @@
 					 GLeglImageOES image_handle)
 {
    struct st_renderbuffer *strb = st_renderbuffer(rb);
-   struct pipe_surface *ps;
+   struct st_egl_image stimg;
 
-   ps = st_egl_image_get_surface(ctx, image_handle, PIPE_BIND_RENDER_TARGET,
-				 "glEGLImageTargetRenderbufferStorage");
-   if (ps) {
+   if (st_get_egl_image(ctx, image_handle, PIPE_BIND_RENDER_TARGET,
+                        "glEGLImageTargetRenderbufferStorage",
+                        &stimg)) {
+      struct pipe_context *pipe = st_context(ctx)->pipe;
+      struct pipe_surface *ps, surf_tmpl;
+
+      u_surface_default_template(&surf_tmpl, stimg.texture);
+      surf_tmpl.format = stimg.format;
+      surf_tmpl.u.tex.level = stimg.level;
+      surf_tmpl.u.tex.first_layer = stimg.layer;
+      surf_tmpl.u.tex.last_layer = stimg.layer;
+      ps = pipe->create_surface(pipe, stimg.texture, &surf_tmpl);
+      pipe_resource_reference(&stimg.texture, NULL);
+
+      if (!ps)
+         return;
+
       strb->Base.Width = ps->width;
       strb->Base.Height = ps->height;
       strb->Base.Format = st_pipe_format_to_mesa_format(ps->format);
       strb->Base._BaseFormat = st_pipe_format_to_base_format(ps->format);
       strb->Base.InternalFormat = strb->Base._BaseFormat;
 
-      pipe_surface_reference(&strb->surface, ps);
+      struct pipe_surface **psurf =
+         util_format_is_srgb(ps->format) ? &strb->surface_srgb :
+                                           &strb->surface_linear;
+
+      pipe_surface_reference(psurf, ps);
+      strb->surface = *psurf;
       pipe_resource_reference(&strb->texture, ps->texture);
 
       pipe_surface_reference(&ps, NULL);
@@ -167,10 +175,10 @@
 }
 
 static void
-st_bind_surface(struct gl_context *ctx, GLenum target,
-                struct gl_texture_object *texObj,
-                struct gl_texture_image *texImage,
-                struct pipe_surface *ps)
+st_bind_egl_image(struct gl_context *ctx,
+                  struct gl_texture_object *texObj,
+                  struct gl_texture_image *texImage,
+                  struct st_egl_image *stimg)
 {
    struct st_context *st = st_context(ctx);
    struct st_texture_object *stObj;
@@ -179,7 +187,8 @@
    mesa_format texFormat;
 
    /* map pipe format to base format */
-   if (util_format_get_component_bits(ps->format, UTIL_FORMAT_COLORSPACE_RGB, 3) > 0)
+   if (util_format_get_component_bits(stimg->format,
+                                      UTIL_FORMAT_COLORSPACE_RGB, 3) > 0)
       internalFormat = GL_RGBA;
    else
       internalFormat = GL_RGB;
@@ -193,13 +202,13 @@
       stObj->surface_based = GL_TRUE;
    }
 
-   texFormat = st_pipe_format_to_mesa_format(ps->format);
+   texFormat = st_pipe_format_to_mesa_format(stimg->format);
 
    /* TODO RequiredTextureImageUnits should probably be reset back
     * to 1 somewhere if different texture is bound??
     */
    if (texFormat == MESA_FORMAT_NONE) {
-      switch (ps->format) {
+      switch (stimg->format) {
       case PIPE_FORMAT_NV12:
          texFormat = MESA_FORMAT_R_UNORM8;
          texObj->RequiredTextureImageUnits = 2;
@@ -214,15 +223,15 @@
    }
 
    _mesa_init_teximage_fields(ctx, texImage,
-                              ps->width, ps->height, 1, 0, internalFormat,
-                              texFormat);
+                              stimg->texture->width0, stimg->texture->height0,
+                              1, 0, internalFormat, texFormat);
 
-   /* FIXME create a non-default sampler view from the pipe_surface? */
-   pipe_resource_reference(&stObj->pt, ps->texture);
+   /* FIXME create a non-default sampler view from the stimg? */
+   pipe_resource_reference(&stObj->pt, stimg->texture);
    st_texture_release_all_sampler_views(st, stObj);
    pipe_resource_reference(&stImage->pt, stObj->pt);
 
-   stObj->surface_format = ps->format;
+   stObj->surface_format = stimg->format;
 
    _mesa_dirty_texobj(ctx, texObj);
 }
@@ -233,14 +242,14 @@
 			       struct gl_texture_image *texImage,
 			       GLeglImageOES image_handle)
 {
-   struct pipe_surface *ps;
+   struct st_egl_image stimg;
 
-   ps = st_egl_image_get_surface(ctx, image_handle, PIPE_BIND_SAMPLER_VIEW,
-				 "glEGLImageTargetTexture2D");
-   if (ps) {
-      st_bind_surface(ctx, target, texObj, texImage, ps);
-      pipe_surface_reference(&ps, NULL);
-   }
+   if (!st_get_egl_image(ctx, image_handle, PIPE_BIND_SAMPLER_VIEW,
+                         "glEGLImageTargetTexture2D", &stimg))
+      return;
+
+   st_bind_egl_image(ctx, texObj, texImage, &stimg);
+   pipe_resource_reference(&stimg.texture, NULL);
 }
 
 void
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 7b9855f..23cbcdc 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 
@@ -112,11 +112,9 @@
                               GLuint width, GLuint height)
 {
    struct st_context *st = st_context(ctx);
-   struct pipe_context *pipe = st->pipe;
    struct pipe_screen *screen = st->pipe->screen;
    struct st_renderbuffer *strb = st_renderbuffer(rb);
    enum pipe_format format = PIPE_FORMAT_NONE;
-   struct pipe_surface surf_tmpl;
    struct pipe_resource templ;
 
    /* init renderbuffer fields */
@@ -132,8 +130,10 @@
 
    /* Free the old surface and texture
     */
-   pipe_surface_reference( &strb->surface, NULL );
-   pipe_resource_reference( &strb->texture, NULL );
+   pipe_surface_reference(&strb->surface_srgb, NULL);
+   pipe_surface_reference(&strb->surface_linear, NULL);
+   strb->surface = NULL;
+   pipe_resource_reference(&strb->texture, NULL);
 
    /* If an sRGB framebuffer is unsupported, sRGB formats behave like linear
     * formats.
@@ -215,17 +215,7 @@
    if (!strb->texture)
       return FALSE;
 
-   u_surface_default_template(&surf_tmpl, strb->texture);
-   strb->surface = pipe->create_surface(pipe,
-                                        strb->texture,
-                                        &surf_tmpl);
-   if (strb->surface) {
-      assert(strb->surface->texture);
-      assert(strb->surface->format);
-      assert(strb->surface->width == width);
-      assert(strb->surface->height == height);
-   }
-
+   st_update_renderbuffer_surface(st, strb);
    return strb->surface != NULL;
 }
 
@@ -239,7 +229,9 @@
    struct st_renderbuffer *strb = st_renderbuffer(rb);
    if (ctx) {
       struct st_context *st = st_context(ctx);
-      pipe_surface_release(st->pipe, &strb->surface);
+      pipe_surface_release(st->pipe, &strb->surface_srgb);
+      pipe_surface_release(st->pipe, &strb->surface_linear);
+      strb->surface = NULL;
    }
    pipe_resource_reference(&strb->texture, NULL);
    free(strb->data);
@@ -361,7 +353,7 @@
       break;
    default:
       _mesa_problem(NULL,
-		    "Unexpected format %s in st_new_renderbuffer_fb",
+                    "Unexpected format %s in st_new_renderbuffer_fb",
                     util_format_name(format));
       free(strb);
       return NULL;
@@ -388,18 +380,19 @@
 {
    struct pipe_context *pipe = st->pipe;
    struct pipe_resource *resource = strb->texture;
-   struct st_texture_object *stTexObj = NULL;
+   const struct st_texture_object *stTexObj = NULL;
    unsigned rtt_width = strb->Base.Width;
    unsigned rtt_height = strb->Base.Height;
    unsigned rtt_depth = strb->Base.Depth;
+
    /*
     * For winsys fbo, it is possible that the renderbuffer is sRGB-capable but
     * the format of strb->texture is linear (because we have no control over
     * the format).  Check strb->Base.Format instead of strb->texture->format
     * to determine if the rb is sRGB-capable.
     */
-   boolean enable_srgb = (st->ctx->Color.sRGBEnabled &&
-         _mesa_get_format_color_encoding(strb->Base.Format) == GL_SRGB);
+   boolean enable_srgb = st->ctx->Color.sRGBEnabled &&
+      _mesa_get_format_color_encoding(strb->Base.Format) == GL_SRGB;
    enum pipe_format format = resource->format;
 
    if (strb->is_rtt) {
@@ -408,11 +401,7 @@
          format = stTexObj->surface_format;
    }
 
-   format = (enable_srgb) ?
-      util_format_srgb(format) :
-      util_format_linear(format);
-
-   unsigned first_layer, last_layer, level;
+   format = enable_srgb ? util_format_srgb(format) : util_format_linear(format);
 
    if (resource->target == PIPE_TEXTURE_1D_ARRAY) {
       rtt_depth = rtt_height;
@@ -420,6 +409,7 @@
    }
 
    /* find matching mipmap level size */
+   unsigned level;
    for (level = 0; level <= resource->last_level; level++) {
       if (u_minify(resource->width0, level) == rtt_width &&
           u_minify(resource->height0, level) == rtt_height &&
@@ -431,6 +421,7 @@
    assert(level <= resource->last_level);
 
    /* determine the layer bounds */
+   unsigned first_layer, last_layer;
    if (strb->rtt_layered) {
       first_layer = 0;
       last_layer = util_max_layer(strb->texture, level);
@@ -443,7 +434,7 @@
    /* Adjust for texture views */
    if (strb->is_rtt && resource->array_size > 1 &&
        stTexObj->base.Immutable) {
-      struct gl_texture_object *tex = &stTexObj->base;
+      const struct gl_texture_object *tex = &stTexObj->base;
       first_layer += tex->MinLayer;
       if (!strb->rtt_layered)
          last_layer += tex->MinLayer;
@@ -451,15 +442,19 @@
          last_layer = MIN2(first_layer + tex->NumLayers - 1, last_layer);
    }
 
-   if (!strb->surface ||
-       strb->surface->texture->nr_samples != strb->Base.NumSamples ||
-       strb->surface->format != format ||
-       strb->surface->texture != resource ||
-       strb->surface->width != rtt_width ||
-       strb->surface->height != rtt_height ||
-       strb->surface->u.tex.level != level ||
-       strb->surface->u.tex.first_layer != first_layer ||
-       strb->surface->u.tex.last_layer != last_layer) {
+   struct pipe_surface **psurf =
+      enable_srgb ? &strb->surface_srgb : &strb->surface_linear;
+   struct pipe_surface *surf = *psurf;
+
+   if (!surf ||
+       surf->texture->nr_samples != strb->Base.NumSamples ||
+       surf->format != format ||
+       surf->texture != resource ||
+       surf->width != rtt_width ||
+       surf->height != rtt_height ||
+       surf->u.tex.level != level ||
+       surf->u.tex.first_layer != first_layer ||
+       surf->u.tex.last_layer != last_layer) {
       /* create a new pipe_surface */
       struct pipe_surface surf_tmpl;
       memset(&surf_tmpl, 0, sizeof(surf_tmpl));
@@ -468,12 +463,28 @@
       surf_tmpl.u.tex.first_layer = first_layer;
       surf_tmpl.u.tex.last_layer = last_layer;
 
-      pipe_surface_release(pipe, &strb->surface);
+      pipe_surface_release(pipe, psurf);
 
-      strb->surface = pipe->create_surface(pipe, resource, &surf_tmpl);
+      *psurf = pipe->create_surface(pipe, resource, &surf_tmpl);
    }
+   strb->surface = *psurf;
 }
 
+
+/**
+ * Return the pipe_resource which stores a particular texture image.
+ */
+static struct pipe_resource *
+get_teximage_resource(struct gl_texture_object *texObj,
+                      unsigned face, unsigned level)
+{
+   struct st_texture_image *stImg =
+      st_texture_image(texObj->Image[face][level]);
+
+   return stImg->pt;
+}
+
+
 /**
  * Called by ctx->Driver.RenderTexture
  */
@@ -491,7 +502,9 @@
    if (!st_finalize_texture(ctx, pipe, att->Texture, att->CubeMapFace))
       return;
 
-   pt = st_get_texobj_resource(att->Texture);
+   pt = get_teximage_resource(att->Texture,
+                              att->CubeMapFace,
+                              att->TextureLevel);
    assert(pt);
 
    /* point renderbuffer at texobject */
@@ -508,7 +521,7 @@
     * That's where the new renderbuffer (which we just created) gets
     * passed to the pipe as a (color/depth) render target.
     */
-   st_invalidate_state(ctx, _NEW_BUFFERS);
+   st_invalidate_buffers(st);
 
 
    /* Need to trigger a call to update_framebuffer() since we just
@@ -524,6 +537,7 @@
 static void
 st_finish_render_texture(struct gl_context *ctx, struct gl_renderbuffer *rb)
 {
+   struct st_context *st = st_context(ctx);
    struct st_renderbuffer *strb = st_renderbuffer(rb);
 
    if (!strb)
@@ -532,7 +546,7 @@
    strb->is_rtt = FALSE;
 
    /* restore previous framebuffer state */
-   st_invalidate_state(ctx, _NEW_BUFFERS);
+   st_invalidate_buffers(st);
 }
 
 
@@ -551,9 +565,9 @@
  */
 static GLboolean
 st_validate_attachment(struct gl_context *ctx,
-		       struct pipe_screen *screen,
-		       const struct gl_renderbuffer_attachment *att,
-		       unsigned bindings)
+                       struct pipe_screen *screen,
+                       const struct gl_renderbuffer_attachment *att,
+                       unsigned bindings)
 {
    const struct st_texture_object *stObj = st_texture_object(att->Texture);
    enum pipe_format format;
@@ -597,7 +611,7 @@
 
    return valid;
 }
- 
+
 
 /**
  * Check that the framebuffer configuration is valid in terms of what
@@ -639,18 +653,12 @@
       return;
    }
 
-   if (!st_validate_attachment(ctx,
-                               screen,
-                               depth,
-			       PIPE_BIND_DEPTH_STENCIL)) {
+   if (!st_validate_attachment(ctx, screen, depth, PIPE_BIND_DEPTH_STENCIL)) {
       fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
       st_fbo_invalid("Invalid depth attachment");
       return;
    }
-   if (!st_validate_attachment(ctx,
-                               screen,
-                               stencil,
-			       PIPE_BIND_DEPTH_STENCIL)) {
+   if (!st_validate_attachment(ctx, screen, stencil, PIPE_BIND_DEPTH_STENCIL)) {
       fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
       st_fbo_invalid("Invalid stencil attachment");
       return;
@@ -660,13 +668,10 @@
             &fb->Attachment[BUFFER_COLOR0 + i];
       enum pipe_format format;
 
-      if (!st_validate_attachment(ctx,
-                                  screen,
-				  att,
-				  PIPE_BIND_RENDER_TARGET)) {
-	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
-	 st_fbo_invalid("Invalid color attachment");
-	 return;
+      if (!st_validate_attachment(ctx, screen, att, PIPE_BIND_RENDER_TARGET)) {
+         fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+         st_fbo_invalid("Invalid color attachment");
+         return;
       }
 
       if (!mixed_formats) {
@@ -844,7 +849,8 @@
 
 
 
-void st_init_fbo_functions(struct dd_function_table *functions)
+void
+st_init_fbo_functions(struct dd_function_table *functions)
 {
    functions->NewFramebuffer = _mesa_new_framebuffer;
    functions->NewRenderbuffer = st_new_renderbuffer;
@@ -859,5 +865,3 @@
    functions->MapRenderbuffer = st_MapRenderbuffer;
    functions->UnmapRenderbuffer = st_UnmapRenderbuffer;
 }
-
-
diff --git a/src/mesa/state_tracker/st_cb_fbo.h b/src/mesa/state_tracker/st_cb_fbo.h
index 351fb9a..239bfd9 100644
--- a/src/mesa/state_tracker/st_cb_fbo.h
+++ b/src/mesa/state_tracker/st_cb_fbo.h
@@ -48,7 +48,12 @@
 {
    struct gl_renderbuffer Base;
    struct pipe_resource *texture;
-   struct pipe_surface *surface; /* temporary view into texture */
+   /* This points to either "surface_linear" or "surface_srgb".
+    * It doesn't hold the pipe_surface reference. The other two do.
+    */
+   struct pipe_surface *surface;
+   struct pipe_surface *surface_linear;
+   struct pipe_surface *surface_srgb;
    GLboolean defined;        /**< defined contents? */
 
    struct pipe_transfer *transfer; /**< only used when mapping the resource */
diff --git a/src/mesa/state_tracker/st_cb_feedback.c b/src/mesa/state_tracker/st_cb_feedback.c
index 7f383eb..436062b 100644
--- a/src/mesa/state_tracker/st_cb_feedback.c
+++ b/src/mesa/state_tracker/st_cb_feedback.c
@@ -99,13 +99,13 @@
     * color and texcoord attribs to use here.
     */
 
-   slot = st->vertex_result_to_slot[VARYING_SLOT_COL0];
+   slot = st->vp->result_to_output[VARYING_SLOT_COL0];
    if (slot != ~0U)
       color = v->data[slot];
    else
       color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0];
 
-   slot = st->vertex_result_to_slot[VARYING_SLOT_TEX0];
+   slot = st->vp->result_to_output[VARYING_SLOT_TEX0];
    if (slot != ~0U)
       texcoord = v->data[slot];
    else
@@ -282,7 +282,7 @@
 
    if (newMode == GL_RENDER) {
       /* restore normal VBO draw function */
-      vbo_set_draw_func(ctx, st_draw_vbo);
+      st_init_draw(st);
    }
    else if (newMode == GL_SELECT) {
       if (!st->selection_stage)
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index b104649..555fc5d 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -68,19 +68,11 @@
                                                  struct st_fragment_program);
       return _mesa_init_gl_program(&prog->Base, target, id, is_arb_asm);
    }
+   case GL_TESS_CONTROL_PROGRAM_NV:
+   case GL_TESS_EVALUATION_PROGRAM_NV:
    case GL_GEOMETRY_PROGRAM_NV: {
-      struct st_geometry_program *prog = rzalloc(NULL,
-                                                 struct st_geometry_program);
-      return _mesa_init_gl_program(&prog->Base, target, id, is_arb_asm);
-   }
-   case GL_TESS_CONTROL_PROGRAM_NV: {
-      struct st_tessctrl_program *prog = rzalloc(NULL,
-                                                 struct st_tessctrl_program);
-      return _mesa_init_gl_program(&prog->Base, target, id, is_arb_asm);
-   }
-   case GL_TESS_EVALUATION_PROGRAM_NV: {
-      struct st_tesseval_program *prog = rzalloc(NULL,
-                                                 struct st_tesseval_program);
+      struct st_common_program *prog = rzalloc(NULL,
+                                               struct st_common_program);
       return _mesa_init_gl_program(&prog->Base, target, id, is_arb_asm);
    }
    case GL_COMPUTE_PROGRAM_NV: {
@@ -113,16 +105,17 @@
             free_glsl_to_tgsi_visitor(stvp->glsl_to_tgsi);
       }
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+   case GL_TESS_EVALUATION_PROGRAM_NV:
    case GL_GEOMETRY_PROGRAM_NV:
       {
-         struct st_geometry_program *stgp =
-            (struct st_geometry_program *) prog;
+         struct st_common_program *p = st_common_program(prog);
 
-         st_release_basic_variants(st, stgp->Base.Target, &stgp->variants,
-                                   &stgp->tgsi);
+         st_release_basic_variants(st, p->Base.Target, &p->variants,
+                                   &p->tgsi);
          
-         if (stgp->glsl_to_tgsi)
-            free_glsl_to_tgsi_visitor(stgp->glsl_to_tgsi);
+         if (p->glsl_to_tgsi)
+            free_glsl_to_tgsi_visitor(p->glsl_to_tgsi);
       }
       break;
    case GL_FRAGMENT_PROGRAM_ARB:
@@ -136,30 +129,6 @@
             free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi);
       }
       break;
-   case GL_TESS_CONTROL_PROGRAM_NV:
-      {
-         struct st_tessctrl_program *sttcp =
-            (struct st_tessctrl_program *) prog;
-
-         st_release_basic_variants(st, sttcp->Base.Target, &sttcp->variants,
-                                   &sttcp->tgsi);
-
-         if (sttcp->glsl_to_tgsi)
-            free_glsl_to_tgsi_visitor(sttcp->glsl_to_tgsi);
-      }
-      break;
-   case GL_TESS_EVALUATION_PROGRAM_NV:
-      {
-         struct st_tesseval_program *sttep =
-            (struct st_tesseval_program *) prog;
-
-         st_release_basic_variants(st, sttep->Base.Target,
-                                   &sttep->variants, &sttep->tgsi);
-
-         if (sttep->glsl_to_tgsi)
-            free_glsl_to_tgsi_visitor(sttep->glsl_to_tgsi);
-      }
-      break;
    case GL_COMPUTE_PROGRAM_NV:
       {
          struct st_compute_program *stcp =
@@ -204,7 +173,7 @@
 	 st->dirty |= stfp->affected_states;
    }
    else if (target == GL_GEOMETRY_PROGRAM_NV) {
-      struct st_geometry_program *stgp = (struct st_geometry_program *) prog;
+      struct st_common_program *stgp = st_common_program(prog);
 
       st_release_basic_variants(st, stgp->Base.Target, &stgp->variants,
                                 &stgp->tgsi);
@@ -225,8 +194,8 @@
 	 st->dirty |= ST_NEW_VERTEX_PROGRAM(st, stvp);
    }
    else if (target == GL_TESS_CONTROL_PROGRAM_NV) {
-      struct st_tessctrl_program *sttcp =
-         (struct st_tessctrl_program *) prog;
+      struct st_common_program *sttcp =
+         st_common_program(prog);
 
       st_release_basic_variants(st, sttcp->Base.Target, &sttcp->variants,
                                 &sttcp->tgsi);
@@ -237,8 +206,8 @@
          st->dirty |= sttcp->affected_states;
    }
    else if (target == GL_TESS_EVALUATION_PROGRAM_NV) {
-      struct st_tesseval_program *sttep =
-         (struct st_tesseval_program *) prog;
+      struct st_common_program *sttep =
+         st_common_program(prog);
 
       st_release_basic_variants(st, sttep->Base.Target, &sttep->variants,
                                 &sttep->tgsi);
diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c
index 8518454..e266296 100644
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -44,6 +44,7 @@
 #include "st_context.h"
 #include "st_atom.h"
 #include "st_draw.h"
+#include "st_program.h"
 #include "st_cb_rasterpos.h"
 #include "draw/draw_context.h"
 #include "draw/draw_pipe.h"
@@ -109,7 +110,7 @@
  * else copy the current attrib.
  */
 static void
-update_attrib(struct gl_context *ctx, const GLuint *outputMapping,
+update_attrib(struct gl_context *ctx, const ubyte *outputMapping,
               const struct vertex_header *vert,
               GLfloat *dest,
               GLuint result, GLuint defaultAttrib)
@@ -134,7 +135,7 @@
    struct gl_context *ctx = rs->ctx;
    struct st_context *st = st_context(ctx);
    const GLfloat height = (GLfloat) ctx->DrawBuffer->Height;
-   const GLuint *outputMapping = st->vertex_result_to_slot;
+   const ubyte *outputMapping = st->vp->result_to_output;
    const GLfloat *pos;
    GLuint i;
 
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 795519d..84dd2d5 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -175,7 +175,7 @@
 
       if (view_target != PIPE_TEXTURE_3D) {
          templ.u.tex.first_layer = surface->u.tex.first_layer;
-         templ.u.tex.last_layer = templ.u.tex.last_layer;
+         templ.u.tex.last_layer = templ.u.tex.first_layer;
       } else {
          addr.constants.layer_offset = surface->u.tex.first_layer;
       }
@@ -251,14 +251,6 @@
    return success;
 }
 
-/* Invalidate the readpixels cache to ensure we don't read stale data.
- */
-void st_invalidate_readpix_cache(struct st_context *st)
-{
-   pipe_resource_reference(&st->readpix_cache.src, NULL);
-   pipe_resource_reference(&st->readpix_cache.cache, NULL);
-}
-
 /**
  * Create a staging texture and blit the requested region to it.
  */
diff --git a/src/mesa/state_tracker/st_cb_strings.c b/src/mesa/state_tracker/st_cb_strings.c
index fc48fd0..85fe5a7 100644
--- a/src/mesa/state_tracker/st_cb_strings.c
+++ b/src/mesa/state_tracker/st_cb_strings.c
@@ -39,8 +39,6 @@
 #include "st_context.h"
 #include "st_cb_strings.h"
 
-#define ST_VERSION_STRING "0.4"
-
 static const GLubyte *
 st_get_string(struct gl_context * ctx, GLenum name)
 {
@@ -49,17 +47,11 @@
 
    switch (name) {
    case GL_VENDOR: {
-      const char *vendor = screen->get_vendor( screen );
-      util_snprintf(st->vendor, sizeof(st->vendor), "%s", vendor);
-      return (GLubyte *) st->vendor;
+      return (GLubyte *) screen->get_vendor(screen);
    }
 
    case GL_RENDERER:
-      util_snprintf(st->renderer, sizeof(st->renderer), "Gallium %s on %s", 
-               ST_VERSION_STRING,
-	       screen->get_name( screen ));
-
-      return (GLubyte *) st->renderer;
+      return (GLubyte *) screen->get_name(screen);
 
    default:
       return NULL;
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 99c59f7..f66e1bd 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -154,6 +154,8 @@
    DBG("%s\n", __func__);
    _mesa_initialize_texture_object(ctx, &obj->base, name, target);
 
+   obj->needs_validation = true;
+
    return &obj->base;
 }
 
@@ -606,6 +608,8 @@
 
    assert(!stImage->pt); /* xxx this might be wrong */
 
+   stObj->needs_validation = true;
+
    etc_fallback_allocate(st, stImage);
 
    /* Look if the parent texture object has space for this image */
@@ -2465,26 +2469,19 @@
          stObj->lastLevel = stObj->base._MaxLevel;
    }
 
-   if (tObj->Target == GL_TEXTURE_BUFFER) {
-      struct st_buffer_object *st_obj = st_buffer_object(tObj->BufferObject);
-
-      if (!st_obj) {
-         pipe_resource_reference(&stObj->pt, NULL);
-         st_texture_release_all_sampler_views(st, stObj);
-         return GL_TRUE;
-      }
-
-      if (st_obj->buffer != stObj->pt) {
-         pipe_resource_reference(&stObj->pt, st_obj->buffer);
-         st_texture_release_all_sampler_views(st, stObj);
-      }
-      return GL_TRUE;
-
-   }
-
    firstImage = st_texture_image_const(stObj->base.Image[cubeMapFace][stObj->base.BaseLevel]);
    assert(firstImage);
 
+   /* Skip the loop over images in the common case of no images having
+    * changed.  But if the GL_BASE_LEVEL or GL_MAX_LEVEL change to something we
+    * haven't looked at, then we do need to look at those new images.
+    */
+   if (!stObj->needs_validation &&
+       stObj->base.BaseLevel >= stObj->validated_first_level &&
+       stObj->lastLevel <= stObj->validated_last_level) {
+      return GL_TRUE;
+   }
+
    /* If both firstImage and stObj point to a texture which can contain
     * all active images, favour firstImage.  Note that because of the
     * completeness requirement, we know that the image dimensions
@@ -2546,6 +2543,18 @@
                 stObj->base.Target == GL_TEXTURE_CUBE_MAP_ARRAY)
                ptHeight = ptWidth;
          }
+
+         /* At this point, the texture may be incomplete (mismatched cube
+          * face sizes, for example).  If that's the case, give up, but
+          * don't return GL_FALSE as that would raise an incorrect
+          * GL_OUT_OF_MEMORY error.  See Piglit fbo-incomplete-texture-03 test.
+          */
+         if (!stObj->base._BaseComplete) {
+            _mesa_test_texobj_completeness(ctx, &stObj->base);
+            if (!stObj->base._BaseComplete) {
+               return TRUE;
+            }
+         }
       }
 
       ptNumSamples = firstImage->base.NumSamples;
@@ -2631,6 +2640,10 @@
       }
    }
 
+   stObj->validated_first_level = stObj->base.BaseLevel;
+   stObj->validated_last_level = stObj->lastLevel;
+   stObj->needs_validation = false;
+
    return GL_TRUE;
 }
 
@@ -2638,6 +2651,8 @@
 /**
  * Called via ctx->Driver.AllocTextureStorage() to allocate texture memory
  * for a whole mipmap stack.
+ * Note: for multisample textures if the requested sample count is not
+ * supported, we search for the next higher supported sample count.
  */
 static GLboolean
 st_AllocTextureStorage(struct gl_context *ctx,
@@ -2666,10 +2681,11 @@
 
    /* Raise the sample count if the requested one is unsupported. */
    if (num_samples > 1) {
+      enum pipe_texture_target ptarget = gl_target_to_pipe(texObj->Target);
       boolean found = FALSE;
 
       for (; num_samples <= ctx->Const.MaxSamples; num_samples++) {
-         if (screen->is_format_supported(screen, fmt, PIPE_TEXTURE_2D,
+         if (screen->is_format_supported(screen, fmt, ptarget,
                                          num_samples,
                                          PIPE_BIND_SAMPLER_VIEW)) {
             /* Update the sample count in gl_texture_image as well. */
@@ -2712,6 +2728,11 @@
       }
    }
 
+   /* The texture is in a validated state, so no need to check later. */
+   stObj->needs_validation = false;
+   stObj->validated_first_level = 0;
+   stObj->validated_last_level = levels - 1;
+
    return GL_TRUE;
 }
 
@@ -2810,9 +2831,50 @@
     */
    st_texture_release_all_sampler_views(st, tex);
 
+   /* The texture is in a validated state, so no need to check later. */
+   tex->needs_validation = false;
+   tex->validated_first_level = 0;
+   tex->validated_last_level = numLevels - 1;
+
    return GL_TRUE;
 }
 
+
+/**
+ * Find the mipmap level in 'pt' which matches the level described by
+ * 'texImage'.
+ */
+static unsigned
+find_mipmap_level(const struct gl_texture_image *texImage,
+                  const struct pipe_resource *pt)
+{
+   const GLenum target = texImage->TexObject->Target;
+   GLint texWidth = texImage->Width;
+   GLint texHeight = texImage->Height;
+   GLint texDepth = texImage->Depth;
+   unsigned level, w;
+   uint16_t h, d, layers;
+
+   st_gl_texture_dims_to_pipe_dims(target, texWidth, texHeight, texDepth,
+                                   &w, &h, &d, &layers);
+
+   for (level = 0; level <= pt->last_level; level++) {
+      if (u_minify(pt->width0, level) == w &&
+          u_minify(pt->height0, level) == h &&
+          u_minify(pt->depth0, level) == d) {
+         return level;
+      }
+   }
+
+   /* If we get here, there must be some sort of inconsistency between
+    * the Mesa texture object/images and the gallium resource.
+    */
+   debug_printf("Inconsistent textures in find_mipmap_level()\n");
+
+   return texImage->Level;
+}
+
+
 static void
 st_ClearTexSubImage(struct gl_context *ctx,
                     struct gl_texture_image *texImage,
@@ -2821,11 +2883,12 @@
                     const void *clearValue)
 {
    static const char zeros[16] = {0};
+   struct gl_texture_object *texObj = texImage->TexObject;
    struct st_texture_image *stImage = st_texture_image(texImage);
    struct pipe_resource *pt = stImage->pt;
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
-   unsigned level = texImage->Level;
+   unsigned level;
    struct pipe_box box;
 
    if (!pt)
@@ -2836,10 +2899,25 @@
 
    u_box_3d(xoffset, yoffset, zoffset + texImage->Face,
             width, height, depth, &box);
-   if (texImage->TexObject->Immutable) {
-      level += texImage->TexObject->MinLevel;
-      box.z += texImage->TexObject->MinLayer;
+   if (texObj->Immutable) {
+      /* The texture object has to be consistent (no "loose", per-image
+       * gallium resources).  If this texture is a view into another
+       * texture, we have to apply the MinLevel/Layer offsets.  If this is
+       * not a texture view, the offsets will be zero.
+       */
+      assert(stImage->pt == st_texture_object(texObj)->pt);
+      level = texImage->Level + texObj->MinLevel;
+      box.z += texObj->MinLayer;
    }
+   else {
+      /* Texture level sizes may be inconsistent.  We may have "loose",
+       * per-image gallium resources.  The texImage->Level may not match
+       * the gallium resource texture level.
+       */
+      level = find_mipmap_level(texImage, pt);
+   }
+
+   assert(level <= pt->last_level);
 
    pipe->clear_texture(pipe, pt, level, &box, clearValue ? clearValue : zeros);
 }
@@ -2880,6 +2958,85 @@
 }
 
 
+static GLuint64
+st_NewTextureHandle(struct gl_context *ctx, struct gl_texture_object *texObj,
+                    struct gl_sampler_object *sampObj)
+{
+   struct st_context *st = st_context(ctx);
+   struct st_texture_object *stObj = st_texture_object(texObj);
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_sampler_view *view;
+   struct pipe_sampler_state sampler = {0};
+
+   if (texObj->Target != GL_TEXTURE_BUFFER) {
+      if (!st_finalize_texture(ctx, pipe, texObj, 0))
+         return 0;
+
+      st_convert_sampler(st, texObj, sampObj, &sampler);
+      view = st_get_texture_sampler_view_from_stobj(st, stObj, sampObj, 0);
+   } else {
+      view = st_get_buffer_sampler_view_from_stobj(st, stObj);
+   }
+
+   return pipe->create_texture_handle(pipe, view, &sampler);
+}
+
+
+static void
+st_DeleteTextureHandle(struct gl_context *ctx, GLuint64 handle)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+
+   pipe->delete_texture_handle(pipe, handle);
+}
+
+
+static void
+st_MakeTextureHandleResident(struct gl_context *ctx, GLuint64 handle,
+                             bool resident)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+
+   pipe->make_texture_handle_resident(pipe, handle, resident);
+}
+
+
+static GLuint64
+st_NewImageHandle(struct gl_context *ctx, struct gl_image_unit *imgObj)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_image_view image;
+
+   st_convert_image(st, imgObj, &image);
+
+   return pipe->create_image_handle(pipe, &image);
+}
+
+
+static void
+st_DeleteImageHandle(struct gl_context *ctx, GLuint64 handle)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+
+   pipe->delete_image_handle(pipe, handle);
+}
+
+
+static void
+st_MakeImageHandleResident(struct gl_context *ctx, GLuint64 handle,
+                           GLenum access, bool resident)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+
+   pipe->make_image_handle_resident(pipe, handle, access, resident);
+}
+
+
 void
 st_init_texture_functions(struct dd_function_table *functions)
 {
@@ -2914,4 +3071,12 @@
    functions->ClearTexSubImage = st_ClearTexSubImage;
 
    functions->TexParameter = st_TexParameter;
+
+   /* bindless functions */
+   functions->NewTextureHandle = st_NewTextureHandle;
+   functions->DeleteTextureHandle = st_DeleteTextureHandle;
+   functions->MakeTextureHandleResident = st_MakeTextureHandleResident;
+   functions->NewImageHandle = st_NewImageHandle;
+   functions->DeleteImageHandle = st_DeleteImageHandle;
+   functions->MakeImageHandleResident = st_MakeImageHandleResident;
 }
diff --git a/src/mesa/state_tracker/st_cb_texture.h b/src/mesa/state_tracker/st_cb_texture.h
index f647b16..ed6ed16 100644
--- a/src/mesa/state_tracker/st_cb_texture.h
+++ b/src/mesa/state_tracker/st_cb_texture.h
@@ -37,6 +37,7 @@
 struct gl_texture_object;
 struct pipe_context;
 struct st_context;
+struct st_texture_object;
 
 extern enum pipe_texture_target
 gl_target_to_pipe(GLenum target);
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index fb98a34..381ff9d 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -38,6 +38,7 @@
 #include "program/prog_cache.h"
 #include "vbo/vbo.h"
 #include "glapi/glapi.h"
+#include "st_manager.h"
 #include "st_context.h"
 #include "st_debug.h"
 #include "st_cb_bitmap.h"
@@ -131,12 +132,12 @@
 {
    struct st_vertex_program *vp =
       st_vertex_program(ctx->VertexProgram._Current);
-   struct st_tessctrl_program *tcp =
-      st_tessctrl_program(ctx->TessCtrlProgram._Current);
-   struct st_tesseval_program *tep =
-      st_tesseval_program(ctx->TessEvalProgram._Current);
-   struct st_geometry_program *gp =
-      st_geometry_program(ctx->GeometryProgram._Current);
+   struct st_common_program *tcp =
+      st_common_program(ctx->TessCtrlProgram._Current);
+   struct st_common_program *tep =
+      st_common_program(ctx->TessEvalProgram._Current);
+   struct st_common_program *gp =
+      st_common_program(ctx->GeometryProgram._Current);
    struct st_fragment_program *fp =
       st_fragment_program(ctx->FragmentProgram._Current);
    struct st_compute_program *cp =
@@ -161,50 +162,44 @@
 }
 
 
+void
+st_invalidate_buffers(struct st_context *st)
+{
+   st->dirty |= ST_NEW_BLEND |
+                ST_NEW_DSA |
+                ST_NEW_FB_STATE |
+                ST_NEW_SAMPLE_MASK |
+                ST_NEW_SAMPLE_SHADING |
+                ST_NEW_FS_STATE |
+                ST_NEW_POLY_STIPPLE |
+                ST_NEW_VIEWPORT |
+                ST_NEW_RASTERIZER |
+                ST_NEW_SCISSOR |
+                ST_NEW_WINDOW_RECTANGLES;
+}
+
+
 /**
  * Called via ctx->Driver.UpdateState()
  */
-void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state)
+static void
+st_invalidate_state(struct gl_context * ctx)
 {
+   GLbitfield new_state = ctx->NewState;
    struct st_context *st = st_context(ctx);
 
    if (new_state & _NEW_BUFFERS) {
-      st->dirty |= ST_NEW_BLEND |
-                   ST_NEW_DSA |
-                   ST_NEW_FB_STATE |
-                   ST_NEW_SAMPLE_MASK |
-                   ST_NEW_SAMPLE_SHADING |
-                   ST_NEW_FS_STATE |
-                   ST_NEW_POLY_STIPPLE |
-                   ST_NEW_VIEWPORT |
-                   ST_NEW_RASTERIZER |
-                   ST_NEW_SCISSOR |
-                   ST_NEW_WINDOW_RECTANGLES;
+      st_invalidate_buffers(st);
    } else {
       /* These set a subset of flags set by _NEW_BUFFERS, so we only have to
        * check them when _NEW_BUFFERS isn't set.
        */
-      if (new_state & (_NEW_DEPTH |
-                       _NEW_STENCIL))
-         st->dirty |= ST_NEW_DSA;
-
       if (new_state & _NEW_PROGRAM)
          st->dirty |= ST_NEW_RASTERIZER;
 
-      if (new_state & _NEW_SCISSOR)
-         st->dirty |= ST_NEW_RASTERIZER |
-                      ST_NEW_SCISSOR |
-                      ST_NEW_WINDOW_RECTANGLES;
-
       if (new_state & _NEW_FOG)
          st->dirty |= ST_NEW_FS_STATE;
 
-      if (new_state & _NEW_POLYGONSTIPPLE)
-         st->dirty |= ST_NEW_POLY_STIPPLE;
-
-      if (new_state & _NEW_VIEWPORT)
-         st->dirty |= ST_NEW_VIEWPORT;
-
       if (new_state & _NEW_FRAG_CLAMP) {
          if (st->clamp_frag_color_in_shader)
             st->dirty |= ST_NEW_FS_STATE;
@@ -213,33 +208,14 @@
       }
    }
 
-   if (new_state & _NEW_MULTISAMPLE) {
-      st->dirty |= ST_NEW_BLEND |
-                   ST_NEW_SAMPLE_MASK |
-                   ST_NEW_SAMPLE_SHADING |
-                   ST_NEW_RASTERIZER |
-                   ST_NEW_FS_STATE;
-   } else {
-      /* These set a subset of flags set by _NEW_MULTISAMPLE, so we only
-       * have to check them when _NEW_MULTISAMPLE isn't set.
-       */
-      if (new_state & (_NEW_LIGHT |
-                       _NEW_LINE |
-                       _NEW_POINT |
-                       _NEW_POLYGON |
-                       _NEW_TRANSFORM))
-         st->dirty |= ST_NEW_RASTERIZER;
-   }
+   if (new_state & (_NEW_LIGHT |
+                    _NEW_POINT))
+      st->dirty |= ST_NEW_RASTERIZER;
 
-   if (new_state & (_NEW_PROJECTION |
-                    _NEW_TRANSFORM) &&
+   if (new_state & _NEW_PROJECTION &&
        st_user_clip_planes_enabled(ctx))
       st->dirty |= ST_NEW_CLIP_STATE;
 
-   if (new_state & _NEW_COLOR)
-      st->dirty |= ST_NEW_BLEND |
-                   ST_NEW_DSA;
-
    if (new_state & _NEW_PIXEL)
       st->dirty |= ST_NEW_PIXEL_TRANSFER;
 
@@ -268,14 +244,6 @@
          st->dirty |= ST_NEW_FS_STATE;
       }
    }
-
-   if (new_state & _NEW_PROGRAM_CONSTANTS)
-      st->dirty |= st->active_states & ST_NEW_CONSTANTS;
-
-   /* This is the only core Mesa module we depend upon.
-    * No longer use swrast, swsetup, tnl.
-    */
-   _vbo_InvalidateState(ctx, new_state);
 }
 
 
@@ -292,6 +260,8 @@
    st_destroy_drawtex(st);
    st_destroy_perfmon(st);
    st_destroy_pbo_helpers(st);
+   st_destroy_bound_texture_handles(st);
+   st_destroy_bound_image_handles(st);
 
    for (shader = 0; shader < ARRAY_SIZE(st->state.sampler_views); shader++) {
       for (i = 0; i < ARRAY_SIZE(st->state.sampler_views[0]); i++) {
@@ -315,10 +285,11 @@
    free( st );
 }
 
+static void st_init_driver_flags(struct st_context *st);
 
 static struct st_context *
 st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
-		const struct st_config_options *options)
+		const struct st_config_options *options, bool no_error)
 {
    struct pipe_screen *screen = pipe->screen;
    uint i;
@@ -338,6 +309,8 @@
 
    st->has_user_constbuf =
       screen->get_param(screen, PIPE_CAP_USER_CONSTANT_BUFFERS);
+   st->can_bind_const_buffer_as_vertex =
+      screen->get_param(screen, PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX);
 
    /* Drivers still have to upload zero-stride vertex attribs manually
     * with the GL core profile, but they don't have to deal with any complex
@@ -397,6 +370,9 @@
 
    ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
 
+   if (no_error)
+      ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR;
+
    st->has_stencil_export =
       screen->get_param(screen, PIPE_CAP_SHADER_STENCIL_EXPORT);
    st->has_shader_model3 = screen->get_param(screen, PIPE_CAP_SM3);
@@ -484,6 +460,8 @@
    st->shader_has_one_variant[MESA_SHADER_GEOMETRY] = st->has_shareable_shaders;
    st->shader_has_one_variant[MESA_SHADER_COMPUTE] = st->has_shareable_shaders;
 
+   st->bitmap.cache.empty = true;
+
    _mesa_compute_version(ctx);
 
    if (ctx->Version == 0) {
@@ -496,26 +474,76 @@
 
    _mesa_initialize_dispatch_tables(ctx);
    _mesa_initialize_vbo_vtxfmt(ctx);
+   st_init_driver_flags(st);
+
+   /* Initialize context's winsys buffers list */
+   LIST_INITHEAD(&st->winsys_buffers);
 
    return st;
 }
 
-static void st_init_driver_flags(struct gl_driver_flags *f)
+static void st_init_driver_flags(struct st_context *st)
 {
+   struct gl_driver_flags *f = &st->ctx->DriverFlags;
+
    f->NewArray = ST_NEW_VERTEX_ARRAYS;
    f->NewRasterizerDiscard = ST_NEW_RASTERIZER;
    f->NewUniformBuffer = ST_NEW_UNIFORM_BUFFER;
    f->NewDefaultTessLevels = ST_NEW_TESS_STATE;
+
+   /* Shader resources */
    f->NewTextureBuffer = ST_NEW_SAMPLER_VIEWS;
    f->NewAtomicBuffer = ST_NEW_ATOMIC_BUFFER;
    f->NewShaderStorageBuffer = ST_NEW_STORAGE_BUFFER;
    f->NewImageUnits = ST_NEW_IMAGE_UNITS;
+
+   f->NewShaderConstants[MESA_SHADER_VERTEX] = ST_NEW_VS_CONSTANTS;
+   f->NewShaderConstants[MESA_SHADER_TESS_CTRL] = ST_NEW_TCS_CONSTANTS;
+   f->NewShaderConstants[MESA_SHADER_TESS_EVAL] = ST_NEW_TES_CONSTANTS;
+   f->NewShaderConstants[MESA_SHADER_GEOMETRY] = ST_NEW_GS_CONSTANTS;
+   f->NewShaderConstants[MESA_SHADER_FRAGMENT] = ST_NEW_FS_CONSTANTS;
+   f->NewShaderConstants[MESA_SHADER_COMPUTE] = ST_NEW_CS_CONSTANTS;
+
+   f->NewWindowRectangles = ST_NEW_WINDOW_RECTANGLES;
+   f->NewFramebufferSRGB = ST_NEW_FB_STATE;
+   f->NewScissorRect = ST_NEW_SCISSOR;
+   f->NewScissorTest = ST_NEW_SCISSOR | ST_NEW_RASTERIZER;
+   f->NewAlphaTest = ST_NEW_DSA;
+   f->NewBlend = ST_NEW_BLEND;
+   f->NewBlendColor = ST_NEW_BLEND_COLOR;
+   f->NewColorMask = ST_NEW_BLEND;
+   f->NewDepth = ST_NEW_DSA;
+   f->NewLogicOp = ST_NEW_BLEND;
+   f->NewStencil = ST_NEW_DSA;
+   f->NewMultisampleEnable = ST_NEW_BLEND | ST_NEW_RASTERIZER |
+                             ST_NEW_SAMPLE_MASK | ST_NEW_SAMPLE_SHADING;
+   f->NewSampleAlphaToXEnable = ST_NEW_BLEND;
+   f->NewSampleMask = ST_NEW_SAMPLE_MASK;
+   f->NewSampleShading = ST_NEW_SAMPLE_SHADING;
+
+   /* This depends on what the gallium driver wants. */
+   if (st->force_persample_in_shader) {
+      f->NewMultisampleEnable |= ST_NEW_FS_STATE;
+      f->NewSampleShading |= ST_NEW_FS_STATE;
+   } else {
+      f->NewSampleShading |= ST_NEW_RASTERIZER;
+   }
+
+   f->NewClipControl = ST_NEW_VIEWPORT | ST_NEW_RASTERIZER;
+   f->NewClipPlane = ST_NEW_CLIP_STATE;
+   f->NewClipPlaneEnable = ST_NEW_RASTERIZER;
+   f->NewDepthClamp = ST_NEW_RASTERIZER;
+   f->NewLineState = ST_NEW_RASTERIZER;
+   f->NewPolygonState = ST_NEW_RASTERIZER;
+   f->NewPolygonStipple = ST_NEW_POLY_STIPPLE;
+   f->NewViewport = ST_NEW_VIEWPORT;
 }
 
 struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
                                      const struct gl_config *visual,
                                      struct st_context *share,
-                                     const struct st_config_options *options)
+                                     const struct st_config_options *options,
+                                     bool no_error)
 {
    struct gl_context *ctx;
    struct gl_context *shareCtx = share ? share->ctx : NULL;
@@ -540,15 +568,13 @@
        !(ST_DEBUG & DEBUG_TGSI))
       ctx->Cache = pipe->screen->get_disk_shader_cache(pipe->screen);
 
-   st_init_driver_flags(&ctx->DriverFlags);
-
    /* XXX: need a capability bit in gallium to query if the pipe
     * driver prefers DP4 or MUL/MAD for vertex transformation.
     */
    if (debug_get_option_mesa_mvp_dp4())
       ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = GL_TRUE;
 
-   st = st_create_context_priv(ctx, pipe, options);
+   st = st_create_context_priv(ctx, pipe, options, no_error);
    if (!st) {
       _mesa_destroy_context(ctx);
    }
@@ -573,7 +599,17 @@
 void st_destroy_context( struct st_context *st )
 {
    struct gl_context *ctx = st->ctx;
-   GLuint i;
+   struct st_framebuffer *stfb, *next;
+
+   GET_CURRENT_CONTEXT(curctx);
+   if (curctx == NULL) {
+
+      /* No current context, but we need one to release
+       * renderbuffer surface when we release framebuffer.
+       * So temporarily bind the context.
+       */
+      _mesa_make_current(ctx, NULL, NULL);
+   }
 
    /* This must be called first so that glthread has a chance to finish */
    _mesa_glthread_destroy(ctx);
@@ -581,17 +617,17 @@
    _mesa_HashWalk(ctx->Shared->TexObjects, destroy_tex_sampler_cb, st);
 
    st_reference_fragprog(st, &st->fp, NULL);
-   st_reference_geomprog(st, &st->gp, NULL);
+   st_reference_prog(st, &st->gp, NULL);
    st_reference_vertprog(st, &st->vp, NULL);
-   st_reference_tesscprog(st, &st->tcp, NULL);
-   st_reference_tesseprog(st, &st->tep, NULL);
+   st_reference_prog(st, &st->tcp, NULL);
+   st_reference_prog(st, &st->tep, NULL);
    st_reference_compprog(st, &st->cp, NULL);
 
-   /* release framebuffer surfaces */
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
-      pipe_surface_reference(&st->state.framebuffer.cbufs[i], NULL);
+   /* release framebuffer in the winsys buffers list */
+   LIST_FOR_EACH_ENTRY_SAFE_REV(stfb, next, &st->winsys_buffers, head) {
+      st_framebuffer_reference(&stfb, NULL);
    }
-   pipe_surface_reference(&st->state.framebuffer.zsbuf, NULL);
+
    pipe_sampler_view_reference(&st->pixel_xfer.pixelmap_sampler_view, NULL);
    pipe_resource_reference(&st->pixel_xfer.pixelmap_texture, NULL);
 
@@ -617,14 +653,15 @@
 }
 
 static void
-st_set_background_context(struct gl_context *ctx)
+st_set_background_context(struct gl_context *ctx,
+                          struct util_queue_monitoring *queue_info)
 {
    struct st_context *st = ctx->st;
    struct st_manager *smapi =
       (struct st_manager*)st->iface.st_context_private;
 
    assert(smapi->set_background_context);
-   smapi->set_background_context(&st->iface);
+   smapi->set_background_context(&st->iface, queue_info);
 }
 
 void st_init_driver_functions(struct pipe_screen *screen,
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index bb00384..4c742ca 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -29,10 +29,11 @@
 #define ST_CONTEXT_H
 
 #include "main/mtypes.h"
-#include "pipe/p_state.h"
 #include "state_tracker/st_api.h"
 #include "main/fbobject.h"
 #include "state_tracker/st_atom.h"
+#include "util/u_inlines.h"
+#include "util/list.h"
 
 
 #ifdef __cplusplus
@@ -40,7 +41,6 @@
 #endif
 
 
-struct bitmap_cache;
 struct dd_function_table;
 struct draw_context;
 struct draw_stage;
@@ -59,6 +59,32 @@
    float s, t;
 };
 
+struct st_bitmap_cache
+{
+   /** Window pos to render the cached image */
+   GLint xpos, ypos;
+   /** Bounds of region used in window coords */
+   GLint xmin, ymin, xmax, ymax;
+
+   GLfloat color[4];
+
+   /** Bitmap's Z position */
+   GLfloat zpos;
+
+   struct pipe_resource *texture;
+   struct pipe_transfer *trans;
+
+   GLboolean empty;
+
+   /** An I8 texture image: */
+   ubyte *buffer;
+};
+
+struct st_bound_handles
+{
+   unsigned num_handles;
+   uint64_t *handles;
+};
 
 struct st_context
 {
@@ -85,6 +111,7 @@
    boolean has_half_float_packing;
    boolean has_multi_draw_indirect;
    boolean has_user_constbuf;
+   boolean can_bind_const_buffer_as_vertex;
 
    /**
     * If a shader can be created when we get its source.
@@ -100,7 +127,7 @@
     * on glViewpport calls, this is set via a option.
     */
    boolean invalidate_on_gl_viewport;
-
+   boolean draw_needs_minmax_index;
    boolean vertex_array_out_of_memory;
 
    /* Some state is contained in constant objects.
@@ -119,7 +146,11 @@
          void *ptr;
          unsigned size;
       } constants[PIPE_SHADER_TYPES];
-      struct pipe_framebuffer_state framebuffer;
+      unsigned fb_width;
+      unsigned fb_height;
+      unsigned fb_num_samples;
+      unsigned fb_num_layers;
+      unsigned num_viewports;
       struct pipe_scissor_state scissor[PIPE_MAX_VIEWPORTS];
       struct pipe_viewport_state viewport[PIPE_MAX_VIEWPORTS];
       struct {
@@ -127,16 +158,12 @@
          boolean include;
          struct pipe_scissor_state rects[PIPE_MAX_WINDOW_RECTANGLES];
       } window_rects;
-      unsigned sample_mask;
 
       GLuint poly_stipple[32];  /**< In OpenGL's bottom-to-top order */
 
       GLuint fb_orientation;
    } state;
 
-   char vendor[100];
-   char renderer[100];
-
    uint64_t dirty; /**< dirty states */
 
    /** This masks out unused shader resources. Only valid in draw calls. */
@@ -151,22 +178,14 @@
    GLboolean vertdata_edgeflags;
    GLboolean edgeflag_culls_prims;
 
-   /** Mapping from VARYING_SLOT_x to post-transformed vertex slot */
-   const GLuint *vertex_result_to_slot;
-
    struct st_vertex_program *vp;    /**< Currently bound vertex program */
    struct st_fragment_program *fp;  /**< Currently bound fragment program */
-   struct st_geometry_program *gp;  /**< Currently bound geometry program */
-   struct st_tessctrl_program *tcp; /**< Currently bound tess control program */
-   struct st_tesseval_program *tep; /**< Currently bound tess eval program */
+   struct st_common_program *gp;  /**< Currently bound geometry program */
+   struct st_common_program *tcp; /**< Currently bound tess control program */
+   struct st_common_program *tep; /**< Currently bound tess eval program */
    struct st_compute_program *cp;   /**< Currently bound compute program */
 
    struct st_vp_variant *vp_variant;
-   struct st_fp_variant *fp_variant;
-   struct st_basic_variant *gp_variant;
-   struct st_basic_variant *tcp_variant;
-   struct st_basic_variant *tep_variant;
-   struct st_basic_variant *cp_variant;
 
    struct {
       struct pipe_resource *pixelmap_texture;
@@ -180,7 +199,7 @@
       struct pipe_sampler_state atlas_sampler;
       enum pipe_format tex_format;
       void *vs;
-      struct bitmap_cache *cache;
+      struct st_bitmap_cache cache;
    } bitmap;
 
    /** for glDraw/CopyPixels */
@@ -254,6 +273,14 @@
    struct st_perf_monitor_group *perfmon;
 
    enum pipe_reset_status reset_status;
+
+   /* Array of bound texture/image handles which are resident in the context.
+    */
+   struct st_bound_handles bound_texture_handles[PIPE_SHADER_TYPES];
+   struct st_bound_handles bound_image_handles[PIPE_SHADER_TYPES];
+
+   /* Winsys buffers */
+   struct list_head winsys_buffers;
 };
 
 
@@ -272,22 +299,35 @@
 struct st_framebuffer
 {
    struct gl_framebuffer Base;
-   void *Private;
 
    struct st_framebuffer_iface *iface;
    enum st_attachment_type statts[ST_ATTACHMENT_COUNT];
    unsigned num_statts;
    int32_t stamp;
    int32_t iface_stamp;
+   uint32_t iface_ID;
+
+   /* list of framebuffer objects */
+   struct list_head head;
 };
 
 
 extern void st_init_driver_functions(struct pipe_screen *screen,
                                      struct dd_function_table *functions);
 
-void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state);
+void
+st_invalidate_buffers(struct st_context *st);
 
-void st_invalidate_readpix_cache(struct st_context *st);
+/* Invalidate the readpixels cache to ensure we don't read stale data.
+ */
+static inline void
+st_invalidate_readpix_cache(struct st_context *st)
+{
+   if (unlikely(st->readpix_cache.src)) {
+      pipe_resource_reference(&st->readpix_cache.src, NULL);
+      pipe_resource_reference(&st->readpix_cache.cache, NULL);
+   }
+}
 
 
 #define Y_0_TOP 1
@@ -334,6 +374,8 @@
       return PIPE_SHADER_TESS_EVAL;
    case MESA_SHADER_COMPUTE:
       return PIPE_SHADER_COMPUTE;
+   default:
+      break;
    }
 
    assert(!"should not be reached");
@@ -356,7 +398,8 @@
 st_create_context(gl_api api, struct pipe_context *pipe,
                   const struct gl_config *visual,
                   struct st_context *share,
-                  const struct st_config_options *options);
+                  const struct st_config_options *options,
+                  bool no_error);
 
 extern void
 st_destroy_context(struct st_context *st);
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index eb8ac2e..2fe7070 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -67,74 +67,23 @@
 
 
 /**
- * This is very similar to vbo_all_varyings_in_vbos() but we are
- * only interested in per-vertex data.  See bug 38626.
- */
-static GLboolean
-all_varyings_in_vbos(const struct gl_vertex_array *arrays[])
-{
-   GLuint i;
-
-   for (i = 0; i < VERT_ATTRIB_MAX; i++)
-      if (arrays[i]->StrideB &&
-          !arrays[i]->InstanceDivisor &&
-          !_mesa_is_bufferobj(arrays[i]->BufferObj))
-	 return GL_FALSE;
-
-   return GL_TRUE;
-}
-
-
-/**
- * Basically, translate Mesa's index buffer information into
- * a pipe_index_buffer object.
- */
-static void
-setup_index_buffer(struct st_context *st,
-                   const struct _mesa_index_buffer *ib)
-{
-   struct pipe_index_buffer ibuffer;
-   struct gl_buffer_object *bufobj = ib->obj;
-
-   ibuffer.index_size = vbo_sizeof_ib_type(ib->type);
-
-   /* get/create the index buffer object */
-   if (_mesa_is_bufferobj(bufobj)) {
-      /* indices are in a real VBO */
-      ibuffer.buffer = st_buffer_object(bufobj)->buffer;
-      ibuffer.offset = pointer_to_offset(ib->ptr);
-      ibuffer.user_buffer = NULL;
-   }
-   else {
-      /* indices are in user space memory */
-      ibuffer.buffer = NULL;
-      ibuffer.offset = 0;
-      ibuffer.user_buffer = ib->ptr;
-   }
-
-   cso_set_index_buffer(st->cso_context, &ibuffer);
-}
-
-
-/**
  * Set the restart index.
  */
 static void
-setup_primitive_restart(struct gl_context *ctx,
-                        const struct _mesa_index_buffer *ib,
-                        struct pipe_draw_info *info)
+setup_primitive_restart(struct gl_context *ctx, struct pipe_draw_info *info)
 {
    if (ctx->Array._PrimitiveRestart) {
-      info->restart_index = _mesa_primitive_restart_index(ctx, ib->type);
+      unsigned index_size = info->index_size;
+
+      info->restart_index =
+         _mesa_primitive_restart_index(ctx, index_size);
 
       /* Enable primitive restart only when the restart index can have an
        * effect. This is required for correctness in radeonsi VI support.
        * Other hardware may also benefit from taking a faster, non-restart path
        * when possible.
        */
-      if ((ib->type == GL_UNSIGNED_INT) ||
-          (ib->type == GL_UNSIGNED_SHORT && info->restart_index <= 0xffff) ||
-          (ib->type == GL_UNSIGNED_BYTE && info->restart_index <= 0xff))
+      if (index_size == 4 || info->restart_index < (1 << (index_size * 8)))
          info->primitive_restart = true;
    }
 }
@@ -156,13 +105,30 @@
    return prim;
 }
 
+static inline void
+prepare_draw(struct st_context *st, struct gl_context *ctx)
+{
+   /* Mesa core state should have been validated already */
+   assert(ctx->NewState == 0x0);
+
+   if (unlikely(!st->bitmap.cache.empty))
+      st_flush_bitmap_cache(st);
+
+   st_invalidate_readpix_cache(st);
+
+   /* Validate state. */
+   if ((st->dirty | ctx->NewDriverState) & ST_PIPELINE_RENDER_STATE_MASK ||
+       st->gfx_shaders_may_be_dirty) {
+      st_validate_state(st, ST_PIPELINE_RENDER);
+   }
+}
 
 /**
  * This function gets plugged into the VBO module and is called when
  * we have something to render.
  * Basically, translate the information into the format expected by gallium.
  */
-void
+static void
 st_draw_vbo(struct gl_context *ctx,
             const struct _mesa_prim *prims,
             GLuint nr_prims,
@@ -176,48 +142,50 @@
 {
    struct st_context *st = st_context(ctx);
    struct pipe_draw_info info;
-   const struct gl_vertex_array **arrays = ctx->Array._DrawArrays;
    unsigned i;
+   unsigned start = 0;
 
-   /* Mesa core state should have been validated already */
-   assert(ctx->NewState == 0x0);
+   prepare_draw(st, ctx);
 
-   st_flush_bitmap_cache(st);
-   st_invalidate_readpix_cache(st);
-
-   /* Validate state. */
-   if ((st->dirty | ctx->NewDriverState) & ST_PIPELINE_RENDER_STATE_MASK ||
-       st->gfx_shaders_may_be_dirty) {
-      st_validate_state(st, ST_PIPELINE_RENDER);
-   }
-
-   if (st->vertex_array_out_of_memory) {
+   if (st->vertex_array_out_of_memory)
       return;
-   }
 
-   util_draw_init_info(&info);
+   /* Initialize pipe_draw_info. */
+   info.primitive_restart = false;
+   info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
+   info.indirect = NULL;
+   info.count_from_stream_output = NULL;
 
    if (ib) {
+      struct gl_buffer_object *bufobj = ib->obj;
+
       /* Get index bounds for user buffers. */
-      if (!index_bounds_valid)
-         if (!all_varyings_in_vbos(arrays))
-            vbo_get_minmax_indices(ctx, prims, ib, &min_index, &max_index,
-                                   nr_prims);
-
-      setup_index_buffer(st, ib);
-
-      info.indexed = TRUE;
-      if (min_index != ~0U && max_index != ~0U) {
-         info.min_index = min_index;
-         info.max_index = max_index;
+      if (!index_bounds_valid && st->draw_needs_minmax_index) {
+         vbo_get_minmax_indices(ctx, prims, ib, &min_index, &max_index,
+                                nr_prims);
       }
 
-      /* The VBO module handles restart for the non-indexed GLDrawArrays
-       * so we only set these fields for indexed drawing:
-       */
-      setup_primitive_restart(ctx, ib, &info);
+      info.index_size = ib->index_size;
+      info.min_index = min_index;
+      info.max_index = max_index;
+
+      if (_mesa_is_bufferobj(bufobj)) {
+         /* indices are in a real VBO */
+         info.has_user_indices = false;
+         info.index.resource = st_buffer_object(bufobj)->buffer;
+         start = pointer_to_offset(ib->ptr) / info.index_size;
+      } else {
+         /* indices are in user space memory */
+         info.has_user_indices = true;
+         info.index.user = ib->ptr;
+      }
+
+      setup_primitive_restart(ctx, &info);
    }
    else {
+      info.index_size = 0;
+      info.has_user_indices = false;
+
       /* Transform feedback drawing is always non-indexed. */
       /* Set info.count_from_stream_output. */
       if (tfb_vertcount) {
@@ -230,12 +198,16 @@
 
    /* do actual drawing */
    for (i = 0; i < nr_prims; i++) {
-      info.mode = translate_prim(ctx, prims[i].mode);
-      info.start = prims[i].start;
       info.count = prims[i].count;
+
+      /* Skip no-op draw calls. */
+      if (!info.count && !tfb_vertcount)
+         continue;
+
+      info.mode = translate_prim(ctx, prims[i].mode);
+      info.start = start + prims[i].start;
       info.start_instance = prims[i].base_instance;
       info.instance_count = prims[i].num_instances;
-      info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
       info.index_bias = prims[i].basevertex;
       info.drawid = prims[i].draw_id;
       if (!ib) {
@@ -244,23 +216,15 @@
       }
 
       if (ST_DEBUG & DEBUG_DRAW) {
-         debug_printf("st/draw: mode %s  start %u  count %u  indexed %d\n",
+         debug_printf("st/draw: mode %s  start %u  count %u  index_size %d\n",
                       u_prim_name(info.mode),
                       info.start,
                       info.count,
-                      info.indexed);
+                      info.index_size);
       }
 
-      if (info.count_from_stream_output) {
-         cso_draw_vbo(st->cso_context, &info);
-      }
-      else if (info.primitive_restart) {
-         /* don't trim, restarts might be inside index list */
-         cso_draw_vbo(st->cso_context, &info);
-      }
-      else if (u_trim_pipe_prim(prims[i].mode, &info.count)) {
-         cso_draw_vbo(st->cso_context, &info);
-      }
+      /* Don't call u_trim_pipe_prim. Drivers should do it if they need it. */
+      cso_draw_vbo(st->cso_context, &info);
    }
 }
 
@@ -277,62 +241,61 @@
 {
    struct st_context *st = st_context(ctx);
    struct pipe_draw_info info;
+   struct pipe_draw_indirect_info indirect;
 
-   /* Mesa core state should have been validated already */
-   assert(ctx->NewState == 0x0);
    assert(stride);
+   prepare_draw(st, ctx);
 
-   st_invalidate_readpix_cache(st);
-
-   /* Validate state. */
-   if ((st->dirty | ctx->NewDriverState) & ST_PIPELINE_RENDER_STATE_MASK ||
-       st->gfx_shaders_may_be_dirty) {
-      st_validate_state(st, ST_PIPELINE_RENDER);
-   }
-
-   if (st->vertex_array_out_of_memory) {
+   if (st->vertex_array_out_of_memory)
       return;
-   }
 
+   memset(&indirect, 0, sizeof(indirect));
    util_draw_init_info(&info);
+   info.start = 0; /* index offset / index size */
 
    if (ib) {
-      setup_index_buffer(st, ib);
+      struct gl_buffer_object *bufobj = ib->obj;
 
-      info.indexed = TRUE;
+      /* indices are always in a real VBO */
+      assert(_mesa_is_bufferobj(bufobj));
+
+      info.index_size = ib->index_size;
+      info.index.resource = st_buffer_object(bufobj)->buffer;
+      info.start = pointer_to_offset(ib->ptr) / info.index_size;
 
       /* Primitive restart is not handled by the VBO module in this case. */
-      setup_primitive_restart(ctx, ib, &info);
+      setup_primitive_restart(ctx, &info);
    }
 
    info.mode = translate_prim(ctx, mode);
    info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
-   info.indirect = st_buffer_object(indirect_data)->buffer;
-   info.indirect_offset = indirect_offset;
+   info.indirect = &indirect;
+   indirect.buffer = st_buffer_object(indirect_data)->buffer;
+   indirect.offset = indirect_offset;
 
    if (ST_DEBUG & DEBUG_DRAW) {
-      debug_printf("st/draw indirect: mode %s drawcount %d indexed %d\n",
+      debug_printf("st/draw indirect: mode %s drawcount %d index_size %d\n",
                    u_prim_name(info.mode),
                    draw_count,
-                   info.indexed);
+                   info.index_size);
    }
 
    if (!st->has_multi_draw_indirect) {
       int i;
 
       assert(!indirect_params);
-      info.indirect_count = 1;
+      indirect.draw_count = 1;
       for (i = 0; i < draw_count; i++) {
          info.drawid = i;
          cso_draw_vbo(st->cso_context, &info);
-         info.indirect_offset += stride;
+         indirect.offset += stride;
       }
    } else {
-      info.indirect_count = draw_count;
-      info.indirect_stride = stride;
+      indirect.draw_count = draw_count;
+      indirect.stride = stride;
       if (indirect_params) {
-         info.indirect_params = st_buffer_object(indirect_params)->buffer;
-         info.indirect_params_offset = indirect_params_offset;
+         indirect.indirect_draw_count = st_buffer_object(indirect_params)->buffer;
+         indirect.indirect_draw_count_offset = indirect_params_offset;
       }
       cso_draw_vbo(st->cso_context, &info);
    }
@@ -398,8 +361,8 @@
 
    u_upload_alloc(st->pipe->stream_uploader, 0,
                   4 * sizeof(struct st_util_vertex), 4,
-                  &vb.buffer_offset, &vb.buffer, (void **) &verts);
-   if (!vb.buffer) {
+                  &vb.buffer_offset, &vb.buffer.resource, (void **) &verts);
+   if (!vb.buffer.resource) {
       return false;
    }
 
@@ -466,7 +429,7 @@
       cso_draw_arrays(st->cso_context, PIPE_PRIM_TRIANGLE_FAN, 0, 4);
    }
 
-   pipe_resource_reference(&vb.buffer, NULL);
+   pipe_resource_reference(&vb.buffer.resource, NULL);
 
    return true;
 }
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index 2c69ca4..12a30ee 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -50,18 +50,6 @@
 struct draw_context *st_get_draw_context(struct st_context *st);
 
 extern void
-st_draw_vbo(struct gl_context *ctx,
-            const struct _mesa_prim *prims,
-            GLuint nr_prims,
-            const struct _mesa_index_buffer *ib,
-	    GLboolean index_bounds_valid,
-            GLuint min_index,
-            GLuint max_index,
-            struct gl_transform_feedback_object *tfb_vertcount,
-            unsigned stream,
-            struct gl_buffer_object *indirect);
-
-extern void
 st_feedback_draw_vbo(struct gl_context *ctx,
                      const struct _mesa_prim *prims,
                      GLuint nr_prims,
diff --git a/src/mesa/state_tracker/st_draw_feedback.c b/src/mesa/state_tracker/st_draw_feedback.c
index fac83b9..987a156 100644
--- a/src/mesa/state_tracker/st_draw_feedback.c
+++ b/src/mesa/state_tracker/st_draw_feedback.c
@@ -128,7 +128,6 @@
    const struct pipe_shader_state *vs;
    struct pipe_vertex_buffer vbuffers[PIPE_MAX_SHADER_INPUTS];
    struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS];
-   struct pipe_index_buffer ibuffer;
    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {NULL};
    struct pipe_transfer *ib_transfer = NULL;
    const struct gl_vertex_array **arrays = ctx->Array._DrawArrays;
@@ -194,27 +193,27 @@
          struct st_buffer_object *stobj = st_buffer_object(bufobj);
          assert(stobj->buffer);
 
-         vbuffers[attr].buffer = NULL;
-         vbuffers[attr].user_buffer = NULL;
-         pipe_resource_reference(&vbuffers[attr].buffer, stobj->buffer);
+         vbuffers[attr].buffer.resource = NULL;
+         vbuffers[attr].is_user_buffer = false;
+         pipe_resource_reference(&vbuffers[attr].buffer.resource, stobj->buffer);
          vbuffers[attr].buffer_offset = pointer_to_offset(low_addr);
          velements[attr].src_offset = arrays[mesaAttr]->Ptr - low_addr;
 
          /* map the attrib buffer */
-         map = pipe_buffer_map(pipe, vbuffers[attr].buffer,
+         map = pipe_buffer_map(pipe, vbuffers[attr].buffer.resource,
                                PIPE_TRANSFER_READ,
                                &vb_transfer[attr]);
          draw_set_mapped_vertex_buffer(draw, attr, map,
-                                       vbuffers[attr].buffer->width0);
+                                       vbuffers[attr].buffer.resource->width0);
       }
       else {
-         vbuffers[attr].buffer = NULL;
-         vbuffers[attr].user_buffer = arrays[mesaAttr]->Ptr;
+         vbuffers[attr].buffer.user = arrays[mesaAttr]->Ptr;
+         vbuffers[attr].is_user_buffer = true;
          vbuffers[attr].buffer_offset = 0;
          velements[attr].src_offset = 0;
 
-         draw_set_mapped_vertex_buffer(draw, attr, vbuffers[attr].user_buffer,
-                                       ~0);
+         draw_set_mapped_vertex_buffer(draw, attr,
+                                       vbuffers[attr].buffer.user, ~0);
       }
 
       /* common-case setup */
@@ -238,31 +237,29 @@
    draw_set_vertex_buffers(draw, 0, vp->num_inputs, vbuffers);
    draw_set_vertex_elements(draw, vp->num_inputs, velements);
 
-   memset(&ibuffer, 0, sizeof(ibuffer));
+   unsigned start = 0;
+
    if (ib) {
       struct gl_buffer_object *bufobj = ib->obj;
+      unsigned index_size = ib->index_size;
 
-      ibuffer.index_size = vbo_sizeof_ib_type(ib->type);
-      if (ibuffer.index_size == 0)
+      if (index_size == 0)
          goto out_unref_vertex;
 
       if (bufobj && bufobj->Name) {
          struct st_buffer_object *stobj = st_buffer_object(bufobj);
 
-         pipe_resource_reference(&ibuffer.buffer, stobj->buffer);
-         ibuffer.offset = pointer_to_offset(ib->ptr);
-
+         start = pointer_to_offset(ib->ptr) / index_size;
          mapped_indices = pipe_buffer_map(pipe, stobj->buffer,
                                           PIPE_TRANSFER_READ, &ib_transfer);
       }
       else {
-         /* skip setting ibuffer.buffer as the draw module does not use it */
          mapped_indices = ib->ptr;
       }
 
       draw_set_indexes(draw,
-                       (ubyte *) mapped_indices + ibuffer.offset,
-                       ibuffer.index_size, ~0);
+                       (ubyte *) mapped_indices,
+                       index_size, ~0);
    }
 
    /* set the constant buffer */
@@ -273,7 +270,7 @@
 
    /* draw here */
    for (i = 0; i < nr_prims; i++) {
-      draw_arrays(draw, prims[i].mode, prims[i].start, prims[i].count);
+      draw_arrays(draw, prims[i].mode, start + prims[i].start, prims[i].count);
    }
 
 
@@ -284,7 +281,6 @@
       draw_set_indexes(draw, NULL, 0, 0);
       if (ib_transfer)
          pipe_buffer_unmap(pipe, ib_transfer);
-      pipe_resource_reference(&ibuffer.buffer, NULL);
    }
 
  out_unref_vertex:
@@ -292,7 +288,7 @@
       if (vb_transfer[attr])
          pipe_buffer_unmap(pipe, vb_transfer[attr]);
       draw_set_mapped_vertex_buffer(draw, attr, NULL, 0);
-      pipe_resource_reference(&vbuffers[attr].buffer, NULL);
+      pipe_vertex_buffer_unreference(&vbuffers[attr]);
    }
    draw_set_vertex_buffers(draw, 0, vp->num_inputs, NULL);
 }
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 1df2ba7..74193cc 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -270,7 +270,6 @@
       options->EmitNoLoops =
          !screen->get_shader_param(screen, sh,
                                    PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
-      options->EmitNoFunctions = true;
       options->EmitNoMainReturn =
          !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES);
 
@@ -464,6 +463,9 @@
 
    c->SparseBufferPageSize =
       screen->get_param(screen, PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE);
+
+   c->AllowMappedBuffersDuringExecution =
+      screen->get_param(screen, PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION);
 }
 
 
@@ -575,6 +577,7 @@
 
    static const struct st_extension_cap_mapping cap_mapping[] = {
       { o(ARB_base_instance),                PIPE_CAP_START_INSTANCE                   },
+      { o(ARB_bindless_texture),             PIPE_CAP_BINDLESS_TEXTURE                 },
       { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT   },
       { o(ARB_clear_texture),                PIPE_CAP_CLEAR_TEXTURE                    },
       { o(ARB_clip_control),                 PIPE_CAP_CLIP_HALFZ                       },
@@ -598,6 +601,7 @@
       { o(ARB_occlusion_query2),             PIPE_CAP_OCCLUSION_QUERY                  },
       { o(ARB_pipeline_statistics_query),    PIPE_CAP_QUERY_PIPELINE_STATISTICS        },
       { o(ARB_point_sprite),                 PIPE_CAP_POINT_SPRITE                     },
+      { o(ARB_post_depth_coverage),          PIPE_CAP_POST_DEPTH_COVERAGE              },
       { o(ARB_query_buffer_object),          PIPE_CAP_QUERY_BUFFER_OBJECT              },
       { o(ARB_robust_buffer_access_behavior), PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR   },
       { o(ARB_sample_shading),               PIPE_CAP_SAMPLE_SHADING                   },
@@ -890,6 +894,8 @@
 
    consts->ForceGLSLAbsSqrt = options->force_glsl_abs_sqrt;
 
+   consts->AllowGLSLBuiltinVariableRedeclaration = options->allow_glsl_builtin_variable_redeclaration;
+
    consts->dri_config_options_sha1 = options->config_options_sha1;
 
    if (consts->GLSLVersion >= 400)
@@ -925,7 +931,6 @@
       extensions->OES_depth_texture_cube_map = GL_TRUE;
       extensions->ARB_shading_language_420pack = GL_TRUE;
       extensions->ARB_texture_query_levels = GL_TRUE;
-      extensions->ARB_shader_subroutine = GL_TRUE;
 
       if (!options->disable_shader_bit_encoding) {
          extensions->ARB_shader_bit_encoding = GL_TRUE;
diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index 7901d50..c5dd0a5 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -1186,7 +1186,7 @@
    },
    {
       { 1, GL_LUMINANCE, GL_LUMINANCE4, GL_LUMINANCE8, 0 },
-      { PIPE_FORMAT_L8_UNORM, DEFAULT_RGB_FORMATS }
+      { PIPE_FORMAT_L8_UNORM, PIPE_FORMAT_L8A8_UNORM, DEFAULT_RGB_FORMATS }
    },
 
    /* basic Luminance/Alpha formats */
@@ -1682,101 +1682,101 @@
    {
       { GL_ALPHA_INTEGER_EXT,
         GL_ALPHA8I_EXT, 0 },
-      { PIPE_FORMAT_A8_SINT, 0 }
+      { PIPE_FORMAT_A8_SINT, PIPE_FORMAT_R8G8B8A8_SINT, 0 }
    },
    {
       { GL_ALPHA16I_EXT, 0 },
-      { PIPE_FORMAT_A16_SINT, 0 }
+      { PIPE_FORMAT_A16_SINT, PIPE_FORMAT_R16G16B16A16_SINT, 0 }
    },
    {
       { GL_ALPHA32I_EXT, 0 },
-      { PIPE_FORMAT_A32_SINT, 0 }
+      { PIPE_FORMAT_A32_SINT, PIPE_FORMAT_R32G32B32A32_SINT, 0 }
    },
    {
       { GL_ALPHA8UI_EXT, 0 },
-      { PIPE_FORMAT_A8_UINT, 0 }
+      { PIPE_FORMAT_A8_UINT, PIPE_FORMAT_R8G8B8A8_UINT, 0 }
    },
    {
       { GL_ALPHA16UI_EXT, 0 },
-      { PIPE_FORMAT_A16_UINT, 0 }
+      { PIPE_FORMAT_A16_UINT, PIPE_FORMAT_R16G16B16A16_UINT, 0 }
    },
    {
       { GL_ALPHA32UI_EXT, 0 },
-      { PIPE_FORMAT_A32_UINT, 0 }
+      { PIPE_FORMAT_A32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, 0 }
    },
    {
       { GL_INTENSITY8I_EXT, 0 },
-      { PIPE_FORMAT_I8_SINT, 0 }
+      { PIPE_FORMAT_I8_SINT, PIPE_FORMAT_R8G8B8A8_SINT, 0 }
    },
    {
       { GL_INTENSITY16I_EXT, 0 },
-      { PIPE_FORMAT_I16_SINT, 0 }
+      { PIPE_FORMAT_I16_SINT, PIPE_FORMAT_R16G16B16A16_SINT, 0 }
    },
    {
       { GL_INTENSITY32I_EXT, 0 },
-      { PIPE_FORMAT_I32_SINT, 0 }
+      { PIPE_FORMAT_I32_SINT, PIPE_FORMAT_R32G32B32A32_SINT, 0 }
    },
    {
       { GL_INTENSITY8UI_EXT, 0 },
-      { PIPE_FORMAT_I8_UINT, 0 }
+      { PIPE_FORMAT_I8_UINT, PIPE_FORMAT_R8G8B8A8_UINT, 0 }
    },
    {
       { GL_INTENSITY16UI_EXT, 0 },
-      { PIPE_FORMAT_I16_UINT, 0 }
+      { PIPE_FORMAT_I16_UINT, PIPE_FORMAT_R16G16B16A16_UINT, 0 }
    },
    {
       { GL_INTENSITY32UI_EXT, 0 },
-      { PIPE_FORMAT_I32_UINT, 0 }
+      { PIPE_FORMAT_I32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, 0 }
    },
    {
       { GL_LUMINANCE8I_EXT, 0 },
-      { PIPE_FORMAT_L8_SINT, 0 }
+      { PIPE_FORMAT_L8_SINT, PIPE_FORMAT_R8G8B8A8_SINT, 0 }
    },
    {
       { GL_LUMINANCE16I_EXT, 0 },
-      { PIPE_FORMAT_L16_SINT, 0 }
+      { PIPE_FORMAT_L16_SINT, PIPE_FORMAT_R16G16B16A16_SINT, 0 }
    },
    {
       { GL_LUMINANCE32I_EXT, 0 },
-      { PIPE_FORMAT_L32_SINT, 0 }
+      { PIPE_FORMAT_L32_SINT, PIPE_FORMAT_R32G32B32A32_SINT, 0 }
    },
    {
       { GL_LUMINANCE_INTEGER_EXT,
         GL_LUMINANCE8UI_EXT, 0 },
-      { PIPE_FORMAT_L8_UINT, 0 }
+      { PIPE_FORMAT_L8_UINT, PIPE_FORMAT_R8G8B8A8_UINT, 0 }
    },
    {
       { GL_LUMINANCE16UI_EXT, 0 },
-      { PIPE_FORMAT_L16_UINT, 0 }
+      { PIPE_FORMAT_L16_UINT, PIPE_FORMAT_R16G16B16A16_UINT, 0 }
    },
    {
       { GL_LUMINANCE32UI_EXT, 0 },
-      { PIPE_FORMAT_L32_UINT, 0 }
+      { PIPE_FORMAT_L32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, 0 }
    },
    {
       { GL_LUMINANCE_ALPHA_INTEGER_EXT,
         GL_LUMINANCE_ALPHA8I_EXT, 0 },
-      { PIPE_FORMAT_L8A8_SINT, 0 }
+      { PIPE_FORMAT_L8A8_SINT, PIPE_FORMAT_R8G8B8A8_SINT, 0 }
    },
    {
       { GL_LUMINANCE_ALPHA16I_EXT, 0 },
-      { PIPE_FORMAT_L16A16_SINT, 0 }
+      { PIPE_FORMAT_L16A16_SINT, PIPE_FORMAT_R16G16B16A16_SINT, 0 }
    },
    {
       { GL_LUMINANCE_ALPHA32I_EXT, 0 },
-      { PIPE_FORMAT_L32A32_SINT, 0 }
+      { PIPE_FORMAT_L32A32_SINT, PIPE_FORMAT_R32G32B32A32_SINT, 0 }
    },
    {
       { GL_LUMINANCE_ALPHA8UI_EXT, 0 },
-      { PIPE_FORMAT_L8A8_UINT, 0 }
+      { PIPE_FORMAT_L8A8_UINT, PIPE_FORMAT_R8G8B8A8_UINT, 0 }
    },
    {
       { GL_LUMINANCE_ALPHA16UI_EXT, 0 },
-      { PIPE_FORMAT_L16A16_UINT, 0 }
+      { PIPE_FORMAT_L16A16_UINT, PIPE_FORMAT_R16G16B16A16_UINT, 0 }
    },
    {
       { GL_LUMINANCE_ALPHA32UI_EXT, 0 },
-      { PIPE_FORMAT_L32A32_UINT, 0 }
+      { PIPE_FORMAT_L32A32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, 0 }
    },
    {
       { GL_RGB16I_EXT, 0 },
@@ -2373,9 +2373,10 @@
       break;
 
    case GL_NUM_SAMPLE_COUNTS: {
+      int samples[16];
       size_t num_samples;
       num_samples = st_QuerySamplesForFormat(ctx, target, internalFormat,
-                                             params);
+                                             samples);
       params[0] = (GLint) num_samples;
       break;
    }
diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp
index 1d850ef..6b6e3db 100644
--- a/src/mesa/state_tracker/st_glsl_to_nir.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp
@@ -35,7 +35,6 @@
 #include "main/errors.h"
 #include "main/shaderapi.h"
 #include "main/uniforms.h"
-#include "util/string_to_uint_map.h"
 
 #include "st_context.h"
 #include "st_program.h"
@@ -45,6 +44,7 @@
 #include "compiler/glsl_types.h"
 #include "compiler/glsl/glsl_to_nir.h"
 #include "compiler/glsl/ir.h"
+#include "compiler/glsl/string_to_uint_map.h"
 
 
 /* Depending on PIPE_CAP_TGSI_TEXCOORD (st->needs_texcoord_semantic) we
@@ -92,6 +92,9 @@
       }
    }
 
+   /* bit of a hack, mirroring st_translate_vertex_program */
+   input_to_index[VERT_ATTRIB_EDGEFLAG] = num_inputs;
+
    nir->num_inputs = 0;
    nir_foreach_variable_safe(var, &nir->inputs) {
       attr = var->data.location;
@@ -242,6 +245,7 @@
    NIR_PASS_V(nir, nir_split_var_copies);
    NIR_PASS_V(nir, nir_lower_var_copies);
    NIR_PASS_V(nir, st_nir_lower_builtin);
+   NIR_PASS_V(nir, nir_lower_atomics, shader_program);
 
    /* fragment shaders may need : */
    if (stage == MESA_SHADER_FRAGMENT) {
@@ -336,6 +340,8 @@
       nir_assign_var_locations(&nir->outputs,
                                &nir->num_outputs,
                                st_glsl_type_size);
+   } else if (nir->stage == MESA_SHADER_COMPUTE) {
+       /* TODO? */
    } else {
       unreachable("invalid shader type for tgsi bypass\n");
    }
@@ -348,11 +354,17 @@
    case MESA_SHADER_FRAGMENT:
       shader_program = ((struct st_fragment_program *)prog)->shader_program;
       break;
+   case MESA_SHADER_COMPUTE:
+      shader_program = ((struct st_compute_program *)prog)->shader_program;
+      break;
    default:
       assert(!"should not be reached");
       return;
    }
 
+   NIR_PASS_V(nir, nir_lower_atomics_to_ssbo,
+         st->ctx->Const.Program[nir->stage].MaxAtomicBuffers);
+
    st_nir_assign_uniform_locations(prog, shader_program,
                                    &nir->uniforms, &nir->num_uniforms);
 
@@ -427,11 +439,11 @@
     * prog->ParameterValues to get reallocated (e.g., anything that adds a
     * program constant) has to happen before creating this linkage.
     */
-   _mesa_associate_uniform_storage(ctx, shader_program, prog->Parameters,
-                                   true);
+   _mesa_associate_uniform_storage(ctx, shader_program, prog, true);
 
    struct st_vertex_program *stvp;
    struct st_fragment_program *stfp;
+   struct st_compute_program *stcp;
 
    switch (shader->Stage) {
    case MESA_SHADER_VERTEX:
@@ -442,6 +454,10 @@
       stfp = (struct st_fragment_program *)prog;
       stfp->shader_program = shader_program;
       break;
+   case MESA_SHADER_COMPUTE:
+      stcp = (struct st_compute_program *)prog;
+      stcp->shader_program = shader_program;
+      break;
    default:
       assert(!"should not be reached");
       return NULL;
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 96c08a6..54c4e05 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -56,6 +56,7 @@
 #include "st_nir.h"
 #include "st_shader_cache.h"
 
+#include "util/hash_table.h"
 #include <algorithm>
 
 #define PROGRAM_ANY_CONST ((1 << PROGRAM_STATE_VAR) |    \
@@ -86,6 +87,13 @@
    return swizzle;
 }
 
+static unsigned is_precise(const ir_variable *ir)
+{
+   if (!ir)
+      return 0;
+   return ir->data.precise || ir->data.invariant;
+}
+
 /**
  * This struct is a corresponding struct to TGSI ureg_src.
  */
@@ -165,7 +173,7 @@
 
    explicit st_src_reg(st_dst_reg reg);
 
-   int16_t index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
+   int32_t index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
    int16_t index2D;
    uint16_t swizzle; /**< SWIZZLE_XYZWONEZERO swizzles from Mesa. */
    int negate:4; /**< NEGATE_XYZW mask from mesa */
@@ -239,7 +247,7 @@
 
    explicit st_dst_reg(st_src_reg reg);
 
-   int16_t index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
+   int32_t index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
    int16_t index2D;
    gl_register_file file:5; /**< PROGRAM_* from Mesa */
    unsigned writemask:4; /**< Bitfield of WRITEMASK_[XYZW] */
@@ -288,13 +296,14 @@
 
    st_dst_reg dst[2];
    st_src_reg src[4];
-   st_src_reg resource; /**< sampler or buffer register */
+   st_src_reg resource; /**< sampler, image or buffer register */
    st_src_reg *tex_offsets;
 
    /** Pointer to the ir source this tree came from for debugging */
    ir_instruction *ir;
 
    unsigned op:8; /**< TGSI opcode */
+   unsigned precise:1;
    unsigned saturate:1;
    unsigned is_64bit_expanded:1;
    unsigned sampler_base:5;
@@ -310,7 +319,9 @@
    const struct tgsi_opcode_info *info;
 };
 
-class variable_storage : public exec_node {
+class variable_storage {
+   DECLARE_RZALLOC_CXX_OPERATORS(variable_storage)
+
 public:
    variable_storage(ir_variable *var, gl_register_file file, int index,
                     unsigned array_id = 0)
@@ -346,8 +357,8 @@
    int type; /**< GL_DOUBLE, GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
 };
 
-static st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
-static st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
+static const st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
+static const st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
 
 struct inout_decl {
    unsigned mesa_index;
@@ -420,7 +431,6 @@
    uint32_t samplers_used;
    glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
    int sampler_targets[PIPE_MAX_SAMPLERS];   /**< One of TGSI_TEXTURE_* */
-   int buffers_used;
    int images_used;
    int image_targets[PIPE_MAX_SHADER_IMAGES];
    unsigned image_formats[PIPE_MAX_SHADER_IMAGES];
@@ -433,6 +443,7 @@
    bool have_fma;
    bool use_shared_memory;
    bool has_tex_txf_lz;
+   bool precise;
 
    variable_storage *find_variable_storage(ir_variable *var);
 
@@ -445,6 +456,7 @@
    st_src_reg st_src_reg_for_double(double val);
    st_src_reg st_src_reg_for_float(float val);
    st_src_reg st_src_reg_for_int(int val);
+   st_src_reg st_src_reg_for_int64(int64_t val);
    st_src_reg st_src_reg_for_type(enum glsl_base_type type, int val);
 
    /**
@@ -484,11 +496,12 @@
    void visit_membar_intrinsic(ir_call *);
    void visit_shared_intrinsic(ir_call *);
    void visit_image_intrinsic(ir_call *);
+   void visit_generic_intrinsic(ir_call *, unsigned op);
 
    st_src_reg result;
 
    /** List of variable_storage */
-   exec_list variables;
+   struct hash_table *variables;
 
    /** List of immediate_storage */
    exec_list immediates;
@@ -558,6 +571,7 @@
 
    void rename_temp_registers(int num_renames, struct rename_reg_pair *renames);
    void get_first_temp_read(int *first_reads);
+   void get_first_temp_write(int *first_writes);
    void get_last_temp_read_first_temp_write(int *last_reads, int *first_writes);
    void get_last_temp_write(int *last_writes);
 
@@ -687,6 +701,7 @@
    STATIC_ASSERT(TGSI_OPCODE_LAST <= 255);
 
    inst->op = op;
+   inst->precise = this->precise;
    inst->info = tgsi_get_opcode_info(op);
    inst->dst[0] = dst;
    inst->dst[1] = dst1;
@@ -1212,6 +1227,19 @@
 }
 
 st_src_reg
+glsl_to_tgsi_visitor::st_src_reg_for_int64(int64_t val)
+{
+   st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT64);
+   union gl_constant_value uval[2];
+
+   memcpy(uval, &val, sizeof(uval));
+   src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle);
+   src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
+
+   return src;
+}
+
+st_src_reg
 glsl_to_tgsi_visitor::st_src_reg_for_type(enum glsl_base_type type, int val)
 {
    if (native_integers)
@@ -1306,13 +1334,13 @@
 variable_storage *
 glsl_to_tgsi_visitor::find_variable_storage(ir_variable *var)
 {
+   struct hash_entry *entry;
 
-   foreach_in_list(variable_storage, entry, &this->variables) {
-      if (entry->var == var)
-         return entry;
-   }
+   entry = _mesa_hash_table_search(this->variables, var);
+   if (!entry)
+      return NULL;
 
-   return NULL;
+   return (variable_storage *)entry->data;
 }
 
 void
@@ -1345,7 +1373,8 @@
       if (i == ir->get_num_state_slots()) {
          /* We'll set the index later. */
          storage = new(mem_ctx) variable_storage(ir, PROGRAM_STATE_VAR, -1);
-         this->variables.push_tail(storage);
+
+         _mesa_hash_table_insert(this->variables, ir, storage);
 
          dst = undef_dst;
       } else {
@@ -1360,7 +1389,7 @@
          storage = new(mem_ctx) variable_storage(ir, dst.file, dst.index,
                                                  dst.array_id);
 
-         this->variables.push_tail(storage);
+         _mesa_hash_table_insert(this->variables, ir, storage);
       }
 
 
@@ -1526,7 +1555,7 @@
    if (reg->reladdr2) emit_arl(ir, address_reg2, *reg->reladdr2);
 
    if (*num_reladdr != 1) {
-      st_src_reg temp = get_temp(reg->type == GLSL_TYPE_DOUBLE ? glsl_type::dvec4_type : glsl_type::vec4_type);
+      st_src_reg temp = get_temp(glsl_type::get_instance(reg->type, 4, 1));
 
       emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
       *reg = temp;
@@ -1542,7 +1571,7 @@
 
    /* Quick peephole: Emit MAD(a, b, c) instead of ADD(MUL(a, b), c)
     */
-   if (ir->operation == ir_binop_add) {
+   if (!this->precise && ir->operation == ir_binop_add) {
       if (try_emit_mad(ir, 1))
          return;
       if (try_emit_mad(ir, 0))
@@ -2225,7 +2254,7 @@
                                        const_offset % 16 / 4,
                                        const_offset % 16 / 4);
 
-      if (ir->type->base_type == GLSL_TYPE_BOOL) {
+      if (ir->type->is_boolean()) {
          emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
       } else {
          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
@@ -2326,11 +2355,15 @@
    case ir_unop_pack_int_2x32:
    case ir_unop_unpack_uint_2x32:
    case ir_unop_pack_uint_2x32:
+   case ir_unop_unpack_sampler_2x32:
+   case ir_unop_pack_sampler_2x32:
+   case ir_unop_unpack_image_2x32:
+   case ir_unop_pack_image_2x32:
       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
       break;
 
    case ir_binop_ldexp:
-      if (ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE) {
+      if (ir->operands[0]->type->is_double()) {
          emit_asm(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
       } else {
          assert(!"Invalid ldexp for non-double opcode in glsl_to_tgsi_visitor::visit()");
@@ -2360,24 +2393,6 @@
       break;
    }
 
-   case ir_unop_vote_any:
-      emit_asm(ir, TGSI_OPCODE_VOTE_ANY, result_dst, op[0]);
-      break;
-   case ir_unop_vote_all:
-      emit_asm(ir, TGSI_OPCODE_VOTE_ALL, result_dst, op[0]);
-      break;
-   case ir_unop_vote_eq:
-      emit_asm(ir, TGSI_OPCODE_VOTE_EQ, result_dst, op[0]);
-      break;
-   case ir_unop_ballot:
-      emit_asm(ir, TGSI_OPCODE_BALLOT, result_dst, op[0]);
-      break;
-   case ir_unop_read_first_invocation:
-      emit_asm(ir, TGSI_OPCODE_READ_FIRST, result_dst, op[0]);
-      break;
-   case ir_binop_read_invocation:
-      emit_asm(ir, TGSI_OPCODE_READ_INVOC, result_dst, op[0], op[1]);
-      break;
    case ir_unop_u2i64:
    case ir_unop_u2u64:
    case ir_unop_b2i64: {
@@ -2459,7 +2474,7 @@
       break;
    }
    case ir_unop_i642b:
-      emit_asm(ir, TGSI_OPCODE_U64SNE, result_dst, op[0], st_src_reg_for_int(0));
+      emit_asm(ir, TGSI_OPCODE_U64SNE, result_dst, op[0], st_src_reg_for_int64(0));
       break;
    case ir_unop_i642f:
       emit_asm(ir, TGSI_OPCODE_I642F, result_dst, op[0]);
@@ -2616,7 +2631,7 @@
       case ir_var_uniform:
          entry = new(mem_ctx) variable_storage(var, PROGRAM_UNIFORM,
                                                var->data.param_index);
-         this->variables.push_tail(entry);
+         _mesa_hash_table_insert(this->variables, var, entry);
          break;
       case ir_var_shader_in: {
          /* The linker assigns locations for varyings and attributes,
@@ -2663,7 +2678,8 @@
                                                decl->array_id);
          entry->component = component;
 
-         this->variables.push_tail(entry);
+         _mesa_hash_table_insert(this->variables, var, entry);
+
          break;
       }
       case ir_var_shader_out: {
@@ -2721,7 +2737,8 @@
          }
          entry->component = component;
 
-         this->variables.push_tail(entry);
+         _mesa_hash_table_insert(this->variables, var, entry);
+
          break;
       }
       case ir_var_system_value:
@@ -2735,7 +2752,7 @@
 
          entry = new(mem_ctx) variable_storage(var, src.file, src.index,
                                                src.array_id);
-         this->variables.push_tail(entry);
+         _mesa_hash_table_insert(this->variables, var, entry);
 
          break;
       }
@@ -2748,7 +2765,9 @@
 
    this->result = st_src_reg(entry->file, entry->index, var->type,
                              entry->component, entry->array_id);
-   if (this->shader->Stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in && var->type->is_double())
+   if (this->shader->Stage == MESA_SHADER_VERTEX &&
+       var->data.mode == ir_var_shader_in &&
+       var->type->without_array()->is_double())
       this->result.is_double_vertex_input = true;
    if (!native_integers)
       this->result.type = GLSL_TYPE_FLOAT;
@@ -2838,7 +2857,7 @@
    ir->array->accept(this);
    src = this->result;
 
-   if (ir->array->ir_type != ir_type_dereference_array) {
+   if (!src.has_index2) {
       switch (this->prog->Target) {
       case GL_TESS_CONTROL_PROGRAM_NV:
          is_2D = (src.file == PROGRAM_INPUT || src.file == PROGRAM_OUTPUT) &&
@@ -3091,7 +3110,7 @@
                                      st_dst_reg *l, st_src_reg *r,
                                      st_src_reg *cond, bool cond_swap)
 {
-   if (type->base_type == GLSL_TYPE_STRUCT) {
+   if (type->is_record()) {
       for (unsigned int i = 0; i < type->length; i++) {
          emit_block_mov(ir, type->fields.structure[i].type, l, r,
                         cond, cond_swap);
@@ -3124,7 +3143,15 @@
    r->type = type->base_type;
    if (cond) {
       st_src_reg l_src = st_src_reg(*l);
-      l_src.swizzle = swizzle_for_size(type->vector_elements);
+
+      if (l_src.file == PROGRAM_OUTPUT &&
+          this->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
+          (l_src.index == FRAG_RESULT_DEPTH || l_src.index == FRAG_RESULT_STENCIL)) {
+         /* This is a special case because the source swizzles will be shifted
+          * later to account for the difference between GLSL (where they're
+          * plain floats) and TGSI (where they're Z and Y components). */
+         l_src.swizzle = SWIZZLE_XXXX;
+      }
 
       if (native_integers) {
          emit_asm(ir, TGSI_OPCODE_UCMP, *l, *cond,
@@ -3154,6 +3181,8 @@
    st_dst_reg l;
    st_src_reg r;
 
+   /* all generated instructions need to be flagged as precise */
+   this->precise = is_precise(ir->lhs->variable_referenced());
    ir->rhs->accept(this);
    r = this->result;
 
@@ -3245,6 +3274,7 @@
    } else {
       emit_block_mov(ir, ir->rhs->type, &l, &r, NULL, false);
    }
+   this->precise = 0;
 }
 
 
@@ -3264,7 +3294,7 @@
     * aggregate constant and move each constant value into it.  If we
     * get lucky, copy propagation will eliminate the extra moves.
     */
-   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
+   if (ir->type->is_record()) {
       st_src_reg temp_base = get_temp(ir->type);
       st_dst_reg temp = st_dst_reg(temp_base);
 
@@ -3770,6 +3800,59 @@
    }
 }
 
+static void
+get_image_qualifiers(ir_dereference *ir, const glsl_type **type,
+                     bool *memory_coherent, bool *memory_volatile,
+                     bool *memory_restrict, unsigned *image_format)
+{
+
+   switch (ir->ir_type) {
+   case ir_type_dereference_record: {
+      ir_dereference_record *deref_record = ir->as_dereference_record();
+      const glsl_type *struct_type = deref_record->record->type;
+
+      for (unsigned i = 0; i < struct_type->length; i++) {
+         if (!strcmp(struct_type->fields.structure[i].name,
+                     deref_record->field)) {
+            *type = struct_type->fields.structure[i].type->without_array();
+            *memory_coherent =
+               struct_type->fields.structure[i].memory_coherent;
+            *memory_volatile =
+               struct_type->fields.structure[i].memory_volatile;
+            *memory_restrict =
+               struct_type->fields.structure[i].memory_restrict;
+            *image_format =
+               struct_type->fields.structure[i].image_format;
+            break;
+         }
+      }
+      break;
+   }
+
+   case ir_type_dereference_array: {
+      ir_dereference_array *deref_arr = ir->as_dereference_array();
+      get_image_qualifiers((ir_dereference *)deref_arr->array, type,
+                           memory_coherent, memory_volatile, memory_restrict,
+                           image_format);
+      break;
+   }
+
+   case ir_type_dereference_variable: {
+      ir_variable *var = ir->variable_referenced();
+
+      *type = var->type->without_array();
+      *memory_coherent = var->data.memory_coherent;
+      *memory_volatile = var->data.memory_volatile;
+      *memory_restrict = var->data.memory_restrict;
+      *image_format = var->data.image_format;
+      break;
+   }
+
+   default:
+      break;
+   }
+}
+
 void
 glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir)
 {
@@ -3777,15 +3860,21 @@
 
    ir_dereference *img = (ir_dereference *)param;
    const ir_variable *imgvar = img->variable_referenced();
-   const glsl_type *type = imgvar->type->without_array();
    unsigned sampler_array_size = 1, sampler_base = 0;
+   bool memory_coherent = false, memory_volatile = false, memory_restrict = false;
+   unsigned image_format = 0;
+   const glsl_type *type = NULL;
+
+   get_image_qualifiers(img, &type, &memory_coherent, &memory_volatile,
+                        &memory_restrict, &image_format);
 
    st_src_reg reladdr;
    st_src_reg image(PROGRAM_IMAGE, 0, GLSL_TYPE_UINT);
-
+   uint16_t index = 0;
    get_deref_offsets(img, &sampler_array_size, &sampler_base,
-                     (uint16_t*)&image.index, &reladdr, true);
+                     &index, &reladdr, !imgvar->contains_bindless());
 
+   image.index = index;
    if (reladdr.file != PROGRAM_UNDEFINED) {
       image.reladdr = ralloc(mem_ctx, st_src_reg);
       *image.reladdr = reladdr;
@@ -3801,6 +3890,12 @@
 
    glsl_to_tgsi_instruction *inst;
 
+   st_src_reg bindless;
+   if (imgvar->contains_bindless()) {
+      img->accept(this);
+      bindless = this->result;
+   }
+
    if (ir->callee->intrinsic_id == ir_intrinsic_image_size) {
       dst.writemask = WRITEMASK_XYZ;
       inst = emit_asm(ir, TGSI_OPCODE_RESQ, dst);
@@ -3897,23 +3992,53 @@
          inst->dst[0].writemask = WRITEMASK_XYZW;
    }
 
-   inst->resource = image;
-   inst->sampler_array_size = sampler_array_size;
-   inst->sampler_base = sampler_base;
+   if (imgvar->contains_bindless()) {
+      inst->resource = bindless;
+      inst->resource.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
+                                             SWIZZLE_X, SWIZZLE_Y);
+   } else {
+      inst->resource = image;
+      inst->sampler_array_size = sampler_array_size;
+      inst->sampler_base = sampler_base;
+   }
 
    inst->tex_target = type->sampler_index();
    inst->image_format = st_mesa_format_to_pipe_format(st_context(ctx),
-         _mesa_get_shader_image_format(imgvar->data.image_format));
+         _mesa_get_shader_image_format(image_format));
 
-   if (imgvar->data.image_coherent)
+   if (memory_coherent)
       inst->buffer_access |= TGSI_MEMORY_COHERENT;
-   if (imgvar->data.image_restrict)
+   if (memory_restrict)
       inst->buffer_access |= TGSI_MEMORY_RESTRICT;
-   if (imgvar->data.image_volatile)
+   if (memory_volatile)
       inst->buffer_access |= TGSI_MEMORY_VOLATILE;
 }
 
 void
+glsl_to_tgsi_visitor::visit_generic_intrinsic(ir_call *ir, unsigned op)
+{
+   ir->return_deref->accept(this);
+   st_dst_reg dst = st_dst_reg(this->result);
+
+   dst.writemask = u_bit_consecutive(0, ir->return_deref->var->type->vector_elements);
+
+   st_src_reg src[4] = { undef_src, undef_src, undef_src, undef_src };
+   unsigned num_src = 0;
+   foreach_in_list(ir_rvalue, param, &ir->actual_parameters) {
+      assert(num_src < ARRAY_SIZE(src));
+
+      this->result.file = PROGRAM_UNDEFINED;
+      param->accept(this);
+      assert(this->result.file != PROGRAM_UNDEFINED);
+
+      src[num_src] = this->result;
+      num_src++;
+   }
+
+   emit_asm(ir, op, dst, src[0], src[1], src[2], src[3]);
+}
+
+void
 glsl_to_tgsi_visitor::visit(ir_call *ir)
 {
    ir_function_signature *sig = ir->callee;
@@ -3984,15 +4109,28 @@
       visit_image_intrinsic(ir);
       return;
 
-   case ir_intrinsic_shader_clock: {
-      ir->return_deref->accept(this);
-
-      st_dst_reg dst = st_dst_reg(this->result);
-      dst.writemask = TGSI_WRITEMASK_XY;
-
-      emit_asm(ir, TGSI_OPCODE_CLOCK, dst);
+   case ir_intrinsic_shader_clock:
+      visit_generic_intrinsic(ir, TGSI_OPCODE_CLOCK);
       return;
-   }
+
+   case ir_intrinsic_vote_all:
+      visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_ALL);
+      return;
+   case ir_intrinsic_vote_any:
+      visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_ANY);
+      return;
+   case ir_intrinsic_vote_eq:
+      visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_EQ);
+      return;
+   case ir_intrinsic_ballot:
+      visit_generic_intrinsic(ir, TGSI_OPCODE_BALLOT);
+      return;
+   case ir_intrinsic_read_first_invocation:
+      visit_generic_intrinsic(ir, TGSI_OPCODE_READ_FIRST);
+      return;
+   case ir_intrinsic_read_invocation:
+      visit_generic_intrinsic(ir, TGSI_OPCODE_READ_INVOC);
+      return;
 
    case ir_intrinsic_invalid:
    case ir_intrinsic_generic_load:
@@ -4130,8 +4268,8 @@
    unsigned opcode = TGSI_OPCODE_NOP;
    const glsl_type *sampler_type = ir->sampler->type;
    unsigned sampler_array_size = 1, sampler_base = 0;
-   uint16_t sampler_index = 0;
    bool is_cube_array = false, is_cube_shadow = false;
+   ir_variable *var = ir->sampler->variable_referenced();
    unsigned i;
 
    /* if we are a cube array sampler or a cube shadow */
@@ -4246,7 +4384,7 @@
       component = this->result;
       if (ir->offset) {
          ir->offset->accept(this);
-         if (ir->offset->type->base_type == GLSL_TYPE_ARRAY) {
+         if (ir->offset->type->is_array()) {
             const glsl_type *elt_type = ir->offset->type->fields.array;
             for (i = 0; i < ir->offset->type->length; i++) {
                offset[i] = this->result;
@@ -4361,10 +4499,24 @@
       coord_dst.writemask = WRITEMASK_XYZW;
    }
 
+   st_src_reg sampler(PROGRAM_SAMPLER, 0, GLSL_TYPE_UINT);
+
+   uint16_t index = 0;
    get_deref_offsets(ir->sampler, &sampler_array_size, &sampler_base,
-                     &sampler_index, &reladdr, true);
-   if (reladdr.file != PROGRAM_UNDEFINED)
+                     &index, &reladdr, !var->contains_bindless());
+
+   sampler.index = index;
+   if (reladdr.file != PROGRAM_UNDEFINED) {
+      sampler.reladdr = ralloc(mem_ctx, st_src_reg);
+      *sampler.reladdr = reladdr;
       emit_arl(ir, sampler_reladdr, reladdr);
+   }
+
+   st_src_reg bindless;
+   if (var->contains_bindless()) {
+      ir->sampler->accept(this);
+      bindless = this->result;
+   }
 
    if (opcode == TGSI_OPCODE_TXD)
       inst = emit_asm(ir, opcode, result_dst, coord, dx, dy);
@@ -4395,13 +4547,14 @@
    if (ir->shadow_comparator)
       inst->tex_shadow = GL_TRUE;
 
-   inst->resource.index = sampler_index;
-   inst->sampler_array_size = sampler_array_size;
-   inst->sampler_base = sampler_base;
-
-   if (reladdr.file != PROGRAM_UNDEFINED) {
-      inst->resource.reladdr = ralloc(mem_ctx, st_src_reg);
-      memcpy(inst->resource.reladdr, &reladdr, sizeof(reladdr));
+   if (var->contains_bindless()) {
+      inst->resource = bindless;
+      inst->resource.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
+                                             SWIZZLE_X, SWIZZLE_Y);
+   } else {
+      inst->resource = sampler;
+      inst->sampler_array_size = sampler_array_size;
+      inst->sampler_base = sampler_base;
    }
 
    if (ir->offset) {
@@ -4519,7 +4672,6 @@
    num_immediates = 0;
    num_address_regs = 0;
    samplers_used = 0;
-   buffers_used = 0;
    images_used = 0;
    indirect_addr_consts = false;
    wpos_transform_const = -1;
@@ -4528,6 +4680,7 @@
    mem_ctx = ralloc_context(NULL);
    ctx = NULL;
    prog = NULL;
+   precise = 0;
    shader_program = NULL;
    shader = NULL;
    options = NULL;
@@ -4535,10 +4688,19 @@
    have_fma = false;
    use_shared_memory = false;
    has_tex_txf_lz = false;
+   variables = NULL;
+}
+
+static void var_destroy(struct hash_entry *entry)
+{
+   variable_storage *storage = (variable_storage *)entry->data;
+
+   delete storage;
 }
 
 glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
 {
+   _mesa_hash_table_destroy(variables, var_destroy);
    free(array_sizes);
    ralloc_free(mem_ctx);
 }
@@ -4557,7 +4719,6 @@
 count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
 {
    v->samplers_used = 0;
-   v->buffers_used = 0;
    v->images_used = 0;
 
    foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
@@ -4583,12 +4744,9 @@
       if (inst->resource.file != PROGRAM_UNDEFINED && (
                 is_resource_instruction(inst->op) ||
                 inst->op == TGSI_OPCODE_STORE)) {
-         if (inst->resource.file == PROGRAM_BUFFER) {
-            v->buffers_used |= 1 << inst->resource.index;
-         } else if (inst->resource.file == PROGRAM_MEMORY) {
+         if (inst->resource.file == PROGRAM_MEMORY) {
             v->use_shared_memory = true;
-         } else {
-            assert(inst->resource.file == PROGRAM_IMAGE);
+         } else if (inst->resource.file == PROGRAM_IMAGE) {
             for (int i = 0; i < inst->sampler_array_size; i++) {
                unsigned idx = inst->sampler_base + i;
                v->images_used |= 1 << idx;
@@ -4663,7 +4821,7 @@
       /* Give up if we encounter relative addressing or flow control. */
       if (inst->dst[0].reladdr || inst->dst[0].reladdr2 ||
           inst->dst[1].reladdr || inst->dst[1].reladdr2 ||
-          tgsi_get_opcode_info(inst->op)->is_branch ||
+          inst->info->is_branch ||
           inst->op == TGSI_OPCODE_CONT ||
           inst->op == TGSI_OPCODE_END ||
           inst->op == TGSI_OPCODE_RET) {
@@ -4731,6 +4889,12 @@
                   inst->tex_offsets[j].index = renames[k].new_reg;
       }
 
+      if (inst->resource.file == PROGRAM_TEMPORARY) {
+         for (k = 0; k < num_renames; k++)
+            if (inst->resource.index == renames[k].old_reg)
+               inst->resource.index = renames[k].new_reg;
+      }
+
       for (j = 0; j < num_inst_dst_regs(inst); j++) {
          if (inst->dst[j].file == PROGRAM_TEMPORARY)
              for (k = 0; k < num_renames; k++)
@@ -4741,6 +4905,33 @@
 }
 
 void
+glsl_to_tgsi_visitor::get_first_temp_write(int *first_writes)
+{
+   int depth = 0; /* loop depth */
+   int loop_start = -1; /* index of the first active BGNLOOP (if any) */
+   unsigned i = 0, j;
+
+   foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
+      for (j = 0; j < num_inst_dst_regs(inst); j++) {
+         if (inst->dst[j].file == PROGRAM_TEMPORARY) {
+            if (first_writes[inst->dst[j].index] == -1)
+                first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
+         }
+      }
+
+      if (inst->op == TGSI_OPCODE_BGNLOOP) {
+         if(depth++ == 0)
+            loop_start = i;
+      } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
+         if (--depth == 0)
+            loop_start = -1;
+      }
+      assert(depth >= 0);
+      i++;
+   }
+}
+
+void
 glsl_to_tgsi_visitor::get_first_temp_read(int *first_reads)
 {
    int depth = 0; /* loop depth */
@@ -5154,6 +5345,21 @@
                }
             }
          }
+
+         if (inst->resource.file == PROGRAM_TEMPORARY) {
+            int src_chans;
+
+            src_chans  = 1 << GET_SWZ(inst->resource.swizzle, 0);
+            src_chans |= 1 << GET_SWZ(inst->resource.swizzle, 1);
+            src_chans |= 1 << GET_SWZ(inst->resource.swizzle, 2);
+            src_chans |= 1 << GET_SWZ(inst->resource.swizzle, 3);
+
+            for (int c = 0; c < 4; c++) {
+               if (src_chans & (1 << c))
+                  writes[4 * inst->resource.index + c] = NULL;
+            }
+         }
+
          break;
       }
 
@@ -5223,7 +5429,8 @@
 void
 glsl_to_tgsi_visitor::merge_two_dsts(void)
 {
-   foreach_in_list_safe(glsl_to_tgsi_instruction, inst, &this->instructions) {
+   /* We never delete inst, but we may delete its successor. */
+   foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
       glsl_to_tgsi_instruction *inst2;
       bool merged;
       if (num_inst_dst_regs(inst) != 2)
@@ -5270,8 +5477,8 @@
 void
 glsl_to_tgsi_visitor::merge_registers(void)
 {
-   int *last_reads = rzalloc_array(mem_ctx, int, this->next_temp);
-   int *first_writes = rzalloc_array(mem_ctx, int, this->next_temp);
+   int *last_reads = ralloc_array(mem_ctx, int, this->next_temp);
+   int *first_writes = ralloc_array(mem_ctx, int, this->next_temp);
    struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
    int i, j;
    int num_renames = 0;
@@ -5329,16 +5536,17 @@
 {
    int i = 0;
    int new_index = 0;
-   int *first_reads = rzalloc_array(mem_ctx, int, this->next_temp);
+   int *first_writes = ralloc_array(mem_ctx, int, this->next_temp);
    struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
    int num_renames = 0;
-   for (i = 0; i < this->next_temp; i++) {
-      first_reads[i] = -1;
-   }
-   get_first_temp_read(first_reads);
 
    for (i = 0; i < this->next_temp; i++) {
-      if (first_reads[i] < 0) continue;
+      first_writes[i] = -1;
+   }
+   get_first_temp_write(first_writes);
+
+   for (i = 0; i < this->next_temp; i++) {
+      if (first_writes[i] < 0) continue;
       if (i != new_index) {
          renames[num_renames].old_reg = i;
          renames[num_renames].new_reg = new_index;
@@ -5350,7 +5558,7 @@
    rename_temp_registers(num_renames, renames);
    this->next_temp = new_index;
    ralloc_free(renames);
-   ralloc_free(first_reads);
+   ralloc_free(first_writes);
 }
 
 /* ------------------------- TGSI conversion stuff -------------------------- */
@@ -5384,8 +5592,8 @@
    struct inout_decl *output_decls;
    unsigned num_output_decls;
 
-   const GLuint *inputMapping;
-   const GLuint *outputMapping;
+   const ubyte *inputMapping;
+   const ubyte *outputMapping;
 
    unsigned procType;  /**< PIPE_SHADER_VERTEX/FRAGMENT */
 };
@@ -5792,7 +6000,7 @@
    case TGSI_OPCODE_IF:
    case TGSI_OPCODE_UIF:
       assert(num_dst == 0);
-      ureg_insn(ureg, inst->op, NULL, 0, src, num_src);
+      ureg_insn(ureg, inst->op, NULL, 0, src, num_src, inst->precise);
       return;
 
    case TGSI_OPCODE_TEX:
@@ -5810,7 +6018,12 @@
    case TGSI_OPCODE_TXL2:
    case TGSI_OPCODE_TG4:
    case TGSI_OPCODE_LODQ:
-      src[num_src] = t->samplers[inst->resource.index];
+      if (inst->resource.file == PROGRAM_SAMPLER) {
+         src[num_src] = t->samplers[inst->resource.index];
+      } else {
+         /* Bindless samplers. */
+         src[num_src] = translate_src(t, &inst->resource);
+      }
       assert(src[num_src].File != TGSI_FILE_NULL);
       if (inst->resource.reladdr)
          src[num_src] =
@@ -5825,6 +6038,7 @@
                     inst->op,
                     dst, num_dst,
                     tex_target,
+                    st_translate_texture_type(inst->tex_type),
                     texoffsets, inst->tex_offset_num_offset,
                     src, num_src);
       return;
@@ -5849,7 +6063,12 @@
       } else if (inst->resource.file == PROGRAM_BUFFER) {
          src[0] = t->buffers[inst->resource.index];
       } else {
-         src[0] = t->images[inst->resource.index];
+         if (inst->resource.file == PROGRAM_IMAGE) {
+            src[0] = t->images[inst->resource.index];
+         } else {
+            /* Bindless images. */
+            src[0] = translate_src(t, &inst->resource);
+         }
          tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
       }
       if (inst->resource.reladdr)
@@ -5866,7 +6085,12 @@
       } else if (inst->resource.file == PROGRAM_BUFFER) {
          dst[0] = ureg_dst(t->buffers[inst->resource.index]);
       } else {
-         dst[0] = ureg_dst(t->images[inst->resource.index]);
+         if (inst->resource.file == PROGRAM_IMAGE) {
+            dst[0] = ureg_dst(t->images[inst->resource.index]);
+         } else {
+            /* Bindless images. */
+            dst[0] = ureg_dst(translate_src(t, &inst->resource));
+         }
          tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
       }
       dst[0] = ureg_writemask(dst[0], inst->dst[0].writemask);
@@ -5880,14 +6104,14 @@
 
    case TGSI_OPCODE_SCS:
       dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_XY);
-      ureg_insn(ureg, inst->op, dst, num_dst, src, num_src);
+      ureg_insn(ureg, inst->op, dst, num_dst, src, num_src, inst->precise);
       break;
 
    default:
       ureg_insn(ureg,
                 inst->op,
                 dst, num_dst,
-                src, num_src);
+                src, num_src, inst->precise);
       break;
    }
 }
@@ -6120,7 +6344,7 @@
       return mapping[a.mesa_index] < mapping[b.mesa_index];
    }
 
-   const GLuint *mapping;
+   const ubyte *mapping;
 };
 
 /* Sort the given array of decls by the corresponding slot (TGSI file index).
@@ -6131,7 +6355,7 @@
 static void
 sort_inout_decls_by_slot(struct inout_decl *decls,
                          unsigned count,
-                         const GLuint mapping[])
+                         const ubyte mapping[])
 {
    sort_inout_decls sorter;
    sorter.mapping = mapping;
@@ -6185,14 +6409,13 @@
    glsl_to_tgsi_visitor *program,
    const struct gl_program *proginfo,
    GLuint numInputs,
-   const GLuint inputMapping[],
-   const GLuint inputSlotToAttr[],
+   const ubyte inputMapping[],
+   const ubyte inputSlotToAttr[],
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
-   const GLuint interpMode[],
+   const ubyte interpMode[],
    GLuint numOutputs,
-   const GLuint outputMapping[],
-   const GLuint outputSlotToAttr[],
+   const ubyte outputMapping[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[])
 {
@@ -6330,9 +6553,14 @@
    }
 
    if (procType == PIPE_SHADER_FRAGMENT) {
-      if (program->shader->Program->info.fs.early_fragment_tests)
+      if (program->shader->Program->info.fs.early_fragment_tests ||
+          program->shader->Program->info.fs.post_depth_coverage) {
          ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1);
 
+         if (program->shader->Program->info.fs.post_depth_coverage)
+            ureg_property(ureg, TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE, 1);
+      }
+
       if (proginfo->info.inputs_read & VARYING_BIT_POS) {
           /* Must do this after setting up t->inputs. */
           emit_wpos(st_context(ctx), t, proginfo, ureg,
@@ -6519,39 +6747,29 @@
    /* texture samplers */
    for (i = 0; i < frag_const->MaxTextureImageUnits; i++) {
       if (program->samplers_used & (1u << i)) {
-         unsigned type;
+         unsigned type = st_translate_texture_type(program->sampler_types[i]);
 
          t->samplers[i] = ureg_DECL_sampler(ureg, i);
 
-         switch (program->sampler_types[i]) {
-         case GLSL_TYPE_INT:
-            type = TGSI_RETURN_TYPE_SINT;
-            break;
-         case GLSL_TYPE_UINT:
-            type = TGSI_RETURN_TYPE_UINT;
-            break;
-         case GLSL_TYPE_FLOAT:
-            type = TGSI_RETURN_TYPE_FLOAT;
-            break;
-         default:
-            unreachable("not reached");
-         }
-
          ureg_DECL_sampler_view( ureg, i, program->sampler_targets[i],
                                  type, type, type, type );
       }
    }
 
-   for (i = 0; i < frag_const->MaxAtomicBuffers; i++) {
-      if (program->buffers_used & (1 << i)) {
-         t->buffers[i] = ureg_DECL_buffer(ureg, i, true);
-      }
-   }
+   /* Declare atomic and shader storage buffers. */
+   {
+      struct gl_program *prog = program->prog;
 
-   for (; i < frag_const->MaxAtomicBuffers + frag_const->MaxShaderStorageBlocks;
-        i++) {
-      if (program->buffers_used & (1 << i)) {
-         t->buffers[i] = ureg_DECL_buffer(ureg, i, false);
+      for (i = 0; i < prog->info.num_abos; i++) {
+         unsigned index = prog->sh.AtomicBuffers[i]->Binding;
+         assert(index < frag_const->MaxAtomicBuffers);
+         t->buffers[index] = ureg_DECL_buffer(ureg, index, true);
+      }
+
+      assert(prog->info.num_ssbos <= frag_const->MaxShaderStorageBlocks);
+      for (i = 0; i < prog->info.num_ssbos; i++) {
+         unsigned index = frag_const->MaxAtomicBuffers + i;
+         t->buffers[index] = ureg_DECL_buffer(ureg, index, false);
       }
    }
 
@@ -6581,27 +6799,8 @@
 
       for (i = program->shader->Stage+1; i <= MESA_SHADER_FRAGMENT; i++) {
          if (program->shader_program->_LinkedShaders[i]) {
-            unsigned next;
-
-            switch (i) {
-            case MESA_SHADER_TESS_CTRL:
-               next = PIPE_SHADER_TESS_CTRL;
-               break;
-            case MESA_SHADER_TESS_EVAL:
-               next = PIPE_SHADER_TESS_EVAL;
-               break;
-            case MESA_SHADER_GEOMETRY:
-               next = PIPE_SHADER_GEOMETRY;
-               break;
-            case MESA_SHADER_FRAGMENT:
-               next = PIPE_SHADER_FRAGMENT;
-               break;
-            default:
-               assert(0);
-               continue;
-            }
-
-            ureg_set_next_shader_processor(ureg, next);
+            ureg_set_next_shader_processor(
+                  ureg, pipe_shader_type_from_mesa((gl_shader_stage)i));
             break;
          }
       }
@@ -6639,6 +6838,7 @@
          &ctx->Const.ShaderCompilerOptions[shader->Stage];
    struct pipe_screen *pscreen = ctx->st->pipe->screen;
    enum pipe_shader_type ptarget = st_shader_stage_to_ptarget(shader->Stage);
+   unsigned skip_merge_registers;
 
    validate_ir_tree(shader->ir);
 
@@ -6661,6 +6861,12 @@
    v->has_tex_txf_lz = pscreen->get_param(pscreen,
                                           PIPE_CAP_TGSI_TEX_TXF_LZ);
 
+   v->variables = _mesa_hash_table_create(v->mem_ctx, _mesa_hash_pointer,
+                                          _mesa_key_pointer_equal);
+   skip_merge_registers =
+      pscreen->get_shader_param(pscreen, ptarget,
+                                PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS);
+
    _mesa_generate_parameters_list_for_uniforms(shader_program, shader,
                                                prog->Parameters);
 
@@ -6676,10 +6882,10 @@
     * optimization passes. */
    {
       int i;
-      int *first_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
-      int *first_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
-      int *last_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
-      int *last_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
+      int *first_writes = ralloc_array(v->mem_ctx, int, v->next_temp);
+      int *first_reads = ralloc_array(v->mem_ctx, int, v->next_temp);
+      int *last_writes = ralloc_array(v->mem_ctx, int, v->next_temp);
+      int *last_reads = ralloc_array(v->mem_ctx, int, v->next_temp);
 
       for (i = 0; i < v->next_temp; i++) {
          first_writes[i] = -1;
@@ -6712,7 +6918,8 @@
    while (v->eliminate_dead_code());
 
    v->merge_two_dsts();
-   v->merge_registers();
+   if (!skip_merge_registers)
+      v->merge_registers();
    v->renumber_registers();
 
    /* Write the END instruction. */
@@ -6764,8 +6971,7 @@
     * prog->ParameterValues to get reallocated (e.g., anything that adds a
     * program constant) has to happen before creating this linkage.
     */
-   _mesa_associate_uniform_storage(ctx, shader_program, prog->Parameters,
-                                   true);
+   _mesa_associate_uniform_storage(ctx, shader_program, prog, true);
    if (!shader_program->data->LinkStatus) {
       free_glsl_to_tgsi_visitor(v);
       _mesa_reference_program(ctx, &shader->Program, NULL);
@@ -6774,9 +6980,7 @@
 
    struct st_vertex_program *stvp;
    struct st_fragment_program *stfp;
-   struct st_geometry_program *stgp;
-   struct st_tessctrl_program *sttcp;
-   struct st_tesseval_program *sttep;
+   struct st_common_program *stp;
    struct st_compute_program *stcp;
 
    switch (shader->Stage) {
@@ -6788,17 +6992,11 @@
       stfp = (struct st_fragment_program *)prog;
       stfp->glsl_to_tgsi = v;
       break;
-   case MESA_SHADER_GEOMETRY:
-      stgp = (struct st_geometry_program *)prog;
-      stgp->glsl_to_tgsi = v;
-      break;
    case MESA_SHADER_TESS_CTRL:
-      sttcp = (struct st_tessctrl_program *)prog;
-      sttcp->glsl_to_tgsi = v;
-      break;
    case MESA_SHADER_TESS_EVAL:
-      sttep = (struct st_tesseval_program *)prog;
-      sttep->glsl_to_tgsi = v;
+   case MESA_SHADER_GEOMETRY:
+      stp = st_common_program(prog);
+      stp->glsl_to_tgsi = v;
       break;
    case MESA_SHADER_COMPUTE:
       stcp = (struct st_compute_program *)prog;
@@ -7014,10 +7212,11 @@
 
       struct gl_program *linked_prog = NULL;
       if (preferred_ir == PIPE_SHADER_IR_NIR) {
-         /* TODO only for GLSL VS/FS for now: */
+         /* TODO only for GLSL VS/FS/CS for now: */
          switch (shader->Stage) {
          case MESA_SHADER_VERTEX:
          case MESA_SHADER_FRAGMENT:
+         case MESA_SHADER_COMPUTE:
             linked_prog = st_nir_get_mesa_program(ctx, prog, shader);
          default:
             break;
@@ -7042,7 +7241,7 @@
 
 void
 st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
-                                const GLuint outputMapping[],
+                                const ubyte outputMapping[],
                                 struct pipe_stream_output_info *so)
 {
    if (!glsl_to_tgsi->shader_program->last_vert_prog)
@@ -7055,7 +7254,7 @@
 
 void
 st_translate_stream_output_info2(struct gl_transform_feedback_info *info,
-                                const GLuint outputMapping[],
+                                const ubyte outputMapping[],
                                 struct pipe_stream_output_info *so)
 {
    unsigned i;
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h
index add534c..37dfdb3 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.h
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h
@@ -42,14 +42,13 @@
    struct glsl_to_tgsi_visitor *program,
    const struct gl_program *proginfo,
    GLuint numInputs,
-   const GLuint inputMapping[],
-   const GLuint inputSlotToAttr[],
+   const ubyte inputMapping[],
+   const ubyte inputSlotToAttr[],
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
-   const GLuint interpMode[],
+   const ubyte interpMode[],
    GLuint numOutputs,
-   const GLuint outputMapping[],
-   const GLuint outputSlotToAttr[],
+   const ubyte outputMapping[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[]);
 
@@ -59,12 +58,12 @@
 
 void
 st_translate_stream_output_info(struct glsl_to_tgsi_visitor *glsl_to_tgsi,
-                                const GLuint outputMapping[],
+                                const ubyte outputMapping[],
                                 struct pipe_stream_output_info *so);
 
 void
 st_translate_stream_output_info2(struct gl_transform_feedback_info *info,
-                                const GLuint outputMapping[],
+                                const ubyte outputMapping[],
                                 struct pipe_stream_output_info *so);
 
 unsigned
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index dec2c28..011c05f 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -38,6 +38,7 @@
 #include "main/fbobject.h"
 #include "main/renderbuffer.h"
 #include "main/version.h"
+#include "util/hash_table.h"
 #include "st_texture.h"
 
 #include "st_context.h"
@@ -57,7 +58,16 @@
 #include "util/u_inlines.h"
 #include "util/u_atomic.h"
 #include "util/u_surface.h"
+#include "util/list.h"
 
+struct hash_table;
+struct st_manager_private
+{
+   struct hash_table *stfbi_ht; /* framebuffer iface objects hash table */
+   mtx_t st_mutex;
+};
+
+static void st_manager_destroy(struct st_manager *);
 
 /**
  * Map an attachment to a buffer index.
@@ -180,11 +190,13 @@
    if (stfb->iface_stamp == new_stamp)
       return;
 
+   memset(textures, 0, stfb->num_statts * sizeof(textures[0]));
+
    /* validate the fb */
    do {
       if (!stfb->iface->validate(&st->iface, stfb->iface, stfb->statts,
-				 stfb->num_statts, textures))
-	 return;
+                                 stfb->num_statts, textures))
+         return;
 
       stfb->iface_stamp = new_stamp;
       new_stamp = p_atomic_read(&stfb->iface->stamp);
@@ -217,7 +229,12 @@
       u_surface_default_template(&surf_tmpl, textures[i]);
       ps = st->pipe->create_surface(st->pipe, textures[i], &surf_tmpl);
       if (ps) {
-         pipe_surface_reference(&strb->surface, ps);
+         struct pipe_surface **psurf =
+            util_format_is_srgb(ps->format) ? &strb->surface_srgb :
+                                              &strb->surface_linear;
+
+         pipe_surface_reference(psurf, ps);
+         strb->surface = *psurf;
          pipe_resource_reference(&strb->texture, ps->texture);
          /* ownership transfered */
          pipe_surface_reference(&ps, NULL);
@@ -308,21 +325,21 @@
       return FALSE;
 
    if (idx != BUFFER_DEPTH) {
-      _mesa_add_renderbuffer_without_ref(&stfb->Base, idx, rb);
+      _mesa_attach_and_own_rb(&stfb->Base, idx, rb);
       return TRUE;
    }
 
    bool rb_ownership_taken = false;
    if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_ZS, 0)) {
-      _mesa_add_renderbuffer_without_ref(&stfb->Base, BUFFER_DEPTH, rb);
+      _mesa_attach_and_own_rb(&stfb->Base, BUFFER_DEPTH, rb);
       rb_ownership_taken = true;
    }
 
    if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_ZS, 1)) {
       if (rb_ownership_taken)
-         _mesa_add_renderbuffer(&stfb->Base, BUFFER_STENCIL, rb);
+         _mesa_attach_and_reference_rb(&stfb->Base, BUFFER_STENCIL, rb);
       else
-         _mesa_add_renderbuffer_without_ref(&stfb->Base, BUFFER_STENCIL, rb);
+         _mesa_attach_and_own_rb(&stfb->Base, BUFFER_STENCIL, rb);
    }
 
    return TRUE;
@@ -454,6 +471,7 @@
    _mesa_initialize_window_framebuffer(&stfb->Base, &mode);
 
    stfb->iface = stfbi;
+   stfb->iface_ID = stfbi->ID;
    stfb->iface_stamp = p_atomic_read(&stfbi->stamp) - 1;
 
    /* add the color buffer */
@@ -475,7 +493,7 @@
 /**
  * Reference a framebuffer.
  */
-static void
+void
 st_framebuffer_reference(struct st_framebuffer **ptr,
                          struct st_framebuffer *stfb)
 {
@@ -483,6 +501,128 @@
    _mesa_reference_framebuffer((struct gl_framebuffer **) ptr, fb);
 }
 
+
+static uint32_t
+st_framebuffer_iface_hash(const void *key)
+{
+   return (uintptr_t)key;
+}
+
+
+static bool
+st_framebuffer_iface_equal(const void *a, const void *b)
+{
+   return (struct st_framebuffer_iface *)a == (struct st_framebuffer_iface *)b;
+}
+
+
+static boolean
+st_framebuffer_iface_lookup(struct st_manager *smapi,
+                            const struct st_framebuffer_iface *stfbi)
+{
+   struct st_manager_private *smPriv =
+      (struct st_manager_private *)smapi->st_manager_private;
+   struct hash_entry *entry;
+
+   assert(smPriv);
+   assert(smPriv->stfbi_ht);
+
+   mtx_lock(&smPriv->st_mutex);
+   entry = _mesa_hash_table_search(smPriv->stfbi_ht, stfbi);
+   mtx_unlock(&smPriv->st_mutex);
+
+   return entry != NULL;
+}
+
+
+static boolean
+st_framebuffer_iface_insert(struct st_manager *smapi,
+                            struct st_framebuffer_iface *stfbi)
+{
+   struct st_manager_private *smPriv =
+      (struct st_manager_private *)smapi->st_manager_private;
+   struct hash_entry *entry;
+
+   assert(smPriv);
+   assert(smPriv->stfbi_ht);
+
+   mtx_lock(&smPriv->st_mutex);
+   entry = _mesa_hash_table_insert(smPriv->stfbi_ht, stfbi, stfbi);
+   mtx_unlock(&smPriv->st_mutex);
+
+   return entry != NULL;
+}
+
+
+static void
+st_framebuffer_iface_remove(struct st_manager *smapi,
+                            struct st_framebuffer_iface *stfbi)
+{
+   struct st_manager_private *smPriv =
+      (struct st_manager_private *)smapi->st_manager_private;
+   struct hash_entry *entry;
+
+   if (!smPriv || !smPriv->stfbi_ht)
+      return;
+
+   mtx_lock(&smPriv->st_mutex);
+   entry = _mesa_hash_table_search(smPriv->stfbi_ht, stfbi);
+   if (!entry)
+      goto unlock;
+
+   _mesa_hash_table_remove(smPriv->stfbi_ht, entry);
+
+unlock:
+   mtx_unlock(&smPriv->st_mutex);
+}
+
+
+/**
+ * The framebuffer interface object is no longer valid.
+ * Remove the object from the framebuffer interface hash table.
+ */
+static void
+st_api_destroy_drawable(struct st_api *stapi,
+                        struct st_framebuffer_iface *stfbi)
+{
+   if (!stfbi)
+      return;
+
+   st_framebuffer_iface_remove(stfbi->state_manager, stfbi);
+}
+
+
+/**
+ * Purge the winsys buffers list to remove any references to
+ * non-existing framebuffer interface objects.
+ */
+static void
+st_framebuffers_purge(struct st_context *st)
+{
+   struct st_context_iface *st_iface = &st->iface;
+   struct st_manager *smapi = st_iface->state_manager;
+   struct st_framebuffer *stfb, *next;
+
+   assert(smapi);
+
+   LIST_FOR_EACH_ENTRY_SAFE_REV(stfb, next, &st->winsys_buffers, head) {
+      struct st_framebuffer_iface *stfbi = stfb->iface;
+
+      assert(stfbi);
+
+      /**
+       * If the corresponding framebuffer interface object no longer exists,
+       * remove the framebuffer object from the context's winsys buffers list,
+       * and unreference the framebuffer object, so its resources can be
+       * deleted.
+       */
+      if (!st_framebuffer_iface_lookup(smapi, stfbi)) {
+         LIST_DEL(&stfb->head);
+         st_framebuffer_reference(&stfb, NULL);
+      }
+   }
+}
+
 static void
 st_context_flush(struct st_context_iface *stctxi, unsigned flags,
                  struct pipe_fence_handle **fence)
@@ -496,7 +636,7 @@
 
    st_flush(st, fence, pipe_flags);
 
-   if ((flags & ST_FLUSH_WAIT) && fence) {
+   if ((flags & ST_FLUSH_WAIT) && fence && *fence) {
       st->pipe->screen->fence_finish(st->pipe->screen, NULL, *fence,
                                      PIPE_TIMEOUT_INFINITE);
       st->pipe->screen->fence_reference(st->pipe->screen, fence, NULL);
@@ -504,6 +644,16 @@
 
    if (flags & ST_FLUSH_FRONT)
       st_manager_flush_frontbuffer(st);
+
+   /* DRI3 changes the framebuffer after SwapBuffers, but we need to invoke
+    * st_manager_validate_framebuffers to notice that.
+    *
+    * Set gfx_shaders_may_be_dirty to invoke st_validate_state in the next
+    * draw call, which will invoke st_manager_validate_framebuffers, but it
+    * won't dirty states if there is no change.
+    */
+   if (flags & ST_FLUSH_END_OF_FRAME)
+      st->gfx_shaders_may_be_dirty = true;
 }
 
 static boolean
@@ -587,9 +737,11 @@
    pipe_resource_reference(&stImage->pt, tex);
    stObj->surface_format = pipe_format;
 
+   stObj->needs_validation = true;
+
    _mesa_dirty_texobj(ctx, texObj);
    _mesa_unlock_texture(ctx, texObj);
-   
+
    return TRUE;
 }
 
@@ -647,7 +799,8 @@
    struct pipe_context *pipe;
    struct gl_config mode;
    gl_api api;
-   unsigned ctx_flags = 0;
+   bool no_error = false;
+   unsigned ctx_flags = PIPE_CONTEXT_PREFER_THREADED;
 
    if (!(stapi->profile_mask & (1 << attribs->profile)))
       return NULL;
@@ -670,9 +823,27 @@
       return NULL;
    }
 
+   /* Create a hash table for the framebuffer interface objects
+    * if it has not been created for this st manager.
+    */
+   if (smapi->st_manager_private == NULL) {
+      struct st_manager_private *smPriv;
+
+      smPriv = CALLOC_STRUCT(st_manager_private);
+      mtx_init(&smPriv->st_mutex, mtx_plain);
+      smPriv->stfbi_ht = _mesa_hash_table_create(NULL,
+                                                 st_framebuffer_iface_hash,
+                                                 st_framebuffer_iface_equal);
+      smapi->st_manager_private = smPriv;
+      smapi->destroy = st_manager_destroy;
+   }
+
    if (attribs->flags & ST_CONTEXT_FLAG_ROBUST_ACCESS)
       ctx_flags |= PIPE_CONTEXT_ROBUST_BUFFER_ACCESS;
 
+   if (attribs->flags & ST_CONTEXT_FLAG_NO_ERROR)
+      no_error = true;
+
    pipe = smapi->screen->context_create(smapi->screen, NULL, ctx_flags);
    if (!pipe) {
       *error = ST_CONTEXT_ERROR_NO_MEMORY;
@@ -680,7 +851,7 @@
    }
 
    st_visual_to_context_mode(&attribs->visual, &mode);
-   st = st_create_context(api, pipe, &mode, shared_ctx, &attribs->options);
+   st = st_create_context(api, pipe, &mode, shared_ctx, &attribs->options, no_error);
    if (!st) {
       *error = ST_CONTEXT_ERROR_NO_MEMORY;
       pipe->destroy(pipe);
@@ -716,7 +887,7 @@
       /* Is the actual version less than the requested version?
        */
       if (st->ctx->Version < attribs->major * 10U + attribs->minor) {
-	 *error = ST_CONTEXT_ERROR_BAD_VERSION;
+         *error = ST_CONTEXT_ERROR_BAD_VERSION;
          st_destroy_context(st);
          return NULL;
       }
@@ -735,6 +906,7 @@
    st->iface.st_context_private = (void *) smapi;
    st->iface.cso_context = st->cso_context;
    st->iface.pipe = st->pipe;
+   st->iface.state_manager = smapi;
 
    *error = ST_CONTEXT_SUCCESS;
    return &st->iface;
@@ -754,17 +926,39 @@
                                struct gl_framebuffer *fb,
                                struct st_framebuffer_iface *stfbi)
 {
-   struct st_framebuffer *cur = st_ws_framebuffer(fb), *stfb = NULL;
+   struct st_framebuffer *cur = NULL, *stfb = NULL;
 
-   /* dummy framebuffers cant be used as st_framebuffer */
-   if (cur && &cur->Base != _mesa_get_incomplete_framebuffer() &&
-       cur->iface == stfbi) {
-      /* reuse the current stfb */
-      st_framebuffer_reference(&stfb, cur);
+   if (!stfbi)
+	return NULL;
+
+   /* Check if there is already a framebuffer object for the specified
+    * framebuffer interface in this context. If there is one, use it.
+    */
+   LIST_FOR_EACH_ENTRY(cur, &st->winsys_buffers, head) {
+      if (cur->iface_ID == stfbi->ID) {
+         st_framebuffer_reference(&stfb, cur);
+         break;
+      }
    }
-   else {
-      /* create a new one */
-      stfb = st_framebuffer_create(st, stfbi);
+
+   /* If there is not already a framebuffer object, create one */
+   if (stfb == NULL) {
+      cur = st_framebuffer_create(st, stfbi);
+
+      if (cur) {
+         /* add the referenced framebuffer interface object to
+          * the framebuffer interface object hash table.
+          */
+         if (!st_framebuffer_iface_insert(stfbi->state_manager, stfbi)) {
+            st_framebuffer_reference(&cur, NULL);
+            return NULL;
+         }
+
+         /* add to the context's winsys buffers list */
+         LIST_ADD(&cur->head, &st->winsys_buffers);
+
+         st_framebuffer_reference(&stfb, cur);
+      }
    }
 
    return stfb;
@@ -815,6 +1009,11 @@
 
       st_framebuffer_reference(&stdraw, NULL);
       st_framebuffer_reference(&stread, NULL);
+
+      /* Purge the context's winsys_buffers list in case any
+       * of the referenced drawables no longer exist.
+       */
+      st_framebuffers_purge(st);
    }
    else {
       ret = _mesa_make_current(NULL, NULL, NULL);
@@ -903,16 +1102,30 @@
     * new renderbuffer. It might be that there is a window system
     * renderbuffer available.
     */
-   if(stfb->iface)
+   if (stfb->iface)
       stfb->iface_stamp = p_atomic_read(&stfb->iface->stamp) - 1;
 
-   st_invalidate_state(st->ctx, _NEW_BUFFERS);
+   st_invalidate_buffers(st);
 
    return TRUE;
 }
 
-static unsigned get_version(struct pipe_screen *screen,
-                            struct st_config_options *options, gl_api api)
+static void
+st_manager_destroy(struct st_manager *smapi)
+{
+   struct st_manager_private *smPriv = smapi->st_manager_private;
+
+   if (smPriv && smPriv->stfbi_ht) {
+      _mesa_hash_table_destroy(smPriv->stfbi_ht, NULL);
+      mtx_destroy(&smPriv->st_mutex);
+      free(smPriv);
+      smapi->st_manager_private = NULL;
+   }
+}
+
+static unsigned
+get_version(struct pipe_screen *screen,
+            struct st_config_options *options, gl_api api)
 {
    struct gl_constants consts = {0};
    struct gl_extensions extensions = {0};
@@ -959,6 +1172,7 @@
    .create_context = st_api_create_context,
    .make_current = st_api_make_current,
    .get_current = st_api_get_current,
+   .destroy_drawable = st_api_destroy_drawable,
 };
 
 struct st_api *
diff --git a/src/mesa/state_tracker/st_manager.h b/src/mesa/state_tracker/st_manager.h
index 65874b0..c54f29e 100644
--- a/src/mesa/state_tracker/st_manager.h
+++ b/src/mesa/state_tracker/st_manager.h
@@ -33,6 +33,8 @@
 #include "pipe/p_compiler.h"
 
 struct st_context;
+struct st_framebuffer;
+struct st_framebuffer_interface;
 
 void
 st_manager_flush_frontbuffer(struct st_context *st);
@@ -44,4 +46,11 @@
 st_manager_add_color_renderbuffer(struct st_context *st, struct gl_framebuffer *fb,
                                   gl_buffer_index idx);
 
+void
+st_framebuffer_reference(struct st_framebuffer **ptr,
+                         struct st_framebuffer *stfb);
+
+void
+st_framebuffer_interface_destroy(struct st_framebuffer_interface *stfbi);
+
 #endif /* ST_MANAGER_H */
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index f906fed..f6eb5ef 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -65,8 +65,8 @@
    struct ureg_src samplers[PIPE_MAX_SAMPLERS];
    struct ureg_src systemValues[SYSTEM_VALUE_MAX];
 
-   const GLuint *inputMapping;
-   const GLuint *outputMapping;
+   const ubyte *inputMapping;
+   const ubyte *outputMapping;
 
    unsigned procType;  /**< PIPE_SHADER_VERTEX/FRAGMENT */
 };
@@ -223,6 +223,26 @@
 
 
 /**
+ * Map GLSL base type to TGSI return type.
+ */
+unsigned
+st_translate_texture_type(enum glsl_base_type type)
+{
+	switch (type) {
+	case GLSL_TYPE_INT:
+		return TGSI_RETURN_TYPE_SINT;
+	case GLSL_TYPE_UINT:
+		return TGSI_RETURN_TYPE_UINT;
+	case GLSL_TYPE_FLOAT:
+		return TGSI_RETURN_TYPE_FLOAT;
+	default:
+		assert(!"unexpected texture type");
+		return TGSI_RETURN_TYPE_UNKNOWN;
+	}
+}
+
+
+/**
  * Translate a (1 << TEXTURE_x_INDEX) bit into a TGSI_TEXTURE_x enum.
  */
 static unsigned
@@ -536,6 +556,7 @@
                      dst, num_dst, 
                      st_translate_texture_target( inst->TexSrcTarget,
                                                inst->TexShadow ),
+                     TGSI_RETURN_TYPE_FLOAT,
                      NULL, 0,
                      src, num_src );
       return;
@@ -545,7 +566,7 @@
       ureg_insn( ureg, 
                  translate_opcode( inst->Opcode ), 
                  dst, num_dst, 
-                 src, num_src );
+                 src, num_src, 0 );
       break;
 
    case OPCODE_XPD:
@@ -553,7 +574,7 @@
       ureg_insn( ureg, 
                  translate_opcode( inst->Opcode ), 
                  dst, num_dst, 
-                 src, num_src );
+                 src, num_src, 0 );
       break;
 
    case OPCODE_RSQ:
@@ -572,7 +593,7 @@
       ureg_insn( ureg, 
                  translate_opcode( inst->Opcode ), 
                  dst, num_dst, 
-                 src, num_src );
+                 src, num_src, 0);
       break;
    }
 }
@@ -799,12 +820,12 @@
    struct ureg_program *ureg,
    const struct gl_program *program,
    GLuint numInputs,
-   const GLuint inputMapping[],
+   const ubyte inputMapping[],
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
-   const GLuint interpMode[],
+   const ubyte interpMode[],
    GLuint numOutputs,
-   const GLuint outputMapping[],
+   const ubyte outputMapping[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[])
 {
@@ -917,42 +938,41 @@
 
    /* Declare misc input registers
     */
-   {
-      GLbitfield sysInputs = program->info.system_values_read;
+   GLbitfield sysInputs = program->info.system_values_read;
+   for (i = 0; sysInputs; i++) {
+      if (sysInputs & (1u << i)) {
+         unsigned semName = _mesa_sysval_to_semantic(i);
 
-      for (i = 0; sysInputs; i++) {
-         if (sysInputs & (1 << i)) {
-            unsigned semName = _mesa_sysval_to_semantic(i);
+         t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
 
-            t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
-
-            if (semName == TGSI_SEMANTIC_INSTANCEID ||
-                semName == TGSI_SEMANTIC_VERTEXID) {
-               /* From Gallium perspective, these system values are always
-                * integer, and require native integer support.  However, if
-                * native integer is supported on the vertex stage but not the
-                * pixel stage (e.g, i915g + draw), Mesa will generate IR that
-                * assumes these system values are floats. To resolve the
-                * inconsistency, we insert a U2F.
-                */
-               struct st_context *st = st_context(ctx);
-               struct pipe_screen *pscreen = st->pipe->screen;
-               assert(procType == PIPE_SHADER_VERTEX);
-               assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS));
-               (void) pscreen;  /* silence non-debug build warnings */
-               if (!ctx->Const.NativeIntegers) {
-                  struct ureg_dst temp = ureg_DECL_local_temporary(t->ureg);
-                  ureg_U2F( t->ureg, ureg_writemask(temp, TGSI_WRITEMASK_X), t->systemValues[i]);
-                  t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
-               }
+         if (semName == TGSI_SEMANTIC_INSTANCEID ||
+             semName == TGSI_SEMANTIC_VERTEXID) {
+            /* From Gallium perspective, these system values are always
+             * integer, and require native integer support.  However, if
+             * native integer is supported on the vertex stage but not the
+             * pixel stage (e.g., i915g + draw), Mesa will generate IR that
+             * assumes these system values are floats. To resolve the
+             * inconsistency, we insert a U2F.
+             */
+            struct st_context *st = st_context(ctx);
+            struct pipe_screen *pscreen = st->pipe->screen;
+            assert(procType == PIPE_SHADER_VERTEX);
+            assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX,
+                                             PIPE_SHADER_CAP_INTEGERS));
+            (void) pscreen;  /* silence non-debug build warnings */
+            if (!ctx->Const.NativeIntegers) {
+               struct ureg_dst temp = ureg_DECL_local_temporary(t->ureg);
+               ureg_U2F(t->ureg, ureg_writemask(temp, TGSI_WRITEMASK_X),
+                        t->systemValues[i]);
+               t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
             }
-
-            if (procType == PIPE_SHADER_FRAGMENT &&
-                semName == TGSI_SEMANTIC_POSITION)
-               emit_wpos(st_context(ctx), t, program, ureg);
-
-            sysInputs &= ~(1 << i);
          }
+
+         if (procType == PIPE_SHADER_FRAGMENT &&
+             semName == TGSI_SEMANTIC_POSITION)
+            emit_wpos(st_context(ctx), t, program, ureg);
+
+         sysInputs &= ~(1u << i);
       }
    }
 
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.h b/src/mesa/state_tracker/st_mesa_to_tgsi.h
index 3df54ce..106cf85 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.h
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.h
@@ -34,6 +34,8 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_defines.h"
 
+#include "compiler/glsl_types.h"
+
 #if defined __cplusplus
 extern "C" {
 #endif
@@ -51,18 +53,20 @@
    struct ureg_program *ureg,
    const struct gl_program *program,
    GLuint numInputs,
-   const GLuint inputMapping[],
+   const ubyte inputMapping[],
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
-   const GLuint interpMode[],
+   const ubyte interpMode[],
    GLuint numOutputs,
-   const GLuint outputMapping[],
+   const ubyte outputMapping[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[]);
 
 unsigned
 st_translate_texture_target(GLuint textarget, GLboolean shadow);
 
+unsigned
+st_translate_texture_type(enum glsl_base_type type);
 
 #if defined __cplusplus
 } /* extern "C" */
diff --git a/src/mesa/state_tracker/st_pbo.c b/src/mesa/state_tracker/st_pbo.c
index 1ded583..303c853 100644
--- a/src/mesa/state_tracker/st_pbo.c
+++ b/src/mesa/state_tracker/st_pbo.c
@@ -215,7 +215,7 @@
 
    /* Upload vertices */
    {
-      struct pipe_vertex_buffer vbo;
+      struct pipe_vertex_buffer vbo = {0};
       struct pipe_vertex_element velem;
 
       float x0 = (float) addr->xoffset / surface_width * 2.0f - 1.0f;
@@ -225,12 +225,10 @@
 
       float *verts = NULL;
 
-      vbo.user_buffer = NULL;
-      vbo.buffer = NULL;
       vbo.stride = 2 * sizeof(float);
 
       u_upload_alloc(st->pipe->stream_uploader, 0, 8 * sizeof(float), 4,
-                     &vbo.buffer_offset, &vbo.buffer, (void **) &verts);
+                     &vbo.buffer_offset, &vbo.buffer.resource, (void **) &verts);
       if (!verts)
          return false;
 
@@ -254,7 +252,7 @@
 
       cso_set_vertex_buffers(cso, velem.vertex_buffer_index, 1, &vbo);
 
-      pipe_resource_reference(&vbo.buffer, NULL);
+      pipe_resource_reference(&vbo.buffer.resource, NULL);
    }
 
    /* Upload constants */
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 0dc3b1e..41ebfa9 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -120,7 +120,7 @@
       break;
 
    case MESA_SHADER_TESS_CTRL:
-      states = &((struct st_tessctrl_program*)prog)->affected_states;
+      states = &(st_common_program(prog))->affected_states;
 
       *states = ST_NEW_TCS_STATE;
 
@@ -135,7 +135,7 @@
       break;
 
    case MESA_SHADER_TESS_EVAL:
-      states = &((struct st_tesseval_program*)prog)->affected_states;
+      states = &(st_common_program(prog))->affected_states;
 
       *states = ST_NEW_TES_STATE |
                 ST_NEW_RASTERIZER;
@@ -151,7 +151,7 @@
       break;
 
    case MESA_SHADER_GEOMETRY:
-      states = &((struct st_geometry_program*)prog)->affected_states;
+      states = &(st_common_program(prog))->affected_states;
 
       *states = ST_NEW_GS_STATE |
                 ST_NEW_RASTERIZER;
@@ -376,8 +376,7 @@
    enum pipe_error error;
    unsigned num_outputs = 0;
    unsigned attr;
-   unsigned input_to_index[VERT_ATTRIB_MAX] = {0};
-   unsigned output_slot_to_attr[VARYING_SLOT_MAX] = {0};
+   ubyte input_to_index[VERT_ATTRIB_MAX] = {0};
    ubyte output_semantic_name[VARYING_SLOT_MAX] = {0};
    ubyte output_semantic_index[VARYING_SLOT_MAX] = {0};
 
@@ -417,89 +416,12 @@
          unsigned slot = num_outputs++;
 
          stvp->result_to_output[attr] = slot;
-         output_slot_to_attr[slot] = attr;
 
-         switch (attr) {
-         case VARYING_SLOT_POS:
-            output_semantic_name[slot] = TGSI_SEMANTIC_POSITION;
-            output_semantic_index[slot] = 0;
-            break;
-         case VARYING_SLOT_COL0:
-            output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
-            output_semantic_index[slot] = 0;
-            break;
-         case VARYING_SLOT_COL1:
-            output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
-            output_semantic_index[slot] = 1;
-            break;
-         case VARYING_SLOT_BFC0:
-            output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
-            output_semantic_index[slot] = 0;
-            break;
-         case VARYING_SLOT_BFC1:
-            output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
-            output_semantic_index[slot] = 1;
-            break;
-         case VARYING_SLOT_FOGC:
-            output_semantic_name[slot] = TGSI_SEMANTIC_FOG;
-            output_semantic_index[slot] = 0;
-            break;
-         case VARYING_SLOT_PSIZ:
-            output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE;
-            output_semantic_index[slot] = 0;
-            break;
-         case VARYING_SLOT_CLIP_DIST0:
-            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
-            output_semantic_index[slot] = 0;
-            break;
-         case VARYING_SLOT_CLIP_DIST1:
-            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
-            output_semantic_index[slot] = 1;
-            break;
-         case VARYING_SLOT_CULL_DIST0:
-         case VARYING_SLOT_CULL_DIST1:
-            /* these should have been lowered by GLSL */
-            assert(0);
-            break;
-         case VARYING_SLOT_EDGE:
-            assert(0);
-            break;
-         case VARYING_SLOT_CLIP_VERTEX:
-            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX;
-            output_semantic_index[slot] = 0;
-            break;
-         case VARYING_SLOT_LAYER:
-            output_semantic_name[slot] = TGSI_SEMANTIC_LAYER;
-            output_semantic_index[slot] = 0;
-            break;
-         case VARYING_SLOT_VIEWPORT:
-            output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX;
-            output_semantic_index[slot] = 0;
-            break;
-
-         case VARYING_SLOT_TEX0:
-         case VARYING_SLOT_TEX1:
-         case VARYING_SLOT_TEX2:
-         case VARYING_SLOT_TEX3:
-         case VARYING_SLOT_TEX4:
-         case VARYING_SLOT_TEX5:
-         case VARYING_SLOT_TEX6:
-         case VARYING_SLOT_TEX7:
-            if (st->needs_texcoord_semantic) {
-               output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD;
-               output_semantic_index[slot] = attr - VARYING_SLOT_TEX0;
-               break;
-            }
-            /* fall through */
-         case VARYING_SLOT_VAR0:
-         default:
-            assert(attr >= VARYING_SLOT_VAR0 ||
-                   (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
-            output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            output_semantic_index[slot] =
-               st_get_generic_varying_index(st, attr);
-            break;
-         }
+         unsigned semantic_name, semantic_index;
+         tgsi_get_gl_varying_semantic(attr, st->needs_texcoord_semantic,
+                                      &semantic_name, &semantic_index);
+         output_semantic_name[slot] = semantic_name;
+         output_semantic_index[slot] = semantic_index;
       }
    }
    /* similar hack to above, presetup potentially unused edgeflag output */
@@ -574,7 +496,6 @@
                                    /* outputs */
                                    num_outputs,
                                    stvp->result_to_output,
-                                   output_slot_to_attr,
                                    output_semantic_name,
                                    output_semantic_index);
 
@@ -636,8 +557,10 @@
       vpv->tgsi.ir.nir = nir_shader_clone(NULL, stvp->tgsi.ir.nir);
       if (key->clamp_color)
          NIR_PASS_V(vpv->tgsi.ir.nir, nir_lower_clamp_color_outputs);
-      if (key->passthrough_edgeflags)
+      if (key->passthrough_edgeflags) {
          NIR_PASS_V(vpv->tgsi.ir.nir, nir_lower_passthrough_edgeflags);
+         vpv->num_inputs++;
+      }
 
       st_finalize_nir(st, &stvp->Base, vpv->tgsi.ir.nir);
 
@@ -716,10 +639,10 @@
 st_translate_fragment_program(struct st_context *st,
                               struct st_fragment_program *stfp)
 {
-   GLuint outputMapping[2 * FRAG_RESULT_MAX];
-   GLuint inputMapping[VARYING_SLOT_MAX];
-   GLuint inputSlotToAttr[VARYING_SLOT_MAX];
-   GLuint interpMode[PIPE_MAX_SHADER_INPUTS];  /* XXX size? */
+   ubyte outputMapping[2 * FRAG_RESULT_MAX];
+   ubyte inputMapping[VARYING_SLOT_MAX];
+   ubyte inputSlotToAttr[VARYING_SLOT_MAX];
+   ubyte interpMode[PIPE_MAX_SHADER_INPUTS];  /* XXX size? */
    GLuint attr;
    GLbitfield64 inputsRead;
    struct ureg_program *ureg;
@@ -1032,7 +955,6 @@
                            /* outputs */
                            fs_num_outputs,
                            outputMapping,
-                           NULL,
                            fs_output_semantic_name,
                            fs_output_semantic_index);
 
@@ -1326,9 +1248,25 @@
       /* create new */
       fpv = st_create_fp_variant(st, stfp, key);
       if (fpv) {
-         /* insert into list */
-         fpv->next = stfp->variants;
-         stfp->variants = fpv;
+         if (key->bitmap || key->drawpixels) {
+            /* Regular variants should always come before the
+             * bitmap & drawpixels variants, (unless there
+             * are no regular variants) so that
+             * st_update_fp can take a fast path when
+             * shader_has_one_variant is set.
+             */
+            if (!stfp->variants) {
+               stfp->variants = fpv;
+            } else {
+               /* insert into list after the first one */
+               fpv->next = stfp->variants->next;
+               stfp->variants->next = fpv;
+            }
+         } else {
+            /* insert into list */
+            fpv->next = stfp->variants;
+            stfp->variants = fpv;
+         }
       }
    }
 
@@ -1348,10 +1286,9 @@
                             unsigned tgsi_processor,
                             struct pipe_shader_state *out_state)
 {
-   GLuint inputSlotToAttr[VARYING_SLOT_TESS_MAX];
-   GLuint inputMapping[VARYING_SLOT_TESS_MAX];
-   GLuint outputSlotToAttr[VARYING_SLOT_TESS_MAX];
-   GLuint outputMapping[VARYING_SLOT_TESS_MAX];
+   ubyte inputSlotToAttr[VARYING_SLOT_TESS_MAX];
+   ubyte inputMapping[VARYING_SLOT_TESS_MAX];
+   ubyte outputMapping[VARYING_SLOT_TESS_MAX];
    GLuint attr;
 
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS];
@@ -1366,7 +1303,6 @@
 
    memset(inputSlotToAttr, 0, sizeof(inputSlotToAttr));
    memset(inputMapping, 0, sizeof(inputMapping));
-   memset(outputSlotToAttr, 0, sizeof(outputSlotToAttr));
    memset(outputMapping, 0, sizeof(outputMapping));
    memset(out_state, 0, sizeof(*out_state));
 
@@ -1484,7 +1420,6 @@
          GLuint slot = num_outputs++;
 
          outputMapping[attr] = slot;
-         outputSlotToAttr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_POS:
@@ -1587,7 +1522,6 @@
          GLuint patch_attr = VARYING_SLOT_PATCH0 + attr;
 
          outputMapping[patch_attr] = slot;
-         outputSlotToAttr[slot] = patch_attr;
          output_semantic_name[slot] = TGSI_SEMANTIC_PATCH;
          output_semantic_index[slot] = attr;
       }
@@ -1608,7 +1542,6 @@
                         /* outputs */
                         num_outputs,
                         outputMapping,
-                        outputSlotToAttr,
                         output_semantic_name,
                         output_semantic_index);
 
@@ -1639,7 +1572,7 @@
  */
 bool
 st_translate_geometry_program(struct st_context *st,
-                              struct st_geometry_program *stgp)
+                              struct st_common_program *stgp)
 {
    struct ureg_program *ureg;
 
@@ -1726,7 +1659,7 @@
  */
 bool
 st_translate_tessctrl_program(struct st_context *st,
-                              struct st_tessctrl_program *sttcp)
+                              struct st_common_program *sttcp)
 {
    struct ureg_program *ureg;
 
@@ -1751,7 +1684,7 @@
  */
 bool
 st_translate_tesseval_program(struct st_context *st,
-                              struct st_tesseval_program *sttep)
+                              struct st_common_program *sttep)
 {
    struct ureg_program *ureg;
 
@@ -1798,6 +1731,19 @@
    struct ureg_program *ureg;
    struct pipe_shader_state prog;
 
+   if (stcp->shader_program) {
+      nir_shader *nir = st_glsl_to_nir(st, &stcp->Base, stcp->shader_program,
+                                       MESA_SHADER_COMPUTE);
+
+      /* no compute variants: */
+      st_finalize_nir(st, &stcp->Base, nir);
+
+      stcp->tgsi.ir_type = PIPE_SHADER_IR_NIR;
+      stcp->tgsi.prog = nir;
+
+      return true;
+   }
+
    ureg = ureg_create_with_screen(PIPE_SHADER_COMPUTE, st->pipe->screen);
    if (ureg == NULL)
       return false;
@@ -1914,16 +1860,11 @@
    case GL_TESS_EVALUATION_PROGRAM_NV:
    case GL_COMPUTE_PROGRAM_NV:
       {
-         struct st_geometry_program *gp = (struct st_geometry_program*)target;
-         struct st_tessctrl_program *tcp = (struct st_tessctrl_program*)target;
-         struct st_tesseval_program *tep = (struct st_tesseval_program*)target;
+         struct st_common_program *p = st_common_program(target);
          struct st_compute_program *cp = (struct st_compute_program*)target;
          struct st_basic_variant **variants =
-            target->Target == GL_GEOMETRY_PROGRAM_NV ? &gp->variants :
-            target->Target == GL_TESS_CONTROL_PROGRAM_NV ? &tcp->variants :
-            target->Target == GL_TESS_EVALUATION_PROGRAM_NV ? &tep->variants :
             target->Target == GL_COMPUTE_PROGRAM_NV ? &cp->variants :
-            NULL;
+                                                      &p->variants;
          struct st_basic_variant *v, **prevPtr = variants;
 
          for (v = *variants; v; ) {
@@ -2062,19 +2003,19 @@
    }
 
    case GL_TESS_CONTROL_PROGRAM_NV: {
-      struct st_tessctrl_program *p = (struct st_tessctrl_program *)prog;
+      struct st_common_program *p = st_common_program(prog);
       st_get_basic_variant(st, PIPE_SHADER_TESS_CTRL, &p->tgsi, &p->variants);
       break;
    }
 
    case GL_TESS_EVALUATION_PROGRAM_NV: {
-      struct st_tesseval_program *p = (struct st_tesseval_program *)prog;
+      struct st_common_program *p = st_common_program(prog);
       st_get_basic_variant(st, PIPE_SHADER_TESS_EVAL, &p->tgsi, &p->variants);
       break;
    }
 
    case GL_GEOMETRY_PROGRAM_NV: {
-      struct st_geometry_program *p = (struct st_geometry_program *)prog;
+      struct st_common_program *p = st_common_program(prog);
       st_get_basic_variant(st, PIPE_SHADER_GEOMETRY, &p->tgsi, &p->variants);
       break;
    }
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index 70664d1..8e9f4c5 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -38,16 +38,16 @@
 #include "main/atifragshader.h"
 #include "program/program.h"
 #include "pipe/p_state.h"
+#include "tgsi/tgsi_from_mesa.h"
 #include "st_context.h"
 #include "st_texture.h"
 #include "st_glsl_to_tgsi.h"
 
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define ST_DOUBLE_ATTRIB_PLACEHOLDER 0xffffffff
+#define ST_DOUBLE_ATTRIB_PLACEHOLDER 0xff
 
 struct st_external_sampler_key
 {
@@ -212,13 +212,12 @@
    /* used when bypassing glsl_to_tgsi: */
    struct gl_shader_program *shader_program;
 
-   /** maps a Mesa VERT_ATTRIB_x to a packed TGSI input index */
    /** maps a TGSI input index back to a Mesa VERT_ATTRIB_x */
-   GLuint index_to_input[PIPE_MAX_SHADER_INPUTS];
-   GLuint num_inputs;
+   ubyte index_to_input[PIPE_MAX_ATTRIBS];
+   ubyte num_inputs;
 
    /** Maps VARYING_SLOT_x to slot */
-   GLuint result_to_output[VARYING_SLOT_MAX];
+   ubyte result_to_output[VARYING_SLOT_MAX];
 
    /** List of translated variants of this vertex program.
     */
@@ -254,43 +253,9 @@
 /**
  * Derived from Mesa gl_program:
  */
-struct st_geometry_program
+struct st_common_program
 {
-   struct gl_program Base;  /**< The Mesa geometry program */
-   struct pipe_shader_state tgsi;
-   struct glsl_to_tgsi_visitor* glsl_to_tgsi;
-   uint64_t affected_states; /**< ST_NEW_* flags to mark dirty when binding */
-
-   struct st_basic_variant *variants;
-
-   /** SHA1 hash of linked tgsi shader program, used for on-disk cache */
-   unsigned char sha1[20];
-};
-
-
-/**
- * Derived from Mesa gl_program:
- */
-struct st_tessctrl_program
-{
-   struct gl_program Base;  /**< The Mesa tess ctrl program */
-   struct pipe_shader_state tgsi;
-   struct glsl_to_tgsi_visitor* glsl_to_tgsi;
-   uint64_t affected_states; /**< ST_NEW_* flags to mark dirty when binding */
-
-   struct st_basic_variant *variants;
-
-   /** SHA1 hash of linked tgsi shader program, used for on-disk cache */
-   unsigned char sha1[20];
-};
-
-
-/**
- * Derived from Mesa gl_program:
- */
-struct st_tesseval_program
-{
-   struct gl_program Base;  /**< The Mesa tess eval program */
+   struct gl_program Base;
    struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
    uint64_t affected_states; /**< ST_NEW_* flags to mark dirty when binding */
@@ -312,6 +277,9 @@
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
    uint64_t affected_states; /**< ST_NEW_* flags to mark dirty when binding */
 
+   /* used when bypassing glsl_to_tgsi: */
+   struct gl_shader_program *shader_program;
+
    struct st_basic_variant *variants;
 
    /** SHA1 hash of linked tgsi shader program, used for on-disk cache */
@@ -332,22 +300,10 @@
    return (struct st_vertex_program *)vp;
 }
 
-static inline struct st_geometry_program *
-st_geometry_program( struct gl_program *gp )
+static inline struct st_common_program *
+st_common_program( struct gl_program *gp )
 {
-   return (struct st_geometry_program *)gp;
-}
-
-static inline struct st_tessctrl_program *
-st_tessctrl_program( struct gl_program *tcp )
-{
-   return (struct st_tessctrl_program *)tcp;
-}
-
-static inline struct st_tesseval_program *
-st_tesseval_program( struct gl_program *tep )
-{
-   return (struct st_tesseval_program *)tep;
+   return (struct st_common_program *)gp;
 }
 
 static inline struct st_compute_program *
@@ -367,16 +323,6 @@
 }
 
 static inline void
-st_reference_geomprog(struct st_context *st,
-                      struct st_geometry_program **ptr,
-                      struct st_geometry_program *prog)
-{
-   _mesa_reference_program(st->ctx,
-                           (struct gl_program **) ptr,
-                           (struct gl_program *) prog);
-}
-
-static inline void
 st_reference_fragprog(struct st_context *st,
                       struct st_fragment_program **ptr,
                       struct st_fragment_program *prog)
@@ -387,19 +333,9 @@
 }
 
 static inline void
-st_reference_tesscprog(struct st_context *st,
-                       struct st_tessctrl_program **ptr,
-                       struct st_tessctrl_program *prog)
-{
-   _mesa_reference_program(st->ctx,
-                           (struct gl_program **) ptr,
-                           (struct gl_program *) prog);
-}
-
-static inline void
-st_reference_tesseprog(struct st_context *st,
-                       struct st_tesseval_program **ptr,
-                       struct st_tesseval_program *prog)
+st_reference_prog(struct st_context *st,
+                  struct st_common_program **ptr,
+                  struct st_common_program *prog)
 {
    _mesa_reference_program(st->ctx,
                            (struct gl_program **) ptr,
@@ -422,23 +358,8 @@
 static inline unsigned
 st_get_generic_varying_index(struct st_context *st, GLuint attr)
 {
-   if (attr >= VARYING_SLOT_VAR0) {
-      if (st->needs_texcoord_semantic)
-         return attr - VARYING_SLOT_VAR0;
-      else
-         return 9 + (attr - VARYING_SLOT_VAR0);
-   }
-   if (attr == VARYING_SLOT_PNTC) {
-      assert(!st->needs_texcoord_semantic);
-      return 8;
-   }
-   if (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) {
-      assert(!st->needs_texcoord_semantic);
-      return attr - VARYING_SLOT_TEX0;
-   }
-
-   assert(0);
-   return 0;
+   return tgsi_get_generic_gl_varying_index((gl_varying_slot)attr,
+                                            st->needs_texcoord_semantic);
 }
 
 extern void
@@ -496,15 +417,15 @@
 
 extern bool
 st_translate_geometry_program(struct st_context *st,
-                              struct st_geometry_program *stgp);
+                              struct st_common_program *stgp);
 
 extern bool
 st_translate_tessctrl_program(struct st_context *st,
-                              struct st_tessctrl_program *sttcp);
+                              struct st_common_program *sttcp);
 
 extern bool
 st_translate_tesseval_program(struct st_context *st,
-                              struct st_tesseval_program *sttep);
+                              struct st_common_program *sttep);
 
 extern bool
 st_translate_compute_program(struct st_context *st,
diff --git a/src/mesa/state_tracker/st_sampler_view.c b/src/mesa/state_tracker/st_sampler_view.c
index c78a987..fbf0aae 100644
--- a/src/mesa/state_tracker/st_sampler_view.c
+++ b/src/mesa/state_tracker/st_sampler_view.c
@@ -38,6 +38,7 @@
 #include "st_sampler_view.h"
 #include "st_texture.h"
 #include "st_format.h"
+#include "st_cb_bufferobjects.h"
 #include "st_cb_texture.h"
 
 
@@ -188,49 +189,27 @@
  */
 static unsigned
 compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode,
-                               enum pipe_format actualFormat,
-                               unsigned glsl_version)
+                               bool glsl130_or_later)
 {
    switch (baseFormat) {
    case GL_RGBA:
       return SWIZZLE_XYZW;
    case GL_RGB:
-      if (util_format_has_alpha(actualFormat))
-         return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
-      else
-         return SWIZZLE_XYZW;
+      return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
    case GL_RG:
-      if (util_format_get_nr_components(actualFormat) > 2)
-         return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
-      else
-         return SWIZZLE_XYZW;
+      return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
    case GL_RED:
-      if (util_format_get_nr_components(actualFormat) > 1)
-         return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO,
-                              SWIZZLE_ZERO, SWIZZLE_ONE);
-      else
-         return SWIZZLE_XYZW;
+      return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO,
+                           SWIZZLE_ZERO, SWIZZLE_ONE);
    case GL_ALPHA:
-      if (util_format_get_nr_components(actualFormat) > 1)
-         return MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO,
-                              SWIZZLE_ZERO, SWIZZLE_W);
-      else
-         return SWIZZLE_XYZW;
+      return MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO,
+                           SWIZZLE_ZERO, SWIZZLE_W);
    case GL_LUMINANCE:
-      if (util_format_get_nr_components(actualFormat) > 1)
-         return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
-      else
-         return SWIZZLE_XYZW;
+      return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
    case GL_LUMINANCE_ALPHA:
-      if (util_format_get_nr_components(actualFormat) > 2)
-         return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_W);
-      else
-         return SWIZZLE_XYZW;
+      return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_W);
    case GL_INTENSITY:
-      if (util_format_get_nr_components(actualFormat) > 1)
-         return SWIZZLE_XXXX;
-      else
-         return SWIZZLE_XYZW;
+      return SWIZZLE_XXXX;
    case GL_STENCIL_INDEX:
    case GL_DEPTH_STENCIL:
    case GL_DEPTH_COMPONENT:
@@ -256,7 +235,7 @@
           * BTW, it's required that sampler views are updated when
           * shaders change (check_sampler_swizzle takes care of that).
           */
-         if (glsl_version && glsl_version >= 130)
+         if (glsl130_or_later)
             return SWIZZLE_XXXX;
          else
             return MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO,
@@ -278,33 +257,29 @@
 static unsigned
 get_texture_format_swizzle(const struct st_context *st,
                            const struct st_texture_object *stObj,
-                           unsigned glsl_version)
+                           bool glsl130_or_later)
 {
-   GLenum baseFormat = _mesa_texture_base_format(&stObj->base);
+   GLenum baseFormat = _mesa_base_tex_image(&stObj->base)->_BaseFormat;
    unsigned tex_swizzle;
+   GLenum depth_mode = stObj->base.DepthMode;
 
-   if (baseFormat != GL_NONE) {
-      GLenum depth_mode = stObj->base.DepthMode;
-      /* In ES 3.0, DEPTH_TEXTURE_MODE is expected to be GL_RED for textures
-       * with depth component data specified with a sized internal format.
-       */
-      if (_mesa_is_gles3(st->ctx) &&
-          util_format_is_depth_or_stencil(stObj->pt->format)) {
-         const struct gl_texture_image *firstImage =
-            _mesa_base_tex_image(&stObj->base);
-         if (firstImage->InternalFormat != GL_DEPTH_COMPONENT &&
-             firstImage->InternalFormat != GL_DEPTH_STENCIL &&
-             firstImage->InternalFormat != GL_STENCIL_INDEX)
-            depth_mode = GL_RED;
-      }
-      tex_swizzle = compute_texture_format_swizzle(baseFormat,
-                                                   depth_mode,
-                                                   stObj->pt->format,
-                                                   glsl_version);
+   /* In ES 3.0, DEPTH_TEXTURE_MODE is expected to be GL_RED for textures
+    * with depth component data specified with a sized internal format.
+    */
+   if (_mesa_is_gles3(st->ctx) &&
+       (baseFormat == GL_DEPTH_COMPONENT ||
+        baseFormat == GL_DEPTH_STENCIL ||
+        baseFormat == GL_STENCIL_INDEX)) {
+      const struct gl_texture_image *firstImage =
+         _mesa_base_tex_image(&stObj->base);
+      if (firstImage->InternalFormat != GL_DEPTH_COMPONENT &&
+          firstImage->InternalFormat != GL_DEPTH_STENCIL &&
+          firstImage->InternalFormat != GL_STENCIL_INDEX)
+         depth_mode = GL_RED;
    }
-   else {
-      tex_swizzle = SWIZZLE_XYZW;
-   }
+   tex_swizzle = compute_texture_format_swizzle(baseFormat,
+                                                depth_mode,
+                                                glsl130_or_later);
 
    /* Combine the texture format swizzle with user's swizzle */
    return swizzle_swizzle(stObj->base._Swizzle, tex_swizzle);
@@ -320,9 +295,10 @@
 MAYBE_UNUSED static boolean
 check_sampler_swizzle(const struct st_context *st,
                       const struct st_texture_object *stObj,
-		      const struct pipe_sampler_view *sv, unsigned glsl_version)
+                      const struct pipe_sampler_view *sv,
+                      bool glsl130_or_later)
 {
-   unsigned swizzle = get_texture_format_swizzle(st, stObj, glsl_version);
+   unsigned swizzle = get_texture_format_swizzle(st, stObj, glsl130_or_later);
 
    return ((sv->swizzle_r != GET_SWZ(swizzle, 0)) ||
            (sv->swizzle_g != GET_SWZ(swizzle, 1)) ||
@@ -362,43 +338,31 @@
 {
    enum pipe_format format;
 
-   if (stObj->base.Target == GL_TEXTURE_BUFFER) {
-      format =
-         st_mesa_format_to_pipe_format(st, stObj->base._BufferObjectFormat);
-   }
-   else {
-      format =
-         stObj->surface_based ? stObj->surface_format : stObj->pt->format;
+   GLenum baseFormat = _mesa_base_tex_image(&stObj->base)->_BaseFormat;
+   format = stObj->surface_based ? stObj->surface_format : stObj->pt->format;
 
-      if (util_format_is_depth_and_stencil(format)) {
-         if (stObj->base.StencilSampling) {
-            format = util_format_stencil_only(format);
-         }
-         else {
-            GLenum baseFormat = _mesa_texture_base_format(&stObj->base);
-            if (baseFormat == GL_STENCIL_INDEX) {
-               format = util_format_stencil_only(format);
-            }
-         }
-      }
-      else {
-         /* If sRGB decoding is off, use the linear format */
-         if (samp->sRGBDecode == GL_SKIP_DECODE_EXT) {
-            format = util_format_linear(format);
-         }
+   if (baseFormat == GL_DEPTH_COMPONENT ||
+       baseFormat == GL_DEPTH_STENCIL ||
+       baseFormat == GL_STENCIL_INDEX) {
+      if (stObj->base.StencilSampling || baseFormat == GL_STENCIL_INDEX)
+         format = util_format_stencil_only(format);
 
-         /* Use R8_UNORM for video formats */
-         switch (format) {
-         case PIPE_FORMAT_NV12:
-         case PIPE_FORMAT_IYUV:
-            format = PIPE_FORMAT_R8_UNORM;
-            break;
-         default:
-            break;
-         }
-      }
+      return format;
    }
 
+   /* If sRGB decoding is off, use the linear format */
+   if (samp->sRGBDecode == GL_SKIP_DECODE_EXT)
+      format = util_format_linear(format);
+
+   /* Use R8_UNORM for video formats */
+   switch (format) {
+   case PIPE_FORMAT_NV12:
+   case PIPE_FORMAT_IYUV:
+      format = PIPE_FORMAT_R8_UNORM;
+      break;
+   default:
+      break;
+   }
    return format;
 }
 
@@ -407,38 +371,25 @@
 st_create_texture_sampler_view_from_stobj(struct st_context *st,
 					  struct st_texture_object *stObj,
 					  enum pipe_format format,
-                                          unsigned glsl_version)
+                                          bool glsl130_or_later)
 {
+   /* There is no need to clear this structure (consider CPU overhead). */
    struct pipe_sampler_view templ;
-   unsigned swizzle = get_texture_format_swizzle(st, stObj, glsl_version);
+   unsigned swizzle = get_texture_format_swizzle(st, stObj, glsl130_or_later);
 
-   u_sampler_view_default_template(&templ, stObj->pt, format);
+   templ.format = format;
 
-   if (stObj->pt->target == PIPE_BUFFER) {
-      unsigned base, size;
-
-      base = stObj->base.BufferOffset;
-      if (base >= stObj->pt->width0)
-         return NULL;
-      size = MIN2(stObj->pt->width0 - base, (unsigned)stObj->base.BufferSize);
-      if (!size)
-         return NULL;
-
-      templ.u.buf.offset = base;
-      templ.u.buf.size = size;
+   templ.u.tex.first_level = stObj->base.MinLevel + stObj->base.BaseLevel;
+   templ.u.tex.last_level = last_level(stObj);
+   assert(templ.u.tex.first_level <= templ.u.tex.last_level);
+   if (stObj->layer_override) {
+      templ.u.tex.first_layer = templ.u.tex.last_layer = stObj->layer_override;
    } else {
-      templ.u.tex.first_level = stObj->base.MinLevel + stObj->base.BaseLevel;
-      templ.u.tex.last_level = last_level(stObj);
-      assert(templ.u.tex.first_level <= templ.u.tex.last_level);
-      if (stObj->layer_override) {
-         templ.u.tex.first_layer = templ.u.tex.last_layer = stObj->layer_override;
-      } else {
-         templ.u.tex.first_layer = stObj->base.MinLayer;
-         templ.u.tex.last_layer = last_layer(stObj);
-      }
-      assert(templ.u.tex.first_layer <= templ.u.tex.last_layer);
-      templ.target = gl_target_to_pipe(stObj->base.Target);
+      templ.u.tex.first_layer = stObj->base.MinLayer;
+      templ.u.tex.last_layer = last_layer(stObj);
    }
+   assert(templ.u.tex.first_layer <= templ.u.tex.last_layer);
+   templ.target = gl_target_to_pipe(stObj->base.Target);
 
    templ.swizzle_r = GET_SWZ(swizzle, 0);
    templ.swizzle_g = GET_SWZ(swizzle, 1);
@@ -453,14 +404,10 @@
 st_get_texture_sampler_view_from_stobj(struct st_context *st,
                                        struct st_texture_object *stObj,
                                        const struct gl_sampler_object *samp,
-                                       unsigned glsl_version)
+                                       bool glsl130_or_later)
 {
    struct pipe_sampler_view **sv;
 
-   if (!stObj || !stObj->pt) {
-      return NULL;
-   }
-
    sv = st_texture_get_sampler_view(st, stObj);
 
    if (*sv) {
@@ -468,35 +415,88 @@
        * what they're supposed to be.
        */
       MAYBE_UNUSED struct pipe_sampler_view *view = *sv;
-      assert(!check_sampler_swizzle(st, stObj, view, glsl_version));
+      assert(stObj->pt == view->texture);
+      assert(!check_sampler_swizzle(st, stObj, view, glsl130_or_later));
       assert(get_sampler_view_format(st, stObj, samp) == view->format);
       assert(gl_target_to_pipe(stObj->base.Target) == view->target);
-      if (stObj->base.Target == GL_TEXTURE_BUFFER) {
-         unsigned base = stObj->base.BufferOffset;
-         MAYBE_UNUSED unsigned size = MIN2(stObj->pt->width0 - base,
-                              (unsigned) stObj->base.BufferSize);
-         assert(view->u.buf.offset == base);
-         assert(view->u.buf.size == size);
-      }
-      else {
-         assert(stObj->base.MinLevel + stObj->base.BaseLevel ==
-                view->u.tex.first_level);
-         assert(last_level(stObj) == view->u.tex.last_level);
-         assert(stObj->layer_override || stObj->base.MinLayer == view->u.tex.first_layer);
-         assert(stObj->layer_override || last_layer(stObj) == view->u.tex.last_layer);
-         assert(!stObj->layer_override ||
-                (stObj->layer_override == view->u.tex.first_layer &&
-                 stObj->layer_override == view->u.tex.last_layer));
-      }
+      assert(stObj->base.MinLevel + stObj->base.BaseLevel ==
+             view->u.tex.first_level);
+      assert(last_level(stObj) == view->u.tex.last_level);
+      assert(stObj->layer_override || stObj->base.MinLayer == view->u.tex.first_layer);
+      assert(stObj->layer_override || last_layer(stObj) == view->u.tex.last_layer);
+      assert(!stObj->layer_override ||
+             (stObj->layer_override == view->u.tex.first_layer &&
+              stObj->layer_override == view->u.tex.last_layer));
    }
    else {
       /* create new sampler view */
       enum pipe_format format = get_sampler_view_format(st, stObj, samp);
 
       *sv = st_create_texture_sampler_view_from_stobj(st, stObj,
-                                                      format, glsl_version);
+                                                      format, glsl130_or_later);
 
    }
 
    return *sv;
 }
+
+
+/**
+ * Return the sampler view cached for a buffer texture, (re)creating it when
+ * missing or stale.  Returns NULL without a buffer or with an empty range.
+ */
+struct pipe_sampler_view *
+st_get_buffer_sampler_view_from_stobj(struct st_context *st,
+                                      struct st_texture_object *stObj)
+{
+   struct st_buffer_object *bufObj =
+      st_buffer_object(stObj->base.BufferObject);
+
+   if (!bufObj || !bufObj->buffer)
+      return NULL;
+
+   struct pipe_sampler_view **slot = st_texture_get_sampler_view(st, stObj);
+   struct pipe_resource *res = bufObj->buffer;
+   struct pipe_sampler_view *cached = *slot;
+   const unsigned offset = stObj->base.BufferOffset;
+
+   if (cached && cached->texture == res) {
+      /* Cache hit.  In debug builds, verify that the cached view still
+       * matches the texture object's current buffer parameters.
+       */
+      MAYBE_UNUSED unsigned size = MIN2(res->width0 - offset,
+                                        (unsigned) stObj->base.BufferSize);
+      assert(st_mesa_format_to_pipe_format(st, stObj->base._BufferObjectFormat)
+             == cached->format);
+      assert(cached->target == PIPE_BUFFER);
+      assert(cached->u.buf.offset == offset);
+      assert(cached->u.buf.size == size);
+      return cached;
+   }
+
+   /* Reject offsets past the end of the buffer and empty ranges. */
+   if (offset >= res->width0)
+      return NULL;
+
+   unsigned range = res->width0 - offset;
+   range = MIN2(range, (unsigned) stObj->base.BufferSize);
+   if (range == 0)
+      return NULL;
+
+   /* The template is deliberately not cleared (consider CPU overhead). */
+   struct pipe_sampler_view templ;
+   templ.format =
+      st_mesa_format_to_pipe_format(st, stObj->base._BufferObjectFormat);
+   templ.target = PIPE_BUFFER;
+   templ.swizzle_r = PIPE_SWIZZLE_X;
+   templ.swizzle_g = PIPE_SWIZZLE_Y;
+   templ.swizzle_b = PIPE_SWIZZLE_Z;
+   templ.swizzle_a = PIPE_SWIZZLE_W;
+   templ.u.buf.offset = offset;
+   templ.u.buf.size = range;
+
+   /* Drop the stale view (if any) before installing the replacement. */
+   pipe_sampler_view_reference(slot, NULL);
+   *slot = st->pipe->create_sampler_view(st->pipe, res, &templ);
+   return *slot;
+}
diff --git a/src/mesa/state_tracker/st_sampler_view.h b/src/mesa/state_tracker/st_sampler_view.h
index 6825ceb..392206b 100644
--- a/src/mesa/state_tracker/st_sampler_view.h
+++ b/src/mesa/state_tracker/st_sampler_view.h
@@ -73,7 +73,10 @@
 st_get_texture_sampler_view_from_stobj(struct st_context *st,
                                        struct st_texture_object *stObj,
                                        const struct gl_sampler_object *samp,
-                                       unsigned glsl_version);
+                                       bool glsl130_or_later);
 
+struct pipe_sampler_view *
+st_get_buffer_sampler_view_from_stobj(struct st_context *st,
+                                      struct st_texture_object *stObj);
 
 #endif /* ST_SAMPLER_VIEW_H */
diff --git a/src/mesa/state_tracker/st_shader_cache.c b/src/mesa/state_tracker/st_shader_cache.c
index 44abd4a..ba964eb 100644
--- a/src/mesa/state_tracker/st_shader_cache.c
+++ b/src/mesa/state_tracker/st_shader_cache.c
@@ -86,25 +86,11 @@
       write_tgsi_to_cache(blob, &stvp->tgsi, st, sha1, num_tokens);
       break;
    }
-   case MESA_SHADER_TESS_CTRL: {
-      struct st_tessctrl_program *stcp = (struct st_tessctrl_program *) prog;
-      sha1 = stcp->sha1;
-
-      write_stream_out_to_cache(blob, out_state);
-      write_tgsi_to_cache(blob, out_state, st, sha1, num_tokens);
-      break;
-   }
-   case MESA_SHADER_TESS_EVAL: {
-      struct st_tesseval_program *step = (struct st_tesseval_program *) prog;
-      sha1 = step->sha1;
-
-      write_stream_out_to_cache(blob, out_state);
-      write_tgsi_to_cache(blob, out_state, st, sha1, num_tokens);
-      break;
-   }
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY: {
-      struct st_geometry_program *stgp = (struct st_geometry_program *) prog;
-      sha1 = stgp->sha1;
+      struct st_common_program *p = st_common_program(prog);
+      sha1 = p->sha1;
 
       write_stream_out_to_cache(blob, out_state);
       write_tgsi_to_cache(blob, out_state, st, sha1, num_tokens);
@@ -188,24 +174,21 @@
          break;
       }
       case MESA_SHADER_TESS_CTRL: {
-         struct st_tessctrl_program *stcp =
-            (struct st_tessctrl_program *) glprog;
+         struct st_common_program *stcp = st_common_program(glprog);
          stage_sha1[i] = stcp->sha1;
          ralloc_strcat(&buf, " tcs");
          disk_cache_compute_key(ctx->Cache, buf, strlen(buf), stage_sha1[i]);
          break;
       }
       case MESA_SHADER_TESS_EVAL: {
-         struct st_tesseval_program *step =
-            (struct st_tesseval_program *) glprog;
+         struct st_common_program *step = st_common_program(glprog);
          stage_sha1[i] = step->sha1;
          ralloc_strcat(&buf, " tes");
          disk_cache_compute_key(ctx->Cache, buf, strlen(buf), stage_sha1[i]);
          break;
       }
       case MESA_SHADER_GEOMETRY: {
-         struct st_geometry_program *stgp =
-            (struct st_geometry_program *) glprog;
+         struct st_common_program *stgp = st_common_program(glprog);
          stage_sha1[i] = stgp->sha1;
          ralloc_strcat(&buf, " gs");
          disk_cache_compute_key(ctx->Cache, buf, strlen(buf), stage_sha1[i]);
@@ -242,8 +225,12 @@
    if (prog->data->LinkStatus != linking_skipped)
       return false;
 
-   struct st_context *st = st_context(ctx);
    uint8_t *buffer = NULL;
+   if (ctx->_Shader->Flags & GLSL_CACHE_FALLBACK) {
+      goto fallback_recompile;
+   }
+
+   struct st_context *st = st_context(ctx);
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       if (prog->_LinkedShaders[i] == NULL)
          continue;
@@ -278,8 +265,7 @@
             break;
          }
          case MESA_SHADER_TESS_CTRL: {
-            struct st_tessctrl_program *sttcp =
-               (struct st_tessctrl_program *) glprog;
+            struct st_common_program *sttcp = st_common_program(glprog);
 
             st_release_basic_variants(st, sttcp->Base.Target,
                                       &sttcp->variants, &sttcp->tgsi);
@@ -293,8 +279,7 @@
             break;
          }
          case MESA_SHADER_TESS_EVAL: {
-            struct st_tesseval_program *sttep =
-               (struct st_tesseval_program *) glprog;
+            struct st_common_program *sttep = st_common_program(glprog);
 
             st_release_basic_variants(st, sttep->Base.Target,
                                       &sttep->variants, &sttep->tgsi);
@@ -308,8 +293,7 @@
             break;
          }
          case MESA_SHADER_GEOMETRY: {
-            struct st_geometry_program *stgp =
-               (struct st_geometry_program *) glprog;
+            struct st_common_program *stgp = st_common_program(glprog);
 
             st_release_basic_variants(st, stgp->Base.Target, &stgp->variants,
                                       &stgp->tgsi);
@@ -380,8 +364,7 @@
          }
 
          st_set_prog_affected_state_flags(glprog);
-         _mesa_associate_uniform_storage(ctx, prog, glprog->Parameters,
-                                         false);
+         _mesa_associate_uniform_storage(ctx, prog, glprog, false);
 
          /* Create Gallium shaders now instead of on demand. */
          if (ST_DEBUG & DEBUG_PRECOMPILE ||
@@ -393,8 +376,7 @@
          /* Failed to find a matching cached shader so fallback to recompile.
           */
          if (ctx->_Shader->Flags & GLSL_CACHE_INFO) {
-            fprintf(stderr, "TGSI cache item not found falling back to "
-                    "compile.\n");
+            fprintf(stderr, "TGSI cache item not found.\n");
          }
 
          goto fallback_recompile;
@@ -406,6 +388,9 @@
 fallback_recompile:
    free(buffer);
 
+   if (ctx->_Shader->Flags & GLSL_CACHE_INFO)
+      fprintf(stderr, "TGSI cache falling back to recompile.\n");
+
    for (unsigned i = 0; i < prog->NumShaders; i++) {
       _mesa_glsl_compile_shader(ctx, prog->Shaders[i], false, false, true);
    }
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 2e9856d..2cd783e 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #include <stdio.h>
@@ -55,14 +55,14 @@
 struct pipe_resource *
 st_texture_create(struct st_context *st,
                   enum pipe_texture_target target,
-		  enum pipe_format format,
-		  GLuint last_level,
-		  GLuint width0,
-		  GLuint height0,
-		  GLuint depth0,
+                  enum pipe_format format,
+                  GLuint last_level,
+                  GLuint width0,
+                  GLuint height0,
+                  GLuint depth0,
                   GLuint layers,
                   GLuint nr_samples,
-                  GLuint bind )
+                  GLuint bind)
 {
    struct pipe_resource pt, *newtex;
    struct pipe_screen *screen = st->pipe->screen;
@@ -205,9 +205,9 @@
    unsigned ptWidth;
    uint16_t ptHeight, ptDepth, ptLayers;
 
-   /* Images with borders are never pulled into mipmap textures. 
+   /* Images with borders are never pulled into mipmap textures.
     */
-   if (image->Border) 
+   if (image->Border)
       return GL_FALSE;
 
    /* Check if this image's format matches the established texture's format.
@@ -315,6 +315,7 @@
    *transfer = NULL;
 }
 
+
 /**
  * For debug only: get/print center pixel in the src resource.
  */
@@ -420,3 +421,217 @@
                           texSize, texSize, 1, 1, 0, PIPE_BIND_SAMPLER_VIEW);
    return pt;
 }
+
+
+/**
+ * Release every texture handle recorded for one shader stage: each handle
+ * is made non-resident and deleted, then the handle array itself is freed.
+ */
+static void
+st_destroy_bound_texture_handles_per_stage(struct st_context *st,
+                                           enum pipe_shader_type shader)
+{
+   struct st_bound_handles *stage = &st->bound_texture_handles[shader];
+   struct pipe_context *pctx = st->pipe;
+   unsigned idx = 0;
+
+   if (likely(stage->num_handles == 0))
+      return;
+
+   while (idx < stage->num_handles) {
+      const uint64_t h = stage->handles[idx++];
+      pctx->make_texture_handle_resident(pctx, h, false);
+      pctx->delete_texture_handle(pctx, h);
+   }
+   free(stage->handles);
+   stage->handles = NULL;
+   stage->num_handles = 0;
+}
+
+
+/**
+ * Tear down the bound texture handles of every shader stage in the context.
+ */
+void
+st_destroy_bound_texture_handles(struct st_context *st)
+{
+   unsigned stage = 0;
+
+   while (stage < PIPE_SHADER_TYPES) {
+      st_destroy_bound_texture_handles_per_stage(st, stage++);
+   }
+}
+
+
+/**
+ * Release every image handle recorded for one shader stage: each handle
+ * is made non-resident and deleted, then the handle array itself is freed.
+ */
+static void
+st_destroy_bound_image_handles_per_stage(struct st_context *st,
+                                         enum pipe_shader_type shader)
+{
+   struct st_bound_handles *stage = &st->bound_image_handles[shader];
+   struct pipe_context *pctx = st->pipe;
+   unsigned idx = 0;
+
+   if (likely(stage->num_handles == 0))
+      return;
+
+   while (idx < stage->num_handles) {
+      const uint64_t h = stage->handles[idx++];
+      pctx->make_image_handle_resident(pctx, h, GL_READ_WRITE, false);
+      pctx->delete_image_handle(pctx, h);
+   }
+   free(stage->handles);
+   stage->handles = NULL;
+   stage->num_handles = 0;
+}
+
+
+/**
+ * Tear down the bound image handles of every shader stage in the context.
+ */
+void
+st_destroy_bound_image_handles(struct st_context *st)
+{
+   unsigned stage = 0;
+
+   while (stage < PIPE_SHADER_TYPES) {
+      st_destroy_bound_image_handles_per_stage(st, stage++);
+   }
+}
+
+
+/**
+ * Create a texture handle for texUnit; returns 0 if it has no sampler view.
+ */
+static GLuint64
+st_create_texture_handle_from_unit(struct st_context *st,
+                                   struct gl_program *prog, GLuint texUnit)
+{
+   const bool glsl130_or_later = prog->sh.data->Version >= 130;
+   struct pipe_sampler_state samp_state = {0};
+   struct pipe_sampler_view *sv;
+
+   st_update_single_texture(st, &sv, texUnit, glsl130_or_later);
+   if (sv == NULL)
+      return 0;
+
+   /* Buffer views keep the zero-initialized sampler state. */
+   if (sv->target != PIPE_BUFFER)
+      st_convert_sampler_from_unit(st, &samp_state, texUnit);
+
+   assert(st->ctx->Texture.Unit[texUnit]._Current);
+   return st->pipe->create_texture_handle(st->pipe, sv, &samp_state);
+}
+
+
+/**
+ * Fill a pipe_image_view from image unit imgUnit and create an image
+ * handle for it.  The prog argument is accepted but not used here.
+ */
+static GLuint64
+st_create_image_handle_from_unit(struct st_context *st,
+                                 struct gl_program *prog, GLuint imgUnit)
+{
+   struct pipe_image_view view;
+
+   st_convert_image_from_unit(st, &view, imgUnit);
+
+   return st->pipe->create_image_handle(st->pipe, &view);
+}
+
+
+/**
+ * Make all bindless samplers bound to texture units resident in the context.
+ */
+void
+st_make_bound_samplers_resident(struct st_context *st,
+                                struct gl_program *prog)
+{
+   enum pipe_shader_type shader = st_shader_stage_to_ptarget(prog->info.stage);
+   struct st_bound_handles *bound_handles = &st->bound_texture_handles[shader];
+   struct pipe_context *pipe = st->pipe;
+   GLuint64 handle;
+   int i;
+
+   /* Remove previous bound texture handles for this stage. */
+   st_destroy_bound_texture_handles_per_stage(st, shader);
+
+   if (likely(!prog->sh.HasBoundBindlessSampler))
+      return;
+
+   for (i = 0; i < prog->sh.NumBindlessSamplers; i++) {
+      struct gl_bindless_sampler *sampler = &prog->sh.BindlessSamplers[i];
+
+      if (!sampler->bound)
+         continue;
+
+      /* Request a new texture handle from the driver. */
+      handle = st_create_texture_handle_from_unit(st, prog, sampler->unit);
+      if (!handle)
+         continue;
+
+      /* Grow via a temporary: on OOM keep the old array, drop this handle. */
+      uint64_t *grown = realloc(bound_handles->handles,
+            (bound_handles->num_handles + 1) * sizeof(uint64_t));
+      if (!grown) {
+         pipe->delete_texture_handle(pipe, handle);
+         continue;
+      }
+      bound_handles->handles = grown;
+      bound_handles->handles[bound_handles->num_handles++] = handle;
+
+      /* Resident handle replaces the unit value before the constant upload. */
+      pipe->make_texture_handle_resident(pipe, handle, true);
+      *(uint64_t *)sampler->data = handle;
+   }
+}
+
+
+/**
+ * Make all bindless images bound to image units resident in the context.
+ */
+void
+st_make_bound_images_resident(struct st_context *st,
+                              struct gl_program *prog)
+{
+   enum pipe_shader_type shader = st_shader_stage_to_ptarget(prog->info.stage);
+   struct st_bound_handles *bound_handles = &st->bound_image_handles[shader];
+   struct pipe_context *pipe = st->pipe;
+   GLuint64 handle;
+   int i;
+
+   /* Remove previous bound image handles for this stage. */
+   st_destroy_bound_image_handles_per_stage(st, shader);
+
+   if (likely(!prog->sh.HasBoundBindlessImage))
+      return;
+
+   for (i = 0; i < prog->sh.NumBindlessImages; i++) {
+      struct gl_bindless_image *image = &prog->sh.BindlessImages[i];
+
+      if (!image->bound)
+         continue;
+
+      /* Request a new image handle from the driver. */
+      handle = st_create_image_handle_from_unit(st, prog, image->unit);
+      if (!handle)
+         continue;
+
+      /* Grow via a temporary: on OOM keep the old array, drop this handle. */
+      uint64_t *grown = realloc(bound_handles->handles,
+            (bound_handles->num_handles + 1) * sizeof(uint64_t));
+      if (!grown) {
+         pipe->delete_image_handle(pipe, handle);
+         continue;
+      }
+      bound_handles->handles = grown;
+      bound_handles->handles[bound_handles->num_handles++] = handle;
+
+      /* Resident handle replaces the unit value before the constant upload. */
+      pipe->make_image_handle_resident(pipe, handle, GL_READ_WRITE, true);
+      *(uint64_t *)image->data = handle;
+   }
+}
diff --git a/src/mesa/state_tracker/st_texture.h b/src/mesa/state_tracker/st_texture.h
index 44b07da..a6f6ee8 100644
--- a/src/mesa/state_tracker/st_texture.h
+++ b/src/mesa/state_tracker/st_texture.h
@@ -84,6 +84,9 @@
     */
    GLuint lastLevel;
 
+   unsigned int validated_first_level;
+   unsigned int validated_last_level;
+
    /* On validation any active images held in main memory or in other
     * textures will be copied to this texture and the old storage freed.
     */
@@ -118,9 +121,15 @@
    uint layer_override;
 
    /** The glsl version of the shader seen during the previous validation */
-   unsigned prev_glsl_version;
+   bool prev_glsl130_or_later;
    /** The value of the sampler's sRGBDecode state at the previous validation */
    GLenum prev_sRGBDecode;
+
+    /**
+     * Set when the texture images of this texture object might not all be in
+     * the pipe_resource *pt above.
+     */
+    bool needs_validation;
 };
 
 
@@ -250,6 +259,11 @@
 extern struct pipe_resource *
 st_create_color_map_texture(struct gl_context *ctx);
 
+void
+st_destroy_bound_texture_handles(struct st_context *st);
+
+void
+st_destroy_bound_image_handles(struct st_context *st);
 
 bool
 st_etc_fallback(struct st_context *st, struct gl_texture_image *texImage);
@@ -258,4 +272,33 @@
 st_convert_image(const struct st_context *st, const struct gl_image_unit *u,
                  struct pipe_image_view *img);
 
+void
+st_convert_image_from_unit(const struct st_context *st,
+                           struct pipe_image_view *img,
+                           GLuint imgUnit);
+
+void
+st_convert_sampler(const struct st_context *st,
+                   const struct gl_texture_object *texobj,
+                   const struct gl_sampler_object *msamp,
+                   struct pipe_sampler_state *sampler);
+
+void
+st_convert_sampler_from_unit(const struct st_context *st,
+                             struct pipe_sampler_state *sampler,
+                             GLuint texUnit);
+
+void
+st_update_single_texture(struct st_context *st,
+                         struct pipe_sampler_view **sampler_view,
+                         GLuint texUnit, bool glsl130_or_later);
+
+void
+st_make_bound_samplers_resident(struct st_context *st,
+                                struct gl_program *prog);
+
+void
+st_make_bound_images_resident(struct st_context *st,
+                              struct gl_program *prog);
+
 #endif
diff --git a/src/mesa/swrast/s_context.c b/src/mesa/swrast/s_context.c
index 9cb03b3..9f3d21f 100644
--- a/src/mesa/swrast/s_context.c
+++ b/src/mesa/swrast/s_context.c
@@ -29,6 +29,8 @@
 #include "main/bufferobj.h"
 #include "main/mtypes.h"
 #include "main/samplerobj.h"
+#include "main/state.h"
+#include "main/stencil.h"
 #include "main/teximage.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
@@ -61,7 +63,7 @@
    if (ctx->Depth.Test)                   rasterMask |= DEPTH_BIT;
    if (swrast->_FogEnabled)               rasterMask |= FOG_BIT;
    if (ctx->Scissor.EnableFlags)          rasterMask |= CLIP_BIT;
-   if (ctx->Stencil._Enabled)             rasterMask |= STENCIL_BIT;
+   if (_mesa_stencil_is_enabled(ctx))     rasterMask |= STENCIL_BIT;
    for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
       if (!ctx->Color.ColorMask[i][0] ||
           !ctx->Color.ColorMask[i][1] ||
@@ -108,7 +110,7 @@
       rasterMask |= FRAGPROG_BIT;
    }
 
-   if (ctx->ATIFragmentShader._Enabled) {
+   if (_mesa_ati_fragment_shader_enabled(ctx)) {
       rasterMask |= ATIFRAGSHADER_BIT;
    }
 
@@ -288,7 +290,7 @@
    swrast->SpecularVertexAdd = (separateSpecular
                                 && ctx->Texture._MaxEnabledTexImageUnit == -1
                                 && !_swrast_use_fragment_program(ctx)
-                                && !ctx->ATIFragmentShader._Enabled);
+                                && !_mesa_ati_fragment_shader_enabled(ctx));
 }
 
 
@@ -503,7 +505,7 @@
       attribsMask = ctx->FragmentProgram._Current->info.inputs_read;
       attribsMask &= ~VARYING_BIT_POS; /* WPOS is always handled specially */
    }
-   else if (ctx->ATIFragmentShader._Enabled) {
+   else if (_mesa_ati_fragment_shader_enabled(ctx)) {
       attribsMask = VARYING_BIT_COL0 | VARYING_BIT_COL1 |
                     VARYING_BIT_FOGC | VARYING_BITS_TEX_ANY;
    }
diff --git a/src/mesa/swrast/s_renderbuffer.c b/src/mesa/swrast/s_renderbuffer.c
index 940c7b7..66a823d 100644
--- a/src/mesa/swrast/s_renderbuffer.c
+++ b/src/mesa/swrast/s_renderbuffer.c
@@ -274,7 +274,7 @@
       rb->InternalFormat = GL_RGBA;
 
       rb->AllocStorage = soft_renderbuffer_storage;
-      _mesa_add_renderbuffer_without_ref(fb, b, rb);
+      _mesa_attach_and_own_rb(fb, b, rb);
    }
 
    return GL_TRUE;
@@ -320,7 +320,7 @@
    }
 
    rb->AllocStorage = soft_renderbuffer_storage;
-   _mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, rb);
+   _mesa_attach_and_own_rb(fb, BUFFER_DEPTH, rb);
 
    return GL_TRUE;
 }
@@ -358,7 +358,7 @@
    rb->InternalFormat = GL_STENCIL_INDEX8;
 
    rb->AllocStorage = soft_renderbuffer_storage;
-   _mesa_add_renderbuffer_without_ref(fb, BUFFER_STENCIL, rb);
+   _mesa_attach_and_own_rb(fb, BUFFER_STENCIL, rb);
 
    return GL_TRUE;
 }
@@ -382,8 +382,8 @@
    rb->InternalFormat = GL_DEPTH_STENCIL;
 
    rb->AllocStorage = soft_renderbuffer_storage;
-   _mesa_add_renderbuffer_without_ref(fb, BUFFER_DEPTH, rb);
-   _mesa_add_renderbuffer(fb, BUFFER_STENCIL, rb);
+   _mesa_attach_and_own_rb(fb, BUFFER_DEPTH, rb);
+   _mesa_attach_and_reference_rb(fb, BUFFER_STENCIL, rb);
 
    return GL_TRUE;
 }
@@ -420,7 +420,7 @@
 
    rb->InternalFormat = GL_RGBA16_SNORM;
    rb->AllocStorage = soft_renderbuffer_storage;
-   _mesa_add_renderbuffer_without_ref(fb, BUFFER_ACCUM, rb);
+   _mesa_attach_and_own_rb(fb, BUFFER_ACCUM, rb);
 
    return GL_TRUE;
 }
@@ -465,7 +465,7 @@
       rb->InternalFormat = GL_RGBA;
 
       rb->AllocStorage = soft_renderbuffer_storage;
-      _mesa_add_renderbuffer_without_ref(fb, BUFFER_AUX0 + i, rb);
+      _mesa_attach_and_own_rb(fb, BUFFER_AUX0 + i, rb);
    }
    return GL_TRUE;
 }
diff --git a/src/mesa/swrast/s_span.c b/src/mesa/swrast/s_span.c
index 49fc580..47a73e9 100644
--- a/src/mesa/swrast/s_span.c
+++ b/src/mesa/swrast/s_span.c
@@ -39,6 +39,8 @@
 #include "main/imports.h"
 #include "main/image.h"
 #include "main/samplerobj.h"
+#include "main/state.h"
+#include "main/stencil.h"
 #include "main/teximage.h"
 
 #include "s_atifragshader.h"
@@ -142,7 +144,7 @@
          const GLuint attr = VARYING_SLOT_TEX0 + i;
          const GLfloat *tc = ctx->Current.RasterTexCoords[i];
          if (_swrast_use_fragment_program(ctx) ||
-             ctx->ATIFragmentShader._Enabled) {
+             _mesa_ati_fragment_shader_enabled(ctx)) {
             COPY_4V(span->attrStart[attr], tc);
          }
          else if (tc[3] > 0.0F) {
@@ -523,7 +525,7 @@
          if (needLambda) {
             GLuint i;
             if (_swrast_use_fragment_program(ctx)
-                || ctx->ATIFragmentShader._Enabled) {
+                || _mesa_ati_fragment_shader_enabled(ctx)) {
                /* do perspective correction but don't divide s, t, r by q */
                const GLfloat dwdx = span->attrStepX[VARYING_SLOT_POS][3];
                GLfloat w = span->attrStart[VARYING_SLOT_POS][3] + span->leftClip * dwdx;
@@ -564,7 +566,7 @@
          else {
             GLuint i;
             if (_swrast_use_fragment_program(ctx) ||
-                ctx->ATIFragmentShader._Enabled) {
+                _mesa_ati_fragment_shader_enabled(ctx)) {
                /* do perspective correction but don't divide s, t, r by q */
                const GLfloat dwdx = span->attrStepX[VARYING_SLOT_POS][3];
                GLfloat w = span->attrStart[VARYING_SLOT_POS][3] + span->leftClip * dwdx;
@@ -976,7 +978,7 @@
 shade_texture_span(struct gl_context *ctx, SWspan *span)
 {
    if (_swrast_use_fragment_program(ctx) ||
-       ctx->ATIFragmentShader._Enabled) {
+       _mesa_ati_fragment_shader_enabled(ctx)) {
       /* programmable shading */
       if (span->primitive == GL_BITMAP && span->array->ChanType != GL_FLOAT) {
          convert_color_type(span, span->array->ChanType, GL_FLOAT, 0);
@@ -1008,7 +1010,7 @@
          _swrast_exec_fragment_program(ctx, span);
       }
       else {
-         assert(ctx->ATIFragmentShader._Enabled);
+         assert(_mesa_ati_fragment_shader_enabled(ctx));
          _swrast_exec_fragment_shader(ctx, span);
       }
    }
@@ -1138,7 +1140,7 @@
    const GLenum origChanType = span->array->ChanType;
    void * const origRgba = span->array->rgba;
    const GLboolean shader = (_swrast_use_fragment_program(ctx)
-                             || ctx->ATIFragmentShader._Enabled);
+                             || _mesa_ati_fragment_shader_enabled(ctx));
    const GLboolean shaderOrTexture = shader || ctx->Texture._EnabledCoordUnits;
    struct gl_framebuffer *fb = ctx->DrawBuffer;
 
@@ -1213,14 +1215,14 @@
    }
 
    /* Stencil and Z testing */
-   if (ctx->Stencil._Enabled || ctx->Depth.Test) {
+   if (_mesa_stencil_is_enabled(ctx) || ctx->Depth.Test) {
       if (!(span->arrayMask & SPAN_Z))
          _swrast_span_interpolate_z(ctx, span);
 
       if (ctx->Transform.DepthClamp)
 	 _swrast_depth_clamp_span(ctx, span);
 
-      if (ctx->Stencil._Enabled) {
+      if (_mesa_stencil_is_enabled(ctx)) {
          /* Combined Z/stencil tests */
          if (!_swrast_stencil_and_ztest_span(ctx, span)) {
             /* all fragments failed test */
diff --git a/src/mesa/swrast/s_triangle.c b/src/mesa/swrast/s_triangle.c
index 876a74b..a4113e5 100644
--- a/src/mesa/swrast/s_triangle.c
+++ b/src/mesa/swrast/s_triangle.c
@@ -36,6 +36,7 @@
 #include "main/mtypes.h"
 #include "main/state.h"
 #include "main/samplerobj.h"
+#include "main/stencil.h"
 #include "main/teximage.h"
 #include "program/prog_instruction.h"
 
@@ -1023,7 +1024,7 @@
           ctx->Depth.Test &&
           ctx->Depth.Mask == GL_FALSE &&
           ctx->Depth.Func == GL_LESS &&
-          !ctx->Stencil._Enabled &&
+          !_mesa_stencil_is_enabled(ctx) &&
           depthRb &&
           depthRb->Format == MESA_FORMAT_Z_UNORM16) {
          if (ctx->Color.ColorMask[0][0] == 0 &&
@@ -1041,7 +1042,7 @@
        */
       if (ctx->Texture._EnabledCoordUnits ||
 	  _swrast_use_fragment_program(ctx) ||
-          ctx->ATIFragmentShader._Enabled ||
+          _mesa_ati_fragment_shader_enabled(ctx) ||
           _mesa_need_secondary_color(ctx) ||
           swrast->_FogEnabled) {
          /* Ugh, we do a _lot_ of tests to pick the best textured tri func */
@@ -1070,7 +1071,7 @@
          /* First see if we can use an optimized 2-D texture function */
          if (ctx->Texture._EnabledCoordUnits == 0x1
              && !_swrast_use_fragment_program(ctx)
-             && !ctx->ATIFragmentShader._Enabled
+             && !_mesa_ati_fragment_shader_enabled(ctx)
              && ctx->Texture._MaxEnabledTexImageUnit == 0
              && ctx->Texture.Unit[0]._Current->Target == GL_TEXTURE_2D
              && samp->WrapS == GL_REPEAT
diff --git a/src/mesa/swrast_setup/ss_context.c b/src/mesa/swrast_setup/ss_context.c
index 74b1da3..ec20d6c 100644
--- a/src/mesa/swrast_setup/ss_context.c
+++ b/src/mesa/swrast_setup/ss_context.c
@@ -28,6 +28,7 @@
 #include "main/glheader.h"
 #include "main/imports.h"
 #include "main/macros.h"
+#include "main/state.h"
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
 #include "tnl/t_pipeline.h"
@@ -113,7 +114,7 @@
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    SScontext *swsetup = SWSETUP_CONTEXT(ctx);
    GLboolean intColors = !ctx->FragmentProgram._Current
-                      && !ctx->ATIFragmentShader._Enabled
+                      && !_mesa_ati_fragment_shader_enabled(ctx)
                       && ctx->RenderMode == GL_RENDER
                       && CHAN_TYPE != GL_FLOAT;
 
diff --git a/src/mesa/swrast_setup/ss_triangle.c b/src/mesa/swrast_setup/ss_triangle.c
index b92c20b..d3a0e23 100644
--- a/src/mesa/swrast_setup/ss_triangle.c
+++ b/src/mesa/swrast_setup/ss_triangle.c
@@ -29,6 +29,8 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/mtypes.h"
+#include "main/stencil.h"
+#include "main/state.h"
 
 #include "tnl/t_context.h"
 
@@ -255,7 +257,7 @@
     */
    if (ctx->Polygon.FrontMode != GL_FILL ||
        ctx->Polygon.BackMode != GL_FILL ||
-       (ctx->Stencil.Enabled && ctx->Stencil._TestTwoSide))
+       (ctx->Stencil.Enabled && _mesa_stencil_is_two_sided(ctx)))
       ind |= SS_UNFILLED_BIT;
 
    tnl->Driver.Render.Triangle = tri_tab[ind];
diff --git a/src/mesa/swrast_setup/ss_tritmp.h b/src/mesa/swrast_setup/ss_tritmp.h
index adb77bd..c887472 100644
--- a/src/mesa/swrast_setup/ss_tritmp.h
+++ b/src/mesa/swrast_setup/ss_tritmp.h
@@ -58,7 +58,7 @@
 
       if (IND & (SS_TWOSIDE_BIT | SS_UNFILLED_BIT))
       {
-	 facing = (cc < 0.0F) ^ ctx->Polygon._FrontBit;
+	 facing = (cc < 0.0F) ^ _mesa_polygon_get_front_bit(ctx);
 
 	 if (IND & SS_UNFILLED_BIT)
 	    mode = facing ? ctx->Polygon.BackMode : ctx->Polygon.FrontMode;
diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index 24d74c0..9fca4da 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -358,7 +358,7 @@
       bo[*nr_bo] = ib->obj;
       (*nr_bo)++;
       ptr = ctx->Driver.MapBufferRange(ctx, (GLsizeiptr) ib->ptr,
-                                       ib->count * vbo_sizeof_ib_type(ib->type),
+                                       ib->count * ib->index_size,
 				       GL_MAP_READ_BIT, ib->obj,
                                        MAP_INTERNAL);
       assert(ib->obj->Mappings[MAP_INTERNAL].Pointer);
@@ -367,19 +367,19 @@
       ptr = ADD_POINTERS(ib->obj->Mappings[MAP_INTERNAL].Pointer, ib->ptr);
    }
 
-   if (ib->type == GL_UNSIGNED_INT && VB->Primitive[0].basevertex == 0) {
+   if (ib->index_size == 4 && VB->Primitive[0].basevertex == 0) {
       VB->Elts = (GLuint *) ptr;
    }
    else {
       GLuint *elts = (GLuint *)get_space(ctx, ib->count * sizeof(GLuint));
       VB->Elts = elts;
 
-      if (ib->type == GL_UNSIGNED_INT) {
+      if (ib->index_size == 4) {
 	 const GLuint *in = (GLuint *)ptr;
 	 for (i = 0; i < ib->count; i++)
 	    *elts++ = (GLuint)(*in++) + VB->Primitive[0].basevertex;
       }
-      else if (ib->type == GL_UNSIGNED_SHORT) {
+      else if (ib->index_size == 2) {
 	 const GLushort *in = (GLushort *)ptr;
 	 for (i = 0; i < ib->count; i++) 
 	    *elts++ = (GLuint)(*in++) + VB->Primitive[0].basevertex;
diff --git a/src/mesa/tnl/t_vb_program.c b/src/mesa/tnl/t_vb_program.c
index 23e09a2..19be5ee 100644
--- a/src/mesa/tnl/t_vb_program.c
+++ b/src/mesa/tnl/t_vb_program.c
@@ -35,6 +35,7 @@
 #include "main/macros.h"
 #include "main/imports.h"
 #include "main/samplerobj.h"
+#include "main/state.h"
 #include "math/m_xform.h"
 #include "program/prog_instruction.h"
 #include "program/prog_statevars.h"
@@ -162,7 +163,8 @@
    /* Test userclip planes.  This contributes to VB->ClipMask.
     */
    /** XXX NEW_SLANG _Enabled ??? */
-   if (ctx->Transform.ClipPlanesEnabled && (!ctx->VertexProgram._Enabled ||
+   if (ctx->Transform.ClipPlanesEnabled &&
+       (!_mesa_arb_vertex_program_enabled(ctx) ||
       ctx->VertexProgram.Current->arb.IsPositionInvariant)) {
       userclip( ctx,
 		VB->ClipPtr,
diff --git a/src/mesa/tnl_dd/t_dd_tritmp.h b/src/mesa/tnl_dd/t_dd_tritmp.h
index 2176f1f..2294a76 100644
--- a/src/mesa/tnl_dd/t_dd_tritmp.h
+++ b/src/mesa/tnl_dd/t_dd_tritmp.h
@@ -137,7 +137,7 @@
 
       if (DO_TWOSIDE || DO_UNFILLED || DO_TWOSTENCIL)
       {
-	 facing = AREA_IS_CCW( cc ) ^ ctx->Polygon._FrontBit;
+	 facing = AREA_IS_CCW( cc ) ^ _mesa_polygon_get_front_bit(ctx);
 
 	 if (DO_UNFILLED) {
 	    if (facing) {
@@ -362,7 +362,7 @@
 
       if (DO_TWOSIDE || DO_UNFILLED || DO_TWOSTENCIL)
       {
-	 facing = AREA_IS_CCW( cc ) ^ ctx->Polygon._FrontBit;
+	 facing = AREA_IS_CCW( cc ) ^ _mesa_polygon_get_front_bit(ctx);
 
 	 if (DO_UNFILLED) {
 	    if (facing) {
diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index d62ab4e..c8e87d3 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -69,7 +69,7 @@
  */
 struct _mesa_index_buffer {
    GLuint count;
-   GLenum type;
+   unsigned index_size;
    struct gl_buffer_object *obj;
    const void *ptr;
 };
@@ -78,7 +78,6 @@
 
 GLboolean _vbo_CreateContext( struct gl_context *ctx );
 void _vbo_DestroyContext( struct gl_context *ctx );
-void _vbo_InvalidateState( struct gl_context *ctx, GLbitfield new_state );
 
 
 void
@@ -91,7 +90,7 @@
 
 void vbo_exec_FlushVertices(struct gl_context *ctx, GLuint flags);
 void vbo_save_SaveFlushVertices(struct gl_context *ctx);
-GLboolean vbo_save_NotifyBegin(struct gl_context *ctx, GLenum mode);
+void vbo_save_NotifyBegin(struct gl_context *ctx, GLenum mode);
 void vbo_save_NewList(struct gl_context *ctx, GLuint list, GLenum mode);
 void vbo_save_EndList(struct gl_context *ctx);
 void vbo_save_BeginCallList(struct gl_context *ctx, struct gl_display_list *list);
diff --git a/src/mesa/vbo/vbo_attrib_tmp.h b/src/mesa/vbo/vbo_attrib_tmp.h
index 4e2c874..8328445 100644
--- a/src/mesa/vbo/vbo_attrib_tmp.h
+++ b/src/mesa/vbo/vbo_attrib_tmp.h
@@ -41,6 +41,8 @@
         FLOAT_AS_UNION(V2), FLOAT_AS_UNION(V3))
 #define ATTRD( A, N, V0, V1, V2, V3 ) \
     ATTR_UNION(A, N, GL_DOUBLE, double, V0, V1, V2, V3)
+#define ATTRUI64( A, N, V0, V1, V2, V3 ) \
+    ATTR_UNION(A, N, GL_UNSIGNED_INT64_ARB, uint64_t, V0, V1, V2, V3)
 
 
 /* float */
@@ -246,6 +248,9 @@
 #define ATTR3D( A, X, Y, Z )    ATTRD( A, 3, X, Y, Z, 1 )
 #define ATTR4D( A, X, Y, Z, W ) ATTRD( A, 4, X, Y, Z, W )
 
+#define ATTR1UIV64( A, V ) ATTRUI64( A, 1, (V)[0], 0, 0, 0 )
+#define ATTR1UI64( A, X )  ATTRUI64( A, 1, X, 0, 0, 0 )
+
 
 static void GLAPIENTRY
 TAG(Vertex2f)(GLfloat x, GLfloat y)
@@ -1302,6 +1307,29 @@
       ERROR(GL_INVALID_VALUE);
 }
 
+static void GLAPIENTRY
+TAG(VertexAttribL1ui64ARB)(GLuint index, GLuint64EXT x)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   if (index == 0 && _mesa_attr_zero_aliases_vertex(ctx))
+      ATTR1UI64(0, x);
+   else if (index < MAX_VERTEX_GENERIC_ATTRIBS)
+      ATTR1UI64(VBO_ATTRIB_GENERIC0 + index, x);
+   else
+      ERROR(GL_INVALID_VALUE);
+}
+
+static void GLAPIENTRY
+TAG(VertexAttribL1ui64vARB)(GLuint index, const GLuint64EXT *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   if (index == 0 && _mesa_attr_zero_aliases_vertex(ctx))
+      ATTR1UIV64(0, v);
+   else if (index < MAX_VERTEX_GENERIC_ATTRIBS)
+      ATTR1UIV64(VBO_ATTRIB_GENERIC0 + index, v);
+   else
+      ERROR(GL_INVALID_VALUE);
+}
 
 #undef ATTR1FV
 #undef ATTR2FV
diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index 7022fe9..a5f915d 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -168,7 +168,7 @@
    }
 
    vbo->draw_prims(ctx, prim, draw_count,
-                   ib, false, ~0, ~0,
+                   ib, false, 0, ~0,
                    NULL, 0,
                    ctx->DrawIndirectBuffer);
 
@@ -227,12 +227,6 @@
 }
 
 
-void _vbo_InvalidateState( struct gl_context *ctx, GLbitfield new_state )
-{
-   vbo_exec_invalidate_state(ctx, new_state);
-}
-
-
 void _vbo_DestroyContext( struct gl_context *ctx )
 {
    struct vbo_context *vbo = vbo_context(ctx);
diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index 5cf399f..70757d0 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -56,6 +56,7 @@
 #include "vbo_exec.h"
 #include "vbo_save.h"
 
+#include "main/api_arrayelt.h"
 #include "main/macros.h"
 
 #ifdef __cplusplus
@@ -91,6 +92,24 @@
 }
 
 
+static inline void
+vbo_exec_invalidate_state(struct gl_context *ctx)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct vbo_exec_context *exec = &vbo->exec;
+
+   if (ctx->NewState & (_NEW_PROGRAM | _NEW_ARRAY)) {
+      if (!exec->validating)
+         exec->array.recalculate_inputs = GL_TRUE;
+
+      _ae_invalidate_state(ctx);
+   }
+
+   if (ctx->NewState & _NEW_EVAL)
+      exec->eval.recalculate_maps = GL_TRUE;
+}
+
+
 /**
  * Return VP_x token to indicate whether we're running fixed-function
  * vertex transformation, an NV vertex program or ARB vertex program/shader.
@@ -156,6 +175,7 @@
       return GL_FALSE;
    case GL_INT:
    case GL_UNSIGNED_INT:
+   case GL_UNSIGNED_INT64_ARB:
       return GL_TRUE;
    default:
       assert(0);
@@ -170,6 +190,7 @@
    case GL_FLOAT:
    case GL_INT:
    case GL_UNSIGNED_INT:
+   case GL_UNSIGNED_INT64_ARB:
       return GL_FALSE;
    case GL_DOUBLE:
       return GL_TRUE;
diff --git a/src/mesa/vbo/vbo_exec.c b/src/mesa/vbo/vbo_exec.c
index 4db4f40..dc26dfd 100644
--- a/src/mesa/vbo/vbo_exec.c
+++ b/src/mesa/vbo/vbo_exec.c
@@ -26,7 +26,6 @@
  */
 
 
-#include "main/api_arrayelt.h"
 #include "main/glheader.h"
 #include "main/mtypes.h"
 #include "main/vtxfmt.h"
@@ -34,24 +33,26 @@
 
 
 
-void vbo_exec_init( struct gl_context *ctx )
+void
+vbo_exec_init(struct gl_context *ctx)
 {
    struct vbo_exec_context *exec = &vbo_context(ctx)->exec;
 
    exec->ctx = ctx;
 
-   /* Initialize the arrayelt helper
-    */
-   if (!ctx->aelt_context &&
-       !_ae_create_context( ctx )) 
-      return;
+   /* aelt_context should have been created by the caller */
+   assert(ctx->aelt_context);
 
-   vbo_exec_vtx_init( exec );
+   vbo_exec_vtx_init(exec);
 
    ctx->Driver.NeedFlush = 0;
    ctx->Driver.CurrentExecPrimitive = PRIM_OUTSIDE_BEGIN_END;
 
-   vbo_exec_invalidate_state( ctx, ~0 );
+   /* The aelt_context state should still be dirty from its creation */
+   assert(_ae_is_state_dirty(ctx));
+
+   exec->array.recalculate_inputs = GL_TRUE;
+   exec->eval.recalculate_maps = GL_TRUE;
 }
 
 
@@ -69,27 +70,6 @@
 
 
 /**
- * Really want to install these callbacks to a central facility to be
- * invoked according to the state flags.  That will have to wait for a
- * mesa rework:
- */ 
-void vbo_exec_invalidate_state( struct gl_context *ctx, GLbitfield new_state )
-{
-   struct vbo_context *vbo = vbo_context(ctx);
-   struct vbo_exec_context *exec = &vbo->exec;
-
-   if (!exec->validating && new_state & (_NEW_PROGRAM|_NEW_ARRAY)) {
-      exec->array.recalculate_inputs = GL_TRUE;
-   }
-
-   if (new_state & _NEW_EVAL)
-      exec->eval.recalculate_maps = GL_TRUE;
-
-   _ae_invalidate_state(ctx, new_state);
-}
-
-
-/**
  * Figure out the number of transform feedback primitives that will be output
  * considering the drawing mode, number of vertices, and instance count,
  * assuming that no geometry shading is done and primitive restart is not
diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h
index 9358ca2..f1e3881 100644
--- a/src/mesa/vbo/vbo_exec.h
+++ b/src/mesa/vbo/vbo_exec.h
@@ -147,7 +147,6 @@
  */
 void vbo_exec_init( struct gl_context *ctx );
 void vbo_exec_destroy( struct gl_context *ctx );
-void vbo_exec_invalidate_state( struct gl_context *ctx, GLbitfield new_state );
 
 
 /* Internal functions:
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index f08fd4c..019f986 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -176,11 +176,16 @@
        */
       GLfloat *current = (GLfloat *)vbo->currval[i].Ptr;
       fi_type tmp[8]; /* space for doubles */
-      int dmul = exec->vtx.attrtype[i] == GL_DOUBLE ? 2 : 1;
+      int dmul = 1;
+
+      if (exec->vtx.attrtype[i] == GL_DOUBLE ||
+          exec->vtx.attrtype[i] == GL_UNSIGNED_INT64_ARB)
+         dmul = 2;
 
       assert(exec->vtx.attrsz[i]);
 
-      if (exec->vtx.attrtype[i] == GL_DOUBLE) {
+      if (exec->vtx.attrtype[i] == GL_DOUBLE ||
+          exec->vtx.attrtype[i] == GL_UNSIGNED_INT64_ARB) {
          memset(tmp, 0, sizeof(tmp));
          memcpy(tmp, exec->vtx.attrptr[i], exec->vtx.attrsz[i] * sizeof(GLfloat));
       } else {
@@ -241,7 +246,8 @@
    GLint i;
 
    for (i = VBO_ATTRIB_POS + 1; i < VBO_ATTRIB_MAX; i++) {
-      if (exec->vtx.attrtype[i] == GL_DOUBLE) {
+      if (exec->vtx.attrtype[i] == GL_DOUBLE ||
+          exec->vtx.attrtype[i] == GL_UNSIGNED_INT64_ARB) {
          memcpy(exec->vtx.attrptr[i], vbo->currval[i].Ptr,
                 exec->vtx.attrsz[i] * sizeof(GLfloat));
       } else {
@@ -1089,6 +1095,9 @@
    vfmt->VertexAttribL2dv = vbo_VertexAttribL2dv;
    vfmt->VertexAttribL3dv = vbo_VertexAttribL3dv;
    vfmt->VertexAttribL4dv = vbo_VertexAttribL4dv;
+
+   vfmt->VertexAttribL1ui64ARB = vbo_VertexAttribL1ui64ARB;
+   vfmt->VertexAttribL1ui64vARB = vbo_VertexAttribL1ui64vARB;
 }
 
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index e5eeae4..21d14a8 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -180,6 +180,60 @@
 
 
 /**
+ * Check if we should skip the draw call even after validation was successful.
+ */
+static bool
+skip_validated_draw(struct gl_context *ctx)
+{
+   switch (ctx->API) {
+   case API_OPENGLES2:
+      /* For ES2, we can draw if we have a vertex program/shader. */
+      return ctx->VertexProgram._Current == NULL;
+
+   case API_OPENGLES:
+      /* For OpenGL ES, only draw if we have vertex positions
+       */
+      if (!ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_POS].Enabled)
+         return true;
+      break;
+
+   case API_OPENGL_CORE:
+      /* Section 7.3 (Program Objects) of the OpenGL 4.5 Core Profile spec
+       * says:
+       *
+       *     "If there is no active program for the vertex or fragment shader
+       *     stages, the results of vertex and/or fragment processing will be
+       *     undefined. However, this is not an error."
+       *
+       * The fragment shader is not tested here because other state (e.g.,
+       * GL_RASTERIZER_DISCARD) affects whether or not we actually care.
+       */
+      return ctx->VertexProgram._Current == NULL;
+
+   case API_OPENGL_COMPAT:
+      if (ctx->VertexProgram._Current != NULL) {
+         /* Draw regardless of whether or not we have any vertex arrays.
+          * (Ex: could draw a point using a constant vertex pos)
+          */
+         return false;
+      } else {
+         /* Draw if we have vertex positions (GL_VERTEX_ARRAY or generic
+          * array [0]).
+          */
+         return (!ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_POS].Enabled &&
+                 !ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_GENERIC0].Enabled);
+      }
+      break;
+
+   default:
+      unreachable("Invalid API value in check_valid_to_render()");
+   }
+
+   return false;
+}
+
+
+/**
  * Print info/data for glDrawArrays(), for debugging.
  */
 static void
@@ -408,25 +462,28 @@
                 GLuint drawID)
 {
    struct vbo_context *vbo = vbo_context(ctx);
-   struct _mesa_prim prim[2];
+   struct _mesa_prim prim;
+
+   if (skip_validated_draw(ctx))
+      return;
 
    vbo_bind_arrays(ctx);
 
    /* OpenGL 4.5 says that primitive restart is ignored with non-indexed
     * draws.
     */
-   memset(prim, 0, sizeof(prim));
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].mode = mode;
-   prim[0].num_instances = numInstances;
-   prim[0].base_instance = baseInstance;
-   prim[0].draw_id = drawID;
-   prim[0].is_indirect = 0;
-   prim[0].start = start;
-   prim[0].count = count;
+   memset(&prim, 0, sizeof(prim));
+   prim.begin = 1;
+   prim.end = 1;
+   prim.mode = mode;
+   prim.num_instances = numInstances;
+   prim.base_instance = baseInstance;
+   prim.draw_id = drawID;
+   prim.is_indirect = 0;
+   prim.start = start;
+   prim.count = count;
 
-   vbo->draw_prims(ctx, prim, 1, NULL,
+   vbo->draw_prims(ctx, &prim, 1, NULL,
                    GL_TRUE, start, start + count - 1, NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) {
@@ -568,8 +625,15 @@
       _mesa_debug(ctx, "glDrawArrays(%s, %d, %d)\n",
                   _mesa_enum_to_string(mode), start, count);
 
-   if (!_mesa_validate_DrawArrays(ctx, mode, count))
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
+
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawArrays(ctx, mode, count))
+         return;
+   }
 
    if (0)
       check_draw_arrays_data(ctx, start, count);
@@ -595,9 +659,17 @@
       _mesa_debug(ctx, "glDrawArraysInstanced(%s, %d, %d, %d)\n",
                   _mesa_enum_to_string(mode), start, count, numInstances);
 
-   if (!_mesa_validate_DrawArraysInstanced(ctx, mode, start, count,
-                                           numInstances))
-      return;
+
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
+
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawArraysInstanced(ctx, mode, start, count,
+                                              numInstances))
+         return;
+   }
 
    if (0)
       check_draw_arrays_data(ctx, start, count);
@@ -625,9 +697,16 @@
                   _mesa_enum_to_string(mode), first, count,
                   numInstances, baseInstance);
 
-   if (!_mesa_validate_DrawArraysInstanced(ctx, mode, first, count,
-                                           numInstances))
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
+
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawArraysInstanced(ctx, mode, first, count,
+                                              numInstances))
+         return;
+   }
 
    if (0)
       check_draw_arrays_data(ctx, first, count);
@@ -739,6 +818,25 @@
 #endif
 
 
+static bool
+skip_draw_elements(struct gl_context *ctx, GLsizei count,
+                   const GLvoid *indices)
+{
+   if (count == 0)
+      return true;
+
+   /* Not using a VBO for indices, so avoid NULL pointer derefs later.
+    */
+   if (!_mesa_is_bufferobj(ctx->Array.VAO->IndexBufferObj) && indices == NULL)
+      return true;
+
+   if (skip_validated_draw(ctx))
+      return true;
+
+   return false;
+}
+
+
 /**
  * Inner support for both _mesa_DrawElements and _mesa_DrawRangeElements.
  * Do the rendering for a glDrawElements or glDrawRangeElements call after
@@ -755,28 +853,36 @@
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct _mesa_index_buffer ib;
-   struct _mesa_prim prim[1];
+   struct _mesa_prim prim;
+
+   if (!index_bounds_valid) {
+      assert(start == 0u);
+      assert(end == ~0u);
+   }
+
+   if (skip_draw_elements(ctx, count, indices))
+      return;
 
    vbo_bind_arrays(ctx);
 
    ib.count = count;
-   ib.type = type;
+   ib.index_size = vbo_sizeof_ib_type(type);
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = indices;
 
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].weak = 0;
-   prim[0].pad = 0;
-   prim[0].mode = mode;
-   prim[0].start = 0;
-   prim[0].count = count;
-   prim[0].indexed = 1;
-   prim[0].is_indirect = 0;
-   prim[0].basevertex = basevertex;
-   prim[0].num_instances = numInstances;
-   prim[0].base_instance = baseInstance;
-   prim[0].draw_id = 0;
+   prim.begin = 1;
+   prim.end = 1;
+   prim.weak = 0;
+   prim.pad = 0;
+   prim.mode = mode;
+   prim.start = 0;
+   prim.count = count;
+   prim.indexed = 1;
+   prim.is_indirect = 0;
+   prim.basevertex = basevertex;
+   prim.num_instances = numInstances;
+   prim.base_instance = baseInstance;
+   prim.draw_id = 0;
 
    /* Need to give special consideration to rendering a range of
     * indices starting somewhere above zero.  Typically the
@@ -809,7 +915,7 @@
     * for the latter case elsewhere.
     */
 
-   vbo->draw_prims(ctx, prim, 1, &ib,
+   vbo->draw_prims(ctx, &prim, 1, &ib,
                    index_bounds_valid, start, end, NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) {
@@ -842,9 +948,16 @@
                   _mesa_enum_to_string(mode), start, end, count,
                   _mesa_enum_to_string(type), indices, basevertex);
 
-   if (!_mesa_validate_DrawRangeElements(ctx, mode, start, end, count,
-                                         type, indices))
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
+
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawRangeElements(ctx, mode, start, end, count,
+                                            type, indices))
+         return;
+   }
 
    if ((int) end + basevertex < 0 || start + basevertex >= max_element) {
       /* The application requested we draw using a range of indices that's
@@ -898,6 +1011,11 @@
    (void) check_draw_elements_data;
 #endif
 
+   if (!index_bounds_valid) {
+      start = 0;
+      end = ~0;
+   }
+
    vbo_validated_drawrangeelements(ctx, mode, index_bounds_valid, start, end,
                                    count, type, indices, basevertex, 1, 0);
 }
@@ -937,10 +1055,17 @@
                   _mesa_enum_to_string(mode), count,
                   _mesa_enum_to_string(type), indices);
 
-   if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices))
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
 
-   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0,
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices))
+         return;
+   }
+
+   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, 0, ~0,
                                    count, type, indices, 0, 1, 0);
 }
 
@@ -955,14 +1080,21 @@
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
-      _mesa_debug(ctx, "glDrawElementsBaseVertex(%s, %d, %s, %p, %d)\n",
+      _mesa_debug(ctx, "glDrawElements(%s, %u, %s, %p)\n",
                   _mesa_enum_to_string(mode), count,
-                  _mesa_enum_to_string(type), indices, basevertex);
+                  _mesa_enum_to_string(type), indices);
 
-   if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices))
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
 
-   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0,
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices))
+         return;
+   }
+
+   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, 0, ~0,
                                    count, type, indices, basevertex, 1, 0);
 }
 
@@ -977,15 +1109,22 @@
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
-      _mesa_debug(ctx, "glDrawElementsInstanced(%s, %d, %s, %p, %d)\n",
+      _mesa_debug(ctx, "glDrawElements(%s, %u, %s, %p)\n",
                   _mesa_enum_to_string(mode), count,
-                  _mesa_enum_to_string(type), indices, numInstances);
+                  _mesa_enum_to_string(type), indices);
 
-   if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
-                                             numInstances))
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
 
-   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0,
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type,
+                                                indices, numInstances))
+         return;
+   }
+
+   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, 0, ~0,
                                    count, type, indices, 0, numInstances, 0);
 }
 
@@ -1003,16 +1142,24 @@
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx,
-                  "glDrawElementsInstancedBaseVertex(%s, %d, %s, %p, %d; %d)\n",
+                  "glDrawElementsInstancedBaseVertex"
+                  "(%s, %d, %s, %p, %d; %d)\n",
                   _mesa_enum_to_string(mode), count,
                   _mesa_enum_to_string(type), indices,
                   numInstances, basevertex);
 
-   if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
-                                             numInstances))
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
 
-   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0,
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type,
+                                                indices, numInstances))
+         return;
+   }
+
+   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, 0, ~0,
                                    count, type, indices,
                                    basevertex, numInstances, 0);
 }
@@ -1038,11 +1185,18 @@
                   _mesa_enum_to_string(type), indices,
                   numInstances, baseInstance);
 
-   if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
-                                             numInstances))
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
 
-   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0,
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type,
+                                                indices, numInstances))
+         return;
+   }
+
+   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, 0, ~0,
                                    count, type, indices, 0, numInstances,
                                    baseInstance);
 }
@@ -1070,11 +1224,18 @@
                   _mesa_enum_to_string(type), indices,
                   numInstances, basevertex, baseInstance);
 
-   if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
-                                             numInstances))
-      return;
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
 
-   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0,
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type,
+                                                indices, numInstances))
+         return;
+   }
+
+   vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, 0, ~0,
                                    count, type, indices, basevertex,
                                    numInstances, baseInstance);
 }
@@ -1153,7 +1314,7 @@
 
    if (!fallback) {
       ib.count = (max_index_ptr - min_index_ptr) / index_type_size;
-      ib.type = type;
+      ib.index_size = vbo_sizeof_ib_type(type);
       ib.obj = ctx->Array.VAO->IndexBufferObj;
       ib.ptr = (void *) min_index_ptr;
 
@@ -1178,7 +1339,7 @@
       }
 
       vbo->draw_prims(ctx, prim, primcount, &ib,
-                      false, ~0, ~0, NULL, 0, NULL);
+                      false, 0, ~0, NULL, 0, NULL);
    }
    else {
       /* render one prim at a time */
@@ -1186,7 +1347,7 @@
          if (count[i] == 0)
             continue;
          ib.count = count[i];
-         ib.type = type;
+         ib.index_size = vbo_sizeof_ib_type(type);
          ib.obj = ctx->Array.VAO->IndexBufferObj;
          ib.ptr = indices[i];
 
@@ -1207,7 +1368,7 @@
          else
             prim[0].basevertex = 0;
 
-         vbo->draw_prims(ctx, prim, 1, &ib, false, ~0, ~0, NULL, 0, NULL);
+         vbo->draw_prims(ctx, prim, 1, &ib, false, 0, ~0, NULL, 0, NULL);
       }
    }
 
@@ -1230,6 +1391,9 @@
                                          primcount))
       return;
 
+   if (skip_validated_draw(ctx))
+      return;
+
    vbo_validated_multidrawelements(ctx, mode, count, type, indices, primcount,
                                    NULL);
 }
@@ -1248,6 +1412,9 @@
                                          primcount))
       return;
 
+   if (skip_validated_draw(ctx))
+      return;
+
    vbo_validated_multidrawelements(ctx, mode, count, type, indices, primcount,
                                    basevertex);
 }
@@ -1259,7 +1426,7 @@
                             GLuint stream, GLuint numInstances)
 {
    struct vbo_context *vbo = vbo_context(ctx);
-   struct _mesa_prim prim[2];
+   struct _mesa_prim prim;
 
    if (!_mesa_validate_DrawTransformFeedback(ctx, mode, obj, stream,
                                              numInstances)) {
@@ -1275,22 +1442,25 @@
       return;
    }
 
+   if (skip_validated_draw(ctx))
+      return;
+
    vbo_bind_arrays(ctx);
 
    /* init most fields to zero */
-   memset(prim, 0, sizeof(prim));
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].mode = mode;
-   prim[0].num_instances = numInstances;
-   prim[0].base_instance = 0;
-   prim[0].is_indirect = 0;
+   memset(&prim, 0, sizeof(prim));
+   prim.begin = 1;
+   prim.end = 1;
+   prim.mode = mode;
+   prim.num_instances = numInstances;
+   prim.base_instance = 0;
+   prim.is_indirect = 0;
 
    /* Maybe we should do some primitive splitting for primitive restart
     * (like in DrawArrays), but we have no way to know how many vertices
     * will be rendered. */
 
-   vbo->draw_prims(ctx, prim, 1, NULL, GL_FALSE, ~0, ~0, obj, stream, NULL);
+   vbo->draw_prims(ctx, &prim, 1, NULL, GL_FALSE, 0, ~0, obj, stream, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) {
       _mesa_flush(ctx);
@@ -1421,7 +1591,7 @@
    vbo_bind_arrays(ctx);
 
    ib.count = 0;                /* unknown */
-   ib.type = type;
+   ib.index_size = vbo_sizeof_ib_type(type);
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = NULL;
 
@@ -1453,7 +1623,7 @@
    /* NOTE: IndexBufferObj is guaranteed to be a VBO. */
 
    ib.count = 0;                /* unknown */
-   ib.type = type;
+   ib.index_size = vbo_sizeof_ib_type(type);
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = NULL;
 
@@ -1479,7 +1649,17 @@
       _mesa_debug(ctx, "glDrawArraysIndirect(%s, %p)\n",
                   _mesa_enum_to_string(mode), indirect);
 
-   if (!_mesa_validate_DrawArraysIndirect(ctx, mode, indirect))
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
+
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawArraysIndirect(ctx, mode, indirect))
+         return;
+   }
+
+   if (skip_validated_draw(ctx))
       return;
 
    vbo_validated_drawarraysindirect(ctx, mode, indirect);
@@ -1496,7 +1676,17 @@
                   _mesa_enum_to_string(mode),
                   _mesa_enum_to_string(type), indirect);
 
-   if (!_mesa_validate_DrawElementsIndirect(ctx, mode, type, indirect))
+   if (_mesa_is_no_error_enabled(ctx)) {
+      FLUSH_CURRENT(ctx, 0);
+
+      if (ctx->NewState)
+         _mesa_update_state(ctx);
+   } else {
+      if (!_mesa_validate_DrawElementsIndirect(ctx, mode, type, indirect))
+         return;
+   }
+
+   if (skip_validated_draw(ctx))
       return;
 
    vbo_validated_drawelementsindirect(ctx, mode, type, indirect);
@@ -1521,6 +1711,9 @@
                                                primcount, stride))
       return;
 
+   if (skip_validated_draw(ctx))
+      return;
+
    vbo_validated_multidrawarraysindirect(ctx, mode, indirect,
                                          primcount, stride);
 }
@@ -1546,6 +1739,9 @@
                                                  primcount, stride))
       return;
 
+   if (skip_validated_draw(ctx))
+      return;
+
    vbo_validated_multidrawelementsindirect(ctx, mode, type, indirect,
                                            primcount, stride);
 }
@@ -1597,7 +1793,7 @@
    /* NOTE: IndexBufferObj is guaranteed to be a VBO. */
 
    ib.count = 0;                /* unknown */
-   ib.type = type;
+   ib.index_size = vbo_sizeof_ib_type(type);
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = NULL;
 
@@ -1634,6 +1830,9 @@
                                                     maxdrawcount, stride))
       return;
 
+   if (skip_validated_draw(ctx))
+      return;
+
    vbo_validated_multidrawarraysindirectcount(ctx, mode, indirect, drawcount,
                                               maxdrawcount, stride);
 }
@@ -1662,6 +1861,9 @@
                                                       maxdrawcount, stride))
       return;
 
+   if (skip_validated_draw(ctx))
+      return;
+
    vbo_validated_multidrawelementsindirectcount(ctx, mode, type, indirect,
                                                 drawcount, maxdrawcount,
                                                 stride);
diff --git a/src/mesa/vbo/vbo_minmax_index.c b/src/mesa/vbo/vbo_minmax_index.c
index 0f75a87..1377926 100644
--- a/src/mesa/vbo/vbo_minmax_index.c
+++ b/src/mesa/vbo/vbo_minmax_index.c
@@ -38,7 +38,7 @@
 struct minmax_cache_key {
    GLintptr offset;
    GLuint count;
-   GLenum type;
+   unsigned index_size;
 };
 
 
@@ -60,7 +60,8 @@
 vbo_minmax_cache_key_equal(const struct minmax_cache_key *a,
                            const struct minmax_cache_key *b)
 {
-   return (a->offset == b->offset) && (a->count == b->count) && (a->type == b->type);
+   return (a->offset == b->offset) && (a->count == b->count) &&
+          (a->index_size == b->index_size);
 }
 
 
@@ -101,7 +102,7 @@
 
 static GLboolean
 vbo_get_minmax_cached(struct gl_buffer_object *bufferObj,
-                      GLenum type, GLintptr offset, GLuint count,
+                      unsigned index_size, GLintptr offset, GLuint count,
                       GLuint *min_index, GLuint *max_index)
 {
    GLboolean found = GL_FALSE;
@@ -137,7 +138,7 @@
       goto out_invalidate;
    }
 
-   key.type = type;
+   key.index_size = index_size;
    key.offset = offset;
    key.count = count;
    hash = vbo_minmax_cache_hash(&key);
@@ -173,7 +174,7 @@
 static void
 vbo_minmax_cache_store(struct gl_context *ctx,
                        struct gl_buffer_object *bufferObj,
-                       GLenum type, GLintptr offset, GLuint count,
+                       unsigned index_size, GLintptr offset, GLuint count,
                        GLuint min, GLuint max)
 {
    struct minmax_cache_entry *entry;
@@ -200,7 +201,7 @@
 
    entry->key.offset = offset;
    entry->key.count = count;
-   entry->key.type = type;
+   entry->key.index_size = index_size;
    entry->min = min;
    entry->max = max;
    hash = vbo_minmax_cache_hash(&entry->key);
@@ -240,26 +241,28 @@
                      const GLuint count)
 {
    const GLboolean restart = ctx->Array._PrimitiveRestart;
-   const GLuint restartIndex = _mesa_primitive_restart_index(ctx, ib->type);
-   const int index_size = vbo_sizeof_ib_type(ib->type);
+   const GLuint restartIndex =
+      _mesa_primitive_restart_index(ctx, ib->index_size);
    const char *indices;
    GLuint i;
+   GLintptr offset = 0;
 
-   indices = (char *) ib->ptr + prim->start * index_size;
+   indices = (char *) ib->ptr + prim->start * ib->index_size;
    if (_mesa_is_bufferobj(ib->obj)) {
-      GLsizeiptr size = MIN2(count * index_size, ib->obj->Size);
+      GLsizeiptr size = MIN2(count * ib->index_size, ib->obj->Size);
 
-      if (vbo_get_minmax_cached(ib->obj, ib->type, (GLintptr) indices, count,
-                                min_index, max_index))
+      if (vbo_get_minmax_cached(ib->obj, ib->index_size, (GLintptr) indices,
+                                count, min_index, max_index))
          return;
 
-      indices = ctx->Driver.MapBufferRange(ctx, (GLintptr) indices, size,
+      offset = (GLintptr) indices;
+      indices = ctx->Driver.MapBufferRange(ctx, offset, size,
                                            GL_MAP_READ_BIT, ib->obj,
                                            MAP_INTERNAL);
    }
 
-   switch (ib->type) {
-   case GL_UNSIGNED_INT: {
+   switch (ib->index_size) {
+   case 4: {
       const GLuint *ui_indices = (const GLuint *)indices;
       GLuint max_ui = 0;
       GLuint min_ui = ~0U;
@@ -287,7 +290,7 @@
       *max_index = max_ui;
       break;
    }
-   case GL_UNSIGNED_SHORT: {
+   case 2: {
       const GLushort *us_indices = (const GLushort *)indices;
       GLuint max_us = 0;
       GLuint min_us = ~0U;
@@ -309,7 +312,7 @@
       *max_index = max_us;
       break;
    }
-   case GL_UNSIGNED_BYTE: {
+   case 1: {
       const GLubyte *ub_indices = (const GLubyte *)indices;
       GLuint max_ub = 0;
       GLuint min_ub = ~0U;
@@ -336,8 +339,8 @@
    }
 
    if (_mesa_is_bufferobj(ib->obj)) {
-      vbo_minmax_cache_store(ctx, ib->obj, ib->type, prim->start, count,
-                             *min_index, *max_index);
+      vbo_minmax_cache_store(ctx, ib->obj, ib->index_size, offset,
+                             count, *min_index, *max_index);
       ctx->Driver.UnmapBuffer(ctx, ib->obj, MAP_INTERNAL);
    }
 }
diff --git a/src/mesa/vbo/vbo_primitive_restart.c b/src/mesa/vbo/vbo_primitive_restart.c
index 0662c5c..8f04def 100644
--- a/src/mesa/vbo/vbo_primitive_restart.c
+++ b/src/mesa/vbo/vbo_primitive_restart.c
@@ -175,7 +175,7 @@
    GLuint sub_prim_num;
    GLuint end_index;
    GLuint sub_end_index;
-   GLuint restart_index = _mesa_primitive_restart_index(ctx, ib->type);
+   GLuint restart_index = _mesa_primitive_restart_index(ctx, ib->index_size);
    struct _mesa_prim temp_prim;
    struct vbo_context *vbo = vbo_context(ctx);
    vbo_draw_func draw_prims_func = vbo->draw_prims;
@@ -226,7 +226,7 @@
 
    ptr = ADD_POINTERS(ib->obj->Mappings[MAP_INTERNAL].Pointer, ib->ptr);
 
-   sub_prims = find_sub_primitives(ptr, vbo_sizeof_ib_type(ib->type),
+   sub_prims = find_sub_primitives(ptr, ib->index_size,
                                    0, ib->count, restart_index,
                                    &num_sub_prims);
 
diff --git a/src/mesa/vbo/vbo_rebase.c b/src/mesa/vbo/vbo_rebase.c
index f40c59f..9f5dc46 100644
--- a/src/mesa/vbo/vbo_rebase.c
+++ b/src/mesa/vbo/vbo_rebase.c
@@ -182,14 +182,14 @@
       /* Some users might prefer it if we translated elements to
        * GLuints here.  Others wouldn't...
        */
-      switch (ib->type) {
-      case GL_UNSIGNED_INT: 
+      switch (ib->index_size) {
+      case 4:
 	 tmp_indices = rebase_GLuint( ptr, ib->count, min_index );
 	 break;
-      case GL_UNSIGNED_SHORT: 
+      case 2:
 	 tmp_indices = rebase_GLushort( ptr, ib->count, min_index );
 	 break;
-      case GL_UNSIGNED_BYTE: 
+      case 1:
 	 tmp_indices = rebase_GLubyte( ptr, ib->count, min_index );
 	 break;
       }      
@@ -204,7 +204,7 @@
       tmp_ib.obj = ctx->Shared->NullBufferObj;
       tmp_ib.ptr = tmp_indices;
       tmp_ib.count = ib->count;
-      tmp_ib.type = ib->type;
+      tmp_ib.index_size = ib->index_size;
 
       ib = &tmp_ib;
    }
diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index ad54c3b..aab5f54 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -1035,7 +1035,7 @@
  * Called when a glBegin is getting compiled into a display list.
  * Updating of ctx->Driver.CurrentSavePrimitive is already taken care of.
  */
-GLboolean
+void
 vbo_save_NotifyBegin(struct gl_context *ctx, GLenum mode)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
@@ -1064,11 +1064,6 @@
 
    /* We need to call vbo_save_SaveFlushVertices() if there's state change */
    ctx->Driver.SaveNeedFlush = GL_TRUE;
-
-   /* GL_TRUE means we've handled this glBegin here; don't compile a BEGIN
-    * opcode into the display list.
-    */
-   return GL_TRUE;
 }
 
 
@@ -1113,13 +1108,23 @@
 static void GLAPIENTRY
 _save_PrimitiveRestartNV(void)
 {
-   GLenum curPrim;
    GET_CURRENT_CONTEXT(ctx);
+   struct vbo_save_context *save = &vbo_context(ctx)->save;
 
-   curPrim = ctx->Driver.CurrentSavePrimitive;
+   if (save->prim_count == 0) {
+      /* We're not inside a glBegin/End pair, so calling glPrimitiveRestartNV
+       * is an error.
+       */
+      _mesa_compile_error(ctx, GL_INVALID_OPERATION,
+                          "glPrimitiveRestartNV called outside glBegin/End");
+   } else {
+      /* get current primitive mode */
+      GLenum curPrim = save->prim[save->prim_count - 1].mode;
 
-   _save_End();
-   _save_Begin(curPrim);
+      /* restart primitive */
+      CALL_End(GET_DISPATCH(), ());
+      vbo_save_NotifyBegin(ctx, curPrim);
+   }
 }
 
 
@@ -1490,6 +1495,9 @@
    vfmt->VertexAttribL3dv = _save_VertexAttribL3dv;
    vfmt->VertexAttribL4dv = _save_VertexAttribL4dv;
 
+   vfmt->VertexAttribL1ui64ARB = _save_VertexAttribL1ui64ARB;
+   vfmt->VertexAttribL1ui64vARB = _save_VertexAttribL1ui64vARB;
+
    /* This will all require us to fallback to saving the list as opcodes:
     */
    vfmt->CallList = _save_CallList;
diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index e718f29..8a4b659 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -293,8 +293,10 @@
 	 _mesa_update_state( ctx );
 
       /* XXX also need to check if shader enabled, but invalid */
-      if ((ctx->VertexProgram.Enabled && !ctx->VertexProgram._Enabled) ||
-          (ctx->FragmentProgram.Enabled && !ctx->FragmentProgram._Enabled)) {
+      if ((ctx->VertexProgram.Enabled &&
+           !_mesa_arb_vertex_program_enabled(ctx)) ||
+          (ctx->FragmentProgram.Enabled &&
+           !_mesa_arb_fragment_program_enabled(ctx))) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glBegin (invalid vertex/fragment program)");
          return;
diff --git a/src/mesa/vbo/vbo_save_loopback.c b/src/mesa/vbo/vbo_save_loopback.c
index 7410f18..1dae91b 100644
--- a/src/mesa/vbo/vbo_save_loopback.c
+++ b/src/mesa/vbo/vbo_save_loopback.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2005 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #include <stdio.h>
@@ -37,33 +37,41 @@
 #include "vbo_context.h"
 
 
-typedef void (*attr_func)( struct gl_context *ctx, GLint target, const GLfloat * );
+typedef void (*attr_func)(struct gl_context *ctx, GLint index, const GLfloat *);
 
 
 /* This file makes heavy use of the aliasing of NV vertex attributes
  * with the legacy attributes, and also with ARB and Material
  * attributes as currently implemented.
  */
-static void VertexAttrib1fvNV(struct gl_context *ctx, GLint target, const GLfloat *v)
+static void
+VertexAttrib1fvNV(struct gl_context *ctx, GLint index, const GLfloat *v)
 {
-   CALL_VertexAttrib1fvNV(ctx->Exec, (target, v));
+   CALL_VertexAttrib1fvNV(ctx->Exec, (index, v));
 }
 
-static void VertexAttrib2fvNV(struct gl_context *ctx, GLint target, const GLfloat *v)
+
+static void
+VertexAttrib2fvNV(struct gl_context *ctx, GLint index, const GLfloat *v)
 {
-   CALL_VertexAttrib2fvNV(ctx->Exec, (target, v));
+   CALL_VertexAttrib2fvNV(ctx->Exec, (index, v));
 }
 
-static void VertexAttrib3fvNV(struct gl_context *ctx, GLint target, const GLfloat *v)
+
+static void
+VertexAttrib3fvNV(struct gl_context *ctx, GLint index, const GLfloat *v)
 {
-   CALL_VertexAttrib3fvNV(ctx->Exec, (target, v));
+   CALL_VertexAttrib3fvNV(ctx->Exec, (index, v));
 }
 
-static void VertexAttrib4fvNV(struct gl_context *ctx, GLint target, const GLfloat *v)
+
+static void
+VertexAttrib4fvNV(struct gl_context *ctx, GLint index, const GLfloat *v)
 {
-   CALL_VertexAttrib4fvNV(ctx->Exec, (target, v));
+   CALL_VertexAttrib4fvNV(ctx->Exec, (index, v));
 }
 
+
 static attr_func vert_attrfunc[4] = {
    VertexAttrib1fvNV,
    VertexAttrib2fvNV,
@@ -71,22 +79,26 @@
    VertexAttrib4fvNV
 };
 
+
 struct loopback_attr {
-   GLint target;
+   GLint index;
    GLint sz;
    attr_func func;
 };
 
-/* Don't emit ends and begins on wrapped primitives.  Don't replay
+
+/**
+ * Don't emit ends and begins on wrapped primitives.  Don't replay
  * wrapped vertices.  If we get here, it's probably because the
  * precalculated wrapping is wrong.
  */
-static void loopback_prim( struct gl_context *ctx,
-			   const GLfloat *buffer,
-			   const struct _mesa_prim *prim,
-			   GLuint wrap_count,
-			   GLuint vertex_size,
-			   const struct loopback_attr *la, GLuint nr )
+static void
+loopback_prim(struct gl_context *ctx,
+              const GLfloat *buffer,
+              const struct _mesa_prim *prim,
+              GLuint wrap_count,
+              GLuint vertex_size,
+              const struct loopback_attr *la, GLuint nr)
 {
    GLint start = prim->start;
    GLint end = start + prim->count;
@@ -96,14 +108,13 @@
 
    if (0)
       printf("loopback prim %s(%s,%s) verts %d..%d\n",
-	     _mesa_lookup_prim_by_nr(prim->mode),
-	     prim->begin ? "begin" : "..",
-	     prim->end ? "end" : "..",
-	     start, 
-	     end);
+             _mesa_lookup_prim_by_nr(prim->mode),
+             prim->begin ? "begin" : "..",
+             prim->end ? "end" : "..",
+             start, end);
 
    if (prim->begin) {
-      CALL_Begin(GET_DISPATCH(), ( prim->mode ));
+      CALL_Begin(GET_DISPATCH(), (prim->mode));
    }
    else {
       assert(start == 0);
@@ -112,17 +123,17 @@
 
    data = buffer + start * vertex_size;
 
-   for (j = start ; j < end ; j++) {
+   for (j = start; j < end; j++) {
       const GLfloat *tmp = data + la[0].sz;
 
-      for (k = 1 ; k < nr ; k++) {
-	 la[k].func( ctx, la[k].target, tmp );
-	 tmp += la[k].sz;
+      for (k = 1; k < nr; k++) {
+         la[k].func(ctx, la[k].index, tmp);
+         tmp += la[k].sz;
       }
-	 
+
       /* Fire the vertex
        */
-      la[0].func( ctx, VBO_ATTRIB_POS, data );
+      la[0].func(ctx, VBO_ATTRIB_POS, data);
       data = tmp;
    }
 
@@ -131,13 +142,16 @@
    }
 }
 
-/* Primitives generated by DrawArrays/DrawElements/Rectf may be
+
+/**
+ * Primitives generated by DrawArrays/DrawElements/Rectf may be
  * caught here.  If there is no primitive in progress, execute them
  * normally, otherwise need to track and discard the generated
  * primitives.
  */
-static void loopback_weak_prim( struct gl_context *ctx,
-				const struct _mesa_prim *prim )
+static void
+loopback_weak_prim(struct gl_context *ctx,
+                   const struct _mesa_prim *prim)
 {
    /* Use the prim_weak flag to ensure that if this primitive
     * wraps, we don't mistake future vertex_lists for part of the
@@ -153,13 +167,14 @@
 }
 
 
-void vbo_loopback_vertex_list( struct gl_context *ctx,
-			       const GLfloat *buffer,
-			       const GLubyte *attrsz,
-			       const struct _mesa_prim *prim,
-			       GLuint prim_count,
-			       GLuint wrap_count,
-			       GLuint vertex_size)
+void
+vbo_loopback_vertex_list(struct gl_context *ctx,
+                         const GLfloat *buffer,
+                         const GLubyte *attrsz,
+                         const struct _mesa_prim *prim,
+                         GLuint prim_count,
+                         GLuint wrap_count,
+                         GLuint vertex_size)
 {
    struct loopback_attr la[VBO_ATTRIB_MAX];
    GLuint i, nr = 0;
@@ -167,24 +182,21 @@
    /* All Legacy, NV, ARB and Material attributes are routed through
     * the NV attributes entrypoints:
     */
-   for (i = 0 ; i < VBO_ATTRIB_MAX ; i++) {
+   for (i = 0; i < VBO_ATTRIB_MAX; i++) {
       if (attrsz[i]) {
-	 la[nr].target = i;
-	 la[nr].sz = attrsz[i];
-	 la[nr].func = vert_attrfunc[attrsz[i]-1];
-	 nr++;
+         la[nr].index = i;
+         la[nr].sz = attrsz[i];
+         la[nr].func = vert_attrfunc[attrsz[i]-1];
+         nr++;
       }
    }
 
-   for (i = 0 ; i < prim_count ; i++) {
+   for (i = 0; i < prim_count; i++) {
       if ((prim[i].mode & VBO_SAVE_PRIM_WEAK) &&
-          _mesa_inside_begin_end(ctx))
-      {
-	 loopback_weak_prim( ctx, &prim[i] );
-      }
-      else
-      {
-	 loopback_prim( ctx, buffer, &prim[i], wrap_count, vertex_size, la, nr );
+          _mesa_inside_begin_end(ctx)) {
+         loopback_weak_prim(ctx, &prim[i]);
+      } else {
+         loopback_prim(ctx, buffer, &prim[i], wrap_count, vertex_size, la, nr);
       }
    }
 }
diff --git a/src/mesa/vbo/vbo_split_copy.c b/src/mesa/vbo/vbo_split_copy.c
index ce8831d..8e35e44 100644
--- a/src/mesa/vbo/vbo_split_copy.c
+++ b/src/mesa/vbo/vbo_split_copy.c
@@ -479,8 +479,8 @@
             ADD_POINTERS(copy->ib->obj->Mappings[MAP_INTERNAL].Pointer,
                          copy->ib->ptr);
 
-   switch (copy->ib->type) {
-   case GL_UNSIGNED_BYTE:
+   switch (copy->ib->index_size) {
+   case 1:
       copy->translated_elt_buf = malloc(sizeof(GLuint) * copy->ib->count);
       copy->srcelt = copy->translated_elt_buf;
 
@@ -488,7 +488,7 @@
 	 copy->translated_elt_buf[i] = ((const GLubyte *)srcptr)[i];
       break;
 
-   case GL_UNSIGNED_SHORT:
+   case 2:
       copy->translated_elt_buf = malloc(sizeof(GLuint) * copy->ib->count);
       copy->srcelt = copy->translated_elt_buf;
 
@@ -496,7 +496,7 @@
 	 copy->translated_elt_buf[i] = ((const GLushort *)srcptr)[i];
       break;
 
-   case GL_UNSIGNED_INT:
+   case 4:
       copy->translated_elt_buf = NULL;
       copy->srcelt = (const GLuint *)srcptr;
       break;
@@ -551,7 +551,7 @@
     * list:
     */
    copy->dstib.count = 0;	/* duplicates dstelt_nr */
-   copy->dstib.type = GL_UNSIGNED_INT;
+   copy->dstib.index_size = 4;
    copy->dstib.obj = ctx->Shared->NullBufferObj;
    copy->dstib.ptr = copy->dstelt;
 }
diff --git a/src/mesa/vbo/vbo_split_inplace.c b/src/mesa/vbo/vbo_split_inplace.c
index 1430ac9..9c5c288 100644
--- a/src/mesa/vbo/vbo_split_inplace.c
+++ b/src/mesa/vbo/vbo_split_inplace.c
@@ -75,7 +75,7 @@
 
       ib.count = split->max_index - split->min_index + 1;
       ib.ptr = (const void *)((const char *)ib.ptr + 
-                              split->min_index * _mesa_sizeof_type(ib.type));
+                              split->min_index * ib.index_size);
 
       /* Rebase the primitives to save index buffer entries. */
       for (i = 0; i < split->dstprim_nr; i++)
@@ -223,7 +223,7 @@
 	    elts[j] = prim->start + j;
 
 	 ib.count = count;
-	 ib.type = GL_UNSIGNED_INT;
+	 ib.index_size = 4;
 	 ib.obj = split->ctx->Shared->NullBufferObj;
 	 ib.ptr = elts;
 
diff --git a/src/mesa/x86/mmx_blend.S b/src/mesa/x86/mmx_blend.S
index eeaf43e..df736cd 100644
--- a/src/mesa/x86/mmx_blend.S
+++ b/src/mesa/x86/mmx_blend.S
@@ -235,7 +235,7 @@
 TWO(PUNPCKHDQ  ( MA2, MA2 ))                    /*    pa2    |    pa2    |    pa2    |    pa2    */
 
 #define GMB_PACK( MS1, MS2 ) \
-    PACKUSWB   ( MS2, MS1 )			/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */	;\
+    PACKUSWB   ( MS2, MS1 )			/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */	;
 
 #define GMB_STORE(rgba, MSS ) \
 ONE(MOVD       ( MSS, REGIND(rgba) ))		/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */	;\
diff --git a/src/mesa/x86/read_rgba_span_x86.S b/src/mesa/x86/read_rgba_span_x86.S
index 5def1f8..0b94f02 100644
--- a/src/mesa/x86/read_rgba_span_x86.S
+++ b/src/mesa/x86/read_rgba_span_x86.S
@@ -65,7 +65,7 @@
 	movl	(%ebx), %eax ; \
 	bswap	%eax          /* ARGB -> BGRA */ ; \
 	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
-	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
+	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ;
 
 
 /**
diff --git a/src/util/Makefile.am b/src/util/Makefile.am
index f094eb4..a20d55a 100644
--- a/src/util/Makefile.am
+++ b/src/util/Makefile.am
@@ -44,8 +44,12 @@
 	$(MESA_UTIL_FILES) \
 	$(MESA_UTIL_GENERATED_FILES)
 
-libmesautil_la_LIBADD = $(ZLIB_LIBS)
+libmesautil_la_LIBADD = \
+	$(CLOCK_LIB) \
+	$(ZLIB_LIBS) \
+	$(LIBATOMIC_LIBS)
 
+u_atomic_test_LDADD = libmesautil.la
 roundeven_test_LDADD = -lm
 
 check_PROGRAMS = u_atomic_test roundeven_test
diff --git a/src/util/Makefile.sources b/src/util/Makefile.sources
index e905734..8a8c3b3 100644
--- a/src/util/Makefile.sources
+++ b/src/util/Makefile.sources
@@ -37,23 +37,20 @@
 	simple_list.h \
 	slab.c \
 	slab.h \
-	string_to_uint_map.cpp \
-	string_to_uint_map.h \
 	strndup.h \
 	strtod.c \
 	strtod.h \
 	texcompress_rgtc_tmp.h \
 	u_atomic.c \
 	u_atomic.h \
+	u_dynarray.h \
 	u_endian.h \
 	u_queue.c \
 	u_queue.h \
 	u_string.h \
 	u_thread.h \
 	u_vector.c \
-	u_vector.h \
-	vk_alloc.h \
-	vk_util.h
+	u_vector.h
 
 MESA_UTIL_GENERATED_FILES = \
 	format_srgb.c
diff --git a/src/util/bitscan.h b/src/util/bitscan.h
index 7a605e0..611e812 100644
--- a/src/util/bitscan.h
+++ b/src/util/bitscan.h
@@ -136,7 +136,7 @@
 static inline void
 u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
 {
-   if (*mask == ~0llu) {
+   if (*mask == ~0ull) {
       *start = 0;
       *count = 64;
       *mask = 0;
diff --git a/src/util/disk_cache.c b/src/util/disk_cache.c
index cf5d518..b222987 100644
--- a/src/util/disk_cache.c
+++ b/src/util/disk_cache.c
@@ -161,7 +161,8 @@
 }
 
 struct disk_cache *
-disk_cache_create(const char *gpu_name, const char *timestamp)
+disk_cache_create(const char *gpu_name, const char *timestamp,
+                  uint64_t driver_flags)
 {
    void *local;
    struct disk_cache *cache = NULL;
@@ -341,7 +342,7 @@
     * really care about getting things to disk quickly just that it's not
     * blocking other tasks.
     */
-   util_queue_init(&cache->cache_queue, "disk_cache", 32, 1);
+   util_queue_init(&cache->cache_queue, "disk_cache", 32, 1, 0);
 
    /* Create driver id keys */
    size_t ts_size = strlen(timestamp) + 1;
@@ -356,6 +357,9 @@
    size_t ptr_size_size = sizeof(ptr_size);
    cache->driver_keys_blob_size += ptr_size_size;
 
+   size_t driver_flags_size = sizeof(driver_flags);
+   cache->driver_keys_blob_size += driver_flags_size;
+
    cache->driver_keys_blob =
       ralloc_size(cache, cache->driver_keys_blob_size);
    if (!cache->driver_keys_blob)
@@ -365,6 +369,8 @@
    memcpy(cache->driver_keys_blob + ts_size, gpu_name, gpu_name_size);
    memcpy(cache->driver_keys_blob + ts_size + gpu_name_size, &ptr_size,
           ptr_size_size);
+   memcpy(cache->driver_keys_blob + ts_size + gpu_name_size + ptr_size_size,
+          &driver_flags, driver_flags_size);
 
    /* Seed our rand function */
    s_rand_xorshift128plus(cache->seed_xorshift128plus, true);
diff --git a/src/util/disk_cache.h b/src/util/disk_cache.h
index 2bb1cf5..9aade16 100644
--- a/src/util/disk_cache.h
+++ b/src/util/disk_cache.h
@@ -93,7 +93,8 @@
  * assistance in computing SHA-1 signatures.
  */
 struct disk_cache *
-disk_cache_create(const char *gpu_name, const char *timestamp);
+disk_cache_create(const char *gpu_name, const char *timestamp,
+                  uint64_t driver_flags);
 
 /**
  * Destroy a cache object, (freeing all associated resources).
@@ -142,7 +143,7 @@
  * Later this key can be checked with disk_cache_has_key(), (unless the key
  * has been evicted in the interim).
  *
- * Any call to cache_record() may cause an existing, random key to be
+ * Any call to disk_cache_put_key() may cause an existing, random key to be
  * evicted from the cache.
  */
 void
@@ -171,7 +172,8 @@
 #else
 
 static inline struct disk_cache *
-disk_cache_create(const char *gpu_name, const char *timestamp)
+disk_cache_create(const char *gpu_name, const char *timestamp,
+                  uint64_t driver_flags)
 {
    return NULL;
 }
diff --git a/src/util/hash_table.c b/src/util/hash_table.c
index 9e643af..a9d442d 100644
--- a/src/util/hash_table.c
+++ b/src/util/hash_table.c
@@ -47,6 +47,7 @@
 #include "hash_table.h"
 #include "ralloc.h"
 #include "macros.h"
+#include "main/hash.h"
 
 static const uint32_t deleted_key_value;
 
@@ -502,3 +503,149 @@
 {
    return a == b;
 }
+
+/**
+ * Hash table wrapper which supports 64-bit keys.
+ *
+ * TODO: unify all hash table implementations.
+ */
+
+struct hash_key_u64 {
+   uint64_t value;
+};
+
+static uint32_t
+key_u64_hash(const void *key)
+{
+   return _mesa_hash_data(key, sizeof(struct hash_key_u64));
+}
+
+static bool
+key_u64_equals(const void *a, const void *b)
+{
+   const struct hash_key_u64 *aa = a;
+   const struct hash_key_u64 *bb = b;
+
+   return aa->value == bb->value;
+}
+
+struct hash_table_u64 *
+_mesa_hash_table_u64_create(void *mem_ctx)
+{
+   struct hash_table_u64 *ht;
+
+   ht = CALLOC_STRUCT(hash_table_u64);
+   if (!ht)
+      return NULL;
+
+   if (sizeof(void *) == 8) {
+      ht->table = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
+                                          _mesa_key_pointer_equal);
+   } else {
+      ht->table = _mesa_hash_table_create(mem_ctx, key_u64_hash,
+                                          key_u64_equals);
+   }
+
+   if (ht->table)
+      _mesa_hash_table_set_deleted_key(ht->table, uint_key(DELETED_KEY_VALUE));
+
+   return ht;
+}
+
+void
+_mesa_hash_table_u64_destroy(struct hash_table_u64 *ht,
+                             void (*delete_function)(struct hash_entry *entry))
+{
+   if (!ht)
+      return;
+
+   if (ht->deleted_key_data) {
+      if (delete_function) {
+         struct hash_table *table = ht->table;
+         struct hash_entry deleted_entry;
+
+         /* Create a fake entry for the delete function. */
+         deleted_entry.hash = table->key_hash_function(table->deleted_key);
+         deleted_entry.key = table->deleted_key;
+         deleted_entry.data = ht->deleted_key_data;
+
+         delete_function(&deleted_entry);
+      }
+      ht->deleted_key_data = NULL;
+   }
+
+   _mesa_hash_table_destroy(ht->table, delete_function);
+   free(ht);
+}
+
+void
+_mesa_hash_table_u64_insert(struct hash_table_u64 *ht, uint64_t key,
+                            void *data)
+{
+   if (key == DELETED_KEY_VALUE) {
+      ht->deleted_key_data = data;
+      return;
+   }
+
+   if (sizeof(void *) == 8) {
+      _mesa_hash_table_insert(ht->table, (void *)key, data);
+   } else {
+      struct hash_key_u64 *_key = CALLOC_STRUCT(hash_key_u64);
+
+      if (!_key)
+         return;
+      _key->value = key;
+
+      _mesa_hash_table_insert(ht->table, _key, data);
+   }
+}
+
+static struct hash_entry *
+hash_table_u64_search(struct hash_table_u64 *ht, uint64_t key)
+{
+   if (sizeof(void *) == 8) {
+      return _mesa_hash_table_search(ht->table, (void *)key);
+   } else {
+      struct hash_key_u64 _key = { .value = key };
+      return _mesa_hash_table_search(ht->table, &_key);
+   }
+}
+
+void *
+_mesa_hash_table_u64_search(struct hash_table_u64 *ht, uint64_t key)
+{
+   struct hash_entry *entry;
+
+   if (key == DELETED_KEY_VALUE)
+      return ht->deleted_key_data;
+
+   entry = hash_table_u64_search(ht, key);
+   if (!entry)
+      return NULL;
+
+   return entry->data;
+}
+
+void
+_mesa_hash_table_u64_remove(struct hash_table_u64 *ht, uint64_t key)
+{
+   struct hash_entry *entry;
+
+   if (key == DELETED_KEY_VALUE) {
+      ht->deleted_key_data = NULL;
+      return;
+   }
+
+   entry = hash_table_u64_search(ht, key);
+   if (!entry)
+      return;
+
+   if (sizeof(void *) == 8) {
+      _mesa_hash_table_remove(ht->table, entry);
+   } else {
+      struct hash_key *_key = (struct hash_key *)entry->key;
+
+      _mesa_hash_table_remove(ht->table, entry);
+      free(_key);
+   }
+}
diff --git a/src/util/hash_table.h b/src/util/hash_table.h
index b35ee87..cf93913 100644
--- a/src/util/hash_table.h
+++ b/src/util/hash_table.h
@@ -105,7 +105,8 @@
 
 static inline uint32_t _mesa_hash_pointer(const void *pointer)
 {
-   return _mesa_hash_data(&pointer, sizeof(pointer));
+   uintptr_t num = (uintptr_t) pointer;
+   return (uint32_t) ((num >> 2) ^ (num >> 6) ^ (num >> 10) ^ (num >> 14));
 }
 
 enum {
@@ -152,6 +153,31 @@
       callback(entry->key, entry->data, closure);
 }
 
+/**
+ * Hash table wrapper which supports 64-bit keys.
+ */
+struct hash_table_u64 {
+   struct hash_table *table; /* underlying table holding the regular keys */
+   void *deleted_key_data;   /* data for DELETED_KEY_VALUE, kept outside table */
+};
+
+struct hash_table_u64 *
+_mesa_hash_table_u64_create(void *mem_ctx);
+
+void
+_mesa_hash_table_u64_destroy(struct hash_table_u64 *ht,
+                             void (*delete_function)(struct hash_entry *entry));
+
+void
+_mesa_hash_table_u64_insert(struct hash_table_u64 *ht, uint64_t key,
+                            void *data);
+
+void *
+_mesa_hash_table_u64_search(struct hash_table_u64 *ht, uint64_t key);
+
+void
+_mesa_hash_table_u64_remove(struct hash_table_u64 *ht, uint64_t key);
+
 #ifdef __cplusplus
 } /* extern C */
 #endif
diff --git a/src/util/macros.h b/src/util/macros.h
index 6f55ac6..a66f1bf 100644
--- a/src/util/macros.h
+++ b/src/util/macros.h
@@ -136,6 +136,17 @@
 #define MALLOCLIKE
 #endif
 
+/* Forced function inlining */
+#ifndef ALWAYS_INLINE
+#  if defined(__GNUC__) || defined(__clang__)
+#    define ALWAYS_INLINE inline __attribute__((always_inline))
+#  elif defined(_MSC_VER)
+#    define ALWAYS_INLINE __forceinline
+#  else
+#    define ALWAYS_INLINE inline
+#  endif
+#endif
+
 /* Used to optionally mark structures with misaligned elements or size as
  * packed, to trade off performance for space.
  */
@@ -233,8 +244,8 @@
 /** Compute ceiling of integer quotient of A divided by B. */
 #define DIV_ROUND_UP( A, B )  ( (A) % (B) == 0 ? (A)/(B) : (A)/(B)+1 )
 
-/** Clamp X to [MIN,MAX] */
-#define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )
+/** Clamp X to [MIN,MAX].  Turn NaN into MIN, arbitrarily. */
+#define CLAMP( X, MIN, MAX )  ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
 
 /** Minimum of two values: */
 #define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
diff --git a/src/util/ralloc.c b/src/util/ralloc.c
index 7bf192e..821ee72 100644
--- a/src/util/ralloc.c
+++ b/src/util/ralloc.c
@@ -28,11 +28,6 @@
 #include <string.h>
 #include <stdint.h>
 
-/* Android defines SIZE_MAX in limits.h, instead of the standard stdint.h */
-#ifdef ANDROID
-#include <limits.h>
-#endif
-
 /* Some versions of MinGW are missing _vscprintf's declaration, although they
  * still provide the symbol in the import library. */
 #ifdef __MINGW32__
@@ -405,12 +400,7 @@
 bool
 ralloc_strncat(char **dest, const char *str, size_t n)
 {
-   /* Clamp n to the string length */
-   size_t str_length = strlen(str);
-   if (str_length < n)
-      n = str_length;
-
-   return cat(dest, str, n);
+   return cat(dest, str, strnlen(str, n));
 }
 
 char *
diff --git a/src/util/set.c b/src/util/set.c
index 99abefd..3925066 100644
--- a/src/util/set.c
+++ b/src/util/set.c
@@ -45,8 +45,8 @@
  * free to avoid exponential performance degradation as the hash table fills
  */
 
-uint32_t deleted_key_value;
-const void *deleted_key = &deleted_key_value;
+static const uint32_t deleted_key_value;
+static const void *deleted_key = &deleted_key_value;
 
 static const struct {
    uint32_t max_entries, size, rehash;
diff --git a/src/util/slab.c b/src/util/slab.c
index 4264814..4ce0e9a 100644
--- a/src/util/slab.c
+++ b/src/util/slab.c
@@ -140,6 +140,9 @@
  */
 void slab_destroy_child(struct slab_child_pool *pool)
 {
+   if (!pool->parent)
+      return; /* the slab probably wasn't even created */
+
    mtx_lock(&pool->parent->mutex);
 
    while (pool->pages) {
diff --git a/src/util/strtod.c b/src/util/strtod.c
index ea7d395..de695d6 100644
--- a/src/util/strtod.c
+++ b/src/util/strtod.c
@@ -26,12 +26,12 @@
 
 #include <stdlib.h>
 
-#ifdef _GNU_SOURCE
+#if defined(_GNU_SOURCE) && defined(HAVE_STRTOD_L)
 #include <locale.h>
 #ifdef HAVE_XLOCALE_H
 #include <xlocale.h>
-static locale_t loc;
 #endif
+static locale_t loc;
 #endif
 
 #include "strtod.h"
@@ -40,7 +40,7 @@
 void
 _mesa_locale_init(void)
 {
-#if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
+#if defined(_GNU_SOURCE) && defined(HAVE_STRTOD_L)
    loc = newlocale(LC_CTYPE_MASK, "C", NULL);
 #endif
 }
@@ -48,7 +48,7 @@
 void
 _mesa_locale_fini(void)
 {
-#if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
+#if defined(_GNU_SOURCE) && defined(HAVE_STRTOD_L)
    freelocale(loc);
 #endif
 }
@@ -60,7 +60,7 @@
 double
 _mesa_strtod(const char *s, char **end)
 {
-#if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
+#if defined(_GNU_SOURCE) && defined(HAVE_STRTOD_L)
    return strtod_l(s, end, loc);
 #else
    return strtod(s, end);
@@ -75,7 +75,7 @@
 float
 _mesa_strtof(const char *s, char **end)
 {
-#if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
+#if defined(_GNU_SOURCE) && defined(HAVE_STRTOD_L)
    return strtof_l(s, end, loc);
 #elif defined(HAVE_STRTOF)
    return strtof(s, end);
diff --git a/src/util/u_atomic.c b/src/util/u_atomic.c
index 44b75fb..b32527f 100644
--- a/src/util/u_atomic.c
+++ b/src/util/u_atomic.c
@@ -61,6 +61,20 @@
 }
 
 WEAK uint64_t
+__sync_val_compare_and_swap_8(uint64_t *ptr, uint64_t oldval, uint64_t newval)
+{
+   uint64_t previous;
+
+   /* sync_mutex makes the read-compare-write sequence one critical section. */
+   pthread_mutex_lock(&sync_mutex);
+   previous = *ptr;
+   if (previous == oldval)
+      *ptr = newval;
+   pthread_mutex_unlock(&sync_mutex);
+   return previous; /* value seen before any store, as the caller's success test */
+}
+
+WEAK uint64_t
 __atomic_fetch_add_8(uint64_t *ptr, uint64_t val, int memorder)
 {
    return __sync_add_and_fetch(ptr, val);
diff --git a/src/util/u_dynarray.h b/src/util/u_dynarray.h
new file mode 100644
index 0000000..cc31632
--- /dev/null
+++ b/src/util/u_dynarray.h
@@ -0,0 +1,166 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_DYNARRAY_H
+#define U_DYNARRAY_H
+
+#include <stdlib.h>
+#include "ralloc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* A zero-initialized version of this is guaranteed to represent an
+ * empty array.
+ *
+ * Also, size <= capacity and data != 0 if and only if capacity != 0.
+ * capacity will always be the allocation size of data, in bytes.
+ */
+struct util_dynarray
+{
+   void *mem_ctx;     /* ralloc context for data, or NULL to use malloc/free */
+   void *data;        /* start of the storage */
+   unsigned size;     /* bytes currently in use */
+   unsigned capacity; /* bytes currently allocated */
+};
+
+static inline void
+util_dynarray_init(struct util_dynarray *buf, void *mem_ctx)
+{
+   memset(buf, 0, sizeof(*buf)); /* all-zero is the empty array */
+   buf->mem_ctx = mem_ctx; /* non-NULL selects ralloc later on, NULL selects malloc */
+}
+
+static inline void
+util_dynarray_fini(struct util_dynarray *buf)
+{
+   if (!buf->data)
+      return; /* no storage to release */
+   /* Free through whichever allocator the array was set up with. */
+   if (buf->mem_ctx)
+      ralloc_free(buf->data);
+   else
+      free(buf->data);
+   util_dynarray_init(buf, buf->mem_ctx); /* back to empty, same context */
+}
+
+static inline void
+util_dynarray_clear(struct util_dynarray *buf)
+{
+   buf->size = 0; /* only the used size is reset; data and capacity are untouched */
+}
+
+#define DYN_ARRAY_INITIAL_SIZE 64
+
+/* Set size to newsize bytes, growing if needed; use util_dynarray_trim to shrink storage */
+static inline void *
+util_dynarray_resize(struct util_dynarray *buf, unsigned newsize)
+{
+   void *p;
+   if (newsize > buf->capacity) {
+      if (buf->capacity == 0)
+         buf->capacity = DYN_ARRAY_INITIAL_SIZE;
+      /* Grow by doubling. NOTE(review): unsigned overflow is not guarded here. */
+      while (newsize > buf->capacity)
+         buf->capacity *= 2;
+      /* NOTE(review): the allocator's result is stored without a NULL check. */
+      if (buf->mem_ctx) {
+         buf->data = reralloc_size(buf->mem_ctx, buf->data, buf->capacity);
+      } else {
+         buf->data = realloc(buf->data, buf->capacity);
+      }
+   }
+   /* The return value points at the old end of the used bytes. */
+   p = (void *)((char *)buf->data + buf->size);
+   buf->size = newsize;
+
+   return p;
+}
+
+static inline void *
+util_dynarray_grow(struct util_dynarray *buf, int diff)
+{
+   return util_dynarray_resize(buf, buf->size + diff); /* diff is a byte count added to size */
+}
+
+static inline void
+util_dynarray_trim(struct util_dynarray *buf)
+{
+   if (buf->size != buf->capacity) { /* only act when there is slack */
+      if (buf->size) { /* shrink the allocation to exactly size bytes */
+         if (buf->mem_ctx) {
+            buf->data = reralloc_size(buf->mem_ctx, buf->data, buf->size);
+         } else {
+            buf->data = realloc(buf->data, buf->size);
+         }
+         buf->capacity = buf->size;
+      } else { /* nothing in use: release the storage altogether */
+         if (buf->mem_ctx) {
+            ralloc_free(buf->data);
+         } else {
+            free(buf->data);
+         }
+         buf->data = 0;
+         buf->capacity = 0; /* same state as a zero-initialized array */
+      }
+   }
+}
+
+#define util_dynarray_append(buf, type, v) do {type __v = (v); memcpy(util_dynarray_grow((buf), sizeof(type)), &__v, sizeof(type));} while(0)
+#define util_dynarray_top_ptr(buf, type) ((type*)((char*)(buf)->data + (buf)->size - sizeof(type)))
+#define util_dynarray_top(buf, type) (*util_dynarray_top_ptr(buf, type))
+#define util_dynarray_pop_ptr(buf, type) ((type*)((char*)(buf)->data + ((buf)->size -= sizeof(type))))
+#define util_dynarray_pop(buf, type) (*util_dynarray_pop_ptr(buf, type))
+#define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type)) /* room for one type? */
+#define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx)) /* no bounds check */
+#define util_dynarray_begin(buf) ((buf)->data)
+#define util_dynarray_end(buf) ((void*)util_dynarray_element((buf), char, (buf)->size)) /* one past the used bytes */
+/* top/pop expansions are fully parenthesized so postfix ., -> and [] apply to the result. */
+#define util_dynarray_foreach(buf, type, elem) \
+   for (type *elem = (type *)(buf)->data; \
+        elem < (type *)((char *)(buf)->data + (buf)->size); elem++)
+
+#define util_dynarray_delete_unordered(buf, type, v)                    \
+   do { /* overwrite the first match with the last element */          \
+      unsigned num_elements = (buf)->size / sizeof(type);               \
+      unsigned i;                                                       \
+      for (i = 0; i < num_elements; i++) {                              \
+         type __v = *util_dynarray_element((buf), type, (i));           \
+         if ((v) == __v) { /* src may equal dst, hence memmove */       \
+            memmove(util_dynarray_element((buf), type, (i)),            \
+                    util_dynarray_pop_ptr((buf), type), sizeof(type));  \
+            break;                                                      \
+         }                                                              \
+      }                                                                 \
+   } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_DYNARRAY_H */
+
diff --git a/src/util/u_queue.c b/src/util/u_queue.c
index 8db09b0..ca41d05 100644
--- a/src/util/u_queue.c
+++ b/src/util/u_queue.c
@@ -120,6 +120,19 @@
 util_queue_fence_destroy(struct util_queue_fence *fence)
 {
    assert(fence->signalled);
+
+   /* Ensure that another thread is not in the middle of
+    * util_queue_fence_signal (having set the fence to signalled but still
+    * holding the fence mutex).
+    *
+    * A common contract between threads is that as soon as a fence is signalled
+    * by thread A, thread B is allowed to destroy it. Since
+    * util_queue_fence_is_signalled does not lock the fence mutex (for
+    * performance reasons), we must do so here.
+    */
+   mtx_lock(&fence->mutex);
+   mtx_unlock(&fence->mutex);
+
    cnd_destroy(&fence->cond);
    mtx_destroy(&fence->mutex);
 }
@@ -180,13 +193,15 @@
 
    /* signal remaining jobs before terminating */
    mtx_lock(&queue->lock);
-   while (queue->jobs[queue->read_idx].job) {
-      util_queue_fence_signal(queue->jobs[queue->read_idx].fence);
-
-      queue->jobs[queue->read_idx].job = NULL;
-      queue->read_idx = (queue->read_idx + 1) % queue->max_jobs;
+   for (unsigned i = queue->read_idx; i != queue->write_idx;
+        i = (i + 1) % queue->max_jobs) {
+      if (queue->jobs[i].job) {
+         util_queue_fence_signal(queue->jobs[i].fence);
+         queue->jobs[i].job = NULL;
+      }
    }
-   queue->num_queued = 0; /* reset this when exiting the thread */
+   queue->read_idx = queue->write_idx;
+   queue->num_queued = 0;
    mtx_unlock(&queue->lock);
    return 0;
 }
@@ -195,12 +210,14 @@
 util_queue_init(struct util_queue *queue,
                 const char *name,
                 unsigned max_jobs,
-                unsigned num_threads)
+                unsigned num_threads,
+                unsigned flags)
 {
    unsigned i;
 
    memset(queue, 0, sizeof(*queue));
    queue->name = name;
+   queue->flags = flags;
    queue->num_threads = num_threads;
    queue->max_jobs = max_jobs;
 
@@ -240,6 +257,20 @@
             break;
          }
       }
+
+      if (flags & UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY) {
+   #if defined(__linux__)
+         struct sched_param sched_param = {0};
+
+         /* The nice() function can only set a maximum of 19.
+          * SCHED_IDLE is the same as nice = 20.
+          *
+          * Note that Linux only allows decreasing the priority. The original
+          * priority can't be restored.
+          */
+         pthread_setschedparam(queue->threads[i], SCHED_IDLE, &sched_param);
+   #endif
+      }
    }
 
    add_to_atexit_list(queue);
@@ -312,9 +343,39 @@
 
    assert(queue->num_queued >= 0 && queue->num_queued <= queue->max_jobs);
 
-   /* if the queue is full, wait until there is space */
-   while (queue->num_queued == queue->max_jobs)
-      cnd_wait(&queue->has_space_cond, &queue->lock);
+   if (queue->num_queued == queue->max_jobs) {
+      if (queue->flags & UTIL_QUEUE_INIT_RESIZE_IF_FULL) {
+         /* If the queue is full, make it larger to avoid waiting for a free
+          * slot.
+          */
+         unsigned new_max_jobs = queue->max_jobs + 8;
+         struct util_queue_job *jobs =
+            (struct util_queue_job*)calloc(new_max_jobs,
+                                           sizeof(struct util_queue_job));
+         assert(jobs);
+
+         /* Copy all queued jobs into the new list. */
+         unsigned num_jobs = 0;
+         unsigned i = queue->read_idx;
+
+         do {
+            jobs[num_jobs++] = queue->jobs[i];
+            i = (i + 1) % queue->max_jobs;
+         } while (i != queue->write_idx);
+
+         assert(num_jobs == queue->num_queued);
+
+         free(queue->jobs);
+         queue->jobs = jobs;
+         queue->read_idx = 0;
+         queue->write_idx = num_jobs;
+         queue->max_jobs = new_max_jobs;
+      } else {
+         /* Wait until there is a free slot. */
+         while (queue->num_queued == queue->max_jobs)
+            cnd_wait(&queue->has_space_cond, &queue->lock);
+      }
+   }
 
    ptr = &queue->jobs[queue->write_idx];
    assert(ptr->job == NULL);
@@ -329,6 +390,45 @@
    mtx_unlock(&queue->lock);
 }
 
+/**
+ * Remove a queued job. If the job hasn't started execution, it's removed from
+ * the queue. If the job has started execution, the function waits for it to
+ * complete.
+ *
+ * In all cases, the fence is signalled when the function returns.
+ *
+ * The function can be used when destroying an object associated with the job
+ * when you don't care about the job completion state.
+ */
+void
+util_queue_drop_job(struct util_queue *queue, struct util_queue_fence *fence)
+{
+   bool removed = false;
+
+   if (util_queue_fence_is_signalled(fence))
+      return; /* already signalled: nothing to drop or wait for */
+
+   mtx_lock(&queue->lock);
+   for (unsigned i = queue->read_idx; i != queue->write_idx;
+        i = (i + 1) % queue->max_jobs) { /* walk the slots between read and write index */
+      if (queue->jobs[i].fence == fence) {
+         if (queue->jobs[i].cleanup)
+            queue->jobs[i].cleanup(queue->jobs[i].job, -1); /* -1: presumably "no thread" -- TODO confirm */
+
+         /* Just clear it. The threads will treat it as a no-op job. */
+         memset(&queue->jobs[i], 0, sizeof(queue->jobs[i]));
+         removed = true;
+         break;
+      }
+   }
+   mtx_unlock(&queue->lock);
+
+   if (removed)
+      util_queue_fence_signal(fence); /* signalled here, after the queue lock is released */
+   else
+      util_queue_fence_wait(fence); /* not found in the queue: wait on the fence instead */
+}
+
 int64_t
 util_queue_get_thread_time_nano(struct util_queue *queue, unsigned thread_index)
 {
diff --git a/src/util/u_queue.h b/src/util/u_queue.h
index 4aec1f2..ff713ae 100644
--- a/src/util/u_queue.h
+++ b/src/util/u_queue.h
@@ -42,6 +42,9 @@
 extern "C" {
 #endif
 
+#define UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY      (1 << 0)
+#define UTIL_QUEUE_INIT_RESIZE_IF_FULL            (1 << 1)
+
 /* Job completion fence.
  * Put this into your job structure.
  */
@@ -67,6 +70,7 @@
    cnd_t has_queued_cond;
    cnd_t has_space_cond;
    thrd_t *threads;
+   unsigned flags;
    int num_queued;
    unsigned num_threads;
    int kill_threads;
@@ -81,7 +85,8 @@
 bool util_queue_init(struct util_queue *queue,
                      const char *name,
                      unsigned max_jobs,
-                     unsigned num_threads);
+                     unsigned num_threads,
+                     unsigned flags);
 void util_queue_destroy(struct util_queue *queue);
 void util_queue_fence_init(struct util_queue_fence *fence);
 void util_queue_fence_destroy(struct util_queue_fence *fence);
@@ -92,6 +97,8 @@
                         struct util_queue_fence *fence,
                         util_queue_execute_func execute,
                         util_queue_execute_func cleanup);
+void util_queue_drop_job(struct util_queue *queue,
+                         struct util_queue_fence *fence);
 
 void util_queue_fence_wait(struct util_queue_fence *fence);
 int64_t util_queue_get_thread_time_nano(struct util_queue *queue,
@@ -110,6 +117,20 @@
    return fence->signalled != 0;
 }
 
+/* Convenient structure for monitoring the queue externally and passing
+ * the structure between Mesa components. The queue doesn't use it directly.
+ */
+struct util_queue_monitoring
+{
+   /* For querying the thread busyness. */
+   struct util_queue *queue;
+
+   /* Counters updated by the user of the queue. */
+   unsigned num_offloaded_items;
+   unsigned num_direct_items;
+   unsigned num_syncs;
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/util/u_thread.h b/src/util/u_thread.h
index 8eab3a5..6b5458a 100644
--- a/src/util/u_thread.h
+++ b/src/util/u_thread.h
@@ -28,6 +28,7 @@
 #define U_THREAD_H_
 
 #include <stdint.h>
+#include <stdbool.h>
 
 #include "c11/threads.h"
 
@@ -88,4 +89,15 @@
 #endif
 }
 
+static inline bool u_thread_is_self(thrd_t thread)
+{
+#if defined(HAVE_PTHREAD)
+#  if defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
+      (__GLIBC__ >= 3 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 12))
+   return pthread_equal(pthread_self(), thread); /* passes thrd_t where pthread_t is expected */
+#  endif
+#endif
+   return false; /* NOTE(review): without pthreads/glibc >= 2.12 this is false even for the caller */
+}
+
 #endif /* U_THREAD_H_ */
diff --git a/src/util/u_vector.c b/src/util/u_vector.c
index 37c4245..4dc7bc2 100644
--- a/src/util/u_vector.c
+++ b/src/util/u_vector.c
@@ -20,6 +20,9 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */
+
+#include <string.h>
+#include "util/u_math.h"
 #include "util/u_vector.h"
 
 int
diff --git a/src/util/u_vector.h b/src/util/u_vector.h
index c7fcb37..cd8a95d 100644
--- a/src/util/u_vector.h
+++ b/src/util/u_vector.h
@@ -31,7 +31,6 @@
 
 #include <stdint.h>
 #include <stdlib.h>
-#include "util/u_math.h"
 #include "util/macros.h"
 
 /* TODO - move to u_math.h - name it better etc */
@@ -84,7 +83,7 @@
 #define u_vector_foreach(elem, queue)                                  \
    STATIC_ASSERT(__builtin_types_compatible_p(__typeof__(queue), struct u_vector *)); \
    for (uint32_t __u_vector_offset = (queue)->tail;                                \
-        elem = (queue)->data + (__u_vector_offset & ((queue)->size - 1)), __u_vector_offset < (queue)->head; \
+        elem = (void *)((char *)(queue)->data + (__u_vector_offset & ((queue)->size - 1))), __u_vector_offset < (queue)->head; \
         __u_vector_offset += (queue)->element_size)
 
 
diff --git a/src/vulkan/Android.mk b/src/vulkan/Android.mk
index 7653f34..e19a33d 100644
--- a/src/vulkan/Android.mk
+++ b/src/vulkan/Android.mk
@@ -41,6 +41,8 @@
 LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/, \
 	$(VULKAN_UTIL_GENERATED_FILES))
 
+LOCAL_SRC_FILES := $(VULKAN_UTIL_FILES)
+
 vulkan_api_xml = $(MESA_TOP)/src/vulkan/registry/vk.xml
 
 $(LOCAL_GENERATED_SOURCES): $(MESA_TOP)/src/vulkan/util/gen_enum_to_str.py $(vulkan_api_xml)
diff --git a/src/vulkan/Makefile.am b/src/vulkan/Makefile.am
index c48245c..c897a07 100644
--- a/src/vulkan/Makefile.am
+++ b/src/vulkan/Makefile.am
@@ -11,6 +11,10 @@
 EXTRA_DIST = \
 	util/gen_enum_to_str.py
 
+VULKAN_UTIL_SOURCES = \
+	$(VULKAN_UTIL_FILES) \
+	$(VULKAN_UTIL_GENERATED_FILES)
+
 BUILT_SOURCES = \
 	$(VULKAN_UTIL_GENERATED_FILES)
 
@@ -18,12 +22,13 @@
 	$(MKDIR_GEN)
 	$(PYTHON_GEN) $(srcdir)/util/gen_enum_to_str.py --xml $(vulkan_api_xml) --outdir $(top_builddir)/src/vulkan/util
 
-libvulkan_util_la_SOURCES = $(VULKAN_UTIL_GENERATED_FILES)
+libvulkan_util_la_SOURCES = $(VULKAN_UTIL_SOURCES)
 
 AM_CPPFLAGS = \
 	$(DEFINES) \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src \
+	-I$(top_srcdir)/src/vulkan/util \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/gallium/include
 
@@ -31,8 +36,6 @@
 	$(LIBDRM_CFLAGS) \
 	$(VISIBILITY_CFLAGS)
 
-VULKAN_LIB_DEPS =
-
 VULKAN_WSI_SOURCES = \
 	$(VULKAN_WSI_FILES)
 
@@ -43,8 +46,6 @@
 	-DVK_USE_PLATFORM_XLIB_KHR
 
 VULKAN_WSI_SOURCES += $(VULKAN_WSI_X11_FILES)
-
-VULKAN_LIB_DEPS += $(XCB_DRI3_LIBS)
 endif
 
 BUILT_SOURCES += $(VULKAN_WSI_WAYLAND_GENERATED_FILES)
@@ -70,8 +71,6 @@
 	$(VULKAN_WSI_WAYLAND_FILES) \
 	$(VULKAN_WSI_WAYLAND_GENERATED_FILES)
 
-VULKAN_LIB_DEPS += \
-	$(WAYLAND_LIBS)
 endif
 
 libvulkan_wsi_la_SOURCES = $(VULKAN_WSI_SOURCES)
diff --git a/src/vulkan/Makefile.sources b/src/vulkan/Makefile.sources
index 63f4ac1..2cf7218 100644
--- a/src/vulkan/Makefile.sources
+++ b/src/vulkan/Makefile.sources
@@ -15,6 +15,11 @@
 	wsi/wsi_common_x11.c \
 	wsi/wsi_common_x11.h
 
+VULKAN_UTIL_FILES := \
+	util/vk_alloc.h \
+	util/vk_util.c \
+	util/vk_util.h
+
 VULKAN_UTIL_GENERATED_FILES := \
 	util/vk_enum_to_str.c \
 	util/vk_enum_to_str.h
diff --git a/src/vulkan/util/BUILD.gn b/src/vulkan/util/BUILD.gn
index bb21c35..41e500d 100644
--- a/src/vulkan/util/BUILD.gn
+++ b/src/vulkan/util/BUILD.gn
@@ -33,7 +33,9 @@
   ]
   sources = [
     "$target_gen_dir/vk_enum_to_str.c",
-    "vk_enum_to_str.h"
+    "vk_enum_to_str.h",
+    "vk_util.c",
+    "vk_util.h"
   ]
   deps += [ ":gen_enum_to_str" ]
 }
diff --git a/src/vulkan/util/gen_enum_to_str.py b/src/vulkan/util/gen_enum_to_str.py
index 73ea7a6..a7dcab0 100755
--- a/src/vulkan/util/gen_enum_to_str.py
+++ b/src/vulkan/util/gen_enum_to_str.py
@@ -20,7 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-"""Create enum to string functions for vulking using vk.xml."""
+"""Create enum to string functions for vulkan using vk.xml."""
 
 from __future__ import print_function
 import argparse
diff --git a/src/util/vk_alloc.h b/src/vulkan/util/vk_alloc.h
similarity index 100%
rename from src/util/vk_alloc.h
rename to src/vulkan/util/vk_alloc.h
diff --git a/src/vulkan/util/vk_util.c b/src/vulkan/util/vk_util.c
new file mode 100644
index 0000000..769e690
--- /dev/null
+++ b/src/vulkan/util/vk_util.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2016 Red Hat.
+ * Copyright © 2016 Bas Nieuwenhuizen
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "vk_util.h"
+
+uint32_t vk_get_driver_version(void)
+{
+   const char *minor_string = strchr(VERSION, '.'); /* first '.', or NULL */
+   const char *patch_string = minor_string ? strchr(minor_string + 1, '.') : NULL;
+   int major = atoi(VERSION);
+   int minor = minor_string ? atoi(minor_string + 1) : 0; /* 0 when VERSION has no minor part */
+   int patch = patch_string ? atoi(patch_string + 1) : 0; /* 0 when VERSION has no patch part */
+   if (strstr(VERSION, "devel")) { /* "devel" versions are reported one step below VERSION */
+      if (patch == 0) {
+         patch = 99; /* borrow from minor */
+         if (minor == 0) {
+            minor = 99; /* borrow from major */
+            --major;
+         } else
+            --minor;
+      } else
+         --patch;
+   }
+   return VK_MAKE_VERSION(major, minor, patch); /* packed with the Vulkan version macro */
+}
diff --git a/src/util/vk_util.h b/src/vulkan/util/vk_util.h
similarity index 98%
rename from src/util/vk_util.h
rename to src/vulkan/util/vk_util.h
index 5ff1f00..2ed601f 100644
--- a/src/util/vk_util.h
+++ b/src/vulkan/util/vk_util.h
@@ -197,4 +197,6 @@
 #define vk_find_struct_const(__start, __sType) \
    (const void *)__vk_find_struct((void *)(__start), VK_STRUCTURE_TYPE_##__sType)
 
+uint32_t vk_get_driver_version(void);
+
 #endif /* VK_UTIL_H */
diff --git a/src/vulkan/wsi/BUILD.gn b/src/vulkan/wsi/BUILD.gn
index 5e8a0e2..2b465f3 100644
--- a/src/vulkan/wsi/BUILD.gn
+++ b/src/vulkan/wsi/BUILD.gn
@@ -37,6 +37,8 @@
     "$magma_build_root/include:magma_abi",
     "$magma_build_root/src/magma_util",
     "$mesa_build_root/include:vulkan",
-    "$mesa_build_root/src/util" 
+    "$mesa_build_root/src/util"
   ]
+
+  include_dirs = [ "$mesa_build_root/src/vulkan/util" ]
 }
diff --git a/src/vulkan/wsi/wsi_common.h b/src/vulkan/wsi/wsi_common.h
index d484b31..ada7df0 100644
--- a/src/vulkan/wsi/wsi_common.h
+++ b/src/vulkan/wsi/wsi_common.h
@@ -26,7 +26,7 @@
 #include <stdint.h>
 #include <stdbool.h>
 
-#include "util/vk_alloc.h"
+#include "vk_alloc.h"
 #include <vulkan/vulkan.h>
 #include <vulkan/vk_icd.h>
 
@@ -87,10 +87,18 @@
                            VkBool32* pSupported);
    VkResult (*get_capabilities)(VkIcdSurfaceBase *surface,
                                 VkSurfaceCapabilitiesKHR* pSurfaceCapabilities);
+   VkResult (*get_capabilities2)(VkIcdSurfaceBase *surface,
+                                 const void *info_next,
+                                 VkSurfaceCapabilities2KHR* pSurfaceCapabilities);
    VkResult (*get_formats)(VkIcdSurfaceBase *surface,
                            struct wsi_device *wsi_device,
                            uint32_t* pSurfaceFormatCount,
                            VkSurfaceFormatKHR* pSurfaceFormats);
+   VkResult (*get_formats2)(VkIcdSurfaceBase *surface,
+                            struct wsi_device *wsi_device,
+                            const void *info_next,
+                            uint32_t* pSurfaceFormatCount,
+                            VkSurfaceFormat2KHR* pSurfaceFormats);
    VkResult (*get_present_modes)(VkIcdSurfaceBase *surface,
                                  uint32_t* pPresentModeCount,
                                  VkPresentModeKHR* pPresentModes);
diff --git a/src/vulkan/wsi/wsi_common_wayland.c b/src/vulkan/wsi/wsi_common_wayland.c
index 50a34e9..4c94cd6 100644
--- a/src/vulkan/wsi/wsi_common_wayland.c
+++ b/src/vulkan/wsi/wsi_common_wayland.c
@@ -31,6 +31,7 @@
 #include <string.h>
 #include <pthread.h>
 
+#include "vk_util.h"
 #include "wsi_common_wayland.h"
 #include "wayland-drm-client-protocol.h"
 
@@ -57,6 +58,9 @@
    struct u_vector                            formats;
 
    uint32_t                                     capabilities;
+
+   /* Only used for displays created by wsi_wl_display_create */
+   uint32_t                                     refcount;
 };
 
 struct wsi_wayland {
@@ -65,10 +69,6 @@
    const VkAllocationCallbacks *alloc;
    VkPhysicalDevice physical_device;
 
-   pthread_mutex_t                              mutex;
-   /* Hash table of wl_display -> wsi_wl_display mappings */
-   struct hash_table *                          displays;
-
    const struct wsi_callbacks *cbs;
 };
 
@@ -97,7 +97,6 @@
 static void
 drm_handle_device(void *data, struct wl_drm *drm, const char *name)
 {
-   fprintf(stderr, "wl_drm.device(%s)\n", name);
 }
 
 static uint32_t
@@ -148,6 +147,8 @@
 drm_handle_format(void *data, struct wl_drm *drm, uint32_t wl_format)
 {
    struct wsi_wl_display *display = data;
+   if (display->formats.element_size == 0)
+      return;
 
    switch (wl_format) {
 #if 0
@@ -249,8 +250,10 @@
 };
 
 static void
-wsi_wl_display_destroy(struct wsi_wayland *wsi, struct wsi_wl_display *display)
+wsi_wl_display_finish(struct wsi_wl_display *display)
 {
+   assert(display->refcount == 0);
+
    u_vector_finish(&display->formats);
    if (display->drm)
       wl_drm_destroy(display->drm);
@@ -258,113 +261,136 @@
       wl_proxy_wrapper_destroy(display->wl_display_wrapper);
    if (display->queue)
       wl_event_queue_destroy(display->queue);
-   vk_free(wsi->alloc, display);
 }
 
-static struct wsi_wl_display *
-wsi_wl_display_create(struct wsi_wayland *wsi, struct wl_display *wl_display)
+static VkResult
+wsi_wl_display_init(struct wsi_wayland *wsi_wl,
+                    struct wsi_wl_display *display,
+                    struct wl_display *wl_display,
+                    bool get_format_list)
 {
-   struct wsi_wl_display *display =
-      vk_alloc(wsi->alloc, sizeof(*display), 8,
-               VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
-   if (!display)
-      return NULL;
-
+   VkResult result = VK_SUCCESS;
    memset(display, 0, sizeof(*display));
 
-   display->wsi_wl = wsi;
+   display->wsi_wl = wsi_wl;
    display->wl_display = wl_display;
 
-   if (!u_vector_init(&display->formats, sizeof(VkFormat), 8))
-      goto fail;
+   if (get_format_list) {
+      if (!u_vector_init(&display->formats, sizeof(VkFormat), 8)) {
+         result = VK_ERROR_OUT_OF_HOST_MEMORY;
+         goto fail;
+      }
+   }
 
    display->queue = wl_display_create_queue(wl_display);
-   if (!display->queue)
+   if (!display->queue) {
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
       goto fail;
+   }
 
    display->wl_display_wrapper = wl_proxy_create_wrapper(wl_display);
-   if (!display->wl_display_wrapper)
+   if (!display->wl_display_wrapper) {
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
       goto fail;
+   }
 
    wl_proxy_set_queue((struct wl_proxy *) display->wl_display_wrapper,
                       display->queue);
 
    struct wl_registry *registry =
       wl_display_get_registry(display->wl_display_wrapper);
-   if (!registry)
+   if (!registry) {
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
       goto fail;
+   }
 
    wl_registry_add_listener(registry, &registry_listener, display);
 
    /* Round-trip to get the wl_drm global */
    wl_display_roundtrip_queue(display->wl_display, display->queue);
 
-   if (!display->drm)
+   if (!display->drm) {
+      result = VK_ERROR_SURFACE_LOST_KHR;
       goto fail_registry;
+   }
 
    /* Round-trip to get wl_drm formats and capabilities */
    wl_display_roundtrip_queue(display->wl_display, display->queue);
 
    /* We need prime support */
-   if (!(display->capabilities & WL_DRM_CAPABILITY_PRIME))
+   if (!(display->capabilities & WL_DRM_CAPABILITY_PRIME)) {
+      result = VK_ERROR_SURFACE_LOST_KHR;
       goto fail_registry;
+   }
 
    /* We don't need this anymore */
    wl_registry_destroy(registry);
 
-   return display;
+   display->refcount = 0;
+
+   return VK_SUCCESS;
 
 fail_registry:
    if (registry)
       wl_registry_destroy(registry);
 
 fail:
-   wsi_wl_display_destroy(wsi, display);
-   return NULL;
+   wsi_wl_display_finish(display);
+   return result;
+}
+
+static VkResult
+wsi_wl_display_create(struct wsi_wayland *wsi, struct wl_display *wl_display,
+                      struct wsi_wl_display **display_out)
+{
+   struct wsi_wl_display *display =
+      vk_alloc(wsi->alloc, sizeof(*display), 8,
+               VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+   if (!display)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   /* A failed init has already finished the display; only free it here. */
+   VkResult result = wsi_wl_display_init(wsi, display, wl_display, true);
+   if (result != VK_SUCCESS) {
+      vk_free(wsi->alloc, display);
+      return result;
+   }
+   /* init leaves refcount at 0; the caller receives the first reference. */
+   display->refcount++;
+   *display_out = display;
+
+   return result;
 }
 
 static struct wsi_wl_display *
-wsi_wl_get_display(struct wsi_device *wsi_device,
-                   struct wl_display *wl_display)
+wsi_wl_display_ref(struct wsi_wl_display *display)
 {
-   struct wsi_wayland *wsi =
-      (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
+   display->refcount++;
+   return display;
+}
 
-   pthread_mutex_lock(&wsi->mutex);
+static void
+wsi_wl_display_unref(struct wsi_wl_display *display)
+{
+   if (display->refcount-- > 1)
+      return;
 
-   struct hash_entry *entry = _mesa_hash_table_search(wsi->displays,
-                                                      wl_display);
-   if (!entry) {
-      /* We're about to make a bunch of blocking calls.  Let's drop the
-       * mutex for now so we don't block up too badly.
-       */
-      pthread_mutex_unlock(&wsi->mutex);
-
-      struct wsi_wl_display *display = wsi_wl_display_create(wsi, wl_display);
-      if (!display)
-         return NULL;
-
-      pthread_mutex_lock(&wsi->mutex);
-
-      entry = _mesa_hash_table_search(wsi->displays, wl_display);
-      if (entry) {
-         /* Oops, someone raced us to it */
-         wsi_wl_display_destroy(wsi, display);
-      } else {
-         entry = _mesa_hash_table_insert(wsi->displays, wl_display, display);
-      }
-   }
-
-   pthread_mutex_unlock(&wsi->mutex);
-
-   return entry->data;
+   struct wsi_wayland *wsi = display->wsi_wl;
+   wsi_wl_display_finish(display);
+   vk_free(wsi->alloc, display);
 }
 
 VkBool32
 wsi_wl_get_presentation_support(struct wsi_device *wsi_device,
 				struct wl_display *wl_display)
 {
-   return wsi_wl_get_display(wsi_device, wl_display) != NULL;
+   struct wsi_wayland *wsi =
+      (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
+   struct wsi_wl_display display;
+   /* A failed init already finished the display; don't finish it twice. */
+   VkResult ret = wsi_wl_display_init(wsi, &display, wl_display, false);
+   if (ret == VK_SUCCESS)
+      wsi_wl_display_finish(&display);
+   return ret == VK_SUCCESS;
 }
 
 static VkResult
@@ -422,39 +448,72 @@
 }
 
 static VkResult
+wsi_wl_surface_get_capabilities2(VkIcdSurfaceBase *surface,
+                                 const void *info_next,
+                                 VkSurfaceCapabilities2KHR* caps)
+{
+   assert(caps->sType == VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR);
+
+   return wsi_wl_surface_get_capabilities(surface, &caps->surfaceCapabilities);
+}
+
+static VkResult
 wsi_wl_surface_get_formats(VkIcdSurfaceBase *icd_surface,
 			   struct wsi_device *wsi_device,
                            uint32_t* pSurfaceFormatCount,
                            VkSurfaceFormatKHR* pSurfaceFormats)
 {
    VkIcdSurfaceWayland *surface = (VkIcdSurfaceWayland *)icd_surface;
-   struct wsi_wl_display *display =
-      wsi_wl_get_display(wsi_device, surface->display);
-   if (!display)
-      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   struct wsi_wayland *wsi =
+      (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
 
-   if (pSurfaceFormats == NULL) {
-      *pSurfaceFormatCount = u_vector_length(&display->formats);
-      return VK_SUCCESS;
+   struct wsi_wl_display display;
+   if (wsi_wl_display_init(wsi, &display, surface->display, true))
+      return VK_ERROR_SURFACE_LOST_KHR;
+
+   VK_OUTARRAY_MAKE(out, pSurfaceFormats, pSurfaceFormatCount);
+
+   VkFormat *disp_fmt;
+   u_vector_foreach(disp_fmt, &display.formats) {
+      vk_outarray_append(&out, out_fmt) {
+         out_fmt->format = *disp_fmt;
+         out_fmt->colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
+      }
    }
 
-   uint32_t count = 0;
-   VkFormat *f;
-   u_vector_foreach(f, &display->formats) {
-      if (count == *pSurfaceFormatCount)
-         return VK_INCOMPLETE;
+   wsi_wl_display_finish(&display);
 
-      pSurfaceFormats[count++] = (VkSurfaceFormatKHR) {
-         .format = *f,
-         /* TODO: We should get this from the compositor somehow */
-         .colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR,
-      };
+   return vk_outarray_status(&out);
+}
+
+static VkResult
+wsi_wl_surface_get_formats2(VkIcdSurfaceBase *icd_surface,
+			    struct wsi_device *wsi_device,
+                            const void *info_next,
+                            uint32_t* pSurfaceFormatCount,
+                            VkSurfaceFormat2KHR* pSurfaceFormats)
+{
+   VkIcdSurfaceWayland *surface = (VkIcdSurfaceWayland *)icd_surface;
+   struct wsi_wayland *wsi =
+      (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
+
+   struct wsi_wl_display display;
+   if (wsi_wl_display_init(wsi, &display, surface->display, true))
+      return VK_ERROR_SURFACE_LOST_KHR;
+
+   VK_OUTARRAY_MAKE(out, pSurfaceFormats, pSurfaceFormatCount);
+
+   VkFormat *disp_fmt;
+   u_vector_foreach(disp_fmt, &display.formats) {
+      vk_outarray_append(&out, out_fmt) {
+         out_fmt->surfaceFormat.format = *disp_fmt;
+         out_fmt->surfaceFormat.colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
+      }
    }
 
-   assert(*pSurfaceFormatCount <= count);
-   *pSurfaceFormatCount = count;
+   wsi_wl_display_finish(&display);
 
-   return VK_SUCCESS;
+   return vk_outarray_status(&out);
 }
 
 static VkResult
@@ -506,8 +565,8 @@
 struct wsi_wl_swapchain {
    struct wsi_swapchain                        base;
 
-   struct wsi_wl_display *                      display;
-   struct wl_event_queue *                      queue;
+   struct wsi_wl_display                        *display;
+
    struct wl_surface *                          surface;
    uint32_t                                     surface_version;
    struct wl_drm *                              drm_wrapper;
@@ -558,7 +617,7 @@
    struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)wsi_chain;
 
    int ret = wl_display_dispatch_queue_pending(chain->display->wl_display,
-                                               chain->queue);
+                                               chain->display->queue);
    /* XXX: I'm not sure if out-of-date is the right error here.  If
     * wl_display_dispatch_queue_pending fails it most likely means we got
     * kicked by the server so this seems more-or-less correct.
@@ -580,7 +639,7 @@
        * anywhere until we get an event.
        */
       int ret = wl_display_roundtrip_queue(chain->display->wl_display,
-                                           chain->queue);
+                                           chain->display->queue);
       if (ret < 0)
          return VK_ERROR_OUT_OF_DATE_KHR;
    }
@@ -611,7 +670,7 @@
    if (chain->base.present_mode == VK_PRESENT_MODE_FIFO_KHR) {
       while (!chain->fifo_ready) {
          int ret = wl_display_dispatch_queue(chain->display->wl_display,
-                                             chain->queue);
+                                             chain->display->queue);
          if (ret < 0)
             return VK_ERROR_OUT_OF_DATE_KHR;
       }
@@ -731,8 +790,9 @@
       wl_proxy_wrapper_destroy(chain->surface);
    if (chain->drm_wrapper)
       wl_proxy_wrapper_destroy(chain->drm_wrapper);
-   if (chain->queue)
-      wl_event_queue_destroy(chain->queue);
+
+   if (chain->display)
+      wsi_wl_display_unref(chain->display);
 
    vk_free(pAllocator, chain);
 
@@ -750,6 +810,8 @@
                                 struct wsi_swapchain **swapchain_out)
 {
    VkIcdSurfaceWayland *surface = (VkIcdSurfaceWayland *)icd_surface;
+   struct wsi_wayland *wsi =
+      (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
    struct wsi_wl_swapchain *chain;
    VkResult result;
 
@@ -768,7 +830,6 @@
     */
    for (uint32_t i = 0; i < num_images; i++)
       chain->images[i].buffer = NULL;
-   chain->queue = NULL;
    chain->surface = NULL;
    chain->drm_wrapper = NULL;
    chain->frame = NULL;
@@ -789,32 +850,35 @@
    chain->vk_format = pCreateInfo->imageFormat;
    chain->drm_format = wl_drm_format_for_vk_format(chain->vk_format, alpha);
 
-   chain->display = wsi_wl_get_display(wsi_device, surface->display);
-   if (!chain->display) {
-      result = VK_ERROR_INITIALIZATION_FAILED;
-      goto fail;
-   }
-
-   chain->queue = wl_display_create_queue(chain->display->wl_display);
-   if (!chain->queue) {
-      result = VK_ERROR_INITIALIZATION_FAILED;
-      goto fail;
+   if (pCreateInfo->oldSwapchain) {
+      /* If we have an oldSwapchain parameter, copy the display struct over
+       * from the old one so we don't have to fully re-initialize it.
+       */
+      struct wsi_wl_swapchain *old_chain = (void *)pCreateInfo->oldSwapchain;
+      chain->display = wsi_wl_display_ref(old_chain->display);
+   } else {
+      chain->display = NULL;
+      result = wsi_wl_display_create(wsi, surface->display, &chain->display);
+      if (result != VK_SUCCESS)
+         goto fail;
    }
 
    chain->surface = wl_proxy_create_wrapper(surface->surface);
    if (!chain->surface) {
-      result = VK_ERROR_INITIALIZATION_FAILED;
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
       goto fail;
    }
-   wl_proxy_set_queue((struct wl_proxy *) chain->surface, chain->queue);
+   wl_proxy_set_queue((struct wl_proxy *) chain->surface,
+                      chain->display->queue);
    chain->surface_version = wl_proxy_get_version((void *)surface->surface);
 
    chain->drm_wrapper = wl_proxy_create_wrapper(chain->display->drm);
    if (!chain->drm_wrapper) {
-      result = VK_ERROR_INITIALIZATION_FAILED;
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
       goto fail;
    }
-   wl_proxy_set_queue((struct wl_proxy *) chain->drm_wrapper, chain->queue);
+   wl_proxy_set_queue((struct wl_proxy *) chain->drm_wrapper,
+                      chain->display->queue);
 
    chain->fifo_ready = true;
 
@@ -855,28 +919,12 @@
    wsi->physical_device = physical_device;
    wsi->alloc = alloc;
    wsi->cbs = cbs;
-   int ret = pthread_mutex_init(&wsi->mutex, NULL);
-   if (ret != 0) {
-      if (ret == ENOMEM) {
-         result = VK_ERROR_OUT_OF_HOST_MEMORY;
-      } else {
-         /* FINISHME: Choose a better error. */
-         result = VK_ERROR_OUT_OF_HOST_MEMORY;
-      }
-
-      goto fail_alloc;
-   }
-
-   wsi->displays = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
-                                           _mesa_key_pointer_equal);
-   if (!wsi->displays) {
-      result = VK_ERROR_OUT_OF_HOST_MEMORY;
-      goto fail_mutex;
-   }
 
    wsi->base.get_support = wsi_wl_surface_get_support;
    wsi->base.get_capabilities = wsi_wl_surface_get_capabilities;
+   wsi->base.get_capabilities2 = wsi_wl_surface_get_capabilities2;
    wsi->base.get_formats = wsi_wl_surface_get_formats;
+   wsi->base.get_formats2 = wsi_wl_surface_get_formats2;
    wsi->base.get_present_modes = wsi_wl_surface_get_present_modes;
    wsi->base.create_swapchain = wsi_wl_surface_create_swapchain;
 
@@ -884,11 +932,6 @@
 
    return VK_SUCCESS;
 
-fail_mutex:
-   pthread_mutex_destroy(&wsi->mutex);
-
-fail_alloc:
-   vk_free(alloc, wsi);
 fail:
    wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND] = NULL;
 
@@ -901,16 +944,8 @@
 {
    struct wsi_wayland *wsi =
       (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
+   if (!wsi)
+      return;
 
-   if (wsi) {
-      struct hash_entry *entry;
-      hash_table_foreach(wsi->displays, entry)
-         wsi_wl_display_destroy(wsi, entry->data);
-
-      _mesa_hash_table_destroy(wsi->displays, NULL);
-
-      pthread_mutex_destroy(&wsi->mutex);
-
-      vk_free(alloc, wsi);
-   }
+   vk_free(alloc, wsi);
 }
diff --git a/src/vulkan/wsi/wsi_common_x11.c b/src/vulkan/wsi/wsi_common_x11.c
index c399aae..22b067b 100644
--- a/src/vulkan/wsi/wsi_common_x11.c
+++ b/src/vulkan/wsi/wsi_common_x11.c
@@ -38,6 +38,7 @@
 #include <xf86drm.h>
 #include "util/hash_table.h"
 
+#include "vk_util.h"
 #include "wsi_common.h"
 #include "wsi_common_x11.h"
 #include "wsi_common_queue.h"
@@ -235,9 +236,9 @@
    return entry->data;
 }
 
-static const VkSurfaceFormatKHR formats[] = {
-   { .format = VK_FORMAT_B8G8R8A8_SRGB, },
-   { .format = VK_FORMAT_B8G8R8A8_UNORM, },
+static const VkFormat formats[] = {
+   VK_FORMAT_B8G8R8A8_SRGB,
+   VK_FORMAT_B8G8R8A8_UNORM,
 };
 
 static const VkPresentModeKHR present_modes[] = {
@@ -511,21 +512,51 @@
 }
 
 static VkResult
+x11_surface_get_capabilities2(VkIcdSurfaceBase *icd_surface,
+                              const void *info_next,
+                              VkSurfaceCapabilities2KHR *caps)
+{
+   assert(caps->sType == VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR);
+
+   return x11_surface_get_capabilities(icd_surface, &caps->surfaceCapabilities);
+}
+
+static VkResult
 x11_surface_get_formats(VkIcdSurfaceBase *surface,
                         struct wsi_device *wsi_device,
                         uint32_t *pSurfaceFormatCount,
                         VkSurfaceFormatKHR *pSurfaceFormats)
 {
-   if (pSurfaceFormats == NULL) {
-      *pSurfaceFormatCount = ARRAY_SIZE(formats);
-      return VK_SUCCESS;
+   VK_OUTARRAY_MAKE(out, pSurfaceFormats, pSurfaceFormatCount);
+
+   for (unsigned i = 0; i < ARRAY_SIZE(formats); i++) {
+      vk_outarray_append(&out, f) {
+         f->format = formats[i];
+         f->colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
+      }
    }
 
-   *pSurfaceFormatCount = MIN2(*pSurfaceFormatCount, ARRAY_SIZE(formats));
-   typed_memcpy(pSurfaceFormats, formats, *pSurfaceFormatCount);
+   return vk_outarray_status(&out);
+}
 
-   return *pSurfaceFormatCount < ARRAY_SIZE(formats) ?
-      VK_INCOMPLETE : VK_SUCCESS;
+static VkResult
+x11_surface_get_formats2(VkIcdSurfaceBase *surface,
+                        struct wsi_device *wsi_device,
+                        const void *info_next,
+                        uint32_t *pSurfaceFormatCount,
+                        VkSurfaceFormat2KHR *pSurfaceFormats)
+{
+   VK_OUTARRAY_MAKE(out, pSurfaceFormats, pSurfaceFormatCount);
+
+   for (unsigned i = 0; i < ARRAY_SIZE(formats); i++) {
+      vk_outarray_append(&out, f) {
+         assert(f->sType == VK_STRUCTURE_TYPE_SURFACE_FORMAT_2_KHR);
+         f->surfaceFormat.format = formats[i];
+         f->surfaceFormat.colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
+      }
+   }
+
+   return vk_outarray_status(&out);
 }
 
 static VkResult
@@ -908,6 +939,7 @@
             goto fail;
 
          result = x11_handle_dri3_present_event(chain, (void *)event);
+         free(event);
          if (result != VK_SUCCESS)
             goto fail;
       }
@@ -1242,7 +1274,9 @@
 
    wsi->base.get_support = x11_surface_get_support;
    wsi->base.get_capabilities = x11_surface_get_capabilities;
+   wsi->base.get_capabilities2 = x11_surface_get_capabilities2;
    wsi->base.get_formats = x11_surface_get_formats;
+   wsi->base.get_formats2 = x11_surface_get_formats2;
    wsi->base.get_present_modes = x11_surface_get_present_modes;
    wsi->base.create_swapchain = x11_surface_create_swapchain;